This shows you the differences between two versions of the page.
Next revision | Previous revision Last revision Both sides next revision | ||
code:model_selection [2015/10/05 13:25] asa created |
code:model_selection [2016/10/06 14:58] asa |
||
---|---|---|---|
Line 1: | Line 1: | ||
- | ===== model selection and cross validation in scikit-learn ===== | + | ===== model selection in scikit-learn ===== |
- | + | ||
- | First let's import some modules and read in some data: | + | |
<code python> | <code python> | ||
- | In [1]: import numpy as np | ||
- | In [2]: from sklearn import cross_validation | + | """classifier evaluation using scikit-learn |
+ | |||
+ | more details at: | ||
+ | http://scikit-learn.org/stable/modules/cross_validation.html | ||
+ | http://scikit-learn.org/stable/tutorial/statistical_inference/model_selection.html | ||
+ | """ | ||
+ | |||
+ | import numpy as np | ||
+ | from sklearn import cross_validation | ||
+ | from sklearn import svm | ||
+ | from sklearn import metrics | ||
+ | |||
+ | # read in the heart dataset | ||
+ | |||
+ | data=np.genfromtxt("../data/heart_scale.data", delimiter=",") | ||
+ | X=data[:,1:] | ||
+ | y=data[:,0] | ||
+ | |||
+ | # first let's do regular cross-validation: | ||
+ | |||
+ | cv = cross_validation.StratifiedKFold(y, 5, shuffle=True, random_state=0) | ||
+ | print (cv.test_folds) | ||
+ | |||
+ | classifier = svm.SVC(kernel='linear', C=1) | ||
+ | |||
+ | y_predict = cross_validation.cross_val_predict(classifier, X, y, cv=cv) | ||
+ | print(metrics.accuracy_score(y, y_predict)) | ||
+ | |||
+ | |||
+ | # grid search | ||
+ | |||
+ | # let's perform model selection using grid search | ||
+ | |||
+ | from sklearn.grid_search import GridSearchCV | ||
+ | Cs = np.logspace(-2, 3, 6) | ||
+ | classifier = GridSearchCV(estimator=svm.LinearSVC(), param_grid=dict(C=Cs) ) | ||
+ | classifier.fit(X, y) | ||
+ | |||
+ | # print the best accuracy, classifier and parameters: | ||
+ | print (classifier.best_score_) | ||
+ | print (classifier.best_estimator_) | ||
+ | print (classifier.best_params_) | ||
+ | |||
+ | # nested cross-validation: the grid-search classifier is itself evaluated over the outer folds in cv: | ||
- | In [3]: from sklearn import svm | + | y_predict = cross_validation.cross_val_predict(classifier, X, y, cv=cv) |
+ | print(metrics.accuracy_score(y, y_predict)) | ||
- | In [4]: from sklearn import metrics | ||
- | In [5]: data=np.genfromtxt("../data/heart_scale.data", delimiter=",") | + | # if we want to do grid search over multiple parameters: |
+ | param_grid = [ | ||
+ | {'C': [1, 10, 100, 1000], 'kernel': ['linear']}, | ||
+ | {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']}, | ||
+ | ] | ||
+ | classifier = GridSearchCV(estimator=svm.SVC(), param_grid=param_grid) | ||
- | In [6]: X=data[:,1:] | + | y_predict = cross_validation.cross_val_predict(classifier, X, y, cv=cv) |
+ | print(metrics.accuracy_score(y, y_predict)) | ||
- | In [7]: y=data[:,0] | + | </code> |
- | </code> |