This shows you the differences between two versions of the page.
Both sides previous revision Previous revision Next revision | Previous revision | ||
code:model_selection [2015/10/05 15:06] asa |
code:model_selection [2016/10/06 14:58] asa |
||
---|---|---|---|
Line 1: | Line 1: | ||
- | ===== model selection and cross validation in scikit-learn ===== | + | ===== model selection in scikit-learn ===== |
- | First let's import some modules and read in some data: | + | <file python model_selection.py> |
- | <code python> | ||
- | In [1]: import numpy as np | + | """classifier evaluation using scikit-learn |
- | In [2]: from sklearn import cross_validation | + | more details at: |
+ | http://scikit-learn.org/stable/modules/cross_validation.html | ||
+ | http://scikit-learn.org/stable/tutorial/statistical_inference/model_selection.html | ||
+ | """ | ||
- | In [3]: from sklearn import svm | ||
- | |||
- | In [4]: from sklearn import metrics | ||
- | |||
- | In [5]: data=np.genfromtxt("../data/heart_scale.data", delimiter=",") | ||
- | |||
- | In [6]: X=data[:,1:] | ||
- | |||
- | In [7]: y=data[:,0] | ||
- | |||
- | </code> | ||
- | |||
- | The simplest form of model evaluation uses a validation/test set: | ||
- | |||
- | <code python> | ||
- | In [9]: X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.4, random_state=0) | ||
- | |||
- | In [10]: classifier = svm.SVC(kernel='linear', C=1).fit(X_train, y_train) | ||
- | |||
- | In [11]: classifier.score(X_test, y_test) | ||
- | Out[11]: 0.7592592592592593 | ||
- | |||
- | |||
- | </code> | ||
- | |||
- | Next, let's perform cross-validation: | ||
- | |||
- | <code python> | ||
- | |||
- | In [18]: cross_validation.cross_val_score(classifier, X, y, cv=5, scoring='accuracy') | ||
- | Out[18]: array([ 0.7962963 , 0.83333333, 0.88888889, 0.83333333, 0.83333333]) | ||
- | |||
- | In [19]: | ||
- | |||
- | In [19]: # you can obtain scores for other metrics, such as area under the ROC curve: | ||
- | |||
- | In [20]: cross_validation.cross_val_score(classifier, X, y, cv=5, scoring='roc_auc') | ||
- | Out[20]: array([ 0.89166667, 0.89166667, 0.95833333, 0.87638889, 0.91388889]) | ||
- | |||
- | In [21]: | ||
- | |||
- | In [21]: # you can also obtain the predictions by cross-validation and then compute the accuracy: | ||
- | |||
- | In [22]: y_predict = cross_validation.cross_val_predict(classifier, X, y, cv=5) | ||
- | |||
- | In [23]: metrics.accuracy_score(y, y_predict) | ||
- | Out[23]: 0.83703703703703702 | ||
- | |||
- | </code> | ||
- | |||
- | Here's an alternative way of doing cross-validation. | ||
- | |||
- | <code python> | ||
- | In [25]: # first divide the data into folds: | ||
- | |||
- | In [26]: cv = cross_validation.StratifiedKFold(y, 5) | ||
- | |||
- | In [27]: # now use these folds: | ||
- | |||
- | In [28]: print cross_validation.cross_val_score(classifier, X, y, cv=cv, scoring='roc_auc') | ||
- | [ 0.89166667 0.89166667 0.95833333 0.87638889 0.91388889] | ||
- | |||
- | In [29]: # you can see how examples were divided into folds by looking at the test_folds attribute: | ||
- | |||
- | In [30]: print cv.test_folds | ||
- | [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 | ||
- | 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 | ||
- | 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 | ||
- | 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 | ||
- | 2 2 2 2 2 2 2 2 2 2 2 2 3 3 2 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 | ||
- | 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4 4 4 4 4 | ||
- | 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 | ||
- | 4 4 4 4 4 4 4 4 4 4 4] | ||
- | |||
- | In [31]: # hmm... perhaps we should shuffle things a bit... | ||
- | |||
- | In [32]: cv = cross_validation.StratifiedKFold(y, 5, shuffle=True) | ||
- | |||
- | In [33]: print cv.test_folds | ||
- | [0 1 1 2 0 1 4 3 4 3 2 0 2 3 2 3 2 0 4 1 1 3 4 1 1 4 1 4 4 2 2 3 0 2 3 1 4 | ||
- | 0 3 2 0 2 0 1 3 2 0 0 2 3 0 4 2 0 4 3 4 1 1 0 3 2 4 3 2 3 1 1 1 1 4 3 1 1 | ||
- | 4 2 2 3 3 1 4 2 1 0 2 1 0 2 4 1 0 3 2 3 1 2 2 1 1 0 4 1 3 0 1 1 3 3 0 3 3 | ||
- | 4 2 0 2 0 2 4 0 1 0 4 4 1 1 0 4 0 1 4 4 3 1 3 3 2 4 3 4 2 4 3 4 1 4 2 0 3 | ||
- | 3 3 3 0 0 0 4 3 4 2 3 0 1 1 0 0 4 0 4 1 4 0 0 0 0 3 3 0 4 4 2 0 3 3 0 1 2 | ||
- | 2 2 3 2 1 3 4 4 4 1 1 4 2 1 0 3 1 2 0 0 0 0 2 3 4 3 2 0 0 4 1 3 2 2 0 1 2 | ||
- | 4 2 4 0 2 1 1 0 4 4 1 4 4 3 4 2 3 3 1 4 2 1 4 1 3 2 1 3 2 1 3 1 3 0 2 2 0 | ||
- | 4 4 2 2 4 3 3 0 2 0 2] | ||
- | |||
- | In [34]: # if you run division into folds multiple times you will get a different answer: | ||
- | |||
- | In [35]: cv = cross_validation.StratifiedKFold(y, 5, shuffle=True) | ||
- | |||
- | In [36]: print cv.test_folds | ||
- | [3 0 2 2 0 2 2 4 1 4 0 2 3 4 2 0 4 0 3 3 4 0 2 0 4 4 0 1 4 4 3 4 1 2 3 3 1 | ||
- | 2 1 4 4 4 0 0 4 2 0 0 2 0 1 3 1 0 3 4 0 3 0 4 1 1 2 4 2 0 2 3 1 0 3 0 1 2 | ||
- | 3 2 4 0 0 0 1 4 3 2 2 4 3 1 3 2 0 2 0 0 3 2 1 2 4 4 0 0 4 2 1 4 3 0 4 3 4 | ||
- | 1 4 0 0 4 2 1 4 4 3 4 1 1 3 0 2 2 3 1 2 3 1 0 4 1 4 1 3 1 3 3 4 4 1 0 0 0 | ||
- | 0 4 3 1 2 2 3 0 3 2 4 3 2 2 3 0 3 1 0 4 2 3 0 2 4 3 0 4 3 4 3 3 0 3 1 2 2 | ||
- | 1 3 4 1 0 4 3 4 0 0 0 3 2 2 1 3 4 4 2 3 4 3 2 1 3 0 4 0 1 3 1 2 2 2 2 0 3 | ||
- | 1 1 1 2 0 1 4 1 1 1 2 2 1 2 3 3 1 4 4 3 4 2 0 2 2 1 1 1 2 0 3 0 2 1 1 3 1 | ||
- | 3 1 0 1 3 4 4 2 1 1 1] | ||
- | |||
- | In [37]: # if you want to consistently get the same division into folds: | ||
- | |||
- | In [38]: cv = cross_validation.StratifiedKFold(y, 5, shuffle=True, random_state=0) | ||
- | |||
- | In [39]: # this sets the seed for the random number generator. | ||
- | |||
- | </code> | ||
- | |||
- | Let's do grid search for the optimal set of parameters: | ||
- | |||
- | <code python> | ||
- | In [40]: from sklearn.grid_search import GridSearchCV | ||
- | |||
- | In [41]: Cs = np.logspace(-2, 3, 6) | ||
- | |||
- | In [42]: classifier = GridSearchCV(estimator=svm.LinearSVC(), param_grid=dict(C=Cs) ) | ||
- | |||
- | In [43]: classifier.fit(X, y) | ||
- | Out[43]: | ||
- | GridSearchCV(cv=None, error_score='raise', | ||
- | estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True, | ||
- | intercept_scaling=1, loss='squared_hinge', max_iter=1000, | ||
- | multi_class='ovr', penalty='l2', random_state=None, tol=0.0001, | ||
- | verbose=0), | ||
- | fit_params={}, iid=True, loss_func=None, n_jobs=1, | ||
- | param_grid={'C': array([ 1.00000e-02, 1.00000e-01, 1.00000e+00, 1.00000e+01, | ||
- | 1.00000e+02, 1.00000e+03])}, | ||
- | pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None, | ||
- | verbose=0) | ||
- | |||
- | In [44]: | ||
- | |||
- | In [44]: # print the best accuracy, classifier and parameters: | ||
- | |||
- | In [45]: print classifier.best_score_ | ||
- | 0.844444444444 | ||
- | |||
- | In [46]: print classifier.best_estimator_ | ||
- | LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True, | ||
- | intercept_scaling=1, loss='squared_hinge', max_iter=1000, | ||
- | multi_class='ovr', penalty='l2', random_state=None, tol=0.0001, | ||
- | verbose=0) | ||
- | |||
- | In [47]: print classifier.best_params_ | ||
- | {'C': 1.0} | ||
- | |||
- | In [48]: # performing nested cross validation: | ||
- | |||
- | In [49]: print cross_validation.cross_val_score(classifier, X, y, cv=5) | ||
- | [ 0.7962963 0.81481481 0.88888889 0.83333333 0.83333333] | ||
- | |||
- | In [50]: # if we want to do grid search over multiple parameters: | ||
- | |||
- | In [51]: param_grid = [ | ||
- | ....: {'C': [1, 10, 100, 1000], 'kernel': ['linear']}, | ||
- | ....: {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']}, | ||
- | ....: ] | ||
- | |||
- | In [52]: classifier = GridSearchCV(estimator=svm.SVC(), param_grid=param_grid) | ||
- | |||
- | In [53]: print cross_validation.cross_val_score(classifier, X, y, cv=5) | ||
- | [ 0.7962963 0.83333333 0.88888889 0.7962963 0.87037037] | ||
- | |||
- | </code> | ||
- | |||
- | And to make things easier for you here's the whole thing without the output: | ||
- | |||
- | <file python model_selection.py> | ||
import numpy as np | import numpy as np | ||
from sklearn import cross_validation | from sklearn import cross_validation | ||
from sklearn import svm | from sklearn import svm | ||
from sklearn import metrics | from sklearn import metrics | ||
+ | |||
+ | # read in the heart dataset | ||
data=np.genfromtxt("../data/heart_scale.data", delimiter=",") | data=np.genfromtxt("../data/heart_scale.data", delimiter=",") | ||
Line 188: | Line 22: | ||
y=data[:,0] | y=data[:,0] | ||
- | # let's train/test an svm on the heart dataset: | + | # first let's do regular cross-validation: |
- | X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.4, random_state=0) | + | cv = cross_validation.StratifiedKFold(y, 5, shuffle=True, random_state=0) |
- | classifier = svm.SVC(kernel='linear', C=1).fit(X_train, y_train) | + | print (cv.test_folds) |
- | print classifier.score(X_test, y_test) | + | |
- | # now let's use cross-validation instead: | + | classifier = svm.SVC(kernel='linear', C=1) |
- | print cross_validation.cross_val_score(classifier, X, y, cv=5, scoring='accuracy') | + | |
- | # you can obtain scores for other metrics, such as area under the ROC curve: | + | y_predict = cross_validation.cross_val_predict(classifier, X, y, cv=cv) |
- | print cross_validation.cross_val_score(classifier, X, y, cv=5, scoring='roc_auc') | + | print(metrics.accuracy_score(y, y_predict)) |
- | + | ||
- | # you can also obtain the predictions by cross-validation and then compute the accuracy: | + | |
- | y_predict = cross_validation.cross_val_predict(classifier, X, y, cv=5) | + | |
- | metrics.accuracy_score(y, y_predict) | + | |
- | + | ||
- | # here's an alternative way of doing cross-validation. | + | |
- | # first divide the data into folds: | + | |
- | cv = cross_validation.StratifiedKFold(y, 5) | + | |
- | # now use these folds: | + | |
- | print cross_validation.cross_val_score(classifier, X, y, cv=cv, scoring='roc_auc') | + | |
- | + | ||
- | # you can see how examples were divided into folds by looking at the test_folds attribute: | + | |
- | print cv.test_folds | + | |
- | + | ||
- | # hmm... perhaps we should shuffle things a bit... | + | |
- | + | ||
- | cv = cross_validation.StratifiedKFold(y, 5, shuffle=True) | + | |
- | print cv.test_folds | + | |
- | + | ||
- | # if you run division into folds multiple times you will get a different answer: | + | |
- | cv = cross_validation.StratifiedKFold(y, 5, shuffle=True) | + | |
- | print cv.test_folds | + | |
- | + | ||
- | # if you want to consistently get the same division into folds: | + | |
- | cv = cross_validation.StratifiedKFold(y, 5, shuffle=True, random_state=0) | + | |
- | # this sets the seed for the random number generator. | + | |
Line 237: | Line 43: | ||
# print the best accuracy, classifier and parameters: | # print the best accuracy, classifier and parameters: | ||
- | print classifier.best_score_ | + | print (classifier.best_score_) |
- | print classifier.best_estimator_ | + | print (classifier.best_estimator_) |
- | print classifier.best_params_ | + | print (classifier.best_params_) |
# performing nested cross validation: | # performing nested cross validation: | ||
- | print cross_validation.cross_val_score(classifier, X, y, cv=5) | + | |
+ | y_predict = cross_validation.cross_val_predict(classifier, X, y, cv=cv) | ||
+ | print(metrics.accuracy_score(y, y_predict)) | ||
# if we want to do grid search over multiple parameters: | # if we want to do grid search over multiple parameters: | ||
Line 250: | Line 59: | ||
] | ] | ||
classifier = GridSearchCV(estimator=svm.SVC(), param_grid=param_grid) | classifier = GridSearchCV(estimator=svm.SVC(), param_grid=param_grid) | ||
- | print cross_validation.cross_val_score(classifier, X, y, cv=5) | + | |
+ | y_predict = cross_validation.cross_val_predict(classifier, X, y, cv=cv) | ||
+ | print(metrics.accuracy_score(y, y_predict)) | ||
</file> | </file> | ||