Differences

This shows you the differences between two versions of the page.

--- code:model_selection [2015/10/05 15:06]
asa
+++ code:model_selection [2016/10/06 14:58]
asa
@@ Line 1: / Line 1: @@
-===== model selection and cross validation in scikit-learn =====
+===== model selection in scikit-learn =====
-First let's import some modules and read in some data:
+<file python model_selection.py>
-<code python>
-In [1]: import numpy as np
+"""classifier evaluation using scikit-learn
-In [2]: from sklearn import cross_validation
+more details at:
+http://scikit-learn.org/stable/modules/cross_validation.html
+http://scikit-learn.org/stable/tutorial/statistical_inference/model_selection.html
+"""
-In [3]: from sklearn import svm
-In [4]: from sklearn import metrics
-In [5]: data=np.genfromtxt("../data/heart_scale.data", delimiter=",")
-In [6]: X=data[:,1:]
-In [7]: y=data[:,0]
-</code>
-The simplest form of model evaluation uses a validation/test set:
-<code python>
-In [9]: X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.4, random_state=0)
-In [10]: classifier = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)
-In [11]: classifier.score(X_test, y_test)
-Out[11]: 0.7592592592592593
-</code>
-Next, let's perform cross-validation:
-<code python>
-In [18]: cross_validation.cross_val_score(classifier, X, y, cv=5, scoring='accuracy')
-Out[18]: array([ 0.7962963 ,  0.83333333,  0.88888889,  0.83333333,  0.83333333])
-In [19]:
-In [19]: # you can obtain scores for other metrics, such as area under the roc curve:
-In [20]: cross_validation.cross_val_score(classifier, X, y, cv=5, scoring='roc_auc')
-Out[20]: array([ 0.89166667,  0.89166667,  0.95833333,  0.87638889,  0.91388889])
-In [21]:
-In [21]: # you can also obtain the predictions by cross-validation and then compute the accuracy:
-In [22]: y_predict = cross_validation.cross_val_predict(classifier, X, y, cv=5)
-In [23]: metrics.accuracy_score(y, y_predict)
-Out[23]: 0.83703703703703702
-</code>
-Here's an alternative way of doing cross-validation.
-<code python>
-In [25]: # first divide the data into folds:
-In [26]: cv = cross_validation.StratifiedKFold(y, 5)
-In [27]: # now use these folds:
-In [28]: print cross_validation.cross_val_score(classifier, X, y, cv=cv, scoring='roc_auc')
-[ 0.89166667  0.89166667  0.95833333  0.87638889  0.91388889]
-In [29]: # you can see how examples were divided into folds by looking at the test_folds attribute:
-In [30]: print cv.test_folds
-[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
-0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2
-2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
-2 2 2 2 2 2 2 2 2 2 2 3 3 2 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
-3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4 4 4 4 4
-4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
-4 4 4 4 4 4 4 4 4 4]
-In [31]: # hmm... perhaps we should shuffle things a bit...
-In [32]: cv = cross_validation.StratifiedKFold(y, 5, shuffle=True)
-In [33]: print cv.test_folds
-[0 1 1 2 0 1 4 3 4 3 2 0 2 3 2 3 2 0 4 1 1 3 4 1 1 4 1 4 4 2 2 3 0 2 3 1 4
-3 2 0 2 0 1 3 2 0 0 2 3 0 4 2 0 4 3 4 1 1 0 3 2 4 3 2 3 1 1 1 1 4 3 1 1
-2 2 3 3 1 4 2 1 0 2 1 0 2 4 1 0 3 2 3 1 2 2 1 1 0 4 1 3 0 1 1 3 3 0 3 3
-2 0 2 0 2 4 0 1 0 4 4 1 1 0 4 0 1 4 4 3 1 3 3 2 4 3 4 2 4 3 4 1 4 2 0 3
-3 3 0 0 0 4 3 4 2 3 0 1 1 0 0 4 0 4 1 4 0 0 0 0 3 3 0 4 4 2 0 3 3 0 1 2
-2 3 2 1 3 4 4 4 1 1 4 2 1 0 3 1 2 0 0 0 0 2 3 4 3 2 0 0 4 1 3 2 2 0 1 2
-2 4 0 2 1 1 0 4 4 1 4 4 3 4 2 3 3 1 4 2 1 4 1 3 2 1 3 2 1 3 1 3 0 2 2 0
-4 2 2 4 3 3 0 2 0 2]
-In [34]: # if you run division into folds multiple times you will get a different answer:
-In [35]: cv = cross_validation.StratifiedKFold(y, 5, shuffle=True)
-In [36]: print cv.test_folds
-[3 0 2 2 0 2 2 4 1 4 0 2 3 4 2 0 4 0 3 3 4 0 2 0 4 4 0 1 4 4 3 4 1 2 3 3 1
-1 4 4 4 0 0 4 2 0 0 2 0 1 3 1 0 3 4 0 3 0 4 1 1 2 4 2 0 2 3 1 0 3 0 1 2
-2 4 0 0 0 1 4 3 2 2 4 3 1 3 2 0 2 0 0 3 2 1 2 4 4 0 0 4 2 1 4 3 0 4 3 4
-4 0 0 4 2 1 4 4 3 4 1 1 3 0 2 2 3 1 2 3 1 0 4 1 4 1 3 1 3 3 4 4 1 0 0 0
-4 3 1 2 2 3 0 3 2 4 3 2 2 3 0 3 1 0 4 2 3 0 2 4 3 0 4 3 4 3 3 0 3 1 2 2
-3 4 1 0 4 3 4 0 0 0 3 2 2 1 3 4 4 2 3 4 3 2 1 3 0 4 0 1 3 1 2 2 2 2 0 3
-1 1 2 0 1 4 1 1 1 2 2 1 2 3 3 1 4 4 3 4 2 0 2 2 1 1 1 2 0 3 0 2 1 1 3 1
-1 0 1 3 4 4 2 1 1 1]
-In [37]: # if you want to consistently get the same division into folds:
-In [38]: cv = cross_validation.StratifiedKFold(y, 5, shuffle=True, random_state=0)
-In [39]: # this sets the seed for the random number generator.
-</code>
-Let's do grid search for the optimal set of parameters:
-<code python>
-In [40]: from sklearn.grid_search import GridSearchCV
-In [41]: Cs = np.logspace(-2, 3, 6)
-In [42]: classifier = GridSearchCV(estimator=svm.LinearSVC(), param_grid=dict(C=Cs) )
-In [43]: classifier.fit(X, y)
-Out[43]:
-GridSearchCV(cv=None, error_score='raise',
-       estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
-     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
-     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
-     verbose=0),
-       fit_params={}, iid=True, loss_func=None, n_jobs=1,
-       param_grid={'C': array([  1.00000e-02,   1.00000e-01,   1.00000e+00,   1.00000e+01,
-1.00000e+02,   1.00000e+03])},
-       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,
-       verbose=0)
-In [44]:
-In [44]: # print the best accuracy, classifier and parameters:
-In [45]: print classifier.best_score_
-0.844444444444
-In [46]: print classifier.best_estimator_
-LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
-     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
-     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
-     verbose=0)
-In [47]: print classifier.best_params_
-{'C': 1.0}
-In [48]: # performing nested cross validation:
-In [49]: print  cross_validation.cross_val_score(classifier, X, y, cv=5)
-[ 0.7962963   0.81481481  0.88888889  0.83333333  0.83333333]
-In [50]: # if we want to do grid search over multiple parameters:
-In [51]: param_grid = [
-   ....:   {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
-   ....:   {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
-   ....:  ]
-In [52]: classifier = GridSearchCV(estimator=svm.SVC(), param_grid=param_grid)
-In [53]: print cross_validation.cross_val_score(classifier, X, y, cv=5)
-[ 0.7962963   0.83333333  0.88888889  0.7962963   0.87037037]
-</code>
-And to make things easier for you here's the whole thing without the output:
-<file python model_selection.py>
 import numpy as np
 from sklearn import cross_validation
 from sklearn import svm
 from sklearn import metrics
+# read in the heart dataset
 data=np.genfromtxt("../data/heart_scale.data", delimiter=",")
@@ Line 188: / Line 22: @@
 y=data[:,0]
-# let's train/test an svm on the heart dataset:
+# first let's do regular cross-validation:
-X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.4, random_state=0)
+cv = cross_validation.StratifiedKFold(y, 5, shuffle=True, random_state=0)
-classifier = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)
+print (cv.test_folds)
-print classifier.score(X_test, y_test)
-# now let's use cross-validation instead:
+classifier = svm.SVC(kernel='linear', C=1)
-print cross_validation.cross_val_score(classifier, X, y, cv=5, scoring='accuracy')
-# you can obtain scores for other metrics, such as area under the roc curve:
+y_predict = cross_validation.cross_val_predict(classifier, X, y, cv=cv)
-print cross_validation.cross_val_score(classifier, X, y, cv=5, scoring='roc_auc')
+print(metrics.accuracy_score(y, y_predict))
-# you can also obtain the predictions by cross-validation and then compute the accuracy:
-y_predict = cross_validation.cross_val_predict(classifier, X, y, cv=5)
-metrics.accuracy_score(y, y_predict)
-# here's an alternative way of doing cross-validation.
-# first divide the data into folds:
-cv = cross_validation.StratifiedKFold(y, 5)
-# now use these folds:
-print cross_validation.cross_val_score(classifier, X, y, cv=cv, scoring='roc_auc')
-# you can see how examples were divided into folds by looking at the test_folds attribute:
-print cv.test_folds
-# hmm... perhaps we should shuffle things a bit...
-cv = cross_validation.StratifiedKFold(y, 5, shuffle=True)
-print cv.test_folds
-# if you run division into folds multiple times you will get a different answer:
-cv = cross_validation.StratifiedKFold(y, 5, shuffle=True)
-print cv.test_folds
-# if you want to consistently get the same division into folds:
-cv = cross_validation.StratifiedKFold(y, 5, shuffle=True, random_state=0)
-# this sets the seed for the random number generator.
@@ Line 237: / Line 43: @@
 # print the best accuracy, classifier and parameters:
-print classifier.best_score_
+print (classifier.best_score_)
-print classifier.best_estimator_
+print (classifier.best_estimator_)
-print classifier.best_params_
+print (classifier.best_params_)
 # performing nested cross validation:
-print  cross_validation.cross_val_score(classifier, X, y, cv=5)
+y_predict = cross_validation.cross_val_predict(classifier, X, y, cv=cv)
+print(metrics.accuracy_score(y, y_predict))
 # if we want to do grid search over multiple parameters:
@@ Line 250: / Line 59: @@
  ]
 classifier = GridSearchCV(estimator=svm.SVC(), param_grid=param_grid)
-print cross_validation.cross_val_score(classifier, X, y, cv=5)
+y_predict = cross_validation.cross_val_predict(classifier, X, y, cv=cv)
+print(metrics.accuracy_score(y, y_predict))
 </file>

CS545 fall 2016

User Tools

Site Tools

Differences

Page Tools