# Compare SVM classification accuracy on scikit-learn's bundled digits
# dataset using (a) all original features and (b) the first 10 principal
# components obtained with PCA.
#
# The classifier is a grid-searched SVC, so each cross_val_score call below
# performs nested cross-validation: an outer stratified 5-fold loop for the
# reported accuracy and an inner grid search that picks the SVC
# hyper-parameters on each outer training split.
#
# NOTE: the script targets the `sklearn.model_selection` API. The former
# `sklearn.cross_validation` and `sklearn.grid_search` modules were removed
# from scikit-learn and can no longer be imported.
import numpy as np
import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.svm import SVC

digits = datasets.load_digits()
X = digits.data
y = digits.target

# If you want to standardize the data, uncomment the following lines
# (this is what the `preprocessing` import above is for).
# scaler = preprocessing.StandardScaler().fit(X)
# X = scaler.transform(X)

# Project the data onto its first 10 principal components and report how
# much of the variance each component explains.
pca = PCA(n_components=10)
X_reduced = pca.fit_transform(X)
print(pca.explained_variance_ratio_)

# A scatter plot in the space of the first two principal components,
# coloured by class label.
# NOTE(review): nothing in this script calls plt.show() or plt.savefig(), so
# the figure is only visible in an interactive / inline plotting session.
plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=y, cmap=plt.cm.Paired)

# Let's see if this feature representation is useful.
# Rescale the original features by their global maximum before fitting the
# SVMs. This is done in place and after the PCA fit, so X_reduced is not
# affected by it.
X /= X.max()

param_grid = [
    {'C': [1, 10, 100], 'kernel': ['linear']},
    {'C': [1, 10, 100], 'gamma': [0.01, 0.001, 0.0001], 'kernel': ['rbf']},
]

# cv=3 pins the inner grid search to the 3-fold default of the legacy
# `sklearn.grid_search.GridSearchCV` this script was written against;
# current releases would otherwise default to 5-fold.
classifier = GridSearchCV(estimator=SVC(), param_grid=param_grid, cv=3)

# Outer evaluation folds. The splitter no longer takes the labels in its
# constructor; cross_val_score passes `y` to it when generating the splits.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# Accuracy with all the features:
print(np.mean(cross_val_score(classifier, X, y, cv=cv)))

# Accuracy with the PCA features:
print(np.mean(cross_val_score(classifier, X_reduced, y, cv=cv)))