Let's compare some ensemble methods:
""" ensemble methods in scikit-learn http://scikit-learn.org/stable/modules/ensemble.html """ from sklearn.ensemble import RandomForestClassifier from sklearn import cross_validation from sklearn import metrics from sklearn.datasets import load_digits digits = load_digits() X = digits.data X = X + np.random.binomial(1, 0.5, X.shape) * np.random.uniform(1, 15, X.shape) y = digits.target X /= X.max() cv = cross_validation.StratifiedKFold(y, 5, shuffle=True, random_state=0) # let's look at accuracy as a function of the number of trees: n_estimators = [10, 20, 50, 100, 200, 500] accuracy = [] for estimators in n_estimators : print ("num estimators: ", estimators) classifier = RandomForestClassifier(n_estimators=estimators) y_predict = cross_validation.cross_val_predict(classifier, X, y, cv=cv) accuracy.append(metrics.accuracy_score(y, y_predict)) import matplotlib.pyplot as plt plt.semilogx(n_estimators, accuracy, 'ob') plt.title('performance of random forests on the digits data') plt.xlabel('number of estimators') plt.ylabel('accuracy') plt.show() # let's compare to decision trees: from sklearn.tree import DecisionTreeClassifier classifier = DecisionTreeClassifier(max_depth=None) y_predict = cross_validation.cross_val_predict(classifier, X, y, cv=cv) print(metrics.accuracy_score(y, y_predict)) # bagging from sklearn.ensemble import BaggingClassifier model = BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=200, random_state=0) y_predict = cross_validation.cross_val_predict(classifier, X, y, cv=cv) print(metrics.accuracy_score(y, y_predict)) # AdaBoost from sklearn.ensemble import AdaBoostClassifier classifier = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3), n_estimators=500) y_predict = cross_validation.cross_val_predict(classifier, X, y, cv=cv) print(metrics.accuracy_score(y, y_predict)) # SVM from sklearn import svm from sklearn.grid_search import GridSearchCV param_grid = [ {'C': [1, 10, 100], 'kernel': ['linear']}, {'C': [1, 10, 100], 'gamma': [1, 0.1, 0.01], 'kernel': ['rbf']}, ] classifier = GridSearchCV(estimator=svm.SVC(), param_grid=param_grid) y_predict = cross_validation.cross_val_predict(classifier, X, y, cv=cv) print(metrics.accuracy_score(y, y_predict))