First of all, last week I spent three or four days going over Ensemble Learning. I have not yet studied its principles in depth, so for now I have a macroscopic understanding of the idea of Ensemble Learning and some of its implementation principles.

  • There are the parallel Bagging models and the serial Boosting models, as well as Stacking.
  • The combination strategies are mainly averaging, voting, and a learned combiner (a minimal sketch of the first two follows this list).
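To make the combination strategies concrete, here is a minimal hand-rolled sketch (my own addition in plain NumPy, not part of the notes): hard voting takes the majority class across base learners, averaging (soft voting) averages predicted class probabilities, and a learned combiner would instead fit a meta-model on these outputs, which is what the Stacking demo below does.

import numpy as np

# Class labels predicted by three hypothetical base classifiers for 4 samples
preds = np.array([[0, 1, 1, 2],
                  [0, 1, 2, 2],
                  [1, 1, 2, 2]])

# Hard voting: majority class per sample
hard_vote = np.array([np.bincount(col).argmax() for col in preds.T])
print(hard_vote)  # [0 1 2 2]

# Averaging / soft voting: mean of predicted probabilities, then argmax
# shape: (3 models, 2 samples, 3 classes)
probas = np.array([[[0.7, 0.2, 0.1], [0.1, 0.8, 0.1]],
                   [[0.6, 0.3, 0.1], [0.2, 0.5, 0.3]],
                   [[0.3, 0.4, 0.3], [0.1, 0.2, 0.7]]])
soft_vote = probas.mean(axis=0).argmax(axis=1)
print(soft_vote)  # [0 1]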

I am also practicing taking handwritten notes; the specific notes are attached below:

2 Ensemble Learning Demo

2.1 Voting

from sklearn import datasets, model_selection
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

digits = datasets.load_digits()
x_train, x_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.7, random_state=1)

print(">>>VotingClassifierTest<<<")
knn = KNeighborsClassifier(n_neighbors=1)
dtree = DecisionTreeClassifier()
lr = LogisticRegression(max_iter=10000)
svm = SVC()
voting_clf = VotingClassifier([('knn', knn), ('dt', dtree), ('lr', lr), ('svm', svm)])

for clf in (knn, dtree, lr, svm, voting_clf):
    clf.fit(x_train, y_train)
    print(clf.__class__.__name__, "Accuracy: %0.5f%%" % (clf.score(x_test, y_test) * 100))
    # scores = model_selection.cross_val_score(clf, x_data, y_data, cv=5, scoring='accuracy')
    # print(clf.__class__.__name__, "acc = %0.5f" % scores.mean())

# for clf in (knn, dtree, lr, svm, voting_clf):
#     clf.fit(x_data, y_data)
#     y_predict = clf.predict(x_data)
#     print(clf.__class__.__name__, "acc = %0.5f" % accuracy_score(y_data, y_predict))
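For comparison (my own addition, not part of the original demo): VotingClassifier uses hard majority voting by default, while voting='soft' averages the predicted class probabilities; SVC then needs probability=True so it can provide predict_proba.

soft_voting_clf = VotingClassifier(
    [('knn', KNeighborsClassifier(n_neighbors=1)),
     ('dt', DecisionTreeClassifier()),
     ('lr', LogisticRegression(max_iter=10000)),
     ('svm', SVC(probability=True))],  # probability=True enables predict_proba for SVC
    voting='soft')
soft_voting_clf.fit(x_train, y_train)
print("Soft voting accuracy: %0.5f%%" % (soft_voting_clf.score(x_test, y_test) * 100))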

2.2 Bagging

from sklearn import datasets
from sklearn.ensemble import BaggingClassifier
from sklearn import tree
from sklearn.model_selection import train_test_split

digits = datasets.load_digits()
x_train, x_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, random_state=55)

print(">>>BaggingClassifierTest<<<")
dtree = tree.DecisionTreeClassifier()
bagging_tree = BaggingClassifier(base_estimator=dtree, n_estimators=800)  # Train many decision tree classifiers on bootstrap samples with the Bagging algorithm

dtree.fit(x_train, y_train)
print("DecisionTree accuracy: % 0.5 f % %" % (dtree.score(x_test, y_test) * 100))

bagging_tree.fit(x_train, y_train)
print("BaggingTree accuracy: % 0.5 f % %" % (bagging_tree.score(x_test, y_test) * 100))
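A side note (my own addition): because Bagging trains each tree on a bootstrap sample, every tree leaves out roughly a third of the training data, and BaggingClassifier can use those left-out samples for an out-of-bag estimate via oob_score=True (newer scikit-learn versions rename base_estimator to estimator).

bagging_oob = BaggingClassifier(base_estimator=tree.DecisionTreeClassifier(),
                                n_estimators=800, oob_score=True)
bagging_oob.fit(x_train, y_train)
print("Out-of-bag accuracy estimate: %0.5f%%" % (bagging_oob.oob_score_ * 100))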

2.3 AdaBoost

from sklearn import datasets
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
# Since random sampling is not used here, the data set needs to be split
digits = datasets.load_digits()
x_train, x_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.7, random_state=88)

print(">>>AdaBoostClassifierTest<<<")

# The base learner's hyperparameters (e.g. max_depth) are set on the DecisionTreeClassifier passed to AdaBoost
dtree = DecisionTreeClassifier()
adaboost_clf = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=2),
                                  n_estimators=600, learning_rate=0.5, algorithm="SAMME")

dtree.fit(x_train, y_train)
print("DecisionTree accuracy: % 0.5 f % %" % (dtree.score(x_test, y_test) * 100))

adaboost_clf.fit(x_train, y_train)
print("AdaBoost accuracy: % 0.5 f % %" % (adaboost_clf.score(x_test, y_test) * 100))
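Since Boosting builds the ensemble serially, it can be interesting to watch the accuracy grow round by round. AdaBoostClassifier exposes staged_score for this (my own addition, not in the original demo):

# Print the hold-out accuracy after every 100 boosting rounds
for i, score in enumerate(adaboost_clf.staged_score(x_test, y_test), start=1):
    if i % 100 == 0:
        print("after %d rounds: %0.5f%%" % (i, score * 100))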

2.4 XGBoost

from sklearn import datasets
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

digits = datasets.load_digits()
x_train, x_test, y_train, y_test = train_test_split(digits.data, digits.target, test_size=0.3, random_state=33)

print(">>>XGBoostClassifierTest<<<")

xgb = XGBClassifier()
dt = DecisionTreeClassifier()

dt.fit(x_train, y_train)
y_predict = dt.predict(x_test)
accuracy = accuracy_score(y_test, y_predict)
print("DecisionTree accuarcy: %.5f%%" % (accuracy * 100.0))

xgb.fit(x_train, y_train)
y_predict = xgb.predict(x_test)
accuracy = accuracy_score(y_test, y_predict)
print("XGBoost accuarcy: %.5f%%" % (accuracy * 100.0))

# Feature importance
# import matplotlib.pyplot as plt
# from xgboost import plot_importance
# fig, ax = plt.subplots(figsize=(10, 15))
# plot_importance(xgb, height=0.5, max_num_features=64, ax=ax)
# plt.show()
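A single train/test split can be noisy, so as a sanity check (my own addition, mirroring the commented-out cross_val_score in the Voting demo) the comparison can also be run with 5-fold cross-validation:

from sklearn.model_selection import cross_val_score

for clf in (DecisionTreeClassifier(), XGBClassifier()):
    scores = cross_val_score(clf, digits.data, digits.target, cv=5, scoring='accuracy')
    print(clf.__class__.__name__, "5-fold CV accuracy: %0.5f%%" % (scores.mean() * 100))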

2.5 CatBoost

from catboost import CatBoostClassifier, Pool
from sklearn import datasets
from sklearn.model_selection import train_test_split
import numpy as np

train_data = Pool(data=[[1, 4, 5, 6],
                        [4, 5, 6, 7],
                        [30, 40, 50, 60]],
                  label=[1, 1, -1],
                  weight=[0.1, 0.2, 0.3])

model = CatBoostClassifier(iterations=10)

model.fit(train_data, plot=True)
preds_class = model.predict(train_data)
print("% 0.5 f % %" % (model.score(train_data) * 100))

# import matplotlib.pyplot as plt
# fea_ = model.feature_importances_
# fea_name = model.feature_names_
# plt.figure(figsize=(10, 10))
# plt.barh(fea_name, fea_, height=0.5)

# plt.show()

Note: CatBoost can draw a live training chart, but that needs to be run in a Jupyter Notebook. Below is a quick, rough run on the Titanic dataset, because CatBoost is well suited to this kind of data with many (largely categorical) features and gives decent predictions without tuning. My code is still very crude and the accuracy is low... let's optimize it later.

from catboost.datasets import titanic
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
import numpy as np

data_train, data_test = titanic()
print(data_train.head(10))

null_value_stats = data_train.isnull().sum(axis=0)
print(null_value_stats[null_value_stats != 0])
data_train.fillna(-999, inplace=True)
data_test.fillna(-999, inplace=True)
X = data_train.drop('Survived', axis=1)
y = data_train.Survived
print("data:", X)
print("label:", y)
print(X.dtypes)

# remove = ['PassengerId', 'Cabin', 'Name', 'Ticket']
# X = data_train.drop(remove, axis=1)
# X = data_train.dropna(how='any', axis='rows')
# y = X.pop('Survived')
# print(X.head(10))
#
x_train, x_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=66)
cat_features = np.where(X.dtypes != float)[0]
# cat_features = [0, 1, 6]
train_pool = Pool(x_train, y_train, cat_features=cat_features)
test_pool = Pool(x_test, y_test, cat_features=cat_features)

cat_clf = CatBoostClassifier()
cat_clf.fit(train_pool, eval_set=test_pool, plot=True, silent=False, use_best_model=True)
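To put a number on the "low accuracy" mentioned above, the hold-out split can be scored explicitly (my own addition; it assumes the test_pool and y_test defined above):

from sklearn.metrics import accuracy_score

y_pred = cat_clf.predict(test_pool)
print("Hold-out accuracy: %0.5f%%" % (accuracy_score(y_test, y_pred) * 100))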

2.6 Stacking

from sklearn import datasets, model_selection
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

digits = datasets.load_digits()
print(digits)
x_train, x_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.7, random_state=66)

print(">>>StackingClassifierTest<<<")

knn = KNeighborsClassifier(n_neighbors=1)
dtree = DecisionTreeClassifier()
lr = LogisticRegression(max_iter=10000)
svm = SVC()
stacking_clf = StackingClassifier(estimators=[('knn', knn), ('dt', dtree), ('lr', lr)], final_estimator=svm)

for clf in (knn, dtree, lr, svm, stacking_clf):
    clf.fit(x_train, y_train)
    print(clf.__class__.__name__, "Accuracy: %0.5f%%" % (clf.score(x_test, y_test) * 100))
    # scores = model_selection.cross_val_score(clf, x_data, y_data, cv=5, scoring='accuracy')
    # print(clf.__class__.__name__, "acc = %0.5f" % scores.mean())
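By default StackingClassifier trains the final estimator only on the base learners' out-of-fold predictions; setting passthrough=True additionally feeds it the original features. A quick variant for comparison (my own addition, not part of the original demo):

stacking_pt = StackingClassifier(estimators=[('knn', KNeighborsClassifier(n_neighbors=1)),
                                             ('dt', DecisionTreeClassifier()),
                                             ('lr', LogisticRegression(max_iter=10000))],
                                 final_estimator=SVC(), passthrough=True)
stacking_pt.fit(x_train, y_train)
print("Stacking (passthrough) accuracy: %0.5f%%" % (stacking_pt.score(x_test, y_test) * 100))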
