• 机器学习:集成学习(Python)


    一、Adaboost算法

    1.1 Adaboost分类算法

    adaboost_discrete_c.py

    1. import numpy as np
    2. import copy
    3. from ch4.decision_tree_C import DecisionTreeClassifier
    class AdaBoostClassifier:
        """
        Discrete AdaBoost classifier built on arbitrary base classifiers.

        Base learners can be supplied in two ways:
        1. Homogeneous: pass a single estimator; it is deep-copied ``n_estimators`` times.
        2. Heterogeneous: pass a list such as [logistic_regression, svm, cart, ...];
           ``n_estimators`` is then taken from the length of that list.

        Every base learner must provide ``fit(x, y, sample_weight=...)``,
        ``predict(x)`` and ``predict_proba(x)``.
        """

        _MAX_ALPHA = 10.0  # cap on a learner's weight coefficient

        def __init__(self, base_estimator=None, n_estimators=10, learning_rate=1.0):
            """
            :param base_estimator: a single base learner, a list of base learners, or None
            :param n_estimators: number of base learners T (ignored when a list is passed)
            :param learning_rate: the i-th learner's coefficient is scaled by learning_rate ** i
            """
            self.base_estimator = base_estimator
            self.n_estimators = n_estimators
            self.learning_rate = learning_rate
            # Default base learner: a decision tree of depth 2.
            if self.base_estimator is None:
                self.base_estimator = DecisionTreeClassifier(max_depth=2)
            if isinstance(self.base_estimator, list):
                # Heterogeneous learners: the list defines the ensemble size.
                self.n_estimators = len(self.base_estimator)
            else:
                # Homogeneous learners: independent deep copies of the prototype.
                self.base_estimator = [copy.deepcopy(self.base_estimator)
                                       for _ in range(self.n_estimators)]
            self.estimator_weights = []  # weight coefficient of every base learner

        def fit(self, x_train, y_train):
            """
            Train the base learners one after another on re-weighted samples and
            compute each learner's error rate and weight coefficient alpha.

            :param x_train: training samples, 2-D array (m * k)
            :param y_train: training targets
            :return: None
            """
            x_train, y_train = np.asarray(x_train), np.asarray(y_train)
            n_samples, n_class = x_train.shape[0], len(set(y_train))
            # Weights start at 1.0 per sample (they always sum to n_samples).
            sample_weight = np.ones(n_samples)
            # Start from a clean list so that calling fit() again does not
            # append to the coefficients of a previous run.
            self.estimator_weights = []
            for estimator in self.base_estimator:
                # 1. Train on the current weight distribution and predict.
                estimator.fit(x_train, y_train, sample_weight=sample_weight)
                # 1 where the prediction is correct, 0 where it is wrong.
                is_correct = (estimator.predict(x_train) == y_train).astype(int)
                # 2. Weighted classification error rate.
                error_rate = sample_weight.dot(1.0 - is_correct) / n_samples
                if error_rate > 0.5:
                    self.estimator_weights.append(0)  # this learner is ignored
                    continue
                # 3. Weight coefficient of the learner.
                if error_rate <= 0:
                    # A perfect learner would give log(inf); use the cap directly
                    # instead of dividing by zero.
                    alpha_rate = self._MAX_ALPHA
                else:
                    alpha_rate = (0.5 * np.log((1 - error_rate) / error_rate + 1e-8)
                                  + np.log(n_class - 1))
                    alpha_rate = min(self._MAX_ALPHA, alpha_rate)
                self.estimator_weights.append(alpha_rate)
                # 4. Update sample weights: (-1) ** (1 - is_correct) is +1 for a
                #    correct sample (weight shrinks) and -1 for a wrong one (weight grows).
                sample_weight = sample_weight * np.exp(
                    -1.0 * alpha_rate * np.power(-1.0, 1 - is_correct))
                sample_weight = sample_weight / np.sum(sample_weight) * n_samples
            # 5. Apply the learning rate to the coefficients.
            self.estimator_weights = [weight * np.power(self.learning_rate, i)
                                      for i, weight in enumerate(self.estimator_weights)]

        def predict_proba(self, x_test):
            """
            Class probabilities by soft voting: the base learners' probabilities
            are combined linearly with their weight coefficients and every row is
            normalised to sum to one.

            :param x_test: test samples
            :return: array (n_samples, n_classes)
            """
            x_test = np.asarray(x_test)
            weighted = [estimator.predict_proba(x_test) * weight
                        for estimator, weight in zip(self.base_estimator, self.estimator_weights)]
            y_hat_prob = np.sum(weighted, axis=0)
            return y_hat_prob / y_hat_prob.sum(axis=1, keepdims=True)

        def predict(self, x_test):
            """
            Predict the class of every test sample as the column index with the
            largest combined probability.

            :param x_test: test samples
            :return: 1-D array of column indices
            """
            return np.argmax(self.predict_proba(x_test), axis=1)

    1.2 Adaboost回归算法

    adaboost_regressor.py

    1. import numpy as np
    2. import copy
    3. from ch4.decision_tree_R import DecisionTreeRegression # CART
    class AdaBoostRegressior:
        """
        AdaBoost regression (in the style of AdaBoost.R2).

        1. Base learners may be homogeneous (one prototype, deep-copied) or
           heterogeneous (a list of learners).
        2. The regression error rate is built from a relative error: linear,
           square or exponential.
        3. Predictions are combined by weighted median or weighted mean.

        Every base learner must provide ``fit(x, y, sample_weight=...)`` and ``predict(x)``.
        """

        _EPS = 1e-8  # lower bound on the error rate, keeps log(1 / beta) finite

        def __init__(self, base_estimator=None, n_estimators=10, learning_rate=1.0,
                     loss="square", comb_strategy="weight_median"):
            """
            :param base_estimator: a single base learner, a list of base learners, or None
            :param n_estimators: number of base learners T (ignored when a list is passed)
            :param learning_rate: the i-th learner's weight is scaled by learning_rate ** i
            :param loss: relative-error loss: "linear", "square" or "exp"
            :param comb_strategy: "weight_median" or "weight_mean"
            """
            self.base_estimator = base_estimator
            self.n_estimators = n_estimators
            self.learning_rate = learning_rate
            self.loss = loss  # loss used for the relative error
            self.comb_strategy = comb_strategy  # combination strategy
            # Default base learner: a regression tree of depth 2.
            if self.base_estimator is None:
                self.base_estimator = DecisionTreeRegression(max_depth=2)
            if isinstance(self.base_estimator, list):
                # Heterogeneous learners: the list defines the ensemble size.
                self.n_estimators = len(self.base_estimator)
            else:
                # Homogeneous learners: independent deep copies of the prototype.
                self.base_estimator = [copy.deepcopy(self.base_estimator)
                                       for _ in range(self.n_estimators)]
            self.estimator_weights = []  # weight of every base learner

        def _cal_loss(self, y_true, y_hat):
            """
            Relative error of every sample according to ``self.loss``.

            :param y_true: true values
            :param y_hat: predicted values
            :return: array of relative errors in [0, 1]
            :raises ValueError: if ``self.loss`` is not linear, square or exp
            """
            loss = self.loss.lower()
            if loss not in ("linear", "square", "exp"):
                raise ValueError("仅支持linear、square和exp...")
            errors = np.abs(y_true - y_hat)  # absolute errors
            max_error = np.max(errors)
            if max_error == 0:
                # Perfect fit: every relative error is zero (avoids 0 / 0).
                return np.zeros(errors.shape)
            if loss == "linear":
                return errors / max_error
            if loss == "square":
                return (y_true - y_hat) ** 2 / max_error ** 2
            return 1 - np.exp(-errors / max_error)

        def fit(self, x_train, y_train):
            """
            Train the T base learners:
            1. fit the learner on the current sample-weight distribution;
            2. compute the relative errors and the regression error rate;
            3. compute the learner's confidence beta = e / (1 - e);
            4. update the sample weights for the next round.

            :param x_train: training samples
            :param y_train: training targets
            :return: None
            """
            x_train, y_train = np.asarray(x_train), np.asarray(y_train)
            n_samples = x_train.shape[0]
            # Weights start at 1.0 per sample (they always sum to n_samples).
            sample_weight = np.ones(n_samples)
            # Collected locally so that fit() can be called repeatedly.
            betas = []
            for estimator in self.base_estimator:
                # 1. Train on the weighted samples and predict the training set.
                estimator.fit(x_train, y_train, sample_weight=sample_weight)
                y_hat = estimator.predict(x_train)
                # 2. Relative errors and weighted regression error rate.
                errors = self._cal_loss(y_train, y_hat)
                error_rate = np.dot(errors, sample_weight / n_samples)
                # Keep the rate in [eps, 0.5]: 0 would give an infinite learner
                # weight, and anything above 0.5 a negative one. At 0.5 the
                # learner gets weight 0 and leaves the sample weights unchanged.
                error_rate = min(max(error_rate, self._EPS), 0.5)
                # 3. Confidence of the current learner.
                beta = error_rate / (1 - error_rate)
                betas.append(beta)
                # 4. Samples with a small relative error are down-weighted.
                sample_weight = sample_weight * np.power(beta, 1 - errors)
                sample_weight = sample_weight / np.sum(sample_weight) * n_samples
            # 5. Learner weights log(1 / beta), scaled by learning_rate ** i.
            weights = np.log(1 / np.asarray(betas, dtype=float))
            weights = weights * np.power(self.learning_rate, np.arange(len(weights)))
            self.estimator_weights = weights

        def _combination_weights(self):
            """Learner weights as a float array; uniform if they do not sum to a positive value."""
            weights = np.asarray(self.estimator_weights, dtype=float)
            if np.sum(weights) <= 0:
                return np.ones(len(weights))
            return weights

        def predict(self, x_test):
            """
            Predict by combining the base learners with the weighted mean or the
            weighted median, depending on ``self.comb_strategy``.

            :param x_test: test samples
            :return: 1-D array of predictions
            :raises ValueError: if ``self.comb_strategy`` is not supported
            """
            if self.comb_strategy not in ("weight_mean", "weight_median"):
                raise ValueError("comb_strategy must be 'weight_mean' or 'weight_median'")
            x_test = np.asarray(x_test)
            weights = self._combination_weights()
            # Raw predictions of the T learners, shape (n_samples, T).
            y_hat_mat = np.array([estimator.predict(x_test)
                                  for estimator in self.base_estimator]).T
            if self.comb_strategy == "weight_mean":
                # Normalise a local copy; the stored weights are left untouched.
                return y_hat_mat.dot(weights / np.sum(weights))
            # Weighted median: sort each sample's predictions, accumulate the
            # learner weights in that order and take the first prediction whose
            # cumulative weight reaches half of the total.
            sorted_idx = np.argsort(y_hat_mat, axis=1)
            weight_cdf = np.cumsum(weights[sorted_idx], axis=1)
            median_or_above = weight_cdf >= 0.5 * weight_cdf[:, -1][:, np.newaxis]
            median_idx = np.argmax(median_or_above, axis=1)
            rows = np.arange(y_hat_mat.shape[0])
            median_estimators = sorted_idx[rows, median_idx]
            return y_hat_mat[rows, median_estimators]

    1.3 SAMME.R算法

    samme_r_muti_classifier.py 

    1. import numpy as np
    2. import copy
    3. from ch4.decision_tree_C import DecisionTreeClassifier
    class SAMMERClassifier:
        """
        SAMME.R multi-class boosting: the extension of SAMME to real-valued
        base-learner outputs, here the predicted class probabilities.

        Every base learner must provide ``fit(x, y, sample_weight=...)`` and
        ``predict_proba(x)``. Labels are used directly as column indices, so
        they are assumed to be the integers 0 .. K-1.
        """

        def __init__(self, base_estimator=None, n_estimators=10):
            """
            :param base_estimator: a single base learner, a list of base learners, or None
            :param n_estimators: number of base learners T (ignored when a list is passed)
            """
            self.base_estimator = base_estimator
            self.n_estimators = n_estimators
            # Default base learner: a decision tree of depth 2.
            if self.base_estimator is None:
                self.base_estimator = DecisionTreeClassifier(max_depth=2)
            if isinstance(self.base_estimator, list):
                # Heterogeneous learners: the list defines the ensemble size.
                self.n_estimators = len(self.base_estimator)
            else:
                # Homogeneous learners: independent deep copies of the prototype.
                self.base_estimator = [copy.deepcopy(self.base_estimator)
                                       for _ in range(self.n_estimators)]
            self.estimator_weights = []  # kept for interface parity; not used by SAMME.R
            self.n_samples, self.n_class = None, None  # sample count and class count

        def _target_encoding(self, y_train):
            """
            Encode the targets: 1 in the column of the sample's class and
            -1 / (K - 1) in every other column. Also records ``n_samples``
            and ``n_class`` on the instance.

            :param y_train: integer class labels
            :return: array (n_samples, n_class)
            """
            y_train = np.asarray(y_train)
            self.n_samples, self.n_class = len(y_train), len(set(y_train))
            target = -1 / (self.n_class - 1) * np.ones((self.n_samples, self.n_class))
            target[np.arange(self.n_samples), y_train] = 1
            return target

        def fit(self, x_train, y_train):
            """
            Train every base learner and update the sample weights from its
            predicted class probabilities.

            :param x_train: training samples, 2-D array (m * k)
            :param y_train: training targets
            :return: None
            """
            x_train, y_train = np.asarray(x_train), np.asarray(y_train)
            target = self._target_encoding(y_train)
            # Weights start at 1.0 per sample (they always sum to n_samples).
            sample_weight = np.ones(self.n_samples)
            c = (self.n_class - 1) / self.n_class
            for estimator in self.base_estimator:
                # 1. Train on the weighted samples and get class probabilities.
                estimator.fit(x_train, y_train, sample_weight=sample_weight)
                pred_p = estimator.predict_proba(x_train)
                # Probabilities below eps are raised to eps so that log() stays finite.
                np.clip(pred_p, np.finfo(pred_p.dtype).eps, None, out=pred_p)
                # 2. Update and renormalise the sample weights.
                sample_weight = sample_weight * np.exp(
                    -c * (target * np.log(pred_p)).sum(axis=1))
                sample_weight = sample_weight / np.sum(sample_weight) * self.n_samples

        @staticmethod
        def softmax_func(x):
            """
            Row-wise softmax. Each row is shifted by its own maximum so that a
            row far below the largest entry of the array cannot underflow to 0 / 0.

            :param x: array (batch_size, n_classes)
            :return: array (batch_size, n_classes) whose rows sum to one
            """
            exps = np.exp(x - np.max(x, axis=1, keepdims=True))
            return exps / np.sum(exps, axis=1, keepdims=True)

        def predict_proba(self, x_test):
            """
            Class probabilities of the test samples: the per-learner scores
            (K - 1) * (log p_k - mean_k log p) are summed over all learners and
            mapped to probabilities with a row-wise softmax.

            :param x_test: test samples
            :return: array (n_samples, n_class)
            """
            x_test = np.asarray(x_test)
            C_x = np.zeros((x_test.shape[0], self.n_class))
            for estimator in self.base_estimator:
                y_prob = estimator.predict_proba(x_test)
                np.clip(y_prob, np.finfo(y_prob.dtype).eps, None, out=y_prob)
                y_ln = np.log(y_prob)
                C_x += (self.n_class - 1) * (
                    y_ln - np.sum(y_ln, axis=1, keepdims=True) / self.n_class)
            # softmax is monotone per row, so predict() (argmax) is unaffected.
            return self.softmax_func(C_x)

        def predict(self, x_test):
            """
            Predict the class of every test sample as the column index with the
            largest probability.

            :param x_test: test samples
            :return: 1-D array of column indices
            """
            return np.argmax(self.predict_proba(x_test), axis=1)

    1.4 Adaboost分类算法测试

    test_adaboost_c.py

    1. from sklearn.datasets import make_classification
    2. from sklearn.metrics import classification_report
    3. from ch4.decision_tree_C import DecisionTreeClassifier # 基学习器,决策树
    4. from ch3.logistic_regression_2class import LogisticRegression # 逻辑回归
    5. from ch6.svm_smo_classifier import SVMClassifier # 支持向量机
    6. from adaboost_discrete_c import AdaBoostClassifier
    7. from ch8.plt_decision_function import plot_decision_function
    # Two-feature binary data set used by both experiments.
    X, y = make_classification(
        n_samples=300, n_features=2, n_informative=1, n_redundant=0, n_repeated=0,
        n_classes=2, n_clusters_per_class=1, class_sep=1, random_state=42,
    )


    def _fit_report_plot(model, banner):
        """Fit ``model`` on (X, y), print its learner weights and a report, then plot its boundary."""
        model.fit(X, y)
        print(banner, model.estimator_weights)
        print(classification_report(y, model.predict(X)))
        plot_decision_function(X, y, model)


    # Homogeneous ensemble: ten copies of the same decision tree.
    base_tree = DecisionTreeClassifier(max_depth=3, is_feature_all_R=True, max_bins=20)
    ada_bc = AdaBoostClassifier(base_estimator=base_tree, n_estimators=10, learning_rate=1.0)
    _fit_report_plot(ada_bc, "基学习器的权重系数:\n")

    # Heterogeneous ensemble: logistic regression, CART and SVM.
    log_reg = LogisticRegression(batch_size=20, max_epochs=5)
    cart = DecisionTreeClassifier(max_depth=4, is_feature_all_R=True)
    svm = SVMClassifier(C=5.0, max_epochs=20)
    ada_bc2 = AdaBoostClassifier(base_estimator=[log_reg, cart, svm], learning_rate=1.0)
    _fit_report_plot(ada_bc2, "异质基学习器的权重系数:")

     test_adaboost_c2.py

    1. import numpy as np
    2. import matplotlib.pyplot as plt
    3. from sklearn.preprocessing import StandardScaler
    4. from sklearn.model_selection import KFold
    5. from sklearn.metrics import accuracy_score
    6. from sklearn.datasets import make_blobs
    7. from ch4.decision_tree_C import DecisionTreeClassifier
    8. from ch8.adaboost_discrete_c import AdaBoostClassifier
    # Five-class blob data, standardised.
    X, y = make_blobs(n_samples=1000, n_features=10, centers=5,
                      cluster_std=[1.5, 2, 0.9, 3, 2.8], random_state=0)
    X = StandardScaler().fit_transform(X)
    base_em = DecisionTreeClassifier(max_depth=4, is_feature_all_R=True, max_bins=10)


    def _mean_cv_accuracy(n):
        """Mean 10-fold cross-validation accuracy of AdaBoost with ``n`` base learners."""
        fold_scores = []
        for idx_train, idx_test in KFold(n_splits=10).split(X, y):
            classifier = AdaBoostClassifier(base_estimator=base_em, n_estimators=n, learning_rate=1)
            classifier.fit(X[idx_train, :], y[idx_train])
            fold_scores.append(accuracy_score(y[idx_test], classifier.predict(X[idx_test, :])))
        return np.mean(fold_scores)


    # Accuracy for every ensemble size T = 1 .. 20.
    learner_counts = range(1, 21)
    acc_scores = []
    for n in learner_counts:
        acc_scores.append(_mean_cv_accuracy(n))
        print(n, ":", acc_scores[-1])

    plt.figure(figsize=(7, 5))
    plt.plot(learner_counts, acc_scores, "ko-", lw=1)
    plt.xlabel("Number of Estimations", fontdict={"fontsize": 12})
    plt.ylabel("Accuracy Score", fontdict={"fontsize": 12})
    plt.title("Cross Validation Scores of Different Number of Base Learners", fontdict={"fontsize": 14})
    plt.grid(ls=":")
    plt.show()

    1.5 Adaboost回归算法测试

    test_adaboost_regressor.py

    1. import numpy as np
    2. import matplotlib.pyplot as plt
    3. from sklearn.datasets import fetch_california_housing
    4. from sklearn.metrics import r2_score
    5. from sklearn.preprocessing import StandardScaler
    6. from sklearn.model_selection import train_test_split
    7. from ch4.decision_tree_R import DecisionTreeRegression
    8. from ch8.adaboost_regressor import AdaBoostRegressior
    # California housing data, standardised and split 75 / 25.
    housing = fetch_california_housing()
    X, y = housing.data, housing.target
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
    base_ht = DecisionTreeRegression(max_bins=50, max_depth=5)
    plt.figure(figsize=(14, 15))


    def train_plot(cs, loss, i):
        """
        Train an AdaBoost regressor and draw true vs. predicted test targets
        in the i-th panel of the 2 x 3 grid.

        :param cs: combination strategy, "weight_mean" or "weight_median"
        :param loss: relative-error loss, "linear", "square" or "exp"
        :param i: zero-based panel index (0 .. 5)
        """
        abr = AdaBoostRegressior(base_estimator=base_ht, n_estimators=30,
                                 comb_strategy=cs, loss=loss)
        abr.fit(X_train, y_train)
        y_hat = abr.predict(X_test)
        # plt.subplot selects one panel of the current figure; plt.subplots
        # would open a brand-new figure with 231 + i rows on every call.
        plt.subplot(231 + i)
        idx = np.argsort(y_test)  # order the samples by true value
        plt.plot(y_test[idx], "k-", lw=1.5, label="Test True")
        plt.plot(y_hat[idx], "r-", lw=1, label="Predict")
        plt.legend(frameon=False)
        plt.title("%s, %s, R2 = %.5f, MSE = %.5f" %
                  (cs, loss, r2_score(y_test, y_hat), ((y_test - y_hat) ** 2).mean()))
        plt.xlabel("Test Samples Serial Number", fontdict={"fontsize": 12})
        plt.ylabel("True VS Predict", fontdict={"fontsize": 12})
        plt.grid(ls=":")
        print(cs, loss)


    # One panel per (loss, combination strategy) pair.
    loss_func = ["linear", "square", "exp"]
    comb_strategy = ["weight_mean", "weight_median"]
    i = 0
    for loss in loss_func:
        for cs in comb_strategy:
            train_plot(cs, loss, i)
            i += 1
    plt.show()

     1.6 SAMME算法测试

    test_samme_r_c.py

    1. import numpy as np
    2. import matplotlib.pyplot as plt
    3. from sklearn.preprocessing import StandardScaler
    4. from sklearn.model_selection import KFold
    5. from sklearn.metrics import accuracy_score
    6. from sklearn.datasets import make_blobs
    7. # from ch4.decision_tree_C import DecisionTreeClassifierR
    8. from sklearn.tree import DecisionTreeClassifier
    9. from ch8.samme_r_muti_classifier import SAMMERClassifier
    # Five-class blob data, standardised.
    X, y = make_blobs(n_samples=1000, n_features=10, centers=5,
                      cluster_std=[1.5, 2, 0.9, 3, 2.8], random_state=0)
    X = StandardScaler().fit_transform(X)
    base_em = DecisionTreeClassifier(max_depth=4)


    def _mean_cv_accuracy(n):
        """Mean 10-fold cross-validation accuracy of SAMME.R with ``n`` base learners."""
        fold_scores = []
        for idx_train, idx_test in KFold(n_splits=10).split(X, y):
            classifier = SAMMERClassifier(base_estimator=base_em, n_estimators=n)
            classifier.fit(X[idx_train, :], y[idx_train])
            fold_scores.append(accuracy_score(y[idx_test], classifier.predict(X[idx_test, :])))
        return np.mean(fold_scores)


    # Accuracy for every ensemble size T = 1 .. 20.
    learner_counts = range(1, 21)
    acc_scores = []
    for n in learner_counts:
        acc_scores.append(_mean_cv_accuracy(n))
        print(n, ":", acc_scores[-1])

    plt.figure(figsize=(7, 5))
    plt.plot(learner_counts, acc_scores, "ko-", lw=1)
    plt.xlabel("Number of Estimations", fontdict={"fontsize": 12})
    plt.ylabel("Accuracy Score", fontdict={"fontsize": 12})
    plt.title("Cross Validation Scores of Different Number of Base Learners", fontdict={"fontsize": 14})
    plt.grid(ls=":")
    plt.show()

    1.7 可视化分类边界函数

     plt_decision_function.py

    1. import matplotlib.pyplot as plt
    2. import numpy as np
    def plot_decision_function(X, y, clf, is_show=True):
        """
        Draw the classification regions of ``clf`` over the first two features
        of ``X`` and overlay the samples coloured by ``y``.

        :param X: samples; columns 0 and 1 are plotted
        :param y: class of every sample
        :param clf: fitted model exposing ``predict``
        :param is_show: if True, open a new figure and show it; pass False when
                        the caller manages the figure (e.g. for subplots)
        :return: None
        """
        if is_show:
            plt.figure(figsize=(7, 5))
        # Plot range: the data range of each feature padded by 1 on both sides.
        first, second = X[:, 0], X[:, 1]
        axis_1 = np.linspace(first.min() - 1, first.max() + 1, 100)
        axis_2 = np.linspace(second.min() - 1, second.max() + 1, 100)
        xi, yi = np.meshgrid(axis_1, axis_2)
        # Predict every grid point and reshape back to the grid.
        grid_points = np.c_[xi.ravel(), yi.ravel()]
        grid_labels = clf.predict(grid_points).reshape(xi.shape)
        plt.contourf(xi, yi, grid_labels, cmap="winter", alpha=0.4)
        plt.scatter(first, second, c=y, edgecolors="k")
        plt.xlabel("Feature 1", fontdict={"fontsize": 12})
        plt.ylabel("Feature 2", fontdict={"fontsize": 12})
        plt.title("Model Classification Boundary", fontdict={"fontsize": 14})
        if is_show:
            plt.show()

    二、提升树算法boosting tree

    2.1 提升树回归算法

    boostingtree_r.py

    1. import numpy as np
    2. import copy
    3. from ch4.decision_tree_R import DecisionTreeRegression # CART
    class BoostTreeRegressor:
        """
        Boosting-tree regression with squared-error loss: every tree after the
        first is fitted to the residual of the current additive model.

        Every base learner must provide ``fit(x, y)`` and ``predict(x)``.
        """

        def __init__(self, base_estimator=None, n_estimators=10, learning_rate=1.0):
            """
            :param base_estimator: a single base learner, a list of base learners, or None
            :param n_estimators: number of base learners T (ignored when a list is passed)
            :param learning_rate: shrinkage applied to every tree after the first
            """
            self.base_estimator = base_estimator
            self.n_estimators = n_estimators
            self.learning_rate = learning_rate
            # Default base learner: a regression tree of depth 2.
            if self.base_estimator is None:
                self.base_estimator = DecisionTreeRegression(max_depth=2)
            if isinstance(self.base_estimator, list):
                # Heterogeneous learners: the list defines the ensemble size.
                self.n_estimators = len(self.base_estimator)
            else:
                # Homogeneous learners: independent deep copies of the prototype.
                self.base_estimator = [copy.deepcopy(self.base_estimator)
                                       for _ in range(self.n_estimators)]

        def fit(self, x_train, y_train):
            """
            Train the trees sequentially; each one fits the previous residual.

            :param x_train: training samples
            :param y_train: training targets
            :return: None
            """
            x_train, y_train = np.asarray(x_train), np.asarray(y_train)
            # 1. The first tree fits the targets themselves.
            self.base_estimator[0].fit(x_train, y_train)
            y_hat = np.asarray(self.base_estimator[0].predict(x_train), dtype=float)
            y_residual = y_train - y_hat  # residual = negative gradient of the squared error
            # 2. Every later tree fits the residual of the model built so far.
            for estimator in self.base_estimator[1:]:
                estimator.fit(x_train, y_residual)
                y_hat = y_hat + self.learning_rate * estimator.predict(x_train)
                y_residual = y_train - y_hat

        def predict(self, x_test):
            """
            Predict with the additive model, using exactly the weights applied
            during training: 1 for the first tree, ``learning_rate`` for the rest.

            :param x_test: test samples
            :return: 1-D array of predictions
            """
            x_test = np.asarray(x_test)
            y_hat = np.asarray(self.base_estimator[0].predict(x_test), dtype=float)
            for estimator in self.base_estimator[1:]:
                y_hat = y_hat + self.learning_rate * estimator.predict(x_test)
            return y_hat

    2.2 梯度提升树分类算法

    gradientboosting_c.py

    1. import numpy as np
    2. import copy
    3. from ch4.decision_tree_R import DecisionTreeRegression # CART
    class GradientBoostClassifier:
        """
        Gradient-boosting multi-class classifier built from regression learners.

        One group of regression learners is trained per class. The groups' outputs
        are turned into a probability distribution with softmax, and each round
        fits the negative gradient of the cross-entropy (one-hot minus softmax).

        Every base learner must provide ``fit(x, y)`` and ``predict(x)``. Labels
        are used directly as column indices, so they are assumed to be 0 .. K-1.
        """

        def __init__(self, base_estimator=None, n_estimators=10, learning_rate=1.0):
            """
            :param base_estimator: a single base learner, a list of base learners, or None
            :param n_estimators: number of boosting rounds M (ignored when a list is passed)
            :param learning_rate: shrinkage applied to every round after the first
            """
            self.base_estimator = base_estimator
            self.n_estimators = n_estimators
            self.learning_rate = learning_rate
            # Default base learner: a regression tree of depth 2.
            if self.base_estimator is None:
                self.base_estimator = DecisionTreeRegression(max_depth=2)
            if isinstance(self.base_estimator, list):
                # Heterogeneous learners: the list defines the number of rounds.
                self.n_estimators = len(self.base_estimator)
            else:
                # Homogeneous learners: independent deep copies of the prototype.
                self.base_estimator = [copy.deepcopy(self.base_estimator)
                                       for _ in range(self.n_estimators)]
            self.base_estimators = []  # one group of learners per class, filled by fit()

        @staticmethod
        def one_hot_encoding(target):
            """One-hot encode integer labels: row i has a 1 in column ``target[i]``."""
            target = np.asarray(target)
            class_labels = np.unique(target)
            target_y = np.zeros((len(target), len(class_labels)), dtype=np.int32)
            target_y[np.arange(len(target)), target] = 1
            return target_y

        @staticmethod
        def softmax_func(x):
            """
            Row-wise softmax. Each row is shifted by its own maximum so that a
            row far below the largest entry of the array cannot underflow to 0 / 0.
            """
            exps = np.exp(x - np.max(x, axis=1, keepdims=True))
            return exps / np.sum(exps, axis=1, keepdims=True)

        def fit(self, x_train, y_train):
            """
            Train M * K base learners (M rounds, K classes).

            :param x_train: training samples
            :param y_train: integer class labels
            :return: None
            """
            x_train, y_train = np.asarray(x_train), np.asarray(y_train)
            class_num = len(np.unique(y_train))
            y_encoded = self.one_hot_encoding(y_train)
            # One deep-copied group of n_estimators learners per class.
            self.base_estimators = [copy.deepcopy(self.base_estimator) for _ in range(class_num)]
            y_hat_scores = np.zeros(y_encoded.shape, dtype=float)  # (n_samples, class_num)
            # Round 0 fits the one-hot targets; later rounds fit the negative gradient.
            round_targets = y_encoded
            for idx in range(self.n_estimators):
                round_pred = []
                for c_idx in range(class_num):
                    learner = self.base_estimators[c_idx][idx]
                    learner.fit(x_train, round_targets[:, c_idx])
                    round_pred.append(learner.predict(x_train))
                # The first round enters with weight 1, the others with the learning rate.
                scale = 1.0 if idx == 0 else self.learning_rate
                y_hat_scores += scale * np.array(round_pred).T
                # Negative gradient of the cross-entropy for every class.
                round_targets = y_encoded - self.softmax_func(y_hat_scores)

        def predict_proba(self, x_test):
            """
            Class probabilities of the test samples. The per-class scores use
            exactly the weights applied during training: 1 for the first round,
            ``learning_rate`` for the rest.

            :param x_test: test samples
            :return: array (n_samples, class_num)
            """
            x_test = np.asarray(x_test)
            y_hat_scores = []
            for estimators in self.base_estimators:
                # Additive score of the learners belonging to the current class.
                score = np.asarray(estimators[0].predict(x_test), dtype=float)
                for estimator in estimators[1:]:
                    score = score + self.learning_rate * estimator.predict(x_test)
                y_hat_scores.append(score)
            return self.softmax_func(np.array(y_hat_scores).T)

        def predict(self, x_test):
            """
            Predict the class of every test sample as the column index with the
            largest probability.

            :param x_test: test samples
            :return: 1-D array of column indices
            """
            return np.argmax(self.predict_proba(x_test), axis=1)

     2.3 梯度提升树回归算法

    gradientboosting_r.py

    1. import numpy as np
    2. import copy
    3. from ch4.decision_tree_R import DecisionTreeRegression # CART
    class GradientBoostRegressor:
        """
        Gradient-boosting regression. Five losses are supported; every learner
        after the first is fitted to the negative gradient of the loss at the
        current model, which plays the role of the residual.

        Every base learner must provide ``fit(x, y)`` and ``predict(x)``.
        """

        def __init__(self, base_estimator=None, n_estimators=10, learning_rate=1.0,
                     loss="ls", huber_threshold=0.1, quantile_threshold=0.5):
            """
            :param base_estimator: a single base learner, a list of base learners, or None
            :param n_estimators: number of base learners T (ignored when a list is passed)
            :param learning_rate: shrinkage applied to every learner after the first
            :param loss: "ls", "lae", "huber", "quantile" or "logcosh"
            :param huber_threshold: threshold of the Huber loss (used only by "huber")
            :param quantile_threshold: quantile of the quantile loss (used only by "quantile")
            """
            self.base_estimator = base_estimator
            self.n_estimators = n_estimators
            self.learning_rate = learning_rate
            # Default base learner: a regression tree of depth 2.
            if self.base_estimator is None:
                self.base_estimator = DecisionTreeRegression(max_depth=2)
            if isinstance(self.base_estimator, list):
                # Heterogeneous learners: the list defines the ensemble size.
                self.n_estimators = len(self.base_estimator)
            else:
                # Homogeneous learners: independent deep copies of the prototype.
                self.base_estimator = [copy.deepcopy(self.base_estimator)
                                       for _ in range(self.n_estimators)]
            self.loss = loss  # type of the loss function
            self.huber_threshold = huber_threshold
            self.quantile_threshold = quantile_threshold

        def _cal_negative_gradient(self, y_true, y_pred):
            """
            Negative gradient of the configured loss with respect to the prediction.

            :param y_true: true values
            :param y_pred: predicted values
            :return: array of negative gradients
            :raises ValueError: if ``self.loss`` is not one of the five supported losses
            """
            loss = self.loss.lower()
            residual = y_true - y_pred
            if loss == "ls":  # squared error
                return residual
            if loss == "lae":  # absolute error
                return np.sign(residual)
            if loss == "huber":  # residual, clipped to +/- threshold
                return np.where(np.abs(residual) > self.huber_threshold,
                                self.huber_threshold * np.sign(residual),
                                residual)
            if loss == "quantile":  # quantile loss
                return np.where(y_true > y_pred, self.quantile_threshold,
                                self.quantile_threshold - 1)
            if loss == "logcosh":  # log of the hyperbolic cosine
                return -np.tanh(y_pred - y_true)
            raise ValueError("仅限于ls、lae、huber、quantile和logcosh,选择有误...")

        def fit(self, x_train, y_train):
            """
            Train the learners sequentially; each one fits the negative gradient
            of the loss at the model built so far.

            :param x_train: training samples
            :param y_train: training targets
            :return: None
            """
            x_train, y_train = np.asarray(x_train), np.asarray(y_train)
            # 1. The first learner fits the targets themselves.
            self.base_estimator[0].fit(x_train, y_train)
            y_hat = np.asarray(self.base_estimator[0].predict(x_train), dtype=float)
            y_residual = self._cal_negative_gradient(y_train, y_hat)
            # 2. Every later learner fits the current negative gradient.
            for estimator in self.base_estimator[1:]:
                estimator.fit(x_train, y_residual)
                y_hat = y_hat + self.learning_rate * estimator.predict(x_train)
                y_residual = self._cal_negative_gradient(y_train, y_hat)

        def predict(self, x_test):
            """
            Predict with the additive model, using exactly the weights applied
            during training: 1 for the first learner, ``learning_rate`` for the rest.

            :param x_test: test samples
            :return: 1-D array of predictions
            """
            x_test = np.asarray(x_test)
            y_hat = np.asarray(self.base_estimator[0].predict(x_test), dtype=float)
            for estimator in self.base_estimator[1:]:
                y_hat = y_hat + self.learning_rate * estimator.predict(x_test)
            return y_hat

     2.4 提升树算法测试

    test_boosting_tree_r.py

    1. import numpy as np
    2. import matplotlib.pyplot as plt
    3. from sklearn.datasets import fetch_california_housing
    4. from sklearn.metrics import r2_score
    5. from sklearn.preprocessing import StandardScaler
    6. from sklearn.model_selection import train_test_split
    7. from ch4.decision_tree_R import DecisionTreeRegression
    8. from ch8.boostingtree_r import BoostTreeRegressor
    9. from sklearn.tree import DecisionTreeRegressor
    10. # housing = fetch_california_housing()
    11. # X, y = housing.data[0:20000:100, :], housing.target[0:20000:100]
    12. # print(X.shape)
    13. # print(y.shape)
    14. # X = StandardScaler().fit_transform(X)
    15. # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
    # Toy data set: ten one-dimensional samples.
    # NOTE: an alternative experiment (California housing subsample, sweep over
    # n_estimators, R2 curve and true-vs-predicted plot) was disabled in the
    # original listing and is not run here.
    X = np.linspace(1, 10, 10).reshape(-1, 1)
    y = np.array([5.56, 5.70, 5.91, 6.40, 6.80, 7.05, 8.90, 8.70, 9.00, 9.05])
    # Depth-1 trees (stumps) from scikit-learn serve as base learners.
    base_ht = DecisionTreeRegressor(max_depth=1)

    # R2 and sum of squared errors on the training data for 1 .. 6 trees.
    r2_scores = []
    for n in range(1, 7):
        btr = BoostTreeRegressor(base_estimator=base_ht, n_estimators=n)
        btr.fit(X, y)
        y_hat = btr.predict(X)
        r2_scores.append(r2_score(y, y_hat))
        squared_error_sum = np.sum((y - y_hat) ** 2)
        print(n, ":", r2_scores[-1], squared_error_sum)

     2.5 梯度提升树算法测试

    test_gradboost_c1.py

    1. from ch8.gradientboosting_c import GradientBoostClassifier
    2. from sklearn.datasets import load_iris, load_digits, load_breast_cancer
    3. from sklearn.decomposition import PCA
    4. from sklearn.preprocessing import StandardScaler
    5. from sklearn.model_selection import train_test_split
    6. from sklearn.metrics import classification_report
    7. from ch4.decision_tree_R import DecisionTreeRegression
    8. from sklearn.tree import DecisionTreeRegressor
    # The digits data set is used here. Alternatives that can be swapped in:
    #   iris = load_iris();              X, y = iris.data, iris.target
    #   bc_data = load_breast_cancer();  X, y = bc_data.data, bc_data.target
    digits = load_digits()
    features, labels = digits.data, digits.target

    # Project onto 10 principal components, then standardise every component.
    features = PCA(n_components=10).fit_transform(features)
    features = StandardScaler().fit_transform(features)
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, shuffle=True, random_state=42
    )

    # Base learner: sklearn's regression tree. The hand-written CART regressor
    # DecisionTreeRegression(max_bins=50, max_depth=3) can be used instead.
    tree = DecisionTreeRegressor(max_depth=3)
    gbc = GradientBoostClassifier(base_estimator=tree, n_estimators=50)
    gbc.fit(X_train, y_train)
    print(classification_report(y_test, gbc.predict(X_test)))

     三、Bagging算法

    3.1 Bagging算法

    bagging_c_r.py

    1. import numpy as np
    2. import copy
    3. from ch4.decision_tree_R import DecisionTreeRegression # CART
    4. from ch4.decision_tree_C import DecisionTreeClassifier
    5. from sklearn.metrics import accuracy_score, r2_score
    class BaggingClassifierRegressor:
        """
        Bagging ensemble covering both classification and regression.

        1. Procedure: draw one resampled training set per base learner, fit each
           learner on its own set, then combine the learners.
        2. Combination: classification averages the learners' class probabilities
           and takes the arg-max; regression averages the predicted values.
        3. ``task`` selects classification or regression, ``OOB`` switches the
           out-of-bag (OOB) estimate on.
        """

        def __init__(self, base_estimator=None, n_estimators=10, task="C", OOB=False):
            """
            :param base_estimator: base learner. A single estimator is deep-copied
                ``n_estimators`` times (homogeneous ensemble); a list is used as
                given (heterogeneous ensemble). ``None`` selects a default
                decision tree matching ``task``.
            :param n_estimators: number of base learners T (replaced by the list
                length when a list is passed)
            :param task: "C"/"c" for classification, "R"/"r" for regression
            :param OOB: True to compute the out-of-bag estimate at the end of fit()
            :raises ValueError: if ``task`` is not one of C/c/R/r
            """
            self.base_estimator = base_estimator
            self.n_estimators = n_estimators
            self.task = task
            if task.lower() not in ["c", "r"]:
                raise ValueError("Bagging任务仅限分类(C/c)、回归(R/r)")
            # Without a user-supplied learner fall back to a default decision tree.
            if self.base_estimator is None:
                if self.task.lower() == "c":
                    self.base_estimator = DecisionTreeClassifier()
                elif self.task.lower() == "r":
                    self.base_estimator = DecisionTreeRegression()
            if isinstance(base_estimator, list):
                # Heterogeneous learners: the list itself is the ensemble.
                self.n_estimators = len(self.base_estimator)
            else:
                # Homogeneous learners: independent deep copies of one prototype.
                self.base_estimator = [copy.deepcopy(self.base_estimator)
                                       for _ in range(self.n_estimators)]
            self.OOB = OOB  # whether to run the out-of-bag estimate
            self.oob_indices = []  # per learner: indices of samples it was not trained on
            self.y_oob_hat = None  # OOB predictions (regression) or class probabilities (classification)
            self.oob_score = None  # OOB score: accuracy (classification) or R2 (regression)

        def fit(self, x_train, y_train):
            """
            Train every base learner on its own resampled training set.

            :param x_train: training samples, 2-D array-like
            :param y_train: training targets
            :return: None
            """
            x_train, y_train = np.asarray(x_train), np.asarray(y_train)
            n_samples = x_train.shape[0]
            # Reset state from any earlier fit(); otherwise the OOB index lists of
            # the previous run would be paired with the newly trained learners.
            self.oob_indices = []
            self.y_oob_hat, self.oob_score = None, None
            for estimator in self.base_estimator:
                # 1. Sample with replacement, then keep each drawn row only once
                #    (duplicates are dropped, as in the original implementation).
                indices = np.unique(np.random.choice(n_samples, n_samples, replace=True))
                # 2. Train the learner on the sampled rows.
                estimator.fit(x_train[indices, :], y_train[indices])
                # Rows never drawn form this learner's out-of-bag set.
                oob = np.setdiff1d(np.arange(n_samples), indices)
                self.oob_indices.append(oob.tolist())
            # 3. Out-of-bag estimate.
            if self.OOB:
                if self.task.lower() == "c":
                    self._oob_score_classifier(x_train, y_train)
                else:
                    self._oob_score_regressor(x_train, y_train)

        def _oob_mean_prediction(self, x_train, method_name):
            """
            Average, per training sample, the outputs of the learners that did not
            see that sample during training.

            :param x_train: training samples as a 2-D array
            :param method_name: name of the estimator method to call
                ("predict" or "predict_proba")
            :return: (covered, y_hat) where ``covered`` is a boolean mask of the
                samples that are out-of-bag for at least one learner and ``y_hat``
                holds their averaged outputs in ascending sample order
            """
            n_samples = x_train.shape[0]
            totals, counts = None, np.zeros(n_samples, dtype=int)
            for estimator, oob in zip(self.base_estimator, self.oob_indices):
                if len(oob) == 0:
                    continue
                # One batched call per learner instead of one call per sample.
                y_hat = np.asarray(getattr(estimator, method_name)(x_train[oob]), dtype=float)
                if totals is None:
                    totals = np.zeros((n_samples,) + y_hat.shape[1:])
                totals[oob] += y_hat
                counts[oob] += 1
            covered = counts > 0
            if totals is None:
                return covered, np.array([])
            denom = counts[covered].reshape((-1,) + (1,) * (totals.ndim - 1))
            return covered, totals[covered] / denom

        def _oob_score_classifier(self, x_train, y_train):
            """
            Out-of-bag estimate for classification: accuracy of the arg-max of the
            averaged OOB class probabilities. ``oob_score`` stays None when no
            sample is out-of-bag for any learner.

            :param x_train: training samples
            :param y_train: training labels
            :return: None
            """
            covered, proba = self._oob_mean_prediction(x_train, "predict_proba")
            self.y_oob_hat = proba
            if not covered.any():
                self.oob_score = None
                return
            self.oob_score = accuracy_score(y_train[covered], np.argmax(proba, axis=1))

        def _oob_score_regressor(self, x_train, y_train):
            """
            Out-of-bag estimate for regression: R2 of the averaged OOB predictions.
            ``oob_score`` stays None when no sample is out-of-bag for any learner.

            :param x_train: training samples
            :param y_train: training targets
            :return: None
            """
            covered, y_hat = self._oob_mean_prediction(x_train, "predict")
            if not covered.any():
                self.y_oob_hat, self.oob_score = np.array([]), None
                return
            # Collapse to one value per sample, whatever shape predict() returned.
            self.y_oob_hat = y_hat.reshape(len(y_hat), -1).mean(axis=1)
            self.oob_score = r2_score(y_train[covered], self.y_oob_hat)

        def predict_proba(self, x_test):
            """
            Class-probability prediction (classification only): the mean of the
            base learners' predicted probabilities.

            :param x_test: test samples
            :return: array of averaged class probabilities, one row per sample
            :raises ValueError: if the task is not classification
            """
            if self.task.lower() != "c":
                raise ValueError("predict_proba()仅适用于分类任务。")
            x_test = np.asarray(x_test)
            y_test_hat = [estimator.predict_proba(x_test) for estimator in self.base_estimator]
            return np.mean(y_test_hat, axis=0)

        def predict(self, x_test):
            """
            Classification: index of the largest averaged class probability.
            Regression: simple average of the base learners' predictions.

            :param x_test: test samples
            :return: predicted class indices or predicted values
            """
            if self.task.lower() == "c":
                return np.argmax(self.predict_proba(x_test), axis=1)
            elif self.task.lower() == "r":
                y_hat = [estimator.predict(x_test) for estimator in self.base_estimator]
                return np.mean(y_hat, axis=0)

    3.2 Bagging算法测试

    test_bagging_c1.py

    1. from sklearn.datasets import load_iris
    2. from ch8.bagging_c_r import BaggingClassifierRegressor
    3. from ch4.decision_tree_C import DecisionTreeClassifier
    4. from sklearn.metrics import classification_report
    5. from sklearn.model_selection import train_test_split
    6. from sklearn.preprocessing import StandardScaler
    # Iris data, standardised feature-wise, with a 75/25 train/test split.
    iris = load_iris()
    features = StandardScaler().fit_transform(iris.data)
    X_train, X_test, y_train, y_test = train_test_split(
        features, iris.target, test_size=0.25, shuffle=True, random_state=42
    )

    # 20 bagged copies of the hand-written classification tree, with OOB scoring.
    tree = DecisionTreeClassifier(max_depth=10, max_bins=50, is_feature_all_R=True)
    bagcr = BaggingClassifierRegressor(base_estimator=tree, n_estimators=20, task="c", OOB=True)
    bagcr.fit(X_train, y_train)
    print(classification_report(y_test, bagcr.predict(X_test)))
    print("包外估计的精度:", bagcr.oob_score)

     test_bagging_c2.py

    1. from sklearn.datasets import load_iris
    2. from ch8.bagging_c_r import BaggingClassifierRegressor
    3. from ch4.decision_tree_C import DecisionTreeClassifier
    4. from sklearn.metrics import classification_report, accuracy_score
    5. from sklearn.model_selection import train_test_split
    6. from sklearn.preprocessing import StandardScaler, LabelEncoder
    7. import pandas as pd
    8. import numpy as np
    9. import matplotlib.pyplot as plt
    # Nursery data set (rows with missing values dropped); the last column is the label.
    nursery = pd.read_csv("../ch4/data/nursery.csv").dropna()
    features = np.asarray(nursery.iloc[:, :-1])
    labels = LabelEncoder().fit_transform(np.asarray(nursery.iloc[:, -1]))
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.30, shuffle=True, random_state=42
    )
    base_es = DecisionTreeClassifier(max_depth=10)

    # Part 1: a bagging ensemble of 30 trees.
    ensemble = BaggingClassifierRegressor(base_estimator=base_es, n_estimators=30, task="c")
    ensemble.fit(X_train, y_train)
    print(classification_report(y_test, ensemble.predict(X_test)))

    # Part 2: train a single resampled tree 30 separate times and record the
    # test accuracy of each run.
    n_runs = 30
    y_test_scores = []
    for _ in range(n_runs):
        single = BaggingClassifierRegressor(base_estimator=base_es, n_estimators=1, task="c")
        single.fit(X_train, y_train)
        y_test_scores.append(accuracy_score(y_test, single.predict(X_test)))

    plt.figure(figsize=(7, 5))
    plt.plot(range(1, n_runs + 1), y_test_scores, "ko-", lw=1.5)
    plt.xlabel("Training Times", fontsize=12)
    plt.ylabel("Test Accuracy", fontsize=12)
    plt.grid(ls=":")
    plt.show()

     test_bagging_r.py

    1. import numpy as np
    2. import matplotlib.pyplot as plt
    3. from ch4.decision_tree_R import DecisionTreeRegression
    4. from ch8.bagging_c_r import BaggingClassifierRegressor
    5. from sklearn.metrics import r2_score
    def f(x):
        """Target function: three Gaussian bumps centred at -3, 0 and 3."""
        return 0.5 * np.exp(-(x + 3) ** 2) + np.exp(-x ** 2) + 1.5 * np.exp(-(x - 3) ** 2)


    # Noisy training data: 200 sorted points drawn uniformly from [-5, 5).
    np.random.seed(0)
    N = 200
    X = np.sort(np.random.rand(N) * 10 - 5)
    y = f(X) + 0.05 * np.random.randn(N)
    X = X.reshape(-1, 1)

    # Bagging of 100 regression trees versus a single tree of the same settings.
    base_estimator = DecisionTreeRegression(max_bins=30, max_depth=8)
    model = BaggingClassifierRegressor(base_estimator=base_estimator, n_estimators=100, task="r")
    model.fit(X, y)

    # Evaluation grid stretched 10% beyond the range of the training inputs.
    X_test = np.linspace(1.1 * X.min(axis=0), 1.1 * X.max(axis=0), 1000).reshape(-1, 1)
    y_true = f(X_test)
    y_bagging_hat = model.predict(X_test)
    base_estimator.fit(X, y)
    y_cart_hat = base_estimator.predict(X_test)
    r2_bagging = r2_score(y_true, y_bagging_hat)
    r2_cart = r2_score(y_true, y_cart_hat)

    plt.figure(figsize=(7, 5))
    plt.scatter(X, y, s=10, c="k", label="Raw Data")
    plt.plot(X_test, y_true, "k-", lw=1.5, label="True F(x)")
    plt.plot(X_test, y_bagging_hat, "r-", label="Bagging(R2 = %.5f)" % r2_bagging)
    plt.plot(X_test, y_cart_hat, "b-", label="CART(R2 = %.5f)" % r2_cart)
    plt.legend(frameon=False)
    plt.xlabel("X", fontsize=12)
    plt.ylabel("Y", fontsize=12)
    plt.grid(ls=":")
    plt.title("Bagging(100 estimators) VS CART Regression", fontsize=14)
    plt.show()

    四、随机森林算法

    4.1 随机森林算法

     rf_classifier_regressor.py

    1. import numpy as np
    2. import copy
    3. from ch4.decision_tree_R import DecisionTreeRegression # CART
    4. from ch4.decision_tree_C import DecisionTreeClassifier
    5. from sklearn.metrics import accuracy_score, r2_score
    class RandomForestClassifierRegressor:
        """
        Random forest for classification and regression.

        A random forest extends Bagging: besides resampling the training rows for
        every base learner, each learner is trained on a random subset of the
        feature columns, so both samples and input variables are perturbed.
        """

        def __init__(self, base_estimator=None, n_estimators=10, feature_sampling_rate=0.5,
                     task="C", OOB=False, feature_importance=False):
            """
            :param base_estimator: prototype base learner, deep-copied
                ``n_estimators`` times; ``None`` selects a default decision tree
                matching ``task``
            :param n_estimators: number of base learners T
            :param feature_sampling_rate: fraction of feature columns each learner sees
            :param task: "C"/"c" for classification, "R"/"r" for regression
            :param OOB: True to compute the out-of-bag estimate at the end of fit()
            :param feature_importance: True to compute feature importance scores
                at the end of fit()
            :raises ValueError: if ``task`` is not one of C/c/R/r
            """
            self.base_estimator = base_estimator
            self.n_estimators = n_estimators
            self.feature_sampling_rate = feature_sampling_rate
            if task.lower() not in ["c", "r"]:
                raise ValueError("Bagging任务仅限分类(C/c)、回归(R/r)")
            self.task = task
            # Without a user-supplied learner fall back to a default decision tree.
            if self.base_estimator is None:
                if self.task.lower() == "c":
                    base_estimator = DecisionTreeClassifier()
                elif self.task.lower() == "r":
                    base_estimator = DecisionTreeRegression()
            self.base_estimator = [copy.deepcopy(base_estimator)
                                   for _ in range(self.n_estimators)]
            self.OOB = OOB  # whether to run the out-of-bag estimate
            self.oob_indices = []  # per learner: indices of samples it was not trained on
            self.y_oob_hat = None  # OOB predictions (regression) or class probabilities (classification)
            self.oob_score = None  # OOB score: accuracy (classification) or R2 (regression)
            self.feature_importance = feature_importance
            self.feature_importance_scores = None  # importance score per feature
            self.feature_importance_indices = []  # per learner: sampled feature column indices

        def fit(self, x_train, y_train):
            """
            Train every base learner on resampled rows and a random feature subset.

            :param x_train: training samples, 2-D array-like
            :param y_train: training targets
            :return: None
            """
            x_train, y_train = np.asarray(x_train), np.asarray(y_train)
            n_samples, n_features = x_train.shape
            # Reset state from any earlier fit(); predict() pairs learner idx with
            # feature_importance_indices[idx], so stale entries would feed the
            # wrong columns to the retrained learners.
            self.oob_indices = []
            self.feature_importance_indices = []
            self.y_oob_hat, self.oob_score = None, None
            self.feature_importance_scores = None
            # Number of sampled features: at least one, at most all of them.
            fb_num = int(self.feature_sampling_rate * n_features)
            fb_num = min(n_features, max(1, fb_num))
            for estimator in self.base_estimator:
                # 1. Sample rows with replacement, then keep each drawn row only
                #    once (duplicates are dropped, as in the original implementation).
                indices = np.unique(np.random.choice(n_samples, n_samples, replace=True))
                # 2. Sample feature columns without replacement.
                feature_idx = np.random.choice(n_features, fb_num, replace=False)
                self.feature_importance_indices.append(feature_idx)
                # 3. Train the learner on the sampled rows and columns.
                estimator.fit(x_train[indices, :][:, feature_idx], y_train[indices])
                # Rows never drawn form this learner's out-of-bag set.
                oob = np.setdiff1d(np.arange(n_samples), indices)
                self.oob_indices.append(oob.tolist())
            # 4. Out-of-bag estimate.
            if self.OOB:
                if self.task.lower() == "c":
                    self._oob_score_classifier(x_train, y_train)
                else:
                    self._oob_score_regressor(x_train, y_train)
            # 5. Feature importance estimate.
            if self.feature_importance:
                if self.task.lower() == "c":
                    self._feature_importance_score_classifier(x_train, y_train)
                else:
                    self._feature_importance_score_regressor(x_train, y_train)

        def _oob_mean_prediction(self, x_train, method_name):
            """
            Average, per training sample, the outputs of the learners that did not
            see that sample during training. Every learner receives only the
            feature columns it was trained on.

            :param x_train: training samples as a 2-D array
            :param method_name: name of the estimator method to call
                ("predict" or "predict_proba")
            :return: (covered, y_hat) where ``covered`` is a boolean mask of the
                samples that are out-of-bag for at least one learner and ``y_hat``
                holds their averaged outputs in ascending sample order
            """
            n_samples = x_train.shape[0]
            totals, counts = None, np.zeros(n_samples, dtype=int)
            for idx, estimator in enumerate(self.base_estimator):
                oob = self.oob_indices[idx]
                if len(oob) == 0:
                    continue
                x_oob = x_train[oob, :][:, self.feature_importance_indices[idx]]
                # One batched call per learner instead of one call per sample.
                y_hat = np.asarray(getattr(estimator, method_name)(x_oob), dtype=float)
                if totals is None:
                    totals = np.zeros((n_samples,) + y_hat.shape[1:])
                totals[oob] += y_hat
                counts[oob] += 1
            covered = counts > 0
            if totals is None:
                return covered, np.array([])
            denom = counts[covered].reshape((-1,) + (1,) * (totals.ndim - 1))
            return covered, totals[covered] / denom

        def _oob_score_classifier(self, x_train, y_train):
            """
            Out-of-bag estimate for classification: accuracy of the arg-max of the
            averaged OOB class probabilities. ``oob_score`` stays None when no
            sample is out-of-bag for any learner.

            :param x_train: training samples
            :param y_train: training labels
            :return: None
            """
            covered, proba = self._oob_mean_prediction(x_train, "predict_proba")
            self.y_oob_hat = proba
            if not covered.any():
                self.oob_score = None
                return
            self.oob_score = accuracy_score(y_train[covered], np.argmax(proba, axis=1))

        def _oob_score_regressor(self, x_train, y_train):
            """
            Out-of-bag estimate for regression: R2 of the averaged OOB predictions.
            ``oob_score`` stays None when no sample is out-of-bag for any learner.

            :param x_train: training samples
            :param y_train: training targets
            :return: None
            """
            covered, y_hat = self._oob_mean_prediction(x_train, "predict")
            if not covered.any():
                self.y_oob_hat, self.oob_score = np.array([]), None
                return
            # Collapse to one value per sample, whatever shape predict() returned.
            self.y_oob_hat = y_hat.reshape(len(y_hat), -1).mean(axis=1)
            self.oob_score = r2_score(y_train[covered], self.y_oob_hat)

        def _permutation_importance(self, x_train, y_train, error_fn):
            """
            Permutation importance shared by the classification and regression paths.

            For feature j and every learner that was trained with it, the learner's
            OOB error is measured before and after shuffling column j of its OOB
            samples; the score of feature j is the mean error increase. A feature
            that no learner sampled keeps the score 0.

            :param x_train: training samples as a 2-D array
            :param y_train: training targets as an array
            :param error_fn: callable ``(y_true, y_pred) -> error``
            :return: array of importance scores, one per feature
            """
            n_feature = x_train.shape[1]
            self.feature_importance_scores = np.zeros(n_feature)
            for f_j in range(n_feature):
                f_j_scores = []  # OOB error change of feature j per learner using it
                for idx, estimator in enumerate(self.base_estimator):
                    f_s_indices = list(self.feature_importance_indices[idx])
                    oob = self.oob_indices[idx]
                    if f_j not in f_s_indices or len(oob) == 0:
                        continue
                    # Fancy indexing copies, so shuffling below leaves x_train intact.
                    x_samples = x_train[oob, :][:, f_s_indices]
                    y_oob = y_train[oob]
                    # 1. Baseline OOB error.
                    error = error_fn(y_oob, estimator.predict(x_samples))
                    # 2. OOB error after shuffling feature j in place (the column
                    #    slice is a view); the other columns stay untouched.
                    np.random.shuffle(x_samples[:, f_s_indices.index(f_j)])
                    error_j = error_fn(y_oob, estimator.predict(x_samples))
                    f_j_scores.append(error_j - error)
                # 3. Mean over all learners that used feature j.
                if f_j_scores:
                    self.feature_importance_scores[f_j] = np.mean(f_j_scores)
            return self.feature_importance_scores

        def _feature_importance_score_classifier(self, x_train, y_train):
            """
            Feature importance for classification; the error is 1 - accuracy.

            :param x_train: training samples
            :param y_train: training labels
            :return: array of importance scores, one per feature
            """
            return self._permutation_importance(
                x_train, y_train, lambda y_true, y_pred: 1 - accuracy_score(y_true, y_pred))

        def _feature_importance_score_regressor(self, x_train, y_train):
            """
            Feature importance for regression; the error is 1 - R2.

            :param x_train: training samples
            :param y_train: training targets
            :return: array of importance scores, one per feature
            """
            return self._permutation_importance(
                x_train, y_train, lambda y_true, y_pred: 1 - r2_score(y_true, y_pred))

        def predict_proba(self, x_test):
            """
            Class-probability prediction (classification only): the mean of the
            base learners' predicted probabilities, each learner receiving its own
            feature subset.

            :param x_test: test samples
            :return: array of averaged class probabilities, one row per sample
            :raises ValueError: if the task is not classification
            """
            if self.task.lower() != "c":
                raise ValueError("predict_proba()仅适用于分类任务。")
            x_test = np.asarray(x_test)
            y_test_hat = [
                estimator.predict_proba(x_test[:, self.feature_importance_indices[idx]])
                for idx, estimator in enumerate(self.base_estimator)
            ]
            return np.mean(y_test_hat, axis=0)

        def predict(self, x_test):
            """
            Classification: index of the largest averaged class probability.
            Regression: simple average of the base learners' predictions.

            :param x_test: test samples
            :return: predicted class indices or predicted values
            """
            if self.task.lower() == "c":
                return np.argmax(self.predict_proba(x_test), axis=1)
            elif self.task.lower() == "r":
                # Convert first so list input supports column selection.
                x_test = np.asarray(x_test)
                y_hat = [
                    estimator.predict(x_test[:, self.feature_importance_indices[idx]])
                    for idx, estimator in enumerate(self.base_estimator)
                ]
                return np.mean(y_hat, axis=0)

     4.2 随机森林算法测试

    test_rf_c1.py

     

    1. from sklearn.datasets import load_iris, load_wine, load_digits
    2. from ch8.randomforest.rf_classifier_regressor import RandomForestClassifierRegressor
    3. # from ch4.decision_tree_C import DecisionTreeClassifier
    4. from sklearn.tree import DecisionTreeClassifier
    5. from sklearn.metrics import classification_report
    6. from sklearn.model_selection import train_test_split
    7. from sklearn.preprocessing import StandardScaler
    8. import matplotlib.pyplot as plt
    9. import seaborn as sns
    10. import pandas as pd
    # Iris data, standardised feature-wise, with a 75/25 train/test split.
    iris = load_iris()
    features = StandardScaler().fit_transform(iris.data)
    X_train, X_test, y_train, y_test = train_test_split(
        features, iris.target, test_size=0.25, shuffle=True, random_state=42
    )

    # Base learner: sklearn's classification tree. The hand-written tree
    # DecisionTreeClassifier(max_depth=10, max_bins=50, is_feature_all_R=True)
    # from ch4.decision_tree_C can be used instead.
    tree = DecisionTreeClassifier(max_depth=10)
    rf_model = RandomForestClassifierRegressor(
        base_estimator=tree, n_estimators=30, task="c", OOB=True, feature_importance=True
    )
    rf_model.fit(X_train, y_train)
    print(classification_report(y_test, rf_model.predict(X_test)))
    print("包外估计的精度:", rf_model.oob_score)
    print("特征重要性评分:", rf_model.feature_importance_scores)

    # Horizontal bar chart of the importance score of every feature.
    plt.figure(figsize=(9, 5))
    importance_table = pd.DataFrame([iris.feature_names, rf_model.feature_importance_scores]).T
    importance_table.columns = ["Feature Names", "Importance"]
    sns.barplot(x="Importance", y="Feature Names", data=importance_table)
    plt.title("Iris DataSet Feature Importance Scores", fontdict={"fontsize": 14})
    plt.grid(ls=":")
    print(importance_table)
    plt.show()

     

  • 相关阅读:
    把key-value键值对的数据转换成xml格式,并根据key来匹配元素
    暴雨天,看天翼云如何“快准稳”防涝
    MySQL(7) Innodb 原理和日志
    Java 第三阶段增强分析需求,代码实现能力【JDBC】
    FreeRTOS教程7 事件组
    golang pprof
    编写方法将一个数组扁平化并且去重和递增排序
    软考网络工程师笔记-分值分布
    (主)9.26锁存器&状态机方法、题解大综合(加码加码加码)
    镜头下的光学
  • 原文地址:https://blog.csdn.net/2302_78896863/article/details/136423203