• Chapter 7 XGBoost


    1 XGBoost简介

    • XGBoost是使用梯度提升框架实现的高效、灵活、可移植的机器学习库,全称是eXtreme Gradient Boosting,是GBDT(GBM)的一个C++实现。它在构建每棵树时,对各特征的分裂点搜索进行并行计算(树与树之间仍按提升顺序串行生成),从而提高学习速度。
    • 一般地说,XGBoost的速度和性能优于sklearn.ensemble.GradientBoostingClassifier类。
    • XGBoost的作者为华盛顿大学陈天奇,并封装了Python接口,随着在机器学习竞赛中的优异表现,其他学者封装完成了R/Julia等接口。
    • 在实际的工作中,效果不错。

    代码:
    数据

    训练

     预测

    2 Kaggle简介

    Kaggle是一个数据分析的竞赛平台。

    3 代码实现

    泰坦尼克号的生死预测

    数据说明

    pclass:舱位

    sex:性别。使用0、1分别标记女性和男性(与下文代码中的映射 {'female': 0, 'male': 1} 一致)。

    age:年龄,存在部分缺失值。可以用决策树或随机森林预测缺失的年龄,也可以按舱位分组,用同一舱位乘客的平均年龄来填补该舱位中缺失的年龄,等等。

    fare:票价,与舱位有直接的联系。

    cabin:房间号,缺失值严重。

    embarked:起始城市,有C、Q、S三个取值,外加缺失值Unknown,所以人为将embarked变成四列(one-hot编码):

    S——1000   C——0100   Q——0010   Unknown——0001

    数据预处理

    代码

    使用logistic回归/随机森林/XGBoost

    1. import xgboost as xgb
    2. import numpy as np
    3. from sklearn.linear_model import LogisticRegression
    4. from sklearn.model_selection import train_test_split
    5. from sklearn.ensemble import RandomForestRegressor
    6. from sklearn.ensemble import RandomForestClassifier
    7. from sklearn.metrics import accuracy_score
    8. import pandas as pd
    9. import csv
    def show_accuracy(a, b, tip):
        """Print and return the percentage of positions where ``a`` equals ``b``.

        Args:
            a: First label array (any array-like); flattened before comparison.
            b: Second label array with the same number of elements as ``a``.
            tip: Text printed in front of the accuracy figure.

        Returns:
            The match rate as a float percentage (0-100).
        """
        # Accept plain lists as well as ndarrays; ``ravel``/``size`` need arrays.
        a = np.asarray(a)
        b = np.asarray(b)
        matches = a.ravel() == b.ravel()
        acc_rate = 100 * float(matches.sum()) / a.size
        # The parenthesised single-argument form is valid both as a Python 2
        # print statement and as a Python 3 print() call.
        print('%s正确率:%.3f%%' % (tip, acc_rate))
        return acc_rate
    def _fill_missing_age(data, feature_columns):
        """Fill missing ``Age`` values of ``data`` in place with a random forest.

        A ``RandomForestRegressor`` is fitted on the rows whose age is present,
        using ``feature_columns`` as inputs, and its predictions replace the
        missing ages. Nothing happens when no age is missing.
        """
        missing = data.Age.isnull()
        if not missing.any():
            # Nothing to impute; also avoids calling predict() on zero rows.
            return
        known = ~missing
        rfr = RandomForestRegressor(n_estimators=1000)
        rfr.fit(data.loc[known, feature_columns].values,
                data.loc[known, 'Age'].values)
        data.loc[missing, 'Age'] = rfr.predict(
            data.loc[missing, feature_columns].values)


    def load_data(file_name, is_train):
        """Load a Titanic CSV file and turn it into a numeric feature matrix.

        Steps: encode ``Sex`` as 0/1, fill missing fares with the median fare of
        the passenger's class, impute missing ages with a random forest, one-hot
        encode ``Embarked`` and replicate the rows five times.

        Side effects: prints progress and summary information and writes the
        preprocessed frame to ``New_Data.csv`` (overwritten on every call).

        Args:
            file_name: Path of the CSV file to read.
            is_train: True for the training file (``Survived`` is also used as
                a feature when imputing ages); False for the test file.

        Returns:
            ``(x, y)`` when ``is_train`` is true, where both arrays are tiled
            five times; otherwise ``(x, passenger_ids)`` where ``x`` is tiled
            five times and ``passenger_ids`` is the untiled ``PassengerId``
            column.
        """
        data = pd.read_csv(file_name)

        # Sex: female -> 0, male -> 1.
        data['Sex'] = data['Sex'].map({'female': 0, 'male': 1}).astype(int)

        # Fare: fill missing values with the median fare of the same class.
        if data.Fare.isnull().any():
            for pclass in (1, 2, 3):
                in_class = data.Pclass == pclass
                median_fare = data.loc[in_class, 'Fare'].dropna().median()
                data.loc[data.Fare.isnull() & in_class, 'Fare'] = median_fare

        # Age: predict missing values with a random forest (a simpler
        # alternative is to fill them with the mean age). The training file
        # additionally uses 'Survived' as a predictor; the test file has no
        # such column.
        if is_train:
            tag = '随机森林预测缺失年龄'
            age_features = ['Survived', 'Fare', 'Parch', 'SibSp', 'Pclass']
        else:
            tag = '随机森林预测缺失年龄2'
            age_features = ['Fare', 'Parch', 'SibSp', 'Pclass']
        print(tag + ':--start--')
        _fill_missing_age(data, age_features)
        print(tag + ':--over--')

        # Embarked: missing ports are filled with 'S', then one-hot encoded.
        data.loc[data.Embarked.isnull(), 'Embarked'] = 'S'
        # Cast to int because newer pandas returns boolean dummy columns, which
        # would turn the mixed feature matrix into an object array.
        embarked_data = pd.get_dummies(data.Embarked).astype(int)
        print(embarked_data)
        embarked_data = embarked_data.rename(columns=lambda c: 'Embarked_' + str(c))
        # Make sure every expected column exists even if a port never occurs.
        for column in ('Embarked_C', 'Embarked_Q', 'Embarked_S'):
            if column not in embarked_data:
                embarked_data[column] = 0
        data = pd.concat([data, embarked_data], axis=1)
        print(data.describe())
        data.to_csv('New_Data.csv')

        feature_columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
                           'Embarked_C', 'Embarked_Q', 'Embarked_S']
        x = np.array(data[feature_columns], dtype=float)

        # Food for thought (from the original author): what does this actually
        # do? NOTE: every row is replicated five times, so a later random
        # train/test split puts copies of the same passenger on both sides.
        x = np.tile(x, (5, 1))

        if is_train:
            y = np.tile(np.array(data['Survived']), (5, ))
            return x, y
        return x, data['PassengerId']
    def write_result(c, c_type):
        """Predict on ``Titanic.test.csv`` and write ``Prediction_<c_type>.csv``.

        Args:
            c: Fitted model exposing ``predict``.
            c_type: Integer model tag used in the output file name; the value 3
                marks an XGBoost booster, whose input must be wrapped in a
                ``DMatrix``.
        """
        import sys

        file_name = 'Titanic.test.csv'
        x, passenger_id = load_data(file_name, False)
        # Was ``type == 3``: that compared the builtin ``type`` with 3 and was
        # always False, so the booster never received a DMatrix.
        if c_type == 3:
            x = xgb.DMatrix(x)
        y = c.predict(x)
        # Turn probabilities into hard 0/1 labels (a no-op for 0/1 predictions).
        y[y > 0.5] = 1
        y[~(y > 0.5)] = 0

        out_name = "Prediction_%d.csv" % c_type
        # csv.writer needs a binary file on Python 2 and a text file opened
        # with newline='' on Python 3.
        if sys.version_info[0] >= 3:
            predictions_file = open(out_name, "w", newline="")
        else:
            predictions_file = open(out_name, "wb")
        with predictions_file:
            writer = csv.writer(predictions_file)
            writer.writerow(["PassengerId", "Survived"])
            # zip() stops at the shorter input, so only one prediction per
            # passenger id is written even if ``y`` holds more rows.
            writer.writerows(zip(passenger_id, y))
    if __name__ == "__main__":
        # Train three classifiers on the Titanic training data and compare their
        # accuracy on a 25% hold-out split.
        x, y = load_data('Titanic.train.csv', True)
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, test_size=0.25, random_state=1)

        # Logistic regression
        lr = LogisticRegression(penalty='l2')
        lr.fit(x_train, y_train)
        y_hat = lr.predict(x_test)
        lr_acc = accuracy_score(y_test, y_hat)
        # write_result(lr, 1)   # uncomment to write Prediction_1.csv

        # Random forest
        rfc = RandomForestClassifier(n_estimators=100)
        rfc.fit(x_train, y_train)
        y_hat = rfc.predict(x_test)
        rfc_acc = accuracy_score(y_test, y_hat)
        # write_result(rfc, 2)  # uncomment to write Prediction_2.csv

        # XGBoost
        data_train = xgb.DMatrix(x_train, label=y_train)
        data_test = xgb.DMatrix(x_test, label=y_test)
        watch_list = [(data_test, 'eval'), (data_train, 'train')]
        # NOTE(review): newer xgboost releases appear to use 'verbosity' instead
        # of 'silent' -- confirm against the installed version.
        # Other parameters that could be set here:
        # 'subsample': 1, 'alpha': 0, 'lambda': 0, 'min_child_weight': 1
        param = {'max_depth': 6, 'eta': 0.8, 'silent': 1, 'objective': 'binary:logistic'}
        bst = xgb.train(param, data_train, num_boost_round=100, evals=watch_list)
        # binary:logistic outputs probabilities; threshold them at 0.5.
        y_hat = (bst.predict(data_test) > 0.5).astype(int)
        # write_result(bst, 3)  # uncomment to write Prediction_3.csv
        xgb_acc = accuracy_score(y_test, y_hat)

        # accuracy_score returns a fraction in [0, 1]; scale it so the value
        # matches the '%' sign in the output.
        print('Logistic回归:%.3f%%' % (100 * lr_acc))
        print('随机森林:%.3f%%' % (100 * rfc_acc))
        print('XGBoost:%.3f%%' % (100 * xgb_acc))

  • 相关阅读:
    初识 Node.js 与内置模块:初识 Node.js及Node.js 环境的安装
    Spire.Office for Java 8.10.2 同步更新Crk
    DB2HADR 一主多备 环境搭建 centos7搭建db2 hadr 一主多备
    JDK中的SPI 与 Dubbo中的SPI
    【并发】J.U.C之Java锁
    基于JAVAWEBOA办公信息管理系统计算机毕业设计源码+数据库+lw文档+系统+部署
    论文总结:3D Talking Face With Personalized Pose Dynamics
    Win11怎么添加pdf虚拟打印机
    Springboot配置Swagger2
    2022 8.9 模拟
  • 原文地址:https://blog.csdn.net/qwertyuiop0208/article/details/126045689