This hands-on exercise builds an insurance claim prediction model on the given data. Dataset: Allstate Claims Severity | Kaggle.
The training data contains 116 columns of categorical data (cat1-cat116) and 14 columns of continuous data (cont1-cont14). Each row in the dataset represents one insurance claim; the task is to predict the value of the 'loss' column.

```python
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pyecharts.charts import Bar
import pyecharts.options as opts
from scipy import stats
import seaborn as sns
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

train_data = pd.read_csv('./allstate-claims-severity/train.csv')
test_data = pd.read_csv('./allstate-claims-severity/test.csv')
print(train_data.info())
```

```python
print(train_data.head())
print(train_data.shape)
```
```python
print('First 20 column names:', list(train_data.columns[:20]))
print('Last 20 column names:', list(train_data.columns[-20:]))
```

```python
print(train_data.describe())
```

```python
print(pd.isnull(train_data).values.any())  # False means there are no missing values
```

See pandas' `any` method.
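If anything had been missing, a per-column breakdown would locate it quickly; a minimal sketch using `isnull().sum()`:

```python
# count missing values per column; this dataset has none, so the filtered Series is empty
missing_counts = train_data.isnull().sum()
print(missing_counts[missing_counts > 0])
```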
```python
cat_features = list(train_data.select_dtypes(include=['object']).columns)
print('There are {} categorical features: {}'.format(len(cat_features), cat_features))
cont_features = [cont for cont in list(train_data.select_dtypes(include=['float64']).columns) if cont not in ['loss']]
print('There are {} continuous features: {}'.format(len(cont_features), cont_features))
```

See pandas' `select_dtypes` method.
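As a cross-check on how `select_dtypes` partitions the columns, counting columns by dtype should show 116 object columns (the categorical features), 15 float64 columns (cont1-cont14 plus loss), and one int64 column (id):

```python
# how many columns of each dtype the training frame contains
print(train_data.dtypes.value_counts())
```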
```python
cat_uniques = {}
for cat in cat_features:
    cat_uniques[cat] = len(list(train_data[cat].unique()))
cat_uniques_df = pd.DataFrame({'cat_name': list(cat_uniques.keys()),
                               'unique_values': list(cat_uniques.values())})
print(cat_uniques_df)

data = cat_uniques_df['unique_values'].value_counts().to_dict()
bar = Bar()
bar.add_xaxis(list(data.keys()))
bar.add_yaxis('', list(data.values()))
bar.set_global_opts(title_opts=opts.TitleOpts(title='Distribution of distinct values in categorical features', pos_left='center'),
                    xaxis_opts=opts.AxisOpts(name='Number of distinct values\nin a categorical feature'),
                    yaxis_opts=opts.AxisOpts(name='Number of categorical features\nwith X distinct values', is_show=True, splitline_opts=opts.SplitLineOpts(is_show=True)))
bar.render('a.html')
```


As we can see, most of the categorical features (72 of 116) are binary, and one feature has 326 distinct values.
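Both numbers can be verified directly from the `cat_uniques_df` built above; a quick check (that the max-cardinality feature is cat116 is an expectation worth confirming against your own output):

```python
# verify the claim: count binary categorical features and find the max-cardinality one
print((cat_uniques_df['unique_values'] == 2).sum())                  # expect 72
print(cat_uniques_df.loc[cat_uniques_df['unique_values'].idxmax()])  # expect cat116 with 326 values
```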
```python
fig, ax = plt.subplots(2, 5, figsize=(16, 9))
for i in range(2):
    for j in range(5):
        cat = cat_features[5 * i + j]
        data = train_data[cat].value_counts().to_dict()
        ax[i][j].bar(list(data.keys()), list(data.values()), width=0.5)
        ax[i][j].set_xlabel(cat)
plt.savefig('./g.png')
```

```python
train_data[cont_features].hist(bins=50, figsize=(16, 12))
plt.savefig('./d.png', dpi=300)
```

```python
plt.subplots(figsize=(16, 9))
corr_data = train_data[cont_features].corr()
sns.heatmap(corr_data, annot=True)
plt.savefig('./e.png', dpi=300)
```

```python
cont_features.append('loss')
res = train_data[cont_features].corr()['loss'].apply(lambda x: round(x, 4)).to_dict()
print(list(res.keys()))
bar2 = Bar()
bar2.add_xaxis(list(res.keys())[:-1])
bar2.add_yaxis('', list(res.values())[:-1], label_opts=opts.LabelOpts(formatter='{c}'))
bar2.set_global_opts(xaxis_opts=opts.AxisOpts(axistick_opts=opts.AxisTickOpts(is_inside=True),  # draw tick marks inside the axis
                                              axislabel_opts=opts.LabelOpts(interval=0)))  # show every x-axis label instead of every other one
bar2.render('f.html')
```

```python
plt.figure(figsize=(16, 8))
plt.plot(train_data['id'], train_data['loss'], label='loss')  # label needed for plt.legend() below
print("Number of values in train_data['id']:", len(train_data['id']))
plt.title('Loss values per id')
plt.xlabel('id')
plt.ylabel('loss')
plt.legend()
plt.savefig('./b.png', dpi=300)
# plt.show()
```

Skewness measures the asymmetry of a real-valued random variable's distribution about its mean. Let's compute the skewness of loss.
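For reference, the skewness computed below is the standardized third central moment:

$$\mathrm{skew}(X) = \mathbb{E}\!\left[\left(\frac{X-\mu}{\sigma}\right)^{3}\right] = \frac{\mathbb{E}\left[(X-\mu)^{3}\right]}{\sigma^{3}}$$

A value near 0 indicates symmetry; a large positive value means a long right tail, which is why taking the log of the right-skewed loss pulls its skewness toward 0.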
```python
# scipy.stats statistical metrics
skew = stats.mstats.skew(train_data['loss']).data
print(skew)
after_transform_skew = stats.mstats.skew(np.log(train_data['loss'])).data
print(after_transform_skew)

# compare the two loss distributions
fig, (ax1, ax2) = plt.subplots(1, 2)
fig.set_size_inches(16, 5)
ax1.hist(train_data['loss'], bins=50)
ax1.set_title('Train Loss target histogram')
ax1.grid(True)
ax2.hist(np.log(train_data['loss']), bins=50, color='g')
ax2.set_title('Train Log Loss target histogram')
ax2.grid(True)
plt.savefig('./c.png', dpi=300)
# plt.show()
```


```python
train_data['log_loss'] = np.log(train_data['loss'])
train_y = train_data['log_loss']
features = [x for x in train_data.columns if x not in ['id', 'loss', 'log_loss']]
train_x = train_data[features].copy()  # copy to avoid SettingWithCopyWarning below
# convert the categorical feature columns to the category dtype, then to integer codes
for cat in cat_features:
    train_x[cat] = train_x[cat].astype('category').cat.codes
print(train_x)
```

```python
features_test = [x for x in test_data.columns if x not in ['id']]
test_x = test_data[features_test].copy()
# convert the categorical feature columns to the category dtype, then to integer codes
for cat in cat_features:
    test_x[cat] = test_x[cat].astype('category').cat.codes
print(test_x)
```
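One caveat with this encoding: `.cat.codes` assigns integers from each frame's own category set, so if train and test contain different category values for a column, the same letter can map to different integers in the two frames. A safer variant (a sketch; `shared` is an illustrative name, not from the original code) builds one category set from the union:

```python
# safer variant (sketch): encode train and test with one shared category set
for cat in cat_features:
    shared = pd.Categorical(pd.concat([train_data[cat], test_data[cat]])).categories
    train_x[cat] = pd.Categorical(train_data[cat], categories=shared).codes
    test_x[cat] = pd.Categorical(test_data[cat], categories=shared).codes
```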
A quick reference for common XGBoost parameters (the example values are illustrative; for this regression task we use reg:squarederror below):

- booster: gbtree, the method used to grow and split trees: gradient-boosted trees.
- objective: multi:softmax, the loss function to use; softmax is for multi-class classification.
- num_class: 10, the number of classes, used together with multi:softmax.
- gamma: the minimum loss reduction required to make a split.
- max_depth: 12, the depth of each tree; larger values overfit more easily.
- lambda: 2, the L2 regularization term on weights that controls model complexity; the larger it is, the less the model tends to overfit.
- subsample: 0.7, randomly sample the training instances, here training on 70% of the data.
- colsample_bytree: 0.7, the column subsampling ratio when constructing each tree.
- min_child_weight: 3, the minimum sum of instance weights in a child node; if a leaf's weight sum falls below min_child_weight, the splitting process stops.
- silent: 0, set to 1 to suppress run-time messages; 0 is usually preferable so you can see what is happening.
- eta: 0.007, like a learning rate: the weight a newly added tree contributes to the result while the earlier trees stay fixed.
- seed: 1000, the random seed.
- nthread: 7, the number of CPU threads.
```python
x_train, x_test, y_train, y_test = train_test_split(train_x, train_y, test_size=0.2, random_state=14)
print(x_test)
dtrain = xgb.DMatrix(x_train, y_train)
d_test = xgb.DMatrix(x_test, y_test)
xgb_params = {
    'seed': 0,
    'eta': 0.1,
    'colsample_bytree': 0.5,
    'silent': 1,
    'subsample': 0.5,
    'objective': 'reg:squarederror',
    'max_depth': 5,
    'min_child_weight': 3
}
```
Evaluation strategy: since the model is trained on log(loss), predictions are mapped back to the original scale with e to the power (np.exp) before evaluation.
Metric: mean absolute error, i.e. mean_absolute_error(np.exp(y), np.exp(yhat)).
```python
def xg_eval_mae(y_hat, dtrain):
    y = dtrain.get_label()
    return 'mae', mean_absolute_error(np.exp(y), np.exp(y_hat))

# feval: custom evaluation function
bst_cv1 = xgb.cv(xgb_params, dtrain, num_boost_round=50, nfold=3, seed=0,
                 feval=xg_eval_mae, maximize=False, early_stopping_rounds=10)
print('CV score:', bst_cv1.iloc[-1, :]['test-mae-mean'])
plt.figure()
bst_cv1[['train-mae-mean', 'test-mae-mean']].plot()
plt.savefig('./h.png', dpi=300)
```

Above we trained 50 boosting rounds; let's set it to 100 and compare the results.
```python
bst_cv2 = xgb.cv(xgb_params, dtrain, num_boost_round=100, nfold=3, seed=0,
                 feval=xg_eval_mae, maximize=False, early_stopping_rounds=10)
print('CV score:', bst_cv2.iloc[-1, :]['test-mae-mean'])  # use bst_cv2, not bst_cv1
```

```python
fig, (ax1, ax2) = plt.subplots(1, 2)
fig.set_size_inches(16, 4)
ax1.set_title('100 rounds of training')
ax1.set_xlabel('Rounds')
ax1.set_ylabel('Loss')
ax1.grid(True)
ax1.plot(bst_cv2[['train-mae-mean', 'test-mae-mean']])
ax1.legend(['Training Loss', 'Test Loss'])
ax2.set_title('60 last rounds of training')
ax2.set_xlabel('Rounds')
ax2.set_ylabel('Loss')
ax2.grid(True)
ax2.plot(bst_cv2.iloc[40:][['train-mae-mean', 'test-mae-mean']])
ax2.legend(['Training Loss', 'Test Loss'])
plt.savefig('./i.png', dpi=300)
```

We increased the number of trees to 100, but the effect is not very noticeable. Looking at the last 60 rounds, the test error is only a tiny bit higher than the training error, so there is a slight amount of overfitting.
Our CV score is lower, though. Next, we change other parameters.
Before changing anything, we output the current model's predictions on the held-out test split, so we can see whether later changes improve the results.
```python
dtest_x = xgb.DMatrix(test_x)
model = xgb.train(xgb_params, dtrain, num_boost_round=50)
test_predict = model.predict(d_test)
mse = mean_squared_error(y_test, test_predict)
print(mse)
test_y = model.predict(dtest_x)
print(test_y[1], len(test_y))
```

Before tuning, the MSE is 0.321.

Scoring metric:
```python
def mae_score(y_true, y_pred):
    return mean_absolute_error(np.exp(y_true), np.exp(y_pred))

# greater_is_better=False: GridSearchCV negates the score, so best_score_ is -MAE
mae_scorer = make_scorer(mae_score, greater_is_better=False)

params = {
    'seed': 0,
    'eta': 0.1,
    'colsample_bytree': 0.5,
    'silent': 1,
    'subsample': 0.5,
    'objective': 'reg:squarederror',
    # 'max_depth': 5,
    # 'min_child_weight': 3,
    'n_estimators': 50,  # the sklearn wrapper uses n_estimators, not num_boost_round
}
cv_params = {'max_depth': list(range(4, 9)), 'min_child_weight': list((1, 3, 6))}
model = xgb.XGBRegressor(**params)
grid = GridSearchCV(estimator=model, param_grid=cv_params, scoring=mae_scorer, cv=5, verbose=1, n_jobs=4)
grid.fit(x_train, y_train)
test_predict = grid.predict(x_test)
mse = mean_squared_error(y_test, test_predict)
test_y = grid.predict(test_x)
print(grid.best_params_)
print(mse)
print(grid.best_score_)
```

After tuning max_depth and min_child_weight, the MSE drops from 0.321 to 0.296.
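To see how sensitive the score is to each combination, `grid.cv_results_` (a standard GridSearchCV attribute) can be inspected; a minimal sketch, remembering that our scorer stores the negated MAE:

```python
# print each tried parameter combination with its mean cross-validated MAE
for mean_score, p in zip(grid.cv_results_['mean_test_score'], grid.cv_results_['params']):
    print(round(-mean_score, 2), p)  # negate because greater_is_better=False stores -MAE
```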
```python
params = {
    'seed': 0,
    'eta': 0.1,
    'colsample_bytree': 0.5,
    'silent': 1,
    'subsample': 0.5,
    'objective': 'reg:squarederror',
    'max_depth': 8,
    'min_child_weight': 3,
    'n_estimators': 50,
}
cv_params = {'gamma': [0.1 * i for i in range(0, 5)]}
model = xgb.XGBRegressor(**params)
grid = GridSearchCV(estimator=model, param_grid=cv_params, scoring=mae_scorer, cv=5, verbose=1, n_jobs=4)
grid.fit(x_train, y_train)
test_predict = grid.predict(x_test)
mse = mean_squared_error(y_test, test_predict)
test_y = grid.predict(test_x)
print(grid.best_params_)
print(mse)
print(grid.best_score_)
```

After tuning gamma, the MSE goes from 0.296 to 0.297.
```python
params = {
    'seed': 0,
    'eta': 0.1,
    # 'colsample_bytree': 0.5,
    'silent': 1,
    # 'subsample': 0.5,
    'objective': 'reg:squarederror',
    'max_depth': 8,
    'min_child_weight': 3,
    'n_estimators': 50,
    'gamma': 0.1
}
cv_params = {'subsample': [0.1 * i for i in range(6, 9)],
             'colsample_bytree': [0.1 * i for i in range(6, 9)]}
model = xgb.XGBRegressor(**params)
grid = GridSearchCV(estimator=model, param_grid=cv_params, scoring=mae_scorer, cv=5, verbose=1, n_jobs=4)
grid.fit(x_train, y_train)
test_predict = grid.predict(x_test)
mse = mean_squared_error(y_test, test_predict)
test_y = grid.predict(test_x)
print(grid.best_params_)
print(mse)
print(grid.best_score_)
```

After tuning the sampling parameters subsample and colsample_bytree, the MSE goes from 0.297 to 0.296.
```python
params = {
    'seed': 0,
    # 'eta': 0.1,
    'colsample_bytree': 0.6,
    'silent': 1,
    'subsample': 0.7,
    'objective': 'reg:squarederror',
    'max_depth': 8,
    'min_child_weight': 3,
    'n_estimators': 100,
    'gamma': 0.1
}
cv_params = {'eta': [0.5, 0.4, 0.3, 0.2, 0.1, 0.075, 0.05, 0.04, 0.03]}
model = xgb.XGBRegressor(**params)
grid = GridSearchCV(estimator=model, param_grid=cv_params, scoring=mae_scorer, cv=5, verbose=1, n_jobs=4)
grid.fit(x_train, y_train)
test_predict = grid.predict(x_test)
mse = mean_squared_error(y_test, test_predict)
test_y = grid.predict(test_x)
print(grid.best_params_)
print(mse)
print(grid.best_score_)
```

Lowering the learning rate while increasing the number of trees: the search did not change eta, so the MSE stays the same.
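A common alternative to grid-searching eta is to fix a small eta and let early stopping pick the number of trees, since the two trade off against each other. A sketch reusing the earlier xgb.cv setup (the eta value and round limit here are illustrative, not from the original run):

```python
# fix a small learning rate and let early stopping choose the round count
slow_params = dict(xgb_params, eta=0.03, max_depth=8, min_child_weight=3,
                   gamma=0.1, subsample=0.7, colsample_bytree=0.6)
bst_cv3 = xgb.cv(slow_params, dtrain, num_boost_round=1000, nfold=3, seed=0,
                 feval=xg_eval_mae, maximize=False, early_stopping_rounds=20)
print('rounds kept:', len(bst_cv3), 'CV score:', bst_cv3.iloc[-1]['test-mae-mean'])
```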
By tuning parameters with GridSearchCV we found a set of optimal parameters; with them the model performs best and the MSE is lowest, dropping from 0.321 before tuning to 0.296.
The optimal parameters:

```python
best_params = {'max_depth': 8,
               'min_child_weight': 3,
               'gamma': 0.1,
               'colsample_bytree': 0.6,
               'subsample': 0.7}
```
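To turn this into a Kaggle submission, one option is to retrain on the full training set with the tuned parameters and exponentiate the predictions back to the original loss scale. A minimal sketch; the file name submission.csv and the eta/round values are assumptions, not from the original post:

```python
# retrain with the tuned parameters on all training data, then predict the real test set
final_params = dict(best_params, eta=0.1, objective='reg:squarederror', seed=0)
final_model = xgb.train(final_params, xgb.DMatrix(train_x, train_y), num_boost_round=100)
pred_log = final_model.predict(xgb.DMatrix(test_x))
submission = pd.DataFrame({'id': test_data['id'], 'loss': np.exp(pred_log)})  # undo the log transform
submission.to_csv('submission.csv', index=False)
```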