from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt
import time
import warnings
# Setting configuration.
warnings.filterwarnings('ignore')
SEED = 42
# Load dataset
path = '../../datasets/Home-Credit-Default-Risk/'
data = pd.read_csv(path + 'prepared_data.csv', index_col='SK_ID_CURR')
data.shape
(307511, 158)
data.groupby('TARGET').size()
TARGET
0 282686
1 24825
dtype: int64
The classes are imbalanced at roughly 11:1, which is why the models below compensate with class_weight, scale_pos_weight, or is_unbalance.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop('TARGET', axis=1),
    data['TARGET'],
    test_size=0.25,
    random_state=SEED
)
gbc = HistGradientBoostingClassifier(
    class_weight = 'balanced',
    scoring = 'roc_auc',        # metric used for early stopping
    max_iter = 1000,
    max_depth = 8,
    max_features = 0.35,
    learning_rate = 0.015,
    l2_regularization = 15,
    n_iter_no_change = 20,      # early-stopping patience
    random_state = SEED,
    verbose = 0
)
gbc.fit(X_train, y_train)
train_auc = roc_auc_score(y_train, gbc.predict_proba(X_train)[:, 1])
test_auc = roc_auc_score(y_test, gbc.predict_proba(X_test)[:, 1])
print(f"train's auc: {train_auc:.4f}")
print(f"test's auc: {test_auc:.4f}")
train's auc: 0.7963
test's auc: 0.7608
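Because n_iter_no_change enables early stopping on an internal validation split, training may stop well before max_iter = 1000. A quick sanity check (a minimal sketch; n_iter_ is the standard scikit-learn attribute, but this line is not part of the original run):
# Number of boosting iterations actually performed (early stopping may
# have cut training short of max_iter).
print('iterations used:', gbc.n_iter_)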
from sklearn.ensemble import GradientBoostingClassifier
# Create data stream
def get_minibatch(minibatch_size):
    path = '../../datasets/Home-Credit-Default-Risk/'
    # With chunksize set, read_csv returns a TextFileReader that yields
    # DataFrames lazily instead of loading the whole file at once.
    return pd.read_csv(
        path + 'prepared_data.csv',
        index_col='SK_ID_CURR',
        chunksize = minibatch_size  # return iterator
    )
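The reader supports both plain iteration and get_chunk(size), which incremental_learning below uses to carve off a fixed test set before streaming the rest. A quick sanity check of the stream (the names it and first are illustrative, not part of the original experiment):
it = get_minibatch(minibatch_size=50000)
first = it.get_chunk(size=1000)  # pull a first block of arbitrary size
print(first.shape)               # (1000, 158)
for chunk in it:                 # remaining rows arrive in 50000-row chunks
    print(chunk.shape)
    break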
gbdt = GradientBoostingClassifier(
    learning_rate = 0.015,
    n_estimators = 0,       # grown by 100 trees per mini-batch below
    subsample = 1.0,
    max_features = 0.35,
    max_depth = 8,
    n_iter_no_change = 20,
    warm_start = True,      # keep existing trees between fit calls
    random_state = SEED,
    verbose = 0
)
def gbdt_with_warm_start(X_train, y_train, X_test, y_test, i=None):
    # With warm_start=True, fit keeps the existing ensemble and only
    # grows it up to the new n_estimators, so this adds 100 trees.
    gbdt.n_estimators += 100
    gbdt.fit(X_train, y_train)
    train_pred = gbdt.predict_proba(X_train)[:, 1]
    test_pred = gbdt.predict_proba(X_test)[:, 1]
    return train_pred, test_pred, gbdt.n_estimators
def incremental_learning(iterator, refresh):
    # Hold out the first 75000 rows as a fixed test set.
    test = iterator.get_chunk(size = 75000)
    print('test data shape:', test.shape)
    X_test = test.drop('TARGET', axis=1)
    y_test = test['TARGET']
    tick = time.time()
    n_train = 0
    auc_history = []
    # Main loop: iterate over mini-batches of examples.
    for i, train in enumerate(iterator):
        X_train = train.drop('TARGET', axis=1)
        y_train = train['TARGET']
        n_train += X_train.shape[0]
        # Update the model with the examples in the current mini-batch.
        train_pred, test_pred, num_trees = refresh(X_train, y_train, X_test, y_test, i)
        duration = time.time() - tick
        train_auc = roc_auc_score(y_train, train_pred)
        test_auc = roc_auc_score(y_test, test_pred)
        # Report progress: by sample count if the model has no tree count.
        if num_trees is None:
            num = n_train
            condition = f'{n_train} train samples'
        else:
            num = num_trees
            condition = f'{num_trees} train trees'
        auc_history.append((num, train_auc, test_auc))
        progress = f"{condition}, valid's auc: {test_auc:.4f} in {duration:.2f}s"
        print(progress)
    print('finally:')
    print(f" train's auc: {train_auc:.4f}")
    print(f" valid's auc: {test_auc:.4f}")
    return auc_history
batch_size = 20000
minibatch_iterator = get_minibatch(minibatch_size = batch_size)
auc_history = incremental_learning(minibatch_iterator, gbdt_with_warm_start)
test data shape: (75000, 158)
100 train trees, valid's auc: 0.7274 in 11.37s
200 train trees, valid's auc: 0.7359 in 22.87s
300 train trees, valid's auc: 0.7378 in 34.54s
400 train trees, valid's auc: 0.7387 in 46.21s
500 train trees, valid's auc: 0.7396 in 58.19s
600 train trees, valid's auc: 0.7390 in 70.33s
700 train trees, valid's auc: 0.7390 in 82.73s
800 train trees, valid's auc: 0.7390 in 95.44s
900 train trees, valid's auc: 0.7376 in 108.18s
1000 train trees, valid's auc: 0.7375 in 121.03s
1100 train trees, valid's auc: 0.7350 in 134.19s
1200 train trees, valid's auc: 0.7316 in 143.22s
finally:
train's auc: 0.8363
valid's auc: 0.7316
def plot_score_evolution(auc_history, xlabel):
    """xlabel: training examples (#) or num trees"""
    plt.figure()
    ticks, train_auc, test_auc = zip(*auc_history)
    plt.title('Metric during incremental learning')
    plt.xlabel(xlabel)
    plt.ylabel('auc')
    plt.grid(True)
    plt.plot(ticks, train_auc, label='train')
    plt.plot(ticks, test_auc, label='test')
    plt.legend(loc='best', title='auc')
plot_score_evolution(auc_history, 'num trees')
plt.show()

from sklearn.linear_model import SGDClassifier
num_rounds = 1200  # same as the number of warm_start trees in the previous section
sgd = SGDClassifier(
    class_weight = {1: 11, 0: 1},  # ~11:1 class imbalance
    loss = 'log_loss',             # logistic loss enables predict_proba
    alpha = 0.01,
    max_iter = num_rounds,
    penalty = 'elasticnet',
    l1_ratio = 0.5
)
def sgd_partial_fit(X_train, y_train, X_test, y_test, i=None):
    # partial_fit updates the weights in place; the full label set must
    # be declared on the first call.
    sgd.partial_fit(X_train, y_train, classes=[1, 0])
    train_pred = sgd.predict_proba(X_train)[:, 1]
    test_pred = sgd.predict_proba(X_test)[:, 1]
    return train_pred, test_pred, None
minibatch_iterator = get_minibatch(minibatch_size = batch_size)
auc_history = incremental_learning(minibatch_iterator, sgd_partial_fit)
test data shape: (75000, 158)
20000 train samples, valid's auc: 0.5011 in 0.21s
40000 train samples, valid's auc: 0.5024 in 0.47s
60000 train samples, valid's auc: 0.5234 in 0.72s
80000 train samples, valid's auc: 0.5448 in 0.98s
100000 train samples, valid's auc: 0.5000 in 1.23s
120000 train samples, valid's auc: 0.5185 in 1.50s
140000 train samples, valid's auc: 0.4999 in 1.76s
160000 train samples, valid's auc: 0.5160 in 2.01s
180000 train samples, valid's auc: 0.5141 in 2.27s
200000 train samples, valid's auc: 0.5310 in 2.54s
220000 train samples, valid's auc: 0.5233 in 2.81s
232511 train samples, valid's auc: 0.5014 in 2.99s
finally:
train's auc: 0.5025
valid's auc: 0.5014
The results are plotted below:
plot_score_evolution(auc_history, 'training examples (#)')
plt.show()
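The AUC hovering near 0.5 is typical of an unscaled linear model: SGDClassifier is sensitive to feature scaling, so one common fix is to standardize each mini-batch with an incrementally fitted scaler. A minimal sketch of that variant (the scaler and the sgd_partial_fit_scaled wrapper are additions for illustration, not part of the original experiment):
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

def sgd_partial_fit_scaled(X_train, y_train, X_test, y_test, i=None):
    # Update the running mean/variance, then standardize both splits.
    scaler.partial_fit(X_train)
    X_train_s = scaler.transform(X_train)
    X_test_s = scaler.transform(X_test)
    sgd.partial_fit(X_train_s, y_train, classes=[1, 0])
    train_pred = sgd.predict_proba(X_train_s)[:, 1]
    test_pred = sgd.predict_proba(X_test_s)[:, 1]
    return train_pred, test_pred, None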

XGBoost offers two ways to do incremental learning: continuing training by passing the existing model to xgb.train via xgb_model, and updating the leaf values of existing trees via process_type = 'update'.
import xgboost as xgb
# specify parameters via map
params = dict(
    booster = 'gbtree',
    objective = 'binary:logistic',
    eval_metric = 'auc',
    scale_pos_weight = 11,   # ~11:1 class imbalance
    learning_rate = 0.015,
    max_depth = 8,
    subsample = 1.0,
    colsample_bytree = 0.35,
    reg_alpha = 65,
    reg_lambda = 15,
    seed = SEED,
    verbosity = 0
)
bst = None # init model
# Train 1200 rounds in total; each chunk runs for 100 boosting rounds.
def xgb_continue(X_train, y_train, X_test, y_test, i=None):
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    global bst
    # Passing the previous booster via xgb_model continues training,
    # appending 100 new trees per mini-batch.
    bst = xgb.train(
        params,
        dtrain,
        num_boost_round = 100,
        xgb_model = bst,
        evals = [(dtrain, "train")],
        callbacks = [xgb.callback.EarlyStopping(20)],
        verbose_eval = 0
    )
    train_pred = bst.predict(dtrain)
    test_pred = bst.predict(dtest)
    return train_pred, test_pred, bst.num_boosted_rounds()
minibatch_iterator = get_minibatch(minibatch_size = batch_size)
auc_history = incremental_learning(minibatch_iterator, xgb_continue)
test data shape: (75000, 158)
100 train trees, valid's auc: 0.7279 in 1.58s
200 train trees, valid's auc: 0.7374 in 3.06s
300 train trees, valid's auc: 0.7414 in 4.60s
400 train trees, valid's auc: 0.7455 in 6.19s
500 train trees, valid's auc: 0.7477 in 7.83s
600 train trees, valid's auc: 0.7486 in 9.55s
700 train trees, valid's auc: 0.7504 in 11.32s
800 train trees, valid's auc: 0.7505 in 13.14s
900 train trees, valid's auc: 0.7499 in 15.03s
1000 train trees, valid's auc: 0.7505 in 16.96s
1100 train trees, valid's auc: 0.7514 in 18.95s
1200 train trees, valid's auc: 0.7518 in 20.66s
finally:
train's auc: 0.7873
valid's auc: 0.7518
plot_score_evolution(auc_history, 'num trees')
plt.show()
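Because xgb_model also accepts a file path, the booster can be persisted between batches instead of held in memory, which is the usual pattern when new data arrives across separate processes. A minimal sketch (the file name and the dnext/bst2 names are illustrative):
bst.save_model('incremental.json')           # illustrative file name
dnext = xgb.DMatrix(X_train, label=y_train)  # next mini-batch of data
bst2 = xgb.train(params, dnext, num_boost_round=100,
                 xgb_model='incremental.json')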

Updating leaf values with the process_type parameter. With process_type = 'update' and updater = 'refresh', XGBoost keeps the tree structures fixed and only re-estimates the leaf values and node statistics on the new data.
# specify parameters via map
params = dict(
    process_type = 'default',  # set process_type to 'default' to build new trees
    booster = 'gbtree',
    objective = 'binary:logistic',
    eval_metric = 'auc',
    scale_pos_weight = 11,
    learning_rate = 0.015,
    max_depth = 8,
    subsample = 1.0,
    colsample_bytree = 0.35,
    reg_alpha = 65,
    reg_lambda = 15,
    seed = SEED,
    verbosity = 0
)
bst = None # init model
# The model adapts to new data by changing leaf values only (split
# conditions are left unchanged).
def xgb_refresh(X_train, y_train, X_test, y_test, i=None):
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    # Update the estimator with the examples in the current mini-batch.
    global bst
    bst = xgb.train(
        params,
        dtrain,
        num_boost_round = num_rounds,
        xgb_model = bst,
        evals = [(dtrain, "train")],
        # callbacks = [xgb.callback.EarlyStopping(20)],
        verbose_eval = 0
    )
    train_pred = bst.predict(dtrain)
    test_pred = bst.predict(dtest)
    if i == 0:
        # After the first chunk has built the full 1200-tree model,
        # switch to refresh mode: subsequent chunks only update leaves.
        params['process_type'] = 'update'
        params['updater'] = 'refresh'
        params['refresh_leaf'] = True  # refresh leaf values and tree statistics
    return train_pred, test_pred, None
minibatch_iterator = get_minibatch(minibatch_size = batch_size)
auc_history = incremental_learning(minibatch_iterator, xgb_refresh)
test data shape: (75000, 158)
20000 train samples, valid's auc: 0.7369 in 8.38s
40000 train samples, valid's auc: 0.7419 in 15.76s
60000 train samples, valid's auc: 0.7411 in 23.15s
80000 train samples, valid's auc: 0.7428 in 30.53s
100000 train samples, valid's auc: 0.7414 in 37.93s
120000 train samples, valid's auc: 0.7422 in 45.48s
140000 train samples, valid's auc: 0.7431 in 53.24s
160000 train samples, valid's auc: 0.7413 in 60.97s
180000 train samples, valid's auc: 0.7404 in 68.75s
200000 train samples, valid's auc: 0.7416 in 76.57s
220000 train samples, valid's auc: 0.7410 in 84.31s
232511 train samples, valid's auc: 0.7394 in 90.01s
finally:
train's auc: 0.7557
valid's auc: 0.7394
plot_score_evolution(auc_history, 'training examples (#)')
plt.show()

LightGBM likewise has two ways to control incremental learning: continuing training by passing the existing booster to lgb.train via init_model, and refitting, which updates the leaf values of existing trees on new data.
import lightgbm as lgb
# specify parameters via map
params = dict(
    boosting_type = 'gbdt',
    objective = 'binary',
    metric = 'auc',
    is_unbalance = True,  # compensate for the ~11:1 class imbalance
    learning_rate = 0.015,
    max_depth = 8,
    feature_fraction = 0.35,
    bagging_fraction = 1.0,
    lambda_l1 = 65,
    lambda_l2 = 15,
    subsample_freq = 5,
    random_state = SEED,
    verbosity = -1
)
gbm = None # init model
# Train 1200 rounds in total; each chunk runs for 100 boosting rounds.
def lgb_continue(X_train, y_train, X_test, y_test, i=None):
    dtrain = lgb.Dataset(X_train, label=y_train)
    global gbm
    # Passing the previous booster via init_model continues training,
    # appending 100 new trees per mini-batch.
    gbm = lgb.train(
        params,
        dtrain,
        num_boost_round = 100,
        init_model = gbm,
        valid_sets = [dtrain],
        callbacks = [lgb.early_stopping(stopping_rounds=20)],
        keep_training_booster = True
    )
    train_pred = gbm.predict(X_train)
    test_pred = gbm.predict(X_test)
    return train_pred, test_pred, gbm.num_trees()
minibatch_iterator = get_minibatch(minibatch_size = batch_size)
auc_history = incremental_learning(minibatch_iterator, lgb_continue)
test data shape: (75000, 158)
100 train trees, valid's auc: 0.7179 in 0.62s
200 train trees, valid's auc: 0.7301 in 1.55s
300 train trees, valid's auc: 0.7372 in 2.49s
400 train trees, valid's auc: 0.7420 in 3.56s
500 train trees, valid's auc: 0.7447 in 4.73s
600 train trees, valid's auc: 0.7461 in 6.05s
700 train trees, valid's auc: 0.7482 in 7.48s
800 train trees, valid's auc: 0.7486 in 9.04s
900 train trees, valid's auc: 0.7488 in 10.73s
1000 train trees, valid's auc: 0.7494 in 12.52s
1100 train trees, valid's auc: 0.7500 in 14.45s
1200 train trees, valid's auc: 0.7506 in 16.18s
finally:
train's auc: 0.7768
valid's auc: 0.7506
The keep_training_booster (bool) parameter indicates whether the returned model (booster) will be used to keep training; it defaults to False. When the model is very large and causes memory errors, try setting this parameter to True to avoid the model_to_string conversion. The returned booster can then still be used as init_model for future continued training.
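Even with keep_training_booster left at its default, the returned booster can still seed a new training run; LightGBM just performs an internal string conversion, roughly along the lines of this sketch (illustrative, not part of the original run):
# What LightGBM effectively does internally when the returned booster
# is not kept as a training booster:
dtrain = lgb.Dataset(X_train, label=y_train)
model_str = gbm.model_to_string()         # serialize the trained booster
init = lgb.Booster(model_str=model_str)   # rebuild a booster from the string
gbm2 = lgb.train(params, dtrain, num_boost_round=100, init_model=init)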
plot_score_evolution(auc_history, 'num trees')
plt.show()

# specify parameters via map
params = dict(
    boosting_type = 'gbdt',
    objective = 'binary',
    metric = 'auc',
    is_unbalance = True,
    learning_rate = 0.015,
    max_depth = 8,
    feature_fraction = 0.35,
    bagging_fraction = 1.0,
    lambda_l1 = 65,
    lambda_l2 = 15,
    subsample_freq = 5,
    random_state = SEED,
    verbosity = -1
)
gbm = None # init model
# The model adapts to new data by changing leaf values only (split
# conditions are left unchanged).
def lgb_refit(X_train, y_train, X_test, y_test, i=None):
    dtrain = lgb.Dataset(X_train, label=y_train)
    # Update the estimator with the examples in the current mini-batch.
    global gbm
    gbm = lgb.train(
        params,
        dtrain,
        num_boost_round = num_rounds,
        init_model = gbm,
        valid_sets = [dtrain],
        keep_training_booster = True
    )
    train_pred = gbm.predict(X_train)
    test_pred = gbm.predict(X_test)
    if i == 0:
        # After the first chunk has built the full 1200-tree model,
        # switch to refit mode: subsequent chunks only update leaf values.
        params['task'] = 'refit'
        params['refit_decay_rate'] = 0.9
    return train_pred, test_pred, None
minibatch_iterator = get_minibatch(minibatch_size = batch_size)
auc_history = incremental_learning(minibatch_iterator, lgb_refit)
test data shape: (75000, 158)
20000 train samples, valid's auc: 0.7373 in 4.96s
40000 train samples, valid's auc: 0.7423 in 10.99s
60000 train samples, valid's auc: 0.7423 in 18.07s
80000 train samples, valid's auc: 0.7424 in 26.49s
100000 train samples, valid's auc: 0.7438 in 36.72s
120000 train samples, valid's auc: 0.7409 in 48.54s
140000 train samples, valid's auc: 0.7419 in 63.20s
160000 train samples, valid's auc: 0.7394 in 80.62s
180000 train samples, valid's auc: 0.7363 in 101.09s
200000 train samples, valid's auc: 0.7386 in 125.00s
220000 train samples, valid's auc: 0.7360 in 153.26s
232511 train samples, valid's auc: 0.7347 in 182.33s
finally:
train's auc: 0.8629
valid's auc: 0.7347
The refit_decay_rate parameter controls the decay rate of the leaf outputs in the refit task. After refitting, each leaf's output is computed as:
leaf_output = refit_decay_rate * old_leaf_output + (1.0 - refit_decay_rate) * new_leaf_output
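With refit_decay_rate = 0.9, a leaf that previously output 0.30 and would output 0.10 on the new data ends up at 0.9 * 0.30 + 0.1 * 0.10 = 0.28, so each refit nudges the model toward the new data rather than replacing it. The same operation is also exposed directly on the booster as Booster.refit (a minimal sketch; X_new and y_new are hypothetical placeholders for a fresh mini-batch):
# Refit the existing trees' leaf values on new data; tree structure is
# unchanged and outputs are blended using decay_rate as above.
gbm_refitted = gbm.refit(X_new, y_new, decay_rate=0.9)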
plot_score_evolution(auc_history, 'training examples (#)')
plt.show()
