• Incremental Learning Demo


    Scikit-learn

    Baseline

    from sklearn.ensemble import HistGradientBoostingClassifier
    from sklearn.metrics import roc_auc_score
    from sklearn.model_selection import train_test_split
    import pandas as pd
    import matplotlib.pyplot as plt
    import time
    import warnings
    
    # Setting configuration.
    warnings.filterwarnings('ignore')
    
    SEED = 42
    
    # Load dataset
    path = '../../datasets/Home-Credit-Default-Risk/'
    data = pd.read_csv(path + 'prepared_data.csv', index_col='SK_ID_CURR')
    
    data.shape
    
    (307511, 158)
    
    data.groupby('TARGET').size()
    
    TARGET
    0    282686
    1     24825
    dtype: int64
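
    The classes are imbalanced at roughly 11:1, which is why the models below weight the positive class accordingly (class_weight = 'balanced', scale_pos_weight = 11, is_unbalance = True).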
    
    X_train, X_test, y_train, y_test = train_test_split(
        data.drop('TARGET', axis=1),
        data['TARGET'],
        test_size=0.25,
        random_state=SEED
    )
    
    gbc = HistGradientBoostingClassifier(
        class_weight = 'balanced',
        scoring = 'roc_auc',
        max_iter = 1000, 
        max_depth = 8,
        max_features = 0.35,
        learning_rate = 0.015,
        l2_regularization = 15,
        n_iter_no_change = 20,
        random_state = SEED,
        verbose = 0
    )
    gbc.fit(X_train, y_train)
    
    train_auc = roc_auc_score(y_train, gbc.predict_proba(X_train)[:, 1])
    test_auc = roc_auc_score(y_test, gbc.predict_proba(X_test)[:, 1])
    print(f"train's auc: {train_auc:.4f}")
    print(f"test's auc: {test_auc:.4f}")
    
    train's auc: 0.7963
    test's auc: 0.7608
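
    This full-batch model (test AUC ≈ 0.761) is the reference point for the incremental variants below.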
    

    Add new trees with the warm_start parameter

    from sklearn.ensemble import GradientBoostingClassifier
    
    # Create data stream
    def get_minibatch(minibatch_size):
        path = '../../datasets/Home-Credit-Default-Risk/'
        return pd.read_csv(
            path + 'prepared_data.csv', 
            index_col='SK_ID_CURR', 
            chunksize = minibatch_size  # return iterator
        )
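
    Note that pd.read_csv with chunksize returns a TextFileReader rather than a DataFrame. A quick illustrative check of the streaming pattern used below (the sizes here are arbitrary):

    it = get_minibatch(minibatch_size=1000)
    first = it.get_chunk(size=5)            # pull an initial chunk of any size
    print(type(it).__name__, first.shape)   # TextFileReader (5, 158)
    # the remaining rows then stream in chunks of 1000 when iterating over `it`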
    
    gbdt = GradientBoostingClassifier(
        learning_rate = 0.015,
        n_estimators = 0,
        subsample = 1.0,
        max_features = 0.35,
        max_depth = 8,
        n_iter_no_change = 20,
        warm_start = True,
        random_state = SEED,
        verbose = 0
    )
    
    def gbdt_with_warm_start(X_train, y_train, X_test, y_test, i=None):
        # With warm_start=True, raising n_estimators and refitting keeps the
        # existing trees and only fits the 100 new ones on the current chunk.
        gbdt.n_estimators += 100
        gbdt.fit(X_train, y_train)
        train_pred = gbdt.predict_proba(X_train)[:, 1]
        test_pred = gbdt.predict_proba(X_test)[:, 1]
        return train_pred, test_pred, gbdt.n_estimators
    
    def incremental_learning(iterator, refresh):
        test = iterator.get_chunk(size = 75000)
        print('test data shape:', test.shape)
        
        X_test = test.drop('TARGET', axis=1)
        y_test = test['TARGET']
    
        tick = time.time()
        n_train = 0
        auc_history = []
        
        # Main loop : iterate on mini-batches of examples
        for i, train in enumerate(iterator):
            X_train = train.drop('TARGET', axis = 1)
            y_train = train['TARGET']
            n_train += X_train.shape[0]
            
            # update model with examples in the current mini-batch
            train_pred, test_pred, num_trees = refresh(X_train, y_train, X_test, y_test, i)
            duration = time.time() - tick
            
            train_auc = roc_auc_score(y_train, train_pred)
            test_auc = roc_auc_score(y_test, test_pred)
            
            # report progress information
            if num_trees is None:
                num = n_train
                condition = f'{n_train} train samples'
            else:
                num = num_trees
                condition = f'{num_trees} train trees'
            
            auc_history.append((num, train_auc, test_auc))
            progress = f"{condition}, valid's auc: {test_auc:.4f} in {duration:.2f}s" 
            print(progress)
        
        print('finally:')
        print(f"  train's auc: {train_auc:.4f}")
        print(f"  valid's auc: {test_auc:.4f}")
        
        return auc_history
    
    batch_size = 20000
    
    minibatch_iterator = get_minibatch(minibatch_size = batch_size)
    auc_history = incremental_learning(minibatch_iterator, gbdt_with_warm_start)
    
    test data shape: (75000, 158)
    100 train trees, valid's auc: 0.7274 in 11.37s
    200 train trees, valid's auc: 0.7359 in 22.87s
    300 train trees, valid's auc: 0.7378 in 34.54s
    400 train trees, valid's auc: 0.7387 in 46.21s
    500 train trees, valid's auc: 0.7396 in 58.19s
    600 train trees, valid's auc: 0.7390 in 70.33s
    700 train trees, valid's auc: 0.7390 in 82.73s
    800 train trees, valid's auc: 0.7390 in 95.44s
    900 train trees, valid's auc: 0.7376 in 108.18s
    1000 train trees, valid's auc: 0.7375 in 121.03s
    1100 train trees, valid's auc: 0.7350 in 134.19s
    1200 train trees, valid's auc: 0.7316 in 143.22s
    finally:
      train's auc: 0.8363
      valid's auc: 0.7316
    
    def plot_score_evolution(auc_history, xlabel):
        """xlabel: training examples (#) or num trees"""
        plt.figure()
        ticks, train_auc, test_auc = zip(*auc_history)
        plt.title('Metric during incremental learning')
        plt.xlabel(xlabel)
        plt.ylabel('auc')
        plt.grid(True)
        plt.plot(ticks, train_auc, label='train')
        plt.plot(ticks, test_auc, label='test')
        plt.legend(loc='best', title='auc')
    
    plot_score_evolution(auc_history, 'num trees')
    plt.show()
    

    Update the model with the partial_fit method
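
    Linear models such as SGDClassifier support true out-of-core learning through partial_fit, which performs one pass of stochastic gradient descent over the given mini-batch (max_iter only affects fit, not partial_fit).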

    from sklearn.linear_model import SGDClassifier
    
    num_rounds = 1200  # same as the number of warm_start trees in the previous section
    
    sgd = SGDClassifier(
        class_weight = {1: 11, 0: 1}, 
        loss='log_loss', 
        alpha = 0.01,
        max_iter = num_rounds,  
        penalty='elasticnet',
        l1_ratio = 0.5
    )
    
    def sgd_partial_fit(X_train, y_train, X_test, y_test, i=None):
        sgd.partial_fit(X_train, y_train, classes=[1, 0])
        train_pred = sgd.predict_proba(X_train)[:, 1]
        test_pred = sgd.predict_proba(X_test)[:, 1]
        return train_pred, test_pred, None
    
    minibatch_iterator = get_minibatch(minibatch_size = batch_size)
    auc_history = incremental_learning(minibatch_iterator, sgd_partial_fit)
    
    test data shape: (75000, 158)
    20000 train samples, valid's auc: 0.5011 in 0.21s
    40000 train samples, valid's auc: 0.5024 in 0.47s
    60000 train samples, valid's auc: 0.5234 in 0.72s
    80000 train samples, valid's auc: 0.5448 in 0.98s
    100000 train samples, valid's auc: 0.5000 in 1.23s
    120000 train samples, valid's auc: 0.5185 in 1.50s
    140000 train samples, valid's auc: 0.4999 in 1.76s
    160000 train samples, valid's auc: 0.5160 in 2.01s
    180000 train samples, valid's auc: 0.5141 in 2.27s
    200000 train samples, valid's auc: 0.5310 in 2.54s
    220000 train samples, valid's auc: 0.5233 in 2.81s
    232511 train samples, valid's auc: 0.5014 in 2.99s
    finally:
      train's auc: 0.5025
      valid's auc: 0.5014
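
    The near-random validation AUC is expected rather than a bug in partial_fit: unlike the tree models, SGDClassifier is very sensitive to feature scaling, and the features here are fed in raw. A sketch of one common remedy (an addition for illustration, not part of the original run): standardize each mini-batch with statistics accumulated so far via StandardScaler.partial_fit.

    from sklearn.preprocessing import StandardScaler

    scaler = StandardScaler()

    def sgd_partial_fit_scaled(X_train, y_train, X_test, y_test, i=None):
        # Update the running mean/variance on the current mini-batch only,
        # then standardize both sets before the SGD update.
        scaler.partial_fit(X_train)
        X_train_s = scaler.transform(X_train)
        X_test_s = scaler.transform(X_test)
        sgd.partial_fit(X_train_s, y_train, classes=[1, 0])
        train_pred = sgd.predict_proba(X_train_s)[:, 1]
        test_pred = sgd.predict_proba(X_test_s)[:, 1]
        return train_pred, test_pred, None

    This function can be passed to incremental_learning in place of sgd_partial_fit (with a freshly initialized sgd).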
    

    Plot of the results:

    plot_score_evolution(auc_history, 'training examples (#)')
    plt.show()
    

    [Figure: train/test AUC vs. number of training examples]

    XGBoost

    XGBoost provides two ways to do incremental learning:

    • One adds new trees on top of the current ensemble, leaving the existing trees unchanged;
    • The other keeps the tree structure fixed and recomputes the leaf statistics and/or leaf values (see the toy sketch below).
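
    Both modes are demonstrated in full on the Home Credit data below. As a compact, self-contained illustration (toy data and parameters are assumptions, not from the original demo):

    import numpy as np
    import xgboost as xgb

    rng = np.random.default_rng(0)
    X = rng.normal(size=(1000, 5))
    y = (X[:, 0] + rng.normal(scale=0.5, size=1000) > 0).astype(int)
    d1 = xgb.DMatrix(X[:500], label=y[:500])
    d2 = xgb.DMatrix(X[500:], label=y[500:])

    base = dict(objective='binary:logistic', max_depth=3, learning_rate=0.1)

    # Mode 1: grow 50 new trees on top of an existing model via `xgb_model`.
    bst = xgb.train(base, d1, num_boost_round=50)
    bst = xgb.train(base, d2, num_boost_round=50, xgb_model=bst)
    print(bst.num_boosted_rounds())  # 100

    # Mode 2: keep the 100 trees' structure, refresh their node statistics
    # and leaf values on the new data.
    refresh = dict(base, process_type='update', updater='refresh', refresh_leaf=True)
    bst = xgb.train(refresh, d2, num_boost_round=bst.num_boosted_rounds(), xgb_model=bst)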
    import xgboost as xgb
    

    Add new trees with the xgb_model parameter

    # specify parameters via map
    params = dict(
        booster = 'gbtree',
        objective = 'binary:logistic',
        eval_metric = 'auc',
        scale_pos_weight = 11,
        learning_rate = 0.015,
        max_depth = 8,
        subsample = 1.0,
        colsample_bytree = 0.35,
        reg_alpha = 65,
        reg_lambda = 15,
        seed = SEED,
        verbosity = 0
    )
    
    bst = None   # init model
    
    # Train 1200 iterations in total, with each chunk running for 100 iterations.
    def xgb_continue(X_train, y_train, X_test, y_test, i=None):
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dtest = xgb.DMatrix(X_test, label=y_test)
        global bst 
        bst = xgb.train(
            params,
            dtrain,
            num_boost_round = 100,
            xgb_model = bst,
            evals = [(dtrain, "train")],
            callbacks = [xgb.callback.EarlyStopping(20)],
            verbose_eval = 0
        )
        train_pred = bst.predict(dtrain)
        test_pred = bst.predict(dtest)
        return train_pred, test_pred, bst.num_boosted_rounds()
    
    minibatch_iterator = get_minibatch(minibatch_size = batch_size)
    auc_history = incremental_learning(minibatch_iterator, xgb_continue)
    
    test data shape: (75000, 158)
    100 train trees, valid's auc: 0.7279 in 1.58s
    200 train trees, valid's auc: 0.7374 in 3.06s
    300 train trees, valid's auc: 0.7414 in 4.60s
    400 train trees, valid's auc: 0.7455 in 6.19s
    500 train trees, valid's auc: 0.7477 in 7.83s
    600 train trees, valid's auc: 0.7486 in 9.55s
    700 train trees, valid's auc: 0.7504 in 11.32s
    800 train trees, valid's auc: 0.7505 in 13.14s
    900 train trees, valid's auc: 0.7499 in 15.03s
    1000 train trees, valid's auc: 0.7505 in 16.96s
    1100 train trees, valid's auc: 0.7514 in 18.95s
    1200 train trees, valid's auc: 0.7518 in 20.66s
    finally:
      train's auc: 0.7873
      valid's auc: 0.7518
    
    plot_score_evolution(auc_history, 'num trees')
    plt.show()
    

    [Figure: train/test AUC vs. number of trees]

    Refresh leaf nodes

    Update leaf values with the process_type parameter
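
    With process_type = 'update' and updater = 'refresh', XGBoost grows no new trees: it re-walks the existing trees on the new data and updates their node statistics, and with refresh_leaf = True the leaf values are updated as well. In the demo below, the first chunk builds the initial 1200 trees and every later chunk only refreshes them.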

    # specify parameters via map
    params = dict(
        process_type = 'default',   # Set `process_type` to `default` if you want to build new trees.
        booster = 'gbtree',
        objective = 'binary:logistic',
        eval_metric = 'auc',
        scale_pos_weight = 11,
        learning_rate = 0.015,
        max_depth = 8,
        subsample = 1.0,
        colsample_bytree = 0.35,
        reg_alpha = 65,
        reg_lambda = 15,
        seed = SEED,
        verbosity = 0
    )
    
    bst = None   # init model
    
    # The model will adapt to new data by changing leaf value (no change in split condition)
    def xgb_refresh(X_train, y_train, X_test, y_test, i=None):
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dtest = xgb.DMatrix(X_test, label=y_test)
        # update estimator with examples in the current mini-batch
        global bst 
        bst = xgb.train(
            params,
            dtrain,
            num_boost_round = num_rounds,
            xgb_model = bst,
            evals = [(dtrain, "train")],
            # callbacks = [xgb.callback.EarlyStopping(20)],
            verbose_eval = 0
        )
        train_pred = bst.predict(dtrain)
        test_pred = bst.predict(dtest)
    
        # After the first chunk (which builds the initial `num_rounds` trees),
        # switch to updating the existing trees in place on later chunks.
        if i == 0:
            params['process_type'] = "update"
            params["updater"] = "refresh"
            params["refresh_leaf"] = True  # refresh leaf values as well as tree statistics
        return train_pred, test_pred, None
    
    minibatch_iterator = get_minibatch(minibatch_size = batch_size)
    auc_history = incremental_learning(minibatch_iterator, xgb_refresh)
    
    test data shape: (75000, 158)
    20000 train samples, valid's auc: 0.7369 in 8.38s
    40000 train samples, valid's auc: 0.7419 in 15.76s
    60000 train samples, valid's auc: 0.7411 in 23.15s
    80000 train samples, valid's auc: 0.7428 in 30.53s
    100000 train samples, valid's auc: 0.7414 in 37.93s
    120000 train samples, valid's auc: 0.7422 in 45.48s
    140000 train samples, valid's auc: 0.7431 in 53.24s
    160000 train samples, valid's auc: 0.7413 in 60.97s
    180000 train samples, valid's auc: 0.7404 in 68.75s
    200000 train samples, valid's auc: 0.7416 in 76.57s
    220000 train samples, valid's auc: 0.7410 in 84.31s
    232511 train samples, valid's auc: 0.7394 in 90.01s
    finally:
      train's auc: 0.7557
      valid's auc: 0.7394
    
    plot_score_evolution(auc_history, 'training examples (#)')
    plt.show()
    

    [Figure: train/test AUC vs. number of training examples]

    LightGBM

    lightGBM 有两种方法控制增量学习模式:

    • If init_model is not None, training continues from the existing model, adding num_boost_round new trees;
    • The refit task keeps the tree structure of the existing model unchanged and refits the leaf values on the new data (see the sketch below).
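
    The demo below drives the second mode through the params dict (task = 'refit'). For reference, the Python API also exposes it directly as Booster.refit; a minimal sketch, where X_new and y_new are hypothetical new-batch arrays and gbm is an already-trained Booster:

    # Tree structure stays fixed; only leaf outputs are refit on the new data.
    refitted = gbm.refit(X_new, y_new, decay_rate=0.9)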

    Add new trees with the init_model parameter

    import lightgbm as lgb
    
    # specify parameters via map
    params = dict(
        boosting_type = 'gbdt',
        objective = 'binary',
        metric = 'auc',
        is_unbalance = True,
        learning_rate = 0.015,
        max_depth = 8,
        feature_fraction = 0.35,
        bagging_fraction = 1.0,
        lambda_l1 = 65,
        lambda_l2 = 15,
        subsample_freq = 5,
        random_state = SEED,
        verbosity = -1
    )
    
    gbm = None   # init model
    
    # Train 1200 iterations in total, with each chunk running for 100 iterations.
    def lgb_continue(X_train, y_train, X_test, y_test, i=None):
        dtrain = lgb.Dataset(X_train, label=y_train)
        dtest = lgb.Dataset(X_test, label=y_test)
        global gbm
        gbm = lgb.train(
            params,
            dtrain,
            num_boost_round = 100,
            init_model = gbm,
            valid_sets = [dtrain],
            callbacks = [lgb.early_stopping(stopping_rounds=20)],
            keep_training_booster=True
        )
        train_pred = gbm.predict(X_train)
        test_pred = gbm.predict(X_test)
        return train_pred, test_pred, gbm.num_trees()
    
    minibatch_iterator = get_minibatch(minibatch_size = batch_size)
    auc_history = incremental_learning(minibatch_iterator, lgb_continue)
    
    test data shape: (75000, 158)
    100 train trees, valid's auc: 0.7179 in 0.62s
    200 train trees, valid's auc: 0.7301 in 1.55s
    300 train trees, valid's auc: 0.7372 in 2.49s
    400 train trees, valid's auc: 0.7420 in 3.56s
    500 train trees, valid's auc: 0.7447 in 4.73s
    600 train trees, valid's auc: 0.7461 in 6.05s
    700 train trees, valid's auc: 0.7482 in 7.48s
    800 train trees, valid's auc: 0.7486 in 9.04s
    900 train trees, valid's auc: 0.7488 in 10.73s
    1000 train trees, valid's auc: 0.7494 in 12.52s
    1100 train trees, valid's auc: 0.7500 in 14.45s
    1200 train trees, valid's auc: 0.7506 in 16.18s
    finally:
      train's auc: 0.7768
      valid's auc: 0.7506
    

    Here keep_training_booster (bool) indicates whether the returned booster will be used for continued training (default False). When the model is very large and causes memory errors, try setting it to True to avoid the model_to_string conversion; the returned booster can then still be passed as init_model for future continued training.
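
    As an illustrative sketch of the round-trip that keep_training_booster=True avoids (assuming gbm is a trained Booster and dtrain a Dataset, as above):

    model_str = gbm.model_to_string()            # serialize the booster to text
    restored = lgb.Booster(model_str=model_str)  # rebuild a booster from that text
    gbm = lgb.train(params, dtrain, num_boost_round=100, init_model=restored)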

    plot_score_evolution(auc_history, 'num trees')
    plt.show()
    

    [Figure: train/test AUC vs. number of trees]

    Refresh leaf nodes

    # specify parameters via map
    params = dict(
        boosting_type = 'gbdt',
        objective = 'binary',
        metric = 'auc',
        is_unbalance = True,
        learning_rate = 0.015,
        max_depth = 8,
        feature_fraction = 0.35,
        bagging_fraction = 1.0,
        lambda_l1 = 65,
        lambda_l2 = 15,
        subsample_freq = 5,
        random_state = SEED,
        verbosity = -1
    )
    
    gbm = None   # init model
    
    # The model will adapt to new data by changing leaf value (no change in split condition)
    def lgb_refit(X_train, y_train, X_test, y_test, i=None):
        dtrain = lgb.Dataset(X_train, label=y_train)
        dtest = lgb.Dataset(X_test, label=y_test)
        # update estimator with examples in the current mini-batch
        global gbm
        gbm = lgb.train(
            params,
            dtrain,
            num_boost_round = num_rounds,
            init_model = gbm,
            valid_sets = [dtrain],
            keep_training_booster=True
        )
        train_pred = gbm.predict(X_train)
        test_pred = gbm.predict(X_test)    
        # After the first chunk (which builds the initial `num_rounds` trees),
        # switch to LightGBM's refit task so later chunks only refit leaf values.
        if i == 0:
            params['task'] = 'refit'
            params['refit_decay_rate'] = 0.9
        return train_pred, test_pred, None
    
    minibatch_iterator = get_minibatch(minibatch_size = batch_size)
    auc_history = incremental_learning(minibatch_iterator, lgb_refit)
    
    test data shape: (75000, 158)
    20000 train samples, valid's auc: 0.7373 in 4.96s
    40000 train samples, valid's auc: 0.7423 in 10.99s
    60000 train samples, valid's auc: 0.7423 in 18.07s
    80000 train samples, valid's auc: 0.7424 in 26.49s
    100000 train samples, valid's auc: 0.7438 in 36.72s
    120000 train samples, valid's auc: 0.7409 in 48.54s
    140000 train samples, valid's auc: 0.7419 in 63.20s
    160000 train samples, valid's auc: 0.7394 in 80.62s
    180000 train samples, valid's auc: 0.7363 in 101.09s
    200000 train samples, valid's auc: 0.7386 in 125.00s
    220000 train samples, valid's auc: 0.7360 in 153.26s
    232511 train samples, valid's auc: 0.7347 in 182.33s
    finally:
      train's auc: 0.8629
      valid's auc: 0.7347
    

    Here refit_decay_rate controls the decay rate of the leaf outputs in the refit task. After refitting, each leaf's output is computed as:
    leaf_output = refit_decay_rate * old_leaf_output + (1.0 - refit_decay_rate) * new_leaf_output
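
    A worked example with hypothetical numbers: the default decay rate of 0.9 moves each leaf only 10% of the way toward the freshly fitted value.

    refit_decay_rate = 0.9
    old_leaf_output, new_leaf_output = 0.50, 0.10
    leaf_output = refit_decay_rate * old_leaf_output + (1.0 - refit_decay_rate) * new_leaf_output
    print(leaf_output)  # 0.46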

    plot_score_evolution(auc_history, 'training examples (#)')
    plt.show()
    

    [Figure: train/test AUC vs. number of training examples]

  • Source: https://blog.csdn.net/qq_41518277/article/details/138048961