• 【机器学习】阿里云天池竞赛——工业蒸汽量预测(5)


    机器学习经典赛题:工业蒸汽量预测(5):模型验证(赛题实战)

    5.3 模型验证与调参实战

    5.3.1 模型过拟合与欠拟合

    1. 基础代码
      导入工具包,用于模型验证和数据处理。
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    import seaborn as sns
    from scipy import stats
    import warnings
    warnings.filterwarnings("ignore")
    
    from sklearn.linear_model import LinearRegression   #从sklearn引入线性模型
    from sklearn.neighbors import KNeighborsRegressor   #k近邻回归模型
    from sklearn.tree import DecisionTreeRegressor     #决策树回归模型
    from sklearn.ensemble import RandomForestRegressor    #随机森林回归模型
    from sklearn.svm import SVR    #支持向量机
    from lightgbm import LGBMRegressor   #LightGBM回归模型
    
    from sklearn.model_selection import train_test_split #切分数据
    from sklearn.metrics import mean_squared_error  #评价指标
    from sklearn.linear_model import SGDRegressor
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18

    读取数据:

    # Load the raw training and test sets (tab-separated text files).
    train_data_file = '../data/zhengqi_train.txt'
    test_data_file = '../data/zhengqi_test.txt'
    train_data = pd.read_csv(train_data_file, sep='\t', encoding='utf-8')
    test_data = pd.read_csv(test_data_file, sep='\t', encoding='utf-8')
    
    • 1
    • 2
    • 3
    • 4
    • 5

    对数据进行归一化处理:

    from sklearn import preprocessing
    
    # Min-max scale every feature column to [0, 1].  The scaler is fitted on
    # the training features only and then applied to both splits, so the test
    # set never influences the scaling parameters.
    features_columns = [col for col in train_data.columns if col != 'target']
    min_max_scaler = preprocessing.MinMaxScaler().fit(train_data[features_columns])
    
    train_data_scaler = pd.DataFrame(
        min_max_scaler.transform(train_data[features_columns]),
        columns=features_columns)
    test_data_scaler = pd.DataFrame(
        min_max_scaler.transform(test_data[features_columns]),
        columns=features_columns)
    # Re-attach the regression target to the scaled training frame.
    train_data_scaler['target'] = train_data['target']
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16

    使用PCA进行特征降维:

    # Dimensionality reduction with PCA.
    from sklearn.decomposition import PCA
    
    # Project the scaled features onto the 16 leading principal components;
    # the projection is learned on the training split and reused on the test split.
    pca = PCA(n_components=16)
    train_features = train_data_scaler.iloc[:, 0:-1]
    new_train_pca_16 = pd.DataFrame(pca.fit_transform(train_features))
    new_test_pca_16 = pd.DataFrame(pca.transform(test_data_scaler))
    # Carry the target along with the reduced training features.
    new_train_pca_16['target'] = train_data_scaler['target']
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10

    切分数据集,分为训练集和验证集:

    # Keep the 16 PCA features and split the data.
    new_train_pca_16 = new_train_pca_16.fillna(0)
    # BUG FIX: the original used `new_train_pca_16[new_train_pca_16.columns]`,
    # which keeps the 'target' column inside the feature matrix and leaks the
    # label into the inputs (the near-zero MSEs reported later stem from this).
    train = new_train_pca_16.drop(columns=['target'])
    target = new_train_pca_16['target']
    
    # Split: 80% training set, 20% validation set.
    train_data, test_data, train_target, test_target = train_test_split(
        train, target, test_size=0.2, random_state=0)
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    1. 欠拟合
      模型欠拟合的情况:
    # Under-fitting demo: an SGD linear model stopped early (few iterations,
    # loose tolerance) so it cannot fit the training data closely.
    clf = SGDRegressor(max_iter=500, tol=1e-2)
    clf.fit(train_data, train_target)
    score_train = mean_squared_error(train_target, clf.predict(train_data))
    score_test = mean_squared_error(test_target, clf.predict(test_data))
    print("SGDRegressor train MSE:", score_train)
    print("SGDRegressor test MSE:", score_test)
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7

    输出结果:

    SGDRegressor train MSE: 0.0009160671723394329
    SGDRegressor test MSE: 0.000981634366517726
    
    • 1
    • 2
    1. 过拟合
      模型过拟合的情况:
    from sklearn.preprocessing import PolynomialFeatures
    
    # Over-fitting demo: a degree-5 polynomial expansion of the features gives
    # the linear model far too much capacity.
    poly = PolynomialFeatures(5)
    train_data_poly = poly.fit_transform(train_data)
    # FIX: use transform (not fit_transform) on the validation split so the
    # expansion fitted on the training data is reused, not re-fitted.
    test_data_poly = poly.transform(test_data)
    clf = SGDRegressor(max_iter=1000, tol=1e-3)
    clf.fit(train_data_poly, train_target)
    score_train = mean_squared_error(train_target, clf.predict(train_data_poly))
    score_test = mean_squared_error(test_target, clf.predict(test_data_poly))
    print("SGDRegressor train MSE:", score_train)
    print("SGDRegressor test MSE:", score_test)
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11

    输出结果:

    SGDRegressor train MSE: 1.0898632608641039e+24
    SGDRegressor test MSE: 1.6982104334110243e+24
    
    • 1
    • 2
    1. 正常拟合
      模型正常拟合的情况:
    from sklearn.preprocessing import PolynomialFeatures
    
    # Normal-fit demo: a degree-3 polynomial expansion balances bias and variance.
    poly = PolynomialFeatures(3)
    train_data_poly = poly.fit_transform(train_data)
    # FIX: transform (not fit_transform) the validation split with the
    # training-fitted expansion.
    test_data_poly = poly.transform(test_data)
    clf = SGDRegressor(max_iter=1000, tol=1e-3)
    clf.fit(train_data_poly, train_target)
    score_train = mean_squared_error(train_target, clf.predict(train_data_poly))
    score_test = mean_squared_error(test_target, clf.predict(test_data_poly))
    print("SGDRegressor train MSE:", score_train)
    print("SGDRegressor test MSE:", score_test)
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11

    输出结果:

    SGDRegressor train MSE: 0.001055038113307342
    SGDRegressor test MSE: 0.001233821154013793
    
    • 1
    • 2

    5.3.2 模型正则化

    1. L2范数正则化
      采用L2范数正则化处理模型:
    from sklearn.preprocessing import PolynomialFeatures
    
    # L2 (ridge-style) regularization on the degree-3 polynomial model.
    poly = PolynomialFeatures(3)
    train_data_poly = poly.fit_transform(train_data)
    # FIX: transform (not fit_transform) the validation split.
    test_data_poly = poly.transform(test_data)
    # FIX: sklearn requires the lowercase penalty name 'l2'; 'L2' raises a
    # ValueError in current releases.
    clf = SGDRegressor(max_iter=1000, tol=1e-3, penalty='l2', alpha=0.0001)
    clf.fit(train_data_poly, train_target)
    score_train = mean_squared_error(train_target, clf.predict(train_data_poly))
    score_test = mean_squared_error(test_target, clf.predict(test_data_poly))
    print("SGDRegressor train MSE:", score_train)
    print("SGDRegressor test MSE:", score_test)
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11

    输出结果:

    SGDRegressor train MSE: 0.0010417279350570649
    SGDRegressor test MSE: 0.0012885335267450496
    
    • 1
    • 2
    1. L1范数正则化
      使用L1范数正则化处理模型
    from sklearn.preprocessing import PolynomialFeatures
    
    # L1 (lasso-style) regularization on the degree-3 polynomial model.
    poly = PolynomialFeatures(3)
    train_data_poly = poly.fit_transform(train_data)
    # FIX: transform (not fit_transform) the validation split.
    test_data_poly = poly.transform(test_data)
    # FIX: sklearn requires the lowercase penalty name 'l1'; 'L1' raises a
    # ValueError in current releases.
    clf = SGDRegressor(max_iter=1000, tol=1e-3, penalty='l1', alpha=0.0001)
    clf.fit(train_data_poly, train_target)
    score_train = mean_squared_error(train_target, clf.predict(train_data_poly))
    score_test = mean_squared_error(test_target, clf.predict(test_data_poly))
    print("SGDRegressor train MSE:", score_train)
    print("SGDRegressor test MSE:", score_test)
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    1. ElasticNet联合L1和L2范数加权正则化
      使用ElasticNet正则化处理模型:
    from sklearn.preprocessing import PolynomialFeatures
    
    # ElasticNet regularization: weighted mix of L1 (ratio 0.9) and L2 penalties.
    poly = PolynomialFeatures(3)
    train_data_poly = poly.fit_transform(train_data)
    # FIX: transform (not fit_transform) the validation split.
    test_data_poly = poly.transform(test_data)
    clf = SGDRegressor(max_iter=1000, tol=1e-3, penalty='elasticnet', l1_ratio=0.9, alpha=.0001)
    clf.fit(train_data_poly, train_target)
    score_train = mean_squared_error(train_target, clf.predict(train_data_poly))
    score_test = mean_squared_error(test_target, clf.predict(test_data_poly))
    print("SGDRegressor train MSE:", score_train)
    print("SGDRegressor test MSE:", score_test)
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11

    输出结果:

    SGDRegressor train MSE: 0.0009336657359501396
    SGDRegressor test MSE: 0.0011472413880708912
    
    • 1
    • 2

    5.3.3 模型交叉验证

    1. 简单交叉验证
      使用简单交叉验证方法对模型进行交叉验证并切分数据集,其中训练数据为80%,验证数据为20%。
    # Hold-out ("simple") cross-validation: 80% training set, 20% validation set.
    train_data, test_data, train_target, test_target = train_test_split(train, target,test_size=0.2, random_state=0)
    
    clf = SGDRegressor(max_iter=1000, tol=1e-3)
    clf.fit(train_data, train_target)
    score_train = mean_squared_error(train_target, clf.predict(train_data))
    score_test = mean_squared_error(test_target, clf.predict(test_data))
    print("SGDRegressor train MSE:", score_train)
    print("SGDRegressor test MSE:", score_test)
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9

    输出结果:

    SGDRegressor train MSE: 0.0008805332436144762
    SGDRegressor test MSE: 0.0009742019879530982
    
    • 1
    • 2
    1. K折交叉验证
      使用K折交叉验证方法对模型进行交叉验证:
    from sklearn.model_selection import KFold
    
    # 5-fold cross-validation: fit and score a fresh SGD model on every fold.
    kf = KFold(n_splits=5)
    for k, (train_index, test_index) in enumerate(kf.split(train)):
        train_data = train.values[train_index]
        test_data = train.values[test_index]
        train_target = target[train_index]
        test_target = target[test_index]
        clf = SGDRegressor(max_iter=1000, tol=1e-3)
        clf.fit(train_data, train_target)
        score_train = mean_squared_error(train_target, clf.predict(train_data))
        score_test = mean_squared_error(test_target, clf.predict(test_data))
        print(k, "折", "SGDRegressor train MSE:", score_train)
        print(k, "折", "SGDRegressor test MSE:", score_test, "\n")
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12

    运行结果:

    0 折 SGDRegressor train MSE: 0.0009220591043969376
    0 折 SGDRegressor test MSE: 0.0005310778285509602 
    
    1 折 SGDRegressor train MSE: 0.0008800234249554588
    1 折 SGDRegressor test MSE: 0.001259968578517734 
    
    2 折 SGDRegressor train MSE: 0.0008211966326584619
    2 折 SGDRegressor test MSE: 0.0009107924815045727 
    
    3 折 SGDRegressor train MSE: 0.0009456226800574005
    3 折 SGDRegressor test MSE: 0.0009853654948970164 
    
    4 折 SGDRegressor train MSE: 0.0009055864978733344
    4 折 SGDRegressor test MSE: 0.0011048017616418345 
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    1. 留一法交叉验证
      使用留一法交叉验证对模型进行交叉验证:
    from sklearn.model_selection import LeaveOneOut
    
    # Leave-one-out cross-validation.  Only the first 10 rounds are executed
    # (the `break` below) because full LOO would fit one model per sample.
    # FIX: dropped the unused local `num = 100`.
    loo = LeaveOneOut()
    for k, (train_index, test_index) in enumerate(loo.split(train)):
        train_data, test_data, train_target, test_target = train.values[train_index], train.values[test_index], target[
            train_index], target[test_index]
        clf = SGDRegressor(max_iter=1000, tol=1e-3)
        clf.fit(train_data, train_target)
        score_train = mean_squared_error(train_target, clf.predict(train_data))
        score_test = mean_squared_error(test_target, clf.predict(test_data))
        print(k, "个", "SGDRegressor train MSE:", score_train)
        print(k, "个", "SGDRegressor test MSE:", score_test, "\n")
        if k >= 9:
            break
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15

    运行结果:

    0 个 SGDRegressor train MSE: 0.0007447292899787238
    0 个 SGDRegressor test MSE: 5.2599252754080336e-05 
    ...
    9 个 SGDRegressor train MSE: 0.0007887842650802939
    9 个 SGDRegressor test MSE: 0.0005482147049830396 
    
    • 1
    • 2
    • 3
    • 4
    • 5
    1. 留P法交叉验证
      使用留P法交叉验证对模型进行交叉验证:
    from sklearn.model_selection import LeavePOut
    
    # Leave-P-out cross-validation with p=10 held-out samples per round.
    # Only the first 10 rounds are executed (the `break` below); the number of
    # LPO combinations is combinatorially large.
    # FIX: dropped the unused local `num = 100`.
    lpo = LeavePOut(p=10)
    for k, (train_index, test_index) in enumerate(lpo.split(train)):
        train_data, test_data, train_target, test_target = train.values[train_index], train.values[test_index], target[
            train_index], target[test_index]
        clf = SGDRegressor(max_iter=1000, tol=1e-3)
        clf.fit(train_data, train_target)
        score_train = mean_squared_error(train_target, clf.predict(train_data))
        score_test = mean_squared_error(test_target, clf.predict(test_data))
        print(k, "10个", "SGDRegressor train MSE:", score_train)
        print(k, "10个", "SGDRegressor test MSE:", score_test, "\n")
        if k >= 9:
            break
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15

    运行结果:

    0 10个 SGDRegressor train MSE: 0.0007482296929583594
    0 10个 SGDRegressor test MSE: 0.00029421354601069736 
    ...
    9 10个 SGDRegressor train MSE: 0.0008062741989626481
    9 10个 SGDRegressor test MSE: 0.00023679186419471267 
    
    • 1
    • 2
    • 3
    • 4
    • 5

    5.3.4 模型超参空间及调参

    1. 穷举网格搜索
      使用数据训练随机森林模型,采用网格搜索方法调参:
    # Train a random forest and tune it with exhaustive grid search.
    from sklearn.model_selection import GridSearchCV
    from sklearn.ensemble import RandomForestRegressor
    
    # FIX: the original rebound the name `RandomForestRegressor` to an
    # instance, shadowing the class; use a distinct variable name instead.
    rf = RandomForestRegressor()
    parameters = {'n_estimators': [50, 100, 200], 'max_depth': [1, 2, 3]}
    clf = GridSearchCV(rf, parameters, cv=5)
    clf.fit(train_data, train_target)
    score_test = mean_squared_error(test_target, clf.predict(test_data))
    print("RandomForestRegressor GridSearchCV test MSE:", score_test)
    print(sorted(clf.cv_results_.keys()))  # fit times and validation metrics per candidate
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11

    运行结果:

    RandomForestRegressor GridSearchCV test MSE: 0.012637546435789914
    ['mean_fit_time', 'mean_score_time', 'mean_test_score', 'param_max_depth', 'param_n_estimators', 'params', 'rank_test_score', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'std_fit_time', 'std_score_time', 'std_test_score']
    
    • 1
    • 2
    1. 随机参数优化
      使用数据训练随机森林模型,采用随机参数优化方法调参:
    # Train a random forest and tune it with randomized parameter search.
    from sklearn.model_selection import RandomizedSearchCV
    from sklearn.ensemble import RandomForestRegressor
    
    # FIX: avoid shadowing the RandomForestRegressor class with an instance.
    rf = RandomForestRegressor()
    parameters = {
        'n_estimators': [50, 100, 200, 300],
        'max_depth': [1, 2, 3, 4, 5]
    }
    clf = RandomizedSearchCV(rf, parameters, cv=5)
    clf.fit(train_data, train_target)
    print('Best parameters found are:', clf.best_params_)
    score_test = mean_squared_error(test_target, clf.predict(test_data))
    print("RandomForestRegressor RandomizedSearchCV GridSearchCV test MSE:", score_test)
    print(sorted(clf.cv_results_.keys()))  # fit times and validation metrics per candidate
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15

    运行结果:

    Best parameters found are: {'n_estimators': 300, 'max_depth': 5}
    RandomForestRegressor RandomizedSearchCV GridSearchCV test MSE: 8.52446793094484e-05
    ['mean_fit_time', 'mean_score_time', 'mean_test_score', 'param_max_depth', 'param_n_estimators', 'params', 'rank_test_score', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'std_fit_time', 'std_score_time', 'std_test_score']
    
    • 1
    • 2
    • 3
    1. LGB调参
      使用数据训练LGB模型,采用网格搜索方法调参:
    import lightgbm as lgb
    from sklearn.model_selection import GridSearchCV
    
    # Tune an LGBM regressor with grid search.
    # FIX: the original defined `parameters` but never used it — it fitted the
    # bare model, so no tuning happened despite the section's title.
    lgb_model = lgb.LGBMRegressor(num_leaves=31)
    parameters = {'learning_rate': [0.01, 0.1, 1], 'n_estimators': [20, 40]}
    clf = GridSearchCV(lgb_model, parameters, cv=5)
    clf.fit(train_data, train_target)
    
    score_test = mean_squared_error(test_target, clf.predict(test_data))
    print("LGBMRegressor GridSearchCV test MSE:", score_test)
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    1. LGB线下验证
      下面给出对数据建模、5折交叉验证、划分数据、对LGB模型进行训练、计算MSE评价性能等流程的代码:
    # Load the raw data again (independent of the PCA pipeline above).
    train_data2 = pd.read_csv('../data/zhengqi_train.txt', sep='\t')
    test_data2 = pd.read_csv('../data/zhengqi_test.txt', sep='\t')
    
    # Feature matrix: the columns shared with the test file; label: 'target'.
    train_data2_f = train_data2[test_data2.columns].values
    train_data2_target = train_data2['target'].values
    
    # LightGBM model with 5-fold offline validation.
    from sklearn.model_selection import KFold
    import lightgbm as lgb
    import numpy as np
    
    # 5-fold cross-validation (shuffled, fixed seed for reproducibility).
    Folds = 5
    kf = KFold(n_splits=Folds, random_state=2022, shuffle=True)
    # Accumulates the per-fold train/validation MSE values.
    MSE_DICT = {
        'train_mse': [],
        'test_mse': []
    }
    
    # Offline training and prediction, one model per fold.
    for i, (train_index, test_index) in enumerate(kf.split(train_data2_f)):
        # Fresh gradient-boosted tree regressor for this fold.
        lgb_reg = lgb.LGBMRegressor(
            learning_rate=0.01,
            max_depth=-1,
            n_estimators=5000,
            boosting_type='gbdt',
            random_state=2022,
            objective='regression',
        )
    
        # Slice this fold's training and validation partitions.
        X_train_KFold, X_test_KFold = train_data2_f[train_index], train_data2_f[test_index]
        y_train_KFold, y_test_KFold = train_data2_target[train_index], train_data2_target[test_index]
    
        # Fit with early stopping monitored on both partitions.
        # NOTE(review): `early_stopping_rounds` and `verbose` as fit() kwargs
        # were removed in lightgbm >= 4; newer versions need
        # callbacks=[lgb.early_stopping(100), lgb.log_evaluation(50)] — confirm
        # the installed lightgbm version.
        #     reg.fit(X_train_KFold, y_train_KFold)
        lgb_reg.fit(
            X=X_train_KFold, y=y_train_KFold,
            eval_set=[(X_train_KFold, y_train_KFold), (X_test_KFold, y_test_KFold)],
            eval_names=['Train', 'Test'],
            early_stopping_rounds=100,
            eval_metric='MSE',
            verbose=50
        )
    
        # Predict on both partitions using the best early-stopped iteration.
        y_train_KFold_predict = lgb_reg.predict(X_train_KFold, num_iteration=lgb_reg.best_iteration_)
        y_test_KFold_predict = lgb_reg.predict(X_test_KFold, num_iteration=lgb_reg.best_iteration_)
    
        print('第{}折 训练和预测 训练MSE 预测MSE'.format(i + 1))
        train_mse = mean_squared_error(y_train_KFold_predict, y_train_KFold)
        print('------\n', '训练MSE\n', train_mse, '\n------')
        test_mse = mean_squared_error(y_test_KFold_predict, y_test_KFold)
        print('------\n', '预测MSE\n', test_mse, '\n------\n')
    
        MSE_DICT['train_mse'].append(train_mse)
        MSE_DICT['test_mse'].append(test_mse)
    # Report the per-fold MSE lists and their means.
    print('------\n', '训练MSE\n', MSE_DICT['train_mse'], '\n', np.mean(MSE_DICT['train_mse']), '\n------')
    print('------\n', '预测MSE\n', MSE_DICT['test_mse'], '\n', np.mean(MSE_DICT['test_mse']), '\n------')
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29
    • 30
    • 31
    • 32
    • 33
    • 34
    • 35
    • 36
    • 37
    • 38
    • 39
    • 40
    • 41
    • 42
    • 43
    • 44
    • 45
    • 46
    • 47
    • 48
    • 49
    • 50
    • 51
    • 52
    • 53
    • 54
    • 55
    • 56
    • 57
    • 58
    • 59
    • 60
    • 61
    • 62

    5.3.5 学习曲线和验证曲线

    1. 学习曲线
      绘制数据的学习曲线,使用模型SGDRegressor
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    from sklearn.model_selection import ShuffleSplit
    from sklearn.linear_model import SGDRegressor
    from sklearn.model_selection import learning_curve
    
    train_data_file = "../data/zhengqi_train.txt"
    test_data_file = "../data/zhengqi_test.txt"
    train_data = pd.read_csv(train_data_file, sep='\t', encoding='utf-8')
    test_data = pd.read_csv(test_data_file, sep='\t', encoding='utf-8')
    
    # FIX: removed the stray `plt.figure(figsize=(18, 10), dpi=150)` that was
    # immediately superseded by the plt.figure() call inside the function,
    # leaving an empty extra figure.
    
    
    def plot_learning_curve(estimator, title, x, y, ylim=None, cv=None, n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
        """Plot mean +/- std training and cross-validation scores of `estimator`
        over increasing training-set sizes, and return the pyplot module."""
        plt.figure()
        plt.title(title)
        if ylim is not None:
            plt.ylim(*ylim)
        plt.xlabel("Training examples")
        plt.ylabel("Score")
        train_sizes, train_scores, test_scores = learning_curve(estimator, x, y, cv=cv, n_jobs=n_jobs,
                                                                train_sizes=train_sizes)
        train_scores_mean = np.mean(train_scores, axis=1)
        train_scores_std = np.std(train_scores, axis=1)
        test_scores_mean = np.mean(test_scores, axis=1)
        test_scores_std = np.std(test_scores, axis=1)
        plt.grid()
        # Shaded one-standard-deviation bands around each mean curve.
        plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1,
                         color='r')
        plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.1,
                         color='g')
        plt.plot(train_sizes, train_scores_mean, 'o-', color='r', label="training score")
        # FIX: label typo "corss-validation" -> "cross-validation".
        plt.plot(train_sizes, test_scores_mean, 'o-', color='g', label="cross-validation score")
        plt.legend(loc="best")
        return plt
    
    
    x = train_data[test_data.columns].values
    y = train_data['target'].values
    # FIX: the title said "LinearRegression" but the estimator is SGDRegressor.
    title = "SGDRegressor"
    cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
    estimator = SGDRegressor()
    plot_learning_curve(estimator, title, x, y, ylim=(0.7, 1.01), cv=cv, n_jobs=-1).show()
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29
    • 30
    • 31
    • 32
    • 33
    • 34
    • 35
    • 36
    • 37
    • 38
    • 39
    • 40
    • 41
    • 42
    • 43
    • 44
    • 45

    学习曲线
    2. 验证曲线

    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    from sklearn.linear_model import SGDRegressor
    from sklearn.model_selection import validation_curve
    
    train_data_file = "../data/zhengqi_train.txt"
    test_data_file = "../data/zhengqi_test.txt"
    train_data = pd.read_csv(train_data_file, sep='\t', encoding='utf-8')
    test_data = pd.read_csv(test_data_file, sep='\t', encoding='utf-8')
    x = train_data[test_data.columns].values
    y = train_data['target'].values
    
    # Sweep the regularization strength over six orders of magnitude and plot
    # 10-fold cross-validated R^2 for each alpha.
    param_range = [0.1, 0.01, 0.001, 0.0001, 0.00001, 0.000001]
    # FIX: sklearn requires the lowercase penalty name 'l1' ('L1' raises).
    train_scores, test_scores = validation_curve(SGDRegressor(max_iter=1000, tol=1e-3, penalty='l1')
                                                 , x, y, param_name='alpha', param_range=param_range,
                                                 cv=10, scoring='r2', n_jobs=1)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    # FIX: typo "SCDRegressor" -> "SGDRegressor".
    plt.title("Validation Curve with SGDRegressor")
    plt.xlabel("alpha")
    plt.ylabel("Score")
    plt.ylim(0.0, 1.1)
    plt.semilogx(param_range, train_scores_mean, label="Training scores", color='r')
    plt.fill_between(param_range, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.2,
                     color='r')
    # FIX: label typo "Sross_validation" -> "Cross-validation".
    plt.semilogx(param_range, test_scores_mean, label="Cross-validation score", color='g')
    # FIX: the original shaded the TRAINING band again here; band the
    # cross-validation scores instead.
    plt.fill_between(param_range, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.2,
                     color='g')
    plt.legend(loc="best")
    plt.show()
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29
    • 30
    • 31
    • 32
    • 33

    验证曲线

    参考资料

    [1] 《阿里云天池大赛赛题解析——机器学习篇》

  • 相关阅读:
    正点原子嵌入式linux驱动开发——pinctrl和gpio子系统
    MinGW-w64下载文件失败the file has been downloaded incorrectly!
    springboot 拦截器与文件上传
    网络游戏协议:基于Protobuf的序列化与反序列化
    Android连载43-Netd相关学习笔记
    MySQL 数据库常用操作语句的总结
    Jenkins更换主目录
    VBA入门2——程序结构
    【SpringCloud-学习笔记】http客户端Feign
    ChatGLM2-6B微调实践
  • 原文地址:https://blog.csdn.net/ARPOSPF/article/details/127339499