• Datawhale-新能源时间序列赛事学习笔记(1)


    1.赛题描述

            在电动汽车充电站运营管理中,准确预测充电站的电量需求对于提高充电站运营服务水平和优化区域电网供给能力非常关键。本次赛题旨在建立站点充电量预测模型,根据充电站的相关信息和历史电量数据,准确预测未来某段时间内充电站的充电量需求。赛题数据中提供了电动汽车充电站的场站编号、位置信息、历史电量等基本信息。参赛者需要基于这些数据,利用人工智能相关技术,建立预测模型来预测未来一段时间内的需求电量,帮助管理者提高充电站的运营效益和服务水平,促进电动汽车行业的整体发展。

    2.赛题任务

            根据赛题提供的电动汽车充电站多维度脱敏数据,构造合理特征及算法模型,预估站点未来一周每日的充电量。(以天为单位)

    3.赛题数据集

            本赛题提供的数据集包含三张数据表。其中,power_forecast_history.csv 为站点运营数据,power.csv为站点充电量数据,stub_info.csv为站点静态数据,训练集为历史一年的数据,测试集为未来一周的数据。

    数据集清单与格式说明:

    注:

    (1)h3编码是一种用于分层地理编码的系统,可以将地球划分为不同的六边形网格。选手可以尝试使用 h3 编码来构造与地理位置相关的额外特征。

    (2)脱敏字段,不提供字段业务描述,供选手自由探索。

    4.评估指标-RMSE

    设 $y_{i}$ 为第 $i$ 个数据的真实值,$y^{*}_{i}$ 为第 $i$ 个数据的预测值,$n$ 为样本总数,则 $\mathrm{RMSE}=\sqrt{\dfrac{1}{n}\sum_{i=1}^{n}\left(y^{*}_{i}-y_{i}\right)^{2}}$。

    5.Baseline

    5.1 导入库

    1. import numpy as np
    2. import pandas as pd
    3. import lightgbm as lgb
    4. import xgboost as xgb
    5. from catboost import CatBoostRegressor
    6. from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold
    7. from sklearn.metrics import mean_squared_error, mean_absolute_error
    8. import matplotlib.pyplot as plt
    9. import tqdm
    10. import sys
    11. import os
    12. import gc
    13. import argparse
    14. import warnings
    15. warnings.filterwarnings('ignore')

    5.2 数据准备与探索

    1. # 读取数据
    2. train_power_forecast_history = pd.read_csv(r'F:\Jupyter Files\比赛\新能源赛道初赛数据集\初赛1008\训练集\power_forecast_history.csv')
    3. train_power = pd.read_csv(r'F:\Jupyter Files\比赛\新能源赛道初赛数据集\初赛1008\训练集\power.csv')
    4. train_stub_info = pd.read_csv('F:\Jupyter Files\比赛\新能源赛道初赛数据集\初赛1008\训练集\stub_info.csv')
    5. test_power_forecast_history = pd.read_csv(r'F:\Jupyter Files\比赛\新能源赛道初赛数据集\初赛1008\测试集\power_forecast_history.csv')
    6. test_stub_info = pd.read_csv(r'F:\Jupyter Files\比赛\新能源赛道初赛数据集\初赛1008\测试集\stub_info.csv')
    7. # 聚合数据(按日期+场站编码分组后取每一组第一条数据)
    8. train_df = train_power_forecast_history.groupby(['id_encode','ds']).head(1)
    9. del train_df['hour']
    10. test_df = test_power_forecast_history.groupby(['id_encode','ds']).head(1)
    11. del test_df['hour']
    12. tmp_df = train_power.groupby(['id_encode','ds'])['power'].sum()
    13. tmp_df.columns = ['id_encode','ds','power']
    14. # 合并充电量数据
    15. train_df = train_df.merge(tmp_df, on=['id_encode','ds'], how='left')
    16. ### 合并数据
    17. train_df = train_df.merge(train_stub_info, on='id_encode', how='left')
    18. test_df = test_df.merge(test_stub_info, on='id_encode', how='left')
    1. # 定义要绘制的列
    2. cols = ['power']
    3. # 遍历id_encode的五个值
    4. for ie in [0,1,2,3,4]:
    5. # 获取train_df中id_encode为当前值ie的所有行,并重置索引
    6. tmp_df = train_df[train_df['id_encode']==ie].reset_index(drop=True)
    7. # 再次重置索引,并为新索引添加一个名为'index'的列
    8. tmp_df = tmp_df.reset_index(drop=True).reset_index()
    9. # 遍历要绘制的列
    10. for num, col in enumerate(cols):
    11. # 设置图的大小
    12. plt.figure(figsize=(20,10))
    13. # 创建子图,总共有41列,当前为第num+1个子图
    14. plt.subplot(4,1,num+1)
    15. # 绘制图形:x轴为'index',y轴为当前列的值
    16. plt.plot(tmp_df['index'],tmp_df[col])
    17. # 为当前子图设置标题,标题为当前列的名称
    18. plt.title(col)
    19. # 显示图形
    20. plt.show()
    21. # 创建一个新的图,大小为20x5
    22. plt.figure(figsize=(20,5))

    5.3 特征工程

    1. train_df['flag'] = train_df['flag'].map({'A':0,'B':1})
    2. test_df['flag'] = test_df['flag'].map({'A':0,'B':1})
    3. def get_time_feature(df, col):
    4. df_copy = df.copy()
    5. prefix = col + "_"
    6. df_copy['new_'+col] = df_copy[col].astype(str)
    7. col = 'new_'+col
    8. df_copy[col] = pd.to_datetime(df_copy[col], format='%Y%m%d')
    9. df_copy[prefix + 'year'] = df_copy[col].dt.year
    10. df_copy[prefix + 'month'] = df_copy[col].dt.month
    11. df_copy[prefix + 'day'] = df_copy[col].dt.day
    12. # df_copy[prefix + 'weekofyear'] = df_copy[col].dt.weekofyear
    13. df_copy[prefix + 'dayofweek'] = df_copy[col].dt.dayofweek
    14. df_copy[prefix + 'is_wknd'] = df_copy[col].dt.dayofweek // 6
    15. df_copy[prefix + 'quarter'] = df_copy[col].dt.quarter
    16. df_copy[prefix + 'is_month_start'] = df_copy[col].dt.is_month_start.astype(int)
    17. df_copy[prefix + 'is_month_end'] = df_copy[col].dt.is_month_end.astype(int)
    18. del df_copy[col]
    19. return df_copy
    20. train_df = get_time_feature(train_df, 'ds')
    21. test_df = get_time_feature(test_df, 'ds')
    22. cols = [f for f in test_df.columns if f not in ['ds','power','h3']]

    5.4 模型训练与验证

    1. def cv_model(clf, train_x, train_y, test_x, clf_name, seed = 2023):
    2. '''
    3. clf:调用模型
    4. train_x:训练数据
    5. train_y:训练数据对应标签
    6. test_x:测试数据
    7. clf_name:选择使用模型名
    8. seed:随机种子
    9. '''
    10. folds = 5
    11. kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
    12. oof = np.zeros(train_x.shape[0])
    13. test_predict = np.zeros(test_x.shape[0])
    14. cv_scores = []
    15. for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
    16. print('************************************ {} ************************************'.format(str(i+1)))
    17. trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y[train_index], train_x.iloc[valid_index], train_y[valid_index]
    18. if clf_name == "lgb":
    19. train_matrix = clf.Dataset(trn_x, label=trn_y)
    20. valid_matrix = clf.Dataset(val_x, label=val_y)
    21. params = {
    22. 'boosting_type': 'gbdt',
    23. 'objective': 'regression',
    24. 'metric': 'mae',
    25. 'min_child_weight': 6,
    26. 'num_leaves': 2 ** 6,
    27. 'lambda_l2': 10,
    28. 'feature_fraction': 0.8,
    29. 'bagging_fraction': 0.8,
    30. 'bagging_freq': 4,
    31. 'learning_rate': 0.1,
    32. 'seed': 2023,
    33. 'nthread' : 16,
    34. 'verbose' : -1,
    35. }
    36. model = clf.train(params, train_matrix, 2000, valid_sets=[train_matrix, valid_matrix],
    37. categorical_feature=[], verbose_eval=200, early_stopping_rounds=100)
    38. val_pred = model.predict(val_x, num_iteration=model.best_iteration)
    39. test_pred = model.predict(test_x, num_iteration=model.best_iteration)
    40. if clf_name == "xgb":
    41. xgb_params = {
    42. 'booster': 'gbtree',
    43. 'objective': 'reg:squarederror',
    44. 'eval_metric': 'mae',
    45. 'max_depth': 5,
    46. 'lambda': 10,
    47. 'subsample': 0.7,
    48. 'colsample_bytree': 0.7,
    49. 'colsample_bylevel': 0.7,
    50. 'eta': 0.1,
    51. 'tree_method': 'hist',
    52. 'seed': 520,
    53. 'nthread': 16
    54. }
    55. train_matrix = clf.DMatrix(trn_x , label=trn_y)
    56. valid_matrix = clf.DMatrix(val_x , label=val_y)
    57. test_matrix = clf.DMatrix(test_x)
    58. watchlist = [(train_matrix, 'train'),(valid_matrix, 'eval')]
    59. model = clf.train(xgb_params, train_matrix, num_boost_round=2000, evals=watchlist, verbose_eval=200, early_stopping_rounds=100)
    60. val_pred = model.predict(valid_matrix)
    61. test_pred = model.predict(test_matrix)
    62. if clf_name == "cat":
    63. params = {'learning_rate': 0.1, 'depth': 5, 'bootstrap_type':'Bernoulli','random_seed':2023,
    64. 'od_type': 'Iter', 'od_wait': 100, 'random_seed': 11, 'allow_writing_files': False}
    65. model = clf(iterations=2000, **params)
    66. model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
    67. metric_period=200,
    68. use_best_model=True,
    69. cat_features=[],
    70. verbose=1)
    71. val_pred = model.predict(val_x)
    72. test_pred = model.predict(test_x)
    73. oof[valid_index] = val_pred
    74. test_predict += test_pred / kf.n_splits
    75. score = mean_absolute_error(val_y, val_pred)
    76. cv_scores.append(score)
    77. print(cv_scores)
    78. return oof, test_predict

    ◒LightGBM

    1. lgb_oof, lgb_test = cv_model(lgb, train_df[cols], train_df['power'], test_df[cols])
    2. ----------------------------------------------------------------------------------
    3. #交叉验证分数
    4. [266.7260370569527, 269.9232639345857, 265.154677843001, 265.21192193943574, 266.49163591068003]
    1. test_df['power'] = lgb_test
    2. test_df['power'] = test_df['power'].apply(lambda x: 0 if x<0 else x)
    3. test_df[['id_encode','ds','power']].to_csv(r'F:\Jupyter Files\比赛\新能源赛道初赛数据集\初赛1008\result.csv', index=False)

    线上分数:240.51261765759443

    ◒XGBoost

    1. xgb_oof, xgb_test = cv_model(xgb, train_df[cols], train_df['power'], test_df[cols], 'xgb')
    2. ------------------------------------------------------------------------------------
    3. #交叉验证分数
    4. [188.14222230685203, 189.79883333942658, 189.98780480651146, 188.90711501159402, 189.63885769696023]
    1. test_df['power'] = xgb_test
    2. test_df['power'] = test_df['power'].apply(lambda x: 0 if x<0 else x)
    3. test_df[['id_encode','ds','power']].to_csv(r'F:\Jupyter Files\比赛\新能源赛道初赛数据集\初赛1008\xgb_result.csv', index=False)

    线上分数:269.1201702406025

    ◒CatBoost

    1. cat_oof, cat_test = cv_model(CatBoostRegressor, train_df[cols], train_df['power'], test_df[cols], 'cat')
    2. ---------------------------------------------------------------------------------
    3. #交叉验证分数
    4. [217.60469992799398, 221.48162281844884, 221.30109254841568, 220.89774625184162, 219.70713010328046]
    1. test_df['power'] = cat_test
    2. test_df['power'] = test_df['power'].apply(lambda x: 0 if x<0 else x)
    3. test_df[['id_encode','ds','power']].to_csv(r'F:\Jupyter Files\比赛\新能源赛道初赛数据集\初赛1008\cat_result.csv', index=False)

    线上分数:302.69904271933

    6.总结

    从线下交叉验证结果来看,XGBoost>CatBoost>LightGBM,即XGBoost的效果最优;而从线上结果来看,LightGBM>XGBoost>CatBoost,即LightGBM的效果最优。由于线上与线下表现不一致、且各模型结果相差较大,所以暂未对这三个模型进行融合。接下来会更侧重对LightGBM和XGBoost进行调参,并继续做特征工程,争取取得更好的效果。

  • 相关阅读:
    Git学习1
    将json-bigint处理为数值分区数组的字段全部自动转为字符串
    DA3 网站的第10位用户信息读取
    Java笔记(12)------JDBC
    AcWing 505. 火柴排队(每日一题)
    【Python】蒙特卡洛模拟 | LCG 算法 | 马特赛特旋转算法 | Random 模块
    vue中node-sass下载失败Failed at the node-sass@7.0.1 postinstall script.解决方法
    ABeam Insight | 智能制造系列(4):物联网(IoT)× 智能制造
    基于Vite初始化前端项目
    【机器学习】深度神经网络(DNN):原理、应用与代码实践
  • 原文地址:https://blog.csdn.net/weixin_60200880/article/details/133916030