• 随机森林项目实战---气温预测


    实战项目的三个任务:

    1.使用随机森林算法完成基本建模:包括数据预处理,特征展示,完成建模并进行可视化展示分析。

    2.分析数据样本量与特征个数对结果的影响:在保证算法一致的前提下,增加样本个数,观察结果变化;重新进行特征工程,引入新的特征后,观察结果走势。

    3.对随机森林算法进行调参,找到最合适的参数,掌握机器学习中两种调参方法,找到模型最优参数。

    任务1:

    1. import pandas as pd
    2. data =pd.read_csv()
    3. data.head()
    4. import datetime
    5. year = data['year']
    6. month =data['month']
    7. day =data['day']
    8. dates = [(str(year)+'-'+str(month)+'-'+str(day)) for year,month,day in zip(year,month,day)]
    9. dates=[datetime.datetime.strptime(date,'%Y-%m-%d') for date in dates]
    10. dates[:5]

    对时间序列进行重新调整,进行特征绘制。

    1. ##进行绘图
    2. import matplotlib.pyplot as plt
    3. %matplotlib inline
    4. plt.style.use('fivethirtyeight')##风格设置
    5. # 设置布局
    6. fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(nrows=2, ncols=2, figsize = (10,10))
    7. fig.autofmt_xdate(rotation = 45)
    8. # 标签值
    9. ax1.plot(dates, data['actual'])
    10. ax1.set_xlabel(''); ax1.set_ylabel('Temperature'); ax1.set_title('Max Temp')
    11. # 昨天
    12. ax2.plot(dates, data['temp_1'])
    13. ax2.set_xlabel(''); ax2.set_ylabel('Temperature'); ax2.set_title('Previous Max Temp')
    14. # 前天
    15. ax3.plot(dates, data['temp_2'])
    16. ax3.set_xlabel('Date'); ax3.set_ylabel('Temperature'); ax3.set_title('Two Days Prior Max Temp')
    17. # 我的逗逼朋友
    18. ax4.plot(dates, data['friend'])
    19. ax4.set_xlabel('Date'); ax4.set_ylabel('Temperature'); ax4.set_title('Friend Estimate')
    20. plt.tight_layout(pad=2)

    从中可以看出4个特征的基本影响走势。

    1. import numpy as np
    2. y = np.array(data['actual'])
    3. x = data.drop(['actual'],axis=1)
    4. x_list =list(x.columns)
    5. x = np.array(x)
    6. ##数据分类
    7. from sklearn.model_selection import train_test_split
    8. x_train,x_test,y_train,y_test =train_test_split(x,y,test_size=0.25,random_state=42)
    9. ##建立随机森林模型
    10. from sklearn.ensemble import RandomForestRegressor
    11. rfr = RandomForestRegressor(n_estimators=1000,random_state=42)
    12. rfr.fit(x_train,y_train)
    13. y_pred = rfr.predict(x_test)
    14. from sklearn.metrics import mean_squared_error
    15. mse=mean_squared_error(y_test,y_pred)
    16. print('mse',mse)

     这里完成了训练集与测试集的划分以及随机森林模型的建立,并用模型在测试集上的预测结果与真实值计算了MSE的值。

    随后进行了决策树的可视化。

    1. from sklearn.tree import export_graphviz
    2. import pydot
    3. tree = rfr.estimators_[5]
    4. export_graphviz(tree,out_file='tree.dot',
    5. feature_names=x_list,
    6. rounded=True,precision=1)
    7. (graph,) = pydot.graph_from_dot_file('tree.dot')
    8. graph.write_png('tree.png')

    由于树枝过于复杂繁多,所以进行预剪枝。

    1. ##进行预剪枝
    2. rfr_small = RandomForestRegressor(n_estimators=10,max_depth=3,random_state=42)
    3. rfr_small.fit(x_train,y_train)
    4. tree_small = rfr_small.estimators_[5]
    5. export_graphviz(tree_small,out_file='small_tree.dot',
    6. feature_names=x_list,
    7. rounded=True,precision=1)
    8. (graph,) = pydot.graph_from_dot_file('small_tree.dot')
    9. graph.write_png('small_tree.png')

    2.选择出重点的特征,然后对全特征与重点特征的结果进行比较

    这里使用了RandomForestRegressor的feature_importances_属性,可以输出各特征的重要性。

    1. ##通过randomforestregressor的feature_importance_显示特征重要性
    2. importance = list(rfr.feature_importances_)
    3. feature_importances =[(feature_name,importance) for feature_name,importance in zip(x_list,importance)]
    4. feature_importances =sorted(feature_importances,key =lambda x:x[1],reverse =True)##key 为以那一列数据为排列对象
    5. feature_importances
    6. ##以这两个特征为唯二的特征进行计算
    7. rfr = RandomForestRegressor(n_estimators=100,random_state=42)
    8. new_x = np.array(data.iloc[:,4:5])
    9. new_x_train,new_x_test,new_y_train,new_y_test =train_test_split(new_x,y,test_size=.25,random_state=42)
    10. rfr.fit(new_x_train,new_y_train)
    11. y_pred = rfr.predict(new_x_test)
    12. print('mse',mean_squared_error(new_y_test,y_pred))

    相比之下,mse值上升,说明效果变差,其他特征也有重要作用。

    任务二:数据与特征对结果的影响分析。

    这里读取了数据的拓展包进行测试。操作与上面的一样

    1. import pandas as pd
    2. data =pd.read_csv()
    3. data.head()
    4. ##绘图观察数据
    5. # 转换成标准格式
    6. import datetime
    7. # 得到各种日期数据
    8. years = data['year']
    9. months = data['month']
    10. days = data['day']
    11. # 格式转换
    12. dates = [str(int(year)) + '-' + str(int(month)) + '-' + str(int(day)) for year, month, day in zip(years, months, days)]
    13. dates = [datetime.datetime.strptime(date, '%Y-%m-%d') for date in dates]
    14. # 绘图
    15. import matplotlib.pyplot as plt
    16. %matplotlib inline
    17. # 风格设置
    18. plt.style.use('fivethirtyeight')
    19. # Set up the plotting layout
    20. fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(nrows=2, ncols=2, figsize = (15,10))
    21. fig.autofmt_xdate(rotation = 45)
    22. # Actual max temperature measurement
    23. ax1.plot(dates, data['actual'])
    24. ax1.set_xlabel(''); ax1.set_ylabel('Temperature (F)'); ax1.set_title('Max Temp')
    25. # Temperature from 1 day ago
    26. ax2.plot(dates, data['temp_1'])
    27. ax2.set_xlabel(''); ax2.set_ylabel('Temperature (F)'); ax2.set_title('Prior Max Temp')
    28. # Temperature from 2 days ago
    29. ax3.plot(dates, data['temp_2'])
    30. ax3.set_xlabel('Date'); ax3.set_ylabel('Temperature (F)'); ax3.set_title('Two Days Prior Max Temp')
    31. # Friend Estimate
    32. ax4.plot(dates, data['friend'])
    33. ax4.set_xlabel('Date'); ax4.set_ylabel('Temperature (F)'); ax4.set_title('Friend Estimate')
    34. plt.tight_layout(pad=2)

     由于多了特征,对多出来的特征进行组合与处理。

    1. seasons=[]
    2. for month in data['month']:
    3. if month in[1,2,12]:
    4. seasons.append('winter')
    5. elif month in [3,4,5]:
    6. seasons.append('spring')
    7. elif month in [6,7,8]:
    8. seasons.append('summer')
    9. else:
    10. seasons.append('auntumn')
    11. reduced_x = data[['temp_1','prcp_1','average','actual']]
    12. reduced_x['seasons']=seasons
    13. # 导入seaborn工具包
    14. import seaborn as sns
    15. sns.set(style="ticks", color_codes=True);
    16. # 选择你喜欢的颜色模板
    17. palette = sns.xkcd_palette(['dark blue', 'dark green', 'gold', 'orange'])
    18. # 绘制pairplot
    19. sns.pairplot(reduced_x, hue = 'seasons', diag_kind = 'kde', palette= palette, plot_kws=dict(alpha = 0.7),
    20. diag_kws=dict(shade=True));

     画出了按四个季节着色的各变量两两关系图(pairplot)。

    先改变数据量,测试数据量对模型效果的影响。

    1. data = pd.get_dummies(data)
    2. new_y = np.array(data['actual'])
    3. new_x = data.drop(['actual'],axis=1)
    4. new_x_list =list(new_x.columns)
    5. new_x = np.array(new_x)
    6. from sklearn.model_selection import train_test_split
    7. new_x_train,new_x_test,new_y_train,new_y_test =train_test_split(new_x,new_y,test_size=0.25,random_state=42)
    8. old_y = np.array(data['actual'])
    9. old_x = data.drop(['actual'],axis=1)
    10. old_x_list =list(old_x.columns)
    11. old_x = np.array(old_x)
    12. from sklearn.model_selection import train_test_split
    13. old_x_train,old_x_test,old_y_train,old_y_test =train_test_split(x,y,test_size=0.25,random_state=42)
    14. def model_train_predict(x_train,y_train,x_test,y_test):
    15. rfr = RandomForestRegressor(n_estimators=100,random_state=42)
    16. rfr.fit(x_train,y_train)
    17. y_pred = rfr.predict(x_test)
    18. errors= abs(y_pred-y_test)
    19. print('平均误差',round(np.mean(errors),2))
    20. accuracy = 100-np.mean(errors)
    21. print('平均正确率',accuracy)
    22. model_train_predict(old_x_train,old_y_train,old_x_test,old_y_test)
    23. model_train_predict(ori_new_x_train,new_y_train,ori_new_x_test,new_y_test)

    从结果可以发现,当数据量增加,误差减少。

    然后改变特征数量,判断其对效果的影响。

    1. rfr = RandomForestRegressor(n_estimators=100,random_state=42)
    2. rfr.fit(new_x_train,new_y_train)
    3. y_pred = rfr.predict(new_x_test)
    4. errors= abs(y_pred-new_y_test)
    5. print('平均误差',round(np.mean(errors),2))
    6. accuracy = 100-np.mean(errors)
    7. print('平均正确率',accuracy)
    8. importances = list(rfr.feature_importances_)
    9. feature_importances =[(feature,importance) for feature,importance in zip(new_x_list,importances)]
    10. feature_importances = sorted(feature_importances,key =lambda x:x[1],reverse =True)
    11. # 对特征进行排序
    12. x_values =list(range(len(importances)))
    13. sorted_importances = [importance[1] for importance in feature_importances]
    14. sorted_features = [importance[0] for importance in feature_importances]
    15. # 累计重要性
    16. cumulative_importances = np.cumsum(sorted_importances)
    17. # 绘制折线图
    18. plt.plot(x_values, cumulative_importances, 'g-')
    19. # 画一条红色虚线,0.95那
    20. plt.hlines(y = 0.95, xmin=0, xmax=len(sorted_importances), color = 'r', linestyles = 'dashed')
    21. # X轴
    22. plt.xticks(x_values, sorted_features, rotation = 'vertical')
    23. # Y轴和名字
    24. plt.xlabel('Variable'); plt.ylabel('Cumulative Importance'); plt.title('Cumulative Importances');

     根据累计特征重要性曲线,排名靠前的5个特征的累计重要性已超过95%,基本可以涵盖绝大部分重要性。

    1. important_feature_names =[feature[0] for feature in feature_importances[0:5]]
    2. important_feature_indices =[new_x_list.index(feature) for feature in important_feature_names]
    3. important_x_train = new_x_train[:,important_feature_indices]
    4. important_x_test = new_x_test[:,important_feature_indices]
    5. model_train_predict(important_x_train,new_y_train,important_x_test,new_y_test)
    6. ##运行时间的提升
    7. import time
    8. all_features_time=[]
    9. for _ in range(10):
    10. start_time = time.time()
    11. rfr.fit(new_x_train,new_y_train)
    12. y_pred = rfr.predict(new_x_test)
    13. end_time =time.time()
    14. all_features_time.append((end_time-start_time))
    15. all_features_times=np.mean(all_features_time)
    16. all_features_time=[]
    17. for _ in range(10):
    18. start_time = time.time()
    19. rfr.fit(important_x_train,new_y_train)
    20. y_pred = rfr.predict(important_x_test)
    21. end_time =time.time()
    22. all_features_time.append((end_time-start_time))
    23. reduced_features_times=np.mean(all_features_time)
    24. all_accuracy =100*(1-np.mean(abs(all_y_pred-new_y_test)/new_y_test))
    25. reduced_accuracy =100*(1-np.mean(abs(reduced_y_pred-new_y_test)/new_y_test))
    26. comparison = pd.DataFrame({'features': ['all (17)', 'reduced (5)'],
    27. 'run_time': [all_features_times, reduced_features_times],
    28. 'accuracy': [all_accuracy, reduced_accuracy]})
    29. comparison[['features', 'accuracy', 'run_time']]

    这里对运行时间与正确率进行了比较,发现数据量越多、特征越多,模型的效果越好。

    任务三:调参:这里使用RandomizedSearchCV与GridSearchCV两种调参方式进行参数的选择。

    1. from sklearn.model_selection import RandomizedSearchCV
    2. n_estimators =[int(x) for x in np.linspace(start=200,stop=2000,num=10)]
    3. max_features=['auto','sqrt']
    4. max_depth = [int(x) for x in np.linspace(10,20,num=2)]
    5. max_depth.append(None)
    6. min_samples_split=[2,5,10]
    7. min_samples_leaf=[1,2,4]
    8. bootstrap = [True,False]
    9. random_grid = {'n_estimators': n_estimators,
    10. 'max_features': max_features,
    11. 'max_depth': max_depth,
    12. 'min_samples_split': min_samples_split,
    13. 'min_samples_leaf': min_samples_leaf,
    14. 'bootstrap': bootstrap}
    15. rf = RandomForestRegressor()
    16. rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
    17. n_iter = 100, scoring='neg_mean_absolute_error',
    18. cv = 3, verbose=2, random_state=42, n_jobs=-1)
    19. # 执行寻找操作
    20. rf_random.fit(new_x_train, new_y_train)
    21. from sklearn.model_selection import GridSearchCV
    22. # 网格搜索
    23. param_grid = {
    24. 'bootstrap': [True],
    25. 'max_depth': [8,10,12],
    26. 'max_features': ['auto'],
    27. 'min_samples_leaf': [2,3, 4, 5,6],
    28. 'min_samples_split': [3, 5, 7],
    29. 'n_estimators': [800, 900, 1000, 1200]
    30. }
    31. # 选择基本算法模型
    32. rf = RandomForestRegressor()
    33. # 网格搜索
    34. grid_search = GridSearchCV(estimator = rf, param_grid = param_grid,
    35. scoring = 'neg_mean_absolute_error', cv = 3,
    36. n_jobs = -1, verbose = 2)
    37. grid_search.fit(train_features, train_labels)

    最后发现,可以先通过随机搜索确定大方向,再使用网格搜索进行精细化搜索。

  • 相关阅读:
    数组——螺旋矩阵II
    shell练习
    Linux ps 命令使用介绍
    太极限了,JDK的这个BUG都能被我踩到!
    STM32定时器篇——通用定时器的使用(定时中断,PWM输出)
    一款WPF的精简版MVVM框架——stylet框架的初体验(包括MVVM绑定、依赖注入等操作)
    GlassFish内存马分析
    SpringBoot中优雅地实现统一响应对象
    十大基础排序算法
    Linux应用开发基础知识——输入系统应用编程(八)
  • 原文地址:https://blog.csdn.net/lovexyyforever/article/details/126066062