import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Use the SimHei font so Chinese labels render correctly, and turn off the
# Unicode minus so negative signs display properly.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
import warnings
# Install a global "ignore" filter: no message/category/module restriction is
# given, so all warnings raised after this point are hidden.
warnings.filterwarnings("ignore")
# Load the pre-processed dataset from the Excel workbook.
df = pd.read_excel('预处理之后的数据.xlsx')
# Optional z-score standardisation, currently disabled:
# data = df.apply(lambda x: (x - np.mean(x)) / np.std(x))
data = df
# data.head()  # peek at the first 5 rows
# data.shape

# Predictor columns: every column except the target '房屋总价' (total house price).
# list.remove raises ValueError if the target column is missing.
X_cols = list(data.columns)
X_cols.remove('房屋总价')
# Full column list with the target moved to the end.
attributes = X_cols + ['房屋总价']

# Show both column orderings.
print(X_cols)
print(attributes)

# Independent variables X and dependent variable y as NumPy arrays.
X, y = np.array(data[X_cols]), np.array(data['房屋总价'])
# Split into training and test sets: 20% of the rows are held out for testing,
# with a fixed seed so the split is reproducible.
from sklearn import model_selection

X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=123,
)
# Hyper-parameter grid search, kept for reference but disabled.
# NOTE(review): the fixed values used below presumably came from a run of this
# search -- confirm before changing them.
# parameters = {'n_estimators': np.arange(10, 200, 5),
#               'max_depth': np.arange(1, 15),
#               'max_features': np.linspace(0.1, 1.0, 10)}
# model = RandomForestRegressor()
# grid_search = GridSearchCV(model, parameters, cv=5)
# grid_search.fit(X_train, y_train)
# params = grid_search.best_params_
# params

# fit() returns the estimator itself, so construction and training are chained.
model = RandomForestRegressor(
    n_estimators=50,
    max_depth=14,
    max_features=0.3,
    random_state=123,
).fit(X_train, y_train)

# Report the regressor's score (R^2) on the training and the held-out data.
print('Training score', model.score(X_train, y_train))
print('Testing score', model.score(X_test, y_test))
# Tabulate the importance the fitted forest assigned to each predictor:
# one row per feature, columns '变量名' (feature name) and '重要性' (importance).
df_new = pd.DataFrame({
    '变量名': X_cols,
    '重要性': list(model.feature_importances_),
})
# sort_values returns a NEW frame (it is not in-place by default); the result
# must be assigned back, otherwise the ordering is silently discarded.
df_new = df_new.sort_values(by='重要性', ascending=False)
# Index by feature name so the names become the bar labels.
df_new.set_index('变量名', inplace=True)
# Horizontal bar chart of the importances.
# NOTE(review): barh draws rows bottom-to-top, so the most important feature
# ends up at the bottom; sort ascending instead if it should be on top.
df_new.plot(kind="barh", legend=False)
# TODO: draw the pie chart next
# 问题:如何在上述代码的基础上,继续绘制各影响因素对房屋总价影响程度的饼图?(已导入 580 条数据)