import xlrd as xl
from bunch import Bunch

Data1 = Bunch()
Data2 = Bunch()
Data3 = Bunch()

data1 = xl.open_workbook("1.xls")
data2 = xl.open_workbook("2.xls")
data3 = xl.open_workbook("3.xls")

work_sheet1 = data1.sheets()[0]
work_sheet2 = data2.sheets()[0]
work_sheet3 = data3.sheets()[0]

all_rows1 = work_sheet1.nrows
all_rows2 = work_sheet2.nrows
all_rows3 = work_sheet3.nrows

all_cols1 = work_sheet1.ncols
all_cols2 = work_sheet2.ncols
all_cols3 = work_sheet3.ncols

list0, list1, list2, list3 = [], [], [], []

start = "list"
for i in range(4):
    c = start + str(i)    # build the name "list0" ... "list3" as a string
    eval(c).append(1)     # evaluate the name to reach the matching list

print("list0", end=" ")
print(list0)

>>> list0 [1]
See, this eval() function works rather nicely here.
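That said, eval() on constructed strings is easy to get wrong; a dict keyed by name gives the same effect without it. A minimal equivalent sketch (the name lists is ours, not from the original):

# same effect as the eval() loop above, with the lists held in a dict
lists = {"list" + str(i): [] for i in range(4)}
for name in lists:
    lists[name].append(1)
print("list0", lists["list0"])   # prints: list0 [1]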
feature = []
for i in range(1, all_rows1):                    # row 0 is presumably the header
    d = []
    for j in range(3, all_cols1 - 2):            # the feature columns
        d.append(work_sheet1.cell_value(i, j))
    feature.append(d)
Data1["feature"] = feature

target = []
for i in range(1, all_rows1):
    target.append(work_sheet1.cell_value(i, all_cols1 - 2))   # the total-score column
Data1["target"] = target

from sklearn.linear_model import LinearRegression
regression = LinearRegression()
model = regression.fit(Data1["feature"], Data1["target"])
print("1 model.intercept_", end=" ")
print(model.intercept_)
print("model.coef_", end=" ")
print(model.coef_)
>>>
1 model.intercept_ -0.00040030675346258704
model.coef_ [0.50008753 0.99928412 0.99998659 1.00022466 1.0001154 ]
2 model.intercept_ 1.3076787747084921
model.coef_ [0.49392547 0.93628899 0.96930705 1.05106066 1.0070238 ]
3 model.intercept_ -8.526512829121202e-14
model.coef_ [0.5 1. 1. 1. 1. ]
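Only model 1's code is shown; the lines labelled 2 and 3 evidently come from repeating the same extraction and fit on the other two workbooks. One way to reproduce them (fit_sheet is a hypothetical helper, not in the original):

def fit_sheet(sheet, bunch, label):
    # mirror the Data1 extraction above for an arbitrary sheet
    bunch["feature"] = [[sheet.cell_value(i, j) for j in range(3, sheet.ncols - 2)]
                        for i in range(1, sheet.nrows)]
    bunch["target"] = [sheet.cell_value(i, sheet.ncols - 2) for i in range(1, sheet.nrows)]
    m = LinearRegression().fit(bunch["feature"], bunch["target"])
    print(label, "model.intercept_", m.intercept_)
    print("model.coef_", m.coef_)

fit_sheet(work_sheet2, Data2, 2)
fit_sheet(work_sheet3, Data3, 3)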
assert len(Data2["target"]) == len(Data2["feature"]), "wrong!"
S = []
for i in range(1, all_rows2 - 2):
    try:
        total = 0.5 * Data2["feature"][i][0]     # first column carries weight 0.5
    except IndexError:
        print(i)                                 # flag any row that is too short
    for j in range(1, len(Data2["feature"][i])):
        total += Data2["feature"][i][j]          # remaining columns carry weight 1
    S.append(total)
Q = []
for i in range(len(S)):
    Q.append((S[i] - Data2["target"][i]) / Data2["target"][i])
Q = [abs(i) for i in Q]    # relative error of the hand-computed score
Q1 = list(Q)               # work on a copy so Q keeps its original indices
for i in range(10):
    t = Q.index(max(Q1))   # index in Q of the current largest error
    print(t, end=" ")
    print(max(Q1))
    Q1.remove(max(Q1))

>>>
7 0.10667617011166551
44 0.04207573632538569
70 0.03807947019867545
69 0.027375201288244812
65 0.026153846153846198
71 0.018932874354561126
58 0.017569546120058607
72 0.017543859649122806
14 0.017052375152253246
30 0.01686121919584951
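As an aside, the same ten lines fall out of sorted() plus enumerate() without mutating a copy (equivalent up to ties):

# indices and values of the ten largest relative errors, in one pass
for t, err in sorted(enumerate(Q), key=lambda p: p[1], reverse=True)[:10]:
    print(t, err)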
from sklearn.ensemble import RandomForestRegressor
features = Data1["feature"]
targets = Data1["target"]
randomforest = RandomForestRegressor(random_state=0)
model = randomforest.fit(features, targets)
That error is clearly a little on the large side...
>>> model.predict([[90,10,10,10,10],[80,20,10,10,10],[95,10,10,10,10],[90,15,5,10,10],[85,20,10,0,10],[100,10,10,10,10]])
array([82.746 , 80.1668, 86.5164, 82.6754, 73.749 , 88.352 ])
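For reference, if the sheets really follow the weighting that model 3 recovered exactly (total = 0.5·x0 + x1 + x2 + x3 + x4), the true values for those six rows would be 85, 90, 87.5, 85, 82.5 and 90, so the forest under-predicts every one of them. A quick check:

rows = [[90,10,10,10,10], [80,20,10,10,10], [95,10,10,10,10],
        [90,15,5,10,10], [85,20,10,0,10], [100,10,10,10,10]]
# exact totals under the 0.5/1/1/1/1 weighting recovered by model 3
print([0.5 * r[0] + sum(r[1:]) for r in rows])   # [85.0, 90.0, 87.5, 85.0, 82.5, 90.0]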
All right, on with the analysis.
Now, can we get the random forest to do something more useful?
Say, assigning grade tiers based on the students' scores.
from sklearn.ensemble import RandomForestClassifier
# recode the score into a tier: 2 if >= 90, 1 if strictly between 80 and 90, else 0
Data1["target"] = [2*(i >= 90) + 1*((i < 90) & (i > 80)) + 0 for i in Data1["target"]]
randomforest = RandomForestClassifier(random_state=0)
model = randomforest.fit(Data1["feature"], Data1["target"])
The second line of that snippet is a very handy little programming trick, worth writing down.
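It works because Python's True and False behave as 1 and 0 in arithmetic, so a weighted sum of comparisons acts as a branchless if/elif/else. A tiny illustration (grade is just a name chosen here):

def grade(score):
    # True counts as 1, False as 0, so this is an if/elif/else in one expression
    return 2 * (score >= 90) + 1 * (80 < score < 90)

print([grade(s) for s in (95, 85, 70)])   # prints: [2, 1, 0]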
>>> model.predict([[100,20,10,10,10],[90,10,10,10,10],[80,20,10,10,10],[100,0,10,10,10]])
array([2, 1, 0, 1])
Hmm, the results still leave something to be desired.
Why is that?