from typing import Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler


def _split_features_labels(frame: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Return (every column but the last, the last column as a one-column DataFrame)."""
    # A list indexer ([-1]) keeps the result two-dimensional.
    return frame.iloc[:, :-1], frame.iloc[:, [-1]]


def train(
    train_path: str = 'horseColicTraining.txt',
    test_path: str = 'horseColicTest.txt',
) -> Tuple[np.ndarray, pd.DataFrame]:
    """Fit a logistic-regression classifier and evaluate it on a held-out file.

    Both files are read as tab-separated tables without a header row. In each
    table the last column is used as the target and all preceding columns as
    features. Features are standardized with statistics learned from the
    training file only, then a ``LogisticRegression(C=0.04, max_iter=10000)``
    model is fitted.

    The coefficients, intercept, test-set predictions, accuracy and a
    classification report are printed to stdout.

    Args:
        train_path: Path of the training table.
        test_path: Path of the test table.

    Returns:
        A pair ``(y_predict, y_test)``: the predicted labels for the test
        table as a 1-D array, and the true test labels as a single-column
        DataFrame.

    Raises:
        Any error raised by ``pandas.read_csv`` (for example when a file is
        missing) is propagated unchanged.
    """
    # 1) Load the data. The train/test split is given by the two files, and
    #    no missing-value handling is applied here.
    train_frame = pd.read_csv(train_path, delimiter='\t', header=None)
    test_frame = pd.read_csv(test_path, delimiter='\t', header=None)
    x_train_raw, y_train_frame = _split_features_labels(train_frame)
    x_test_raw, y_test_frame = _split_features_labels(test_frame)

    # 2) Standardize the features. The scaler is fitted on the training data
    #    only and reused for the test data so no test statistics leak in.
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train_raw)
    x_test = scaler.transform(x_test_raw)

    # The estimator expects a 1-D target, so flatten the (n, 1) label column.
    y_train = np.asarray(y_train_frame).ravel()

    # 3) Fit the logistic-regression estimator.
    estimator = LogisticRegression(C=0.04, max_iter=10000)
    estimator.fit(x_train, y_train)

    # Learned coefficients and intercept.
    print('回归系数为:\n', estimator.coef_)
    print('偏置为:', estimator.intercept_)

    # 4) Evaluate on the test set.
    y_predict = estimator.predict(x_test)
    print('测试集的预测值为:\n', y_predict)
    accuracy = estimator.score(x_test, y_test_frame)
    print('模型预测准确率为:', accuracy)

    # Precision, recall, F1-score and support per class. The report names
    # label 1 '死亡' and label 0 '没死'.
    # NOTE(review): confirm this matches the label encoding of the data files.
    report = classification_report(
        y_test_frame,
        y_predict,
        labels=[1, 0],
        target_names=['死亡', '没死'],
    )
    print(report)
    return y_predict, y_test_frame

# Train the model, then plot predicted and true test labels per sample.
y1, y2 = train()

# Derive the x positions from the number of predictions instead of a
# hard-coded sample count, so the plot still works when the test set size
# changes. Integer positions are the sample indices.
sample_index = np.arange(len(y1))

fig = plt.figure()
plt.scatter(sample_index, y1, alpha=0.5, label='predicted')
# y2 is a single-column table; flatten it so both series are 1-D.
plt.scatter(sample_index, np.asarray(y2).ravel(), alpha=0.5, label='actual')
plt.legend()

plt.show()
# Result visualization (just a quick, informal example)
