该心脏病数据集由 5 个此前各自独立发布、但从未被合并过的常用心脏病数据集整合而成。这 5 个数据集依据 11 个共同特征进行合并,使其成为迄今为止可用于研究目的的最大的心脏病数据集。
该数据集包含 1190 个实例和 11 个特征。将这些数据集收集并整合到一处,旨在推动与冠状动脉疾病(CAD)相关的机器学习和数据挖掘算法的研究,并希望最终促进临床诊断和早期治疗。
import numpy as np # 线性代数
import pandas as pd # 数据处理, CSV file I/O (e.g. pd.read_csv)
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from IPython.core.display import display, HTML
# Load the heart-disease CSV; the path is relative to the current working directory.
df = pd.read_csv("./data/heart_statlog_cleveland_hungary_final.csv")
# The bare expressions below are exploratory inspections. Their values are only
# displayed in an interactive/notebook session; run as a plain script they are
# evaluated and discarded.
df.shape
df.head()
df.describe()
# Per-column count of missing values.
df.isna().sum()
# Column names grouped as categorical; note the list also contains the "target" column.
categoricals = ["sex", "chest pain type", "fasting blood sugar", "resting ecg", "exercise angina", "ST slope", "target"]
# NOTE(review): two figure grids are created back to back and the second call
# rebinds `fig`/`axes`. The plotting code that would draw on the first 1x7 grid
# is not visible here (possibly lost when the notebook was exported) - confirm.
fig, axes = plt.subplots(nrows=1, ncols=7, figsize=(12, 5))
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(12, 7))
# Counter initialised to 0; the code that reads or advances it is not visible here.
index = 0
# Pairwise plot grid over the `numericals` columns, coloured by the last column of `df`.
# NOTE(review): `numericals` is not defined in the visible code - presumably a
# list of numeric column names; confirm where it is assigned.
sns.pairplot(df, vars=numericals, hue=df.columns[-1])
# One row of five axes; axes[i] is used for the i-th entry of `numericals`
# (assumes `numericals` has at most five entries - TODO confirm).
fig, axes = plt.subplots(nrows=1, ncols=5, figsize=(10, 7))
for i, j in enumerate(numericals):
# `plots` is a helper defined outside this chunk; it is passed the frame, the
# name of the last column, the current column name and the i-th axis.
plots(df, df.columns[-1], j, axes[i])
fig, axes = plt.subplots(nrows=1, ncols=5, figsize=(10, 7))
# NOTE(review): this loop has no visible body and the file's indentation is
# missing, so the code does not parse as written - restore from the notebook.
for i, j in enumerate(numericals):
# Iterate over the distinct values of the last column of `df` and show a
# heading for each value (one text for value 0, another for any other value).
# NOTE(review): indentation is missing and the two HTML string literals below
# are split across lines, so this does not parse as written; the original
# markup was probably lost during export - restore it before running.
for i in df[df.columns[-1]].unique():
if i == 0:
display(HTML(
"柱状图,条形图,箱形图显示了无心脏病患者的数值分布
"))
else:
display(HTML(
"柱状图,柱状图,箱形图显示了心脏病患者的数值分布
"))
# Rows whose last-column value equals the current value.
temp_df = df[df[df.columns[-1]] == i]
# Feature matrix: every column except the last. Label vector: the last column.
x = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

# Split BEFORE scaling. The split depends only on the number of rows and the
# seed, so the same rows land in train/test as when splitting scaled data.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42, test_size=0.2)

# Fit the scaler on the training rows only so that the hold-out rows never
# influence the learned min/max (avoids train/test leakage), then apply the
# same transformation to the test rows.
scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# Keep `x` in scaled form for any later code that reads it; it now uses the
# training-set statistics, so hold-out rows may fall slightly outside [0, 1].
x = scaler.transform(x)
# Candidate classifiers, all with library-default hyperparameters.
# scikit-learn tree ensembles.
rfc, gbc, etc, abc = (
    RandomForestClassifier(),
    GradientBoostingClassifier(),
    ExtraTreesClassifier(),
    AdaBoostClassifier(),
)
# scikit-learn linear model and support-vector classifier.
lgr, svc = LogisticRegression(), SVC()
# Gradient-boosting classifiers from the xgboost and lightgbm packages.
xgb, lgb = XGBClassifier(), LGBMClassifier()
# Result containers: scores in iteration order, reports and matrices keyed by name.
scores = []
reports = {}
matrices = {}

# `models`, `names` and `training` are defined outside this chunk; `training`
# is called once per (model, name) pair and yields a 3-tuple.
for model, model_name in zip(models, names):
    score, report, matrix = training(model, model_name)
    scores.append(score)
    reports[model_name] = report
    matrices[model_name] = matrix
# Score table indexed by model name, highest score first, rounded to two decimals.
dt = pd.DataFrame({"score": scores}, index=names).sort_values("score", ascending=False)
dt["score"] = dt["score"].round(2)

# Bar chart of the ranked scores on a fresh single-axes figure.
fig, axes = plt.subplots()
sns.barplot(x=dt.index, y=dt["score"], ax=axes)
# 2x4 grid of axes (eight panels in total).
fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(11, 7))
# Counter initialised to 0; the code that reads or advances it is not visible here.
index = 0
# NOTE(review): the body of this loop is not visible in this chunk (likely lost
# during export together with the indentation) - restore it from the notebook.
for i in range(2):
# Print each model's stored report in the order of the score table: a 30-star
# rule, the model name, the report, then blank lines as a separator.
for model_name in dt.index:
    print("*" * 30, model_name, reports[model_name], "\n\n", sep="\n")