• 机器学习的数据质量


    概述

    简而言之,机器学习模型使用输入数据并产生预测。预测的质量直接对应于您训练模型的数据质量;垃圾进,垃圾出。查看这篇文章,了解在哪里使用 AI 以及如何正确应用它。

    我们将通过具体的代码示例和一些合成数据来训练我们的模型。任务是根据白细胞(leukocyte)计数和血压确定肿瘤是良性(无害)还是恶性(有害)。这是我们创建的合成数据集,没有临床相关性。

    设置

    我们将设置随机种子,以保证实验结果可以复现。

    1. import numpy as np
    2. import random
    SEED = 1234
    1. # Set seed for reproducibility
    2. np.random.seed(SEED)
    3. random.seed(SEED)

    完整数据集

    我们将首先用整个数据集训练一个模型。稍后我们将删除数据集的一个子集,并查看它对我们模型的影响。

    加载数据

    1. import matplotlib.pyplot as plt
    2. import pandas as pd
    3. from pandas.plotting import scatter_matrix
    1. # Load data
    2. url = "https://raw.githubusercontent.com/GokuMohandas/Made-With-ML/main/datasets/tumors.csv"
    3. df = pd.read_csv(url, header=0) # load
    4. df = df.sample(frac=1).reset_index(drop=True) # shuffle
    5. df.head()
    白细胞计数血压肿瘤类
    015.33586014.637535良性
    19.85753514.518942恶性的
    217.63257915.869585良性
    318.36917414.774547良性
    414.50936715.892224恶性的
    1. # Define X and y
    2. X = df[["leukocyte_count", "blood_pressure"]].values
    3. y = df["tumor_class"].values
    4. print ("X: ", np.shape(X))
    5. print ("y: ", np.shape(y))

     X:  (1000, 2)
    y:  (1000,)

    1. # Plot data
    2. colors = {"benign": "red", "malignant": "blue"}
    3. plt.scatter(X[:, 0], X[:, 1], c=[colors[_y] for _y in y], s=25, edgecolors="k")
    4. plt.xlabel("leukocyte count")
    5. plt.ylabel("blood pressure")
    6. plt.legend(["malignant", "benign"], loc="upper right")
    7. plt.show()

    我们希望为我们的任务选择具有强预测信号的特征。如果你想提高性能,你需要通过收集和添加新信号来不断地做特征工程。因此,您可能会遇到与现有特征高度相关(共线)的新特征,但它可能仍包含一些独特的信号,有助于提高您的预测性能。

    1. # Correlation matrix
    2. scatter_matrix(df, figsize=(5, 5));
    3. df.corr()
    白细胞计数血压
    白细胞计数1.000000-0.162875
    血压-0.1628751.000000

    拆分数据

    1. import collections
    2. from sklearn.model_selection import train_test_split
    1. TRAIN_SIZE = 0.70
    2. VAL_SIZE = 0.15
    3. TEST_SIZE = 0.15
    1. def train_val_test_split(X, y, train_size):
    2. """Split dataset into data splits."""
    3. X_train, X_, y_train, y_ = train_test_split(X, y, train_size=TRAIN_SIZE, stratify=y)
    4. X_val, X_test, y_val, y_test = train_test_split(X_, y_, train_size=0.5, stratify=y_)
    5. return X_train, X_val, X_test, y_train, y_val, y_test
    1. # Create data splits
    2. X_train, X_val, X_test, y_train, y_val, y_test = train_val_test_split(
    3. X=X, y=y, train_size=TRAIN_SIZE)
    4. print (f"X_train: {X_train.shape}, y_train: {y_train.shape}")
    5. print (f"X_val: {X_val.shape}, y_val: {y_val.shape}")
    6. print (f"X_test: {X_test.shape}, y_test: {y_test.shape}")
    7. print (f"Sample point: {X_train[0]}{y_train[0]}")

    X_train: (700, 2), y_train: (700,)
    X_val: (150, 2), y_val: (150,)
    X_test: (150, 2), y_test: (150,)
    Sample point: [11.5066204  15.98030799] → malignant 

    标签编码

    from sklearn.preprocessing import LabelEncoder
    1. # Output vectorizer
    2. label_encoder = LabelEncoder()
    1. # Fit on train data
    2. label_encoder = label_encoder.fit(y_train)
    3. classes = list(label_encoder.classes_)
    4. print (f"classes: {classes}")

    classes: ["benign", "malignant"] 

    1. # Convert labels to tokens
    2. print (f"y_train[0]: {y_train[0]}")
    3. y_train = label_encoder.transform(y_train)
    4. y_val = label_encoder.transform(y_val)
    5. y_test = label_encoder.transform(y_test)
    6. print (f"y_train[0]: {y_train[0]}")

     y_train[0]: malignant
    y_train[0]: 1

    1. # Class weights
    2. counts = np.bincount(y_train)
    3. class_weights = {i: 1.0/count for i, count in enumerate(counts)}
    4. print (f"counts: {counts}\nweights: {class_weights}")

     counts: [272 428]

    weights: {0: 0.003676470588235294, 1: 0.002336448598130841}

    标准化数据

    from sklearn.preprocessing import StandardScaler
    1. # Standardize the data (mean=0, std=1) using training data
    2. X_scaler = StandardScaler().fit(X_train)
    1. # Apply scaler on training and test data (don't standardize outputs for classification)
    2. X_train = X_scaler.transform(X_train)
    3. X_val = X_scaler.transform(X_val)
    4. X_test = X_scaler.transform(X_test)
    1. # Check (means should be ~0 and std should be ~1)
    2. print (f"X_test[0]: mean: {np.mean(X_test[:, 0], axis=0):.1f}, std: {np.std(X_test[:, 0], axis=0):.1f}")
    3. print (f"X_test[1]: mean: {np.mean(X_test[:, 1], axis=0):.1f}, std: {np.std(X_test[:, 1], axis=0):.1f}")

     X_test[0]: mean: 0.0, std: 1.0

    X_test[1]: mean: 0.0, std: 1.0

    模型

    1. import torch
    2. from torch import nn
    3. import torch.nn.functional as F
    1. # Set seed for reproducibility
    2. torch.manual_seed(SEED)
    1. INPUT_DIM = 2 # X is 2-dimensional
    2. HIDDEN_DIM = 100
    3. NUM_CLASSES = 2
    1. class MLP(nn.Module):
    2. def __init__(self, input_dim, hidden_dim, num_classes):
    3. super(MLP, self).__init__()
    4. self.fc1 = nn.Linear(input_dim, hidden_dim)
    5. self.fc2 = nn.Linear(hidden_dim, num_classes)
    6. def forward(self, x_in):
    7. z = F.relu(self.fc1(x_in)) # ReLU activation function added!
    8. z = self.fc2(z)
    9. return z
    1. # Initialize model
    2. model = MLP(input_dim=INPUT_DIM, hidden_dim=HIDDEN_DIM, num_classes=NUM_CLASSES)
    3. print (model.named_parameters)

    训练

    from torch.optim import Adam
    1. LEARNING_RATE = 1e-3
    2. NUM_EPOCHS = 5
    3. BATCH_SIZE = 32
    1. # Define Loss
    2. class_weights_tensor = torch.Tensor(list(class_weights.values()))
    3. loss_fn = nn.CrossEntropyLoss(weight=class_weights_tensor)
    1. # Accuracy
    2. def accuracy_fn(y_pred, y_true):
    3. n_correct = torch.eq(y_pred, y_true).sum().item()
    4. accuracy = (n_correct / len(y_pred)) * 100
    5. return accuracy
    1. # Optimizer
    2. optimizer = Adam(model.parameters(), lr=LEARNING_RATE)
    1. # Convert data to tensors
    2. X_train = torch.Tensor(X_train)
    3. y_train = torch.LongTensor(y_train)
    4. X_val = torch.Tensor(X_val)
    5. y_val = torch.LongTensor(y_val)
    6. X_test = torch.Tensor(X_test)
    7. y_test = torch.LongTensor(y_test)
    1. # Training
    2. for epoch in range(NUM_EPOCHS*10):
    3. # Forward pass
    4. y_pred = model(X_train)
    5. # Loss
    6. loss = loss_fn(y_pred, y_train)
    7. # Zero all gradients
    8. optimizer.zero_grad()
    9. # Backward pass
    10. loss.backward()
    11. # Update weights
    12. optimizer.step()
    13. if epoch%10==0:
    14. predictions = y_pred.max(dim=1)[1] # class
    15. accuracy = accuracy_fn(y_pred=predictions, y_true=y_train)
    16. print (f"Epoch: {epoch} | loss: {loss:.2f}, accuracy: {accuracy:.1f}")

    Output:

    Epoch: 0 | loss: 0.70, accuracy: 49.6
    Epoch: 10 | loss: 0.54, accuracy: 93.7
    Epoch: 20 | loss: 0.43, accuracy: 97.1
    Epoch: 30 | loss: 0.35, accuracy: 97.0
    Epoch: 40 | loss: 0.30, accuracy: 97.4

    评估

    1. import json
    2. import matplotlib.pyplot as plt
    3. from sklearn.metrics import precision_recall_fscore_support
    1. def get_metrics(y_true, y_pred, classes):
    2. """Per-class performance metrics."""
    3. # Performance
    4. performance = {"overall": {}, "class": {}}
    5. # Overall performance
    6. metrics = precision_recall_fscore_support(y_true, y_pred, average="weighted")
    7. performance["overall"]["precision"] = metrics[0]
    8. performance["overall"]["recall"] = metrics[1]
    9. performance["overall"]["f1"] = metrics[2]
    10. performance["overall"]["num_samples"] = np.float64(len(y_true))
    11. # Per-class performance
    12. metrics = precision_recall_fscore_support(y_true, y_pred, average=None)
    13. for i in range(len(classes)):
    14. performance["class"][classes[i]] = {
    15. "precision": metrics[0][i],
    16. "recall": metrics[1][i],
    17. "f1": metrics[2][i],
    18. "num_samples": np.float64(metrics[3][i]),
    19. }
    20. return performance
    1. # Predictions
    2. y_prob = F.softmax(model(X_test), dim=1)
    3. y_pred = y_prob.max(dim=1)[1]
    1. # # Performance
    2. performance = get_metrics(y_true=y_test, y_pred=y_pred, classes=classes)
    3. print (json.dumps(performance, indent=2))

     Output:

    {
      "overall": {
        "precision": 0.9461538461538461,
        "recall": 0.9619565217391304,
        "f1": 0.9517707041477195,
        "num_samples": 150.0
      },
      "class": {
        "benign": {
          "precision": 0.8923076923076924,
          "recall": 1.0,
          "f1": 0.9430894308943091,
          "num_samples": 58.0
        },
        "malignant": {
          "precision": 1.0,
          "recall": 0.9239130434782609,
          "f1": 0.96045197740113,
          "num_samples": 92.0
        }
      }
    }
    

    推理

    我们将绘制一个点,我们知道它属于恶性肿瘤类。我们训练好的模型可以准确预测它确实是恶性肿瘤!

    1. def plot_multiclass_decision_boundary(model, X, y):
    2. x_min, x_max = X[:, 0].min() - 0.1, X[:, 0].max() + 0.1
    3. y_min, y_max = X[:, 1].min() - 0.1, X[:, 1].max() + 0.1
    4. xx, yy = np.meshgrid(np.linspace(x_min, x_max, 101), np.linspace(y_min, y_max, 101))
    5. cmap = plt.cm.Spectral
    6. X_test = torch.from_numpy(np.c_[xx.ravel(), yy.ravel()]).float()
    7. y_pred = F.softmax(model(X_test), dim=1)
    8. _, y_pred = y_pred.max(dim=1)
    9. y_pred = y_pred.reshape(xx.shape)
    10. plt.contourf(xx, yy, y_pred, cmap=plt.cm.Spectral, alpha=0.8)
    11. plt.scatter(X[:, 0], X[:, 1], c=y, s=40, cmap=plt.cm.RdYlBu)
    12. plt.xlim(xx.min(), xx.max())
    13. plt.ylim(yy.min(), yy.max())
    1. # Visualize the decision boundary
    2. plt.figure(figsize=(8,5))
    3. plt.title("Test")
    4. plot_multiclass_decision_boundary(model=model, X=X_test, y=y_test)
    5. # Sample point near the decision boundary
    6. mean_leukocyte_count, mean_blood_pressure = X_scaler.transform(
    7. [[np.mean(df.leukocyte_count), np.mean(df.blood_pressure)]])[0]
    8. plt.scatter(mean_leukocyte_count+0.05, mean_blood_pressure-0.05, s=200,
    9. c="b", edgecolor="w", linewidth=2)
    10. # Annotate
    11. plt.annotate("true: malignant,\npred: malignant",
    12. color="white",
    13. xy=(mean_leukocyte_count, mean_blood_pressure),
    14. xytext=(0.4, 0.65),
    15. textcoords="figure fraction",
    16. fontsize=16,
    17. arrowprops=dict(facecolor="white", shrink=0.1))
    18. plt.show()

    太好了!我们在训练和测试数据拆分上都获得了出色的表现。我们将使用这个数据集来展示数据质量的重要性。

    减少数据集

    让我们移除决策边界附近的一些训练数据,看看模型现在有多健壮。

    加载数据

    1. # Raw reduced data
    2. url = "https://raw.githubusercontent.com/GokuMohandas/Made-With-ML/main/datasets/tumors_reduced.csv"
    3. df_reduced = pd.read_csv(url, header=0) # load
    4. df_reduced = df_reduced.sample(frac=1).reset_index(drop=True) # shuffle
    5. df_reduced.head()
    白细胞计数血压肿瘤类
    016.79518614.434741良性
    113.47296915.250393恶性的
    29.84045016.434717恶性的
    316.39073014.419258良性
    413.36797415.741790恶性的
    1. # Define X and y
    2. X = df_reduced[["leukocyte_count", "blood_pressure"]].values
    3. y = df_reduced["tumor_class"].values
    4. print ("X: ", np.shape(X))
    5. print ("y: ", np.shape(y))

     Output:

    X:  (720, 2)
    y:  (720,)
    1. # Plot data
    2. colors = {"benign": "red", "malignant": "blue"}
    3. plt.scatter(X[:, 0], X[:, 1], c=[colors[_y] for _y in y], s=25, edgecolors="k")
    4. plt.xlabel("leukocyte count")
    5. plt.ylabel("blood pressure")
    6. plt.legend(["malignant", "benign"], loc="upper right")
    7. plt.show()

     

    拆分数据

    1. # Create data splits
    2. X_train, X_val, X_test, y_train, y_val, y_test = train_val_test_split(
    3. X=X, y=y, train_size=TRAIN_SIZE)
    4. print (f"X_train: {X_train.shape}, y_train: {y_train.shape}")
    5. print (f"X_val: {X_val.shape}, y_val: {y_val.shape}")
    6. print (f"X_test: {X_test.shape}, y_test: {y_test.shape}")
    7. print (f"Sample point: {X_train[0]}{y_train[0]}")

     Output:

    X_train: (503, 2), y_train: (503,)
    X_val: (108, 2), y_val: (108,)
    X_test: (109, 2), y_test: (109,)
    Sample point: [19.66235758 15.65939541] → benign

    标签编码

    1. # Encode class labels
    2. label_encoder = LabelEncoder()
    3. label_encoder = label_encoder.fit(y_train)
    4. num_classes = len(label_encoder.classes_)
    5. y_train = label_encoder.transform(y_train)
    6. y_val = label_encoder.transform(y_val)
    7. y_test = label_encoder.transform(y_test)
    1. # Class weights
    2. counts = np.bincount(y_train)
    3. class_weights = {i: 1.0/count for i, count in enumerate(counts)}
    4. print (f"counts: {counts}\nweights: {class_weights}")

     Output:

    counts: [272 231]
    weights: {0: 0.003676470588235294, 1: 0.004329004329004329}

    标准化数据

    1. # Standardize inputs using training data
    2. X_scaler = StandardScaler().fit(X_train)
    3. X_train = X_scaler.transform(X_train)
    4. X_val = X_scaler.transform(X_val)
    5. X_test = X_scaler.transform(X_test)

    模型

    1. # Initialize model
    2. model = MLP(input_dim=INPUT_DIM, hidden_dim=HIDDEN_DIM, num_classes=NUM_CLASSES)

    训练

    1. # Define Loss
    2. class_weights_tensor = torch.Tensor(list(class_weights.values()))
    3. loss_fn = nn.CrossEntropyLoss(weight=class_weights_tensor)
    1. # Optimizer
    2. optimizer = Adam(model.parameters(), lr=LEARNING_RATE)
    1. # Convert data to tensors
    2. X_train = torch.Tensor(X_train)
    3. y_train = torch.LongTensor(y_train)
    4. X_val = torch.Tensor(X_val)
    5. y_val = torch.LongTensor(y_val)
    6. X_test = torch.Tensor(X_test)
    7. y_test = torch.LongTensor(y_test)
    1. # Training
    2. for epoch in range(NUM_EPOCHS*10):
    3. # Forward pass
    4. y_pred = model(X_train)
    5. # Loss
    6. loss = loss_fn(y_pred, y_train)
    7. # Zero all gradients
    8. optimizer.zero_grad()
    9. # Backward pass
    10. loss.backward()
    11. # Update weights
    12. optimizer.step()
    13. if epoch%10==0:
    14. predictions = y_pred.max(dim=1)[1] # class
    15. accuracy = accuracy_fn(y_pred=predictions, y_true=y_train)
    16. print (f"Epoch: {epoch} | loss: {loss:.2f}, accuracy: {accuracy:.1f}")

     Output:

    Epoch: 0 | loss: 0.68, accuracy: 69.8
    Epoch: 10 | loss: 0.53, accuracy: 99.6
    Epoch: 20 | loss: 0.42, accuracy: 99.6
    Epoch: 30 | loss: 0.33, accuracy: 99.6
    Epoch: 40 | loss: 0.27, accuracy: 99.8

    评估

    1. # Predictions
    2. y_prob = F.softmax(model(X_test), dim=1)
    3. y_pred = y_prob.max(dim=1)[1]
    1. # # Performance
    2. performance = get_metrics(y_true=y_test, y_pred=y_pred, classes=classes)
    3. print (json.dumps(performance, indent=2))

     Output:

    {
      "overall": {
        "precision": 1.0,
        "recall": 1.0,
        "f1": 1.0,
        "num_samples": 109.0
      },
      "class": {
        "benign": {
          "precision": 1.0,
          "recall": 1.0,
          "f1": 1.0,
          "num_samples": 59.0
        },
        "malignant": {
          "precision": 1.0,
          "recall": 1.0,
          "f1": 1.0,
          "num_samples": 50.0
        }
      }
    }

    推理

    现在让我们看看之前的相同推理点现在如何在缩减数据集上训练的模型上执行。

    1. # Visualize the decision boundary
    2. plt.figure(figsize=(8,5))
    3. plt.title("Test")
    4. plot_multiclass_decision_boundary(model=model, X=X_test, y=y_test)
    5. # Sample point near the decision boundary (same point as before)
    6. plt.scatter(mean_leukocyte_count+0.05, mean_blood_pressure-0.05, s=200,
    7. c="b", edgecolor="w", linewidth=2)
    8. # Annotate
    9. plt.annotate("true: malignant,\npred: benign",
    10. color="white",
    11. xy=(mean_leukocyte_count, mean_blood_pressure),
    12. xytext=(0.45, 0.60),
    13. textcoords="figure fraction",
    14. fontsize=16,
    15. arrowprops=dict(facecolor="white", shrink=0.1))
    16. plt.show()

    这是一个非常脆弱但非常现实的场景。基于我们减少的合成数据集,我们已经实现了一个在测试数据上泛化得非常好的模型。但是,当我们要求对之前测试的同一点(我们知道是恶性的)进行预测时,现在的预测是良性肿瘤。我们会完全错过肿瘤。为了缓解这种情况,我们可以:

    1. 获取更多关于我们关注的空间的数据
    2. 在接近决策边界时谨慎使用预测

    要点总结

    模型不是水晶球。因此,重要的是,在任何机器学习之前,我们要真正查看我们的数据并问自己它是否真正代表了我们想要解决的任务。该模型本身可能非常适合您的数据并且可以很好地概括您的数据,但如果数据质量较差,则该模型不可信。

    一旦你确信你的数据质量很好,你终于可以开始考虑建模了。您选择的模型类型取决于许多因素,包括任务、数据类型、所需的复杂性等。

    所以一旦你弄清楚你的任务需要什么类型的模型,从简单的模型开始,然后慢慢增加复杂性。您不想立即开始使用神经网络,因为这可能不是您的数据和任务的正确模型。在模型复杂性中取得平衡是数据科学家的关键任务之一。简单模型 → 复杂模型

  • 相关阅读:
    北大青鸟计算机学校好就业吗?
    Sentinel持久限流化化规则到Nacos
    TCP & UDP
    社区团购新玩法,生鲜蔬菜配货发货小程序商城
    Java进阶学习路线图
    Hadoop的概述与安装
    UE5开发游戏Tutorial
    RF电路设计常见bug及解决方法
    网络安全系列-四十三:使用Suricata分析恶意流量pcap文件
    LabVIEW在无线设备中的应用
  • 原文地址:https://blog.csdn.net/sikh_0529/article/details/126772952