Hung-yi Lee 2023 Machine Learning Homework 1 -- homework1


1. Preparation

Download the training and test data

# dropbox link
!wget -O covid_train.csv https://www.dropbox.com/s/lmy1riadzoy0ahw/covid.train.csv?dl=0
!wget -O covid_test.csv https://www.dropbox.com/s/zalbw42lu4nmhr2/covid.test.csv?dl=0

Import packages

# Numerical Operations
import math
import numpy as np  # numpy for array operations: insert, delete, search, modify
# Reading/Writing Data
import pandas as pd  # pandas for reading csv files
import os  # for directory operations
import csv
# For Progress Bar
from tqdm import tqdm  # progress visualization
# Pytorch
import torch  # pytorch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
# For plotting learning curve
from torch.utils.tensorboard import SummaryWriter

Define some utility functions

def same_seed(seed):
    '''Fixes random number generator seeds for reproducibility.'''
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

# Split the training data into a training set and a validation set
def train_valid_split(data_set, valid_ratio, seed):
    '''Split provided training data into training set and validation set'''
    valid_set_size = int(valid_ratio * len(data_set))
    train_set_size = len(data_set) - valid_set_size
    train_set, valid_set = random_split(data_set, [train_set_size, valid_set_size], generator=torch.Generator().manual_seed(seed))
    return np.array(train_set), np.array(valid_set)
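As a worked example of the split: with the 3009-row training file and the valid_ratio of 0.2 configured below, valid_set_size = int(0.2 * 3009) = 601 rows and train_set_size = 3009 - 601 = 2408 rows.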

Configuration

device = 'cuda' if torch.cuda.is_available() else 'cpu'
config = {
    'seed': 5201314,       # Your seed number, you can pick your lucky number. :)
    'select_all': False,   # Whether to use all features.
    'valid_ratio': 0.2,    # validation_size = train_size * valid_ratio
    'n_epochs': 5000,      # Number of epochs.
    'batch_size': 256,
    'learning_rate': 1e-5,
    'early_stop': 600,     # If model has not improved for this many consecutive epochs, stop training.
    'save_path': './models/model.ckpt'  # Your model will be saved here.
}

2. Creating the Data

Create the Dataset

class COVID19Dataset(Dataset):
    '''
    x: Features.
    y: Targets, if none, do prediction.
    '''
    def __init__(self, x, y=None):
        if y is None:
            self.y = y
        else:
            self.y = torch.FloatTensor(y)
        self.x = torch.FloatTensor(x)

    def __getitem__(self, idx):
        if self.y is None:
            return self.x[idx]
        else:
            return self.x[idx], self.y[idx]

    def __len__(self):
        return len(self.x)
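A quick way to sanity-check this wrapper is to feed it a few rows of dummy data (illustrative random values, not the real CSV; it reuses the numpy/torch imports above):

# Illustrative sanity check with random data, not the actual covid_train.csv
dummy_x = np.random.rand(4, 88).astype(np.float32)
dummy_y = np.random.rand(4).astype(np.float32)
ds = COVID19Dataset(dummy_x, dummy_y)
print(len(ds))         # 4
print(ds[0][0].shape)  # torch.Size([88])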

Feature selection

The belief and mental features are removed. Both describe psychological/mental state, and they intuitively seem only weakly related to the positive test rate, so these two groups of features are dropped (a short sketch for locating their column indices follows the code below).

def select_feat(train_data, valid_data, test_data, select_all=True):
    '''Selects useful features to perform regression'''
    # [:,-1] takes all rows; -1 in the second dimension is the last element, i.e. the label
    y_train, y_valid = train_data[:,-1], valid_data[:,-1]  # select the label column
    # [:,:-1] takes all rows; the second dimension takes everything up to (but not including) the last element
    raw_x_train, raw_x_valid, raw_x_test = train_data[:,:-1], valid_data[:,:-1], test_data
    if select_all:
        feat_idx = list(range(raw_x_train.shape[1]))
    else:
        # feat_idx = list(range(35, raw_x_train.shape[1])) # TODO: Select suitable feature columns.
        """Remove the belief and mental features.
        [0, 38, 39, 46, 51, 56, 57, 64, 69, 74, 75, 82, 87] are the columns where they live.
        """
        del_col = [0, 38, 39, 46, 51, 56, 57, 64, 69, 74, 75, 82, 87]
        raw_x_train = np.delete(raw_x_train, del_col, axis=1)  # np.delete removes the given columns
        raw_x_valid = np.delete(raw_x_valid, del_col, axis=1)
        raw_x_test = np.delete(raw_x_test, del_col, axis=1)
        return raw_x_train, raw_x_valid, raw_x_test, y_train, y_valid
    return raw_x_train[:,feat_idx], raw_x_valid[:,feat_idx], raw_x_test[:,feat_idx], y_train, y_valid
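The column indices above were picked by hand. Here is a minimal sketch of how one might recover them from the CSV header, assuming the belief/mental-related columns can be identified by substrings such as 'belief' or 'mental' in their names; verify against the actual header of covid_train.csv before relying on it:

# Hypothetical helper: list the indices of columns whose names contain the given
# keywords. The 'belief'/'mental' substrings are an assumption about the header
# naming; inspect pd.read_csv('./covid_train.csv', nrows=0).columns first.
def find_columns(csv_path, keywords=('belief', 'mental')):
    header = pd.read_csv(csv_path, nrows=0).columns
    return [i for i, name in enumerate(header)
            if any(k in name.lower() for k in keywords)]

print(find_columns('./covid_train.csv'))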

Create the DataLoader

Read the files and set up the training, validation, and test datasets.

# Set seed for reproducibility
same_seed(config['seed'])

# train_data size: 3009 x 89 (35 states + 18 features x 3 days)
# train_data has 3009 rows, each with 89 columns
# test_data size: 997 x 88 (without last day's positive rate)
# test_data has 997 rows, each with 88 columns; the last day's positive rate column is missing
# read the csv data with pandas
train_data, test_data = pd.read_csv('./covid_train.csv').values, pd.read_csv('./covid_test.csv').values

# split the training set and validation set with train_valid_split
train_data, valid_data = train_valid_split(train_data, config['valid_ratio'], config['seed'])

# Print out the data size.
print(f"""train_data size: {train_data.shape}
valid_data size: {valid_data.shape}
test_data size: {test_data.shape}""")

# Select features
x_train, x_valid, x_test, y_train, y_valid = select_feat(train_data, valid_data, test_data, config['select_all'])

# Print out the number of features.
print(f'number of features: {x_train.shape[1]}')

# Build the datasets
train_dataset, valid_dataset, test_dataset = COVID19Dataset(x_train, y_train), \
                                             COVID19Dataset(x_valid, y_valid), \
                                             COVID19Dataset(x_test)

# Pytorch data loader loads pytorch dataset into batches.
train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True, pin_memory=True)
valid_loader = DataLoader(valid_dataset, batch_size=config['batch_size'], shuffle=True, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=config['batch_size'], shuffle=False, pin_memory=True)
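This check is not part of the original notebook, but it is a quick way to confirm that the loaders produce the expected shapes:

# Illustrative check: one training batch should be (batch_size, n_features) and (batch_size,)
xb, yb = next(iter(train_loader))
print(xb.shape, yb.shape)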

3. Building the Neural Network Model

class My_Model(nn.Module):
    def __init__(self, input_dim):
        super(My_Model, self).__init__()
        # TODO: modify model's structure, be aware of dimensions.
        self.layers = nn.Sequential(
            nn.Linear(input_dim, 16),
            nn.ReLU(),
            nn.Linear(16, 8),
            nn.ReLU(),
            nn.Linear(8, 1)
        )

    def forward(self, x):
        x = self.layers(x)
        x = x.squeeze(1)  # (B, 1) -> (B)
        return x
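The TODO invites changing the structure. One possible variation keeps the same interface but uses wider hidden layers; the layer sizes below are illustrative, not tuned for the assignment:

# Hedged sketch of an alternative architecture; the widths 64/32 are illustrative
class WiderModel(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
        )

    def forward(self, x):
        return self.layers(x).squeeze(1)  # (B, 1) -> (B)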

4. Model Training and Testing

Model training

def trainer(train_loader, valid_loader, model, config, device):
    criterion = nn.MSELoss(reduction='mean')  # Define your loss function, do not modify this.

    # Define your optimization algorithm.
    # TODO: Please check https://pytorch.org/docs/stable/optim.html to get more available algorithms.
    # TODO: L2 regularization (optimizer(weight decay...) or implement by your self).
    optimizer = torch.optim.SGD(model.parameters(), lr=config['learning_rate'], momentum=0.9)

    writer = SummaryWriter()  # Writer of tensorboard.

    # If there is no models folder, create one to save the models in
    if not os.path.isdir('./models'):
        os.mkdir('./models')  # Create directory of saving models.

    # math.inf is positive infinity
    n_epochs, best_loss, step, early_stop_count = config['n_epochs'], math.inf, 0, 0

    for epoch in range(n_epochs):
        model.train()      # Set your model to train mode.
        loss_record = []   # record the losses

        # tqdm is a package to visualize your training progress.
        train_pbar = tqdm(train_loader, position=0, leave=True)

        for x, y in train_pbar:
            optimizer.zero_grad()              # Set gradient to zero.
            x, y = x.to(device), y.to(device)  # Move your data to device.
            pred = model(x)                    # feed the data to the model to get predictions
            loss = criterion(pred, y)          # compute the loss between pred and the ground truth y
            loss.backward()                    # Compute gradient (backpropagation).
            optimizer.step()                   # Update parameters.
            step += 1
            loss_record.append(loss.detach().item())  # append the current step's loss to loss_record

            # Display current epoch number and loss on tqdm progress bar.
            train_pbar.set_description(f'Epoch [{epoch+1}/{n_epochs}]')
            train_pbar.set_postfix({'loss': loss.detach().item()})

        mean_train_loss = sum(loss_record)/len(loss_record)  # mean loss on the training set
        writer.add_scalar('Loss/train', mean_train_loss, step)

        model.eval()  # Set your model to evaluation mode.
        loss_record = []
        for x, y in valid_loader:
            x, y = x.to(device), y.to(device)
            with torch.no_grad():
                pred = model(x)
                loss = criterion(pred, y)

            loss_record.append(loss.item())

        mean_valid_loss = sum(loss_record)/len(loss_record)  # mean loss on the validation set
        print(f'Epoch [{epoch+1}/{n_epochs}]: Train loss: {mean_train_loss:.4f}, Valid loss: {mean_valid_loss:.4f}')
        writer.add_scalar('Loss/valid', mean_valid_loss, step)

        # Save the model with the lowest mean validation loss
        if mean_valid_loss < best_loss:
            best_loss = mean_valid_loss
            torch.save(model.state_dict(), config['save_path'])  # Save your best model
            print('Saving model with loss {:.3f}...'.format(best_loss))
            early_stop_count = 0
        else:
            early_stop_count += 1

        # Early stopping: if the validation loss has not improved for
        # early_stop consecutive epochs, stop training
        if early_stop_count >= config['early_stop']:
            print('\nModel is not improving, so we halt the training session.')
            return
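For the L2-regularization TODO, a minimal option is the weight_decay argument built into torch.optim.SGD, which adds an L2 penalty on the parameters. The snippet below would replace the optimizer line inside trainer; the value 1e-4 is illustrative, not tuned:

# Sketch of the L2-regularization TODO via weight_decay; tune the value on the validation set
optimizer = torch.optim.SGD(model.parameters(),
                            lr=config['learning_rate'],
                            momentum=0.9,
                            weight_decay=1e-4)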

Model testing

# Generate predictions for the test set
def predict(test_loader, model, device):
    model.eval()  # Set your model to evaluation mode.
    preds = []
    for x in tqdm(test_loader):
        x = x.to(device)
        with torch.no_grad():  # disable gradient tracking
            pred = model(x)
            preds.append(pred.detach().cpu())
    preds = torch.cat(preds, dim=0).numpy()
    return preds


     

5. Training the Model

model = My_Model(input_dim=x_train.shape[1]).to(device)  # put your model and data on the same computation device.
trainer(train_loader, valid_loader, model, config, device)

6. Testing the Model and Generating Predictions

def save_pred(preds, file):
    ''' Save predictions to specified file '''
    with open(file, 'w') as fp:
        writer = csv.writer(fp)
        writer.writerow(['id', 'tested_positive'])
        for i, p in enumerate(preds):
            writer.writerow([i, p])

model = My_Model(input_dim=x_train.shape[1]).to(device)
model.load_state_dict(torch.load(config['save_path']))  # load the saved best model
preds = predict(test_loader, model, device)  # generate the predictions
save_pred(preds, 'pred.csv')  # save preds to pred.csv

Visualize the training and validation loss curves with TensorBoard

%reload_ext tensorboard
%tensorboard --logdir=./runs/

