• 【LSTM实战】股票走势预测全流程实战(stock predict)


    一、import packages|导入第三方库

    import pandas as pd
    import matplotlib.pyplot as plt
    import datetime
    import torch
    import torch.nn as nn
    import numpy as np
    from torch.utils.data import Dataset, DataLoader
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    # Load the IBM daily price table; the first CSV column becomes the index.
    df = pd.read_csv('../input/stock-predict/IBM_2006-01-01_to_2018-01-01.csv', index_col=0)
    # Parse the 'YYYY-MM-DD' index strings into datetime objects.
    df.index = [datetime.datetime.strptime(day, '%Y-%m-%d') for day in df.index]
    # Notebook preview of the first 20 rows.
    df.head(20)
    
    • 1
    • 2
    • 3
    • 4
    • 5
                 Open   High    Low  Close    Volume  Name
    2006-01-03  82.45  82.55  80.81  82.06  11715200  IBM
    2006-01-04  82.20  82.50  81.33  81.95   9840600  IBM
    2006-01-05  81.40  82.90  81.00  82.50   7213500  IBM
    2006-01-06  83.95  85.03  83.41  84.95   8197400  IBM
    2006-01-09  84.10  84.25  83.38  83.73   6858200  IBM
    2006-01-10  83.15  84.12  83.12  84.07   5701000  IBM
    2006-01-11  84.37  84.81  83.40  84.17   5776500  IBM
    2006-01-12  83.82  83.96  83.40  83.57   4926500  IBM
    2006-01-13  83.00  83.45  82.50  83.17   6921700  IBM
    2006-01-17  82.80  83.16  82.54  83.00   8761700  IBM
    2006-01-18  84.00  84.70  83.52  84.46  11032800  IBM
    2006-01-19  84.14  84.39  83.02  83.09   6484000  IBM
    2006-01-20  83.04  83.05  81.25  81.36   8614500  IBM
    2006-01-23  81.33  81.92  80.92  81.41   6114100  IBM
    2006-01-24  81.39  82.15  80.80  80.85   6069000  IBM
    2006-01-25  81.05  81.62  80.61  80.91   6374300  IBM
    2006-01-26  81.50  81.65  80.59  80.72   7810200  IBM
    2006-01-27  80.75  81.77  80.75  81.02   6103400  IBM
    2006-01-30  80.21  81.81  80.21  81.63   5325100  IBM
    2006-01-31  81.50  82.00  81.17  81.30   6771600  IBM

    从日期索引可以看出,数据只包含交易日:周六、周日休市没有记录,节假日同样缺失(例如 2006-01-13 之后直接是 2006-01-17)。

    # Number of rows in the dataset
    df.shape[0]
    
    • 1
    • 2
    • 3
    3020
    
    • 1

    二、data processing|数据处理

    def getData(df, column, train_end=-250, days_before=7, return_all=True, generate_index=False):
        """Build a sliding-window training table from one column of ``df``.

        Each row holds ``days_before`` consecutive values (columns ``c0`` ..
        ``c{days_before-1}``) followed by the next value as the label ``y``.
        Only ``df[column][:train_end]`` is used to build the table.

        Args:
            df: source DataFrame.
            column: name of the column to window.
            train_end: slice end that delimits the training part of the series.
            days_before: number of past values used as features per row.
            return_all: when true, also return the full series and the index list.
            generate_index: when true, index each row by the index entry of its
                label value.

        Returns:
            ``(train_data, series, df.index.tolist())`` when ``return_all`` is
            true, otherwise just ``train_data``.
        """
        series = df[column].copy()
        # Only the part before train_end is windowed into training rows.
        train_series = series[:train_end]
        # Convert once; every window below is a slice of this list.
        values = train_series.tolist()

        train_data = pd.DataFrame()
        # Column c<i> holds the value i steps into each window.
        for i in range(days_before):
            train_data['c%d' % i] = values[i: -days_before + i]
        # The label is the value that immediately follows each window.
        train_data['y'] = values[days_before:]

        if generate_index:
            # Row k is labelled by element k + days_before of the training
            # series, so it takes that element's index entry.
            train_data.index = train_series.index[days_before:]

        if return_all:
            return train_data, series, df.index.tolist()

        return train_data
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    # Dataset wrapper consumed by the training DataLoader
    class TrainSet(Dataset):
        """Split a 2-D tensor into features (all but last column) and label (last column)."""

        def __init__(self, data):
            features = data[:, :-1]
            targets = data[:, -1]
            # Both parts are stored as float tensors.
            self.data = features.float()
            self.label = targets.float()

        def __getitem__(self, index):
            sample = self.data[index]
            target = self.label[index]
            return sample, target

        def __len__(self):
            return self.data.shape[0]
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11

    三、build model|构建模型

    # build LSTM model
    class LSTM(nn.Module):
        """LSTM followed by a linear head that emits one value per sequence.

        Args:
            input_size: number of features per time step.
            hidden_size: width of the LSTM hidden state (and of the head's input).
            num_layers: number of stacked LSTM layers.

        The defaults reproduce the original fixed architecture, so ``LSTM()``
        builds the same model as before.
        """

        def __init__(self, input_size=1, hidden_size=64, num_layers=1):
            super().__init__()

            self.lstm = nn.LSTM(
                input_size=input_size,
                hidden_size=hidden_size,
                num_layers=num_layers,
                batch_first=True)

            # Kept inside nn.Sequential so parameter names stay 'out.0.*'.
            self.out = nn.Sequential(
                nn.Linear(hidden_size, 1))

        def forward(self, x):
            """Map ``x`` of shape (batch, steps, input_size) to (batch, 1)."""
            # The final hidden/cell states are not needed; only the outputs are.
            r_out, _ = self.lstm(x)
            # Feed the output at the last time step to the linear head.
            return self.out(r_out[:, -1, :])
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19

    四、train model|模型训练

    # Build the sliding-window training table from the 'High' column.
    train_data, all_series, df_index = getData(df, 'High')

    # Keep the complete series as a NumPy array.
    all_series = np.asarray(all_series.tolist())
    # Plot the raw series.
    plt.figure(figsize=(12, 8))
    plt.plot(df_index, all_series, label='real-data')

    # Standardise with one mean/std computed over the whole training table
    # (feature columns and label column together).
    train_data_numpy = train_data.to_numpy()
    train_mean = train_data_numpy.mean()
    train_std = train_data_numpy.std()
    train_data_numpy = (train_data_numpy - train_mean) / train_std
    train_data_tensor = torch.from_numpy(train_data_numpy).float()

    # Wrap the tensor in a Dataset and create the shuffled mini-batch loader.
    train_set = TrainSet(train_data_tensor)
    train_loader = DataLoader(dataset=train_set, batch_size=10, shuffle=True)
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19

    在这里插入图片描述

    4.1 train model from zero|从头开始训练模型

    rnn = LSTM()

    # Choose the device once instead of querying CUDA for every batch.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    rnn = rnn.to(device)

    # Optimizer and loss function
    optimizer = torch.optim.Adam(rnn.parameters(), lr=0.0001)
    loss_func = nn.MSELoss()

    for step in range(100):
        for tx, ty in train_loader:
            tx = tx.to(device)
            ty = ty.to(device)

            # Append a trailing feature axis: the LSTM is built with
            # batch_first=True and input_size=1.
            output = rnn(tx.unsqueeze(dim=2))
            # Squeeze only the feature axis so a batch holding a single sample
            # keeps its batch dimension and still matches ty's shape.
            loss = loss_func(output.squeeze(dim=1), ty)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        # Report the loss of the last batch every 10 epochs.
        if step % 10 == 0:
            print(step, loss.cpu())
    # Persist the whole model object (not just its state_dict).
    torch.save(rnn, 'model.pkl')
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    0 tensor(0.0756, grad_fn=)
    10 tensor(0.0087, grad_fn=)
    20 tensor(0.0024, grad_fn=)
    30 tensor(0.0042, grad_fn=)
    40 tensor(0.0078, grad_fn=)
    50 tensor(0.0057, grad_fn=)
    60 tensor(0.0001, grad_fn=)
    70 tensor(0.0077, grad_fn=)
    80 tensor(0.0027, grad_fn=)
    90 tensor(0.0015, grad_fn=)
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10

    4.2 load model|加载训练好的模型

    # Restore the model object written by torch.save(rnn, 'model.pkl').
    # map_location lets a model saved on GPU be loaded on a CPU-only machine.
    # weights_only=False is required because the file holds a fully pickled
    # module rather than a state_dict.
    # NOTE: unpickling can execute arbitrary code - only load files you
    # created yourself.
    map_location = 'cuda' if torch.cuda.is_available() else 'cpu'
    rnn = torch.load('model.pkl', map_location=map_location, weights_only=False)
    # Switch to inference mode.
    rnn.eval()
    
    • 1
    • 2
    • 3
    generate_data_train = []
    generate_data_test = []

    # Number of past values fed to the model, and size of the held-out tail.
    window = 7
    test_size = 250

    # Index at which the test portion starts
    test_start = len(all_series) - test_size

    # Apply the training normalisation to the whole series
    all_series = (all_series - train_mean) / train_std
    all_series = torch.Tensor(all_series)

    use_cuda = torch.cuda.is_available()

    # Inference only: disable autograd so no graph is built per prediction.
    with torch.no_grad():
        for i in range(window, len(all_series)):
            # Reshape the window to (batch=1, timesteps=window, input_size=1).
            x = all_series[i - window:i].reshape(1, window, 1)

            if use_cuda:
                x = x.cuda()

            # Undo the normalisation so predictions are on the original scale.
            prediction = rnn(x).cpu().squeeze().numpy() * train_std + train_mean

            if i < test_start:
                generate_data_train.append(prediction)
            else:
                generate_data_test.append(prediction)

    plt.figure(figsize=(12, 8))
    plt.plot(df_index[window:test_start], generate_data_train, 'b', label='generate_train')
    plt.plot(df_index[test_start:], generate_data_test, 'k', label='generate_test')
    plt.plot(df_index, all_series.clone().numpy() * train_std + train_mean, 'r', label='real_data')
    plt.legend()
    plt.show()
    
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29
    • 30
    • 31
    • 32

    在这里插入图片描述

    五、test model|测试模型

    DAYS_BEFORE = 7
    TRAIN_END = -250

    # De-normalised copy of the full series, shared by both panels.
    real_values = all_series.clone().numpy() * train_std + train_mean
    # Positions shown in each panel.
    train_window = slice(100 + DAYS_BEFORE, 130 + DAYS_BEFORE)
    test_window = slice(TRAIN_END + 5, TRAIN_END + 230)

    plt.figure(figsize=(10, 16))

    # Top panel: a slice of the training-range predictions against the real values.
    plt.subplot(2, 1, 1)
    plt.plot(df_index[train_window], generate_data_train[100:130], 'b', label='generate_train')
    plt.plot(df_index[train_window], real_values[train_window], 'r', label='real_data')
    plt.legend()

    # Bottom panel: a slice of the test-range predictions against the real values.
    plt.subplot(2, 1, 2)
    plt.plot(df_index[test_window], generate_data_test[5:230], 'k', label='generate_test')
    plt.plot(df_index[test_window], real_values[test_window], 'r', label='real_data')
    plt.legend()
    plt.show()
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15

    在这里插入图片描述

    第一张图表示训练的模型在train集上的表现,第二张图表示在test上预测的表现。

  • 相关阅读:
    面试: Hashtable vs ConcurrentHashMap
    1671 得到山行数组的最少删除次数(贪心+二分)
    手机怎么压缩图片?通过三种压缩操作
    计算机专业英语
    Ansible - templates实战
    【蓝桥杯真题练习】STEMA科技素养练习题库 答案版014 持续更新中~
    华为云会议,轻松实现远程智能办公
    RabbitMQ复习笔记
    硬件工程师年薪26万美元、软件工程师30万,在谷歌打工“香”吗?
    Python进阶教学——装饰器与闭包
  • 原文地址:https://blog.csdn.net/u014297502/article/details/127929118