• Kaggle房价预测—模型改进与超参数对比实验


    尝试在李沐大神的一层线性模型上进行模型改进和对比实验,结果笑死我了,只能说太上老君本人了。
    先上代码,后上乐子,哦不,结果。
    实验代码:

    第一部分,数据集下载部分:

    import hashlib
    import os
    import tarfile
    import zipfile
    import requests
    
    #@save
    # Registry of downloadable datasets: name -> (url, sha1_hash).
    DATA_HUB = {}
    # Common URL prefix of the hosted dataset files.
    DATA_URL = 'http://d2l-data.s3-accelerate.amazonaws.com/'
    
    def download(name, cache_dir=os.path.join('..', 'data')):  #@save
        """Download a file registered in DATA_HUB and return its local path.

        A cached copy is reused when its SHA-1 digest matches the registered
        one; otherwise the file is (re-)downloaded.

        Args:
            name: Key into DATA_HUB; the entry is a ``(url, sha1_hash)`` pair.
            cache_dir: Directory the file is stored in (created if missing).

        Returns:
            Path of the local file inside ``cache_dir``.

        Raises:
            AssertionError: If ``name`` is not registered in DATA_HUB.
            requests.HTTPError: If the server answers with an error status.
        """
        assert name in DATA_HUB, f"{name} 不存在于 {DATA_HUB}"
        url, sha1_hash = DATA_HUB[name]
        os.makedirs(cache_dir, exist_ok=True)
        fname = os.path.join(cache_dir, url.split('/')[-1])
        if os.path.exists(fname):
            # Hash the cached copy in 1 MiB chunks so it is never loaded into
            # memory in one piece.
            sha1 = hashlib.sha1()
            with open(fname, 'rb') as f:
                for data in iter(lambda: f.read(1048576), b''):
                    sha1.update(data)
            if sha1.hexdigest() == sha1_hash:
                return fname  # cache hit
        print(f'正在从{url}下载{fname}...')
        with requests.get(url, stream=True, verify=True) as r:
            # Fail before touching the local file: otherwise an HTTP error
            # page would be saved and returned as if it were the dataset.
            r.raise_for_status()
            with open(fname, 'wb') as f:
                # Actually stream the body; reading r.content would buffer
                # the whole response in memory despite stream=True.
                for chunk in r.iter_content(chunk_size=1048576):
                    f.write(chunk)
        return fname
    
    def download_extract(name, folder=None):  #@save
        """Download a zip/tar archive from DATA_HUB and unpack it in place.

        The archive is extracted into the directory that holds the downloaded
        file.

        Args:
            name: Key into DATA_HUB identifying the archive.
            folder: Optional sub-directory name, relative to the extraction
                directory, to return instead of the default.

        Returns:
            ``<extraction dir>/<folder>`` when ``folder`` is given, otherwise
            the archive path with its last extension stripped.

        Raises:
            AssertionError: If the file extension is not .zip, .tar or .gz.
        """
        fname = download(name)
        base_dir = os.path.dirname(fname)
        data_dir, ext = os.path.splitext(fname)
        if ext == '.zip':
            open_archive = zipfile.ZipFile
        elif ext in ('.tar', '.gz'):
            open_archive = tarfile.open
        else:
            # Raised explicitly (not via `assert`) so the check survives
            # `python -O`; the exception type is the same as before.
            raise AssertionError('只有zip/tar文件可以被解压缩')
        # The context manager closes the archive handle even if extraction
        # fails part-way.
        # NOTE(review): extractall() trusts the member paths stored in the
        # archive; confirm that every DATA_HUB source is trusted.
        with open_archive(fname, 'r') as fp:
            fp.extractall(base_dir)
        return os.path.join(base_dir, folder) if folder else data_dir
    
    def download_all():  #@save
        """Fetch every file that is currently registered in DATA_HUB."""
        for dataset_name in DATA_HUB.keys():
            download(dataset_name)
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29
    • 30
    • 31
    • 32
    • 33
    • 34
    • 35
    • 36
    • 37
    • 38
    • 39
    • 40
    • 41
    • 42
    • 43
    • 44
    • 45
    • 46
    • 47
    • 48
    • 49
    • 50

    第二部分,正菜:

    %matplotlib inline
    import numpy as np
    import pandas as pd
    import torch
    from torch import nn
    from d2l import torch as d2l
    
    # Register the two Kaggle house-price CSV files together with the SHA-1
    # digests used to validate the cached copies.
    DATA_HUB['kaggle_house_train'] = (  #@save
        f'{DATA_URL}kaggle_house_pred_train.csv',
        '585e9cc93e70b39160e7921475f9bcd7d31219ce')

    DATA_HUB['kaggle_house_test'] = (  #@save
        f'{DATA_URL}kaggle_house_pred_test.csv',
        'fa19780a7b011d9b009e8bff8e99922a8ee2eb90')

    # Download each file (or reuse the cached copy) and load it as a DataFrame
    train_csv_path = download('kaggle_house_train')
    train_data = pd.read_csv(train_csv_path)
    test_csv_path = download('kaggle_house_test')
    test_data = pd.read_csv(test_csv_path)

    print(train_data.shape)
    print(test_data.shape)

    # Preview the first four rows: first four and last three columns
    preview_columns = [0, 1, 2, 3, -3, -2, -1]
    print(train_data.iloc[0:4, preview_columns])
    
    # Drop the uninformative Id column (and, for the training set, the trailing
    # label column), then stack training and test rows so both are
    # preprocessed identically.
    all_features = pd.concat((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]))

    # Standardise every non-object (numeric) column: subtract the mean and
    # divide by the standard deviation.
    numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index
    all_features[numeric_features] = all_features[numeric_features].apply(
        lambda x: (x - x.mean()) / (x.std()))

    # After standardisation each column has mean 0, so filling missing values
    # with 0 is the same as filling them with the column mean.
    all_features[numeric_features] = all_features[numeric_features].fillna(0)

    # One-hot encode the remaining columns; dummy_na=True gives missing values
    # their own indicator column. dtype=float keeps the frame purely numeric:
    # pandas >= 2.0 would otherwise emit bool indicators, `.values` would then
    # be an object array and torch.tensor() below would reject it.
    all_features = pd.get_dummies(all_features, dummy_na=True, dtype=float)
    all_features.shape

    # The first n_train rows are the training samples, the rest the test
    # samples; SalePrice is the regression label.
    n_train = train_data.shape[0]
    train_features = torch.tensor(all_features[ :n_train].values, dtype=torch.float32)
    test_features = torch.tensor(all_features[n_train: ].values, dtype=torch.float32)
    train_labels = torch.tensor(train_data.SalePrice.values.reshape(-1, 1),
                                dtype=torch.float32)
    
    print(train_features.shape[1])
    # Rule of thumb for the hidden-layer width:
    #     Nh = Ns / (a * (Ni + No))
    # Ns: number of training samples, Ni: input-layer width, No: output-layer
    # width, a: scaling factor chosen from the range (2, 10).
    # Ns and Ni are read from the data (they were hard-coded as 1460 and 331),
    # so the estimate stays correct if the preprocessing changes.
    a, No = 2.0, 1.0
    Ns = float(train_features.shape[0])
    Ni = float(train_features.shape[1])
    Nh = Ns / (a * (Ni + No))
    print(Nh)

    # MSE is used because this is a regression task; cross-entropy
    # (nn.CrossEntropyLoss) is meant for classification.
    loss = nn.MSELoss()
    in_features = train_features.shape[1]
    
    # Li Mu's reference solution uses a single linear layer as the baseline;
    # this variant puts two hidden layers (8 and 4 units) in front of the output.
    def get_net():
        """Build a new regression network.

        Returns:
            nn.Sequential: Linear(in_features, 8) -> ReLU -> Linear(8, 4)
            -> ReLU -> Linear(4, 1).
        """
        layer_widths = [in_features, 8, 4, 1]
        layers = []
        for fan_in, fan_out in zip(layer_widths[:-1], layer_widths[1:]):
            if layers:
                # Non-linearity between consecutive linear layers only
                layers.append(nn.ReLU())
            layers.append(nn.Linear(fan_in, fan_out))
        return nn.Sequential(*layers)
    
    def log_rmse(net, features, labels):
        """Return the RMSE between log-predictions and log-labels.

        Args:
            net: Model called as ``net(features)``.
            features: Input tensor passed to ``net``.
            labels: Target tensor; its logarithm is taken.

        Returns:
            The metric as a Python float.
        """
        # Pure evaluation: the result leaves as a float, so recording an
        # autograd graph for this forward pass would only waste memory.
        with torch.no_grad():
            # Raise predictions below 1 to 1 so the logarithm stays stable
            clipped_preds = torch.clamp(net(features), 1, float('inf'))
            rmse = torch.sqrt(loss(torch.log(clipped_preds),
                                   torch.log(labels)))
        return rmse.item()
    
    # Training routine
    def train(net, train_features, train_labels, test_features, test_labels,
              num_epochs, learning_rate, weight_decay, batch_size):
        """Fit ``net`` with Adam and record log-RMSE after every epoch.

        Args:
            net: Model to optimise in place.
            train_features, train_labels: Training tensors.
            test_features, test_labels: Evaluation tensors; evaluation is
                skipped when ``test_labels`` is None.
            num_epochs: Number of passes over the training data.
            learning_rate, weight_decay: Adam hyper-parameters.
            batch_size: Mini-batch size handed to ``d2l.load_array``.

        Returns:
            ``(train_ls, test_ls)``: per-epoch log-RMSE lists; ``test_ls`` is
            empty when no evaluation labels were given.
        """
        epoch_train_scores = []
        epoch_test_scores = []
        train_iter = d2l.load_array((train_features, train_labels), batch_size)
        # Adam optimiser
        optimizer = torch.optim.Adam(net.parameters(),
                                     lr=learning_rate,
                                     weight_decay=weight_decay)
        evaluate_on_test = test_labels is not None
        for _ in range(num_epochs):
            for X, y in train_iter:
                optimizer.zero_grad()
                batch_loss = loss(net(X), y)
                batch_loss.backward()
                optimizer.step()
            epoch_train_scores.append(
                log_rmse(net, train_features, train_labels))
            if evaluate_on_test:
                epoch_test_scores.append(
                    log_rmse(net, test_features, test_labels))
        return epoch_train_scores, epoch_test_scores
    
    def get_k_fold_data(k, i, X, y):
        """Split ``X``/``y`` into ``k`` equal folds and hold out fold ``i``.

        Each fold has ``X.shape[0] // k`` rows; rows beyond ``k`` full folds
        are not used.

        Args:
            k: Number of folds (must be greater than 1).
            i: Index of the fold used for validation.
            X: Feature tensor indexed as ``X[rows, :]``.
            y: Label tensor indexed as ``y[rows]``.

        Returns:
            ``(X_train, y_train, X_valid, y_valid)``.
        """
        assert k > 1
        fold_size = X.shape[0] // k

        def _merge(parts):
            # A single part is returned as-is; several are concatenated
            # along the row axis.
            return parts[0] if len(parts) == 1 else torch.cat(parts, 0)

        train_X_parts, train_y_parts = [], []
        for fold in range(k):
            start = fold * fold_size
            rows = slice(start, start + fold_size)
            if fold == i:
                X_valid, y_valid = X[rows, :], y[rows]
            else:
                train_X_parts.append(X[rows, :])
                train_y_parts.append(y[rows])
        X_train = _merge(train_X_parts)
        y_train = _merge(train_y_parts)
        return X_train, y_train, X_valid, y_valid
    
    # K-fold cross-validation
    def k_fold(k, X_train, y_train, num_epochs, learning_rate, weight_decay,
               batch_size):
        """Run k-fold cross-validation and average the final-epoch scores.

        A new network is trained for every fold. The learning curves of the
        first fold are plotted and each fold's final scores are printed.

        Returns:
            ``(mean_train_log_rmse, mean_valid_log_rmse)`` over the ``k`` folds.
        """
        final_train_scores, final_valid_scores = [], []
        for fold in range(k):
            fold_data = get_k_fold_data(k, fold, X_train, y_train)
            net = get_net()
            train_ls, valid_ls = train(net, *fold_data, num_epochs,
                                       learning_rate, weight_decay, batch_size)
            final_train_scores.append(train_ls[-1])
            final_valid_scores.append(valid_ls[-1])
            if fold == 0:
                # Only the first fold's curves are drawn
                epochs = list(range(1, num_epochs + 1))
                d2l.plot(epochs, [train_ls, valid_ls],
                         xlabel='epoch', ylabel='rmse', xlim=[1, num_epochs],
                         legend=['train', 'valid'], yscale='log')
            print(f'折{fold + 1},训练log rmse{float(train_ls[-1]):f}, '
                  f'验证log rmse{float(valid_ls[-1]):f}')
        return sum(final_train_scores) / k, sum(final_valid_scores) / k
    
    # Hyper-parameters for the cross-validation run
    k = 5
    num_epochs = 100
    # NOTE(review): lr=5 looks very large for Adam on a multi-layer network
    # and may explain unstable curves; confirm it is intentional.
    lr = 5
    weight_decay = 0
    batch_size = 64
    train_l, valid_l = k_fold(k, train_features, train_labels, num_epochs, lr,
                              weight_decay, batch_size)
    print(f'{k}-折验证: 平均训练log rmse: {float(train_l):f}, '
          f'平均验证log rmse: {float(valid_l):f}')
    
    def train_and_pred(train_features, test_features, train_labels, test_data, num_epochs, lr,
                      weight_decay, batch_size):
        """Train on the given training set and write predictions into ``test_data``.

        Plots the training curve, prints the final training log-RMSE and stores
        the predictions for ``test_features`` in a new ``'SalePrice'`` column
        of ``test_data`` (modified in place). Returns None.
        """
        net = get_net()
        # No evaluation set here, so the second returned list is ignored
        train_ls, _ = train(net, train_features, train_labels, None, None,
                            num_epochs, lr, weight_decay, batch_size)
        epochs = np.arange(1, num_epochs + 1)
        d2l.plot(epochs, [train_ls], xlabel='epoch',
                 ylabel='log rmse', xlim=[1, num_epochs], yscale='log')
        print(f'训练log rmse:{float(train_ls[-1]):f}')
        # Apply the trained network to the test features
        preds = net(test_features).detach().numpy()
        # Flatten to one dimension so it fits a single column for the Kaggle export
        flat_preds = preds.reshape(1, -1)[0]
        test_data['SalePrice'] = pd.Series(flat_preds)
    
    # Final fit, reusing the cross-validation hyper-parameters
    train_and_pred(train_features=train_features,
                   test_features=test_features,
                   train_labels=train_labels,
                   test_data=test_data,
                   num_epochs=num_epochs,
                   lr=lr,
                   weight_decay=weight_decay,
                   batch_size=batch_size)

    # Export the Id and predicted SalePrice columns as the submission file
    submission_columns = [test_data['Id'], test_data['SalePrice']]
    submission = pd.concat(submission_columns, axis=1)
    submission.to_csv('submission.csv', index=False)
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29
    • 30
    • 31
    • 32
    • 33
    • 34
    • 35
    • 36
    • 37
    • 38
    • 39
    • 40
    • 41
    • 42
    • 43
    • 44
    • 45
    • 46
    • 47
    • 48
    • 49
    • 50
    • 51
    • 52
    • 53
    • 54
    • 55
    • 56
    • 57
    • 58
    • 59
    • 60
    • 61
    • 62
    • 63
    • 64
    • 65
    • 66
    • 67
    • 68
    • 69
    • 70
    • 71
    • 72
    • 73
    • 74
    • 75
    • 76
    • 77
    • 78
    • 79
    • 80
    • 81
    • 82
    • 83
    • 84
    • 85
    • 86
    • 87
    • 88
    • 89
    • 90
    • 91
    • 92
    • 93
    • 94
    • 95
    • 96
    • 97
    • 98
    • 99
    • 100
    • 101
    • 102
    • 103
    • 104
    • 105
    • 106
    • 107
    • 108
    • 109
    • 110
    • 111
    • 112
    • 113
    • 114
    • 115
    • 116
    • 117
    • 118
    • 119
    • 120
    • 121
    • 122
    • 123
    • 124
    • 125
    • 126
    • 127
    • 128
    • 129
    • 130
    • 131
    • 132
    • 133
    • 134
    • 135
    • 136
    • 137
    • 138
    • 139
    • 140
    • 141
    • 142
    • 143
    • 144
    • 145
    • 146
    • 147
    • 148
    • 149
    • 150
    • 151
    • 152
    • 153
    • 154

    模型实验部分:
    首先我寻思着一层太少了,至少得有一个隐藏层吧,因此加了一个含 2 个神经单元的隐藏层。具体的神经单元个数是依照某个大神的经验公式 $N_h = N_s / (a \cdot (N_i + N_o))$ 算出来的(代入代码中的数值,取 $a = 2$ 时约为 2.2)。后面多做了几次实验,发现这个公式真的有用:手动改了将近十次神经单元个数,效果最好的还是 2。接下来做出来的结果差点笑死我:
    在这里插入图片描述
    属于是悬崖峭壁了,跳下来直接摔死。
    接着我寻思着是不是层数少了捏,那么我再加一层!
    下面是2个隐藏层,第一层是10个单元,第二层是4个的结果:
    在这里插入图片描述
    哇,什么叫壮观啊(战术后仰),刀山火海了属于是。
    接着我寻思着是不是参数问题呢?我就开始了一顿乱调,该调的都调了,最后最好的结果是这个:
    2个隐藏层,第一层8个单元,第二层4个:
    在这里插入图片描述
    结果非常不稳定:这次的结果是 0.128395,但我后面又训练了一次,降到了 0.085 左右。虽然不稳定,但架不住它结果不错啊哈哈哈哈哈哈。我挑了数值最低的那个模型,生成了 submission,上传到 Kaggle 上看排名。我那么一看,
    在这里插入图片描述
    居然还不错哈哈哈哈哈哈哈哈哈哈哈哈哈哈

  • 相关阅读:
    【LeetCode-数组】--搜索插入位置
    Android自定义视图
    CSS 中px、em、rem、%、vw、vh单位之间的区别详解【全网最全】
    疫情之下,普通人高薪就业指南:学软件测试,路就不会遥远
    springboot配置
    C++ 类和对象篇(六) 拷贝构造函数
    分页查询实现
    人工智能 AI 概念梳理
    Hutool工具包中BeanUtil的使用
    C语言多线程编程 semaphore 信号量(信号灯)是什么?如何使用?
  • 原文地址:https://blog.csdn.net/weixin_44543614/article/details/126321546