• Dynamic Quantization on an LSTM Word Language Model


    Original link:
    (beta) Dynamic Quantization on an LSTM Word Language Model — PyTorch Tutorials 2.3.0+cu121 documentation

    Introduction

    Quantization involves converting a model's weights and activations from floating point to integers. This shrinks the model and speeds up inference, with only a small impact on accuracy.
    In this tutorial, we apply the simplest form of quantization, dynamic quantization, to an LSTM-based next-word prediction model, closely following the word language model from the PyTorch examples.
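    To make "floating point to integers" concrete, here is a minimal sketch of an int8 affine quantization round trip using torch.quantize_per_tensor; the scale and zero_point values are hand-picked for illustration and are not what quantize_dynamic would compute:

    import torch

    # Quantize a small float tensor to int8 with an illustrative scale.
    x = torch.tensor([-1.0, 0.0, 0.5, 1.0])
    q = torch.quantize_per_tensor(x, scale=0.01, zero_point=0, dtype=torch.qint8)
    print(q.int_repr())    # stored int8 values: [-100, 0, 50, 100]
    print(q.dequantize())  # back to float, up to rounding error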

    # imports
    import os
    from io import open
    import time

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    Define the model

    Here we define the LSTM model architecture, following the model in the word language model example.

    class LSTMModel(nn.Module):
        """Container module with an encoder, a recurrent module, and a decoder."""

        def __init__(self, ntoken, ninp, nhid, nlayers, dropout=0.5):
            super(LSTMModel, self).__init__()
            self.drop = nn.Dropout(dropout)
            self.encoder = nn.Embedding(ntoken, ninp)
            self.rnn = nn.LSTM(ninp, nhid, nlayers, dropout=dropout)
            self.decoder = nn.Linear(nhid, ntoken)
            self.init_weights()
            self.nhid = nhid
            self.nlayers = nlayers

        def init_weights(self):
            initrange = 0.1
            self.encoder.weight.data.uniform_(-initrange, initrange)
            self.decoder.bias.data.zero_()
            self.decoder.weight.data.uniform_(-initrange, initrange)

        def forward(self, input, hidden):
            emb = self.drop(self.encoder(input))
            output, hidden = self.rnn(emb, hidden)
            output = self.drop(output)
            decoded = self.decoder(output)
            return decoded, hidden

        def init_hidden(self, bsz):
            weight = next(self.parameters())
            return (weight.new_zeros(self.nlayers, bsz, self.nhid),
                    weight.new_zeros(self.nlayers, bsz, self.nhid))
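    As a quick smoke test of the architecture, a sketch like the following (with made-up toy sizes) runs one forward step and checks the output shape:

    # Toy sizes, for illustration only.
    toy_model = LSTMModel(ntoken=10, ninp=8, nhid=16, nlayers=2)
    toy_input = torch.randint(10, (5, 1), dtype=torch.long)  # (seq_len, batch)
    toy_hidden = toy_model.init_hidden(bsz=1)
    out, toy_hidden = toy_model(toy_input, toy_hidden)
    print(out.shape)  # torch.Size([5, 1, 10]): one score per vocabulary word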

    Load the text data

    Next, we load the Wikitext-2 dataset into a Corpus, again following the preprocessing in the word language model example.

    class Dictionary(object):
        def __init__(self):
            self.word2idx = {}
            self.idx2word = []

        def add_word(self, word):
            if word not in self.word2idx:
                self.idx2word.append(word)
                self.word2idx[word] = len(self.idx2word) - 1
            return self.word2idx[word]

        def __len__(self):
            return len(self.idx2word)


    class Corpus(object):
        def __init__(self, path):
            self.dictionary = Dictionary()
            self.train = self.tokenize(os.path.join(path, 'train.txt'))
            self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
            self.test = self.tokenize(os.path.join(path, 'test.txt'))

        def tokenize(self, path):
            """Tokenizes a text file."""
            print(path)
            assert os.path.exists(path), f"Error: The path {path} does not exist."
            # Add words to the dictionary
            with open(path, 'r', encoding="utf8") as f:
                for line in f:
                    words = line.split() + ['<eos>']
                    for word in words:
                        self.dictionary.add_word(word)

            # Tokenize file content
            with open(path, 'r', encoding="utf8") as f:
                idss = []
                for line in f:
                    words = line.split() + ['<eos>']
                    ids = []
                    for word in words:
                        ids.append(self.dictionary.word2idx[word])
                    idss.append(torch.tensor(ids).type(torch.int64))
                ids = torch.cat(idss)

            return ids


    model_data_filepath = 'data/'
    corpus = Corpus(model_data_filepath + 'wikitext-2')
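    The Dictionary simply assigns each new word the next free index; a quick illustrative round trip:

    d = Dictionary()
    print(d.add_word('hello'))  # 0
    print(d.add_word('world'))  # 1
    print(d.add_word('hello'))  # 0 again: existing words keep their index
    print(len(d), d.idx2word)   # 2 ['hello', 'world']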

    Load the pretrained model

    This is a tutorial on dynamic quantization, a quantization technique that is applied after a model has been trained. Therefore, we simply load some pretrained weights into this model architecture; the weights were obtained by training for five epochs using the default settings in the word language model example.

    ntokens = len(corpus.dictionary)

    model = LSTMModel(
        ntoken=ntokens,
        ninp=512,
        nhid=256,
        nlayers=5,
    )

    model.load_state_dict(
        torch.load(
            model_data_filepath + 'word_language_model_quantize.pth',
            map_location=torch.device('cpu')
        )
    )

    model.eval()
    print(model)

    Now let's generate some text to make sure the pretrained model is working properly; similarly to before, we follow the word language model example here.

    input_ = torch.randint(ntokens, (1, 1), dtype=torch.long)
    hidden = model.init_hidden(1)
    temperature = 1.0
    num_words = 1000

    with open(model_data_filepath + 'out.txt', 'w') as outf:
        with torch.no_grad():  # no tracking history
            for i in range(num_words):
                output, hidden = model(input_, hidden)
                word_weights = output.squeeze().div(temperature).exp().cpu()
                word_idx = torch.multinomial(word_weights, 1)[0]
                input_.fill_(word_idx)
                word = corpus.dictionary.idx2word[word_idx]

                outf.write(str(word.encode('utf-8')) + ('\n' if i % 20 == 19 else ' '))

                if i % 100 == 0:
                    print('| Generated {}/{} words'.format(i, 1000))

    with open(model_data_filepath + 'out.txt', 'r') as outf:
        all_output = outf.read()
        print(all_output)

    It's no GPT-2, but it looks like the model has started to learn the structure of the language!
    We're almost ready to demonstrate dynamic quantization. We just need to define a few more helper functions:

    bptt = 25
    criterion = nn.CrossEntropyLoss()
    eval_batch_size = 1

    # create test data set
    def batchify(data, bsz):
        # Work out how cleanly we can divide the dataset into ``bsz`` parts.
        nbatch = data.size(0) // bsz
        # Trim off any extra elements that wouldn't cleanly fit (remainders).
        data = data.narrow(0, 0, nbatch * bsz)
        # Evenly divide the data across the ``bsz`` batches.
        return data.view(bsz, -1).t().contiguous()

    test_data = batchify(corpus.test, eval_batch_size)

    # Evaluation functions
    def get_batch(source, i):
        seq_len = min(bptt, len(source) - 1 - i)
        data = source[i:i + seq_len]
        target = source[i + 1:i + 1 + seq_len].reshape(-1)
        return data, target

    def repackage_hidden(h):
        """Wraps hidden states in new Tensors, to detach them from their history."""
        if isinstance(h, torch.Tensor):
            return h.detach()
        else:
            return tuple(repackage_hidden(v) for v in h)

    def evaluate(model_, data_source):
        # Turn on evaluation mode which disables dropout.
        model_.eval()
        total_loss = 0.
        hidden = model_.init_hidden(eval_batch_size)
        with torch.no_grad():
            for i in range(0, data_source.size(0) - 1, bptt):
                data, targets = get_batch(data_source, i)
                output, hidden = model_(data, hidden)
                hidden = repackage_hidden(hidden)
                output_flat = output.view(-1, ntokens)
                total_loss += len(data) * criterion(output_flat, targets).item()
        return total_loss / (len(data_source) - 1)
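    To see what batchify does, here is a toy run on ten consecutive tokens with a batch size of two; each column is one independent stream read top to bottom:

    toy = torch.arange(10)
    print(batchify(toy, 2))
    # tensor([[0, 5],
    #         [1, 6],
    #         [2, 7],
    #         [3, 8],
    #         [4, 9]])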

    Test dynamic quantization

    Finally, we can call torch.quantization.quantize_dynamic on the model! Specifically,
    • we specify that we want the nn.LSTM and nn.Linear modules in our model to be quantized
    • we specify that we want the weights to be converted to int8 values

    import torch.quantization

    quantized_model = torch.quantization.quantize_dynamic(
        model, {nn.LSTM, nn.Linear}, dtype=torch.qint8
    )
    print(quantized_model)

    The model looks the same; how does this benefit us? First, we see a significant reduction in model size:

    def print_size_of_model(model):
        torch.save(model.state_dict(), "temp.p")
        print('Size (MB):', os.path.getsize("temp.p") / 1e6)
        os.remove('temp.p')

    print_size_of_model(model)
    print_size_of_model(quantized_model)
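    If you want to see where the savings come from, a sketch like the following inspects the now-int8 decoder weights; the .weight() accessor on dynamically quantized Linear modules is our assumption here and may differ across PyTorch versions:

    # Inspect the quantized decoder weight (sketch; accessor may vary by version).
    w = quantized_model.decoder.weight()  # a quantized tensor
    print(w.dtype)                        # torch.qint8: one byte per weight
    print(w.int_repr()[0, :5])            # raw int8 storage for a few entries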

    Second, we see faster inference time, with no difference in the evaluation loss:
    Note: we set the number of threads to one for a single-threaded comparison, since quantized models run single-threaded.

    torch.set_num_threads(1)

    def time_model_evaluation(model, test_data):
        s = time.time()
        loss = evaluate(model, test_data)
        elapsed = time.time() - s
        print('''loss: {0:.3f}\nelapsed time (seconds): {1:.1f}'''.format(loss, elapsed))

    time_model_evaluation(model, test_data)
    time_model_evaluation(quantized_model, test_data)

    Running this locally on a MacBook Pro, inference takes about 200 seconds without quantization and only about 100 seconds with it.

    Conclusion

    Dynamic quantization can be an easy way to reduce model size, with only a limited impact on accuracy.
    Thanks for reading! As always, we welcome any feedback, so please create an issue here if you have any questions.

  • Source: https://blog.csdn.net/qq_35629563/article/details/139474120