• Word2Vec Source Code Walkthrough (PyTorch Version)


    1. Model Architecture Diagram
      [Figure: Word2Vec model architecture]
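      Since the original figure is not reproduced here, a compact summary of the computation that both listings below implement (a sketch, with $|V|$ the vocabulary size and $d$ the embedding dimension):

      $$h = W x, \qquad u = W' h, \qquad p(w \mid \text{input}) = \operatorname{softmax}(u)_w$$

      For skip-gram, $x \in \mathbb{R}^{|V|}$ is the one-hot vector of the center word and the target $w$ is a context word; for CBOW, $x$ is the average of the context words' vectors and the target is the center word. $W \in \mathbb{R}^{d \times |V|}$ and $W' \in \mathbb{R}^{|V| \times d}$ are two independent weight matrices (the CBOW listing below additionally inserts a small ReLU hidden layer before the output projection). Training minimizes the cross-entropy between $p$ and the true target word.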
    2. Model Implementation
      Skip-gram model
      # code by Tae Hwan Jung @graykode modified by 前行follow
      import numpy as np
      import torch
      import torch.nn as nn
      import torch.optim as optim
      import matplotlib.pyplot as plt
      
      def random_batch():
          random_inputs = []
          random_labels = []
          random_index = np.random.choice(range(len(skip_grams)), batch_size, replace=False)
      
          for i in random_index:
              random_inputs.append(np.eye(voc_size)[skip_grams[i][0]])  # one-hot vector of the target (center) word
              random_labels.append(skip_grams[i][1])  # index of the context word
      
          return np.array(random_inputs), np.array(random_labels)
      
      # Model
      class Word2Vec(nn.Module):
          def __init__(self):
              super(Word2Vec, self).__init__()
              # W and WT are two independent weight matrices, not transposes of each other
              self.W = nn.Linear(voc_size, embedding_size, bias=False) # voc_size > embedding_size Weight
              self.WT = nn.Linear(embedding_size, voc_size, bias=False) # embedding_size > voc_size Weight
      
          def forward(self, X):
              # X : [batch_size, voc_size]
              hidden_layer = self.W(X) # hidden_layer : [batch_size, embedding_size]
              output_layer = self.WT(hidden_layer) # output_layer : [batch_size, voc_size]
              return output_layer
      
      if __name__ == '__main__':
          batch_size = 2 # mini-batch size
          embedding_size = 2 # embedding size
      
          sentences = ["apple banana fruit", "banana orange fruit", "orange banana fruit",
                       "dog cat animal", "cat monkey animal", "monkey dog animal"]
      
          word_sequence = " ".join(sentences).split()
          word_list = " ".join(sentences).split()
          word_list = list(set(word_list))
          word_dict = {w: i for i, w in enumerate(word_list)}
          voc_size = len(word_list)
      
          # Make skip gram of one size window
          skip_grams = []
          for i in range(1, len(word_sequence) - 1):
              target = word_dict[word_sequence[i]]
              context = [word_dict[word_sequence[i - 1]], word_dict[word_sequence[i + 1]]]
              # pair the target (center) word with each word in its left/right window
              for w in context:
                  skip_grams.append([target, w])
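          # e.g. with the corpus above, the first center word is "banana" (word_sequence[1]),
          # giving the index pairs [banana, apple] and [banana, fruit]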
      
          model = Word2Vec()
      
          criterion = nn.CrossEntropyLoss()
          optimizer = optim.Adam(model.parameters(), lr=0.001)
      
          # Training
          for epoch in range(5000):
              input_batch, target_batch = random_batch()
              input_batch = torch.Tensor(input_batch)
              target_batch = torch.LongTensor(target_batch)
      
              optimizer.zero_grad()
              output = model(input_batch)
      
              # output : [batch_size, voc_size], target_batch : [batch_size] (LongTensor, not one-hot)
              loss = criterion(output, target_batch)
              if (epoch + 1) % 1000 == 0:
                  print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))
      
              loss.backward()
              optimizer.step()
      
          for i, label in enumerate(word_list):
              W, WT = model.parameters()
              x, y = W[0][i].item(), W[1][i].item()
              plt.scatter(x, y)
              plt.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points', ha='right', va='bottom')
          plt.show()
      
      
      Running result:
      [Figure: 2-D scatter plot of the learned word embeddings]
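      After training, a word's embedding is simply the corresponding column of W (the weight of the first linear layer). A minimal sketch for reading off one vector, assuming the script above has just been run in the same session (the word "apple" is only an illustrative choice):

      # look up the learned 2-D vector of a single word
      W, WT = model.parameters()               # W: [embedding_size, voc_size]
      idx = word_dict['apple']                 # word -> vocabulary index
      apple_vec = W[:, idx].detach().numpy()   # column idx of W is the embedding of "apple"
      print('apple ->', apple_vec)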
      CBOW model
      import numpy as np
      from torchtext.vocab import vocab
      from collections import Counter, OrderedDict
      from torch.utils.data import Dataset, DataLoader
      from torchtext.transforms import VocabTransform  # note: requires torchtext 0.12+
      import torch
      from torch import nn
      from torch.nn import functional as F
      
      
      def get_text():
          sentence_list = [  # assume this is the entire training corpus
              "nlp drives computer programs that translate text from one language to another",
              "nlp combines computational linguistics rule based modeling of human language with statistical",
              "nlp model respond to text or voice data and respond with text",
          ]
          return sentence_list
      
      
      class CbowDataSet(Dataset):
          def __init__(self, text_list, side_window=3):
              """
              构造Word2vec的CBOW采样Dataset
              :param text_list: 语料
              :param side_window: 单侧正例(构造背景词)采样数,总正例是:2 * side_window
              """
              super(CbowDataSet, self).__init__()
              self.side_window = side_window
              text_vocab, vocab_transform = self.reform_vocab(text_list)
              self.text_list = text_list  # raw text
              self.text_vocab = text_vocab  # torchtext vocab
              self.vocab_transform = vocab_transform  # torchtext VocabTransform
              self.cbow_data = self.generate_cbow()
      
          def __len__(self):
              return len(self.cbow_data)
      
          def __getitem__(self, idx):
              data_row = self.cbow_data[idx]
              return data_row[0], data_row[1]
      
          def reform_vocab(self, text_list):
              """根据语料构造torchtext的vocab"""
              total_word_list = []
              for _ in text_list:  # 将嵌套的列表([[xx,xx],[xx,xx]...])拉平 ([xx,xx,xx...])
                  total_word_list += _.split(" ")
              counter = Counter(total_word_list)  # 统计计数
              sorted_by_freq_tuples = sorted(counter.items(), key=lambda x: x[1], reverse=True)  # 构造成可接受的格式:[(单词,num), ...]
              ordered_dict = OrderedDict(sorted_by_freq_tuples)
              # 开始构造 vocab
              special_token = ["", ""]  # 特殊字符
              text_vocab = vocab(ordered_dict, specials=special_token)  # 单词转token,specials里是特殊字符,可以为空
              text_vocab.set_default_index(0)
              vocab_transform = VocabTransform(text_vocab)
              return text_vocab, vocab_transform
      
          def generate_cbow(self):
              """生成CBOW的训练数据"""
              cbow_data = []
              for sentence in self.text_list:
                  sentence_id_list = np.array(self.vocab_transform(sentence.split(' ')))
                  for center_index in range(
                          self.side_window, len(sentence_id_list) - self.side_window):  # index bounds so the window never runs past either end of the sentence
                      pos_index = list(range(center_index - self.side_window, center_index + self.side_window + 1))
                      del pos_index[self.side_window]  # drop the center word itself, keeping only the context positions
                      cbow_data.append([sentence_id_list[center_index], sentence_id_list[pos_index]])
              return cbow_data
      
          def get_vocab_transform(self):
              return self.vocab_transform
      
          def get_vocab_size(self):
              return len(self.text_vocab)
      
      
      class Word2VecModel(nn.Module):
          def __init__(self, vocab_size, batch_size, word_embedding_size=100, hidden=64):
              """
              Word2vec模型CBOW实现
              :param vocab_size: 单词个数
              :param word_embedding_size: 每个词的词向量维度
              :param hidden: 隐层维度
              """
              super(Word2VecModel, self).__init__()
              self.vocab_size = vocab_size
              self.word_embedding_size = word_embedding_size
              self.hidden = hidden
              self.batch_size = batch_size
              self.word_embedding = nn.Embedding(self.vocab_size, self.word_embedding_size)  # embedding lookup table for each token
              # projection layers
              self.linear_in = nn.Linear(self.word_embedding_size, self.hidden)
              self.linear_out = nn.Linear(self.hidden, self.vocab_size)
      
          def forward(self, input_labels):
              around_embedding = self.word_embedding(input_labels)
              avg_around_embedding = torch.mean(around_embedding, dim=1)  # 1. average the context word embeddings element-wise
              in_emb = F.relu(self.linear_in(avg_around_embedding))  # 2. first linear layer with a ReLU activation
              out_emb = F.log_softmax(self.linear_out(in_emb), dim=-1)  # 3. second linear layer + log-softmax; shape: [batch_size, vocab_size]
              return out_emb
      
          def get_embedding(self, token_list: list):
              return self.word_embedding(torch.Tensor(token_list).long())
      
      
      def main():
          batch_size = 7
          sentence_list = get_text()
          cbow_data_set = CbowDataSet(sentence_list)  # build the Dataset
          data_loader = DataLoader(cbow_data_set, batch_size=batch_size, drop_last=True)  # wrap the Dataset in a DataLoader
          # start training
          model = Word2VecModel(cbow_data_set.get_vocab_size(), batch_size)
          optimizer = torch.optim.Adam(model.parameters())
          criterion = nn.NLLLoss()  # the model already outputs log-probabilities (log_softmax), so NLLLoss is the matching criterion
          for _epoch_i in range(100):
              loss_list = []
              for center_token, back_token in data_loader:
                  # train on this mini-batch
                  optimizer.zero_grad()
                  model_out = model(back_token)
                  loss = criterion(model_out, center_token)
                  loss.backward()
                  optimizer.step()
                  loss_list.append(loss.item())
              print("训练中:", _epoch_i, "Loss:", np.sum(loss_list))
      
          # finally, a quick test
          # get the word vectors for: "nlp can translate text from one language to another"
          sentence = "nlp can translate text from one language to another"
          vocab_transform = cbow_data_set.get_vocab_transform()
          sentence_ids = vocab_transform(sentence.split(' '))
          sentence_embedding = model.get_embedding(sentence_ids)
          print("这个是句向量的维度:", sentence_embedding.shape)
      
      
      if __name__ == '__main__':
          main()
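
      As a quick sanity check on the learned embeddings, a few lines that could be appended at the end of main() to compare two of the learned word vectors with cosine similarity (the word pair is only an illustrative choice; F is the torch.nn.functional module already imported at the top of the script):

          # compare two learned word vectors with cosine similarity
          ids = vocab_transform(["nlp", "language"])        # word -> vocabulary index
          vec_a, vec_b = model.get_embedding(ids)           # two vectors of size word_embedding_size
          sim = F.cosine_similarity(vec_a.unsqueeze(0), vec_b.unsqueeze(0))
          print("cosine similarity(nlp, language):", sim.item())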
      
      
    3. Reference
      1. Skip-gram source code
      2. CBOW source code
  • Original article: https://blog.csdn.net/y1040468929/article/details/126023012