Model Implementation

Skip-gram Model

The script below trains a minimal Skip-gram Word2Vec model with PyTorch on a toy six-sentence corpus: each center word is fed in as a one-hot vector, projected into a 2-dimensional embedding space, and trained to predict one of its neighbouring words; after training, the 2-D word vectors are plotted with matplotlib.
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

def random_batch():
    # Sample batch_size (center, context) pairs; each input is a one-hot vector of the center word.
    random_inputs = []
    random_labels = []
    random_index = np.random.choice(range(len(skip_grams)), batch_size, replace=False)
    for i in random_index:
        random_inputs.append(np.eye(voc_size)[skip_grams[i][0]])  # one-hot encoding of the center word
        random_labels.append(skip_grams[i][1])                    # index of the context word to predict
    return random_inputs, random_labels

class Word2Vec(nn.Module):
    def __init__(self):
        super(Word2Vec, self).__init__()
        # W projects the one-hot input into the embedding space; WT maps it back to vocabulary logits.
        self.W = nn.Linear(voc_size, embedding_size, bias=False)
        self.WT = nn.Linear(embedding_size, voc_size, bias=False)

    def forward(self, X):
        hidden_layer = self.W(X)              # [batch_size, embedding_size]
        output_layer = self.WT(hidden_layer)  # [batch_size, voc_size] logits
        return output_layer

if __name__ == '__main__':
    batch_size = 2
    embedding_size = 2
    sentences = ["apple banana fruit", "banana orange fruit", "orange banana fruit",
                 "dog cat animal", "cat monkey animal", "monkey dog animal"]

    word_sequence = " ".join(sentences).split()
    word_list = list(set(word_sequence))
    word_dict = {w: i for i, w in enumerate(word_list)}
    voc_size = len(word_list)

    # Build (center, context) pairs with a window of one word on each side,
    # e.g. "banana" in "apple banana fruit" yields [banana, apple] and [banana, fruit].
    skip_grams = []
    for i in range(1, len(word_sequence) - 1):
        target = word_dict[word_sequence[i]]
        context = [word_dict[word_sequence[i - 1]], word_dict[word_sequence[i + 1]]]
        for w in context:
            skip_grams.append([target, w])

    model = Word2Vec()
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(5000):
        input_batch, target_batch = random_batch()
        input_batch = torch.Tensor(np.array(input_batch))
        target_batch = torch.LongTensor(target_batch)

        optimizer.zero_grad()
        output = model(input_batch)
        loss = criterion(output, target_batch)
        if (epoch + 1) % 1000 == 0:
            print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss.item()))
        loss.backward()
        optimizer.step()

    # Plot the learned 2-D word vectors: column i of W holds the embedding of word_list[i].
    W, WT = model.parameters()
    for i, label in enumerate(word_list):
        x, y = W[0][i].item(), W[1][i].item()
        plt.scatter(x, y)
        plt.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points', ha='right', va='bottom')
    plt.show()

Run result: the script prints the loss every 1,000 epochs and then shows a matplotlib scatter plot of the learned 2-D word vectors.
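As a small follow-up (not part of the original listing), the trained matrix W can be read back as the embedding table: column i of model.W.weight holds the 2-dimensional vector of word_list[i]. The sketch below assumes it is appended to the end of the Skip-gram script above, so that model, word_list, and word_dict are still in scope; the two words compared come from the toy corpus.

# Illustrative sketch: read out the learned Skip-gram word vectors after training.
import torch.nn.functional as F

W = model.W.weight.detach()  # shape (embedding_size, voc_size)
embeddings = W.t()           # shape (voc_size, embedding_size), one row per word

for word, idx in word_dict.items():
    print(word, embeddings[idx].tolist())

# Words that share contexts (e.g. "apple" and "banana") should end up with similar vectors.
a = embeddings[word_dict['apple']]
b = embeddings[word_dict['banana']]
print('cos(apple, banana) =', F.cosine_similarity(a, b, dim=0).item())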

CBOW Model

The script below implements the CBOW variant with torchtext: for every center word, the side_window words on each side are averaged in embedding space, and the model is trained to predict the center word from that average.

import numpy as np
from torchtext.vocab import vocab
from collections import Counter, OrderedDict
from torch.utils.data import Dataset, DataLoader
from torchtext.transforms import VocabTransform
import torch
from torch import nn
from torch.nn import functional as F

def get_text():
    sentence_list = [
        "nlp drives computer programs that translate text from one language to another",
        "nlp combines computational linguistics rule based modeling of human language with statistical",
        "nlp model respond to text or voice data and respond with text",
    ]
    return sentence_list
class CbowDataSet(Dataset):
    def __init__(self, text_list, side_window=3):
        """
        Dataset that builds CBOW training samples for Word2vec.
        :param text_list: the corpus, a list of sentences
        :param side_window: number of context words taken on each side of the center word;
                            the total number of context words is 2 * side_window
        """
        super(CbowDataSet, self).__init__()
        self.side_window = side_window
        text_vocab, vocab_transform = self.reform_vocab(text_list)
        self.text_list = text_list
        self.text_vocab = text_vocab
        self.vocab_transform = vocab_transform
        self.cbow_data = self.generate_cbow()

    def __len__(self):
        return len(self.cbow_data)

    def __getitem__(self, idx):
        data_row = self.cbow_data[idx]
        return data_row[0], data_row[1]

    def reform_vocab(self, text_list):
        """Build a torchtext vocab from the corpus."""
        total_word_list = []
        for _ in text_list:
            total_word_list += _.split(" ")
        counter = Counter(total_word_list)
        sorted_by_freq_tuples = sorted(counter.items(), key=lambda x: x[1], reverse=True)
        ordered_dict = OrderedDict(sorted_by_freq_tuples)
        special_token = ["<unk>", "<pad>"]
        text_vocab = vocab(ordered_dict, specials=special_token)
        text_vocab.set_default_index(0)  # out-of-vocabulary words map to <unk>
        vocab_transform = VocabTransform(text_vocab)
        return text_vocab, vocab_transform

    def generate_cbow(self):
        """Generate the CBOW training data: pairs of (center word id, context word ids)."""
        cbow_data = []
        for sentence in self.text_list:
            sentence_id_list = np.array(self.vocab_transform(sentence.split(' ')))
            for center_index in range(self.side_window, len(sentence_id_list) - self.side_window):
                pos_index = list(range(center_index - self.side_window, center_index + self.side_window + 1))
                del pos_index[self.side_window]  # drop the center word itself from the context window
                cbow_data.append([sentence_id_list[center_index], sentence_id_list[pos_index]])
        return cbow_data

    def get_vocab_transform(self):
        return self.vocab_transform

    def get_vocab_size(self):
        return len(self.text_vocab)
class Word2VecModel(nn.Module):
    def __init__(self, vocab_size, batch_size, word_embedding_size=100, hidden=64):
        """
        CBOW implementation of the Word2vec model.
        :param vocab_size: number of words in the vocabulary
        :param word_embedding_size: dimensionality of each word vector
        :param hidden: dimensionality of the hidden layer
        """
        super(Word2VecModel, self).__init__()
        self.vocab_size = vocab_size
        self.word_embedding_size = word_embedding_size
        self.hidden = hidden
        self.batch_size = batch_size
        self.word_embedding = nn.Embedding(self.vocab_size, self.word_embedding_size)
        self.linear_in = nn.Linear(self.word_embedding_size, self.hidden)
        self.linear_out = nn.Linear(self.hidden, self.vocab_size)

    def forward(self, input_labels):
        around_embedding = self.word_embedding(input_labels)        # [batch, 2 * side_window, embedding]
        avg_around_embedding = torch.mean(around_embedding, dim=1)  # average the context word vectors
        in_emb = F.relu(self.linear_in(avg_around_embedding))
        # Return raw logits: nn.CrossEntropyLoss applies log-softmax internally,
        # so applying F.log_softmax here as well would be redundant.
        out_emb = self.linear_out(in_emb)
        return out_emb

    def get_embedding(self, token_list: list):
        return self.word_embedding(torch.tensor(token_list).long())
def main():
    batch_size = 7
    sentence_list = get_text()
    cbow_data_set = CbowDataSet(sentence_list)
    data_loader = DataLoader(cbow_data_set, batch_size=batch_size, drop_last=True)
    model = Word2VecModel(cbow_data_set.get_vocab_size(), batch_size)
    optimizer = torch.optim.Adam(model.parameters())
    criterion = nn.CrossEntropyLoss()
    for _epoch_i in range(100):
        loss_list = []
        for center_token, back_token in data_loader:
            optimizer.zero_grad()
            model_out = model(back_token)  # predict the center word from its averaged context
            loss = criterion(model_out, center_token)
            loss.backward()
            optimizer.step()
            loss_list.append(loss.item())
        print("Training epoch:", _epoch_i, "Loss:", np.sum(loss_list))

    # Encode a new sentence: map each token to its id, then look up the learned embeddings.
    sentence = "nlp can translate text from one language to another"
    vocab_transform = cbow_data_set.get_vocab_transform()
    sentence_ids = vocab_transform(sentence.split(' '))
    sentence_embedding = model.get_embedding(sentence_ids)
    print("Shape of the sentence embedding:", sentence_embedding.shape)

if __name__ == '__main__':
    main()
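As an optional, illustrative addition (not in the original code), the get_embedding helper can also be used to look up single-word vectors and compare them after training. The lines below are a sketch meant to be appended at the end of main(), where model, vocab_transform, and F are already in scope; the two compared words are taken from the toy corpus.

    # Illustrative sketch: compare two word vectors from the trained CBOW model.
    word_a, word_b = "text", "language"      # both occur in the training corpus
    ids = vocab_transform([word_a, word_b])  # map the tokens to vocabulary ids
    vec_a, vec_b = model.get_embedding(ids)  # one word_embedding_size-dim vector per word
    sim = F.cosine_similarity(vec_a, vec_b, dim=0).item()
    print("cosine similarity between", word_a, "and", word_b, ":", sim)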
