• 3. Distributed Representations of Natural Language and Words (Part 2)


This section is mainly hands-on.

I. Preparing the Training Data

1. Converting the corpus text into word IDs

    import numpy as np

    def preprocess(text):
        text = text.lower()                # convert uppercase letters to lowercase
        text = text.replace('.', ' .')     # replace '.' with ' .' so the period becomes its own token
        words = text.split(' ')            # split the sentence on spaces
        word_to_id = {}
        id_to_word = {}
        for word in words:
            if word not in word_to_id:
                new_id = len(word_to_id)
                word_to_id[word] = new_id
                id_to_word[new_id] = word
        print(word_to_id)
        corpus = np.array([word_to_id[w] for w in words])
        print(corpus)

    text = "You say goodbye and I say hello."
    preprocess(text)

Output:

The word_to_id dictionary stores the mapping from words to IDs; the corpus array stores the sentence with every word replaced by its ID.
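The original output screenshot is not available; tracing the code above by hand, the two print calls should produce:

    {'you': 0, 'say': 1, 'goodbye': 2, 'and': 3, 'i': 4, 'hello': 5, '.': 6}
    [0 1 2 3 4 1 5 6]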

2. Generating contexts and target from the word-ID list corpus

The input to the neural network used in word2vec is the contexts, and its correct labels are the words surrounded by those contexts, i.e., the target words. What we want is that, when the contexts are fed into the network, the probability assigned to the target word is high. We therefore generate contexts and target words from the corpus. For example, in "You say goodbye and I say hello." with a window size of 1, the first target is "say" and its contexts are "you" and "goodbye".

Code:

    import numpy as np

    def preprocess(text):
        text = text.lower()                # convert uppercase letters to lowercase
        text = text.replace('.', ' .')     # replace '.' with ' .' so the period becomes its own token
        words = text.split(' ')            # split the sentence on spaces
        word_to_id = {}
        id_to_word = {}
        for word in words:
            if word not in word_to_id:
                new_id = len(word_to_id)
                word_to_id[word] = new_id
                id_to_word[new_id] = word
        corpus = np.array([word_to_id[w] for w in words])
        return corpus, word_to_id, id_to_word

    def create_conext_target(corpus, windows_size=1):  # word-ID list and context window size
        target = corpus[windows_size:-windows_size]    # every word that has a full window around it
        contexts = []
        for idx in range(windows_size, len(corpus) - windows_size):
            cs = []
            for t in range(-windows_size, windows_size + 1):
                if t == 0:                 # skip the target word itself
                    continue
                cs.append(corpus[idx + t])
            contexts.append(cs)
        return np.array(contexts), np.array(target)

    text = "You say goodbye and I say hello."
    corpus, word_to_id, id_to_word = preprocess(text)
    contexts, target = create_conext_target(corpus, windows_size=1)
    print(contexts)
    print(target)

Result:
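The screenshot is missing here as well; tracing the code, print(contexts) and print(target) should give:

    [[0 2]
     [1 3]
     [2 4]
     [3 1]
     [4 5]
     [1 6]]
    [1 2 3 4 1 5]

Each row of contexts holds the IDs of the words to the left and right of the corresponding entry in target.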

3. Converting to one-hot representation

We convert the contexts and target words to one-hot representation: each word becomes a vector as long as the vocabulary, with a 1 at the index of its word ID and 0 everywhere else. With our 7-word vocabulary, for example, word ID 1 ("say") becomes [0 1 0 0 0 0 0]. (The original figure is omitted.)

Code:

    import numpy as np

    def preprocess(text):  # preprocessing: build the word-ID mappings and the word-ID list
        text = text.lower()                # convert uppercase letters to lowercase
        text = text.replace('.', ' .')     # replace '.' with ' .' so the period becomes its own token
        words = text.split(' ')            # split the sentence on spaces
        word_to_id = {}
        id_to_word = {}
        for word in words:
            if word not in word_to_id:
                new_id = len(word_to_id)
                word_to_id[word] = new_id
                id_to_word[new_id] = word
        corpus = np.array([word_to_id[w] for w in words])
        return corpus, word_to_id, id_to_word

    def create_conext_target(corpus, windows_size=1):  # generate the contexts and target words
        target = corpus[windows_size:-windows_size]
        contexts = []
        for idx in range(windows_size, len(corpus) - windows_size):
            cs = []
            for t in range(-windows_size, windows_size + 1):
                if t == 0:
                    continue
                cs.append(corpus[idx + t])
            contexts.append(cs)
        return np.array(contexts), np.array(target)

    def convert_one_hot(corpus, vocab_size):  # convert to one-hot representation
        N = corpus.shape[0]                   # shape gives the element count along each axis
        if corpus.ndim == 1:                  # ndim gives the number of array dimensions
            one_hot = np.zeros((N, vocab_size), dtype=np.int32)  # N x vocab_size matrix of zeros
            for idx, word_id in enumerate(corpus):
                one_hot[idx, word_id] = 1
        elif corpus.ndim == 2:
            C = corpus.shape[1]
            one_hot = np.zeros((N, C, vocab_size), dtype=np.int32)
            for idx_0, word_ids in enumerate(corpus):
                for idx_1, word_id in enumerate(word_ids):
                    one_hot[idx_0, idx_1, word_id] = 1
        return one_hot

    text = "You say goodbye and I say hello."
    corpus, word_to_id, id_to_word = preprocess(text)
    contexts, target = create_conext_target(corpus, windows_size=1)
    vocab_size = len(word_to_id)
    target = convert_one_hot(target, vocab_size)
    contexts = convert_one_hot(contexts, vocab_size)
    print(target)
    print(contexts)

Result:
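Tracing convert_one_hot by hand (the screenshot is missing), print(target) yields the 6x7 one-hot target matrix and print(contexts) the 6x2x7 contexts array:

    [[0 1 0 0 0 0 0]
     [0 0 1 0 0 0 0]
     [0 0 0 1 0 0 0]
     [0 0 0 0 1 0 0]
     [0 1 0 0 0 0 0]
     [0 0 0 0 0 1 0]]
    [[[1 0 0 0 0 0 0]
      [0 0 1 0 0 0 0]]

     [[0 1 0 0 0 0 0]
      [0 0 0 1 0 0 0]]

     [[0 0 1 0 0 0 0]
      [0 0 0 0 1 0 0]]

     [[0 0 0 1 0 0 0]
      [0 1 0 0 0 0 0]]

     [[0 0 0 0 1 0 0]
      [0 0 0 0 0 1 0]]

     [[0 1 0 0 0 0 0]
      [0 0 0 0 0 0 1]]]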

II. Implementing the CBOW Model

Network schematic: (figure omitted). As the code below shows, the simple CBOW network feeds the two one-hot context vectors through two input MatMul layers that share the weight matrix W_in, averages the two hidden vectors, passes the result through an output MatMul layer with weight W_out, and feeds the scores into a Softmax-with-Loss layer.

    import numpy as np
    import matplotlib.pyplot as plt
    import time

    def preprocess(text):  # preprocessing: build the word-ID mappings and the word-ID list
        text = text.lower()                # convert uppercase letters to lowercase
        text = text.replace('.', ' .')     # replace '.' with ' .' so the period becomes its own token
        words = text.split(' ')            # split the sentence on spaces
        word_to_id = {}
        id_to_word = {}
        for word in words:
            if word not in word_to_id:
                new_id = len(word_to_id)
                word_to_id[word] = new_id
                id_to_word[new_id] = word
        corpus = np.array([word_to_id[w] for w in words])
        return corpus, word_to_id, id_to_word

    def create_conext_target(corpus, windows_size):  # generate the contexts and target words
        target = corpus[windows_size:-windows_size]
        contexts = []
        for idx in range(windows_size, len(corpus) - windows_size):
            cs = []
            for t in range(-windows_size, windows_size + 1):
                if t == 0:
                    continue
                cs.append(corpus[idx + t])
            contexts.append(cs)
        return np.array(contexts), np.array(target)

    def convert_one_hot(corpus, vocab_size):  # convert to one-hot representation
        N = corpus.shape[0]                   # shape gives the element count along each axis
        if corpus.ndim == 1:                  # ndim gives the number of array dimensions
            one_hot = np.zeros((N, vocab_size), dtype=np.int32)  # N x vocab_size matrix of zeros
            for idx, word_id in enumerate(corpus):
                one_hot[idx, word_id] = 1
        elif corpus.ndim == 2:
            C = corpus.shape[1]
            one_hot = np.zeros((N, C, vocab_size), dtype=np.int32)
            for idx_0, word_ids in enumerate(corpus):
                for idx_1, word_id in enumerate(word_ids):
                    one_hot[idx_0, idx_1, word_id] = 1
        return one_hot

    class MatMul:
        def __init__(self, W):
            self.params = [W]
            self.grads = [np.zeros_like(W)]
            self.x = None

        def forward(self, x):
            W, = self.params
            out = np.dot(x, W)
            self.x = x
            return out

        def backward(self, dout):
            W, = self.params
            dx = np.dot(dout, W.T)
            dW = np.dot(self.x.T, dout)
            self.grads[0][...] = dW
            return dx

    def softmax(x):
        if x.ndim == 2:
            x = x - x.max(axis=1, keepdims=True)
            x = np.exp(x)
            x /= x.sum(axis=1, keepdims=True)
        elif x.ndim == 1:
            x = x - np.max(x)
            x = np.exp(x) / np.sum(np.exp(x))
        return x

    def cross_entropy_error(y, t):
        if y.ndim == 1:
            t = t.reshape(1, t.size)
            y = y.reshape(1, y.size)
        # if the labels are one-hot vectors, convert them to correct-class indices
        if t.size == y.size:
            t = t.argmax(axis=1)
        batch_size = y.shape[0]
        return -np.sum(np.log(y[np.arange(batch_size), t] + 1e-7)) / batch_size

    class SoftmaxWithLoss:
        def __init__(self):
            self.params, self.grads = [], []
            self.y = None  # output of softmax
            self.t = None  # supervised labels

        def forward(self, x, t):
            self.t = t
            self.y = softmax(x)
            # if the labels are one-hot vectors, convert them to correct-class indices
            if self.t.size == self.y.size:
                self.t = self.t.argmax(axis=1)
            loss = cross_entropy_error(self.y, self.t)
            return loss

        def backward(self, dout):
            batch_size = self.t.shape[0]
            dx = self.y.copy()
            dx[np.arange(batch_size), self.t] -= 1
            dx *= dout
            dx = dx / batch_size
            return dx

    class Trainer:
        def __init__(self, model, optimizer):
            self.model = model
            self.optimizer = optimizer
            self.loss_list = []
            self.eval_interval = None
            self.current_epoch = 0

        def fit(self, x, t, max_epoch, batch_size, max_grad=None, eval_interval=20):
            data_size = len(x)
            print(data_size)
            max_iters = data_size // batch_size
            print(max_iters)
            self.eval_interval = eval_interval
            model, optimizer = self.model, self.optimizer
            total_loss = 0
            loss_count = 0
            start_time = time.time()
            for epoch in range(max_epoch):
                # shuffle the data
                idx = np.random.permutation(np.arange(data_size))
                x = x[idx]
                t = t[idx]
                for iters in range(max_iters):
                    batch_x = x[iters * batch_size:(iters + 1) * batch_size]
                    batch_t = t[iters * batch_size:(iters + 1) * batch_size]
                    # compute the gradients and update the parameters
                    loss = model.forward(batch_x, batch_t)
                    model.backward()
                    params, grads = remove_duplicate(model.params, model.grads)  # merge shared weights into one
                    if max_grad is not None:
                        clip_grads(grads, max_grad)
                    optimizer.update(params, grads)
                    total_loss += loss
                    loss_count += 1
                    # evaluation
                    if (eval_interval is not None) and (iters % eval_interval) == 0:
                        avg_loss = total_loss / loss_count
                        elapsed_time = time.time() - start_time
                        print('| epoch %d | iter %d / %d | time %d[s] | loss %.2f'
                              % (self.current_epoch + 1, iters + 1, max_iters, elapsed_time, avg_loss))
                        self.loss_list.append(float(avg_loss))
                        total_loss, loss_count = 0, 0
                self.current_epoch += 1

        def plot(self, ylim=None):
            x = np.arange(len(self.loss_list))
            if ylim is not None:
                plt.ylim(*ylim)
            plt.plot(x, self.loss_list, label='train')
            plt.xlabel('iterations (x' + str(self.eval_interval) + ')')
            plt.ylabel('loss')
            plt.show()

    def remove_duplicate(params, grads):
        '''
        Merge weights that appear more than once in the parameter list into one,
        adding up the gradients that correspond to them.
        '''
        params, grads = params[:], grads[:]  # copy the lists
        while True:
            find_flg = False
            L = len(params)
            for i in range(0, L - 1):
                for j in range(i + 1, L):
                    # when the weights are shared
                    if params[i] is params[j]:
                        grads[i] += grads[j]  # add the gradients
                        find_flg = True
                        params.pop(j)
                        grads.pop(j)
                    # when weights are shared as transposed matrices (weight tying)
                    elif params[i].ndim == 2 and params[j].ndim == 2 and \
                            params[i].T.shape == params[j].shape and np.all(params[i].T == params[j]):
                        grads[i] += grads[j].T
                        find_flg = True
                        params.pop(j)
                        grads.pop(j)
                    if find_flg: break
                if find_flg: break
            if not find_flg: break
        return params, grads

    def clip_grads(grads, max_norm):
        total_norm = 0
        for grad in grads:
            total_norm += np.sum(grad ** 2)
        total_norm = np.sqrt(total_norm)
        rate = max_norm / (total_norm + 1e-6)
        if rate < 1:
            for grad in grads:
                grad *= rate

    class SimpleCBOW:
        def __init__(self, vocab_size, hidden_size):
            V, H = vocab_size, hidden_size
            # initialize the weights
            W_in = 0.01 * np.random.randn(V, H).astype('f')
            W_out = 0.01 * np.random.randn(H, V).astype('f')
            # create the layers
            self.in_layer0 = MatMul(W_in)
            self.in_layer1 = MatMul(W_in)
            self.out_layer = MatMul(W_out)
            self.loss_layer = SoftmaxWithLoss()
            # collect all weights and gradients into lists
            layers = [self.in_layer0, self.in_layer1, self.out_layer]
            self.params, self.grads = [], []
            for layer in layers:
                self.params += layer.params
                self.grads += layer.grads
            # keep the distributed representations of the words as a member variable
            self.word_vecs = W_in

        def forward(self, contexts, target):
            h0 = self.in_layer0.forward(contexts[:, 0])
            h1 = self.in_layer1.forward(contexts[:, 1])
            h = (h0 + h1) * 0.5
            score = self.out_layer.forward(h)
            loss = self.loss_layer.forward(score, target)
            return loss

        def backward(self, dout=1):
            ds = self.loss_layer.backward(dout)
            da = self.out_layer.backward(ds)
            da *= 0.5
            self.in_layer1.backward(da)
            self.in_layer0.backward(da)
            return None

    class Adam:
        '''
        Adam (http://arxiv.org/abs/1412.6980v8)
        '''
        def __init__(self, lr=0.001, beta1=0.9, beta2=0.999):
            self.lr = lr
            self.beta1 = beta1
            self.beta2 = beta2
            self.iter = 0
            self.m = None
            self.v = None

        def update(self, params, grads):
            if self.m is None:
                self.m, self.v = [], []
                for param in params:
                    self.m.append(np.zeros_like(param))
                    self.v.append(np.zeros_like(param))
            self.iter += 1
            lr_t = self.lr * np.sqrt(1.0 - self.beta2 ** self.iter) / (1.0 - self.beta1 ** self.iter)
            for i in range(len(params)):
                self.m[i] += (1 - self.beta1) * (grads[i] - self.m[i])
                self.v[i] += (1 - self.beta2) * (grads[i] ** 2 - self.v[i])
                params[i] -= lr_t * self.m[i] / (np.sqrt(self.v[i]) + 1e-7)

    windows_size = 1
    hidden_size = 5
    batch_size = 3
    max_epoch = 1000

    text = "You say goodbye and I say hello."
    corpus, word_to_id, id_to_word = preprocess(text)
    vocab_size = len(word_to_id)
    contexts, target = create_conext_target(corpus, windows_size)
    target = convert_one_hot(target, vocab_size)
    contexts = convert_one_hot(contexts, vocab_size)

    model = SimpleCBOW(vocab_size, hidden_size)
    optimizer = Adam()
    trainer = Trainer(model, optimizer)
    trainer.fit(contexts, target, max_epoch, batch_size)
    trainer.plot()

    word_vecs = model.word_vecs
    for word_id, word in id_to_word.items():
        print(word, word_vecs[word_id])

Training result: trainer.fit prints the running average loss every 20 iterations, and trainer.plot() draws the loss curve, which should fall steadily over the 1000 epochs (the original screenshot is omitted).

Distributed representations of the words: the final loop prints one 5-dimensional vector per word, i.e., the rows of W_in. The exact values differ from run to run because the weights are randomly initialized (screenshot omitted).
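Beyond eyeballing the raw vectors, a quick way to sanity-check them is cosine similarity. The following is a minimal sketch, not part of the original post: cos_similarity is a helper defined here for illustration, and the snippet assumes model, word_to_id, and id_to_word from the script above are still in scope.

    def cos_similarity(x, y, eps=1e-8):
        # normalize both vectors to unit length, then take the dot product
        nx = x / (np.sqrt(np.sum(x ** 2)) + eps)
        ny = y / (np.sqrt(np.sum(y ** 2)) + eps)
        return np.dot(nx, ny)

    # similarity of every word's vector to the vector for "you"
    you_vec = model.word_vecs[word_to_id['you']]
    for word_id, word in id_to_word.items():
        print(word, cos_similarity(you_vec, model.word_vecs[word_id]))

On such a tiny corpus the similarities are not very meaningful, but the same check scales to real corpora.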

     

• Original article: https://blog.csdn.net/qq_55202378/article/details/127676652