• How the word2vec algorithm works (a pure-Python implementation, no open-source packages)


    I have read many articles introducing the theory behind word2vec. I understood them, but the understanding never felt deep.

    Below is the word2vec algorithm implemented directly in Python. It is short and clear; once you have read it, you will understand how it works.
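
    Concretely, the code below implements the skip-gram model with a full softmax. Each (center, context) pair is one training example: the one-hot vector x for the center word selects a row of W1 as the hidden vector h = x·W1, the scores u = h·W2 are normalized with a softmax, and the loss per pair is the cross-entropy -u[context] + log(Σ exp(u)). The gradients in the training loop follow directly from this: error = softmax(u) - one_hot(context), dW2 = outer(h, error), dW1 = outer(x, W2·error).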

    import numpy as np

    def tokenize(text):
        """Lowercase a string and split it on whitespace."""
        return text.lower().split()

    def generate_word_pairs(sentences, window_size):
        """Collect (center, context) skip-gram pairs within the window."""
        word_pairs = []
        for sentence in sentences:
            for i, center_word in enumerate(sentence):
                for j in range(i - window_size, i + window_size + 1):
                    if 0 <= j < len(sentence) and j != i:
                        context_word = sentence[j]
                        word_pairs.append((center_word, context_word))
        return word_pairs

    def create_word_index(sentences):
        """Map each unique word in the corpus to an integer index."""
        word_set = set(word for sentence in sentences for word in sentence)
        return {word: i for i, word in enumerate(word_set)}

    def one_hot_encoding(word, word_index):
        """Return a one-hot vector of length |vocabulary| for the word."""
        one_hot = np.zeros(len(word_index))
        one_hot[word_index[word]] = 1
        return one_hot

    def train_word2vec(sentences, vector_size, window_size, learning_rate, epochs):
        word_index = create_word_index(sentences)
        # W1: input embeddings (|V| x d); W2: output weights (d x |V|).
        W1 = np.random.rand(len(word_index), vector_size)
        W2 = np.random.rand(vector_size, len(word_index))
        word_pairs = generate_word_pairs(sentences, window_size)
        for epoch in range(epochs):
            loss = 0
            for center_word, context_word in word_pairs:
                center_word_encoded = one_hot_encoding(center_word, word_index)
                context_word_encoded = one_hot_encoding(context_word, word_index)
                # Forward pass: the one-hot input selects the center word's row of W1.
                hidden_layer = np.dot(center_word_encoded, W1)
                output_layer = np.dot(hidden_layer, W2)
                # Full softmax over the vocabulary.
                exp_output = np.exp(output_layer)
                softmax_output = exp_output / np.sum(exp_output)
                # Backward pass: gradient of cross-entropy w.r.t. the scores.
                error = softmax_output - context_word_encoded
                dW2 = np.outer(hidden_layer, error)
                dW1 = np.outer(center_word_encoded, np.dot(W2, error))
                W1 -= learning_rate * dW1
                W2 -= learning_rate * dW2
                # Cross-entropy loss for this pair: -score(context) + log(sum(exp(scores))).
                loss += -np.sum(output_layer * context_word_encoded) + np.log(np.sum(exp_output))
            print(f"Epoch: {epoch + 1}, Loss: {loss}")
        return W1, word_index

    sentences = [
        tokenize("This is a sample sentence"),
        tokenize("Another example sentence"),
        tokenize("One more example")
    ]
    vector_size = 100
    window_size = 2
    learning_rate = 0.01
    epochs = 100
    W1, word_index = train_word2vec(sentences, vector_size, window_size, learning_rate, epochs)

    # Each row of W1 is the learned embedding for one word.
    for word, index in word_index.items():
        print(f"{word}: {W1[index]}")
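
    To check what the learned vectors can do, you can compare them by cosine similarity. The following is a minimal sketch, not part of the original post: it reuses the W1 and word_index returned by train_word2vec above, and the helper name most_similar is just for illustration.

    def most_similar(query, W1, word_index, top_n=3):
        """Rank the other vocabulary words by cosine similarity to `query`."""
        q = W1[word_index[query]]
        sims = []
        for word, idx in word_index.items():
            if word == query:
                continue
            v = W1[idx]
            # Cosine similarity: dot product divided by the product of the norms.
            sims.append((word, np.dot(q, v) / (np.linalg.norm(q) * np.linalg.norm(v))))
        return sorted(sims, key=lambda pair: pair[1], reverse=True)[:top_n]

    print(most_similar("sentence", W1, word_index))

    On this three-sentence toy corpus the neighbors will not be meaningful, but on a real corpus this is the standard way to query word2vec embeddings.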

  • Original article: https://blog.csdn.net/u010859498/article/details/134561153