I've read many articles explaining the principles behind word2vec. I understood them, but my understanding still felt a bit shallow.
Below is a straightforward Python implementation of the word2vec algorithm. It's simple and clear; once you've read through it, the algorithm clicks.
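Concretely, the code below trains a skip-gram model with a plain full-softmax output layer (no negative sampling or hierarchical softmax): for each (center, context) word pair, the center word's one-hot vector is projected through `W1` to its embedding, scored against every vocabulary word through `W2`, and one gradient step is taken on the softmax cross-entropy loss

$$\mathcal{L} = -u_{\text{context}} + \log \sum_{j} e^{u_j}, \qquad u = (x_{\text{center}} W_1)\, W_2,$$

which is exactly the quantity accumulated in the `loss` variable inside `train_word2vec`.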
```python
import numpy as np

def tokenize(text):
    return text.lower().split()

def generate_word_pairs(sentences, window_size):
    # Collect (center_word, context_word) pairs within the sliding window.
    word_pairs = []
    for sentence in sentences:
        for i, center_word in enumerate(sentence):
            for j in range(i - window_size, i + window_size + 1):
                if 0 <= j < len(sentence) and j != i:
                    context_word = sentence[j]
                    word_pairs.append((center_word, context_word))
    return word_pairs

def create_word_index(sentences):
    # Map every distinct word to an integer index.
    word_set = set(word for sentence in sentences for word in sentence)
    return {word: i for i, word in enumerate(word_set)}

def one_hot_encoding(word, word_index):
    one_hot = np.zeros(len(word_index))
    one_hot[word_index[word]] = 1
    return one_hot

def train_word2vec(sentences, vector_size, window_size, learning_rate, epochs):
    word_index = create_word_index(sentences)
    W1 = np.random.rand(len(word_index), vector_size)   # input -> hidden (the word embeddings)
    W2 = np.random.rand(vector_size, len(word_index))   # hidden -> output

    word_pairs = generate_word_pairs(sentences, window_size)

    for epoch in range(epochs):
        loss = 0
        for center_word, context_word in word_pairs:
            center_word_encoded = one_hot_encoding(center_word, word_index)
            context_word_encoded = one_hot_encoding(context_word, word_index)

            # Forward pass: look up the center word's vector and score every vocabulary word.
            hidden_layer = np.dot(center_word_encoded, W1)
            output_layer = np.dot(hidden_layer, W2)

            exp_output = np.exp(output_layer)
            softmax_output = exp_output / np.sum(exp_output)

            # Backward pass: gradient of the softmax cross-entropy loss.
            error = softmax_output - context_word_encoded

            dW2 = np.outer(hidden_layer, error)
            dW1 = np.outer(center_word_encoded, np.dot(W2, error))

            W1 -= learning_rate * dW1
            W2 -= learning_rate * dW2

            # Cross-entropy loss: -score(context) + log(sum of exp scores).
            loss += -np.sum(output_layer * context_word_encoded) + np.log(np.sum(exp_output))

        print(f"Epoch: {epoch + 1}, Loss: {loss}")

    return W1, word_index

sentences = [
    tokenize("This is a sample sentence"),
    tokenize("Another example sentence"),
    tokenize("One more example")
]

vector_size = 100
window_size = 2
learning_rate = 0.01
epochs = 100

W1, word_index = train_word2vec(sentences, vector_size, window_size, learning_rate, epochs)

for word, index in word_index.items():
    print(f"{word}: {W1[index]}")
```