Must read: [Reference: "Paper notes: Convolutional Neural Networks for Sentence Classification — sentence classification with CNN", 小千同学超级爱写代码, 博客园]
[Reference: "Applications of convolution in NLP, using TextCNN as an example", bilibili]
[Reference: "Paper reading: Convolutional Neural Networks for Sentence Classification", 南有芙蕖, CSDN blog]
[Reference: "TextCNN Tianchi lecture", bilibili] — explained very well (includes a PyTorch code walkthrough)
Companion code: [Reference: "Datawhale NLP beginner competition — Task 5: text classification with deep learning, 2-2 TextCNN", Tianchi Lab (free online notebook with compute)]
The code can also be found by searching [Reference: Tianchi Lab — real-time online data-analysis collaboration tool with free compute]
Reference paper 1: "Convolutional Neural Networks for Sentence Classification" (2014)
In the figure, the red line marks a window of size 2: the filter covers two words at a time for feature extraction. The yellow line marks a window of size 3, covering three words at a time. In other words, the "window" is how many words the filter acts on at once, which in the figure corresponds to how many rows the filter spans as it slides.
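To make the "window" concrete, here is a minimal sketch (shapes chosen for illustration, not taken from the paper's code) showing that a kernel of height h slides over the word axis only and yields L - h + 1 positions:

import torch
import torch.nn as nn

L, emb = 7, 5                                  # 7 words, 5-dim embeddings
x = torch.randn(1, 1, L, emb)                  # [batch, channel, words, emb]
conv2 = nn.Conv2d(1, 1, kernel_size=(2, emb))  # window of 2 words (red line)
conv3 = nn.Conv2d(1, 1, kernel_size=(3, emb))  # window of 3 words (yellow line)
print(conv2(x).shape)  # torch.Size([1, 1, 6, 1]) -> 7-2+1 = 6 positions
print(conv3(x).shape)  # torch.Size([1, 1, 5, 1]) -> 7-3+1 = 5 positions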
Reference paper 2: "A Sensitivity Analysis of (and Practitioners' Guide to) Convolutional Neural Networks for Sentence Classification" (2016)
A tuning guide for TextCNN (this is the variant in common use today).
The figure shows two kernels with window size 2 (yellow), two with window size 3 (green), and two with window size 4 (red).
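A hedged sketch of this multi-window setup (layer names, vocabulary size, and dimensions below are illustrative assumptions, not from the paper): each window size gets its own Conv2d branch, each branch is max-pooled over time, and the pooled features are concatenated before the classifier.

import torch
import torch.nn as nn
import torch.nn.functional as F

class MultiWindowTextCNN(nn.Module):
    def __init__(self, vocab_size=20, emb=8, num_classes=2,
                 filter_sizes=(2, 3, 4), num_filters=2):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb)
        # one conv branch per window size
        self.convs = nn.ModuleList(
            nn.Conv2d(1, num_filters, (h, emb)) for h in filter_sizes)
        self.fc = nn.Linear(num_filters * len(filter_sizes), num_classes)

    def forward(self, x):                       # x: [batch, seq_len]
        x = self.emb(x).unsqueeze(1)            # [batch, 1, seq_len, emb]
        # max-over-time pooling per branch, then concatenate all branches
        pooled = [F.relu(conv(x)).squeeze(3).max(dim=2).values
                  for conv in self.convs]       # each [batch, num_filters]
        return self.fc(torch.cat(pooled, dim=1))

logits = MultiWindowTextCNN()(torch.randint(0, 20, (4, 10)))
print(logits.shape)  # torch.Size([4, 2])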
[Reference: "A PyTorch implementation of TextCNN", bilibili]
Companion article (written in great detail): https://wmathor.com/index.php/archives/1445/
import torch
import numpy as np
import torch.optim as optim
import torch.utils.data as Data
import torch.nn.functional as F
dtype = torch.FloatTensor  # defined but never used below
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# 3 words sentences (=sequence_length is 3)
sentences = ["i love you", "he loves me", "she likes baseball", "i hate you", "sorry for that", "this is awful"]
labels = [1, 1, 1, 0, 0, 0] # 1 is good, 0 is not good.
# TextCNN hyperparameters
embedding_size = 2  # each word is represented by a 2-dimensional vector
sequence_length = len(sentences[0].split())  # 3; all sentences here are assumed to be the same length (three words)
num_classes = len(set(labels))  # 2
batch_size = 3

word_list = " ".join(sentences).split()  # every word in sentences, duplicates included
vocab = list(set(word_list))  # vocabulary: each distinct word in sentences
word2idx = {w: i for i, w in enumerate(vocab)}  # word -> index
vocab_size = len(vocab)
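A quick check of the mapping just built; note that vocab comes from set(), so the exact word-to-index assignment varies between runs (the tensors printed further below are from one particular run):

print(vocab_size)  # 16 distinct words
print(word2idx)    # e.g. {'i': 4, 'love': 11, ...} -- order differs per run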
def make_data(sentences, labels):
    inputs = []
    for sentence in sentences:
        inputs.append([word2idx[n] for n in sentence.split()])  # turn each sentence into its index sequence
    targets = []
    for out in labels:
        targets.append(out)
    return inputs, targets

input_batch, target_batch = make_data(sentences, labels)
input_batch, target_batch = torch.LongTensor(input_batch), torch.LongTensor(target_batch)

dataset = Data.TensorDataset(input_batch, target_batch)
loader = Data.DataLoader(dataset, batch_size, True)  # shuffle=True
input_batch
tensor([[ 4, 11, 14],
[ 5, 9, 10],
[15, 0, 12],
[ 4, 6, 14],
[ 1, 2, 13],
[ 7, 3, 8]])
target_batch
tensor([1, 1, 1, 0, 0, 0])
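Peeking at one mini-batch from the loader confirms the shapes (the contents vary per run because shuffle=True):

for batch_x, batch_y in loader:
    print(batch_x.shape, batch_y.shape)  # torch.Size([3, 3]) torch.Size([3])
    break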
from torch import nn

class TextCNN(nn.Module):
    def __init__(self):
        super(TextCNN, self).__init__()
        self.W = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_size)
        out_channels = 3
        self.conv = nn.Sequential(
            # conv: [input_channel(=1), output_channel, kernel_size=(filter_height, filter_width=embedding_size), stride=1]
            # out_channels=3 means three filters are convolved with the input;
            # only a single kernel size is used here, with filter_height=2
            nn.Conv2d(in_channels=1, out_channels=out_channels, kernel_size=(2, embedding_size)),
            # output: [batch_size, out_channels=3, 2, 1] -- for each sample and channel, a 2x1 feature map
            nn.ReLU(),
            # pool: (filter_height, filter_width)
            nn.MaxPool2d(kernel_size=(2, 1))  # max-pool each 2x1 feature map down to 1x1
        )
        # fully connected layer for the binary classification output
        self.fc = nn.Linear(in_features=out_channels, out_features=num_classes)

    def forward(self, x):
        '''
        x: [batch_size, sequence_length]
        '''
        batch_size = x.shape[0]  # number of sentences
        # replace each index with its embedding, turning the matrix into a "cube":
        # e.g. in [[4, 11, 14], ...] the word with index 4 becomes its 2-d embedding vector
        embedding_x = self.W(x)  # [batch_size, sequence_length, embedding_size]
        # insert a size-1 channel dimension at position 1 (single channel, like a grayscale image);
        # conv layers expect the conventional CNN layout [batch_size, in_channel, height, width]
        embedding_x = embedding_x.unsqueeze(1)  # add channel(=1): [batch, channel(=1), sequence_length, embedding_size]
        conved = self.conv(embedding_x)  # [batch_size, output_channel, 1, 1]
        flatten = conved.view(batch_size, -1)  # [batch_size, output_channel*1*1]
        output = self.fc(flatten)
        return output
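As a quick sanity check of the shapes traced in the comments above (a throwaway instance, purely illustrative):

dummy = torch.randint(0, vocab_size, (6, sequence_length))
print(TextCNN()(dummy).shape)  # torch.Size([6, 2]) -- six sentences, two class logits each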
model = TextCNN().to(device=device)
criterion = nn.CrossEntropyLoss().to(device=device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

for epoch in range(5000):
    for batch_x, batch_y in loader:
        batch_x = batch_x.to(device=device)
        batch_y = batch_y.to(device=device)
        pred = model(batch_x)
        loss = criterion(pred, batch_y)
        if (epoch + 1) % 1000 == 0:
            print('Epoch:', '%04d' % (epoch + 1), 'loss =', '{:.6f}'.format(loss))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
Epoch: 1000 loss = 0.030200
Epoch: 1000 loss = 0.054546
Epoch: 2000 loss = 0.014919
Epoch: 2000 loss = 0.007824
Epoch: 3000 loss = 0.002666
Epoch: 3000 loss = 0.005158
Epoch: 4000 loss = 0.001931
Epoch: 4000 loss = 0.000988
Epoch: 5000 loss = 0.000379
Epoch: 5000 loss = 0.000743
# Test
test_text = 'i hate me'
tests = [[word2idx[n] for n in test_text.split()]]
test_batch = torch.LongTensor(tests).to(device)

# Predict
model = model.eval()
predict = model(test_batch).data.max(1, keepdim=True)[1]
if predict[0][0] == 0:
    print(test_text, "is Bad Mean...")
else:
    print(test_text, "is Good Mean!!")
i hate me is Bad Mean...
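The `.data.max(1, keepdim=True)[1]` idiom works but is dated; an equivalent, more readable form using torch.argmax (my rephrasing, not from the original article):

with torch.no_grad():
    pred = model(test_batch).argmax(dim=1)  # class index per sentence
print(test_text, "is Bad Mean..." if pred.item() == 0 else "is Good Mean!!")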
[Reference: nlp-tutorial/TextCNN.py at master · graykode/nlp-tutorial]
# %%
# code by Tae Hwan Jung @graykode
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

class TextCNN(nn.Module):
    def __init__(self):
        super(TextCNN, self).__init__()
        self.num_filters_total = num_filters * len(filter_sizes)
        self.W = nn.Embedding(vocab_size, embedding_size)
        self.Weight = nn.Linear(self.num_filters_total, num_classes, bias=False)
        self.Bias = nn.Parameter(torch.ones([num_classes]))
        self.filter_list = nn.ModuleList([
            nn.Conv2d(1, num_filters, kernel_size=(size, embedding_size))
            for size in filter_sizes])

    def forward(self, X):
        embedded_chars = self.W(X)  # [batch_size, sequence_length, embedding_size]
        # add channel(=1): [batch, channel(=1), sequence_length, embedding_size]
        embedded_chars = embedded_chars.unsqueeze(1)
        pooled_outputs = []
        for i, conv in enumerate(self.filter_list):
            # conv: [input_channel(=1), output_channel(=3), (filter_height, filter_width), bias_option]
            h = F.relu(conv(embedded_chars))  # h: [batch_size(=6), output_channel(=3), output_height(=2), output_width(=1)]
            # mp: (filter_height, filter_width)
            mp = nn.MaxPool2d((sequence_length - filter_sizes[i] + 1, 1))  # kernel (2, 1)
            # mp(h): [batch_size(=6), output_channel(=3), output_height(=1), output_width(=1)]
            # pooled: [batch_size(=6), output_height(=1), output_width(=1), output_channel(=3)]
            pooled = mp(h).permute(0, 3, 2, 1)
            pooled_outputs.append(pooled)
        h_pool = torch.cat(pooled_outputs, len(filter_sizes))  # [batch_size(=6), output_height(=1), output_width(=1), output_channel(=3) * 3]
        h_pool_flat = torch.reshape(h_pool, [-1, self.num_filters_total])  # [batch_size(=6), output_height * output_width * (output_channel * 3)]
        model = self.Weight(h_pool_flat) + self.Bias  # [batch_size, num_classes]
        return model
if __name__ == '__main__':
    embedding_size = 2  # embedding size
    sequence_length = 3  # sequence length
    num_classes = 2  # number of classes
    # the paper uses sizes 2, 3, 4
    filter_sizes = [2, 2, 2]  # n-gram windows; each kernel is [filter_size, embedding_size]
    num_filters = 3  # number of filters; each conv produces 3 output channels

    # 3 words sentences (=sequence_length is 3)
    sentences = ["i love you", "he loves me", "she likes baseball", "i hate you", "sorry for that", "this is awful"]
    labels = [1, 1, 1, 0, 0, 0]  # 1 is good, 0 is not good.

    word_list = " ".join(sentences).split()
    word_list = list(set(word_list))
    word_dict = {w: i for i, w in enumerate(word_list)}
    vocab_size = len(word_dict)

    model = TextCNN()

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    inputs = torch.LongTensor([np.asarray([word_dict[n] for n in sen.split()]) for sen in sentences])
    targets = torch.LongTensor([out for out in labels])  # class indices, as expected by CrossEntropyLoss

    # Training
    for epoch in range(5000):
        optimizer.zero_grad()
        output = model(inputs)
        # output: [batch_size, num_classes], target_batch: [batch_size] (LongTensor, not one-hot)
        loss = criterion(output, targets)
        if (epoch + 1) % 1000 == 0:
            print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))
        loss.backward()
        optimizer.step()

    # Test
    test_text = 'sorry hate you'
    tests = [np.asarray([word_dict[n] for n in test_text.split()])]
    test_batch = torch.LongTensor(tests)

    # Predict
    predict = model(test_batch).data.max(1, keepdim=True)[1]
    if predict[0][0] == 0:
        print(test_text, "is Bad Mean...")
    else:
        print(test_text, "is Good Mean!!")
[Reference: "TextCNN text classification, reproduced line by line" by 手写AI, bilibili]