• News Text Classification via BERT Fine-Tuning with PaddleNLP


    Contents

    1. Data Preprocessing

    2. Loading the Model

    3. Batch Training

    4. Accuracy

    1. Data Preprocessing

    Import the required libraries

    import numpy as np
    from paddle.io import DataLoader, TensorDataset
    from paddlenlp.transformers import BertForSequenceClassification, BertTokenizer
    from sklearn.model_selection import train_test_split
    import paddle
    import matplotlib.pyplot as plt
    import jieba

    Training set format: label ID + \t + label name + \t + original headline (a parsing sketch follows below).

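    For illustration, a minimal sketch of how one such line splits into its three fields; the sample row below is made up and not taken from the dataset:

    # Hypothetical example row (real rows come from Train.txt)
    line = '3\t股票\t某公司发布年度财报'
    label_id, label, title = line.split('\t')
    print(label_id, label, title)  # -> 3 股票 某公司发布年度财报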

    Remove stopwords

    stop = []
    with open('stop.txt', mode='r', encoding='utf-8') as f:
        stop = f.read().split('\n')
    stop_word = {}
    for s in stop:
        stop_word[s] = True  # dict gives O(1) membership tests

    def remove_stopwords(datas):
        # datas is an iterable of tokens (e.g. the generator returned by jieba.cut)
        filtered_words = [text for text in datas if text not in stop_word]
        return ' '.join(filtered_words)
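
    A quick sanity check of the helper, assuming stop.txt contains common function words such as 的 (the example sentence is made up):

    print(remove_stopwords(jieba.cut('今天的股票市场大涨')))
    # expected output along the lines of: 今天 股票 市场 大涨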

    Tokenize the text and convert it to fixed-length token ID sequences

    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

    def convert(datas, max_seq_length=40):
        ans = []
        for text in datas:
            input_ids = tokenizer(text, max_seq_len=max_seq_length)['input_ids']
            input_ids = input_ids[:max_seq_length]  # truncate
            input_ids = input_ids + [tokenizer.pad_token_id] * (max_seq_length - len(input_ids))  # pad
            ans.append(input_ids)
        return ans
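
    A quick check that every converted sample comes out at the fixed length (the input string is made up):

    ids = convert(['股票 市场 大涨'])[0]
    print(len(ids))  # 40: each sequence is truncated or padded to max_seq_length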

    Load the data and run the preprocessing (a link to the dataset is at the end).

    contents = []
    datas = []
    labels = []
    with open('data/data126283/data/Train.txt', mode='r', encoding='utf-8') as f:
        contents = f.read().split('\n')
    for item in contents:
        if item == '':
            continue
        labels.append(item.split('\t')[0])
        # Segment the headline with jieba, then strip stopwords
        datas.append(remove_stopwords(jieba.cut(item.split('\t')[-1])))
    datas = convert(datas)

     

    2. Loading the Model

    Load the pretrained model and freeze most of its parameters

    model = BertForSequenceClassification.from_pretrained('bert-base-chinese')
    model.classifier = paddle.nn.Linear(768, 14)  # 14 news categories
    for name, param in model.named_parameters():
        # Keep only the classifier head, the pooler, and the last encoder layer trainable
        if 'classifier' not in name and 'bert.pooler.dense' not in name and 'bert.encoder.layers.11' not in name:
            param.stop_gradient = True
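
    A minimal sanity check that the freezing worked as intended:

    trainable = [name for name, p in model.named_parameters() if not p.stop_gradient]
    print(trainable)  # should list only the last encoder layer, the pooler, and the classifier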

    P.S.: Training only the classifier head gives noticeably worse results.

    Set the hyperparameters. The learning rate was initially tried in the 0.01–0.1 range; the value actually used below is 0.001.

    epochs = 2
    batch_size = 1024 * 4
    learning_rate = 0.001

    Loss function and optimizer

    criterion = paddle.nn.CrossEntropyLoss()
    optimizer = paddle.optimizer.Adam(learning_rate=learning_rate, parameters=model.parameters())
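
    Since most parameters are frozen, Adam can optionally be handed just the trainable subset; a minimal sketch of that variant (functionally equivalent here, since frozen parameters receive no gradients anyway):

    trainable_params = [p for p in model.parameters() if not p.stop_gradient]
    optimizer = paddle.optimizer.Adam(learning_rate=learning_rate, parameters=trainable_params)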

    3. Batch Training

    Split into training and test sets

    datas = np.array(datas)
    labels = np.array(labels).astype('int64')  # label IDs are read as strings; cast them for the loss
    x_train, x_test, y_train, y_test = train_test_split(datas, labels, random_state=42, test_size=0.2)
    train_dataset = TensorDataset([x_train, y_train])
    train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
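
    One batch can be pulled out to confirm the shapes before training; a quick sketch:

    batch_data, batch_label = next(iter(train_loader))
    print(batch_data.shape, batch_label.shape)  # expect [batch_size, 40] and [batch_size]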

    Train over the batches and plot the loss curve

    total_loss = []
    for epoch in range(epochs):
        for batch_data, batch_label in train_loader:
            batch_label = paddle.to_tensor(batch_label, dtype='int64')
            batch_data = paddle.to_tensor(batch_data, dtype='int64')
            outputs = model(batch_data)
            loss = criterion(outputs, batch_label)
            print(epoch, loss.numpy()[0])
            total_loss.append(loss.numpy()[0])
            optimizer.clear_grad()
            loss.backward()
            optimizer.step()
    # Save the model and optimizer state once training finishes
    paddle.save({'model': model.state_dict()}, 'model.param')
    paddle.save({'optimizer': optimizer.state_dict()}, 'optimizer.param')
    plt.plot(range(len(total_loss)), total_loss)
    plt.show()
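
    The saved checkpoint can be restored later with the matching paddle.load / set_state_dict calls; a minimal sketch:

    state = paddle.load('model.param')
    model.set_state_dict(state['model'])
    opt_state = paddle.load('optimizer.param')
    optimizer.set_state_dict(opt_state['optimizer'])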

    4. Accuracy

    Apply the same procedure to the test set and check the accuracy

    total_loss = []
    x_test = np.array(x_test)
    y_test = np.array(y_test)
    test_dataset = TensorDataset([x_test, y_test])
    test_loader = DataLoader(test_dataset, shuffle=True, batch_size=batch_size)
    with paddle.no_grad():
        for batch_data, batch_label in test_loader:
            batch_label = paddle.to_tensor(batch_label, dtype='int64')
            batch_data = paddle.to_tensor(batch_data, dtype='int64')
            outputs = model(batch_data)
            loss = criterion(outputs, batch_label)
            print(loss)
            outputs = paddle.argmax(outputs, axis=1)
            total_loss.append(loss.numpy()[0])
            # Per-batch accuracy
            score = 0
            for predict, label in zip(outputs, batch_label):
                if predict == label:
                    score += 1
            print(score / len(batch_label))
    plt.plot(range(len(total_loss)), total_loss)
    plt.show()
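
    The loop above prints accuracy per batch; a sketch of accumulating a single accuracy figure over the whole test set instead:

    correct, total = 0, 0
    with paddle.no_grad():
        for batch_data, batch_label in test_loader:
            batch_data = paddle.to_tensor(batch_data, dtype='int64')
            batch_label = paddle.to_tensor(batch_label, dtype='int64')
            preds = paddle.argmax(model(batch_data), axis=1)
            correct += int((preds == batch_label).astype('int64').sum())
            total += batch_label.shape[0]
    print(correct / total)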

    Finally, output the required category names on the validation set

    # The 14 category names, indexed by predicted label ID
    arr = ['财经', '彩票', '房产', '股票', '家居', '教育', '科技', '社会', '时尚', '时政', '体育', '星座', '游戏', '娱乐']
    evals = []
    contents = []
    with open('data/data126283/data/Test.txt', mode='r', encoding='utf-8') as f:
        contents = f.read().split('\n')
    for item in contents:
        if item == '':
            continue
        evals.append(item)
    evals = convert(evals)
    evals = np.array(evals)
    with paddle.no_grad():
        for i in range(0, len(evals), 2048):
            batch_data = evals[i:i + 2048]
            batch_data = paddle.to_tensor(batch_data, dtype='int64')
            predict = model(batch_data)
            predict = paddle.argmax(predict, axis=1).numpy().tolist()
            print(i, len(predict))
            for j in range(len(predict)):
                predict[j] = arr[predict[j]]
            # Append this batch's predicted category names to the result file
            with open('result.txt', mode='a', encoding='utf-8') as f:
                f.write('\n'.join(predict))
                f.write('\n')

    P.S.: Note the final f.write('\n'). Since '\n'.join does not end with a newline, omitting it fuses the first line of each subsequent batch onto the last line of the previous one, so every write after the first comes out one line short. This is a nasty pitfall.
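
    A tiny illustration of the pitfall (the file name is made up): two appends without the trailing newline fuse across the batch boundary:

    with open('demo.txt', mode='a', encoding='utf-8') as f:
        f.write('\n'.join(['体育', '财经']))  # file now ends in ...财经 with no newline
    with open('demo.txt', mode='a', encoding='utf-8') as f:
        f.write('\n'.join(['股票', '家居']))  # '股票' fuses onto '财经' -> the line '财经股票'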

    A final loss of around 0.1–0.2 is normal, which works out to roughly 90% accuracy. Unfreezing more parameters will of course buy more accuracy, depending on what hardware you have; avoid the free-tier platform configurations, or training will be slower than a tortoise.

    Questions are welcome.

    Dataset
