Problem to solve: in e-commerce, some black-hat operators use scripts to auto-register large numbers of spam shops, and many of these spam shops carry meaningless, garbled names such as "唇评照桌" or "脑冻砸路忻故". We therefore train a binary classifier over Chinese text to tell spam shop names from normal ones.
This article covers:
1) Fine-tuning Paddle's pretrained ernie_tiny model for binary classification of Chinese text
2) Running predictions with the dynamic-graph model
3) Converting the dynamic-graph model to a static-graph model
4) Loading the static-graph model and running predictions with it
Note: a dynamic-graph model is convenient to debug but slow at inference, while a static-graph model is harder to debug but fast at inference. The usual practice is to train a dynamic-graph model, convert it to a static-graph model, and deploy the static model for online prediction.
import pandas as pd
import paddle
import paddlehub as hub
import ast
import argparse
from paddlehub.datasets.base_nlp_dataset import TextClassificationDataset
class MyDataset(TextClassificationDataset):
    # Directory where the dataset files live
    base_path = 'data/shop_name'
    # Label list for the dataset; for multi-class tasks use the form ['0', '1', '2', '3', ...]
    label_list = ['0', '1']

    def __init__(self, tokenizer, max_seq_len: int = 10, mode: str = 'train'):
        if mode == 'train':
            data_file = 'train.tsv'
        elif mode == 'test':
            data_file = 'test.tsv'
        else:
            data_file = 'dev.tsv'
        super().__init__(
            base_path=self.base_path,
            tokenizer=tokenizer,
            max_seq_len=max_seq_len,
            mode=mode,
            data_file=data_file,
            label_list=self.label_list,
            is_file_with_header=True)
# Convert the raw data to tsv and split it 80/10/10 into train/dev/test
file_path = "data/shop_name/shop_name_train.csv"
text = pd.read_csv(file_path, sep="\t")
text = text.sample(frac=1)  # shuffle the dataset
print(len(text))
train = text[:int(len(text) * 0.8)]
dev = text[int(len(text) * 0.8):int(len(text) * 0.9)]
test = text[int(len(text) * 0.9):]
# Keep the header row: MyDataset is constructed with is_file_with_header=True,
# so each tsv must start with a header line
train.to_csv('data/shop_name/train.tsv', sep='\t', index=False, mode="w")
dev.to_csv('data/shop_name/dev.tsv', sep='\t', index=False, mode="w")
test.to_csv('data/shop_name/test.tsv', sep='\t', index=False, mode="w")
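With the header kept, each file starts with the column names followed by one labeled example per line. Illustrative contents (assuming the source csv uses the conventional label / text_a column names; the label sits in the first column, which the distribution check below relies on):

label	text_a
1	唇评照桌
0	小博士书品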
# Verify that the label distribution is roughly even across train/dev/test
for file in ['train', 'dev', 'test']:
    file_path = f"data/shop_name/{file}.tsv"
    text = pd.read_csv(file_path, sep="\t")  # the first row is the header written above
    labels = text.iloc[:, 0]  # the label is the first column
    prob = dict()
    total = len(labels)
    for i in labels:
        if prob.get(i) is None:
            prob[i] = 1
        else:
            prob[i] += 1
    # Sort by label
    prob = {i[0]: round(i[1] / total, 3) for i in sorted(prob.items(), key=lambda k: k[0])}
    print(file, prob, total)
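The same check can be written more concisely with pandas built-ins (a sketch, under the same assumption that the label is the first column):

for file in ['train', 'dev', 'test']:
    labels = pd.read_csv(f"data/shop_name/{file}.tsv", sep="\t").iloc[:, 0]
    print(file, labels.value_counts(normalize=True).round(3).to_dict(), len(labels))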
parser = argparse.ArgumentParser(__doc__)
parser.add_argument("--num_epoch", type=int, default=1, help="Number of epochs for fine-tuning.")
parser.add_argument("--use_gpu", type=ast.literal_eval, default=False,
                    help="Whether to use GPU for fine-tuning; input should be True or False.")
parser.add_argument("--learning_rate", type=float, default=1e-4, help="Learning rate used to train with warmup.")
parser.add_argument("--max_seq_len", type=int, default=10, help="Number of words in the longest sequence.")
parser.add_argument("--batch_size", type=int, default=256, help="Total number of examples per training batch.")
parser.add_argument("--checkpoint_dir", type=str, default='./ernie_checkpoint/shop_name',
                    help="Directory for model checkpoints.")
parser.add_argument("--save_interval", type=int, default=1, help="Save a checkpoint every n epochs.")
args = parser.parse_args()
# NOTE: max_seq_len is a tunable parameter with a recommended value of 128; adjust it
# to the text length of your task, but never above 512. Shop names are short, so 10 is used here.
# The tokenizer converts raw input text into the form the model can consume.
# Pretrained models in PaddleHub 2.0 ship with a matching tokenizer, available via model.get_tokenizer().
# Choose the model, the task, and the number of classes:
# task: the fine-tuning task; 'seq-cls' here means text classification.
# num_classes: the number of classes in the current text classification task,
#   determined by the dataset in use; the default is 2.
# model is initialized as a model suited to text classification: the pretrained
# ERNIE model with a fully connected layer appended.
model = hub.Module(name='ernie_tiny', task='seq-cls', num_classes=len(MyDataset.label_list))
train_dataset = MyDataset(tokenizer=model.get_tokenizer(), max_seq_len=args.max_seq_len, mode='train')
dev_dataset = MyDataset(tokenizer=model.get_tokenizer(), max_seq_len=args.max_seq_len, mode='dev')
test_dataset = MyDataset(tokenizer=model.get_tokenizer(), max_seq_len=args.max_seq_len, mode='test')
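For intuition, the tokenizer maps a raw shop name to the padded id arrays the model consumes; a quick check (a sketch; the exact ids depend on the ernie_tiny vocab):

tokenizer = model.get_tokenizer()
encoded = tokenizer('唇评照桌', max_seq_len=args.max_seq_len, pad_to_max_seq_len=True)
print(encoded['input_ids'])       # token ids, padded to max_seq_len
print(encoded['token_type_ids'])  # segment ids, all 0 for single-sentence input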
# Choose the optimization strategy and the run configuration
optimizer = paddle.optimizer.Adam(learning_rate=args.learning_rate, parameters=model.parameters())
trainer = hub.Trainer(model, optimizer, checkpoint_dir=args.checkpoint_dir, use_gpu=args.use_gpu)
# Run fine-tuning, evaluating on the dev set as training proceeds
trainer.train(train_dataset, epochs=args.num_epoch, batch_size=args.batch_size, eval_dataset=dev_dataset,
              save_interval=args.save_interval)
# Evaluate the trained model on the test set
trainer.evaluate(test_dataset, batch_size=args.batch_size)
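Assuming the training code above is saved as, say, train_shop_name.py (a hypothetical file name), CPU fine-tuning runs with:

python train_shop_name.py --num_epoch 1 --batch_size 256 --max_seq_len 10 --learning_rate 1e-4 --use_gpu False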
Training output:
[2022-06-16 14:37:48,957] [ TRAIN] - Epoch=1/1, Step=10/125 loss=0.4091 acc=0.8008 lr=0.000100 step/sec=0.13 | ETA 00:16:01
[2022-06-16 14:39:20,914] [ TRAIN] - Epoch=1/1, Step=20/125 loss=0.1642 acc=0.9363 lr=0.000100 step/sec=0.11 | ETA 00:17:35
[2022-06-16 14:40:58,558] [ TRAIN] - Epoch=1/1, Step=30/125 loss=0.1113 acc=0.9602 lr=0.000100 step/sec=0.10 | ETA 00:18:30
[2022-06-16 14:42:41,333] [ TRAIN] - Epoch=1/1, Step=40/125 loss=0.0905 acc=0.9684 lr=0.000100 step/sec=0.10 | ETA 00:19:14
[2022-06-16 14:44:15,565] [ TRAIN] - Epoch=1/1, Step=50/125 loss=0.0864 acc=0.9691 lr=0.000100 step/sec=0.11 | ETA 00:19:18
[2022-06-16 14:45:51,437] [ TRAIN] - Epoch=1/1, Step=60/125 loss=0.0846 acc=0.9691 lr=0.000100 step/sec=0.10 | ETA 00:19:25
[2022-06-16 14:47:41,354] [ TRAIN] - Epoch=1/1, Step=70/125 loss=0.0649 acc=0.9789 lr=0.000100 step/sec=0.09 | ETA 00:19:55
[2022-06-16 14:49:28,905] [ TRAIN] - Epoch=1/1, Step=80/125 loss=0.0781 acc=0.9762 lr=0.000100 step/sec=0.09 | ETA 00:20:13
[2022-06-16 14:51:19,631] [ TRAIN] - Epoch=1/1, Step=90/125 loss=0.0513 acc=0.9832 lr=0.000100 step/sec=0.09 | ETA 00:20:32
[2022-06-16 14:53:24,563] [ TRAIN] - Epoch=1/1, Step=100/125 loss=0.0745 acc=0.9762 lr=0.000100 step/sec=0.08 | ETA 00:21:05
[2022-06-16 14:55:09,146] [ TRAIN] - Epoch=1/1, Step=110/125 loss=0.0625 acc=0.9801 lr=0.000100 step/sec=0.10 | ETA 00:21:09
[2022-06-16 14:56:54,613] [ TRAIN] - Epoch=1/1, Step=120/125 loss=0.0674 acc=0.9797 lr=0.000100 step/sec=0.09 | ETA 00:21:13
[2022-06-16 14:58:24,199] [ EVAL] - [Evaluation result] avg_acc=0.9782
[2022-06-16 14:58:28,934] [ EVAL] - Saving best model to ./ernie_checkpoint/shop_name/best_model [best acc=0.9782]
[2022-06-16 14:58:28,938] [ INFO] - Saving model checkpoint to ./ernie_checkpoint/shop_name/epoch_1
# coding:utf-8
import paddle
import paddlehub as hub
# Once fine-tuning is done, we load the best saved model to run predictions. The full prediction code:
data = [
    ['唇评照桌'],
    ['谫背屯投谮'],
    ['槿栀马'],
    ['脑冻砸路忻故'],
    ['织谜毛'],
    ['梦碎人亦醒骆'],
    ['值衙晌丛屡克'],
    ['只为守护你诸'],
    ['挂甲台'],
    ['来一口章鱼烧'],
    ['大大的电包游戏充值'],
    ['小巫的解忧杂货铺'],
    ['GOLBALBY'],
    ['地之礼精油'],
    ['智童大朗校服工厂店'],
    ['DBA宝典'],
    ['沐春茶业'],
    ['字字乾坤'],
    ['X-fan'],
    ['THEONE'],
    ['Gailei买买店'],
    ['小博士书品']
]
label_map = {0: 'normal', 1: 'spam'}
model = hub.Module(
    name='ernie_tiny',
    version='2.0.1',
    task='seq-cls',
    load_checkpoint='ernie_checkpoint/shop_name/best_model/model.pdparams',
    label_map=label_map)
results = model.predict(data, max_seq_len=10, batch_size=1, use_gpu=False)  # max_seq_len matches training
for idx, text in enumerate(data):
    print('Data: {} \t Label: {}'.format(text[0], results[idx]))
import paddle
import paddlehub as hub
from paddle.static import InputSpec
label_map = {0: 'normal', 1: 'spam'}
# Load the dynamic-graph model
model = hub.Module(
    name='ernie_tiny',
    version='2.0.1',
    task='seq-cls',
    load_checkpoint='ernie_checkpoint/shop_name/best_model/model.pdparams',
    label_map=label_map)
#print(model)
# input_spec declares the shapes the static model accepts: [batch size, max_seq_len].
# NOTE: both inputs (input_ids and token_type_ids) are padded to max_seq_len at
# inference time, so both specs are [1, 10] here so that they match the tensors
# the inference code below feeds in.
static_model = paddle.jit.to_static(model,
                                    input_spec=[InputSpec(shape=[1, 10], dtype='int64', name='x'),
                                                InputSpec(shape=[1, 10], dtype='int64', name='y')
                                                ])  # dynamic-to-static conversion
paddle.jit.save(static_model, 'ernie_model/shop_name/model')
After it runs successfully, three files appear under the ernie_model/shop_name directory:
model.pdiparams  model.pdiparams.info  model.pdmodel
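As a quick sanity check, the exported static model can be reloaded with paddle.jit.load and run once to confirm the input and output shapes (a sketch; the all-zero ids are placeholders, not meaningful input):

loaded = paddle.jit.load('ernie_model/shop_name/model')
loaded.eval()
input_ids = paddle.zeros([1, 10], dtype='int64')       # [batch, max_seq_len]
token_type_ids = paddle.zeros([1, 10], dtype='int64')
out = loaded(input_ids, token_type_ids)
print(out.shape)  # expected: [1, 2], one score per class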
from paddle_predict import ErnieTinyPredict
import sys

def load_model(model_dir):
    print('prepare to load model!')
    model = ErnieTinyPredict(model_dir=model_dir)
    print('already load model!')
    return model

def model_predict(model, item_name):
    result = model.predict([item_name])
    # print(item_name + "," + str(result[0]))
    return result

def predict_data(test_file):
    model = load_model('ernie_model/shop_name')
    for line in open(test_file):
        try:
            name = line.strip().split('\t')[1]  # the shop name is the second tab-separated column
            result = model_predict(model, name)
            positive_prediction = result[0]
            negative_prediction = 1 - positive_prediction
            # label = '1' if positive_prediction >= 0.9 else '0'
            row = '\t'.join([name, str(negative_prediction), str(positive_prediction)])
            print(row)
        except Exception as e:
            print(e)

predict_data(sys.argv[1])  # path to the file holding the data to predict
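Assuming the driver above is saved as predict_static.py (a hypothetical name) and the input is tab-separated with the shop name in the second column, it is invoked as:

python predict_static.py data/shop_name/test.tsv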
paddle_predict.py
import paddle.inference as paddle_infer
from paddlenlp.transformers import ErnieTinyTokenizer
from paddlenlp.data import Pad, Tuple  # batchify helpers; this Tuple is paddlenlp's, not typing's
import os

class ErnieTinyPredict():
    def __init__(self, model_dir=""):
        # Must match the max_seq_len baked into the exported static model's
        # InputSpec ([1, 10] above); a longer padded sequence would not fit.
        self.max_seq_len = 10
        self.threshold = 0.985  # 0.75
        self.tokenizer = ErnieTinyTokenizer.from_pretrained("ernie-tiny", vocab_file="vocab.txt",
                                                            sentencepiece_model_file="spm_cased_simp_sampled.model",
                                                            word_dict="dict.wordseg.pickle")
        self.data_batchify_fn = Tuple(
            Pad(axis=0, pad_val=self.tokenizer.pad_token_id),       # pads input_ids
            Pad(axis=0, pad_val=self.tokenizer.pad_token_type_id)   # pads token_type_ids
        )
        assert model_dir
        config = paddle_infer.Config(
            os.path.join(model_dir, "model.pdmodel"),
            os.path.join(model_dir, "model.pdiparams"))
        # config.disable_gpu()
        config.enable_mkldnn()
        config.set_cpu_math_library_num_threads(2)
        config.switch_ir_optim()
        config.enable_memory_optim()
        self.Predictor = paddle_infer.create_predictor(config)
        self.input_handles = [self.Predictor.get_input_handle(name)
                              for name in self.Predictor.get_input_names()]
    def text_token(self, data):
        # Tokenize each text and batchify it into (input_ids, token_type_ids) arrays
        ds_list = []
        for text in data:
            token = self.tokenizer(
                text, max_seq_len=self.max_seq_len, pad_to_max_seq_len=True,
                is_split_into_words=False)
            ds = self.data_batchify_fn(
                [(token["input_ids"], token["token_type_ids"])])
            ds_list.append(ds)
        return ds_list
    def predict(self, data):
        ds_list = self.text_token(data)
        result = []
        for ds in ds_list:
            # Feed input_ids and token_type_ids into the corresponding input handles
            for input_field, input_handle in zip(ds, self.input_handles):
                input_handle.copy_from_cpu(input_field)
            self.Predictor.run()
            out_names = self.Predictor.get_output_names()
            out_handle = self.Predictor.get_output_handle(out_names[0])
            out_data = out_handle.copy_to_cpu()
            # out_data[0] holds the per-class scores; index 1 is class '1' (spam)
            spam_prob = float(out_data[0][1])
            result.append(spam_prob)
        return result
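The class can also be used directly, without the driver script; a minimal sketch, assuming the static model exported above sits in ernie_model/shop_name:

if __name__ == '__main__':
    predictor = ErnieTinyPredict(model_dir='ernie_model/shop_name')
    names = ['唇评照桌', '小博士书品']
    for name, spam_prob in zip(names, predictor.predict(names)):
        print(name, round(spam_prob, 4))  # probability of the 'spam' class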