• A performance comparison of paddle 2.3 and torch 1.8 on SentenceBert


    A note up front... the comparison turned out to be pointless: the paddle framework currently has a bug (or my code does). torch wins hands down.

    Contents

    I. Background

    II. Implementing SentenceBert in paddle and torch

    1. SentenceBert

    2. torch implementation

    3. paddle implementation

    III. Results comparison


    I. Background

            Why write a blog post comparing paddle and torch at all?

            Baidu's paddle ecosystem has open-sourced some very good models and weights, and being able to reuse them in future work makes learning paddle worthwhile. The ecosystem also keeps maturing: to match huggingface's transformers library and its model hub, Baidu ships paddlenlp.transformers and paddlehub, and the paddle framework itself keeps improving. Its API is very close to torch's, so the learning cost is low, which is all the more reason to get properly acquainted with it.

            Having gotten used to torch for deep learning, switching to paddle inevitably feels a little awkward, since there are some API differences, but the gap is far smaller than the one between torch and tensorflow. To make sure I actually finish a text-classification example in paddle, I am forcing myself to write up the problems and lessons from implementing SentenceBert for text classification, as a starting point for learning paddle.

            Overall, paddle is usable now and getting better; it ships open-source models for Chinese, and the gap with torch keeps shrinking. Good tools are worth mastering.

    SentenceBert is a classic approach for text classification and matching, and in my view it still holds up well: the results are good and it is easy to implement. Of course, prompt-based methods and Su Jianlin's CoSENT now beat SentenceBert on quality, but the point here is to implement a text-classification task in paddle and to compare it with torch in terms of accuracy and efficiency.

    II. Implementing SentenceBert in paddle and torch

    1. SentenceBert

    I won't go over the model details in depth; a diagram gives a quick overview.

     The two BERT encoders share parameters. The sentence vectors u and v and their element-wise difference |u-v| are concatenated to form the classification features. The pooling is also distinctive: mean pooling is used, and the zeros contributed by padding positions are taken into account so that padding does not distort the result.
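    Before the full implementation below, here is a minimal, self-contained sketch of that masked mean pooling, assuming a toy [B, L, H] hidden-state tensor and a [B, L] attention mask (shapes and values are made up purely for illustration):

    import torch

    # toy batch: 2 sequences, max length 4, hidden size 3
    hidden = torch.randn(2, 4, 3)                      # [B, L, H] token embeddings
    mask = torch.tensor([[1, 1, 0, 0], [1, 1, 1, 0]])  # [B, L], 1 = real token, 0 = padding

    mask_f = mask.unsqueeze(-1).float()                # [B, L, 1]
    summed = (hidden * mask_f).sum(dim=1)              # zero out padding, then sum over L
    counts = mask_f.sum(dim=1).clamp(min=1e-9)         # number of real tokens per sentence
    mean_pooled = summed / counts                      # [B, H] sentence embedding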

    2. torch implementation

    Model code: sentence_bert.py

    import torch.nn as nn
    from transformers import BertModel
    from transformers import BertPreTrainedModel
    import torch


    class SentenceBert(BertPreTrainedModel):
        def __init__(self, config):
            super(SentenceBert, self).__init__(config)
            self.bert = BertModel(config=config)
            self.classifier = nn.Linear(3 * config.hidden_size, config.num_labels)

        def forward(self, inputs):
            input_a = inputs[0]
            input_b = inputs[1]
            output_a = self.bert(**input_a, return_dict=True, output_hidden_states=True)
            output_b = self.bert(**input_b, return_dict=True, output_hidden_states=True)
            # use the last hidden layer
            embedding_a = output_a.hidden_states[-1]
            embedding_b = output_b.hidden_states[-1]
            embedding_a = self.pooling(embedding_a, input_a)
            embedding_b = self.pooling(embedding_b, input_b)
            embedding_abs = torch.abs(embedding_a - embedding_b)
            vectors_concat = []
            vectors_concat.append(embedding_a)
            vectors_concat.append(embedding_b)
            vectors_concat.append(embedding_abs)
            # concatenate the three 768-dim vectors along the feature axis -> 3*768
            features = torch.cat(vectors_concat, 1)
            output = self.classifier(features)
            return output

        def pooling(self, token_embeddings, input):
            output_vectors = []
            # attention_mask
            attention_mask = input['attention_mask']
            # [B,L] -> [B,L,1] -> [B,L,768]; the values are 0 or 1
            input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
            # element-wise product: padding positions are zeroed out, shape [B,L,768]
            t = token_embeddings * input_mask_expanded
            # [B,768]
            sum_embeddings = torch.sum(t, 1)
            # [B,768], values are at most seq_len
            sum_mask = input_mask_expanded.sum(1)
            # clamp every element to at least 1e-9 so the denominator is never 0
            sum_mask = torch.clamp(sum_mask, min=1e-9)
            # element-wise division gives the final mean-pooled embedding
            output_vectors.append(sum_embeddings / sum_mask)
            # column-wise concatenation
            output_vector = torch.cat(output_vectors, 1)
            return output_vector

    Data loading: dataReader_tsv.py

    from tqdm import tqdm
    import torch
    import pandas as pd


    class DataReader(object):
        def __init__(self, tokenizer, filepath, max_len):
            self.tokenizer = tokenizer
            self.filepath = filepath
            self.max_len = max_len
            self.dataList = self.datas_to_torachTensor()
            self.allLength = len(self.dataList)

        def convert_text2ids(self, text):
            text = text[0:self.max_len - 2]
            inputs = self.tokenizer(text)
            input_ids = inputs['input_ids']
            attention_mask = inputs['attention_mask']
            paddings = [0] * (self.max_len - len(input_ids))
            input_ids += paddings
            attention_mask += paddings
            token_type_id = [0] * self.max_len
            return input_ids, attention_mask, token_type_id

        def datas_to_torachTensor(self):
            df = pd.read_csv(self.filepath, sep='\t')
            df.dropna(inplace=True)
            lines_a = df['sentence1'].values.tolist()
            lines_b = df['sentence2'].values.tolist()
            labels = df['label'].values.tolist()
            res = []
            for line_a, line_b, label in tqdm(zip(lines_a, lines_b, labels), desc='tokenization', ncols=50):
                temp = []
                try:
                    if line_a != '' and len(line_a) > 0 and line_b != '' and len(line_b) > 0:
                        input_ids_a, attention_mask_a, token_type_id_a = self.convert_text2ids(text=line_a)
                        input_ids_a = torch.as_tensor(input_ids_a, dtype=torch.long)
                        attention_mask_a = torch.as_tensor(attention_mask_a, dtype=torch.long)
                        token_type_id_a = torch.as_tensor(token_type_id_a, dtype=torch.long)
                        temp.append(input_ids_a)
                        temp.append(attention_mask_a)
                        temp.append(token_type_id_a)
                        input_ids_b, attention_mask_b, token_type_id_b = self.convert_text2ids(text=line_b)
                        input_ids_b = torch.as_tensor(input_ids_b, dtype=torch.long)
                        attention_mask_b = torch.as_tensor(attention_mask_b, dtype=torch.long)
                        token_type_id_b = torch.as_tensor(token_type_id_b, dtype=torch.long)
                        temp.append(input_ids_b)
                        temp.append(attention_mask_b)
                        temp.append(token_type_id_b)
                        label = torch.as_tensor(label, dtype=torch.long)
                        temp.append(label)
                        res.append(temp)
                except Exception as e:
                    print(e)
            return res

        def __getitem__(self, item):
            input_ids_a = self.dataList[item][0]
            attention_mask_a = self.dataList[item][1]
            token_type_id_a = self.dataList[item][2]
            input_ids_b = self.dataList[item][3]
            attention_mask_b = self.dataList[item][4]
            token_type_id_b = self.dataList[item][5]
            label = self.dataList[item][6]
            return input_ids_a, attention_mask_a, token_type_id_a, input_ids_b, attention_mask_b, token_type_id_b, label

        def __len__(self):
            return self.allLength

    Model training: train_sentence_bert.py

    import torch
    import argparse
    from data_reader.dataReader_tsv import DataReader
    from model.sentence_bert import SentenceBert
    from torch.utils.data import DataLoader
    import torch.nn.functional as F
    from torch.optim import AdamW
    from torch.optim.lr_scheduler import ReduceLROnPlateau
    from transformers import BertTokenizer, BertConfig
    import os
    from tools.progressbar import ProgressBar
    from tools.log import Logger
    from datetime import datetime
    from torch.nn.utils.rnn import pad_sequence

    logger = Logger('sbert_loger', log_level=10, log_file="./log_output/sbert_pawsx_torch.log").logger
    os.environ['CUDA_VISIBLE_DEVICES'] = "0"


    def parse_args():
        parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
        parser.add_argument("--max_len", type=int, default=64)
        parser.add_argument("--train_file", type=str, default='./data/paws_x/translated_train.tsv', help="train text file")
        parser.add_argument("--val_file", type=str, default='./data/paws_x/dev_2k.tsv', help="val text file")
        parser.add_argument("--pretrained", type=str, default="./pretrained_models/torch/chinese-bert-wwm-ext", help="huggingface pretrained model")
        parser.add_argument("--model_out", type=str, default="./output/pawsx", help="model output path")
        parser.add_argument("--batch_size", type=int, default=128, help="batch size")
        parser.add_argument("--epochs", type=int, default=10, help="epochs")
        parser.add_argument("--lr", type=float, default=1e-5, help="learning rate")
        parser.add_argument("--task_type", type=str, default='classification')
        args = parser.parse_args()
        return args


    def collate_fn(batch):
        input_ids_a, attention_mask_a, token_type_id_a, input_ids_b, attention_mask_b, token_type_id_b, label = zip(*batch)
        input_ids_a = pad_sequence(input_ids_a, batch_first=True, padding_value=0)
        attention_mask_a = pad_sequence(attention_mask_a, batch_first=True, padding_value=0)
        token_type_id_a = pad_sequence(token_type_id_a, batch_first=True, padding_value=0)
        input_ids_b = pad_sequence(input_ids_b, batch_first=True, padding_value=0)
        attention_mask_b = pad_sequence(attention_mask_b, batch_first=True, padding_value=0)
        token_type_id_b = pad_sequence(token_type_id_b, batch_first=True, padding_value=0)
        label = torch.stack(label)
        return input_ids_a, attention_mask_a, token_type_id_a, input_ids_b, attention_mask_b, token_type_id_b, label


    def train(args):
        logger.info("args: %s", args)
        tokenizer = BertTokenizer.from_pretrained(args.pretrained)
        config = BertConfig.from_pretrained(args.pretrained)
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model = SentenceBert.from_pretrained(config=config, pretrained_model_name_or_path=args.pretrained)
        model.to(device)
        train_dataset = DataReader(tokenizer=tokenizer, filepath=args.train_file, max_len=args.max_len)
        train_dataloader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, collate_fn=collate_fn)
        val_dataset = DataReader(tokenizer=tokenizer, filepath=args.val_file, max_len=args.max_len)
        val_dataloader = DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False, collate_fn=collate_fn)
        optimizer = AdamW(model.parameters(), lr=args.lr)
        scheduler = ReduceLROnPlateau(optimizer=optimizer, mode='max', factor=0.5, patience=5)
        model.train()
        logger.info("***** Running training *****")
        logger.info("  Num batches = %d", len(train_dataloader))
        logger.info("  Num Epochs = %d", args.epochs)
        time_srt = datetime.now().strftime('%Y-%m-%d')
        save_path = os.path.join(args.model_out, "torch_" + time_srt)
        patience = 5
        no_best_epoch = 0
        best_acc = 0.0
        for epoch in range(args.epochs):
            pbar = ProgressBar(n_total=len(train_dataloader), desc='Training')
            for step, batch in enumerate(train_dataloader):
                batch = [t.to(device) for t in batch]
                inputs_a = {'input_ids': batch[0], 'attention_mask': batch[1], 'token_type_ids': batch[2]}
                inputs_b = {'input_ids': batch[3], 'attention_mask': batch[4], 'token_type_ids': batch[5]}
                labels = batch[6]
                inputs = []
                inputs.append(inputs_a)
                inputs.append(inputs_b)
                output = model(inputs)
                loss = F.cross_entropy(output, labels)
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                pbar(step, {'loss': loss.item()})
            val_acc = valdation(model, val_dataloader, device)
            model.train()  # valdation() switches the model to eval mode, so switch back before the next epoch
            scheduler.step(val_acc)
            if val_acc > best_acc:
                best_acc = val_acc
                if not os.path.exists(save_path):
                    os.makedirs(save_path)
                logger.info("save model")
                model.save_pretrained(save_path)
                tokenizer.save_vocabulary(save_path)
                no_best_epoch = 0
            else:
                no_best_epoch += 1
            logger.info("val_acc:%.4f------best_acc:%.4f" % (val_acc, best_acc))
            if no_best_epoch >= patience:
                logger.info("training finished because of no improvement")
                break


    def valdation(model, val_dataloader, device):
        total = 0
        total_correct = 0
        model.eval()
        with torch.no_grad():
            pbar = ProgressBar(n_total=len(val_dataloader), desc='evaluation')
            for step, batch in enumerate(val_dataloader):
                batch = [t.to(device) for t in batch]
                inputs_a = {'input_ids': batch[0], 'attention_mask': batch[1], 'token_type_ids': batch[2]}
                inputs_b = {'input_ids': batch[3], 'attention_mask': batch[4], 'token_type_ids': batch[5]}
                labels = batch[6]
                inputs = []
                inputs.append(inputs_a)
                inputs.append(inputs_b)
                output = model(inputs)
                pred = torch.argmax(output, dim=1)
                correct = (labels == pred).sum()
                total_correct += correct
                total += labels.size()[0]
                loss = F.cross_entropy(output, labels)
                pbar(step, {'loss': loss.item()})
        acc = total_correct / total
        return acc


    def main():
        torch.manual_seed(1)
        torch.cuda.manual_seed(1)
        args = parse_args()
        train(args)


    if __name__ == '__main__':
        main()

    The torch code is fairly straightforward, so I won't explain it further; the one thing worth calling out is to remember to set the random seed.

    torch.manual_seed(1)
    torch.cuda.manual_seed(1)
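    For stricter reproducibility you can go further than these two lines and also seed Python's random module and numpy and force deterministic cuDNN kernels. This is a general-purpose sketch, not something the benchmarked script above does:

    import random
    import numpy as np
    import torch

    def set_seed(seed: int = 1):
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        # trade a little speed for deterministic cuDNN kernels
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False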

    Now let's look at the paddle implementation.

    3. paddle implementation

    Model definition code

    The computation logic uses almost the same APIs as torch; only a few calls need renaming, for example:

    torch.cat()------------------paddle.concat()

    torch.clamp()------------------paddle.clip()

    ......

    For the rest, replacing torch with paddle is basically enough. The main difference in usage:

    In huggingface transformers, the BERT output is a dict-like object:

    bert(**input_a,return_dict=True, output_hidden_states=True).hidden_states[-1]

    In paddlenlp.transformers, the BERT output is a list: the first element holds the hidden_states of every layer, and the second element is pooled_output, i.e. the CLS vector. So the last layer's hidden_states are usually taken as:
    bert(**input_a, output_hidden_states=True)[0][-1]
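    As a quick runnable illustration of the renamed tensor ops (the variables below are toy tensors, not the model's real inputs; the two bert calls are shown only as comments):

    import torch
    import paddle

    u_t, v_t = torch.randn(2, 768), torch.randn(2, 768)
    u_p, v_p = paddle.randn([2, 768]), paddle.randn([2, 768])

    # torch.cat -> paddle.concat
    feat_t = torch.cat([u_t, v_t, torch.abs(u_t - v_t)], dim=-1)
    feat_p = paddle.concat([u_p, v_p, paddle.abs(u_p - v_p)], axis=-1)

    # torch.clamp -> paddle.clip
    den_t = torch.clamp(u_t.sum(1, keepdim=True), min=1e-9)
    den_p = paddle.clip(u_p.sum(1, keepdim=True), min=1e-9)

    # last-layer hidden states, as used in the two model classes:
    #   transformers: bert(**inp, return_dict=True, output_hidden_states=True).hidden_states[-1]
    #   paddlenlp:    bert(**inp, output_hidden_states=True)[0][-1]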

    Loading the model weights and config files also works differently.

    When you pass a model name, paddle automatically downloads the weights and then loads them:

    SentenceBert.from_pretrained('bert-wwm-ext-chinese')


    The weights and config are cached under /root/.paddlenlp/models/bert-wwm-ext-chinese (the downloaded files are shown in the figure above).

    To load a local pretrained BERT the way you would in torch, first let paddlenlp.transformers download the weights, then call model.save_pretrained(local_path) to write out the weights and the model config file (the automatic download does not come with a model config file), and tokenizer.save_pretrained(local_path) to save the vocabulary. The file names saved this way differ from the automatically downloaded ones.
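    A minimal sketch of that download-then-save workflow (the local path below is just an example; adjust it to your own layout):

    from paddlenlp.transformers import BertModel, BertTokenizer

    local_path = "./pretrained_models/paddle/bert-wwm-ext-chinese"  # example path

    # first run: downloads to ~/.paddlenlp/models/bert-wwm-ext-chinese and loads from there
    model = BertModel.from_pretrained("bert-wwm-ext-chinese")
    tokenizer = BertTokenizer.from_pretrained("bert-wwm-ext-chinese")

    # write model_state.pdparams + model_config.json and the vocab to a local directory
    model.save_pretrained(local_path)
    tokenizer.save_pretrained(local_path)

    # later runs can load from the local directory, just like with torch/transformers
    model = BertModel.from_pretrained(local_path)
    tokenizer = BertTokenizer.from_pretrained(local_path)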

     Note the weight file model_state.pdparams and the config file model_config.json; the format of model_config.json also differs from the bert config file used on the torch side:

    {
        "init_args":[
            {
                "vocab_size":21128,
                "hidden_size":768,
                "num_hidden_layers":12,
                "num_attention_heads":12,
                "intermediate_size":3072,
                "hidden_act":"gelu",
                "hidden_dropout_prob":0.1,
                "attention_probs_dropout_prob":0.1,
                "max_position_embeddings":512,
                "type_vocab_size":2,
                "initializer_range":0.02,
                "pad_token_id":0,
                "init_class":"BertModel"
            }
        ],
        "init_class":"SentenceBert"
    }
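    For comparison, the huggingface-style config.json used on the torch side is a flat dict with roughly these fields (abridged and written from memory, not copied from the repository; check the actual file shipped with chinese-bert-wwm-ext):

    {
        "model_type": "bert",
        "vocab_size": 21128,
        "hidden_size": 768,
        "num_hidden_layers": 12,
        "num_attention_heads": 12,
        "intermediate_size": 3072,
        "hidden_act": "gelu",
        "hidden_dropout_prob": 0.1,
        "attention_probs_dropout_prob": 0.1,
        "max_position_embeddings": 512,
        "type_vocab_size": 2,
        "initializer_range": 0.02,
        "pad_token_id": 0
    }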

    The model definition, sentence_bert.py:

    from paddlenlp.transformers import BertPretrainedModel, BertModel
    import paddle.nn as nn
    import paddle


    class SentenceBert(BertPretrainedModel):
        """
        __init__(self, bert) must take exactly one argument, otherwise the outer
        from_pretrained() call fails; the bert backbone is instantiated inside
        from_pretrained() and then passed in here.
        """
        # base_model_prefix = "bert"
        def __init__(self, bert):
            super(SentenceBert, self).__init__()
            self.bert = bert
            self.classifier = nn.Linear(3 * 768, 2)

        def forward(self, inputs, **kwargs):
            input_a = inputs[0]
            input_b = inputs[1]
            output_a = self.bert(**input_a, output_hidden_states=True)
            output_b = self.bert(**input_b, output_hidden_states=True)
            # use the last hidden layer
            embedding_a = output_a[0][-1]
            embedding_b = output_b[0][-1]
            embedding_a = self.pooling(embedding_a, input_a)
            embedding_b = self.pooling(embedding_b, input_b)
            embedding_abs = paddle.abs(embedding_a - embedding_b)
            vectors_concat = []
            vectors_concat.append(embedding_a)
            vectors_concat.append(embedding_b)
            vectors_concat.append(embedding_abs)
            # concatenate the three 768-dim vectors along the feature axis -> 3*768
            features = paddle.concat(vectors_concat, axis=-1)
            output = self.classifier(features)
            return output

        def pooling(self, token_embeddings, input):
            output_vectors = []
            # attention_mask
            attention_mask = input['attention_mask']
            # [B,L] -> [B,L,1] -> [B,L,768]; the values are 0 or 1
            # input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
            input_mask_expanded = paddle.unsqueeze(attention_mask, -1)
            input_mask_expanded = paddle.expand(
                input_mask_expanded,
                shape=[input_mask_expanded.shape[0], input_mask_expanded.shape[1], token_embeddings.shape[-1]])
            # element-wise product: padding positions are zeroed out, shape [B,L,768]
            t = token_embeddings * input_mask_expanded
            # [B,768]
            sum_embeddings = paddle.sum(t, 1)
            # [B,768], values are at most seq_len
            sum_mask = input_mask_expanded.sum(1)
            # clip every element to at least 1e-9 so the denominator is never 0
            sum_mask = paddle.clip(sum_mask, min=1e-9)
            # element-wise division gives the final mean-pooled embedding
            output_vectors.append(sum_embeddings / sum_mask)
            # column-wise concatenation
            output_vector = paddle.concat(output_vectors, 1)
            return output_vector

    With the network defined this way, note that the bert argument of __init__ is not something we pass in ourselves; it is created and passed in automatically by

    SentenceBert.from_pretrained(pretrained_model_name_or_path=args.pretrained)


    But the model definition must still accept that one argument, otherwise from_pretrained() raises an error.

    Data loading: dataReader.py

    from paddle.io import Dataset
    import pandas as pd
    from tqdm import tqdm
    import paddle


    class DataReader(Dataset):
        def __init__(self, tokenizer, filepath, max_len):
            self.tokenizer = tokenizer
            self.filepath = filepath
            self.max_len = max_len
            self.dataList = self.datas_to_torachTensor()
            self.allLength = len(self.dataList)

        def convert_text2ids(self, text):
            text = text[0:self.max_len - 2]
            inputs = self.tokenizer(text)
            input_ids = inputs['input_ids']
            attention_mask = [1] * len(input_ids)
            token_type_id = inputs["token_type_ids"]
            return input_ids, attention_mask, token_type_id

        def datas_to_torachTensor(self):
            df = pd.read_csv(self.filepath, sep='\t')
            df.dropna(inplace=True)
            lines_a = df['sentence1'].values.tolist()
            lines_b = df['sentence2'].values.tolist()
            labels = df['label'].values.tolist()
            res = []
            for line_a, line_b, label in tqdm(zip(lines_a, lines_b, labels), desc='tokenization', ncols=50):
                temp = []
                try:
                    if line_a != '' and len(line_a) > 0 and line_b != '' and len(line_b) > 0:
                        input_ids_a, attention_mask_a, token_type_id_a = self.convert_text2ids(text=line_a)
                        input_ids_a = paddle.to_tensor(input_ids_a, dtype='int64')
                        attention_mask_a = paddle.to_tensor(attention_mask_a, dtype='int64')
                        token_type_id_a = paddle.to_tensor(token_type_id_a, dtype='int64')
                        temp.append(input_ids_a)
                        temp.append(attention_mask_a)
                        temp.append(token_type_id_a)
                        input_ids_b, attention_mask_b, token_type_id_b = self.convert_text2ids(text=line_b)
                        input_ids_b = paddle.to_tensor(input_ids_b, dtype='int64')
                        attention_mask_b = paddle.to_tensor(attention_mask_b, dtype='int64')
                        token_type_id_b = paddle.to_tensor(token_type_id_b, dtype='int64')
                        temp.append(input_ids_b)
                        temp.append(attention_mask_b)
                        temp.append(token_type_id_b)
                        label = paddle.to_tensor(label, dtype='int64')
                        temp.append(label)
                        res.append(temp)
                except Exception as e:
                    print(e)
            return res

        def __getitem__(self, item):
            input_ids_a = self.dataList[item][0]
            attention_mask_a = self.dataList[item][1]
            token_type_id_a = self.dataList[item][2]
            input_ids_b = self.dataList[item][3]
            attention_mask_b = self.dataList[item][4]
            token_type_id_b = self.dataList[item][5]
            label = self.dataList[item][6]
            return input_ids_a, attention_mask_a, token_type_id_a, input_ids_b, attention_mask_b, token_type_id_b, label

        def __len__(self):
            return self.allLength

    Model training: train_sentence_bert.py

    Note that seeding is stricter here: paddle.seed() alone does not make training reproducible and consistent. Other seeds need to be set as well:

    paddle.seed(1)
    random.seed(1)
    np.random.seed(1)

    Setting all three is the safer choice, since the random and numpy libraries may be called implicitly in places you are not aware of. In particular, in

    DataLoader(train_dataset,batch_size=args.batch_size,shuffle=True, collate_fn=collate_fn)

    the shuffle presumably relies on the random module under the hood, so the batches come out in a different order on every run unless random is seeded too.

    from data_reader.dataReader import DataReader
    from model.sentence_bert import SentenceBert
    import argparse
    from tools.progressbar import ProgressBar
    from tools.log import Logger
    from datetime import datetime
    import os
    from paddlenlp.transformers import BertTokenizer
    from paddle.io import DataLoader
    from paddle.optimizer import AdamW
    from paddle.optimizer.lr import ReduceOnPlateau
    import paddle
    import paddle.nn.functional as F
    import numpy as np
    import random

    paddle.seed(1)
    random.seed(1)
    np.random.seed(1)
    logger = Logger('paddle_sbert_loger', log_level=10, log_file='./log_output/pawsx_sbert_paddle.log').logger
    paddle.set_device('gpu:0')


    def parse_args():
        parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
        parser.add_argument("--max_len", type=int, default=64)
        parser.add_argument("--train_file", type=str, default='./data/paws_x/translated_train.tsv', help="train text file")
        parser.add_argument("--val_file", type=str, default='./data/paws_x/dev_2k.tsv', help="val text file")
        parser.add_argument("--pretrained", type=str, default="pretrained_models/paddle/bert-wwm-ext-chinese", help="paddlenlp pretrained model")
        parser.add_argument("--model_out", type=str, default="./output", help="model output path")
        parser.add_argument("--batch_size", type=int, default=64, help="batch size")
        parser.add_argument("--epochs", type=int, default=10, help="epochs")
        parser.add_argument("--lr", type=float, default=1e-5, help="learning rate")
        parser.add_argument("--task_type", type=str, default='classification')
        args = parser.parse_args()
        return args


    def collate_fn(batch):
        input_ids_a, attention_mask_a, token_type_id_a, input_ids_b, attention_mask_b, token_type_id_b, label = zip(*batch)
        bz = len(input_ids_a)
        max_a = 0
        for i in range(len(input_ids_a)):
            if input_ids_a[i].shape[0] > max_a:
                max_a = input_ids_a[i].shape[0]
        input_ids_a_pad = paddle.zeros(shape=[bz, max_a], dtype="int64")
        for i in range(len(input_ids_a)):
            l = input_ids_a[i].shape[0]
            input_ids_a_pad[i, 0:l] = input_ids_a[i]
        attention_mask_a_pad = paddle.zeros(shape=[bz, max_a], dtype="int64")
        for i in range(len(input_ids_a)):
            l = attention_mask_a[i].shape[0]
            attention_mask_a_pad[i, 0:l] = attention_mask_a[i]
        token_type_id_a_pad = paddle.zeros(shape=[bz, max_a], dtype="int64")
        for i in range(len(input_ids_a)):
            l = token_type_id_a[i].shape[0]
            token_type_id_a_pad[i, 0:l] = token_type_id_a[i]
        max_b = 0
        for i in range(len(input_ids_b)):
            if input_ids_b[i].shape[0] > max_b:
                max_b = input_ids_b[i].shape[0]
        input_ids_b_pad = paddle.zeros(shape=[bz, max_b], dtype="int64")
        for i in range(len(input_ids_b)):
            l = input_ids_b[i].shape[0]
            input_ids_b_pad[i, 0:l] = input_ids_b[i]
        attention_mask_b_pad = paddle.zeros(shape=[bz, max_b], dtype="int64")
        for i in range(len(input_ids_b)):
            l = attention_mask_b[i].shape[0]
            attention_mask_b_pad[i, 0:l] = attention_mask_b[i]
        token_type_id_b_pad = paddle.zeros(shape=[bz, max_b], dtype="int64")
        for i in range(len(input_ids_b)):
            l = token_type_id_b[i].shape[0]
            token_type_id_b_pad[i, 0:l] = token_type_id_b[i]
        label = paddle.stack(label, axis=0)
        return input_ids_a_pad, attention_mask_a_pad, token_type_id_a_pad, input_ids_b_pad, attention_mask_b_pad, token_type_id_b_pad, label


    def train(args):
        logger.info("args: %s", args)
        # passing the model name directly would auto-download the weights and config:
        # tokenizer = BertTokenizer.from_pretrained('bert-wwm-ext-chinese')
        # model = SentenceBert.from_pretrained('bert-wwm-ext-chinese')
        tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path=args.pretrained)
        model = SentenceBert.from_pretrained(pretrained_model_name_or_path=args.pretrained)
        train_dataset = DataReader(tokenizer=tokenizer, filepath=args.train_file, max_len=args.max_len)
        train_dataloader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, collate_fn=collate_fn)
        val_dataset = DataReader(tokenizer=tokenizer, filepath=args.val_file, max_len=args.max_len)
        val_dataloader = DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False, collate_fn=collate_fn)
        # in paddle the scheduler is passed to the optimizer as its learning_rate,
        # otherwise ReduceOnPlateau has no effect on the optimizer
        scheduler = ReduceOnPlateau(learning_rate=args.lr, mode='max', factor=0.5, patience=5)
        optimizer = AdamW(parameters=model.parameters(), learning_rate=scheduler)
        model.train()
        logger.info("***** Running training *****")
        logger.info("  Num batches = %d", len(train_dataloader))
        logger.info("  Num Epochs = %d", args.epochs)
        time_srt = datetime.now().strftime('%Y-%m-%d')
        save_path = os.path.join(args.model_out, "paddle_" + time_srt)
        best_acc = 0.0
        patience = 5
        no_best_epoch = 0
        for epoch in range(args.epochs):
            pbar = ProgressBar(n_total=len(train_dataloader), desc='Training')
            for step, batch in enumerate(train_dataloader):
                inputs_a = {'input_ids': batch[0], 'attention_mask': batch[1], 'token_type_ids': batch[2]}
                inputs_b = {'input_ids': batch[3], 'attention_mask': batch[4], 'token_type_ids': batch[5]}
                labels = batch[6]
                inputs = []
                inputs.append(inputs_a)
                inputs.append(inputs_b)
                output = model(inputs)
                loss = F.cross_entropy(output, labels)
                loss.backward()
                optimizer.step()
                optimizer.clear_grad()
                pbar(step, {'loss': loss.item()})
            val_acc = valdation(model, val_dataloader)
            model.train()  # valdation() switches the model to eval mode, so switch back before the next epoch
            scheduler.step(val_acc)
            if val_acc > best_acc:
                best_acc = val_acc
                if not os.path.exists(save_path):
                    os.makedirs(save_path)
                logger.info("save model")
                model.save_pretrained(save_path)
                tokenizer.save_pretrained(save_path)
                no_best_epoch = 0
            else:
                no_best_epoch += 1
            logger.info("val_acc:%.4f------best_acc:%.4f" % (val_acc, best_acc))
            if no_best_epoch >= patience:
                logger.info("training finished because of no improvement")
                break


    def valdation(model, val_dataloader):
        total = 0
        total_correct = 0
        model.eval()
        with paddle.no_grad():
            pbar = ProgressBar(n_total=len(val_dataloader), desc='evaluation')
            for step, batch in enumerate(val_dataloader):
                inputs_a = {'input_ids': batch[0], 'attention_mask': batch[1], 'token_type_ids': batch[2]}
                inputs_b = {'input_ids': batch[3], 'attention_mask': batch[4], 'token_type_ids': batch[5]}
                labels = paddle.squeeze(batch[6])
                inputs = []
                inputs.append(inputs_a)
                inputs.append(inputs_b)
                output = model(inputs)
                pred = paddle.argmax(output, axis=1)
                correct = (labels == pred).sum()
                total_correct += correct
                total += labels.shape[0]
                loss = F.cross_entropy(output, labels)
                pbar(step, {'loss': loss.item()})
        acc = total_correct / total
        return acc


    def main():
        args = parse_args()
        train(args)


    if __name__ == '__main__':
        main()
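    As an aside, the hand-rolled padding loops in collate_fn above can be written more compactly. One possible alternative (a sketch under the same assumptions as the DataReader above, not the version used in the runs described below) pads with numpy and converts to paddle tensors at the end:

    import numpy as np
    import paddle

    def pad_batch(seqs, pad_val=0):
        """Pad a list of 1-D int64 paddle tensors to the max length in the batch."""
        max_len = max(s.shape[0] for s in seqs)
        out = np.full((len(seqs), max_len), pad_val, dtype="int64")
        for i, s in enumerate(seqs):
            out[i, :s.shape[0]] = s.numpy()
        return paddle.to_tensor(out)

    def collate_fn(batch):
        cols = list(zip(*batch))            # six sequence columns + one label column
        padded = [pad_batch(col) for col in cols[:6]]
        label = paddle.stack(cols[6], axis=0)
        return (*padded, label)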

    III. Results comparison

    Let's look directly at training time and accuracy (a more rigorous comparison would also use F1, recall, and so on).

    torch results

    torch's training time and accuracy are shown in the figure below.

     paddle currently has a framework problem: setting the random seed does not take effect, so results are not reproducible and the accuracy jumps around between 0.65 and 0.68 from run to run. I tried the following settings, and none of them helped:

    paddle.seed(100)
    random.seed(100)
    np.random.seed(100)
    FLAGS_cudnn_deterministic=True python train....

     I'll have to wait for an official fix; paddle clearly still has quite a few bugs, so the excitement was for nothing... Any further comparison is pointless for now; I'll come back and update this post later.

    Communication with the paddle team:

  • Original post: https://blog.csdn.net/HUSTHY/article/details/126144004