• BERT study notes


    A simple starter template:
    loading BERT, tokenizing, and encoding text

    from transformers import BertModel, BertTokenizer
    import torch

    # Path to the local model folder (containing pytorch_model.bin, config.json, vocab.txt)
    model_path = "/remote-home/cs_tcci_huangyuqian/code/bert-base-uncased"
    tokenizer = BertTokenizer.from_pretrained(model_path)
    model = BertModel.from_pretrained(model_path)

    text = "Hello, how are you doing?"
    # print(tokenizer.tokenize(text))  # inspect the subword tokens
    # Convert the text into the model's input format ([CLS] ... [SEP])
    input_ids = tokenizer.encode(text, add_special_tokens=True, return_tensors="pt")
    # Encode the text with BERT; no gradients are needed here
    with torch.no_grad():
        outputs = model(input_ids)
    # outputs is a tuple: outputs[0] is the last hidden state (one vector per token),
    # outputs[1] is the pooled [CLS] representation (not an attention mask)
    encoded_text = outputs[0]
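    A quick shape check helps confirm what came back; a minimal sketch reusing the variables above (the pooling choices below are common conventions, not part of the original snippet):

    print(encoded_text.shape)               # torch.Size([1, seq_len, 768]) for bert-base
    # Two common ways to get a single sentence vector:
    cls_vector = encoded_text[:, 0, :]      # the [CLS] token, shape [1, 768]
    mean_vector = encoded_text.mean(dim=1)  # mean over tokens, shape [1, 768]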
    

    Sentiment classification with BERT

    The data.txt file:

    i hate you 0
    i love you 1
    i really like you 1
    i don't like this 0
    
    

    A simple example for understanding how to use BERT for sentiment classification.

    import torch
    import torch.nn as nn
    from torch.utils.data import Dataset, DataLoader
    from transformers import BertForSequenceClassification, BertTokenizer, BertConfig, AdamW

    model_name = "/remote-home/cs_tcci_huangyuqian/code/bert-base-uncased"
    tokenizer = BertTokenizer.from_pretrained(model_name)
    # Two labels: 0 = negative, 1 = positive
    config = BertConfig.from_pretrained(model_name, num_labels=2)
    model_class = BertForSequenceClassification.from_pretrained(model_name, config=config)
    class MyDataset(Dataset):
        def __init__(self):
            # Each line of data.txt is "<sentence> <label>"; the last field is the label
            with open('/remote-home/cs_tcci_huangyuqian/code/data.txt') as f:
                lines = [line.strip() for line in f]
            labels, sentences = [], []
            for line in lines:
                fields = line.split()
                labels.append(int(fields[-1]))
                sentences.append(' '.join(fields[:-1]))
            self.labels = labels
            self.sentences = sentences

        def __len__(self):
            return len(self.labels)

        def __getitem__(self, item):
            return self.sentences[item], self.labels[item]

    mydataset = MyDataset()
    dataloader = DataLoader(dataset=mydataset, batch_size=1)
    # To inspect the dataset, iterate over a DataLoader:
    # for sentences, labels in DataLoader(MyDataset(), batch_size=32, shuffle=True):
    #     print("Sentences:", sentences)
    #     print("Labels:", labels)
    #     break
    
    class ClassEmotion(nn.Module):
        def __init__(self, model):
            super().__init__()
            self.bert_class = model

        def forward(self, input):
            out = self.bert_class(**input)
            return out

    classifier = ClassEmotion(model_class)

    # Training setup
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    classifier.to(device)
    optimizer = AdamW(classifier.parameters(), lr=2e-5)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(20):
        total_loss = 0
        for batch in dataloader:
            label = batch[1].to(device)
            # batch_size=1, so batch[0] is a one-element tuple of strings
            input = tokenizer.encode_plus(" ".join(batch[0]), max_length=10,
                                          padding='max_length', return_tensors='pt',
                                          truncation=True)
            input = {k: v.to(device) for k, v in input.items()}
            optimizer.zero_grad()
            logits = classifier(input)
            loss = criterion(logits[0], label)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch + 1}, Loss: {total_loss}")

    # Inference on a new sentence
    classifier.eval()
    x = 'you love apple'
    x_input = tokenizer.encode_plus(x, max_length=10, padding='max_length',
                                    return_tensors='pt', truncation=True)
    x_input = {k: v.to(device) for k, v in x_input.items()}
    with torch.no_grad():
        print(torch.argmax(classifier(x_input)[0], dim=1).item())
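    The loop above encodes one sentence per step because batch_size is 1. The tokenizer also accepts a list of strings, so larger batches can be encoded in a single call; a minimal sketch (the sentences here are illustrative):

    sentences = ["i love you", "i hate you"]
    batch_inputs = tokenizer(sentences, max_length=10, padding='max_length',
                             truncation=True, return_tensors='pt')
    # batch_inputs['input_ids'] has shape [2, 10]; the dict can be passed to the
    # classifier exactly like `input` in the training loop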
    

    The prediction is correct.
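    To go beyond a single spot check, here is a small hedged sketch that scores every training sentence (illustrative only; a tiny training set like this one is trivially overfit):

    classifier.eval()
    correct = 0
    for sentence, label in mydataset:
        enc = tokenizer.encode_plus(sentence, max_length=10, padding='max_length',
                                    return_tensors='pt', truncation=True)
        enc = {k: v.to(device) for k, v in enc.items()}
        with torch.no_grad():
            pred = torch.argmax(classifier(enc)[0], dim=1).item()
        correct += int(pred == label)
    print(f"train accuracy: {correct}/{len(mydataset)}")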

    Question answering with BERT

    import torch
    from transformers import BertTokenizer, BertForQuestionAnswering, AdamW

    # Path to the pretrained BERT model (for Chinese text, "bert-base-chinese" could be used instead)
    model_name = "/remote-home/cs_tcci_huangyuqian/code/bert-base-uncased"
    model = BertForQuestionAnswering.from_pretrained(model_name)
    tokenizer = BertTokenizer.from_pretrained(model_name)

    question = 'what is the answer of the question'
    paragraph = 'the answer of question is 42'
    inputs = tokenizer(question, paragraph, return_tensors='pt')
    # start_positions/end_positions are dummy gold labels, used here only to get a loss
    output = model(**inputs, start_positions=torch.tensor([0]), end_positions=torch.tensor([16]))
    print("loss: ", output.loss)

    # One optimization step on that loss
    optimizer = AdamW(model.parameters(), lr=1e-4)
    output.loss.backward()
    optimizer.step()

    print("start_logits: ")
    print(output.start_logits)

    print("end_logits: ")
    print(output.end_logits)

    start = torch.argmax(output.start_logits)  # index of the highest-scoring start position
    end = torch.argmax(output.end_logits)
    print("start position: ", start.item())  # .item() extracts the value of a one-element tensor
    print("end position:   ", end.item())

    # Take the token ids between the predicted start and end positions
    predict_id = inputs['input_ids'][0][start:end + 1]
    print("predict_id:     ", predict_id)
    # Decode the ids back to text
    predict_answer = tokenizer.decode(predict_id)
    print("predict_answer: ", predict_answer)
    


    import torch
    from transformers import BertTokenizer, BertForQuestionAnswering

    # Path to the pretrained BERT model (for Chinese text, "bert-base-chinese" could be used instead)
    model_name = "/remote-home/cs_tcci_huangyuqian/code/bert-base-uncased"
    model = BertForQuestionAnswering.from_pretrained(model_name)
    tokenizer = BertTokenizer.from_pretrained(model_name)
    question = "How many parameters does BERT-large have?"
    answer_text = "BERT-large is really big... it has 24-layers and an embedding size of 1,024, for a total of 340M parameters! Altogether it is 1.34GB, so expect it to take a couple minutes to download to your Colab instance."
    
    def answer_question(question, answer_text):
        '''
        Takes a `question` string and an `answer_text` string (which contains the
        answer), identifies the words within `answer_text` that form the answer,
        and prints them out.
        '''
        # ======== Tokenize ========
        # Apply the tokenizer to the input text, treating them as a text-pair.
        input_ids = tokenizer.encode(question, answer_text)
    
        # Report how long the input sequence is.
        print('Query has {:,} tokens.\n'.format(len(input_ids)))
    
        # ======== Set Segment IDs ========
        # Search the input_ids for the first instance of the `[SEP]` token.
        sep_index = input_ids.index(tokenizer.sep_token_id)
    
        # The number of segment A tokens includes the [SEP] token itself.
        num_seg_a = sep_index + 1
    
        # The remainder are segment B.
        num_seg_b = len(input_ids) - num_seg_a
    
        # Construct the list of 0s and 1s.
        segment_ids = [0] * num_seg_a + [1] * num_seg_b
    
        # There should be a segment_id for every input token.
        assert len(segment_ids) == len(input_ids)
    
        # ======== Evaluate ========
        # Run our example question through the model; no gradients are needed at inference.
        with torch.no_grad():
            output = model(torch.tensor([input_ids]),  # the tokens representing our input text
                           token_type_ids=torch.tensor([segment_ids]))  # segment IDs separate question from answer_text
    
        start_scores, end_scores = output.start_logits, output.end_logits
        # ======== Reconstruct Answer ========
        # Find the tokens with the highest `start` and `end` scores.
        answer_start = torch.argmax(start_scores)
        answer_end = torch.argmax(end_scores)
    
        # Get the string versions of the input tokens.
        tokens = tokenizer.convert_ids_to_tokens(input_ids)
    
        # Start with the first token.
        answer = tokens[answer_start]
    
        # Select the remaining answer tokens and join them with whitespace.
        for i in range(answer_start + 1, answer_end + 1):
    
            # If it's a subword token, then recombine it with the previous token.
            if tokens[i][0:2] == '##':
                answer += tokens[i][2:]
    
            # Otherwise, add a space then the token.
            else:
                answer += ' ' + tokens[i]
    
        print('Answer: "' + answer + '"')
    import textwrap
    
    # Wrap text to 80 characters.
    wrapper = textwrap.TextWrapper(width=80)
    
    bert_abstract = "We introduce a new language representation model called BERT, which stands for Bidirectional Encoder Representations from Transformers. Unlike recent language representation models (Peters et al., 2018a; Radford et al., 2018), BERT is designed to pretrain deep bidirectional representations from unlabeled text by jointly conditioning on both left and right context in all layers. As a result, the pre-trained BERT model can be finetuned with just one additional output layer to create state-of-the-art models for a wide range of tasks, such as question answering and language inference, without substantial taskspecific architecture modifications. BERT is conceptually simple and empirically powerful. It obtains new state-of-the-art results on eleven natural language processing tasks, including pushing the GLUE score to 80.5% (7.7% point absolute improvement), MultiNLI accuracy to 86.7% (4.6% absolute improvement), SQuAD v1.1 question answering Test F1 to 93.2 (1.5 point absolute improvement) and SQuAD v2.0 Test F1 to 83.1 (5.1 point absolute improvement)."
    
    print(wrapper.fill(bert_abstract))
    question = "What does the 'B' in BERT stand for?"
    
    answer_question(question, bert_abstract)
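    Because bert-base-uncased has never been fine-tuned for QA, answers from this setup are unreliable. A hedged sketch that swaps in a checkpoint fine-tuned on SQuAD (downloading it from the Hugging Face hub is assumed to be possible in your environment):

    squad_name = 'bert-large-uncased-whole-word-masking-finetuned-squad'
    model = BertForQuestionAnswering.from_pretrained(squad_name)
    tokenizer = BertTokenizer.from_pretrained(squad_name)
    # answer_question reads the module-level `model` and `tokenizer`,
    # so re-running it now should yield a sensible answer
    answer_question(question, bert_abstract)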
    
    
    

    Example 2:

    from transformers import BertTokenizer, BertConfig, BertForQuestionAnswering
    import torch

    model_name = 'bert-base-chinese'

    # Load the tokenizer from the vocabulary
    tokenizer = BertTokenizer.from_pretrained(model_name)
    # Load the configuration file
    model_config = BertConfig.from_pretrained(model_name)
    # The model predicts two things per token: a start score and an end score
    model_config.num_labels = 2

    # Build BertForQuestionAnswering from the config alone; note that the weights
    # are randomly initialized, not loaded from the pretrained checkpoint
    model = BertForQuestionAnswering(model_config)
    model.eval()

    question, text = '里昂是谁?', '里昂是一个杀手。'

    sen_code = tokenizer.encode_plus(question, text)

    tokens_tensor = torch.tensor([sen_code['input_ids']])
    segments_tensors = torch.tensor([sen_code['token_type_ids']])  # 0 for the first sentence, 1 for the second

    with torch.no_grad():
        outputs = model(tokens_tensor, token_type_ids=segments_tensors)
    start_pos, end_pos = outputs.start_logits, outputs.end_logits
    # Map the ids back to tokens
    all_tokens = tokenizer.convert_ids_to_tokens(sen_code['input_ids'])
    print(all_tokens)  # ['[CLS]', '里', '昂', '是', '谁', '[SEP]', '里', '昂', '是', '一', '个', '杀', '手', '[SEP]']

    # Decode the predicted answer span
    answer = ' '.join(all_tokens[torch.argmax(start_pos): torch.argmax(end_pos) + 1])

    # The weights are random, so the output changes from run to run and is not
    # expected to be good; the line below shows one possible result.
    print(answer)   # 一 个 杀 手
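    To make this example at least use the pretrained encoder, the model can be built with from_pretrained instead; a minimal hedged sketch (the span-prediction head on top is still randomly initialized until fine-tuned on a QA dataset):

    model = BertForQuestionAnswering.from_pretrained(model_name, config=model_config)
    model.eval()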
    
    
    
  • Original article: https://blog.csdn.net/qq_48566899/article/details/133717029