• Transformers


    Overview

    The Transformer is a very popular architecture that leverages and extends the concept of self-attention to create very useful representations of our input data for downstream tasks.

    • Advantages

      • Better representation of our input tokens via contextual embeddings, where the token representation is based on the specific neighboring tokens using self-attention.
      • Sub-word tokens, as opposed to character tokens, since they can hold more meaningful representations for many of our keywords, prefixes, suffixes, etc.
      • Attend (in parallel) to all the tokens in our input, as opposed to being limited by filter spans (CNNs) or memory issues from sequential processing (RNNs).
    • Disadvantages

      • Computationally intensive
      • Requires large amounts of data (mitigated by using pretrained models)

    Setup

    Let's set our seed and device for our main task.

    import numpy as np
    import pandas as pd
    import random
    import torch
    import torch.nn as nn

    SEED = 1234

    def set_seeds(seed=1234):
        """Set seeds for reproducibility."""
        np.random.seed(seed)
        random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed) # multi-GPU

    # Set seeds for reproducibility
    set_seeds(seed=SEED)

    # Set device
    cuda = True
    device = torch.device("cuda" if (
        torch.cuda.is_available() and cuda) else "cpu")
    torch.set_default_tensor_type("torch.FloatTensor")
    if device.type == "cuda":
        torch.set_default_tensor_type("torch.cuda.FloatTensor")
    print (device)

    Load data

    We'll download the AG News dataset, which consists of 120K text samples from 4 unique classes (Business, Sci/Tech, Sports, World).

    # Load data
    url = "https://raw.githubusercontent.com/GokuMohandas/Made-With-ML/main/datasets/news.csv"
    df = pd.read_csv(url, header=0) # load
    df = df.sample(frac=1).reset_index(drop=True) # shuffle
    df.head()
        title                                                   category
    0   Sharon Accepts Plan to Reduce Gaza Army Operation...    World
    1   Internet Key Battleground in Wildlife Crime Fight       Sci/Tech
    2   July Durable Goods Orders Rise 1.7 Percent              Business
    3   Growing Signs of a Slowing on Wall Street               Business
    4   The New Faces of Reality TV                             World
    # Reduce data size (too large to fit in Colab's limited memory)
    df = df[:10000]
    print (len(df))

    Preprocessing

    We'll first clean up our input data by performing operations such as lowercasing the text, removing stop (filler) words, and applying filters with regular expressions.

    import nltk
    from nltk.corpus import stopwords
    from nltk.stem import PorterStemmer
    import re

    nltk.download("stopwords")
    STOPWORDS = stopwords.words("english")
    print (STOPWORDS[:5])
    porter = PorterStemmer()
    [nltk_data] Downloading package stopwords to /root/nltk_data...
    [nltk_data] Package stopwords is already up-to-date!
    ['i', 'me', 'my', 'myself', 'we']
    def preprocess(text, stopwords=STOPWORDS):
        """Conditional preprocessing on our text unique to our task."""
        # Lower
        text = text.lower()
        # Remove stopwords
        pattern = re.compile(r"\b(" + r"|".join(stopwords) + r")\b\s*")
        text = pattern.sub("", text)
        # Remove words in parenthesis
        text = re.sub(r"\([^)]*\)", "", text)
        # Spacing and filters
        text = re.sub(r"([-;;.,!?<=>])", r" \1 ", text)
        text = re.sub("[^A-Za-z0-9]+", " ", text) # remove non alphanumeric chars
        text = re.sub(" +", " ", text) # remove multiple spaces
        text = text.strip()
        return text
    # Sample
    text = "Great week for the NYSE!"
    preprocess(text=text)

    great week nyse
    # Apply to dataframe
    preprocessed_df = df.copy()
    preprocessed_df.title = preprocessed_df.title.apply(preprocess)
    print (f"{df.title.values[0]}\n\n{preprocessed_df.title.values[0]}")
    Sharon Accepts Plan to Reduce Gaza Army Operation, Haaretz Says

    sharon accepts plan reduce gaza army operation haaretz says

    Split data

    import collections
    from sklearn.model_selection import train_test_split

    TRAIN_SIZE = 0.7
    VAL_SIZE = 0.15
    TEST_SIZE = 0.15

    def train_val_test_split(X, y, train_size):
        """Split dataset into data splits."""
        X_train, X_, y_train, y_ = train_test_split(X, y, train_size=TRAIN_SIZE, stratify=y)
        X_val, X_test, y_val, y_test = train_test_split(X_, y_, train_size=0.5, stratify=y_)
        return X_train, X_val, X_test, y_train, y_val, y_test

    # Data
    X = preprocessed_df["title"].values
    y = preprocessed_df["category"].values

    # Create data splits
    X_train, X_val, X_test, y_train, y_val, y_test = train_val_test_split(
        X=X, y=y, train_size=TRAIN_SIZE)
    print (f"X_train: {X_train.shape}, y_train: {y_train.shape}")
    print (f"X_val: {X_val.shape}, y_val: {y_val.shape}")
    print (f"X_test: {X_test.shape}, y_test: {y_test.shape}")
    print (f"Sample point: {X_train[0]} → {y_train[0]}")
    X_train: (7000,), y_train: (7000,)
    X_val: (1500,), y_val: (1500,)
    X_test: (1500,), y_test: (1500,)
    Sample point: lost flu payday → Business
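
    The stratified split keeps the class balance roughly equal across the three splits. As a quick check, here is a minimal sketch of how you could verify that, using the `collections` import from the cell above (this check itself is an addition, not part of the original write-up):

    # Sanity check (sketch): class balance per split (labels are still raw strings here)
    split_counts = {"train": collections.Counter(y_train),
                    "val": collections.Counter(y_val),
                    "test": collections.Counter(y_test)}
    for split, counter in split_counts.items():
        print (f"{split}: {dict(counter)}")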

    Label encoding

    Next, we'll define a LabelEncoder to encode our text labels into unique indices.

    import itertools
    import json  # needed for save/load below

    class LabelEncoder(object):
        """Label encoder for tag labels."""
        def __init__(self, class_to_index={}):
            self.class_to_index = class_to_index or {}  # mutable defaults ;)
            self.index_to_class = {v: k for k, v in self.class_to_index.items()}
            self.classes = list(self.class_to_index.keys())

        def __len__(self):
            return len(self.class_to_index)

        def __str__(self):
            return f"<LabelEncoder(num_classes={len(self)})>"

        def fit(self, y):
            classes = np.unique(y)
            for i, class_ in enumerate(classes):
                self.class_to_index[class_] = i
            self.index_to_class = {v: k for k, v in self.class_to_index.items()}
            self.classes = list(self.class_to_index.keys())
            return self

        def encode(self, y):
            y_one_hot = np.zeros((len(y), len(self.class_to_index)), dtype=int)
            for i, item in enumerate(y):
                y_one_hot[i][self.class_to_index[item]] = 1
            return y_one_hot

        def decode(self, y):
            classes = []
            for i, item in enumerate(y):
                index = np.where(item == 1)[0][0]
                classes.append(self.index_to_class[index])
            return classes

        def save(self, fp):
            with open(fp, "w") as fp:
                contents = {"class_to_index": self.class_to_index}
                json.dump(contents, fp, indent=4, sort_keys=False)

        @classmethod
        def load(cls, fp):
            with open(fp, "r") as fp:
                kwargs = json.load(fp=fp)
            return cls(**kwargs)
    # Encode
    label_encoder = LabelEncoder()
    label_encoder.fit(y_train)
    NUM_CLASSES = len(label_encoder)
    label_encoder.class_to_index
    {"Business": 0, "Sci/Tech": 1, "Sports": 2, "World": 3}
    # Class weights
    counts = np.bincount([label_encoder.class_to_index[class_] for class_ in y_train])
    class_weights = {i: 1.0/count for i, count in enumerate(counts)}
    print (f"counts: {counts}\nweights: {class_weights}")
    counts: [1746 1723 1725 1806]
    weights: {0: 0.000572737686139748, 1: 0.0005803830528148578, 2: 0.00057971
    # Convert labels to tokens
    print (f"y_train[0]: {y_train[0]}")
    y_train = label_encoder.encode(y_train)
    y_val = label_encoder.encode(y_val)
    y_test = label_encoder.encode(y_test)
    print (f"y_train[0]: {y_train[0]}")
    print (f"decode([y_train[0]]): {label_encoder.decode([y_train[0]])}")
    y_train[0]: Business
    y_train[0]: [1 0 0 0]
    decode([y_train[0]]): ['Business']

    Tokenizer

    We'll use the BertTokenizer to tokenize our input text into sub-word tokens.

    from transformers import DistilBertTokenizer
    from transformers import BertTokenizer

    # Load tokenizer and model
    # tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
    tokenizer = BertTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")
    vocab_size = len(tokenizer)
    print (vocab_size)
    31090
    # Tokenize inputs
    encoded_input = tokenizer(X_train.tolist(), return_tensors="pt", padding=True)
    X_train_ids = encoded_input["input_ids"]
    X_train_masks = encoded_input["attention_mask"]
    print (X_train_ids.shape, X_train_masks.shape)
    encoded_input = tokenizer(X_val.tolist(), return_tensors="pt", padding=True)
    X_val_ids = encoded_input["input_ids"]
    X_val_masks = encoded_input["attention_mask"]
    print (X_val_ids.shape, X_val_masks.shape)
    encoded_input = tokenizer(X_test.tolist(), return_tensors="pt", padding=True)
    X_test_ids = encoded_input["input_ids"]
    X_test_masks = encoded_input["attention_mask"]
    print (X_test_ids.shape, X_test_masks.shape)
    # Decode
    print (f"{X_train_ids[0]}\n{tokenizer.decode(X_train_ids[0])}")

    # Sub-word tokens
    print (tokenizer.convert_ids_to_tokens(ids=X_train_ids[0]))

    Datasets

    We're going to create Datasets and DataLoaders so we can efficiently create batches with our data splits.

    class TransformerTextDataset(torch.utils.data.Dataset):
        def __init__(self, ids, masks, targets):
            self.ids = ids
            self.masks = masks
            self.targets = targets

        def __len__(self):
            return len(self.targets)

        def __str__(self):
            return f"<Dataset(N={len(self)})>"

        def __getitem__(self, index):
            ids = torch.tensor(self.ids[index], dtype=torch.long)
            masks = torch.tensor(self.masks[index], dtype=torch.long)
            targets = torch.FloatTensor(self.targets[index])
            return ids, masks, targets

        def create_dataloader(self, batch_size, shuffle=False, drop_last=False):
            return torch.utils.data.DataLoader(
                dataset=self,
                batch_size=batch_size,
                shuffle=shuffle,
                drop_last=drop_last,
                pin_memory=False)
    # Create datasets
    train_dataset = TransformerTextDataset(ids=X_train_ids, masks=X_train_masks, targets=y_train)
    val_dataset = TransformerTextDataset(ids=X_val_ids, masks=X_val_masks, targets=y_val)
    test_dataset = TransformerTextDataset(ids=X_test_ids, masks=X_test_masks, targets=y_test)
    print ("Data splits:\n"
        f"  Train dataset: {train_dataset.__str__()}\n"
        f"  Val dataset: {val_dataset.__str__()}\n"
        f"  Test dataset: {test_dataset.__str__()}\n"
        "Sample point:\n"
        f"  ids: {train_dataset[0][0]}\n"
        f"  masks: {train_dataset[0][1]}\n"
        f"  targets: {train_dataset[0][2]}")
    # Create dataloaders
    batch_size = 128
    train_dataloader = train_dataset.create_dataloader(
        batch_size=batch_size)
    val_dataloader = val_dataset.create_dataloader(
        batch_size=batch_size)
    test_dataloader = test_dataset.create_dataloader(
        batch_size=batch_size)
    batch = next(iter(train_dataloader))
    print ("Sample batch:\n"
        f"  ids: {batch[0].size()}\n"
        f"  masks: {batch[1].size()}\n"
        f"  targets: {batch[2].size()}")

    Trainer

    Let's create the Trainer class that we'll use to facilitate training for our experiments.

    import torch.nn.functional as F

    class Trainer(object):
        def __init__(self, model, device, loss_fn=None, optimizer=None, scheduler=None):
            # Set params
            self.model = model
            self.device = device
            self.loss_fn = loss_fn
            self.optimizer = optimizer
            self.scheduler = scheduler

        def train_step(self, dataloader):
            """Train step."""
            # Set model to train mode
            self.model.train()
            loss = 0.0

            # Iterate over train batches
            for i, batch in enumerate(dataloader):
                # Step
                batch = [item.to(self.device) for item in batch]  # Set device
                inputs, targets = batch[:-1], batch[-1]
                self.optimizer.zero_grad()  # Reset gradients
                z = self.model(inputs)  # Forward pass
                J = self.loss_fn(z, targets)  # Define loss
                J.backward()  # Backward pass
                self.optimizer.step()  # Update weights

                # Cumulative Metrics
                loss += (J.detach().item() - loss) / (i + 1)

            return loss

        def eval_step(self, dataloader):
            """Validation or test step."""
            # Set model to eval mode
            self.model.eval()
            loss = 0.0
            y_trues, y_probs = [], []

            # Iterate over val batches
            with torch.inference_mode():
                for i, batch in enumerate(dataloader):
                    # Step
                    batch = [item.to(self.device) for item in batch]  # Set device
                    inputs, y_true = batch[:-1], batch[-1]
                    z = self.model(inputs)  # Forward pass
                    J = self.loss_fn(z, y_true).item()

                    # Cumulative Metrics
                    loss += (J - loss) / (i + 1)

                    # Store outputs
                    y_prob = F.softmax(z).cpu().numpy()
                    y_probs.extend(y_prob)
                    y_trues.extend(y_true.cpu().numpy())

            return loss, np.vstack(y_trues), np.vstack(y_probs)

        def predict_step(self, dataloader):
            """Prediction step."""
            # Set model to eval mode
            self.model.eval()
            y_probs = []

            # Iterate over batches
            with torch.inference_mode():
                for i, batch in enumerate(dataloader):
                    # Forward pass w/ inputs
                    inputs, targets = batch[:-1], batch[-1]
                    z = self.model(inputs)

                    # Store outputs
                    y_prob = F.softmax(z).cpu().numpy()
                    y_probs.extend(y_prob)

            return np.vstack(y_probs)

        def train(self, num_epochs, patience, train_dataloader, val_dataloader):
            best_val_loss = np.inf
            for epoch in range(num_epochs):
                # Steps
                train_loss = self.train_step(dataloader=train_dataloader)
                val_loss, _, _ = self.eval_step(dataloader=val_dataloader)
                self.scheduler.step(val_loss)

                # Early stopping
                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    best_model = self.model
                    _patience = patience  # reset _patience
                else:
                    _patience -= 1
                if not _patience:  # 0
                    print("Stopping early!")
                    break

                # Logging
                print(
                    f"Epoch: {epoch+1} | "
                    f"train_loss: {train_loss:.5f}, "
                    f"val_loss: {val_loss:.5f}, "
                    f"lr: {self.optimizer.param_groups[0]['lr']:.2E}, "
                    f"_patience: {_patience}"
                )
            return best_model

    Transformer

    We'll first learn about the unique components within the Transformer architecture and then implement one for our text classification task.

    Scaled dot-product attention

    The most popular type of self-attention is scaled dot-product attention from the widely cited Attention Is All You Need paper. This type of attention involves projecting our encoded input sequences onto three matrices, queries (Q), keys (K), and values (V), whose weights we learn.
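
    For reference, the operation from that paper can be written as follows, where $d_k$ is the dimension of the keys:

    $$\text{attention}(Q, K, V) = \text{softmax}\left(\frac{QK^{\top}}{\sqrt{d_k}}\right)V$$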

    Multi-head attention

    Instead of applying self-attention only once across the entire encoded input, we can also split the input, apply self-attention to each part in parallel (heads), and concatenate the results. This allows the different heads to learn unique representations while keeping the overall complexity in check, since we split the input into smaller subspaces.
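
    Concretely, with $h$ heads and learned projection matrices $W_i^Q$, $W_i^K$, $W_i^V$, and $W^O$ (notation from the same paper):

    $$\text{MultiHead}(Q, K, V) = \text{Concat}(\text{head}_1, \ldots, \text{head}_h)\,W^O, \qquad \text{head}_i = \text{attention}(QW_i^Q, KW_i^K, VW_i^V)$$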

    Positional encoding

    With self-attention alone, we aren't able to account for the sequential position of our input tokens. To address this, we can use positional encoding to create a representation of each token's position with respect to the entire sequence. This can either be learned (with weights), or we can use a fixed function, which extends better to sequence lengths at inference time that weren't observed during training.

    This effectively allows us to represent each token's relative position using a fixed function, even for very long sequences. And because we've constrained the positional encodings to have the same dimensions as our encoded inputs, we can simply sum them together before feeding them into the multi-head attention layers.
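
    The fixed (sinusoidal) variant from the paper, where $pos$ is the token position and $i$ indexes the encoding dimension, is:

    $$PE_{(pos,\,2i)} = \sin\!\left(\frac{pos}{10000^{2i/d_{model}}}\right), \qquad PE_{(pos,\,2i+1)} = \cos\!\left(\frac{pos}{10000^{2i/d_{model}}}\right)$$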

    Architecture

    And here's how it all comes together! It's an end-to-end architecture that creates these contextual representations and uses an encoder-decoder structure to predict outcomes (one-to-one, many-to-one, many-to-many, etc.). Because of their size, these architectures require massive amounts of data to train without overfitting; however, they can be used as pretrained models and fine-tuned on smaller datasets that are similar to the much larger dataset they were originally trained on.

    We're not going to implement the Transformer from scratch, but we will use the Hugging Face library to do so, as in the baselines lesson!

    Model

    We're going to use a pretrained BertModel to act as a feature extractor. We'll only use the encoder to receive the sequential and pooled outputs (is_decoder=False is the default).

     

    from transformers import BertModel

    # transformer = BertModel.from_pretrained("distilbert-base-uncased")
    # embedding_dim = transformer.config.dim
    transformer = BertModel.from_pretrained("allenai/scibert_scivocab_uncased")
    embedding_dim = transformer.config.hidden_size

    class Transformer(nn.Module):
        def __init__(self, transformer, dropout_p, embedding_dim, num_classes):
            super(Transformer, self).__init__()
            self.transformer = transformer
            self.dropout = torch.nn.Dropout(dropout_p)
            self.fc1 = torch.nn.Linear(embedding_dim, num_classes)

        def forward(self, inputs):
            ids, masks = inputs
            seq, pool = self.transformer(input_ids=ids, attention_mask=masks)
            z = self.dropout(pool)
            z = self.fc1(z)
            return z

    We decided to work with the pooled output, but we could have just as easily worked with the sequential output (the encoder representation for each sub-token) and applied a CNN (or another decoder option) on top of it.
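
    For illustration only, here is a minimal sketch of what such a variant might look like. The TransformerCNN name, num_filters, and the max-pool head are hypothetical choices (not part of the original lesson), and it assumes the same tuple-style transformer outputs as the Transformer class above:

    class TransformerCNN(nn.Module):
        """Hypothetical variant: 1D conv over the encoder's sequential output."""
        def __init__(self, transformer, dropout_p, embedding_dim, num_filters, num_classes):
            super(TransformerCNN, self).__init__()
            self.transformer = transformer
            self.conv = nn.Conv1d(in_channels=embedding_dim, out_channels=num_filters,
                                  kernel_size=3, padding=1)
            self.dropout = nn.Dropout(dropout_p)
            self.fc1 = nn.Linear(num_filters, num_classes)

        def forward(self, inputs):
            ids, masks = inputs
            seq, pool = self.transformer(input_ids=ids, attention_mask=masks)  # seq: (B, L, H)
            z = self.conv(seq.permute(0, 2, 1))  # (B, num_filters, L)
            z = torch.max(z, dim=2).values       # global max-pool over tokens
            z = self.fc1(self.dropout(z))
            return z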

    # Initialize model
    dropout_p = 0.5
    model = Transformer(
        transformer=transformer, dropout_p=dropout_p,
        embedding_dim=embedding_dim, num_classes=NUM_CLASSES)
    model = model.to(device)
    print (model.named_parameters)

    Training

    # Arguments
    lr = 1e-4
    num_epochs = 10
    patience = 10

    # Define loss
    class_weights_tensor = torch.Tensor(np.array(list(class_weights.values())))
    loss_fn = nn.BCEWithLogitsLoss(weight=class_weights_tensor)

    # Define optimizer & scheduler
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode="min", factor=0.1, patience=5)

    # Trainer module
    trainer = Trainer(
        model=model, device=device, loss_fn=loss_fn,
        optimizer=optimizer, scheduler=scheduler)
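
    The call that actually launches training (and produces the best_model referenced when saving artifacts below) is not shown in this write-up; a minimal sketch based on the Trainer.train signature defined above:

    # Train (sketch): returns the best model found via early stopping
    best_model = trainer.train(
        num_epochs, patience, train_dataloader, val_dataloader)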

    Evaluation

    import json
    from sklearn.metrics import precision_recall_fscore_support

    def get_performance(y_true, y_pred, classes):
        """Per-class performance metrics."""
        # Performance
        performance = {"overall": {}, "class": {}}

        # Overall performance
        metrics = precision_recall_fscore_support(y_true, y_pred, average="weighted")
        performance["overall"]["precision"] = metrics[0]
        performance["overall"]["recall"] = metrics[1]
        performance["overall"]["f1"] = metrics[2]
        performance["overall"]["num_samples"] = np.float64(len(y_true))

        # Per-class performance
        metrics = precision_recall_fscore_support(y_true, y_pred, average=None)
        for i in range(len(classes)):
            performance["class"][classes[i]] = {
                "precision": metrics[0][i],
                "recall": metrics[1][i],
                "f1": metrics[2][i],
                "num_samples": np.float64(metrics[3][i]),
            }

        return performance
    # Get predictions
    test_loss, y_true, y_prob = trainer.eval_step(dataloader=test_dataloader)
    y_pred = np.argmax(y_prob, axis=1)

    # Determine performance
    performance = get_performance(
        y_true=np.argmax(y_true, axis=1), y_pred=y_pred, classes=label_encoder.classes)
    print (json.dumps(performance["overall"], indent=2))
    {
      "precision": 0.8085194951783808,
      "recall": 0.8086666666666666,
      "f1": 0.8083051845125695,
      "num_samples": 1500.0
    }
    # Save artifacts
    from pathlib import Path
    dir = Path("transformers")
    dir.mkdir(parents=True, exist_ok=True)
    label_encoder.save(fp=Path(dir, "label_encoder.json"))
    torch.save(best_model.state_dict(), Path(dir, "model.pt"))
    with open(Path(dir, "performance.json"), "w") as fp:
        json.dump(performance, indent=2, sort_keys=False, fp=fp)

    Inference

    def get_probability_distribution(y_prob, classes):
        """Create a dict of class probabilities from an array."""
        results = {}
        for i, class_ in enumerate(classes):
            results[class_] = np.float64(y_prob[i])
        sorted_results = {k: v for k, v in sorted(
            results.items(), key=lambda item: item[1], reverse=True)}
        return sorted_results
    # Load artifacts
    device = torch.device("cpu")
    tokenizer = BertTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")
    label_encoder = LabelEncoder.load(fp=Path(dir, "label_encoder.json"))
    transformer = BertModel.from_pretrained("allenai/scibert_scivocab_uncased")
    embedding_dim = transformer.config.hidden_size
    model = Transformer(
        transformer=transformer, dropout_p=dropout_p,
        embedding_dim=embedding_dim, num_classes=NUM_CLASSES)
    model.load_state_dict(torch.load(Path(dir, "model.pt"), map_location=device))
    model.to(device);
    # Initialize trainer
    trainer = Trainer(model=model, device=device)
    # Create datasets
    train_dataset = TransformerTextDataset(ids=X_train_ids, masks=X_train_masks, targets=y_train)
    val_dataset = TransformerTextDataset(ids=X_val_ids, masks=X_val_masks, targets=y_val)
    test_dataset = TransformerTextDataset(ids=X_test_ids, masks=X_test_masks, targets=y_test)
    print ("Data splits:\n"
        f"  Train dataset: {train_dataset.__str__()}\n"
        f"  Val dataset: {val_dataset.__str__()}\n"
        f"  Test dataset: {test_dataset.__str__()}\n"
        "Sample point:\n"
        f"  ids: {train_dataset[0][0]}\n"
        f"  masks: {train_dataset[0][1]}\n"
        f"  targets: {train_dataset[0][2]}")
    Data splits:
      Train dataset: <Dataset(N=7000)>
      Val dataset: <Dataset(N=1500)>
      Test dataset: <Dataset(N=1500)>
    Sample point:
      ids: tensor([  102,  6677,  1441,  3982, 17973,   103,     0,     0,     0,     0,
                0,     0,     0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
                0,     0,     0,     0,     0])
      masks: tensor([1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0])
      targets: tensor([1., 0., 0., 0.], device="cpu")
    # Dataloader
    text = "The final tennis tournament starts next week."
    X = preprocess(text)
    encoded_input = tokenizer(X, return_tensors="pt", padding=True).to(torch.device("cpu"))
    ids = encoded_input["input_ids"]
    masks = encoded_input["attention_mask"]
    y_filler = label_encoder.encode([label_encoder.classes[0]]*len(ids))
    dataset = TransformerTextDataset(ids=ids, masks=masks, targets=y_filler)
    dataloader = dataset.create_dataloader(batch_size=int(batch_size))
    # Inference
    y_prob = trainer.predict_step(dataloader)
    y_pred = np.argmax(y_prob, axis=1)
    label_encoder.index_to_class[y_pred[0]]
    Sports
    # Class distributions
    prob_dist = get_probability_distribution(y_prob=y_prob[0], classes=label_encoder.classes)
    print (json.dumps(prob_dist, indent=2))
    {
      "Sports": 0.9999359846115112,
      "World": 4.0660612285137177e-05,
      "Sci/Tech": 1.1774928680097219e-05,
      "Business": 1.1545793313416652e-05
    }

    Interpretability

    Let's visualize the self-attention weights from each of the attention heads in the encoder.

    import sys
    !rm -r bertviz_repo
    !test -d bertviz_repo || git clone https://github.com/jessevig/bertviz bertviz_repo
    if not "bertviz_repo" in sys.path:
        sys.path += ["bertviz_repo"]
    from bertviz import head_view
    # Print input ids
    print (ids)
    print (tokenizer.batch_decode(ids))

    # Get encoder attentions
    seq, pool, attn = model.transformer(input_ids=ids, attention_mask=masks, output_attentions=True)
    print (len(attn)) # 12 attention layers (heads)
    print (attn[0].shape)
    # HTML set up
    def call_html():
        import IPython
        display(IPython.core.display.HTML('''
        '''))
    # Visualize self-attention weights
    call_html()
    tokens = tokenizer.convert_ids_to_tokens(ids[0])
    head_view(attention=attn, tokens=tokens)

  • Original article: https://blog.csdn.net/sikh_0529/article/details/126785500