Transformer:
先验假设(归纳偏置)的强度与所需数据(样本)量成反比:归纳偏置越强,模型所需的训练数据量越少;反之,偏置越弱(如Transformer),所需数据量越大。
Transformer的Loss函数
PyTorch中,CrossEntropy的输入,期望Class放在第2维,Batch放在第1维,可以是类别索引(Class indices),也可以是类别的概率(Probabilities for each class)。
reduction默认是mean,例如6个单词取平均交叉熵;reduction是none时,输出逐项(不聚合)的交叉熵。交叉熵的计算方式:先做softmax,再取-ln(prob)。
参考:CLIP算法的Loss详解 和 交叉熵CrossEntropy实现
# Define the softmax function (numerically stable).
def softmax(x):
    """Return softmax probabilities over the last axis of *x*.

    Subtracting the per-row maximum before exponentiating prevents
    overflow in ``np.exp`` for large logits; the result is
    mathematically identical to exp(x) / sum(exp(x)).
    Works for 1-D logit vectors and, via ``axis=-1``/``keepdims``,
    also for batched 2-D inputs.
    """
    x = np.asarray(x, dtype=float)
    shifted = x - np.max(x, axis=-1, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=-1, keepdims=True)
# Compute cross-entropy with NumPy (vectorized, numerically stable).
def cross_entropy_np(x, y):
    """Mean cross-entropy of raw logits *x* against class labels *y*.

    Args:
        x: array-like of shape (N, C) — raw (unnormalized) logits.
        y: length-N integer class indices; nested forms such as
           ``[[2]]`` (used by the test code in this file) are
           flattened first.

    Returns:
        Scalar mean of ``-log(softmax(x)[i, y[i]])`` over the N rows.
    """
    logits = np.asarray(x, dtype=float)
    labels = np.asarray(y).reshape(-1).astype(int)
    # Stable log-softmax: shift by the row max before exponentiating
    # so np.exp cannot overflow.
    shifted = logits - logits.max(axis=1, keepdims=True)
    log_probs = shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))
    picked = log_probs[np.arange(labels.size), labels]
    return -picked.mean()
# Test driver: compare the NumPy implementation against F.cross_entropy.
x = [[1.9269, 1.4873, 0.9007, -2.1055]]  # logits: 1 position, 4 classes
y = [[2]]  # target class index
v1 = cross_entropy_np(x, y)
print(f"v1: {v1}")
# Reshape for PyTorch: add a batch dim, then move classes to dim 1.
x = torch.unsqueeze(torch.Tensor(x), dim=0)
x = x.transpose(1, 2)  # CrossEntropy expects (Batch, Class, ...): class in dim 2, batch in dim 1
y = torch.Tensor(y)
y = y.to(torch.long)  # labels must have dtype long
v2 = F.cross_entropy(x, y, reduction="none")  # per-element loss; should match v1
print(f"v2: {v2}")
随机种子:torch.manual_seed(42)。注意:若要得到可复现的结果,在每次调用rand/randn/randint之前都需要重新设置随机种子。
构建序列建模的Mask,如下:
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
import numpy as np
# Demo: per-token cross-entropy with a length mask vs. ignore_index.
# Logits: batch_size=2, seqlen=3, vocab_size=4.
torch.manual_seed(42)
logits = torch.randn(2, 3, 4)
logits = logits.transpose(1, 2)  # -> (batch, vocab, seq): class dim must be dim 1
print(f'[Info] logits.shape: {logits.shape}')
print(f'[Info] logits: \n{logits}')
# logits_softmax = F.softmax(logits, dim=1)
# print(f'[Info] logits_softmax: \n{logits_softmax}')
# Labels: batch_size=2, seqlen=3, each value a class index in [0, vocab_size).
torch.manual_seed(42)  # re-seed before each random call for reproducibility
label = torch.randint(0, 4, (2, 3))
print(f'[Info] label.shape: {label.shape}')
print(f'[Info] label: \n{label}')
# loss: torch.nn.CrossEntropyLoss -> F.cross_entropy
# shapes: (2x4x3) logits + (2x3) labels -> (2x3) per-token losses
val = F.cross_entropy(logits, label, reduction="none")
print(f"[Info] val.shape: {val.shape}")
print(f"[Info] val: \n{val}")
# Add a mask to the loss — similar in effect to the ignore_index
# parameter (whose default is -100).
tgt_len = torch.Tensor([2,3]).to(torch.int32)  # true sequence lengths per batch item
# Build a 0/1 mask per sequence: ones up to its length, zero-padded to max length.
mask = [F.pad(torch.ones(L), (0, max(tgt_len)-L)) for L in tgt_len]
mask = torch.stack(mask)
print(f"[Info] mask: \n{mask}")
val = F.cross_entropy(logits, label, reduction="none") * mask  # zero out padded positions
print(f"[Info] val.shape: {val.shape}")
print(f"[Info] val: \n{val}")
# Same effect via ignore_index (default -100): that position's loss becomes 0.
label[0, 2] = -100
val = F.cross_entropy(logits, label, reduction="none")
print(f"[Info] val.shape: {val.shape}")
print(f"[Info] val: \n{val}")