Transformer from "Attention Is All You Need": a PyTorch implementation


    Components
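
    The code below assumes the standard PyTorch imports; the original post does not show them, so the following preamble is an assumed addition:

    import copy
    import math

    import torch
    import torch.nn as nn
    import torch.nn.functional as F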

    Embeddings

    class Embeddings(nn.Module):
      def __init__(self, vocab_size, embedding_dim):
        """
        vocab_size: size of the vocabulary
        embedding_dim: dimension of the token embeddings
        """
        super().__init__()
        # lut: look-up table; embedding works like a table lookup on token ids
        self.lut = nn.Embedding(vocab_size, embedding_dim)
        self.embedding_dim = embedding_dim

      def forward(self, x):
        """
        Scale the embedded tokens by sqrt(embedding_dim) to control their magnitude.
        """
        return self.lut(x) * math.sqrt(self.embedding_dim)
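
    The usual motivation for the sqrt(embedding_dim) factor is to keep the embedding magnitudes comparable to the positional encodings added in the next component. A quick shape check (a hypothetical usage sketch; the token ids are arbitrary):

    # Hypothetical usage: a batch of 2 sequences, 4 token ids each.
    embed = Embeddings(vocab_size = 10, embedding_dim = 512)
    tokens = torch.tensor([[1, 2, 3, 4], [5, 6, 7, 8]])
    print(embed(tokens).shape)   # torch.Size([2, 4, 512])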
    

    PositionalEncoding

    class PositionalEncoding(nn.Module):
      def __init__(self, embedding_dim, dropout, max_len = 5000):
        super().__init__()

        self.dropout = nn.Dropout(p = dropout)

        # Pre-compute the sinusoidal position encodings once, up to max_len.
        pos_encode = torch.zeros(max_len, embedding_dim)
        position = torch.arange(0, max_len, dtype = torch.float).unsqueeze(1)

        # 10000^(2i / d_model) term from the paper, computed in log space.
        div_term = torch.exp(torch.arange(0, embedding_dim, 2, dtype = torch.float) *
                            - (math.log(10000.0) / embedding_dim))

        pos_encode[:, 0::2] = torch.sin(position * div_term)
        pos_encode[:, 1::2] = torch.cos(position * div_term)

        pos_encode = pos_encode.unsqueeze(0)  # (1, max_len, embedding_dim)

        # register_buffer: saved with the module but not a trainable parameter.
        self.register_buffer('pos_encode', pos_encode)

      def forward(self, x):
        # Buffers do not require grad, so no Variable / detach is needed here.
        x = x + self.pos_encode[:, :x.size(1)]
        return self.dropout(x)
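
    A minimal sketch of how the positional encoding composes with the Embeddings sketch above (module and variable names come from that sketch):

    # Hypothetical usage, continuing the Embeddings sketch above.
    pe = PositionalEncoding(embedding_dim = 512, dropout = 0.1)
    x = pe(embed(tokens))
    print(x.shape)               # torch.Size([2, 4, 512]); same shape, positions added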
    

    MultiHeadedAttention

    class MultiHeadedAttention(nn.Module):
      def __init__(self, head, embedding_dim, dropout = 0.1):
        super().__init__()

        # The embedding dimension must divide evenly across the heads.
        assert embedding_dim % head == 0

        self.head_dim = embedding_dim // head

        self.head = head
        self.embedding_dim = embedding_dim

        # Four linear layers: query, key, value projections plus the output projection.
        self.linears = clones(nn.Linear(embedding_dim, embedding_dim), 4)
        self.attention = None
        self.dropout = nn.Dropout(p = dropout)

      def forward(self, query, key, value, mask = None):
        if mask is not None:
          # Add a head dimension so the same mask broadcasts over all heads.
          mask = mask.unsqueeze(1)

        batch_size = query.size(0)

        # Project, then split into heads: (batch, head, seq_len, head_dim)
        query, key, value = \
          [model(x).view(batch_size, -1, self.head, self.head_dim).transpose(1, 2)
          for model, x in zip(self.linears, (query, key, value))]

        x, self.attention = attention(query, key, value, mask = mask, dropout = self.dropout)

        # Merge the heads back: (batch, seq_len, embedding_dim)
        x = x.transpose(1, 2).contiguous().view(batch_size, -1, self.head * self.head_dim)

        return self.linears[-1](x)
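
    A self-attention shape check (a sketch; note that this module relies on the clones helper and the attention function defined further below):

    # Hypothetical usage: 8-head self-attention over a (2, 4, 512) input.
    mha = MultiHeadedAttention(head = 8, embedding_dim = 512)
    q = torch.randn(2, 4, 512)             # (batch, seq_len, embedding_dim)
    out = mha(q, q, q)                     # query = key = value for self-attention
    print(out.shape)                       # torch.Size([2, 4, 512])
    print(mha.attention.shape)             # torch.Size([2, 8, 4, 4]) attention weights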
    

    PositionwiseFeedForward

    class PositionwiseFeedForward(nn.Module):
      def __init__(self, embedding_dim, ff_dim, dropout):
        super().__init__()
    
        self.l1 = nn.Linear(embedding_dim, ff_dim)
        self.l2 = nn.Linear(ff_dim, embedding_dim)
        self.dropout = nn.Dropout(p = dropout)
    
      def forward(self, x):
        return self.l2(self.dropout(F.relu(self.l1(x))))
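
    The feed-forward block is applied position-wise, so it only expands and contracts the feature dimension internally; a quick check (a sketch):

    # Hypothetical usage: ff_dim is the hidden width between the two linear layers.
    ffn = PositionwiseFeedForward(embedding_dim = 512, ff_dim = 2048, dropout = 0.1)
    print(ffn(torch.randn(2, 4, 512)).shape)   # torch.Size([2, 4, 512])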
    

    LayerNorm

    class LayerNorm(nn.Module):
      def __init__(self, embedding_dim, eps = 1e-6):
        super().__init__()

        # Learnable per-feature scale (w1) and shift (w2).
        self.w1 = nn.Parameter(torch.ones(embedding_dim))
        self.w2 = nn.Parameter(torch.zeros(embedding_dim))
        self.eps = eps

      def forward(self, x):
        mean = x.mean(-1, keepdim = True)
        std = x.std(-1, keepdim = True)
        return self.w1 * (x - mean) / (std + self.eps) + self.w2
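
    This hand-rolled LayerNorm normalizes over the last dimension with a learned scale (w1) and shift (w2). It is close to, though not numerically identical to, torch.nn.LayerNorm (it uses the sample standard deviation and adds eps to the std rather than to the variance). A quick sanity check (a sketch):

    # Hypothetical check: each position ends up with roughly zero mean and unit std.
    ln = LayerNorm(embedding_dim = 512)
    y = ln(torch.randn(2, 4, 512))
    print(y.mean(-1)[0, 0].item())   # approximately 0
    print(y.std(-1)[0, 0].item())    # approximately 1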
    

    SublayerConnection

    class SublayerConnection(nn.Module):
      def __init__(self, embedding_dim, dropout = 0.1):
        super().__init__()
    
        self.norm = LayerNorm(embedding_dim)
        self.dropout = nn.Dropout(p = dropout)
    
      def forward(self, x, func):
        return x + self.dropout(func(self.norm(x)))
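
    This is a pre-norm residual wrapper: normalize first, apply the sublayer, apply dropout, then add the residual (the paper describes post-norm; this ordering is a common and stable variant). A tiny usage sketch, reusing the MultiHeadedAttention module from above:

    # Hypothetical usage: wrap an attention block in a residual + pre-norm connection.
    sub = SublayerConnection(embedding_dim = 512)
    mha = MultiHeadedAttention(head = 8, embedding_dim = 512)
    x = torch.randn(2, 4, 512)
    print(sub(x, lambda t: mha(t, t, t)).shape)   # torch.Size([2, 4, 512])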
    

    Clones

    A helper for creating N identical copies of the same layer.

    def clones(module, N):
      return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])
    

    Attention

    def attention(query, key, value, mask = None, dropout = None):
      # d_k is the size of the last dimension of the query
      # (the per-head dimension when called from MultiHeadedAttention).
      d_k = query.size(-1)

      # Scaled dot-product attention scores: (batch, head, q_len, k_len)
      scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)

      if mask is not None:
        # Positions where mask == 0 are blocked with a large negative value.
        scores = scores.masked_fill(mask == 0, -1e9)

      scores = F.softmax(scores, dim = -1)

      if dropout is not None:
        scores = dropout(scores)

      return torch.matmul(scores, value), scores
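
    The decoder's target mask has to hide future positions. A helper like the following is commonly used for that; it is not part of the excerpt above, so treat it as an assumed addition:

    def subsequent_mask(size):
      # Lower-triangular matrix: position i may only attend to positions <= i.
      # 1 = keep, 0 = masked out (matches the mask == 0 check in attention()).
      return torch.tril(torch.ones(1, size, size, dtype = torch.uint8))

    A typical usage is trg_mask = subsequent_mask(target.size(1)), optionally combined with a padding mask.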
    

    Encoder

    EncoderLayer

    class EncoderLayer(nn.Module):
      def __init__(self, embedding_dim, self_attention, feed_forward, dropout):
        super().__init__()
    
        self.self_attention = self_attention
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(embedding_dim, dropout), 2)
        self.embedding_dim = embedding_dim
      
      def forward(self, x, mask):
        x = self.sublayer[0](x, lambda x : self.self_attention(x, x, x, mask))
        return self.sublayer[1](x, self.feed_forward)
    

    Encoder

    class Encoder(nn.Module):
      def __init__(self, encoder_layer, N):
        super().__init__()
    
        self.encoder_layers = clones(encoder_layer, N)
        self.norm = LayerNorm(encoder_layer.embedding_dim)
    
      def forward(self, x, mask):
        for encoder_layer in self.encoder_layers:
          x = encoder_layer(x, mask)
    
        return self.norm(x)
    

    Decoder

    DecoderLayer

    class DecoderLayer(nn.Module):
      def __init__(self, embedding_dim, self_attention, attention, feed_forward, dropout):
        super().__init__()
    
        self.embedding_dim = embedding_dim
        self.self_attention = self_attention
        self.attention = attention
        self.feed_forward = feed_forward
    
        self.sublayer = clones(SublayerConnection(embedding_dim, dropout), 3)
    
      def forward(self, x, encode_kv, src_mask, trg_mask):
        x = self.sublayer[0](x, lambda x : self.self_attention(x, x, x, trg_mask))
        x = self.sublayer[1](x, lambda x : self.attention(x, encode_kv, encode_kv, src_mask))
    
        return self.sublayer[2](x, self.feed_forward)
    

    Decoder

    class Decoder(nn.Module):
      def __init__(self, decoder_layer, N):
        super().__init__()

        self.decoder_layers = clones(decoder_layer, N)
        self.norm = LayerNorm(decoder_layer.embedding_dim)

      def forward(self, x, encode_kv, src_mask, trg_mask):
        for decoder_layer in self.decoder_layers:
          x = decoder_layer(x, encode_kv, src_mask, trg_mask)

        return self.norm(x)
    

    Transformer

    OutputLayer

    class OutputLayer(nn.Module):
      def __init__(self, embedding_dim, vocab_size):
        super().__init__()
    
        self.linear = nn.Linear(embedding_dim, vocab_size)
    
      def forward(self, x):
        return F.log_softmax(self.linear(x), dim = -1)
    

    Transformer

    class Transformer(nn.Module):
      def __init__(self, encoder, decoder, src_embed, trg_embed, generator):
        super().__init__()
    
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.trg_embed = trg_embed
        self.generator = generator
    
      def forward(self, source, target, src_mask, trg_mask):
        return self.decode(self.encode(source, src_mask), src_mask, target, trg_mask)
    
      def encode(self, source, src_mask):
        return self.encoder(self.src_embed(source), src_mask)
    
      def decode(self, encode_kv, src_mask, target, trg_mask):
        return self.decoder(self.trg_embed(target), encode_kv, src_mask, trg_mask)
    
    def build_model(src_vocab_size, trg_vocab_size, N = 6, embedding_dim = 512, ff_dim = 2048, head = 8, dropout = 0.1):
      c = copy.deepcopy
      attention = MultiHeadedAttention(head, embedding_dim, dropout)
      ff = PositionwiseFeedForward(embedding_dim, ff_dim, dropout)
      position = PositionalEncoding(embedding_dim, dropout)
    
      model = Transformer(
          Encoder(EncoderLayer(embedding_dim, c(attention), c(ff), dropout), N),
          Decoder(DecoderLayer(embedding_dim, c(attention), c(attention), c(ff), dropout), N),
          nn.Sequential(Embeddings(src_vocab_size, embedding_dim), c(position)),
          nn.Sequential(Embeddings(trg_vocab_size, embedding_dim), c(position)),
          OutputLayer(embedding_dim, trg_vocab_size)
      )
    
      for p in model.parameters():
        if p.dim() > 1:
          nn.init.xavier_uniform_(p)
    
      return model
    

    Example

    embedding_dim = 512
    ff_dim = 64
    head = 8
    N = 6
    dropout = 0.1
    src_vocab_size = 10
    trg_vocab_size = 10
    
    model = build_model(src_vocab_size, trg_vocab_size, N, embedding_dim, ff_dim, head, dropout)
    print(model)
    
    Transformer(
      (encoder): Encoder(
        (encoder_layers): ModuleList(
          (0): EncoderLayer(
            (self_attention): MultiHeadedAttention(
              (linears): ModuleList(
                (0): Linear(in_features=512, out_features=512, bias=True)
                (1): Linear(in_features=512, out_features=512, bias=True)
                (2): Linear(in_features=512, out_features=512, bias=True)
                (3): Linear(in_features=512, out_features=512, bias=True)
              )
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (feed_forward): PositionwiseFeedForward(
              (l1): Linear(in_features=512, out_features=64, bias=True)
              (l2): Linear(in_features=64, out_features=512, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (sublayer): ModuleList(
              (0): SublayerConnection(
                (norm): LayerNorm()
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (1): SublayerConnection(
                (norm): LayerNorm()
                (dropout): Dropout(p=0.1, inplace=False)
              )
            )
          )
          (1): EncoderLayer(
            (self_attention): MultiHeadedAttention(
              (linears): ModuleList(
                (0): Linear(in_features=512, out_features=512, bias=True)
                (1): Linear(in_features=512, out_features=512, bias=True)
                (2): Linear(in_features=512, out_features=512, bias=True)
                (3): Linear(in_features=512, out_features=512, bias=True)
              )
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (feed_forward): PositionwiseFeedForward(
              (l1): Linear(in_features=512, out_features=64, bias=True)
              (l2): Linear(in_features=64, out_features=512, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (sublayer): ModuleList(
              (0): SublayerConnection(
                (norm): LayerNorm()
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (1): SublayerConnection(
                (norm): LayerNorm()
                (dropout): Dropout(p=0.1, inplace=False)
              )
            )
          )
          (2): EncoderLayer(
            (self_attention): MultiHeadedAttention(
              (linears): ModuleList(
                (0): Linear(in_features=512, out_features=512, bias=True)
                (1): Linear(in_features=512, out_features=512, bias=True)
                (2): Linear(in_features=512, out_features=512, bias=True)
                (3): Linear(in_features=512, out_features=512, bias=True)
              )
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (feed_forward): PositionwiseFeedForward(
              (l1): Linear(in_features=512, out_features=64, bias=True)
              (l2): Linear(in_features=64, out_features=512, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (sublayer): ModuleList(
              (0): SublayerConnection(
                (norm): LayerNorm()
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (1): SublayerConnection(
                (norm): LayerNorm()
                (dropout): Dropout(p=0.1, inplace=False)
              )
            )
          )
          (3): EncoderLayer(
            (self_attention): MultiHeadedAttention(
              (linears): ModuleList(
                (0): Linear(in_features=512, out_features=512, bias=True)
                (1): Linear(in_features=512, out_features=512, bias=True)
                (2): Linear(in_features=512, out_features=512, bias=True)
                (3): Linear(in_features=512, out_features=512, bias=True)
              )
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (feed_forward): PositionwiseFeedForward(
              (l1): Linear(in_features=512, out_features=64, bias=True)
              (l2): Linear(in_features=64, out_features=512, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (sublayer): ModuleList(
              (0): SublayerConnection(
                (norm): LayerNorm()
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (1): SublayerConnection(
                (norm): LayerNorm()
                (dropout): Dropout(p=0.1, inplace=False)
              )
            )
          )
          (4): EncoderLayer(
            (self_attention): MultiHeadedAttention(
              (linears): ModuleList(
                (0): Linear(in_features=512, out_features=512, bias=True)
                (1): Linear(in_features=512, out_features=512, bias=True)
                (2): Linear(in_features=512, out_features=512, bias=True)
                (3): Linear(in_features=512, out_features=512, bias=True)
              )
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (feed_forward): PositionwiseFeedForward(
              (l1): Linear(in_features=512, out_features=64, bias=True)
              (l2): Linear(in_features=64, out_features=512, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (sublayer): ModuleList(
              (0): SublayerConnection(
                (norm): LayerNorm()
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (1): SublayerConnection(
                (norm): LayerNorm()
                (dropout): Dropout(p=0.1, inplace=False)
              )
            )
          )
          (5): EncoderLayer(
            (self_attention): MultiHeadedAttention(
              (linears): ModuleList(
                (0): Linear(in_features=512, out_features=512, bias=True)
                (1): Linear(in_features=512, out_features=512, bias=True)
                (2): Linear(in_features=512, out_features=512, bias=True)
                (3): Linear(in_features=512, out_features=512, bias=True)
              )
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (feed_forward): PositionwiseFeedForward(
              (l1): Linear(in_features=512, out_features=64, bias=True)
              (l2): Linear(in_features=64, out_features=512, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (sublayer): ModuleList(
              (0): SublayerConnection(
                (norm): LayerNorm()
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (1): SublayerConnection(
                (norm): LayerNorm()
                (dropout): Dropout(p=0.1, inplace=False)
              )
            )
          )
        )
        (norm): LayerNorm()
      )
      (decoder): Decoder(
      (decoder_layers): ModuleList(
          (0): DecoderLayer(
            (self_attention): MultiHeadedAttention(
              (linears): ModuleList(
                (0): Linear(in_features=512, out_features=512, bias=True)
                (1): Linear(in_features=512, out_features=512, bias=True)
                (2): Linear(in_features=512, out_features=512, bias=True)
                (3): Linear(in_features=512, out_features=512, bias=True)
              )
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (attention): MultiHeadedAttention(
              (linears): ModuleList(
                (0): Linear(in_features=512, out_features=512, bias=True)
                (1): Linear(in_features=512, out_features=512, bias=True)
                (2): Linear(in_features=512, out_features=512, bias=True)
                (3): Linear(in_features=512, out_features=512, bias=True)
              )
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (feed_forward): PositionwiseFeedForward(
              (l1): Linear(in_features=512, out_features=64, bias=True)
              (l2): Linear(in_features=64, out_features=512, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (sublayer): ModuleList(
              (0): SublayerConnection(
                (norm): LayerNorm()
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (1): SublayerConnection(
                (norm): LayerNorm()
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (2): SublayerConnection(
                (norm): LayerNorm()
                (dropout): Dropout(p=0.1, inplace=False)
              )
            )
          )
          (1): DecoderLayer(
            (self_attention): MultiHeadedAttention(
              (linears): ModuleList(
                (0): Linear(in_features=512, out_features=512, bias=True)
                (1): Linear(in_features=512, out_features=512, bias=True)
                (2): Linear(in_features=512, out_features=512, bias=True)
                (3): Linear(in_features=512, out_features=512, bias=True)
              )
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (attention): MultiHeadedAttention(
              (linears): ModuleList(
                (0): Linear(in_features=512, out_features=512, bias=True)
                (1): Linear(in_features=512, out_features=512, bias=True)
                (2): Linear(in_features=512, out_features=512, bias=True)
                (3): Linear(in_features=512, out_features=512, bias=True)
              )
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (feed_forward): PositionwiseFeedForward(
              (l1): Linear(in_features=512, out_features=64, bias=True)
              (l2): Linear(in_features=64, out_features=512, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (sublayer): ModuleList(
              (0): SublayerConnection(
                (norm): LayerNorm()
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (1): SublayerConnection(
                (norm): LayerNorm()
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (2): SublayerConnection(
                (norm): LayerNorm()
                (dropout): Dropout(p=0.1, inplace=False)
              )
            )
          )
          (2): DecoderLayer(
            (self_attention): MultiHeadedAttention(
              (linears): ModuleList(
                (0): Linear(in_features=512, out_features=512, bias=True)
                (1): Linear(in_features=512, out_features=512, bias=True)
                (2): Linear(in_features=512, out_features=512, bias=True)
                (3): Linear(in_features=512, out_features=512, bias=True)
              )
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (attention): MultiHeadedAttention(
              (linears): ModuleList(
                (0): Linear(in_features=512, out_features=512, bias=True)
                (1): Linear(in_features=512, out_features=512, bias=True)
                (2): Linear(in_features=512, out_features=512, bias=True)
                (3): Linear(in_features=512, out_features=512, bias=True)
              )
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (feed_forward): PositionwiseFeedForward(
              (l1): Linear(in_features=512, out_features=64, bias=True)
              (l2): Linear(in_features=64, out_features=512, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (sublayer): ModuleList(
              (0): SublayerConnection(
                (norm): LayerNorm()
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (1): SublayerConnection(
                (norm): LayerNorm()
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (2): SublayerConnection(
                (norm): LayerNorm()
                (dropout): Dropout(p=0.1, inplace=False)
              )
            )
          )
          (3): DecoderLayer(
            (self_attention): MultiHeadedAttention(
              (linears): ModuleList(
                (0): Linear(in_features=512, out_features=512, bias=True)
                (1): Linear(in_features=512, out_features=512, bias=True)
                (2): Linear(in_features=512, out_features=512, bias=True)
                (3): Linear(in_features=512, out_features=512, bias=True)
              )
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (attention): MultiHeadedAttention(
              (linears): ModuleList(
                (0): Linear(in_features=512, out_features=512, bias=True)
                (1): Linear(in_features=512, out_features=512, bias=True)
                (2): Linear(in_features=512, out_features=512, bias=True)
                (3): Linear(in_features=512, out_features=512, bias=True)
              )
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (feed_forward): PositionwiseFeedForward(
              (l1): Linear(in_features=512, out_features=64, bias=True)
              (l2): Linear(in_features=64, out_features=512, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (sublayer): ModuleList(
              (0): SublayerConnection(
                (norm): LayerNorm()
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (1): SublayerConnection(
                (norm): LayerNorm()
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (2): SublayerConnection(
                (norm): LayerNorm()
                (dropout): Dropout(p=0.1, inplace=False)
              )
            )
          )
          (4): DecoderLayer(
            (self_attention): MultiHeadedAttention(
              (linears): ModuleList(
                (0): Linear(in_features=512, out_features=512, bias=True)
                (1): Linear(in_features=512, out_features=512, bias=True)
                (2): Linear(in_features=512, out_features=512, bias=True)
                (3): Linear(in_features=512, out_features=512, bias=True)
              )
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (attention): MultiHeadedAttention(
              (linears): ModuleList(
                (0): Linear(in_features=512, out_features=512, bias=True)
                (1): Linear(in_features=512, out_features=512, bias=True)
                (2): Linear(in_features=512, out_features=512, bias=True)
                (3): Linear(in_features=512, out_features=512, bias=True)
              )
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (feed_forward): PositionwiseFeedForward(
              (l1): Linear(in_features=512, out_features=64, bias=True)
              (l2): Linear(in_features=64, out_features=512, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (sublayer): ModuleList(
              (0): SublayerConnection(
                (norm): LayerNorm()
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (1): SublayerConnection(
                (norm): LayerNorm()
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (2): SublayerConnection(
                (norm): LayerNorm()
                (dropout): Dropout(p=0.1, inplace=False)
              )
            )
          )
          (5): DecoderLayer(
            (self_attention): MultiHeadedAttention(
              (linears): ModuleList(
                (0): Linear(in_features=512, out_features=512, bias=True)
                (1): Linear(in_features=512, out_features=512, bias=True)
                (2): Linear(in_features=512, out_features=512, bias=True)
                (3): Linear(in_features=512, out_features=512, bias=True)
              )
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (attention): MultiHeadedAttention(
              (linears): ModuleList(
                (0): Linear(in_features=512, out_features=512, bias=True)
                (1): Linear(in_features=512, out_features=512, bias=True)
                (2): Linear(in_features=512, out_features=512, bias=True)
                (3): Linear(in_features=512, out_features=512, bias=True)
              )
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (feed_forward): PositionwiseFeedForward(
              (l1): Linear(in_features=512, out_features=64, bias=True)
              (l2): Linear(in_features=64, out_features=512, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (sublayer): ModuleList(
              (0): SublayerConnection(
                (norm): LayerNorm()
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (1): SublayerConnection(
                (norm): LayerNorm()
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (2): SublayerConnection(
                (norm): LayerNorm()
                (dropout): Dropout(p=0.1, inplace=False)
              )
            )
          )
        )
        (norm): LayerNorm()
      )
      (src_embed): Sequential(
        (0): Embeddings(
          (lut): Embedding(10, 512)
        )
        (1): PositionalEncoding(
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (trg_embed): Sequential(
        (0): Embeddings(
          (lut): Embedding(10, 512)
        )
        (1): PositionalEncoding(
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (generator): OutputLayer(
        (linear): Linear(in_features=512, out_features=10, bias=True)
      )
    )
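
    A minimal end-to-end sketch of running the built model on dummy data. It reuses model, src_vocab_size and trg_vocab_size from the Example above and the subsequent_mask helper sketched earlier; the token ids are arbitrary:

    # Hypothetical smoke test: dummy source/target batches of token ids.
    source = torch.randint(1, src_vocab_size, (2, 7))    # (batch, src_len)
    target = torch.randint(1, trg_vocab_size, (2, 5))    # (batch, trg_len)

    src_mask = torch.ones(2, 1, 7, dtype = torch.uint8)  # no padding in this toy batch
    trg_mask = subsequent_mask(5)                        # hide future target positions

    decoder_output = model(source, target, src_mask, trg_mask)
    log_probs = model.generator(decoder_output)          # generator is applied separately

    print(decoder_output.shape)   # torch.Size([2, 5, 512])
    print(log_probs.shape)        # torch.Size([2, 5, 10])
    print(sum(p.numel() for p in model.parameters()))    # total number of parameters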
    
  • Original post: https://blog.csdn.net/weixin_45591452/article/details/126557038