【paddle】Vision Transformer（attention）

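可以参考 PaddlePaddle 源码中的 transformer.py（python/paddle/nn/layer/transformer.py），其中包含官方的 MultiHeadAttention 与 TransformerEncoder 实现。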

在这里插入图片描述

多头注意力

在这里插入图片描述

class Attetion(nn.Layer):
    """Multi-head self-attention using scaled dot-product attention.

    Forward pass:
        - Project every input token of size D into Q, K and V with a single
          linear layer.
        - Split Q, K and V into ``num_head`` heads so that each head receives
          an ``[N, head_dims]`` slice of every sample.
        - Compute scaled dot-product attention independently per head.
        - Concatenate the heads and map them back to size D through a final
          linear layer.
    """

    def __init__(self,
                 embed_dims=768,
                 num_head=12,
                 attn_dropout_rate=0.,
                 dropout_rate=0.):
        """
        Shape notation: B (batch_size), N (seq_len), D (embed_dims).

        :param embed_dims: size of the last dimension of the input
        :param num_head: number of attention heads; must divide embed_dims
        :param attn_dropout_rate: dropout rate applied to the attention weights
        :param dropout_rate: dropout rate applied to the projected output
        """
        super(Attetion, self).__init__()
        self.embed_dims = embed_dims
        self.num_head = num_head
        assert embed_dims % num_head == 0, \
            "Warning Attetion embed_dims % num_head != 0"
        self.head_dims = embed_dims // num_head
        self.scale = self.head_dims ** -0.5  # 1 / sqrt(head_dims)

        # One fused projection produces Q, K and V: [B, N, D] -> [B, N, 3*D]
        self.qkv_proj = nn.Linear(in_features=embed_dims,
                                  out_features=3 * self.head_dims * self.num_head)
        # Output projection applied after the heads are concatenated.
        self.out = nn.Linear(in_features=self.head_dims * self.num_head,
                             out_features=embed_dims)

        # Normalise over the key axis (the last axis of the [B, H, N, N] scores).
        self.softmax = nn.Softmax(axis=-1)
        self.attn_dropout = nn.Dropout(p=attn_dropout_rate)
        self.dropout = nn.Dropout(p=dropout_rate)

    def _split_heads(self, x, batch, seq_len):
        """Reshape [B, N, D] into per-head layout [B, num_head, N, head_dims]."""
        x = x.reshape(shape=[batch, seq_len, self.num_head, self.head_dims])
        return x.transpose(perm=[0, 2, 1, 3])

    def forward(self, inputs):
        """Apply self-attention to ``inputs`` of shape [B, N, D]; returns [B, N, D]."""
        qkv = self.qkv_proj(inputs)  # [B, N, 3*D]
        q, k, v = qkv.chunk(3, axis=-1)  # each [B, N, D]

        B, N, _ = q.shape

        # Hand every head its own head_dims-wide slice of the embedding.
        q = self._split_heads(q, B, N)
        k = self._split_heads(k, B, N)
        v = self._split_heads(v, B, N)

        # Attention scores q @ k^T: [B, num_head, N, N]
        attn = paddle.matmul(q, k, transpose_y=True)
        attn = attn * self.scale
        attn = self.softmax(attn)  # attention distribution
        attn = self.attn_dropout(attn)

        z = paddle.matmul(attn, v)  # [B, num_head, N, head_dims]
        z = z.transpose(perm=[0, 2, 1, 3])  # [B, N, num_head, head_dims]
        z = z.reshape(shape=[B, N, self.num_head * self.head_dims])

        # As in the paper: concatenate the heads first, then apply a Linear.
        z = self.out(z)  # [B, N, D]
        z = self.dropout(z)

        return z
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77

代码

import paddle
from paddle import nn


class MLP(nn.Layer):
    """Two-layer feed-forward block.

    Forward pass:
        - Map the input features to a wider hidden dimension.
        - Apply the activation, project to the output size, then apply dropout.
    """

    def __init__(self,
                 in_features,
                 out_features=None,
                 mlp_ratio=4,
                 dropout_rate=0.,
                 act=nn.GELU):
        """
        :param in_features: size of the input features
        :param out_features: size of the output features; defaults to
            ``in_features`` when None
        :param mlp_ratio: hidden size is ``int(in_features * mlp_ratio)``
        :param dropout_rate: dropout rate applied to the output
        :param act: activation layer class (it is instantiated with no
            arguments), e.g. nn.GELU
        """
        super(MLP, self).__init__()
        self.in_features = in_features
        # Fall back to the input size only when no output size was requested.
        self.out_features = in_features if out_features is None \
            else out_features
        self.mlp_ratio = mlp_ratio
        self.dropout_rate = dropout_rate

        hidden_features = int(in_features * mlp_ratio)

        # Expand from the input size to the hidden size.
        self.fc1 = nn.Linear(in_features=in_features,
                             out_features=hidden_features)

        # Project from the hidden size down to the requested output size.
        self.fc2 = nn.Linear(in_features=hidden_features,
                             out_features=self.out_features)

        self.act = act()
        self.dropout = nn.Dropout(p=dropout_rate)

    def forward(self, inputs):
        """Run fc1 -> activation -> fc2 -> dropout on ``inputs``."""
        x = self.fc1(inputs)
        x = self.act(x)
        x = self.fc2(x)
        x = self.dropout(x)
        return x


class Attetion(nn.Layer):
    """Multi-head self-attention using scaled dot-product attention.

    Forward pass:
        - Project every input token of size D into Q, K and V with a single
          linear layer.
        - Split Q, K and V into ``num_head`` heads so that each head receives
          an ``[N, head_dims]`` slice of every sample.
        - Compute scaled dot-product attention independently per head.
        - Concatenate the heads and map them back to size D through a final
          linear layer.
    """

    def __init__(self,
                 embed_dims=768,
                 num_head=12,
                 attn_dropout_rate=0.,
                 dropout_rate=0.):
        """
        Shape notation: B (batch_size), N (seq_len), D (embed_dims).

        :param embed_dims: size of the last dimension of the input
        :param num_head: number of attention heads; must divide embed_dims
        :param attn_dropout_rate: dropout rate applied to the attention weights
        :param dropout_rate: dropout rate applied to the projected output
        """
        super(Attetion, self).__init__()
        self.embed_dims = embed_dims
        self.num_head = num_head
        assert embed_dims % num_head == 0, \
            "Warning Attetion embed_dims % num_head != 0"
        self.head_dims = embed_dims // num_head
        self.scale = self.head_dims ** -0.5  # 1 / sqrt(head_dims)

        # One fused projection produces Q, K and V: [B, N, D] -> [B, N, 3*D]
        self.qkv_proj = nn.Linear(in_features=embed_dims,
                                  out_features=3 * self.head_dims * self.num_head)
        # Output projection applied after the heads are concatenated.
        self.out = nn.Linear(in_features=self.head_dims * self.num_head,
                             out_features=embed_dims)

        # Normalise over the key axis (the last axis of the [B, H, N, N] scores).
        self.softmax = nn.Softmax(axis=-1)
        self.attn_dropout = nn.Dropout(p=attn_dropout_rate)
        self.dropout = nn.Dropout(p=dropout_rate)

    def _split_heads(self, x, batch, seq_len):
        """Reshape [B, N, D] into per-head layout [B, num_head, N, head_dims]."""
        x = x.reshape(shape=[batch, seq_len, self.num_head, self.head_dims])
        return x.transpose(perm=[0, 2, 1, 3])

    def forward(self, inputs):
        """Apply self-attention to ``inputs`` of shape [B, N, D]; returns [B, N, D]."""
        qkv = self.qkv_proj(inputs)  # [B, N, 3*D]
        q, k, v = qkv.chunk(3, axis=-1)  # each [B, N, D]

        B, N, _ = q.shape

        # Hand every head its own head_dims-wide slice of the embedding.
        q = self._split_heads(q, B, N)
        k = self._split_heads(k, B, N)
        v = self._split_heads(v, B, N)

        # Attention scores q @ k^T: [B, num_head, N, N]
        attn = paddle.matmul(q, k, transpose_y=True)
        attn = attn * self.scale
        attn = self.softmax(attn)  # attention distribution
        attn = self.attn_dropout(attn)

        z = paddle.matmul(attn, v)  # [B, num_head, N, head_dims]
        z = z.transpose(perm=[0, 2, 1, 3])  # [B, N, num_head, head_dims]
        z = z.reshape(shape=[B, N, self.num_head * self.head_dims])

        # As in the paper: concatenate the heads first, then apply a Linear.
        z = self.out(z)  # [B, N, D]
        z = self.dropout(z)

        return z


class DropPath(nn.Layer):
    """Stochastic depth: randomly drop the whole branch output per sample.

    For an input whose first axis is the batch axis, each sample is either
    kept (and rescaled) or zeroed out entirely.

    Original author's note: the Paddle source uses
    Dropout(dropout, mode="upscale_in_train") at this position instead.
    """

    def __init__(self, p=0.):
        """
        :param p: probability of dropping a sample's branch output
        """
        super(DropPath, self).__init__()
        self.p = p

    def forward(self, inputs):
        """Return ``inputs`` with whole samples randomly zeroed while training.

        In eval mode, or when ``p <= 0``, the input is returned unchanged.
        """
        if self.p <= 0 or not self.training:
            return inputs

        keep_p = 1.0 - self.p  # probability that a sample is kept
        if keep_p <= 0:
            # Everything is dropped; avoid dividing by zero below.
            return paddle.zeros_like(inputs)

        # One mask value per sample, broadcast over all remaining axes:
        # [B] + [1] * (ndim - 1), e.g. [B, 1, 1] for a [B, N, D] input.
        shape = [inputs.shape[0]] + [1] * (inputs.ndim - 1)
        # paddle.rand samples uniformly from [0, 1), so keep_p + rand lands in
        # [keep_p, 1 + keep_p); flooring gives 1 with probability keep_p and
        # 0 otherwise.
        random_keep = keep_p + paddle.rand(shape=shape, dtype='float32')
        random_mask = random_keep.floor().astype(inputs.dtype)
        # Dividing by keep_p keeps the expected value of the output equal to
        # the input: E[mask / keep_p] = keep_p / keep_p = 1.
        output = inputs / keep_p * random_mask
        return output


class EncoderLayer(nn.Layer):
    """Pre-norm Transformer encoder block.

    Forward pass: norm -> attention -> drop-path -> residual add, followed by
    norm -> MLP -> drop-path -> residual add.
    """

    def __init__(self,
                 # Parameters forwarded to the MLP and the Attetion sub-layers
                 embed_dims=768,
                 mlp_ratio=4,
                 num_head=12,
                 attn_dropout_rate=0.,
                 dropout_rate=0.,
                 droppath_rate=0.,
                 act=nn.GELU,
                 norm=nn.LayerNorm
                 ):
        """
        :param embed_dims: embedding size of the tokens
        :param mlp_ratio: hidden-size expansion ratio of the MLP
        :param num_head: number of attention heads
        :param attn_dropout_rate: dropout rate on the attention weights
        :param dropout_rate: dropout rate on the attention output and the MLP output
        :param droppath_rate: drop rate of the two DropPath layers
        :param act: activation layer class used inside the MLP
        :param norm: normalisation layer class, called as ``norm(embed_dims)``
        """
        super(EncoderLayer, self).__init__()
        self.embed_dims = embed_dims
        self.mlp_ratio = mlp_ratio
        self.num_head = num_head
        self.attn_dropout_rate = attn_dropout_rate
        self.dropout_rate = dropout_rate

        # Two separate norm layers, one per sub-block.
        self.attn_norm = norm(embed_dims)
        self.mlp_norm = norm(embed_dims)

        self.multi_attn = Attetion(embed_dims=embed_dims,
                                   num_head=num_head,
                                   attn_dropout_rate=attn_dropout_rate,
                                   dropout_rate=dropout_rate)
        # Forward the caller's mlp_ratio rather than a hard-coded value.
        self.mlp = MLP(in_features=embed_dims,
                       mlp_ratio=mlp_ratio,
                       dropout_rate=dropout_rate,
                       act=act)
        # Original author's note: the Paddle source uses
        # Dropout(dropout, mode="upscale_in_train") here.
        self.attn_droppath = DropPath(p=droppath_rate)
        self.mlp_droppath = DropPath(p=droppath_rate)

    def forward(self, inputs):
        """Apply the attention and MLP sub-blocks, each with a residual connection."""
        res = inputs  # residual 1
        x = self.attn_norm(inputs)
        x = self.multi_attn(x)
        x = self.attn_droppath(x)
        x = x + res

        res = x  # residual 2
        x = self.mlp_norm(x)
        x = self.mlp(x)
        x = self.mlp_droppath(x)
        x = x + res

        return x


class Encoder(nn.Layer):
    """Stack of ``num_layers`` identically configured EncoderLayer blocks."""

    def __init__(self,
                 num_layers,
                 embed_dims=768,
                 mlp_ratio=4,
                 num_head=12,
                 attn_dropout_rate=0.,
                 dropout_rate=0.,
                 droppath_rate=0.,
                 act=nn.GELU,
                 norm=nn.LayerNorm
                 ):
        """
        :param num_layers: number of EncoderLayer blocks to stack
        :param embed_dims: embedding size of the tokens
        :param mlp_ratio: hidden-size expansion ratio of each MLP
        :param num_head: number of attention heads
        :param attn_dropout_rate: dropout rate on the attention weights
        :param dropout_rate: dropout rate on the attention output and the MLP output
        :param droppath_rate: drop rate passed to every EncoderLayer
        :param act: activation layer class used inside each MLP
        :param norm: normalisation layer class
        """
        super(Encoder, self).__init__()

        self.num_layers = num_layers
        self.embed_dims = embed_dims
        self.mlp_ratio = mlp_ratio
        self.num_head = num_head
        self.attn_dropout_rate = attn_dropout_rate
        self.dropout_rate = dropout_rate

        blocks = [
            EncoderLayer(
                embed_dims=embed_dims,
                mlp_ratio=mlp_ratio,
                num_head=num_head,
                attn_dropout_rate=attn_dropout_rate,
                dropout_rate=dropout_rate,
                droppath_rate=droppath_rate,
                act=act,
                norm=norm
            )
            for _ in range(num_layers)
        ]
        # LayerList registers the sub-layers and can be indexed like a list.
        self.encoder_blocks = nn.LayerList(blocks)

    def forward(self, inputs):
        """Pass ``inputs`` through every block in order.

        With ``num_layers == 0`` the input is returned unchanged.
        """
        x = inputs
        for block in self.encoder_blocks:
            x = block(x)
        return x

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268

相关阅读:
LVGL_基础控件滑动条slider
使用soapUI获取webservice接口的调用格式
 leetcode刷题 (9.1) 动态规划
 Qt中各个功能模块遵循的协议
 gitLab安装文档
 PCB设计---深入浅出的反弹图
 leetcode做题笔记150. 逆波兰表达式求值
 ESP32-C3入门教程 IoT篇⑦——微软云 Microsoft Azure 物联网 IoT 中心 EspAzureIoT 实战
 网络编程TCP/UDP通信
 P1404 平均数
原文地址：https://blog.csdn.net/Jruo911/article/details/126530133