无论拿到什么模块的代码,想把它加入模型,都需要先将其导入到项目中,然后才能添加到模型的结构里。下面的代码包含 ODConv,以及我在其基础上创建的 ODConv2d_yolo 类(官方代码直接使用会报错,所以我做了一定的处理;想了解原因可以看我单独讲解它的博客)。我们先以它为例进行说明。
- import torch
- import torch.nn as nn
- import torch.nn.functional as F
- import torch.autograd
class Attention(nn.Module):
    """Compute the four attention signals used by omni-dimensional dynamic convolution.

    The input feature map is globally average-pooled, squeezed through a 1x1
    convolution + ReLU, and then fed to up to four 1x1 heads that produce
    channel, filter, spatial and kernel attentions. A head that is not needed
    for the given configuration is replaced by ``skip`` (which returns ``1.0``).
    """

    def __init__(self, in_planes, out_planes, kernel_size, groups=1, reduction=0.0625, kernel_num=4, min_channel=16):
        """Build the squeeze layers and whichever attention heads the configuration needs.

        Args:
            in_planes: Number of input channels.
            out_planes: Number of output channels (size of the filter attention).
            kernel_size: Side length of the (square) convolution kernel.
            groups: Convolution groups; used only to detect the depth-wise case.
            reduction: Ratio applied to ``in_planes`` to size the hidden layer.
            kernel_num: Number of candidate kernels mixed by the kernel attention.
            min_channel: Lower bound for the hidden layer width.
        """
        super().__init__()
        hidden = max(int(in_planes * reduction), min_channel)
        self.kernel_size = kernel_size
        self.kernel_num = kernel_num
        self.temperature = 1.0

        # Shared squeeze path. NOTE: module registration order is kept as-is so
        # state_dict keys and seeded initialisation stay unchanged.
        self.avgpool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Conv2d(in_planes, hidden, 1, bias=False)
        self.bn = nn.BatchNorm2d(hidden)  # registered but not applied in forward()
        self.relu = nn.ReLU(inplace=True)

        # Channel attention is always produced.
        self.channel_fc = nn.Conv2d(hidden, in_planes, 1, bias=True)
        self.func_channel = self.get_channel_attention

        # Filter attention: skipped for depth-wise convolution.
        self.func_filter = self.skip
        if in_planes != groups or in_planes != out_planes:
            self.filter_fc = nn.Conv2d(hidden, out_planes, 1, bias=True)
            self.func_filter = self.get_filter_attention

        # Spatial attention: skipped for point-wise (1x1) convolution.
        self.func_spatial = self.skip
        if kernel_size != 1:
            self.spatial_fc = nn.Conv2d(hidden, kernel_size * kernel_size, 1, bias=True)
            self.func_spatial = self.get_spatial_attention

        # Kernel attention: skipped when there is a single candidate kernel.
        self.func_kernel = self.skip
        if kernel_num != 1:
            self.kernel_fc = nn.Conv2d(hidden, kernel_num, 1, bias=True)
            self.func_kernel = self.get_kernel_attention

        self._initialize_weights()

    def _initialize_weights(self):
        """Kaiming-initialise every conv (zero bias) and reset every BatchNorm to identity."""
        for layer in self.modules():
            if isinstance(layer, nn.Conv2d):
                nn.init.kaiming_normal_(layer.weight, mode='fan_out', nonlinearity='relu')
                if layer.bias is not None:
                    nn.init.constant_(layer.bias, 0)
            elif isinstance(layer, nn.BatchNorm2d):
                nn.init.constant_(layer.weight, 1)
                nn.init.constant_(layer.bias, 0)

    def update_temperature(self, temperature):
        """Set the temperature that divides the logits of every attention head."""
        self.temperature = temperature

    @staticmethod
    def skip(_):
        """Neutral attention: multiplying by the returned ``1.0`` is a no-op."""
        return 1.0

    def get_channel_attention(self, x):
        """Return sigmoid channel attention reshaped to ``(batch, -1, 1, 1)``."""
        logits = self.channel_fc(x).view(x.size(0), -1, 1, 1)
        return torch.sigmoid(logits / self.temperature)

    def get_filter_attention(self, x):
        """Return sigmoid filter attention reshaped to ``(batch, -1, 1, 1)``."""
        logits = self.filter_fc(x).view(x.size(0), -1, 1, 1)
        return torch.sigmoid(logits / self.temperature)

    def get_spatial_attention(self, x):
        """Return sigmoid spatial attention of shape ``(batch, 1, 1, 1, k, k)``."""
        k = self.kernel_size
        logits = self.spatial_fc(x).view(x.size(0), 1, 1, 1, k, k)
        return torch.sigmoid(logits / self.temperature)

    def get_kernel_attention(self, x):
        """Return kernel attention of shape ``(batch, -1, 1, 1, 1, 1)``, softmax-normalised over dim 1."""
        logits = self.kernel_fc(x).view(x.size(0), -1, 1, 1, 1, 1)
        return F.softmax(logits / self.temperature, dim=1)

    def forward(self, x):
        """Return ``(channel, filter, spatial, kernel)`` attentions for ``x``."""
        squeezed = self.fc(self.avgpool(x))
        # self.bn is intentionally not applied: per the original author's note,
        # a BatchNorm is provided outside this module and applying it here
        # raised an error.
        squeezed = self.relu(squeezed)
        return (
            self.func_channel(squeezed),
            self.func_filter(squeezed),
            self.func_spatial(squeezed),
            self.func_kernel(squeezed),
        )
class ODConv2d(nn.Module):
    """Omni-dimensional dynamic 2D convolution.

    Holds ``kernel_num`` candidate kernels and, for every sample, mixes them
    using the attentions produced by ``Attention`` before running a single
    grouped convolution over the whole batch.
    """

    def __init__(self, in_planes, out_planes, kernel_size, stride=1, padding=1, dilation=1, groups=1,
                 reduction=0.0625, kernel_num=4):
        """Create the attention module and the bank of candidate kernels.

        Args:
            in_planes: Number of input channels.
            out_planes: Number of output channels.
            kernel_size: Kernel side length, given either as an int or as a
                sequence whose first element is used (only square kernels are
                built).
            stride: Convolution stride.
            padding: Convolution padding (defaults to 1, independent of
                ``kernel_size``).
            dilation: Convolution dilation.
            groups: Convolution groups.
            reduction: Hidden-width ratio forwarded to ``Attention``.
            kernel_num: Number of candidate kernels.
        """
        super().__init__()
        # The original code unconditionally indexed kernel_size[0], which
        # raised TypeError for a plain int. Ints are now used as-is; anything
        # else is indexed exactly as before.
        if not isinstance(kernel_size, int):
            kernel_size = kernel_size[0]
        self.in_planes = in_planes
        self.out_planes = out_planes
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.dilation = dilation
        self.groups = groups
        self.kernel_num = kernel_num
        self.attention = Attention(in_planes, out_planes, kernel_size, groups=groups,
                                   reduction=reduction, kernel_num=kernel_num)
        # Kernel bank: (kernel_num, out_planes, in_planes // groups, k, k).
        self.weight = nn.Parameter(torch.randn(kernel_num, out_planes, in_planes // groups, kernel_size, kernel_size),
                                   requires_grad=True)
        self._initialize_weights()

        # A single 1x1 kernel needs no spatial/kernel mixing, so a cheaper path is used.
        if self.kernel_size == 1 and self.kernel_num == 1:
            self._forward_impl = self._forward_impl_pw1x
        else:
            self._forward_impl = self._forward_impl_common

    def _initialize_weights(self):
        """Kaiming-initialise each candidate kernel independently."""
        for i in range(self.kernel_num):
            nn.init.kaiming_normal_(self.weight[i], mode='fan_out', nonlinearity='relu')

    def update_temperature(self, temperature):
        """Forward the attention temperature to the inner ``Attention`` module."""
        self.attention.update_temperature(temperature)

    def _forward_impl_common(self, x):
        """General path: build a per-sample kernel and convolve the batch in one grouped call."""
        # Multiplying channel attention (or filter attention) to weights and feature maps are equivalent,
        # while we observe that when using the latter method the models will run faster with less gpu memory cost.
        channel_attention, filter_attention, spatial_attention, kernel_attention = self.attention(x)
        batch_size, in_planes, height, width = x.size()
        x = x * channel_attention
        # Fold the batch into the channel axis so each sample can use its own kernel.
        x = x.reshape(1, -1, height, width)
        aggregate_weight = spatial_attention * kernel_attention * self.weight.unsqueeze(dim=0)
        # Sum over the kernel_num axis, then stack the per-sample kernels along the output axis.
        aggregate_weight = torch.sum(aggregate_weight, dim=1).view(
            [-1, self.in_planes // self.groups, self.kernel_size, self.kernel_size])
        output = F.conv2d(x, weight=aggregate_weight, bias=None, stride=self.stride, padding=self.padding,
                          dilation=self.dilation, groups=self.groups * batch_size)
        # Unfold the batch from the channel axis again.
        output = output.view(batch_size, self.out_planes, output.size(-2), output.size(-1))
        output = output * filter_attention
        return output

    def _forward_impl_pw1x(self, x):
        """Fast path for one 1x1 kernel: only channel and filter attention are applied."""
        channel_attention, filter_attention, spatial_attention, kernel_attention = self.attention(x)
        x = x * channel_attention
        output = F.conv2d(x, weight=self.weight.squeeze(dim=0), bias=None, stride=self.stride, padding=self.padding,
                          dilation=self.dilation, groups=self.groups)
        output = output * filter_attention
        return output

    def forward(self, x):
        """Apply the dynamic convolution chosen at construction time."""
        return self._forward_impl(x)
class ODConv2d_yolo(nn.Module):
    """YOLO-friendly wrapper: 1x1 ``Conv`` -> ``ODConv2d`` -> BatchNorm -> GELU.

    ``Conv`` is not defined in this snippet (presumably the ultralytics Conv
    block — confirm against the importing module); it is only used here to map
    ``in_channels`` to ``out_channels`` with a 1x1 kernel.
    """

    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, groups=1, dilation=1):
        """Build the wrapper.

        Args:
            in_channels: Number of input channels.
            out_channels: Number of output channels.
            kernel_size: Kernel side length as an int, or a sequence whose
                first element is used.
            stride: Stride of the dynamic convolution.
            groups: Groups of the dynamic convolution.
            dilation: Dilation of the dynamic convolution.
        """
        super().__init__()
        # Normalise to an int. The inner conv is always handed a tuple, so this
        # works whether or not ODConv2d itself accepts ints (the original
        # default kernel_size=1 crashed on `kernel_size[0]`).
        k = kernel_size if isinstance(kernel_size, int) else kernel_size[0]
        self.conv = Conv(in_channels, out_channels, k=1)
        # "Same" padding for odd kernels. For k=3, dilation=1 this equals the
        # previously implicit padding of 1, so existing configs are unchanged;
        # other kernel sizes/dilations no longer alter the spatial size at stride 1.
        self.dcnv3 = ODConv2d(out_channels, out_channels, kernel_size=(k, k), stride=stride,
                              padding=dilation * (k - 1) // 2, groups=groups, dilation=dilation)
        self.bn = nn.BatchNorm2d(out_channels)
        self.gelu = nn.GELU()

    def forward(self, x):
        """Run projection, dynamic convolution, normalisation and activation in sequence."""
        x = self.conv(x)
        x = self.dcnv3(x)
        x = self.gelu(self.bn(x))
        return x
修改四 、重复情况一的步骤
我们先把我们在上面"ultralytics/nn/modules/__init__.py" 文件的函数头中导入的类,在下面的地方导入进"ultralytics/nn/tasks.py"文件中,修改内容如下->
我们需要在这个文件中找到一个方法(用 def 定义的就叫方法)。因为文件代码很长,一行一行查找很麻烦,所以我们使用文件搜索功能(快捷键 Ctrl + F),弹出的搜索栏如下->
我们搜索 "parse_model",然后向下滚动,很容易就能找到下面的部分,并按红框内的内容进行修改。
class Bottleneck_ODConv(nn.Module):
    """Bottleneck whose second convolution is an ``ODConv2d_yolo`` block."""

    def __init__(self, c1, c2, shortcut=True, g=1, k=(3, 3), e=0.5):
        """Initialize the bottleneck.

        Args:
            c1: Input channels.
            c2: Output channels.
            shortcut: Whether to add the residual connection (only used when ``c1 == c2``).
            g: Groups passed to the ODConv block.
            k: Pair of kernel sizes ``(k_first, k_second)``; each entry may be
                an int or a sequence.
            e: Expansion ratio for the hidden channel count.
        """
        super().__init__()
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, c_, k[0], 1)
        # The ODConv wrapper historically required a sequence kernel size, so an
        # int entry (e.g. from the default k=(3, 3)) is expanded to (k, k) here.
        k2 = k[1] if isinstance(k[1], (tuple, list)) else (k[1], k[1])
        self.cv2 = ODConv2d_yolo(c_, c2, k2, 1, groups=g)
        self.add = shortcut and c1 == c2

    def forward(self, x):
        """Apply both convolutions and add the input back when the shortcut is enabled."""
        out = self.cv2(self.cv1(x))
        return x + out if self.add else out
class C2f_ODConv(nn.Module):
    """C2f-style CSP block (two convolutions) built from ``Bottleneck_ODConv`` units."""

    def __init__(self, c1, c2, n=1, shortcut=False, g=1, e=0.5):
        """Set up the input/output convolutions and ``n`` ODConv bottlenecks.

        Args:
            c1: Input channels.
            c2: Output channels.
            n: Number of bottlenecks.
            shortcut: Whether bottlenecks use a residual connection.
            g: Groups forwarded to each bottleneck.
            e: Expansion ratio that determines the hidden channel count.
        """
        super().__init__()
        hidden = int(c2 * e)
        self.c = hidden  # hidden channels
        self.cv1 = Conv(c1, 2 * hidden, 1, 1)
        self.cv2 = Conv((2 + n) * hidden, c2, 1)  # optional act=FReLU(c2)
        blocks = [Bottleneck_ODConv(hidden, hidden, shortcut, g, k=((3, 3), (3, 3)), e=1.0) for _ in range(n)]
        self.m = nn.ModuleList(blocks)

    def forward(self, x):
        """Split with ``chunk``, chain the bottlenecks, then fuse every branch."""
        branches = list(self.cv1(x).chunk(2, 1))
        for block in self.m:
            # Each bottleneck consumes the most recently produced branch.
            branches.append(block(branches[-1]))
        return self.cv2(torch.cat(branches, 1))

    def forward_split(self, x):
        """Same as ``forward`` but splits with ``split()`` instead of ``chunk()``."""
        branches = list(self.cv1(x).split((self.c, self.c), 1))
        for block in self.m:
            branches.append(block(branches[-1]))
        return self.cv2(torch.cat(branches, 1))
至于加入 ODConv 之后的效果如何,可以看我的其它博客,里面有详细的讲解~
- """
- Bi-Level Routing Attention.
- """
- from typing import Tuple, Optional
- import torch
- import torch.nn as nn
- import torch.nn.functional as F
- from einops import rearrange
- from torch import Tensor, LongTensor
class TopkRouting(nn.Module):
    """Top-k routing with scaled dot-product affinities.

    Args:
        qk_dim: int, feature dimension of query and key.
        topk: int, number of keys kept per query.
        qk_scale: float or None, multiplier applied to the query before the
            dot product; defaults to ``qk_dim ** -0.5`` when falsy.
        param_routing: bool, whether to pass query and key through a shared
            learnable linear layer before computing affinities.
        diff_routing: bool, whether gradients may flow through the routing
            (when False, query and key are detached first).
    """

    def __init__(self, qk_dim, topk=4, qk_scale=None, param_routing=False, diff_routing=False):
        super().__init__()
        self.topk = topk
        self.qk_dim = qk_dim
        self.scale = qk_scale or qk_dim ** -0.5
        self.diff_routing = diff_routing
        # TODO: norm layer before/after linear?
        self.emb = nn.Linear(qk_dim, qk_dim) if param_routing else nn.Identity()
        # routing activation
        self.routing_act = nn.Softmax(dim=-1)

    def forward(self, query: Tensor, key: Tensor) -> Tuple[Tensor, Tensor]:
        """Select the ``topk`` best-matching keys for every query.

        Args:
            query, key: (n, p^2, c) tensors.

        Returns:
            r_weight: (n, p^2, topk) softmax over the selected logits.
            topk_index: (n, p^2, topk) indices of the selected keys.
        """
        if not self.diff_routing:
            # Routing is treated as non-differentiable: cut the graph here.
            query, key = query.detach(), key.detach()
        query_hat, key_hat = self.emb(query), self.emb(key)  # per-window pooling -> (n, p^2, c)
        attn_logit = (query_hat * self.scale) @ key_hat.transpose(-2, -1)  # (n, p^2, p^2)
        topk_attn_logit, topk_index = torch.topk(attn_logit, k=self.topk, dim=-1)  # (n, p^2, k), (n, p^2, k)
        r_weight = self.routing_act(topk_attn_logit)  # (n, p^2, k)
        return r_weight, topk_index
class KVGather(nn.Module):
    """Gather, for every query window, the key/value tokens of its routed windows."""

    def __init__(self, mul_weight='none'):
        """Store the weighting mode: ``'none'``, ``'soft'`` or ``'hard'``."""
        super().__init__()
        assert mul_weight in ['none', 'soft', 'hard']
        self.mul_weight = mul_weight

    def forward(self, r_idx: Tensor, r_weight: Tensor, kv: Tensor):
        """Select key/value windows by routing index.

        Args:
            r_idx: (n, p^2, topk) tensor of window indices.
            r_weight: (n, p^2, topk) tensor of routing weights.
            kv: (n, p^2, w^2, c_kq+c_v) tensor.

        Returns:
            (n, p^2, topk, w^2, c_kq+c_v) tensor.

        Raises:
            NotImplementedError: if the module was built with ``mul_weight='hard'``.
        """
        n, p2, w2, c_kv = kv.size()
        topk = r_idx.size(-1)

        # FIXME: gather consumes much memory (topk times redundancy), write cuda kernel?
        # Expanded view of kv: (n, p^2, p^2, w^2, c_kv), no memory copy.
        candidates = kv.view(n, 1, p2, w2, c_kv).expand(-1, p2, -1, -1, -1)
        # Routing indices broadcast over tokens and channels: (n, p^2, k, w^2, c_kv).
        gather_index = r_idx.view(n, p2, topk, 1, 1).expand(-1, -1, -1, w2, c_kv)
        selected = torch.gather(candidates, dim=2, index=gather_index)

        mode = self.mul_weight
        if mode == 'soft':
            # Scale each routed window by its routing weight.
            selected = r_weight.view(n, p2, topk, 1, 1) * selected  # (n, p^2, k, w^2, c_kv)
        elif mode == 'hard':
            raise NotImplementedError('differentiable hard routing TBA')
        # mode == 'none': the gathered tensor is returned unchanged.
        return selected
class QKVLinear(nn.Module):
    """Single linear projection that yields the query and the fused key/value tensor."""

    def __init__(self, dim, qk_dim, bias=True):
        """Create one linear layer of width ``qk_dim`` (q) + ``qk_dim`` (k) + ``dim`` (v)."""
        super().__init__()
        self.dim = dim
        self.qk_dim = qk_dim
        self.qkv = nn.Linear(dim, 2 * qk_dim + dim, bias=bias)

    def forward(self, x):
        """Return ``(q, kv)``: the first ``qk_dim`` features and the remaining ``qk_dim + dim`` features."""
        projected = self.qkv(x)
        kv_width = self.qk_dim + self.dim
        q, kv = torch.split(projected, [self.qk_dim, kv_width], dim=-1)
        return q, kv
class BiLevelRoutingAttention(nn.Module):
    """Bi-level routing attention: coarse window-level routing followed by token-level attention.

    n_win: number of windows in one side (so the actual number of windows is n_win*n_win)
    kv_per_win: for kv_downsample_mode='ada_xxxpool' only, number of key/values per window. Similar to n_win, the actual number is kv_per_win*kv_per_win.
    topk: topk for window filtering
    param_attention: 'qkvo'-linear for q,k,v and o, 'qkv'-linear for q,k,v only (identity output projection)
    param_routing: extra linear for routing
    diff_routing: whether to set routing differentiable
    soft_routing: whether to multiply soft routing weights
    side_dwconv: kernel size of the depth-wise convolution applied to v (disabled when <= 0)
    auto_pad: whether to pad H and W up to a multiple of n_win and crop the output back
    """

    def __init__(self, dim, n_win=7, num_heads=8, qk_dim=None, qk_scale=None,
                 kv_per_win=4, kv_downsample_ratio=4, kv_downsample_kernel=None, kv_downsample_mode='identity',
                 topk=4, param_attention="qkvo", param_routing=False, diff_routing=False, soft_routing=False,
                 side_dwconv=3,
                 auto_pad=True):
        """Build the router, gather, projection and down-sampling sub-modules.

        Raises:
            ValueError: for an unknown ``param_attention`` or ``kv_downsample_mode``.
            NotImplementedError: for the 'fracpool' and 'conv' down-sampling modes.
        """
        super().__init__()
        # local attention setting
        self.dim = dim
        self.n_win = n_win  # Wh, Ww
        self.num_heads = num_heads
        self.qk_dim = qk_dim or dim
        assert self.qk_dim % num_heads == 0 and self.dim % num_heads == 0, 'qk_dim and dim must be divisible by num_heads!'
        self.scale = qk_scale or self.qk_dim ** -0.5

        ################side_dwconv (i.e. LCE in ShuntedTransformer)###########
        # NOTE(review): the lambda fallback is a local function stored on the
        # module; this may prevent pickling the whole model when side_dwconv <= 0 —
        # confirm before relying on torch.save(model).
        self.lepe = nn.Conv2d(dim, dim, kernel_size=side_dwconv, stride=1, padding=side_dwconv // 2,
                              groups=dim) if side_dwconv > 0 else \
            lambda x: torch.zeros_like(x)

        ################ global routing setting #################
        self.topk = topk
        self.param_routing = param_routing
        self.diff_routing = diff_routing
        self.soft_routing = soft_routing
        # router
        assert not (self.param_routing and not self.diff_routing)  # cannot be with_param=True and diff_routing=False
        self.router = TopkRouting(qk_dim=self.qk_dim,
                                  qk_scale=self.scale,
                                  topk=self.topk,
                                  diff_routing=self.diff_routing,
                                  param_routing=self.param_routing)
        if self.soft_routing:  # soft routing, always differentiable (if no detach)
            mul_weight = 'soft'
        elif self.diff_routing:  # hard differentiable routing
            mul_weight = 'hard'
        else:  # hard non-differentiable routing
            mul_weight = 'none'
        self.kv_gather = KVGather(mul_weight=mul_weight)

        # qkv mapping (shared by both global routing and local attention)
        self.param_attention = param_attention
        if self.param_attention == 'qkvo':
            self.qkv = QKVLinear(self.dim, self.qk_dim)
            self.wo = nn.Linear(dim, dim)
        elif self.param_attention == 'qkv':
            self.qkv = QKVLinear(self.dim, self.qk_dim)
            self.wo = nn.Identity()
        else:
            raise ValueError(f'param_attention mode {self.param_attention} is not surpported!')

        self.kv_downsample_mode = kv_downsample_mode
        self.kv_per_win = kv_per_win
        self.kv_downsample_ratio = kv_downsample_ratio
        # Attribute name is misspelled (sic); kept unchanged so existing readers of it keep working.
        self.kv_downsample_kenel = kv_downsample_kernel
        if self.kv_downsample_mode == 'ada_avgpool':
            assert self.kv_per_win is not None
            self.kv_down = nn.AdaptiveAvgPool2d(self.kv_per_win)
        elif self.kv_downsample_mode == 'ada_maxpool':
            assert self.kv_per_win is not None
            self.kv_down = nn.AdaptiveMaxPool2d(self.kv_per_win)
        elif self.kv_downsample_mode == 'maxpool':
            assert self.kv_downsample_ratio is not None
            self.kv_down = nn.MaxPool2d(self.kv_downsample_ratio) if self.kv_downsample_ratio > 1 else nn.Identity()
        elif self.kv_downsample_mode == 'avgpool':
            assert self.kv_downsample_ratio is not None
            self.kv_down = nn.AvgPool2d(self.kv_downsample_ratio) if self.kv_downsample_ratio > 1 else nn.Identity()
        elif self.kv_downsample_mode == 'identity':  # no kv downsampling
            self.kv_down = nn.Identity()
        elif self.kv_downsample_mode == 'fracpool':
            # TODO: fracpool
            # 1. kernel size should be input size dependent
            # 2. there is a random factor, need to avoid independent sampling for k and v
            raise NotImplementedError('fracpool policy is not implemented yet!')
        elif self.kv_downsample_mode == 'conv':
            # TODO: need to consider the case where k != v so that need two downsample modules
            raise NotImplementedError('conv policy is not implemented yet!')
        else:
            # Fixed: the original read the non-existent attribute `kv_downsaple_mode`,
            # which raised AttributeError instead of this ValueError.
            raise ValueError(f'kv_down_sample_mode {self.kv_downsample_mode} is not surpported!')

        # softmax for local attention
        self.attn_act = nn.Softmax(dim=-1)

        self.auto_pad = auto_pad

    def forward(self, x, ret_attn_mask=False):
        """Apply bi-level routing attention.

        Args:
            x: NCHW tensor (it is rearranged to NHWC internally).
            ret_attn_mask: when True, also return the routing weights, routing
                indices and attention weights.

        Returns:
            An NCHW tensor when ``ret_attn_mask`` is False. Otherwise the tuple
            ``(out, r_weight, r_idx, attn_weight)``.
            NOTE(review): in the tuple case ``out`` is returned in NHWC layout
            (it is not rearranged back) — confirm callers expect that.
        """
        x = rearrange(x, "n c h w -> n h w c")
        # NOTE: use padding for semantic segmentation
        ###################################################
        if self.auto_pad:
            _, H_in, W_in, _ = x.size()

            # Pad only on the right/bottom so H and W become multiples of n_win.
            pad_l = pad_t = 0
            pad_r = (self.n_win - W_in % self.n_win) % self.n_win
            pad_b = (self.n_win - H_in % self.n_win) % self.n_win
            x = F.pad(x, (0, 0,  # dim=-1
                          pad_l, pad_r,  # dim=-2
                          pad_t, pad_b))  # dim=-3
            _, H, W, _ = x.size()  # padded size
        else:
            _, H, W, _ = x.size()
            assert H % self.n_win == 0 and W % self.n_win == 0  #
        ###################################################

        # patchify, (n, p^2, w, w, c), keep 2d window as we need 2d pooling to reduce kv size
        x = rearrange(x, "n (j h) (i w) c -> n (j i) h w c", j=self.n_win, i=self.n_win)

        #################qkv projection###################
        # q: (n, p^2, w, w, c_qk)
        # kv: (n, p^2, w, w, c_qk+c_v)
        # NOTE: separate kv if there were memory leak issue caused by gather
        q, kv = self.qkv(x)

        # pixel-wise qkv
        # q_pix: (n, p^2, w^2, c_qk)
        # kv_pix: (n, p^2, h_kv*w_kv, c_qk+c_v)
        q_pix = rearrange(q, 'n p2 h w c -> n p2 (h w) c')
        kv_pix = self.kv_down(rearrange(kv, 'n p2 h w c -> (n p2) c h w'))
        kv_pix = rearrange(kv_pix, '(n j i) c h w -> n (j i) (h w) c', j=self.n_win, i=self.n_win)

        # window-wise qk, (n, p^2, c_qk), (n, p^2, c_qk)
        q_win, k_win = q.mean([2, 3]), kv[..., 0:self.qk_dim].mean([2, 3])

        ##################side_dwconv(lepe)##################
        # NOTE: call contiguous to avoid gradient warning when using ddp
        lepe = self.lepe(rearrange(kv[..., self.qk_dim:], 'n (j i) h w c -> n c (j h) (i w)', j=self.n_win,
                                   i=self.n_win).contiguous())
        lepe = rearrange(lepe, 'n c (j h) (i w) -> n (j h) (i w) c', j=self.n_win, i=self.n_win)

        ############ gather q dependent k/v #################
        r_weight, r_idx = self.router(q_win, k_win)  # both are (n, p^2, topk) tensors

        kv_pix_sel = self.kv_gather(r_idx=r_idx, r_weight=r_weight, kv=kv_pix)  # (n, p^2, topk, h_kv*w_kv, c_qk+c_v)
        k_pix_sel, v_pix_sel = kv_pix_sel.split([self.qk_dim, self.dim], dim=-1)
        # k_pix_sel: (n, p^2, topk, h_kv*w_kv, c_qk)
        # v_pix_sel: (n, p^2, topk, h_kv*w_kv, c_v)

        ######### do attention as normal ####################
        # Keys are laid out pre-transposed: (n*p^2, m, c_kq//m, topk*h_kv*w_kv).
        k_pix_sel = rearrange(k_pix_sel, 'n p2 k w2 (m c) -> (n p2) m c (k w2)',
                              m=self.num_heads)
        # flatten to BMLC, (n*p^2, m, topk*h_kv*w_kv, c_v//m)
        v_pix_sel = rearrange(v_pix_sel, 'n p2 k w2 (m c) -> (n p2) m (k w2) c',
                              m=self.num_heads)
        # to BMLC tensor (n*p^2, m, w^2, c_qk//m)
        q_pix = rearrange(q_pix, 'n p2 w2 (m c) -> (n p2) m w2 c',
                          m=self.num_heads)

        # param-free multihead attention
        # (n*p^2, m, w^2, c) @ (n*p^2, m, c, topk*h_kv*w_kv) -> (n*p^2, m, w^2, topk*h_kv*w_kv)
        attn_weight = (q_pix * self.scale) @ k_pix_sel
        attn_weight = self.attn_act(attn_weight)
        # (n*p^2, m, w^2, topk*h_kv*w_kv) @ (n*p^2, m, topk*h_kv*w_kv, c) -> (n*p^2, m, w^2, c)
        out = attn_weight @ v_pix_sel
        out = rearrange(out, '(n j i) m (h w) c -> n (j h) (i w) (m c)', j=self.n_win, i=self.n_win,
                        h=H // self.n_win, w=W // self.n_win)

        out = out + lepe
        # output linear
        out = self.wo(out)

        # NOTE: use padding for semantic segmentation
        # crop padded region
        if self.auto_pad and (pad_r > 0 or pad_b > 0):
            out = out[:, :H_in, :W_in, :].contiguous()

        if ret_attn_mask:
            return out, r_weight, r_idx, attn_weight
        else:
            return rearrange(out, "n h w c -> n c h w")
from ultralytics.nn.modules.Biformer import BiLevelRoutingAttention as Biformer
当然如果你不想用快捷键也可以自己寻找大概在 650行左右,有一个方法的名字叫"parse_model"
我们可以在某一层中添加Biformer注意力机制,具体添加到哪里由你自己决定,我这里建议添加到 Neck层,也就是我们的特征融合层,添加之后的效果如下,这里我在三个地方添加了Biformer注意力机制。