Compared with heavyweight networks, lightweight networks are characterized by fewer parameters, lower computational cost, and shorter inference time. They are better suited to scenarios with limited storage and power budgets, such as mobile and embedded devices and other edge-computing hardware. Lightweight networks have therefore attracted wide attention, and MobileNet is arguably the most prominent family among them. MobileNetV3 builds on the accumulated experience of V1 and V2 and performs well in both accuracy and speed: its architecture parameters were obtained via NAS (network architecture search), it inherits practical components from V1 and V2, and it introduces the SE channel-attention mechanism, making it something of a culmination of the series. This post focuses on practical use and dissects the MobileNetV3 network structure together with the code.
The figure above shows the MobileNetV3 network structure. The large and small variants share the same overall layout; they differ only in the number of bneck units and their internal parameters, mainly the channel counts (small on the left, large on the right).
The table above lists the concrete parameter settings, where bneck is the basic building block of the network. SE indicates whether the channel-attention (squeeze-and-excitation) mechanism is used. NL denotes the type of nonlinearity, either HS (h-swish) or RE (ReLU). NBN means no BN (batch normalization) is applied. s is the stride; the network downsamples with strided convolutions rather than pooling.
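Each row of the table maps onto one bneck configuration object in the code that follows. For example, the large-model row with a 3x3 kernel, expansion size 64, 24 output channels, no SE, ReLU activation, and stride 2 becomes the following call (a sketch; bneck_conf and InvertedResidualConfig are defined further down):

# input_c, kernel, expanded_c, out_c, use_se, activation, stride
bneck_conf(16, 3, 64, 24, False, "RE", 2)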
The modified code is given below.
from typing import Callable, List, Optional

import torch
from torch import nn, Tensor
from torch.nn import functional as F
from functools import partial


def _make_divisible(ch, divisor=8, min_ch=None):
    """
    This function is taken from the original tf repo.
    It ensures that all layers have a channel number that is divisible by 8
    It can be seen here:
    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
    """
    if min_ch is None:
        min_ch = divisor
    new_ch = max(min_ch, int(ch + divisor / 2) // divisor * divisor)
    # Make sure that round down does not go down by more than 10%.
    if new_ch < 0.9 * ch:
        new_ch += divisor
    return new_ch

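A quick sanity check of the rounding behaviour (a minimal sketch using the helper above): channel counts are rounded to the nearest multiple of 8, and the result is never allowed to fall more than 10% below the requested value.

print(_make_divisible(37))          # 40: 37 rounds to the nearest multiple of 8
print(_make_divisible(112 * 0.75))  # 88: 84 rounds (ties upward) to 88
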
class ConvBNActivation(nn.Sequential):
    def __init__(self,
                 in_planes: int,
                 out_planes: int,
                 kernel_size: int = 3,
                 stride: int = 1,
                 groups: int = 1,
                 norm_layer: Optional[Callable[..., nn.Module]] = None,
                 activation_layer: Optional[Callable[..., nn.Module]] = None):
        padding = (kernel_size - 1) // 2
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        if activation_layer is None:
            activation_layer = nn.ReLU6
        super(ConvBNActivation, self).__init__(nn.Conv2d(in_channels=in_planes,
                                                         out_channels=out_planes,
                                                         kernel_size=kernel_size,
                                                         stride=stride,
                                                         padding=padding,
                                                         groups=groups,
                                                         bias=False),
                                               norm_layer(out_planes),
                                               activation_layer(inplace=True))

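Both the pointwise (1x1) and depthwise convolutions in the network are expressed through this single Conv-BN-activation helper; the depthwise case simply sets groups equal to the number of channels. A minimal sketch:

pw = ConvBNActivation(16, 64, kernel_size=1)             # 1x1 pointwise conv + BN + ReLU6
dw = ConvBNActivation(64, 64, kernel_size=3, groups=64)  # 3x3 depthwise conv + BN + ReLU6
x = torch.randn(1, 16, 56, 56)
print(dw(pw(x)).shape)  # torch.Size([1, 64, 56, 56])
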
class SqueezeExcitation(nn.Module):
    def __init__(self, input_c: int, squeeze_factor: int = 4):
        super(SqueezeExcitation, self).__init__()
        squeeze_c = _make_divisible(input_c // squeeze_factor, 8)
        self.fc1 = nn.Conv2d(input_c, squeeze_c, 1)
        self.fc2 = nn.Conv2d(squeeze_c, input_c, 1)

    def forward(self, x: Tensor) -> Tensor:
        scale = F.adaptive_avg_pool2d(x, output_size=(1, 1))
        scale = self.fc1(scale)
        scale = F.relu(scale, inplace=True)
        scale = self.fc2(scale)
        scale = F.hardsigmoid(scale, inplace=True)
        return scale * x

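The SE block pools each channel down to a single value, passes it through the two 1x1 convolutions (acting as fully connected layers), and uses the hard-sigmoid output to rescale every channel of the input. A minimal shape check:

se = SqueezeExcitation(input_c=64)  # internally squeezes 64 channels down to 16
x = torch.randn(1, 64, 28, 28)
print(se(x).shape)  # torch.Size([1, 64, 28, 28]); each channel scaled by a weight in (0, 1)
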
class InvertedResidualConfig:
    def __init__(self,
                 input_c: int,
                 kernel: int,
                 expanded_c: int,
                 out_c: int,
                 use_se: bool,
                 activation: str,
                 stride: int,
                 width_multi: float):
        self.input_c = self.adjust_channels(input_c, width_multi)
        self.kernel = kernel
        self.expanded_c = self.adjust_channels(expanded_c, width_multi)
        self.out_c = self.adjust_channels(out_c, width_multi)
        self.use_se = use_se
        self.use_hs = activation == "HS"  # whether using h-swish activation
        self.stride = stride

    @staticmethod
    def adjust_channels(channels: int, width_multi: float):
        return _make_divisible(channels * width_multi, 8)


class InvertedResidual(nn.Module):
    def __init__(self,
                 cnf: InvertedResidualConfig,
                 norm_layer: Callable[..., nn.Module]):
        super(InvertedResidual, self).__init__()

        if cnf.stride not in [1, 2]:
            raise ValueError("illegal stride value.")

        self.use_res_connect = (cnf.stride == 1 and cnf.input_c == cnf.out_c)

        layers: List[nn.Module] = []
        activation_layer = nn.Hardswish if cnf.use_hs else nn.ReLU

        # expand
        if cnf.expanded_c != cnf.input_c:
            layers.append(ConvBNActivation(cnf.input_c,
                                           cnf.expanded_c,
                                           kernel_size=1,
                                           norm_layer=norm_layer,
                                           activation_layer=activation_layer))

        # depthwise
        layers.append(ConvBNActivation(cnf.expanded_c,
                                       cnf.expanded_c,
                                       kernel_size=cnf.kernel,
                                       stride=cnf.stride,
                                       groups=cnf.expanded_c,
                                       norm_layer=norm_layer,
                                       activation_layer=activation_layer))

        if cnf.use_se:
            layers.append(SqueezeExcitation(cnf.expanded_c))

        # project
        layers.append(ConvBNActivation(cnf.expanded_c,
                                       cnf.out_c,
                                       kernel_size=1,
                                       norm_layer=norm_layer,
                                       activation_layer=nn.Identity))

        self.block = nn.Sequential(*layers)
        self.out_channels = cnf.out_c
        self.is_strided = cnf.stride > 1

    def forward(self, x: Tensor) -> Tensor:
        result = self.block(x)
        if self.use_res_connect:
            result += x

        return result

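A single bneck block can be built directly from one configuration entry. With stride 1 and matching input/output channels, the residual shortcut is active (a minimal sketch):

cnf = InvertedResidualConfig(input_c=40, kernel=5, expanded_c=120, out_c=40,
                             use_se=True, activation="RE", stride=1, width_multi=1.0)
block = InvertedResidual(cnf, norm_layer=nn.BatchNorm2d)
x = torch.randn(1, 40, 28, 28)
print(block(x).shape)         # torch.Size([1, 40, 28, 28])
print(block.use_res_connect)  # True: stride 1 and input_c == out_c
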
class MobileNetV3(nn.Module):
    def __init__(self,
                 inverted_residual_setting: List[InvertedResidualConfig],
                 last_channel: int,
                 num_classes: int = 1000,
                 block: Optional[Callable[..., nn.Module]] = None,
                 norm_layer: Optional[Callable[..., nn.Module]] = None):
        super(MobileNetV3, self).__init__()

        if not inverted_residual_setting:
            raise ValueError("The inverted_residual_setting should not be empty.")
        elif not (isinstance(inverted_residual_setting, List) and
                  all([isinstance(s, InvertedResidualConfig) for s in inverted_residual_setting])):
            raise TypeError("The inverted_residual_setting should be List[InvertedResidualConfig]")

        if block is None:
            block = InvertedResidual

        if norm_layer is None:
            norm_layer = partial(nn.BatchNorm2d, eps=0.001, momentum=0.01)

        layers: List[nn.Module] = []

        # building first layer
        firstconv_output_c = inverted_residual_setting[0].input_c
        layers.append(ConvBNActivation(3,
                                       firstconv_output_c,
                                       kernel_size=3,
                                       stride=2,
                                       norm_layer=norm_layer,
                                       activation_layer=nn.Hardswish))
        # building inverted residual blocks
        for cnf in inverted_residual_setting:
            layers.append(block(cnf, norm_layer))

        # building last several layers
        lastconv_input_c = inverted_residual_setting[-1].out_c
        lastconv_output_c = 6 * lastconv_input_c
        layers.append(ConvBNActivation(lastconv_input_c,
                                       lastconv_output_c,
                                       kernel_size=1,
                                       norm_layer=norm_layer,
                                       activation_layer=nn.Hardswish))
        self.features = nn.Sequential(*layers)
        self.avgpool = nn.AdaptiveAvgPool2d(1)
        self.classifier = nn.Sequential(nn.Linear(lastconv_output_c, last_channel),
                                        nn.Hardswish(inplace=True),
                                        nn.Dropout(p=0.2, inplace=True),
                                        nn.Linear(last_channel, num_classes))

        # initial weights
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode="fan_out")
                if m.bias is not None:
                    nn.init.zeros_(m.bias)
            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.ones_(m.weight)
                nn.init.zeros_(m.bias)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.zeros_(m.bias)

    def _forward_impl(self, x: Tensor) -> Tensor:
        x = self.features(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.classifier(x)

        return x

    def forward(self, x: Tensor) -> Tensor:
        return self._forward_impl(x)

def mobilenet_v3_large(num_classes: int = 1000,
                       reduced_tail: bool = False) -> MobileNetV3:
    """
    Constructs a large MobileNetV3 architecture from
    "Searching for MobileNetV3".

    weights_link:
    https://download.pytorch.org/models/mobilenet_v3_large-8738ca79.pth

    Args:
        num_classes (int): number of classes
        reduced_tail (bool): If True, reduces the channel counts of all feature layers
            between C4 and C5 by 2. It is used to reduce the channel redundancy in the
            backbone for Detection and Segmentation.
    """
    width_multi = 1.0
    bneck_conf = partial(InvertedResidualConfig, width_multi=width_multi)
    adjust_channels = partial(InvertedResidualConfig.adjust_channels, width_multi=width_multi)

    reduce_divider = 2 if reduced_tail else 1

    inverted_residual_setting = [
        # input_c, kernel, expanded_c, out_c, use_se, activation, stride
        bneck_conf(16, 3, 16, 16, False, "RE", 1),
        bneck_conf(16, 3, 64, 24, False, "RE", 2),  # C1
        bneck_conf(24, 3, 72, 24, False, "RE", 1),
        bneck_conf(24, 5, 72, 40, True, "RE", 2),  # C2
        bneck_conf(40, 5, 120, 40, True, "RE", 1),
        bneck_conf(40, 5, 120, 40, True, "RE", 1),
        bneck_conf(40, 3, 240, 80, False, "HS", 2),  # C3
        bneck_conf(80, 3, 200, 80, False, "HS", 1),
        bneck_conf(80, 3, 184, 80, False, "HS", 1),
        bneck_conf(80, 3, 184, 80, False, "HS", 1),
        bneck_conf(80, 3, 480, 112, True, "HS", 1),
        bneck_conf(112, 3, 672, 112, True, "HS", 1),
        bneck_conf(112, 5, 672, 160 // reduce_divider, True, "HS", 2),  # C4
        bneck_conf(160 // reduce_divider, 5, 960 // reduce_divider, 160 // reduce_divider, True, "HS", 1),
        bneck_conf(160 // reduce_divider, 5, 960 // reduce_divider, 160 // reduce_divider, True, "HS", 1),
    ]
    last_channel = adjust_channels(1280 // reduce_divider)  # C5

    return MobileNetV3(inverted_residual_setting=inverted_residual_setting,
                       last_channel=last_channel,
                       num_classes=num_classes)

def mobilenet_v3_small(num_classes: int = 1000,
                       reduced_tail: bool = False) -> MobileNetV3:
    """
    Constructs a small MobileNetV3 architecture from
    "Searching for MobileNetV3".

    weights_link:
    https://download.pytorch.org/models/mobilenet_v3_small-047dcff4.pth

    Args:
        num_classes (int): number of classes
        reduced_tail (bool): If True, reduces the channel counts of all feature layers
            between C4 and C5 by 2. It is used to reduce the channel redundancy in the
            backbone for Detection and Segmentation.
    """
    width_multi = 1.0
    bneck_conf = partial(InvertedResidualConfig, width_multi=width_multi)
    adjust_channels = partial(InvertedResidualConfig.adjust_channels, width_multi=width_multi)

    reduce_divider = 2 if reduced_tail else 1

    inverted_residual_setting = [
        # input_c, kernel, expanded_c, out_c, use_se, activation, stride
        bneck_conf(16, 3, 16, 16, True, "RE", 2),  # C1
        bneck_conf(16, 3, 72, 24, False, "RE", 2),  # C2
        bneck_conf(24, 3, 88, 24, False, "RE", 1),
        bneck_conf(24, 5, 96, 40, True, "HS", 2),  # C3
        bneck_conf(40, 5, 240, 40, True, "HS", 1),
        bneck_conf(40, 5, 240, 40, True, "HS", 1),
        bneck_conf(40, 5, 120, 48, True, "HS", 1),
        bneck_conf(48, 5, 144, 48, True, "HS", 1),
        bneck_conf(48, 5, 288, 96 // reduce_divider, True, "HS", 2),  # C4
        bneck_conf(96 // reduce_divider, 5, 576 // reduce_divider, 96 // reduce_divider, True, "HS", 1),
        bneck_conf(96 // reduce_divider, 5, 576 // reduce_divider, 96 // reduce_divider, True, "HS", 1)
    ]
    last_channel = adjust_channels(1024 // reduce_divider)  # C5

    return MobileNetV3(inverted_residual_setting=inverted_residual_setting,
                       last_channel=last_channel,
                       num_classes=num_classes)
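
Finally, a quick usage sketch: build either variant, run a forward pass on a dummy batch, and count parameters.

if __name__ == "__main__":
    large = mobilenet_v3_large(num_classes=1000)
    x = torch.randn(1, 3, 224, 224)
    print(large(x).shape)  # torch.Size([1, 1000])

    small = mobilenet_v3_small(num_classes=5)
    print(small(x).shape)  # torch.Size([1, 5])
    print(sum(p.numel() for p in small.parameters()))  # total parameter count of the small variant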