1. A few utility functions
```python
import torch
from torch import nn
import torch.nn.functional as F
import matplotlib.pyplot as plt


def box_corner_to_center(boxes):
    """Convert from (upper-left, lower-right) to (center, width, height)."""
    x1, y1, x2, y2 = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3]
    cx = (x1 + x2) / 2
    cy = (y1 + y2) / 2
    w = x2 - x1
    h = y2 - y1
    boxes = torch.stack((cx, cy, w, h), dim=-1)
    return boxes


def box_center_to_corner(boxes):
    """Convert from (center, width, height) to (upper-left, lower-right)."""
    cx, cy, w, h = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3]
    x1 = cx - 0.5 * w
    y1 = cy - 0.5 * h
    x2 = cx + 0.5 * w
    y2 = cy + 0.5 * h
    boxes = torch.stack((x1, y1, x2, y2), dim=-1)
    return boxes


def bbox_to_rect(bbox, color):
    """Convert an (x1, y1, x2, y2) bounding box into a matplotlib Rectangle."""
    return plt.Rectangle(
        xy=(bbox[0], bbox[1]), width=bbox[2] - bbox[0], height=bbox[3] - bbox[1],
        fill=False, edgecolor=color, linewidth=2)


def show_bboxes(axes, bboxes, labels=None, colors=None):
    """Show all bounding boxes on the given axes."""
    def _make_list(obj, default_values=None):
        if obj is None:
            obj = default_values
        elif not isinstance(obj, (list, tuple)):
            obj = [obj]
        return obj
    labels = _make_list(labels)
    colors = _make_list(colors, ['b', 'g', 'r', 'm', 'c'])
    for i, bbox in enumerate(bboxes):
        color = colors[i % len(colors)]
        rect = bbox_to_rect(bbox.detach().numpy(), color)
        axes.add_patch(rect)
        if labels and len(labels) > i:
            text_color = 'k' if color == 'w' else 'w'
            axes.text(rect.xy[0], rect.xy[1], labels[i], va='center',
                      ha='center', fontsize=9, color=text_color,
                      bbox=dict(facecolor=color, lw=0))
```
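A quick sanity check of these helpers (a minimal sketch, assuming the functions above are in scope; the box values and the blank canvas are made up for illustration):

```python
# Hypothetical example box (x1, y1, x2, y2), just to exercise the helpers
boxes = torch.tensor([[60.0, 45.0, 378.0, 516.0]])
# Converting corner -> center -> corner should give back the original box
assert torch.allclose(box_center_to_corner(box_corner_to_center(boxes)), boxes)

fig = plt.imshow(torch.zeros(600, 500, 3).numpy())   # dummy image as a canvas
show_bboxes(fig.axes, boxes, labels=['example'])
plt.show()
```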
2. Generating anchor boxes

```python
def multibox_prior(img, sizes, ratios):
    """Generate anchor boxes of different shapes centered on every pixel."""
    in_height, in_width = img.shape[-2:]
    device, num_sizes, num_ratios = img.device, len(sizes), len(ratios)
    boxes_per_pixel = (num_sizes + num_ratios - 1)
    size_tensor = torch.tensor(sizes, device=device)
    ratio_tensor = torch.tensor(ratios, device=device)

    # Anchor centers, normalized to [0, 1] along each axis
    center_h = (torch.arange(in_height, device=device) + 0.5) / in_height
    center_w = (torch.arange(in_width, device=device) + 0.5) / in_width
    shift_y, shift_x = torch.meshgrid(center_h, center_w, indexing='ij')
    shift_y, shift_x = shift_y.reshape(-1), shift_x.reshape(-1)

    # Widths and heights of the boxes_per_pixel anchors: the first size is paired
    # with every ratio, then the remaining sizes are paired with the first ratio
    w = torch.cat((sizes[0] * torch.sqrt(in_height * ratio_tensor[:] / in_width),
                   size_tensor[1:] * torch.sqrt(in_height * ratio_tensor[0] / in_width)))
    h = torch.cat((sizes[0] * torch.sqrt(in_width / ratio_tensor[:] / in_height),
                   size_tensor[1:] * torch.sqrt(in_width / ratio_tensor[0] / in_height)))

    # Half of (-w, -h, w, h) gives the corner offsets of each box around its center
    anchor_manipulations = torch.stack((-w, -h, w, h)).T.repeat(in_height * in_width, 1) / 2

    # Repeat every center boxes_per_pixel times, then add the corner offsets
    out_grid = torch.stack([shift_x, shift_y, shift_x, shift_y],
                           dim=1).repeat_interleave(boxes_per_pixel, dim=0)
    output = out_grid + anchor_manipulations
    # Shape: (1, in_height * in_width * boxes_per_pixel, 4)
    return output.unsqueeze(0)
```
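A quick shape check (a minimal sketch; the 256×256 input size and the size/ratio values below are just illustrative):

```python
X = torch.zeros(1, 3, 256, 256)                  # dummy image / feature map
anchors = multibox_prior(X, sizes=[0.75, 0.5, 0.25], ratios=[1, 2, 0.5])
# 3 sizes + 3 ratios - 1 = 5 anchors per pixel, 256 * 256 pixels
print(anchors.shape)                             # torch.Size([1, 327680, 4])
```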
3. Labeling anchor boxes
```python
def box_iou(boxes1, boxes2):
    """Compute pairwise IoU between two lists of anchor or bounding boxes."""
    box_area = lambda boxes: ((boxes[:, 2] - boxes[:, 0]) *
                              (boxes[:, 3] - boxes[:, 1]))
    areas1 = box_area(boxes1)
    areas2 = box_area(boxes2)
    # Broadcasting: (n1, 1, 2) against (n2, 2) -> (n1, n2, 2)
    inter_upperlefts = torch.max(boxes1[:, None, :2], boxes2[:, :2])
    inter_lowerrights = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])
    inters = (inter_lowerrights - inter_upperlefts).clamp(min=0)
    # inter_areas and union_areas have shape (n1, n2)
    inter_areas = inters[:, :, 0] * inters[:, :, 1]
    union_areas = areas1[:, None] + areas2 - inter_areas
    return inter_areas / union_areas
```
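For instance, a unit box and a shifted copy that overlap on half of the first box's area (values chosen by hand for illustration):

```python
a = torch.tensor([[0.0, 0.0, 1.0, 1.0]])
b = torch.tensor([[0.5, 0.0, 1.5, 1.0]])
# Intersection area 0.5, union area 1.5 -> IoU = 1/3
print(box_iou(a, b))   # tensor([[0.3333]])
```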
- Assign a ground-truth bounding box to each anchor, so that class labels and offsets can be computed afterwards. Only anchors whose IoU with a ground-truth box exceeds a threshold are assigned; unassigned anchors are marked with -1.
```python
def assign_anchor_to_bbox(anchors, ground_truth, device, iou_threshold=0.5):
    """Assign the closest ground-truth bounding box to each anchor."""
    num_anchors, num_gt_boxes = anchors.shape[0], ground_truth.shape[0]
    jaccard = box_iou(anchors, ground_truth)
    anchors_bbox_map = torch.full((num_anchors,), -1, dtype=torch.long, device=device)
    # First pass: assign every anchor whose best IoU exceeds the threshold
    max_ious, indices = torch.max(jaccard, dim=1)
    anc_i = torch.nonzero(max_ious >= iou_threshold).reshape(-1)
    box_j = indices[max_ious >= iou_threshold]
    anchors_bbox_map[anc_i] = box_j

    # Second pass: make sure every ground-truth box gets at least one anchor,
    # greedily taking the globally largest IoU and discarding its row and column
    col_discard = torch.full((num_anchors,), -1, device=device)
    row_discard = torch.full((num_gt_boxes,), -1, device=device)
    for _ in range(num_gt_boxes):
        max_idx = torch.argmax(jaccard)           # index into the flattened IoU matrix
        box_idx = (max_idx % num_gt_boxes).long()
        anc_idx = torch.div(max_idx, num_gt_boxes, rounding_mode='floor')
        anchors_bbox_map[anc_idx] = box_idx
        jaccard[:, box_idx] = col_discard
        jaccard[anc_idx, :] = row_discard
    return anchors_bbox_map
```
```python
def offset_boxes(anchors, assigned_bb, eps=1e-6):
    """Transform assigned ground-truth boxes into offsets relative to the anchors."""
    c_anc = box_corner_to_center(anchors)
    c_assigned_bb = box_corner_to_center(assigned_bb)
    offset_xy = 10 * (c_assigned_bb[:, :2] - c_anc[:, :2]) / c_anc[:, 2:]
    offset_wh = 5 * torch.log(eps + c_assigned_bb[:, 2:] / c_anc[:, 2:])
    offset = torch.cat([offset_xy, offset_wh], dim=1)
    return offset
```
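In formula form (the same transformation the code implements, with anchor $(x_a, y_a, w_a, h_a)$ and assigned ground-truth box $(x_b, y_b, w_b, h_b)$ in center-width-height coordinates):

$$
\left(\frac{10\,(x_b - x_a)}{w_a},\ \frac{10\,(y_b - y_a)}{h_a},\ 5\log\frac{w_b}{w_a},\ 5\log\frac{h_b}{h_a}\right)
$$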
```python
def multibox_target(anchors, labels):
    """Label anchor boxes using ground-truth bounding boxes."""
    batch_size, anchors = labels.shape[0], anchors.squeeze(0)
    batch_offset, batch_mask, batch_class_labels = [], [], []
    device, num_anchors = anchors.device, anchors.shape[0]
    for i in range(batch_size):
        label = labels[i, :, :]
        anchors_bbox_map = assign_anchor_to_bbox(anchors, label[:, 1:], device)
        # Class 0 is background; assigned anchors get the object class + 1
        class_labels = torch.zeros(num_anchors, dtype=torch.long, device=device)
        assigned_bb = torch.zeros((num_anchors, 4), dtype=torch.float32, device=device)
        indices_true = torch.nonzero(anchors_bbox_map >= 0)
        bb_idx = anchors_bbox_map[indices_true]
        class_labels[indices_true] = label[bb_idx, 0].long() + 1
        assigned_bb[indices_true] = label[bb_idx, 1:]
        # Offsets of background anchors are masked out (set to zero)
        bbox_mask = ((anchors_bbox_map >= 0).float().unsqueeze(-1)).repeat(1, 4)
        offset = offset_boxes(anchors, assigned_bb) * bbox_mask
        batch_offset.append(offset.reshape(-1))
        batch_mask.append(bbox_mask.reshape(-1))
        batch_class_labels.append(class_labels)
    bbox_offset = torch.stack(batch_offset)
    bbox_mask = torch.stack(batch_mask)
    class_labels = torch.stack(batch_class_labels)
    return (bbox_offset, bbox_mask, class_labels)
```
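A shape-level sketch of the three outputs (the ground-truth format is `(class, x1, y1, x2, y2)` with coordinates in [0, 1]; the concrete numbers below are made up):

```python
ground_truth = torch.tensor([[[0, 0.10, 0.08, 0.52, 0.92],
                              [1, 0.55, 0.20, 0.90, 0.88]]])     # (batch=1, 2 objects, 5)
anchors = multibox_prior(torch.zeros(1, 3, 8, 8),
                         sizes=[0.75, 0.5], ratios=[1, 2, 0.5])  # (1, 8*8*4, 4)
bbox_offset, bbox_mask, class_labels = multibox_target(anchors, ground_truth)
print(bbox_offset.shape, bbox_mask.shape, class_labels.shape)
# torch.Size([1, 1024]) torch.Size([1, 1024]) torch.Size([1, 256])
```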
4. Preparing the model
- First, the layers that predict anchor-box classes and offsets
Because the number of anchor boxes is very large, running a separate classifier and regressor for every anchor would be far too expensive. Instead, the channel dimension is used: at every pixel of a feature map, the output channels hold the predictions for all anchors generated at that pixel. The anchor labels come from the labeling procedure above, the loss is computed between labels and predictions, and through gradient descent the network learns that these channels are the anchor predictions (a shape check follows the code below).
```python
def cls_predictor(num_input_channels, num_anchors, num_classes):
    """Class prediction layer.

    The convolution keeps the height and width unchanged. Instead of running a
    separate classifier for every anchor at every pixel (too many parameters),
    each pixel gets num_anchors * num_classes output channels that hold the
    class scores of all its anchors.

    Args:
        num_input_channels: number of channels of the input feature map
        num_anchors: anchors generated per pixel, num_sizes + num_ratios - 1
        num_classes: number of classes, including the background class
    """
    return nn.Conv2d(num_input_channels, num_anchors * num_classes, kernel_size=3, padding=1)


def offset_predictor(num_input_channels, num_anchors):
    """Bounding-box (offset) prediction layer.

    Same idea as cls_predictor: the convolution keeps height and width, and each
    pixel gets num_anchors * 4 output channels holding the 4 offsets per anchor.

    Args:
        num_input_channels: number of channels of the input feature map
        num_anchors: anchors generated per pixel, num_sizes + num_ratios - 1
    """
    return nn.Conv2d(num_input_channels, num_anchors * 4, kernel_size=3, padding=1)
```
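A quick check that the prediction layers only change the channel dimension (the sizes here are arbitrary):

```python
Y = torch.zeros(2, 8, 20, 20)                       # (batch, channels, h, w)
cls = cls_predictor(8, num_anchors=5, num_classes=10)
off = offset_predictor(8, num_anchors=5)
print(cls(Y).shape, off(Y).shape)
# torch.Size([2, 50, 20, 20]) torch.Size([2, 20, 20, 20])
```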
- Next, the structure of the network itself, five blocks in total (a shape check follows the code below)

```python
def down_sample_blk(in_channels, out_channels):
    """Two 3x3 conv-BN-ReLU layers followed by 2x2 max pooling (halves h and w)."""
    blk = []
    for _ in range(2):
        blk.append(nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1))
        blk.append(nn.BatchNorm2d(out_channels))
        blk.append(nn.ReLU())
        in_channels = out_channels
    blk.append(nn.MaxPool2d(kernel_size=2))
    return nn.Sequential(*blk)


def base_net():
    """Base network: three down-sampling blocks, 3 -> 16 -> 32 -> 64 channels."""
    blk = []
    num_filters = [3, 16, 32, 64]
    for i in range(len(num_filters) - 1):
        blk.append(down_sample_blk(num_filters[i], num_filters[i + 1]))
    return nn.Sequential(*blk)


def get_blk(i):
    """Return the i-th of the five blocks that make up the model."""
    if i == 0:
        blk = base_net()
    elif i == 1:
        blk = down_sample_blk(64, 128)
    elif i == 4:
        blk = nn.AdaptiveMaxPool2d((1, 1))
    else:
        blk = down_sample_blk(128, 128)
    return blk
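How the spatial size shrinks through the five blocks, assuming a 256×256 input (a sanity-check sketch):

```python
x = torch.zeros(1, 3, 256, 256)
for i in range(5):
    x = get_blk(i)(x)
    print(i, x.shape)
# 0 torch.Size([1, 64, 32, 32])
# 1 torch.Size([1, 128, 16, 16])
# 2 torch.Size([1, 128, 8, 8])
# 3 torch.Size([1, 128, 4, 4])
# 4 torch.Size([1, 128, 1, 1])
```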
```python
def blk_forward(x, blk, size, ratio, cls_predictor, offset_predictor):
    """Forward pass for one block.

    Args:
        x: input feature map, (batch, channels, h, w)
        blk: the network block
        size: anchor sizes for this scale
        ratio: anchor aspect ratios for this scale
        cls_predictor: conv layer used for classification
        offset_predictor: conv layer used for offset regression

    Returns:
        the CNN feature map y, the anchors generated from y at this scale,
        and the class and offset predictions for those anchors (based on y)
    """
    y = blk(x)
    cls_preds = cls_predictor(y)
    offset_preds = offset_predictor(y)
    # mb: the utility module collecting the helper functions defined above
    anchors = mb.multibox_prior(y, sizes=size, ratios=ratio)
    return (y, anchors, cls_preds, offset_preds)
```
```python
class TinySSD(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        self.num_classes = num_classes
        idx_to_in_channels = [64, 128, 128, 128, 128]
        for i in range(5):
            # Register each block and its two prediction heads as attributes
            setattr(self, f'blk_{i}', get_blk(i))
            setattr(self, f'cls_{i}', cls_predictor(idx_to_in_channels[i], num_anchors, self.num_classes))
            setattr(self, f'offset_{i}', offset_predictor(idx_to_in_channels[i], num_anchors))

    def flatten_pred(self, pred):
        """Flatten a 4-D prediction into 2-D.

        The channel dimension is moved to the end first, so that the predictions
        belonging to one pixel end up as consecutive values.

        Args:
            pred: the predictions for one batch

        Returns:
            the flattened tensor
        """
        return torch.flatten(pred.permute(0, 2, 3, 1), start_dim=1)

    def concat_preds(self, preds):
        return torch.cat([self.flatten_pred(p) for p in preds], dim=1)

    def forward(self, x):
        anchors, cls_preds, offset_preds = [None] * 5, [None] * 5, [None] * 5
        for i in range(5):
            # sizes, ratios and num_anchors are the globals defined in the training section
            x, anchors[i], cls_preds[i], offset_preds[i] = blk_forward(
                x, getattr(self, f'blk_{i}'), sizes[i], ratios[i],
                getattr(self, f'cls_{i}'), getattr(self, f'offset_{i}'))
        anchors = torch.cat(anchors, dim=1)
        cls_preds = self.concat_preds(cls_preds)
        cls_preds = cls_preds.reshape(cls_preds.shape[0], -1, self.num_classes)
        offset_preds = self.concat_preds(offset_preds)
        return anchors, cls_preds, offset_preds
```
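Output shapes for a 256×256 input, assuming the `sizes`, `ratios` and `num_anchors` from the training section below have already been defined (4 anchors per pixel, and 32² + 16² + 8² + 4² + 1 = 1361 positions × 4 = 5444 anchors in total):

```python
# Requires the global sizes / ratios / num_anchors defined in the training section
net = TinySSD(num_classes=2)
anchors, cls_preds, offset_preds = net(torch.zeros(8, 3, 256, 256))
print(anchors.shape)       # torch.Size([1, 5444, 4])
print(cls_preds.shape)     # torch.Size([8, 5444, 2])
print(offset_preds.shape)  # torch.Size([8, 21776])
```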
```python
cls_loss = nn.CrossEntropyLoss(reduction='none')
offset_loss = nn.L1Loss(reduction='none')


def calc_loss(cls_preds, cls_labels, offset_preds, offset_labels, offset_masks):
    """Per-sample loss: class cross-entropy plus masked L1 loss on the offsets."""
    batch_size, num_classes = cls_preds.shape[0], cls_preds.shape[2]
    l1 = cls_loss(cls_preds.reshape(-1, num_classes), cls_labels.reshape(-1))
    l1 = l1.reshape(batch_size, -1).mean(dim=1)
    l2 = offset_loss(offset_preds * offset_masks, offset_labels * offset_masks).mean(dim=1)
    return l1 + l2


def cls_eval(cls_preds, cls_labels):
    """Number of correctly classified anchors (the class is in the last dimension)."""
    return float((cls_preds.argmax(dim=-1).type(cls_labels.dtype) == cls_labels).sum())


def offset_eval(offset_preds, offset_labels, offset_mask):
    """Sum of absolute offset errors, counting only assigned (non-background) anchors."""
    return float((torch.abs((offset_preds - offset_labels) * offset_mask)).sum())
```
5. Training
```python
# Anchor sizes and ratios for the five scales; these globals are used inside TinySSD,
# so they must be defined before the network is created
sizes = [[0.2, 0.272], [0.37, 0.447], [0.54, 0.619], [0.71, 0.79], [0.88, 0.961]]
ratios = [[1, 2, 0.5]] * 5
num_anchors = len(sizes[0]) + len(ratios[0]) - 1

device, net = mb.try_gpu(), TinySSD(num_classes=2)
updater = torch.optim.SGD(net.parameters(), lr=0.2, weight_decay=0)
num_epochs = 10
animator = mb.Animator(xlabel='epoch', xlim=[1, num_epochs], legend=['class error', 'offset mae'])
net = net.to(device)

for epoch in range(num_epochs):
    metric = mb.Accumulator(4)
    net.train()
    for x, y in train_iter:
        x, y = x.to(device), y.to(device)
        y = y / 256        # labels are pixel coordinates of 256x256 images; normalize to [0, 1]
        anchors, cls_preds, offset_preds = net(x)
        offset_labels, offset_mask, cls_labels = mb.multibox_target(anchors, y)
        l = calc_loss(cls_preds, cls_labels, offset_preds, offset_labels, offset_mask)
        updater.zero_grad()
        l.mean().backward()
        updater.step()
        metric.add(cls_eval(cls_preds, cls_labels), cls_labels.numel(),
                   offset_eval(offset_preds, offset_labels, offset_mask), offset_labels.numel())
    cls_err = 1 - metric[0] / metric[1]
    offset_mae = metric[2] / metric[3]
    animator.add(epoch + 1, (cls_err, offset_mae))
```
6. Prediction
- First, define the non-maximum suppression (NMS) function
Non-maximum suppression sorts all predicted boxes by confidence in descending order; any box whose IoU with an already kept, higher-confidence box exceeds a threshold is suppressed (set to background).
```python
def nms(boxes, scores, iou_threshold):
    """Sort boxes by confidence and keep only those not suppressed by a better box."""
    B = torch.argsort(scores, dim=-1, descending=True)
    keep = []                       # indices of the boxes that survive
    while B.numel() > 0:
        i = B[0]
        keep.append(i)
        if B.numel() == 1:
            break
        # IoU between the current best box and all remaining candidates
        iou = box_iou(boxes[i, :].reshape(-1, 4),
                      boxes[B[1:], :].reshape(-1, 4)).reshape(-1)
        inds = torch.nonzero(iou <= iou_threshold).reshape(-1)
        B = B[inds + 1]
    return torch.tensor(keep, dtype=torch.long, device=boxes.device)
```
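`multibox_detection` below calls `offset_inverse`, which the post does not show. A minimal sketch of it, written as the inverse of `offset_boxes` above (same 10 and 5 scaling factors):

```python
def offset_inverse(anchors, offset_preds):
    """Recover predicted bounding boxes by applying predicted offsets to anchors."""
    anc = box_corner_to_center(anchors)
    pred_bbox_xy = (offset_preds[:, :2] * anc[:, 2:] / 10) + anc[:, :2]
    pred_bbox_wh = torch.exp(offset_preds[:, 2:] / 5) * anc[:, 2:]
    pred_bbox = torch.cat((pred_bbox_xy, pred_bbox_wh), dim=1)
    return box_center_to_corner(pred_bbox)
```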
```python
def multibox_detection(cls_probs, offset_preds, anchors, nms_threshold=0.5, pos_threshold=0.009999999):
    """Predict bounding boxes using NMS: one (class_id, confidence, box) row per anchor."""
    device, batch_size = cls_probs.device, cls_probs.shape[0]
    anchors = anchors.squeeze(0)
    num_classes, num_anchors = cls_probs.shape[1], cls_probs.shape[2]
    out = []
    for i in range(batch_size):
        cls_prob, offset_pred = cls_probs[i], offset_preds[i].reshape(-1, 4)
        predicted_bb = offset_inverse(anchors, offset_pred)

        # For each anchor, the most likely class (0 = background) and its confidence
        conf, class_id = torch.max(cls_prob, dim=0)
        background_indices = torch.nonzero(class_id == 0).reshape(-1)
        non_background_indices = torch.nonzero(class_id != 0).reshape(-1)
        non_back_predictbb = predicted_bb[non_background_indices]
        non_back_conf = conf[non_background_indices]
        keep = nms(non_back_predictbb, non_back_conf, nms_threshold)

        # Boxes suppressed by NMS are reset to background
        non_keep = []
        for idx, _ in enumerate(non_back_predictbb):
            if idx not in keep:
                non_keep.append(idx)
        class_id[non_background_indices[non_keep]] = 0

        all_sorted_indices = torch.cat((non_background_indices[keep],
                                        non_background_indices[non_keep],
                                        background_indices), dim=0)
        class_id = class_id[all_sorted_indices]
        conf = conf[all_sorted_indices]
        predicted_bb = predicted_bb[all_sorted_indices]

        # Predictions below pos_threshold are also treated as background
        below_min_idx = (conf < pos_threshold)
        class_id[below_min_idx] = 0
        conf[below_min_idx] = 1 - conf[below_min_idx]

        pred_info = torch.cat((class_id.unsqueeze(1).float(), conf.unsqueeze(1), predicted_bb), dim=1)
        out.append(pred_info)
    return torch.stack(out)
```
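A tiny sanity check of `nms` with hand-made boxes (the two heavily overlapping boxes collapse to the higher-scoring one):

```python
boxes = torch.tensor([[0.0, 0.0, 1.0, 1.0],
                      [0.05, 0.05, 1.0, 1.0],    # overlaps the first box heavily
                      [2.0, 2.0, 3.0, 3.0]])     # far away, kept separately
scores = torch.tensor([0.9, 0.8, 0.7])
print(nms(boxes, scores, iou_threshold=0.5))     # tensor([0, 2])
```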
```python
# Take one image from the training iterator for a quick test
x, y = next(iter(train_iter))
x = x[0]
img = x.permute(1, 2, 0)     # (h, w, c) for plotting
x = x.unsqueeze(0)           # add the batch dimension back


def predict(X):
    net.eval()
    net.to(device)
    X = X.to(device)
    anchors, cls_preds, offset_preds = net(X)
    # Softmax over the class dimension, then move classes to dim 1 for multibox_detection
    cls_probs = F.softmax(cls_preds, dim=2).permute(0, 2, 1)
    output = mb.multibox_detection(cls_probs, offset_preds, anchors)
    # Keep only the non-background rows
    idx = [i for i, row in enumerate(output[0]) if row[0] != 0]
    return output[0, idx]


output = predict(x)


def display(img, output, threshold):
    """Draw the predicted boxes whose confidence exceeds the threshold."""
    fig = mb.plt.imshow(img)
    for row in output:
        score = float(row[1])
        if score < threshold:
            continue
        h, w = img.shape[0:2]
        # Predicted boxes are in [0, 1]; scale them back to pixel coordinates
        bbox = [row[2:6] * torch.tensor((w, h, w, h), device=row.device)]
        mb.show_bboxes(fig.axes, bbox, '%.2f' % score, 'w')


display(img, output.cpu(), threshold=0.9)
```
- Result

7. How SSD works
SSD detects objects of different sizes through multi-scale feature maps. Deeper feature maps have larger receptive fields, so they are assigned larger anchor sizes. A convolution kernel slides over every position of a feature map and produces, at that position, the predictions for all the anchors centered there. Because the receptive field at that position is large enough to contain those anchors, the convolution at that position can predict them. (This convolution is really a locally applied fully-connected layer: flattening the kernel and the receptive field turns the operation into an inner product $w^{\top}x$; a small numeric check follows below.) And since the receptive field is large, the kernel can see both the anchor and the ground-truth box, so it can predict the distance to the ground-truth box more accurately.
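A small numeric check of the claim that a convolution applied at one position is just a local fully-connected layer (a sketch; the shapes are arbitrary):

```python
conv = nn.Conv2d(1, 1, kernel_size=3, bias=False)
patch = torch.randn(1, 1, 3, 3)                  # one 3x3 receptive field
out_conv = conv(patch).reshape(())
# The same result as flattening the patch and the kernel and taking w^T x
out_fc = (conv.weight.reshape(-1) * patch.reshape(-1)).sum()
print(torch.allclose(out_conv, out_fc))          # True
```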
(The above is entirely my own understanding; please do not hesitate to point out any shortcomings.)