Contents:
2. Backbone – feature extraction
2.1 Voxelization
2.2 Voxel encoding of the point cloud: PillarFeatureNet (PFN)
2.3 Intermediate encoding: PointPillarsScatter
2.4 Backbone: SECOND
3. Neck
4. Head and loss
4.1 CenterHead
4.2 Loss
The main implementation class is Voxelization, which converts the raw point cloud into a voxel representation:
def forward(ctx,
            points,
            voxel_size,
            coors_range,
            max_points=35,
            max_voxels=20000,
            deterministic=True):
    """Convert KITTI-style points (N, >=3) to voxels."""
    if max_points == -1 or max_voxels == -1:
        # Dynamic voxelization: only compute each point's voxel coordinate
        coors = points.new_zeros(size=(points.size(0), 3), dtype=torch.int)
        dynamic_voxelize(points, coors, voxel_size, coors_range, 3)
        return coors
    else:
        # Hard voxelization: pre-allocate dense output buffers
        voxels = points.new_zeros(
            size=(max_voxels, max_points, points.size(1)))  # e.g. (30000, 20, 5)
        coors = points.new_zeros(size=(max_voxels, 3), dtype=torch.int)  # e.g. (30000, 3)
        num_points_per_voxel = points.new_zeros(
            size=(max_voxels, ), dtype=torch.int)
        voxel_num = hard_voxelize(points, voxels, coors,
                                  num_points_per_voxel, voxel_size,
                                  coors_range, max_points, max_voxels, 3,
                                  deterministic)  # CUDA voxelization op; e.g. 29249 voxels
        # Select the valid voxels, i.e. drop the unused (empty) slots
        voxels_out = voxels[:voxel_num]
        coors_out = coors[:voxel_num]
        num_points_per_voxel_out = num_points_per_voxel[:voxel_num]  # points per voxel
        return voxels_out, coors_out, num_points_per_voxel_out
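To make the coordinate math concrete, here is a minimal pure-PyTorch sketch of what the dynamic_voxelize op computes. The real op is a CUDA kernel; `dynamic_voxelize_ref` and everything inside it are illustrative names, not mmdet3d API:

```python
import torch

def dynamic_voxelize_ref(points, voxel_size, coors_range):
    """Illustrative reference: map each point to an integer voxel coordinate."""
    vs = torch.tensor(voxel_size, dtype=points.dtype)            # (vx, vy, vz)
    pc_min = torch.tensor(coors_range[:3], dtype=points.dtype)   # (x_min, y_min, z_min)
    pc_max = torch.tensor(coors_range[3:], dtype=points.dtype)   # (x_max, y_max, z_max)
    grid = torch.round((pc_max - pc_min) / vs).int()             # grid resolution
    coors = torch.floor((points[:, :3] - pc_min) / vs).int()     # (N, 3) as (x, y, z)
    # Points outside the range get the sentinel coordinate -1
    out = ((coors < 0) | (coors >= grid)).any(dim=1)
    coors[out] = -1
    return coors[:, [2, 1, 0]]  # reorder to (z, y, x)

# Usage with a nuScenes-style pillar grid (0.2 m x 0.2 m x 8 m voxels):
pts = torch.rand(1000, 5) * 100 - 50
coors = dynamic_voxelize_ref(pts, [0.2, 0.2, 8.0], [-50, -50, -5, 50, 50, 3])
print(coors.shape)  # torch.Size([1000, 3])
```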
PillarFeatureNet encodes the voxelized point cloud and builds a dense feature tensor. Each point in a pillar from the previous step is decorated into a 10-dimensional vector D = (x, y, z, r, Δt, xc, yc, zc, xp, yp), where x, y, z, r, Δt are the point's three coordinates, its reflectance, and its timestamp offset when multiple sweeps are aggregated; xc, yc, zc are the offsets to the arithmetic mean (cluster center) of all points in the pillar; and xp, yp are the offsets to the pillar's x, y center. This yields a dense (P, N, D) tensor. It is then passed through stacked PFNLayers (Linear + BatchNorm + ReLU + max pooling), producing (P, N, C), where N is the number of points per pillar and C is the channel count; finally, max pooling over each pillar's points gives a (P, C) tensor.
def forward(self, features, num_points, coors):
    """Forward function."""
    features_ls = [features]
    # Offsets of x, y, z from the cluster center (mean of points in each pillar)
    if self._with_cluster_center:
        points_mean = features[:, :, :3].sum(
            dim=1, keepdim=True) / num_points.type_as(features).view(
                -1, 1, 1)
        f_cluster = features[:, :, :3] - points_mean
        features_ls.append(f_cluster)
    # Offsets of x, y from the geometric pillar center
    dtype = features.dtype
    if self._with_voxel_center:
        if not self.legacy:
            f_center = torch.zeros_like(features[:, :, :2])
            f_center[:, :, 0] = features[:, :, 0] - (
                coors[:, 3].to(dtype).unsqueeze(1) * self.vx +
                self.x_offset)
            f_center[:, :, 1] = features[:, :, 1] - (
                coors[:, 2].to(dtype).unsqueeze(1) * self.vy +
                self.y_offset)
        else:
            f_center = features[:, :, :2]
            f_center[:, :, 0] = f_center[:, :, 0] - (
                coors[:, 3].type_as(features).unsqueeze(1) * self.vx +
                self.x_offset)
            f_center[:, :, 1] = f_center[:, :, 1] - (
                coors[:, 2].type_as(features).unsqueeze(1) * self.vy +
                self.y_offset)
        features_ls.append(f_center)
    # Euclidean distance of each point to the origin
    if self._with_distance:
        points_dist = torch.norm(features[:, :, :3], 2, 2, keepdim=True)
        features_ls.append(points_dist)
    # Combine the feature decorations
    features = torch.cat(features_ls, dim=-1)
    # The feature decorations were calculated without regard to whether a
    # pillar was empty; ensure that empty pillars remain set to zeros.
    voxel_count = features.shape[1]
    mask = get_paddings_indicator(num_points, voxel_count, axis=0)
    mask = torch.unsqueeze(mask, -1).type_as(features)
    features *= mask
    for pfn in self.pfn_layers:
        features = pfn(features, num_points)
    return features.squeeze()  # [P, C], e.g. (27059, 64)
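The PFNLayer itself is not shown above. Here is a minimal sketch of such a layer; `SimplePFNLayer` is an illustrative name, and the real mmdet3d PFNLayer additionally supports concatenating the pooled feature back onto each point when layers are stacked:

```python
import torch
from torch import nn

class SimplePFNLayer(nn.Module):
    """Illustrative PFNLayer: Linear -> BatchNorm -> ReLU -> max over points."""

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.linear = nn.Linear(in_channels, out_channels, bias=False)
        self.norm = nn.BatchNorm1d(out_channels, eps=1e-3, momentum=0.01)

    def forward(self, features):
        # features: (P, N, D) = pillars x points-per-pillar x channels
        x = self.linear(features)                           # (P, N, C)
        x = self.norm(x.permute(0, 2, 1)).permute(0, 2, 1)  # BN over the channel dim
        x = torch.relu(x)
        return torch.max(x, dim=1, keepdim=True)[0]         # (P, 1, C)

# (P, N, D) = (27059, 20, 10) -> (P, 1, 64); the final squeeze yields (P, C)
pfn = SimplePFNLayer(10, 64)
print(pfn(torch.rand(27059, 20, 10)).squeeze(1).shape)  # torch.Size([27059, 64])
```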
PointPillarsScatter's job is to scatter the learned dense features [C, P] back into a pseudo-image [C, H, W]:
def forward_batch(self, voxel_features, coors, batch_size):
    """Scatter features for a batch of samples."""
    # batch_canvas will be the final output.
    batch_canvas = []
    for batch_itt in range(batch_size):
        # Create the canvas for this sample
        canvas = torch.zeros(
            self.in_channels,
            self.nx * self.ny,
            dtype=voxel_features.dtype,
            device=voxel_features.device)
        # Only include non-empty pillars
        batch_mask = coors[:, 0] == batch_itt
        this_coors = coors[batch_mask, :]
        indices = this_coors[:, 2] * self.nx + this_coors[:, 3]
        indices = indices.type(torch.long)
        voxels = voxel_features[batch_mask, :]
        voxels = voxels.t()
        # Now scatter the blob back to the canvas.
        canvas[:, indices] = voxels
        # Append to a list for later stacking.
        batch_canvas.append(canvas)
    # Stack to 3-dim tensor (batch_size, in_channels, nrows * ncols)
    batch_canvas = torch.stack(batch_canvas, 0)
    # Undo the column stacking to the final 4-dim tensor
    batch_canvas = batch_canvas.view(batch_size, self.in_channels, self.ny,
                                     self.nx)
    return batch_canvas
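The per-sample loop can also be expressed as a single advanced-indexing write. A loop-free sketch; `scatter_to_canvas` is an illustrative helper, not mmdet3d API:

```python
import torch

def scatter_to_canvas(voxel_features, coors, batch_size, in_channels, ny, nx):
    """Illustrative loop-free variant of forward_batch above.

    voxel_features: (P, C); coors: (P, 4) as (batch_idx, z, y, x).
    """
    canvas = voxel_features.new_zeros(batch_size, in_channels, ny * nx)
    batch_idx = coors[:, 0].long()
    spatial_idx = coors[:, 2].long() * nx + coors[:, 3].long()  # y * nx + x
    # Advanced indexing writes each pillar's C-vector into its canvas cell
    canvas[batch_idx, :, spatial_idx] = voxel_features
    return canvas.view(batch_size, in_channels, ny, nx)

# Shape check: 27059 pillars with 64 channels onto a 512 x 512 canvas
feats = torch.rand(27059, 64)
coors = torch.stack([torch.zeros(27059), torch.zeros(27059),
                     torch.randint(0, 512, (27059,)).float(),
                     torch.randint(0, 512, (27059,)).float()], dim=1)
print(scatter_to_canvas(feats, coors, 1, 64, 512, 512).shape)
# torch.Size([1, 64, 512, 512])
```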
SECOND extracts features with stacked conv + BN + ReLU triplets: its three stages contain [4, 6, 6] such triplets, with channel widths [64, 128, 256] respectively:
blocks = []
for i, layer_num in enumerate(layer_nums):
    block = [
        build_conv_layer(
            conv_cfg,
            in_filters[i],
            out_channels[i],
            3,
            stride=layer_strides[i],
            padding=1),
        build_norm_layer(norm_cfg, out_channels[i])[1],
        nn.ReLU(inplace=True),
    ]
    for j in range(layer_num):
        block.append(
            build_conv_layer(
                conv_cfg,
                out_channels[i],
                out_channels[i],
                3,
                padding=1))
        block.append(build_norm_layer(norm_cfg, out_channels[i])[1])
        block.append(nn.ReLU(inplace=True))
    block = nn.Sequential(*block)
    blocks.append(block)

self.blocks = nn.ModuleList(blocks)
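For intuition about the multi-scale outputs, here is a toy forward pass with the structure described above. `make_block` is an illustrative helper, and layer_nums=[3, 5, 5] with layer_strides=[2, 2, 2] are assumed config values, so check your own config:

```python
import torch
from torch import nn

def make_block(cin, cout, num_convs, stride):
    """One SECOND stage: a strided conv followed by num_convs stride-1 convs,
    each forming the conv + BN + ReLU triplet described above."""
    layers = [nn.Conv2d(cin, cout, 3, stride=stride, padding=1, bias=False),
              nn.BatchNorm2d(cout), nn.ReLU(inplace=True)]
    for _ in range(num_convs):
        layers += [nn.Conv2d(cout, cout, 3, padding=1, bias=False),
                   nn.BatchNorm2d(cout), nn.ReLU(inplace=True)]
    return nn.Sequential(*layers)

blocks = nn.ModuleList([make_block(64, 64, 3, 2),     # 4 triplets
                        make_block(64, 128, 5, 2),    # 6 triplets
                        make_block(128, 256, 5, 2)])  # 6 triplets
x = torch.rand(1, 64, 512, 512)  # pseudo-image from the scatter step
outs = []
for block in blocks:
    x = block(x)
    outs.append(x)
print([tuple(o.shape) for o in outs])
# [(1, 64, 256, 256), (1, 128, 128, 128), (1, 256, 64, 64)]
```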
The neck is SECONDFPN, which refines and fuses the features produced by the backbone. It likewise consists of conv + BN + ReLU triplets that perform the upsampling/decoding step, bringing the three scales to a common resolution (a stride-2 conv for the first scale, a 1×1 conv for the second, and a transposed conv for the third). Each of the [64, 128, 256]-channel inputs is mapped to 128 channels, and the results are concatenated into a [B, C, W, H] tensor with C = 128 × 3 = 384. The structure is as follows:
(pts_neck): SECONDFPN(
  (deblocks): ModuleList(
    (0): Sequential(
      (0): Conv2d(64, 128, kernel_size=(2, 2), stride=(2, 2), bias=False)
      (1): BatchNorm2d(128, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
    )
    (1): Sequential(
      (0): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (1): BatchNorm2d(128, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
    )
    (2): Sequential(
      (0): ConvTranspose2d(256, 128, kernel_size=(2, 2), stride=(2, 2), bias=False)
      (1): BatchNorm2d(128, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
    )
  )
)
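A quick shape check of the fusion step: each deblock maps its input scale to 128 channels at a common spatial size, and concatenation along the channel dimension yields C = 384 (the 128 × 128 spatial size follows the toy shapes above):

```python
import torch

# Each deblock output: (B, 128, 128, 128); concatenate along channels
feats = [torch.rand(1, 128, 128, 128) for _ in range(3)]
fused = torch.cat(feats, dim=1)
print(fused.shape)  # torch.Size([1, 384, 128, 128])
```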
CenterHead first applies a shared convolution, transforming the features from [B, 384, 128, 128] to [B, 64, 128, 128]; it then runs inference for each task and collects the predictions into a dict per task:
def forward(self, feats):
    """Forward pass."""
    return multi_apply(self.forward_single, feats)

def forward_single(self, x):
    """Forward function for CenterPoint."""
    ret_dicts = []
    x = self.shared_conv(x)  # shared conv (conv + BN + ReLU triplet)
    for task in self.task_heads:
        ret_dicts.append(task(x))
    return ret_dicts
Each group of classes forms one task, and each task corresponds to one SeparateHead containing 6 regression branches. The config therefore defines 6 tasks, each with its own SeparateHead, for 6 × 6 = 36 regression branches in total. One SeparateHead is structured as shown below; its 6 branches are reg, height, dim, rot, vel, and heatmap. After CenterHead processing we obtain a list of predictions over the 6 tasks.
Note: classes with very different BEV footprints (e.g. car vs. pedestrian) are split into separate tasks, while pedestrian and traffic_cone have similar BEV sizes and are therefore regressed together as one task (see the task-grouping sketch after the printout below).
(0): SeparateHead(
  (reg): Sequential(
    (0): ConvModule(
      (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (activate): ReLU(inplace=True)
    )
    (1): Conv2d(64, 2, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  )
  (height): Sequential(
    (0): ConvModule(
      (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (activate): ReLU(inplace=True)
    )
    (1): Conv2d(64, 1, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  )
  (dim): Sequential(
    (0): ConvModule(
      (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (activate): ReLU(inplace=True)
    )
    (1): Conv2d(64, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  )
  (rot): Sequential(
    (0): ConvModule(
      (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (activate): ReLU(inplace=True)
    )
    (1): Conv2d(64, 2, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  )
  (vel): Sequential(
    (0): ConvModule(
      (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (activate): ReLU(inplace=True)
    )
    (1): Conv2d(64, 2, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  )
  (heatmap): Sequential(
    (0): ConvModule(
      (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (activate): ReLU(inplace=True)
    )
    (1): Conv2d(64, 1, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  )
)
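For reference, the task grouping in a nuScenes-style CenterPoint config typically looks like the sketch below; this is quoted from memory of the mmdet3d config, so verify against your own config file:

```python
tasks = [
    dict(num_class=1, class_names=['car']),
    dict(num_class=2, class_names=['truck', 'construction_vehicle']),
    dict(num_class=2, class_names=['bus', 'trailer']),
    dict(num_class=1, class_names=['barrier']),
    dict(num_class=2, class_names=['motorcycle', 'bicycle']),
    dict(num_class=2, class_names=['pedestrian', 'traffic_cone']),
]
```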
For each task, get_targets uses the ground-truth boxes to produce heatmaps, anno_boxes, inds, and masks. Their meanings are summarized below:

| Parameter | heatmap | anno_box | ind | mask |
|---|---|---|---|---|
| Meaning | center-point heatmap scores | ground-truth box encoding | position of each box center in the heatmap | validity mask over boxes (1/0) |
| Shape | [class_num, 128, 128] | [500, 10] | [500] | [500] |
| Example | one heatmap per class | the 10 dims: 1-2 center offsets offset_x, offset_y; 3 center height z; 4-6 box size box_dim; 7-8 rotation sin(α), cos(α); 9-10 velocity vx, vy | ind[idx] = x*128 + y | mask[idx] = 1 |
The loss has two main components: a focal loss on the heatmap and an L1 loss on the bbox regression.
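For reference, the heatmap loss is the penalty-reduced pixel-wise focal loss from CenterNet (with the usual α = 2, β = 4); this formula is quoted from the CenterNet paper, not derived from the code below:

$$
L_{hm} = \frac{-1}{N}\sum_{xyc}
\begin{cases}
(1-\hat{Y}_{xyc})^{\alpha}\,\log(\hat{Y}_{xyc}) & \text{if } Y_{xyc}=1 \\
(1-Y_{xyc})^{\beta}\,(\hat{Y}_{xyc})^{\alpha}\,\log(1-\hat{Y}_{xyc}) & \text{otherwise}
\end{cases}
$$

where N is the number of object centers, matching avg_factor=max(num_pos, 1) in the code below.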
def loss(self, gt_bboxes_3d, gt_labels_3d, preds_dicts, **kwargs):
    """Loss function for CenterHead."""
    heatmaps, anno_boxes, inds, masks = self.get_targets(
        gt_bboxes_3d, gt_labels_3d)
    loss_dict = dict()
    for task_id, preds_dict in enumerate(preds_dicts):
        # Loss 1: heatmap focal loss
        preds_dict[0]['heatmap'] = clip_sigmoid(preds_dict[0]['heatmap'])
        num_pos = heatmaps[task_id].eq(1).float().sum().item()
        loss_heatmap = self.loss_cls(
            preds_dict[0]['heatmap'],
            heatmaps[task_id],
            avg_factor=max(num_pos, 1))
        target_box = anno_boxes[task_id]
        # Reconstruct the anno_box from the multiple regression heads
        preds_dict[0]['anno_box'] = torch.cat(
            (preds_dict[0]['reg'], preds_dict[0]['height'],
             preds_dict[0]['dim'], preds_dict[0]['rot'],
             preds_dict[0]['vel']),
            dim=1)
        # Regression loss for dimension, offset, height, rotation
        ind = inds[task_id]
        num = masks[task_id].float().sum()
        pred = preds_dict[0]['anno_box'].permute(0, 2, 3, 1).contiguous()
        pred = pred.view(pred.size(0), -1, pred.size(3))
        pred = self._gather_feat(pred, ind)
        mask = masks[task_id].unsqueeze(2).expand_as(target_box).float()
        isnotnan = (~torch.isnan(target_box)).float()
        mask *= isnotnan
        code_weights = self.train_cfg.get('code_weights', None)
        bbox_weights = mask * mask.new_tensor(code_weights)
        # Loss 2: bbox L1 loss
        loss_bbox = self.loss_bbox(
            pred, target_box, bbox_weights, avg_factor=(num + 1e-4))
        loss_dict[f'task{task_id}.loss_heatmap'] = loss_heatmap
        loss_dict[f'task{task_id}.loss_bbox'] = loss_bbox
    return loss_dict
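The _gather_feat helper used above picks, for each ind entry, the prediction vector at that heatmap cell. An illustrative re-implementation of the same idea (a sketch, not the exact mmdet3d code):

```python
import torch

def gather_feat(feat, ind):
    """Pick the feature vector at each flattened center index.

    feat: (B, H*W, C) flattened predictions; ind: (B, max_objs) indices.
    """
    dim = feat.size(2)
    ind = ind.unsqueeze(2).expand(ind.size(0), ind.size(1), dim)  # (B, K, C)
    return feat.gather(1, ind)                                    # (B, K, C)

# A box whose ind entry is x * 128 + y (see the target table above):
feat = torch.rand(1, 128 * 128, 10)
ind = torch.tensor([[40 * 128 + 70]])
print(gather_feat(feat, ind).shape)  # torch.Size([1, 1, 10])
```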