• How to run inference on a single frame of data in nuScenes format (with DETR3D as an example)


    [Please respect original work! When reposting or quoting this article, be sure to credit the source. Anyone who uploads it without permission to a document-hosting or other paid-reading/download site for profit will be held responsible!]

    Neither mmdetection3D https://github.com/open-mmlab/mmdetection3d nor OpenPCDet https://github.com/open-mmlab/OpenPCDet provides code for running inference on a single frame of data in a given dataset format (the kind of inference demo script many 2D detection frameworks ship); they only provide code for training and testing against a dataset specified in the config file. To support many different datasets and models, OpenPCDet and especially mmdetection3D make most implementation steps highly configuration-driven, which is both a blessing and a curse. If you only use these frameworks to train and test (especially with the public datasets they already support), things are relatively simple. But if you want to run inference on a single frame (data from a supported public dataset, or from your own dataset built in the same format), it gets complicated: the frameworks provide no tooling for this, so you have to write it yourself. Before you can do that, you have to read the relevant framework code several times and piece together the logic of the config files and the code scattered across many places for the dataset and model you care about, because heavy configurability fragments the code and hurts its continuity and readability. Only after sorting that out can you collect, adapt, and stitch those scattered, config-driven pieces together to implement the full pipeline: the multi-step data preprocessing, the model call, and the parsing and post-processing of the inference results.

    The most annoying part of feeding nuScenes-format data to the DETR3D model is that every call to the model requires the corresponding img_metas data. I don't think this design is very reasonable, or at least it isn't friendly: once the hardware and the calibration parameters are fixed, these values could be passed to the model implicitly through configuration, so why must they be supplied on every call? In any case, before writing the inference code below I wasted quite a bit of time just working out how the img_metas data should be prepared; a sketch of what it ends up containing is given right after this paragraph.
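    For orientation only, here is a minimal sketch of what the img_metas argument ends up containing in the code further down. The names on the right-hand side are placeholders for values that the code computes from the preprocessing results and the nuScenes calibration records; they are not defined here:

    # Sketch of the per-frame meta dict assembled in the code below and passed to the model.
    meta = {
        'filename': 'sample.jpg',            # set by LoadSingleViewImageFromFiles
        'img_shape': pad_shape,              # image shape(s) after padding
        'ori_shape': ori_shape,              # image shape(s) before preprocessing
        'pad_shape': pad_shape,
        'scale_factor': 1.0,
        'box_type_3d': LiDARInstance3DBoxes,
        'box_mode_3d': Box3DMode.LIDAR,
        'img_filename': image_paths,         # paths of the six camera images
        'lidar2img': lidar2img_rts,          # per-camera 4x4 lidar->image projection matrices
        'cam_intrinsic': cam_intrinsics,     # per-camera 4x4 padded intrinsic matrices
        'lidar2cam': lidar2cam_rts,          # per-camera 4x4 lidar->camera extrinsic matrices
    }
    img_metas = [[meta]]                     # a batch of one sample, as the model's test forward expects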

    Based on the original official DETR3D implementation (GitHub - WangYueFt/detr3d), the code below covers the data preprocessing, the model call, and the post-processing of the inference results. It is written to run inside a ROS2 node, and, as the project required, the parsed results are converted into Autoware's DetectedObjects format before being published, so it also contains ROS2- and Autoware-related code; that part is not the focus and is included only to show the complete logic. Of course, if your project has a similar need, you can copy and use this code directly.

    import rclpy
    from rclpy.node import Node
    from rclpy.executors import MultiThreadedExecutor
    from rclpy.callback_groups import MutuallyExclusiveCallbackGroup, ReentrantCallbackGroup
    from cv_bridge import CvBridge
    from sensor_msgs.msg import CompressedImage, Image
    from std_msgs.msg import String
    from autoware_auto_perception_msgs.msg import DetectedObjects
    from autoware_auto_perception_msgs.msg import DetectedObject
    from autoware_auto_perception_msgs.msg import ObjectClassification
    from autoware_auto_perception_msgs.msg import DetectedObjectKinematics
    from autoware_auto_perception_msgs.msg import Shape
    import numpy as np
    import mmcv
    import sys, os
    import torch
    import warnings
    from mmcv import Config, DictAction
    from mmcv.cnn import fuse_conv_bn
    from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
    from mmcv.runner import (get_dist_info, init_dist, load_checkpoint,
                             wrap_fp16_model)
    from mmdet3d.models import build_model
    from mmdet.apis import multi_gpu_test, set_random_seed
    from mmdet.datasets import replace_ImageToTensor
    from mmdet3d.core.bbox.structures.box_3d_mode import (Box3DMode, CameraInstance3DBoxes,
                                                          DepthInstance3DBoxes, LiDARInstance3DBoxes)
    from nuscenes import NuScenes
    from pyquaternion import Quaternion
    from geometry_msgs.msg import Point
    from geometry_msgs.msg import Vector3
    from trimesh.transformations import quaternion_from_euler
    from geometry_msgs.msg import Quaternion as GeoQuaternion
    from geometry_msgs.msg import Twist
    import math
    import time

    class DetectionPublisher(Node):
        def __init__(self):
            super().__init__('DetectionPublisher_python')
            self.publisher_ = self.create_publisher(String, 'example_topic', 10)
            timer_period = 0.5  # seconds
            self.timer = self.create_timer(timer_period, self.timer_callback)
            self.i = 0

        def timer_callback(self):
            msg = String()
            msg.data = 'Hello World: %d' % self.i
            self.publisher_.publish(msg)
            self.get_logger().info('Publishing: "%s"' % msg.data)
            self.i += 1

    class PadMultiViewImage(object):
        """Pad the multi-view image.
        There are two padding modes: (1) pad to a fixed size and (2) pad to the
        minimum size that is divisible by some number.
        Added keys are "pad_shape", "pad_fixed_size", "pad_size_divisor".
        Args:
            size (tuple, optional): Fixed padding size.
            size_divisor (int, optional): The divisor of padded size.
            pad_val (float, optional): Padding value, 0 by default.
        """
        def __init__(self, size=None, size_divisor=32, pad_val=0):
            self.size = size
            self.size_divisor = size_divisor
            self.pad_val = pad_val
            # only one of size and size_divisor should be valid
            assert size is not None or size_divisor is not None
            assert size is None or size_divisor is None

        def _pad_img(self, results):
            """Pad images according to ``self.size``."""
            if self.size is not None:
                padded_img = [mmcv.impad(
                    img, shape=self.size, pad_val=self.pad_val) for img in results['img']]
            elif self.size_divisor is not None:
                padded_img = [mmcv.impad_to_multiple(
                    img, self.size_divisor, pad_val=self.pad_val) for img in results['img']]
            results['img'] = padded_img
            results['img_shape'] = [img.shape for img in padded_img]
            results['pad_shape'] = [img.shape for img in padded_img]
            results['pad_fixed_size'] = self.size
            results['pad_size_divisor'] = self.size_divisor

        def __call__(self, results):
            """Call function to pad images, masks, semantic segmentation maps.
            Args:
                results (dict): Result dict from loading pipeline.
            Returns:
                dict: Updated result dict.
            """
            self._pad_img(results)
            return results

        def __repr__(self):
            repr_str = self.__class__.__name__
            repr_str += f'(size={self.size}, '
            repr_str += f'size_divisor={self.size_divisor}, '
            repr_str += f'pad_val={self.pad_val})'
            return repr_str

    class NormalizeMultiviewImage(object):
        """Normalize the image.
        Added key is "img_norm_cfg".
        Args:
            mean (sequence): Mean values of 3 channels.
            std (sequence): Std values of 3 channels.
            to_rgb (bool): Whether to convert the image from BGR to RGB,
                default is true.
        """
        def __init__(self, mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=True):
            self.mean = np.array(mean, dtype=np.float32)
            self.std = np.array(std, dtype=np.float32)
            self.to_rgb = to_rgb

        def __call__(self, results):
            """Call function to normalize images.
            Args:
                results (dict): Result dict from loading pipeline.
            Returns:
                dict: Normalized results, 'img_norm_cfg' key is added into
                    result dict.
            """
            results['img'] = [mmcv.imnormalize(
                img, self.mean, self.std, self.to_rgb) for img in results['img']]
            results['img_norm_cfg'] = dict(
                mean=self.mean, std=self.std, to_rgb=self.to_rgb)
            return results

        def __repr__(self):
            repr_str = self.__class__.__name__
            repr_str += f'(mean={self.mean}, std={self.std}, to_rgb={self.to_rgb})'
            return repr_str

    class LoadSingleViewImageFromFiles(object):
        """Load multi channel images from a list of separate channel files.
        Expects results['img_filename'] to be a list of filenames.
        Args:
            to_float32 (bool): Whether to convert the img to float32.
                Defaults to False.
            color_type (str): Color type of the file. Defaults to 'unchanged'.
        """
        def __init__(self, to_float32=True, color_type='unchanged'):
            self.to_float32 = to_float32
            self.color_type = color_type

        def __call__(self, results):
            """Call function to load multi-view image from files.
            Args:
                results (dict): Result dict containing multi-view image filenames.
            Returns:
                dict: The result dict containing the multi-view image data.
                    Added keys and values are described below.
                    - filename (str): Multi-view image filenames.
                    - img (np.ndarray): Multi-view image arrays.
                    - img_shape (tuple[int]): Shape of multi-view image arrays.
                    - ori_shape (tuple[int]): Shape of original image arrays.
                    - pad_shape (tuple[int]): Shape of padded image arrays.
                    - scale_factor (float): Scale factor.
                    - img_norm_cfg (dict): Normalization configuration of images.
            """
            results['filename'] = 'sample.jpg'
            # h,w,c => h, w, c, nv
            img = np.stack([results['img']], axis=-1)
            if self.to_float32:
                img = img.astype(np.float32)
            results['img'] = [img[..., i] for i in range(img.shape[-1])]
            results['img_shape'] = img.shape
            results['ori_shape'] = img.shape
            # Set initial values for default meta_keys
            results['pad_shape'] = img.shape
            results['scale_factor'] = 1.0
            num_channels = 1 if len(img.shape) < 3 else img.shape[2]
            results['img_norm_cfg'] = dict(
                mean=np.zeros(num_channels, dtype=np.float32),
                std=np.ones(num_channels, dtype=np.float32),
                to_rgb=False)
            return results

        def __repr__(self):
            """str: Return a string that describes the module."""
            repr_str = self.__class__.__name__
            repr_str += f'(to_float32={self.to_float32}, '
            repr_str += f"color_type='{self.color_type}')"
            return repr_str

    def obtain_sensor2top(nusc,
                          sensor_token,
                          l2e_t,
                          l2e_r_mat,
                          e2g_t,
                          e2g_r_mat,
                          sensor_type='lidar'):
        """Obtain the info with RT matrix from general sensor to Top LiDAR.
        Args:
            nusc (class): Dataset class in the nuScenes dataset.
            sensor_token (str): Sample data token corresponding to the
                specific sensor type.
            l2e_t (np.ndarray): Translation from lidar to ego in shape (1, 3).
            l2e_r_mat (np.ndarray): Rotation matrix from lidar to ego
                in shape (3, 3).
            e2g_t (np.ndarray): Translation from ego to global in shape (1, 3).
            e2g_r_mat (np.ndarray): Rotation matrix from ego to global
                in shape (3, 3).
            sensor_type (str, optional): Sensor to calibrate. Default: 'lidar'.
        Returns:
            sweep (dict): Sweep information after transformation.
        """
        sd_rec = nusc.get('sample_data', sensor_token)
        cs_record = nusc.get('calibrated_sensor',
                             sd_rec['calibrated_sensor_token'])
        pose_record = nusc.get('ego_pose', sd_rec['ego_pose_token'])
        data_path = str(nusc.get_sample_data_path(sd_rec['token']))
        if os.getcwd() in data_path:  # path from lyftdataset is absolute path
            data_path = data_path.split(f'{os.getcwd()}/')[-1]  # relative path
        sweep = {
            'data_path': data_path,
            'type': sensor_type,
            'sample_data_token': sd_rec['token'],
            'sensor2ego_translation': cs_record['translation'],
            'sensor2ego_rotation': cs_record['rotation'],
            'ego2global_translation': pose_record['translation'],
            'ego2global_rotation': pose_record['rotation'],
            'timestamp': sd_rec['timestamp']
        }
        l2e_r_s = sweep['sensor2ego_rotation']
        l2e_t_s = sweep['sensor2ego_translation']
        e2g_r_s = sweep['ego2global_rotation']
        e2g_t_s = sweep['ego2global_translation']
        # obtain the RT from sensor to Top LiDAR
        # sweep->ego->global->ego'->lidar
        l2e_r_s_mat = Quaternion(l2e_r_s).rotation_matrix
        e2g_r_s_mat = Quaternion(e2g_r_s).rotation_matrix
        R = (l2e_r_s_mat.T @ e2g_r_s_mat.T) @ (
            np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T)
        T = (l2e_t_s @ e2g_r_s_mat.T + e2g_t_s) @ (
            np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T)
        T -= e2g_t @ (np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T
                      ) + l2e_t @ np.linalg.inv(l2e_r_mat).T
        sweep['sensor2lidar_rotation'] = R.T  # points @ R.T + T
        sweep['sensor2lidar_translation'] = T
        return sweep

    def getSemanticType(class_name):
        if (class_name == "CAR" or class_name == "Car"):
            return ObjectClassification.CAR
        elif (class_name == "TRUCK" or class_name == "Medium_Truck" or class_name == "Big_Truck"):
            return ObjectClassification.TRUCK
        elif (class_name == "BUS"):
            return ObjectClassification.BUS
        elif (class_name == "TRAILER"):
            return ObjectClassification.TRAILER
        elif (class_name == "BICYCLE"):
            return ObjectClassification.BICYCLE
        elif (class_name == "MOTORBIKE"):
            return ObjectClassification.MOTORCYCLE
        elif (class_name == "PEDESTRIAN" or class_name == "Pedestrian"):
            return ObjectClassification.PEDESTRIAN
        else:
            return ObjectClassification.UNKNOWN

    class CustomBox3D(object):
        def __init__(self, nid, score, x, y, z, w, l, h, rt, vel_x, vel_y):
            self.id = nid
            self.score = score
            self.x = x
            self.y = y
            self.z = z
            self.w = w
            self.l = l
            self.h = h
            self.rt = rt
            self.vel_x = vel_x
            self.vel_y = vel_y

    def isCarLikeVehicle(label):
        return label == ObjectClassification.BICYCLE or label == ObjectClassification.BUS or \
            label == ObjectClassification.CAR or label == ObjectClassification.MOTORCYCLE or \
            label == ObjectClassification.TRAILER or label == ObjectClassification.TRUCK

    def createPoint(x, y, z):
        p = Point()
        p.x = float(x)
        p.y = float(y)
        p.z = float(z)
        return p

    def createQuaternionFromYaw(yaw):
        # tf2.Quaternion
        # q.setRPY(0, 0, yaw)
        q = quaternion_from_euler(0, 0, yaw)
        # geometry_msgs.msg.Quaternion
        #return tf2.toMsg(q)
        #return GeoQuaternion(*q)
        # trimesh's quaternion_from_euler returns [w, x, y, z], so map the fields accordingly
        return GeoQuaternion(x=q[1], y=q[2], z=q[3], w=q[0])

    def createTranslation(x, y, z):
        v = Vector3()
        v.x = float(x)
        v.y = float(y)
        v.z = float(z)
        return v

    def box3DToDetectedObject(box3d, class_names, has_twist, is_sign):
        obj = DetectedObject()
        obj.existence_probability = float(box3d.score)
        classification = ObjectClassification()
        classification.probability = 1.0
        if (box3d.id >= 0 and box3d.id < len(class_names)):
            classification.label = getSemanticType(class_names[box3d.id])
        else:
            if is_sign:
                sign_label = 255
                classification.label = sign_label
            else:
                classification.label = ObjectClassification.UNKNOWN
                print("Unexpected label: UNKNOWN is set.")
        if (isCarLikeVehicle(classification.label)):
            obj.kinematics.orientation_availability = DetectedObjectKinematics.SIGN_UNKNOWN
        obj.classification.append(classification)
        # pose and shape
        # mmdet3d yaw format to ros yaw format
        yaw = -box3d.rt - np.pi / 2
        obj.kinematics.pose_with_covariance.pose.position = createPoint(box3d.x, box3d.y, box3d.z)
        obj.kinematics.pose_with_covariance.pose.orientation = createQuaternionFromYaw(yaw)
        obj.shape.type = Shape.BOUNDING_BOX
        obj.shape.dimensions = createTranslation(box3d.l, box3d.w, box3d.h)
        # twist
        if (has_twist):
            vel_x = float(box3d.vel_x)
            vel_y = float(box3d.vel_y)
            twist = Twist()
            twist.linear.x = math.sqrt(pow(vel_x, 2) + pow(vel_y, 2))
            twist.angular.z = 2 * (math.atan2(vel_y, vel_x) - yaw)
            obj.kinematics.twist_with_covariance.twist = twist
            obj.kinematics.has_twist = has_twist
        return obj

    class ImageSubscriber(Node):
        def __init__(self):
            super().__init__('ImageSubscriber_python')
            cb_group = MutuallyExclusiveCallbackGroup()
            self.img_sub = self.create_subscription(
                CompressedImage,
                'pub_image/compressed',
                self.image_callback,
                10,
                callback_group=cb_group)
            self.img_sub
            self.od_pub = self.create_publisher(DetectedObjects, 'pub_detection', 10)
            self.cvBridge = CvBridge()
            self.pad = PadMultiViewImage()
            self.norm = NormalizeMultiviewImage()
            self.file_loader = LoadSingleViewImageFromFiles()
            config_path = "./detr3d_res101_gridmask_wst.py"
            self.cfg = Config.fromfile(config_path)
            if self.cfg.get('custom_imports', None):
                from mmcv.utils import import_modules_from_strings
                import_modules_from_strings(**self.cfg['custom_imports'])
            if hasattr(self.cfg, 'plugin'):
                if self.cfg.plugin:
                    import importlib
                    if hasattr(self.cfg, 'plugin_dir'):
                        plugin_dir = self.cfg.plugin_dir
                        _module_dir = os.path.dirname(plugin_dir)
                        _module_dir = _module_dir.split('/')
                        _module_path = _module_dir[0]
                        for m in _module_dir[1:]:
                            _module_path = _module_path + '.' + m
                        print(_module_path)
                        print(sys.path)
                        plg_lib = importlib.import_module(_module_path)
            if self.cfg.get('cudnn_benchmark', False):
                torch.backends.cudnn.benchmark = True
            self.cfg.model.pretrained = None
            self.cfg.model.train_cfg = None
            self.model = build_model(self.cfg.model, test_cfg=self.cfg.get('test_cfg'))
            fp16_cfg = self.cfg.get('fp16', None)
            if fp16_cfg is not None:
                wrap_fp16_model(self.model)
            checkpoint = load_checkpoint(self.model, "epoch_200.pth", map_location='cpu')
            if 'CLASSES' in checkpoint.get('meta', {}):
                self.model.CLASSES = checkpoint['meta']['CLASSES']
            else:
                self.model.CLASSES = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle',
                                      'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone',
                                      'barrier')
            # palette for visualization in segmentation tasks
            if 'PALETTE' in checkpoint.get('meta', {}):
                self.model.PALETTE = checkpoint['meta']['PALETTE']
            self.model.cfg = self.cfg
            self.model.cuda()
            self.model.eval()
            #if torch.cuda.device_count() > 1:  # for server side
            #    self.model = nn.DataParallel(self.model)
            print("model is created!")
            nusc = NuScenes(version='v1.0-mini', dataroot='nuscenes_mini', verbose=True)
            scene0 = nusc.scene[0]
            first_sample_token = scene0['first_sample_token']
            first_sample = nusc.get('sample', first_sample_token)
            sd_rec = nusc.get('sample_data', first_sample['data']['LIDAR_TOP'])
            cs_record = nusc.get('calibrated_sensor',
                                 sd_rec['calibrated_sensor_token'])
            pose_record = nusc.get('ego_pose', sd_rec['ego_pose_token'])
            lidar_token = first_sample['data']['LIDAR_TOP']
            lidar_path, boxes, _ = nusc.get_sample_data(lidar_token)
            info = {
                'lidar_path': lidar_path,
                'token': first_sample['token'],
                'sweeps': [],
                'cams': dict(),
                'lidar2ego_translation': cs_record['translation'],
                'lidar2ego_rotation': cs_record['rotation'],
                'ego2global_translation': pose_record['translation'],
                'ego2global_rotation': pose_record['rotation'],
                'timestamp': first_sample['timestamp'],
            }
            l2e_r = info['lidar2ego_rotation']
            l2e_t = info['lidar2ego_translation']
            e2g_r = info['ego2global_rotation']
            e2g_t = info['ego2global_translation']
            l2e_r_mat = Quaternion(l2e_r).rotation_matrix
            e2g_r_mat = Quaternion(e2g_r).rotation_matrix
            camera_types = [
                'CAM_FRONT',
                'CAM_FRONT_RIGHT',
                'CAM_FRONT_LEFT',
                'CAM_BACK',
                'CAM_BACK_LEFT',
                'CAM_BACK_RIGHT',
            ]
            for cam in camera_types:
                cam_token = first_sample['data'][cam]
                cam_path, _, cam_intrinsic = nusc.get_sample_data(cam_token)
                cam_info = obtain_sensor2top(nusc, cam_token, l2e_t, l2e_r_mat,
                                             e2g_t, e2g_r_mat, cam)
                cam_info.update(cam_intrinsic=cam_intrinsic)
                info['cams'].update({cam: cam_info})
            '''
            cam_front_sample_data = nusc.get('sample_data', first_sample['data']['CAM_FRONT'])
            cam_front_sample_path = os.path.join(nusc.dataroot, cam_front_sample_data['filename'])
            print("sample image file:", cam_front_sample_path)
            cam_front_calibrate = nusc.get('calibrated_sensor', cam_front_sample_data['calibrated_sensor_token'])
            sensor2lidar = translation = np.expand_dims(np.array(cam_front_calibrate['translation']), axis=-1)
            sensor2lidar_rotation = np.expand_dims(np.array(cam_front_calibrate['rotation']), axis=-1)
            camera_intrinsic = np.array(cam_front_calibrate['camera_intrinsic'])
            '''
            image_paths = []
            lidar2img_rts = []
            lidar2cam_rts = []
            cam_intrinsics = []
            for cam_type, cam_info in info['cams'].items():
                image_paths.append(cam_info['data_path'])
                # obtain lidar to image transformation matrix
                lidar2cam_r = np.linalg.inv(cam_info['sensor2lidar_rotation'])
                lidar2cam_t = cam_info['sensor2lidar_translation'] @ lidar2cam_r.T
                lidar2cam_rt = np.eye(4)
                lidar2cam_rt[:3, :3] = lidar2cam_r.T
                lidar2cam_rt[3, :3] = -lidar2cam_t
                intrinsic = cam_info['cam_intrinsic']
                viewpad = np.eye(4)
                viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic
                lidar2img_rt = (viewpad @ lidar2cam_rt.T)
                lidar2img_rts.append(lidar2img_rt)
                cam_intrinsics.append(viewpad)
                lidar2cam_rts.append(lidar2cam_rt.T)
            self.img_metas = {}
            self.img_metas.update(
                dict(
                    img_filename=image_paths,
                    lidar2img=lidar2img_rts,
                    cam_intrinsic=cam_intrinsics,
                    lidar2cam=lidar2cam_rts,
                ))
            self.class_names_ = self.cfg.class_names
            print("ImageSubscriber init done")

        def image_callback(self, msg):
            #image = self.cvBridge.imgmsg_to_cv2(msg, "bgr8")
            image = self.cvBridge.compressed_imgmsg_to_cv2(msg, "bgr8")
            #print("image received, shape:", image.shape)
            results = {'img': image}
            self.file_loader(results)
            self.norm(results)
            self.pad(results)
            image = results['img'][0]
            meta = {'filename': results['filename'],
                    'img_shape': results['img_shape'],
                    'ori_shape': results['ori_shape'],
                    'pad_shape': results['pad_shape'],
                    'scale_factor': results['scale_factor'],
                    'box_type_3d': LiDARInstance3DBoxes,  # CameraInstance3DBoxes/LiDARInstance3DBoxes
                    'box_mode_3d': Box3DMode.LIDAR}
            meta.update(self.img_metas)
            #print("meta:", meta)
            img_metas = [[meta]]
            inputs = torch.tensor(image).to('cuda')
            # h,w,c => bs,nv,c,h,w
            inputs = inputs.permute(2, 0, 1)
            inputs = torch.unsqueeze(inputs, 0)
            inputs = torch.unsqueeze(inputs, 0)
            #print("input tensor shape:", inputs.shape)
            with torch.no_grad():
                outputs = self.model(return_loss=False, rescale=True, img=inputs, img_metas=img_metas)
            torch.cuda.synchronize()
            pts_bbox = outputs[0]['pts_bbox']
            boxes_3d_enc = pts_bbox['boxes_3d']
            scores_3d = pts_bbox['scores_3d']
            labels_3d = pts_bbox['labels_3d']
            mask = scores_3d >= 0.5
            boxes_3d_enc.tensor = boxes_3d_enc.tensor[mask]
            boxes_3d = boxes_3d_enc.tensor.numpy()  # [[cx, cy, cz, w, l, h, rot, vx, vy]]
            scores_3d = scores_3d[mask].numpy()
            labels_3d = labels_3d[mask].numpy()
            custom_boxes_3d = []
            for i, box_3d in enumerate(boxes_3d):
                box3d = CustomBox3D(labels_3d[i], scores_3d[i],
                                    box_3d[0], box_3d[1], box_3d[2],
                                    box_3d[3], box_3d[4], box_3d[5],
                                    box_3d[6], box_3d[7], box_3d[8])
                custom_boxes_3d.append(box3d)
            #print("boxes_3d", boxes_3d)
            #print("scores_3d", scores_3d)
            #print("labels_3d", labels_3d)
            output_msg = DetectedObjects()
            obj_num = len(boxes_3d)
            for i, box3d in enumerate(custom_boxes_3d):
                obj = box3DToDetectedObject(box3d, self.class_names_, True, False)
                output_msg.objects.append(obj)
            output_msg.header.stamp = self.get_clock().now().to_msg()  # rclpy.time.Time()
            output_msg.header.frame_id = "base_link"
            self.od_pub.publish(output_msg)
            print(obj_num, "Objects published")

    def main(args=None):
        rclpy.init(args=args)
        sys.path.append(os.getcwd())
        image_subscriber = ImageSubscriber()
        executor = MultiThreadedExecutor()
        executor.add_node(image_subscriber)
        executor.spin()
        image_subscriber.destroy_node()
        rclpy.shutdown()

    if __name__ == '__main__':
        main()

    The single frame of data discussed here can be a single image or the six surround-view images of a nuScenes sample. The word "frame" is borrowed from point clouds: in practice, 3D object detection, and lidar-camera fusion in particular, usually runs inference at the point-cloud frame rate (one or more images, depending on how many cameras there are, are grabbed together with each point-cloud frame). With early fusion, the point-cloud and image data are fused first and then fed to the model, or fed to the model directly and fused inside the model at the feature level; with late fusion, the point cloud and the images go through separate models and the per-model results are fused afterwards.

    Because the code above is based on the official DETR3D code, it differs from the DETR3D implementation that was later integrated into mmdetection3D. Comparing the two, you can see that mmdetection3D moved some of the data-preprocessing steps out of the official DETR3D code and instead performs the common processing inside the model with a dedicated Det3DDataPreprocessor class. The overall config files and implementation approach are still quite similar, since the official DETR3D was developed on top of an early version of mmdetection3D; for example, the two gridmask config files below differ in some settings, but the ideas behind them are alike:

    mmdetection3d/projects/DETR3D/configs/detr3d_r101_gridmask.py

    model = dict(
        type='DETR3D',
        use_grid_mask=True,
        data_preprocessor=dict(
            type='Det3DDataPreprocessor', **img_norm_cfg, pad_size_divisor=32),
        img_backbone=dict(
            type='mmdet.ResNet',
            depth=101,
            num_stages=4,
            out_indices=(0, 1, 2, 3),
            frozen_stages=1,
            norm_cfg=dict(type='BN2d', requires_grad=False),
            norm_eval=True,
            style='caffe',
            dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
            stage_with_dcn=(False, False, True, True)),
        img_neck=dict(
            type='mmdet.FPN',
            in_channels=[256, 512, 1024, 2048],
            out_channels=256,
            start_level=1,
            add_extra_convs='on_output',
            num_outs=4,
            relu_before_extra_convs=True),
        pts_bbox_head=dict(
            type='DETR3DHead',
            num_query=900,
            num_classes=10,
            in_channels=256,
            sync_cls_avg_factor=True,
            with_box_refine=True,
            as_two_stage=False,
            transformer=dict(
                type='Detr3DTransformer',
                decoder=dict(
                    type='Detr3DTransformerDecoder',
                    num_layers=6,
                    return_intermediate=True,
                    transformerlayers=dict(
                        type='mmdet.DetrTransformerDecoderLayer',
                        attn_cfgs=[
                            dict(
                                type='MultiheadAttention',  # mmcv.
                                embed_dims=256,
                                num_heads=8,
                                dropout=0.1),
                            dict(
                                type='Detr3DCrossAtten',
                                pc_range=point_cloud_range,
                                num_points=1,
                                embed_dims=256)
                        ],
                        feedforward_channels=512,
                        ffn_dropout=0.1,
                        operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
                                         'ffn', 'norm')))),
            bbox_coder=dict(
                type='NMSFreeCoder',
                post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
                pc_range=point_cloud_range,
                max_num=300,
                voxel_size=voxel_size,
                num_classes=10),
            positional_encoding=dict(
                type='mmdet.SinePositionalEncoding',
                num_feats=128,
                normalize=True,
                offset=-0.5),
            loss_cls=dict(
                type='mmdet.FocalLoss',
                use_sigmoid=True,
                gamma=2.0,
                alpha=0.25,
                loss_weight=2.0),
            loss_bbox=dict(type='mmdet.L1Loss', loss_weight=0.25),
            loss_iou=dict(type='mmdet.GIoULoss', loss_weight=0.0)),
        # model training and testing settings
        train_cfg=dict(
            pts=dict(
                grid_size=[512, 512, 1],
                voxel_size=voxel_size,
                point_cloud_range=point_cloud_range,
                out_size_factor=4,
                assigner=dict(
                    type='HungarianAssigner3D',
                    cls_cost=dict(type='mmdet.FocalLossCost', weight=2.0),
                    reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
                    # ↓ Fake cost. This is just to get compatible with DETR head
                    iou_cost=dict(type='mmdet.IoUCost', weight=0.0),
                    pc_range=point_cloud_range))))

    dataset_type = 'NuScenesDataset'
    data_root = 'data/nuscenes/'

    test_transforms = [
        dict(
            type='RandomResize3D',
            scale=(1600, 900),
            ratio_range=(1., 1.),
            keep_ratio=True)
    ]
    train_transforms = [dict(type='PhotoMetricDistortion3D')] + test_transforms

    backend_args = None
    train_pipeline = [
        dict(
            type='LoadMultiViewImageFromFiles',
            to_float32=True,
            num_views=6,
            backend_args=backend_args),
        dict(
            type='LoadAnnotations3D',
            with_bbox_3d=True,
            with_label_3d=True,
            with_attr_label=False),
        dict(type='MultiViewWrapper', transforms=train_transforms),
        dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
        dict(type='ObjectNameFilter', classes=class_names),
        dict(type='Pack3DDetInputs', keys=['img', 'gt_bboxes_3d', 'gt_labels_3d'])
    ]
    test_pipeline = [
        dict(
            type='LoadMultiViewImageFromFiles',
            to_float32=True,
            num_views=6,
            backend_args=backend_args),
        dict(type='MultiViewWrapper', transforms=test_transforms),
        dict(type='Pack3DDetInputs', keys=['img'])
    ]

    detr3d/projects/configs/detr3d/detr3d_res101_gridmask.py

    model = dict(
        type='Detr3D',
        use_grid_mask=True,
        img_backbone=dict(
            type='ResNet',
            depth=101,
            num_stages=4,
            out_indices=(0, 1, 2, 3),
            frozen_stages=1,
            norm_cfg=dict(type='BN2d', requires_grad=False),
            norm_eval=True,
            style='caffe',
            dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
            stage_with_dcn=(False, False, True, True)),
        img_neck=dict(
            type='FPN',
            in_channels=[256, 512, 1024, 2048],
            out_channels=256,
            start_level=1,
            add_extra_convs='on_output',
            num_outs=4,
            relu_before_extra_convs=True),
        pts_bbox_head=dict(
            type='Detr3DHead',
            num_query=900,
            num_classes=10,
            in_channels=256,
            sync_cls_avg_factor=True,
            with_box_refine=True,
            as_two_stage=False,
            transformer=dict(
                type='Detr3DTransformer',
                decoder=dict(
                    type='Detr3DTransformerDecoder',
                    num_layers=6,
                    return_intermediate=True,
                    transformerlayers=dict(
                        type='DetrTransformerDecoderLayer',
                        attn_cfgs=[
                            dict(
                                type='MultiheadAttention',
                                embed_dims=256,
                                num_heads=8,
                                dropout=0.1),
                            dict(
                                type='Detr3DCrossAtten',
                                pc_range=point_cloud_range,
                                num_points=1,
                                embed_dims=256)
                        ],
                        feedforward_channels=512,
                        ffn_dropout=0.1,
                        operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
                                         'ffn', 'norm')))),
            bbox_coder=dict(
                type='NMSFreeCoder',
                post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
                pc_range=point_cloud_range,
                max_num=300,
                voxel_size=voxel_size,
                num_classes=10),
            positional_encoding=dict(
                type='SinePositionalEncoding',
                num_feats=128,
                normalize=True,
                offset=-0.5),
            loss_cls=dict(
                type='FocalLoss',
                use_sigmoid=True,
                gamma=2.0,
                alpha=0.25,
                loss_weight=2.0),
            loss_bbox=dict(type='L1Loss', loss_weight=0.25),
            loss_iou=dict(type='GIoULoss', loss_weight=0.0)),
        # model training and testing settings
        train_cfg=dict(pts=dict(
            grid_size=[512, 512, 1],
            voxel_size=voxel_size,
            point_cloud_range=point_cloud_range,
            out_size_factor=4,
            assigner=dict(
                type='HungarianAssigner3D',
                cls_cost=dict(type='FocalLossCost', weight=2.0),
                reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
                iou_cost=dict(type='IoUCost', weight=0.0),  # Fake cost. This is just to make it compatible with DETR head.
                pc_range=point_cloud_range))))

    dataset_type = 'NuScenesDataset'
    data_root = 'data/nuscenes/'
    ...
    train_pipeline = [
        dict(type='LoadMultiViewImageFromFiles', to_float32=True),
        dict(type='PhotoMetricDistortionMultiViewImage'),
        dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False),
        dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
        dict(type='ObjectNameFilter', classes=class_names),
        dict(type='NormalizeMultiviewImage', **img_norm_cfg),
        dict(type='PadMultiViewImage', size_divisor=32),
        dict(type='DefaultFormatBundle3D', class_names=class_names),
        dict(type='Collect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img'])
    ]
    test_pipeline = [
        dict(type='LoadMultiViewImageFromFiles', to_float32=True),
        dict(type='NormalizeMultiviewImage', **img_norm_cfg),
        dict(type='PadMultiViewImage', size_divisor=32),
        dict(
            type='MultiScaleFlipAug3D',
            img_scale=(1333, 800),
            pts_scale_ratio=1,
            flip=False,
            transforms=[
                dict(
                    type='DefaultFormatBundle3D',
                    class_names=class_names,
                    with_label=False),
                dict(type='Collect3D', keys=['img'])
            ])
    ]

    So if you want to run inference with the DETR3D integrated in mmdetection3D, the program above needs to be modified. It is best to first compare the config files and the related code to sort out the differences and then change the code accordingly; for example, PadMultiViewImage does not exist in mmdetection3D. A rough sketch of how the preprocessing responsibility shifts is given below.
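    The following is only a sketch, assuming the mmdetection3D DETR3D project config shown above: there, normalization and padding are configured on the model via Det3DDataPreprocessor rather than done by standalone pipeline transforms, so the manual NormalizeMultiviewImage/PadMultiViewImage calls in image_callback would be dropped. The exact keys and the inference API of your mmdetection3D version should be verified before relying on this.

    # Sketch only (assumption, not verified against a specific mmdetection3D version):
    # in the mmdetection3D DETR3D config, image normalization and padding belong to
    # the model's data preprocessor instead of pipeline transforms, e.g.
    img_norm_cfg = dict(
        mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], bgr_to_rgb=False)
    data_preprocessor = dict(
        type='Det3DDataPreprocessor', **img_norm_cfg, pad_size_divisor=32)
    # With this in the model config, the inference code would hand the raw (unnormalized,
    # unpadded) multi-view images plus the same calibration-derived meta information to the
    # model, and Det3DDataPreprocessor would do what NormalizeMultiviewImage and
    # PadMultiViewImage do in the code above.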

  • Original article: https://blog.csdn.net/XCCCCZ/article/details/133501513