代码地址: github.com/xingyizhou/CenterTrack
数据处理部分
这里以MOT17数据为例,MOT类继承了GenericDataset类。直接看GenericDataset中的getitem函数,首先是从self._load_data中索引出img和anno信息。基本上都是调用coco的一些函数。
def __getitem__(self, index):
    # Dataset entry point; the rest of this method's body is shown
    # piecewise further below in the article.
    opt = self.opt
    # Load the current frame's image and its COCO-style annotations.
    img, anns, img_info, img_path = self._load_data(index)
def _load_data(self, index):
coco = self.coco
img_dir = self.img_dir
img_id = self.images[index]
img, anns, img_info, img_path = self._load_image_anns(img_id, coco, img_dir)
return img, anns, img_info, img_path
def _load_image_anns(self, img_id, coco, img_dir):
    """Load one image plus a deep copy of its annotations via the COCO API.

    Args:
        img_id: COCO image id.
        coco: a ``pycocotools.coco.COCO`` handle.
        img_dir: root directory that the per-image ``file_name`` is
            relative to.

    Returns:
        (img, anns, img_info, img_path); ``img`` is whatever
        ``cv2.imread`` returns (``None`` when the file is missing).
    """
    info = coco.loadImgs(ids=[img_id])[0]
    path = os.path.join(img_dir, info['file_name'])
    # Deep-copy so later augmentation can mutate the boxes without
    # touching the COCO object's cached annotation dicts.
    annotations = copy.deepcopy(coco.loadAnns(ids=coco.getAnnIds(imgIds=[img_id])))
    return cv2.imread(path), annotations, info, path
假设index=0, 打印一些结果看看:
- img_info记录了整体图片信息,包括id,图片路径,cur/pre/next帧的frame_id,以及video_id
- anns记录了图片内单个行人的信息,包括id,category_id,img_id, track_id和bbox
In [1]: img.shape
Out[1]: (1080, 1920, 3)
In [2]: img_info
Out[2]:
{'file_name': 'MOT17-02-FRCNN/img1/000001.jpg',
'id': 1,
'frame_id': 1,
'prev_image_id': -1,
'next_image_id': 2,
'video_id': 1}
In [3]: len(anns)
Out[3]: 16
In [4]: anns[0]
Out[4]:
{'id': 601,
'category_id': 1,
'image_id': 1,
'track_id': 2,
'bbox': [1338.0, 418.0, 167.0, 379.0],
'conf': 1.0}
下面这部分主要是进行random_crop_resize的augment,和centernet中相同。
# --- __getitem__ (continued): geometric augmentation, same as CenterNet ---
# Crop centre `c` and reference scale `s` (longest side, or both sides
# separately when not_max_crop is set).
height, width = img.shape[0], img.shape[1]
c = np.array([img.shape[1] / 2., img.shape[0] / 2.], dtype=np.float32)
s = max(img.shape[0], img.shape[1]) * 1.0 if not self.opt.not_max_crop \
    else np.array([img.shape[1], img.shape[0]], np.float32)
aug_s, rot, flipped = 1, 0, 0
if self.split == 'train':
    # Random crop/scale (and possibly rotation) parameters.
    c, aug_s, rot = self._get_aug_param(c, s, width, height)
    s = s * aug_s
    if np.random.random() < opt.flip:
        # Horizontal flip of both the image and its annotations.
        flipped = 1
        img = img[:, ::-1, :]
        anns = self._flip_anns(anns, width)
# Affine transforms from original image coordinates to the network
# input resolution and to the (downsampled) output resolution.
trans_input = get_affine_transform(
    c, s, rot, [opt.input_w, opt.input_h])
trans_output = get_affine_transform(
    c, s, rot, [opt.output_w, opt.output_h])
inp = self._get_input(img, trans_input)
ret = {'image': inp}
gt_det = {'bboxes': [], 'scores': [], 'clses': [], 'cts': []}
根据video_id和frame_id得到前一帧数据,调用_load_pre_data函数:
- 训练时,先找出同video_id的图片,然后根据frame_id(frame_dist不超过3)随机挑选img作为前一帧,记录id以及frame_id(这两个似乎是一样的)
- 测试时,使用真实前一帧图片
- 根据img_id来load对应的img和anns
# --- __getitem__ (continued): fetch a "previous" frame when tracking ---
pre_cts, track_ids = None, None
if opt.tracking:
    # Same video (and same sensor, when annotated); see _load_pre_data.
    pre_image, pre_anns, frame_dist = self._load_pre_data(
        img_info['video_id'], img_info['frame_id'],
        img_info['sensor_id'] if 'sensor_id' in img_info else 1)
def _load_pre_data(self, video_id, frame_id, sensor_id=1):
# 先找出同video_id的所有图片
img_infos = self.video_to_images[video_id]
# 训练阶段, 从附近帧中随机挑选一张作为"前一帧"
# 测试阶段, 获取真实的前一帧
if 'train' in self.split:
img_ids = [(img_info['id'], img_info['frame_id']) \
for img_info in img_infos \
if abs(img_info['frame_id'] - frame_id) < self.opt.max_frame_dist and \
(not ('sensor_id' in img_info) or img_info['sensor_id'] == sensor_id)]
else:
img_ids = [(img_info['id'], img_info['frame_id']) \
for img_info in img_infos \
if (img_info['frame_id'] - frame_id) == -1 and \
(not ('sensor_id' in img_info) or img_info['sensor_id'] == sensor_id)]
if len(img_ids) == 0:
img_ids = [(img_info['id'], img_info['frame_id']) \
for img_info in img_infos \
if (img_info['frame_id'] - frame_id) == 0 and \
(not ('sensor_id' in img_info) or img_info['sensor_id'] == sensor_id)]
rand_id = np.random.choice(len(img_ids))
img_id, pre_frame_id = img_ids[rand_id]
frame_dist = abs(frame_id - pre_frame_id)
# 根据上面选出的img_id来load对应的img和anns
img, anns, _, _ = self._load_image_anns(img_id, self.coco, self.img_dir)
return img, anns, frame_dist
对pre_img做同样的augment
# --- __getitem__ (continued): apply augmentation to the previous frame ---
if flipped:
    # Mirror the previous frame the same way the current frame was.
    pre_image = pre_image[:, ::-1, :].copy()
    pre_anns = self._flip_anns(pre_anns, width)
if opt.same_aug_pre and frame_dist != 0:
    # Reuse the current frame's transforms for the previous frame.
    trans_input_pre = trans_input
    trans_output_pre = trans_output
else:
    # Otherwise draw slightly disturbed augmentation parameters.
    c_pre, aug_s_pre, _ = self._get_aug_param(
        c, s, width, height, disturb=True)
    s_pre = s * aug_s_pre
    trans_input_pre = get_affine_transform(
        c_pre, s_pre, rot, [opt.input_w, opt.input_h])
    trans_output_pre = get_affine_transform(
        c_pre, s_pre, rot, [opt.output_w, opt.output_h])
pre_img = self._get_input(pre_image, trans_input_pre)
计算pre_img的dets,包括前一帧的heatmap,center以及track_id:
- 获取pre_hm:通过gaussian_radius函数计算高斯半径,并用draw_umich_gaussian函数将高斯分布绘制到heatmap上
- 获取pre_cts和track_ids
之后计算cur_img的ret和dets。
# Previous-frame detections: noisy heatmap, centres and track ids.
pre_hm, pre_cts, track_ids = self._get_pre_dets(
    pre_anns, trans_input_pre, trans_output_pre)
ret['pre_img'] = pre_img  # (3, input_h, input_w), e.g. 3*544*960
if opt.pre_hm:
    ret['pre_hm'] = pre_hm  # (1, input_h, input_w) -- single channel, e.g. 1*544*960
def _get_pre_dets(self, anns, trans_input, trans_output):
    """Build the previous-frame heatmap and detection centres.

    For every valid annotation of the previous frame, project its box
    to input resolution, jitter its centre (simulating an imperfect
    tracker/detector) and optionally splat a Gaussian peak onto a
    single-channel heatmap.

    Args:
        anns: COCO-style annotation dicts of the previous frame.
        trans_input: affine transform to input resolution (heatmap space).
        trans_output: affine transform to output resolution (unused here;
            kept for signature compatibility with the caller).

    Returns:
        (pre_hm, pre_cts, track_ids):
        pre_hm    -- (1, input_h, input_w) float32 heatmap, or None when
                     ``opt.pre_hm`` is off;
        pre_cts   -- per-object centres divided by ``down_ratio``
                     (jittered centre when the peak is kept, true centre
                     when the peak was "lost");
        track_ids -- track id per centre (-1 when absent).
    """
    hm_h, hm_w = self.opt.input_h, self.opt.input_w
    down_ratio = self.opt.down_ratio
    trans = trans_input
    # Fixed misspelled local name (was `reutrn_hm`).
    return_hm = self.opt.pre_hm
    pre_hm = np.zeros((1, hm_h, hm_w), dtype=np.float32) if return_hm else None
    pre_cts, track_ids = [], []
    for ann in anns:
        cls_id = int(self.cat_ids[ann['category_id']])
        # Skip classes outside the model's range and crowd regions.
        if cls_id > self.opt.num_classes or cls_id <= -99 or \
           ('iscrowd' in ann and ann['iscrowd'] > 0):
            continue
        bbox = self._coco_box_to_bbox(ann['bbox'])
        bbox[:2] = affine_transform(bbox[:2], trans)
        bbox[2:] = affine_transform(bbox[2:], trans)
        bbox[[0, 2]] = np.clip(bbox[[0, 2]], 0, hm_w - 1)
        bbox[[1, 3]] = np.clip(bbox[[1, 3]], 0, hm_h - 1)
        h, w = bbox[3] - bbox[1], bbox[2] - bbox[0]
        if (h > 0 and w > 0):
            # Minimal Gaussian radius from the closed-form quadratic
            # bound (as in CornerNet/CenterNet).
            radius = gaussian_radius((math.ceil(h), math.ceil(w)))
            radius = max(0, int(radius))
            ct = np.array(
                [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2], dtype=np.float32)
            ct0 = ct.copy()
            # Jitter the centre to mimic noisy previous-frame detections.
            ct[0] = ct[0] + np.random.randn() * self.opt.hm_disturb * w
            ct[1] = ct[1] + np.random.randn() * self.opt.hm_disturb * h
            # With probability ``lost_disturb`` drop the peak (conf = 0).
            conf = 1 if np.random.random() > self.opt.lost_disturb else 0
            ct_int = ct.astype(np.int32)
            if conf == 0:
                pre_cts.append(ct / down_ratio)
            else:
                pre_cts.append(ct0 / down_ratio)
            track_ids.append(ann['track_id'] if 'track_id' in ann else -1)
            if return_hm:
                # Splat the (possibly zero-strength) Gaussian peak.
                draw_umich_gaussian(pre_hm[0], ct_int, radius, k=conf)
            # Occasionally add a spurious false-positive peak near the
            # object to make the network robust to tracker mistakes.
            if np.random.random() < self.opt.fp_disturb and return_hm:
                ct2 = ct0.copy()
                # Hard code heatmap disturb ratio, haven't tried other numbers.
                ct2[0] = ct2[0] + np.random.randn() * 0.05 * w
                ct2[1] = ct2[1] + np.random.randn() * 0.05 * h
                ct2_int = ct2.astype(np.int32)
                draw_umich_gaussian(pre_hm[0], ct2_int, radius, k=conf)
    return pre_hm, pre_cts, track_ids
初始化ret, gt_det
# init samples: pre-allocate every ground-truth tensor in ret / gt_det
self._init_ret(ret, gt_det)
def _init_ret(self, ret, gt_det):
max_objs = self.max_objs * self.opt.dense_reg # 256
# heatmap 1*136*240
ret['hm'] = np.zeros(
(self.opt.num_classes, self.opt.output_h, self.opt.output_w),
np.float32)
# hm拉平的index 256
ret['ind'] = np.zeros((max_objs), dtype=np.int64)
# 类别 256
ret['cat'] = np.zeros((max_objs), dtype=np.int64)
# mask 256
ret['mask'] = np.zeros((max_objs), dtype=np.float32)
regression_head_dims = {
'reg': 2, 'wh': 2, 'tracking': 2, 'ltrb': 4, 'ltrb_amodal': 4,
'nuscenes_att': 8, 'velocity': 3, 'hps': self.num_joints * 2,
'dep': 1, 'dim': 3, 'amodel_offset': 2}
# {'hm': 1, 'reg': 2, 'wh': 2, 'tracking': 2, 'ltrb_amodal': 4}
for head in regression_head_dims:
if head in self.opt.heads:
ret[head] = np.zeros(
(max_objs, regression_head_dims[head]), dtype=np.float32)
ret[head + '_mask'] = np.zeros(
(max_objs, regression_head_dims[head]), dtype=np.float32)
gt_det[head] = []
初始化后打印:
In [1]: ret.keys()
Out[1]: dict_keys(['image', 'pre_img', 'pre_hm', 'hm', 'ind', 'cat', 'mask', 'reg', 'reg_mask',
'wh', 'wh_mask', 'tracking', 'tracking_mask', 'ltrb_amodal', 'ltrb_amodal_mask'])
In [2]: gt_det.keys()
Out[2]: dict_keys(['bboxes', 'scores', 'clses', 'cts', 'reg', 'wh', 'tracking', 'ltrb_amodal'])
获取calib (3*4),是相机变换矩阵??
calib = self._get_calib(img_info, width, height)
def _get_calib(self, img_info, width, height):
if 'calib' in img_info:
calib = np.array(img_info['calib'], dtype=np.float32)
else:
calib = np.array([[self.rest_focal_length, 0, width / 2, 0],
[0, self.rest_focal_length, height / 2, 0],
[0, 0, 1, 0]])
return calib
对于img里的每个行人,首先获取trans_output变换之后的bbox和bbox_amodal(bbox_amodal只是变换后下采样4倍。bbox基于bbox_amodal做了边界擦除,最大边界output_w/h)。
# --- __getitem__ (continued): build training targets for each object ---
num_objs = min(len(anns), self.max_objs)
for k in range(num_objs):
    ann = anns[k]
    cls_id = int(self.cat_ids[ann['category_id']])
    if cls_id > self.opt.num_classes or cls_id <= -999:
        continue
    # Box in output coordinates: `bbox` is clipped to the output canvas,
    # `bbox_amodal` keeps the full (possibly out-of-view) extent.
    bbox, bbox_amodal = self._get_bbox_output(
        ann['bbox'], trans_output, height, width)
    if cls_id <= 0 or ('iscrowd' in ann and ann['iscrowd'] > 0):
        # Ignore regions / crowds: mask them out of the loss instead of
        # generating targets.
        self._mask_ignore_or_crowd(ret, cls_id, bbox)
        continue
    self._add_instance(
        ret, gt_det, k, cls_id, bbox, bbox_amodal, ann, trans_output, aug_s,
        calib, pre_cts, track_ids)
return ret
对图片中的每个行人,生成对应的标签信息
def _add_instance(
    self, ret, gt_det, k, cls_id, bbox, bbox_amodal, ann, trans_output,
    aug_s, calib, pre_cts=None, track_ids=None):
    """Write the ground-truth targets of one object into slot *k* of
    *ret* and append the matching entries to *gt_det*.

    Degenerate boxes (non-positive width or height) are skipped.
    """
    box_h = bbox[3] - bbox[1]
    box_w = bbox[2] - bbox[0]
    if box_h <= 0 or box_w <= 0:
        return
    # Minimal Gaussian radius for a box of this size.
    radius = max(0, int(gaussian_radius((math.ceil(box_h), math.ceil(box_w)))))
    # Object centre in output coordinates, and its integer cell.
    center = np.array(
        [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2], dtype=np.float32)
    center_int = center.astype(np.int32)
    ret['cat'][k] = cls_id - 1          # zero-based class of object k
    ret['mask'][k] = 1                  # slot k now holds a real object
    if 'wh' in ret:
        # Width / height regression target.
        ret['wh'][k] = 1. * box_w, 1. * box_h
        ret['wh_mask'][k] = 1
    # Index of the centre cell in the flattened heatmap.
    ret['ind'][k] = center_int[1] * self.opt.output_w + center_int[0]
    # Sub-pixel offset lost by the integer cast.
    ret['reg'][k] = center - center_int
    ret['reg_mask'][k] = 1
    # Splat the Gaussian peak for this class.
    draw_umich_gaussian(ret['hm'][cls_id - 1], center_int, radius)
    # A centre-derived box is slightly more precise than the raw bbox.
    gt_det['bboxes'].append(np.array(
        [center[0] - box_w / 2, center[1] - box_h / 2,
         center[0] + box_w / 2, center[1] + box_h / 2], dtype=np.float32))
    gt_det['scores'].append(1)
    gt_det['clses'].append(cls_id - 1)
    gt_det['cts'].append(center)
    if 'tracking' in self.opt.heads:
        # If the object also exists in the previous frame, regress the
        # offset from its previous centre; otherwise store a zero target.
        if ann['track_id'] in track_ids:
            prev_center = pre_cts[track_ids.index(ann['track_id'])]
            ret['tracking_mask'][k] = 1
            ret['tracking'][k] = prev_center - center_int
            gt_det['tracking'].append(ret['tracking'][k])
        else:
            gt_det['tracking'].append(np.zeros(2, np.float32))
    if 'ltrb_amodal' in self.opt.heads:
        # Amodal box expressed relative to the integer centre cell.
        ret['ltrb_amodal'][k] = \
            bbox_amodal[0] - center_int[0], bbox_amodal[1] - center_int[1], \
            bbox_amodal[2] - center_int[0], bbox_amodal[3] - center_int[1]
        ret['ltrb_amodal_mask'][k] = 1
        gt_det['ltrb_amodal'].append(bbox_amodal)
总结一下ret和gt_det都存了哪些信息:
- ret
image # (3, 544, 960) 输入图片
pre_img # (3, 544, 960) 前一帧图片
pre_hm # (1, 544, 960) 前一帧图片hm(为什么没有4倍下采样)
hm # (1, 136, 240) 当前图片hm
ind # (256,) hm拉平的index
cat # (256,) 类别
mask # (256,) 是否有行人的标志
reg # (256, 2) 中心点偏差
reg_mask # (256, 2)
wh # (256, 2) 宽高
wh_mask # (256, 2)
tracking # (256, 2) 前后两帧中心点偏移量
tracking_mask # (256, 2)
ltrb_amodal # (256, 4) bbox_amodal减去ct_int,得到以原点为中心点的bbox坐标
ltrb_amodal_mask # (256, 4)
- gt_det
bboxes # list n*4, 每个行人bbox坐标
scores # list n*1, 全为1
clses # list n*1, 全为0
cts # list n*2, 每个行人center坐标
reg # 空
wh # 空
tracking # list n*2, 每个行人前后两帧center偏移量
ltrb_amodal # list n*4, 每个行人bbox_amodal坐标