代码地址: github.com/xingyizhou/CenterTrack
数据处理部分
这里以MOT17数据为例,MOT类继承了GenericDataset类。直接看GenericDataset中的getitem函数,首先是从self._load_data中索引出img和anno信息。基本上都是调用coco的一些函数。
def __getitem__(self, index):
    # Dataset entry point; the rest of this method's body is shown
    # piecewise further below in the article.
    opt = self.opt
    # Load the current frame's image and its COCO-style annotations.
    img, anns, img_info, img_path = self._load_data(index)
def _load_data(self, index):
coco = self.coco
img_dir = self.img_dir
img_id = self.images[index]
img, anns, img_info, img_path = self._load_image_anns(img_id, coco, img_dir)
return img, anns, img_info, img_path
def _load_image_anns(self, img_id, coco, img_dir):
    """Load one image plus a deep copy of its annotations via the COCO API.

    Args:
        img_id: COCO image id.
        coco: a ``pycocotools.coco.COCO`` handle.
        img_dir: root directory that the per-image ``file_name`` is
            relative to.

    Returns:
        (img, anns, img_info, img_path); ``img`` is whatever
        ``cv2.imread`` returns (``None`` when the file is missing).
    """
    info = coco.loadImgs(ids=[img_id])[0]
    path = os.path.join(img_dir, info['file_name'])
    # Deep-copy so later augmentation can mutate the boxes without
    # touching the COCO object's cached annotation dicts.
    annotations = copy.deepcopy(coco.loadAnns(ids=coco.getAnnIds(imgIds=[img_id])))
    return cv2.imread(path), annotations, info, path
假设index=0, 打印一些结果看看:
- img_info记录了整体图片信息,包括id,图片路径,cur/pre/next帧的frame_id,以及video_id
- anns记录了图片内单个行人的信息,包括id,category_id,img_id, track_id和bbox
In [1]: img.shape
Out[1]: (1080, 1920, 3)
In [2]: img_info
Out[2]:
{'file_name': 'MOT17-02-FRCNN/img1/000001.jpg',
'id': 1,
'frame_id': 1,
'prev_image_id': -1,
'next_image_id': 2,
'video_id': 1}
In [3]: len(anns)
Out[3]: 16
In [4]: anns[0]
Out[4]:
{'id': 601,
'category_id': 1,
'image_id': 1,
'track_id': 2,
'bbox': [1338.0, 418.0, 167.0, 379.0],
'conf': 1.0}
下面这部分主要是进行random_crop_resize的augment,和centernet中相同。
# --- __getitem__ (continued): geometric augmentation, same as CenterNet ---
# Crop centre `c` and reference scale `s` (longest side, or both sides
# separately when not_max_crop is set).
height, width = img.shape[0], img.shape[1]
c = np.array([img.shape[1] / 2., img.shape[0] / 2.], dtype=np.float32)
s = max(img.shape[0], img.shape[1]) * 1.0 if not self.opt.not_max_crop \
    else np.array([img.shape[1], img.shape[0]], np.float32)
aug_s, rot, flipped = 1, 0, 0
if self.split == 'train':
    # Random crop/scale (and possibly rotation) parameters.
    c, aug_s, rot = self._get_aug_param(c, s, width, height)
    s = s * aug_s
    if np.random.random() < opt.flip:
        # Horizontal flip of both the image and its annotations.
        flipped = 1
        img = img[:, ::-1, :]
        anns = self._flip_anns(anns, width)
# Affine transforms from original image coordinates to the network
# input resolution and to the (downsampled) output resolution.
trans_input = get_affine_transform(
    c, s, rot, [opt.input_w, opt.input_h])
trans_output = get_affine_transform(
    c, s, rot, [opt.output_w, opt.output_h])
inp = self._get_input(img, trans_input)
ret = {'image': inp}
gt_det = {'bboxes': [], 'scores': [], 'clses': [], 'cts': []}
根据video_id和frame_id得到前一帧数据,调用_load_pre_data函数:
- 训练时,先找出同video_id的图片,然后根据frame_id(frame_dist不超过3)随机挑选img作为前一帧,记录id以及frame_id(这两个似乎是一样的)
- 测试时,使用真实前一帧图片
- 根据img_id来load对应的img和anns
# --- __getitem__ (continued): fetch a "previous" frame when tracking ---
pre_cts, track_ids = None, None
if opt.tracking:
    # Same video (and same sensor, when annotated); see _load_pre_data.
    pre_image, pre_anns, frame_dist = self._load_pre_data(
        img_info['video_id'], img_info['frame_id'],
        img_info['sensor_id'] if 'sensor_id' in img_info else 1)
def _load_pre_data(self, video_id, frame_id, sensor_id=1):
# 先找出同video_id的所有图片
img_infos = self.video_to_images[video_id]
# 训练阶段, 从附近帧中随机挑选一张作为"前一帧"
# 测试阶段, 获取真实的前一帧
if 'train' in self.split:
img_ids = [(img_info['id'], img_info['frame_id']) \
for img_info in img_infos \
if abs(img_info['frame_id'] - frame_id) < self.opt.max_frame_dist and \
(not ('sensor_id' in img_info) or img_info['sensor_id'] == sensor_id)]
else:
img_ids = [(img_info['id'], img_info['frame_id']) \
for img_info in img_infos \
if (img_info['frame_id'] - frame_id) == -1 and \
(not ('sensor_id' in img_info) or img_info['sensor_id'] == sensor_id)]
if len(img_ids) == 0:
img_ids = [(img_info['id'], img_info['frame_id']) \
for img_info in img_infos \
if (img_info['frame_id'] - frame_id) == 0 and \
(not ('sensor_id' in img_info) or img_info['sensor_id'] == sensor_id)]
rand_id = np.random.choice(len(img_ids))
img_id, pre_frame_id = img_ids[rand_id]
frame_dist = abs(frame_id - pre_frame_id)
# 根据上面选出的img_id来load对应的img和anns
img, anns, _, _ = self._load_image_anns(img_id, self.coco, self.img_dir)
return img, anns, frame_dist
对pre_img做同样的augment
# --- __getitem__ (continued): apply augmentation to the previous frame ---
if flipped:
    # Mirror the previous frame the same way the current frame was.
    pre_image = pre_image[:, ::-1, :].copy()
    pre_anns = self._flip_anns(pre_anns, width)
if opt.same_aug_pre and frame_dist != 0:
    # Reuse the current frame's transforms for the previous frame.
    trans_input_pre = trans_input
    trans_output_pre = trans_output
else:
    # Otherwise draw slightly disturbed augmentation parameters.
    c_pre, aug_s_pre, _ = self._get_aug_param(
        c, s, width, height, disturb=True)
    s_pre = s * aug_s_pre
    trans_input_pre = get_affine_transform(
        c_pre, s_pre, rot, [opt.input_w, opt.input_h])
    trans_output_pre = get_affine_transform(
        c_pre, s_pre, rot, [opt.output_w, opt.output_h])
pre_img = self._get_input(pre_image, trans_input_pre)
计算pre_img的dets,包括前一帧的heatmap,center以及track_id:
- 获取pre_hm:通过gaussian_radius函数计算高斯半径,并用draw_umich_gaussian函数将高斯分布绘制到heatmap上
- 获取pre_cts和track_ids
之后计算cur_img的ret和dets。
# Previous-frame detections: noisy heatmap, centres and track ids.
pre_hm, pre_cts, track_ids = self._get_pre_dets(
    pre_anns, trans_input_pre, trans_output_pre)
ret['pre_img'] = pre_img  # (3, input_h, input_w), e.g. 3*544*960
if opt.pre_hm:
    ret['pre_hm'] = pre_hm  # (1, input_h, input_w) -- single channel, e.g. 1*544*960
def _get_pre_dets(self, anns, trans_input, trans_output):
    """Build the previous-frame heatmap and detection centres.

    For every valid annotation of the previous frame, project its box
    to input resolution, jitter its centre (simulating an imperfect
    tracker/detector) and optionally splat a Gaussian peak onto a
    single-channel heatmap.

    Args:
        anns: COCO-style annotation dicts of the previous frame.
        trans_input: affine transform to input resolution (heatmap space).
        trans_output: affine transform to output resolution (unused here;
            kept for signature compatibility with the caller).

    Returns:
        (pre_hm, pre_cts, track_ids):
        pre_hm    -- (1, input_h, input_w) float32 heatmap, or None when
                     ``opt.pre_hm`` is off;
        pre_cts   -- per-object centres divided by ``down_ratio``
                     (jittered centre when the peak is kept, true centre
                     when the peak was "lost");
        track_ids -- track id per centre (-1 when absent).
    """
    hm_h, hm_w = self.opt.input_h, self.opt.input_w
    down_ratio = self.opt.down_ratio
    trans = trans_input
    # Fixed misspelled local name (was `reutrn_hm`).
    return_hm = self.opt.pre_hm
    pre_hm = np.zeros((1, hm_h, hm_w), dtype=np.float32) if return_hm else None
    pre_cts, track_ids = [], []
    for ann in anns:
        cls_id = int(self.cat_ids[ann['category_id']])
        # Skip classes outside the model's range and crowd regions.
        if cls_id > self.opt.num_classes or cls_id <= -99 or \
           ('iscrowd' in ann and ann['iscrowd'] > 0):
            continue
        bbox = self._coco_box_to_bbox(ann['bbox'])
        bbox[:2] = affine_transform(bbox[:2], trans)
        bbox[2:] = affine_transform(bbox[2:], trans)
        bbox[[0, 2]] = np.clip(bbox[[0, 2]], 0, hm_w - 1)
        bbox[[1, 3]] = np.clip(bbox[[1, 3]], 0, hm_h - 1)
        h, w = bbox[3] - bbox[1], bbox[2] - bbox[0]
        if (h > 0 and w > 0):
            # Minimal Gaussian radius from the closed-form quadratic
            # bound (as in CornerNet/CenterNet).
            radius = gaussian_radius((math.ceil(h), math.ceil(w)))
            radius = max(0, int(radius))
            ct = np.array(
                [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2], dtype=np.float32)
            ct0 = ct.copy()
            # Jitter the centre to mimic noisy previous-frame detections.
            ct[0] = ct[0] + np.random.randn() * self.opt.hm_disturb * w
            ct[1] = ct[1] + np.random.randn() * self.opt.hm_disturb * h
            # With probability ``lost_disturb`` drop the peak (conf = 0).
            conf = 1 if np.random.random() > self.opt.lost_disturb else 0
            ct_int = ct.astype(np.int32)
            if conf == 0:
                pre_cts.append(ct / down_ratio)
            else:
                pre_cts.append(ct0 / down_ratio)
            track_ids.append(ann['track_id'] if 'track_id' in ann else -1)
            if return_hm:
                # Splat the (possibly zero-strength) Gaussian peak.
                draw_umich_gaussian(pre_hm[0], ct_int, radius, k=conf)
            # Occasionally add a spurious false-positive peak near the
            # object to make the network robust to tracker mistakes.
            if np.random.random() < self.opt.fp_disturb and return_hm:
                ct2 = ct0.copy()
                # Hard code heatmap disturb ratio, haven't tried other numbers.
                ct2[0] = ct2[0] + np.random.randn() * 0.05 * w
                ct2[1] = ct2[1] + np.random.randn() * 0.05 * h
                ct2_int = ct2.astype(np.int32)
                draw_umich_gaussian(pre_hm[0], ct2_int, radius, k=conf)
    return pre_hm, pre_cts, track_ids
初始化ret, gt_det
# init samples: pre-allocate every ground-truth tensor in ret / gt_det
self._init_ret(ret, gt_det)
def _init_ret(self, ret, gt_det):
max_objs = self.max_objs * self.opt.dense_reg # 256
# heatmap 1*136*240
ret['hm'] = np.zeros(
(self.opt.num_classes, self.opt.output_h, self.opt.output_w),
np.float32)
# hm拉平的index 256
ret['ind'] = np.zeros((max_objs), dtype=np.int64)
# 类别 256
ret['cat'] = np.zeros((max_objs), dtype=np.int64)
# mask 256
ret['mask'] = np.zeros((max_objs), dtype=np.float32)
regression_head_dims = {
'reg': 2, 'wh': 2, 'tracking': 2, 'ltrb': 4, 'ltrb_amodal': 4,
'nuscenes_att': 8, 'velocity': 3, 'hps': self.num_joints * 2,
'dep': 1, 'dim': 3, 'amodel_offset': 2}
# {'hm': 1, 'reg': 2, 'wh': 2, 'tracking': 2, 'ltrb_amodal': 4}
for head in regression_head_dims:
if head in self.opt.heads:
ret[head] = np.zeros(
(max_objs, regression_head_dims[head]), dtype=np.float32)
ret[head + '_mask'] = np.zeros(
(max_objs, regression_head_dims[head]), dtype=np.float32)
gt_det[head] = []
初始化后打印:
In [1]: ret.keys()
Out[1]: dict_keys(['image', 'pre_img', 'pre_hm', 'hm', 'ind', 'cat', 'mask', 'reg', 'reg_mask',
'wh', 'wh_mask', 'tracking', 'tracking_mask', 'ltrb_amodal', 'ltrb_amodal_mask'])
In [2]: gt_det.keys()
Out[2]: dict_keys(['bboxes', 'scores', 'clses', 'cts', 'reg', 'wh', 'tracking', 'ltrb_amodal'])
获取calib (3*4),是相机变换矩阵??
calib = self._get_calib(img_info, width, height)
def _get_calib(self, img_info, width, height):
if 'calib' in img_info:
calib = np.array(img_info['calib'], dtype=np.float32)
else:
calib = np.array([[self.rest_focal_length, 0, width / 2, 0],
[0, self.rest_focal_length, height / 2, 0],
[0, 0, 1, 0]])
return calib
对于img里的每个行人,首先获取trans_output变换之后的bbox和bbox_amodal(bbox_amodal只是变换后下采样4倍。bbox基于bbox_amodal做了边界擦除,最大边界output_w/h)。
# --- __getitem__ (continued): build training targets for each object ---
num_objs = min(len(anns), self.max_objs)
for k in range(num_objs):
    ann = anns[k]
    cls_id = int(self.cat_ids[ann['category_id']])
    if cls_id > self.opt.num_classes or cls_id <= -999:
        continue
    # Box in output coordinates: `bbox` is clipped to the output canvas,
    # `bbox_amodal` keeps the full (possibly out-of-view) extent.
    bbox, bbox_amodal = self._get_bbox_output(
        ann['bbox'], trans_output, height, width)
    if cls_id <= 0 or ('iscrowd' in ann and ann['iscrowd'] > 0):
        # Ignore regions / crowds: mask them out of the loss instead of
        # generating targets.
        self._mask_ignore_or_crowd(ret, cls_id, bbox)
        continue
    self._add_instance(
        ret, gt_det, k, cls_id, bbox, bbox_amodal, ann, trans_output, aug_s,
        calib, pre_cts, track_ids)
return ret
对图片中的每个行人,生成对应的标签信息
def _add_instance(
    self, ret, gt_det, k, cls_id, bbox, bbox_amodal, ann, trans_output,
    aug_s, calib, pre_cts=None, track_ids=None):
    """Write the ground-truth targets of one object into slot *k* of
    *ret* and append the matching entries to *gt_det*.

    Degenerate boxes (non-positive width or height) are skipped.
    """
    box_h = bbox[3] - bbox[1]
    box_w = bbox[2] - bbox[0]
    if box_h <= 0 or box_w <= 0:
        return
    # Minimal Gaussian radius for a box of this size.
    radius = max(0, int(gaussian_radius((math.ceil(box_h), math.ceil(box_w)))))
    # Object centre in output coordinates, and its integer cell.
    center = np.array(
        [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2], dtype=np.float32)
    center_int = center.astype(np.int32)
    ret['cat'][k] = cls_id - 1          # zero-based class of object k
    ret['mask'][k] = 1                  # slot k now holds a real object
    if 'wh' in ret:
        # Width / height regression target.
        ret['wh'][k] = 1. * box_w, 1. * box_h
        ret['wh_mask'][k] = 1
    # Index of the centre cell in the flattened heatmap.
    ret['ind'][k] = center_int[1] * self.opt.output_w + center_int[0]
    # Sub-pixel offset lost by the integer cast.
    ret['reg'][k] = center - center_int
    ret['reg_mask'][k] = 1
    # Splat the Gaussian peak for this class.
    draw_umich_gaussian(ret['hm'][cls_id - 1], center_int, radius)
    # A centre-derived box is slightly more precise than the raw bbox.
    gt_det['bboxes'].append(np.array(
        [center[0] - box_w / 2, center[1] - box_h / 2,
         center[0] + box_w / 2, center[1] + box_h / 2], dtype=np.float32))
    gt_det['scores'].append(1)
    gt_det['clses'].append(cls_id - 1)
    gt_det['cts'].append(center)
    if 'tracking' in self.opt.heads:
        # If the object also exists in the previous frame, regress the
        # offset from its previous centre; otherwise store a zero target.
        if ann['track_id'] in track_ids:
            prev_center = pre_cts[track_ids.index(ann['track_id'])]
            ret['tracking_mask'][k] = 1
            ret['tracking'][k] = prev_center - center_int
            gt_det['tracking'].append(ret['tracking'][k])
        else:
            gt_det['tracking'].append(np.zeros(2, np.float32))
    if 'ltrb_amodal' in self.opt.heads:
        # Amodal box expressed relative to the integer centre cell.
        ret['ltrb_amodal'][k] = \
            bbox_amodal[0] - center_int[0], bbox_amodal[1] - center_int[1], \
            bbox_amodal[2] - center_int[0], bbox_amodal[3] - center_int[1]
        ret['ltrb_amodal_mask'][k] = 1
        gt_det['ltrb_amodal'].append(bbox_amodal)
总结一下ret和gt_det都存了哪些信息:
- ret
image # (3, 544, 960) 输入图片
pre_img # (3, 544, 960) 前一帧图片
pre_hm # (1, 544, 960) 前一帧图片hm(为什么没有4倍下采样)
hm # (1, 136, 240) 当前图片hm
ind # (256,) hm拉平的index
cat # (256,) 类别
mask # (256,) 是否有行人的标志
reg # (256, 2) 中心点偏差
reg_mask # (256, 2)
wh # (256, 2) 宽高
wh_mask # (256, 2)
tracking # (256, 2) 前后两帧中心点偏移量
tracking_mask # (256, 2)
ltrb_amodal # (256, 4) bbox_amodal减去ct_int,得到以原点为中心点的bbox坐标
ltrb_amodal_mask # (256, 4)
- gt_det
bboxes # list n*4, 每个行人bbox坐标
scores # list n*1, 全为1
clses # list n*1, 全为0
cts # list n*2, 每个行人center坐标
reg # 空
wh # 空
tracking # list n*2, 每个行人前后两帧center偏移量
ltrb_amodal # list n*4, 每个行人bbox_amodal坐标