Reference: "1.2 Faster RCNN source-code walkthrough (PyTorch)" (bilibili.com)
Source code: vision/faster_rcnn.py at main · pytorch/vision (github.com)
Faster R-CNN architecture diagram (figure)
The parts shown in yellow exist only during training.
Overall flow: defining the model (create_model)
- 1 Initialize the backbone and its parameters.
- 2 Initialize the anchor_generator parameters.
- 3 Initialize roi_pooler: the RoI pooling layer that rescales every proposal region to a fixed size.
- 4 Initialize model = FasterRCNN(backbone, anchor_generator, roi_pooler, num_classes), the complete Faster R-CNN model. Inside FasterRCNN a series of components is initialized:
- - 4.1 Call AnchorsGenerator().num_anchors_per_location() (explained below) to compute the number of anchors predicted at each sliding-window position of each feature map, then initialize RPNHead().
- - 4.2 Initialize the full RPN (RegionProposalNetwork).
- - 4.3 Initialize TwoMLPHead(): the two fully connected layers in Fast R-CNN that process the flattened RoI pooling output.
- - 4.4 Initialize FastRCNNPredictor(): the prediction layers on top of the box_head output.
- - 4.5 Initialize GeneralizedRCNNTransform: data preprocessing.
- - 4.6 Initialize the inherited FasterRCNNBase: the overall model skeleton.
The forward pass starts from FasterRCNNBase.forward.
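Before diving into the source, here is a minimal sketch (not part of the original repo) of what the two return modes of FasterRCNNBase.forward look like from the caller's side; images and targets are assumed to already be a list of [C, H, W] tensors and a list of annotation dicts:

import torch

model = create_model(num_classes=4)      # the helper defined below

model.train()
loss_dict = model(images, targets)       # training mode -> dict of RPN + Fast R-CNN losses
total_loss = sum(loss for loss in loss_dict.values())

model.eval()
with torch.no_grad():
    detections = model(images)           # eval mode -> list of dicts with 'boxes', 'labels', 'scores'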
create_model()
def create_model(num_classes):
    # https://download.pytorch.org/models/vgg16-397923af.pth
    # To use VGG16 instead, download its pretrained weights, uncomment the lines below,
    # and comment out the two MobileNetV2 lines.
    # vgg_feature = vgg(model_name="vgg16", weights_path="./backbone/vgg16.pth").features
    # backbone = torch.nn.Sequential(*list(vgg_feature._modules.values())[:-1])  # drop the last MaxPool layer in features
    # backbone.out_channels = 512

    # https://download.pytorch.org/models/mobilenet_v2-b0353104.pth
    backbone = MobileNetV2(weights_path="./backbone/mobilenet_v2.pth").features
    backbone.out_channels = 1280  # number of channels of the backbone's output feature map

    anchor_generator = AnchorsGenerator(sizes=((32, 64, 128, 256, 512),),  # sizes is a tuple of tuples
                                        aspect_ratios=((0.5, 1.0, 2.0),))

    roi_pooler = torchvision.ops.MultiScaleRoIAlign(featmap_names=['0'],  # which feature maps RoI pooling runs on
                                                    output_size=[7, 7],   # output size of the pooled feature
                                                    sampling_ratio=2)     # sampling ratio

    model = FasterRCNN(backbone=backbone,
                       num_classes=num_classes,
                       rpn_anchor_generator=anchor_generator,
                       box_roi_pool=roi_pooler)
    return model
The AnchorsGenerator.num_anchors_per_location() method
def num_anchors_per_location(self):
    # number of anchors predicted at each sliding-window position of each feature map
    return [len(s) * len(a) for s, a in zip(self.sizes, self.aspect_ratios)]
    # i.e. the product of the number of anchor sizes (5) and the number of aspect ratios (3)
A quick personal test
sizes = (1, 2, 3, 4, 5)
aspect_ratios = (6, 7, 8)
for s, a in zip(sizes, aspect_ratios):
    print(len(s) * len(a))
# Raises: TypeError: object of type 'int' has no len()
# Debugging shows that zip pairs sizes and aspect_ratios element by element (up to the shorter tuple),
# so the loop iterates over (s=1, a=6), (2, 7), (3, 8).
# Therefore the self.sizes and self.aspect_ratios that are passed in must be tuples of tuples, i.e.:
sizes = ((1, 2, 3, 4, 5),)
aspect_ratios = ((6, 7, 8),)
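With the nested tuples above, zip now pairs the whole size tuple with the whole ratio tuple, so a corrected version of the test prints the expected 15 anchors per location:

sizes = ((1, 2, 3, 4, 5),)
aspect_ratios = ((6, 7, 8),)
print([len(s) * len(a) for s, a in zip(sizes, aspect_ratios)])  # [15]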
Creating the model
model = create_model(num_classes=4)  # print(model)
model.to(device)  # move the model to the chosen device (GPU)
Freezing all, or part of, the backbone weights
# Train the RPN and everything after it first, so freeze the pretrained backbone: parameters()
for param in model.backbone.parameters():  # freeze the backbone: no gradients are computed for it
    param.requires_grad = False
# Alternatively, freeze only the bottom layers of the backbone: named_parameters()
for name, parameter in model.backbone.named_parameters():
    split_name = name.split(".")[0]
    if split_name in ["0", "1", "2", "3"]:
        parameter.requires_grad = False
    else:
        parameter.requires_grad = True
Collecting the unfrozen parameters, passing them to the SGD optimizer, and setting up the learning-rate schedule
# define optimizer
params = [p for p in model.parameters() if p.requires_grad]  # collect only the parameters that still require gradients
optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)
# every step_size=3 scheduler steps (typically one per epoch), multiply the learning rate by gamma=0.33
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.33)
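For context, a minimal sketch of how the optimizer and scheduler are typically stepped per epoch; train_loader and num_epochs are assumptions, not part of the repo excerpt above:

for epoch in range(num_epochs):
    model.train()
    for images, targets in train_loader:          # assumed DataLoader yielding lists of images / target dicts
        loss_dict = model(images, targets)        # training mode returns the loss dict
        losses = sum(loss for loss in loss_dict.values())
        optimizer.zero_grad()
        losses.backward()
        optimizer.step()
    lr_scheduler.step()                           # lr is multiplied by 0.33 every 3 of these calls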
FasterRCNN_frame
FasterRCNN(FasterRCNNBase)
FasterRCNN mainly initializes a series of components and then relies on FasterRCNNBase for the forward pass. Initialization structure:
- 1 backbone: produces the feature maps
- 2 AnchorsGenerator: generates the anchor boxes for the feature maps
- 3 RPNHead: the RPN's sliding-window objectness / box-regression heads
- 4 RegionProposalNetwork
- 5 model = FasterRCNN(): the complete Faster R-CNN model
- 6 super(FasterRCNN, self).__init__(backbone, rpn, roi_heads, transform)
###########################################################
Parameter overview
- transform parameters
  min_size=800, max_size=1333,  # min/max sizes used when resizing during preprocessing; input images are scaled into this range
  image_mean=None, image_std=None,  # mean and std used for normalization during preprocessing
- RPN parameters
  rpn_anchor_generator=None,  # anchor generator
  rpn_head=None,  # the RPN's two predictors (objectness and box regression)
  rpn_pre_nms_top_n_train=2000, rpn_pre_nms_top_n_test=1000,  # number of proposals kept (by score) before NMS in the RPN
  rpn_post_nms_top_n_train=2000, rpn_post_nms_top_n_test=1000,  # total number of proposals kept after NMS in the RPN
  rpn_nms_thresh=0.7,  # IoU threshold used by NMS in the RPN
  rpn_fg_iou_thresh=0.7, rpn_bg_iou_thresh=0.3,  # IoU thresholds for sampling positive/negative anchors for the RPN loss
  rpn_batch_size_per_image=256,  # number of anchors randomly sampled per image for the RPN loss
  rpn_positive_fraction=0.5,  # fraction of positives among the sampled anchors
  rpn_score_thresh=0.0,
- Box parameters
  box_roi_pool=None,  # the RoI pooling layer
  box_head=None,  # the TwoMLPHead in the model diagram
  box_predictor=None,
  box_score_thresh=0.05,  # filter out low-probability detections
  box_nms_thresh=0.5,
  box_detections_per_img=100,  # keep the top 100 detections after sorting by score
  box_fg_iou_thresh=0.5, box_bg_iou_thresh=0.5,  # IoU thresholds for sampling positive/negative proposals for the Fast R-CNN loss
  box_batch_size_per_image=512, box_positive_fraction=0.25,  # number of proposals sampled per image for the Fast R-CNN loss, and the positive fraction
  bbox_reg_weights=None
###########################################################
# 使用模型框架具体的参数等信息
class FasterRCNN(FasterRCNNBase):
def __init__(self, backbone, num_classes=None, # 加上背景的目标个数
# transform parameter
min_size=800, max_size=1333, # 预处理resize时限制的最小尺寸与最大尺寸
image_mean=None, image_std=None, # 预处理normalize时使用的均值和方差
# RPN parameters
rpn_anchor_generator=None, rpn_head=None,
rpn_pre_nms_top_n_train=2000, rpn_pre_nms_top_n_test=1000, # rpn中在nms处理前保留的proposal数(根据score)
rpn_post_nms_top_n_train=2000, rpn_post_nms_top_n_test=1000, # rpn中在nms处理后保留的proposal数
rpn_nms_thresh=0.7, # rpn中进行nms处理时使用的iou阈值
rpn_fg_iou_thresh=0.7, rpn_bg_iou_thresh=0.3, # rpn计算损失时,采集正负样本设置的阈值
rpn_batch_size_per_image=256, rpn_positive_fraction=0.5, # rpn计算损失时采样的样本数,以及正样本占总样本的比例
rpn_score_thresh=0.0,
# Box parameters
box_roi_pool=None, box_head=None, box_predictor=None,
# 移除低目标概率 fast rcnn中进行nms处理的阈值 对预测结果根据score排序取前100个目标
box_score_thresh=0.05, box_nms_thresh=0.5, box_detections_per_img=100,
box_fg_iou_thresh=0.5, box_bg_iou_thresh=0.5, # fast rcnn计算误差时,采集正负样本设置的阈值
box_batch_size_per_image=512, box_positive_fraction=0.25, # fast rcnn计算误差时采样的样本数,以及正样本占所有样本的比例
bbox_reg_weights=None):
        # check that the backbone exposes an out_channels attribute; raise an error if it does not
        if not hasattr(backbone, "out_channels"):  # hasattr: checks whether the object has the given attribute
            raise ValueError(
                "backbone should contain an attribute out_channels "
                "specifying the number of output channels (assumed to be the "
                "same for all the levels)"
            )
        # check the types of rpn_anchor_generator and box_roi_pool
        assert isinstance(rpn_anchor_generator, (AnchorsGenerator, type(None)))  # must be an AnchorsGenerator or None
        assert isinstance(box_roi_pool, (MultiScaleRoIAlign, type(None)))
# 检查输出类别num_classes和box_predictor
if num_classes is not None:
if box_predictor is not None:
raise ValueError("num_classes should be None when box_predictor "
"is specified")
else:
if box_predictor is None:
raise ValueError("num_classes should not be None when box_predictor "
"is not specified")
# 预测特征层的channels
out_channels = backbone.out_channels
# 若anchor生成器为空,则自动生成针对resnet50_fpn的anchor生成器
if rpn_anchor_generator is None:
anchor_sizes = ((32,), (64,), (128,), (256,), (512,)) # 带上逗号才是元组, 5个元组,在5个预测特征层上预测
aspect_ratios = ((0.5, 1.0, 2.0),) * len(anchor_sizes)
rpn_anchor_generator = AnchorsGenerator(
anchor_sizes, aspect_ratios
)
# 生成RPN通过滑动窗口预测网络部分
if rpn_head is None:
rpn_head = RPNHead(
out_channels, rpn_anchor_generator.num_anchors_per_location()[0]
)
# 默认rpn_pre_nms_top_n_train = 2000, rpn_pre_nms_top_n_test = 1000,
# 默认rpn_post_nms_top_n_train = 2000, rpn_post_nms_top_n_test = 1000,
rpn_pre_nms_top_n = dict(training=rpn_pre_nms_top_n_train, testing=rpn_pre_nms_top_n_test)
rpn_post_nms_top_n = dict(training=rpn_post_nms_top_n_train, testing=rpn_post_nms_top_n_test)
# 定义整个RPN框架
rpn = RegionProposalNetwork(
rpn_anchor_generator, # 生成图像的anchor
rpn_head,
rpn_fg_iou_thresh, rpn_bg_iou_thresh, # rpn计算损失时,采集正负样本设置的阈值,fg代表前景目标, bg代表背景目标
rpn_batch_size_per_image, rpn_positive_fraction, # rpn计算损失时采用的正负样本的总个数,以及正样本占总样本的比例
rpn_pre_nms_top_n, # 进行nms处理之前针对每一个预测特征层所保留的目标个数
rpn_post_nms_top_n, # 进行nms处理之后所剩余的目标个数, rpn输出的proposal的个数
rpn_nms_thresh, # rpn中进行nms处理时使用的iou阈值
score_thresh=rpn_score_thresh)
# Multi-scale RoIAlign pooling
if box_roi_pool is None:
box_roi_pool = MultiScaleRoIAlign(
featmap_names=['0', '1', '2', '3'], # 在哪些特征层进行roi pooling
output_size=[7, 7],
sampling_ratio=2)
# fast RCNN中roi pooling后的展平处理两个全连接层部分
if box_head is None:
resolution = box_roi_pool.output_size[0] # 默认等于7
representation_size = 1024
box_head = TwoMLPHead(
out_channels * resolution ** 2,
representation_size
)
# 在box_head的输出上预测部分
if box_predictor is None:
representation_size = 1024
box_predictor = FastRCNNPredictor(
representation_size,
num_classes)
# 将roi pooling, box_head以及box_predictor结合在一起
roi_heads = RoIHeads(
# box
box_roi_pool, box_head, box_predictor,
box_fg_iou_thresh, box_bg_iou_thresh, # 0.5 0.5
box_batch_size_per_image, box_positive_fraction, # 512 0.25
bbox_reg_weights,
box_score_thresh, box_nms_thresh, box_detections_per_img) # 0.05 0.5 100
if image_mean is None:
image_mean = [0.485, 0.456, 0.406]
if image_std is None:
image_std = [0.229, 0.224, 0.225]
# 对数据进行标准化,缩放,打包成batch等处理部分
transform = GeneralizedRCNNTransform(min_size, max_size, image_mean, image_std)
super(FasterRCNN, self).__init__(backbone, rpn, roi_heads, transform)
#####################################
At this point, FasterRCNN handles the initialization and FasterRCNNBase handles the forward pass; together they form the overall Faster R-CNN framework.
#####################################
FasterRCNNBase
- the overall runtime skeleton of the model ################################################################
__init__() stores the model's components:
- 1 backbone
- 2 rpn
- 3 roi_heads
(plus the transform used for preprocessing)
################################################################
forward(images, targets)  # images.shape = [batch_size, channels, h, w]
- original_image_sizes: stores the original size (h, w) of every image. Preprocessing changes the image sizes, so the original sizes must be remembered in order to map the predictions back later.
- images, targets = self.transform(images, targets)  # preprocess the images and targets. self.transform() is the GeneralizedRCNNTransform() created during initialization; it normalizes all images and resizes them into a common range (so every image's h and w change). It returns an ImageList containing each resized image's shape plus one batched tensor of shape [batch_size, channels, new_h, new_w]; targets contains each image's [boxes, labels, image_id, area, iscrowd] (see the preprocessing section for details).
#######-------------------------------------- from here on: the feature maps
- features = self.backbone(images.tensors)  # feed the batched images through the backbone to obtain the feature maps
#######-------------------------------------- RPN
Pass the feature maps and the annotated targets into the RPN:
- proposals, proposal_losses = self.rpn(images, features, targets)
- - proposals: the four coordinates of every proposal in each image
- - proposal_losses: the RPN losses
#######-------------------------------------- the module after the RPN: roi_heads
Pass the RPN outputs and the annotated targets into the second half of Fast R-CNN:
- detections, detector_losses = self.roi_heads(features, proposals, images.image_sizes, targets)
- - detections: the per-image detection results (empty during training)
- - detector_losses: the classification and box-regression losses
#######-------------------------------------- map the predictions back to the original images
- detections = self.transform.postprocess(detections, images.image_sizes, original_image_sizes)
- - post-process the predictions (mainly rescale the bboxes back to the original image size)
- return self.eager_outputs(losses, detections): during training this returns the losses, during evaluation the detections.
################################################################
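Condensing the walkthrough above into one hedged shape trace (all concrete sizes are assumptions for illustration, based on the MobileNetV2 backbone):

# images: list of 8 tensors, each [3, H_i, W_i] with varying H_i, W_i
# transform:   images.tensors -> [8, 3, 800, 1088]   (resized + padded batch, assumed size)
# backbone:    features['0']  -> [8, 1280, 25, 34]
# rpn:         proposals      -> list of 8 tensors of shape [<=2000, 4], plus proposal_losses
# roi_heads:   detections     -> list of 8 dicts (boxes, labels, scores), plus detector_losses
# postprocess: boxes in detections are rescaled back to each image's original (H_i, W_i)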
class FasterRCNNBase(nn.Module):
"""
Main class for Generalized R-CNN.
Arguments:
backbone (nn.Module):
rpn (nn.Module):
roi_heads (nn.Module): takes the features + the proposals from the RPN and computes
detections / masks from it.
transform (nn.Module): performs the data transformation from the inputs to feed into
the model
"""
def __init__(self, backbone, rpn, roi_heads, transform):
'''
backbone: 特征提取网络
rpn: 区域建议生成网络部分
roi_heads:ROI_pooling后面的部分 (不包括后处理)
'''
super(FasterRCNNBase, self).__init__()
self.transform = transform
self.backbone = backbone
self.rpn = rpn
self.roi_heads = roi_heads
# used only on torchscript mode
self._has_warned = False
@torch.jit.unused
def eager_outputs(self, losses, detections):
# type: (Dict[str, Tensor], List[Dict[str, Tensor]]) -> Union[Dict[str, Tensor], List[Dict[str, Tensor]]]
if self.training:
return losses
return detections
def forward(self, images, targets=None):
# type: (List[Tensor], Optional[List[Dict[str, Tensor]]]) -> Tuple[Dict[str, Tensor], List[Dict[str, Tensor]]]
"""
Arguments:
images (list[Tensor]): images to be processed
            targets (list[Dict[Tensor]]): ground-truth annotations (e.g. boxes) for each image (optional)
Returns:
result (list[BoxList] or dict[Tensor]): the output from the model.
During training, it returns a dict[Tensor] which contains the losses.
During testing, it returns list[BoxList] contains additional fields
like `scores`, `labels` and `mask` (for Mask R-CNN models).
"""
if self.training and targets is None: # 判断如果为训练模式但是没有输入标注信息,self.training 为继承的nn.Module的函数
raise ValueError("In training mode, targets should be passed")
if self.training: # 如果为训练模式
assert targets is not None # 判断是否存在标注信息
for target in targets: # 进一步判断传入的target的boxes参数是否符合规定
boxes = target["boxes"]
if isinstance(boxes, torch.Tensor): # 判断数据类型
if len(boxes.shape) != 2 or boxes.shape[-1] != 4:
raise ValueError("Expected target boxes to be a tensor"
"of shape [N, 4], got {:}.".format(
boxes.shape))
else:
raise ValueError("Expected target boxes to be of type "
"Tensor, got {:}.".format(type(boxes)))
# original_image_sizes:存储每张图像原始尺寸
original_image_sizes = torch.jit.annotate(List[Tuple[int, int]], []) #传递给TorchScript编译器的Python类型,作为the_value的类型提示 List[Tuple[int, int]]变量声明
for img in images:
val = img.shape[-2:] # tensor格式,后两个为图像高和宽
assert len(val) == 2 # 防止输入的是个一维向量
original_image_sizes.append((val[0], val[1]))
# original_image_sizes = [img.shape[-2:] for img in images]
images, targets = self.transform(images, targets) # 对图像进行预处理
# ====================================================================Backbone
# print(images.tensors.shape)
features = self.backbone(images.tensors) # 将图像输入backbone得到特征图
if isinstance(features, torch.Tensor): # 若只在一层特征层上预测,将feature放入有序字典中,并编号为‘0’
features = OrderedDict([('0', features)]) # 若在多层特征层上预测,传入的就是一个有序字典
# ====================================================================RPN
# 将特征层以及标注target信息传入rpn中
# proposals: List[Tensor], Tensor_shape: [num_proposals, 4],
# 每个proposals是绝对坐标,且为(x1, y1, x2, y2)格式
proposals, proposal_losses = self.rpn(images, features, targets)
# ====================================================================roi_Head
# 将rpn生成的数据以及标注target信息传入fast rcnn后半部分
detections, detector_losses = self.roi_heads(features, proposals, images.image_sizes, targets)
# ====================================================================将预测结果映射到原始图像
# 对网络的预测结果进行后处理(主要将bboxes还原到原图像尺度上)
detections = self.transform.postprocess(detections, images.image_sizes, original_image_sizes)
losses = {}
losses.update(detector_losses)
losses.update(proposal_losses)
if torch.jit.is_scripting():
if not self._has_warned:
warnings.warn("RCNN always returns a (Losses, Detections) tuple in scripting")
self._has_warned = True
return losses, detections
else:
return self.eager_outputs(losses, detections)
# if self.training:
# return losses
#
# return detections
Sub-modules
RegionProposalNetwork(): the RPN module
__init__
rpn_anchor_generator,  # generates the anchors for the image
rpn_head,  # the objectness / box-regression predictors
rpn_fg_iou_thresh,
rpn_bg_iou_thresh,  # IoU thresholds for sampling positive/negative anchors for the RPN loss; fg = foreground, bg = background
rpn_batch_size_per_image,  # total number of positive + negative anchors sampled per image for the RPN loss
rpn_positive_fraction,  # fraction of positives among the sampled anchors
rpn_pre_nms_top_n,  # number of proposals kept per feature map before NMS
rpn_post_nms_top_n,  # number of proposals kept after NMS, i.e. the number of proposals the RPN outputs
rpn_nms_thresh,  # IoU threshold used by NMS in the RPN
score_thresh=rpn_score_thresh
def __init__(self, anchor_generator, head,
fg_iou_thresh, bg_iou_thresh, # # rpn计算损失时,采集正负样本设置的阈值,fg代表前景目标, bg代表背景目标
batch_size_per_image, positive_fraction, # rpn计算损失时采用的正负样本的总个数,以及正样本占总样本的比例
pre_nms_top_n, # 进行nms处理之前针对每一个预测特征层所保留的目标个数
post_nms_top_n, # # 进行nms处理之后所剩余的目标个数, rpn输出的proposal的个数
nms_thresh, # rpn中进行nms处理时使用的iou阈值
score_thresh=0.0):
'''
anchor_generator:生成anchors
head:
fg_iou_thresh, bg_iou_thresh,
'''
super(RegionProposalNetwork, self).__init__()
self.anchor_generator = anchor_generator
self.head = head
self.box_coder = det_utils.BoxCoder(weights=(1.0, 1.0, 1.0, 1.0))
# use during training
# 计算anchors与真实bbox的iou
self.box_similarity = box_ops.box_iou # 计算传入的两组box的iou值
self.proposal_matcher = det_utils.Matcher(
fg_iou_thresh, # 当iou大于fg_iou_thresh(0.7)时视为正样本
bg_iou_thresh, # 当iou小于bg_iou_thresh(0.3)时视为负样本
allow_low_quality_matches=True
)
self.fg_bg_sampler = det_utils.BalancedPositiveNegativeSampler(
batch_size_per_image, positive_fraction # 256, 0.5
)
# use during testing
self._pre_nms_top_n = pre_nms_top_n
self._post_nms_top_n = post_nms_top_n
self.nms_thresh = nms_thresh
self.score_thresh = score_thresh
self.min_size = 1.
In FasterRCNNBase's forward pass, the backbone has already been run to obtain the feature maps.
After that:
1. proposals, proposal_losses = self.rpn(images, features, targets)
2. detections, detector_losses = self.roi_heads(features, proposals, images.image_sizes, targets)
3. detections = self.transform.postprocess(detections, images.image_sizes, original_image_sizes)
proposals, proposal_losses = self.rpn(images, features, targets)
The inputs are the transformed images, the feature maps and the targets.
The RPN module contains
RPNHead()
AnchorsGenerator()
# The anchors on the original image are laid out by striding the image according to the feature-map-to-image
# scale, so the total number of anchors for one image is the number of feature-map cells times the number of
# anchors per cell (see the small sketch after this list).
filter_proposals()
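A small sketch of the anchor count implied by the comment above; the 25x34 feature-map size is an assumption taken from the debugging notes later in this post:

num_anchors_per_cell = 5 * 3            # len(sizes) * len(aspect_ratios)
feat_h, feat_w = 25, 34                 # assumed feature-map size for one image
anchors_per_image = feat_h * feat_w * num_anchors_per_cell
print(anchors_per_image)                # 12750 anchors for this image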
1 RPNHead()
__init__()
Receives out_channels and the number of anchors predicted at each sliding-window position of each feature map (len(anchor_sizes) * len(anchor_ratios)).
It defines
- self.conv: a 3x3 conv over the feature map; here the output depth is simply kept equal to the input depth
- self.cls_logits: the foreground/background predictor, out_channels = num_anchors (one score per anchor)
- self.bbox_pred: the regression predictor for the four parameters of each anchor box
Finally the three conv layers are initialized.
forward(feature)
# extra 3x3 conv over the feature map; in this code in_channels == out_channels,
# so it just adds one more feature-extraction layer
t = F.relu(self.conv(feature))
# different backbones can produce different numbers of feature maps, so logits = [] is a list.
# A 1x1 conv (a per-pixel fully connected layer) brings the channels to 15 (the number of anchors per location)
logits.append(self.cls_logits(t))
# A 1x1 conv brings the channels to 15 * 4 (the box-regression parameters of each anchor)
bbox_reg.append(self.bbox_pred(t))
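For intuition, a hedged shape check of RPNHead (defined just below) on a dummy feature map; the batch size and the 25x34 spatial size are assumptions:

import torch
# assumed: RPNHead is importable from this repo's rpn module
head = RPNHead(in_channels=1280, num_anchors=15)
feature = torch.randn(8, 1280, 25, 34)           # [batch, C, H, W]
logits, bbox_reg = head([feature])               # forward expects a list of feature maps
print(logits[0].shape)                           # torch.Size([8, 15, 25, 34])  -> one objectness score per anchor
print(bbox_reg[0].shape)                         # torch.Size([8, 60, 25, 34])  -> 4 regression params per anchor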
class RPNHead(nn.Module):
"""
add a RPN head with classification and regression
通过滑动窗口计算预测目标概率与bbox regression参数
Arguments:
in_channels: number of channels of the input feature
num_anchors: number of anchors to be predicted
"""
def __init__(self, in_channels, num_anchors):
super(RPNHead, self).__init__()
'''
in_channels:输入特征矩阵的channel
'''
# 3x3 滑动窗口, in_channels:backbone之后生成特征图的深度
self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1) # 在这里直接将输出特征矩阵的channel = 出入的特征矩阵的channel
# 下面两个卷积层分别对应两个预测器
# 计算预测的目标分数(这里的目标只是指前景或者背景),有num_anchors个anchorc则对其预测分数
self.cls_logits = nn.Conv2d(in_channels, num_anchors, kernel_size=1, stride=1)
# 计算预测的目标bbox regression参数 : 4:每个anchor的x,y,w,h
self.bbox_pred = nn.Conv2d(in_channels, num_anchors * 4, kernel_size=1, stride=1)
# 三个卷积层的参数初始化
for layer in self.children():
if isinstance(layer, nn.Conv2d):
torch.nn.init.normal_(layer.weight, std=0.01)
torch.nn.init.constant_(layer.bias, 0)
# 前向传播,x是backbone预测生成的特征图
def forward(self, x):
# type: (List[Tensor]) -> Tuple[List[Tensor], List[Tensor]]
logits = []
bbox_reg = []
# 遍历预测特征层,backbone=mobeilnet只有一个预测特征层,resnet时有多个预测特征层
for i, feature in enumerate(x):
t = F.relu(self.conv(feature))
logits.append(self.cls_logits(t))
bbox_reg.append(self.bbox_pred(t))
return logits, bbox_reg
2 AnchorsGenerator(anchor_sizes, aspect_ratios): the anchor generator
__init__()
Initializes anchor_sizes, aspect_ratios and _cache (which stores the coordinates of all anchors generated on the original image).
- forward(image_list, feature_maps)
image_list stores the images before and after padding, i.e. the batch information.
self.head(features)  # predict the objectness scores and bbox regression parameters on each feature map
self.anchor_generator(images, features)
strides: how many pixels one step on the feature map corresponds to on the original image
self.set_cell_anchors(dtype, device)  # build the anchor templates from the given sizes and aspect_ratios and store them in self.cell_anchors
self.cached_grid_anchors(grid_sizes, strides)  # compute the coordinates of every anchor box
The anchor templates are then shifted onto the original image according to the feature-map grid.
forward(images, features)
The inputs are the same as described at the start of the RPN section.
grid_sizes: the spatial shape of each feature map; here there is only one, shape = [24, 42]
image_size: the shape of the input images after the transform has resized them to a common range
strides: the step on the original image corresponding to one step on the feature map, used to map feature-map cells back to the original image
set_cell_anchors() --> self.generate_anchors(): builds the anchor templates from the given sizes and aspect_ratios and assigns them to self.cell_anchors
--> h_ratios = torch.sqrt(aspect_ratios)  # aspect_ratios = (0.5, 1.0, 2.0); multiplicative factor for the height (= h/w ratio)
    w_ratios = 1.0 / h_ratios  # multiplicative factor for the width, keeping the area constant: (h * h_ratio) * (w * w_ratio) = w * h
# In other words: keeping the area fixed forces w_ratios = 1 / h_ratios, so aspect_ratio = h/w = h_ratios / w_ratios.
This yields the width and height of each of the 15 anchors relative to a cell's center point,
i.e. 15 anchor templates.
# The templates are then shifted onto the original image in image_list, producing the anchors for the whole image.
# The stride (the scale between the original image and the feature map) determines the grid of points on the
# original image at which anchors are placed; it is not the case that every single pixel gets 15 anchors.
# Adding the anchor templates to these grid coordinates gives the top-left and bottom-right corners of the
# 15 anchors at every grid point of the original image.
anchors_over_all_feature_maps = self.cached_grid_anchors() --> self.grid_anchors()
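A hedged numeric check of the template math above, for a single scale of 32 and aspect ratio 2.0 (h/w = 2):

import torch

scales = torch.tensor([32.])
aspect_ratios = torch.tensor([2.0])
h_ratios = torch.sqrt(aspect_ratios)                      # ~1.414
w_ratios = 1.0 / h_ratios                                 # ~0.707, keeps the area ~32*32
ws = (w_ratios[:, None] * scales[None, :]).view(-1)       # ~22.6
hs = (h_ratios[:, None] * scales[None, :]).view(-1)       # ~45.3
base_anchor = (torch.stack([-ws, -hs, ws, hs], dim=1) / 2).round()
print(base_anchor)   # tensor([[-11., -23., 11., 23.]]) -> ~22x46 box, area ~32^2, h/w ~2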
Revisiting the objectness output of RPNHead
# self.cls_logits = nn.Conv2d(in_channels, num_anchors, kernel_size=1, stride=1)
# box regression parameters: 4 values (x, y, w, h) per anchor
# self.bbox_pred = nn.Conv2d(in_channels, num_anchors * 4, kernel_size=1, stride=1)
# anchors gives the total number of anchors of one image; objectness is the feature map whose channel count was
# changed to the number of anchor templates, so it also covers every anchor of the image ==> objectness is a
# score for each individual anchor.
# Each image is then traversed to collect the anchors of every grid cell.
concat_box_prediction_layers adjusts the shapes of the RPNHead outputs (objectness scores and anchor box deltas)
self.box_coder.decode --> decode_single: the real anchors are combined with the box-regression output of RPNHead to obtain the actual predicted proposals
NMS is applied to the proposals (ranked by their objectness scores)
and the losses are computed.
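filter_proposals itself is not reproduced in this post; a minimal sketch of the NMS step it relies on, using torchvision.ops.nms with made-up proposals and objectness scores:

import torch
import torchvision

proposals = torch.tensor([[0., 0., 100., 100.],
                          [5., 5., 105., 105.],
                          [200., 200., 300., 300.]])
objectness = torch.tensor([0.9, 0.8, 0.7])
keep = torchvision.ops.nms(proposals, objectness, iou_threshold=0.7)
print(keep)  # tensor([0, 2]): box 1 overlaps box 0 with IoU > 0.7 and is suppressed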
class AnchorsGenerator(nn.Module):
# 注释信息
__annotations__ = {
"cell_anchors": Optional[List[torch.Tensor]],
"_cache": Dict[str, List[torch.Tensor]]
}
"""
anchors生成器
Module that generates anchors for a set of feature maps and
image sizes.
The module support computing anchors at multiple sizes and aspect ratios
per feature map.
sizes and aspect_ratios should have the same number of elements, and it should
correspond to the number of feature maps.
sizes[i] and aspect_ratios[i] can have an arbitrary number of elements,
and AnchorGenerator will output a set of sizes[i] * aspect_ratios[i] anchors
per spatial location for feature map i.
Arguments:
sizes (Tuple[Tuple[int]]):
aspect_ratios (Tuple[Tuple[float]]):
"""
# sizes=anchor的scale, aspect_ratios=每一个anchor采用的不同比例
def __init__(self, sizes=(128, 256, 512), aspect_ratios=(0.5, 1.0, 2.0)):
super(AnchorsGenerator, self).__init__()
# 判断sizes 和 aspect_ratios的每一个元素是否为 list或tuple类型, 以及两者元素个数相同
if not isinstance(sizes[0], (list, tuple)):
# TODO change this
sizes = tuple((s,) for s in sizes)
if not isinstance(aspect_ratios[0], (list, tuple)):
aspect_ratios = (aspect_ratios,) * len(sizes)
assert len(sizes) == len(aspect_ratios) # 判断元素个数
self.sizes = sizes
self.aspect_ratios = aspect_ratios
self.cell_anchors = None
self._cache = {} # 原图上生成的所有anchor坐标信息存储到_cache中
# (self,scales=anchor的scale, aspect_ratios=每一类anchor用到的不同比例, dtype=数据类型, device=设备类型)
def generate_anchors(self, scales, aspect_ratios, dtype=torch.float32, device=torch.device("cpu")):
# type: (List[int], List[float], torch.dtype, torch.device) -> Tensor
"""
compute anchor sizes
Arguments:
scales: sqrt(anchor_area)
aspect_ratios: h/w ratios
dtype: float32
device: cpu/gpu
"""
scales = torch.as_tensor(scales, dtype=dtype, device=device) # 将scales转化为tensor
aspect_ratios = torch.as_tensor(aspect_ratios, dtype=dtype, device=device) # 将aspect_ratios转化为tensor
h_ratios = torch.sqrt(aspect_ratios) # aspect_ratios = (0.5, 1.0, 2.0) 高度的乘法因子 =长宽比
w_ratios = 1.0 / h_ratios # 宽度的乘法因子,保持面积不变 h * h_ratios * w * w_ratios = w * h
# 上述两句的理解是 1限制面积不变--》 w_ratios = 1 / h_ratios, 所以aspect_ratios = 长宽比 = h_ratios / w_ratios
# [r1, r2, r3]' * [s1, s2, s3]
# number of elements is len(ratios)*len(scales)
ws = (w_ratios[:, None] * scales[None, :]).view(-1) # 在w_ratios原来维度基础上,后面增加一个维度 # 生成15个anchors的宽度
hs = (h_ratios[:, None] * scales[None, :]).view(-1) # 这里加none的目的是用广播机制,让每个比例均有对应的长和宽 view(-1) 将shape转变为行向量展开
# left-top, right-bottom coordinate relative to anchor center(0, 0)
# 生成的anchors模板都是以(0, 0)为中心的, shape [len(ratios)*len(scales), 4]
# 其他的点(a, b)也是一样的,则anchor左上角的点为(a-ws/2, y-hs/2)
base_anchors = torch.stack([-ws, -hs, ws, hs], dim=1) / 2 # 每一个滑动窗口的中间为(0,0)坐标,x 向右为正,y向下为正,返回(15,4)15个anchor和anchor的左上角和右下角坐标
return base_anchors.round() # round 四舍五入
# dtype, device=(变量类型, 设备类型)
def set_cell_anchors(self, dtype, device):
# type: (torch.dtype, torch.device) -> None
if self.cell_anchors is not None: # 初始化的时候cell_anchors = None
cell_anchors = self.cell_anchors
assert cell_anchors is not None
# suppose that all anchors have the same device
# which is a valid assumption in the current state of the codebase
if cell_anchors[0].device == device:
return
# 根据提供的sizes和aspect_ratios生成anchors模板
# anchors模板都是以(0, 0)为中心的anchor
cell_anchors = [
self.generate_anchors(sizes, aspect_ratios, dtype, device) # 生成15个anchors模板
for sizes, aspect_ratios in zip(self.sizes, self.aspect_ratios) # 遍历self.sizes, self.aspect_ratios,作为参数
]
self.cell_anchors = cell_anchors
def num_anchors_per_location(self):
# 计算每个预测特征层上每个滑动窗口的预测目标数
return [len(s) * len(a) for s, a in zip(self.sizes, self.aspect_ratios)]
# For every combination of (a, (g, s), i) in (self.cell_anchors, zip(grid_sizes, strides), 0:2),
# output g[i] anchors that are s[i] distance apart in direction i, with the same dimensions as a.
def grid_anchors(self, grid_sizes, strides):
# type: (List[List[int]], List[List[Tensor]]) -> List[Tensor]
"""
anchors position in grid coordinate axis map into origin image
计算预测特征图对应原始图像上的所有anchors的坐标
Args:
grid_sizes: 预测特征矩阵的height和width
strides: 预测特征矩阵上一步对应原始图像上的步距
"""
anchors = []
cell_anchors = self.cell_anchors # anchors模板都是以
assert cell_anchors is not None
# 遍历每个预测特征层的grid_size,strides和cell_anchors,mobile只有一个
for size, stride, base_anchors in zip(grid_sizes, strides, cell_anchors): # (grid_sizes在特征图上滑动, strides:原图步长, cell_anchors:anchors)
grid_height, grid_width = size
stride_height, stride_width = stride
device = base_anchors.device
# For output anchor, compute [x_center, y_center, x_center, y_center]
# shape: [grid_width] 对应原图上的x坐标(列)
shifts_x = torch.arange(0, grid_width, dtype=torch.float32, device=device) * stride_width # 预测特征图的坐标(第一行) 乘以 映射到原图的比例(步长),得到原图选中点的x轴坐标
# shape: [grid_height] 对应原图上的y坐标(行)
shifts_y = torch.arange(0, grid_height, dtype=torch.float32, device=device) * stride_height
# 计算预测特征矩阵上每个点对应原图上的坐标(anchors模板的坐标偏移量)
# torch.meshgrid函数分别传入行坐标和列坐标,生成网格行坐标矩阵和网格列坐标矩阵
# shape: [grid_height, grid_width]
shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x) # [[34]...共25个]
shift_x = shift_x.reshape(-1) # 预测特征层每一个cell, 映射到原图的坐标
shift_y = shift_y.reshape(-1)
# 计算anchors坐标(xmin, ymin, xmax, ymax)在原图上的坐标偏移量
# shape: [grid_width*grid_height, 4]
shifts = torch.stack([shift_x, shift_y, shift_x, shift_y], dim=1)
# For every (base anchor, output anchor) pair,
# offset each zero-centered base anchor by the center of the output anchor.
# 将anchors模板与原图上的坐标偏移量相加得到原图上所有anchors的坐标信息(shape不同时会使用广播机制)
shifts_anchor = shifts.view(-1, 1, 4) + base_anchors.view(1, -1, 4) # 将模板中心移动到原图每一个cell的坐标上base_anchors = strides
anchors.append(shifts_anchor.reshape(-1, 4))
return anchors # List[Tensor(all_num_anchors, 4)]
# grid_sizes = 每一个预测特征层的尺寸(height, width), strides=特征图中的anchor 在原图中的步长(预测特征层的每个scale对应原图的信息)
def cached_grid_anchors(self, grid_sizes, strides):
# type: (List[List[int]], List[List[Tensor]]) -> List[Tensor]
"""将计算得到的所有anchors信息进行缓存"""
key = str(grid_sizes) + str(strides)
# self._cache是字典类型 初始化为空 原图上生成的所有anchor坐标信息存储到_cache中
if key in self._cache:
return self._cache[key]
anchors = self.grid_anchors(grid_sizes, strides)
self._cache[key] = anchors
return anchors
# 正向传播image_list是一个类,这个类存储了在预处理的时候将图片打包成一个batch,每一个batch对应一个填充同样大小的tensors,和pading前的图片size;
# feature_maps是预测特征层的信息。
def forward(self, image_list, feature_maps):
# type: (ImageList, List[Tensor]) -> List[Tensor]
# 获取每个预测特征层的尺寸(height, width)
grid_sizes = list([feature_map.shape[-2:] for feature_map in feature_maps])
# 获取输入图像的height和width
image_size = image_list.tensors.shape[-2:]
# 获取变量类型和设备类型
dtype, device = feature_maps[0].dtype, feature_maps[0].device
# one step in feature map equate n pixel stride in origin image
# 计算特征层上的一步等于原始图像上的步长, grid_size = [torch.Size([25, 34])],所以得到原图和特征图的映射比例=步长(宽,高)
strides = [[torch.tensor(image_size[0] // g[0], dtype=torch.int64, device=device),
torch.tensor(image_size[1] // g[1], dtype=torch.int64, device=device)] for g in grid_sizes]
# 根据提供的sizes和aspect_ratios生成anchors模板,并赋值给self.cell_anchors
self.set_cell_anchors(dtype, device)
# 生成anchors模板后,应用边缘检测过滤anchors
# ----------------------------------------------------------------------------------------------
# 计算/读取所有anchors的坐标信息(这里的anchors信息是映射到原图上的所有anchors信息,不是anchors模板)
# 得到的是一个list列表,对应每张预测特征图映射回原图的anchors坐标信息 # grid_sizes:获取每个预测特征层的尺寸(height, width)
anchors_over_all_feature_maps = self.cached_grid_anchors(grid_sizes, strides) # strides: 特征图一步对应的原图一步长度
anchors = torch.jit.annotate(List[List[torch.Tensor]], [])
# 遍历一个batch中的每张图像
for i, (image_height, image_width) in enumerate(image_list.image_sizes):
anchors_in_image = []
# 遍历每张预测特征图映射回原图的anchors坐标信息
for anchors_per_feature_map in anchors_over_all_feature_maps:
anchors_in_image.append(anchors_per_feature_map)
anchors.append(anchors_in_image)
# 将每一张图像的所有预测特征层的anchors坐标信息拼接在一起
# anchors是个list,每个元素为一张图像的所有anchors信息
anchors = [torch.cat(anchors_per_image) for anchors_per_image in anchors]
# Clear the cache in case that memory leaks.
self._cache.clear()
return anchors
3 roi_heads()
The rest is comparatively straightforward; the key is understanding the RPN.
4 GeneralizedRCNNTransform: image normalization and resizing
Every image is resized into the [min_size, max_size] range and normalized.
- normalize(self, image)  # standardization; mean[:, None, None] expands the mean so it broadcasts over H and W
- resize(self, image, target)  # resize the image into [min_size, max_size]
- - image = torch.nn.functional.interpolate()[0]: resizes via bilinear interpolation
- - resize_boxes: rescales the boxes according to how the image was resized
- batch_images  # packs a list of images into one batch tensor (every tensor in the batch has the same shape). To keep each image's w/h aspect ratio unchanged while still batching, every image is copied into the top-left corner of one shared padded canvas: in the figure, yellow is the original image and blue is the common canvas.
- postprocess  # post-processing: maps the bounding boxes back to the original images
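A small, hedged usage sketch of this transform in isolation (two dummy images of different sizes; GeneralizedRCNNTransform is assumed to be importable from this repo's transform module, and the normalization statistics are the ImageNet defaults used above):

import torch

transform = GeneralizedRCNNTransform(min_size=800, max_size=1333,
                                     image_mean=[0.485, 0.456, 0.406],
                                     image_std=[0.229, 0.224, 0.225])
images = [torch.rand(3, 375, 500), torch.rand(3, 600, 450)]
image_list, _ = transform(images)
print(image_list.image_sizes)       # per-image (h, w) after resizing, before padding, e.g. [(800, 1066), (1066, 800)]
print(image_list.tensors.shape)     # one padded batch tensor, roughly torch.Size([2, 3, 1088, 1088])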
class GeneralizedRCNNTransform(nn.Module):
"""
图像标准化处理,resize处理
它执行的转换是:
- 输入标准化(平均减法和标准除法)
- 输入目标调整大小以匹配 min_size max_size 它为输入返回一个 ImageList,为目标返 回一个 List[Dict[Tensor]]
"""
def __init__(self, min_size, max_size, image_mean, image_std):
super(GeneralizedRCNNTransform, self).__init__()
if not isinstance(min_size, (list, tuple)): # 判断min_size是否为list或tuple类型
min_size = (min_size,) # 如果不是,将min_size类型改为tuple类型
self.min_size = min_size # 指定图像的最小边长范围
self.max_size = max_size # 指定图像的最大边长范围
self.image_mean = image_mean # 指定图像在标准化处理中的均值
self.image_std = image_std # 指定图像在标准化处理中的方差
# 标准化处理
def normalize(self, image):
"""标准化处理"""
dtype, device = image.dtype, image.device # 获取图片数据类型, 设备信息(CPU或GPU) image:List[Tensor]
mean = torch.as_tensor(self.image_mean, dtype=dtype, device=device) # 将均值格式转化为和图片一样的tensor格式
std = torch.as_tensor(self.image_std, dtype=dtype, device=device)
# [:, None, None]: shape [3] -> [3, 1, 1]
return (image - mean[:, None, None]) / std[:, None, None]
# 随机获取一个min_size
def torch_choice(self, k):
# type: (List[int]) -> int
"""
Implements `random.choice` via torch ops so it can be compiled with
TorchScript. Remove if https://github.com/pytorch/pytorch/issues/25803
is fixed.
"""
index = int(torch.empty(1).uniform_(0., float(len(k))).item()) # 用0-len(k)的随机值填充单个数字的零矩阵,并获取该随机值作为索引
return k[index]
# 图像缩放
def resize(self, image, target):
# type: (Tensor, Optional[Dict[str, Tensor]]) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]
"""
将图片缩放到指定的大小范围内,并对应缩放bboxes信息
Args:
image: 输入的图片
target: 输入图片的相关信息(包括bboxes信息)
Returns:
image: 缩放后的图片
target: 缩放bboxes后的图片相关信息
"""
h, w = image.shape[-2:] # 获取图片高和宽,image shape is [channel, height, width]
if self.training: # 如果是训练模式
size = float(self.torch_choice(self.min_size)) # 指定输入图片的最小边长,注意是self.min_size不是min_size
else: # 如果是预测模式
# FIXME assume for now that testing uses the largest scale
size = float(self.min_size[-1]) # 指定输入图片的最小边长,注意是self.min_size不是min_size
if torchvision._is_tracing():
image = _resize_image_onnx(image, size, float(self.max_size))
else:
image = _resize_image(image, size, float(self.max_size))
# 如果target is None 则为预测模式,直接返回
if target is None:
return image, target
# 否则为训练模式,需要处理bound box
bbox = target["boxes"]
# 根据图像的缩放比例来缩放bbox
bbox = resize_boxes(bbox, [h, w], image.shape[-2:])
target["boxes"] = bbox
return image, target
# _onnx_batch_images() is an implementation of
# batch_images() that is supported by ONNX tracing.
@torch.jit.unused
def _onnx_batch_images(self, images, size_divisible=32):
# type: (List[Tensor], int) -> Tensor
max_size = []
for i in range(images[0].dim()):
max_size_i = torch.max(torch.stack([img.shape[i] for img in images]).to(torch.float32)).to(torch.int64)
max_size.append(max_size_i)
stride = size_divisible
max_size[1] = (torch.ceil((max_size[1].to(torch.float32)) / stride) * stride).to(torch.int64)
max_size[2] = (torch.ceil((max_size[2].to(torch.float32)) / stride) * stride).to(torch.int64)
max_size = tuple(max_size)
# work around for
# pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
# which is not yet supported in onnx
padded_imgs = []
for img in images:
padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))]
padded_img = torch.nn.functional.pad(img, [0, padding[2], 0, padding[1], 0, padding[0]])
padded_imgs.append(padded_img)
return torch.stack(padded_imgs)
def max_by_axis(self, the_list):
# type: (List[List[int]]) -> List[int]
maxes = the_list[0]
for sublist in the_list[1:]:
for index, item in enumerate(sublist):
maxes[index] = max(maxes[index], item)
return maxes
def batch_images(self, images, size_divisible=32):
# type: (List[Tensor], int) -> Tensor
"""
将一批图像打包成一个batch返回(注意batch中每个tensor的shape是相同的)
Args:
images: 输入的一批图片
size_divisible: 将图像高和宽调整到该数的整数倍
Returns:
batched_imgs: 打包成一个batch后的tensor数据
"""
if torchvision._is_tracing():
# batch_images() does not export well to ONNX
# call _onnx_batch_images() instead
return self._onnx_batch_images(images, size_divisible) # onnx是开放的神经网络模型,可以在torch、tensorflow等框架转换,不再依赖于框架
# 分别计算一个batch中所有图片中的最大channel, height, width
max_size = self.max_by_axis([list(img.shape) for img in images])
stride = float(size_divisible)
# max_size = list(max_size)
# 将height向上调整到stride的整数倍
max_size[1] = int(math.ceil(float(max_size[1]) / stride) * stride)
# 将width向上调整到stride的整数倍
max_size[2] = int(math.ceil(float(max_size[2]) / stride) * stride)
# [batch, channel, height, width]
batch_shape = [len(images)] + max_size
# 创建shape为batch_shape且值全部为0的tensor
batched_imgs = images[0].new_full(batch_shape, 0)
for img, pad_img in zip(images, batched_imgs):
# 将输入images中的每张图片复制到新的batched_imgs的每张图片中,对齐左上角,保证bboxes的坐标不变
# 这样保证输入到网络中一个batch的每张图片的shape相同
# copy_: Copies the elements from src into self tensor and returns self
pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
return batched_imgs
def postprocess(self,
result, # type: List[Dict[str, Tensor]]
image_shapes, # type: List[Tuple[int, int]]
original_image_sizes # type: List[Tuple[int, int]]
):
# type: (...) -> List[Dict[str, Tensor]]
"""
对网络的预测结果进行后处理(主要将bboxes还原到原图像尺度上)
Args:
result: list(dict), 网络的预测结果, len(result) == batch_size
image_shapes: list(torch.Size), 图像预处理缩放后的尺寸, len(image_shapes) == batch_size
original_image_sizes: list(torch.Size), 图像的原始尺寸, len(original_image_sizes) == batch_size
Returns:
"""
if self.training:
return result
# 遍历每张图片的预测信息,将boxes信息还原回原尺度
for i, (pred, im_s, o_im_s) in enumerate(zip(result, image_shapes, original_image_sizes)):
boxes = pred["boxes"]
boxes = resize_boxes(boxes, im_s, o_im_s) # 将bboxes缩放回原图像尺度上
result[i]["boxes"] = boxes
return result
def __repr__(self):
"""自定义输出实例化对象的信息,可通过print打印实例信息"""
format_string = self.__class__.__name__ + '('
_indent = '\n '
format_string += "{0}Normalize(mean={1}, std={2})".format(_indent, self.image_mean, self.image_std)
format_string += "{0}Resize(min_size={1}, max_size={2}, mode='bilinear')".format(_indent, self.min_size,
self.max_size)
format_string += '\n)'
return format_string
def forward(self,
images, # type: List[Tensor]
targets=None # type: Optional[List[Dict[str, Tensor]]]
):
# type: (...) -> Tuple[ImageList, Optional[List[Dict[str, Tensor]]]]
images = [img for img in images]
for i in range(len(images)):
image = images[i]
target_index = targets[i] if targets is not None else None
if image.dim() != 3:
raise ValueError("images is expected to be a list of 3d tensors "
"of shape [C, H, W], got {}".format(image.shape))
image = self.normalize(image) # 对图像进行标准化处理
image, target_index = self.resize(image, target_index) # 对图像和对应的bboxes缩放到指定范围
images[i] = image
if targets is not None and target_index is not None:
targets[i] = target_index
# 记录resize后的图像尺寸
image_sizes = [img.shape[-2:] for img in images]
images = self.batch_images(images) # 将images打包成一个batch
image_sizes_list = torch.jit.annotate(List[Tuple[int, int]], [])
for image_size in image_sizes:
assert len(image_size) == 2
image_sizes_list.append((image_size[0], image_size[1]))
image_list = ImageList(images, image_sizes_list)
return image_list, targets