SSD目标检测算法详解

289 阅读4分钟
主要分几个程序:
1、config.py : 保存了整个项目的大部分参数;
2、calculate_IOU.py : 计算预选框和真值框的IOU值,用于筛选正负样本;以及定义了对坐标进行encode和decode的函数;
3、nms.py : 定义了非极大值抑制函数;
4、random_crop.py : 定义了一个Cropper类,通过随机裁剪和随机翻转进行数据增强;
5、read_data.py : 定义了一个Reader类,用于读取VOC2012数据集;
6、anchors.py : 对不同特征层生成相应大小和数目的default box;
7、label_anchors.py : 将不同的default box与真值框(true boxes)进行匹配;
8、network.py : 定义了一个Net类,并定义了SSD网络结构,用于训练并保存模型;
9、loss_function.py : 定义了损失函数,其中包含对正样本和负样本1:3比例的取样;
10、SSD_API.py : 定义了SSD_detector类,用于加载模型并输入图片进行目标检测;

1、config.py
保存了这个项目的参数,先上代码:
# config.py
import numpy as np
import os

# IoU threshold used by non-maximum suppression (NMS).
NMS_THRESHOLD = 0.3

# Root directory of the dataset.
DATA_PATH = '../VOC2012'

# Original note: path that stores the image coordinate and class information.
ImageSets_PATH = os.path.join(DATA_PATH, 'ImageSets')

# Names of the feature layers to extract.
BLOCKS = ['block4', 'block7', 'block8',
          'block9', 'block10', 'block11', 'block12']

# Maximum image side length.
MAX_SIZE = 1000

# Minimum image side length.
MIN_SIZE = 600

# Number of training iterations (epochs).
EPOCHES = 2000

# Number of batches run per epoch.
BATCHES = 64

# Matching threshold that separates positive from negative samples.
THRESHOLD = 0.5

# Score threshold for a positive detection at test time.
SCORE_THRESHOLD = 0.997

# Minimum ratio for random cropping.
MIN_CROP_RATIO = 0.6

# Maximum ratio for random cropping.
MAX_CROP_RATIO = 1.0

# Directory where the model is saved.
MODEL_PATH = './model/'

# Learning rate.
LEARNING_RATE = 2e-4

# Object classes; the first (empty) entry is the background class.
CLASSES = ['', 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus',
           'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse',
           'motorbike', 'person', 'pottedplant', 'sheep', 'sofa',
           'train', 'tvmonitor']

# Per-channel pixel means of the images.
PIXEL_MEANS = np.array([[[122.7717, 115.9465, 102.9801]]])

# Aspect ratios of the default boxes, one list per feature layer.
RATIOS = [[2, .5],
          [2, .5, 3, 1./3],
          [2, .5, 3, 1./3],
          [2, .5, 3, 1./3],
          [2, .5, 3, 1./3],
          [2, .5], [2, .5]]

# Stride of each feature layer.
STRIDES = [8, 16, 32, 64, 128, 256, 512]

# The `s` of the paper, taken here as the (relative) side length of each
# layer's default boxes.
S = [0.04, 0.1, 0.26, 0.42, 0.58, 0.74, 0.9, 1.06]

# Side length of each layer's default boxes; the second element is the side
# length of the next layer's default boxes. The values equal S[k] * 512 and
# S[k + 1] * 512.
Sk = [(20.48, 51.2),
      (51.2, 133.12),
      (133.12, 215.04),
      (215.04, 296.96),
      (296.96, 378.88),
      (378.88, 460.8),
      (460.8, 542.72)]

# Used to adjust the weight of the box-regression values in the loss.
# NOTE: the name is a typo for "PRIOR_SCALING"; it is kept because other
# modules refer to it as cfg.PRIOT_SCALING.
PRIOT_SCALING = (0.1, 0.1, 0.2, 0.2)
参数都有备注,就不多说啦,挑几个比较重要的吧:

1、BLOCKS: BLOCKS保存了我们需要提取的特征层的名称(共七个),其中第一个特征层’block4’是VGG的一个中间层,其余六个特征层是SSD在VGG层之后额外添加的,每层的步长见‘STRIDES’参数;

2、RATIOS: RATIOS保存了七个层default box的几个长宽比,比如第一层有[2, 0.5]两个长宽比,代表第一个特征层每个特征点有长宽比分别为2, 0.5的额外两个default box;

3、Sk: Sk保存了每个特征层的default box的边长,注意这里的边长大小跟原论文不太一样;

然后config.py中的参数通过 import config as cfg 引用,参量用cfg.参数名即可。
2、calculate_IOU.py
这里定义了计算预选框和真值框的IOU值的函数,用于筛选正负样本;以及定义了对坐标进行encode和decode的函数;
先上代码:
# calculate_IOU.py
import numpy as np
import config as cfg

def encode_targets(true_box, anchors, prior_scaling=cfg.PRIOT_SCALING):
    """Encode ground-truth boxes as regression offsets relative to anchors.

    Rows of both arrays are read as (y_min, x_min, y_max, x_max).
    NOTE(review): presumably row i of `true_box` is the box matched to row i
    of `anchors` -- confirm against the caller.

    Args:
        true_box: 2-D array of ground-truth boxes.
        anchors: 2-D array of anchor (default) boxes.
        prior_scaling: four divisors applied to (dy, dx, dh, dw).

    Returns:
        Array reshaped to (-1, 4) holding (dy, dx, dh, dw) divided by
        `prior_scaling`.
    """
    def _centre_and_size(boxes):
        # Convert corner coordinates to (centre_y, centre_x, height, width).
        top, left = boxes[:, 0], boxes[:, 1]
        bottom, right = boxes[:, 2], boxes[:, 3]
        return (bottom + top) / 2, (right + left) / 2, bottom - top, right - left

    a_cy, a_cx, a_h, a_w = _centre_and_size(anchors)
    t_cy, t_cx, t_h, t_w = _centre_and_size(true_box)

    # Centre shifts are normalised by the anchor size; sizes are encoded as
    # log-ratios.
    offsets = np.stack(
        [
            (t_cy - a_cy) / a_h,
            (t_cx - a_cx) / a_w,
            np.log(t_h / a_h),
            np.log(t_w / a_w),
        ],
        axis=1,
    )

    return np.reshape(offsets, (-1, 4)) / prior_scaling


def decode_targets(anchors, targets, image_shape, prior_scaling=cfg.PRIOT_SCALING):
    """Turn regression offsets back into boxes clipped to the image.

    Rows of `anchors` are read as (y_min, x_min, y_max, x_max) and rows of
    `targets` as (dy, dx, dh, dw).

    Args:
        anchors: 2-D array of anchor boxes.
        targets: 2-D array of offsets, one row per anchor row.
        image_shape: sequence whose first two entries are (height, width).
        prior_scaling: four factors the offsets are multiplied by before
            decoding.

    Returns:
        Array with columns (y_min, x_min, y_max, x_max), each clipped to
        [0, height] or [0, width].
    """
    img_h, img_w = image_shape[:2]

    top, left = anchors[:, 0], anchors[:, 1]
    bottom, right = anchors[:, 2], anchors[:, 3]

    anchor_cy = (bottom + top) / 2
    anchor_cx = (right + left) / 2
    anchor_h = bottom - top
    anchor_w = right - left

    scaled = targets * prior_scaling

    # Shift the anchor centre and rescale its size.
    centre_y = scaled[:, 0] * anchor_h + anchor_cy
    centre_x = scaled[:, 1] * anchor_w + anchor_cx
    half_h = anchor_h * np.exp(scaled[:, 2]) / 2
    half_w = anchor_w * np.exp(scaled[:, 3]) / 2

    # Keep every edge inside the image.
    return np.stack(
        [
            np.clip(centre_y - half_h, 0, img_h),
            np.clip(centre_x - half_w, 0, img_w),
            np.clip(centre_y + half_h, 0, img_h),
            np.clip(centre_x + half_w, 0, img_w),
        ],
        axis=1,
    )


def fast_bbox_overlaps(holdon_anchor, true_boxes):
    """Compute the pairwise IoU between candidate anchors and true boxes.

    Rows of both arrays are read as (y_min, x_min, y_max, x_max).

    Args:
        holdon_anchor: 2-D array of n candidate anchors.
        true_boxes: 2-D array of m ground-truth boxes.

    Returns:
        float32 array of shape (n, m); entry [i, j] is the IoU of anchor i
        with true box j, and 0 where the two do not overlap with positive
        area.
    """
    # Ground-truth edges as (m, 1) columns so that they broadcast against the
    # (n,) anchor edges into an (m, n) grid.
    true_y_min = true_boxes[:, 0][:, np.newaxis]
    true_x_min = true_boxes[:, 1][:, np.newaxis]
    true_y_max = true_boxes[:, 2][:, np.newaxis]
    true_x_max = true_boxes[:, 3][:, np.newaxis]

    anchor_y_min = holdon_anchor[:, 0]
    anchor_x_min = holdon_anchor[:, 1]
    anchor_y_max = holdon_anchor[:, 2]
    anchor_x_max = holdon_anchor[:, 3]

    true_area = (true_x_max - true_x_min) * (true_y_max - true_y_min)
    anchor_area = (anchor_x_max - anchor_x_min) * (anchor_y_max - anchor_y_min)

    # Height and width of the intersection rectangle; non-positive values
    # mean the boxes do not overlap along that axis.
    inter_h = (np.minimum(true_y_max, anchor_y_max)
               - np.maximum(true_y_min, anchor_y_min))
    inter_w = (np.minimum(true_x_max, anchor_x_max)
               - np.maximum(true_x_min, anchor_x_min))

    # A boolean mask replaces the former np.int weight array (the np.int
    # alias no longer exists in current NumPy releases).
    overlaps = np.logical_and(inter_h > 0, inter_w > 0)

    inter_area = inter_h * inter_w  # (m, n)
    union_area = true_area + anchor_area - inter_area

    # Only overlapping pairs are divided; they always have a positive union.
    # Other entries get a dummy denominator so that degenerate boxes do not
    # trigger divide-by-zero warnings, and are zeroed afterwards.
    safe_union = np.where(overlaps, union_area, 1)
    iou = np.where(overlaps, np.divide(inter_area, safe_union), 0)

    # Transpose from (m, n) to (n, m).
    return np.transpose(iou).astype(np.float32)

if __name__ == "__main__":
    # Running this module directly does nothing.
    pass
3、nms.py
非极大值抑制(Non-Maximum Suppression,NMS),功能是去除冗余的检测框,保留最好的一个。
如果不进行NMS,效果是这样的:
上代码(注意:下面贴出的实际上是 SSD_API.py 的内容,即调用 nms.py 中 py_cpu_nms 的 SSD_detector 类;nms.py 本身的代码此处未给出):
import tensorflow as tf
from network import Net
import config as cfg
import cv2
import numpy as np
from label_anchors import decode_targets
import matplotlib.pyplot as plt
from nms import py_cpu_nms


class SSD_detector(object):
    """Load a trained SSD network and run detection on image files.

    The network is built by `Net(is_training=False)`; weights are restored
    from the latest checkpoint found under `cfg.MODEL_PATH` when `test_ssd`
    is called.
    """

    def __init__(self):
        """Build the inference network and copy the settings from config."""
        self.net = Net(is_training=False)

        self.model_path = cfg.MODEL_PATH

        self.pixel_means = cfg.PIXEL_MEANS

        self.min_size = cfg.MIN_SIZE

        self.pred_loc, self.pred_cls = self.net.output

        self.score_threshold = cfg.SCORE_THRESHOLD

    def pre_process(self, image_path):
        """Read an image from disk and resize it for the network.

        Args:
            image_path: path of the image file.

        Returns:
            Dict with keys 'image' (resized float image), 'scale' (the
            resize factor) and 'image_path'.

        Raises:
            IOError: if OpenCV cannot read the file.
        """
        image = cv2.imread(image_path)

        # cv2.imread signals failure by returning None instead of raising.
        if image is None:
            raise IOError('Cannot read image: {}'.format(image_path))

        # np.float was an alias of the builtin float (float64) and is no
        # longer available in current NumPy releases.
        image = image.astype(np.float64)

        image, scale = self.resize_image(image)

        return {'image': image, 'scale': scale, 'image_path': image_path}

    def resize_image(self, image):
        """Scale the image so that its shorter side equals `self.min_size`.

        Args:
            image: image array whose first two dimensions are height, width.

        Returns:
            Tuple (resized image, scale factor).
        """
        shorter_side = np.min(image.shape[:2])

        scale = float(self.min_size) / float(shorter_side)

        image = cv2.resize(image, dsize=(0, 0), fx=scale, fy=scale)

        return image, scale

    def test_ssd(self, image_paths):
        """Run detection on one or more images and display the results.

        For every image two figures are shown: the detections before
        non-maximum suppression and the detections kept after it.

        Args:
            image_paths: a single path string or an iterable of paths.
        """
        if isinstance(image_paths, str):
            image_paths = [image_paths]

        # tf.compat.v1.Session matches the tf.compat.v1 initializer below.
        with tf.compat.v1.Session() as sess:

            sess.run(tf.compat.v1.global_variables_initializer())

            ckpt = tf.train.get_checkpoint_state(self.model_path)

            if ckpt and ckpt.model_checkpoint_path:
                # Load the saved weights for inference.
                self.net.saver.restore(sess, ckpt.model_checkpoint_path)
                print('Model Reload Successfully!')
            else:
                # Without a checkpoint only the freshly initialised
                # variables are available.
                print('Warning: no checkpoint found in {}; '
                      'running with initial weights.'.format(self.model_path))

            for path in image_paths:

                value = self.pre_process(path)

                # Subtraction creates a new array; value['image'] keeps the
                # original pixel values for drawing.
                image = value['image'] - self.pixel_means

                feed_dict = {self.net.x: image}

                pred_loc, pred_cls, layer_anchors = sess.run(
                    [self.pred_loc, self.pred_cls, self.net.anchors],
                    feed_dict
                )

                pos_loc, pos_cls, pos_anchors, pos_scores = self.decode_output(
                    pred_loc, pred_cls, layer_anchors)

                pos_boxes = decode_targets(pos_anchors, pos_loc, image.shape)

                pos_scores = np.expand_dims(pos_scores, axis=-1)

                # Detections before NMS.
                self.draw_result(
                    value['image'], pos_boxes, pos_cls, value['scale']
                )

                keep_index = py_cpu_nms(np.hstack([pos_boxes, pos_scores]))

                # Detections kept by NMS.
                self.draw_result(
                    value['image'], pos_boxes[keep_index],
                    pos_cls[keep_index], value['scale']
                )

    def draw_result(self, image, pos_boxes, pos_cls, scale, font=cv2.FONT_HERSHEY_SIMPLEX):
        """Draw boxes and class names on the image and show it.

        Args:
            image: resized image as produced by `pre_process`.
            pos_boxes: array of boxes, rows (y_min, x_min, y_max, x_max), in
                the coordinates of the resized image.
            pos_cls: class index per box, used to index `cfg.CLASSES`.
            scale: resize factor that was applied to the image; it is undone
                here for both the image and the boxes.
            font: OpenCV font used for the labels.
        """
        image = cv2.resize(image, dsize=(0, 0), fx=1/scale, fy=1/scale)
        # Use an 8-bit canvas: np.int no longer exists in current NumPy, and
        # a 64-bit integer array is not an image depth OpenCV can draw on.
        image = np.clip(image, 0, 255).astype(np.uint8)

        pos_boxes = pos_boxes * (1/scale)

        for i in range(pos_boxes.shape[0]):

            label = cfg.CLASSES[pos_cls[i]]

            # Plain Python ints for the OpenCV point arguments.
            y_min, x_min, y_max, x_max = (int(v) for v in pos_boxes[i])

            cv2.rectangle(image, (x_min, y_min),
                          (x_max, y_max), (0, 0, 255), thickness=2)

            cv2.putText(image, label, (x_min+20, y_min+20),
                        font, 1, (255, 0, 0), thickness=2)

        # Reverse the channel order (OpenCV loads BGR) for matplotlib.
        plt.imshow(image[:, :, [2, 1, 0]])
        plt.show()

    def decode_output(self, pred_loc, pred_cls, layer_anchors):
        """Select the confident predictions from every feature layer.

        Args:
            pred_loc: per-layer sequence of box offsets.
            pred_cls: per-layer sequence of class scores; column 0 is the
                background class.
            layer_anchors: per-layer sequence of anchors, reshaped here to
                (-1, 4).

        Returns:
            Tuple (pos_loc, pos_cls, pos_anchors, pos_scores) concatenated
            over all layers, containing only entries whose best
            non-background score exceeds `self.score_threshold`.
        """
        pos_loc, pos_cls, pos_anchors, pos_scores = [], [], [], []

        for i, scores in enumerate(pred_cls):

            loc_ = pred_loc[i]
            anchors = layer_anchors[i].reshape((-1, 4))

            # Score and label both come from the non-background columns, so
            # a kept detection is never labelled as background.
            foreground = scores[:, 1:]
            max_scores = np.max(foreground, axis=-1)
            labels = np.argmax(foreground, axis=-1) + 1

            pos_index = np.where(max_scores > self.score_threshold)[0]

            pos_loc.append(loc_[pos_index])
            pos_cls.append(labels[pos_index])
            pos_anchors.append(anchors[pos_index])
            pos_scores.append(max_scores[pos_index])

        pos_loc = np.vstack(pos_loc)
        pos_cls = np.hstack(pos_cls)
        pos_anchors = np.vstack(pos_anchors)
        pos_scores = np.hstack(pos_scores)

        return pos_loc, pos_cls, pos_anchors, pos_scores


if __name__ == "__main__":
    # Demo: build the detector and run it on a sample image.
    SSD_detector().test_ssd('./1.jpg')

更多技术资讯可关注:itheimaGZ获取