R-CNN (the Pioneering Work of Object Detection): Principles Explained + Annotated Code


(Figure: the R-CNN pipeline, rcnn.png)

"Since we combine region proposals with CNNs, we call our method R-CNN" (link to the original paper). R-CNN is the pioneering work of deep-learning-based object detection.

Principles Explained

  1. Input an image.
  2. Use Selective Search to extract roughly 2,000 region proposals.
  3. Feed the sub-image of each proposal into a CNN to obtain a feature vector. Each crop must first be resized to the input size of the network that follows: 227×227 for AlexNet, 224×224 for VGG.
  4. Train one SVM classifier per class, so classifying ten classes means 10 SVM classifiers. Feeding the CNN features from the previous step into every SVM yields a score per class, which gives the classification result (a minimal sketch of this step follows the list).
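The annotated code below stops before this step, so here is a minimal sketch of what step 4 could look like. The names features, class_labels, and feature_vec are hypothetical stand-ins for illustration, not variables from the code below.

from sklearn.svm import LinearSVC
import numpy as np

# Hypothetical stand-ins: 100 samples of 4096-dim CNN features, and a 0/1
# label vector for each of the 10 classes (random here, purely for illustration)
rng = np.random.default_rng(0)
features = rng.normal(size=(100, 4096))
class_labels = rng.integers(0, 2, size=(10, 100))

# R-CNN trains one binary SVM per class on the CNN feature vectors
svms = [LinearSVC(C=1.0).fit(features, class_labels[c]) for c in range(10)]

# At test time, score a feature vector with every SVM and take the best class
feature_vec = features[0].reshape(1, -1)
scores = [clf.decision_function(feature_vec)[0] for clf in svms]
print(int(np.argmax(scores)))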

Annotated Code

  1. This code does not apply NMS post-processing and does not train the 10 SVM classifiers; it stops at step 3 and trains a binary-classification CNN instead (a minimal NMS sketch is appended at the end of this post).
  2. Code link + Dataset 1 link + Dataset 2 link
  3. Run results
(Figures: 1.png, 2.png)
import os, cv2
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Path to the images
path = "/content/drive/MyDrive/RCNN/Images"
# Path to the ground-truth (GT) bounding-box annotations
annot = "/content/drive/MyDrive/RCNN/Airplanes_Annotations"

# Read and display one image
img = cv2.imread(os.path.join(path,"42850.jpg"))
plt.imshow(img)
# Read the top-left and bottom-right corner coordinates of the GT boxes
df = pd.read_csv(os.path.join(annot, "42850.csv"))

# There may be more than one GT box per image
for row in df.iterrows():
  x1 = int(row[1][0].split(" ")[0])
  y1 = int(row[1][0].split(" ")[1])
  x2 = int(row[1][0].split(" ")[2])
  y2 = int(row[1][0].split(" ")[3])
  # Draw the GT box on the image
  cv2.rectangle(img,(x1,y1),(x2,y2),(255,0,0), 2)

# Display the image with its GT boxes
plt.imshow(img)

# Enable OpenCV's optimized code paths
cv2.setUseOptimized(True)
# Create a Selective Search (SS) segmentation object
ss = cv2.ximgproc.segmentation.createSelectiveSearchSegmentation()

# Read the image with OpenCV
im = cv2.imread(os.path.join(path,"42850.jpg"))
# Hand the image to SS
ss.setBaseImage(im)
# Switch SS to its fast mode
ss.switchToSelectiveSearchFast()
# Run Selective Search to obtain a series of candidate boxes
rects = ss.process()
# Copy the original image
imOut = im.copy()
# Draw all candidate boxes
for i, rect in enumerate(rects):
  x, y, w, h = rect
  cv2.rectangle(imOut, (x, y), (x+w, y+h), (0, 255, 0), 1, cv2.LINE_AA)
plt.imshow(imOut)

# Positive and negative training crops, and their labels, gathered from the images
train_images=[]
train_labels=[]

# Compute the IoU (Intersection over Union) of two boxes (a candidate box and a GT box)
def GetIoU(bb1, bb2):
  # Make sure each box's first point is its top-left and the second its bottom-right
  assert bb1['x1'] < bb1['x2']
  assert bb1['y1'] < bb1['y2']
  assert bb2['x1'] < bb2['x2']
  assert bb2['y1'] < bb2['y2']

  x_left = max(bb1['x1'], bb2['x1'])
  y_top = max(bb1['y1'], bb2['y1'])

  x_right = min(bb1['x2'], bb2['x2'])
  y_bottom = min(bb1['y2'], bb2['y2'])

  # No overlap: return 0 right away
  if x_right < x_left or y_bottom < y_top:
    return 0.0

  # Area of the intersection
  intersection_area = (x_right - x_left) * (y_bottom - y_top)

  # Areas of the two boxes
  bb1_area = (bb1['x2'] - bb1['x1']) * (bb1['y2'] - bb1['y1'])
  bb2_area = (bb2['x2'] - bb2['x1']) * (bb2['y2'] - bb2['y1'])

  # IoU = intersection / union
  iou = intersection_area / float(bb1_area + bb2_area - intersection_area)

  # IoU must lie in [0, 1]
  assert iou >= 0.0
  assert iou <= 1.0

  return iou
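
# Quick sanity check of GetIoU with illustrative boxes: two 2x2 boxes
# overlapping in a 1x1 region give IoU = 1 / (4 + 4 - 1) = 1/7 ≈ 0.143
print(GetIoU({"x1": 0, "y1": 0, "x2": 2, "y2": 2},
             {"x1": 1, "y1": 1, "x2": 3, "y2": 3}))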

ss = cv2.ximgproc.segmentation.createSelectiveSearchSegmentation()

# Iterate over the first 10 files in the annotation directory (drop the [:10] slice to use all of them)
for e,i in enumerate(os.listdir(annot)[:10]):
  try:
    # Only the airplane class is handled here; other classes would need their own training pass
    if i.startswith("airplane"):
      # Build and print the image filename
      filename = i.split(".")[0]+".jpg"
      print(e,filename)

      # Read the image and its GT annotations
      image = cv2.imread(os.path.join(path,filename))
      df = pd.read_csv(os.path.join(annot,i))

      # Collect the top-left and bottom-right points of the GT boxes
      gtvalues=[]
      for row in df.iterrows():
        x1 = int(row[1][0].split(" ")[0])
        y1 = int(row[1][0].split(" ")[1])
        x2 = int(row[1][0].split(" ")[2])
        y2 = int(row[1][0].split(" ")[3])
        gtvalues.append({"x1":x1,"x2":x2,"y1":y1,"y2":y2})

      # Run Selective Search on the image to get candidate boxes
      ss.setBaseImage(image)
      ss.switchToSelectiveSearchFast()
      ssresults = ss.process()
      imout = image.copy()

      # Number of positive crops taken from this image
      counter = 0
      # Number of negative crops taken from this image
      falsecounter = 0
      # Set to 1 once enough positive AND negative crops have been collected
      flag = 0
      # Set to 1 once enough positive crops have been collected
      fflag = 0
      # Set to 1 once enough negative crops have been collected
      bflag = 0

      # Iterate over all candidate boxes of this image
      for e, result in enumerate(ssresults):
        # Consider at most 2000 candidate boxes
        if e < 2000 and flag == 0:

          # Compare against every GT box
          for gtval in gtvalues:
            x,y,w,h = result

            # IoU between this candidate box and the GT box
            iou = GetIoU(gtval,{"x1":x,"x2":x+w,"y1":y,"y2":y+h})

            # Collect up to 30 positives, each with IoU above 0.7
            if counter < 30:
              if iou > 0.70:
                # Crop the sub-image
                timage = imout[y: y + h, x : x + w]

                # Resize to 224x224, VGG's input size (AlexNet would need 227x227)
                resized = cv2.resize(timage, (224,224), interpolation = cv2.INTER_AREA)

                # Add it to the training images
                train_images.append(resized)

                # Positives are labelled 1
                train_labels.append(1)

                # One more positive collected
                counter += 1
            else:
              fflag = 1

            # Collect up to 30 negatives, each with IoU below 0.3
            if falsecounter < 30:
              if iou < 0.3:
                timage = imout[y:y+h,x:x+w]
                resized = cv2.resize(timage, (224,224), interpolation = cv2.INTER_AREA)
                train_images.append(resized)
                train_labels.append(0)
                falsecounter += 1
            else:
              bflag = 1

          # Stop once enough positives and negatives have both been collected
          if fflag == 1 and bflag == 1:
            flag = 1

  # If an error occurs, print the details and which image caused it
  except Exception as e:
    print(e)
    print("error in " + filename)
    continue
    
# Convert the collected crops and labels to numpy arrays
X_new = np.array(train_images)
y_new = np.array(train_labels)
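
# Sanity check (illustrative): expect shapes like (N, 224, 224, 3) and (N,)
print(X_new.shape, y_new.shape)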

# Load VGG16 pre-trained on ImageNet
vggmodel = VGG16(weights='imagenet', include_top=True)
vggmodel.summary()
# Setting layer.trainable to False moves all the layer's weights from trainable to non-trainable. This is called "freezing" the layer: the state of a frozen layer won't be updated during training (either when training with fit() or when training with any custom loop that relies on trainable_weights to apply gradient updates).
# https://keras.io/guides/transfer_learning/#:~:text=trainable%20to%20False%20moves%20all,trainable_weights%20to%20apply%20gradient%20updates).
# Freeze the first 15 layers (up through block4); later layers remain trainable
for layer in vggmodel.layers[:15]:
    layer.trainable = False

# Take the output of VGG16's second-to-last layer (the fc2 features)
X = vggmodel.layers[-2].output
print(X.shape)

# Two output units, since this is binary classification (plane / not plane)
predictions = Dense(2, activation="softmax")(X)
print(predictions.shape)

model_final = Model(inputs = vggmodel.input, outputs = predictions)
model_final.summary()

# Use the Adam optimizer with categorical cross-entropy loss
opt = keras.optimizers.Adam(learning_rate=0.0001)
model_final.compile(loss=keras.losses.categorical_crossentropy, optimizer=opt, metrics=["accuracy"])

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

# For binary problems, sklearn's LabelBinarizer returns a single 0/1 column,
# but categorical_crossentropy with a 2-unit softmax needs two columns.
# This subclass expands binary output to one-hot form (and reverses it).
class MyLabelBinarizer(LabelBinarizer):
  def transform(self, y):
    Y = super().transform(y)
    if self.y_type_ == 'binary':
      # Column 0 indicates the positive class, column 1 its complement
      return np.hstack((Y, 1-Y))
    else:
      return Y

  def inverse_transform(self, Y, threshold=None):
    if self.y_type_ == 'binary':
      return super().inverse_transform(Y[:, 0], threshold)
    else:
      return super().inverse_transform(Y, threshold)

lenc = MyLabelBinarizer()
Y =  lenc.fit_transform(y_new)
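
# Illustration with made-up labels: for binary input, a plain LabelBinarizer
# returns one column, e.g. [[1],[0],[1]]; MyLabelBinarizer expands it to the
# two columns [[1,0],[0,1],[1,0]] that the 2-unit softmax expects
print(MyLabelBinarizer().fit_transform(np.array([1, 0, 1])))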

# Split into training and test sets
X_train, X_test , y_train, y_test = train_test_split(X_new,Y,test_size=0.10)
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

# Data augmentation
trdata = ImageDataGenerator(horizontal_flip=True, vertical_flip=True, rotation_range=90)
traindata = trdata.flow(x=X_train, y=y_train)

tsdata = ImageDataGenerator(horizontal_flip=True, vertical_flip=True, rotation_range=90)
testdata = tsdata.flow(x=X_test, y=y_test)

# Save checkpoints and use early stopping to guard against overfitting
# (with patience=100 and only 10 epochs, early stopping never actually fires here)
checkpoint = ModelCheckpoint("ieeercnn_vgg16_1.h5", monitor='val_loss', verbose=1, save_best_only=True, save_weights_only=False, mode='auto')
early = EarlyStopping(monitor='val_loss', min_delta=0, patience=100, verbose=1, mode='auto')
# Start training (fit_generator is deprecated; fit() accepts generators directly)
hist = model_final.fit(traindata, steps_per_epoch=1, epochs=10, validation_data=testdata, validation_steps=2, callbacks=[checkpoint, early])

# Plot the training curves
import matplotlib.pyplot as plt

plt.plot(hist.history["accuracy"])
plt.plot(hist.history['val_accuracy'])
plt.plot(hist.history['loss'])
plt.plot(hist.history['val_loss'])
plt.title("model loss")
plt.ylabel("Loss")
plt.xlabel("Epoch")
plt.legend(["Accuracy", "Val_accuracy", "Loss","Validation Loss"])
plt.show()
plt.savefig('chart loss.png')

# Check the fine-tuned VGG classifier: does it classify a test crop correctly?

im = X_test[10]
plt.imshow(im)
img = np.expand_dims(im, axis=0)
out= model_final.predict(img)
if out[0][0] > out[0][1]:
    print("plane")
else:
    print("not plane")
    

z = 0
for e,i in enumerate(os.listdir(path)):
  # In this dataset, test images have filenames starting with "4"
  if i.startswith("4"):
    z += 1

    # Run Selective Search to get candidate boxes
    img = cv2.imread(os.path.join(path,i))
    ss.setBaseImage(img)
    ss.switchToSelectiveSearchFast()
    ssresults = ss.process()
    imout = img.copy()

    for e, result in enumerate(ssresults):
      if e < 2000:
        x,y,w,h = result
        timage = imout[y:y+h,x:x+w]
        resized = cv2.resize(timage, (224,224), interpolation = cv2.INTER_AREA)
        img = np.expand_dims(resized, axis=0)
        # Classify the crop selected by this candidate box
        out= model_final.predict(img)

        # Draw the box if the positive-class probability exceeds 0.65
        if out[0][0] > 0.65:
          cv2.rectangle(imout, (x, y), (x+w, y+h), (0, 255, 0), 1, cv2.LINE_AA)
          
    plt.imshow(imout)   
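
As noted above, this walkthrough never applies NMS, so several overlapping boxes are usually drawn around a single airplane. Below is a minimal sketch of greedy NMS that reuses the GetIoU function defined earlier; the arguments boxes (a list of (x, y, w, h) tuples) and scores (their positive-class probabilities) are illustrative names, not variables from the code above.

def nms(boxes, scores, iou_thresh=0.5):
  # Greedy NMS: keep the highest-scoring box, drop every remaining box whose
  # IoU with it exceeds iou_thresh, then repeat with whatever is left
  order = sorted(range(len(boxes)), key=lambda i: scores[i], reverse=True)
  keep = []
  while order:
    i = order.pop(0)
    keep.append(i)
    x, y, w, h = boxes[i]
    bi = {"x1": x, "y1": y, "x2": x + w, "y2": y + h}
    order = [j for j in order
             if GetIoU(bi, {"x1": boxes[j][0], "y1": boxes[j][1],
                            "x2": boxes[j][0] + boxes[j][2],
                            "y2": boxes[j][1] + boxes[j][3]}) < iou_thresh]
  return keep  # indices of the boxes to keep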
      

To be continued

  1. Selective Search
  2. Training the ten SVMs