Tensorflow Object Detection 目标检测API使用

记录使用 Tensorflow 官方的 Object Detection API 进行自定义模型训练的踩坑过程。官方项目地址
https://github.com/tensorflow/models

假设自定义的项目名称是 Demo。推荐步骤：

一. 环境准备

git clone 到本地，然后建议复制出项目的 research/object_detection 文件夹和 research/slim 文件夹到 Demo 目录下
新建环境变量 PYTHONPATH=Demo:Demo/slim
下载 protobuf 3.0 工具（注意版本最好是 3.0 的，高版本的 protobuf 没有通配符匹配功能，Windows 版本到对应的发布路径下载即可）

wget -O protobuf.zip https://github.com/google/protobuf/releases/download/v3.0.0/protoc-3.0.0-linux-x86_64.zip

使用 protobuf 工具编译 Demo/object_detection/protos 文件夹下的 *.proto 文件,生成的.py文件也在protos文件夹下即可

./bin/protoc object_detection/protos/*.proto --python_out=object_detection/protos/

安装依赖的包

pip install tensorflow-gpu==1.14.0
pip install --user Cython
pip install --user contextlib2
pip install --user pillow
pip install --user lxml
pip install --user matplotlib
pip install --user opencv-python

检测上述步骤是否成功 (在Demo根目录执行，下同)，最后输出 OK 表示环境准备成功

python object_detection/builders/model_builder_test.py

二. 自定义训练

准备图片，分别是 images 和 xml 描述文件即可，这里就存放在 Demo/data 下面，images存放图片信息， labels存放xml描述信息。两者名字必须一一对应。

data文件夹
将准备好的数据转换成 TF record 格式的文件，使用以下python脚本一步到位

'''
    将普通xml文件转换成 tensorflow 的 tf record文件
'''

import os, io
import random
import os.path
import cv2
import imutils
from xml.dom import minidom
from os.path import basename
from PIL import Image
from object_detection.utils import dataset_util
from collections import namedtuple
import pandas as pd
import tensorflow as tf

#------------               User-configurable section: begin ----------------------------
project_dir = os.path.dirname(__file__)
data_dir = os.path.join(project_dir, "data")
folderCharacter = "/"  # \\ is for windows
classList = { "test":0}  # detection classes; values start at 0, the generated pbtxt numbers them from 1 (automatically)
xmlFolder = os.path.join(data_dir, "labels")
imgFolder = os.path.join(data_dir, "images")
savePath = os.path.join(data_dir, "ssd_dataset")
testRatio = 0.2                                                  # fraction of the images put in the test set
recordTF_out = ("train.record", "test.record")  # names of the generated record files
recordTF_in = ("train.csv", "test.csv")  # names of the intermediate CSV files (train, test)

resizeImage = False # set true will cause rfcn box large than image size problem
resize_width = 1920  # size given to the longer image side when resizeImage is true
imgResizedFolder = imgFolder + "_" + str(resize_width)  # folder the image copies are written to and later read from
#-----------               User-configurable section: end  ------------------------------------------

# Names of the images that have a matching XML annotation (filled in step 1).
fileList = []
outputTrainFile = folderCharacter.join((savePath, recordTF_in[0]))
outputTestFile = folderCharacter.join((savePath, recordTF_in[1]))

# Both output folders must exist before anything is written into them.
for required_dir in (savePath, imgResizedFolder):
    if not os.path.exists(required_dir):
        os.makedirs(required_dir)

def transferTF(xmlFilepath, imgFilepath, labelGrep=""):
    """Read one image and its annotation XML and return the box data.

    The image is written to ``imgResizedFolder`` (resized first when
    ``resizeImage`` is true) and every box coordinate is scaled by the same
    ratio as the image.

    Args:
        xmlFilepath: path of the annotation XML file.
        imgFilepath: path of the image file.
        labelGrep: unused; kept so existing callers keep working.

    Returns:
        ``(image file name, width, height, names, xmins, ymins, xmaxs,
        ymaxs)``. The last five items are lists holding one entry per
        matching XML tag. A tuple of eight ``None`` is returned when either
        file is missing or the image cannot be decoded.
    """
    nothing = (None,) * 8
    if not (os.path.exists(xmlFilepath) and os.path.exists(imgFilepath)):
        return nothing

    img = cv2.imread(imgFilepath)
    if img is None:
        # cv2.imread reports an unreadable or corrupt file by returning None
        # instead of raising, so skip the image rather than crash on .shape.
        print("skip unreadable image: {}".format(imgFilepath))
        return nothing

    img_file, img_file_extension = os.path.splitext(imgFilepath)
    img_filename = basename(img_file)
    org_height, org_width = img.shape[:2]

    if resizeImage:
        # The longer side becomes resize_width; imutils keeps the aspect ratio.
        if org_width >= org_height:
            img = imutils.resize(img, width=resize_width)
        else:
            img = imutils.resize(img, height=resize_width)

    img_h, img_w = img.shape[:2]
    # float() keeps the ratio fractional even under Python 2 division.
    size_ratio_w = float(img_w) / org_width
    size_ratio_h = float(img_h) / org_height

    cv2.imwrite(imgResizedFolder + folderCharacter + img_filename + img_file_extension, img)

    labelXML = minidom.parse(xmlFilepath)

    def _tag_texts(tag):
        """Return the text of every ``tag`` element in document order."""
        return [elem.firstChild.data for elem in labelXML.getElementsByTagName(tag)]

    def _scaled(tag, ratio):
        """Return the integer coordinates stored in ``tag``, scaled by ``ratio``."""
        # int(float(...)) also accepts coordinates written as "12.0".
        return [int(int(float(text)) * ratio) for text in _tag_texts(tag)]

    labelName = [str(text) for text in _tag_texts("name")]
    labelXmin = _scaled("xmin", size_ratio_w)
    labelYmin = _scaled("ymin", size_ratio_h)
    labelXmax = _scaled("xmax", size_ratio_w)
    labelYmax = _scaled("ymax", size_ratio_h)

    return (img_filename+img_file_extension , img_w, img_h, labelName, labelXmin, labelYmin, labelXmax, labelYmax)

def class_text_to_int(row_label):
    """Return the label-map id that belongs to a class name.

    ``classList`` numbers the classes from 0, while the generated
    ``object_detection.pbtxt`` numbers the same classes from 1 (the Object
    Detection API reserves id 0 for the background). The id stored in the
    record is therefore the ``classList`` value plus one, so that it agrees
    with the label map.

    Args:
        row_label: class name as it appears in the CSV ``class`` column.

    Returns:
        The 1-based integer id of the class.

    Raises:
        KeyError: if ``row_label`` is not a key of ``classList``.
    """
    print("row_label :{}".format(row_label))
    return classList[row_label] + 1

def split(df, group):
    """Bundle the rows of ``df`` by the values of the ``group`` column.

    Returns a list of ``data(filename, object)`` named tuples, where
    ``filename`` is a group key and ``object`` is the sub-frame holding the
    rows that carry that key.
    """
    data = namedtuple('data', ['filename', 'object'])
    grouped = df.groupby(group)
    bundles = []
    for key in grouped.groups:
        bundles.append(data(key, grouped.get_group(key)))
    return bundles


def create_tf_example(group, path):
    """Build the ``tf.train.Example`` for one image and all of its boxes.

    Args:
        group: a ``data(filename, object)`` tuple; ``object`` holds one row
            per box with the columns ``xmin``, ``ymin``, ``xmax``, ``ymax``
            (pixels) and ``class`` (class name).
        path: folder that contains the image file named ``group.filename``.

    Returns:
        A ``tf.train.Example`` carrying the encoded image, its size and the
        boxes normalised by the image size.
    """
    with tf.gfile.GFile(os.path.join(path, '{}'.format(group.filename)), 'rb') as fid:
        encoded_image = fid.read()
    # The size is read from the file that is stored, not from the CSV.
    width, height = Image.open(io.BytesIO(encoded_image)).size

    filename = group.filename.encode('utf8')
    # Record the real format: png/bmp files are accepted too, so a fixed
    # 'jpg' would mislabel them. 'jpg' stays the fallback when the name has
    # no extension.
    extension = os.path.splitext(group.filename)[1].lstrip('.').lower()
    image_format = (extension or 'jpg').encode('utf8')

    def _normalised(pixel, size):
        """Return ``pixel / size`` limited to the valid range [0, 1]."""
        # Rounding during resizing can push a box slightly outside the image.
        return min(max(float(int(pixel)) / size, 0.0), 1.0)

    xmins = []
    xmaxs = []
    ymins = []
    ymaxs = []
    classes_text = []
    classes = []

    for _, row in group.object.iterrows():
        xmins.append(_normalised(row['xmin'], width))
        xmaxs.append(_normalised(row['xmax'], width))
        ymins.append(_normalised(row['ymin'], height))
        ymaxs.append(_normalised(row['ymax'], height))
        classes_text.append(row['class'].encode('utf8'))
        classes.append(class_text_to_int(row['class']))

    tf_example = tf.train.Example(features=tf.train.Features(feature={
        'image/height': dataset_util.int64_feature(height),
        'image/width': dataset_util.int64_feature(width),
        'image/filename': dataset_util.bytes_feature(filename),
        'image/source_id': dataset_util.bytes_feature(filename),
        'image/encoded': dataset_util.bytes_feature(encoded_image),
        'image/format': dataset_util.bytes_feature(image_format),
        'image/object/bbox/xmin': dataset_util.float_list_feature(xmins),
        'image/object/bbox/xmax': dataset_util.float_list_feature(xmaxs),
        'image/object/bbox/ymin': dataset_util.float_list_feature(ymins),
        'image/object/bbox/ymax': dataset_util.float_list_feature(ymaxs),
        'image/object/class/text': dataset_util.bytes_list_feature(classes_text),
        'image/object/class/label': dataset_util.int64_list_feature(classes),
    }))
    return tf_example

#-------------------------------------------------
#step 1: make train.csv / test.csv

# Collect every image that has an annotation file with the same base name.
image_extensions = (".jpg", ".jpeg", ".png", ".bmp")

for entry in os.listdir(imgFolder):
    filename, file_extension = os.path.splitext(entry)
    if file_extension.lower() not in image_extensions:
        continue

    # Keep the spelling found on disk: storing a lower-cased extension would
    # turn "IMG.JPG" into "IMG.jpg", which does not exist on a case-sensitive
    # file system, so the image would be silently skipped later.
    imgFile = entry
    xmlFile = basename(filename) + ".xml"
    print("XML:"+xmlFile, "IMG:"+imgFile)

    if os.path.exists(xmlFolder+folderCharacter+xmlFile):
        fileList.append(imgFile)

print("total image files: ", len(fileList))

testCount = int(len(fileList) * testRatio)
trainCount = len(fileList) - testCount

# Draw the test indices at random; every other index goes to training.
a = range(len(fileList))
test_data = random.sample(a, testCount)
test_lookup = set(test_data)  # set membership keeps the split linear in size
train_data = [x for x in a if x not in test_lookup]
print ("Train:{} images".format(len(train_data)))
print("Test:{} images".format(len(test_data)))


csvFilename = savePath + folderCharacter + recordTF_in[0]
print("writeing to {}".format(csvFilename))

# Open with 'w', not 'a': in append mode a second run adds another header
# line and duplicate rows, which breaks the later pd.read_csv step.
with open(csvFilename, 'w') as the_file:
    i = 0  # number of images written
    the_file.write("filename,width,height,class,xmin,ymin,xmax,ymax" + '\n')

    for file_index in train_data:
        base_filename = os.path.splitext(fileList[file_index])[0]
        xmlpath = xmlFolder + folderCharacter + base_filename + ".xml"
        imgpath = imgFolder + folderCharacter + fileList[file_index]

        (imgfile , w, h, labels, Xmin, Ymin, Xmax, Ymax) = transferTF(xmlpath, imgpath, "")
        print(imgfile , w, h, labels, Xmin, Ymin, Xmax, Ymax)
        if imgfile is None:
            continue

        # One CSV row per box of this image.
        for id2, label in enumerate(labels):
            fields = [imgfile, str(w), str(h), label,
                      str(Xmin[id2]), str(Ymin[id2]), str(Xmax[id2]), str(Ymax[id2])]
            the_file.write(','.join(fields) + '\n')

        i += 1

# The with-statement has already closed the file.
print("Total {} train records to {}".format(i, recordTF_in[0]))


csvFilename = savePath + folderCharacter + recordTF_in[1]
print("writeing to {}".format(csvFilename))

# Open with 'w', not 'a': in append mode a second run adds another header
# line and duplicate rows, which breaks the later pd.read_csv step.
with open(csvFilename, 'w') as the_file:
    i = 0  # number of images written
    the_file.write("filename,width,height,class,xmin,ymin,xmax,ymax" + '\n')

    for file_index in test_data:
        base_filename = os.path.splitext(fileList[file_index])[0]
        xmlpath = xmlFolder + folderCharacter + base_filename + ".xml"
        imgpath = imgFolder + folderCharacter + fileList[file_index]

        (imgfile, w, h, labels, Xmin, Ymin, Xmax, Ymax) = transferTF( xmlpath, imgpath, "")
        if imgfile is None:
            continue

        print("TEST_labels:", labels)
        print("TEST_Xmin:", Xmin)
        print("TEST_Xmax:", Xmax)
        print("TEST_Ymin:", Ymin)
        print("TEST_Ymax:", Ymax)

        # One CSV row per box of this image.
        for id2, label in enumerate(labels):
            fields = [imgfile, str(w), str(h), label,
                      str(Xmin[id2]), str(Ymin[id2]), str(Xmax[id2]), str(Ymax[id2])]
            the_file.write(','.join(fields) + '\n')

        i += 1

# The with-statement has already closed the file.
print("Total {} test records to {}".format(i, recordTF_in[1]))

#----------------------------------------------------------
#step 2: make TFRecords: train.record / test.record

print("----------- Transfer to TF Record ---------------")

# Index 0 is the train set, index 1 the test set.
for i in (0, 1):
    output_path = savePath + folderCharacter + recordTF_out[i]
    examples = pd.read_csv(savePath + folderCharacter + recordTF_in[i])
    grouped = split(examples, 'filename')

    # The context manager closes the writer even when building an example
    # fails, so no file handle is left open.
    with tf.python_io.TFRecordWriter(output_path) as writer:
        for group in grouped:
            print("group :{}".format(group))
            tf_example = create_tf_example(group, imgResizedFolder)
            writer.write(tf_example.SerializeToString())

    print('Successfully created the TFRecords: {}'.format(output_path))

#-----------------------------------------

print("-----------make object_detection.pbtxt -----------")

filename = savePath + folderCharacter + 'object_detection.pbtxt'
print("writeing to {}".format(filename))

# Map each class value back to its name so that items can be written by id.
inv_classList = {v: k for k, v in classList.items()}
print(inv_classList)

# Open with 'w', not 'a': in append mode a second run would add every item
# again and leave duplicate ids in the label map.
with open(filename, 'w') as the_file:

    # Label-map ids start at 1, classList values at 0, hence the i-1 lookup.
    for i in range(1, len(classList)+1):
        print("i=", i)
        the_file.write("item {" + '\n')
        the_file.write("  id: " + str(i) + '\n')
        the_file.write("  name: '" + inv_classList[i-1] + "'" + '\n')
        the_file.write("}" + '\n\n')

# The with-statement has already closed the file.

执行这个脚本会在 data/ssd_dataset 目录下生成train.record, test.record 和 object_detection.pbtxt 三个文件和csv中间文件。

下载预训练模型，模型选择地址：
https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/detection_model_zoo.md
自己综合选择吧，我这里选择的是ssd_resnet_50_fpn_coco ☆ ，速度还行，准确率比较高
解压缩预训练模型到项目根目录

   tar xvf  ssd_resnet50_v1_fpn_shared_box_predictor_640x640_coco14_sync_2018_07_03.tar.gz

查看预训练模型的目录结构：

预训练模型目录结构

配置pipeline.config文件，这是训练的关键。有几个注意点说明一下：
1. train_config 块的 batch_size 的大小是你训练时发生OOM的关键，如果发生OOM了，减小这个数值即可。
2. train_config 块的 fine_tune_checkpoint 配置为上一步解压出的预训练模型目录下 checkpoint 前缀的绝对路径（即 预训练模型目录/model.ckpt，而不是目录本身）
3. .pbtxt 和 .record 文件的路径配置（绝对路径）

train_input_reader {
   label_map_path: "pbtxt 文件的绝对路径"
    tf_record_input_reader {
       input_path: "train record 文件的绝对路径"
    }
}
eval_input_reader {
  label_map_path: "pbtxt 文件的绝对路径"
  shuffle: false
  num_readers: 1
  tf_record_input_reader {
    input_path:  "test record 文件的绝对路径"
  }
}

训练模型，这里假设训练结果保存路径为 training 文件夹（相对于项目根目录）

python object_detection/model_main.py --alsologtostderr  --pipeline_config_path=上一步里面pipeline.config文件相对路径（绝对路径也行）  --model_dir=存放训练结果的地址（这里是training/） --num_train_steps=训练步骤（这个模型默认是25000）

将训练后的模型转换成可执行的pb格式文件

 python object_detection/export_inference_graph.py \
    --input_type image_tensor \
    --pipeline_config_path pipeline.config文件的相对地址 \
    --trained_checkpoint_prefix 训练后的ckpt文件前缀（这里假设为 training/model.ckpt-50000） \
    --output_directory model （pb文件的保存路径）

测试模型，写了个脚本测试：

'''
    模型测试
'''
import tensorflow as tf
import cv2 as cv
import os
import imutils  # pip install imutils

# All paths below are resolved relative to the folder of this script.
project_dir = os.path.dirname(__file__)

model_dir = os.path.join(project_dir, "model") # folder that holds the exported pb file
data_dir = os.path.join(project_dir, "data")       # path of the data folder
pb_path = os.path.join(project_dir, "pb")          # folder of the pbtxt file; the pbtxt generated earlier was placed in the project's pb folder

model_path = os.path.join(model_dir, "frozen_inference_graph.pb")
pbtxt_path = os.path.join(pb_path, 'object_detection.pbtxt')
testimg = os.path.join(data_dir, 'test.jpg')  # path of the image used for the test

# Load the frozen graph. tf.gfile.GFile replaces the deprecated FastGFile.
with tf.gfile.GFile(model_path, 'rb') as f:
    graph_def = tf.GraphDef()
    graph_def.ParseFromString(f.read())

# cv.imread returns None instead of raising when the file cannot be read,
# so fail here with a clear message rather than on img.shape below.
img = cv.imread(testimg)
if img is None:
    raise IOError("cannot read test image: {}".format(testimg))

with tf.Session() as sess:

    # The session uses the default graph, so the import lands in sess.graph.
    tf.import_graph_def(graph_def, name='')

    rows, cols = img.shape[:2]
    inp = cv.resize(img, (450, 450))
    inp = inp[:, :, [2, 1, 0]]  # BGR2RGB

    out = sess.run([sess.graph.get_tensor_by_name('num_detections:0'),
                    sess.graph.get_tensor_by_name('detection_scores:0'),
                    sess.graph.get_tensor_by_name('detection_boxes:0'),
                    sess.graph.get_tensor_by_name('detection_classes:0')],
                   feed_dict={'image_tensor:0': inp.reshape(1, inp.shape[0], inp.shape[1], 3)})

    num_detections = int(out[0][0])
    for i in range(num_detections):
        cat = int(out[3][0][i])
        score  = float(out[1][0][i])
        # The box values are indexed as ymin, xmin, ymax, xmax and scaled
        # back to the size of the original image.
        bbox = [float(v) for v in out[2][0][i]]
        x = bbox[1] * cols
        y = bbox[0] * rows
        right = bbox[3] * cols
        bottom = bbox[2] * rows
        cv.rectangle(img, (int(x), int(y)), (int(right), int(bottom)), (125, 255, 51), thickness=2)
        print(cat, "-->", score, x, y, right, bottom)
    cv.imshow('SHOW', imutils.resize(img, width=1080))
    cv.waitKey()

运行脚本就可以看到结果了。

环境踩坑：

opencv-python 的安装问题，建议安装：

  pip3 install opencv-contrib-python==3.3.0.9

gast库版本：

pip3 uninstall -y gast
pip3 install gast==0.2.2

在windows 上训练自定义模型，无法避免的一个包 pycocotools 官方安装是失败的，换种方式:

pip3 install Cython
pip3 install git+https://github.com/philferriere/cocoapi.git#subdirectory=PythonAPI

Cython 是pycocotools的依赖

总结：

仔细看日志，特别是预训练模型选择那一块，不同的模型训练时报的错误都不一样，有些是bug，有些是包版本问题，多google，看官方github的issue解决问题。这里用的tensorflow gpu版本是1.14. 好运~

Tensorflow Object Detection 目标检测API使用

假设自定义的项目名称是 Demo。推荐步骤：

一. 环境准备

二. 自定义训练

环境踩坑：

总结：

推荐阅读更多精彩内容