CPMTransformationParameter参数解析
# CPMData input layer: reads samples from an LMDB and produces two tops,
# "data" and "label".
# Protobuf text format only accepts '#' comments, so annotations use '#'.
layer {
  name: "data"
  type: "CPMData"
  top: "data"
  top: "label"
  data_param {  # caffe.proto Line:687
    source: "/mnt/sdb/yangbin/COCO_kpt/lmdb"
    batch_size: 10
    backend: LMDB
  }
  # Settings read through CPMTransformationParameter. crop_size is not set
  # here, so it keeps its proto default of 0 and crop_size_x / crop_size_y
  # are the values that apply.
  cpm_transform_param {
    stride: 8              # label maps are (height / stride) x (width / stride)
    max_rotate_degree: 40
    visualize: false
    crop_size_x: 368
    crop_size_y: 368
    scale_prob: 1
    scale_min: 0.5
    scale_max: 1.1
    target_dist: 0.6
    center_perterb_max: 40
    do_clahe: false
    num_parts: 56          # label blob has 2 * (num_parts + 1) = 114 channels
    np_in_lmdb: 17
  }
}
// Parameters of the CPMData layer's transformer; a layer supplies them in its
// cpm_transform_param block.
message CPMTransformationParameter {
// Data pre-processing: simple scaling and mean subtraction.
// Transform() subtracts the mean first and multiplies by scale afterwards.
optional float scale = 1 [default = 1];
// Enables random mirroring in Transform() (default false).
optional bool mirror = 2 [default = false];
// Square crop size (default 0). The configuration in these notes leaves it at 0
// and sets crop_size_x / crop_size_y instead.
optional uint32 crop_size = 3 [default = 0];
// Mean image file.
// NOTE(review): the original note said mean_file and mean_value may be specified
// together; upstream Caffe appears to document them as mutually exclusive, and
// Transform() ignores mean_value whenever mean_file is set - confirm.
optional string mean_file = 4;
// if specified can be repeated once (the same mean is subtracted from every channel)
// or can be repeated the same number of times as channels (one mean per channel)
repeated float mean_value = 5;
// The label blob is shaped (height / stride) x (width / stride).
optional uint32 stride = 6 [default = 4];
optional float scale_cvg = 7 [default = 0.5];
optional uint32 max_cvg_len = 8 [default = 50];
optional uint32 min_cvg_len = 9 [default = 50];
optional bool opaque_coverage = 10 [default = true];
optional string coverage = 11 [default = "gridbox_max"];
optional float flip_prob = 12 [default = 0.5];
optional float max_rotate_degree = 13 [default = 5.0];
// When true, Transform_nv calls visualize() on the augmented image.
optional bool visualize = 14 [default = false];
// Width and height used for the data blob in the TRAIN phase, and the size of
// the augmented image allocated by Transform_nv.
optional uint32 crop_size_x = 15 [default = 368];
optional uint32 crop_size_y = 16 [default = 368];
optional float scale_prob = 17 [default = 0.5];
optional float scale_min = 18 [default = 0.9];
optional float scale_max = 19 [default = 1.1];
optional float bbox_norm_factor = 20 [default = 300];
optional string img_header = 21 [default = "."];
// Force the decoded image to have 3 color channels.
optional bool force_color = 22 [default = false];
// Force the decoded image to have 1 color channels.
optional bool force_gray = 23 [default = false];
optional float target_dist = 24 [default = 1.0];
optional float center_perterb_max = 25 [default = 10.0];
optional float sigma = 26 [default = 7.0];
optional float sigma_center = 27 [default = 21.0];
// Both CLAHE values are read by Transform_nv and passed to clahe() when do_clahe is true.
optional float clahe_tile_size = 28 [default = 8.0];
optional float clahe_clip_limit = 29 [default = 4.0];
optional bool do_clahe = 30 [default = false];
// The label blob has 2 * (num_parts + 1) channels.
optional uint32 num_parts = 31 [default = 14];
optional uint32 num_total_augs = 32 [default = 82];
optional string aug_way = 33 [default = "rand"];
// When equal to 1, Transform_nv converts the image to gray and back to 3 channels.
optional uint32 gray = 34 [default = 0];
// NOTE(review): presumably the number of parts stored per sample in the LMDB - confirm.
optional uint32 np_in_lmdb = 35 [default = 16];
// Tags 36 and 37 do not appear in this listing.
// When true, Transform_nv calls TransformMetaJoints() on the parsed metadata.
optional bool transform_body_joint = 38 [default = true];
}
一个Datum有三个维度:channels、height和width,可以看做是少了num维度的Blob。存放数据的地方有两个:data(bytes类型)和float_data,分别存放字节型(整数)数据和浮点型数据。图像数据一般是整型,放在data里;特征向量一般是浮点型,放在float_data里。label存放数据的类别标签,是整数型。encoded标识data中存放的是否为需要先解码的数据(例如JPEG或PNG之类经过编码的图像)。
// One data record: dimensions, payload (bytes or floats), an integer label and
// a flag telling whether the payload is an encoded image.
message Datum {
optional int32 channels = 1; // Dimension info of the data: channels * height * width
optional int32 height = 2;
optional int32 width = 3;
// the actual image data, in bytes
optional bytes data = 4; // Image data, stored as raw bytes
optional int32 label = 5;
// Optionally, the datum could also hold float data.
repeated float float_data = 6; // Optional: the data may be stored as floats instead
// If true data contains an encoded image that need to be decoded
optional bool encoded = 7 [default = false]; // Marks data as an encoded image (e.g. JPEG or PNG) that must be decoded before use
}
DataLayerSetUp函数实现层设置
// Constructor: forwards param to the prefetching base class and to reader_, and
// initialises cpm_transform_param_ from the layer's cpm_transform_param block.
template <typename Dtype>
CPMDataLayer<Dtype>::CPMDataLayer(const LayerParameter& param)
: BasePrefetchingDataLayer<Dtype>(param),
reader_(param),
cpm_transform_param_(param.cpm_transform_param()){
}
// Layer setup. Only the opening lines of the function appear in this excerpt.
template <typename Dtype>
void CPMDataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
// Build the transformer from the stored parameters and this layer's phase.
cpm_data_transformer_.reset(
new CPMDataTransformer<Dtype>(cpm_transform_param_, this->phase_)); // phase_ (TRAIN or TEST) comes from the layer running DataLayerSetUp
cpm_data_transformer_->InitRand(); // Initialise the transformer (InitRand presumably sets up its random generator - confirm)
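上述两部分代码都位于cpm_data_layer.cpp中。第一部分是CPMData层的构造函数:LayerParameter是描述各类层参数的消息类(例如Loss层、ReLU层、Data层……),param则是从我们编写的网络定义中读入的该层参数。构造函数用param.cpm_transform_param()来初始化成员cpm_transform_param_(其声明为CPMTransformationParameter cpm_transform_param_;),这样cpm_transform_param_就包含了prototxt中cpm_transform_param块的所有参数,如下所示。第二部分是DataLayerSetUp函数的开头:用cpm_transform_param_和当前的phase创建cpm_data_transformer_,并调用InitRand()完成初始化。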
# Values held by cpm_transform_param_ for this network. Trailing notes compare
# each value with the default declared in CPMTransformationParameter.
cpm_transform_param {
stride: 8  # proto default is 4
max_rotate_degree: 40  # proto default is 5.0
visualize: false  # same as the proto default
crop_size_x: 368  # same as the proto default
crop_size_y: 368  # same as the proto default
scale_prob: 1  # proto default is 0.5
scale_min: 0.5  # proto default is 0.9
scale_max: 1.1  # same as the proto default
target_dist: 0.6  # proto default is 1.0
center_perterb_max: 40  # proto default is 10.0
do_clahe: false  # same as the proto default
num_parts: 56  # proto default is 14
np_in_lmdb: 17  # proto default is 16
}
- 设置crop_size_x和crop_size_y
// image
// Shape the data blobs. crop_size is the square-crop parameter; batch_size comes from data_param.
const int crop_size = this->layer_param_.cpm_transform_param().crop_size();
const int batch_size = this->layer_param_.data_param().batch_size();
if (crop_size > 0) { // Not taken with the configuration in these notes: crop_size keeps its default of 0
// NOTE(review): everything in this branch is commented out, so with crop_size > 0
// neither top[0] nor the prefetch buffers are reshaped here.
// top[0]->Reshape(batch_size, datum.channels(), crop_size, crop_size);
// for (int i = 0; i < this->PREFETCH_COUNT; ++i) {
// this->prefetch_[i].data_.Reshape(batch_size, datum.channels(), crop_size, crop_size);
// }
// //this->transformed_data_.Reshape(1, 4, crop_size, crop_size);
// this->transformed_data_.Reshape(1, 6, crop_size, crop_size);
}
else {
// Outside TRAIN the datum's own size is used; in TRAIN the configured crop size is used.
const int height = this->phase_ != TRAIN ? datum.height() : // non-TRAIN: height stored in the datum
this->layer_param_.cpm_transform_param().crop_size_y(); // TRAIN: crop_size_y (368 in the prototxt shown in these notes)
const int width = this->phase_ != TRAIN ? datum.width() :
this->layer_param_.cpm_transform_param().crop_size_x(); // TRAIN: crop_size_x (368 in the prototxt shown in these notes)
LOG(INFO) << "PREFETCH_COUNT is " << this->PREFETCH_COUNT;
// The top blob and every prefetch buffer hold a whole batch.
top[0]->Reshape(batch_size, datum.channels(), height, width);
for (int i = 0; i < this->PREFETCH_COUNT; ++i) {
this->prefetch_[i].data_.Reshape(batch_size, datum.channels(), height, width);
}
//this->transformed_data_.Reshape(1, 4, height, width);
// The per-item scratch blob holds a single sample.
this->transformed_data_.Reshape(1, datum.channels(), height, width);
}
- 设置num_parts stride
// label
// Shape the label blobs: 2 * (num_parts + 1) channels at 1/stride of the image resolution.
if (this->output_labels_) {
const int stride = this->layer_param_.cpm_transform_param().stride();
// Same size rule as for the data blob: datum size outside TRAIN, crop size in TRAIN.
const int height = this->phase_ != TRAIN ? datum.height() :
this->layer_param_.cpm_transform_param().crop_size_y();
const int width = this->phase_ != TRAIN ? datum.width() :
this->layer_param_.cpm_transform_param().crop_size_x();
int num_parts = this->layer_param_.cpm_transform_param().num_parts(); // num_parts = 56 for the COCO configuration in these notes
top[1]->Reshape(batch_size, 2*(num_parts+1), height/stride, width/stride);
// Log output observed during training (data top, then label top):
//I1008 14:29:50.468617 33177 net.cpp:157] Top shape: 10 6 368 368 (8125440)
//I1008 14:29:50.468626 33177 net.cpp:157] Top shape: 10 114 46 46 (2412240)
for (int i = 0; i < this->PREFETCH_COUNT; ++i) { // per the notes: static const int PREFETCH_COUNT = 3;
this->prefetch_[i].label_.Reshape(batch_size, 2*(num_parts+1), height/stride, width/stride); // 10,114,46,46 with the settings above
}
this->transformed_label_.Reshape(1, 2*(num_parts+1), height/stride, width/stride); // 1,114,46,46 with the settings above
}
- 接下来的load_batch是一个纯虚函数,因此继承BasePrefetchingDataLayer类的子类都需要实现这个函数,用于读取数据并填充Batch数据结构
// Pure virtual (= 0): a concrete subclass has to provide load_batch, which receives the Batch to fill.
virtual void load_batch(Batch<Dtype>* batch) = 0;
- 调用data transformations(mirror, scale, crop……)
// Apply data transformations (mirror, scale, crop...)
timer.Start();
// Point the per-item scratch blobs at this item's slice of the batch buffers,
// so the transformer writes straight into the batch.
const int offset_data = batch->data_.offset(item_id);
const int offset_label = batch->label_.offset(item_id);
this->transformed_data_.set_cpu_data(top_data + offset_data);
this->transformed_label_.set_cpu_data(top_label + offset_label);
if (datum.encoded()) {
// Encoded datum: only the data blob is filled (from cv_img); no label is written
// and cnt is not advanced on this path.
this->cpm_data_transformer_->Transform(cv_img, &(this->transformed_data_)); // calls Transform()
} else {
// Raw datum: data and label blobs are both filled from the datum.
this->cpm_data_transformer_->Transform_nv(datum,
&(this->transformed_data_),
&(this->transformed_label_), cnt); // calls Transform_nv()
++cnt;
}
接下来通过调用Transform和Transform_nv函数来对数据进行处理
- Transform函数
/**
 * Copies an 8-bit OpenCV image into a blob, converting from the interleaved
 * pixel layout of cv::Mat to planar channel x height x width order.
 *
 * Optional steps, all driven by param_: square crop (random offset in TRAIN,
 * centred otherwise), horizontal mirroring, mean subtraction (mean file takes
 * precedence over mean values) and multiplication by scale.
 *
 * @param cv_img            source image; depth must be CV_8U
 * @param transformed_blob  destination; its channel count must equal the
 *                          image's, and its height/width must equal crop_size
 *                          (when cropping) or the image size (when not)
 */
template<typename Dtype>
void CPMDataTransformer<Dtype>::Transform(const cv::Mat& cv_img,
                                          Blob<Dtype>* transformed_blob) {
  // Source image geometry.
  const int img_channels = cv_img.channels();
  const int img_height = cv_img.rows;
  const int img_width = cv_img.cols;

  // Destination blob geometry.
  const int channels = transformed_blob->channels();
  const int height = transformed_blob->height();
  const int width = transformed_blob->width();
  const int num = transformed_blob->num();

  CHECK_EQ(channels, img_channels);
  CHECK_LE(height, img_height);
  CHECK_LE(width, img_width);
  CHECK_GE(num, 1);
  CHECK(cv_img.depth() == CV_8U) << "Image data type must be unsigned byte";

  const int crop_size = param_.crop_size();
  const Dtype scale = param_.scale();
  // Rand(2) is only drawn when mirroring is enabled (short-circuit &&).
  const bool do_mirror = param_.mirror() && Rand(2);
  const bool has_mean_file = param_.has_mean_file();
  const bool has_mean_values = mean_values_.size() > 0;

  CHECK_GT(img_channels, 0);
  CHECK_GE(img_height, crop_size);
  CHECK_GE(img_width, crop_size);

  // The mean image, when used, must have exactly the geometry of the input.
  Dtype* mean = NULL;
  if (has_mean_file) {
    CHECK_EQ(img_channels, data_mean_.channels());
    CHECK_EQ(img_height, data_mean_.height());
    CHECK_EQ(img_width, data_mean_.width());
    mean = data_mean_.mutable_cpu_data();
  }

  if (has_mean_values) {
    CHECK(mean_values_.size() == 1 || mean_values_.size() == img_channels) <<
        "Specify either 1 mean_value or as many as channels: " << img_channels;
    if (img_channels > 1 && mean_values_.size() == 1) {
      // A single mean value is replicated so that every channel has an entry.
      for (int missing = img_channels - 1; missing > 0; --missing) {
        mean_values_.push_back(mean_values_[0]);
      }
    }
  }

  // Choose the crop window; without cropping the sizes must already match.
  int h_off = 0;
  int w_off = 0;
  cv::Mat cv_cropped_img = cv_img;
  if (crop_size == 0) {
    CHECK_EQ(img_height, height);
    CHECK_EQ(img_width, width);
  } else {
    CHECK_EQ(crop_size, height);
    CHECK_EQ(crop_size, width);
    if (phase_ == TRAIN) {
      // Random crop is only used for training.
      h_off = Rand(img_height - crop_size + 1);
      w_off = Rand(img_width - crop_size + 1);
    } else {
      // Centre crop otherwise.
      h_off = (img_height - crop_size) / 2;
      w_off = (img_width - crop_size) / 2;
    }
    cv_cropped_img = cv_img(cv::Rect(w_off, h_off, crop_size, crop_size));
  }

  CHECK(cv_cropped_img.data);

  Dtype* transformed_data = transformed_blob->mutable_cpu_data();
  for (int h = 0; h < height; ++h) {
    const uchar* row = cv_cropped_img.ptr<uchar>(h);
    for (int w = 0; w < width; ++w) {
      // Mirroring reverses the destination column; the source column is unchanged.
      const int dst_w = do_mirror ? (width - 1 - w) : w;
      // Channels of one pixel are stored contiguously in the source row.
      const uchar* px = row + w * img_channels;
      for (int c = 0; c < img_channels; ++c) {
        // Destination is planar: c full planes of height*width values, then
        // h full rows of width values, then the column.
        const int top_index = (c * height + h) * width + dst_w;
        const Dtype pixel = static_cast<Dtype>(px[c]);
        if (has_mean_file) {
          // The mean image is indexed in uncropped image coordinates.
          const int mean_index =
              (c * img_height + h_off + h) * img_width + w_off + w;
          transformed_data[top_index] = (pixel - mean[mean_index]) * scale;
        } else if (has_mean_values) {
          transformed_data[top_index] = (pixel - mean_values_[c]) * scale;
        } else {
          transformed_data[top_index] = pixel * scale;
        }
      }
    }
  }
}
- Transform_nv函数
load_batch函数中:datum是数据的来源,作为Transform_nv函数中的data,应该是制作好的LMDB数据
// Take the next Datum from the reader's "full" queue; the string literal is the
// message argument handed to pop().
Datum& datum = *(reader_.full().pop("Waiting for data"));
// Fill this item's data and label blobs from the datum; cnt is passed along and
// then incremented.
this->cpm_data_transformer_->Transform_nv(datum,
&(this->transformed_data_),
&(this->transformed_label_), cnt);
++cnt;
/**
 * Turns one raw (non-encoded) Datum into a network input image and its label maps.
 *
 * Steps, in order:
 *   1. Rebuild a 3-channel 8-bit image from planes 0..2 of the datum and the
 *      "miss" mask from plane 4 (plus an "all" mask from plane 5 when mode == 6).
 *   2. Optional CLAHE and gray conversion.
 *   3. Parse the metadata stored after the third plane (ReadMetaData) and
 *      optionally run TransformMetaJoints on it.
 *   4. In TRAIN: scale, rotate, crop/pad and flip augmentation, then shrink the
 *      masks by 1/stride. Otherwise: use the image as is and shrink the source
 *      masks to the label grid.
 *   5. Write the image into transformed_data as three planes of (v - 128) / 256.
 *   6. Write mask weights into transformed_label and call generateLabelMap.
 *
 * @param datum              source record; pixel values come from data() when it
 *                           is non-empty, otherwise from float_data()
 * @param transformed_data   output buffer for 3 planes of the output image size
 * @param transformed_label  output buffer for the label maps at
 *                           (rows / stride) x (cols / stride)
 * @param cnt                not used inside this function
 *
 * NOTE(review): the call sites shown in these notes pass Blob<Dtype>* arguments;
 * presumably a wrapper overload extracts the raw pointers - confirm.
 */
template<typename Dtype>
void CPMDataTransformer<Dtype>::Transform_nv(const Datum& datum, Dtype* transformed_data,
                                             Dtype* transformed_label, int cnt) {
  //TODO: some parameter should be set in prototxt
  int clahe_tileSize = param_.clahe_tile_size();
  int clahe_clipLimit = param_.clahe_clip_limit();
  //float targetDist = 41.0/35.0;
  AugmentSelection as = {
    false, //bool flip
    0.0,   //float degree
    Size(), //Size crop
    0,     //float scale
  };
  MetaData meta;

  const string& data = datum.data(); // raw bytes of the input datum
  const int datum_channels = datum.channels();
  const int datum_height = datum.height();
  const int datum_width = datum.width();

  // To do: make this a parameter in caffe.proto
  //const int mode = 5; //related to datum.channels();
  // mode >= 5 enables the "miss" mask (plane 4); mode == 6 additionally reads
  // an "all" mask (plane 5).
  const int mode = 5;

  // The following settings are used by Transform() only, not here:
  //const int crop_size = param_.crop_size();
  //const Dtype scale = param_.scale();
  //const bool do_mirror = param_.mirror() && Rand(2);
  //const bool has_mean_file = param_.has_mean_file();

  // Pixel values come from the byte string when present, else from float_data.
  const bool has_uint8 = data.size() > 0;
  //const bool has_mean_values = mean_values_.size() > 0;
  int crop_x = param_.crop_size_x();
  int crop_y = param_.crop_size_y();

  CHECK_GT(datum_channels, 0);
  //CHECK_GE(datum_height, crop_size);
  //CHECK_GE(datum_width, crop_size);
  CPUTimer timer1;
  timer1.Start();

  //before any transformation, get the image from datum
  Mat img = Mat::zeros(datum_height, datum_width, CV_8UC3);
  Mat mask_all, mask_miss;
  if(mode >= 5){
    mask_miss = Mat::ones(datum_height, datum_width, CV_8UC1);
  }
  if(mode == 6){
    mask_all = Mat::zeros(datum_height, datum_width, CV_8UC1);
  }

  int offset = img.rows * img.cols; // number of values in one datum plane
  int dindex;
  Dtype d_element;
  for (int i = 0; i < img.rows; ++i) {
    for (int j = 0; j < img.cols; ++j) {
      Vec3b& rgb = img.at<Vec3b>(i, j);
      for(int c = 0; c < 3; c++){
        dindex = c*offset + i*img.cols + j; // datum is stored plane by plane (C x H x W)
        if (has_uint8)
          d_element = static_cast<Dtype>(static_cast<uint8_t>(data[dindex]));
        else
          d_element = datum.float_data(dindex);
        rgb[c] = d_element; // channel c of pixel (i, j), stored as uchar
      }

      if(mode >= 5){
        // Plane 4 holds the "miss" mask (plane 3 is read as metadata below).
        dindex = 4*offset + i*img.cols + j;
        if (has_uint8)
          d_element = static_cast<Dtype>(static_cast<uint8_t>(data[dindex]));
        else
          d_element = datum.float_data(dindex);
        // Diagnostic print for mask values whose value / 255 rounds to
        // something other than 0 or 1.
        if (round(d_element/255)!=1 && round(d_element/255)!=0){
          cout << d_element << " " << round(d_element/255) << endl;
        }
        mask_miss.at<uchar>(i, j) = d_element; //round(d_element/255);
      }

      if(mode == 6){
        // Plane 5 holds the "all" mask.
        dindex = 5*offset + i*img.cols + j;
        if (has_uint8)
          d_element = static_cast<Dtype>(static_cast<uint8_t>(data[dindex]));
        else
          d_element = datum.float_data(dindex);
        mask_all.at<uchar>(i, j) = d_element;
      }
    }
  }
  VLOG(2) << " rgb[:] = datum: " << timer1.MicroSeconds()/1000.0 << " ms";
  timer1.Start();

  //color, contrast
  if(param_.do_clahe())
    clahe(img, clahe_tileSize, clahe_clipLimit); // CLAHE helper
  if(param_.gray() == 1){
    // Drop the colour information but keep a 3-channel image.
    cv::cvtColor(img, img, CV_BGR2GRAY);
    cv::cvtColor(img, img, CV_GRAY2BGR);
  }
  VLOG(2) << " color: " << timer1.MicroSeconds()/1000.0 << " ms";
  timer1.Start();

  // Metadata starts right after the three image planes; one row is datum_width values.
  int offset3 = 3 * offset;
  int offset1 = datum_width;
  int stride = param_.stride(); // 8 in the configuration shown in these notes
  ReadMetaData(meta, data, offset3, offset1);
  if(param_.transform_body_joint()) // we expect to transform body joints, and not to transform hand joints
    TransformMetaJoints(meta);
  VLOG(2) << " ReadMeta+MetaJoints: " << timer1.MicroSeconds()/1000.0 << " ms";
  timer1.Start();

  //visualize original
  if(0 && param_.visualize())
    visualize(img, meta, as);

  //Start transforming
  Mat img_aug = Mat::zeros(crop_y, crop_x, CV_8UC3);
  Mat mask_miss_aug, mask_all_aug ;
  //Mat mask_miss_aug = Mat::zeros(crop_y, crop_x, CV_8UC1);
  //Mat mask_all_aug = Mat::zeros(crop_y, crop_x, CV_8UC1);
  Mat img_temp, img_temp2, img_temp3; //size determined by scale
  VLOG(2) << " input size (" << img.cols << ", " << img.rows << ")";

  // We only do random transform as augmentation when training.
  if (phase_ == TRAIN) {
    as.scale = augmentation_scale(img, img_temp, mask_miss, mask_all, meta, mode);
    as.degree = augmentation_rotate(img_temp, img_temp2, mask_miss, mask_all, meta, mode);
    if(0 && param_.visualize())
      visualize(img_temp2, meta, as);
    as.crop = augmentation_croppad(img_temp2, img_temp3, mask_miss, mask_miss_aug, mask_all, mask_all_aug, meta, mode);
    if(0 && param_.visualize())
      visualize(img_temp3, meta, as);
    as.flip = augmentation_flip(img_temp3, img_aug, mask_miss_aug, mask_all_aug, meta, mode);
    if(param_.visualize())
      visualize(img_aug, meta, as);

    // imshow("img_aug", img_aug);
    // Mat label_map = mask_miss_aug;
    // applyColorMap(label_map, label_map, COLORMAP_JET);
    // addWeighted(label_map, 0.5, img_aug, 0.5, 0.0, label_map);
    // imshow("mask_miss_aug", label_map);

    // Bring the masks down to label resolution.
    if (mode > 4){
      resize(mask_miss_aug, mask_miss_aug, Size(), 1.0/stride, 1.0/stride, INTER_CUBIC);
    }
    if (mode > 5){
      resize(mask_all_aug, mask_all_aug, Size(), 1.0/stride, 1.0/stride, INTER_CUBIC);
    }
  }
  else {
    img_aug = img.clone();
    as.scale = 1;
    as.crop = Size();
    as.flip = 0;
    as.degree = 0;
  }
  VLOG(2) << " Aug: " << timer1.MicroSeconds()/1000.0 << " ms";
  timer1.Start();
  //LOG(INFO) << "scale: " << as.scale << "; crop:(" << as.crop.width << "," << as.crop.height
  //          << "); flip:" << as.flip << "; degree: " << as.degree;

  //copy transformed img (img_aug) into transformed_data, do the mean-subtraction here
  offset = img_aug.rows * img_aug.cols;
  int rezX = img_aug.cols;
  int rezY = img_aug.rows;
  int grid_x = rezX / stride;
  int grid_y = rezY / stride;
  int channelOffset = grid_y * grid_x;

  // Fix: outside TRAIN no augmentation runs, so the *_aug masks used to stay
  // empty and the label loop below read them out of bounds. Build them from
  // the source masks at exactly the label-grid size instead.
  if (phase_ != TRAIN && grid_x > 0 && grid_y > 0) {
    if (mode > 4){
      resize(mask_miss, mask_miss_aug, Size(grid_x, grid_y), 0, 0, INTER_CUBIC);
    }
    if (mode > 5){
      resize(mask_all, mask_all_aug, Size(grid_x, grid_y), 0, 0, INTER_CUBIC);
    }
  }

  for (int i = 0; i < img_aug.rows; ++i) {
    for (int j = 0; j < img_aug.cols; ++j) {
      Vec3b& rgb = img_aug.at<Vec3b>(i, j);
      // Planar output; 8-bit values are shifted by 128 and divided by 256.
      transformed_data[0*offset + i*img_aug.cols + j] = (rgb[0] - 128)/256.0;
      transformed_data[1*offset + i*img_aug.cols + j] = (rgb[1] - 128)/256.0;
      transformed_data[2*offset + i*img_aug.cols + j] = (rgb[2] - 128)/256.0;
    }
  }

  // label size is image size/ stride
  if (mode > 4){
    for (int g_y = 0; g_y < grid_y; g_y++){
      for (int g_x = 0; g_x < grid_x; g_x++){
        // Weight of this grid cell taken from the "miss" mask; it does not
        // depend on the channel, so it is computed once per cell.
        const float miss_weight = float(mask_miss_aug.at<uchar>(g_y, g_x)) /255;
        for (int i = 0; i < np; i++){
          // Channels whose isVisible flag equals 3 are left untouched.
          if (meta.joint_self.isVisible[i] != 3){
            transformed_label[i*channelOffset + g_y*grid_x + g_x] = miss_weight;
          }
        }
        // background channel
        if(mode == 5){
          transformed_label[np*channelOffset + g_y*grid_x + g_x] = miss_weight;
        }
        if(mode > 5){
          transformed_label[np*channelOffset + g_y*grid_x + g_x] = 1;
          transformed_label[(2*np+1)*channelOffset + g_y*grid_x + g_x] = float(mask_all_aug.at<uchar>(g_y, g_x)) /255;
        }
      }
    }
  }

  //putGaussianMaps(transformed_data + 3*offset, meta.objpos, 1, img_aug.cols, img_aug.rows, param_.sigma_center());
  //LOG(INFO) << "image transformation done!";
  generateLabelMap(transformed_label, img_aug, meta);

  VLOG(2) << " putGauss+genLabel: " << timer1.MicroSeconds()/1000.0 << " ms";
  //starts to visualize everything (transformed_data in 4 ch, label) fed into conv1
  //if(param_.visualize()){
    //dumpEverything(transformed_data, transformed_label, meta);
  //}
}