Preface
Training a Unet under Keras is very convenient, but porting the model to other platforms and embedding it in C++ code turned out to be a hard problem. I spent several days getting a Caffe version of Unet to train on Windows and running forward inference with VS2015. Since I found virtually no material on this process online, I want to share it here.
Steps
- Build Caffe for Windows together with its Python interface. You can use the caffe code on the windows branch of BVLC and build it with scripts/build_win.cmd. This requires a CMake and Python environment; if you want GPU support, you also need CUDA 8.0 and cuDNN 5.1. The environment setup is routine, so I won't go into it here.
- Prepare the dataset: a set of original images plus the corresponding mask images (use labelme and convert its json files into png images). Note that the Python data layer's img.txt stores only file names and requires an original image and its mask to share the same name including the extension, so the png masks exported by labelme need to be converted to jpg. The finished dataset therefore consists of three things: the original images, the mask images (named to match the originals), and img.txt, which stores one image file name (e.g. 000001.jpg) per line, one line per image. You can write a simple script for all of this yourself; a minimal sketch follows.
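Both the png-to-jpg mask conversion and img.txt can be generated with a few lines of Python. A minimal sketch, with placeholder directory names that you should replace with your own layout:

```python
import os
import cv2

image_dir = "data/image"    # original .jpg images (placeholder path)
mask_dir = "data/mask_png"  # masks exported from labelme as .png (placeholder path)

# Convert every .png mask to .jpg so that its file name matches the
# original image exactly; maximum JPEG quality keeps the 0/1 label
# values as intact as possible.
for name in os.listdir(mask_dir):
    if name.endswith(".png"):
        mask = cv2.imread(os.path.join(mask_dir, name))
        cv2.imwrite(os.path.join(mask_dir, name[:-4] + ".jpg"), mask,
                    [cv2.IMWRITE_JPEG_QUALITY, 100])
        os.remove(os.path.join(mask_dir, name))

# Write img.txt: one image file name per line, one line per image.
with open("data/img.txt", "w") as f:
    for name in sorted(os.listdir(image_dir)):
        f.write(name + "\n")
```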
- Download the network definition for the Caffe version of Unet from https://github.com/warden3344/unet and train with the train_val.prototxt inside. The only thing that needs changing in this prototxt is the number of classes at the end; I have two classes, so num_output=2 in the score layer. The input layer is implemented in Python: the cloned repository contains a mydatalayer.py which defaults to a single class, so for our binary segmentation the code needs a small change, and the three dataset paths should be adjusted to your own. The result looks like this (a small dataset sanity check follows the code):
```python
import caffe
import numpy as np
import cv2
import numpy.random as random


class DataLayer(caffe.Layer):
    def setup(self, bottom, top):
        # The three parts of the dataset; change these paths to your own.
        self.imgdir = "/home/pic/zxy-project/caffe-unet/data/image/"
        self.maskdir = "/home/pic/zxy-project/caffe-unet/data/mask_png/"
        self.imgtxt = "/home/pic/zxy-project/caffe-unet/data/img.txt"
        self.random = True
        self.seed = None
        if len(top) != 2:
            raise Exception("Need to define two tops: data and mask.")
        if len(bottom) != 0:
            raise Exception("Do not define a bottom.")
        self.lines = open(self.imgtxt, 'r').readlines()
        self.idx = 0
        if self.random:
            random.seed(self.seed)
            # np.random.randint excludes the upper bound, so pass len(...)
            # to make every image reachable.
            self.idx = random.randint(0, len(self.lines))

    def reshape(self, bottom, top):
        # Load the image + label pair for the current index.
        self.data = self.load_image(self.idx)
        self.mask = self.load_mask(self.idx)
        # Reshape tops to fit (the leading 1 is the batch dimension).
        top[0].reshape(1, *self.data.shape)
        top[1].reshape(1, *self.mask.shape)

    def forward(self, bottom, top):
        # Assign output.
        top[0].data[...] = self.data
        top[1].data[...] = self.mask
        # Pick the next input.
        if self.random:
            self.idx = random.randint(0, len(self.lines))
        else:
            self.idx += 1
            if self.idx == len(self.lines):
                self.idx = 0

    def backward(self, top, propagate_down, bottom):
        pass

    def load_image(self, idx):
        imname = self.imgdir + self.lines[idx].strip()
        im = cv2.imread(imname)
        im = cv2.resize(im, (512, 512))
        im = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
        # No normalization: the network is fed raw 0-255 grayscale values.
        return im[np.newaxis, :]

    def load_mask(self, idx):
        imname = self.maskdir + self.lines[idx].strip()
        im = cv2.imread(imname, 1)
        seg_labels = np.zeros((2, 512, 512))
        # Nearest-neighbor interpolation so that label values are not blended.
        im = cv2.resize(im, (512, 512), interpolation=cv2.INTER_NEAREST)
        im = im[:, :, 0]
        # One channel per class: a pixel with value c belongs to class c.
        for c in range(2):
            seg_labels[c, :, :] = (im == c).astype(int)
        return seg_labels
```
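Before training, it is worth sanity-checking that the data layer will find what it expects. A minimal sketch, reusing the placeholder paths from mydatalayer.py above:

```python
import os
import cv2
import numpy as np

# Placeholder paths; use the same ones configured in mydatalayer.py.
imgdir = "/home/pic/zxy-project/caffe-unet/data/image/"
maskdir = "/home/pic/zxy-project/caffe-unet/data/mask_png/"

for line in open("/home/pic/zxy-project/caffe-unet/data/img.txt"):
    name = line.strip()
    assert os.path.exists(imgdir + name), "missing image: " + name
    assert os.path.exists(maskdir + name), "missing mask: " + name
    # The data layer maps pixel value c to class c, so for a two-class
    # model the mask should contain essentially only the values 0 and 1.
    mask = cv2.imread(maskdir + name)[:, :, 0]
    print(name, np.unique(mask))
```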
- Configure VS2015 to use Caffe. Caffe is already installed via CMake; now we need to set up VS2015's include and library directories and the linker inputs, after which we can call the Caffe API.
VC++ include directories:
F:\caffe-enet-windows\include;
C:\Users\Administrator\.caffe\dependencies\libraries_v140_x64_py27_1.1.0\libraries\include;
F:\caffe-windows\scripts\build\include;
VC++ library directories:
C:\Users\Administrator\.caffe\dependencies\libraries_v140_x64_py27_1.1.0\libraries\lib;
D:\Anaconda2\libs;
C:\Users\Administrator\.caffe\dependencies\libraries_v140_x64_py27_1.1.0\libraries\x64\vc14\lib;
F:\caffe-windows\scripts\build\lib\Release;
Linker input (additional dependencies):
opencv_core310.lib
opencv_highgui310.lib
opencv_imgproc310.lib
opencv_imgcodecs310.lib
caffe.lib
caffeproto.lib
caffehdf5.lib
caffehdf5_hl.lib
gflags.lib
glog.lib
leveldb.lib
libprotobuf.lib
libopenblas.dll.a
lmdb.lib
boost_python-vc140-mt-1_61.lib
boost_thread-vc140-mt-1_61.lib
If you hit any other errors while configuring VS2015 to use Caffe, you can search Baidu or ask me in the comments.
- Write the forward-inference code for Unet and test segmentation on new test images.
```cpp
#pragma once
// Define CPU_ONLY before including the Caffe headers so that the
// CPU branch is selected in Segmentinit below.
#define CPU_ONLY 1
#include <caffe/caffe.hpp>
#include "head.h"  // forces layer registration (common workaround for Windows Caffe)
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <algorithm>
#include <iomanip>
#include <iosfwd>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include <iostream>
#include <time.h>
#include <tchar.h>

using namespace cv;
using namespace caffe;

// Segmentation wraps a Caffe net for forward inference.
class Segmentation {
public:
    Segmentation(void);
    ~Segmentation(void);
    void Segmentinit(const string& model_file_seg,
        const string& weights_file_seg);
    bool Segment(const cv::Mat& img, cv::Mat& result_img);

private:
    void WrapInputLayer(std::vector<cv::Mat>* input_channels);
    void Preprocess(const cv::Mat& img,
        std::vector<cv::Mat>* input_channels);

private:
    shared_ptr<Net<float> > net_seg;  // the loaded network
    cv::Size input_geometry_seg;      // input size expected by the net
    int num_channels_seg;             // input channels (1 or 3)
    cv::Mat mean_seg;                 // mean image (unused here)
};

Segmentation::Segmentation(void) {
}

Segmentation::~Segmentation(void) {
}
void Segmentation::Segmentinit(const string& model_file_seg,
    const string& weights_file_seg) {
#ifdef CPU_ONLY
    Caffe::set_mode(Caffe::CPU);
#else
    Caffe::set_mode(Caffe::GPU);
#endif
    /* Load the network. */
    net_seg.reset(new Net<float>(model_file_seg, TEST));
    net_seg->CopyTrainedLayersFrom(weights_file_seg);

    Blob<float>* input_layer = net_seg->input_blobs()[0];
    num_channels_seg = input_layer->channels();
    CHECK(num_channels_seg == 3 || num_channels_seg == 1)
        << "Input layer should have 1 or 3 channels.";
    input_geometry_seg = cv::Size(input_layer->width(), input_layer->height());
}
bool Segmentation::Segment(const cv::Mat& img, cv::Mat& result_img) {
    Blob<float>* input_layer = net_seg->input_blobs()[0];
    input_layer->Reshape(1, num_channels_seg,
        input_geometry_seg.height, input_geometry_seg.width);
    /* Forward dimension change to all layers. */
    net_seg->Reshape();
    std::vector<cv::Mat> input_channels;
    WrapInputLayer(&input_channels);
    Preprocess(img, &input_channels);
    net_seg->Forward();

    /* Read the raw data back out of the output blob. */
    Blob<float>* result_blob = net_seg->output_blobs()[0];
    const float* result = result_blob->cpu_data();
    const int h = result_blob->height();
    const int w = result_blob->width();
    const int channels = result_blob->channels();
    const int num = result_blob->num();
    std::cout << "output_blob(n,c,h,w) = " << num << ", " << channels << ", "
        << h << ", " << w << std::endl;
    // Argmax over the class channels: reshape the output so that each row
    // holds the per-class scores of one pixel, then pick the max per row.
    // (An earlier variant instead wrote the raw class probability into a
    // grayscale map; the argmax version below is what I use now.)
    cv::Mat class_each_row(channels, w * h, CV_32FC1, const_cast<float*>(result));
    class_each_row = class_each_row.t();  // transpose: one row per pixel, one column per class
    cv::Point maxId;   // maxId.x is the column index of the max, i.e. the class id
    double maxValue;   // the max score itself
    cv::Mat prediction_map(h, w, CV_8UC1);
    printf("rows: %d\n", class_each_row.rows);
    printf("cols: %d\n", class_each_row.cols);
    for (int i = 0; i < class_each_row.rows; i++) {
        minMaxLoc(class_each_row.row(i), 0, &maxValue, 0, &maxId);
        prediction_map.at<uchar>(i) = maxId.x;
    }
    // Colorize the label map via a lookup-table image (cv::LUT expects a
    // 256-entry table, one color per class id).
    cv::cvtColor(prediction_map.clone(), prediction_map, cv::COLOR_GRAY2BGR);
    cv::Mat label_colours = cv::imread("F:\\caffe-enet-windows\\cityscapes19.png", 1);
    cv::cvtColor(label_colours, label_colours, cv::COLOR_RGB2BGR);
    cv::Mat output_image;
    LUT(prediction_map, label_colours, output_image);
    result_img = output_image;
    return true;
}
/* Wrap the input layer of the network in separate cv::Mat objects
 * (one per channel). This way we save one memcpy operation and we
 * don't need to rely on cudaMemcpy2D. The last preprocessing
 * operation will write the separate channels directly to the input
 * layer. */
void Segmentation::WrapInputLayer(std::vector<cv::Mat>* input_channels) {
    Blob<float>* input_layer = net_seg->input_blobs()[0];
    int width = input_layer->width();
    int height = input_layer->height();
    float* input_data = input_layer->mutable_cpu_data();
    for (int i = 0; i < input_layer->channels(); ++i) {
        cv::Mat channel(height, width, CV_32FC1, input_data);
        input_channels->push_back(channel);
        input_data += width * height;
    }
}
void Segmentation::Preprocess(const cv::Mat& img,
    std::vector<cv::Mat>* input_channels) {
    /* Convert the input image to the input image format of the network. */
    cv::Mat sample;
    if (img.channels() == 3 && num_channels_seg == 1)
        cv::cvtColor(img, sample, cv::COLOR_BGR2GRAY);
    else if (img.channels() == 4 && num_channels_seg == 1)
        cv::cvtColor(img, sample, cv::COLOR_BGRA2GRAY);
    else if (img.channels() == 4 && num_channels_seg == 3)
        cv::cvtColor(img, sample, cv::COLOR_BGRA2BGR);
    else if (img.channels() == 1 && num_channels_seg == 3)
        cv::cvtColor(img, sample, cv::COLOR_GRAY2BGR);
    else
        sample = img;

    cv::Mat sample_resized;
    if (sample.size() != input_geometry_seg)
        cv::resize(sample, sample_resized, input_geometry_seg);
    else
        sample_resized = sample;

    cv::Mat sample_float;
    if (num_channels_seg == 3)
        sample_resized.convertTo(sample_float, CV_32FC3);
    else
        sample_resized.convertTo(sample_float, CV_32FC1);

    /* No mean subtraction: the data layer fed raw pixel values during training.
     * This split writes the separate channel planes directly to the input
     * layer of the network, because they are wrapped by the cv::Mat
     * objects in input_channels. */
    cv::split(sample_float, *input_channels);
    CHECK(reinterpret_cast<float*>(input_channels->at(0).data)
        == net_seg->input_blobs()[0]->cpu_data())
        << "Input channels are not wrapping the input layer of the network.";
}
// Initialize the segmentation model.
int init_segment_model(Segmentation& segmentation, const string& model_path, const string& weights_path)
{
    const string model_file_seg = model_path;
    const string weights_file_seg = weights_path;
    // Initialize the unet network.
    segmentation.Segmentinit(model_file_seg, weights_file_seg);
    return 1;
}
// Fixed-point grayscale conversion: approximates
// gray = 0.299*R + 0.587*G + 0.114*B with shifts scaled by 2^20,
// assuming the pixel channels are in R, G, B order. Kept as an
// alternative to cv::cvtColor (it is not called in main below).
Mat speed_rgb2gray(Mat src) {
    Mat dst(src.rows, src.cols, CV_8UC1);
    for (int i = 0; i < src.rows; i++) {
        for (int j = 0; j < src.cols; j++) {
            dst.at<uchar>(i, j) = ((src.at<Vec3b>(i, j)[0] << 18) + (src.at<Vec3b>(i, j)[0] << 15) + (src.at<Vec3b>(i, j)[0] << 14) +
                (src.at<Vec3b>(i, j)[0] << 11) + (src.at<Vec3b>(i, j)[0] << 7) + (src.at<Vec3b>(i, j)[0] << 5) +
                (src.at<Vec3b>(i, j)[0] << 4) + (src.at<Vec3b>(i, j)[0] << 2) +
                (src.at<Vec3b>(i, j)[1] << 19) + (src.at<Vec3b>(i, j)[1] << 16) + (src.at<Vec3b>(i, j)[1] << 14) + (src.at<Vec3b>(i, j)[1] << 13) +
                (src.at<Vec3b>(i, j)[1] << 10) + (src.at<Vec3b>(i, j)[1] << 8) + (src.at<Vec3b>(i, j)[1] << 4) + (src.at<Vec3b>(i, j)[1] << 3) + (src.at<Vec3b>(i, j)[1] << 1) +
                (src.at<Vec3b>(i, j)[2] << 16) + (src.at<Vec3b>(i, j)[2] << 15) + (src.at<Vec3b>(i, j)[2] << 14) + (src.at<Vec3b>(i, j)[2] << 12) +
                (src.at<Vec3b>(i, j)[2] << 9) + (src.at<Vec3b>(i, j)[2] << 7) + (src.at<Vec3b>(i, j)[2] << 6) + (src.at<Vec3b>(i, j)[2] << 5) + (src.at<Vec3b>(i, j)[2] << 4) + (src.at<Vec3b>(i, j)[2] << 1) >> 20);
        }
    }
    return dst;
}
int main(int argc, char** argv)
{
    const string model_file_seg = "F:\\caffe-unet\\train_val_test.prototxt";
    const string weights_file_seg = "F:\\caffe-unet\\zxy_iter_2000.caffemodel";
    Segmentation segmentation;
    int segment_init = init_segment_model(segmentation, model_file_seg, weights_file_seg);

    // Load a test image.
    std::string strOrgImg_Path = "F:\\make_data\\caffe-enet-data2\\image\\000001.jpg";
    cv::Mat img_seg = cv::imread(strOrgImg_Path);
    if (!img_seg.data)
    {
        return -1;
    }
    // The network was trained on grayscale input, so convert before inference.
    cvtColor(img_seg, img_seg, cv::COLOR_BGR2GRAY);
    cv::Mat seg_result;
    bool segment_flag = segmentation.Segment(img_seg, seg_result);
    if (segment_flag)
    {
        cv::namedWindow("unet_result");
        cv::imshow("unet_result", seg_result);
        cv::moveWindow("unet_result", 700, 0);
        cv::waitKey(0);
    }
    return 0;
}
```
My code uses cityscapes19.png to map class ids to display colors; for your own predictions you can change the colors to whatever you like (a sketch for generating such a lookup table follows the images). Here are the three images:
Original image:
Prediction result:
cityscapes19.png:
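If you would rather not depend on cityscapes19.png, a lookup table of your own is easy to generate: cv::LUT expects 256 entries, so the table is just a 256-pixel image with one color per class id. A minimal sketch (file name and colors are placeholders; note that the C++ code above additionally swaps channels with cv::COLOR_RGB2BGR, so pick your colors accordingly):

```python
import numpy as np
import cv2

# A 1x256 lookup-table image: index = class id, pixel = display color.
# Only indices 0 and 1 matter for a two-class model; the rest stay black.
lut = np.zeros((1, 256, 3), dtype=np.uint8)
lut[0, 1] = (0, 255, 0)  # color for class 1 (foreground)
cv2.imwrite("my_label_colours.png", lut)
```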
Here is another prototxt with the crop layers removed, which is slightly faster:
https://github.com/jolibrain/deepdetect/tree/master/templates/caffe/unet