caffe2TensorRT and TensorRT_plugin_layer
TensorRT 可以直接解析 Caffe 模型;对于不支持的操作(op),可以通过插件接口手动添加。
本文提供了一个广播操作(BroadCast_op)的示例,同时测试了 Pooling_layer,并添加了一个 test_layer,用于单独打印网络结构中的参数、协助 Debug。
Code地址:
https://github.com/junqiangwu/Mxnet2Caffe-Tensor-RT-SEnet
-
1.继承IPluginExt接口以创建自定义图层类
-
2.创建一个PluginFactory函数,用于向网络添加自定义图层类。
提供的接口类内部函数,需要根据自定义op修改
// Number of output tensors the layer produces (both sample layers in this post return 1).
virtual int getNbOutputs() const = 0;
// Shape of output `index`, derived from the shapes of the layer's inputs.
virtual Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) = 0;
// Receives the input/output dimensions and the maximum batch size.
// NOTE(review): the sample layers in this post override configureWithFormat() instead — confirm which one your TensorRT version calls.
virtual void configure(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, int maxBatchSize) = 0;
// Set-up hook; the sample layers simply return 0 here.
virtual int initialize() = 0;
// Tear-down hook; the sample layers leave it empty.
virtual void terminate() = 0;
// Scratch space requested for a given maximum batch size (the sample layers return 0).
virtual size_t getWorkspaceSize(int maxBatchSize) const = 0;
// The forward pass of the op: reads `inputs`, writes `outputs`; the sample layers return 0 when done.
virtual int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) = 0;
// Exact number of bytes that serialize() writes.
virtual size_t getSerializationSize() = 0;
// Writes the layer's state into `buffer` so that it can be restored by the deserializing constructor.
virtual void serialize(void* buffer) = 0;
Broadcast_layer 实现
// 继承 IPluginExt 类
class Broadcast: public IPluginExt{
private:
int c,h,w;
string layer_name;
public:
Broadcast(const char * name):layer_name(name){
printf("init_Layer %s\n",name);
}
~Broadcast(){ }
// 构造函数,定义一些必须的参数
Broadcast(const char* name,const void* data, size_t length):layer_name(name)
{
const char* d = static_cast<const char*>(data), *a = d;
read(d, c);
read(d, h);
read(d, w);
assert(d == a + length);
}
int getNbOutputs() const override{
return 1;
}
// 指定该op操作的输出tensor的shape
Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override
{
assert(index == 0 && nbInputDims == 2 && inputs[0].nbDims == 3);
//printf("%s getOutputDimensions: InputDims = %d %d %d\n",layer_name.c_str(),nbInputDims,inputs[0].nbDims,inputs[1].nbDims);
return DimsCHW(inputs[0].d[0], inputs[0].d[1], inputs[0].d[2]);
}
bool supportsFormat(DataType type, PluginFormat format) const override { return (type == DataType::kFLOAT || type == DataType::kHALF) && format == PluginFormat::kNCHW; }
virtual size_t getWorkspaceSize(int maxBatchSize) const override
{
return 0;
}
void configureWithFormat(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, DataType type, PluginFormat format, int maxBatchSize) override
{
assert((type == DataType::kFLOAT || type == DataType::kHALF) && format == PluginFormat::kNCHW);
c = inputDims[0].d[0];
h = inputDims[0].d[1];
w = inputDims[0].d[2];
}
int initialize() override
{
return 0;
}
// 你需要重点关注的实现,相当于forward操作,在这里实现op的内部操作
virtual int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override
{
printf("enqueue %s c %d h %d w %d \n",layer_name.c_str(),c,h,w);
float *pbottom = (float*)malloc(sizeof(float)*c*h*w);
//cudaMemcpyDeviceToHost 从gpu中copy一份数据,因为后面的实现是在cpu上进行的,你也可以使用cuda实现,就不需要搬运数据了
cudaMemcpy((void*)pbottom, inputs[0], sizeof(float) * c* h *w, cudaMemcpyDeviceToHost);
float *pbottom2 = (float*)malloc(sizeof(float)*c*1*1);
cudaMemcpy((void*)pbottom2, inputs[1], sizeof(float) * c* 1 *1, cudaMemcpyDeviceToHost);
for(int i=0;i<c;i+=1){
for(int j=0;j<w*h;j+=1){
int index = i*h*w;
pbottom[index+j] = pbottom[index+j] *pbottom2[i];
}
}
// 将计算结果再搬回gpu 用于后续layer操作
cudaMemcpy(outputs[0], (const void*)pbottom, sizeof(float) * c * h * w,cudaMemcpyHostToDevice);
#if P_log
if(layer_name == "broadcast_mul1")
{
ofstream fp("broadcast_mul1_out.txt");
for(int i = 0; i < c*h*w; i++)
{
fp << pbottom[i] << endl;
}
}
#endif
free(pbottom);
free(pbottom2);
return 0;
}
virtual void terminate() override {}
virtual size_t getSerializationSize() override
{
return 3*sizeof(int);
}
virtual void serialize(void* buffer) override
{
char* d = static_cast<char*>(buffer), *a = d;
write(d, c);
write(d, h);
write(d, w);
assert(d == a + getSerializationSize());
}
};
PoolingLayer 实现
- kernel_size = (3,3)
- padding = 1
- stride = 2
class PoolingLayer: public IPluginExt{
private:
int _c,_h,_w;
int k=3;
int p=1;
int s=2;
int out_w,out_h;
string layer_name;
public:
PoolingLayer(const char * name):layer_name(name){
printf("Testlayer %s\n",name);
}
~PoolingLayer(){ }
PoolingLayer(const char* name,const void* data, size_t length):layer_name(name)
{
printf("test_deserialize %s\n",name);
const char* d = static_cast<const char*>(data), *a = d;
read(d, _c);
read(d, _h);
read(d, _w);
read(d,out_w);
read(d,out_h);
assert(d == a + length);
}
int getNbOutputs() const override{
return 1;
}
Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override
{
assert(index == 0 && nbInputDims == 1 && inputs[0].nbDims == 3);
printf("name: %s inputs[0] %d %d %d \n",layer_name.c_str(),inputs[0].d[0], inputs[0].d[1], inputs[0].d[2]);
out_w = out_h = floor( (inputs[0].d[1]+2*p-k)/s + 1 );
printf("name: %s --> output %d %d %d \n",layer_name.c_str(),inputs[0].d[0],out_w, out_h);
return DimsCHW(inputs[0].d[0],out_w, out_h);
}
bool supportsFormat(DataType type, PluginFormat format) const override { return (type == DataType::kFLOAT || type == DataType::kHALF) && format == PluginFormat::kNCHW; }
virtual size_t getWorkspaceSize(int maxBatchSize) const override
{
return 0;
}
void configureWithFormat(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, DataType type, PluginFormat format, int maxBatchSize) override
{
assert((type == DataType::kFLOAT || type == DataType::kHALF) && format == PluginFormat::kNCHW);
_c = inputDims[0].d[0];
_h = inputDims[0].d[1];
_w = inputDims[0].d[2];
}
int initialize() override
{
return 0;
}
virtual int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override
{
printf("enqueue %s c %d h %d w %d \n",layer_name.c_str(),_c,_h,_w);
float *pbottom = (float*)malloc(sizeof(float)*_c*_h*_w);
cudaMemcpy((void*)pbottom, inputs[0], sizeof(float) * _c* _h * _w, cudaMemcpyDeviceToHost);
float *top = (float*)malloc(sizeof(float)*_c*out_w*out_h);
memset(top,-1*FLT_MAX,sizeof(float)*_c*out_w*out_h);
//int maxk = k*k;
// int p1=0,p2=0;
// int gap = w-k;
// std::vector<int> _space(maxk);
// for(int i=0;i<k;i++){
// for(int j=0;j<k;j++){
// _space[p1] = p2;
// p1++;
// p2++;
// }
// p2+= gap;
// }
for(int n=0;n<_c;n++){
int out_index = n*out_w*out_h;
float* int_buffer = pbottom + n*_w*_h;
for(int ph=0;ph<out_h;ph++){
for(int pw=0;pw<out_w;pw++){
int hstart = ph*s -p;
int wstart = pw*s -p;
int hend = std::min(hstart+k,_h);
int wend = std::min(wstart+k,_w);
hstart =std::max(hstart,0);
wstart =std::max(wstart,0);
const int pool_index = out_index + ph * out_w + pw;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
const int index = h * _w + w;
if (int_buffer[index] > top[pool_index]) {
top[pool_index] = int_buffer[index];
}
}
}
}
}
}
#if P_log
// if(!strcmp(layer_name.c_str(),"reshape0"))
// {
ofstream fp;
fp.open(("pooling0.txt"),ios::out);
for(int i = 0; i < _c*out_h*out_w; i++)
{
fp << setiosflags(ios::fixed) <<setprecision(12) << top[i] << endl;
}
fp.close();
// }
#endif
cudaMemcpy(outputs[0], (const void*)top, sizeof(float) * _c * out_h * out_w,cudaMemcpyHostToDevice);
free(pbottom);
free(top);
return 0;
}
virtual void terminate() override {}
virtual size_t getSerializationSize() override
{
return 5*sizeof(int);
}
virtual void serialize(void* buffer) override
{
char* d = static_cast<char*>(buffer), *a = d;
write(d, _c);
write(d, _h);
write(d, _w);
write(d, out_w);
write(d, out_h);
assert(d == a + getSerializationSize());
}
};