caffe2TensorRT and TensorRT_plugin_layer

TensorRT 可以直接解析 Caffe 模型。对于解析器不支持的操作(op),可以通过插件(plugin)接口手动添加。

本文提供了一个广播操作(Broadcast op)的示例;此外还实现并测试了 Pooling_layer,并添加了一个 test_layer,用于单独打印网络结构中的参数,协助调试(Debug)。

Code地址:
https://github.com/junqiangwu/Mxnet2Caffe-Tensor-RT-SEnet

  • 1. 继承 IPluginExt 接口,创建自定义层(layer)类。

  • 2. 创建一个 PluginFactory,用于向网络中添加自定义层。

接口类提供了以下成员函数,需要根据自定义 op 的行为逐一实现:

  // Number of output tensors the layer produces (both example layers in this
  // post return 1).
  virtual int getNbOutputs() const = 0;

  // Shape of output number `index`, computed from the shapes of the
  // `nbInputDims` input tensors passed in `inputs`.
  virtual Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) = 0;

  // Receives the final input/output shapes and the maximum batch size so the
  // layer can cache whatever it needs. NOTE: the example layers in this post
  // override configureWithFormat(), which additionally receives the data type
  // and tensor format, rather than this overload.
  virtual void configure(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, int maxBatchSize) = 0;

  // One-time setup hook. The example layers simply return 0; presumably 0
  // means success -- confirm against the TensorRT documentation.
  virtual int initialize() = 0;

  // Counterpart of initialize(); the example layers leave it empty.
  virtual void terminate() = 0;

  // Size of the scratch buffer handed to enqueue() as `workspace` (the example
  // layers return 0, i.e. they request none). Presumably in bytes -- confirm.
  virtual size_t getWorkspaceSize(int maxBatchSize) const = 0;

  // The forward pass: reads `inputs`, writes `outputs`. The example layers
  // treat both as device pointers (they copy through host memory with
  // cudaMemcpy) and return 0 when done.
  virtual int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) = 0;

  // Number of bytes serialize() will write.
  virtual size_t getSerializationSize() = 0;

  // Writes the layer's state into `buffer`; the example layers read it back in
  // their (name, data, length) constructors.
  virtual void serialize(void* buffer) = 0;

Broadcast_layer 实现

// 继承 IPluginExt 类
class Broadcast: public IPluginExt{
private:
    int c,h,w;
    string layer_name;
public:
    Broadcast(const char * name):layer_name(name){ 
        printf("init_Layer %s\n",name);
     }
    ~Broadcast(){ }
    // 构造函数,定义一些必须的参数
    Broadcast(const char* name,const void* data, size_t length):layer_name(name)
    {
        const char* d = static_cast<const char*>(data), *a = d;
        read(d, c);
        read(d, h);
        read(d, w);
        assert(d == a + length);
    }

    int getNbOutputs() const override{
        return 1;
    }
   // 指定该op操作的输出tensor的shape
    Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override
    {
        assert(index == 0 && nbInputDims == 2 && inputs[0].nbDims == 3);

        //printf("%s getOutputDimensions: InputDims = %d %d %d\n",layer_name.c_str(),nbInputDims,inputs[0].nbDims,inputs[1].nbDims);

        return DimsCHW(inputs[0].d[0], inputs[0].d[1], inputs[0].d[2]);
    }

    bool supportsFormat(DataType type, PluginFormat format) const override { return (type == DataType::kFLOAT || type == DataType::kHALF) && format == PluginFormat::kNCHW; }

   virtual size_t getWorkspaceSize(int maxBatchSize) const override
    {
        return 0;
    }

    void configureWithFormat(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, DataType type, PluginFormat format, int maxBatchSize) override
    {
        assert((type == DataType::kFLOAT || type == DataType::kHALF) && format == PluginFormat::kNCHW);
        c = inputDims[0].d[0];
	    h = inputDims[0].d[1];
	    w = inputDims[0].d[2];
        
    }

    int initialize() override
    {
        return 0;
    }
    
	// 你需要重点关注的实现,相当于forward操作,在这里实现op的内部操作
    virtual int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override
    {
    	printf("enqueue %s c %d h %d w %d \n",layer_name.c_str(),c,h,w);

        float *pbottom = (float*)malloc(sizeof(float)*c*h*w);
	    //cudaMemcpyDeviceToHost 从gpu中copy一份数据,因为后面的实现是在cpu上进行的,你也可以使用cuda实现,就不需要搬运数据了 
	    cudaMemcpy((void*)pbottom, inputs[0], sizeof(float) * c* h *w, cudaMemcpyDeviceToHost);

        float *pbottom2 = (float*)malloc(sizeof(float)*c*1*1);
        cudaMemcpy((void*)pbottom2, inputs[1], sizeof(float) * c* 1 *1, cudaMemcpyDeviceToHost);

        for(int i=0;i<c;i+=1){
            for(int j=0;j<w*h;j+=1){
                int index = i*h*w;
                pbottom[index+j] = pbottom[index+j] *pbottom2[i];            
            }
        }
        // 将计算结果再搬回gpu 用于后续layer操作
        cudaMemcpy(outputs[0], (const void*)pbottom, sizeof(float) * c * h * w,cudaMemcpyHostToDevice);

#if P_log
        if(layer_name == "broadcast_mul1")
                {
                    ofstream fp("broadcast_mul1_out.txt");
                    for(int i = 0; i < c*h*w; i++)
                    {
                        fp << pbottom[i] << endl;
                    }
                }
#endif

        free(pbottom);
        free(pbottom2);
        return 0;
    }
    virtual void terminate() override {}

   
   virtual size_t getSerializationSize() override
    {
        return 3*sizeof(int);
    }

    virtual void serialize(void* buffer) override
    {
        char* d = static_cast<char*>(buffer), *a = d;

        write(d, c);
        write(d, h);
        write(d, w);
        assert(d == a + getSerializationSize());
    }

};

PoolingLayer 实现

  1. kernel_size = (3, 3)
  2. padding = 1
  3. stride = 2
class PoolingLayer: public IPluginExt{
private:
    int _c,_h,_w;
    int k=3;
    int p=1;
    int s=2;
    int out_w,out_h;

    string layer_name;
public:
    PoolingLayer(const char * name):layer_name(name){ 
        printf("Testlayer %s\n",name);
     }
    ~PoolingLayer(){ }
    
    PoolingLayer(const char* name,const void* data, size_t length):layer_name(name)
    {
    	printf("test_deserialize %s\n",name);
        const char* d = static_cast<const char*>(data), *a = d;

        read(d, _c);
        read(d, _h);
        read(d, _w);
        read(d,out_w);
        read(d,out_h);
        assert(d == a + length);
    }

    int getNbOutputs() const override{
        return 1;
    }

    Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override
    {
        assert(index == 0 && nbInputDims == 1 && inputs[0].nbDims == 3);
        printf("name: %s inputs[0] %d %d %d \n",layer_name.c_str(),inputs[0].d[0], inputs[0].d[1], inputs[0].d[2]);
        
        out_w = out_h = floor( (inputs[0].d[1]+2*p-k)/s + 1 );
        
        printf("name: %s --> output %d %d %d \n",layer_name.c_str(),inputs[0].d[0],out_w, out_h);

        return DimsCHW(inputs[0].d[0],out_w, out_h);
    }

    bool supportsFormat(DataType type, PluginFormat format) const override { return (type == DataType::kFLOAT || type == DataType::kHALF) && format == PluginFormat::kNCHW; }

   virtual size_t getWorkspaceSize(int maxBatchSize) const override
    {
        return 0;
    }

    void configureWithFormat(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, DataType type, PluginFormat format, int maxBatchSize) override
    {
        assert((type == DataType::kFLOAT || type == DataType::kHALF) && format == PluginFormat::kNCHW);
        _c = inputDims[0].d[0];
	    _h = inputDims[0].d[1];
	    _w = inputDims[0].d[2];
        
    }

    int initialize() override
    {
        return 0;
    }


    virtual int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override
    {
    	printf("enqueue %s c %d h %d w %d \n",layer_name.c_str(),_c,_h,_w);

        float *pbottom = (float*)malloc(sizeof(float)*_c*_h*_w);
	    cudaMemcpy((void*)pbottom, inputs[0], sizeof(float) * _c* _h * _w, cudaMemcpyDeviceToHost);


        float *top = (float*)malloc(sizeof(float)*_c*out_w*out_h);
	    memset(top,-1*FLT_MAX,sizeof(float)*_c*out_w*out_h);

        //int maxk = k*k;
        // int p1=0,p2=0;
        // int gap = w-k;

        // std::vector<int> _space(maxk);
        // for(int i=0;i<k;i++){
        // for(int j=0;j<k;j++){
        // _space[p1] = p2;
        // p1++;
        // p2++;
        // }
        // p2+= gap;
        // }

        for(int n=0;n<_c;n++){
            int out_index = n*out_w*out_h;
            
            float* int_buffer = pbottom + n*_w*_h;

            for(int ph=0;ph<out_h;ph++){
                for(int pw=0;pw<out_w;pw++){
                    
                    int hstart = ph*s -p;
                    int wstart = pw*s -p;
                    int hend = std::min(hstart+k,_h);
                    int wend = std::min(wstart+k,_w);
                    hstart =std::max(hstart,0);
                    wstart =std::max(wstart,0);

                    const int pool_index = out_index + ph * out_w + pw;
                    
                    for (int h = hstart; h < hend; ++h) {
                        for (int w = wstart; w < wend; ++w) {
                            const int index = h * _w + w;
                             
                            if (int_buffer[index] > top[pool_index]) {
                                 top[pool_index] = int_buffer[index];
                             }
                        }
                    }

                }
            }
        }

#if P_log
   
    // if(!strcmp(layer_name.c_str(),"reshape0"))
    // {
        ofstream fp;
        fp.open(("pooling0.txt"),ios::out);
 
		for(int i = 0; i < _c*out_h*out_w; i++)
		{
			fp << setiosflags(ios::fixed) <<setprecision(12) << top[i] << endl;
		}
        fp.close(); 

   // }
		
#endif

        cudaMemcpy(outputs[0], (const void*)top, sizeof(float) * _c * out_h * out_w,cudaMemcpyHostToDevice);
        
        free(pbottom);
        free(top);


        return 0;

    }

    virtual void terminate() override {}

    virtual size_t getSerializationSize() override
    {
        return 5*sizeof(int);
    }

    virtual void serialize(void* buffer) override
    {
        char* d = static_cast<char*>(buffer), *a = d;

        write(d, _c);
        write(d, _h);
        write(d, _w);
        write(d, out_w);
        write(d, out_h);
        assert(d == a + getSerializationSize());
    }

};