
Record: Understanding CUDA GPU Parallel Programming and Implementing CenterNet Batch Post-Processing


CUDA GPU Parallel Programming

Preface

Basic units to understand: thread, block, grid, plus blockDim and gridDim.
A grid is composed of multiple blocks, and a block is composed of multiple threads.
blockDim: the number of threads per block in each dimension, as set at kernel launch (the hardware imposes an upper limit); it is of type dim3.
gridDim: the number of blocks per grid in each dimension, likewise limited by the hardware; it is of type dim3.
threadIdx: the index of a thread within its block; it is of type uint3.
blockIdx: the index of a block within its grid; it is of type uint3.
gridIdx: there is no such thing, and referencing it causes an error; in other words, a kernel launch uses at most one grid (tested on a 1080 Ti).

The dim3 data type is a CUDA-defined structure, essentially a three-component vector of unsigned integers. It is declared and used as follows:

dim3 BlockSize(16, 16, 1);
// BlockSize.x = 16; BlockSize.y = 16; BlockSize.z = 1;
// It can also be written as dim3 BlockSize(16, 16); the last dimension defaults to 1.

The following sections show how to write one-dimensional and two-dimensional CUDA kernels and how to compute thread indices inside them.

Computing Indices in a One-Dimensional CUDA Kernel

threadIdx.x : the thread index along x within the current block;
blockIdx.x : the block index along x within the current grid;
blockDim.x : the number of threads along x in a block;
gridDim.x : the number of blocks along x in a grid;

So, to get the global index of the 4th thread (index 3) inside the 3rd block (index 2), compute index = blockIdx.x * blockDim.x + threadIdx.x = 2 * blockDim.x + 3. A minimal kernel sketch follows.
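A minimal sketch (my own vector-addition example, not code from this project) showing the standard 1-D index computation and bounds check:

__global__ void addKernel(const float *a, const float *b, float *c, int n)
{
    // global 1-D index: which element of the arrays this thread handles
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n)                     // guard against threads past the array end
        c[idx] = a[idx] + b[idx];
}

// Launch enough 256-thread blocks to cover n elements:
// addKernel<<<(n + 255) / 256, 256>>>(d_a, d_b, d_c, n);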

Computing Indices in a Two-Dimensional CUDA Kernel

threadIdx.y : the thread index along y within the current block;
blockIdx.y : the block index along y within the current grid;
blockDim.y : the number of threads along y in a block;
gridDim.y : the number of blocks along y in a grid;

With two dimensions, the per-axis indices are x = blockIdx.x * blockDim.x + threadIdx.x and y = blockIdx.y * blockDim.y + threadIdx.y. When only the grid is two-dimensional while the block stays one-dimensional (the pattern used by the CenterNet kernels below), the flattened index is (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x. A sketch follows.
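A minimal 2-D sketch (my own example, not from this project) addressing a row-major w x h image:

__global__ void scaleImage(float *img, int w, int h, float factor)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;  // column
    int y = blockIdx.y * blockDim.y + threadIdx.y;  // row
    if (x < w && y < h)                             // skip threads outside the image
        img[y * w + x] *= factor;
}

The launch configuration for this kernel is shown in the next section.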

Setting the Thread Dimensions

When calling a custom kernel from a .cu file, the launch is specified with the triple angle bracket syntax <<<paramA, paramB, ...>>>, where paramA and paramB are mandatory:
paramA: sets gridDim, i.e. the number of blocks in each dimension of the grid, defined with dim3;
paramB: sets blockDim, i.e. the number of threads in each dimension of each block, defined with dim3.
A launch sketch is given below.
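A minimal launch sketch, continuing the hypothetical scaleImage kernel above (not code from this project):

dim3 block(16, 16);                               // 256 threads per block
dim3 grid((w + block.x - 1) / block.x,            // enough blocks to cover
          (h + block.y - 1) / block.y);           // the whole w x h image
scaleImage<<<grid, block>>>(d_img, w, h, 2.0f);   // paramA = grid, paramB = block
cudaDeviceSynchronize();                          // wait for the kernel to finish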

CenterNet CUDA Post-Processing

The starting point is this excellent codebase (Gitee: https://gitee.com/zjkclpch/TensorRT-CenterNet?_from=gitee_search; the GitHub mirror is omitted here because it was temporarily inaccessible at the time of writing). I modified the author's single-image post-processing into batch post-processing.
ctdetNet.cpp
Replace the function void ctdetNet::InitEngine() with:

void ctdetNet::InitEngine() {
        const int maxBatchSize = 1; // change this to the batch size you actually use
        mContext = mEngine->createExecutionContext();
        assert(mContext != nullptr);
        mContext->setProfiler(&mProfiler);
        int nbBindings = mEngine->getNbBindings();//nbBindings=4
        if (nbBindings > 4) forwardFace= true;
        mCudaBuffers.resize(nbBindings); // 0: input, 1-3: outputs
        mBindBufferSizes.resize(nbBindings);
//        cudaOutputBuffer.resize(maxBatchSize);

        int64_t totalSize = 0;
        for (int i = 0; i < nbBindings; ++i)
        {
            nvinfer1::Dims dims = mEngine->getBindingDimensions(i); // if n>1 was set when exporting pth->onnx, dims.d[0] == n here
            nvinfer1::DataType dtype = mEngine->getBindingDataType(i);
            dims_type.push_back({dims,dtype});
            totalSize = volume(dims) * maxBatchSize * getElementSize(dtype);
            mBindBufferSizes[i] = totalSize/maxBatchSize; // the batch dimension must be divided out
            mCudaBuffers[i] = safeCudaMalloc(totalSize/maxBatchSize);
        }
//        outputBufferSize = mBindBufferSizes[1] * 6/maxBatchSize ; // each heatmap hotspot maps to 6 values: x, y, w, h, reg_x, reg_y
        outputBufferSize = mBindBufferSizes[1] * 6; // each heatmap hotspot maps to 6 values: x, y, w, h, reg_x, reg_y

//        for(int i=0;i<maxBatchSize;i++)
//        {
//            cudaOutputBuffer[i]=safeCudaMalloc(outputBufferSize);// output buffer for each image
//        }

        cudaOutputBuffer=safeCudaMalloc(outputBufferSize);// output buffer for the images' results

        CUDA_CHECK(cudaStreamCreate(&mCudaStream));
    }
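For reference, the per-image output layout this code assumes is a leading float counter followed by tightly packed Detection structs (see the atomicAdd in the kernel further below). The sketch below reconstructs the struct from the fields the kernels actually touch; the authoritative definition lives in the repository's headers, and the wrapper type names (Box, Landmark) are my own assumptions:

// Sketch only, reconstructed from the kernel code below; check the repo headers
// for the real definition before relying on sizeof(Detection).
struct Box { float x1, y1, x2, y2; };
struct Landmark { float x, y; };
struct Detection {
    Box bbox;          // decoded box in input-image coordinates (stride 4 applied)
    int classId;       // heatmap channel the peak came from
    float prob;        // sigmoid(heatmap) score of the peak
    Landmark marks[5]; // filled only by the face model (CTfaceforward_kernel)
};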

Add void ctdetNet::doInference_batch_one_step(const void *inputData, float *outputData_b):

   void ctdetNet::doInference_batch_one_step(const void *inputData,float * outputData_b)
    {
        int inputIndex = 0 ;
        int batch_size=1;

        struct timeval start0, end0;
        gettimeofday(&start0,NULL);
        CUDA_CHECK(cudaMemcpyAsync(mCudaBuffers[inputIndex], inputData, mBindBufferSizes[inputIndex],
                                   cudaMemcpyHostToDevice, mCudaStream));
        cudaDeviceSynchronize();
        gettimeofday(&end0,NULL);
        int timeuse0 = static_cast<int>(1000000 * (end0.tv_sec - start0.tv_sec ) + end0.tv_usec - start0.tv_usec);
        float time0 = float(timeuse0)/1000;
        std::cout << "batch cpDevice is: " << time0 <<  std::endl;

        struct timeval start, end;
        gettimeofday(&start,NULL);
        mContext->execute(batch_size, &mCudaBuffers[inputIndex]); // note: hit a problem here, this path did not support batching
        cudaDeviceSynchronize();
        gettimeofday(&end,NULL);


        int timeuse = static_cast<int>(1000000 * (end.tv_sec - start.tv_sec ) + end.tv_usec - start.tv_usec);
        float time = float(timeuse)/1000;
        std::cout << "batch forward is: " << time <<  std::endl;
//        *time_consum = *time_consum + time;

        CUDA_CHECK(cudaMemset(cudaOutputBuffer, 0, volume(dims_type[1].first)*6*sizeof(float)));

        struct timeval start1, end1;
        gettimeofday(&start1,NULL);
//        PrintResult(mCudaStream[0])


        int step=int(volume(dims_type[1].first)/batch_size*6); // per-image offset into the output buffer, counted in floats

        if (forwardFace){
            CTfaceforward_gpu(static_cast<const float *>(mCudaBuffers[1]),static_cast<const float *>(mCudaBuffers[2]),
                              static_cast<const float *>(mCudaBuffers[3]),static_cast<const float *>(mCudaBuffers[4]),static_cast<float *>(cudaOutputBuffer),
                              input_w/4,input_h/4,classNum,kernelSize,visThresh);
        } else{
            CTdetforward_gpu(static_cast<const float *>(mCudaBuffers[1]),static_cast<const float *>(mCudaBuffers[2]),
                             static_cast<const float *>(mCudaBuffers[3]),static_cast<float *>(cudaOutputBuffer),
                             input_w/4,input_h/4,classNum,kernelSize,visThresh,step);

//            CTdetforward_gpu(static_cast<const float *>(mCudaBuffers[1]),static_cast<const float *>(mCudaBuffers[3]),
//                    static_cast<const float *>(mCudaBuffers[2]),static_cast<float *>(cudaOutputBuffer),
//                             input_w/4,input_h/4,classNum,kernelSize,visThresh,step);
        }

        cudaDeviceSynchronize();
        gettimeofday(&end1,NULL);
        int timeuse1 = static_cast<int>(1000000 * (end1.tv_sec - start1.tv_sec ) + end1.tv_usec - start1.tv_usec);
        float time1 = float(timeuse1)/1000;
        std::cout << " batch post is: " << time1 <<  std::endl;


        struct timeval start2, end2;
        gettimeofday(&start2,NULL);

        CUDA_CHECK(cudaMemcpy((void*)(outputData_b), cudaOutputBuffer,outputBufferSize, cudaMemcpyDeviceToHost));

        cudaDeviceSynchronize();
        gettimeofday(&end2,NULL);
        int timeuse2 = static_cast<int>(1000000 * (end2.tv_sec - start2.tv_sec ) + end2.tv_usec - start2.tv_usec);
        float time2 = float(timeuse2)/1000;
        std::cout << "batch cpHost is: " << time2 <<  std::endl;
    }
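A hypothetical call-site sketch (not from the original repo; preprocessing, member visibility, and exact sizing are assumptions to adapt):

// Assumes `net` is a constructed ctdetNet, `hostInput` already holds the
// preprocessed CHW float tensor the engine expects, and outputBufferSize
// (a byte count, per InitEngine above) is reachable from the caller.
std::vector<float> hostOutput(net.outputBufferSize / sizeof(float), 0.f);
net.doInference_batch_one_step(hostInput.data(), hostOutput.data());
// hostOutput[imageID * step] now holds the detection count for image imageID,
// followed in memory by that image's packed Detection structs.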

ctdetLayer.cu
Modified as follows:

#include "ctdetLayer.h"
#include "utils.h"
#include "stdio.h"
dim3 cudaGridSize(uint n)
{
    uint k = (n - 1) /BLOCK + 1;
    uint x = k ;
    uint y = 1 ;
    if (x > 65535 )
    {
        x = ceil(sqrt(x));
        y = (n - 1 )/(x*BLOCK) + 1;
    }
    dim3 d = {x,y,1} ;
    return d;
}
__device__ float Logist(float data){ return 1./(1. + exp(-data));}

__global__ void PrintResultGPU(float *buffer, size_t unit_length){
    printf("buffer values: %f, %f\n",*buffer, *(buffer+unit_length));
    printf("buffer values: %f, %f\n",*(buffer+1000000), *(buffer+unit_length+1000000));
    printf("buffer values: %f, %f\n",*(buffer+1000000-1), *(buffer+unit_length+1000000-1));
    printf("buffer values: %f, %f\n",*(buffer+1000000), *(buffer+unit_length+1000000));
    printf("buffer values: %f, %f\n",*(buffer+1000000-1), *(buffer+unit_length+1000000-1));
}

__global__ void CTdetforward_kernel(const float *hm, const float *reg,const float *wh , float* output,
        const int w,const int h,const int classes,const int kernel_size,const float visthresh,int step)
        {
    int idx = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x;  // global 1-D index over the (possibly 2-D) grid

    int imageID= idx/(w*h*classes); // which image in the batch this thread belongs to
    //printf("begin to run cuda kernel, currently at %d\n ", imageID);
        //std::cout<<imageID<<std::endl;
//    if(idx == 0){
//        printf("input values: %f, %f, %f\n", hm[0], reg[0], wh[0]);
//    }
    if (idx >= 1*w * h * classes) return;
//    if(idx == 0)
//
    int padding = (kernel_size - 1) / 2;
    int offset = -padding;
    int stride = w * h;
    int grid_x = (idx%(w*h*classes)) % w;
    int grid_y = ((idx%(w*h*classes)) / w) % h;
    int cls = (idx/w/h)%classes;
    int  l, m;
//    int reg_index = idx - cls*stride-imageID*classes*stride; // modified; the original was: int reg_index = idx - cls*stride
    int reg_index = idx - cls*stride-imageID*classes*stride+imageID*stride; // modified; the original was: int reg_index = idx - cls*stride

    float c_x, c_y;
    float objProb = Logist(hm[idx]);
    if (objProb > visthresh) {
        float max = -1;
        int max_index = 0;
        for (l = 0; l < kernel_size; ++l)
            for (m = 0; m < kernel_size; ++m) {
                int cur_x = offset + l + grid_x;
                int cur_y = offset + m + grid_y;
                int cur_index = cur_y * w + cur_x + stride * cls + imageID*classes*stride;
                int valid = (cur_x >= 0 && cur_x < w && cur_y >= 0 && cur_y < h);
                float val = (valid != 0) ? Logist(hm[cur_index]) : -1;
                max_index = (val > max) ? cur_index : max_index;
                max = (val > max) ? val : max;
            }

        if(idx == max_index){
            //printf("begin to run cuda kernel, currently at %d\n ", imageID);
            int resCount = (int) atomicAdd(output+imageID*step, 1);
//            printf("cur rescount %d %f\n ", imageID, *(output+imageID*step));
//            printf("resCount is %d",resCount);
            char *data = (char *) (output+imageID*step) + sizeof(float) + resCount * sizeof(Detection);//+imageID*step*sizeof(float);

//            int resCount = (int) atomicAdd(output, 1);
//            printf("resCount is %d",resCount);
//            char *data = (char *) output + sizeof(float) + resCount * sizeof(Detection);


            Detection *det = (Detection *) (data);
            c_x = grid_x + reg[reg_index];
            c_y = grid_y + reg[reg_index + stride];
            det->bbox.x1 = (c_x - wh[reg_index] / 2) * 4;
            det->bbox.y1 = (c_y - wh[reg_index + stride] / 2) * 4;
            det->bbox.x2 = (c_x + wh[reg_index] / 2) * 4;
            det->bbox.y2 = (c_y + wh[reg_index + stride] / 2) * 4;
            det->classId = cls;
            det->prob = objProb;
        }
    }
//    if(idx == w * h * classes -1){
//        char *test = (char *)output + sizeof(float);
//        printf("first float: %f\n", output[0]);
//        Detection *det_t = (Detection *)test;
//        printf("det_t prob: %f, %d, %f, %f, %f, %f\n", det_t->prob, det_t->classId, det_t->bbox.x1, det_t->bbox.y1, det_t->bbox.x2, det_t->bbox.y2);
//    }

}


__global__ void CTfaceforward_kernel(const float *hm, const float *wh,const float *reg,const float* landmarks,
                                    float *output,const int w,const int h,const int classes,const int kernel_size,const float visthresh  ) {
    int idx = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x;
    if (idx >= w*h*classes) return;
    int padding = (kernel_size-1)/2;
    int offset = - padding;
    int stride = w * h;
    int grid_x = idx % w;
    int grid_y = (idx / w) % h;
    int cls = idx/w/h ;
    int reg_index = idx - cls*stride;
    int l,m,mark_id;
    float c_x,c_y,scale_w,scale_h;

    float objProb = hm[idx];
    float max=-1;
    int max_index =0;
    if(objProb > visthresh){
        for(l=0 ;l < kernel_size ; ++l)
            for(m=0 ; m < kernel_size ; ++m){
                int cur_x = offset + l + grid_x;
                int cur_y = offset + m + grid_y;
                int cur_index = cur_y * w + cur_x + stride*cls;
                int valid = (cur_x>=0 && cur_x < w && cur_y >=0 && cur_y <h );
                float val = (valid !=0 ) ? hm[cur_index]: -1;
                max_index = (val > max) ? cur_index : max_index;
                max = (val > max ) ?  val: max ;
            }
        if(idx == max_index){
            int resCount = (int)atomicAdd(output,1);
            //printf("%d",resCount);
            char* data = (char * )output + sizeof(float) + resCount*sizeof(Detection);
            Detection* det =  (Detection*)(data);
            c_x = (grid_x + reg[reg_index+stride] + 0.5)*4 ; c_y  = (grid_y + reg[reg_index] + 0.5) * 4;
            scale_w =  expf(wh[reg_index+stride]) * 4 ; scale_h  = expf(wh[reg_index]) * 4;
            det->bbox.x1 = c_x - scale_w/2;
            det->bbox.y1 = c_y - scale_h/2 ;
            det->bbox.x2 = c_x + scale_w/2;
            det->bbox.y2 = c_y + scale_h/2;
            det->prob = objProb;
            det->classId = cls;
            for(mark_id=0 ; mark_id < 5 ; ++ mark_id){
                det->marks[mark_id].x = det->bbox.x1 + landmarks[reg_index + (2*mark_id+1)*stride]*scale_w;
                det->marks[mark_id].y = det->bbox.y1 + landmarks[reg_index + (2*mark_id)*stride]*scale_h;
            }
        }
    }
}


void CTdetforward_gpu(const float *hm, const float *reg,const float *wh ,float * output,
                      const int w,const int h,const int classes,const int kernerl_size, const float visthresh,const int step){
    uint num = 1*w * h * classes;
    CTdetforward_kernel<<<cudaGridSize(num),BLOCK>>>(hm,reg,wh,output,w,h,classes,kernerl_size,visthresh,step);
    if(cudaGetLastError())printf("cuda kernel error!\n");
//    CTdetforward_kernel<<<cudaGridSize(num),BLOCK>>>(hm,reg,wh,output,w,h,classes,kernerl_size,visthresh);
}

void PrintResult(float *buffer, size_t unit_length){
    PrintResultGPU<<<1, 1>>>(buffer, unit_length);
    cudaDeviceSynchronize();
}

//void CTdetforward_gpu(const float *hm, const float *reg,const float *wh ,float *output,
//                      const int w,const int h,const int classes,const int kernerl_size, const float visthresh){
//    uint num = w * h * classes;
//    CTdetforward_kernel<<<cudaGridSize(num),BLOCK>>>(hm,reg,wh,output,w,h,classes,kernerl_size,visthresh);
//}


void CTfaceforward_gpu(const float *hm, const float *wh,const float *reg,const float* landmarks,float *output,
                      const int w,const int h,const int classes,const int kernerl_size, const float visthresh ){
    uint num = w * h * classes;
    CTfaceforward_kernel<<<cudaGridSize(num),BLOCK>>>(hm,wh,reg,landmarks,output,w,h,classes,kernerl_size,visthresh);
}
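To consume the batched output on the host, the layout written by CTdetforward_kernel (a float counter at output + imageID*step, followed by packed Detection structs) can be decoded roughly as below. This is a sketch under that assumption, not code from the repo; it assumes <vector> is included and that Detection matches the struct the kernel writes:

void decodeBatchOutput(const float *hostOutput, int batchSize, int step,
                       std::vector<std::vector<Detection>> &results)
{
    results.resize(batchSize);
    for (int img = 0; img < batchSize; ++img) {
        const float *base = hostOutput + img * step;
        int count = static_cast<int>(base[0]);   // the counter written by atomicAdd
        const Detection *dets = reinterpret_cast<const Detection *>(
            reinterpret_cast<const char *>(base) + sizeof(float));
        results[img].assign(dets, dets + count); // copy that image's detections out
    }
}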


ctdetLayer.h
Modified as follows:

//
// Created by cao on 19-10-25.
//

#ifndef CTDET_TRT_CTDETLAYER_H
#define CTDET_TRT_CTDETLAYER_H

//void CTdetforward_gpu(const float *hm, const float *reg,const float *wh ,float *output,
//                      const int w,const int h,const int classes,const int kernerl_size,const float visthresh,cudaStream_t stream);

void PrintResult(float *buffer, size_t unit_length);

void CTdetforward_gpu(const float *hm, const float *reg,const float *wh , float * output,
                      const int w,const int h,const int classes,const int kernerl_size,
                      const float visthresh,const int step);


void CTfaceforward_gpu(const float *hm, const float *wh,const float *reg,const float* landmarks,float *output,
                       const int w,const int h,const int classes,const int kernerl_size, const float visthresh );
#endif //CTDET_TRT_CTDETLAYER_H
