
Record: Understanding CUDA GPU Parallel Programming and Implementing CenterNet Batch Post-Processing


CUDA GPU Parallel Programming

Preface

Basic units to understand: thread, block, grid, plus blockDim and gridDim.
A grid is composed of multiple blocks, and a block is composed of multiple threads.
blockDim: the number of threads per block in each dimension, as set at kernel launch (the hardware imposes an upper limit); it is of type dim3.
gridDim: the number of blocks per grid in each dimension, likewise limited by the hardware; it is of type dim3.
threadIdx: the index of a thread within its block; it is of type uint3.
blockIdx: the index of a block within its grid; it is of type uint3.
gridIdx: there is no such thing, and referencing it causes an error; in other words, a kernel launch uses at most one grid (tested on a 1080 Ti).

The dim3 data type is a CUDA-defined structure, essentially a three-component vector of unsigned integers. It is declared and used as follows:

dim3 BlockSize(16, 16, 1);
// BlockSize.x = 16; BlockSize.y = 16; BlockSize.z = 1;
// It can also be written as dim3 BlockSize(16, 16); the last dimension defaults to 1.

The following sections show how to write one-dimensional and two-dimensional CUDA kernels and how to compute thread indices inside them.

Computing Indices in a One-Dimensional CUDA Kernel

threadIdx.x : the thread index along x within the current block;
blockIdx.x : the block index along x within the current grid;
blockDim.x : the number of threads along x in a block;
gridDim.x : the number of blocks along x in a grid;

So, to get the global index of the 4th thread (index 3) inside the 3rd block (index 2), compute index = blockIdx.x * blockDim.x + threadIdx.x = 2 * blockDim.x + 3. A minimal kernel sketch follows.
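A minimal sketch (my own vector-addition example, not code from this project) showing the standard 1-D index computation and bounds check:

__global__ void addKernel(const float *a, const float *b, float *c, int n)
{
    // global 1-D index: which element of the arrays this thread handles
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n)                     // guard against threads past the array end
        c[idx] = a[idx] + b[idx];
}

// Launch enough 256-thread blocks to cover n elements:
// addKernel<<<(n + 255) / 256, 256>>>(d_a, d_b, d_c, n);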

Computing Indices in a Two-Dimensional CUDA Kernel

threadIdx.y : the thread index along y within the current block;
blockIdx.y : the block index along y within the current grid;
blockDim.y : the number of threads along y in a block;
gridDim.y : the number of blocks along y in a grid;

With two dimensions, the per-axis indices are x = blockIdx.x * blockDim.x + threadIdx.x and y = blockIdx.y * blockDim.y + threadIdx.y. When only the grid is two-dimensional while the block stays one-dimensional (the pattern used by the CenterNet kernels below), the flattened index is (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x. A sketch follows.
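A minimal 2-D sketch (my own example, not from this project) addressing a row-major w x h image:

__global__ void scaleImage(float *img, int w, int h, float factor)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;  // column
    int y = blockIdx.y * blockDim.y + threadIdx.y;  // row
    if (x < w && y < h)                             // skip threads outside the image
        img[y * w + x] *= factor;
}

The launch configuration for this kernel is shown in the next section.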

Setting the Thread Dimensions

When calling a custom kernel from a .cu file, the launch is specified with the triple angle bracket syntax <<<paramA, paramB, ...>>>, where paramA and paramB are mandatory:
paramA: sets gridDim, i.e. the number of blocks in each dimension of the grid, defined with dim3;
paramB: sets blockDim, i.e. the number of threads in each dimension of each block, defined with dim3.
A launch sketch is given below.
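A minimal launch sketch, continuing the hypothetical scaleImage kernel above (not code from this project):

dim3 block(16, 16);                               // 256 threads per block
dim3 grid((w + block.x - 1) / block.x,            // enough blocks to cover
          (h + block.y - 1) / block.y);           // the whole w x h image
scaleImage<<<grid, block>>>(d_img, w, h, 2.0f);   // paramA = grid, paramB = block
cudaDeviceSynchronize();                          // wait for the kernel to finish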

CenterNet CUDA Post-Processing

The starting point is this excellent codebase (Gitee: https://gitee.com/zjkclpch/TensorRT-CenterNet?_from=gitee_search; the GitHub mirror is omitted here because it was temporarily inaccessible at the time of writing). I modified the author's single-image post-processing into batch post-processing.
ctdetNet.cpp
Replace the function void ctdetNet::InitEngine() with:

void ctdetNet::InitEngine() {
        const int maxBatchSize = 1; // change this to the batch size you actually use
        mContext = mEngine->createExecutionContext();
        assert(mContext != nullptr);
        mContext->setProfiler(&mProfiler);
        int nbBindings = mEngine->getNbBindings();//nbBindings=4
        if (nbBindings > 4) forwardFace= true;
        mCudaBuffers.resize(nbBindings); // 0: input, 1-3: outputs
        mBindBufferSizes.resize(nbBindings);
//        cudaOutputBuffer.resize(maxBatchSize);

        int64_t totalSize = 0;
        for (int i = 0; i < nbBindings; ++i)
        {
            nvinfer1::Dims dims = mEngine->getBindingDimensions(i); // if n>1 was set when exporting pth->onnx, dims.d[0] == n here
            nvinfer1::DataType dtype = mEngine->getBindingDataType(i);
            dims_type.push_back({dims,dtype});
            totalSize = volume(dims) * maxBatchSize * getElementSize(dtype);
            mBindBufferSizes[i] = totalSize/maxBatchSize; // the batch dimension must be divided out
            mCudaBuffers[i] = safeCudaMalloc(totalSize/maxBatchSize);
        }
//        outputBufferSize = mBindBufferSizes[1] * 6/maxBatchSize ; // each heatmap hotspot maps to 6 values: x, y, w, h, reg_x, reg_y
        outputBufferSize = mBindBufferSizes[1] * 6; // each heatmap hotspot maps to 6 values: x, y, w, h, reg_x, reg_y

//        for(int i=0;i<maxBatchSize;i++)
//        {
//            cudaOutputBuffer[i]=safeCudaMalloc(outputBufferSize);// output buffer for each image
//        }

        cudaOutputBuffer=safeCudaMalloc(outputBufferSize);// output buffer for the images' results

        CUDA_CHECK(cudaStreamCreate(&mCudaStream));
    }
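For reference, the per-image output layout this code assumes is a leading float counter followed by tightly packed Detection structs (see the atomicAdd in the kernel further below). The sketch below reconstructs the struct from the fields the kernels actually touch; the authoritative definition lives in the repository's headers, and the wrapper type names (Box, Landmark) are my own assumptions:

// Sketch only, reconstructed from the kernel code below; check the repo headers
// for the real definition before relying on sizeof(Detection).
struct Box { float x1, y1, x2, y2; };
struct Landmark { float x, y; };
struct Detection {
    Box bbox;          // decoded box in input-image coordinates (stride 4 applied)
    int classId;       // heatmap channel the peak came from
    float prob;        // sigmoid(heatmap) score of the peak
    Landmark marks[5]; // filled only by the face model (CTfaceforward_kernel)
};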

Add void ctdetNet::doInference_batch_one_step(const void *inputData, float *outputData_b):

   void ctdetNet::doInference_batch_one_step(const void *inputData,float * outputData_b)
    {
        int inputIndex = 0 ;
        int batch_size=1;

        struct timeval start0, end0;
        gettimeofday(&start0,NULL);
        CUDA_CHECK(cudaMemcpyAsync(mCudaBuffers[inputIndex], inputData, mBindBufferSizes[inputIndex],
                                   cudaMemcpyHostToDevice, mCudaStream));
        cudaDeviceSynchronize();
        gettimeofday(&end0,NULL);
        int timeuse0 = static_cast<int>(1000000 * (end0.tv_sec - start0.tv_sec ) + end0.tv_usec - start0.tv_usec);
        float time0 = float(timeuse0)/1000;
        std::cout << "batch cpDevice is: " << time0 <<  std::endl;

        struct timeval start, end;
        gettimeofday(&start,NULL);
        mContext->execute(batch_size, &mCudaBuffers[inputIndex]); // note: hit a problem here, this path did not support batching
        cudaDeviceSynchronize();
        gettimeofday(&end,NULL);


        int timeuse = static_cast<int>(1000000 * (end.tv_sec - start.tv_sec ) + end.tv_usec - start.tv_usec);
        float time = float(timeuse)/1000;
        std::cout << "batch forward is: " << time <<  std::endl;
//        *time_consum = *time_consum + time;

        CUDA_CHECK(cudaMemset(cudaOutputBuffer, 0, volume(dims_type[1].first)*6*sizeof(float)));

        struct timeval start1, end1;
        gettimeofday(&start1,NULL);
//        PrintResult(mCudaStream[0])


        int step=int(volume(dims_type[1].first)/batch_size*6); // per-image offset into the output buffer, counted in floats

        if (forwardFace){
            CTfaceforward_gpu(static_cast<const float *>(mCudaBuffers[1]),static_cast<const float *>(mCudaBuffers[2]),
                              static_cast<const float *>(mCudaBuffers[3]),static_cast<const float *>(mCudaBuffers[4]),static_cast<float *>(cudaOutputBuffer),
                              input_w/4,input_h/4,classNum,kernelSize,visThresh);
        } else{
            CTdetforward_gpu(static_cast<const float *>(mCudaBuffers[1]),static_cast<const float *>(mCudaBuffers[2]),
                             static_cast<const float *>(mCudaBuffers[3]),static_cast<float *>(cudaOutputBuffer),
                             input_w/4,input_h/4,classNum,kernelSize,visThresh,step);

//            CTdetforward_gpu(static_cast<const float *>(mCudaBuffers[1]),static_cast<const float *>(mCudaBuffers[3]),
//                    static_cast<const float *>(mCudaBuffers[2]),static_cast<float *>(cudaOutputBuffer),
//                             input_w/4,input_h/4,classNum,kernelSize,visThresh,step);
        }

        cudaDeviceSynchronize();
        gettimeofday(&end1,NULL);
        int timeuse1 = static_cast<int>(1000000 * (end1.tv_sec - start1.tv_sec ) + end1.tv_usec - start1.tv_usec);
        float time1 = float(timeuse1)/1000;
        std::cout << " batch post is: " << time1 <<  std::endl;


        struct timeval start2, end2;
        gettimeofday(&start2,NULL);

        CUDA_CHECK(cudaMemcpy((void*)(outputData_b), cudaOutputBuffer,outputBufferSize, cudaMemcpyDeviceToHost));

        cudaDeviceSynchronize();
        gettimeofday(&end2,NULL);
        int timeuse2 = static_cast<int>(1000000 * (end2.tv_sec - start2.tv_sec ) + end2.tv_usec - start2.tv_usec);
        float time2 = float(timeuse2)/1000;
        std::cout << "batch cpHost is: " << time2 <<  std::endl;
    }
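A hypothetical call-site sketch (not from the original repo; preprocessing, member visibility, and exact sizing are assumptions to adapt):

// Assumes `net` is a constructed ctdetNet, `hostInput` already holds the
// preprocessed CHW float tensor the engine expects, and outputBufferSize
// (a byte count, per InitEngine above) is reachable from the caller.
std::vector<float> hostOutput(net.outputBufferSize / sizeof(float), 0.f);
net.doInference_batch_one_step(hostInput.data(), hostOutput.data());
// hostOutput[imageID * step] now holds the detection count for image imageID,
// followed in memory by that image's packed Detection structs.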

ctdetLayer.cu
Modified as follows:

#include "ctdetLayer.h"
#include "utils.h"
#include "stdio.h"
dim3 cudaGridSize(uint n)
{
    uint k = (n - 1) /BLOCK + 1;
    uint x = k ;
    uint y = 1 ;
    if (x > 65535 )
    {
        x = ceil(sqrt(x));
        y = (n - 1 )/(x*BLOCK) + 1;
    }
    dim3 d = {x,y,1} ;
    return d;
}
__device__ float Logist(float data){ return 1./(1. + exp(-data));}

__global__ void PrintResultGPU(float *buffer, size_t unit_length){
    printf("buffer values: %f, %f\n",*buffer, *(buffer+unit_length));
    printf("buffer values: %f, %f\n",*(buffer+1000000), *(buffer+unit_length+1000000));
    printf("buffer values: %f, %f\n",*(buffer+1000000-1), *(buffer+unit_length+1000000-1));
    printf("buffer values: %f, %f\n",*(buffer+1000000), *(buffer+unit_length+1000000));
    printf("buffer values: %f, %f\n",*(buffer+1000000-1), *(buffer+unit_length+1000000-1));
}

__global__ void CTdetforward_kernel(const float *hm, const float *reg,const float *wh , float* output,
        const int w,const int h,const int classes,const int kernel_size,const float visthresh,int step)
        {
    int idx = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x;  // global 1-D index over the (possibly 2-D) grid

    int imageID= idx/(w*h*classes); // which image in the batch this thread belongs to
    //printf("begin to run cuda kernel, currently at %d\n ", imageID);
        //std::cout<<imageID<<std::endl;
//    if(idx == 0){
//        printf("input values: %f, %f, %f\n", hm[0], reg[0], wh[0]);
//    }
    if (idx >= 1*w * h * classes) return;
//    if(idx == 0)
//
    int padding = (kernel_size - 1) / 2;
    int offset = -padding;
    int stride = w * h;
    int grid_x = (idx%(w*h*classes)) % w;
    int grid_y = ((idx%(w*h*classes)) / w) % h;
    int cls = (idx/w/h)%classes;
    int  l, m;
//    int reg_index = idx - cls*stride-imageID*classes*stride; // modified; the original was: int reg_index = idx - cls*stride
    int reg_index = idx - cls*stride-imageID*classes*stride+imageID*stride; // modified; the original was: int reg_index = idx - cls*stride

    float c_x, c_y;
    float objProb = Logist(hm[idx]);
    if (objProb > visthresh) {
        float max = -1;
        int max_index = 0;
        for (l = 0; l < kernel_size; ++l)
            for (m = 0; m < kernel_size; ++m) {
                int cur_x = offset + l + grid_x;
                int cur_y = offset + m + grid_y;
                int cur_index = cur_y * w + cur_x + stride * cls + imageID*classes*stride;
                int valid = (cur_x >= 0 && cur_x < w && cur_y >= 0 && cur_y < h);
                float val = (valid != 0) ? Logist(hm[cur_index]) : -1;
                max_index = (val > max) ? cur_index : max_index;
                max = (val > max) ? val : max;
            }

        if(idx == max_index){
            //printf("begin to run cuda kernel, currently at %d\n ", imageID);
            int resCount = (int) atomicAdd(output+imageID*step, 1);
//            printf("cur rescount %d %f\n ", imageID, *(output+imageID*step));
//            printf("resCount is %d",resCount);
            char *data = (char *) (output+imageID*step) + sizeof(float) + resCount * sizeof(Detection);//+imageID*step*sizeof(float);

//            int resCount = (int) atomicAdd(output, 1);
//            printf("resCount is %d",resCount);
//            char *data = (char *) output + sizeof(float) + resCount * sizeof(Detection);


            Detection *det = (Detection *) (data);
            c_x = grid_x + reg[reg_index];
            c_y = grid_y + reg[reg_index + stride];
            det->bbox.x1 = (c_x - wh[reg_index] / 2) * 4;
            det->bbox.y1 = (c_y - wh[reg_index + stride] / 2) * 4;
            det->bbox.x2 = (c_x + wh[reg_index] / 2) * 4;
            det->bbox.y2 = (c_y + wh[reg_index + stride] / 2) * 4;
            det->classId = cls;
            det->prob = objProb;
        }
    }
//    if(idx == w * h * classes -1){
//        char *test = (char *)output + sizeof(float);
//        printf("first float: %f\n", output[0]);
//        Detection *det_t = (Detection *)test;
//        printf("det_t prob: %f, %d, %f, %f, %f, %f\n", det_t->prob, det_t->classId, det_t->bbox.x1, det_t->bbox.y1, det_t->bbox.x2, det_t->bbox.y2);
//    }

}


__global__ void CTfaceforward_kernel(const float *hm, const float *wh,const float *reg,const float* landmarks,
                                    float *output,const int w,const int h,const int classes,const int kernel_size,const float visthresh  ) {
    int idx = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x;
    if (idx >= w*h*classes) return;
    int padding = (kernel_size-1)/2;
    int offset = - padding;
    int stride = w * h;
    int grid_x = idx % w;
    int grid_y = (idx / w) % h;
    int cls = idx/w/h ;
    int reg_index = idx - cls*stride;
    int l,m,mark_id;
    float c_x,c_y,scale_w,scale_h;

    float objProb = hm[idx];
    float max=-1;
    int max_index =0;
    if(objProb > visthresh){
        for(l=0 ;l < kernel_size ; ++l)
            for(m=0 ; m < kernel_size ; ++m){
                int cur_x = offset + l + grid_x;
                int cur_y = offset + m + grid_y;
                int cur_index = cur_y * w + cur_x + stride*cls;
                int valid = (cur_x>=0 && cur_x < w && cur_y >=0 && cur_y <h );
                float val = (valid !=0 ) ? hm[cur_index]: -1;
                max_index = (val > max) ? cur_index : max_index;
                max = (val > max ) ?  val: max ;
            }
        if(idx == max_index){
            int resCount = (int)atomicAdd(output,1);
            //printf("%d",resCount);
            char* data = (char * )output + sizeof(float) + resCount*sizeof(Detection);
            Detection* det =  (Detection*)(data);
            c_x = (grid_x + reg[reg_index+stride] + 0.5)*4 ; c_y  = (grid_y + reg[reg_index] + 0.5) * 4;
            scale_w =  expf(wh[reg_index+stride]) * 4 ; scale_h  = expf(wh[reg_index]) * 4;
            det->bbox.x1 = c_x - scale_w/2;
            det->bbox.y1 = c_y - scale_h/2 ;
            det->bbox.x2 = c_x + scale_w/2;
            det->bbox.y2 = c_y + scale_h/2;
            det->prob = objProb;
            det->classId = cls;
            for(mark_id=0 ; mark_id < 5 ; ++ mark_id){
                det->marks[mark_id].x = det->bbox.x1 + landmarks[reg_index + (2*mark_id+1)*stride]*scale_w;
                det->marks[mark_id].y = det->bbox.y1 + landmarks[reg_index + (2*mark_id)*stride]*scale_h;
            }
        }
    }
}


void CTdetforward_gpu(const float *hm, const float *reg,const float *wh ,float * output,
                      const int w,const int h,const int classes,const int kernerl_size, const float visthresh,const int step){
    uint num = 1*w * h * classes;
    CTdetforward_kernel<<<cudaGridSize(num),BLOCK>>>(hm,reg,wh,output,w,h,classes,kernerl_size,visthresh,step);
    if(cudaGetLastError())printf("cuda kernel error!\n");
//    CTdetforward_kernel<<<cudaGridSize(num),BLOCK>>>(hm,reg,wh,output,w,h,classes,kernerl_size,visthresh);
}

void PrintResult(float *buffer, size_t unit_length){
    PrintResultGPU<<<1, 1>>>(buffer, unit_length);
    cudaDeviceSynchronize();
}

//void CTdetforward_gpu(const float *hm, const float *reg,const float *wh ,float *output,
//                      const int w,const int h,const int classes,const int kernerl_size, const float visthresh){
//    uint num = w * h * classes;
//    CTdetforward_kernel<<<cudaGridSize(num),BLOCK>>>(hm,reg,wh,output,w,h,classes,kernerl_size,visthresh);
//}


void CTfaceforward_gpu(const float *hm, const float *wh,const float *reg,const float* landmarks,float *output,
                      const int w,const int h,const int classes,const int kernerl_size, const float visthresh ){
    uint num = w * h * classes;
    CTfaceforward_kernel<<<cudaGridSize(num),BLOCK>>>(hm,wh,reg,landmarks,output,w,h,classes,kernerl_size,visthresh);
}
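To consume the batched output on the host, the layout written by CTdetforward_kernel (a float counter at output + imageID*step, followed by packed Detection structs) can be decoded roughly as below. This is a sketch under that assumption, not code from the repo; it assumes <vector> is included and that Detection matches the struct the kernel writes:

void decodeBatchOutput(const float *hostOutput, int batchSize, int step,
                       std::vector<std::vector<Detection>> &results)
{
    results.resize(batchSize);
    for (int img = 0; img < batchSize; ++img) {
        const float *base = hostOutput + img * step;
        int count = static_cast<int>(base[0]);   // the counter written by atomicAdd
        const Detection *dets = reinterpret_cast<const Detection *>(
            reinterpret_cast<const char *>(base) + sizeof(float));
        results[img].assign(dets, dets + count); // copy that image's detections out
    }
}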


ctdetLayer.h
Modified as follows:

//
// Created by cao on 19-10-25.
//

#ifndef CTDET_TRT_CTDETLAYER_H
#define CTDET_TRT_CTDETLAYER_H

//void CTdetforward_gpu(const float *hm, const float *reg,const float *wh ,float *output,
//                      const int w,const int h,const int classes,const int kernerl_size,const float visthresh,cudaStream_t stream);

void PrintResult(float *buffer, size_t unit_length);

void CTdetforward_gpu(const float *hm, const float *reg,const float *wh , float * output,
                      const int w,const int h,const int classes,const int kernerl_size,
                      const float visthresh,const int step);


void CTfaceforward_gpu(const float *hm, const float *wh,const float *reg,const float* landmarks,float *output,
                       const int w,const int h,const int classes,const int kernerl_size, const float visthresh );
#endif //CTDET_TRT_CTDETLAYER_H
