Understand the basic units: thread (Thread), thread block (Block), and grid (Grid), plus the block dimensions (blockDim) and grid dimensions (gridDim).
A grid is made up of multiple blocks, and a block is made up of multiple threads.
blockDim: the number of threads a block contains along each axis; the upper limit is hardware-dependent (see the query sketch after this list); it is of type dim3.
gridDim: the number of blocks a grid contains along each axis; again hardware-limited; it is of type dim3.
threadIdx: the index of a thread, relative to the block it belongs to; it is of type uint3.
blockIdx: the index of a block, relative to the grid it belongs to; it is of type uint3.
gridIdx: no such variable exists, and using it is a compile error; a kernel launch runs on exactly one grid (observed on a 1080 Ti).
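These hardware limits can be queried at runtime. A minimal sketch using the standard CUDA runtime API (device 0 is assumed):

#include <cstdio>
#include <cuda_runtime.h>

int main()
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);  // query device 0
    printf("max threads per block: %d\n", prop.maxThreadsPerBlock);
    printf("max block dims: %d x %d x %d\n",
           prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
    printf("max grid dims:  %d x %d x %d\n",
           prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
    return 0;
}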
The dim3 type is a structure defined by CUDA, essentially a three-component vector of unsigned integers. It is defined and used as follows:
dim3 BlockSize(16, 16, 1);
// BlockSize.x = 16; BlockSize.y = 16; BlockSize.z = 1;
// This can also be written as dim3 BlockSize(16, 16); any unspecified dimension defaults to 1.
The following shows how to use one- and two-dimensional kernels in CUDA and how to compute thread indices.
threadIdx.x: the thread's index along x within the current block;
blockIdx.x: the block's index along x within the current grid;
blockDim.x: the number of threads along x in a block;
gridDim.x: the number of blocks along x in a grid;
So, to find the global index of the 4th thread (index 3) in the 3rd block (index 2), the computation proceeds as in the sketch below:
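A minimal kernel illustrating the 1D global index (the block size of 8 used in the comment is just an example value):

__global__ void index1d(int *out, int n)
{
    // every block before this one contributes blockDim.x threads
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    // e.g. with blockDim.x = 8: blockIdx.x = 2, threadIdx.x = 3  ->  index = 2 * 8 + 3 = 19
    if (index < n) out[index] = index;
}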
threadIdx.y: the thread's index along y within the current block;
blockIdx.y: the block's index along y within the current grid;
blockDim.y: the number of threads along y in a block;
gridDim.y: the number of blocks along y in a grid;
Putting the two axes together gives the 2D global index, illustrated in the sketch below:
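A minimal 2D-indexing kernel (the names and the row-major layout are illustrative):

__global__ void index2d(int *out, int width, int height)
{
    // per-axis global coordinates: block offset plus thread offset
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < width && y < height)
        out[y * width + x] = y * width + x;  // row-major linear index
}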
When calling a custom kernel from a .cu file, use the triple-chevron launch syntax <<<paramA, paramB, ...>>>. The first two parameters are required (the optional third and fourth are the dynamic shared-memory size in bytes and the CUDA stream). Parameter details:
paramA: sets gridDim, i.e. the number of blocks per grid dimension, defined with dim3;
paramB: sets blockDim, i.e. the number of threads per block dimension, defined with dim3.
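A minimal launch sketch putting these together, assuming a hypothetical 512x512 image, a device pointer d_data allocated elsewhere, and an illustrative kernel named myKernel:

dim3 blockSize(16, 16);  // 256 threads per block
dim3 gridSize((512 + blockSize.x - 1) / blockSize.x,   // ceil(512 / 16) = 32 blocks along x
              (512 + blockSize.y - 1) / blockSize.y);  // ceil(512 / 16) = 32 blocks along y
myKernel<<<gridSize, blockSize>>>(d_data, 512, 512);
cudaDeviceSynchronize();  // wait for the kernel to finish

Rounding the grid size up and bounds-checking inside the kernel is the usual pattern when the image size is not a multiple of the block size.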
Hats off to the original author's code (Gitee: https://gitee.com/zjkclpch/TensorRT-CenterNet?_from=gitee_search; the GitHub link is omitted for now, as access was blocked during the Two Sessions). I modified the author's single-image post-processing into batch processing.
ctdetNet.cpp
Modify the function void ctdetNet::InitEngine() as follows:
void ctdetNet::InitEngine()
{
    const int maxBatchSize = 1; // change this to the batch size you actually use
    mContext = mEngine->createExecutionContext();
    assert(mContext != nullptr);
    mContext->setProfiler(&mProfiler);
    int nbBindings = mEngine->getNbBindings(); // nbBindings = 4
    if (nbBindings > 4) forwardFace = true;
    mCudaBuffers.resize(nbBindings);   // 0: input, 1-3: outputs
    mBindBufferSizes.resize(nbBindings);
    // cudaOutputBuffer.resize(maxBatchSize);
    int64_t totalSize = 0;
    for (int i = 0; i < nbBindings; ++i)
    {
        nvinfer1::Dims dims = mEngine->getBindingDimensions(i); // if n > 1 was set when exporting pth -> onnx, dims.d[0] == n here
        nvinfer1::DataType dtype = mEngine->getBindingDataType(i);
        dims_type.push_back({dims, dtype});
        totalSize = volume(dims) * maxBatchSize * getElementSize(dtype);
        mBindBufferSizes[i] = totalSize / maxBatchSize; // must divide out the batch dimension
        mCudaBuffers[i] = safeCudaMalloc(totalSize / maxBatchSize);
    }
    // outputBufferSize = mBindBufferSizes[1] * 6 / maxBatchSize; // each heatmap hotspot maps to 6 values: x, y, w, h, reg_x, reg_y
    outputBufferSize = mBindBufferSizes[1] * 6; // each heatmap hotspot maps to 6 values: x, y, w, h, reg_x, reg_y
    // for (int i = 0; i < maxBatchSize; i++)
    // {
    //     cudaOutputBuffer[i] = safeCudaMalloc(outputBufferSize); // output address for each image
    // }
    cudaOutputBuffer = safeCudaMalloc(outputBufferSize); // output buffer for each image
    CUDA_CHECK(cudaStreamCreate(&mCudaStream));
}
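Note that mBindBufferSizes ends up holding the per-image size (totalSize / maxBatchSize), so the copies and offsets below are all expressed per image; only the final output buffer is scaled by the 6 floats each heatmap hotspot expands into (x, y, w, h, reg_x, reg_y).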
Add void ctdetNet::doInference_batch_one_step(const void *inputData, float *outputData_b):
void ctdetNet::doInference_batch_one_step(const void *inputData, float *outputData_b)
{
    int inputIndex = 0;
    int batch_size = 1;

    // copy the input batch host -> device and time it
    struct timeval start0, end0;
    gettimeofday(&start0, NULL);
    CUDA_CHECK(cudaMemcpyAsync(mCudaBuffers[inputIndex], inputData, mBindBufferSizes[inputIndex],
                               cudaMemcpyHostToDevice, mCudaStream));
    cudaDeviceSynchronize();
    gettimeofday(&end0, NULL);
    int timeuse0 = static_cast<int>(1000000 * (end0.tv_sec - start0.tv_sec) + end0.tv_usec - start0.tv_usec);
    float time0 = float(timeuse0) / 1000;
    std::cout << "batch cpDevice is: " << time0 << std::endl;

    // run the network and time it
    struct timeval start, end;
    gettimeofday(&start, NULL);
    mContext->execute(batch_size, &mCudaBuffers[inputIndex]); // problem here: batching is not supported
    cudaDeviceSynchronize();
    gettimeofday(&end, NULL);
    int timeuse = static_cast<int>(1000000 * (end.tv_sec - start.tv_sec) + end.tv_usec - start.tv_usec);
    float time = float(timeuse) / 1000;
    std::cout << "batch forward is: " << time << std::endl;

    // clear the output buffer, then run the post-processing kernel and time it
    CUDA_CHECK(cudaMemset(cudaOutputBuffer, 0, volume(dims_type[1].first) * 6 * sizeof(float)));
    struct timeval start1, end1;
    gettimeofday(&start1, NULL);
    int step = int(volume(dims_type[1].first) / batch_size * 6); // per-image offset, counted in floats
    if (forwardFace)
    {
        CTfaceforward_gpu(static_cast<const float *>(mCudaBuffers[1]), static_cast<const float *>(mCudaBuffers[2]),
                          static_cast<const float *>(mCudaBuffers[3]), static_cast<const float *>(mCudaBuffers[4]),
                          static_cast<float *>(cudaOutputBuffer),
                          input_w / 4, input_h / 4, classNum, kernelSize, visThresh);
    }
    else
    {
        CTdetforward_gpu(static_cast<const float *>(mCudaBuffers[1]), static_cast<const float *>(mCudaBuffers[2]),
                         static_cast<const float *>(mCudaBuffers[3]), static_cast<float *>(cudaOutputBuffer),
                         input_w / 4, input_h / 4, classNum, kernelSize, visThresh, step);
    }
    cudaDeviceSynchronize();
    gettimeofday(&end1, NULL);
    int timeuse1 = static_cast<int>(1000000 * (end1.tv_sec - start1.tv_sec) + end1.tv_usec - start1.tv_usec);
    float time1 = float(timeuse1) / 1000;
    std::cout << "batch post is: " << time1 << std::endl;

    // copy the packed detections device -> host and time it
    struct timeval start2, end2;
    gettimeofday(&start2, NULL);
    CUDA_CHECK(cudaMemcpy((void *)(outputData_b), cudaOutputBuffer, outputBufferSize, cudaMemcpyDeviceToHost));
    cudaDeviceSynchronize();
    gettimeofday(&end2, NULL);
    int timeuse2 = static_cast<int>(1000000 * (end2.tv_sec - start2.tv_sec) + end2.tv_usec - start2.tv_usec);
    float time2 = float(timeuse2) / 1000;
    std::cout << "batch cpHost is: " << time2 << std::endl;
}
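For reference, a hedged sketch of how a caller might read the per-image result blocks out of outputData_b. The layout is inferred from CTdetforward_kernel below: the first float of each image's block holds the detection count (written by atomicAdd), with the packed Detection structs right after it. printDetections is an illustrative name, step must match the per-image float offset computed above, and Detection is the struct from the repo's utils header.

// hypothetical host-side parse; layout inferred from the kernel's atomicAdd + struct writes
void printDetections(float *outputData_b, int batch_size, int step)
{
    for (int img = 0; img < batch_size; ++img)
    {
        float *block = outputData_b + img * step;
        int num = static_cast<int>(block[0]);  // detection count written by atomicAdd
        Detection *dets = reinterpret_cast<Detection *>(
            reinterpret_cast<char *>(block) + sizeof(float));
        for (int i = 0; i < num; ++i)
            printf("img %d: class %d prob %.3f box (%.1f, %.1f, %.1f, %.1f)\n",
                   img, dets[i].classId, dets[i].prob,
                   dets[i].bbox.x1, dets[i].bbox.y1, dets[i].bbox.x2, dets[i].bbox.y2);
    }
}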
ctdetLayer.cu
Modified as follows:
#include "ctdetLayer.h" #include "utils.h" #include "stdio.h" dim3 cudaGridSize(uint n) { uint k = (n - 1) /BLOCK + 1; uint x = k ; uint y = 1 ; if (x > 65535 ) { x = ceil(sqrt(x)); y = (n - 1 )/(x*BLOCK) + 1; } dim3 d = {x,y,1} ; return d; } __device__ float Logist(float data){ return 1./(1. + exp(-data));} __global__ void PrintResultGPU(float *buffer, size_t unit_length){ printf("buffer values: %f, %f\n",*buffer, *(buffer+unit_length)); printf("buffer values: %f, %f\n",*(buffer+1000000), *(buffer+unit_length+1000000)); printf("buffer values: %f, %f\n",*(buffer+1000000-1), *(buffer+unit_length+1000000-1)); printf("buffer values: %f, %f\n",*(buffer+1000000), *(buffer+unit_length+1000000)); printf("buffer values: %f, %f\n",*(buffer+1000000-1), *(buffer+unit_length+1000000-1)); } __global__ void CTdetforward_kernel(const float *hm, const float *reg,const float *wh , float* output, const int w,const int h,const int classes,const int kernel_size,const float visthresh,int step) { int idx = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x; //gridIdx.x int imageID= idx/(w*h*classes);//(w*h*c); //printf("begin to run cuda kernel, currently at %d\n ", imageID); //std::cout<<imageID<<std::endl; // if(idx == 0){ // printf("input values: %f, %f, %f\n", hm[0], reg[0], wh[0]); // } if (idx >= 1*w * h * classes) return; // if(idx == 0) // int padding = (kernel_size - 1) / 2; int offset = -padding; int stride = w * h; int grid_x = (idx%(w*h*classes)) % w; int grid_y = ((idx%(w*h*classes)) / w) % h; int cls = (idx/w/h)%classes; int l, m; // int reg_index = idx - cls*stride-imageID*classes*stride; //修改,原有int reg_index = idx - cls*stride int reg_index = idx - cls*stride-imageID*classes*stride+imageID*stride; //修改,原有int reg_index = idx - cls*stride float c_x, c_y; float objProb = Logist(hm[idx]); if (objProb > visthresh) { float max = -1; int max_index = 0; for (l = 0; l < kernel_size; ++l) for (m = 0; m < kernel_size; ++m) { int cur_x = offset + l + grid_x; int cur_y = offset + m + grid_y; int cur_index = cur_y * w + cur_x + stride * cls + imageID*classes*stride; int valid = (cur_x >= 0 && cur_x < w && cur_y >= 0 && cur_y < h); float val = (valid != 0) ? Logist(hm[cur_index]) : -1; max_index = (val > max) ? cur_index : max_index; max = (val > max) ? 
val : max; } if(idx == max_index){ //printf("begin to run cuda kernel, currently at %d\n ", imageID); int resCount = (int) atomicAdd(output+imageID*step, 1); // printf("cur rescount %d %f\n ", imageID, *(output+imageID*step)); // printf("resCount is %d",resCount); char *data = (char *) (output+imageID*step) + sizeof(float) + resCount * sizeof(Detection);//+imageID*step*sizeof(float); // int resCount = (int) atomicAdd(output, 1); // printf("resCount is %d",resCount); // char *data = (char *) output + sizeof(float) + resCount * sizeof(Detection); Detection *det = (Detection *) (data); c_x = grid_x + reg[reg_index]; c_y = grid_y + reg[reg_index + stride]; det->bbox.x1 = (c_x - wh[reg_index] / 2) * 4; det->bbox.y1 = (c_y - wh[reg_index + stride] / 2) * 4; det->bbox.x2 = (c_x + wh[reg_index] / 2) * 4; det->bbox.y2 = (c_y + wh[reg_index + stride] / 2) * 4; det->classId = cls; det->prob = objProb; } } // if(idx == w * h * classes -1){ // char *test = (char *)output + sizeof(float); // printf("first float: %f\n", output[0]); // Detection *det_t = (Detection *)test; // printf("det_t prob: %f, %d, %f, %f, %f, %f\n", det_t->prob, det_t->classId, det_t->bbox.x1, det_t->bbox.y1, det_t->bbox.x2, det_t->bbox.y2); // } } __global__ void CTfaceforward_kernel(const float *hm, const float *wh,const float *reg,const float* landmarks, float *output,const int w,const int h,const int classes,const int kernel_size,const float visthresh ) { int idx = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x; if (idx >= w*h*classes) return; int padding = (kernel_size-1)/2; int offset = - padding; int stride = w * h; int grid_x = idx % w; int grid_y = (idx / w) % h; int cls = idx/w/h ; int reg_index = idx - cls*stride; int l,m,mark_id; float c_x,c_y,scale_w,scale_h; float objProb = hm[idx]; float max=-1; int max_index =0; if(objProb > visthresh){ for(l=0 ;l < kernel_size ; ++l) for(m=0 ; m < kernel_size ; ++m){ int cur_x = offset + l + grid_x; int cur_y = offset + m + grid_y; int cur_index = cur_y * w + cur_x + stride*cls; int valid = (cur_x>=0 && cur_x < w && cur_y >=0 && cur_y <h ); float val = (valid !=0 ) ? hm[cur_index]: -1; max_index = (val > max) ? cur_index : max_index; max = (val > max ) ? 
val: max ; } if(idx == max_index){ int resCount = (int)atomicAdd(output,1); //printf("%d",resCount); char* data = (char * )output + sizeof(float) + resCount*sizeof(Detection); Detection* det = (Detection*)(data); c_x = (grid_x + reg[reg_index+stride] + 0.5)*4 ; c_y = (grid_y + reg[reg_index] + 0.5) * 4; scale_w = expf(wh[reg_index+stride]) * 4 ; scale_h = expf(wh[reg_index]) * 4; det->bbox.x1 = c_x - scale_w/2; det->bbox.y1 = c_y - scale_h/2 ; det->bbox.x2 = c_x + scale_w/2; det->bbox.y2 = c_y + scale_h/2; det->prob = objProb; det->classId = cls; for(mark_id=0 ; mark_id < 5 ; ++ mark_id){ det->marks[mark_id].x = det->bbox.x1 + landmarks[reg_index + (2*mark_id+1)*stride]*scale_w; det->marks[mark_id].y = det->bbox.y1 + landmarks[reg_index + (2*mark_id)*stride]*scale_h; } } } } void CTdetforward_gpu(const float *hm, const float *reg,const float *wh ,float * output, const int w,const int h,const int classes,const int kernerl_size, const float visthresh,const int step){ uint num = 1*w * h * classes; CTdetforward_kernel<<<cudaGridSize(num),BLOCK>>>(hm,reg,wh,output,w,h,classes,kernerl_size,visthresh,step); if(cudaGetLastError())printf("cuda kernel error!\n"); // CTdetforward_kernel<<<cudaGridSize(num),BLOCK>>>(hm,reg,wh,output,w,h,classes,kernerl_size,visthresh); } void PrintResult(float *buffer, size_t unit_length){ PrintResultGPU<<<1, 1>>>(buffer, unit_length); cudaDeviceSynchronize(); } //void CTdetforward_gpu(const float *hm, const float *reg,const float *wh ,float *output, // const int w,const int h,const int classes,const int kernerl_size, const float visthresh){ // uint num = w * h * classes; // CTdetforward_kernel<<<cudaGridSize(num),BLOCK>>>(hm,reg,wh,output,w,h,classes,kernerl_size,visthresh); //} void CTfaceforward_gpu(const float *hm, const float *wh,const float *reg,const float* landmarks,float *output, const int w,const int h,const int classes,const int kernerl_size, const float visthresh ){ uint num = w * h * classes; CTfaceforward_kernel<<<cudaGridSize(num),BLOCK>>>(hm,wh,reg,landmarks,output,w,h,classes,kernerl_size,visthresh); }
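A quick sanity check on cudaGridSize: it spills into a 2D grid only when more than 65535 blocks are needed, which is exactly why the kernels above reconstruct the linear index as (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x. For example, with a 128 x 128 feature map, 80 classes, and BLOCK = 512 (an assumed value; check the repo's utils.h for the real one), num = 1,310,720 and the grid comes out as {2560, 1, 1}, giving 2560 * 512 = 1,310,720 threads, one per heatmap cell.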
ctdetLayer.h
Modified as follows:
//
// Created by cao on 19-10-25.
//
#ifndef CTDET_TRT_CTDETLAYER_H
#define CTDET_TRT_CTDETLAYER_H

void PrintResult(float *buffer, size_t unit_length);

void CTdetforward_gpu(const float *hm, const float *reg, const float *wh, float *output,
                      const int w, const int h, const int classes, const int kernerl_size,
                      const float visthresh, const int step);

void CTfaceforward_gpu(const float *hm, const float *wh, const float *reg, const float *landmarks, float *output,
                       const int w, const int h, const int classes, const int kernerl_size, const float visthresh);

#endif // CTDET_TRT_CTDETLAYER_H