赞
踩
基数排序是具有固定迭代次数的排序算法, 其通过对最低位到最高位的一一比较,对数值排序。GPU版的基数排序将数据分为N个部分并行进行基数排序,随后并行规约得到排序后的数组。
这里实现了一版多batch的基数排序实例,并同时输出原数组的序号,基本实现了argsort的功能,代码如下所示:
#include<iostream> #include<cuda_runtime.h> __device__ void preprocess_float(float* const data, int batch, int numData, int tidx,int tidy) { if(tidy>numData) return; if(tidx>batch) return; for(int i = tidy;i<numData;i+=blockDim.y) { unsigned int *data_temp = (unsigned int *)(&data[i + tidx*numData]); *data_temp = (*data_temp >> 31 & 0x1)? ~(*data_temp): (*data_temp) | 0x80000000; } } __device__ void Aeprocess_float(float* const data, int batch, int numData, int tidx,int tidy) { for(int i = tidy;i<numData;i+=blockDim.y) { unsigned int* data_temp = (unsigned int *)(&data[i + tidx*numData]); *data_temp = (*data_temp >> 31 & 0x1)? (*data_temp) & 0x7fffffff: ~(*data_temp); } } __device__ void radixKernel(float* data0,float* data1,int tidx,int tidy,int batch,int numData){ for(int bit=0;bit<32;bit++){ unsigned int mask = 1 << bit; unsigned int cnt0 = 0,cnt1 = 0; for(int i=tidy;i<numData;i+=blockDim.y){ unsigned int *temp =(unsigned int *) &data0[i + tidx*numData]; if(*temp&mask){ data1[tidy+cnt1 + tidx*numData] = data0[i + tidx*numData]; cnt1 += blockDim.y; } else{ data0[tidy+cnt0+ tidx*numData] = data0[i+ tidx*numData]; cnt0 += blockDim.y; } } for(int j=0;j<cnt1;j+=blockDim.y){ data0[j+cnt0+tidy+ tidx*numData] = data1[j+tidy+ tidx*numData]; } } return; } __device__ void mergeKernel(float* data0,float* data1,int* index,int tidx,int tidy,int batch,int numData){ int numPerList = ceil((float)numData / blockDim.y); extern __shared__ int listIndexrecordValrecordTid[]; int* listIndex = (int*)listIndexrecordValrecordTid; //记录线程上指针位置 float* recordVal = (float*)listIndexrecordValrecordTid + batch*blockDim.y; //得到应比较的数 int * recordTid = (int*)listIndexrecordValrecordTid + 2*batch*blockDim.y; // 记录当前线程 int* recordSrcIndex = (int*)listIndexrecordValrecordTid + 3*batch*blockDim.y; // 记录原index listIndex[tidy + tidx * blockDim.y] = 0; recordVal[tidy + tidx * blockDim.y] = 0; recordTid[tidy + tidx * blockDim.y] = tidy + tidx * blockDim.y; recordSrcIndex[tidy + tidx * blockDim.y] = 0; __syncthreads(); for(int i=0;i<numData;i++){ recordVal[tidy + tidx * blockDim.y] = 0; recordTid[tidy + tidx * blockDim.y] = tidy + tidx * blockDim.y; recordSrcIndex[tidy + tidx * blockDim.y] = 0; if(listIndex[tidy + tidx * blockDim.y] < numPerList) { int src_index = tidy + tidx * numData + listIndex[tidy + tidx * blockDim.y]*blockDim.y; int batch_index = tidy + listIndex[tidy + tidx * blockDim.y]*blockDim.y; if(batch_index < numData) { recordVal[tidy + tidx * blockDim.y] = data0[src_index]; recordSrcIndex[tidy + tidx * blockDim.y] = src_index; } else{ unsigned int *temp = (unsigned int *)&recordVal[tidy + tidx * blockDim.y]; *temp = 0xffffffff; } }else{ unsigned int *temp = (unsigned int *)&recordVal[tidy + tidx * blockDim.y]; *temp = 0xffffffff; } __syncthreads(); int tidMax = blockDim.y >> 1; while (tidMax!=0) { if(tidy < tidMax) { unsigned int* temp1 = (unsigned int*)&recordVal[tidy + tidx * blockDim.y]; unsigned int *temp2 = (unsigned int*)&recordVal[tidy + tidx * blockDim.y + tidMax]; if(*temp2 < *temp1) { recordVal[tidy + tidx * blockDim.y] = recordVal[tidy + tidx * blockDim.y + tidMax]; recordTid[tidy + tidx * blockDim.y] = recordTid[tidy + tidx * blockDim.y + tidMax]; recordSrcIndex[tidy + tidx * blockDim.y] = recordSrcIndex[tidy + tidx * blockDim.y + tidMax]; } } tidMax = tidMax >> 1; __syncthreads(); } if(tidy==0){ listIndex[recordTid[tidx * blockDim.y]]++; data1[i + tidx * numData] = recordVal[tidx * blockDim.y]; index[i + tidx * numData] = recordSrcIndex[tidx * blockDim.y]%numData; } __syncthreads(); } return; } __global__ void radixSortGpu(float* src_data, float* dst_data, int* index,int batch, int dataLen){ int tidx = threadIdx.x; int tidy = threadIdx.y; preprocess_float(src_data, batch,dataLen, tidx,tidy); __syncthreads(); radixKernel(src_data,dst_data,tidx,tidy,batch,dataLen); __syncthreads(); mergeKernel(src_data,dst_data,index,tidx,tidy,batch,dataLen); __syncthreads(); Aeprocess_float(dst_data, batch,dataLen, tidx,tidy); return; } int main(){ using namespace std; int batch=4; int inputsLen = 40; int numBolcks=128; float* inputs; float* outputs; int* index; srand(100); cudaMallocManaged(&inputs,sizeof(float)*inputsLen*batch); cudaMallocManaged(&outputs,sizeof(float)*inputsLen*batch); cudaMallocManaged(&index,sizeof(int)*inputsLen*batch); cout<<"input rand :"<<endl; for(int j=0;j<batch;j++){ for(int i=0;i<inputsLen;i++){ inputs[i+j*inputsLen] = (float)rand()/(float)RAND_MAX; index[i+j*inputsLen] = i; cout<<inputs[i+j*inputsLen]<<", "; } cout<<" "<<endl; } cout<<" "<<endl; const dim3 blockSize(batch,numBolcks); const dim3 girdSize(1,1); radixSortGpu<<<girdSize,blockSize,4*batch*numBolcks*sizeof(float)>>>(inputs,outputs,index,batch,inputsLen); cudaDeviceSynchronize(); cout<<"output :"<<endl; for(int j=0;j<batch;j++){ for(int i=0;i<inputsLen;i++){ cout<<outputs[i+j*inputsLen]<<", "; } cout<<" "<<endl; } cout<<" "<<endl; cout<<"index :"<<endl; for(int j=0;j<batch;j++){ for(int i=0;i<inputsLen;i++){ cout<<index[i+j*inputsLen]<<", "; } cout<<" "<<endl; } cout<<" "<<endl; return 0; }
编译执行代码,可以得到结果:
input rand : 0.315598, 0.284943, 0.240601, 0.484127, 0.375793, 0.0537027, 0.570274, 0.970005, 0.515422, 0.429529, 0.408115, 0.150135, 0.586551, 0.631635, 0.61386, 0.411339, 0.107092, 0.871626, 0.264386, 0.621543, 0.670743, 0.358033, 0.208356, 0.534175, 0.384512, 0.844556, 0.883552, 0.461531, 0.650512, 0.772418, 0.496347, 0.96611, 0.0573612, 0.736949, 0.450236, 0.433154, 0.790652, 0.0205103, 0.403159, 0.306074, 0.450039, 0.811274, 0.456208, 0.0365907, 0.442909, 0.0700681, 0.44793, 0.550001, 0.941694, 0.712316, 0.171544, 0.612436, 0.0703487, 0.3799, 0.146612, 0.45486, 0.224456, 0.0301636, 0.916391, 0.874968, 0.802581, 0.412738, 0.841078, 0.859943, 0.149687, 0.291314, 0.293097, 0.940339, 0.311825, 0.696256, 0.246413, 0.761864, 0.50753, 0.702621, 0.798455, 0.950439, 0.772689, 0.246385, 0.50044, 0.714383, 0.9587, 0.671984, 0.326819, 0.0290491, 0.0518843, 0.473431, 0.483909, 0.27634, 0.503595, 0.4003, 0.151308, 0.306176, 0.813039, 0.992386, 0.166119, 0.962726, 0.2837, 0.459215, 0.903065, 0.595525, 0.155472, 0.149477, 0.357389, 0.663002, 0.852098, 0.155843, 0.613441, 0.624787, 0.402228, 0.113881, 0.33917, 0.360928, 0.785866, 0.665989, 0.389977, 0.83775, 0.13942, 0.873886, 0.11409, 0.643015, 0.274187, 0.265398, 0.949191, 0.0872253, 0.257784, 0.115309, 0.0499512, 0.541484, 0.574525, 0.953016, 0.137009, 0.729996, 0.102493, 0.494398, 0.392998, 0.954591, 0.650241, 0.00643936, 0.579378, 0.0524684, 0.120321, 0.918549, 0.413396, 0.906187, 0.584538, 0.803373, 0.743937, 0.723958, 0.67726, 0.858027, 0.366973, 0.951447, 0.123425, 0.316164, 0.0386718, 0.38121, 0.431473, 0.0886231, 0.922694, 0.00599772, output : 0.0205103, 0.0537027, 0.0573612, 0.107092, 0.150135, 0.208356, 0.240601, 0.264386, 0.284943, 0.306074, 0.315598, 0.358033, 0.375793, 0.384512, 0.403159, 0.408115, 0.411339, 0.429529, 0.433154, 0.450236, 0.461531, 0.484127, 0.496347, 0.515422, 0.534175, 0.570274, 0.586551, 0.61386, 0.621543, 0.631635, 0.650512, 0.670743, 0.736949, 0.772418, 0.790652, 0.844556, 0.871626, 0.883552, 0.96611, 0.970005, 0.0301636, 0.0365907, 0.0700681, 0.0703487, 0.146612, 0.149687, 0.171544, 0.224456, 0.246385, 0.246413, 0.291314, 0.293097, 0.311825, 0.3799, 0.412738, 0.442909, 0.44793, 0.450039, 0.45486, 0.456208, 0.50044, 0.50753, 0.550001, 0.612436, 0.696256, 0.702621, 0.712316, 0.714383, 0.761864, 0.772689, 0.798455, 0.802581, 0.811274, 0.841078, 0.859943, 0.874968, 0.916391, 0.940339, 0.941694, 0.950439, 0.0290491, 0.0518843, 0.113881, 0.11409, 0.13942, 0.149477, 0.151308, 0.155472, 0.155843, 0.166119, 0.27634, 0.2837, 0.306176, 0.326819, 0.33917, 0.357389, 0.360928, 0.389977, 0.4003, 0.402228, 0.459215, 0.473431, 0.483909, 0.503595, 0.595525, 0.613441, 0.624787, 0.643015, 0.663002, 0.665989, 0.671984, 0.785866, 0.813039, 0.83775, 0.852098, 0.873886, 0.903065, 0.9587, 0.962726, 0.992386, 0.00599772, 0.00643936, 0.0386718, 0.0499512, 0.0524684, 0.0872253, 0.0886231, 0.102493, 0.115309, 0.120321, 0.123425, 0.137009, 0.257784, 0.265398, 0.274187, 0.316164, 0.366973, 0.38121, 0.392998, 0.413396, 0.431473, 0.494398, 0.541484, 0.574525, 0.579378, 0.584538, 0.650241, 0.67726, 0.723958, 0.729996, 0.743937, 0.803373, 0.858027, 0.906187, 0.918549, 0.922694, 0.949191, 0.951447, 0.953016, 0.954591, index : 37, 5, 32, 16, 11, 22, 2, 18, 1, 39, 0, 21, 4, 24, 38, 10, 15, 9, 35, 34, 27, 3, 30, 8, 23, 6, 12, 14, 19, 13, 28, 20, 33, 29, 36, 25, 17, 26, 31, 7, 17, 3, 5, 12, 14, 24, 10, 16, 37, 30, 25, 26, 28, 13, 21, 4, 6, 0, 15, 2, 38, 32, 7, 11, 29, 33, 9, 39, 31, 36, 34, 20, 1, 22, 23, 19, 18, 27, 8, 35, 3, 4, 29, 38, 36, 21, 10, 20, 25, 14, 7, 16, 11, 2, 30, 22, 31, 34, 9, 28, 17, 5, 6, 8, 19, 26, 27, 39, 23, 33, 1, 32, 12, 35, 24, 37, 18, 0, 15, 13, 39, 17, 34, 6, 19, 3, 37, 12, 5, 20, 32, 10, 4, 1, 0, 33, 30, 35, 14, 22, 36, 13, 7, 8, 18, 24, 16, 28, 27, 11, 26, 25, 29, 23, 21, 38, 2, 31, 9, 15,
对比numpy中的结果,确信结果无误。
或者使用nvidia官方的cub库也可以实现同样的效果,具体代码如下:
#include<cuda_runtime.h> #include<iostream> #include<cub/cub.cuh> int main(){ using namespace std; int batch=4; int inputsLen = 40; int* d_offset; float* inputs; float* outputs; int* index; int* outIndex; srand(100); cudaMallocManaged(&inputs,sizeof(float)*inputsLen*batch); cudaMallocManaged(&outputs,sizeof(float)*inputsLen*batch); cudaMallocManaged(&index,sizeof(int)*inputsLen*batch); cudaMallocManaged(&outIndex,sizeof(int)*inputsLen*batch); cudaMallocManaged(&d_offset,sizeof(int)*(batch+1)); cout<<"input rand :"<<endl; d_offset[0] = 0; for(int j=0;j<batch;j++){ for(int i=0;i<inputsLen;i++){ inputs[i+j*inputsLen] = (float)rand()/(float)RAND_MAX; index[i+j*inputsLen] = i; cout<<inputs[i+j*inputsLen]<<", "; } d_offset[j+1] = inputsLen*(j+1); // 1 2 3 4 --> 0 3 cout<<" "<<endl; } cout<<" "<<endl; size_t temp_storage_bytes = 0; void *d_temp_storage = NULL; cub::DeviceSegmentedRadixSort::SortPairs( d_temp_storage, temp_storage_bytes, inputs, outputs, index, outIndex, batch * inputsLen, batch, d_offset, d_offset + 1); cudaMalloc(&d_temp_storage, temp_storage_bytes); cub::DeviceSegmentedRadixSort::SortPairs( d_temp_storage, temp_storage_bytes, inputs, outputs, index, outIndex, batch * inputsLen, batch, d_offset,d_offset + 1); cudaDeviceSynchronize(); cout<<"output :"<<endl; for(int j=0;j<batch;j++){ for(int i=0;i<inputsLen;i++){ cout<<outputs[i+j*inputsLen]<<", "; } cout<<" "<<endl; } cout<<" "<<endl; cout<<"index :"<<endl; for(int j=0;j<batch;j++){ for(int i=0;i<inputsLen;i++){ cout<<outIndex[i+j*inputsLen]<<", "; } cout<<" "<<endl; } cout<<" "<<endl; return 0; }
实现了一个基于cuda加速多batch的双线性插值resize,稍加改造可以用于视频流模型输入时的resize方法,具体代码如下:
#include<iostream> #include<cuda_runtime.h> #include<opencv2/opencv.hpp> #include<getopt.h> #include<string.h> #include<vector> #include<fstream> using namespace std; using namespace cv; __device__ uchar3 getValues(uchar3* input,int x,int y,int b, int H,int W){ if (x<0 || x>W || y<0 || y>H) return make_uchar3(0,0,0); return input[b*H*W + y*W + x]; } __global__ void bilinearKernel( uchar3*input, uchar3*output, int oriH, int oriW, int outH, int outW, int batch, float scaleX, float scaleY, int shiftX,int shiftY ){ int b = threadIdx.x + blockDim.x * blockIdx.x; int outXY = threadIdx.y + blockDim.y * blockIdx.y; int outX = outXY % outW; int outY = outXY / outW; if (outX>=outW || outY>=outH || b>=batch) return; float srcX = (outX - shiftX + 0.5) * scaleX -0.5; float srcY = (outY - shiftY + 0.5) * scaleY -0.5; int minSrcX = (int)srcX; int minSrcY = (int)srcY; int maxSrcX = (int)srcX + 1; int maxSrcY = (int)srcY + 1; float w1 = (srcX-minSrcX) * ( srcY-minSrcY); float w2 = (maxSrcX-srcX) * ( srcY-minSrcY); float w3 = (maxSrcX-srcX) * ( maxSrcY-srcY); float w4 = (srcX-minSrcX) * ( maxSrcY-srcY); uchar3 v1 = getValues(input,minSrcX,minSrcY,b,oriH,oriW); uchar3 v2 = getValues(input,maxSrcX,minSrcY,b,oriH,oriW); uchar3 v3 = getValues(input,maxSrcX,maxSrcY,b,oriH,oriW); uchar3 v4 = getValues(input,minSrcX,maxSrcY,b,oriH,oriW); output[b*outW*outH + outY*outW + outX].x = (uchar)(w1 * (float)v1.x + w2 * (float)v2.x + w3 * (float)v3.x + w4 * (float)v4.x); output[b*outW*outH + outY*outW + outX].y = (uchar)(w1 * (float)v1.y + w2 * (float)v2.y + w3 * (float)v3.y + w4 * (float)v4.y); output[b*outW*outH + outY*outW + outX].z = (uchar)(w1 * (float)v1.z + w2 * (float)v2.z + w3 * (float)v3.z + w4 * (float)v4.z); return; } void stringSplit(string str, const const char split,vector<string>& res) { istringstream iss(str); // 输入流 string token; // 接收缓冲区 while (getline(iss, token, split)) // 以split为分隔符 { res.push_back(token); } } int main(int argc,char**argv){ int outH; int outW; int keepRatio = 0; int keepCenter = 0; string imgPath; string outPath; int opt=0,option_index = 0; static struct option opts[]= { {"outH",required_argument,nullptr,'h'},// 长选项名,required_argument 表明要跟参数,返回值是什么,返回值 {"outW",required_argument,nullptr,'w'}, {"keepRatio",no_argument,nullptr,'r'}, {"keepCenter",no_argument,nullptr,'c'}, {"imgPath",required_argument,nullptr,'i'}, {"outPath",required_argument,nullptr,'o'}, {0,0,0,0} }; while((opt=getopt_long_only(argc,argv,"h:w:i:o:rc",opts,&option_index))!=-1) { switch (opt) { case 'h':outH = atoi(optarg);break; case 'w':outW = atoi(optarg);break; case 'i':imgPath = string(optarg);break; case 'o':outPath = string(optarg);break; case 'r':keepRatio = 1;break; case 'c':keepCenter =1;break; default: break; } } if(imgPath.find(".jpg") != string::npos || imgPath.find(".png") != string::npos){ Mat img = imread(imgPath); int oriWidth = img.size().width; int oriHeight = img.size().height; uchar3* inputs; uchar3* outputs; cudaMallocManaged(&inputs,sizeof(uchar3)*oriWidth*oriHeight); cudaMallocManaged(&outputs,sizeof(uchar3)*outH*outW); float scaleX = (oriWidth*1.0f / outW); float scaleY = (oriHeight*1.0f / outH); float shiftX = 0.f ,shiftY = 0.f; if(keepRatio)scaleX = scaleY = scaleX > scaleY ? scaleX : scaleY; if(keepRatio && keepCenter){shiftX = (outW - oriWidth/scaleX)/2.f;shiftY = (outH - oriHeight/scaleY)/2.f;} cudaMemcpy(inputs,img.data,sizeof(uchar3)*oriHeight*oriWidth,cudaMemcpyHostToDevice); dim3 blockSize(1,512); dim3 gridSize(1,(outH*outW+512-1)/512); bilinearKernel<<<gridSize,blockSize>>>(inputs,outputs,oriHeight,oriWidth,outH,outW,1,scaleX,scaleY,shiftX,shiftY); Mat outImg(outH,outW,CV_8UC3,Scalar(0,0,0)); cudaMemcpy(outImg.data,outputs,sizeof(uchar3)*outH*outW,cudaMemcpyDeviceToHost); imwrite(outPath,outImg); } else if (imgPath.find(".txt") != string::npos) { cout<<"read image list "<<imgPath<<endl; ifstream inputImageNameList(imgPath); vector<string> fileNames; vector<Mat> imgs; auto dataptr = imgs.data(); int oriWidth = 0; int oriHeight = 0; if(!inputImageNameList.is_open()){ cout<<"can not read image list "<<imgPath<<endl; return 1; } string strLine; while (getline(inputImageNameList,strLine)){ Mat img = imread(strLine); oriWidth = img.size().width; oriHeight = img.size().height; imgs.push_back(img); vector<string> strList; string str2("This-is-a-test"); stringSplit(strLine, '/', strList); // 将子串存放到strList中 int lenStrList = strList.size(); fileNames.push_back(strList[lenStrList-1]); } inputImageNameList.close(); int batch = fileNames.size(); uchar3* inputs; uchar3* outputs; cudaMallocManaged(&inputs,sizeof(uchar3)*oriWidth*oriHeight*batch); cudaMallocManaged(&outputs,sizeof(uchar3)*outH*outW*batch); float scaleX = (oriWidth*1.0f / outW); float scaleY = (oriHeight*1.0f / outH); float shiftX = 0.f ,shiftY = 0.f; if(keepRatio)scaleX = scaleY = scaleX > scaleY ? scaleX : scaleY; if(keepRatio && keepCenter){shiftX = (outW - oriWidth/scaleX)/2.f;shiftY = (outH - oriHeight/scaleY)/2.f;} Mat outImg_x(oriHeight,oriWidth,CV_8UC3,Scalar(0,0,0)); for(int b=0;b<batch;++b){ cudaMemcpy(inputs+oriHeight*oriWidth*b,imgs[b].data,sizeof(uchar3)*oriHeight*oriWidth,cudaMemcpyHostToDevice); } dim3 blockSize(1,512); dim3 gridSize(batch,(outH*outW+512-1)/512); bilinearKernel<<<gridSize,blockSize>>>(inputs,outputs,oriHeight,oriWidth,outH,outW,batch,scaleX,scaleY,shiftX,shiftY); Mat outImg(outH,outW,CV_8UC3,Scalar(0,0,0)); for(int b=0;b<batch;++b){ cudaMemcpy(outImg.data,outputs+b*outH*outW,sizeof(uchar3)*outH*outW,cudaMemcpyDeviceToHost); imwrite(outPath+"result_"+fileNames[b],outImg); } } return; }
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。