CUDA实现多batch基数排序_cub::devicesegmentedradixsort:: 在哪里调用

作者：菜鸟追梦旅行 | 2024-03-18 08:10:46

踩

cub::devicesegmentedradixsort:: 在哪里调用

CUDA实现多batch基数排序

基数排序是一种迭代次数固定的排序算法：它从最低位到最高位逐位处理键值，每一轮按当前位把元素稳定地分成“0”和“1”两组。这里的GPU版本把每个batch的数据按线程划分为N个子序列，各线程并行地对自己的子序列做基数排序，随后通过并行归约（每轮取各子序列队首的最小值）归并得到排序后的数组。
这里实现了一版多batch的基数排序实例，并同时输出原数组的序号，基本实现了argsort的功能，代码如下所示：

#include<iostream>
#include<cuda_runtime.h>


// Rewrites, in place, the floats of batch row `tidx` as unsigned keys whose
// unsigned-integer order matches the numeric order of the original floats:
//   - sign bit set   (negative): all bits are inverted;
//   - sign bit clear (positive): only the sign bit is set.
//
// data    : batch rows of numData floats each, row-major (row r starts at r*numData).
// batch   : number of rows; threads with tidx >= batch do nothing.
// numData : number of elements per row.
// tidx    : row handled by the calling thread.
// tidy    : first element handled by the calling thread; it then steps by blockDim.y.
//
// The function contains no barrier, so returning early here is safe for callers
// that synchronise afterwards.
__device__ void preprocess_float(float* const data, int batch, int numData, int tidx,int tidy)
{
    // Valid rows are 0..batch-1. The previous `tidx > batch` test let
    // tidx == batch through, which indexes one full row past the buffer.
    if(tidx>=batch) return;
    // No separate tidy guard is needed: the loop condition already skips
    // threads whose first element lies beyond the row.
    for(int i = tidy;i<numData;i+=blockDim.y)
    {
        unsigned int *bits = (unsigned int *)(&data[i + tidx*numData]);
        const unsigned int value = *bits;
        *bits = (value >> 31) ? ~value : (value | 0x80000000u);
    }
}

// Undoes the key transform applied by preprocess_float on batch row `tidx`:
// a word whose top bit is set gets that bit cleared, any other word is
// bitwise inverted. The calling thread visits elements tidy, tidy+blockDim.y,
// ... of its row. `batch` is accepted for signature symmetry but not read.
__device__ void Aeprocess_float(float* const data, int batch, int numData, int tidx,int tidy)
{
    float* const row = data + tidx*numData;
    for(int pos = tidy; pos < numData; pos += blockDim.y)
    {
        unsigned int* word = (unsigned int *)(row + pos);
        const unsigned int key = *word;
        if(key & 0x80000000u)
        {
            // top bit set: the key came from a non-negative float
            *word = key & 0x7fffffff;
        }
        else
        {
            // top bit clear: the key came from a negative float
            *word = ~key;
        }
    }
}


// Per-thread least-significant-bit-first radix sort.
//
// Each thread owns the strided sub-list {tidy, tidy+blockDim.y, ...} of batch
// row `tidx` in data0 and sorts that sub-list in place by the 32-bit pattern
// of its elements (treated as unsigned keys). Threads never touch each
// other's slots, so no barrier is needed inside this function.
//
// data0   : keys to sort, batch rows of numData elements; sorted in place.
// data1   : scratch buffer of the same shape; holds the "bit == 1" elements
//           of the current pass in the thread's own slots.
// tidx    : row handled by the calling thread.
// tidy    : first slot of the calling thread's sub-list.
// batch   : unused, kept for signature compatibility.
// numData : number of elements per row.
__device__ void radixKernel(float* data0,float* data1,int tidx,int tidy,int batch,int numData){
    for(int bit=0;bit<32;bit++){
        // Unsigned shift: `1 << 31` on a signed int is undefined behaviour.
        const unsigned int mask = 1u << bit;
        // Offsets (already multiplied by blockDim.y) of the next free slot in
        // the zero group (data0) and the one group (data1).
        unsigned int cnt0 = 0,cnt1 = 0;
        for(int i=tidy;i<numData;i+=blockDim.y){
            unsigned int *temp =(unsigned int *) &data0[i + tidx*numData];
            if(*temp&mask){
                data1[tidy+cnt1 + tidx*numData] = data0[i + tidx*numData];
                cnt1 += blockDim.y;
            }
            else{
                // In-place compaction is safe: the write slot never runs ahead
                // of the read slot.
                data0[tidy+cnt0+ tidx*numData] = data0[i+ tidx*numData];
                cnt0 += blockDim.y;
            }
        }
        // Append the one group after the zero group, preserving its order so
        // every pass stays stable.
        for(unsigned int j=0;j<cnt1;j+=blockDim.y){
            data0[j+cnt0+tidy+ tidx*numData] = data1[j+tidy+ tidx*numData];
        }
    }
    return;
}

// Merges the per-thread sub-lists of every batch row of data0 into data1 by
// repeatedly selecting the smallest "head" element among the threads of a row.
//
// data0   : input keys, batch rows of numData elements; thread tidy reads the
//           slots tidy, tidy+blockDim.y, ... of row tidx.
// data1   : output; element i of row tidx receives the i-th selected key.
// index   : output; element i of row tidx receives the selected key's position
//           inside its row of data0 (see the note at the write below).
// tidx    : row handled by the calling thread.
// tidy    : sub-list handled by the calling thread.
// batch   : number of rows; used only to lay out the shared arrays.
// numData : number of elements per row.
//
// Dynamic shared memory: four consecutive arrays addressed up to
// tidy + tidx*blockDim.y, each offset by batch*blockDim.y 4-byte words, so the
// launch has to provide at least 4*batch*blockDim.y words (assuming tidx < batch).
// Every thread of the block must call this function: it contains barriers.
__device__ void mergeKernel(float* data0,float* data1,int* index,int tidx,int tidy,int batch,int numData){
    // Upper bound on the number of elements in one thread's sub-list.
    int numPerList = ceil((float)numData / blockDim.y);
    extern __shared__ int listIndexrecordValrecordTid[]; 
    int* listIndex = (int*)listIndexrecordValrecordTid; // per-thread cursor: how many elements of this sub-list were already emitted
    float* recordVal = (float*)listIndexrecordValrecordTid + batch*blockDim.y;  // candidate value each thread contributes to the comparison
    int * recordTid = (int*)listIndexrecordValrecordTid + 2*batch*blockDim.y;  // shared-array slot (thread) the candidate came from
    int* recordSrcIndex = (int*)listIndexrecordValrecordTid + 3*batch*blockDim.y;  // position in data0 the candidate was read from
    listIndex[tidy + tidx * blockDim.y] = 0;
    recordVal[tidy + tidx * blockDim.y] = 0;
    recordTid[tidy + tidx * blockDim.y] = tidy + tidx * blockDim.y;
    recordSrcIndex[tidy + tidx * blockDim.y] = 0;
    __syncthreads();

    // One output element per iteration.
    for(int i=0;i<numData;i++){
        // Reset this thread's candidate slot before loading the current head.
        recordVal[tidy + tidx * blockDim.y] = 0;
        recordTid[tidy + tidx * blockDim.y] = tidy + tidx * blockDim.y;
        recordSrcIndex[tidy + tidx * blockDim.y] = 0; 
        if(listIndex[tidy + tidx * blockDim.y] < numPerList)
        {
            int src_index = tidy + tidx * numData + listIndex[tidy + tidx * blockDim.y]*blockDim.y;
            int batch_index = tidy + listIndex[tidy + tidx * blockDim.y]*blockDim.y;
            if(batch_index < numData)
            {
                recordVal[tidy + tidx * blockDim.y] = data0[src_index];
                recordSrcIndex[tidy + tidx * blockDim.y] = src_index;
            }
            else{
                // Sub-list exhausted: contribute the largest unsigned bit
                // pattern so this slot loses every comparison below.
                unsigned int *temp = (unsigned int *)&recordVal[tidy + tidx * blockDim.y];
                *temp = 0xffffffff;
            }
        }else{
                // Same sentinel for a cursor that ran past the sub-list bound.
                unsigned int *temp = (unsigned int *)&recordVal[tidy + tidx * blockDim.y];
                *temp = 0xffffffff;
        }

        __syncthreads();
        // Halving min-reduction over the blockDim.y candidates of this row;
        // values are compared as unsigned bit patterns.
        // NOTE(review): halving from blockDim.y >> 1 appears to assume
        // blockDim.y is a power of two — confirm against the launch config.
        int tidMax = blockDim.y >> 1;
        while (tidMax!=0)
        {
            if(tidy < tidMax)
            {
                unsigned int* temp1 = (unsigned int*)&recordVal[tidy + tidx * blockDim.y];
                unsigned int *temp2 = (unsigned int*)&recordVal[tidy + tidx * blockDim.y + tidMax];
                // Strict `<` keeps the lower slot on ties.
                if(*temp2 < *temp1)
                {
                    recordVal[tidy + tidx * blockDim.y] = recordVal[tidy + tidx * blockDim.y + tidMax];
                    recordTid[tidy + tidx * blockDim.y] = recordTid[tidy + tidx * blockDim.y + tidMax];
                    recordSrcIndex[tidy + tidx * blockDim.y] = recordSrcIndex[tidy + tidx * blockDim.y + tidMax];
                }
            }
            tidMax = tidMax >> 1;
            __syncthreads();

        }
        // Slot 0 of the row now holds the winner: advance the winning
        // sub-list's cursor and emit the value and its position.
        if(tidy==0){
            listIndex[recordTid[tidx * blockDim.y]]++;
            data1[i + tidx * numData] = recordVal[tidx * blockDim.y];
            // NOTE(review): this is the position within the row of data0 as
            // read here. If the caller reordered data0 beforehand (e.g. sorted
            // each sub-list with more than one element), it is not
            // necessarily the element's original input position — verify.
            index[i + tidx * numData] = recordSrcIndex[tidx * blockDim.y]%numData;
        }
        __syncthreads();
        
    }
    return;
}

// Sorts every batch row of src_data in ascending order.
//
// Pipeline (each stage separated by a block-wide barrier):
//   1. preprocess_float : rewrite floats as order-preserving unsigned keys;
//   2. radixKernel      : each thread sorts its own strided sub-list;
//   3. mergeKernel      : merge the sub-lists into dst_data and fill index;
//   4. Aeprocess_float  : turn the keys in dst_data back into floats.
//
// Only threadIdx is used for addressing (blockIdx is never read): threadIdx.x
// selects the batch row and threadIdx.y the sub-list within the row.
// src_data is overwritten; it is left holding the intermediate keys.
__global__ void radixSortGpu(float* src_data, float* dst_data, int* index,int batch, int dataLen){
    const int row = threadIdx.x;
    const int lane = threadIdx.y;

    preprocess_float(src_data, batch, dataLen, row, lane);
    __syncthreads();

    radixKernel(src_data, dst_data, row, lane, batch, dataLen);
    __syncthreads();

    mergeKernel(src_data, dst_data, index, row, lane, batch, dataLen);
    __syncthreads();

    Aeprocess_float(dst_data, batch, dataLen, row, lane);
}


// Demo driver: fills `batch` rows of `inputsLen` pseudo-random floats, sorts
// every row with radixSortGpu and prints the inputs, the sorted values and the
// index array written by the kernel.
// Returns 0 on success and 1 if any CUDA call or the kernel fails.
int main(){
    using namespace std;
    const int batch=4;
    const int inputsLen = 40;
    const int threadsPerRow=128;   // blockDim.y: threads cooperating on one row
    const size_t total = (size_t)batch*inputsLen;
    float* inputs = NULL;
    float* outputs = NULL;
    int* index = NULL;

    // Reports a failed CUDA call; returns true when the call succeeded.
    auto cudaOk = [](cudaError_t err, const char* what) -> bool {
        if(err == cudaSuccess) return true;
        cerr<<what<<" failed: "<<cudaGetErrorString(err)<<endl;
        return false;
    };
    // cudaFree accepts null pointers, so this is safe on every exit path.
    auto release = [&]() {
        cudaFree(inputs);
        cudaFree(outputs);
        cudaFree(index);
    };

    srand(100);
    if(!(cudaOk(cudaMallocManaged(&inputs,sizeof(float)*total),"cudaMallocManaged(inputs)")
      && cudaOk(cudaMallocManaged(&outputs,sizeof(float)*total),"cudaMallocManaged(outputs)")
      && cudaOk(cudaMallocManaged(&index,sizeof(int)*total),"cudaMallocManaged(index)"))){
        release();
        return 1;
    }

    cout<<"input rand :"<<endl;
    for(int j=0;j<batch;j++){
        for(int i=0;i<inputsLen;i++){
            inputs[i+j*inputsLen] = (float)rand()/(float)RAND_MAX;
            index[i+j*inputsLen] = i;
            cout<<inputs[i+j*inputsLen]<<", ";
        }
        cout<<" "<<endl;
    }
    cout<<" "<<endl;

    // One block: x selects the row, y the sub-list inside the row.
    const dim3 blockSize(batch,threadsPerRow);
    const dim3 girdSize(1,1);
    // mergeKernel needs four shared arrays of batch*blockDim.y 4-byte words.
    const size_t sharedBytes = 4*batch*threadsPerRow*sizeof(float);
    radixSortGpu<<<girdSize,blockSize,sharedBytes>>>(inputs,outputs,index,batch,inputsLen);
    // A bad launch configuration is only visible through cudaGetLastError;
    // faults inside the kernel surface at the synchronisation.
    if(!(cudaOk(cudaGetLastError(),"radixSortGpu launch")
      && cudaOk(cudaDeviceSynchronize(),"radixSortGpu execution"))){
        release();
        return 1;
    }

    cout<<"output :"<<endl;
    for(int j=0;j<batch;j++){
        for(int i=0;i<inputsLen;i++){
            cout<<outputs[i+j*inputsLen]<<", ";
        }
        cout<<" "<<endl;
    }
    cout<<" "<<endl;

    cout<<"index :"<<endl;
    for(int j=0;j<batch;j++){
        for(int i=0;i<inputsLen;i++){
            cout<<index[i+j*inputsLen]<<", ";
        }
        cout<<" "<<endl;
    }
    cout<<" "<<endl;

    release();
    return 0;
}

编译执行代码，可以得到结果：

input rand :
0.315598, 0.284943, 0.240601, 0.484127, 0.375793, 0.0537027, 0.570274, 0.970005, 0.515422, 0.429529, 0.408115, 0.150135, 0.586551, 0.631635, 0.61386, 0.411339, 0.107092, 0.871626, 0.264386, 0.621543, 0.670743, 0.358033, 0.208356, 0.534175, 0.384512, 0.844556, 0.883552, 0.461531, 0.650512, 0.772418, 0.496347, 0.96611, 0.0573612, 0.736949, 0.450236, 0.433154, 0.790652, 0.0205103, 0.403159, 0.306074,  
0.450039, 0.811274, 0.456208, 0.0365907, 0.442909, 0.0700681, 0.44793, 0.550001, 0.941694, 0.712316, 0.171544, 0.612436, 0.0703487, 0.3799, 0.146612, 0.45486, 0.224456, 0.0301636, 0.916391, 0.874968, 0.802581, 0.412738, 0.841078, 0.859943, 0.149687, 0.291314, 0.293097, 0.940339, 0.311825, 0.696256, 0.246413, 0.761864, 0.50753, 0.702621, 0.798455, 0.950439, 0.772689, 0.246385, 0.50044, 0.714383,  
0.9587, 0.671984, 0.326819, 0.0290491, 0.0518843, 0.473431, 0.483909, 0.27634, 0.503595, 0.4003, 0.151308, 0.306176, 0.813039, 0.992386, 0.166119, 0.962726, 0.2837, 0.459215, 0.903065, 0.595525, 0.155472, 0.149477, 0.357389, 0.663002, 0.852098, 0.155843, 0.613441, 0.624787, 0.402228, 0.113881, 0.33917, 0.360928, 0.785866, 0.665989, 0.389977, 0.83775, 0.13942, 0.873886, 0.11409, 0.643015,  
0.274187, 0.265398, 0.949191, 0.0872253, 0.257784, 0.115309, 0.0499512, 0.541484, 0.574525, 0.953016, 0.137009, 0.729996, 0.102493, 0.494398, 0.392998, 0.954591, 0.650241, 0.00643936, 0.579378, 0.0524684, 0.120321, 0.918549, 0.413396, 0.906187, 0.584538, 0.803373, 0.743937, 0.723958, 0.67726, 0.858027, 0.366973, 0.951447, 0.123425, 0.316164, 0.0386718, 0.38121, 0.431473, 0.0886231, 0.922694, 0.00599772,  
 
output :
0.0205103, 0.0537027, 0.0573612, 0.107092, 0.150135, 0.208356, 0.240601, 0.264386, 0.284943, 0.306074, 0.315598, 0.358033, 0.375793, 0.384512, 0.403159, 0.408115, 0.411339, 0.429529, 0.433154, 0.450236, 0.461531, 0.484127, 0.496347, 0.515422, 0.534175, 0.570274, 0.586551, 0.61386, 0.621543, 0.631635, 0.650512, 0.670743, 0.736949, 0.772418, 0.790652, 0.844556, 0.871626, 0.883552, 0.96611, 0.970005,  
0.0301636, 0.0365907, 0.0700681, 0.0703487, 0.146612, 0.149687, 0.171544, 0.224456, 0.246385, 0.246413, 0.291314, 0.293097, 0.311825, 0.3799, 0.412738, 0.442909, 0.44793, 0.450039, 0.45486, 0.456208, 0.50044, 0.50753, 0.550001, 0.612436, 0.696256, 0.702621, 0.712316, 0.714383, 0.761864, 0.772689, 0.798455, 0.802581, 0.811274, 0.841078, 0.859943, 0.874968, 0.916391, 0.940339, 0.941694, 0.950439,  
0.0290491, 0.0518843, 0.113881, 0.11409, 0.13942, 0.149477, 0.151308, 0.155472, 0.155843, 0.166119, 0.27634, 0.2837, 0.306176, 0.326819, 0.33917, 0.357389, 0.360928, 0.389977, 0.4003, 0.402228, 0.459215, 0.473431, 0.483909, 0.503595, 0.595525, 0.613441, 0.624787, 0.643015, 0.663002, 0.665989, 0.671984, 0.785866, 0.813039, 0.83775, 0.852098, 0.873886, 0.903065, 0.9587, 0.962726, 0.992386,  
0.00599772, 0.00643936, 0.0386718, 0.0499512, 0.0524684, 0.0872253, 0.0886231, 0.102493, 0.115309, 0.120321, 0.123425, 0.137009, 0.257784, 0.265398, 0.274187, 0.316164, 0.366973, 0.38121, 0.392998, 0.413396, 0.431473, 0.494398, 0.541484, 0.574525, 0.579378, 0.584538, 0.650241, 0.67726, 0.723958, 0.729996, 0.743937, 0.803373, 0.858027, 0.906187, 0.918549, 0.922694, 0.949191, 0.951447, 0.953016, 0.954591,  
 
index :
37, 5, 32, 16, 11, 22, 2, 18, 1, 39, 0, 21, 4, 24, 38, 10, 15, 9, 35, 34, 27, 3, 30, 8, 23, 6, 12, 14, 19, 13, 28, 20, 33, 29, 36, 25, 17, 26, 31, 7,  
17, 3, 5, 12, 14, 24, 10, 16, 37, 30, 25, 26, 28, 13, 21, 4, 6, 0, 15, 2, 38, 32, 7, 11, 29, 33, 9, 39, 31, 36, 34, 20, 1, 22, 23, 19, 18, 27, 8, 35,  
3, 4, 29, 38, 36, 21, 10, 20, 25, 14, 7, 16, 11, 2, 30, 22, 31, 34, 9, 28, 17, 5, 6, 8, 19, 26, 27, 39, 23, 33, 1, 32, 12, 35, 24, 37, 18, 0, 15, 13,  
39, 17, 34, 6, 19, 3, 37, 12, 5, 20, 32, 10, 4, 1, 0, 33, 30, 35, 14, 22, 36, 13, 7, 8, 18, 24, 16, 28, 27, 11, 26, 25, 29, 23, 21, 38, 2, 31, 9, 15,  

对比numpy中的结果，确信结果无误。

或者使用nvidia官方的cub库也可以实现同样的效果，具体代码如下：

#include<cuda_runtime.h>
#include<iostream>
#include<cub/cub.cuh>


// Demo driver: sorts `batch` independent segments of `inputsLen` floats with
// cub::DeviceSegmentedRadixSort::SortPairs, carrying each element's position
// within its segment as the value, then prints inputs, sorted keys and the
// permuted positions.
// Returns 0 on success and 1 if any CUDA/CUB call fails.
int main(){
    using namespace std;
    const int batch=4;
    const int inputsLen = 40;
    const int total = batch*inputsLen;
    int* d_offset = NULL;
    float* inputs = NULL;
    float* outputs = NULL;
    int* index = NULL;
    int* outIndex = NULL;
    size_t  temp_storage_bytes  = 0;
    void    *d_temp_storage     = NULL;

    // Reports a failed CUDA/CUB call; returns true when the call succeeded.
    auto cudaOk = [](cudaError_t err, const char* what) -> bool {
        if(err == cudaSuccess) return true;
        cerr<<what<<" failed: "<<cudaGetErrorString(err)<<endl;
        return false;
    };
    // cudaFree accepts null pointers, so this is safe on every exit path.
    auto release = [&]() {
        cudaFree(d_temp_storage);
        cudaFree(inputs);
        cudaFree(outputs);
        cudaFree(index);
        cudaFree(outIndex);
        cudaFree(d_offset);
    };

    srand(100);
    if(!(cudaOk(cudaMallocManaged(&inputs,sizeof(float)*total),"cudaMallocManaged(inputs)")
      && cudaOk(cudaMallocManaged(&outputs,sizeof(float)*total),"cudaMallocManaged(outputs)")
      && cudaOk(cudaMallocManaged(&index,sizeof(int)*total),"cudaMallocManaged(index)")
      && cudaOk(cudaMallocManaged(&outIndex,sizeof(int)*total),"cudaMallocManaged(outIndex)")
      && cudaOk(cudaMallocManaged(&d_offset,sizeof(int)*(batch+1)),"cudaMallocManaged(d_offset)"))){
        release();
        return 1;
    }

    cout<<"input rand :"<<endl;
    d_offset[0] = 0;
    for(int j=0;j<batch;j++){
        for(int i=0;i<inputsLen;i++){
            inputs[i+j*inputsLen] = (float)rand()/(float)RAND_MAX;
            index[i+j*inputsLen] = i;
            cout<<inputs[i+j*inputsLen]<<", ";
        }
        // d_offset holds batch+1 boundaries: segment j spans
        // [d_offset[j], d_offset[j+1]). The same array serves as the begin
        // offsets (d_offset) and the end offsets (d_offset + 1).
        d_offset[j+1] = inputsLen*(j+1);
        cout<<" "<<endl;
    }
    cout<<" "<<endl;

    // First call with a null workspace only reports the required size.
    if(!cudaOk(cub::DeviceSegmentedRadixSort::SortPairs(
            d_temp_storage, temp_storage_bytes,
            inputs, outputs,
            index, outIndex,
            total, batch,
            d_offset, d_offset + 1), "SortPairs (workspace query)")
       || !cudaOk(cudaMalloc(&d_temp_storage, temp_storage_bytes),"cudaMalloc(d_temp_storage)")
       // Second call performs the sort.
       || !cudaOk(cub::DeviceSegmentedRadixSort::SortPairs(
            d_temp_storage, temp_storage_bytes,
            inputs, outputs,
            index, outIndex,
            total, batch,
            d_offset, d_offset + 1), "SortPairs")
       // The sort is asynchronous; wait before reading managed memory on the host.
       || !cudaOk(cudaDeviceSynchronize(),"cudaDeviceSynchronize")){
        release();
        return 1;
    }

    cout<<"output :"<<endl;
    for(int j=0;j<batch;j++){
        for(int i=0;i<inputsLen;i++){
            cout<<outputs[i+j*inputsLen]<<", ";
        }
        cout<<" "<<endl;
    }
    cout<<" "<<endl;

    cout<<"index :"<<endl;
    for(int j=0;j<batch;j++){
        for(int i=0;i<inputsLen;i++){
            cout<<outIndex[i+j*inputsLen]<<", ";
        }
        cout<<" "<<endl;
    }
    cout<<" "<<endl;

    release();
    return 0;
}

更新：

实现了一个基于cuda加速多batch的双线性插值resize，稍加改造可以用于视频流模型输入时的resize方法，具体代码如下：

#include<cstdlib>
#include<fstream>
#include<iostream>
#include<sstream>
#include<string>
#include<vector>
#include<string.h>
#include<getopt.h>
#include<cuda_runtime.h>
#include<opencv2/opencv.hpp>

using namespace std;
using namespace cv;

// Reads pixel (x, y) of image `b` from a packed batch of H x W images stored
// row-major one after another. Coordinates outside the image yield black
// (0,0,0), i.e. zero padding.
//
// input : batch of images, image b starts at b*H*W.
// x, y  : column and row to read; valid ranges are [0, W) and [0, H).
// b     : image index inside the batch (not range-checked here).
// H, W  : image height and width in pixels.
__device__ uchar3 getValues(uchar3* input,int x,int y,int b, int H,int W){
    // `>=`: x == W or y == H is already one past the last column/row. The
    // previous `>` test read the neighbouring row, or past the buffer end.
    if (x<0 || x>=W || y<0 || y>=H) return make_uchar3(0,0,0);

    // size_t arithmetic keeps the flat index from overflowing int on large batches.
    return input[((size_t)b*H + y)*W + x];
}

// Batched bilinear resize of packed 3-channel 8-bit images.
//
// Thread mapping: the x dimension (threadIdx.x + blockDim.x*blockIdx.x)
// selects the image in the batch, the y dimension selects the flat output
// pixel outY*outW + outX. Surplus threads return immediately.
//
// input          : batch images of oriH x oriW pixels, stored back to back.
// output         : batch images of outH x outW pixels, stored back to back.
// scaleX, scaleY : source pixels per output pixel along x / y.
// shiftX, shiftY : offset, in output pixels, of the resized image inside the
//                  output canvas (used for centred letterboxing). Declared as
//                  float so fractional shifts computed by the host are no
//                  longer truncated; integer arguments still convert implicitly.
//
// Sampling uses pixel-centre alignment: src = (out - shift + 0.5)*scale - 0.5.
// Output pixels whose source position falls outside the source image
// footprint [-0.5, size-0.5] are written as black; inside the footprint the
// coordinate is clamped to the outermost pixel centres, so image borders are
// not blended with the padding.
__global__ void bilinearKernel( uchar3*input, uchar3*output, int oriH, int oriW, int outH, int outW, int batch, float scaleX, float scaleY, float shiftX,float shiftY ){

    int b = threadIdx.x + blockDim.x * blockIdx.x;
    int outXY = threadIdx.y + blockDim.y * blockIdx.y;
    // Guard before the division/modulo so a zero-sized output never divides by zero.
    if (b>=batch || outXY >= outH*outW) return;
    int outX = outXY % outW;
    int outY = outXY / outW;
    const size_t outIdx = ((size_t)b*outH + outY)*outW + outX;

    // Float literals: a bare 0.5 would promote the whole expression to double.
    float srcX = (outX - shiftX + 0.5f) * scaleX - 0.5f;
    float srcY = (outY - shiftY + 0.5f) * scaleY - 0.5f;

    // Outside the source footprint: padding area of the canvas.
    if (srcX < -0.5f || srcX > oriW - 0.5f || srcY < -0.5f || srcY > oriH - 0.5f){
        output[outIdx] = make_uchar3(0,0,0);
        return;
    }

    // Clamp to the pixel-centre range so both neighbours exist and the
    // fractional parts below stay within [0, 1]. (A plain (int) cast on a
    // negative coordinate rounds towards zero and produced negative weights.)
    srcX = fminf(fmaxf(srcX, 0.0f), (float)(oriW - 1));
    srcY = fminf(fmaxf(srcY, 0.0f), (float)(oriH - 1));

    int minSrcX = (int)srcX;
    int minSrcY = (int)srcY;

    int maxSrcX = min(minSrcX + 1, oriW - 1);
    int maxSrcY = min(minSrcY + 1, oriH - 1);

    float fx = srcX - minSrcX;   // horizontal distance from the left neighbour
    float fy = srcY - minSrcY;   // vertical distance from the top neighbour

    // Each weight belongs to the texel fetched into the same-numbered v below:
    // the closer the sample is to a texel, the larger that texel's weight.
    // (Previously every weight was paired with the diagonally opposite texel.)
    float w1 = (1.0f - fx) * (1.0f - fy);   // (minSrcX, minSrcY)
    float w2 = fx * (1.0f - fy);            // (maxSrcX, minSrcY)
    float w3 = fx * fy;                     // (maxSrcX, maxSrcY)
    float w4 = (1.0f - fx) * fy;            // (minSrcX, maxSrcY)

    uchar3 v1 = getValues(input,minSrcX,minSrcY,b,oriH,oriW);
    uchar3 v2 = getValues(input,maxSrcX,minSrcY,b,oriH,oriW);
    uchar3 v3 = getValues(input,maxSrcX,maxSrcY,b,oriH,oriW);
    uchar3 v4 = getValues(input,minSrcX,maxSrcY,b,oriH,oriW);

    // The weights are non-negative and sum to 1, so each channel stays within
    // [0, 255]; +0.5f rounds to nearest instead of truncating.
    uchar3 result;
    result.x = (uchar)(w1 * (float)v1.x + w2 * (float)v2.x + w3 * (float)v3.x + w4 * (float)v4.x + 0.5f);
    result.y = (uchar)(w1 * (float)v1.y + w2 * (float)v2.y + w3 * (float)v3.y + w4 * (float)v4.y + 0.5f);
    result.z = (uchar)(w1 * (float)v1.z + w2 * (float)v2.z + w3 * (float)v3.z + w4 * (float)v4.z + 0.5f);
    output[outIdx] = result;

    return;
}

// Splits `str` at every occurrence of `split` and appends the pieces to `res`
// in order. `res` is not cleared first. A leading delimiter or two adjacent
// delimiters produce an empty piece; a trailing delimiter does not add one.
//
// str   : text to split (taken by value).
// split : delimiter character.
// res   : receives the pieces, appended after any existing elements.
void stringSplit(std::string str, const char split, std::vector<std::string>& res)
{
	std::istringstream iss(str);	// stream over the input text
	std::string token;				// receives one piece per getline call
	while (std::getline(iss, token, split))	// read up to the next delimiter
	{
		res.push_back(token);
	}
}

// Prints a diagnostic and returns false when a CUDA runtime call failed.
static bool checkCuda(cudaError_t err, const char* what)
{
    if(err == cudaSuccess) return true;
    cerr<<what<<" failed: "<<cudaGetErrorString(err)<<endl;
    return false;
}

// Resizes every image in `imgs` to outH x outW with bilinearKernel.
//
// imgs       : non-empty list of 8-bit 3-channel images that all share one size.
// keepRatio  : non-zero to use the same scale on both axes (the larger of the
//              two, so the whole image fits into the output).
// keepCenter : with keepRatio, non-zero to centre the resized image in the output.
// results    : cleared, then filled with one outH x outW CV_8UC3 Mat per input.
//
// Returns false (after printing the reason) if the inputs are inconsistent or
// any CUDA call fails; device buffers are released on every path.
static bool resizeBatchOnGpu(const vector<Mat>& imgs, int outH, int outW,
                             int keepRatio, int keepCenter, vector<Mat>& results)
{
    results.clear();
    if(imgs.empty()){
        cerr<<"no input images to resize"<<endl;
        return false;
    }
    const int oriWidth = imgs[0].cols;
    const int oriHeight = imgs[0].rows;
    // The device buffer assumes one common size; a differing image would make
    // the copy below read or write out of bounds.
    for(size_t b=0;b<imgs.size();++b){
        if(imgs[b].empty() || imgs[b].type()!=CV_8UC3 || imgs[b].cols!=oriWidth || imgs[b].rows!=oriHeight){
            cerr<<"image "<<b<<" is empty, not 8-bit 3-channel, or differs in size from the first image"<<endl;
            return false;
        }
    }

    const int batch = (int)imgs.size();
    const size_t inPixels = (size_t)oriWidth*oriHeight;
    const size_t outPixels = (size_t)outW*outH;

    float scaleX = (oriWidth*1.0f / outW);
    float scaleY = (oriHeight*1.0f / outH);
    float shiftX = 0.f ,shiftY = 0.f;
    if(keepRatio)scaleX = scaleY = scaleX > scaleY ? scaleX : scaleY;
    if(keepRatio && keepCenter){shiftX = (outW - oriWidth/scaleX)/2.f;shiftY = (outH - oriHeight/scaleY)/2.f;}

    uchar3* inputs = nullptr;
    uchar3* outputs = nullptr;
    bool ok = checkCuda(cudaMallocManaged(&inputs,sizeof(uchar3)*inPixels*batch),"cudaMallocManaged(inputs)")
           && checkCuda(cudaMallocManaged(&outputs,sizeof(uchar3)*outPixels*batch),"cudaMallocManaged(outputs)");

    for(int b=0;ok && b<batch;++b){
        // cudaMemcpy needs one contiguous block; a Mat with row padding is copied first.
        Mat src = imgs[b].isContinuous() ? imgs[b] : imgs[b].clone();
        ok = checkCuda(cudaMemcpy(inputs+inPixels*b,src.data,sizeof(uchar3)*inPixels,cudaMemcpyHostToDevice),"cudaMemcpy(inputs)");
    }

    if(ok){
        // Grid x walks the batch, grid y walks the flat output pixels.
        dim3 blockSize(1,512);
        dim3 gridSize(batch,(unsigned int)((outPixels+512-1)/512));
        bilinearKernel<<<gridSize,blockSize>>>(inputs,outputs,oriHeight,oriWidth,outH,outW,batch,scaleX,scaleY,shiftX,shiftY);
        ok = checkCuda(cudaGetLastError(),"bilinearKernel launch")
          && checkCuda(cudaDeviceSynchronize(),"bilinearKernel execution");
    }

    for(int b=0;ok && b<batch;++b){
        Mat outImg(outH,outW,CV_8UC3,Scalar(0,0,0));
        ok = checkCuda(cudaMemcpy(outImg.data,outputs+outPixels*b,sizeof(uchar3)*outPixels,cudaMemcpyDeviceToHost),"cudaMemcpy(outputs)");
        if(ok) results.push_back(outImg);
    }

    // cudaFree accepts null pointers, so this is safe after a failed allocation.
    cudaFree(inputs);
    cudaFree(outputs);
    return ok;
}

// Command-line front end for the GPU bilinear resize.
//
// Options (long options, single dash also accepted by getopt_long_only):
//   --outH <n> --outW <n>   output size in pixels (required, positive)
//   --imgPath <path>        a .jpg/.png image, or a .txt file listing one image path per line
//   --outPath <path>        output file (single image) or output prefix (list mode:
//                           each result is written to outPath + "result_" + file name)
//   --keepRatio             preserve the aspect ratio
//   --keepCenter            with --keepRatio, centre the image in the output
//
// Returns 0 on success and 1 on any argument, I/O or CUDA error.
int main(int argc,char**argv){

    int outH = 0;
    int outW = 0;
    int keepRatio = 0;
    int keepCenter = 0;
    string imgPath;
    string outPath;

    int opt=0,option_index = 0;
    static struct option opts[]=
    {
        {"outH",required_argument,nullptr,'h'},// long name, whether an argument follows, flag pointer, value returned by getopt
        {"outW",required_argument,nullptr,'w'},
        {"keepRatio",no_argument,nullptr,'r'},
        {"keepCenter",no_argument,nullptr,'c'},
        {"imgPath",required_argument,nullptr,'i'},
        {"outPath",required_argument,nullptr,'o'},
        {0,0,0,0}
    };

    while((opt=getopt_long_only(argc,argv,"h:w:i:o:rc",opts,&option_index))!=-1)
    {
        switch (opt)
        {
        case 'h':outH = atoi(optarg);break;
        case 'w':outW = atoi(optarg);break;
        case 'i':imgPath = string(optarg);break;
        case 'o':outPath = string(optarg);break;
        case 'r':keepRatio = 1;break;
        case 'c':keepCenter =1;break;

        default:
            break;
        }
    }

    // outH/outW were previously read uninitialised when the options were missing.
    if(outH<=0 || outW<=0 || imgPath.empty() || outPath.empty()){
        cerr<<"usage: "<<argv[0]<<" --outH <h> --outW <w> --imgPath <image|list.txt> --outPath <path> [--keepRatio] [--keepCenter]"<<endl;
        return 1;
    }

    if(imgPath.find(".jpg") != string::npos || imgPath.find(".png") != string::npos){

        Mat img = imread(imgPath);
        if(img.empty()){
            cerr<<"can not read image "<<imgPath<<endl;
            return 1;
        }

        vector<Mat> results;
        if(!resizeBatchOnGpu(vector<Mat>(1,img),outH,outW,keepRatio,keepCenter,results)) return 1;

        if(!imwrite(outPath,results[0])){
            cerr<<"can not write image "<<outPath<<endl;
            return 1;
        }
    }
    else if (imgPath.find(".txt") != string::npos)
    {
        cout<<"read image list "<<imgPath<<endl;
        ifstream inputImageNameList(imgPath);
        if(!inputImageNameList.is_open()){
            cout<<"can not read image list "<<imgPath<<endl;
            return 1;
        }

        vector<string> fileNames;
        vector<Mat> imgs;
        string strLine;
        while (getline(inputImageNameList,strLine)){
            // Tolerate Windows line endings and blank lines in the list file.
            if(!strLine.empty() && strLine[strLine.size()-1]=='\r') strLine.erase(strLine.size()-1);
            if(strLine.empty()) continue;

            Mat img = imread(strLine);
            if(img.empty()){
                cerr<<"can not read image "<<strLine<<endl;
                return 1;
            }
            imgs.push_back(img);

            // Keep only the last path component as the output file name.
            vector<string> strList;
            stringSplit(strLine, '/', strList);
            fileNames.push_back(strList.empty() ? strLine : strList.back());
        }
        inputImageNameList.close();

        vector<Mat> results;
        if(!resizeBatchOnGpu(imgs,outH,outW,keepRatio,keepCenter,results)) return 1;

        for(size_t b=0;b<results.size();++b){
            const string target = outPath+"result_"+fileNames[b];
            if(!imwrite(target,results[b])){
                cerr<<"can not write image "<<target<<endl;
                return 1;
            }
        }
    }
    else
    {
        cerr<<"unsupported input "<<imgPath<<": expected a .jpg/.png image or a .txt list"<<endl;
        return 1;
    }
    return 0;

}

声明：本文内容由网友自发贡献，不代表【wpsshop博客】立场，版权归原作者所有，本站不承担相应法律责任。如您发现有侵权的内容，请联系我们。转载请注明出处：https://www.wpsshop.cn/w/菜鸟追梦旅行/article/detail/261588