// CUDA header file (declarations shared by the implementation below)
#include <cuda_runtime.h>

struct PointGpu
{
    // host (CPU) buffers
    float *host_X;
    float *host_Y;
    float *host_Z;
    // device (GPU) buffers
    float *device_X;
    float *device_Y;
    float *device_Z;
};

// Allocate memory: pinned host memory for the struct and the host buffers, device memory for the GPU buffers.
void initPointGPU(PointGpu **_HostPointGPU, int arraySize);

// Release memory.
void freePointGPU(PointGpu &_HostPointGPU);

// Core kernel.
__global__ void addPoint(PointGpu *_PointGPU, int arraySize);

// Test that exercises the struct above.
void PointGpuMethod_Test();

// Test 2: instead of a single struct pointer, build an array of structs and drive each element.
void PointGpuMethod_dim1();
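If the declarations above are kept in a standalone header, a minimal driver might look like the sketch below; the file names main.cu and point_gpu.cuh are assumptions for illustration, not part of the original post.

// main.cu (hypothetical file name): drives the two tests declared above.
#include "point_gpu.cuh"   // assumed header name for the declarations above

int main()
{
    PointGpuMethod_Test();   // single-struct test
    PointGpuMethod_dim1();   // array-of-struct-pointers test
    return 0;
}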
/// Implementation (.cu file): assumes the header above is included.
#include <cstring>    // memset, memcpy
#include <iostream>   // std::cout
#include <cuda_runtime.h>

void initPointGPU(PointGpu **_HostPointGPU, int arraySize)
{
    // Allocate pinned host memory for the struct itself. Keeping the struct in pinned host
    // memory makes it easy to wrap in a class later and copy each device buffer straight
    // into its matching host buffer.
    cudaMallocHost((void**)(_HostPointGPU), sizeof(PointGpu));
    // Pinned host buffers.
    cudaMallocHost((void**)&(*_HostPointGPU)->host_X, sizeof(float) * arraySize);
    cudaMallocHost((void**)&(*_HostPointGPU)->host_Y, sizeof(float) * arraySize);
    cudaMallocHost((void**)&(*_HostPointGPU)->host_Z, sizeof(float) * arraySize);
    // Device buffers.
    cudaMalloc((void**)&(*_HostPointGPU)->device_X, sizeof(float) * arraySize);
    cudaMalloc((void**)&(*_HostPointGPU)->device_Y, sizeof(float) * arraySize);
    cudaMalloc((void**)&(*_HostPointGPU)->device_Z, sizeof(float) * arraySize);
}

void freePointGPU(PointGpu &_HostPointGPU)
{
    cudaFreeHost(_HostPointGPU.host_X);
    cudaFreeHost(_HostPointGPU.host_Y);
    cudaFreeHost(_HostPointGPU.host_Z);
    cudaFree(_HostPointGPU.device_X);
    cudaFree(_HostPointGPU.device_Y);
    cudaFree(_HostPointGPU.device_Z);
    cudaFreeHost(&_HostPointGPU);
}

// Kernel: just assigns constants, no real computation.
__global__ void addPoint(PointGpu *_PointGPU, int arraySize)
{
    int tid = threadIdx.x;
    if (_PointGPU != nullptr && tid < arraySize)
    {
        _PointGPU->device_X[tid] = 9;
        _PointGPU->device_Y[tid] = 8;
        _PointGPU->device_Z[tid] = 10;
    }
}

// How to use it.
void PointGpuMethod_Test()
{
    // A single data set.
    const int arraySize = 10;
    float X[arraySize] = { 0, };
    float Y[arraySize] = { 0, };
    float Z[arraySize] = { 0, };
    memset(X, 0, sizeof(float) * arraySize);
    memset(Y, 0, sizeof(float) * arraySize);
    memset(Z, 0, sizeof(float) * arraySize);

    PointGpu* source = nullptr;
    initPointGPU(&source, arraySize);                   // Initialize the data.

    addPoint <<< 1, arraySize >>> (source, arraySize);  // Compute.
    cudaDeviceSynchronize();                            // Synchronize with the device
                                                        // (__syncthreads() inside the kernel only syncs threads within a block).

    // Download the data.
    cudaMemcpy(source->host_Z, source->device_Z, arraySize * sizeof(float), cudaMemcpyKind::cudaMemcpyDeviceToHost);
    cudaMemcpy(source->host_Y, source->device_Y, arraySize * sizeof(float), cudaMemcpyKind::cudaMemcpyDeviceToHost);
    cudaMemcpy(source->host_X, source->device_X, arraySize * sizeof(float), cudaMemcpyKind::cudaMemcpyDeviceToHost);

    memcpy(Z, source->host_Z, sizeof(float) * arraySize);
    memcpy(Y, source->host_Y, sizeof(float) * arraySize);
    memcpy(X, source->host_X, sizeof(float) * arraySize);

    // Output.
    std::cout << "Data downloaded from the GPU:" << std::endl;
    for (int i = 0; i < arraySize; i++)
    {
        std::cout << Z[i] << " " << X[i] << " " << Y[i] << " | ";
    }
    std::cout << std::endl;

    freePointGPU(*source);   // End of the single-array GPU download test.
}

void PointGpuMethod_dim1()
{
    const int arraySize = 10;
    float X[arraySize] = { 0, };
    float Y[arraySize] = { 0, };
    float Z[arraySize] = { 0, };
    memset(X, 0, sizeof(float) * arraySize);
    memset(Y, 0, sizeof(float) * arraySize);
    memset(Z, 0, sizeof(float) * arraySize);

    const int BUfferNumber = 10;
    PointGpu** source = new PointGpu*[BUfferNumber];    // Create an array of struct pointers.
    for (int i = 0; i < BUfferNumber; i++)
    {
        initPointGPU(source + i, 110000);               // Allocate buffers for each element.
    }

    // After initialization, assign values and read them back.
    for (int i = 0; i < BUfferNumber; i++)
    {
        addPoint <<< 1, 10 >>> (source[i], 1);
        cudaDeviceSynchronize();

        // Fetch the first value of each buffer and print it.
        float buffer[3][2] = { { 0 } };
        cudaMemcpy(buffer[0], source[i]->device_X, sizeof(float), cudaMemcpyKind::cudaMemcpyDeviceToHost);
        cudaMemcpy(buffer[1], source[i]->device_Y, sizeof(float), cudaMemcpyKind::cudaMemcpyDeviceToHost);
        cudaMemcpy(buffer[2], source[i]->device_Z, sizeof(float), cudaMemcpyKind::cudaMemcpyDeviceToHost);
        std::cout << buffer[0][0] << " " << buffer[1][0] << " " << buffer[2][0] << " " << std::endl;
    }

    // Release memory.
    for (int i = 0; i < BUfferNumber; i++)
    {
        freePointGPU(*source[i]);
    }
    delete[] source;
}
/// Tested as above: the contiguous GPU memory and its usage are stable.
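None of the CUDA API calls above check their return values, so allocation or copy failures go unnoticed. Below is a minimal sketch of an error-checking helper that could wrap each call; the macro name CUDA_CHECK is my own choice and not part of the original code.

// Hedged sketch: wrap CUDA runtime calls so failures are reported immediately.
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            std::fprintf(stderr, "CUDA error %s at %s:%d\n",                  \
                         cudaGetErrorString(err_), __FILE__, __LINE__);       \
            std::exit(EXIT_FAILURE);                                          \
        }                                                                     \
    } while (0)

// Example usage inside initPointGPU:
//   CUDA_CHECK(cudaMallocHost((void**)(_HostPointGPU), sizeof(PointGpu)));
//   CUDA_CHECK(cudaMalloc((void**)&(*_HostPointGPU)->device_X, sizeof(float) * arraySize));
// After a kernel launch:
//   addPoint<<<1, arraySize>>>(source, arraySize);
//   CUDA_CHECK(cudaGetLastError());
//   CUDA_CHECK(cudaDeviceSynchronize());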
With this, the CUDA custom-struct test succeeds. This is only the most basic version; everything else can be built by modifying it.
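As one example of such a modification, here is a hedged sketch that wraps the allocate/free pair in a small RAII class, so the buffers are released automatically; the class name PointGpuHolder is hypothetical and not from the original code.

// Hedged sketch: RAII wrapper around initPointGPU/freePointGPU, assuming the declarations above.
class PointGpuHolder
{
public:
    explicit PointGpuHolder(int arraySize) : ptr_(nullptr)
    {
        initPointGPU(&ptr_, arraySize);   // allocate pinned host + device buffers
    }
    ~PointGpuHolder()
    {
        if (ptr_ != nullptr)
        {
            freePointGPU(*ptr_);          // release everything, including the struct itself
        }
    }
    PointGpuHolder(const PointGpuHolder&) = delete;
    PointGpuHolder& operator=(const PointGpuHolder&) = delete;

    PointGpu* get() const { return ptr_; }

private:
    PointGpu* ptr_;
};

// Usage sketch:
//   PointGpuHolder holder(10);
//   addPoint<<<1, 10>>>(holder.get(), 10);
//   cudaDeviceSynchronize();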