赞
踩
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <iostream>

#include <cuda_runtime.h>

using namespace std;

// Abort with file/line context if a CUDA runtime call fails. Without this,
// an early failure (e.g. cudaMalloc) silently turns every later call into
// garbage results.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t cuda_check_err_ = (call);                              \
        if (cuda_check_err_ != cudaSuccess) {                              \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(cuda_check_err_));                  \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

// Element-wise addition of two int arrays on the device: res[i] = a[i] + b[i].
//
// Launch layout: 1-D grid of 1-D blocks, one thread per element. Threads whose
// global index is >= size do nothing, so the grid may overshoot the array.
// No shared memory is used.
//
// NOTE(review): the sum is a plain signed int addition; with inputs as large
// as RAND_MAX on some platforms it can overflow.
__global__ void sumArrays(const int *a, const int *b, int *res, const int size)
{
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    if (index < size) {
        res[index] = a[index] + b[index];
    }
}

// Fill a[0..size) with values from rand().
void random_ints(int *a, const int size)
{
    for (int i = 0; i < size; ++i) {
        a[i] = rand();
    }
}

int main()
{
    int dev = 0;
    CUDA_CHECK(cudaSetDevice(dev));

    // Two CUDA events bracket the timed region.
    cudaEvent_t cuda_start_time, cuda_stop_time;
    CUDA_CHECK(cudaEventCreate(&cuda_start_time));
    CUDA_CHECK(cudaEventCreate(&cuda_stop_time));

    // Record the start event before the code to be timed.
    CUDA_CHECK(cudaEventRecord(cuda_start_time));
    // Can be omitted under the TCC driver model, but must be kept under WDDM
    // so the recorded event is actually submitted. The return value is
    // deliberately not checked: cudaErrorNotReady is a normal answer here.
    cudaEventQuery(cuda_start_time);

    // Host-side CPU-time measurement with the C clock() function.
    clock_t c_start_time, c_end_time;
    c_start_time = clock();

    const int n = 2048 * 2048;
    const int threads_per_block = 1024;
    const int blocks = (n + threads_per_block - 1) / threads_per_block;  // ceil-div
    const size_t size = static_cast<size_t>(n) * sizeof(int);

    int *a, *b, *c;
    int *d_a, *d_b, *d_c;

    // Allocate device buffers.
    CUDA_CHECK(cudaMalloc((void **)&d_a, size));
    CUDA_CHECK(cudaMalloc((void **)&d_b, size));
    CUDA_CHECK(cudaMalloc((void **)&d_c, size));

    // Allocate host buffers and prepare the input data.
    a = (int *)malloc(size);
    b = (int *)malloc(size);
    c = (int *)malloc(size);
    if (a == NULL || b == NULL || c == NULL) {
        fprintf(stderr, "host allocation of %zu bytes failed\n", size);
        return EXIT_FAILURE;
    }
    random_ints(a, n);
    random_ints(b, n);

    // Host -> device.
    CUDA_CHECK(cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice));

    sumArrays<<<blocks, threads_per_block>>>(d_a, d_b, d_c, n);
    // A kernel launch returns nothing; launch-configuration errors must be
    // fetched right away, before other API calls can overwrite the status.
    cudaError_t launch_status = cudaGetLastError();
    CUDA_CHECK(launch_status);

    // Device -> host. This blocking copy also waits for the kernel to finish.
    CUDA_CHECK(cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));

    for (int i = 0; i < n; ++i) {
        cout << c[i] << "\t";
    }
    cout << endl;

    // Report the kernel launch status captured above.
    cout << cudaGetErrorString(launch_status) << endl;

    free(a);
    free(b);
    free(c);

    CUDA_CHECK(cudaDeviceSynchronize());

    CUDA_CHECK(cudaEventRecord(cuda_stop_time));
    // Wait until the stop event has actually been recorded.
    CUDA_CHECK(cudaEventSynchronize(cuda_stop_time));
    float cuda_run_time;
    CUDA_CHECK(cudaEventElapsedTime(&cuda_run_time, cuda_start_time, cuda_stop_time));
    printf("\n(Cuda) The Running time is %f\n", cuda_run_time);

    CUDA_CHECK(cudaEventDestroy(cuda_start_time));
    CUDA_CHECK(cudaEventDestroy(cuda_stop_time));

    c_end_time = clock();
    // clock() counts in implementation-defined ticks; dividing the tick
    // difference by CLOCKS_PER_SEC converts it to seconds.
    double c_run_time = ((double)(c_end_time - c_start_time)) / CLOCKS_PER_SEC;
    printf("\n(C) The Running time is %f\n", c_run_time);

    return 0;
}

/*
Flow of this CUDA program:
    1. prepare the input data
    2. allocate storage on the device
    3. copy the data from host to device
    4. run the computation on the device
    5. copy the result from device back to host
    6. free the device memory
    7. free the host memory
*/
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。