
Complex matrix inversion with CUDA (cuBLAS batched routines)

The previous post covered the environment setup:

https://blog.csdn.net/kewang93/article/details/118702824?spm=1001.2014.3001.5502

This post shows batched inversion of complex matrices.
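The inversion itself takes only two cuBLAS calls: cublasCgetrfBatched performs an in-place LU factorization with partial pivoting on every matrix in the batch, and cublasCgetriBatched then combines the LU factors and pivot indices into the inverses, which must be written to a separate output array (getri does not support in-place output). A minimal sketch of just this pattern (d_Aarray, d_Carray, d_pivots, d_info, and batch are placeholder names; the full program below shows one way to allocate them):

    // d_Aarray / d_Carray: device arrays of device pointers, one n-by-n cuComplex matrix each
    // d_pivots: n * batch ints, d_info: batch ints (0 means the factorization succeeded)
    cublasCgetrfBatched(handle, n, d_Aarray, n, d_pivots, d_info, batch);   // LU in place
    cublasCgetriBatched(handle, n, (const cuComplex **)d_Aarray, n, d_pivots,
                        d_Carray, n, d_info, batch);                        // inverses into d_Carray

The full program follows.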

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <math.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <helper_cuda.h>
//#include "batchCUBLAS.h"

#define T_ELEM cuComplex
#define CUBLASTEST_FAILED 1
#define MATDEM 2  // order of every matrix (MATDEM x MATDEM)

const char *sSDKname = "batchCUBLAS";

static __inline__ int imax(int x, int y)
{
    return (x > y) ? x : y;
}
static __inline__ unsigned cuRand(void)
{
    /* George Marsaglia's fast inline random number generator */
#define CUDA_ZNEW (cuda_z=36969*(cuda_z&65535)+(cuda_z>>16))
#define CUDA_WNEW (cuda_w=18000*(cuda_w&65535)+(cuda_w>>16))
#define CUDA_MWC ((CUDA_ZNEW<<16)+CUDA_WNEW)
#define CUDA_SHR3 (cuda_jsr=cuda_jsr^(cuda_jsr<<17), \
                   cuda_jsr = cuda_jsr ^ (cuda_jsr >> 13), \
                   cuda_jsr = cuda_jsr ^ (cuda_jsr << 5))
#define CUDA_CONG (cuda_jcong=69069*cuda_jcong+1234567)
#define KISS ((CUDA_MWC^CUDA_CONG)+CUDA_SHR3)
    static unsigned int cuda_z = 362436069, cuda_w = 521288629;
    static unsigned int cuda_jsr = 123456789, cuda_jcong = 380116160;
    return KISS;
}
#include <windows.h>

static __inline__ double second(void)
{
    LARGE_INTEGER t;
    static double oofreq;
    static int checkedForHighResTimer;
    static BOOL hasHighResTimer;

    if (!checkedForHighResTimer)
    {
        hasHighResTimer = QueryPerformanceFrequency(&t);
        oofreq = 1.0 / (double)t.QuadPart;
        checkedForHighResTimer = 1;
    }

    if (hasHighResTimer)
    {
        QueryPerformanceCounter(&t);
        return (double)t.QuadPart * oofreq;
    }
    else
    {
        return (double)GetTickCount() / 1000.0;
    }
}
struct gemmOpts
{
    int m;
    int n;
    int k;
    //testMethod test_method;
    char *elem_type;
    int N; // number of matrices in the batch
};

struct gemmTestParams
{
    cublasOperation_t transa;
    cublasOperation_t transb;
    int m;
    int n;
    int k;
    T_ELEM alpha;
    T_ELEM beta;
};

#define CLEANUP() \
    do { \
        if (A) free(A); \
        if (B) free(B); \
        if (C) free(C); \
        for (int i = 0; i < opts.N; ++i) { \
            if (devPtrA[i]) cudaFree(devPtrA[i]); \
            if (devPtrB[i]) cudaFree(devPtrB[i]); \
            if (devPtrC[i]) cudaFree(devPtrC[i]); \
        } \
        if (devPtrA) free(devPtrA); \
        if (devPtrB) free(devPtrB); \
        if (devPtrC) free(devPtrC); \
        if (devPtrA_dev) cudaFree(devPtrA_dev); \
        if (devPtrB_dev) cudaFree(devPtrB_dev); \
        if (devPtrC_dev) cudaFree(devPtrC_dev); \
        if (info) cudaFree(info); \
        if (pivo) cudaFree(pivo); \
        fflush(stdout); \
    } while (0)
void fillupMatrixDebug(T_ELEM *A, int lda, int rows, int cols)
{
    for (int j = 0; j < cols; j++)
    {
        for (int i = 0; i < rows; i++)
        {
            A[i + lda*j].x = (float)(i + rand() % 13);  // real part
            A[i + lda*j].y = (float)(j + rand() % 17);  // imaginary part
        }
    }
}

void PrintMatrixDebug(T_ELEM *A, int lda, int rows, int cols)
{
    printf("========The matrix is\n");
    // Column-major storage: A(i,j) = A[i + lda*j], so the row index must be
    // the outer loop (the original looped over columns and printed the transpose)
    for (int i = 0; i < rows; i++)
    {
        for (int j = 0; j < cols; j++)
        {
            printf("%+.04f%+.04fi ", A[i + lda*j].x, A[i + lda*j].y);
        }
        printf("\n");
    }
}
int main(int argc, char *argv[])
{
    printf("%s Starting...\n\n", sSDKname);

    int dev = findCudaDevice(argc, (const char **)argv);

    if (dev == -1)
    {
        return CUBLASTEST_FAILED;
    }

    cublasHandle_t handle;

    if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stdout, "CUBLAS initialization failed!\n");
        exit(EXIT_FAILURE);
    }

    cublasStatus_t status1, status2, status3;
    T_ELEM *A = NULL;
    T_ELEM *B = NULL;
    T_ELEM *C = NULL;
    T_ELEM **devPtrA = 0;
    T_ELEM **devPtrB = 0;
    T_ELEM **devPtrC = 0;
    T_ELEM **devPtrA_dev = NULL;
    T_ELEM **devPtrB_dev = NULL;
    T_ELEM **devPtrC_dev = NULL;
    int matrixM, matrixN, matrixK;
    int rowsA, rowsB, rowsC;
    int colsA, colsB, colsC;
    int m, n, k;
    int matrixSizeA, matrixSizeB, matrixSizeC;
    double start, stop;
    gemmOpts opts;
    opts.N = 20;  // batch size
    gemmTestParams params;

    printf("Testing Batch INV Cublas\n");

    matrixM = MATDEM;
    matrixN = MATDEM;
    matrixK = MATDEM;

    int *info;  // per-matrix status: 0 if the LU factorization succeeded
    int *pivo;  // pivot indices produced by the LU factorization
    cudaMalloc((void **)&info, sizeof(int) * opts.N);
    cudaMalloc((void **)&pivo, sizeof(int) * matrixM * opts.N);

    rowsA = imax(1, matrixM);
    colsA = imax(1, matrixK);
    rowsB = imax(1, matrixK);
    colsB = imax(1, matrixN);
    rowsC = imax(1, matrixM);
    colsC = imax(1, matrixN);
    matrixSizeA = rowsA * colsA;
    matrixSizeB = rowsB * colsB;
    matrixSizeC = rowsC * colsC;

    params.transa = CUBLAS_OP_N;
    params.transb = CUBLAS_OP_N;
    m = matrixM;
    n = matrixN;
    k = matrixK;
    params.m = m;
    params.n = n;
    params.k = k;
    params.alpha.x = 1.0f; params.alpha.y = 0.0f;
    params.beta.x  = 0.0f; params.beta.y  = 0.0f;

    // Host-side arrays of device pointers, one matrix per batch entry
    devPtrA = (T_ELEM **)malloc(opts.N * sizeof(T_ELEM *));
    devPtrB = (T_ELEM **)malloc(opts.N * sizeof(*devPtrB));
    devPtrC = (T_ELEM **)malloc(opts.N * sizeof(*devPtrC));

    for (int i = 0; i < opts.N; i++)
    {
        cudaMalloc((void **)&devPtrA[i], matrixSizeA * sizeof(T_ELEM));
        cudaMalloc((void **)&devPtrB[i], matrixSizeB * sizeof(devPtrB[0][0]));
        cudaMalloc((void **)&devPtrC[i], matrixSizeC * sizeof(devPtrC[0][0]));
    }

    // For batched processing the pointer arrays themselves must live on the device
    cudaMalloc((void **)&devPtrA_dev, opts.N * sizeof(*devPtrA));
    cudaMalloc((void **)&devPtrB_dev, opts.N * sizeof(*devPtrB));
    cudaMalloc((void **)&devPtrC_dev, opts.N * sizeof(*devPtrC));
    cudaMemcpy(devPtrA_dev, devPtrA, opts.N * sizeof(*devPtrA), cudaMemcpyHostToDevice);
    cudaMemcpy(devPtrB_dev, devPtrB, opts.N * sizeof(*devPtrB), cudaMemcpyHostToDevice);
    cudaMemcpy(devPtrC_dev, devPtrC, opts.N * sizeof(*devPtrC), cudaMemcpyHostToDevice);

    A = (T_ELEM *)malloc(matrixSizeA * sizeof(A[0]));
    B = (T_ELEM *)malloc(matrixSizeB * sizeof(B[0]));
    C = (T_ELEM *)malloc(matrixSizeC * sizeof(C[0]));

    printf("#### args: lda=%d ldb=%d ldc=%d\n", rowsA, rowsB, rowsC);

    memset(A, 0xFF, matrixSizeA * sizeof(A[0]));
    fillupMatrixDebug(A, rowsA, rowsA, colsA);
    memset(B, 0xFF, matrixSizeB * sizeof(B[0]));
    fillupMatrixDebug(B, rowsB, rowsB, colsB);
    memset(C, 0, matrixSizeC * sizeof(C[0]));

    // Upload the same A, B, C to every batch entry
    for (int i = 0; i < opts.N; i++)
    {
        status1 = cublasSetMatrix(rowsA, colsA, sizeof(A[0]), A, rowsA, devPtrA[i], rowsA);
        status2 = cublasSetMatrix(rowsB, colsB, sizeof(B[0]), B, rowsB, devPtrB[i], rowsB);
        status3 = cublasSetMatrix(rowsC, colsC, sizeof(C[0]), C, rowsC, devPtrC[i], rowsC);
    }

    start = second();

    // Batched C = alpha * A * B + beta * C (the result is overwritten by the
    // inversion below; the gemm call is kept only as a usage demonstration)
    status1 = cublasCgemmBatched(handle, params.transa, params.transb, params.m, params.n,
                                 params.k, &params.alpha, (const T_ELEM **)devPtrA_dev, rowsA,
                                 (const T_ELEM **)devPtrB_dev, rowsB, &params.beta,
                                 devPtrC_dev, rowsC, opts.N);

    // In-place LU factorization with partial pivoting. The fourth argument is the
    // leading dimension; the matrices are stored contiguously, so it equals the order m.
    cublasCgetrfBatched(handle, params.m, devPtrA_dev, params.m, pivo, info, opts.N);
    // Compute the inverses from the LU factors; the output array must differ
    // from the input array (in-place output is not supported)
    cublasCgetriBatched(handle, params.m, (const T_ELEM **)devPtrA_dev, params.m, pivo,
                        devPtrC_dev, params.m, info, opts.N);

    // Download the result; every devPtrC[i] now holds inv(A)
    for (int i = 0; i < opts.N; i++)
    {
        status3 = cublasGetMatrix(rowsC, colsC, sizeof(C[0]), devPtrC[i], rowsC, C, rowsC);
    }

    PrintMatrixDebug(A, params.m, params.n, params.k);
    PrintMatrixDebug(B, params.m, params.n, params.k);
    PrintMatrixDebug(C, params.m, params.n, params.k);

    stop = second();
    fprintf(stdout, "^^^^ elapsed = %10.8f sec\n", (stop - start));

    CLEANUP();
    cublasDestroy(handle);
    printf("\nTest Summary\n");
    system("pause");
    return 0;
}
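Since MATDEM is 2, the printed result is easy to hand-check: for a 2x2 matrix [[a, b], [c, d]] the inverse is [[d, -b], [-c, a]] divided by the determinant ad - bc, with all products complex. For a programmatic sanity check you can multiply the original matrix by the returned inverse on the host and compare against the identity. The helper below is a sketch, not part of the original program (checkInverse is a hypothetical name; it assumes column-major storage with leading dimension n, matching the listing):

    #include <cuComplex.h>
    #include <math.h>

    // Returns 1 if A * Ainv is within tol of the n-by-n identity, else 0.
    static int checkInverse(const cuComplex *A, const cuComplex *Ainv, int n, float tol)
    {
        for (int i = 0; i < n; i++)
            for (int j = 0; j < n; j++)
            {
                cuComplex s = make_cuComplex(0.0f, 0.0f);
                for (int k = 0; k < n; k++)             // s += A(i,k) * Ainv(k,j)
                    s = cuCaddf(s, cuCmulf(A[i + k * n], Ainv[k + j * n]));
                float expect = (i == j) ? 1.0f : 0.0f;  // identity target
                if (fabsf(s.x - expect) > tol || fabsf(s.y) > tol)
                    return 0;
            }
        return 1;
    }

Called as checkInverse(A, C, MATDEM, 1e-3f) after the cublasGetMatrix loop, it should return 1. To build the program, helper_cuda.h from the CUDA samples must be on the include path and the program must link against cuBLAS (for example, assuming a typical samples layout: nvcc -I<CUDA samples>/common/inc main.cu -lcublas).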

 
