当前位置:   article > 正文

CUDA NMS with cub::DeviceRadixSort::SortPairsDescending

SortPairsDescending

python cuda

# Compute device selection: use the CUDA GPU when one is available, otherwise fall back to CPU.
CTX = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

  1. int YoloLayerPlugin::nms_fun(int batch_size, void **inputs, void *const* outputs, size_t count, int detections_per_im, float nms_thresh, void *workspace, size_t workspace_size, cudaStream_t stream) const {
  2. if (!workspace || !workspace_size) {
  3. // Return required scratch space size cub style
  4. workspace_size = get_size_aligned<bool>(count); // flags
  5. workspace_size += get_size_aligned<int>(count); // indices
  6. workspace_size += get_size_aligned<int>(count); // indices_sorted
  7. workspace_size += get_size_aligned<float>(count); // scores
  8. workspace_size += get_size_aligned<float>(count); // scores_sorted
  9. size_t temp_size_flag = 0;
  10. cub::DeviceSelect::Flagged((void *)nullptr, temp_size_flag,
  11. cub::CountingInputIterator<int>(count),
  12. (bool *)nullptr, (int *)nullptr, (int *)nullptr, count);
  13. size_t temp_size_sort = 0;
  14. cub::DeviceRadixSort::SortPairsDescending((void *)nullptr, temp_size_sort,
  15. (float *)nullptr, (float *)nullptr, (int *)nullptr, (int *)nullptr, count);
  16. workspace_size += std::max(temp_size_flag, temp_size_sort);
  17. return workspace_size;
  18. }
  19. auto on_stream = thrust::cuda::par.on(stream);
  20. auto flags = get_next_ptr<bool>(count, workspace, workspace_size);
  21. auto indices = get_next_ptr<int>(count, workspace, workspace_size);
  22. auto indices_sorted = get_next_ptr<int>(count, workspace, workspace_size);
  23. auto scores = get_next_ptr<float>(count, workspace, workspace_size);
  24. auto scores_sorted = get_next_ptr<float>(count, workspace, workspace_size);
  25. // printf("nms batch %d \n", batch_size);
  26. for (int batch = 0; batch < batch_size; batch++) {
  27. auto in_scores = static_cast<const float *>(inputs[0]) + batch * count;
  28. auto in_boxes = static_cast<const float4 *>(inputs[1]) + batch * count;
  29. auto in_classes = static_cast<const float *>(inputs[2]) + batch * count;
  30. auto in_points = static_cast<const float *>(inputs[3]) + batch * count;
  31. auto out_scores = static_cast<float *>(outputs[0]) + batch * detections_per_im;
  32. auto out_boxes = static_cast<float4 *>(outputs[1]) + batch * detections_per_im;
  33. auto out_classes = static_cast<float *>(outputs[2]) + batch * detections_per_im;
  34. auto out_points = static_cast<float4 *>(outputs[3]) + batch * detections_per_im;
  35. // cudaMemcpyAsync(tmp, out_scores, 10 * sizeof(float), cudaMemcpyDeviceToHost, stream);
  36. // printf("output %0.2f %0.2f %0.2f %0.2f %0.2f %0.2f %0.2f %0.2f %0.2f %0.2f\n", tmp[0],tmp[1],tmp[2],tmp[3],tmp[4],tmp[5],tmp[6],tmp[7],tmp[8],tmp[9]);
  37. // Discard null scores
  38. thrust::transform(on_stream, in_scores, in_scores + count,flags, thrust::placeholders::_1 > 0.0f);
  39. int *num_selected = reinterpret_cast<int *>(indices_sorted);
  40. cub::DeviceSelect::Flagged(workspace, workspace_size, cub::CountingInputIterator<int>(0),flags, indices, num_selected, count, stream);
  41. cudaStreamSynchronize(stream);
  42. int num_detections = *thrust::device_pointer_cast(num_selected);
  43. // Sort scores and corresponding indices
  44. thrust::gather(on_stream, indices, indices + num_detections, in_scores, scores);
  45. cub::DeviceRadixSort::SortPairsDescending(workspace, workspace_size,scores, scores_sorted, indices, indices_sorted, num_detections, 0, sizeof(*scores)*8, stream);
  46. // Launch actual NMS kernel - 1 block with each thread handling n detections
  47. const int max_threads = 1024;
  48. int num_per_thread = ceil((float)num_detections / max_threads);
  49. nms_kernel<<<1, max_threads, 0, stream>>>(num_per_thread, nms_thresh, num_detections,
  50. indices_sorted, scores_sorted, in_classes, in_boxes);
  51. // Re-sort with updated scores
  52. cub::DeviceRadixSort::SortPairsDescending(workspace, workspace_size,
  53. scores_sorted, scores, indices_sorted, indices, num_detections, 0, sizeof(*scores)*8, stream);
  54. // Gather filtered scores, boxes, classes
  55. num_detections = min(detections_per_im, num_detections);
  56. cudaMemcpyAsync(out_scores, scores, num_detections * sizeof *scores, cudaMemcpyDeviceToDevice, stream);
  57. if (num_detections < detections_per_im) {
  58. thrust::fill_n(on_stream, out_scores + num_detections, detections_per_im - num_detections, 0);
  59. }
  60. thrust::gather(on_stream, indices, indices + num_detections, in_boxes, out_boxes);
  61. thrust::gather(on_stream, indices, indices + num_detections, in_classes, out_classes);
  62. thrust::gather(on_stream, indices, indices + num_detections, in_points, out_points);
  63. float tmp[10];
  64. cudaMemcpyAsync(tmp, out_points, 10 * sizeof(float), cudaMemcpyDeviceToHost, stream);
  65. printf("out_points %0.2f %0.2f %0.2f %0.2f %0.2f %0.2f %0.2f %0.2f %0.2f %0.2f\n", tmp[0], tmp[1], tmp[2], tmp[3], tmp[4], tmp[5], tmp[6], tmp[7], tmp[8], tmp[9]);
  66. // printf("num_detections %d \n", num_detections);
  67. // cudaMemcpyAsync(tmp, out_scores, 10 * sizeof(float), cudaMemcpyDeviceToHost, stream);
  68. // printf("output %0.2f %0.2f %0.2f %0.2f %0.2f %0.2f %0.2f %0.2f %0.2f %0.2f\n", tmp[0],tmp[1],tmp[2],tmp[3],tmp[4],tmp[5],tmp[6],tmp[7],tmp[8],tmp[9]);
  69. }
  70. return 0;
  71. }

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/Cpp五条/article/detail/261592
推荐阅读
相关标签
  

闽ICP备14008679号