赞
踩
# NOTE(review): stray Python line — almost certainly scrape residue from a
# different file; it does not belong in this CUDA source. It selects the CUDA
# device when one is available, otherwise falls back to CPU.
CTX = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
// Device-side batched NMS over decoded YOLO detections.
//
// Two-phase contract (cub style): when called with a null/zero workspace the
// function only computes and returns the scratch-space size (in bytes) needed
// for `count` candidate detections per image; otherwise it runs NMS on `stream`.
//
// inputs  (device): [0] scores  — count floats per batch item
//                   [1] boxes   — count float4 per batch item
//                   [2] classes — count floats per batch item
//                   [3] points  — assumed count float4 per batch item, to match
//                                 outputs[3] (TODO confirm landmark layout)
// outputs (device): per batch item, detections_per_im slots of scores / boxes /
//                   classes / points; score slots past the kept detections are
//                   zero-filled so consumers can detect the valid prefix.
//
// Returns the required workspace size in the query phase, 0 otherwise.
// All device work is enqueued on `stream`; the only host sync is the one
// needed to read back the selected-candidate count.
int YoloLayerPlugin::nms_fun(int batch_size, void **inputs, void *const* outputs, size_t count, int detections_per_im, float nms_thresh, void *workspace, size_t workspace_size, cudaStream_t stream) const {

  if (!workspace || !workspace_size) {
    // Query phase: report required scratch space, cub style.
    workspace_size = get_size_aligned<bool>(count);   // flags
    workspace_size += get_size_aligned<int>(count);   // indices
    workspace_size += get_size_aligned<int>(count);   // indices_sorted
    workspace_size += get_size_aligned<float>(count); // scores
    workspace_size += get_size_aligned<float>(count); // scores_sorted

    // Ask cub for its own temp-storage needs (null storage pointer = size query).
    size_t temp_size_flag = 0;
    cub::DeviceSelect::Flagged((void *)nullptr, temp_size_flag,
      cub::CountingInputIterator<int>(count),
      (bool *)nullptr, (int *)nullptr, (int *)nullptr, count);
    size_t temp_size_sort = 0;
    cub::DeviceRadixSort::SortPairsDescending((void *)nullptr, temp_size_sort,
      (float *)nullptr, (float *)nullptr, (int *)nullptr, (int *)nullptr, count);
    // Select and sort never run concurrently, so one max-sized region serves both.
    workspace_size += std::max(temp_size_flag, temp_size_sort);

    return workspace_size;
  }

  auto on_stream = thrust::cuda::par.on(stream);

  // Carve the per-buffer regions out of the caller-provided workspace.
  // get_next_ptr advances `workspace` / shrinks `workspace_size`, so whatever
  // remains is handed to cub as temp storage below.
  auto flags = get_next_ptr<bool>(count, workspace, workspace_size);
  auto indices = get_next_ptr<int>(count, workspace, workspace_size);
  auto indices_sorted = get_next_ptr<int>(count, workspace, workspace_size);
  auto scores = get_next_ptr<float>(count, workspace, workspace_size);
  auto scores_sorted = get_next_ptr<float>(count, workspace, workspace_size);

  for (int batch = 0; batch < batch_size; batch++) {
    auto in_scores = static_cast<const float *>(inputs[0]) + batch * count;
    auto in_boxes = static_cast<const float4 *>(inputs[1]) + batch * count;
    auto in_classes = static_cast<const float *>(inputs[2]) + batch * count;
    // BUGFIX: was `static_cast<const float *>`, but outputs[3] is float4 and
    // the thrust::gather below copies whole elements — source and destination
    // element types must match (float -> float4 does not compile, and the
    // per-batch stride would be wrong). Assumes points are packed as one
    // float4 per detection, mirroring out_points — TODO confirm layout.
    auto in_points = static_cast<const float4 *>(inputs[3]) + batch * count;

    auto out_scores = static_cast<float *>(outputs[0]) + batch * detections_per_im;
    auto out_boxes = static_cast<float4 *>(outputs[1]) + batch * detections_per_im;
    auto out_classes = static_cast<float *>(outputs[2]) + batch * detections_per_im;
    auto out_points = static_cast<float4 *>(outputs[3]) + batch * detections_per_im;

    // Discard null scores: flags[i] = (in_scores[i] > 0).
    thrust::transform(on_stream, in_scores, in_scores + count, flags, thrust::placeholders::_1 > 0.0f);

    // indices_sorted doubles as scratch for the selected count; it is not
    // needed as sort output until after the count has been read back.
    int *num_selected = reinterpret_cast<int *>(indices_sorted);
    cub::DeviceSelect::Flagged(workspace, workspace_size, cub::CountingInputIterator<int>(0), flags, indices, num_selected, count, stream);
    // The host needs the count to size the following launches, so a sync is
    // unavoidable here.
    cudaStreamSynchronize(stream);
    int num_detections = *thrust::device_pointer_cast(num_selected);

    // Sort surviving scores (and their candidate indices) in descending order.
    thrust::gather(on_stream, indices, indices + num_detections, in_scores, scores);
    cub::DeviceRadixSort::SortPairsDescending(workspace, workspace_size, scores, scores_sorted, indices, indices_sorted, num_detections, 0, sizeof(*scores) * 8, stream);

    // Launch actual NMS kernel - 1 block with each thread handling n detections.
    const int max_threads = 1024;
    // BUGFIX: integer ceil-division instead of float ceil() — avoids the
    // double promotion and float rounding for large detection counts.
    int num_per_thread = (num_detections + max_threads - 1) / max_threads;
    nms_kernel<<<1, max_threads, 0, stream>>>(num_per_thread, nms_thresh, num_detections,
      indices_sorted, scores_sorted, in_classes, in_boxes);

    // Re-sort with the scores the NMS kernel zeroed for suppressed boxes,
    // so survivors form a contiguous descending prefix.
    cub::DeviceRadixSort::SortPairsDescending(workspace, workspace_size,
      scores_sorted, scores, indices_sorted, indices, num_detections, 0, sizeof(*scores) * 8, stream);

    // Keep the top detections_per_im survivors; zero-fill the unused score
    // slots so the output always has a well-defined valid prefix.
    num_detections = min(detections_per_im, num_detections);
    cudaMemcpyAsync(out_scores, scores, num_detections * sizeof *scores, cudaMemcpyDeviceToDevice, stream);
    if (num_detections < detections_per_im) {
      thrust::fill_n(on_stream, out_scores + num_detections, detections_per_im - num_detections, 0);
    }
    thrust::gather(on_stream, indices, indices + num_detections, in_boxes, out_boxes);
    thrust::gather(on_stream, indices, indices + num_detections, in_classes, out_classes);
    thrust::gather(on_stream, indices, indices + num_detections, in_points, out_points);

    // BUGFIX: removed the debug readback of out_points that was here — it
    // printed the host buffer immediately after cudaMemcpyAsync(DeviceToHost)
    // with no stream synchronization, i.e. it printed indeterminate data, and
    // it leaked debug output into production runs.
  }

  return 0;
}

Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。