当前位置:   article > 正文

【代码分析】TensorRT sampleINT8 详解_samplescommon::volume

samplescommon::volume

目录

 

前言

 

代码分析

Main入口

构建(Build)网络

BatchStream

推理(Infer)过程

资源释放


 

前言

TensorRT可以通过INT8量化处理网络,然后大幅加速网络推理速度,本文旨在详细分析MNIST INT8 Sample 的代码,解释如何使用TensorRT 对网络做INT8 量化处理。

关于INT8 量化的背景知识可以参考博文TensorRT INT8校准与量化原理

 

代码分析

sampleINT8的github 代码参考link: https://github.com/NVIDIA/TensorRT/tree/release/6.0/samples/opensource/sampleINT8

程序的主要流程分为 main与程序输入参数初始化 -> 网络构建 -> 网络推理 -> 释放资源结束 这几个阶段,下面逐个阶段分析代码
 

Main入口

  1. //!
  2. //! \brief Initializes members of the params struct using the command line args
  3. //!
  4. SampleINT8Params initializeSampleParams(const samplesCommon::Args& args, int batchSize)
  5. {
  6. SampleINT8Params params;
  7. // Use directories provided by the user, in addition to default directories.
  8. params.dataDirs = args.dataDirs;
  9. params.dataDirs.emplace_back("data/mnist/");
  10. params.dataDirs.emplace_back("int8/mnist/");
  11. params.dataDirs.emplace_back("samples/mnist/");
  12. params.dataDirs.emplace_back("data/samples/mnist/");
  13. params.dataDirs.emplace_back("data/int8/mnist/");
  14. params.dataDirs.emplace_back("data/int8_samples/mnist/");
  15. params.batchSize = batchSize;
  16. params.dlaCore = args.useDLACore;
  17. params.nbCalBatches = 10;
  18. params.calBatchSize = 50;
  19. params.inputTensorNames.push_back("data");
  20. params.outputTensorNames.push_back("prob");
  21. params.prototxtFileName = "deploy.prototxt";
  22. params.weightsFileName = "mnist_lenet.caffemodel";
  23. params.networkName = "mnist";
  24. return params;
  25. }
  26. //!
  27. //! \brief Prints the help information for running this sample
  28. //!
  29. void printHelpInfo()
  30. {
  31. std::cout << "Usage: ./sample_int8 [-h or --help] [-d or --datadir=<path to data directory>] "
  32. "[--useDLACore=<int>]"
  33. << std::endl;
  34. std::cout << "--help Display help information" << std::endl;
  35. std::cout << "--datadir Specify path to a data directory, overriding the default. This option can be used "
  36. "multiple times to add multiple directories."
  37. << std::endl;
  38. std::cout << "--useDLACore=N Specify a DLA engine for layers that support DLA. Value can range from 0 to n-1, "
  39. "where n is the number of DLA engines on the platform."
  40. << std::endl;
  41. std::cout << "batch=N Set batch size (default = 32)." << std::endl;
  42. std::cout << "start=N Set the first batch to be scored (default = 100). All batches before this batch will "
  43. "be used for calibration."
  44. << std::endl;
  45. std::cout << "score=N Set the number of batches to be scored (default = 400)." << std::endl;
  46. }
  47. int main(int argc, char** argv)
  48. {
  49. if (argc >= 2 && (!strncmp(argv[1], "help", 4) || !strncmp(argv[1], "--help", 6) || !strncmp(argv[1], "--h", 3)))
  50. {
  51. printHelpInfo();
  52. return EXIT_FAILURE;
  53. }
  54. // By default we score over 40K images starting at 3200, so we don't score those used to search calibration
  55. int batchSize = 32;
  56. int firstScoreBatch = 100;
  57. int nbScoreBatches = 400;
  58. // Parse extra arguments
  59. for (int i = 1; i < argc; ++i)
  60. {
  61. if (!strncmp(argv[i], "batch=", 6))
  62. {
  63. batchSize = atoi(argv[i] + 6);
  64. }
  65. else if (!strncmp(argv[i], "start=", 6))
  66. {
  67. firstScoreBatch = atoi(argv[i] + 6);
  68. }
  69. else if (!strncmp(argv[i], "score=", 6))
  70. {
  71. nbScoreBatches = atoi(argv[i] + 6);
  72. }
  73. }
  74. if (batchSize > 128)
  75. {
  76. gLogError << "Please provide batch size <= 128" << std::endl;
  77. return EXIT_FAILURE;
  78. }
  79. if ((firstScoreBatch + nbScoreBatches) * batchSize > 500000)
  80. {
  81. gLogError << "Only 50000 images available" << std::endl;
  82. return EXIT_FAILURE;
  83. }
  84. samplesCommon::Args args;
  85. samplesCommon::parseArgs(args, argc, argv);
  86. SampleINT8 sample(initializeSampleParams(args, batchSize));
  87. ......
  • 检查程序输入参数:如果第一个参数是help/--help/--h,则打印help提示信息并退出;随后解析batch=、start=、score=三个可选参数,并检查batchSize以及参与打分的图片数量是否超出允许范围
  • 通过initializeSampleParams函数设置默认参数的值
  1. int main(int argc, char** argv)
  2. {
  3. ......
  4. std::vector<std::string> dataTypeNames = {"FP32", "FP16", "INT8"};
  5. std::vector<DataType> dataTypes = {DataType::kFLOAT, DataType::kHALF, DataType::kINT8};
  6. std::vector<std::pair<float, float>> scores(3, std::make_pair(0.0f, 0.0f));
  7. for (size_t i = 0; i < dataTypes.size(); i++)
  8. {
  9. gLogInfo << dataTypeNames[i] << " run:" << nbScoreBatches << " batches of size " << batchSize << " starting at "
  10. << firstScoreBatch << std::endl;
  11. if (!sample.build(dataTypes[i]))
  12. {
  13. if (!sample.isSupported(dataTypes[i]))
  14. {
  15. gLogWarning << "Skipping " << dataTypeNames[i] << " since the platform does not support this data type."
  16. << std::endl;
  17. continue;
  18. }
  19. return gLogger.reportFail(sampleTest);
  20. }
  21. if (!sample.infer(scores[i], firstScoreBatch, nbScoreBatches))
  22. {
  23. return gLogger.reportFail(sampleTest);
  24. }
  25. }
  26. ......
  •  根据FP32, FP16, INT8 三种数据类型来构建网络和推理网络

 

构建(Build)网络

  1. bool SampleINT8::build(DataType dataType)
  2. {
  3. auto builder = SampleUniquePtr<nvinfer1::IBuilder>(nvinfer1::createInferBuilder(gLogger.getTRTLogger()));
  4. if (!builder)
  5. {
  6. return false;
  7. }
  8. auto network = SampleUniquePtr<nvinfer1::INetworkDefinition>(builder->createNetwork());
  9. if (!network)
  10. {
  11. return false;
  12. }
  13. auto config = SampleUniquePtr<nvinfer1::IBuilderConfig>(builder->createBuilderConfig());
  14. if (!config)
  15. {
  16. return false;
  17. }
  18. auto parser = SampleUniquePtr<nvcaffeparser1::ICaffeParser>(nvcaffeparser1::createCaffeParser());
  19. if (!parser)
  20. {
  21. return false;
  22. }
  23. if ((dataType == DataType::kINT8 && !builder->platformHasFastInt8())
  24. || (dataType == DataType::kHALF && !builder->platformHasFastFp16()))
  25. {
  26. return false;
  27. }
  28. auto constructed = constructNetwork(builder, network, config, parser, dataType);
  29. ......
  • TensorRT的标准流程,创建IBuilder -> 创建INetworkDefinition -> 创建IBuilderConfig -> 创建caffe模型的分析器ICaffeParser,判断硬件平台是否支持native FP16或INT8
  • 通过constructNetwork函数分析caffe模型,构建网络
  1. bool SampleINT8::constructNetwork(SampleUniquePtr<nvinfer1::IBuilder>& builder,
  2. SampleUniquePtr<nvinfer1::INetworkDefinition>& network, SampleUniquePtr<nvinfer1::IBuilderConfig>& config,
  3. SampleUniquePtr<nvcaffeparser1::ICaffeParser>& parser, DataType dataType)
  4. {
  5. mEngine = nullptr;
  6. const nvcaffeparser1::IBlobNameToTensor* blobNameToTensor
  7. = parser->parse(locateFile(mParams.prototxtFileName, mParams.dataDirs).c_str(),
  8. locateFile(mParams.weightsFileName, mParams.dataDirs).c_str(), *network,
  9. dataType == DataType::kINT8 ? DataType::kFLOAT : dataType);
  10. for (auto& s : mParams.outputTensorNames)
  11. {
  12. network->markOutput(*blobNameToTensor->find(s.c_str()));
  13. }
  14. // Calibrator life time needs to last until after the engine is built.
  15. std::unique_ptr<IInt8Calibrator> calibrator;
  16. config->setAvgTimingIterations(1);
  17. config->setMinTimingIterations(1);
  18. config->setMaxWorkspaceSize(1_GiB);
  19. config->setFlag(BuilderFlag::kDEBUG);
  20. if (dataType == DataType::kHALF)
  21. {
  22. config->setFlag(BuilderFlag::kFP16);
  23. }
  24. if (dataType == DataType::kINT8)
  25. {
  26. config->setFlag(BuilderFlag::kINT8);
  27. }
  28. builder->setMaxBatchSize(mParams.batchSize);
  29. if (dataType == DataType::kINT8)
  30. {
  31. MNISTBatchStream calibrationStream(mParams.calBatchSize, mParams.nbCalBatches, "train-images-idx3-ubyte",
  32. "train-labels-idx1-ubyte", mParams.dataDirs);
  33. calibrator.reset(new Int8EntropyCalibrator2<MNISTBatchStream>(
  34. calibrationStream, 0, mParams.networkName.c_str(), mParams.inputTensorNames[0].c_str()));
  35. config->setInt8Calibrator(calibrator.get());
  36. }
  37. ......
  • 通过parser->parse解析caffe模型的prototxt文件和权重文件来构建network对象;当dataType是INT8时,权重仍按FP32(DataType::kFLOAT)解析,否则按传入的dataType解析
  • 通过network->markOutput(*blobNameToTensor->find(s.c_str())) 标记网络output的Tensor
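  • 通过config 配置构建时计时(timing)的平均/最小迭代次数(setAvgTimingIterations/setMinTimingIterations)和最大workspace大小(setMaxWorkspaceSize,1 GiB),并根据dataType设置对应的Flag(kFP16或kINT8)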
  • 通过builder->setMaxBatchSize设置网络input的最大batchSize
  • 如果dataType是INT8类型,则构建BatchStream对象用于获取执行校准需要的calibration数据
  • 通过calibrator.reset(new Int8EntropyCalibrator2<MNISTBatchStream>( calibrationStream ... 构建TensorRT需要的calibration interface,TensorRT通过该interface在量化过程中获取calibration数据,关于BatchStream的详细分析如下
  • 通过config->setInt8Calibrator(calibrator.get()); 将calibration interface配置到网络的config中

BatchStream

  1. class IBatchStream
  2. {
  3. public:
  4. virtual void reset(int firstBatch) = 0;
  5. virtual bool next() = 0;
  6. virtual void skip(int skipCount) = 0;
  7. virtual float* getBatch() = 0;
  8. virtual float* getLabels() = 0;
  9. virtual int getBatchesRead() const = 0;
  10. virtual int getBatchSize() const = 0;
  11. virtual nvinfer1::Dims getDims() const = 0;
  12. };
  13. class MNISTBatchStream : public IBatchStream
  14. {
  15. public:
  16. MNISTBatchStream(int batchSize, int maxBatches, const std::string& dataFile, const std::string& labelsFile,
  17. const std::vector<std::string>& directories)
  18. : mBatchSize{batchSize}
  19. , mMaxBatches{maxBatches}
  20. , mDims{3, 1, 28, 28} //!< We already know the dimensions of MNIST images.
  21. {
  22. readDataFile(locateFile(dataFile, directories));
  23. readLabelsFile(locateFile(labelsFile, directories));
  24. }
  25. void reset(int firstBatch) override
  26. {
  27. mBatchCount = firstBatch;
  28. }
  29. bool next() override
  30. {
  31. if (mBatchCount >= mMaxBatches)
  32. {
  33. return false;
  34. }
  35. ++mBatchCount;
  36. return true;
  37. }
  38. void skip(int skipCount) override
  39. {
  40. mBatchCount += skipCount;
  41. }
  42. float* getBatch() override
  43. {
  44. return mData.data() + (mBatchCount * mBatchSize * samplesCommon::volume(mDims));
  45. }
  46. float* getLabels() override
  47. {
  48. return mLabels.data() + (mBatchCount * mBatchSize);
  49. }
  50. int getBatchesRead() const override
  51. {
  52. return mBatchCount;
  53. }
  54. int getBatchSize() const override
  55. {
  56. return mBatchSize;
  57. }
  58. nvinfer1::Dims getDims() const override
  59. {
  60. return mDims;
  61. }
  62. private:
  63. void readDataFile(const std::string& dataFilePath)
  64. {
  65. std::ifstream file{dataFilePath.c_str(), std::ios::binary};
  66. int magicNumber, numImages, imageH, imageW;
  67. file.read(reinterpret_cast<char*>(&magicNumber), sizeof(magicNumber));
  68. // All values in the MNIST files are big endian.
  69. magicNumber = samplesCommon::swapEndianness(magicNumber);
  70. assert(magicNumber == 2051 && "Magic Number does not match the expected value for an MNIST image set");
  71. // Read number of images and dimensions
  72. file.read(reinterpret_cast<char*>(&numImages), sizeof(numImages));
  73. file.read(reinterpret_cast<char*>(&imageH), sizeof(imageH));
  74. file.read(reinterpret_cast<char*>(&imageW), sizeof(imageW));
  75. numImages = samplesCommon::swapEndianness(numImages);
  76. imageH = samplesCommon::swapEndianness(imageH);
  77. imageW = samplesCommon::swapEndianness(imageW);
  78. // The MNIST data is made up of unsigned bytes, so we need to cast to float and normalize.
  79. int numElements = numImages * imageH * imageW;
  80. std::vector<uint8_t> rawData(numElements);
  81. file.read(reinterpret_cast<char*>(rawData.data()), numElements * sizeof(uint8_t));
  82. mData.resize(numElements);
  83. std::transform(
  84. rawData.begin(), rawData.end(), mData.begin(), [](uint8_t val) { return static_cast<float>(val) / 255.f; });
  85. }
  86. void readLabelsFile(const std::string& labelsFilePath)
  87. {
  88. std::ifstream file{labelsFilePath.c_str(), std::ios::binary};
  89. int magicNumber, numImages;
  90. file.read(reinterpret_cast<char*>(&magicNumber), sizeof(magicNumber));
  91. // All values in the MNIST files are big endian.
  92. magicNumber = samplesCommon::swapEndianness(magicNumber);
  93. assert(magicNumber == 2049 && "Magic Number does not match the expected value for an MNIST labels file");
  94. file.read(reinterpret_cast<char*>(&numImages), sizeof(numImages));
  95. numImages = samplesCommon::swapEndianness(numImages);
  96. std::vector<uint8_t> rawLabels(numImages);
  97. file.read(reinterpret_cast<char*>(rawLabels.data()), numImages * sizeof(uint8_t));
  98. mLabels.resize(numImages);
  99. std::transform(
  100. rawLabels.begin(), rawLabels.end(), mLabels.begin(), [](uint8_t val) { return static_cast<float>(val); });
  101. }
  102. int mBatchSize{0};
  103. int mBatchCount{0}; //!< The batch that will be read on the next invocation of next()
  104. int mMaxBatches{0};
  105. Dims mDims{};
  106. std::vector<float> mData{};
  107. std::vector<float> mLabels{};
  108. };
  • 通过构造函数获得calibration数据的BatchSize和最大Batch数(maxBatches)
  • 通过readDataFile读取calibration数据集的数据文件,包括文件中共有多少Image,每个Image的height和width,计算总共要读取的数据量numElements = numImages * imageH * imageW,将文件的数据读取出来,转换为float并除以255做归一化后保存到mData
  • 通过readLabelsFile读取calibration数据集的labels文件,过程与readDataFile类似,只是将读取的数据(转换为float后)保存到mLabels
  • next()函数判断是否还有下一个Batch:如果mBatchCount已经达到mMaxBatches则返回false,否则将mBatchCount加1并返回true
  • getBatch 返回当前BatchCount对应的数据指针,即数据mData偏移(mBatchCount * mBatchSize * samplesCommon::volume(mDims)) 的位置
  • getLabels() 返回当前BatchCount对应的label指针,即mLabels.data() 偏移 (mBatchCount * mBatchSize) 的位置
  1. template <typename TBatchStream>
  2. class EntropyCalibratorImpl
  3. {
  4. public:
  5. EntropyCalibratorImpl(
  6. TBatchStream stream, int firstBatch, std::string networkName, const char* inputBlobName, bool readCache = true)
  7. : mStream{stream}
  8. , mCalibrationTableName("CalibrationTable" + networkName)
  9. , mInputBlobName(inputBlobName)
  10. , mReadCache(readCache)
  11. {
  12. nvinfer1::Dims dims = mStream.getDims();
  13. mInputCount = samplesCommon::volume(dims) * mStream.getBatchSize();
  14. CHECK(cudaMalloc(&mDeviceInput, mInputCount * sizeof(float)));
  15. mStream.reset(firstBatch);
  16. }
  17. virtual ~EntropyCalibratorImpl()
  18. {
  19. CHECK(cudaFree(mDeviceInput));
  20. }
  21. int getBatchSize() const
  22. {
  23. return mStream.getBatchSize();
  24. }
  25. bool getBatch(void* bindings[], const char* names[], int nbBindings)
  26. {
  27. if (!mStream.next())
  28. {
  29. return false;
  30. }
  31. CHECK(cudaMemcpy(mDeviceInput, mStream.getBatch(), mInputCount * sizeof(float), cudaMemcpyHostToDevice));
  32. assert(!strcmp(names[0], mInputBlobName));
  33. bindings[0] = mDeviceInput;
  34. return true;
  35. }
  36. const void* readCalibrationCache(size_t& length)
  37. {
  38. mCalibrationCache.clear();
  39. std::ifstream input(mCalibrationTableName, std::ios::binary);
  40. input >> std::noskipws;
  41. if (mReadCache && input.good())
  42. {
  43. std::copy(std::istream_iterator<char>(input), std::istream_iterator<char>(),
  44. std::back_inserter(mCalibrationCache));
  45. }
  46. length = mCalibrationCache.size();
  47. return length ? mCalibrationCache.data() : nullptr;
  48. }
  49. void writeCalibrationCache(const void* cache, size_t length)
  50. {
  51. std::ofstream output(mCalibrationTableName, std::ios::binary);
  52. output.write(reinterpret_cast<const char*>(cache), length);
  53. }
  54. private:
  55. TBatchStream mStream;
  56. size_t mInputCount;
  57. std::string mCalibrationTableName;
  58. const char* mInputBlobName;
  59. bool mReadCache{true};
  60. void* mDeviceInput{nullptr};
  61. std::vector<char> mCalibrationCache;
  62. };
  63. //! \class Int8EntropyCalibrator2
  64. //!
  65. //! \brief Implements Entropy calibrator 2.
  66. //! CalibrationAlgoType is kENTROPY_CALIBRATION_2.
  67. //!
  68. template <typename TBatchStream>
  69. class Int8EntropyCalibrator2 : public IInt8EntropyCalibrator2
  70. {
  71. public:
  72. Int8EntropyCalibrator2(
  73. TBatchStream stream, int firstBatch, const char* networkName, const char* inputBlobName, bool readCache = true)
  74. : mImpl(stream, firstBatch, networkName, inputBlobName, readCache)
  75. {
  76. }
  77. int getBatchSize() const override
  78. {
  79. return mImpl.getBatchSize();
  80. }
  81. bool getBatch(void* bindings[], const char* names[], int nbBindings) override
  82. {
  83. return mImpl.getBatch(bindings, names, nbBindings);
  84. }
  85. const void* readCalibrationCache(size_t& length) override
  86. {
  87. return mImpl.readCalibrationCache(length);
  88. }
  89. void writeCalibrationCache(const void* cache, size_t length) override
  90. {
  91. mImpl.writeCalibrationCache(cache, length);
  92. }
  93. private:
  94. EntropyCalibratorImpl<TBatchStream> mImpl;
  95. };

  • TensorRT需要应用程序实现calibration的interface IInt8Calibrator, 其中IInt8EntropyCalibrator2是它的派生类,所以在本示例程序中通过Int8EntropyCalibrator2实现了IInt8EntropyCalibrator2,而Int8EntropyCalibrator2则是把调用转发给EntropyCalibratorImpl类,由后者实现接口需要提供的几个方法(getBatchSize、getBatch、readCalibrationCache、writeCalibrationCache),具体说明如下:
  • 构造函数中通过mInputCount = samplesCommon::volume(dims) * mStream.getBatchSize(); 计算每一个input batch中calibration数据个数,然后通过cudaMalloc(&mDeviceInput, mInputCount * sizeof(float)) 在GPU Device上分配保存input数据的存储空间
  • 在getBatch方法中,通过cudaMemcpy(mDeviceInput, mStream.getBatch(), mInputCount * sizeof(float), cudaMemcpyHostToDevice) 将BatchStream提供的一次 batch数据从host 传送到device端
  • 在readCalibrationCache/writeCalibrationCache方法中,将网络每一层校准后得到的Calibration阈值保存到CalibrationTable以便后续读取重复使用
  1. bool SampleINT8::constructNetwork(SampleUniquePtr<nvinfer1::IBuilder>& builder,
  2. SampleUniquePtr<nvinfer1::INetworkDefinition>& network, SampleUniquePtr<nvinfer1::IBuilderConfig>& config,
  3. SampleUniquePtr<nvcaffeparser1::ICaffeParser>& parser, DataType dataType)
  4. {
  5. ......
  6. if (mParams.dlaCore >= 0)
  7. {
  8. samplesCommon::enableDLA(builder.get(), config.get(), mParams.dlaCore);
  9. if (mParams.batchSize > builder->getMaxDLABatchSize())
  10. {
  11. gLogError << "Requested batch size " << mParams.batchSize << " is greater than the max DLA batch size of "
  12. << builder->getMaxDLABatchSize() << ". Reducing batch size accordingly." << std::endl;
  13. return false;
  14. }
  15. }
  16. mEngine = std::shared_ptr<nvinfer1::ICudaEngine>(
  17. builder->buildEngineWithConfig(*network, *config), samplesCommon::InferDeleter());
  18. if (!mEngine)
  19. {
  20. return false;
  21. }
  22. return true;
  23. }
  • 根据程序输入参数判断是否要enable NV DLA硬件加速
  • 创建ICudaEngine用于后续网络推理过程

 

推理(Infer)过程

  1. bool SampleINT8::infer(std::pair<float, float>& score, int firstScoreBatch, int nbScoreBatches)
  2. {
  3. float ms{0.0f};
  4. // Create RAII buffer manager object
  5. samplesCommon::BufferManager buffers(mEngine, mParams.batchSize);
  6. auto context = SampleUniquePtr<nvinfer1::IExecutionContext>(mEngine->createExecutionContext());
  7. if (!context)
  8. {
  9. return false;
  10. }
  11. MNISTBatchStream batchStream(
  12. mParams.batchSize, nbScoreBatches, "train-images-idx3-ubyte", "train-labels-idx1-ubyte", mParams.dataDirs);
  13. batchStream.skip(firstScoreBatch);
  14. Dims outputDims = context->getEngine().getBindingDimensions(
  15. context->getEngine().getBindingIndex(mParams.outputTensorNames[0].c_str()));
  16. int outputSize = samplesCommon::volume(outputDims);
  17. int top1{0}, top5{0};
  18. float totalTime{0.0f};
  19. ......
  • 构建BufferManager 分配Host与Device的存储空间供推理过程输入网络input数据和获取网络output数据,关于BufferManager数据结构的详细分析请参考我的博文  TensorRT sampleMNIST 详解
  • 创建推理需要的IExecutionContext对象
  • 创建Score数据集的BatchStream,用于获取Score数据集的data与label数据
  • 获取网络output Tensor的维度outputDims,计算出outputSize的大小
  1. bool SampleINT8::processInput(const samplesCommon::BufferManager& buffers, const float* data)
  2. {
  3. // Fill data buffer
  4. float* hostDataBuffer = static_cast<float*>(buffers.getHostBuffer(mParams.inputTensorNames[0]));
  5. std::memcpy(hostDataBuffer, data, mParams.batchSize * samplesCommon::volume(mInputDims) * sizeof(float));
  6. return true;
  7. }
  8. ......
  9. bool SampleINT8::infer(std::pair<float, float>& score, int firstScoreBatch, int nbScoreBatches)
  10. {
  11. ......
  12. while (batchStream.next())
  13. {
  14. // Read the input data into the managed buffers
  15. assert(mParams.inputTensorNames.size() == 1);
  16. if (!processInput(buffers, batchStream.getBatch()))
  17. {
  18. return false;
  19. }
  20. // Memcpy from host input buffers to device input buffers
  21. buffers.copyInputToDevice();
  22. cudaStream_t stream;
  23. CHECK(cudaStreamCreate(&stream));
  24. // Use CUDA events to measure inference time
  25. cudaEvent_t start, end;
  26. CHECK(cudaEventCreateWithFlags(&start, cudaEventBlockingSync));
  27. CHECK(cudaEventCreateWithFlags(&end, cudaEventBlockingSync));
  28. cudaEventRecord(start, stream);
  29. bool status = context->enqueue(mParams.batchSize, buffers.getDeviceBindings().data(), stream, nullptr);
  30. if (!status)
  31. {
  32. return false;
  33. }
  34. cudaEventRecord(end, stream);
  35. cudaEventSynchronize(end);
  36. cudaEventElapsedTime(&ms, start, end);
  37. cudaEventDestroy(start);
  38. cudaEventDestroy(end);
  39. totalTime += ms;
  40. // Memcpy from device output buffers to host output buffers
  41. buffers.copyOutputToHost();
  42. CHECK(cudaStreamDestroy(stream));
  43. top1 += calculateScore(buffers, batchStream.getLabels(), mParams.batchSize, outputSize, 1);
  44. top5 += calculateScore(buffers, batchStream.getLabels(), mParams.batchSize, outputSize, 5);
  45. if (batchStream.getBatchesRead() % 100 == 0)
  46. {
  47. gLogInfo << "Processing next set of max 100 batches" << std::endl;
  48. }
  49. }
  50. int imagesRead = batchStream.getBatchesRead() * mParams.batchSize;
  51. score.first = float(top1) / float(imagesRead);
  52. score.second = float(top5) / float(imagesRead);
  53. gLogInfo << "Top1: " << score.first << ", Top5: " << score.second << std::endl;
  54. gLogInfo << "Processing " << imagesRead << " images averaged " << totalTime / imagesRead << " ms/image and "
  55. << totalTime / batchStream.getBatchesRead() << " ms/batch." << std::endl;
  56. return true;
  • while循环持续从score数据集合中按batchSize的大小读取一批input数据
  • 通过processInput函数将input数据copy到BufferManager的host 存储空间中
  • 通过buffers.copyInputToDevice(); 将BufferManager的host 存储空间数据传送到GPU device 存储空间,实现input数据的输入
  • 通过cudaStreamCreate 创建并行计算的stream对象
  • 通过cudaEventCreateWithFlags创建统计时间的start和end 对象
  • 通过context->enqueue 开始执行在score数据集上的网络推理过程
  • 通过cudaEventSynchronize同步等待cuda并行计算的推理过程结束
  • 通过cudaEventElapsedTime计算本次推理过程的时间
  • 通过buffers.copyOutputToHost(); 将推理的output从GPU device端传送到host 端
  • 通过calculateScore函数来计算本次推理过程的精度,具体过程如下
  1. int SampleINT8::calculateScore(
  2. const samplesCommon::BufferManager& buffers, float* labels, int batchSize, int outputSize, int threshold)
  3. {
  4. float* probs = static_cast<float*>(buffers.getHostBuffer(mParams.outputTensorNames[0]));
  5. int success = 0;
  6. for (int i = 0; i < batchSize; i++)
  7. {
  8. float *prob = probs + outputSize * i, correct = prob[(int) labels[i]];
  9. int better = 0;
  10. for (int j = 0; j < outputSize; j++)
  11. {
  12. if (prob[j] >= correct)
  13. {
  14. better++;
  15. }
  16. }
  17. if (better <= threshold)
  18. {
  19. success++;
  20. }
  21. }
  22. return success;
  23. }
  • probs与labels指针的关系如下图所示

  • 特别解释一下for循环中的better和success的计算规则
  • calculateScore函数本质上是计算在1次Batch推理结果中排Top1/Top5 的正确结果个数,其中Top1/Top5的计算规则如下图所示

  •  最终calculateScore函数返回一个BatchSize的 input image推理后,概率排名Top1/Top5 的正确output结果个数

 

  1. bool SampleINT8::infer(std::pair<float, float>& score, int firstScoreBatch, int nbScoreBatches)
  2. {
  3. ......
  4. int imagesRead = batchStream.getBatchesRead() * mParams.batchSize;
  5. score.first = float(top1) / float(imagesRead);
  6. score.second = float(top5) / float(imagesRead);
  7. gLogInfo << "Top1: " << score.first << ", Top5: " << score.second << std::endl;
  8. gLogInfo << "Processing " << imagesRead << " images averaged " << totalTime / imagesRead << " ms/image and "
  9. << totalTime / batchStream.getBatchesRead() << " ms/batch." << std::endl;
  10. return true;
  11. }
  • 总共向网络输入了imagesRead个image做推理,计算推理结果中Top1/Top5的正确结果个数占总输入image的比例输出到log,即如下的程序运行log
  1. &&&& RUNNING TensorRT.sample_int8 # ./sample_int8 mnist
  2. [I] FP32 run:400 batches of size 100 starting at 100
  3. [I] Processing next set of max 100 batches
  4. [I] Processing next set of max 100 batches
  5. [I] Processing next set of max 100 batches
  6. [I] Processing next set of max 100 batches
  7. [I] Top1: 0.9904, Top5: 1
  8. [I] Processing 40000 images averaged 0.00170236 ms/image and 0.170236 ms/batch.
  9. [I] FP16 run:400 batches of size 100 starting at 100
  10. [I] Processing next set of max 100 batches
  11. [I] Processing next set of max 100 batches
  12. [I] Processing next set of max 100 batches
  13. [I] Processing next set of max 100 batches
  14. [I] Top1: 0.9904, Top5: 1
  15. [I] Processing 40000 images averaged 0.00128872 ms/image and 0.128872 ms/batch.
  16. INT8 run:400 batches of size 100 starting at 100
  17. [I] Processing next set of max 100 batches
  18. [I] Processing next set of max 100 batches
  19. [I] Processing next set of max 100 batches
  20. [I] Processing next set of max 100 batches
  21. [I] Top1: 0.9908, Top5: 1
  22. [I] Processing 40000 images averaged 0.000946117 ms/image and 0.0946117 ms/batch.
  23. &&&& PASSED TensorRT.sample_int8 # ./sample_int8 mnist

 

资源释放

  1. int main(int argc, char** argv)
  2. {
  3. ......
  4. auto isApproximatelyEqual = [](float a, float b, double tolerance) { return (std::abs(a - b) <= tolerance); };
  5. double fp16tolerance{0.5}, int8tolerance{1.0};
  6. if (scores[1].first != 0.0f && !isApproximatelyEqual(scores[0].first, scores[1].first, fp16tolerance))
  7. {
  8. gLogError << "FP32(" << scores[0].first << ") and FP16(" << scores[1].first
  9. << ") Top1 accuracy differ by more than " << fp16tolerance << "." << std::endl;
  10. return gLogger.reportFail(sampleTest);
  11. }
  12. if (scores[2].first != 0.0f && !isApproximatelyEqual(scores[0].first, scores[2].first, int8tolerance))
  13. {
  14. gLogError << "FP32(" << scores[0].first << ") and Int8(" << scores[2].first
  15. << ") Top1 accuracy differ by more than " << int8tolerance << "." << std::endl;
  16. return gLogger.reportFail(sampleTest);
  17. }
  18. if (scores[1].second != 0.0f && !isApproximatelyEqual(scores[0].second, scores[1].second, fp16tolerance))
  19. {
  20. gLogError << "FP32(" << scores[0].second << ") and FP16(" << scores[1].second
  21. << ") Top5 accuracy differ by more than " << fp16tolerance << "." << std::endl;
  22. return gLogger.reportFail(sampleTest);
  23. }
  24. if (scores[2].second != 0.0f && !isApproximatelyEqual(scores[0].second, scores[2].second, int8tolerance))
  25. {
  26. gLogError << "FP32(" << scores[0].second << ") and INT8(" << scores[2].second
  27. << ") Top5 accuracy differ by more than " << int8tolerance << "." << std::endl;
  28. return gLogger.reportFail(sampleTest);
  29. }
  30. if (!sample.teardown())
  31. {
  32. return gLogger.reportFail(sampleTest);
  33. }
  34. return gLogger.reportPass(sampleTest);
  35. }
  • 计算不同dataType中Top1和Top5的精度差异是否符合预设的阈值xxxtolerance
  • 通过teardown释放程序的资源
本文内容由网友自发贡献,转载请注明出处:https://www.wpsshop.cn/article/detail/57559
推荐阅读
  

闽ICP备14008679号