TensorRT学习笔记

编程入门 行业动态 更新时间:2024-10-11 15:23:32

TensorRT<a href=https://www.elefans.com/category/jswz/34/1770117.html style=学习笔记"/>

TensorRT学习笔记

目录

1--Tensor算子API

1-1--卷积算子

1-2--激活算子

1-3--池化算子

1-4--FC层算子

2--代码实例

3--编译运行


1--Tensor算子API

TensorRT提供了卷积层、激活函数和池化层三种最常用算子的API:

// 创建一个空的网络
nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0U); // 添加卷积层算子
nvinfer1::IConvolutionLayer* conv1 = network->addConvolutionNd(*data, 64, nvinfer1::DimsHW{3, 3}, weightMap["features.0.weight"], weightMap["features.0.bias"]);// 添加激活算子
nvinfer1::IActivationLayer* relu1 = network->addActivation(*conv1->getOutput(0), nvinfer1::ActivationType::kRELU);// 添加池化算子
nvinfer1::IPoolingLayer* pool1 = network->addPoolingNd(*relu1->getOutput(0), nvinfer1::PoolingType::kMAX, nvinfer1::DimsHW{2, 2});

1-1--卷积算子

IConvolutionLayer* addConvolutionNd(ITensor& input, int32_t nbOutputMaps, Dims kernelSize, Weights kernelWeights, Weights biasWeights
)

第一个参数表示输入的Tensor数据;

第二个参数表示卷积层输出的特征图数,即通道数channel;

第三个参数表示使用的卷积核大小;

第四个参数和第五个参数表示加载的权重;

1-2--激活算子

IActivationLayer* addActivation(ITensor& input, ActivationType type
)

第一个参数表示输入的Tensor数据;

第二个参数表示使用的激活函数类型,包括以下激活函数:

enum class ActivationType : int32_t
{kRELU = 0,             //!< Rectified linear activation.kSIGMOID = 1,          //!< Sigmoid activation.kTANH = 2,             //!< TanH activation.kLEAKY_RELU = 3,       //!< LeakyRelu activation: x>=0 ? x : alpha * x.kELU = 4,              //!< Elu activation: x>=0 ? x : alpha * (exp(x) - 1).kSELU = 5,             //!< Selu activation: x>0 ? beta * x : beta * (alpha*exp(x) - alpha)kSOFTSIGN = 6,         //!< Softsign activation: x / (1+|x|)kSOFTPLUS = 7,         //!< Parametric softplus activation: alpha*log(exp(beta*x)+1)kCLIP = 8,             //!< Clip activation: max(alpha, min(beta, x))kHARD_SIGMOID = 9,     //!< Hard sigmoid activation: max(0, min(1, alpha*x+beta))kSCALED_TANH = 10,     //!< Scaled tanh activation: alpha*tanh(beta*x)kTHRESHOLDED_RELU = 11 //!< Thresholded ReLU activation: x>alpha ? x : 0
};

1-3--池化算子

IPoolingLayer* addPoolingNd(ITensor& input, PoolingType type, Dims windowSize
)

第一个参数表示输入的Tensor数据;

第二个参数表示使用的池化类型;

第三个参数表示池化窗口的大小;

提供的池化类型包括:

enum class PoolingType : int32_t
{kMAX = 0,              // Maximum over elementskAVERAGE = 1,          // Average over elements. If the tensor is padded, the count includes the paddingkMAX_AVERAGE_BLEND = 2 // Blending between max and average pooling: (1-blendFactor)*maxPool + blendFactor*avgPool
};

1-4--FC层算子

IFullyConnectedLayer* addFullyConnected(ITensor& input, int32_t nbOutputs, Weights kernelWeights, Weights biasWeights
)

第一个参数表示输入的Tensor数据;

第二个参数表示输出的通道数;

第三个参数和第四个参数表示加载的权重;

2--代码实例

基于算子 API 搭建 VGG11:(完整可运行的代码参考:liujf69/TensorRT-Demo)

核心程序代码:

// 创建builder和config
nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(gLogger);
nvinfer1::IBuilderConfig* config = builder->createBuilderConfig();// 基于builder创建network
nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0U); // 一开始是空的// 调用API搭建Network
// 创建输入
nvinfer1::ITensor* data = network->addInput(this->INPUT_BLOB_NAME, dt, nvinfer1::Dims3{3, this->INPUT_H, this->INPUT_W});
// 搭建卷积层
nvinfer1::IConvolutionLayer* conv1 = network->addConvolutionNd(*data, 64, nvinfer1::DimsHW{3, 3}, weightMap["features.0.weight"], weightMap["features.0.bias"]);
conv1->setPaddingNd(nvinfer1::DimsHW{1, 1});
// 搭建激活层
nvinfer1::IActivationLayer* relu1 = network->addActivation(*conv1->getOutput(0), nvinfer1::ActivationType::kRELU);
// 搭建池化层
nvinfer1::IPoolingLayer* pool1 = network->addPoolingNd(*relu1->getOutput(0), nvinfer1::PoolingType::kMAX, nvinfer1::DimsHW{2, 2});
pool1->setStrideNd(nvinfer1::DimsHW{2, 2});
...
// 搭建FC层
nvinfer1::IFullyConnectedLayer* fc1 = network->addFullyConnected(*pool1->getOutput(0), 4096, weightMap["classifier.0.weight"], weightMap["classifier.0.bias"]);
...// 基于config和network生成engine
builder->setMaxBatchSize(maxBatchSize);
config->setMaxWorkspaceSize(1 << 20);
nvinfer1::ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
...

主程序代码:

#include "NvInfer.h"
#include "cuda_runtime_api.h"
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <vector>
#include <chrono>
#include "logging.h"
#include <iostream>#define CHECK(status) \do\{\auto ret = (status);\if (ret != 0)\{\std::cerr << "Cuda failure: " << ret << std::endl;\abort();\}\} while (0)static Logger gLogger; // 日志class VGG_Demo{
public:VGG_Demo(){this->prob = new float[OUTPUT_SIZE];}~VGG_Demo(){delete[] prob;}int serialize();void APIToModel(unsigned int maxBatchSize, nvinfer1::IHostMemory** modelStream);nvinfer1::ICudaEngine* createEngine(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt);std::map<std::string, nvinfer1::Weights> loadWeights(const std::string file);void doInference(nvinfer1::IExecutionContext& context, float* input, float* output, int batchSize);void deserialize(float* data);void load_engine();const char* INPUT_BLOB_NAME = "data"; // 输入名称const char* OUTPUT_BLOB_NAME = "prob"; // 输出名称const int INPUT_H = 224; // 输入数据高度const int INPUT_W = 224; // 输入数据宽度const int OUTPUT_SIZE = 1000; // 输出大小std::string engine_file = "./vgg.engine";char* trtModelStream = nullptr;float* prob = nullptr;size_t size = 0;
};int VGG_Demo::serialize(){nvinfer1::IHostMemory* modelStream  = nullptr;this->APIToModel(1, &modelStream); // 调用API构建networkassert(modelStream != nullptr);// 保存std::ofstream p("./vgg.engine", std::ios::binary);if (!p) {std::cerr << "could not open plan output file" << std::endl;return -1;}p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());modelStream->destroy();return 1;
}void VGG_Demo::APIToModel(unsigned int maxBatchSize, nvinfer1::IHostMemory** modelStream){// 创建builder和confignvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(gLogger);nvinfer1::IBuilderConfig* config = builder->createBuilderConfig();nvinfer1::ICudaEngine* engine = this->createEngine(maxBatchSize, builder, config, nvinfer1::DataType::kFLOAT);assert(engine != nullptr);// 序列化*modelStream = engine->serialize();// 销毁engine->destroy();builder->destroy();config->destroy();
}nvinfer1::ICudaEngine* VGG_Demo::createEngine(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt){// 加载权重std::map<std::string, nvinfer1::Weights> weightMap = loadWeights("../weights/vgg.wts");nvinfer1::Weights emptywts{nvinfer1::DataType::kFLOAT, nullptr, 0};nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0U); // 创建一个空的networknvinfer1::ITensor* data = network->addInput(this->INPUT_BLOB_NAME, dt, nvinfer1::Dims3{3, this->INPUT_H, this->INPUT_W}); // 创建输入assert(data);// 使用卷积、激活和池化三种算子,按顺序连接三种算子,并用对应的权重初始化nvinfer1::IConvolutionLayer* conv1 = network->addConvolutionNd(*data, 64, nvinfer1::DimsHW{3, 3}, weightMap["features.0.weight"], weightMap["features.0.bias"]);assert(conv1);conv1->setPaddingNd(nvinfer1::DimsHW{1, 1});nvinfer1::IActivationLayer* relu1 = network->addActivation(*conv1->getOutput(0), nvinfer1::ActivationType::kRELU);assert(relu1);nvinfer1::IPoolingLayer* pool1 = network->addPoolingNd(*relu1->getOutput(0), nvinfer1::PoolingType::kMAX, nvinfer1::DimsHW{2, 2});assert(pool1);pool1->setStrideNd(nvinfer1::DimsHW{2, 2});conv1 = network->addConvolutionNd(*pool1->getOutput(0), 128, nvinfer1::DimsHW{3, 3}, weightMap["features.3.weight"], weightMap["features.3.bias"]);conv1->setPaddingNd(nvinfer1::DimsHW{1, 1});relu1 = network->addActivation(*conv1->getOutput(0), nvinfer1::ActivationType::kRELU);pool1 = network->addPoolingNd(*relu1->getOutput(0), nvinfer1::PoolingType::kMAX, nvinfer1::DimsHW{2, 2});pool1->setStrideNd(nvinfer1::DimsHW{2, 2});conv1 = network->addConvolutionNd(*pool1->getOutput(0), 256, nvinfer1::DimsHW{3, 3}, weightMap["features.6.weight"], weightMap["features.6.bias"]);conv1->setPaddingNd(nvinfer1::DimsHW{1, 1});relu1 = network->addActivation(*conv1->getOutput(0), nvinfer1::ActivationType::kRELU);conv1 = network->addConvolutionNd(*relu1->getOutput(0), 256, nvinfer1::DimsHW{3, 3}, weightMap["features.8.weight"], weightMap["features.8.bias"]);conv1->setPaddingNd(nvinfer1::DimsHW{1, 1});relu1 = network->addActivation(*conv1->getOutput(0), nvinfer1::ActivationType::kRELU);pool1 = network->addPoolingNd(*relu1->getOutput(0), nvinfer1::PoolingType::kMAX, nvinfer1::DimsHW{2, 2});pool1->setStrideNd(nvinfer1::DimsHW{2, 2});conv1 = network->addConvolutionNd(*pool1->getOutput(0), 512, nvinfer1::DimsHW{3, 3}, weightMap["features.11.weight"], weightMap["features.11.bias"]);conv1->setPaddingNd(nvinfer1::DimsHW{1, 1});relu1 = network->addActivation(*conv1->getOutput(0), nvinfer1::ActivationType::kRELU);conv1 = network->addConvolutionNd(*relu1->getOutput(0), 512, nvinfer1::DimsHW{3, 3}, weightMap["features.13.weight"], weightMap["features.13.bias"]);conv1->setPaddingNd(nvinfer1::DimsHW{1, 1});relu1 = network->addActivation(*conv1->getOutput(0), nvinfer1::ActivationType::kRELU);pool1 = network->addPoolingNd(*relu1->getOutput(0), nvinfer1::PoolingType::kMAX, nvinfer1::DimsHW{2, 2});pool1->setStrideNd(nvinfer1::DimsHW{2, 2});conv1 = network->addConvolutionNd(*pool1->getOutput(0), 512, nvinfer1::DimsHW{3, 3}, weightMap["features.16.weight"], weightMap["features.16.bias"]);conv1->setPaddingNd(nvinfer1::DimsHW{1, 1});relu1 = network->addActivation(*conv1->getOutput(0), nvinfer1::ActivationType::kRELU);conv1 = network->addConvolutionNd(*relu1->getOutput(0), 512, nvinfer1::DimsHW{3, 3}, weightMap["features.18.weight"], weightMap["features.18.bias"]);conv1->setPaddingNd(nvinfer1::DimsHW{1, 1});relu1 = network->addActivation(*conv1->getOutput(0), nvinfer1::ActivationType::kRELU);pool1 = network->addPoolingNd(*relu1->getOutput(0), nvinfer1::PoolingType::kMAX, nvinfer1::DimsHW{2, 2});pool1->setStrideNd(nvinfer1::DimsHW{2, 2});// 使用全连接层算子nvinfer1::IFullyConnectedLayer* fc1 = network->addFullyConnected(*pool1->getOutput(0), 4096, weightMap["classifier.0.weight"], weightMap["classifier.0.bias"]);assert(fc1);relu1 = network->addActivation(*fc1->getOutput(0), nvinfer1::ActivationType::kRELU);fc1 = network->addFullyConnected(*relu1->getOutput(0), 4096, weightMap["classifier.3.weight"], weightMap["classifier.3.bias"]);relu1 = network->addActivation(*fc1->getOutput(0), nvinfer1::ActivationType::kRELU);fc1 = network->addFullyConnected(*relu1->getOutput(0), 1000, weightMap["classifier.6.weight"], weightMap["classifier.6.bias"]);fc1->getOutput(0)->setName(OUTPUT_BLOB_NAME); // 设置输出名称network->markOutput(*fc1->getOutput(0)); // 标记输出// 生成enginebuilder->setMaxBatchSize(maxBatchSize);config->setMaxWorkspaceSize(1 << 20);nvinfer1::ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);std::cout << "build out" << std::endl;// 生成engine后释放networknetwork->destroy();// 释放权重内存for (auto& mem : weightMap) free((void*) (mem.second.values)); return engine;
}std::map<std::string, nvinfer1::Weights> VGG_Demo::loadWeights(const std::string file){std::cout << "Loading weights: " << file << std::endl;std::map<std::string, nvinfer1::Weights> weightMap; // 权重名称和权重类的哈希表std::ifstream input(file);assert(input.is_open() && "Unable to load weight file.");// 首先读取权重block的个数int32_t count;input >> count;assert(count > 0 && "Invalid weight map file.");// 遍历权重blockwhile (count--){nvinfer1::Weights wt{nvinfer1::DataType::kFLOAT, nullptr, 0}; // 初始化一个权重对象uint32_t size;// Read name and type of blobstd::string name;input >> name >> std::dec >> size; // std::dec表示使用十进制表示权重的sizewt.type = nvinfer1::DataType::kFLOAT; // 设置权重的类型// 拷贝权重值uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(val) * size));for (uint32_t x = 0, y = size; x < y; ++x){ // 拷贝size大小input >> std::hex >> val[x];}// 完成哈希映射wt.values = val;wt.count = size;weightMap[name] = wt;}return weightMap;
}void VGG_Demo::deserialize(float* data){load_engine(); // 加载enginenvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(gLogger);assert(runtime != nullptr);nvinfer1::ICudaEngine* engine = runtime->deserializeCudaEngine(this->trtModelStream, this->size);assert(engine != nullptr);nvinfer1::IExecutionContext* context = engine->createExecutionContext();assert(context != nullptr);delete[] this->trtModelStream; // 手动释放trtModelStream// 执行推理for (int i = 0; i < 10; i++){ // 记录推理10次的时间auto start = std::chrono::system_clock::now();doInference(*context, data, this->prob, 1);auto end = std::chrono::system_clock::now();std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;}// 销毁context->destroy();engine->destroy();runtime->destroy();// 打印推理结果std::cout << "\nOutput:\n\n";for (unsigned int i = 0; i < 10; i++){ // 打印10个std::cout << this->prob[i] << ", ";if (i % 10 == 0) std::cout << i / 10 << std::endl;}std::cout << std::endl;
}void VGG_Demo::load_engine(){std::ifstream file(this->engine_file, std::ios::binary);if(file.good()){file.seekg(0, file.end);this->size = file.tellg();file.seekg(0, file.beg);this->trtModelStream = new char[size];assert(this->trtModelStream);file.read(this->trtModelStream, size);file.close();}
}void VGG_Demo::doInference(nvinfer1::IExecutionContext& context, float* input, float* output, int batchSize){const nvinfer1::ICudaEngine& engine = context.getEngine();assert(engine.getNbBindings() == 2);void* buffers[2];const int inputIndex = engine.getBindingIndex(this->INPUT_BLOB_NAME);const int outputIndex = engine.getBindingIndex(this->OUTPUT_BLOB_NAME);CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * this->INPUT_H * this->INPUT_W * sizeof(float)));CHECK(cudaMalloc(&buffers[outputIndex], batchSize * this->OUTPUT_SIZE * sizeof(float)));// 创建streamcudaStream_t stream;CHECK(cudaStreamCreate(&stream));// Host to deviceCHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));context.enqueue(batchSize, buffers, stream, nullptr);// device to hostCHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));cudaStreamSynchronize(stream);// 释放cudaStreamDestroy(stream);CHECK(cudaFree(buffers[inputIndex]));CHECK(cudaFree(buffers[outputIndex]));
}int main(int argc, char** argv){// 判断参数是否准确if(argc != 2){std::cerr << "arguments not right!" << std::endl;std::cerr << "./vgg_demo -s   // serialize model to plan file" << std::endl;std::cerr << "./vgg_demo -d   // deserialize plan file and run inference" << std::endl;return -1;}VGG_Demo vgg_demo1;if(std::string(argv[1]) == "-s"){ // 序列化vgg_demo1.serialize();}else if(std::string(argv[1]) == "-d"){ // 反序列化并推理// 生成测试数据float data[3 * 224 * 224];for (int i = 0; i < 3 * 224 * 224; i++) data[i] = 1;vgg_demo1.deserialize(data);}else{std::cerr << "wrong arguments!" << std::endl;;return -1;}return 0;
}

3--编译运行

mkdir build && cd build
cmake ..
make ./vgg_demo -s
./vgg_demo -d

更多推荐

TensorRT学习笔记

本文发布于:2023-12-05 17:20:26,感谢您对本站的认可!
本文链接:https://www.elefans.com/category/jswz/34/1664820.html
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系,我们将在24小时内删除。
本文标签:学习笔记   TensorRT

发布评论

评论列表 (有 0 条评论)
草根站长

>www.elefans.com

编程频道|电子爱好者 - 技术资讯及电子产品介绍!