A stopgap C++ deployment option: libtorch with TorchScript

I wrote a libtorch tutorial a long time ago, back when torch had just reached version 1.0. Torch is now at 2.2.1, and libtorch, one of the earliest C++ deployment options, still works (the backward compatibility is impressive). Even in the PyTorch 2.0 era, libtorch's infrastructure and TorchScript can still pull their weight in AOT scenarios.

I have accumulated some libtorch notes and finally found time to clean them up and publish them. This post mostly covers how libtorch and TorchScript are used in common deployment scenarios; a follow-up will cover practical scenarios that combine them with AOTInductor.

What libtorch can do

In deployment work there are plenty of situations where you need to run a model, or the pre- and post-processing around it, from C++ without dragging in a Python runtime. That is the niche libtorch fills: it exposes the tensor library and the TorchScript runtime through a C++ API.

PyTorch C++ API

  • ATen: The foundational tensor and mathematical operation library on which all else is built.
  • Autograd: Augments ATen with automatic differentiation.
  • C++ Frontend: High level constructs for training and evaluation of machine learning models.
  • TorchScript: An interface to the TorchScript JIT compiler and interpreter.
  • C++ Extensions: A means of extending the Python API with custom C++ and CUDA routines.
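TorchScript is the part this post leans on most: a model is scripted or traced and saved in Python, then loaded and executed from C++ through torch::jit::load. A minimal sketch, with the model path and input shape as placeholders:

#include <torch/script.h>
#include <iostream>

int main() {
    // Load a module exported from Python with torch.jit.script(model).save("model.pt")
    torch::jit::script::Module module = torch::jit::load("model.pt");
    module.eval();

    // forward() takes a vector of IValues and returns an IValue
    std::vector<torch::jit::IValue> inputs;
    inputs.push_back(torch::rand({1, 3, 224, 224}));

    at::Tensor output = module.forward(inputs).toTensor();
    std::cout << output.sizes() << std::endl;
    return 0;
}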

How to write libtorch code

No more hair-pulling with libtorch: the interactive C++ debugging notebook I use

Export the post-processing ops you need

A cling-based C++ notebook is a good way to experiment with these APIs hands-on.
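For a rough idea of what such a notebook cell looks like, cling can pull libtorch in with its pragmas. The include and library paths below are placeholders for wherever your libtorch distribution is unpacked, and some setups need additional libraries loaded as well:

// Paths are hypothetical; adjust them to your libtorch install
#pragma cling add_include_path("/opt/libtorch/include")
#pragma cling add_include_path("/opt/libtorch/include/torch/csrc/api/include")
#pragma cling add_library_path("/opt/libtorch/lib")
#pragma cling load("libc10")
#pragma cling load("libtorch_cpu")

#include <torch/torch.h>
#include <iostream>

// Evaluate small snippets interactively instead of recompiling a whole project
auto t = torch::arange(6).reshape({2, 3});
std::cout << t << std::endl;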

IValue shows up everywhere in this API; its doc comment in the PyTorch source sums it up well:

/// IValue (Interpreter Value) is a tagged union over the types
/// supported by the TorchScript interpreter. IValues contain their
/// values as an IValue::Payload, which holds primitive types
/// (int64_t, bool, double, Device) and Tensor as values,
/// and all other types as a c10::intrusive_ptr. In order to
/// optimize performance of the destructor and related operations by
/// making the Tensor and c10::intrusive_ptr paths generate the
/// same code, we represent a null c10::intrusive_ptr as
/// UndefinedTensorImpl::singleton(), not nullptr.
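In practice you mostly meet IValue at the boundary of forward(): inputs are passed as a vector of IValues and the result comes back as a single IValue that you unpack. A minimal sketch, assuming a module loaded as in the snippet above:

// `module` is assumed to be a torch::jit::script::Module loaded earlier
std::vector<torch::jit::IValue> inputs;
inputs.push_back(torch::ones({1, 3, 32, 32}));  // Tensor converts to IValue implicitly

torch::jit::IValue out = module.forward(inputs);

// Unpack according to what the scripted forward() actually returns
if (out.isTensor()) {
    at::Tensor t = out.toTensor();
} else if (out.isTuple()) {
    auto elems = out.toTuple()->elements();
    at::Tensor first = elems[0].toTensor();
}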

Tensor memory operations

You can build a Tensor directly from a raw void* data pointer with torch::from_blob:

// Wrap existing memory (here a cv::Mat's data pointer) without copying.
// Note: cv::Mat stores pixels as HWC/interleaved, so a {1, 3, rows, cols} shape
// only makes sense if the buffer really is CHW; see the HWC example further below.
at::Tensor tensor_image = torch::from_blob(image.data, {1, 3, image.rows, image.cols}, at::kByte);
tensor_image = tensor_image.to(at::kFloat);


// A plain float buffer can be wrapped the same way; input_shape is assumed
// to be {1, 17, 96, 72}, matching the allocation above
std::unique_ptr<float[]> outputData(new float[1 * 17 * 96 * 72]);
auto res_point = torch::from_blob(outputData.get(), {input_shape[0], input_shape[1], input_shape[2], input_shape[3]});

Here, I assume that image.data is 8-bit byte values. The to(at::kFloat) will convert the 8-bit values into 32-bit floating points just as if you wrote static_cast<float>(b) where b is a byte, just in case that wasn't clear. If image.data is already floats, you can just write at::kFloat in place of at::kByte and skip the conversion, of course.

What's super important to know is that from_blob does not take ownership of the data! It only interprets the data as a tensor, but doesn't store the data itself. It's easy to fix this if you want to, by calling .clone() on the tensor, since that will incur a copy of the data such that the resulting tensor will indeed own its data (which means the original cv::Mat can be destroyed and the cloned tensor will live on).
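A short sketch of that ownership pitfall and the .clone() fix (the helper function and path handling are just illustrative):

#include <opencv2/opencv.hpp>
#include <torch/torch.h>

at::Tensor load_as_tensor(const std::string& path) {
    cv::Mat image = cv::imread(path);
    // from_blob only wraps image.data; the view is valid only while `image` is alive
    at::Tensor view = torch::from_blob(image.data, {image.rows, image.cols, 3}, at::kByte);
    // clone() copies the data, so the returned tensor owns its own memory
    // and survives `image` going out of scope here
    return view.clone();
}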

On the other side, it's actually easier. You can use tensor.data_ptr<T>() to access a tensor's underlying data through a T*. For example, tensor_image.data_ptr<float>() would give you a float*. If you want a more raw void* because you're dumping the raw data somewhere else, there's also an untyped data_ptr() overload that gives you a raw byte pointer.


// Suppose the incoming buffer already lives on the GPU
void* input_buffer = ...;
auto options = torch::TensorOptions().device(at::kCUDA);
auto detections = torch::from_blob(input_buffer, {input_shape[0], input_shape[1], input_shape[2]}, options);
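Two details are easy to forget with this pattern: from_blob defaults to kFloat, so the dtype must be set explicitly when the buffer holds anything else, and the resulting tensor still does not own the device memory. A small sketch, with the fp16 buffer and its {1, 100, 6} shape made up for illustration:

#include <torch/torch.h>

// Stand-in for a buffer written by an upstream CUDA kernel: here we simply
// reuse the raw pointer of a tensor we allocated ourselves
auto producer = torch::zeros({1, 100, 6}, torch::dtype(torch::kHalf).device(at::kCUDA));
void* fp16_buffer = producer.data_ptr();

// from_blob defaults to kFloat, so the dtype has to be stated explicitly here
auto half_options = torch::TensorOptions().dtype(torch::kHalf).device(at::kCUDA);
auto scores = torch::from_blob(fp16_buffer, {1, 100, 6}, half_options);

// The view does not own the device memory; copy out before the producer reuses it
auto scores_cpu = scores.to(torch::kFloat).cpu();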

Basic Tensor operations

F::interpolate

cv::Mat image = cv::imread("/home/lll/Pictures/test.jpg");


torch::Tensor image_tensor = torch::from_blob(image.data, {image.rows, image.cols, 3}, torch::kByte);

image_tensor = image_tensor.permute({2, 0, 1}).toType(torch::kFloat).div_(255);
image_tensor.sub_(0.5).div_(0.5);
image_tensor = image_tensor.unsqueeze(0);
image_tensor = image_tensor.to(torch::kCUDA);
image_tensor = image_tensor.contiguous();  // required: permute() only changes strides, leaving the data non-contiguous

namespace F = torch::nn::functional;
image_tensor = F::interpolate(
        image_tensor,
        F::InterpolateFuncOptions()
                .mode(torch::kBilinear)
                .size(std::vector<int64_t>({512, 512}))
                .align_corners(true)
);
image_tensor = image_tensor.mul(0.5).add(0.5).mul(255);
image_tensor = image_tensor.squeeze(0).permute({1, 2, 0}).toType(torch::kByte).to(torch::kCPU);

cv::Mat test_mat(512, 512, CV_8UC3);
// sizeof(torch::kU8) is the size of the ScalarType enum, not of the element type;
// use numel() * element_size() for the byte count
std::memcpy((void *) test_mat.data, image_tensor.data_ptr(), image_tensor.numel() * image_tensor.element_size());
cv::imshow("test", test_mat);
cv::waitKey(0);

How to debug

A trick that saves a lot of time: on the Python side, stash the tensors you care about as attributes of a plain nn.Module, script and save it, then load the same file from C++ and pull the tensors back out with attr(). That gives you byte-identical reference data to compare your C++ results against.

import torch

class Container(torch.nn.Module):
    def __init__(self, my_values):
        super().__init__()
        # Register every tensor as an attribute so torch.jit.script serializes it
        for key in my_values:
            setattr(self, key, my_values[key])

my_values = {
    'res': prediction.cpu()
}

container = torch.jit.script(Container(my_values))
container.save("container.pt")

torch::jit::script::Module container = torch::jit::load("/data/yanzong/code/data/debug/container.pt");
auto res = container.attr("res").toTensor();

torch::jit::script::Module container = torch::jit::load("container_jit.pt");
auto input_f = container.attr("input").toTensor();
auto input_o = container.attr("offset").toTensor();
auto input_m = container.attr("mask").toTensor();
auto input_w = container.attr("weight").toTensor();
auto input_b = container.attr("bias").toTensor();
auto res = container.attr("res").toTensor();

CUDA_CHECK(cudaMalloc(&inputs[0], sizeof(half)*input_feature_len));
CUDA_CHECK(cudaMalloc(&inputs[1], sizeof(half)*input_offset_len));
CUDA_CHECK(cudaMalloc(&inputs[2], sizeof(half)*input_mask_len));
CUDA_CHECK(cudaMalloc(&inputs[3], sizeof(half)*weight_len));
CUDA_CHECK(cudaMalloc(&inputs[4], sizeof(half)*bias_len));

// Cast the reference tensors to fp16 to match the plugin's data type
// (at::_cast_Half is an internal ATen op; .to(at::kHalf) is the usual spelling)
auto input_f_half = at::_cast_Half(input_f);
auto input_o_half = at::_cast_Half(input_o);
auto input_m_half = at::_cast_Half(input_m);
auto input_w_half = at::_cast_Half(input_w);
auto input_b_half = at::_cast_Half(input_b);
// std::cout << input_f_half.element_size();  // 2

CUDA_CHECK(cudaMemcpyAsync(inputs[0], input_f_half.data_ptr(), sizeof(half)*input_feature_len, cudaMemcpyHostToDevice, stream));
CUDA_CHECK(cudaMemcpyAsync(inputs[1], input_o_half.data_ptr(), sizeof(half)*input_offset_len, cudaMemcpyHostToDevice,stream));
CUDA_CHECK(cudaMemcpyAsync(inputs[2], input_m_half.data_ptr(), sizeof(half)*input_mask_len, cudaMemcpyHostToDevice,stream));
CUDA_CHECK(cudaMemcpyAsync(inputs[3], input_w_half.data_ptr(), sizeof(half)*weight_len, cudaMemcpyHostToDevice, stream));
CUDA_CHECK(cudaMemcpyAsync(inputs[4], input_b_half.data_ptr(), sizeof(half)*bias_len, cudaMemcpyHostToDevice,stream));

...

CUDA_CHECK(cudaMalloc(&outputs[0], sizeof(half)*1*4*4*4));
CUDA_CHECK(cudaMalloc(&workspace, sizeof(float)*workspace_len));
 
DataType datatype = DataType::kHALF;

std::cout << "enqueue_call\n";
enqueue_call(reinterpret_cast<void**>(inputs), reinterpret_cast<void**>(outputs), workspace, stream, datatype, dcnparam);   

// float* res_cuda = new float[1*4*4*4];
// CUDA_CHECK(cudaMemcpyAsync(res_cuda, outputs[0], sizeof(float)*4*4*4, cudaMemcpyDeviceToHost,stream));
// torch::Tensor f = torch::from_blob(res_cuda, {1,4,4,4});
// std::cout << f << "\n";

half* res_cuda = new half[1*4*4*4];
CUDA_CHECK(cudaMemcpyAsync(res_cuda, outputs[0], sizeof(half)*4*4*4, cudaMemcpyDeviceToHost, stream));
// Make sure the async copy has finished before the host buffer is read
CUDA_CHECK(cudaStreamSynchronize(stream));
torch::Tensor f = torch::from_blob(res_cuda, {1,4,4,4}, torch::kHalf);
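With the plugin output f and the Python-side reference res both available as tensors, the comparison itself is just a couple of ops. A minimal sketch, assuming res loaded from the container above already has the matching {1,4,4,4} shape (the tolerances are arbitrary):

auto out_f = f.to(torch::kFloat);
auto ref_f = res.to(torch::kFloat);
std::cout << "max abs diff: " << (out_f - ref_f).abs().max().item<float>() << "\n";
std::cout << "allclose: "
          << torch::allclose(out_f, ref_f, /*rtol=*/1e-3, /*atol=*/1e-3) << "\n";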

This article is still being updated.

References