GPU memory allocation in triton-inference-server

  {OPTION_CUDA_MEMORY_POOL_BYTE_SIZE, "cuda-memory-pool-byte-size",
   "<integer>:<integer>",
   "The total byte size that can be allocated as CUDA memory for the GPU "
   "device. If GPU support is enabled, the server will allocate CUDA "
   "memory to minimize data transfer between host and devices until it "
   "exceeds the specified byte size. This option will not affect the "
   "allocation conducted by the backend frameworks. The argument should be "
   "2 integers separated by colons in the format "
   "<GPU device ID>:<pool byte size>. This option can be used multiple "
   "times, but only once per GPU device. Subsequent uses will overwrite "
   "previous uses for the same GPU device. Default is 64 MB."},

A CudaMemoryManager is created according to cuda-memory-pool-byte-size: the cuda_memory_pool_size_ map below is filled from the configured cuda-memory-pool-byte-size values, and any supported GPU without an explicit setting falls back to the default of 1 << 26 bytes (64 MB), matching the help text above:

#ifdef TRITON_ENABLE_GPU
  // Set the default CUDA memory pool size for GPUs where it is not
  // set explicitly.
  std::set<int> supported_gpus;
  if (GetSupportedGPUs(&supported_gpus, min_supported_compute_capability_)
          .IsOk()) {
    for (const auto gpu : supported_gpus) {
      if (cuda_memory_pool_size_.find(gpu) == cuda_memory_pool_size_.end()) {
        cuda_memory_pool_size_[gpu] = 1 << 26;
      }
    }
  }

  CudaMemoryManager::Options cuda_options(
      min_supported_compute_capability_, cuda_memory_pool_size_);
  status = CudaMemoryManager::Create(cuda_options);

  if (!status.IsOk()) {
    LOG_ERROR << status.Message();
  }
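With the per-GPU sizes filled in, CudaMemoryManager::Create initializes one CNMeM pool for every supported GPU whose configured pool size is non-zero: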
Status
CudaMemoryManager::Create(const CudaMemoryManager::Options& options)
{
  // Ensure thread-safe creation of CUDA memory pool
  std::lock_guard<std::mutex> lock(instance_mu_);
  if (instance_ != nullptr) {
    LOG_WARNING << "New CUDA memory pools could not be created since they "
                   "already exists";
    return Status::Success;
  }

  std::set<int> supported_gpus;
  auto status = GetSupportedGPUs(
      &supported_gpus, options.min_supported_compute_capability_);
  if (status.IsOk()) {
    std::vector<cnmemDevice_t> devices;
    for (auto gpu : supported_gpus) {
      const auto it = options.memory_pool_byte_size_.find(gpu);
      if ((it != options.memory_pool_byte_size_.end()) && (it->second != 0)) {
        devices.emplace_back();
        auto& device = devices.back();
        memset(&device, 0, sizeof(device));
        device.device = gpu;
        device.size = it->second;

        LOG_INFO << "CUDA memory pool is created on device " << device.device
                 << " with size " << device.size;
      }
    }

    if (!devices.empty()) {
      RETURN_IF_CNMEM_ERROR(
          cnmemInit(devices.size(), devices.data(), CNMEM_FLAGS_CANNOT_GROW),
          std::string("Failed to finalize CUDA memory manager"));
    } else {
      LOG_INFO << "CUDA memory pool disabled";
    }

    // Use to finalize CNMeM properly when out of scope
    instance_.reset(new CudaMemoryManager(!devices.empty()));
  } else {
    return Status(
        status.ErrorCode(),
        "Failed to initialize CUDA memory manager: " + status.Message());
  }

  return Status::Success;
}
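CudaMemoryManager::Alloc and Free are not shown above; conceptually they switch to the requested device and serve the request out of the CNMeM pool that cnmemInit created. Below is a minimal sketch of that pattern, assuming the public CNMeM API (cnmemMalloc/cnmemFree); the helper names PoolAlloc/PoolFree are hypothetical and this is not Triton's actual code:

#include <cuda_runtime_api.h>
#include "cnmem.h"

// Hand out a block from the CNMeM pool on the given device. With
// CNMEM_FLAGS_CANNOT_GROW the call fails once the pool is exhausted,
// which is what triggers the fallback to pinned memory shown later.
bool PoolAlloc(void** ptr, size_t size, int device_id) {
  int saved_device = 0;
  cudaGetDevice(&saved_device);   // remember the caller's current device
  cudaSetDevice(device_id);       // switch to the device owning the pool
  cnmemStatus_t err = cnmemMalloc(ptr, size, nullptr /* default stream */);
  cudaSetDevice(saved_device);    // restore the caller's device
  return err == CNMEM_STATUS_SUCCESS;
}

// Return a block to the pool on the given device.
bool PoolFree(void* ptr, int device_id) {
  int saved_device = 0;
  cudaGetDevice(&saved_device);
  cudaSetDevice(device_id);
  cnmemStatus_t err = cnmemFree(ptr, nullptr /* default stream */);
  cudaSetDevice(saved_device);
  return err == CNMEM_STATUS_SUCCESS;
}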

In normal operation many buffers are allocated through this manager. For example, AllocatedMemory requests GPU memory from the CUDA pool first and falls back to pinned, then non-pinned, system memory:

//
// AllocatedMemory
//
AllocatedMemory::AllocatedMemory(
    size_t byte_size, TRITONSERVER_MemoryType memory_type,
    int64_t memory_type_id)
    : MutableMemory(nullptr, byte_size, memory_type, memory_type_id)
{
  if (total_byte_size_ != 0) {
    // Allocate memory with the following fallback policy:
    // CUDA memory -> pinned system memory -> non-pinned system memory
    switch (buffer_attributes_.MemoryType()) {
#ifdef TRITON_ENABLE_GPU
      case TRITONSERVER_MEMORY_GPU: {
        auto status = CudaMemoryManager::Alloc(
            (void**)&buffer_, total_byte_size_,
            buffer_attributes_.MemoryTypeId());
        if (!status.IsOk()) {
          static bool warning_logged = false;
          if (!warning_logged) {
            LOG_WARNING << status.Message()
                        << ", falling back to pinned system memory";
            warning_logged = true;
          }

          goto pinned_memory_allocation;
        }
        break;
      }
      pinned_memory_allocation:
#endif  // TRITON_ENABLE_GPU
      default: {
        TRITONSERVER_MemoryType memory_type = buffer_attributes_.MemoryType();
        auto status = PinnedMemoryManager::Alloc(
            (void**)&buffer_, total_byte_size_, &memory_type, true);
        buffer_attributes_.SetMemoryType(memory_type);
        if (!status.IsOk()) {
          LOG_ERROR << status.Message();
          buffer_ = nullptr;
        }
        break;
      }
    }
  }
  total_byte_size_ = (buffer_ == nullptr) ? 0 : total_byte_size_;
}
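As a usage sketch (not taken from the Triton sources; MutableBuffer and TotalByteSize are assumed accessors on MutableMemory/Memory), a caller that asks for GPU memory should re-check where the buffer actually landed, because the constructor silently walks the fallback chain:

// Hypothetical usage: request 1 MiB on GPU 0 and inspect the result.
AllocatedMemory buf(
    1 << 20 /* byte size */, TRITONSERVER_MEMORY_GPU, 0 /* GPU device id */);

TRITONSERVER_MemoryType actual_type;
int64_t actual_id;
// If the CUDA pool was exhausted or disabled, actual_type will be
// TRITONSERVER_MEMORY_CPU_PINNED (or TRITONSERVER_MEMORY_CPU if pinned
// allocation also failed) rather than TRITONSERVER_MEMORY_GPU.
char* base = buf.MutableBuffer(&actual_type, &actual_id);
if (base == nullptr) {
  // Every fallback failed; the total byte size has been reset to 0.
}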