{OPTION_CUDA_MEMORY_POOL_BYTE_SIZE, "cuda-memory-pool-byte-size",
"<integer>:<integer>",
"The total byte size that can be allocated as CUDA memory for the GPU "
"device. If GPU support is enabled, the server will allocate CUDA "
"memory to minimize data transfer between host and devices until it "
"exceeds the specified byte size. This option will not affect the "
"allocation conducted by the backend frameworks. The argument should be "
"2 integers separated by colons in the format "
"<GPU device ID>:<pool byte size>. This option can be used multiple "
"times, but only once per GPU device. Subsequent uses will overwrite "
"previous uses for the same GPU device. Default is 64 MB."},
The server creates the CudaMemoryManager based on cuda-memory-pool-byte-size. Here cuda_memory_pool_size_ is populated from the configured cuda-memory-pool-byte-size values; any GPU without an explicit setting falls back to the default:
#ifdef TRITON_ENABLE_GPU
  // Set the default CUDA memory pool size for GPUs where it is not
  // set explicitly.
  std::set<int> supported_gpus;
  if (GetSupportedGPUs(&supported_gpus, min_supported_compute_capability_)
          .IsOk()) {
    for (const auto gpu : supported_gpus) {
      if (cuda_memory_pool_size_.find(gpu) == cuda_memory_pool_size_.end()) {
        cuda_memory_pool_size_[gpu] = 1 << 26;  // 67108864 bytes = 64 MiB
      }
    }
  }
  CudaMemoryManager::Options cuda_options(
      min_supported_compute_capability_, cuda_memory_pool_size_);
  status = CudaMemoryManager::Create(cuda_options);
  if (!status.IsOk()) {
    LOG_ERROR << status.Message();
  }
#endif  // TRITON_ENABLE_GPU
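As a concrete check of the defaulting logic, the following self-contained sketch mimics the loop above on a hypothetical two-GPU machine where only device 0 was configured on the command line (the 128 MiB value is an arbitrary example):

#include <cassert>
#include <cstdint>
#include <map>

int
main()
{
  // Hypothetical: --cuda-memory-pool-byte-size=0:134217728 on a 2-GPU box,
  // so only device 0 has an explicit entry.
  std::map<int, uint64_t> cuda_memory_pool_size{{0, 134217728}};
  for (int gpu : {0, 1}) {
    if (cuda_memory_pool_size.find(gpu) == cuda_memory_pool_size.end()) {
      cuda_memory_pool_size[gpu] = 1 << 26;  // 67108864 bytes == 64 MiB
    }
  }
  assert(cuda_memory_pool_size[0] == 134217728);  // explicit value kept
  assert(cuda_memory_pool_size[1] == (1u << 26));  // default applied
  return 0;
}

CudaMemoryManager::Create then hands this map to CNMeM to carve out one pool per device: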
Status
CudaMemoryManager::Create(const CudaMemoryManager::Options& options)
{
  // Ensure thread-safe creation of the CUDA memory pool
  std::lock_guard<std::mutex> lock(instance_mu_);
  if (instance_ != nullptr) {
    LOG_WARNING << "New CUDA memory pools could not be created since they "
                   "already exist";
    return Status::Success;
  }

  std::set<int> supported_gpus;
  auto status = GetSupportedGPUs(
      &supported_gpus, options.min_supported_compute_capability_);
  if (status.IsOk()) {
    std::vector<cnmemDevice_t> devices;
    for (auto gpu : supported_gpus) {
      const auto it = options.memory_pool_byte_size_.find(gpu);
      if ((it != options.memory_pool_byte_size_.end()) && (it->second != 0)) {
        devices.emplace_back();
        auto& device = devices.back();
        memset(&device, 0, sizeof(device));
        device.device = gpu;
        device.size = it->second;
        LOG_INFO << "CUDA memory pool is created on device " << device.device
                 << " with size " << device.size;
      }
    }

    if (!devices.empty()) {
      RETURN_IF_CNMEM_ERROR(
          cnmemInit(devices.size(), devices.data(), CNMEM_FLAGS_CANNOT_GROW),
          std::string("Failed to initialize CUDA memory manager"));
    } else {
      LOG_INFO << "CUDA memory pool disabled";
    }

    // Used to finalize CNMeM properly when the instance goes out of scope
    instance_.reset(new CudaMemoryManager(!devices.empty()));
  } else {
    return Status(
        status.ErrorCode(),
        "Failed to initialize CUDA memory manager: " + status.Message());
  }

  return Status::Success;
}
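The Alloc and Free entry points are not shown in this excerpt. Based on the CNMeM API used above (cnmemMalloc and cnmemFree take a pointer, a size, and a CUDA stream), the allocation path plausibly looks like the sketch below; the RETURN_IF_CUDA_ERR macro and the exact error messages are assumptions, and the real implementation may differ:

Status
CudaMemoryManager::Alloc(void** ptr, uint64_t size, int64_t device_id)
{
  // Sketch only: fail fast if no pool was ever created.
  if (instance_ == nullptr) {
    return Status(
        Status::Code::UNAVAILABLE, "CudaMemoryManager has not been created");
  }
  // CNMeM pools are per-device, so make the requested device current
  // before asking the pool for memory.
  int current_device;
  RETURN_IF_CUDA_ERR(cudaGetDevice(&current_device));
  if (current_device != device_id) {
    RETURN_IF_CUDA_ERR(cudaSetDevice(device_id));
  }
  RETURN_IF_CNMEM_ERROR(
      cnmemMalloc(ptr, size, nullptr /* default stream */),
      std::string("Failed to allocate CUDA memory"));
  return Status::Success;
}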
In normal operation, various resources are allocated through this manager via AllocatedMemory:
//
// AllocatedMemory
//
AllocatedMemory::AllocatedMemory(
    size_t byte_size, TRITONSERVER_MemoryType memory_type,
    int64_t memory_type_id)
    : MutableMemory(nullptr, byte_size, memory_type, memory_type_id)
{
  if (total_byte_size_ != 0) {
    // Allocate memory with the following fallback policy:
    // CUDA memory -> pinned system memory -> non-pinned system memory
    switch (buffer_attributes_.MemoryType()) {
#ifdef TRITON_ENABLE_GPU
      case TRITONSERVER_MEMORY_GPU: {
        auto status = CudaMemoryManager::Alloc(
            (void**)&buffer_, total_byte_size_,
            buffer_attributes_.MemoryTypeId());
        if (!status.IsOk()) {
          static bool warning_logged = false;
          if (!warning_logged) {
            LOG_WARNING << status.Message()
                        << ", falling back to pinned system memory";
            warning_logged = true;
          }
          goto pinned_memory_allocation;
        }
        break;
      }
    pinned_memory_allocation:
#endif  // TRITON_ENABLE_GPU
      default: {
        TRITONSERVER_MemoryType memory_type = buffer_attributes_.MemoryType();
        auto status = PinnedMemoryManager::Alloc(
            (void**)&buffer_, total_byte_size_, &memory_type, true);
        buffer_attributes_.SetMemoryType(memory_type);
        if (!status.IsOk()) {
          LOG_ERROR << status.Message();
          buffer_ = nullptr;
        }
        break;
      }
    }
  }

  total_byte_size_ = (buffer_ == nullptr) ? 0 : total_byte_size_;
}
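Putting it together, a caller requesting GPU memory can check where the allocation actually landed after the fallback chain. The sketch below assumes MutableMemory exposes a MutableBuffer accessor that reports the final memory type, as in the Triton source; the size and device ID are arbitrary:

// Hypothetical usage: request 1 KiB on GPU 0; if the CUDA pool is exhausted
// or disabled, the constructor falls back to pinned, then non-pinned,
// system memory.
AllocatedMemory memory(1024, TRITONSERVER_MEMORY_GPU, 0 /* device id */);

TRITONSERVER_MemoryType actual_type;
int64_t actual_type_id;
char* base = memory.MutableBuffer(&actual_type, &actual_type_id);
if (base == nullptr) {
  // All three tiers failed; total_byte_size_ was reset to 0 above.
} else if (actual_type != TRITONSERVER_MEMORY_GPU) {
  // The allocation landed in (pinned or non-pinned) system memory.
}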