Skip to content

Commit ebb00b3

Browse files
kaiqiy-nv and bmhowe23 authored
Fix pytorch AcceleratorError root-caused by QEC (#472)
This fixes the 'AcceleratorError' part of [NVBug 6008689](https://nvbugspro.nvidia.com/bug/6008689). Details are explained in that NVBug. Thanks for reviewing this PR. Signed-off-by: Kaiqi Yan <kaiqiy@nvidia.com> Co-authored-by: Ben Howe <141149032+bmhowe23@users.noreply.github.com>
1 parent 5c012c9 commit ebb00b3

1 file changed

Lines changed: 24 additions & 1 deletion

File tree

libs/qec/lib/decoders/plugins/trt_decoder/trt_decoder.cpp

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -434,13 +434,36 @@ struct trt_decoder::Impl {
434434
}
435435

436436
/// Tears down the resources held by this Impl in a fixed order.
///
/// The order is deliberate: pending work on the stream is drained first,
/// then the executor and the TensorRT context/engine are released, then the
/// device buffers are freed, and the stream itself is destroyed last.
/// Every CUDA call is routed through HANDLE_CUDA_ERROR_NO_THROW (the
/// non-throwing error macro used throughout this destructor) so that a
/// failure during teardown is not silently discarded.
~Impl() {
  // IMPORTANT: Destroy resources in the correct order.

  // 1. Synchronize the stream so all async work completes. The result is
  //    checked like every other CUDA call here: an asynchronous failure from
  //    earlier work is reported at this sync point and must not be dropped.
  if (stream) {
    HANDLE_CUDA_ERROR_NO_THROW(cudaStreamSynchronize(stream));
  }

  // 2. Destroy the CUDA graph executor BEFORE the stream it was captured on.
  //    Assigning a fresh TraditionalExecutor replaces whatever executor was
  //    held until now.
  executor = TraditionalExecutor{};

  // 3. Destroy TensorRT execution context and engine BEFORE freeing their
  //    underlying GPU memory. The context is released before the engine.
  context.reset();
  engine.reset();

  // 4. Free GPU buffers. Each slot is nulled after the free so that it
  //    cannot be released a second time.
  if (buffers[input_index]) {
    HANDLE_CUDA_ERROR_NO_THROW(cudaFree(buffers[input_index]));
    buffers[input_index] = nullptr;
  }
  if (buffers[output_index]) {
    HANDLE_CUDA_ERROR_NO_THROW(cudaFree(buffers[output_index]));
    buffers[output_index] = nullptr;
  }

  // 5. Destroy stream last. The null check skips the call when no stream
  //    was ever created.
  if (stream) {
    HANDLE_CUDA_ERROR_NO_THROW(cudaStreamDestroy(stream));
    stream = nullptr;
  }
}
445468
};
446469

0 commit comments

Comments
 (0)