Skip to content
64 changes: 64 additions & 0 deletions quadrants/program/program.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,11 @@
#include "quadrants/rhi/cuda/cuda_context.h"
#endif

#ifdef QD_WITH_AMDGPU
#include "quadrants/rhi/amdgpu/amdgpu_driver.h"
#include "quadrants/rhi/amdgpu/amdgpu_context.h"
#endif

#ifdef QD_WITH_VULKAN
#include "quadrants/runtime/program_impls/vulkan/vulkan_program.h"
#include "quadrants/rhi/vulkan/vulkan_loader.h"
Expand Down Expand Up @@ -493,6 +498,13 @@ uint64 Program::stream_create() {
CUDADriver::get_instance().stream_create(&stream, 0 /*flags*/);
return reinterpret_cast<uint64>(stream);
}
#endif
#ifdef QD_WITH_AMDGPU
if (compile_config().arch == Arch::amdgpu) {
void *stream = nullptr;
AMDGPUDriver::get_instance().stream_create(&stream, 0 /*flags*/);
return reinterpret_cast<uint64>(stream);
}
#endif
return 0;
}
Expand All @@ -504,6 +516,12 @@ void Program::stream_destroy(uint64 stream_handle) {
reinterpret_cast<void *>(stream_handle));
}
#endif
#ifdef QD_WITH_AMDGPU
if (compile_config().arch == Arch::amdgpu && stream_handle != 0) {
AMDGPUDriver::get_instance().stream_destroy(
reinterpret_cast<void *>(stream_handle));
}
#endif
}

void Program::stream_synchronize(uint64 stream_handle) {
Expand All @@ -513,6 +531,12 @@ void Program::stream_synchronize(uint64 stream_handle) {
reinterpret_cast<void *>(stream_handle));
}
#endif
#ifdef QD_WITH_AMDGPU
if (compile_config().arch == Arch::amdgpu && stream_handle != 0) {
AMDGPUDriver::get_instance().stream_synchronize(
reinterpret_cast<void *>(stream_handle));
}
#endif
}

void Program::set_current_cuda_stream(uint64 stream_handle) {
Expand All @@ -522,6 +546,12 @@ void Program::set_current_cuda_stream(uint64 stream_handle) {
reinterpret_cast<void *>(stream_handle));
}
#endif
#ifdef QD_WITH_AMDGPU
if (compile_config().arch == Arch::amdgpu) {
AMDGPUContext::get_instance().set_stream(
reinterpret_cast<void *>(stream_handle));
}
#endif
}

uint64 Program::event_create() {
Expand All @@ -532,6 +562,14 @@ uint64 Program::event_create() {
0x02 /*CU_EVENT_DISABLE_TIMING*/);
return reinterpret_cast<uint64>(event);
}
#endif
#ifdef QD_WITH_AMDGPU
if (compile_config().arch == Arch::amdgpu) {
void *event = nullptr;
AMDGPUDriver::get_instance().event_create(&event,
0x02 /*hipEventDisableTiming*/);
return reinterpret_cast<uint64>(event);
}
#endif
return 0;
}
Expand All @@ -543,6 +581,12 @@ void Program::event_destroy(uint64 event_handle) {
reinterpret_cast<void *>(event_handle));
}
#endif
#ifdef QD_WITH_AMDGPU
if (compile_config().arch == Arch::amdgpu && event_handle != 0) {
AMDGPUDriver::get_instance().event_destroy(
reinterpret_cast<void *>(event_handle));
}
#endif
}

void Program::event_record(uint64 event_handle, uint64 stream_handle) {
Expand All @@ -553,6 +597,13 @@ void Program::event_record(uint64 event_handle, uint64 stream_handle) {
reinterpret_cast<void *>(stream_handle));
}
#endif
#ifdef QD_WITH_AMDGPU
if (compile_config().arch == Arch::amdgpu && event_handle != 0) {
AMDGPUDriver::get_instance().event_record(
reinterpret_cast<void *>(event_handle),
reinterpret_cast<void *>(stream_handle));
}
#endif
}

void Program::event_synchronize(uint64 event_handle) {
Expand All @@ -562,6 +613,12 @@ void Program::event_synchronize(uint64 event_handle) {
reinterpret_cast<void *>(event_handle));
}
#endif
#ifdef QD_WITH_AMDGPU
if (compile_config().arch == Arch::amdgpu && event_handle != 0) {
AMDGPUDriver::get_instance().event_synchronize(
reinterpret_cast<void *>(event_handle));
}
#endif
}

void Program::stream_wait_event(uint64 stream_handle, uint64 event_handle) {
Expand All @@ -572,6 +629,13 @@ void Program::stream_wait_event(uint64 stream_handle, uint64 event_handle) {
reinterpret_cast<void *>(event_handle), 0 /*flags*/);
}
#endif
#ifdef QD_WITH_AMDGPU
if (compile_config().arch == Arch::amdgpu && event_handle != 0) {
AMDGPUDriver::get_instance().stream_wait_event(
reinterpret_cast<void *>(stream_handle),
reinterpret_cast<void *>(event_handle), 0 /*flags*/);
}
#endif
}

} // namespace quadrants::lang
6 changes: 4 additions & 2 deletions quadrants/rhi/amdgpu/amdgpu_context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
namespace quadrants {
namespace lang {

// Out-of-class definition of AMDGPUContext's static thread_local stream
// handle. Being thread_local, every thread has its own copy, which starts out
// as nullptr until that thread calls AMDGPUContext::set_stream().
thread_local void *AMDGPUContext::stream_ = nullptr;

AMDGPUContext::AMDGPUContext()
: driver_(AMDGPUDriver::get_instance_without_context()) {
dev_count_ = 0;
Expand Down Expand Up @@ -188,7 +190,7 @@ void AMDGPUContext::launch(void *func,
void *config[] = {(void *)0x01, (void *)packed_arg, (void *)0x02,
(void *)&pack_size, (void *)0x03};
driver_.launch_kernel(func, grid_dim, 1, 1, block_dim, 1, 1,
dynamic_shared_mem_bytes, nullptr, nullptr,
dynamic_shared_mem_bytes, stream_, nullptr,
reinterpret_cast<void **>(&config));
}
std::free(packed_arg);
Expand All @@ -197,7 +199,7 @@ void AMDGPUContext::launch(void *func,
profiler_->stop(task_handle);

if (debug_) {
driver_.stream_synchronize(nullptr);
driver_.stream_synchronize(stream_);
}
}

Expand Down
9 changes: 9 additions & 0 deletions quadrants/rhi/amdgpu/amdgpu_context.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ class AMDGPUContext {
KernelProfilerBase *profiler_{nullptr};
AMDGPUDriver &driver_;
bool debug_{false};
static thread_local void *stream_;
std::vector<void *> kernel_arg_pointer_;

public:
Expand Down Expand Up @@ -116,6 +117,14 @@ class AMDGPUContext {
return std::unique_lock<std::mutex>(lock_);
}

// Selects the stream handle used by this context for the calling thread.
// stream_ is declared static thread_local, so the assignment is visible only
// to the thread that makes it; other threads keep their own value.
// `stream` is an opaque stream handle and is stored as-is (nullptr included).
void set_stream(void *stream) { stream_ = stream; }

// Returns the stream handle last stored by set_stream() on the calling
// thread. stream_ is static thread_local, so the value is per-thread.
void *get_stream() const { return stream_; }

static AMDGPUContext &get_instance();
};

Expand Down
9 changes: 9 additions & 0 deletions quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ PER_AMDGPU_FUNCTION(context_get_current, hipCtxGetCurrent, void **);

// Stream management
// stream_create is declared with (out stream handle, flags), which is the
// signature of hipStreamCreateWithFlags; plain hipStreamCreate takes only the
// output pointer, so binding it here would invoke it through a mismatched
// signature. With flags == 0 the behaviour matches hipStreamCreate.
PER_AMDGPU_FUNCTION(stream_create, hipStreamCreateWithFlags, void **, uint32);
// stream_destroy takes the stream handle produced by stream_create.
PER_AMDGPU_FUNCTION(stream_destroy, hipStreamDestroy, void *);

// Memory management
PER_AMDGPU_FUNCTION(memcpy_host_to_device,
Expand Down Expand Up @@ -69,13 +70,16 @@ PER_AMDGPU_FUNCTION(memcpy_device_to_host_async,
std::size_t,
void *);
PER_AMDGPU_FUNCTION(malloc, hipMalloc, void **, std::size_t);
// hipMallocAsync/hipFreeAsync require ROCm >= 5.4.
// The trailing void * of malloc_async is the stream handle; callers pass the
// stream the allocation should be associated with.
PER_AMDGPU_FUNCTION(malloc_async, hipMallocAsync, void **, std::size_t, void *);
PER_AMDGPU_FUNCTION(malloc_managed,
hipMallocManaged,
void **,
std::size_t,
uint32);
PER_AMDGPU_FUNCTION(memset, hipMemset, void *, uint8, std::size_t);
PER_AMDGPU_FUNCTION(mem_free, hipFree, void *);
// Arguments are (device pointer, stream handle). Like malloc_async, this
// binds to hipFreeAsync and therefore needs ROCm >= 5.4.
PER_AMDGPU_FUNCTION(mem_free_async, hipFreeAsync, void *, void *);
PER_AMDGPU_FUNCTION(mem_get_info, hipMemGetInfo, std::size_t *, std::size_t *);
PER_AMDGPU_FUNCTION(mem_get_attribute,
hipPointerGetAttribute,
Expand Down Expand Up @@ -121,6 +125,11 @@ PER_AMDGPU_FUNCTION(kernel_get_occupancy,

// Stream management
// stream_synchronize takes the stream handle to wait on.
PER_AMDGPU_FUNCTION(stream_synchronize, hipStreamSynchronize, void *);
// stream_wait_event arguments are (stream handle, event handle, flags);
// Program::stream_wait_event passes 0 for flags.
PER_AMDGPU_FUNCTION(stream_wait_event,
hipStreamWaitEvent,
void *,
void *,
uint32);
Comment on lines +128 to +132
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe it is time to increase linewidth of C++ code. 80 chars is painful nowadays. I would recommend either 100 or 120, as a matter of taste.


// Event management
PER_AMDGPU_FUNCTION(event_create, hipEventCreateWithFlags, void **, uint32);
Expand Down
56 changes: 29 additions & 27 deletions quadrants/runtime/amdgpu/kernel_launcher.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include "quadrants/runtime/amdgpu/kernel_launcher.h"
#include "quadrants/rhi/amdgpu/amdgpu_context.h"
#include "quadrants/rhi/amdgpu/amdgpu_driver.h"
#include "quadrants/program/launch_context_builder.h"

namespace quadrants::lang {
Expand Down Expand Up @@ -32,18 +33,14 @@ void KernelLauncher::launch_llvm_kernel(Handle handle,
transfers;
std::unordered_map<ArgArrayPtrKey, void *, ArgArrayPtrKeyHasher> device_ptrs;

auto *active_stream = AMDGPUContext::get_instance().get_stream();

char *device_result_buffer{nullptr};
// Here we have to guarantee that device_result_buffer isn't nullptr.
// Interestingly, the code that follows,
//   L60: DeviceAllocation devalloc = executor->allocate_memory_on_device(
// calls another kernel, and that results in
//   Memory access fault by GPU node-1 (Agent handle: 0xeda5ca0) on address
//   (nil). Reason: Page not present or supervisor privilege.
// if you don't allocate it.
Comment on lines -36 to -43
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why did you remove this comment? Is it no longer applicable, or is it irrelevant?

AMDGPUDriver::get_instance().malloc(
// Must always allocate device_result_buffer (even when result_buffer_size
// is 0) to avoid memory access faults from allocate_memory_on_device below.
AMDGPUDriver::get_instance().malloc_async(
(void **)&device_result_buffer,
std::max(ctx.result_buffer_size, sizeof(uint64)));
std::max(ctx.result_buffer_size, sizeof(uint64)), active_stream);

for (int i = 0; i < (int)parameters.size(); i++) {
const auto &kv = parameters[i];
Expand All @@ -69,8 +66,9 @@ void KernelLauncher::launch_llvm_kernel(Handle handle,
executor->get_device_alloc_info_ptr(devalloc);
transfers[data_ptr_idx] = {data_ptr, devalloc};

AMDGPUDriver::get_instance().memcpy_host_to_device(
(void *)device_ptrs[data_ptr_idx], data_ptr, arr_sz);
AMDGPUDriver::get_instance().memcpy_host_to_device_async(
(void *)device_ptrs[data_ptr_idx], data_ptr, arr_sz,
active_stream);
}
ctx.set_ndarray_ptrs(arg_id, (uint64)device_ptrs[data_ptr_idx],
(uint64)ctx.array_ptrs[grad_ptr_idx]);
Expand All @@ -86,27 +84,28 @@ void KernelLauncher::launch_llvm_kernel(Handle handle,
}
}
if (transfers.size() > 0) {
AMDGPUDriver::get_instance().stream_synchronize(nullptr);
AMDGPUDriver::get_instance().stream_synchronize(active_stream);
}
char *host_result_buffer = (char *)ctx.get_context().result_buffer;
if (ctx.result_buffer_size > 0) {
// hipMallocAsync and hipFreeAsync are available from ROCm 5.4 onwards
ctx.get_context().result_buffer = (uint64 *)device_result_buffer;
}
char *device_arg_buffer = nullptr;
if (ctx.arg_buffer_size > 0) {
AMDGPUDriver::get_instance().malloc((void **)&device_arg_buffer,
ctx.arg_buffer_size);
AMDGPUDriver::get_instance().memcpy_host_to_device(
device_arg_buffer, ctx.get_context().arg_buffer, ctx.arg_buffer_size);
AMDGPUDriver::get_instance().malloc_async(
(void **)&device_arg_buffer, ctx.arg_buffer_size, active_stream);
AMDGPUDriver::get_instance().memcpy_host_to_device_async(
device_arg_buffer, ctx.get_context().arg_buffer, ctx.arg_buffer_size,
active_stream);
ctx.get_context().arg_buffer = device_arg_buffer;
}
void *context_pointer;
int arg_size = sizeof(RuntimeContext *);
AMDGPUDriver::get_instance().malloc((void **)&context_pointer,
sizeof(RuntimeContext));
AMDGPUDriver::get_instance().memcpy_host_to_device(
context_pointer, &ctx.get_context(), sizeof(RuntimeContext));
AMDGPUDriver::get_instance().malloc_async(
(void **)&context_pointer, sizeof(RuntimeContext), active_stream);
AMDGPUDriver::get_instance().memcpy_host_to_device_async(
context_pointer, &ctx.get_context(), sizeof(RuntimeContext),
active_stream);

AMDGPUContext::get_instance().push_back_kernel_arg_pointer(context_pointer);

Expand All @@ -119,13 +118,18 @@ void KernelLauncher::launch_llvm_kernel(Handle handle,
}
QD_TRACE("Launching kernel");
if (ctx.arg_buffer_size > 0) {
AMDGPUDriver::get_instance().mem_free(device_arg_buffer);
AMDGPUDriver::get_instance().mem_free_async(device_arg_buffer,
active_stream);
}
if (ctx.result_buffer_size > 0) {
AMDGPUDriver::get_instance().memcpy_device_to_host(
host_result_buffer, device_result_buffer, ctx.result_buffer_size);
AMDGPUDriver::get_instance().memcpy_device_to_host_async(
host_result_buffer, device_result_buffer, ctx.result_buffer_size,
active_stream);
}
AMDGPUDriver::get_instance().mem_free_async(device_result_buffer,
active_stream);
if (transfers.size()) {
AMDGPUDriver::get_instance().stream_synchronize(active_stream);
for (auto itr = transfers.begin(); itr != transfers.end(); itr++) {
auto &idx = itr->first;
auto arg_id = idx.arg_id;
Expand All @@ -135,8 +139,6 @@ void KernelLauncher::launch_llvm_kernel(Handle handle,
executor->deallocate_memory_on_device(itr->second.second);
}
}
// Since we always allocate above, we should always free.
AMDGPUDriver::get_instance().mem_free(device_result_buffer);
}

KernelLauncher::Handle KernelLauncher::register_llvm_kernel(
Expand Down
Loading
Loading