From 705c43d941a99464481ff2d3ca84fe2bba543e97 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sun, 15 Mar 2026 21:45:46 -0700 Subject: [PATCH 1/3] [CPU] Abort kernel execution on assertion failure instead of segfaulting On CPU, when debug mode is enabled, out-of-bounds array accesses trigger a runtime assertion that records the error but allows execution to continue -- leading to a SIGSEGV before Python can retrieve the error. Fix this by using setjmp/longjmp: each CPU task runner (range_for, struct_for, mesh_for, serial) sets up a jmp_buf via RuntimeContext, and the new quadrants_assert_format_ctx function longjmps back on failure. The existing check_runtime_error path then raises QuadrantsAssertionError. GPU architectures are unaffected (they already kill threads via asm). Made-with: Cursor --- quadrants/codegen/llvm/codegen_llvm.cpp | 11 ++- quadrants/program/context.h | 4 + quadrants/runtime/cpu/kernel_launcher.cpp | 7 ++ .../runtime/llvm/runtime_module/runtime.cpp | 45 ++++++++++ tests/python/test_debug.py | 88 +++++++++++++++++++ 5 files changed, 153 insertions(+), 2 deletions(-) diff --git a/quadrants/codegen/llvm/codegen_llvm.cpp b/quadrants/codegen/llvm/codegen_llvm.cpp index b7c5b302b..cb0048c9c 100644 --- a/quadrants/codegen/llvm/codegen_llvm.cpp +++ b/quadrants/codegen/llvm/codegen_llvm.cpp @@ -1332,7 +1332,11 @@ void TaskCodeGenLLVM::visit(AssertStmt *stmt) { auto arguments = create_entry_block_alloca(argument_buffer_size); std::vector args; - args.emplace_back(get_runtime()); + // On CPU, use the context-aware variant that can longjmp to abort the kernel + // task, preventing segfaults from subsequent out-of-bounds memory accesses. + // On GPU, the original variant suffices because asm("exit;") kills the thread. + bool use_ctx_variant = arch_is_cpu(current_arch()); + args.emplace_back(use_ctx_variant ? 
get_context() : get_runtime()); args.emplace_back(builder->CreateIsNotNull(llvm_val[stmt->cond])); args.emplace_back(builder->CreateGlobalStringPtr(stmt->text)); @@ -1362,7 +1366,10 @@ void TaskCodeGenLLVM::visit(AssertStmt *stmt) { builder->CreateGEP(argument_buffer_size, arguments, {tlctx->get_constant(0), tlctx->get_constant(0)})); - llvm_val[stmt] = call("quadrants_assert_format", std::move(args)); + llvm_val[stmt] = call( + use_ctx_variant ? "quadrants_assert_format_ctx" + : "quadrants_assert_format", + std::move(args)); } void TaskCodeGenLLVM::visit(SNodeOpStmt *stmt) { diff --git a/quadrants/program/context.h b/quadrants/program/context.h index a75fad53c..e1d307220 100644 --- a/quadrants/program/context.h +++ b/quadrants/program/context.h @@ -24,6 +24,10 @@ struct RuntimeContext { // LLVMRuntime is shared among functions. So we moved the pointer to // RuntimeContext which each function have one. uint64_t *result_buffer; + + // On CPU, points to a jmp_buf used to abort kernel execution when a runtime + // assertion (e.g. out-of-bounds check) fails. NULL when no guard is active. 
+ void *cpu_abort_jmp_buf{nullptr}; }; #if defined(QD_RUNTIME_HOST) diff --git a/quadrants/runtime/cpu/kernel_launcher.cpp b/quadrants/runtime/cpu/kernel_launcher.cpp index d7dd8df25..00020bfb2 100644 --- a/quadrants/runtime/cpu/kernel_launcher.cpp +++ b/quadrants/runtime/cpu/kernel_launcher.cpp @@ -1,5 +1,6 @@ #include "quadrants/runtime/cpu/kernel_launcher.h" #include "quadrants/rhi/arch.h" +#include <csetjmp> namespace quadrants::lang { namespace cpu { @@ -41,9 +42,15 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, } } } + std::jmp_buf abort_buf; + ctx.get_context().cpu_abort_jmp_buf = &abort_buf; for (auto task : launcher_ctx.task_funcs) { + if (setjmp(abort_buf) != 0) { + break; + } task(&ctx.get_context()); } + ctx.get_context().cpu_abort_jmp_buf = nullptr; } KernelLauncher::Handle KernelLauncher::register_llvm_kernel( diff --git a/quadrants/runtime/llvm/runtime_module/runtime.cpp b/quadrants/runtime/llvm/runtime_module/runtime.cpp index cfd76e999..e83382172 100644 --- a/quadrants/runtime/llvm/runtime_module/runtime.cpp +++ b/quadrants/runtime/llvm/runtime_module/runtime.cpp @@ -14,6 +14,7 @@ #endif #include +#include <csetjmp> #include #include #include @@ -365,6 +366,7 @@ STRUCT_FIELD_ARRAY(PhysicalCoordinates, val); STRUCT_FIELD(RuntimeContext, runtime); STRUCT_FIELD(RuntimeContext, result_buffer) +STRUCT_FIELD(RuntimeContext, cpu_abort_jmp_buf) #include "quadrants/runtime/llvm/runtime_module/atomic.h" @@ -886,6 +888,23 @@ void quadrants_assert_format(LLVMRuntime *runtime, #endif } +// Context-aware variant called by bounds-check assertions in JIT'd code. +// On CPU, longjmps to the task runner's abort guard when an assertion fails, +// preventing the subsequent out-of-bounds memory access from segfaulting. 
+void quadrants_assert_format_ctx(RuntimeContext *context, + u1 test, + const char *format, + int num_arguments, + uint64 *arguments) { + quadrants_assert_format(context->runtime, test, format, num_arguments, + arguments); +#if !ARCH_cuda && !ARCH_amdgpu + if (enable_assert && test == 0 && context->cpu_abort_jmp_buf) { + std::longjmp(*(std::jmp_buf *)context->cpu_abort_jmp_buf, 1); + } +#endif +} + void quadrants_assert_runtime(LLVMRuntime *runtime, u1 test, const char *msg) { quadrants_assert_format(runtime, test, msg, 0, nullptr); } @@ -1510,10 +1529,19 @@ void cpu_struct_for_block_helper(void *ctx_, int thread_id, int i) { RuntimeContext this_thread_context = *ctx->context; this_thread_context.cpu_thread_id = thread_id; + + std::jmp_buf abort_buf; + this_thread_context.cpu_abort_jmp_buf = &abort_buf; + if (setjmp(abort_buf) != 0) { + return; + } + if (lower < upper) { (*ctx->task)(&this_thread_context, tls_buffer, &ctx->list->get(element_id), lower, upper); } + + this_thread_context.cpu_abort_jmp_buf = nullptr; } void parallel_struct_for(RuntimeContext *context, @@ -1591,6 +1619,13 @@ void cpu_parallel_range_for_task(void *range_context, RuntimeContext this_thread_context = *ctx.context; this_thread_context.cpu_thread_id = thread_id; + + std::jmp_buf abort_buf; + this_thread_context.cpu_abort_jmp_buf = &abort_buf; + if (setjmp(abort_buf) != 0) { + return; + } + if (ctx.step == 1) { int block_start = ctx.begin + task_id * ctx.block_size; int block_end = std::min(block_start + ctx.block_size, ctx.end); @@ -1604,6 +1639,8 @@ void cpu_parallel_range_for_task(void *range_context, ctx.body(&this_thread_context, tls_ptr, i); } } + + this_thread_context.cpu_abort_jmp_buf = nullptr; if (ctx.epilogue) ctx.epilogue(ctx.context, tls_ptr); } @@ -1690,6 +1727,12 @@ void cpu_parallel_mesh_for_task(void *range_context, RuntimeContext this_thread_context = *ctx.context; this_thread_context.cpu_thread_id = thread_id; + std::jmp_buf abort_buf; + 
this_thread_context.cpu_abort_jmp_buf = &abort_buf; + if (setjmp(abort_buf) != 0) { + return; + } + int block_start = task_id * ctx.block_size; int block_end = std::min(block_start + ctx.block_size, ctx.num_patches); @@ -1700,6 +1743,8 @@ void cpu_parallel_mesh_for_task(void *range_context, if (ctx.epilogue) ctx.epilogue(ctx.context, tls_ptr, idx); } + + this_thread_context.cpu_abort_jmp_buf = nullptr; } void cpu_parallel_mesh_for(RuntimeContext *context, diff --git a/tests/python/test_debug.py b/tests/python/test_debug.py index 828639eae..78fc68db2 100644 --- a/tests/python/test_debug.py +++ b/tests/python/test_debug.py @@ -1,5 +1,6 @@ import platform +import numpy as np import pytest import quadrants as qd @@ -135,3 +136,90 @@ def func(): x[3, 7] = 2 func() + + +@test_utils.test( + arch=[qd.cpu], + require=qd.extension.assertion, + debug=True, + check_out_of_bound=True, + gdb_trigger=False, +) +def test_ndarray_oob_cpu_raises_not_segfaults(): + """Out-of-bounds ndarray access in a parallel kernel on CPU should raise + QuadrantsAssertionError instead of segfaulting.""" + arr = qd.ndarray(dtype=qd.f32, shape=(4,)) + + @qd.kernel + def write_oob(a: qd.types.ndarray(dtype=qd.f32, ndim=1)): + for i in range(10): + a[i] = 1.0 + + with pytest.raises(AssertionError, match=r"Out of bound access"): + write_oob(arr) + + +@test_utils.test( + arch=[qd.cpu], + require=qd.extension.assertion, + debug=True, + check_out_of_bound=True, + gdb_trigger=False, +) +def test_ndarray_oob_cpu_small_array(): + """Reproduces the pattern from the temperature-sensor segfault: a kernel + accesses a very small (shape-1) array with an index that goes out of + bounds. 
Before the longjmp fix this would SIGSEGV on CPU in debug mode.""" + small = qd.ndarray(dtype=qd.f32, shape=(1,)) + small.fill(42.0) + + @qd.kernel + def read_oob(a: qd.types.ndarray(dtype=qd.f32, ndim=1)) -> qd.f32: + return a[5] + + with pytest.raises(AssertionError, match=r"Out of bound access"): + read_oob(small) + + +@test_utils.test( + arch=[qd.cpu], + require=qd.extension.assertion, + debug=True, + check_out_of_bound=True, + gdb_trigger=False, +) +def test_ndarray_oob_cpu_2d(): + """2D ndarray out-of-bounds on CPU should produce a clear error.""" + arr = qd.ndarray(dtype=qd.f32, shape=(3, 4)) + + @qd.kernel + def write_oob_2d(a: qd.types.ndarray(dtype=qd.f32, ndim=2)): + for i in range(1): + a[10, 0] = 1.0 + + with pytest.raises(AssertionError, match=r"Out of bound access"): + write_oob_2d(arr) + + +@test_utils.test( + arch=[qd.cpu], + require=qd.extension.assertion, + debug=True, + check_out_of_bound=True, + gdb_trigger=False, +) +def test_ndarray_inbounds_cpu_still_works(): + """Verify that the setjmp/longjmp mechanism does not break normal + in-bounds ndarray access.""" + n = 8 + arr = qd.ndarray(dtype=qd.f32, shape=(n,)) + + @qd.kernel + def fill(a: qd.types.ndarray(dtype=qd.f32, ndim=1)): + for i in range(n): + a[i] = qd.cast(i * 10, qd.f32) + + fill(arr) + result = arr.to_numpy() + for i in range(n): + assert result[i] == pytest.approx(i * 10) From 8f51ba5be5b2d9c4caa6aad4f085d12f526d60ba Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Mon, 16 Mar 2026 09:19:06 -0700 Subject: [PATCH 2/3] Fix pre-commit lint: clang-format and unused import --- quadrants/codegen/llvm/codegen_llvm.cpp | 10 +++++----- tests/python/test_debug.py | 1 - 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/quadrants/codegen/llvm/codegen_llvm.cpp b/quadrants/codegen/llvm/codegen_llvm.cpp index cb0048c9c..2cbbdf362 100644 --- a/quadrants/codegen/llvm/codegen_llvm.cpp +++ b/quadrants/codegen/llvm/codegen_llvm.cpp @@ -1334,7 +1334,8 @@ void 
TaskCodeGenLLVM::visit(AssertStmt *stmt) { std::vector args; // On CPU, use the context-aware variant that can longjmp to abort the kernel // task, preventing segfaults from subsequent out-of-bounds memory accesses. - // On GPU, the original variant suffices because asm("exit;") kills the thread. + // On GPU, the original variant suffices because asm("exit;") kills the + // thread. bool use_ctx_variant = arch_is_cpu(current_arch()); args.emplace_back(use_ctx_variant ? get_context() : get_runtime()); args.emplace_back(builder->CreateIsNotNull(llvm_val[stmt->cond])); @@ -1366,10 +1367,9 @@ void TaskCodeGenLLVM::visit(AssertStmt *stmt) { builder->CreateGEP(argument_buffer_size, arguments, {tlctx->get_constant(0), tlctx->get_constant(0)})); - llvm_val[stmt] = call( - use_ctx_variant ? "quadrants_assert_format_ctx" - : "quadrants_assert_format", - std::move(args)); + llvm_val[stmt] = call(use_ctx_variant ? "quadrants_assert_format_ctx" + : "quadrants_assert_format", + std::move(args)); } void TaskCodeGenLLVM::visit(SNodeOpStmt *stmt) { diff --git a/tests/python/test_debug.py b/tests/python/test_debug.py index 78fc68db2..7049424ac 100644 --- a/tests/python/test_debug.py +++ b/tests/python/test_debug.py @@ -1,6 +1,5 @@ import platform -import numpy as np import pytest import quadrants as qd From f83fca99be26c93f4c9164d6676b1bc9f4b547ad Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Mon, 16 Mar 2026 11:06:54 -0700 Subject: [PATCH 3/3] Replace setjmp/longjmp with flag-based early return for CPU assert abort MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On Windows x64, longjmp performs SEH-based stack unwinding which requires proper unwind tables (.pdata/.xdata) for every frame on the call stack. JIT-compiled code does not register these tables, so longjmp from JIT'd code crashes the process — causing all Windows OOB-check tests to fail with worker crashes. 
Replace the mechanism: quadrants_assert_format_ctx now returns 1 on failure instead of calling longjmp, and the codegen emits a conditional ret-void after each assert call on CPU. Task runners check the cpu_assert_failed flag after each body call to break out of their loops. --- quadrants/codegen/llvm/codegen_llvm.cpp | 20 +++++-- quadrants/program/context.h | 8 ++- quadrants/runtime/cpu/kernel_launcher.cpp | 10 +--- .../runtime/llvm/runtime_module/runtime.cpp | 58 ++++++++----------- 4 files changed, 47 insertions(+), 49 deletions(-) diff --git a/quadrants/codegen/llvm/codegen_llvm.cpp b/quadrants/codegen/llvm/codegen_llvm.cpp index 2cbbdf362..dba5917e8 100644 --- a/quadrants/codegen/llvm/codegen_llvm.cpp +++ b/quadrants/codegen/llvm/codegen_llvm.cpp @@ -1332,10 +1332,9 @@ void TaskCodeGenLLVM::visit(AssertStmt *stmt) { auto arguments = create_entry_block_alloca(argument_buffer_size); std::vector args; - // On CPU, use the context-aware variant that can longjmp to abort the kernel - // task, preventing segfaults from subsequent out-of-bounds memory accesses. - // On GPU, the original variant suffices because asm("exit;") kills the - // thread. + // On CPU, use the context-aware variant that returns non-zero on failure + // so we can emit an early return and avoid the subsequent out-of-bounds + // memory access. On GPU, asm("exit;") kills the thread directly. bool use_ctx_variant = arch_is_cpu(current_arch()); args.emplace_back(use_ctx_variant ? get_context() : get_runtime()); args.emplace_back(builder->CreateIsNotNull(llvm_val[stmt->cond])); @@ -1370,6 +1369,19 @@ void TaskCodeGenLLVM::visit(AssertStmt *stmt) { llvm_val[stmt] = call(use_ctx_variant ? 
"quadrants_assert_format_ctx" : "quadrants_assert_format", std::move(args)); + + if (use_ctx_variant) { + auto *assert_abort = + llvm::BasicBlock::Create(*llvm_context, "assert_abort", func); + auto *assert_cont = + llvm::BasicBlock::Create(*llvm_context, "assert_cont", func); + auto *failed = + builder->CreateICmpNE(llvm_val[stmt], tlctx->get_constant(0)); + builder->CreateCondBr(failed, assert_abort, assert_cont); + builder->SetInsertPoint(assert_abort); + builder->CreateRetVoid(); + builder->SetInsertPoint(assert_cont); + } } void TaskCodeGenLLVM::visit(SNodeOpStmt *stmt) { diff --git a/quadrants/program/context.h b/quadrants/program/context.h index e1d307220..706f65258 100644 --- a/quadrants/program/context.h +++ b/quadrants/program/context.h @@ -25,9 +25,11 @@ struct RuntimeContext { // RuntimeContext which each function have one. uint64_t *result_buffer; - // On CPU, points to a jmp_buf used to abort kernel execution when a runtime - // assertion (e.g. out-of-bounds check) fails. NULL when no guard is active. - void *cpu_abort_jmp_buf{nullptr}; + // Set to 1 by quadrants_assert_format_ctx when a runtime assertion (e.g. + // out-of-bounds check) fails on CPU. The codegen emits an early return + // after each assert call when this is set, and the task runner breaks out + // of its loop. 
+ int32_t cpu_assert_failed{0}; }; #if defined(QD_RUNTIME_HOST) diff --git a/quadrants/runtime/cpu/kernel_launcher.cpp b/quadrants/runtime/cpu/kernel_launcher.cpp index 00020bfb2..dd686a554 100644 --- a/quadrants/runtime/cpu/kernel_launcher.cpp +++ b/quadrants/runtime/cpu/kernel_launcher.cpp @@ -1,6 +1,5 @@ #include "quadrants/runtime/cpu/kernel_launcher.h" #include "quadrants/rhi/arch.h" -#include <csetjmp> namespace quadrants::lang { namespace cpu { @@ -42,15 +41,12 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, } } } - std::jmp_buf abort_buf; - ctx.get_context().cpu_abort_jmp_buf = &abort_buf; + ctx.get_context().cpu_assert_failed = 0; for (auto task : launcher_ctx.task_funcs) { - if (setjmp(abort_buf) != 0) { - break; - } task(&ctx.get_context()); + if (ctx.get_context().cpu_assert_failed) + break; } - ctx.get_context().cpu_abort_jmp_buf = nullptr; } KernelLauncher::Handle KernelLauncher::register_llvm_kernel( diff --git a/quadrants/runtime/llvm/runtime_module/runtime.cpp b/quadrants/runtime/llvm/runtime_module/runtime.cpp index e83382172..8287f930e 100644 --- a/quadrants/runtime/llvm/runtime_module/runtime.cpp +++ b/quadrants/runtime/llvm/runtime_module/runtime.cpp @@ -14,7 +14,6 @@ #endif #include -#include <csetjmp> #include #include #include @@ -366,7 +365,7 @@ STRUCT_FIELD_ARRAY(PhysicalCoordinates, val); STRUCT_FIELD(RuntimeContext, runtime); STRUCT_FIELD(RuntimeContext, result_buffer) -STRUCT_FIELD(RuntimeContext, cpu_abort_jmp_buf) +STRUCT_FIELD(RuntimeContext, cpu_assert_failed) #include "quadrants/runtime/llvm/runtime_module/atomic.h" @@ -889,20 +888,23 @@ void quadrants_assert_format(LLVMRuntime *runtime, } // Context-aware variant called by bounds-check assertions in JIT'd code. -// On CPU, longjmps to the task runner's abort guard when an assertion fails, -// preventing the subsequent out-of-bounds memory access from segfaulting. 
-void quadrants_assert_format_ctx(RuntimeContext *context, - u1 test, - const char *format, - int num_arguments, - uint64 *arguments) { +// Returns 1 when the assertion failed (so the codegen can emit an early +// return), 0 otherwise. This replaces a previous setjmp/longjmp approach +// that crashed on Windows because JIT'd frames lack SEH unwind tables. +i32 quadrants_assert_format_ctx(RuntimeContext *context, + u1 test, + const char *format, + int num_arguments, + uint64 *arguments) { quadrants_assert_format(context->runtime, test, format, num_arguments, arguments); #if !ARCH_cuda && !ARCH_amdgpu - if (enable_assert && test == 0 && context->cpu_abort_jmp_buf) { - std::longjmp(*(std::jmp_buf *)context->cpu_abort_jmp_buf, 1); + if (enable_assert && test == 0) { + context->cpu_assert_failed = 1; + return 1; } #endif + return 0; } void quadrants_assert_runtime(LLVMRuntime *runtime, u1 test, const char *msg) { @@ -1529,19 +1531,12 @@ void cpu_struct_for_block_helper(void *ctx_, int thread_id, int i) { RuntimeContext this_thread_context = *ctx->context; this_thread_context.cpu_thread_id = thread_id; - - std::jmp_buf abort_buf; - this_thread_context.cpu_abort_jmp_buf = &abort_buf; - if (setjmp(abort_buf) != 0) { - return; - } + this_thread_context.cpu_assert_failed = 0; if (lower < upper) { (*ctx->task)(&this_thread_context, tls_buffer, &ctx->list->get(element_id), lower, upper); } - - this_thread_context.cpu_abort_jmp_buf = nullptr; } void parallel_struct_for(RuntimeContext *context, @@ -1619,29 +1614,27 @@ void cpu_parallel_range_for_task(void *range_context, RuntimeContext this_thread_context = *ctx.context; this_thread_context.cpu_thread_id = thread_id; - - std::jmp_buf abort_buf; - this_thread_context.cpu_abort_jmp_buf = &abort_buf; - if (setjmp(abort_buf) != 0) { - return; - } + this_thread_context.cpu_assert_failed = 0; if (ctx.step == 1) { int block_start = ctx.begin + task_id * ctx.block_size; int block_end = std::min(block_start + ctx.block_size, ctx.end); 
for (int i = block_start; i < block_end; i++) { ctx.body(&this_thread_context, tls_ptr, i); + if (this_thread_context.cpu_assert_failed) + break; } } else if (ctx.step == -1) { int block_start = ctx.end - task_id * ctx.block_size; int block_end = std::max(ctx.begin, block_start * ctx.block_size); for (int i = block_start - 1; i >= block_end; i--) { ctx.body(&this_thread_context, tls_ptr, i); + if (this_thread_context.cpu_assert_failed) + break; } } - this_thread_context.cpu_abort_jmp_buf = nullptr; - if (ctx.epilogue) + if (!this_thread_context.cpu_assert_failed && ctx.epilogue) ctx.epilogue(ctx.context, tls_ptr); } @@ -1726,12 +1719,7 @@ void cpu_parallel_mesh_for_task(void *range_context, RuntimeContext this_thread_context = *ctx.context; this_thread_context.cpu_thread_id = thread_id; - - std::jmp_buf abort_buf; - this_thread_context.cpu_abort_jmp_buf = &abort_buf; - if (setjmp(abort_buf) != 0) { - return; - } + this_thread_context.cpu_assert_failed = 0; int block_start = task_id * ctx.block_size; int block_end = std::min(block_start + ctx.block_size, ctx.num_patches); @@ -1740,11 +1728,11 @@ void cpu_parallel_mesh_for_task(void *range_context, if (ctx.prologue) ctx.prologue(ctx.context, tls_ptr, idx); ctx.body(&this_thread_context, tls_ptr, idx); + if (this_thread_context.cpu_assert_failed) + break; if (ctx.epilogue) ctx.epilogue(ctx.context, tls_ptr, idx); } - - this_thread_context.cpu_abort_jmp_buf = nullptr; } void cpu_parallel_mesh_for(RuntimeContext *context,