From 705c43d941a99464481ff2d3ca84fe2bba543e97 Mon Sep 17 00:00:00 2001
From: Hugh Perkins <hughperkins@gmail.com>
Date: Sun, 15 Mar 2026 21:45:46 -0700
Subject: [PATCH 1/3] [CPU] Abort kernel execution on assertion failure instead
 of segfaulting

On CPU, when debug mode is enabled, out-of-bounds array accesses trigger
a runtime assertion that records the error but allows execution to
continue -- leading to a SIGSEGV before Python can retrieve the error.

Fix this by using setjmp/longjmp: each CPU task runner (range_for,
struct_for, mesh_for, serial) sets up a jmp_buf via RuntimeContext, and
the new quadrants_assert_format_ctx function longjmps back on failure.
The existing check_runtime_error path then raises QuadrantsAssertionError.

GPU architectures are unaffected (they already kill threads via asm).

Made-with: Cursor
---
 quadrants/codegen/llvm/codegen_llvm.cpp       | 11 ++-
 quadrants/program/context.h                   |  4 +
 quadrants/runtime/cpu/kernel_launcher.cpp     |  7 ++
 .../runtime/llvm/runtime_module/runtime.cpp   | 45 ++++++++++
 tests/python/test_debug.py                    | 88 +++++++++++++++++++
 5 files changed, 153 insertions(+), 2 deletions(-)
diff --git a/quadrants/codegen/llvm/codegen_llvm.cpp b/quadrants/codegen/llvm/codegen_llvm.cpp
index b7c5b302b..cb0048c9c 100644
--- a/quadrants/codegen/llvm/codegen_llvm.cpp
+++ b/quadrants/codegen/llvm/codegen_llvm.cpp
@@ -1332,7 +1332,11 @@ void TaskCodeGenLLVM::visit(AssertStmt *stmt) {
   auto arguments = create_entry_block_alloca(argument_buffer_size);
 
   std::vector<llvm::Value *> args;
-  args.emplace_back(get_runtime());
+  // On CPU, use the context-aware variant that can longjmp to abort the kernel
+  // task, preventing segfaults from subsequent out-of-bounds memory accesses.
+  // On GPU, the original variant suffices because asm("exit;") kills the thread.
+  bool use_ctx_variant = arch_is_cpu(current_arch());
+  args.emplace_back(use_ctx_variant ? get_context() : get_runtime());
   args.emplace_back(builder->CreateIsNotNull(llvm_val[stmt->cond]));
   args.emplace_back(builder->CreateGlobalStringPtr(stmt->text));
 
@@ -1362,7 +1366,10 @@ void TaskCodeGenLLVM::visit(AssertStmt *stmt) {
       builder->CreateGEP(argument_buffer_size, arguments,
                          {tlctx->get_constant(0), tlctx->get_constant(0)}));
 
-  llvm_val[stmt] = call("quadrants_assert_format", std::move(args));
+  llvm_val[stmt] = call(
+      use_ctx_variant ? "quadrants_assert_format_ctx"
+                      : "quadrants_assert_format",
+      std::move(args));
 }
 
 void TaskCodeGenLLVM::visit(SNodeOpStmt *stmt) {
diff --git a/quadrants/program/context.h b/quadrants/program/context.h
index a75fad53c..e1d307220 100644
--- a/quadrants/program/context.h
+++ b/quadrants/program/context.h
@@ -24,6 +24,10 @@ struct RuntimeContext {
   // LLVMRuntime is shared among functions. So we moved the pointer to
   // RuntimeContext which each function have one.
   uint64_t *result_buffer;
+
+  // On CPU, points to a jmp_buf used to abort kernel execution when a runtime
+  // assertion (e.g. out-of-bounds check) fails. NULL when no guard is active.
+  void *cpu_abort_jmp_buf{nullptr};
 };
 
 #if defined(QD_RUNTIME_HOST)
diff --git a/quadrants/runtime/cpu/kernel_launcher.cpp b/quadrants/runtime/cpu/kernel_launcher.cpp
index d7dd8df25..00020bfb2 100644
--- a/quadrants/runtime/cpu/kernel_launcher.cpp
+++ b/quadrants/runtime/cpu/kernel_launcher.cpp
@@ -1,5 +1,6 @@
 #include "quadrants/runtime/cpu/kernel_launcher.h"
 #include "quadrants/rhi/arch.h"
+#include <csetjmp>
 
 namespace quadrants::lang {
 namespace cpu {
@@ -41,9 +42,15 @@ void KernelLauncher::launch_llvm_kernel(Handle handle,
       }
     }
   }
+  std::jmp_buf abort_buf;
+  ctx.get_context().cpu_abort_jmp_buf = &abort_buf;
   for (auto task : launcher_ctx.task_funcs) {
+    if (setjmp(abort_buf) != 0) {
+      break;
+    }
     task(&ctx.get_context());
   }
+  ctx.get_context().cpu_abort_jmp_buf = nullptr;
 }
 
 KernelLauncher::Handle KernelLauncher::register_llvm_kernel(
diff --git a/quadrants/runtime/llvm/runtime_module/runtime.cpp b/quadrants/runtime/llvm/runtime_module/runtime.cpp
index cfd76e999..e83382172 100644
--- a/quadrants/runtime/llvm/runtime_module/runtime.cpp
+++ b/quadrants/runtime/llvm/runtime_module/runtime.cpp
@@ -14,6 +14,7 @@
 #endif
 
 #include <atomic>
+#include <csetjmp>
 #include <cstdint>
 #include <cmath>
 #include <cstdarg>
@@ -365,6 +366,7 @@ STRUCT_FIELD_ARRAY(PhysicalCoordinates, val);
 
 STRUCT_FIELD(RuntimeContext, runtime);
 STRUCT_FIELD(RuntimeContext, result_buffer)
+STRUCT_FIELD(RuntimeContext, cpu_abort_jmp_buf)
 
 #include "quadrants/runtime/llvm/runtime_module/atomic.h"
 
@@ -886,6 +888,23 @@ void quadrants_assert_format(LLVMRuntime *runtime,
 #endif
 }
 
+// Context-aware variant called by bounds-check assertions in JIT'd code.
+// On CPU, longjmps to the task runner's abort guard when an assertion fails,
+// preventing the subsequent out-of-bounds memory access from segfaulting.
+void quadrants_assert_format_ctx(RuntimeContext *context,
+                                 u1 test,
+                                 const char *format,
+                                 int num_arguments,
+                                 uint64 *arguments) {
+  quadrants_assert_format(context->runtime, test, format, num_arguments,
+                          arguments);
+#if !ARCH_cuda && !ARCH_amdgpu
+  if (enable_assert && test == 0 && context->cpu_abort_jmp_buf) {
+    std::longjmp(*(std::jmp_buf *)context->cpu_abort_jmp_buf, 1);
+  }
+#endif
+}
+
 void quadrants_assert_runtime(LLVMRuntime *runtime, u1 test, const char *msg) {
   quadrants_assert_format(runtime, test, msg, 0, nullptr);
 }
@@ -1510,10 +1529,19 @@ void cpu_struct_for_block_helper(void *ctx_, int thread_id, int i) {
 
   RuntimeContext this_thread_context = *ctx->context;
   this_thread_context.cpu_thread_id = thread_id;
+
+  std::jmp_buf abort_buf;
+  this_thread_context.cpu_abort_jmp_buf = &abort_buf;
+  if (setjmp(abort_buf) != 0) {
+    return;
+  }
+
   if (lower < upper) {
     (*ctx->task)(&this_thread_context, tls_buffer,
                  &ctx->list->get<Element>(element_id), lower, upper);
   }
+
+  this_thread_context.cpu_abort_jmp_buf = nullptr;
 }
 
 void parallel_struct_for(RuntimeContext *context,
@@ -1591,6 +1619,13 @@ void cpu_parallel_range_for_task(void *range_context,
 
   RuntimeContext this_thread_context = *ctx.context;
   this_thread_context.cpu_thread_id = thread_id;
+
+  std::jmp_buf abort_buf;
+  this_thread_context.cpu_abort_jmp_buf = &abort_buf;
+  if (setjmp(abort_buf) != 0) {
+    return;
+  }
+
   if (ctx.step == 1) {
     int block_start = ctx.begin + task_id * ctx.block_size;
     int block_end = std::min(block_start + ctx.block_size, ctx.end);
@@ -1604,6 +1639,8 @@ void cpu_parallel_range_for_task(void *range_context,
       ctx.body(&this_thread_context, tls_ptr, i);
     }
   }
+
+  this_thread_context.cpu_abort_jmp_buf = nullptr;
   if (ctx.epilogue)
     ctx.epilogue(ctx.context, tls_ptr);
 }
@@ -1690,6 +1727,12 @@ void cpu_parallel_mesh_for_task(void *range_context,
   RuntimeContext this_thread_context = *ctx.context;
   this_thread_context.cpu_thread_id = thread_id;
 
+  std::jmp_buf abort_buf;
+  this_thread_context.cpu_abort_jmp_buf = &abort_buf;
+  if (setjmp(abort_buf) != 0) {
+    return;
+  }
+
   int block_start = task_id * ctx.block_size;
   int block_end = std::min(block_start + ctx.block_size, ctx.num_patches);
 
@@ -1700,6 +1743,8 @@ void cpu_parallel_mesh_for_task(void *range_context,
     if (ctx.epilogue)
       ctx.epilogue(ctx.context, tls_ptr, idx);
   }
+
+  this_thread_context.cpu_abort_jmp_buf = nullptr;
 }
 
 void cpu_parallel_mesh_for(RuntimeContext *context,
diff --git a/tests/python/test_debug.py b/tests/python/test_debug.py
index 828639eae..78fc68db2 100644
--- a/tests/python/test_debug.py
+++ b/tests/python/test_debug.py
@@ -1,5 +1,6 @@
 import platform
 
+import numpy as np
 import pytest
 
 import quadrants as qd
@@ -135,3 +136,90 @@ def func():
         x[3, 7] = 2
 
     func()
+
+
+@test_utils.test(
+    arch=[qd.cpu],
+    require=qd.extension.assertion,
+    debug=True,
+    check_out_of_bound=True,
+    gdb_trigger=False,
+)
+def test_ndarray_oob_cpu_raises_not_segfaults():
+    """Out-of-bounds ndarray access in a parallel kernel on CPU should raise
+    QuadrantsAssertionError instead of segfaulting."""
+    arr = qd.ndarray(dtype=qd.f32, shape=(4,))
+
+    @qd.kernel
+    def write_oob(a: qd.types.ndarray(dtype=qd.f32, ndim=1)):
+        for i in range(10):
+            a[i] = 1.0
+
+    with pytest.raises(AssertionError, match=r"Out of bound access"):
+        write_oob(arr)
+
+
+@test_utils.test(
+    arch=[qd.cpu],
+    require=qd.extension.assertion,
+    debug=True,
+    check_out_of_bound=True,
+    gdb_trigger=False,
+)
+def test_ndarray_oob_cpu_small_array():
+    """Reproduces the pattern from the temperature-sensor segfault: a kernel
+    accesses a very small (shape-1) array with an index that goes out of
+    bounds.  Before the fix this would SIGSEGV on CPU in debug mode."""
+    small = qd.ndarray(dtype=qd.f32, shape=(1,))
+    small.fill(42.0)
+
+    @qd.kernel
+    def read_oob(a: qd.types.ndarray(dtype=qd.f32, ndim=1)) -> qd.f32:
+        return a[5]
+
+    with pytest.raises(AssertionError, match=r"Out of bound access"):
+        read_oob(small)
+
+
+@test_utils.test(
+    arch=[qd.cpu],
+    require=qd.extension.assertion,
+    debug=True,
+    check_out_of_bound=True,
+    gdb_trigger=False,
+)
+def test_ndarray_oob_cpu_2d():
+    """2D ndarray out-of-bounds on CPU should produce a clear error."""
+    arr = qd.ndarray(dtype=qd.f32, shape=(3, 4))
+
+    @qd.kernel
+    def write_oob_2d(a: qd.types.ndarray(dtype=qd.f32, ndim=2)):
+        for i in range(1):
+            a[10, 0] = 1.0
+
+    with pytest.raises(AssertionError, match=r"Out of bound access"):
+        write_oob_2d(arr)
+
+
+@test_utils.test(
+    arch=[qd.cpu],
+    require=qd.extension.assertion,
+    debug=True,
+    check_out_of_bound=True,
+    gdb_trigger=False,
+)
+def test_ndarray_inbounds_cpu_still_works():
+    """Verify that the CPU assert-abort mechanism does not break normal
+    in-bounds ndarray access."""
+    n = 8
+    arr = qd.ndarray(dtype=qd.f32, shape=(n,))
+
+    @qd.kernel
+    def fill(a: qd.types.ndarray(dtype=qd.f32, ndim=1)):
+        for i in range(n):
+            a[i] = qd.cast(i * 10, qd.f32)
+
+    fill(arr)
+    result = arr.to_numpy()
+    for i in range(n):
+        assert result[i] == pytest.approx(i * 10)

From 8f51ba5be5b2d9c4caa6aad4f085d12f526d60ba Mon Sep 17 00:00:00 2001
From: Hugh Perkins <hughperkins@gmail.com>
Date: Mon, 16 Mar 2026 09:19:06 -0700
Subject: [PATCH 2/3] Fix pre-commit lint: clang-format and unused import

---
 quadrants/codegen/llvm/codegen_llvm.cpp | 10 +++++-----
 tests/python/test_debug.py              |  1 -
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/quadrants/codegen/llvm/codegen_llvm.cpp b/quadrants/codegen/llvm/codegen_llvm.cpp
index cb0048c9c..2cbbdf362 100644
--- a/quadrants/codegen/llvm/codegen_llvm.cpp
+++ b/quadrants/codegen/llvm/codegen_llvm.cpp
@@ -1334,7 +1334,8 @@ void TaskCodeGenLLVM::visit(AssertStmt *stmt) {
   std::vector<llvm::Value *> args;
   // On CPU, use the context-aware variant that can longjmp to abort the kernel
   // task, preventing segfaults from subsequent out-of-bounds memory accesses.
-  // On GPU, the original variant suffices because asm("exit;") kills the thread.
+  // On GPU, the original variant suffices because asm("exit;") kills the
+  // thread.
   bool use_ctx_variant = arch_is_cpu(current_arch());
   args.emplace_back(use_ctx_variant ? get_context() : get_runtime());
   args.emplace_back(builder->CreateIsNotNull(llvm_val[stmt->cond]));
@@ -1366,10 +1367,9 @@ void TaskCodeGenLLVM::visit(AssertStmt *stmt) {
       builder->CreateGEP(argument_buffer_size, arguments,
                          {tlctx->get_constant(0), tlctx->get_constant(0)}));
 
-  llvm_val[stmt] = call(
-      use_ctx_variant ? "quadrants_assert_format_ctx"
-                      : "quadrants_assert_format",
-      std::move(args));
+  llvm_val[stmt] = call(use_ctx_variant ? "quadrants_assert_format_ctx"
+                                        : "quadrants_assert_format",
+                        std::move(args));
 }
 
 void TaskCodeGenLLVM::visit(SNodeOpStmt *stmt) {
diff --git a/tests/python/test_debug.py b/tests/python/test_debug.py
index 78fc68db2..7049424ac 100644
--- a/tests/python/test_debug.py
+++ b/tests/python/test_debug.py
@@ -1,6 +1,5 @@
 import platform
 
-import numpy as np
 import pytest
 
 import quadrants as qd

From f83fca99be26c93f4c9164d6676b1bc9f4b547ad Mon Sep 17 00:00:00 2001
From: Hugh Perkins <hughperkins@gmail.com>
Date: Mon, 16 Mar 2026 11:06:54 -0700
Subject: [PATCH 3/3] Replace setjmp/longjmp with flag-based early return for
 CPU assert abort
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On Windows x64, longjmp performs SEH-based stack unwinding which requires
proper unwind tables (.pdata/.xdata) for every frame on the call stack.
JIT-compiled code does not register these tables, so longjmp from JIT'd
code crashes the process — causing all Windows OOB-check tests to fail
with worker crashes.

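Replace the mechanism: quadrants_assert_format_ctx now sets
RuntimeContext::cpu_assert_failed and returns 1 on failure instead of
calling longjmp, and the codegen emits a conditional ret-void after each
assert call on CPU.  The range-for and mesh-for task runners check the
flag on their per-thread context copy after each body call to break out
of their loops, and the kernel launcher checks the launch context after
each task to skip the remaining tasks.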
---
 quadrants/codegen/llvm/codegen_llvm.cpp       | 20 +++++--
 quadrants/program/context.h                   |  8 ++-
 quadrants/runtime/cpu/kernel_launcher.cpp     | 10 +---
 .../runtime/llvm/runtime_module/runtime.cpp   | 58 ++++++++-----------
 4 files changed, 47 insertions(+), 49 deletions(-)

diff --git a/quadrants/codegen/llvm/codegen_llvm.cpp b/quadrants/codegen/llvm/codegen_llvm.cpp
index 2cbbdf362..dba5917e8 100644
--- a/quadrants/codegen/llvm/codegen_llvm.cpp
+++ b/quadrants/codegen/llvm/codegen_llvm.cpp
@@ -1332,10 +1332,9 @@ void TaskCodeGenLLVM::visit(AssertStmt *stmt) {
   auto arguments = create_entry_block_alloca(argument_buffer_size);
 
   std::vector<llvm::Value *> args;
-  // On CPU, use the context-aware variant that can longjmp to abort the kernel
-  // task, preventing segfaults from subsequent out-of-bounds memory accesses.
-  // On GPU, the original variant suffices because asm("exit;") kills the
-  // thread.
+  // On CPU, use the context-aware variant that returns non-zero on failure
+  // so we can emit an early return and avoid the subsequent out-of-bounds
+  // memory access.  On GPU, asm("exit;") kills the thread directly.
   bool use_ctx_variant = arch_is_cpu(current_arch());
   args.emplace_back(use_ctx_variant ? get_context() : get_runtime());
   args.emplace_back(builder->CreateIsNotNull(llvm_val[stmt->cond]));
@@ -1370,6 +1369,19 @@ void TaskCodeGenLLVM::visit(AssertStmt *stmt) {
   llvm_val[stmt] = call(use_ctx_variant ? "quadrants_assert_format_ctx"
                                         : "quadrants_assert_format",
                         std::move(args));
+
+  if (use_ctx_variant) {
+    auto *assert_abort =
+        llvm::BasicBlock::Create(*llvm_context, "assert_abort", func);
+    auto *assert_cont =
+        llvm::BasicBlock::Create(*llvm_context, "assert_cont", func);
+    auto *failed =
+        builder->CreateICmpNE(llvm_val[stmt], tlctx->get_constant(0));
+    builder->CreateCondBr(failed, assert_abort, assert_cont);
+    builder->SetInsertPoint(assert_abort);
+    builder->CreateRetVoid();
+    builder->SetInsertPoint(assert_cont);
+  }
 }
 
 void TaskCodeGenLLVM::visit(SNodeOpStmt *stmt) {
diff --git a/quadrants/program/context.h b/quadrants/program/context.h
index e1d307220..706f65258 100644
--- a/quadrants/program/context.h
+++ b/quadrants/program/context.h
@@ -25,9 +25,11 @@ struct RuntimeContext {
   // RuntimeContext which each function have one.
   uint64_t *result_buffer;
 
-  // On CPU, points to a jmp_buf used to abort kernel execution when a runtime
-  // assertion (e.g. out-of-bounds check) fails. NULL when no guard is active.
-  void *cpu_abort_jmp_buf{nullptr};
+  // Set to 1 by quadrants_assert_format_ctx when a runtime assertion (e.g.
+  // out-of-bounds check) fails on CPU.  JIT'd code returns early based on
+  // that call's return value; CPU task runners read this flag to stop
+  // iterating and the kernel launcher reads it to skip remaining tasks.
+  int32_t cpu_assert_failed{0};
 };
 
 #if defined(QD_RUNTIME_HOST)
diff --git a/quadrants/runtime/cpu/kernel_launcher.cpp b/quadrants/runtime/cpu/kernel_launcher.cpp
index 00020bfb2..dd686a554 100644
--- a/quadrants/runtime/cpu/kernel_launcher.cpp
+++ b/quadrants/runtime/cpu/kernel_launcher.cpp
@@ -1,6 +1,5 @@
 #include "quadrants/runtime/cpu/kernel_launcher.h"
 #include "quadrants/rhi/arch.h"
-#include <csetjmp>
 
 namespace quadrants::lang {
 namespace cpu {
@@ -42,15 +41,12 @@ void KernelLauncher::launch_llvm_kernel(Handle handle,
       }
     }
   }
-  std::jmp_buf abort_buf;
-  ctx.get_context().cpu_abort_jmp_buf = &abort_buf;
+  ctx.get_context().cpu_assert_failed = 0;
   for (auto task : launcher_ctx.task_funcs) {
-    if (setjmp(abort_buf) != 0) {
-      break;
-    }
     task(&ctx.get_context());
+    if (ctx.get_context().cpu_assert_failed)
+      break;
   }
-  ctx.get_context().cpu_abort_jmp_buf = nullptr;
 }
 
 KernelLauncher::Handle KernelLauncher::register_llvm_kernel(
diff --git a/quadrants/runtime/llvm/runtime_module/runtime.cpp b/quadrants/runtime/llvm/runtime_module/runtime.cpp
index e83382172..8287f930e 100644
--- a/quadrants/runtime/llvm/runtime_module/runtime.cpp
+++ b/quadrants/runtime/llvm/runtime_module/runtime.cpp
@@ -14,7 +14,6 @@
 #endif
 
 #include <atomic>
-#include <csetjmp>
 #include <cstdint>
 #include <cmath>
 #include <cstdarg>
@@ -366,7 +365,7 @@ STRUCT_FIELD_ARRAY(PhysicalCoordinates, val);
 
 STRUCT_FIELD(RuntimeContext, runtime);
 STRUCT_FIELD(RuntimeContext, result_buffer)
-STRUCT_FIELD(RuntimeContext, cpu_abort_jmp_buf)
+STRUCT_FIELD(RuntimeContext, cpu_assert_failed)
 
 #include "quadrants/runtime/llvm/runtime_module/atomic.h"
 
@@ -889,20 +888,23 @@ void quadrants_assert_format(LLVMRuntime *runtime,
 }
 
 // Context-aware variant called by bounds-check assertions in JIT'd code.
-// On CPU, longjmps to the task runner's abort guard when an assertion fails,
-// preventing the subsequent out-of-bounds memory access from segfaulting.
-void quadrants_assert_format_ctx(RuntimeContext *context,
-                                 u1 test,
-                                 const char *format,
-                                 int num_arguments,
-                                 uint64 *arguments) {
+// Returns 1 when the assertion failed (so the codegen can emit an early
+// return), 0 otherwise.  This replaces a previous setjmp/longjmp approach
+// that crashed on Windows because JIT'd frames lack SEH unwind tables.
+i32 quadrants_assert_format_ctx(RuntimeContext *context,
+                                u1 test,
+                                const char *format,
+                                int num_arguments,
+                                uint64 *arguments) {
   quadrants_assert_format(context->runtime, test, format, num_arguments,
                           arguments);
 #if !ARCH_cuda && !ARCH_amdgpu
-  if (enable_assert && test == 0 && context->cpu_abort_jmp_buf) {
-    std::longjmp(*(std::jmp_buf *)context->cpu_abort_jmp_buf, 1);
+  if (enable_assert && test == 0) {
+    context->cpu_assert_failed = 1;
+    return 1;
   }
 #endif
+  return 0;
 }
 
 void quadrants_assert_runtime(LLVMRuntime *runtime, u1 test, const char *msg) {
@@ -1529,19 +1531,12 @@ void cpu_struct_for_block_helper(void *ctx_, int thread_id, int i) {
 
   RuntimeContext this_thread_context = *ctx->context;
   this_thread_context.cpu_thread_id = thread_id;
-
-  std::jmp_buf abort_buf;
-  this_thread_context.cpu_abort_jmp_buf = &abort_buf;
-  if (setjmp(abort_buf) != 0) {
-    return;
-  }
+  this_thread_context.cpu_assert_failed = 0;
 
   if (lower < upper) {
     (*ctx->task)(&this_thread_context, tls_buffer,
                  &ctx->list->get<Element>(element_id), lower, upper);
   }
-
-  this_thread_context.cpu_abort_jmp_buf = nullptr;
 }
 
 void parallel_struct_for(RuntimeContext *context,
@@ -1619,29 +1614,27 @@ void cpu_parallel_range_for_task(void *range_context,
 
   RuntimeContext this_thread_context = *ctx.context;
   this_thread_context.cpu_thread_id = thread_id;
-
-  std::jmp_buf abort_buf;
-  this_thread_context.cpu_abort_jmp_buf = &abort_buf;
-  if (setjmp(abort_buf) != 0) {
-    return;
-  }
+  this_thread_context.cpu_assert_failed = 0;
 
   if (ctx.step == 1) {
     int block_start = ctx.begin + task_id * ctx.block_size;
     int block_end = std::min(block_start + ctx.block_size, ctx.end);
     for (int i = block_start; i < block_end; i++) {
       ctx.body(&this_thread_context, tls_ptr, i);
+      if (this_thread_context.cpu_assert_failed)
+        break;
     }
   } else if (ctx.step == -1) {
     int block_start = ctx.end - task_id * ctx.block_size;
     int block_end = std::max(ctx.begin, block_start * ctx.block_size);
     for (int i = block_start - 1; i >= block_end; i--) {
       ctx.body(&this_thread_context, tls_ptr, i);
+      if (this_thread_context.cpu_assert_failed)
+        break;
     }
   }
 
-  this_thread_context.cpu_abort_jmp_buf = nullptr;
-  if (ctx.epilogue)
+  if (!this_thread_context.cpu_assert_failed && ctx.epilogue)
     ctx.epilogue(ctx.context, tls_ptr);
 }
 
@@ -1726,12 +1719,7 @@ void cpu_parallel_mesh_for_task(void *range_context,
 
   RuntimeContext this_thread_context = *ctx.context;
   this_thread_context.cpu_thread_id = thread_id;
-
-  std::jmp_buf abort_buf;
-  this_thread_context.cpu_abort_jmp_buf = &abort_buf;
-  if (setjmp(abort_buf) != 0) {
-    return;
-  }
+  this_thread_context.cpu_assert_failed = 0;
 
   int block_start = task_id * ctx.block_size;
   int block_end = std::min(block_start + ctx.block_size, ctx.num_patches);
@@ -1740,11 +1728,11 @@ void cpu_parallel_mesh_for_task(void *range_context,
     if (ctx.prologue)
       ctx.prologue(ctx.context, tls_ptr, idx);
     ctx.body(&this_thread_context, tls_ptr, idx);
+    if (this_thread_context.cpu_assert_failed)
+      break;
     if (ctx.epilogue)
       ctx.epilogue(ctx.context, tls_ptr, idx);
   }
-
-  this_thread_context.cpu_abort_jmp_buf = nullptr;
 }
 
 void cpu_parallel_mesh_for(RuntimeContext *context,