diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e6f73e193..e8e9e3c0c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -22,6 +22,26 @@ jobs: with: extra_args: --from-ref ${{ github.event.pull_request.base.sha }} --to-ref ${{ github.event.pull_request.head.sha }} + ut-cpp: + needs: pre-commit + runs-on: ubuntu-latest + timeout-minutes: 10 + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Install dependencies + run: sudo apt-get update && sudo apt-get install -y cmake ninja-build + + - name: Configure C++ tests + run: cmake -B tests/ut/cpp/build -S tests/ut/cpp -G Ninja + + - name: Build C++ tests + run: cmake --build tests/ut/cpp/build + + - name: Run C++ tests + run: ctest --test-dir tests/ut/cpp/build --output-on-failure -L no_hardware + ut-py: needs: pre-commit runs-on: ubuntu-latest diff --git a/tests/ut/cpp/CMakeLists.txt b/tests/ut/cpp/CMakeLists.txt new file mode 100644 index 000000000..fe6987bff --- /dev/null +++ b/tests/ut/cpp/CMakeLists.txt @@ -0,0 +1,120 @@ +cmake_minimum_required(VERSION 3.15) +project(pto_runtime_tests CXX) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +# Disable profiling to avoid device_time.h dependency in hot paths +add_compile_definitions( + PTO2_PROFILING=0 + PTO2_ORCH_PROFILING=0 + PTO2_SCHED_PROFILING=0 + PTO2_TENSORMAP_PROFILING=0 + PTO2_SPIN_VERBOSE_LOGGING=0 + _GLIBCXX_USE_CXX11_ABI=0 +) + +# GoogleTest: prefer system installation, fallback to FetchContent +find_package(GTest QUIET) +if(NOT GTest_FOUND) + include(FetchContent) + FetchContent_Declare(googletest + GIT_REPOSITORY https://github.com/google/googletest.git + GIT_TAG v1.14.0 + ) + set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) + FetchContent_MakeAvailable(googletest) +endif() + +enable_testing() + +# Source directories (use a2a3 as the reference arch for UT) +set(PROJECT_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..) 
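+
+# PROJECT_ROOT resolves to the repository root (this file lives three levels
+# down, in tests/ut/cpp). To reproduce the CI job locally, configure, build,
+# and run exactly as the workflow above does:
+#   cmake -B tests/ut/cpp/build -S tests/ut/cpp -G Ninja
+#   cmake --build tests/ut/cpp/build
+#   ctest --test-dir tests/ut/cpp/build --output-on-failure -L no_hardware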
+set(TMR_RUNTIME_DIR ${PROJECT_ROOT}/src/a2a3/runtime/tensormap_and_ringbuffer/runtime) +set(TMR_ORCH_DIR ${PROJECT_ROOT}/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration) +set(TMR_COMMON_DIR ${PROJECT_ROOT}/src/a2a3/runtime/tensormap_and_ringbuffer/common) +set(PLATFORM_INCLUDE_DIR ${PROJECT_ROOT}/src/a2a3/platform/include) +set(COMMON_TASK_DIR ${PROJECT_ROOT}/src/common/task_interface) + +set(COMMON_INCLUDE_DIRS + ${TMR_RUNTIME_DIR} + ${TMR_ORCH_DIR} + ${TMR_COMMON_DIR} + ${PLATFORM_INCLUDE_DIR} + ${COMMON_TASK_DIR} +) + +# Determine the GTest link target name +if(TARGET GTest::gtest_main) + set(GTEST_TARGET GTest::gtest_main) +else() + set(GTEST_TARGET gtest_main) +endif() + +# Stub sources +set(STUB_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/stubs/test_stubs.cpp) + +# Helper: add a test target (only if source file exists) +function(add_gtest_target name) + cmake_parse_arguments(ARG "" "" "SOURCES;EXTRA_SOURCES" ${ARGN}) + # Check all source files exist + foreach(src ${ARG_SOURCES}) + if(NOT IS_ABSOLUTE "${src}") + set(src "${CMAKE_CURRENT_SOURCE_DIR}/${src}") + endif() + if(NOT EXISTS "${src}") + message(STATUS "Skipping ${name}: ${src} not found") + return() + endif() + endforeach() + add_executable(${name} ${ARG_SOURCES} ${STUB_SOURCES} ${ARG_EXTRA_SOURCES}) + target_include_directories(${name} PRIVATE ${COMMON_INCLUDE_DIRS}) + target_link_libraries(${name} ${GTEST_TARGET}) + add_test(NAME ${name} COMMAND ${name}) + set_tests_properties(${name} PROPERTIES LABELS "no_hardware") +endfunction() + +# --- Header-only tests (no runtime .cpp sources needed) --- + +add_gtest_target(test_submit_types SOURCES test_submit_types.cpp) +add_gtest_target(test_core_types SOURCES test_core_types.cpp) +add_gtest_target(test_tensor SOURCES test_tensor.cpp) + +# --- Tests requiring runtime .cpp sources --- + +add_gtest_target(test_shared_memory + SOURCES test_shared_memory.cpp + EXTRA_SOURCES ${TMR_RUNTIME_DIR}/pto_shared_memory.cpp +) + +add_gtest_target(test_ring_buffer + SOURCES test_ring_buffer.cpp + EXTRA_SOURCES + ${TMR_RUNTIME_DIR}/pto_shared_memory.cpp + ${TMR_RUNTIME_DIR}/pto_scheduler.cpp + ${TMR_RUNTIME_DIR}/pto_ring_buffer.cpp +) + +add_gtest_target(test_tensormap + SOURCES test_tensormap.cpp + EXTRA_SOURCES + ${TMR_RUNTIME_DIR}/pto_tensormap.cpp +) + +add_gtest_target(test_ready_queue + SOURCES test_ready_queue.cpp + EXTRA_SOURCES + ${TMR_RUNTIME_DIR}/pto_scheduler.cpp + ${TMR_RUNTIME_DIR}/pto_shared_memory.cpp +) + +add_gtest_target(test_scheduler_state + SOURCES test_scheduler_state.cpp + EXTRA_SOURCES + ${TMR_RUNTIME_DIR}/pto_scheduler.cpp + ${TMR_RUNTIME_DIR}/pto_shared_memory.cpp +) + +add_gtest_target(test_pto_types SOURCES test_pto_types.cpp) + +add_gtest_target(test_dispatch_payload SOURCES test_dispatch_payload.cpp) diff --git a/tests/ut/cpp/stubs/test_stubs.cpp b/tests/ut/cpp/stubs/test_stubs.cpp new file mode 100644 index 000000000..26ff45a18 --- /dev/null +++ b/tests/ut/cpp/stubs/test_stubs.cpp @@ -0,0 +1,99 @@ +/** + * Link-time stubs for platform APIs used by runtime headers. + * + * Provides x86-compatible implementations of functions declared in + * platform headers (unified_log.h, device_time.h, common.h) so that + * runtime data structures can be unit-tested on CI runners without + * Ascend hardware or SDK. 
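+ *
+ * Because the assert_impl stub below throws AssertionError instead of
+ * aborting, tests can exercise failure paths directly, e.g.:
+ *   EXPECT_THROW(out.get_ref(0), AssertionError);
+ * (as in test_pto_types.cpp; throwing is this stub's choice and need not
+ * match the on-device behavior).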
+ */
+
+#include <chrono>
+#include <cstdarg>
+#include <cstdint>
+#include <cstdio>
+#include <stdexcept>
+#include <string>
+
+// =============================================================================
+// unified_log.h stubs (5 log-level functions)
+// =============================================================================
+
+extern "C" {
+
+void unified_log_error(const char* func, const char* fmt, ...) {
+    va_list args;
+    va_start(args, fmt);
+    fprintf(stderr, "[ERROR] %s: ", func);
+    vfprintf(stderr, fmt, args);
+    fprintf(stderr, "\n");
+    va_end(args);
+}
+
+void unified_log_warn(const char* func, const char* fmt, ...) {
+    va_list args;
+    va_start(args, fmt);
+    fprintf(stderr, "[WARN] %s: ", func);
+    vfprintf(stderr, fmt, args);
+    fprintf(stderr, "\n");
+    va_end(args);
+}
+
+void unified_log_info(const char* /* func */, const char* /* fmt */, ...) {
+    // Suppress info in tests
+}
+
+void unified_log_debug(const char* /* func */, const char* /* fmt */, ...) {
+    // Suppress debug in tests
+}
+
+void unified_log_always(const char* func, const char* fmt, ...) {
+    va_list args;
+    va_start(args, fmt);
+    fprintf(stderr, "[ALWAYS] %s: ", func);
+    vfprintf(stderr, fmt, args);
+    fprintf(stderr, "\n");
+    va_end(args);
+}
+
+}  // extern "C"
+
+// =============================================================================
+// device_time.h stub
+// =============================================================================
+
+uint64_t get_sys_cnt_aicpu() {
+    auto now = std::chrono::steady_clock::now();
+    return static_cast<uint64_t>(
+        std::chrono::duration_cast<std::chrono::nanoseconds>(now.time_since_epoch()).count());
+}
+
+// =============================================================================
+// common.h stubs (assert_impl, get_stacktrace, AssertionError)
+// =============================================================================
+
+std::string get_stacktrace(int /* skip_frames */) {
+    return "";
+}
+
+class AssertionError : public std::runtime_error {
+public:
+    AssertionError(const char* condition, const char* file, int line)
+        : std::runtime_error(std::string("Assertion failed: ") + condition + " at " + file + ":" +
+                             std::to_string(line)),
+          condition_(condition),
+          file_(file),
+          line_(line) {}
+
+    const char* condition() const { return condition_; }
+    const char* file() const { return file_; }
+    int line() const { return line_; }
+
+private:
+    const char* condition_;
+    const char* file_;
+    int line_;
+};
+
+[[noreturn]] void assert_impl(const char* condition, const char* file, int line) {
+    throw AssertionError(condition, file, line);
+}
diff --git a/tests/ut/cpp/test_core_types.cpp b/tests/ut/cpp/test_core_types.cpp
new file mode 100644
index 000000000..ee3cb247d
--- /dev/null
+++ b/tests/ut/cpp/test_core_types.cpp
@@ -0,0 +1,141 @@
+/**
+ * Unit tests for core types in pto_runtime2_types.h
+ *
+ * Tests PTO2TaskId encoding, alignment assertions, and utility macros.
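+ *
+ * Layout assumed by these tests (inferred from RingInUpperBits and
+ * MaxRingMaxLocal below, not from the header itself): the low 32 bits of
+ * .raw hold the local id and the bits above hold the ring index, i.e.
+ * raw = (uint64_t(ring) << 32) | local.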
+ */
+
+#include <gtest/gtest.h>
+
+#include "pto_runtime2_types.h"
+
+// =============================================================================
+// PTO2TaskId encoding/extraction
+// =============================================================================
+
+TEST(TaskId, DefaultIsZero) {
+    PTO2TaskId id;
+    EXPECT_EQ(id.raw, 0u);
+    EXPECT_EQ(id.ring(), 0);
+    EXPECT_EQ(id.local(), 0u);
+}
+
+TEST(TaskId, MakeAndExtract) {
+    auto id = pto2_make_task_id(2, 42);
+    EXPECT_EQ(id.ring(), 2);
+    EXPECT_EQ(id.local(), 42u);
+}
+
+TEST(TaskId, RingInUpperBits) {
+    auto id = pto2_make_task_id(3, 0);
+    EXPECT_EQ(id.raw, static_cast<uint64_t>(3) << 32);
+    EXPECT_EQ(id.ring(), 3);
+    EXPECT_EQ(id.local(), 0u);
+}
+
+TEST(TaskId, MaxRingMaxLocal) {
+    auto id = pto2_make_task_id(255, 0xFFFFFFFF);
+    EXPECT_EQ(id.ring(), 255);
+    EXPECT_EQ(id.local(), 0xFFFFFFFF);
+}
+
+TEST(TaskId, Roundtrip) {
+    for (uint8_t ring = 0; ring < PTO2_MAX_RING_DEPTH; ring++) {
+        for (uint32_t local : {0u, 1u, 100u, 0xFFFFu, 0xFFFFFFFFu}) {
+            auto id = pto2_make_task_id(ring, local);
+            EXPECT_EQ(id.ring(), ring);
+            EXPECT_EQ(id.local(), local);
+        }
+    }
+}
+
+TEST(TaskId, Equality) {
+    auto a = pto2_make_task_id(1, 42);
+    auto b = pto2_make_task_id(1, 42);
+    auto c = pto2_make_task_id(1, 43);
+    auto d = pto2_make_task_id(2, 42);
+
+    EXPECT_TRUE(a == b);
+    EXPECT_FALSE(a != b);
+    EXPECT_TRUE(a != c);
+    EXPECT_TRUE(a != d);
+}
+
+TEST(TaskId, SizeIs8Bytes) {
+    EXPECT_EQ(sizeof(PTO2TaskId), 8u);
+}
+
+// =============================================================================
+// PTO2TaskSlotState size (cache-line aligned)
+// =============================================================================
+
+TEST(TaskSlotState, SizeIs64Bytes) {
+    EXPECT_EQ(sizeof(PTO2TaskSlotState), 64u);
+}
+
+// =============================================================================
+// PTO2_ALIGN_UP macro
+// =============================================================================
+
+TEST(AlignUp, Zero) {
+    EXPECT_EQ(PTO2_ALIGN_UP(0, 64), 0u);
+}
+
+TEST(AlignUp, AlreadyAligned) {
+    EXPECT_EQ(PTO2_ALIGN_UP(64, 64), 64u);
+    EXPECT_EQ(PTO2_ALIGN_UP(128, 64), 128u);
+}
+
+TEST(AlignUp, NotAligned) {
+    EXPECT_EQ(PTO2_ALIGN_UP(1, 64), 64u);
+    EXPECT_EQ(PTO2_ALIGN_UP(63, 64), 64u);
+    EXPECT_EQ(PTO2_ALIGN_UP(65, 64), 128u);
+}
+
+TEST(AlignUp, SmallAlignment) {
+    EXPECT_EQ(PTO2_ALIGN_UP(5, 4), 8u);
+    EXPECT_EQ(PTO2_ALIGN_UP(4, 4), 4u);
+    EXPECT_EQ(PTO2_ALIGN_UP(3, 4), 4u);
+}
+
+// =============================================================================
+// Task state enum values
+// =============================================================================
+
+TEST(TaskState, EnumValues) {
+    EXPECT_EQ(PTO2_TASK_PENDING, 0);
+    EXPECT_EQ(PTO2_TASK_READY, 1);
+    EXPECT_EQ(PTO2_TASK_RUNNING, 2);
+    EXPECT_EQ(PTO2_TASK_COMPLETED, 3);
+    EXPECT_EQ(PTO2_TASK_CONSUMED, 4);
+}
+
+// =============================================================================
+// Error code constants
+// =============================================================================
+
+TEST(ErrorCodes, Values) {
+    EXPECT_EQ(PTO2_ERROR_NONE, 0);
+    EXPECT_EQ(PTO2_ERROR_SCOPE_DEADLOCK, 1);
+    EXPECT_EQ(PTO2_ERROR_HEAP_RING_DEADLOCK, 2);
+    EXPECT_EQ(PTO2_ERROR_FLOW_CONTROL_DEADLOCK, 3);
+    EXPECT_EQ(PTO2_ERROR_DEP_POOL_OVERFLOW, 4);
+    EXPECT_EQ(PTO2_ERROR_INVALID_ARGS, 5);
+    EXPECT_EQ(PTO2_ERROR_SCHEDULER_TIMEOUT, 100);
+}
+
+// =============================================================================
+// Configuration constants
+// =============================================================================
+
+TEST(Config, TaskWindowSizeIsPowerOf2) {
+    EXPECT_GT(PTO2_TASK_WINDOW_SIZE, 0);
+    EXPECT_EQ(PTO2_TASK_WINDOW_SIZE & (PTO2_TASK_WINDOW_SIZE - 1), 0);
+}
+
+TEST(Config, MaxRingDepth) {
+    EXPECT_EQ(PTO2_MAX_RING_DEPTH, 4);
+}
+
+TEST(Config, AlignSize) {
+    EXPECT_EQ(PTO2_ALIGN_SIZE, 64);
+}
diff --git a/tests/ut/cpp/test_dispatch_payload.cpp b/tests/ut/cpp/test_dispatch_payload.cpp
new file mode 100644
index 000000000..2a402d485
--- /dev/null
+++ b/tests/ut/cpp/test_dispatch_payload.cpp
@@ -0,0 +1,144 @@
+/**
+ * Unit tests for PTO2DispatchPayload and SPMD context structures.
+ *
+ * Tests layout constants, alignment, static_assert consistency, and the
+ * get_block_idx / get_block_num / get_sub_block_id intrinsic accessors.
+ */
+
+#include <cstdint>
+
+#include <gtest/gtest.h>
+
+#include "intrinsic.h"
+#include "pto2_dispatch_payload.h"
+#include "pto_types.h"
+
+// =============================================================================
+// Compile-time constant consistency
+// =============================================================================
+
+TEST(DispatchPayloadConstants, LocalContextIndex) {
+    // SPMD_LOCAL_CONTEXT_INDEX must equal MAX_TENSOR_ARGS + MAX_SCALAR_ARGS
+    EXPECT_EQ(SPMD_LOCAL_CONTEXT_INDEX, MAX_TENSOR_ARGS + MAX_SCALAR_ARGS);
+}
+
+TEST(DispatchPayloadConstants, GlobalContextIndex) {
+    EXPECT_EQ(SPMD_GLOBAL_CONTEXT_INDEX, SPMD_LOCAL_CONTEXT_INDEX + 1);
+}
+
+TEST(DispatchPayloadConstants, ExtParamsCount) {
+    EXPECT_EQ(PTO2_EXT_PARAMS_COUNT, 2);
+}
+
+TEST(DispatchPayloadConstants, DispatchMaxArgs) {
+    EXPECT_EQ(PTO2_DISPATCH_MAX_ARGS,
+              MAX_TENSOR_ARGS + MAX_SCALAR_ARGS + PTO2_EXT_PARAMS_COUNT);
+}
+
+// =============================================================================
+// PTO2DispatchPayload layout and alignment
+// =============================================================================
+
+TEST(DispatchPayloadLayout, IsAlignedTo64Bytes) {
+    EXPECT_EQ(alignof(PTO2DispatchPayload), 64u);
+}
+
+TEST(DispatchPayloadLayout, ArgsArrayHasCorrectSize) {
+    PTO2DispatchPayload p{};
+    EXPECT_EQ(sizeof(p.args) / sizeof(p.args[0]),
+              static_cast<size_t>(PTO2_DISPATCH_MAX_ARGS));
+}
+
+TEST(DispatchPayloadLayout, ArgElementIs8Bytes) {
+    PTO2DispatchPayload p{};
+    EXPECT_EQ(sizeof(p.args[0]), 8u);
+}
+
+// =============================================================================
+// LocalContext
+// =============================================================================
+
+TEST(LocalContext, FieldsReadWrite) {
+    LocalContext lctx{3, 8};
+    EXPECT_EQ(lctx.block_idx, 3);
+    EXPECT_EQ(lctx.block_num, 8);
+}
+
+TEST(LocalContext, DefaultZero) {
+    LocalContext lctx{};
+    EXPECT_EQ(lctx.block_idx, 0);
+    EXPECT_EQ(lctx.block_num, 0);
+}
+
+// =============================================================================
+// GlobalContext
+// =============================================================================
+
+TEST(GlobalContext, FieldReadWrite) {
+    GlobalContext gctx{1};
+    EXPECT_EQ(gctx.sub_block_id, 1);
+}
+
+// =============================================================================
+// Intrinsic accessor functions
+// =============================================================================
+
+// Build a minimal args[] array with context pointers at the correct indices.
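+// Layout assumed here (it mirrors the constants checked above): slots
+// [0, MAX_TENSOR_ARGS) carry tensor pointers, the next MAX_SCALAR_ARGS slots
+// carry scalars, then args[SPMD_LOCAL_CONTEXT_INDEX] holds a LocalContext*
+// and args[SPMD_GLOBAL_CONTEXT_INDEX] a GlobalContext*, each stored as a
+// uint64_t.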
+struct IntrinsicTestSetup {
+    static constexpr int kArgsLen = SPMD_GLOBAL_CONTEXT_INDEX + 1;
+    LocalContext lctx;
+    GlobalContext gctx;
+    uint64_t args[kArgsLen];
+
+    IntrinsicTestSetup(int block_idx, int block_num, int sub_block_id)
+        : lctx{block_idx, block_num}, gctx{sub_block_id} {
+        for (auto& a : args) a = 0;
+        args[SPMD_LOCAL_CONTEXT_INDEX] = reinterpret_cast<uint64_t>(&lctx);
+        args[SPMD_GLOBAL_CONTEXT_INDEX] = reinterpret_cast<uint64_t>(&gctx);
+    }
+
+    int64_t* raw() { return reinterpret_cast<int64_t*>(args); }
+};
+
+TEST(IntrinsicAccessors, GetBlockIdx) {
+    IntrinsicTestSetup s(5, 10, 0);
+    EXPECT_EQ(get_block_idx(s.raw()), 5);
+}
+
+TEST(IntrinsicAccessors, GetBlockNum) {
+    IntrinsicTestSetup s(0, 7, 0);
+    EXPECT_EQ(get_block_num(s.raw()), 7);
+}
+
+TEST(IntrinsicAccessors, GetSubBlockId_AIV0) {
+    IntrinsicTestSetup s(0, 1, 0);
+    EXPECT_EQ(get_sub_block_id(s.raw()), 0);
+}
+
+TEST(IntrinsicAccessors, GetSubBlockId_AIV1) {
+    IntrinsicTestSetup s(0, 1, 1);
+    EXPECT_EQ(get_sub_block_id(s.raw()), 1);
+}
+
+TEST(IntrinsicAccessors, BlockIdxAndNumIndependent) {
+    // Changing block_idx must not affect block_num and vice versa
+    IntrinsicTestSetup s(2, 4, 0);
+    EXPECT_EQ(get_block_idx(s.raw()), 2);
+    EXPECT_EQ(get_block_num(s.raw()), 4);
+
+    s.lctx.block_idx = 3;
+    EXPECT_EQ(get_block_idx(s.raw()), 3);
+    EXPECT_EQ(get_block_num(s.raw()), 4);
+}
+
+TEST(IntrinsicAccessors, ContextPointersAreAtCorrectSlots) {
+    IntrinsicTestSetup s(1, 2, 0);
+    // The value at SPMD_LOCAL_CONTEXT_INDEX must point to lctx
+    auto lctx_ptr = reinterpret_cast<LocalContext*>(
+        static_cast<uintptr_t>(s.args[SPMD_LOCAL_CONTEXT_INDEX]));
+    EXPECT_EQ(lctx_ptr, &s.lctx);
+
+    auto gctx_ptr = reinterpret_cast<GlobalContext*>(
+        static_cast<uintptr_t>(s.args[SPMD_GLOBAL_CONTEXT_INDEX]));
+    EXPECT_EQ(gctx_ptr, &s.gctx);
+}
diff --git a/tests/ut/cpp/test_pto_types.cpp b/tests/ut/cpp/test_pto_types.cpp
new file mode 100644
index 000000000..c053c1b1e
--- /dev/null
+++ b/tests/ut/cpp/test_pto_types.cpp
@@ -0,0 +1,397 @@
+/**
+ * Unit tests for Arg and TaskOutputTensors from pto_types.h.
+ *
+ * Tests argument ordering enforcement, tensor/scalar storage,
+ * error propagation, add_scalars_i32 zero-extension, copy_scalars_from,
+ * and TaskOutputTensors materialization.
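+ *
+ * Calling convention exercised below (sketch, using only the Arg methods
+ * under test): all tensor arguments (add_input / add_output / add_inout)
+ * must be recorded before the first scalar; violating that order sets
+ * has_error instead of aborting.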
+ */
+
+#include <cstdint>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include "common.h"
+#include "pto_orchestration_api.h"
+#include "pto_types.h"
+
+// =============================================================================
+// Helpers
+// =============================================================================
+
+static Tensor make_test_tensor(void* buf) {
+    uint32_t shapes[2] = {4, 8};
+    return make_tensor_external(buf, shapes, 2, DataType::FLOAT32);
+}
+
+// =============================================================================
+// TaskOutputTensors
+// =============================================================================
+
+TEST(TaskOutputTensors, InitialState) {
+    TaskOutputTensors out;
+    EXPECT_TRUE(out.empty());
+    EXPECT_EQ(out.size(), 0u);
+}
+
+TEST(TaskOutputTensors, MaterializeAddsOne) {
+    float buf[4] = {};
+    Tensor t = make_test_tensor(buf);
+
+    TaskOutputTensors out;
+    out.materialize_output(t);
+
+    EXPECT_FALSE(out.empty());
+    EXPECT_EQ(out.size(), 1u);
+}
+
+TEST(TaskOutputTensors, GetRefReturnsCorrectTensor) {
+    float buf0[4] = {};
+    float buf1[4] = {};
+    Tensor t0 = make_test_tensor(buf0);
+    Tensor t1 = make_test_tensor(buf1);
+
+    TaskOutputTensors out;
+    out.materialize_output(t0);
+    out.materialize_output(t1);
+
+    EXPECT_EQ(&out.get_ref(0), &t0);
+    EXPECT_EQ(&out.get_ref(1), &t1);
+    EXPECT_EQ(out.size(), 2u);
+}
+
+TEST(TaskOutputTensors, GetRefOutOfRangeThrows) {
+    TaskOutputTensors out;
+    EXPECT_THROW(out.get_ref(0), AssertionError);
+}
+
+TEST(TaskOutputTensors, MaxOutputsFill) {
+    float bufs[PTO2_MAX_OUTPUTS] = {};
+    std::vector<Tensor> tensors;
+    tensors.reserve(PTO2_MAX_OUTPUTS);
+
+    TaskOutputTensors out;
+    for (int i = 0; i < PTO2_MAX_OUTPUTS; i++) {
+        tensors.push_back(make_test_tensor(&bufs[i]));
+        out.materialize_output(tensors.back());
+    }
+    EXPECT_EQ(out.size(), static_cast<size_t>(PTO2_MAX_OUTPUTS));
+}
+
+// =============================================================================
+// Arg — initial state
+// =============================================================================
+
+TEST(Arg, DefaultState) {
+    Arg a;
+    EXPECT_FALSE(a.has_error);
+    EXPECT_EQ(a.error_msg, nullptr);
+    EXPECT_EQ(a.tensor_count(), 0);
+    EXPECT_EQ(a.scalar_count(), 0);
+}
+
+// =============================================================================
+// Arg — add_input / add_output / add_inout
+// =============================================================================
+
+TEST(Arg, AddInput) {
+    float buf[4] = {};
+    Tensor t = make_test_tensor(buf);
+    Arg a;
+    a.add_input(t);
+    EXPECT_EQ(a.tensor_count(), 1);
+    EXPECT_EQ(a.tag(0), TensorArgType::INPUT);
+    EXPECT_EQ(a.tensor(0).ptr, &t);
+    EXPECT_FALSE(a.has_error);
+}
+
+TEST(Arg, AddOutput) {
+    uint32_t shapes[2] = {4, 8};
+    TensorCreateInfo ci(shapes, 2, DataType::FLOAT32);
+    Arg a;
+    a.add_output(ci);
+    EXPECT_EQ(a.tensor_count(), 1);
+    EXPECT_EQ(a.tag(0), TensorArgType::OUTPUT);
+    EXPECT_EQ(a.tensor(0).create_info, &ci);
+    EXPECT_FALSE(a.has_error);
+}
+
+TEST(Arg, AddInout) {
+    float buf[4] = {};
+    Tensor t = make_test_tensor(buf);
+    Arg a;
+    a.add_inout(t);
+    EXPECT_EQ(a.tensor_count(), 1);
+    EXPECT_EQ(a.tag(0), TensorArgType::INOUT);
+    EXPECT_EQ(a.tensor(0).ptr, &t);
+    EXPECT_FALSE(a.has_error);
+}
+
+TEST(Arg, MixedInputOutputInout) {
+    float buf_in[4] = {}, buf_inout[4] = {};
+    Tensor tin = make_test_tensor(buf_in);
+    Tensor tinout = make_test_tensor(buf_inout);
+    uint32_t shapes_in[2] = {4, 8};
+    TensorCreateInfo ci(shapes_in, 1, DataType::FLOAT32);
+
+    Arg a;
+    a.add_input(tin);
+    a.add_output(ci);
+    a.add_inout(tinout);
+
+    EXPECT_EQ(a.tensor_count(), 3);
+    EXPECT_EQ(a.tag(0), TensorArgType::INPUT);
+    EXPECT_EQ(a.tag(1), TensorArgType::OUTPUT);
+    EXPECT_EQ(a.tag(2), TensorArgType::INOUT);
+    EXPECT_FALSE(a.has_error);
+}
+
+// =============================================================================
+// Arg — ordering enforcement: tensor after scalar sets error
+// =============================================================================
+
+TEST(Arg, TensorAfterScalarSetsError) {
+    float buf[4] = {};
+    Tensor t = make_test_tensor(buf);
+    Arg a;
+    a.add_scalar(uint64_t(42));
+    a.add_input(t);  // invalid: tensor after scalar
+    EXPECT_TRUE(a.has_error);
+    EXPECT_NE(a.error_msg, nullptr);
+    // The scalar was recorded, the tensor was not
+    EXPECT_EQ(a.tensor_count(), 0);
+    EXPECT_EQ(a.scalar_count(), 1);
+}
+
+TEST(Arg, OutputAfterScalarSetsError) {
+    uint32_t shapes_in[2] = {4, 8};
+    TensorCreateInfo ci(shapes_in, 1, DataType::FLOAT32);
+    Arg a;
+    a.add_scalar(uint64_t(1));
+    a.add_output(ci);
+    EXPECT_TRUE(a.has_error);
+    EXPECT_EQ(a.tensor_count(), 0);
+}
+
+TEST(Arg, InoutAfterScalarSetsError) {
+    float buf[4] = {};
+    Tensor t = make_test_tensor(buf);
+    Arg a;
+    a.add_scalar(uint64_t(1));
+    a.add_inout(t);
+    EXPECT_TRUE(a.has_error);
+    EXPECT_EQ(a.tensor_count(), 0);
+}
+
+// =============================================================================
+// Arg — capacity limits
+// =============================================================================
+
+TEST(Arg, TensorCapacityExceeded) {
+    Arg a;
+    for (int i = 0; i < MAX_TENSOR_ARGS; i++) {
+        float dummy = 0.0f;
+        Tensor t = make_test_tensor(&dummy);
+        a.add_input(t);
+        ASSERT_FALSE(a.has_error) << "Failed at tensor " << i;
+    }
+    // One more should trigger the error
+    float extra = 0.0f;
+    Tensor t_extra = make_test_tensor(&extra);
+    a.add_input(t_extra);
+    EXPECT_TRUE(a.has_error);
+    EXPECT_EQ(a.tensor_count(), MAX_TENSOR_ARGS);
+}
+
+TEST(Arg, ScalarCapacityExceeded) {
+    Arg a;
+    for (int i = 0; i < MAX_SCALAR_ARGS; i++) {
+        a.add_scalar(static_cast<uint64_t>(i));
+        ASSERT_FALSE(a.has_error) << "Failed at scalar " << i;
+    }
+    a.add_scalar(uint64_t(999));
+    EXPECT_TRUE(a.has_error);
+    EXPECT_EQ(a.scalar_count(), MAX_SCALAR_ARGS);
+}
+
+// =============================================================================
+// Arg — add_scalar with various types
+// =============================================================================
+
+TEST(Arg, AddScalarUint64) {
+    Arg a;
+    a.add_scalar(uint64_t(0xDEADBEEFCAFEBABEULL));
+    EXPECT_EQ(a.scalar_count(), 1);
+    EXPECT_EQ(a.scalar(0), 0xDEADBEEFCAFEBABEULL);
+    EXPECT_FALSE(a.has_error);
+}
+
+TEST(Arg, AddScalarFloat) {
+    Arg a;
+    float v = 3.14f;
+    a.add_scalar(v);
+    EXPECT_EQ(a.scalar_count(), 1);
+    EXPECT_EQ(a.scalar(0), to_u64(v));
+    EXPECT_FALSE(a.has_error);
+}
+
+TEST(Arg, AddScalarInt32) {
+    Arg a;
+    int32_t v = -7;
+    a.add_scalar(v);
+    EXPECT_EQ(a.scalar_count(), 1);
+    EXPECT_EQ(a.scalar(0), to_u64(v));
+    EXPECT_FALSE(a.has_error);
+}
+
+// =============================================================================
+// Arg — add_scalars (batch uint64)
+// =============================================================================
+
+TEST(Arg, AddScalarsBatch) {
+    Arg a;
+    uint64_t vals[3] = {10, 20, 30};
+    a.add_scalars(vals, 3);
+    EXPECT_EQ(a.scalar_count(), 3);
+    EXPECT_EQ(a.scalar(0), 10u);
+    EXPECT_EQ(a.scalar(1), 20u);
+    EXPECT_EQ(a.scalar(2), 30u);
+    EXPECT_FALSE(a.has_error);
+}
+
+TEST(Arg, AddScalarsBatchOverCapacitySetsError) {
+    Arg a;
+    // Fill to capacity minus 1
+    for (int i = 0; i < MAX_SCALAR_ARGS - 1; i++) {
+        a.add_scalar(uint64_t(i));
+    }
+    // Batch of 3 would overflow by 2
+    uint64_t vals[3] = {1, 2, 3};
+    a.add_scalars(vals, 3);
+    EXPECT_TRUE(a.has_error);
+}
+
+// =============================================================================
+// Arg — add_scalars_i32 (zero-extension)
+// =============================================================================
+
+TEST(Arg, AddScalarsI32ZeroExtends) {
+    Arg a;
+    int32_t vals[4] = {0, 1, -1, 0x7FFFFFFF};
+    a.add_scalars_i32(vals, 4);
+    EXPECT_EQ(a.scalar_count(), 4);
+    EXPECT_EQ(a.scalar(0), uint64_t(0));
+    EXPECT_EQ(a.scalar(1), uint64_t(1));
+    // -1 as int32 is 0xFFFFFFFF; zero-extended to uint64 is 0x00000000FFFFFFFF
+    EXPECT_EQ(a.scalar(2), uint64_t(0x00000000FFFFFFFFull));
+    EXPECT_EQ(a.scalar(3), uint64_t(0x000000007FFFFFFFull));
+    EXPECT_FALSE(a.has_error);
+}
+
+TEST(Arg, AddScalarsI32NegativeValues) {
+    Arg a;
+    int32_t vals[2] = {-1, -2};
+    a.add_scalars_i32(vals, 2);
+    // -1 -> 0xFFFFFFFF zero-extended -> 0x00000000FFFFFFFF
+    // -2 -> 0xFFFFFFFE zero-extended -> 0x00000000FFFFFFFE
+    EXPECT_EQ(a.scalar(0), uint64_t(0xFFFFFFFFull));
+    EXPECT_EQ(a.scalar(1), uint64_t(0xFFFFFFFEull));
+}
+
+TEST(Arg, AddScalarsI32SingleElement) {
+    Arg a;
+    int32_t v = 42;
+    a.add_scalars_i32(&v, 1);
+    EXPECT_EQ(a.scalar_count(), 1);
+    EXPECT_EQ(a.scalar(0), uint64_t(42));
+}
+
+TEST(Arg, AddScalarsI32OverCapacitySetsError) {
+    Arg a;
+    for (int i = 0; i < MAX_SCALAR_ARGS - 1; i++) {
+        a.add_scalar(uint64_t(i));
+    }
+    int32_t vals[3] = {1, 2, 3};
+    a.add_scalars_i32(vals, 3);
+    EXPECT_TRUE(a.has_error);
+}
+
+// =============================================================================
+// Arg — copy_scalars_from
+// =============================================================================
+
+TEST(Arg, CopyScalarsFrom) {
+    Arg src;
+    src.add_scalar(uint64_t(10));
+    src.add_scalar(uint64_t(20));
+    src.add_scalar(uint64_t(30));
+
+    Arg dst;
+    dst.copy_scalars_from(src, 1, 2);  // copy scalars[1..2] = {20, 30}
+    EXPECT_EQ(dst.scalar_count(), 2);
+    EXPECT_EQ(dst.scalar(0), uint64_t(20));
+    EXPECT_EQ(dst.scalar(1), uint64_t(30));
+    EXPECT_FALSE(dst.has_error);
+}
+
+TEST(Arg, CopyScalarsFromOutOfBoundsSetsError) {
+    Arg src;
+    src.add_scalar(uint64_t(1));
+
+    Arg dst;
+    dst.copy_scalars_from(src, 0, 5);  // only 1 scalar available, request 5
+    EXPECT_TRUE(dst.has_error);
+}
+
+TEST(Arg, CopyScalarsFromFull) {
+    Arg src;
+    for (int i = 0; i < MAX_SCALAR_ARGS; i++) {
+        src.add_scalar(static_cast<uint64_t>(i));
+    }
+    Arg dst;
+    for (int i = 0; i < MAX_SCALAR_ARGS - 1; i++) {
+        dst.add_scalar(uint64_t(0));
+    }
+    // dst has MAX-1 scalars; copying 2 from src would overflow
+    dst.copy_scalars_from(src, 0, 2);
+    EXPECT_TRUE(dst.has_error);
+}
+
+// =============================================================================
+// Arg — reset clears all state
+// =============================================================================
+
+TEST(Arg, ResetClearsAll) {
+    float buf[4] = {};
+    Tensor t = make_test_tensor(buf);
+    Arg a;
+    a.add_input(t);
+    a.add_scalar(uint64_t(99));
+    a.set_error("deliberate error");
+
+    a.reset();
+    EXPECT_EQ(a.tensor_count(), 0);
+    EXPECT_EQ(a.scalar_count(), 0);
+    EXPECT_FALSE(a.has_error);
+    EXPECT_EQ(a.error_msg, nullptr);
+}
+
+// =============================================================================
+// Arg — set_error is idempotent (first error wins)
+// =============================================================================
+
+TEST(Arg, SetErrorFirstWins) {
+    Arg a;
+    a.set_error("first");
+    a.set_error("second");
+    EXPECT_STREQ(a.error_msg, "first");
+}
+
+// =============================================================================
+// Arg — launch_spec default
+// =============================================================================
+
+TEST(Arg, LaunchSpecDefaultBlockNum) {
+    Arg a;
+    EXPECT_EQ(a.launch_spec.block_num(), 1);
+}
diff --git a/tests/ut/cpp/test_ready_queue.cpp b/tests/ut/cpp/test_ready_queue.cpp
new file mode 100644
index 000000000..0c6b0fb86
--- /dev/null
+++ b/tests/ut/cpp/test_ready_queue.cpp
@@ -0,0 +1,398 @@
+/**
+ * Unit tests for PTO2ReadyQueue and PTO2LocalReadyBuffer from pto_scheduler.h
+ *
+ * Tests the lock-free bounded MPMC queue (Vyukov design) and the thread-local
+ * ready buffer used for local-first dispatch optimization.
+ */
+
+#include <gtest/gtest.h>
+
+#include <atomic>
+#include <cstdint>
+#include <functional>
+#include <thread>
+#include <vector>
+
+#include "pto_scheduler.h"
+
+// =============================================================================
+// ReadyQueue: Single-threaded tests
+// =============================================================================
+
+class ReadyQueueTest : public ::testing::Test {
+protected:
+    static constexpr uint64_t kCapacity = 16;  // Power of 2
+
+    PTO2ReadyQueue queue;
+
+    void SetUp() override {
+        ASSERT_TRUE(pto2_ready_queue_init(&queue, kCapacity));
+    }
+
+    void TearDown() override {
+        pto2_ready_queue_destroy(&queue);
+    }
+};
+
+// 1. Empty pop returns nullptr
+TEST_F(ReadyQueueTest, EmptyPopReturnsNullptr) {
+    EXPECT_EQ(queue.pop(), nullptr);
+}
+
+// 2. Single push/pop returns correct item
+TEST_F(ReadyQueueTest, SinglePushPop) {
+    PTO2TaskSlotState item;
+    ASSERT_TRUE(queue.push(&item));
+
+    PTO2TaskSlotState* result = queue.pop();
+    EXPECT_EQ(result, &item);
+}
+
+// 3. FIFO ordering: push A,B,C then pop A,B,C
+TEST_F(ReadyQueueTest, FIFOOrdering) {
+    PTO2TaskSlotState a, b, c;
+
+    ASSERT_TRUE(queue.push(&a));
+    ASSERT_TRUE(queue.push(&b));
+    ASSERT_TRUE(queue.push(&c));
+
+    EXPECT_EQ(queue.pop(), &a);
+    EXPECT_EQ(queue.pop(), &b);
+    EXPECT_EQ(queue.pop(), &c);
+    EXPECT_EQ(queue.pop(), nullptr);
+}
+
+// 4. Queue full: push returns false at capacity
+TEST_F(ReadyQueueTest, QueueFullReturnsFalse) {
+    std::vector<PTO2TaskSlotState> items(kCapacity);
+
+    for (uint64_t i = 0; i < kCapacity; i++) {
+        ASSERT_TRUE(queue.push(&items[i]));
+    }
+
+    PTO2TaskSlotState extra;
+    EXPECT_FALSE(queue.push(&extra));
+}
+
+// 5. Slot reuse after full drain (push/pop cycle)
+TEST_F(ReadyQueueTest, SlotReuseAfterFullDrain) {
+    std::vector<PTO2TaskSlotState> items(kCapacity);
+
+    // Fill the queue
+    for (uint64_t i = 0; i < kCapacity; i++) {
+        ASSERT_TRUE(queue.push(&items[i]));
+    }
+
+    // Drain the queue
+    for (uint64_t i = 0; i < kCapacity; i++) {
+        EXPECT_EQ(queue.pop(), &items[i]);
+    }
+    EXPECT_EQ(queue.pop(), nullptr);
+
+    // Refill and re-drain to verify slot reuse
+    for (uint64_t i = 0; i < kCapacity; i++) {
+        ASSERT_TRUE(queue.push(&items[i]));
+    }
+    for (uint64_t i = 0; i < kCapacity; i++) {
+        EXPECT_EQ(queue.pop(), &items[i]);
+    }
+    EXPECT_EQ(queue.pop(), nullptr);
+}
+
+// 6. push_batch: batch enqueue then individual dequeue
+TEST_F(ReadyQueueTest, PushBatchThenIndividualPop) {
+    constexpr int kBatchSize = 5;
+    PTO2TaskSlotState items[kBatchSize];
+    PTO2TaskSlotState* ptrs[kBatchSize];
+    for (int i = 0; i < kBatchSize; i++) {
+        ptrs[i] = &items[i];
+    }
+
+    queue.push_batch(ptrs, kBatchSize);
+
+    for (int i = 0; i < kBatchSize; i++) {
+        EXPECT_EQ(queue.pop(), &items[i]);
+    }
+    EXPECT_EQ(queue.pop(), nullptr);
+}
+
+// 7. push_batch count=0: no-op
+TEST_F(ReadyQueueTest, PushBatchZeroIsNoop) {
+    queue.push_batch(nullptr, 0);
+
+    EXPECT_EQ(queue.size(), 0u);
+    EXPECT_EQ(queue.pop(), nullptr);
+}
+
+// 8. pop_batch: push 10, pop_batch(5) returns 5
+TEST_F(ReadyQueueTest, PopBatchReturnsFive) {
+    constexpr int kPushCount = 10;
+    PTO2TaskSlotState items[kPushCount];
+
+    for (int i = 0; i < kPushCount; i++) {
+        ASSERT_TRUE(queue.push(&items[i]));
+    }
+
+    PTO2TaskSlotState* out[5];
+    int popped = queue.pop_batch(out, 5);
+    EXPECT_EQ(popped, 5);
+
+    for (int i = 0; i < 5; i++) {
+        EXPECT_EQ(out[i], &items[i]);
+    }
+}
+
+// 9. pop_batch partial: push 3, pop_batch(5) returns 3
+TEST_F(ReadyQueueTest, PopBatchPartial) {
+    constexpr int kPushCount = 3;
+    PTO2TaskSlotState items[kPushCount];
+
+    for (int i = 0; i < kPushCount; i++) {
+        ASSERT_TRUE(queue.push(&items[i]));
+    }
+
+    PTO2TaskSlotState* out[5];
+    int popped = queue.pop_batch(out, 5);
+    EXPECT_EQ(popped, kPushCount);
+
+    for (int i = 0; i < kPushCount; i++) {
+        EXPECT_EQ(out[i], &items[i]);
+    }
+}
+
+// 10. pop_batch empty: returns 0
+TEST_F(ReadyQueueTest, PopBatchEmpty) {
+    PTO2TaskSlotState* out[5];
+    int popped = queue.pop_batch(out, 5);
+    EXPECT_EQ(popped, 0);
+}
+
+// 11. size() accuracy after various push/pop
+TEST_F(ReadyQueueTest, SizeAccuracy) {
+    EXPECT_EQ(queue.size(), 0u);
+
+    PTO2TaskSlotState items[8];
+
+    queue.push(&items[0]);
+    EXPECT_EQ(queue.size(), 1u);
+
+    queue.push(&items[1]);
+    queue.push(&items[2]);
+    EXPECT_EQ(queue.size(), 3u);
+
+    queue.pop();
+    EXPECT_EQ(queue.size(), 2u);
+
+    queue.pop();
+    queue.pop();
+    EXPECT_EQ(queue.size(), 0u);
+
+    // Push 5 more
+    for (int i = 0; i < 5; i++) {
+        queue.push(&items[i]);
+    }
+    EXPECT_EQ(queue.size(), 5u);
+}
+
+// =============================================================================
+// ReadyQueue: Multi-threaded tests
+// =============================================================================
+
+class ReadyQueueMTTest : public ::testing::Test {
+protected:
+    static constexpr uint64_t kCapacity = 1024;  // Power of 2
+
+    PTO2ReadyQueue queue;
+
+    void SetUp() override {
+        ASSERT_TRUE(pto2_ready_queue_init(&queue, kCapacity));
+    }
+
+    void TearDown() override {
+        pto2_ready_queue_destroy(&queue);
+    }
+};
+
+// 12. 2 producers / 2 consumers: all items consumed exactly once
+TEST_F(ReadyQueueMTTest, TwoProducersTwoConsumers) {
+    constexpr int kItemsPerProducer = 200;
+    constexpr int kTotalItems = kItemsPerProducer * 2;
+
+    std::vector<PTO2TaskSlotState> items(kTotalItems);
+    // Give each item a unique counter value via its fanin_count field
+    for (int i = 0; i < kTotalItems; i++) {
+        items[i].fanin_count = i;
+    }
+
+    std::atomic<int> produced{0};
+    std::atomic<bool> producers_done{false};
+
+    // Tracking: atomic counter per item to verify exactly-once consumption
+    std::vector<std::atomic<int>> consumed_count(kTotalItems);
+    for (int i = 0; i < kTotalItems; i++) {
+        consumed_count[i].store(0, std::memory_order_relaxed);
+    }
+
+    auto producer = [&](int offset) {
+        for (int i = 0; i < kItemsPerProducer; i++) {
+            while (!queue.push(&items[offset + i])) {
+                // Queue full, retry
+            }
+        }
+        produced.fetch_add(kItemsPerProducer, std::memory_order_release);
+    };
+
+    auto consumer = [&](std::vector<PTO2TaskSlotState*>& results) {
+        while (true) {
+            PTO2TaskSlotState* item = queue.pop();
+            if (item != nullptr) {
+                results.push_back(item);
+                consumed_count[item->fanin_count].fetch_add(1, std::memory_order_relaxed);
+            } else if (producers_done.load(std::memory_order_acquire)) {
+                // Drain remaining
+                while ((item = queue.pop()) != nullptr) {
+                    results.push_back(item);
+                    consumed_count[item->fanin_count].fetch_add(1, std::memory_order_relaxed);
+                }
+                break;
+            }
+        }
+    };
+
+    std::vector<PTO2TaskSlotState*> results_c1, results_c2;
+    std::thread p1(producer, 0);
+    std::thread p2(producer, kItemsPerProducer);
+    std::thread c1(consumer, std::ref(results_c1));
+    std::thread c2(consumer, std::ref(results_c2));
+
+    p1.join();
+    p2.join();
+    producers_done.store(true, std::memory_order_release);
+    c1.join();
+    c2.join();
+
+    // Verify all items consumed exactly once
+    int total_consumed = static_cast<int>(results_c1.size() + results_c2.size());
+    EXPECT_EQ(total_consumed, kTotalItems);
+
+    for (int i = 0; i < kTotalItems; i++) {
+        EXPECT_EQ(consumed_count[i].load(), 1)
+            << "Item " << i << " consumed "
+            << consumed_count[i].load() << " times (expected 1)";
+    }
+}
+
+// 13. 1 producer / N consumers: all items consumed exactly once
+TEST_F(ReadyQueueMTTest, OneProducerNConsumers) {
+    constexpr int kTotalItems = 500;
+    constexpr int kNumConsumers = 4;
+
+    std::vector<PTO2TaskSlotState> items(kTotalItems);
+    for (int i = 0; i < kTotalItems; i++) {
+        items[i].fanin_count = i;
+    }
+
+    std::atomic<bool> producer_done{false};
+    std::vector<std::atomic<int>> consumed_count(kTotalItems);
+    for (int i = 0; i < kTotalItems; i++) {
+        consumed_count[i].store(0, std::memory_order_relaxed);
+    }
+
+    auto producer = [&]() {
+        for (int i = 0; i < kTotalItems; i++) {
+            while (!queue.push(&items[i])) {
+                // Queue full, retry
+            }
+        }
+        producer_done.store(true, std::memory_order_release);
+    };
+
+    std::atomic<int> total_consumed{0};
+
+    auto consumer = [&]() {
+        while (true) {
+            PTO2TaskSlotState* item = queue.pop();
+            if (item != nullptr) {
+                consumed_count[item->fanin_count].fetch_add(1, std::memory_order_relaxed);
+                total_consumed.fetch_add(1, std::memory_order_relaxed);
+            } else if (producer_done.load(std::memory_order_acquire)) {
+                // Drain remaining
+                while ((item = queue.pop()) != nullptr) {
+                    consumed_count[item->fanin_count].fetch_add(1, std::memory_order_relaxed);
+                    total_consumed.fetch_add(1, std::memory_order_relaxed);
+                }
+                break;
+            }
+        }
+    };
+
+    std::thread prod(producer);
+    std::vector<std::thread> consumers;
+    for (int i = 0; i < kNumConsumers; i++) {
+        consumers.emplace_back(consumer);
+    }
+
+    prod.join();
+    for (auto& c : consumers) {
+        c.join();
+    }
+
+    EXPECT_EQ(total_consumed.load(), kTotalItems);
+
+    for (int i = 0; i < kTotalItems; i++) {
+        EXPECT_EQ(consumed_count[i].load(), 1)
+            << "Item " << i << " consumed "
+            << consumed_count[i].load() << " times (expected 1)";
+    }
+}
+
+// =============================================================================
+// LocalReadyBuffer tests
+// =============================================================================
+
+class LocalReadyBufferTest : public ::testing::Test {
+protected:
+    static constexpr int kCapacity = 8;
+
+    PTO2LocalReadyBuffer buffer;
+    PTO2TaskSlotState* backing[kCapacity];
+
+    void SetUp() override {
+        buffer.reset(backing, kCapacity);
+    }
+};
+
+// 14. reset sets clean state
+TEST_F(LocalReadyBufferTest, ResetSetsCleanState) {
+    EXPECT_EQ(buffer.count, 0);
+    EXPECT_EQ(buffer.capacity, kCapacity);
+    EXPECT_EQ(buffer.slot_states, backing);
+}
+
+// 15. try_push/pop LIFO: push A,B -> pop returns B,A
+TEST_F(LocalReadyBufferTest, LIFOOrdering) {
+    PTO2TaskSlotState a, b;
+
+    ASSERT_TRUE(buffer.try_push(&a));
+    ASSERT_TRUE(buffer.try_push(&b));
+
+    EXPECT_EQ(buffer.pop(), &b);
+    EXPECT_EQ(buffer.pop(), &a);
+    EXPECT_EQ(buffer.pop(), nullptr);
+}
+
+// 16. try_push full: returns false at capacity
+TEST_F(LocalReadyBufferTest, TryPushFullReturnsFalse) {
+    PTO2TaskSlotState items[kCapacity + 1];
+
+    for (int i = 0; i < kCapacity; i++) {
+        ASSERT_TRUE(buffer.try_push(&items[i]));
+    }
+
+    EXPECT_FALSE(buffer.try_push(&items[kCapacity]));
+}
+
+// 17. pop empty: returns nullptr
+TEST_F(LocalReadyBufferTest, PopEmptyReturnsNullptr) {
+    EXPECT_EQ(buffer.pop(), nullptr);
+}
diff --git a/tests/ut/cpp/test_ring_buffer.cpp b/tests/ut/cpp/test_ring_buffer.cpp
new file mode 100644
index 000000000..58bd8691f
--- /dev/null
+++ b/tests/ut/cpp/test_ring_buffer.cpp
@@ -0,0 +1,573 @@
+/**
+ * Unit tests for PTO2TaskAllocator and PTO2DepListPool from pto_ring_buffer.h
+ *
+ * Tests ring buffer allocation, heap bump logic, dependency list pool,
+ * and known boundary conditions including a bug candidate in
+ * try_bump_heap wrap-around when tail == alloc_size.
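+ *
+ * Heap model assumed by these tests: a bump allocator over [0, heap_size)
+ * in which top chases tail. When top >= tail the free space is the end
+ * segment plus the beginning; an allocation that does not fit at the end
+ * may wrap to offset 0 provided [0, alloc_size) lies below tail. When
+ * top < tail, only the contiguous region [top, tail) is usable.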
+ */
+
+#include <gtest/gtest.h>
+
+#include <atomic>
+#include <cstring>
+#include <vector>
+
+#include "pto_ring_buffer.h"
+
+// =============================================================================
+// Helpers
+// =============================================================================
+
+static constexpr int32_t kWindowSize = 16;  // Power of 2, small for testing
+static constexpr uint64_t kHeapSize = 1024;  // Small heap for boundary testing
+
+/**
+ * Test fixture for PTO2TaskAllocator tests.
+ *
+ * Sets up a descriptor array, heap buffer, and atomic flow-control variables.
+ * last_alive starts at 0, so tasks 0..window_size-2 can be allocated before
+ * the ring is considered full (active = local_task_id - last_alive + 1 < window_size).
+ */
+class TaskAllocatorTest : public ::testing::Test {
+protected:
+    void SetUp() override {
+        descriptors_.resize(kWindowSize);
+        std::memset(descriptors_.data(), 0, sizeof(PTO2TaskDescriptor) * kWindowSize);
+        heap_buf_.resize(kHeapSize, 0);
+
+        current_index_.store(0, std::memory_order_relaxed);
+        last_alive_.store(0, std::memory_order_relaxed);
+        error_code_.store(0, std::memory_order_relaxed);
+
+        allocator_.init(
+            descriptors_.data(), kWindowSize,
+            &current_index_, &last_alive_,
+            heap_buf_.data(), kHeapSize,
+            &error_code_);
+    }
+
+    // Simulate the scheduler consuming tasks up to (exclusive) task_id
+    // by advancing last_alive and setting packed_buffer_end on the consumed descriptor.
+    void consume_up_to(int32_t task_id, uint64_t heap_tail_offset) {
+        // Set the packed_buffer_end on the descriptor that last_alive-1 maps to
+        // so update_heap_tail can derive the tail.
+        int32_t last_consumed = task_id - 1;
+        descriptors_[last_consumed & (kWindowSize - 1)].packed_buffer_end =
+            static_cast<uint64_t>(reinterpret_cast<uintptr_t>(heap_buf_.data())) + heap_tail_offset;
+        last_alive_.store(task_id, std::memory_order_release);
+    }
+
+    PTO2TaskAllocator allocator_;
+    std::vector<PTO2TaskDescriptor> descriptors_;
+    std::vector<uint8_t> heap_buf_;
+    std::atomic<int32_t> current_index_{0};
+    std::atomic<int32_t> last_alive_{0};
+    std::atomic<int32_t> error_code_{0};
+};
+
+// =============================================================================
+// TaskAllocator: init and state queries
+// =============================================================================
+
+TEST_F(TaskAllocatorTest, InitialState) {
+    EXPECT_EQ(allocator_.window_size(), kWindowSize);
+    EXPECT_EQ(allocator_.active_count(), 0);
+    EXPECT_EQ(allocator_.heap_top(), 0u);
+    EXPECT_EQ(allocator_.heap_capacity(), kHeapSize);
+    EXPECT_EQ(allocator_.heap_available(), kHeapSize);
+}
+
+// =============================================================================
+// TaskAllocator: single alloc with output_size=0
+// =============================================================================
+
+TEST_F(TaskAllocatorTest, AllocZeroOutputSize) {
+    auto result = allocator_.alloc(0);
+    ASSERT_FALSE(result.failed());
+    EXPECT_EQ(result.task_id, 0);
+    EXPECT_EQ(result.slot, 0);
+    // packed_base should be heap_base + 0 (non-null)
+    EXPECT_NE(result.packed_base, nullptr);
+    // packed_end == packed_base when output_size == 0
+    EXPECT_EQ(result.packed_base, result.packed_end);
+    // Heap top should not advance for zero-size alloc
+    EXPECT_EQ(allocator_.heap_top(), 0u);
+}
+
+// =============================================================================
+// TaskAllocator: single alloc with non-zero size
+// =============================================================================
+
+TEST_F(TaskAllocatorTest, AllocNonZeroSize) {
+    auto result = allocator_.alloc(100);
+    ASSERT_FALSE(result.failed());
+    EXPECT_EQ(result.task_id, 0);
+    EXPECT_EQ(result.slot, 0);
+    EXPECT_NE(result.packed_base, nullptr);
+    // 100 bytes aligned up to PTO2_ALIGN_SIZE (64) = 128
+    uint64_t expected_aligned = PTO2_ALIGN_UP(100u, PTO2_ALIGN_SIZE);
+    EXPECT_EQ(expected_aligned, 128u);
+    EXPECT_EQ(allocator_.heap_top(), expected_aligned);
+    EXPECT_EQ(static_cast<uint8_t*>(result.packed_end) - static_cast<uint8_t*>(result.packed_base),
+              static_cast<ptrdiff_t>(expected_aligned));
+}
+
+// =============================================================================
+// TaskAllocator: sequential allocs produce sequential task IDs
+// =============================================================================
+
+TEST_F(TaskAllocatorTest, SequentialTaskIds) {
+    for (int i = 0; i < 5; i++) {
+        auto result = allocator_.alloc(0);
+        ASSERT_FALSE(result.failed()) << "Alloc failed at i=" << i;
+        EXPECT_EQ(result.task_id, i);
+        EXPECT_EQ(result.slot, i & (kWindowSize - 1));
+    }
+    EXPECT_EQ(allocator_.active_count(), 5);
+}
+
+// =============================================================================
+// TaskAllocator: alignment of output_size to PTO2_ALIGN_SIZE
+// =============================================================================

+TEST_F(TaskAllocatorTest, OutputSizeAlignment) {
+    // 1 byte -> aligned to 64
+    auto r1 = allocator_.alloc(1);
+    ASSERT_FALSE(r1.failed());
+    EXPECT_EQ(allocator_.heap_top(), 64u);
+
+    // Another 33 bytes -> aligned to 64, total 128
+    auto r2 = allocator_.alloc(33);
+    ASSERT_FALSE(r2.failed());
+    EXPECT_EQ(allocator_.heap_top(), 128u);
+
+    // Exactly 64 bytes -> stays 64, total 192
+    auto r3 = allocator_.alloc(64);
+    ASSERT_FALSE(r3.failed());
+    EXPECT_EQ(allocator_.heap_top(), 192u);
+}
+
+// =============================================================================
+// TaskAllocator: try_bump_heap exact fit at end (space_at_end == alloc_size)
+// =============================================================================
+
+TEST_F(TaskAllocatorTest, HeapExactFitAtEnd) {
+    // Heap size is 1024. Allocate 960 bytes (15 * 64) to leave exactly 64 at end.
+    // Then allocate exactly 64 which should succeed (space_at_end >= alloc_size).
+    auto r1 = allocator_.alloc(960);
+    ASSERT_FALSE(r1.failed());
+    EXPECT_EQ(allocator_.heap_top(), 960u);
+
+    auto r2 = allocator_.alloc(64);
+    ASSERT_FALSE(r2.failed());
+    EXPECT_EQ(allocator_.heap_top(), 1024u);
+    // Result pointer should be at heap_base + 960
+    EXPECT_EQ(static_cast<uint8_t*>(r2.packed_base),
+              heap_buf_.data() + 960);
+}
+
+// =============================================================================
+// TaskAllocator: try_bump_heap wrap-around with tail == alloc_size (BUG TEST)
+//
+// BUG at pto_ring_buffer.h try_bump_heap: uses `tail > alloc_size` (strict >).
+// When tail == alloc_size, [0, alloc_size) is exactly available, so the
+// wrap-around should succeed. The strict > incorrectly rejects it.
+//
+// Correct behavior: allocation succeeds, packed_base == heap_base, top == 64.
+// This test FAILS until the bug is fixed (> changed to >=).
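+//
+// A minimal sketch of the intended wrap-around branch (local names assumed
+// from the comments above, not copied from the header):
+//
+//   if (heap_size - top >= alloc_size) {   // fits at the end
+//       result = heap_base + top; top += alloc_size;
+//   } else if (tail >= alloc_size) {       // >=, not >: exact fit at front
+//       result = heap_base; top = alloc_size;
+//   } else {
+//       // spin until the scheduler advances tail
+//   }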
+// =============================================================================
+
+TEST_F(TaskAllocatorTest, HeapWrapAroundTailEqualsAllocSize_BugCandidate) {
+    // Fill heap completely: allocate 1024 bytes total
+    auto r1 = allocator_.alloc(1024);
+    ASSERT_FALSE(r1.failed());
+    EXPECT_EQ(allocator_.heap_top(), 1024u);
+
+    // Now consume task 0, setting tail to exactly 64 (one aligned block)
+    consume_up_to(1, 64);
+
+    // top=1024 (== heap_size), tail=64
+    // space_at_end = 0, wrap-around: tail(64) >= alloc_size(64) => exactly fits.
+    // Correct behavior: allocation wraps to [0, 64) and succeeds.
+    auto r2 = allocator_.alloc(64);
+    ASSERT_FALSE(r2.failed()) << "wrap-around must succeed when tail == alloc_size";
+    EXPECT_EQ(r2.packed_base, static_cast<void*>(heap_buf_.data()))
+        << "packed_base should wrap to start of heap";
+    EXPECT_EQ(allocator_.heap_top(), 64u);
+    EXPECT_EQ(error_code_.load(), 0) << "no error on successful allocation";
+}
+
+// =============================================================================
+// TaskAllocator: try_bump_heap wrap-around success (tail > alloc_size)
+// =============================================================================
+
+TEST_F(TaskAllocatorTest, HeapWrapAroundSuccess) {
+    // Fill heap completely: allocate 1024 bytes
+    auto r1 = allocator_.alloc(1024);
+    ASSERT_FALSE(r1.failed());
+    EXPECT_EQ(allocator_.heap_top(), 1024u);
+
+    // Consume task 0, setting tail to 128 (more than one block)
+    consume_up_to(1, 128);
+
+    // Now: top=1024 (== heap_size), tail=128
+    // space_at_end = 0, so wrap-around check: tail(128) > alloc_size(64)? => TRUE
+    // Wraps to beginning: result = heap_base, top = 64
+    auto r2 = allocator_.alloc(64);
+    ASSERT_FALSE(r2.failed());
+    EXPECT_EQ(r2.packed_base, static_cast<void*>(heap_buf_.data()));
+    EXPECT_EQ(allocator_.heap_top(), 64u);
+}
+
+// =============================================================================
+// TaskAllocator: try_bump_heap top < tail exact fit
+// =============================================================================
+
+TEST_F(TaskAllocatorTest, HeapTopLessThanTailExactFit) {
+    // Fill heap, then wrap around to set up top < tail.
+    auto r1 = allocator_.alloc(960);
+    ASSERT_FALSE(r1.failed());
+
+    // Consume task 0, tail moves to 960
+    consume_up_to(1, 960);
+
+    // Allocate 128 bytes: space_at_end = 1024-960 = 64, not enough for 128.
+    // Wrap-around: tail(960) > 128 => TRUE, wraps.
+    auto r2 = allocator_.alloc(128);
+    ASSERT_FALSE(r2.failed());
+    EXPECT_EQ(allocator_.heap_top(), 128u);
+
+    // Now top=128, tail=960 (top < tail)
+    // Available = tail - top = 960 - 128 = 832
+    // Allocate exactly 832 bytes: should succeed (exact fit)
+    auto r3 = allocator_.alloc(832);
+    ASSERT_FALSE(r3.failed());
+    EXPECT_EQ(allocator_.heap_top(), 960u);
+}
+
+// =============================================================================
+// TaskAllocator: try_bump_heap top < tail insufficient space
+// =============================================================================
+
+TEST_F(TaskAllocatorTest, HeapTopLessThanTailInsufficientSpace) {
+    // Set up top < tail scenario
+    auto r1 = allocator_.alloc(960);
+    ASSERT_FALSE(r1.failed());
+    consume_up_to(1, 960);
+
+    auto r2 = allocator_.alloc(128);
+    ASSERT_FALSE(r2.failed());
+    EXPECT_EQ(allocator_.heap_top(), 128u);
+
+    // Now top=128, tail=960. Available = 832.
+    // Try to allocate 896 (> 832): should fail (deadlock after spin).
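+    // With top < tail, only the contiguous region [top, tail) is usable,
+    // so the 896-byte request cannot be satisfied from the 832 free bytes.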
+    auto r3 = allocator_.alloc(896);
+    EXPECT_TRUE(r3.failed());
+    EXPECT_NE(error_code_.load(), 0);
+}
+
+// =============================================================================
+// TaskAllocator: update_heap_tail from consumed task
+// =============================================================================
+
+TEST_F(TaskAllocatorTest, UpdateHeapTailFromConsumedTask) {
+    auto r1 = allocator_.alloc(256);
+    ASSERT_FALSE(r1.failed());
+    EXPECT_EQ(allocator_.heap_top(), 256u);
+
+    // Before consumption, heap_available should be heap_size - top = 768
+    EXPECT_EQ(allocator_.heap_available(), kHeapSize - 256u);
+
+    // Consume task 0, tail moves to 256
+    consume_up_to(1, 256);
+
+    // Force the allocator to observe the new last_alive by doing another alloc
+    auto r2 = allocator_.alloc(0);
+    ASSERT_FALSE(r2.failed());
+
+    // After update_heap_tail: top=256, tail=256 (top >= tail), so
+    // at_end = 1024-256 = 768 and at_begin = 256;
+    // heap_available returns max(at_end, at_begin) = 768
+    EXPECT_EQ(allocator_.heap_available(), kHeapSize - 256u);
+}
+
+// =============================================================================
+// TaskAllocator: update_heap_tail at task 0 boundary
+//
+// When last_alive=1, update_heap_tail reads descriptors[(1-1) & mask] = descriptors[0].
+// This is task 0's descriptor, which should have valid packed_buffer_end.
+// =============================================================================
+
+TEST_F(TaskAllocatorTest, UpdateHeapTailAtTask0) {
+    // Allocate task 0 with some heap
+    auto r1 = allocator_.alloc(64);
+    ASSERT_FALSE(r1.failed());
+    EXPECT_EQ(r1.task_id, 0);
+
+    // Set packed_buffer_end on task 0's descriptor
+    descriptors_[0].packed_buffer_end =
+        static_cast<uint64_t>(reinterpret_cast<uintptr_t>(heap_buf_.data())) + 64;
+
+    // Advance last_alive to 1 (meaning task 0 is consumed)
+    last_alive_.store(1, std::memory_order_release);
+
+    // The next alloc triggers update_heap_tail(1), reading descriptors[0].
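+    // ((1 - 1) & (kWindowSize - 1)) == 0, so the boundary read lands on
+    // slot 0 rather than wrapping to the end of the descriptor array.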
+    auto r2 = allocator_.alloc(0);
+    ASSERT_FALSE(r2.failed());
+    EXPECT_EQ(r2.task_id, 1);
+}
+
+// =============================================================================
+// TaskAllocator: update_heap_tail idempotent
+// =============================================================================
+
+TEST_F(TaskAllocatorTest, UpdateHeapTailIdempotent) {
+    auto r1 = allocator_.alloc(128);
+    ASSERT_FALSE(r1.failed());
+
+    consume_up_to(1, 128);
+
+    // Multiple allocs should not cause heap_tail to drift
+    auto r2 = allocator_.alloc(0);
+    ASSERT_FALSE(r2.failed());
+    uint64_t avail_after_first = allocator_.heap_available();
+
+    auto r3 = allocator_.alloc(0);
+    ASSERT_FALSE(r3.failed());
+    EXPECT_EQ(allocator_.heap_available(), avail_after_first);
+}
+
+// =============================================================================
+// TaskAllocator: heap_available for top>=tail and top<tail
+// =============================================================================
+
+TEST_F(TaskAllocatorTest, HeapAvailableTopVsTail) {
+    // top >= tail: available is the larger of the end and begin segments
+    auto r1 = allocator_.alloc(256);
+    ASSERT_FALSE(r1.failed());
+    EXPECT_EQ(allocator_.heap_available(), kHeapSize - 256u);
+
+    // Set up top < tail: fill to 960, consume both tasks, then wrap
+    auto r2 = allocator_.alloc(704);
+    ASSERT_FALSE(r2.failed());
+    consume_up_to(2, 960);
+
+    auto r3 = allocator_.alloc(128);  // wraps: top=128, tail=960
+    ASSERT_FALSE(r3.failed());
+    EXPECT_EQ(allocator_.heap_available(), 960u - 128u);
+}
+
+// =============================================================================
+// DepListPool test fixture
+// =============================================================================
+
+class DepListPoolTest : public ::testing::Test {
+protected:
+    static constexpr int32_t kPoolCapacity = 8;
+
+    void SetUp() override {
+        entries_.resize(kPoolCapacity);
+        pool_.init(entries_.data(), kPoolCapacity, &error_code_);
+    }
+
+    PTO2DepListPool pool_;
+    std::vector<PTO2DepListEntry> entries_;
+    std::atomic<int32_t> error_code_{0};
+};
+
+// =============================================================================
+// DepListPool: init (top=1, tail=1, entry 0 is NULL)
+// =============================================================================
+
+TEST_F(DepListPoolTest, InitialState) {
+    EXPECT_EQ(pool_.top, 1);
+    EXPECT_EQ(pool_.tail, 1);
+    EXPECT_EQ(pool_.high_water, 0);
+    EXPECT_EQ(pool_.used(), 0);
+    EXPECT_EQ(pool_.available(), kPoolCapacity);
+
+    // Entry 0 should be NULL marker
+    EXPECT_EQ(entries_[0].slot_state, nullptr);
+    EXPECT_EQ(entries_[0].next, nullptr);
+}
+
+// =============================================================================
+// DepListPool: single alloc
+// =============================================================================
+
+TEST_F(DepListPoolTest, SingleAlloc) {
+    PTO2DepListEntry* entry = pool_.alloc();
+    ASSERT_NE(entry, nullptr);
+    EXPECT_EQ(pool_.top, 2);
+    EXPECT_EQ(pool_.tail, 1);
+    EXPECT_EQ(pool_.used(), 1);
+    EXPECT_EQ(pool_.available(), kPoolCapacity - 1);
+    EXPECT_EQ(pool_.high_water, 1);
+
+    // The allocated entry should be at index 1 (top was 1, mod capacity)
+    EXPECT_EQ(entry, &entries_[1]);
+}
+
+// =============================================================================
+// DepListPool: modular wrap — entry 0 must not be returned (BUG TEST)
+//
+// BUG in PTO2DepListPool::alloc(): uses `top % capacity` as physical index.
+// When top is a multiple of capacity (e.g. top=8 with capacity=8), this
+// yields index 0 — the NULL sentinel slot set during init(). Handing out
+// &entries_[0] corrupts the sentinel, breaking pto2_dep_pool_get().
+//
+// Correct behavior: the pool must never return the NULL sentinel slot.
+// This test FAILS until the bug is fixed (index arithmetic skips slot 0).
+// =============================================================================
+
+TEST_F(DepListPoolTest, ModularWrapEntry0Conflict) {
+    // Capacity is 8. Allocate 7 entries (top goes from 1 to 8).
+    for (int i = 0; i < 7; i++) {
+        PTO2DepListEntry* e = pool_.alloc();
+        ASSERT_NE(e, nullptr) << "Failed at alloc " << i;
+    }
+    EXPECT_EQ(pool_.top, 8);
+    EXPECT_EQ(pool_.used(), 7);
+
+    // Advance tail so pool is not full (used drops to 3)
+    pool_.advance_tail(5);
+    EXPECT_EQ(pool_.used(), 3);
+
+    // Correct behavior: allocation must NOT return the NULL sentinel slot.
+ PTO2DepListEntry* e = pool_.alloc(); + ASSERT_NE(e, nullptr) << "allocation must succeed (space is available)"; + EXPECT_NE(e, &entries_[0]) << "must never return the NULL sentinel at index 0"; + // The sentinel must remain intact + EXPECT_EQ(entries_[0].slot_state, nullptr) << "NULL sentinel must not be overwritten"; + EXPECT_EQ(entries_[0].next, nullptr); +} + +// ============================================================================= +// DepListPool: overflow detection +// ============================================================================= + +TEST_F(DepListPoolTest, OverflowDetection) { + // Allocate until full (capacity entries used) + for (int i = 0; i < kPoolCapacity; i++) { + PTO2DepListEntry* e = pool_.alloc(); + ASSERT_NE(e, nullptr) << "Unexpected failure at alloc " << i; + } + EXPECT_EQ(pool_.used(), kPoolCapacity); + EXPECT_EQ(pool_.available(), 0); + + // Next alloc should fail (overflow) + PTO2DepListEntry* overflow = pool_.alloc(); + EXPECT_EQ(overflow, nullptr); + EXPECT_NE(error_code_.load(), 0); + EXPECT_EQ(error_code_.load(), PTO2_ERROR_DEP_POOL_OVERFLOW); +} + +// ============================================================================= +// DepListPool: prepend chain integrity +// ============================================================================= + +TEST_F(DepListPoolTest, PrependChainIntegrity) { + PTO2TaskSlotState slot_a{}; + PTO2TaskSlotState slot_b{}; + PTO2TaskSlotState slot_c{}; + + // Build a chain: NULL -> slot_a -> slot_b -> slot_c (prepend order) + PTO2DepListEntry* head = nullptr; + + head = pool_.prepend(head, &slot_a); + ASSERT_NE(head, nullptr); + EXPECT_EQ(head->slot_state, &slot_a); + EXPECT_EQ(head->next, nullptr); + + head = pool_.prepend(head, &slot_b); + ASSERT_NE(head, nullptr); + EXPECT_EQ(head->slot_state, &slot_b); + EXPECT_EQ(head->next->slot_state, &slot_a); + EXPECT_EQ(head->next->next, nullptr); + + head = pool_.prepend(head, &slot_c); + ASSERT_NE(head, nullptr); + EXPECT_EQ(head->slot_state, &slot_c); + EXPECT_EQ(head->next->slot_state, &slot_b); + EXPECT_EQ(head->next->next->slot_state, &slot_a); + EXPECT_EQ(head->next->next->next, nullptr); +} + +// ============================================================================= +// DepListPool: advance_tail +// ============================================================================= + +TEST_F(DepListPoolTest, AdvanceTail) { + // Allocate 4 entries + for (int i = 0; i < 4; i++) { + pool_.alloc(); + } + EXPECT_EQ(pool_.used(), 4); + EXPECT_EQ(pool_.available(), kPoolCapacity - 4); + + // Advance tail by 3 (from 1 to 4) + pool_.advance_tail(4); + EXPECT_EQ(pool_.tail, 4); + EXPECT_EQ(pool_.used(), 1); // top=5, tail=4 + EXPECT_EQ(pool_.available(), kPoolCapacity - 1); +} + +// ============================================================================= +// DepListPool: advance_tail backwards (no-op) +// ============================================================================= + +TEST_F(DepListPoolTest, AdvanceTailBackwardsNoop) { + pool_.alloc(); + pool_.alloc(); + pool_.advance_tail(3); + EXPECT_EQ(pool_.tail, 3); + + // Trying to advance backwards should be a no-op + pool_.advance_tail(2); + EXPECT_EQ(pool_.tail, 3); + + // Same value should also be a no-op + pool_.advance_tail(3); + EXPECT_EQ(pool_.tail, 3); +} + +// ============================================================================= +// DepListPool: pto2_dep_pool_get(0) returns NULL +// ============================================================================= + 
+TEST_F(DepListPoolTest, GetOffsetZeroReturnsNull) {
+    PTO2DepListEntry* result = pool_.pto2_dep_pool_get(0);
+    EXPECT_EQ(result, nullptr);
+}
+
+// =============================================================================
+// DepListPool: pto2_dep_pool_get(-1) returns NULL
+// =============================================================================
+
+TEST_F(DepListPoolTest, GetNegativeOffsetReturnsNull) {
+    PTO2DepListEntry* result = pool_.pto2_dep_pool_get(-1);
+    EXPECT_EQ(result, nullptr);
+}
+
+// =============================================================================
+// DepListPool: pto2_dep_pool_get with valid offset
+// =============================================================================
+
+TEST_F(DepListPoolTest, GetValidOffset) {
+    PTO2DepListEntry* result = pool_.pto2_dep_pool_get(1);
+    EXPECT_EQ(result, &entries_[1]);
+
+    result = pool_.pto2_dep_pool_get(5);
+    EXPECT_EQ(result, &entries_[5]);
+}
diff --git a/tests/ut/cpp/test_scheduler_state.cpp b/tests/ut/cpp/test_scheduler_state.cpp
new file mode 100644
index 000000000..c82ebf1d8
--- /dev/null
+++ b/tests/ut/cpp/test_scheduler_state.cpp
@@ -0,0 +1,231 @@
+/**
+ * Unit tests for PTO2SchedulerState from pto_scheduler.h
+ *
+ * Tests task state transitions, fanin/fanout logic, subtask completion.
+ */
+
+#include <gtest/gtest.h>
+
+#include <atomic>
+#include <cstring>
+#include <thread>
+#include <vector>
+
+#include "pto_scheduler.h"
+
+class SchedulerStateTest : public ::testing::Test {
+protected:
+    PTO2SchedulerState sched;
+    PTO2SharedMemoryHandle* sm_handle = nullptr;
+
+    void SetUp() override {
+        sm_handle = pto2_sm_create_default();
+        ASSERT_NE(sm_handle, nullptr);
+        bool ok = pto2_scheduler_init(&sched, sm_handle);
+        ASSERT_TRUE(ok);
+    }
+
+    void TearDown() override {
+        pto2_scheduler_destroy(&sched);
+        if (sm_handle) {
+            pto2_sm_destroy(sm_handle);
+        }
+    }
+
+    void init_slot(PTO2TaskSlotState& slot, PTO2TaskState state,
+                   int32_t fanin_count, int32_t fanout_count,
+                   uint8_t ring_id = 0) {
+        memset(&slot, 0, sizeof(slot));
+        slot.task_state.store(state);
+        slot.fanin_count = fanin_count;
+        slot.fanin_refcount.store(0);
+        slot.fanout_count = fanout_count;
+        slot.fanout_refcount.store(0);
+        slot.fanout_lock.store(0);
+        slot.fanout_head = nullptr;
+        slot.ring_id = ring_id;
+        slot.active_mask = PTO2_SUBTASK_MASK_AIC;
+        slot.completed_subtasks.store(0);
+        slot.total_required_subtasks = 1;
+        slot.block_num = 1;
+    }
+};
+
+// =============================================================================
+// check_and_handle_consumed
+// =============================================================================
+
+TEST_F(SchedulerStateTest, ConsumedNotReady) {
+    alignas(64) PTO2TaskSlotState slot;
+    init_slot(slot, PTO2_TASK_COMPLETED, 1, 2);
+    slot.fanout_refcount.store(1);  // 1 != 2
+
+    sched.check_and_handle_consumed(slot);
+    EXPECT_EQ(slot.task_state.load(), PTO2_TASK_COMPLETED);
+}
+
+TEST_F(SchedulerStateTest, ConsumedTransition) {
+    alignas(64) PTO2TaskSlotState slot;
+    init_slot(slot, PTO2_TASK_COMPLETED, 1, 2);
+    slot.fanout_refcount.store(2);  // matches fanout_count
+
+    sched.check_and_handle_consumed(slot);
+    EXPECT_EQ(slot.task_state.load(), PTO2_TASK_CONSUMED);
+}
+
+TEST_F(SchedulerStateTest, ConsumedNotCompletedState) {
+    alignas(64) PTO2TaskSlotState slot;
+    init_slot(slot, PTO2_TASK_RUNNING, 1, 1);
+    slot.fanout_refcount.store(1);
+
+    sched.check_and_handle_consumed(slot);
+    // CAS fails because state is RUNNING, not COMPLETED
+    EXPECT_EQ(slot.task_state.load(), PTO2_TASK_RUNNING);
+}
+
+TEST_F(SchedulerStateTest, ConsumedIdempotent) {
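+    // Re-running the consumed check on an already-CONSUMED slot must be a
+    // no-op: the CAS expects COMPLETED and simply fails.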
+    alignas(64) PTO2TaskSlotState slot;
+    init_slot(slot, PTO2_TASK_CONSUMED, 1, 1);
+    slot.fanout_refcount.store(1);
+
+    sched.check_and_handle_consumed(slot);
+    EXPECT_EQ(slot.task_state.load(), PTO2_TASK_CONSUMED);
+}
+
+// =============================================================================
+// release_producer
+// =============================================================================
+
+TEST_F(SchedulerStateTest, ReleaseProducerIncrements) {
+    alignas(64) PTO2TaskSlotState slot;
+    init_slot(slot, PTO2_TASK_COMPLETED, 1, 3);
+
+    sched.release_producer(slot);
+    EXPECT_EQ(slot.fanout_refcount.load(), 1);
+
+    sched.release_producer(slot);
+    EXPECT_EQ(slot.fanout_refcount.load(), 2);
+}
+
+TEST_F(SchedulerStateTest, ReleaseProducerTriggersConsumed) {
+    alignas(64) PTO2TaskSlotState slot;
+    init_slot(slot, PTO2_TASK_COMPLETED, 1, 2);
+    slot.fanout_refcount.store(1);  // One away
+
+    sched.release_producer(slot);
+    EXPECT_EQ(slot.task_state.load(), PTO2_TASK_CONSUMED);
+}
+
+// =============================================================================
+// release_fanin_and_check_ready
+// =============================================================================
+
+TEST_F(SchedulerStateTest, FaninPartialNotReady) {
+    alignas(64) PTO2TaskSlotState slot;
+    init_slot(slot, PTO2_TASK_PENDING, 3, 1);
+
+    bool ready = sched.release_fanin_and_check_ready(slot);
+    EXPECT_FALSE(ready);
+    EXPECT_EQ(slot.fanin_refcount.load(), 1);
+}
+
+TEST_F(SchedulerStateTest, FaninAllSatisfiedReady) {
+    alignas(64) PTO2TaskSlotState slot;
+    init_slot(slot, PTO2_TASK_PENDING, 1, 1);
+
+    bool ready = sched.release_fanin_and_check_ready(slot);
+    EXPECT_TRUE(ready);
+}
+
+// =============================================================================
+// on_subtask_complete
+// =============================================================================
+
+TEST_F(SchedulerStateTest, SubtaskCompleteSingle) {
+    alignas(64) PTO2TaskSlotState slot;
+    init_slot(slot, PTO2_TASK_RUNNING, 1, 1);
+    slot.total_required_subtasks = 1;
+    slot.completed_subtasks.store(0);
+
+    EXPECT_TRUE(sched.on_subtask_complete(slot));
+}
+
+TEST_F(SchedulerStateTest, SubtaskCompleteMultiBlock) {
+    alignas(64) PTO2TaskSlotState slot;
+    init_slot(slot, PTO2_TASK_RUNNING, 1, 1);
+    slot.total_required_subtasks = 6;  // 3 cores * 2 blocks
+    slot.completed_subtasks.store(0);
+
+    for (int i = 0; i < 5; i++) {
+        EXPECT_FALSE(sched.on_subtask_complete(slot));
+    }
+    EXPECT_TRUE(sched.on_subtask_complete(slot));
+}
+
+TEST_F(SchedulerStateTest, SubtaskCompleteConcurrent) {
+    alignas(64) PTO2TaskSlotState slot;
+    init_slot(slot, PTO2_TASK_RUNNING, 1, 1);
+    slot.total_required_subtasks = 6;
+    slot.completed_subtasks.store(0);
+
+    std::atomic<int> true_count{0};
+    std::vector<std::thread> threads;
+    for (int i = 0; i < 6; i++) {
+        threads.emplace_back([&]() {
+            if (sched.on_subtask_complete(slot)) {
+                true_count.fetch_add(1);
+            }
+        });
+    }
+    for (auto& t : threads) t.join();
+
+    EXPECT_EQ(true_count.load(), 1);
+    EXPECT_EQ(slot.completed_subtasks.load(), 6);
+}
+
+// =============================================================================
+// on_scope_end
+// =============================================================================
+
+TEST_F(SchedulerStateTest, ScopeEndBatchRelease) {
+    constexpr int N = 4;
+    alignas(64) PTO2TaskSlotState slots[N];
+    PTO2TaskSlotState* ptrs[N];
+
+    for (int i = 0; i < N; i++) {
+        init_slot(slots[i], PTO2_TASK_COMPLETED, 1, 2);
+        ptrs[i] = &slots[i];
+    }
+
+    sched.on_scope_end(ptrs, N);
+
+    for (int i = 0; i < N; i++) {
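+        // on_scope_end releases each slot as a producer exactly once
+        // (fanout_refcount goes 0 -> 1, still below fanout_count == 2)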
+        EXPECT_EQ(slots[i].fanout_refcount.load(), 1);
+    }
+}
+
+// =============================================================================
+// get_ready_tasks_batch: local buffer first
+// =============================================================================
+
+TEST_F(SchedulerStateTest, GetReadyTasksBatchLocalFirst) {
+    alignas(64) PTO2TaskSlotState slot_a, slot_b;
+    init_slot(slot_a, PTO2_TASK_READY, 0, 1);
+    init_slot(slot_b, PTO2_TASK_READY, 0, 1);
+
+    PTO2TaskSlotState* local_buf_storage[4];
+    PTO2LocalReadyBuffer local_buf;
+    local_buf.reset(local_buf_storage, 4);
+    local_buf.try_push(&slot_a);
+
+    // Push slot_b to global queue
+    sched.ready_queues[0].push(&slot_b);
+
+    PTO2TaskSlotState* out[4];
+    int count = sched.get_ready_tasks_batch(PTO2ResourceShape::AIC, local_buf, out, 4);
+
+    EXPECT_EQ(count, 2);
+    // Local buffer drains first (LIFO), so slot_a comes first
+    EXPECT_EQ(out[0], &slot_a);
+    EXPECT_EQ(out[1], &slot_b);
+}
diff --git a/tests/ut/cpp/test_shared_memory.cpp b/tests/ut/cpp/test_shared_memory.cpp
new file mode 100644
index 000000000..0a2acf620
--- /dev/null
+++ b/tests/ut/cpp/test_shared_memory.cpp
@@ -0,0 +1,84 @@
+/**
+ * Unit tests for PTO2SharedMemory layout from pto_shared_memory.h
+ */
+
+#include <gtest/gtest.h>
+
+#include "pto_shared_memory.h"
+
+class SharedMemoryTest : public ::testing::Test {
+protected:
+    PTO2SharedMemoryHandle* handle = nullptr;
+
+    void SetUp() override {
+        handle = pto2_sm_create_default();
+        ASSERT_NE(handle, nullptr);
+    }
+
+    void TearDown() override {
+        if (handle) {
+            pto2_sm_destroy(handle);
+            handle = nullptr;
+        }
+    }
+};
+
+TEST_F(SharedMemoryTest, CreateDefaultReturnsNonNull) {
+    EXPECT_NE(handle->sm_base, nullptr);
+    EXPECT_GT(handle->sm_size, 0u);
+}
+
+TEST_F(SharedMemoryTest, IsOwner) {
+    EXPECT_TRUE(handle->is_owner);
+}
+
+TEST_F(SharedMemoryTest, HeaderInitValues) {
+    auto* hdr = handle->header;
+    EXPECT_EQ(hdr->orchestrator_done.load(), 0);
+    EXPECT_EQ(hdr->orch_error_code.load(), 0);
+    EXPECT_EQ(hdr->sched_error_bitmap.load(), 0);
+    EXPECT_EQ(hdr->sched_error_code.load(), 0);
+
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        auto& fc = hdr->rings[r].fc;
+        EXPECT_EQ(fc.current_task_index.load(), 0);
+        EXPECT_EQ(fc.last_task_alive.load(), 0);
+    }
+}
+
+TEST_F(SharedMemoryTest, Validate) {
+    EXPECT_TRUE(pto2_sm_validate(handle));
+}
+
+TEST_F(SharedMemoryTest, PerRingIndependence) {
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        EXPECT_NE(handle->task_descriptors[r], nullptr) << "Ring " << r;
+        EXPECT_NE(handle->task_payloads[r], nullptr) << "Ring " << r;
+    }
+    // Different rings should have different pointers
+    for (int r = 1; r < PTO2_MAX_RING_DEPTH; r++) {
+        EXPECT_NE(handle->task_descriptors[r], handle->task_descriptors[0]) << "Ring " << r;
+    }
+}
+
+TEST_F(SharedMemoryTest, PointerAlignment) {
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        auto addr = reinterpret_cast<uintptr_t>(handle->task_descriptors[r]);
+        EXPECT_EQ(addr % PTO2_ALIGN_SIZE, 0u) << "Ring " << r << " descriptors not aligned";
+    }
+}
+
+TEST(SharedMemoryCalcSize, NonZero) {
+    uint64_t size = pto2_sm_calculate_size(PTO2_TASK_WINDOW_SIZE);
+    EXPECT_GT(size, 0u);
+}
+
+TEST(SharedMemoryCalcSize, LargerWindowGivesLargerSize) {
+    uint64_t small_size = pto2_sm_calculate_size(64);
+    uint64_t large_size = pto2_sm_calculate_size(256);
+    EXPECT_GT(large_size, small_size);
+}
+
+TEST(SharedMemoryCalcSize, HeaderAligned) {
+    EXPECT_EQ(sizeof(PTO2SharedMemoryHeader) % PTO2_ALIGN_SIZE, 0u);
+}
diff --git a/tests/ut/cpp/test_submit_types.cpp b/tests/ut/cpp/test_submit_types.cpp
new file mode 100644
index 000000000..0d0856f77
--- /dev/null
+++ b/tests/ut/cpp/test_submit_types.cpp
@@ -0,0 +1,144 @@
+/**
+ * Unit tests for pto_submit_types.h
+ *
+ * Tests submit contract types: subtask masks, resource shapes,
+ * active mask derivation, and launch spec.
+ */
+
+#include <gtest/gtest.h>
+
+#include "pto_submit_types.h"
+
+// =============================================================================
+// pto2_subtask_active
+// =============================================================================
+
+TEST(SubtaskActive, AICMaskActivatesAICSlot) {
+    EXPECT_TRUE(pto2_subtask_active(PTO2_SUBTASK_MASK_AIC, PTO2SubtaskSlot::AIC));
+    EXPECT_FALSE(pto2_subtask_active(PTO2_SUBTASK_MASK_AIC, PTO2SubtaskSlot::AIV0));
+    EXPECT_FALSE(pto2_subtask_active(PTO2_SUBTASK_MASK_AIC, PTO2SubtaskSlot::AIV1));
+}
+
+TEST(SubtaskActive, AIV0MaskActivatesAIV0Slot) {
+    EXPECT_FALSE(pto2_subtask_active(PTO2_SUBTASK_MASK_AIV0, PTO2SubtaskSlot::AIC));
+    EXPECT_TRUE(pto2_subtask_active(PTO2_SUBTASK_MASK_AIV0, PTO2SubtaskSlot::AIV0));
+    EXPECT_FALSE(pto2_subtask_active(PTO2_SUBTASK_MASK_AIV0, PTO2SubtaskSlot::AIV1));
+}
+
+TEST(SubtaskActive, AIV1MaskActivatesAIV1Slot) {
+    EXPECT_FALSE(pto2_subtask_active(PTO2_SUBTASK_MASK_AIV1, PTO2SubtaskSlot::AIC));
+    EXPECT_FALSE(pto2_subtask_active(PTO2_SUBTASK_MASK_AIV1, PTO2SubtaskSlot::AIV0));
+    EXPECT_TRUE(pto2_subtask_active(PTO2_SUBTASK_MASK_AIV1, PTO2SubtaskSlot::AIV1));
+}
+
+TEST(SubtaskActive, CombinedMask) {
+    uint8_t mask = PTO2_SUBTASK_MASK_AIC | PTO2_SUBTASK_MASK_AIV1;
+    EXPECT_TRUE(pto2_subtask_active(mask, PTO2SubtaskSlot::AIC));
+    EXPECT_FALSE(pto2_subtask_active(mask, PTO2SubtaskSlot::AIV0));
+    EXPECT_TRUE(pto2_subtask_active(mask, PTO2SubtaskSlot::AIV1));
+}
+
+TEST(SubtaskActive, AllActive) {
+    uint8_t mask = PTO2_SUBTASK_MASK_AIC | PTO2_SUBTASK_MASK_AIV0 | PTO2_SUBTASK_MASK_AIV1;
+    EXPECT_TRUE(pto2_subtask_active(mask, PTO2SubtaskSlot::AIC));
+    EXPECT_TRUE(pto2_subtask_active(mask, PTO2SubtaskSlot::AIV0));
+    EXPECT_TRUE(pto2_subtask_active(mask, PTO2SubtaskSlot::AIV1));
+}
+
+// =============================================================================
+// pto2_active_mask_to_shape
+// =============================================================================
+
+TEST(ActiveMaskToShape, SingleAIC) {
+    EXPECT_EQ(pto2_active_mask_to_shape(PTO2_SUBTASK_MASK_AIC), PTO2ResourceShape::AIC);
+}
+
+TEST(ActiveMaskToShape, SingleAIV0) {
+    EXPECT_EQ(pto2_active_mask_to_shape(PTO2_SUBTASK_MASK_AIV0), PTO2ResourceShape::AIV);
+}
+
+TEST(ActiveMaskToShape, SingleAIV1) {
+    EXPECT_EQ(pto2_active_mask_to_shape(PTO2_SUBTASK_MASK_AIV1), PTO2ResourceShape::AIV);
+}
+
+TEST(ActiveMaskToShape, TwoActiveBecomesMIX) {
+    uint8_t mask = PTO2_SUBTASK_MASK_AIC | PTO2_SUBTASK_MASK_AIV0;
+    EXPECT_EQ(pto2_active_mask_to_shape(mask), PTO2ResourceShape::MIX);
+}
+
+TEST(ActiveMaskToShape, AllThreeBecomesMIX) {
+    uint8_t mask = PTO2_SUBTASK_MASK_AIC | PTO2_SUBTASK_MASK_AIV0 | PTO2_SUBTASK_MASK_AIV1;
+    EXPECT_EQ(pto2_active_mask_to_shape(mask), PTO2ResourceShape::MIX);
+}
+
+// =============================================================================
+// pto2_mixed_kernels_to_active_mask
+// =============================================================================
+
+TEST(MixedKernelsToMask, AllInvalid) {
+    MixedKernels mk;
+    EXPECT_EQ(pto2_mixed_kernels_to_active_mask(mk), 0);
+}
+
+TEST(MixedKernelsToMask, AICOnly) {
+    MixedKernels mk;
+    mk.aic_kernel_id = 42;
+    EXPECT_EQ(pto2_mixed_kernels_to_active_mask(mk), PTO2_SUBTASK_MASK_AIC);
+}
+
+TEST(MixedKernelsToMask, AIV0Only) {
+    MixedKernels mk;
+    mk.aiv0_kernel_id = 7;
+    EXPECT_EQ(pto2_mixed_kernels_to_active_mask(mk), PTO2_SUBTASK_MASK_AIV0);
+}
+
+TEST(MixedKernelsToMask, AllValid) {
+    MixedKernels mk;
+    mk.aic_kernel_id = 1;
+    mk.aiv0_kernel_id = 2;
+    mk.aiv1_kernel_id = 3;
+    uint8_t expected = PTO2_SUBTASK_MASK_AIC | PTO2_SUBTASK_MASK_AIV0 | PTO2_SUBTASK_MASK_AIV1;
+    EXPECT_EQ(pto2_mixed_kernels_to_active_mask(mk), expected);
+}
+
+// =============================================================================
+// MixedKernels defaults
+// =============================================================================
+
+TEST(MixedKernels, DefaultsAreInvalid) {
+    MixedKernels mk;
+    EXPECT_EQ(mk.aic_kernel_id, INVALID_KERNEL_ID);
+    EXPECT_EQ(mk.aiv0_kernel_id, INVALID_KERNEL_ID);
+    EXPECT_EQ(mk.aiv1_kernel_id, INVALID_KERNEL_ID);
+}
+
+// =============================================================================
+// PTO2LaunchSpec
+// =============================================================================
+
+TEST(LaunchSpec, DefaultBlockNumIsOne) {
+    PTO2LaunchSpec spec;
+    EXPECT_EQ(spec.block_num(), 1);
+}
+
+TEST(LaunchSpec, SetAndGet) {
+    PTO2LaunchSpec spec;
+    spec.set_block_num(4);
+    EXPECT_EQ(spec.block_num(), 4);
+}
+
+// =============================================================================
+// Constants
+// =============================================================================
+
+TEST(Constants, SubtaskSlotCount) {
+    EXPECT_EQ(PTO2_SUBTASK_SLOT_COUNT, 3);
+}
+
+TEST(Constants, NumResourceShapes) {
+    EXPECT_EQ(PTO2_NUM_RESOURCE_SHAPES, 3);
+}
+
+TEST(Constants, InvalidKernelId) {
+    EXPECT_EQ(INVALID_KERNEL_ID, -1);
+}
diff --git a/tests/ut/cpp/test_tensor.cpp b/tests/ut/cpp/test_tensor.cpp
new file mode 100644
index 000000000..2a40354b6
--- /dev/null
+++ b/tests/ut/cpp/test_tensor.cpp
@@ -0,0 +1,354 @@
+/**
+ * Unit tests for Tensor and related types in tensor.h
+ *
+ * Tests Tensor operations, TensorCreateInfo, Segment intersection,
+ * and boundary conditions in cache-line layout coupling.
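+ *
+ * Segment here models a half-open range [begin, end): two segments are
+ * treated as intersecting iff a.end > b.begin && b.end > a.begin, which is
+ * why touching ranges do not intersect but a zero-length segment strictly
+ * inside a range does (see the Segment tests below).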
+ */
+
+#include <gtest/gtest.h>
+
+#include <cstring>
+
+#include "pto_orchestration_api.h"
+
+// Helper: create a Tensor via make_tensor_external (the public factory)
+static Tensor make_test_tensor(void* addr, uint64_t size, const uint32_t shapes[],
+                               uint32_t ndims, DataType dtype = DataType::FLOAT32,
+                               bool manual_dep = false, int32_t version = 0) {
+    return make_tensor_external(addr, shapes, ndims, dtype, manual_dep, version);
+}
+
+// =============================================================================
+// Segment intersection
+// =============================================================================
+
+TEST(Segment, OverlappingIntersects) {
+    Segment a{0, 10};
+    Segment b{5, 15};
+    EXPECT_TRUE(a.line_segment_intersection(b));
+    EXPECT_TRUE(b.line_segment_intersection(a));
+}
+
+TEST(Segment, TouchingDoesNotIntersect) {
+    Segment a{0, 10};
+    Segment b{10, 20};
+    EXPECT_FALSE(a.line_segment_intersection(b));
+    EXPECT_FALSE(b.line_segment_intersection(a));
+}
+
+TEST(Segment, DisjointDoesNotIntersect) {
+    Segment a{0, 5};
+    Segment b{10, 20};
+    EXPECT_FALSE(a.line_segment_intersection(b));
+    EXPECT_FALSE(b.line_segment_intersection(a));
+}
+
+TEST(Segment, ZeroLengthAtBoundary) {
+    // Zero-length segment at position 10 touching [0,10)
+    Segment a{10, 10};
+    Segment b{0, 10};
+    EXPECT_FALSE(a.line_segment_intersection(b));
+}
+
+TEST(Segment, ZeroLengthInsideRange) {
+    // Zero-length segment at position 5 inside [0,10)
+    // end(5) > other.begin(0) && other.end(10) > begin(5) => true
+    // KNOWN BEHAVIOR: zero-length segments report intersection.
+    // This could cause spurious dependencies in TensorMap overlap detection.
+    Segment a{5, 5};
+    Segment b{0, 10};
+    EXPECT_TRUE(a.line_segment_intersection(b));
+}
+
+TEST(Segment, IdenticalRanges) {
+    Segment a{0, 10};
+    EXPECT_TRUE(a.line_segment_intersection(a));
+}
+
+TEST(Segment, ContainsFull) {
+    Segment outer{0, 20};
+    Segment inner{5, 10};
+    EXPECT_TRUE(outer.contains(inner));
+}
+
+TEST(Segment, ContainsIdentical) {
+    Segment a{0, 10};
+    EXPECT_TRUE(a.contains(a));
+}
+
+TEST(Segment, DoesNotContainPartial) {
+    Segment a{0, 10};
+    Segment b{5, 15};
+    EXPECT_FALSE(a.contains(b));
+}
+
+TEST(Segment, ContainsAtBoundary) {
+    Segment a{0, 10};
+    Segment b{0, 10};
+    EXPECT_TRUE(a.contains(b));
+}
+
+// =============================================================================
+// TensorCreateInfo
+// =============================================================================
+
+TEST(TensorCreateInfo, BufferSizeBytes) {
+    uint32_t shapes[] = {4, 8};
+    TensorCreateInfo ci(shapes, 2, DataType::FLOAT32);
+    EXPECT_EQ(ci.buffer_size_bytes(), 4u * 8u * 4u);  // 4*8 elements * 4 bytes
+}
+
+TEST(TensorCreateInfo, BufferSizeBytesInt8) {
+    uint32_t shapes[] = {10, 20, 30};
+    TensorCreateInfo ci(shapes, 3, DataType::INT8);
+    EXPECT_EQ(ci.buffer_size_bytes(), 10u * 20u * 30u * 1u);
+}
+
+TEST(TensorCreateInfo, SizeIs64Bytes) {
+    EXPECT_EQ(sizeof(TensorCreateInfo), 64u);
+}
+
+TEST(TensorCreateInfo, InitialValueDefault) {
+    uint32_t shapes[] = {4};
+    TensorCreateInfo ci(shapes, 1);
+    EXPECT_FALSE(ci.has_initial_value);
+}
+
+TEST(TensorCreateInfo, SetInitialValue) {
+    uint32_t shapes[] = {4};
+    TensorCreateInfo ci(shapes, 1);
+    ci.set_initial_value(3.14f);
+    EXPECT_TRUE(ci.has_initial_value);
+}
+
+// =============================================================================
+// Tensor basic operations
+// =============================================================================
+
+TEST(Tensor, SizeIs128Bytes) {
+    EXPECT_EQ(sizeof(Tensor), 128u);
+}
+
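+// Tensor spans two 64-byte cache lines: metadata in the first, raw_shapes
+// opening the second (offset 64). TensorCreateInfo mirrors that second line,
+// which the LayoutCoupling test at the end of this file checks field by field.
+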
+TEST(Tensor, RawShapesAtOffset64) {
+    EXPECT_EQ(offsetof(Tensor, raw_shapes), 64u);
+}
+
+TEST(Tensor, MakeExternal) {
+    char buf[256];
+    uint32_t shapes[] = {4, 8};
+    auto t = make_test_tensor(buf, sizeof(buf), shapes, 2);
+    EXPECT_EQ(t.buffer.addr, reinterpret_cast<uint64_t>(buf));
+    EXPECT_EQ(t.ndims, 2u);
+    EXPECT_EQ(t.shapes[0], 4u);
+    EXPECT_EQ(t.shapes[1], 8u);
+}
+
+TEST(Tensor, Numel) {
+    char buf[256];
+    uint32_t shapes[] = {4, 8, 2};
+    auto t = make_test_tensor(buf, sizeof(buf), shapes, 3);
+    EXPECT_EQ(t.numel(), 64u);
+}
+
+TEST(Tensor, NumelZeroDim) {
+    char buf[256];
+    uint32_t shapes[] = {};
+    auto t = make_test_tensor(buf, sizeof(buf), shapes, 0);
+    EXPECT_EQ(t.numel(), 0u);
+}
+
+TEST(Tensor, IsContiguousWhenRawEqShapes) {
+    char buf[256];
+    uint32_t shapes[] = {4, 8};
+    auto t = make_test_tensor(buf, sizeof(buf), shapes, 2);
+    EXPECT_TRUE(t.is_raw_eq_shapes);
+    EXPECT_TRUE(t.is_contiguous());
+}
+
+TEST(Tensor, IsSameMemref) {
+    char buf1[256], buf2[256];
+    uint32_t shapes[] = {4};
+    auto t1 = make_test_tensor(buf1, sizeof(buf1), shapes, 1);
+    auto t2 = make_test_tensor(buf1, sizeof(buf1), shapes, 1);
+    auto t3 = make_test_tensor(buf2, sizeof(buf2), shapes, 1);
+    EXPECT_TRUE(t1.is_same_memref(t2));
+    EXPECT_FALSE(t1.is_same_memref(t3));
+}
+
+// =============================================================================
+// View
+// =============================================================================
+
+TEST(Tensor, ViewWithZeroOffsets) {
+    char buf[256];
+    uint32_t shapes[] = {10, 20};
+    auto parent = make_test_tensor(buf, sizeof(buf), shapes, 2);
+
+    uint32_t view_shapes[] = {5, 10};
+    uint32_t view_offsets[] = {0, 0};
+    auto v = parent.view(view_shapes, view_offsets);
+
+    EXPECT_EQ(v.shapes[0], 5u);
+    EXPECT_EQ(v.shapes[1], 10u);
+    EXPECT_TRUE(v.is_all_offset_zero);
+    EXPECT_EQ(v.buffer.addr, parent.buffer.addr);
+}
+
+TEST(Tensor, ViewWithNonZeroOffsets) {
+    char buf[256];
+    uint32_t shapes[] = {10, 20};
+    auto parent = make_test_tensor(buf, sizeof(buf), shapes, 2);
+
+    uint32_t view_shapes[] = {5, 10};
+    uint32_t view_offsets[] = {2, 3};
+    auto v = parent.view(view_shapes, view_offsets);
+
+    EXPECT_EQ(v.shapes[0], 5u);
+    EXPECT_EQ(v.shapes[1], 10u);
+    EXPECT_FALSE(v.is_all_offset_zero);
+    EXPECT_EQ(v.offsets[0], 2u);
+    EXPECT_EQ(v.offsets[1], 3u);
+}
+
+TEST(Tensor, ViewOffsetAccumulation) {
+    char buf[256];
+    uint32_t shapes[] = {20, 30};
+    auto parent = make_test_tensor(buf, sizeof(buf), shapes, 2);
+
+    // First view with offsets
+    uint32_t v1_shapes[] = {10, 15};
+    uint32_t v1_offsets[] = {5, 10};
+    auto v1 = parent.view(v1_shapes, v1_offsets);
+
+    // Second view on top of first
+    uint32_t v2_shapes[] = {3, 4};
+    uint32_t v2_offsets[] = {1, 2};
+    auto v2 = v1.view(v2_shapes, v2_offsets);
+
+    EXPECT_EQ(v2.offsets[0], 6u);   // 5 + 1
+    EXPECT_EQ(v2.offsets[1], 12u);  // 10 + 2
+}
+
+// =============================================================================
+// Reshape
+// =============================================================================
+
+TEST(Tensor, ReshapeContiguous) {
+    char buf[256];
+    uint32_t shapes[] = {4, 8};
+    auto t = make_test_tensor(buf, sizeof(buf), shapes, 2);
+
+    uint32_t new_shapes[] = {32};
+    auto r = t.reshape(new_shapes, 1);
+
+    EXPECT_EQ(r.numel(), 32u);
+    EXPECT_EQ(r.ndims, 1u);
+    EXPECT_EQ(r.shapes[0], 32u);
+    EXPECT_TRUE(r.is_raw_eq_shapes);
+    EXPECT_TRUE(r.is_all_offset_zero);
+}
+
+TEST(Tensor, ReshapePreservesBuffer) {
+    char buf[256];
+    uint32_t shapes[] = {4, 8};
+    auto t = make_test_tensor(buf, sizeof(buf), shapes, 2);
+
+    uint32_t new_shapes[] = {2, 16};
+    auto r = t.reshape(new_shapes, 2);
+
+    EXPECT_EQ(r.buffer.addr, t.buffer.addr);
+}
+
+// =============================================================================
+// Transpose
+// =============================================================================
+
+TEST(Tensor, TransposeSwapsDims) {
+    char buf[256];
+    uint32_t shapes[] = {4, 8, 2};
+    auto t = make_test_tensor(buf, sizeof(buf), shapes, 3);
+
+    auto tr = t.transpose(0, 2);
+
+    EXPECT_EQ(tr.shapes[0], 2u);
+    EXPECT_EQ(tr.shapes[1], 8u);
+    EXPECT_EQ(tr.shapes[2], 4u);
+    EXPECT_EQ(tr.numel(), t.numel());
+}
+
+// =============================================================================
+// compute_flat_offset
+// =============================================================================
+
+TEST(Tensor, ComputeFlatOffsetZeroDim) {
+    char buf[256];
+    uint32_t shapes[] = {};
+    auto t = make_test_tensor(buf, sizeof(buf), shapes, 0);
+    uint32_t indices[] = {};
+    EXPECT_EQ(t.compute_flat_offset(indices, 0), 0u);
+}
+
+TEST(Tensor, ComputeFlatOffset1D) {
+    char buf[256];
+    uint32_t shapes[] = {10};
+    auto t = make_test_tensor(buf, sizeof(buf), shapes, 1);
+    uint32_t indices[] = {7};
+    EXPECT_EQ(t.compute_flat_offset(indices, 1), 7u);
+}
+
+TEST(Tensor, ComputeFlatOffset2D) {
+    char buf[256];
+    uint32_t shapes[] = {4, 8};
+    auto t = make_test_tensor(buf, sizeof(buf), shapes, 2);
+    // Row-major: offset = i0 * 8 + i1 = 2*8+3 = 19
+    uint32_t indices[] = {2, 3};
+    EXPECT_EQ(t.compute_flat_offset(indices, 2), 19u);
+}
+
+// =============================================================================
+// update_start_offset
+// =============================================================================
+
+TEST(Tensor, UpdateStartOffsetZeroOffsets) {
+    char buf[256];
+    uint32_t shapes[] = {4, 8};
+    auto t = make_test_tensor(buf, sizeof(buf), shapes, 2);
+    t.update_start_offset();
+    EXPECT_EQ(t.start_offset, 0u);
+}
+
+// =============================================================================
+// fill_initial_value
+// =============================================================================
+
+TEST(Tensor, FillInitialValue) {
+    alignas(64) char buf[128];
+    memset(buf, 0, sizeof(buf));
+
+    uint32_t shapes[] = {32};
+    TensorCreateInfo ci(shapes, 1, DataType::FLOAT32);
+    ci.set_initial_value(1.0f);
+
+    // Use make_tensor_external then overwrite with init_from_create_info
+    auto t = make_tensor_external(buf, shapes, 1);
+    t.init_from_create_info(ci, buf, sizeof(buf));
+
+    // Check that the buffer was filled with 1.0f
+    float* data = reinterpret_cast<float*>(buf);
+    for (int i = 0; i < 32; i++) {
+        EXPECT_FLOAT_EQ(data[i], 1.0f) << "Mismatch at index " << i;
+    }
+}
+
+// =============================================================================
+// Layout coupling: TensorCreateInfo <-> Tensor cacheline 1
+// =============================================================================
+
+TEST(LayoutCoupling, TensorCreateInfoMatchesTensor) {
+    // These static_asserts are in tensor.h but we verify they compile here
+    static_assert(offsetof(TensorCreateInfo, version) == offsetof(Tensor, version));
+    static_assert(offsetof(TensorCreateInfo, dtype) == offsetof(Tensor, dtype));
+    static_assert(offsetof(TensorCreateInfo, ndims) == offsetof(Tensor, ndims));
+    static_assert(offsetof(TensorCreateInfo, is_all_offset_zero) == offsetof(Tensor, is_all_offset_zero));
+    SUCCEED();
+}
diff --git a/tests/ut/cpp/test_tensormap.cpp b/tests/ut/cpp/test_tensormap.cpp
new file mode 100644
index 000000000..8e5ea5da9
--- /dev/null
+++ b/tests/ut/cpp/test_tensormap.cpp
@@ -0,0 +1,270 @@
+/**
+ * Unit tests for PTO2TensorMap and check_overlap from pto_tensormap.h
+ */
+
+#include <gtest/gtest.h>
+
+#include <cstring>
+
+#include "pto_orchestration_api.h"
+#include "pto_tensormap.h"
+
+// =============================================================================
+// TensorMapEntry::check_overlap tests
+// =============================================================================
+
+class CheckOverlapTest : public ::testing::Test {
+protected:
+    alignas(64) PTO2TensorMapEntry entry;
+    char buf[256];
+
+    void SetUp() override {
+        memset(&entry, 0, sizeof(entry));
+        memset(buf, 0, sizeof(buf));
+    }
+
+    Tensor make_input(uint32_t shapes[], uint32_t ndims, int32_t version = 0) {
+        return make_tensor_external(buf, shapes, ndims, DataType::FLOAT32, false, version);
+    }
+
+    void setup_entry(uint32_t shapes[], uint32_t ndims, int32_t version = 0) {
+        entry.buffer_addr = reinterpret_cast<uint64_t>(buf);
+        entry.ndims = ndims;
+        entry.version = version;
+        entry.is_all_offset_zero = true;
+        for (uint32_t i = 0; i < ndims; i++) {
+            entry.shapes[i] = shapes[i];
+        }
+    }
+};
+
+TEST_F(CheckOverlapTest, IdenticalShapesZeroOffsets) {
+    uint32_t shapes[] = {10, 20};
+    setup_entry(shapes, 2);
+    auto input = make_input(shapes, 2);
+    EXPECT_EQ(entry.check_overlap(input), OverlapStatus::COVERED);
+}
+
+TEST_F(CheckOverlapTest, InputLargerThanOutput) {
+    uint32_t entry_shapes[] = {5, 10};
+    uint32_t input_shapes[] = {10, 20};
+    setup_entry(entry_shapes, 2);
+    auto input = make_input(input_shapes, 2);
+    EXPECT_EQ(entry.check_overlap(input), OverlapStatus::COVERED);
+}
+
+TEST_F(CheckOverlapTest, InputSmallerThanOutput) {
+    uint32_t entry_shapes[] = {10, 20};
+    uint32_t input_shapes[] = {5, 10};
+    setup_entry(entry_shapes, 2);
+    auto input = make_input(input_shapes, 2);
+    EXPECT_EQ(entry.check_overlap(input), OverlapStatus::OTHER);
+}
+
+TEST_F(CheckOverlapTest, VersionMismatch) {
+    // input.version > entry.version -> returns OTHER (not NO_OVERLAP)
+    // This means version bumps create dependencies (intentional)
+    uint32_t shapes[] = {10};
+    setup_entry(shapes, 1, /*version=*/0);
+    auto input = make_input(shapes, 1, /*version=*/1);
+    EXPECT_EQ(entry.check_overlap(input), OverlapStatus::OTHER);
+}
+
+TEST_F(CheckOverlapTest, DisjointOffsetsWithNonZeroEntry) {
+    // Entry covers [10,20), input covers [0,5) -> NO_OVERLAP
+    uint32_t entry_shapes[] = {10};
+    uint32_t input_shapes[] = {5};
+    setup_entry(entry_shapes, 1);
+    entry.is_all_offset_zero = false;
+    entry.offsets[0] = 10;
+
+    auto input = make_input(input_shapes, 1);
+    // Input is [0,5), entry is [10,20)
+    EXPECT_EQ(entry.check_overlap(input), OverlapStatus::NO_OVERLAP);
+}
+
+// =============================================================================
+// TensorMap lifecycle tests
+// =============================================================================
+
+class TensorMapTest : public ::testing::Test {
+protected:
+    PTO2TensorMap tmap;
+    int32_t window_sizes[PTO2_MAX_RING_DEPTH] = {16, 16, 16, 16};
+
+    void SetUp() override {
+        bool ok = tmap.init(64, 256, window_sizes);
+        ASSERT_TRUE(ok);
+        // Initialize last_task_alives to 0 for all rings
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+            tmap.last_task_alives[r] = 0;
+        }
+    }
+
+    void TearDown() override {
+        tmap.destroy();
+    }
+};
+
+TEST_F(TensorMapTest, InitSucceeds) {
+    EXPECT_EQ(tmap.num_buckets, 64);
+    EXPECT_EQ(tmap.pool_size, 256);
+}
+
+TEST_F(TensorMapTest, HashDistribution) {
+    // Aligned addresses should distribute across buckets
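+    // (0x1000-aligned pointers differ only above the low 12 bits, so the
+    // hash presumably mixes high bits rather than just masking low ones)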
+    uint64_t addr1 = 0x1000;
+    uint64_t addr2 = 0x2000;
+    uint64_t addr3 = 0x3000;
+    uint32_t h1 = tmap.hash(addr1);
+    uint32_t h2 = tmap.hash(addr2);
+    uint32_t h3 = tmap.hash(addr3);
+    // At least some should be different
+    EXPECT_TRUE(h1 != h2 || h2 != h3);
+    // All within bucket range
+    EXPECT_LT(h1, 64u);
+    EXPECT_LT(h2, 64u);
+    EXPECT_LT(h3, 64u);
+}
+
+TEST_F(TensorMapTest, InsertAndLookupExact) {
+    char buf[256];
+    uint32_t shapes[] = {10, 20};
+    auto tensor = make_tensor_external(buf, shapes, 2);
+    auto task_id = pto2_make_task_id(0, 5);
+
+    tmap.insert(tensor, task_id, false);
+
+    PTO2LookupResult result;
+    tmap.lookup(tensor, result);
+    EXPECT_GE(result.count, 1);
+}
+
+TEST_F(TensorMapTest, LookupNoMatch) {
+    char buf1[256], buf2[256];
+    uint32_t shapes[] = {10};
+    auto tensor1 = make_tensor_external(buf1, shapes, 1);
+    auto tensor2 = make_tensor_external(buf2, shapes, 1);
+    auto task_id = pto2_make_task_id(0, 0);
+
+    tmap.insert(tensor1, task_id, false);
+
+    PTO2LookupResult result;
+    tmap.lookup(tensor2, result);
+    EXPECT_EQ(result.count, 0);
+}
+
+TEST_F(TensorMapTest, LookupStaleEntrySkipped) {
+    char buf[256];
+    uint32_t shapes[] = {10};
+    auto tensor = make_tensor_external(buf, shapes, 1);
+    auto task_id = pto2_make_task_id(0, 0);
+
+    tmap.insert(tensor, task_id, false);
+
+    // Invalidate: advance last_task_alives past this task
+    tmap.sync_validity(0, 5);
+
+    PTO2LookupResult result;
+    tmap.lookup(tensor, result);
+    EXPECT_EQ(result.count, 0);
+}
+
+TEST_F(TensorMapTest, MultipleSameBucket) {
+    // Insert multiple entries for the same address
+    char buf[256];
+    uint32_t shapes[] = {10};
+    auto tensor = make_tensor_external(buf, shapes, 1);
+
+    tmap.insert(tensor, pto2_make_task_id(0, 0), false);
+    tmap.insert(tensor, pto2_make_task_id(0, 1), false);
+    tmap.insert(tensor, pto2_make_task_id(0, 2), false);
+
+    PTO2LookupResult result;
+    tmap.lookup(tensor, result);
+    EXPECT_EQ(result.count, 3);
+}
+
+TEST_F(TensorMapTest, CleanupRetired) {
+    char buf[256];
+    uint32_t shapes[] = {10};
+    auto tensor = make_tensor_external(buf, shapes, 1);
+
+    // Insert entries for tasks 0..4
+    for (int i = 0; i < 5; i++) {
+        tmap.insert(tensor, pto2_make_task_id(0, i), false);
+    }
+
+    // Retire tasks 0..3
+    tmap.cleanup_retired(0, 0, 4);
+    tmap.sync_validity(0, 4);
+
+    PTO2LookupResult result;
+    tmap.lookup(tensor, result);
+    EXPECT_EQ(result.count, 1);  // Only task 4 remains
+}
+
+TEST_F(TensorMapTest, NewEntryFreeListPriority) {
+    // Allocate, free, allocate again -> should reuse freed entry
+    PTO2TensorMapEntry* e1 = tmap.new_entry();
+    ASSERT_NE(e1, nullptr);
+    // Link entry so we can free it
+    e1->bucket_index = 0;
+    e1->prev_in_bucket = nullptr;
+    e1->next_in_bucket = nullptr;
+    e1->next_in_task = nullptr;
+    e1->prev_in_task = nullptr;
+    tmap.buckets[0] = e1;
+
+    tmap.free_entry(*e1);
+
+    PTO2TensorMapEntry* e2 = tmap.new_entry();
+    EXPECT_EQ(e1, e2);  // Reused from free list
+}
+
+TEST_F(TensorMapTest, EntryValidBoundary) {
+    alignas(64) PTO2TensorMapEntry entry;
+    memset(&entry, 0, sizeof(entry));
+
+    // local_id == last_task_alive -> valid (not yet retired)
+    entry.producer_task_id = pto2_make_task_id(0, 5);
+    tmap.last_task_alives[0] = 5;
+    EXPECT_TRUE(tmap.entry_valid(entry));
+
+    // local_id < last_task_alive -> stale
+    tmap.last_task_alives[0] = 6;
+    EXPECT_FALSE(tmap.entry_valid(entry));
+}
+
+TEST_F(TensorMapTest, MultiRingInterleaving) {
+    char buf[256];
+    uint32_t shapes[] = {10};
+    auto tensor = make_tensor_external(buf, shapes, 1);
+
+    // Insert entries from ring 0 and ring 1
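+    // (rings retire independently: invalidating ring 0 must not touch ring 1)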
+    tmap.insert(tensor, pto2_make_task_id(0, 0), false);
+    tmap.insert(tensor, pto2_make_task_id(1, 0), false);
+    tmap.insert(tensor, pto2_make_task_id(0, 1), false);
+
+    // Retire ring 0 tasks
+    tmap.cleanup_retired(0, 0, 2);
+    tmap.sync_validity(0, 2);
+
+    // Ring 1 entry should still be valid
+    PTO2LookupResult result;
+    tmap.lookup(tensor, result);
+    EXPECT_EQ(result.count, 1);
+    EXPECT_EQ(result.entries[0].entry->producer_task_id.ring(), 1);
+}
+
+// =============================================================================
+// Static assertions (compile-time checks)
+// =============================================================================
+
+TEST(TensorMapLayout, EntrySizeIs128) {
+    EXPECT_EQ(sizeof(PTO2TensorMapEntry), 128u);
+}
+
+TEST(TensorMapLayout, CacheLine2StartsAt64) {
+    EXPECT_EQ(offsetof(PTO2TensorMapEntry, prev_in_bucket), 64u);
+}
diff --git a/tests/ut/test_elf_parser.py b/tests/ut/test_elf_parser.py
new file mode 100644
index 000000000..25dffa407
--- /dev/null
+++ b/tests/ut/test_elf_parser.py
@@ -0,0 +1,207 @@
+"""Tests for python/elf_parser.py - ELF64 and Mach-O .text extraction."""
+
+import struct
+import sys
+import tempfile
+from pathlib import Path
+
+import pytest
+
+_python_dir = str(Path(__file__).resolve().parent.parent.parent / "python")
+if _python_dir not in sys.path:
+    sys.path.insert(0, _python_dir)
+
+from elf_parser import _extract_cstring, extract_text_section
+
+
+def _build_elf64_with_text(text_data: bytes) -> bytes:
+    """Build a minimal ELF64 .o file with a .text section."""
+    # String table: \0.text\0.shstrtab\0
+    strtab = b"\x00.text\x00.shstrtab\x00"
+    text_name_offset = 1  # offset of ".text" in strtab
+    shstrtab_name_offset = 7  # offset of ".shstrtab" in strtab
+
+    # ELF header (64 bytes)
+    e_shoff = 64  # section headers right after ELF header
+    e_shnum = 3  # null + .text + .shstrtab
+    e_shstrndx = 2  # .shstrtab is section 2
+
+    elf_header = bytearray(64)
+    elf_header[0:4] = b"\x7fELF"
+    elf_header[4] = 2  # 64-bit
+    elf_header[5] = 1  # little-endian
+    elf_header[6] = 1  # version
+    # Remaining header fields from offset 16 (reconstructed; e_type=ET_REL,
+    # machine/flags left generic): type, machine, version, entry, phoff,
+    # shoff, flags, ehsize, phentsize, phnum, shentsize, shnum, shstrndx
+    struct.pack_into("<HHIQQQIHHHHHH", elf_header, 16,
+                     1, 0, 1, 0, 0, e_shoff, 0,
+                     64, 0, 0, 64, e_shnum, e_shstrndx)
+
+    # Section headers (3 x 64 bytes), then .text payload, then .shstrtab
+    text_offset = e_shoff + e_shnum * 64
+    strtab_offset = text_offset + len(text_data)
+
+    def shdr(name_off, sh_type, offset, size):
+        # sh_name, sh_type, sh_flags, sh_addr, sh_offset, sh_size,
+        # sh_link, sh_info, sh_addralign, sh_entsize
+        return struct.pack("<IIQQQQIIQQ", name_off, sh_type, 0, 0,
+                           offset, size, 0, 0, 1, 0)
+
+    sections = (
+        shdr(0, 0, 0, 0)  # SHT_NULL
+        + shdr(text_name_offset, 1, text_offset, len(text_data))  # SHT_PROGBITS .text
+        + shdr(shstrtab_name_offset, 3, strtab_offset, len(strtab))  # SHT_STRTAB
+    )
+    return bytes(elf_header) + sections + text_data + strtab
+
+
+def _build_macho_with_text(text_data: bytes) -> bytes:
+    """Build a minimal Mach-O 64-bit .o file with __text section."""
+    # Header (32 bytes)
+    header = bytearray(32)
+    struct.pack_into("