diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e6f73e193..e8e9e3c0c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -22,6 +22,26 @@ jobs: with: extra_args: --from-ref ${{ github.event.pull_request.base.sha }} --to-ref ${{ github.event.pull_request.head.sha }} + ut-cpp: + needs: pre-commit + runs-on: ubuntu-latest + timeout-minutes: 10 + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Install dependencies + run: sudo apt-get update && sudo apt-get install -y cmake ninja-build + + - name: Configure C++ tests + run: cmake -B tests/ut/cpp/build -S tests/ut/cpp -G Ninja + + - name: Build C++ tests + run: cmake --build tests/ut/cpp/build + + - name: Run C++ tests + run: ctest --test-dir tests/ut/cpp/build --output-on-failure -L no_hardware + ut-py: needs: pre-commit runs-on: ubuntu-latest diff --git a/tests/ut/cpp/CMakeLists.txt b/tests/ut/cpp/CMakeLists.txt new file mode 100644 index 000000000..fe6987bff --- /dev/null +++ b/tests/ut/cpp/CMakeLists.txt @@ -0,0 +1,120 @@ +cmake_minimum_required(VERSION 3.15) +project(pto_runtime_tests CXX) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +# Disable profiling to avoid device_time.h dependency in hot paths +add_compile_definitions( + PTO2_PROFILING=0 + PTO2_ORCH_PROFILING=0 + PTO2_SCHED_PROFILING=0 + PTO2_TENSORMAP_PROFILING=0 + PTO2_SPIN_VERBOSE_LOGGING=0 + _GLIBCXX_USE_CXX11_ABI=0 +) + +# GoogleTest: prefer system installation, fallback to FetchContent +find_package(GTest QUIET) +if(NOT GTest_FOUND) + include(FetchContent) + FetchContent_Declare(googletest + GIT_REPOSITORY https://github.com/google/googletest.git + GIT_TAG v1.14.0 + ) + set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) + FetchContent_MakeAvailable(googletest) +endif() + +enable_testing() + +# Source directories (use a2a3 as the reference arch for UT) +set(PROJECT_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..) 
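+
+# PROJECT_ROOT resolves to the repository root (this file lives three levels
+# down, in tests/ut/cpp). To reproduce the CI job locally, configure, build,
+# and run exactly as the workflow above does:
+#   cmake -B tests/ut/cpp/build -S tests/ut/cpp -G Ninja
+#   cmake --build tests/ut/cpp/build
+#   ctest --test-dir tests/ut/cpp/build --output-on-failure -L no_hardware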
+set(TMR_RUNTIME_DIR ${PROJECT_ROOT}/src/a2a3/runtime/tensormap_and_ringbuffer/runtime) +set(TMR_ORCH_DIR ${PROJECT_ROOT}/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration) +set(TMR_COMMON_DIR ${PROJECT_ROOT}/src/a2a3/runtime/tensormap_and_ringbuffer/common) +set(PLATFORM_INCLUDE_DIR ${PROJECT_ROOT}/src/a2a3/platform/include) +set(COMMON_TASK_DIR ${PROJECT_ROOT}/src/common/task_interface) + +set(COMMON_INCLUDE_DIRS + ${TMR_RUNTIME_DIR} + ${TMR_ORCH_DIR} + ${TMR_COMMON_DIR} + ${PLATFORM_INCLUDE_DIR} + ${COMMON_TASK_DIR} +) + +# Determine the GTest link target name +if(TARGET GTest::gtest_main) + set(GTEST_TARGET GTest::gtest_main) +else() + set(GTEST_TARGET gtest_main) +endif() + +# Stub sources +set(STUB_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/stubs/test_stubs.cpp) + +# Helper: add a test target (only if source file exists) +function(add_gtest_target name) + cmake_parse_arguments(ARG "" "" "SOURCES;EXTRA_SOURCES" ${ARGN}) + # Check all source files exist + foreach(src ${ARG_SOURCES}) + if(NOT IS_ABSOLUTE "${src}") + set(src "${CMAKE_CURRENT_SOURCE_DIR}/${src}") + endif() + if(NOT EXISTS "${src}") + message(STATUS "Skipping ${name}: ${src} not found") + return() + endif() + endforeach() + add_executable(${name} ${ARG_SOURCES} ${STUB_SOURCES} ${ARG_EXTRA_SOURCES}) + target_include_directories(${name} PRIVATE ${COMMON_INCLUDE_DIRS}) + target_link_libraries(${name} ${GTEST_TARGET}) + add_test(NAME ${name} COMMAND ${name}) + set_tests_properties(${name} PROPERTIES LABELS "no_hardware") +endfunction() + +# --- Header-only tests (no runtime .cpp sources needed) --- + +add_gtest_target(test_submit_types SOURCES test_submit_types.cpp) +add_gtest_target(test_core_types SOURCES test_core_types.cpp) +add_gtest_target(test_tensor SOURCES test_tensor.cpp) + +# --- Tests requiring runtime .cpp sources --- + +add_gtest_target(test_shared_memory + SOURCES test_shared_memory.cpp + EXTRA_SOURCES ${TMR_RUNTIME_DIR}/pto_shared_memory.cpp +) + +add_gtest_target(test_ring_buffer + SOURCES test_ring_buffer.cpp + EXTRA_SOURCES + ${TMR_RUNTIME_DIR}/pto_shared_memory.cpp + ${TMR_RUNTIME_DIR}/pto_scheduler.cpp + ${TMR_RUNTIME_DIR}/pto_ring_buffer.cpp +) + +add_gtest_target(test_tensormap + SOURCES test_tensormap.cpp + EXTRA_SOURCES + ${TMR_RUNTIME_DIR}/pto_tensormap.cpp +) + +add_gtest_target(test_ready_queue + SOURCES test_ready_queue.cpp + EXTRA_SOURCES + ${TMR_RUNTIME_DIR}/pto_scheduler.cpp + ${TMR_RUNTIME_DIR}/pto_shared_memory.cpp +) + +add_gtest_target(test_scheduler_state + SOURCES test_scheduler_state.cpp + EXTRA_SOURCES + ${TMR_RUNTIME_DIR}/pto_scheduler.cpp + ${TMR_RUNTIME_DIR}/pto_shared_memory.cpp +) + +add_gtest_target(test_pto_types SOURCES test_pto_types.cpp) + +add_gtest_target(test_dispatch_payload SOURCES test_dispatch_payload.cpp) diff --git a/tests/ut/cpp/stubs/test_stubs.cpp b/tests/ut/cpp/stubs/test_stubs.cpp new file mode 100644 index 000000000..26ff45a18 --- /dev/null +++ b/tests/ut/cpp/stubs/test_stubs.cpp @@ -0,0 +1,99 @@ +/** + * Link-time stubs for platform APIs used by runtime headers. + * + * Provides x86-compatible implementations of functions declared in + * platform headers (unified_log.h, device_time.h, common.h) so that + * runtime data structures can be unit-tested on CI runners without + * Ascend hardware or SDK. 
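+ *
+ * Because the assert_impl stub below throws AssertionError instead of
+ * aborting, tests can exercise failure paths directly, e.g.:
+ *   EXPECT_THROW(out.get_ref(0), AssertionError);
+ * (as in test_pto_types.cpp; throwing is this stub's choice and need not
+ * match the on-device behavior).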
+ */
+
+#include <chrono>
+#include <cstdarg>
+#include <cstdint>
+#include <cstdio>
+#include <stdexcept>
+#include <string>
+
+// =============================================================================
+// unified_log.h stubs (5 log-level functions)
+// =============================================================================
+
+extern "C" {
+
+void unified_log_error(const char* func, const char* fmt, ...) {
+    va_list args;
+    va_start(args, fmt);
+    fprintf(stderr, "[ERROR] %s: ", func);
+    vfprintf(stderr, fmt, args);
+    fprintf(stderr, "\n");
+    va_end(args);
+}
+
+void unified_log_warn(const char* func, const char* fmt, ...) {
+    va_list args;
+    va_start(args, fmt);
+    fprintf(stderr, "[WARN] %s: ", func);
+    vfprintf(stderr, fmt, args);
+    fprintf(stderr, "\n");
+    va_end(args);
+}
+
+void unified_log_info(const char* /* func */, const char* /* fmt */, ...) {
+    // Suppress info in tests
+}
+
+void unified_log_debug(const char* /* func */, const char* /* fmt */, ...) {
+    // Suppress debug in tests
+}
+
+void unified_log_always(const char* func, const char* fmt, ...) {
+    va_list args;
+    va_start(args, fmt);
+    fprintf(stderr, "[ALWAYS] %s: ", func);
+    vfprintf(stderr, fmt, args);
+    fprintf(stderr, "\n");
+    va_end(args);
+}
+
+}  // extern "C"
+
+// =============================================================================
+// device_time.h stub
+// =============================================================================
+
+uint64_t get_sys_cnt_aicpu() {
+    auto now = std::chrono::steady_clock::now();
+    return static_cast<uint64_t>(
+        std::chrono::duration_cast<std::chrono::nanoseconds>(now.time_since_epoch()).count());
+}
+
+// =============================================================================
+// common.h stubs (assert_impl, get_stacktrace, AssertionError)
+// =============================================================================
+
+std::string get_stacktrace(int /* skip_frames */) {
+    return "";
+}
+
+class AssertionError : public std::runtime_error {
+public:
+    AssertionError(const char* condition, const char* file, int line)
+        : std::runtime_error(std::string("Assertion failed: ") + condition + " at " + file + ":" +
+                             std::to_string(line)),
+          condition_(condition),
+          file_(file),
+          line_(line) {}
+
+    const char* condition() const { return condition_; }
+    const char* file() const { return file_; }
+    int line() const { return line_; }
+
+private:
+    const char* condition_;
+    const char* file_;
+    int line_;
+};
+
+[[noreturn]] void assert_impl(const char* condition, const char* file, int line) {
+    throw AssertionError(condition, file, line);
+}
diff --git a/tests/ut/cpp/test_core_types.cpp b/tests/ut/cpp/test_core_types.cpp
new file mode 100644
index 000000000..ee3cb247d
--- /dev/null
+++ b/tests/ut/cpp/test_core_types.cpp
@@ -0,0 +1,141 @@
+/**
+ * Unit tests for core types in pto_runtime2_types.h
+ *
+ * Tests PTO2TaskId encoding, alignment assertions, and utility macros.
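+ *
+ * Layout assumed by these tests (inferred from RingInUpperBits and
+ * MaxRingMaxLocal below, not from the header itself): the low 32 bits of
+ * .raw hold the local id and the bits above hold the ring index, i.e.
+ * raw = (uint64_t(ring) << 32) | local.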
+ */
+
+#include <gtest/gtest.h>
+
+#include "pto_runtime2_types.h"
+
+// =============================================================================
+// PTO2TaskId encoding/extraction
+// =============================================================================
+
+TEST(TaskId, DefaultIsZero) {
+    PTO2TaskId id;
+    EXPECT_EQ(id.raw, 0u);
+    EXPECT_EQ(id.ring(), 0);
+    EXPECT_EQ(id.local(), 0u);
+}
+
+TEST(TaskId, MakeAndExtract) {
+    auto id = pto2_make_task_id(2, 42);
+    EXPECT_EQ(id.ring(), 2);
+    EXPECT_EQ(id.local(), 42u);
+}
+
+TEST(TaskId, RingInUpperBits) {
+    auto id = pto2_make_task_id(3, 0);
+    EXPECT_EQ(id.raw, static_cast<uint64_t>(3) << 32);
+    EXPECT_EQ(id.ring(), 3);
+    EXPECT_EQ(id.local(), 0u);
+}
+
+TEST(TaskId, MaxRingMaxLocal) {
+    auto id = pto2_make_task_id(255, 0xFFFFFFFF);
+    EXPECT_EQ(id.ring(), 255);
+    EXPECT_EQ(id.local(), 0xFFFFFFFF);
+}
+
+TEST(TaskId, Roundtrip) {
+    for (uint8_t ring = 0; ring < PTO2_MAX_RING_DEPTH; ring++) {
+        for (uint32_t local : {0u, 1u, 100u, 0xFFFFu, 0xFFFFFFFFu}) {
+            auto id = pto2_make_task_id(ring, local);
+            EXPECT_EQ(id.ring(), ring);
+            EXPECT_EQ(id.local(), local);
+        }
+    }
+}
+
+TEST(TaskId, Equality) {
+    auto a = pto2_make_task_id(1, 42);
+    auto b = pto2_make_task_id(1, 42);
+    auto c = pto2_make_task_id(1, 43);
+    auto d = pto2_make_task_id(2, 42);
+
+    EXPECT_TRUE(a == b);
+    EXPECT_FALSE(a != b);
+    EXPECT_TRUE(a != c);
+    EXPECT_TRUE(a != d);
+}
+
+TEST(TaskId, SizeIs8Bytes) {
+    EXPECT_EQ(sizeof(PTO2TaskId), 8u);
+}
+
+// =============================================================================
+// PTO2TaskSlotState size (cache-line aligned)
+// =============================================================================
+
+TEST(TaskSlotState, SizeIs64Bytes) {
+    EXPECT_EQ(sizeof(PTO2TaskSlotState), 64u);
+}
+
+// =============================================================================
+// PTO2_ALIGN_UP macro
+// =============================================================================
+
+TEST(AlignUp, Zero) {
+    EXPECT_EQ(PTO2_ALIGN_UP(0, 64), 0u);
+}
+
+TEST(AlignUp, AlreadyAligned) {
+    EXPECT_EQ(PTO2_ALIGN_UP(64, 64), 64u);
+    EXPECT_EQ(PTO2_ALIGN_UP(128, 64), 128u);
+}
+
+TEST(AlignUp, NotAligned) {
+    EXPECT_EQ(PTO2_ALIGN_UP(1, 64), 64u);
+    EXPECT_EQ(PTO2_ALIGN_UP(63, 64), 64u);
+    EXPECT_EQ(PTO2_ALIGN_UP(65, 64), 128u);
+}
+
+TEST(AlignUp, SmallAlignment) {
+    EXPECT_EQ(PTO2_ALIGN_UP(5, 4), 8u);
+    EXPECT_EQ(PTO2_ALIGN_UP(4, 4), 4u);
+    EXPECT_EQ(PTO2_ALIGN_UP(3, 4), 4u);
+}
+
+// =============================================================================
+// Task state enum values
+// =============================================================================
+
+TEST(TaskState, EnumValues) {
+    EXPECT_EQ(PTO2_TASK_PENDING, 0);
+    EXPECT_EQ(PTO2_TASK_READY, 1);
+    EXPECT_EQ(PTO2_TASK_RUNNING, 2);
+    EXPECT_EQ(PTO2_TASK_COMPLETED, 3);
+    EXPECT_EQ(PTO2_TASK_CONSUMED, 4);
+}
+
+// =============================================================================
+// Error code constants
+// =============================================================================
+
+TEST(ErrorCodes, Values) {
+    EXPECT_EQ(PTO2_ERROR_NONE, 0);
+    EXPECT_EQ(PTO2_ERROR_SCOPE_DEADLOCK, 1);
+    EXPECT_EQ(PTO2_ERROR_HEAP_RING_DEADLOCK, 2);
+    EXPECT_EQ(PTO2_ERROR_FLOW_CONTROL_DEADLOCK, 3);
+    EXPECT_EQ(PTO2_ERROR_DEP_POOL_OVERFLOW, 4);
+    EXPECT_EQ(PTO2_ERROR_INVALID_ARGS, 5);
+    EXPECT_EQ(PTO2_ERROR_SCHEDULER_TIMEOUT, 100);
+}
+
+// =============================================================================
+// Configuration constants
+// =============================================================================
+
+TEST(Config, TaskWindowSizeIsPowerOf2) {
+    EXPECT_GT(PTO2_TASK_WINDOW_SIZE, 0);
+    EXPECT_EQ(PTO2_TASK_WINDOW_SIZE & (PTO2_TASK_WINDOW_SIZE - 1), 0);
+}
+
+TEST(Config, MaxRingDepth) {
+    EXPECT_EQ(PTO2_MAX_RING_DEPTH, 4);
+}
+
+TEST(Config, AlignSize) {
+    EXPECT_EQ(PTO2_ALIGN_SIZE, 64);
+}
diff --git a/tests/ut/cpp/test_dispatch_payload.cpp b/tests/ut/cpp/test_dispatch_payload.cpp
new file mode 100644
index 000000000..2a402d485
--- /dev/null
+++ b/tests/ut/cpp/test_dispatch_payload.cpp
@@ -0,0 +1,144 @@
+/**
+ * Unit tests for PTO2DispatchPayload and SPMD context structures.
+ *
+ * Tests layout constants, alignment, static_assert consistency, and the
+ * get_block_idx / get_block_num / get_sub_block_id intrinsic accessors.
+ */
+
+#include <cstdint>
+
+#include <gtest/gtest.h>
+
+#include "intrinsic.h"
+#include "pto2_dispatch_payload.h"
+#include "pto_types.h"
+
+// =============================================================================
+// Compile-time constant consistency
+// =============================================================================
+
+TEST(DispatchPayloadConstants, LocalContextIndex) {
+    // SPMD_LOCAL_CONTEXT_INDEX must equal MAX_TENSOR_ARGS + MAX_SCALAR_ARGS
+    EXPECT_EQ(SPMD_LOCAL_CONTEXT_INDEX, MAX_TENSOR_ARGS + MAX_SCALAR_ARGS);
+}
+
+TEST(DispatchPayloadConstants, GlobalContextIndex) {
+    EXPECT_EQ(SPMD_GLOBAL_CONTEXT_INDEX, SPMD_LOCAL_CONTEXT_INDEX + 1);
+}
+
+TEST(DispatchPayloadConstants, ExtParamsCount) {
+    EXPECT_EQ(PTO2_EXT_PARAMS_COUNT, 2);
+}
+
+TEST(DispatchPayloadConstants, DispatchMaxArgs) {
+    EXPECT_EQ(PTO2_DISPATCH_MAX_ARGS,
+              MAX_TENSOR_ARGS + MAX_SCALAR_ARGS + PTO2_EXT_PARAMS_COUNT);
+}
+
+// =============================================================================
+// PTO2DispatchPayload layout and alignment
+// =============================================================================
+
+TEST(DispatchPayloadLayout, IsAlignedTo64Bytes) {
+    EXPECT_EQ(alignof(PTO2DispatchPayload), 64u);
+}
+
+TEST(DispatchPayloadLayout, ArgsArrayHasCorrectSize) {
+    PTO2DispatchPayload p{};
+    EXPECT_EQ(sizeof(p.args) / sizeof(p.args[0]),
+              static_cast<size_t>(PTO2_DISPATCH_MAX_ARGS));
+}
+
+TEST(DispatchPayloadLayout, ArgElementIs8Bytes) {
+    PTO2DispatchPayload p{};
+    EXPECT_EQ(sizeof(p.args[0]), 8u);
+}
+
+// =============================================================================
+// LocalContext
+// =============================================================================
+
+TEST(LocalContext, FieldsReadWrite) {
+    LocalContext lctx{3, 8};
+    EXPECT_EQ(lctx.block_idx, 3);
+    EXPECT_EQ(lctx.block_num, 8);
+}
+
+TEST(LocalContext, DefaultZero) {
+    LocalContext lctx{};
+    EXPECT_EQ(lctx.block_idx, 0);
+    EXPECT_EQ(lctx.block_num, 0);
+}
+
+// =============================================================================
+// GlobalContext
+// =============================================================================
+
+TEST(GlobalContext, FieldReadWrite) {
+    GlobalContext gctx{1};
+    EXPECT_EQ(gctx.sub_block_id, 1);
+}
+
+// =============================================================================
+// Intrinsic accessor functions
+// =============================================================================
+
+// Build a minimal args[] array with context pointers at the correct indices.
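+// Layout assumed here (it mirrors the constants checked above): slots
+// [0, MAX_TENSOR_ARGS) carry tensor pointers, the next MAX_SCALAR_ARGS slots
+// carry scalars, then args[SPMD_LOCAL_CONTEXT_INDEX] holds a LocalContext*
+// and args[SPMD_GLOBAL_CONTEXT_INDEX] a GlobalContext*, each stored as a
+// uint64_t.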
+struct IntrinsicTestSetup {
+    static constexpr int kArgsLen = SPMD_GLOBAL_CONTEXT_INDEX + 1;
+    LocalContext lctx;
+    GlobalContext gctx;
+    uint64_t args[kArgsLen];
+
+    IntrinsicTestSetup(int block_idx, int block_num, int sub_block_id)
+        : lctx{block_idx, block_num}, gctx{sub_block_id} {
+        for (auto& a : args) a = 0;
+        args[SPMD_LOCAL_CONTEXT_INDEX] = reinterpret_cast<uint64_t>(&lctx);
+        args[SPMD_GLOBAL_CONTEXT_INDEX] = reinterpret_cast<uint64_t>(&gctx);
+    }
+
+    int64_t* raw() { return reinterpret_cast<int64_t*>(args); }
+};
+
+TEST(IntrinsicAccessors, GetBlockIdx) {
+    IntrinsicTestSetup s(5, 10, 0);
+    EXPECT_EQ(get_block_idx(s.raw()), 5);
+}
+
+TEST(IntrinsicAccessors, GetBlockNum) {
+    IntrinsicTestSetup s(0, 7, 0);
+    EXPECT_EQ(get_block_num(s.raw()), 7);
+}
+
+TEST(IntrinsicAccessors, GetSubBlockId_AIV0) {
+    IntrinsicTestSetup s(0, 1, 0);
+    EXPECT_EQ(get_sub_block_id(s.raw()), 0);
+}
+
+TEST(IntrinsicAccessors, GetSubBlockId_AIV1) {
+    IntrinsicTestSetup s(0, 1, 1);
+    EXPECT_EQ(get_sub_block_id(s.raw()), 1);
+}
+
+TEST(IntrinsicAccessors, BlockIdxAndNumIndependent) {
+    // Changing block_idx must not affect block_num and vice versa
+    IntrinsicTestSetup s(2, 4, 0);
+    EXPECT_EQ(get_block_idx(s.raw()), 2);
+    EXPECT_EQ(get_block_num(s.raw()), 4);
+
+    s.lctx.block_idx = 3;
+    EXPECT_EQ(get_block_idx(s.raw()), 3);
+    EXPECT_EQ(get_block_num(s.raw()), 4);
+}
+
+TEST(IntrinsicAccessors, ContextPointersAreAtCorrectSlots) {
+    IntrinsicTestSetup s(1, 2, 0);
+    // The value at SPMD_LOCAL_CONTEXT_INDEX must point to lctx
+    auto lctx_ptr = reinterpret_cast<LocalContext*>(
+        static_cast<uintptr_t>(s.args[SPMD_LOCAL_CONTEXT_INDEX]));
+    EXPECT_EQ(lctx_ptr, &s.lctx);
+
+    auto gctx_ptr = reinterpret_cast<GlobalContext*>(
+        static_cast<uintptr_t>(s.args[SPMD_GLOBAL_CONTEXT_INDEX]));
+    EXPECT_EQ(gctx_ptr, &s.gctx);
+}
diff --git a/tests/ut/cpp/test_pto_types.cpp b/tests/ut/cpp/test_pto_types.cpp
new file mode 100644
index 000000000..c053c1b1e
--- /dev/null
+++ b/tests/ut/cpp/test_pto_types.cpp
@@ -0,0 +1,397 @@
+/**
+ * Unit tests for Arg and TaskOutputTensors from pto_types.h.
+ *
+ * Tests argument ordering enforcement, tensor/scalar storage,
+ * error propagation, add_scalars_i32 zero-extension, copy_scalars_from,
+ * and TaskOutputTensors materialization.
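+ *
+ * Calling convention exercised below (sketch, using only the Arg methods
+ * under test): all tensor arguments (add_input / add_output / add_inout)
+ * must be recorded before the first scalar; violating that order sets
+ * has_error instead of aborting.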
+ */
+
+#include <cstdint>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include "common.h"
+#include "pto_orchestration_api.h"
+#include "pto_types.h"
+
+// =============================================================================
+// Helpers
+// =============================================================================
+
+static Tensor make_test_tensor(void* buf) {
+    uint32_t shapes[2] = {4, 8};
+    return make_tensor_external(buf, shapes, 2, DataType::FLOAT32);
+}
+
+// =============================================================================
+// TaskOutputTensors
+// =============================================================================
+
+TEST(TaskOutputTensors, InitialState) {
+    TaskOutputTensors out;
+    EXPECT_TRUE(out.empty());
+    EXPECT_EQ(out.size(), 0u);
+}
+
+TEST(TaskOutputTensors, MaterializeAddsOne) {
+    float buf[4] = {};
+    Tensor t = make_test_tensor(buf);
+
+    TaskOutputTensors out;
+    out.materialize_output(t);
+
+    EXPECT_FALSE(out.empty());
+    EXPECT_EQ(out.size(), 1u);
+}
+
+TEST(TaskOutputTensors, GetRefReturnsCorrectTensor) {
+    float buf0[4] = {};
+    float buf1[4] = {};
+    Tensor t0 = make_test_tensor(buf0);
+    Tensor t1 = make_test_tensor(buf1);
+
+    TaskOutputTensors out;
+    out.materialize_output(t0);
+    out.materialize_output(t1);
+
+    EXPECT_EQ(&out.get_ref(0), &t0);
+    EXPECT_EQ(&out.get_ref(1), &t1);
+    EXPECT_EQ(out.size(), 2u);
+}
+
+TEST(TaskOutputTensors, GetRefOutOfRangeThrows) {
+    TaskOutputTensors out;
+    EXPECT_THROW(out.get_ref(0), AssertionError);
+}
+
+TEST(TaskOutputTensors, MaxOutputsFill) {
+    float bufs[PTO2_MAX_OUTPUTS] = {};
+    std::vector<Tensor> tensors;
+    tensors.reserve(PTO2_MAX_OUTPUTS);
+
+    TaskOutputTensors out;
+    for (int i = 0; i < PTO2_MAX_OUTPUTS; i++) {
+        tensors.push_back(make_test_tensor(&bufs[i]));
+        out.materialize_output(tensors.back());
+    }
+    EXPECT_EQ(out.size(), static_cast<size_t>(PTO2_MAX_OUTPUTS));
+}
+
+// =============================================================================
+// Arg — initial state
+// =============================================================================
+
+TEST(Arg, DefaultState) {
+    Arg a;
+    EXPECT_FALSE(a.has_error);
+    EXPECT_EQ(a.error_msg, nullptr);
+    EXPECT_EQ(a.tensor_count(), 0);
+    EXPECT_EQ(a.scalar_count(), 0);
+}
+
+// =============================================================================
+// Arg — add_input / add_output / add_inout
+// =============================================================================
+
+TEST(Arg, AddInput) {
+    float buf[4] = {};
+    Tensor t = make_test_tensor(buf);
+    Arg a;
+    a.add_input(t);
+    EXPECT_EQ(a.tensor_count(), 1);
+    EXPECT_EQ(a.tag(0), TensorArgType::INPUT);
+    EXPECT_EQ(a.tensor(0).ptr, &t);
+    EXPECT_FALSE(a.has_error);
+}
+
+TEST(Arg, AddOutput) {
+    uint32_t shapes[2] = {4, 8};
+    TensorCreateInfo ci(shapes, 2, DataType::FLOAT32);
+    Arg a;
+    a.add_output(ci);
+    EXPECT_EQ(a.tensor_count(), 1);
+    EXPECT_EQ(a.tag(0), TensorArgType::OUTPUT);
+    EXPECT_EQ(a.tensor(0).create_info, &ci);
+    EXPECT_FALSE(a.has_error);
+}
+
+TEST(Arg, AddInout) {
+    float buf[4] = {};
+    Tensor t = make_test_tensor(buf);
+    Arg a;
+    a.add_inout(t);
+    EXPECT_EQ(a.tensor_count(), 1);
+    EXPECT_EQ(a.tag(0), TensorArgType::INOUT);
+    EXPECT_EQ(a.tensor(0).ptr, &t);
+    EXPECT_FALSE(a.has_error);
+}
+
+TEST(Arg, MixedInputOutputInout) {
+    float buf_in[4] = {}, buf_inout[4] = {};
+    Tensor tin = make_test_tensor(buf_in);
+    Tensor tinout = make_test_tensor(buf_inout);
+    uint32_t shapes_in[2] = {4, 8};
+    TensorCreateInfo ci(shapes_in, 1, DataType::FLOAT32);
+
+    Arg a;
+    a.add_input(tin);
+    a.add_output(ci);
+    a.add_inout(tinout);
+
+    EXPECT_EQ(a.tensor_count(), 3);
+    EXPECT_EQ(a.tag(0), TensorArgType::INPUT);
+    EXPECT_EQ(a.tag(1), TensorArgType::OUTPUT);
+    EXPECT_EQ(a.tag(2), TensorArgType::INOUT);
+    EXPECT_FALSE(a.has_error);
+}
+
+// =============================================================================
+// Arg — ordering enforcement: tensor after scalar sets error
+// =============================================================================
+
+TEST(Arg, TensorAfterScalarSetsError) {
+    float buf[4] = {};
+    Tensor t = make_test_tensor(buf);
+    Arg a;
+    a.add_scalar(uint64_t(42));
+    a.add_input(t);  // invalid: tensor after scalar
+    EXPECT_TRUE(a.has_error);
+    EXPECT_NE(a.error_msg, nullptr);
+    // The scalar was recorded, the tensor was not
+    EXPECT_EQ(a.tensor_count(), 0);
+    EXPECT_EQ(a.scalar_count(), 1);
+}
+
+TEST(Arg, OutputAfterScalarSetsError) {
+    uint32_t shapes_in[2] = {4, 8};
+    TensorCreateInfo ci(shapes_in, 1, DataType::FLOAT32);
+    Arg a;
+    a.add_scalar(uint64_t(1));
+    a.add_output(ci);
+    EXPECT_TRUE(a.has_error);
+    EXPECT_EQ(a.tensor_count(), 0);
+}
+
+TEST(Arg, InoutAfterScalarSetsError) {
+    float buf[4] = {};
+    Tensor t = make_test_tensor(buf);
+    Arg a;
+    a.add_scalar(uint64_t(1));
+    a.add_inout(t);
+    EXPECT_TRUE(a.has_error);
+    EXPECT_EQ(a.tensor_count(), 0);
+}
+
+// =============================================================================
+// Arg — capacity limits
+// =============================================================================
+
+TEST(Arg, TensorCapacityExceeded) {
+    Arg a;
+    for (int i = 0; i < MAX_TENSOR_ARGS; i++) {
+        float dummy = 0.0f;
+        Tensor t = make_test_tensor(&dummy);
+        a.add_input(t);
+        ASSERT_FALSE(a.has_error) << "Failed at tensor " << i;
+    }
+    // One more should trigger the error
+    float extra = 0.0f;
+    Tensor t_extra = make_test_tensor(&extra);
+    a.add_input(t_extra);
+    EXPECT_TRUE(a.has_error);
+    EXPECT_EQ(a.tensor_count(), MAX_TENSOR_ARGS);
+}
+
+TEST(Arg, ScalarCapacityExceeded) {
+    Arg a;
+    for (int i = 0; i < MAX_SCALAR_ARGS; i++) {
+        a.add_scalar(static_cast<uint64_t>(i));
+        ASSERT_FALSE(a.has_error) << "Failed at scalar " << i;
+    }
+    a.add_scalar(uint64_t(999));
+    EXPECT_TRUE(a.has_error);
+    EXPECT_EQ(a.scalar_count(), MAX_SCALAR_ARGS);
+}
+
+// =============================================================================
+// Arg — add_scalar with various types
+// =============================================================================
+
+TEST(Arg, AddScalarUint64) {
+    Arg a;
+    a.add_scalar(uint64_t(0xDEADBEEFCAFEBABEULL));
+    EXPECT_EQ(a.scalar_count(), 1);
+    EXPECT_EQ(a.scalar(0), 0xDEADBEEFCAFEBABEULL);
+    EXPECT_FALSE(a.has_error);
+}
+
+TEST(Arg, AddScalarFloat) {
+    Arg a;
+    float v = 3.14f;
+    a.add_scalar(v);
+    EXPECT_EQ(a.scalar_count(), 1);
+    EXPECT_EQ(a.scalar(0), to_u64(v));
+    EXPECT_FALSE(a.has_error);
+}
+
+TEST(Arg, AddScalarInt32) {
+    Arg a;
+    int32_t v = -7;
+    a.add_scalar(v);
+    EXPECT_EQ(a.scalar_count(), 1);
+    EXPECT_EQ(a.scalar(0), to_u64(v));
+    EXPECT_FALSE(a.has_error);
+}
+
+// =============================================================================
+// Arg — add_scalars (batch uint64)
+// =============================================================================
+
+TEST(Arg, AddScalarsBatch) {
+    Arg a;
+    uint64_t vals[3] = {10, 20, 30};
+    a.add_scalars(vals, 3);
+    EXPECT_EQ(a.scalar_count(), 3);
+    EXPECT_EQ(a.scalar(0), 10u);
+    EXPECT_EQ(a.scalar(1), 20u);
+    EXPECT_EQ(a.scalar(2), 30u);
+    EXPECT_FALSE(a.has_error);
+}
+
+TEST(Arg, AddScalarsBatchOverCapacitySetsError) {
+    Arg a;
+    // Fill to capacity minus 1
+    for (int i = 0; i < MAX_SCALAR_ARGS - 1; i++) {
+        a.add_scalar(uint64_t(i));
+    }
+    // Batch of 3 would overflow by 2
+    uint64_t vals[3] = {1, 2, 3};
+    a.add_scalars(vals, 3);
+    EXPECT_TRUE(a.has_error);
+}
+
+// =============================================================================
+// Arg — add_scalars_i32 (zero-extension)
+// =============================================================================
+
+TEST(Arg, AddScalarsI32ZeroExtends) {
+    Arg a;
+    int32_t vals[4] = {0, 1, -1, 0x7FFFFFFF};
+    a.add_scalars_i32(vals, 4);
+    EXPECT_EQ(a.scalar_count(), 4);
+    EXPECT_EQ(a.scalar(0), uint64_t(0));
+    EXPECT_EQ(a.scalar(1), uint64_t(1));
+    // -1 as int32 is 0xFFFFFFFF; zero-extended to uint64 is 0x00000000FFFFFFFF
+    EXPECT_EQ(a.scalar(2), uint64_t(0x00000000FFFFFFFFull));
+    EXPECT_EQ(a.scalar(3), uint64_t(0x000000007FFFFFFFull));
+    EXPECT_FALSE(a.has_error);
+}
+
+TEST(Arg, AddScalarsI32NegativeValues) {
+    Arg a;
+    int32_t vals[2] = {-1, -2};
+    a.add_scalars_i32(vals, 2);
+    // -1 -> 0xFFFFFFFF zero-extended -> 0x00000000FFFFFFFF
+    // -2 -> 0xFFFFFFFE zero-extended -> 0x00000000FFFFFFFE
+    EXPECT_EQ(a.scalar(0), uint64_t(0xFFFFFFFFull));
+    EXPECT_EQ(a.scalar(1), uint64_t(0xFFFFFFFEull));
+}
+
+TEST(Arg, AddScalarsI32SingleElement) {
+    Arg a;
+    int32_t v = 42;
+    a.add_scalars_i32(&v, 1);
+    EXPECT_EQ(a.scalar_count(), 1);
+    EXPECT_EQ(a.scalar(0), uint64_t(42));
+}
+
+TEST(Arg, AddScalarsI32OverCapacitySetsError) {
+    Arg a;
+    for (int i = 0; i < MAX_SCALAR_ARGS - 1; i++) {
+        a.add_scalar(uint64_t(i));
+    }
+    int32_t vals[3] = {1, 2, 3};
+    a.add_scalars_i32(vals, 3);
+    EXPECT_TRUE(a.has_error);
+}
+
+// =============================================================================
+// Arg — copy_scalars_from
+// =============================================================================
+
+TEST(Arg, CopyScalarsFrom) {
+    Arg src;
+    src.add_scalar(uint64_t(10));
+    src.add_scalar(uint64_t(20));
+    src.add_scalar(uint64_t(30));
+
+    Arg dst;
+    dst.copy_scalars_from(src, 1, 2);  // copy scalars[1..2] = {20, 30}
+    EXPECT_EQ(dst.scalar_count(), 2);
+    EXPECT_EQ(dst.scalar(0), uint64_t(20));
+    EXPECT_EQ(dst.scalar(1), uint64_t(30));
+    EXPECT_FALSE(dst.has_error);
+}
+
+TEST(Arg, CopyScalarsFromOutOfBoundsSetsError) {
+    Arg src;
+    src.add_scalar(uint64_t(1));
+
+    Arg dst;
+    dst.copy_scalars_from(src, 0, 5);  // only 1 scalar available, request 5
+    EXPECT_TRUE(dst.has_error);
+}
+
+TEST(Arg, CopyScalarsFromFull) {
+    Arg src;
+    for (int i = 0; i < MAX_SCALAR_ARGS; i++) {
+        src.add_scalar(static_cast<uint64_t>(i));
+    }
+    Arg dst;
+    for (int i = 0; i < MAX_SCALAR_ARGS - 1; i++) {
+        dst.add_scalar(uint64_t(0));
+    }
+    // dst has MAX-1 scalars; copying 2 from src would overflow
+    dst.copy_scalars_from(src, 0, 2);
+    EXPECT_TRUE(dst.has_error);
+}
+
+// =============================================================================
+// Arg — reset clears all state
+// =============================================================================
+
+TEST(Arg, ResetClearsAll) {
+    float buf[4] = {};
+    Tensor t = make_test_tensor(buf);
+    Arg a;
+    a.add_input(t);
+    a.add_scalar(uint64_t(99));
+    a.set_error("deliberate error");
+
+    a.reset();
+    EXPECT_EQ(a.tensor_count(), 0);
+    EXPECT_EQ(a.scalar_count(), 0);
+    EXPECT_FALSE(a.has_error);
+    EXPECT_EQ(a.error_msg, nullptr);
+}
+
+// =============================================================================
+// Arg — set_error is idempotent (first error wins)
+// =============================================================================
+
+TEST(Arg, SetErrorFirstWins) {
+    Arg a;
+    a.set_error("first");
+    a.set_error("second");
+    EXPECT_STREQ(a.error_msg, "first");
+}
+
+// =============================================================================
+// Arg — launch_spec default
+// =============================================================================
+
+TEST(Arg, LaunchSpecDefaultBlockNum) {
+    Arg a;
+    EXPECT_EQ(a.launch_spec.block_num(), 1);
+}
diff --git a/tests/ut/cpp/test_ready_queue.cpp b/tests/ut/cpp/test_ready_queue.cpp
new file mode 100644
index 000000000..0c6b0fb86
--- /dev/null
+++ b/tests/ut/cpp/test_ready_queue.cpp
@@ -0,0 +1,398 @@
+/**
+ * Unit tests for PTO2ReadyQueue and PTO2LocalReadyBuffer from pto_scheduler.h
+ *
+ * Tests the lock-free bounded MPMC queue (Vyukov design) and the thread-local
+ * ready buffer used for local-first dispatch optimization.
+ */
+
+#include <gtest/gtest.h>
+
+#include <atomic>
+#include <cstdint>
+#include <functional>
+#include <thread>
+#include <vector>
+
+#include "pto_scheduler.h"
+
+// =============================================================================
+// ReadyQueue: Single-threaded tests
+// =============================================================================
+
+class ReadyQueueTest : public ::testing::Test {
+protected:
+    static constexpr uint64_t kCapacity = 16;  // Power of 2
+
+    PTO2ReadyQueue queue;
+
+    void SetUp() override {
+        ASSERT_TRUE(pto2_ready_queue_init(&queue, kCapacity));
+    }
+
+    void TearDown() override {
+        pto2_ready_queue_destroy(&queue);
+    }
+};
+
+// 1. Empty pop returns nullptr
+TEST_F(ReadyQueueTest, EmptyPopReturnsNullptr) {
+    EXPECT_EQ(queue.pop(), nullptr);
+}
+
+// 2. Single push/pop returns correct item
+TEST_F(ReadyQueueTest, SinglePushPop) {
+    PTO2TaskSlotState item;
+    ASSERT_TRUE(queue.push(&item));
+
+    PTO2TaskSlotState* result = queue.pop();
+    EXPECT_EQ(result, &item);
+}
+
+// 3. FIFO ordering: push A,B,C then pop A,B,C
+TEST_F(ReadyQueueTest, FIFOOrdering) {
+    PTO2TaskSlotState a, b, c;
+
+    ASSERT_TRUE(queue.push(&a));
+    ASSERT_TRUE(queue.push(&b));
+    ASSERT_TRUE(queue.push(&c));
+
+    EXPECT_EQ(queue.pop(), &a);
+    EXPECT_EQ(queue.pop(), &b);
+    EXPECT_EQ(queue.pop(), &c);
+    EXPECT_EQ(queue.pop(), nullptr);
+}
+
+// 4. Queue full: push returns false at capacity
+TEST_F(ReadyQueueTest, QueueFullReturnsFalse) {
+    std::vector<PTO2TaskSlotState> items(kCapacity);
+
+    for (uint64_t i = 0; i < kCapacity; i++) {
+        ASSERT_TRUE(queue.push(&items[i]));
+    }
+
+    PTO2TaskSlotState extra;
+    EXPECT_FALSE(queue.push(&extra));
+}
+
+// 5. Slot reuse after full drain (push/pop cycle)
+TEST_F(ReadyQueueTest, SlotReuseAfterFullDrain) {
+    std::vector<PTO2TaskSlotState> items(kCapacity);
+
+    // Fill the queue
+    for (uint64_t i = 0; i < kCapacity; i++) {
+        ASSERT_TRUE(queue.push(&items[i]));
+    }
+
+    // Drain the queue
+    for (uint64_t i = 0; i < kCapacity; i++) {
+        EXPECT_EQ(queue.pop(), &items[i]);
+    }
+    EXPECT_EQ(queue.pop(), nullptr);
+
+    // Refill and re-drain to verify slot reuse
+    for (uint64_t i = 0; i < kCapacity; i++) {
+        ASSERT_TRUE(queue.push(&items[i]));
+    }
+    for (uint64_t i = 0; i < kCapacity; i++) {
+        EXPECT_EQ(queue.pop(), &items[i]);
+    }
+    EXPECT_EQ(queue.pop(), nullptr);
+}
+
+// 6. push_batch: batch enqueue then individual dequeue
+TEST_F(ReadyQueueTest, PushBatchThenIndividualPop) {
+    constexpr int kBatchSize = 5;
+    PTO2TaskSlotState items[kBatchSize];
+    PTO2TaskSlotState* ptrs[kBatchSize];
+    for (int i = 0; i < kBatchSize; i++) {
+        ptrs[i] = &items[i];
+    }
+
+    queue.push_batch(ptrs, kBatchSize);
+
+    for (int i = 0; i < kBatchSize; i++) {
+        EXPECT_EQ(queue.pop(), &items[i]);
+    }
+    EXPECT_EQ(queue.pop(), nullptr);
+}
+
+// 7. push_batch count=0: no-op
+TEST_F(ReadyQueueTest, PushBatchZeroIsNoop) {
+    queue.push_batch(nullptr, 0);
+
+    EXPECT_EQ(queue.size(), 0u);
+    EXPECT_EQ(queue.pop(), nullptr);
+}
+
+// 8. pop_batch: push 10, pop_batch(5) returns 5
+TEST_F(ReadyQueueTest, PopBatchReturnsFive) {
+    constexpr int kPushCount = 10;
+    PTO2TaskSlotState items[kPushCount];
+
+    for (int i = 0; i < kPushCount; i++) {
+        ASSERT_TRUE(queue.push(&items[i]));
+    }
+
+    PTO2TaskSlotState* out[5];
+    int popped = queue.pop_batch(out, 5);
+    EXPECT_EQ(popped, 5);
+
+    for (int i = 0; i < 5; i++) {
+        EXPECT_EQ(out[i], &items[i]);
+    }
+}
+
+// 9. pop_batch partial: push 3, pop_batch(5) returns 3
+TEST_F(ReadyQueueTest, PopBatchPartial) {
+    constexpr int kPushCount = 3;
+    PTO2TaskSlotState items[kPushCount];
+
+    for (int i = 0; i < kPushCount; i++) {
+        ASSERT_TRUE(queue.push(&items[i]));
+    }
+
+    PTO2TaskSlotState* out[5];
+    int popped = queue.pop_batch(out, 5);
+    EXPECT_EQ(popped, kPushCount);
+
+    for (int i = 0; i < kPushCount; i++) {
+        EXPECT_EQ(out[i], &items[i]);
+    }
+}
+
+// 10. pop_batch empty: returns 0
+TEST_F(ReadyQueueTest, PopBatchEmpty) {
+    PTO2TaskSlotState* out[5];
+    int popped = queue.pop_batch(out, 5);
+    EXPECT_EQ(popped, 0);
+}
+
+// 11. size() accuracy after various push/pop
+TEST_F(ReadyQueueTest, SizeAccuracy) {
+    EXPECT_EQ(queue.size(), 0u);
+
+    PTO2TaskSlotState items[8];
+
+    queue.push(&items[0]);
+    EXPECT_EQ(queue.size(), 1u);
+
+    queue.push(&items[1]);
+    queue.push(&items[2]);
+    EXPECT_EQ(queue.size(), 3u);
+
+    queue.pop();
+    EXPECT_EQ(queue.size(), 2u);
+
+    queue.pop();
+    queue.pop();
+    EXPECT_EQ(queue.size(), 0u);
+
+    // Push 5 more
+    for (int i = 0; i < 5; i++) {
+        queue.push(&items[i]);
+    }
+    EXPECT_EQ(queue.size(), 5u);
+}
+
+// =============================================================================
+// ReadyQueue: Multi-threaded tests
+// =============================================================================
+
+class ReadyQueueMTTest : public ::testing::Test {
+protected:
+    static constexpr uint64_t kCapacity = 1024;  // Power of 2
+
+    PTO2ReadyQueue queue;
+
+    void SetUp() override {
+        ASSERT_TRUE(pto2_ready_queue_init(&queue, kCapacity));
+    }
+
+    void TearDown() override {
+        pto2_ready_queue_destroy(&queue);
+    }
+};
+
+// 12. 2 producers / 2 consumers: all items consumed exactly once
+TEST_F(ReadyQueueMTTest, TwoProducersTwoConsumers) {
+    constexpr int kItemsPerProducer = 200;
+    constexpr int kTotalItems = kItemsPerProducer * 2;
+
+    std::vector<PTO2TaskSlotState> items(kTotalItems);
+    // Give each item a unique counter value via its fanin_count field
+    for (int i = 0; i < kTotalItems; i++) {
+        items[i].fanin_count = i;
+    }
+
+    std::atomic<int> produced{0};
+    std::atomic<bool> producers_done{false};
+
+    // Tracking: atomic counter per item to verify exactly-once consumption
+    std::vector<std::atomic<int>> consumed_count(kTotalItems);
+    for (int i = 0; i < kTotalItems; i++) {
+        consumed_count[i].store(0, std::memory_order_relaxed);
+    }
+
+    auto producer = [&](int offset) {
+        for (int i = 0; i < kItemsPerProducer; i++) {
+            while (!queue.push(&items[offset + i])) {
+                // Queue full, retry
+            }
+        }
+        produced.fetch_add(kItemsPerProducer, std::memory_order_release);
+    };
+
+    auto consumer = [&](std::vector<PTO2TaskSlotState*>& results) {
+        while (true) {
+            PTO2TaskSlotState* item = queue.pop();
+            if (item != nullptr) {
+                results.push_back(item);
+                consumed_count[item->fanin_count].fetch_add(1, std::memory_order_relaxed);
+            } else if (producers_done.load(std::memory_order_acquire)) {
+                // Drain remaining
+                while ((item = queue.pop()) != nullptr) {
+                    results.push_back(item);
+                    consumed_count[item->fanin_count].fetch_add(1, std::memory_order_relaxed);
+                }
+                break;
+            }
+        }
+    };
+
+    std::vector<PTO2TaskSlotState*> results_c1, results_c2;
+    std::thread p1(producer, 0);
+    std::thread p2(producer, kItemsPerProducer);
+    std::thread c1(consumer, std::ref(results_c1));
+    std::thread c2(consumer, std::ref(results_c2));
+
+    p1.join();
+    p2.join();
+    producers_done.store(true, std::memory_order_release);
+    c1.join();
+    c2.join();
+
+    // Verify all items consumed exactly once
+    int total_consumed = static_cast<int>(results_c1.size() + results_c2.size());
+    EXPECT_EQ(total_consumed, kTotalItems);
+
+    for (int i = 0; i < kTotalItems; i++) {
+        EXPECT_EQ(consumed_count[i].load(), 1)
+            << "Item " << i << " consumed "
+            << consumed_count[i].load() << " times (expected 1)";
+    }
+}
+
+// 13. 1 producer / N consumers: all items consumed exactly once
+TEST_F(ReadyQueueMTTest, OneProducerNConsumers) {
+    constexpr int kTotalItems = 500;
+    constexpr int kNumConsumers = 4;
+
+    std::vector<PTO2TaskSlotState> items(kTotalItems);
+    for (int i = 0; i < kTotalItems; i++) {
+        items[i].fanin_count = i;
+    }
+
+    std::atomic<bool> producer_done{false};
+    std::vector<std::atomic<int>> consumed_count(kTotalItems);
+    for (int i = 0; i < kTotalItems; i++) {
+        consumed_count[i].store(0, std::memory_order_relaxed);
+    }
+
+    auto producer = [&]() {
+        for (int i = 0; i < kTotalItems; i++) {
+            while (!queue.push(&items[i])) {
+                // Queue full, retry
+            }
+        }
+        producer_done.store(true, std::memory_order_release);
+    };
+
+    std::atomic<int> total_consumed{0};
+
+    auto consumer = [&]() {
+        while (true) {
+            PTO2TaskSlotState* item = queue.pop();
+            if (item != nullptr) {
+                consumed_count[item->fanin_count].fetch_add(1, std::memory_order_relaxed);
+                total_consumed.fetch_add(1, std::memory_order_relaxed);
+            } else if (producer_done.load(std::memory_order_acquire)) {
+                // Drain remaining
+                while ((item = queue.pop()) != nullptr) {
+                    consumed_count[item->fanin_count].fetch_add(1, std::memory_order_relaxed);
+                    total_consumed.fetch_add(1, std::memory_order_relaxed);
+                }
+                break;
+            }
+        }
+    };
+
+    std::thread prod(producer);
+    std::vector<std::thread> consumers;
+    for (int i = 0; i < kNumConsumers; i++) {
+        consumers.emplace_back(consumer);
+    }
+
+    prod.join();
+    for (auto& c : consumers) {
+        c.join();
+    }
+
+    EXPECT_EQ(total_consumed.load(), kTotalItems);
+
+    for (int i = 0; i < kTotalItems; i++) {
+        EXPECT_EQ(consumed_count[i].load(), 1)
+            << "Item " << i << " consumed "
+            << consumed_count[i].load() << " times (expected 1)";
+    }
+}
+
+// =============================================================================
+// LocalReadyBuffer tests
+// =============================================================================
+
+class LocalReadyBufferTest : public ::testing::Test {
+protected:
+    static constexpr int kCapacity = 8;
+
+    PTO2LocalReadyBuffer buffer;
+    PTO2TaskSlotState* backing[kCapacity];
+
+    void SetUp() override {
+        buffer.reset(backing, kCapacity);
+    }
+};
+
+// 14. reset sets clean state
+TEST_F(LocalReadyBufferTest, ResetSetsCleanState) {
+    EXPECT_EQ(buffer.count, 0);
+    EXPECT_EQ(buffer.capacity, kCapacity);
+    EXPECT_EQ(buffer.slot_states, backing);
+}
+
+// 15. try_push/pop LIFO: push A,B -> pop returns B,A
+TEST_F(LocalReadyBufferTest, LIFOOrdering) {
+    PTO2TaskSlotState a, b;
+
+    ASSERT_TRUE(buffer.try_push(&a));
+    ASSERT_TRUE(buffer.try_push(&b));
+
+    EXPECT_EQ(buffer.pop(), &b);
+    EXPECT_EQ(buffer.pop(), &a);
+    EXPECT_EQ(buffer.pop(), nullptr);
+}
+
+// 16. try_push full: returns false at capacity
+TEST_F(LocalReadyBufferTest, TryPushFullReturnsFalse) {
+    PTO2TaskSlotState items[kCapacity + 1];
+
+    for (int i = 0; i < kCapacity; i++) {
+        ASSERT_TRUE(buffer.try_push(&items[i]));
+    }
+
+    EXPECT_FALSE(buffer.try_push(&items[kCapacity]));
+}
+
+// 17. pop empty: returns nullptr
+TEST_F(LocalReadyBufferTest, PopEmptyReturnsNullptr) {
+    EXPECT_EQ(buffer.pop(), nullptr);
+}
diff --git a/tests/ut/cpp/test_ring_buffer.cpp b/tests/ut/cpp/test_ring_buffer.cpp
new file mode 100644
index 000000000..58bd8691f
--- /dev/null
+++ b/tests/ut/cpp/test_ring_buffer.cpp
@@ -0,0 +1,573 @@
+/**
+ * Unit tests for PTO2TaskAllocator and PTO2DepListPool from pto_ring_buffer.h
+ *
+ * Tests ring buffer allocation, heap bump logic, dependency list pool,
+ * and known boundary conditions including a bug candidate in
+ * try_bump_heap wrap-around when tail == alloc_size.
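+ *
+ * Heap model assumed by these tests: a bump allocator over [0, heap_size)
+ * in which top chases tail. When top >= tail the free space is the end
+ * segment plus the beginning; an allocation that does not fit at the end
+ * may wrap to offset 0 provided [0, alloc_size) lies below tail. When
+ * top < tail, only the contiguous region [top, tail) is usable.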
+ */
+
+#include <gtest/gtest.h>
+
+#include <atomic>
+#include <cstring>
+#include <vector>
+
+#include "pto_ring_buffer.h"
+
+// =============================================================================
+// Helpers
+// =============================================================================
+
+static constexpr int32_t kWindowSize = 16;  // Power of 2, small for testing
+static constexpr uint64_t kHeapSize = 1024;  // Small heap for boundary testing
+
+/**
+ * Test fixture for PTO2TaskAllocator tests.
+ *
+ * Sets up a descriptor array, heap buffer, and atomic flow-control variables.
+ * last_alive starts at 0, so tasks 0..window_size-2 can be allocated before
+ * the ring is considered full (active = local_task_id - last_alive + 1 < window_size).
+ */
+class TaskAllocatorTest : public ::testing::Test {
+protected:
+    void SetUp() override {
+        descriptors_.resize(kWindowSize);
+        std::memset(descriptors_.data(), 0, sizeof(PTO2TaskDescriptor) * kWindowSize);
+        heap_buf_.resize(kHeapSize, 0);
+
+        current_index_.store(0, std::memory_order_relaxed);
+        last_alive_.store(0, std::memory_order_relaxed);
+        error_code_.store(0, std::memory_order_relaxed);
+
+        allocator_.init(
+            descriptors_.data(), kWindowSize,
+            &current_index_, &last_alive_,
+            heap_buf_.data(), kHeapSize,
+            &error_code_);
+    }
+
+    // Simulate the scheduler consuming tasks up to (exclusive) task_id
+    // by advancing last_alive and setting packed_buffer_end on the consumed descriptor.
+    void consume_up_to(int32_t task_id, uint64_t heap_tail_offset) {
+        // Set the packed_buffer_end on the descriptor that last_alive-1 maps to
+        // so update_heap_tail can derive the tail.
+        int32_t last_consumed = task_id - 1;
+        descriptors_[last_consumed & (kWindowSize - 1)].packed_buffer_end =
+            static_cast<uint64_t>(reinterpret_cast<uintptr_t>(heap_buf_.data())) + heap_tail_offset;
+        last_alive_.store(task_id, std::memory_order_release);
+    }
+
+    PTO2TaskAllocator allocator_;
+    std::vector<PTO2TaskDescriptor> descriptors_;
+    std::vector<uint8_t> heap_buf_;
+    std::atomic<int32_t> current_index_{0};
+    std::atomic<int32_t> last_alive_{0};
+    std::atomic<int32_t> error_code_{0};
+};
+
+// =============================================================================
+// TaskAllocator: init and state queries
+// =============================================================================
+
+TEST_F(TaskAllocatorTest, InitialState) {
+    EXPECT_EQ(allocator_.window_size(), kWindowSize);
+    EXPECT_EQ(allocator_.active_count(), 0);
+    EXPECT_EQ(allocator_.heap_top(), 0u);
+    EXPECT_EQ(allocator_.heap_capacity(), kHeapSize);
+    EXPECT_EQ(allocator_.heap_available(), kHeapSize);
+}
+
+// =============================================================================
+// TaskAllocator: single alloc with output_size=0
+// =============================================================================
+
+TEST_F(TaskAllocatorTest, AllocZeroOutputSize) {
+    auto result = allocator_.alloc(0);
+    ASSERT_FALSE(result.failed());
+    EXPECT_EQ(result.task_id, 0);
+    EXPECT_EQ(result.slot, 0);
+    // packed_base should be heap_base + 0 (non-null)
+    EXPECT_NE(result.packed_base, nullptr);
+    // packed_end == packed_base when output_size == 0
+    EXPECT_EQ(result.packed_base, result.packed_end);
+    // Heap top should not advance for zero-size alloc
+    EXPECT_EQ(allocator_.heap_top(), 0u);
+}
+
+// =============================================================================
+// TaskAllocator: single alloc with non-zero size
+// =============================================================================
+
+TEST_F(TaskAllocatorTest, AllocNonZeroSize) {
+    auto result = allocator_.alloc(100);
+    ASSERT_FALSE(result.failed());
+    EXPECT_EQ(result.task_id, 0);
+    EXPECT_EQ(result.slot, 0);
+    EXPECT_NE(result.packed_base, nullptr);
+    // 100 bytes aligned up to PTO2_ALIGN_SIZE (64) = 128
+    uint64_t expected_aligned = PTO2_ALIGN_UP(100u, PTO2_ALIGN_SIZE);
+    EXPECT_EQ(expected_aligned, 128u);
+    EXPECT_EQ(allocator_.heap_top(), expected_aligned);
+    EXPECT_EQ(static_cast<uint8_t*>(result.packed_end) - static_cast<uint8_t*>(result.packed_base),
+              static_cast<ptrdiff_t>(expected_aligned));
+}
+
+// =============================================================================
+// TaskAllocator: sequential allocs produce sequential task IDs
+// =============================================================================
+
+TEST_F(TaskAllocatorTest, SequentialTaskIds) {
+    for (int i = 0; i < 5; i++) {
+        auto result = allocator_.alloc(0);
+        ASSERT_FALSE(result.failed()) << "Alloc failed at i=" << i;
+        EXPECT_EQ(result.task_id, i);
+        EXPECT_EQ(result.slot, i & (kWindowSize - 1));
+    }
+    EXPECT_EQ(allocator_.active_count(), 5);
+}
+
+// =============================================================================
+// TaskAllocator: alignment of output_size to PTO2_ALIGN_SIZE
+// =============================================================================

+TEST_F(TaskAllocatorTest, OutputSizeAlignment) {
+    // 1 byte -> aligned to 64
+    auto r1 = allocator_.alloc(1);
+    ASSERT_FALSE(r1.failed());
+    EXPECT_EQ(allocator_.heap_top(), 64u);
+
+    // Another 33 bytes -> aligned to 64, total 128
+    auto r2 = allocator_.alloc(33);
+    ASSERT_FALSE(r2.failed());
+    EXPECT_EQ(allocator_.heap_top(), 128u);
+
+    // Exactly 64 bytes -> stays 64, total 192
+    auto r3 = allocator_.alloc(64);
+    ASSERT_FALSE(r3.failed());
+    EXPECT_EQ(allocator_.heap_top(), 192u);
+}
+
+// =============================================================================
+// TaskAllocator: try_bump_heap exact fit at end (space_at_end == alloc_size)
+// =============================================================================
+
+TEST_F(TaskAllocatorTest, HeapExactFitAtEnd) {
+    // Heap size is 1024. Allocate 960 bytes (15 * 64) to leave exactly 64 at end.
+    // Then allocate exactly 64 which should succeed (space_at_end >= alloc_size).
+    auto r1 = allocator_.alloc(960);
+    ASSERT_FALSE(r1.failed());
+    EXPECT_EQ(allocator_.heap_top(), 960u);
+
+    auto r2 = allocator_.alloc(64);
+    ASSERT_FALSE(r2.failed());
+    EXPECT_EQ(allocator_.heap_top(), 1024u);
+    // Result pointer should be at heap_base + 960
+    EXPECT_EQ(static_cast<uint8_t*>(r2.packed_base),
+              heap_buf_.data() + 960);
+}
+
+// =============================================================================
+// TaskAllocator: try_bump_heap wrap-around with tail == alloc_size (BUG TEST)
+//
+// BUG at pto_ring_buffer.h try_bump_heap: uses `tail > alloc_size` (strict >).
+// When tail == alloc_size, [0, alloc_size) is exactly available, so the
+// wrap-around should succeed. The strict > incorrectly rejects it.
+//
+// Correct behavior: allocation succeeds, packed_base == heap_base, top == 64.
+// This test FAILS until the bug is fixed (> changed to >=).
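+//
+// A minimal sketch of the intended wrap-around branch (local names assumed
+// from the comments above, not copied from the header):
+//
+//   if (heap_size - top >= alloc_size) {   // fits at the end
+//       result = heap_base + top; top += alloc_size;
+//   } else if (tail >= alloc_size) {       // >=, not >: exact fit at front
+//       result = heap_base; top = alloc_size;
+//   } else {
+//       // spin until the scheduler advances tail
+//   }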
+// =============================================================================
+
+TEST_F(TaskAllocatorTest, HeapWrapAroundTailEqualsAllocSize_BugCandidate) {
+    // Fill heap completely: allocate 1024 bytes total
+    auto r1 = allocator_.alloc(1024);
+    ASSERT_FALSE(r1.failed());
+    EXPECT_EQ(allocator_.heap_top(), 1024u);
+
+    // Now consume task 0, setting tail to exactly 64 (one aligned block)
+    consume_up_to(1, 64);
+
+    // top=1024 (== heap_size), tail=64
+    // space_at_end = 0, wrap-around: tail(64) >= alloc_size(64) => exactly fits.
+    // Correct behavior: allocation wraps to [0, 64) and succeeds.
+    auto r2 = allocator_.alloc(64);
+    ASSERT_FALSE(r2.failed()) << "wrap-around must succeed when tail == alloc_size";
+    EXPECT_EQ(r2.packed_base, static_cast<void*>(heap_buf_.data()))
+        << "packed_base should wrap to start of heap";
+    EXPECT_EQ(allocator_.heap_top(), 64u);
+    EXPECT_EQ(error_code_.load(), 0) << "no error on successful allocation";
+}
+
+// =============================================================================
+// TaskAllocator: try_bump_heap wrap-around success (tail > alloc_size)
+// =============================================================================
+
+TEST_F(TaskAllocatorTest, HeapWrapAroundSuccess) {
+    // Fill heap completely: allocate 1024 bytes
+    auto r1 = allocator_.alloc(1024);
+    ASSERT_FALSE(r1.failed());
+    EXPECT_EQ(allocator_.heap_top(), 1024u);
+
+    // Consume task 0, setting tail to 128 (more than one block)
+    consume_up_to(1, 128);
+
+    // Now: top=1024 (== heap_size), tail=128
+    // space_at_end = 0, so wrap-around check: tail(128) > alloc_size(64)? => TRUE
+    // Wraps to beginning: result = heap_base, top = 64
+    auto r2 = allocator_.alloc(64);
+    ASSERT_FALSE(r2.failed());
+    EXPECT_EQ(r2.packed_base, static_cast<void*>(heap_buf_.data()));
+    EXPECT_EQ(allocator_.heap_top(), 64u);
+}
+
+// =============================================================================
+// TaskAllocator: try_bump_heap top < tail exact fit
+// =============================================================================
+
+TEST_F(TaskAllocatorTest, HeapTopLessThanTailExactFit) {
+    // Fill heap, then wrap around to set up top < tail.
+    auto r1 = allocator_.alloc(960);
+    ASSERT_FALSE(r1.failed());
+
+    // Consume task 0, tail moves to 960
+    consume_up_to(1, 960);
+
+    // Allocate 128 bytes: space_at_end = 1024-960 = 64, not enough for 128.
+    // Wrap-around: tail(960) > 128 => TRUE, wraps.
+    auto r2 = allocator_.alloc(128);
+    ASSERT_FALSE(r2.failed());
+    EXPECT_EQ(allocator_.heap_top(), 128u);
+
+    // Now top=128, tail=960 (top < tail)
+    // Available = tail - top = 960 - 128 = 832
+    // Allocate exactly 832 bytes: should succeed (exact fit)
+    auto r3 = allocator_.alloc(832);
+    ASSERT_FALSE(r3.failed());
+    EXPECT_EQ(allocator_.heap_top(), 960u);
+}
+
+// =============================================================================
+// TaskAllocator: try_bump_heap top < tail insufficient space
+// =============================================================================
+
+TEST_F(TaskAllocatorTest, HeapTopLessThanTailInsufficientSpace) {
+    // Set up top < tail scenario
+    auto r1 = allocator_.alloc(960);
+    ASSERT_FALSE(r1.failed());
+    consume_up_to(1, 960);
+
+    auto r2 = allocator_.alloc(128);
+    ASSERT_FALSE(r2.failed());
+    EXPECT_EQ(allocator_.heap_top(), 128u);
+
+    // Now top=128, tail=960. Available = 832.
+    // Try to allocate 896 (> 832): should fail (deadlock after spin).
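+    // With top < tail, only the contiguous region [top, tail) is usable,
+    // so the 896-byte request cannot be satisfied from the 832 free bytes.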
+    auto r3 = allocator_.alloc(896);
+    EXPECT_TRUE(r3.failed());
+    EXPECT_NE(error_code_.load(), 0);
+}
+
+// =============================================================================
+// TaskAllocator: update_heap_tail from consumed task
+// =============================================================================
+
+TEST_F(TaskAllocatorTest, UpdateHeapTailFromConsumedTask) {
+    auto r1 = allocator_.alloc(256);
+    ASSERT_FALSE(r1.failed());
+    EXPECT_EQ(allocator_.heap_top(), 256u);
+
+    // Before consumption, heap_available should be heap_size - top = 768
+    EXPECT_EQ(allocator_.heap_available(), kHeapSize - 256u);
+
+    // Consume task 0, tail moves to 256
+    consume_up_to(1, 256);
+
+    // Force the allocator to observe the new last_alive by doing another alloc
+    auto r2 = allocator_.alloc(0);
+    ASSERT_FALSE(r2.failed());
+
+    // After update_heap_tail: top=256, tail=256 (top >= tail), so
+    // at_end = 1024-256 = 768 and at_begin = 256;
+    // heap_available returns max(at_end, at_begin) = 768
+    EXPECT_EQ(allocator_.heap_available(), kHeapSize - 256u);
+}
+
+// =============================================================================
+// TaskAllocator: update_heap_tail at task 0 boundary
+//
+// When last_alive=1, update_heap_tail reads descriptors[(1-1) & mask] = descriptors[0].
+// This is task 0's descriptor, which should have valid packed_buffer_end.
+// =============================================================================
+
+TEST_F(TaskAllocatorTest, UpdateHeapTailAtTask0) {
+    // Allocate task 0 with some heap
+    auto r1 = allocator_.alloc(64);
+    ASSERT_FALSE(r1.failed());
+    EXPECT_EQ(r1.task_id, 0);
+
+    // Set packed_buffer_end on task 0's descriptor
+    descriptors_[0].packed_buffer_end =
+        static_cast<uint64_t>(reinterpret_cast<uintptr_t>(heap_buf_.data())) + 64;
+
+    // Advance last_alive to 1 (meaning task 0 is consumed)
+    last_alive_.store(1, std::memory_order_release);
+
+    // The next alloc triggers update_heap_tail(1), reading descriptors[0].
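+    // ((1 - 1) & (kWindowSize - 1)) == 0, so the boundary read lands on
+    // slot 0 rather than wrapping to the end of the descriptor array.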
+    auto r2 = allocator_.alloc(0);
+    ASSERT_FALSE(r2.failed());
+    EXPECT_EQ(r2.task_id, 1);
+}
+
+// =============================================================================
+// TaskAllocator: update_heap_tail idempotent
+// =============================================================================
+
+TEST_F(TaskAllocatorTest, UpdateHeapTailIdempotent) {
+    auto r1 = allocator_.alloc(128);
+    ASSERT_FALSE(r1.failed());
+
+    consume_up_to(1, 128);
+
+    // Multiple allocs should not cause heap_tail to drift
+    auto r2 = allocator_.alloc(0);
+    ASSERT_FALSE(r2.failed());
+    uint64_t avail_after_first = allocator_.heap_available();
+
+    auto r3 = allocator_.alloc(0);
+    ASSERT_FALSE(r3.failed());
+    EXPECT_EQ(allocator_.heap_available(), avail_after_first);
+}
+
+// =============================================================================
+// TaskAllocator: heap_available for top>=tail and top<tail
+// =============================================================================
+
+TEST_F(TaskAllocatorTest, HeapAvailableTopVsTail) {
+    // top >= tail: available is the larger of the end and begin segments
+    auto r1 = allocator_.alloc(256);
+    ASSERT_FALSE(r1.failed());
+    EXPECT_EQ(allocator_.heap_available(), kHeapSize - 256u);
+
+    // Set up top < tail: fill to 960, consume both tasks, then wrap
+    auto r2 = allocator_.alloc(704);
+    ASSERT_FALSE(r2.failed());
+    consume_up_to(2, 960);
+
+    auto r3 = allocator_.alloc(128);  // wraps: top=128, tail=960
+    ASSERT_FALSE(r3.failed());
+    EXPECT_EQ(allocator_.heap_available(), 960u - 128u);
+}
+
+// =============================================================================
+// DepListPool test fixture
+// =============================================================================
+
+class DepListPoolTest : public ::testing::Test {
+protected:
+    static constexpr int32_t kPoolCapacity = 8;
+
+    void SetUp() override {
+        entries_.resize(kPoolCapacity);
+        pool_.init(entries_.data(), kPoolCapacity, &error_code_);
+    }
+
+    PTO2DepListPool pool_;
+    std::vector<PTO2DepListEntry> entries_;
+    std::atomic<int32_t> error_code_{0};
+};
+
+// =============================================================================
+// DepListPool: init (top=1, tail=1, entry 0 is NULL)
+// =============================================================================
+
+TEST_F(DepListPoolTest, InitialState) {
+    EXPECT_EQ(pool_.top, 1);
+    EXPECT_EQ(pool_.tail, 1);
+    EXPECT_EQ(pool_.high_water, 0);
+    EXPECT_EQ(pool_.used(), 0);
+    EXPECT_EQ(pool_.available(), kPoolCapacity);
+
+    // Entry 0 should be NULL marker
+    EXPECT_EQ(entries_[0].slot_state, nullptr);
+    EXPECT_EQ(entries_[0].next, nullptr);
+}
+
+// =============================================================================
+// DepListPool: single alloc
+// =============================================================================
+
+TEST_F(DepListPoolTest, SingleAlloc) {
+    PTO2DepListEntry* entry = pool_.alloc();
+    ASSERT_NE(entry, nullptr);
+    EXPECT_EQ(pool_.top, 2);
+    EXPECT_EQ(pool_.tail, 1);
+    EXPECT_EQ(pool_.used(), 1);
+    EXPECT_EQ(pool_.available(), kPoolCapacity - 1);
+    EXPECT_EQ(pool_.high_water, 1);
+
+    // The allocated entry should be at index 1 (top was 1, mod capacity)
+    EXPECT_EQ(entry, &entries_[1]);
+}
+
+// =============================================================================
+// DepListPool: modular wrap — entry 0 must not be returned (BUG TEST)
+//
+// BUG in PTO2DepListPool::alloc(): uses `top % capacity` as physical index.
+// When top is a multiple of capacity (e.g. top=8 with capacity=8), this
+// yields index 0 — the NULL sentinel slot set during init(). Handing out
+// &entries_[0] corrupts the sentinel, breaking pto2_dep_pool_get().
+//
+// Correct behavior: the pool must never return the NULL sentinel slot.
+// This test FAILS until the bug is fixed (index arithmetic skips slot 0).
+// =============================================================================
+
+TEST_F(DepListPoolTest, ModularWrapEntry0Conflict) {
+    // Capacity is 8. Allocate 7 entries (top goes from 1 to 8).
+    for (int i = 0; i < 7; i++) {
+        PTO2DepListEntry* e = pool_.alloc();
+        ASSERT_NE(e, nullptr) << "Failed at alloc " << i;
+    }
+    EXPECT_EQ(pool_.top, 8);
+    EXPECT_EQ(pool_.used(), 7);
+
+    // Advance tail so pool is not full (used drops to 3)
+    pool_.advance_tail(5);
+    EXPECT_EQ(pool_.used(), 3);
+
+    // Correct behavior: allocation must NOT return the NULL sentinel slot.
+ PTO2DepListEntry* e = pool_.alloc(); + ASSERT_NE(e, nullptr) << "allocation must succeed (space is available)"; + EXPECT_NE(e, &entries_[0]) << "must never return the NULL sentinel at index 0"; + // The sentinel must remain intact + EXPECT_EQ(entries_[0].slot_state, nullptr) << "NULL sentinel must not be overwritten"; + EXPECT_EQ(entries_[0].next, nullptr); +} + +// ============================================================================= +// DepListPool: overflow detection +// ============================================================================= + +TEST_F(DepListPoolTest, OverflowDetection) { + // Allocate until full (capacity entries used) + for (int i = 0; i < kPoolCapacity; i++) { + PTO2DepListEntry* e = pool_.alloc(); + ASSERT_NE(e, nullptr) << "Unexpected failure at alloc " << i; + } + EXPECT_EQ(pool_.used(), kPoolCapacity); + EXPECT_EQ(pool_.available(), 0); + + // Next alloc should fail (overflow) + PTO2DepListEntry* overflow = pool_.alloc(); + EXPECT_EQ(overflow, nullptr); + EXPECT_NE(error_code_.load(), 0); + EXPECT_EQ(error_code_.load(), PTO2_ERROR_DEP_POOL_OVERFLOW); +} + +// ============================================================================= +// DepListPool: prepend chain integrity +// ============================================================================= + +TEST_F(DepListPoolTest, PrependChainIntegrity) { + PTO2TaskSlotState slot_a{}; + PTO2TaskSlotState slot_b{}; + PTO2TaskSlotState slot_c{}; + + // Build a chain: NULL -> slot_a -> slot_b -> slot_c (prepend order) + PTO2DepListEntry* head = nullptr; + + head = pool_.prepend(head, &slot_a); + ASSERT_NE(head, nullptr); + EXPECT_EQ(head->slot_state, &slot_a); + EXPECT_EQ(head->next, nullptr); + + head = pool_.prepend(head, &slot_b); + ASSERT_NE(head, nullptr); + EXPECT_EQ(head->slot_state, &slot_b); + EXPECT_EQ(head->next->slot_state, &slot_a); + EXPECT_EQ(head->next->next, nullptr); + + head = pool_.prepend(head, &slot_c); + ASSERT_NE(head, nullptr); + EXPECT_EQ(head->slot_state, &slot_c); + EXPECT_EQ(head->next->slot_state, &slot_b); + EXPECT_EQ(head->next->next->slot_state, &slot_a); + EXPECT_EQ(head->next->next->next, nullptr); +} + +// ============================================================================= +// DepListPool: advance_tail +// ============================================================================= + +TEST_F(DepListPoolTest, AdvanceTail) { + // Allocate 4 entries + for (int i = 0; i < 4; i++) { + pool_.alloc(); + } + EXPECT_EQ(pool_.used(), 4); + EXPECT_EQ(pool_.available(), kPoolCapacity - 4); + + // Advance tail by 3 (from 1 to 4) + pool_.advance_tail(4); + EXPECT_EQ(pool_.tail, 4); + EXPECT_EQ(pool_.used(), 1); // top=5, tail=4 + EXPECT_EQ(pool_.available(), kPoolCapacity - 1); +} + +// ============================================================================= +// DepListPool: advance_tail backwards (no-op) +// ============================================================================= + +TEST_F(DepListPoolTest, AdvanceTailBackwardsNoop) { + pool_.alloc(); + pool_.alloc(); + pool_.advance_tail(3); + EXPECT_EQ(pool_.tail, 3); + + // Trying to advance backwards should be a no-op + pool_.advance_tail(2); + EXPECT_EQ(pool_.tail, 3); + + // Same value should also be a no-op + pool_.advance_tail(3); + EXPECT_EQ(pool_.tail, 3); +} + +// ============================================================================= +// DepListPool: pto2_dep_pool_get(0) returns NULL +// ============================================================================= + 
+TEST_F(DepListPoolTest, GetOffsetZeroReturnsNull) {
+    PTO2DepListEntry* result = pool_.pto2_dep_pool_get(0);
+    EXPECT_EQ(result, nullptr);
+}
+
+// =============================================================================
+// DepListPool: pto2_dep_pool_get(-1) returns NULL
+// =============================================================================
+
+TEST_F(DepListPoolTest, GetNegativeOffsetReturnsNull) {
+    PTO2DepListEntry* result = pool_.pto2_dep_pool_get(-1);
+    EXPECT_EQ(result, nullptr);
+}
+
+// =============================================================================
+// DepListPool: pto2_dep_pool_get with valid offset
+// =============================================================================
+
+TEST_F(DepListPoolTest, GetValidOffset) {
+    PTO2DepListEntry* result = pool_.pto2_dep_pool_get(1);
+    EXPECT_EQ(result, &entries_[1]);
+
+    result = pool_.pto2_dep_pool_get(5);
+    EXPECT_EQ(result, &entries_[5]);
+}
diff --git a/tests/ut/cpp/test_scheduler_state.cpp b/tests/ut/cpp/test_scheduler_state.cpp
new file mode 100644
index 000000000..c82ebf1d8
--- /dev/null
+++ b/tests/ut/cpp/test_scheduler_state.cpp
@@ -0,0 +1,231 @@
+/**
+ * Unit tests for PTO2SchedulerState from pto_scheduler.h
+ *
+ * Tests task state transitions, fanin/fanout logic, subtask completion.
+ */
+
+#include <gtest/gtest.h>
+
+#include <atomic>
+#include <cstring>
+#include <thread>
+#include <vector>
+
+#include "pto_scheduler.h"
+
+class SchedulerStateTest : public ::testing::Test {
+protected:
+    PTO2SchedulerState sched;
+    PTO2SharedMemoryHandle* sm_handle = nullptr;
+
+    void SetUp() override {
+        sm_handle = pto2_sm_create_default();
+        ASSERT_NE(sm_handle, nullptr);
+        bool ok = pto2_scheduler_init(&sched, sm_handle);
+        ASSERT_TRUE(ok);
+    }
+
+    void TearDown() override {
+        pto2_scheduler_destroy(&sched);
+        if (sm_handle) {
+            pto2_sm_destroy(sm_handle);
+        }
+    }
+
+    void init_slot(PTO2TaskSlotState& slot, PTO2TaskState state,
+                   int32_t fanin_count, int32_t fanout_count,
+                   uint8_t ring_id = 0) {
+        memset(&slot, 0, sizeof(slot));
+        slot.task_state.store(state);
+        slot.fanin_count = fanin_count;
+        slot.fanin_refcount.store(0);
+        slot.fanout_count = fanout_count;
+        slot.fanout_refcount.store(0);
+        slot.fanout_lock.store(0);
+        slot.fanout_head = nullptr;
+        slot.ring_id = ring_id;
+        slot.active_mask = PTO2_SUBTASK_MASK_AIC;
+        slot.completed_subtasks.store(0);
+        slot.total_required_subtasks = 1;
+        slot.block_num = 1;
+    }
+};
+
+// =============================================================================
+// check_and_handle_consumed
+// =============================================================================
+
+TEST_F(SchedulerStateTest, ConsumedNotReady) {
+    alignas(64) PTO2TaskSlotState slot;
+    init_slot(slot, PTO2_TASK_COMPLETED, 1, 2);
+    slot.fanout_refcount.store(1);  // 1 != 2
+
+    sched.check_and_handle_consumed(slot);
+    EXPECT_EQ(slot.task_state.load(), PTO2_TASK_COMPLETED);
+}
+
+TEST_F(SchedulerStateTest, ConsumedTransition) {
+    alignas(64) PTO2TaskSlotState slot;
+    init_slot(slot, PTO2_TASK_COMPLETED, 1, 2);
+    slot.fanout_refcount.store(2);  // matches fanout_count
+
+    sched.check_and_handle_consumed(slot);
+    EXPECT_EQ(slot.task_state.load(), PTO2_TASK_CONSUMED);
+}
+
+TEST_F(SchedulerStateTest, ConsumedNotCompletedState) {
+    alignas(64) PTO2TaskSlotState slot;
+    init_slot(slot, PTO2_TASK_RUNNING, 1, 1);
+    slot.fanout_refcount.store(1);
+
+    sched.check_and_handle_consumed(slot);
+    // CAS fails because state is RUNNING, not COMPLETED
+    EXPECT_EQ(slot.task_state.load(), PTO2_TASK_RUNNING);
+}
+
+TEST_F(SchedulerStateTest, ConsumedIdempotent) {
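+    // Re-running the consumed check on an already-CONSUMED slot must be a
+    // no-op: the CAS expects COMPLETED and simply fails.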
+    alignas(64) PTO2TaskSlotState slot;
+    init_slot(slot, PTO2_TASK_CONSUMED, 1, 1);
+    slot.fanout_refcount.store(1);
+
+    sched.check_and_handle_consumed(slot);
+    EXPECT_EQ(slot.task_state.load(), PTO2_TASK_CONSUMED);
+}
+
+// =============================================================================
+// release_producer
+// =============================================================================
+
+TEST_F(SchedulerStateTest, ReleaseProducerIncrements) {
+    alignas(64) PTO2TaskSlotState slot;
+    init_slot(slot, PTO2_TASK_COMPLETED, 1, 3);
+
+    sched.release_producer(slot);
+    EXPECT_EQ(slot.fanout_refcount.load(), 1);
+
+    sched.release_producer(slot);
+    EXPECT_EQ(slot.fanout_refcount.load(), 2);
+}
+
+TEST_F(SchedulerStateTest, ReleaseProducerTriggersConsumed) {
+    alignas(64) PTO2TaskSlotState slot;
+    init_slot(slot, PTO2_TASK_COMPLETED, 1, 2);
+    slot.fanout_refcount.store(1);  // One away
+
+    sched.release_producer(slot);
+    EXPECT_EQ(slot.task_state.load(), PTO2_TASK_CONSUMED);
+}
+
+// =============================================================================
+// release_fanin_and_check_ready
+// =============================================================================
+
+TEST_F(SchedulerStateTest, FaninPartialNotReady) {
+    alignas(64) PTO2TaskSlotState slot;
+    init_slot(slot, PTO2_TASK_PENDING, 3, 1);
+
+    bool ready = sched.release_fanin_and_check_ready(slot);
+    EXPECT_FALSE(ready);
+    EXPECT_EQ(slot.fanin_refcount.load(), 1);
+}
+
+TEST_F(SchedulerStateTest, FaninAllSatisfiedReady) {
+    alignas(64) PTO2TaskSlotState slot;
+    init_slot(slot, PTO2_TASK_PENDING, 1, 1);
+
+    bool ready = sched.release_fanin_and_check_ready(slot);
+    EXPECT_TRUE(ready);
+}
+
+// =============================================================================
+// on_subtask_complete
+// =============================================================================
+
+TEST_F(SchedulerStateTest, SubtaskCompleteSingle) {
+    alignas(64) PTO2TaskSlotState slot;
+    init_slot(slot, PTO2_TASK_RUNNING, 1, 1);
+    slot.total_required_subtasks = 1;
+    slot.completed_subtasks.store(0);
+
+    EXPECT_TRUE(sched.on_subtask_complete(slot));
+}
+
+TEST_F(SchedulerStateTest, SubtaskCompleteMultiBlock) {
+    alignas(64) PTO2TaskSlotState slot;
+    init_slot(slot, PTO2_TASK_RUNNING, 1, 1);
+    slot.total_required_subtasks = 6;  // 3 cores * 2 blocks
+    slot.completed_subtasks.store(0);
+
+    for (int i = 0; i < 5; i++) {
+        EXPECT_FALSE(sched.on_subtask_complete(slot));
+    }
+    EXPECT_TRUE(sched.on_subtask_complete(slot));
+}
+
+TEST_F(SchedulerStateTest, SubtaskCompleteConcurrent) {
+    alignas(64) PTO2TaskSlotState slot;
+    init_slot(slot, PTO2_TASK_RUNNING, 1, 1);
+    slot.total_required_subtasks = 6;
+    slot.completed_subtasks.store(0);
+
+    std::atomic<int> true_count{0};
+    std::vector<std::thread> threads;
+    for (int i = 0; i < 6; i++) {
+        threads.emplace_back([&]() {
+            if (sched.on_subtask_complete(slot)) {
+                true_count.fetch_add(1);
+            }
+        });
+    }
+    for (auto& t : threads) t.join();
+
+    EXPECT_EQ(true_count.load(), 1);
+    EXPECT_EQ(slot.completed_subtasks.load(), 6);
+}
+
+// =============================================================================
+// on_scope_end
+// =============================================================================
+
+TEST_F(SchedulerStateTest, ScopeEndBatchRelease) {
+    constexpr int N = 4;
+    alignas(64) PTO2TaskSlotState slots[N];
+    PTO2TaskSlotState* ptrs[N];
+
+    for (int i = 0; i < N; i++) {
+        init_slot(slots[i], PTO2_TASK_COMPLETED, 1, 2);
+        ptrs[i] = &slots[i];
+    }
+
+    sched.on_scope_end(ptrs, N);
+
+    for (int i = 0; i < N; i++) {
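+        // on_scope_end releases each slot as a producer exactly once
+        // (fanout_refcount goes 0 -> 1, still below fanout_count == 2)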
+        EXPECT_EQ(slots[i].fanout_refcount.load(), 1);
+    }
+}
+
+// =============================================================================
+// get_ready_tasks_batch: local buffer first
+// =============================================================================
+
+TEST_F(SchedulerStateTest, GetReadyTasksBatchLocalFirst) {
+    alignas(64) PTO2TaskSlotState slot_a, slot_b;
+    init_slot(slot_a, PTO2_TASK_READY, 0, 1);
+    init_slot(slot_b, PTO2_TASK_READY, 0, 1);
+
+    PTO2TaskSlotState* local_buf_storage[4];
+    PTO2LocalReadyBuffer local_buf;
+    local_buf.reset(local_buf_storage, 4);
+    local_buf.try_push(&slot_a);
+
+    // Push slot_b to global queue
+    sched.ready_queues[0].push(&slot_b);
+
+    PTO2TaskSlotState* out[4];
+    int count = sched.get_ready_tasks_batch(PTO2ResourceShape::AIC, local_buf, out, 4);
+
+    EXPECT_EQ(count, 2);
+    // Local buffer drains first (LIFO), so slot_a comes first
+    EXPECT_EQ(out[0], &slot_a);
+    EXPECT_EQ(out[1], &slot_b);
+}
diff --git a/tests/ut/cpp/test_shared_memory.cpp b/tests/ut/cpp/test_shared_memory.cpp
new file mode 100644
index 000000000..0a2acf620
--- /dev/null
+++ b/tests/ut/cpp/test_shared_memory.cpp
@@ -0,0 +1,84 @@
+/**
+ * Unit tests for PTO2SharedMemory layout from pto_shared_memory.h
+ */
+
+#include <gtest/gtest.h>
+
+#include "pto_shared_memory.h"
+
+class SharedMemoryTest : public ::testing::Test {
+protected:
+    PTO2SharedMemoryHandle* handle = nullptr;
+
+    void SetUp() override {
+        handle = pto2_sm_create_default();
+        ASSERT_NE(handle, nullptr);
+    }
+
+    void TearDown() override {
+        if (handle) {
+            pto2_sm_destroy(handle);
+            handle = nullptr;
+        }
+    }
+};
+
+TEST_F(SharedMemoryTest, CreateDefaultReturnsNonNull) {
+    EXPECT_NE(handle->sm_base, nullptr);
+    EXPECT_GT(handle->sm_size, 0u);
+}
+
+TEST_F(SharedMemoryTest, IsOwner) {
+    EXPECT_TRUE(handle->is_owner);
+}
+
+TEST_F(SharedMemoryTest, HeaderInitValues) {
+    auto* hdr = handle->header;
+    EXPECT_EQ(hdr->orchestrator_done.load(), 0);
+    EXPECT_EQ(hdr->orch_error_code.load(), 0);
+    EXPECT_EQ(hdr->sched_error_bitmap.load(), 0);
+    EXPECT_EQ(hdr->sched_error_code.load(), 0);
+
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        auto& fc = hdr->rings[r].fc;
+        EXPECT_EQ(fc.current_task_index.load(), 0);
+        EXPECT_EQ(fc.last_task_alive.load(), 0);
+    }
+}
+
+TEST_F(SharedMemoryTest, Validate) {
+    EXPECT_TRUE(pto2_sm_validate(handle));
+}
+
+TEST_F(SharedMemoryTest, PerRingIndependence) {
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        EXPECT_NE(handle->task_descriptors[r], nullptr) << "Ring " << r;
+        EXPECT_NE(handle->task_payloads[r], nullptr) << "Ring " << r;
+    }
+    // Different rings should have different pointers
+    for (int r = 1; r < PTO2_MAX_RING_DEPTH; r++) {
+        EXPECT_NE(handle->task_descriptors[r], handle->task_descriptors[0]) << "Ring " << r;
+    }
+}
+
+TEST_F(SharedMemoryTest, PointerAlignment) {
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        auto addr = reinterpret_cast<uintptr_t>(handle->task_descriptors[r]);
+        EXPECT_EQ(addr % PTO2_ALIGN_SIZE, 0u) << "Ring " << r << " descriptors not aligned";
+    }
+}
+
+TEST(SharedMemoryCalcSize, NonZero) {
+    uint64_t size = pto2_sm_calculate_size(PTO2_TASK_WINDOW_SIZE);
+    EXPECT_GT(size, 0u);
+}
+
+TEST(SharedMemoryCalcSize, LargerWindowGivesLargerSize) {
+    uint64_t small_size = pto2_sm_calculate_size(64);
+    uint64_t large_size = pto2_sm_calculate_size(256);
+    EXPECT_GT(large_size, small_size);
+}
+
+TEST(SharedMemoryCalcSize, HeaderAligned) {
+    EXPECT_EQ(sizeof(PTO2SharedMemoryHeader) % PTO2_ALIGN_SIZE, 0u);
+}
diff --git a/tests/ut/cpp/test_submit_types.cpp b/tests/ut/cpp/test_submit_types.cpp
new file mode 100644
index 000000000..0d0856f77
--- /dev/null
+++ b/tests/ut/cpp/test_submit_types.cpp
@@ -0,0 +1,144 @@
+/**
+ * Unit tests for pto_submit_types.h
+ *
+ * Tests submit contract types: subtask masks, resource shapes,
+ * active mask derivation, and launch spec.
+ */
+
+#include <gtest/gtest.h>
+
+#include "pto_submit_types.h"
+
+// =============================================================================
+// pto2_subtask_active
+// =============================================================================
+
+TEST(SubtaskActive, AICMaskActivatesAICSlot) {
+    EXPECT_TRUE(pto2_subtask_active(PTO2_SUBTASK_MASK_AIC, PTO2SubtaskSlot::AIC));
+    EXPECT_FALSE(pto2_subtask_active(PTO2_SUBTASK_MASK_AIC, PTO2SubtaskSlot::AIV0));
+    EXPECT_FALSE(pto2_subtask_active(PTO2_SUBTASK_MASK_AIC, PTO2SubtaskSlot::AIV1));
+}
+
+TEST(SubtaskActive, AIV0MaskActivatesAIV0Slot) {
+    EXPECT_FALSE(pto2_subtask_active(PTO2_SUBTASK_MASK_AIV0, PTO2SubtaskSlot::AIC));
+    EXPECT_TRUE(pto2_subtask_active(PTO2_SUBTASK_MASK_AIV0, PTO2SubtaskSlot::AIV0));
+    EXPECT_FALSE(pto2_subtask_active(PTO2_SUBTASK_MASK_AIV0, PTO2SubtaskSlot::AIV1));
+}
+
+TEST(SubtaskActive, AIV1MaskActivatesAIV1Slot) {
+    EXPECT_FALSE(pto2_subtask_active(PTO2_SUBTASK_MASK_AIV1, PTO2SubtaskSlot::AIC));
+    EXPECT_FALSE(pto2_subtask_active(PTO2_SUBTASK_MASK_AIV1, PTO2SubtaskSlot::AIV0));
+    EXPECT_TRUE(pto2_subtask_active(PTO2_SUBTASK_MASK_AIV1, PTO2SubtaskSlot::AIV1));
+}
+
+TEST(SubtaskActive, CombinedMask) {
+    uint8_t mask = PTO2_SUBTASK_MASK_AIC | PTO2_SUBTASK_MASK_AIV1;
+    EXPECT_TRUE(pto2_subtask_active(mask, PTO2SubtaskSlot::AIC));
+    EXPECT_FALSE(pto2_subtask_active(mask, PTO2SubtaskSlot::AIV0));
+    EXPECT_TRUE(pto2_subtask_active(mask, PTO2SubtaskSlot::AIV1));
+}
+
+TEST(SubtaskActive, AllActive) {
+    uint8_t mask = PTO2_SUBTASK_MASK_AIC | PTO2_SUBTASK_MASK_AIV0 | PTO2_SUBTASK_MASK_AIV1;
+    EXPECT_TRUE(pto2_subtask_active(mask, PTO2SubtaskSlot::AIC));
+    EXPECT_TRUE(pto2_subtask_active(mask, PTO2SubtaskSlot::AIV0));
+    EXPECT_TRUE(pto2_subtask_active(mask, PTO2SubtaskSlot::AIV1));
+}
+
+// =============================================================================
+// pto2_active_mask_to_shape
+// =============================================================================
+
+TEST(ActiveMaskToShape, SingleAIC) {
+    EXPECT_EQ(pto2_active_mask_to_shape(PTO2_SUBTASK_MASK_AIC), PTO2ResourceShape::AIC);
+}
+
+TEST(ActiveMaskToShape, SingleAIV0) {
+    EXPECT_EQ(pto2_active_mask_to_shape(PTO2_SUBTASK_MASK_AIV0), PTO2ResourceShape::AIV);
+}
+
+TEST(ActiveMaskToShape, SingleAIV1) {
+    EXPECT_EQ(pto2_active_mask_to_shape(PTO2_SUBTASK_MASK_AIV1), PTO2ResourceShape::AIV);
+}
+
+TEST(ActiveMaskToShape, TwoActiveBecomesMIX) {
+    uint8_t mask = PTO2_SUBTASK_MASK_AIC | PTO2_SUBTASK_MASK_AIV0;
+    EXPECT_EQ(pto2_active_mask_to_shape(mask), PTO2ResourceShape::MIX);
+}
+
+TEST(ActiveMaskToShape, AllThreeBecomesMIX) {
+    uint8_t mask = PTO2_SUBTASK_MASK_AIC | PTO2_SUBTASK_MASK_AIV0 | PTO2_SUBTASK_MASK_AIV1;
+    EXPECT_EQ(pto2_active_mask_to_shape(mask), PTO2ResourceShape::MIX);
+}
+
+// =============================================================================
+// pto2_mixed_kernels_to_active_mask
+// =============================================================================
+
+TEST(MixedKernelsToMask, AllInvalid) {
+    MixedKernels mk;
+    EXPECT_EQ(pto2_mixed_kernels_to_active_mask(mk), 0);
+}
+
+TEST(MixedKernelsToMask, AICOnly) {
+    MixedKernels mk;
+    mk.aic_kernel_id = 42;
+    EXPECT_EQ(pto2_mixed_kernels_to_active_mask(mk), PTO2_SUBTASK_MASK_AIC);
+}
+
+TEST(MixedKernelsToMask, AIV0Only) {
+    MixedKernels mk;
+    mk.aiv0_kernel_id = 7;
+    EXPECT_EQ(pto2_mixed_kernels_to_active_mask(mk), PTO2_SUBTASK_MASK_AIV0);
+}
+
+TEST(MixedKernelsToMask, AllValid) {
+    MixedKernels mk;
+    mk.aic_kernel_id = 1;
+    mk.aiv0_kernel_id = 2;
+    mk.aiv1_kernel_id = 3;
+    uint8_t expected = PTO2_SUBTASK_MASK_AIC | PTO2_SUBTASK_MASK_AIV0 | PTO2_SUBTASK_MASK_AIV1;
+    EXPECT_EQ(pto2_mixed_kernels_to_active_mask(mk), expected);
+}
+
+// =============================================================================
+// MixedKernels defaults
+// =============================================================================
+
+TEST(MixedKernels, DefaultsAreInvalid) {
+    MixedKernels mk;
+    EXPECT_EQ(mk.aic_kernel_id, INVALID_KERNEL_ID);
+    EXPECT_EQ(mk.aiv0_kernel_id, INVALID_KERNEL_ID);
+    EXPECT_EQ(mk.aiv1_kernel_id, INVALID_KERNEL_ID);
+}
+
+// =============================================================================
+// PTO2LaunchSpec
+// =============================================================================
+
+TEST(LaunchSpec, DefaultBlockNumIsOne) {
+    PTO2LaunchSpec spec;
+    EXPECT_EQ(spec.block_num(), 1);
+}
+
+TEST(LaunchSpec, SetAndGet) {
+    PTO2LaunchSpec spec;
+    spec.set_block_num(4);
+    EXPECT_EQ(spec.block_num(), 4);
+}
+
+// =============================================================================
+// Constants
+// =============================================================================
+
+TEST(Constants, SubtaskSlotCount) {
+    EXPECT_EQ(PTO2_SUBTASK_SLOT_COUNT, 3);
+}
+
+TEST(Constants, NumResourceShapes) {
+    EXPECT_EQ(PTO2_NUM_RESOURCE_SHAPES, 3);
+}
+
+TEST(Constants, InvalidKernelId) {
+    EXPECT_EQ(INVALID_KERNEL_ID, -1);
+}
diff --git a/tests/ut/cpp/test_tensor.cpp b/tests/ut/cpp/test_tensor.cpp
new file mode 100644
index 000000000..2a40354b6
--- /dev/null
+++ b/tests/ut/cpp/test_tensor.cpp
@@ -0,0 +1,354 @@
+/**
+ * Unit tests for Tensor and related types in tensor.h
+ *
+ * Tests Tensor operations, TensorCreateInfo, Segment intersection,
+ * and boundary conditions in cache-line layout coupling.
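+ *
+ * Segment here models a half-open range [begin, end): two segments are
+ * treated as intersecting iff a.end > b.begin && b.end > a.begin, which is
+ * why touching ranges do not intersect but a zero-length segment strictly
+ * inside a range does (see the Segment tests below).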
+ */
+
+#include <gtest/gtest.h>
+
+#include <cstring>
+
+#include "pto_orchestration_api.h"
+
+// Helper: create a Tensor via make_tensor_external (the public factory)
+static Tensor make_test_tensor(void* addr, uint64_t size, const uint32_t shapes[],
+                               uint32_t ndims, DataType dtype = DataType::FLOAT32,
+                               bool manual_dep = false, int32_t version = 0) {
+    return make_tensor_external(addr, shapes, ndims, dtype, manual_dep, version);
+}
+
+// =============================================================================
+// Segment intersection
+// =============================================================================
+
+TEST(Segment, OverlappingIntersects) {
+    Segment a{0, 10};
+    Segment b{5, 15};
+    EXPECT_TRUE(a.line_segment_intersection(b));
+    EXPECT_TRUE(b.line_segment_intersection(a));
+}
+
+TEST(Segment, TouchingDoesNotIntersect) {
+    Segment a{0, 10};
+    Segment b{10, 20};
+    EXPECT_FALSE(a.line_segment_intersection(b));
+    EXPECT_FALSE(b.line_segment_intersection(a));
+}
+
+TEST(Segment, DisjointDoesNotIntersect) {
+    Segment a{0, 5};
+    Segment b{10, 20};
+    EXPECT_FALSE(a.line_segment_intersection(b));
+    EXPECT_FALSE(b.line_segment_intersection(a));
+}
+
+TEST(Segment, ZeroLengthAtBoundary) {
+    // Zero-length segment at position 10 touching [0,10)
+    Segment a{10, 10};
+    Segment b{0, 10};
+    EXPECT_FALSE(a.line_segment_intersection(b));
+}
+
+TEST(Segment, ZeroLengthInsideRange) {
+    // Zero-length segment at position 5 inside [0,10)
+    // end(5) > other.begin(0) && other.end(10) > begin(5) => true
+    // KNOWN BEHAVIOR: zero-length segments report intersection.
+    // This could cause spurious dependencies in TensorMap overlap detection.
+    Segment a{5, 5};
+    Segment b{0, 10};
+    EXPECT_TRUE(a.line_segment_intersection(b));
+}
+
+TEST(Segment, IdenticalRanges) {
+    Segment a{0, 10};
+    EXPECT_TRUE(a.line_segment_intersection(a));
+}
+
+TEST(Segment, ContainsFull) {
+    Segment outer{0, 20};
+    Segment inner{5, 10};
+    EXPECT_TRUE(outer.contains(inner));
+}
+
+TEST(Segment, ContainsIdentical) {
+    Segment a{0, 10};
+    EXPECT_TRUE(a.contains(a));
+}
+
+TEST(Segment, DoesNotContainPartial) {
+    Segment a{0, 10};
+    Segment b{5, 15};
+    EXPECT_FALSE(a.contains(b));
+}
+
+TEST(Segment, ContainsAtBoundary) {
+    Segment a{0, 10};
+    Segment b{0, 10};
+    EXPECT_TRUE(a.contains(b));
+}
+
+// =============================================================================
+// TensorCreateInfo
+// =============================================================================
+
+TEST(TensorCreateInfo, BufferSizeBytes) {
+    uint32_t shapes[] = {4, 8};
+    TensorCreateInfo ci(shapes, 2, DataType::FLOAT32);
+    EXPECT_EQ(ci.buffer_size_bytes(), 4u * 8u * 4u);  // 4*8 elements * 4 bytes
+}
+
+TEST(TensorCreateInfo, BufferSizeBytesInt8) {
+    uint32_t shapes[] = {10, 20, 30};
+    TensorCreateInfo ci(shapes, 3, DataType::INT8);
+    EXPECT_EQ(ci.buffer_size_bytes(), 10u * 20u * 30u * 1u);
+}
+
+TEST(TensorCreateInfo, SizeIs64Bytes) {
+    EXPECT_EQ(sizeof(TensorCreateInfo), 64u);
+}
+
+TEST(TensorCreateInfo, InitialValueDefault) {
+    uint32_t shapes[] = {4};
+    TensorCreateInfo ci(shapes, 1);
+    EXPECT_FALSE(ci.has_initial_value);
+}
+
+TEST(TensorCreateInfo, SetInitialValue) {
+    uint32_t shapes[] = {4};
+    TensorCreateInfo ci(shapes, 1);
+    ci.set_initial_value(3.14f);
+    EXPECT_TRUE(ci.has_initial_value);
+}
+
+// =============================================================================
+// Tensor basic operations
+// =============================================================================
+
+TEST(Tensor, SizeIs128Bytes) {
+    EXPECT_EQ(sizeof(Tensor), 128u);
+}
+
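+// Tensor spans two 64-byte cache lines: metadata in the first, raw_shapes
+// opening the second (offset 64). TensorCreateInfo mirrors that second line,
+// which the LayoutCoupling test at the end of this file checks field by field.
+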
+TEST(Tensor, RawShapesAtOffset64) {
+    EXPECT_EQ(offsetof(Tensor, raw_shapes), 64u);
+}
+
+TEST(Tensor, MakeExternal) {
+    char buf[256];
+    uint32_t shapes[] = {4, 8};
+    auto t = make_test_tensor(buf, sizeof(buf), shapes, 2);
+    EXPECT_EQ(t.buffer.addr, reinterpret_cast<uint64_t>(buf));
+    EXPECT_EQ(t.ndims, 2u);
+    EXPECT_EQ(t.shapes[0], 4u);
+    EXPECT_EQ(t.shapes[1], 8u);
+}
+
+TEST(Tensor, Numel) {
+    char buf[256];
+    uint32_t shapes[] = {4, 8, 2};
+    auto t = make_test_tensor(buf, sizeof(buf), shapes, 3);
+    EXPECT_EQ(t.numel(), 64u);
+}
+
+TEST(Tensor, NumelZeroDim) {
+    char buf[256];
+    uint32_t shapes[] = {};
+    auto t = make_test_tensor(buf, sizeof(buf), shapes, 0);
+    EXPECT_EQ(t.numel(), 0u);
+}
+
+TEST(Tensor, IsContiguousWhenRawEqShapes) {
+    char buf[256];
+    uint32_t shapes[] = {4, 8};
+    auto t = make_test_tensor(buf, sizeof(buf), shapes, 2);
+    EXPECT_TRUE(t.is_raw_eq_shapes);
+    EXPECT_TRUE(t.is_contiguous());
+}
+
+TEST(Tensor, IsSameMemref) {
+    char buf1[256], buf2[256];
+    uint32_t shapes[] = {4};
+    auto t1 = make_test_tensor(buf1, sizeof(buf1), shapes, 1);
+    auto t2 = make_test_tensor(buf1, sizeof(buf1), shapes, 1);
+    auto t3 = make_test_tensor(buf2, sizeof(buf2), shapes, 1);
+    EXPECT_TRUE(t1.is_same_memref(t2));
+    EXPECT_FALSE(t1.is_same_memref(t3));
+}
+
+// =============================================================================
+// View
+// =============================================================================
+
+TEST(Tensor, ViewWithZeroOffsets) {
+    char buf[256];
+    uint32_t shapes[] = {10, 20};
+    auto parent = make_test_tensor(buf, sizeof(buf), shapes, 2);
+
+    uint32_t view_shapes[] = {5, 10};
+    uint32_t view_offsets[] = {0, 0};
+    auto v = parent.view(view_shapes, view_offsets);
+
+    EXPECT_EQ(v.shapes[0], 5u);
+    EXPECT_EQ(v.shapes[1], 10u);
+    EXPECT_TRUE(v.is_all_offset_zero);
+    EXPECT_EQ(v.buffer.addr, parent.buffer.addr);
+}
+
+TEST(Tensor, ViewWithNonZeroOffsets) {
+    char buf[256];
+    uint32_t shapes[] = {10, 20};
+    auto parent = make_test_tensor(buf, sizeof(buf), shapes, 2);
+
+    uint32_t view_shapes[] = {5, 10};
+    uint32_t view_offsets[] = {2, 3};
+    auto v = parent.view(view_shapes, view_offsets);
+
+    EXPECT_EQ(v.shapes[0], 5u);
+    EXPECT_EQ(v.shapes[1], 10u);
+    EXPECT_FALSE(v.is_all_offset_zero);
+    EXPECT_EQ(v.offsets[0], 2u);
+    EXPECT_EQ(v.offsets[1], 3u);
+}
+
+TEST(Tensor, ViewOffsetAccumulation) {
+    char buf[256];
+    uint32_t shapes[] = {20, 30};
+    auto parent = make_test_tensor(buf, sizeof(buf), shapes, 2);
+
+    // First view with offsets
+    uint32_t v1_shapes[] = {10, 15};
+    uint32_t v1_offsets[] = {5, 10};
+    auto v1 = parent.view(v1_shapes, v1_offsets);
+
+    // Second view on top of first
+    uint32_t v2_shapes[] = {3, 4};
+    uint32_t v2_offsets[] = {1, 2};
+    auto v2 = v1.view(v2_shapes, v2_offsets);
+
+    EXPECT_EQ(v2.offsets[0], 6u);   // 5 + 1
+    EXPECT_EQ(v2.offsets[1], 12u);  // 10 + 2
+}
+
+// =============================================================================
+// Reshape
+// =============================================================================
+
+TEST(Tensor, ReshapeContiguous) {
+    char buf[256];
+    uint32_t shapes[] = {4, 8};
+    auto t = make_test_tensor(buf, sizeof(buf), shapes, 2);
+
+    uint32_t new_shapes[] = {32};
+    auto r = t.reshape(new_shapes, 1);
+
+    EXPECT_EQ(r.numel(), 32u);
+    EXPECT_EQ(r.ndims, 1u);
+    EXPECT_EQ(r.shapes[0], 32u);
+    EXPECT_TRUE(r.is_raw_eq_shapes);
+    EXPECT_TRUE(r.is_all_offset_zero);
+}
+
+TEST(Tensor, ReshapePreservesBuffer) {
+    char buf[256];
+    uint32_t shapes[] = {4, 8};
+    auto t = make_test_tensor(buf, sizeof(buf), shapes, 2);
+
+    uint32_t new_shapes[] = {2, 16};
+    auto r = t.reshape(new_shapes, 2);
+
+    EXPECT_EQ(r.buffer.addr, t.buffer.addr);
+}
+
+// =============================================================================
+// Transpose
+// =============================================================================
+
+TEST(Tensor, TransposeSwapsDims) {
+    char buf[256];
+    uint32_t shapes[] = {4, 8, 2};
+    auto t = make_test_tensor(buf, sizeof(buf), shapes, 3);
+
+    auto tr = t.transpose(0, 2);
+
+    EXPECT_EQ(tr.shapes[0], 2u);
+    EXPECT_EQ(tr.shapes[1], 8u);
+    EXPECT_EQ(tr.shapes[2], 4u);
+    EXPECT_EQ(tr.numel(), t.numel());
+}
+
+// =============================================================================
+// compute_flat_offset
+// =============================================================================
+
+TEST(Tensor, ComputeFlatOffsetZeroDim) {
+    char buf[256];
+    uint32_t shapes[] = {};
+    auto t = make_test_tensor(buf, sizeof(buf), shapes, 0);
+    uint32_t indices[] = {};
+    EXPECT_EQ(t.compute_flat_offset(indices, 0), 0u);
+}
+
+TEST(Tensor, ComputeFlatOffset1D) {
+    char buf[256];
+    uint32_t shapes[] = {10};
+    auto t = make_test_tensor(buf, sizeof(buf), shapes, 1);
+    uint32_t indices[] = {7};
+    EXPECT_EQ(t.compute_flat_offset(indices, 1), 7u);
+}
+
+TEST(Tensor, ComputeFlatOffset2D) {
+    char buf[256];
+    uint32_t shapes[] = {4, 8};
+    auto t = make_test_tensor(buf, sizeof(buf), shapes, 2);
+    // Row-major: offset = i0 * 8 + i1 = 2*8+3 = 19
+    uint32_t indices[] = {2, 3};
+    EXPECT_EQ(t.compute_flat_offset(indices, 2), 19u);
+}
+
+// =============================================================================
+// update_start_offset
+// =============================================================================
+
+TEST(Tensor, UpdateStartOffsetZeroOffsets) {
+    char buf[256];
+    uint32_t shapes[] = {4, 8};
+    auto t = make_test_tensor(buf, sizeof(buf), shapes, 2);
+    t.update_start_offset();
+    EXPECT_EQ(t.start_offset, 0u);
+}
+
+// =============================================================================
+// fill_initial_value
+// =============================================================================
+
+TEST(Tensor, FillInitialValue) {
+    alignas(64) char buf[128];
+    memset(buf, 0, sizeof(buf));
+
+    uint32_t shapes[] = {32};
+    TensorCreateInfo ci(shapes, 1, DataType::FLOAT32);
+    ci.set_initial_value(1.0f);
+
+    // Use make_tensor_external then overwrite with init_from_create_info
+    auto t = make_tensor_external(buf, shapes, 1);
+    t.init_from_create_info(ci, buf, sizeof(buf));
+
+    // Check that the buffer was filled with 1.0f
+    float* data = reinterpret_cast<float*>(buf);
+    for (int i = 0; i < 32; i++) {
+        EXPECT_FLOAT_EQ(data[i], 1.0f) << "Mismatch at index " << i;
+    }
+}
+
+// =============================================================================
+// Layout coupling: TensorCreateInfo <-> Tensor cacheline 1
+// =============================================================================
+
+TEST(LayoutCoupling, TensorCreateInfoMatchesTensor) {
+    // These static_asserts are in tensor.h but we verify they compile here
+    static_assert(offsetof(TensorCreateInfo, version) == offsetof(Tensor, version));
+    static_assert(offsetof(TensorCreateInfo, dtype) == offsetof(Tensor, dtype));
+    static_assert(offsetof(TensorCreateInfo, ndims) == offsetof(Tensor, ndims));
+    static_assert(offsetof(TensorCreateInfo, is_all_offset_zero) == offsetof(Tensor, is_all_offset_zero));
+    SUCCEED();
+}
diff --git a/tests/ut/cpp/test_tensormap.cpp b/tests/ut/cpp/test_tensormap.cpp
new file mode 100644
index 000000000..8e5ea5da9
--- /dev/null
+++ b/tests/ut/cpp/test_tensormap.cpp
@@ -0,0 +1,270 @@
+/**
+ * Unit tests for PTO2TensorMap and check_overlap from pto_tensormap.h
+ */
+
+#include <gtest/gtest.h>
+
+#include <cstring>
+
+#include "pto_orchestration_api.h"
+#include "pto_tensormap.h"
+
+// =============================================================================
+// TensorMapEntry::check_overlap tests
+// =============================================================================
+
+class CheckOverlapTest : public ::testing::Test {
+protected:
+    alignas(64) PTO2TensorMapEntry entry;
+    char buf[256];
+
+    void SetUp() override {
+        memset(&entry, 0, sizeof(entry));
+        memset(buf, 0, sizeof(buf));
+    }
+
+    Tensor make_input(uint32_t shapes[], uint32_t ndims, int32_t version = 0) {
+        return make_tensor_external(buf, shapes, ndims, DataType::FLOAT32, false, version);
+    }
+
+    void setup_entry(uint32_t shapes[], uint32_t ndims, int32_t version = 0) {
+        entry.buffer_addr = reinterpret_cast<uint64_t>(buf);
+        entry.ndims = ndims;
+        entry.version = version;
+        entry.is_all_offset_zero = true;
+        for (uint32_t i = 0; i < ndims; i++) {
+            entry.shapes[i] = shapes[i];
+        }
+    }
+};
+
+TEST_F(CheckOverlapTest, IdenticalShapesZeroOffsets) {
+    uint32_t shapes[] = {10, 20};
+    setup_entry(shapes, 2);
+    auto input = make_input(shapes, 2);
+    EXPECT_EQ(entry.check_overlap(input), OverlapStatus::COVERED);
+}
+
+TEST_F(CheckOverlapTest, InputLargerThanOutput) {
+    uint32_t entry_shapes[] = {5, 10};
+    uint32_t input_shapes[] = {10, 20};
+    setup_entry(entry_shapes, 2);
+    auto input = make_input(input_shapes, 2);
+    EXPECT_EQ(entry.check_overlap(input), OverlapStatus::COVERED);
+}
+
+TEST_F(CheckOverlapTest, InputSmallerThanOutput) {
+    uint32_t entry_shapes[] = {10, 20};
+    uint32_t input_shapes[] = {5, 10};
+    setup_entry(entry_shapes, 2);
+    auto input = make_input(input_shapes, 2);
+    EXPECT_EQ(entry.check_overlap(input), OverlapStatus::OTHER);
+}
+
+TEST_F(CheckOverlapTest, VersionMismatch) {
+    // input.version > entry.version -> returns OTHER (not NO_OVERLAP)
+    // This means version bumps create dependencies (intentional)
+    uint32_t shapes[] = {10};
+    setup_entry(shapes, 1, /*version=*/0);
+    auto input = make_input(shapes, 1, /*version=*/1);
+    EXPECT_EQ(entry.check_overlap(input), OverlapStatus::OTHER);
+}
+
+TEST_F(CheckOverlapTest, DisjointOffsetsWithNonZeroEntry) {
+    // Entry covers [10,20), input covers [0,5) -> NO_OVERLAP
+    uint32_t entry_shapes[] = {10};
+    uint32_t input_shapes[] = {5};
+    setup_entry(entry_shapes, 1);
+    entry.is_all_offset_zero = false;
+    entry.offsets[0] = 10;
+
+    auto input = make_input(input_shapes, 1);
+    // Input is [0,5), entry is [10,20)
+    EXPECT_EQ(entry.check_overlap(input), OverlapStatus::NO_OVERLAP);
+}
+
+// =============================================================================
+// TensorMap lifecycle tests
+// =============================================================================
+
+class TensorMapTest : public ::testing::Test {
+protected:
+    PTO2TensorMap tmap;
+    int32_t window_sizes[PTO2_MAX_RING_DEPTH] = {16, 16, 16, 16};
+
+    void SetUp() override {
+        bool ok = tmap.init(64, 256, window_sizes);
+        ASSERT_TRUE(ok);
+        // Initialize last_task_alives to 0 for all rings
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+            tmap.last_task_alives[r] = 0;
+        }
+    }
+
+    void TearDown() override {
+        tmap.destroy();
+    }
+};
+
+TEST_F(TensorMapTest, InitSucceeds) {
+    EXPECT_EQ(tmap.num_buckets, 64);
+    EXPECT_EQ(tmap.pool_size, 256);
+}
+
+TEST_F(TensorMapTest, HashDistribution) {
+    // Aligned addresses should distribute across buckets
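+    // (0x1000-aligned pointers differ only above the low 12 bits, so the
+    // hash presumably mixes high bits rather than just masking low ones)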
+    uint64_t addr1 = 0x1000;
+    uint64_t addr2 = 0x2000;
+    uint64_t addr3 = 0x3000;
+    uint32_t h1 = tmap.hash(addr1);
+    uint32_t h2 = tmap.hash(addr2);
+    uint32_t h3 = tmap.hash(addr3);
+    // At least some should be different
+    EXPECT_TRUE(h1 != h2 || h2 != h3);
+    // All within bucket range
+    EXPECT_LT(h1, 64u);
+    EXPECT_LT(h2, 64u);
+    EXPECT_LT(h3, 64u);
+}
+
+TEST_F(TensorMapTest, InsertAndLookupExact) {
+    char buf[256];
+    uint32_t shapes[] = {10, 20};
+    auto tensor = make_tensor_external(buf, shapes, 2);
+    auto task_id = pto2_make_task_id(0, 5);
+
+    tmap.insert(tensor, task_id, false);
+
+    PTO2LookupResult result;
+    tmap.lookup(tensor, result);
+    EXPECT_GE(result.count, 1);
+}
+
+TEST_F(TensorMapTest, LookupNoMatch) {
+    char buf1[256], buf2[256];
+    uint32_t shapes[] = {10};
+    auto tensor1 = make_tensor_external(buf1, shapes, 1);
+    auto tensor2 = make_tensor_external(buf2, shapes, 1);
+    auto task_id = pto2_make_task_id(0, 0);
+
+    tmap.insert(tensor1, task_id, false);
+
+    PTO2LookupResult result;
+    tmap.lookup(tensor2, result);
+    EXPECT_EQ(result.count, 0);
+}
+
+TEST_F(TensorMapTest, LookupStaleEntrySkipped) {
+    char buf[256];
+    uint32_t shapes[] = {10};
+    auto tensor = make_tensor_external(buf, shapes, 1);
+    auto task_id = pto2_make_task_id(0, 0);
+
+    tmap.insert(tensor, task_id, false);
+
+    // Invalidate: advance last_task_alives past this task
+    tmap.sync_validity(0, 5);
+
+    PTO2LookupResult result;
+    tmap.lookup(tensor, result);
+    EXPECT_EQ(result.count, 0);
+}
+
+TEST_F(TensorMapTest, MultipleSameBucket) {
+    // Insert multiple entries for the same address
+    char buf[256];
+    uint32_t shapes[] = {10};
+    auto tensor = make_tensor_external(buf, shapes, 1);
+
+    tmap.insert(tensor, pto2_make_task_id(0, 0), false);
+    tmap.insert(tensor, pto2_make_task_id(0, 1), false);
+    tmap.insert(tensor, pto2_make_task_id(0, 2), false);
+
+    PTO2LookupResult result;
+    tmap.lookup(tensor, result);
+    EXPECT_EQ(result.count, 3);
+}
+
+TEST_F(TensorMapTest, CleanupRetired) {
+    char buf[256];
+    uint32_t shapes[] = {10};
+    auto tensor = make_tensor_external(buf, shapes, 1);
+
+    // Insert entries for tasks 0..4
+    for (int i = 0; i < 5; i++) {
+        tmap.insert(tensor, pto2_make_task_id(0, i), false);
+    }
+
+    // Retire tasks 0..3
+    tmap.cleanup_retired(0, 0, 4);
+    tmap.sync_validity(0, 4);
+
+    PTO2LookupResult result;
+    tmap.lookup(tensor, result);
+    EXPECT_EQ(result.count, 1);  // Only task 4 remains
+}
+
+TEST_F(TensorMapTest, NewEntryFreeListPriority) {
+    // Allocate, free, allocate again -> should reuse freed entry
+    PTO2TensorMapEntry* e1 = tmap.new_entry();
+    ASSERT_NE(e1, nullptr);
+    // Link entry so we can free it
+    e1->bucket_index = 0;
+    e1->prev_in_bucket = nullptr;
+    e1->next_in_bucket = nullptr;
+    e1->next_in_task = nullptr;
+    e1->prev_in_task = nullptr;
+    tmap.buckets[0] = e1;
+
+    tmap.free_entry(*e1);
+
+    PTO2TensorMapEntry* e2 = tmap.new_entry();
+    EXPECT_EQ(e1, e2);  // Reused from free list
+}
+
+TEST_F(TensorMapTest, EntryValidBoundary) {
+    alignas(64) PTO2TensorMapEntry entry;
+    memset(&entry, 0, sizeof(entry));
+
+    // local_id == last_task_alive -> valid (not yet retired)
+    entry.producer_task_id = pto2_make_task_id(0, 5);
+    tmap.last_task_alives[0] = 5;
+    EXPECT_TRUE(tmap.entry_valid(entry));
+
+    // local_id < last_task_alive -> stale
+    tmap.last_task_alives[0] = 6;
+    EXPECT_FALSE(tmap.entry_valid(entry));
+}
+
+TEST_F(TensorMapTest, MultiRingInterleaving) {
+    char buf[256];
+    uint32_t shapes[] = {10};
+    auto tensor = make_tensor_external(buf, shapes, 1);
+
+    // Insert entries from ring 0 and ring 1
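+    // (rings retire independently: invalidating ring 0 must not touch ring 1)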
+    tmap.insert(tensor, pto2_make_task_id(0, 0), false);
+    tmap.insert(tensor, pto2_make_task_id(1, 0), false);
+    tmap.insert(tensor, pto2_make_task_id(0, 1), false);
+
+    // Retire ring 0 tasks
+    tmap.cleanup_retired(0, 0, 2);
+    tmap.sync_validity(0, 2);
+
+    // Ring 1 entry should still be valid
+    PTO2LookupResult result;
+    tmap.lookup(tensor, result);
+    EXPECT_EQ(result.count, 1);
+    EXPECT_EQ(result.entries[0].entry->producer_task_id.ring(), 1);
+}
+
+// =============================================================================
+// Static assertions (compile-time checks)
+// =============================================================================
+
+TEST(TensorMapLayout, EntrySizeIs128) {
+    EXPECT_EQ(sizeof(PTO2TensorMapEntry), 128u);
+}
+
+TEST(TensorMapLayout, CacheLine2StartsAt64) {
+    EXPECT_EQ(offsetof(PTO2TensorMapEntry, prev_in_bucket), 64u);
+}
diff --git a/tests/ut/test_elf_parser.py b/tests/ut/test_elf_parser.py
new file mode 100644
index 000000000..25dffa407
--- /dev/null
+++ b/tests/ut/test_elf_parser.py
@@ -0,0 +1,207 @@
+"""Tests for python/elf_parser.py - ELF64 and Mach-O .text extraction."""
+
+import struct
+import sys
+import tempfile
+from pathlib import Path
+
+import pytest
+
+_python_dir = str(Path(__file__).resolve().parent.parent.parent / "python")
+if _python_dir not in sys.path:
+    sys.path.insert(0, _python_dir)
+
+from elf_parser import _extract_cstring, extract_text_section
+
+
+def _build_elf64_with_text(text_data: bytes) -> bytes:
+    """Build a minimal ELF64 .o file with a .text section."""
+    # String table: \0.text\0.shstrtab\0
+    strtab = b"\x00.text\x00.shstrtab\x00"
+    text_name_offset = 1  # offset of ".text" in strtab
+    shstrtab_name_offset = 7  # offset of ".shstrtab" in strtab
+
+    # ELF header (64 bytes)
+    e_shoff = 64  # section headers right after ELF header
+    e_shnum = 3  # null + .text + .shstrtab
+    e_shstrndx = 2  # .shstrtab is section 2
+
+    elf_header = bytearray(64)
+    elf_header[0:4] = b"\x7fELF"
+    elf_header[4] = 2  # 64-bit
+    elf_header[5] = 1  # little-endian
+    elf_header[6] = 1  # version
+    # Remaining header fields from offset 16 (reconstructed; e_type=ET_REL,
+    # machine/flags left generic): type, machine, version, entry, phoff,
+    # shoff, flags, ehsize, phentsize, phnum, shentsize, shnum, shstrndx
+    struct.pack_into("<HHIQQQIHHHHHH", elf_header, 16,
+                     1, 0, 1, 0, 0, e_shoff, 0,
+                     64, 0, 0, 64, e_shnum, e_shstrndx)
+
+    # Section headers (3 x 64 bytes), then .text payload, then .shstrtab
+    text_offset = e_shoff + e_shnum * 64
+    strtab_offset = text_offset + len(text_data)
+
+    def shdr(name_off, sh_type, offset, size):
+        # sh_name, sh_type, sh_flags, sh_addr, sh_offset, sh_size,
+        # sh_link, sh_info, sh_addralign, sh_entsize
+        return struct.pack("<IIQQQQIIQQ", name_off, sh_type, 0, 0,
+                           offset, size, 0, 0, 1, 0)
+
+    sections = (
+        shdr(0, 0, 0, 0)  # SHT_NULL
+        + shdr(text_name_offset, 1, text_offset, len(text_data))  # SHT_PROGBITS .text
+        + shdr(shstrtab_name_offset, 3, strtab_offset, len(strtab))  # SHT_STRTAB
+    )
+    return bytes(elf_header) + sections + text_data + strtab
+
+
+def _build_macho_with_text(text_data: bytes) -> bytes:
+    """Build a minimal Mach-O 64-bit .o file with __text section."""
+    # Header (32 bytes)
+    header = bytearray(32)
+    struct.pack_into("