From c0d67fe48bd1e2999a2e1a199ec0399767ef4436 Mon Sep 17 00:00:00 2001 From: Kaitlyn Davis Date: Thu, 12 Feb 2026 08:35:25 -0800 Subject: [PATCH 1/3] prepared: add quantized matmul prepared context APIs What: Add optional prepare/execute/release APIs for repeated quantized matmul with stable descriptors. Why: Repeated-shape inference loops otherwise redo setup and scratch handling each call. Expected impact: Lower per-call CPU overhead for repeated quantized matmul workloads; additive opt-in API. Tests: add dedicated coverage for prepared lifecycle and invariants (tests/testDriver_quantized_matmul_prepared_context.c). Signed-off-by: Kaitlyn Davis Signed-off-by: Kaitlyn Davis --- ...Driver_quantized_matmul_prepared_context.c | 219 ++++++++++++++++++ zdnn/prepared_quantized_matmul.c | 199 ++++++++++++++++ zdnn/zdnn.h | 39 ++++ zdnn/zdnn.map | 3 + 4 files changed, 460 insertions(+) create mode 100644 tests/testDriver_quantized_matmul_prepared_context.c create mode 100644 zdnn/prepared_quantized_matmul.c diff --git a/tests/testDriver_quantized_matmul_prepared_context.c b/tests/testDriver_quantized_matmul_prepared_context.c new file mode 100644 index 0000000..7bcb74f --- /dev/null +++ b/tests/testDriver_quantized_matmul_prepared_context.c @@ -0,0 +1,219 @@ +// SPDX-License-Identifier: Apache-2.0 +/* + * Copyright IBM Corp. 2024 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "testsupport.h" + +#include +#include + +void setUp(void) {} +void tearDown(void) {} + +static void init_quantized_matmul_tensors(zdnn_ztensor *input_a, + zdnn_ztensor *input_b, + zdnn_ztensor *input_c, + zdnn_ztensor *output, + zdnn_tensor_desc *a_desc, + zdnn_tensor_desc *b_desc, + zdnn_tensor_desc *c_desc, + zdnn_tensor_desc *out_desc) { + // Matmul shape family: + // input_a [S,1,M,N] = [2,1,4,3] + // input_b [S,1,N,P] = [2,1,3,5] + // input_c [S,1,1,P] = [2,1,1,5] + // output [S,1,M,P] = [2,1,4,5] + init_transformed_desc(ZDNN_NHWC, ZDNN_BINARY_INT8, ZDNN_FORMAT_4DFEATURE, + a_desc, 2, 1, 4, 3); + init_transformed_desc(ZDNN_NHWC, ZDNN_BINARY_INT8, ZDNN_FORMAT_4DWEIGHTS, + b_desc, 2, 1, 3, 5); + init_transformed_desc(ZDNN_NHWC, ZDNN_BINARY_INT8, ZDNN_FORMAT_4DFEATURE, + c_desc, 2, 1, 1, 5); + init_transformed_desc(ZDNN_NHWC, ZDNN_BINARY_INT8, ZDNN_FORMAT_4DFEATURE, + out_desc, 2, 1, 4, 5); + + memset(input_a, 0, sizeof(*input_a)); + memset(input_b, 0, sizeof(*input_b)); + memset(input_c, 0, sizeof(*input_c)); + memset(output, 0, sizeof(*output)); + + input_a->transformed_desc = a_desc; + input_b->transformed_desc = b_desc; + input_c->transformed_desc = c_desc; + output->transformed_desc = out_desc; +} + +void test_prepare_context_rejects_null_or_missing_descriptors(void) { + zdnn_quantized_matmul_prepared_context context = {0}; + zdnn_ztensor input_a = {0}, input_b = {0}, input_c = {0}, output = {0}; + zdnn_tensor_desc a_desc = {0}, b_desc = {0}, c_desc = {0}, out_desc = {0}; + + init_quantized_matmul_tensors(&input_a, &input_b, &input_c, &output, &a_desc, + &b_desc, &c_desc, &out_desc); + + TEST_ASSERT_EQUAL_UINT32( + ZDNN_INVALID_BUFFER, + zdnn_prepare_quantized_matmul_context(&input_a, &input_b, &input_c, false, + NULL)); + + input_b.transformed_desc = NULL; + TEST_ASSERT_EQUAL_UINT32( + ZDNN_INVALID_BUFFER, + zdnn_prepare_quantized_matmul_context(&input_a, &input_b, &input_c, false, + &context)); +} + +void 
test_prepare_context_allocates_required_work_area(void) { + zdnn_quantized_matmul_prepared_context context = {0}; + zdnn_ztensor input_a = {0}, input_b = {0}, input_c = {0}, output = {0}; + zdnn_tensor_desc a_desc = {0}, b_desc = {0}, c_desc = {0}, out_desc = {0}; + + init_quantized_matmul_tensors(&input_a, &input_b, &input_c, &output, &a_desc, + &b_desc, &c_desc, &out_desc); + + uint64_t expected_work_area = + zdnn_get_quantized_matmul_work_area_size(&input_c, false); + TEST_ASSERT_TRUE(expected_work_area > 0); + + TEST_ASSERT_EQUAL_UINT32( + ZDNN_OK, zdnn_prepare_quantized_matmul_context( + &input_a, &input_b, &input_c, false, &context)); + + TEST_ASSERT_EQUAL_UINT32(NNPA_MATMUL_OP, context.function_code); + TEST_ASSERT_EQUAL_UINT64(expected_work_area, context.required_work_area_size); + TEST_ASSERT_NOT_NULL(context.work_area_scratch.buffer); + TEST_ASSERT_EQUAL_UINT64(expected_work_area, context.work_area_scratch.size); + TEST_ASSERT_EQUAL_UINT64(0, (uintptr_t)context.work_area_scratch.buffer % + AIU_PAGESIZE_IN_BYTES); + + zdnn_release_quantized_matmul_context(&context); + TEST_ASSERT_NULL(context.work_area_scratch.buffer); + TEST_ASSERT_EQUAL_UINT64(0, context.work_area_scratch.size); + TEST_ASSERT_EQUAL_UINT64(0, context.required_work_area_size); +} + +void test_prepare_context_precomputed_path_has_no_work_area(void) { + zdnn_quantized_matmul_prepared_context context = {0}; + zdnn_ztensor input_a = {0}, input_b = {0}, input_c = {0}, output = {0}; + zdnn_tensor_desc a_desc = {0}, b_desc = {0}, c_desc = {0}, out_desc = {0}; + + init_quantized_matmul_tensors(&input_a, &input_b, &input_c, &output, &a_desc, + &b_desc, &c_desc, &out_desc); + + TEST_ASSERT_EQUAL_UINT32( + ZDNN_OK, zdnn_prepare_quantized_matmul_context( + &input_a, &input_b, &input_c, true, &context)); + + TEST_ASSERT_TRUE(context.pre_computed); + TEST_ASSERT_EQUAL_UINT64(0, context.required_work_area_size); + TEST_ASSERT_NULL(context.work_area_scratch.buffer); + TEST_ASSERT_EQUAL_UINT64(0, 
context.work_area_scratch.size); + + zdnn_release_quantized_matmul_context(&context); +} + +void test_prepare_context_rejects_non_int8_bias_when_not_precomputed(void) { + zdnn_quantized_matmul_prepared_context context = {0}; + zdnn_ztensor input_a = {0}, input_b = {0}, input_c = {0}, output = {0}; + zdnn_tensor_desc a_desc = {0}, b_desc = {0}, c_desc = {0}, out_desc = {0}; + + init_quantized_matmul_tensors(&input_a, &input_b, &input_c, &output, &a_desc, + &b_desc, &c_desc, &out_desc); + + c_desc.type = ZDNN_DLFLOAT16; + + TEST_ASSERT_EQUAL_UINT32( + ZDNN_INVALID_TYPE, + zdnn_prepare_quantized_matmul_context(&input_a, &input_b, &input_c, false, + &context)); +} + +void test_prepared_op_rejects_unprepared_context(void) { + zdnn_quantized_matmul_prepared_context context = {0}; + zdnn_ztensor input_a = {0}, input_b = {0}, input_c = {0}, output = {0}; + zdnn_tensor_desc a_desc = {0}, b_desc = {0}, c_desc = {0}, out_desc = {0}; + + init_quantized_matmul_tensors(&input_a, &input_b, &input_c, &output, &a_desc, + &b_desc, &c_desc, &out_desc); + + TEST_ASSERT_EQUAL_UINT32( + ZDNN_INVALID_BUFFER, + zdnn_quantized_matmul_op_prepared(&context, &input_a, &input_b, &input_c, + MATMUL_OP_ADDITION, 0, 0, false, false, + &output)); +} + +void test_prepared_op_rejects_shape_drift(void) { + zdnn_quantized_matmul_prepared_context context = {0}; + zdnn_ztensor input_a = {0}, input_b = {0}, input_c = {0}, output = {0}; + zdnn_tensor_desc a_desc = {0}, b_desc = {0}, c_desc = {0}, out_desc = {0}; + + init_quantized_matmul_tensors(&input_a, &input_b, &input_c, &output, &a_desc, + &b_desc, &c_desc, &out_desc); + + TEST_ASSERT_EQUAL_UINT32( + ZDNN_OK, zdnn_prepare_quantized_matmul_context( + &input_a, &input_b, &input_c, false, &context)); + + zdnn_tensor_desc a_desc_mismatch = a_desc; + a_desc_mismatch.dim2 += 1; + zdnn_ztensor input_a_mismatch = input_a; + input_a_mismatch.transformed_desc = &a_desc_mismatch; + + TEST_ASSERT_EQUAL_UINT32( + ZDNN_INVALID_SHAPE, + 
zdnn_quantized_matmul_op_prepared( + &context, &input_a_mismatch, &input_b, &input_c, MATMUL_OP_ADDITION, + 0, 0, false, false, &output)); + + zdnn_release_quantized_matmul_context(&context); +} + +void test_prepared_op_rejects_precomputed_offset_violation(void) { + zdnn_quantized_matmul_prepared_context context = {0}; + zdnn_ztensor input_a = {0}, input_b = {0}, input_c = {0}, output = {0}; + zdnn_tensor_desc a_desc = {0}, b_desc = {0}, c_desc = {0}, out_desc = {0}; + + init_quantized_matmul_tensors(&input_a, &input_b, &input_c, &output, &a_desc, + &b_desc, &c_desc, &out_desc); + + TEST_ASSERT_EQUAL_UINT32( + ZDNN_OK, zdnn_prepare_quantized_matmul_context( + &input_a, &input_b, &input_c, true, &context)); + + input_b.offset = 1.f; + TEST_ASSERT_EQUAL_UINT32( + ZDNN_INVALID_OFFSET, + zdnn_quantized_matmul_op_prepared(&context, &input_a, &input_b, &input_c, + MATMUL_OP_ADDITION, 0, 0, false, false, + &output)); + + zdnn_release_quantized_matmul_context(&context); +} + +int main(void) { + UNITY_BEGIN(); + + RUN_TEST(test_prepare_context_rejects_null_or_missing_descriptors); + RUN_TEST(test_prepare_context_allocates_required_work_area); + RUN_TEST(test_prepare_context_precomputed_path_has_no_work_area); + RUN_TEST(test_prepare_context_rejects_non_int8_bias_when_not_precomputed); + RUN_TEST(test_prepared_op_rejects_unprepared_context); + RUN_TEST(test_prepared_op_rejects_shape_drift); + RUN_TEST(test_prepared_op_rejects_precomputed_offset_violation); + + return UNITY_END(); +} diff --git a/zdnn/prepared_quantized_matmul.c b/zdnn/prepared_quantized_matmul.c new file mode 100644 index 0000000..99e5b3f --- /dev/null +++ b/zdnn/prepared_quantized_matmul.c @@ -0,0 +1,199 @@ +// SPDX-License-Identifier: Apache-2.0 +/* + * Copyright IBM Corp. 2024 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "zdnn.h" +#include "zdnn_private.h" + +#ifdef __MVS__ +#pragma export(zdnn_prepare_quantized_matmul_context) +#pragma export(zdnn_quantized_matmul_op_prepared) +#pragma export(zdnn_release_quantized_matmul_context) +#endif + +#define ZDNN_PREPARED_QMATMUL_CONTEXT_MAGIC UINT64_C(0x514D415450524550) + +static bool has_valid_tfrmd_desc(const zdnn_ztensor *ztensor) { + return ztensor && ztensor->transformed_desc; +} + +static zdnn_status verify_prepared_desc_match( + const char *tensor_name, const zdnn_tensor_desc *expected, + const zdnn_ztensor *actual_tensor) { + if (!has_valid_tfrmd_desc(actual_tensor)) { + return ZDNN_STATUS(ZDNN_INVALID_BUFFER, + "%s ztensor or transformed descriptor is NULL", + tensor_name); + } + + const zdnn_tensor_desc *actual = actual_tensor->transformed_desc; + + if (actual->layout != expected->layout) { + return ZDNN_STATUS( + ZDNN_INVALID_LAYOUT, + "%s transformed layout mismatch (found %s, expected %s)", tensor_name, + get_data_layout_str(actual->layout), get_data_layout_str(expected->layout)); + } + + if (actual->format != expected->format) { + return ZDNN_STATUS( + ZDNN_INVALID_FORMAT, + "%s transformed format mismatch (found %s (%d), expected %s (%d))", + tensor_name, get_data_format_str(actual->format), actual->format, + get_data_format_str(expected->format), expected->format); + } + + if (actual->type != expected->type) { + return ZDNN_STATUS( + ZDNN_INVALID_TYPE, "%s transformed type mismatch (found %s (%d), " + "expected %s (%d))", + tensor_name, get_data_type_str(actual->type), 
actual->type, + get_data_type_str(expected->type), expected->type); + } + + if (actual->dim4 != expected->dim4 || actual->dim3 != expected->dim3 || + actual->dim2 != expected->dim2 || actual->dim1 != expected->dim1) { + return ZDNN_STATUS( + ZDNN_INVALID_SHAPE, + "%s transformed dims mismatch (found [%u,%u,%u,%u], expected " + "[%u,%u,%u,%u])", + tensor_name, actual->dim4, actual->dim3, actual->dim2, actual->dim1, + expected->dim4, expected->dim3, expected->dim2, expected->dim1); + } + + return ZDNN_STATUS_OK; +} + +zdnn_status zdnn_prepare_quantized_matmul_context( + const zdnn_ztensor *input_a, const zdnn_ztensor *input_b, + const zdnn_ztensor *input_c, bool pre_computed, + zdnn_quantized_matmul_prepared_context *context) { + if (!context || !has_valid_tfrmd_desc(input_a) || + !has_valid_tfrmd_desc(input_b) || !has_valid_tfrmd_desc(input_c)) { + return ZDNN_STATUS_NO_MSG(ZDNN_INVALID_BUFFER); + } + + // Only attempt to reuse/release previous scratch if this context was + // initialized by this API previously. + if (context->reserved_internal != ZDNN_PREPARED_QMATMUL_CONTEXT_MAGIC) { + memset(context, 0, sizeof(*context)); + } + + context->function_code = get_matmul_function(input_a->transformed_desc->dim4, + input_b->transformed_desc->dim4); + context->pre_computed = pre_computed; + context->input_a_desc = *input_a->transformed_desc; + context->input_b_desc = *input_b->transformed_desc; + context->input_c_desc = *input_c->transformed_desc; + context->required_work_area_size = + zdnn_get_quantized_matmul_work_area_size(input_c, pre_computed); + + // In non-precomputed mode input_c must be transformed int8 for qc_tilde + // scratch sizing and later execution. 
+ if (!pre_computed && + input_c->transformed_desc->type != ZDNN_BINARY_INT8) { + return ZDNN_STATUS( + ZDNN_INVALID_TYPE, + "input_c tensor type is invalid for prepared quantized matmul " + "(found %s (%d), expects ZDNN_BINARY_INT8 (8))", + get_data_type_str(input_c->transformed_desc->type), + input_c->transformed_desc->type); + } + + if (context->required_work_area_size == 0) { + // Precomputed path does not require internal work area. + zdnn_release_scratch_buffer(&context->work_area_scratch); + } else { + zdnn_status reserve_status = zdnn_reserve_scratch_buffer( + &context->work_area_scratch, context->required_work_area_size); + if (reserve_status != ZDNN_OK) { + return reserve_status; + } + } + + context->reserved_internal = ZDNN_PREPARED_QMATMUL_CONTEXT_MAGIC; + return ZDNN_STATUS_OK; +} + +zdnn_status zdnn_quantized_matmul_op_prepared( + const zdnn_quantized_matmul_prepared_context *context, + const zdnn_ztensor *input_a, const zdnn_ztensor *input_b, + const zdnn_ztensor *input_c, zdnn_matmul_ops op_type, const int8_t clip_min, + const int8_t clip_max, const bool disable_clipping, const bool dequantize, + zdnn_ztensor *output) { + if (!context || context->reserved_internal != + ZDNN_PREPARED_QMATMUL_CONTEXT_MAGIC || + !output) { + return ZDNN_STATUS_NO_MSG(ZDNN_INVALID_BUFFER); + } + + zdnn_status status = verify_prepared_desc_match("input_a", &context->input_a_desc, + input_a); + if (status != ZDNN_OK) { + return status; + } + + status = verify_prepared_desc_match("input_b", &context->input_b_desc, input_b); + if (status != ZDNN_OK) { + return status; + } + + status = verify_prepared_desc_match("input_c", &context->input_c_desc, input_c); + if (status != ZDNN_OK) { + return status; + } + + if (context->pre_computed && input_b->offset != 0.f) { + return ZDNN_STATUS(ZDNN_INVALID_OFFSET, + "input_b offset (Zb) is invalid when pre_computed=true " + "(found %f, expects %f)", + input_b->offset, 0.f); + } + + void *work_area = NULL; + if (!context->pre_computed) { 
+ work_area = context->work_area_scratch.buffer; + if (!work_area || + context->work_area_scratch.size < context->required_work_area_size) { + return ZDNN_STATUS( + ZDNN_INVALID_BUFFER, + "prepared quantized matmul context work_area is invalid " + "(buffer=%p, size=%" PRIu64 ", required=%" PRIu64 ")", + work_area, context->work_area_scratch.size, + context->required_work_area_size); + } + } + + return aiu_quantized_matmul( + NNPA_PARMBLKFORMAT_1, context->function_code, input_a, input_b, input_c, + op_type, clip_min, clip_max, work_area, output, dequantize, + disable_clipping, context->pre_computed); +} + +void zdnn_release_quantized_matmul_context( + zdnn_quantized_matmul_prepared_context *context) { + if (!context) { + return; + } + + if (context->reserved_internal == ZDNN_PREPARED_QMATMUL_CONTEXT_MAGIC) { + zdnn_release_scratch_buffer(&context->work_area_scratch); + } + + memset(context, 0, sizeof(*context)); +} diff --git a/zdnn/zdnn.h b/zdnn/zdnn.h index 0c9cacc..2ed4afa 100644 --- a/zdnn/zdnn.h +++ b/zdnn/zdnn.h @@ -374,6 +374,30 @@ typedef struct zdnn_ztensor { char reserved2[20]; // not currently used, should contain zeros. } zdnn_ztensor; +// Prepared context for repeated zdnn_quantized_matmul_op() execution with +// stable tensor descriptors. +// +// Callers should: +// 1) zero-initialize the struct +// 2) call zdnn_prepare_quantized_matmul_context() once +// 3) call zdnn_quantized_matmul_op_prepared() repeatedly +// 4) call zdnn_release_quantized_matmul_context() when done +// +// Notes: +// - Existing zdnn_quantized_matmul_op() remains fully supported. +// - The prepared API is an optional performance helper for repeated shapes. 
+typedef struct zdnn_quantized_matmul_prepared_context { + uint64_t reserved_internal; // library-managed marker + uint64_t required_work_area_size; + nnpa_function_code function_code; + bool pre_computed; + char reserved[3]; + zdnn_scratch_buffer work_area_scratch; + zdnn_tensor_desc input_a_desc; + zdnn_tensor_desc input_b_desc; + zdnn_tensor_desc input_c_desc; +} zdnn_quantized_matmul_prepared_context; + #define ZDNN_VERSION "1.2.0" #define ZDNN_VERNUM 0x010200 // 0x[major][minor][patch] #define ZDNN_VER_MAJOR 1 @@ -564,6 +588,21 @@ zdnn_status zdnn_quantized_matmul_op( const int8_t clip_max, const bool disable_clipping, const bool dequantize, const bool pre_computed, void *work_area, zdnn_ztensor *output); +zdnn_status zdnn_prepare_quantized_matmul_context( + const zdnn_ztensor *input_a, const zdnn_ztensor *input_b, + const zdnn_ztensor *input_c, bool pre_computed, + zdnn_quantized_matmul_prepared_context *context); + +zdnn_status zdnn_quantized_matmul_op_prepared( + const zdnn_quantized_matmul_prepared_context *context, + const zdnn_ztensor *input_a, const zdnn_ztensor *input_b, + const zdnn_ztensor *input_c, zdnn_matmul_ops op_type, const int8_t clip_min, + const int8_t clip_max, const bool disable_clipping, const bool dequantize, + zdnn_ztensor *output); + +void zdnn_release_quantized_matmul_context( + zdnn_quantized_matmul_prepared_context *context); + // ----------------------------------------------------------------------------- // External Norm Operations // ----------------------------------------------------------------------------- diff --git a/zdnn/zdnn.map b/zdnn/zdnn.map index d80d3a1..1bf258a 100644 --- a/zdnn/zdnn.map +++ b/zdnn/zdnn.map @@ -132,6 +132,9 @@ ZDNN_1.0 { zdnn_matmul_bcast_op; zdnn_matmul_transpose_op; zdnn_quantized_matmul_op; + zdnn_prepare_quantized_matmul_context; + zdnn_quantized_matmul_op_prepared; + zdnn_release_quantized_matmul_context; zdnn_batchnorm; zdnn_norm; zdnn_moments; From bfd35c07ccc732bb1cbd5d4b22cc8f970ea275ad 
Mon Sep 17 00:00:00 2001 From: Kaitlyn Davis Date: Thu, 12 Feb 2026 09:18:57 -0800 Subject: [PATCH 2/3] prepared: compute qc_tilde work_area size internally The prepared quantized matmul context previously depended on the separate work-area sizing helper API. Compute qc_tilde sizing locally instead so this PR remains self-contained (only requires scratch-buffer support) and does not introduce a hidden dependency on work_area.c helper changes. Signed-off-by: Kaitlyn Davis Signed-off-by: Kaitlyn Davis --- ...Driver_quantized_matmul_prepared_context.c | 9 ++++- zdnn/prepared_quantized_matmul.c | 40 ++++++++++++++++++- 2 files changed, 45 insertions(+), 4 deletions(-) diff --git a/tests/testDriver_quantized_matmul_prepared_context.c b/tests/testDriver_quantized_matmul_prepared_context.c index 7bcb74f..d246a82 100644 --- a/tests/testDriver_quantized_matmul_prepared_context.c +++ b/tests/testDriver_quantized_matmul_prepared_context.c @@ -84,8 +84,13 @@ void test_prepare_context_allocates_required_work_area(void) { init_quantized_matmul_tensors(&input_a, &input_b, &input_c, &output, &a_desc, &b_desc, &c_desc, &out_desc); - uint64_t expected_work_area = - zdnn_get_quantized_matmul_work_area_size(&input_c, false); + // qc_tilde uses input_c's transformed dims/layout/format but promotes the + // element type to DLFLOAT16. 
+ zdnn_tensor_desc qc_tilde_desc = {0}; + init_transformed_desc(c_desc.layout, ZDNN_DLFLOAT16, c_desc.format, + &qc_tilde_desc, c_desc.dim4, c_desc.dim3, c_desc.dim2, + c_desc.dim1); + uint64_t expected_work_area = zdnn_getsize_ztensor(&qc_tilde_desc); TEST_ASSERT_TRUE(expected_work_area > 0); TEST_ASSERT_EQUAL_UINT32( diff --git a/zdnn/prepared_quantized_matmul.c b/zdnn/prepared_quantized_matmul.c index 99e5b3f..f72aed8 100644 --- a/zdnn/prepared_quantized_matmul.c +++ b/zdnn/prepared_quantized_matmul.c @@ -32,6 +32,41 @@ static bool has_valid_tfrmd_desc(const zdnn_ztensor *ztensor) { return ztensor && ztensor->transformed_desc; } +// Determine required work-area bytes for quantized matmul when pre_computed is +// false. +// +// Notes: +// - This intentionally does not rely on the separate work-area sizing helper +// API so the prepared context stays self-contained and only depends on the +// scratch-buffer infrastructure. +// - The work area is used for qc_tilde, which reuses input_c's transformed +// dimensions/layout/format but promotes the element type to DLFLOAT16. +static uint64_t get_required_qc_tilde_work_area_size(const zdnn_ztensor *input_c, + bool pre_computed) { + if (pre_computed) { + return 0; + } + + if (!has_valid_tfrmd_desc(input_c)) { + return 0; + } + + // When pre_computed is false, aiu_quantized_matmul requires input_c + // transformed type to be int8. 
+ if (input_c->transformed_desc->type != ZDNN_BINARY_INT8) { + return 0; + } + + zdnn_tensor_desc qc_tilde_desc; + init_transformed_desc( + input_c->transformed_desc->layout, ZDNN_DLFLOAT16, + input_c->transformed_desc->format, &qc_tilde_desc, + input_c->transformed_desc->dim4, input_c->transformed_desc->dim3, + input_c->transformed_desc->dim2, input_c->transformed_desc->dim1); + + return zdnn_getsize_ztensor(&qc_tilde_desc); +} + static zdnn_status verify_prepared_desc_match( const char *tensor_name, const zdnn_tensor_desc *expected, const zdnn_ztensor *actual_tensor) { @@ -100,8 +135,6 @@ zdnn_status zdnn_prepare_quantized_matmul_context( context->input_a_desc = *input_a->transformed_desc; context->input_b_desc = *input_b->transformed_desc; context->input_c_desc = *input_c->transformed_desc; - context->required_work_area_size = - zdnn_get_quantized_matmul_work_area_size(input_c, pre_computed); // In non-precomputed mode input_c must be transformed int8 for qc_tilde // scratch sizing and later execution. @@ -115,6 +148,9 @@ zdnn_status zdnn_prepare_quantized_matmul_context( input_c->transformed_desc->type); } + context->required_work_area_size = + get_required_qc_tilde_work_area_size(input_c, pre_computed); + if (context->required_work_area_size == 0) { // Precomputed path does not require internal work area. zdnn_release_scratch_buffer(&context->work_area_scratch); From b502b3e0df8c3f8b8d8a16f7bff9ae555e246c6f Mon Sep 17 00:00:00 2001 From: Kaitlyn Davis Date: Thu, 12 Feb 2026 10:14:25 -0800 Subject: [PATCH 3/3] prepared: decouple context from scratch API dependency What: Replace prepared-context scratch API usage with internal aligned buffer management so the branch can stand alone on main. Why: Issue 12 previously depended on Issue 11 types/functions; this change removes that branch dependency while preserving behavior. Expected impact: Prepared quantized matmul remains functionally equivalent and independently PR-able to main. 
Tests: update prepared-context tests for internal buffer fields. Signed-off-by: Kaitlyn Davis Signed-off-by: Kaitlyn Davis --- ...Driver_quantized_matmul_prepared_context.c | 14 ++--- zdnn/prepared_quantized_matmul.c | 52 ++++++++++++++++--- zdnn/zdnn.h | 3 +- 3 files changed, 53 insertions(+), 16 deletions(-) diff --git a/tests/testDriver_quantized_matmul_prepared_context.c b/tests/testDriver_quantized_matmul_prepared_context.c index d246a82..99cdab5 100644 --- a/tests/testDriver_quantized_matmul_prepared_context.c +++ b/tests/testDriver_quantized_matmul_prepared_context.c @@ -99,14 +99,14 @@ void test_prepare_context_allocates_required_work_area(void) { TEST_ASSERT_EQUAL_UINT32(NNPA_MATMUL_OP, context.function_code); TEST_ASSERT_EQUAL_UINT64(expected_work_area, context.required_work_area_size); - TEST_ASSERT_NOT_NULL(context.work_area_scratch.buffer); - TEST_ASSERT_EQUAL_UINT64(expected_work_area, context.work_area_scratch.size); - TEST_ASSERT_EQUAL_UINT64(0, (uintptr_t)context.work_area_scratch.buffer % + TEST_ASSERT_NOT_NULL(context.work_area_buffer); + TEST_ASSERT_EQUAL_UINT64(expected_work_area, context.work_area_buffer_size); + TEST_ASSERT_EQUAL_UINT64(0, (uintptr_t)context.work_area_buffer % AIU_PAGESIZE_IN_BYTES); zdnn_release_quantized_matmul_context(&context); - TEST_ASSERT_NULL(context.work_area_scratch.buffer); - TEST_ASSERT_EQUAL_UINT64(0, context.work_area_scratch.size); + TEST_ASSERT_NULL(context.work_area_buffer); + TEST_ASSERT_EQUAL_UINT64(0, context.work_area_buffer_size); TEST_ASSERT_EQUAL_UINT64(0, context.required_work_area_size); } @@ -124,8 +124,8 @@ void test_prepare_context_precomputed_path_has_no_work_area(void) { TEST_ASSERT_TRUE(context.pre_computed); TEST_ASSERT_EQUAL_UINT64(0, context.required_work_area_size); - TEST_ASSERT_NULL(context.work_area_scratch.buffer); - TEST_ASSERT_EQUAL_UINT64(0, context.work_area_scratch.size); + TEST_ASSERT_NULL(context.work_area_buffer); + TEST_ASSERT_EQUAL_UINT64(0, context.work_area_buffer_size); 
zdnn_release_quantized_matmul_context(&context); } diff --git a/zdnn/prepared_quantized_matmul.c b/zdnn/prepared_quantized_matmul.c index f72aed8..26b7b25 100644 --- a/zdnn/prepared_quantized_matmul.c +++ b/zdnn/prepared_quantized_matmul.c @@ -15,6 +15,7 @@ * limitations under the License. */ +#include #include #include "zdnn.h" @@ -32,13 +33,48 @@ static bool has_valid_tfrmd_desc(const zdnn_ztensor *ztensor) { return ztensor && ztensor->transformed_desc; } +static void release_internal_work_area( + zdnn_quantized_matmul_prepared_context *context) { + if (context->work_area_buffer) { + free_aligned_4k(context->work_area_buffer); + } + context->work_area_buffer = NULL; + context->work_area_buffer_size = 0; +} + +static zdnn_status reserve_internal_work_area( + zdnn_quantized_matmul_prepared_context *context, uint64_t min_size) { + if (min_size == 0) { + return ZDNN_STATUS_OK; + } + + if (context->work_area_buffer && + context->work_area_buffer_size >= min_size) { + return ZDNN_STATUS_OK; + } + + if (min_size > SIZE_MAX) { + return ZDNN_STATUS_NO_MSG(ZDNN_ALLOCATION_FAILURE); + } + + void *new_buffer = malloc_aligned_4k((size_t)min_size); + if (!new_buffer) { + return ZDNN_STATUS_NO_MSG(ZDNN_ALLOCATION_FAILURE); + } + + release_internal_work_area(context); + context->work_area_buffer = new_buffer; + context->work_area_buffer_size = min_size; + return ZDNN_STATUS_OK; +} + // Determine required work-area bytes for quantized matmul when pre_computed is // false. // // Notes: // - This intentionally does not rely on the separate work-area sizing helper // API so the prepared context stays self-contained and only depends on the -// scratch-buffer infrastructure. +// prepared-context internals. // - The work area is used for qc_tilde, which reuses input_c's transformed // dimensions/layout/format but promotes the element type to DLFLOAT16. 
static uint64_t get_required_qc_tilde_work_area_size(const zdnn_ztensor *input_c, @@ -153,10 +189,10 @@ zdnn_status zdnn_prepare_quantized_matmul_context( if (context->required_work_area_size == 0) { // Precomputed path does not require internal work area. - zdnn_release_scratch_buffer(&context->work_area_scratch); + release_internal_work_area(context); } else { - zdnn_status reserve_status = zdnn_reserve_scratch_buffer( - &context->work_area_scratch, context->required_work_area_size); + zdnn_status reserve_status = reserve_internal_work_area( + context, context->required_work_area_size); if (reserve_status != ZDNN_OK) { return reserve_status; } @@ -203,14 +239,14 @@ zdnn_status zdnn_quantized_matmul_op_prepared( void *work_area = NULL; if (!context->pre_computed) { - work_area = context->work_area_scratch.buffer; + work_area = context->work_area_buffer; if (!work_area || - context->work_area_scratch.size < context->required_work_area_size) { + context->work_area_buffer_size < context->required_work_area_size) { return ZDNN_STATUS( ZDNN_INVALID_BUFFER, "prepared quantized matmul context work_area is invalid " "(buffer=%p, size=%" PRIu64 ", required=%" PRIu64 ")", - work_area, context->work_area_scratch.size, + work_area, context->work_area_buffer_size, context->required_work_area_size); } } @@ -228,7 +264,7 @@ void zdnn_release_quantized_matmul_context( } if (context->reserved_internal == ZDNN_PREPARED_QMATMUL_CONTEXT_MAGIC) { - zdnn_release_scratch_buffer(&context->work_area_scratch); + release_internal_work_area(context); } memset(context, 0, sizeof(*context)); diff --git a/zdnn/zdnn.h b/zdnn/zdnn.h index 2ed4afa..ab59dc7 100644 --- a/zdnn/zdnn.h +++ b/zdnn/zdnn.h @@ -392,7 +392,8 @@ typedef struct zdnn_quantized_matmul_prepared_context { nnpa_function_code function_code; bool pre_computed; char reserved[3]; - zdnn_scratch_buffer work_area_scratch; + void *work_area_buffer; + uint64_t work_area_buffer_size; zdnn_tensor_desc input_a_desc; zdnn_tensor_desc 
input_b_desc; zdnn_tensor_desc input_c_desc;