From 3f95fcf8cc0cedafa2235fc61f504fcafbcd53cf Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Fri, 5 Jun 2026 08:45:31 +0000 Subject: [PATCH 1/5] Improve Synet API documentation Co-authored-by: Ihar Yermalayeu --- src/Simd/SimdLib.h | 176 ++++++++++++++++++++++++++++++--------------- 1 file changed, 120 insertions(+), 56 deletions(-) diff --git a/src/Simd/SimdLib.h b/src/Simd/SimdLib.h index 336f11c788..fcbe4fec2a 100644 --- a/src/Simd/SimdLib.h +++ b/src/Simd/SimdLib.h @@ -7364,15 +7364,29 @@ extern "C" \fn void* SimdSynetAdd16bInit(const size_t* aShape, size_t aCount, SimdTensorDataType aType, const size_t* bShape, size_t bCount, SimdTensorDataType bType, SimdTensorDataType dstType, SimdTensorFormatType format); - \short Initializes add algorithm. + \short Initializes element-wise addition of two tensors in FP32 or BF16 format. + + The created context adds two tensors with equal shapes: + \verbatim + for(i = 0; i < shapeSize; ++i) + { + A = aType == SimdTensorData16b ? BFloat16ToFloat32(a[i]) : a[i]; + B = bType == SimdTensorData16b ? BFloat16ToFloat32(b[i]) : b[i]; + D = A + B; + dst[i] = dstType == SimdTensorData16b ? Float32ToBFloat16(D) : D; + } + \endverbatim + + The current implementation creates a context only for equal input shapes, FP32/BF16 input and output tensor types, + and SimdTensorFormatUnknown, SimdTensorFormatNchw or SimdTensorFormatNhwc tensor format. \param [in] aShape - a pointer to shape of input A tensor. \param [in] aCount - a count of dimensions of input A tensor. - \param [in] aType - a type of input A tensor. Can be FP32 of BF16. + \param [in] aType - a type of input A tensor. Can be FP32 or BF16. \param [in] bShape - a pointer to shape of input B tensor. \param [in] bCount - a count of dimensions of input B tensor. - \param [in] bType - a type of input B tensor. Can be FP32 of BF16. - \param [in] dstType - a type of output tensor. Can be FP32 of BF16. + \param [in] bType - a type of input B tensor. Can be FP32 or BF16. 
+ \param [in] dstType - a type of output tensor. Can be FP32 or BF16. \param [in] format - a format of input / output tensors. \return a pointer to add context. On error it returns NULL. It must be released with using of function ::SimdRelease. This pointer is used in function ::SimdSynetAdd16bForward. @@ -7383,7 +7397,11 @@ extern "C" \fn void SimdSynetAdd16bForward(void* context, const uint8_t* a, const uint8_t* b, uint8_t* dst); - \short Performs forward propagation of add algorithm. + \short Performs element-wise addition of two FP32/BF16 tensors. + + The function adds corresponding elements of input tensors A and B using a context created by ::SimdSynetAdd16bInit. + The actual data types, tensor shape and output type are stored in the context. BF16 input values are converted to + FP32 before addition, and BF16 output values are converted from FP32 after addition. \param [in] context - a pointer to add context. It must be created by function ::SimdSynetAdd16bInit and released by function ::SimdRelease. \param [in] a - a pointer to input A tensor. @@ -7396,22 +7414,28 @@ extern "C" \fn void SimdSynetAddBias(const float * bias, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format); - \short Adds a bias to given vector. + \short Adds per-channel bias to an FP32 tensor in place. Algorithm's details (example for NCHW tensor format): \verbatim for(c = 0; c < channels; ++c) - for(j = 0; j < spatial; ++j) + for(s = 0; s < spatial; ++s) dst[c*spatial + s] += bias[c]; \endverbatim + Algorithm's details (example for NHWC tensor format): + \verbatim + for(s = 0; s < spatial; ++s) + for(c = 0; c < channels; ++c) + dst[s*channels + c] += bias[c]; + \endverbatim \note This function is used in Synet Framework. \param [in] bias - a pointer to the 32-bit float array with bias coefficients. The size of the array is equal to channels. - \param [in] channels - a number of channels in the image tensor. - \param [in] spatial - a spatial size of image tensor. 
- \param [in, out] dst - a pointer to cumulative 32-bit image tensor. The size of the array is equal to channels * spatial. - \param [in] format - a format of image tensor. + \param [in] channels - a number of channels in the tensor. + \param [in] spatial - a spatial size (height * width) of the tensor. + \param [in, out] dst - a pointer to FP32 tensor updated in place. The size of the array is equal to channels * spatial. + \param [in] format - a format of the tensor. */ SIMD_API void SimdSynetAddBias(const float * bias, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format); @@ -7419,37 +7443,40 @@ extern "C" \fn void SimdSynetAdd8i(const uint8_t * aData, const float * aScale, const float* aShift, const uint8_t* bData, const float* bScale, const float* bShift, uint8_t* cData, const float* cScale, const float* cShift, size_t batch, size_t channels, size_t spatial, SimdTensorFormatType format, SimdSynetCompatibilityType compatibility); - \short Adds two INT8 tensors. + \short Dequantizes, adds and requantizes two UINT8 tensors. - Algorithm's details (example for NCHW tensor format): + Algorithm's details (example for NCHW tensor format): \verbatim + upper = isNarrowed(compatibility) ? 180 : 255; for(b = 0; b < batch; ++b) for(c = 0; c < channels; ++c) for(s = 0; s < spatial; ++s) { offs = (b*channels + c)*spatial + s; - A = aData[offs]*aScale[c] + aShift[c]; + A = aData[offs]*aScale[c] + aShift[c]; B = bData[offs]*bScale[c] + bShift[c]; - cData[offs] = round((A + B)*cScale[c] + cShift[c]); + C = round((A + B)*cScale[c] + cShift[c]); + cData[offs] = restrict(C, 0, upper); } \endverbatim + For NHWC tensor format the same calculation uses offset (b*spatial + s)*channels + c. \note This function is used in Synet Framework. - \param [in] aData - a pointer to the first input 8-bit integer tensor. - \param [in] aScale - a pointer to the 32-bit float array with scale coefficients of the first input tensor. 
- \param [in] aShift - a pointer to the 32-bit float array with shift coefficients of the first input tensor. - \param [in] bData - a pointer to the second input 8-bit integer tensor. - \param [in] bScale - a pointer to the 32-bit float array with scale coefficients of the second input tensor. - \param [in] bShift - a pointer to the 32-bit float array with shift coefficients of the second input tensor. - \param [out] cData - a pointer to the output 8-bit integer tensor. - \param [in] cScale - a pointer to the 32-bit float array with scale coefficients of the output tensor. - \param [in] cShift - a pointer to the 32-bit float array with shift coefficients of the output tensor. - \param [in] batch - a batch size of input and output image tensors. - \param [in] channels - a number of channels in input and output image tensors. - \param [in] spatial - a spatial size of input and output image tensors. - \param [in] format - a format of input and output image tensors. - \param [in] compatibility - a flags of calculation compatibility. + \param [in] aData - a pointer to the first input UINT8 tensor. + \param [in] aScale - a pointer to the 32-bit float array with per-channel scale coefficients of the first input tensor. + \param [in] aShift - a pointer to the 32-bit float array with per-channel shift coefficients of the first input tensor. + \param [in] bData - a pointer to the second input UINT8 tensor. + \param [in] bScale - a pointer to the 32-bit float array with per-channel scale coefficients of the second input tensor. + \param [in] bShift - a pointer to the 32-bit float array with per-channel shift coefficients of the second input tensor. + \param [out] cData - a pointer to the output UINT8 tensor. + \param [in] cScale - a pointer to the 32-bit float array with per-channel scale coefficients of the output tensor. + \param [in] cShift - a pointer to the 32-bit float array with per-channel shift coefficients of the output tensor. 
+ \param [in] batch - a batch size of input and output tensors. + \param [in] channels - a number of channels in input and output tensors. + \param [in] spatial - a spatial size (height * width) of input and output tensors. + \param [in] format - a format of input and output tensors. Can be NCHW or NHWC. + \param [in] compatibility - calculation compatibility flags. When narrowed 8-bit mode is active, output is limited to [0, 180], otherwise to [0, 255]. */ SIMD_API void SimdSynetAdd8i(const uint8_t * aData, const float * aScale, const float* aShift, const uint8_t* bData, const float* bScale, const float* bShift, uint8_t* cData, const float* cScale, const float* cShift, size_t batch, size_t channels, size_t spatial, SimdTensorFormatType format, SimdSynetCompatibilityType compatibility); @@ -7458,20 +7485,30 @@ extern "C" \fn void SimdSynetChannelSum16b(const uint16_t* src, size_t channels, size_t spatial, SimdTensorFormatType format, float* sum); - \short Calculates channels sums in FP32 format for input tensor in BF16 format. + \short Calculates per-channel sums of a BF16 tensor in FP32 format. - Algorithm's details (example for NCHW tensor format) : + Algorithm's details (example for NCHW tensor format): \verbatim for(c = 0; c < channels; ++c) + { sum[c] = 0; for(s = 0; s < spatial; ++s) - sum[c] += src[c, s]; + sum[c] += BFloat16ToFloat32(src[c*spatial + s]); + } + \endverbatim + Algorithm's details (example for NHWC tensor format): + \verbatim + for(c = 0; c < channels; ++c) + sum[c] = 0; + for(s = 0; s < spatial; ++s) + for(c = 0; c < channels; ++c) + sum[c] += BFloat16ToFloat32(src[s*channels + c]); \endverbatim \note This function is used in Synet Framework. - \param [in] src - a pointer to the input 16-bit brain-float tensor. - \param [in] channels - a number of channels in input and output arrays. + \param [in] src - a pointer to the input BF16 tensor. + \param [in] channels - a number of channels in input tensor. 
\param [in] spatial - a spatial (width * height) size of input tensor. \param [in] format - a format of input tensor. \param [out] sum - a pointer to output 32-bit float array with channels sums. @@ -7482,20 +7519,34 @@ extern "C" \fn void SimdSynetConvert32fTo8u(const float * src, size_t batch, size_t channels, size_t height, size_t width, SimdTensorFormatType format, const float* scale, const float * shift, uint8_t * dst, SimdSynetCompatibilityType compatibility); - \short Converts 32-bit float point image to 8-bit unsigned integer image. + \short Converts an FP32 tensor to a UINT8 tensor using per-channel scale and shift. + + Algorithm's details (example for NCHW tensor format): + \verbatim + upper = isNarrowed(compatibility) ? 180 : 255; + for(b = 0; b < batch; ++b) + for(c = 0; c < channels; ++c) + for(h = 0; h < height; ++h) + for(w = 0; w < width; ++w) + { + offs = ((b*channels + c)*height + h)*width + w; + dst[offs] = restrict(round(src[offs]*scale[c] + shift[c]), 0, upper); + } + \endverbatim + For NHWC tensor format the same calculation uses offset ((b*height + h)*width + w)*channels + c. \note This function is used in Synet Framework. - \param [in] src - a pointer to the 32-bit float array with input image tensor. - \param [in] batch - a number of images in the batch of (input/output) image tensor. - \param [in] channels - a number of channels in the (input/output) image tensor. - \param [in] height - a height of (input/output) image tensor. - \param [in] width - a width of (input/output) image tensor. - \param [in] format - a format of (input/output) image tensor. - \param [in] scale - a pointer to the 32-bit float array with scale coefficients. - \param [in] shift - a pointer to the 32-bit float array with shift coefficients. - \param [out] dst - a pointer to the 8-bit unsigned integer array with output image tensor. - \param [in] compatibility - a flags of calculation compatibility. + \param [in] src - a pointer to the FP32 input tensor. 
+ \param [in] batch - a batch size of input and output tensors. + \param [in] channels - a number of channels in input and output tensors. + \param [in] height - a height of input and output tensors. + \param [in] width - a width of input and output tensors. + \param [in] format - a format of input and output tensors. Can be NCHW or NHWC. + \param [in] scale - a pointer to the 32-bit float array with per-channel scale coefficients. + \param [in] shift - a pointer to the 32-bit float array with per-channel shift coefficients. + \param [out] dst - a pointer to the UINT8 output tensor. + \param [in] compatibility - calculation compatibility flags. When narrowed 8-bit mode is active, output is limited to [0, 180], otherwise to [0, 255]. */ SIMD_API void SimdSynetConvert32fTo8u(const float * src, size_t batch, size_t channels, size_t height, size_t width, SimdTensorFormatType format, const float* scale, const float * shift, uint8_t* dst, SimdSynetCompatibilityType compatibility); @@ -7503,20 +7554,33 @@ extern "C" \fn void SimdSynetConvert8uTo32f(const uint8_t* src, size_t batch, size_t channels, size_t height, size_t width, SimdTensorFormatType format, const float* scale, const float* shift, float* dst, SimdSynetCompatibilityType compatibility); - \short Converts 8-bit unsigned integer image to 32-bit float point image. + \short Converts a UINT8 tensor to an FP32 tensor using per-channel scale and shift. + + Algorithm's details (example for NCHW tensor format): + \verbatim + for(b = 0; b < batch; ++b) + for(c = 0; c < channels; ++c) + for(h = 0; h < height; ++h) + for(w = 0; w < width; ++w) + { + offs = ((b*channels + c)*height + h)*width + w; + dst[offs] = src[offs]*scale[c] + shift[c]; + } + \endverbatim + For NHWC tensor format the same calculation uses offset ((b*height + h)*width + w)*channels + c. \note This function is used in Synet Framework. - \param [in] src - a pointer to the 8-bit unsigned integer array with input image tensor. 
- \param [in] batch - a number of images in the batch of (input/output) image tensor. - \param [in] channels - a number of channels in the (input/output) image tensor. - \param [in] height - a height of (input/output) image tensor. - \param [in] width - a width of (input/output) image tensor. - \param [in] format - a format of (input/output) image tensor. - \param [in] scale - a pointer to the 32-bit float array with scale coefficients. - \param [in] shift - a pointer to the 32-bit float array with shift coefficients. - \param [out] dst - a pointer to the array with 32-bit float output image tensor. - \param [in] compatibility - a flags of calculation compatibility. + \param [in] src - a pointer to the UINT8 input tensor. + \param [in] batch - a batch size of input and output tensors. + \param [in] channels - a number of channels in input and output tensors. + \param [in] height - a height of input and output tensors. + \param [in] width - a width of input and output tensors. + \param [in] format - a format of input and output tensors. Can be NCHW or NHWC. + \param [in] scale - a pointer to the 32-bit float array with per-channel scale coefficients. + \param [in] shift - a pointer to the 32-bit float array with per-channel shift coefficients. + \param [out] dst - a pointer to the FP32 output tensor. + \param [in] compatibility - calculation compatibility flags. 
*/ SIMD_API void SimdSynetConvert8uTo32f(const uint8_t* src, size_t batch, size_t channels, size_t height, size_t width, SimdTensorFormatType format, const float* scale, const float* shift, float* dst, SimdSynetCompatibilityType compatibility); From 7beebc0c5764a98e73bb0a2250504d3f56416d60 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Fri, 5 Jun 2026 08:58:44 +0000 Subject: [PATCH 2/5] Improve FP32 convolution API documentation Co-authored-by: Ihar Yermalayeu --- src/Simd/SimdLib.h | 79 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 62 insertions(+), 17 deletions(-) diff --git a/src/Simd/SimdLib.h b/src/Simd/SimdLib.h index fcbe4fec2a..ae90f46877 100644 --- a/src/Simd/SimdLib.h +++ b/src/Simd/SimdLib.h @@ -7588,10 +7588,21 @@ extern "C" \fn void * SimdSynetConvolution32fInit(size_t batch, const SimdConvolutionParameters * conv); - \short Initializes FP32 convolution algorithm. + \short Initializes an FP32 convolution context. + + The function validates convolution parameters and chooses a suitable implementation (direct, depthwise, + Winograd, NHWC-specialized or GEMM-based). It supports FP32 source and destination tensors with matching + NCHW or NHWC format. The destination spatial size must match convolution parameters: + \verbatim + dstH = (srcH + padY + padH - (dilationY*(kernelY - 1) + 1)) / strideY + 1 + dstW = (srcW + padX + padW - (dilationX*(kernelX - 1) + 1)) / strideX + 1 + \endverbatim + + A created context stores tensor shape, format, convolution geometry, group count and activation type. + Weights, bias and activation parameters are attached later by ::SimdSynetConvolution32fSetParams. \param [in] batch - a batch size. - \param [in] conv - a pointer to convolution parameters. + \param [in] conv - a pointer to convolution parameters. Source and destination tensor types must be FP32. \return a pointer to FP32 convolution context. On error it returns NULL. It must be released with using of function ::SimdRelease. 
This pointer is used in functions ::SimdSynetConvolution32fExternalBufferSize, ::SimdSynetConvolution32fInternalBufferSize, ::SimdSynetConvolution32fInfo, ::SimdSynetConvolution32fSetParams and ::SimdSynetConvolution32fForward. @@ -7602,10 +7613,14 @@ extern "C" \fn size_t SimdSynetConvolution32fExternalBufferSize(const void * context); - \short Gets size of external temporary buffer required for FP32 convolution algorithm. + \short Gets the size of the caller-provided temporary buffer for FP32 convolution. + + The returned value is a number of 32-bit float elements, not bytes. It depends on the implementation selected + during initialization and can be used to allocate the \a buf argument of ::SimdSynetConvolution32fForward. + Some implementations return 1 when they do not need external temporary storage. \param [in] context - a pointer to FP32 convolution context. It must be created by function ::SimdSynetConvolution32fInit and released by function ::SimdRelease. - \return size of external temporary buffer required for FP32 convolution algorithm. + \return the number of FP32 elements required for the external temporary buffer. */ SIMD_API size_t SimdSynetConvolution32fExternalBufferSize(const void * context); @@ -7613,10 +7628,14 @@ extern "C" \fn size_t SimdSynetConvolution32fInternalBufferSize(const void * context); - \short Gets size of internal buffer used inside FP32 convolution algorithm. + \short Gets the size of the internal storage used by an FP32 convolution context. + + The returned value is a number of 32-bit float elements, not bytes. It reports internal storage tracked by + the selected implementation, such as internal temporary buffers and implementation-specific reordered weights, + bias or activation parameters already allocated by the context. \param [in] context - a pointer to FP32 convolution context. It must be created by function ::SimdSynetConvolution32fInit and released by function ::SimdRelease. 
- \return size of internal buffer used inside FP32 convolution algorithm. + \return a number of FP32 elements used by internal buffers. */ SIMD_API size_t SimdSynetConvolution32fInternalBufferSize(const void * context); @@ -7624,10 +7643,14 @@ extern "C" \fn const char* SimdSynetConvolution32fInfo(const void* context); - \short Gets description of internal implementation of FP32 convolution algorithm. + \short Gets a short description of the selected FP32 convolution implementation. + + The returned string contains the implementation extension and algorithm name, for example a direct, depthwise, + Winograd, NHWC direct or GEMM-based variant. The returned pointer is owned by the context and remains valid + until the next call of this function for the same context or until the context is released. \param [in] context - a pointer to FP32 convolution context. It must be created by function ::SimdSynetConvolution32fInit and released by function ::SimdRelease. - \return string with description of internal implementation of FP32 convolution algorithm. + \return a string with description of internal implementation of FP32 convolution algorithm. */ SIMD_API const char* SimdSynetConvolution32fInfo(const void* context); @@ -7635,13 +7658,22 @@ extern "C" \fn void SimdSynetConvolution32fSetParams(void * context, const float * weight, SimdBool * internal, const float * bias, const float * params); - \short Sets weights, biases and parameters of activation function required for FP32 convolution algorithm. + \short Sets weights, bias and activation parameters for FP32 convolution. + + This function must be called before ::SimdSynetConvolution32fForward. The \a weight array contains FP32 + convolution weights with kernelY*kernelX*srcC*dstC/group elements. Depending on the selected implementation, + weights can be used directly or transformed and stored inside the context. 
If \a internal is not NULL, the + selected implementation writes the weight storage mode to it: SimdTrue means that weights were transformed and + stored internally, while SimdFalse means that the implementation may use the original \a weight array directly, + so the caller must keep it valid for later forward calls. Bias and activation parameters can also be copied + internally by some implementations; otherwise their pointers are stored in the context. \param [in, out] context - a pointer to FP32 convolution context. It must be created by function ::SimdSynetConvolution32fInit and released by function ::SimdRelease. - \param [in] weight - a pointer to convolution weights. - \param [out] internal - a flag signalizing that weight is stored in the internal buffer. Can be NULL. - \param [in] bias - a pointer to bias. Can be NULL. - \param [in] params - a pointer to parameters of activation functions (see ::SimdConvolutionActivationType). Can be NULL. + \param [in] weight - a pointer to FP32 convolution weights. + \param [out] internal - a pointer to a flag receiving weight ownership mode. Can be NULL. + \param [in] bias - a pointer to FP32 bias array with dstC elements. Can be NULL. + \param [in] params - a pointer to FP32 parameters of activation function (see ::SimdConvolutionActivationType). + Can be NULL when activation does not require parameters. */ SIMD_API void SimdSynetConvolution32fSetParams(void * context, const float * weight, SimdBool * internal, const float * bias, const float * params); @@ -7649,12 +7681,25 @@ extern "C" \fn void SimdSynetConvolution32fForward(void * context, const float * src, float * buf, float * dst); - \short Performs forward propagation of FP32 convolution algorithm. + \short Performs forward propagation of FP32 convolution. + + The function convolves each image in the batch, adds bias when it was set, and applies the activation specified + in ::SimdConvolutionParameters: + \verbatim + sum = bias == NULL ? 
0 : bias[dc]; + for(sc = 0; sc < srcC/group; ++sc) + for(ky = 0; ky < kernelY; ++ky) + for(kx = 0; kx < kernelX; ++kx) + sum += src[inputOffset] * weight[weightOffset]; + dst[outputOffset] = Activate(sum, activation, params); + \endverbatim + The exact offsets depend on tensor format, padding, dilation, stride and group. The input and output tensors + use the shape and format from the context created by ::SimdSynetConvolution32fInit. \param [in] context - a pointer to FP32 convolution context. It must be created by function ::SimdSynetConvolution32fInit and released by function ::SimdRelease. - \param [in] src - a pointer to input tensor. - \param [out] buf - a pointer to external temporary buffer. The size of the external temporary buffer is determined by function ::SimdSynetConvolution32fExternalBufferSize. Can be NULL (it causes usage of internal buffer). - \param [out] dst - a pointer to output tensor. + \param [in] src - a pointer to the FP32 input tensor. + \param [out] buf - a pointer to an external temporary FP32 buffer. The required number of FP32 elements is determined by function ::SimdSynetConvolution32fExternalBufferSize. Can be NULL, in which case an internal buffer is used. + \param [out] dst - a pointer to the FP32 output tensor. 
*/ SIMD_API void SimdSynetConvolution32fForward(void * context, const float * src, float * buf, float * dst); From 2e7e3ba144937bd0715d23bd72b738e381e5e21d Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Fri, 5 Jun 2026 09:02:55 +0000 Subject: [PATCH 3/5] Improve BF16 convolution API documentation Co-authored-by: Ihar Yermalayeu --- src/Simd/SimdLib.h | 84 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 66 insertions(+), 18 deletions(-) diff --git a/src/Simd/SimdLib.h b/src/Simd/SimdLib.h index ae90f46877..1de9cb5d2b 100644 --- a/src/Simd/SimdLib.h +++ b/src/Simd/SimdLib.h @@ -7707,11 +7707,24 @@ extern "C" \fn void * SimdSynetConvolution16bInit(size_t batch, const SimdConvolutionParameters * conv, SimdSynetCompatibilityType compatibility); - \short Initializes BF16 convolution algorithm. + \short Initializes a BF16/FP32 convolution context. + + The function validates convolution parameters and chooses a suitable BF16-oriented implementation (GEMM, + NCHW/NHWC GEMM, NHWC depthwise, NHWC special convolution or AMX-BF16 variant when available). It supports + FP32 or BF16 source and destination tensors with matching NCHW or NHWC format. The destination spatial size + must match convolution parameters: + \verbatim + dstH = (srcH + padY + padH - (dilationY*(kernelY - 1) + 1)) / strideY + 1 + dstW = (srcW + padX + padW - (dilationX*(kernelX - 1) + 1)) / strideX + 1 + \endverbatim + + A created context stores tensor shape, data types, format, convolution geometry, group count, activation type + and compatibility flags. FP32 weights, bias and activation parameters are attached later by + ::SimdSynetConvolution16bSetParams. \param [in] batch - a batch size. - \param [in] conv - a pointer to convolution parameters. - \param [in] compatibility - a flags of calculation compatibility. + \param [in] conv - a pointer to convolution parameters. Source and destination tensor types must be FP32 or BF16. + \param [in] compatibility - calculation compatibility flags. 
\return a pointer to BF16 convolution context. On error it returns NULL. It must be released with using of function ::SimdRelease. This pointer is used in functions ::SimdSynetConvolution16bExternalBufferSize, ::SimdSynetConvolution16bInternalBufferSize, ::SimdSynetConvolution16bInfo, ::SimdSynetConvolution16bSetParams and ::SimdSynetConvolution16bForward. @@ -7722,10 +7735,14 @@ extern "C" \fn size_t SimdSynetConvolution16bExternalBufferSize(const void * context); - \short Gets size in bytes of external temporary buffer required for BF16 convolution algorithm. + \short Gets the size in bytes of caller-provided temporary buffer for BF16 convolution. + + The returned value is a number of bytes. It depends on the implementation selected during initialization and + can be used to allocate the \a buf argument of ::SimdSynetConvolution16bForward. Some implementations return 1 + or 0 when they do not need external temporary storage. \param [in] context - a pointer to BF16 convolution context. It must be created by function ::SimdSynetConvolution16bInit and released by function ::SimdRelease. - \return size of external temporary buffer required for BF16 convolution algorithm. + \return a number of bytes required for external temporary buffer. */ SIMD_API size_t SimdSynetConvolution16bExternalBufferSize(const void* context); @@ -7733,10 +7750,13 @@ extern "C" \fn size_t SimdSynetConvolution16bInternalBufferSize(const void * context); - \short Gets size (in bytes) of internal buffer used inside BF16 convolution algorithm. + \short Gets the size in bytes of internal storage used by a BF16 convolution context. + + The returned value reports internal storage tracked by the selected implementation, including internal + temporary buffers, transformed weights, copied bias and copied activation parameters. \param [in] context - a pointer to BF16 convolution context. It must be created by function ::SimdSynetConvolution16bInit and released by function ::SimdRelease. 
- \return size of internal buffer used inside BF16 convolution algorithm. + \return a number of bytes used by internal buffers. */ SIMD_API size_t SimdSynetConvolution16bInternalBufferSize(const void* context); @@ -7744,23 +7764,34 @@ extern "C" \fn const char* SimdSynetConvolution16bInfo(const void* context); - \short Gets description of internal implementation of BF16 convolution algorithm. + \short Gets a short description of the selected BF16 convolution implementation. + + The returned string contains the implementation extension and algorithm name, for example a GEMM, NCHW/NHWC + GEMM, NHWC depthwise, NHWC special or AMX-BF16 variant. The returned pointer is owned by the context and + remains valid until the next call of this function for the same context or until the context is released. \param [in] context - a pointer to BF16 convolution context. It must be created by function ::SimdSynetConvolution16bInit and released by function ::SimdRelease. - \return string with description of internal implementation of BF16 convolution algorithm. + \return a string with description of internal implementation of BF16 convolution algorithm. */ SIMD_API const char* SimdSynetConvolution16bInfo(const void* context); /*! @ingroup synet_convolution_bf16 - \fn void SimdSynetConvolution16bSetParams(void * context, const float * weight, const float * bias, const float * params, const float * const * stats); + \fn void SimdSynetConvolution16bSetParams(void * context, const float * weight, const float * bias, const float * params); + + \short Sets weights, bias and activation parameters for BF16 convolution. - \short Sets weights, biases, parameters of activation function, input/output tensor statistics required for BF16 convolution algorithm. + This function must be called before ::SimdSynetConvolution16bForward. The \a weight array contains FP32 + convolution weights with kernelY*kernelX*srcC*dstC/group elements. 
The selected implementation transforms + weights to its internal representation (usually BF16 and reordered; some depthwise paths keep FP32 weights). + Bias is copied to an internal FP32 array; when \a bias is NULL, zeros are used. Activation parameters are + copied or expanded to the internal FP32 array according to ::SimdConvolutionActivationType. \param [in, out] context - a pointer to BF16 convolution context. It must be created by function ::SimdSynetConvolution16bInit and released by function ::SimdRelease. - \param [in] weight - a pointer to original (32-bit float point) convolution weights. - \param [in] bias - a pointer to original (32-bit float point) bias. Can be NULL. - \param [in] params - a pointer to original (32-bit float point) parameters of activation functions (see ::SimdConvolutionActivationType). Can be NULL. + \param [in] weight - a pointer to FP32 convolution weights. + \param [in] bias - a pointer to FP32 bias array with dstC elements. Can be NULL. + \param [in] params - a pointer to FP32 parameters of activation function (see ::SimdConvolutionActivationType). + Can be NULL when activation does not require parameters. */ SIMD_API void SimdSynetConvolution16bSetParams(void* context, const float* weight, const float* bias, const float* params); @@ -7768,12 +7799,29 @@ extern "C" \fn void SimdSynetConvolution16bForward(void * context, const uint8_t * src, uint8_t * buf, uint8_t * dst); - \short Performs forward propagation of BF16 convolution algorithm. + \short Performs forward propagation of BF16/FP32 convolution. 
+ + The function converts FP32 input to BF16 when the context source type is FP32, uses BF16 input directly when + the source type is BF16, accumulates convolution sums in FP32, adds bias, applies activation and writes FP32 + or BF16 output according to the context destination type: + \verbatim + sum = bias[dc]; + for(sc = 0; sc < srcC/group; ++sc) + for(ky = 0; ky < kernelY; ++ky) + for(kx = 0; kx < kernelX; ++kx) + sum += inputValue * weightValue; + value = Activate(sum, activation, params); + dst[outputOffset] = dstT == SimdTensorData16b ? Float32ToBFloat16(value) : value; + \endverbatim + The input value is read as BF16 or converted from FP32 to BF16 according to srcT. The weight value comes from + the internal representation prepared by ::SimdSynetConvolution16bSetParams. + The exact offsets depend on tensor format, padding, dilation, stride and group. The input and output tensors + use the shape, data types and format from the context created by ::SimdSynetConvolution16bInit. \param [in] context - a pointer to BF16 convolution context. It must be created by function ::SimdSynetConvolution16bInit and released by function ::SimdRelease. - \param [in] src - a pointer to input tensor. - \param [out] buf - a pointer to external temporary buffer. The size of the external temporary buffer is determined by function ::SimdSynetConvolution16bExternalBufferSize. Can be NULL (it causes usage of internal buffer). - \param [out] dst - a pointer to output tensor. + \param [in] src - a pointer to the input tensor. The actual element type is defined by srcT in convolution parameters. + \param [out] buf - a pointer to an external temporary byte buffer. The required size in bytes is determined by function ::SimdSynetConvolution16bExternalBufferSize. Can be NULL, in which case an internal buffer is used. + \param [out] dst - a pointer to the output tensor. The actual element type is defined by dstT in convolution parameters. 
*/ SIMD_API void SimdSynetConvolution16bForward(void* context, const uint8_t* src, uint8_t* buf, uint8_t* dst); From 2e27184cdd30c8c49f950068049340cc47d234e8 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Fri, 5 Jun 2026 09:12:57 +0000 Subject: [PATCH 4/5] Improve INT8 convolution API documentation Co-authored-by: Ihar Yermalayeu --- src/Simd/SimdLib.h | 89 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 71 insertions(+), 18 deletions(-) diff --git a/src/Simd/SimdLib.h b/src/Simd/SimdLib.h index 1de9cb5d2b..7a22a5baf5 100644 --- a/src/Simd/SimdLib.h +++ b/src/Simd/SimdLib.h @@ -7829,11 +7829,26 @@ extern "C" \fn void * SimdSynetConvolution8iInit(size_t batch, const SimdConvolutionParameters * conv, SimdSynetCompatibilityType compatibility); - \short Initializes INT8 convolution algorithm. + \short Initializes an INT8 convolution context. + + The function validates convolution parameters and chooses a suitable implementation (GEMM, NHWC direct, + NHWC depthwise or architecture-specific VNNI/AMX/NEON variant when available). It supports FP32 or UINT8 + source and destination tensors with matching NCHW or NHWC format. The destination spatial size must match + convolution parameters: + \verbatim + dstH = (srcH + padY + padH - (dilationY*(kernelY - 1) + 1)) / strideY + 1 + dstW = (srcW + padX + padW - (dilationX*(kernelX - 1) + 1)) / strideX + 1 + \endverbatim + + A created context stores tensor shape, data types, format, convolution geometry, group count, activation type + and compatibility flags. FP32 weights, bias, activation parameters and tensor statistics are attached later by + ::SimdSynetConvolution8iSetParams. \param [in] batch - a batch size. - \param [in] conv - a pointer to convolution parameters. - \param [in] compatibility - a flags of calculation compatibility. + \param [in] conv - a pointer to convolution parameters. Source and destination tensor types must be FP32 or UINT8. + \param [in] compatibility - calculation compatibility flags. 
They select precise, overflow or narrowed INT8 + calculation mode. Narrowed mode uses unsigned range [0, 180] and signed range [-90, 90]; otherwise + ranges are [0, 255] and [-128, 127]. \return a pointer to INT8 convolution context. On error it returns NULL. It must be released with using of function ::SimdRelease. This pointer is used in functions ::SimdSynetConvolution8iExternalBufferSize, ::SimdSynetConvolution8iInternalBufferSize, ::SimdSynetConvolution8iInfo, ::SimdSynetConvolution8iSetParams and ::SimdSynetConvolution8iForward. @@ -7844,10 +7859,14 @@ extern "C" \fn size_t SimdSynetConvolution8iExternalBufferSize(const void * context); - \short Gets size in bytes of external temporary buffer required for INT8 convolution algorithm. + \short Gets the size in bytes of caller-provided temporary buffer for INT8 convolution. + + The returned value is a number of bytes. It depends on the implementation selected during initialization and + can be used to allocate the \a buf argument of ::SimdSynetConvolution8iForward. The buffer can contain temporary + UINT8 source conversion data, im2col/padded input data, INT32 sums and temporary FP32 output data. \param [in] context - a pointer to INT8 convolution context. It must be created by function ::SimdSynetConvolution8iInit and released by function ::SimdRelease. - \return size of external temporary buffer required for INT8 convolution algorithm. + \return a number of bytes required for external temporary buffer. */ SIMD_API size_t SimdSynetConvolution8iExternalBufferSize(const void * context); @@ -7855,10 +7874,14 @@ extern "C" \fn size_t SimdSynetConvolution8iInternalBufferSize(const void * context); - \short Gets size of internal buffer used inside INT8 convolution algorithm. + \short Gets the size in bytes of internal storage used by an INT8 convolution context. 
+ + The returned value reports internal storage tracked by the selected implementation, including internal + temporary buffers, quantized/reordered INT8 weights, source and destination conversion parameters, + normalization, bias and activation parameters. \param [in] context - a pointer to INT8 convolution context. It must be created by function ::SimdSynetConvolution8iInit and released by function ::SimdRelease. - \return size of internal buffer used inside INT8 convolution algorithm. + \return a number of bytes used by internal buffers. */ SIMD_API size_t SimdSynetConvolution8iInternalBufferSize(const void * context); @@ -7866,10 +7889,15 @@ extern "C" \fn const char* SimdSynetConvolution8iInfo(const void* context); - \short Gets description of internal implementation of INT8 convolution algorithm. + \short Gets a short description of the selected INT8 convolution implementation. + + The returned string contains the implementation extension and algorithm name, for example a GEMM, NHWC direct + or NHWC depthwise variant, with a suffix for precise, overflow or narrowed mode when applicable. The returned + pointer is owned by the context and remains valid until the next call of this function for the same context or + until the context is released. \param [in] context - a pointer to INT8 convolution context. It must be created by function ::SimdSynetConvolution8iInit and released by function ::SimdRelease. - \return string with description of internal implementation of INT8 convolution algorithm. + \return a string with description of internal implementation of INT8 convolution algorithm. 
*/ SIMD_API const char* SimdSynetConvolution8iInfo(const void* context); @@ -7877,13 +7905,23 @@ extern "C" \fn void SimdSynetConvolution8iSetParams(void * context, const float * weight, const float * bias, const float * params, const float * const * stats); - \short Sets weights, biases, parameters of activation function, input/output tensor statistics required for INT8 convolution algorithm. + \short Sets weights, bias, activation parameters and tensor statistics for INT8 convolution. + + This function must be called before ::SimdSynetConvolution8iForward. The \a weight array contains FP32 + convolution weights with kernelY*kernelX*srcC*dstC/group elements. Source statistics (\a stats[0], + \a stats[1], each with srcC elements) define per-channel source quantization parameters; destination statistics + (\a stats[2], \a stats[3], each with dstC elements) define per-channel output quantization parameters. The + selected implementation converts weights to INT8, may reorder them, and computes per-output-channel normalization + and bias terms used to convert INT32 sums back to FP32. Activation parameters are copied or expanded internally + according to ::SimdConvolutionActivationType. \param [in, out] context - a pointer to INT8 convolution context. It must be created by function ::SimdSynetConvolution8iInit and released by function ::SimdRelease. - \param [in] weight - a pointer to original (32-bit float point) convolution weights. - \param [in] bias - a pointer to original (32-bit float point) bias. Can be NULL. - \param [in] params - a pointer to original (32-bit float point) parameters of activation functions (see ::SimdConvolutionActivationType). Can be NULL. - \param [in] stats - a pointer to pointers with statistics of input(min - stats[0], max - stats[1]) and output(min - stats[2], max - stats[3]) tensors. + \param [in] weight - a pointer to FP32 convolution weights. + \param [in] bias - a pointer to FP32 bias array with dstC elements. Can be NULL. 
+ \param [in] params - a pointer to FP32 parameters of activation function (see ::SimdConvolutionActivationType). + Can be NULL when activation does not require parameters. + \param [in] stats - a pointer to pointers with per-channel tensor statistics: + source minimum stats[0], source maximum stats[1], destination minimum stats[2], destination maximum stats[3]. */ SIMD_API void SimdSynetConvolution8iSetParams(void * context, const float * weight, const float * bias, const float * params, const float * const* stats); @@ -7891,12 +7929,27 @@ extern "C" \fn void SimdSynetConvolution8iForward(void * context, const uint8_t * src, uint8_t * buf, uint8_t * dst); - \short Performs forward propagation of INT8 convolution algorithm. + \short Performs forward propagation of INT8 convolution. + + The function converts FP32 input to UINT8 when the context source type is FP32, uses UINT8 input directly when + the source type is UINT8, accumulates convolution sums in INT32 with INT8 weights, converts sums to FP32 using + internal normalization and bias, applies activation, and writes FP32 or UINT8 output according to the context + destination type: + \verbatim + if(srcT == SimdTensorData32f) + src8u = restrict(round(src32f*srcScale[c] + srcShift[c]), srcLower, srcUpper); + sum = convolution_int32(src8u, weight8i, zero); + value = Activate(sum*norm[dc] + bias[dc], activation, params); + dst[outputOffset] = dstT == SimdTensorData8u ? + restrict(round(value*dstScale[dc] + dstShift[dc]), dstLower, dstUpper) : value; + \endverbatim + The exact offsets depend on tensor format, padding, dilation, stride and group. The input and output tensors + use the shape, data types and format from the context created by ::SimdSynetConvolution8iInit. \param [in] context - a pointer to INT8 convolution context. It must be created by function ::SimdSynetConvolution8iInit and released by function ::SimdRelease. - \param [in] src - a pointer to input tensor. 
- \param [out] buf - a pointer to external temporary buffer. The size of the external temporary buffer is determined by function ::SimdSynetConvolution8iExternalBufferSize. Can be NULL (it causes usage of internal buffer). - \param [out] dst - a pointer to output tensor. + \param [in] src - a pointer to input tensor. Actual element type is defined by srcT in convolution parameters. + \param [out] buf - a pointer to external temporary byte buffer. The required size is determined by function ::SimdSynetConvolution8iExternalBufferSize. Can be NULL (it causes usage of internal buffer). + \param [out] dst - a pointer to output tensor. Actual element type is defined by dstT in convolution parameters. */ SIMD_API void SimdSynetConvolution8iForward(void * context, const uint8_t * src, uint8_t * buf, uint8_t * dst); From d352473701476c878947ad9ffb2ff23246bc5151 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Fri, 5 Jun 2026 09:18:10 +0000 Subject: [PATCH 5/5] Improve FP32 deconvolution API documentation Co-authored-by: Ihar Yermalayeu --- src/Simd/SimdLib.h | 85 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 67 insertions(+), 18 deletions(-) diff --git a/src/Simd/SimdLib.h b/src/Simd/SimdLib.h index 7a22a5baf5..002ff5a239 100644 --- a/src/Simd/SimdLib.h +++ b/src/Simd/SimdLib.h @@ -7957,11 +7957,23 @@ extern "C" \fn void * SimdSynetDeconvolution32fInit(size_t batch, const SimdConvolutionParameters * conv, SimdSynetCompatibilityType compatibility); - \short Initializes FP32 deconvolution algorithm. + \short Initializes an FP32 deconvolution context. + + The function validates deconvolution parameters and chooses a suitable implementation (GEMM-based or + NHWC direct 2x2 when available). It supports FP32 source and destination tensors with matching NCHW format, + or matching NHWC format when group is 1. 
The destination spatial size must match deconvolution parameters: + \verbatim + dstH = strideY*(srcH - 1) + dilationY*(kernelY - 1) + 1 - padY - padH + dstW = strideX*(srcW - 1) + dilationX*(kernelX - 1) + 1 - padX - padW + \endverbatim + + A created context stores tensor shape, format, deconvolution geometry, group count, activation type and + compatibility flags. Weights, bias and activation parameters are attached later by + ::SimdSynetDeconvolution32fSetParams. \param [in] batch - a batch size. - \param [in] conv - a pointer to deconvolution parameters. - \param [in] compatibility - a flags of calculation compatibility. + \param [in] conv - a pointer to deconvolution parameters. Source and destination tensor types must be FP32. + \param [in] compatibility - calculation compatibility flags. \return a pointer to FP32 deconvolution context. On error it returns NULL. It must be released with using of function ::SimdRelease. This pointer is used in functions ::SimdSynetDeconvolution32fExternalBufferSize, ::SimdSynetDeconvolution32fInternalBufferSize, ::SimdSynetDeconvolution32fInfo, ::SimdSynetDeconvolution32fSetParams and ::SimdSynetDeconvolution32fForward. @@ -7972,10 +7984,14 @@ extern "C" \fn size_t SimdSynetDeconvolution32fExternalBufferSize(const void * context); - \short Gets size of external temporary buffer required for FP32 deconvolution algorithm. + \short Gets the size of caller-provided temporary buffer for FP32 deconvolution. + + The returned value is a number of 32-bit float elements, not bytes. It depends on the implementation selected + during initialization and can be used to allocate the \a buf argument of ::SimdSynetDeconvolution32fForward. + Some implementations return 1 when they do not need external temporary storage. \param [in] context - a pointer to FP32 deconvolution context. It must be created by function ::SimdSynetDeconvolution32fInit and released by function ::SimdRelease. 
- \return size of external temporary buffer required for FP32 deconvolution algorithm. + \return a number of FP32 elements required for external temporary buffer. */ SIMD_API size_t SimdSynetDeconvolution32fExternalBufferSize(const void * context); @@ -7983,10 +7999,14 @@ extern "C" \fn size_t SimdSynetDeconvolution32fInternalBufferSize(const void * context); - \short Gets size of internal buffer used inside FP32 deconvolution algorithm. + \short Gets the size of internal storage used by an FP32 deconvolution context. + + The returned value is a number of 32-bit float elements, not bytes. It reports internal storage tracked by + the selected implementation, such as internal temporary buffers and implementation-specific reordered weights, + bias or activation parameters already allocated by the context. \param [in] context - a pointer to FP32 deconvolution context. It must be created by function ::SimdSynetDeconvolution32fInit and released by function ::SimdRelease. - \return size of internal buffer used inside FP32 deconvolution algorithm. + \return a number of FP32 elements used by internal buffers. */ SIMD_API size_t SimdSynetDeconvolution32fInternalBufferSize(const void * context); @@ -7994,10 +8014,14 @@ extern "C" \fn const char* SimdSynetDeconvolution32fInfo(const void* context); - \short Gets description of internal implementation of FP32 deconvolution algorithm. + \short Gets a short description of the selected FP32 deconvolution implementation. + + The returned string contains the implementation extension and algorithm name, for example a GEMM-based or + NHWC direct 2x2 variant. The returned pointer is owned by the context and remains valid until the next call + of this function for the same context or until the context is released. \param [in] context - a pointer to FP32 deconvolution context. It must be created by function ::SimdSynetDeconvolution32fInit and released by function ::SimdRelease. 
- \return string with description of internal implementation of FP32 deconvolution algorithm. + \return a string with description of internal implementation of FP32 deconvolution algorithm. */ SIMD_API const char* SimdSynetDeconvolution32fInfo(const void* context); @@ -8005,13 +8029,22 @@ extern "C" \fn void SimdSynetDeconvolution32fSetParams(void * context, const float * weight, SimdBool * internal, const float * bias, const float * params); - \short Sets weights, biases and parameters of activation function required for FP32 deconvolution algorithm. + \short Sets weights, bias and activation parameters for FP32 deconvolution. + + This function must be called before ::SimdSynetDeconvolution32fForward. The \a weight array contains FP32 + deconvolution weights with kernelY*kernelX*srcC*dstC/group elements. Depending on the selected implementation, + weights can be used directly or transformed and stored inside the context. If \a internal is not NULL, the + selected implementation writes the weight storage mode to it: SimdTrue means that weights were transformed and + stored internally, while SimdFalse means that the implementation may use the original \a weight array directly, + so the caller must keep it valid for later forward calls. Bias and activation parameters can also be copied + internally by some implementations; otherwise their pointers are stored in the context. \param [in, out] context - a pointer to FP32 deconvolution context. It must be created by function ::SimdSynetDeconvolution32fInit and released by function ::SimdRelease. - \param [in] weight - a pointer to deconvolution weights. - \param [out] internal - a flag signalizing that weight is stored in the internal buffer. Can be NULL. - \param [in] bias - a pointer to bias. Can be NULL. - \param [in] params - a pointer to parameters of activation functions (see ::SimdConvolutionActivationType). Can be NULL. + \param [in] weight - a pointer to FP32 deconvolution weights. 
+ \param [out] internal - a pointer to a flag receiving weight ownership mode. Can be NULL. + \param [in] bias - a pointer to FP32 bias array with dstC elements. Can be NULL. + \param [in] params - a pointer to FP32 parameters of activation function (see ::SimdConvolutionActivationType). + Can be NULL when activation does not require parameters. */ SIMD_API void SimdSynetDeconvolution32fSetParams(void * context, const float * weight, SimdBool * internal, const float * bias, const float * params); @@ -8019,12 +8052,28 @@ extern "C" \fn void SimdSynetDeconvolution32fForward(void * context, const float * src, float * buf, float * dst); - \short Performs forward propagation of FP32 deconvolution algorithm. + \short Performs forward propagation of FP32 deconvolution. + + The function applies transposed convolution to each image in the batch, adds bias when it was set, and applies + the activation specified in ::SimdConvolutionParameters: + \verbatim + dst[:] = 0; + for(sc = 0; sc < srcC/group; ++sc) + for(sy = 0; sy < srcH; ++sy) + for(sx = 0; sx < srcW; ++sx) + for(ky = 0; ky < kernelY; ++ky) + for(kx = 0; kx < kernelX; ++kx) + dst[outputOffset] += src[inputOffset] * weight[weightOffset]; + dst[outputOffset] = Activate(dst[outputOffset] + bias[dc], activation, params); + \endverbatim + The exact offsets depend on tensor format, padding, dilation, stride and group. The input and output tensors + use the shape and format from the context created by ::SimdSynetDeconvolution32fInit. \param [in] context - a pointer to FP32 deconvolution context. It must be created by function ::SimdSynetDeconvolution32fInit and released by function ::SimdRelease. - \param [in] src - a pointer to input tensor. - \param [out] buf - a pointer to external temporary buffer. The size of the external temporary buffer is determined by function ::SimdSynetDeconvolution32fExternalBufferSize. Can be NULL (it causes usage of internal buffer). - \param [out] dst - a pointer to output tensor. 
+ \param [in] src - a pointer to FP32 input tensor. + \param [out] buf - a pointer to external temporary FP32 buffer. The required number of elements is determined by + function ::SimdSynetDeconvolution32fExternalBufferSize. Can be NULL (it causes usage of internal buffer). + \param [out] dst - a pointer to FP32 output tensor. */ SIMD_API void SimdSynetDeconvolution32fForward(void * context, const float * src, float * buf, float * dst);