From 85260d14cdbad05e6c58900b2ae82c590f86823d Mon Sep 17 00:00:00 2001
From: Cursor Agent <cursoragent@cursor.com>
Date: Fri, 5 Jun 2026 12:31:01 +0000
Subject: [PATCH] Improve Synet API documentation

Co-authored-by: Ihar Yermalayeu <ermig@tut.by>
---
 src/Simd/SimdLib.h | 473 ++++++++++++++++++++++++++++++---------------
 1 file changed, 316 insertions(+), 157 deletions(-)

diff --git a/src/Simd/SimdLib.h b/src/Simd/SimdLib.h
index 002ff5a239..1ebaa3e4c7 100644
--- a/src/Simd/SimdLib.h
+++ b/src/Simd/SimdLib.h
@@ -8081,12 +8081,25 @@ extern "C"
 
     \fn void * SimdSynetDeconvolution16bInit(size_t batch, const SimdConvolutionParameters * conv, SimdSynetCompatibilityType compatibility);
 
-    \short Initializes BF16 deconvolution algorithm.
+    \short Initializes a BF16/FP32 deconvolution context.
+
+    The function validates deconvolution parameters and chooses a suitable BF16-oriented implementation (GEMM-based
+    or NHWC GEMM-based variant, including AMX-BF16 when available). It supports FP32 or BF16 source and destination
+    tensors with matching NCHW format, or matching NHWC format when group is 1. The destination spatial size must
+    match deconvolution parameters:
+    \verbatim
+    dstH = strideY*(srcH - 1) + dilationY*(kernelY - 1) + 1 - padY - padH
+    dstW = strideX*(srcW - 1) + dilationX*(kernelX - 1) + 1 - padX - padW
+    \endverbatim
+
+    A created context stores tensor shape, data types, format, deconvolution geometry, group count, activation type
+    and compatibility flags. FP32 weights, bias and activation parameters are attached later by
+    ::SimdSynetDeconvolution16bSetParams.
 
     \param [in] batch - a batch size.
-    \param [in] conv - a pointer to deconvolution parameters.
-    \param [in] compatibility - a flags of calculation compatibility.
-    \return a pointer to BF16 convolution context. On error it returns NULL. It must be released with using of function ::SimdRelease.
+    \param [in] conv - a pointer to deconvolution parameters. Source and destination tensor types must be FP32 or BF16.
+    \param [in] compatibility - calculation compatibility flags.
+    \return a pointer to BF16 deconvolution context. On error it returns NULL. It must be released with using of function ::SimdRelease.
         This pointer is used in functions ::SimdSynetDeconvolution16bExternalBufferSize, ::SimdSynetDeconvolution16bInternalBufferSize,
         ::SimdSynetDeconvolution16bInfo, ::SimdSynetDeconvolution16bSetParams and ::SimdSynetDeconvolution16bForward.
 */
@@ -8096,10 +8109,14 @@ extern "C"
 
         \fn size_t SimdSynetDeconvolution16bExternalBufferSize(const void * context);
 
-        \short Gets size in bytes of external temporary buffer required for BF16 deconvolution algorithm.
+        \short Gets the size in bytes of caller-provided temporary buffer for BF16 deconvolution.
+
+        The returned value is a number of bytes. It depends on the implementation selected during initialization and
+        can be used to allocate the \a buf argument of ::SimdSynetDeconvolution16bForward. Some implementations return
+        1 or 0 when they do not need external temporary storage.
 
         \param [in] context - a pointer to BF16 deconvolution context. It must be created by function ::SimdSynetDeconvolution16bInit and released by function ::SimdRelease.
-        \return size of external temporary buffer required for BF16 convolution algorithm.
+        \return a number of bytes required for external temporary buffer.
     */
     SIMD_API size_t SimdSynetDeconvolution16bExternalBufferSize(const void* context);
 
@@ -8107,10 +8124,13 @@ extern "C"
 
         \fn size_t SimdSynetDeconvolution16bInternalBufferSize(const void * context);
 
-        \short Gets size (in bytes) of internal buffer used inside BF16 deconvolution algorithm.
+        \short Gets the size in bytes of internal storage used by a BF16 deconvolution context.
+
+        The returned value reports internal storage tracked by the selected implementation, including internal
+        temporary buffers, transformed weights, copied bias and copied activation parameters.
 
         \param [in] context - a pointer to BF16 deconvolution context. It must be created by function ::SimdSynetDeconvolution16bInit and released by function ::SimdRelease.
-        \return size of internal buffer used inside BF16 convolution algorithm.
+        \return a number of bytes used by internal buffers.
     */
     SIMD_API size_t SimdSynetDeconvolution16bInternalBufferSize(const void* context);
 
@@ -8118,23 +8138,34 @@ extern "C"
 
         \fn const char* SimdSynetDeconvolution16bInfo(const void* context);
 
-        \short Gets description of internal implementation of BF16 deconvolution algorithm.
+        \short Gets a short description of the selected BF16 deconvolution implementation.
+
+        The returned string contains the implementation extension and algorithm name, for example a GEMM or NHWC GEMM
+        variant. The returned pointer is owned by the context and remains valid until the next call of this function
+        for the same context or until the context is released.
 
         \param [in] context - a pointer to BF16 deconvolution context. It must be created by function ::SimdSynetDeconvolution16bInit and released by function ::SimdRelease.
-        \return string with description of internal implementation of BF16 convolution algorithm.
+        \return a string with description of internal implementation of BF16 deconvolution algorithm.
     */
     SIMD_API const char* SimdSynetDeconvolution16bInfo(const void* context);
 
     /*! @ingroup synet_deconvolution_bf16
 
-        \fn void SimdSynetDeconvolution16bSetParams(void * context, const float * weight, const float * bias, const float * params, const float * const * stats);
+        \fn void SimdSynetDeconvolution16bSetParams(void * context, const float * weight, const float * bias, const float * params);
 
-        \short Sets weights, biases, parameters of activation function, input/output tensor statistics required for BF16 deconvolution algorithm.
+        \short Sets weights, bias and activation parameters for BF16 deconvolution.
+
+        This function must be called before ::SimdSynetDeconvolution16bForward. The \a weight array contains FP32
+        deconvolution weights with kernelY*kernelX*srcC*dstC/group elements. The selected implementation transforms
+        weights to its internal BF16/reordered representation. Bias is copied to an internal FP32 array; when \a bias
+        is NULL, zeros are used. Activation parameters are copied or expanded to the internal FP32 array according to
+        ::SimdConvolutionActivationType.
 
         \param [in, out] context - a pointer to BF16 deconvolution context. It must be created by function ::SimdSynetDeconvolution16bInit and released by function ::SimdRelease.
-        \param [in] weight - a pointer to original (32-bit float point) convolution weights.
-        \param [in] bias - a pointer to original (32-bit float point) bias. Can be NULL.
-        \param [in] params - a pointer to original (32-bit float point) parameters of activation functions (see ::SimdDeconvolutionActivationType). Can be NULL.
+        \param [in] weight - a pointer to FP32 deconvolution weights.
+        \param [in] bias - a pointer to FP32 bias array with dstC elements. Can be NULL.
+        \param [in] params - a pointer to FP32 parameters of activation function (see ::SimdConvolutionActivationType).
+            Can be NULL when activation does not require parameters.
     */
     SIMD_API void SimdSynetDeconvolution16bSetParams(void* context, const float* weight, const float* bias, const float* params);
 
@@ -8142,12 +8173,31 @@ extern "C"
 
         \fn void SimdSynetDeconvolution16bForward(void * context, const uint8_t * src, uint8_t * buf, uint8_t * dst);
 
-        \short Performs forward propagation of BF16 deconvolution algorithm.
+        \short Performs forward propagation of BF16/FP32 deconvolution.
+
+        The function converts FP32 input to BF16 when the context source type is FP32, uses BF16 input directly when
+        the source type is BF16, accumulates transposed convolution sums in FP32, adds bias, applies activation and
+        writes FP32 or BF16 output according to the context destination type:
+        \verbatim
+        dst[:] = 0;
+        for(sc = 0; sc < srcC/group; ++sc)
+            for(sy = 0; sy < srcH; ++sy)
+                for(sx = 0; sx < srcW; ++sx)
+                    for(ky = 0; ky < kernelY; ++ky)
+                        for(kx = 0; kx < kernelX; ++kx)
+                            dst[outputOffset] += inputValue * weightValue;
+        value = Activate(dst[outputOffset] + bias[dc], activation, params);
+        dst[outputOffset] = dstT == SimdTensorData16b ? Float32ToBFloat16(value) : value;
+        \endverbatim
+        The input value is read as BF16 or converted from FP32 to BF16 according to srcT. The weight value comes from
+        the internal representation prepared by ::SimdSynetDeconvolution16bSetParams.
+        The exact offsets depend on tensor format, padding, dilation, stride and group. The input and output tensors
+        use the shape, data types and format from the context created by ::SimdSynetDeconvolution16bInit.
 
         \param [in] context - a pointer to BF16 deconvolution context. It must be created by function ::SimdSynetDeconvolution16bInit and released by function ::SimdRelease.
-        \param [in] src - a pointer to input tensor.
-        \param [out] buf - a pointer to external temporary buffer. The size of the external temporary buffer is determined by function ::SimdSynetDeconvolution16bExternalBufferSize. Can be NULL (it causes usage of internal buffer).
-        \param [out] dst - a pointer to output tensor.
+        \param [in] src - a pointer to input tensor. Actual element type is defined by srcT in deconvolution parameters.
+        \param [out] buf - a pointer to external temporary byte buffer. The required size is determined by function ::SimdSynetDeconvolution16bExternalBufferSize. Can be NULL (it causes usage of internal buffer).
+        \param [out] dst - a pointer to output tensor. Actual element type is defined by dstT in deconvolution parameters.
     */
     SIMD_API void SimdSynetDeconvolution16bForward(void* context, const uint8_t* src, uint8_t* buf, uint8_t* dst);
 
@@ -8155,7 +8205,7 @@ extern "C"
 
         \fn void SimdSynetDequantizeLinear(const uint8_t* src, size_t size, int32_t bias, const float* norm, float* dst);
 
-        \short Performs UINT8 linear dequantization.
+        \short Dequantizes a UINT8 tensor to FP32 with a single scale and zero-point correction.
 
         Algorithm's details for ::SimdSynetDequantizeLinear:
         \verbatim
@@ -8163,10 +8213,12 @@ extern "C"
             dst[i] = (src[i] + bias) * norm[0];
         \endverbatim
 
+        This corresponds to dst = (src - zero) * scale when \a bias is equal to -zero and \a norm[0] is equal to scale.
+
         \param [in] src - a pointer to UINT8 input tensor.
-        \param [in] size - a size of the input and output tensors.
-        \param [in] bias - a dequantization bias (-zero).
-        \param [in] norm - a dequantization norm (scale).
+        \param [in] size - a number of elements in the input and output tensors.
+        \param [in] bias - an integer value added to every input element, typically negative zero-point.
+        \param [in] norm - a pointer to FP32 dequantization scale. Only norm[0] is used.
         \param [out] dst - a pointer to FP32 output tensor.
     */
     SIMD_API void SimdSynetDequantizeLinear(const uint8_t* src, size_t size, int32_t bias, const float* norm, float* dst);
@@ -8175,13 +8227,16 @@ extern "C"
 
         \fn void SimdSynetEltwiseLayerForward(float const * const * src, const float * weight, size_t count, size_t size, SimdSynetEltwiseOperationType type, float * dst);
 
-        \short This function is used for forward propagation of EltwiseLayer.
+        \short Performs element-wise product, weighted sum, maximum or minimum over several FP32 tensors.
+
+        The function reads \a count input arrays of equal length \a size and writes one output array.
+        The \a weight array is used only for ::SimdSynetEltwiseOperationSum.
 
         Algorithm's details for ::SimdSynetEltwiseOperationProduct:
         \verbatim
         for(j = 0; j < size; ++j)
-            dst[j] = 1;
-        for(i = 0; i < count; ++i)
+            dst[j] = src[0][j] * src[1][j];
+        for(i = 2; i < count; ++i)
             for(j = 0; j < size; ++j)
                 dst[j] *= src[i][j];
         \endverbatim
@@ -8189,8 +8244,8 @@ extern "C"
         Algorithm's details for ::SimdSynetEltwiseOperationSum:
         \verbatim
         for(j = 0; j < size; ++j)
-            dst[j] = 0;
-        for(i = 0; i < count; ++i)
+            dst[j] = src[0][j]*weight[0] + src[1][j]*weight[1];
+        for(i = 2; i < count; ++i)
             for(j = 0; j < size; ++j)
                 dst[j] += src[i][j]*weight[i];
         \endverbatim
@@ -8198,8 +8253,8 @@ extern "C"
         Algorithm's details for ::SimdSynetEltwiseOperationMax:
         \verbatim
         for(j = 0; j < size; ++j)
-            dst[j] = -FLT_MAX;
-        for(i = 0; i < count; ++i)
+            dst[j] = Max(src[0][j], src[1][j]);
+        for(i = 2; i < count; ++i)
             for(j = 0; j < size; ++j)
                 dst[j] = Max(dst[j], src[i][j]);
         \endverbatim
@@ -8207,20 +8262,20 @@ extern "C"
         Algorithm's details for ::SimdSynetEltwiseOperationMin:
         \verbatim
         for(j = 0; j < size; ++j)
-            dst[j] = FLT_MAX;
-        for(i = 0; i < count; ++i)
+            dst[j] = Min(src[0][j], src[1][j]);
+        for(i = 2; i < count; ++i)
             for(j = 0; j < size; ++j)
                 dst[j] = Min(dst[j], src[i][j]);
         \endverbatim
 
         \note This function is used in <a href="http://github.com/ermig1979/Synet">Synet Framework</a>.
 
-        \param [in] src - a pointer to pointers to the input 32-bit float arrays. 
-        \param [in] weight - a pointer to the 32-bit float array with sum coefficients. It is need only for ::SimdSynetEltwiseOperationSum operation type otherwise it can be NULL.
+        \param [in] src - a pointer to \a count pointers to input FP32 arrays.
+        \param [in] weight - a pointer to FP32 weighted-sum coefficients. It is used only for ::SimdSynetEltwiseOperationSum; otherwise it can be NULL.
         \param [in] count - a count of input arrays. Must be at least 2.
-        \param [in] size - a size of the input and output arrays.
+        \param [in] size - a number of elements in each input and output array.
         \param [in] type - a type of operation (see ::SimdSynetEltwiseOperationType).
-        \param [out] dst - a pointer to the output 32-bit float array.
+        \param [out] dst - a pointer to the output FP32 array.
     */
     SIMD_API void SimdSynetEltwiseLayerForward(float const * const * src, const float * weight, size_t count, size_t size, SimdSynetEltwiseOperationType type, float * dst);
 
@@ -8228,22 +8283,22 @@ extern "C"
 
         \fn void SimdSynetElu32f(const float * src, size_t size, const float * alpha, float * dst);
 
-        \short Calculates ELU activation function for 32-bit float array.
+        \short Calculates ELU activation for an FP32 array.
 
         The input and output arrays must have the same size.
 
         Algorithm's details:
         \verbatim
         for(i = 0; i < size; ++i)
-            dst[i] = src[i] >= 0 ? src[i] : alpha*(Exp(src[i]) - 1);
+            dst[i] = src[i] >= 0 ? src[i] : alpha[0]*(exp(src[i]) - 1);
         \endverbatim
 
         \note This function is used in <a href="http://github.com/ermig1979/Synet">Synet Framework</a>.
 
-        \param [in] src - a pointer to the input 32-bit float array.
-        \param [in] size - a size of input and output arrays.
-        \param [in] alpha - a pointer to alpha parameter.
-        \param [out] dst - a pointer to the output 32-bit float array.
+        \param [in] src - a pointer to the input FP32 array.
+        \param [in] size - a number of elements in the input and output arrays.
+        \param [in] alpha - a pointer to ELU alpha parameter. Only alpha[0] is used.
+        \param [out] dst - a pointer to the output FP32 array.
     */
     SIMD_API void SimdSynetElu32f(const float * src, size_t size, const float * alpha, float * dst);
 
@@ -8251,28 +8306,45 @@ extern "C"
 
         \fn void* SimdSynetGatherElementsInit(SimdTensorDataType dataType, SimdTensorDataType indexType, SimdBool indexConst, size_t indexUsers, const size_t * outer, size_t outerSize, size_t srcCount, size_t inner, size_t idxCount);
 
+        \short Initializes a gather-elements context.
+
+        The function creates a context for ONNX-style GatherElements along the dimension of length \a srcCount.
+        It supports FP32, BF16 and UINT8 data tensors and INT32 or INT64 index tensors. The input tensor shape is:
+        \verbatim
+        outer[0] * ... * outer[outerSize - 1] * srcCount * inner
+        \endverbatim
+        The index and output tensor shape is:
+        \verbatim
+        outer[0] * ... * outer[outerSize - 1] * idxCount * inner
+        \endverbatim
+
         Algorithm's details:
         \verbatim
-        for (o = 0; o < outer; ++o)
-            for (c = 0; c < idxCount; ++c)
-                for (i = 0; i < inner; ++i)
-                    ic = idx[o, c, i];
+        for(b = 0; b < outer[0]*...*outer[outerSize - 1]; ++b)
+            for(c = 0; c < idxCount; ++c)
+                for(i = 0; i < inner; ++i)
+                {
+                    ic = idx[b, c, i];
                     if (ic < 0)
                         ic += srcCount;
-                    dst[o, c, i] = src[o, ic, i];
+                    dst[b, c, i] = src[b, ic, i];
+                }
         \endverbatim
 
-        \param [in] dataType - a type of input and output tensor.
+        If \a indexConst is SimdTrue, constant indexes can be analyzed by ::SimdSynetGatherElementsSetIndex to avoid
+        repeated negative-index checks and to reduce repeated outer index processing when possible.
+
+        \param [in] dataType - a type of input and output tensor. It can be FP32, BF16 or UINT8.
         \param [in] indexType - a type of index tensor. It can be INT32 or INT64.
-        \param [in] indexConst - a flag of constant index. If this flag is True then you can set index once.
-        \param [in] indexUsers - a number of index users. 
-        \param [in] outer - a pointer to outer shape of input and output tensor.
-        \param [in] outerSize - a size of outer shape of input and output tensor.
-        \param [in] srcCount - an input tensor gather dimension.
-        \param [in] inner - an inner size of input and output tensor.
-        \param [in] idxCount - an index and output tensor gather dimension.
+        \param [in] indexConst - a flag indicating that index tensor is constant and can be set once.
+        \param [in] indexUsers - a number of consumers sharing the same constant index tensor. The current implementation stores this value but does not otherwise use it.
+        \param [in] outer - a pointer to outer shape dimensions before the gathered dimension.
+        \param [in] outerSize - a number of dimensions in \a outer.
+        \param [in] srcCount - a length of the gathered dimension in the input tensor.
+        \param [in] inner - a product of dimensions after the gathered dimension.
+        \param [in] idxCount - a length of the gathered dimension in the index and output tensors.
         \return a pointer to gather elements context. On error it returns NULL. It must be released with using of function ::SimdRelease.
-            This pointer is used in functions :: SimdSynetGatherElementsSetIndex, ::SimdSynetGatherElementsInternalBufferSize, and ::SimdSynetGatherElementsForward.
+            This pointer is used in functions ::SimdSynetGatherElementsSetIndex, ::SimdSynetGatherElementsInternalBufferSize and ::SimdSynetGatherElementsForward.
     */
     SIMD_API void* SimdSynetGatherElementsInit(SimdTensorDataType dataType, SimdTensorDataType indexType, 
         SimdBool indexConst, size_t indexUsers, const size_t * outer, size_t outerSize, size_t srcCount, size_t inner, size_t idxCount);
@@ -8281,11 +8353,14 @@ extern "C"
 
         \fn void SimdSynetGatherElementsSetIndex(const void* context, const uint8_t* idx);
 
-        \short Sets and analyses constant gather elements indexes.
+        \short Sets and analyzes constant gather-elements indexes.
+
+        The function has an effect only when the context was created with \a indexConst equal to SimdTrue. It detects
+        whether negative-index correction is needed and may collapse repeated outer batches of identical indexes.
+        The current implementation still expects the index pointer to be passed to ::SimdSynetGatherElementsForward.
 
         \param [in] context - a pointer to gather elements context. It must be created by function ::SimdSynetGatherElementsInit and released by function ::SimdRelease.
-        \param [in] idx - a pointer to tensor with indexes. It can be INT32 or INT64.
-                          Its size = outer[0] * .. * outer[outerSize - 1] * idxCount * inner. 
+        \param [in] idx - a pointer to INT32 or INT64 index tensor. Its shape is outer[0] * ... * outer[outerSize - 1] * idxCount * inner.
     */
     SIMD_API void SimdSynetGatherElementsSetIndex(const void* context, const uint8_t* idx);
 
@@ -8293,7 +8368,9 @@ extern "C"
 
         \fn size_t SimdSynetGatherElementsInternalBufferSize(const void* context);
 
-        \short Gets size of internal buffer in bytes used inside gather elements algorithm.
+        \short Gets the size in bytes of internal storage used by a gather-elements context.
+
+        The returned value reports implementation-specific buffers used by the context.
 
         \param [in] context - a pointer to gather elements context. It must be created by function ::SimdSynetGatherElementsInit and released by function ::SimdRelease.
         \return size of internal buffer in bytes used inside gather elements algorithm.
@@ -8304,12 +8381,16 @@ extern "C"
 
         \fn void SimdSynetGatherElementsForward(void* context, const uint8_t* src, const uint8_t* idx, uint8_t* dst);
 
-        \short Performs forward propagation of gather elements algorithm.
+        \short Performs gather-elements forward propagation.
+
+        The function gathers elements from \a src according to \a idx. If ::SimdSynetGatherElementsSetIndex was called,
+        the context can use the analysis results, but \a idx must still point to the index tensor in the current
+        implementation. Negative indexes are interpreted relative to \a srcCount.
 
-        \param [in] context - a pointer to gather elements algorithm. It must be created by function ::SimdSynetGatherElementsInit and released by function ::SimdRelease.
-        \param [in] src - a pointer to input tensor. Its size = outer[0] * .. * outer[outerSize - 1] * srcCount * inner.
-        \param [in] idx - a pointer to index tensor. Its size = outer[0] * .. * outer[outerSize - 1] * idxCount * inner. 
-        \param [out] dst - a pointer to output tensor. Its size = outer[0] * .. * outer[outerSize - 1] * idxCount * inner.
+        \param [in] context - a pointer to gather elements context. It must be created by function ::SimdSynetGatherElementsInit and released by function ::SimdRelease.
+        \param [in] src - a pointer to input tensor. Its shape is outer[0] * ... * outer[outerSize - 1] * srcCount * inner.
+        \param [in] idx - a pointer to INT32 or INT64 index tensor. Its shape is outer[0] * ... * outer[outerSize - 1] * idxCount * inner.
+        \param [out] dst - a pointer to output tensor. Its shape is outer[0] * ... * outer[outerSize - 1] * idxCount * inner.
     */
     SIMD_API void SimdSynetGatherElementsForward(void* context, const uint8_t* src, const uint8_t* idx, uint8_t* dst);
 
@@ -8317,7 +8398,7 @@ extern "C"
 
         \fn void SimdSynetGelu32f(const float* src, size_t size, float* dst);
 
-        \short This function is used for forward propagation of GeluLayer.
+        \short Calculates exact GELU activation for an FP32 array.
 
         Algorithm's details:
         \verbatim
@@ -8327,9 +8408,9 @@ extern "C"
 
         \note This function is used in <a href="http://github.com/ermig1979/Synet">Synet Framework</a>.
 
-        \param [in] src - a pointer to the 32-bit float array.
-        \param [in] size - a size of input and output arrays.
-        \param [out] dst - a pointer to output 32-bit float array.
+        \param [in] src - a pointer to the input FP32 array.
+        \param [in] size - a number of elements in the input and output arrays.
+        \param [out] dst - a pointer to the output FP32 array.
     */
     SIMD_API void SimdSynetGelu32f(const float* src, size_t size, float* dst);
 
@@ -8337,7 +8418,12 @@ extern "C"
 
         \fn void* SimdSynetGridSample2dInit(size_t batch, size_t channels, size_t srcH, size_t srcW, size_t dstH, size_t dstW, SimdTensorDataType type, SimdGridSampleInterpType interp, SimdGridSamplePaddingType padding, SimdBool align);
 
-        \short Initializes <a href="https://github.com/onnx/onnx/blob/main/docs/Operators.md#GridSample">grid sample</a> 2D algorithm.
+        \short Initializes an ONNX-compatible GridSample-2D context.
+
+        The function creates a context for NCHW tensors. The input tensor shape is batch*channels*srcH*srcW, the
+        grid tensor shape is batch*dstH*dstW*2 and the output tensor shape is batch*channels*dstH*dstW. The grid
+        stores normalized coordinates in the range [-1, 1] as pairs (x, y). The current implementation supports FP32
+        source, grid and destination tensors.
 
         \param [in] batch - a batch size.
         \param [in] channels - a number of channels in the input and output tensors.
@@ -8345,10 +8431,10 @@ extern "C"
         \param [in] srcW - a width of input tensor.
         \param [in] dstH - a height of output tensor.
         \param [in] dstW - a width of output tensor.
-        \param [in] type - a type of input, grid and output tensor.
-        \param [in] interp - an interpolation type.
-        \param [in] padding - a padding type.
-        \param [in] align - a flag to align corners.
+        \param [in] type - a type of input, grid and output tensor. Currently only FP32 is supported.
+        \param [in] interp - an interpolation type: bilinear, nearest or bicubic.
+        \param [in] padding - a padding type for out-of-bound grid coordinates.
+        \param [in] align - a flag corresponding to ONNX/PyTorch align_corners.
         \return a pointer to grid sample 2D context. On error it returns NULL. It must be released with using of function ::SimdRelease.
             This pointer is used in functions ::SimdSynetGridSample2dInternalBufferSize, and ::SimdSynetGridSample2dForward.
     */
@@ -8359,10 +8445,13 @@ extern "C"
 
         \fn size_t SimdSynetGridSample2dInternalBufferSize(const void* context);
 
-        \short Gets size of internal buffer used inside permute algorithm.
+        \short Gets the size in bytes of internal storage used by a GridSample-2D context.
+
+        The optimized bilinear/zero-padding FP32 implementation stores padded source rows, precomputed source indexes
+        and interpolation coefficients internally. The reference implementation returns 0.
 
         \param [in] context - a pointer to grid sample 2D context. It must be created by function ::SimdSynetGridSample2dInit and released by function ::SimdRelease.
-        \return size of internal buffer used inside grid sample 2D algorithm.
+        \return a number of bytes used by internal buffers.
     */
     SIMD_API size_t SimdSynetGridSample2dInternalBufferSize(const void* context);
 
@@ -8370,12 +8459,24 @@ extern "C"
 
         \fn void SimdSynetGridSample2dForward(void* context, const uint8_t* src, const uint8_t* grd, uint8_t* dst);
 
-        \short Performs forward propagation of grid sample algorithm.
+        \short Performs GridSample-2D forward propagation.
+
+        For every output pixel the function denormalizes grid coordinates to input image coordinates, applies the
+        selected padding rule for out-of-bound coordinates and samples the input with nearest, bilinear or bicubic
+        interpolation:
+        \verbatim
+        if(align)
+            x = (gridX + 1) * (srcW - 1) / 2;
+        else
+            x = ((gridX + 1) * srcW - 1) / 2;
+        y is computed in the same way from gridY and srcH.
+        dst[b, c, dy, dx] = Interpolate(src[b, c], x, y, interp, padding);
+        \endverbatim
 
         \param [in] context - a pointer to grid sample 2D context. It must be created by function ::SimdSynetGridSample2dInit and released by function ::SimdRelease.
-        \param [in] src - a pointer to input tensor. It has size = batch * channels * srcH * srcW.
-        \param [in] grd - a pointer to grid tensor. It has size = batch * dstH * dstW * 2.
-        \param [out] dst - a pointer to output tensor. It has size = batch * channels * dstH * dstW.
+        \param [in] src - a pointer to input tensor. Its shape is batch * channels * srcH * srcW.
+        \param [in] grd - a pointer to grid tensor. Its shape is batch * dstH * dstW * 2.
+        \param [out] dst - a pointer to output tensor. Its shape is batch * channels * dstH * dstW.
     */
     SIMD_API void SimdSynetGridSample2dForward(void* context, const uint8_t* src, const uint8_t* grd, uint8_t* dst);
 
@@ -8383,23 +8484,23 @@ extern "C"
 
         \fn void SimdSynetHardSigmoid32f(const float * src, size_t size, const float * scale, const float * shift, float * dst);
 
-        \short Calculates HardSigmoid activation function (https://pytorch.org/docs/stable/generated/torch.nn.Hardsigmoid.html) for 32-bit float array.
+        \short Calculates HardSigmoid activation for an FP32 array.
 
-        Input and output arrays must have the same size.
+        The input and output arrays must have the same size.
 
         Algorithm's details:
         \verbatim
         for(i = 0; i < size; ++i)
-            dst[i] = Max(0, Min(src[i] * scale + shift, 1));
+            dst[i] = Max(0, Min(src[i] * scale[0] + shift[0], 1));
         \endverbatim
 
         \note This function is used in <a href="http://github.com/ermig1979/Synet">Synet Framework</a>.
 
-        \param [in] src - a pointer to the input 32-bit float array.
-        \param [in] size - a size of input and output arrays.
-        \param [in] scale - a pointer to scale parameter. This parameter is equal to 1/6 in Pytorch documentation.
-        \param [in] shift - a pointer to shift parameter. This parameter is equal to 1/2 in Pytorch documentation.
-        \param [out] dst - a pointer to the output 32-bit float array.
+        \param [in] src - a pointer to the input FP32 array.
+        \param [in] size - a number of elements in the input and output arrays.
+        \param [in] scale - a pointer to scale parameter. Only scale[0] is used. This parameter is equal to 1/6 in PyTorch documentation.
+        \param [in] shift - a pointer to shift parameter. Only shift[0] is used. This parameter is equal to 1/2 in PyTorch documentation.
+        \param [out] dst - a pointer to the output FP32 array.
     */
     SIMD_API void SimdSynetHardSigmoid32f(const float * src, size_t size, const float * scale, const float * shift, float * dst);
 
@@ -8407,23 +8508,23 @@ extern "C"
 
         \fn void SimdSynetHswish32f(const float * src, size_t size, const float * shift, const float * scale, float * dst);
 
-        \short Calculates H-Swish activation function (https://arxiv.org/pdf/1905.02244.pdf) for 32-bit float array.
+        \short Calculates H-Swish activation for an FP32 array.
 
-        Input and output arrays must have the same size.
+        The input and output arrays must have the same size.
 
         Algorithm's details:
         \verbatim
         for(i = 0; i < size; ++i)
-            dst[i] = Max(Min(src[i], shift) + shift, 0)*scale*src[i];
+            dst[i] = Max(Min(src[i], shift[0]) + shift[0], 0)*scale[0]*src[i];
         \endverbatim
 
         \note This function is used in <a href="http://github.com/ermig1979/Synet">Synet Framework</a>.
 
-        \param [in] src - a pointer to the input 32-bit float array.
-        \param [in] size - a size of input and output arrays.
-        \param [in] shift - a pointer to shift parameter. It is equal to 3 in original paper.
-        \param [in] scale - a pointer to scale parameter. It is equal to 1/6 in original paper.
-        \param [out] dst - a pointer to the output 32-bit float array.
+        \param [in] src - a pointer to the input FP32 array.
+        \param [in] size - a number of elements in the input and output arrays.
+        \param [in] shift - a pointer to shift parameter. Only shift[0] is used. It is equal to 3 in the original paper.
+        \param [in] scale - a pointer to scale parameter. Only scale[0] is used. It is equal to 1/6 in the original paper.
+        \param [out] dst - a pointer to the output FP32 array.
     */
     SIMD_API void SimdSynetHswish32f(const float* src, size_t size, const float* shift, const float* scale, float* dst);
 
@@ -8431,13 +8532,28 @@ extern "C"
 
         \fn void * SimdSynetInnerProduct32fInit(size_t M, size_t N, size_t K, SimdBool transB, SimdBool constB, SimdBool bias, SimdConvolutionActivationType activation);
 
-        \short Initializes FP32 inner product algorithm.
+        \short Initializes an FP32 inner-product (matrix multiplication) context.
 
-        \param [in] M - a height of A and height of C matrices.
-        \param [in] N - a width of B and width of C matrices.
+        The context computes C = A*B, optionally adds bias and applies activation:
+        \verbatim
+        for(i = 0; i < M; ++i)
+            for(j = 0; j < N; ++j)
+            {
+                sum = bias ? bias[j] : 0;
+                for(k = 0; k < K; ++k)
+                    sum += A[i, k] * (transB ? B[j, k] : B[k, j]);
+                C[i, j] = Activate(sum, activation, params);
+            }
+        \endverbatim
+
+        When \a constB is SimdTrue, matrix B must be supplied to ::SimdSynetInnerProduct32fSetParams and can be
+        reordered or cached inside the context.
+
+        \param [in] M - a height of A and C matrices.
+        \param [in] N - a width of B and C matrices.
         \param [in] K - a width of A and height of B matrices.
-        \param [in] transB - a transpose matrix B before multiplication.
-        \param [in] constB - a matrix B is constant.
+        \param [in] transB - a flag indicating that B is stored as N*K instead of K*N.
+        \param [in] constB - a flag indicating that matrix B is constant and can be set once.
         \param [in] bias - a flag to add bias to output matrix C.
         \param [in] activation - an activation function type used after inner product.
         \return a pointer to FP32 inner product context. On error it returns NULL. It must be released with using of function ::SimdRelease.
@@ -8450,10 +8566,13 @@ extern "C"
 
         \fn size_t SimdSynetInnerProduct32fInternalBufferSize(const void * context);
 
-        \short Gets size of internal buffer used inside FP32 inner product algorithm.
+        \short Gets the size of internal storage used by an FP32 inner-product context.
+
+        The returned value is a number of FP32 elements. It reports implementation-specific storage such as reordered
+        constant weights and copied bias.
 
         \param [in] context - a pointer to FP32 inner product context. It must be created by function ::SimdSynetInnerProduct32fInit and released by function ::SimdRelease.
-        \return size of internal buffer used inside FP32 inner product algorithm.
+        \return a number of FP32 elements used by internal buffers.
     */
     SIMD_API size_t SimdSynetInnerProduct32fInternalBufferSize(const void* context);
 
@@ -8461,10 +8580,14 @@ extern "C"
 
         \fn size_t SimdSynetInnerProduct32fExternalBufferSize(const void * context);
 
-        \short Gets size of external buffer used in FP32 inner product algorithm.
+        \short Gets the size of caller-provided temporary buffer for FP32 inner product.
+
+        The returned value is a number of FP32 elements. The current FP32 implementations do not require an external
+        buffer and return 0, but callers can use this value when allocating the \a buf argument of
+        ::SimdSynetInnerProduct32fForward.
 
         \param [in] context - a pointer to FP32 inner product context. It must be created by function ::SimdSynetInnerProduct32fInit and released by function ::SimdRelease.
-        \return size of internal buffer used inside FP32 inner product algorithm.
+        \return a number of FP32 elements required for external temporary buffer.
     */
     SIMD_API size_t SimdSynetInnerProduct32fExternalBufferSize(const void* context);
 
@@ -8472,13 +8595,19 @@ extern "C"
 
         \fn void SimdSynetInnerProduct32fSetParams(void* context, const float* weight, SimdBool* internal, const float* bias, const float* params);
 
-        \short Sets weights, biases and parameters of activation function required for FP32 inner product algorithm.
+        \short Sets weights, bias and activation parameters for FP32 inner product.
+
+        This function must be called before ::SimdSynetInnerProduct32fForward. If \a constB was SimdTrue during
+        initialization, \a weight provides matrix B and the implementation may reorder and store it internally. If
+        \a internal is not NULL, SimdTrue means the weights were copied/reordered into the context; SimdFalse means
+        the original \a weight pointer can be used by later forward calls and must remain valid. Bias and activation
+        parameters are stored or referenced according to the selected implementation.
 
         \param [in, out] context - a pointer to FP32 inner product context. It must be created by function ::SimdSynetInnerProduct32fInit and released by function ::SimdRelease.
-        \param [in] weight - a pointer to inner product weights.
-        \param [out] internal - a flag signalizing that weight is stored in the internal buffer. Can be NULL.
-        \param [in] bias - a pointer to bias. Can be NULL.
-        \param [in] params - a pointer to parameters of activation functions (see ::SimdConvolutionActivationType). Can be NULL.
+        \param [in] weight - a pointer to FP32 matrix B weights.
+        \param [out] internal - a pointer to a flag receiving weight storage mode. Can be NULL.
+        \param [in] bias - a pointer to FP32 bias array with N elements. Can be NULL.
+        \param [in] params - a pointer to FP32 parameters of activation function (see ::SimdConvolutionActivationType). Can be NULL when activation does not require parameters.
     */
     SIMD_API void SimdSynetInnerProduct32fSetParams(void* context, const float* weight, SimdBool* internal, const float* bias, const float* params);
 
@@ -8486,15 +8615,15 @@ extern "C"
 
         \fn void SimdSynetInnerProduct32fForward(void* context, const float* A, const float* B, float *buf, float* C);
 
-        \short Performs forward propagation of FP32 inner product algorithm.
+        \short Performs FP32 inner-product forward propagation.
 
         \param [in] context - a pointer to FP32 inner product context. 
             It must be created by function ::SimdSynetInnerProduct32fInit and released by function ::SimdRelease.
-        \param [in] A - a pointer to A matrix.
-        \param [in] B - a pointer to B matrix. Can be NULL if B is constant matrix. In that case you have to set B (weight) in function SimdSynetInnerProduct16bSetParams.
-        \param [out] buf - a pointer to external buffer. The size of the external temporary buffer is determined by function ::SimdSynetInnerProduct16bExternalBufferSize.
+        \param [in] A - a pointer to FP32 A matrix with M*K elements.
+        \param [in] B - a pointer to FP32 B matrix. Can be NULL if B is constant; in that case B must be set by function ::SimdSynetInnerProduct32fSetParams.
+        \param [out] buf - a pointer to external temporary FP32 buffer. The required number of elements is determined by function ::SimdSynetInnerProduct32fExternalBufferSize.
             Can be NULL (it causes usage of internal buffer).
-        \param [out] C - a pointer to output matrix.
+        \param [out] C - a pointer to FP32 output matrix with M*N elements.
     */
     SIMD_API void SimdSynetInnerProduct32fForward(void* context, const float* A, const float* B, float *buf, float* C);
 
@@ -8502,7 +8631,7 @@ extern "C"
 
         \fn void SimdSynetInnerProductLayerForward(const float * src, const float * weight, const float * bias, size_t count, size_t size, float * dst);
 
-        \short This function is used for forward propagation of InnerProductLayer.
+        \short Performs FP32 forward propagation of a single inner-product layer.
 
         Algorithm's details:
         \verbatim
@@ -8516,12 +8645,12 @@ extern "C"
 
         \note This function is used in <a href="http://github.com/ermig1979/Synet">Synet Framework</a>.
 
-        \param [in] src - a pointer to the input 32-bit float array. The size of the array must be equal to size.
-        \param [in] weight - a pointer to the 32-bit float array with weight coefficients. The size of the array must be equal to count*size.
-        \param [in] bias - a pointer to the 32-bit float array with bias coefficients. The size of the array must be equal to count. Can be NULL. 
-        \param [in] count - a size of output array.
-        \param [in] size - a size of input array.
-        \param [out] dst - a pointer to the output 32-bit float array. The size of the array must be equal to count.
+        \param [in] src - a pointer to the input FP32 array with \a size elements.
+        \param [in] weight - a pointer to FP32 weight coefficients with count*size elements, stored as count rows.
+        \param [in] bias - a pointer to FP32 bias coefficients with \a count elements. Can be NULL.
+        \param [in] count - a number of output elements.
+        \param [in] size - a number of input elements.
+        \param [out] dst - a pointer to the output FP32 array with \a count elements.
     */
     SIMD_API void SimdSynetInnerProductLayerForward(const float * src, const float * weight, const float * bias, size_t count, size_t size, float * dst);
 
@@ -8529,27 +8658,32 @@ extern "C"
 
         \fn void* SimdSynetInnerProduct16bInit(size_t M, size_t N, size_t K, SimdTensorDataType typeA, SimdTensorDataType typeB, SimdTensorDataType typeC, SimdBool transB, SimdBool constB, SimdBool bias, SimdConvolutionActivationType activation);
 
-        \short Initializes BF16 inner product (matrix multiplication) algorithm.
+        \short Initializes a BF16/FP32 inner-product (matrix multiplication) context.
 
-        Algorithm's details (transpA = false, bias = true, activation = SimdConvolutionActivationIdentity):
+        The context computes C = A*B with FP32 accumulation, optionally adds bias and applies activation. A, B and C
+        can be FP32 or BF16 according to \a typeA, \a typeB and \a typeC:
         \verbatim
         for(i = 0; i < M; ++i)
             for(j = 0; j < N; ++j)
             {
-                C[i,j] = bias[j];
+                sum = bias ? bias[j] : 0;
                 for(k = 0; k < K; ++k)
-                    C[i,j] += A[i,k] * B[k,j];
+                    sum += A[i, k] * (transB ? B[j, k] : B[k, j]);
+                C[i, j] = ConvertToTypeC(Activate(sum, activation, params));
             }
         \endverbatim
 
-        \param [in] M - a height of A and height of C matrices.
-        \param [in] N - a width of B and width of C matrices.
+        When \a constB is SimdTrue, matrix B must be supplied to ::SimdSynetInnerProduct16bSetParams and is converted
+        or reordered into internal storage.
+
+        \param [in] M - a height of A and C matrices.
+        \param [in] N - a width of B and C matrices.
         \param [in] K - a width of A and height of B matrices.
         \param [in] typeA - a type of A matrix. It can be FP32 or BF16.
         \param [in] typeB - a type of B matrix. It can be FP32 or BF16.
         \param [in] typeC - a type of C matrix. It can be FP32 or BF16.
-        \param [in] transB - a transpose matrix B before multiplication.
-        \param [in] constB - a matrix B is constant.
+        \param [in] transB - a flag indicating that B is stored as N*K instead of K*N.
+        \param [in] constB - a flag indicating that matrix B is constant and can be set once.
         \param [in] bias - a flag to add bias to output matrix C.
         \param [in] activation - an activation function type used after inner product.
         \return a pointer to BF16 inner product context. On error it returns NULL. It must be released with using of function ::SimdRelease.
@@ -8562,10 +8696,13 @@ extern "C"
 
         \fn size_t SimdSynetInnerProduct16bInternalBufferSize(const void * context);
 
-        \short Gets size in bytes of internal buffer used inside BF16 inner product algorithm.
+        \short Gets the size in bytes of internal storage used by a BF16 inner-product context.
+
+        The returned value reports internal temporary storage, reordered constant weights, copied bias and copied
+        activation parameters.
 
         \param [in] context - a pointer to BF16 inner product context. It must be created by function ::SimdSynetInnerProduct16bInit and released by function ::SimdRelease.
-        \return size in bytes of internal buffer used inside BF16 inner product algorithm.
+        \return a number of bytes used by internal buffers.
     */
     SIMD_API size_t SimdSynetInnerProduct16bInternalBufferSize(const void* context);
 
@@ -8573,10 +8710,14 @@ extern "C"
 
         \fn size_t SimdSynetInnerProduct16bExternalBufferSize(const void * context);
 
-        \short Gets size in bytes of external buffer used in BF16 inner product algorithm.
+        \short Gets the size in bytes of caller-provided temporary buffer for BF16 inner product.
+
+        The returned value depends on matrix types and implementation. It covers temporary BF16 copies of FP32 inputs,
+        packed non-constant B matrices, FP32 accumulation buffers and optional post-processing buffers. It can be used
+        to allocate the \a buf argument of ::SimdSynetInnerProduct16bForward.
 
         \param [in] context - a pointer to BF16 inner product context. It must be created by function ::SimdSynetInnerProduct16bInit and released by function ::SimdRelease.
-        \return size in bytes of external buffer used in BF16 inner product algorithm.
+        \return a number of bytes required for external temporary buffer.
     */
     SIMD_API size_t SimdSynetInnerProduct16bExternalBufferSize(const void* context);
 
@@ -8584,23 +8725,33 @@ extern "C"
 
         \fn const char* SimdSynetInnerProduct16bInfo(const void * context);
 
-        \short Gets string with description of internal implementation of BF16 inner product algorithm.
+        \short Gets a short description of the selected BF16 inner-product implementation.
+
+        The returned string contains the implementation extension, algorithm name and parameter summary. The returned
+        pointer is owned by the context and remains valid until the next call of this function for the same context or
+        until the context is released.
 
         \param [in] context - a pointer to BF16 inner product context. It must be created by function ::SimdSynetInnerProduct16bInit and released by function ::SimdRelease.
-        \return string with description of internal implementation of BF16 inner product algorithm.
+        \return a string with description of internal implementation of BF16 inner product algorithm.
     */
     SIMD_API const char* SimdSynetInnerProduct16bInfo(const void* context);
 
     /*! @ingroup synet_inner_product_bf16
 
-        \fn void SimdSynetInnerProduct16bSetParams(void* context, const float* weight, SimdBool* internal, const float* bias);
+        \fn void SimdSynetInnerProduct16bSetParams(void* context, const float* weight, const float* bias, const float* params);
+
+        \short Sets weights, bias and activation parameters for BF16 inner product.
 
-        \short Sets weights and biases required for BF16 inner product algorithm.
+        This function must be called before ::SimdSynetInnerProduct16bForward. If \a constB was SimdTrue during
+        initialization, \a weight provides matrix B in FP32 form and the implementation converts it to BF16 and may
+        reorder it into internal storage. Bias is copied to an internal FP32 array; when \a bias is NULL, zeros are
+        used. Activation parameters are copied or expanded to the internal FP32 array according to
+        ::SimdConvolutionActivationType.
 
         \param [in, out] context - a pointer to BF16 inner product context. It must be created by function ::SimdSynetInnerProduct16bInit and released by function ::SimdRelease.
-        \param [in] weight - a pointer to B matrix. Can be NULL.
-        \param [in] bias - a pointer to bias. Can be NULL.
-        \param [in] params - a pointer to parameters of activation functions (see ::SimdConvolutionActivationType). Can be NULL.
+        \param [in] weight - a pointer to FP32 matrix B weights. Can be NULL only when B is not constant.
+        \param [in] bias - a pointer to FP32 bias array with N elements. Can be NULL.
+        \param [in] params - a pointer to FP32 parameters of activation function (see ::SimdConvolutionActivationType). Can be NULL when activation does not require parameters.
     */
     SIMD_API void SimdSynetInnerProduct16bSetParams(void* context, const float* weight, const float* bias, const float* params);
 
@@ -8608,17 +8759,21 @@ extern "C"
 
         \fn void SimdSynetInnerProduct16bForward(void* context, const uint8_t* A, const uint8_t* B, uint8_t* buf, uint8_t* C);
 
-        \short Performs forward propagation of BF16 inner product algorithm.
+        \short Performs BF16/FP32 inner-product forward propagation.
+
+        The function converts FP32 A or B inputs to BF16 when requested by the context, uses BF16 inputs directly
+        otherwise, accumulates the matrix product in FP32, adds bias, applies activation and writes FP32 or BF16
+        output according to \a typeC.
 
         \param [in] context - a pointer to BF16 inner product context. 
             It must be created by function ::SimdSynetInnerProduct16bInit and released by function ::SimdRelease.
-        \param [in] A - a pointer to A matrix.
-        \param [in] B - a pointer to B matrix. Can be NULL if B is constant matrix. 
-            In that case you have to set B (weight) in function SimdSynetInnerProduct16bSetParams.
-        \param [out] buf - a pointer to external buffer. 
-            The size of the external temporary buffer is determined by function ::SimdSynetInnerProduct16bExternalBufferSize. 
+        \param [in] A - a pointer to A matrix. Actual element type is defined by \a typeA in initialization.
+        \param [in] B - a pointer to B matrix. Can be NULL if B is constant; in that case B must be set by function ::SimdSynetInnerProduct16bSetParams.
+            Actual element type is defined by \a typeB in initialization for non-constant B.
+        \param [out] buf - a pointer to external temporary byte buffer.
+            The required size is determined by function ::SimdSynetInnerProduct16bExternalBufferSize.
             Can be NULL (it causes usage of internal buffer).
-        \param [out] C - a pointer to output matrix.
+        \param [out] C - a pointer to output matrix. Actual element type is defined by \a typeC in initialization.
     */
     SIMD_API void SimdSynetInnerProduct16bForward(void* context, const uint8_t* A, const uint8_t* B, uint8_t* buf, uint8_t* C);
 
@@ -8626,7 +8781,7 @@ extern "C"
 
         \fn void SimdSynetInnerProduct8i(size_t M, size_t N, size_t K, const uint8_t * src, const int8_t * weight, int32_t * dst, SimdSynetCompatibilityType compatibility);
 
-        \short This function is used for INT8 forward propagation of InnerProductLayer.
+        \short Performs UINT8-by-INT8 inner product with INT32 output.
 
         Algorithm's details:
         \verbatim
@@ -8636,21 +8791,25 @@ extern "C"
             {
                 sum = 0;
                 for (k = 0; k < K; ++k)
-                    sum += src[i * K + k] * weight[j * K + k];
+                    sum += int(src[i*K + k]) * int(weight[j*K + k]);
                 dst[i*N + j] = sum;
             }
         }
         \endverbatim
 
+        When compatibility flags allow overflow-compatible multiplication, adjacent products can be accumulated with
+        16-bit saturation before being added to the INT32 sum. Use ::SimdSynetCompatibility8iPrecise to request the
+        precise product accumulation path.
+
         \note This function is used in <a href="http://github.com/ermig1979/Synet">Synet Framework</a>.
 
-        \param [in] M - a batch size.
-        \param [in] N - an output size.
-        \param [in] K - an input size.
-        \param [in] src - a pointer to the input 8-bit unsigned integer array. The size of the array must be equal to M*K.
-        \param [in] weight - a pointer to the 8-bit signed integer array with weight. The size of the array must be equal to N*K.
-        \param [out] dst - a pointer to the output 32-bit integer array. The size of the array must be equal to M*N.
-        \param [in] compatibility - a flags of calculation compatibility.
+        \param [in] M - a batch size, or a number of input rows.
+        \param [in] N - an output size, or a number of weight rows.
+        \param [in] K - an input size, or a row length.
+        \param [in] src - a pointer to the UINT8 input matrix with M*K elements.
+        \param [in] weight - a pointer to the INT8 weight matrix with N*K elements, stored by output row.
+        \param [out] dst - a pointer to the INT32 output matrix with M*N elements.
+        \param [in] compatibility - calculation compatibility flags (see ::SimdSynetCompatibilityType).
     */
     SIMD_API void SimdSynetInnerProduct8i(size_t M, size_t N, size_t K, const uint8_t * src, const int8_t * weight, int32_t * dst, SimdSynetCompatibilityType compatibility);