From 3f95fcf8cc0cedafa2235fc61f504fcafbcd53cf Mon Sep 17 00:00:00 2001
From: Cursor Agent <cursoragent@cursor.com>
Date: Fri, 5 Jun 2026 08:45:31 +0000
Subject: [PATCH 1/5] Improve Synet API documentation

Co-authored-by: Ihar Yermalayeu <ermig@tut.by>
---
 src/Simd/SimdLib.h | 176 ++++++++++++++++++++++++++++++---------------
 1 file changed, 120 insertions(+), 56 deletions(-)

diff --git a/src/Simd/SimdLib.h b/src/Simd/SimdLib.h
index 336f11c788..fcbe4fec2a 100644
--- a/src/Simd/SimdLib.h
+++ b/src/Simd/SimdLib.h
@@ -7364,15 +7364,29 @@ extern "C"
 
         \fn void* SimdSynetAdd16bInit(const size_t* aShape, size_t aCount, SimdTensorDataType aType, const size_t* bShape, size_t bCount, SimdTensorDataType bType, SimdTensorDataType dstType, SimdTensorFormatType format);
 
-        \short Initializes add algorithm.
+        \short Initializes element-wise addition of two tensors in FP32 or BF16 format.
+
+        The created context adds two tensors with equal shapes (shapeSize is the total number of tensor elements):
+        \verbatim
+        for(i = 0; i < shapeSize; ++i)
+        {
+            A = aType == SimdTensorData16b ? BFloat16ToFloat32(a[i]) : a[i];
+            B = bType == SimdTensorData16b ? BFloat16ToFloat32(b[i]) : b[i];
+            D = A + B;
+            dst[i] = dstType == SimdTensorData16b ? Float32ToBFloat16(D) : D;
+        }
+        \endverbatim
+
+        The current implementation creates a context only for equal input shapes, FP32/BF16 input and output tensor types,
+        and SimdTensorFormatUnknown, SimdTensorFormatNchw or SimdTensorFormatNhwc tensor format.
 
         \param [in] aShape - a pointer to shape of input A tensor.
         \param [in] aCount - a count of dimensions of input A tensor.
-        \param [in] aType - a type of input A tensor. Can be FP32 of BF16.
+        \param [in] aType - a type of input A tensor. Can be FP32 or BF16.
         \param [in] bShape - a pointer to shape of input B tensor.
         \param [in] bCount - a count of dimensions of input B tensor.
-        \param [in] bType - a type of input B tensor. Can be FP32 of BF16.        
-        \param [in] dstType - a type of output tensor. Can be FP32 of BF16.
+        \param [in] bType - a type of input B tensor. Can be FP32 or BF16.
+        \param [in] dstType - a type of output tensor. Can be FP32 or BF16.
         \param [in] format - a format of input / output tensors.
         \return a pointer to add context. On error it returns NULL. It must be released with using of function ::SimdRelease.
             This pointer is used in function ::SimdSynetAdd16bForward.
@@ -7383,7 +7397,11 @@ extern "C"
 
         \fn void SimdSynetAdd16bForward(void* context, const uint8_t* a, const uint8_t* b, uint8_t* dst);
 
-        \short Performs forward propagation of add algorithm.
+        \short Performs element-wise addition of two FP32/BF16 tensors.
+
+        The function adds corresponding elements of input tensors A and B using a context created by ::SimdSynetAdd16bInit.
+        The actual data types, tensor shape and output type are stored in the context. BF16 input values are converted to
+        FP32 before addition, and BF16 output values are converted from FP32 after addition.
 
         \param [in] context - a pointer to add context. It must be created by function ::SimdSynetAdd16bInit and released by function ::SimdRelease.
         \param [in] a - a pointer to input A tensor.
@@ -7396,22 +7414,28 @@ extern "C"
 
         \fn void SimdSynetAddBias(const float * bias, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format);
 
-        \short Adds a bias to given vector.
+        \short Adds per-channel bias to an FP32 tensor in place.
 
         Algorithm's details (example for NCHW tensor format):
         \verbatim
         for(c = 0; c < channels; ++c)
-            for(j = 0; j < spatial; ++j)
+            for(s = 0; s < spatial; ++s)
                  dst[c*spatial + s] += bias[c];
         \endverbatim
+        Algorithm's details (example for NHWC tensor format):
+        \verbatim
+        for(s = 0; s < spatial; ++s)
+            for(c = 0; c < channels; ++c)
+                 dst[s*channels + c] += bias[c];
+        \endverbatim
 
         \note This function is used in <a href="http://github.com/ermig1979/Synet">Synet Framework</a>.
 
         \param [in] bias - a pointer to the 32-bit float array with bias coefficients. The size of the array is equal to channels.
-        \param [in] channels - a number of channels in the image tensor.
-        \param [in] spatial - a spatial size of image tensor.
-        \param [in, out] dst - a pointer to cumulative 32-bit image tensor. The size of the array is equal to channels * spatial.
-        \param [in] format - a format of image tensor.
+        \param [in] channels - a number of channels in the tensor.
+        \param [in] spatial - a spatial size (height * width) of the tensor.
+        \param [in, out] dst - a pointer to FP32 tensor updated in place. The size of the array is equal to channels * spatial.
+        \param [in] format - a format of the tensor.
     */
     SIMD_API void SimdSynetAddBias(const float * bias, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format);
 
@@ -7419,37 +7443,40 @@ extern "C"
 
         \fn void SimdSynetAdd8i(const uint8_t * aData, const float * aScale, const float* aShift, const uint8_t* bData, const float* bScale, const float* bShift, uint8_t* cData, const float* cScale, const float* cShift, size_t batch, size_t channels, size_t spatial, SimdTensorFormatType format, SimdSynetCompatibilityType compatibility);
 
-        \short Adds two INT8 tensors.
+        \short Dequantizes, adds and requantizes two UINT8 tensors.
 
-         Algorithm's details (example for NCHW tensor format):
+        Algorithm's details (example for NCHW tensor format):
         \verbatim
+        upper = isNarrowed(compatibility) ? 180 : 255;
         for(b = 0; b < batch; ++b)
             for(c = 0; c < channels; ++c)
                 for(s = 0; s < spatial; ++s)
                 {
                      offs = (b*channels + c)*spatial + s;
-                     A = aData[offs]*aScale[c] + aShift[c]; 
+                     A = aData[offs]*aScale[c] + aShift[c];
                      B = bData[offs]*bScale[c] + bShift[c];
-                     cData[offs] = round((A + B)*cScale[c] + cShift[c]);
+                     C = round((A + B)*cScale[c] + cShift[c]);
+                     cData[offs] = restrict(C, 0, upper);
                 }
         \endverbatim
+        For NHWC tensor format the same calculation uses offset (b*spatial + s)*channels + c.
 
         \note This function is used in <a href="http://github.com/ermig1979/Synet">Synet Framework</a>.
 
-        \param [in] aData - a pointer to the first input 8-bit integer tensor.
-        \param [in] aScale - a pointer to the 32-bit float array with scale coefficients of the first input tensor.
-        \param [in] aShift - a pointer to the 32-bit float array with shift coefficients of the first input tensor.
-        \param [in] bData - a pointer to the second input 8-bit integer tensor.
-        \param [in] bScale - a pointer to the 32-bit float array with scale coefficients of the second input tensor.
-        \param [in] bShift - a pointer to the 32-bit float array with shift coefficients of the second input tensor.
-        \param [out] cData - a pointer to the output 8-bit integer tensor.
-        \param [in] cScale - a pointer to the 32-bit float array with scale coefficients of the output tensor.
-        \param [in] cShift - a pointer to the 32-bit float array with shift coefficients of the output tensor.
-        \param [in] batch - a batch size of input and output image tensors.
-        \param [in] channels - a number of channels in input and output image tensors.
-        \param [in] spatial - a spatial size of input and output image tensors.
-        \param [in] format - a format of input and output image tensors.
-        \param [in] compatibility - a flags of calculation compatibility.
+        \param [in] aData - a pointer to the first input UINT8 tensor.
+        \param [in] aScale - a pointer to the 32-bit float array with per-channel scale coefficients of the first input tensor.
+        \param [in] aShift - a pointer to the 32-bit float array with per-channel shift coefficients of the first input tensor.
+        \param [in] bData - a pointer to the second input UINT8 tensor.
+        \param [in] bScale - a pointer to the 32-bit float array with per-channel scale coefficients of the second input tensor.
+        \param [in] bShift - a pointer to the 32-bit float array with per-channel shift coefficients of the second input tensor.
+        \param [out] cData - a pointer to the output UINT8 tensor.
+        \param [in] cScale - a pointer to the 32-bit float array with per-channel scale coefficients of the output tensor.
+        \param [in] cShift - a pointer to the 32-bit float array with per-channel shift coefficients of the output tensor.
+        \param [in] batch - a batch size of input and output tensors.
+        \param [in] channels - a number of channels in input and output tensors.
+        \param [in] spatial - a spatial size (height * width) of input and output tensors.
+        \param [in] format - a format of input and output tensors. Can be NCHW or NHWC.
+        \param [in] compatibility - calculation compatibility flags. When narrowed 8-bit mode is active, output is limited to [0, 180], otherwise to [0, 255].
     */
     SIMD_API void SimdSynetAdd8i(const uint8_t * aData, const float * aScale, const float* aShift, const uint8_t* bData, const float* bScale, const float* bShift,
         uint8_t* cData, const float* cScale, const float* cShift, size_t batch, size_t channels, size_t spatial, SimdTensorFormatType format, SimdSynetCompatibilityType compatibility);
@@ -7458,20 +7485,30 @@ extern "C"
 
         \fn void SimdSynetChannelSum16b(const uint16_t* src, size_t channels, size_t spatial, SimdTensorFormatType format, float* sum);
 
-        \short Calculates channels sums in FP32 format for input tensor in BF16 format.
+        \short Calculates per-channel sums of a BF16 tensor in FP32 format.
 
-        Algorithm's details (example for NCHW tensor format) :
+        Algorithm's details (example for NCHW tensor format):
         \verbatim
         for(c = 0; c < channels; ++c)
+        {
             sum[c] = 0;
             for(s = 0; s < spatial; ++s)
-                sum[c] += src[c, s];
+                sum[c] += BFloat16ToFloat32(src[c*spatial + s]);
+        }
+        \endverbatim
+        Algorithm's details (example for NHWC tensor format):
+        \verbatim
+        for(c = 0; c < channels; ++c)
+            sum[c] = 0;
+        for(s = 0; s < spatial; ++s)
+            for(c = 0; c < channels; ++c)
+                sum[c] += BFloat16ToFloat32(src[s*channels + c]);
         \endverbatim
 
         \note This function is used in <a href="http://github.com/ermig1979/Synet">Synet Framework</a>.
 
-        \param [in] src - a pointer to the input 16-bit brain-float tensor.
-        \param [in] channels - a number of channels in input and output arrays.
+        \param [in] src - a pointer to the input BF16 tensor.
+        \param [in] channels - a number of channels in the input tensor. It is also the size of the output sum array.
         \param [in] spatial - a spatial (width * height) size of input tensor.
         \param [in] format - a format of input tensor.
         \param [out] sum - a pointer to output 32-bit float array with channels sums.
@@ -7482,20 +7519,34 @@ extern "C"
 
         \fn void SimdSynetConvert32fTo8u(const float * src, size_t batch, size_t channels, size_t height, size_t width, SimdTensorFormatType format, const float* scale, const float * shift, uint8_t * dst, SimdSynetCompatibilityType compatibility);
 
-        \short Converts 32-bit float point image to 8-bit unsigned integer image.
+        \short Converts an FP32 tensor to a UINT8 tensor using per-channel scale and shift.
+
+        Algorithm's details (example for NCHW tensor format):
+        \verbatim
+        upper = isNarrowed(compatibility) ? 180 : 255;
+        for(b = 0; b < batch; ++b)
+            for(c = 0; c < channels; ++c)
+                for(h = 0; h < height; ++h)
+                    for(w = 0; w < width; ++w)
+                    {
+                        offs = ((b*channels + c)*height + h)*width + w;
+                        dst[offs] = restrict(round(src[offs]*scale[c] + shift[c]), 0, upper);
+                    }
+        \endverbatim
+        For NHWC tensor format the same calculation uses offset ((b*height + h)*width + w)*channels + c.
 
         \note This function is used in <a href="http://github.com/ermig1979/Synet">Synet Framework</a>. 
 
-        \param [in] src - a pointer to the 32-bit float array with input image tensor. 
-        \param [in] batch - a number of images in the batch of (input/output) image tensor.
-        \param [in] channels - a number of channels in the (input/output) image tensor.
-        \param [in] height - a height of (input/output) image tensor.
-        \param [in] width - a width of (input/output) image tensor.
-        \param [in] format - a format of (input/output) image tensor.
-        \param [in] scale - a pointer to the 32-bit float array with scale coefficients. 
-        \param [in] shift - a pointer to the 32-bit float array with shift coefficients. 
-        \param [out] dst - a pointer to the 8-bit unsigned integer array with output image tensor. 
-        \param [in] compatibility - a flags of calculation compatibility.
+        \param [in] src - a pointer to the FP32 input tensor.
+        \param [in] batch - a batch size of input and output tensors.
+        \param [in] channels - a number of channels in input and output tensors.
+        \param [in] height - a height of input and output tensors.
+        \param [in] width - a width of input and output tensors.
+        \param [in] format - a format of input and output tensors. Can be NCHW or NHWC.
+        \param [in] scale - a pointer to the 32-bit float array with per-channel scale coefficients.
+        \param [in] shift - a pointer to the 32-bit float array with per-channel shift coefficients.
+        \param [out] dst - a pointer to the UINT8 output tensor.
+        \param [in] compatibility - calculation compatibility flags. When narrowed 8-bit mode is active, output is limited to [0, 180], otherwise to [0, 255].
     */
     SIMD_API void SimdSynetConvert32fTo8u(const float * src, size_t batch, size_t channels, size_t height, size_t width, SimdTensorFormatType format, const float* scale, const float * shift, uint8_t* dst, SimdSynetCompatibilityType compatibility);
 
@@ -7503,20 +7554,33 @@ extern "C"
 
         \fn void SimdSynetConvert8uTo32f(const uint8_t* src, size_t batch, size_t channels, size_t height, size_t width, SimdTensorFormatType format, const float* scale, const float* shift, float* dst, SimdSynetCompatibilityType compatibility);
 
-        \short Converts 8-bit unsigned integer image to 32-bit float point image.
+        \short Converts a UINT8 tensor to an FP32 tensor using per-channel scale and shift.
+
+        Algorithm's details (example for NCHW tensor format):
+        \verbatim
+        for(b = 0; b < batch; ++b)
+            for(c = 0; c < channels; ++c)
+                for(h = 0; h < height; ++h)
+                    for(w = 0; w < width; ++w)
+                    {
+                        offs = ((b*channels + c)*height + h)*width + w;
+                        dst[offs] = src[offs]*scale[c] + shift[c];
+                    }
+        \endverbatim
+        For NHWC tensor format the same calculation uses offset ((b*height + h)*width + w)*channels + c.
 
         \note This function is used in <a href="http://github.com/ermig1979/Synet">Synet Framework</a>.
 
-        \param [in] src - a pointer to the 8-bit unsigned integer array with input image tensor.
-        \param [in] batch - a number of images in the batch of (input/output) image tensor.
-        \param [in] channels - a number of channels in the (input/output) image tensor.
-        \param [in] height - a height of (input/output) image tensor.
-        \param [in] width - a width of (input/output) image tensor.
-        \param [in] format - a format of (input/output) image tensor.
-        \param [in] scale - a pointer to the 32-bit float array with scale coefficients.
-        \param [in] shift - a pointer to the 32-bit float array with shift coefficients.
-        \param [out] dst - a pointer to the array with 32-bit float output image tensor.
-        \param [in] compatibility - a flags of calculation compatibility.
+        \param [in] src - a pointer to the UINT8 input tensor.
+        \param [in] batch - a batch size of input and output tensors.
+        \param [in] channels - a number of channels in input and output tensors.
+        \param [in] height - a height of input and output tensors.
+        \param [in] width - a width of input and output tensors.
+        \param [in] format - a format of input and output tensors. Can be NCHW or NHWC.
+        \param [in] scale - a pointer to the 32-bit float array with per-channel scale coefficients.
+        \param [in] shift - a pointer to the 32-bit float array with per-channel shift coefficients.
+        \param [out] dst - a pointer to the FP32 output tensor.
+        \param [in] compatibility - calculation compatibility flags.
     */
     SIMD_API void SimdSynetConvert8uTo32f(const uint8_t* src, size_t batch, size_t channels, size_t height, size_t width, SimdTensorFormatType format, const float* scale, const float* shift, float* dst, SimdSynetCompatibilityType compatibility);
 

From 7beebc0c5764a98e73bb0a2250504d3f56416d60 Mon Sep 17 00:00:00 2001
From: Cursor Agent <cursoragent@cursor.com>
Date: Fri, 5 Jun 2026 08:58:44 +0000
Subject: [PATCH 2/5] Improve FP32 convolution API documentation

Co-authored-by: Ihar Yermalayeu <ermig@tut.by>
---
 src/Simd/SimdLib.h | 79 ++++++++++++++++++++++++++++++++++++----------
 1 file changed, 62 insertions(+), 17 deletions(-)

diff --git a/src/Simd/SimdLib.h b/src/Simd/SimdLib.h
index fcbe4fec2a..ae90f46877 100644
--- a/src/Simd/SimdLib.h
+++ b/src/Simd/SimdLib.h
@@ -7588,10 +7588,21 @@ extern "C"
 
         \fn void * SimdSynetConvolution32fInit(size_t batch, const SimdConvolutionParameters * conv);
 
-        \short Initializes FP32 convolution algorithm.
+        \short Initializes an FP32 convolution context.
+
+        The function validates convolution parameters and chooses a suitable implementation (direct, depthwise,
+        Winograd, NHWC-specialized or GEMM-based). It supports FP32 source and destination tensors with matching
+        NCHW or NHWC format. The destination spatial size must match convolution parameters:
+        \verbatim
+        dstH = (srcH + padY + padH - (dilationY*(kernelY - 1) + 1)) / strideY + 1
+        dstW = (srcW + padX + padW - (dilationX*(kernelX - 1) + 1)) / strideX + 1
+        \endverbatim
+
+        A created context stores tensor shape, format, convolution geometry, group count and activation type.
+        Weights, bias and activation parameters are attached later by ::SimdSynetConvolution32fSetParams.
 
         \param [in] batch - a batch size.
-        \param [in] conv - a pointer to convolution parameters.
+        \param [in] conv - a pointer to convolution parameters. Source and destination tensor types must be FP32.
         \return a pointer to FP32 convolution context. On error it returns NULL. It must be released with using of function ::SimdRelease.
             This pointer is used in functions ::SimdSynetConvolution32fExternalBufferSize, ::SimdSynetConvolution32fInternalBufferSize, 
             ::SimdSynetConvolution32fInfo, ::SimdSynetConvolution32fSetParams and ::SimdSynetConvolution32fForward.
@@ -7602,10 +7613,14 @@ extern "C"
 
         \fn size_t SimdSynetConvolution32fExternalBufferSize(const void * context);
 
-        \short Gets size of external temporary buffer required for FP32 convolution algorithm.
+        \short Gets the size of the caller-provided temporary buffer for FP32 convolution.
+
+        The returned value is a number of 32-bit float elements, not bytes. It depends on the implementation selected
+        during initialization and can be used to allocate the \a buf argument of ::SimdSynetConvolution32fForward.
+        Some implementations return 1 when they do not need external temporary storage.
 
         \param [in] context - a pointer to FP32 convolution context. It must be created by function ::SimdSynetConvolution32fInit and released by function ::SimdRelease.
-        \return size of external temporary buffer required for FP32 convolution algorithm.
+        \return a number of FP32 elements required for external temporary buffer.
     */
     SIMD_API size_t SimdSynetConvolution32fExternalBufferSize(const void * context);
 
@@ -7613,10 +7628,14 @@ extern "C"
 
         \fn size_t SimdSynetConvolution32fInternalBufferSize(const void * context);
 
-        \short Gets size of internal buffer used inside FP32 convolution algorithm.
+        \short Gets the size of internal storage used by an FP32 convolution context.
+
+        The returned value is a number of 32-bit float elements, not bytes. It reports internal storage tracked by
+        the selected implementation, such as internal temporary buffers and implementation-specific reordered weights,
+        bias or activation parameters already allocated by the context.
 
         \param [in] context - a pointer to FP32 convolution context. It must be created by function ::SimdSynetConvolution32fInit and released by function ::SimdRelease.
-        \return size of internal buffer used inside FP32 convolution algorithm.
+        \return a number of FP32 elements used by internal buffers.
     */
     SIMD_API size_t SimdSynetConvolution32fInternalBufferSize(const void * context);
 
@@ -7624,10 +7643,14 @@ extern "C"
 
         \fn const char* SimdSynetConvolution32fInfo(const void* context);
 
-        \short Gets description of internal implementation of FP32 convolution algorithm.
+        \short Gets a short description of the selected FP32 convolution implementation.
+
+        The returned string contains the implementation extension and algorithm name, for example a direct, depthwise,
+        Winograd, NHWC direct or GEMM-based variant. The returned pointer is owned by the context and remains valid
+        until the next call of this function for the same context or until the context is released.
 
         \param [in] context - a pointer to FP32 convolution context. It must be created by function ::SimdSynetConvolution32fInit and released by function ::SimdRelease.
-        \return string with description of internal implementation of FP32 convolution algorithm.
+        \return a string with description of internal implementation of FP32 convolution algorithm.
     */
     SIMD_API const char* SimdSynetConvolution32fInfo(const void* context);
 
@@ -7635,13 +7658,22 @@ extern "C"
 
         \fn void SimdSynetConvolution32fSetParams(void * context, const float * weight, SimdBool * internal, const float * bias, const float * params);
 
-        \short Sets weights, biases and parameters of activation function required for FP32 convolution algorithm.
+        \short Sets weights, bias and activation parameters for FP32 convolution.
+
+        This function must be called before ::SimdSynetConvolution32fForward. The \a weight array contains FP32
+        convolution weights with kernelY*kernelX*srcC*dstC/group elements. Depending on the selected implementation,
+        weights can be used directly or transformed and stored inside the context. If \a internal is not NULL, the
+        selected implementation writes the weight storage mode to it: SimdTrue means that weights were transformed and
+        stored internally, while SimdFalse means that the implementation may use the original \a weight array directly,
+        so the caller must keep it valid for later forward calls. Bias and activation parameters can also be copied
+        internally by some implementations; otherwise their pointers are stored in the context.
 
         \param [in, out] context - a pointer to FP32 convolution context. It must be created by function ::SimdSynetConvolution32fInit and released by function ::SimdRelease.
-        \param [in] weight - a pointer to convolution weights.
-        \param [out] internal - a flag signalizing that weight is stored in the internal buffer. Can be NULL.
-        \param [in] bias - a pointer to bias. Can be NULL.
-        \param [in] params - a pointer to parameters of activation functions (see ::SimdConvolutionActivationType). Can be NULL.
+        \param [in] weight - a pointer to FP32 convolution weights.
+        \param [out] internal - a pointer to a flag that receives SimdTrue if weights were transformed and stored internally, or SimdFalse otherwise. Can be NULL.
+        \param [in] bias - a pointer to FP32 bias array with dstC elements. Can be NULL.
+        \param [in] params - a pointer to FP32 parameters of activation function (see ::SimdConvolutionActivationType).
+            Can be NULL when activation does not require parameters.
     */
     SIMD_API void SimdSynetConvolution32fSetParams(void * context, const float * weight, SimdBool * internal, const float * bias, const float * params);
 
@@ -7649,12 +7681,25 @@ extern "C"
 
         \fn void SimdSynetConvolution32fForward(void * context, const float * src, float * buf, float * dst);
 
-        \short Performs forward propagation of FP32 convolution algorithm.
+        \short Performs forward propagation of FP32 convolution.
+
+        The function convolves each image in the batch, adds bias when it was set, and applies the activation specified
+        in ::SimdConvolutionParameters:
+        \verbatim
+        sum = bias == NULL ? 0 : bias[dc];
+        for(sc = 0; sc < srcC/group; ++sc)
+            for(ky = 0; ky < kernelY; ++ky)
+                for(kx = 0; kx < kernelX; ++kx)
+                    sum += src[inputOffset] * weight[weightOffset];
+        dst[outputOffset] = Activate(sum, activation, params);
+        \endverbatim
+        The exact offsets depend on tensor format, padding, dilation, stride and group. The input and output tensors
+        use the shape and format from the context created by ::SimdSynetConvolution32fInit.
 
         \param [in] context - a pointer to FP32 convolution context. It must be created by function ::SimdSynetConvolution32fInit and released by function ::SimdRelease.
-        \param [in] src - a pointer to input tensor.
-        \param [out] buf - a pointer to external temporary buffer. The size of the external temporary buffer is determined by function ::SimdSynetConvolution32fExternalBufferSize. Can be NULL (it causes usage of internal buffer).
-        \param [out] dst - a pointer to output tensor.
+        \param [in] src - a pointer to FP32 input tensor.
+        \param [out] buf - a pointer to external temporary FP32 buffer. The required number of elements is determined by function ::SimdSynetConvolution32fExternalBufferSize. Can be NULL (it causes usage of internal buffer).
+        \param [out] dst - a pointer to FP32 output tensor.
     */
     SIMD_API void SimdSynetConvolution32fForward(void * context, const float * src, float * buf, float * dst);
 

From 2e7e3ba144937bd0715d23bd72b738e381e5e21d Mon Sep 17 00:00:00 2001
From: Cursor Agent <cursoragent@cursor.com>
Date: Fri, 5 Jun 2026 09:02:55 +0000
Subject: [PATCH 3/5] Improve BF16 convolution API documentation

Co-authored-by: Ihar Yermalayeu <ermig@tut.by>
---
 src/Simd/SimdLib.h | 84 ++++++++++++++++++++++++++++++++++++----------
 1 file changed, 66 insertions(+), 18 deletions(-)

diff --git a/src/Simd/SimdLib.h b/src/Simd/SimdLib.h
index ae90f46877..1de9cb5d2b 100644
--- a/src/Simd/SimdLib.h
+++ b/src/Simd/SimdLib.h
@@ -7707,11 +7707,24 @@ extern "C"
 
         \fn void * SimdSynetConvolution16bInit(size_t batch, const SimdConvolutionParameters * conv, SimdSynetCompatibilityType compatibility);
 
-        \short Initializes BF16 convolution algorithm.
+        \short Initializes a BF16/FP32 convolution context.
+
+        The function validates convolution parameters and chooses a suitable BF16-oriented implementation (GEMM,
+        NCHW/NHWC GEMM, NHWC depthwise, NHWC special convolution or AMX-BF16 variant when available). It supports
+        FP32 or BF16 source and destination tensors with matching NCHW or NHWC format. The destination spatial size
+        must match convolution parameters:
+        \verbatim
+        dstH = (srcH + padY + padH - (dilationY*(kernelY - 1) + 1)) / strideY + 1
+        dstW = (srcW + padX + padW - (dilationX*(kernelX - 1) + 1)) / strideX + 1
+        \endverbatim
+
+        A created context stores tensor shape, data types, format, convolution geometry, group count, activation type
+        and compatibility flags. FP32 weights, bias and activation parameters are attached later by
+        ::SimdSynetConvolution16bSetParams.
 
         \param [in] batch - a batch size.
-        \param [in] conv - a pointer to convolution parameters.
-        \param [in] compatibility - a flags of calculation compatibility.
+        \param [in] conv - a pointer to convolution parameters. Source and destination tensor types must be FP32 or BF16.
+        \param [in] compatibility - calculation compatibility flags.
         \return a pointer to BF16 convolution context. On error it returns NULL. It must be released with using of function ::SimdRelease.
             This pointer is used in functions ::SimdSynetConvolution16bExternalBufferSize, ::SimdSynetConvolution16bInternalBufferSize,
             ::SimdSynetConvolution16bInfo, ::SimdSynetConvolution16bSetParams and ::SimdSynetConvolution16bForward.
@@ -7722,10 +7735,14 @@ extern "C"
 
         \fn size_t SimdSynetConvolution16bExternalBufferSize(const void * context);
 
-        \short Gets size in bytes of external temporary buffer required for BF16 convolution algorithm.
+        \short Gets the size in bytes of the caller-provided temporary buffer for BF16 convolution.
+
+        The returned value is a number of bytes. It depends on the implementation selected during initialization and
+        can be used to allocate the \a buf argument of ::SimdSynetConvolution16bForward. Some implementations return 1
+        or 0 when they do not need external temporary storage.
 
         \param [in] context - a pointer to BF16 convolution context. It must be created by function ::SimdSynetConvolution16bInit and released by function ::SimdRelease.
-        \return size of external temporary buffer required for BF16 convolution algorithm.
+        \return a number of bytes required for external temporary buffer.
     */
     SIMD_API size_t SimdSynetConvolution16bExternalBufferSize(const void* context);
 
@@ -7733,10 +7750,13 @@ extern "C"
 
         \fn size_t SimdSynetConvolution16bInternalBufferSize(const void * context);
 
-        \short Gets size (in bytes) of internal buffer used inside BF16 convolution algorithm.
+        \short Gets the size in bytes of internal storage used by a BF16 convolution context.
+
+        The returned value reports internal storage tracked by the selected implementation, including internal
+        temporary buffers, transformed weights, copied bias and copied activation parameters.
 
         \param [in] context - a pointer to BF16 convolution context. It must be created by function ::SimdSynetConvolution16bInit and released by function ::SimdRelease.
-        \return size of internal buffer used inside BF16 convolution algorithm.
+        \return a number of bytes used by internal buffers.
     */
     SIMD_API size_t SimdSynetConvolution16bInternalBufferSize(const void* context);
 
@@ -7744,23 +7764,34 @@ extern "C"
 
         \fn const char* SimdSynetConvolution16bInfo(const void* context);
 
-        \short Gets description of internal implementation of BF16 convolution algorithm.
+        \short Gets a short description of the selected BF16 convolution implementation.
+
+        The returned string contains the implementation extension and algorithm name, for example a GEMM, NCHW/NHWC
+        GEMM, NHWC depthwise, NHWC special or AMX-BF16 variant. The returned pointer is owned by the context and
+        remains valid until the next call of this function for the same context or until the context is released.
 
         \param [in] context - a pointer to BF16 convolution context. It must be created by function ::SimdSynetConvolution16bInit and released by function ::SimdRelease.
-        \return string with description of internal implementation of BF16 convolution algorithm.
+        \return a string with a description of the internal implementation of the BF16 convolution algorithm.
     */
     SIMD_API const char* SimdSynetConvolution16bInfo(const void* context);
 
     /*! @ingroup synet_convolution_bf16
 
-        \fn void SimdSynetConvolution16bSetParams(void * context, const float * weight, const float * bias, const float * params, const float * const * stats);
+        \fn void SimdSynetConvolution16bSetParams(void * context, const float * weight, const float * bias, const float * params);
+
+        \short Sets weights, bias and activation parameters for BF16 convolution.
 
-        \short Sets weights, biases, parameters of activation function, input/output tensor statistics required for BF16 convolution algorithm.
+        This function must be called before ::SimdSynetConvolution16bForward. The \a weight array contains FP32
+        convolution weights with kernelY*kernelX*srcC*dstC/group elements. The selected implementation transforms
+        weights to its internal representation (usually BF16 and reordered; some depthwise paths keep FP32 weights).
+        Bias is copied to an internal FP32 array; when \a bias is NULL, zeros are used. Activation parameters are
+        copied or expanded to the internal FP32 array according to ::SimdConvolutionActivationType.
 
         \param [in, out] context - a pointer to BF16 convolution context. It must be created by function ::SimdSynetConvolution16bInit and released by function ::SimdRelease.
-        \param [in] weight - a pointer to original (32-bit float point) convolution weights.
-        \param [in] bias - a pointer to original (32-bit float point) bias. Can be NULL.
-        \param [in] params - a pointer to original (32-bit float point) parameters of activation functions (see ::SimdConvolutionActivationType). Can be NULL.
+        \param [in] weight - a pointer to FP32 convolution weights.
+        \param [in] bias - a pointer to FP32 bias array with dstC elements. Can be NULL.
+        \param [in] params - a pointer to FP32 parameters of activation function (see ::SimdConvolutionActivationType).
+            Can be NULL when activation does not require parameters.
     */
     SIMD_API void SimdSynetConvolution16bSetParams(void* context, const float* weight, const float* bias, const float* params);
 
@@ -7768,12 +7799,29 @@ extern "C"
 
         \fn void SimdSynetConvolution16bForward(void * context, const uint8_t * src, uint8_t * buf, uint8_t * dst);
 
-        \short Performs forward propagation of BF16 convolution algorithm.
+        \short Performs forward propagation of BF16/FP32 convolution.
+
+        The function converts FP32 input to BF16 when the context source type is FP32, uses BF16 input directly when
+        the source type is BF16, accumulates convolution sums in FP32, adds bias, applies activation and writes FP32
+        or BF16 output according to the context destination type:
+        \verbatim
+        sum = bias[dc];
+        for(sc = 0; sc < srcC/group; ++sc)
+            for(ky = 0; ky < kernelY; ++ky)
+                for(kx = 0; kx < kernelX; ++kx)
+                    sum += inputValue * weightValue;
+        value = Activate(sum, activation, params);
+        dst[outputOffset] = dstT == SimdTensorData16b ? Float32ToBFloat16(value) : value;
+        \endverbatim
+        The input value is read as BF16 or converted from FP32 to BF16 according to srcT. The weight value comes from
+        the internal representation prepared by ::SimdSynetConvolution16bSetParams.
+        The exact offsets depend on tensor format, padding, dilation, stride and group. The input and output tensors
+        use the shape, data types and format from the context created by ::SimdSynetConvolution16bInit.
 
         \param [in] context - a pointer to BF16 convolution context. It must be created by function ::SimdSynetConvolution16bInit and released by function ::SimdRelease.
-        \param [in] src - a pointer to input tensor.
-        \param [out] buf - a pointer to external temporary buffer. The size of the external temporary buffer is determined by function ::SimdSynetConvolution16bExternalBufferSize. Can be NULL (it causes usage of internal buffer).
-        \param [out] dst - a pointer to output tensor.
+        \param [in] src - a pointer to input tensor. Actual element type is defined by srcT in convolution parameters.
+        \param [out] buf - a pointer to external temporary byte buffer. The required size is determined by function ::SimdSynetConvolution16bExternalBufferSize. Can be NULL (it causes usage of internal buffer).
+        \param [out] dst - a pointer to output tensor. Actual element type is defined by dstT in convolution parameters.
     */
     SIMD_API void SimdSynetConvolution16bForward(void* context, const uint8_t* src, uint8_t* buf, uint8_t* dst);
 

From 2e27184cdd30c8c49f950068049340cc47d234e8 Mon Sep 17 00:00:00 2001
From: Cursor Agent <cursoragent@cursor.com>
Date: Fri, 5 Jun 2026 09:12:57 +0000
Subject: [PATCH 4/5] Improve INT8 convolution API documentation

Co-authored-by: Ihar Yermalayeu <ermig@tut.by>
---
 src/Simd/SimdLib.h | 89 ++++++++++++++++++++++++++++++++++++----------
 1 file changed, 71 insertions(+), 18 deletions(-)

diff --git a/src/Simd/SimdLib.h b/src/Simd/SimdLib.h
index 1de9cb5d2b..7a22a5baf5 100644
--- a/src/Simd/SimdLib.h
+++ b/src/Simd/SimdLib.h
@@ -7829,11 +7829,26 @@ extern "C"
 
         \fn void * SimdSynetConvolution8iInit(size_t batch, const SimdConvolutionParameters * conv, SimdSynetCompatibilityType compatibility);
 
-        \short Initializes INT8 convolution algorithm.
+        \short Initializes an INT8 convolution context.
+
+        The function validates convolution parameters and chooses a suitable implementation (GEMM, NHWC direct,
+        NHWC depthwise or architecture-specific VNNI/AMX/NEON variant when available). It supports FP32 or UINT8
+        source and destination tensors with matching NCHW or NHWC format. The destination spatial size must match
+        convolution parameters:
+        \verbatim
+        dstH = (srcH + padY + padH - (dilationY*(kernelY - 1) + 1)) / strideY + 1
+        dstW = (srcW + padX + padW - (dilationX*(kernelX - 1) + 1)) / strideX + 1
+        \endverbatim
+
+        A created context stores tensor shape, data types, format, convolution geometry, group count, activation type
+        and compatibility flags. FP32 weights, bias, activation parameters and tensor statistics are attached later by
+        ::SimdSynetConvolution8iSetParams.
 
         \param [in] batch - a batch size.
-        \param [in] conv - a pointer to convolution parameters.
-        \param [in] compatibility - a flags of calculation compatibility.
+        \param [in] conv - a pointer to convolution parameters. Source and destination tensor types must be FP32 or UINT8.
+        \param [in] compatibility - calculation compatibility flags. They select precise, overflow or narrowed INT8
+            calculation mode. Narrowed mode uses unsigned range [0, 180] and signed range [-90, 90]; otherwise
+            ranges are [0, 255] and [-128, 127].
         \return a pointer to INT8 convolution context. On error it returns NULL. It must be released with using of function ::SimdRelease.
             This pointer is used in functions ::SimdSynetConvolution8iExternalBufferSize, ::SimdSynetConvolution8iInternalBufferSize, 
             ::SimdSynetConvolution8iInfo, ::SimdSynetConvolution8iSetParams and ::SimdSynetConvolution8iForward.
@@ -7844,10 +7859,14 @@ extern "C"
 
         \fn size_t SimdSynetConvolution8iExternalBufferSize(const void * context);
 
-        \short Gets size in bytes of external temporary buffer required for INT8 convolution algorithm.
+        \short Gets the size in bytes of the caller-provided temporary buffer for INT8 convolution.
+
+        The returned value is a number of bytes. It depends on the implementation selected during initialization and
+        can be used to allocate the \a buf argument of ::SimdSynetConvolution8iForward. The buffer can contain temporary
+        UINT8 source conversion data, im2col/padded input data, INT32 sums and temporary FP32 output data.
 
         \param [in] context - a pointer to INT8 convolution context. It must be created by function ::SimdSynetConvolution8iInit and released by function ::SimdRelease.
-        \return size of external temporary buffer required for INT8 convolution algorithm.
+        \return the number of bytes required for the external temporary buffer.
     */
     SIMD_API size_t SimdSynetConvolution8iExternalBufferSize(const void * context);
 
@@ -7855,10 +7874,14 @@ extern "C"
 
         \fn size_t SimdSynetConvolution8iInternalBufferSize(const void * context);
 
-        \short Gets size of internal buffer used inside INT8 convolution algorithm.
+        \short Gets the size in bytes of internal storage used by an INT8 convolution context.
+
+        The returned value reports internal storage tracked by the selected implementation, including internal
+        temporary buffers, quantized/reordered INT8 weights, source and destination conversion parameters,
+        normalization, bias and activation parameters.
 
         \param [in] context - a pointer to INT8 convolution context. It must be created by function ::SimdSynetConvolution8iInit and released by function ::SimdRelease.
-        \return size of internal buffer used inside INT8 convolution algorithm.
+        \return the number of bytes used by internal buffers.
     */
     SIMD_API size_t SimdSynetConvolution8iInternalBufferSize(const void * context);
 
@@ -7866,10 +7889,15 @@ extern "C"
 
         \fn const char* SimdSynetConvolution8iInfo(const void* context);
 
-        \short Gets description of internal implementation of INT8 convolution algorithm.
+        \short Gets a short description of the selected INT8 convolution implementation.
+
+        The returned string contains the implementation extension and algorithm name, for example a GEMM, NHWC direct
+        or NHWC depthwise variant, with a suffix for precise, overflow or narrowed mode when applicable. The returned
+        pointer is owned by the context and remains valid until the next call of this function for the same context or
+        until the context is released.
 
         \param [in] context - a pointer to INT8 convolution context. It must be created by function ::SimdSynetConvolution8iInit and released by function ::SimdRelease.
-        \return string with description of internal implementation of INT8 convolution algorithm.
+        \return a string with a description of the internal implementation of the INT8 convolution algorithm.
     */
     SIMD_API const char* SimdSynetConvolution8iInfo(const void* context);
 
@@ -7877,13 +7905,23 @@ extern "C"
 
         \fn void SimdSynetConvolution8iSetParams(void * context, const float * weight, const float * bias, const float * params, const float * const * stats);
 
-        \short Sets weights, biases, parameters of activation function, input/output tensor statistics required for INT8 convolution algorithm.
+        \short Sets weights, bias, activation parameters and tensor statistics for INT8 convolution.
+
+        This function must be called before ::SimdSynetConvolution8iForward. The \a weight array contains FP32
+        convolution weights with kernelY*kernelX*srcC*dstC/group elements. Source statistics (\a stats[0],
+        \a stats[1], each with srcC elements) define per-channel source quantization parameters; destination statistics
+        (\a stats[2], \a stats[3], each with dstC elements) define per-channel output quantization parameters. The
+        selected implementation converts weights to INT8, may reorder them, and computes per-output-channel normalization
+        and bias terms used to convert INT32 sums back to FP32. Activation parameters are copied or expanded internally
+        according to ::SimdConvolutionActivationType.
 
         \param [in, out] context - a pointer to INT8 convolution context. It must be created by function ::SimdSynetConvolution8iInit and released by function ::SimdRelease.
-        \param [in] weight - a pointer to original (32-bit float point) convolution weights.
-        \param [in] bias - a pointer to original (32-bit float point) bias. Can be NULL.
-        \param [in] params - a pointer to original (32-bit float point) parameters of activation functions (see ::SimdConvolutionActivationType). Can be NULL.
-        \param [in] stats - a pointer to pointers with statistics of input(min - stats[0], max - stats[1]) and output(min - stats[2], max - stats[3]) tensors.
+        \param [in] weight - a pointer to FP32 convolution weights.
+        \param [in] bias - a pointer to FP32 bias array with dstC elements. Can be NULL.
+        \param [in] params - a pointer to FP32 parameters of activation function (see ::SimdConvolutionActivationType).
+            Can be NULL when activation does not require parameters.
+        \param [in] stats - a pointer to pointers with per-channel tensor statistics:
+            source minimum stats[0], source maximum stats[1], destination minimum stats[2], destination maximum stats[3].
     */
     SIMD_API void SimdSynetConvolution8iSetParams(void * context, const float * weight, const float * bias, const float * params, const float * const* stats);
 
@@ -7891,12 +7929,27 @@ extern "C"
 
         \fn void SimdSynetConvolution8iForward(void * context, const uint8_t * src, uint8_t * buf, uint8_t * dst);
 
-        \short Performs forward propagation of INT8 convolution algorithm.
+        \short Performs forward propagation of INT8 convolution.
+
+        The function converts FP32 input to UINT8 when the context source type is FP32, uses UINT8 input directly when
+        the source type is UINT8, accumulates convolution sums in INT32 with INT8 weights, converts sums to FP32 using
+        internal normalization and bias, applies activation, and writes FP32 or UINT8 output according to the context
+        destination type:
+        \verbatim
+        if(srcT == SimdTensorData32f)
+            src8u = restrict(round(src32f*srcScale[c] + srcShift[c]), srcLower, srcUpper);
+        sum = convolution_int32(src8u, weight8i, zero);
+        value = Activate(sum*norm[dc] + bias[dc], activation, params);
+        dst[outputOffset] = dstT == SimdTensorData8u ?
+            restrict(round(value*dstScale[dc] + dstShift[dc]), dstLower, dstUpper) : value;
+        \endverbatim
+        The exact offsets depend on tensor format, padding, dilation, stride and group. The input and output tensors
+        use the shape, data types and format from the context created by ::SimdSynetConvolution8iInit.
 
         \param [in] context - a pointer to INT8 convolution context. It must be created by function ::SimdSynetConvolution8iInit and released by function ::SimdRelease.
-        \param [in] src - a pointer to input tensor.
-        \param [out] buf - a pointer to external temporary buffer. The size of the external temporary buffer is determined by function ::SimdSynetConvolution8iExternalBufferSize. Can be NULL (it causes usage of internal buffer).
-        \param [out] dst - a pointer to output tensor.
+        \param [in] src - a pointer to input tensor. Actual element type is defined by srcT in convolution parameters.
+        \param [out] buf - a pointer to external temporary byte buffer. The required size is determined by function ::SimdSynetConvolution8iExternalBufferSize. Can be NULL (it causes usage of internal buffer).
+        \param [out] dst - a pointer to output tensor. Actual element type is defined by dstT in convolution parameters.
     */
     SIMD_API void SimdSynetConvolution8iForward(void * context, const uint8_t * src, uint8_t * buf, uint8_t * dst);
 

From d352473701476c878947ad9ffb2ff23246bc5151 Mon Sep 17 00:00:00 2001
From: Cursor Agent <cursoragent@cursor.com>
Date: Fri, 5 Jun 2026 09:18:10 +0000
Subject: [PATCH 5/5] Improve FP32 deconvolution API documentation

Co-authored-by: Ihar Yermalayeu <ermig@tut.by>
---
 src/Simd/SimdLib.h | 85 ++++++++++++++++++++++++++++++++++++----------
 1 file changed, 67 insertions(+), 18 deletions(-)

diff --git a/src/Simd/SimdLib.h b/src/Simd/SimdLib.h
index 7a22a5baf5..002ff5a239 100644
--- a/src/Simd/SimdLib.h
+++ b/src/Simd/SimdLib.h
@@ -7957,11 +7957,23 @@ extern "C"
 
         \fn void * SimdSynetDeconvolution32fInit(size_t batch, const SimdConvolutionParameters * conv, SimdSynetCompatibilityType compatibility);
 
-        \short Initializes FP32 deconvolution algorithm.
+        \short Initializes an FP32 deconvolution context.
+
+        The function validates deconvolution parameters and chooses a suitable implementation (GEMM-based or
+        NHWC direct 2x2 when available). It supports FP32 source and destination tensors with matching NCHW format,
+        or matching NHWC format when group is 1. The destination spatial size must match deconvolution parameters:
+        \verbatim
+        dstH = strideY*(srcH - 1) + dilationY*(kernelY - 1) + 1 - padY - padH
+        dstW = strideX*(srcW - 1) + dilationX*(kernelX - 1) + 1 - padX - padW
+        \endverbatim
+
+        A created context stores tensor shape, format, deconvolution geometry, group count, activation type and
+        compatibility flags. Weights, bias and activation parameters are attached later by
+        ::SimdSynetDeconvolution32fSetParams.
 
         \param [in] batch - a batch size.
-        \param [in] conv - a pointer to deconvolution parameters.
-        \param [in] compatibility - a flags of calculation compatibility.
+        \param [in] conv - a pointer to deconvolution parameters. Source and destination tensor types must be FP32.
+        \param [in] compatibility - calculation compatibility flags.
         \return a pointer to FP32 deconvolution context. On error it returns NULL. It must be released with using of function ::SimdRelease.
             This pointer is used in functions ::SimdSynetDeconvolution32fExternalBufferSize, ::SimdSynetDeconvolution32fInternalBufferSize, 
             ::SimdSynetDeconvolution32fInfo, ::SimdSynetDeconvolution32fSetParams and ::SimdSynetDeconvolution32fForward.
@@ -7972,10 +7984,14 @@ extern "C"
 
         \fn size_t SimdSynetDeconvolution32fExternalBufferSize(const void * context);
 
-        \short Gets size of external temporary buffer required for FP32 deconvolution algorithm.
+        \short Gets the size (in FP32 elements) of the caller-provided temporary buffer for FP32 deconvolution.
+
+        The returned value is a number of 32-bit float elements, not bytes. It depends on the implementation selected
+        during initialization and can be used to allocate the \a buf argument of ::SimdSynetDeconvolution32fForward.
+        Some implementations return 1 when they do not need external temporary storage.
 
         \param [in] context - a pointer to FP32 deconvolution context. It must be created by function ::SimdSynetDeconvolution32fInit and released by function ::SimdRelease.
-        \return size of external temporary buffer required for FP32 deconvolution algorithm.
+        \return the number of FP32 elements required for the external temporary buffer.
     */
     SIMD_API size_t SimdSynetDeconvolution32fExternalBufferSize(const void * context);
 
@@ -7983,10 +7999,14 @@ extern "C"
 
         \fn size_t SimdSynetDeconvolution32fInternalBufferSize(const void * context);
 
-        \short Gets size of internal buffer used inside FP32 deconvolution algorithm.
+        \short Gets the size (in FP32 elements) of internal storage used by an FP32 deconvolution context.
+
+        The returned value is a number of 32-bit float elements, not bytes. It reports internal storage tracked by
+        the selected implementation, such as internal temporary buffers and implementation-specific reordered weights,
+        bias or activation parameters already allocated by the context.
 
         \param [in] context - a pointer to FP32 deconvolution context. It must be created by function ::SimdSynetDeconvolution32fInit and released by function ::SimdRelease.
-        \return size of internal buffer used inside FP32 deconvolution algorithm.
+        \return the number of FP32 elements used by internal buffers.
     */
     SIMD_API size_t SimdSynetDeconvolution32fInternalBufferSize(const void * context);
 
@@ -7994,10 +8014,14 @@ extern "C"
 
         \fn const char* SimdSynetDeconvolution32fInfo(const void* context);
 
-        \short Gets description of internal implementation of FP32 deconvolution algorithm.
+        \short Gets a short description of the selected FP32 deconvolution implementation.
+
+        The returned string contains the implementation extension and algorithm name, for example a GEMM-based or
+        NHWC direct 2x2 variant. The returned pointer is owned by the context and remains valid until the next call
+        of this function for the same context or until the context is released.
 
         \param [in] context - a pointer to FP32 deconvolution context. It must be created by function ::SimdSynetDeconvolution32fInit and released by function ::SimdRelease.
-        \return string with description of internal implementation of FP32 deconvolution algorithm.
+        \return a string with a description of the internal implementation of the FP32 deconvolution algorithm.
     */
     SIMD_API const char* SimdSynetDeconvolution32fInfo(const void* context);
 
@@ -8005,13 +8029,22 @@ extern "C"
 
         \fn void SimdSynetDeconvolution32fSetParams(void * context, const float * weight, SimdBool * internal, const float * bias, const float * params);
 
-        \short Sets weights, biases and parameters of activation function required for FP32 deconvolution algorithm.
+        \short Sets weights, bias and activation parameters for FP32 deconvolution.
+
+        This function must be called before ::SimdSynetDeconvolution32fForward. The \a weight array contains FP32
+        deconvolution weights with kernelY*kernelX*srcC*dstC/group elements. Depending on the selected implementation,
+        weights can be used directly or transformed and stored inside the context. If \a internal is not NULL, the
+        selected implementation writes the weight storage mode to it: SimdTrue means that weights were transformed and
+        stored internally, while SimdFalse means that the implementation may use the original \a weight array directly,
+        so the caller must keep it valid for later forward calls. Bias and activation parameters can also be copied
+        internally by some implementations; otherwise their pointers are stored in the context, so the caller must keep them valid as well.
 
         \param [in, out] context - a pointer to FP32 deconvolution context. It must be created by function ::SimdSynetDeconvolution32fInit and released by function ::SimdRelease.
-        \param [in] weight - a pointer to deconvolution weights.
-        \param [out] internal - a flag signalizing that weight is stored in the internal buffer. Can be NULL.
-        \param [in] bias - a pointer to bias. Can be NULL.
-        \param [in] params - a pointer to parameters of activation functions (see ::SimdConvolutionActivationType). Can be NULL.
+        \param [in] weight - a pointer to FP32 deconvolution weights.
+        \param [out] internal - a pointer to a flag receiving weight storage mode (SimdTrue - weights are stored internally, SimdFalse - the original array may be used directly). Can be NULL.
+        \param [in] bias - a pointer to FP32 bias array with dstC elements. Can be NULL.
+        \param [in] params - a pointer to FP32 parameters of activation function (see ::SimdConvolutionActivationType).
+            Can be NULL when activation does not require parameters.
     */
     SIMD_API void SimdSynetDeconvolution32fSetParams(void * context, const float * weight, SimdBool * internal, const float * bias, const float * params);
 
@@ -8019,12 +8052,28 @@ extern "C"
 
         \fn void SimdSynetDeconvolution32fForward(void * context, const float * src, float * buf, float * dst);
 
-        \short Performs forward propagation of FP32 deconvolution algorithm.
+        \short Performs forward propagation of FP32 deconvolution.
+
+        The function applies transposed convolution to each image in the batch, adds bias when it was set, and applies
+        the activation specified in ::SimdConvolutionParameters:
+        \verbatim
+        dst[:] = 0;
+        for(sc = 0; sc < srcC/group; ++sc)
+            for(sy = 0; sy < srcH; ++sy)
+                for(sx = 0; sx < srcW; ++sx)
+                    for(ky = 0; ky < kernelY; ++ky)
+                        for(kx = 0; kx < kernelX; ++kx)
+                            dst[outputOffset] += src[inputOffset] * weight[weightOffset];
+        dst[outputOffset] = Activate(dst[outputOffset] + bias[dc], activation, params);
+        \endverbatim
+        The exact offsets depend on tensor format, padding, dilation, stride and group. The input and output tensors
+        use the shape and format from the context created by ::SimdSynetDeconvolution32fInit.
 
         \param [in] context - a pointer to FP32 deconvolution context. It must be created by function ::SimdSynetDeconvolution32fInit and released by function ::SimdRelease.
-        \param [in] src - a pointer to input tensor.
-        \param [out] buf - a pointer to external temporary buffer. The size of the external temporary buffer is determined by function ::SimdSynetDeconvolution32fExternalBufferSize. Can be NULL (it causes usage of internal buffer).
-        \param [out] dst - a pointer to output tensor.
+        \param [in] src - a pointer to FP32 input tensor.
+        \param [out] buf - a pointer to external temporary FP32 buffer. The required number of elements is determined by
+            function ::SimdSynetDeconvolution32fExternalBufferSize. Can be NULL (it causes usage of internal buffer).
+        \param [out] dst - a pointer to FP32 output tensor.
     */
     SIMD_API void SimdSynetDeconvolution32fForward(void * context, const float * src, float * buf, float * dst);