diff --git a/src/Simd/SimdLib.h b/src/Simd/SimdLib.h
index 336f11c788..002ff5a239 100644
--- a/src/Simd/SimdLib.h
+++ b/src/Simd/SimdLib.h
@@ -7364,15 +7364,29 @@ extern "C"
\fn void* SimdSynetAdd16bInit(const size_t* aShape, size_t aCount, SimdTensorDataType aType, const size_t* bShape, size_t bCount, SimdTensorDataType bType, SimdTensorDataType dstType, SimdTensorFormatType format);
- \short Initializes add algorithm.
+ \short Initializes element-wise addition of two tensors in FP32 or BF16 format.
+
+ The created context adds two tensors with equal shapes:
+ \verbatim
+ for(i = 0; i < shapeSize; ++i)
+ {
+ A = aType == SimdTensorData16b ? BFloat16ToFloat32(a[i]) : a[i];
+ B = bType == SimdTensorData16b ? BFloat16ToFloat32(b[i]) : b[i];
+ D = A + B;
+ dst[i] = dstType == SimdTensorData16b ? Float32ToBFloat16(D) : D;
+ }
+ \endverbatim
+
+ The current implementation creates a context only for equal input shapes, FP32/BF16 input and output tensor types,
+ and SimdTensorFormatUnknown, SimdTensorFormatNchw or SimdTensorFormatNhwc tensor format.
\param [in] aShape - a pointer to shape of input A tensor.
\param [in] aCount - a count of dimensions of input A tensor.
- \param [in] aType - a type of input A tensor. Can be FP32 of BF16.
+ \param [in] aType - a type of input A tensor. Can be FP32 or BF16.
\param [in] bShape - a pointer to shape of input B tensor.
\param [in] bCount - a count of dimensions of input B tensor.
- \param [in] bType - a type of input B tensor. Can be FP32 of BF16.
- \param [in] dstType - a type of output tensor. Can be FP32 of BF16.
+ \param [in] bType - a type of input B tensor. Can be FP32 or BF16.
+ \param [in] dstType - a type of output tensor. Can be FP32 or BF16.
\param [in] format - a format of input / output tensors.
\return a pointer to add context. On error it returns NULL. It must be released with using of function ::SimdRelease.
This pointer is used in function ::SimdSynetAdd16bForward.
@@ -7383,7 +7397,11 @@ extern "C"
\fn void SimdSynetAdd16bForward(void* context, const uint8_t* a, const uint8_t* b, uint8_t* dst);
- \short Performs forward propagation of add algorithm.
+ \short Performs element-wise addition of two FP32/BF16 tensors.
+
+ The function adds corresponding elements of input tensors A and B using a context created by ::SimdSynetAdd16bInit.
+ The actual data types, tensor shape and output type are stored in the context. BF16 input values are converted to
+ FP32 before addition, and BF16 output values are converted from FP32 after addition.
\param [in] context - a pointer to add context. It must be created by function ::SimdSynetAdd16bInit and released by function ::SimdRelease.
\param [in] a - a pointer to input A tensor.
@@ -7396,22 +7414,28 @@ extern "C"
\fn void SimdSynetAddBias(const float * bias, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format);
- \short Adds a bias to given vector.
+ \short Adds per-channel bias to an FP32 tensor in place.
Algorithm's details (example for NCHW tensor format):
\verbatim
for(c = 0; c < channels; ++c)
- for(j = 0; j < spatial; ++j)
+ for(s = 0; s < spatial; ++s)
dst[c*spatial + s] += bias[c];
\endverbatim
+ Algorithm's details (example for NHWC tensor format):
+ \verbatim
+ for(s = 0; s < spatial; ++s)
+ for(c = 0; c < channels; ++c)
+ dst[s*channels + c] += bias[c];
+ \endverbatim
\note This function is used in Synet Framework.
\param [in] bias - a pointer to the 32-bit float array with bias coefficients. The size of the array is equal to channels.
- \param [in] channels - a number of channels in the image tensor.
- \param [in] spatial - a spatial size of image tensor.
- \param [in, out] dst - a pointer to cumulative 32-bit image tensor. The size of the array is equal to channels * spatial.
- \param [in] format - a format of image tensor.
+ \param [in] channels - a number of channels in the tensor.
+ \param [in] spatial - a spatial size (height * width) of the tensor.
+ \param [in, out] dst - a pointer to FP32 tensor updated in place. The size of the array is equal to channels * spatial.
+ \param [in] format - a format of the tensor.
*/
SIMD_API void SimdSynetAddBias(const float * bias, size_t channels, size_t spatial, float * dst, SimdTensorFormatType format);
@@ -7419,37 +7443,40 @@ extern "C"
\fn void SimdSynetAdd8i(const uint8_t * aData, const float * aScale, const float* aShift, const uint8_t* bData, const float* bScale, const float* bShift, uint8_t* cData, const float* cScale, const float* cShift, size_t batch, size_t channels, size_t spatial, SimdTensorFormatType format, SimdSynetCompatibilityType compatibility);
- \short Adds two INT8 tensors.
+ \short Dequantizes, adds and requantizes two UINT8 tensors.
- Algorithm's details (example for NCHW tensor format):
+ Algorithm's details (example for NCHW tensor format):
\verbatim
+ upper = isNarrowed(compatibility) ? 180 : 255;
for(b = 0; b < batch; ++b)
for(c = 0; c < channels; ++c)
for(s = 0; s < spatial; ++s)
{
offs = (b*channels + c)*spatial + s;
- A = aData[offs]*aScale[c] + aShift[c];
+ A = aData[offs]*aScale[c] + aShift[c];
B = bData[offs]*bScale[c] + bShift[c];
- cData[offs] = round((A + B)*cScale[c] + cShift[c]);
+ C = round((A + B)*cScale[c] + cShift[c]);
+ cData[offs] = restrict(C, 0, upper);
}
\endverbatim
+ For NHWC tensor format the same calculation uses offset (b*spatial + s)*channels + c.
\note This function is used in Synet Framework.
- \param [in] aData - a pointer to the first input 8-bit integer tensor.
- \param [in] aScale - a pointer to the 32-bit float array with scale coefficients of the first input tensor.
- \param [in] aShift - a pointer to the 32-bit float array with shift coefficients of the first input tensor.
- \param [in] bData - a pointer to the second input 8-bit integer tensor.
- \param [in] bScale - a pointer to the 32-bit float array with scale coefficients of the second input tensor.
- \param [in] bShift - a pointer to the 32-bit float array with shift coefficients of the second input tensor.
- \param [out] cData - a pointer to the output 8-bit integer tensor.
- \param [in] cScale - a pointer to the 32-bit float array with scale coefficients of the output tensor.
- \param [in] cShift - a pointer to the 32-bit float array with shift coefficients of the output tensor.
- \param [in] batch - a batch size of input and output image tensors.
- \param [in] channels - a number of channels in input and output image tensors.
- \param [in] spatial - a spatial size of input and output image tensors.
- \param [in] format - a format of input and output image tensors.
- \param [in] compatibility - a flags of calculation compatibility.
+ \param [in] aData - a pointer to the first input UINT8 tensor.
+ \param [in] aScale - a pointer to the 32-bit float array with per-channel scale coefficients of the first input tensor.
+ \param [in] aShift - a pointer to the 32-bit float array with per-channel shift coefficients of the first input tensor.
+ \param [in] bData - a pointer to the second input UINT8 tensor.
+ \param [in] bScale - a pointer to the 32-bit float array with per-channel scale coefficients of the second input tensor.
+ \param [in] bShift - a pointer to the 32-bit float array with per-channel shift coefficients of the second input tensor.
+ \param [out] cData - a pointer to the output UINT8 tensor.
+ \param [in] cScale - a pointer to the 32-bit float array with per-channel scale coefficients of the output tensor.
+ \param [in] cShift - a pointer to the 32-bit float array with per-channel shift coefficients of the output tensor.
+ \param [in] batch - a batch size of input and output tensors.
+ \param [in] channels - a number of channels in input and output tensors.
+ \param [in] spatial - a spatial size (height * width) of input and output tensors.
+ \param [in] format - a format of input and output tensors. Can be NCHW or NHWC.
+ \param [in] compatibility - calculation compatibility flags. When narrowed 8-bit mode is active, output is limited to [0, 180], otherwise to [0, 255].
*/
SIMD_API void SimdSynetAdd8i(const uint8_t * aData, const float * aScale, const float* aShift, const uint8_t* bData, const float* bScale, const float* bShift,
uint8_t* cData, const float* cScale, const float* cShift, size_t batch, size_t channels, size_t spatial, SimdTensorFormatType format, SimdSynetCompatibilityType compatibility);
@@ -7458,20 +7485,30 @@ extern "C"
\fn void SimdSynetChannelSum16b(const uint16_t* src, size_t channels, size_t spatial, SimdTensorFormatType format, float* sum);
- \short Calculates channels sums in FP32 format for input tensor in BF16 format.
+ \short Calculates per-channel sums of a BF16 tensor in FP32 format.
- Algorithm's details (example for NCHW tensor format) :
+ Algorithm's details (example for NCHW tensor format):
\verbatim
for(c = 0; c < channels; ++c)
+ {
sum[c] = 0;
for(s = 0; s < spatial; ++s)
- sum[c] += src[c, s];
+ sum[c] += BFloat16ToFloat32(src[c*spatial + s]);
+ }
+ \endverbatim
+ Algorithm's details (example for NHWC tensor format):
+ \verbatim
+ for(c = 0; c < channels; ++c)
+ sum[c] = 0;
+ for(s = 0; s < spatial; ++s)
+ for(c = 0; c < channels; ++c)
+ sum[c] += BFloat16ToFloat32(src[s*channels + c]);
\endverbatim
\note This function is used in Synet Framework.
- \param [in] src - a pointer to the input 16-bit brain-float tensor.
- \param [in] channels - a number of channels in input and output arrays.
+ \param [in] src - a pointer to the input BF16 tensor.
+ \param [in] channels - a number of channels in input tensor.
\param [in] spatial - a spatial (width * height) size of input tensor.
\param [in] format - a format of input tensor.
\param [out] sum - a pointer to output 32-bit float array with channels sums.
@@ -7482,20 +7519,34 @@ extern "C"
\fn void SimdSynetConvert32fTo8u(const float * src, size_t batch, size_t channels, size_t height, size_t width, SimdTensorFormatType format, const float* scale, const float * shift, uint8_t * dst, SimdSynetCompatibilityType compatibility);
- \short Converts 32-bit float point image to 8-bit unsigned integer image.
+ \short Converts an FP32 tensor to a UINT8 tensor using per-channel scale and shift.
+
+ Algorithm's details (example for NCHW tensor format):
+ \verbatim
+ upper = isNarrowed(compatibility) ? 180 : 255;
+ for(b = 0; b < batch; ++b)
+ for(c = 0; c < channels; ++c)
+ for(h = 0; h < height; ++h)
+ for(w = 0; w < width; ++w)
+ {
+ offs = ((b*channels + c)*height + h)*width + w;
+ dst[offs] = restrict(round(src[offs]*scale[c] + shift[c]), 0, upper);
+ }
+ \endverbatim
+ For NHWC tensor format the same calculation uses offset ((b*height + h)*width + w)*channels + c.
\note This function is used in Synet Framework.
- \param [in] src - a pointer to the 32-bit float array with input image tensor.
- \param [in] batch - a number of images in the batch of (input/output) image tensor.
- \param [in] channels - a number of channels in the (input/output) image tensor.
- \param [in] height - a height of (input/output) image tensor.
- \param [in] width - a width of (input/output) image tensor.
- \param [in] format - a format of (input/output) image tensor.
- \param [in] scale - a pointer to the 32-bit float array with scale coefficients.
- \param [in] shift - a pointer to the 32-bit float array with shift coefficients.
- \param [out] dst - a pointer to the 8-bit unsigned integer array with output image tensor.
- \param [in] compatibility - a flags of calculation compatibility.
+ \param [in] src - a pointer to the FP32 input tensor.
+ \param [in] batch - a batch size of input and output tensors.
+ \param [in] channels - a number of channels in input and output tensors.
+ \param [in] height - a height of input and output tensors.
+ \param [in] width - a width of input and output tensors.
+ \param [in] format - a format of input and output tensors. Can be NCHW or NHWC.
+ \param [in] scale - a pointer to the 32-bit float array with per-channel scale coefficients.
+ \param [in] shift - a pointer to the 32-bit float array with per-channel shift coefficients.
+ \param [out] dst - a pointer to the UINT8 output tensor.
+ \param [in] compatibility - calculation compatibility flags. When narrowed 8-bit mode is active, output is limited to [0, 180], otherwise to [0, 255].
*/
SIMD_API void SimdSynetConvert32fTo8u(const float * src, size_t batch, size_t channels, size_t height, size_t width, SimdTensorFormatType format, const float* scale, const float * shift, uint8_t* dst, SimdSynetCompatibilityType compatibility);
@@ -7503,20 +7554,33 @@ extern "C"
\fn void SimdSynetConvert8uTo32f(const uint8_t* src, size_t batch, size_t channels, size_t height, size_t width, SimdTensorFormatType format, const float* scale, const float* shift, float* dst, SimdSynetCompatibilityType compatibility);
- \short Converts 8-bit unsigned integer image to 32-bit float point image.
+ \short Converts a UINT8 tensor to an FP32 tensor using per-channel scale and shift.
+
+ Algorithm's details (example for NCHW tensor format):
+ \verbatim
+ for(b = 0; b < batch; ++b)
+ for(c = 0; c < channels; ++c)
+ for(h = 0; h < height; ++h)
+ for(w = 0; w < width; ++w)
+ {
+ offs = ((b*channels + c)*height + h)*width + w;
+ dst[offs] = src[offs]*scale[c] + shift[c];
+ }
+ \endverbatim
+ For NHWC tensor format the same calculation uses offset ((b*height + h)*width + w)*channels + c.
\note This function is used in Synet Framework.
- \param [in] src - a pointer to the 8-bit unsigned integer array with input image tensor.
- \param [in] batch - a number of images in the batch of (input/output) image tensor.
- \param [in] channels - a number of channels in the (input/output) image tensor.
- \param [in] height - a height of (input/output) image tensor.
- \param [in] width - a width of (input/output) image tensor.
- \param [in] format - a format of (input/output) image tensor.
- \param [in] scale - a pointer to the 32-bit float array with scale coefficients.
- \param [in] shift - a pointer to the 32-bit float array with shift coefficients.
- \param [out] dst - a pointer to the array with 32-bit float output image tensor.
- \param [in] compatibility - a flags of calculation compatibility.
+ \param [in] src - a pointer to the UINT8 input tensor.
+ \param [in] batch - a batch size of input and output tensors.
+ \param [in] channels - a number of channels in input and output tensors.
+ \param [in] height - a height of input and output tensors.
+ \param [in] width - a width of input and output tensors.
+ \param [in] format - a format of input and output tensors. Can be NCHW or NHWC.
+ \param [in] scale - a pointer to the 32-bit float array with per-channel scale coefficients.
+ \param [in] shift - a pointer to the 32-bit float array with per-channel shift coefficients.
+ \param [out] dst - a pointer to the FP32 output tensor.
+ \param [in] compatibility - calculation compatibility flags.
*/
SIMD_API void SimdSynetConvert8uTo32f(const uint8_t* src, size_t batch, size_t channels, size_t height, size_t width, SimdTensorFormatType format, const float* scale, const float* shift, float* dst, SimdSynetCompatibilityType compatibility);
@@ -7524,10 +7588,21 @@ extern "C"
\fn void * SimdSynetConvolution32fInit(size_t batch, const SimdConvolutionParameters * conv);
- \short Initializes FP32 convolution algorithm.
+ \short Initializes an FP32 convolution context.
+
+ The function validates convolution parameters and chooses a suitable implementation (direct, depthwise,
+ Winograd, NHWC-specialized or GEMM-based). It supports FP32 source and destination tensors with matching
+ NCHW or NHWC format. The destination spatial size must match convolution parameters:
+ \verbatim
+ dstH = (srcH + padY + padH - (dilationY*(kernelY - 1) + 1)) / strideY + 1
+ dstW = (srcW + padX + padW - (dilationX*(kernelX - 1) + 1)) / strideX + 1
+ \endverbatim
+
+ A created context stores tensor shape, format, convolution geometry, group count and activation type.
+ Weights, bias and activation parameters are attached later by ::SimdSynetConvolution32fSetParams.
\param [in] batch - a batch size.
- \param [in] conv - a pointer to convolution parameters.
+ \param [in] conv - a pointer to convolution parameters. Source and destination tensor types must be FP32.
\return a pointer to FP32 convolution context. On error it returns NULL. It must be released with using of function ::SimdRelease.
This pointer is used in functions ::SimdSynetConvolution32fExternalBufferSize, ::SimdSynetConvolution32fInternalBufferSize,
::SimdSynetConvolution32fInfo, ::SimdSynetConvolution32fSetParams and ::SimdSynetConvolution32fForward.
@@ -7538,10 +7613,14 @@ extern "C"
\fn size_t SimdSynetConvolution32fExternalBufferSize(const void * context);
- \short Gets size of external temporary buffer required for FP32 convolution algorithm.
+ \short Gets the size of caller-provided temporary buffer for FP32 convolution.
+
+ The returned value is a number of 32-bit float elements, not bytes. It depends on the implementation selected
+ during initialization and can be used to allocate the \a buf argument of ::SimdSynetConvolution32fForward.
+ Some implementations return 1 when they do not need external temporary storage.
\param [in] context - a pointer to FP32 convolution context. It must be created by function ::SimdSynetConvolution32fInit and released by function ::SimdRelease.
- \return size of external temporary buffer required for FP32 convolution algorithm.
+ \return a number of FP32 elements required for external temporary buffer.
*/
SIMD_API size_t SimdSynetConvolution32fExternalBufferSize(const void * context);
@@ -7549,10 +7628,14 @@ extern "C"
\fn size_t SimdSynetConvolution32fInternalBufferSize(const void * context);
- \short Gets size of internal buffer used inside FP32 convolution algorithm.
+ \short Gets the size of internal storage used by an FP32 convolution context.
+
+ The returned value is a number of 32-bit float elements, not bytes. It reports internal storage tracked by
+ the selected implementation, such as internal temporary buffers and implementation-specific reordered weights,
+ bias or activation parameters already allocated by the context.
\param [in] context - a pointer to FP32 convolution context. It must be created by function ::SimdSynetConvolution32fInit and released by function ::SimdRelease.
- \return size of internal buffer used inside FP32 convolution algorithm.
+ \return a number of FP32 elements used by internal buffers.
*/
SIMD_API size_t SimdSynetConvolution32fInternalBufferSize(const void * context);
@@ -7560,10 +7643,14 @@ extern "C"
\fn const char* SimdSynetConvolution32fInfo(const void* context);
- \short Gets description of internal implementation of FP32 convolution algorithm.
+ \short Gets a short description of the selected FP32 convolution implementation.
+
+ The returned string contains the implementation extension and algorithm name, for example a direct, depthwise,
+ Winograd, NHWC direct or GEMM-based variant. The returned pointer is owned by the context and remains valid
+ until the next call of this function for the same context or until the context is released.
\param [in] context - a pointer to FP32 convolution context. It must be created by function ::SimdSynetConvolution32fInit and released by function ::SimdRelease.
- \return string with description of internal implementation of FP32 convolution algorithm.
+ \return a string with description of internal implementation of FP32 convolution algorithm.
*/
SIMD_API const char* SimdSynetConvolution32fInfo(const void* context);
@@ -7571,13 +7658,22 @@ extern "C"
\fn void SimdSynetConvolution32fSetParams(void * context, const float * weight, SimdBool * internal, const float * bias, const float * params);
- \short Sets weights, biases and parameters of activation function required for FP32 convolution algorithm.
+ \short Sets weights, bias and activation parameters for FP32 convolution.
+
+ This function must be called before ::SimdSynetConvolution32fForward. The \a weight array contains FP32
+ convolution weights with kernelY*kernelX*srcC*dstC/group elements. Depending on the selected implementation,
+ weights can be used directly or transformed and stored inside the context. If \a internal is not NULL, the
+ selected implementation writes the weight storage mode to it: SimdTrue means that weights were transformed and
+ stored internally, while SimdFalse means that the implementation may use the original \a weight array directly,
+ so the caller must keep it valid for later forward calls. Bias and activation parameters can also be copied
+ internally by some implementations; otherwise their pointers are stored in the context.
\param [in, out] context - a pointer to FP32 convolution context. It must be created by function ::SimdSynetConvolution32fInit and released by function ::SimdRelease.
- \param [in] weight - a pointer to convolution weights.
- \param [out] internal - a flag signalizing that weight is stored in the internal buffer. Can be NULL.
- \param [in] bias - a pointer to bias. Can be NULL.
- \param [in] params - a pointer to parameters of activation functions (see ::SimdConvolutionActivationType). Can be NULL.
+ \param [in] weight - a pointer to FP32 convolution weights.
+ \param [out] internal - a pointer to a flag receiving weight ownership mode. Can be NULL.
+ \param [in] bias - a pointer to FP32 bias array with dstC elements. Can be NULL.
+ \param [in] params - a pointer to FP32 parameters of activation function (see ::SimdConvolutionActivationType).
+ Can be NULL when activation does not require parameters.
*/
SIMD_API void SimdSynetConvolution32fSetParams(void * context, const float * weight, SimdBool * internal, const float * bias, const float * params);
@@ -7585,12 +7681,25 @@ extern "C"
\fn void SimdSynetConvolution32fForward(void * context, const float * src, float * buf, float * dst);
- \short Performs forward propagation of FP32 convolution algorithm.
+ \short Performs forward propagation of FP32 convolution.
+
+ The function convolves each image in the batch, adds bias when it was set, and applies the activation specified
+ in ::SimdConvolutionParameters:
+ \verbatim
+ sum = bias == NULL ? 0 : bias[dc];
+ for(sc = 0; sc < srcC/group; ++sc)
+ for(ky = 0; ky < kernelY; ++ky)
+ for(kx = 0; kx < kernelX; ++kx)
+ sum += src[inputOffset] * weight[weightOffset];
+ dst[outputOffset] = Activate(sum, activation, params);
+ \endverbatim
+ The exact offsets depend on tensor format, padding, dilation, stride and group. The input and output tensors
+ use the shape and format from the context created by ::SimdSynetConvolution32fInit.
\param [in] context - a pointer to FP32 convolution context. It must be created by function ::SimdSynetConvolution32fInit and released by function ::SimdRelease.
- \param [in] src - a pointer to input tensor.
- \param [out] buf - a pointer to external temporary buffer. The size of the external temporary buffer is determined by function ::SimdSynetConvolution32fExternalBufferSize. Can be NULL (it causes usage of internal buffer).
- \param [out] dst - a pointer to output tensor.
+ \param [in] src - a pointer to FP32 input tensor.
+ \param [out] buf - a pointer to external temporary FP32 buffer. The required number of elements is determined by function ::SimdSynetConvolution32fExternalBufferSize. Can be NULL (it causes usage of internal buffer).
+ \param [out] dst - a pointer to FP32 output tensor.
*/
SIMD_API void SimdSynetConvolution32fForward(void * context, const float * src, float * buf, float * dst);
@@ -7598,11 +7707,24 @@ extern "C"
\fn void * SimdSynetConvolution16bInit(size_t batch, const SimdConvolutionParameters * conv, SimdSynetCompatibilityType compatibility);
- \short Initializes BF16 convolution algorithm.
+ \short Initializes a BF16/FP32 convolution context.
+
+ The function validates convolution parameters and chooses a suitable BF16-oriented implementation (GEMM,
+ NCHW/NHWC GEMM, NHWC depthwise, NHWC special convolution or AMX-BF16 variant when available). It supports
+ FP32 or BF16 source and destination tensors with matching NCHW or NHWC format. The destination spatial size
+ must match convolution parameters:
+ \verbatim
+ dstH = (srcH + padY + padH - (dilationY*(kernelY - 1) + 1)) / strideY + 1
+ dstW = (srcW + padX + padW - (dilationX*(kernelX - 1) + 1)) / strideX + 1
+ \endverbatim
+
+ A created context stores tensor shape, data types, format, convolution geometry, group count, activation type
+ and compatibility flags. FP32 weights, bias and activation parameters are attached later by
+ ::SimdSynetConvolution16bSetParams.
\param [in] batch - a batch size.
- \param [in] conv - a pointer to convolution parameters.
- \param [in] compatibility - a flags of calculation compatibility.
+ \param [in] conv - a pointer to convolution parameters. Source and destination tensor types must be FP32 or BF16.
+ \param [in] compatibility - calculation compatibility flags.
\return a pointer to BF16 convolution context. On error it returns NULL. It must be released with using of function ::SimdRelease.
This pointer is used in functions ::SimdSynetConvolution16bExternalBufferSize, ::SimdSynetConvolution16bInternalBufferSize,
::SimdSynetConvolution16bInfo, ::SimdSynetConvolution16bSetParams and ::SimdSynetConvolution16bForward.
@@ -7613,10 +7735,14 @@ extern "C"
\fn size_t SimdSynetConvolution16bExternalBufferSize(const void * context);
- \short Gets size in bytes of external temporary buffer required for BF16 convolution algorithm.
+ \short Gets the size in bytes of caller-provided temporary buffer for BF16 convolution.
+
+ The returned value is a number of bytes. It depends on the implementation selected during initialization and
+ can be used to allocate the \a buf argument of ::SimdSynetConvolution16bForward. Some implementations return 1
+ or 0 when they do not need external temporary storage.
\param [in] context - a pointer to BF16 convolution context. It must be created by function ::SimdSynetConvolution16bInit and released by function ::SimdRelease.
- \return size of external temporary buffer required for BF16 convolution algorithm.
+ \return a number of bytes required for external temporary buffer.
*/
SIMD_API size_t SimdSynetConvolution16bExternalBufferSize(const void* context);
@@ -7624,10 +7750,13 @@ extern "C"
\fn size_t SimdSynetConvolution16bInternalBufferSize(const void * context);
- \short Gets size (in bytes) of internal buffer used inside BF16 convolution algorithm.
+ \short Gets the size in bytes of internal storage used by a BF16 convolution context.
+
+ The returned value reports internal storage tracked by the selected implementation, including internal
+ temporary buffers, transformed weights, copied bias and copied activation parameters.
\param [in] context - a pointer to BF16 convolution context. It must be created by function ::SimdSynetConvolution16bInit and released by function ::SimdRelease.
- \return size of internal buffer used inside BF16 convolution algorithm.
+ \return a number of bytes used by internal buffers.
*/
SIMD_API size_t SimdSynetConvolution16bInternalBufferSize(const void* context);
@@ -7635,23 +7764,34 @@ extern "C"
\fn const char* SimdSynetConvolution16bInfo(const void* context);
- \short Gets description of internal implementation of BF16 convolution algorithm.
+ \short Gets a short description of the selected BF16 convolution implementation.
+
+ The returned string contains the implementation extension and algorithm name, for example a GEMM, NCHW/NHWC
+ GEMM, NHWC depthwise, NHWC special or AMX-BF16 variant. The returned pointer is owned by the context and
+ remains valid until the next call of this function for the same context or until the context is released.
\param [in] context - a pointer to BF16 convolution context. It must be created by function ::SimdSynetConvolution16bInit and released by function ::SimdRelease.
- \return string with description of internal implementation of BF16 convolution algorithm.
+ \return a string with description of internal implementation of BF16 convolution algorithm.
*/
SIMD_API const char* SimdSynetConvolution16bInfo(const void* context);
/*! @ingroup synet_convolution_bf16
- \fn void SimdSynetConvolution16bSetParams(void * context, const float * weight, const float * bias, const float * params, const float * const * stats);
+ \fn void SimdSynetConvolution16bSetParams(void * context, const float * weight, const float * bias, const float * params);
- \short Sets weights, biases, parameters of activation function, input/output tensor statistics required for BF16 convolution algorithm.
+ \short Sets weights, bias and activation parameters for BF16 convolution.
+
+ This function must be called before ::SimdSynetConvolution16bForward. The \a weight array contains FP32
+ convolution weights with kernelY*kernelX*srcC*dstC/group elements. The selected implementation transforms
+ weights to its internal representation (usually BF16 and reordered; some depthwise paths keep FP32 weights).
+ Bias is copied to an internal FP32 array; when \a bias is NULL, zeros are used. Activation parameters are
+ copied or expanded to the internal FP32 array according to ::SimdConvolutionActivationType.
\param [in, out] context - a pointer to BF16 convolution context. It must be created by function ::SimdSynetConvolution16bInit and released by function ::SimdRelease.
- \param [in] weight - a pointer to original (32-bit float point) convolution weights.
- \param [in] bias - a pointer to original (32-bit float point) bias. Can be NULL.
- \param [in] params - a pointer to original (32-bit float point) parameters of activation functions (see ::SimdConvolutionActivationType). Can be NULL.
+ \param [in] weight - a pointer to FP32 convolution weights.
+ \param [in] bias - a pointer to FP32 bias array with dstC elements. Can be NULL.
+ \param [in] params - a pointer to FP32 parameters of activation function (see ::SimdConvolutionActivationType).
+ Can be NULL when activation does not require parameters.
*/
SIMD_API void SimdSynetConvolution16bSetParams(void* context, const float* weight, const float* bias, const float* params);
@@ -7659,12 +7799,29 @@ extern "C"
\fn void SimdSynetConvolution16bForward(void * context, const uint8_t * src, uint8_t * buf, uint8_t * dst);
- \short Performs forward propagation of BF16 convolution algorithm.
+ \short Performs forward propagation of BF16/FP32 convolution.
+
+ The function converts FP32 input to BF16 when the context source type is FP32, uses BF16 input directly when
+ the source type is BF16, accumulates convolution sums in FP32, adds bias, applies activation and writes FP32
+ or BF16 output according to the context destination type:
+ \verbatim
+ sum = bias[dc];
+ for(sc = 0; sc < srcC/group; ++sc)
+ for(ky = 0; ky < kernelY; ++ky)
+ for(kx = 0; kx < kernelX; ++kx)
+ sum += inputValue * weightValue;
+ value = Activate(sum, activation, params);
+ dst[outputOffset] = dstT == SimdTensorData16b ? Float32ToBFloat16(value) : value;
+ \endverbatim
+ The input value is read as BF16 or converted from FP32 to BF16 according to srcT. The weight value comes from
+ the internal representation prepared by ::SimdSynetConvolution16bSetParams.
+ The exact offsets depend on tensor format, padding, dilation, stride and group. The input and output tensors
+ use the shape, data types and format from the context created by ::SimdSynetConvolution16bInit.
\param [in] context - a pointer to BF16 convolution context. It must be created by function ::SimdSynetConvolution16bInit and released by function ::SimdRelease.
- \param [in] src - a pointer to input tensor.
- \param [out] buf - a pointer to external temporary buffer. The size of the external temporary buffer is determined by function ::SimdSynetConvolution16bExternalBufferSize. Can be NULL (it causes usage of internal buffer).
- \param [out] dst - a pointer to output tensor.
+ \param [in] src - a pointer to input tensor. Actual element type is defined by srcT in convolution parameters.
+ \param [out] buf - a pointer to external temporary byte buffer. The required size is determined by function ::SimdSynetConvolution16bExternalBufferSize. Can be NULL (it causes usage of internal buffer).
+ \param [out] dst - a pointer to output tensor. Actual element type is defined by dstT in convolution parameters.
*/
SIMD_API void SimdSynetConvolution16bForward(void* context, const uint8_t* src, uint8_t* buf, uint8_t* dst);
@@ -7672,11 +7829,26 @@ extern "C"
\fn void * SimdSynetConvolution8iInit(size_t batch, const SimdConvolutionParameters * conv, SimdSynetCompatibilityType compatibility);
- \short Initializes INT8 convolution algorithm.
+ \short Initializes an INT8 convolution context.
+
+ The function validates convolution parameters and chooses a suitable implementation (GEMM, NHWC direct,
+ NHWC depthwise or architecture-specific VNNI/AMX/NEON variant when available). It supports FP32 or UINT8
+ source and destination tensors with matching NCHW or NHWC format. The destination spatial size must match
+ convolution parameters:
+ \verbatim
+ dstH = (srcH + padY + padH - (dilationY*(kernelY - 1) + 1)) / strideY + 1
+ dstW = (srcW + padX + padW - (dilationX*(kernelX - 1) + 1)) / strideX + 1
+ \endverbatim
+
+ A created context stores tensor shape, data types, format, convolution geometry, group count, activation type
+ and compatibility flags. FP32 weights, bias, activation parameters and tensor statistics are attached later by
+ ::SimdSynetConvolution8iSetParams.
\param [in] batch - a batch size.
- \param [in] conv - a pointer to convolution parameters.
- \param [in] compatibility - a flags of calculation compatibility.
+ \param [in] conv - a pointer to convolution parameters. Source and destination tensor types must be FP32 or UINT8.
+ \param [in] compatibility - calculation compatibility flags. They select precise, overflow or narrowed INT8
+ calculation mode. Narrowed mode uses unsigned range [0, 180] and signed range [-90, 90]; otherwise
+ ranges are [0, 255] and [-128, 127].
\return a pointer to INT8 convolution context. On error it returns NULL. It must be released with using of function ::SimdRelease.
This pointer is used in functions ::SimdSynetConvolution8iExternalBufferSize, ::SimdSynetConvolution8iInternalBufferSize,
::SimdSynetConvolution8iInfo, ::SimdSynetConvolution8iSetParams and ::SimdSynetConvolution8iForward.
@@ -7687,10 +7859,14 @@ extern "C"
\fn size_t SimdSynetConvolution8iExternalBufferSize(const void * context);
- \short Gets size in bytes of external temporary buffer required for INT8 convolution algorithm.
+ \short Gets the size in bytes of caller-provided temporary buffer for INT8 convolution.
+
+ The returned value is a number of bytes. It depends on the implementation selected during initialization and
+ can be used to allocate the \a buf argument of ::SimdSynetConvolution8iForward. The buffer can contain temporary
+ UINT8 source conversion data, im2col/padded input data, INT32 sums and temporary FP32 output data.
\param [in] context - a pointer to INT8 convolution context. It must be created by function ::SimdSynetConvolution8iInit and released by function ::SimdRelease.
- \return size of external temporary buffer required for INT8 convolution algorithm.
+ \return a number of bytes required for external temporary buffer.
*/
SIMD_API size_t SimdSynetConvolution8iExternalBufferSize(const void * context);
@@ -7698,10 +7874,14 @@ extern "C"
\fn size_t SimdSynetConvolution8iInternalBufferSize(const void * context);
- \short Gets size of internal buffer used inside INT8 convolution algorithm.
+ \short Gets the size in bytes of internal storage used by an INT8 convolution context.
+
+ The returned value reports internal storage tracked by the selected implementation, including internal
+ temporary buffers, quantized/reordered INT8 weights, source and destination conversion parameters,
+ normalization, bias and activation parameters.
\param [in] context - a pointer to INT8 convolution context. It must be created by function ::SimdSynetConvolution8iInit and released by function ::SimdRelease.
- \return size of internal buffer used inside INT8 convolution algorithm.
+ \return a number of bytes used by internal buffers.
*/
SIMD_API size_t SimdSynetConvolution8iInternalBufferSize(const void * context);
@@ -7709,10 +7889,15 @@ extern "C"
\fn const char* SimdSynetConvolution8iInfo(const void* context);
- \short Gets description of internal implementation of INT8 convolution algorithm.
+ \short Gets a short description of the selected INT8 convolution implementation.
+
+ The returned string contains the implementation extension and algorithm name, for example a GEMM, NHWC direct
+ or NHWC depthwise variant, with a suffix for precise, overflow or narrowed mode when applicable. The returned
+ pointer is owned by the context and remains valid until the next call of this function for the same context or
+ until the context is released.
\param [in] context - a pointer to INT8 convolution context. It must be created by function ::SimdSynetConvolution8iInit and released by function ::SimdRelease.
- \return string with description of internal implementation of INT8 convolution algorithm.
+ \return a string with description of internal implementation of INT8 convolution algorithm.
*/
SIMD_API const char* SimdSynetConvolution8iInfo(const void* context);
@@ -7720,13 +7905,23 @@ extern "C"
\fn void SimdSynetConvolution8iSetParams(void * context, const float * weight, const float * bias, const float * params, const float * const * stats);
- \short Sets weights, biases, parameters of activation function, input/output tensor statistics required for INT8 convolution algorithm.
+ \short Sets weights, bias, activation parameters and tensor statistics for INT8 convolution.
+
+ This function must be called before ::SimdSynetConvolution8iForward. The \a weight array contains FP32
+ convolution weights with kernelY*kernelX*srcC*dstC/group elements. Source statistics (\a stats[0],
+ \a stats[1], each with srcC elements) define per-channel source quantization parameters; destination statistics
+ (\a stats[2], \a stats[3], each with dstC elements) define per-channel output quantization parameters. The
+ selected implementation converts weights to INT8, may reorder them, and computes per-output-channel normalization
+ and bias terms used to convert INT32 sums back to FP32. Activation parameters are copied or expanded internally
+ according to ::SimdConvolutionActivationType.
\param [in, out] context - a pointer to INT8 convolution context. It must be created by function ::SimdSynetConvolution8iInit and released by function ::SimdRelease.
- \param [in] weight - a pointer to original (32-bit float point) convolution weights.
- \param [in] bias - a pointer to original (32-bit float point) bias. Can be NULL.
- \param [in] params - a pointer to original (32-bit float point) parameters of activation functions (see ::SimdConvolutionActivationType). Can be NULL.
- \param [in] stats - a pointer to pointers with statistics of input(min - stats[0], max - stats[1]) and output(min - stats[2], max - stats[3]) tensors.
+ \param [in] weight - a pointer to FP32 convolution weights.
+ \param [in] bias - a pointer to FP32 bias array with dstC elements. Can be NULL.
+ \param [in] params - a pointer to FP32 parameters of activation function (see ::SimdConvolutionActivationType).
+ Can be NULL when activation does not require parameters.
+ \param [in] stats - a pointer to pointers with per-channel tensor statistics:
+ source minimum stats[0], source maximum stats[1], destination minimum stats[2], destination maximum stats[3].
*/
SIMD_API void SimdSynetConvolution8iSetParams(void * context, const float * weight, const float * bias, const float * params, const float * const* stats);
@@ -7734,12 +7929,27 @@ extern "C"
\fn void SimdSynetConvolution8iForward(void * context, const uint8_t * src, uint8_t * buf, uint8_t * dst);
- \short Performs forward propagation of INT8 convolution algorithm.
+ \short Performs forward propagation of INT8 convolution.
+
+ The function converts FP32 input to UINT8 when the context source type is FP32, uses UINT8 input directly when
+ the source type is UINT8, accumulates convolution sums in INT32 with INT8 weights, converts sums to FP32 using
+ internal normalization and bias, applies activation, and writes FP32 or UINT8 output according to the context
+ destination type:
+ \verbatim
+ if(srcT == SimdTensorData32f)
+ src8u = restrict(round(src32f*srcScale[c] + srcShift[c]), srcLower, srcUpper);
+ sum = convolution_int32(src8u, weight8i, zero);
+ value = Activate(sum*norm[dc] + bias[dc], activation, params);
+ dst[outputOffset] = dstT == SimdTensorData8u ?
+ restrict(round(value*dstScale[dc] + dstShift[dc]), dstLower, dstUpper) : value;
+ \endverbatim
+ The exact offsets depend on tensor format, padding, dilation, stride and group. The input and output tensors
+ use the shape, data types and format from the context created by ::SimdSynetConvolution8iInit.
\param [in] context - a pointer to INT8 convolution context. It must be created by function ::SimdSynetConvolution8iInit and released by function ::SimdRelease.
- \param [in] src - a pointer to input tensor.
- \param [out] buf - a pointer to external temporary buffer. The size of the external temporary buffer is determined by function ::SimdSynetConvolution8iExternalBufferSize. Can be NULL (it causes usage of internal buffer).
- \param [out] dst - a pointer to output tensor.
+ \param [in] src - a pointer to input tensor. Actual element type is defined by srcT in convolution parameters.
+ \param [out] buf - a pointer to external temporary byte buffer. The required size is determined by function ::SimdSynetConvolution8iExternalBufferSize. Can be NULL (it causes usage of internal buffer).
+ \param [out] dst - a pointer to output tensor. Actual element type is defined by dstT in convolution parameters.
*/
SIMD_API void SimdSynetConvolution8iForward(void * context, const uint8_t * src, uint8_t * buf, uint8_t * dst);
@@ -7747,11 +7957,23 @@ extern "C"
\fn void * SimdSynetDeconvolution32fInit(size_t batch, const SimdConvolutionParameters * conv, SimdSynetCompatibilityType compatibility);
- \short Initializes FP32 deconvolution algorithm.
+ \short Initializes an FP32 deconvolution context.
+
+ The function validates deconvolution parameters and chooses a suitable implementation (GEMM-based or
+ NHWC direct 2x2 when available). It supports FP32 source and destination tensors with matching NCHW format,
+ or matching NHWC format when group is 1. The destination spatial size must match deconvolution parameters:
+ \verbatim
+ dstH = strideY*(srcH - 1) + dilationY*(kernelY - 1) + 1 - padY - padH
+ dstW = strideX*(srcW - 1) + dilationX*(kernelX - 1) + 1 - padX - padW
+ \endverbatim
+
+ A created context stores tensor shape, format, deconvolution geometry, group count, activation type and
+ compatibility flags. Weights, bias and activation parameters are attached later by
+ ::SimdSynetDeconvolution32fSetParams.
\param [in] batch - a batch size.
- \param [in] conv - a pointer to deconvolution parameters.
- \param [in] compatibility - a flags of calculation compatibility.
+ \param [in] conv - a pointer to deconvolution parameters. Source and destination tensor types must be FP32.
+ \param [in] compatibility - calculation compatibility flags.
\return a pointer to FP32 deconvolution context. On error it returns NULL. It must be released with using of function ::SimdRelease.
This pointer is used in functions ::SimdSynetDeconvolution32fExternalBufferSize, ::SimdSynetDeconvolution32fInternalBufferSize,
::SimdSynetDeconvolution32fInfo, ::SimdSynetDeconvolution32fSetParams and ::SimdSynetDeconvolution32fForward.
@@ -7762,10 +7984,14 @@ extern "C"
\fn size_t SimdSynetDeconvolution32fExternalBufferSize(const void * context);
- \short Gets size of external temporary buffer required for FP32 deconvolution algorithm.
+ \short Gets the size of caller-provided temporary buffer for FP32 deconvolution.
+
+ The returned value is a number of 32-bit float elements, not bytes. It depends on the implementation selected
+ during initialization and can be used to allocate the \a buf argument of ::SimdSynetDeconvolution32fForward.
+ Some implementations return 1 when they do not need external temporary storage.
\param [in] context - a pointer to FP32 deconvolution context. It must be created by function ::SimdSynetDeconvolution32fInit and released by function ::SimdRelease.
- \return size of external temporary buffer required for FP32 deconvolution algorithm.
+ \return a number of FP32 elements required for external temporary buffer.
*/
SIMD_API size_t SimdSynetDeconvolution32fExternalBufferSize(const void * context);
@@ -7773,10 +7999,14 @@ extern "C"
\fn size_t SimdSynetDeconvolution32fInternalBufferSize(const void * context);
- \short Gets size of internal buffer used inside FP32 deconvolution algorithm.
+ \short Gets the size of internal storage used by an FP32 deconvolution context.
+
+ The returned value is a number of 32-bit float elements, not bytes. It reports internal storage tracked by
+ the selected implementation, such as internal temporary buffers and implementation-specific reordered weights,
+ bias or activation parameters already allocated by the context.
\param [in] context - a pointer to FP32 deconvolution context. It must be created by function ::SimdSynetDeconvolution32fInit and released by function ::SimdRelease.
- \return size of internal buffer used inside FP32 deconvolution algorithm.
+ \return a number of FP32 elements used by internal buffers.
*/
SIMD_API size_t SimdSynetDeconvolution32fInternalBufferSize(const void * context);
@@ -7784,10 +8014,14 @@ extern "C"
\fn const char* SimdSynetDeconvolution32fInfo(const void* context);
- \short Gets description of internal implementation of FP32 deconvolution algorithm.
+ \short Gets a short description of the selected FP32 deconvolution implementation.
+
+ The returned string contains the implementation extension and algorithm name, for example a GEMM-based or
+ NHWC direct 2x2 variant. The returned pointer is owned by the context and remains valid until the next call
+ of this function for the same context or until the context is released.
\param [in] context - a pointer to FP32 deconvolution context. It must be created by function ::SimdSynetDeconvolution32fInit and released by function ::SimdRelease.
- \return string with description of internal implementation of FP32 deconvolution algorithm.
+ \return a string with description of internal implementation of FP32 deconvolution algorithm.
*/
SIMD_API const char* SimdSynetDeconvolution32fInfo(const void* context);
@@ -7795,13 +8029,22 @@ extern "C"
\fn void SimdSynetDeconvolution32fSetParams(void * context, const float * weight, SimdBool * internal, const float * bias, const float * params);
- \short Sets weights, biases and parameters of activation function required for FP32 deconvolution algorithm.
+ \short Sets weights, bias and activation parameters for FP32 deconvolution.
+
+ This function must be called before ::SimdSynetDeconvolution32fForward. The \a weight array contains FP32
+ deconvolution weights with kernelY*kernelX*srcC*dstC/group elements. Depending on the selected implementation,
+ weights can be used directly or transformed and stored inside the context. If \a internal is not NULL, the
+ selected implementation writes the weight storage mode to it: SimdTrue means that weights were transformed and
+ stored internally, while SimdFalse means that the implementation may use the original \a weight array directly,
+ so the caller must keep it valid for later forward calls. Bias and activation parameters can also be copied
+ internally by some implementations; otherwise their pointers are stored in the context.
\param [in, out] context - a pointer to FP32 deconvolution context. It must be created by function ::SimdSynetDeconvolution32fInit and released by function ::SimdRelease.
- \param [in] weight - a pointer to deconvolution weights.
- \param [out] internal - a flag signalizing that weight is stored in the internal buffer. Can be NULL.
- \param [in] bias - a pointer to bias. Can be NULL.
- \param [in] params - a pointer to parameters of activation functions (see ::SimdConvolutionActivationType). Can be NULL.
+ \param [in] weight - a pointer to FP32 deconvolution weights.
+ \param [out] internal - a pointer to a flag receiving weight ownership mode. Can be NULL.
+ \param [in] bias - a pointer to FP32 bias array with dstC elements. Can be NULL.
+ \param [in] params - a pointer to FP32 parameters of activation function (see ::SimdConvolutionActivationType).
+ Can be NULL when activation does not require parameters.
*/
SIMD_API void SimdSynetDeconvolution32fSetParams(void * context, const float * weight, SimdBool * internal, const float * bias, const float * params);
@@ -7809,12 +8052,28 @@ extern "C"
\fn void SimdSynetDeconvolution32fForward(void * context, const float * src, float * buf, float * dst);
- \short Performs forward propagation of FP32 deconvolution algorithm.
+ \short Performs forward propagation of FP32 deconvolution.
+
+ The function applies transposed convolution to each image in the batch, adds bias when it was set, and applies
+ the activation specified in ::SimdConvolutionParameters:
+ \verbatim
+ dst[:] = 0;
+ for(sc = 0; sc < srcC/group; ++sc)
+ for(sy = 0; sy < srcH; ++sy)
+ for(sx = 0; sx < srcW; ++sx)
+ for(ky = 0; ky < kernelY; ++ky)
+ for(kx = 0; kx < kernelX; ++kx)
+ dst[outputOffset] += src[inputOffset] * weight[weightOffset];
+ dst[outputOffset] = Activate(dst[outputOffset] + bias[dc], activation, params);
+ \endverbatim
+ The exact offsets depend on tensor format, padding, dilation, stride and group. The input and output tensors
+ use the shape and format from the context created by ::SimdSynetDeconvolution32fInit.
\param [in] context - a pointer to FP32 deconvolution context. It must be created by function ::SimdSynetDeconvolution32fInit and released by function ::SimdRelease.
- \param [in] src - a pointer to input tensor.
- \param [out] buf - a pointer to external temporary buffer. The size of the external temporary buffer is determined by function ::SimdSynetDeconvolution32fExternalBufferSize. Can be NULL (it causes usage of internal buffer).
- \param [out] dst - a pointer to output tensor.
+ \param [in] src - a pointer to FP32 input tensor.
+ \param [out] buf - a pointer to external temporary FP32 buffer. The required number of elements is determined by
+ function ::SimdSynetDeconvolution32fExternalBufferSize. Can be NULL (it causes usage of internal buffer).
+ \param [out] dst - a pointer to FP32 output tensor.
*/
SIMD_API void SimdSynetDeconvolution32fForward(void * context, const float * src, float * buf, float * dst);