From 85260d14cdbad05e6c58900b2ae82c590f86823d Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Fri, 5 Jun 2026 12:31:01 +0000 Subject: [PATCH] Improve Synet API documentation Co-authored-by: Ihar Yermalayeu --- src/Simd/SimdLib.h | 473 ++++++++++++++++++++++++++++++--------------- 1 file changed, 316 insertions(+), 157 deletions(-) diff --git a/src/Simd/SimdLib.h b/src/Simd/SimdLib.h index 002ff5a239..1ebaa3e4c7 100644 --- a/src/Simd/SimdLib.h +++ b/src/Simd/SimdLib.h @@ -8081,12 +8081,25 @@ extern "C" \fn void * SimdSynetDeconvolution16bInit(size_t batch, const SimdConvolutionParameters * conv, SimdSynetCompatibilityType compatibility); - \short Initializes BF16 deconvolution algorithm. + \short Initializes a BF16/FP32 deconvolution context. + + The function validates deconvolution parameters and chooses a suitable BF16-oriented implementation (GEMM-based + or NHWC GEMM-based variant, including AMX-BF16 when available). It supports FP32 or BF16 source and destination + tensors with matching NCHW format, or matching NHWC format when group is 1. The destination spatial size must + match deconvolution parameters: + \verbatim + dstH = strideY*(srcH - 1) + dilationY*(kernelY - 1) + 1 - padY - padH + dstW = strideX*(srcW - 1) + dilationX*(kernelX - 1) + 1 - padX - padW + \endverbatim + + A created context stores tensor shape, data types, format, deconvolution geometry, group count, activation type + and compatibility flags. FP32 weights, bias and activation parameters are attached later by + ::SimdSynetDeconvolution16bSetParams. \param [in] batch - a batch size. - \param [in] conv - a pointer to deconvolution parameters. - \param [in] compatibility - a flags of calculation compatibility. - \return a pointer to BF16 convolution context. On error it returns NULL. It must be released with using of function ::SimdRelease. + \param [in] conv - a pointer to deconvolution parameters. Source and destination tensor types must be FP32 or BF16. + \param [in] compatibility - calculation compatibility flags. + \return a pointer to BF16 deconvolution context. On error it returns NULL. It must be released with using of function ::SimdRelease. This pointer is used in functions ::SimdSynetDeconvolution16bExternalBufferSize, ::SimdSynetDeconvolution16bInternalBufferSize, ::SimdSynetDeconvolution16bInfo, ::SimdSynetDeconvolution16bSetParams and ::SimdSynetDeconvolution16bForward. */ @@ -8096,10 +8109,14 @@ extern "C" \fn size_t SimdSynetDeconvolution16bExternalBufferSize(const void * context); - \short Gets size in bytes of external temporary buffer required for BF16 deconvolution algorithm. + \short Gets the size in bytes of caller-provided temporary buffer for BF16 deconvolution. + + The returned value is a number of bytes. It depends on the implementation selected during initialization and + can be used to allocate the \a buf argument of ::SimdSynetDeconvolution16bForward. Some implementations return + 1 or 0 when they do not need external temporary storage. \param [in] context - a pointer to BF16 deconvolution context. It must be created by function ::SimdSynetDeconvolution16bInit and released by function ::SimdRelease. - \return size of external temporary buffer required for BF16 convolution algorithm. + \return a number of bytes required for external temporary buffer. */ SIMD_API size_t SimdSynetDeconvolution16bExternalBufferSize(const void* context); @@ -8107,10 +8124,13 @@ extern "C" \fn size_t SimdSynetDeconvolution16bInternalBufferSize(const void * context); - \short Gets size (in bytes) of internal buffer used inside BF16 deconvolution algorithm. + \short Gets the size in bytes of internal storage used by a BF16 deconvolution context. + + The returned value reports internal storage tracked by the selected implementation, including internal + temporary buffers, transformed weights, copied bias and copied activation parameters. \param [in] context - a pointer to BF16 deconvolution context. It must be created by function ::SimdSynetDeconvolution16bInit and released by function ::SimdRelease. - \return size of internal buffer used inside BF16 convolution algorithm. + \return a number of bytes used by internal buffers. */ SIMD_API size_t SimdSynetDeconvolution16bInternalBufferSize(const void* context); @@ -8118,23 +8138,34 @@ extern "C" \fn const char* SimdSynetDeconvolution16bInfo(const void* context); - \short Gets description of internal implementation of BF16 deconvolution algorithm. + \short Gets a short description of the selected BF16 deconvolution implementation. + + The returned string contains the implementation extension and algorithm name, for example a GEMM or NHWC GEMM + variant. The returned pointer is owned by the context and remains valid until the next call of this function + for the same context or until the context is released. \param [in] context - a pointer to BF16 deconvolution context. It must be created by function ::SimdSynetDeconvolution16bInit and released by function ::SimdRelease. - \return string with description of internal implementation of BF16 convolution algorithm. + \return a string with description of internal implementation of BF16 deconvolution algorithm. */ SIMD_API const char* SimdSynetDeconvolution16bInfo(const void* context); /*! @ingroup synet_deconvolution_bf16 - \fn void SimdSynetDeconvolution16bSetParams(void * context, const float * weight, const float * bias, const float * params, const float * const * stats); + \fn void SimdSynetDeconvolution16bSetParams(void * context, const float * weight, const float * bias, const float * params); - \short Sets weights, biases, parameters of activation function, input/output tensor statistics required for BF16 deconvolution algorithm. + \short Sets weights, bias and activation parameters for BF16 deconvolution. + + This function must be called before ::SimdSynetDeconvolution16bForward. The \a weight array contains FP32 + deconvolution weights with kernelY*kernelX*srcC*dstC/group elements. The selected implementation transforms + weights to its internal BF16/reordered representation. Bias is copied to an internal FP32 array; when \a bias + is NULL, zeros are used. Activation parameters are copied or expanded to the internal FP32 array according to + ::SimdConvolutionActivationType. \param [in, out] context - a pointer to BF16 deconvolution context. It must be created by function ::SimdSynetDeconvolution16bInit and released by function ::SimdRelease. - \param [in] weight - a pointer to original (32-bit float point) convolution weights. - \param [in] bias - a pointer to original (32-bit float point) bias. Can be NULL. - \param [in] params - a pointer to original (32-bit float point) parameters of activation functions (see ::SimdDeconvolutionActivationType). Can be NULL. + \param [in] weight - a pointer to FP32 deconvolution weights. + \param [in] bias - a pointer to FP32 bias array with dstC elements. Can be NULL. + \param [in] params - a pointer to FP32 parameters of activation function (see ::SimdConvolutionActivationType). + Can be NULL when activation does not require parameters. */ SIMD_API void SimdSynetDeconvolution16bSetParams(void* context, const float* weight, const float* bias, const float* params); @@ -8142,12 +8173,31 @@ extern "C" \fn void SimdSynetDeconvolution16bForward(void * context, const uint8_t * src, uint8_t * buf, uint8_t * dst); - \short Performs forward propagation of BF16 deconvolution algorithm. + \short Performs forward propagation of BF16/FP32 deconvolution. + + The function converts FP32 input to BF16 when the context source type is FP32, uses BF16 input directly when + the source type is BF16, accumulates transposed convolution sums in FP32, adds bias, applies activation and + writes FP32 or BF16 output according to the context destination type: + \verbatim + dst[:] = 0; + for(sc = 0; sc < srcC/group; ++sc) + for(sy = 0; sy < srcH; ++sy) + for(sx = 0; sx < srcW; ++sx) + for(ky = 0; ky < kernelY; ++ky) + for(kx = 0; kx < kernelX; ++kx) + dst[outputOffset] += inputValue * weightValue; + value = Activate(dst[outputOffset] + bias[dc], activation, params); + dst[outputOffset] = dstT == SimdTensorData16b ? Float32ToBFloat16(value) : value; + \endverbatim + The input value is read as BF16 or converted from FP32 to BF16 according to srcT. The weight value comes from + the internal representation prepared by ::SimdSynetDeconvolution16bSetParams. + The exact offsets depend on tensor format, padding, dilation, stride and group. The input and output tensors + use the shape, data types and format from the context created by ::SimdSynetDeconvolution16bInit. \param [in] context - a pointer to BF16 deconvolution context. It must be created by function ::SimdSynetDeconvolution16bInit and released by function ::SimdRelease. - \param [in] src - a pointer to input tensor. - \param [out] buf - a pointer to external temporary buffer. The size of the external temporary buffer is determined by function ::SimdSynetDeconvolution16bExternalBufferSize. Can be NULL (it causes usage of internal buffer). - \param [out] dst - a pointer to output tensor. + \param [in] src - a pointer to input tensor. Actual element type is defined by srcT in deconvolution parameters. + \param [out] buf - a pointer to external temporary byte buffer. The required size is determined by function ::SimdSynetDeconvolution16bExternalBufferSize. Can be NULL (it causes usage of internal buffer). + \param [out] dst - a pointer to output tensor. Actual element type is defined by dstT in deconvolution parameters. */ SIMD_API void SimdSynetDeconvolution16bForward(void* context, const uint8_t* src, uint8_t* buf, uint8_t* dst); @@ -8155,7 +8205,7 @@ extern "C" \fn void SimdSynetDequantizeLinear(const uint8_t* src, size_t size, int32_t bias, const float* norm, float* dst); - \short Performs UINT8 linear dequantization. + \short Dequantizes a UINT8 tensor to FP32 with a single scale and zero-point correction. Algorithm's details for ::SimdSynetDequantizeLinear: \verbatim @@ -8163,10 +8213,12 @@ extern "C" dst[i] = (src[i] + bias) * norm[0]; \endverbatim + This corresponds to dst = (src - zero) * scale when \a bias is equal to -zero and \a norm[0] is equal to scale. + \param [in] src - a pointer to UINT8 input tensor. - \param [in] size - a size of the input and output tensors. - \param [in] bias - a dequantization bias (-zero). - \param [in] norm - a dequantization norm (scale). + \param [in] size - a number of elements in the input and output tensors. + \param [in] bias - an integer value added to every input element, typically negative zero-point. + \param [in] norm - a pointer to FP32 dequantization scale. Only norm[0] is used. \param [out] dst - a pointer to FP32 output tensor. */ SIMD_API void SimdSynetDequantizeLinear(const uint8_t* src, size_t size, int32_t bias, const float* norm, float* dst); @@ -8175,13 +8227,16 @@ extern "C" \fn void SimdSynetEltwiseLayerForward(float const * const * src, const float * weight, size_t count, size_t size, SimdSynetEltwiseOperationType type, float * dst); - \short This function is used for forward propagation of EltwiseLayer. + \short Performs element-wise product, weighted sum, maximum or minimum over several FP32 tensors. + + The function reads \a count input arrays of equal length \a size and writes one output array. + The \a weight array is used only for ::SimdSynetEltwiseOperationSum. Algorithm's details for ::SimdSynetEltwiseOperationProduct: \verbatim for(j = 0; j < size; ++j) - dst[j] = 1; - for(i = 0; i < count; ++i) + dst[j] = src[0][j] * src[1][j]; + for(i = 2; i < count; ++i) for(j = 0; j < size; ++j) dst[j] *= src[i][j]; \endverbatim @@ -8189,8 +8244,8 @@ extern "C" Algorithm's details for ::SimdSynetEltwiseOperationSum: \verbatim for(j = 0; j < size; ++j) - dst[j] = 0; - for(i = 0; i < count; ++i) + dst[j] = src[0][j]*weight[0] + src[1][j]*weight[1]; + for(i = 2; i < count; ++i) for(j = 0; j < size; ++j) dst[j] += src[i][j]*weight[i]; \endverbatim @@ -8198,8 +8253,8 @@ extern "C" Algorithm's details for ::SimdSynetEltwiseOperationMax: \verbatim for(j = 0; j < size; ++j) - dst[j] = -FLT_MAX; - for(i = 0; i < count; ++i) + dst[j] = Max(src[0][j], src[1][j]); + for(i = 2; i < count; ++i) for(j = 0; j < size; ++j) dst[j] = Max(dst[j], src[i][j]); \endverbatim @@ -8207,20 +8262,20 @@ extern "C" Algorithm's details for ::SimdSynetEltwiseOperationMin: \verbatim for(j = 0; j < size; ++j) - dst[j] = FLT_MAX; - for(i = 0; i < count; ++i) + dst[j] = Min(src[0][j], src[1][j]); + for(i = 2; i < count; ++i) for(j = 0; j < size; ++j) dst[j] = Min(dst[j], src[i][j]); \endverbatim \note This function is used in Synet Framework. - \param [in] src - a pointer to pointers to the input 32-bit float arrays. - \param [in] weight - a pointer to the 32-bit float array with sum coefficients. It is need only for ::SimdSynetEltwiseOperationSum operation type otherwise it can be NULL. + \param [in] src - a pointer to \a count pointers to input FP32 arrays. + \param [in] weight - a pointer to FP32 weighted-sum coefficients. It is used only for ::SimdSynetEltwiseOperationSum; otherwise it can be NULL. \param [in] count - a count of input arrays. Must be at least 2. - \param [in] size - a size of the input and output arrays. + \param [in] size - a number of elements in each input and output array. \param [in] type - a type of operation (see ::SimdSynetEltwiseOperationType). - \param [out] dst - a pointer to the output 32-bit float array. + \param [out] dst - a pointer to the output FP32 array. */ SIMD_API void SimdSynetEltwiseLayerForward(float const * const * src, const float * weight, size_t count, size_t size, SimdSynetEltwiseOperationType type, float * dst); @@ -8228,22 +8283,22 @@ extern "C" \fn void SimdSynetElu32f(const float * src, size_t size, const float * alpha, float * dst); - \short Calculates ELU activation function for 32-bit float array. + \short Calculates ELU activation for an FP32 array. The input and output arrays must have the same size. Algorithm's details: \verbatim for(i = 0; i < size; ++i) - dst[i] = src[i] >= 0 ? src[i] : alpha*(Exp(src[i]) - 1); + dst[i] = src[i] >= 0 ? src[i] : alpha[0]*(exp(src[i]) - 1); \endverbatim \note This function is used in Synet Framework. - \param [in] src - a pointer to the input 32-bit float array. - \param [in] size - a size of input and output arrays. - \param [in] alpha - a pointer to alpha parameter. - \param [out] dst - a pointer to the output 32-bit float array. + \param [in] src - a pointer to the input FP32 array. + \param [in] size - a number of elements in the input and output arrays. + \param [in] alpha - a pointer to ELU alpha parameter. Only alpha[0] is used. + \param [out] dst - a pointer to the output FP32 array. */ SIMD_API void SimdSynetElu32f(const float * src, size_t size, const float * alpha, float * dst); @@ -8251,28 +8306,45 @@ extern "C" \fn void* SimdSynetGatherElementsInit(SimdTensorDataType dataType, SimdTensorDataType indexType, SimdBool indexConst, size_t indexUsers, const size_t * outer, size_t outerSize, size_t srcCount, size_t inner, size_t idxCount); + \short Initializes a gather-elements context. + + The function creates a context for ONNX-style GatherElements along the dimension of length \a srcCount. + It supports FP32, BF16 and UINT8 data tensors and INT32 or INT64 index tensors. The input tensor shape is: + \verbatim + outer[0] * ... * outer[outerSize - 1] * srcCount * inner + \endverbatim + The index and output tensor shape is: + \verbatim + outer[0] * ... * outer[outerSize - 1] * idxCount * inner + \endverbatim + Algorithm's details: \verbatim - for (o = 0; o < outer; ++o) - for (c = 0; c < idxCount; ++c) - for (i = 0; i < inner; ++i) - ic = idx[o, c, i]; + for(b = 0; b < outer[0]*...*outer[outerSize - 1]; ++b) + for(c = 0; c < idxCount; ++c) + for(i = 0; i < inner; ++i) + { + ic = idx[b, c, i]; if (ic < 0) ic += srcCount; - dst[o, c, i] = src[o, ic, i]; + dst[b, c, i] = src[b, ic, i]; + } \endverbatim - \param [in] dataType - a type of input and output tensor. + If \a indexConst is SimdTrue, constant indexes can be analyzed by ::SimdSynetGatherElementsSetIndex to avoid + repeated negative-index checks and to reduce repeated outer index processing when possible. + + \param [in] dataType - a type of input and output tensor. It can be FP32, BF16 or UINT8. \param [in] indexType - a type of index tensor. It can be INT32 or INT64. - \param [in] indexConst - a flag of constant index. If this flag is True then you can set index once. - \param [in] indexUsers - a number of index users. - \param [in] outer - a pointer to outer shape of input and output tensor. - \param [in] outerSize - a size of outer shape of input and output tensor. - \param [in] srcCount - an input tensor gather dimension. - \param [in] inner - an inner size of input and output tensor. - \param [in] idxCount - an index and output tensor gather dimension. + \param [in] indexConst - a flag indicating that index tensor is constant and can be set once. + \param [in] indexUsers - a number of consumers sharing the same constant index tensor. The current implementation stores this value but does not otherwise use it. + \param [in] outer - a pointer to outer shape dimensions before the gathered dimension. + \param [in] outerSize - a number of dimensions in \a outer. + \param [in] srcCount - a length of the gathered dimension in the input tensor. + \param [in] inner - a product of dimensions after the gathered dimension. + \param [in] idxCount - a length of the gathered dimension in the index and output tensors. \return a pointer to gather elements context. On error it returns NULL. It must be released with using of function ::SimdRelease. - This pointer is used in functions :: SimdSynetGatherElementsSetIndex, ::SimdSynetGatherElementsInternalBufferSize, and ::SimdSynetGatherElementsForward. + This pointer is used in functions ::SimdSynetGatherElementsSetIndex, ::SimdSynetGatherElementsInternalBufferSize and ::SimdSynetGatherElementsForward. */ SIMD_API void* SimdSynetGatherElementsInit(SimdTensorDataType dataType, SimdTensorDataType indexType, SimdBool indexConst, size_t indexUsers, const size_t * outer, size_t outerSize, size_t srcCount, size_t inner, size_t idxCount); @@ -8281,11 +8353,14 @@ extern "C" \fn void SimdSynetGatherElementsSetIndex(const void* context, const uint8_t* idx); - \short Sets and analyses constant gather elements indexes. + \short Sets and analyzes constant gather-elements indexes. + + The function has an effect only when the context was created with \a indexConst equal to SimdTrue. It detects + whether negative-index correction is needed and may collapse repeated outer batches of identical indexes. + The current implementation still expects the index pointer to be passed to ::SimdSynetGatherElementsForward. \param [in] context - a pointer to gather elements context. It must be created by function ::SimdSynetGatherElementsInit and released by function ::SimdRelease. - \param [in] idx - a pointer to tensor with indexes. It can be INT32 or INT64. - Its size = outer[0] * .. * outer[outerSize - 1] * idxCount * inner. + \param [in] idx - a pointer to INT32 or INT64 index tensor. Its shape is outer[0] * ... * outer[outerSize - 1] * idxCount * inner. */ SIMD_API void SimdSynetGatherElementsSetIndex(const void* context, const uint8_t* idx); @@ -8293,7 +8368,9 @@ extern "C" \fn size_t SimdSynetGatherElementsInternalBufferSize(const void* context); - \short Gets size of internal buffer in bytes used inside gather elements algorithm. + \short Gets the size in bytes of internal storage used by a gather-elements context. + + The returned value reports implementation-specific buffers used by the context. \param [in] context - a pointer to gather elements context. It must be created by function ::SimdSynetGatherElementsInit and released by function ::SimdRelease. \return size of internal buffer in bytes used inside gather elements algorithm. @@ -8304,12 +8381,16 @@ extern "C" \fn void SimdSynetGatherElementsForward(void* context, const uint8_t* src, const uint8_t* idx, uint8_t* dst); - \short Performs forward propagation of gather elements algorithm. + \short Performs gather-elements forward propagation. + + The function gathers elements from \a src according to \a idx. If ::SimdSynetGatherElementsSetIndex was called, + the context can use the analysis results, but \a idx must still point to the index tensor in the current + implementation. Negative indexes are interpreted relative to \a srcCount. - \param [in] context - a pointer to gather elements algorithm. It must be created by function ::SimdSynetGatherElementsInit and released by function ::SimdRelease. - \param [in] src - a pointer to input tensor. Its size = outer[0] * .. * outer[outerSize - 1] * srcCount * inner. - \param [in] idx - a pointer to index tensor. Its size = outer[0] * .. * outer[outerSize - 1] * idxCount * inner. - \param [out] dst - a pointer to output tensor. Its size = outer[0] * .. * outer[outerSize - 1] * idxCount * inner. + \param [in] context - a pointer to gather elements context. It must be created by function ::SimdSynetGatherElementsInit and released by function ::SimdRelease. + \param [in] src - a pointer to input tensor. Its shape is outer[0] * ... * outer[outerSize - 1] * srcCount * inner. + \param [in] idx - a pointer to INT32 or INT64 index tensor. Its shape is outer[0] * ... * outer[outerSize - 1] * idxCount * inner. + \param [out] dst - a pointer to output tensor. Its shape is outer[0] * ... * outer[outerSize - 1] * idxCount * inner. */ SIMD_API void SimdSynetGatherElementsForward(void* context, const uint8_t* src, const uint8_t* idx, uint8_t* dst); @@ -8317,7 +8398,7 @@ extern "C" \fn void SimdSynetGelu32f(const float* src, size_t size, float* dst); - \short This function is used for forward propagation of GeluLayer. + \short Calculates exact GELU activation for an FP32 array. Algorithm's details: \verbatim @@ -8327,9 +8408,9 @@ extern "C" \note This function is used in Synet Framework. - \param [in] src - a pointer to the 32-bit float array. - \param [in] size - a size of input and output arrays. - \param [out] dst - a pointer to output 32-bit float array. + \param [in] src - a pointer to the input FP32 array. + \param [in] size - a number of elements in the input and output arrays. + \param [out] dst - a pointer to the output FP32 array. */ SIMD_API void SimdSynetGelu32f(const float* src, size_t size, float* dst); @@ -8337,7 +8418,12 @@ extern "C" \fn void* SimdSynetGridSample2dInit(size_t batch, size_t channels, size_t srcH, size_t srcW, size_t dstH, size_t dstW, SimdTensorDataType type, SimdGridSampleInterpType interp, SimdGridSamplePaddingType padding, SimdBool align); - \short Initializes grid sample 2D algorithm. + \short Initializes an ONNX-compatible GridSample-2D context. + + The function creates a context for NCHW tensors. The input tensor shape is batch*channels*srcH*srcW, the + grid tensor shape is batch*dstH*dstW*2 and the output tensor shape is batch*channels*dstH*dstW. The grid + stores normalized coordinates in the range [-1, 1] as pairs (x, y). The current implementation supports FP32 + source, grid and destination tensors. \param [in] batch - a batch size. \param [in] channels - a number of channels in the input and output tensors. @@ -8345,10 +8431,10 @@ extern "C" \param [in] srcW - a width of input tensor. \param [in] dstH - a height of output tensor. \param [in] dstW - a width of output tensor. - \param [in] type - a type of input, grid and output tensor. - \param [in] interp - an interpolation type. - \param [in] padding - a padding type. - \param [in] align - a flag to align corners. + \param [in] type - a type of input, grid and output tensor. Currently only FP32 is supported. + \param [in] interp - an interpolation type: bilinear, nearest or bicubic. + \param [in] padding - a padding type for out-of-bound grid coordinates. + \param [in] align - a flag corresponding to ONNX/PyTorch align_corners. \return a pointer to grid sample 2D context. On error it returns NULL. It must be released with using of function ::SimdRelease. This pointer is used in functions ::SimdSynetGridSample2dInternalBufferSize, and ::SimdSynetGridSample2dForward. */ @@ -8359,10 +8445,13 @@ extern "C" \fn size_t SimdSynetGridSample2dInternalBufferSize(const void* context); - \short Gets size of internal buffer used inside permute algorithm. + \short Gets the size in bytes of internal storage used by a GridSample-2D context. + + The optimized bilinear/zero-padding FP32 implementation stores padded source rows, precomputed source indexes + and interpolation coefficients internally. The reference implementation returns 0. \param [in] context - a pointer to grid sample 2D context. It must be created by function ::SimdSynetGridSample2dInit and released by function ::SimdRelease. - \return size of internal buffer used inside grid sample 2D algorithm. + \return a number of bytes used by internal buffers. */ SIMD_API size_t SimdSynetGridSample2dInternalBufferSize(const void* context); @@ -8370,12 +8459,24 @@ extern "C" \fn void SimdSynetGridSample2dForward(void* context, const uint8_t* src, const uint8_t* grd, uint8_t* dst); - \short Performs forward propagation of grid sample algorithm. + \short Performs GridSample-2D forward propagation. + + For every output pixel the function denormalizes grid coordinates to input image coordinates, applies the + selected padding rule for out-of-bound coordinates and samples the input with nearest, bilinear or bicubic + interpolation: + \verbatim + if(align) + x = (gridX + 1) * (srcW - 1) / 2; + else + x = ((gridX + 1) * srcW - 1) / 2; + y is computed in the same way from gridY and srcH. + dst[b, c, dy, dx] = Interpolate(src[b, c], x, y, interp, padding); + \endverbatim \param [in] context - a pointer to grid sample 2D context. It must be created by function ::SimdSynetGridSample2dInit and released by function ::SimdRelease. - \param [in] src - a pointer to input tensor. It has size = batch * channels * srcH * srcW. - \param [in] grd - a pointer to grid tensor. It has size = batch * dstH * dstW * 2. - \param [out] dst - a pointer to output tensor. It has size = batch * channels * dstH * dstW. + \param [in] src - a pointer to input tensor. Its shape is batch * channels * srcH * srcW. + \param [in] grd - a pointer to grid tensor. Its shape is batch * dstH * dstW * 2. + \param [out] dst - a pointer to output tensor. Its shape is batch * channels * dstH * dstW. */ SIMD_API void SimdSynetGridSample2dForward(void* context, const uint8_t* src, const uint8_t* grd, uint8_t* dst); @@ -8383,23 +8484,23 @@ extern "C" \fn void SimdSynetHardSigmoid32f(const float * src, size_t size, const float * scale, const float * shift, float * dst); - \short Calculates HardSigmoid activation function (https://pytorch.org/docs/stable/generated/torch.nn.Hardsigmoid.html) for 32-bit float array. + \short Calculates HardSigmoid activation for an FP32 array. - Input and output arrays must have the same size. + The input and output arrays must have the same size. Algorithm's details: \verbatim for(i = 0; i < size; ++i) - dst[i] = Max(0, Min(src[i] * scale + shift, 1)); + dst[i] = Max(0, Min(src[i] * scale[0] + shift[0], 1)); \endverbatim \note This function is used in Synet Framework. - \param [in] src - a pointer to the input 32-bit float array. - \param [in] size - a size of input and output arrays. - \param [in] scale - a pointer to scale parameter. This parameter is equal to 1/6 in Pytorch documentation. - \param [in] shift - a pointer to shift parameter. This parameter is equal to 1/2 in Pytorch documentation. - \param [out] dst - a pointer to the output 32-bit float array. + \param [in] src - a pointer to the input FP32 array. + \param [in] size - a number of elements in the input and output arrays. + \param [in] scale - a pointer to scale parameter. Only scale[0] is used. This parameter is equal to 1/6 in PyTorch documentation. + \param [in] shift - a pointer to shift parameter. Only shift[0] is used. This parameter is equal to 1/2 in PyTorch documentation. + \param [out] dst - a pointer to the output FP32 array. */ SIMD_API void SimdSynetHardSigmoid32f(const float * src, size_t size, const float * scale, const float * shift, float * dst); @@ -8407,23 +8508,23 @@ extern "C" \fn void SimdSynetHswish32f(const float * src, size_t size, const float * shift, const float * scale, float * dst); - \short Calculates H-Swish activation function (https://arxiv.org/pdf/1905.02244.pdf) for 32-bit float array. + \short Calculates H-Swish activation for an FP32 array. - Input and output arrays must have the same size. + The input and output arrays must have the same size. Algorithm's details: \verbatim for(i = 0; i < size; ++i) - dst[i] = Max(Min(src[i], shift) + shift, 0)*scale*src[i]; + dst[i] = Max(Min(src[i], shift[0]) + shift[0], 0)*scale[0]*src[i]; \endverbatim \note This function is used in Synet Framework. - \param [in] src - a pointer to the input 32-bit float array. - \param [in] size - a size of input and output arrays. - \param [in] shift - a pointer to shift parameter. It is equal to 3 in original paper. - \param [in] scale - a pointer to scale parameter. It is equal to 1/6 in original paper. - \param [out] dst - a pointer to the output 32-bit float array. + \param [in] src - a pointer to the input FP32 array. + \param [in] size - a number of elements in the input and output arrays. + \param [in] shift - a pointer to shift parameter. Only shift[0] is used. It is equal to 3 in the original paper. + \param [in] scale - a pointer to scale parameter. Only scale[0] is used. It is equal to 1/6 in the original paper. + \param [out] dst - a pointer to the output FP32 array. */ SIMD_API void SimdSynetHswish32f(const float* src, size_t size, const float* shift, const float* scale, float* dst); @@ -8431,13 +8532,28 @@ extern "C" \fn void * SimdSynetInnerProduct32fInit(size_t M, size_t N, size_t K, SimdBool transB, SimdBool constB, SimdBool bias, SimdConvolutionActivationType activation); - \short Initializes FP32 inner product algorithm. + \short Initializes an FP32 inner-product (matrix multiplication) context. - \param [in] M - a height of A and height of C matrices. - \param [in] N - a width of B and width of C matrices. + The context computes C = A*B, optionally adds bias and applies activation: + \verbatim + for(i = 0; i < M; ++i) + for(j = 0; j < N; ++j) + { + sum = bias ? bias[j] : 0; + for(k = 0; k < K; ++k) + sum += A[i, k] * (transB ? B[j, k] : B[k, j]); + C[i, j] = Activate(sum, activation, params); + } + \endverbatim + + When \a constB is SimdTrue, matrix B must be supplied to ::SimdSynetInnerProduct32fSetParams and can be + reordered or cached inside the context. + + \param [in] M - a height of A and C matrices. + \param [in] N - a width of B and C matrices. \param [in] K - a width of A and height of B matrices. - \param [in] transB - a transpose matrix B before multiplication. - \param [in] constB - a matrix B is constant. + \param [in] transB - a flag indicating that B is stored as N*K instead of K*N. + \param [in] constB - a flag indicating that matrix B is constant and can be set once. \param [in] bias - a flag to add bias to output matrix C. \param [in] activation - an activation function type used after inner product. \return a pointer to FP32 inner product context. On error it returns NULL. It must be released with using of function ::SimdRelease. @@ -8450,10 +8566,13 @@ extern "C" \fn size_t SimdSynetInnerProduct32fInternalBufferSize(const void * context); - \short Gets size of internal buffer used inside FP32 inner product algorithm. + \short Gets the size of internal storage used by an FP32 inner-product context. + + The returned value is a number of FP32 elements. It reports implementation-specific storage such as reordered + constant weights and copied bias. \param [in] context - a pointer to FP32 inner product context. It must be created by function ::SimdSynetInnerProduct32fInit and released by function ::SimdRelease. - \return size of internal buffer used inside FP32 inner product algorithm. + \return a number of FP32 elements used by internal buffers. */ SIMD_API size_t SimdSynetInnerProduct32fInternalBufferSize(const void* context); @@ -8461,10 +8580,14 @@ extern "C" \fn size_t SimdSynetInnerProduct32fExternalBufferSize(const void * context); - \short Gets size of external buffer used in FP32 inner product algorithm. + \short Gets the size of caller-provided temporary buffer for FP32 inner product. + + The returned value is a number of FP32 elements. The current FP32 implementations do not require an external + buffer and return 0, but callers can use this value when allocating the \a buf argument of + ::SimdSynetInnerProduct32fForward. \param [in] context - a pointer to FP32 inner product context. It must be created by function ::SimdSynetInnerProduct32fInit and released by function ::SimdRelease. - \return size of internal buffer used inside FP32 inner product algorithm. + \return a number of FP32 elements required for external temporary buffer. */ SIMD_API size_t SimdSynetInnerProduct32fExternalBufferSize(const void* context); @@ -8472,13 +8595,19 @@ extern "C" \fn void SimdSynetInnerProduct32fSetParams(void* context, const float* weight, SimdBool* internal, const float* bias, const float* params); - \short Sets weights, biases and parameters of activation function required for FP32 inner product algorithm. + \short Sets weights, bias and activation parameters for FP32 inner product. + + This function must be called before ::SimdSynetInnerProduct32fForward. If \a constB was SimdTrue during + initialization, \a weight provides matrix B and the implementation may reorder and store it internally. If + \a internal is not NULL, SimdTrue means the weights were copied/reordered into the context; SimdFalse means + the original \a weight pointer can be used by later forward calls and must remain valid. Bias and activation + parameters are stored or referenced according to the selected implementation. \param [in, out] context - a pointer to FP32 inner product context. It must be created by function ::SimdSynetInnerProduct32fInit and released by function ::SimdRelease. - \param [in] weight - a pointer to inner product weights. - \param [out] internal - a flag signalizing that weight is stored in the internal buffer. Can be NULL. - \param [in] bias - a pointer to bias. Can be NULL. - \param [in] params - a pointer to parameters of activation functions (see ::SimdConvolutionActivationType). Can be NULL. + \param [in] weight - a pointer to FP32 matrix B weights. + \param [out] internal - a pointer to a flag receiving weight storage mode. Can be NULL. + \param [in] bias - a pointer to FP32 bias array with N elements. Can be NULL. + \param [in] params - a pointer to FP32 parameters of activation function (see ::SimdConvolutionActivationType). Can be NULL when activation does not require parameters. */ SIMD_API void SimdSynetInnerProduct32fSetParams(void* context, const float* weight, SimdBool* internal, const float* bias, const float* params); @@ -8486,15 +8615,15 @@ extern "C" \fn void SimdSynetInnerProduct32fForward(void* context, const float* A, const float* B, float *buf, float* C); - \short Performs forward propagation of FP32 inner product algorithm. + \short Performs FP32 inner-product forward propagation. \param [in] context - a pointer to FP32 inner product context. It must be created by function ::SimdSynetInnerProduct32fInit and released by function ::SimdRelease. - \param [in] A - a pointer to A matrix. - \param [in] B - a pointer to B matrix. Can be NULL if B is constant matrix. In that case you have to set B (weight) in function SimdSynetInnerProduct16bSetParams. - \param [out] buf - a pointer to external buffer. The size of the external temporary buffer is determined by function ::SimdSynetInnerProduct16bExternalBufferSize. + \param [in] A - a pointer to FP32 A matrix with M*K elements. + \param [in] B - a pointer to FP32 B matrix. Can be NULL if B is constant; in that case B must be set by function ::SimdSynetInnerProduct32fSetParams. + \param [out] buf - a pointer to external temporary FP32 buffer. The required number of elements is determined by function ::SimdSynetInnerProduct32fExternalBufferSize. Can be NULL (it causes usage of internal buffer). - \param [out] C - a pointer to output matrix. + \param [out] C - a pointer to FP32 output matrix with M*N elements. */ SIMD_API void SimdSynetInnerProduct32fForward(void* context, const float* A, const float* B, float *buf, float* C); @@ -8502,7 +8631,7 @@ extern "C" \fn void SimdSynetInnerProductLayerForward(const float * src, const float * weight, const float * bias, size_t count, size_t size, float * dst); - \short This function is used for forward propagation of InnerProductLayer. + \short Performs FP32 forward propagation of a single inner-product layer. Algorithm's details: \verbatim @@ -8516,12 +8645,12 @@ extern "C" \note This function is used in Synet Framework. - \param [in] src - a pointer to the input 32-bit float array. The size of the array must be equal to size. - \param [in] weight - a pointer to the 32-bit float array with weight coefficients. The size of the array must be equal to count*size. - \param [in] bias - a pointer to the 32-bit float array with bias coefficients. The size of the array must be equal to count. Can be NULL. - \param [in] count - a size of output array. - \param [in] size - a size of input array. - \param [out] dst - a pointer to the output 32-bit float array. The size of the array must be equal to count. + \param [in] src - a pointer to the input FP32 array with \a size elements. + \param [in] weight - a pointer to FP32 weight coefficients with count*size elements, stored as count rows. + \param [in] bias - a pointer to FP32 bias coefficients with \a count elements. Can be NULL. + \param [in] count - a number of output elements. + \param [in] size - a number of input elements. + \param [out] dst - a pointer to the output FP32 array with \a count elements. */ SIMD_API void SimdSynetInnerProductLayerForward(const float * src, const float * weight, const float * bias, size_t count, size_t size, float * dst); @@ -8529,27 +8658,32 @@ extern "C" \fn void* SimdSynetInnerProduct16bInit(size_t M, size_t N, size_t K, SimdTensorDataType typeA, SimdTensorDataType typeB, SimdTensorDataType typeC, SimdBool transB, SimdBool constB, SimdBool bias, SimdConvolutionActivationType activation); - \short Initializes BF16 inner product (matrix multiplication) algorithm. + \short Initializes a BF16/FP32 inner-product (matrix multiplication) context. - Algorithm's details (transpA = false, bias = true, activation = SimdConvolutionActivationIdentity): + The context computes C = A*B with FP32 accumulation, optionally adds bias and applies activation. A, B and C + can be FP32 or BF16 according to \a typeA, \a typeB and \a typeC: \verbatim for(i = 0; i < M; ++i) for(j = 0; j < N; ++j) { - C[i,j] = bias[j]; + sum = bias ? bias[j] : 0; for(k = 0; k < K; ++k) - C[i,j] += A[i,k] * B[k,j]; + sum += A[i, k] * (transB ? B[j, k] : B[k, j]); + C[i, j] = ConvertToTypeC(Activate(sum, activation, params)); } \endverbatim - \param [in] M - a height of A and height of C matrices. - \param [in] N - a width of B and width of C matrices. + When \a constB is SimdTrue, matrix B must be supplied to ::SimdSynetInnerProduct16bSetParams and is converted + or reordered into internal storage. + + \param [in] M - a height of A and C matrices. + \param [in] N - a width of B and C matrices. \param [in] K - a width of A and height of B matrices. \param [in] typeA - a type of A matrix. It can be FP32 or BF16. \param [in] typeB - a type of B matrix. It can be FP32 or BF16. \param [in] typeC - a type of C matrix. It can be FP32 or BF16. - \param [in] transB - a transpose matrix B before multiplication. - \param [in] constB - a matrix B is constant. + \param [in] transB - a flag indicating that B is stored as N*K instead of K*N. + \param [in] constB - a flag indicating that matrix B is constant and can be set once. \param [in] bias - a flag to add bias to output matrix C. \param [in] activation - an activation function type used after inner product. \return a pointer to BF16 inner product context. On error it returns NULL. It must be released with using of function ::SimdRelease. @@ -8562,10 +8696,13 @@ extern "C" \fn size_t SimdSynetInnerProduct16bInternalBufferSize(const void * context); - \short Gets size in bytes of internal buffer used inside BF16 inner product algorithm. + \short Gets the size in bytes of internal storage used by a BF16 inner-product context. + + The returned value reports internal temporary storage, reordered constant weights, copied bias and copied + activation parameters. \param [in] context - a pointer to BF16 inner product context. It must be created by function ::SimdSynetInnerProduct16bInit and released by function ::SimdRelease. - \return size in bytes of internal buffer used inside BF16 inner product algorithm. + \return a number of bytes used by internal buffers. */ SIMD_API size_t SimdSynetInnerProduct16bInternalBufferSize(const void* context); @@ -8573,10 +8710,14 @@ extern "C" \fn size_t SimdSynetInnerProduct16bExternalBufferSize(const void * context); - \short Gets size in bytes of external buffer used in BF16 inner product algorithm. + \short Gets the size in bytes of caller-provided temporary buffer for BF16 inner product. + + The returned value depends on matrix types and implementation. It covers temporary BF16 copies of FP32 inputs, + packed non-constant B matrices, FP32 accumulation buffers and optional post-processing buffers. It can be used + to allocate the \a buf argument of ::SimdSynetInnerProduct16bForward. \param [in] context - a pointer to BF16 inner product context. It must be created by function ::SimdSynetInnerProduct16bInit and released by function ::SimdRelease. - \return size in bytes of external buffer used in BF16 inner product algorithm. + \return a number of bytes required for external temporary buffer. */ SIMD_API size_t SimdSynetInnerProduct16bExternalBufferSize(const void* context); @@ -8584,23 +8725,33 @@ extern "C" \fn const char* SimdSynetInnerProduct16bInfo(const void * context); - \short Gets string with description of internal implementation of BF16 inner product algorithm. + \short Gets a short description of the selected BF16 inner-product implementation. + + The returned string contains the implementation extension, algorithm name and parameter summary. The returned + pointer is owned by the context and remains valid until the next call of this function for the same context or + until the context is released. \param [in] context - a pointer to BF16 inner product context. It must be created by function ::SimdSynetInnerProduct16bInit and released by function ::SimdRelease. - \return string with description of internal implementation of BF16 inner product algorithm. + \return a string with description of internal implementation of BF16 inner product algorithm. */ SIMD_API const char* SimdSynetInnerProduct16bInfo(const void* context); /*! @ingroup synet_inner_product_bf16 - \fn void SimdSynetInnerProduct16bSetParams(void* context, const float* weight, SimdBool* internal, const float* bias); + \fn void SimdSynetInnerProduct16bSetParams(void* context, const float* weight, const float* bias, const float* params); + + \short Sets weights, bias and activation parameters for BF16 inner product. - \short Sets weights and biases required for BF16 inner product algorithm. + This function must be called before ::SimdSynetInnerProduct16bForward. If \a constB was SimdTrue during + initialization, \a weight provides matrix B in FP32 form and the implementation converts it to BF16 and may + reorder it into internal storage. Bias is copied to an internal FP32 array; when \a bias is NULL, zeros are + used. Activation parameters are copied or expanded to the internal FP32 array according to + ::SimdConvolutionActivationType. \param [in, out] context - a pointer to BF16 inner product context. It must be created by function ::SimdSynetInnerProduct16bInit and released by function ::SimdRelease. - \param [in] weight - a pointer to B matrix. Can be NULL. - \param [in] bias - a pointer to bias. Can be NULL. - \param [in] params - a pointer to parameters of activation functions (see ::SimdConvolutionActivationType). Can be NULL. + \param [in] weight - a pointer to FP32 matrix B weights. Can be NULL only when B is not constant. + \param [in] bias - a pointer to FP32 bias array with N elements. Can be NULL. + \param [in] params - a pointer to FP32 parameters of activation function (see ::SimdConvolutionActivationType). Can be NULL when activation does not require parameters. */ SIMD_API void SimdSynetInnerProduct16bSetParams(void* context, const float* weight, const float* bias, const float* params); @@ -8608,17 +8759,21 @@ extern "C" \fn void SimdSynetInnerProduct16bForward(void* context, const uint8_t* A, const uint8_t* B, uint8_t* buf, uint8_t* C); - \short Performs forward propagation of BF16 inner product algorithm. + \short Performs BF16/FP32 inner-product forward propagation. + + The function converts FP32 A or B inputs to BF16 when requested by the context, uses BF16 inputs directly + otherwise, accumulates the matrix product in FP32, adds bias, applies activation and writes FP32 or BF16 + output according to \a typeC. \param [in] context - a pointer to BF16 inner product context. It must be created by function ::SimdSynetInnerProduct16bInit and released by function ::SimdRelease. - \param [in] A - a pointer to A matrix. - \param [in] B - a pointer to B matrix. Can be NULL if B is constant matrix. - In that case you have to set B (weight) in function SimdSynetInnerProduct16bSetParams. - \param [out] buf - a pointer to external buffer. - The size of the external temporary buffer is determined by function ::SimdSynetInnerProduct16bExternalBufferSize. + \param [in] A - a pointer to A matrix. Actual element type is defined by \a typeA in initialization. + \param [in] B - a pointer to B matrix. Can be NULL if B is constant; in that case B must be set by function ::SimdSynetInnerProduct16bSetParams. + Actual element type is defined by \a typeB in initialization for non-constant B. + \param [out] buf - a pointer to external temporary byte buffer. + The required size is determined by function ::SimdSynetInnerProduct16bExternalBufferSize. Can be NULL (it causes usage of internal buffer). - \param [out] C - a pointer to output matrix. + \param [out] C - a pointer to output matrix. Actual element type is defined by \a typeC in initialization. */ SIMD_API void SimdSynetInnerProduct16bForward(void* context, const uint8_t* A, const uint8_t* B, uint8_t* buf, uint8_t* C); @@ -8626,7 +8781,7 @@ extern "C" \fn void SimdSynetInnerProduct8i(size_t M, size_t N, size_t K, const uint8_t * src, const int8_t * weight, int32_t * dst, SimdSynetCompatibilityType compatibility); - \short This function is used for INT8 forward propagation of InnerProductLayer. + \short Performs UINT8-by-INT8 inner product with INT32 output. Algorithm's details: \verbatim @@ -8636,21 +8791,25 @@ extern "C" { sum = 0; for (k = 0; k < K; ++k) - sum += src[i * K + k] * weight[j * K + k]; + sum += int(src[i*K + k]) * int(weight[j*K + k]); dst[i*N + j] = sum; } } \endverbatim + When compatibility flags allow overflow-compatible multiplication, adjacent products can be accumulated with + 16-bit saturation before being added to the INT32 sum. Use ::SimdSynetCompatibility8iPrecise to request the + precise product accumulation path. + \note This function is used in Synet Framework. - \param [in] M - a batch size. - \param [in] N - an output size. - \param [in] K - an input size. - \param [in] src - a pointer to the input 8-bit unsigned integer array. The size of the array must be equal to M*K. - \param [in] weight - a pointer to the 8-bit signed integer array with weight. The size of the array must be equal to N*K. - \param [out] dst - a pointer to the output 32-bit integer array. The size of the array must be equal to M*N. - \param [in] compatibility - a flags of calculation compatibility. + \param [in] M - a batch size, or a number of input rows. + \param [in] N - an output size, or a number of weight rows. + \param [in] K - an input size, or a row length. + \param [in] src - a pointer to the UINT8 input matrix with M*K elements. + \param [in] weight - a pointer to the INT8 weight matrix with N*K elements, stored by output row. + \param [out] dst - a pointer to the INT32 output matrix with M*N elements. + \param [in] compatibility - calculation compatibility flags (see ::SimdSynetCompatibilityType). */ SIMD_API void SimdSynetInnerProduct8i(size_t M, size_t N, size_t K, const uint8_t * src, const int8_t * weight, int32_t * dst, SimdSynetCompatibilityType compatibility);