diff --git a/build_srcs.bzl b/build_srcs.bzl index 503dbd4919b..21facac8aae 100644 --- a/build_srcs.bzl +++ b/build_srcs.bzl @@ -94,6 +94,7 @@ LOGGING_SRCS = [ ] MICROKERNEL_DEFS = [ + "src/bf16-f32-vcvt/bf16-f32-vcvt.inc", "src/f16-avgpool/f16-avgpool-minmax.inc", "src/f16-dwconv/f16-dwconv-minmax.inc", "src/f16-f32-vcvt/f16-f32-vcvt.inc", @@ -156,6 +157,7 @@ MICROKERNEL_DEFS = [ "src/f32-conv-hwc/f32-conv-hwc.inc", "src/f32-dwconv/f32-dwconv-minmax.inc", "src/f32-dwconv/f32-dwconv.inc", + "src/f32-bf16-vcvt/f32-bf16-vcvt.inc", "src/f32-f16-vcvt/f32-f16-vcvt.inc", "src/f32-maxpool/f32-maxpool-minmax.inc", "src/f32-qs8-vcvt/f32-qs8-vcvt.inc", diff --git a/cmake/gen/scalar_microkernels.cmake b/cmake/gen/scalar_microkernels.cmake index c441ca8f8b3..5f4af371963 100644 --- a/cmake/gen/scalar_microkernels.cmake +++ b/cmake/gen/scalar_microkernels.cmake @@ -10,6 +10,7 @@ SET(PROD_SCALAR_MICROKERNEL_SRCS + src/bf16-f32-vcvt/gen/bf16-f32-vcvt-scalar-u2.c src/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-u4.c src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-imagic-u4.c src/f16-qu8-vcvt/gen/f16-qu8-vcvt-scalar-imagic-u4.c @@ -25,6 +26,7 @@ SET(PROD_SCALAR_MICROKERNEL_SRCS src/f16-vsin/gen/f16-vsin-scalar-rational-3-2-div.c src/f32-argmaxpool/f32-argmaxpool-9p8x-scalar-c1.c src/f32-avgpool/gen/f32-avgpool-9p-minmax-scalar-u1.c + src/f32-bf16-vcvt/gen/f32-bf16-vcvt-scalar-u2.c src/f32-conv-hwc2chw/f32-conv-hwc2chw-3x3s2p1c3x4-scalar-1x1.c src/f32-dwconv/gen/f32-dwconv-3p1c-minmax-scalar-acc2.c src/f32-dwconv/gen/f32-dwconv-3p1c-scalar-acc2.c @@ -248,6 +250,9 @@ SET(PROD_SCALAR_MICROKERNEL_SRCS SET(NON_PROD_SCALAR_MICROKERNEL_SRCS src/bf16-f32-gemm/bf16-f32-gemm-1x4c2-minmax-scalar.c + src/bf16-f32-vcvt/gen/bf16-f32-vcvt-scalar-u1.c + src/bf16-f32-vcvt/gen/bf16-f32-vcvt-scalar-u3.c + src/bf16-f32-vcvt/gen/bf16-f32-vcvt-scalar-u4.c src/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-u1.c src/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-u2.c src/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-u3.c @@ -273,6 +278,9 @@ 
SET(NON_PROD_SCALAR_MICROKERNEL_SRCS src/f16-rminmax/gen/f16-rminmax-scalar-u3-acc3.c src/f16-rminmax/gen/f16-rminmax-scalar-u4-acc2.c src/f16-rminmax/gen/f16-rminmax-scalar-u4-acc4.c + src/f32-bf16-vcvt/gen/f32-bf16-vcvt-scalar-u1.c + src/f32-bf16-vcvt/gen/f32-bf16-vcvt-scalar-u3.c + src/f32-bf16-vcvt/gen/f32-bf16-vcvt-scalar-u4.c src/f32-conv-hwc/f32-conv-hwc-3x3s2p0p1c3x4-scalar-1x1.c src/f32-conv-hwc/f32-conv-hwc-3x3s2p1c3x4-scalar-1x1.c src/f32-dwconv/gen/f32-dwconv-3p1c-minmax-scalar.c diff --git a/gen/scalar_microkernels.bzl b/gen/scalar_microkernels.bzl index 94f61c45f8d..4cb550b0ff4 100644 --- a/gen/scalar_microkernels.bzl +++ b/gen/scalar_microkernels.bzl @@ -6,6 +6,7 @@ # PROD_SCALAR_MICROKERNEL_SRCS = [ + "src/bf16-f32-vcvt/gen/bf16-f32-vcvt-scalar-u2.c", "src/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-u4.c", "src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-imagic-u4.c", "src/f16-qu8-vcvt/gen/f16-qu8-vcvt-scalar-imagic-u4.c", @@ -21,6 +22,7 @@ PROD_SCALAR_MICROKERNEL_SRCS = [ "src/f16-vsin/gen/f16-vsin-scalar-rational-3-2-div.c", "src/f32-argmaxpool/f32-argmaxpool-9p8x-scalar-c1.c", "src/f32-avgpool/gen/f32-avgpool-9p-minmax-scalar-u1.c", + "src/f32-bf16-vcvt/gen/f32-bf16-vcvt-scalar-u2.c", "src/f32-conv-hwc2chw/f32-conv-hwc2chw-3x3s2p1c3x4-scalar-1x1.c", "src/f32-dwconv/gen/f32-dwconv-3p1c-minmax-scalar-acc2.c", "src/f32-dwconv/gen/f32-dwconv-3p1c-scalar-acc2.c", @@ -245,6 +247,9 @@ PROD_SCALAR_MICROKERNEL_SRCS = [ NON_PROD_SCALAR_MICROKERNEL_SRCS = [ "src/bf16-f32-gemm/bf16-f32-gemm-1x4c2-minmax-scalar.c", + "src/bf16-f32-vcvt/gen/bf16-f32-vcvt-scalar-u1.c", + "src/bf16-f32-vcvt/gen/bf16-f32-vcvt-scalar-u3.c", + "src/bf16-f32-vcvt/gen/bf16-f32-vcvt-scalar-u4.c", "src/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-u1.c", "src/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-u2.c", "src/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-u3.c", @@ -270,6 +275,9 @@ NON_PROD_SCALAR_MICROKERNEL_SRCS = [ "src/f16-rminmax/gen/f16-rminmax-scalar-u3-acc3.c", "src/f16-rminmax/gen/f16-rminmax-scalar-u4-acc2.c", 
"src/f16-rminmax/gen/f16-rminmax-scalar-u4-acc4.c", + "src/f32-bf16-vcvt/gen/f32-bf16-vcvt-scalar-u1.c", + "src/f32-bf16-vcvt/gen/f32-bf16-vcvt-scalar-u3.c", + "src/f32-bf16-vcvt/gen/f32-bf16-vcvt-scalar-u4.c", "src/f32-conv-hwc/f32-conv-hwc-3x3s2p0p1c3x4-scalar-1x1.c", "src/f32-conv-hwc/f32-conv-hwc-3x3s2p1c3x4-scalar-1x1.c", "src/f32-dwconv/gen/f32-dwconv-3p1c-minmax-scalar.c", diff --git a/scripts/generate-bf16-f32-vcvt.sh b/scripts/generate-bf16-f32-vcvt.sh new file mode 100755 index 00000000000..b6f4fcced20 --- /dev/null +++ b/scripts/generate-bf16-f32-vcvt.sh @@ -0,0 +1,13 @@ +#!/bin/sh +# Copyright 2021 Google LLC +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +#################################### Scalar ################################### +tools/xngen src/bf16-f32-vcvt/scalar.c.in -D BATCH_TILE=1 -o src/bf16-f32-vcvt/gen/bf16-f32-vcvt-scalar-u1.c & +tools/xngen src/bf16-f32-vcvt/scalar.c.in -D BATCH_TILE=2 -o src/bf16-f32-vcvt/gen/bf16-f32-vcvt-scalar-u2.c & +tools/xngen src/bf16-f32-vcvt/scalar.c.in -D BATCH_TILE=3 -o src/bf16-f32-vcvt/gen/bf16-f32-vcvt-scalar-u3.c & +tools/xngen src/bf16-f32-vcvt/scalar.c.in -D BATCH_TILE=4 -o src/bf16-f32-vcvt/gen/bf16-f32-vcvt-scalar-u4.c & + +wait diff --git a/scripts/generate-f32-bf16-vcvt.sh b/scripts/generate-f32-bf16-vcvt.sh new file mode 100755 index 00000000000..d079dfd88e1 --- /dev/null +++ b/scripts/generate-f32-bf16-vcvt.sh @@ -0,0 +1,13 @@ +#!/bin/sh +# Copyright 2021 Google LLC +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +#################################### Scalar ################################### +tools/xngen src/f32-bf16-vcvt/scalar.c.in -D BATCH_TILE=1 -o src/f32-bf16-vcvt/gen/f32-bf16-vcvt-scalar-u1.c & +tools/xngen src/f32-bf16-vcvt/scalar.c.in -D BATCH_TILE=2 -o src/f32-bf16-vcvt/gen/f32-bf16-vcvt-scalar-u2.c & +tools/xngen src/f32-bf16-vcvt/scalar.c.in -D BATCH_TILE=3 -o src/f32-bf16-vcvt/gen/f32-bf16-vcvt-scalar-u3.c & +tools/xngen src/f32-bf16-vcvt/scalar.c.in -D BATCH_TILE=4 -o src/f32-bf16-vcvt/gen/f32-bf16-vcvt-scalar-u4.c & + +wait diff --git a/src/bf16-f32-vcvt/bf16-f32-vcvt.inc b/src/bf16-f32-vcvt/bf16-f32-vcvt.inc new file mode 100644 index 00000000000..94a879ae1af --- /dev/null +++ b/src/bf16-f32-vcvt/bf16-f32-vcvt.inc @@ -0,0 +1,10 @@ +// clang-format off +// Copyright 2026 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +XNN_UKERNEL(xnn_arch_none, xnn_bf16_f32_vcvt_ukernel__scalar_u1, 1, false, xnn_bfloat16, float, void, NULL) +XNN_UKERNEL(xnn_arch_none, xnn_bf16_f32_vcvt_ukernel__scalar_u2, 2, false, xnn_bfloat16, float, void, NULL) +XNN_UKERNEL(xnn_arch_none, xnn_bf16_f32_vcvt_ukernel__scalar_u3, 3, false, xnn_bfloat16, float, void, NULL) +XNN_UKERNEL(xnn_arch_none, xnn_bf16_f32_vcvt_ukernel__scalar_u4, 4, false, xnn_bfloat16, float, void, NULL) diff --git a/src/bf16-f32-vcvt/gen/bf16-f32-vcvt-scalar-u1.c b/src/bf16-f32-vcvt/gen/bf16-f32-vcvt-scalar-u1.c new file mode 100644 index 00000000000..a0d86de8760 --- /dev/null +++ b/src/bf16-f32-vcvt/gen/bf16-f32-vcvt-scalar-u1.c @@ -0,0 +1,40 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/bf16-f32-vcvt/scalar.c.in +// Generator: tools/xngen +// +// Copyright 2021 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include +#include +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/math.h" +#include "src/xnnpack/vcvt.h" + + +void xnn_bf16_f32_vcvt_ukernel__scalar_u1( + size_t batch, + const xnn_bfloat16* input, + float* output, + const void* params) +{ + assert(batch != 0); + assert(batch % sizeof(xnn_bfloat16) == 0); + assert(input != NULL); + assert(output != NULL); + + const xnn_bfloat16* i = input; + float* o = output; + do { + const xnn_bfloat16 vh = *i++; + + *o++ = xnn_bfloat16_to_float(vh); + + batch -= sizeof(xnn_bfloat16); + } while (batch != 0); +} diff --git a/src/bf16-f32-vcvt/gen/bf16-f32-vcvt-scalar-u2.c b/src/bf16-f32-vcvt/gen/bf16-f32-vcvt-scalar-u2.c new file mode 100644 index 00000000000..2b8e317fa96 --- /dev/null +++ b/src/bf16-f32-vcvt/gen/bf16-f32-vcvt-scalar-u2.c @@ -0,0 +1,47 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/bf16-f32-vcvt/scalar.c.in +// Generator: tools/xngen +// +// Copyright 2021 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include +#include +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/math.h" +#include "src/xnnpack/vcvt.h" + + +void xnn_bf16_f32_vcvt_ukernel__scalar_u2( + size_t batch, + const xnn_bfloat16* input, + float* output, + const void* params) +{ + assert(batch != 0); + assert(batch % sizeof(xnn_bfloat16) == 0); + assert(input != NULL); + assert(output != NULL); + + const xnn_bfloat16* i = input; + float* o = output; + for (; batch >= 2 * sizeof(xnn_bfloat16); batch -= 2 * sizeof(xnn_bfloat16)) { + const xnn_bfloat16 vh0 = i[0]; + const xnn_bfloat16 vh1 = i[1]; + i += 2; + + o[0] = xnn_bfloat16_to_float(vh0); + o[1] = xnn_bfloat16_to_float(vh1); + o += 2; + } + if XNN_UNLIKELY(batch != 0) { + const xnn_bfloat16 vh = *i; + + *o = xnn_bfloat16_to_float(vh); + } +} diff --git a/src/bf16-f32-vcvt/gen/bf16-f32-vcvt-scalar-u3.c b/src/bf16-f32-vcvt/gen/bf16-f32-vcvt-scalar-u3.c new file mode 100644 index 00000000000..84f717f4f6f --- /dev/null +++ b/src/bf16-f32-vcvt/gen/bf16-f32-vcvt-scalar-u3.c @@ -0,0 +1,53 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/bf16-f32-vcvt/scalar.c.in +// Generator: tools/xngen +// +// Copyright 2021 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include +#include +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/math.h" +#include "src/xnnpack/vcvt.h" + + +void xnn_bf16_f32_vcvt_ukernel__scalar_u3( + size_t batch, + const xnn_bfloat16* input, + float* output, + const void* params) +{ + assert(batch != 0); + assert(batch % sizeof(xnn_bfloat16) == 0); + assert(input != NULL); + assert(output != NULL); + + const xnn_bfloat16* i = input; + float* o = output; + for (; batch >= 3 * sizeof(xnn_bfloat16); batch -= 3 * sizeof(xnn_bfloat16)) { + const xnn_bfloat16 vh0 = i[0]; + const xnn_bfloat16 vh1 = i[1]; + const xnn_bfloat16 vh2 = i[2]; + i += 3; + + o[0] = xnn_bfloat16_to_float(vh0); + o[1] = xnn_bfloat16_to_float(vh1); + o[2] = xnn_bfloat16_to_float(vh2); + o += 3; + } + if XNN_UNLIKELY(batch != 0) { + do { + const xnn_bfloat16 vh = *i++; + + *o++ = xnn_bfloat16_to_float(vh); + + batch -= sizeof(xnn_bfloat16); + } while (batch != 0); + } +} diff --git a/src/bf16-f32-vcvt/gen/bf16-f32-vcvt-scalar-u4.c b/src/bf16-f32-vcvt/gen/bf16-f32-vcvt-scalar-u4.c new file mode 100644 index 00000000000..1fae879a1fd --- /dev/null +++ b/src/bf16-f32-vcvt/gen/bf16-f32-vcvt-scalar-u4.c @@ -0,0 +1,55 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/bf16-f32-vcvt/scalar.c.in +// Generator: tools/xngen +// +// Copyright 2021 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include +#include +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/math.h" +#include "src/xnnpack/vcvt.h" + + +void xnn_bf16_f32_vcvt_ukernel__scalar_u4( + size_t batch, + const xnn_bfloat16* input, + float* output, + const void* params) +{ + assert(batch != 0); + assert(batch % sizeof(xnn_bfloat16) == 0); + assert(input != NULL); + assert(output != NULL); + + const xnn_bfloat16* i = input; + float* o = output; + for (; batch >= 4 * sizeof(xnn_bfloat16); batch -= 4 * sizeof(xnn_bfloat16)) { + const xnn_bfloat16 vh0 = i[0]; + const xnn_bfloat16 vh1 = i[1]; + const xnn_bfloat16 vh2 = i[2]; + const xnn_bfloat16 vh3 = i[3]; + i += 4; + + o[0] = xnn_bfloat16_to_float(vh0); + o[1] = xnn_bfloat16_to_float(vh1); + o[2] = xnn_bfloat16_to_float(vh2); + o[3] = xnn_bfloat16_to_float(vh3); + o += 4; + } + if XNN_UNLIKELY(batch != 0) { + do { + const xnn_bfloat16 vh = *i++; + + *o++ = xnn_bfloat16_to_float(vh); + + batch -= sizeof(xnn_bfloat16); + } while (batch != 0); + } +} diff --git a/src/bf16-f32-vcvt/scalar.c.in b/src/bf16-f32-vcvt/scalar.c.in new file mode 100644 index 00000000000..4cbb9fe0b7c --- /dev/null +++ b/src/bf16-f32-vcvt/scalar.c.in @@ -0,0 +1,63 @@ +// Copyright 2021 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +$assert BATCH_TILE >= 1 +#include +#include +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/math.h" +#include "src/xnnpack/vcvt.h" + + +void xnn_bf16_f32_vcvt_ukernel__scalar_u${BATCH_TILE}( + size_t batch, + const xnn_bfloat16* input, + float* output, + const void* params) +{ + assert(batch != 0); + assert(batch % sizeof(xnn_bfloat16) == 0); + assert(input != NULL); + assert(output != NULL); + + const xnn_bfloat16* i = input; + float* o = output; + $if BATCH_TILE > 1: + for (; batch >= ${BATCH_TILE} * sizeof(xnn_bfloat16); batch -= ${BATCH_TILE} * sizeof(xnn_bfloat16)) { + $for N in range(BATCH_TILE): + const xnn_bfloat16 vh${N} = i[${N}]; + i += ${BATCH_TILE}; + + $for N in range(BATCH_TILE): + o[${N}] = xnn_bfloat16_to_float(vh${N}); + o += ${BATCH_TILE}; + } + $if BATCH_TILE == 1: + do { + const xnn_bfloat16 vh = *i++; + + *o++ = xnn_bfloat16_to_float(vh); + + batch -= sizeof(xnn_bfloat16); + } while (batch != 0); + $elif BATCH_TILE == 2: + if XNN_UNLIKELY(batch != 0) { + const xnn_bfloat16 vh = *i; + + *o = xnn_bfloat16_to_float(vh); + } + $else: + if XNN_UNLIKELY(batch != 0) { + do { + const xnn_bfloat16 vh = *i++; + + *o++ = xnn_bfloat16_to_float(vh); + + batch -= sizeof(xnn_bfloat16); + } while (batch != 0); + } +} diff --git a/src/configs/unary-elementwise-config.c b/src/configs/unary-elementwise-config.c index 00dda5e9704..3868008282e 100644 --- a/src/configs/unary-elementwise-config.c +++ b/src/configs/unary-elementwise-config.c @@ -23,6 +23,7 @@ static const int default_config = 0; static const int consistent_config = 1; +static struct xnn_unary_elementwise_config bf16_to_f32_cvt_config = {0}; static struct xnn_unary_elementwise_config f16_abs_config = {0}; static struct xnn_unary_elementwise_config f16_approxgelu_config = {0}; static struct xnn_unary_elementwise_config f16_clamp_config = {0}; @@ -67,6 +68,7 @@ static struct xnn_unary_elementwise_config f32_sine_config[2] = {0}; static struct xnn_unary_elementwise_config 
f32_sqr_config = {0}; static struct xnn_unary_elementwise_config f32_sqrt_config[2] = {0}; static struct xnn_unary_elementwise_config f32_tanh_config[2] = {0}; +static struct xnn_unary_elementwise_config f32_to_bf16_cvt_config = {0}; static struct xnn_unary_elementwise_config f32_to_f16_cvt_config = {0}; static struct xnn_unary_elementwise_config f32_to_qp8_cvt_config = {0}; static struct xnn_unary_elementwise_config f32_to_qs8_cvt_config = {0}; @@ -83,6 +85,7 @@ static struct xnn_unary_elementwise_config u8_clamp_config = {0}; static struct xnn_unary_elementwise_config xx_copy_config = {0}; +XNN_INIT_ONCE_GUARD(bf16_to_f32_cvt); XNN_INIT_ONCE_GUARD(f16_abs); XNN_INIT_ONCE_GUARD(f16_approxgelu); XNN_INIT_ONCE_GUARD(f16_clamp); @@ -127,6 +130,7 @@ XNN_INIT_ONCE_GUARD(f32_sine); XNN_INIT_ONCE_GUARD(f32_sqr); XNN_INIT_ONCE_GUARD(f32_sqrt); XNN_INIT_ONCE_GUARD(f32_tanh); +XNN_INIT_ONCE_GUARD(f32_to_bf16_cvt); XNN_INIT_ONCE_GUARD(f32_to_f16_cvt); XNN_INIT_ONCE_GUARD(f32_to_qp8_cvt); XNN_INIT_ONCE_GUARD(f32_to_qs8_cvt); @@ -705,6 +709,11 @@ static void init_f16_tanh_config(void) { #endif } +static void init_bf16_to_f32_cvt_config(void) { + bf16_to_f32_cvt_config.ukernel = XNN_INIT_UNARY_UKERNEL(xnn_bf16_f32_vcvt_ukernel__scalar_u2); + bf16_to_f32_cvt_config.element_tile = 2; +} + static void init_f16_to_f32_cvt_config(void) { #if XNN_ARCH_ARM const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); @@ -2444,6 +2453,11 @@ static void init_f32_to_f16_cvt_config(void) { #endif } +static void init_f32_to_bf16_cvt_config(void) { + f32_to_bf16_cvt_config.ukernel = XNN_INIT_UNARY_UKERNEL(xnn_f32_bf16_vcvt_ukernel__scalar_u2); + f32_to_bf16_cvt_config.element_tile = 2; +} + static void init_f32_to_qp8_cvt_config(void) { #if XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI f32_to_qp8_cvt_config.ukernel = XNN_INIT_UNARY_UKERNEL(xnn_x8_packq_f32qp8_ukernel__aarch64_neon_u2); @@ -3517,6 +3531,15 @@ const struct xnn_unary_elementwise_config* xnn_init_f16_tanh_config() { 
return &f16_tanh_config; } +const struct xnn_unary_elementwise_config* xnn_init_bf16_to_f32_cvt_config() { + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + if (hardware_config == NULL) { + return NULL; + } + XNN_INIT_ONCE(bf16_to_f32_cvt); + return &bf16_to_f32_cvt_config; +} + const struct xnn_unary_elementwise_config* xnn_init_f16_to_f32_cvt_config() { const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); if (hardware_config == NULL) { @@ -3778,6 +3801,15 @@ const struct xnn_unary_elementwise_config* xnn_init_f32_tanh_config(uint32_t fla } } +const struct xnn_unary_elementwise_config* xnn_init_f32_to_bf16_cvt_config() { + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + if (hardware_config == NULL) { + return NULL; + } + XNN_INIT_ONCE(f32_to_bf16_cvt); + return &f32_to_bf16_cvt_config; +} + const struct xnn_unary_elementwise_config* xnn_init_f32_to_f16_cvt_config() { const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); if (hardware_config == NULL) { diff --git a/src/f32-bf16-vcvt/f32-bf16-vcvt.inc b/src/f32-bf16-vcvt/f32-bf16-vcvt.inc new file mode 100644 index 00000000000..c353f64cae1 --- /dev/null +++ b/src/f32-bf16-vcvt/f32-bf16-vcvt.inc @@ -0,0 +1,10 @@ +// clang-format off +// Copyright 2026 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +XNN_UKERNEL(xnn_arch_none, xnn_f32_bf16_vcvt_ukernel__scalar_u1, 1, false, float, xnn_bfloat16, void, NULL) +XNN_UKERNEL(xnn_arch_none, xnn_f32_bf16_vcvt_ukernel__scalar_u2, 2, false, float, xnn_bfloat16, void, NULL) +XNN_UKERNEL(xnn_arch_none, xnn_f32_bf16_vcvt_ukernel__scalar_u3, 3, false, float, xnn_bfloat16, void, NULL) +XNN_UKERNEL(xnn_arch_none, xnn_f32_bf16_vcvt_ukernel__scalar_u4, 4, false, float, xnn_bfloat16, void, NULL) diff --git a/src/f32-bf16-vcvt/gen/f32-bf16-vcvt-scalar-u1.c b/src/f32-bf16-vcvt/gen/f32-bf16-vcvt-scalar-u1.c new file mode 100644 index 00000000000..c53cd156a2e --- /dev/null +++ b/src/f32-bf16-vcvt/gen/f32-bf16-vcvt-scalar-u1.c @@ -0,0 +1,42 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f32-bf16-vcvt/scalar.c.in +// Generator: tools/xngen +// +// Copyright 2021 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/math.h" +#include "src/xnnpack/vcvt.h" + + +void xnn_f32_bf16_vcvt_ukernel__scalar_u1( + size_t batch, + const float* input, + xnn_bfloat16* output, + const void* params) +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input != NULL); + assert(output != NULL); + + const float* i = input; + xnn_bfloat16* o = output; + do { + const float vw = *i++; + + xnn_bfloat16 vbf = xnn_bfloat16_from_float(vw); + + *o++ = vbf; + + batch -= sizeof(float); + } while (batch != 0); +} diff --git a/src/f32-bf16-vcvt/gen/f32-bf16-vcvt-scalar-u2.c b/src/f32-bf16-vcvt/gen/f32-bf16-vcvt-scalar-u2.c new file mode 100644 index 00000000000..2eb66165c9f --- /dev/null +++ b/src/f32-bf16-vcvt/gen/f32-bf16-vcvt-scalar-u2.c @@ -0,0 +1,52 @@ +// clang-format off +// Auto-generated file. Do not edit! 
+// Template: src/f32-bf16-vcvt/scalar.c.in +// Generator: tools/xngen +// +// Copyright 2021 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/math.h" +#include "src/xnnpack/vcvt.h" + + +void xnn_f32_bf16_vcvt_ukernel__scalar_u2( + size_t batch, + const float* input, + xnn_bfloat16* output, + const void* params) +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input != NULL); + assert(output != NULL); + + const float* i = input; + xnn_bfloat16* o = output; + for (; batch >= 2 * sizeof(float); batch -= 2 * sizeof(float)) { + const float vw0 = i[0]; + const float vw1 = i[1]; + i += 2; + + xnn_bfloat16 vbf0 = xnn_bfloat16_from_float(vw0); + xnn_bfloat16 vbf1 = xnn_bfloat16_from_float(vw1); + + o[0] = vbf0; + o[1] = vbf1; + o += 2; + } + if XNN_UNLIKELY(batch != 0) { + const float vw = *i; + + xnn_bfloat16 vbf = xnn_bfloat16_from_float(vw); + + *o = vbf; + } +} diff --git a/src/f32-bf16-vcvt/gen/f32-bf16-vcvt-scalar-u3.c b/src/f32-bf16-vcvt/gen/f32-bf16-vcvt-scalar-u3.c new file mode 100644 index 00000000000..4751f814502 --- /dev/null +++ b/src/f32-bf16-vcvt/gen/f32-bf16-vcvt-scalar-u3.c @@ -0,0 +1,59 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f32-bf16-vcvt/scalar.c.in +// Generator: tools/xngen +// +// Copyright 2021 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include +#include +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/math.h" +#include "src/xnnpack/vcvt.h" + + +void xnn_f32_bf16_vcvt_ukernel__scalar_u3( + size_t batch, + const float* input, + xnn_bfloat16* output, + const void* params) +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input != NULL); + assert(output != NULL); + + const float* i = input; + xnn_bfloat16* o = output; + for (; batch >= 3 * sizeof(float); batch -= 3 * sizeof(float)) { + const float vw0 = i[0]; + const float vw1 = i[1]; + const float vw2 = i[2]; + i += 3; + + xnn_bfloat16 vbf0 = xnn_bfloat16_from_float(vw0); + xnn_bfloat16 vbf1 = xnn_bfloat16_from_float(vw1); + xnn_bfloat16 vbf2 = xnn_bfloat16_from_float(vw2); + + o[0] = vbf0; + o[1] = vbf1; + o[2] = vbf2; + o += 3; + } + if XNN_UNLIKELY(batch != 0) { + do { + const float vw = *i++; + + xnn_bfloat16 vbf = xnn_bfloat16_from_float(vw); + + *o++ = vbf; + + batch -= sizeof(float); + } while (batch != 0); + } +} diff --git a/src/f32-bf16-vcvt/gen/f32-bf16-vcvt-scalar-u4.c b/src/f32-bf16-vcvt/gen/f32-bf16-vcvt-scalar-u4.c new file mode 100644 index 00000000000..2f624d108ac --- /dev/null +++ b/src/f32-bf16-vcvt/gen/f32-bf16-vcvt-scalar-u4.c @@ -0,0 +1,62 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f32-bf16-vcvt/scalar.c.in +// Generator: tools/xngen +// +// Copyright 2021 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include +#include +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/math.h" +#include "src/xnnpack/vcvt.h" + + +void xnn_f32_bf16_vcvt_ukernel__scalar_u4( + size_t batch, + const float* input, + xnn_bfloat16* output, + const void* params) +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input != NULL); + assert(output != NULL); + + const float* i = input; + xnn_bfloat16* o = output; + for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { + const float vw0 = i[0]; + const float vw1 = i[1]; + const float vw2 = i[2]; + const float vw3 = i[3]; + i += 4; + + xnn_bfloat16 vbf0 = xnn_bfloat16_from_float(vw0); + xnn_bfloat16 vbf1 = xnn_bfloat16_from_float(vw1); + xnn_bfloat16 vbf2 = xnn_bfloat16_from_float(vw2); + xnn_bfloat16 vbf3 = xnn_bfloat16_from_float(vw3); + + o[0] = vbf0; + o[1] = vbf1; + o[2] = vbf2; + o[3] = vbf3; + o += 4; + } + if XNN_UNLIKELY(batch != 0) { + do { + const float vw = *i++; + + xnn_bfloat16 vbf = xnn_bfloat16_from_float(vw); + + *o++ = vbf; + + batch -= sizeof(float); + } while (batch != 0); + } +} diff --git a/src/f32-bf16-vcvt/scalar.c.in b/src/f32-bf16-vcvt/scalar.c.in new file mode 100644 index 00000000000..815cf51423c --- /dev/null +++ b/src/f32-bf16-vcvt/scalar.c.in @@ -0,0 +1,72 @@ +// Copyright 2021 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +$assert BATCH_TILE >= 1 +#include +#include +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/math.h" +#include "src/xnnpack/vcvt.h" + + +void xnn_f32_bf16_vcvt_ukernel__scalar_u${BATCH_TILE}( + size_t batch, + const float* input, + xnn_bfloat16* output, + const void* params) +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input != NULL); + assert(output != NULL); + + const float* i = input; + xnn_bfloat16* o = output; + $if BATCH_TILE == 1: + do { + const float vw = *i++; + + xnn_bfloat16 vbf = xnn_bfloat16_from_float(vw); + + *o++ = vbf; + + batch -= sizeof(float); + } while (batch != 0); + $else: + for (; batch >= ${BATCH_TILE} * sizeof(float); batch -= ${BATCH_TILE} * sizeof(float)) { + $for N in range(BATCH_TILE): + const float vw${N} = i[${N}]; + i += ${BATCH_TILE}; + + $for N in range(BATCH_TILE): + xnn_bfloat16 vbf${N} = xnn_bfloat16_from_float(vw${N}); + + $for N in range(BATCH_TILE): + o[${N}] = vbf${N}; + o += ${BATCH_TILE}; + } + $if BATCH_TILE == 2: + if XNN_UNLIKELY(batch != 0) { + const float vw = *i; + + xnn_bfloat16 vbf = xnn_bfloat16_from_float(vw); + + *o = vbf; + } + $else: + if XNN_UNLIKELY(batch != 0) { + do { + const float vw = *i++; + + xnn_bfloat16 vbf = xnn_bfloat16_from_float(vw); + + *o++ = vbf; + + batch -= sizeof(float); + } while (batch != 0); + } +} diff --git a/src/operators/unary-elementwise-nc.c b/src/operators/unary-elementwise-nc.c index 89862832845..7a117b85b4e 100644 --- a/src/operators/unary-elementwise-nc.c +++ b/src/operators/unary-elementwise-nc.c @@ -93,7 +93,9 @@ static const struct xnn_unary_elementwise_config* get_config( const struct xnn_quantization_params* output_quantization, uint32_t flags) { if (input_datatype != output_datatype) { if (op_type == xnn_unary_convert) { - if (input_datatype == xnn_datatype_fp32 && output_datatype == xnn_datatype_fp16) { + if (input_datatype == xnn_datatype_fp32 && output_datatype == xnn_datatype_bf16) { + return 
xnn_init_f32_to_bf16_cvt_config(); + } else if (input_datatype == xnn_datatype_fp32 && output_datatype == xnn_datatype_fp16) { return xnn_init_f32_to_f16_cvt_config(); } else if (input_datatype == xnn_datatype_fp32 && output_datatype == xnn_datatype_qint8) { return xnn_init_f32_to_qs8_cvt_config(); @@ -101,6 +103,8 @@ static const struct xnn_unary_elementwise_config* get_config( return xnn_init_f32_to_qu8_cvt_config(); } else if (input_datatype == xnn_datatype_fp32 && output_datatype == xnn_datatype_qpint8) { return xnn_init_f32_to_qp8_cvt_config(); + } else if (input_datatype == xnn_datatype_bf16 && output_datatype == xnn_datatype_fp32) { + return xnn_init_bf16_to_f32_cvt_config(); } else if (input_datatype == xnn_datatype_fp16 && output_datatype == xnn_datatype_fp32) { return xnn_init_f16_to_f32_cvt_config(); } else if (input_datatype == xnn_datatype_fp16 && output_datatype == xnn_datatype_qint8) { diff --git a/src/xnnpack/config.h b/src/xnnpack/config.h index 3839557f95e..df958d8d654 100644 --- a/src/xnnpack/config.h +++ b/src/xnnpack/config.h @@ -125,6 +125,8 @@ xnn_init_f16_sqrt_config(); XNN_INTERNAL const struct xnn_unary_elementwise_config* xnn_init_f16_tanh_config(); XNN_INTERNAL const struct xnn_unary_elementwise_config* +xnn_init_bf16_to_f32_cvt_config(); +XNN_INTERNAL const struct xnn_unary_elementwise_config* xnn_init_f16_to_f32_cvt_config(); XNN_INTERNAL const struct xnn_unary_elementwise_config* xnn_init_f16_to_qs8_cvt_config(); @@ -173,6 +175,8 @@ xnn_init_f32_sqrt_config(uint32_t flags); XNN_INTERNAL const struct xnn_unary_elementwise_config* xnn_init_f32_tanh_config(uint32_t flags); XNN_INTERNAL const struct xnn_unary_elementwise_config* +xnn_init_f32_to_bf16_cvt_config(); +XNN_INTERNAL const struct xnn_unary_elementwise_config* xnn_init_f32_to_f16_cvt_config(); XNN_INTERNAL const struct xnn_unary_elementwise_config* xnn_init_f32_to_qp8_cvt_config(); diff --git a/src/xnnpack/math.h b/src/xnnpack/math.h index 48fa81ba74f..fc9b9ca49cd 100644 --- 
a/src/xnnpack/math.h +++ b/src/xnnpack/math.h @@ -42,6 +42,8 @@ #define M_PI 3.141592653589793238462643383280 /* pi */ #endif +#define F32_EXP_MASK 0x7F800000u + #ifdef __cplusplus extern "C" { #endif @@ -469,7 +471,14 @@ XNN_INLINE static uint16_t math_cvt_bf16_fp32(float x) { } bits; bits.as_float = x; - // TODO Handle fraction rounding + if ((bits.as_uint32 & F32_EXP_MASK) != F32_EXP_MASK) { + // Finite input: round to nearest, ties to even, before dropping the low 16 bits. + bits.as_uint32 += 0x7FFFu + ((bits.as_uint32 >> 16) & 1u); + } else if ((bits.as_uint32 & 0x007FFFFFu) != 0) { + // NaN input: force the quiet bit so that truncating the low 16 mantissa bits cannot leave an all-zero mantissa (which would encode an infinity). + bits.as_uint32 |= 0x00400000u; + } + return bits.as_uint32 >> 16; } diff --git a/src/xnnpack/vcvt.h b/src/xnnpack/vcvt.h index 2a8d064e8e4..0165c7de4de 100644 --- a/src/xnnpack/vcvt.h +++ b/src/xnnpack/vcvt.h @@ -21,9 +21,11 @@ extern "C" { params_type, init_params) \ XNN_INTERNAL void ukernel(size_t n, const type_in* input, type_out* output, \ const params_type* params); +#include "src/bf16-f32-vcvt/bf16-f32-vcvt.inc" #include "src/f16-f32-vcvt/f16-f32-vcvt.inc" #include "src/f16-qs8-vcvt/f16-qs8-vcvt.inc" #include "src/f16-qu8-vcvt/f16-qu8-vcvt.inc" +#include "src/f32-bf16-vcvt/f32-bf16-vcvt.inc" #include "src/f32-f16-vcvt/f32-f16-vcvt.inc" #include "src/f32-qs8-vcvt/f32-qs8-vcvt.inc" #include "src/f32-qu8-vcvt/f32-qu8-vcvt.inc" diff --git a/test/BUILD.bazel b/test/BUILD.bazel index 40a6424f728..2e556f18fd3 100644 --- a/test/BUILD.bazel +++ b/test/BUILD.bazel @@ -119,6 +119,7 @@ xnnpack_cxx_library( ":vunary_microkernel_tester", ], ) for kernel in [ + "bf16_f32_vcvt", "f16_f32_vcvt", "f16_qs8_vcvt", "f16_qu8_vcvt", @@ -142,6 +143,7 @@ xnnpack_cxx_library( "f16_vsqr", "f16_vsqrt", "f16_vtanh", + "f32_bf16_vcvt", "f32_f16_vcvt", "f32_qs8_vcvt", "f32_qu8_vcvt", diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index b5f017b90fc..315468fdb9a 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -340,9 +340,11 @@ FOREACH(TEST ${MICROKERNEL_VBINARY_UNIT_TESTS}) ENDFOREACH() SET(MICROKERNEL_VCVT_TESTS + bf16-f32-vcvt f16-f32-vcvt f16-qs8-vcvt f16-qu8-vcvt + f32-bf16-vcvt f32-f16-vcvt 
f32-qs8-vcvt f32-qu8-vcvt diff --git a/test/bf16-f32-vcvt.cc b/test/bf16-f32-vcvt.cc new file mode 100644 index 00000000000..a008fa9fc14 --- /dev/null +++ b/test/bf16-f32-vcvt.cc @@ -0,0 +1,30 @@ +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include "src/xnnpack/microparams-init.h" +#include "src/xnnpack/vcvt.h" +#include "test/vunary-microkernel-tester.h" + +#define XNN_UKERNEL(arch_flags, ukernel, batch_tile, \ + vector_tile, datatype_in, datatype_out, \ + params_type, init_params) \ + TEST(ukernel, batch_eq) { \ + TestBatchEq(arch_flags, batch_tile, \ + ukernel, init_params); \ + } \ + TEST(ukernel, batch_div) { \ + TestBatchDiv(arch_flags, batch_tile, \ + ukernel, init_params); \ + } \ + TEST(ukernel, batch_lt) { \ + TestBatchLT(arch_flags, batch_tile, \ + ukernel, init_params); \ + } \ + TEST(ukernel, batch_gt) { \ + TestBatchGT(arch_flags, batch_tile, \ + ukernel, init_params); \ + } +#include "src/bf16-f32-vcvt/bf16-f32-vcvt.inc" +#undef XNN_UKERNEL diff --git a/test/f32-bf16-vcvt.cc b/test/f32-bf16-vcvt.cc new file mode 100644 index 00000000000..ada00e2e0a5 --- /dev/null +++ b/test/f32-bf16-vcvt.cc @@ -0,0 +1,30 @@ +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include "src/xnnpack/microparams-init.h" +#include "src/xnnpack/vcvt.h" +#include "test/vunary-microkernel-tester.h" + +#define XNN_UKERNEL(arch_flags, ukernel, batch_tile, \ + vector_tile, datatype_in, datatype_out, \ + params_type, init_params) \ + TEST(ukernel, batch_eq) { \ + TestBatchEq(arch_flags, batch_tile, \ + ukernel, init_params); \ + } \ + TEST(ukernel, batch_div) { \ + TestBatchDiv(arch_flags, batch_tile, \ + ukernel, init_params); \ + } \ + TEST(ukernel, batch_lt) { \ + TestBatchLT(arch_flags, batch_tile, \ + ukernel, init_params); \ + } \ + TEST(ukernel, batch_gt) { \ + TestBatchGT(arch_flags, batch_tile, \ + ukernel, init_params); \ + } +#include "src/f32-bf16-vcvt/f32-bf16-vcvt.inc" +#undef XNN_UKERNEL