diff --git a/bench/f16-raddstoreexpminusmax.cc b/bench/f16-raddstoreexpminusmax.cc index 17ab243040d..0e9529c0d04 100644 --- a/bench/f16-raddstoreexpminusmax.cc +++ b/bench/f16-raddstoreexpminusmax.cc @@ -226,6 +226,33 @@ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_u64_acc4, ->UseRealTime(); #endif // XNN_ENABLE_AVX2 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +#if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR +BENCHMARK_CAPTURE( + f16_raddstoreexpminusmax, rvvfp16arith_rr2_p2_u1v, + xnn_f16_rmax_ukernel__rvvfp16arith_u8v, + xnn_f16_raddstoreexpminusmax_ukernel__rvvfp16arith_rr2_p2_u1v, nullptr, + xnn_arch_riscv_vector_fp16_arith) + ->Apply( + benchmark::utils::UnaryElementwiseParameters<xnn_float16, xnn_float16>) + ->UseRealTime(); +BENCHMARK_CAPTURE( + f16_raddstoreexpminusmax, rvvfp16arith_rr2_p2_u2v, + xnn_f16_rmax_ukernel__rvvfp16arith_u8v, + xnn_f16_raddstoreexpminusmax_ukernel__rvvfp16arith_rr2_p2_u2v, nullptr, + xnn_arch_riscv_vector_fp16_arith) + ->Apply( + benchmark::utils::UnaryElementwiseParameters<xnn_float16, xnn_float16>) + ->UseRealTime(); +BENCHMARK_CAPTURE( + f16_raddstoreexpminusmax, rvvfp16arith_rr2_p2_u4v, + xnn_f16_rmax_ukernel__rvvfp16arith_u8v, + xnn_f16_raddstoreexpminusmax_ukernel__rvvfp16arith_rr2_p2_u4v, nullptr, + xnn_arch_riscv_vector_fp16_arith) + ->Apply( + benchmark::utils::UnaryElementwiseParameters<xnn_float16, xnn_float16>) + ->UseRealTime(); +#endif  // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR + #ifndef XNNPACK_BENCHMARK_NO_MAIN XNN_BENCHMARK_MAIN(); #endif diff --git a/cmake/gen/avx_microkernels.cmake b/cmake/gen/avx_microkernels.cmake index 4babe1f760c..e0ae912d702 100644 --- a/cmake/gen/avx_microkernels.cmake +++ b/cmake/gen/avx_microkernels.cmake @@ -77,7 +77,6 @@ SET(PROD_AVX_MICROKERNEL_SRCS src/f32-vrsqrt/gen/f32-vrsqrt-avx-rsqrt.c src/f32-vrsqrt/gen/f32-vrsqrt-avx-sqrt.c src/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-div-u16.c - src/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-nr2-u16.c src/f32-vsin/gen/f32-vsin-avx-rational-5-4-div.c src/f32-vsqrt/gen/f32-vsqrt-avx-rsqrt.c 
src/f32-vsqrt/gen/f32-vsqrt-avx-sqrt.c @@ -237,6 +236,7 @@ SET(NON_PROD_AVX_MICROKERNEL_SRCS src/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-div-u24.c src/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-div-u32.c src/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-nr2-u8.c + src/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-nr2-u16.c src/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-nr2-u24.c src/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-nr2-u32.c src/f32-vtanh/gen/f32-vtanh-avx-rational-9-8-nr.c diff --git a/cmake/gen/rvvfp16arith_microkernels.cmake b/cmake/gen/rvvfp16arith_microkernels.cmake index 3c4580a0ab9..ff41da2bdde 100644 --- a/cmake/gen/rvvfp16arith_microkernels.cmake +++ b/cmake/gen/rvvfp16arith_microkernels.cmake @@ -23,6 +23,7 @@ SET(PROD_RVVFP16ARITH_MICROKERNEL_SRCS src/f16-gemm/gen/f16-gemm-7x4v-minmax-rvvfp16arith.c src/f16-igemm/gen/f16-igemm-1x4v-minmax-rvvfp16arith.c src/f16-igemm/gen/f16-igemm-7x4v-minmax-rvvfp16arith.c + src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-rvvfp16arith-rr2-p2-u4v.c src/f16-rdminmax/gen/f16-rdmax-2p2x-rvvfp16arith-u8v.c src/f16-rdminmax/gen/f16-rdmin-2p2x-rvvfp16arith-u8v.c src/f16-rminmax/gen/f16-rmax-rvvfp16arith-u8v.c @@ -73,6 +74,8 @@ SET(NON_PROD_RVVFP16ARITH_MICROKERNEL_SRCS src/f16-f32acc-rsum2/gen/f16-f32acc-rsum2-rvvfp16arith-u2v.c src/f16-gemm/gen/f16-gemm-4x4v-minmax-rvvfp16arith.c src/f16-igemm/gen/f16-igemm-4x4v-minmax-rvvfp16arith.c + src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-rvvfp16arith-rr2-p2-u1v.c + src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-rvvfp16arith-rr2-p2-u2v.c src/f16-rminmax/gen/f16-rmax-rvvfp16arith-u4v.c src/f16-rminmax/gen/f16-rmin-rvvfp16arith-u4v.c src/f16-rminmax/gen/f16-rminmax-rvvfp16arith-u4v.c diff --git a/gen/avx_microkernels.bzl b/gen/avx_microkernels.bzl index 244b0609517..84705bc06c2 100644 --- a/gen/avx_microkernels.bzl +++ b/gen/avx_microkernels.bzl @@ -73,7 +73,6 @@ PROD_AVX_MICROKERNEL_SRCS = [ "src/f32-vrsqrt/gen/f32-vrsqrt-avx-rsqrt.c", 
"src/f32-vrsqrt/gen/f32-vrsqrt-avx-sqrt.c", "src/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-div-u16.c", - "src/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-nr2-u16.c", "src/f32-vsin/gen/f32-vsin-avx-rational-5-4-div.c", "src/f32-vsqrt/gen/f32-vsqrt-avx-rsqrt.c", "src/f32-vsqrt/gen/f32-vsqrt-avx-sqrt.c", @@ -234,6 +233,7 @@ NON_PROD_AVX_MICROKERNEL_SRCS = [ "src/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-div-u24.c", "src/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-div-u32.c", "src/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-nr2-u8.c", + "src/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-nr2-u16.c", "src/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-nr2-u24.c", "src/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-nr2-u32.c", "src/f32-vtanh/gen/f32-vtanh-avx-rational-9-8-nr.c", diff --git a/gen/rvvfp16arith_microkernels.bzl b/gen/rvvfp16arith_microkernels.bzl index 31d43cc0976..be7a319d35f 100644 --- a/gen/rvvfp16arith_microkernels.bzl +++ b/gen/rvvfp16arith_microkernels.bzl @@ -19,6 +19,7 @@ PROD_RVVFP16ARITH_MICROKERNEL_SRCS = [ "src/f16-gemm/gen/f16-gemm-7x4v-minmax-rvvfp16arith.c", "src/f16-igemm/gen/f16-igemm-1x4v-minmax-rvvfp16arith.c", "src/f16-igemm/gen/f16-igemm-7x4v-minmax-rvvfp16arith.c", + "src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-rvvfp16arith-rr2-p2-u4v.c", "src/f16-rdminmax/gen/f16-rdmax-2p2x-rvvfp16arith-u8v.c", "src/f16-rdminmax/gen/f16-rdmin-2p2x-rvvfp16arith-u8v.c", "src/f16-rminmax/gen/f16-rmax-rvvfp16arith-u8v.c", @@ -70,6 +71,8 @@ NON_PROD_RVVFP16ARITH_MICROKERNEL_SRCS = [ "src/f16-f32acc-rsum2/gen/f16-f32acc-rsum2-rvvfp16arith-u2v.c", "src/f16-gemm/gen/f16-gemm-4x4v-minmax-rvvfp16arith.c", "src/f16-igemm/gen/f16-igemm-4x4v-minmax-rvvfp16arith.c", + "src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-rvvfp16arith-rr2-p2-u1v.c", + "src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-rvvfp16arith-rr2-p2-u2v.c", "src/f16-rminmax/gen/f16-rmax-rvvfp16arith-u4v.c", "src/f16-rminmax/gen/f16-rmin-rvvfp16arith-u4v.c", 
"src/f16-rminmax/gen/f16-rminmax-rvvfp16arith-u4v.c", diff --git a/scripts/generate-f16-raddstoreexpminusmax.sh b/scripts/generate-f16-raddstoreexpminusmax.sh index 767a473c8b2..919ce5c5f64 100755 --- a/scripts/generate-f16-raddstoreexpminusmax.sh +++ b/scripts/generate-f16-raddstoreexpminusmax.sh @@ -26,3 +26,8 @@ tools/xngen src/f16-raddstoreexpminusmax/avx2-rr1-p2.c.in -D BATCH_TILE=48 -D AC tools/xngen src/f16-raddstoreexpminusmax/avx2-rr1-p2.c.in -D BATCH_TILE=64 -D ACCUMULATORS=1 -o src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-u64.c & tools/xngen src/f16-raddstoreexpminusmax/avx2-rr1-p2.c.in -D BATCH_TILE=64 -D ACCUMULATORS=2 -o src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-u64-acc2.c & tools/xngen src/f16-raddstoreexpminusmax/avx2-rr1-p2.c.in -D BATCH_TILE=64 -D ACCUMULATORS=4 -o src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-u64-acc4.c & + +# RISC-V Vector +tools/xngen src/f16-raddstoreexpminusmax/rvvfp16arith-rr2-p2.c.in -D LMUL=1 -o src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-rvvfp16arith-rr2-p2-u1v.c & +tools/xngen src/f16-raddstoreexpminusmax/rvvfp16arith-rr2-p2.c.in -D LMUL=2 -o src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-rvvfp16arith-rr2-p2-u2v.c & +tools/xngen src/f16-raddstoreexpminusmax/rvvfp16arith-rr2-p2.c.in -D LMUL=4 -o src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-rvvfp16arith-rr2-p2-u4v.c & diff --git a/src/configs/raddstoreexpminusmax-config.c b/src/configs/raddstoreexpminusmax-config.c index 3be8c0deec3..4d633674f29 100644 --- a/src/configs/raddstoreexpminusmax-config.c +++ b/src/configs/raddstoreexpminusmax-config.c @@ -55,6 +55,12 @@ static void init_f16_raddstoreexpminusmax_config(void) { f16_raddstoreexpminusmax_config.ukernel = XNN_INIT_RADDSTOREEXPMINUSMAX_UKERNEL(xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_u32); } #endif + #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR + const struct xnn_hardware_config* 
hardware_config = xnn_init_hardware_config(); + assert(hardware_config != NULL); + if (hardware_config->arch_flags & xnn_arch_riscv_vector_fp16_arith) { + f16_raddstoreexpminusmax_config.ukernel = XNN_INIT_RADDSTOREEXPMINUSMAX_UKERNEL(xnn_f16_raddstoreexpminusmax_ukernel__rvvfp16arith_rr2_p2_u4v); + } #endif } @@ -127,6 +133,8 @@ static bool is_f16_compatible_config(const struct xnn_hardware_config* hardware_ return (hardware_config->arch_flags & xnn_arch_arm_neon_fp16_arith); #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 return (hardware_config->arch_flags & xnn_arch_x86_avx2); + #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR + return (hardware_config->arch_flags & xnn_arch_riscv_vector_fp16_arith); #else return false; #endif diff --git a/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-rvvfp16arith-rr2-p2-u1v.c b/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-rvvfp16arith-rr2-p2-u1v.c new file mode 100644 index 00000000000..ba85f1b5739 --- /dev/null +++ b/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-rvvfp16arith-rr2-p2-u1v.c @@ -0,0 +1,89 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f16-raddstoreexpminusmax/rvvfp16arith-rr2-p2.c.in +// Generator: tools/xngen +// +// Copyright 2026 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
 + +#include <assert.h> +#include <stddef.h> +#include <stdint.h> + +#include <riscv_vector.h> + +#include "src/xnnpack/common.h" +#include "src/xnnpack/math.h" +#include "src/xnnpack/raddstoreexpminusmax.h" + + +void xnn_f16_raddstoreexpminusmax_ukernel__rvvfp16arith_rr2_p2_u1v( + size_t batch, + const xnn_float16* input, + const xnn_float16* max, + xnn_float16* output, + float* sum, + const void* params) XNN_OOB_READS +{ + assert(batch != 0); + assert(batch % sizeof(xnn_float16) == 0); + assert(input != NULL); + assert(max != NULL); + assert(output != NULL); + assert(sum != NULL); + + const xnn_float16 vlog2e = 0x1.715476p0f; + const xnn_float16 vmagic_bias = 0x1.83Cp+10f; + const xnn_float16 vminus_ln2_hi = -0x1.630p-1f; + const xnn_float16 vminus_ln2_lo = 0x1.BD0p-13f; + const xnn_float16 vc2 = 0x1.FF3A32p-2f; + const xnn_float16 vc1 = 0x1.039E10p+0f; + const xnn_float16 vdenorm_cutoff = -0x1.368000p+3f; + + const xnn_float16* i = input; + xnn_float16* o = output; + + batch >>= XNN_LOG2_SIZEOF_FLOAT16; + + size_t vlmax = __riscv_vsetvl_e16m1(batch); + vfloat32m2_t vacc = __riscv_vfmv_v_f_f32m2(0.0f, vlmax); + + do { + size_t vl = __riscv_vsetvl_e16m1(batch); batch -= vl; + + vfloat16m1_t vi = __riscv_vle16_v_f16m1(i, vl); i += vl; + + const vfloat16m1_t vx = __riscv_vfsub(vi, *max, vl); + + vfloat16m1_t vn = __riscv_vfmv_v_f_f16m1(vmagic_bias, vl); + vn = __riscv_vfmacc(vn, vlog2e, vx, vl); + + const vfloat16m1_t vs = __riscv_vreinterpret_f16m1(__riscv_vsll(__riscv_vreinterpret_i16m1(vn), 10, vl)); + + vn = __riscv_vfsub(vn, vmagic_bias, vl); + + vfloat16m1_t vt = __riscv_vmv_v(vx, vl); + vt = __riscv_vfmacc(vt, vminus_ln2_hi, vn, vl); + vt = __riscv_vfmacc(vt, vminus_ln2_lo, vn, vl); + + vfloat16m1_t vp = __riscv_vfmv_v_f_f16m1(vc1, vl); + vp = __riscv_vfmacc(vp, vc2, vt, vl); + + vt = __riscv_vfmul(vt, vs, vl); + + vfloat16m1_t vf = __riscv_vmv_v(vs, vl); + vf = __riscv_vfmacc(vf, vp, vt, vl); + + const vbool16_t vmask = __riscv_vmflt(vx, vdenorm_cutoff, vl); + vf = __riscv_vfmerge(vf, 0.0f, vmask, vl); + + 
__riscv_vse16(o, vf, vl); o += vl; + + vacc = __riscv_vfwadd_wv(vacc, vf, vl); + } while (batch > 0); + + vfloat32m1_t v0 = __riscv_vfmv_s_f_f32m1(0.0f, 1); + *sum = __riscv_vfmv_f(__riscv_vfredusum(vacc, v0, vlmax)); +} diff --git a/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-rvvfp16arith-rr2-p2-u2v.c b/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-rvvfp16arith-rr2-p2-u2v.c new file mode 100644 index 00000000000..586437fd83b --- /dev/null +++ b/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-rvvfp16arith-rr2-p2-u2v.c @@ -0,0 +1,89 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f16-raddstoreexpminusmax/rvvfp16arith-rr2-p2.c.in +// Generator: tools/xngen +// +// Copyright 2026 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include <assert.h> +#include <stddef.h> +#include <stdint.h> + +#include <riscv_vector.h> + +#include "src/xnnpack/common.h" +#include "src/xnnpack/math.h" +#include "src/xnnpack/raddstoreexpminusmax.h" + + +void xnn_f16_raddstoreexpminusmax_ukernel__rvvfp16arith_rr2_p2_u2v( + size_t batch, + const xnn_float16* input, + const xnn_float16* max, + xnn_float16* output, + float* sum, + const void* params) XNN_OOB_READS +{ + assert(batch != 0); + assert(batch % sizeof(xnn_float16) == 0); + assert(input != NULL); + assert(max != NULL); + assert(output != NULL); + assert(sum != NULL); + + const xnn_float16 vlog2e = 0x1.715476p0f; + const xnn_float16 vmagic_bias = 0x1.83Cp+10f; + const xnn_float16 vminus_ln2_hi = -0x1.630p-1f; + const xnn_float16 vminus_ln2_lo = 0x1.BD0p-13f; + const xnn_float16 vc2 = 0x1.FF3A32p-2f; + const xnn_float16 vc1 = 0x1.039E10p+0f; + const xnn_float16 vdenorm_cutoff = -0x1.368000p+3f; + + const xnn_float16* i = input; + xnn_float16* o = output; + + batch >>= XNN_LOG2_SIZEOF_FLOAT16; + + size_t vlmax = __riscv_vsetvl_e16m2(batch); + vfloat32m4_t vacc = __riscv_vfmv_v_f_f32m4(0.0f, vlmax); + + do { + size_t 
vl = __riscv_vsetvl_e16m2(batch); batch -= vl; + + vfloat16m2_t vi = __riscv_vle16_v_f16m2(i, vl); i += vl; + + const vfloat16m2_t vx = __riscv_vfsub(vi, *max, vl); + + vfloat16m2_t vn = __riscv_vfmv_v_f_f16m2(vmagic_bias, vl); + vn = __riscv_vfmacc(vn, vlog2e, vx, vl); + + const vfloat16m2_t vs = __riscv_vreinterpret_f16m2(__riscv_vsll(__riscv_vreinterpret_i16m2(vn), 10, vl)); + + vn = __riscv_vfsub(vn, vmagic_bias, vl); + + vfloat16m2_t vt = __riscv_vmv_v(vx, vl); + vt = __riscv_vfmacc(vt, vminus_ln2_hi, vn, vl); + vt = __riscv_vfmacc(vt, vminus_ln2_lo, vn, vl); + + vfloat16m2_t vp = __riscv_vfmv_v_f_f16m2(vc1, vl); + vp = __riscv_vfmacc(vp, vc2, vt, vl); + + vt = __riscv_vfmul(vt, vs, vl); + + vfloat16m2_t vf = __riscv_vmv_v(vs, vl); + vf = __riscv_vfmacc(vf, vp, vt, vl); + + const vbool8_t vmask = __riscv_vmflt(vx, vdenorm_cutoff, vl); + vf = __riscv_vfmerge(vf, 0.0f, vmask, vl); + + __riscv_vse16(o, vf, vl); o += vl; + + vacc = __riscv_vfwadd_wv(vacc, vf, vl); + } while (batch > 0); + + vfloat32m1_t v0 = __riscv_vfmv_s_f_f32m1(0.0f, 1); + *sum = __riscv_vfmv_f(__riscv_vfredusum(vacc, v0, vlmax)); +} diff --git a/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-rvvfp16arith-rr2-p2-u4v.c b/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-rvvfp16arith-rr2-p2-u4v.c new file mode 100644 index 00000000000..f9f64d675f4 --- /dev/null +++ b/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-rvvfp16arith-rr2-p2-u4v.c @@ -0,0 +1,89 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f16-raddstoreexpminusmax/rvvfp16arith-rr2-p2.c.in +// Generator: tools/xngen +// +// Copyright 2026 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
 + +#include <assert.h> +#include <stddef.h> +#include <stdint.h> + +#include <riscv_vector.h> + +#include "src/xnnpack/common.h" +#include "src/xnnpack/math.h" +#include "src/xnnpack/raddstoreexpminusmax.h" + + +void xnn_f16_raddstoreexpminusmax_ukernel__rvvfp16arith_rr2_p2_u4v( + size_t batch, + const xnn_float16* input, + const xnn_float16* max, + xnn_float16* output, + float* sum, + const void* params) XNN_OOB_READS +{ + assert(batch != 0); + assert(batch % sizeof(xnn_float16) == 0); + assert(input != NULL); + assert(max != NULL); + assert(output != NULL); + assert(sum != NULL); + + const xnn_float16 vlog2e = 0x1.715476p0f; + const xnn_float16 vmagic_bias = 0x1.83Cp+10f; + const xnn_float16 vminus_ln2_hi = -0x1.630p-1f; + const xnn_float16 vminus_ln2_lo = 0x1.BD0p-13f; + const xnn_float16 vc2 = 0x1.FF3A32p-2f; + const xnn_float16 vc1 = 0x1.039E10p+0f; + const xnn_float16 vdenorm_cutoff = -0x1.368000p+3f; + + const xnn_float16* i = input; + xnn_float16* o = output; + + batch >>= XNN_LOG2_SIZEOF_FLOAT16; + + size_t vlmax = __riscv_vsetvl_e16m4(batch); + vfloat32m8_t vacc = __riscv_vfmv_v_f_f32m8(0.0f, vlmax); + + do { + size_t vl = __riscv_vsetvl_e16m4(batch); batch -= vl; + + vfloat16m4_t vi = __riscv_vle16_v_f16m4(i, vl); i += vl; + + const vfloat16m4_t vx = __riscv_vfsub(vi, *max, vl); + + vfloat16m4_t vn = __riscv_vfmv_v_f_f16m4(vmagic_bias, vl); + vn = __riscv_vfmacc(vn, vlog2e, vx, vl); + + const vfloat16m4_t vs = __riscv_vreinterpret_f16m4(__riscv_vsll(__riscv_vreinterpret_i16m4(vn), 10, vl)); + + vn = __riscv_vfsub(vn, vmagic_bias, vl); + + vfloat16m4_t vt = __riscv_vmv_v(vx, vl); + vt = __riscv_vfmacc(vt, vminus_ln2_hi, vn, vl); + vt = __riscv_vfmacc(vt, vminus_ln2_lo, vn, vl); + + vfloat16m4_t vp = __riscv_vfmv_v_f_f16m4(vc1, vl); + vp = __riscv_vfmacc(vp, vc2, vt, vl); + + vt = __riscv_vfmul(vt, vs, vl); + + vfloat16m4_t vf = __riscv_vmv_v(vs, vl); + vf = __riscv_vfmacc(vf, vp, vt, vl); + + const vbool4_t vmask = __riscv_vmflt(vx, vdenorm_cutoff, vl); + vf = __riscv_vfmerge(vf, 0.0f, vmask, vl); + + 
__riscv_vse16(o, vf, vl); o += vl; + + vacc = __riscv_vfwadd_wv(vacc, vf, vl); + } while (batch > 0); + + vfloat32m1_t v0 = __riscv_vfmv_s_f_f32m1(0.0f, 1); + *sum = __riscv_vfmv_f(__riscv_vfredusum(vacc, v0, vlmax)); +} diff --git a/src/f16-raddstoreexpminusmax/rvvfp16arith-rr2-p2.c.in b/src/f16-raddstoreexpminusmax/rvvfp16arith-rr2-p2.c.in new file mode 100644 index 00000000000..d8d8aa9018b --- /dev/null +++ b/src/f16-raddstoreexpminusmax/rvvfp16arith-rr2-p2.c.in @@ -0,0 +1,85 @@ +// Copyright 2026 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +$assert LMUL in [1, 2, 4] +#include <assert.h> +#include <stddef.h> +#include <stdint.h> + +#include <riscv_vector.h> + +#include "src/xnnpack/common.h" +#include "src/xnnpack/math.h" +#include "src/xnnpack/raddstoreexpminusmax.h" + + +void xnn_f16_raddstoreexpminusmax_ukernel__rvvfp16arith_rr2_p2_u${LMUL}v( + size_t batch, + const xnn_float16* input, + const xnn_float16* max, + xnn_float16* output, + float* sum, + const void* params) XNN_OOB_READS +{ + assert(batch != 0); + assert(batch % sizeof(xnn_float16) == 0); + assert(input != NULL); + assert(max != NULL); + assert(output != NULL); + assert(sum != NULL); + + const xnn_float16 vlog2e = 0x1.715476p0f; + const xnn_float16 vmagic_bias = 0x1.83Cp+10f; + const xnn_float16 vminus_ln2_hi = -0x1.630p-1f; + const xnn_float16 vminus_ln2_lo = 0x1.BD0p-13f; + const xnn_float16 vc2 = 0x1.FF3A32p-2f; + const xnn_float16 vc1 = 0x1.039E10p+0f; + const xnn_float16 vdenorm_cutoff = -0x1.368000p+3f; + + const xnn_float16* i = input; + xnn_float16* o = output; + + batch >>= XNN_LOG2_SIZEOF_FLOAT16; + + size_t vlmax = __riscv_vsetvl_e16m${LMUL}(batch); + vfloat32m${LMUL*2}_t vacc = __riscv_vfmv_v_f_f32m${LMUL*2}(0.0f, vlmax); + + do { + size_t vl = __riscv_vsetvl_e16m${LMUL}(batch); batch -= vl; + + vfloat16m${LMUL}_t vi = __riscv_vle16_v_f16m${LMUL}(i, vl); i += vl; + + const vfloat16m${LMUL}_t vx = __riscv_vfsub(vi, *max, vl); + + 
vfloat16m${LMUL}_t vn = __riscv_vfmv_v_f_f16m${LMUL}(vmagic_bias, vl); + vn = __riscv_vfmacc(vn, vlog2e, vx, vl); + + const vfloat16m${LMUL}_t vs = __riscv_vreinterpret_f16m${LMUL}(__riscv_vsll(__riscv_vreinterpret_i16m${LMUL}(vn), 10, vl)); + + vn = __riscv_vfsub(vn, vmagic_bias, vl); + + vfloat16m${LMUL}_t vt = __riscv_vmv_v(vx, vl); + vt = __riscv_vfmacc(vt, vminus_ln2_hi, vn, vl); + vt = __riscv_vfmacc(vt, vminus_ln2_lo, vn, vl); + + vfloat16m${LMUL}_t vp = __riscv_vfmv_v_f_f16m${LMUL}(vc1, vl); + vp = __riscv_vfmacc(vp, vc2, vt, vl); + + vt = __riscv_vfmul(vt, vs, vl); + + vfloat16m${LMUL}_t vf = __riscv_vmv_v(vs, vl); + vf = __riscv_vfmacc(vf, vp, vt, vl); + + const vbool${16//LMUL}_t vmask = __riscv_vmflt(vx, vdenorm_cutoff, vl); + vf = __riscv_vfmerge(vf, 0.0f, vmask, vl); + + __riscv_vse16(o, vf, vl); o += vl; + + vacc = __riscv_vfwadd_wv(vacc, vf, vl); + } while (batch > 0); + + vfloat32m1_t v0 = __riscv_vfmv_s_f_f32m1(0.0f, 1); + *sum = __riscv_vfmv_f(__riscv_vfredusum(vacc, v0, vlmax)); +} diff --git a/src/xnnpack/raddstoreexpminusmax.h b/src/xnnpack/raddstoreexpminusmax.h index bcb04407f24..acecc2d4526 100644 --- a/src/xnnpack/raddstoreexpminusmax.h +++ b/src/xnnpack/raddstoreexpminusmax.h @@ -61,6 +61,13 @@ DECLARE_F16_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION( DECLARE_F16_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION( xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_u32_acc4) +DECLARE_F16_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION( + xnn_f16_raddstoreexpminusmax_ukernel__rvvfp16arith_rr2_p2_u1v) +DECLARE_F16_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION( + xnn_f16_raddstoreexpminusmax_ukernel__rvvfp16arith_rr2_p2_u2v) +DECLARE_F16_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION( + xnn_f16_raddstoreexpminusmax_ukernel__rvvfp16arith_rr2_p2_u4v) + #define DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(fn_name) \ XNN_INTERNAL void fn_name(size_t n, const float* input, const float* max, \ float* output, float* sum, const void* params); diff --git 
a/test/f16-raddstoreexpminusmax.cc b/test/f16-raddstoreexpminusmax.cc index 3ff752bdb7a..f254d596c91 100644 --- a/test/f16-raddstoreexpminusmax.cc +++ b/test/f16-raddstoreexpminusmax.cc @@ -718,3 +718,132 @@ } } #endif // XNN_ENABLE_AVX2 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + + +#if XNN_ENABLE_RISCV_FP16_VECTOR && XNN_ARCH_RISCV + TEST(F16_RADDSTOREEXPMINUSMAX__RVVFP16ARITH_RR2_P2_U1V, elements_eq_1v) { + TEST_REQUIRES_ARCH_FLAGS(xnn_arch_riscv_vector_fp16_arith); + RAddStoreExpMinusMaxMicrokernelTester() + .elements(1 * xnn_init_hardware_config()->vlenb / sizeof(uint16_t)) + .Test(xnn_f16_raddstoreexpminusmax_ukernel__rvvfp16arith_rr2_p2_u1v, nullptr); + } + + TEST(F16_RADDSTOREEXPMINUSMAX__RVVFP16ARITH_RR2_P2_U1V, elements_div_1v) { + TEST_REQUIRES_ARCH_FLAGS(xnn_arch_riscv_vector_fp16_arith); + for (size_t elements = 2 * xnn_init_hardware_config()->vlenb / sizeof(uint16_t); + elements < 10 * xnn_init_hardware_config()->vlenb / sizeof(uint16_t); + elements += 1 * xnn_init_hardware_config()->vlenb / sizeof(uint16_t)) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f16_raddstoreexpminusmax_ukernel__rvvfp16arith_rr2_p2_u1v, nullptr); + } + } + + TEST(F16_RADDSTOREEXPMINUSMAX__RVVFP16ARITH_RR2_P2_U1V, elements_lt_1v) { + TEST_REQUIRES_ARCH_FLAGS(xnn_arch_riscv_vector_fp16_arith); + for (size_t elements = 1; + elements < 1 * xnn_init_hardware_config()->vlenb / sizeof(uint16_t); + elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f16_raddstoreexpminusmax_ukernel__rvvfp16arith_rr2_p2_u1v, nullptr); + } + } + + TEST(F16_RADDSTOREEXPMINUSMAX__RVVFP16ARITH_RR2_P2_U1V, elements_gt_1v) { + TEST_REQUIRES_ARCH_FLAGS(xnn_arch_riscv_vector_fp16_arith); + for (size_t elements = 1 * xnn_init_hardware_config()->vlenb / sizeof(uint16_t) + 1; + elements < 10 * xnn_init_hardware_config()->vlenb / sizeof(uint16_t); + elements += 2) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + 
.Test(xnn_f16_raddstoreexpminusmax_ukernel__rvvfp16arith_rr2_p2_u1v, nullptr); + } + } +#endif // XNN_ENABLE_RISCV_FP16_VECTOR && XNN_ARCH_RISCV + + +#if XNN_ENABLE_RISCV_FP16_VECTOR && XNN_ARCH_RISCV + TEST(F16_RADDSTOREEXPMINUSMAX__RVVFP16ARITH_RR2_P2_U2V, elements_eq_2v) { + TEST_REQUIRES_ARCH_FLAGS(xnn_arch_riscv_vector_fp16_arith); + RAddStoreExpMinusMaxMicrokernelTester() + .elements(2 * xnn_init_hardware_config()->vlenb / sizeof(uint16_t)) + .Test(xnn_f16_raddstoreexpminusmax_ukernel__rvvfp16arith_rr2_p2_u2v, nullptr); + } + + TEST(F16_RADDSTOREEXPMINUSMAX__RVVFP16ARITH_RR2_P2_U2V, elements_div_2v) { + TEST_REQUIRES_ARCH_FLAGS(xnn_arch_riscv_vector_fp16_arith); + for (size_t elements = 4 * xnn_init_hardware_config()->vlenb / sizeof(uint16_t); + elements < 20 * xnn_init_hardware_config()->vlenb / sizeof(uint16_t); + elements += 2 * xnn_init_hardware_config()->vlenb / sizeof(uint16_t)) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f16_raddstoreexpminusmax_ukernel__rvvfp16arith_rr2_p2_u2v, nullptr); + } + } + + TEST(F16_RADDSTOREEXPMINUSMAX__RVVFP16ARITH_RR2_P2_U2V, elements_lt_2v) { + TEST_REQUIRES_ARCH_FLAGS(xnn_arch_riscv_vector_fp16_arith); + for (size_t elements = 1; + elements < 2 * xnn_init_hardware_config()->vlenb / sizeof(uint16_t); + elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f16_raddstoreexpminusmax_ukernel__rvvfp16arith_rr2_p2_u2v, nullptr); + } + } + + TEST(F16_RADDSTOREEXPMINUSMAX__RVVFP16ARITH_RR2_P2_U2V, elements_gt_2v) { + TEST_REQUIRES_ARCH_FLAGS(xnn_arch_riscv_vector_fp16_arith); + for (size_t elements = 2 * xnn_init_hardware_config()->vlenb / sizeof(uint16_t) + 1; + elements < 4 * xnn_init_hardware_config()->vlenb / sizeof(uint16_t); + elements += 4) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f16_raddstoreexpminusmax_ukernel__rvvfp16arith_rr2_p2_u2v, nullptr); + } + } +#endif // XNN_ENABLE_RISCV_FP16_VECTOR && XNN_ARCH_RISCV + + 
+#if XNN_ENABLE_RISCV_FP16_VECTOR && XNN_ARCH_RISCV + TEST(F16_RADDSTOREEXPMINUSMAX__RVVFP16ARITH_RR2_P2_U4V, elements_eq_4v) { + TEST_REQUIRES_ARCH_FLAGS(xnn_arch_riscv_vector_fp16_arith); + RAddStoreExpMinusMaxMicrokernelTester() + .elements(4 * xnn_init_hardware_config()->vlenb / sizeof(uint16_t)) + .Test(xnn_f16_raddstoreexpminusmax_ukernel__rvvfp16arith_rr2_p2_u4v, nullptr); + } + + TEST(F16_RADDSTOREEXPMINUSMAX__RVVFP16ARITH_RR2_P2_U4V, elements_div_4v) { + TEST_REQUIRES_ARCH_FLAGS(xnn_arch_riscv_vector_fp16_arith); + for (size_t elements = 8 * xnn_init_hardware_config()->vlenb / sizeof(uint16_t); + elements < 40 * xnn_init_hardware_config()->vlenb / sizeof(uint16_t); + elements += 4 * xnn_init_hardware_config()->vlenb / sizeof(uint16_t)) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f16_raddstoreexpminusmax_ukernel__rvvfp16arith_rr2_p2_u4v, nullptr); + } + } + + TEST(F16_RADDSTOREEXPMINUSMAX__RVVFP16ARITH_RR2_P2_U4V, elements_lt_4v) { + TEST_REQUIRES_ARCH_FLAGS(xnn_arch_riscv_vector_fp16_arith); + for (size_t elements = 1; + elements < 4 * xnn_init_hardware_config()->vlenb / sizeof(uint16_t); + elements++) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f16_raddstoreexpminusmax_ukernel__rvvfp16arith_rr2_p2_u4v, nullptr); + } + } + + TEST(F16_RADDSTOREEXPMINUSMAX__RVVFP16ARITH_RR2_P2_U4V, elements_gt_4v) { + TEST_REQUIRES_ARCH_FLAGS(xnn_arch_riscv_vector_fp16_arith); + for (size_t elements = 4 * xnn_init_hardware_config()->vlenb / sizeof(uint16_t) + 1; + elements < 8 * xnn_init_hardware_config()->vlenb / sizeof(uint16_t); + elements += 8) { + RAddStoreExpMinusMaxMicrokernelTester() + .elements(elements) + .Test(xnn_f16_raddstoreexpminusmax_ukernel__rvvfp16arith_rr2_p2_u4v, nullptr); + } + } +#endif // XNN_ENABLE_RISCV_FP16_VECTOR && XNN_ARCH_RISCV diff --git a/test/f16-raddstoreexpminusmax.yaml b/test/f16-raddstoreexpminusmax.yaml index 2756cff18fa..d05f51d9e49 100644 --- 
a/test/f16-raddstoreexpminusmax.yaml +++ b/test/f16-raddstoreexpminusmax.yaml @@ -3,7 +3,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -# ARN NEON+FP16ARITH +# ARM NEON+FP16ARITH - name: xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_u16 - name: xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_u16_acc2 - name: xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_u32 @@ -24,4 +24,9 @@ - name: xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_u48_acc3 - name: xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_u64 - name: xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_u64_acc2 -- name: xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_u64_acc4 \ No newline at end of file +- name: xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_u64_acc4 + +# RISC-V Vector +- name: xnn_f16_raddstoreexpminusmax_ukernel__rvvfp16arith_rr2_p2_u1v +- name: xnn_f16_raddstoreexpminusmax_ukernel__rvvfp16arith_rr2_p2_u2v +- name: xnn_f16_raddstoreexpminusmax_ukernel__rvvfp16arith_rr2_p2_u4v