From 550760945b5f73d9a754f0789ae8ebf10f67f20b Mon Sep 17 00:00:00 2001 From: Ken Unger Date: Mon, 30 Mar 2026 23:16:09 -0700 Subject: [PATCH] Copybara import of the project: -- 4a29295db4d2dabc70e236d608972361fac902f4 by Ken Unger : add rvv support for f16-vhswish, f16-vlrelu, f16-vrnd, f16-vrsqrt, f16-vsqrt, f32-vcopysign -- 99f32dece9e7975f3626e97c95504692e6130086 by Ken Unger : updated copyright per review comments FUTURE_COPYBARA_INTEGRATE_REVIEW=https://github.com/google/XNNPACK/pull/9693 from ken-unger:unary-rvv 72587b2187dd13160582fe09e9ebc2feb061f190 PiperOrigin-RevId: 892124176 --- cmake/gen/rvv_microkernels.cmake | 6 ++ cmake/gen/rvvfp16arith_microkernels.cmake | 16 ++++ gen/rvv_microkernels.bzl | 6 ++ gen/rvvfp16arith_microkernels.bzl | 16 ++++ scripts/generate-f16-vhswish.sh | 4 + scripts/generate-f16-vlrelu.sh | 4 + scripts/generate-f16-vrnd.sh | 12 ++- scripts/generate-f16-vrsqrt.sh | 6 +- scripts/generate-f16-vsqrt.sh | 4 + scripts/generate-f32-vcopysign.sh | 11 ++- src/configs/binary-elementwise-config.c | 14 ++++ src/configs/unary-elementwise-config.c | 57 +++++++++++++ src/f16-vhswish/f16-vhswish.inc | 4 + .../gen/f16-vhswish-rvvfp16arith-u4v.c | 50 ++++++++++++ .../gen/f16-vhswish-rvvfp16arith-u8v.c | 50 ++++++++++++ src/f16-vhswish/rvv.c.in | 46 +++++++++++ src/f16-vlrelu/f16-vlrelu.inc | 4 + .../gen/f16-vlrelu-rvvfp16arith-u4v.c | 40 ++++++++++ .../gen/f16-vlrelu-rvvfp16arith-u8v.c | 40 ++++++++++ src/f16-vlrelu/rvv.c.in | 36 +++++++++ src/f16-vrnd/f16-vrndd.inc | 4 + src/f16-vrnd/f16-vrndne.inc | 4 + src/f16-vrnd/f16-vrndu.inc | 4 + src/f16-vrnd/f16-vrndz.inc | 5 ++ src/f16-vrnd/gen/f16-vrndd-rvvfp16arith-u4v.c | 50 ++++++++++++ src/f16-vrnd/gen/f16-vrndd-rvvfp16arith-u8v.c | 50 ++++++++++++ .../gen/f16-vrndne-rvvfp16arith-u4v.c | 50 ++++++++++++ .../gen/f16-vrndne-rvvfp16arith-u8v.c | 50 ++++++++++++ src/f16-vrnd/gen/f16-vrndu-rvvfp16arith-u4v.c | 50 ++++++++++++ src/f16-vrnd/gen/f16-vrndu-rvvfp16arith-u8v.c | 50 ++++++++++++ 
src/f16-vrnd/gen/f16-vrndz-rvvfp16arith-u4v.c | 50 ++++++++++++ src/f16-vrnd/gen/f16-vrndz-rvvfp16arith-u8v.c | 50 ++++++++++++ src/f16-vrnd/rvv.c.in | 53 +++++++++++++ src/f16-vrsqrt/f16-vrsqrt.inc | 4 + .../gen/f16-vrsqrt-rvvfp16arith-rsqrt-u2v.c | 79 +++++++++++++++++++ .../gen/f16-vrsqrt-rvvfp16arith-rsqrt-u4v.c | 79 +++++++++++++++++++ src/f16-vrsqrt/rvv.c.in | 75 ++++++++++++++++++ src/f16-vsqrt/f16-vsqrt.inc | 4 + .../gen/f16-vsqrt-rvvfp16arith-sqrt-u4v.c | 42 ++++++++++ .../gen/f16-vsqrt-rvvfp16arith-sqrt-u8v.c | 42 ++++++++++ src/f16-vsqrt/rvv.c.in | 38 +++++++++ src/f32-vbinary/f32-vcopysign.inc | 5 ++ src/f32-vbinary/f32-vcopysignc.inc | 5 ++ src/f32-vbinary/f32-vrcopysignc.inc | 5 ++ src/f32-vcopysign/gen/f32-vcopysign-rvv-u4v.c | 45 +++++++++++ src/f32-vcopysign/gen/f32-vcopysign-rvv-u8v.c | 45 +++++++++++ .../gen/f32-vcopysignc-rvv-u4v.c | 45 +++++++++++ .../gen/f32-vcopysignc-rvv-u8v.c | 45 +++++++++++ .../gen/f32-vrcopysignc-rvv-u4v.c | 46 +++++++++++ .../gen/f32-vrcopysignc-rvv-u8v.c | 46 +++++++++++ src/f32-vcopysign/rvv.c.in | 58 ++++++++++++++ 51 files changed, 1601 insertions(+), 3 deletions(-) create mode 100644 src/f16-vhswish/gen/f16-vhswish-rvvfp16arith-u4v.c create mode 100644 src/f16-vhswish/gen/f16-vhswish-rvvfp16arith-u8v.c create mode 100644 src/f16-vhswish/rvv.c.in create mode 100644 src/f16-vlrelu/gen/f16-vlrelu-rvvfp16arith-u4v.c create mode 100644 src/f16-vlrelu/gen/f16-vlrelu-rvvfp16arith-u8v.c create mode 100644 src/f16-vlrelu/rvv.c.in create mode 100644 src/f16-vrnd/gen/f16-vrndd-rvvfp16arith-u4v.c create mode 100644 src/f16-vrnd/gen/f16-vrndd-rvvfp16arith-u8v.c create mode 100644 src/f16-vrnd/gen/f16-vrndne-rvvfp16arith-u4v.c create mode 100644 src/f16-vrnd/gen/f16-vrndne-rvvfp16arith-u8v.c create mode 100644 src/f16-vrnd/gen/f16-vrndu-rvvfp16arith-u4v.c create mode 100644 src/f16-vrnd/gen/f16-vrndu-rvvfp16arith-u8v.c create mode 100644 src/f16-vrnd/gen/f16-vrndz-rvvfp16arith-u4v.c create mode 100644 
src/f16-vrnd/gen/f16-vrndz-rvvfp16arith-u8v.c create mode 100644 src/f16-vrnd/rvv.c.in create mode 100644 src/f16-vrsqrt/gen/f16-vrsqrt-rvvfp16arith-rsqrt-u2v.c create mode 100644 src/f16-vrsqrt/gen/f16-vrsqrt-rvvfp16arith-rsqrt-u4v.c create mode 100644 src/f16-vrsqrt/rvv.c.in create mode 100644 src/f16-vsqrt/gen/f16-vsqrt-rvvfp16arith-sqrt-u4v.c create mode 100644 src/f16-vsqrt/gen/f16-vsqrt-rvvfp16arith-sqrt-u8v.c create mode 100644 src/f16-vsqrt/rvv.c.in create mode 100644 src/f32-vcopysign/gen/f32-vcopysign-rvv-u4v.c create mode 100644 src/f32-vcopysign/gen/f32-vcopysign-rvv-u8v.c create mode 100644 src/f32-vcopysign/gen/f32-vcopysignc-rvv-u4v.c create mode 100644 src/f32-vcopysign/gen/f32-vcopysignc-rvv-u8v.c create mode 100644 src/f32-vcopysign/gen/f32-vrcopysignc-rvv-u4v.c create mode 100644 src/f32-vcopysign/gen/f32-vrcopysignc-rvv-u8v.c create mode 100644 src/f32-vcopysign/rvv.c.in diff --git a/cmake/gen/rvv_microkernels.cmake b/cmake/gen/rvv_microkernels.cmake index d6ad1ad5937..db093021c08 100644 --- a/cmake/gen/rvv_microkernels.cmake +++ b/cmake/gen/rvv_microkernels.cmake @@ -63,6 +63,9 @@ SET(PROD_RVV_MICROKERNEL_SRCS src/f32-vbinary/gen/f32-vsubc-rvv-u8v.c src/f32-vclamp/gen/f32-vclamp-rvv-u8v.c src/f32-vcmul/gen/f32-vcmul-rvv-u2v.c + src/f32-vcopysign/gen/f32-vcopysign-rvv-u8v.c + src/f32-vcopysign/gen/f32-vcopysignc-rvv-u8v.c + src/f32-vcopysign/gen/f32-vrcopysignc-rvv-u8v.c src/f32-vhswish/gen/f32-vhswish-rvv-u4v.c src/f32-vlrelu/gen/f32-vlrelu-rvv-u4v.c src/f32-vrnd/gen/f32-vrndd-rvv-u4v.c @@ -214,6 +217,9 @@ SET(NON_PROD_RVV_MICROKERNEL_SRCS src/f32-vclamp/gen/f32-vclamp-rvv-u4v.c src/f32-vcmul/gen/f32-vcmul-rvv-u1v.c src/f32-vcmul/gen/f32-vcmul-rvv-u4v.c + src/f32-vcopysign/gen/f32-vcopysign-rvv-u4v.c + src/f32-vcopysign/gen/f32-vcopysignc-rvv-u4v.c + src/f32-vcopysign/gen/f32-vrcopysignc-rvv-u4v.c src/f32-vhswish/gen/f32-vhswish-rvv-u1v.c src/f32-vhswish/gen/f32-vhswish-rvv-u2v.c src/f32-vhswish/gen/f32-vhswish-rvv-u8v.c diff --git 
a/cmake/gen/rvvfp16arith_microkernels.cmake b/cmake/gen/rvvfp16arith_microkernels.cmake index 3c4580a0ab9..d99a66c635c 100644 --- a/cmake/gen/rvvfp16arith_microkernels.cmake +++ b/cmake/gen/rvvfp16arith_microkernels.cmake @@ -49,6 +49,14 @@ SET(PROD_RVVFP16ARITH_MICROKERNEL_SRCS src/f16-vbinary/gen/f16-vsub-rvvfp16arith-u8v.c src/f16-vbinary/gen/f16-vsubc-rvvfp16arith-u8v.c src/f16-vclamp/gen/f16-vclamp-rvvfp16arith-u8v.c + src/f16-vhswish/gen/f16-vhswish-rvvfp16arith-u8v.c + src/f16-vlrelu/gen/f16-vlrelu-rvvfp16arith-u8v.c + src/f16-vrnd/gen/f16-vrndd-rvvfp16arith-u8v.c + src/f16-vrnd/gen/f16-vrndne-rvvfp16arith-u8v.c + src/f16-vrnd/gen/f16-vrndu-rvvfp16arith-u8v.c + src/f16-vrnd/gen/f16-vrndz-rvvfp16arith-u8v.c + src/f16-vrsqrt/gen/f16-vrsqrt-rvvfp16arith-rsqrt-u4v.c + src/f16-vsqrt/gen/f16-vsqrt-rvvfp16arith-sqrt-u8v.c src/f16-vunary/gen/f16-vabs-rvvfp16arith-u8v.c src/f16-vunary/gen/f16-vneg-rvvfp16arith-u8v.c src/f16-vunary/gen/f16-vsqr-rvvfp16arith-u8v.c @@ -139,6 +147,14 @@ SET(NON_PROD_RVVFP16ARITH_MICROKERNEL_SRCS src/f16-vclamp/gen/f16-vclamp-rvvfp16arith-u1v.c src/f16-vclamp/gen/f16-vclamp-rvvfp16arith-u2v.c src/f16-vclamp/gen/f16-vclamp-rvvfp16arith-u4v.c + src/f16-vhswish/gen/f16-vhswish-rvvfp16arith-u4v.c + src/f16-vlrelu/gen/f16-vlrelu-rvvfp16arith-u4v.c + src/f16-vrnd/gen/f16-vrndd-rvvfp16arith-u4v.c + src/f16-vrnd/gen/f16-vrndne-rvvfp16arith-u4v.c + src/f16-vrnd/gen/f16-vrndu-rvvfp16arith-u4v.c + src/f16-vrnd/gen/f16-vrndz-rvvfp16arith-u4v.c + src/f16-vrsqrt/gen/f16-vrsqrt-rvvfp16arith-rsqrt-u2v.c + src/f16-vsqrt/gen/f16-vsqrt-rvvfp16arith-sqrt-u4v.c src/f16-vunary/gen/f16-vabs-rvvfp16arith-u1v.c src/f16-vunary/gen/f16-vabs-rvvfp16arith-u2v.c src/f16-vunary/gen/f16-vabs-rvvfp16arith-u4v.c diff --git a/gen/rvv_microkernels.bzl b/gen/rvv_microkernels.bzl index 6261a1484c6..30064f30eee 100644 --- a/gen/rvv_microkernels.bzl +++ b/gen/rvv_microkernels.bzl @@ -59,6 +59,9 @@ PROD_RVV_MICROKERNEL_SRCS = [ "src/f32-vbinary/gen/f32-vsubc-rvv-u8v.c", 
"src/f32-vclamp/gen/f32-vclamp-rvv-u8v.c", "src/f32-vcmul/gen/f32-vcmul-rvv-u2v.c", + "src/f32-vcopysign/gen/f32-vcopysign-rvv-u8v.c", + "src/f32-vcopysign/gen/f32-vcopysignc-rvv-u8v.c", + "src/f32-vcopysign/gen/f32-vrcopysignc-rvv-u8v.c", "src/f32-vhswish/gen/f32-vhswish-rvv-u4v.c", "src/f32-vlrelu/gen/f32-vlrelu-rvv-u4v.c", "src/f32-vrnd/gen/f32-vrndd-rvv-u4v.c", @@ -211,6 +214,9 @@ NON_PROD_RVV_MICROKERNEL_SRCS = [ "src/f32-vclamp/gen/f32-vclamp-rvv-u4v.c", "src/f32-vcmul/gen/f32-vcmul-rvv-u1v.c", "src/f32-vcmul/gen/f32-vcmul-rvv-u4v.c", + "src/f32-vcopysign/gen/f32-vcopysign-rvv-u4v.c", + "src/f32-vcopysign/gen/f32-vcopysignc-rvv-u4v.c", + "src/f32-vcopysign/gen/f32-vrcopysignc-rvv-u4v.c", "src/f32-vhswish/gen/f32-vhswish-rvv-u1v.c", "src/f32-vhswish/gen/f32-vhswish-rvv-u2v.c", "src/f32-vhswish/gen/f32-vhswish-rvv-u8v.c", diff --git a/gen/rvvfp16arith_microkernels.bzl b/gen/rvvfp16arith_microkernels.bzl index 31d43cc0976..1c2fa7b5ea8 100644 --- a/gen/rvvfp16arith_microkernels.bzl +++ b/gen/rvvfp16arith_microkernels.bzl @@ -45,6 +45,14 @@ PROD_RVVFP16ARITH_MICROKERNEL_SRCS = [ "src/f16-vbinary/gen/f16-vsub-rvvfp16arith-u8v.c", "src/f16-vbinary/gen/f16-vsubc-rvvfp16arith-u8v.c", "src/f16-vclamp/gen/f16-vclamp-rvvfp16arith-u8v.c", + "src/f16-vhswish/gen/f16-vhswish-rvvfp16arith-u8v.c", + "src/f16-vlrelu/gen/f16-vlrelu-rvvfp16arith-u8v.c", + "src/f16-vrnd/gen/f16-vrndd-rvvfp16arith-u8v.c", + "src/f16-vrnd/gen/f16-vrndne-rvvfp16arith-u8v.c", + "src/f16-vrnd/gen/f16-vrndu-rvvfp16arith-u8v.c", + "src/f16-vrnd/gen/f16-vrndz-rvvfp16arith-u8v.c", + "src/f16-vrsqrt/gen/f16-vrsqrt-rvvfp16arith-rsqrt-u4v.c", + "src/f16-vsqrt/gen/f16-vsqrt-rvvfp16arith-sqrt-u8v.c", "src/f16-vunary/gen/f16-vabs-rvvfp16arith-u8v.c", "src/f16-vunary/gen/f16-vneg-rvvfp16arith-u8v.c", "src/f16-vunary/gen/f16-vsqr-rvvfp16arith-u8v.c", @@ -136,6 +144,14 @@ NON_PROD_RVVFP16ARITH_MICROKERNEL_SRCS = [ "src/f16-vclamp/gen/f16-vclamp-rvvfp16arith-u1v.c", 
"src/f16-vclamp/gen/f16-vclamp-rvvfp16arith-u2v.c", "src/f16-vclamp/gen/f16-vclamp-rvvfp16arith-u4v.c", + "src/f16-vhswish/gen/f16-vhswish-rvvfp16arith-u4v.c", + "src/f16-vlrelu/gen/f16-vlrelu-rvvfp16arith-u4v.c", + "src/f16-vrnd/gen/f16-vrndd-rvvfp16arith-u4v.c", + "src/f16-vrnd/gen/f16-vrndne-rvvfp16arith-u4v.c", + "src/f16-vrnd/gen/f16-vrndu-rvvfp16arith-u4v.c", + "src/f16-vrnd/gen/f16-vrndz-rvvfp16arith-u4v.c", + "src/f16-vrsqrt/gen/f16-vrsqrt-rvvfp16arith-rsqrt-u2v.c", + "src/f16-vsqrt/gen/f16-vsqrt-rvvfp16arith-sqrt-u4v.c", "src/f16-vunary/gen/f16-vabs-rvvfp16arith-u1v.c", "src/f16-vunary/gen/f16-vabs-rvvfp16arith-u2v.c", "src/f16-vunary/gen/f16-vabs-rvvfp16arith-u4v.c", diff --git a/scripts/generate-f16-vhswish.sh b/scripts/generate-f16-vhswish.sh index 368f1e47b11..e0526e530f4 100755 --- a/scripts/generate-f16-vhswish.sh +++ b/scripts/generate-f16-vhswish.sh @@ -12,4 +12,8 @@ tools/xngen src/f16-vhswish/neonfp16arith.c.in -D BATCH_TILE=16 -o src/f16-vhswi tools/xngen src/f16-vhswish/f16c.c.in -D BATCH_TILE=8 -o src/f16-vhswish/gen/f16-vhswish-f16c-u8.c & tools/xngen src/f16-vhswish/f16c.c.in -D BATCH_TILE=16 -o src/f16-vhswish/gen/f16-vhswish-f16c-u16.c & +################################### RISC-V Vector ############################# +tools/xngen src/f16-vhswish/rvv.c.in -D LMUL=4 -o src/f16-vhswish/gen/f16-vhswish-rvvfp16arith-u4v.c & +tools/xngen src/f16-vhswish/rvv.c.in -D LMUL=8 -o src/f16-vhswish/gen/f16-vhswish-rvvfp16arith-u8v.c & + wait diff --git a/scripts/generate-f16-vlrelu.sh b/scripts/generate-f16-vlrelu.sh index b7e9caa8ee9..f07803a73df 100755 --- a/scripts/generate-f16-vlrelu.sh +++ b/scripts/generate-f16-vlrelu.sh @@ -12,4 +12,8 @@ tools/xngen src/f16-vlrelu/neonfp16arith.c.in -D BATCH_TILE=16 -o src/f16-vlrelu tools/xngen src/f16-vlrelu/f16c.c.in -D BATCH_TILE=8 -o src/f16-vlrelu/gen/f16-vlrelu-f16c-u8.c & tools/xngen src/f16-vlrelu/f16c.c.in -D BATCH_TILE=16 -o src/f16-vlrelu/gen/f16-vlrelu-f16c-u16.c & 
+################################### RISC-V Vector ############################# +tools/xngen src/f16-vlrelu/rvv.c.in -D LMUL=4 -o src/f16-vlrelu/gen/f16-vlrelu-rvvfp16arith-u4v.c & +tools/xngen src/f16-vlrelu/rvv.c.in -D LMUL=8 -o src/f16-vlrelu/gen/f16-vlrelu-rvvfp16arith-u8v.c & + wait diff --git a/scripts/generate-f16-vrnd.sh b/scripts/generate-f16-vrnd.sh index fc88462ce82..a78ce75020d 100755 --- a/scripts/generate-f16-vrnd.sh +++ b/scripts/generate-f16-vrnd.sh @@ -14,7 +14,7 @@ tools/xngen src/f16-vrnd/neonfp16arith.c.in -D OP=RNDU -D BATCH_TILE=16 -o src/ tools/xngen src/f16-vrnd/neonfp16arith.c.in -D OP=RNDD -D BATCH_TILE=8 -o src/f16-vrnd/gen/f16-vrndd-neonfp16arith-u8.c & tools/xngen src/f16-vrnd/neonfp16arith.c.in -D OP=RNDD -D BATCH_TILE=16 -o src/f16-vrnd/gen/f16-vrndd-neonfp16arith-u16.c & -################################# x86 F16C ################################# +################################# x86 F16C #################################### tools/xngen src/f16-vrnd/f16c.c.in -D OP=RNDNE -D BATCH_TILE=8 -o src/f16-vrnd/gen/f16-vrndne-f16c-u8.c & tools/xngen src/f16-vrnd/f16c.c.in -D OP=RNDNE -D BATCH_TILE=16 -o src/f16-vrnd/gen/f16-vrndne-f16c-u16.c & tools/xngen src/f16-vrnd/f16c.c.in -D OP=RNDZ -D BATCH_TILE=8 -o src/f16-vrnd/gen/f16-vrndz-f16c-u8.c & @@ -24,4 +24,14 @@ tools/xngen src/f16-vrnd/f16c.c.in -D OP=RNDU -D BATCH_TILE=16 -o src/f16-vrnd/ tools/xngen src/f16-vrnd/f16c.c.in -D OP=RNDD -D BATCH_TILE=8 -o src/f16-vrnd/gen/f16-vrndd-f16c-u8.c & tools/xngen src/f16-vrnd/f16c.c.in -D OP=RNDD -D BATCH_TILE=16 -o src/f16-vrnd/gen/f16-vrndd-f16c-u16.c & +################################ RISC-V Vector ################################ +tools/xngen src/f16-vrnd/rvv.c.in -D OP=RNDNE -D LMUL=4 -o src/f16-vrnd/gen/f16-vrndne-rvvfp16arith-u4v.c & +tools/xngen src/f16-vrnd/rvv.c.in -D OP=RNDNE -D LMUL=8 -o src/f16-vrnd/gen/f16-vrndne-rvvfp16arith-u8v.c & +tools/xngen src/f16-vrnd/rvv.c.in -D OP=RNDZ -D LMUL=4 -o 
src/f16-vrnd/gen/f16-vrndz-rvvfp16arith-u4v.c & +tools/xngen src/f16-vrnd/rvv.c.in -D OP=RNDZ -D LMUL=8 -o src/f16-vrnd/gen/f16-vrndz-rvvfp16arith-u8v.c & +tools/xngen src/f16-vrnd/rvv.c.in -D OP=RNDU -D LMUL=4 -o src/f16-vrnd/gen/f16-vrndu-rvvfp16arith-u4v.c & +tools/xngen src/f16-vrnd/rvv.c.in -D OP=RNDU -D LMUL=8 -o src/f16-vrnd/gen/f16-vrndu-rvvfp16arith-u8v.c & +tools/xngen src/f16-vrnd/rvv.c.in -D OP=RNDD -D LMUL=4 -o src/f16-vrnd/gen/f16-vrndd-rvvfp16arith-u4v.c & +tools/xngen src/f16-vrnd/rvv.c.in -D OP=RNDD -D LMUL=8 -o src/f16-vrnd/gen/f16-vrndd-rvvfp16arith-u8v.c & + wait diff --git a/scripts/generate-f16-vrsqrt.sh b/scripts/generate-f16-vrsqrt.sh index dd522c4c71b..6da8933faaf 100755 --- a/scripts/generate-f16-vrsqrt.sh +++ b/scripts/generate-f16-vrsqrt.sh @@ -4,7 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -############################### ARM NEONFP16ARITH ############################## +############################### ARM NEONFP16ARITH ############################# tools/xngen src/f16-vrsqrt/neonfp16arith-rsqrt.c.in -D BATCH_TILE=8 -D FULL_ACC=1 -o src/f16-vrsqrt/gen/f16-vrsqrt-neonfp16arith-rsqrt-u8.c & tools/xngen src/f16-vrsqrt/neonfp16arith-rsqrt.c.in -D BATCH_TILE=16 -D FULL_ACC=1 -o src/f16-vrsqrt/gen/f16-vrsqrt-neonfp16arith-rsqrt-u16.c & tools/xngen src/f16-vrsqrt/neonfp16arith-rsqrt.c.in -D BATCH_TILE=32 -D FULL_ACC=1 -o src/f16-vrsqrt/gen/f16-vrsqrt-neonfp16arith-rsqrt-u32.c & @@ -14,4 +14,8 @@ tools/xngen src/f16-vrsqrt/f16c-rsqrt.c.in -D BATCH_TILE=8 -o src/f16-vrsqrt/ge tools/xngen src/f16-vrsqrt/f16c-rsqrt.c.in -D BATCH_TILE=16 -o src/f16-vrsqrt/gen/f16-vrsqrt-f16c-rsqrt-u16.c & tools/xngen src/f16-vrsqrt/f16c-rsqrt.c.in -D BATCH_TILE=32 -o src/f16-vrsqrt/gen/f16-vrsqrt-f16c-rsqrt-u32.c & +############################### RISC-V Vector ################################# +tools/xngen src/f16-vrsqrt/rvv.c.in -D LMUL=2 -o 
src/f16-vrsqrt/gen/f16-vrsqrt-rvvfp16arith-rsqrt-u2v.c & +tools/xngen src/f16-vrsqrt/rvv.c.in -D LMUL=4 -o src/f16-vrsqrt/gen/f16-vrsqrt-rvvfp16arith-rsqrt-u4v.c & + wait diff --git a/scripts/generate-f16-vsqrt.sh b/scripts/generate-f16-vsqrt.sh index 422e55624da..675c070bb5a 100755 --- a/scripts/generate-f16-vsqrt.sh +++ b/scripts/generate-f16-vsqrt.sh @@ -43,4 +43,8 @@ tools/xngen src/f16-vsqrt/avx512fp16-sqrt.c.in -D BATCH_TILE=32 -o src/f16-vsqr tools/xngen src/f16-vsqrt/avx512fp16-sqrt.c.in -D BATCH_TILE=64 -o src/f16-vsqrt/gen/f16-vsqrt-avx512fp16-sqrt-u64.c & tools/xngen src/f16-vsqrt/avx512fp16-sqrt.c.in -D BATCH_TILE=128 -o src/f16-vsqrt/gen/f16-vsqrt-avx512fp16-sqrt-u128.c & +################################ RISC-V Vector ################################ +tools/xngen src/f16-vsqrt/rvv.c.in -D LMUL=4 -o src/f16-vsqrt/gen/f16-vsqrt-rvvfp16arith-sqrt-u4v.c & +tools/xngen src/f16-vsqrt/rvv.c.in -D LMUL=8 -o src/f16-vsqrt/gen/f16-vsqrt-rvvfp16arith-sqrt-u8v.c & + wait diff --git a/scripts/generate-f32-vcopysign.sh b/scripts/generate-f32-vcopysign.sh index 52aae9b38eb..93d859fdbd2 100755 --- a/scripts/generate-f32-vcopysign.sh +++ b/scripts/generate-f32-vcopysign.sh @@ -13,7 +13,6 @@ tools/xngen src/f32-vcopysign/copysign.c.in -D ARCH=avx -D BATCH_TILES=8, tools/xngen src/f32-vcopysign/copysign.c.in -D ARCH=avx512f -D BATCH_TILES=16,32,48,64 -o src/f32-vcopysign/gen/f32-vcopysign-avx512f.c & tools/xngen src/f32-vcopysign/copysign.c.in -D ARCH=hvx -D BATCH_TILES=32,64,96,128 -o src/f32-vcopysign/gen/f32-vcopysign-hvx.c & - # Scalar sign tools/xngen src/f32-vcopysign/copysignc.c.in -D ARCH=scalar -D BATCH_TILES=1,2,4,8 -o src/f32-vcopysign/gen/f32-vcopysignc-scalar.c & tools/xngen src/f32-vcopysign/copysignc.c.in -D ARCH=sse2 -D BATCH_TILES=4,8,12,16 -o src/f32-vcopysign/gen/f32-vcopysignc-sse2.c & @@ -36,4 +35,14 @@ tools/xngen src/f32-vcopysign/copysign.c.in -D ARCH=avx512f -D BATCH_TILES=16 tools/xngen src/f32-vcopysign/copysignc.c.in -D ARCH=avx512f -D 
BATCH_TILES=16,32,48,64 -o src/f32-vcopysign/gen/f32-vcopysignc-avx512f.c & tools/xngen src/f32-vcopysign/rcopysignc.c.in -D ARCH=avx512f -D BATCH_TILES=16,32,48,64 -o src/f32-vcopysign/gen/f32-vrcopysignc-avx512f.c & +##################################### RISC-V Vector ############################ +tools/xngen src/f32-vcopysign/rvv.c.in -D LMUL=4 -D OP=COPYSIGN -o src/f32-vcopysign/gen/f32-vcopysign-rvv-u4v.c & +tools/xngen src/f32-vcopysign/rvv.c.in -D LMUL=8 -D OP=COPYSIGN -o src/f32-vcopysign/gen/f32-vcopysign-rvv-u8v.c & + +tools/xngen src/f32-vcopysign/rvv.c.in -D LMUL=4 -D OP=COPYSIGNC -o src/f32-vcopysign/gen/f32-vcopysignc-rvv-u4v.c & +tools/xngen src/f32-vcopysign/rvv.c.in -D LMUL=8 -D OP=COPYSIGNC -o src/f32-vcopysign/gen/f32-vcopysignc-rvv-u8v.c & + +tools/xngen src/f32-vcopysign/rvv.c.in -D LMUL=4 -D OP=RCOPYSIGNC -o src/f32-vcopysign/gen/f32-vrcopysignc-rvv-u4v.c & +tools/xngen src/f32-vcopysign/rvv.c.in -D LMUL=8 -D OP=RCOPYSIGNC -o src/f32-vcopysign/gen/f32-vrcopysignc-rvv-u8v.c & + wait diff --git a/src/configs/binary-elementwise-config.c b/src/configs/binary-elementwise-config.c index 525a0b1e401..d772bea8375 100644 --- a/src/configs/binary-elementwise-config.c +++ b/src/configs/binary-elementwise-config.c @@ -632,6 +632,20 @@ static void init_f32_vcopysign_config(void) { f32_vcopysign_config.ropc_ukernel = XNN_INIT_BINARY_UKERNEL(xnn_f32_vrcopysignc_ukernel__hvx_u128); f32_vcopysign_config.element_tile = 128; } + #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + assert(hardware_config != NULL); + if (hardware_config->arch_flags & xnn_arch_riscv_vector) { + f32_vcopysign_config.op_ukernel = XNN_INIT_BINARY_UKERNEL(xnn_f32_vcopysign_ukernel__rvv_u8v); + f32_vcopysign_config.opc_ukernel = XNN_INIT_BINARY_UKERNEL(xnn_f32_vcopysignc_ukernel__rvv_u8v); + f32_vcopysign_config.ropc_ukernel = XNN_INIT_BINARY_UKERNEL(xnn_f32_vrcopysignc_ukernel__rvv_u8v); + 
f32_vcopysign_config.element_tile = 8 * hardware_config->vlenb / sizeof(float); + } else { + f32_vcopysign_config.op_ukernel = XNN_INIT_BINARY_UKERNEL(xnn_f32_vcopysign_ukernel__scalar_u2); + f32_vcopysign_config.opc_ukernel = XNN_INIT_BINARY_UKERNEL(xnn_f32_vcopysignc_ukernel__scalar_u2); + f32_vcopysign_config.ropc_ukernel = XNN_INIT_BINARY_UKERNEL(xnn_f32_vrcopysignc_ukernel__scalar_u2); + f32_vcopysign_config.element_tile = 8; + } #else f32_vcopysign_config.op_ukernel = XNN_INIT_BINARY_UKERNEL(xnn_f32_vcopysign_ukernel__scalar_u2); f32_vcopysign_config.opc_ukernel = XNN_INIT_BINARY_UKERNEL(xnn_f32_vcopysignc_ukernel__scalar_u2); diff --git a/src/configs/unary-elementwise-config.c b/src/configs/unary-elementwise-config.c index 00dda5e9704..475faadc1ea 100644 --- a/src/configs/unary-elementwise-config.c +++ b/src/configs/unary-elementwise-config.c @@ -339,6 +339,13 @@ static void init_f16_hswish_config(void) { f16_hswish_config.element_tile = 16; } #endif + #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + assert(hardware_config != NULL); + if (hardware_config->arch_flags & xnn_arch_riscv_vector_fp16_arith) { + f16_hswish_config.ukernel = XNN_INIT_UNARY_UKERNEL(xnn_f16_vhswish_ukernel__rvvfp16arith_u8v); + f16_hswish_config.element_tile = 8 * hardware_config->vlenb / sizeof(xnn_float16); + } #endif } @@ -372,6 +379,14 @@ static void init_f16_lrelu_config(void) { f16_lrelu_config.init = (xnn_init_unary_uparams_fn) xnn_init_f16_lrelu_scalar_params; } #endif + #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + assert(hardware_config != NULL); + if (hardware_config->arch_flags & xnn_arch_riscv_vector_fp16_arith) { + f16_lrelu_config.ukernel = XNN_INIT_UNARY_UKERNEL(xnn_f16_vlrelu_ukernel__rvvfp16arith_u8v); + f16_lrelu_config.element_tile = 8 * hardware_config->vlenb / sizeof(xnn_float16); + 
f16_lrelu_config.init = (xnn_init_unary_uparams_fn) xnn_init_f16_lrelu_scalar_params; + } #endif } @@ -436,6 +451,13 @@ static void init_f16_rndd_config(void) { f16_rndd_config.element_tile = 16; } #endif + #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + assert(hardware_config != NULL); + if (hardware_config->arch_flags & xnn_arch_riscv_vector_fp16_arith) { + f16_rndd_config.ukernel = XNN_INIT_UNARY_UKERNEL(xnn_f16_vrndd_ukernel__rvvfp16arith_u8v); + f16_rndd_config.element_tile = 8 * hardware_config->vlenb / sizeof(xnn_float16); + } #endif } @@ -466,6 +488,13 @@ static void init_f16_rndne_config(void) { f16_rndne_config.element_tile = 16; } #endif + #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + assert(hardware_config != NULL); + if (hardware_config->arch_flags & xnn_arch_riscv_vector_fp16_arith) { + f16_rndne_config.ukernel = XNN_INIT_UNARY_UKERNEL(xnn_f16_vrndne_ukernel__rvvfp16arith_u8v); + f16_rndne_config.element_tile = 8 * hardware_config->vlenb / sizeof(xnn_float16); + } #endif } @@ -496,6 +525,13 @@ static void init_f16_rndu_config(void) { f16_rndu_config.element_tile = 16; } #endif + #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + assert(hardware_config != NULL); + if (hardware_config->arch_flags & xnn_arch_riscv_vector_fp16_arith) { + f16_rndu_config.ukernel = XNN_INIT_UNARY_UKERNEL(xnn_f16_vrndu_ukernel__rvvfp16arith_u8v); + f16_rndu_config.element_tile = 8 * hardware_config->vlenb / sizeof(xnn_float16); + } #endif } @@ -526,6 +562,13 @@ static void init_f16_rndz_config(void) { f16_rndz_config.element_tile = 16; } #endif + #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + assert(hardware_config != NULL); 
+ if (hardware_config->arch_flags & xnn_arch_riscv_vector_fp16_arith) { + f16_rndz_config.ukernel = XNN_INIT_UNARY_UKERNEL(xnn_f16_vrndz_ukernel__rvvfp16arith_u8v); + f16_rndz_config.element_tile = 8 * hardware_config->vlenb / sizeof(xnn_float16); + } #endif } @@ -556,6 +599,13 @@ static void init_f16_rsqrt_config(void) { f16_rsqrt_config.element_tile = 32; } #endif + #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + assert(hardware_config != NULL); + if (hardware_config->arch_flags & xnn_arch_riscv_vector_fp16_arith) { + f16_rsqrt_config.ukernel = XNN_INIT_UNARY_UKERNEL(xnn_f16_vrsqrt_ukernel__rvvfp16arith_rsqrt_u4v); + f16_rsqrt_config.element_tile = 4 * hardware_config->vlenb / sizeof(xnn_float16); + } #endif } @@ -665,6 +715,13 @@ static void init_f16_sqrt_config(void) { f16_sqrt_config.element_tile = 32; } #endif + #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + assert(hardware_config != NULL); + if (hardware_config->arch_flags & xnn_arch_riscv_vector_fp16_arith) { + f16_sqrt_config.ukernel = XNN_INIT_UNARY_UKERNEL(xnn_f16_vsqrt_ukernel__rvvfp16arith_sqrt_u8v); + f16_sqrt_config.element_tile = 8 * hardware_config->vlenb / sizeof(xnn_float16); + } #endif } diff --git a/src/f16-vhswish/f16-vhswish.inc b/src/f16-vhswish/f16-vhswish.inc index 09402fc6250..0c5350e6b8c 100644 --- a/src/f16-vhswish/f16-vhswish.inc +++ b/src/f16-vhswish/f16-vhswish.inc @@ -15,3 +15,7 @@ XNN_UKERNEL(xnn_arch_x86_f16c, xnn_f16_vhswish_ukernel__f16c_u8, 8, false, xnn_f XNN_UKERNEL(xnn_arch_x86_f16c, xnn_f16_vhswish_ukernel__f16c_u16, 16, false, xnn_float16, struct xnn_f16_default_params, NULL) #endif // XNN_ENABLE_F16C && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +#if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR +XNN_UKERNEL(xnn_arch_riscv_vector_fp16_arith, xnn_f16_vhswish_ukernel__rvvfp16arith_u4v, 4, true, 
xnn_float16, struct xnn_f16_default_params, NULL) +XNN_UKERNEL(xnn_arch_riscv_vector_fp16_arith, xnn_f16_vhswish_ukernel__rvvfp16arith_u8v, 8, true, xnn_float16, struct xnn_f16_default_params, NULL) +#endif // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR diff --git a/src/f16-vhswish/gen/f16-vhswish-rvvfp16arith-u4v.c b/src/f16-vhswish/gen/f16-vhswish-rvvfp16arith-u4v.c new file mode 100644 index 00000000000..a602a138339 --- /dev/null +++ b/src/f16-vhswish/gen/f16-vhswish-rvvfp16arith-u4v.c @@ -0,0 +1,50 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f16-vhswish/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2026 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/vunary.h" + + +void xnn_f16_vhswish_ukernel__rvvfp16arith_u4v( + size_t batch, + const xnn_float16* input, + xnn_float16* output, + const struct xnn_f16_default_params* restrict params) +{ + assert(batch != 0); + assert(batch % sizeof(xnn_float16) == 0); + assert(input != NULL); + assert(output != NULL); + + const xnn_float16 vsixth = 0x1.555556p-3f; + const xnn_float16 vthree = 3.0f; + const xnn_float16 vsix = 6.0f; + const xnn_float16 vzero = 0.0f; + + batch >>= XNN_LOG2_SIZEOF_FLOAT16; + do { + const size_t vl = __riscv_vsetvl_e16m4(batch); + vfloat16m4_t vx = __riscv_vle16_v_f16m4(input, vl); + input += vl; + vfloat16m4_t vacc = __riscv_vfadd(vx, vthree, vl); + vx = __riscv_vfmul(vx, vsixth, vl); + vacc = __riscv_vfmax(vacc, vzero, vl); + vacc = __riscv_vfmin(vacc, vsix, vl); + vacc = __riscv_vfmul(vacc, vx, vl); + __riscv_vse16(output, vacc, vl); + output += vl; + + batch -= vl; + } while (batch != 0); +} diff --git a/src/f16-vhswish/gen/f16-vhswish-rvvfp16arith-u8v.c b/src/f16-vhswish/gen/f16-vhswish-rvvfp16arith-u8v.c new file mode 100644 index 00000000000..f955e111f8d --- /dev/null 
+++ b/src/f16-vhswish/gen/f16-vhswish-rvvfp16arith-u8v.c @@ -0,0 +1,50 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f16-vhswish/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2026 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/vunary.h" + + +void xnn_f16_vhswish_ukernel__rvvfp16arith_u8v( + size_t batch, + const xnn_float16* input, + xnn_float16* output, + const struct xnn_f16_default_params* restrict params) +{ + assert(batch != 0); + assert(batch % sizeof(xnn_float16) == 0); + assert(input != NULL); + assert(output != NULL); + + const xnn_float16 vsixth = 0x1.555556p-3f; + const xnn_float16 vthree = 3.0f; + const xnn_float16 vsix = 6.0f; + const xnn_float16 vzero = 0.0f; + + batch >>= XNN_LOG2_SIZEOF_FLOAT16; + do { + const size_t vl = __riscv_vsetvl_e16m8(batch); + vfloat16m8_t vx = __riscv_vle16_v_f16m8(input, vl); + input += vl; + vfloat16m8_t vacc = __riscv_vfadd(vx, vthree, vl); + vx = __riscv_vfmul(vx, vsixth, vl); + vacc = __riscv_vfmax(vacc, vzero, vl); + vacc = __riscv_vfmin(vacc, vsix, vl); + vacc = __riscv_vfmul(vacc, vx, vl); + __riscv_vse16(output, vacc, vl); + output += vl; + + batch -= vl; + } while (batch != 0); +} diff --git a/src/f16-vhswish/rvv.c.in b/src/f16-vhswish/rvv.c.in new file mode 100644 index 00000000000..a843e06c34f --- /dev/null +++ b/src/f16-vhswish/rvv.c.in @@ -0,0 +1,46 @@ +// Copyright 2026 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +$assert LMUL in [1, 2, 4, 8] +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/vunary.h" + + +void xnn_f16_vhswish_ukernel__rvvfp16arith_u${LMUL}v( + size_t batch, + const xnn_float16* input, + xnn_float16* output, + const struct xnn_f16_default_params* restrict params) +{ + assert(batch != 0); + assert(batch % sizeof(xnn_float16) == 0); + assert(input != NULL); + assert(output != NULL); + + const xnn_float16 vsixth = 0x1.555556p-3f; + const xnn_float16 vthree = 3.0f; + const xnn_float16 vsix = 6.0f; + const xnn_float16 vzero = 0.0f; + + batch >>= XNN_LOG2_SIZEOF_FLOAT16; + do { + const size_t vl = __riscv_vsetvl_e16m${LMUL}(batch); + vfloat16m${LMUL}_t vx = __riscv_vle16_v_f16m${LMUL}(input, vl); + input += vl; + vfloat16m${LMUL}_t vacc = __riscv_vfadd(vx, vthree, vl); + vx = __riscv_vfmul(vx, vsixth, vl); + vacc = __riscv_vfmax(vacc, vzero, vl); + vacc = __riscv_vfmin(vacc, vsix, vl); + vacc = __riscv_vfmul(vacc, vx, vl); + __riscv_vse16(output, vacc, vl); + output += vl; + + batch -= vl; + } while (batch != 0); +} diff --git a/src/f16-vlrelu/f16-vlrelu.inc b/src/f16-vlrelu/f16-vlrelu.inc index a873595220d..30f02c9c426 100644 --- a/src/f16-vlrelu/f16-vlrelu.inc +++ b/src/f16-vlrelu/f16-vlrelu.inc @@ -15,3 +15,7 @@ XNN_UKERNEL(xnn_arch_x86_f16c, xnn_f16_vlrelu_ukernel__f16c_u8, 8, false, xnn_fl XNN_UKERNEL(xnn_arch_x86_f16c, xnn_f16_vlrelu_ukernel__f16c_u16, 16, false, xnn_float16, struct xnn_f16_lrelu_params, xnn_init_f16_lrelu_scalar_params) #endif // XNN_ENABLE_F16C && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +#if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR +XNN_UKERNEL(xnn_arch_riscv_vector_fp16_arith, xnn_f16_vlrelu_ukernel__rvvfp16arith_u4v, 4, true, xnn_float16, struct xnn_f16_lrelu_params, xnn_init_f16_lrelu_scalar_params) +XNN_UKERNEL(xnn_arch_riscv_vector_fp16_arith, xnn_f16_vlrelu_ukernel__rvvfp16arith_u8v, 8, true, xnn_float16, struct xnn_f16_lrelu_params, xnn_init_f16_lrelu_scalar_params) +#endif // XNN_ARCH_RISCV && 
XNN_ENABLE_RISCV_FP16_VECTOR diff --git a/src/f16-vlrelu/gen/f16-vlrelu-rvvfp16arith-u4v.c b/src/f16-vlrelu/gen/f16-vlrelu-rvvfp16arith-u4v.c new file mode 100644 index 00000000000..0254168f7ce --- /dev/null +++ b/src/f16-vlrelu/gen/f16-vlrelu-rvvfp16arith-u4v.c @@ -0,0 +1,40 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f16-vlrelu/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2026 Imagination Technologies, Inc. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include +#include "src/xnnpack/common.h" +#include "src/xnnpack/vunary.h" + +void xnn_f16_vlrelu_ukernel__rvvfp16arith_u4v( + size_t batch, + const xnn_float16* input, + xnn_float16* output, + const struct xnn_f16_lrelu_params* restrict params) +{ + assert(batch != 0); + assert(batch % sizeof(xnn_float16) == 0); + assert(input != NULL); + assert(output != NULL); + + const xnn_float16 slope = params->scalar.slope; + batch >>= XNN_LOG2_SIZEOF_FLOAT16; + + do { + size_t vl = __riscv_vsetvl_e16m4(batch); + vfloat16m4_t in_f16v = __riscv_vle16_v_f16m4(input, vl); input += vl; + vbool4_t mask_f16v = __riscv_vmflt(in_f16v, 0.0f, vl); + vfloat16m4_t out_f16v = __riscv_vfmul(mask_f16v, in_f16v, slope, vl); + __riscv_vse16(output, out_f16v, vl); output += vl; + + batch -= vl; + } while (batch != 0); +} diff --git a/src/f16-vlrelu/gen/f16-vlrelu-rvvfp16arith-u8v.c b/src/f16-vlrelu/gen/f16-vlrelu-rvvfp16arith-u8v.c new file mode 100644 index 00000000000..12f5cbaaa7d --- /dev/null +++ b/src/f16-vlrelu/gen/f16-vlrelu-rvvfp16arith-u8v.c @@ -0,0 +1,40 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f16-vlrelu/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2026 Imagination Technologies, Inc. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include + +#include +#include "src/xnnpack/common.h" +#include "src/xnnpack/vunary.h" + +void xnn_f16_vlrelu_ukernel__rvvfp16arith_u8v( + size_t batch, + const xnn_float16* input, + xnn_float16* output, + const struct xnn_f16_lrelu_params* restrict params) +{ + assert(batch != 0); + assert(batch % sizeof(xnn_float16) == 0); + assert(input != NULL); + assert(output != NULL); + + const xnn_float16 slope = params->scalar.slope; + batch >>= XNN_LOG2_SIZEOF_FLOAT16; + + do { + size_t vl = __riscv_vsetvl_e16m8(batch); + vfloat16m8_t in_f16v = __riscv_vle16_v_f16m8(input, vl); input += vl; + vbool2_t mask_f16v = __riscv_vmflt(in_f16v, 0.0f, vl); + vfloat16m8_t out_f16v = __riscv_vfmul(mask_f16v, in_f16v, slope, vl); + __riscv_vse16(output, out_f16v, vl); output += vl; + + batch -= vl; + } while (batch != 0); +} diff --git a/src/f16-vlrelu/rvv.c.in b/src/f16-vlrelu/rvv.c.in new file mode 100644 index 00000000000..062ff8426f5 --- /dev/null +++ b/src/f16-vlrelu/rvv.c.in @@ -0,0 +1,36 @@ +// Copyright 2026 Imagination Technologies, Inc. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +$assert LMUL in [1, 2, 4, 8] +#include + +#include +#include "src/xnnpack/common.h" +#include "src/xnnpack/vunary.h" + +void xnn_f16_vlrelu_ukernel__rvvfp16arith_u${LMUL}v( + size_t batch, + const xnn_float16* input, + xnn_float16* output, + const struct xnn_f16_lrelu_params* restrict params) +{ + assert(batch != 0); + assert(batch % sizeof(xnn_float16) == 0); + assert(input != NULL); + assert(output != NULL); + + const xnn_float16 slope = params->scalar.slope; + batch >>= XNN_LOG2_SIZEOF_FLOAT16; + + do { + size_t vl = __riscv_vsetvl_e16m${LMUL}(batch); + vfloat16m${LMUL}_t in_f16v = __riscv_vle16_v_f16m${LMUL}(input, vl); input += vl; + vbool${16//LMUL}_t mask_f16v = __riscv_vmflt(in_f16v, 0.0f, vl); + vfloat16m${LMUL}_t out_f16v = __riscv_vfmul(mask_f16v, in_f16v, slope, vl); + __riscv_vse16(output, out_f16v, vl); output += vl; + + batch -= vl; + } while (batch != 0); +} diff --git a/src/f16-vrnd/f16-vrndd.inc b/src/f16-vrnd/f16-vrndd.inc index 0293e05084c..4ba33362d86 100644 --- a/src/f16-vrnd/f16-vrndd.inc +++ b/src/f16-vrnd/f16-vrndd.inc @@ -15,3 +15,7 @@ XNN_UKERNEL(xnn_arch_x86_f16c, xnn_f16_vrndd_ukernel__f16c_u8, 8, false, xnn_flo XNN_UKERNEL(xnn_arch_x86_f16c, xnn_f16_vrndd_ukernel__f16c_u16, 16, false, xnn_float16, struct xnn_f16_default_params, NULL) #endif // XNN_ENABLE_F16C && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +#if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR +XNN_UKERNEL(xnn_arch_riscv_vector_fp16_arith, xnn_f16_vrndd_ukernel__rvvfp16arith_u4v, 4, true, xnn_float16, struct xnn_f16_default_params, NULL) +XNN_UKERNEL(xnn_arch_riscv_vector_fp16_arith, xnn_f16_vrndd_ukernel__rvvfp16arith_u8v, 8, true, xnn_float16, struct xnn_f16_default_params, NULL) +#endif // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR diff --git a/src/f16-vrnd/f16-vrndne.inc b/src/f16-vrnd/f16-vrndne.inc index cee077888e7..8e4e4a9e3ef 100644 --- a/src/f16-vrnd/f16-vrndne.inc +++ b/src/f16-vrnd/f16-vrndne.inc @@ -15,3 +15,7 @@ XNN_UKERNEL(xnn_arch_x86_f16c, 
xnn_f16_vrndne_ukernel__f16c_u8, 8, false, xnn_fl XNN_UKERNEL(xnn_arch_x86_f16c, xnn_f16_vrndne_ukernel__f16c_u16, 16, false, xnn_float16, struct xnn_f16_default_params, NULL) #endif // XNN_ENABLE_F16C && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +#if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR +XNN_UKERNEL(xnn_arch_riscv_vector_fp16_arith, xnn_f16_vrndne_ukernel__rvvfp16arith_u4v, 4, true, xnn_float16, struct xnn_f16_default_params, NULL) +XNN_UKERNEL(xnn_arch_riscv_vector_fp16_arith, xnn_f16_vrndne_ukernel__rvvfp16arith_u8v, 8, true, xnn_float16, struct xnn_f16_default_params, NULL) +#endif // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR diff --git a/src/f16-vrnd/f16-vrndu.inc b/src/f16-vrnd/f16-vrndu.inc index bf2a2a3426b..7414e88b336 100644 --- a/src/f16-vrnd/f16-vrndu.inc +++ b/src/f16-vrnd/f16-vrndu.inc @@ -15,3 +15,7 @@ XNN_UKERNEL(xnn_arch_x86_f16c, xnn_f16_vrndu_ukernel__f16c_u8, 8, false, xnn_flo XNN_UKERNEL(xnn_arch_x86_f16c, xnn_f16_vrndu_ukernel__f16c_u16, 16, false, xnn_float16, struct xnn_f16_default_params, NULL) #endif // XNN_ENABLE_F16C && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +#if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR +XNN_UKERNEL(xnn_arch_riscv_vector_fp16_arith, xnn_f16_vrndu_ukernel__rvvfp16arith_u4v, 4, true, xnn_float16, struct xnn_f16_default_params, NULL) +XNN_UKERNEL(xnn_arch_riscv_vector_fp16_arith, xnn_f16_vrndu_ukernel__rvvfp16arith_u8v, 8, true, xnn_float16, struct xnn_f16_default_params, NULL) +#endif // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR diff --git a/src/f16-vrnd/f16-vrndz.inc b/src/f16-vrnd/f16-vrndz.inc index d9af324eb64..d971ce2e6b6 100644 --- a/src/f16-vrnd/f16-vrndz.inc +++ b/src/f16-vrnd/f16-vrndz.inc @@ -15,3 +15,8 @@ XNN_UKERNEL(xnn_arch_x86_f16c, xnn_f16_vrndz_ukernel__f16c_u8, 8, false, xnn_flo XNN_UKERNEL(xnn_arch_x86_f16c, xnn_f16_vrndz_ukernel__f16c_u16, 16, false, xnn_float16, struct xnn_f16_default_params, NULL) #endif // XNN_ENABLE_F16C && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +#if XNN_ARCH_RISCV && 
XNN_ENABLE_RISCV_FP16_VECTOR +XNN_UKERNEL(xnn_arch_riscv_vector_fp16_arith, xnn_f16_vrndz_ukernel__rvvfp16arith_u4v, 4, true, xnn_float16, struct xnn_f16_default_params, NULL) +XNN_UKERNEL(xnn_arch_riscv_vector_fp16_arith, xnn_f16_vrndz_ukernel__rvvfp16arith_u8v, 8, true, xnn_float16, struct xnn_f16_default_params, NULL) +#endif // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR + diff --git a/src/f16-vrnd/gen/f16-vrndd-rvvfp16arith-u4v.c b/src/f16-vrnd/gen/f16-vrndd-rvvfp16arith-u4v.c new file mode 100644 index 00000000000..6d492e9153f --- /dev/null +++ b/src/f16-vrnd/gen/f16-vrndd-rvvfp16arith-u4v.c @@ -0,0 +1,50 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f16-vrnd/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2026 Imagination Technologies inc. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/math.h" +#include "src/xnnpack/vunary.h" + + +void xnn_f16_vrndd_ukernel__rvvfp16arith_u4v( + size_t batch, + const xnn_float16* input, + xnn_float16* output, + const struct xnn_f16_default_params* restrict params) XNN_OOB_READS +{ + assert(batch != 0); + assert(batch % sizeof(xnn_float16) == 0); + assert(input != NULL); + assert(output != NULL); + + batch >>= XNN_LOG2_SIZEOF_FLOAT16; + do { + const size_t vl = __riscv_vsetvl_e16m4(batch); + vfloat16m4_t x_f16v = __riscv_vle16_v_f16m4(input, vl); input += vl; + + // preserve NaN + vbool4_t nan_bv = __riscv_vmfeq(x_f16v, x_f16v, vl); + // magnitude < (1 << FLT16_MANT_DIG) + vfloat16m4_t mag = __riscv_vfabs(x_f16v, vl); + vbool4_t mag_bv = __riscv_vmflt(mag, (1 << __FLT16_MANT_DIG__), vl); + vbool4_t mask_bv = __riscv_vmnand(nan_bv, mag_bv, vl); + + vint16m4_t x_rnd_i16v = __riscv_vfcvt_x(x_f16v, __RISCV_FRM_RDN, vl); + vfloat16m4_t out_f16v = __riscv_vfcvt_f(x_rnd_i16v, vl); + out_f16v = 
__riscv_vmerge(out_f16v, x_f16v, mask_bv, vl); + __riscv_vse16(output, out_f16v, vl); output += vl; + batch -= vl; + } while (batch != 0); +} diff --git a/src/f16-vrnd/gen/f16-vrndd-rvvfp16arith-u8v.c b/src/f16-vrnd/gen/f16-vrndd-rvvfp16arith-u8v.c new file mode 100644 index 00000000000..d8a044e3dfc --- /dev/null +++ b/src/f16-vrnd/gen/f16-vrndd-rvvfp16arith-u8v.c @@ -0,0 +1,50 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f16-vrnd/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2026 Imagination Technologies inc. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/math.h" +#include "src/xnnpack/vunary.h" + + +void xnn_f16_vrndd_ukernel__rvvfp16arith_u8v( + size_t batch, + const xnn_float16* input, + xnn_float16* output, + const struct xnn_f16_default_params* restrict params) XNN_OOB_READS +{ + assert(batch != 0); + assert(batch % sizeof(xnn_float16) == 0); + assert(input != NULL); + assert(output != NULL); + + batch >>= XNN_LOG2_SIZEOF_FLOAT16; + do { + const size_t vl = __riscv_vsetvl_e16m8(batch); + vfloat16m8_t x_f16v = __riscv_vle16_v_f16m8(input, vl); input += vl; + + // preserve NaN + vbool2_t nan_bv = __riscv_vmfeq(x_f16v, x_f16v, vl); + // magnitude < (1 << FLT16_MANT_DIG) + vfloat16m8_t mag = __riscv_vfabs(x_f16v, vl); + vbool2_t mag_bv = __riscv_vmflt(mag, (1 << __FLT16_MANT_DIG__), vl); + vbool2_t mask_bv = __riscv_vmnand(nan_bv, mag_bv, vl); + + vint16m8_t x_rnd_i16v = __riscv_vfcvt_x(x_f16v, __RISCV_FRM_RDN, vl); + vfloat16m8_t out_f16v = __riscv_vfcvt_f(x_rnd_i16v, vl); + out_f16v = __riscv_vmerge(out_f16v, x_f16v, mask_bv, vl); + __riscv_vse16(output, out_f16v, vl); output += vl; + batch -= vl; + } while (batch != 0); +} diff --git a/src/f16-vrnd/gen/f16-vrndne-rvvfp16arith-u4v.c b/src/f16-vrnd/gen/f16-vrndne-rvvfp16arith-u4v.c new file 
mode 100644 index 00000000000..8721794794a --- /dev/null +++ b/src/f16-vrnd/gen/f16-vrndne-rvvfp16arith-u4v.c @@ -0,0 +1,50 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f16-vrnd/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2026 Imagination Technologies inc. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/math.h" +#include "src/xnnpack/vunary.h" + + +void xnn_f16_vrndne_ukernel__rvvfp16arith_u4v( + size_t batch, + const xnn_float16* input, + xnn_float16* output, + const struct xnn_f16_default_params* restrict params) XNN_OOB_READS +{ + assert(batch != 0); + assert(batch % sizeof(xnn_float16) == 0); + assert(input != NULL); + assert(output != NULL); + + batch >>= XNN_LOG2_SIZEOF_FLOAT16; + do { + const size_t vl = __riscv_vsetvl_e16m4(batch); + vfloat16m4_t x_f16v = __riscv_vle16_v_f16m4(input, vl); input += vl; + + // preserve NaN + vbool4_t nan_bv = __riscv_vmfeq(x_f16v, x_f16v, vl); + // magnitude < (1 << FLT16_MANT_DIG) + vfloat16m4_t mag = __riscv_vfabs(x_f16v, vl); + vbool4_t mag_bv = __riscv_vmflt(mag, (1 << __FLT16_MANT_DIG__), vl); + vbool4_t mask_bv = __riscv_vmnand(nan_bv, mag_bv, vl); + + vint16m4_t x_rnd_i16v = __riscv_vfcvt_x(x_f16v, __RISCV_FRM_RNE, vl); + vfloat16m4_t out_f16v = __riscv_vfcvt_f(x_rnd_i16v, vl); + out_f16v = __riscv_vmerge(out_f16v, x_f16v, mask_bv, vl); + __riscv_vse16(output, out_f16v, vl); output += vl; + batch -= vl; + } while (batch != 0); +} diff --git a/src/f16-vrnd/gen/f16-vrndne-rvvfp16arith-u8v.c b/src/f16-vrnd/gen/f16-vrndne-rvvfp16arith-u8v.c new file mode 100644 index 00000000000..e93ea8dc05e --- /dev/null +++ b/src/f16-vrnd/gen/f16-vrndne-rvvfp16arith-u8v.c @@ -0,0 +1,50 @@ +// clang-format off +// Auto-generated file. Do not edit! 
+// Template: src/f16-vrnd/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2026 Imagination Technologies inc. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/math.h" +#include "src/xnnpack/vunary.h" + + +void xnn_f16_vrndne_ukernel__rvvfp16arith_u8v( + size_t batch, + const xnn_float16* input, + xnn_float16* output, + const struct xnn_f16_default_params* restrict params) XNN_OOB_READS +{ + assert(batch != 0); + assert(batch % sizeof(xnn_float16) == 0); + assert(input != NULL); + assert(output != NULL); + + batch >>= XNN_LOG2_SIZEOF_FLOAT16; + do { + const size_t vl = __riscv_vsetvl_e16m8(batch); + vfloat16m8_t x_f16v = __riscv_vle16_v_f16m8(input, vl); input += vl; + + // preserve NaN + vbool2_t nan_bv = __riscv_vmfeq(x_f16v, x_f16v, vl); + // magnitude < (1 << FLT16_MANT_DIG) + vfloat16m8_t mag = __riscv_vfabs(x_f16v, vl); + vbool2_t mag_bv = __riscv_vmflt(mag, (1 << __FLT16_MANT_DIG__), vl); + vbool2_t mask_bv = __riscv_vmnand(nan_bv, mag_bv, vl); + + vint16m8_t x_rnd_i16v = __riscv_vfcvt_x(x_f16v, __RISCV_FRM_RNE, vl); + vfloat16m8_t out_f16v = __riscv_vfcvt_f(x_rnd_i16v, vl); + out_f16v = __riscv_vmerge(out_f16v, x_f16v, mask_bv, vl); + __riscv_vse16(output, out_f16v, vl); output += vl; + batch -= vl; + } while (batch != 0); +} diff --git a/src/f16-vrnd/gen/f16-vrndu-rvvfp16arith-u4v.c b/src/f16-vrnd/gen/f16-vrndu-rvvfp16arith-u4v.c new file mode 100644 index 00000000000..6379b914eb0 --- /dev/null +++ b/src/f16-vrnd/gen/f16-vrndu-rvvfp16arith-u4v.c @@ -0,0 +1,50 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f16-vrnd/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2026 Imagination Technologies inc. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/math.h" +#include "src/xnnpack/vunary.h" + + +void xnn_f16_vrndu_ukernel__rvvfp16arith_u4v( + size_t batch, + const xnn_float16* input, + xnn_float16* output, + const struct xnn_f16_default_params* restrict params) XNN_OOB_READS +{ + assert(batch != 0); + assert(batch % sizeof(xnn_float16) == 0); + assert(input != NULL); + assert(output != NULL); + + batch >>= XNN_LOG2_SIZEOF_FLOAT16; + do { + const size_t vl = __riscv_vsetvl_e16m4(batch); + vfloat16m4_t x_f16v = __riscv_vle16_v_f16m4(input, vl); input += vl; + + // preserve NaN + vbool4_t nan_bv = __riscv_vmfeq(x_f16v, x_f16v, vl); + // magnitude < (1 << FLT16_MANT_DIG) + vfloat16m4_t mag = __riscv_vfabs(x_f16v, vl); + vbool4_t mag_bv = __riscv_vmflt(mag, (1 << __FLT16_MANT_DIG__), vl); + vbool4_t mask_bv = __riscv_vmnand(nan_bv, mag_bv, vl); + + vint16m4_t x_rnd_i16v = __riscv_vfcvt_x(x_f16v, __RISCV_FRM_RUP, vl); + vfloat16m4_t out_f16v = __riscv_vfcvt_f(x_rnd_i16v, vl); + out_f16v = __riscv_vmerge(out_f16v, x_f16v, mask_bv, vl); + __riscv_vse16(output, out_f16v, vl); output += vl; + batch -= vl; + } while (batch != 0); +} diff --git a/src/f16-vrnd/gen/f16-vrndu-rvvfp16arith-u8v.c b/src/f16-vrnd/gen/f16-vrndu-rvvfp16arith-u8v.c new file mode 100644 index 00000000000..723455ec821 --- /dev/null +++ b/src/f16-vrnd/gen/f16-vrndu-rvvfp16arith-u8v.c @@ -0,0 +1,50 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f16-vrnd/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2026 Imagination Technologies inc. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/math.h" +#include "src/xnnpack/vunary.h" + + +void xnn_f16_vrndu_ukernel__rvvfp16arith_u8v( + size_t batch, + const xnn_float16* input, + xnn_float16* output, + const struct xnn_f16_default_params* restrict params) XNN_OOB_READS +{ + assert(batch != 0); + assert(batch % sizeof(xnn_float16) == 0); + assert(input != NULL); + assert(output != NULL); + + batch >>= XNN_LOG2_SIZEOF_FLOAT16; + do { + const size_t vl = __riscv_vsetvl_e16m8(batch); + vfloat16m8_t x_f16v = __riscv_vle16_v_f16m8(input, vl); input += vl; + + // preserve NaN + vbool2_t nan_bv = __riscv_vmfeq(x_f16v, x_f16v, vl); + // magnitude < (1 << FLT16_MANT_DIG) + vfloat16m8_t mag = __riscv_vfabs(x_f16v, vl); + vbool2_t mag_bv = __riscv_vmflt(mag, (1 << __FLT16_MANT_DIG__), vl); + vbool2_t mask_bv = __riscv_vmnand(nan_bv, mag_bv, vl); + + vint16m8_t x_rnd_i16v = __riscv_vfcvt_x(x_f16v, __RISCV_FRM_RUP, vl); + vfloat16m8_t out_f16v = __riscv_vfcvt_f(x_rnd_i16v, vl); + out_f16v = __riscv_vmerge(out_f16v, x_f16v, mask_bv, vl); + __riscv_vse16(output, out_f16v, vl); output += vl; + batch -= vl; + } while (batch != 0); +} diff --git a/src/f16-vrnd/gen/f16-vrndz-rvvfp16arith-u4v.c b/src/f16-vrnd/gen/f16-vrndz-rvvfp16arith-u4v.c new file mode 100644 index 00000000000..f25a850016d --- /dev/null +++ b/src/f16-vrnd/gen/f16-vrndz-rvvfp16arith-u4v.c @@ -0,0 +1,50 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f16-vrnd/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2026 Imagination Technologies inc. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/math.h" +#include "src/xnnpack/vunary.h" + + +void xnn_f16_vrndz_ukernel__rvvfp16arith_u4v( + size_t batch, + const xnn_float16* input, + xnn_float16* output, + const struct xnn_f16_default_params* restrict params) XNN_OOB_READS +{ + assert(batch != 0); + assert(batch % sizeof(xnn_float16) == 0); + assert(input != NULL); + assert(output != NULL); + + batch >>= XNN_LOG2_SIZEOF_FLOAT16; + do { + const size_t vl = __riscv_vsetvl_e16m4(batch); + vfloat16m4_t x_f16v = __riscv_vle16_v_f16m4(input, vl); input += vl; + + // preserve NaN + vbool4_t nan_bv = __riscv_vmfeq(x_f16v, x_f16v, vl); + // magnitude < (1 << FLT16_MANT_DIG) + vfloat16m4_t mag = __riscv_vfabs(x_f16v, vl); + vbool4_t mag_bv = __riscv_vmflt(mag, (1 << __FLT16_MANT_DIG__), vl); + vbool4_t mask_bv = __riscv_vmnand(nan_bv, mag_bv, vl); + + vint16m4_t x_rnd_i16v = __riscv_vfcvt_x(x_f16v, __RISCV_FRM_RTZ, vl); + vfloat16m4_t out_f16v = __riscv_vfcvt_f(x_rnd_i16v, vl); + out_f16v = __riscv_vmerge(out_f16v, x_f16v, mask_bv, vl); + __riscv_vse16(output, out_f16v, vl); output += vl; + batch -= vl; + } while (batch != 0); +} diff --git a/src/f16-vrnd/gen/f16-vrndz-rvvfp16arith-u8v.c b/src/f16-vrnd/gen/f16-vrndz-rvvfp16arith-u8v.c new file mode 100644 index 00000000000..591e7a99fd9 --- /dev/null +++ b/src/f16-vrnd/gen/f16-vrndz-rvvfp16arith-u8v.c @@ -0,0 +1,50 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f16-vrnd/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2026 Imagination Technologies inc. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/math.h" +#include "src/xnnpack/vunary.h" + + +void xnn_f16_vrndz_ukernel__rvvfp16arith_u8v( + size_t batch, + const xnn_float16* input, + xnn_float16* output, + const struct xnn_f16_default_params* restrict params) XNN_OOB_READS +{ + assert(batch != 0); + assert(batch % sizeof(xnn_float16) == 0); + assert(input != NULL); + assert(output != NULL); + + batch >>= XNN_LOG2_SIZEOF_FLOAT16; + do { + const size_t vl = __riscv_vsetvl_e16m8(batch); + vfloat16m8_t x_f16v = __riscv_vle16_v_f16m8(input, vl); input += vl; + + // preserve NaN + vbool2_t nan_bv = __riscv_vmfeq(x_f16v, x_f16v, vl); + // magnitude < (1 << FLT16_MANT_DIG) + vfloat16m8_t mag = __riscv_vfabs(x_f16v, vl); + vbool2_t mag_bv = __riscv_vmflt(mag, (1 << __FLT16_MANT_DIG__), vl); + vbool2_t mask_bv = __riscv_vmnand(nan_bv, mag_bv, vl); + + vint16m8_t x_rnd_i16v = __riscv_vfcvt_x(x_f16v, __RISCV_FRM_RTZ, vl); + vfloat16m8_t out_f16v = __riscv_vfcvt_f(x_rnd_i16v, vl); + out_f16v = __riscv_vmerge(out_f16v, x_f16v, mask_bv, vl); + __riscv_vse16(output, out_f16v, vl); output += vl; + batch -= vl; + } while (batch != 0); +} diff --git a/src/f16-vrnd/rvv.c.in b/src/f16-vrnd/rvv.c.in new file mode 100644 index 00000000000..25507cabe40 --- /dev/null +++ b/src/f16-vrnd/rvv.c.in @@ -0,0 +1,53 @@ +// Copyright 2026 Imagination Technologies inc. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +$assert LMUL in [1, 2, 4, 8] +$assert OP in ["RNDNE", "RNDZ", "RNDU", "RNDD"] +#include +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/math.h" +#include "src/xnnpack/vunary.h" + + +$ROUND_MODE = { +$ "RNDNE": "__RISCV_FRM_RNE", +$ "RNDZ": "__RISCV_FRM_RTZ", +$ "RNDU": "__RISCV_FRM_RUP", +$ "RNDD": "__RISCV_FRM_RDN", +$}[OP] +void xnn_f16_v${OP.lower()}_ukernel__rvvfp16arith_u${LMUL}v( + size_t batch, + const xnn_float16* input, + xnn_float16* output, + const struct xnn_f16_default_params* restrict params) XNN_OOB_READS +{ + assert(batch != 0); + assert(batch % sizeof(xnn_float16) == 0); + assert(input != NULL); + assert(output != NULL); + + batch >>= XNN_LOG2_SIZEOF_FLOAT16; + do { + const size_t vl = __riscv_vsetvl_e16m${LMUL}(batch); + vfloat16m${LMUL}_t x_f16v = __riscv_vle16_v_f16m${LMUL}(input, vl); input += vl; + + // preserve NaN + vbool${16//LMUL}_t nan_bv = __riscv_vmfeq(x_f16v, x_f16v, vl); + // magnitude < (1 << FLT16_MANT_DIG) + vfloat16m${LMUL}_t mag = __riscv_vfabs(x_f16v, vl); + vbool${16//LMUL}_t mag_bv = __riscv_vmflt(mag, (1 << __FLT16_MANT_DIG__), vl); + vbool${16//LMUL}_t mask_bv = __riscv_vmnand(nan_bv, mag_bv, vl); + + vint16m${LMUL}_t x_rnd_i16v = __riscv_vfcvt_x(x_f16v, ${ROUND_MODE}, vl); + vfloat16m${LMUL}_t out_f16v = __riscv_vfcvt_f(x_rnd_i16v, vl); + out_f16v = __riscv_vmerge(out_f16v, x_f16v, mask_bv, vl); + __riscv_vse16(output, out_f16v, vl); output += vl; + batch -= vl; + } while (batch != 0); +} diff --git a/src/f16-vrsqrt/f16-vrsqrt.inc b/src/f16-vrsqrt/f16-vrsqrt.inc index cae6c9336eb..f20c36abb3f 100644 --- a/src/f16-vrsqrt/f16-vrsqrt.inc +++ b/src/f16-vrsqrt/f16-vrsqrt.inc @@ -17,3 +17,7 @@ XNN_UKERNEL(xnn_arch_x86_f16c, xnn_f16_vrsqrt_ukernel__f16c_rsqrt_u16, 16, false XNN_UKERNEL(xnn_arch_x86_f16c, xnn_f16_vrsqrt_ukernel__f16c_rsqrt_u32, 32, false, xnn_float16, struct xnn_f16_default_params, NULL) #endif // XNN_ENABLE_F16C && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +#if XNN_ARCH_RISCV && 
XNN_ENABLE_RISCV_FP16_VECTOR +XNN_UKERNEL(xnn_arch_riscv_vector_fp16_arith, xnn_f16_vrsqrt_ukernel__rvvfp16arith_rsqrt_u2v, 2, true, xnn_float16, struct xnn_f16_default_params, NULL) +XNN_UKERNEL(xnn_arch_riscv_vector_fp16_arith, xnn_f16_vrsqrt_ukernel__rvvfp16arith_rsqrt_u4v, 4, true, xnn_float16, struct xnn_f16_default_params, NULL) +#endif // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR diff --git a/src/f16-vrsqrt/gen/f16-vrsqrt-rvvfp16arith-rsqrt-u2v.c b/src/f16-vrsqrt/gen/f16-vrsqrt-rvvfp16arith-rsqrt-u2v.c new file mode 100644 index 00000000000..e7a7bb84c84 --- /dev/null +++ b/src/f16-vrsqrt/gen/f16-vrsqrt-rvvfp16arith-rsqrt-u2v.c @@ -0,0 +1,79 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f16-vrsqrt/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2026 Imagination Technologies, inc. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/vunary.h" + +// In the following, we do a single Newton-Raphson step on the equation +// $x^{-2} - a$, which expands to: +// +// $$x_{k+1} = 0.5 * x_k * (3.0 - a * x_k^2)$$ +// +// So we do the following steps: +// +// 1. t0 = x_k +// 2. t1 = t0 * t0 (x_k^2) +// 3. t2 = a * t1 (a * x_k^2) +// 4. t3 = 3.0 - t2 (3.0 - a * x_k^2) +// 5. t4 = 0.5 * t0 (0.5 * x_k) +// 6. y = t3 * t4 (0.5 * x_k * (3.0 - a * x_k^2)) +// +// Where $x_k$ is the original approximation and `y` contains the improved +// approximation $x_{k+1}$. +// +// Note also that the initial approximation computed by the `vfrsqrt7` +// instruction is only accurate to 7 bits (as opposed to 12 or 14 for x86_64), +// which requires us to do two steps of the above. 
+ +void xnn_f16_vrsqrt_ukernel__rvvfp16arith_rsqrt_u2v( + size_t batch, + const xnn_float16* input, + xnn_float16* output, + const struct xnn_f16_default_params* restrict params) XNN_OOB_READS +{ + assert(batch != 0); + assert(batch % sizeof(xnn_float16) == 0); + assert(input != NULL); + assert(output != NULL); + + batch >>= XNN_LOG2_SIZEOF_FLOAT16; + + vfloat16m2_t onephalf_f16v = __riscv_vfmv_v_f_f16m2(1.5f, __riscv_vsetvl_e16m2(batch)); + vfloat16m2_t zero_f16v = __riscv_vfmv_v_f_f16m2(0.0f, __riscv_vsetvl_e16m2(batch)); + + for (; batch > 0; ) { + size_t n = __riscv_vsetvl_e16m2(batch); batch -= n; + vfloat16m2_t in_f16v = __riscv_vle16_v_f16m2(input, n); input += n; + + vfloat16m2_t t0_f16v = __riscv_vfrsqrt7(in_f16v, n); + vfloat16m2_t in_half_f16v = __riscv_vfmul(in_f16v, 0.5f, n); + + // First Newton-Raphson iteration + vfloat16m2_t t1_f16v = __riscv_vfmul(t0_f16v, t0_f16v, n); + vfloat16m2_t t2_f16v = __riscv_vfnmsub_vv_f16m2(in_half_f16v, t1_f16v, onephalf_f16v, n); + t0_f16v = __riscv_vfmul(t0_f16v, t2_f16v, n); + + // Second Newton-Raphson iteration + t1_f16v = __riscv_vfmul(t0_f16v, t0_f16v, n); + t2_f16v = __riscv_vfnmsub_vv_f16m2(in_half_f16v, t1_f16v, onephalf_f16v, n); + vfloat16m2_t y_f16v = __riscv_vfmul(t0_f16v, t2_f16v, n); + + // Set output to 0 where the input is infinity (and not NaN) + vbool8_t inf_bv = __riscv_vmfeq(in_f16v, INFINITY, n); + y_f16v = __riscv_vmerge(y_f16v, zero_f16v, inf_bv, n); + + __riscv_vse16(output, y_f16v, n); output += n; + } +} diff --git a/src/f16-vrsqrt/gen/f16-vrsqrt-rvvfp16arith-rsqrt-u4v.c b/src/f16-vrsqrt/gen/f16-vrsqrt-rvvfp16arith-rsqrt-u4v.c new file mode 100644 index 00000000000..e282f5f5aa0 --- /dev/null +++ b/src/f16-vrsqrt/gen/f16-vrsqrt-rvvfp16arith-rsqrt-u4v.c @@ -0,0 +1,79 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f16-vrsqrt/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2026 Imagination Technologies, inc. 
+// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/vunary.h" + +// In the following, we do a single Newton-Raphson step on the equation +// $x^{-2} - a$, which expands to: +// +// $$x_{k+1} = 0.5 * x_k * (3.0 - a * x_k^2)$$ +// +// So we do the following steps: +// +// 1. t0 = x_k +// 2. t1 = t0 * t0 (x_k^2) +// 3. t2 = a * t1 (a * x_k^2) +// 4. t3 = 3.0 - t2 (3.0 - a * x_k^2) +// 5. t4 = 0.5 * t0 (0.5 * x_k) +// 6. y = t3 * t4 (0.5 * x_k * (3.0 - a * x_k^2)) +// +// Where $x_k$ is the original approximation and `y` contains the improved +// approximation $x_{k+1}$. +// +// Note also that the initial approximation computed by the `vfrsqrt7` +// instruction is only accurate to 7 bits (as opposed to 12 or 14 for x86_64), +// which requires us to do two steps of the above. + +void xnn_f16_vrsqrt_ukernel__rvvfp16arith_rsqrt_u4v( + size_t batch, + const xnn_float16* input, + xnn_float16* output, + const struct xnn_f16_default_params* restrict params) XNN_OOB_READS +{ + assert(batch != 0); + assert(batch % sizeof(xnn_float16) == 0); + assert(input != NULL); + assert(output != NULL); + + batch >>= XNN_LOG2_SIZEOF_FLOAT16; + + vfloat16m4_t onephalf_f16v = __riscv_vfmv_v_f_f16m4(1.5f, __riscv_vsetvl_e16m4(batch)); + vfloat16m4_t zero_f16v = __riscv_vfmv_v_f_f16m4(0.0f, __riscv_vsetvl_e16m4(batch)); + + for (; batch > 0; ) { + size_t n = __riscv_vsetvl_e16m4(batch); batch -= n; + vfloat16m4_t in_f16v = __riscv_vle16_v_f16m4(input, n); input += n; + + vfloat16m4_t t0_f16v = __riscv_vfrsqrt7(in_f16v, n); + vfloat16m4_t in_half_f16v = __riscv_vfmul(in_f16v, 0.5f, n); + + // First Newton-Raphson iteration + vfloat16m4_t t1_f16v = __riscv_vfmul(t0_f16v, t0_f16v, n); + vfloat16m4_t t2_f16v = __riscv_vfnmsub_vv_f16m4(in_half_f16v, t1_f16v, onephalf_f16v, n); + t0_f16v = __riscv_vfmul(t0_f16v, t2_f16v, n); 
+ + // Second Newton-Raphson iteration + t1_f16v = __riscv_vfmul(t0_f16v, t0_f16v, n); + t2_f16v = __riscv_vfnmsub_vv_f16m4(in_half_f16v, t1_f16v, onephalf_f16v, n); + vfloat16m4_t y_f16v = __riscv_vfmul(t0_f16v, t2_f16v, n); + + // Set output to 0 where the input is infinity (and not NaN) + vbool4_t inf_bv = __riscv_vmfeq(in_f16v, INFINITY, n); + y_f16v = __riscv_vmerge(y_f16v, zero_f16v, inf_bv, n); + + __riscv_vse16(output, y_f16v, n); output += n; + } +} diff --git a/src/f16-vrsqrt/rvv.c.in b/src/f16-vrsqrt/rvv.c.in new file mode 100644 index 00000000000..100fd190b92 --- /dev/null +++ b/src/f16-vrsqrt/rvv.c.in @@ -0,0 +1,75 @@ +// Copyright 2026 Imagination Technologies, inc. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +$assert LMUL in [1, 2, 4, 8] +#include +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/vunary.h" + +// In the following, we do a single Newton-Raphson step on the equation +// $x^{-2} - a$, which expands to: +// +// $$x_{k+1} = 0.5 * x_k * (3.0 - a * x_k^2)$$ +// +// So we do the following steps: +// +// 1. t0 = x_k +// 2. t1 = t0 * t0 (x_k^2) +// 3. t2 = a * t1 (a * x_k^2) +// 4. t3 = 3.0 - t2 (3.0 - a * x_k^2) +// 5. t4 = 0.5 * t0 (0.5 * x_k) +// 6. y = t3 * t4 (0.5 * x_k * (3.0 - a * x_k^2)) +// +// Where $x_k$ is the original approximation and `y` contains the improved +// approximation $x_{k+1}$. +// +// Note also that the initial approximation computed by the `vfrsqrt7` +// instruction is only accurate to 7 bits (as opposed to 12 or 14 for x86_64), +// which requires us to do two steps of the above. 
+ +void xnn_f16_vrsqrt_ukernel__rvvfp16arith_rsqrt_u${LMUL}v( + size_t batch, + const xnn_float16* input, + xnn_float16* output, + const struct xnn_f16_default_params* restrict params) XNN_OOB_READS +{ + assert(batch != 0); + assert(batch % sizeof(xnn_float16) == 0); + assert(input != NULL); + assert(output != NULL); + + batch >>= XNN_LOG2_SIZEOF_FLOAT16; + + vfloat16m${LMUL}_t onephalf_f16v = __riscv_vfmv_v_f_f16m${LMUL}(1.5f, __riscv_vsetvl_e16m${LMUL}(batch)); + vfloat16m${LMUL}_t zero_f16v = __riscv_vfmv_v_f_f16m${LMUL}(0.0f, __riscv_vsetvl_e16m${LMUL}(batch)); + + for (; batch > 0; ) { + size_t n = __riscv_vsetvl_e16m${LMUL}(batch); batch -= n; + vfloat16m${LMUL}_t in_f16v = __riscv_vle16_v_f16m${LMUL}(input, n); input += n; + + vfloat16m${LMUL}_t t0_f16v = __riscv_vfrsqrt7(in_f16v, n); + vfloat16m${LMUL}_t in_half_f16v = __riscv_vfmul(in_f16v, 0.5f, n); + + // First Newton-Raphson iteration + vfloat16m${LMUL}_t t1_f16v = __riscv_vfmul(t0_f16v, t0_f16v, n); + vfloat16m${LMUL}_t t2_f16v = __riscv_vfnmsub_vv_f16m${LMUL}(in_half_f16v, t1_f16v, onephalf_f16v, n); + t0_f16v = __riscv_vfmul(t0_f16v, t2_f16v, n); + + // Second Newton-Raphson iteration + t1_f16v = __riscv_vfmul(t0_f16v, t0_f16v, n); + t2_f16v = __riscv_vfnmsub_vv_f16m${LMUL}(in_half_f16v, t1_f16v, onephalf_f16v, n); + vfloat16m${LMUL}_t y_f16v = __riscv_vfmul(t0_f16v, t2_f16v, n); + + // Set output to 0 where the input is infinity (and not NaN) + vbool${16//LMUL}_t inf_bv = __riscv_vmfeq(in_f16v, INFINITY, n); + y_f16v = __riscv_vmerge(y_f16v, zero_f16v, inf_bv, n); + + __riscv_vse16(output, y_f16v, n); output += n; + } +} diff --git a/src/f16-vsqrt/f16-vsqrt.inc b/src/f16-vsqrt/f16-vsqrt.inc index 71e5838da0b..47f0c78971f 100644 --- a/src/f16-vsqrt/f16-vsqrt.inc +++ b/src/f16-vsqrt/f16-vsqrt.inc @@ -44,3 +44,7 @@ XNN_UKERNEL(xnn_arch_x86_avx512skx, xnn_f16_vsqrt_ukernel__avx512skx_sqrt_u32, 3 XNN_UKERNEL(xnn_arch_x86_avx512skx, xnn_f16_vsqrt_ukernel__avx512skx_sqrt_u64, 64, false, xnn_float16, struct 
xnn_f16_default_params, NULL) #endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +#if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR +XNN_UKERNEL(xnn_arch_riscv_vector_fp16_arith, xnn_f16_vsqrt_ukernel__rvvfp16arith_sqrt_u4v, 4, true, xnn_float16, struct xnn_f16_default_params, NULL) +XNN_UKERNEL(xnn_arch_riscv_vector_fp16_arith, xnn_f16_vsqrt_ukernel__rvvfp16arith_sqrt_u8v, 8, true, xnn_float16, struct xnn_f16_default_params, NULL) +#endif // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR diff --git a/src/f16-vsqrt/gen/f16-vsqrt-rvvfp16arith-sqrt-u4v.c b/src/f16-vsqrt/gen/f16-vsqrt-rvvfp16arith-sqrt-u4v.c new file mode 100644 index 00000000000..6be36e71412 --- /dev/null +++ b/src/f16-vsqrt/gen/f16-vsqrt-rvvfp16arith-sqrt-u4v.c @@ -0,0 +1,42 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f16-vsqrt/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2026 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+
+#include <assert.h>
+#include <stddef.h>
+
+#include <riscv_vector.h>
+
+#include "src/xnnpack/common.h"
+#include "src/xnnpack/vunary.h"
+
+
+void xnn_f16_vsqrt_ukernel__rvvfp16arith_sqrt_u4v(
+    size_t batch,
+    const xnn_float16* input,
+    xnn_float16* output,
+    const struct xnn_f16_default_params* restrict params)
+{
+  assert(batch != 0);
+  assert(batch % sizeof(xnn_float16) == 0);
+  assert(input != NULL);
+  assert(output != NULL);
+
+  batch >>= XNN_LOG2_SIZEOF_FLOAT16;
+  do {
+    const size_t vl = __riscv_vsetvl_e16m4(batch);
+    vfloat16m4_t vx = __riscv_vle16_v_f16m4(input, vl);
+    input += vl;
+    vfloat16m4_t vacc = __riscv_vfsqrt(vx, vl);
+    __riscv_vse16(output, vacc, vl);
+    output += vl;
+
+    batch -= vl;
+  } while (batch != 0);
+}
diff --git a/src/f16-vsqrt/gen/f16-vsqrt-rvvfp16arith-sqrt-u8v.c b/src/f16-vsqrt/gen/f16-vsqrt-rvvfp16arith-sqrt-u8v.c
new file mode 100644
index 00000000000..801dabe212c
--- /dev/null
+++ b/src/f16-vsqrt/gen/f16-vsqrt-rvvfp16arith-sqrt-u8v.c
@@ -0,0 +1,42 @@
+// clang-format off
+// Auto-generated file. Do not edit!
+//   Template: src/f16-vsqrt/rvv.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2026 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stddef.h>
+
+#include <riscv_vector.h>
+
+#include "src/xnnpack/common.h"
+#include "src/xnnpack/vunary.h"
+
+
+void xnn_f16_vsqrt_ukernel__rvvfp16arith_sqrt_u8v(
+    size_t batch,
+    const xnn_float16* input,
+    xnn_float16* output,
+    const struct xnn_f16_default_params* restrict params)
+{
+  assert(batch != 0);
+  assert(batch % sizeof(xnn_float16) == 0);
+  assert(input != NULL);
+  assert(output != NULL);
+
+  batch >>= XNN_LOG2_SIZEOF_FLOAT16;
+  do {
+    const size_t vl = __riscv_vsetvl_e16m8(batch);
+    vfloat16m8_t vx = __riscv_vle16_v_f16m8(input, vl);
+    input += vl;
+    vfloat16m8_t vacc = __riscv_vfsqrt(vx, vl);
+    __riscv_vse16(output, vacc, vl);
+    output += vl;
+
+    batch -= vl;
+  } while (batch != 0);
+}
diff --git a/src/f16-vsqrt/rvv.c.in b/src/f16-vsqrt/rvv.c.in
new file mode 100644
index 00000000000..894a079829d
--- /dev/null
+++ b/src/f16-vsqrt/rvv.c.in
@@ -0,0 +1,38 @@
+// Copyright 2026 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert LMUL in [1, 2, 4, 8]
+#include <assert.h>
+#include <stddef.h>
+
+#include <riscv_vector.h>
+
+#include "src/xnnpack/common.h"
+#include "src/xnnpack/vunary.h"
+
+
+void xnn_f16_vsqrt_ukernel__rvvfp16arith_sqrt_u${LMUL}v(
+    size_t batch,
+    const xnn_float16* input,
+    xnn_float16* output,
+    const struct xnn_f16_default_params* restrict params)
+{
+  assert(batch != 0);
+  assert(batch % sizeof(xnn_float16) == 0);
+  assert(input != NULL);
+  assert(output != NULL);
+
+  batch >>= XNN_LOG2_SIZEOF_FLOAT16;
+  do {
+    const size_t vl = __riscv_vsetvl_e16m${LMUL}(batch);
+    vfloat16m${LMUL}_t vx = __riscv_vle16_v_f16m${LMUL}(input, vl);
+    input += vl;
+    vfloat16m${LMUL}_t vacc = __riscv_vfsqrt(vx, vl);
+    __riscv_vse16(output, vacc, vl);
+    output += vl;
+
+    batch -= vl;
+  } while (batch != 0);
+}
diff --git a/src/f32-vbinary/f32-vcopysign.inc b/src/f32-vbinary/f32-vcopysign.inc
index e747a9e838c..49a1e66263c 100644
--- a/src/f32-vbinary/f32-vcopysign.inc
+++ b/src/f32-vbinary/f32-vcopysign.inc
@@ -30,6 +30,7 @@ XNN_UKERNEL(xnn_arch_x86_avx512f, xnn_f32_vcopysign_ukernel__avx512f_u32, 32, fa
 XNN_UKERNEL(xnn_arch_x86_avx512f, xnn_f32_vcopysign_ukernel__avx512f_u48, 48, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
 XNN_UKERNEL(xnn_arch_x86_avx512f, xnn_f32_vcopysign_ukernel__avx512f_u64, 64, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
 #endif // XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64)
+
 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
 XNN_UKERNEL(xnn_arch_none, xnn_f32_vcopysign_ukernel__wasmsimd_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
 XNN_UKERNEL(xnn_arch_none, xnn_f32_vcopysign_ukernel__wasmsimd_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
@@ -51,3 +52,7 @@ XNN_UKERNEL(xnn_arch_arm_neon, xnn_f32_vcopysign_ukernel__neon_u12, 12, false, f
 XNN_UKERNEL(xnn_arch_arm_neon, xnn_f32_vcopysign_ukernel__neon_u16, 16,
false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 +#if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR +XNN_UKERNEL(xnn_arch_riscv_vector, xnn_f32_vcopysign_ukernel__rvv_u4v, 4, true, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL(xnn_arch_riscv_vector, xnn_f32_vcopysign_ukernel__rvv_u8v, 8, true, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR diff --git a/src/f32-vbinary/f32-vcopysignc.inc b/src/f32-vbinary/f32-vcopysignc.inc index 10c0963b5df..99788d0b2b3 100644 --- a/src/f32-vbinary/f32-vcopysignc.inc +++ b/src/f32-vbinary/f32-vcopysignc.inc @@ -30,6 +30,7 @@ XNN_UKERNEL(xnn_arch_x86_avx512f, xnn_f32_vcopysignc_ukernel__avx512f_u32, 32, f XNN_UKERNEL(xnn_arch_x86_avx512f, xnn_f32_vcopysignc_ukernel__avx512f_u48, 48, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL(xnn_arch_x86_avx512f, xnn_f32_vcopysignc_ukernel__avx512f_u64, 64, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_UKERNEL(xnn_arch_none, xnn_f32_vcopysignc_ukernel__wasmsimd_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL(xnn_arch_none, xnn_f32_vcopysignc_ukernel__wasmsimd_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) @@ -51,3 +52,7 @@ XNN_UKERNEL(xnn_arch_arm_neon, xnn_f32_vcopysignc_ukernel__neon_u12, 12, false, XNN_UKERNEL(xnn_arch_arm_neon, xnn_f32_vcopysignc_ukernel__neon_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 +#if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR +XNN_UKERNEL(xnn_arch_riscv_vector, 
xnn_f32_vcopysignc_ukernel__rvv_u4v, 4, true, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL(xnn_arch_riscv_vector, xnn_f32_vcopysignc_ukernel__rvv_u8v, 8, true, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR diff --git a/src/f32-vbinary/f32-vrcopysignc.inc b/src/f32-vbinary/f32-vrcopysignc.inc index a584f64d201..2020ae51d71 100644 --- a/src/f32-vbinary/f32-vrcopysignc.inc +++ b/src/f32-vbinary/f32-vrcopysignc.inc @@ -30,6 +30,7 @@ XNN_UKERNEL(xnn_arch_x86_avx512f, xnn_f32_vrcopysignc_ukernel__avx512f_u32, 32, XNN_UKERNEL(xnn_arch_x86_avx512f, xnn_f32_vrcopysignc_ukernel__avx512f_u48, 48, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL(xnn_arch_x86_avx512f, xnn_f32_vrcopysignc_ukernel__avx512f_u64, 64, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_UKERNEL(xnn_arch_none, xnn_f32_vrcopysignc_ukernel__wasmsimd_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL(xnn_arch_none, xnn_f32_vrcopysignc_ukernel__wasmsimd_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) @@ -51,3 +52,7 @@ XNN_UKERNEL(xnn_arch_arm_neon, xnn_f32_vrcopysignc_ukernel__neon_u12, 12, false, XNN_UKERNEL(xnn_arch_arm_neon, xnn_f32_vrcopysignc_ukernel__neon_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 +#if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR +XNN_UKERNEL(xnn_arch_riscv_vector, xnn_f32_vrcopysignc_ukernel__rvv_u4v, 4, true, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL(xnn_arch_riscv_vector, xnn_f32_vrcopysignc_ukernel__rvv_u8v, 8, 
true, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
+#endif // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR
diff --git a/src/f32-vcopysign/gen/f32-vcopysign-rvv-u4v.c b/src/f32-vcopysign/gen/f32-vcopysign-rvv-u4v.c
new file mode 100644
index 00000000000..1fbc405eb22
--- /dev/null
+++ b/src/f32-vcopysign/gen/f32-vcopysign-rvv-u4v.c
@@ -0,0 +1,45 @@
+// clang-format off
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vcopysign/rvv.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2026 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stddef.h>
+
+#include <riscv_vector.h>
+
+#include "src/xnnpack/common.h"
+#include "src/xnnpack/math.h"
+#include "src/xnnpack/vunary.h"
+
+void xnn_f32_vcopysign_ukernel__rvv_u4v(
+    size_t batch,
+    const float* mag,
+    const float* sign,
+    float* output,
+    const struct xnn_f32_default_params* unused_params)
+{
+  assert(batch != 0);
+  assert(batch % sizeof(float) == 0);
+  assert(mag != NULL);
+  assert(sign != NULL);
+  assert(output != NULL);
+
+
+  batch >>= XNN_LOG2_SIZEOF_FLOAT;
+  do {
+    const size_t vl = __riscv_vsetvl_e32m4(batch); batch -= vl;
+
+    vfloat32m4_t vmag = __riscv_vle32_v_f32m4(mag, vl); mag += vl;
+    vfloat32m4_t vsign = __riscv_vle32_v_f32m4(sign, vl); sign += vl;
+    vfloat32m4_t vy = __riscv_vfsgnj(vmag, vsign, vl);
+
+    __riscv_vse32(output, vy, vl); output += vl;
+
+  } while (batch != 0);
+}
diff --git a/src/f32-vcopysign/gen/f32-vcopysign-rvv-u8v.c b/src/f32-vcopysign/gen/f32-vcopysign-rvv-u8v.c
new file mode 100644
index 00000000000..d4533fda776
--- /dev/null
+++ b/src/f32-vcopysign/gen/f32-vcopysign-rvv-u8v.c
@@ -0,0 +1,45 @@
+// clang-format off
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vcopysign/rvv.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2026 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stddef.h>
+
+#include <riscv_vector.h>
+
+#include "src/xnnpack/common.h"
+#include "src/xnnpack/math.h"
+#include "src/xnnpack/vunary.h"
+
+void xnn_f32_vcopysign_ukernel__rvv_u8v(
+    size_t batch,
+    const float* mag,
+    const float* sign,
+    float* output,
+    const struct xnn_f32_default_params* unused_params)
+{
+  assert(batch != 0);
+  assert(batch % sizeof(float) == 0);
+  assert(mag != NULL);
+  assert(sign != NULL);
+  assert(output != NULL);
+
+
+  batch >>= XNN_LOG2_SIZEOF_FLOAT;
+  do {
+    const size_t vl = __riscv_vsetvl_e32m8(batch); batch -= vl;
+
+    vfloat32m8_t vmag = __riscv_vle32_v_f32m8(mag, vl); mag += vl;
+    vfloat32m8_t vsign = __riscv_vle32_v_f32m8(sign, vl); sign += vl;
+    vfloat32m8_t vy = __riscv_vfsgnj(vmag, vsign, vl);
+
+    __riscv_vse32(output, vy, vl); output += vl;
+
+  } while (batch != 0);
+}
diff --git a/src/f32-vcopysign/gen/f32-vcopysignc-rvv-u4v.c b/src/f32-vcopysign/gen/f32-vcopysignc-rvv-u4v.c
new file mode 100644
index 00000000000..4336d2e7497
--- /dev/null
+++ b/src/f32-vcopysign/gen/f32-vcopysignc-rvv-u4v.c
@@ -0,0 +1,45 @@
+// clang-format off
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vcopysign/rvv.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2026 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stddef.h>
+
+#include <riscv_vector.h>
+
+#include "src/xnnpack/common.h"
+#include "src/xnnpack/math.h"
+#include "src/xnnpack/vunary.h"
+
+void xnn_f32_vcopysignc_ukernel__rvv_u4v(
+    size_t batch,
+    const float* mag,
+    const float* sign,
+    float* output,
+    const struct xnn_f32_default_params* unused_params)
+{
+  assert(batch != 0);
+  assert(batch % sizeof(float) == 0);
+  assert(mag != NULL);
+  assert(sign != NULL);
+  assert(output != NULL);
+
+  const float signc = *sign;
+
+  batch >>= XNN_LOG2_SIZEOF_FLOAT;
+  do {
+    const size_t vl = __riscv_vsetvl_e32m4(batch); batch -= vl;
+
+    vfloat32m4_t vmag = __riscv_vle32_v_f32m4(mag, vl); mag += vl;
+    vfloat32m4_t vy = __riscv_vfsgnj(vmag, signc, vl);
+
+    __riscv_vse32(output, vy, vl); output += vl;
+
+  } while (batch != 0);
+}
diff --git a/src/f32-vcopysign/gen/f32-vcopysignc-rvv-u8v.c b/src/f32-vcopysign/gen/f32-vcopysignc-rvv-u8v.c
new file mode 100644
index 00000000000..f3f4e254e12
--- /dev/null
+++ b/src/f32-vcopysign/gen/f32-vcopysignc-rvv-u8v.c
@@ -0,0 +1,45 @@
+// clang-format off
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vcopysign/rvv.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2026 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stddef.h>
+
+#include <riscv_vector.h>
+
+#include "src/xnnpack/common.h"
+#include "src/xnnpack/math.h"
+#include "src/xnnpack/vunary.h"
+
+void xnn_f32_vcopysignc_ukernel__rvv_u8v(
+    size_t batch,
+    const float* mag,
+    const float* sign,
+    float* output,
+    const struct xnn_f32_default_params* unused_params)
+{
+  assert(batch != 0);
+  assert(batch % sizeof(float) == 0);
+  assert(mag != NULL);
+  assert(sign != NULL);
+  assert(output != NULL);
+
+  const float signc = *sign;
+
+  batch >>= XNN_LOG2_SIZEOF_FLOAT;
+  do {
+    const size_t vl = __riscv_vsetvl_e32m8(batch); batch -= vl;
+
+    vfloat32m8_t vmag = __riscv_vle32_v_f32m8(mag, vl); mag += vl;
+    vfloat32m8_t vy = __riscv_vfsgnj(vmag, signc, vl);
+
+    __riscv_vse32(output, vy, vl); output += vl;
+
+  } while (batch != 0);
+}
diff --git a/src/f32-vcopysign/gen/f32-vrcopysignc-rvv-u4v.c b/src/f32-vcopysign/gen/f32-vrcopysignc-rvv-u4v.c
new file mode 100644
index 00000000000..9d7b0f71572
--- /dev/null
+++ b/src/f32-vcopysign/gen/f32-vrcopysignc-rvv-u4v.c
@@ -0,0 +1,46 @@
+// clang-format off
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vcopysign/rvv.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2026 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stddef.h>
+
+#include <riscv_vector.h>
+
+#include "src/xnnpack/common.h"
+#include "src/xnnpack/math.h"
+#include "src/xnnpack/vunary.h"
+
+void xnn_f32_vrcopysignc_ukernel__rvv_u4v(
+    size_t batch,
+    const float* sign,
+    const float* mag,
+    float* output,
+    const struct xnn_f32_default_params* unused_params)
+{
+  assert(batch != 0);
+  assert(batch % sizeof(float) == 0);
+  assert(mag != NULL);
+  assert(sign != NULL);
+  assert(output != NULL);
+
+  const float magc = *mag;
+
+  batch >>= XNN_LOG2_SIZEOF_FLOAT;
+  do {
+    const size_t vl = __riscv_vsetvl_e32m4(batch); batch -= vl;
+
+    vfloat32m4_t vmag = __riscv_vfmv_v_f_f32m4(magc, vl);
+    vfloat32m4_t vsign = __riscv_vle32_v_f32m4(sign, vl); sign += vl;
+    vfloat32m4_t vy = __riscv_vfsgnj(vmag, vsign, vl);
+
+    __riscv_vse32(output, vy, vl); output += vl;
+
+  } while (batch != 0);
+}
diff --git a/src/f32-vcopysign/gen/f32-vrcopysignc-rvv-u8v.c b/src/f32-vcopysign/gen/f32-vrcopysignc-rvv-u8v.c
new file mode 100644
index 00000000000..1d6aa7a5946
--- /dev/null
+++ b/src/f32-vcopysign/gen/f32-vrcopysignc-rvv-u8v.c
@@ -0,0 +1,46 @@
+// clang-format off
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vcopysign/rvv.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2026 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stddef.h>
+
+#include <riscv_vector.h>
+
+#include "src/xnnpack/common.h"
+#include "src/xnnpack/math.h"
+#include "src/xnnpack/vunary.h"
+
+void xnn_f32_vrcopysignc_ukernel__rvv_u8v(
+    size_t batch,
+    const float* sign,
+    const float* mag,
+    float* output,
+    const struct xnn_f32_default_params* unused_params)
+{
+  assert(batch != 0);
+  assert(batch % sizeof(float) == 0);
+  assert(mag != NULL);
+  assert(sign != NULL);
+  assert(output != NULL);
+
+  const float magc = *mag;
+
+  batch >>= XNN_LOG2_SIZEOF_FLOAT;
+  do {
+    const size_t vl = __riscv_vsetvl_e32m8(batch); batch -= vl;
+
+    vfloat32m8_t vmag = __riscv_vfmv_v_f_f32m8(magc, vl);
+    vfloat32m8_t vsign = __riscv_vle32_v_f32m8(sign, vl); sign += vl;
+    vfloat32m8_t vy = __riscv_vfsgnj(vmag, vsign, vl);
+
+    __riscv_vse32(output, vy, vl); output += vl;
+
+  } while (batch != 0);
+}
diff --git a/src/f32-vcopysign/rvv.c.in b/src/f32-vcopysign/rvv.c.in
new file mode 100644
index 00000000000..cad4f003f49
--- /dev/null
+++ b/src/f32-vcopysign/rvv.c.in
@@ -0,0 +1,58 @@
+// Copyright 2026 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert LMUL in [1, 2, 4, 8]
+$assert OP in ["COPYSIGN", "COPYSIGNC", "RCOPYSIGNC"]
+#include <assert.h>
+#include <stddef.h>
+
+#include <riscv_vector.h>
+
+#include "src/xnnpack/common.h"
+#include "src/xnnpack/math.h"
+#include "src/xnnpack/vunary.h"
+
+void xnn_f32_v${OP.lower()}_ukernel__rvv_u${LMUL}v(
+    size_t batch,
+    $if OP == "RCOPYSIGNC":
+      const float* sign,
+      const float* mag,
+    $else:
+      const float* mag,
+      const float* sign,
+    float* output,
+    const struct xnn_f32_default_params* unused_params)
+{
+  assert(batch != 0);
+  assert(batch % sizeof(float) == 0);
+  assert(mag != NULL);
+  assert(sign != NULL);
+  assert(output != NULL);
+
+  $if OP == "COPYSIGNC":
+    const float signc = *sign;
+  $elif OP == "RCOPYSIGNC":
+    const float magc = *mag;
+
+  batch >>= XNN_LOG2_SIZEOF_FLOAT;
+  do {
+    const size_t vl = __riscv_vsetvl_e32m${LMUL}(batch); batch -= vl;
+
+    $if OP == "COPYSIGN":
+      vfloat32m${LMUL}_t vmag = __riscv_vle32_v_f32m${LMUL}(mag, vl); mag += vl;
+      vfloat32m${LMUL}_t vsign = __riscv_vle32_v_f32m${LMUL}(sign, vl); sign += vl;
+      vfloat32m${LMUL}_t vy = __riscv_vfsgnj(vmag, vsign, vl);
+    $elif OP == "COPYSIGNC":
+      vfloat32m${LMUL}_t vmag = __riscv_vle32_v_f32m${LMUL}(mag, vl); mag += vl;
+      vfloat32m${LMUL}_t vy = __riscv_vfsgnj(vmag, signc, vl);
+    $elif OP == "RCOPYSIGNC":
+      vfloat32m${LMUL}_t vmag = __riscv_vfmv_v_f_f32m${LMUL}(magc, vl);
+      vfloat32m${LMUL}_t vsign = __riscv_vle32_v_f32m${LMUL}(sign, vl); sign += vl;
+      vfloat32m${LMUL}_t vy = __riscv_vfsgnj(vmag, vsign, vl);
+
+    __riscv_vse32(output, vy, vl); output += vl;
+
+  } while (batch != 0);
+}