From 550760945b5f73d9a754f0789ae8ebf10f67f20b Mon Sep 17 00:00:00 2001 From: Ken Unger Date: Mon, 30 Mar 2026 23:16:09 -0700 Subject: [PATCH] Copybara import of the project: -- 4a29295db4d2dabc70e236d608972361fac902f4 by Ken Unger : add rvv support for f16-vhswish, f16-vlrelu, f16-vrnd, f16-vrsqrt, f16-vsqrt, f32-vcopysign -- 99f32dece9e7975f3626e97c95504692e6130086 by Ken Unger : updated copyright per review comments FUTURE_COPYBARA_INTEGRATE_REVIEW=https://github.com/google/XNNPACK/pull/9693 from ken-unger:unary-rvv 72587b2187dd13160582fe09e9ebc2feb061f190 PiperOrigin-RevId: 892124176 --- cmake/gen/rvv_microkernels.cmake | 6 ++ cmake/gen/rvvfp16arith_microkernels.cmake | 16 ++++ gen/rvv_microkernels.bzl | 6 ++ gen/rvvfp16arith_microkernels.bzl | 16 ++++ scripts/generate-f16-vhswish.sh | 4 + scripts/generate-f16-vlrelu.sh | 4 + scripts/generate-f16-vrnd.sh | 12 ++- scripts/generate-f16-vrsqrt.sh | 6 +- scripts/generate-f16-vsqrt.sh | 4 + scripts/generate-f32-vcopysign.sh | 11 ++- src/configs/binary-elementwise-config.c | 14 ++++ src/configs/unary-elementwise-config.c | 57 +++++++++++++ src/f16-vhswish/f16-vhswish.inc | 4 + .../gen/f16-vhswish-rvvfp16arith-u4v.c | 50 ++++++++++++ .../gen/f16-vhswish-rvvfp16arith-u8v.c | 50 ++++++++++++ src/f16-vhswish/rvv.c.in | 46 +++++++++++ src/f16-vlrelu/f16-vlrelu.inc | 4 + .../gen/f16-vlrelu-rvvfp16arith-u4v.c | 40 ++++++++++ .../gen/f16-vlrelu-rvvfp16arith-u8v.c | 40 ++++++++++ src/f16-vlrelu/rvv.c.in | 36 +++++++++ src/f16-vrnd/f16-vrndd.inc | 4 + src/f16-vrnd/f16-vrndne.inc | 4 + src/f16-vrnd/f16-vrndu.inc | 4 + src/f16-vrnd/f16-vrndz.inc | 5 ++ src/f16-vrnd/gen/f16-vrndd-rvvfp16arith-u4v.c | 50 ++++++++++++ src/f16-vrnd/gen/f16-vrndd-rvvfp16arith-u8v.c | 50 ++++++++++++ .../gen/f16-vrndne-rvvfp16arith-u4v.c | 50 ++++++++++++ .../gen/f16-vrndne-rvvfp16arith-u8v.c | 50 ++++++++++++ src/f16-vrnd/gen/f16-vrndu-rvvfp16arith-u4v.c | 50 ++++++++++++ src/f16-vrnd/gen/f16-vrndu-rvvfp16arith-u8v.c | 50 ++++++++++++ 
src/f16-vrnd/gen/f16-vrndz-rvvfp16arith-u4v.c | 50 ++++++++++++ src/f16-vrnd/gen/f16-vrndz-rvvfp16arith-u8v.c | 50 ++++++++++++ src/f16-vrnd/rvv.c.in | 53 +++++++++++++ src/f16-vrsqrt/f16-vrsqrt.inc | 4 + .../gen/f16-vrsqrt-rvvfp16arith-rsqrt-u2v.c | 79 +++++++++++++++++++ .../gen/f16-vrsqrt-rvvfp16arith-rsqrt-u4v.c | 79 +++++++++++++++++++ src/f16-vrsqrt/rvv.c.in | 75 ++++++++++++++++++ src/f16-vsqrt/f16-vsqrt.inc | 4 + .../gen/f16-vsqrt-rvvfp16arith-sqrt-u4v.c | 42 ++++++++++ .../gen/f16-vsqrt-rvvfp16arith-sqrt-u8v.c | 42 ++++++++++ src/f16-vsqrt/rvv.c.in | 38 +++++++++ src/f32-vbinary/f32-vcopysign.inc | 5 ++ src/f32-vbinary/f32-vcopysignc.inc | 5 ++ src/f32-vbinary/f32-vrcopysignc.inc | 5 ++ src/f32-vcopysign/gen/f32-vcopysign-rvv-u4v.c | 45 +++++++++++ src/f32-vcopysign/gen/f32-vcopysign-rvv-u8v.c | 45 +++++++++++ .../gen/f32-vcopysignc-rvv-u4v.c | 45 +++++++++++ .../gen/f32-vcopysignc-rvv-u8v.c | 45 +++++++++++ .../gen/f32-vrcopysignc-rvv-u4v.c | 46 +++++++++++ .../gen/f32-vrcopysignc-rvv-u8v.c | 46 +++++++++++ src/f32-vcopysign/rvv.c.in | 58 ++++++++++++++ 51 files changed, 1601 insertions(+), 3 deletions(-) create mode 100644 src/f16-vhswish/gen/f16-vhswish-rvvfp16arith-u4v.c create mode 100644 src/f16-vhswish/gen/f16-vhswish-rvvfp16arith-u8v.c create mode 100644 src/f16-vhswish/rvv.c.in create mode 100644 src/f16-vlrelu/gen/f16-vlrelu-rvvfp16arith-u4v.c create mode 100644 src/f16-vlrelu/gen/f16-vlrelu-rvvfp16arith-u8v.c create mode 100644 src/f16-vlrelu/rvv.c.in create mode 100644 src/f16-vrnd/gen/f16-vrndd-rvvfp16arith-u4v.c create mode 100644 src/f16-vrnd/gen/f16-vrndd-rvvfp16arith-u8v.c create mode 100644 src/f16-vrnd/gen/f16-vrndne-rvvfp16arith-u4v.c create mode 100644 src/f16-vrnd/gen/f16-vrndne-rvvfp16arith-u8v.c create mode 100644 src/f16-vrnd/gen/f16-vrndu-rvvfp16arith-u4v.c create mode 100644 src/f16-vrnd/gen/f16-vrndu-rvvfp16arith-u8v.c create mode 100644 src/f16-vrnd/gen/f16-vrndz-rvvfp16arith-u4v.c create mode 100644 
src/f16-vrnd/gen/f16-vrndz-rvvfp16arith-u8v.c create mode 100644 src/f16-vrnd/rvv.c.in create mode 100644 src/f16-vrsqrt/gen/f16-vrsqrt-rvvfp16arith-rsqrt-u2v.c create mode 100644 src/f16-vrsqrt/gen/f16-vrsqrt-rvvfp16arith-rsqrt-u4v.c create mode 100644 src/f16-vrsqrt/rvv.c.in create mode 100644 src/f16-vsqrt/gen/f16-vsqrt-rvvfp16arith-sqrt-u4v.c create mode 100644 src/f16-vsqrt/gen/f16-vsqrt-rvvfp16arith-sqrt-u8v.c create mode 100644 src/f16-vsqrt/rvv.c.in create mode 100644 src/f32-vcopysign/gen/f32-vcopysign-rvv-u4v.c create mode 100644 src/f32-vcopysign/gen/f32-vcopysign-rvv-u8v.c create mode 100644 src/f32-vcopysign/gen/f32-vcopysignc-rvv-u4v.c create mode 100644 src/f32-vcopysign/gen/f32-vcopysignc-rvv-u8v.c create mode 100644 src/f32-vcopysign/gen/f32-vrcopysignc-rvv-u4v.c create mode 100644 src/f32-vcopysign/gen/f32-vrcopysignc-rvv-u8v.c create mode 100644 src/f32-vcopysign/rvv.c.in diff --git a/cmake/gen/rvv_microkernels.cmake b/cmake/gen/rvv_microkernels.cmake index d6ad1ad5937..db093021c08 100644 --- a/cmake/gen/rvv_microkernels.cmake +++ b/cmake/gen/rvv_microkernels.cmake @@ -63,6 +63,9 @@ SET(PROD_RVV_MICROKERNEL_SRCS src/f32-vbinary/gen/f32-vsubc-rvv-u8v.c src/f32-vclamp/gen/f32-vclamp-rvv-u8v.c src/f32-vcmul/gen/f32-vcmul-rvv-u2v.c + src/f32-vcopysign/gen/f32-vcopysign-rvv-u8v.c + src/f32-vcopysign/gen/f32-vcopysignc-rvv-u8v.c + src/f32-vcopysign/gen/f32-vrcopysignc-rvv-u8v.c src/f32-vhswish/gen/f32-vhswish-rvv-u4v.c src/f32-vlrelu/gen/f32-vlrelu-rvv-u4v.c src/f32-vrnd/gen/f32-vrndd-rvv-u4v.c @@ -214,6 +217,9 @@ SET(NON_PROD_RVV_MICROKERNEL_SRCS src/f32-vclamp/gen/f32-vclamp-rvv-u4v.c src/f32-vcmul/gen/f32-vcmul-rvv-u1v.c src/f32-vcmul/gen/f32-vcmul-rvv-u4v.c + src/f32-vcopysign/gen/f32-vcopysign-rvv-u4v.c + src/f32-vcopysign/gen/f32-vcopysignc-rvv-u4v.c + src/f32-vcopysign/gen/f32-vrcopysignc-rvv-u4v.c src/f32-vhswish/gen/f32-vhswish-rvv-u1v.c src/f32-vhswish/gen/f32-vhswish-rvv-u2v.c src/f32-vhswish/gen/f32-vhswish-rvv-u8v.c diff --git 
a/cmake/gen/rvvfp16arith_microkernels.cmake b/cmake/gen/rvvfp16arith_microkernels.cmake index 3c4580a0ab9..d99a66c635c 100644 --- a/cmake/gen/rvvfp16arith_microkernels.cmake +++ b/cmake/gen/rvvfp16arith_microkernels.cmake @@ -49,6 +49,14 @@ SET(PROD_RVVFP16ARITH_MICROKERNEL_SRCS src/f16-vbinary/gen/f16-vsub-rvvfp16arith-u8v.c src/f16-vbinary/gen/f16-vsubc-rvvfp16arith-u8v.c src/f16-vclamp/gen/f16-vclamp-rvvfp16arith-u8v.c + src/f16-vhswish/gen/f16-vhswish-rvvfp16arith-u8v.c + src/f16-vlrelu/gen/f16-vlrelu-rvvfp16arith-u8v.c + src/f16-vrnd/gen/f16-vrndd-rvvfp16arith-u8v.c + src/f16-vrnd/gen/f16-vrndne-rvvfp16arith-u8v.c + src/f16-vrnd/gen/f16-vrndu-rvvfp16arith-u8v.c + src/f16-vrnd/gen/f16-vrndz-rvvfp16arith-u8v.c + src/f16-vrsqrt/gen/f16-vrsqrt-rvvfp16arith-rsqrt-u4v.c + src/f16-vsqrt/gen/f16-vsqrt-rvvfp16arith-sqrt-u8v.c src/f16-vunary/gen/f16-vabs-rvvfp16arith-u8v.c src/f16-vunary/gen/f16-vneg-rvvfp16arith-u8v.c src/f16-vunary/gen/f16-vsqr-rvvfp16arith-u8v.c @@ -139,6 +147,14 @@ SET(NON_PROD_RVVFP16ARITH_MICROKERNEL_SRCS src/f16-vclamp/gen/f16-vclamp-rvvfp16arith-u1v.c src/f16-vclamp/gen/f16-vclamp-rvvfp16arith-u2v.c src/f16-vclamp/gen/f16-vclamp-rvvfp16arith-u4v.c + src/f16-vhswish/gen/f16-vhswish-rvvfp16arith-u4v.c + src/f16-vlrelu/gen/f16-vlrelu-rvvfp16arith-u4v.c + src/f16-vrnd/gen/f16-vrndd-rvvfp16arith-u4v.c + src/f16-vrnd/gen/f16-vrndne-rvvfp16arith-u4v.c + src/f16-vrnd/gen/f16-vrndu-rvvfp16arith-u4v.c + src/f16-vrnd/gen/f16-vrndz-rvvfp16arith-u4v.c + src/f16-vrsqrt/gen/f16-vrsqrt-rvvfp16arith-rsqrt-u2v.c + src/f16-vsqrt/gen/f16-vsqrt-rvvfp16arith-sqrt-u4v.c src/f16-vunary/gen/f16-vabs-rvvfp16arith-u1v.c src/f16-vunary/gen/f16-vabs-rvvfp16arith-u2v.c src/f16-vunary/gen/f16-vabs-rvvfp16arith-u4v.c diff --git a/gen/rvv_microkernels.bzl b/gen/rvv_microkernels.bzl index 6261a1484c6..30064f30eee 100644 --- a/gen/rvv_microkernels.bzl +++ b/gen/rvv_microkernels.bzl @@ -59,6 +59,9 @@ PROD_RVV_MICROKERNEL_SRCS = [ "src/f32-vbinary/gen/f32-vsubc-rvv-u8v.c", 
"src/f32-vclamp/gen/f32-vclamp-rvv-u8v.c", "src/f32-vcmul/gen/f32-vcmul-rvv-u2v.c", + "src/f32-vcopysign/gen/f32-vcopysign-rvv-u8v.c", + "src/f32-vcopysign/gen/f32-vcopysignc-rvv-u8v.c", + "src/f32-vcopysign/gen/f32-vrcopysignc-rvv-u8v.c", "src/f32-vhswish/gen/f32-vhswish-rvv-u4v.c", "src/f32-vlrelu/gen/f32-vlrelu-rvv-u4v.c", "src/f32-vrnd/gen/f32-vrndd-rvv-u4v.c", @@ -211,6 +214,9 @@ NON_PROD_RVV_MICROKERNEL_SRCS = [ "src/f32-vclamp/gen/f32-vclamp-rvv-u4v.c", "src/f32-vcmul/gen/f32-vcmul-rvv-u1v.c", "src/f32-vcmul/gen/f32-vcmul-rvv-u4v.c", + "src/f32-vcopysign/gen/f32-vcopysign-rvv-u4v.c", + "src/f32-vcopysign/gen/f32-vcopysignc-rvv-u4v.c", + "src/f32-vcopysign/gen/f32-vrcopysignc-rvv-u4v.c", "src/f32-vhswish/gen/f32-vhswish-rvv-u1v.c", "src/f32-vhswish/gen/f32-vhswish-rvv-u2v.c", "src/f32-vhswish/gen/f32-vhswish-rvv-u8v.c", diff --git a/gen/rvvfp16arith_microkernels.bzl b/gen/rvvfp16arith_microkernels.bzl index 31d43cc0976..1c2fa7b5ea8 100644 --- a/gen/rvvfp16arith_microkernels.bzl +++ b/gen/rvvfp16arith_microkernels.bzl @@ -45,6 +45,14 @@ PROD_RVVFP16ARITH_MICROKERNEL_SRCS = [ "src/f16-vbinary/gen/f16-vsub-rvvfp16arith-u8v.c", "src/f16-vbinary/gen/f16-vsubc-rvvfp16arith-u8v.c", "src/f16-vclamp/gen/f16-vclamp-rvvfp16arith-u8v.c", + "src/f16-vhswish/gen/f16-vhswish-rvvfp16arith-u8v.c", + "src/f16-vlrelu/gen/f16-vlrelu-rvvfp16arith-u8v.c", + "src/f16-vrnd/gen/f16-vrndd-rvvfp16arith-u8v.c", + "src/f16-vrnd/gen/f16-vrndne-rvvfp16arith-u8v.c", + "src/f16-vrnd/gen/f16-vrndu-rvvfp16arith-u8v.c", + "src/f16-vrnd/gen/f16-vrndz-rvvfp16arith-u8v.c", + "src/f16-vrsqrt/gen/f16-vrsqrt-rvvfp16arith-rsqrt-u4v.c", + "src/f16-vsqrt/gen/f16-vsqrt-rvvfp16arith-sqrt-u8v.c", "src/f16-vunary/gen/f16-vabs-rvvfp16arith-u8v.c", "src/f16-vunary/gen/f16-vneg-rvvfp16arith-u8v.c", "src/f16-vunary/gen/f16-vsqr-rvvfp16arith-u8v.c", @@ -136,6 +144,14 @@ NON_PROD_RVVFP16ARITH_MICROKERNEL_SRCS = [ "src/f16-vclamp/gen/f16-vclamp-rvvfp16arith-u1v.c", 
"src/f16-vclamp/gen/f16-vclamp-rvvfp16arith-u2v.c", "src/f16-vclamp/gen/f16-vclamp-rvvfp16arith-u4v.c", + "src/f16-vhswish/gen/f16-vhswish-rvvfp16arith-u4v.c", + "src/f16-vlrelu/gen/f16-vlrelu-rvvfp16arith-u4v.c", + "src/f16-vrnd/gen/f16-vrndd-rvvfp16arith-u4v.c", + "src/f16-vrnd/gen/f16-vrndne-rvvfp16arith-u4v.c", + "src/f16-vrnd/gen/f16-vrndu-rvvfp16arith-u4v.c", + "src/f16-vrnd/gen/f16-vrndz-rvvfp16arith-u4v.c", + "src/f16-vrsqrt/gen/f16-vrsqrt-rvvfp16arith-rsqrt-u2v.c", + "src/f16-vsqrt/gen/f16-vsqrt-rvvfp16arith-sqrt-u4v.c", "src/f16-vunary/gen/f16-vabs-rvvfp16arith-u1v.c", "src/f16-vunary/gen/f16-vabs-rvvfp16arith-u2v.c", "src/f16-vunary/gen/f16-vabs-rvvfp16arith-u4v.c", diff --git a/scripts/generate-f16-vhswish.sh b/scripts/generate-f16-vhswish.sh index 368f1e47b11..e0526e530f4 100755 --- a/scripts/generate-f16-vhswish.sh +++ b/scripts/generate-f16-vhswish.sh @@ -12,4 +12,8 @@ tools/xngen src/f16-vhswish/neonfp16arith.c.in -D BATCH_TILE=16 -o src/f16-vhswi tools/xngen src/f16-vhswish/f16c.c.in -D BATCH_TILE=8 -o src/f16-vhswish/gen/f16-vhswish-f16c-u8.c & tools/xngen src/f16-vhswish/f16c.c.in -D BATCH_TILE=16 -o src/f16-vhswish/gen/f16-vhswish-f16c-u16.c & +################################### RISC-V Vector ############################# +tools/xngen src/f16-vhswish/rvv.c.in -D LMUL=4 -o src/f16-vhswish/gen/f16-vhswish-rvvfp16arith-u4v.c & +tools/xngen src/f16-vhswish/rvv.c.in -D LMUL=8 -o src/f16-vhswish/gen/f16-vhswish-rvvfp16arith-u8v.c & + wait diff --git a/scripts/generate-f16-vlrelu.sh b/scripts/generate-f16-vlrelu.sh index b7e9caa8ee9..f07803a73df 100755 --- a/scripts/generate-f16-vlrelu.sh +++ b/scripts/generate-f16-vlrelu.sh @@ -12,4 +12,8 @@ tools/xngen src/f16-vlrelu/neonfp16arith.c.in -D BATCH_TILE=16 -o src/f16-vlrelu tools/xngen src/f16-vlrelu/f16c.c.in -D BATCH_TILE=8 -o src/f16-vlrelu/gen/f16-vlrelu-f16c-u8.c & tools/xngen src/f16-vlrelu/f16c.c.in -D BATCH_TILE=16 -o src/f16-vlrelu/gen/f16-vlrelu-f16c-u16.c & 
+################################### RISC-V Vector ############################# +tools/xngen src/f16-vlrelu/rvv.c.in -D LMUL=4 -o src/f16-vlrelu/gen/f16-vlrelu-rvvfp16arith-u4v.c & +tools/xngen src/f16-vlrelu/rvv.c.in -D LMUL=8 -o src/f16-vlrelu/gen/f16-vlrelu-rvvfp16arith-u8v.c & + wait diff --git a/scripts/generate-f16-vrnd.sh b/scripts/generate-f16-vrnd.sh index fc88462ce82..a78ce75020d 100755 --- a/scripts/generate-f16-vrnd.sh +++ b/scripts/generate-f16-vrnd.sh @@ -14,7 +14,7 @@ tools/xngen src/f16-vrnd/neonfp16arith.c.in -D OP=RNDU -D BATCH_TILE=16 -o src/ tools/xngen src/f16-vrnd/neonfp16arith.c.in -D OP=RNDD -D BATCH_TILE=8 -o src/f16-vrnd/gen/f16-vrndd-neonfp16arith-u8.c & tools/xngen src/f16-vrnd/neonfp16arith.c.in -D OP=RNDD -D BATCH_TILE=16 -o src/f16-vrnd/gen/f16-vrndd-neonfp16arith-u16.c & -################################# x86 F16C ################################# +################################# x86 F16C #################################### tools/xngen src/f16-vrnd/f16c.c.in -D OP=RNDNE -D BATCH_TILE=8 -o src/f16-vrnd/gen/f16-vrndne-f16c-u8.c & tools/xngen src/f16-vrnd/f16c.c.in -D OP=RNDNE -D BATCH_TILE=16 -o src/f16-vrnd/gen/f16-vrndne-f16c-u16.c & tools/xngen src/f16-vrnd/f16c.c.in -D OP=RNDZ -D BATCH_TILE=8 -o src/f16-vrnd/gen/f16-vrndz-f16c-u8.c & @@ -24,4 +24,14 @@ tools/xngen src/f16-vrnd/f16c.c.in -D OP=RNDU -D BATCH_TILE=16 -o src/f16-vrnd/ tools/xngen src/f16-vrnd/f16c.c.in -D OP=RNDD -D BATCH_TILE=8 -o src/f16-vrnd/gen/f16-vrndd-f16c-u8.c & tools/xngen src/f16-vrnd/f16c.c.in -D OP=RNDD -D BATCH_TILE=16 -o src/f16-vrnd/gen/f16-vrndd-f16c-u16.c & +################################ RISC-V Vector ################################ +tools/xngen src/f16-vrnd/rvv.c.in -D OP=RNDNE -D LMUL=4 -o src/f16-vrnd/gen/f16-vrndne-rvvfp16arith-u4v.c & +tools/xngen src/f16-vrnd/rvv.c.in -D OP=RNDNE -D LMUL=8 -o src/f16-vrnd/gen/f16-vrndne-rvvfp16arith-u8v.c & +tools/xngen src/f16-vrnd/rvv.c.in -D OP=RNDZ -D LMUL=4 -o 
src/f16-vrnd/gen/f16-vrndz-rvvfp16arith-u4v.c & +tools/xngen src/f16-vrnd/rvv.c.in -D OP=RNDZ -D LMUL=8 -o src/f16-vrnd/gen/f16-vrndz-rvvfp16arith-u8v.c & +tools/xngen src/f16-vrnd/rvv.c.in -D OP=RNDU -D LMUL=4 -o src/f16-vrnd/gen/f16-vrndu-rvvfp16arith-u4v.c & +tools/xngen src/f16-vrnd/rvv.c.in -D OP=RNDU -D LMUL=8 -o src/f16-vrnd/gen/f16-vrndu-rvvfp16arith-u8v.c & +tools/xngen src/f16-vrnd/rvv.c.in -D OP=RNDD -D LMUL=4 -o src/f16-vrnd/gen/f16-vrndd-rvvfp16arith-u4v.c & +tools/xngen src/f16-vrnd/rvv.c.in -D OP=RNDD -D LMUL=8 -o src/f16-vrnd/gen/f16-vrndd-rvvfp16arith-u8v.c & + wait diff --git a/scripts/generate-f16-vrsqrt.sh b/scripts/generate-f16-vrsqrt.sh index dd522c4c71b..6da8933faaf 100755 --- a/scripts/generate-f16-vrsqrt.sh +++ b/scripts/generate-f16-vrsqrt.sh @@ -4,7 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -############################### ARM NEONFP16ARITH ############################## +############################### ARM NEONFP16ARITH ############################# tools/xngen src/f16-vrsqrt/neonfp16arith-rsqrt.c.in -D BATCH_TILE=8 -D FULL_ACC=1 -o src/f16-vrsqrt/gen/f16-vrsqrt-neonfp16arith-rsqrt-u8.c & tools/xngen src/f16-vrsqrt/neonfp16arith-rsqrt.c.in -D BATCH_TILE=16 -D FULL_ACC=1 -o src/f16-vrsqrt/gen/f16-vrsqrt-neonfp16arith-rsqrt-u16.c & tools/xngen src/f16-vrsqrt/neonfp16arith-rsqrt.c.in -D BATCH_TILE=32 -D FULL_ACC=1 -o src/f16-vrsqrt/gen/f16-vrsqrt-neonfp16arith-rsqrt-u32.c & @@ -14,4 +14,8 @@ tools/xngen src/f16-vrsqrt/f16c-rsqrt.c.in -D BATCH_TILE=8 -o src/f16-vrsqrt/ge tools/xngen src/f16-vrsqrt/f16c-rsqrt.c.in -D BATCH_TILE=16 -o src/f16-vrsqrt/gen/f16-vrsqrt-f16c-rsqrt-u16.c & tools/xngen src/f16-vrsqrt/f16c-rsqrt.c.in -D BATCH_TILE=32 -o src/f16-vrsqrt/gen/f16-vrsqrt-f16c-rsqrt-u32.c & +############################### RISC-V Vector ################################# +tools/xngen src/f16-vrsqrt/rvv.c.in -D LMUL=2 -o 
src/f16-vrsqrt/gen/f16-vrsqrt-rvvfp16arith-rsqrt-u2v.c & +tools/xngen src/f16-vrsqrt/rvv.c.in -D LMUL=4 -o src/f16-vrsqrt/gen/f16-vrsqrt-rvvfp16arith-rsqrt-u4v.c & + wait diff --git a/scripts/generate-f16-vsqrt.sh b/scripts/generate-f16-vsqrt.sh index 422e55624da..675c070bb5a 100755 --- a/scripts/generate-f16-vsqrt.sh +++ b/scripts/generate-f16-vsqrt.sh @@ -43,4 +43,8 @@ tools/xngen src/f16-vsqrt/avx512fp16-sqrt.c.in -D BATCH_TILE=32 -o src/f16-vsqr tools/xngen src/f16-vsqrt/avx512fp16-sqrt.c.in -D BATCH_TILE=64 -o src/f16-vsqrt/gen/f16-vsqrt-avx512fp16-sqrt-u64.c & tools/xngen src/f16-vsqrt/avx512fp16-sqrt.c.in -D BATCH_TILE=128 -o src/f16-vsqrt/gen/f16-vsqrt-avx512fp16-sqrt-u128.c & +################################ RISC-V Vector ################################ +tools/xngen src/f16-vsqrt/rvv.c.in -D LMUL=4 -o src/f16-vsqrt/gen/f16-vsqrt-rvvfp16arith-sqrt-u4v.c & +tools/xngen src/f16-vsqrt/rvv.c.in -D LMUL=8 -o src/f16-vsqrt/gen/f16-vsqrt-rvvfp16arith-sqrt-u8v.c & + wait diff --git a/scripts/generate-f32-vcopysign.sh b/scripts/generate-f32-vcopysign.sh index 52aae9b38eb..93d859fdbd2 100755 --- a/scripts/generate-f32-vcopysign.sh +++ b/scripts/generate-f32-vcopysign.sh @@ -13,7 +13,6 @@ tools/xngen src/f32-vcopysign/copysign.c.in -D ARCH=avx -D BATCH_TILES=8, tools/xngen src/f32-vcopysign/copysign.c.in -D ARCH=avx512f -D BATCH_TILES=16,32,48,64 -o src/f32-vcopysign/gen/f32-vcopysign-avx512f.c & tools/xngen src/f32-vcopysign/copysign.c.in -D ARCH=hvx -D BATCH_TILES=32,64,96,128 -o src/f32-vcopysign/gen/f32-vcopysign-hvx.c & - # Scalar sign tools/xngen src/f32-vcopysign/copysignc.c.in -D ARCH=scalar -D BATCH_TILES=1,2,4,8 -o src/f32-vcopysign/gen/f32-vcopysignc-scalar.c & tools/xngen src/f32-vcopysign/copysignc.c.in -D ARCH=sse2 -D BATCH_TILES=4,8,12,16 -o src/f32-vcopysign/gen/f32-vcopysignc-sse2.c & @@ -36,4 +35,14 @@ tools/xngen src/f32-vcopysign/copysign.c.in -D ARCH=avx512f -D BATCH_TILES=16 tools/xngen src/f32-vcopysign/copysignc.c.in -D ARCH=avx512f -D 
BATCH_TILES=16,32,48,64 -o src/f32-vcopysign/gen/f32-vcopysignc-avx512f.c & tools/xngen src/f32-vcopysign/rcopysignc.c.in -D ARCH=avx512f -D BATCH_TILES=16,32,48,64 -o src/f32-vcopysign/gen/f32-vrcopysignc-avx512f.c & +##################################### RISC-V Vector ############################ +tools/xngen src/f32-vcopysign/rvv.c.in -D LMUL=4 -D OP=COPYSIGN -o src/f32-vcopysign/gen/f32-vcopysign-rvv-u4v.c & +tools/xngen src/f32-vcopysign/rvv.c.in -D LMUL=8 -D OP=COPYSIGN -o src/f32-vcopysign/gen/f32-vcopysign-rvv-u8v.c & + +tools/xngen src/f32-vcopysign/rvv.c.in -D LMUL=4 -D OP=COPYSIGNC -o src/f32-vcopysign/gen/f32-vcopysignc-rvv-u4v.c & +tools/xngen src/f32-vcopysign/rvv.c.in -D LMUL=8 -D OP=COPYSIGNC -o src/f32-vcopysign/gen/f32-vcopysignc-rvv-u8v.c & + +tools/xngen src/f32-vcopysign/rvv.c.in -D LMUL=4 -D OP=RCOPYSIGNC -o src/f32-vcopysign/gen/f32-vrcopysignc-rvv-u4v.c & +tools/xngen src/f32-vcopysign/rvv.c.in -D LMUL=8 -D OP=RCOPYSIGNC -o src/f32-vcopysign/gen/f32-vrcopysignc-rvv-u8v.c & + wait diff --git a/src/configs/binary-elementwise-config.c b/src/configs/binary-elementwise-config.c index 525a0b1e401..d772bea8375 100644 --- a/src/configs/binary-elementwise-config.c +++ b/src/configs/binary-elementwise-config.c @@ -632,6 +632,20 @@ static void init_f32_vcopysign_config(void) { f32_vcopysign_config.ropc_ukernel = XNN_INIT_BINARY_UKERNEL(xnn_f32_vrcopysignc_ukernel__hvx_u128); f32_vcopysign_config.element_tile = 128; } + #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + assert(hardware_config != NULL); + if (hardware_config->arch_flags & xnn_arch_riscv_vector) { + f32_vcopysign_config.op_ukernel = XNN_INIT_BINARY_UKERNEL(xnn_f32_vcopysign_ukernel__rvv_u8v); + f32_vcopysign_config.opc_ukernel = XNN_INIT_BINARY_UKERNEL(xnn_f32_vcopysignc_ukernel__rvv_u8v); + f32_vcopysign_config.ropc_ukernel = XNN_INIT_BINARY_UKERNEL(xnn_f32_vrcopysignc_ukernel__rvv_u8v); + 
f32_vcopysign_config.element_tile = 8 * hardware_config->vlenb / sizeof(float); + } else { + f32_vcopysign_config.op_ukernel = XNN_INIT_BINARY_UKERNEL(xnn_f32_vcopysign_ukernel__scalar_u2); + f32_vcopysign_config.opc_ukernel = XNN_INIT_BINARY_UKERNEL(xnn_f32_vcopysignc_ukernel__scalar_u2); + f32_vcopysign_config.ropc_ukernel = XNN_INIT_BINARY_UKERNEL(xnn_f32_vrcopysignc_ukernel__scalar_u2); + f32_vcopysign_config.element_tile = 8; + } #else f32_vcopysign_config.op_ukernel = XNN_INIT_BINARY_UKERNEL(xnn_f32_vcopysign_ukernel__scalar_u2); f32_vcopysign_config.opc_ukernel = XNN_INIT_BINARY_UKERNEL(xnn_f32_vcopysignc_ukernel__scalar_u2); diff --git a/src/configs/unary-elementwise-config.c b/src/configs/unary-elementwise-config.c index 00dda5e9704..475faadc1ea 100644 --- a/src/configs/unary-elementwise-config.c +++ b/src/configs/unary-elementwise-config.c @@ -339,6 +339,13 @@ static void init_f16_hswish_config(void) { f16_hswish_config.element_tile = 16; } #endif + #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + assert(hardware_config != NULL); + if (hardware_config->arch_flags & xnn_arch_riscv_vector_fp16_arith) { + f16_hswish_config.ukernel = XNN_INIT_UNARY_UKERNEL(xnn_f16_vhswish_ukernel__rvvfp16arith_u8v); + f16_hswish_config.element_tile = 8 * hardware_config->vlenb / sizeof(xnn_float16); + } #endif } @@ -372,6 +379,14 @@ static void init_f16_lrelu_config(void) { f16_lrelu_config.init = (xnn_init_unary_uparams_fn) xnn_init_f16_lrelu_scalar_params; } #endif + #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + assert(hardware_config != NULL); + if (hardware_config->arch_flags & xnn_arch_riscv_vector_fp16_arith) { + f16_lrelu_config.ukernel = XNN_INIT_UNARY_UKERNEL(xnn_f16_vlrelu_ukernel__rvvfp16arith_u8v); + f16_lrelu_config.element_tile = 8 * hardware_config->vlenb / sizeof(xnn_float16); + 
f16_lrelu_config.init = (xnn_init_unary_uparams_fn) xnn_init_f16_lrelu_scalar_params; + } #endif } @@ -436,6 +451,13 @@ static void init_f16_rndd_config(void) { f16_rndd_config.element_tile = 16; } #endif + #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + assert(hardware_config != NULL); + if (hardware_config->arch_flags & xnn_arch_riscv_vector_fp16_arith) { + f16_rndd_config.ukernel = XNN_INIT_UNARY_UKERNEL(xnn_f16_vrndd_ukernel__rvvfp16arith_u8v); + f16_rndd_config.element_tile = 8 * hardware_config->vlenb / sizeof(xnn_float16); + } #endif } @@ -466,6 +488,13 @@ static void init_f16_rndne_config(void) { f16_rndne_config.element_tile = 16; } #endif + #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + assert(hardware_config != NULL); + if (hardware_config->arch_flags & xnn_arch_riscv_vector_fp16_arith) { + f16_rndne_config.ukernel = XNN_INIT_UNARY_UKERNEL(xnn_f16_vrndne_ukernel__rvvfp16arith_u8v); + f16_rndne_config.element_tile = 8 * hardware_config->vlenb / sizeof(xnn_float16); + } #endif } @@ -496,6 +525,13 @@ static void init_f16_rndu_config(void) { f16_rndu_config.element_tile = 16; } #endif + #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + assert(hardware_config != NULL); + if (hardware_config->arch_flags & xnn_arch_riscv_vector_fp16_arith) { + f16_rndu_config.ukernel = XNN_INIT_UNARY_UKERNEL(xnn_f16_vrndu_ukernel__rvvfp16arith_u8v); + f16_rndu_config.element_tile = 8 * hardware_config->vlenb / sizeof(xnn_float16); + } #endif } @@ -526,6 +562,13 @@ static void init_f16_rndz_config(void) { f16_rndz_config.element_tile = 16; } #endif + #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + assert(hardware_config != NULL); 
+ if (hardware_config->arch_flags & xnn_arch_riscv_vector_fp16_arith) { + f16_rndz_config.ukernel = XNN_INIT_UNARY_UKERNEL(xnn_f16_vrndz_ukernel__rvvfp16arith_u8v); + f16_rndz_config.element_tile = 8 * hardware_config->vlenb / sizeof(xnn_float16); + } #endif } @@ -556,6 +599,13 @@ static void init_f16_rsqrt_config(void) { f16_rsqrt_config.element_tile = 32; } #endif + #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + assert(hardware_config != NULL); + if (hardware_config->arch_flags & xnn_arch_riscv_vector_fp16_arith) { + f16_rsqrt_config.ukernel = XNN_INIT_UNARY_UKERNEL(xnn_f16_vrsqrt_ukernel__rvvfp16arith_rsqrt_u4v); + f16_rsqrt_config.element_tile = 4 * hardware_config->vlenb / sizeof(xnn_float16); + } #endif } @@ -665,6 +715,13 @@ static void init_f16_sqrt_config(void) { f16_sqrt_config.element_tile = 32; } #endif + #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + assert(hardware_config != NULL); + if (hardware_config->arch_flags & xnn_arch_riscv_vector_fp16_arith) { + f16_sqrt_config.ukernel = XNN_INIT_UNARY_UKERNEL(xnn_f16_vsqrt_ukernel__rvvfp16arith_sqrt_u8v); + f16_sqrt_config.element_tile = 8 * hardware_config->vlenb / sizeof(xnn_float16); + } #endif } diff --git a/src/f16-vhswish/f16-vhswish.inc b/src/f16-vhswish/f16-vhswish.inc index 09402fc6250..0c5350e6b8c 100644 --- a/src/f16-vhswish/f16-vhswish.inc +++ b/src/f16-vhswish/f16-vhswish.inc @@ -15,3 +15,7 @@ XNN_UKERNEL(xnn_arch_x86_f16c, xnn_f16_vhswish_ukernel__f16c_u8, 8, false, xnn_f XNN_UKERNEL(xnn_arch_x86_f16c, xnn_f16_vhswish_ukernel__f16c_u16, 16, false, xnn_float16, struct xnn_f16_default_params, NULL) #endif // XNN_ENABLE_F16C && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +#if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR +XNN_UKERNEL(xnn_arch_riscv_vector_fp16_arith, xnn_f16_vhswish_ukernel__rvvfp16arith_u4v, 4, true, 
xnn_float16, struct xnn_f16_default_params, NULL) +XNN_UKERNEL(xnn_arch_riscv_vector_fp16_arith, xnn_f16_vhswish_ukernel__rvvfp16arith_u8v, 8, true, xnn_float16, struct xnn_f16_default_params, NULL) +#endif // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR diff --git a/src/f16-vhswish/gen/f16-vhswish-rvvfp16arith-u4v.c b/src/f16-vhswish/gen/f16-vhswish-rvvfp16arith-u4v.c new file mode 100644 index 00000000000..a602a138339 --- /dev/null +++ b/src/f16-vhswish/gen/f16-vhswish-rvvfp16arith-u4v.c @@ -0,0 +1,50 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f16-vhswish/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2026 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/vunary.h" + + +void xnn_f16_vhswish_ukernel__rvvfp16arith_u4v( + size_t batch, + const xnn_float16* input, + xnn_float16* output, + const struct xnn_f16_default_params* restrict params) +{ + assert(batch != 0); + assert(batch % sizeof(xnn_float16) == 0); + assert(input != NULL); + assert(output != NULL); + + const xnn_float16 vsixth = 0x1.555556p-3f; + const xnn_float16 vthree = 3.0f; + const xnn_float16 vsix = 6.0f; + const xnn_float16 vzero = 0.0f; + + batch >>= XNN_LOG2_SIZEOF_FLOAT16; + do { + const size_t vl = __riscv_vsetvl_e16m4(batch); + vfloat16m4_t vx = __riscv_vle16_v_f16m4(input, vl); + input += vl; + vfloat16m4_t vacc = __riscv_vfadd(vx, vthree, vl); + vx = __riscv_vfmul(vx, vsixth, vl); + vacc = __riscv_vfmax(vacc, vzero, vl); + vacc = __riscv_vfmin(vacc, vsix, vl); + vacc = __riscv_vfmul(vacc, vx, vl); + __riscv_vse16(output, vacc, vl); + output += vl; + + batch -= vl; + } while (batch != 0); +} diff --git a/src/f16-vhswish/gen/f16-vhswish-rvvfp16arith-u8v.c b/src/f16-vhswish/gen/f16-vhswish-rvvfp16arith-u8v.c new file mode 100644 index 00000000000..f955e111f8d --- /dev/null 
+++ b/src/f16-vhswish/gen/f16-vhswish-rvvfp16arith-u8v.c @@ -0,0 +1,50 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f16-vhswish/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2026 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/vunary.h" + + +void xnn_f16_vhswish_ukernel__rvvfp16arith_u8v( + size_t batch, + const xnn_float16* input, + xnn_float16* output, + const struct xnn_f16_default_params* restrict params) +{ + assert(batch != 0); + assert(batch % sizeof(xnn_float16) == 0); + assert(input != NULL); + assert(output != NULL); + + const xnn_float16 vsixth = 0x1.555556p-3f; + const xnn_float16 vthree = 3.0f; + const xnn_float16 vsix = 6.0f; + const xnn_float16 vzero = 0.0f; + + batch >>= XNN_LOG2_SIZEOF_FLOAT16; + do { + const size_t vl = __riscv_vsetvl_e16m8(batch); + vfloat16m8_t vx = __riscv_vle16_v_f16m8(input, vl); + input += vl; + vfloat16m8_t vacc = __riscv_vfadd(vx, vthree, vl); + vx = __riscv_vfmul(vx, vsixth, vl); + vacc = __riscv_vfmax(vacc, vzero, vl); + vacc = __riscv_vfmin(vacc, vsix, vl); + vacc = __riscv_vfmul(vacc, vx, vl); + __riscv_vse16(output, vacc, vl); + output += vl; + + batch -= vl; + } while (batch != 0); +} diff --git a/src/f16-vhswish/rvv.c.in b/src/f16-vhswish/rvv.c.in new file mode 100644 index 00000000000..a843e06c34f --- /dev/null +++ b/src/f16-vhswish/rvv.c.in @@ -0,0 +1,46 @@ +// Copyright 2026 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +$assert LMUL in [1, 2, 4, 8] +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/vunary.h" + + +void xnn_f16_vhswish_ukernel__rvvfp16arith_u${LMUL}v( + size_t batch, + const xnn_float16* input, + xnn_float16* output, + const struct xnn_f16_default_params* restrict params) +{ + assert(batch != 0); + assert(batch % sizeof(xnn_float16) == 0); + assert(input != NULL); + assert(output != NULL); + + const xnn_float16 vsixth = 0x1.555556p-3f; + const xnn_float16 vthree = 3.0f; + const xnn_float16 vsix = 6.0f; + const xnn_float16 vzero = 0.0f; + + batch >>= XNN_LOG2_SIZEOF_FLOAT16; + do { + const size_t vl = __riscv_vsetvl_e16m${LMUL}(batch); + vfloat16m${LMUL}_t vx = __riscv_vle16_v_f16m${LMUL}(input, vl); + input += vl; + vfloat16m${LMUL}_t vacc = __riscv_vfadd(vx, vthree, vl); + vx = __riscv_vfmul(vx, vsixth, vl); + vacc = __riscv_vfmax(vacc, vzero, vl); + vacc = __riscv_vfmin(vacc, vsix, vl); + vacc = __riscv_vfmul(vacc, vx, vl); + __riscv_vse16(output, vacc, vl); + output += vl; + + batch -= vl; + } while (batch != 0); +} diff --git a/src/f16-vlrelu/f16-vlrelu.inc b/src/f16-vlrelu/f16-vlrelu.inc index a873595220d..30f02c9c426 100644 --- a/src/f16-vlrelu/f16-vlrelu.inc +++ b/src/f16-vlrelu/f16-vlrelu.inc @@ -15,3 +15,7 @@ XNN_UKERNEL(xnn_arch_x86_f16c, xnn_f16_vlrelu_ukernel__f16c_u8, 8, false, xnn_fl XNN_UKERNEL(xnn_arch_x86_f16c, xnn_f16_vlrelu_ukernel__f16c_u16, 16, false, xnn_float16, struct xnn_f16_lrelu_params, xnn_init_f16_lrelu_scalar_params) #endif // XNN_ENABLE_F16C && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +#if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR +XNN_UKERNEL(xnn_arch_riscv_vector_fp16_arith, xnn_f16_vlrelu_ukernel__rvvfp16arith_u4v, 4, true, xnn_float16, struct xnn_f16_lrelu_params, xnn_init_f16_lrelu_scalar_params) +XNN_UKERNEL(xnn_arch_riscv_vector_fp16_arith, xnn_f16_vlrelu_ukernel__rvvfp16arith_u8v, 8, true, xnn_float16, struct xnn_f16_lrelu_params, xnn_init_f16_lrelu_scalar_params) +#endif // XNN_ARCH_RISCV && 
XNN_ENABLE_RISCV_FP16_VECTOR diff --git a/src/f16-vlrelu/gen/f16-vlrelu-rvvfp16arith-u4v.c b/src/f16-vlrelu/gen/f16-vlrelu-rvvfp16arith-u4v.c new file mode 100644 index 00000000000..0254168f7ce --- /dev/null +++ b/src/f16-vlrelu/gen/f16-vlrelu-rvvfp16arith-u4v.c @@ -0,0 +1,40 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f16-vlrelu/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2026 Imagination Technologies, Inc. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include +#include "src/xnnpack/common.h" +#include "src/xnnpack/vunary.h" + +void xnn_f16_vlrelu_ukernel__rvvfp16arith_u4v( + size_t batch, + const xnn_float16* input, + xnn_float16* output, + const struct xnn_f16_lrelu_params* restrict params) +{ + assert(batch != 0); + assert(batch % sizeof(xnn_float16) == 0); + assert(input != NULL); + assert(output != NULL); + + const xnn_float16 slope = params->scalar.slope; + batch >>= XNN_LOG2_SIZEOF_FLOAT16; + + do { + size_t vl = __riscv_vsetvl_e16m4(batch); + vfloat16m4_t in_f16v = __riscv_vle16_v_f16m4(input, vl); input += vl; + vbool4_t mask_f16v = __riscv_vmflt(in_f16v, 0.0f, vl); + vfloat16m4_t out_f16v = __riscv_vfmul(mask_f16v, in_f16v, slope, vl); + __riscv_vse16(output, out_f16v, vl); output += vl; + + batch -= vl; + } while (batch != 0); +} diff --git a/src/f16-vlrelu/gen/f16-vlrelu-rvvfp16arith-u8v.c b/src/f16-vlrelu/gen/f16-vlrelu-rvvfp16arith-u8v.c new file mode 100644 index 00000000000..12f5cbaaa7d --- /dev/null +++ b/src/f16-vlrelu/gen/f16-vlrelu-rvvfp16arith-u8v.c @@ -0,0 +1,40 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f16-vlrelu/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2026 Imagination Technologies, Inc. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include + +#include +#include "src/xnnpack/common.h" +#include "src/xnnpack/vunary.h" + +void xnn_f16_vlrelu_ukernel__rvvfp16arith_u8v( + size_t batch, + const xnn_float16* input, + xnn_float16* output, + const struct xnn_f16_lrelu_params* restrict params) +{ + assert(batch != 0); + assert(batch % sizeof(xnn_float16) == 0); + assert(input != NULL); + assert(output != NULL); + + const xnn_float16 slope = params->scalar.slope; + batch >>= XNN_LOG2_SIZEOF_FLOAT16; + + do { + size_t vl = __riscv_vsetvl_e16m8(batch); + vfloat16m8_t in_f16v = __riscv_vle16_v_f16m8(input, vl); input += vl; + vbool2_t mask_f16v = __riscv_vmflt(in_f16v, 0.0f, vl); + vfloat16m8_t out_f16v = __riscv_vfmul(mask_f16v, in_f16v, slope, vl); + __riscv_vse16(output, out_f16v, vl); output += vl; + + batch -= vl; + } while (batch != 0); +} diff --git a/src/f16-vlrelu/rvv.c.in b/src/f16-vlrelu/rvv.c.in new file mode 100644 index 00000000000..062ff8426f5 --- /dev/null +++ b/src/f16-vlrelu/rvv.c.in @@ -0,0 +1,36 @@ +// Copyright 2026 Imagination Technologies, Inc. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +$assert LMUL in [1, 2, 4, 8] +#include + +#include +#include "src/xnnpack/common.h" +#include "src/xnnpack/vunary.h" + +void xnn_f16_vlrelu_ukernel__rvvfp16arith_u${LMUL}v( + size_t batch, + const xnn_float16* input, + xnn_float16* output, + const struct xnn_f16_lrelu_params* restrict params) +{ + assert(batch != 0); + assert(batch % sizeof(xnn_float16) == 0); + assert(input != NULL); + assert(output != NULL); + + const xnn_float16 slope = params->scalar.slope; + batch >>= XNN_LOG2_SIZEOF_FLOAT16; + + do { + size_t vl = __riscv_vsetvl_e16m${LMUL}(batch); + vfloat16m${LMUL}_t in_f16v = __riscv_vle16_v_f16m${LMUL}(input, vl); input += vl; + vbool${16//LMUL}_t mask_f16v = __riscv_vmflt(in_f16v, 0.0f, vl); + vfloat16m${LMUL}_t out_f16v = __riscv_vfmul(mask_f16v, in_f16v, slope, vl); + __riscv_vse16(output, out_f16v, vl); output += vl; + + batch -= vl; + } while (batch != 0); +} diff --git a/src/f16-vrnd/f16-vrndd.inc b/src/f16-vrnd/f16-vrndd.inc index 0293e05084c..4ba33362d86 100644 --- a/src/f16-vrnd/f16-vrndd.inc +++ b/src/f16-vrnd/f16-vrndd.inc @@ -15,3 +15,7 @@ XNN_UKERNEL(xnn_arch_x86_f16c, xnn_f16_vrndd_ukernel__f16c_u8, 8, false, xnn_flo XNN_UKERNEL(xnn_arch_x86_f16c, xnn_f16_vrndd_ukernel__f16c_u16, 16, false, xnn_float16, struct xnn_f16_default_params, NULL) #endif // XNN_ENABLE_F16C && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +#if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR +XNN_UKERNEL(xnn_arch_riscv_vector_fp16_arith, xnn_f16_vrndd_ukernel__rvvfp16arith_u4v, 4, true, xnn_float16, struct xnn_f16_default_params, NULL) +XNN_UKERNEL(xnn_arch_riscv_vector_fp16_arith, xnn_f16_vrndd_ukernel__rvvfp16arith_u8v, 8, true, xnn_float16, struct xnn_f16_default_params, NULL) +#endif // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR diff --git a/src/f16-vrnd/f16-vrndne.inc b/src/f16-vrnd/f16-vrndne.inc index cee077888e7..8e4e4a9e3ef 100644 --- a/src/f16-vrnd/f16-vrndne.inc +++ b/src/f16-vrnd/f16-vrndne.inc @@ -15,3 +15,7 @@ XNN_UKERNEL(xnn_arch_x86_f16c, 
xnn_f16_vrndne_ukernel__f16c_u8, 8, false, xnn_fl XNN_UKERNEL(xnn_arch_x86_f16c, xnn_f16_vrndne_ukernel__f16c_u16, 16, false, xnn_float16, struct xnn_f16_default_params, NULL) #endif // XNN_ENABLE_F16C && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +#if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR +XNN_UKERNEL(xnn_arch_riscv_vector_fp16_arith, xnn_f16_vrndne_ukernel__rvvfp16arith_u4v, 4, true, xnn_float16, struct xnn_f16_default_params, NULL) +XNN_UKERNEL(xnn_arch_riscv_vector_fp16_arith, xnn_f16_vrndne_ukernel__rvvfp16arith_u8v, 8, true, xnn_float16, struct xnn_f16_default_params, NULL) +#endif // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR diff --git a/src/f16-vrnd/f16-vrndu.inc b/src/f16-vrnd/f16-vrndu.inc index bf2a2a3426b..7414e88b336 100644 --- a/src/f16-vrnd/f16-vrndu.inc +++ b/src/f16-vrnd/f16-vrndu.inc @@ -15,3 +15,7 @@ XNN_UKERNEL(xnn_arch_x86_f16c, xnn_f16_vrndu_ukernel__f16c_u8, 8, false, xnn_flo XNN_UKERNEL(xnn_arch_x86_f16c, xnn_f16_vrndu_ukernel__f16c_u16, 16, false, xnn_float16, struct xnn_f16_default_params, NULL) #endif // XNN_ENABLE_F16C && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +#if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR +XNN_UKERNEL(xnn_arch_riscv_vector_fp16_arith, xnn_f16_vrndu_ukernel__rvvfp16arith_u4v, 4, true, xnn_float16, struct xnn_f16_default_params, NULL) +XNN_UKERNEL(xnn_arch_riscv_vector_fp16_arith, xnn_f16_vrndu_ukernel__rvvfp16arith_u8v, 8, true, xnn_float16, struct xnn_f16_default_params, NULL) +#endif // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR diff --git a/src/f16-vrnd/f16-vrndz.inc b/src/f16-vrnd/f16-vrndz.inc index d9af324eb64..d971ce2e6b6 100644 --- a/src/f16-vrnd/f16-vrndz.inc +++ b/src/f16-vrnd/f16-vrndz.inc @@ -15,3 +15,8 @@ XNN_UKERNEL(xnn_arch_x86_f16c, xnn_f16_vrndz_ukernel__f16c_u8, 8, false, xnn_flo XNN_UKERNEL(xnn_arch_x86_f16c, xnn_f16_vrndz_ukernel__f16c_u16, 16, false, xnn_float16, struct xnn_f16_default_params, NULL) #endif // XNN_ENABLE_F16C && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +#if XNN_ARCH_RISCV && 
XNN_ENABLE_RISCV_FP16_VECTOR +XNN_UKERNEL(xnn_arch_riscv_vector_fp16_arith, xnn_f16_vrndz_ukernel__rvvfp16arith_u4v, 4, true, xnn_float16, struct xnn_f16_default_params, NULL) +XNN_UKERNEL(xnn_arch_riscv_vector_fp16_arith, xnn_f16_vrndz_ukernel__rvvfp16arith_u8v, 8, true, xnn_float16, struct xnn_f16_default_params, NULL) +#endif // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR + diff --git a/src/f16-vrnd/gen/f16-vrndd-rvvfp16arith-u4v.c b/src/f16-vrnd/gen/f16-vrndd-rvvfp16arith-u4v.c new file mode 100644 index 00000000000..6d492e9153f --- /dev/null +++ b/src/f16-vrnd/gen/f16-vrndd-rvvfp16arith-u4v.c @@ -0,0 +1,50 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f16-vrnd/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2026 Imagination Technologies inc. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/math.h" +#include "src/xnnpack/vunary.h" + + +void xnn_f16_vrndd_ukernel__rvvfp16arith_u4v( + size_t batch, + const xnn_float16* input, + xnn_float16* output, + const struct xnn_f16_default_params* restrict params) XNN_OOB_READS +{ + assert(batch != 0); + assert(batch % sizeof(xnn_float16) == 0); + assert(input != NULL); + assert(output != NULL); + + batch >>= XNN_LOG2_SIZEOF_FLOAT16; + do { + const size_t vl = __riscv_vsetvl_e16m4(batch); + vfloat16m4_t x_f16v = __riscv_vle16_v_f16m4(input, vl); input += vl; + + // preserve NaN + vbool4_t nan_bv = __riscv_vmfeq(x_f16v, x_f16v, vl); + // magnitude < (1 << FLT16_MANT_DIG) + vfloat16m4_t mag = __riscv_vfabs(x_f16v, vl); + vbool4_t mag_bv = __riscv_vmflt(mag, (1 << __FLT16_MANT_DIG__), vl); + vbool4_t mask_bv = __riscv_vmnand(nan_bv, mag_bv, vl); + + vint16m4_t x_rnd_i16v = __riscv_vfcvt_x(x_f16v, __RISCV_FRM_RDN, vl); + vfloat16m4_t out_f16v = __riscv_vfcvt_f(x_rnd_i16v, vl); + out_f16v = 
__riscv_vmerge(out_f16v, x_f16v, mask_bv, vl); + __riscv_vse16(output, out_f16v, vl); output += vl; + batch -= vl; + } while (batch != 0); +} diff --git a/src/f16-vrnd/gen/f16-vrndd-rvvfp16arith-u8v.c b/src/f16-vrnd/gen/f16-vrndd-rvvfp16arith-u8v.c new file mode 100644 index 00000000000..d8a044e3dfc --- /dev/null +++ b/src/f16-vrnd/gen/f16-vrndd-rvvfp16arith-u8v.c @@ -0,0 +1,50 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f16-vrnd/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2026 Imagination Technologies inc. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/math.h" +#include "src/xnnpack/vunary.h" + + +void xnn_f16_vrndd_ukernel__rvvfp16arith_u8v( + size_t batch, + const xnn_float16* input, + xnn_float16* output, + const struct xnn_f16_default_params* restrict params) XNN_OOB_READS +{ + assert(batch != 0); + assert(batch % sizeof(xnn_float16) == 0); + assert(input != NULL); + assert(output != NULL); + + batch >>= XNN_LOG2_SIZEOF_FLOAT16; + do { + const size_t vl = __riscv_vsetvl_e16m8(batch); + vfloat16m8_t x_f16v = __riscv_vle16_v_f16m8(input, vl); input += vl; + + // preserve NaN + vbool2_t nan_bv = __riscv_vmfeq(x_f16v, x_f16v, vl); + // magnitude < (1 << FLT16_MANT_DIG) + vfloat16m8_t mag = __riscv_vfabs(x_f16v, vl); + vbool2_t mag_bv = __riscv_vmflt(mag, (1 << __FLT16_MANT_DIG__), vl); + vbool2_t mask_bv = __riscv_vmnand(nan_bv, mag_bv, vl); + + vint16m8_t x_rnd_i16v = __riscv_vfcvt_x(x_f16v, __RISCV_FRM_RDN, vl); + vfloat16m8_t out_f16v = __riscv_vfcvt_f(x_rnd_i16v, vl); + out_f16v = __riscv_vmerge(out_f16v, x_f16v, mask_bv, vl); + __riscv_vse16(output, out_f16v, vl); output += vl; + batch -= vl; + } while (batch != 0); +} diff --git a/src/f16-vrnd/gen/f16-vrndne-rvvfp16arith-u4v.c b/src/f16-vrnd/gen/f16-vrndne-rvvfp16arith-u4v.c new file 
mode 100644 index 00000000000..8721794794a --- /dev/null +++ b/src/f16-vrnd/gen/f16-vrndne-rvvfp16arith-u4v.c @@ -0,0 +1,50 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f16-vrnd/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2026 Imagination Technologies inc. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/math.h" +#include "src/xnnpack/vunary.h" + + +void xnn_f16_vrndne_ukernel__rvvfp16arith_u4v( + size_t batch, + const xnn_float16* input, + xnn_float16* output, + const struct xnn_f16_default_params* restrict params) XNN_OOB_READS +{ + assert(batch != 0); + assert(batch % sizeof(xnn_float16) == 0); + assert(input != NULL); + assert(output != NULL); + + batch >>= XNN_LOG2_SIZEOF_FLOAT16; + do { + const size_t vl = __riscv_vsetvl_e16m4(batch); + vfloat16m4_t x_f16v = __riscv_vle16_v_f16m4(input, vl); input += vl; + + // preserve NaN + vbool4_t nan_bv = __riscv_vmfeq(x_f16v, x_f16v, vl); + // magnitude < (1 << FLT16_MANT_DIG) + vfloat16m4_t mag = __riscv_vfabs(x_f16v, vl); + vbool4_t mag_bv = __riscv_vmflt(mag, (1 << __FLT16_MANT_DIG__), vl); + vbool4_t mask_bv = __riscv_vmnand(nan_bv, mag_bv, vl); + + vint16m4_t x_rnd_i16v = __riscv_vfcvt_x(x_f16v, __RISCV_FRM_RNE, vl); + vfloat16m4_t out_f16v = __riscv_vfcvt_f(x_rnd_i16v, vl); + out_f16v = __riscv_vmerge(out_f16v, x_f16v, mask_bv, vl); + __riscv_vse16(output, out_f16v, vl); output += vl; + batch -= vl; + } while (batch != 0); +} diff --git a/src/f16-vrnd/gen/f16-vrndne-rvvfp16arith-u8v.c b/src/f16-vrnd/gen/f16-vrndne-rvvfp16arith-u8v.c new file mode 100644 index 00000000000..e93ea8dc05e --- /dev/null +++ b/src/f16-vrnd/gen/f16-vrndne-rvvfp16arith-u8v.c @@ -0,0 +1,50 @@ +// clang-format off +// Auto-generated file. Do not edit! 
+// Template: src/f16-vrnd/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2026 Imagination Technologies inc. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/math.h" +#include "src/xnnpack/vunary.h" + + +void xnn_f16_vrndne_ukernel__rvvfp16arith_u8v( + size_t batch, + const xnn_float16* input, + xnn_float16* output, + const struct xnn_f16_default_params* restrict params) XNN_OOB_READS +{ + assert(batch != 0); + assert(batch % sizeof(xnn_float16) == 0); + assert(input != NULL); + assert(output != NULL); + + batch >>= XNN_LOG2_SIZEOF_FLOAT16; + do { + const size_t vl = __riscv_vsetvl_e16m8(batch); + vfloat16m8_t x_f16v = __riscv_vle16_v_f16m8(input, vl); input += vl; + + // preserve NaN + vbool2_t nan_bv = __riscv_vmfeq(x_f16v, x_f16v, vl); + // magnitude < (1 << FLT16_MANT_DIG) + vfloat16m8_t mag = __riscv_vfabs(x_f16v, vl); + vbool2_t mag_bv = __riscv_vmflt(mag, (1 << __FLT16_MANT_DIG__), vl); + vbool2_t mask_bv = __riscv_vmnand(nan_bv, mag_bv, vl); + + vint16m8_t x_rnd_i16v = __riscv_vfcvt_x(x_f16v, __RISCV_FRM_RNE, vl); + vfloat16m8_t out_f16v = __riscv_vfcvt_f(x_rnd_i16v, vl); + out_f16v = __riscv_vmerge(out_f16v, x_f16v, mask_bv, vl); + __riscv_vse16(output, out_f16v, vl); output += vl; + batch -= vl; + } while (batch != 0); +} diff --git a/src/f16-vrnd/gen/f16-vrndu-rvvfp16arith-u4v.c b/src/f16-vrnd/gen/f16-vrndu-rvvfp16arith-u4v.c new file mode 100644 index 00000000000..6379b914eb0 --- /dev/null +++ b/src/f16-vrnd/gen/f16-vrndu-rvvfp16arith-u4v.c @@ -0,0 +1,50 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f16-vrnd/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2026 Imagination Technologies inc. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/math.h" +#include "src/xnnpack/vunary.h" + + +void xnn_f16_vrndu_ukernel__rvvfp16arith_u4v( + size_t batch, + const xnn_float16* input, + xnn_float16* output, + const struct xnn_f16_default_params* restrict params) XNN_OOB_READS +{ + assert(batch != 0); + assert(batch % sizeof(xnn_float16) == 0); + assert(input != NULL); + assert(output != NULL); + + batch >>= XNN_LOG2_SIZEOF_FLOAT16; + do { + const size_t vl = __riscv_vsetvl_e16m4(batch); + vfloat16m4_t x_f16v = __riscv_vle16_v_f16m4(input, vl); input += vl; + + // preserve NaN + vbool4_t nan_bv = __riscv_vmfeq(x_f16v, x_f16v, vl); + // magnitude < (1 << FLT16_MANT_DIG) + vfloat16m4_t mag = __riscv_vfabs(x_f16v, vl); + vbool4_t mag_bv = __riscv_vmflt(mag, (1 << __FLT16_MANT_DIG__), vl); + vbool4_t mask_bv = __riscv_vmnand(nan_bv, mag_bv, vl); + + vint16m4_t x_rnd_i16v = __riscv_vfcvt_x(x_f16v, __RISCV_FRM_RUP, vl); + vfloat16m4_t out_f16v = __riscv_vfcvt_f(x_rnd_i16v, vl); + out_f16v = __riscv_vmerge(out_f16v, x_f16v, mask_bv, vl); + __riscv_vse16(output, out_f16v, vl); output += vl; + batch -= vl; + } while (batch != 0); +} diff --git a/src/f16-vrnd/gen/f16-vrndu-rvvfp16arith-u8v.c b/src/f16-vrnd/gen/f16-vrndu-rvvfp16arith-u8v.c new file mode 100644 index 00000000000..723455ec821 --- /dev/null +++ b/src/f16-vrnd/gen/f16-vrndu-rvvfp16arith-u8v.c @@ -0,0 +1,50 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f16-vrnd/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2026 Imagination Technologies inc. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/math.h" +#include "src/xnnpack/vunary.h" + + +void xnn_f16_vrndu_ukernel__rvvfp16arith_u8v( + size_t batch, + const xnn_float16* input, + xnn_float16* output, + const struct xnn_f16_default_params* restrict params) XNN_OOB_READS +{ + assert(batch != 0); + assert(batch % sizeof(xnn_float16) == 0); + assert(input != NULL); + assert(output != NULL); + + batch >>= XNN_LOG2_SIZEOF_FLOAT16; + do { + const size_t vl = __riscv_vsetvl_e16m8(batch); + vfloat16m8_t x_f16v = __riscv_vle16_v_f16m8(input, vl); input += vl; + + // preserve NaN + vbool2_t nan_bv = __riscv_vmfeq(x_f16v, x_f16v, vl); + // magnitude < (1 << FLT16_MANT_DIG) + vfloat16m8_t mag = __riscv_vfabs(x_f16v, vl); + vbool2_t mag_bv = __riscv_vmflt(mag, (1 << __FLT16_MANT_DIG__), vl); + vbool2_t mask_bv = __riscv_vmnand(nan_bv, mag_bv, vl); + + vint16m8_t x_rnd_i16v = __riscv_vfcvt_x(x_f16v, __RISCV_FRM_RUP, vl); + vfloat16m8_t out_f16v = __riscv_vfcvt_f(x_rnd_i16v, vl); + out_f16v = __riscv_vmerge(out_f16v, x_f16v, mask_bv, vl); + __riscv_vse16(output, out_f16v, vl); output += vl; + batch -= vl; + } while (batch != 0); +} diff --git a/src/f16-vrnd/gen/f16-vrndz-rvvfp16arith-u4v.c b/src/f16-vrnd/gen/f16-vrndz-rvvfp16arith-u4v.c new file mode 100644 index 00000000000..f25a850016d --- /dev/null +++ b/src/f16-vrnd/gen/f16-vrndz-rvvfp16arith-u4v.c @@ -0,0 +1,50 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f16-vrnd/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2026 Imagination Technologies inc. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/math.h" +#include "src/xnnpack/vunary.h" + + +void xnn_f16_vrndz_ukernel__rvvfp16arith_u4v( + size_t batch, + const xnn_float16* input, + xnn_float16* output, + const struct xnn_f16_default_params* restrict params) XNN_OOB_READS +{ + assert(batch != 0); + assert(batch % sizeof(xnn_float16) == 0); + assert(input != NULL); + assert(output != NULL); + + batch >>= XNN_LOG2_SIZEOF_FLOAT16; + do { + const size_t vl = __riscv_vsetvl_e16m4(batch); + vfloat16m4_t x_f16v = __riscv_vle16_v_f16m4(input, vl); input += vl; + + // preserve NaN + vbool4_t nan_bv = __riscv_vmfeq(x_f16v, x_f16v, vl); + // magnitude < (1 << FLT16_MANT_DIG) + vfloat16m4_t mag = __riscv_vfabs(x_f16v, vl); + vbool4_t mag_bv = __riscv_vmflt(mag, (1 << __FLT16_MANT_DIG__), vl); + vbool4_t mask_bv = __riscv_vmnand(nan_bv, mag_bv, vl); + + vint16m4_t x_rnd_i16v = __riscv_vfcvt_x(x_f16v, __RISCV_FRM_RTZ, vl); + vfloat16m4_t out_f16v = __riscv_vfcvt_f(x_rnd_i16v, vl); + out_f16v = __riscv_vmerge(out_f16v, x_f16v, mask_bv, vl); + __riscv_vse16(output, out_f16v, vl); output += vl; + batch -= vl; + } while (batch != 0); +} diff --git a/src/f16-vrnd/gen/f16-vrndz-rvvfp16arith-u8v.c b/src/f16-vrnd/gen/f16-vrndz-rvvfp16arith-u8v.c new file mode 100644 index 00000000000..591e7a99fd9 --- /dev/null +++ b/src/f16-vrnd/gen/f16-vrndz-rvvfp16arith-u8v.c @@ -0,0 +1,50 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f16-vrnd/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2026 Imagination Technologies inc. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/math.h" +#include "src/xnnpack/vunary.h" + + +void xnn_f16_vrndz_ukernel__rvvfp16arith_u8v( + size_t batch, + const xnn_float16* input, + xnn_float16* output, + const struct xnn_f16_default_params* restrict params) XNN_OOB_READS +{ + assert(batch != 0); + assert(batch % sizeof(xnn_float16) == 0); + assert(input != NULL); + assert(output != NULL); + + batch >>= XNN_LOG2_SIZEOF_FLOAT16; + do { + const size_t vl = __riscv_vsetvl_e16m8(batch); + vfloat16m8_t x_f16v = __riscv_vle16_v_f16m8(input, vl); input += vl; + + // preserve NaN + vbool2_t nan_bv = __riscv_vmfeq(x_f16v, x_f16v, vl); + // magnitude < (1 << FLT16_MANT_DIG) + vfloat16m8_t mag = __riscv_vfabs(x_f16v, vl); + vbool2_t mag_bv = __riscv_vmflt(mag, (1 << __FLT16_MANT_DIG__), vl); + vbool2_t mask_bv = __riscv_vmnand(nan_bv, mag_bv, vl); + + vint16m8_t x_rnd_i16v = __riscv_vfcvt_x(x_f16v, __RISCV_FRM_RTZ, vl); + vfloat16m8_t out_f16v = __riscv_vfcvt_f(x_rnd_i16v, vl); + out_f16v = __riscv_vmerge(out_f16v, x_f16v, mask_bv, vl); + __riscv_vse16(output, out_f16v, vl); output += vl; + batch -= vl; + } while (batch != 0); +} diff --git a/src/f16-vrnd/rvv.c.in b/src/f16-vrnd/rvv.c.in new file mode 100644 index 00000000000..25507cabe40 --- /dev/null +++ b/src/f16-vrnd/rvv.c.in @@ -0,0 +1,53 @@ +// Copyright 2026 Imagination Technologies inc. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +$assert LMUL in [1, 2, 4, 8] +$assert OP in ["RNDNE", "RNDZ", "RNDU", "RNDD"] +#include +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/math.h" +#include "src/xnnpack/vunary.h" + + +$ROUND_MODE = { +$ "RNDNE": "__RISCV_FRM_RNE", +$ "RNDZ": "__RISCV_FRM_RTZ", +$ "RNDU": "__RISCV_FRM_RUP", +$ "RNDD": "__RISCV_FRM_RDN", +$}[OP] +void xnn_f16_v${OP.lower()}_ukernel__rvvfp16arith_u${LMUL}v( + size_t batch, + const xnn_float16* input, + xnn_float16* output, + const struct xnn_f16_default_params* restrict params) XNN_OOB_READS +{ + assert(batch != 0); + assert(batch % sizeof(xnn_float16) == 0); + assert(input != NULL); + assert(output != NULL); + + batch >>= XNN_LOG2_SIZEOF_FLOAT16; + do { + const size_t vl = __riscv_vsetvl_e16m${LMUL}(batch); + vfloat16m${LMUL}_t x_f16v = __riscv_vle16_v_f16m${LMUL}(input, vl); input += vl; + + // preserve NaN + vbool${16//LMUL}_t nan_bv = __riscv_vmfeq(x_f16v, x_f16v, vl); + // magnitude < (1 << FLT16_MANT_DIG) + vfloat16m${LMUL}_t mag = __riscv_vfabs(x_f16v, vl); + vbool${16//LMUL}_t mag_bv = __riscv_vmflt(mag, (1 << __FLT16_MANT_DIG__), vl); + vbool${16//LMUL}_t mask_bv = __riscv_vmnand(nan_bv, mag_bv, vl); + + vint16m${LMUL}_t x_rnd_i16v = __riscv_vfcvt_x(x_f16v, ${ROUND_MODE}, vl); + vfloat16m${LMUL}_t out_f16v = __riscv_vfcvt_f(x_rnd_i16v, vl); + out_f16v = __riscv_vmerge(out_f16v, x_f16v, mask_bv, vl); + __riscv_vse16(output, out_f16v, vl); output += vl; + batch -= vl; + } while (batch != 0); +} diff --git a/src/f16-vrsqrt/f16-vrsqrt.inc b/src/f16-vrsqrt/f16-vrsqrt.inc index cae6c9336eb..f20c36abb3f 100644 --- a/src/f16-vrsqrt/f16-vrsqrt.inc +++ b/src/f16-vrsqrt/f16-vrsqrt.inc @@ -17,3 +17,7 @@ XNN_UKERNEL(xnn_arch_x86_f16c, xnn_f16_vrsqrt_ukernel__f16c_rsqrt_u16, 16, false XNN_UKERNEL(xnn_arch_x86_f16c, xnn_f16_vrsqrt_ukernel__f16c_rsqrt_u32, 32, false, xnn_float16, struct xnn_f16_default_params, NULL) #endif // XNN_ENABLE_F16C && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +#if XNN_ARCH_RISCV && 
XNN_ENABLE_RISCV_FP16_VECTOR +XNN_UKERNEL(xnn_arch_riscv_vector_fp16_arith, xnn_f16_vrsqrt_ukernel__rvvfp16arith_rsqrt_u2v, 2, true, xnn_float16, struct xnn_f16_default_params, NULL) +XNN_UKERNEL(xnn_arch_riscv_vector_fp16_arith, xnn_f16_vrsqrt_ukernel__rvvfp16arith_rsqrt_u4v, 4, true, xnn_float16, struct xnn_f16_default_params, NULL) +#endif // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR diff --git a/src/f16-vrsqrt/gen/f16-vrsqrt-rvvfp16arith-rsqrt-u2v.c b/src/f16-vrsqrt/gen/f16-vrsqrt-rvvfp16arith-rsqrt-u2v.c new file mode 100644 index 00000000000..e7a7bb84c84 --- /dev/null +++ b/src/f16-vrsqrt/gen/f16-vrsqrt-rvvfp16arith-rsqrt-u2v.c @@ -0,0 +1,79 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f16-vrsqrt/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2026 Imagination Technologies, inc. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/vunary.h" + +// In the following, we do a single Newton-Raphson step on the equation +// $x^{-2} - a$, which expands to: +// +// $$x_{k+1} = 0.5 * x_k * (3.0 - a * x_k^2)$$ +// +// So we do the following steps: +// +// 1. t0 = x_k +// 2. t1 = t0 * t0 (x_k^2) +// 3. t2 = a * t1 (a * x_k^2) +// 4. t3 = 3.0 - t2 (3.0 - a * x_k^2) +// 5. t4 = 0.5 * t0 (0.5 * x_k) +// 6. y = t3 * t4 (0.5 * x_k * (3.0 - a * x_k^2)) +// +// Where $x_k$ is the original approximation and `y` contains the improved +// approximation $x_{k+1}$. +// +// Note also that the initial approximation computed by the `vfrsqrt7` +// instruction is only accurate to 7 bits (as opposed to 12 or 14 for x86_64), +// which requires us to do two steps of the above. 
+ +void xnn_f16_vrsqrt_ukernel__rvvfp16arith_rsqrt_u2v( + size_t batch, + const xnn_float16* input, + xnn_float16* output, + const struct xnn_f16_default_params* restrict params) XNN_OOB_READS +{ + assert(batch != 0); + assert(batch % sizeof(xnn_float16) == 0); + assert(input != NULL); + assert(output != NULL); + + batch >>= XNN_LOG2_SIZEOF_FLOAT16; + + vfloat16m2_t onephalf_f16v = __riscv_vfmv_v_f_f16m2(1.5f, __riscv_vsetvl_e16m2(batch)); + vfloat16m2_t zero_f16v = __riscv_vfmv_v_f_f16m2(0.0f, __riscv_vsetvl_e16m2(batch)); + + for (; batch > 0; ) { + size_t n = __riscv_vsetvl_e16m2(batch); batch -= n; + vfloat16m2_t in_f16v = __riscv_vle16_v_f16m2(input, n); input += n; + + vfloat16m2_t t0_f16v = __riscv_vfrsqrt7(in_f16v, n); + vfloat16m2_t in_half_f16v = __riscv_vfmul(in_f16v, 0.5f, n); + + // First Newton-Raphson iteration + vfloat16m2_t t1_f16v = __riscv_vfmul(t0_f16v, t0_f16v, n); + vfloat16m2_t t2_f16v = __riscv_vfnmsub_vv_f16m2(in_half_f16v, t1_f16v, onephalf_f16v, n); + t0_f16v = __riscv_vfmul(t0_f16v, t2_f16v, n); + + // Second Newton-Raphson iteration + t1_f16v = __riscv_vfmul(t0_f16v, t0_f16v, n); + t2_f16v = __riscv_vfnmsub_vv_f16m2(in_half_f16v, t1_f16v, onephalf_f16v, n); + vfloat16m2_t y_f16v = __riscv_vfmul(t0_f16v, t2_f16v, n); + + // Set output to 0 where the input is infinity (and not NaN) + vbool8_t inf_bv = __riscv_vmfeq(in_f16v, INFINITY, n); + y_f16v = __riscv_vmerge(y_f16v, zero_f16v, inf_bv, n); + + __riscv_vse16(output, y_f16v, n); output += n; + } +} diff --git a/src/f16-vrsqrt/gen/f16-vrsqrt-rvvfp16arith-rsqrt-u4v.c b/src/f16-vrsqrt/gen/f16-vrsqrt-rvvfp16arith-rsqrt-u4v.c new file mode 100644 index 00000000000..e282f5f5aa0 --- /dev/null +++ b/src/f16-vrsqrt/gen/f16-vrsqrt-rvvfp16arith-rsqrt-u4v.c @@ -0,0 +1,79 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f16-vrsqrt/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2026 Imagination Technologies, inc. 
+// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/vunary.h" + +// In the following, we do a single Newton-Raphson step on the equation +// $x^{-2} - a$, which expands to: +// +// $$x_{k+1} = 0.5 * x_k * (3.0 - a * x_k^2)$$ +// +// So we do the following steps: +// +// 1. t0 = x_k +// 2. t1 = t0 * t0 (x_k^2) +// 3. t2 = a * t1 (a * x_k^2) +// 4. t3 = 3.0 - t2 (3.0 - a * x_k^2) +// 5. t4 = 0.5 * t0 (0.5 * x_k) +// 6. y = t3 * t4 (0.5 * x_k * (3.0 - a * x_k^2)) +// +// Where $x_k$ is the original approximation and `y` contains the improved +// approximation $x_{k+1}$. +// +// Note also that the initial approximation computed by the `vfrsqrt7` +// instruction is only accurate to 7 bits (as opposed to 12 or 14 for x86_64), +// which requires us to do two steps of the above. + +void xnn_f16_vrsqrt_ukernel__rvvfp16arith_rsqrt_u4v( + size_t batch, + const xnn_float16* input, + xnn_float16* output, + const struct xnn_f16_default_params* restrict params) XNN_OOB_READS +{ + assert(batch != 0); + assert(batch % sizeof(xnn_float16) == 0); + assert(input != NULL); + assert(output != NULL); + + batch >>= XNN_LOG2_SIZEOF_FLOAT16; + + vfloat16m4_t onephalf_f16v = __riscv_vfmv_v_f_f16m4(1.5f, __riscv_vsetvl_e16m4(batch)); + vfloat16m4_t zero_f16v = __riscv_vfmv_v_f_f16m4(0.0f, __riscv_vsetvl_e16m4(batch)); + + for (; batch > 0; ) { + size_t n = __riscv_vsetvl_e16m4(batch); batch -= n; + vfloat16m4_t in_f16v = __riscv_vle16_v_f16m4(input, n); input += n; + + vfloat16m4_t t0_f16v = __riscv_vfrsqrt7(in_f16v, n); + vfloat16m4_t in_half_f16v = __riscv_vfmul(in_f16v, 0.5f, n); + + // First Newton-Raphson iteration + vfloat16m4_t t1_f16v = __riscv_vfmul(t0_f16v, t0_f16v, n); + vfloat16m4_t t2_f16v = __riscv_vfnmsub_vv_f16m4(in_half_f16v, t1_f16v, onephalf_f16v, n); + t0_f16v = __riscv_vfmul(t0_f16v, t2_f16v, n); 
+ + // Second Newton-Raphson iteration + t1_f16v = __riscv_vfmul(t0_f16v, t0_f16v, n); + t2_f16v = __riscv_vfnmsub_vv_f16m4(in_half_f16v, t1_f16v, onephalf_f16v, n); + vfloat16m4_t y_f16v = __riscv_vfmul(t0_f16v, t2_f16v, n); + + // Set output to 0 where the input is infinity (and not NaN) + vbool4_t inf_bv = __riscv_vmfeq(in_f16v, INFINITY, n); + y_f16v = __riscv_vmerge(y_f16v, zero_f16v, inf_bv, n); + + __riscv_vse16(output, y_f16v, n); output += n; + } +} diff --git a/src/f16-vrsqrt/rvv.c.in b/src/f16-vrsqrt/rvv.c.in new file mode 100644 index 00000000000..100fd190b92 --- /dev/null +++ b/src/f16-vrsqrt/rvv.c.in @@ -0,0 +1,75 @@ +// Copyright 2026 Imagination Technologies, inc. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +$assert LMUL in [1, 2, 4, 8] +#include +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/vunary.h" + +// In the following, we do a single Newton-Raphson step on the equation +// $x^{-2} - a$, which expands to: +// +// $$x_{k+1} = 0.5 * x_k * (3.0 - a * x_k^2)$$ +// +// So we do the following steps: +// +// 1. t0 = x_k +// 2. t1 = t0 * t0 (x_k^2) +// 3. t2 = a * t1 (a * x_k^2) +// 4. t3 = 3.0 - t2 (3.0 - a * x_k^2) +// 5. t4 = 0.5 * t0 (0.5 * x_k) +// 6. y = t3 * t4 (0.5 * x_k * (3.0 - a * x_k^2)) +// +// Where $x_k$ is the original approximation and `y` contains the improved +// approximation $x_{k+1}$. +// +// Note also that the initial approximation computed by the `vfrsqrt7` +// instruction is only accurate to 7 bits (as opposed to 12 or 14 for x86_64), +// which requires us to do two steps of the above. 
+ +void xnn_f16_vrsqrt_ukernel__rvvfp16arith_rsqrt_u${LMUL}v( + size_t batch, + const xnn_float16* input, + xnn_float16* output, + const struct xnn_f16_default_params* restrict params) XNN_OOB_READS +{ + assert(batch != 0); + assert(batch % sizeof(xnn_float16) == 0); + assert(input != NULL); + assert(output != NULL); + + batch >>= XNN_LOG2_SIZEOF_FLOAT16; + + vfloat16m${LMUL}_t onephalf_f16v = __riscv_vfmv_v_f_f16m${LMUL}(1.5f, __riscv_vsetvl_e16m${LMUL}(batch)); + vfloat16m${LMUL}_t zero_f16v = __riscv_vfmv_v_f_f16m${LMUL}(0.0f, __riscv_vsetvl_e16m${LMUL}(batch)); + + for (; batch > 0; ) { + size_t n = __riscv_vsetvl_e16m${LMUL}(batch); batch -= n; + vfloat16m${LMUL}_t in_f16v = __riscv_vle16_v_f16m${LMUL}(input, n); input += n; + + vfloat16m${LMUL}_t t0_f16v = __riscv_vfrsqrt7(in_f16v, n); + vfloat16m${LMUL}_t in_half_f16v = __riscv_vfmul(in_f16v, 0.5f, n); + + // First Newton-Raphson iteration + vfloat16m${LMUL}_t t1_f16v = __riscv_vfmul(t0_f16v, t0_f16v, n); + vfloat16m${LMUL}_t t2_f16v = __riscv_vfnmsub_vv_f16m${LMUL}(in_half_f16v, t1_f16v, onephalf_f16v, n); + t0_f16v = __riscv_vfmul(t0_f16v, t2_f16v, n); + + // Second Newton-Raphson iteration + t1_f16v = __riscv_vfmul(t0_f16v, t0_f16v, n); + t2_f16v = __riscv_vfnmsub_vv_f16m${LMUL}(in_half_f16v, t1_f16v, onephalf_f16v, n); + vfloat16m${LMUL}_t y_f16v = __riscv_vfmul(t0_f16v, t2_f16v, n); + + // Set output to 0 where the input is infinity (and not NaN) + vbool${16//LMUL}_t inf_bv = __riscv_vmfeq(in_f16v, INFINITY, n); + y_f16v = __riscv_vmerge(y_f16v, zero_f16v, inf_bv, n); + + __riscv_vse16(output, y_f16v, n); output += n; + } +} diff --git a/src/f16-vsqrt/f16-vsqrt.inc b/src/f16-vsqrt/f16-vsqrt.inc index 71e5838da0b..47f0c78971f 100644 --- a/src/f16-vsqrt/f16-vsqrt.inc +++ b/src/f16-vsqrt/f16-vsqrt.inc @@ -44,3 +44,7 @@ XNN_UKERNEL(xnn_arch_x86_avx512skx, xnn_f16_vsqrt_ukernel__avx512skx_sqrt_u32, 3 XNN_UKERNEL(xnn_arch_x86_avx512skx, xnn_f16_vsqrt_ukernel__avx512skx_sqrt_u64, 64, false, xnn_float16, struct 
xnn_f16_default_params, NULL) #endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +#if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR +XNN_UKERNEL(xnn_arch_riscv_vector_fp16_arith, xnn_f16_vsqrt_ukernel__rvvfp16arith_sqrt_u4v, 4, true, xnn_float16, struct xnn_f16_default_params, NULL) +XNN_UKERNEL(xnn_arch_riscv_vector_fp16_arith, xnn_f16_vsqrt_ukernel__rvvfp16arith_sqrt_u8v, 8, true, xnn_float16, struct xnn_f16_default_params, NULL) +#endif // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR diff --git a/src/f16-vsqrt/gen/f16-vsqrt-rvvfp16arith-sqrt-u4v.c b/src/f16-vsqrt/gen/f16-vsqrt-rvvfp16arith-sqrt-u4v.c new file mode 100644 index 00000000000..6be36e71412 --- /dev/null +++ b/src/f16-vsqrt/gen/f16-vsqrt-rvvfp16arith-sqrt-u4v.c @@ -0,0 +1,42 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f16-vsqrt/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2026 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+
+#include <assert.h>
+#include <stddef.h>
+
+#include <riscv_vector.h>
+
+#include "src/xnnpack/common.h"
+#include "src/xnnpack/vunary.h"
+
+
+void xnn_f16_vsqrt_ukernel__rvvfp16arith_sqrt_u4v(
+    size_t batch,
+    const xnn_float16* input,
+    xnn_float16* output,
+    const struct xnn_f16_default_params* restrict params)
+{
+  assert(batch != 0);
+  assert(batch % sizeof(xnn_float16) == 0);
+  assert(input != NULL);
+  assert(output != NULL);
+
+  batch >>= XNN_LOG2_SIZEOF_FLOAT16;
+  do {
+    const size_t vl = __riscv_vsetvl_e16m4(batch);
+    vfloat16m4_t vx = __riscv_vle16_v_f16m4(input, vl);
+    input += vl;
+    vfloat16m4_t vacc = __riscv_vfsqrt(vx, vl);
+    __riscv_vse16(output, vacc, vl);
+    output += vl;
+
+    batch -= vl;
+  } while (batch != 0);
+}
diff --git a/src/f16-vsqrt/gen/f16-vsqrt-rvvfp16arith-sqrt-u8v.c b/src/f16-vsqrt/gen/f16-vsqrt-rvvfp16arith-sqrt-u8v.c
new file mode 100644
index 00000000000..801dabe212c
--- /dev/null
+++ b/src/f16-vsqrt/gen/f16-vsqrt-rvvfp16arith-sqrt-u8v.c
@@ -0,0 +1,42 @@
+// clang-format off
+// Auto-generated file. Do not edit!
+//   Template: src/f16-vsqrt/rvv.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2026 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stddef.h>
+
+#include <riscv_vector.h>
+
+#include "src/xnnpack/common.h"
+#include "src/xnnpack/vunary.h"
+
+
+void xnn_f16_vsqrt_ukernel__rvvfp16arith_sqrt_u8v(
+    size_t batch,
+    const xnn_float16* input,
+    xnn_float16* output,
+    const struct xnn_f16_default_params* restrict params)
+{
+  assert(batch != 0);
+  assert(batch % sizeof(xnn_float16) == 0);
+  assert(input != NULL);
+  assert(output != NULL);
+
+  batch >>= XNN_LOG2_SIZEOF_FLOAT16;
+  do {
+    const size_t vl = __riscv_vsetvl_e16m8(batch);
+    vfloat16m8_t vx = __riscv_vle16_v_f16m8(input, vl);
+    input += vl;
+    vfloat16m8_t vacc = __riscv_vfsqrt(vx, vl);
+    __riscv_vse16(output, vacc, vl);
+    output += vl;
+
+    batch -= vl;
+  } while (batch != 0);
+}
diff --git a/src/f16-vsqrt/rvv.c.in b/src/f16-vsqrt/rvv.c.in
new file mode 100644
index 00000000000..894a079829d
--- /dev/null
+++ b/src/f16-vsqrt/rvv.c.in
@@ -0,0 +1,38 @@
+// Copyright 2026 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert LMUL in [1, 2, 4, 8]
+#include <assert.h>
+#include <stddef.h>
+
+#include <riscv_vector.h>
+
+#include "src/xnnpack/common.h"
+#include "src/xnnpack/vunary.h"
+
+
+void xnn_f16_vsqrt_ukernel__rvvfp16arith_sqrt_u${LMUL}v(
+    size_t batch,
+    const xnn_float16* input,
+    xnn_float16* output,
+    const struct xnn_f16_default_params* restrict params)
+{
+  assert(batch != 0);
+  assert(batch % sizeof(xnn_float16) == 0);
+  assert(input != NULL);
+  assert(output != NULL);
+
+  batch >>= XNN_LOG2_SIZEOF_FLOAT16;
+  do {
+    const size_t vl = __riscv_vsetvl_e16m${LMUL}(batch);
+    vfloat16m${LMUL}_t vx = __riscv_vle16_v_f16m${LMUL}(input, vl);
+    input += vl;
+    vfloat16m${LMUL}_t vacc = __riscv_vfsqrt(vx, vl);
+    __riscv_vse16(output, vacc, vl);
+    output += vl;
+
+    batch -= vl;
+  } while (batch != 0);
+}
diff --git a/src/f32-vbinary/f32-vcopysign.inc b/src/f32-vbinary/f32-vcopysign.inc
index e747a9e838c..49a1e66263c 100644
--- a/src/f32-vbinary/f32-vcopysign.inc
+++ b/src/f32-vbinary/f32-vcopysign.inc
@@ -30,6 +30,7 @@ XNN_UKERNEL(xnn_arch_x86_avx512f, xnn_f32_vcopysign_ukernel__avx512f_u32, 32, fa
 XNN_UKERNEL(xnn_arch_x86_avx512f, xnn_f32_vcopysign_ukernel__avx512f_u48, 48, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
 XNN_UKERNEL(xnn_arch_x86_avx512f, xnn_f32_vcopysign_ukernel__avx512f_u64, 64, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
 #endif // XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64)
+
 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
 XNN_UKERNEL(xnn_arch_none, xnn_f32_vcopysign_ukernel__wasmsimd_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
 XNN_UKERNEL(xnn_arch_none, xnn_f32_vcopysign_ukernel__wasmsimd_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
@@ -51,3 +52,7 @@ XNN_UKERNEL(xnn_arch_arm_neon, xnn_f32_vcopysign_ukernel__neon_u12, 12, false, f
 XNN_UKERNEL(xnn_arch_arm_neon, xnn_f32_vcopysign_ukernel__neon_u16, 16,
false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 +#if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR +XNN_UKERNEL(xnn_arch_riscv_vector, xnn_f32_vcopysign_ukernel__rvv_u4v, 4, true, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL(xnn_arch_riscv_vector, xnn_f32_vcopysign_ukernel__rvv_u8v, 8, true, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR diff --git a/src/f32-vbinary/f32-vcopysignc.inc b/src/f32-vbinary/f32-vcopysignc.inc index 10c0963b5df..99788d0b2b3 100644 --- a/src/f32-vbinary/f32-vcopysignc.inc +++ b/src/f32-vbinary/f32-vcopysignc.inc @@ -30,6 +30,7 @@ XNN_UKERNEL(xnn_arch_x86_avx512f, xnn_f32_vcopysignc_ukernel__avx512f_u32, 32, f XNN_UKERNEL(xnn_arch_x86_avx512f, xnn_f32_vcopysignc_ukernel__avx512f_u48, 48, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL(xnn_arch_x86_avx512f, xnn_f32_vcopysignc_ukernel__avx512f_u64, 64, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_UKERNEL(xnn_arch_none, xnn_f32_vcopysignc_ukernel__wasmsimd_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL(xnn_arch_none, xnn_f32_vcopysignc_ukernel__wasmsimd_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) @@ -51,3 +52,7 @@ XNN_UKERNEL(xnn_arch_arm_neon, xnn_f32_vcopysignc_ukernel__neon_u12, 12, false, XNN_UKERNEL(xnn_arch_arm_neon, xnn_f32_vcopysignc_ukernel__neon_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 +#if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR +XNN_UKERNEL(xnn_arch_riscv_vector, 
xnn_f32_vcopysignc_ukernel__rvv_u4v, 4, true, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL(xnn_arch_riscv_vector, xnn_f32_vcopysignc_ukernel__rvv_u8v, 8, true, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR diff --git a/src/f32-vbinary/f32-vrcopysignc.inc b/src/f32-vbinary/f32-vrcopysignc.inc index a584f64d201..2020ae51d71 100644 --- a/src/f32-vbinary/f32-vrcopysignc.inc +++ b/src/f32-vbinary/f32-vrcopysignc.inc @@ -30,6 +30,7 @@ XNN_UKERNEL(xnn_arch_x86_avx512f, xnn_f32_vrcopysignc_ukernel__avx512f_u32, 32, XNN_UKERNEL(xnn_arch_x86_avx512f, xnn_f32_vrcopysignc_ukernel__avx512f_u48, 48, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL(xnn_arch_x86_avx512f, xnn_f32_vrcopysignc_ukernel__avx512f_u64, 64, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_UKERNEL(xnn_arch_none, xnn_f32_vrcopysignc_ukernel__wasmsimd_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL(xnn_arch_none, xnn_f32_vrcopysignc_ukernel__wasmsimd_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) @@ -51,3 +52,7 @@ XNN_UKERNEL(xnn_arch_arm_neon, xnn_f32_vrcopysignc_ukernel__neon_u12, 12, false, XNN_UKERNEL(xnn_arch_arm_neon, xnn_f32_vrcopysignc_ukernel__neon_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 +#if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR +XNN_UKERNEL(xnn_arch_riscv_vector, xnn_f32_vrcopysignc_ukernel__rvv_u4v, 4, true, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL(xnn_arch_riscv_vector, xnn_f32_vrcopysignc_ukernel__rvv_u8v, 8, 
true, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
+#endif // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR
diff --git a/src/f32-vcopysign/gen/f32-vcopysign-rvv-u4v.c b/src/f32-vcopysign/gen/f32-vcopysign-rvv-u4v.c
new file mode 100644
index 00000000000..1fbc405eb22
--- /dev/null
+++ b/src/f32-vcopysign/gen/f32-vcopysign-rvv-u4v.c
@@ -0,0 +1,45 @@
+// clang-format off
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vcopysign/rvv.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2026 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stddef.h>
+
+#include <riscv_vector.h>
+
+#include "src/xnnpack/common.h"
+#include "src/xnnpack/math.h"
+#include "src/xnnpack/vunary.h"
+
+void xnn_f32_vcopysign_ukernel__rvv_u4v(
+    size_t batch,
+    const float* mag,
+    const float* sign,
+    float* output,
+    const struct xnn_f32_default_params* unused_params)
+{
+  assert(batch != 0);
+  assert(batch % sizeof(float) == 0);
+  assert(mag != NULL);
+  assert(sign != NULL);
+  assert(output != NULL);
+
+
+  batch >>= XNN_LOG2_SIZEOF_FLOAT;
+  do {
+    const size_t vl = __riscv_vsetvl_e32m4(batch); batch -= vl;
+
+    vfloat32m4_t vmag = __riscv_vle32_v_f32m4(mag, vl); mag += vl;
+    vfloat32m4_t vsign = __riscv_vle32_v_f32m4(sign, vl); sign += vl;
+    vfloat32m4_t vy = __riscv_vfsgnj(vmag, vsign, vl);
+
+    __riscv_vse32(output, vy, vl); output += vl;
+
+  } while (batch != 0);
+}
diff --git a/src/f32-vcopysign/gen/f32-vcopysign-rvv-u8v.c b/src/f32-vcopysign/gen/f32-vcopysign-rvv-u8v.c
new file mode 100644
index 00000000000..d4533fda776
--- /dev/null
+++ b/src/f32-vcopysign/gen/f32-vcopysign-rvv-u8v.c
@@ -0,0 +1,45 @@
+// clang-format off
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vcopysign/rvv.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2026 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stddef.h>
+
+#include <riscv_vector.h>
+
+#include "src/xnnpack/common.h"
+#include "src/xnnpack/math.h"
+#include "src/xnnpack/vunary.h"
+
+void xnn_f32_vcopysign_ukernel__rvv_u8v(
+    size_t batch,
+    const float* mag,
+    const float* sign,
+    float* output,
+    const struct xnn_f32_default_params* unused_params)
+{
+  assert(batch != 0);
+  assert(batch % sizeof(float) == 0);
+  assert(mag != NULL);
+  assert(sign != NULL);
+  assert(output != NULL);
+
+
+  batch >>= XNN_LOG2_SIZEOF_FLOAT;
+  do {
+    const size_t vl = __riscv_vsetvl_e32m8(batch); batch -= vl;
+
+    vfloat32m8_t vmag = __riscv_vle32_v_f32m8(mag, vl); mag += vl;
+    vfloat32m8_t vsign = __riscv_vle32_v_f32m8(sign, vl); sign += vl;
+    vfloat32m8_t vy = __riscv_vfsgnj(vmag, vsign, vl);
+
+    __riscv_vse32(output, vy, vl); output += vl;
+
+  } while (batch != 0);
+}
diff --git a/src/f32-vcopysign/gen/f32-vcopysignc-rvv-u4v.c b/src/f32-vcopysign/gen/f32-vcopysignc-rvv-u4v.c
new file mode 100644
index 00000000000..4336d2e7497
--- /dev/null
+++ b/src/f32-vcopysign/gen/f32-vcopysignc-rvv-u4v.c
@@ -0,0 +1,45 @@
+// clang-format off
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vcopysign/rvv.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2026 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stddef.h>
+
+#include <riscv_vector.h>
+
+#include "src/xnnpack/common.h"
+#include "src/xnnpack/math.h"
+#include "src/xnnpack/vunary.h"
+
+void xnn_f32_vcopysignc_ukernel__rvv_u4v(
+    size_t batch,
+    const float* mag,
+    const float* sign,
+    float* output,
+    const struct xnn_f32_default_params* unused_params)
+{
+  assert(batch != 0);
+  assert(batch % sizeof(float) == 0);
+  assert(mag != NULL);
+  assert(sign != NULL);
+  assert(output != NULL);
+
+  const float signc = *sign;
+
+  batch >>= XNN_LOG2_SIZEOF_FLOAT;
+  do {
+    const size_t vl = __riscv_vsetvl_e32m4(batch); batch -= vl;
+
+    vfloat32m4_t vmag = __riscv_vle32_v_f32m4(mag, vl); mag += vl;
+    vfloat32m4_t vy = __riscv_vfsgnj(vmag, signc, vl);
+
+    __riscv_vse32(output, vy, vl); output += vl;
+
+  } while (batch != 0);
+}
diff --git a/src/f32-vcopysign/gen/f32-vcopysignc-rvv-u8v.c b/src/f32-vcopysign/gen/f32-vcopysignc-rvv-u8v.c
new file mode 100644
index 00000000000..f3f4e254e12
--- /dev/null
+++ b/src/f32-vcopysign/gen/f32-vcopysignc-rvv-u8v.c
@@ -0,0 +1,45 @@
+// clang-format off
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vcopysign/rvv.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2026 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stddef.h>
+
+#include <riscv_vector.h>
+
+#include "src/xnnpack/common.h"
+#include "src/xnnpack/math.h"
+#include "src/xnnpack/vunary.h"
+
+void xnn_f32_vcopysignc_ukernel__rvv_u8v(
+    size_t batch,
+    const float* mag,
+    const float* sign,
+    float* output,
+    const struct xnn_f32_default_params* unused_params)
+{
+  assert(batch != 0);
+  assert(batch % sizeof(float) == 0);
+  assert(mag != NULL);
+  assert(sign != NULL);
+  assert(output != NULL);
+
+  const float signc = *sign;
+
+  batch >>= XNN_LOG2_SIZEOF_FLOAT;
+  do {
+    const size_t vl = __riscv_vsetvl_e32m8(batch); batch -= vl;
+
+    vfloat32m8_t vmag = __riscv_vle32_v_f32m8(mag, vl); mag += vl;
+    vfloat32m8_t vy = __riscv_vfsgnj(vmag, signc, vl);
+
+    __riscv_vse32(output, vy, vl); output += vl;
+
+  } while (batch != 0);
+}
diff --git a/src/f32-vcopysign/gen/f32-vrcopysignc-rvv-u4v.c b/src/f32-vcopysign/gen/f32-vrcopysignc-rvv-u4v.c
new file mode 100644
index 00000000000..9d7b0f71572
--- /dev/null
+++ b/src/f32-vcopysign/gen/f32-vrcopysignc-rvv-u4v.c
@@ -0,0 +1,46 @@
+// clang-format off
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vcopysign/rvv.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2026 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stddef.h>
+
+#include <riscv_vector.h>
+
+#include "src/xnnpack/common.h"
+#include "src/xnnpack/math.h"
+#include "src/xnnpack/vunary.h"
+
+void xnn_f32_vrcopysignc_ukernel__rvv_u4v(
+    size_t batch,
+    const float* sign,
+    const float* mag,
+    float* output,
+    const struct xnn_f32_default_params* unused_params)
+{
+  assert(batch != 0);
+  assert(batch % sizeof(float) == 0);
+  assert(mag != NULL);
+  assert(sign != NULL);
+  assert(output != NULL);
+
+  const float magc = *mag;
+
+  batch >>= XNN_LOG2_SIZEOF_FLOAT;
+  do {
+    const size_t vl = __riscv_vsetvl_e32m4(batch); batch -= vl;
+
+    vfloat32m4_t vmag = __riscv_vfmv_v_f_f32m4(magc, vl);
+    vfloat32m4_t vsign = __riscv_vle32_v_f32m4(sign, vl); sign += vl;
+    vfloat32m4_t vy = __riscv_vfsgnj(vmag, vsign, vl);
+
+    __riscv_vse32(output, vy, vl); output += vl;
+
+  } while (batch != 0);
+}
diff --git a/src/f32-vcopysign/gen/f32-vrcopysignc-rvv-u8v.c b/src/f32-vcopysign/gen/f32-vrcopysignc-rvv-u8v.c
new file mode 100644
index 00000000000..1d6aa7a5946
--- /dev/null
+++ b/src/f32-vcopysign/gen/f32-vrcopysignc-rvv-u8v.c
@@ -0,0 +1,46 @@
+// clang-format off
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vcopysign/rvv.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2026 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stddef.h>
+
+#include <riscv_vector.h>
+
+#include "src/xnnpack/common.h"
+#include "src/xnnpack/math.h"
+#include "src/xnnpack/vunary.h"
+
+void xnn_f32_vrcopysignc_ukernel__rvv_u8v(
+    size_t batch,
+    const float* sign,
+    const float* mag,
+    float* output,
+    const struct xnn_f32_default_params* unused_params)
+{
+  assert(batch != 0);
+  assert(batch % sizeof(float) == 0);
+  assert(mag != NULL);
+  assert(sign != NULL);
+  assert(output != NULL);
+
+  const float magc = *mag;
+
+  batch >>= XNN_LOG2_SIZEOF_FLOAT;
+  do {
+    const size_t vl = __riscv_vsetvl_e32m8(batch); batch -= vl;
+
+    vfloat32m8_t vmag = __riscv_vfmv_v_f_f32m8(magc, vl);
+    vfloat32m8_t vsign = __riscv_vle32_v_f32m8(sign, vl); sign += vl;
+    vfloat32m8_t vy = __riscv_vfsgnj(vmag, vsign, vl);
+
+    __riscv_vse32(output, vy, vl); output += vl;
+
+  } while (batch != 0);
+}
diff --git a/src/f32-vcopysign/rvv.c.in b/src/f32-vcopysign/rvv.c.in
new file mode 100644
index 00000000000..cad4f003f49
--- /dev/null
+++ b/src/f32-vcopysign/rvv.c.in
@@ -0,0 +1,58 @@
+// Copyright 2026 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert LMUL in [1, 2, 4, 8]
+$assert OP in ["COPYSIGN", "COPYSIGNC", "RCOPYSIGNC"]
+#include <assert.h>
+#include <stddef.h>
+
+#include <riscv_vector.h>
+
+#include "src/xnnpack/common.h"
+#include "src/xnnpack/math.h"
+#include "src/xnnpack/vunary.h"
+
+void xnn_f32_v${OP.lower()}_ukernel__rvv_u${LMUL}v(
+    size_t batch,
+    $if OP == "RCOPYSIGNC":
+      const float* sign,
+      const float* mag,
+    $else:
+      const float* mag,
+      const float* sign,
+    float* output,
+    const struct xnn_f32_default_params* unused_params)
+{
+  assert(batch != 0);
+  assert(batch % sizeof(float) == 0);
+  assert(mag != NULL);
+  assert(sign != NULL);
+  assert(output != NULL);
+
+  $if OP == "COPYSIGNC":
+    const float signc = *sign;
+  $elif OP == "RCOPYSIGNC":
+    const float magc = *mag;
+
+  batch >>= XNN_LOG2_SIZEOF_FLOAT;
+  do {
+    const size_t vl = __riscv_vsetvl_e32m${LMUL}(batch); batch -= vl;
+
+    $if OP == "COPYSIGN":
+      vfloat32m${LMUL}_t vmag = __riscv_vle32_v_f32m${LMUL}(mag, vl); mag += vl;
+      vfloat32m${LMUL}_t vsign = __riscv_vle32_v_f32m${LMUL}(sign, vl); sign += vl;
+      vfloat32m${LMUL}_t vy = __riscv_vfsgnj(vmag, vsign, vl);
+    $elif OP == "COPYSIGNC":
+      vfloat32m${LMUL}_t vmag = __riscv_vle32_v_f32m${LMUL}(mag, vl); mag += vl;
+      vfloat32m${LMUL}_t vy = __riscv_vfsgnj(vmag, signc, vl);
+    $elif OP == "RCOPYSIGNC":
+      vfloat32m${LMUL}_t vmag = __riscv_vfmv_v_f_f32m${LMUL}(magc, vl);
+      vfloat32m${LMUL}_t vsign = __riscv_vle32_v_f32m${LMUL}(sign, vl); sign += vl;
+      vfloat32m${LMUL}_t vy = __riscv_vfsgnj(vmag, vsign, vl);
+
+    __riscv_vse32(output, vy, vl); output += vl;
+
+  } while (batch != 0);
+}