From 7596bbce20b1c1b39e57d83ac022324e8edcaa4b Mon Sep 17 00:00:00 2001
From: Ken Unger <ken.j.unger@gmail.com>
Date: Mon, 2 Mar 2026 11:11:08 -0800
Subject: [PATCH 1/2] rvv maxpool for f32, f16, s8, u8

---
 cmake/gen/rvv_microkernels.cmake              |   4 +
 cmake/gen/rvvfp16arith_microkernels.cmake     |   2 +
 gen/rvv_microkernels.bzl                      |   4 +
 gen/rvvfp16arith_microkernels.bzl             |   2 +
 scripts/generate-f16-maxpool.sh               |   4 +
 scripts/generate-f32-maxpool.sh               |   4 +-
 scripts/generate-s8-maxpool.sh                |   4 +
 scripts/generate-u8-maxpool.sh                |   4 +
 src/configs/maxpool-config.c                  |  14 ++
 src/f16-maxpool/f16-maxpool-minmax.inc        |   4 +
 .../f16-maxpool-9p-minmax-rvvfp16arith-u1v.c  | 147 ++++++++++++++
 .../f16-maxpool-9p-minmax-rvvfp16arith-u2v.c  | 147 ++++++++++++++
 .../gen/f32-maxpool-9p-minmax-rvv-u1v.c       | 159 ++++++++--------
 .../gen/f32-maxpool-9p-minmax-rvv-u2v.c       | 159 ++++++++--------
 src/f32-maxpool/rvv.c.in                      | 180 ++++++++----------
 .../gen/s8-maxpool-9p-minmax-rvv-u1v.c        | 147 ++++++++++++++
 .../gen/s8-maxpool-9p-minmax-rvv-u2v.c        | 147 ++++++++++++++
 src/s8-maxpool/s8-maxpool-minmax.inc          |   7 +-
 .../gen/u8-maxpool-9p-minmax-rvv-u1v.c        | 147 ++++++++++++++
 .../gen/u8-maxpool-9p-minmax-rvv-u2v.c        | 147 ++++++++++++++
 src/u8-maxpool/u8-maxpool-minmax.inc          |   7 +-
 21 files changed, 1183 insertions(+), 257 deletions(-)
 create mode 100644 src/f16-maxpool/gen/f16-maxpool-9p-minmax-rvvfp16arith-u1v.c
 create mode 100644 src/f16-maxpool/gen/f16-maxpool-9p-minmax-rvvfp16arith-u2v.c
 create mode 100644 src/s8-maxpool/gen/s8-maxpool-9p-minmax-rvv-u1v.c
 create mode 100644 src/s8-maxpool/gen/s8-maxpool-9p-minmax-rvv-u2v.c
 create mode 100644 src/u8-maxpool/gen/u8-maxpool-9p-minmax-rvv-u1v.c
 create mode 100644 src/u8-maxpool/gen/u8-maxpool-9p-minmax-rvv-u2v.c

diff --git a/cmake/gen/rvv_microkernels.cmake b/cmake/gen/rvv_microkernels.cmake
index 38117bc1754..04d5e143861 100644
--- a/cmake/gen/rvv_microkernels.cmake
+++ b/cmake/gen/rvv_microkernels.cmake
@@ -103,7 +103,9 @@ SET(PROD_RVV_MICROKERNEL_SRCS
   src/qu8-vlrelu/gen/qu8-vlrelu-rvv-u2v.c
   src/qu8-vmul/gen/qu8-vmul-minmax-f32-rvv-u2v.c
   src/qu8-vmulc/gen/qu8-vmulc-minmax-f32-rvv-u2v.c
+  src/s8-maxpool/gen/s8-maxpool-9p-minmax-rvv-u2v.c
   src/s8-vclamp/gen/s8-vclamp-rvv-u4v.c
+  src/u8-maxpool/gen/u8-maxpool-9p-minmax-rvv-u2v.c
   src/u8-vclamp/gen/u8-vclamp-rvv-u4v.c
   src/x32-packw/gen/x32-packw-x4v-gemm-goi-rvv-u8.c
   src/x32-transposec/gen/x32-transposec-4x4-rvv.c
@@ -258,9 +260,11 @@ SET(NON_PROD_RVV_MICROKERNEL_SRCS
   src/qu8-vlrelu/gen/qu8-vlrelu-rvv-u1v.c
   src/qu8-vmul/gen/qu8-vmul-minmax-f32-rvv-u1v.c
   src/qu8-vmulc/gen/qu8-vmulc-minmax-f32-rvv-u1v.c
+  src/s8-maxpool/gen/s8-maxpool-9p-minmax-rvv-u1v.c
   src/s8-vclamp/gen/s8-vclamp-rvv-u1v.c
   src/s8-vclamp/gen/s8-vclamp-rvv-u2v.c
   src/s8-vclamp/gen/s8-vclamp-rvv-u8v.c
+  src/u8-maxpool/gen/u8-maxpool-9p-minmax-rvv-u1v.c
   src/u8-vclamp/gen/u8-vclamp-rvv-u1v.c
   src/u8-vclamp/gen/u8-vclamp-rvv-u2v.c
   src/u8-vclamp/gen/u8-vclamp-rvv-u8v.c
diff --git a/cmake/gen/rvvfp16arith_microkernels.cmake b/cmake/gen/rvvfp16arith_microkernels.cmake
index 543a311c1cc..ac5f45c7724 100644
--- a/cmake/gen/rvvfp16arith_microkernels.cmake
+++ b/cmake/gen/rvvfp16arith_microkernels.cmake
@@ -19,6 +19,7 @@ SET(PROD_RVVFP16ARITH_MICROKERNEL_SRCS
   src/f16-gemm/gen/f16-gemm-7x4v-minmax-rvvfp16arith.c
   src/f16-igemm/gen/f16-igemm-1x4v-minmax-rvvfp16arith.c
   src/f16-igemm/gen/f16-igemm-7x4v-minmax-rvvfp16arith.c
+  src/f16-maxpool/gen/f16-maxpool-9p-minmax-rvvfp16arith-u2v.c
   src/f16-spmm/gen/f16-spmm-8vx1-minmax-rvvfp16arith.c
   src/f16-vbinary/gen/f16-vadd-rvvfp16arith-u8v.c
   src/f16-vbinary/gen/f16-vaddc-rvvfp16arith-u8v.c
@@ -54,6 +55,7 @@ SET(NON_PROD_RVVFP16ARITH_MICROKERNEL_SRCS
   src/f16-f32-vcvt/gen/f16-f32-vcvt-rvvfp16arith-u2v.c
   src/f16-gemm/gen/f16-gemm-4x4v-minmax-rvvfp16arith.c
   src/f16-igemm/gen/f16-igemm-4x4v-minmax-rvvfp16arith.c
+  src/f16-maxpool/gen/f16-maxpool-9p-minmax-rvvfp16arith-u1v.c
   src/f16-spmm/gen/f16-spmm-1vx1-minmax-rvvfp16arith.c
   src/f16-spmm/gen/f16-spmm-2vx1-minmax-rvvfp16arith.c
   src/f16-spmm/gen/f16-spmm-4vx1-minmax-rvvfp16arith.c
diff --git a/gen/rvv_microkernels.bzl b/gen/rvv_microkernels.bzl
index d864e7d10be..fb7468b48b3 100644
--- a/gen/rvv_microkernels.bzl
+++ b/gen/rvv_microkernels.bzl
@@ -99,7 +99,9 @@ PROD_RVV_MICROKERNEL_SRCS = [
     "src/qu8-vlrelu/gen/qu8-vlrelu-rvv-u2v.c",
     "src/qu8-vmul/gen/qu8-vmul-minmax-f32-rvv-u2v.c",
     "src/qu8-vmulc/gen/qu8-vmulc-minmax-f32-rvv-u2v.c",
+    "src/s8-maxpool/gen/s8-maxpool-9p-minmax-rvv-u2v.c",
     "src/s8-vclamp/gen/s8-vclamp-rvv-u4v.c",
+    "src/u8-maxpool/gen/u8-maxpool-9p-minmax-rvv-u2v.c",
     "src/u8-vclamp/gen/u8-vclamp-rvv-u4v.c",
     "src/x32-packw/gen/x32-packw-x4v-gemm-goi-rvv-u8.c",
     "src/x32-transposec/gen/x32-transposec-4x4-rvv.c",
@@ -255,9 +257,11 @@ NON_PROD_RVV_MICROKERNEL_SRCS = [
     "src/qu8-vlrelu/gen/qu8-vlrelu-rvv-u1v.c",
     "src/qu8-vmul/gen/qu8-vmul-minmax-f32-rvv-u1v.c",
     "src/qu8-vmulc/gen/qu8-vmulc-minmax-f32-rvv-u1v.c",
+    "src/s8-maxpool/gen/s8-maxpool-9p-minmax-rvv-u1v.c",
     "src/s8-vclamp/gen/s8-vclamp-rvv-u1v.c",
     "src/s8-vclamp/gen/s8-vclamp-rvv-u2v.c",
     "src/s8-vclamp/gen/s8-vclamp-rvv-u8v.c",
+    "src/u8-maxpool/gen/u8-maxpool-9p-minmax-rvv-u1v.c",
     "src/u8-vclamp/gen/u8-vclamp-rvv-u1v.c",
     "src/u8-vclamp/gen/u8-vclamp-rvv-u2v.c",
     "src/u8-vclamp/gen/u8-vclamp-rvv-u8v.c",
diff --git a/gen/rvvfp16arith_microkernels.bzl b/gen/rvvfp16arith_microkernels.bzl
index 2c098847bf7..acbae4d7c04 100644
--- a/gen/rvvfp16arith_microkernels.bzl
+++ b/gen/rvvfp16arith_microkernels.bzl
@@ -15,6 +15,7 @@ PROD_RVVFP16ARITH_MICROKERNEL_SRCS = [
     "src/f16-gemm/gen/f16-gemm-7x4v-minmax-rvvfp16arith.c",
     "src/f16-igemm/gen/f16-igemm-1x4v-minmax-rvvfp16arith.c",
     "src/f16-igemm/gen/f16-igemm-7x4v-minmax-rvvfp16arith.c",
+    "src/f16-maxpool/gen/f16-maxpool-9p-minmax-rvvfp16arith-u2v.c",
     "src/f16-spmm/gen/f16-spmm-8vx1-minmax-rvvfp16arith.c",
     "src/f16-vbinary/gen/f16-vadd-rvvfp16arith-u8v.c",
     "src/f16-vbinary/gen/f16-vaddc-rvvfp16arith-u8v.c",
@@ -51,6 +52,7 @@ NON_PROD_RVVFP16ARITH_MICROKERNEL_SRCS = [
     "src/f16-f32-vcvt/gen/f16-f32-vcvt-rvvfp16arith-u2v.c",
     "src/f16-gemm/gen/f16-gemm-4x4v-minmax-rvvfp16arith.c",
     "src/f16-igemm/gen/f16-igemm-4x4v-minmax-rvvfp16arith.c",
+    "src/f16-maxpool/gen/f16-maxpool-9p-minmax-rvvfp16arith-u1v.c",
     "src/f16-spmm/gen/f16-spmm-1vx1-minmax-rvvfp16arith.c",
     "src/f16-spmm/gen/f16-spmm-2vx1-minmax-rvvfp16arith.c",
     "src/f16-spmm/gen/f16-spmm-4vx1-minmax-rvvfp16arith.c",
diff --git a/scripts/generate-f16-maxpool.sh b/scripts/generate-f16-maxpool.sh
index 73fff4d8256..9bfb490c2cc 100755
--- a/scripts/generate-f16-maxpool.sh
+++ b/scripts/generate-f16-maxpool.sh
@@ -9,4 +9,8 @@ tools/xngen src/f32-maxpool/maxpool.c.in -D DATATYPE=f16 -D ARCH=neonfp16arith -
 tools/xngen src/f32-maxpool/maxpool.c.in -D DATATYPE=f16 -D ARCH=avx2 -D SIMD_SIZE=16 -o src/f16-maxpool/gen/f16-maxpool-9p-minmax-avx2-u16.c &
 tools/xngen src/f32-maxpool/maxpool.c.in -D DATATYPE=f16 -D ARCH=sse41 -D SIMD_SIZE=8 -o src/f16-maxpool/gen/f16-maxpool-9p-minmax-sse41-u8.c &
 
+################################ RISC-V Vector ################################
+tools/xngen src/f32-maxpool/rvv.c.in -D DATATYPE=f16 -D LMUL=1 -o src/f16-maxpool/gen/f16-maxpool-9p-minmax-rvvfp16arith-u1v.c &
+tools/xngen src/f32-maxpool/rvv.c.in -D DATATYPE=f16 -D LMUL=2 -o src/f16-maxpool/gen/f16-maxpool-9p-minmax-rvvfp16arith-u2v.c &
+
 wait
diff --git a/scripts/generate-f32-maxpool.sh b/scripts/generate-f32-maxpool.sh
index 2a142c77987..4c15f633750 100755
--- a/scripts/generate-f32-maxpool.sh
+++ b/scripts/generate-f32-maxpool.sh
@@ -12,7 +12,7 @@ tools/xngen src/f32-maxpool/maxpool.c.in -D DATATYPE=f32 -D ARCH=neon     -D SIM
 tools/xngen src/f32-maxpool/maxpool.c.in -D DATATYPE=f32 -D ARCH=hvx      -D SIMD_SIZE=32 -o src/f32-maxpool/gen/f32-maxpool-9p-minmax-hvx-u32.c &
 
 ################################ RISC-V Vector ################################
-tools/xngen src/f32-maxpool/rvv.c.in -D LMUL=1 -o src/f32-maxpool/gen/f32-maxpool-9p-minmax-rvv-u1v.c &
-tools/xngen src/f32-maxpool/rvv.c.in -D LMUL=2 -o src/f32-maxpool/gen/f32-maxpool-9p-minmax-rvv-u2v.c &
+tools/xngen src/f32-maxpool/rvv.c.in -D DATATYPE=f32 -D LMUL=1 -o src/f32-maxpool/gen/f32-maxpool-9p-minmax-rvv-u1v.c &
+tools/xngen src/f32-maxpool/rvv.c.in -D DATATYPE=f32 -D LMUL=2 -o src/f32-maxpool/gen/f32-maxpool-9p-minmax-rvv-u2v.c &
 
 wait
diff --git a/scripts/generate-s8-maxpool.sh b/scripts/generate-s8-maxpool.sh
index 0ae6063e03f..6b379899ffd 100755
--- a/scripts/generate-s8-maxpool.sh
+++ b/scripts/generate-s8-maxpool.sh
@@ -10,4 +10,8 @@ tools/xngen src/f32-maxpool/maxpool.c.in -D DATATYPE=s8 -D ARCH=sse41 -D SIMD_SI
 tools/xngen src/f32-maxpool/maxpool.c.in -D DATATYPE=s8 -D ARCH=wasmsimd -D SIMD_SIZE=16 -o src/s8-maxpool/gen/s8-maxpool-9p-minmax-wasmsimd-u16.c &
 tools/xngen src/f32-maxpool/maxpool.c.in -D DATATYPE=s8 -D ARCH=neon -D SIMD_SIZE=16 -o src/s8-maxpool/gen/s8-maxpool-9p-minmax-neon-u16.c &
 
+################################ RISC-V Vector #################################
+tools/xngen src/f32-maxpool/rvv.c.in -D DATATYPE=s8 -D LMUL=1 -o src/s8-maxpool/gen/s8-maxpool-9p-minmax-rvv-u1v.c &
+tools/xngen src/f32-maxpool/rvv.c.in -D DATATYPE=s8 -D LMUL=2 -o src/s8-maxpool/gen/s8-maxpool-9p-minmax-rvv-u2v.c &
+
 wait
diff --git a/scripts/generate-u8-maxpool.sh b/scripts/generate-u8-maxpool.sh
index d7df2b21f3c..aa1b1f66faf 100755
--- a/scripts/generate-u8-maxpool.sh
+++ b/scripts/generate-u8-maxpool.sh
@@ -10,4 +10,8 @@ tools/xngen src/f32-maxpool/maxpool.c.in -D DATATYPE=u8 -D KERNEL_TILE=9 -D ARCH
 tools/xngen src/f32-maxpool/maxpool.c.in -D DATATYPE=u8 -D KERNEL_TILE=9 -D ARCH=wasmsimd -D SIMD_SIZE=16 -o src/u8-maxpool/gen/u8-maxpool-9p-minmax-wasmsimd-u16.c &
 tools/xngen src/f32-maxpool/maxpool.c.in -D DATATYPE=u8 -D KERNEL_TILE=9 -D ARCH=neon -D SIMD_SIZE=16 -o src/u8-maxpool/gen/u8-maxpool-9p-minmax-neon-u16.c &
 
+################################ RISC-V Vector #################################
+tools/xngen src/f32-maxpool/rvv.c.in -D DATATYPE=u8 -D LMUL=1 -o src/u8-maxpool/gen/u8-maxpool-9p-minmax-rvv-u1v.c &
+tools/xngen src/f32-maxpool/rvv.c.in -D DATATYPE=u8 -D LMUL=2 -o src/u8-maxpool/gen/u8-maxpool-9p-minmax-rvv-u2v.c &
+
 wait
diff --git a/src/configs/maxpool-config.c b/src/configs/maxpool-config.c
index 4026b8d8e01..df48fc24774 100644
--- a/src/configs/maxpool-config.c
+++ b/src/configs/maxpool-config.c
@@ -65,6 +65,14 @@ static void init_f16_maxpool_config(void) {
       } else
     #endif
     ;  // no f16 support
+  #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR
+    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
+    assert(hardware_config != NULL);
+    (void) hardware_config;  // May be unused.
+    if (hardware_config->arch_flags & xnn_arch_riscv_vector_fp16_arith) {
+      f16_maxpool_config.ukernel = XNN_INIT_MAXPOOL_UKERNEL(xnn_f16_maxpool_minmax_ukernel_9p__rvvfp16arith_u2v);
+      f16_maxpool_config.init.f16 = xnn_init_f16_minmax_scalar_params;
+    }
   #endif
 }
 
@@ -144,6 +152,9 @@ static void init_s8_maxpool_config(void) {
   #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
     s8_maxpool_config.ukernel = XNN_INIT_MAXPOOL_UKERNEL(xnn_s8_maxpool_minmax_ukernel_9p__wasmsimd_u16);
     s8_maxpool_config.init.s8 = xnn_init_s8_minmax_scalar_params;
+  #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR
+    s8_maxpool_config.ukernel = XNN_INIT_MAXPOOL_UKERNEL(xnn_s8_maxpool_minmax_ukernel_9p__rvv_u2v);
+    s8_maxpool_config.init.s8 = xnn_init_s8_minmax_scalar_params;
   #else
     s8_maxpool_config.ukernel = XNN_INIT_MAXPOOL_UKERNEL(xnn_s8_maxpool_minmax_ukernel_9p__scalar_u1);
     s8_maxpool_config.init.s8 = xnn_init_s8_minmax_scalar_params;
@@ -178,6 +189,9 @@ static void init_u8_maxpool_config(void) {
   #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
     u8_maxpool_config.ukernel = XNN_INIT_MAXPOOL_UKERNEL(xnn_u8_maxpool_minmax_ukernel_9p__wasmsimd_u16);
     u8_maxpool_config.init.u8 = xnn_init_u8_minmax_scalar_params;
+  #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR
+    u8_maxpool_config.ukernel = XNN_INIT_MAXPOOL_UKERNEL(xnn_u8_maxpool_minmax_ukernel_9p__rvv_u2v);
+    u8_maxpool_config.init.u8 = xnn_init_u8_minmax_scalar_params;
   #else
     u8_maxpool_config.ukernel = XNN_INIT_MAXPOOL_UKERNEL(xnn_u8_maxpool_minmax_ukernel_9p__scalar_u1);
     u8_maxpool_config.init.u8 = xnn_init_u8_minmax_scalar_params;
diff --git a/src/f16-maxpool/f16-maxpool-minmax.inc b/src/f16-maxpool/f16-maxpool-minmax.inc
index 1bff1d6df51..ebc7a65487c 100644
--- a/src/f16-maxpool/f16-maxpool-minmax.inc
+++ b/src/f16-maxpool/f16-maxpool-minmax.inc
@@ -17,3 +17,7 @@ XNN_UKERNEL(xnn_arch_x86_sse4_1, xnn_f16_maxpool_minmax_ukernel_9p__sse41_u8, 8,
 XNN_UKERNEL(xnn_arch_x86_avx2, xnn_f16_maxpool_minmax_ukernel_9p__avx2_u16, 16, 9, xnn_float16, struct xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params)
 #endif  // XNN_ENABLE_AVX2 && (XNN_ARCH_X86 || XNN_ARCH_X86_64)
 
+#if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR
+XNN_UKERNEL(xnn_arch_riscv_vector_fp16_arith, xnn_f16_maxpool_minmax_ukernel_9p__rvvfp16arith_u1v, (1*xnn_init_hardware_config()->vlenb/sizeof(xnn_float16)), 9, xnn_float16, struct xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params)
+XNN_UKERNEL(xnn_arch_riscv_vector_fp16_arith, xnn_f16_maxpool_minmax_ukernel_9p__rvvfp16arith_u2v, (2*xnn_init_hardware_config()->vlenb/sizeof(xnn_float16)), 9, xnn_float16, struct xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params)
+#endif  // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR
diff --git a/src/f16-maxpool/gen/f16-maxpool-9p-minmax-rvvfp16arith-u1v.c b/src/f16-maxpool/gen/f16-maxpool-9p-minmax-rvvfp16arith-u1v.c
new file mode 100644
index 00000000000..266b3701f49
--- /dev/null
+++ b/src/f16-maxpool/gen/f16-maxpool-9p-minmax-rvvfp16arith-u1v.c
@@ -0,0 +1,147 @@
+// clang-format off
+// Auto-generated file. Do not edit!
+//   Template: src/f32-maxpool/rvv.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2026 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include "src/xnnpack/maxpool.h"
+#include <riscv_vector.h>
+
+
+void xnn_f16_maxpool_minmax_ukernel_9p__rvvfp16arith_u1v(
+    size_t output_pixels,
+    size_t kernel_elements,
+    size_t channels,
+    const xnn_float16** input,
+    size_t input_offset,
+    size_t input_pixel_stride,
+    xnn_float16* output,
+    size_t input_increment,
+    size_t output_increment,
+    const struct xnn_f16_minmax_params* restrict params)
+{
+  assert(output_pixels != 0);
+  assert(kernel_elements != 0);
+  assert(channels != 0);
+
+  const xnn_float16 output_min = params->scalar.min;
+  const xnn_float16 output_max = params->scalar.max;
+  do {
+    const xnn_float16** i = input;
+
+    // First pass: load the inputs, store the max pool in the output.
+    const xnn_float16* i0 = *i++;
+    const xnn_float16* i1 = 1 < kernel_elements ? *i++ : i0;
+    const xnn_float16* i2 = 2 < kernel_elements ? *i++ : i0;
+    const xnn_float16* i3 = 3 < kernel_elements ? *i++ : i0;
+    const xnn_float16* i4 = 4 < kernel_elements ? *i++ : i0;
+    const xnn_float16* i5 = 5 < kernel_elements ? *i++ : i0;
+    const xnn_float16* i6 = 6 < kernel_elements ? *i++ : i0;
+    const xnn_float16* i7 = 7 < kernel_elements ? *i++ : i0;
+    const xnn_float16* i8 = 8 < kernel_elements ? *i++ : i0;
+    i0 = (const xnn_float16*) ((uintptr_t) i0 + input_offset);
+    i1 = (const xnn_float16*) ((uintptr_t) i1 + input_offset);
+    i2 = (const xnn_float16*) ((uintptr_t) i2 + input_offset);
+    i3 = (const xnn_float16*) ((uintptr_t) i3 + input_offset);
+    i4 = (const xnn_float16*) ((uintptr_t) i4 + input_offset);
+    i5 = (const xnn_float16*) ((uintptr_t) i5 + input_offset);
+    i6 = (const xnn_float16*) ((uintptr_t) i6 + input_offset);
+    i7 = (const xnn_float16*) ((uintptr_t) i7 + input_offset);
+    i8 = (const xnn_float16*) ((uintptr_t) i8 + input_offset);
+
+    xnn_float16* o = output;
+    size_t c = channels;
+    do {
+      size_t vl = __riscv_vsetvl_e16m1(c);
+      vfloat16m1_t vi0 = __riscv_vle16_v_f16m1(i0, vl); i0 += vl;
+      vfloat16m1_t vi1 = __riscv_vle16_v_f16m1(i1, vl); i1 += vl;
+      vfloat16m1_t vi2 = __riscv_vle16_v_f16m1(i2, vl); i2 += vl;
+      vfloat16m1_t vi3 = __riscv_vle16_v_f16m1(i3, vl); i3 += vl;
+      vfloat16m1_t vi4 = __riscv_vle16_v_f16m1(i4, vl); i4 += vl;
+      vfloat16m1_t vi5 = __riscv_vle16_v_f16m1(i5, vl); i5 += vl;
+      vfloat16m1_t vi6 = __riscv_vle16_v_f16m1(i6, vl); i6 += vl;
+      vfloat16m1_t vi7 = __riscv_vle16_v_f16m1(i7, vl); i7 += vl;
+      vfloat16m1_t vi8 = __riscv_vle16_v_f16m1(i8, vl); i8 += vl;
+
+      vfloat16m1_t vmax01 = __riscv_vfmax(vi0, vi1, vl);
+      vfloat16m1_t vmax23 = __riscv_vfmax(vi2, vi3, vl);
+      vfloat16m1_t vmax45 = __riscv_vfmax(vi4, vi5, vl);
+      vfloat16m1_t vmax67 = __riscv_vfmax(vi6, vi7, vl);
+      vfloat16m1_t vmax018 = __riscv_vfmax(vmax01, vi8, vl);
+
+      vfloat16m1_t vmax2345 = __riscv_vfmax(vmax23, vmax45, vl);
+      vfloat16m1_t vmax01678 = __riscv_vfmax(vmax67, vmax018, vl);
+      vfloat16m1_t vacc = __riscv_vfmax(vmax2345, vmax01678, vl);
+
+      vacc = __riscv_vfmax(vacc, output_min, vl);
+      vacc = __riscv_vfmin(vacc, output_max, vl);
+      __riscv_vse16_v_f16m1(o, vacc, vl); o += vl;
+
+      c -= vl;
+    } while (c != 0);
+
+    // Passes 1 - n: Max more inputs to the output.
+    for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 9) {
+      const xnn_float16* i0 = *i++;
+      const xnn_float16* i1 = 1 < k ? *i++ : i0;
+      const xnn_float16* i2 = 2 < k ? *i++ : i0;
+      const xnn_float16* i3 = 3 < k ? *i++ : i0;
+      const xnn_float16* i4 = 4 < k ? *i++ : i0;
+      const xnn_float16* i5 = 5 < k ? *i++ : i0;
+      const xnn_float16* i6 = 6 < k ? *i++ : i0;
+      const xnn_float16* i7 = 7 < k ? *i++ : i0;
+      const xnn_float16* i8 = 8 < k ? *i++ : i0;
+      i0 = (const xnn_float16*) ((uintptr_t) i0 + input_offset);
+      i1 = (const xnn_float16*) ((uintptr_t) i1 + input_offset);
+      i2 = (const xnn_float16*) ((uintptr_t) i2 + input_offset);
+      i3 = (const xnn_float16*) ((uintptr_t) i3 + input_offset);
+      i4 = (const xnn_float16*) ((uintptr_t) i4 + input_offset);
+      i5 = (const xnn_float16*) ((uintptr_t) i5 + input_offset);
+      i6 = (const xnn_float16*) ((uintptr_t) i6 + input_offset);
+      i7 = (const xnn_float16*) ((uintptr_t) i7 + input_offset);
+      i8 = (const xnn_float16*) ((uintptr_t) i8 + input_offset);
+
+      o = output;
+      size_t c = channels;
+      do {
+        size_t vl = __riscv_vsetvl_e16m1(c);
+
+        vfloat16m1_t vi0 = __riscv_vle16_v_f16m1(i0, vl); i0 += vl;
+        vfloat16m1_t vi1 = __riscv_vle16_v_f16m1(i1, vl); i1 += vl;
+        vfloat16m1_t vi2 = __riscv_vle16_v_f16m1(i2, vl); i2 += vl;
+        vfloat16m1_t vi3 = __riscv_vle16_v_f16m1(i3, vl); i3 += vl;
+        vfloat16m1_t vi4 = __riscv_vle16_v_f16m1(i4, vl); i4 += vl;
+        vfloat16m1_t vi5 = __riscv_vle16_v_f16m1(i5, vl); i5 += vl;
+        vfloat16m1_t vi6 = __riscv_vle16_v_f16m1(i6, vl); i6 += vl;
+        vfloat16m1_t vi7 = __riscv_vle16_v_f16m1(i7, vl); i7 += vl;
+        vfloat16m1_t vi8 = __riscv_vle16_v_f16m1(i8, vl); i8 += vl;
+
+        vfloat16m1_t vprev = __riscv_vle16_v_f16m1(o, vl);
+
+        vfloat16m1_t vmax01 = __riscv_vfmax(vi0, vi1, vl);
+        vfloat16m1_t vmax23 = __riscv_vfmax(vi2, vi3, vl);
+        vfloat16m1_t vmax45 = __riscv_vfmax(vi4, vi5, vl);
+        vfloat16m1_t vmax67 = __riscv_vfmax(vi6, vi7, vl);
+        vfloat16m1_t vmax018 = __riscv_vfmax(vmax01, vi8, vl);
+
+        vfloat16m1_t vmax2345 = __riscv_vfmax(vmax23, vmax45, vl);
+        vfloat16m1_t vmax01678 = __riscv_vfmax(vmax67, vmax018, vl);
+        vfloat16m1_t vmax012345678 = __riscv_vfmax(vmax2345, vmax01678, vl);
+
+        vfloat16m1_t vacc = __riscv_vfmax(vprev, vmax012345678, vl);
+        vacc = __riscv_vfmin(vacc, output_max, vl);
+        __riscv_vse16_v_f16m1(o, vacc, vl); o += vl;
+
+        c -= vl;
+      } while (c != 0);
+    }
+    input = (const xnn_float16**) ((uintptr_t) input + input_increment);
+    input_offset += input_pixel_stride;
+    output = (xnn_float16*) ((uintptr_t) output + output_increment);
+  } while (--output_pixels != 0);
+}
diff --git a/src/f16-maxpool/gen/f16-maxpool-9p-minmax-rvvfp16arith-u2v.c b/src/f16-maxpool/gen/f16-maxpool-9p-minmax-rvvfp16arith-u2v.c
new file mode 100644
index 00000000000..6bb8937af18
--- /dev/null
+++ b/src/f16-maxpool/gen/f16-maxpool-9p-minmax-rvvfp16arith-u2v.c
@@ -0,0 +1,147 @@
+// clang-format off
+// Auto-generated file. Do not edit!
+//   Template: src/f32-maxpool/rvv.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2026 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include "src/xnnpack/maxpool.h"
+#include <riscv_vector.h>
+
+
+void xnn_f16_maxpool_minmax_ukernel_9p__rvvfp16arith_u2v(
+    size_t output_pixels,
+    size_t kernel_elements,
+    size_t channels,
+    const xnn_float16** input,
+    size_t input_offset,
+    size_t input_pixel_stride,
+    xnn_float16* output,
+    size_t input_increment,
+    size_t output_increment,
+    const struct xnn_f16_minmax_params* restrict params)
+{
+  assert(output_pixels != 0);
+  assert(kernel_elements != 0);
+  assert(channels != 0);
+
+  const xnn_float16 output_min = params->scalar.min;
+  const xnn_float16 output_max = params->scalar.max;
+  do {
+    const xnn_float16** i = input;
+
+    // First pass: load the inputs, store the max pool in the output.
+    const xnn_float16* i0 = *i++;
+    const xnn_float16* i1 = 1 < kernel_elements ? *i++ : i0;
+    const xnn_float16* i2 = 2 < kernel_elements ? *i++ : i0;
+    const xnn_float16* i3 = 3 < kernel_elements ? *i++ : i0;
+    const xnn_float16* i4 = 4 < kernel_elements ? *i++ : i0;
+    const xnn_float16* i5 = 5 < kernel_elements ? *i++ : i0;
+    const xnn_float16* i6 = 6 < kernel_elements ? *i++ : i0;
+    const xnn_float16* i7 = 7 < kernel_elements ? *i++ : i0;
+    const xnn_float16* i8 = 8 < kernel_elements ? *i++ : i0;
+    i0 = (const xnn_float16*) ((uintptr_t) i0 + input_offset);
+    i1 = (const xnn_float16*) ((uintptr_t) i1 + input_offset);
+    i2 = (const xnn_float16*) ((uintptr_t) i2 + input_offset);
+    i3 = (const xnn_float16*) ((uintptr_t) i3 + input_offset);
+    i4 = (const xnn_float16*) ((uintptr_t) i4 + input_offset);
+    i5 = (const xnn_float16*) ((uintptr_t) i5 + input_offset);
+    i6 = (const xnn_float16*) ((uintptr_t) i6 + input_offset);
+    i7 = (const xnn_float16*) ((uintptr_t) i7 + input_offset);
+    i8 = (const xnn_float16*) ((uintptr_t) i8 + input_offset);
+
+    xnn_float16* o = output;
+    size_t c = channels;
+    do {
+      size_t vl = __riscv_vsetvl_e16m2(c);
+      vfloat16m2_t vi0 = __riscv_vle16_v_f16m2(i0, vl); i0 += vl;
+      vfloat16m2_t vi1 = __riscv_vle16_v_f16m2(i1, vl); i1 += vl;
+      vfloat16m2_t vi2 = __riscv_vle16_v_f16m2(i2, vl); i2 += vl;
+      vfloat16m2_t vi3 = __riscv_vle16_v_f16m2(i3, vl); i3 += vl;
+      vfloat16m2_t vi4 = __riscv_vle16_v_f16m2(i4, vl); i4 += vl;
+      vfloat16m2_t vi5 = __riscv_vle16_v_f16m2(i5, vl); i5 += vl;
+      vfloat16m2_t vi6 = __riscv_vle16_v_f16m2(i6, vl); i6 += vl;
+      vfloat16m2_t vi7 = __riscv_vle16_v_f16m2(i7, vl); i7 += vl;
+      vfloat16m2_t vi8 = __riscv_vle16_v_f16m2(i8, vl); i8 += vl;
+
+      vfloat16m2_t vmax01 = __riscv_vfmax(vi0, vi1, vl);
+      vfloat16m2_t vmax23 = __riscv_vfmax(vi2, vi3, vl);
+      vfloat16m2_t vmax45 = __riscv_vfmax(vi4, vi5, vl);
+      vfloat16m2_t vmax67 = __riscv_vfmax(vi6, vi7, vl);
+      vfloat16m2_t vmax018 = __riscv_vfmax(vmax01, vi8, vl);
+
+      vfloat16m2_t vmax2345 = __riscv_vfmax(vmax23, vmax45, vl);
+      vfloat16m2_t vmax01678 = __riscv_vfmax(vmax67, vmax018, vl);
+      vfloat16m2_t vacc = __riscv_vfmax(vmax2345, vmax01678, vl);
+
+      vacc = __riscv_vfmax(vacc, output_min, vl);
+      vacc = __riscv_vfmin(vacc, output_max, vl);
+      __riscv_vse16_v_f16m2(o, vacc, vl); o += vl;
+
+      c -= vl;
+    } while (c != 0);
+
+    // Passes 1 - n: Max more inputs to the output.
+    for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 9) {
+      const xnn_float16* i0 = *i++;
+      const xnn_float16* i1 = 1 < k ? *i++ : i0;
+      const xnn_float16* i2 = 2 < k ? *i++ : i0;
+      const xnn_float16* i3 = 3 < k ? *i++ : i0;
+      const xnn_float16* i4 = 4 < k ? *i++ : i0;
+      const xnn_float16* i5 = 5 < k ? *i++ : i0;
+      const xnn_float16* i6 = 6 < k ? *i++ : i0;
+      const xnn_float16* i7 = 7 < k ? *i++ : i0;
+      const xnn_float16* i8 = 8 < k ? *i++ : i0;
+      i0 = (const xnn_float16*) ((uintptr_t) i0 + input_offset);
+      i1 = (const xnn_float16*) ((uintptr_t) i1 + input_offset);
+      i2 = (const xnn_float16*) ((uintptr_t) i2 + input_offset);
+      i3 = (const xnn_float16*) ((uintptr_t) i3 + input_offset);
+      i4 = (const xnn_float16*) ((uintptr_t) i4 + input_offset);
+      i5 = (const xnn_float16*) ((uintptr_t) i5 + input_offset);
+      i6 = (const xnn_float16*) ((uintptr_t) i6 + input_offset);
+      i7 = (const xnn_float16*) ((uintptr_t) i7 + input_offset);
+      i8 = (const xnn_float16*) ((uintptr_t) i8 + input_offset);
+
+      o = output;
+      size_t c = channels;
+      do {
+        size_t vl = __riscv_vsetvl_e16m2(c);
+
+        vfloat16m2_t vi0 = __riscv_vle16_v_f16m2(i0, vl); i0 += vl;
+        vfloat16m2_t vi1 = __riscv_vle16_v_f16m2(i1, vl); i1 += vl;
+        vfloat16m2_t vi2 = __riscv_vle16_v_f16m2(i2, vl); i2 += vl;
+        vfloat16m2_t vi3 = __riscv_vle16_v_f16m2(i3, vl); i3 += vl;
+        vfloat16m2_t vi4 = __riscv_vle16_v_f16m2(i4, vl); i4 += vl;
+        vfloat16m2_t vi5 = __riscv_vle16_v_f16m2(i5, vl); i5 += vl;
+        vfloat16m2_t vi6 = __riscv_vle16_v_f16m2(i6, vl); i6 += vl;
+        vfloat16m2_t vi7 = __riscv_vle16_v_f16m2(i7, vl); i7 += vl;
+        vfloat16m2_t vi8 = __riscv_vle16_v_f16m2(i8, vl); i8 += vl;
+
+        vfloat16m2_t vprev = __riscv_vle16_v_f16m2(o, vl);
+
+        vfloat16m2_t vmax01 = __riscv_vfmax(vi0, vi1, vl);
+        vfloat16m2_t vmax23 = __riscv_vfmax(vi2, vi3, vl);
+        vfloat16m2_t vmax45 = __riscv_vfmax(vi4, vi5, vl);
+        vfloat16m2_t vmax67 = __riscv_vfmax(vi6, vi7, vl);
+        vfloat16m2_t vmax018 = __riscv_vfmax(vmax01, vi8, vl);
+
+        vfloat16m2_t vmax2345 = __riscv_vfmax(vmax23, vmax45, vl);
+        vfloat16m2_t vmax01678 = __riscv_vfmax(vmax67, vmax018, vl);
+        vfloat16m2_t vmax012345678 = __riscv_vfmax(vmax2345, vmax01678, vl);
+
+        vfloat16m2_t vacc = __riscv_vfmax(vprev, vmax012345678, vl);
+        vacc = __riscv_vfmin(vacc, output_max, vl);
+        __riscv_vse16_v_f16m2(o, vacc, vl); o += vl;
+
+        c -= vl;
+      } while (c != 0);
+    }
+    input = (const xnn_float16**) ((uintptr_t) input + input_increment);
+    input_offset += input_pixel_stride;
+    output = (xnn_float16*) ((uintptr_t) output + output_increment);
+  } while (--output_pixels != 0);
+}
diff --git a/src/f32-maxpool/gen/f32-maxpool-9p-minmax-rvv-u1v.c b/src/f32-maxpool/gen/f32-maxpool-9p-minmax-rvv-u1v.c
index bec551b6017..a4873239b0a 100644
--- a/src/f32-maxpool/gen/f32-maxpool-9p-minmax-rvv-u1v.c
+++ b/src/f32-maxpool/gen/f32-maxpool-9p-minmax-rvv-u1v.c
@@ -3,7 +3,7 @@
 //   Template: src/f32-maxpool/rvv.c.in
 //   Generator: tools/xngen
 //
-// Copyright 2024 Imagination Technologies, inc.
+// Copyright 2026 Google LLC
 //
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
@@ -12,6 +12,7 @@
 #include "src/xnnpack/maxpool.h"
 #include <riscv_vector.h>
 
+
 void xnn_f32_maxpool_minmax_ukernel_9p__rvv_u1v(
     size_t output_pixels,
     size_t kernel_elements,
@@ -32,58 +33,60 @@ void xnn_f32_maxpool_minmax_ukernel_9p__rvv_u1v(
   const float output_max = params->scalar.max;
   do {
     const float** i = input;
+
+    // First pass: load the inputs, store the max pool in the output.
+    const float* i0 = *i++;
+    const float* i1 = 1 < kernel_elements ? *i++ : i0;
+    const float* i2 = 2 < kernel_elements ? *i++ : i0;
+    const float* i3 = 3 < kernel_elements ? *i++ : i0;
+    const float* i4 = 4 < kernel_elements ? *i++ : i0;
+    const float* i5 = 5 < kernel_elements ? *i++ : i0;
+    const float* i6 = 6 < kernel_elements ? *i++ : i0;
+    const float* i7 = 7 < kernel_elements ? *i++ : i0;
+    const float* i8 = 8 < kernel_elements ? *i++ : i0;
+    i0 = (const float*) ((uintptr_t) i0 + input_offset);
+    i1 = (const float*) ((uintptr_t) i1 + input_offset);
+    i2 = (const float*) ((uintptr_t) i2 + input_offset);
+    i3 = (const float*) ((uintptr_t) i3 + input_offset);
+    i4 = (const float*) ((uintptr_t) i4 + input_offset);
+    i5 = (const float*) ((uintptr_t) i5 + input_offset);
+    i6 = (const float*) ((uintptr_t) i6 + input_offset);
+    i7 = (const float*) ((uintptr_t) i7 + input_offset);
+    i8 = (const float*) ((uintptr_t) i8 + input_offset);
+
     float* o = output;
-    {
-      const float* i0 = *i++;
-      const float* i1 = 1 < kernel_elements ? *i++ : i0;
-      const float* i2 = 2 < kernel_elements ? *i++ : i0;
-      const float* i3 = 3 < kernel_elements ? *i++ : i0;
-      const float* i4 = 4 < kernel_elements ? *i++ : i0;
-      const float* i5 = 5 < kernel_elements ? *i++ : i0;
-      const float* i6 = 6 < kernel_elements ? *i++ : i0;
-      const float* i7 = 7 < kernel_elements ? *i++ : i0;
-      const float* i8 = 8 < kernel_elements ? *i++ : i0;
-      i0 = (const float*) ((uintptr_t) i0 + input_offset);
-      i1 = (const float*) ((uintptr_t) i1 + input_offset);
-      i2 = (const float*) ((uintptr_t) i2 + input_offset);
-      i3 = (const float*) ((uintptr_t) i3 + input_offset);
-      i4 = (const float*) ((uintptr_t) i4 + input_offset);
-      i5 = (const float*) ((uintptr_t) i5 + input_offset);
-      i6 = (const float*) ((uintptr_t) i6 + input_offset);
-      i7 = (const float*) ((uintptr_t) i7 + input_offset);
-      i8 = (const float*) ((uintptr_t) i8 + input_offset);
+    size_t c = channels;
+    do {
+      size_t vl = __riscv_vsetvl_e32m1(c);
+      vfloat32m1_t vi0 = __riscv_vle32_v_f32m1(i0, vl); i0 += vl;
+      vfloat32m1_t vi1 = __riscv_vle32_v_f32m1(i1, vl); i1 += vl;
+      vfloat32m1_t vi2 = __riscv_vle32_v_f32m1(i2, vl); i2 += vl;
+      vfloat32m1_t vi3 = __riscv_vle32_v_f32m1(i3, vl); i3 += vl;
+      vfloat32m1_t vi4 = __riscv_vle32_v_f32m1(i4, vl); i4 += vl;
+      vfloat32m1_t vi5 = __riscv_vle32_v_f32m1(i5, vl); i5 += vl;
+      vfloat32m1_t vi6 = __riscv_vle32_v_f32m1(i6, vl); i6 += vl;
+      vfloat32m1_t vi7 = __riscv_vle32_v_f32m1(i7, vl); i7 += vl;
+      vfloat32m1_t vi8 = __riscv_vle32_v_f32m1(i8, vl); i8 += vl;
 
-      size_t c = channels;
-      do {
-        int32_t n = __riscv_vsetvl_e32m1(c);
-
-        vfloat32m1_t i0_f32v = __riscv_vle32_v_f32m1(i0, n); i0 += n;
-        vfloat32m1_t i1_f32v = __riscv_vle32_v_f32m1(i1, n); i1 += n;
-        vfloat32m1_t i2_f32v = __riscv_vle32_v_f32m1(i2, n); i2 += n;
-        vfloat32m1_t i3_f32v = __riscv_vle32_v_f32m1(i3, n); i3 += n;
-        vfloat32m1_t i4_f32v = __riscv_vle32_v_f32m1(i4, n); i4 += n;
-        vfloat32m1_t i5_f32v = __riscv_vle32_v_f32m1(i5, n); i5 += n;
-        vfloat32m1_t i6_f32v = __riscv_vle32_v_f32m1(i6, n); i6 += n;
-        vfloat32m1_t i7_f32v = __riscv_vle32_v_f32m1(i7, n); i7 += n;
-        vfloat32m1_t i8_f32v = __riscv_vle32_v_f32m1(i8, n); i8 += n;
-
-        vfloat32m1_t max01_f32v = __riscv_vfmax_vv_f32m1(i0_f32v, i1_f32v, n);
-        vfloat32m1_t max23_f32v = __riscv_vfmax_vv_f32m1(i2_f32v, i3_f32v, n);
-        vfloat32m1_t max45_f32v = __riscv_vfmax_vv_f32m1(i4_f32v, i5_f32v, n);
-        vfloat32m1_t max67_f32v = __riscv_vfmax_vv_f32m1(i6_f32v, i7_f32v, n);
-        vfloat32m1_t max018_f32v = __riscv_vfmax_vv_f32m1(max01_f32v, i8_f32v, n);
-
-        vfloat32m1_t max2345_f32v = __riscv_vfmax_vv_f32m1(max23_f32v, max45_f32v, n);
-        vfloat32m1_t max01678_f32v = __riscv_vfmax_vv_f32m1(max67_f32v, max018_f32v, n);
-        vfloat32m1_t out_f32v = __riscv_vfmax_vv_f32m1(max2345_f32v, max01678_f32v, n);
-        out_f32v = __riscv_vfmin_vf_f32m1(__riscv_vfmax_vf_f32m1(out_f32v, output_min, n), output_max, n);
-        __riscv_vse32_v_f32m1(o, out_f32v, n); o += n;
-
-        c -= n;
-      } while (c != 0);
-    }
+      vfloat32m1_t vmax01 = __riscv_vfmax(vi0, vi1, vl);
+      vfloat32m1_t vmax23 = __riscv_vfmax(vi2, vi3, vl);
+      vfloat32m1_t vmax45 = __riscv_vfmax(vi4, vi5, vl);
+      vfloat32m1_t vmax67 = __riscv_vfmax(vi6, vi7, vl);
+      vfloat32m1_t vmax018 = __riscv_vfmax(vmax01, vi8, vl);
+
+      vfloat32m1_t vmax2345 = __riscv_vfmax(vmax23, vmax45, vl);
+      vfloat32m1_t vmax01678 = __riscv_vfmax(vmax67, vmax018, vl);
+      vfloat32m1_t vacc = __riscv_vfmax(vmax2345, vmax01678, vl);
 
-    for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 8) {
+      vacc = __riscv_vfmax(vacc, output_min, vl);
+      vacc = __riscv_vfmin(vacc, output_max, vl);
+      __riscv_vse32_v_f32m1(o, vacc, vl); o += vl;
+
+      c -= vl;
+    } while (c != 0);
+
+    // Passes 1 - n: Max more inputs to the output.
+    for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 9) {
       const float* i0 = *i++;
       const float* i1 = 1 < k ? *i++ : i0;
       const float* i2 = 2 < k ? *i++ : i0;
@@ -92,6 +95,7 @@ void xnn_f32_maxpool_minmax_ukernel_9p__rvv_u1v(
       const float* i5 = 5 < k ? *i++ : i0;
       const float* i6 = 6 < k ? *i++ : i0;
       const float* i7 = 7 < k ? *i++ : i0;
+      const float* i8 = 8 < k ? *i++ : i0;
       i0 = (const float*) ((uintptr_t) i0 + input_offset);
       i1 = (const float*) ((uintptr_t) i1 + input_offset);
       i2 = (const float*) ((uintptr_t) i2 + input_offset);
@@ -100,35 +104,40 @@ void xnn_f32_maxpool_minmax_ukernel_9p__rvv_u1v(
       i5 = (const float*) ((uintptr_t) i5 + input_offset);
       i6 = (const float*) ((uintptr_t) i6 + input_offset);
       i7 = (const float*) ((uintptr_t) i7 + input_offset);
+      i8 = (const float*) ((uintptr_t) i8 + input_offset);
 
       o = output;
       size_t c = channels;
       do {
-        int32_t n = __riscv_vsetvl_e32m1(c);
-
-        vfloat32m1_t i0_f32v = __riscv_vle32_v_f32m1(i0, n); i0 += n;
-        vfloat32m1_t i1_f32v = __riscv_vle32_v_f32m1(i1, n); i1 += n;
-        vfloat32m1_t i2_f32v = __riscv_vle32_v_f32m1(i2, n); i2 += n;
-        vfloat32m1_t i3_f32v = __riscv_vle32_v_f32m1(i3, n); i3 += n;
-        vfloat32m1_t i4_f32v = __riscv_vle32_v_f32m1(i4, n); i4 += n;
-        vfloat32m1_t i5_f32v = __riscv_vle32_v_f32m1(i5, n); i5 += n;
-        vfloat32m1_t i6_f32v = __riscv_vle32_v_f32m1(i6, n); i6 += n;
-        vfloat32m1_t i7_f32v = __riscv_vle32_v_f32m1(i7, n); i7 += n;
-        vfloat32m1_t i8_f32v = __riscv_vle32_v_f32m1(o, n);
-
-        vfloat32m1_t max01_f32v = __riscv_vfmax_vv_f32m1(i0_f32v, i1_f32v, n);
-        vfloat32m1_t max23_f32v = __riscv_vfmax_vv_f32m1(i2_f32v, i3_f32v, n);
-        vfloat32m1_t max45_f32v = __riscv_vfmax_vv_f32m1(i4_f32v, i5_f32v, n);
-        vfloat32m1_t max67_f32v = __riscv_vfmax_vv_f32m1(i6_f32v, i7_f32v, n);
-        vfloat32m1_t max018_f32v = __riscv_vfmax_vv_f32m1(max01_f32v, i8_f32v, n);
-
-        vfloat32m1_t max2345_f32v = __riscv_vfmax_vv_f32m1(max23_f32v, max45_f32v, n);
-        vfloat32m1_t max01678_f32v = __riscv_vfmax_vv_f32m1(max67_f32v, max018_f32v, n);
-        vfloat32m1_t out_f32v = __riscv_vfmax_vv_f32m1(max2345_f32v, max01678_f32v, n);
-        out_f32v = __riscv_vfmin_vf_f32m1(__riscv_vfmax_vf_f32m1(out_f32v, output_min, n), output_max, n);
-        __riscv_vse32_v_f32m1(o, out_f32v, n); o += n;
-
-        c -= n;
+        size_t vl = __riscv_vsetvl_e32m1(c);
+
+        vfloat32m1_t vi0 = __riscv_vle32_v_f32m1(i0, vl); i0 += vl;
+        vfloat32m1_t vi1 = __riscv_vle32_v_f32m1(i1, vl); i1 += vl;
+        vfloat32m1_t vi2 = __riscv_vle32_v_f32m1(i2, vl); i2 += vl;
+        vfloat32m1_t vi3 = __riscv_vle32_v_f32m1(i3, vl); i3 += vl;
+        vfloat32m1_t vi4 = __riscv_vle32_v_f32m1(i4, vl); i4 += vl;
+        vfloat32m1_t vi5 = __riscv_vle32_v_f32m1(i5, vl); i5 += vl;
+        vfloat32m1_t vi6 = __riscv_vle32_v_f32m1(i6, vl); i6 += vl;
+        vfloat32m1_t vi7 = __riscv_vle32_v_f32m1(i7, vl); i7 += vl;
+        vfloat32m1_t vi8 = __riscv_vle32_v_f32m1(i8, vl); i8 += vl;
+
+        vfloat32m1_t vprev = __riscv_vle32_v_f32m1(o, vl);
+
+        vfloat32m1_t vmax01 = __riscv_vfmax(vi0, vi1, vl);
+        vfloat32m1_t vmax23 = __riscv_vfmax(vi2, vi3, vl);
+        vfloat32m1_t vmax45 = __riscv_vfmax(vi4, vi5, vl);
+        vfloat32m1_t vmax67 = __riscv_vfmax(vi6, vi7, vl);
+        vfloat32m1_t vmax018 = __riscv_vfmax(vmax01, vi8, vl);
+
+        vfloat32m1_t vmax2345 = __riscv_vfmax(vmax23, vmax45, vl);
+        vfloat32m1_t vmax01678 = __riscv_vfmax(vmax67, vmax018, vl);
+        vfloat32m1_t vmax012345678 = __riscv_vfmax(vmax2345, vmax01678, vl);
+
+        vfloat32m1_t vacc = __riscv_vfmax(vprev, vmax012345678, vl);
+        vacc = __riscv_vfmin(vacc, output_max, vl);
+        __riscv_vse32_v_f32m1(o, vacc, vl); o += vl;
+
+        c -= vl;
       } while (c != 0);
     }
     input = (const float**) ((uintptr_t) input + input_increment);
diff --git a/src/f32-maxpool/gen/f32-maxpool-9p-minmax-rvv-u2v.c b/src/f32-maxpool/gen/f32-maxpool-9p-minmax-rvv-u2v.c
index e2dce631f6c..eb8bd0a71e4 100644
--- a/src/f32-maxpool/gen/f32-maxpool-9p-minmax-rvv-u2v.c
+++ b/src/f32-maxpool/gen/f32-maxpool-9p-minmax-rvv-u2v.c
@@ -3,7 +3,7 @@
 //   Template: src/f32-maxpool/rvv.c.in
 //   Generator: tools/xngen
 //
-// Copyright 2024 Imagination Technologies, inc.
+// Copyright 2026 Google LLC
 //
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
@@ -12,6 +12,7 @@
 #include "src/xnnpack/maxpool.h"
 #include <riscv_vector.h>
 
+
 void xnn_f32_maxpool_minmax_ukernel_9p__rvv_u2v(
     size_t output_pixels,
     size_t kernel_elements,
@@ -32,58 +33,60 @@ void xnn_f32_maxpool_minmax_ukernel_9p__rvv_u2v(
   const float output_max = params->scalar.max;
   do {
     const float** i = input;
+
+    // First pass: load the inputs, store the max pool in the output.
+    const float* i0 = *i++;
+    const float* i1 = 1 < kernel_elements ? *i++ : i0;
+    const float* i2 = 2 < kernel_elements ? *i++ : i0;
+    const float* i3 = 3 < kernel_elements ? *i++ : i0;
+    const float* i4 = 4 < kernel_elements ? *i++ : i0;
+    const float* i5 = 5 < kernel_elements ? *i++ : i0;
+    const float* i6 = 6 < kernel_elements ? *i++ : i0;
+    const float* i7 = 7 < kernel_elements ? *i++ : i0;
+    const float* i8 = 8 < kernel_elements ? *i++ : i0;
+    i0 = (const float*) ((uintptr_t) i0 + input_offset);
+    i1 = (const float*) ((uintptr_t) i1 + input_offset);
+    i2 = (const float*) ((uintptr_t) i2 + input_offset);
+    i3 = (const float*) ((uintptr_t) i3 + input_offset);
+    i4 = (const float*) ((uintptr_t) i4 + input_offset);
+    i5 = (const float*) ((uintptr_t) i5 + input_offset);
+    i6 = (const float*) ((uintptr_t) i6 + input_offset);
+    i7 = (const float*) ((uintptr_t) i7 + input_offset);
+    i8 = (const float*) ((uintptr_t) i8 + input_offset);
+
     float* o = output;
-    {
-      const float* i0 = *i++;
-      const float* i1 = 1 < kernel_elements ? *i++ : i0;
-      const float* i2 = 2 < kernel_elements ? *i++ : i0;
-      const float* i3 = 3 < kernel_elements ? *i++ : i0;
-      const float* i4 = 4 < kernel_elements ? *i++ : i0;
-      const float* i5 = 5 < kernel_elements ? *i++ : i0;
-      const float* i6 = 6 < kernel_elements ? *i++ : i0;
-      const float* i7 = 7 < kernel_elements ? *i++ : i0;
-      const float* i8 = 8 < kernel_elements ? *i++ : i0;
-      i0 = (const float*) ((uintptr_t) i0 + input_offset);
-      i1 = (const float*) ((uintptr_t) i1 + input_offset);
-      i2 = (const float*) ((uintptr_t) i2 + input_offset);
-      i3 = (const float*) ((uintptr_t) i3 + input_offset);
-      i4 = (const float*) ((uintptr_t) i4 + input_offset);
-      i5 = (const float*) ((uintptr_t) i5 + input_offset);
-      i6 = (const float*) ((uintptr_t) i6 + input_offset);
-      i7 = (const float*) ((uintptr_t) i7 + input_offset);
-      i8 = (const float*) ((uintptr_t) i8 + input_offset);
+    size_t c = channels;
+    do {
+      size_t vl = __riscv_vsetvl_e32m2(c);
+      vfloat32m2_t vi0 = __riscv_vle32_v_f32m2(i0, vl); i0 += vl;
+      vfloat32m2_t vi1 = __riscv_vle32_v_f32m2(i1, vl); i1 += vl;
+      vfloat32m2_t vi2 = __riscv_vle32_v_f32m2(i2, vl); i2 += vl;
+      vfloat32m2_t vi3 = __riscv_vle32_v_f32m2(i3, vl); i3 += vl;
+      vfloat32m2_t vi4 = __riscv_vle32_v_f32m2(i4, vl); i4 += vl;
+      vfloat32m2_t vi5 = __riscv_vle32_v_f32m2(i5, vl); i5 += vl;
+      vfloat32m2_t vi6 = __riscv_vle32_v_f32m2(i6, vl); i6 += vl;
+      vfloat32m2_t vi7 = __riscv_vle32_v_f32m2(i7, vl); i7 += vl;
+      vfloat32m2_t vi8 = __riscv_vle32_v_f32m2(i8, vl); i8 += vl;
 
-      size_t c = channels;
-      do {
-        int32_t n = __riscv_vsetvl_e32m2(c);
-
-        vfloat32m2_t i0_f32v = __riscv_vle32_v_f32m2(i0, n); i0 += n;
-        vfloat32m2_t i1_f32v = __riscv_vle32_v_f32m2(i1, n); i1 += n;
-        vfloat32m2_t i2_f32v = __riscv_vle32_v_f32m2(i2, n); i2 += n;
-        vfloat32m2_t i3_f32v = __riscv_vle32_v_f32m2(i3, n); i3 += n;
-        vfloat32m2_t i4_f32v = __riscv_vle32_v_f32m2(i4, n); i4 += n;
-        vfloat32m2_t i5_f32v = __riscv_vle32_v_f32m2(i5, n); i5 += n;
-        vfloat32m2_t i6_f32v = __riscv_vle32_v_f32m2(i6, n); i6 += n;
-        vfloat32m2_t i7_f32v = __riscv_vle32_v_f32m2(i7, n); i7 += n;
-        vfloat32m2_t i8_f32v = __riscv_vle32_v_f32m2(i8, n); i8 += n;
-
-        vfloat32m2_t max01_f32v = __riscv_vfmax_vv_f32m2(i0_f32v, i1_f32v, n);
-        vfloat32m2_t max23_f32v = __riscv_vfmax_vv_f32m2(i2_f32v, i3_f32v, n);
-        vfloat32m2_t max45_f32v = __riscv_vfmax_vv_f32m2(i4_f32v, i5_f32v, n);
-        vfloat32m2_t max67_f32v = __riscv_vfmax_vv_f32m2(i6_f32v, i7_f32v, n);
-        vfloat32m2_t max018_f32v = __riscv_vfmax_vv_f32m2(max01_f32v, i8_f32v, n);
-
-        vfloat32m2_t max2345_f32v = __riscv_vfmax_vv_f32m2(max23_f32v, max45_f32v, n);
-        vfloat32m2_t max01678_f32v = __riscv_vfmax_vv_f32m2(max67_f32v, max018_f32v, n);
-        vfloat32m2_t out_f32v = __riscv_vfmax_vv_f32m2(max2345_f32v, max01678_f32v, n);
-        out_f32v = __riscv_vfmin_vf_f32m2(__riscv_vfmax_vf_f32m2(out_f32v, output_min, n), output_max, n);
-        __riscv_vse32_v_f32m2(o, out_f32v, n); o += n;
-
-        c -= n;
-      } while (c != 0);
-    }
+      vfloat32m2_t vmax01 = __riscv_vfmax(vi0, vi1, vl);
+      vfloat32m2_t vmax23 = __riscv_vfmax(vi2, vi3, vl);
+      vfloat32m2_t vmax45 = __riscv_vfmax(vi4, vi5, vl);
+      vfloat32m2_t vmax67 = __riscv_vfmax(vi6, vi7, vl);
+      vfloat32m2_t vmax018 = __riscv_vfmax(vmax01, vi8, vl);
+
+      vfloat32m2_t vmax2345 = __riscv_vfmax(vmax23, vmax45, vl);
+      vfloat32m2_t vmax01678 = __riscv_vfmax(vmax67, vmax018, vl);
+      vfloat32m2_t vacc = __riscv_vfmax(vmax2345, vmax01678, vl);
 
-    for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 8) {
+      vacc = __riscv_vfmax(vacc, output_min, vl);
+      vacc = __riscv_vfmin(vacc, output_max, vl);
+      __riscv_vse32_v_f32m2(o, vacc, vl); o += vl;
+
+      c -= vl;
+    } while (c != 0);
+
+    // Passes 1 - n: Max more inputs to the output.
+    for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 9) {
       const float* i0 = *i++;
       const float* i1 = 1 < k ? *i++ : i0;
       const float* i2 = 2 < k ? *i++ : i0;
@@ -92,6 +95,7 @@ void xnn_f32_maxpool_minmax_ukernel_9p__rvv_u2v(
       const float* i5 = 5 < k ? *i++ : i0;
       const float* i6 = 6 < k ? *i++ : i0;
       const float* i7 = 7 < k ? *i++ : i0;
+      const float* i8 = 8 < k ? *i++ : i0;
       i0 = (const float*) ((uintptr_t) i0 + input_offset);
       i1 = (const float*) ((uintptr_t) i1 + input_offset);
       i2 = (const float*) ((uintptr_t) i2 + input_offset);
@@ -100,35 +104,40 @@ void xnn_f32_maxpool_minmax_ukernel_9p__rvv_u2v(
       i5 = (const float*) ((uintptr_t) i5 + input_offset);
       i6 = (const float*) ((uintptr_t) i6 + input_offset);
       i7 = (const float*) ((uintptr_t) i7 + input_offset);
+      i8 = (const float*) ((uintptr_t) i8 + input_offset);
 
       o = output;
       size_t c = channels;
       do {
-        int32_t n = __riscv_vsetvl_e32m2(c);
-
-        vfloat32m2_t i0_f32v = __riscv_vle32_v_f32m2(i0, n); i0 += n;
-        vfloat32m2_t i1_f32v = __riscv_vle32_v_f32m2(i1, n); i1 += n;
-        vfloat32m2_t i2_f32v = __riscv_vle32_v_f32m2(i2, n); i2 += n;
-        vfloat32m2_t i3_f32v = __riscv_vle32_v_f32m2(i3, n); i3 += n;
-        vfloat32m2_t i4_f32v = __riscv_vle32_v_f32m2(i4, n); i4 += n;
-        vfloat32m2_t i5_f32v = __riscv_vle32_v_f32m2(i5, n); i5 += n;
-        vfloat32m2_t i6_f32v = __riscv_vle32_v_f32m2(i6, n); i6 += n;
-        vfloat32m2_t i7_f32v = __riscv_vle32_v_f32m2(i7, n); i7 += n;
-        vfloat32m2_t i8_f32v = __riscv_vle32_v_f32m2(o, n);
-
-        vfloat32m2_t max01_f32v = __riscv_vfmax_vv_f32m2(i0_f32v, i1_f32v, n);
-        vfloat32m2_t max23_f32v = __riscv_vfmax_vv_f32m2(i2_f32v, i3_f32v, n);
-        vfloat32m2_t max45_f32v = __riscv_vfmax_vv_f32m2(i4_f32v, i5_f32v, n);
-        vfloat32m2_t max67_f32v = __riscv_vfmax_vv_f32m2(i6_f32v, i7_f32v, n);
-        vfloat32m2_t max018_f32v = __riscv_vfmax_vv_f32m2(max01_f32v, i8_f32v, n);
-
-        vfloat32m2_t max2345_f32v = __riscv_vfmax_vv_f32m2(max23_f32v, max45_f32v, n);
-        vfloat32m2_t max01678_f32v = __riscv_vfmax_vv_f32m2(max67_f32v, max018_f32v, n);
-        vfloat32m2_t out_f32v = __riscv_vfmax_vv_f32m2(max2345_f32v, max01678_f32v, n);
-        out_f32v = __riscv_vfmin_vf_f32m2(__riscv_vfmax_vf_f32m2(out_f32v, output_min, n), output_max, n);
-        __riscv_vse32_v_f32m2(o, out_f32v, n); o += n;
-
-        c -= n;
+        size_t vl = __riscv_vsetvl_e32m2(c);
+
+        vfloat32m2_t vi0 = __riscv_vle32_v_f32m2(i0, vl); i0 += vl;
+        vfloat32m2_t vi1 = __riscv_vle32_v_f32m2(i1, vl); i1 += vl;
+        vfloat32m2_t vi2 = __riscv_vle32_v_f32m2(i2, vl); i2 += vl;
+        vfloat32m2_t vi3 = __riscv_vle32_v_f32m2(i3, vl); i3 += vl;
+        vfloat32m2_t vi4 = __riscv_vle32_v_f32m2(i4, vl); i4 += vl;
+        vfloat32m2_t vi5 = __riscv_vle32_v_f32m2(i5, vl); i5 += vl;
+        vfloat32m2_t vi6 = __riscv_vle32_v_f32m2(i6, vl); i6 += vl;
+        vfloat32m2_t vi7 = __riscv_vle32_v_f32m2(i7, vl); i7 += vl;
+        vfloat32m2_t vi8 = __riscv_vle32_v_f32m2(i8, vl); i8 += vl;
+
+        vfloat32m2_t vprev = __riscv_vle32_v_f32m2(o, vl);
+
+        vfloat32m2_t vmax01 = __riscv_vfmax(vi0, vi1, vl);
+        vfloat32m2_t vmax23 = __riscv_vfmax(vi2, vi3, vl);
+        vfloat32m2_t vmax45 = __riscv_vfmax(vi4, vi5, vl);
+        vfloat32m2_t vmax67 = __riscv_vfmax(vi6, vi7, vl);
+        vfloat32m2_t vmax018 = __riscv_vfmax(vmax01, vi8, vl);
+
+        vfloat32m2_t vmax2345 = __riscv_vfmax(vmax23, vmax45, vl);
+        vfloat32m2_t vmax01678 = __riscv_vfmax(vmax67, vmax018, vl);
+        vfloat32m2_t vmax012345678 = __riscv_vfmax(vmax2345, vmax01678, vl);
+
+        vfloat32m2_t vacc = __riscv_vfmax(vprev, vmax012345678, vl);
+        vacc = __riscv_vfmin(vacc, output_max, vl);
+        __riscv_vse32_v_f32m2(o, vacc, vl); o += vl;
+
+        c -= vl;
       } while (c != 0);
     }
     input = (const float**) ((uintptr_t) input + input_increment);
diff --git a/src/f32-maxpool/rvv.c.in b/src/f32-maxpool/rvv.c.in
index 00f32b0d877..87515e75d6f 100755
--- a/src/f32-maxpool/rvv.c.in
+++ b/src/f32-maxpool/rvv.c.in
@@ -1,134 +1,112 @@
-// Copyright 2024 Imagination Technologies, inc.
+// Copyright 2026 Google LLC
 //
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
 $assert LMUL in [1, 2, 4, 8]
+$assert(DATATYPE in ["f32", "f16", "u8", "s8"])
 #include <assert.h>
 #include "src/xnnpack/maxpool.h"
 #include <riscv_vector.h>
 
-void xnn_f32_maxpool_minmax_ukernel_9p__rvv_u${LMUL}v(
+$CTYPE = {"f32": "float", "f16": "xnn_float16", "u8": "uint8_t", "s8": "int8_t"}[DATATYPE]
+$VTYPE = {"f32": "vfloat32", "f16": "vfloat16", "u8": "vuint8", "s8": "vint8"}[DATATYPE]
+$VLOAD = {"f32": "__riscv_vle32_v_f32", "f16": "__riscv_vle16_v_f16", "u8": "__riscv_vle8_v_u8", "s8": "__riscv_vle8_v_i8"}[DATATYPE]
+$VSTORE = {"f32": "__riscv_vse32_v_f32", "f16": "__riscv_vse16_v_f16", "u8": "__riscv_vse8_v_u8", "s8": "__riscv_vse8_v_i8"}[DATATYPE]
+$VSETVL = {"f32": "__riscv_vsetvl_e32", "f16": "__riscv_vsetvl_e16", "u8": "__riscv_vsetvl_e8", "s8": "__riscv_vsetvl_e8"}[DATATYPE]
+$VMAX = {"f32": "__riscv_vfmax", "f16": "__riscv_vfmax", "u8": "__riscv_vmaxu", "s8": "__riscv_vmax"}[DATATYPE]
+$VMIN = {"f32": "__riscv_vfmin", "f16": "__riscv_vfmin", "u8": "__riscv_vminu", "s8": "__riscv_vmin"}[DATATYPE]
+$ISA = "fp16arith" if DATATYPE == "f16" else ""
+
+void xnn_${DATATYPE}_maxpool_minmax_ukernel_9p__rvv${ISA}_u${LMUL}v(
     size_t output_pixels,
     size_t kernel_elements,
     size_t channels,
-    const float** input,
+    const ${CTYPE}** input,
     size_t input_offset,
     size_t input_pixel_stride,
-    float* output,
+    ${CTYPE}* output,
     size_t input_increment,
     size_t output_increment,
-    const struct xnn_f32_minmax_params* restrict params)
+    const struct xnn_${DATATYPE}_minmax_params* restrict params)
 {
   assert(output_pixels != 0);
   assert(kernel_elements != 0);
   assert(channels != 0);
 
-  const float output_min = params->scalar.min;
-  const float output_max = params->scalar.max;
+  const ${CTYPE} output_min = params->scalar.min;
+  const ${CTYPE} output_max = params->scalar.max;
   do {
-    const float** i = input;
-    float* o = output;
-    {
-      const float* i0 = *i++;
-      const float* i1 = 1 < kernel_elements ? *i++ : i0;
-      const float* i2 = 2 < kernel_elements ? *i++ : i0;
-      const float* i3 = 3 < kernel_elements ? *i++ : i0;
-      const float* i4 = 4 < kernel_elements ? *i++ : i0;
-      const float* i5 = 5 < kernel_elements ? *i++ : i0;
-      const float* i6 = 6 < kernel_elements ? *i++ : i0;
-      const float* i7 = 7 < kernel_elements ? *i++ : i0;
-      const float* i8 = 8 < kernel_elements ? *i++ : i0;
-      i0 = (const float*) ((uintptr_t) i0 + input_offset);
-      i1 = (const float*) ((uintptr_t) i1 + input_offset);
-      i2 = (const float*) ((uintptr_t) i2 + input_offset);
-      i3 = (const float*) ((uintptr_t) i3 + input_offset);
-      i4 = (const float*) ((uintptr_t) i4 + input_offset);
-      i5 = (const float*) ((uintptr_t) i5 + input_offset);
-      i6 = (const float*) ((uintptr_t) i6 + input_offset);
-      i7 = (const float*) ((uintptr_t) i7 + input_offset);
-      i8 = (const float*) ((uintptr_t) i8 + input_offset);
+    const ${CTYPE}** i = input;
 
-      size_t c = channels;
-      do {
-        int32_t n = __riscv_vsetvl_e32m${LMUL}(c);
-
-        vfloat32m${LMUL}_t i0_f32v = __riscv_vle32_v_f32m${LMUL}(i0, n); i0 += n;
-        vfloat32m${LMUL}_t i1_f32v = __riscv_vle32_v_f32m${LMUL}(i1, n); i1 += n;
-        vfloat32m${LMUL}_t i2_f32v = __riscv_vle32_v_f32m${LMUL}(i2, n); i2 += n;
-        vfloat32m${LMUL}_t i3_f32v = __riscv_vle32_v_f32m${LMUL}(i3, n); i3 += n;
-        vfloat32m${LMUL}_t i4_f32v = __riscv_vle32_v_f32m${LMUL}(i4, n); i4 += n;
-        vfloat32m${LMUL}_t i5_f32v = __riscv_vle32_v_f32m${LMUL}(i5, n); i5 += n;
-        vfloat32m${LMUL}_t i6_f32v = __riscv_vle32_v_f32m${LMUL}(i6, n); i6 += n;
-        vfloat32m${LMUL}_t i7_f32v = __riscv_vle32_v_f32m${LMUL}(i7, n); i7 += n;
-        vfloat32m${LMUL}_t i8_f32v = __riscv_vle32_v_f32m${LMUL}(i8, n); i8 += n;
-
-        vfloat32m${LMUL}_t max01_f32v = __riscv_vfmax_vv_f32m${LMUL}(i0_f32v, i1_f32v, n);
-        vfloat32m${LMUL}_t max23_f32v = __riscv_vfmax_vv_f32m${LMUL}(i2_f32v, i3_f32v, n);
-        vfloat32m${LMUL}_t max45_f32v = __riscv_vfmax_vv_f32m${LMUL}(i4_f32v, i5_f32v, n);
-        vfloat32m${LMUL}_t max67_f32v = __riscv_vfmax_vv_f32m${LMUL}(i6_f32v, i7_f32v, n);
-        vfloat32m${LMUL}_t max018_f32v = __riscv_vfmax_vv_f32m${LMUL}(max01_f32v, i8_f32v, n);
-
-        vfloat32m${LMUL}_t max2345_f32v = __riscv_vfmax_vv_f32m${LMUL}(max23_f32v, max45_f32v, n);
-        vfloat32m${LMUL}_t max01678_f32v = __riscv_vfmax_vv_f32m${LMUL}(max67_f32v, max018_f32v, n);
-        vfloat32m${LMUL}_t out_f32v = __riscv_vfmax_vv_f32m${LMUL}(max2345_f32v, max01678_f32v, n);
-        out_f32v = __riscv_vfmin_vf_f32m${LMUL}(__riscv_vfmax_vf_f32m${LMUL}(out_f32v, output_min, n), output_max, n);
-        __riscv_vse32_v_f32m${LMUL}(o, out_f32v, n); o += n;
-
-        c -= n;
-      } while (c != 0);
-    }
+    // First pass: load the inputs, store the max pool in the output.
+    const ${CTYPE}* i0 = *i++;
+    $for K in range(1, 9):
+      const ${CTYPE}* i${K} = ${K} < kernel_elements ? *i++ : i0;
+    $for K in range(9):
+      i${K} = (const ${CTYPE}*) ((uintptr_t) i${K} + input_offset);
+
+    ${CTYPE}* o = output;
+    size_t c = channels;
+    do {
+      size_t vl = ${VSETVL}m${LMUL}(c);
+      $for K in range(9):
+        ${VTYPE}m${LMUL}_t vi${K} = ${VLOAD}m${LMUL}(i${K}, vl); i${K} += vl;
+
+      ${VTYPE}m${LMUL}_t vmax01 = ${VMAX}(vi0, vi1, vl);
+      ${VTYPE}m${LMUL}_t vmax23 = ${VMAX}(vi2, vi3, vl);
+      ${VTYPE}m${LMUL}_t vmax45 = ${VMAX}(vi4, vi5, vl);
+      ${VTYPE}m${LMUL}_t vmax67 = ${VMAX}(vi6, vi7, vl);
+      ${VTYPE}m${LMUL}_t vmax018 = ${VMAX}(vmax01, vi8, vl);
+
+      ${VTYPE}m${LMUL}_t vmax2345 = ${VMAX}(vmax23, vmax45, vl);
+      ${VTYPE}m${LMUL}_t vmax01678 = ${VMAX}(vmax67, vmax018, vl);
+      ${VTYPE}m${LMUL}_t vacc = ${VMAX}(vmax2345, vmax01678, vl);
 
-    for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 8) {
-      const float* i0 = *i++;
-      const float* i1 = 1 < k ? *i++ : i0;
-      const float* i2 = 2 < k ? *i++ : i0;
-      const float* i3 = 3 < k ? *i++ : i0;
-      const float* i4 = 4 < k ? *i++ : i0;
-      const float* i5 = 5 < k ? *i++ : i0;
-      const float* i6 = 6 < k ? *i++ : i0;
-      const float* i7 = 7 < k ? *i++ : i0;
-      i0 = (const float*) ((uintptr_t) i0 + input_offset);
-      i1 = (const float*) ((uintptr_t) i1 + input_offset);
-      i2 = (const float*) ((uintptr_t) i2 + input_offset);
-      i3 = (const float*) ((uintptr_t) i3 + input_offset);
-      i4 = (const float*) ((uintptr_t) i4 + input_offset);
-      i5 = (const float*) ((uintptr_t) i5 + input_offset);
-      i6 = (const float*) ((uintptr_t) i6 + input_offset);
-      i7 = (const float*) ((uintptr_t) i7 + input_offset);
+      vacc = ${VMAX}(vacc, output_min, vl);
+      vacc = ${VMIN}(vacc, output_max, vl);
+      ${VSTORE}m${LMUL}(o, vacc, vl); o += vl;
+
+      c -= vl;
+    } while (c != 0);
+
+    // Passes 1 - n: Max more inputs to the output.
+    for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 9) {
+      const ${CTYPE}* i0 = *i++;
+      $for K in range(1, 9):
+        const ${CTYPE}* i${K} = ${K} < k ? *i++ : i0;
+      $for K in range(9):
+        i${K} = (const ${CTYPE}*) ((uintptr_t) i${K} + input_offset);
 
       o = output;
       size_t c = channels;
       do {
-        int32_t n = __riscv_vsetvl_e32m${LMUL}(c);
-
-        vfloat32m${LMUL}_t i0_f32v = __riscv_vle32_v_f32m${LMUL}(i0, n); i0 += n;
-        vfloat32m${LMUL}_t i1_f32v = __riscv_vle32_v_f32m${LMUL}(i1, n); i1 += n;
-        vfloat32m${LMUL}_t i2_f32v = __riscv_vle32_v_f32m${LMUL}(i2, n); i2 += n;
-        vfloat32m${LMUL}_t i3_f32v = __riscv_vle32_v_f32m${LMUL}(i3, n); i3 += n;
-        vfloat32m${LMUL}_t i4_f32v = __riscv_vle32_v_f32m${LMUL}(i4, n); i4 += n;
-        vfloat32m${LMUL}_t i5_f32v = __riscv_vle32_v_f32m${LMUL}(i5, n); i5 += n;
-        vfloat32m${LMUL}_t i6_f32v = __riscv_vle32_v_f32m${LMUL}(i6, n); i6 += n;
-        vfloat32m${LMUL}_t i7_f32v = __riscv_vle32_v_f32m${LMUL}(i7, n); i7 += n;
-        vfloat32m${LMUL}_t i8_f32v = __riscv_vle32_v_f32m${LMUL}(o, n);
-
-        vfloat32m${LMUL}_t max01_f32v = __riscv_vfmax_vv_f32m${LMUL}(i0_f32v, i1_f32v, n);
-        vfloat32m${LMUL}_t max23_f32v = __riscv_vfmax_vv_f32m${LMUL}(i2_f32v, i3_f32v, n);
-        vfloat32m${LMUL}_t max45_f32v = __riscv_vfmax_vv_f32m${LMUL}(i4_f32v, i5_f32v, n);
-        vfloat32m${LMUL}_t max67_f32v = __riscv_vfmax_vv_f32m${LMUL}(i6_f32v, i7_f32v, n);
-        vfloat32m${LMUL}_t max018_f32v = __riscv_vfmax_vv_f32m${LMUL}(max01_f32v, i8_f32v, n);
-
-        vfloat32m${LMUL}_t max2345_f32v = __riscv_vfmax_vv_f32m${LMUL}(max23_f32v, max45_f32v, n);
-        vfloat32m${LMUL}_t max01678_f32v = __riscv_vfmax_vv_f32m${LMUL}(max67_f32v, max018_f32v, n);
-        vfloat32m${LMUL}_t out_f32v = __riscv_vfmax_vv_f32m${LMUL}(max2345_f32v, max01678_f32v, n);
-        out_f32v = __riscv_vfmin_vf_f32m${LMUL}(__riscv_vfmax_vf_f32m${LMUL}(out_f32v, output_min, n), output_max, n);
-        __riscv_vse32_v_f32m${LMUL}(o, out_f32v, n); o += n;
-
-        c -= n;
+        size_t vl = ${VSETVL}m${LMUL}(c);
+
+        $for K in range(9):
+          ${VTYPE}m${LMUL}_t vi${K} = ${VLOAD}m${LMUL}(i${K}, vl); i${K} += vl;
+
+        ${VTYPE}m${LMUL}_t vprev = ${VLOAD}m${LMUL}(o, vl);
+
+        ${VTYPE}m${LMUL}_t vmax01 = ${VMAX}(vi0, vi1, vl);
+        ${VTYPE}m${LMUL}_t vmax23 = ${VMAX}(vi2, vi3, vl);
+        ${VTYPE}m${LMUL}_t vmax45 = ${VMAX}(vi4, vi5, vl);
+        ${VTYPE}m${LMUL}_t vmax67 = ${VMAX}(vi6, vi7, vl);
+        ${VTYPE}m${LMUL}_t vmax018 = ${VMAX}(vmax01, vi8, vl);
+
+        ${VTYPE}m${LMUL}_t vmax2345 = ${VMAX}(vmax23, vmax45, vl);
+        ${VTYPE}m${LMUL}_t vmax01678 = ${VMAX}(vmax67, vmax018, vl);
+        ${VTYPE}m${LMUL}_t vmax012345678 = ${VMAX}(vmax2345, vmax01678, vl);
+
+        ${VTYPE}m${LMUL}_t vacc = ${VMAX}(vprev, vmax012345678, vl);
+        vacc = ${VMIN}(vacc, output_max, vl);
+        ${VSTORE}m${LMUL}(o, vacc, vl); o += vl;
+
+        c -= vl;
       } while (c != 0);
     }
-    input = (const float**) ((uintptr_t) input + input_increment);
+    input = (const ${CTYPE}**) ((uintptr_t) input + input_increment);
     input_offset += input_pixel_stride;
-    output = (float*) ((uintptr_t) output + output_increment);
+    output = (${CTYPE}*) ((uintptr_t) output + output_increment);
   } while (--output_pixels != 0);
 }
diff --git a/src/s8-maxpool/gen/s8-maxpool-9p-minmax-rvv-u1v.c b/src/s8-maxpool/gen/s8-maxpool-9p-minmax-rvv-u1v.c
new file mode 100644
index 00000000000..3114918b9b0
--- /dev/null
+++ b/src/s8-maxpool/gen/s8-maxpool-9p-minmax-rvv-u1v.c
@@ -0,0 +1,147 @@
+// clang-format off
+// Auto-generated file. Do not edit!
+//   Template: src/f32-maxpool/rvv.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2026 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include "src/xnnpack/maxpool.h"
+#include <riscv_vector.h>
+
+
+void xnn_s8_maxpool_minmax_ukernel_9p__rvv_u1v(
+    size_t output_pixels,
+    size_t kernel_elements,
+    size_t channels,
+    const int8_t** input,
+    size_t input_offset,
+    size_t input_pixel_stride,
+    int8_t* output,
+    size_t input_increment,
+    size_t output_increment,
+    const struct xnn_s8_minmax_params* restrict params)
+{
+  assert(output_pixels != 0);
+  assert(kernel_elements != 0);
+  assert(channels != 0);
+
+  const int8_t output_min = params->scalar.min;
+  const int8_t output_max = params->scalar.max;
+  do {
+    const int8_t** i = input;
+
+    // First pass: load the inputs, store the max pool in the output.
+    const int8_t* i0 = *i++;
+    const int8_t* i1 = 1 < kernel_elements ? *i++ : i0;
+    const int8_t* i2 = 2 < kernel_elements ? *i++ : i0;
+    const int8_t* i3 = 3 < kernel_elements ? *i++ : i0;
+    const int8_t* i4 = 4 < kernel_elements ? *i++ : i0;
+    const int8_t* i5 = 5 < kernel_elements ? *i++ : i0;
+    const int8_t* i6 = 6 < kernel_elements ? *i++ : i0;
+    const int8_t* i7 = 7 < kernel_elements ? *i++ : i0;
+    const int8_t* i8 = 8 < kernel_elements ? *i++ : i0;
+    i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
+    i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
+    i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
+    i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
+    i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
+    i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
+    i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
+    i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
+    i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
+
+    int8_t* o = output;
+    size_t c = channels;
+    do {
+      size_t vl = __riscv_vsetvl_e8m1(c);
+      vint8m1_t vi0 = __riscv_vle8_v_i8m1(i0, vl); i0 += vl;
+      vint8m1_t vi1 = __riscv_vle8_v_i8m1(i1, vl); i1 += vl;
+      vint8m1_t vi2 = __riscv_vle8_v_i8m1(i2, vl); i2 += vl;
+      vint8m1_t vi3 = __riscv_vle8_v_i8m1(i3, vl); i3 += vl;
+      vint8m1_t vi4 = __riscv_vle8_v_i8m1(i4, vl); i4 += vl;
+      vint8m1_t vi5 = __riscv_vle8_v_i8m1(i5, vl); i5 += vl;
+      vint8m1_t vi6 = __riscv_vle8_v_i8m1(i6, vl); i6 += vl;
+      vint8m1_t vi7 = __riscv_vle8_v_i8m1(i7, vl); i7 += vl;
+      vint8m1_t vi8 = __riscv_vle8_v_i8m1(i8, vl); i8 += vl;
+
+      vint8m1_t vmax01 = __riscv_vmax(vi0, vi1, vl);
+      vint8m1_t vmax23 = __riscv_vmax(vi2, vi3, vl);
+      vint8m1_t vmax45 = __riscv_vmax(vi4, vi5, vl);
+      vint8m1_t vmax67 = __riscv_vmax(vi6, vi7, vl);
+      vint8m1_t vmax018 = __riscv_vmax(vmax01, vi8, vl);
+
+      vint8m1_t vmax2345 = __riscv_vmax(vmax23, vmax45, vl);
+      vint8m1_t vmax01678 = __riscv_vmax(vmax67, vmax018, vl);
+      vint8m1_t vacc = __riscv_vmax(vmax2345, vmax01678, vl);
+
+      vacc = __riscv_vmax(vacc, output_min, vl);
+      vacc = __riscv_vmin(vacc, output_max, vl);
+      __riscv_vse8_v_i8m1(o, vacc, vl); o += vl;
+
+      c -= vl;
+    } while (c != 0);
+
+    // Passes 1 - n: Max more inputs to the output.
+    for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 9) {
+      const int8_t* i0 = *i++;
+      const int8_t* i1 = 1 < k ? *i++ : i0;
+      const int8_t* i2 = 2 < k ? *i++ : i0;
+      const int8_t* i3 = 3 < k ? *i++ : i0;
+      const int8_t* i4 = 4 < k ? *i++ : i0;
+      const int8_t* i5 = 5 < k ? *i++ : i0;
+      const int8_t* i6 = 6 < k ? *i++ : i0;
+      const int8_t* i7 = 7 < k ? *i++ : i0;
+      const int8_t* i8 = 8 < k ? *i++ : i0;
+      i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
+      i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
+      i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
+      i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
+      i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
+      i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
+      i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
+      i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
+      i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
+
+      o = output;
+      size_t c = channels;
+      do {
+        size_t vl = __riscv_vsetvl_e8m1(c);
+
+        vint8m1_t vi0 = __riscv_vle8_v_i8m1(i0, vl); i0 += vl;
+        vint8m1_t vi1 = __riscv_vle8_v_i8m1(i1, vl); i1 += vl;
+        vint8m1_t vi2 = __riscv_vle8_v_i8m1(i2, vl); i2 += vl;
+        vint8m1_t vi3 = __riscv_vle8_v_i8m1(i3, vl); i3 += vl;
+        vint8m1_t vi4 = __riscv_vle8_v_i8m1(i4, vl); i4 += vl;
+        vint8m1_t vi5 = __riscv_vle8_v_i8m1(i5, vl); i5 += vl;
+        vint8m1_t vi6 = __riscv_vle8_v_i8m1(i6, vl); i6 += vl;
+        vint8m1_t vi7 = __riscv_vle8_v_i8m1(i7, vl); i7 += vl;
+        vint8m1_t vi8 = __riscv_vle8_v_i8m1(i8, vl); i8 += vl;
+
+        vint8m1_t vprev = __riscv_vle8_v_i8m1(o, vl);
+
+        vint8m1_t vmax01 = __riscv_vmax(vi0, vi1, vl);
+        vint8m1_t vmax23 = __riscv_vmax(vi2, vi3, vl);
+        vint8m1_t vmax45 = __riscv_vmax(vi4, vi5, vl);
+        vint8m1_t vmax67 = __riscv_vmax(vi6, vi7, vl);
+        vint8m1_t vmax018 = __riscv_vmax(vmax01, vi8, vl);
+
+        vint8m1_t vmax2345 = __riscv_vmax(vmax23, vmax45, vl);
+        vint8m1_t vmax01678 = __riscv_vmax(vmax67, vmax018, vl);
+        vint8m1_t vmax012345678 = __riscv_vmax(vmax2345, vmax01678, vl);
+
+        vint8m1_t vacc = __riscv_vmax(vprev, vmax012345678, vl);
+        vacc = __riscv_vmin(vacc, output_max, vl);
+        __riscv_vse8_v_i8m1(o, vacc, vl); o += vl;
+
+        c -= vl;
+      } while (c != 0);
+    }
+    input = (const int8_t**) ((uintptr_t) input + input_increment);
+    input_offset += input_pixel_stride;
+    output = (int8_t*) ((uintptr_t) output + output_increment);
+  } while (--output_pixels != 0);
+}
diff --git a/src/s8-maxpool/gen/s8-maxpool-9p-minmax-rvv-u2v.c b/src/s8-maxpool/gen/s8-maxpool-9p-minmax-rvv-u2v.c
new file mode 100644
index 00000000000..439ab947045
--- /dev/null
+++ b/src/s8-maxpool/gen/s8-maxpool-9p-minmax-rvv-u2v.c
@@ -0,0 +1,147 @@
+// clang-format off
+// Auto-generated file. Do not edit!
+//   Template: src/f32-maxpool/rvv.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2026 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include "src/xnnpack/maxpool.h"
+#include <riscv_vector.h>
+
+
+void xnn_s8_maxpool_minmax_ukernel_9p__rvv_u2v(
+    size_t output_pixels,
+    size_t kernel_elements,
+    size_t channels,
+    const int8_t** input,
+    size_t input_offset,
+    size_t input_pixel_stride,
+    int8_t* output,
+    size_t input_increment,
+    size_t output_increment,
+    const struct xnn_s8_minmax_params* restrict params)
+{
+  assert(output_pixels != 0);
+  assert(kernel_elements != 0);
+  assert(channels != 0);
+
+  const int8_t output_min = params->scalar.min;
+  const int8_t output_max = params->scalar.max;
+  do {
+    const int8_t** i = input;
+
+    // First pass: load the inputs, store the max pool in the output.
+    const int8_t* i0 = *i++;
+    const int8_t* i1 = 1 < kernel_elements ? *i++ : i0;
+    const int8_t* i2 = 2 < kernel_elements ? *i++ : i0;
+    const int8_t* i3 = 3 < kernel_elements ? *i++ : i0;
+    const int8_t* i4 = 4 < kernel_elements ? *i++ : i0;
+    const int8_t* i5 = 5 < kernel_elements ? *i++ : i0;
+    const int8_t* i6 = 6 < kernel_elements ? *i++ : i0;
+    const int8_t* i7 = 7 < kernel_elements ? *i++ : i0;
+    const int8_t* i8 = 8 < kernel_elements ? *i++ : i0;
+    i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
+    i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
+    i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
+    i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
+    i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
+    i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
+    i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
+    i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
+    i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
+
+    int8_t* o = output;
+    size_t c = channels;
+    do {
+      size_t vl = __riscv_vsetvl_e8m2(c);
+      vint8m2_t vi0 = __riscv_vle8_v_i8m2(i0, vl); i0 += vl;
+      vint8m2_t vi1 = __riscv_vle8_v_i8m2(i1, vl); i1 += vl;
+      vint8m2_t vi2 = __riscv_vle8_v_i8m2(i2, vl); i2 += vl;
+      vint8m2_t vi3 = __riscv_vle8_v_i8m2(i3, vl); i3 += vl;
+      vint8m2_t vi4 = __riscv_vle8_v_i8m2(i4, vl); i4 += vl;
+      vint8m2_t vi5 = __riscv_vle8_v_i8m2(i5, vl); i5 += vl;
+      vint8m2_t vi6 = __riscv_vle8_v_i8m2(i6, vl); i6 += vl;
+      vint8m2_t vi7 = __riscv_vle8_v_i8m2(i7, vl); i7 += vl;
+      vint8m2_t vi8 = __riscv_vle8_v_i8m2(i8, vl); i8 += vl;
+
+      vint8m2_t vmax01 = __riscv_vmax(vi0, vi1, vl);
+      vint8m2_t vmax23 = __riscv_vmax(vi2, vi3, vl);
+      vint8m2_t vmax45 = __riscv_vmax(vi4, vi5, vl);
+      vint8m2_t vmax67 = __riscv_vmax(vi6, vi7, vl);
+      vint8m2_t vmax018 = __riscv_vmax(vmax01, vi8, vl);
+
+      vint8m2_t vmax2345 = __riscv_vmax(vmax23, vmax45, vl);
+      vint8m2_t vmax01678 = __riscv_vmax(vmax67, vmax018, vl);
+      vint8m2_t vacc = __riscv_vmax(vmax2345, vmax01678, vl);
+
+      vacc = __riscv_vmax(vacc, output_min, vl);
+      vacc = __riscv_vmin(vacc, output_max, vl);
+      __riscv_vse8_v_i8m2(o, vacc, vl); o += vl;
+
+      c -= vl;
+    } while (c != 0);
+
+    // Passes 1 - n: Max more inputs to the output.
+    for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 9) {
+      const int8_t* i0 = *i++;
+      const int8_t* i1 = 1 < k ? *i++ : i0;
+      const int8_t* i2 = 2 < k ? *i++ : i0;
+      const int8_t* i3 = 3 < k ? *i++ : i0;
+      const int8_t* i4 = 4 < k ? *i++ : i0;
+      const int8_t* i5 = 5 < k ? *i++ : i0;
+      const int8_t* i6 = 6 < k ? *i++ : i0;
+      const int8_t* i7 = 7 < k ? *i++ : i0;
+      const int8_t* i8 = 8 < k ? *i++ : i0;
+      i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
+      i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
+      i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
+      i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
+      i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
+      i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
+      i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
+      i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
+      i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
+
+      o = output;
+      size_t c = channels;
+      do {
+        size_t vl = __riscv_vsetvl_e8m2(c);
+
+        vint8m2_t vi0 = __riscv_vle8_v_i8m2(i0, vl); i0 += vl;
+        vint8m2_t vi1 = __riscv_vle8_v_i8m2(i1, vl); i1 += vl;
+        vint8m2_t vi2 = __riscv_vle8_v_i8m2(i2, vl); i2 += vl;
+        vint8m2_t vi3 = __riscv_vle8_v_i8m2(i3, vl); i3 += vl;
+        vint8m2_t vi4 = __riscv_vle8_v_i8m2(i4, vl); i4 += vl;
+        vint8m2_t vi5 = __riscv_vle8_v_i8m2(i5, vl); i5 += vl;
+        vint8m2_t vi6 = __riscv_vle8_v_i8m2(i6, vl); i6 += vl;
+        vint8m2_t vi7 = __riscv_vle8_v_i8m2(i7, vl); i7 += vl;
+        vint8m2_t vi8 = __riscv_vle8_v_i8m2(i8, vl); i8 += vl;
+
+        vint8m2_t vprev = __riscv_vle8_v_i8m2(o, vl);
+
+        vint8m2_t vmax01 = __riscv_vmax(vi0, vi1, vl);
+        vint8m2_t vmax23 = __riscv_vmax(vi2, vi3, vl);
+        vint8m2_t vmax45 = __riscv_vmax(vi4, vi5, vl);
+        vint8m2_t vmax67 = __riscv_vmax(vi6, vi7, vl);
+        vint8m2_t vmax018 = __riscv_vmax(vmax01, vi8, vl);
+
+        vint8m2_t vmax2345 = __riscv_vmax(vmax23, vmax45, vl);
+        vint8m2_t vmax01678 = __riscv_vmax(vmax67, vmax018, vl);
+        vint8m2_t vmax012345678 = __riscv_vmax(vmax2345, vmax01678, vl);
+
+        vint8m2_t vacc = __riscv_vmax(vprev, vmax012345678, vl);
+        vacc = __riscv_vmin(vacc, output_max, vl);
+        __riscv_vse8_v_i8m2(o, vacc, vl); o += vl;
+
+        c -= vl;
+      } while (c != 0);
+    }
+    input = (const int8_t**) ((uintptr_t) input + input_increment);
+    input_offset += input_pixel_stride;
+    output = (int8_t*) ((uintptr_t) output + output_increment);
+  } while (--output_pixels != 0);
+}
diff --git a/src/s8-maxpool/s8-maxpool-minmax.inc b/src/s8-maxpool/s8-maxpool-minmax.inc
index 8c453aae27e..498299cb8d6 100644
--- a/src/s8-maxpool/s8-maxpool-minmax.inc
+++ b/src/s8-maxpool/s8-maxpool-minmax.inc
@@ -17,6 +17,9 @@ XNN_UKERNEL(xnn_arch_x86_sse4_1, xnn_s8_maxpool_minmax_ukernel_9p__sse41_u16, 16
 XNN_UKERNEL(xnn_arch_none, xnn_s8_maxpool_minmax_ukernel_9p__wasmsimd_u16, 16, 9, int8_t, struct xnn_s8_minmax_params, xnn_init_s8_minmax_scalar_params)
 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
 
-XNN_UKERNEL(xnn_arch_none, xnn_s8_maxpool_minmax_ukernel_9p__scalar_u1, 1, 9, int8_t, struct xnn_s8_minmax_params, xnn_init_s8_minmax_scalar_params)
-
+#if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR
+XNN_UKERNEL(xnn_arch_riscv_vector, xnn_s8_maxpool_minmax_ukernel_9p__rvv_u1v, (1*xnn_init_hardware_config()->vlenb/sizeof(int8_t)), 9, int8_t, struct xnn_s8_minmax_params, xnn_init_s8_minmax_scalar_params)
+XNN_UKERNEL(xnn_arch_riscv_vector, xnn_s8_maxpool_minmax_ukernel_9p__rvv_u2v, (2*xnn_init_hardware_config()->vlenb/sizeof(int8_t)), 9, int8_t, struct xnn_s8_minmax_params, xnn_init_s8_minmax_scalar_params)
+#endif  // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR
 
+XNN_UKERNEL(xnn_arch_none, xnn_s8_maxpool_minmax_ukernel_9p__scalar_u1, 1, 9, int8_t, struct xnn_s8_minmax_params, xnn_init_s8_minmax_scalar_params)
diff --git a/src/u8-maxpool/gen/u8-maxpool-9p-minmax-rvv-u1v.c b/src/u8-maxpool/gen/u8-maxpool-9p-minmax-rvv-u1v.c
new file mode 100644
index 00000000000..c0561f3d658
--- /dev/null
+++ b/src/u8-maxpool/gen/u8-maxpool-9p-minmax-rvv-u1v.c
@@ -0,0 +1,147 @@
+// clang-format off
+// Auto-generated file. Do not edit!
+//   Template: src/f32-maxpool/rvv.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2026 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include "src/xnnpack/maxpool.h"
+#include <riscv_vector.h>
+
+
+void xnn_u8_maxpool_minmax_ukernel_9p__rvv_u1v(
+    size_t output_pixels,
+    size_t kernel_elements,
+    size_t channels,
+    const uint8_t** input,
+    size_t input_offset,
+    size_t input_pixel_stride,
+    uint8_t* output,
+    size_t input_increment,
+    size_t output_increment,
+    const struct xnn_u8_minmax_params* restrict params)
+{
+  assert(output_pixels != 0);
+  assert(kernel_elements != 0);
+  assert(channels != 0);
+
+  const uint8_t output_min = params->scalar.min;
+  const uint8_t output_max = params->scalar.max;
+  do {
+    const uint8_t** i = input;
+
+    // First pass: load the inputs, store the max pool in the output.
+    const uint8_t* i0 = *i++;
+    const uint8_t* i1 = 1 < kernel_elements ? *i++ : i0;
+    const uint8_t* i2 = 2 < kernel_elements ? *i++ : i0;
+    const uint8_t* i3 = 3 < kernel_elements ? *i++ : i0;
+    const uint8_t* i4 = 4 < kernel_elements ? *i++ : i0;
+    const uint8_t* i5 = 5 < kernel_elements ? *i++ : i0;
+    const uint8_t* i6 = 6 < kernel_elements ? *i++ : i0;
+    const uint8_t* i7 = 7 < kernel_elements ? *i++ : i0;
+    const uint8_t* i8 = 8 < kernel_elements ? *i++ : i0;
+    i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
+    i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
+    i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
+    i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
+    i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
+    i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
+    i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
+    i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
+    i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
+
+    uint8_t* o = output;
+    size_t c = channels;
+    do {
+      size_t vl = __riscv_vsetvl_e8m1(c);
+      vuint8m1_t vi0 = __riscv_vle8_v_u8m1(i0, vl); i0 += vl;
+      vuint8m1_t vi1 = __riscv_vle8_v_u8m1(i1, vl); i1 += vl;
+      vuint8m1_t vi2 = __riscv_vle8_v_u8m1(i2, vl); i2 += vl;
+      vuint8m1_t vi3 = __riscv_vle8_v_u8m1(i3, vl); i3 += vl;
+      vuint8m1_t vi4 = __riscv_vle8_v_u8m1(i4, vl); i4 += vl;
+      vuint8m1_t vi5 = __riscv_vle8_v_u8m1(i5, vl); i5 += vl;
+      vuint8m1_t vi6 = __riscv_vle8_v_u8m1(i6, vl); i6 += vl;
+      vuint8m1_t vi7 = __riscv_vle8_v_u8m1(i7, vl); i7 += vl;
+      vuint8m1_t vi8 = __riscv_vle8_v_u8m1(i8, vl); i8 += vl;
+
+      vuint8m1_t vmax01 = __riscv_vmaxu(vi0, vi1, vl);
+      vuint8m1_t vmax23 = __riscv_vmaxu(vi2, vi3, vl);
+      vuint8m1_t vmax45 = __riscv_vmaxu(vi4, vi5, vl);
+      vuint8m1_t vmax67 = __riscv_vmaxu(vi6, vi7, vl);
+      vuint8m1_t vmax018 = __riscv_vmaxu(vmax01, vi8, vl);
+
+      vuint8m1_t vmax2345 = __riscv_vmaxu(vmax23, vmax45, vl);
+      vuint8m1_t vmax01678 = __riscv_vmaxu(vmax67, vmax018, vl);
+      vuint8m1_t vacc = __riscv_vmaxu(vmax2345, vmax01678, vl);
+
+      vacc = __riscv_vmaxu(vacc, output_min, vl);
+      vacc = __riscv_vminu(vacc, output_max, vl);
+      __riscv_vse8_v_u8m1(o, vacc, vl); o += vl;
+
+      c -= vl;
+    } while (c != 0);
+
+    // Passes 1 - n: Max more inputs to the output.
+    for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 9) {
+      const uint8_t* i0 = *i++;
+      const uint8_t* i1 = 1 < k ? *i++ : i0;
+      const uint8_t* i2 = 2 < k ? *i++ : i0;
+      const uint8_t* i3 = 3 < k ? *i++ : i0;
+      const uint8_t* i4 = 4 < k ? *i++ : i0;
+      const uint8_t* i5 = 5 < k ? *i++ : i0;
+      const uint8_t* i6 = 6 < k ? *i++ : i0;
+      const uint8_t* i7 = 7 < k ? *i++ : i0;
+      const uint8_t* i8 = 8 < k ? *i++ : i0;
+      i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
+      i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
+      i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
+      i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
+      i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
+      i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
+      i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
+      i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
+      i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
+
+      o = output;
+      size_t c = channels;
+      do {
+        size_t vl = __riscv_vsetvl_e8m1(c);
+
+        vuint8m1_t vi0 = __riscv_vle8_v_u8m1(i0, vl); i0 += vl;
+        vuint8m1_t vi1 = __riscv_vle8_v_u8m1(i1, vl); i1 += vl;
+        vuint8m1_t vi2 = __riscv_vle8_v_u8m1(i2, vl); i2 += vl;
+        vuint8m1_t vi3 = __riscv_vle8_v_u8m1(i3, vl); i3 += vl;
+        vuint8m1_t vi4 = __riscv_vle8_v_u8m1(i4, vl); i4 += vl;
+        vuint8m1_t vi5 = __riscv_vle8_v_u8m1(i5, vl); i5 += vl;
+        vuint8m1_t vi6 = __riscv_vle8_v_u8m1(i6, vl); i6 += vl;
+        vuint8m1_t vi7 = __riscv_vle8_v_u8m1(i7, vl); i7 += vl;
+        vuint8m1_t vi8 = __riscv_vle8_v_u8m1(i8, vl); i8 += vl;
+
+        vuint8m1_t vprev = __riscv_vle8_v_u8m1(o, vl);
+
+        vuint8m1_t vmax01 = __riscv_vmaxu(vi0, vi1, vl);
+        vuint8m1_t vmax23 = __riscv_vmaxu(vi2, vi3, vl);
+        vuint8m1_t vmax45 = __riscv_vmaxu(vi4, vi5, vl);
+        vuint8m1_t vmax67 = __riscv_vmaxu(vi6, vi7, vl);
+        vuint8m1_t vmax018 = __riscv_vmaxu(vmax01, vi8, vl);
+
+        vuint8m1_t vmax2345 = __riscv_vmaxu(vmax23, vmax45, vl);
+        vuint8m1_t vmax01678 = __riscv_vmaxu(vmax67, vmax018, vl);
+        vuint8m1_t vmax012345678 = __riscv_vmaxu(vmax2345, vmax01678, vl);
+
+        vuint8m1_t vacc = __riscv_vmaxu(vprev, vmax012345678, vl);
+        vacc = __riscv_vminu(vacc, output_max, vl);
+        __riscv_vse8_v_u8m1(o, vacc, vl); o += vl;
+
+        c -= vl;
+      } while (c != 0);
+    }
+    input = (const uint8_t**) ((uintptr_t) input + input_increment);
+    input_offset += input_pixel_stride;
+    output = (uint8_t*) ((uintptr_t) output + output_increment);
+  } while (--output_pixels != 0);
+}
diff --git a/src/u8-maxpool/gen/u8-maxpool-9p-minmax-rvv-u2v.c b/src/u8-maxpool/gen/u8-maxpool-9p-minmax-rvv-u2v.c
new file mode 100644
index 00000000000..b1609fb0ef5
--- /dev/null
+++ b/src/u8-maxpool/gen/u8-maxpool-9p-minmax-rvv-u2v.c
@@ -0,0 +1,147 @@
+// clang-format off
+// Auto-generated file. Do not edit!
+//   Template: src/f32-maxpool/rvv.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2026 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include "src/xnnpack/maxpool.h"
+#include <riscv_vector.h>
+
+
+void xnn_u8_maxpool_minmax_ukernel_9p__rvv_u2v(
+    size_t output_pixels,
+    size_t kernel_elements,
+    size_t channels,
+    const uint8_t** input,
+    size_t input_offset,
+    size_t input_pixel_stride,
+    uint8_t* output,
+    size_t input_increment,
+    size_t output_increment,
+    const struct xnn_u8_minmax_params* restrict params)
+{
+  assert(output_pixels != 0);
+  assert(kernel_elements != 0);
+  assert(channels != 0);
+
+  const uint8_t output_min = params->scalar.min;
+  const uint8_t output_max = params->scalar.max;
+  do {
+    const uint8_t** i = input;
+
+    // First pass: load the inputs, store the max pool in the output.
+    const uint8_t* i0 = *i++;
+    const uint8_t* i1 = 1 < kernel_elements ? *i++ : i0;
+    const uint8_t* i2 = 2 < kernel_elements ? *i++ : i0;
+    const uint8_t* i3 = 3 < kernel_elements ? *i++ : i0;
+    const uint8_t* i4 = 4 < kernel_elements ? *i++ : i0;
+    const uint8_t* i5 = 5 < kernel_elements ? *i++ : i0;
+    const uint8_t* i6 = 6 < kernel_elements ? *i++ : i0;
+    const uint8_t* i7 = 7 < kernel_elements ? *i++ : i0;
+    const uint8_t* i8 = 8 < kernel_elements ? *i++ : i0;
+    i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
+    i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
+    i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
+    i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
+    i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
+    i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
+    i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
+    i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
+    i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
+
+    uint8_t* o = output;
+    size_t c = channels;
+    do {
+      size_t vl = __riscv_vsetvl_e8m2(c);
+      vuint8m2_t vi0 = __riscv_vle8_v_u8m2(i0, vl); i0 += vl;
+      vuint8m2_t vi1 = __riscv_vle8_v_u8m2(i1, vl); i1 += vl;
+      vuint8m2_t vi2 = __riscv_vle8_v_u8m2(i2, vl); i2 += vl;
+      vuint8m2_t vi3 = __riscv_vle8_v_u8m2(i3, vl); i3 += vl;
+      vuint8m2_t vi4 = __riscv_vle8_v_u8m2(i4, vl); i4 += vl;
+      vuint8m2_t vi5 = __riscv_vle8_v_u8m2(i5, vl); i5 += vl;
+      vuint8m2_t vi6 = __riscv_vle8_v_u8m2(i6, vl); i6 += vl;
+      vuint8m2_t vi7 = __riscv_vle8_v_u8m2(i7, vl); i7 += vl;
+      vuint8m2_t vi8 = __riscv_vle8_v_u8m2(i8, vl); i8 += vl;
+
+      vuint8m2_t vmax01 = __riscv_vmaxu(vi0, vi1, vl);
+      vuint8m2_t vmax23 = __riscv_vmaxu(vi2, vi3, vl);
+      vuint8m2_t vmax45 = __riscv_vmaxu(vi4, vi5, vl);
+      vuint8m2_t vmax67 = __riscv_vmaxu(vi6, vi7, vl);
+      vuint8m2_t vmax018 = __riscv_vmaxu(vmax01, vi8, vl);
+
+      vuint8m2_t vmax2345 = __riscv_vmaxu(vmax23, vmax45, vl);
+      vuint8m2_t vmax01678 = __riscv_vmaxu(vmax67, vmax018, vl);
+      vuint8m2_t vacc = __riscv_vmaxu(vmax2345, vmax01678, vl);
+
+      vacc = __riscv_vmaxu(vacc, output_min, vl);
+      vacc = __riscv_vminu(vacc, output_max, vl);
+      __riscv_vse8_v_u8m2(o, vacc, vl); o += vl;
+
+      c -= vl;
+    } while (c != 0);
+
+    // Passes 1 - n: Max more inputs to the output.
+    for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 9) {
+      const uint8_t* i0 = *i++;
+      const uint8_t* i1 = 1 < k ? *i++ : i0;
+      const uint8_t* i2 = 2 < k ? *i++ : i0;
+      const uint8_t* i3 = 3 < k ? *i++ : i0;
+      const uint8_t* i4 = 4 < k ? *i++ : i0;
+      const uint8_t* i5 = 5 < k ? *i++ : i0;
+      const uint8_t* i6 = 6 < k ? *i++ : i0;
+      const uint8_t* i7 = 7 < k ? *i++ : i0;
+      const uint8_t* i8 = 8 < k ? *i++ : i0;
+      i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
+      i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
+      i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
+      i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
+      i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
+      i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
+      i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
+      i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
+      i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
+
+      o = output;
+      size_t c = channels;
+      do {
+        size_t vl = __riscv_vsetvl_e8m2(c);
+
+        vuint8m2_t vi0 = __riscv_vle8_v_u8m2(i0, vl); i0 += vl;
+        vuint8m2_t vi1 = __riscv_vle8_v_u8m2(i1, vl); i1 += vl;
+        vuint8m2_t vi2 = __riscv_vle8_v_u8m2(i2, vl); i2 += vl;
+        vuint8m2_t vi3 = __riscv_vle8_v_u8m2(i3, vl); i3 += vl;
+        vuint8m2_t vi4 = __riscv_vle8_v_u8m2(i4, vl); i4 += vl;
+        vuint8m2_t vi5 = __riscv_vle8_v_u8m2(i5, vl); i5 += vl;
+        vuint8m2_t vi6 = __riscv_vle8_v_u8m2(i6, vl); i6 += vl;
+        vuint8m2_t vi7 = __riscv_vle8_v_u8m2(i7, vl); i7 += vl;
+        vuint8m2_t vi8 = __riscv_vle8_v_u8m2(i8, vl); i8 += vl;
+
+        vuint8m2_t vprev = __riscv_vle8_v_u8m2(o, vl);
+
+        vuint8m2_t vmax01 = __riscv_vmaxu(vi0, vi1, vl);
+        vuint8m2_t vmax23 = __riscv_vmaxu(vi2, vi3, vl);
+        vuint8m2_t vmax45 = __riscv_vmaxu(vi4, vi5, vl);
+        vuint8m2_t vmax67 = __riscv_vmaxu(vi6, vi7, vl);
+        vuint8m2_t vmax018 = __riscv_vmaxu(vmax01, vi8, vl);
+
+        vuint8m2_t vmax2345 = __riscv_vmaxu(vmax23, vmax45, vl);
+        vuint8m2_t vmax01678 = __riscv_vmaxu(vmax67, vmax018, vl);
+        vuint8m2_t vmax012345678 = __riscv_vmaxu(vmax2345, vmax01678, vl);
+
+        vuint8m2_t vacc = __riscv_vmaxu(vprev, vmax012345678, vl);
+        vacc = __riscv_vminu(vacc, output_max, vl);
+        __riscv_vse8_v_u8m2(o, vacc, vl); o += vl;
+
+        c -= vl;
+      } while (c != 0);
+    }
+    input = (const uint8_t**) ((uintptr_t) input + input_increment);
+    input_offset += input_pixel_stride;
+    output = (uint8_t*) ((uintptr_t) output + output_increment);
+  } while (--output_pixels != 0);
+}
diff --git a/src/u8-maxpool/u8-maxpool-minmax.inc b/src/u8-maxpool/u8-maxpool-minmax.inc
index 94e651a04ea..f090da6c6c4 100644
--- a/src/u8-maxpool/u8-maxpool-minmax.inc
+++ b/src/u8-maxpool/u8-maxpool-minmax.inc
@@ -17,6 +17,9 @@ XNN_UKERNEL(xnn_arch_x86_sse2, xnn_u8_maxpool_minmax_ukernel_9p__sse2_u16, 16, 9
 XNN_UKERNEL(xnn_arch_none, xnn_u8_maxpool_minmax_ukernel_9p__wasmsimd_u16, 16, 9, uint8_t, struct xnn_u8_minmax_params, xnn_init_u8_minmax_scalar_params)
 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
 
-XNN_UKERNEL(xnn_arch_none, xnn_u8_maxpool_minmax_ukernel_9p__scalar_u1, 1, 9, uint8_t, struct xnn_u8_minmax_params, xnn_init_u8_minmax_scalar_params)
-
+#if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR
+XNN_UKERNEL(xnn_arch_riscv_vector, xnn_u8_maxpool_minmax_ukernel_9p__rvv_u1v, (1*xnn_init_hardware_config()->vlenb/sizeof(uint8_t)), 9, uint8_t, struct xnn_u8_minmax_params, xnn_init_u8_minmax_scalar_params)
+XNN_UKERNEL(xnn_arch_riscv_vector, xnn_u8_maxpool_minmax_ukernel_9p__rvv_u2v, (2*xnn_init_hardware_config()->vlenb/sizeof(uint8_t)), 9, uint8_t, struct xnn_u8_minmax_params, xnn_init_u8_minmax_scalar_params)
+#endif  // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR
 
+XNN_UKERNEL(xnn_arch_none, xnn_u8_maxpool_minmax_ukernel_9p__scalar_u1, 1, 9, uint8_t, struct xnn_u8_minmax_params, xnn_init_u8_minmax_scalar_params)

From afaabc20c8ce4df867fa95ce6eb2a5027fc4b301 Mon Sep 17 00:00:00 2001
From: Ken Unger <ken.j.unger@gmail.com>
Date: Mon, 2 Mar 2026 22:59:20 -0800
Subject: [PATCH 2/2] cleanup rvv maxpool, add rvv avgpool

---
 cmake/gen/rvv_microkernels.cmake              |   4 +-
 cmake/gen/rvvfp16arith_microkernels.cmake     |   2 +-
 gen/rvv_microkernels.bzl                      |   4 +-
 gen/rvvfp16arith_microkernels.bzl             |   2 +-
 scripts/generate-f16-avgpool.sh               |   3 +
 scripts/generate-f16-maxpool.sh               |   1 -
 scripts/generate-f32-avgpool.sh               |   3 +
 scripts/generate-f32-maxpool.sh               |   1 -
 scripts/generate-s8-maxpool.sh                |   1 -
 scripts/generate-u8-maxpool.sh                |   1 -
 src/configs/avgpool-config.c                  |  17 ++
 src/f16-avgpool/f16-avgpool-minmax.inc        |   4 +
 .../f16-avgpool-9p-minmax-rvvfp16arith-u2v.c  | 216 ++++++++++++++++++
 src/f16-maxpool/f16-maxpool-minmax.inc        |   1 -
 .../f16-maxpool-9p-minmax-rvvfp16arith-u1v.c  | 147 ------------
 src/f32-avgpool/f32-avgpool-minmax.inc        |   4 +
 .../gen/f32-avgpool-9p-minmax-rvv-u2v.c       | 216 ++++++++++++++++++
 src/f32-avgpool/rvv.c.in                      | 127 ++++++++++
 src/f32-maxpool/f32-maxpool-minmax.inc        |   1 -
 .../gen/f32-maxpool-9p-minmax-rvv-u1v.c       | 147 ------------
 .../gen/s8-maxpool-9p-minmax-rvv-u1v.c        | 147 ------------
 src/s8-maxpool/s8-maxpool-minmax.inc          |   1 -
 .../gen/u8-maxpool-9p-minmax-rvv-u1v.c        | 147 ------------
 src/u8-maxpool/u8-maxpool-minmax.inc          |   1 -
 24 files changed, 594 insertions(+), 604 deletions(-)
 create mode 100644 src/f16-avgpool/gen/f16-avgpool-9p-minmax-rvvfp16arith-u2v.c
 delete mode 100644 src/f16-maxpool/gen/f16-maxpool-9p-minmax-rvvfp16arith-u1v.c
 create mode 100644 src/f32-avgpool/gen/f32-avgpool-9p-minmax-rvv-u2v.c
 create mode 100644 src/f32-avgpool/rvv.c.in
 delete mode 100644 src/f32-maxpool/gen/f32-maxpool-9p-minmax-rvv-u1v.c
 delete mode 100644 src/s8-maxpool/gen/s8-maxpool-9p-minmax-rvv-u1v.c
 delete mode 100644 src/u8-maxpool/gen/u8-maxpool-9p-minmax-rvv-u1v.c

diff --git a/cmake/gen/rvv_microkernels.cmake b/cmake/gen/rvv_microkernels.cmake
index 04d5e143861..7307f77ddeb 100644
--- a/cmake/gen/rvv_microkernels.cmake
+++ b/cmake/gen/rvv_microkernels.cmake
@@ -11,6 +11,7 @@
 
 SET(PROD_RVV_MICROKERNEL_SRCS
   src/f32-argmaxpool/f32-argmaxpool-9p8x-rvv-u1v.c
+  src/f32-avgpool/gen/f32-avgpool-9p-minmax-rvv-u2v.c
   src/f32-conv-hwc2chw/f32-conv-hwc2chw-3x3s2p1c3x2v-rvv-2x2.c
   src/f32-dwconv/gen/f32-dwconv-3p8vc-minmax-rvv.c
   src/f32-dwconv/gen/f32-dwconv-3p8vc-rvv.c
@@ -138,7 +139,6 @@ SET(NON_PROD_RVV_MICROKERNEL_SRCS
   src/f32-igemm/gen/f32-igemm-1x4v-rvv.c
   src/f32-igemm/gen/f32-igemm-7x4v-relu-rvv.c
   src/f32-igemm/gen/f32-igemm-7x4v-rvv.c
-  src/f32-maxpool/gen/f32-maxpool-9p-minmax-rvv-u1v.c
   src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u1v.c
   src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u4v.c
   src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u8v.c
@@ -260,11 +260,9 @@ SET(NON_PROD_RVV_MICROKERNEL_SRCS
   src/qu8-vlrelu/gen/qu8-vlrelu-rvv-u1v.c
   src/qu8-vmul/gen/qu8-vmul-minmax-f32-rvv-u1v.c
   src/qu8-vmulc/gen/qu8-vmulc-minmax-f32-rvv-u1v.c
-  src/s8-maxpool/gen/s8-maxpool-9p-minmax-rvv-u1v.c
   src/s8-vclamp/gen/s8-vclamp-rvv-u1v.c
   src/s8-vclamp/gen/s8-vclamp-rvv-u2v.c
   src/s8-vclamp/gen/s8-vclamp-rvv-u8v.c
-  src/u8-maxpool/gen/u8-maxpool-9p-minmax-rvv-u1v.c
   src/u8-vclamp/gen/u8-vclamp-rvv-u1v.c
   src/u8-vclamp/gen/u8-vclamp-rvv-u2v.c
   src/u8-vclamp/gen/u8-vclamp-rvv-u8v.c
diff --git a/cmake/gen/rvvfp16arith_microkernels.cmake b/cmake/gen/rvvfp16arith_microkernels.cmake
index ac5f45c7724..1780d593920 100644
--- a/cmake/gen/rvvfp16arith_microkernels.cmake
+++ b/cmake/gen/rvvfp16arith_microkernels.cmake
@@ -10,6 +10,7 @@
 
 
 SET(PROD_RVVFP16ARITH_MICROKERNEL_SRCS
+  src/f16-avgpool/gen/f16-avgpool-9p-minmax-rvvfp16arith-u2v.c
   src/f16-dwconv/gen/f16-dwconv-3p8vc-minmax-rvvfp16arith.c
   src/f16-dwconv/gen/f16-dwconv-4p8vc-minmax-rvvfp16arith.c
   src/f16-dwconv/gen/f16-dwconv-9p8vc-minmax-rvvfp16arith.c
@@ -55,7 +56,6 @@ SET(NON_PROD_RVVFP16ARITH_MICROKERNEL_SRCS
   src/f16-f32-vcvt/gen/f16-f32-vcvt-rvvfp16arith-u2v.c
   src/f16-gemm/gen/f16-gemm-4x4v-minmax-rvvfp16arith.c
   src/f16-igemm/gen/f16-igemm-4x4v-minmax-rvvfp16arith.c
-  src/f16-maxpool/gen/f16-maxpool-9p-minmax-rvvfp16arith-u1v.c
   src/f16-spmm/gen/f16-spmm-1vx1-minmax-rvvfp16arith.c
   src/f16-spmm/gen/f16-spmm-2vx1-minmax-rvvfp16arith.c
   src/f16-spmm/gen/f16-spmm-4vx1-minmax-rvvfp16arith.c
diff --git a/gen/rvv_microkernels.bzl b/gen/rvv_microkernels.bzl
index fb7468b48b3..335ed2f7d4a 100644
--- a/gen/rvv_microkernels.bzl
+++ b/gen/rvv_microkernels.bzl
@@ -7,6 +7,7 @@
 
 PROD_RVV_MICROKERNEL_SRCS = [
     "src/f32-argmaxpool/f32-argmaxpool-9p8x-rvv-u1v.c",
+    "src/f32-avgpool/gen/f32-avgpool-9p-minmax-rvv-u2v.c",
     "src/f32-conv-hwc2chw/f32-conv-hwc2chw-3x3s2p1c3x2v-rvv-2x2.c",
     "src/f32-dwconv/gen/f32-dwconv-3p8vc-minmax-rvv.c",
     "src/f32-dwconv/gen/f32-dwconv-3p8vc-rvv.c",
@@ -135,7 +136,6 @@ NON_PROD_RVV_MICROKERNEL_SRCS = [
     "src/f32-igemm/gen/f32-igemm-1x4v-rvv.c",
     "src/f32-igemm/gen/f32-igemm-7x4v-relu-rvv.c",
     "src/f32-igemm/gen/f32-igemm-7x4v-rvv.c",
-    "src/f32-maxpool/gen/f32-maxpool-9p-minmax-rvv-u1v.c",
     "src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u1v.c",
     "src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u4v.c",
     "src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u8v.c",
@@ -257,11 +257,9 @@ NON_PROD_RVV_MICROKERNEL_SRCS = [
     "src/qu8-vlrelu/gen/qu8-vlrelu-rvv-u1v.c",
     "src/qu8-vmul/gen/qu8-vmul-minmax-f32-rvv-u1v.c",
     "src/qu8-vmulc/gen/qu8-vmulc-minmax-f32-rvv-u1v.c",
-    "src/s8-maxpool/gen/s8-maxpool-9p-minmax-rvv-u1v.c",
     "src/s8-vclamp/gen/s8-vclamp-rvv-u1v.c",
     "src/s8-vclamp/gen/s8-vclamp-rvv-u2v.c",
     "src/s8-vclamp/gen/s8-vclamp-rvv-u8v.c",
-    "src/u8-maxpool/gen/u8-maxpool-9p-minmax-rvv-u1v.c",
     "src/u8-vclamp/gen/u8-vclamp-rvv-u1v.c",
     "src/u8-vclamp/gen/u8-vclamp-rvv-u2v.c",
     "src/u8-vclamp/gen/u8-vclamp-rvv-u8v.c",
diff --git a/gen/rvvfp16arith_microkernels.bzl b/gen/rvvfp16arith_microkernels.bzl
index acbae4d7c04..c007293ecb4 100644
--- a/gen/rvvfp16arith_microkernels.bzl
+++ b/gen/rvvfp16arith_microkernels.bzl
@@ -6,6 +6,7 @@
 #
 
 PROD_RVVFP16ARITH_MICROKERNEL_SRCS = [
+    "src/f16-avgpool/gen/f16-avgpool-9p-minmax-rvvfp16arith-u2v.c",
     "src/f16-dwconv/gen/f16-dwconv-3p8vc-minmax-rvvfp16arith.c",
     "src/f16-dwconv/gen/f16-dwconv-4p8vc-minmax-rvvfp16arith.c",
     "src/f16-dwconv/gen/f16-dwconv-9p8vc-minmax-rvvfp16arith.c",
@@ -52,7 +53,6 @@ NON_PROD_RVVFP16ARITH_MICROKERNEL_SRCS = [
     "src/f16-f32-vcvt/gen/f16-f32-vcvt-rvvfp16arith-u2v.c",
     "src/f16-gemm/gen/f16-gemm-4x4v-minmax-rvvfp16arith.c",
     "src/f16-igemm/gen/f16-igemm-4x4v-minmax-rvvfp16arith.c",
-    "src/f16-maxpool/gen/f16-maxpool-9p-minmax-rvvfp16arith-u1v.c",
     "src/f16-spmm/gen/f16-spmm-1vx1-minmax-rvvfp16arith.c",
     "src/f16-spmm/gen/f16-spmm-2vx1-minmax-rvvfp16arith.c",
     "src/f16-spmm/gen/f16-spmm-4vx1-minmax-rvvfp16arith.c",
diff --git a/scripts/generate-f16-avgpool.sh b/scripts/generate-f16-avgpool.sh
index b9bf469fc21..32fcaeb0207 100755
--- a/scripts/generate-f16-avgpool.sh
+++ b/scripts/generate-f16-avgpool.sh
@@ -10,4 +10,7 @@ tools/xngen src/f32-avgpool/avgpool.c.in -D ARCH=neonfp16arith -D DATATYPE=f16 -
 ##################################### f16c #####################################
 tools/xngen src/f16-avgpool/f16c.c.in -D SIMD_SIZE=8 -o src/f16-avgpool/gen/f16-avgpool-9p-minmax-f16c.c &
 
+################################ RISC-V Vector #################################
+tools/xngen src/f32-avgpool/rvv.c.in -D DATATYPE=f16 -D LMUL=2 -o src/f16-avgpool/gen/f16-avgpool-9p-minmax-rvvfp16arith-u2v.c &
+
 wait
diff --git a/scripts/generate-f16-maxpool.sh b/scripts/generate-f16-maxpool.sh
index 9bfb490c2cc..0c3b041ea09 100755
--- a/scripts/generate-f16-maxpool.sh
+++ b/scripts/generate-f16-maxpool.sh
@@ -10,7 +10,6 @@ tools/xngen src/f32-maxpool/maxpool.c.in -D DATATYPE=f16 -D ARCH=avx2 -D SIMD_SI
 tools/xngen src/f32-maxpool/maxpool.c.in -D DATATYPE=f16 -D ARCH=sse41 -D SIMD_SIZE=8 -o src/f16-maxpool/gen/f16-maxpool-9p-minmax-sse41-u8.c &
 
 ################################ RISC-V Vector ################################
-tools/xngen src/f32-maxpool/rvv.c.in -D DATATYPE=f16 -D LMUL=1 -o src/f16-maxpool/gen/f16-maxpool-9p-minmax-rvvfp16arith-u1v.c &
 tools/xngen src/f32-maxpool/rvv.c.in -D DATATYPE=f16 -D LMUL=2 -o src/f16-maxpool/gen/f16-maxpool-9p-minmax-rvvfp16arith-u2v.c &
 
 wait
diff --git a/scripts/generate-f32-avgpool.sh b/scripts/generate-f32-avgpool.sh
index c59e51ddc8d..454f4957e77 100755
--- a/scripts/generate-f32-avgpool.sh
+++ b/scripts/generate-f32-avgpool.sh
@@ -13,4 +13,7 @@ tools/xngen src/f32-avgpool/avgpool.c.in -D ARCH=avx      -D DATATYPE=f32 -D SIM
 tools/xngen src/f32-avgpool/avgpool.c.in -D ARCH=avx512f  -D DATATYPE=f32 -D SIMD_SIZE=16 -o src/f32-avgpool/gen/f32-avgpool-9p-minmax-avx512f-u16.c &
 tools/xngen src/f32-avgpool/avgpool.c.in -D ARCH=hvx      -D DATATYPE=f32 -D SIMD_SIZE=32 -o src/f32-avgpool/gen/f32-avgpool-9p-minmax-hvx-u32.c &
 
+################################ RISC-V Vector ################################
+tools/xngen src/f32-avgpool/rvv.c.in -D DATATYPE=f32 -D LMUL=2 -o src/f32-avgpool/gen/f32-avgpool-9p-minmax-rvv-u2v.c &
+
 wait
diff --git a/scripts/generate-f32-maxpool.sh b/scripts/generate-f32-maxpool.sh
index 4c15f633750..322f3151ec9 100755
--- a/scripts/generate-f32-maxpool.sh
+++ b/scripts/generate-f32-maxpool.sh
@@ -12,7 +12,6 @@ tools/xngen src/f32-maxpool/maxpool.c.in -D DATATYPE=f32 -D ARCH=neon     -D SIM
 tools/xngen src/f32-maxpool/maxpool.c.in -D DATATYPE=f32 -D ARCH=hvx      -D SIMD_SIZE=32 -o src/f32-maxpool/gen/f32-maxpool-9p-minmax-hvx-u32.c &
 
 ################################ RISC-V Vector ################################
-tools/xngen src/f32-maxpool/rvv.c.in -D DATATYPE=f32 -D LMUL=1 -o src/f32-maxpool/gen/f32-maxpool-9p-minmax-rvv-u1v.c &
 tools/xngen src/f32-maxpool/rvv.c.in -D DATATYPE=f32 -D LMUL=2 -o src/f32-maxpool/gen/f32-maxpool-9p-minmax-rvv-u2v.c &
 
 wait
diff --git a/scripts/generate-s8-maxpool.sh b/scripts/generate-s8-maxpool.sh
index 6b379899ffd..84c32a10663 100755
--- a/scripts/generate-s8-maxpool.sh
+++ b/scripts/generate-s8-maxpool.sh
@@ -11,7 +11,6 @@ tools/xngen src/f32-maxpool/maxpool.c.in -D DATATYPE=s8 -D ARCH=wasmsimd -D SIMD
 tools/xngen src/f32-maxpool/maxpool.c.in -D DATATYPE=s8 -D ARCH=neon -D SIMD_SIZE=16 -o src/s8-maxpool/gen/s8-maxpool-9p-minmax-neon-u16.c &
 
 ################################ RISC-V Vector #################################
-tools/xngen src/f32-maxpool/rvv.c.in -D DATATYPE=s8 -D LMUL=1 -o src/s8-maxpool/gen/s8-maxpool-9p-minmax-rvv-u1v.c &
 tools/xngen src/f32-maxpool/rvv.c.in -D DATATYPE=s8 -D LMUL=2 -o src/s8-maxpool/gen/s8-maxpool-9p-minmax-rvv-u2v.c &
 
 wait
diff --git a/scripts/generate-u8-maxpool.sh b/scripts/generate-u8-maxpool.sh
index aa1b1f66faf..6a52b039fa9 100755
--- a/scripts/generate-u8-maxpool.sh
+++ b/scripts/generate-u8-maxpool.sh
@@ -11,7 +11,6 @@ tools/xngen src/f32-maxpool/maxpool.c.in -D DATATYPE=u8 -D KERNEL_TILE=9 -D ARCH
 tools/xngen src/f32-maxpool/maxpool.c.in -D DATATYPE=u8 -D KERNEL_TILE=9 -D ARCH=neon -D SIMD_SIZE=16 -o src/u8-maxpool/gen/u8-maxpool-9p-minmax-neon-u16.c &
 
 ################################ RISC-V Vector #################################
-tools/xngen src/f32-maxpool/rvv.c.in -D DATATYPE=u8 -D LMUL=1 -o src/u8-maxpool/gen/u8-maxpool-9p-minmax-rvv-u1v.c &
 tools/xngen src/f32-maxpool/rvv.c.in -D DATATYPE=u8 -D LMUL=2 -o src/u8-maxpool/gen/u8-maxpool-9p-minmax-rvv-u2v.c &
 
 wait
diff --git a/src/configs/avgpool-config.c b/src/configs/avgpool-config.c
index 79bca26a612..40493569ec0 100644
--- a/src/configs/avgpool-config.c
+++ b/src/configs/avgpool-config.c
@@ -51,6 +51,16 @@ static void init_f16_avgpool_config(void) {
       } else
     #endif
     ;  // no f16 support
+  #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR
+    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
+    assert(hardware_config != NULL);
+    (void) hardware_config;  // May be unused.
+    if (hardware_config->arch_flags & xnn_arch_riscv_vector_fp16_arith) {
+      f16_avgpool_config.ukernel = XNN_INIT_AVGPOOL_UKERNEL(xnn_f16_avgpool_minmax_ukernel_9p__rvvfp16arith_u2v);
+      f16_avgpool_config.init.f16 = xnn_init_f16_scaleminmax_scalar_params;
+      f16_avgpool_config.primary_tile = 9;
+      f16_avgpool_config.channel_tile = 2 * hardware_config->vlenb / sizeof(xnn_float16);
+    }
   #endif
 }
 
@@ -123,6 +133,13 @@ static void init_f32_avgpool_config(void) {
       f32_avgpool_config.primary_tile = 9;
       f32_avgpool_config.channel_tile = 32;
     }
+  #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR
+    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
+    assert(hardware_config != NULL);
+    f32_avgpool_config.ukernel = XNN_INIT_AVGPOOL_UKERNEL(xnn_f32_avgpool_minmax_ukernel_9p__rvv_u2v);
+    f32_avgpool_config.init.f32 = xnn_init_f32_scaleminmax_scalar_params;
+    f32_avgpool_config.primary_tile = 9;
+    f32_avgpool_config.channel_tile = 2 * hardware_config->vlenb / sizeof(float);
   #else
     f32_avgpool_config.ukernel = XNN_INIT_AVGPOOL_UKERNEL(xnn_f32_avgpool_minmax_ukernel_9p__scalar_u1);
     f32_avgpool_config.init.f32 = xnn_init_f32_scaleminmax_scalar_params;
diff --git a/src/f16-avgpool/f16-avgpool-minmax.inc b/src/f16-avgpool/f16-avgpool-minmax.inc
index c099013791e..cba9b2d1370 100644
--- a/src/f16-avgpool/f16-avgpool-minmax.inc
+++ b/src/f16-avgpool/f16-avgpool-minmax.inc
@@ -14,3 +14,7 @@ XNN_UKERNEL(xnn_arch_arm_neon_fp16_arith, xnn_f16_avgpool_minmax_ukernel_9p__neo
 #if XNN_ENABLE_F16C && (XNN_ARCH_X86 || XNN_ARCH_X86_64)
 XNN_UKERNEL(xnn_arch_x86_f16c, xnn_f16_avgpool_minmax_ukernel_9p__f16c_u8, 8, 9, xnn_float16, struct xnn_f16_scaleminmax_params, xnn_init_f16_scaleminmax_scalar_params)
 #endif  // XNN_ENABLE_F16C && (XNN_ARCH_X86 || XNN_ARCH_X86_64)
+
+#if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR
+XNN_UKERNEL(xnn_arch_riscv_vector_fp16_arith, xnn_f16_avgpool_minmax_ukernel_9p__rvvfp16arith_u2v, (2*xnn_init_hardware_config()->vlenb/sizeof(xnn_float16)), 9, xnn_float16, struct xnn_f16_scaleminmax_params, xnn_init_f16_scaleminmax_scalar_params)
+#endif  // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR
diff --git a/src/f16-avgpool/gen/f16-avgpool-9p-minmax-rvvfp16arith-u2v.c b/src/f16-avgpool/gen/f16-avgpool-9p-minmax-rvvfp16arith-u2v.c
new file mode 100644
index 00000000000..57b68030e69
--- /dev/null
+++ b/src/f16-avgpool/gen/f16-avgpool-9p-minmax-rvvfp16arith-u2v.c
@@ -0,0 +1,216 @@
+// clang-format off
+// Auto-generated file. Do not edit!
+//   Template: src/f32-avgpool/rvv.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2026 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include "src/xnnpack/maxpool.h"
+#include <riscv_vector.h>
+
+
+void xnn_f16_avgpool_minmax_ukernel_9p__rvvfp16arith_u2v(
+    size_t output_pixels,
+    size_t kernel_elements,
+    size_t channels,
+    const xnn_float16** input,
+    size_t input_offset,
+    size_t input_pixel_stride,
+    const xnn_float16* zero,
+    const xnn_float16* multiplier,
+    xnn_float16* output,
+    size_t input_increment,
+    size_t output_increment,
+    const struct xnn_f16_scaleminmax_params* restrict params)
+{
+  assert(output_pixels != 0);
+  assert(kernel_elements != 0);
+  assert(channels != 0);
+
+  const xnn_float16 min = params->scalar.min;
+  const xnn_float16 max = params->scalar.max;
+  xnn_float16 scale = params->scalar.scale;
+
+  do {
+    // Start with the previous output as the zero buffer.
+    const xnn_float16* prev_output = zero;
+
+    const xnn_float16** i = input;
+
+    // Passes 0 - n-1: load the output, add 9 inputs.
+    size_t k = kernel_elements;
+    for (; k > 9; k -= 9) {
+      const xnn_float16* i0 = *i++;
+      assert(i0 != NULL);
+      if XNN_UNPREDICTABLE(i0 != zero) {
+        i0 = (const xnn_float16*) ((uintptr_t) i0 + input_offset);
+      }
+      const xnn_float16* i1 = *i++;
+      assert(i1 != NULL);
+      if XNN_UNPREDICTABLE(i1 != zero) {
+        i1 = (const xnn_float16*) ((uintptr_t) i1 + input_offset);
+      }
+      const xnn_float16* i2 = *i++;
+      assert(i2 != NULL);
+      if XNN_UNPREDICTABLE(i2 != zero) {
+        i2 = (const xnn_float16*) ((uintptr_t) i2 + input_offset);
+      }
+      const xnn_float16* i3 = *i++;
+      assert(i3 != NULL);
+      if XNN_UNPREDICTABLE(i3 != zero) {
+        i3 = (const xnn_float16*) ((uintptr_t) i3 + input_offset);
+      }
+      const xnn_float16* i4 = *i++;
+      assert(i4 != NULL);
+      if XNN_UNPREDICTABLE(i4 != zero) {
+        i4 = (const xnn_float16*) ((uintptr_t) i4 + input_offset);
+      }
+      const xnn_float16* i5 = *i++;
+      assert(i5 != NULL);
+      if XNN_UNPREDICTABLE(i5 != zero) {
+        i5 = (const xnn_float16*) ((uintptr_t) i5 + input_offset);
+      }
+      const xnn_float16* i6 = *i++;
+      assert(i6 != NULL);
+      if XNN_UNPREDICTABLE(i6 != zero) {
+        i6 = (const xnn_float16*) ((uintptr_t) i6 + input_offset);
+      }
+      const xnn_float16* i7 = *i++;
+      assert(i7 != NULL);
+      if XNN_UNPREDICTABLE(i7 != zero) {
+        i7 = (const xnn_float16*) ((uintptr_t) i7 + input_offset);
+      }
+      const xnn_float16* i8 = *i++;
+      assert(i8 != NULL);
+      if XNN_UNPREDICTABLE(i8 != zero) {
+        i8 = (const xnn_float16*) ((uintptr_t) i8 + input_offset);
+      }
+
+      xnn_float16* o = output;
+      size_t c = channels;
+      do {
+        size_t vl = __riscv_vsetvl_e16m2(c);
+        vfloat16m2_t vi0 = __riscv_vle16_v_f16m2(i0, vl); i0 += vl;
+        vfloat16m2_t vi1 = __riscv_vle16_v_f16m2(i1, vl); i1 += vl;
+        vfloat16m2_t vi2 = __riscv_vle16_v_f16m2(i2, vl); i2 += vl;
+        vfloat16m2_t vi3 = __riscv_vle16_v_f16m2(i3, vl); i3 += vl;
+        vfloat16m2_t vi4 = __riscv_vle16_v_f16m2(i4, vl); i4 += vl;
+        vfloat16m2_t vi5 = __riscv_vle16_v_f16m2(i5, vl); i5 += vl;
+        vfloat16m2_t vi6 = __riscv_vle16_v_f16m2(i6, vl); i6 += vl;
+        vfloat16m2_t vi7 = __riscv_vle16_v_f16m2(i7, vl); i7 += vl;
+        vfloat16m2_t vi8 = __riscv_vle16_v_f16m2(i8, vl); i8 += vl;
+        vfloat16m2_t vprev = __riscv_vle16_v_f16m2(prev_output, vl); prev_output += vl;
+
+        vfloat16m2_t vsum01 = __riscv_vfadd(vi0, vi1, vl);
+        vfloat16m2_t vsum23 = __riscv_vfadd(vi2, vi3, vl);
+        vfloat16m2_t vsum45 = __riscv_vfadd(vi4, vi5, vl);
+        vfloat16m2_t vsum67 = __riscv_vfadd(vi6, vi7, vl);
+        vfloat16m2_t vsum018 = __riscv_vfadd(vsum01, vi8, vl);
+
+        vfloat16m2_t vsum2345 = __riscv_vfadd(vsum23, vsum45, vl);
+        vfloat16m2_t vsum01678 = __riscv_vfadd(vsum67, vsum018, vl);
+        vfloat16m2_t vsum012345678 = __riscv_vfadd(vsum2345, vsum01678, vl);
+        vfloat16m2_t vacc = __riscv_vfadd(vprev, vsum012345678, vl);
+        __riscv_vse16_v_f16m2(o, vacc, vl); o += vl;
+
+        c -= vl;
+      } while (c != 0);
+
+      // Subsequent passes read from the previous output.
+      prev_output = output;
+    }
+
+    // Final pass: load the output, add remaining kernel elements, apply scaling/min/max
+    const xnn_float16* i0 = 0 < k ? *i++ : zero;
+    assert(i0 != NULL);
+    if XNN_UNPREDICTABLE(i0 != zero) {
+      i0 = (const xnn_float16*) ((uintptr_t) i0 + input_offset);
+    }
+    const xnn_float16* i1 = 1 < k ? *i++ : zero;
+    assert(i1 != NULL);
+    if XNN_UNPREDICTABLE(i1 != zero) {
+      i1 = (const xnn_float16*) ((uintptr_t) i1 + input_offset);
+    }
+    const xnn_float16* i2 = 2 < k ? *i++ : zero;
+    assert(i2 != NULL);
+    if XNN_UNPREDICTABLE(i2 != zero) {
+      i2 = (const xnn_float16*) ((uintptr_t) i2 + input_offset);
+    }
+    const xnn_float16* i3 = 3 < k ? *i++ : zero;
+    assert(i3 != NULL);
+    if XNN_UNPREDICTABLE(i3 != zero) {
+      i3 = (const xnn_float16*) ((uintptr_t) i3 + input_offset);
+    }
+    const xnn_float16* i4 = 4 < k ? *i++ : zero;
+    assert(i4 != NULL);
+    if XNN_UNPREDICTABLE(i4 != zero) {
+      i4 = (const xnn_float16*) ((uintptr_t) i4 + input_offset);
+    }
+    const xnn_float16* i5 = 5 < k ? *i++ : zero;
+    assert(i5 != NULL);
+    if XNN_UNPREDICTABLE(i5 != zero) {
+      i5 = (const xnn_float16*) ((uintptr_t) i5 + input_offset);
+    }
+    const xnn_float16* i6 = 6 < k ? *i++ : zero;
+    assert(i6 != NULL);
+    if XNN_UNPREDICTABLE(i6 != zero) {
+      i6 = (const xnn_float16*) ((uintptr_t) i6 + input_offset);
+    }
+    const xnn_float16* i7 = 7 < k ? *i++ : zero;
+    assert(i7 != NULL);
+    if XNN_UNPREDICTABLE(i7 != zero) {
+      i7 = (const xnn_float16*) ((uintptr_t) i7 + input_offset);
+    }
+    const xnn_float16* i8 = 8 < k ? *i++ : zero;
+    assert(i8 != NULL);
+    if XNN_UNPREDICTABLE(i8 != zero) {
+      i8 = (const xnn_float16*) ((uintptr_t) i8 + input_offset);
+    }
+
+    if (multiplier != NULL) {
+      scale = *multiplier++;
+    }
+    xnn_float16* o = output;
+    size_t c = channels;
+    do {
+      size_t vl = __riscv_vsetvl_e16m2(c);
+      vfloat16m2_t vi0 = __riscv_vle16_v_f16m2(i0, vl); i0 += vl;
+      vfloat16m2_t vi1 = __riscv_vle16_v_f16m2(i1, vl); i1 += vl;
+      vfloat16m2_t vi2 = __riscv_vle16_v_f16m2(i2, vl); i2 += vl;
+      vfloat16m2_t vi3 = __riscv_vle16_v_f16m2(i3, vl); i3 += vl;
+      vfloat16m2_t vi4 = __riscv_vle16_v_f16m2(i4, vl); i4 += vl;
+      vfloat16m2_t vi5 = __riscv_vle16_v_f16m2(i5, vl); i5 += vl;
+      vfloat16m2_t vi6 = __riscv_vle16_v_f16m2(i6, vl); i6 += vl;
+      vfloat16m2_t vi7 = __riscv_vle16_v_f16m2(i7, vl); i7 += vl;
+      vfloat16m2_t vi8 = __riscv_vle16_v_f16m2(i8, vl); i8 += vl;
+      vfloat16m2_t vprev = __riscv_vle16_v_f16m2(prev_output, vl); prev_output += vl;
+
+      vfloat16m2_t vsum01 = __riscv_vfadd(vi0, vi1, vl);
+      vfloat16m2_t vsum23 = __riscv_vfadd(vi2, vi3, vl);
+      vfloat16m2_t vsum45 = __riscv_vfadd(vi4, vi5, vl);
+      vfloat16m2_t vsum67 = __riscv_vfadd(vi6, vi7, vl);
+      vfloat16m2_t vsum018 = __riscv_vfadd(vsum01, vi8, vl);
+
+      vfloat16m2_t vsum2345 = __riscv_vfadd(vsum23, vsum45, vl);
+      vfloat16m2_t vsum01678 = __riscv_vfadd(vsum67, vsum018, vl);
+      vfloat16m2_t vsum012345678 = __riscv_vfadd(vsum2345, vsum01678, vl);
+      vfloat16m2_t vacc = __riscv_vfadd(vprev, vsum012345678, vl);
+
+      vacc = __riscv_vfmul(vacc, scale, vl);
+      vacc = __riscv_vfmax(vacc, min, vl);
+      vacc = __riscv_vfmin(vacc, max, vl);
+
+      __riscv_vse16_v_f16m2(o, vacc, vl); o += vl;
+
+      c -= vl;
+    } while (c != 0);
+
+    input = (const xnn_float16**) ((uintptr_t) input + input_increment);
+    input_offset += input_pixel_stride;
+    output = (xnn_float16*) ((uintptr_t) output + output_increment);
+  } while (--output_pixels != 0);
+}
diff --git a/src/f16-maxpool/f16-maxpool-minmax.inc b/src/f16-maxpool/f16-maxpool-minmax.inc
index ebc7a65487c..a954de9bb4b 100644
--- a/src/f16-maxpool/f16-maxpool-minmax.inc
+++ b/src/f16-maxpool/f16-maxpool-minmax.inc
@@ -18,6 +18,5 @@ XNN_UKERNEL(xnn_arch_x86_avx2, xnn_f16_maxpool_minmax_ukernel_9p__avx2_u16, 16,
 #endif  // XNN_ENABLE_AVX2 && (XNN_ARCH_X86 || XNN_ARCH_X86_64)
 
 #if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR
-XNN_UKERNEL(xnn_arch_riscv_vector_fp16_arith, xnn_f16_maxpool_minmax_ukernel_9p__rvvfp16arith_u1v, (1*xnn_init_hardware_config()->vlenb/sizeof(xnn_float16)), 9, xnn_float16, struct xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params)
 XNN_UKERNEL(xnn_arch_riscv_vector_fp16_arith, xnn_f16_maxpool_minmax_ukernel_9p__rvvfp16arith_u2v, (2*xnn_init_hardware_config()->vlenb/sizeof(xnn_float16)), 9, xnn_float16, struct xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params)
 #endif  // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR
diff --git a/src/f16-maxpool/gen/f16-maxpool-9p-minmax-rvvfp16arith-u1v.c b/src/f16-maxpool/gen/f16-maxpool-9p-minmax-rvvfp16arith-u1v.c
deleted file mode 100644
index 266b3701f49..00000000000
--- a/src/f16-maxpool/gen/f16-maxpool-9p-minmax-rvvfp16arith-u1v.c
+++ /dev/null
@@ -1,147 +0,0 @@
-// clang-format off
-// Auto-generated file. Do not edit!
-//   Template: src/f32-maxpool/rvv.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2026 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-#include "src/xnnpack/maxpool.h"
-#include <riscv_vector.h>
-
-
-void xnn_f16_maxpool_minmax_ukernel_9p__rvvfp16arith_u1v(
-    size_t output_pixels,
-    size_t kernel_elements,
-    size_t channels,
-    const xnn_float16** input,
-    size_t input_offset,
-    size_t input_pixel_stride,
-    xnn_float16* output,
-    size_t input_increment,
-    size_t output_increment,
-    const struct xnn_f16_minmax_params* restrict params)
-{
-  assert(output_pixels != 0);
-  assert(kernel_elements != 0);
-  assert(channels != 0);
-
-  const xnn_float16 output_min = params->scalar.min;
-  const xnn_float16 output_max = params->scalar.max;
-  do {
-    const xnn_float16** i = input;
-
-    // First pass: load the inputs, store the max pool in the output.
-    const xnn_float16* i0 = *i++;
-    const xnn_float16* i1 = 1 < kernel_elements ? *i++ : i0;
-    const xnn_float16* i2 = 2 < kernel_elements ? *i++ : i0;
-    const xnn_float16* i3 = 3 < kernel_elements ? *i++ : i0;
-    const xnn_float16* i4 = 4 < kernel_elements ? *i++ : i0;
-    const xnn_float16* i5 = 5 < kernel_elements ? *i++ : i0;
-    const xnn_float16* i6 = 6 < kernel_elements ? *i++ : i0;
-    const xnn_float16* i7 = 7 < kernel_elements ? *i++ : i0;
-    const xnn_float16* i8 = 8 < kernel_elements ? *i++ : i0;
-    i0 = (const xnn_float16*) ((uintptr_t) i0 + input_offset);
-    i1 = (const xnn_float16*) ((uintptr_t) i1 + input_offset);
-    i2 = (const xnn_float16*) ((uintptr_t) i2 + input_offset);
-    i3 = (const xnn_float16*) ((uintptr_t) i3 + input_offset);
-    i4 = (const xnn_float16*) ((uintptr_t) i4 + input_offset);
-    i5 = (const xnn_float16*) ((uintptr_t) i5 + input_offset);
-    i6 = (const xnn_float16*) ((uintptr_t) i6 + input_offset);
-    i7 = (const xnn_float16*) ((uintptr_t) i7 + input_offset);
-    i8 = (const xnn_float16*) ((uintptr_t) i8 + input_offset);
-
-    xnn_float16* o = output;
-    size_t c = channels;
-    do {
-      size_t vl = __riscv_vsetvl_e16m1(c);
-      vfloat16m1_t vi0 = __riscv_vle16_v_f16m1(i0, vl); i0 += vl;
-      vfloat16m1_t vi1 = __riscv_vle16_v_f16m1(i1, vl); i1 += vl;
-      vfloat16m1_t vi2 = __riscv_vle16_v_f16m1(i2, vl); i2 += vl;
-      vfloat16m1_t vi3 = __riscv_vle16_v_f16m1(i3, vl); i3 += vl;
-      vfloat16m1_t vi4 = __riscv_vle16_v_f16m1(i4, vl); i4 += vl;
-      vfloat16m1_t vi5 = __riscv_vle16_v_f16m1(i5, vl); i5 += vl;
-      vfloat16m1_t vi6 = __riscv_vle16_v_f16m1(i6, vl); i6 += vl;
-      vfloat16m1_t vi7 = __riscv_vle16_v_f16m1(i7, vl); i7 += vl;
-      vfloat16m1_t vi8 = __riscv_vle16_v_f16m1(i8, vl); i8 += vl;
-
-      vfloat16m1_t vmax01 = __riscv_vfmax(vi0, vi1, vl);
-      vfloat16m1_t vmax23 = __riscv_vfmax(vi2, vi3, vl);
-      vfloat16m1_t vmax45 = __riscv_vfmax(vi4, vi5, vl);
-      vfloat16m1_t vmax67 = __riscv_vfmax(vi6, vi7, vl);
-      vfloat16m1_t vmax018 = __riscv_vfmax(vmax01, vi8, vl);
-
-      vfloat16m1_t vmax2345 = __riscv_vfmax(vmax23, vmax45, vl);
-      vfloat16m1_t vmax01678 = __riscv_vfmax(vmax67, vmax018, vl);
-      vfloat16m1_t vacc = __riscv_vfmax(vmax2345, vmax01678, vl);
-
-      vacc = __riscv_vfmax(vacc, output_min, vl);
-      vacc = __riscv_vfmin(vacc, output_max, vl);
-      __riscv_vse16_v_f16m1(o, vacc, vl); o += vl;
-
-      c -= vl;
-    } while (c != 0);
-
-    // Passes 1 - n: Max more inputs to the output.
-    for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 9) {
-      const xnn_float16* i0 = *i++;
-      const xnn_float16* i1 = 1 < k ? *i++ : i0;
-      const xnn_float16* i2 = 2 < k ? *i++ : i0;
-      const xnn_float16* i3 = 3 < k ? *i++ : i0;
-      const xnn_float16* i4 = 4 < k ? *i++ : i0;
-      const xnn_float16* i5 = 5 < k ? *i++ : i0;
-      const xnn_float16* i6 = 6 < k ? *i++ : i0;
-      const xnn_float16* i7 = 7 < k ? *i++ : i0;
-      const xnn_float16* i8 = 8 < k ? *i++ : i0;
-      i0 = (const xnn_float16*) ((uintptr_t) i0 + input_offset);
-      i1 = (const xnn_float16*) ((uintptr_t) i1 + input_offset);
-      i2 = (const xnn_float16*) ((uintptr_t) i2 + input_offset);
-      i3 = (const xnn_float16*) ((uintptr_t) i3 + input_offset);
-      i4 = (const xnn_float16*) ((uintptr_t) i4 + input_offset);
-      i5 = (const xnn_float16*) ((uintptr_t) i5 + input_offset);
-      i6 = (const xnn_float16*) ((uintptr_t) i6 + input_offset);
-      i7 = (const xnn_float16*) ((uintptr_t) i7 + input_offset);
-      i8 = (const xnn_float16*) ((uintptr_t) i8 + input_offset);
-
-      o = output;
-      size_t c = channels;
-      do {
-        size_t vl = __riscv_vsetvl_e16m1(c);
-
-        vfloat16m1_t vi0 = __riscv_vle16_v_f16m1(i0, vl); i0 += vl;
-        vfloat16m1_t vi1 = __riscv_vle16_v_f16m1(i1, vl); i1 += vl;
-        vfloat16m1_t vi2 = __riscv_vle16_v_f16m1(i2, vl); i2 += vl;
-        vfloat16m1_t vi3 = __riscv_vle16_v_f16m1(i3, vl); i3 += vl;
-        vfloat16m1_t vi4 = __riscv_vle16_v_f16m1(i4, vl); i4 += vl;
-        vfloat16m1_t vi5 = __riscv_vle16_v_f16m1(i5, vl); i5 += vl;
-        vfloat16m1_t vi6 = __riscv_vle16_v_f16m1(i6, vl); i6 += vl;
-        vfloat16m1_t vi7 = __riscv_vle16_v_f16m1(i7, vl); i7 += vl;
-        vfloat16m1_t vi8 = __riscv_vle16_v_f16m1(i8, vl); i8 += vl;
-
-        vfloat16m1_t vprev = __riscv_vle16_v_f16m1(o, vl);
-
-        vfloat16m1_t vmax01 = __riscv_vfmax(vi0, vi1, vl);
-        vfloat16m1_t vmax23 = __riscv_vfmax(vi2, vi3, vl);
-        vfloat16m1_t vmax45 = __riscv_vfmax(vi4, vi5, vl);
-        vfloat16m1_t vmax67 = __riscv_vfmax(vi6, vi7, vl);
-        vfloat16m1_t vmax018 = __riscv_vfmax(vmax01, vi8, vl);
-
-        vfloat16m1_t vmax2345 = __riscv_vfmax(vmax23, vmax45, vl);
-        vfloat16m1_t vmax01678 = __riscv_vfmax(vmax67, vmax018, vl);
-        vfloat16m1_t vmax012345678 = __riscv_vfmax(vmax2345, vmax01678, vl);
-
-        vfloat16m1_t vacc = __riscv_vfmax(vprev, vmax012345678, vl);
-        vacc = __riscv_vfmin(vacc, output_max, vl);
-        __riscv_vse16_v_f16m1(o, vacc, vl); o += vl;
-
-        c -= vl;
-      } while (c != 0);
-    }
-    input = (const xnn_float16**) ((uintptr_t) input + input_increment);
-    input_offset += input_pixel_stride;
-    output = (xnn_float16*) ((uintptr_t) output + output_increment);
-  } while (--output_pixels != 0);
-}
diff --git a/src/f32-avgpool/f32-avgpool-minmax.inc b/src/f32-avgpool/f32-avgpool-minmax.inc
index e530e870b4b..534eb3c83ca 100644
--- a/src/f32-avgpool/f32-avgpool-minmax.inc
+++ b/src/f32-avgpool/f32-avgpool-minmax.inc
@@ -28,4 +28,8 @@ XNN_UKERNEL(xnn_arch_none, xnn_f32_avgpool_minmax_ukernel_9p__wasmsimd_u4, 4, 9,
 XNN_UKERNEL(xnn_arch_hvx, xnn_f32_avgpool_minmax_ukernel_9p__hvx_u32, 32, 9, float, struct xnn_f32_scaleminmax_params, xnn_init_f32_scaleminmax_scalar_params)
 #endif  // XNN_ENABLE_HVX && XNN_ARCH_HEXAGON
 
+#if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR
+XNN_UKERNEL(xnn_arch_riscv_vector, xnn_f32_avgpool_minmax_ukernel_9p__rvv_u2v, (2*xnn_init_hardware_config()->vlenb/sizeof(float)), 9, float, struct xnn_f32_scaleminmax_params, xnn_init_f32_scaleminmax_scalar_params)
+#endif  // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR
+
 XNN_UKERNEL(xnn_arch_none, xnn_f32_avgpool_minmax_ukernel_9p__scalar_u1, 1, 9, float, struct xnn_f32_scaleminmax_params, xnn_init_f32_scaleminmax_scalar_params)
diff --git a/src/f32-avgpool/gen/f32-avgpool-9p-minmax-rvv-u2v.c b/src/f32-avgpool/gen/f32-avgpool-9p-minmax-rvv-u2v.c
new file mode 100644
index 00000000000..fd032a5d64a
--- /dev/null
+++ b/src/f32-avgpool/gen/f32-avgpool-9p-minmax-rvv-u2v.c
@@ -0,0 +1,216 @@
+// clang-format off
+// Auto-generated file. Do not edit!
+//   Template: src/f32-avgpool/rvv.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2026 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include "src/xnnpack/maxpool.h"
+#include <riscv_vector.h>
+
+
+void xnn_f32_avgpool_minmax_ukernel_9p__rvv_u2v(
+    size_t output_pixels,
+    size_t kernel_elements,
+    size_t channels,
+    const float** input,
+    size_t input_offset,
+    size_t input_pixel_stride,
+    const float* zero,
+    const float* multiplier,
+    float* output,
+    size_t input_increment,
+    size_t output_increment,
+    const struct xnn_f32_scaleminmax_params* restrict params)
+{
+  assert(output_pixels != 0);
+  assert(kernel_elements != 0);
+  assert(channels != 0);
+
+  const float min = params->scalar.min;
+  const float max = params->scalar.max;
+  float scale = params->scalar.scale;
+
+  do {
+    // Start with the previous output as the zero buffer.
+    const float* prev_output = zero;
+
+    const float** i = input;
+
+    // Passes 0 - n-1: load the output, add 9 inputs.
+    size_t k = kernel_elements;
+    for (; k > 9; k -= 9) {
+      const float* i0 = *i++;
+      assert(i0 != NULL);
+      if XNN_UNPREDICTABLE(i0 != zero) {
+        i0 = (const float*) ((uintptr_t) i0 + input_offset);
+      }
+      const float* i1 = *i++;
+      assert(i1 != NULL);
+      if XNN_UNPREDICTABLE(i1 != zero) {
+        i1 = (const float*) ((uintptr_t) i1 + input_offset);
+      }
+      const float* i2 = *i++;
+      assert(i2 != NULL);
+      if XNN_UNPREDICTABLE(i2 != zero) {
+        i2 = (const float*) ((uintptr_t) i2 + input_offset);
+      }
+      const float* i3 = *i++;
+      assert(i3 != NULL);
+      if XNN_UNPREDICTABLE(i3 != zero) {
+        i3 = (const float*) ((uintptr_t) i3 + input_offset);
+      }
+      const float* i4 = *i++;
+      assert(i4 != NULL);
+      if XNN_UNPREDICTABLE(i4 != zero) {
+        i4 = (const float*) ((uintptr_t) i4 + input_offset);
+      }
+      const float* i5 = *i++;
+      assert(i5 != NULL);
+      if XNN_UNPREDICTABLE(i5 != zero) {
+        i5 = (const float*) ((uintptr_t) i5 + input_offset);
+      }
+      const float* i6 = *i++;
+      assert(i6 != NULL);
+      if XNN_UNPREDICTABLE(i6 != zero) {
+        i6 = (const float*) ((uintptr_t) i6 + input_offset);
+      }
+      const float* i7 = *i++;
+      assert(i7 != NULL);
+      if XNN_UNPREDICTABLE(i7 != zero) {
+        i7 = (const float*) ((uintptr_t) i7 + input_offset);
+      }
+      const float* i8 = *i++;
+      assert(i8 != NULL);
+      if XNN_UNPREDICTABLE(i8 != zero) {
+        i8 = (const float*) ((uintptr_t) i8 + input_offset);
+      }
+
+      float* o = output;
+      size_t c = channels;
+      do {
+        size_t vl = __riscv_vsetvl_e32m2(c);
+        vfloat32m2_t vi0 = __riscv_vle32_v_f32m2(i0, vl); i0 += vl;
+        vfloat32m2_t vi1 = __riscv_vle32_v_f32m2(i1, vl); i1 += vl;
+        vfloat32m2_t vi2 = __riscv_vle32_v_f32m2(i2, vl); i2 += vl;
+        vfloat32m2_t vi3 = __riscv_vle32_v_f32m2(i3, vl); i3 += vl;
+        vfloat32m2_t vi4 = __riscv_vle32_v_f32m2(i4, vl); i4 += vl;
+        vfloat32m2_t vi5 = __riscv_vle32_v_f32m2(i5, vl); i5 += vl;
+        vfloat32m2_t vi6 = __riscv_vle32_v_f32m2(i6, vl); i6 += vl;
+        vfloat32m2_t vi7 = __riscv_vle32_v_f32m2(i7, vl); i7 += vl;
+        vfloat32m2_t vi8 = __riscv_vle32_v_f32m2(i8, vl); i8 += vl;
+        vfloat32m2_t vprev = __riscv_vle32_v_f32m2(prev_output, vl); prev_output += vl;
+
+        vfloat32m2_t vsum01 = __riscv_vfadd(vi0, vi1, vl);
+        vfloat32m2_t vsum23 = __riscv_vfadd(vi2, vi3, vl);
+        vfloat32m2_t vsum45 = __riscv_vfadd(vi4, vi5, vl);
+        vfloat32m2_t vsum67 = __riscv_vfadd(vi6, vi7, vl);
+        vfloat32m2_t vsum018 = __riscv_vfadd(vsum01, vi8, vl);
+
+        vfloat32m2_t vsum2345 = __riscv_vfadd(vsum23, vsum45, vl);
+        vfloat32m2_t vsum01678 = __riscv_vfadd(vsum67, vsum018, vl);
+        vfloat32m2_t vsum012345678 = __riscv_vfadd(vsum2345, vsum01678, vl);
+        vfloat32m2_t vacc = __riscv_vfadd(vprev, vsum012345678, vl);
+        __riscv_vse32_v_f32m2(o, vacc, vl); o += vl;
+
+        c -= vl;
+      } while (c != 0);
+
+      // Subsequent passes read from the previous output.
+      prev_output = output;
+    }
+
+    // Final pass: load the output, add remaining kernel elements, apply scaling/min/max
+    const float* i0 = 0 < k ? *i++ : zero;
+    assert(i0 != NULL);
+    if XNN_UNPREDICTABLE(i0 != zero) {
+      i0 = (const float*) ((uintptr_t) i0 + input_offset);
+    }
+    const float* i1 = 1 < k ? *i++ : zero;
+    assert(i1 != NULL);
+    if XNN_UNPREDICTABLE(i1 != zero) {
+      i1 = (const float*) ((uintptr_t) i1 + input_offset);
+    }
+    const float* i2 = 2 < k ? *i++ : zero;
+    assert(i2 != NULL);
+    if XNN_UNPREDICTABLE(i2 != zero) {
+      i2 = (const float*) ((uintptr_t) i2 + input_offset);
+    }
+    const float* i3 = 3 < k ? *i++ : zero;
+    assert(i3 != NULL);
+    if XNN_UNPREDICTABLE(i3 != zero) {
+      i3 = (const float*) ((uintptr_t) i3 + input_offset);
+    }
+    const float* i4 = 4 < k ? *i++ : zero;
+    assert(i4 != NULL);
+    if XNN_UNPREDICTABLE(i4 != zero) {
+      i4 = (const float*) ((uintptr_t) i4 + input_offset);
+    }
+    const float* i5 = 5 < k ? *i++ : zero;
+    assert(i5 != NULL);
+    if XNN_UNPREDICTABLE(i5 != zero) {
+      i5 = (const float*) ((uintptr_t) i5 + input_offset);
+    }
+    const float* i6 = 6 < k ? *i++ : zero;
+    assert(i6 != NULL);
+    if XNN_UNPREDICTABLE(i6 != zero) {
+      i6 = (const float*) ((uintptr_t) i6 + input_offset);
+    }
+    const float* i7 = 7 < k ? *i++ : zero;
+    assert(i7 != NULL);
+    if XNN_UNPREDICTABLE(i7 != zero) {
+      i7 = (const float*) ((uintptr_t) i7 + input_offset);
+    }
+    const float* i8 = 8 < k ? *i++ : zero;
+    assert(i8 != NULL);
+    if XNN_UNPREDICTABLE(i8 != zero) {
+      i8 = (const float*) ((uintptr_t) i8 + input_offset);
+    }
+
+    if (multiplier != NULL) {
+      scale = *multiplier++;
+    }
+    float* o = output;
+    size_t c = channels;
+    do {
+      size_t vl = __riscv_vsetvl_e32m2(c);
+      vfloat32m2_t vi0 = __riscv_vle32_v_f32m2(i0, vl); i0 += vl;
+      vfloat32m2_t vi1 = __riscv_vle32_v_f32m2(i1, vl); i1 += vl;
+      vfloat32m2_t vi2 = __riscv_vle32_v_f32m2(i2, vl); i2 += vl;
+      vfloat32m2_t vi3 = __riscv_vle32_v_f32m2(i3, vl); i3 += vl;
+      vfloat32m2_t vi4 = __riscv_vle32_v_f32m2(i4, vl); i4 += vl;
+      vfloat32m2_t vi5 = __riscv_vle32_v_f32m2(i5, vl); i5 += vl;
+      vfloat32m2_t vi6 = __riscv_vle32_v_f32m2(i6, vl); i6 += vl;
+      vfloat32m2_t vi7 = __riscv_vle32_v_f32m2(i7, vl); i7 += vl;
+      vfloat32m2_t vi8 = __riscv_vle32_v_f32m2(i8, vl); i8 += vl;
+      vfloat32m2_t vprev = __riscv_vle32_v_f32m2(prev_output, vl); prev_output += vl;
+
+      vfloat32m2_t vsum01 = __riscv_vfadd(vi0, vi1, vl);
+      vfloat32m2_t vsum23 = __riscv_vfadd(vi2, vi3, vl);
+      vfloat32m2_t vsum45 = __riscv_vfadd(vi4, vi5, vl);
+      vfloat32m2_t vsum67 = __riscv_vfadd(vi6, vi7, vl);
+      vfloat32m2_t vsum018 = __riscv_vfadd(vsum01, vi8, vl);
+
+      vfloat32m2_t vsum2345 = __riscv_vfadd(vsum23, vsum45, vl);
+      vfloat32m2_t vsum01678 = __riscv_vfadd(vsum67, vsum018, vl);
+      vfloat32m2_t vsum012345678 = __riscv_vfadd(vsum2345, vsum01678, vl);
+      vfloat32m2_t vacc = __riscv_vfadd(vprev, vsum012345678, vl);
+
+      vacc = __riscv_vfmul(vacc, scale, vl);
+      vacc = __riscv_vfmax(vacc, min, vl);
+      vacc = __riscv_vfmin(vacc, max, vl);
+
+      __riscv_vse32_v_f32m2(o, vacc, vl); o += vl;
+
+      c -= vl;
+    } while (c != 0);
+
+    input = (const float**) ((uintptr_t) input + input_increment);
+    input_offset += input_pixel_stride;
+    output = (float*) ((uintptr_t) output + output_increment);
+  } while (--output_pixels != 0);
+}
diff --git a/src/f32-avgpool/rvv.c.in b/src/f32-avgpool/rvv.c.in
new file mode 100644
index 00000000000..2fd67ee4055
--- /dev/null
+++ b/src/f32-avgpool/rvv.c.in
@@ -0,0 +1,127 @@
+// Copyright 2026 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert LMUL in [1, 2, 4, 8]
+$assert(DATATYPE in ["f32", "f16"])
+#include <assert.h>
+#include "src/xnnpack/maxpool.h"
+#include <riscv_vector.h>
+
+$CTYPE = {"f32": "float", "f16": "xnn_float16"}[DATATYPE]
+$VTYPE = {"f32": "vfloat32", "f16": "vfloat16"}[DATATYPE]
+$VLOAD = {"f32": "__riscv_vle32_v_f32", "f16": "__riscv_vle16_v_f16"}[DATATYPE]
+$VSTORE = {"f32": "__riscv_vse32_v_f32", "f16": "__riscv_vse16_v_f16"}[DATATYPE]
+$VSETVL = {"f32": "__riscv_vsetvl_e32", "f16": "__riscv_vsetvl_e16"}[DATATYPE]
+$ISA = "fp16arith" if DATATYPE == "f16" else ""
+
+void xnn_${DATATYPE}_avgpool_minmax_ukernel_9p__rvv${ISA}_u${LMUL}v(
+    size_t output_pixels,
+    size_t kernel_elements,
+    size_t channels,
+    const ${CTYPE}** input,
+    size_t input_offset,
+    size_t input_pixel_stride,
+    const ${CTYPE}* zero,
+    const ${CTYPE}* multiplier,
+    ${CTYPE}* output,
+    size_t input_increment,
+    size_t output_increment,
+    const struct xnn_${DATATYPE}_scaleminmax_params* restrict params)
+{
+  assert(output_pixels != 0);
+  assert(kernel_elements != 0);
+  assert(channels != 0);
+
+  const ${CTYPE} min = params->scalar.min;
+  const ${CTYPE} max = params->scalar.max;
+  ${CTYPE} scale = params->scalar.scale;
+
+  do {
+    // Start with the previous output as the zero buffer.
+    const ${CTYPE}* prev_output = zero;
+
+    const ${CTYPE}** i = input;
+
+    // Passes 0 - n-1: load the output, add 9 inputs.
+    size_t k = kernel_elements;
+    for (; k > 9; k -= 9) {
+      $for K in range(9):
+        const ${CTYPE}* i${K} = *i++;
+        assert(i${K} != NULL);
+        if XNN_UNPREDICTABLE(i${K} != zero) {
+          i${K} = (const ${CTYPE}*) ((uintptr_t) i${K} + input_offset);
+        }
+
+      ${CTYPE}* o = output;
+      size_t c = channels;
+      do {
+        size_t vl = ${VSETVL}m${LMUL}(c);
+        $for K in range(9):
+          ${VTYPE}m${LMUL}_t vi${K} = ${VLOAD}m${LMUL}(i${K}, vl); i${K} += vl;
+        ${VTYPE}m${LMUL}_t vprev = ${VLOAD}m${LMUL}(prev_output, vl); prev_output += vl;
+
+        ${VTYPE}m${LMUL}_t vsum01 = __riscv_vfadd(vi0, vi1, vl);
+        ${VTYPE}m${LMUL}_t vsum23 = __riscv_vfadd(vi2, vi3, vl);
+        ${VTYPE}m${LMUL}_t vsum45 = __riscv_vfadd(vi4, vi5, vl);
+        ${VTYPE}m${LMUL}_t vsum67 = __riscv_vfadd(vi6, vi7, vl);
+        ${VTYPE}m${LMUL}_t vsum018 = __riscv_vfadd(vsum01, vi8, vl);
+
+        ${VTYPE}m${LMUL}_t vsum2345 = __riscv_vfadd(vsum23, vsum45, vl);
+        ${VTYPE}m${LMUL}_t vsum01678 = __riscv_vfadd(vsum67, vsum018, vl);
+        ${VTYPE}m${LMUL}_t vsum012345678 = __riscv_vfadd(vsum2345, vsum01678, vl);
+        ${VTYPE}m${LMUL}_t vacc = __riscv_vfadd(vprev, vsum012345678, vl);
+        ${VSTORE}m${LMUL}(o, vacc, vl); o += vl;
+
+        c -= vl;
+      } while (c != 0);
+
+      // Subsequent passes read from the previous output.
+      prev_output = output;
+    }
+
+    // Final pass: load the output, add remaining kernel elements, apply scaling/min/max
+    $for K in range(9):
+      const ${CTYPE}* i${K} = ${K} < k ? *i++ : zero;
+      assert(i${K} != NULL);
+      if XNN_UNPREDICTABLE(i${K} != zero) {
+        i${K} = (const ${CTYPE}*) ((uintptr_t) i${K} + input_offset);
+      }
+
+    if (multiplier != NULL) {
+      scale = *multiplier++;
+    }
+    ${CTYPE}* o = output;
+    size_t c = channels;
+    do {
+      size_t vl = ${VSETVL}m${LMUL}(c);
+      $for K in range(9):
+        ${VTYPE}m${LMUL}_t vi${K} = ${VLOAD}m${LMUL}(i${K}, vl); i${K} += vl;
+      ${VTYPE}m${LMUL}_t vprev = ${VLOAD}m${LMUL}(prev_output, vl); prev_output += vl;
+
+      ${VTYPE}m${LMUL}_t vsum01 = __riscv_vfadd(vi0, vi1, vl);
+      ${VTYPE}m${LMUL}_t vsum23 = __riscv_vfadd(vi2, vi3, vl);
+      ${VTYPE}m${LMUL}_t vsum45 = __riscv_vfadd(vi4, vi5, vl);
+      ${VTYPE}m${LMUL}_t vsum67 = __riscv_vfadd(vi6, vi7, vl);
+      ${VTYPE}m${LMUL}_t vsum018 = __riscv_vfadd(vsum01, vi8, vl);
+
+      ${VTYPE}m${LMUL}_t vsum2345 = __riscv_vfadd(vsum23, vsum45, vl);
+      ${VTYPE}m${LMUL}_t vsum01678 = __riscv_vfadd(vsum67, vsum018, vl);
+      ${VTYPE}m${LMUL}_t vsum012345678 = __riscv_vfadd(vsum2345, vsum01678, vl);
+      ${VTYPE}m${LMUL}_t vacc = __riscv_vfadd(vprev, vsum012345678, vl);
+
+      vacc = __riscv_vfmul(vacc, scale, vl);
+      vacc = __riscv_vfmax(vacc, min, vl);
+      vacc = __riscv_vfmin(vacc, max, vl);
+
+      ${VSTORE}m${LMUL}(o, vacc, vl); o += vl;
+
+      c -= vl;
+    } while (c != 0);
+
+    input = (const ${CTYPE}**) ((uintptr_t) input + input_increment);
+    input_offset += input_pixel_stride;
+    output = (${CTYPE}*) ((uintptr_t) output + output_increment);
+  } while (--output_pixels != 0);
+}
diff --git a/src/f32-maxpool/f32-maxpool-minmax.inc b/src/f32-maxpool/f32-maxpool-minmax.inc
index d3434811e17..10bc3f2e5d7 100644
--- a/src/f32-maxpool/f32-maxpool-minmax.inc
+++ b/src/f32-maxpool/f32-maxpool-minmax.inc
@@ -18,7 +18,6 @@ XNN_UKERNEL(xnn_arch_none, xnn_f32_maxpool_minmax_ukernel_9p__wasmsimd_u4, 4, 9,
 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
 
 #if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR
-XNN_UKERNEL(xnn_arch_riscv_vector, xnn_f32_maxpool_minmax_ukernel_9p__rvv_u1v, (1*xnn_init_hardware_config()->vlenb/sizeof(float)), 9, float, struct xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params)
 XNN_UKERNEL(xnn_arch_riscv_vector, xnn_f32_maxpool_minmax_ukernel_9p__rvv_u2v, (2*xnn_init_hardware_config()->vlenb/sizeof(float)), 9, float, struct xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params)
 #endif  // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR
 
diff --git a/src/f32-maxpool/gen/f32-maxpool-9p-minmax-rvv-u1v.c b/src/f32-maxpool/gen/f32-maxpool-9p-minmax-rvv-u1v.c
deleted file mode 100644
index a4873239b0a..00000000000
--- a/src/f32-maxpool/gen/f32-maxpool-9p-minmax-rvv-u1v.c
+++ /dev/null
@@ -1,147 +0,0 @@
-// clang-format off
-// Auto-generated file. Do not edit!
-//   Template: src/f32-maxpool/rvv.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2026 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-#include "src/xnnpack/maxpool.h"
-#include <riscv_vector.h>
-
-
-void xnn_f32_maxpool_minmax_ukernel_9p__rvv_u1v(
-    size_t output_pixels,
-    size_t kernel_elements,
-    size_t channels,
-    const float** input,
-    size_t input_offset,
-    size_t input_pixel_stride,
-    float* output,
-    size_t input_increment,
-    size_t output_increment,
-    const struct xnn_f32_minmax_params* restrict params)
-{
-  assert(output_pixels != 0);
-  assert(kernel_elements != 0);
-  assert(channels != 0);
-
-  const float output_min = params->scalar.min;
-  const float output_max = params->scalar.max;
-  do {
-    const float** i = input;
-
-    // First pass: load the inputs, store the max pool in the output.
-    const float* i0 = *i++;
-    const float* i1 = 1 < kernel_elements ? *i++ : i0;
-    const float* i2 = 2 < kernel_elements ? *i++ : i0;
-    const float* i3 = 3 < kernel_elements ? *i++ : i0;
-    const float* i4 = 4 < kernel_elements ? *i++ : i0;
-    const float* i5 = 5 < kernel_elements ? *i++ : i0;
-    const float* i6 = 6 < kernel_elements ? *i++ : i0;
-    const float* i7 = 7 < kernel_elements ? *i++ : i0;
-    const float* i8 = 8 < kernel_elements ? *i++ : i0;
-    i0 = (const float*) ((uintptr_t) i0 + input_offset);
-    i1 = (const float*) ((uintptr_t) i1 + input_offset);
-    i2 = (const float*) ((uintptr_t) i2 + input_offset);
-    i3 = (const float*) ((uintptr_t) i3 + input_offset);
-    i4 = (const float*) ((uintptr_t) i4 + input_offset);
-    i5 = (const float*) ((uintptr_t) i5 + input_offset);
-    i6 = (const float*) ((uintptr_t) i6 + input_offset);
-    i7 = (const float*) ((uintptr_t) i7 + input_offset);
-    i8 = (const float*) ((uintptr_t) i8 + input_offset);
-
-    float* o = output;
-    size_t c = channels;
-    do {
-      size_t vl = __riscv_vsetvl_e32m1(c);
-      vfloat32m1_t vi0 = __riscv_vle32_v_f32m1(i0, vl); i0 += vl;
-      vfloat32m1_t vi1 = __riscv_vle32_v_f32m1(i1, vl); i1 += vl;
-      vfloat32m1_t vi2 = __riscv_vle32_v_f32m1(i2, vl); i2 += vl;
-      vfloat32m1_t vi3 = __riscv_vle32_v_f32m1(i3, vl); i3 += vl;
-      vfloat32m1_t vi4 = __riscv_vle32_v_f32m1(i4, vl); i4 += vl;
-      vfloat32m1_t vi5 = __riscv_vle32_v_f32m1(i5, vl); i5 += vl;
-      vfloat32m1_t vi6 = __riscv_vle32_v_f32m1(i6, vl); i6 += vl;
-      vfloat32m1_t vi7 = __riscv_vle32_v_f32m1(i7, vl); i7 += vl;
-      vfloat32m1_t vi8 = __riscv_vle32_v_f32m1(i8, vl); i8 += vl;
-
-      vfloat32m1_t vmax01 = __riscv_vfmax(vi0, vi1, vl);
-      vfloat32m1_t vmax23 = __riscv_vfmax(vi2, vi3, vl);
-      vfloat32m1_t vmax45 = __riscv_vfmax(vi4, vi5, vl);
-      vfloat32m1_t vmax67 = __riscv_vfmax(vi6, vi7, vl);
-      vfloat32m1_t vmax018 = __riscv_vfmax(vmax01, vi8, vl);
-
-      vfloat32m1_t vmax2345 = __riscv_vfmax(vmax23, vmax45, vl);
-      vfloat32m1_t vmax01678 = __riscv_vfmax(vmax67, vmax018, vl);
-      vfloat32m1_t vacc = __riscv_vfmax(vmax2345, vmax01678, vl);
-
-      vacc = __riscv_vfmax(vacc, output_min, vl);
-      vacc = __riscv_vfmin(vacc, output_max, vl);
-      __riscv_vse32_v_f32m1(o, vacc, vl); o += vl;
-
-      c -= vl;
-    } while (c != 0);
-
-    // Passes 1 - n: Max more inputs to the output.
-    for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 9) {
-      const float* i0 = *i++;
-      const float* i1 = 1 < k ? *i++ : i0;
-      const float* i2 = 2 < k ? *i++ : i0;
-      const float* i3 = 3 < k ? *i++ : i0;
-      const float* i4 = 4 < k ? *i++ : i0;
-      const float* i5 = 5 < k ? *i++ : i0;
-      const float* i6 = 6 < k ? *i++ : i0;
-      const float* i7 = 7 < k ? *i++ : i0;
-      const float* i8 = 8 < k ? *i++ : i0;
-      i0 = (const float*) ((uintptr_t) i0 + input_offset);
-      i1 = (const float*) ((uintptr_t) i1 + input_offset);
-      i2 = (const float*) ((uintptr_t) i2 + input_offset);
-      i3 = (const float*) ((uintptr_t) i3 + input_offset);
-      i4 = (const float*) ((uintptr_t) i4 + input_offset);
-      i5 = (const float*) ((uintptr_t) i5 + input_offset);
-      i6 = (const float*) ((uintptr_t) i6 + input_offset);
-      i7 = (const float*) ((uintptr_t) i7 + input_offset);
-      i8 = (const float*) ((uintptr_t) i8 + input_offset);
-
-      o = output;
-      size_t c = channels;
-      do {
-        size_t vl = __riscv_vsetvl_e32m1(c);
-
-        vfloat32m1_t vi0 = __riscv_vle32_v_f32m1(i0, vl); i0 += vl;
-        vfloat32m1_t vi1 = __riscv_vle32_v_f32m1(i1, vl); i1 += vl;
-        vfloat32m1_t vi2 = __riscv_vle32_v_f32m1(i2, vl); i2 += vl;
-        vfloat32m1_t vi3 = __riscv_vle32_v_f32m1(i3, vl); i3 += vl;
-        vfloat32m1_t vi4 = __riscv_vle32_v_f32m1(i4, vl); i4 += vl;
-        vfloat32m1_t vi5 = __riscv_vle32_v_f32m1(i5, vl); i5 += vl;
-        vfloat32m1_t vi6 = __riscv_vle32_v_f32m1(i6, vl); i6 += vl;
-        vfloat32m1_t vi7 = __riscv_vle32_v_f32m1(i7, vl); i7 += vl;
-        vfloat32m1_t vi8 = __riscv_vle32_v_f32m1(i8, vl); i8 += vl;
-
-        vfloat32m1_t vprev = __riscv_vle32_v_f32m1(o, vl);
-
-        vfloat32m1_t vmax01 = __riscv_vfmax(vi0, vi1, vl);
-        vfloat32m1_t vmax23 = __riscv_vfmax(vi2, vi3, vl);
-        vfloat32m1_t vmax45 = __riscv_vfmax(vi4, vi5, vl);
-        vfloat32m1_t vmax67 = __riscv_vfmax(vi6, vi7, vl);
-        vfloat32m1_t vmax018 = __riscv_vfmax(vmax01, vi8, vl);
-
-        vfloat32m1_t vmax2345 = __riscv_vfmax(vmax23, vmax45, vl);
-        vfloat32m1_t vmax01678 = __riscv_vfmax(vmax67, vmax018, vl);
-        vfloat32m1_t vmax012345678 = __riscv_vfmax(vmax2345, vmax01678, vl);
-
-        vfloat32m1_t vacc = __riscv_vfmax(vprev, vmax012345678, vl);
-        vacc = __riscv_vfmin(vacc, output_max, vl);
-        __riscv_vse32_v_f32m1(o, vacc, vl); o += vl;
-
-        c -= vl;
-      } while (c != 0);
-    }
-    input = (const float**) ((uintptr_t) input + input_increment);
-    input_offset += input_pixel_stride;
-    output = (float*) ((uintptr_t) output + output_increment);
-  } while (--output_pixels != 0);
-}
diff --git a/src/s8-maxpool/gen/s8-maxpool-9p-minmax-rvv-u1v.c b/src/s8-maxpool/gen/s8-maxpool-9p-minmax-rvv-u1v.c
deleted file mode 100644
index 3114918b9b0..00000000000
--- a/src/s8-maxpool/gen/s8-maxpool-9p-minmax-rvv-u1v.c
+++ /dev/null
@@ -1,147 +0,0 @@
-// clang-format off
-// Auto-generated file. Do not edit!
-//   Template: src/f32-maxpool/rvv.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2026 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-#include "src/xnnpack/maxpool.h"
-#include <riscv_vector.h>
-
-
-void xnn_s8_maxpool_minmax_ukernel_9p__rvv_u1v(
-    size_t output_pixels,
-    size_t kernel_elements,
-    size_t channels,
-    const int8_t** input,
-    size_t input_offset,
-    size_t input_pixel_stride,
-    int8_t* output,
-    size_t input_increment,
-    size_t output_increment,
-    const struct xnn_s8_minmax_params* restrict params)
-{
-  assert(output_pixels != 0);
-  assert(kernel_elements != 0);
-  assert(channels != 0);
-
-  const int8_t output_min = params->scalar.min;
-  const int8_t output_max = params->scalar.max;
-  do {
-    const int8_t** i = input;
-
-    // First pass: load the inputs, store the max pool in the output.
-    const int8_t* i0 = *i++;
-    const int8_t* i1 = 1 < kernel_elements ? *i++ : i0;
-    const int8_t* i2 = 2 < kernel_elements ? *i++ : i0;
-    const int8_t* i3 = 3 < kernel_elements ? *i++ : i0;
-    const int8_t* i4 = 4 < kernel_elements ? *i++ : i0;
-    const int8_t* i5 = 5 < kernel_elements ? *i++ : i0;
-    const int8_t* i6 = 6 < kernel_elements ? *i++ : i0;
-    const int8_t* i7 = 7 < kernel_elements ? *i++ : i0;
-    const int8_t* i8 = 8 < kernel_elements ? *i++ : i0;
-    i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
-    i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
-    i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
-    i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
-    i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
-    i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
-    i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
-    i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
-    i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
-
-    int8_t* o = output;
-    size_t c = channels;
-    do {
-      size_t vl = __riscv_vsetvl_e8m1(c);
-      vint8m1_t vi0 = __riscv_vle8_v_i8m1(i0, vl); i0 += vl;
-      vint8m1_t vi1 = __riscv_vle8_v_i8m1(i1, vl); i1 += vl;
-      vint8m1_t vi2 = __riscv_vle8_v_i8m1(i2, vl); i2 += vl;
-      vint8m1_t vi3 = __riscv_vle8_v_i8m1(i3, vl); i3 += vl;
-      vint8m1_t vi4 = __riscv_vle8_v_i8m1(i4, vl); i4 += vl;
-      vint8m1_t vi5 = __riscv_vle8_v_i8m1(i5, vl); i5 += vl;
-      vint8m1_t vi6 = __riscv_vle8_v_i8m1(i6, vl); i6 += vl;
-      vint8m1_t vi7 = __riscv_vle8_v_i8m1(i7, vl); i7 += vl;
-      vint8m1_t vi8 = __riscv_vle8_v_i8m1(i8, vl); i8 += vl;
-
-      vint8m1_t vmax01 = __riscv_vmax(vi0, vi1, vl);
-      vint8m1_t vmax23 = __riscv_vmax(vi2, vi3, vl);
-      vint8m1_t vmax45 = __riscv_vmax(vi4, vi5, vl);
-      vint8m1_t vmax67 = __riscv_vmax(vi6, vi7, vl);
-      vint8m1_t vmax018 = __riscv_vmax(vmax01, vi8, vl);
-
-      vint8m1_t vmax2345 = __riscv_vmax(vmax23, vmax45, vl);
-      vint8m1_t vmax01678 = __riscv_vmax(vmax67, vmax018, vl);
-      vint8m1_t vacc = __riscv_vmax(vmax2345, vmax01678, vl);
-
-      vacc = __riscv_vmax(vacc, output_min, vl);
-      vacc = __riscv_vmin(vacc, output_max, vl);
-      __riscv_vse8_v_i8m1(o, vacc, vl); o += vl;
-
-      c -= vl;
-    } while (c != 0);
-
-    // Passes 1 - n: Max more inputs to the output.
-    for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 9) {
-      const int8_t* i0 = *i++;
-      const int8_t* i1 = 1 < k ? *i++ : i0;
-      const int8_t* i2 = 2 < k ? *i++ : i0;
-      const int8_t* i3 = 3 < k ? *i++ : i0;
-      const int8_t* i4 = 4 < k ? *i++ : i0;
-      const int8_t* i5 = 5 < k ? *i++ : i0;
-      const int8_t* i6 = 6 < k ? *i++ : i0;
-      const int8_t* i7 = 7 < k ? *i++ : i0;
-      const int8_t* i8 = 8 < k ? *i++ : i0;
-      i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
-      i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
-      i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
-      i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
-      i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
-      i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
-      i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
-      i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
-      i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
-
-      o = output;
-      size_t c = channels;
-      do {
-        size_t vl = __riscv_vsetvl_e8m1(c);
-
-        vint8m1_t vi0 = __riscv_vle8_v_i8m1(i0, vl); i0 += vl;
-        vint8m1_t vi1 = __riscv_vle8_v_i8m1(i1, vl); i1 += vl;
-        vint8m1_t vi2 = __riscv_vle8_v_i8m1(i2, vl); i2 += vl;
-        vint8m1_t vi3 = __riscv_vle8_v_i8m1(i3, vl); i3 += vl;
-        vint8m1_t vi4 = __riscv_vle8_v_i8m1(i4, vl); i4 += vl;
-        vint8m1_t vi5 = __riscv_vle8_v_i8m1(i5, vl); i5 += vl;
-        vint8m1_t vi6 = __riscv_vle8_v_i8m1(i6, vl); i6 += vl;
-        vint8m1_t vi7 = __riscv_vle8_v_i8m1(i7, vl); i7 += vl;
-        vint8m1_t vi8 = __riscv_vle8_v_i8m1(i8, vl); i8 += vl;
-
-        vint8m1_t vprev = __riscv_vle8_v_i8m1(o, vl);
-
-        vint8m1_t vmax01 = __riscv_vmax(vi0, vi1, vl);
-        vint8m1_t vmax23 = __riscv_vmax(vi2, vi3, vl);
-        vint8m1_t vmax45 = __riscv_vmax(vi4, vi5, vl);
-        vint8m1_t vmax67 = __riscv_vmax(vi6, vi7, vl);
-        vint8m1_t vmax018 = __riscv_vmax(vmax01, vi8, vl);
-
-        vint8m1_t vmax2345 = __riscv_vmax(vmax23, vmax45, vl);
-        vint8m1_t vmax01678 = __riscv_vmax(vmax67, vmax018, vl);
-        vint8m1_t vmax012345678 = __riscv_vmax(vmax2345, vmax01678, vl);
-
-        vint8m1_t vacc = __riscv_vmax(vprev, vmax012345678, vl);
-        vacc = __riscv_vmin(vacc, output_max, vl);
-        __riscv_vse8_v_i8m1(o, vacc, vl); o += vl;
-
-        c -= vl;
-      } while (c != 0);
-    }
-    input = (const int8_t**) ((uintptr_t) input + input_increment);
-    input_offset += input_pixel_stride;
-    output = (int8_t*) ((uintptr_t) output + output_increment);
-  } while (--output_pixels != 0);
-}
diff --git a/src/s8-maxpool/s8-maxpool-minmax.inc b/src/s8-maxpool/s8-maxpool-minmax.inc
index 498299cb8d6..c385f36fd66 100644
--- a/src/s8-maxpool/s8-maxpool-minmax.inc
+++ b/src/s8-maxpool/s8-maxpool-minmax.inc
@@ -18,7 +18,6 @@ XNN_UKERNEL(xnn_arch_none, xnn_s8_maxpool_minmax_ukernel_9p__wasmsimd_u16, 16, 9
 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
 
 #if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR
-XNN_UKERNEL(xnn_arch_riscv_vector, xnn_s8_maxpool_minmax_ukernel_9p__rvv_u1v, (1*xnn_init_hardware_config()->vlenb/sizeof(int8_t)), 9, int8_t, struct xnn_s8_minmax_params, xnn_init_s8_minmax_scalar_params)
 XNN_UKERNEL(xnn_arch_riscv_vector, xnn_s8_maxpool_minmax_ukernel_9p__rvv_u2v, (2*xnn_init_hardware_config()->vlenb/sizeof(int8_t)), 9, int8_t, struct xnn_s8_minmax_params, xnn_init_s8_minmax_scalar_params)
 #endif  // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR
 
diff --git a/src/u8-maxpool/gen/u8-maxpool-9p-minmax-rvv-u1v.c b/src/u8-maxpool/gen/u8-maxpool-9p-minmax-rvv-u1v.c
deleted file mode 100644
index c0561f3d658..00000000000
--- a/src/u8-maxpool/gen/u8-maxpool-9p-minmax-rvv-u1v.c
+++ /dev/null
@@ -1,147 +0,0 @@
-// clang-format off
-// Auto-generated file. Do not edit!
-//   Template: src/f32-maxpool/rvv.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2026 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-#include "src/xnnpack/maxpool.h"
-#include <riscv_vector.h>
-
-
-void xnn_u8_maxpool_minmax_ukernel_9p__rvv_u1v(
-    size_t output_pixels,
-    size_t kernel_elements,
-    size_t channels,
-    const uint8_t** input,
-    size_t input_offset,
-    size_t input_pixel_stride,
-    uint8_t* output,
-    size_t input_increment,
-    size_t output_increment,
-    const struct xnn_u8_minmax_params* restrict params)
-{
-  assert(output_pixels != 0);
-  assert(kernel_elements != 0);
-  assert(channels != 0);
-
-  const uint8_t output_min = params->scalar.min;
-  const uint8_t output_max = params->scalar.max;
-  do {
-    const uint8_t** i = input;
-
-    // First pass: load the inputs, store the max pool in the output.
-    const uint8_t* i0 = *i++;
-    const uint8_t* i1 = 1 < kernel_elements ? *i++ : i0;
-    const uint8_t* i2 = 2 < kernel_elements ? *i++ : i0;
-    const uint8_t* i3 = 3 < kernel_elements ? *i++ : i0;
-    const uint8_t* i4 = 4 < kernel_elements ? *i++ : i0;
-    const uint8_t* i5 = 5 < kernel_elements ? *i++ : i0;
-    const uint8_t* i6 = 6 < kernel_elements ? *i++ : i0;
-    const uint8_t* i7 = 7 < kernel_elements ? *i++ : i0;
-    const uint8_t* i8 = 8 < kernel_elements ? *i++ : i0;
-    i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
-    i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
-    i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
-    i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
-    i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
-    i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
-    i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
-    i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
-    i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
-
-    uint8_t* o = output;
-    size_t c = channels;
-    do {
-      size_t vl = __riscv_vsetvl_e8m1(c);
-      vuint8m1_t vi0 = __riscv_vle8_v_u8m1(i0, vl); i0 += vl;
-      vuint8m1_t vi1 = __riscv_vle8_v_u8m1(i1, vl); i1 += vl;
-      vuint8m1_t vi2 = __riscv_vle8_v_u8m1(i2, vl); i2 += vl;
-      vuint8m1_t vi3 = __riscv_vle8_v_u8m1(i3, vl); i3 += vl;
-      vuint8m1_t vi4 = __riscv_vle8_v_u8m1(i4, vl); i4 += vl;
-      vuint8m1_t vi5 = __riscv_vle8_v_u8m1(i5, vl); i5 += vl;
-      vuint8m1_t vi6 = __riscv_vle8_v_u8m1(i6, vl); i6 += vl;
-      vuint8m1_t vi7 = __riscv_vle8_v_u8m1(i7, vl); i7 += vl;
-      vuint8m1_t vi8 = __riscv_vle8_v_u8m1(i8, vl); i8 += vl;
-
-      vuint8m1_t vmax01 = __riscv_vmaxu(vi0, vi1, vl);
-      vuint8m1_t vmax23 = __riscv_vmaxu(vi2, vi3, vl);
-      vuint8m1_t vmax45 = __riscv_vmaxu(vi4, vi5, vl);
-      vuint8m1_t vmax67 = __riscv_vmaxu(vi6, vi7, vl);
-      vuint8m1_t vmax018 = __riscv_vmaxu(vmax01, vi8, vl);
-
-      vuint8m1_t vmax2345 = __riscv_vmaxu(vmax23, vmax45, vl);
-      vuint8m1_t vmax01678 = __riscv_vmaxu(vmax67, vmax018, vl);
-      vuint8m1_t vacc = __riscv_vmaxu(vmax2345, vmax01678, vl);
-
-      vacc = __riscv_vmaxu(vacc, output_min, vl);
-      vacc = __riscv_vminu(vacc, output_max, vl);
-      __riscv_vse8_v_u8m1(o, vacc, vl); o += vl;
-
-      c -= vl;
-    } while (c != 0);
-
-    // Passes 1 - n: Max more inputs to the output.
-    for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 9) {
-      const uint8_t* i0 = *i++;
-      const uint8_t* i1 = 1 < k ? *i++ : i0;
-      const uint8_t* i2 = 2 < k ? *i++ : i0;
-      const uint8_t* i3 = 3 < k ? *i++ : i0;
-      const uint8_t* i4 = 4 < k ? *i++ : i0;
-      const uint8_t* i5 = 5 < k ? *i++ : i0;
-      const uint8_t* i6 = 6 < k ? *i++ : i0;
-      const uint8_t* i7 = 7 < k ? *i++ : i0;
-      const uint8_t* i8 = 8 < k ? *i++ : i0;
-      i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
-      i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
-      i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
-      i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
-      i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
-      i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
-      i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
-      i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
-      i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
-
-      o = output;
-      size_t c = channels;
-      do {
-        size_t vl = __riscv_vsetvl_e8m1(c);
-
-        vuint8m1_t vi0 = __riscv_vle8_v_u8m1(i0, vl); i0 += vl;
-        vuint8m1_t vi1 = __riscv_vle8_v_u8m1(i1, vl); i1 += vl;
-        vuint8m1_t vi2 = __riscv_vle8_v_u8m1(i2, vl); i2 += vl;
-        vuint8m1_t vi3 = __riscv_vle8_v_u8m1(i3, vl); i3 += vl;
-        vuint8m1_t vi4 = __riscv_vle8_v_u8m1(i4, vl); i4 += vl;
-        vuint8m1_t vi5 = __riscv_vle8_v_u8m1(i5, vl); i5 += vl;
-        vuint8m1_t vi6 = __riscv_vle8_v_u8m1(i6, vl); i6 += vl;
-        vuint8m1_t vi7 = __riscv_vle8_v_u8m1(i7, vl); i7 += vl;
-        vuint8m1_t vi8 = __riscv_vle8_v_u8m1(i8, vl); i8 += vl;
-
-        vuint8m1_t vprev = __riscv_vle8_v_u8m1(o, vl);
-
-        vuint8m1_t vmax01 = __riscv_vmaxu(vi0, vi1, vl);
-        vuint8m1_t vmax23 = __riscv_vmaxu(vi2, vi3, vl);
-        vuint8m1_t vmax45 = __riscv_vmaxu(vi4, vi5, vl);
-        vuint8m1_t vmax67 = __riscv_vmaxu(vi6, vi7, vl);
-        vuint8m1_t vmax018 = __riscv_vmaxu(vmax01, vi8, vl);
-
-        vuint8m1_t vmax2345 = __riscv_vmaxu(vmax23, vmax45, vl);
-        vuint8m1_t vmax01678 = __riscv_vmaxu(vmax67, vmax018, vl);
-        vuint8m1_t vmax012345678 = __riscv_vmaxu(vmax2345, vmax01678, vl);
-
-        vuint8m1_t vacc = __riscv_vmaxu(vprev, vmax012345678, vl);
-        vacc = __riscv_vminu(vacc, output_max, vl);
-        __riscv_vse8_v_u8m1(o, vacc, vl); o += vl;
-
-        c -= vl;
-      } while (c != 0);
-    }
-    input = (const uint8_t**) ((uintptr_t) input + input_increment);
-    input_offset += input_pixel_stride;
-    output = (uint8_t*) ((uintptr_t) output + output_increment);
-  } while (--output_pixels != 0);
-}
diff --git a/src/u8-maxpool/u8-maxpool-minmax.inc b/src/u8-maxpool/u8-maxpool-minmax.inc
index f090da6c6c4..6cff5d4938f 100644
--- a/src/u8-maxpool/u8-maxpool-minmax.inc
+++ b/src/u8-maxpool/u8-maxpool-minmax.inc
@@ -18,7 +18,6 @@ XNN_UKERNEL(xnn_arch_none, xnn_u8_maxpool_minmax_ukernel_9p__wasmsimd_u16, 16, 9
 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
 
 #if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR
-XNN_UKERNEL(xnn_arch_riscv_vector, xnn_u8_maxpool_minmax_ukernel_9p__rvv_u1v, (1*xnn_init_hardware_config()->vlenb/sizeof(uint8_t)), 9, uint8_t, struct xnn_u8_minmax_params, xnn_init_u8_minmax_scalar_params)
 XNN_UKERNEL(xnn_arch_riscv_vector, xnn_u8_maxpool_minmax_ukernel_9p__rvv_u2v, (2*xnn_init_hardware_config()->vlenb/sizeof(uint8_t)), 9, uint8_t, struct xnn_u8_minmax_params, xnn_init_u8_minmax_scalar_params)
 #endif  // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR