From 7596bbce20b1c1b39e57d83ac022324e8edcaa4b Mon Sep 17 00:00:00 2001 From: Ken Unger Date: Mon, 2 Mar 2026 11:11:08 -0800 Subject: [PATCH 1/2] rvv maxpool for f32, f16, s8, u8 --- cmake/gen/rvv_microkernels.cmake | 4 + cmake/gen/rvvfp16arith_microkernels.cmake | 2 + gen/rvv_microkernels.bzl | 4 + gen/rvvfp16arith_microkernels.bzl | 2 + scripts/generate-f16-maxpool.sh | 4 + scripts/generate-f32-maxpool.sh | 4 +- scripts/generate-s8-maxpool.sh | 4 + scripts/generate-u8-maxpool.sh | 4 + src/configs/maxpool-config.c | 14 ++ src/f16-maxpool/f16-maxpool-minmax.inc | 4 + .../f16-maxpool-9p-minmax-rvvfp16arith-u1v.c | 147 ++++++++++++++ .../f16-maxpool-9p-minmax-rvvfp16arith-u2v.c | 147 ++++++++++++++ .../gen/f32-maxpool-9p-minmax-rvv-u1v.c | 159 ++++++++-------- .../gen/f32-maxpool-9p-minmax-rvv-u2v.c | 159 ++++++++-------- src/f32-maxpool/rvv.c.in | 180 ++++++++---------- .../gen/s8-maxpool-9p-minmax-rvv-u1v.c | 147 ++++++++++++++ .../gen/s8-maxpool-9p-minmax-rvv-u2v.c | 147 ++++++++++++++ src/s8-maxpool/s8-maxpool-minmax.inc | 7 +- .../gen/u8-maxpool-9p-minmax-rvv-u1v.c | 147 ++++++++++++++ .../gen/u8-maxpool-9p-minmax-rvv-u2v.c | 147 ++++++++++++++ src/u8-maxpool/u8-maxpool-minmax.inc | 7 +- 21 files changed, 1183 insertions(+), 257 deletions(-) create mode 100644 src/f16-maxpool/gen/f16-maxpool-9p-minmax-rvvfp16arith-u1v.c create mode 100644 src/f16-maxpool/gen/f16-maxpool-9p-minmax-rvvfp16arith-u2v.c create mode 100644 src/s8-maxpool/gen/s8-maxpool-9p-minmax-rvv-u1v.c create mode 100644 src/s8-maxpool/gen/s8-maxpool-9p-minmax-rvv-u2v.c create mode 100644 src/u8-maxpool/gen/u8-maxpool-9p-minmax-rvv-u1v.c create mode 100644 src/u8-maxpool/gen/u8-maxpool-9p-minmax-rvv-u2v.c diff --git a/cmake/gen/rvv_microkernels.cmake b/cmake/gen/rvv_microkernels.cmake index 38117bc1754..04d5e143861 100644 --- a/cmake/gen/rvv_microkernels.cmake +++ b/cmake/gen/rvv_microkernels.cmake @@ -103,7 +103,9 @@ SET(PROD_RVV_MICROKERNEL_SRCS src/qu8-vlrelu/gen/qu8-vlrelu-rvv-u2v.c src/qu8-vmul/gen/qu8-vmul-minmax-f32-rvv-u2v.c src/qu8-vmulc/gen/qu8-vmulc-minmax-f32-rvv-u2v.c + src/s8-maxpool/gen/s8-maxpool-9p-minmax-rvv-u2v.c src/s8-vclamp/gen/s8-vclamp-rvv-u4v.c + src/u8-maxpool/gen/u8-maxpool-9p-minmax-rvv-u2v.c src/u8-vclamp/gen/u8-vclamp-rvv-u4v.c src/x32-packw/gen/x32-packw-x4v-gemm-goi-rvv-u8.c src/x32-transposec/gen/x32-transposec-4x4-rvv.c @@ -258,9 +260,11 @@ SET(NON_PROD_RVV_MICROKERNEL_SRCS src/qu8-vlrelu/gen/qu8-vlrelu-rvv-u1v.c src/qu8-vmul/gen/qu8-vmul-minmax-f32-rvv-u1v.c src/qu8-vmulc/gen/qu8-vmulc-minmax-f32-rvv-u1v.c + src/s8-maxpool/gen/s8-maxpool-9p-minmax-rvv-u1v.c src/s8-vclamp/gen/s8-vclamp-rvv-u1v.c src/s8-vclamp/gen/s8-vclamp-rvv-u2v.c src/s8-vclamp/gen/s8-vclamp-rvv-u8v.c + src/u8-maxpool/gen/u8-maxpool-9p-minmax-rvv-u1v.c src/u8-vclamp/gen/u8-vclamp-rvv-u1v.c src/u8-vclamp/gen/u8-vclamp-rvv-u2v.c src/u8-vclamp/gen/u8-vclamp-rvv-u8v.c diff --git a/cmake/gen/rvvfp16arith_microkernels.cmake b/cmake/gen/rvvfp16arith_microkernels.cmake index 543a311c1cc..ac5f45c7724 100644 --- a/cmake/gen/rvvfp16arith_microkernels.cmake +++ b/cmake/gen/rvvfp16arith_microkernels.cmake @@ -19,6 +19,7 @@ SET(PROD_RVVFP16ARITH_MICROKERNEL_SRCS src/f16-gemm/gen/f16-gemm-7x4v-minmax-rvvfp16arith.c src/f16-igemm/gen/f16-igemm-1x4v-minmax-rvvfp16arith.c src/f16-igemm/gen/f16-igemm-7x4v-minmax-rvvfp16arith.c + src/f16-maxpool/gen/f16-maxpool-9p-minmax-rvvfp16arith-u2v.c src/f16-spmm/gen/f16-spmm-8vx1-minmax-rvvfp16arith.c src/f16-vbinary/gen/f16-vadd-rvvfp16arith-u8v.c src/f16-vbinary/gen/f16-vaddc-rvvfp16arith-u8v.c @@ -54,6 +55,7 @@ SET(NON_PROD_RVVFP16ARITH_MICROKERNEL_SRCS src/f16-f32-vcvt/gen/f16-f32-vcvt-rvvfp16arith-u2v.c src/f16-gemm/gen/f16-gemm-4x4v-minmax-rvvfp16arith.c src/f16-igemm/gen/f16-igemm-4x4v-minmax-rvvfp16arith.c + src/f16-maxpool/gen/f16-maxpool-9p-minmax-rvvfp16arith-u1v.c src/f16-spmm/gen/f16-spmm-1vx1-minmax-rvvfp16arith.c src/f16-spmm/gen/f16-spmm-2vx1-minmax-rvvfp16arith.c src/f16-spmm/gen/f16-spmm-4vx1-minmax-rvvfp16arith.c diff --git a/gen/rvv_microkernels.bzl b/gen/rvv_microkernels.bzl index d864e7d10be..fb7468b48b3 100644 --- a/gen/rvv_microkernels.bzl +++ b/gen/rvv_microkernels.bzl @@ -99,7 +99,9 @@ PROD_RVV_MICROKERNEL_SRCS = [ "src/qu8-vlrelu/gen/qu8-vlrelu-rvv-u2v.c", "src/qu8-vmul/gen/qu8-vmul-minmax-f32-rvv-u2v.c", "src/qu8-vmulc/gen/qu8-vmulc-minmax-f32-rvv-u2v.c", + "src/s8-maxpool/gen/s8-maxpool-9p-minmax-rvv-u2v.c", "src/s8-vclamp/gen/s8-vclamp-rvv-u4v.c", + "src/u8-maxpool/gen/u8-maxpool-9p-minmax-rvv-u2v.c", "src/u8-vclamp/gen/u8-vclamp-rvv-u4v.c", "src/x32-packw/gen/x32-packw-x4v-gemm-goi-rvv-u8.c", "src/x32-transposec/gen/x32-transposec-4x4-rvv.c", @@ -255,9 +257,11 @@ NON_PROD_RVV_MICROKERNEL_SRCS = [ "src/qu8-vlrelu/gen/qu8-vlrelu-rvv-u1v.c", "src/qu8-vmul/gen/qu8-vmul-minmax-f32-rvv-u1v.c", "src/qu8-vmulc/gen/qu8-vmulc-minmax-f32-rvv-u1v.c", + "src/s8-maxpool/gen/s8-maxpool-9p-minmax-rvv-u1v.c", "src/s8-vclamp/gen/s8-vclamp-rvv-u1v.c", "src/s8-vclamp/gen/s8-vclamp-rvv-u2v.c", "src/s8-vclamp/gen/s8-vclamp-rvv-u8v.c", + "src/u8-maxpool/gen/u8-maxpool-9p-minmax-rvv-u1v.c", "src/u8-vclamp/gen/u8-vclamp-rvv-u1v.c", "src/u8-vclamp/gen/u8-vclamp-rvv-u2v.c", "src/u8-vclamp/gen/u8-vclamp-rvv-u8v.c", diff --git a/gen/rvvfp16arith_microkernels.bzl b/gen/rvvfp16arith_microkernels.bzl index 2c098847bf7..acbae4d7c04 100644 --- a/gen/rvvfp16arith_microkernels.bzl +++ b/gen/rvvfp16arith_microkernels.bzl @@ -15,6 +15,7 @@ PROD_RVVFP16ARITH_MICROKERNEL_SRCS = [ "src/f16-gemm/gen/f16-gemm-7x4v-minmax-rvvfp16arith.c", "src/f16-igemm/gen/f16-igemm-1x4v-minmax-rvvfp16arith.c", "src/f16-igemm/gen/f16-igemm-7x4v-minmax-rvvfp16arith.c", + "src/f16-maxpool/gen/f16-maxpool-9p-minmax-rvvfp16arith-u2v.c", "src/f16-spmm/gen/f16-spmm-8vx1-minmax-rvvfp16arith.c", "src/f16-vbinary/gen/f16-vadd-rvvfp16arith-u8v.c", "src/f16-vbinary/gen/f16-vaddc-rvvfp16arith-u8v.c", @@ -51,6 +52,7 @@ NON_PROD_RVVFP16ARITH_MICROKERNEL_SRCS = [ "src/f16-f32-vcvt/gen/f16-f32-vcvt-rvvfp16arith-u2v.c", "src/f16-gemm/gen/f16-gemm-4x4v-minmax-rvvfp16arith.c", "src/f16-igemm/gen/f16-igemm-4x4v-minmax-rvvfp16arith.c", + "src/f16-maxpool/gen/f16-maxpool-9p-minmax-rvvfp16arith-u1v.c", "src/f16-spmm/gen/f16-spmm-1vx1-minmax-rvvfp16arith.c", "src/f16-spmm/gen/f16-spmm-2vx1-minmax-rvvfp16arith.c", "src/f16-spmm/gen/f16-spmm-4vx1-minmax-rvvfp16arith.c", diff --git a/scripts/generate-f16-maxpool.sh b/scripts/generate-f16-maxpool.sh index 73fff4d8256..9bfb490c2cc 100755 --- a/scripts/generate-f16-maxpool.sh +++ b/scripts/generate-f16-maxpool.sh @@ -9,4 +9,8 @@ tools/xngen src/f32-maxpool/maxpool.c.in -D DATATYPE=f16 -D ARCH=neonfp16arith - tools/xngen src/f32-maxpool/maxpool.c.in -D DATATYPE=f16 -D ARCH=avx2 -D SIMD_SIZE=16 -o src/f16-maxpool/gen/f16-maxpool-9p-minmax-avx2-u16.c & tools/xngen src/f32-maxpool/maxpool.c.in -D DATATYPE=f16 -D ARCH=sse41 -D SIMD_SIZE=8 -o src/f16-maxpool/gen/f16-maxpool-9p-minmax-sse41-u8.c & +################################ RISC-V Vector ################################ +tools/xngen src/f32-maxpool/rvv.c.in -D DATATYPE=f16 -D LMUL=1 -o src/f16-maxpool/gen/f16-maxpool-9p-minmax-rvvfp16arith-u1v.c & +tools/xngen src/f32-maxpool/rvv.c.in -D DATATYPE=f16 -D LMUL=2 -o src/f16-maxpool/gen/f16-maxpool-9p-minmax-rvvfp16arith-u2v.c & + wait diff --git a/scripts/generate-f32-maxpool.sh b/scripts/generate-f32-maxpool.sh index 2a142c77987..4c15f633750 100755 --- a/scripts/generate-f32-maxpool.sh +++ b/scripts/generate-f32-maxpool.sh @@ -12,7 +12,7 @@ tools/xngen src/f32-maxpool/maxpool.c.in -D DATATYPE=f32 -D ARCH=neon -D SIM tools/xngen src/f32-maxpool/maxpool.c.in -D DATATYPE=f32 -D ARCH=hvx -D SIMD_SIZE=32 -o src/f32-maxpool/gen/f32-maxpool-9p-minmax-hvx-u32.c & ################################ RISC-V Vector ################################ -tools/xngen src/f32-maxpool/rvv.c.in -D LMUL=1 -o src/f32-maxpool/gen/f32-maxpool-9p-minmax-rvv-u1v.c & -tools/xngen src/f32-maxpool/rvv.c.in -D LMUL=2 -o src/f32-maxpool/gen/f32-maxpool-9p-minmax-rvv-u2v.c & +tools/xngen src/f32-maxpool/rvv.c.in -D DATATYPE=f32 -D LMUL=1 -o src/f32-maxpool/gen/f32-maxpool-9p-minmax-rvv-u1v.c & +tools/xngen src/f32-maxpool/rvv.c.in -D DATATYPE=f32 -D LMUL=2 -o src/f32-maxpool/gen/f32-maxpool-9p-minmax-rvv-u2v.c & wait diff --git a/scripts/generate-s8-maxpool.sh b/scripts/generate-s8-maxpool.sh index 0ae6063e03f..6b379899ffd 100755 --- a/scripts/generate-s8-maxpool.sh +++ b/scripts/generate-s8-maxpool.sh @@ -10,4 +10,8 @@ tools/xngen src/f32-maxpool/maxpool.c.in -D DATATYPE=s8 -D ARCH=sse41 -D SIMD_SI tools/xngen src/f32-maxpool/maxpool.c.in -D DATATYPE=s8 -D ARCH=wasmsimd -D SIMD_SIZE=16 -o src/s8-maxpool/gen/s8-maxpool-9p-minmax-wasmsimd-u16.c & tools/xngen src/f32-maxpool/maxpool.c.in -D DATATYPE=s8 -D ARCH=neon -D SIMD_SIZE=16 -o src/s8-maxpool/gen/s8-maxpool-9p-minmax-neon-u16.c & +################################ RISC-V Vector ################################# +tools/xngen src/f32-maxpool/rvv.c.in -D DATATYPE=s8 -D LMUL=1 -o src/s8-maxpool/gen/s8-maxpool-9p-minmax-rvv-u1v.c & +tools/xngen src/f32-maxpool/rvv.c.in -D DATATYPE=s8 -D LMUL=2 -o src/s8-maxpool/gen/s8-maxpool-9p-minmax-rvv-u2v.c & + wait diff --git a/scripts/generate-u8-maxpool.sh b/scripts/generate-u8-maxpool.sh index d7df2b21f3c..aa1b1f66faf 100755 --- a/scripts/generate-u8-maxpool.sh +++ b/scripts/generate-u8-maxpool.sh @@ -10,4 +10,8 @@ tools/xngen src/f32-maxpool/maxpool.c.in -D DATATYPE=u8 -D KERNEL_TILE=9 -D ARCH tools/xngen src/f32-maxpool/maxpool.c.in -D DATATYPE=u8 -D KERNEL_TILE=9 -D ARCH=wasmsimd -D SIMD_SIZE=16 -o src/u8-maxpool/gen/u8-maxpool-9p-minmax-wasmsimd-u16.c & tools/xngen src/f32-maxpool/maxpool.c.in -D DATATYPE=u8 -D KERNEL_TILE=9 -D ARCH=neon -D SIMD_SIZE=16 -o src/u8-maxpool/gen/u8-maxpool-9p-minmax-neon-u16.c & +################################ RISC-V Vector ################################# +tools/xngen src/f32-maxpool/rvv.c.in -D DATATYPE=u8 -D LMUL=1 -o src/u8-maxpool/gen/u8-maxpool-9p-minmax-rvv-u1v.c & +tools/xngen src/f32-maxpool/rvv.c.in -D DATATYPE=u8 -D LMUL=2 -o src/u8-maxpool/gen/u8-maxpool-9p-minmax-rvv-u2v.c & + wait diff --git a/src/configs/maxpool-config.c b/src/configs/maxpool-config.c index 4026b8d8e01..df48fc24774 100644 --- a/src/configs/maxpool-config.c +++ b/src/configs/maxpool-config.c @@ -65,6 +65,14 @@ static void init_f16_maxpool_config(void) { } else #endif ; // no f16 support + #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + assert(hardware_config != NULL); + (void) hardware_config; // May be unused. + if (hardware_config->arch_flags & xnn_arch_riscv_vector_fp16_arith) { + f16_maxpool_config.ukernel = XNN_INIT_MAXPOOL_UKERNEL(xnn_f16_maxpool_minmax_ukernel_9p__rvvfp16arith_u2v); + f16_maxpool_config.init.f16 = xnn_init_f16_minmax_scalar_params; + } #endif } @@ -144,6 +152,9 @@ static void init_s8_maxpool_config(void) { #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD s8_maxpool_config.ukernel = XNN_INIT_MAXPOOL_UKERNEL(xnn_s8_maxpool_minmax_ukernel_9p__wasmsimd_u16); s8_maxpool_config.init.s8 = xnn_init_s8_minmax_scalar_params; + #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR + s8_maxpool_config.ukernel = XNN_INIT_MAXPOOL_UKERNEL(xnn_s8_maxpool_minmax_ukernel_9p__rvv_u2v); + s8_maxpool_config.init.s8 = xnn_init_s8_minmax_scalar_params; #else s8_maxpool_config.ukernel = XNN_INIT_MAXPOOL_UKERNEL(xnn_s8_maxpool_minmax_ukernel_9p__scalar_u1); s8_maxpool_config.init.s8 = xnn_init_s8_minmax_scalar_params; @@ -178,6 +189,9 @@ static void init_u8_maxpool_config(void) { #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD u8_maxpool_config.ukernel = XNN_INIT_MAXPOOL_UKERNEL(xnn_u8_maxpool_minmax_ukernel_9p__wasmsimd_u16); u8_maxpool_config.init.u8 = xnn_init_u8_minmax_scalar_params; + #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR + u8_maxpool_config.ukernel = XNN_INIT_MAXPOOL_UKERNEL(xnn_u8_maxpool_minmax_ukernel_9p__rvv_u2v); + u8_maxpool_config.init.u8 = xnn_init_u8_minmax_scalar_params; #else u8_maxpool_config.ukernel = XNN_INIT_MAXPOOL_UKERNEL(xnn_u8_maxpool_minmax_ukernel_9p__scalar_u1); u8_maxpool_config.init.u8 = xnn_init_u8_minmax_scalar_params; diff --git a/src/f16-maxpool/f16-maxpool-minmax.inc b/src/f16-maxpool/f16-maxpool-minmax.inc index 1bff1d6df51..ebc7a65487c 100644 --- a/src/f16-maxpool/f16-maxpool-minmax.inc +++ b/src/f16-maxpool/f16-maxpool-minmax.inc @@ -17,3 +17,7 @@ XNN_UKERNEL(xnn_arch_x86_sse4_1, xnn_f16_maxpool_minmax_ukernel_9p__sse41_u8, 8, XNN_UKERNEL(xnn_arch_x86_avx2, xnn_f16_maxpool_minmax_ukernel_9p__avx2_u16, 16, 9, xnn_float16, struct xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) #endif // XNN_ENABLE_AVX2 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +#if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR +XNN_UKERNEL(xnn_arch_riscv_vector_fp16_arith, xnn_f16_maxpool_minmax_ukernel_9p__rvvfp16arith_u1v, (1*xnn_init_hardware_config()->vlenb/sizeof(xnn_float16)), 9, xnn_float16, struct xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_UKERNEL(xnn_arch_riscv_vector_fp16_arith, xnn_f16_maxpool_minmax_ukernel_9p__rvvfp16arith_u2v, (2*xnn_init_hardware_config()->vlenb/sizeof(xnn_float16)), 9, xnn_float16, struct xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +#endif // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR diff --git a/src/f16-maxpool/gen/f16-maxpool-9p-minmax-rvvfp16arith-u1v.c b/src/f16-maxpool/gen/f16-maxpool-9p-minmax-rvvfp16arith-u1v.c new file mode 100644 index 00000000000..266b3701f49 --- /dev/null +++ b/src/f16-maxpool/gen/f16-maxpool-9p-minmax-rvvfp16arith-u1v.c @@ -0,0 +1,147 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f32-maxpool/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2026 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include "src/xnnpack/maxpool.h" +#include + + +void xnn_f16_maxpool_minmax_ukernel_9p__rvvfp16arith_u1v( + size_t output_pixels, + size_t kernel_elements, + size_t channels, + const xnn_float16** input, + size_t input_offset, + size_t input_pixel_stride, + xnn_float16* output, + size_t input_increment, + size_t output_increment, + const struct xnn_f16_minmax_params* restrict params) +{ + assert(output_pixels != 0); + assert(kernel_elements != 0); + assert(channels != 0); + + const xnn_float16 output_min = params->scalar.min; + const xnn_float16 output_max = params->scalar.max; + do { + const xnn_float16** i = input; + + // First pass: load the inputs, store the max pool in the output. + const xnn_float16* i0 = *i++; + const xnn_float16* i1 = 1 < kernel_elements ? *i++ : i0; + const xnn_float16* i2 = 2 < kernel_elements ? *i++ : i0; + const xnn_float16* i3 = 3 < kernel_elements ? *i++ : i0; + const xnn_float16* i4 = 4 < kernel_elements ? *i++ : i0; + const xnn_float16* i5 = 5 < kernel_elements ? *i++ : i0; + const xnn_float16* i6 = 6 < kernel_elements ? *i++ : i0; + const xnn_float16* i7 = 7 < kernel_elements ? *i++ : i0; + const xnn_float16* i8 = 8 < kernel_elements ? *i++ : i0; + i0 = (const xnn_float16*) ((uintptr_t) i0 + input_offset); + i1 = (const xnn_float16*) ((uintptr_t) i1 + input_offset); + i2 = (const xnn_float16*) ((uintptr_t) i2 + input_offset); + i3 = (const xnn_float16*) ((uintptr_t) i3 + input_offset); + i4 = (const xnn_float16*) ((uintptr_t) i4 + input_offset); + i5 = (const xnn_float16*) ((uintptr_t) i5 + input_offset); + i6 = (const xnn_float16*) ((uintptr_t) i6 + input_offset); + i7 = (const xnn_float16*) ((uintptr_t) i7 + input_offset); + i8 = (const xnn_float16*) ((uintptr_t) i8 + input_offset); + + xnn_float16* o = output; + size_t c = channels; + do { + size_t vl = __riscv_vsetvl_e16m1(c); + vfloat16m1_t vi0 = __riscv_vle16_v_f16m1(i0, vl); i0 += vl; + vfloat16m1_t vi1 = __riscv_vle16_v_f16m1(i1, vl); i1 += vl; + vfloat16m1_t vi2 = __riscv_vle16_v_f16m1(i2, vl); i2 += vl; + vfloat16m1_t vi3 = __riscv_vle16_v_f16m1(i3, vl); i3 += vl; + vfloat16m1_t vi4 = __riscv_vle16_v_f16m1(i4, vl); i4 += vl; + vfloat16m1_t vi5 = __riscv_vle16_v_f16m1(i5, vl); i5 += vl; + vfloat16m1_t vi6 = __riscv_vle16_v_f16m1(i6, vl); i6 += vl; + vfloat16m1_t vi7 = __riscv_vle16_v_f16m1(i7, vl); i7 += vl; + vfloat16m1_t vi8 = __riscv_vle16_v_f16m1(i8, vl); i8 += vl; + + vfloat16m1_t vmax01 = __riscv_vfmax(vi0, vi1, vl); + vfloat16m1_t vmax23 = __riscv_vfmax(vi2, vi3, vl); + vfloat16m1_t vmax45 = __riscv_vfmax(vi4, vi5, vl); + vfloat16m1_t vmax67 = __riscv_vfmax(vi6, vi7, vl); + vfloat16m1_t vmax018 = __riscv_vfmax(vmax01, vi8, vl); + + vfloat16m1_t vmax2345 = __riscv_vfmax(vmax23, vmax45, vl); + vfloat16m1_t vmax01678 = __riscv_vfmax(vmax67, vmax018, vl); + vfloat16m1_t vacc = __riscv_vfmax(vmax2345, vmax01678, vl); + + vacc = __riscv_vfmax(vacc, output_min, vl); + vacc = __riscv_vfmin(vacc, output_max, vl); + __riscv_vse16_v_f16m1(o, vacc, vl); o += vl; + + c -= vl; + } while (c != 0); + + // Passes 1 - n: Max more inputs to the output. + for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 9) { + const xnn_float16* i0 = *i++; + const xnn_float16* i1 = 1 < k ? *i++ : i0; + const xnn_float16* i2 = 2 < k ? *i++ : i0; + const xnn_float16* i3 = 3 < k ? *i++ : i0; + const xnn_float16* i4 = 4 < k ? *i++ : i0; + const xnn_float16* i5 = 5 < k ? *i++ : i0; + const xnn_float16* i6 = 6 < k ? *i++ : i0; + const xnn_float16* i7 = 7 < k ? *i++ : i0; + const xnn_float16* i8 = 8 < k ? *i++ : i0; + i0 = (const xnn_float16*) ((uintptr_t) i0 + input_offset); + i1 = (const xnn_float16*) ((uintptr_t) i1 + input_offset); + i2 = (const xnn_float16*) ((uintptr_t) i2 + input_offset); + i3 = (const xnn_float16*) ((uintptr_t) i3 + input_offset); + i4 = (const xnn_float16*) ((uintptr_t) i4 + input_offset); + i5 = (const xnn_float16*) ((uintptr_t) i5 + input_offset); + i6 = (const xnn_float16*) ((uintptr_t) i6 + input_offset); + i7 = (const xnn_float16*) ((uintptr_t) i7 + input_offset); + i8 = (const xnn_float16*) ((uintptr_t) i8 + input_offset); + + o = output; + size_t c = channels; + do { + size_t vl = __riscv_vsetvl_e16m1(c); + + vfloat16m1_t vi0 = __riscv_vle16_v_f16m1(i0, vl); i0 += vl; + vfloat16m1_t vi1 = __riscv_vle16_v_f16m1(i1, vl); i1 += vl; + vfloat16m1_t vi2 = __riscv_vle16_v_f16m1(i2, vl); i2 += vl; + vfloat16m1_t vi3 = __riscv_vle16_v_f16m1(i3, vl); i3 += vl; + vfloat16m1_t vi4 = __riscv_vle16_v_f16m1(i4, vl); i4 += vl; + vfloat16m1_t vi5 = __riscv_vle16_v_f16m1(i5, vl); i5 += vl; + vfloat16m1_t vi6 = __riscv_vle16_v_f16m1(i6, vl); i6 += vl; + vfloat16m1_t vi7 = __riscv_vle16_v_f16m1(i7, vl); i7 += vl; + vfloat16m1_t vi8 = __riscv_vle16_v_f16m1(i8, vl); i8 += vl; + + vfloat16m1_t vprev = __riscv_vle16_v_f16m1(o, vl); + + vfloat16m1_t vmax01 = __riscv_vfmax(vi0, vi1, vl); + vfloat16m1_t vmax23 = __riscv_vfmax(vi2, vi3, vl); + vfloat16m1_t vmax45 = __riscv_vfmax(vi4, vi5, vl); + vfloat16m1_t vmax67 = __riscv_vfmax(vi6, vi7, vl); + vfloat16m1_t vmax018 = __riscv_vfmax(vmax01, vi8, vl); + + vfloat16m1_t vmax2345 = __riscv_vfmax(vmax23, vmax45, vl); + vfloat16m1_t vmax01678 = __riscv_vfmax(vmax67, vmax018, vl); + vfloat16m1_t vmax012345678 = __riscv_vfmax(vmax2345, vmax01678, vl); + + vfloat16m1_t vacc = __riscv_vfmax(vprev, vmax012345678, vl); + vacc = __riscv_vfmin(vacc, output_max, vl); + __riscv_vse16_v_f16m1(o, vacc, vl); o += vl; + + c -= vl; + } while (c != 0); + } + input = (const xnn_float16**) ((uintptr_t) input + input_increment); + input_offset += input_pixel_stride; + output = (xnn_float16*) ((uintptr_t) output + output_increment); + } while (--output_pixels != 0); +} diff --git a/src/f16-maxpool/gen/f16-maxpool-9p-minmax-rvvfp16arith-u2v.c b/src/f16-maxpool/gen/f16-maxpool-9p-minmax-rvvfp16arith-u2v.c new file mode 100644 index 00000000000..6bb8937af18 --- /dev/null +++ b/src/f16-maxpool/gen/f16-maxpool-9p-minmax-rvvfp16arith-u2v.c @@ -0,0 +1,147 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f32-maxpool/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2026 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include "src/xnnpack/maxpool.h" +#include + + +void xnn_f16_maxpool_minmax_ukernel_9p__rvvfp16arith_u2v( + size_t output_pixels, + size_t kernel_elements, + size_t channels, + const xnn_float16** input, + size_t input_offset, + size_t input_pixel_stride, + xnn_float16* output, + size_t input_increment, + size_t output_increment, + const struct xnn_f16_minmax_params* restrict params) +{ + assert(output_pixels != 0); + assert(kernel_elements != 0); + assert(channels != 0); + + const xnn_float16 output_min = params->scalar.min; + const xnn_float16 output_max = params->scalar.max; + do { + const xnn_float16** i = input; + + // First pass: load the inputs, store the max pool in the output. + const xnn_float16* i0 = *i++; + const xnn_float16* i1 = 1 < kernel_elements ? *i++ : i0; + const xnn_float16* i2 = 2 < kernel_elements ? *i++ : i0; + const xnn_float16* i3 = 3 < kernel_elements ? *i++ : i0; + const xnn_float16* i4 = 4 < kernel_elements ? *i++ : i0; + const xnn_float16* i5 = 5 < kernel_elements ? *i++ : i0; + const xnn_float16* i6 = 6 < kernel_elements ? *i++ : i0; + const xnn_float16* i7 = 7 < kernel_elements ? *i++ : i0; + const xnn_float16* i8 = 8 < kernel_elements ? *i++ : i0; + i0 = (const xnn_float16*) ((uintptr_t) i0 + input_offset); + i1 = (const xnn_float16*) ((uintptr_t) i1 + input_offset); + i2 = (const xnn_float16*) ((uintptr_t) i2 + input_offset); + i3 = (const xnn_float16*) ((uintptr_t) i3 + input_offset); + i4 = (const xnn_float16*) ((uintptr_t) i4 + input_offset); + i5 = (const xnn_float16*) ((uintptr_t) i5 + input_offset); + i6 = (const xnn_float16*) ((uintptr_t) i6 + input_offset); + i7 = (const xnn_float16*) ((uintptr_t) i7 + input_offset); + i8 = (const xnn_float16*) ((uintptr_t) i8 + input_offset); + + xnn_float16* o = output; + size_t c = channels; + do { + size_t vl = __riscv_vsetvl_e16m2(c); + vfloat16m2_t vi0 = __riscv_vle16_v_f16m2(i0, vl); i0 += vl; + vfloat16m2_t vi1 = __riscv_vle16_v_f16m2(i1, vl); i1 += vl; + vfloat16m2_t vi2 = __riscv_vle16_v_f16m2(i2, vl); i2 += vl; + vfloat16m2_t vi3 = __riscv_vle16_v_f16m2(i3, vl); i3 += vl; + vfloat16m2_t vi4 = __riscv_vle16_v_f16m2(i4, vl); i4 += vl; + vfloat16m2_t vi5 = __riscv_vle16_v_f16m2(i5, vl); i5 += vl; + vfloat16m2_t vi6 = __riscv_vle16_v_f16m2(i6, vl); i6 += vl; + vfloat16m2_t vi7 = __riscv_vle16_v_f16m2(i7, vl); i7 += vl; + vfloat16m2_t vi8 = __riscv_vle16_v_f16m2(i8, vl); i8 += vl; + + vfloat16m2_t vmax01 = __riscv_vfmax(vi0, vi1, vl); + vfloat16m2_t vmax23 = __riscv_vfmax(vi2, vi3, vl); + vfloat16m2_t vmax45 = __riscv_vfmax(vi4, vi5, vl); + vfloat16m2_t vmax67 = __riscv_vfmax(vi6, vi7, vl); + vfloat16m2_t vmax018 = __riscv_vfmax(vmax01, vi8, vl); + + vfloat16m2_t vmax2345 = __riscv_vfmax(vmax23, vmax45, vl); + vfloat16m2_t vmax01678 = __riscv_vfmax(vmax67, vmax018, vl); + vfloat16m2_t vacc = __riscv_vfmax(vmax2345, vmax01678, vl); + + vacc = __riscv_vfmax(vacc, output_min, vl); + vacc = __riscv_vfmin(vacc, output_max, vl); + __riscv_vse16_v_f16m2(o, vacc, vl); o += vl; + + c -= vl; + } while (c != 0); + + // Passes 1 - n: Max more inputs to the output. + for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 9) { + const xnn_float16* i0 = *i++; + const xnn_float16* i1 = 1 < k ? *i++ : i0; + const xnn_float16* i2 = 2 < k ? *i++ : i0; + const xnn_float16* i3 = 3 < k ? *i++ : i0; + const xnn_float16* i4 = 4 < k ? *i++ : i0; + const xnn_float16* i5 = 5 < k ? *i++ : i0; + const xnn_float16* i6 = 6 < k ? *i++ : i0; + const xnn_float16* i7 = 7 < k ? *i++ : i0; + const xnn_float16* i8 = 8 < k ? *i++ : i0; + i0 = (const xnn_float16*) ((uintptr_t) i0 + input_offset); + i1 = (const xnn_float16*) ((uintptr_t) i1 + input_offset); + i2 = (const xnn_float16*) ((uintptr_t) i2 + input_offset); + i3 = (const xnn_float16*) ((uintptr_t) i3 + input_offset); + i4 = (const xnn_float16*) ((uintptr_t) i4 + input_offset); + i5 = (const xnn_float16*) ((uintptr_t) i5 + input_offset); + i6 = (const xnn_float16*) ((uintptr_t) i6 + input_offset); + i7 = (const xnn_float16*) ((uintptr_t) i7 + input_offset); + i8 = (const xnn_float16*) ((uintptr_t) i8 + input_offset); + + o = output; + size_t c = channels; + do { + size_t vl = __riscv_vsetvl_e16m2(c); + + vfloat16m2_t vi0 = __riscv_vle16_v_f16m2(i0, vl); i0 += vl; + vfloat16m2_t vi1 = __riscv_vle16_v_f16m2(i1, vl); i1 += vl; + vfloat16m2_t vi2 = __riscv_vle16_v_f16m2(i2, vl); i2 += vl; + vfloat16m2_t vi3 = __riscv_vle16_v_f16m2(i3, vl); i3 += vl; + vfloat16m2_t vi4 = __riscv_vle16_v_f16m2(i4, vl); i4 += vl; + vfloat16m2_t vi5 = __riscv_vle16_v_f16m2(i5, vl); i5 += vl; + vfloat16m2_t vi6 = __riscv_vle16_v_f16m2(i6, vl); i6 += vl; + vfloat16m2_t vi7 = __riscv_vle16_v_f16m2(i7, vl); i7 += vl; + vfloat16m2_t vi8 = __riscv_vle16_v_f16m2(i8, vl); i8 += vl; + + vfloat16m2_t vprev = __riscv_vle16_v_f16m2(o, vl); + + vfloat16m2_t vmax01 = __riscv_vfmax(vi0, vi1, vl); + vfloat16m2_t vmax23 = __riscv_vfmax(vi2, vi3, vl); + vfloat16m2_t vmax45 = __riscv_vfmax(vi4, vi5, vl); + vfloat16m2_t vmax67 = __riscv_vfmax(vi6, vi7, vl); + vfloat16m2_t vmax018 = __riscv_vfmax(vmax01, vi8, vl); + + vfloat16m2_t vmax2345 = __riscv_vfmax(vmax23, vmax45, vl); + vfloat16m2_t vmax01678 = __riscv_vfmax(vmax67, vmax018, vl); + vfloat16m2_t vmax012345678 = __riscv_vfmax(vmax2345, vmax01678, vl); + + vfloat16m2_t vacc = __riscv_vfmax(vprev, vmax012345678, vl); + vacc = __riscv_vfmin(vacc, output_max, vl); + __riscv_vse16_v_f16m2(o, vacc, vl); o += vl; + + c -= vl; + } while (c != 0); + } + input = (const xnn_float16**) ((uintptr_t) input + input_increment); + input_offset += input_pixel_stride; + output = (xnn_float16*) ((uintptr_t) output + output_increment); + } while (--output_pixels != 0); +} diff --git a/src/f32-maxpool/gen/f32-maxpool-9p-minmax-rvv-u1v.c b/src/f32-maxpool/gen/f32-maxpool-9p-minmax-rvv-u1v.c index bec551b6017..a4873239b0a 100644 --- a/src/f32-maxpool/gen/f32-maxpool-9p-minmax-rvv-u1v.c +++ b/src/f32-maxpool/gen/f32-maxpool-9p-minmax-rvv-u1v.c @@ -3,7 +3,7 @@ // Template: src/f32-maxpool/rvv.c.in // Generator: tools/xngen // -// Copyright 2024 Imagination Technologies, inc. +// Copyright 2026 Google LLC // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. @@ -12,6 +12,7 @@ #include "src/xnnpack/maxpool.h" #include + void xnn_f32_maxpool_minmax_ukernel_9p__rvv_u1v( size_t output_pixels, size_t kernel_elements, @@ -32,58 +33,60 @@ void xnn_f32_maxpool_minmax_ukernel_9p__rvv_u1v( const float output_max = params->scalar.max; do { const float** i = input; + + // First pass: load the inputs, store the max pool in the output. + const float* i0 = *i++; + const float* i1 = 1 < kernel_elements ? *i++ : i0; + const float* i2 = 2 < kernel_elements ? *i++ : i0; + const float* i3 = 3 < kernel_elements ? *i++ : i0; + const float* i4 = 4 < kernel_elements ? *i++ : i0; + const float* i5 = 5 < kernel_elements ? *i++ : i0; + const float* i6 = 6 < kernel_elements ? *i++ : i0; + const float* i7 = 7 < kernel_elements ? *i++ : i0; + const float* i8 = 8 < kernel_elements ? *i++ : i0; + i0 = (const float*) ((uintptr_t) i0 + input_offset); + i1 = (const float*) ((uintptr_t) i1 + input_offset); + i2 = (const float*) ((uintptr_t) i2 + input_offset); + i3 = (const float*) ((uintptr_t) i3 + input_offset); + i4 = (const float*) ((uintptr_t) i4 + input_offset); + i5 = (const float*) ((uintptr_t) i5 + input_offset); + i6 = (const float*) ((uintptr_t) i6 + input_offset); + i7 = (const float*) ((uintptr_t) i7 + input_offset); + i8 = (const float*) ((uintptr_t) i8 + input_offset); + float* o = output; - { - const float* i0 = *i++; - const float* i1 = 1 < kernel_elements ? *i++ : i0; - const float* i2 = 2 < kernel_elements ? *i++ : i0; - const float* i3 = 3 < kernel_elements ? *i++ : i0; - const float* i4 = 4 < kernel_elements ? *i++ : i0; - const float* i5 = 5 < kernel_elements ? *i++ : i0; - const float* i6 = 6 < kernel_elements ? *i++ : i0; - const float* i7 = 7 < kernel_elements ? *i++ : i0; - const float* i8 = 8 < kernel_elements ? *i++ : i0; - i0 = (const float*) ((uintptr_t) i0 + input_offset); - i1 = (const float*) ((uintptr_t) i1 + input_offset); - i2 = (const float*) ((uintptr_t) i2 + input_offset); - i3 = (const float*) ((uintptr_t) i3 + input_offset); - i4 = (const float*) ((uintptr_t) i4 + input_offset); - i5 = (const float*) ((uintptr_t) i5 + input_offset); - i6 = (const float*) ((uintptr_t) i6 + input_offset); - i7 = (const float*) ((uintptr_t) i7 + input_offset); - i8 = (const float*) ((uintptr_t) i8 + input_offset); + size_t c = channels; + do { + size_t vl = __riscv_vsetvl_e32m1(c); + vfloat32m1_t vi0 = __riscv_vle32_v_f32m1(i0, vl); i0 += vl; + vfloat32m1_t vi1 = __riscv_vle32_v_f32m1(i1, vl); i1 += vl; + vfloat32m1_t vi2 = __riscv_vle32_v_f32m1(i2, vl); i2 += vl; + vfloat32m1_t vi3 = __riscv_vle32_v_f32m1(i3, vl); i3 += vl; + vfloat32m1_t vi4 = __riscv_vle32_v_f32m1(i4, vl); i4 += vl; + vfloat32m1_t vi5 = __riscv_vle32_v_f32m1(i5, vl); i5 += vl; + vfloat32m1_t vi6 = __riscv_vle32_v_f32m1(i6, vl); i6 += vl; + vfloat32m1_t vi7 = __riscv_vle32_v_f32m1(i7, vl); i7 += vl; + vfloat32m1_t vi8 = __riscv_vle32_v_f32m1(i8, vl); i8 += vl; - size_t c = channels; - do { - int32_t n = __riscv_vsetvl_e32m1(c); - - vfloat32m1_t i0_f32v = __riscv_vle32_v_f32m1(i0, n); i0 += n; - vfloat32m1_t i1_f32v = __riscv_vle32_v_f32m1(i1, n); i1 += n; - vfloat32m1_t i2_f32v = __riscv_vle32_v_f32m1(i2, n); i2 += n; - vfloat32m1_t i3_f32v = __riscv_vle32_v_f32m1(i3, n); i3 += n; - vfloat32m1_t i4_f32v = __riscv_vle32_v_f32m1(i4, n); i4 += n; - vfloat32m1_t i5_f32v = __riscv_vle32_v_f32m1(i5, n); i5 += n; - vfloat32m1_t i6_f32v = __riscv_vle32_v_f32m1(i6, n); i6 += n; - vfloat32m1_t i7_f32v = __riscv_vle32_v_f32m1(i7, n); i7 += n; - vfloat32m1_t i8_f32v = __riscv_vle32_v_f32m1(i8, n); i8 += n; - - vfloat32m1_t max01_f32v = __riscv_vfmax_vv_f32m1(i0_f32v, i1_f32v, n); - vfloat32m1_t max23_f32v = __riscv_vfmax_vv_f32m1(i2_f32v, i3_f32v, n); - vfloat32m1_t max45_f32v = __riscv_vfmax_vv_f32m1(i4_f32v, i5_f32v, n); - vfloat32m1_t max67_f32v = __riscv_vfmax_vv_f32m1(i6_f32v, i7_f32v, n); - vfloat32m1_t max018_f32v = __riscv_vfmax_vv_f32m1(max01_f32v, i8_f32v, n); - - vfloat32m1_t max2345_f32v = __riscv_vfmax_vv_f32m1(max23_f32v, max45_f32v, n); - vfloat32m1_t max01678_f32v = __riscv_vfmax_vv_f32m1(max67_f32v, max018_f32v, n); - vfloat32m1_t out_f32v = __riscv_vfmax_vv_f32m1(max2345_f32v, max01678_f32v, n); - out_f32v = __riscv_vfmin_vf_f32m1(__riscv_vfmax_vf_f32m1(out_f32v, output_min, n), output_max, n); - __riscv_vse32_v_f32m1(o, out_f32v, n); o += n; - - c -= n; - } while (c != 0); - } + vfloat32m1_t vmax01 = __riscv_vfmax(vi0, vi1, vl); + vfloat32m1_t vmax23 = __riscv_vfmax(vi2, vi3, vl); + vfloat32m1_t vmax45 = __riscv_vfmax(vi4, vi5, vl); + vfloat32m1_t vmax67 = __riscv_vfmax(vi6, vi7, vl); + vfloat32m1_t vmax018 = __riscv_vfmax(vmax01, vi8, vl); + + vfloat32m1_t vmax2345 = __riscv_vfmax(vmax23, vmax45, vl); + vfloat32m1_t vmax01678 = __riscv_vfmax(vmax67, vmax018, vl); + vfloat32m1_t vacc = __riscv_vfmax(vmax2345, vmax01678, vl); - for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 8) { + vacc = __riscv_vfmax(vacc, output_min, vl); + vacc = __riscv_vfmin(vacc, output_max, vl); + __riscv_vse32_v_f32m1(o, vacc, vl); o += vl; + + c -= vl; + } while (c != 0); + + // Passes 1 - n: Max more inputs to the output. + for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 9) { const float* i0 = *i++; const float* i1 = 1 < k ? *i++ : i0; const float* i2 = 2 < k ? *i++ : i0; @@ -92,6 +95,7 @@ void xnn_f32_maxpool_minmax_ukernel_9p__rvv_u1v( const float* i5 = 5 < k ? *i++ : i0; const float* i6 = 6 < k ? *i++ : i0; const float* i7 = 7 < k ? *i++ : i0; + const float* i8 = 8 < k ? *i++ : i0; i0 = (const float*) ((uintptr_t) i0 + input_offset); i1 = (const float*) ((uintptr_t) i1 + input_offset); i2 = (const float*) ((uintptr_t) i2 + input_offset); @@ -100,35 +104,40 @@ void xnn_f32_maxpool_minmax_ukernel_9p__rvv_u1v( i5 = (const float*) ((uintptr_t) i5 + input_offset); i6 = (const float*) ((uintptr_t) i6 + input_offset); i7 = (const float*) ((uintptr_t) i7 + input_offset); + i8 = (const float*) ((uintptr_t) i8 + input_offset); o = output; size_t c = channels; do { - int32_t n = __riscv_vsetvl_e32m1(c); - - vfloat32m1_t i0_f32v = __riscv_vle32_v_f32m1(i0, n); i0 += n; - vfloat32m1_t i1_f32v = __riscv_vle32_v_f32m1(i1, n); i1 += n; - vfloat32m1_t i2_f32v = __riscv_vle32_v_f32m1(i2, n); i2 += n; - vfloat32m1_t i3_f32v = __riscv_vle32_v_f32m1(i3, n); i3 += n; - vfloat32m1_t i4_f32v = __riscv_vle32_v_f32m1(i4, n); i4 += n; - vfloat32m1_t i5_f32v = __riscv_vle32_v_f32m1(i5, n); i5 += n; - vfloat32m1_t i6_f32v = __riscv_vle32_v_f32m1(i6, n); i6 += n; - vfloat32m1_t i7_f32v = __riscv_vle32_v_f32m1(i7, n); i7 += n; - vfloat32m1_t i8_f32v = __riscv_vle32_v_f32m1(o, n); - - vfloat32m1_t max01_f32v = __riscv_vfmax_vv_f32m1(i0_f32v, i1_f32v, n); - vfloat32m1_t max23_f32v = __riscv_vfmax_vv_f32m1(i2_f32v, i3_f32v, n); - vfloat32m1_t max45_f32v = __riscv_vfmax_vv_f32m1(i4_f32v, i5_f32v, n); - vfloat32m1_t max67_f32v = __riscv_vfmax_vv_f32m1(i6_f32v, i7_f32v, n); - vfloat32m1_t max018_f32v = __riscv_vfmax_vv_f32m1(max01_f32v, i8_f32v, n); - - vfloat32m1_t max2345_f32v = __riscv_vfmax_vv_f32m1(max23_f32v, max45_f32v, n); - vfloat32m1_t max01678_f32v = __riscv_vfmax_vv_f32m1(max67_f32v, max018_f32v, n); - vfloat32m1_t out_f32v = __riscv_vfmax_vv_f32m1(max2345_f32v, max01678_f32v, n); - out_f32v = __riscv_vfmin_vf_f32m1(__riscv_vfmax_vf_f32m1(out_f32v, output_min, n), output_max, n); - __riscv_vse32_v_f32m1(o, out_f32v, n); o += n; - - c -= n; + size_t vl = __riscv_vsetvl_e32m1(c); + + vfloat32m1_t vi0 = __riscv_vle32_v_f32m1(i0, vl); i0 += vl; + vfloat32m1_t vi1 = __riscv_vle32_v_f32m1(i1, vl); i1 += vl; + vfloat32m1_t vi2 = __riscv_vle32_v_f32m1(i2, vl); i2 += vl; + vfloat32m1_t vi3 = __riscv_vle32_v_f32m1(i3, vl); i3 += vl; + vfloat32m1_t vi4 = __riscv_vle32_v_f32m1(i4, vl); i4 += vl; + vfloat32m1_t vi5 = __riscv_vle32_v_f32m1(i5, vl); i5 += vl; + vfloat32m1_t vi6 = __riscv_vle32_v_f32m1(i6, vl); i6 += vl; + vfloat32m1_t vi7 = __riscv_vle32_v_f32m1(i7, vl); i7 += vl; + vfloat32m1_t vi8 = __riscv_vle32_v_f32m1(i8, vl); i8 += vl; + + vfloat32m1_t vprev = __riscv_vle32_v_f32m1(o, vl); + + vfloat32m1_t vmax01 = __riscv_vfmax(vi0, vi1, vl); + vfloat32m1_t vmax23 = __riscv_vfmax(vi2, vi3, vl); + vfloat32m1_t vmax45 = __riscv_vfmax(vi4, vi5, vl); + vfloat32m1_t vmax67 = __riscv_vfmax(vi6, vi7, vl); + vfloat32m1_t vmax018 = __riscv_vfmax(vmax01, vi8, vl); + + vfloat32m1_t vmax2345 = __riscv_vfmax(vmax23, vmax45, vl); + vfloat32m1_t vmax01678 = __riscv_vfmax(vmax67, vmax018, vl); + vfloat32m1_t vmax012345678 = __riscv_vfmax(vmax2345, vmax01678, vl); + + vfloat32m1_t vacc = __riscv_vfmax(vprev, vmax012345678, vl); + vacc = __riscv_vfmin(vacc, output_max, vl); + __riscv_vse32_v_f32m1(o, vacc, vl); o += vl; + + c -= vl; } while (c != 0); } input = (const float**) ((uintptr_t) input + input_increment); diff --git a/src/f32-maxpool/gen/f32-maxpool-9p-minmax-rvv-u2v.c b/src/f32-maxpool/gen/f32-maxpool-9p-minmax-rvv-u2v.c index e2dce631f6c..eb8bd0a71e4 100644 --- a/src/f32-maxpool/gen/f32-maxpool-9p-minmax-rvv-u2v.c +++ b/src/f32-maxpool/gen/f32-maxpool-9p-minmax-rvv-u2v.c @@ -3,7 +3,7 @@ // Template: src/f32-maxpool/rvv.c.in // Generator: tools/xngen // -// Copyright 2024 Imagination Technologies, inc. +// Copyright 2026 Google LLC // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. @@ -12,6 +12,7 @@ #include "src/xnnpack/maxpool.h" #include + void xnn_f32_maxpool_minmax_ukernel_9p__rvv_u2v( size_t output_pixels, size_t kernel_elements, @@ -32,58 +33,60 @@ void xnn_f32_maxpool_minmax_ukernel_9p__rvv_u2v( const float output_max = params->scalar.max; do { const float** i = input; + + // First pass: load the inputs, store the max pool in the output. + const float* i0 = *i++; + const float* i1 = 1 < kernel_elements ? *i++ : i0; + const float* i2 = 2 < kernel_elements ? *i++ : i0; + const float* i3 = 3 < kernel_elements ? *i++ : i0; + const float* i4 = 4 < kernel_elements ? *i++ : i0; + const float* i5 = 5 < kernel_elements ? *i++ : i0; + const float* i6 = 6 < kernel_elements ? *i++ : i0; + const float* i7 = 7 < kernel_elements ? *i++ : i0; + const float* i8 = 8 < kernel_elements ? *i++ : i0; + i0 = (const float*) ((uintptr_t) i0 + input_offset); + i1 = (const float*) ((uintptr_t) i1 + input_offset); + i2 = (const float*) ((uintptr_t) i2 + input_offset); + i3 = (const float*) ((uintptr_t) i3 + input_offset); + i4 = (const float*) ((uintptr_t) i4 + input_offset); + i5 = (const float*) ((uintptr_t) i5 + input_offset); + i6 = (const float*) ((uintptr_t) i6 + input_offset); + i7 = (const float*) ((uintptr_t) i7 + input_offset); + i8 = (const float*) ((uintptr_t) i8 + input_offset); + float* o = output; - { - const float* i0 = *i++; - const float* i1 = 1 < kernel_elements ? *i++ : i0; - const float* i2 = 2 < kernel_elements ? *i++ : i0; - const float* i3 = 3 < kernel_elements ? *i++ : i0; - const float* i4 = 4 < kernel_elements ? *i++ : i0; - const float* i5 = 5 < kernel_elements ? *i++ : i0; - const float* i6 = 6 < kernel_elements ? *i++ : i0; - const float* i7 = 7 < kernel_elements ? *i++ : i0; - const float* i8 = 8 < kernel_elements ? *i++ : i0; - i0 = (const float*) ((uintptr_t) i0 + input_offset); - i1 = (const float*) ((uintptr_t) i1 + input_offset); - i2 = (const float*) ((uintptr_t) i2 + input_offset); - i3 = (const float*) ((uintptr_t) i3 + input_offset); - i4 = (const float*) ((uintptr_t) i4 + input_offset); - i5 = (const float*) ((uintptr_t) i5 + input_offset); - i6 = (const float*) ((uintptr_t) i6 + input_offset); - i7 = (const float*) ((uintptr_t) i7 + input_offset); - i8 = (const float*) ((uintptr_t) i8 + input_offset); + size_t c = channels; + do { + size_t vl = __riscv_vsetvl_e32m2(c); + vfloat32m2_t vi0 = __riscv_vle32_v_f32m2(i0, vl); i0 += vl; + vfloat32m2_t vi1 = __riscv_vle32_v_f32m2(i1, vl); i1 += vl; + vfloat32m2_t vi2 = __riscv_vle32_v_f32m2(i2, vl); i2 += vl; + vfloat32m2_t vi3 = __riscv_vle32_v_f32m2(i3, vl); i3 += vl; + vfloat32m2_t vi4 = __riscv_vle32_v_f32m2(i4, vl); i4 += vl; + vfloat32m2_t vi5 = __riscv_vle32_v_f32m2(i5, vl); i5 += vl; + vfloat32m2_t vi6 = __riscv_vle32_v_f32m2(i6, vl); i6 += vl; + vfloat32m2_t vi7 = __riscv_vle32_v_f32m2(i7, vl); i7 += vl; + vfloat32m2_t vi8 = __riscv_vle32_v_f32m2(i8, vl); i8 += vl; - size_t c = channels; - do { - int32_t n = __riscv_vsetvl_e32m2(c); - - vfloat32m2_t i0_f32v = __riscv_vle32_v_f32m2(i0, n); i0 += n; - vfloat32m2_t i1_f32v = __riscv_vle32_v_f32m2(i1, n); i1 += n; - vfloat32m2_t i2_f32v = __riscv_vle32_v_f32m2(i2, n); i2 += n; - vfloat32m2_t i3_f32v = __riscv_vle32_v_f32m2(i3, n); i3 += n; - vfloat32m2_t i4_f32v = __riscv_vle32_v_f32m2(i4, n); i4 += n; - vfloat32m2_t i5_f32v = __riscv_vle32_v_f32m2(i5, n); i5 += n; - vfloat32m2_t i6_f32v = __riscv_vle32_v_f32m2(i6, n); i6 += n; - vfloat32m2_t i7_f32v = __riscv_vle32_v_f32m2(i7, n); i7 += n; - vfloat32m2_t i8_f32v = __riscv_vle32_v_f32m2(i8, n); i8 += n; - - vfloat32m2_t max01_f32v = __riscv_vfmax_vv_f32m2(i0_f32v, i1_f32v, n); - vfloat32m2_t max23_f32v = __riscv_vfmax_vv_f32m2(i2_f32v, i3_f32v, n); - vfloat32m2_t max45_f32v = __riscv_vfmax_vv_f32m2(i4_f32v, i5_f32v, n); - vfloat32m2_t max67_f32v = __riscv_vfmax_vv_f32m2(i6_f32v, i7_f32v, n); - vfloat32m2_t max018_f32v = __riscv_vfmax_vv_f32m2(max01_f32v, i8_f32v, n); - - vfloat32m2_t max2345_f32v = __riscv_vfmax_vv_f32m2(max23_f32v, max45_f32v, n); - vfloat32m2_t max01678_f32v = __riscv_vfmax_vv_f32m2(max67_f32v, max018_f32v, n); - vfloat32m2_t out_f32v = __riscv_vfmax_vv_f32m2(max2345_f32v, max01678_f32v, n); - out_f32v = __riscv_vfmin_vf_f32m2(__riscv_vfmax_vf_f32m2(out_f32v, output_min, n), output_max, n); - __riscv_vse32_v_f32m2(o, out_f32v, n); o += n; - - c -= n; - } while (c != 0); - } + vfloat32m2_t vmax01 = __riscv_vfmax(vi0, vi1, vl); + vfloat32m2_t vmax23 = __riscv_vfmax(vi2, vi3, vl); + vfloat32m2_t vmax45 = __riscv_vfmax(vi4, vi5, vl); + vfloat32m2_t vmax67 = __riscv_vfmax(vi6, vi7, vl); + vfloat32m2_t vmax018 = __riscv_vfmax(vmax01, vi8, vl); + + vfloat32m2_t vmax2345 = __riscv_vfmax(vmax23, vmax45, vl); + vfloat32m2_t vmax01678 = __riscv_vfmax(vmax67, vmax018, vl); + vfloat32m2_t vacc = __riscv_vfmax(vmax2345, vmax01678, vl); - for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 8) { + vacc = __riscv_vfmax(vacc, output_min, vl); + vacc = __riscv_vfmin(vacc, output_max, vl); + __riscv_vse32_v_f32m2(o, vacc, vl); o += vl; + + c -= vl; + } while (c != 0); + + // Passes 1 - n: Max more inputs to the output. + for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 9) { const float* i0 = *i++; const float* i1 = 1 < k ? *i++ : i0; const float* i2 = 2 < k ? *i++ : i0; @@ -92,6 +95,7 @@ void xnn_f32_maxpool_minmax_ukernel_9p__rvv_u2v( const float* i5 = 5 < k ? *i++ : i0; const float* i6 = 6 < k ? *i++ : i0; const float* i7 = 7 < k ? *i++ : i0; + const float* i8 = 8 < k ? *i++ : i0; i0 = (const float*) ((uintptr_t) i0 + input_offset); i1 = (const float*) ((uintptr_t) i1 + input_offset); i2 = (const float*) ((uintptr_t) i2 + input_offset); @@ -100,35 +104,40 @@ void xnn_f32_maxpool_minmax_ukernel_9p__rvv_u2v( i5 = (const float*) ((uintptr_t) i5 + input_offset); i6 = (const float*) ((uintptr_t) i6 + input_offset); i7 = (const float*) ((uintptr_t) i7 + input_offset); + i8 = (const float*) ((uintptr_t) i8 + input_offset); o = output; size_t c = channels; do { - int32_t n = __riscv_vsetvl_e32m2(c); - - vfloat32m2_t i0_f32v = __riscv_vle32_v_f32m2(i0, n); i0 += n; - vfloat32m2_t i1_f32v = __riscv_vle32_v_f32m2(i1, n); i1 += n; - vfloat32m2_t i2_f32v = __riscv_vle32_v_f32m2(i2, n); i2 += n; - vfloat32m2_t i3_f32v = __riscv_vle32_v_f32m2(i3, n); i3 += n; - vfloat32m2_t i4_f32v = __riscv_vle32_v_f32m2(i4, n); i4 += n; - vfloat32m2_t i5_f32v = __riscv_vle32_v_f32m2(i5, n); i5 += n; - vfloat32m2_t i6_f32v = __riscv_vle32_v_f32m2(i6, n); i6 += n; - vfloat32m2_t i7_f32v = __riscv_vle32_v_f32m2(i7, n); i7 += n; - vfloat32m2_t i8_f32v = __riscv_vle32_v_f32m2(o, n); - - vfloat32m2_t max01_f32v = __riscv_vfmax_vv_f32m2(i0_f32v, i1_f32v, n); - vfloat32m2_t max23_f32v = __riscv_vfmax_vv_f32m2(i2_f32v, i3_f32v, n); - vfloat32m2_t max45_f32v = __riscv_vfmax_vv_f32m2(i4_f32v, i5_f32v, n); - vfloat32m2_t max67_f32v = __riscv_vfmax_vv_f32m2(i6_f32v, i7_f32v, n); - vfloat32m2_t max018_f32v = __riscv_vfmax_vv_f32m2(max01_f32v, i8_f32v, n); - - vfloat32m2_t max2345_f32v = __riscv_vfmax_vv_f32m2(max23_f32v, max45_f32v, n); - vfloat32m2_t max01678_f32v = __riscv_vfmax_vv_f32m2(max67_f32v, max018_f32v, n); - vfloat32m2_t out_f32v = __riscv_vfmax_vv_f32m2(max2345_f32v, max01678_f32v, n); - out_f32v = __riscv_vfmin_vf_f32m2(__riscv_vfmax_vf_f32m2(out_f32v, output_min, n), output_max, n); - __riscv_vse32_v_f32m2(o, out_f32v, n); o += n; - - c -= n; + size_t vl = __riscv_vsetvl_e32m2(c); + + vfloat32m2_t vi0 = __riscv_vle32_v_f32m2(i0, vl); i0 += vl; + vfloat32m2_t vi1 = __riscv_vle32_v_f32m2(i1, vl); i1 += vl; + vfloat32m2_t vi2 = __riscv_vle32_v_f32m2(i2, vl); i2 += vl; + vfloat32m2_t vi3 = __riscv_vle32_v_f32m2(i3, vl); i3 += vl; + vfloat32m2_t vi4 = __riscv_vle32_v_f32m2(i4, vl); i4 += vl; + vfloat32m2_t vi5 = __riscv_vle32_v_f32m2(i5, vl); i5 += vl; + vfloat32m2_t vi6 = __riscv_vle32_v_f32m2(i6, vl); i6 += vl; + vfloat32m2_t vi7 = __riscv_vle32_v_f32m2(i7, vl); i7 += vl; + vfloat32m2_t vi8 = __riscv_vle32_v_f32m2(i8, vl); i8 += vl; + + vfloat32m2_t vprev = __riscv_vle32_v_f32m2(o, vl); + + vfloat32m2_t vmax01 = __riscv_vfmax(vi0, vi1, vl); + vfloat32m2_t vmax23 = __riscv_vfmax(vi2, vi3, vl); + vfloat32m2_t vmax45 = __riscv_vfmax(vi4, vi5, vl); + vfloat32m2_t vmax67 = __riscv_vfmax(vi6, vi7, vl); + vfloat32m2_t vmax018 = __riscv_vfmax(vmax01, vi8, vl); + + vfloat32m2_t vmax2345 = __riscv_vfmax(vmax23, vmax45, vl); + vfloat32m2_t vmax01678 = __riscv_vfmax(vmax67, vmax018, vl); + vfloat32m2_t vmax012345678 = __riscv_vfmax(vmax2345, vmax01678, vl); + + vfloat32m2_t vacc = __riscv_vfmax(vprev, vmax012345678, vl); + vacc = __riscv_vfmin(vacc, output_max, vl); + __riscv_vse32_v_f32m2(o, vacc, vl); o += vl; + + c -= vl; } while (c != 0); } input = (const float**) ((uintptr_t) input + input_increment); diff --git a/src/f32-maxpool/rvv.c.in b/src/f32-maxpool/rvv.c.in index 00f32b0d877..87515e75d6f 100755 --- a/src/f32-maxpool/rvv.c.in +++ b/src/f32-maxpool/rvv.c.in @@ -1,134 +1,112 @@ -// Copyright 2024 Imagination Technologies, inc. +// Copyright 2026 Google LLC // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. $assert LMUL in [1, 2, 4, 8] +$assert(DATATYPE in ["f32", "f16", "u8", "s8"]) #include #include "src/xnnpack/maxpool.h" #include -void xnn_f32_maxpool_minmax_ukernel_9p__rvv_u${LMUL}v( +$CTYPE = {"f32": "float", "f16": "xnn_float16", "u8": "uint8_t", "s8": "int8_t"}[DATATYPE] +$VTYPE = {"f32": "vfloat32", "f16": "vfloat16", "u8": "vuint8", "s8": "vint8"}[DATATYPE] +$VLOAD = {"f32": "__riscv_vle32_v_f32", "f16": "__riscv_vle16_v_f16", "u8": "__riscv_vle8_v_u8", "s8": "__riscv_vle8_v_i8"}[DATATYPE] +$VSTORE = {"f32": "__riscv_vse32_v_f32", "f16": "__riscv_vse16_v_f16", "u8": "__riscv_vse8_v_u8", "s8": "__riscv_vse8_v_i8"}[DATATYPE] +$VSETVL = {"f32": "__riscv_vsetvl_e32", "f16": "__riscv_vsetvl_e16", "u8": "__riscv_vsetvl_e8", "s8": "__riscv_vsetvl_e8"}[DATATYPE] +$VMAX = {"f32": "__riscv_vfmax", "f16": "__riscv_vfmax", "u8": "__riscv_vmaxu", "s8": "__riscv_vmax"}[DATATYPE] +$VMIN = {"f32": "__riscv_vfmin", "f16": "__riscv_vfmin", "u8": "__riscv_vminu", "s8": "__riscv_vmin"}[DATATYPE] +$ISA = "fp16arith" if DATATYPE == "f16" else "" + +void xnn_${DATATYPE}_maxpool_minmax_ukernel_9p__rvv${ISA}_u${LMUL}v( size_t output_pixels, size_t kernel_elements, size_t channels, - const float** input, + const ${CTYPE}** input, size_t input_offset, size_t input_pixel_stride, - float* output, + ${CTYPE}* output, size_t input_increment, size_t output_increment, - const struct xnn_f32_minmax_params* restrict params) + const struct xnn_${DATATYPE}_minmax_params* restrict params) { assert(output_pixels != 0); assert(kernel_elements != 0); assert(channels != 0); - const float output_min = params->scalar.min; - const float output_max = params->scalar.max; + const ${CTYPE} output_min = params->scalar.min; + const ${CTYPE} output_max = params->scalar.max; do { - const float** i = input; - float* o = output; - { - const float* i0 = *i++; - const float* i1 = 1 < kernel_elements ? *i++ : i0; - const float* i2 = 2 < kernel_elements ? *i++ : i0; - const float* i3 = 3 < kernel_elements ? *i++ : i0; - const float* i4 = 4 < kernel_elements ? *i++ : i0; - const float* i5 = 5 < kernel_elements ? *i++ : i0; - const float* i6 = 6 < kernel_elements ? *i++ : i0; - const float* i7 = 7 < kernel_elements ? *i++ : i0; - const float* i8 = 8 < kernel_elements ? *i++ : i0; - i0 = (const float*) ((uintptr_t) i0 + input_offset); - i1 = (const float*) ((uintptr_t) i1 + input_offset); - i2 = (const float*) ((uintptr_t) i2 + input_offset); - i3 = (const float*) ((uintptr_t) i3 + input_offset); - i4 = (const float*) ((uintptr_t) i4 + input_offset); - i5 = (const float*) ((uintptr_t) i5 + input_offset); - i6 = (const float*) ((uintptr_t) i6 + input_offset); - i7 = (const float*) ((uintptr_t) i7 + input_offset); - i8 = (const float*) ((uintptr_t) i8 + input_offset); + const ${CTYPE}** i = input; - size_t c = channels; - do { - int32_t n = __riscv_vsetvl_e32m${LMUL}(c); - - vfloat32m${LMUL}_t i0_f32v = __riscv_vle32_v_f32m${LMUL}(i0, n); i0 += n; - vfloat32m${LMUL}_t i1_f32v = __riscv_vle32_v_f32m${LMUL}(i1, n); i1 += n; - vfloat32m${LMUL}_t i2_f32v = __riscv_vle32_v_f32m${LMUL}(i2, n); i2 += n; - vfloat32m${LMUL}_t i3_f32v = __riscv_vle32_v_f32m${LMUL}(i3, n); i3 += n; - vfloat32m${LMUL}_t i4_f32v = __riscv_vle32_v_f32m${LMUL}(i4, n); i4 += n; - vfloat32m${LMUL}_t i5_f32v = __riscv_vle32_v_f32m${LMUL}(i5, n); i5 += n; - vfloat32m${LMUL}_t i6_f32v = __riscv_vle32_v_f32m${LMUL}(i6, n); i6 += n; - vfloat32m${LMUL}_t i7_f32v = __riscv_vle32_v_f32m${LMUL}(i7, n); i7 += n; - vfloat32m${LMUL}_t i8_f32v = __riscv_vle32_v_f32m${LMUL}(i8, n); i8 += n; - - vfloat32m${LMUL}_t max01_f32v = __riscv_vfmax_vv_f32m${LMUL}(i0_f32v, i1_f32v, n); - vfloat32m${LMUL}_t max23_f32v = __riscv_vfmax_vv_f32m${LMUL}(i2_f32v, i3_f32v, n); - vfloat32m${LMUL}_t max45_f32v = __riscv_vfmax_vv_f32m${LMUL}(i4_f32v, i5_f32v, n); - vfloat32m${LMUL}_t max67_f32v = __riscv_vfmax_vv_f32m${LMUL}(i6_f32v, i7_f32v, n); - vfloat32m${LMUL}_t max018_f32v = __riscv_vfmax_vv_f32m${LMUL}(max01_f32v, i8_f32v, n); - - vfloat32m${LMUL}_t max2345_f32v = __riscv_vfmax_vv_f32m${LMUL}(max23_f32v, max45_f32v, n); - vfloat32m${LMUL}_t max01678_f32v = __riscv_vfmax_vv_f32m${LMUL}(max67_f32v, max018_f32v, n); - vfloat32m${LMUL}_t out_f32v = __riscv_vfmax_vv_f32m${LMUL}(max2345_f32v, max01678_f32v, n); - out_f32v = __riscv_vfmin_vf_f32m${LMUL}(__riscv_vfmax_vf_f32m${LMUL}(out_f32v, output_min, n), output_max, n); - __riscv_vse32_v_f32m${LMUL}(o, out_f32v, n); o += n; - - c -= n; - } while (c != 0); - } + // First pass: load the inputs, store the max pool in the output. + const ${CTYPE}* i0 = *i++; + $for K in range(1, 9): + const ${CTYPE}* i${K} = ${K} < kernel_elements ? *i++ : i0; + $for K in range(9): + i${K} = (const ${CTYPE}*) ((uintptr_t) i${K} + input_offset); + + ${CTYPE}* o = output; + size_t c = channels; + do { + size_t vl = ${VSETVL}m${LMUL}(c); + $for K in range(9): + ${VTYPE}m${LMUL}_t vi${K} = ${VLOAD}m${LMUL}(i${K}, vl); i${K} += vl; + + ${VTYPE}m${LMUL}_t vmax01 = ${VMAX}(vi0, vi1, vl); + ${VTYPE}m${LMUL}_t vmax23 = ${VMAX}(vi2, vi3, vl); + ${VTYPE}m${LMUL}_t vmax45 = ${VMAX}(vi4, vi5, vl); + ${VTYPE}m${LMUL}_t vmax67 = ${VMAX}(vi6, vi7, vl); + ${VTYPE}m${LMUL}_t vmax018 = ${VMAX}(vmax01, vi8, vl); + + ${VTYPE}m${LMUL}_t vmax2345 = ${VMAX}(vmax23, vmax45, vl); + ${VTYPE}m${LMUL}_t vmax01678 = ${VMAX}(vmax67, vmax018, vl); + ${VTYPE}m${LMUL}_t vacc = ${VMAX}(vmax2345, vmax01678, vl); - for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 8) { - const float* i0 = *i++; - const float* i1 = 1 < k ? *i++ : i0; - const float* i2 = 2 < k ? *i++ : i0; - const float* i3 = 3 < k ? *i++ : i0; - const float* i4 = 4 < k ? *i++ : i0; - const float* i5 = 5 < k ? *i++ : i0; - const float* i6 = 6 < k ? *i++ : i0; - const float* i7 = 7 < k ? *i++ : i0; - i0 = (const float*) ((uintptr_t) i0 + input_offset); - i1 = (const float*) ((uintptr_t) i1 + input_offset); - i2 = (const float*) ((uintptr_t) i2 + input_offset); - i3 = (const float*) ((uintptr_t) i3 + input_offset); - i4 = (const float*) ((uintptr_t) i4 + input_offset); - i5 = (const float*) ((uintptr_t) i5 + input_offset); - i6 = (const float*) ((uintptr_t) i6 + input_offset); - i7 = (const float*) ((uintptr_t) i7 + input_offset); + vacc = ${VMAX}(vacc, output_min, vl); + vacc = ${VMIN}(vacc, output_max, vl); + ${VSTORE}m${LMUL}(o, vacc, vl); o += vl; + + c -= vl; + } while (c != 0); + + // Passes 1 - n: Max more inputs to the output. + for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 9) { + const ${CTYPE}* i0 = *i++; + $for K in range(1, 9): + const ${CTYPE}* i${K} = ${K} < k ? *i++ : i0; + $for K in range(9): + i${K} = (const ${CTYPE}*) ((uintptr_t) i${K} + input_offset); o = output; size_t c = channels; do { - int32_t n = __riscv_vsetvl_e32m${LMUL}(c); - - vfloat32m${LMUL}_t i0_f32v = __riscv_vle32_v_f32m${LMUL}(i0, n); i0 += n; - vfloat32m${LMUL}_t i1_f32v = __riscv_vle32_v_f32m${LMUL}(i1, n); i1 += n; - vfloat32m${LMUL}_t i2_f32v = __riscv_vle32_v_f32m${LMUL}(i2, n); i2 += n; - vfloat32m${LMUL}_t i3_f32v = __riscv_vle32_v_f32m${LMUL}(i3, n); i3 += n; - vfloat32m${LMUL}_t i4_f32v = __riscv_vle32_v_f32m${LMUL}(i4, n); i4 += n; - vfloat32m${LMUL}_t i5_f32v = __riscv_vle32_v_f32m${LMUL}(i5, n); i5 += n; - vfloat32m${LMUL}_t i6_f32v = __riscv_vle32_v_f32m${LMUL}(i6, n); i6 += n; - vfloat32m${LMUL}_t i7_f32v = __riscv_vle32_v_f32m${LMUL}(i7, n); i7 += n; - vfloat32m${LMUL}_t i8_f32v = __riscv_vle32_v_f32m${LMUL}(o, n); - - vfloat32m${LMUL}_t max01_f32v = __riscv_vfmax_vv_f32m${LMUL}(i0_f32v, i1_f32v, n); - vfloat32m${LMUL}_t max23_f32v = __riscv_vfmax_vv_f32m${LMUL}(i2_f32v, i3_f32v, n); - vfloat32m${LMUL}_t max45_f32v = __riscv_vfmax_vv_f32m${LMUL}(i4_f32v, i5_f32v, n); - vfloat32m${LMUL}_t max67_f32v = __riscv_vfmax_vv_f32m${LMUL}(i6_f32v, i7_f32v, n); - vfloat32m${LMUL}_t max018_f32v = __riscv_vfmax_vv_f32m${LMUL}(max01_f32v, i8_f32v, n); - - vfloat32m${LMUL}_t max2345_f32v = __riscv_vfmax_vv_f32m${LMUL}(max23_f32v, max45_f32v, n); - vfloat32m${LMUL}_t max01678_f32v = __riscv_vfmax_vv_f32m${LMUL}(max67_f32v, max018_f32v, n); - vfloat32m${LMUL}_t out_f32v = __riscv_vfmax_vv_f32m${LMUL}(max2345_f32v, max01678_f32v, n); - out_f32v = __riscv_vfmin_vf_f32m${LMUL}(__riscv_vfmax_vf_f32m${LMUL}(out_f32v, output_min, n), output_max, n); - __riscv_vse32_v_f32m${LMUL}(o, out_f32v, n); o += n; - - c -= n; + size_t vl = ${VSETVL}m${LMUL}(c); + + $for K in range(9): + ${VTYPE}m${LMUL}_t vi${K} = ${VLOAD}m${LMUL}(i${K}, vl); i${K} += vl; + + ${VTYPE}m${LMUL}_t vprev = ${VLOAD}m${LMUL}(o, vl); + + ${VTYPE}m${LMUL}_t vmax01 = ${VMAX}(vi0, vi1, vl); + ${VTYPE}m${LMUL}_t vmax23 = ${VMAX}(vi2, vi3, vl); + ${VTYPE}m${LMUL}_t vmax45 = ${VMAX}(vi4, vi5, vl); + ${VTYPE}m${LMUL}_t vmax67 = ${VMAX}(vi6, vi7, vl); + ${VTYPE}m${LMUL}_t vmax018 = ${VMAX}(vmax01, vi8, vl); + + ${VTYPE}m${LMUL}_t vmax2345 = ${VMAX}(vmax23, vmax45, vl); + ${VTYPE}m${LMUL}_t vmax01678 = ${VMAX}(vmax67, vmax018, vl); + ${VTYPE}m${LMUL}_t vmax012345678 = ${VMAX}(vmax2345, vmax01678, vl); + + ${VTYPE}m${LMUL}_t vacc = ${VMAX}(vprev, vmax012345678, vl); + vacc = ${VMIN}(vacc, output_max, vl); + ${VSTORE}m${LMUL}(o, vacc, vl); o += vl; + + c -= vl; } while (c != 0); } - input = (const float**) ((uintptr_t) input + input_increment); + input = (const ${CTYPE}**) ((uintptr_t) input + input_increment); input_offset += input_pixel_stride; - output = (float*) ((uintptr_t) output + output_increment); + output = (${CTYPE}*) ((uintptr_t) output + output_increment); } while (--output_pixels != 0); } diff --git a/src/s8-maxpool/gen/s8-maxpool-9p-minmax-rvv-u1v.c b/src/s8-maxpool/gen/s8-maxpool-9p-minmax-rvv-u1v.c new file mode 100644 index 00000000000..3114918b9b0 --- /dev/null +++ b/src/s8-maxpool/gen/s8-maxpool-9p-minmax-rvv-u1v.c @@ -0,0 +1,147 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f32-maxpool/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2026 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include "src/xnnpack/maxpool.h" +#include + + +void xnn_s8_maxpool_minmax_ukernel_9p__rvv_u1v( + size_t output_pixels, + size_t kernel_elements, + size_t channels, + const int8_t** input, + size_t input_offset, + size_t input_pixel_stride, + int8_t* output, + size_t input_increment, + size_t output_increment, + const struct xnn_s8_minmax_params* restrict params) +{ + assert(output_pixels != 0); + assert(kernel_elements != 0); + assert(channels != 0); + + const int8_t output_min = params->scalar.min; + const int8_t output_max = params->scalar.max; + do { + const int8_t** i = input; + + // First pass: load the inputs, store the max pool in the output. + const int8_t* i0 = *i++; + const int8_t* i1 = 1 < kernel_elements ? *i++ : i0; + const int8_t* i2 = 2 < kernel_elements ? *i++ : i0; + const int8_t* i3 = 3 < kernel_elements ? *i++ : i0; + const int8_t* i4 = 4 < kernel_elements ? *i++ : i0; + const int8_t* i5 = 5 < kernel_elements ? *i++ : i0; + const int8_t* i6 = 6 < kernel_elements ? *i++ : i0; + const int8_t* i7 = 7 < kernel_elements ? *i++ : i0; + const int8_t* i8 = 8 < kernel_elements ? *i++ : i0; + i0 = (const int8_t*) ((uintptr_t) i0 + input_offset); + i1 = (const int8_t*) ((uintptr_t) i1 + input_offset); + i2 = (const int8_t*) ((uintptr_t) i2 + input_offset); + i3 = (const int8_t*) ((uintptr_t) i3 + input_offset); + i4 = (const int8_t*) ((uintptr_t) i4 + input_offset); + i5 = (const int8_t*) ((uintptr_t) i5 + input_offset); + i6 = (const int8_t*) ((uintptr_t) i6 + input_offset); + i7 = (const int8_t*) ((uintptr_t) i7 + input_offset); + i8 = (const int8_t*) ((uintptr_t) i8 + input_offset); + + int8_t* o = output; + size_t c = channels; + do { + size_t vl = __riscv_vsetvl_e8m1(c); + vint8m1_t vi0 = __riscv_vle8_v_i8m1(i0, vl); i0 += vl; + vint8m1_t vi1 = __riscv_vle8_v_i8m1(i1, vl); i1 += vl; + vint8m1_t vi2 = __riscv_vle8_v_i8m1(i2, vl); i2 += vl; + vint8m1_t vi3 = __riscv_vle8_v_i8m1(i3, vl); i3 += vl; + vint8m1_t vi4 = __riscv_vle8_v_i8m1(i4, vl); i4 += vl; + vint8m1_t vi5 = __riscv_vle8_v_i8m1(i5, vl); i5 += vl; + vint8m1_t vi6 = __riscv_vle8_v_i8m1(i6, vl); i6 += vl; + vint8m1_t vi7 = __riscv_vle8_v_i8m1(i7, vl); i7 += vl; + vint8m1_t vi8 = __riscv_vle8_v_i8m1(i8, vl); i8 += vl; + + vint8m1_t vmax01 = __riscv_vmax(vi0, vi1, vl); + vint8m1_t vmax23 = __riscv_vmax(vi2, vi3, vl); + vint8m1_t vmax45 = __riscv_vmax(vi4, vi5, vl); + vint8m1_t vmax67 = __riscv_vmax(vi6, vi7, vl); + vint8m1_t vmax018 = __riscv_vmax(vmax01, vi8, vl); + + vint8m1_t vmax2345 = __riscv_vmax(vmax23, vmax45, vl); + vint8m1_t vmax01678 = __riscv_vmax(vmax67, vmax018, vl); + vint8m1_t vacc = __riscv_vmax(vmax2345, vmax01678, vl); + + vacc = __riscv_vmax(vacc, output_min, vl); + vacc = __riscv_vmin(vacc, output_max, vl); + __riscv_vse8_v_i8m1(o, vacc, vl); o += vl; + + c -= vl; + } while (c != 0); + + // Passes 1 - n: Max more inputs to the output. + for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 9) { + const int8_t* i0 = *i++; + const int8_t* i1 = 1 < k ? *i++ : i0; + const int8_t* i2 = 2 < k ? *i++ : i0; + const int8_t* i3 = 3 < k ? *i++ : i0; + const int8_t* i4 = 4 < k ? *i++ : i0; + const int8_t* i5 = 5 < k ? *i++ : i0; + const int8_t* i6 = 6 < k ? *i++ : i0; + const int8_t* i7 = 7 < k ? *i++ : i0; + const int8_t* i8 = 8 < k ? *i++ : i0; + i0 = (const int8_t*) ((uintptr_t) i0 + input_offset); + i1 = (const int8_t*) ((uintptr_t) i1 + input_offset); + i2 = (const int8_t*) ((uintptr_t) i2 + input_offset); + i3 = (const int8_t*) ((uintptr_t) i3 + input_offset); + i4 = (const int8_t*) ((uintptr_t) i4 + input_offset); + i5 = (const int8_t*) ((uintptr_t) i5 + input_offset); + i6 = (const int8_t*) ((uintptr_t) i6 + input_offset); + i7 = (const int8_t*) ((uintptr_t) i7 + input_offset); + i8 = (const int8_t*) ((uintptr_t) i8 + input_offset); + + o = output; + size_t c = channels; + do { + size_t vl = __riscv_vsetvl_e8m1(c); + + vint8m1_t vi0 = __riscv_vle8_v_i8m1(i0, vl); i0 += vl; + vint8m1_t vi1 = __riscv_vle8_v_i8m1(i1, vl); i1 += vl; + vint8m1_t vi2 = __riscv_vle8_v_i8m1(i2, vl); i2 += vl; + vint8m1_t vi3 = __riscv_vle8_v_i8m1(i3, vl); i3 += vl; + vint8m1_t vi4 = __riscv_vle8_v_i8m1(i4, vl); i4 += vl; + vint8m1_t vi5 = __riscv_vle8_v_i8m1(i5, vl); i5 += vl; + vint8m1_t vi6 = __riscv_vle8_v_i8m1(i6, vl); i6 += vl; + vint8m1_t vi7 = __riscv_vle8_v_i8m1(i7, vl); i7 += vl; + vint8m1_t vi8 = __riscv_vle8_v_i8m1(i8, vl); i8 += vl; + + vint8m1_t vprev = __riscv_vle8_v_i8m1(o, vl); + + vint8m1_t vmax01 = __riscv_vmax(vi0, vi1, vl); + vint8m1_t vmax23 = __riscv_vmax(vi2, vi3, vl); + vint8m1_t vmax45 = __riscv_vmax(vi4, vi5, vl); + vint8m1_t vmax67 = __riscv_vmax(vi6, vi7, vl); + vint8m1_t vmax018 = __riscv_vmax(vmax01, vi8, vl); + + vint8m1_t vmax2345 = __riscv_vmax(vmax23, vmax45, vl); + vint8m1_t vmax01678 = __riscv_vmax(vmax67, vmax018, vl); + vint8m1_t vmax012345678 = __riscv_vmax(vmax2345, vmax01678, vl); + + vint8m1_t vacc = __riscv_vmax(vprev, vmax012345678, vl); + vacc = __riscv_vmin(vacc, output_max, vl); + __riscv_vse8_v_i8m1(o, vacc, vl); o += vl; + + c -= vl; + } while (c != 0); + } + input = (const int8_t**) ((uintptr_t) input + input_increment); + input_offset += input_pixel_stride; + output = (int8_t*) ((uintptr_t) output + output_increment); + } while (--output_pixels != 0); +} diff --git a/src/s8-maxpool/gen/s8-maxpool-9p-minmax-rvv-u2v.c b/src/s8-maxpool/gen/s8-maxpool-9p-minmax-rvv-u2v.c new file mode 100644 index 00000000000..439ab947045 --- /dev/null +++ b/src/s8-maxpool/gen/s8-maxpool-9p-minmax-rvv-u2v.c @@ -0,0 +1,147 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f32-maxpool/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2026 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include "src/xnnpack/maxpool.h" +#include + + +void xnn_s8_maxpool_minmax_ukernel_9p__rvv_u2v( + size_t output_pixels, + size_t kernel_elements, + size_t channels, + const int8_t** input, + size_t input_offset, + size_t input_pixel_stride, + int8_t* output, + size_t input_increment, + size_t output_increment, + const struct xnn_s8_minmax_params* restrict params) +{ + assert(output_pixels != 0); + assert(kernel_elements != 0); + assert(channels != 0); + + const int8_t output_min = params->scalar.min; + const int8_t output_max = params->scalar.max; + do { + const int8_t** i = input; + + // First pass: load the inputs, store the max pool in the output. + const int8_t* i0 = *i++; + const int8_t* i1 = 1 < kernel_elements ? *i++ : i0; + const int8_t* i2 = 2 < kernel_elements ? *i++ : i0; + const int8_t* i3 = 3 < kernel_elements ? *i++ : i0; + const int8_t* i4 = 4 < kernel_elements ? *i++ : i0; + const int8_t* i5 = 5 < kernel_elements ? *i++ : i0; + const int8_t* i6 = 6 < kernel_elements ? *i++ : i0; + const int8_t* i7 = 7 < kernel_elements ? *i++ : i0; + const int8_t* i8 = 8 < kernel_elements ? *i++ : i0; + i0 = (const int8_t*) ((uintptr_t) i0 + input_offset); + i1 = (const int8_t*) ((uintptr_t) i1 + input_offset); + i2 = (const int8_t*) ((uintptr_t) i2 + input_offset); + i3 = (const int8_t*) ((uintptr_t) i3 + input_offset); + i4 = (const int8_t*) ((uintptr_t) i4 + input_offset); + i5 = (const int8_t*) ((uintptr_t) i5 + input_offset); + i6 = (const int8_t*) ((uintptr_t) i6 + input_offset); + i7 = (const int8_t*) ((uintptr_t) i7 + input_offset); + i8 = (const int8_t*) ((uintptr_t) i8 + input_offset); + + int8_t* o = output; + size_t c = channels; + do { + size_t vl = __riscv_vsetvl_e8m2(c); + vint8m2_t vi0 = __riscv_vle8_v_i8m2(i0, vl); i0 += vl; + vint8m2_t vi1 = __riscv_vle8_v_i8m2(i1, vl); i1 += vl; + vint8m2_t vi2 = __riscv_vle8_v_i8m2(i2, vl); i2 += vl; + vint8m2_t vi3 = __riscv_vle8_v_i8m2(i3, vl); i3 += vl; + vint8m2_t vi4 = __riscv_vle8_v_i8m2(i4, vl); i4 += vl; + vint8m2_t vi5 = __riscv_vle8_v_i8m2(i5, vl); i5 += vl; + vint8m2_t vi6 = __riscv_vle8_v_i8m2(i6, vl); i6 += vl; + vint8m2_t vi7 = __riscv_vle8_v_i8m2(i7, vl); i7 += vl; + vint8m2_t vi8 = __riscv_vle8_v_i8m2(i8, vl); i8 += vl; + + vint8m2_t vmax01 = __riscv_vmax(vi0, vi1, vl); + vint8m2_t vmax23 = __riscv_vmax(vi2, vi3, vl); + vint8m2_t vmax45 = __riscv_vmax(vi4, vi5, vl); + vint8m2_t vmax67 = __riscv_vmax(vi6, vi7, vl); + vint8m2_t vmax018 = __riscv_vmax(vmax01, vi8, vl); + + vint8m2_t vmax2345 = __riscv_vmax(vmax23, vmax45, vl); + vint8m2_t vmax01678 = __riscv_vmax(vmax67, vmax018, vl); + vint8m2_t vacc = __riscv_vmax(vmax2345, vmax01678, vl); + + vacc = __riscv_vmax(vacc, output_min, vl); + vacc = __riscv_vmin(vacc, output_max, vl); + __riscv_vse8_v_i8m2(o, vacc, vl); o += vl; + + c -= vl; + } while (c != 0); + + // Passes 1 - n: Max more inputs to the output. + for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 9) { + const int8_t* i0 = *i++; + const int8_t* i1 = 1 < k ? *i++ : i0; + const int8_t* i2 = 2 < k ? *i++ : i0; + const int8_t* i3 = 3 < k ? *i++ : i0; + const int8_t* i4 = 4 < k ? *i++ : i0; + const int8_t* i5 = 5 < k ? *i++ : i0; + const int8_t* i6 = 6 < k ? *i++ : i0; + const int8_t* i7 = 7 < k ? *i++ : i0; + const int8_t* i8 = 8 < k ? *i++ : i0; + i0 = (const int8_t*) ((uintptr_t) i0 + input_offset); + i1 = (const int8_t*) ((uintptr_t) i1 + input_offset); + i2 = (const int8_t*) ((uintptr_t) i2 + input_offset); + i3 = (const int8_t*) ((uintptr_t) i3 + input_offset); + i4 = (const int8_t*) ((uintptr_t) i4 + input_offset); + i5 = (const int8_t*) ((uintptr_t) i5 + input_offset); + i6 = (const int8_t*) ((uintptr_t) i6 + input_offset); + i7 = (const int8_t*) ((uintptr_t) i7 + input_offset); + i8 = (const int8_t*) ((uintptr_t) i8 + input_offset); + + o = output; + size_t c = channels; + do { + size_t vl = __riscv_vsetvl_e8m2(c); + + vint8m2_t vi0 = __riscv_vle8_v_i8m2(i0, vl); i0 += vl; + vint8m2_t vi1 = __riscv_vle8_v_i8m2(i1, vl); i1 += vl; + vint8m2_t vi2 = __riscv_vle8_v_i8m2(i2, vl); i2 += vl; + vint8m2_t vi3 = __riscv_vle8_v_i8m2(i3, vl); i3 += vl; + vint8m2_t vi4 = __riscv_vle8_v_i8m2(i4, vl); i4 += vl; + vint8m2_t vi5 = __riscv_vle8_v_i8m2(i5, vl); i5 += vl; + vint8m2_t vi6 = __riscv_vle8_v_i8m2(i6, vl); i6 += vl; + vint8m2_t vi7 = __riscv_vle8_v_i8m2(i7, vl); i7 += vl; + vint8m2_t vi8 = __riscv_vle8_v_i8m2(i8, vl); i8 += vl; + + vint8m2_t vprev = __riscv_vle8_v_i8m2(o, vl); + + vint8m2_t vmax01 = __riscv_vmax(vi0, vi1, vl); + vint8m2_t vmax23 = __riscv_vmax(vi2, vi3, vl); + vint8m2_t vmax45 = __riscv_vmax(vi4, vi5, vl); + vint8m2_t vmax67 = __riscv_vmax(vi6, vi7, vl); + vint8m2_t vmax018 = __riscv_vmax(vmax01, vi8, vl); + + vint8m2_t vmax2345 = __riscv_vmax(vmax23, vmax45, vl); + vint8m2_t vmax01678 = __riscv_vmax(vmax67, vmax018, vl); + vint8m2_t vmax012345678 = __riscv_vmax(vmax2345, vmax01678, vl); + + vint8m2_t vacc = __riscv_vmax(vprev, vmax012345678, vl); + vacc = __riscv_vmin(vacc, output_max, vl); + __riscv_vse8_v_i8m2(o, vacc, vl); o += vl; + + c -= vl; + } while (c != 0); + } + input = (const int8_t**) ((uintptr_t) input + input_increment); + input_offset += input_pixel_stride; + output = (int8_t*) ((uintptr_t) output + output_increment); + } while (--output_pixels != 0); +} diff --git a/src/s8-maxpool/s8-maxpool-minmax.inc b/src/s8-maxpool/s8-maxpool-minmax.inc index 8c453aae27e..498299cb8d6 100644 --- a/src/s8-maxpool/s8-maxpool-minmax.inc +++ b/src/s8-maxpool/s8-maxpool-minmax.inc @@ -17,6 +17,9 @@ XNN_UKERNEL(xnn_arch_x86_sse4_1, xnn_s8_maxpool_minmax_ukernel_9p__sse41_u16, 16 XNN_UKERNEL(xnn_arch_none, xnn_s8_maxpool_minmax_ukernel_9p__wasmsimd_u16, 16, 9, int8_t, struct xnn_s8_minmax_params, xnn_init_s8_minmax_scalar_params) #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD -XNN_UKERNEL(xnn_arch_none, xnn_s8_maxpool_minmax_ukernel_9p__scalar_u1, 1, 9, int8_t, struct xnn_s8_minmax_params, xnn_init_s8_minmax_scalar_params) - +#if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR +XNN_UKERNEL(xnn_arch_riscv_vector, xnn_s8_maxpool_minmax_ukernel_9p__rvv_u1v, (1*xnn_init_hardware_config()->vlenb/sizeof(int8_t)), 9, int8_t, struct xnn_s8_minmax_params, xnn_init_s8_minmax_scalar_params) +XNN_UKERNEL(xnn_arch_riscv_vector, xnn_s8_maxpool_minmax_ukernel_9p__rvv_u2v, (2*xnn_init_hardware_config()->vlenb/sizeof(int8_t)), 9, int8_t, struct xnn_s8_minmax_params, xnn_init_s8_minmax_scalar_params) +#endif // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR +XNN_UKERNEL(xnn_arch_none, xnn_s8_maxpool_minmax_ukernel_9p__scalar_u1, 1, 9, int8_t, struct xnn_s8_minmax_params, xnn_init_s8_minmax_scalar_params) diff --git a/src/u8-maxpool/gen/u8-maxpool-9p-minmax-rvv-u1v.c b/src/u8-maxpool/gen/u8-maxpool-9p-minmax-rvv-u1v.c new file mode 100644 index 00000000000..c0561f3d658 --- /dev/null +++ b/src/u8-maxpool/gen/u8-maxpool-9p-minmax-rvv-u1v.c @@ -0,0 +1,147 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f32-maxpool/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2026 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include "src/xnnpack/maxpool.h" +#include + + +void xnn_u8_maxpool_minmax_ukernel_9p__rvv_u1v( + size_t output_pixels, + size_t kernel_elements, + size_t channels, + const uint8_t** input, + size_t input_offset, + size_t input_pixel_stride, + uint8_t* output, + size_t input_increment, + size_t output_increment, + const struct xnn_u8_minmax_params* restrict params) +{ + assert(output_pixels != 0); + assert(kernel_elements != 0); + assert(channels != 0); + + const uint8_t output_min = params->scalar.min; + const uint8_t output_max = params->scalar.max; + do { + const uint8_t** i = input; + + // First pass: load the inputs, store the max pool in the output. + const uint8_t* i0 = *i++; + const uint8_t* i1 = 1 < kernel_elements ? *i++ : i0; + const uint8_t* i2 = 2 < kernel_elements ? *i++ : i0; + const uint8_t* i3 = 3 < kernel_elements ? *i++ : i0; + const uint8_t* i4 = 4 < kernel_elements ? *i++ : i0; + const uint8_t* i5 = 5 < kernel_elements ? *i++ : i0; + const uint8_t* i6 = 6 < kernel_elements ? *i++ : i0; + const uint8_t* i7 = 7 < kernel_elements ? *i++ : i0; + const uint8_t* i8 = 8 < kernel_elements ? *i++ : i0; + i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset); + i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset); + i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset); + i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset); + i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset); + i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset); + i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset); + i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset); + i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset); + + uint8_t* o = output; + size_t c = channels; + do { + size_t vl = __riscv_vsetvl_e8m1(c); + vuint8m1_t vi0 = __riscv_vle8_v_u8m1(i0, vl); i0 += vl; + vuint8m1_t vi1 = __riscv_vle8_v_u8m1(i1, vl); i1 += vl; + vuint8m1_t vi2 = __riscv_vle8_v_u8m1(i2, vl); i2 += vl; + vuint8m1_t vi3 = __riscv_vle8_v_u8m1(i3, vl); i3 += vl; + vuint8m1_t vi4 = __riscv_vle8_v_u8m1(i4, vl); i4 += vl; + vuint8m1_t vi5 = __riscv_vle8_v_u8m1(i5, vl); i5 += vl; + vuint8m1_t vi6 = __riscv_vle8_v_u8m1(i6, vl); i6 += vl; + vuint8m1_t vi7 = __riscv_vle8_v_u8m1(i7, vl); i7 += vl; + vuint8m1_t vi8 = __riscv_vle8_v_u8m1(i8, vl); i8 += vl; + + vuint8m1_t vmax01 = __riscv_vmaxu(vi0, vi1, vl); + vuint8m1_t vmax23 = __riscv_vmaxu(vi2, vi3, vl); + vuint8m1_t vmax45 = __riscv_vmaxu(vi4, vi5, vl); + vuint8m1_t vmax67 = __riscv_vmaxu(vi6, vi7, vl); + vuint8m1_t vmax018 = __riscv_vmaxu(vmax01, vi8, vl); + + vuint8m1_t vmax2345 = __riscv_vmaxu(vmax23, vmax45, vl); + vuint8m1_t vmax01678 = __riscv_vmaxu(vmax67, vmax018, vl); + vuint8m1_t vacc = __riscv_vmaxu(vmax2345, vmax01678, vl); + + vacc = __riscv_vmaxu(vacc, output_min, vl); + vacc = __riscv_vminu(vacc, output_max, vl); + __riscv_vse8_v_u8m1(o, vacc, vl); o += vl; + + c -= vl; + } while (c != 0); + + // Passes 1 - n: Max more inputs to the output. + for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 9) { + const uint8_t* i0 = *i++; + const uint8_t* i1 = 1 < k ? *i++ : i0; + const uint8_t* i2 = 2 < k ? *i++ : i0; + const uint8_t* i3 = 3 < k ? *i++ : i0; + const uint8_t* i4 = 4 < k ? *i++ : i0; + const uint8_t* i5 = 5 < k ? *i++ : i0; + const uint8_t* i6 = 6 < k ? *i++ : i0; + const uint8_t* i7 = 7 < k ? *i++ : i0; + const uint8_t* i8 = 8 < k ? *i++ : i0; + i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset); + i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset); + i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset); + i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset); + i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset); + i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset); + i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset); + i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset); + i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset); + + o = output; + size_t c = channels; + do { + size_t vl = __riscv_vsetvl_e8m1(c); + + vuint8m1_t vi0 = __riscv_vle8_v_u8m1(i0, vl); i0 += vl; + vuint8m1_t vi1 = __riscv_vle8_v_u8m1(i1, vl); i1 += vl; + vuint8m1_t vi2 = __riscv_vle8_v_u8m1(i2, vl); i2 += vl; + vuint8m1_t vi3 = __riscv_vle8_v_u8m1(i3, vl); i3 += vl; + vuint8m1_t vi4 = __riscv_vle8_v_u8m1(i4, vl); i4 += vl; + vuint8m1_t vi5 = __riscv_vle8_v_u8m1(i5, vl); i5 += vl; + vuint8m1_t vi6 = __riscv_vle8_v_u8m1(i6, vl); i6 += vl; + vuint8m1_t vi7 = __riscv_vle8_v_u8m1(i7, vl); i7 += vl; + vuint8m1_t vi8 = __riscv_vle8_v_u8m1(i8, vl); i8 += vl; + + vuint8m1_t vprev = __riscv_vle8_v_u8m1(o, vl); + + vuint8m1_t vmax01 = __riscv_vmaxu(vi0, vi1, vl); + vuint8m1_t vmax23 = __riscv_vmaxu(vi2, vi3, vl); + vuint8m1_t vmax45 = __riscv_vmaxu(vi4, vi5, vl); + vuint8m1_t vmax67 = __riscv_vmaxu(vi6, vi7, vl); + vuint8m1_t vmax018 = __riscv_vmaxu(vmax01, vi8, vl); + + vuint8m1_t vmax2345 = __riscv_vmaxu(vmax23, vmax45, vl); + vuint8m1_t vmax01678 = __riscv_vmaxu(vmax67, vmax018, vl); + vuint8m1_t vmax012345678 = __riscv_vmaxu(vmax2345, vmax01678, vl); + + vuint8m1_t vacc = __riscv_vmaxu(vprev, vmax012345678, vl); + vacc = __riscv_vminu(vacc, output_max, vl); + __riscv_vse8_v_u8m1(o, vacc, vl); o += vl; + + c -= vl; + } while (c != 0); + } + input = (const uint8_t**) ((uintptr_t) input + input_increment); + input_offset += input_pixel_stride; + output = (uint8_t*) ((uintptr_t) output + output_increment); + } while (--output_pixels != 0); +} diff --git a/src/u8-maxpool/gen/u8-maxpool-9p-minmax-rvv-u2v.c b/src/u8-maxpool/gen/u8-maxpool-9p-minmax-rvv-u2v.c new file mode 100644 index 00000000000..b1609fb0ef5 --- /dev/null +++ b/src/u8-maxpool/gen/u8-maxpool-9p-minmax-rvv-u2v.c @@ -0,0 +1,147 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f32-maxpool/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2026 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include "src/xnnpack/maxpool.h" +#include + + +void xnn_u8_maxpool_minmax_ukernel_9p__rvv_u2v( + size_t output_pixels, + size_t kernel_elements, + size_t channels, + const uint8_t** input, + size_t input_offset, + size_t input_pixel_stride, + uint8_t* output, + size_t input_increment, + size_t output_increment, + const struct xnn_u8_minmax_params* restrict params) +{ + assert(output_pixels != 0); + assert(kernel_elements != 0); + assert(channels != 0); + + const uint8_t output_min = params->scalar.min; + const uint8_t output_max = params->scalar.max; + do { + const uint8_t** i = input; + + // First pass: load the inputs, store the max pool in the output. + const uint8_t* i0 = *i++; + const uint8_t* i1 = 1 < kernel_elements ? *i++ : i0; + const uint8_t* i2 = 2 < kernel_elements ? *i++ : i0; + const uint8_t* i3 = 3 < kernel_elements ? *i++ : i0; + const uint8_t* i4 = 4 < kernel_elements ? *i++ : i0; + const uint8_t* i5 = 5 < kernel_elements ? *i++ : i0; + const uint8_t* i6 = 6 < kernel_elements ? *i++ : i0; + const uint8_t* i7 = 7 < kernel_elements ? *i++ : i0; + const uint8_t* i8 = 8 < kernel_elements ? *i++ : i0; + i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset); + i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset); + i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset); + i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset); + i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset); + i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset); + i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset); + i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset); + i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset); + + uint8_t* o = output; + size_t c = channels; + do { + size_t vl = __riscv_vsetvl_e8m2(c); + vuint8m2_t vi0 = __riscv_vle8_v_u8m2(i0, vl); i0 += vl; + vuint8m2_t vi1 = __riscv_vle8_v_u8m2(i1, vl); i1 += vl; + vuint8m2_t vi2 = __riscv_vle8_v_u8m2(i2, vl); i2 += vl; + vuint8m2_t vi3 = __riscv_vle8_v_u8m2(i3, vl); i3 += vl; + vuint8m2_t vi4 = __riscv_vle8_v_u8m2(i4, vl); i4 += vl; + vuint8m2_t vi5 = __riscv_vle8_v_u8m2(i5, vl); i5 += vl; + vuint8m2_t vi6 = __riscv_vle8_v_u8m2(i6, vl); i6 += vl; + vuint8m2_t vi7 = __riscv_vle8_v_u8m2(i7, vl); i7 += vl; + vuint8m2_t vi8 = __riscv_vle8_v_u8m2(i8, vl); i8 += vl; + + vuint8m2_t vmax01 = __riscv_vmaxu(vi0, vi1, vl); + vuint8m2_t vmax23 = __riscv_vmaxu(vi2, vi3, vl); + vuint8m2_t vmax45 = __riscv_vmaxu(vi4, vi5, vl); + vuint8m2_t vmax67 = __riscv_vmaxu(vi6, vi7, vl); + vuint8m2_t vmax018 = __riscv_vmaxu(vmax01, vi8, vl); + + vuint8m2_t vmax2345 = __riscv_vmaxu(vmax23, vmax45, vl); + vuint8m2_t vmax01678 = __riscv_vmaxu(vmax67, vmax018, vl); + vuint8m2_t vacc = __riscv_vmaxu(vmax2345, vmax01678, vl); + + vacc = __riscv_vmaxu(vacc, output_min, vl); + vacc = __riscv_vminu(vacc, output_max, vl); + __riscv_vse8_v_u8m2(o, vacc, vl); o += vl; + + c -= vl; + } while (c != 0); + + // Passes 1 - n: Max more inputs to the output. + for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 9) { + const uint8_t* i0 = *i++; + const uint8_t* i1 = 1 < k ? *i++ : i0; + const uint8_t* i2 = 2 < k ? *i++ : i0; + const uint8_t* i3 = 3 < k ? *i++ : i0; + const uint8_t* i4 = 4 < k ? *i++ : i0; + const uint8_t* i5 = 5 < k ? *i++ : i0; + const uint8_t* i6 = 6 < k ? *i++ : i0; + const uint8_t* i7 = 7 < k ? *i++ : i0; + const uint8_t* i8 = 8 < k ? *i++ : i0; + i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset); + i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset); + i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset); + i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset); + i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset); + i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset); + i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset); + i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset); + i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset); + + o = output; + size_t c = channels; + do { + size_t vl = __riscv_vsetvl_e8m2(c); + + vuint8m2_t vi0 = __riscv_vle8_v_u8m2(i0, vl); i0 += vl; + vuint8m2_t vi1 = __riscv_vle8_v_u8m2(i1, vl); i1 += vl; + vuint8m2_t vi2 = __riscv_vle8_v_u8m2(i2, vl); i2 += vl; + vuint8m2_t vi3 = __riscv_vle8_v_u8m2(i3, vl); i3 += vl; + vuint8m2_t vi4 = __riscv_vle8_v_u8m2(i4, vl); i4 += vl; + vuint8m2_t vi5 = __riscv_vle8_v_u8m2(i5, vl); i5 += vl; + vuint8m2_t vi6 = __riscv_vle8_v_u8m2(i6, vl); i6 += vl; + vuint8m2_t vi7 = __riscv_vle8_v_u8m2(i7, vl); i7 += vl; + vuint8m2_t vi8 = __riscv_vle8_v_u8m2(i8, vl); i8 += vl; + + vuint8m2_t vprev = __riscv_vle8_v_u8m2(o, vl); + + vuint8m2_t vmax01 = __riscv_vmaxu(vi0, vi1, vl); + vuint8m2_t vmax23 = __riscv_vmaxu(vi2, vi3, vl); + vuint8m2_t vmax45 = __riscv_vmaxu(vi4, vi5, vl); + vuint8m2_t vmax67 = __riscv_vmaxu(vi6, vi7, vl); + vuint8m2_t vmax018 = __riscv_vmaxu(vmax01, vi8, vl); + + vuint8m2_t vmax2345 = __riscv_vmaxu(vmax23, vmax45, vl); + vuint8m2_t vmax01678 = __riscv_vmaxu(vmax67, vmax018, vl); + vuint8m2_t vmax012345678 = __riscv_vmaxu(vmax2345, vmax01678, vl); + + vuint8m2_t vacc = __riscv_vmaxu(vprev, vmax012345678, vl); + vacc = __riscv_vminu(vacc, output_max, vl); + __riscv_vse8_v_u8m2(o, vacc, vl); o += vl; + + c -= vl; + } while (c != 0); + } + input = (const uint8_t**) ((uintptr_t) input + input_increment); + input_offset += input_pixel_stride; + output = (uint8_t*) ((uintptr_t) output + output_increment); + } while (--output_pixels != 0); +} diff --git a/src/u8-maxpool/u8-maxpool-minmax.inc b/src/u8-maxpool/u8-maxpool-minmax.inc index 94e651a04ea..f090da6c6c4 100644 --- a/src/u8-maxpool/u8-maxpool-minmax.inc +++ b/src/u8-maxpool/u8-maxpool-minmax.inc @@ -17,6 +17,9 @@ XNN_UKERNEL(xnn_arch_x86_sse2, xnn_u8_maxpool_minmax_ukernel_9p__sse2_u16, 16, 9 XNN_UKERNEL(xnn_arch_none, xnn_u8_maxpool_minmax_ukernel_9p__wasmsimd_u16, 16, 9, uint8_t, struct xnn_u8_minmax_params, xnn_init_u8_minmax_scalar_params) #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD -XNN_UKERNEL(xnn_arch_none, xnn_u8_maxpool_minmax_ukernel_9p__scalar_u1, 1, 9, uint8_t, struct xnn_u8_minmax_params, xnn_init_u8_minmax_scalar_params) - +#if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR +XNN_UKERNEL(xnn_arch_riscv_vector, xnn_u8_maxpool_minmax_ukernel_9p__rvv_u1v, (1*xnn_init_hardware_config()->vlenb/sizeof(uint8_t)), 9, uint8_t, struct xnn_u8_minmax_params, xnn_init_u8_minmax_scalar_params) +XNN_UKERNEL(xnn_arch_riscv_vector, xnn_u8_maxpool_minmax_ukernel_9p__rvv_u2v, (2*xnn_init_hardware_config()->vlenb/sizeof(uint8_t)), 9, uint8_t, struct xnn_u8_minmax_params, xnn_init_u8_minmax_scalar_params) +#endif // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR +XNN_UKERNEL(xnn_arch_none, xnn_u8_maxpool_minmax_ukernel_9p__scalar_u1, 1, 9, uint8_t, struct xnn_u8_minmax_params, xnn_init_u8_minmax_scalar_params) From afaabc20c8ce4df867fa95ce6eb2a5027fc4b301 Mon Sep 17 00:00:00 2001 From: Ken Unger Date: Mon, 2 Mar 2026 22:59:20 -0800 Subject: [PATCH 2/2] cleanup rvv maxpool, add rvv avgpool --- cmake/gen/rvv_microkernels.cmake | 4 +- cmake/gen/rvvfp16arith_microkernels.cmake | 2 +- gen/rvv_microkernels.bzl | 4 +- gen/rvvfp16arith_microkernels.bzl | 2 +- scripts/generate-f16-avgpool.sh | 3 + scripts/generate-f16-maxpool.sh | 1 - scripts/generate-f32-avgpool.sh | 3 + scripts/generate-f32-maxpool.sh | 1 - scripts/generate-s8-maxpool.sh | 1 - scripts/generate-u8-maxpool.sh | 1 - src/configs/avgpool-config.c | 17 ++ src/f16-avgpool/f16-avgpool-minmax.inc | 4 + .../f16-avgpool-9p-minmax-rvvfp16arith-u2v.c | 216 ++++++++++++++++++ src/f16-maxpool/f16-maxpool-minmax.inc | 1 - .../f16-maxpool-9p-minmax-rvvfp16arith-u1v.c | 147 ------------ src/f32-avgpool/f32-avgpool-minmax.inc | 4 + .../gen/f32-avgpool-9p-minmax-rvv-u2v.c | 216 ++++++++++++++++++ src/f32-avgpool/rvv.c.in | 127 ++++++++++ src/f32-maxpool/f32-maxpool-minmax.inc | 1 - .../gen/f32-maxpool-9p-minmax-rvv-u1v.c | 147 ------------ .../gen/s8-maxpool-9p-minmax-rvv-u1v.c | 147 ------------ src/s8-maxpool/s8-maxpool-minmax.inc | 1 - .../gen/u8-maxpool-9p-minmax-rvv-u1v.c | 147 ------------ src/u8-maxpool/u8-maxpool-minmax.inc | 1 - 24 files changed, 594 insertions(+), 604 deletions(-) create mode 100644 src/f16-avgpool/gen/f16-avgpool-9p-minmax-rvvfp16arith-u2v.c delete mode 100644 src/f16-maxpool/gen/f16-maxpool-9p-minmax-rvvfp16arith-u1v.c create mode 100644 src/f32-avgpool/gen/f32-avgpool-9p-minmax-rvv-u2v.c create mode 100644 src/f32-avgpool/rvv.c.in delete mode 100644 src/f32-maxpool/gen/f32-maxpool-9p-minmax-rvv-u1v.c delete mode 100644 src/s8-maxpool/gen/s8-maxpool-9p-minmax-rvv-u1v.c delete mode 100644 src/u8-maxpool/gen/u8-maxpool-9p-minmax-rvv-u1v.c diff --git a/cmake/gen/rvv_microkernels.cmake b/cmake/gen/rvv_microkernels.cmake index 04d5e143861..7307f77ddeb 100644 --- a/cmake/gen/rvv_microkernels.cmake +++ b/cmake/gen/rvv_microkernels.cmake @@ -11,6 +11,7 @@ SET(PROD_RVV_MICROKERNEL_SRCS src/f32-argmaxpool/f32-argmaxpool-9p8x-rvv-u1v.c + src/f32-avgpool/gen/f32-avgpool-9p-minmax-rvv-u2v.c src/f32-conv-hwc2chw/f32-conv-hwc2chw-3x3s2p1c3x2v-rvv-2x2.c src/f32-dwconv/gen/f32-dwconv-3p8vc-minmax-rvv.c src/f32-dwconv/gen/f32-dwconv-3p8vc-rvv.c @@ -138,7 +139,6 @@ SET(NON_PROD_RVV_MICROKERNEL_SRCS src/f32-igemm/gen/f32-igemm-1x4v-rvv.c src/f32-igemm/gen/f32-igemm-7x4v-relu-rvv.c src/f32-igemm/gen/f32-igemm-7x4v-rvv.c - src/f32-maxpool/gen/f32-maxpool-9p-minmax-rvv-u1v.c src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u1v.c src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u4v.c src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u8v.c @@ -260,11 +260,9 @@ SET(NON_PROD_RVV_MICROKERNEL_SRCS src/qu8-vlrelu/gen/qu8-vlrelu-rvv-u1v.c src/qu8-vmul/gen/qu8-vmul-minmax-f32-rvv-u1v.c src/qu8-vmulc/gen/qu8-vmulc-minmax-f32-rvv-u1v.c - src/s8-maxpool/gen/s8-maxpool-9p-minmax-rvv-u1v.c src/s8-vclamp/gen/s8-vclamp-rvv-u1v.c src/s8-vclamp/gen/s8-vclamp-rvv-u2v.c src/s8-vclamp/gen/s8-vclamp-rvv-u8v.c - src/u8-maxpool/gen/u8-maxpool-9p-minmax-rvv-u1v.c src/u8-vclamp/gen/u8-vclamp-rvv-u1v.c src/u8-vclamp/gen/u8-vclamp-rvv-u2v.c src/u8-vclamp/gen/u8-vclamp-rvv-u8v.c diff --git a/cmake/gen/rvvfp16arith_microkernels.cmake b/cmake/gen/rvvfp16arith_microkernels.cmake index ac5f45c7724..1780d593920 100644 --- a/cmake/gen/rvvfp16arith_microkernels.cmake +++ b/cmake/gen/rvvfp16arith_microkernels.cmake @@ -10,6 +10,7 @@ SET(PROD_RVVFP16ARITH_MICROKERNEL_SRCS + src/f16-avgpool/gen/f16-avgpool-9p-minmax-rvvfp16arith-u2v.c src/f16-dwconv/gen/f16-dwconv-3p8vc-minmax-rvvfp16arith.c src/f16-dwconv/gen/f16-dwconv-4p8vc-minmax-rvvfp16arith.c src/f16-dwconv/gen/f16-dwconv-9p8vc-minmax-rvvfp16arith.c @@ -55,7 +56,6 @@ SET(NON_PROD_RVVFP16ARITH_MICROKERNEL_SRCS src/f16-f32-vcvt/gen/f16-f32-vcvt-rvvfp16arith-u2v.c src/f16-gemm/gen/f16-gemm-4x4v-minmax-rvvfp16arith.c src/f16-igemm/gen/f16-igemm-4x4v-minmax-rvvfp16arith.c - src/f16-maxpool/gen/f16-maxpool-9p-minmax-rvvfp16arith-u1v.c src/f16-spmm/gen/f16-spmm-1vx1-minmax-rvvfp16arith.c src/f16-spmm/gen/f16-spmm-2vx1-minmax-rvvfp16arith.c src/f16-spmm/gen/f16-spmm-4vx1-minmax-rvvfp16arith.c diff --git a/gen/rvv_microkernels.bzl b/gen/rvv_microkernels.bzl index fb7468b48b3..335ed2f7d4a 100644 --- a/gen/rvv_microkernels.bzl +++ b/gen/rvv_microkernels.bzl @@ -7,6 +7,7 @@ PROD_RVV_MICROKERNEL_SRCS = [ "src/f32-argmaxpool/f32-argmaxpool-9p8x-rvv-u1v.c", + "src/f32-avgpool/gen/f32-avgpool-9p-minmax-rvv-u2v.c", "src/f32-conv-hwc2chw/f32-conv-hwc2chw-3x3s2p1c3x2v-rvv-2x2.c", "src/f32-dwconv/gen/f32-dwconv-3p8vc-minmax-rvv.c", "src/f32-dwconv/gen/f32-dwconv-3p8vc-rvv.c", @@ -135,7 +136,6 @@ NON_PROD_RVV_MICROKERNEL_SRCS = [ "src/f32-igemm/gen/f32-igemm-1x4v-rvv.c", "src/f32-igemm/gen/f32-igemm-7x4v-relu-rvv.c", "src/f32-igemm/gen/f32-igemm-7x4v-rvv.c", - "src/f32-maxpool/gen/f32-maxpool-9p-minmax-rvv-u1v.c", "src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u1v.c", "src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u4v.c", "src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u8v.c", @@ -257,11 +257,9 @@ NON_PROD_RVV_MICROKERNEL_SRCS = [ "src/qu8-vlrelu/gen/qu8-vlrelu-rvv-u1v.c", "src/qu8-vmul/gen/qu8-vmul-minmax-f32-rvv-u1v.c", "src/qu8-vmulc/gen/qu8-vmulc-minmax-f32-rvv-u1v.c", - "src/s8-maxpool/gen/s8-maxpool-9p-minmax-rvv-u1v.c", "src/s8-vclamp/gen/s8-vclamp-rvv-u1v.c", "src/s8-vclamp/gen/s8-vclamp-rvv-u2v.c", "src/s8-vclamp/gen/s8-vclamp-rvv-u8v.c", - "src/u8-maxpool/gen/u8-maxpool-9p-minmax-rvv-u1v.c", "src/u8-vclamp/gen/u8-vclamp-rvv-u1v.c", "src/u8-vclamp/gen/u8-vclamp-rvv-u2v.c", "src/u8-vclamp/gen/u8-vclamp-rvv-u8v.c", diff --git a/gen/rvvfp16arith_microkernels.bzl b/gen/rvvfp16arith_microkernels.bzl index acbae4d7c04..c007293ecb4 100644 --- a/gen/rvvfp16arith_microkernels.bzl +++ b/gen/rvvfp16arith_microkernels.bzl @@ -6,6 +6,7 @@ # PROD_RVVFP16ARITH_MICROKERNEL_SRCS = [ + "src/f16-avgpool/gen/f16-avgpool-9p-minmax-rvvfp16arith-u2v.c", "src/f16-dwconv/gen/f16-dwconv-3p8vc-minmax-rvvfp16arith.c", "src/f16-dwconv/gen/f16-dwconv-4p8vc-minmax-rvvfp16arith.c", "src/f16-dwconv/gen/f16-dwconv-9p8vc-minmax-rvvfp16arith.c", @@ -52,7 +53,6 @@ NON_PROD_RVVFP16ARITH_MICROKERNEL_SRCS = [ "src/f16-f32-vcvt/gen/f16-f32-vcvt-rvvfp16arith-u2v.c", "src/f16-gemm/gen/f16-gemm-4x4v-minmax-rvvfp16arith.c", "src/f16-igemm/gen/f16-igemm-4x4v-minmax-rvvfp16arith.c", - "src/f16-maxpool/gen/f16-maxpool-9p-minmax-rvvfp16arith-u1v.c", "src/f16-spmm/gen/f16-spmm-1vx1-minmax-rvvfp16arith.c", "src/f16-spmm/gen/f16-spmm-2vx1-minmax-rvvfp16arith.c", "src/f16-spmm/gen/f16-spmm-4vx1-minmax-rvvfp16arith.c", diff --git a/scripts/generate-f16-avgpool.sh b/scripts/generate-f16-avgpool.sh index b9bf469fc21..32fcaeb0207 100755 --- a/scripts/generate-f16-avgpool.sh +++ b/scripts/generate-f16-avgpool.sh @@ -10,4 +10,7 @@ tools/xngen src/f32-avgpool/avgpool.c.in -D ARCH=neonfp16arith -D DATATYPE=f16 - ##################################### f16c ##################################### tools/xngen src/f16-avgpool/f16c.c.in -D SIMD_SIZE=8 -o src/f16-avgpool/gen/f16-avgpool-9p-minmax-f16c.c & +################################ RISC-V Vector ################################# +tools/xngen src/f32-avgpool/rvv.c.in -D DATATYPE=f16 -D LMUL=2 -o src/f16-avgpool/gen/f16-avgpool-9p-minmax-rvvfp16arith-u2v.c & + wait diff --git a/scripts/generate-f16-maxpool.sh b/scripts/generate-f16-maxpool.sh index 9bfb490c2cc..0c3b041ea09 100755 --- a/scripts/generate-f16-maxpool.sh +++ b/scripts/generate-f16-maxpool.sh @@ -10,7 +10,6 @@ tools/xngen src/f32-maxpool/maxpool.c.in -D DATATYPE=f16 -D ARCH=avx2 -D SIMD_SI tools/xngen src/f32-maxpool/maxpool.c.in -D DATATYPE=f16 -D ARCH=sse41 -D SIMD_SIZE=8 -o src/f16-maxpool/gen/f16-maxpool-9p-minmax-sse41-u8.c & ################################ RISC-V Vector ################################ -tools/xngen src/f32-maxpool/rvv.c.in -D DATATYPE=f16 -D LMUL=1 -o src/f16-maxpool/gen/f16-maxpool-9p-minmax-rvvfp16arith-u1v.c & tools/xngen src/f32-maxpool/rvv.c.in -D DATATYPE=f16 -D LMUL=2 -o src/f16-maxpool/gen/f16-maxpool-9p-minmax-rvvfp16arith-u2v.c & wait diff --git a/scripts/generate-f32-avgpool.sh b/scripts/generate-f32-avgpool.sh index c59e51ddc8d..454f4957e77 100755 --- a/scripts/generate-f32-avgpool.sh +++ b/scripts/generate-f32-avgpool.sh @@ -13,4 +13,7 @@ tools/xngen src/f32-avgpool/avgpool.c.in -D ARCH=avx -D DATATYPE=f32 -D SIM tools/xngen src/f32-avgpool/avgpool.c.in -D ARCH=avx512f -D DATATYPE=f32 -D SIMD_SIZE=16 -o src/f32-avgpool/gen/f32-avgpool-9p-minmax-avx512f-u16.c & tools/xngen src/f32-avgpool/avgpool.c.in -D ARCH=hvx -D DATATYPE=f32 -D SIMD_SIZE=32 -o src/f32-avgpool/gen/f32-avgpool-9p-minmax-hvx-u32.c & +################################ RISC-V Vector ################################ +tools/xngen src/f32-avgpool/rvv.c.in -D DATATYPE=f32 -D LMUL=2 -o src/f32-avgpool/gen/f32-avgpool-9p-minmax-rvv-u2v.c & + wait diff --git a/scripts/generate-f32-maxpool.sh b/scripts/generate-f32-maxpool.sh index 4c15f633750..322f3151ec9 100755 --- a/scripts/generate-f32-maxpool.sh +++ b/scripts/generate-f32-maxpool.sh @@ -12,7 +12,6 @@ tools/xngen src/f32-maxpool/maxpool.c.in -D DATATYPE=f32 -D ARCH=neon -D SIM tools/xngen src/f32-maxpool/maxpool.c.in -D DATATYPE=f32 -D ARCH=hvx -D SIMD_SIZE=32 -o src/f32-maxpool/gen/f32-maxpool-9p-minmax-hvx-u32.c & ################################ RISC-V Vector ################################ -tools/xngen src/f32-maxpool/rvv.c.in -D DATATYPE=f32 -D LMUL=1 -o src/f32-maxpool/gen/f32-maxpool-9p-minmax-rvv-u1v.c & tools/xngen src/f32-maxpool/rvv.c.in -D DATATYPE=f32 -D LMUL=2 -o src/f32-maxpool/gen/f32-maxpool-9p-minmax-rvv-u2v.c & wait diff --git a/scripts/generate-s8-maxpool.sh b/scripts/generate-s8-maxpool.sh index 6b379899ffd..84c32a10663 100755 --- a/scripts/generate-s8-maxpool.sh +++ b/scripts/generate-s8-maxpool.sh @@ -11,7 +11,6 @@ tools/xngen src/f32-maxpool/maxpool.c.in -D DATATYPE=s8 -D ARCH=wasmsimd -D SIMD tools/xngen src/f32-maxpool/maxpool.c.in -D DATATYPE=s8 -D ARCH=neon -D SIMD_SIZE=16 -o src/s8-maxpool/gen/s8-maxpool-9p-minmax-neon-u16.c & ################################ RISC-V Vector ################################# -tools/xngen src/f32-maxpool/rvv.c.in -D DATATYPE=s8 -D LMUL=1 -o src/s8-maxpool/gen/s8-maxpool-9p-minmax-rvv-u1v.c & tools/xngen src/f32-maxpool/rvv.c.in -D DATATYPE=s8 -D LMUL=2 -o src/s8-maxpool/gen/s8-maxpool-9p-minmax-rvv-u2v.c & wait diff --git a/scripts/generate-u8-maxpool.sh b/scripts/generate-u8-maxpool.sh index aa1b1f66faf..6a52b039fa9 100755 --- a/scripts/generate-u8-maxpool.sh +++ b/scripts/generate-u8-maxpool.sh @@ -11,7 +11,6 @@ tools/xngen src/f32-maxpool/maxpool.c.in -D DATATYPE=u8 -D KERNEL_TILE=9 -D ARCH tools/xngen src/f32-maxpool/maxpool.c.in -D DATATYPE=u8 -D KERNEL_TILE=9 -D ARCH=neon -D SIMD_SIZE=16 -o src/u8-maxpool/gen/u8-maxpool-9p-minmax-neon-u16.c & ################################ RISC-V Vector ################################# -tools/xngen src/f32-maxpool/rvv.c.in -D DATATYPE=u8 -D LMUL=1 -o src/u8-maxpool/gen/u8-maxpool-9p-minmax-rvv-u1v.c & tools/xngen src/f32-maxpool/rvv.c.in -D DATATYPE=u8 -D LMUL=2 -o src/u8-maxpool/gen/u8-maxpool-9p-minmax-rvv-u2v.c & wait diff --git a/src/configs/avgpool-config.c b/src/configs/avgpool-config.c index 79bca26a612..40493569ec0 100644 --- a/src/configs/avgpool-config.c +++ b/src/configs/avgpool-config.c @@ -51,6 +51,16 @@ static void init_f16_avgpool_config(void) { } else #endif ; // no f16 support + #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + assert(hardware_config != NULL); + (void) hardware_config; // May be unused. + if (hardware_config->arch_flags & xnn_arch_riscv_vector_fp16_arith) { + f16_avgpool_config.ukernel = XNN_INIT_AVGPOOL_UKERNEL(xnn_f16_avgpool_minmax_ukernel_9p__rvvfp16arith_u2v); + f16_avgpool_config.init.f16 = xnn_init_f16_scaleminmax_scalar_params; + f16_avgpool_config.primary_tile = 9; + f16_avgpool_config.channel_tile = 2 * hardware_config->vlenb / sizeof(xnn_float16); + } #endif } @@ -123,6 +133,13 @@ static void init_f32_avgpool_config(void) { f32_avgpool_config.primary_tile = 9; f32_avgpool_config.channel_tile = 32; } + #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + assert(hardware_config != NULL); + f32_avgpool_config.ukernel = XNN_INIT_AVGPOOL_UKERNEL(xnn_f32_avgpool_minmax_ukernel_9p__rvv_u2v); + f32_avgpool_config.init.f32 = xnn_init_f32_scaleminmax_scalar_params; + f32_avgpool_config.primary_tile = 9; + f32_avgpool_config.channel_tile = 2 * hardware_config->vlenb / sizeof(float); #else f32_avgpool_config.ukernel = XNN_INIT_AVGPOOL_UKERNEL(xnn_f32_avgpool_minmax_ukernel_9p__scalar_u1); f32_avgpool_config.init.f32 = xnn_init_f32_scaleminmax_scalar_params; diff --git a/src/f16-avgpool/f16-avgpool-minmax.inc b/src/f16-avgpool/f16-avgpool-minmax.inc index c099013791e..cba9b2d1370 100644 --- a/src/f16-avgpool/f16-avgpool-minmax.inc +++ b/src/f16-avgpool/f16-avgpool-minmax.inc @@ -14,3 +14,7 @@ XNN_UKERNEL(xnn_arch_arm_neon_fp16_arith, xnn_f16_avgpool_minmax_ukernel_9p__neo #if XNN_ENABLE_F16C && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL(xnn_arch_x86_f16c, xnn_f16_avgpool_minmax_ukernel_9p__f16c_u8, 8, 9, xnn_float16, struct xnn_f16_scaleminmax_params, xnn_init_f16_scaleminmax_scalar_params) #endif // XNN_ENABLE_F16C && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + +#if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR +XNN_UKERNEL(xnn_arch_riscv_vector_fp16_arith, xnn_f16_avgpool_minmax_ukernel_9p__rvvfp16arith_u2v, (2*xnn_init_hardware_config()->vlenb/sizeof(xnn_float16)), 9, xnn_float16, struct xnn_f16_scaleminmax_params, xnn_init_f16_scaleminmax_scalar_params) +#endif // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR diff --git a/src/f16-avgpool/gen/f16-avgpool-9p-minmax-rvvfp16arith-u2v.c b/src/f16-avgpool/gen/f16-avgpool-9p-minmax-rvvfp16arith-u2v.c new file mode 100644 index 00000000000..57b68030e69 --- /dev/null +++ b/src/f16-avgpool/gen/f16-avgpool-9p-minmax-rvvfp16arith-u2v.c @@ -0,0 +1,216 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f32-avgpool/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2026 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include "src/xnnpack/maxpool.h" +#include + + +void xnn_f16_avgpool_minmax_ukernel_9p__rvvfp16arith_u2v( + size_t output_pixels, + size_t kernel_elements, + size_t channels, + const xnn_float16** input, + size_t input_offset, + size_t input_pixel_stride, + const xnn_float16* zero, + const xnn_float16* multiplier, + xnn_float16* output, + size_t input_increment, + size_t output_increment, + const struct xnn_f16_scaleminmax_params* restrict params) +{ + assert(output_pixels != 0); + assert(kernel_elements != 0); + assert(channels != 0); + + const xnn_float16 min = params->scalar.min; + const xnn_float16 max = params->scalar.max; + xnn_float16 scale = params->scalar.scale; + + do { + // Start with the previous output as the zero buffer. + const xnn_float16* prev_output = zero; + + const xnn_float16** i = input; + + // Passes 0 - n-1: load the output, add 9 inputs. + size_t k = kernel_elements; + for (; k > 9; k -= 9) { + const xnn_float16* i0 = *i++; + assert(i0 != NULL); + if XNN_UNPREDICTABLE(i0 != zero) { + i0 = (const xnn_float16*) ((uintptr_t) i0 + input_offset); + } + const xnn_float16* i1 = *i++; + assert(i1 != NULL); + if XNN_UNPREDICTABLE(i1 != zero) { + i1 = (const xnn_float16*) ((uintptr_t) i1 + input_offset); + } + const xnn_float16* i2 = *i++; + assert(i2 != NULL); + if XNN_UNPREDICTABLE(i2 != zero) { + i2 = (const xnn_float16*) ((uintptr_t) i2 + input_offset); + } + const xnn_float16* i3 = *i++; + assert(i3 != NULL); + if XNN_UNPREDICTABLE(i3 != zero) { + i3 = (const xnn_float16*) ((uintptr_t) i3 + input_offset); + } + const xnn_float16* i4 = *i++; + assert(i4 != NULL); + if XNN_UNPREDICTABLE(i4 != zero) { + i4 = (const xnn_float16*) ((uintptr_t) i4 + input_offset); + } + const xnn_float16* i5 = *i++; + assert(i5 != NULL); + if XNN_UNPREDICTABLE(i5 != zero) { + i5 = (const xnn_float16*) ((uintptr_t) i5 + input_offset); + } + const xnn_float16* i6 = *i++; + assert(i6 != NULL); + if XNN_UNPREDICTABLE(i6 != zero) { + i6 = (const xnn_float16*) ((uintptr_t) i6 + input_offset); + } + const xnn_float16* i7 = *i++; + assert(i7 != NULL); + if XNN_UNPREDICTABLE(i7 != zero) { + i7 = (const xnn_float16*) ((uintptr_t) i7 + input_offset); + } + const xnn_float16* i8 = *i++; + assert(i8 != NULL); + if XNN_UNPREDICTABLE(i8 != zero) { + i8 = (const xnn_float16*) ((uintptr_t) i8 + input_offset); + } + + xnn_float16* o = output; + size_t c = channels; + do { + size_t vl = __riscv_vsetvl_e16m2(c); + vfloat16m2_t vi0 = __riscv_vle16_v_f16m2(i0, vl); i0 += vl; + vfloat16m2_t vi1 = __riscv_vle16_v_f16m2(i1, vl); i1 += vl; + vfloat16m2_t vi2 = __riscv_vle16_v_f16m2(i2, vl); i2 += vl; + vfloat16m2_t vi3 = __riscv_vle16_v_f16m2(i3, vl); i3 += vl; + vfloat16m2_t vi4 = __riscv_vle16_v_f16m2(i4, vl); i4 += vl; + vfloat16m2_t vi5 = __riscv_vle16_v_f16m2(i5, vl); i5 += vl; + vfloat16m2_t vi6 = __riscv_vle16_v_f16m2(i6, vl); i6 += vl; + vfloat16m2_t vi7 = __riscv_vle16_v_f16m2(i7, vl); i7 += vl; + vfloat16m2_t vi8 = __riscv_vle16_v_f16m2(i8, vl); i8 += vl; + vfloat16m2_t vprev = __riscv_vle16_v_f16m2(prev_output, vl); prev_output += vl; + + vfloat16m2_t vsum01 = __riscv_vfadd(vi0, vi1, vl); + vfloat16m2_t vsum23 = __riscv_vfadd(vi2, vi3, vl); + vfloat16m2_t vsum45 = __riscv_vfadd(vi4, vi5, vl); + vfloat16m2_t vsum67 = __riscv_vfadd(vi6, vi7, vl); + vfloat16m2_t vsum018 = __riscv_vfadd(vsum01, vi8, vl); + + vfloat16m2_t vsum2345 = __riscv_vfadd(vsum23, vsum45, vl); + vfloat16m2_t vsum01678 = __riscv_vfadd(vsum67, vsum018, vl); + vfloat16m2_t vsum012345678 = __riscv_vfadd(vsum2345, vsum01678, vl); + vfloat16m2_t vacc = __riscv_vfadd(vprev, vsum012345678, vl); + __riscv_vse16_v_f16m2(o, vacc, vl); o += vl; + + c -= vl; + } while (c != 0); + + // Subsequent passes read from the previous output. + prev_output = output; + } + + // Final pass: load the output, add remaining kernel elements, apply scaling/min/max + const xnn_float16* i0 = 0 < k ? *i++ : zero; + assert(i0 != NULL); + if XNN_UNPREDICTABLE(i0 != zero) { + i0 = (const xnn_float16*) ((uintptr_t) i0 + input_offset); + } + const xnn_float16* i1 = 1 < k ? *i++ : zero; + assert(i1 != NULL); + if XNN_UNPREDICTABLE(i1 != zero) { + i1 = (const xnn_float16*) ((uintptr_t) i1 + input_offset); + } + const xnn_float16* i2 = 2 < k ? *i++ : zero; + assert(i2 != NULL); + if XNN_UNPREDICTABLE(i2 != zero) { + i2 = (const xnn_float16*) ((uintptr_t) i2 + input_offset); + } + const xnn_float16* i3 = 3 < k ? *i++ : zero; + assert(i3 != NULL); + if XNN_UNPREDICTABLE(i3 != zero) { + i3 = (const xnn_float16*) ((uintptr_t) i3 + input_offset); + } + const xnn_float16* i4 = 4 < k ? *i++ : zero; + assert(i4 != NULL); + if XNN_UNPREDICTABLE(i4 != zero) { + i4 = (const xnn_float16*) ((uintptr_t) i4 + input_offset); + } + const xnn_float16* i5 = 5 < k ? *i++ : zero; + assert(i5 != NULL); + if XNN_UNPREDICTABLE(i5 != zero) { + i5 = (const xnn_float16*) ((uintptr_t) i5 + input_offset); + } + const xnn_float16* i6 = 6 < k ? *i++ : zero; + assert(i6 != NULL); + if XNN_UNPREDICTABLE(i6 != zero) { + i6 = (const xnn_float16*) ((uintptr_t) i6 + input_offset); + } + const xnn_float16* i7 = 7 < k ? *i++ : zero; + assert(i7 != NULL); + if XNN_UNPREDICTABLE(i7 != zero) { + i7 = (const xnn_float16*) ((uintptr_t) i7 + input_offset); + } + const xnn_float16* i8 = 8 < k ? *i++ : zero; + assert(i8 != NULL); + if XNN_UNPREDICTABLE(i8 != zero) { + i8 = (const xnn_float16*) ((uintptr_t) i8 + input_offset); + } + + if (multiplier != NULL) { + scale = *multiplier++; + } + xnn_float16* o = output; + size_t c = channels; + do { + size_t vl = __riscv_vsetvl_e16m2(c); + vfloat16m2_t vi0 = __riscv_vle16_v_f16m2(i0, vl); i0 += vl; + vfloat16m2_t vi1 = __riscv_vle16_v_f16m2(i1, vl); i1 += vl; + vfloat16m2_t vi2 = __riscv_vle16_v_f16m2(i2, vl); i2 += vl; + vfloat16m2_t vi3 = __riscv_vle16_v_f16m2(i3, vl); i3 += vl; + vfloat16m2_t vi4 = __riscv_vle16_v_f16m2(i4, vl); i4 += vl; + vfloat16m2_t vi5 = __riscv_vle16_v_f16m2(i5, vl); i5 += vl; + vfloat16m2_t vi6 = __riscv_vle16_v_f16m2(i6, vl); i6 += vl; + vfloat16m2_t vi7 = __riscv_vle16_v_f16m2(i7, vl); i7 += vl; + vfloat16m2_t vi8 = __riscv_vle16_v_f16m2(i8, vl); i8 += vl; + vfloat16m2_t vprev = __riscv_vle16_v_f16m2(prev_output, vl); prev_output += vl; + + vfloat16m2_t vsum01 = __riscv_vfadd(vi0, vi1, vl); + vfloat16m2_t vsum23 = __riscv_vfadd(vi2, vi3, vl); + vfloat16m2_t vsum45 = __riscv_vfadd(vi4, vi5, vl); + vfloat16m2_t vsum67 = __riscv_vfadd(vi6, vi7, vl); + vfloat16m2_t vsum018 = __riscv_vfadd(vsum01, vi8, vl); + + vfloat16m2_t vsum2345 = __riscv_vfadd(vsum23, vsum45, vl); + vfloat16m2_t vsum01678 = __riscv_vfadd(vsum67, vsum018, vl); + vfloat16m2_t vsum012345678 = __riscv_vfadd(vsum2345, vsum01678, vl); + vfloat16m2_t vacc = __riscv_vfadd(vprev, vsum012345678, vl); + + vacc = __riscv_vfmul(vacc, scale, vl); + vacc = __riscv_vfmax(vacc, min, vl); + vacc = __riscv_vfmin(vacc, max, vl); + + __riscv_vse16_v_f16m2(o, vacc, vl); o += vl; + + c -= vl; + } while (c != 0); + + input = (const xnn_float16**) ((uintptr_t) input + input_increment); + input_offset += input_pixel_stride; + output = (xnn_float16*) ((uintptr_t) output + output_increment); + } while (--output_pixels != 0); +} diff --git a/src/f16-maxpool/f16-maxpool-minmax.inc b/src/f16-maxpool/f16-maxpool-minmax.inc index ebc7a65487c..a954de9bb4b 100644 --- a/src/f16-maxpool/f16-maxpool-minmax.inc +++ b/src/f16-maxpool/f16-maxpool-minmax.inc @@ -18,6 +18,5 @@ XNN_UKERNEL(xnn_arch_x86_avx2, xnn_f16_maxpool_minmax_ukernel_9p__avx2_u16, 16, #endif // XNN_ENABLE_AVX2 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) #if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR -XNN_UKERNEL(xnn_arch_riscv_vector_fp16_arith, xnn_f16_maxpool_minmax_ukernel_9p__rvvfp16arith_u1v, (1*xnn_init_hardware_config()->vlenb/sizeof(xnn_float16)), 9, xnn_float16, struct xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) XNN_UKERNEL(xnn_arch_riscv_vector_fp16_arith, xnn_f16_maxpool_minmax_ukernel_9p__rvvfp16arith_u2v, (2*xnn_init_hardware_config()->vlenb/sizeof(xnn_float16)), 9, xnn_float16, struct xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) #endif // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR diff --git a/src/f16-maxpool/gen/f16-maxpool-9p-minmax-rvvfp16arith-u1v.c b/src/f16-maxpool/gen/f16-maxpool-9p-minmax-rvvfp16arith-u1v.c deleted file mode 100644 index 266b3701f49..00000000000 --- a/src/f16-maxpool/gen/f16-maxpool-9p-minmax-rvvfp16arith-u1v.c +++ /dev/null @@ -1,147 +0,0 @@ -// clang-format off -// Auto-generated file. Do not edit! -// Template: src/f32-maxpool/rvv.c.in -// Generator: tools/xngen -// -// Copyright 2026 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include "src/xnnpack/maxpool.h" -#include - - -void xnn_f16_maxpool_minmax_ukernel_9p__rvvfp16arith_u1v( - size_t output_pixels, - size_t kernel_elements, - size_t channels, - const xnn_float16** input, - size_t input_offset, - size_t input_pixel_stride, - xnn_float16* output, - size_t input_increment, - size_t output_increment, - const struct xnn_f16_minmax_params* restrict params) -{ - assert(output_pixels != 0); - assert(kernel_elements != 0); - assert(channels != 0); - - const xnn_float16 output_min = params->scalar.min; - const xnn_float16 output_max = params->scalar.max; - do { - const xnn_float16** i = input; - - // First pass: load the inputs, store the max pool in the output. - const xnn_float16* i0 = *i++; - const xnn_float16* i1 = 1 < kernel_elements ? *i++ : i0; - const xnn_float16* i2 = 2 < kernel_elements ? *i++ : i0; - const xnn_float16* i3 = 3 < kernel_elements ? *i++ : i0; - const xnn_float16* i4 = 4 < kernel_elements ? *i++ : i0; - const xnn_float16* i5 = 5 < kernel_elements ? *i++ : i0; - const xnn_float16* i6 = 6 < kernel_elements ? *i++ : i0; - const xnn_float16* i7 = 7 < kernel_elements ? *i++ : i0; - const xnn_float16* i8 = 8 < kernel_elements ? *i++ : i0; - i0 = (const xnn_float16*) ((uintptr_t) i0 + input_offset); - i1 = (const xnn_float16*) ((uintptr_t) i1 + input_offset); - i2 = (const xnn_float16*) ((uintptr_t) i2 + input_offset); - i3 = (const xnn_float16*) ((uintptr_t) i3 + input_offset); - i4 = (const xnn_float16*) ((uintptr_t) i4 + input_offset); - i5 = (const xnn_float16*) ((uintptr_t) i5 + input_offset); - i6 = (const xnn_float16*) ((uintptr_t) i6 + input_offset); - i7 = (const xnn_float16*) ((uintptr_t) i7 + input_offset); - i8 = (const xnn_float16*) ((uintptr_t) i8 + input_offset); - - xnn_float16* o = output; - size_t c = channels; - do { - size_t vl = __riscv_vsetvl_e16m1(c); - vfloat16m1_t vi0 = __riscv_vle16_v_f16m1(i0, vl); i0 += vl; - vfloat16m1_t vi1 = __riscv_vle16_v_f16m1(i1, vl); i1 += vl; - vfloat16m1_t vi2 = __riscv_vle16_v_f16m1(i2, vl); i2 += vl; - vfloat16m1_t vi3 = __riscv_vle16_v_f16m1(i3, vl); i3 += vl; - vfloat16m1_t vi4 = __riscv_vle16_v_f16m1(i4, vl); i4 += vl; - vfloat16m1_t vi5 = __riscv_vle16_v_f16m1(i5, vl); i5 += vl; - vfloat16m1_t vi6 = __riscv_vle16_v_f16m1(i6, vl); i6 += vl; - vfloat16m1_t vi7 = __riscv_vle16_v_f16m1(i7, vl); i7 += vl; - vfloat16m1_t vi8 = __riscv_vle16_v_f16m1(i8, vl); i8 += vl; - - vfloat16m1_t vmax01 = __riscv_vfmax(vi0, vi1, vl); - vfloat16m1_t vmax23 = __riscv_vfmax(vi2, vi3, vl); - vfloat16m1_t vmax45 = __riscv_vfmax(vi4, vi5, vl); - vfloat16m1_t vmax67 = __riscv_vfmax(vi6, vi7, vl); - vfloat16m1_t vmax018 = __riscv_vfmax(vmax01, vi8, vl); - - vfloat16m1_t vmax2345 = __riscv_vfmax(vmax23, vmax45, vl); - vfloat16m1_t vmax01678 = __riscv_vfmax(vmax67, vmax018, vl); - vfloat16m1_t vacc = __riscv_vfmax(vmax2345, vmax01678, vl); - - vacc = __riscv_vfmax(vacc, output_min, vl); - vacc = __riscv_vfmin(vacc, output_max, vl); - __riscv_vse16_v_f16m1(o, vacc, vl); o += vl; - - c -= vl; - } while (c != 0); - - // Passes 1 - n: Max more inputs to the output. - for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 9) { - const xnn_float16* i0 = *i++; - const xnn_float16* i1 = 1 < k ? *i++ : i0; - const xnn_float16* i2 = 2 < k ? *i++ : i0; - const xnn_float16* i3 = 3 < k ? *i++ : i0; - const xnn_float16* i4 = 4 < k ? *i++ : i0; - const xnn_float16* i5 = 5 < k ? *i++ : i0; - const xnn_float16* i6 = 6 < k ? *i++ : i0; - const xnn_float16* i7 = 7 < k ? *i++ : i0; - const xnn_float16* i8 = 8 < k ? *i++ : i0; - i0 = (const xnn_float16*) ((uintptr_t) i0 + input_offset); - i1 = (const xnn_float16*) ((uintptr_t) i1 + input_offset); - i2 = (const xnn_float16*) ((uintptr_t) i2 + input_offset); - i3 = (const xnn_float16*) ((uintptr_t) i3 + input_offset); - i4 = (const xnn_float16*) ((uintptr_t) i4 + input_offset); - i5 = (const xnn_float16*) ((uintptr_t) i5 + input_offset); - i6 = (const xnn_float16*) ((uintptr_t) i6 + input_offset); - i7 = (const xnn_float16*) ((uintptr_t) i7 + input_offset); - i8 = (const xnn_float16*) ((uintptr_t) i8 + input_offset); - - o = output; - size_t c = channels; - do { - size_t vl = __riscv_vsetvl_e16m1(c); - - vfloat16m1_t vi0 = __riscv_vle16_v_f16m1(i0, vl); i0 += vl; - vfloat16m1_t vi1 = __riscv_vle16_v_f16m1(i1, vl); i1 += vl; - vfloat16m1_t vi2 = __riscv_vle16_v_f16m1(i2, vl); i2 += vl; - vfloat16m1_t vi3 = __riscv_vle16_v_f16m1(i3, vl); i3 += vl; - vfloat16m1_t vi4 = __riscv_vle16_v_f16m1(i4, vl); i4 += vl; - vfloat16m1_t vi5 = __riscv_vle16_v_f16m1(i5, vl); i5 += vl; - vfloat16m1_t vi6 = __riscv_vle16_v_f16m1(i6, vl); i6 += vl; - vfloat16m1_t vi7 = __riscv_vle16_v_f16m1(i7, vl); i7 += vl; - vfloat16m1_t vi8 = __riscv_vle16_v_f16m1(i8, vl); i8 += vl; - - vfloat16m1_t vprev = __riscv_vle16_v_f16m1(o, vl); - - vfloat16m1_t vmax01 = __riscv_vfmax(vi0, vi1, vl); - vfloat16m1_t vmax23 = __riscv_vfmax(vi2, vi3, vl); - vfloat16m1_t vmax45 = __riscv_vfmax(vi4, vi5, vl); - vfloat16m1_t vmax67 = __riscv_vfmax(vi6, vi7, vl); - vfloat16m1_t vmax018 = __riscv_vfmax(vmax01, vi8, vl); - - vfloat16m1_t vmax2345 = __riscv_vfmax(vmax23, vmax45, vl); - vfloat16m1_t vmax01678 = __riscv_vfmax(vmax67, vmax018, vl); - vfloat16m1_t vmax012345678 = __riscv_vfmax(vmax2345, vmax01678, vl); - - vfloat16m1_t vacc = __riscv_vfmax(vprev, vmax012345678, vl); - vacc = __riscv_vfmin(vacc, output_max, vl); - __riscv_vse16_v_f16m1(o, vacc, vl); o += vl; - - c -= vl; - } while (c != 0); - } - input = (const xnn_float16**) ((uintptr_t) input + input_increment); - input_offset += input_pixel_stride; - output = (xnn_float16*) ((uintptr_t) output + output_increment); - } while (--output_pixels != 0); -} diff --git a/src/f32-avgpool/f32-avgpool-minmax.inc b/src/f32-avgpool/f32-avgpool-minmax.inc index e530e870b4b..534eb3c83ca 100644 --- a/src/f32-avgpool/f32-avgpool-minmax.inc +++ b/src/f32-avgpool/f32-avgpool-minmax.inc @@ -28,4 +28,8 @@ XNN_UKERNEL(xnn_arch_none, xnn_f32_avgpool_minmax_ukernel_9p__wasmsimd_u4, 4, 9, XNN_UKERNEL(xnn_arch_hvx, xnn_f32_avgpool_minmax_ukernel_9p__hvx_u32, 32, 9, float, struct xnn_f32_scaleminmax_params, xnn_init_f32_scaleminmax_scalar_params) #endif // XNN_ENABLE_HVX && XNN_ARCH_HEXAGON +#if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR +XNN_UKERNEL(xnn_arch_riscv_vector, xnn_f32_avgpool_minmax_ukernel_9p__rvv_u2v, (2*xnn_init_hardware_config()->vlenb/sizeof(float)), 9, float, struct xnn_f32_scaleminmax_params, xnn_init_f32_scaleminmax_scalar_params) +#endif // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR + XNN_UKERNEL(xnn_arch_none, xnn_f32_avgpool_minmax_ukernel_9p__scalar_u1, 1, 9, float, struct xnn_f32_scaleminmax_params, xnn_init_f32_scaleminmax_scalar_params) diff --git a/src/f32-avgpool/gen/f32-avgpool-9p-minmax-rvv-u2v.c b/src/f32-avgpool/gen/f32-avgpool-9p-minmax-rvv-u2v.c new file mode 100644 index 00000000000..fd032a5d64a --- /dev/null +++ b/src/f32-avgpool/gen/f32-avgpool-9p-minmax-rvv-u2v.c @@ -0,0 +1,216 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f32-avgpool/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2026 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include "src/xnnpack/maxpool.h" +#include + + +void xnn_f32_avgpool_minmax_ukernel_9p__rvv_u2v( + size_t output_pixels, + size_t kernel_elements, + size_t channels, + const float** input, + size_t input_offset, + size_t input_pixel_stride, + const float* zero, + const float* multiplier, + float* output, + size_t input_increment, + size_t output_increment, + const struct xnn_f32_scaleminmax_params* restrict params) +{ + assert(output_pixels != 0); + assert(kernel_elements != 0); + assert(channels != 0); + + const float min = params->scalar.min; + const float max = params->scalar.max; + float scale = params->scalar.scale; + + do { + // Start with the previous output as the zero buffer. + const float* prev_output = zero; + + const float** i = input; + + // Passes 0 - n-1: load the output, add 9 inputs. + size_t k = kernel_elements; + for (; k > 9; k -= 9) { + const float* i0 = *i++; + assert(i0 != NULL); + if XNN_UNPREDICTABLE(i0 != zero) { + i0 = (const float*) ((uintptr_t) i0 + input_offset); + } + const float* i1 = *i++; + assert(i1 != NULL); + if XNN_UNPREDICTABLE(i1 != zero) { + i1 = (const float*) ((uintptr_t) i1 + input_offset); + } + const float* i2 = *i++; + assert(i2 != NULL); + if XNN_UNPREDICTABLE(i2 != zero) { + i2 = (const float*) ((uintptr_t) i2 + input_offset); + } + const float* i3 = *i++; + assert(i3 != NULL); + if XNN_UNPREDICTABLE(i3 != zero) { + i3 = (const float*) ((uintptr_t) i3 + input_offset); + } + const float* i4 = *i++; + assert(i4 != NULL); + if XNN_UNPREDICTABLE(i4 != zero) { + i4 = (const float*) ((uintptr_t) i4 + input_offset); + } + const float* i5 = *i++; + assert(i5 != NULL); + if XNN_UNPREDICTABLE(i5 != zero) { + i5 = (const float*) ((uintptr_t) i5 + input_offset); + } + const float* i6 = *i++; + assert(i6 != NULL); + if XNN_UNPREDICTABLE(i6 != zero) { + i6 = (const float*) ((uintptr_t) i6 + input_offset); + } + const float* i7 = *i++; + assert(i7 != NULL); + if XNN_UNPREDICTABLE(i7 != zero) { + i7 = (const float*) ((uintptr_t) i7 + input_offset); + } + const float* i8 = *i++; + assert(i8 != NULL); + if XNN_UNPREDICTABLE(i8 != zero) { + i8 = (const float*) ((uintptr_t) i8 + input_offset); + } + + float* o = output; + size_t c = channels; + do { + size_t vl = __riscv_vsetvl_e32m2(c); + vfloat32m2_t vi0 = __riscv_vle32_v_f32m2(i0, vl); i0 += vl; + vfloat32m2_t vi1 = __riscv_vle32_v_f32m2(i1, vl); i1 += vl; + vfloat32m2_t vi2 = __riscv_vle32_v_f32m2(i2, vl); i2 += vl; + vfloat32m2_t vi3 = __riscv_vle32_v_f32m2(i3, vl); i3 += vl; + vfloat32m2_t vi4 = __riscv_vle32_v_f32m2(i4, vl); i4 += vl; + vfloat32m2_t vi5 = __riscv_vle32_v_f32m2(i5, vl); i5 += vl; + vfloat32m2_t vi6 = __riscv_vle32_v_f32m2(i6, vl); i6 += vl; + vfloat32m2_t vi7 = __riscv_vle32_v_f32m2(i7, vl); i7 += vl; + vfloat32m2_t vi8 = __riscv_vle32_v_f32m2(i8, vl); i8 += vl; + vfloat32m2_t vprev = __riscv_vle32_v_f32m2(prev_output, vl); prev_output += vl; + + vfloat32m2_t vsum01 = __riscv_vfadd(vi0, vi1, vl); + vfloat32m2_t vsum23 = __riscv_vfadd(vi2, vi3, vl); + vfloat32m2_t vsum45 = __riscv_vfadd(vi4, vi5, vl); + vfloat32m2_t vsum67 = __riscv_vfadd(vi6, vi7, vl); + vfloat32m2_t vsum018 = __riscv_vfadd(vsum01, vi8, vl); + + vfloat32m2_t vsum2345 = __riscv_vfadd(vsum23, vsum45, vl); + vfloat32m2_t vsum01678 = __riscv_vfadd(vsum67, vsum018, vl); + vfloat32m2_t vsum012345678 = __riscv_vfadd(vsum2345, vsum01678, vl); + vfloat32m2_t vacc = __riscv_vfadd(vprev, vsum012345678, vl); + __riscv_vse32_v_f32m2(o, vacc, vl); o += vl; + + c -= vl; + } while (c != 0); + + // Subsequent passes read from the previous output. + prev_output = output; + } + + // Final pass: load the output, add remaining kernel elements, apply scaling/min/max + const float* i0 = 0 < k ? *i++ : zero; + assert(i0 != NULL); + if XNN_UNPREDICTABLE(i0 != zero) { + i0 = (const float*) ((uintptr_t) i0 + input_offset); + } + const float* i1 = 1 < k ? *i++ : zero; + assert(i1 != NULL); + if XNN_UNPREDICTABLE(i1 != zero) { + i1 = (const float*) ((uintptr_t) i1 + input_offset); + } + const float* i2 = 2 < k ? *i++ : zero; + assert(i2 != NULL); + if XNN_UNPREDICTABLE(i2 != zero) { + i2 = (const float*) ((uintptr_t) i2 + input_offset); + } + const float* i3 = 3 < k ? *i++ : zero; + assert(i3 != NULL); + if XNN_UNPREDICTABLE(i3 != zero) { + i3 = (const float*) ((uintptr_t) i3 + input_offset); + } + const float* i4 = 4 < k ? *i++ : zero; + assert(i4 != NULL); + if XNN_UNPREDICTABLE(i4 != zero) { + i4 = (const float*) ((uintptr_t) i4 + input_offset); + } + const float* i5 = 5 < k ? *i++ : zero; + assert(i5 != NULL); + if XNN_UNPREDICTABLE(i5 != zero) { + i5 = (const float*) ((uintptr_t) i5 + input_offset); + } + const float* i6 = 6 < k ? *i++ : zero; + assert(i6 != NULL); + if XNN_UNPREDICTABLE(i6 != zero) { + i6 = (const float*) ((uintptr_t) i6 + input_offset); + } + const float* i7 = 7 < k ? *i++ : zero; + assert(i7 != NULL); + if XNN_UNPREDICTABLE(i7 != zero) { + i7 = (const float*) ((uintptr_t) i7 + input_offset); + } + const float* i8 = 8 < k ? *i++ : zero; + assert(i8 != NULL); + if XNN_UNPREDICTABLE(i8 != zero) { + i8 = (const float*) ((uintptr_t) i8 + input_offset); + } + + if (multiplier != NULL) { + scale = *multiplier++; + } + float* o = output; + size_t c = channels; + do { + size_t vl = __riscv_vsetvl_e32m2(c); + vfloat32m2_t vi0 = __riscv_vle32_v_f32m2(i0, vl); i0 += vl; + vfloat32m2_t vi1 = __riscv_vle32_v_f32m2(i1, vl); i1 += vl; + vfloat32m2_t vi2 = __riscv_vle32_v_f32m2(i2, vl); i2 += vl; + vfloat32m2_t vi3 = __riscv_vle32_v_f32m2(i3, vl); i3 += vl; + vfloat32m2_t vi4 = __riscv_vle32_v_f32m2(i4, vl); i4 += vl; + vfloat32m2_t vi5 = __riscv_vle32_v_f32m2(i5, vl); i5 += vl; + vfloat32m2_t vi6 = __riscv_vle32_v_f32m2(i6, vl); i6 += vl; + vfloat32m2_t vi7 = __riscv_vle32_v_f32m2(i7, vl); i7 += vl; + vfloat32m2_t vi8 = __riscv_vle32_v_f32m2(i8, vl); i8 += vl; + vfloat32m2_t vprev = __riscv_vle32_v_f32m2(prev_output, vl); prev_output += vl; + + vfloat32m2_t vsum01 = __riscv_vfadd(vi0, vi1, vl); + vfloat32m2_t vsum23 = __riscv_vfadd(vi2, vi3, vl); + vfloat32m2_t vsum45 = __riscv_vfadd(vi4, vi5, vl); + vfloat32m2_t vsum67 = __riscv_vfadd(vi6, vi7, vl); + vfloat32m2_t vsum018 = __riscv_vfadd(vsum01, vi8, vl); + + vfloat32m2_t vsum2345 = __riscv_vfadd(vsum23, vsum45, vl); + vfloat32m2_t vsum01678 = __riscv_vfadd(vsum67, vsum018, vl); + vfloat32m2_t vsum012345678 = __riscv_vfadd(vsum2345, vsum01678, vl); + vfloat32m2_t vacc = __riscv_vfadd(vprev, vsum012345678, vl); + + vacc = __riscv_vfmul(vacc, scale, vl); + vacc = __riscv_vfmax(vacc, min, vl); + vacc = __riscv_vfmin(vacc, max, vl); + + __riscv_vse32_v_f32m2(o, vacc, vl); o += vl; + + c -= vl; + } while (c != 0); + + input = (const float**) ((uintptr_t) input + input_increment); + input_offset += input_pixel_stride; + output = (float*) ((uintptr_t) output + output_increment); + } while (--output_pixels != 0); +} diff --git a/src/f32-avgpool/rvv.c.in b/src/f32-avgpool/rvv.c.in new file mode 100644 index 00000000000..2fd67ee4055 --- /dev/null +++ b/src/f32-avgpool/rvv.c.in @@ -0,0 +1,127 @@ +// Copyright 2026 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +$assert LMUL in [1, 2, 4, 8] +$assert(DATATYPE in ["f32", "f16"]) +#include +#include "src/xnnpack/maxpool.h" +#include + +$CTYPE = {"f32": "float", "f16": "xnn_float16"}[DATATYPE] +$VTYPE = {"f32": "vfloat32", "f16": "vfloat16"}[DATATYPE] +$VLOAD = {"f32": "__riscv_vle32_v_f32", "f16": "__riscv_vle16_v_f16"}[DATATYPE] +$VSTORE = {"f32": "__riscv_vse32_v_f32", "f16": "__riscv_vse16_v_f16"}[DATATYPE] +$VSETVL = {"f32": "__riscv_vsetvl_e32", "f16": "__riscv_vsetvl_e16"}[DATATYPE] +$ISA = "fp16arith" if DATATYPE == "f16" else "" + +void xnn_${DATATYPE}_avgpool_minmax_ukernel_9p__rvv${ISA}_u${LMUL}v( + size_t output_pixels, + size_t kernel_elements, + size_t channels, + const ${CTYPE}** input, + size_t input_offset, + size_t input_pixel_stride, + const ${CTYPE}* zero, + const ${CTYPE}* multiplier, + ${CTYPE}* output, + size_t input_increment, + size_t output_increment, + const struct xnn_${DATATYPE}_scaleminmax_params* restrict params) +{ + assert(output_pixels != 0); + assert(kernel_elements != 0); + assert(channels != 0); + + const ${CTYPE} min = params->scalar.min; + const ${CTYPE} max = params->scalar.max; + ${CTYPE} scale = params->scalar.scale; + + do { + // Start with the previous output as the zero buffer. + const ${CTYPE}* prev_output = zero; + + const ${CTYPE}** i = input; + + // Passes 0 - n-1: load the output, add 9 inputs. + size_t k = kernel_elements; + for (; k > 9; k -= 9) { + $for K in range(9): + const ${CTYPE}* i${K} = *i++; + assert(i${K} != NULL); + if XNN_UNPREDICTABLE(i${K} != zero) { + i${K} = (const ${CTYPE}*) ((uintptr_t) i${K} + input_offset); + } + + ${CTYPE}* o = output; + size_t c = channels; + do { + size_t vl = ${VSETVL}m${LMUL}(c); + $for K in range(9): + ${VTYPE}m${LMUL}_t vi${K} = ${VLOAD}m${LMUL}(i${K}, vl); i${K} += vl; + ${VTYPE}m${LMUL}_t vprev = ${VLOAD}m${LMUL}(prev_output, vl); prev_output += vl; + + ${VTYPE}m${LMUL}_t vsum01 = __riscv_vfadd(vi0, vi1, vl); + ${VTYPE}m${LMUL}_t vsum23 = __riscv_vfadd(vi2, vi3, vl); + ${VTYPE}m${LMUL}_t vsum45 = __riscv_vfadd(vi4, vi5, vl); + ${VTYPE}m${LMUL}_t vsum67 = __riscv_vfadd(vi6, vi7, vl); + ${VTYPE}m${LMUL}_t vsum018 = __riscv_vfadd(vsum01, vi8, vl); + + ${VTYPE}m${LMUL}_t vsum2345 = __riscv_vfadd(vsum23, vsum45, vl); + ${VTYPE}m${LMUL}_t vsum01678 = __riscv_vfadd(vsum67, vsum018, vl); + ${VTYPE}m${LMUL}_t vsum012345678 = __riscv_vfadd(vsum2345, vsum01678, vl); + ${VTYPE}m${LMUL}_t vacc = __riscv_vfadd(vprev, vsum012345678, vl); + ${VSTORE}m${LMUL}(o, vacc, vl); o += vl; + + c -= vl; + } while (c != 0); + + // Subsequent passes read from the previous output. + prev_output = output; + } + + // Final pass: load the output, add remaining kernel elements, apply scaling/min/max + $for K in range(9): + const ${CTYPE}* i${K} = ${K} < k ? *i++ : zero; + assert(i${K} != NULL); + if XNN_UNPREDICTABLE(i${K} != zero) { + i${K} = (const ${CTYPE}*) ((uintptr_t) i${K} + input_offset); + } + + if (multiplier != NULL) { + scale = *multiplier++; + } + ${CTYPE}* o = output; + size_t c = channels; + do { + size_t vl = ${VSETVL}m${LMUL}(c); + $for K in range(9): + ${VTYPE}m${LMUL}_t vi${K} = ${VLOAD}m${LMUL}(i${K}, vl); i${K} += vl; + ${VTYPE}m${LMUL}_t vprev = ${VLOAD}m${LMUL}(prev_output, vl); prev_output += vl; + + ${VTYPE}m${LMUL}_t vsum01 = __riscv_vfadd(vi0, vi1, vl); + ${VTYPE}m${LMUL}_t vsum23 = __riscv_vfadd(vi2, vi3, vl); + ${VTYPE}m${LMUL}_t vsum45 = __riscv_vfadd(vi4, vi5, vl); + ${VTYPE}m${LMUL}_t vsum67 = __riscv_vfadd(vi6, vi7, vl); + ${VTYPE}m${LMUL}_t vsum018 = __riscv_vfadd(vsum01, vi8, vl); + + ${VTYPE}m${LMUL}_t vsum2345 = __riscv_vfadd(vsum23, vsum45, vl); + ${VTYPE}m${LMUL}_t vsum01678 = __riscv_vfadd(vsum67, vsum018, vl); + ${VTYPE}m${LMUL}_t vsum012345678 = __riscv_vfadd(vsum2345, vsum01678, vl); + ${VTYPE}m${LMUL}_t vacc = __riscv_vfadd(vprev, vsum012345678, vl); + + vacc = __riscv_vfmul(vacc, scale, vl); + vacc = __riscv_vfmax(vacc, min, vl); + vacc = __riscv_vfmin(vacc, max, vl); + + ${VSTORE}m${LMUL}(o, vacc, vl); o += vl; + + c -= vl; + } while (c != 0); + + input = (const ${CTYPE}**) ((uintptr_t) input + input_increment); + input_offset += input_pixel_stride; + output = (${CTYPE}*) ((uintptr_t) output + output_increment); + } while (--output_pixels != 0); +} diff --git a/src/f32-maxpool/f32-maxpool-minmax.inc b/src/f32-maxpool/f32-maxpool-minmax.inc index d3434811e17..10bc3f2e5d7 100644 --- a/src/f32-maxpool/f32-maxpool-minmax.inc +++ b/src/f32-maxpool/f32-maxpool-minmax.inc @@ -18,7 +18,6 @@ XNN_UKERNEL(xnn_arch_none, xnn_f32_maxpool_minmax_ukernel_9p__wasmsimd_u4, 4, 9, #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD #if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR -XNN_UKERNEL(xnn_arch_riscv_vector, xnn_f32_maxpool_minmax_ukernel_9p__rvv_u1v, (1*xnn_init_hardware_config()->vlenb/sizeof(float)), 9, float, struct xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) XNN_UKERNEL(xnn_arch_riscv_vector, xnn_f32_maxpool_minmax_ukernel_9p__rvv_u2v, (2*xnn_init_hardware_config()->vlenb/sizeof(float)), 9, float, struct xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) #endif // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR diff --git a/src/f32-maxpool/gen/f32-maxpool-9p-minmax-rvv-u1v.c b/src/f32-maxpool/gen/f32-maxpool-9p-minmax-rvv-u1v.c deleted file mode 100644 index a4873239b0a..00000000000 --- a/src/f32-maxpool/gen/f32-maxpool-9p-minmax-rvv-u1v.c +++ /dev/null @@ -1,147 +0,0 @@ -// clang-format off -// Auto-generated file. Do not edit! -// Template: src/f32-maxpool/rvv.c.in -// Generator: tools/xngen -// -// Copyright 2026 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include "src/xnnpack/maxpool.h" -#include - - -void xnn_f32_maxpool_minmax_ukernel_9p__rvv_u1v( - size_t output_pixels, - size_t kernel_elements, - size_t channels, - const float** input, - size_t input_offset, - size_t input_pixel_stride, - float* output, - size_t input_increment, - size_t output_increment, - const struct xnn_f32_minmax_params* restrict params) -{ - assert(output_pixels != 0); - assert(kernel_elements != 0); - assert(channels != 0); - - const float output_min = params->scalar.min; - const float output_max = params->scalar.max; - do { - const float** i = input; - - // First pass: load the inputs, store the max pool in the output. - const float* i0 = *i++; - const float* i1 = 1 < kernel_elements ? *i++ : i0; - const float* i2 = 2 < kernel_elements ? *i++ : i0; - const float* i3 = 3 < kernel_elements ? *i++ : i0; - const float* i4 = 4 < kernel_elements ? *i++ : i0; - const float* i5 = 5 < kernel_elements ? *i++ : i0; - const float* i6 = 6 < kernel_elements ? *i++ : i0; - const float* i7 = 7 < kernel_elements ? *i++ : i0; - const float* i8 = 8 < kernel_elements ? *i++ : i0; - i0 = (const float*) ((uintptr_t) i0 + input_offset); - i1 = (const float*) ((uintptr_t) i1 + input_offset); - i2 = (const float*) ((uintptr_t) i2 + input_offset); - i3 = (const float*) ((uintptr_t) i3 + input_offset); - i4 = (const float*) ((uintptr_t) i4 + input_offset); - i5 = (const float*) ((uintptr_t) i5 + input_offset); - i6 = (const float*) ((uintptr_t) i6 + input_offset); - i7 = (const float*) ((uintptr_t) i7 + input_offset); - i8 = (const float*) ((uintptr_t) i8 + input_offset); - - float* o = output; - size_t c = channels; - do { - size_t vl = __riscv_vsetvl_e32m1(c); - vfloat32m1_t vi0 = __riscv_vle32_v_f32m1(i0, vl); i0 += vl; - vfloat32m1_t vi1 = __riscv_vle32_v_f32m1(i1, vl); i1 += vl; - vfloat32m1_t vi2 = __riscv_vle32_v_f32m1(i2, vl); i2 += vl; - vfloat32m1_t vi3 = __riscv_vle32_v_f32m1(i3, vl); i3 += vl; - vfloat32m1_t vi4 = __riscv_vle32_v_f32m1(i4, vl); i4 += vl; - vfloat32m1_t vi5 = __riscv_vle32_v_f32m1(i5, vl); i5 += vl; - vfloat32m1_t vi6 = __riscv_vle32_v_f32m1(i6, vl); i6 += vl; - vfloat32m1_t vi7 = __riscv_vle32_v_f32m1(i7, vl); i7 += vl; - vfloat32m1_t vi8 = __riscv_vle32_v_f32m1(i8, vl); i8 += vl; - - vfloat32m1_t vmax01 = __riscv_vfmax(vi0, vi1, vl); - vfloat32m1_t vmax23 = __riscv_vfmax(vi2, vi3, vl); - vfloat32m1_t vmax45 = __riscv_vfmax(vi4, vi5, vl); - vfloat32m1_t vmax67 = __riscv_vfmax(vi6, vi7, vl); - vfloat32m1_t vmax018 = __riscv_vfmax(vmax01, vi8, vl); - - vfloat32m1_t vmax2345 = __riscv_vfmax(vmax23, vmax45, vl); - vfloat32m1_t vmax01678 = __riscv_vfmax(vmax67, vmax018, vl); - vfloat32m1_t vacc = __riscv_vfmax(vmax2345, vmax01678, vl); - - vacc = __riscv_vfmax(vacc, output_min, vl); - vacc = __riscv_vfmin(vacc, output_max, vl); - __riscv_vse32_v_f32m1(o, vacc, vl); o += vl; - - c -= vl; - } while (c != 0); - - // Passes 1 - n: Max more inputs to the output. - for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 9) { - const float* i0 = *i++; - const float* i1 = 1 < k ? *i++ : i0; - const float* i2 = 2 < k ? *i++ : i0; - const float* i3 = 3 < k ? *i++ : i0; - const float* i4 = 4 < k ? *i++ : i0; - const float* i5 = 5 < k ? *i++ : i0; - const float* i6 = 6 < k ? *i++ : i0; - const float* i7 = 7 < k ? *i++ : i0; - const float* i8 = 8 < k ? *i++ : i0; - i0 = (const float*) ((uintptr_t) i0 + input_offset); - i1 = (const float*) ((uintptr_t) i1 + input_offset); - i2 = (const float*) ((uintptr_t) i2 + input_offset); - i3 = (const float*) ((uintptr_t) i3 + input_offset); - i4 = (const float*) ((uintptr_t) i4 + input_offset); - i5 = (const float*) ((uintptr_t) i5 + input_offset); - i6 = (const float*) ((uintptr_t) i6 + input_offset); - i7 = (const float*) ((uintptr_t) i7 + input_offset); - i8 = (const float*) ((uintptr_t) i8 + input_offset); - - o = output; - size_t c = channels; - do { - size_t vl = __riscv_vsetvl_e32m1(c); - - vfloat32m1_t vi0 = __riscv_vle32_v_f32m1(i0, vl); i0 += vl; - vfloat32m1_t vi1 = __riscv_vle32_v_f32m1(i1, vl); i1 += vl; - vfloat32m1_t vi2 = __riscv_vle32_v_f32m1(i2, vl); i2 += vl; - vfloat32m1_t vi3 = __riscv_vle32_v_f32m1(i3, vl); i3 += vl; - vfloat32m1_t vi4 = __riscv_vle32_v_f32m1(i4, vl); i4 += vl; - vfloat32m1_t vi5 = __riscv_vle32_v_f32m1(i5, vl); i5 += vl; - vfloat32m1_t vi6 = __riscv_vle32_v_f32m1(i6, vl); i6 += vl; - vfloat32m1_t vi7 = __riscv_vle32_v_f32m1(i7, vl); i7 += vl; - vfloat32m1_t vi8 = __riscv_vle32_v_f32m1(i8, vl); i8 += vl; - - vfloat32m1_t vprev = __riscv_vle32_v_f32m1(o, vl); - - vfloat32m1_t vmax01 = __riscv_vfmax(vi0, vi1, vl); - vfloat32m1_t vmax23 = __riscv_vfmax(vi2, vi3, vl); - vfloat32m1_t vmax45 = __riscv_vfmax(vi4, vi5, vl); - vfloat32m1_t vmax67 = __riscv_vfmax(vi6, vi7, vl); - vfloat32m1_t vmax018 = __riscv_vfmax(vmax01, vi8, vl); - - vfloat32m1_t vmax2345 = __riscv_vfmax(vmax23, vmax45, vl); - vfloat32m1_t vmax01678 = __riscv_vfmax(vmax67, vmax018, vl); - vfloat32m1_t vmax012345678 = __riscv_vfmax(vmax2345, vmax01678, vl); - - vfloat32m1_t vacc = __riscv_vfmax(vprev, vmax012345678, vl); - vacc = __riscv_vfmin(vacc, output_max, vl); - __riscv_vse32_v_f32m1(o, vacc, vl); o += vl; - - c -= vl; - } while (c != 0); - } - input = (const float**) ((uintptr_t) input + input_increment); - input_offset += input_pixel_stride; - output = (float*) ((uintptr_t) output + output_increment); - } while (--output_pixels != 0); -} diff --git a/src/s8-maxpool/gen/s8-maxpool-9p-minmax-rvv-u1v.c b/src/s8-maxpool/gen/s8-maxpool-9p-minmax-rvv-u1v.c deleted file mode 100644 index 3114918b9b0..00000000000 --- a/src/s8-maxpool/gen/s8-maxpool-9p-minmax-rvv-u1v.c +++ /dev/null @@ -1,147 +0,0 @@ -// clang-format off -// Auto-generated file. Do not edit! -// Template: src/f32-maxpool/rvv.c.in -// Generator: tools/xngen -// -// Copyright 2026 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include "src/xnnpack/maxpool.h" -#include - - -void xnn_s8_maxpool_minmax_ukernel_9p__rvv_u1v( - size_t output_pixels, - size_t kernel_elements, - size_t channels, - const int8_t** input, - size_t input_offset, - size_t input_pixel_stride, - int8_t* output, - size_t input_increment, - size_t output_increment, - const struct xnn_s8_minmax_params* restrict params) -{ - assert(output_pixels != 0); - assert(kernel_elements != 0); - assert(channels != 0); - - const int8_t output_min = params->scalar.min; - const int8_t output_max = params->scalar.max; - do { - const int8_t** i = input; - - // First pass: load the inputs, store the max pool in the output. - const int8_t* i0 = *i++; - const int8_t* i1 = 1 < kernel_elements ? *i++ : i0; - const int8_t* i2 = 2 < kernel_elements ? *i++ : i0; - const int8_t* i3 = 3 < kernel_elements ? *i++ : i0; - const int8_t* i4 = 4 < kernel_elements ? *i++ : i0; - const int8_t* i5 = 5 < kernel_elements ? *i++ : i0; - const int8_t* i6 = 6 < kernel_elements ? *i++ : i0; - const int8_t* i7 = 7 < kernel_elements ? *i++ : i0; - const int8_t* i8 = 8 < kernel_elements ? *i++ : i0; - i0 = (const int8_t*) ((uintptr_t) i0 + input_offset); - i1 = (const int8_t*) ((uintptr_t) i1 + input_offset); - i2 = (const int8_t*) ((uintptr_t) i2 + input_offset); - i3 = (const int8_t*) ((uintptr_t) i3 + input_offset); - i4 = (const int8_t*) ((uintptr_t) i4 + input_offset); - i5 = (const int8_t*) ((uintptr_t) i5 + input_offset); - i6 = (const int8_t*) ((uintptr_t) i6 + input_offset); - i7 = (const int8_t*) ((uintptr_t) i7 + input_offset); - i8 = (const int8_t*) ((uintptr_t) i8 + input_offset); - - int8_t* o = output; - size_t c = channels; - do { - size_t vl = __riscv_vsetvl_e8m1(c); - vint8m1_t vi0 = __riscv_vle8_v_i8m1(i0, vl); i0 += vl; - vint8m1_t vi1 = __riscv_vle8_v_i8m1(i1, vl); i1 += vl; - vint8m1_t vi2 = __riscv_vle8_v_i8m1(i2, vl); i2 += vl; - vint8m1_t vi3 = __riscv_vle8_v_i8m1(i3, vl); i3 += vl; - vint8m1_t vi4 = __riscv_vle8_v_i8m1(i4, vl); i4 += vl; - vint8m1_t vi5 = __riscv_vle8_v_i8m1(i5, vl); i5 += vl; - vint8m1_t vi6 = __riscv_vle8_v_i8m1(i6, vl); i6 += vl; - vint8m1_t vi7 = __riscv_vle8_v_i8m1(i7, vl); i7 += vl; - vint8m1_t vi8 = __riscv_vle8_v_i8m1(i8, vl); i8 += vl; - - vint8m1_t vmax01 = __riscv_vmax(vi0, vi1, vl); - vint8m1_t vmax23 = __riscv_vmax(vi2, vi3, vl); - vint8m1_t vmax45 = __riscv_vmax(vi4, vi5, vl); - vint8m1_t vmax67 = __riscv_vmax(vi6, vi7, vl); - vint8m1_t vmax018 = __riscv_vmax(vmax01, vi8, vl); - - vint8m1_t vmax2345 = __riscv_vmax(vmax23, vmax45, vl); - vint8m1_t vmax01678 = __riscv_vmax(vmax67, vmax018, vl); - vint8m1_t vacc = __riscv_vmax(vmax2345, vmax01678, vl); - - vacc = __riscv_vmax(vacc, output_min, vl); - vacc = __riscv_vmin(vacc, output_max, vl); - __riscv_vse8_v_i8m1(o, vacc, vl); o += vl; - - c -= vl; - } while (c != 0); - - // Passes 1 - n: Max more inputs to the output. - for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 9) { - const int8_t* i0 = *i++; - const int8_t* i1 = 1 < k ? *i++ : i0; - const int8_t* i2 = 2 < k ? *i++ : i0; - const int8_t* i3 = 3 < k ? *i++ : i0; - const int8_t* i4 = 4 < k ? *i++ : i0; - const int8_t* i5 = 5 < k ? *i++ : i0; - const int8_t* i6 = 6 < k ? *i++ : i0; - const int8_t* i7 = 7 < k ? *i++ : i0; - const int8_t* i8 = 8 < k ? *i++ : i0; - i0 = (const int8_t*) ((uintptr_t) i0 + input_offset); - i1 = (const int8_t*) ((uintptr_t) i1 + input_offset); - i2 = (const int8_t*) ((uintptr_t) i2 + input_offset); - i3 = (const int8_t*) ((uintptr_t) i3 + input_offset); - i4 = (const int8_t*) ((uintptr_t) i4 + input_offset); - i5 = (const int8_t*) ((uintptr_t) i5 + input_offset); - i6 = (const int8_t*) ((uintptr_t) i6 + input_offset); - i7 = (const int8_t*) ((uintptr_t) i7 + input_offset); - i8 = (const int8_t*) ((uintptr_t) i8 + input_offset); - - o = output; - size_t c = channels; - do { - size_t vl = __riscv_vsetvl_e8m1(c); - - vint8m1_t vi0 = __riscv_vle8_v_i8m1(i0, vl); i0 += vl; - vint8m1_t vi1 = __riscv_vle8_v_i8m1(i1, vl); i1 += vl; - vint8m1_t vi2 = __riscv_vle8_v_i8m1(i2, vl); i2 += vl; - vint8m1_t vi3 = __riscv_vle8_v_i8m1(i3, vl); i3 += vl; - vint8m1_t vi4 = __riscv_vle8_v_i8m1(i4, vl); i4 += vl; - vint8m1_t vi5 = __riscv_vle8_v_i8m1(i5, vl); i5 += vl; - vint8m1_t vi6 = __riscv_vle8_v_i8m1(i6, vl); i6 += vl; - vint8m1_t vi7 = __riscv_vle8_v_i8m1(i7, vl); i7 += vl; - vint8m1_t vi8 = __riscv_vle8_v_i8m1(i8, vl); i8 += vl; - - vint8m1_t vprev = __riscv_vle8_v_i8m1(o, vl); - - vint8m1_t vmax01 = __riscv_vmax(vi0, vi1, vl); - vint8m1_t vmax23 = __riscv_vmax(vi2, vi3, vl); - vint8m1_t vmax45 = __riscv_vmax(vi4, vi5, vl); - vint8m1_t vmax67 = __riscv_vmax(vi6, vi7, vl); - vint8m1_t vmax018 = __riscv_vmax(vmax01, vi8, vl); - - vint8m1_t vmax2345 = __riscv_vmax(vmax23, vmax45, vl); - vint8m1_t vmax01678 = __riscv_vmax(vmax67, vmax018, vl); - vint8m1_t vmax012345678 = __riscv_vmax(vmax2345, vmax01678, vl); - - vint8m1_t vacc = __riscv_vmax(vprev, vmax012345678, vl); - vacc = __riscv_vmin(vacc, output_max, vl); - __riscv_vse8_v_i8m1(o, vacc, vl); o += vl; - - c -= vl; - } while (c != 0); - } - input = (const int8_t**) ((uintptr_t) input + input_increment); - input_offset += input_pixel_stride; - output = (int8_t*) ((uintptr_t) output + output_increment); - } while (--output_pixels != 0); -} diff --git a/src/s8-maxpool/s8-maxpool-minmax.inc b/src/s8-maxpool/s8-maxpool-minmax.inc index 498299cb8d6..c385f36fd66 100644 --- a/src/s8-maxpool/s8-maxpool-minmax.inc +++ b/src/s8-maxpool/s8-maxpool-minmax.inc @@ -18,7 +18,6 @@ XNN_UKERNEL(xnn_arch_none, xnn_s8_maxpool_minmax_ukernel_9p__wasmsimd_u16, 16, 9 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD #if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR -XNN_UKERNEL(xnn_arch_riscv_vector, xnn_s8_maxpool_minmax_ukernel_9p__rvv_u1v, (1*xnn_init_hardware_config()->vlenb/sizeof(int8_t)), 9, int8_t, struct xnn_s8_minmax_params, xnn_init_s8_minmax_scalar_params) XNN_UKERNEL(xnn_arch_riscv_vector, xnn_s8_maxpool_minmax_ukernel_9p__rvv_u2v, (2*xnn_init_hardware_config()->vlenb/sizeof(int8_t)), 9, int8_t, struct xnn_s8_minmax_params, xnn_init_s8_minmax_scalar_params) #endif // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR diff --git a/src/u8-maxpool/gen/u8-maxpool-9p-minmax-rvv-u1v.c b/src/u8-maxpool/gen/u8-maxpool-9p-minmax-rvv-u1v.c deleted file mode 100644 index c0561f3d658..00000000000 --- a/src/u8-maxpool/gen/u8-maxpool-9p-minmax-rvv-u1v.c +++ /dev/null @@ -1,147 +0,0 @@ -// clang-format off -// Auto-generated file. Do not edit! -// Template: src/f32-maxpool/rvv.c.in -// Generator: tools/xngen -// -// Copyright 2026 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include "src/xnnpack/maxpool.h" -#include - - -void xnn_u8_maxpool_minmax_ukernel_9p__rvv_u1v( - size_t output_pixels, - size_t kernel_elements, - size_t channels, - const uint8_t** input, - size_t input_offset, - size_t input_pixel_stride, - uint8_t* output, - size_t input_increment, - size_t output_increment, - const struct xnn_u8_minmax_params* restrict params) -{ - assert(output_pixels != 0); - assert(kernel_elements != 0); - assert(channels != 0); - - const uint8_t output_min = params->scalar.min; - const uint8_t output_max = params->scalar.max; - do { - const uint8_t** i = input; - - // First pass: load the inputs, store the max pool in the output. - const uint8_t* i0 = *i++; - const uint8_t* i1 = 1 < kernel_elements ? *i++ : i0; - const uint8_t* i2 = 2 < kernel_elements ? *i++ : i0; - const uint8_t* i3 = 3 < kernel_elements ? *i++ : i0; - const uint8_t* i4 = 4 < kernel_elements ? *i++ : i0; - const uint8_t* i5 = 5 < kernel_elements ? *i++ : i0; - const uint8_t* i6 = 6 < kernel_elements ? *i++ : i0; - const uint8_t* i7 = 7 < kernel_elements ? *i++ : i0; - const uint8_t* i8 = 8 < kernel_elements ? *i++ : i0; - i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset); - i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset); - i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset); - i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset); - i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset); - i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset); - i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset); - i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset); - - uint8_t* o = output; - size_t c = channels; - do { - size_t vl = __riscv_vsetvl_e8m1(c); - vuint8m1_t vi0 = __riscv_vle8_v_u8m1(i0, vl); i0 += vl; - vuint8m1_t vi1 = __riscv_vle8_v_u8m1(i1, vl); i1 += vl; - vuint8m1_t vi2 = __riscv_vle8_v_u8m1(i2, vl); i2 += vl; - vuint8m1_t vi3 = __riscv_vle8_v_u8m1(i3, vl); i3 += vl; - vuint8m1_t vi4 = __riscv_vle8_v_u8m1(i4, vl); i4 += vl; - vuint8m1_t vi5 = __riscv_vle8_v_u8m1(i5, vl); i5 += vl; - vuint8m1_t vi6 = __riscv_vle8_v_u8m1(i6, vl); i6 += vl; - vuint8m1_t vi7 = __riscv_vle8_v_u8m1(i7, vl); i7 += vl; - vuint8m1_t vi8 = __riscv_vle8_v_u8m1(i8, vl); i8 += vl; - - vuint8m1_t vmax01 = __riscv_vmaxu(vi0, vi1, vl); - vuint8m1_t vmax23 = __riscv_vmaxu(vi2, vi3, vl); - vuint8m1_t vmax45 = __riscv_vmaxu(vi4, vi5, vl); - vuint8m1_t vmax67 = __riscv_vmaxu(vi6, vi7, vl); - vuint8m1_t vmax018 = __riscv_vmaxu(vmax01, vi8, vl); - - vuint8m1_t vmax2345 = __riscv_vmaxu(vmax23, vmax45, vl); - vuint8m1_t vmax01678 = __riscv_vmaxu(vmax67, vmax018, vl); - vuint8m1_t vacc = __riscv_vmaxu(vmax2345, vmax01678, vl); - - vacc = __riscv_vmaxu(vacc, output_min, vl); - vacc = __riscv_vminu(vacc, output_max, vl); - __riscv_vse8_v_u8m1(o, vacc, vl); o += vl; - - c -= vl; - } while (c != 0); - - // Passes 1 - n: Max more inputs to the output. - for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 9) { - const uint8_t* i0 = *i++; - const uint8_t* i1 = 1 < k ? *i++ : i0; - const uint8_t* i2 = 2 < k ? *i++ : i0; - const uint8_t* i3 = 3 < k ? *i++ : i0; - const uint8_t* i4 = 4 < k ? *i++ : i0; - const uint8_t* i5 = 5 < k ? *i++ : i0; - const uint8_t* i6 = 6 < k ? *i++ : i0; - const uint8_t* i7 = 7 < k ? *i++ : i0; - const uint8_t* i8 = 8 < k ? *i++ : i0; - i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset); - i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset); - i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset); - i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset); - i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset); - i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset); - i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset); - i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset); - - o = output; - size_t c = channels; - do { - size_t vl = __riscv_vsetvl_e8m1(c); - - vuint8m1_t vi0 = __riscv_vle8_v_u8m1(i0, vl); i0 += vl; - vuint8m1_t vi1 = __riscv_vle8_v_u8m1(i1, vl); i1 += vl; - vuint8m1_t vi2 = __riscv_vle8_v_u8m1(i2, vl); i2 += vl; - vuint8m1_t vi3 = __riscv_vle8_v_u8m1(i3, vl); i3 += vl; - vuint8m1_t vi4 = __riscv_vle8_v_u8m1(i4, vl); i4 += vl; - vuint8m1_t vi5 = __riscv_vle8_v_u8m1(i5, vl); i5 += vl; - vuint8m1_t vi6 = __riscv_vle8_v_u8m1(i6, vl); i6 += vl; - vuint8m1_t vi7 = __riscv_vle8_v_u8m1(i7, vl); i7 += vl; - vuint8m1_t vi8 = __riscv_vle8_v_u8m1(i8, vl); i8 += vl; - - vuint8m1_t vprev = __riscv_vle8_v_u8m1(o, vl); - - vuint8m1_t vmax01 = __riscv_vmaxu(vi0, vi1, vl); - vuint8m1_t vmax23 = __riscv_vmaxu(vi2, vi3, vl); - vuint8m1_t vmax45 = __riscv_vmaxu(vi4, vi5, vl); - vuint8m1_t vmax67 = __riscv_vmaxu(vi6, vi7, vl); - vuint8m1_t vmax018 = __riscv_vmaxu(vmax01, vi8, vl); - - vuint8m1_t vmax2345 = __riscv_vmaxu(vmax23, vmax45, vl); - vuint8m1_t vmax01678 = __riscv_vmaxu(vmax67, vmax018, vl); - vuint8m1_t vmax012345678 = __riscv_vmaxu(vmax2345, vmax01678, vl); - - vuint8m1_t vacc = __riscv_vmaxu(vprev, vmax012345678, vl); - vacc = __riscv_vminu(vacc, output_max, vl); - __riscv_vse8_v_u8m1(o, vacc, vl); o += vl; - - c -= vl; - } while (c != 0); - } - input = (const uint8_t**) ((uintptr_t) input + input_increment); - input_offset += input_pixel_stride; - output = (uint8_t*) ((uintptr_t) output + output_increment); - } while (--output_pixels != 0); -} diff --git a/src/u8-maxpool/u8-maxpool-minmax.inc b/src/u8-maxpool/u8-maxpool-minmax.inc index f090da6c6c4..6cff5d4938f 100644 --- a/src/u8-maxpool/u8-maxpool-minmax.inc +++ b/src/u8-maxpool/u8-maxpool-minmax.inc @@ -18,7 +18,6 @@ XNN_UKERNEL(xnn_arch_none, xnn_u8_maxpool_minmax_ukernel_9p__wasmsimd_u16, 16, 9 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD #if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR -XNN_UKERNEL(xnn_arch_riscv_vector, xnn_u8_maxpool_minmax_ukernel_9p__rvv_u1v, (1*xnn_init_hardware_config()->vlenb/sizeof(uint8_t)), 9, uint8_t, struct xnn_u8_minmax_params, xnn_init_u8_minmax_scalar_params) XNN_UKERNEL(xnn_arch_riscv_vector, xnn_u8_maxpool_minmax_ukernel_9p__rvv_u2v, (2*xnn_init_hardware_config()->vlenb/sizeof(uint8_t)), 9, uint8_t, struct xnn_u8_minmax_params, xnn_init_u8_minmax_scalar_params) #endif // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR