Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions bench/f16-raddstoreexpminusmax.cc
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,33 @@ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_u64_acc4,
->UseRealTime();
#endif // XNN_ENABLE_AVX2 && (XNN_ARCH_X86 || XNN_ARCH_X86_64)

#if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR
BENCHMARK_CAPTURE(
f16_raddstoreexpminusmax, rvvfp16arith_rr2_p2_u1v,
xnn_f16_rmax_ukernel__rvvfp16arith_u8v,
xnn_f16_raddstoreexpminusmax_ukernel__rvvfp16arith_rr2_p2_u1v, nullptr,
xnn_arch_riscv_vector_fp16_arith)
->Apply(
benchmark::utils::UnaryElementwiseParameters<xnn_float16, xnn_float16>)
->UseRealTime();
BENCHMARK_CAPTURE(
f16_raddstoreexpminusmax, rvvfp16arith_rr2_p2_u2v,
xnn_f16_rmax_ukernel__rvvfp16arith_u8v,
xnn_f16_raddstoreexpminusmax_ukernel__rvvfp16arith_rr2_p2_u2v, nullptr,
xnn_arch_riscv_vector_fp16_arith)
->Apply(
benchmark::utils::UnaryElementwiseParameters<xnn_float16, xnn_float16>)
->UseRealTime();
BENCHMARK_CAPTURE(
f16_raddstoreexpminusmax, rvvfp16arith_rr2_p2_u4v,
xnn_f16_rmax_ukernel__rvvfp16arith_u8v,
xnn_f16_raddstoreexpminusmax_ukernel__rvvfp16arith_rr2_p2_u4v, nullptr,
xnn_arch_riscv_vector_fp16_arith)
->Apply(
benchmark::utils::UnaryElementwiseParameters<xnn_float16, xnn_float16>)
->UseRealTime();
#endif // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR

#ifndef XNNPACK_BENCHMARK_NO_MAIN
XNN_BENCHMARK_MAIN();
#endif
2 changes: 1 addition & 1 deletion cmake/gen/avx_microkernels.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,6 @@ SET(PROD_AVX_MICROKERNEL_SRCS
src/f32-vrsqrt/gen/f32-vrsqrt-avx-rsqrt.c
src/f32-vrsqrt/gen/f32-vrsqrt-avx-sqrt.c
src/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-div-u16.c
src/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-nr2-u16.c
src/f32-vsin/gen/f32-vsin-avx-rational-5-4-div.c
src/f32-vsqrt/gen/f32-vsqrt-avx-rsqrt.c
src/f32-vsqrt/gen/f32-vsqrt-avx-sqrt.c
Expand Down Expand Up @@ -237,6 +236,7 @@ SET(NON_PROD_AVX_MICROKERNEL_SRCS
src/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-div-u24.c
src/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-div-u32.c
src/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-nr2-u8.c
src/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-nr2-u16.c
src/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-nr2-u24.c
src/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-nr2-u32.c
src/f32-vtanh/gen/f32-vtanh-avx-rational-9-8-nr.c
Expand Down
3 changes: 3 additions & 0 deletions cmake/gen/rvvfp16arith_microkernels.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ SET(PROD_RVVFP16ARITH_MICROKERNEL_SRCS
src/f16-gemm/gen/f16-gemm-7x4v-minmax-rvvfp16arith.c
src/f16-igemm/gen/f16-igemm-1x4v-minmax-rvvfp16arith.c
src/f16-igemm/gen/f16-igemm-7x4v-minmax-rvvfp16arith.c
src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-rvvfp16arith-rr2-p2-u4v.c
src/f16-rdminmax/gen/f16-rdmax-2p2x-rvvfp16arith-u8v.c
src/f16-rdminmax/gen/f16-rdmin-2p2x-rvvfp16arith-u8v.c
src/f16-rminmax/gen/f16-rmax-rvvfp16arith-u8v.c
Expand Down Expand Up @@ -73,6 +74,8 @@ SET(NON_PROD_RVVFP16ARITH_MICROKERNEL_SRCS
src/f16-f32acc-rsum2/gen/f16-f32acc-rsum2-rvvfp16arith-u2v.c
src/f16-gemm/gen/f16-gemm-4x4v-minmax-rvvfp16arith.c
src/f16-igemm/gen/f16-igemm-4x4v-minmax-rvvfp16arith.c
src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-rvvfp16arith-rr2-p2-u1v.c
src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-rvvfp16arith-rr2-p2-u2v.c
src/f16-rminmax/gen/f16-rmax-rvvfp16arith-u4v.c
src/f16-rminmax/gen/f16-rmin-rvvfp16arith-u4v.c
src/f16-rminmax/gen/f16-rminmax-rvvfp16arith-u4v.c
Expand Down
2 changes: 1 addition & 1 deletion gen/avx_microkernels.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,6 @@ PROD_AVX_MICROKERNEL_SRCS = [
"src/f32-vrsqrt/gen/f32-vrsqrt-avx-rsqrt.c",
"src/f32-vrsqrt/gen/f32-vrsqrt-avx-sqrt.c",
"src/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-div-u16.c",
"src/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-nr2-u16.c",
"src/f32-vsin/gen/f32-vsin-avx-rational-5-4-div.c",
"src/f32-vsqrt/gen/f32-vsqrt-avx-rsqrt.c",
"src/f32-vsqrt/gen/f32-vsqrt-avx-sqrt.c",
Expand Down Expand Up @@ -234,6 +233,7 @@ NON_PROD_AVX_MICROKERNEL_SRCS = [
"src/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-div-u24.c",
"src/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-div-u32.c",
"src/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-nr2-u8.c",
"src/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-nr2-u16.c",
"src/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-nr2-u24.c",
"src/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-nr2-u32.c",
"src/f32-vtanh/gen/f32-vtanh-avx-rational-9-8-nr.c",
Expand Down
3 changes: 3 additions & 0 deletions gen/rvvfp16arith_microkernels.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ PROD_RVVFP16ARITH_MICROKERNEL_SRCS = [
"src/f16-gemm/gen/f16-gemm-7x4v-minmax-rvvfp16arith.c",
"src/f16-igemm/gen/f16-igemm-1x4v-minmax-rvvfp16arith.c",
"src/f16-igemm/gen/f16-igemm-7x4v-minmax-rvvfp16arith.c",
"src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-rvvfp16arith-rr2-p2-u4v.c",
"src/f16-rdminmax/gen/f16-rdmax-2p2x-rvvfp16arith-u8v.c",
"src/f16-rdminmax/gen/f16-rdmin-2p2x-rvvfp16arith-u8v.c",
"src/f16-rminmax/gen/f16-rmax-rvvfp16arith-u8v.c",
Expand Down Expand Up @@ -70,6 +71,8 @@ NON_PROD_RVVFP16ARITH_MICROKERNEL_SRCS = [
"src/f16-f32acc-rsum2/gen/f16-f32acc-rsum2-rvvfp16arith-u2v.c",
"src/f16-gemm/gen/f16-gemm-4x4v-minmax-rvvfp16arith.c",
"src/f16-igemm/gen/f16-igemm-4x4v-minmax-rvvfp16arith.c",
"src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-rvvfp16arith-rr2-p2-u1v.c",
"src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-rvvfp16arith-rr2-p2-u2v.c",
"src/f16-rminmax/gen/f16-rmax-rvvfp16arith-u4v.c",
"src/f16-rminmax/gen/f16-rmin-rvvfp16arith-u4v.c",
"src/f16-rminmax/gen/f16-rminmax-rvvfp16arith-u4v.c",
Expand Down
5 changes: 5 additions & 0 deletions scripts/generate-f16-raddstoreexpminusmax.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,8 @@ tools/xngen src/f16-raddstoreexpminusmax/avx2-rr1-p2.c.in -D BATCH_TILE=48 -D AC
tools/xngen src/f16-raddstoreexpminusmax/avx2-rr1-p2.c.in -D BATCH_TILE=64 -D ACCUMULATORS=1 -o src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-u64.c &
tools/xngen src/f16-raddstoreexpminusmax/avx2-rr1-p2.c.in -D BATCH_TILE=64 -D ACCUMULATORS=2 -o src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-u64-acc2.c &
tools/xngen src/f16-raddstoreexpminusmax/avx2-rr1-p2.c.in -D BATCH_TILE=64 -D ACCUMULATORS=4 -o src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-u64-acc4.c &

# RISC-V Vector
tools/xngen src/f16-raddstoreexpminusmax/rvvfp16arith-rr2-p2.c.in -D LMUL=1 -o src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-rvvfp16arith-rr2-p2-u1v.c &
tools/xngen src/f16-raddstoreexpminusmax/rvvfp16arith-rr2-p2.c.in -D LMUL=2 -o src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-rvvfp16arith-rr2-p2-u2v.c &
tools/xngen src/f16-raddstoreexpminusmax/rvvfp16arith-rr2-p2.c.in -D LMUL=4 -o src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-rvvfp16arith-rr2-p2-u4v.c &
8 changes: 8 additions & 0 deletions src/configs/raddstoreexpminusmax-config.c
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,12 @@ static void init_f16_raddstoreexpminusmax_config(void) {
f16_raddstoreexpminusmax_config.ukernel = XNN_INIT_RADDSTOREEXPMINUSMAX_UKERNEL(xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_u32);
}
#endif
#elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR
const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
assert(hardware_config != NULL);
if (hardware_config->arch_flags & xnn_arch_riscv_vector_fp16_arith) {
f16_raddstoreexpminusmax_config.ukernel = XNN_INIT_RADDSTOREEXPMINUSMAX_UKERNEL(xnn_f16_raddstoreexpminusmax_ukernel__rvvfp16arith_rr2_p2_u4v);
}
#endif
}

Expand Down Expand Up @@ -127,6 +133,8 @@ static bool is_f16_compatible_config(const struct xnn_hardware_config* hardware_
return (hardware_config->arch_flags & xnn_arch_arm_neon_fp16_arith);
#elif XNN_ARCH_X86 || XNN_ARCH_X86_64
return (hardware_config->arch_flags & xnn_arch_x86_avx2);
#elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR
return (hardware_config->arch_flags & xnn_arch_riscv_vector_fp16_arith);
#else
return false;
#endif
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
// clang-format off
// Auto-generated file. Do not edit!
// Template: src/f16-raddstoreexpminusmax/rvvfp16arith-rr2-p2.c.in
// Generator: tools/xngen
//
// Copyright 2026 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#include <riscv_vector.h>

#include "src/xnnpack/common.h"
#include "src/xnnpack/math.h"
#include "src/xnnpack/raddstoreexpminusmax.h"


void xnn_f16_raddstoreexpminusmax_ukernel__rvvfp16arith_rr2_p2_u1v(
size_t batch,
const xnn_float16* input,
const xnn_float16* max,
xnn_float16* output,
float* sum,
const void* params) XNN_OOB_READS
{
assert(batch != 0);
assert(batch % sizeof(xnn_float16) == 0);
assert(input != NULL);
assert(max != NULL);
assert(output != NULL);
assert(sum != NULL);

const xnn_float16 vlog2e = 0x1.715476p0f;
const xnn_float16 vmagic_bias = 0x1.83Cp+10f;
const xnn_float16 vminus_ln2_hi = -0x1.630p-1f;
const xnn_float16 vminus_ln2_lo = 0x1.BD0p-13f;
const xnn_float16 vc2 = 0x1.FF3A32p-2f;
const xnn_float16 vc1 = 0x1.039E10p+0f;
const xnn_float16 vdenorm_cutoff = -0x1.368000p+3f;

const xnn_float16* i = input;
xnn_float16* o = output;

batch >>= XNN_LOG2_SIZEOF_FLOAT16;

size_t vlmax = __riscv_vsetvl_e16m1(batch);
vfloat32m2_t vacc = __riscv_vfmv_v_f_f32m2(0.0f, vlmax);

do {
size_t vl = __riscv_vsetvl_e16m1(batch); batch -= vl;

vfloat16m1_t vi = __riscv_vle16_v_f16m1(i, vl); i += vl;

const vfloat16m1_t vx = __riscv_vfsub(vi, *max, vl);

vfloat16m1_t vn = __riscv_vfmv_v_f_f16m1(vmagic_bias, vl);
vn = __riscv_vfmacc(vn, vlog2e, vx, vl);

const vfloat16m1_t vs = __riscv_vreinterpret_f16m1(__riscv_vsll(__riscv_vreinterpret_i16m1(vn), 10, vl));

vn = __riscv_vfsub(vn, vmagic_bias, vl);

vfloat16m1_t vt = __riscv_vmv_v(vx, vl);
vt = __riscv_vfmacc(vt, vminus_ln2_hi, vn, vl);
vt = __riscv_vfmacc(vt, vminus_ln2_lo, vn, vl);

vfloat16m1_t vp = __riscv_vfmv_v_f_f16m1(vc1, vl);
vp = __riscv_vfmacc(vp, vc2, vt, vl);

vt = __riscv_vfmul(vt, vs, vl);

vfloat16m1_t vf = __riscv_vmv_v(vs, vl);
vf = __riscv_vfmacc(vf, vp, vt, vl);

const vbool16_t vmask = __riscv_vmflt(vx, vdenorm_cutoff, vl);
vf = __riscv_vfmerge(vf, 0.0f, vmask, vl);

__riscv_vse16(o, vf, vl); o += vl;

vacc = __riscv_vfwadd_wv(vacc, vf, vl);
} while (batch > 0);

vfloat32m1_t v0 = __riscv_vfmv_s_f_f32m1(0.0f, 1);
*sum = __riscv_vfmv_f(__riscv_vfredusum(vacc, v0, vlmax));
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
// clang-format off
// Auto-generated file. Do not edit!
// Template: src/f16-raddstoreexpminusmax/rvvfp16arith-rr2-p2.c.in
// Generator: tools/xngen
//
// Copyright 2026 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#include <riscv_vector.h>

#include "src/xnnpack/common.h"
#include "src/xnnpack/math.h"
#include "src/xnnpack/raddstoreexpminusmax.h"


void xnn_f16_raddstoreexpminusmax_ukernel__rvvfp16arith_rr2_p2_u2v(
size_t batch,
const xnn_float16* input,
const xnn_float16* max,
xnn_float16* output,
float* sum,
const void* params) XNN_OOB_READS
{
assert(batch != 0);
assert(batch % sizeof(xnn_float16) == 0);
assert(input != NULL);
assert(max != NULL);
assert(output != NULL);
assert(sum != NULL);

const xnn_float16 vlog2e = 0x1.715476p0f;
const xnn_float16 vmagic_bias = 0x1.83Cp+10f;
const xnn_float16 vminus_ln2_hi = -0x1.630p-1f;
const xnn_float16 vminus_ln2_lo = 0x1.BD0p-13f;
const xnn_float16 vc2 = 0x1.FF3A32p-2f;
const xnn_float16 vc1 = 0x1.039E10p+0f;
const xnn_float16 vdenorm_cutoff = -0x1.368000p+3f;

const xnn_float16* i = input;
xnn_float16* o = output;

batch >>= XNN_LOG2_SIZEOF_FLOAT16;

size_t vlmax = __riscv_vsetvl_e16m2(batch);
vfloat32m4_t vacc = __riscv_vfmv_v_f_f32m4(0.0f, vlmax);

do {
size_t vl = __riscv_vsetvl_e16m2(batch); batch -= vl;

vfloat16m2_t vi = __riscv_vle16_v_f16m2(i, vl); i += vl;

const vfloat16m2_t vx = __riscv_vfsub(vi, *max, vl);

vfloat16m2_t vn = __riscv_vfmv_v_f_f16m2(vmagic_bias, vl);
vn = __riscv_vfmacc(vn, vlog2e, vx, vl);

const vfloat16m2_t vs = __riscv_vreinterpret_f16m2(__riscv_vsll(__riscv_vreinterpret_i16m2(vn), 10, vl));

vn = __riscv_vfsub(vn, vmagic_bias, vl);

vfloat16m2_t vt = __riscv_vmv_v(vx, vl);
vt = __riscv_vfmacc(vt, vminus_ln2_hi, vn, vl);
vt = __riscv_vfmacc(vt, vminus_ln2_lo, vn, vl);

vfloat16m2_t vp = __riscv_vfmv_v_f_f16m2(vc1, vl);
vp = __riscv_vfmacc(vp, vc2, vt, vl);

vt = __riscv_vfmul(vt, vs, vl);

vfloat16m2_t vf = __riscv_vmv_v(vs, vl);
vf = __riscv_vfmacc(vf, vp, vt, vl);

const vbool8_t vmask = __riscv_vmflt(vx, vdenorm_cutoff, vl);
vf = __riscv_vfmerge(vf, 0.0f, vmask, vl);

__riscv_vse16(o, vf, vl); o += vl;

vacc = __riscv_vfwadd_wv(vacc, vf, vl);
} while (batch > 0);

vfloat32m1_t v0 = __riscv_vfmv_s_f_f32m1(0.0f, 1);
*sum = __riscv_vfmv_f(__riscv_vfredusum(vacc, v0, vlmax));
}
Loading
Loading