Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
135 changes: 135 additions & 0 deletions bench/qd8-f16-qc2w-gemm.cc
Original file line number Diff line number Diff line change
Expand Up @@ -296,6 +296,141 @@ static void qd8_f16_qc2w_gemm_minmax_ukernel_4x4__scalar(benchmark::State& state

BENCHMARK_GEMM(qd8_f16_qc2w_gemm_minmax_ukernel_4x4__scalar)

#if XNN_ENABLE_ARM_DOTPROD && XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
// Benchmarks xnn_qd8_f16_qc2w_gemm_minmax_ukernel_1x8c4__neondotfp16arith by
// forwarding it to GEMMBenchmark with mr=1, nr=8, kr=4, sr=1, together with
// xnn_init_f16_minmax_scalar_params and xnn_pack_qd8_qc2w_gemm_goi_w.
// NOTE(review): arch_flags presumably restricts the run to CPUs reporting both
// NEON dot-product and FP16 arithmetic support - confirm in GEMMBenchmark.
static void qd8_f16_qc2w_gemm_minmax_ukernel_1x8c4__neondotfp16arith(benchmark::State& state) {
GEMMBenchmark(state,
xnn_qd8_f16_qc2w_gemm_minmax_ukernel_1x8c4__neondotfp16arith,
xnn_init_f16_minmax_scalar_params,
xnn_pack_qd8_qc2w_gemm_goi_w,
/*mr=*/1, /*nr=*/8, /*kr=*/4, /*sr=*/1,
/*arch_flags=*/xnn_arch_arm_neon_dot | xnn_arch_arm_neon_fp16_arith);
}

// Hands the function above to BENCHMARK_GEMM (presumably registering it with
// the benchmark framework - macro definition not visible here).
BENCHMARK_GEMM(qd8_f16_qc2w_gemm_minmax_ukernel_1x8c4__neondotfp16arith)

// Benchmarks xnn_qd8_f16_qc2w_gemm_minmax_ukernel_2x8c4__neondotfp16arith by
// forwarding it to GEMMBenchmark with mr=2, nr=8, kr=4, sr=1, together with
// xnn_init_f16_minmax_scalar_params and xnn_pack_qd8_qc2w_gemm_goi_w.
// NOTE(review): arch_flags presumably restricts the run to CPUs reporting both
// NEON dot-product and FP16 arithmetic support - confirm in GEMMBenchmark.
static void qd8_f16_qc2w_gemm_minmax_ukernel_2x8c4__neondotfp16arith(benchmark::State& state) {
GEMMBenchmark(state,
xnn_qd8_f16_qc2w_gemm_minmax_ukernel_2x8c4__neondotfp16arith,
xnn_init_f16_minmax_scalar_params,
xnn_pack_qd8_qc2w_gemm_goi_w,
/*mr=*/2, /*nr=*/8, /*kr=*/4, /*sr=*/1,
/*arch_flags=*/xnn_arch_arm_neon_dot | xnn_arch_arm_neon_fp16_arith);
}

// Hands the function above to BENCHMARK_GEMM (presumably registering it with
// the benchmark framework - macro definition not visible here).
BENCHMARK_GEMM(qd8_f16_qc2w_gemm_minmax_ukernel_2x8c4__neondotfp16arith)

// Benchmarks xnn_qd8_f16_qc2w_gemm_minmax_ukernel_3x8c4__neondotfp16arith by
// forwarding it to GEMMBenchmark with mr=3, nr=8, kr=4, sr=1, together with
// xnn_init_f16_minmax_scalar_params and xnn_pack_qd8_qc2w_gemm_goi_w.
// NOTE(review): arch_flags presumably restricts the run to CPUs reporting both
// NEON dot-product and FP16 arithmetic support - confirm in GEMMBenchmark.
static void qd8_f16_qc2w_gemm_minmax_ukernel_3x8c4__neondotfp16arith(benchmark::State& state) {
GEMMBenchmark(state,
xnn_qd8_f16_qc2w_gemm_minmax_ukernel_3x8c4__neondotfp16arith,
xnn_init_f16_minmax_scalar_params,
xnn_pack_qd8_qc2w_gemm_goi_w,
/*mr=*/3, /*nr=*/8, /*kr=*/4, /*sr=*/1,
/*arch_flags=*/xnn_arch_arm_neon_dot | xnn_arch_arm_neon_fp16_arith);
}

// Hands the function above to BENCHMARK_GEMM (presumably registering it with
// the benchmark framework - macro definition not visible here).
BENCHMARK_GEMM(qd8_f16_qc2w_gemm_minmax_ukernel_3x8c4__neondotfp16arith)

// Benchmarks xnn_qd8_f16_qc2w_gemm_minmax_ukernel_4x8c4__neondotfp16arith by
// forwarding it to GEMMBenchmark with mr=4, nr=8, kr=4, sr=1, together with
// xnn_init_f16_minmax_scalar_params and xnn_pack_qd8_qc2w_gemm_goi_w.
// NOTE(review): arch_flags presumably restricts the run to CPUs reporting both
// NEON dot-product and FP16 arithmetic support - confirm in GEMMBenchmark.
static void qd8_f16_qc2w_gemm_minmax_ukernel_4x8c4__neondotfp16arith(benchmark::State& state) {
GEMMBenchmark(state,
xnn_qd8_f16_qc2w_gemm_minmax_ukernel_4x8c4__neondotfp16arith,
xnn_init_f16_minmax_scalar_params,
xnn_pack_qd8_qc2w_gemm_goi_w,
/*mr=*/4, /*nr=*/8, /*kr=*/4, /*sr=*/1,
/*arch_flags=*/xnn_arch_arm_neon_dot | xnn_arch_arm_neon_fp16_arith);
}

// Hands the function above to BENCHMARK_GEMM (presumably registering it with
// the benchmark framework - macro definition not visible here).
BENCHMARK_GEMM(qd8_f16_qc2w_gemm_minmax_ukernel_4x8c4__neondotfp16arith)

// Benchmarks xnn_qd8_f16_qc2w_gemm_minmax_ukernel_5x8c4__neondotfp16arith by
// forwarding it to GEMMBenchmark with mr=5, nr=8, kr=4, sr=1, together with
// xnn_init_f16_minmax_scalar_params and xnn_pack_qd8_qc2w_gemm_goi_w.
// NOTE(review): arch_flags presumably restricts the run to CPUs reporting both
// NEON dot-product and FP16 arithmetic support - confirm in GEMMBenchmark.
static void qd8_f16_qc2w_gemm_minmax_ukernel_5x8c4__neondotfp16arith(benchmark::State& state) {
GEMMBenchmark(state,
xnn_qd8_f16_qc2w_gemm_minmax_ukernel_5x8c4__neondotfp16arith,
xnn_init_f16_minmax_scalar_params,
xnn_pack_qd8_qc2w_gemm_goi_w,
/*mr=*/5, /*nr=*/8, /*kr=*/4, /*sr=*/1,
/*arch_flags=*/xnn_arch_arm_neon_dot | xnn_arch_arm_neon_fp16_arith);
}

// Hands the function above to BENCHMARK_GEMM (presumably registering it with
// the benchmark framework - macro definition not visible here).
BENCHMARK_GEMM(qd8_f16_qc2w_gemm_minmax_ukernel_5x8c4__neondotfp16arith)

// Benchmarks xnn_qd8_f16_qc2w_gemm_minmax_ukernel_6x8c4__neondotfp16arith by
// forwarding it to GEMMBenchmark with mr=6, nr=8, kr=4, sr=1, together with
// xnn_init_f16_minmax_scalar_params and xnn_pack_qd8_qc2w_gemm_goi_w.
// NOTE(review): arch_flags presumably restricts the run to CPUs reporting both
// NEON dot-product and FP16 arithmetic support - confirm in GEMMBenchmark.
static void qd8_f16_qc2w_gemm_minmax_ukernel_6x8c4__neondotfp16arith(benchmark::State& state) {
GEMMBenchmark(state,
xnn_qd8_f16_qc2w_gemm_minmax_ukernel_6x8c4__neondotfp16arith,
xnn_init_f16_minmax_scalar_params,
xnn_pack_qd8_qc2w_gemm_goi_w,
/*mr=*/6, /*nr=*/8, /*kr=*/4, /*sr=*/1,
/*arch_flags=*/xnn_arch_arm_neon_dot | xnn_arch_arm_neon_fp16_arith);
}

// Hands the function above to BENCHMARK_GEMM (presumably registering it with
// the benchmark framework - macro definition not visible here).
BENCHMARK_GEMM(qd8_f16_qc2w_gemm_minmax_ukernel_6x8c4__neondotfp16arith)

// Benchmarks xnn_qd8_f16_qc2w_gemm_minmax_ukernel_1x16c4__neondotfp16arith by
// forwarding it to GEMMBenchmark with mr=1, nr=16, kr=4, sr=1, together with
// xnn_init_f16_minmax_scalar_params and xnn_pack_qd8_qc2w_gemm_goi_w.
// NOTE(review): arch_flags presumably restricts the run to CPUs reporting both
// NEON dot-product and FP16 arithmetic support - confirm in GEMMBenchmark.
static void qd8_f16_qc2w_gemm_minmax_ukernel_1x16c4__neondotfp16arith(benchmark::State& state) {
GEMMBenchmark(state,
xnn_qd8_f16_qc2w_gemm_minmax_ukernel_1x16c4__neondotfp16arith,
xnn_init_f16_minmax_scalar_params,
xnn_pack_qd8_qc2w_gemm_goi_w,
/*mr=*/1, /*nr=*/16, /*kr=*/4, /*sr=*/1,
/*arch_flags=*/xnn_arch_arm_neon_dot | xnn_arch_arm_neon_fp16_arith);
}

// Hands the function above to BENCHMARK_GEMM (presumably registering it with
// the benchmark framework - macro definition not visible here).
BENCHMARK_GEMM(qd8_f16_qc2w_gemm_minmax_ukernel_1x16c4__neondotfp16arith)

// Benchmarks xnn_qd8_f16_qc2w_gemm_minmax_ukernel_2x16c4__neondotfp16arith by
// forwarding it to GEMMBenchmark with mr=2, nr=16, kr=4, sr=1, together with
// xnn_init_f16_minmax_scalar_params and xnn_pack_qd8_qc2w_gemm_goi_w.
// NOTE(review): arch_flags presumably restricts the run to CPUs reporting both
// NEON dot-product and FP16 arithmetic support - confirm in GEMMBenchmark.
static void qd8_f16_qc2w_gemm_minmax_ukernel_2x16c4__neondotfp16arith(benchmark::State& state) {
GEMMBenchmark(state,
xnn_qd8_f16_qc2w_gemm_minmax_ukernel_2x16c4__neondotfp16arith,
xnn_init_f16_minmax_scalar_params,
xnn_pack_qd8_qc2w_gemm_goi_w,
/*mr=*/2, /*nr=*/16, /*kr=*/4, /*sr=*/1,
/*arch_flags=*/xnn_arch_arm_neon_dot | xnn_arch_arm_neon_fp16_arith);
}

// Hands the function above to BENCHMARK_GEMM (presumably registering it with
// the benchmark framework - macro definition not visible here).
BENCHMARK_GEMM(qd8_f16_qc2w_gemm_minmax_ukernel_2x16c4__neondotfp16arith)

// Benchmarks xnn_qd8_f16_qc2w_gemm_minmax_ukernel_3x16c4__neondotfp16arith by
// forwarding it to GEMMBenchmark with mr=3, nr=16, kr=4, sr=1, together with
// xnn_init_f16_minmax_scalar_params and xnn_pack_qd8_qc2w_gemm_goi_w.
// NOTE(review): arch_flags presumably restricts the run to CPUs reporting both
// NEON dot-product and FP16 arithmetic support - confirm in GEMMBenchmark.
static void qd8_f16_qc2w_gemm_minmax_ukernel_3x16c4__neondotfp16arith(benchmark::State& state) {
GEMMBenchmark(state,
xnn_qd8_f16_qc2w_gemm_minmax_ukernel_3x16c4__neondotfp16arith,
xnn_init_f16_minmax_scalar_params,
xnn_pack_qd8_qc2w_gemm_goi_w,
/*mr=*/3, /*nr=*/16, /*kr=*/4, /*sr=*/1,
/*arch_flags=*/xnn_arch_arm_neon_dot | xnn_arch_arm_neon_fp16_arith);
}

// Hands the function above to BENCHMARK_GEMM (presumably registering it with
// the benchmark framework - macro definition not visible here).
BENCHMARK_GEMM(qd8_f16_qc2w_gemm_minmax_ukernel_3x16c4__neondotfp16arith)

// Benchmarks xnn_qd8_f16_qc2w_gemm_minmax_ukernel_4x16c4__neondotfp16arith by
// forwarding it to GEMMBenchmark with mr=4, nr=16, kr=4, sr=1, together with
// xnn_init_f16_minmax_scalar_params and xnn_pack_qd8_qc2w_gemm_goi_w.
// NOTE(review): arch_flags presumably restricts the run to CPUs reporting both
// NEON dot-product and FP16 arithmetic support - confirm in GEMMBenchmark.
static void qd8_f16_qc2w_gemm_minmax_ukernel_4x16c4__neondotfp16arith(benchmark::State& state) {
GEMMBenchmark(state,
xnn_qd8_f16_qc2w_gemm_minmax_ukernel_4x16c4__neondotfp16arith,
xnn_init_f16_minmax_scalar_params,
xnn_pack_qd8_qc2w_gemm_goi_w,
/*mr=*/4, /*nr=*/16, /*kr=*/4, /*sr=*/1,
/*arch_flags=*/xnn_arch_arm_neon_dot | xnn_arch_arm_neon_fp16_arith);
}

// Hands the function above to BENCHMARK_GEMM (presumably registering it with
// the benchmark framework - macro definition not visible here).
BENCHMARK_GEMM(qd8_f16_qc2w_gemm_minmax_ukernel_4x16c4__neondotfp16arith)

// Benchmarks xnn_qd8_f16_qc2w_gemm_minmax_ukernel_5x16c4__neondotfp16arith by
// forwarding it to GEMMBenchmark with mr=5, nr=16, kr=4, sr=1, together with
// xnn_init_f16_minmax_scalar_params and xnn_pack_qd8_qc2w_gemm_goi_w.
// NOTE(review): arch_flags presumably restricts the run to CPUs reporting both
// NEON dot-product and FP16 arithmetic support - confirm in GEMMBenchmark.
static void qd8_f16_qc2w_gemm_minmax_ukernel_5x16c4__neondotfp16arith(benchmark::State& state) {
GEMMBenchmark(state,
xnn_qd8_f16_qc2w_gemm_minmax_ukernel_5x16c4__neondotfp16arith,
xnn_init_f16_minmax_scalar_params,
xnn_pack_qd8_qc2w_gemm_goi_w,
/*mr=*/5, /*nr=*/16, /*kr=*/4, /*sr=*/1,
/*arch_flags=*/xnn_arch_arm_neon_dot | xnn_arch_arm_neon_fp16_arith);
}

// Hands the function above to BENCHMARK_GEMM (presumably registering it with
// the benchmark framework - macro definition not visible here).
BENCHMARK_GEMM(qd8_f16_qc2w_gemm_minmax_ukernel_5x16c4__neondotfp16arith)

// Benchmarks xnn_qd8_f16_qc2w_gemm_minmax_ukernel_6x16c4__neondotfp16arith by
// forwarding it to GEMMBenchmark with mr=6, nr=16, kr=4, sr=1, together with
// xnn_init_f16_minmax_scalar_params and xnn_pack_qd8_qc2w_gemm_goi_w.
// NOTE(review): arch_flags presumably restricts the run to CPUs reporting both
// NEON dot-product and FP16 arithmetic support - confirm in GEMMBenchmark.
static void qd8_f16_qc2w_gemm_minmax_ukernel_6x16c4__neondotfp16arith(benchmark::State& state) {
GEMMBenchmark(state,
xnn_qd8_f16_qc2w_gemm_minmax_ukernel_6x16c4__neondotfp16arith,
xnn_init_f16_minmax_scalar_params,
xnn_pack_qd8_qc2w_gemm_goi_w,
/*mr=*/6, /*nr=*/16, /*kr=*/4, /*sr=*/1,
/*arch_flags=*/xnn_arch_arm_neon_dot | xnn_arch_arm_neon_fp16_arith);
}

// Hands the function above to BENCHMARK_GEMM (presumably registering it with
// the benchmark framework - macro definition not visible here).
BENCHMARK_GEMM(qd8_f16_qc2w_gemm_minmax_ukernel_6x16c4__neondotfp16arith)
#endif // XNN_ENABLE_ARM_DOTPROD && XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)


} // namespace

#ifndef XNNPACK_BENCHMARK_NO_MAIN
Expand Down
12 changes: 12 additions & 0 deletions cmake/gen/neondotfp16arith_microkernels.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@
SET(PROD_NEONDOTFP16ARITH_MICROKERNEL_SRCS
src/qd8-f16-qb4w-gemm/gen/qd8-f16-qb4w-gemm-1x16c4-minmax-neondotfp16arith.c
src/qd8-f16-qb4w-gemm/gen/qd8-f16-qb4w-gemm-4x16c4-minmax-neondotfp16arith.c
src/qd8-f16-qc2w-gemm/gen/qd8-f16-qc2w-gemm-1x8c4-minmax-neondotfp16arith.c
src/qd8-f16-qc2w-gemm/gen/qd8-f16-qc2w-gemm-2x8c4-minmax-neondotfp16arith.c
src/qd8-f16-qc2w-gemm/gen/qd8-f16-qc2w-gemm-6x8c4-minmax-neondotfp16arith.c
src/qd8-f16-qc4w-gemm/gen/qd8-f16-qc4w-gemm-1x16c4-minmax-neondotfp16arith.c
src/qd8-f16-qc4w-gemm/gen/qd8-f16-qc4w-gemm-4x16c4-minmax-neondotfp16arith.c
src/qd8-f16-qc8w-gemm/gen/qd8-f16-qc8w-gemm-1x8c4-minmax-neondotfp16arith.c
Expand All @@ -34,6 +37,15 @@ SET(NON_PROD_NEONDOTFP16ARITH_MICROKERNEL_SRCS
src/qd8-f16-qb4w-gemm/gen/qd8-f16-qb4w-gemm-5x16c4-minmax-neondotfp16arith.c
src/qd8-f16-qb4w-gemm/gen/qd8-f16-qb4w-gemm-6x8c4-minmax-neondotfp16arith.c
src/qd8-f16-qb4w-gemm/gen/qd8-f16-qb4w-gemm-6x16c4-minmax-neondotfp16arith.c
src/qd8-f16-qc2w-gemm/gen/qd8-f16-qc2w-gemm-1x16c4-minmax-neondotfp16arith.c
src/qd8-f16-qc2w-gemm/gen/qd8-f16-qc2w-gemm-2x16c4-minmax-neondotfp16arith.c
src/qd8-f16-qc2w-gemm/gen/qd8-f16-qc2w-gemm-3x8c4-minmax-neondotfp16arith.c
src/qd8-f16-qc2w-gemm/gen/qd8-f16-qc2w-gemm-3x16c4-minmax-neondotfp16arith.c
src/qd8-f16-qc2w-gemm/gen/qd8-f16-qc2w-gemm-4x8c4-minmax-neondotfp16arith.c
src/qd8-f16-qc2w-gemm/gen/qd8-f16-qc2w-gemm-4x16c4-minmax-neondotfp16arith.c
src/qd8-f16-qc2w-gemm/gen/qd8-f16-qc2w-gemm-5x8c4-minmax-neondotfp16arith.c
src/qd8-f16-qc2w-gemm/gen/qd8-f16-qc2w-gemm-5x16c4-minmax-neondotfp16arith.c
src/qd8-f16-qc2w-gemm/gen/qd8-f16-qc2w-gemm-6x16c4-minmax-neondotfp16arith.c
src/qd8-f16-qc4w-gemm/gen/qd8-f16-qc4w-gemm-1x8c4-minmax-neondotfp16arith.c
src/qd8-f16-qc4w-gemm/gen/qd8-f16-qc4w-gemm-2x8c4-minmax-neondotfp16arith.c
src/qd8-f16-qc4w-gemm/gen/qd8-f16-qc4w-gemm-2x16c4-minmax-neondotfp16arith.c
Expand Down
12 changes: 12 additions & 0 deletions gen/neondotfp16arith_microkernels.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@
PROD_NEONDOTFP16ARITH_MICROKERNEL_SRCS = [
"src/qd8-f16-qb4w-gemm/gen/qd8-f16-qb4w-gemm-1x16c4-minmax-neondotfp16arith.c",
"src/qd8-f16-qb4w-gemm/gen/qd8-f16-qb4w-gemm-4x16c4-minmax-neondotfp16arith.c",
"src/qd8-f16-qc2w-gemm/gen/qd8-f16-qc2w-gemm-1x8c4-minmax-neondotfp16arith.c",
"src/qd8-f16-qc2w-gemm/gen/qd8-f16-qc2w-gemm-2x8c4-minmax-neondotfp16arith.c",
"src/qd8-f16-qc2w-gemm/gen/qd8-f16-qc2w-gemm-6x8c4-minmax-neondotfp16arith.c",
"src/qd8-f16-qc4w-gemm/gen/qd8-f16-qc4w-gemm-1x16c4-minmax-neondotfp16arith.c",
"src/qd8-f16-qc4w-gemm/gen/qd8-f16-qc4w-gemm-4x16c4-minmax-neondotfp16arith.c",
"src/qd8-f16-qc8w-gemm/gen/qd8-f16-qc8w-gemm-1x8c4-minmax-neondotfp16arith.c",
Expand All @@ -31,6 +34,15 @@ NON_PROD_NEONDOTFP16ARITH_MICROKERNEL_SRCS = [
"src/qd8-f16-qb4w-gemm/gen/qd8-f16-qb4w-gemm-5x16c4-minmax-neondotfp16arith.c",
"src/qd8-f16-qb4w-gemm/gen/qd8-f16-qb4w-gemm-6x8c4-minmax-neondotfp16arith.c",
"src/qd8-f16-qb4w-gemm/gen/qd8-f16-qb4w-gemm-6x16c4-minmax-neondotfp16arith.c",
"src/qd8-f16-qc2w-gemm/gen/qd8-f16-qc2w-gemm-1x16c4-minmax-neondotfp16arith.c",
"src/qd8-f16-qc2w-gemm/gen/qd8-f16-qc2w-gemm-2x16c4-minmax-neondotfp16arith.c",
"src/qd8-f16-qc2w-gemm/gen/qd8-f16-qc2w-gemm-3x8c4-minmax-neondotfp16arith.c",
"src/qd8-f16-qc2w-gemm/gen/qd8-f16-qc2w-gemm-3x16c4-minmax-neondotfp16arith.c",
"src/qd8-f16-qc2w-gemm/gen/qd8-f16-qc2w-gemm-4x8c4-minmax-neondotfp16arith.c",
"src/qd8-f16-qc2w-gemm/gen/qd8-f16-qc2w-gemm-4x16c4-minmax-neondotfp16arith.c",
"src/qd8-f16-qc2w-gemm/gen/qd8-f16-qc2w-gemm-5x8c4-minmax-neondotfp16arith.c",
"src/qd8-f16-qc2w-gemm/gen/qd8-f16-qc2w-gemm-5x16c4-minmax-neondotfp16arith.c",
"src/qd8-f16-qc2w-gemm/gen/qd8-f16-qc2w-gemm-6x16c4-minmax-neondotfp16arith.c",
"src/qd8-f16-qc4w-gemm/gen/qd8-f16-qc4w-gemm-1x8c4-minmax-neondotfp16arith.c",
"src/qd8-f16-qc4w-gemm/gen/qd8-f16-qc4w-gemm-2x8c4-minmax-neondotfp16arith.c",
"src/qd8-f16-qc4w-gemm/gen/qd8-f16-qc4w-gemm-2x16c4-minmax-neondotfp16arith.c",
Expand Down
16 changes: 16 additions & 0 deletions scripts/generate-qs8-gemm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -610,6 +610,8 @@ tools/xngen src/qs8-gemm/c8-neon-mull.c.in -D MR=1 -D NR=8 -D MLA=1 -D REQUANTI
tools/xngen src/qs8-gemm/c8-neon-mull.c.in -D MR=2 -D NR=8 -D MLA=1 -D REQUANTIZATION=FP32 -D DATATYPE=QC8 -D ARMV8=1 -o src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x8c8-minmax-fp32-neonv8-mlal.c &

### C4 micro-kernels
tools/xngen src/qs8-gemm/c4-neondot.c.in -D MR=1 -D NR=8 -D REQUANTIZATION= -D DATATYPE=QD8_BF16 -o src/qd8-bf16-qc8w-gemm/gen/qd8-bf16-qc8w-gemm-1x8c4-minmax-neondotbf16.c &

tools/xngen src/qs8-gemm/c4-neondot.c.in -D MR=1 -D NR=8 -D REQUANTIZATION= -D DATATYPE=QD8_F16 -o src/qd8-f16-qc8w-gemm/gen/qd8-f16-qc8w-gemm-1x8c4-minmax-neondotfp16arith.c &
tools/xngen src/qs8-gemm/c4-neondot.c.in -D MR=2 -D NR=8 -D REQUANTIZATION= -D DATATYPE=QD8_F16 -o src/qd8-f16-qc8w-gemm/gen/qd8-f16-qc8w-gemm-2x8c4-minmax-neondotfp16arith.c &
tools/xngen src/qs8-gemm/c4-neondot.c.in -D MR=3 -D NR=8 -D REQUANTIZATION= -D DATATYPE=QD8_F16 -o src/qd8-f16-qc8w-gemm/gen/qd8-f16-qc8w-gemm-3x8c4-minmax-neondotfp16arith.c &
Expand Down Expand Up @@ -694,6 +696,20 @@ tools/xngen src/qs8-gemm/c4-neondot.c.in -D MR=4 -D NR=16 -D REQUANTIZATION= -D
tools/xngen src/qs8-gemm/c4-neondot.c.in -D MR=5 -D NR=16 -D REQUANTIZATION= -D DATATYPE=QC4_F16 -o src/qd8-f16-qc4w-gemm/gen/qd8-f16-qc4w-gemm-5x16c4-minmax-neondotfp16arith.c &
tools/xngen src/qs8-gemm/c4-neondot.c.in -D MR=6 -D NR=16 -D REQUANTIZATION= -D DATATYPE=QC4_F16 -o src/qd8-f16-qc4w-gemm/gen/qd8-f16-qc4w-gemm-6x16c4-minmax-neondotfp16arith.c &

# DATATYPE=QC2_F16, NR=8: runs the c4-neondot template for MR=1..6 and writes
# the qd8-f16-qc2w neondotfp16arith sources. Each xngen run is backgrounded (&).
tools/xngen src/qs8-gemm/c4-neondot.c.in -D MR=1 -D NR=8 -D REQUANTIZATION= -D DATATYPE=QC2_F16 -o src/qd8-f16-qc2w-gemm/gen/qd8-f16-qc2w-gemm-1x8c4-minmax-neondotfp16arith.c &
tools/xngen src/qs8-gemm/c4-neondot.c.in -D MR=2 -D NR=8 -D REQUANTIZATION= -D DATATYPE=QC2_F16 -o src/qd8-f16-qc2w-gemm/gen/qd8-f16-qc2w-gemm-2x8c4-minmax-neondotfp16arith.c &
tools/xngen src/qs8-gemm/c4-neondot.c.in -D MR=3 -D NR=8 -D REQUANTIZATION= -D DATATYPE=QC2_F16 -o src/qd8-f16-qc2w-gemm/gen/qd8-f16-qc2w-gemm-3x8c4-minmax-neondotfp16arith.c &
tools/xngen src/qs8-gemm/c4-neondot.c.in -D MR=4 -D NR=8 -D REQUANTIZATION= -D DATATYPE=QC2_F16 -o src/qd8-f16-qc2w-gemm/gen/qd8-f16-qc2w-gemm-4x8c4-minmax-neondotfp16arith.c &
tools/xngen src/qs8-gemm/c4-neondot.c.in -D MR=5 -D NR=8 -D REQUANTIZATION= -D DATATYPE=QC2_F16 -o src/qd8-f16-qc2w-gemm/gen/qd8-f16-qc2w-gemm-5x8c4-minmax-neondotfp16arith.c &
tools/xngen src/qs8-gemm/c4-neondot.c.in -D MR=6 -D NR=8 -D REQUANTIZATION= -D DATATYPE=QC2_F16 -o src/qd8-f16-qc2w-gemm/gen/qd8-f16-qc2w-gemm-6x8c4-minmax-neondotfp16arith.c &

# DATATYPE=QC2_F16, NR=16: same template and MR range as the NR=8 group.
tools/xngen src/qs8-gemm/c4-neondot.c.in -D MR=1 -D NR=16 -D REQUANTIZATION= -D DATATYPE=QC2_F16 -o src/qd8-f16-qc2w-gemm/gen/qd8-f16-qc2w-gemm-1x16c4-minmax-neondotfp16arith.c &
tools/xngen src/qs8-gemm/c4-neondot.c.in -D MR=2 -D NR=16 -D REQUANTIZATION= -D DATATYPE=QC2_F16 -o src/qd8-f16-qc2w-gemm/gen/qd8-f16-qc2w-gemm-2x16c4-minmax-neondotfp16arith.c &
tools/xngen src/qs8-gemm/c4-neondot.c.in -D MR=3 -D NR=16 -D REQUANTIZATION= -D DATATYPE=QC2_F16 -o src/qd8-f16-qc2w-gemm/gen/qd8-f16-qc2w-gemm-3x16c4-minmax-neondotfp16arith.c &
tools/xngen src/qs8-gemm/c4-neondot.c.in -D MR=4 -D NR=16 -D REQUANTIZATION= -D DATATYPE=QC2_F16 -o src/qd8-f16-qc2w-gemm/gen/qd8-f16-qc2w-gemm-4x16c4-minmax-neondotfp16arith.c &
tools/xngen src/qs8-gemm/c4-neondot.c.in -D MR=5 -D NR=16 -D REQUANTIZATION= -D DATATYPE=QC2_F16 -o src/qd8-f16-qc2w-gemm/gen/qd8-f16-qc2w-gemm-5x16c4-minmax-neondotfp16arith.c &
tools/xngen src/qs8-gemm/c4-neondot.c.in -D MR=6 -D NR=16 -D REQUANTIZATION= -D DATATYPE=QC2_F16 -o src/qd8-f16-qc2w-gemm/gen/qd8-f16-qc2w-gemm-6x16c4-minmax-neondotfp16arith.c &

tools/xngen src/qs8-gemm/c4-neondot.c.in -D MR=1 -D NR=8 -D REQUANTIZATION= -D DATATYPE=QB4_F16 -o src/qd8-f16-qb4w-gemm/gen/qd8-f16-qb4w-gemm-1x8c4-minmax-neondotfp16arith.c &
tools/xngen src/qs8-gemm/c4-neondot.c.in -D MR=2 -D NR=8 -D REQUANTIZATION= -D DATATYPE=QB4_F16 -o src/qd8-f16-qb4w-gemm/gen/qd8-f16-qb4w-gemm-2x8c4-minmax-neondotfp16arith.c &
tools/xngen src/qs8-gemm/c4-neondot.c.in -D MR=3 -D NR=8 -D REQUANTIZATION= -D DATATYPE=QB4_F16 -o src/qd8-f16-qb4w-gemm/gen/qd8-f16-qb4w-gemm-3x8c4-minmax-neondotfp16arith.c &
Expand Down
49 changes: 40 additions & 9 deletions src/configs/gemm-config.c
Original file line number Diff line number Diff line change
Expand Up @@ -2319,10 +2319,36 @@ static void init_qd8_f16_qc2w_gemm_config(void) {
qd8_f16_qc2w_gemm_config.pack_gemm_goi =
(xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qd8_qc2w_gemm_goi_w; // Ignored

qd8_f16_qc2w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] =
XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc2w_gemm_minmax_ukernel_1x2__scalar);
qd8_f16_qc2w_gemm_config.mr = 1;
qd8_f16_qc2w_gemm_config.nr = 2;
#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
  // ARM build with FP16 vector kernels compiled in: pick the micro-kernels at
  // run time from the CPU features reported by the hardware config.
  const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
  assert(hardware_config != NULL);
  (void) hardware_config;  // May be unused.
#if XNN_ENABLE_ARM_DOTPROD
  // The *__neondotfp16arith kernels are benchmarked with both the NEON
  // dot-product and the NEON FP16-arithmetic arch flags. The build-time
  // XNN_ENABLE_ARM_FP16_VECTOR switch says nothing about the CPU we run on,
  // so both flags must be verified here; testing xnn_arch_arm_neon_dot alone
  // would select these kernels on a CPU that lacks FP16 arithmetic.
  if ((hardware_config->arch_flags & xnn_arch_arm_neon_dot) &&
      (hardware_config->arch_flags & xnn_arch_arm_neon_fp16_arith)) {
    qd8_f16_qc2w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] =
        XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc2w_gemm_minmax_ukernel_1x8c4__neondotfp16arith);
#if XNN_ARCH_ARM64
    // AArch64 uses the 6-row kernel.
    qd8_f16_qc2w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(6)] =
        XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc2w_gemm_minmax_ukernel_6x8c4__neondotfp16arith);
    qd8_f16_qc2w_gemm_config.mr = 6;
#else
    // 32-bit ARM uses the 2-row kernel.
    qd8_f16_qc2w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(2)] =
        XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc2w_gemm_minmax_ukernel_2x8c4__neondotfp16arith);
    qd8_f16_qc2w_gemm_config.mr = 2;
#endif
    qd8_f16_qc2w_gemm_config.nr = 8;
    qd8_f16_qc2w_gemm_config.log2_kr = 2;  // kr = 4, the "c4" in the kernel names.
  } else
#endif  // XNN_ENABLE_ARM_DOTPROD
  {
    // Either required CPU feature is missing: fall back to the scalar kernel.
    qd8_f16_qc2w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] =
        XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc2w_gemm_minmax_ukernel_1x2__scalar);
    qd8_f16_qc2w_gemm_config.mr = 1;
    qd8_f16_qc2w_gemm_config.nr = 2;
  }
#else
  // Non-ARM build, or FP16 vector kernels not compiled in: scalar kernel only.
  qd8_f16_qc2w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] =
      XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f16_qc2w_gemm_minmax_ukernel_1x2__scalar);
  qd8_f16_qc2w_gemm_config.mr = 1;
  qd8_f16_qc2w_gemm_config.nr = 2;
#endif

assert(qd8_f16_qc2w_gemm_config.mr <= XNN_MAX_MR);
assert(qd8_f16_qc2w_gemm_config.mr <= (XNN_EXTRA_QUANTIZATION_PARAMS + 1));
Expand Down Expand Up @@ -2354,7 +2380,7 @@ static void init_qdu8_f16_qc2w_gemm_config(void) {
qdu8_f16_qc2w_gemm_config.nr = 8;
qdu8_f16_qc2w_gemm_config.log2_kr = 3;
qdu8_f16_qc2w_gemm_config.planes = 4;
}
} else
#endif
#if XNN_ENABLE_AVX2
if (hardware_config->arch_flags & xnn_arch_x86_avx2) {
Expand All @@ -2370,8 +2396,10 @@ static void init_qdu8_f16_qc2w_gemm_config(void) {
qdu8_f16_qc2w_gemm_config.nr = 8;
qdu8_f16_qc2w_gemm_config.log2_kr = 3;
qdu8_f16_qc2w_gemm_config.planes = 4;
}
} else
#endif
{
}
#endif //XNN_ARCH_X86 || XNN_ARCH_X86_64
assert(qdu8_f16_qc2w_gemm_config.mr <= XNN_MAX_MR);
assert(qdu8_f16_qc2w_gemm_config.mr <= (XNN_EXTRA_QUANTIZATION_PARAMS + 1));
Expand Down Expand Up @@ -2482,11 +2510,11 @@ static void init_qd8_f32_qc2w_gemm_config(void) {
qd8_f32_qc2w_gemm_config.pack_gemm_goi =
(xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qd8_qc2w_gemm_goi_w; // Ignored
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ENABLE_ARM_DOTPROD
const struct xnn_hardware_config* hardware_config =
xnn_init_hardware_config();
assert(hardware_config != NULL);
(void) hardware_config; // May be unused.
#if XNN_ENABLE_ARM_DOTPROD
if (hardware_config->arch_flags & xnn_arch_arm_neon_dot) {
qd8_f32_qc2w_gemm_config.arch = xnn_arch_arm_neon_dot;
qd8_f32_qc2w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] =
Expand All @@ -2496,6 +2524,7 @@ static void init_qd8_f32_qc2w_gemm_config(void) {
XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc2w_gemm_minmax_ukernel_8x8c4__neondot);
qd8_f32_qc2w_gemm_config.mr = 8;
#else
// TODO: fix sdot lane in clang
qd8_f32_qc2w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(2)] =
XNN_INIT_HMP_DQGEMM_UKERNEL(xnn_qd8_f32_qc2w_gemm_minmax_ukernel_2x8c4__neondot);
qd8_f32_qc2w_gemm_config.mr = 2;
Expand Down Expand Up @@ -2546,7 +2575,7 @@ static void init_qdu8_f32_qc2w_gemm_config(void) {
qdu8_f32_qc2w_gemm_config.nr = 8;
qdu8_f32_qc2w_gemm_config.log2_kr = 3;
qdu8_f32_qc2w_gemm_config.planes = 4;
}
} else
#endif
#if XNN_ENABLE_AVX2
if (hardware_config->arch_flags & xnn_arch_x86_avx2) {
Expand All @@ -2562,8 +2591,10 @@ static void init_qdu8_f32_qc2w_gemm_config(void) {
qdu8_f32_qc2w_gemm_config.nr = 8;
qdu8_f32_qc2w_gemm_config.log2_kr = 3;
qdu8_f32_qc2w_gemm_config.planes = 4;
}
} else
#endif
{
}
#endif //XNN_ARCH_X86 || XNN_ARCH_X86_64
assert(qdu8_f32_qc2w_gemm_config.mr <= XNN_MAX_MR);
assert(qdu8_f32_qc2w_gemm_config.mr <= (XNN_EXTRA_QUANTIZATION_PARAMS + 1));
Expand Down
Loading
Loading