Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
16 changes: 8 additions & 8 deletions bench/qd8-f32-qc4w-gemm.cc
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ namespace {
xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x4v__rvv,
xnn_init_f32_qc4w_minmax_scalar_params,
xnn_pack_qs8_qc4w_gemm_goi_w,
/*mr=*/1, /*nr=*/4 * xnn_init_hardware_config()->vlenb / sizeof(int32_t), /*kr=*/1, /*sr=*/1,
/*mr=*/1, /*nr=*/4 * xnn_init_hardware_config()->vlenb / sizeof(float), /*kr=*/1, /*sr=*/1,
/*arch_flags=*/xnn_arch_riscv_vector);
}

Expand All @@ -43,7 +43,7 @@ namespace {
xnn_qd8_f32_qc4w_gemm_minmax_ukernel_2x4v__rvv,
xnn_init_f32_qc4w_minmax_scalar_params,
xnn_pack_qs8_qc4w_gemm_goi_w,
/*mr=*/2, /*nr=*/4 * xnn_init_hardware_config()->vlenb / sizeof(int32_t), /*kr=*/1, /*sr=*/1,
/*mr=*/2, /*nr=*/4 * xnn_init_hardware_config()->vlenb / sizeof(float), /*kr=*/1, /*sr=*/1,
/*arch_flags=*/xnn_arch_riscv_vector);
}

Expand All @@ -54,7 +54,7 @@ namespace {
xnn_qd8_f32_qc4w_gemm_minmax_ukernel_3x4v__rvv,
xnn_init_f32_qc4w_minmax_scalar_params,
xnn_pack_qs8_qc4w_gemm_goi_w,
/*mr=*/3, /*nr=*/4 * xnn_init_hardware_config()->vlenb / sizeof(int32_t), /*kr=*/1, /*sr=*/1,
/*mr=*/3, /*nr=*/4 * xnn_init_hardware_config()->vlenb / sizeof(float), /*kr=*/1, /*sr=*/1,
/*arch_flags=*/xnn_arch_riscv_vector);
}

Expand All @@ -65,7 +65,7 @@ namespace {
xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x4v__rvv,
xnn_init_f32_qc4w_minmax_scalar_params,
xnn_pack_qs8_qc4w_gemm_goi_w,
/*mr=*/4, /*nr=*/4 * xnn_init_hardware_config()->vlenb / sizeof(int32_t), /*kr=*/1, /*sr=*/1,
/*mr=*/4, /*nr=*/4 * xnn_init_hardware_config()->vlenb / sizeof(float), /*kr=*/1, /*sr=*/1,
/*arch_flags=*/xnn_arch_riscv_vector);
}

Expand All @@ -76,7 +76,7 @@ namespace {
xnn_qd8_f32_qc4w_gemm_minmax_ukernel_5x4v__rvv,
xnn_init_f32_qc4w_minmax_scalar_params,
xnn_pack_qs8_qc4w_gemm_goi_w,
/*mr=*/5, /*nr=*/4 * xnn_init_hardware_config()->vlenb / sizeof(int32_t), /*kr=*/1, /*sr=*/1,
/*mr=*/5, /*nr=*/4 * xnn_init_hardware_config()->vlenb / sizeof(float), /*kr=*/1, /*sr=*/1,
/*arch_flags=*/xnn_arch_riscv_vector);
}

Expand All @@ -87,7 +87,7 @@ namespace {
xnn_qd8_f32_qc4w_gemm_minmax_ukernel_6x4v__rvv,
xnn_init_f32_qc4w_minmax_scalar_params,
xnn_pack_qs8_qc4w_gemm_goi_w,
/*mr=*/6, /*nr=*/4 * xnn_init_hardware_config()->vlenb / sizeof(int32_t), /*kr=*/1, /*sr=*/1,
/*mr=*/6, /*nr=*/4 * xnn_init_hardware_config()->vlenb / sizeof(float), /*kr=*/1, /*sr=*/1,
/*arch_flags=*/xnn_arch_riscv_vector);
}

Expand All @@ -98,7 +98,7 @@ namespace {
xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x4v__rvv,
xnn_init_f32_qc4w_minmax_scalar_params,
xnn_pack_qs8_qc4w_gemm_goi_w,
/*mr=*/7, /*nr=*/4 * xnn_init_hardware_config()->vlenb / sizeof(int32_t), /*kr=*/1, /*sr=*/1,
/*mr=*/7, /*nr=*/4 * xnn_init_hardware_config()->vlenb / sizeof(float), /*kr=*/1, /*sr=*/1,
/*arch_flags=*/xnn_arch_riscv_vector);
}

Expand All @@ -109,7 +109,7 @@ namespace {
xnn_qd8_f32_qc4w_gemm_minmax_ukernel_8x4v__rvv,
xnn_init_f32_qc4w_minmax_scalar_params,
xnn_pack_qs8_qc4w_gemm_goi_w,
/*mr=*/8, /*nr=*/4 * xnn_init_hardware_config()->vlenb / sizeof(int32_t), /*kr=*/1, /*sr=*/1,
/*mr=*/8, /*nr=*/4 * xnn_init_hardware_config()->vlenb / sizeof(float), /*kr=*/1, /*sr=*/1,
/*arch_flags=*/xnn_arch_riscv_vector);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,13 @@ void xnn_qd8_f16_qc4w_gemm_minmax_ukernel_1x2v__rvvfp16arith(
c0 = (xnn_float16*) ((uintptr_t) c0 + cn_stride);

a0 = (const int8_t*) ((uintptr_t) a0 - kc);

vint8m1_t vout80 = __riscv_vncvt_x(vout0, vl);

__riscv_vse8(c0, vout80, vl);
c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);

a0 = (const int8_t*) ((uintptr_t) a0 - kc);

} while (nc != 0);
}
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,25 @@ void xnn_qd8_f16_qc4w_gemm_minmax_ukernel_4x2v__rvvfp16arith(
a1 = (const int8_t*) ((uintptr_t) a1 - kc);
a2 = (const int8_t*) ((uintptr_t) a2 - kc);
a3 = (const int8_t*) ((uintptr_t) a3 - kc);

vint8m1_t vout80 = __riscv_vncvt_x(vout0, vl);
vint8m1_t vout81 = __riscv_vncvt_x(vout1, vl);
vint8m1_t vout82 = __riscv_vncvt_x(vout2, vl);
vint8m1_t vout83 = __riscv_vncvt_x(vout3, vl);

__riscv_vse8(c0, vout80, vl);
c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
__riscv_vse8(c1, vout81, vl);
c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
__riscv_vse8(c2, vout82, vl);
c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
__riscv_vse8(c3, vout83, vl);
c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);

a0 = (const int8_t*) ((uintptr_t) a0 - kc);
a1 = (const int8_t*) ((uintptr_t) a1 - kc);
a2 = (const int8_t*) ((uintptr_t) a2 - kc);
a3 = (const int8_t*) ((uintptr_t) a3 - kc);

} while (nc != 0);
}
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,37 @@ void xnn_qd8_f16_qc4w_gemm_minmax_ukernel_7x2v__rvvfp16arith(
a4 = (const int8_t*) ((uintptr_t) a4 - kc);
a5 = (const int8_t*) ((uintptr_t) a5 - kc);
a6 = (const int8_t*) ((uintptr_t) a6 - kc);

vint8m1_t vout80 = __riscv_vncvt_x(vout0, vl);
vint8m1_t vout81 = __riscv_vncvt_x(vout1, vl);
vint8m1_t vout82 = __riscv_vncvt_x(vout2, vl);
vint8m1_t vout83 = __riscv_vncvt_x(vout3, vl);
vint8m1_t vout84 = __riscv_vncvt_x(vout4, vl);
vint8m1_t vout85 = __riscv_vncvt_x(vout5, vl);
vint8m1_t vout86 = __riscv_vncvt_x(vout6, vl);

__riscv_vse8(c0, vout80, vl);
c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
__riscv_vse8(c1, vout81, vl);
c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
__riscv_vse8(c2, vout82, vl);
c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
__riscv_vse8(c3, vout83, vl);
c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
__riscv_vse8(c4, vout84, vl);
c4 = (int8_t*) ((uintptr_t) c4 + cn_stride);
__riscv_vse8(c5, vout85, vl);
c5 = (int8_t*) ((uintptr_t) c5 + cn_stride);
__riscv_vse8(c6, vout86, vl);
c6 = (int8_t*) ((uintptr_t) c6 + cn_stride);

a0 = (const int8_t*) ((uintptr_t) a0 - kc);
a1 = (const int8_t*) ((uintptr_t) a1 - kc);
a2 = (const int8_t*) ((uintptr_t) a2 - kc);
a3 = (const int8_t*) ((uintptr_t) a3 - kc);
a4 = (const int8_t*) ((uintptr_t) a4 - kc);
a5 = (const int8_t*) ((uintptr_t) a5 - kc);
a6 = (const int8_t*) ((uintptr_t) a6 - kc);

} while (nc != 0);
}
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,13 @@ void xnn_qd8_f16_qc8w_gemm_minmax_ukernel_1x2v__rvvfp16arith(
c0 = (xnn_float16*) ((uintptr_t) c0 + cn_stride);

a0 = (const int8_t*) ((uintptr_t) a0 - kc);

vint8m1_t vout80 = __riscv_vncvt_x(vout0, vl);

__riscv_vse8(c0, vout80, vl);
c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);

a0 = (const int8_t*) ((uintptr_t) a0 - kc);

} while (nc != 0);
}
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,25 @@ void xnn_qd8_f16_qc8w_gemm_minmax_ukernel_4x2v__rvvfp16arith(
a1 = (const int8_t*) ((uintptr_t) a1 - kc);
a2 = (const int8_t*) ((uintptr_t) a2 - kc);
a3 = (const int8_t*) ((uintptr_t) a3 - kc);

vint8m1_t vout80 = __riscv_vncvt_x(vout0, vl);
vint8m1_t vout81 = __riscv_vncvt_x(vout1, vl);
vint8m1_t vout82 = __riscv_vncvt_x(vout2, vl);
vint8m1_t vout83 = __riscv_vncvt_x(vout3, vl);

__riscv_vse8(c0, vout80, vl);
c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
__riscv_vse8(c1, vout81, vl);
c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
__riscv_vse8(c2, vout82, vl);
c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
__riscv_vse8(c3, vout83, vl);
c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);

a0 = (const int8_t*) ((uintptr_t) a0 - kc);
a1 = (const int8_t*) ((uintptr_t) a1 - kc);
a2 = (const int8_t*) ((uintptr_t) a2 - kc);
a3 = (const int8_t*) ((uintptr_t) a3 - kc);

} while (nc != 0);
}
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,37 @@ void xnn_qd8_f16_qc8w_gemm_minmax_ukernel_7x2v__rvvfp16arith(
a4 = (const int8_t*) ((uintptr_t) a4 - kc);
a5 = (const int8_t*) ((uintptr_t) a5 - kc);
a6 = (const int8_t*) ((uintptr_t) a6 - kc);

vint8m1_t vout80 = __riscv_vncvt_x(vout0, vl);
vint8m1_t vout81 = __riscv_vncvt_x(vout1, vl);
vint8m1_t vout82 = __riscv_vncvt_x(vout2, vl);
vint8m1_t vout83 = __riscv_vncvt_x(vout3, vl);
vint8m1_t vout84 = __riscv_vncvt_x(vout4, vl);
vint8m1_t vout85 = __riscv_vncvt_x(vout5, vl);
vint8m1_t vout86 = __riscv_vncvt_x(vout6, vl);

__riscv_vse8(c0, vout80, vl);
c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
__riscv_vse8(c1, vout81, vl);
c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
__riscv_vse8(c2, vout82, vl);
c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
__riscv_vse8(c3, vout83, vl);
c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
__riscv_vse8(c4, vout84, vl);
c4 = (int8_t*) ((uintptr_t) c4 + cn_stride);
__riscv_vse8(c5, vout85, vl);
c5 = (int8_t*) ((uintptr_t) c5 + cn_stride);
__riscv_vse8(c6, vout86, vl);
c6 = (int8_t*) ((uintptr_t) c6 + cn_stride);

a0 = (const int8_t*) ((uintptr_t) a0 - kc);
a1 = (const int8_t*) ((uintptr_t) a1 - kc);
a2 = (const int8_t*) ((uintptr_t) a2 - kc);
a3 = (const int8_t*) ((uintptr_t) a3 - kc);
a4 = (const int8_t*) ((uintptr_t) a4 - kc);
a5 = (const int8_t*) ((uintptr_t) a5 - kc);
a6 = (const int8_t*) ((uintptr_t) a6 - kc);

} while (nc != 0);
}
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_10x16c8__avx512vnni_prfm(
// XNN_FORCE_REALIZATION(voutput_min);
// XNN_FORCE_REALIZATION(voutput_max);
const __m512i vmask = _mm512_set1_epi8(0xF0);
XNN_FORCE_REALIZATION(vmask);
// XNN_FORCE_REALIZATION(vmask);
do {
const __m512 vksum0123456789ABCDEF = _mm512_loadu_ps(w);
__m512 vscaled0x0123456789ABCDEF = _mm512_mul_ps(vksum0123456789ABCDEF, vinput_zero_point0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_10x16c8__avx512vnni(
// XNN_FORCE_REALIZATION(voutput_min);
// XNN_FORCE_REALIZATION(voutput_max);
const __m512i vmask = _mm512_set1_epi8(0xF0);
XNN_FORCE_REALIZATION(vmask);
// XNN_FORCE_REALIZATION(vmask);
do {
const __m512 vksum0123456789ABCDEF = _mm512_loadu_ps(w);
__m512 vscaled0x0123456789ABCDEF = _mm512_mul_ps(vksum0123456789ABCDEF, vinput_zero_point0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -123,9 +123,9 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_10x16c8__avx512vnnigfni_prfm(
// XNN_FORCE_REALIZATION(voutput_min);
// XNN_FORCE_REALIZATION(voutput_max);
const __m512i vmask = _mm512_set1_epi8(0xF0);
XNN_FORCE_REALIZATION(vmask);
// XNN_FORCE_REALIZATION(vmask);
const __m512i vshl4 = _mm512_set1_epi64(0x01020408);
XNN_FORCE_REALIZATION(vshl4);
// XNN_FORCE_REALIZATION(vshl4);
do {
const __m512 vksum0123456789ABCDEF = _mm512_loadu_ps(w);
__m512 vscaled0x0123456789ABCDEF = _mm512_mul_ps(vksum0123456789ABCDEF, vinput_zero_point0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -122,9 +122,9 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_10x16c8__avx512vnnigfni(
// XNN_FORCE_REALIZATION(voutput_min);
// XNN_FORCE_REALIZATION(voutput_max);
const __m512i vmask = _mm512_set1_epi8(0xF0);
XNN_FORCE_REALIZATION(vmask);
// XNN_FORCE_REALIZATION(vmask);
const __m512i vshl4 = _mm512_set1_epi64(0x01020408);
XNN_FORCE_REALIZATION(vshl4);
// XNN_FORCE_REALIZATION(vshl4);
do {
const __m512 vksum0123456789ABCDEF = _mm512_loadu_ps(w);
__m512 vscaled0x0123456789ABCDEF = _mm512_mul_ps(vksum0123456789ABCDEF, vinput_zero_point0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_12x16c8__avx512vnni_prfm(
// XNN_FORCE_REALIZATION(voutput_min);
// XNN_FORCE_REALIZATION(voutput_max);
const __m512i vmask = _mm512_set1_epi8(0xF0);
XNN_FORCE_REALIZATION(vmask);
// XNN_FORCE_REALIZATION(vmask);
do {
const __m512 vksum0123456789ABCDEF = _mm512_loadu_ps(w);
__m512 vscaled0x0123456789ABCDEF = _mm512_mul_ps(vksum0123456789ABCDEF, vinput_zero_point0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_12x16c8__avx512vnni(
// XNN_FORCE_REALIZATION(voutput_min);
// XNN_FORCE_REALIZATION(voutput_max);
const __m512i vmask = _mm512_set1_epi8(0xF0);
XNN_FORCE_REALIZATION(vmask);
// XNN_FORCE_REALIZATION(vmask);
do {
const __m512 vksum0123456789ABCDEF = _mm512_loadu_ps(w);
__m512 vscaled0x0123456789ABCDEF = _mm512_mul_ps(vksum0123456789ABCDEF, vinput_zero_point0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -137,9 +137,9 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_12x16c8__avx512vnnigfni_prfm(
// XNN_FORCE_REALIZATION(voutput_min);
// XNN_FORCE_REALIZATION(voutput_max);
const __m512i vmask = _mm512_set1_epi8(0xF0);
XNN_FORCE_REALIZATION(vmask);
// XNN_FORCE_REALIZATION(vmask);
const __m512i vshl4 = _mm512_set1_epi64(0x01020408);
XNN_FORCE_REALIZATION(vshl4);
// XNN_FORCE_REALIZATION(vshl4);
do {
const __m512 vksum0123456789ABCDEF = _mm512_loadu_ps(w);
__m512 vscaled0x0123456789ABCDEF = _mm512_mul_ps(vksum0123456789ABCDEF, vinput_zero_point0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -136,9 +136,9 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_12x16c8__avx512vnnigfni(
// XNN_FORCE_REALIZATION(voutput_min);
// XNN_FORCE_REALIZATION(voutput_max);
const __m512i vmask = _mm512_set1_epi8(0xF0);
XNN_FORCE_REALIZATION(vmask);
// XNN_FORCE_REALIZATION(vmask);
const __m512i vshl4 = _mm512_set1_epi64(0x01020408);
XNN_FORCE_REALIZATION(vshl4);
// XNN_FORCE_REALIZATION(vshl4);
do {
const __m512 vksum0123456789ABCDEF = _mm512_loadu_ps(w);
__m512 vscaled0x0123456789ABCDEF = _mm512_mul_ps(vksum0123456789ABCDEF, vinput_zero_point0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_14x16c8__avx512vnni_prfm(
// XNN_FORCE_REALIZATION(voutput_min);
// XNN_FORCE_REALIZATION(voutput_max);
const __m512i vmask = _mm512_set1_epi8(0xF0);
XNN_FORCE_REALIZATION(vmask);
// XNN_FORCE_REALIZATION(vmask);
do {
const __m512 vksum0123456789ABCDEF = _mm512_loadu_ps(w);
__m512 vscaled0x0123456789ABCDEF = _mm512_mul_ps(vksum0123456789ABCDEF, vinput_zero_point0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_14x16c8__avx512vnni(
// XNN_FORCE_REALIZATION(voutput_min);
// XNN_FORCE_REALIZATION(voutput_max);
const __m512i vmask = _mm512_set1_epi8(0xF0);
XNN_FORCE_REALIZATION(vmask);
// XNN_FORCE_REALIZATION(vmask);
do {
const __m512 vksum0123456789ABCDEF = _mm512_loadu_ps(w);
__m512 vscaled0x0123456789ABCDEF = _mm512_mul_ps(vksum0123456789ABCDEF, vinput_zero_point0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -151,9 +151,9 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_14x16c8__avx512vnnigfni_prfm(
// XNN_FORCE_REALIZATION(voutput_min);
// XNN_FORCE_REALIZATION(voutput_max);
const __m512i vmask = _mm512_set1_epi8(0xF0);
XNN_FORCE_REALIZATION(vmask);
// XNN_FORCE_REALIZATION(vmask);
const __m512i vshl4 = _mm512_set1_epi64(0x01020408);
XNN_FORCE_REALIZATION(vshl4);
// XNN_FORCE_REALIZATION(vshl4);
do {
const __m512 vksum0123456789ABCDEF = _mm512_loadu_ps(w);
__m512 vscaled0x0123456789ABCDEF = _mm512_mul_ps(vksum0123456789ABCDEF, vinput_zero_point0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -150,9 +150,9 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_14x16c8__avx512vnnigfni(
// XNN_FORCE_REALIZATION(voutput_min);
// XNN_FORCE_REALIZATION(voutput_max);
const __m512i vmask = _mm512_set1_epi8(0xF0);
XNN_FORCE_REALIZATION(vmask);
// XNN_FORCE_REALIZATION(vmask);
const __m512i vshl4 = _mm512_set1_epi64(0x01020408);
XNN_FORCE_REALIZATION(vshl4);
// XNN_FORCE_REALIZATION(vshl4);
do {
const __m512 vksum0123456789ABCDEF = _mm512_loadu_ps(w);
__m512 vscaled0x0123456789ABCDEF = _mm512_mul_ps(vksum0123456789ABCDEF, vinput_zero_point0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_1x16c8__avx512vnni_prfm(
// XNN_FORCE_REALIZATION(voutput_min);
// XNN_FORCE_REALIZATION(voutput_max);
const __m512i vmask = _mm512_set1_epi8(0xF0);
XNN_FORCE_REALIZATION(vmask);
// XNN_FORCE_REALIZATION(vmask);
do {
const __m512 vksum0123456789ABCDEF = _mm512_loadu_ps(w);
__m512 vscaled0x0123456789ABCDEF = _mm512_mul_ps(vksum0123456789ABCDEF, vinput_zero_point0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_1x16c8__avx512vnni(
// XNN_FORCE_REALIZATION(voutput_min);
// XNN_FORCE_REALIZATION(voutput_max);
const __m512i vmask = _mm512_set1_epi8(0xF0);
XNN_FORCE_REALIZATION(vmask);
// XNN_FORCE_REALIZATION(vmask);
do {
const __m512 vksum0123456789ABCDEF = _mm512_loadu_ps(w);
__m512 vscaled0x0123456789ABCDEF = _mm512_mul_ps(vksum0123456789ABCDEF, vinput_zero_point0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,9 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_1x16c8__avx512vnnigfni_prfm(
// XNN_FORCE_REALIZATION(voutput_min);
// XNN_FORCE_REALIZATION(voutput_max);
const __m512i vmask = _mm512_set1_epi8(0xF0);
XNN_FORCE_REALIZATION(vmask);
// XNN_FORCE_REALIZATION(vmask);
const __m512i vshl4 = _mm512_set1_epi64(0x01020408);
XNN_FORCE_REALIZATION(vshl4);
// XNN_FORCE_REALIZATION(vshl4);
do {
const __m512 vksum0123456789ABCDEF = _mm512_loadu_ps(w);
__m512 vscaled0x0123456789ABCDEF = _mm512_mul_ps(vksum0123456789ABCDEF, vinput_zero_point0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,9 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_1x16c8__avx512vnnigfni(
// XNN_FORCE_REALIZATION(voutput_min);
// XNN_FORCE_REALIZATION(voutput_max);
const __m512i vmask = _mm512_set1_epi8(0xF0);
XNN_FORCE_REALIZATION(vmask);
// XNN_FORCE_REALIZATION(vmask);
const __m512i vshl4 = _mm512_set1_epi64(0x01020408);
XNN_FORCE_REALIZATION(vshl4);
// XNN_FORCE_REALIZATION(vshl4);
do {
const __m512 vksum0123456789ABCDEF = _mm512_loadu_ps(w);
__m512 vscaled0x0123456789ABCDEF = _mm512_mul_ps(vksum0123456789ABCDEF, vinput_zero_point0);
Expand Down
Loading
Loading