diff --git a/bench/qd8-f32-qc4w-gemm.cc b/bench/qd8-f32-qc4w-gemm.cc index 7e67679d9ff..5fc1b73a801 100644 --- a/bench/qd8-f32-qc4w-gemm.cc +++ b/bench/qd8-f32-qc4w-gemm.cc @@ -32,7 +32,7 @@ namespace { xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x4v__rvv, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w, - /*mr=*/1, /*nr=*/4 * xnn_init_hardware_config()->vlenb / sizeof(int32_t), /*kr=*/1, /*sr=*/1, + /*mr=*/1, /*nr=*/4 * xnn_init_hardware_config()->vlenb / sizeof(float), /*kr=*/1, /*sr=*/1, /*arch_flags=*/xnn_arch_riscv_vector); } @@ -43,7 +43,7 @@ namespace { xnn_qd8_f32_qc4w_gemm_minmax_ukernel_2x4v__rvv, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w, - /*mr=*/2, /*nr=*/4 * xnn_init_hardware_config()->vlenb / sizeof(int32_t), /*kr=*/1, /*sr=*/1, + /*mr=*/2, /*nr=*/4 * xnn_init_hardware_config()->vlenb / sizeof(float), /*kr=*/1, /*sr=*/1, /*arch_flags=*/xnn_arch_riscv_vector); } @@ -54,7 +54,7 @@ namespace { xnn_qd8_f32_qc4w_gemm_minmax_ukernel_3x4v__rvv, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w, - /*mr=*/3, /*nr=*/4 * xnn_init_hardware_config()->vlenb / sizeof(int32_t), /*kr=*/1, /*sr=*/1, + /*mr=*/3, /*nr=*/4 * xnn_init_hardware_config()->vlenb / sizeof(float), /*kr=*/1, /*sr=*/1, /*arch_flags=*/xnn_arch_riscv_vector); } @@ -65,7 +65,7 @@ namespace { xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x4v__rvv, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w, - /*mr=*/4, /*nr=*/4 * xnn_init_hardware_config()->vlenb / sizeof(int32_t), /*kr=*/1, /*sr=*/1, + /*mr=*/4, /*nr=*/4 * xnn_init_hardware_config()->vlenb / sizeof(float), /*kr=*/1, /*sr=*/1, /*arch_flags=*/xnn_arch_riscv_vector); } @@ -76,7 +76,7 @@ namespace { xnn_qd8_f32_qc4w_gemm_minmax_ukernel_5x4v__rvv, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w, - /*mr=*/5, /*nr=*/4 * xnn_init_hardware_config()->vlenb / sizeof(int32_t), /*kr=*/1, /*sr=*/1, + /*mr=*/5, /*nr=*/4 * xnn_init_hardware_config()->vlenb / sizeof(float), /*kr=*/1, /*sr=*/1, /*arch_flags=*/xnn_arch_riscv_vector); } @@ -87,7 +87,7 @@ namespace { xnn_qd8_f32_qc4w_gemm_minmax_ukernel_6x4v__rvv, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w, - /*mr=*/6, /*nr=*/4 * xnn_init_hardware_config()->vlenb / sizeof(int32_t), /*kr=*/1, /*sr=*/1, + /*mr=*/6, /*nr=*/4 * xnn_init_hardware_config()->vlenb / sizeof(float), /*kr=*/1, /*sr=*/1, /*arch_flags=*/xnn_arch_riscv_vector); } @@ -98,7 +98,7 @@ namespace { xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x4v__rvv, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w, - /*mr=*/7, /*nr=*/4 * xnn_init_hardware_config()->vlenb / sizeof(int32_t), /*kr=*/1, /*sr=*/1, + /*mr=*/7, /*nr=*/4 * xnn_init_hardware_config()->vlenb / sizeof(float), /*kr=*/1, /*sr=*/1, /*arch_flags=*/xnn_arch_riscv_vector); } @@ -109,7 +109,7 @@ namespace { xnn_qd8_f32_qc4w_gemm_minmax_ukernel_8x4v__rvv, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w, - /*mr=*/8, /*nr=*/4 * xnn_init_hardware_config()->vlenb / sizeof(int32_t), /*kr=*/1, /*sr=*/1, + /*mr=*/8, /*nr=*/4 * xnn_init_hardware_config()->vlenb / sizeof(float), /*kr=*/1, /*sr=*/1, /*arch_flags=*/xnn_arch_riscv_vector); } diff --git a/src/qd8-f16-qc4w-gemm/gen/qd8-f16-qc4w-gemm-1x2v-minmax-rvvfp16arith.c b/src/qd8-f16-qc4w-gemm/gen/qd8-f16-qc4w-gemm-1x2v-minmax-rvvfp16arith.c index 428a08f3419..013257fadde 100644 --- a/src/qd8-f16-qc4w-gemm/gen/qd8-f16-qc4w-gemm-1x2v-minmax-rvvfp16arith.c +++ b/src/qd8-f16-qc4w-gemm/gen/qd8-f16-qc4w-gemm-1x2v-minmax-rvvfp16arith.c @@ -96,6 +96,13 @@ void xnn_qd8_f16_qc4w_gemm_minmax_ukernel_1x2v__rvvfp16arith( c0 = (xnn_float16*) ((uintptr_t) c0 + cn_stride); a0 = (const int8_t*) ((uintptr_t) a0 - kc); + + vint8m1_t vout80 = __riscv_vncvt_x(vout0, vl); + + __riscv_vse8(c0, vout80, vl); + c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); + + a0 = (const int8_t*) ((uintptr_t) a0 - kc); } while (nc != 0); } diff --git a/src/qd8-f16-qc4w-gemm/gen/qd8-f16-qc4w-gemm-4x2v-minmax-rvvfp16arith.c b/src/qd8-f16-qc4w-gemm/gen/qd8-f16-qc4w-gemm-4x2v-minmax-rvvfp16arith.c index 699a7849b40..48382a9a8f3 100644 --- a/src/qd8-f16-qc4w-gemm/gen/qd8-f16-qc4w-gemm-4x2v-minmax-rvvfp16arith.c +++ b/src/qd8-f16-qc4w-gemm/gen/qd8-f16-qc4w-gemm-4x2v-minmax-rvvfp16arith.c @@ -177,6 +177,25 @@ void xnn_qd8_f16_qc4w_gemm_minmax_ukernel_4x2v__rvvfp16arith( a1 = (const int8_t*) ((uintptr_t) a1 - kc); a2 = (const int8_t*) ((uintptr_t) a2 - kc); a3 = (const int8_t*) ((uintptr_t) a3 - kc); + + vint8m1_t vout80 = __riscv_vncvt_x(vout0, vl); + vint8m1_t vout81 = __riscv_vncvt_x(vout1, vl); + vint8m1_t vout82 = __riscv_vncvt_x(vout2, vl); + vint8m1_t vout83 = __riscv_vncvt_x(vout3, vl); + + __riscv_vse8(c0, vout80, vl); + c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); + __riscv_vse8(c1, vout81, vl); + c1 = (int8_t*) ((uintptr_t) c1 + cn_stride); + __riscv_vse8(c2, vout82, vl); + c2 = (int8_t*) ((uintptr_t) c2 + cn_stride); + __riscv_vse8(c3, vout83, vl); + c3 = (int8_t*) ((uintptr_t) c3 + cn_stride); + + a0 = (const int8_t*) ((uintptr_t) a0 - kc); + a1 = (const int8_t*) ((uintptr_t) a1 - kc); + a2 = (const int8_t*) ((uintptr_t) a2 - kc); + a3 = (const int8_t*) ((uintptr_t) a3 - kc); } while (nc != 0); } diff --git a/src/qd8-f16-qc4w-gemm/gen/qd8-f16-qc4w-gemm-7x2v-minmax-rvvfp16arith.c b/src/qd8-f16-qc4w-gemm/gen/qd8-f16-qc4w-gemm-7x2v-minmax-rvvfp16arith.c index 1edc78b572d..da9ef53e4c0 100644 --- a/src/qd8-f16-qc4w-gemm/gen/qd8-f16-qc4w-gemm-7x2v-minmax-rvvfp16arith.c +++ b/src/qd8-f16-qc4w-gemm/gen/qd8-f16-qc4w-gemm-7x2v-minmax-rvvfp16arith.c @@ -258,6 +258,37 @@ void xnn_qd8_f16_qc4w_gemm_minmax_ukernel_7x2v__rvvfp16arith( a4 = (const int8_t*) ((uintptr_t) a4 - kc); a5 = (const int8_t*) ((uintptr_t) a5 - kc); a6 = (const int8_t*) ((uintptr_t) a6 - kc); + + vint8m1_t vout80 = __riscv_vncvt_x(vout0, vl); + vint8m1_t vout81 = __riscv_vncvt_x(vout1, vl); + vint8m1_t vout82 = __riscv_vncvt_x(vout2, vl); + vint8m1_t vout83 = __riscv_vncvt_x(vout3, vl); + vint8m1_t vout84 = __riscv_vncvt_x(vout4, vl); + vint8m1_t vout85 = __riscv_vncvt_x(vout5, vl); + vint8m1_t vout86 = __riscv_vncvt_x(vout6, vl); + + __riscv_vse8(c0, vout80, vl); + c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); + __riscv_vse8(c1, vout81, vl); + c1 = (int8_t*) ((uintptr_t) c1 + cn_stride); + __riscv_vse8(c2, vout82, vl); + c2 = (int8_t*) ((uintptr_t) c2 + cn_stride); + __riscv_vse8(c3, vout83, vl); + c3 = (int8_t*) ((uintptr_t) c3 + cn_stride); + __riscv_vse8(c4, vout84, vl); + c4 = (int8_t*) ((uintptr_t) c4 + cn_stride); + __riscv_vse8(c5, vout85, vl); + c5 = (int8_t*) ((uintptr_t) c5 + cn_stride); + __riscv_vse8(c6, vout86, vl); + c6 = (int8_t*) ((uintptr_t) c6 + cn_stride); + + a0 = (const int8_t*) ((uintptr_t) a0 - kc); + a1 = (const int8_t*) ((uintptr_t) a1 - kc); + a2 = (const int8_t*) ((uintptr_t) a2 - kc); + a3 = (const int8_t*) ((uintptr_t) a3 - kc); + a4 = (const int8_t*) ((uintptr_t) a4 - kc); + a5 = (const int8_t*) ((uintptr_t) a5 - kc); + a6 = (const int8_t*) ((uintptr_t) a6 - kc); } while (nc != 0); } diff --git a/src/qd8-f16-qc8w-gemm/gen/qd8-f16-qc8w-gemm-1x2v-minmax-rvvfp16arith.c b/src/qd8-f16-qc8w-gemm/gen/qd8-f16-qc8w-gemm-1x2v-minmax-rvvfp16arith.c index 5670b9dc5ff..bbef3371299 100644 --- a/src/qd8-f16-qc8w-gemm/gen/qd8-f16-qc8w-gemm-1x2v-minmax-rvvfp16arith.c +++ b/src/qd8-f16-qc8w-gemm/gen/qd8-f16-qc8w-gemm-1x2v-minmax-rvvfp16arith.c @@ -93,6 +93,13 @@ void xnn_qd8_f16_qc8w_gemm_minmax_ukernel_1x2v__rvvfp16arith( c0 = (xnn_float16*) ((uintptr_t) c0 + cn_stride); a0 = (const int8_t*) ((uintptr_t) a0 - kc); + + vint8m1_t vout80 = __riscv_vncvt_x(vout0, vl); + + __riscv_vse8(c0, vout80, vl); + c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); + + a0 = (const int8_t*) ((uintptr_t) a0 - kc); } while (nc != 0); } diff --git a/src/qd8-f16-qc8w-gemm/gen/qd8-f16-qc8w-gemm-4x2v-minmax-rvvfp16arith.c b/src/qd8-f16-qc8w-gemm/gen/qd8-f16-qc8w-gemm-4x2v-minmax-rvvfp16arith.c index 449db4b3ad7..c08565a8a5f 100644 --- a/src/qd8-f16-qc8w-gemm/gen/qd8-f16-qc8w-gemm-4x2v-minmax-rvvfp16arith.c +++ b/src/qd8-f16-qc8w-gemm/gen/qd8-f16-qc8w-gemm-4x2v-minmax-rvvfp16arith.c @@ -156,6 +156,25 @@ void xnn_qd8_f16_qc8w_gemm_minmax_ukernel_4x2v__rvvfp16arith( a1 = (const int8_t*) ((uintptr_t) a1 - kc); a2 = (const int8_t*) ((uintptr_t) a2 - kc); a3 = (const int8_t*) ((uintptr_t) a3 - kc); + + vint8m1_t vout80 = __riscv_vncvt_x(vout0, vl); + vint8m1_t vout81 = __riscv_vncvt_x(vout1, vl); + vint8m1_t vout82 = __riscv_vncvt_x(vout2, vl); + vint8m1_t vout83 = __riscv_vncvt_x(vout3, vl); + + __riscv_vse8(c0, vout80, vl); + c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); + __riscv_vse8(c1, vout81, vl); + c1 = (int8_t*) ((uintptr_t) c1 + cn_stride); + __riscv_vse8(c2, vout82, vl); + c2 = (int8_t*) ((uintptr_t) c2 + cn_stride); + __riscv_vse8(c3, vout83, vl); + c3 = (int8_t*) ((uintptr_t) c3 + cn_stride); + + a0 = (const int8_t*) ((uintptr_t) a0 - kc); + a1 = (const int8_t*) ((uintptr_t) a1 - kc); + a2 = (const int8_t*) ((uintptr_t) a2 - kc); + a3 = (const int8_t*) ((uintptr_t) a3 - kc); } while (nc != 0); } diff --git a/src/qd8-f16-qc8w-gemm/gen/qd8-f16-qc8w-gemm-7x2v-minmax-rvvfp16arith.c b/src/qd8-f16-qc8w-gemm/gen/qd8-f16-qc8w-gemm-7x2v-minmax-rvvfp16arith.c index 2f24deddecd..15624a3e765 100644 --- a/src/qd8-f16-qc8w-gemm/gen/qd8-f16-qc8w-gemm-7x2v-minmax-rvvfp16arith.c +++ b/src/qd8-f16-qc8w-gemm/gen/qd8-f16-qc8w-gemm-7x2v-minmax-rvvfp16arith.c @@ -219,6 +219,37 @@ void xnn_qd8_f16_qc8w_gemm_minmax_ukernel_7x2v__rvvfp16arith( a4 = (const int8_t*) ((uintptr_t) a4 - kc); a5 = (const int8_t*) ((uintptr_t) a5 - kc); a6 = (const int8_t*) ((uintptr_t) a6 - kc); + + vint8m1_t vout80 = __riscv_vncvt_x(vout0, vl); + vint8m1_t vout81 = __riscv_vncvt_x(vout1, vl); + vint8m1_t vout82 = __riscv_vncvt_x(vout2, vl); + vint8m1_t vout83 = __riscv_vncvt_x(vout3, vl); + vint8m1_t vout84 = __riscv_vncvt_x(vout4, vl); + vint8m1_t vout85 = __riscv_vncvt_x(vout5, vl); + vint8m1_t vout86 = __riscv_vncvt_x(vout6, vl); + + __riscv_vse8(c0, vout80, vl); + c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); + __riscv_vse8(c1, vout81, vl); + c1 = (int8_t*) ((uintptr_t) c1 + cn_stride); + __riscv_vse8(c2, vout82, vl); + c2 = (int8_t*) ((uintptr_t) c2 + cn_stride); + __riscv_vse8(c3, vout83, vl); + c3 = (int8_t*) ((uintptr_t) c3 + cn_stride); + __riscv_vse8(c4, vout84, vl); + c4 = (int8_t*) ((uintptr_t) c4 + cn_stride); + __riscv_vse8(c5, vout85, vl); + c5 = (int8_t*) ((uintptr_t) c5 + cn_stride); + __riscv_vse8(c6, vout86, vl); + c6 = (int8_t*) ((uintptr_t) c6 + cn_stride); + + a0 = (const int8_t*) ((uintptr_t) a0 - kc); + a1 = (const int8_t*) ((uintptr_t) a1 - kc); + a2 = (const int8_t*) ((uintptr_t) a2 - kc); + a3 = (const int8_t*) ((uintptr_t) a3 - kc); + a4 = (const int8_t*) ((uintptr_t) a4 - kc); + a5 = (const int8_t*) ((uintptr_t) a5 - kc); + a6 = (const int8_t*) ((uintptr_t) a6 - kc); } while (nc != 0); } diff --git a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-10x16c8-minmax-avx512vnni-prfm.c b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-10x16c8-minmax-avx512vnni-prfm.c index e4bd263bba5..da867e61c36 100644 --- a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-10x16c8-minmax-avx512vnni-prfm.c +++ b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-10x16c8-minmax-avx512vnni-prfm.c @@ -123,7 +123,7 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_10x16c8__avx512vnni_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512 vksum0123456789ABCDEF = _mm512_loadu_ps(w); __m512 vscaled0x0123456789ABCDEF = _mm512_mul_ps(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-10x16c8-minmax-avx512vnni.c b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-10x16c8-minmax-avx512vnni.c index 9bdc35862a0..066ad2982f0 100644 --- a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-10x16c8-minmax-avx512vnni.c +++ b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-10x16c8-minmax-avx512vnni.c @@ -122,7 +122,7 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_10x16c8__avx512vnni( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512 vksum0123456789ABCDEF = _mm512_loadu_ps(w); __m512 vscaled0x0123456789ABCDEF = _mm512_mul_ps(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-10x16c8-minmax-avx512vnnigfni-prfm.c b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-10x16c8-minmax-avx512vnnigfni-prfm.c index d9ca2b5915b..926582fff67 100644 --- a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-10x16c8-minmax-avx512vnnigfni-prfm.c +++ b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-10x16c8-minmax-avx512vnnigfni-prfm.c @@ -123,9 +123,9 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_10x16c8__avx512vnnigfni_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { const __m512 vksum0123456789ABCDEF = _mm512_loadu_ps(w); __m512 vscaled0x0123456789ABCDEF = _mm512_mul_ps(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-10x16c8-minmax-avx512vnnigfni.c b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-10x16c8-minmax-avx512vnnigfni.c index 19e52f2dedc..1936676e490 100644 --- a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-10x16c8-minmax-avx512vnnigfni.c +++ b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-10x16c8-minmax-avx512vnnigfni.c @@ -122,9 +122,9 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_10x16c8__avx512vnnigfni( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { const __m512 vksum0123456789ABCDEF = _mm512_loadu_ps(w); __m512 vscaled0x0123456789ABCDEF = _mm512_mul_ps(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-12x16c8-minmax-avx512vnni-prfm.c b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-12x16c8-minmax-avx512vnni-prfm.c index db22722790a..4e11f2add72 100644 --- a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-12x16c8-minmax-avx512vnni-prfm.c +++ b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-12x16c8-minmax-avx512vnni-prfm.c @@ -137,7 +137,7 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_12x16c8__avx512vnni_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512 vksum0123456789ABCDEF = _mm512_loadu_ps(w); __m512 vscaled0x0123456789ABCDEF = _mm512_mul_ps(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-12x16c8-minmax-avx512vnni.c b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-12x16c8-minmax-avx512vnni.c index d49244a868e..d72f7bea501 100644 --- a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-12x16c8-minmax-avx512vnni.c +++ b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-12x16c8-minmax-avx512vnni.c @@ -136,7 +136,7 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_12x16c8__avx512vnni( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512 vksum0123456789ABCDEF = _mm512_loadu_ps(w); __m512 vscaled0x0123456789ABCDEF = _mm512_mul_ps(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-12x16c8-minmax-avx512vnnigfni-prfm.c b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-12x16c8-minmax-avx512vnnigfni-prfm.c index 7a9c206aae5..44d1150c591 100644 --- a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-12x16c8-minmax-avx512vnnigfni-prfm.c +++ b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-12x16c8-minmax-avx512vnnigfni-prfm.c @@ -137,9 +137,9 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_12x16c8__avx512vnnigfni_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { const __m512 vksum0123456789ABCDEF = _mm512_loadu_ps(w); __m512 vscaled0x0123456789ABCDEF = _mm512_mul_ps(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-12x16c8-minmax-avx512vnnigfni.c b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-12x16c8-minmax-avx512vnnigfni.c index 2c717bd4758..f01fe8db108 100644 --- a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-12x16c8-minmax-avx512vnnigfni.c +++ b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-12x16c8-minmax-avx512vnnigfni.c @@ -136,9 +136,9 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_12x16c8__avx512vnnigfni( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { const __m512 vksum0123456789ABCDEF = _mm512_loadu_ps(w); __m512 vscaled0x0123456789ABCDEF = _mm512_mul_ps(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-14x16c8-minmax-avx512vnni-prfm.c b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-14x16c8-minmax-avx512vnni-prfm.c index 1d653cebfd7..9c4f783627a 100644 --- a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-14x16c8-minmax-avx512vnni-prfm.c +++ b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-14x16c8-minmax-avx512vnni-prfm.c @@ -151,7 +151,7 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_14x16c8__avx512vnni_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512 vksum0123456789ABCDEF = _mm512_loadu_ps(w); __m512 vscaled0x0123456789ABCDEF = _mm512_mul_ps(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-14x16c8-minmax-avx512vnni.c b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-14x16c8-minmax-avx512vnni.c index 44971dc08c9..017602d6c80 100644 --- a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-14x16c8-minmax-avx512vnni.c +++ b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-14x16c8-minmax-avx512vnni.c @@ -150,7 +150,7 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_14x16c8__avx512vnni( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512 vksum0123456789ABCDEF = _mm512_loadu_ps(w); __m512 vscaled0x0123456789ABCDEF = _mm512_mul_ps(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-14x16c8-minmax-avx512vnnigfni-prfm.c b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-14x16c8-minmax-avx512vnnigfni-prfm.c index e92c6343478..03ec2e394f9 100644 --- a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-14x16c8-minmax-avx512vnnigfni-prfm.c +++ b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-14x16c8-minmax-avx512vnnigfni-prfm.c @@ -151,9 +151,9 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_14x16c8__avx512vnnigfni_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { const __m512 vksum0123456789ABCDEF = _mm512_loadu_ps(w); __m512 vscaled0x0123456789ABCDEF = _mm512_mul_ps(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-14x16c8-minmax-avx512vnnigfni.c b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-14x16c8-minmax-avx512vnnigfni.c index 0b5db015b02..dd28dbb8cd6 100644 --- a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-14x16c8-minmax-avx512vnnigfni.c +++ b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-14x16c8-minmax-avx512vnnigfni.c @@ -150,9 +150,9 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_14x16c8__avx512vnnigfni( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { const __m512 vksum0123456789ABCDEF = _mm512_loadu_ps(w); __m512 vscaled0x0123456789ABCDEF = _mm512_mul_ps(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-1x16c8-minmax-avx512vnni-prfm.c b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-1x16c8-minmax-avx512vnni-prfm.c index 8044915186a..d8b4eb43662 100644 --- a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-1x16c8-minmax-avx512vnni-prfm.c +++ b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-1x16c8-minmax-avx512vnni-prfm.c @@ -60,7 +60,7 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_1x16c8__avx512vnni_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512 vksum0123456789ABCDEF = _mm512_loadu_ps(w); __m512 vscaled0x0123456789ABCDEF = _mm512_mul_ps(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-1x16c8-minmax-avx512vnni.c b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-1x16c8-minmax-avx512vnni.c index 803098d620e..32bb7400980 100644 --- a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-1x16c8-minmax-avx512vnni.c +++ b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-1x16c8-minmax-avx512vnni.c @@ -59,7 +59,7 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_1x16c8__avx512vnni( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512 vksum0123456789ABCDEF = _mm512_loadu_ps(w); __m512 vscaled0x0123456789ABCDEF = _mm512_mul_ps(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-1x16c8-minmax-avx512vnnigfni-prfm.c b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-1x16c8-minmax-avx512vnnigfni-prfm.c index 1cb2045cb28..8a029df8c7d 100644 --- a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-1x16c8-minmax-avx512vnnigfni-prfm.c +++ b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-1x16c8-minmax-avx512vnnigfni-prfm.c @@ -60,9 +60,9 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_1x16c8__avx512vnnigfni_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { const __m512 vksum0123456789ABCDEF = _mm512_loadu_ps(w); __m512 vscaled0x0123456789ABCDEF = _mm512_mul_ps(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-1x16c8-minmax-avx512vnnigfni.c b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-1x16c8-minmax-avx512vnnigfni.c index e9af6af0605..b368cc8a9aa 100644 --- a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-1x16c8-minmax-avx512vnnigfni.c +++ b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-1x16c8-minmax-avx512vnnigfni.c @@ -59,9 +59,9 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_1x16c8__avx512vnnigfni( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { const __m512 vksum0123456789ABCDEF = _mm512_loadu_ps(w); __m512 vscaled0x0123456789ABCDEF = _mm512_mul_ps(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-5x16c8-minmax-avx512vnni-prfm.c b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-5x16c8-minmax-avx512vnni-prfm.c index 7a0286d7d63..e26fadd8f07 100644 --- a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-5x16c8-minmax-avx512vnni-prfm.c +++ b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-5x16c8-minmax-avx512vnni-prfm.c @@ -88,7 +88,7 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_5x16c8__avx512vnni_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512 vksum0123456789ABCDEF = _mm512_loadu_ps(w); __m512 vscaled0x0123456789ABCDEF = _mm512_mul_ps(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-5x16c8-minmax-avx512vnni.c b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-5x16c8-minmax-avx512vnni.c index c79b38e945e..1673aee8151 100644 --- a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-5x16c8-minmax-avx512vnni.c +++ b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-5x16c8-minmax-avx512vnni.c @@ -87,7 +87,7 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_5x16c8__avx512vnni( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512 vksum0123456789ABCDEF = _mm512_loadu_ps(w); __m512 vscaled0x0123456789ABCDEF = _mm512_mul_ps(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-5x16c8-minmax-avx512vnnigfni-prfm.c b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-5x16c8-minmax-avx512vnnigfni-prfm.c index 898cac737d8..158f7eb22f3 100644 --- a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-5x16c8-minmax-avx512vnnigfni-prfm.c +++ b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-5x16c8-minmax-avx512vnnigfni-prfm.c @@ -88,9 +88,9 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_5x16c8__avx512vnnigfni_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { const __m512 vksum0123456789ABCDEF = _mm512_loadu_ps(w); __m512 vscaled0x0123456789ABCDEF = _mm512_mul_ps(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-5x16c8-minmax-avx512vnnigfni.c b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-5x16c8-minmax-avx512vnnigfni.c index c6da2258178..3a529783240 100644 --- a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-5x16c8-minmax-avx512vnnigfni.c +++ b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-5x16c8-minmax-avx512vnnigfni.c @@ -87,9 +87,9 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_5x16c8__avx512vnnigfni( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { const __m512 vksum0123456789ABCDEF = _mm512_loadu_ps(w); __m512 vscaled0x0123456789ABCDEF = _mm512_mul_ps(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-7x16c8-minmax-avx512vnni-prfm.c b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-7x16c8-minmax-avx512vnni-prfm.c index 600cce0a948..15afc6f26a2 100644 --- a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-7x16c8-minmax-avx512vnni-prfm.c +++ b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-7x16c8-minmax-avx512vnni-prfm.c @@ -102,7 +102,7 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_7x16c8__avx512vnni_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512 vksum0123456789ABCDEF = _mm512_loadu_ps(w); __m512 vscaled0x0123456789ABCDEF = _mm512_mul_ps(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-7x16c8-minmax-avx512vnni.c b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-7x16c8-minmax-avx512vnni.c index 778ea8de6cf..65373a5d6ab 100644 --- a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-7x16c8-minmax-avx512vnni.c +++ b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-7x16c8-minmax-avx512vnni.c @@ -101,7 +101,7 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_7x16c8__avx512vnni( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512 vksum0123456789ABCDEF = _mm512_loadu_ps(w); __m512 vscaled0x0123456789ABCDEF = _mm512_mul_ps(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-7x16c8-minmax-avx512vnnigfni-prfm.c b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-7x16c8-minmax-avx512vnnigfni-prfm.c index c4523a9923d..b2ad2bb4f7b 100644 --- a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-7x16c8-minmax-avx512vnnigfni-prfm.c +++ b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-7x16c8-minmax-avx512vnnigfni-prfm.c @@ -102,9 +102,9 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_7x16c8__avx512vnnigfni_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { const __m512 vksum0123456789ABCDEF = _mm512_loadu_ps(w); __m512 vscaled0x0123456789ABCDEF = _mm512_mul_ps(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-7x16c8-minmax-avx512vnnigfni.c b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-7x16c8-minmax-avx512vnnigfni.c index 6b206a76492..e33ba1060bb 100644 --- a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-7x16c8-minmax-avx512vnnigfni.c +++ b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-7x16c8-minmax-avx512vnnigfni.c @@ -101,9 +101,9 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_7x16c8__avx512vnnigfni( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { const __m512 vksum0123456789ABCDEF = _mm512_loadu_ps(w); __m512 vscaled0x0123456789ABCDEF = _mm512_mul_ps(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-8x16c8-minmax-avx512vnni-prfm.c b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-8x16c8-minmax-avx512vnni-prfm.c index b8cc218dda9..84afdd5803a 100644 --- a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-8x16c8-minmax-avx512vnni-prfm.c +++ b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-8x16c8-minmax-avx512vnni-prfm.c @@ -109,7 +109,7 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_8x16c8__avx512vnni_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512 vksum0123456789ABCDEF = _mm512_loadu_ps(w); __m512 vscaled0x0123456789ABCDEF = _mm512_mul_ps(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-8x16c8-minmax-avx512vnni.c b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-8x16c8-minmax-avx512vnni.c index 985f3935446..664e183b008 100644 --- a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-8x16c8-minmax-avx512vnni.c +++ b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-8x16c8-minmax-avx512vnni.c @@ -108,7 +108,7 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_8x16c8__avx512vnni( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512 vksum0123456789ABCDEF = _mm512_loadu_ps(w); __m512 vscaled0x0123456789ABCDEF = _mm512_mul_ps(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-8x16c8-minmax-avx512vnnigfni-prfm.c b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-8x16c8-minmax-avx512vnnigfni-prfm.c index f258952802f..8ddd0d24112 100644 --- a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-8x16c8-minmax-avx512vnnigfni-prfm.c +++ b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-8x16c8-minmax-avx512vnnigfni-prfm.c @@ -109,9 +109,9 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_8x16c8__avx512vnnigfni_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { const __m512 vksum0123456789ABCDEF = _mm512_loadu_ps(w); __m512 vscaled0x0123456789ABCDEF = _mm512_mul_ps(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-8x16c8-minmax-avx512vnnigfni.c b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-8x16c8-minmax-avx512vnnigfni.c index 146f779e8bc..274427159fb 100644 --- a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-8x16c8-minmax-avx512vnnigfni.c +++ b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-8x16c8-minmax-avx512vnnigfni.c @@ -108,9 +108,9 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_8x16c8__avx512vnnigfni( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { const __m512 vksum0123456789ABCDEF = _mm512_loadu_ps(w); __m512 vscaled0x0123456789ABCDEF = _mm512_mul_ps(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-9x16c8-minmax-avx512vnni-prfm.c b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-9x16c8-minmax-avx512vnni-prfm.c index 87841dfb7cf..468c86536fb 100644 --- a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-9x16c8-minmax-avx512vnni-prfm.c +++ b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-9x16c8-minmax-avx512vnni-prfm.c @@ -116,7 +116,7 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_9x16c8__avx512vnni_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512 vksum0123456789ABCDEF = _mm512_loadu_ps(w); __m512 vscaled0x0123456789ABCDEF = _mm512_mul_ps(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-9x16c8-minmax-avx512vnni.c b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-9x16c8-minmax-avx512vnni.c index 67c47fdad17..5383027be7b 100644 --- a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-9x16c8-minmax-avx512vnni.c +++ b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-9x16c8-minmax-avx512vnni.c @@ -115,7 +115,7 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_9x16c8__avx512vnni( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512 vksum0123456789ABCDEF = _mm512_loadu_ps(w); __m512 vscaled0x0123456789ABCDEF = _mm512_mul_ps(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-9x16c8-minmax-avx512vnnigfni-prfm.c b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-9x16c8-minmax-avx512vnnigfni-prfm.c index 56523c534a7..c1f30697a6f 100644 --- a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-9x16c8-minmax-avx512vnnigfni-prfm.c +++ b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-9x16c8-minmax-avx512vnnigfni-prfm.c @@ -116,9 +116,9 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_9x16c8__avx512vnnigfni_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { const __m512 vksum0123456789ABCDEF = _mm512_loadu_ps(w); __m512 vscaled0x0123456789ABCDEF = _mm512_mul_ps(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-9x16c8-minmax-avx512vnnigfni.c b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-9x16c8-minmax-avx512vnnigfni.c index a3d832a8283..8f2a8516115 100644 --- a/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-9x16c8-minmax-avx512vnnigfni.c +++ b/src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-9x16c8-minmax-avx512vnnigfni.c @@ -115,9 +115,9 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_9x16c8__avx512vnnigfni( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { const __m512 vksum0123456789ABCDEF = _mm512_loadu_ps(w); __m512 vscaled0x0123456789ABCDEF = _mm512_mul_ps(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-10x16c4-minmax-avx512skx-madd-prfm.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-10x16c4-minmax-avx512skx-madd-prfm.c index 1b7722837ce..8751e2fc27e 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-10x16c4-minmax-avx512skx-madd-prfm.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-10x16c4-minmax-avx512skx-madd-prfm.c @@ -118,7 +118,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_10x16c4__avx512skx_madd_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vacc0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-10x16c4-minmax-avx512skx-madd.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-10x16c4-minmax-avx512skx-madd.c index da0f07737a1..70260727c65 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-10x16c4-minmax-avx512skx-madd.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-10x16c4-minmax-avx512skx-madd.c @@ -117,7 +117,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_10x16c4__avx512skx_madd( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vacc0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-10x16c4-minmax-avx512vnni-prfm.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-10x16c4-minmax-avx512vnni-prfm.c index 687cd3f778e..155716e07a4 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-10x16c4-minmax-avx512vnni-prfm.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-10x16c4-minmax-avx512vnni-prfm.c @@ -118,7 +118,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_10x16c4__avx512vnni_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vacc0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-10x16c4-minmax-avx512vnni.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-10x16c4-minmax-avx512vnni.c index 04fb78a578f..5d92dde2ae7 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-10x16c4-minmax-avx512vnni.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-10x16c4-minmax-avx512vnni.c @@ -117,7 +117,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_10x16c4__avx512vnni( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vacc0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-10x16c4-minmax-avx512vnnigfni-prfm.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-10x16c4-minmax-avx512vnnigfni-prfm.c index a26b09c0ef9..bba11dd9a4e 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-10x16c4-minmax-avx512vnnigfni-prfm.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-10x16c4-minmax-avx512vnnigfni-prfm.c @@ -118,9 +118,9 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_10x16c4__avx512vnnigfni_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vacc0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-10x16c4-minmax-avx512vnnigfni.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-10x16c4-minmax-avx512vnnigfni.c index a4b374bd222..72c4c5c2086 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-10x16c4-minmax-avx512vnnigfni.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-10x16c4-minmax-avx512vnnigfni.c @@ -117,9 +117,9 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_10x16c4__avx512vnnigfni( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vacc0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-10x16c8-minmax-avx512skx-madd-prfm.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-10x16c8-minmax-avx512skx-madd-prfm.c index a45680d73a5..64d45bb4b61 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-10x16c8-minmax-avx512skx-madd-prfm.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-10x16c8-minmax-avx512skx-madd-prfm.c @@ -118,7 +118,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_10x16c8__avx512skx_madd_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vsum0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-10x16c8-minmax-avx512skx-madd.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-10x16c8-minmax-avx512skx-madd.c index 6ddd6506ca3..698657b3d33 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-10x16c8-minmax-avx512skx-madd.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-10x16c8-minmax-avx512skx-madd.c @@ -117,7 +117,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_10x16c8__avx512skx_madd( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vsum0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-10x16c8-minmax-avx512vnni-prfm.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-10x16c8-minmax-avx512vnni-prfm.c index 4edb7b989a9..5884e4f9fb1 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-10x16c8-minmax-avx512vnni-prfm.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-10x16c8-minmax-avx512vnni-prfm.c @@ -118,7 +118,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_10x16c8__avx512vnni_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vsum0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-10x16c8-minmax-avx512vnni.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-10x16c8-minmax-avx512vnni.c index 3268fd2dc54..8dabf0b0847 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-10x16c8-minmax-avx512vnni.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-10x16c8-minmax-avx512vnni.c @@ -117,7 +117,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_10x16c8__avx512vnni( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vsum0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-10x16c8-minmax-avx512vnnigfni-prfm.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-10x16c8-minmax-avx512vnnigfni-prfm.c index 9017a0ad21f..df91a6e215d 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-10x16c8-minmax-avx512vnnigfni-prfm.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-10x16c8-minmax-avx512vnnigfni-prfm.c @@ -118,9 +118,9 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_10x16c8__avx512vnnigfni_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vsum0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-10x16c8-minmax-avx512vnnigfni.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-10x16c8-minmax-avx512vnnigfni.c index 70dc4c81d09..41c2fa3f7a2 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-10x16c8-minmax-avx512vnnigfni.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-10x16c8-minmax-avx512vnnigfni.c @@ -117,9 +117,9 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_10x16c8__avx512vnnigfni( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vsum0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-12x16c4-minmax-avx512skx-madd-prfm.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-12x16c4-minmax-avx512skx-madd-prfm.c index 0128fd8e5b4..52f78cd61b3 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-12x16c4-minmax-avx512skx-madd-prfm.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-12x16c4-minmax-avx512skx-madd-prfm.c @@ -132,7 +132,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_12x16c4__avx512skx_madd_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vacc0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-12x16c4-minmax-avx512skx-madd.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-12x16c4-minmax-avx512skx-madd.c index 451ed97f1c7..b80f9465ab2 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-12x16c4-minmax-avx512skx-madd.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-12x16c4-minmax-avx512skx-madd.c @@ -131,7 +131,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_12x16c4__avx512skx_madd( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vacc0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-12x16c4-minmax-avx512vnni-prfm.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-12x16c4-minmax-avx512vnni-prfm.c index 091c08af45f..4f29a5649e0 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-12x16c4-minmax-avx512vnni-prfm.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-12x16c4-minmax-avx512vnni-prfm.c @@ -132,7 +132,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_12x16c4__avx512vnni_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vacc0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-12x16c4-minmax-avx512vnni.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-12x16c4-minmax-avx512vnni.c index ca58f42dc45..303f67af781 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-12x16c4-minmax-avx512vnni.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-12x16c4-minmax-avx512vnni.c @@ -131,7 +131,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_12x16c4__avx512vnni( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vacc0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-12x16c4-minmax-avx512vnnigfni-prfm.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-12x16c4-minmax-avx512vnnigfni-prfm.c index 10d1963924b..2f44eb95c64 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-12x16c4-minmax-avx512vnnigfni-prfm.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-12x16c4-minmax-avx512vnnigfni-prfm.c @@ -132,9 +132,9 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_12x16c4__avx512vnnigfni_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vacc0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-12x16c4-minmax-avx512vnnigfni.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-12x16c4-minmax-avx512vnnigfni.c index dbf7adb5f76..6f82c19fa71 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-12x16c4-minmax-avx512vnnigfni.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-12x16c4-minmax-avx512vnnigfni.c @@ -131,9 +131,9 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_12x16c4__avx512vnnigfni( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vacc0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-12x16c8-minmax-avx512skx-madd-prfm.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-12x16c8-minmax-avx512skx-madd-prfm.c index 22a502a3628..83c4652b8f7 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-12x16c8-minmax-avx512skx-madd-prfm.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-12x16c8-minmax-avx512skx-madd-prfm.c @@ -132,7 +132,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_12x16c8__avx512skx_madd_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vsum0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-12x16c8-minmax-avx512skx-madd.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-12x16c8-minmax-avx512skx-madd.c index eefabfac6ba..7dd13156303 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-12x16c8-minmax-avx512skx-madd.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-12x16c8-minmax-avx512skx-madd.c @@ -131,7 +131,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_12x16c8__avx512skx_madd( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vsum0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-12x16c8-minmax-avx512vnni-prfm.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-12x16c8-minmax-avx512vnni-prfm.c index 36ea60d3667..1d3ba36d3d9 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-12x16c8-minmax-avx512vnni-prfm.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-12x16c8-minmax-avx512vnni-prfm.c @@ -132,7 +132,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_12x16c8__avx512vnni_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vsum0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-12x16c8-minmax-avx512vnni.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-12x16c8-minmax-avx512vnni.c index 32c0d106389..bbb6797b23d 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-12x16c8-minmax-avx512vnni.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-12x16c8-minmax-avx512vnni.c @@ -131,7 +131,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_12x16c8__avx512vnni( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vsum0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-12x16c8-minmax-avx512vnnigfni-prfm.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-12x16c8-minmax-avx512vnnigfni-prfm.c index 1313ad39a76..66a4d533631 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-12x16c8-minmax-avx512vnnigfni-prfm.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-12x16c8-minmax-avx512vnnigfni-prfm.c @@ -132,9 +132,9 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_12x16c8__avx512vnnigfni_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vsum0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-12x16c8-minmax-avx512vnnigfni.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-12x16c8-minmax-avx512vnnigfni.c index fff96049d7a..39dfb663059 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-12x16c8-minmax-avx512vnnigfni.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-12x16c8-minmax-avx512vnnigfni.c @@ -131,9 +131,9 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_12x16c8__avx512vnnigfni( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vsum0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-14x16c4-minmax-avx512skx-madd-prfm.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-14x16c4-minmax-avx512skx-madd-prfm.c index 1b5fc19fb43..05be773fb76 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-14x16c4-minmax-avx512skx-madd-prfm.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-14x16c4-minmax-avx512skx-madd-prfm.c @@ -146,7 +146,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_14x16c4__avx512skx_madd_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vacc0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-14x16c4-minmax-avx512skx-madd.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-14x16c4-minmax-avx512skx-madd.c index 38bca59fc0f..f89ea7b5fd9 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-14x16c4-minmax-avx512skx-madd.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-14x16c4-minmax-avx512skx-madd.c @@ -145,7 +145,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_14x16c4__avx512skx_madd( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vacc0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-14x16c4-minmax-avx512vnni-prfm.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-14x16c4-minmax-avx512vnni-prfm.c index ee1a53facb8..8df09a79733 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-14x16c4-minmax-avx512vnni-prfm.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-14x16c4-minmax-avx512vnni-prfm.c @@ -146,7 +146,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_14x16c4__avx512vnni_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vacc0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-14x16c4-minmax-avx512vnni.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-14x16c4-minmax-avx512vnni.c index 2e07c8551c6..dd0a2c7fd02 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-14x16c4-minmax-avx512vnni.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-14x16c4-minmax-avx512vnni.c @@ -145,7 +145,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_14x16c4__avx512vnni( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vacc0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-14x16c4-minmax-avx512vnnigfni-prfm.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-14x16c4-minmax-avx512vnnigfni-prfm.c index ed67df62d61..a4ee4c27fcb 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-14x16c4-minmax-avx512vnnigfni-prfm.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-14x16c4-minmax-avx512vnnigfni-prfm.c @@ -146,9 +146,9 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_14x16c4__avx512vnnigfni_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vacc0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-14x16c4-minmax-avx512vnnigfni.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-14x16c4-minmax-avx512vnnigfni.c index 5c4176e67bf..7c5de23bf0c 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-14x16c4-minmax-avx512vnnigfni.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-14x16c4-minmax-avx512vnnigfni.c @@ -145,9 +145,9 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_14x16c4__avx512vnnigfni( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vacc0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-14x16c8-minmax-avx512skx-madd-prfm.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-14x16c8-minmax-avx512skx-madd-prfm.c index 37922cf12cd..7d755229d85 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-14x16c8-minmax-avx512skx-madd-prfm.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-14x16c8-minmax-avx512skx-madd-prfm.c @@ -146,7 +146,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_14x16c8__avx512skx_madd_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vsum0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-14x16c8-minmax-avx512skx-madd.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-14x16c8-minmax-avx512skx-madd.c index 2eae37ba908..ef0344b2e68 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-14x16c8-minmax-avx512skx-madd.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-14x16c8-minmax-avx512skx-madd.c @@ -145,7 +145,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_14x16c8__avx512skx_madd( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vsum0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-14x16c8-minmax-avx512vnni-prfm.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-14x16c8-minmax-avx512vnni-prfm.c index c14b8bb4fff..ce20aec5e28 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-14x16c8-minmax-avx512vnni-prfm.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-14x16c8-minmax-avx512vnni-prfm.c @@ -146,7 +146,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_14x16c8__avx512vnni_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vsum0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-14x16c8-minmax-avx512vnni.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-14x16c8-minmax-avx512vnni.c index b58f6bd0d3a..33e4e123c87 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-14x16c8-minmax-avx512vnni.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-14x16c8-minmax-avx512vnni.c @@ -145,7 +145,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_14x16c8__avx512vnni( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vsum0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-14x16c8-minmax-avx512vnnigfni-prfm.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-14x16c8-minmax-avx512vnnigfni-prfm.c index 2ed98ff6425..ba31979a771 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-14x16c8-minmax-avx512vnnigfni-prfm.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-14x16c8-minmax-avx512vnnigfni-prfm.c @@ -146,9 +146,9 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_14x16c8__avx512vnnigfni_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vsum0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-14x16c8-minmax-avx512vnnigfni.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-14x16c8-minmax-avx512vnnigfni.c index e00194e531f..820fefde857 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-14x16c8-minmax-avx512vnnigfni.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-14x16c8-minmax-avx512vnnigfni.c @@ -145,9 +145,9 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_14x16c8__avx512vnnigfni( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vsum0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x16c4-minmax-avx512skx-madd-prfm.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x16c4-minmax-avx512skx-madd-prfm.c index a39c19bb9d4..e252c724183 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x16c4-minmax-avx512skx-madd-prfm.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x16c4-minmax-avx512skx-madd-prfm.c @@ -55,7 +55,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x16c4__avx512skx_madd_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vacc0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x16c4-minmax-avx512skx-madd.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x16c4-minmax-avx512skx-madd.c index 7adb900628c..7cc1a59a5fe 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x16c4-minmax-avx512skx-madd.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x16c4-minmax-avx512skx-madd.c @@ -54,7 +54,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x16c4__avx512skx_madd( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vacc0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x16c4-minmax-avx512vnni-prfm.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x16c4-minmax-avx512vnni-prfm.c index 48d8a088389..10471dcd62e 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x16c4-minmax-avx512vnni-prfm.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x16c4-minmax-avx512vnni-prfm.c @@ -55,7 +55,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x16c4__avx512vnni_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vacc0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x16c4-minmax-avx512vnni.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x16c4-minmax-avx512vnni.c index 3b096e5fedf..6cc58544fdd 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x16c4-minmax-avx512vnni.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x16c4-minmax-avx512vnni.c @@ -54,7 +54,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x16c4__avx512vnni( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vacc0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x16c4-minmax-avx512vnnigfni-prfm.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x16c4-minmax-avx512vnnigfni-prfm.c index 8f00c85ec2a..3401a9b7397 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x16c4-minmax-avx512vnnigfni-prfm.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x16c4-minmax-avx512vnnigfni-prfm.c @@ -55,9 +55,9 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x16c4__avx512vnnigfni_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vacc0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x16c4-minmax-avx512vnnigfni.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x16c4-minmax-avx512vnnigfni.c index dbdaff50e64..0f2a0743c97 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x16c4-minmax-avx512vnnigfni.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x16c4-minmax-avx512vnnigfni.c @@ -54,9 +54,9 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x16c4__avx512vnnigfni( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vacc0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x16c8-minmax-avx512skx-madd-prfm.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x16c8-minmax-avx512skx-madd-prfm.c index 428f1f38ed6..6726bf0f2aa 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x16c8-minmax-avx512skx-madd-prfm.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x16c8-minmax-avx512skx-madd-prfm.c @@ -55,7 +55,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x16c8__avx512skx_madd_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vsum0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x16c8-minmax-avx512skx-madd.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x16c8-minmax-avx512skx-madd.c index a7b75efa1bb..17c70dab9ef 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x16c8-minmax-avx512skx-madd.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x16c8-minmax-avx512skx-madd.c @@ -54,7 +54,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x16c8__avx512skx_madd( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vsum0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x16c8-minmax-avx512vnni-prfm.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x16c8-minmax-avx512vnni-prfm.c index 637a2d92ba7..041d71731ef 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x16c8-minmax-avx512vnni-prfm.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x16c8-minmax-avx512vnni-prfm.c @@ -55,7 +55,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x16c8__avx512vnni_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vsum0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x16c8-minmax-avx512vnni.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x16c8-minmax-avx512vnni.c index 87dfd5357c0..260454fcf5a 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x16c8-minmax-avx512vnni.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x16c8-minmax-avx512vnni.c @@ -54,7 +54,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x16c8__avx512vnni( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vsum0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x16c8-minmax-avx512vnnigfni-prfm.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x16c8-minmax-avx512vnnigfni-prfm.c index 9a8a1d3eb74..af1ee721a74 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x16c8-minmax-avx512vnnigfni-prfm.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x16c8-minmax-avx512vnnigfni-prfm.c @@ -55,9 +55,9 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x16c8__avx512vnnigfni_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vsum0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x16c8-minmax-avx512vnnigfni.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x16c8-minmax-avx512vnnigfni.c index d00fb7b0d72..94007cba76a 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x16c8-minmax-avx512vnnigfni.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x16c8-minmax-avx512vnnigfni.c @@ -54,9 +54,9 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x16c8__avx512vnnigfni( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vsum0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x4c8-minmax-ssse3-madd-prfm.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x4c8-minmax-ssse3-madd-prfm.c index 61d307dcfc5..8acb6c6257a 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x4c8-minmax-ssse3-madd-prfm.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x4c8-minmax-ssse3-madd-prfm.c @@ -53,7 +53,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x4c8__ssse3_madd_prfm( const __m128 voutput_min = _mm_set1_ps(params->scalar.min); const __m128 voutput_max = _mm_set1_ps(params->scalar.max); const __m128i vmask = _mm_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m128i vksum0123 = _mm_load_si128(w); const __m128i vksum13 = _mm_shuffle_epi32(vksum0123, 0xF5); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x4c8-minmax-ssse3-madd.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x4c8-minmax-ssse3-madd.c index 92c67301de7..7e5c477c333 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x4c8-minmax-ssse3-madd.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x4c8-minmax-ssse3-madd.c @@ -52,7 +52,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x4c8__ssse3_madd( const __m128 voutput_min = _mm_set1_ps(params->scalar.min); const __m128 voutput_max = _mm_set1_ps(params->scalar.max); const __m128i vmask = _mm_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m128i vksum0123 = _mm_load_si128(w); const __m128i vksum13 = _mm_shuffle_epi32(vksum0123, 0xF5); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x4v-minmax-rvv.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x4v-minmax-rvv.c index 81419089f2d..ba8dea5687f 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x4v-minmax-rvv.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x4v-minmax-rvv.c @@ -95,6 +95,13 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x4v__rvv( c0 = (float*) ((uintptr_t) c0 + cn_stride); a0 = (const int8_t*) ((uintptr_t) a0 - kc); + + vint8m1_t vout80 = __riscv_vncvt_x(vout0, vl); + + __riscv_vse8(c0, vout80, vl); + c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); + + a0 = (const int8_t*) ((uintptr_t) a0 - kc); } while (nc != 0); } diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-2x4c8-minmax-ssse3-madd-prfm.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-2x4c8-minmax-ssse3-madd-prfm.c index 29ffab32604..5dec4b0aeb8 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-2x4c8-minmax-ssse3-madd-prfm.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-2x4c8-minmax-ssse3-madd-prfm.c @@ -60,7 +60,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_2x4c8__ssse3_madd_prfm( const __m128 voutput_min = _mm_set1_ps(params->scalar.min); const __m128 voutput_max = _mm_set1_ps(params->scalar.max); const __m128i vmask = _mm_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m128i vksum0123 = _mm_load_si128(w); const __m128i vksum13 = _mm_shuffle_epi32(vksum0123, 0xF5); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-2x4c8-minmax-ssse3-madd.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-2x4c8-minmax-ssse3-madd.c index a7ad1f073dc..b6be901136e 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-2x4c8-minmax-ssse3-madd.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-2x4c8-minmax-ssse3-madd.c @@ -59,7 +59,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_2x4c8__ssse3_madd( const __m128 voutput_min = _mm_set1_ps(params->scalar.min); const __m128 voutput_max = _mm_set1_ps(params->scalar.max); const __m128i vmask = _mm_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m128i vksum0123 = _mm_load_si128(w); const __m128i vksum13 = _mm_shuffle_epi32(vksum0123, 0xF5); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-2x4v-minmax-rvv.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-2x4v-minmax-rvv.c index 069f82702ec..f52f32f0181 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-2x4v-minmax-rvv.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-2x4v-minmax-rvv.c @@ -121,6 +121,17 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_2x4v__rvv( a0 = (const int8_t*) ((uintptr_t) a0 - kc); a1 = (const int8_t*) ((uintptr_t) a1 - kc); + + vint8m1_t vout80 = __riscv_vncvt_x(vout0, vl); + vint8m1_t vout81 = __riscv_vncvt_x(vout1, vl); + + __riscv_vse8(c0, vout80, vl); + c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); + __riscv_vse8(c1, vout81, vl); + c1 = (int8_t*) ((uintptr_t) c1 + cn_stride); + + a0 = (const int8_t*) ((uintptr_t) a0 - kc); + a1 = (const int8_t*) ((uintptr_t) a1 - kc); } while (nc != 0); } diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-3x4c8-minmax-ssse3-madd-prfm.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-3x4c8-minmax-ssse3-madd-prfm.c index 2057b0b9887..9f2f70369f9 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-3x4c8-minmax-ssse3-madd-prfm.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-3x4c8-minmax-ssse3-madd-prfm.c @@ -67,7 +67,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_3x4c8__ssse3_madd_prfm( const __m128 voutput_min = _mm_set1_ps(params->scalar.min); const __m128 voutput_max = _mm_set1_ps(params->scalar.max); const __m128i vmask = _mm_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m128i vksum0123 = _mm_load_si128(w); const __m128i vksum13 = _mm_shuffle_epi32(vksum0123, 0xF5); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-3x4c8-minmax-ssse3-madd.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-3x4c8-minmax-ssse3-madd.c index a5737235036..e219f23da0e 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-3x4c8-minmax-ssse3-madd.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-3x4c8-minmax-ssse3-madd.c @@ -66,7 +66,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_3x4c8__ssse3_madd( const __m128 voutput_min = _mm_set1_ps(params->scalar.min); const __m128 voutput_max = _mm_set1_ps(params->scalar.max); const __m128i vmask = _mm_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m128i vksum0123 = _mm_load_si128(w); const __m128i vksum13 = _mm_shuffle_epi32(vksum0123, 0xF5); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-3x4v-minmax-rvv.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-3x4v-minmax-rvv.c index 3e6fabf6b31..26e032363f4 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-3x4v-minmax-rvv.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-3x4v-minmax-rvv.c @@ -147,6 +147,21 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_3x4v__rvv( a0 = (const int8_t*) ((uintptr_t) a0 - kc); a1 = (const int8_t*) ((uintptr_t) a1 - kc); a2 = (const int8_t*) ((uintptr_t) a2 - kc); + + vint8m1_t vout80 = __riscv_vncvt_x(vout0, vl); + vint8m1_t vout81 = __riscv_vncvt_x(vout1, vl); + vint8m1_t vout82 = __riscv_vncvt_x(vout2, vl); + + __riscv_vse8(c0, vout80, vl); + c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); + __riscv_vse8(c1, vout81, vl); + c1 = (int8_t*) ((uintptr_t) c1 + cn_stride); + __riscv_vse8(c2, vout82, vl); + c2 = (int8_t*) ((uintptr_t) c2 + cn_stride); + + a0 = (const int8_t*) ((uintptr_t) a0 - kc); + a1 = (const int8_t*) ((uintptr_t) a1 - kc); + a2 = (const int8_t*) ((uintptr_t) a2 - kc); } while (nc != 0); } diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x16c4-minmax-avx512skx-madd-prfm.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x16c4-minmax-avx512skx-madd-prfm.c index 9d12da5ea3d..bcf1da0aabb 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x16c4-minmax-avx512skx-madd-prfm.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x16c4-minmax-avx512skx-madd-prfm.c @@ -76,7 +76,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x16c4__avx512skx_madd_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vacc0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x16c4-minmax-avx512skx-madd.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x16c4-minmax-avx512skx-madd.c index 9c56c7fde98..d30dacba560 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x16c4-minmax-avx512skx-madd.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x16c4-minmax-avx512skx-madd.c @@ -75,7 +75,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x16c4__avx512skx_madd( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vacc0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x16c4-minmax-avx512vnni-prfm.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x16c4-minmax-avx512vnni-prfm.c index 1ec3a96f255..36d630ab810 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x16c4-minmax-avx512vnni-prfm.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x16c4-minmax-avx512vnni-prfm.c @@ -76,7 +76,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x16c4__avx512vnni_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vacc0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x16c4-minmax-avx512vnni.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x16c4-minmax-avx512vnni.c index 8ff15e3d522..80b58979f71 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x16c4-minmax-avx512vnni.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x16c4-minmax-avx512vnni.c @@ -75,7 +75,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x16c4__avx512vnni( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vacc0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x16c4-minmax-avx512vnnigfni-prfm.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x16c4-minmax-avx512vnnigfni-prfm.c index c0f22cc1ed2..72370be4f4a 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x16c4-minmax-avx512vnnigfni-prfm.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x16c4-minmax-avx512vnnigfni-prfm.c @@ -76,9 +76,9 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x16c4__avx512vnnigfni_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vacc0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x16c4-minmax-avx512vnnigfni.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x16c4-minmax-avx512vnnigfni.c index 0d34fb73c85..defc68033f2 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x16c4-minmax-avx512vnnigfni.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x16c4-minmax-avx512vnnigfni.c @@ -75,9 +75,9 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x16c4__avx512vnnigfni( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vacc0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x4c8-minmax-ssse3-madd-prfm.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x4c8-minmax-ssse3-madd-prfm.c index 47b969813ed..48ee2df6c16 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x4c8-minmax-ssse3-madd-prfm.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x4c8-minmax-ssse3-madd-prfm.c @@ -74,7 +74,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x4c8__ssse3_madd_prfm( const __m128 voutput_min = _mm_set1_ps(params->scalar.min); const __m128 voutput_max = _mm_set1_ps(params->scalar.max); const __m128i vmask = _mm_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m128i vksum0123 = _mm_load_si128(w); const __m128i vksum13 = _mm_shuffle_epi32(vksum0123, 0xF5); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x4c8-minmax-ssse3-madd.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x4c8-minmax-ssse3-madd.c index d31b1398e1e..cc2fff22bf1 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x4c8-minmax-ssse3-madd.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x4c8-minmax-ssse3-madd.c @@ -73,7 +73,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x4c8__ssse3_madd( const __m128 voutput_min = _mm_set1_ps(params->scalar.min); const __m128 voutput_max = _mm_set1_ps(params->scalar.max); const __m128i vmask = _mm_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m128i vksum0123 = _mm_load_si128(w); const __m128i vksum13 = _mm_shuffle_epi32(vksum0123, 0xF5); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x4v-minmax-rvv.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x4v-minmax-rvv.c index d2a7599f437..f31892c0fc7 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x4v-minmax-rvv.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x4v-minmax-rvv.c @@ -173,6 +173,25 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x4v__rvv( a1 = (const int8_t*) ((uintptr_t) a1 - kc); a2 = (const int8_t*) ((uintptr_t) a2 - kc); a3 = (const int8_t*) ((uintptr_t) a3 - kc); + + vint8m1_t vout80 = __riscv_vncvt_x(vout0, vl); + vint8m1_t vout81 = __riscv_vncvt_x(vout1, vl); + vint8m1_t vout82 = __riscv_vncvt_x(vout2, vl); + vint8m1_t vout83 = __riscv_vncvt_x(vout3, vl); + + __riscv_vse8(c0, vout80, vl); + c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); + __riscv_vse8(c1, vout81, vl); + c1 = (int8_t*) ((uintptr_t) c1 + cn_stride); + __riscv_vse8(c2, vout82, vl); + c2 = (int8_t*) ((uintptr_t) c2 + cn_stride); + __riscv_vse8(c3, vout83, vl); + c3 = (int8_t*) ((uintptr_t) c3 + cn_stride); + + a0 = (const int8_t*) ((uintptr_t) a0 - kc); + a1 = (const int8_t*) ((uintptr_t) a1 - kc); + a2 = (const int8_t*) ((uintptr_t) a2 - kc); + a3 = (const int8_t*) ((uintptr_t) a3 - kc); } while (nc != 0); } diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x16c4-minmax-avx512skx-madd-prfm.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x16c4-minmax-avx512skx-madd-prfm.c index d6abde409ce..42ebd6e262b 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x16c4-minmax-avx512skx-madd-prfm.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x16c4-minmax-avx512skx-madd-prfm.c @@ -83,7 +83,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_5x16c4__avx512skx_madd_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vacc0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x16c4-minmax-avx512skx-madd.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x16c4-minmax-avx512skx-madd.c index 3653091d998..2dd096b05a2 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x16c4-minmax-avx512skx-madd.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x16c4-minmax-avx512skx-madd.c @@ -82,7 +82,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_5x16c4__avx512skx_madd( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vacc0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x16c4-minmax-avx512vnni-prfm.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x16c4-minmax-avx512vnni-prfm.c index 32c88700b7f..bd0d1cb542d 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x16c4-minmax-avx512vnni-prfm.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x16c4-minmax-avx512vnni-prfm.c @@ -83,7 +83,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_5x16c4__avx512vnni_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vacc0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x16c4-minmax-avx512vnni.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x16c4-minmax-avx512vnni.c index e6655e261ff..074f8077751 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x16c4-minmax-avx512vnni.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x16c4-minmax-avx512vnni.c @@ -82,7 +82,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_5x16c4__avx512vnni( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vacc0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x16c4-minmax-avx512vnnigfni-prfm.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x16c4-minmax-avx512vnnigfni-prfm.c index 3b9443a984b..1ebcf0c5a86 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x16c4-minmax-avx512vnnigfni-prfm.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x16c4-minmax-avx512vnnigfni-prfm.c @@ -83,9 +83,9 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_5x16c4__avx512vnnigfni_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vacc0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x16c4-minmax-avx512vnnigfni.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x16c4-minmax-avx512vnnigfni.c index 17c871f178c..bb9c9ed56ea 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x16c4-minmax-avx512vnnigfni.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x16c4-minmax-avx512vnnigfni.c @@ -82,9 +82,9 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_5x16c4__avx512vnnigfni( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vacc0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x16c8-minmax-avx512skx-madd-prfm.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x16c8-minmax-avx512skx-madd-prfm.c index 2fe79a4b30e..44f4bdbd7d4 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x16c8-minmax-avx512skx-madd-prfm.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x16c8-minmax-avx512skx-madd-prfm.c @@ -83,7 +83,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_5x16c8__avx512skx_madd_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vsum0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x16c8-minmax-avx512skx-madd.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x16c8-minmax-avx512skx-madd.c index 3c285439c43..cf18b6d12bd 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x16c8-minmax-avx512skx-madd.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x16c8-minmax-avx512skx-madd.c @@ -82,7 +82,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_5x16c8__avx512skx_madd( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vsum0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x16c8-minmax-avx512vnni-prfm.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x16c8-minmax-avx512vnni-prfm.c index c4718b17954..b8cb3c7c388 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x16c8-minmax-avx512vnni-prfm.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x16c8-minmax-avx512vnni-prfm.c @@ -83,7 +83,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_5x16c8__avx512vnni_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vsum0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x16c8-minmax-avx512vnni.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x16c8-minmax-avx512vnni.c index 717b8e3a97e..d006e5ee686 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x16c8-minmax-avx512vnni.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x16c8-minmax-avx512vnni.c @@ -82,7 +82,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_5x16c8__avx512vnni( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vsum0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x16c8-minmax-avx512vnnigfni-prfm.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x16c8-minmax-avx512vnnigfni-prfm.c index e4e3b903120..3732186d9ff 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x16c8-minmax-avx512vnnigfni-prfm.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x16c8-minmax-avx512vnnigfni-prfm.c @@ -83,9 +83,9 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_5x16c8__avx512vnnigfni_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vsum0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x16c8-minmax-avx512vnnigfni.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x16c8-minmax-avx512vnnigfni.c index 2db67776fb4..d0d557175e9 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x16c8-minmax-avx512vnnigfni.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x16c8-minmax-avx512vnnigfni.c @@ -82,9 +82,9 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_5x16c8__avx512vnnigfni( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vsum0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x4c8-minmax-ssse3-madd-prfm.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x4c8-minmax-ssse3-madd-prfm.c index c33b231d552..3a7ae8996b9 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x4c8-minmax-ssse3-madd-prfm.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x4c8-minmax-ssse3-madd-prfm.c @@ -81,7 +81,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_5x4c8__ssse3_madd_prfm( const __m128 voutput_min = _mm_set1_ps(params->scalar.min); const __m128 voutput_max = _mm_set1_ps(params->scalar.max); const __m128i vmask = _mm_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m128i vksum0123 = _mm_load_si128(w); const __m128i vksum13 = _mm_shuffle_epi32(vksum0123, 0xF5); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x4c8-minmax-ssse3-madd.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x4c8-minmax-ssse3-madd.c index 758bc2c222a..f43c9d05757 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x4c8-minmax-ssse3-madd.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x4c8-minmax-ssse3-madd.c @@ -80,7 +80,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_5x4c8__ssse3_madd( const __m128 voutput_min = _mm_set1_ps(params->scalar.min); const __m128 voutput_max = _mm_set1_ps(params->scalar.max); const __m128i vmask = _mm_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m128i vksum0123 = _mm_load_si128(w); const __m128i vksum13 = _mm_shuffle_epi32(vksum0123, 0xF5); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x4v-minmax-rvv.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x4v-minmax-rvv.c index fae0ce37025..7781ae6ac96 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x4v-minmax-rvv.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-5x4v-minmax-rvv.c @@ -199,6 +199,29 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_5x4v__rvv( a2 = (const int8_t*) ((uintptr_t) a2 - kc); a3 = (const int8_t*) ((uintptr_t) a3 - kc); a4 = (const int8_t*) ((uintptr_t) a4 - kc); + + vint8m1_t vout80 = __riscv_vncvt_x(vout0, vl); + vint8m1_t vout81 = __riscv_vncvt_x(vout1, vl); + vint8m1_t vout82 = __riscv_vncvt_x(vout2, vl); + vint8m1_t vout83 = __riscv_vncvt_x(vout3, vl); + vint8m1_t vout84 = __riscv_vncvt_x(vout4, vl); + + __riscv_vse8(c0, vout80, vl); + c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); + __riscv_vse8(c1, vout81, vl); + c1 = (int8_t*) ((uintptr_t) c1 + cn_stride); + __riscv_vse8(c2, vout82, vl); + c2 = (int8_t*) ((uintptr_t) c2 + cn_stride); + __riscv_vse8(c3, vout83, vl); + c3 = (int8_t*) ((uintptr_t) c3 + cn_stride); + __riscv_vse8(c4, vout84, vl); + c4 = (int8_t*) ((uintptr_t) c4 + cn_stride); + + a0 = (const int8_t*) ((uintptr_t) a0 - kc); + a1 = (const int8_t*) ((uintptr_t) a1 - kc); + a2 = (const int8_t*) ((uintptr_t) a2 - kc); + a3 = (const int8_t*) ((uintptr_t) a3 - kc); + a4 = (const int8_t*) ((uintptr_t) a4 - kc); } while (nc != 0); } diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-6x4c8-minmax-ssse3-madd-prfm.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-6x4c8-minmax-ssse3-madd-prfm.c index 0fb384e431d..a8817c8a44a 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-6x4c8-minmax-ssse3-madd-prfm.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-6x4c8-minmax-ssse3-madd-prfm.c @@ -88,7 +88,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_6x4c8__ssse3_madd_prfm( const __m128 voutput_min = _mm_set1_ps(params->scalar.min); const __m128 voutput_max = _mm_set1_ps(params->scalar.max); const __m128i vmask = _mm_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m128i vksum0123 = _mm_load_si128(w); const __m128i vksum13 = _mm_shuffle_epi32(vksum0123, 0xF5); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-6x4c8-minmax-ssse3-madd.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-6x4c8-minmax-ssse3-madd.c index 7cf9a645c46..a91d4a5af01 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-6x4c8-minmax-ssse3-madd.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-6x4c8-minmax-ssse3-madd.c @@ -87,7 +87,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_6x4c8__ssse3_madd( const __m128 voutput_min = _mm_set1_ps(params->scalar.min); const __m128 voutput_max = _mm_set1_ps(params->scalar.max); const __m128i vmask = _mm_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m128i vksum0123 = _mm_load_si128(w); const __m128i vksum13 = _mm_shuffle_epi32(vksum0123, 0xF5); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-6x4v-minmax-rvv.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-6x4v-minmax-rvv.c index 130272b0cf9..1fb65b4c3a3 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-6x4v-minmax-rvv.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-6x4v-minmax-rvv.c @@ -225,6 +225,33 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_6x4v__rvv( a3 = (const int8_t*) ((uintptr_t) a3 - kc); a4 = (const int8_t*) ((uintptr_t) a4 - kc); a5 = (const int8_t*) ((uintptr_t) a5 - kc); + + vint8m1_t vout80 = __riscv_vncvt_x(vout0, vl); + vint8m1_t vout81 = __riscv_vncvt_x(vout1, vl); + vint8m1_t vout82 = __riscv_vncvt_x(vout2, vl); + vint8m1_t vout83 = __riscv_vncvt_x(vout3, vl); + vint8m1_t vout84 = __riscv_vncvt_x(vout4, vl); + vint8m1_t vout85 = __riscv_vncvt_x(vout5, vl); + + __riscv_vse8(c0, vout80, vl); + c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); + __riscv_vse8(c1, vout81, vl); + c1 = (int8_t*) ((uintptr_t) c1 + cn_stride); + __riscv_vse8(c2, vout82, vl); + c2 = (int8_t*) ((uintptr_t) c2 + cn_stride); + __riscv_vse8(c3, vout83, vl); + c3 = (int8_t*) ((uintptr_t) c3 + cn_stride); + __riscv_vse8(c4, vout84, vl); + c4 = (int8_t*) ((uintptr_t) c4 + cn_stride); + __riscv_vse8(c5, vout85, vl); + c5 = (int8_t*) ((uintptr_t) c5 + cn_stride); + + a0 = (const int8_t*) ((uintptr_t) a0 - kc); + a1 = (const int8_t*) ((uintptr_t) a1 - kc); + a2 = (const int8_t*) ((uintptr_t) a2 - kc); + a3 = (const int8_t*) ((uintptr_t) a3 - kc); + a4 = (const int8_t*) ((uintptr_t) a4 - kc); + a5 = (const int8_t*) ((uintptr_t) a5 - kc); } while (nc != 0); } diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-7x16c4-minmax-avx512skx-madd-prfm.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-7x16c4-minmax-avx512skx-madd-prfm.c index 6aeb7c3b1e2..d4a423a8ab2 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-7x16c4-minmax-avx512skx-madd-prfm.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-7x16c4-minmax-avx512skx-madd-prfm.c @@ -97,7 +97,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x16c4__avx512skx_madd_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vacc0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-7x16c4-minmax-avx512skx-madd.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-7x16c4-minmax-avx512skx-madd.c index 54b6ce9bdb8..9ad9a534baa 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-7x16c4-minmax-avx512skx-madd.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-7x16c4-minmax-avx512skx-madd.c @@ -96,7 +96,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x16c4__avx512skx_madd( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vacc0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-7x16c4-minmax-avx512vnni-prfm.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-7x16c4-minmax-avx512vnni-prfm.c index d709ea1a02b..83ba7561ea5 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-7x16c4-minmax-avx512vnni-prfm.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-7x16c4-minmax-avx512vnni-prfm.c @@ -97,7 +97,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x16c4__avx512vnni_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vacc0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-7x16c4-minmax-avx512vnni.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-7x16c4-minmax-avx512vnni.c index 1db904a00f7..3f61201517f 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-7x16c4-minmax-avx512vnni.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-7x16c4-minmax-avx512vnni.c @@ -96,7 +96,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x16c4__avx512vnni( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vacc0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-7x16c4-minmax-avx512vnnigfni-prfm.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-7x16c4-minmax-avx512vnnigfni-prfm.c index 476264de4ad..160bc081947 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-7x16c4-minmax-avx512vnnigfni-prfm.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-7x16c4-minmax-avx512vnnigfni-prfm.c @@ -97,9 +97,9 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x16c4__avx512vnnigfni_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vacc0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-7x16c4-minmax-avx512vnnigfni.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-7x16c4-minmax-avx512vnnigfni.c index df47fc8bbac..32226b5e3ea 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-7x16c4-minmax-avx512vnnigfni.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-7x16c4-minmax-avx512vnnigfni.c @@ -96,9 +96,9 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x16c4__avx512vnnigfni( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vacc0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-7x16c8-minmax-avx512skx-madd-prfm.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-7x16c8-minmax-avx512skx-madd-prfm.c index 42bc283a422..80dae4bb793 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-7x16c8-minmax-avx512skx-madd-prfm.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-7x16c8-minmax-avx512skx-madd-prfm.c @@ -97,7 +97,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x16c8__avx512skx_madd_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vsum0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-7x16c8-minmax-avx512skx-madd.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-7x16c8-minmax-avx512skx-madd.c index b1a1497bc4d..e0fe7d86e5a 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-7x16c8-minmax-avx512skx-madd.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-7x16c8-minmax-avx512skx-madd.c @@ -96,7 +96,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x16c8__avx512skx_madd( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vsum0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-7x16c8-minmax-avx512vnni-prfm.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-7x16c8-minmax-avx512vnni-prfm.c index 827d78622f0..aab5cb80c1a 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-7x16c8-minmax-avx512vnni-prfm.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-7x16c8-minmax-avx512vnni-prfm.c @@ -97,7 +97,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x16c8__avx512vnni_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vsum0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-7x16c8-minmax-avx512vnni.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-7x16c8-minmax-avx512vnni.c index bc7031e0eae..61adffd3695 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-7x16c8-minmax-avx512vnni.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-7x16c8-minmax-avx512vnni.c @@ -96,7 +96,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x16c8__avx512vnni( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vsum0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-7x16c8-minmax-avx512vnnigfni-prfm.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-7x16c8-minmax-avx512vnnigfni-prfm.c index 3196781bf17..7900ce4411f 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-7x16c8-minmax-avx512vnnigfni-prfm.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-7x16c8-minmax-avx512vnnigfni-prfm.c @@ -97,9 +97,9 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x16c8__avx512vnnigfni_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vsum0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-7x16c8-minmax-avx512vnnigfni.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-7x16c8-minmax-avx512vnnigfni.c index 76e3e02fef3..7dd7b85e250 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-7x16c8-minmax-avx512vnnigfni.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-7x16c8-minmax-avx512vnnigfni.c @@ -96,9 +96,9 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x16c8__avx512vnnigfni( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vsum0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-7x4v-minmax-rvv.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-7x4v-minmax-rvv.c index c24b732d049..2f43a3a7911 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-7x4v-minmax-rvv.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-7x4v-minmax-rvv.c @@ -251,6 +251,37 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x4v__rvv( a4 = (const int8_t*) ((uintptr_t) a4 - kc); a5 = (const int8_t*) ((uintptr_t) a5 - kc); a6 = (const int8_t*) ((uintptr_t) a6 - kc); + + vint8m1_t vout80 = __riscv_vncvt_x(vout0, vl); + vint8m1_t vout81 = __riscv_vncvt_x(vout1, vl); + vint8m1_t vout82 = __riscv_vncvt_x(vout2, vl); + vint8m1_t vout83 = __riscv_vncvt_x(vout3, vl); + vint8m1_t vout84 = __riscv_vncvt_x(vout4, vl); + vint8m1_t vout85 = __riscv_vncvt_x(vout5, vl); + vint8m1_t vout86 = __riscv_vncvt_x(vout6, vl); + + __riscv_vse8(c0, vout80, vl); + c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); + __riscv_vse8(c1, vout81, vl); + c1 = (int8_t*) ((uintptr_t) c1 + cn_stride); + __riscv_vse8(c2, vout82, vl); + c2 = (int8_t*) ((uintptr_t) c2 + cn_stride); + __riscv_vse8(c3, vout83, vl); + c3 = (int8_t*) ((uintptr_t) c3 + cn_stride); + __riscv_vse8(c4, vout84, vl); + c4 = (int8_t*) ((uintptr_t) c4 + cn_stride); + __riscv_vse8(c5, vout85, vl); + c5 = (int8_t*) ((uintptr_t) c5 + cn_stride); + __riscv_vse8(c6, vout86, vl); + c6 = (int8_t*) ((uintptr_t) c6 + cn_stride); + + a0 = (const int8_t*) ((uintptr_t) a0 - kc); + a1 = (const int8_t*) ((uintptr_t) a1 - kc); + a2 = (const int8_t*) ((uintptr_t) a2 - kc); + a3 = (const int8_t*) ((uintptr_t) a3 - kc); + a4 = (const int8_t*) ((uintptr_t) a4 - kc); + a5 = (const int8_t*) ((uintptr_t) a5 - kc); + a6 = (const int8_t*) ((uintptr_t) a6 - kc); } while (nc != 0); } diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-8x16c4-minmax-avx512skx-madd-prfm.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-8x16c4-minmax-avx512skx-madd-prfm.c index b1c9a1e93d5..4c0f0a9dc96 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-8x16c4-minmax-avx512skx-madd-prfm.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-8x16c4-minmax-avx512skx-madd-prfm.c @@ -104,7 +104,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_8x16c4__avx512skx_madd_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vacc0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-8x16c4-minmax-avx512skx-madd.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-8x16c4-minmax-avx512skx-madd.c index f6c1614e941..2094a27c641 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-8x16c4-minmax-avx512skx-madd.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-8x16c4-minmax-avx512skx-madd.c @@ -103,7 +103,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_8x16c4__avx512skx_madd( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vacc0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-8x16c4-minmax-avx512vnni-prfm.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-8x16c4-minmax-avx512vnni-prfm.c index e2cbea326db..a4f7ce4f1d8 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-8x16c4-minmax-avx512vnni-prfm.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-8x16c4-minmax-avx512vnni-prfm.c @@ -104,7 +104,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_8x16c4__avx512vnni_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vacc0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-8x16c4-minmax-avx512vnni.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-8x16c4-minmax-avx512vnni.c index e11110c0ab8..ff491812956 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-8x16c4-minmax-avx512vnni.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-8x16c4-minmax-avx512vnni.c @@ -103,7 +103,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_8x16c4__avx512vnni( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vacc0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-8x16c4-minmax-avx512vnnigfni-prfm.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-8x16c4-minmax-avx512vnnigfni-prfm.c index 22ae7228fcf..c4f3b3678c4 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-8x16c4-minmax-avx512vnnigfni-prfm.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-8x16c4-minmax-avx512vnnigfni-prfm.c @@ -104,9 +104,9 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_8x16c4__avx512vnnigfni_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vacc0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-8x16c4-minmax-avx512vnnigfni.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-8x16c4-minmax-avx512vnnigfni.c index ed7f516b088..87a3294ef6a 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-8x16c4-minmax-avx512vnnigfni.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-8x16c4-minmax-avx512vnnigfni.c @@ -103,9 +103,9 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_8x16c4__avx512vnnigfni( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vacc0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-8x16c8-minmax-avx512skx-madd-prfm.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-8x16c8-minmax-avx512skx-madd-prfm.c index 3f6641474f4..ed76d9c2a41 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-8x16c8-minmax-avx512skx-madd-prfm.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-8x16c8-minmax-avx512skx-madd-prfm.c @@ -104,7 +104,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_8x16c8__avx512skx_madd_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vsum0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-8x16c8-minmax-avx512skx-madd.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-8x16c8-minmax-avx512skx-madd.c index 5b3e2a02d8d..75aaf210aa7 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-8x16c8-minmax-avx512skx-madd.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-8x16c8-minmax-avx512skx-madd.c @@ -103,7 +103,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_8x16c8__avx512skx_madd( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vsum0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-8x16c8-minmax-avx512vnni-prfm.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-8x16c8-minmax-avx512vnni-prfm.c index a00f19ee732..764db39313b 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-8x16c8-minmax-avx512vnni-prfm.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-8x16c8-minmax-avx512vnni-prfm.c @@ -104,7 +104,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_8x16c8__avx512vnni_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vsum0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-8x16c8-minmax-avx512vnni.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-8x16c8-minmax-avx512vnni.c index a01c3e42cd7..9d723c10538 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-8x16c8-minmax-avx512vnni.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-8x16c8-minmax-avx512vnni.c @@ -103,7 +103,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_8x16c8__avx512vnni( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vsum0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-8x16c8-minmax-avx512vnnigfni-prfm.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-8x16c8-minmax-avx512vnnigfni-prfm.c index b4192bc2928..d0078650679 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-8x16c8-minmax-avx512vnnigfni-prfm.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-8x16c8-minmax-avx512vnnigfni-prfm.c @@ -104,9 +104,9 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_8x16c8__avx512vnnigfni_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vsum0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-8x16c8-minmax-avx512vnnigfni.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-8x16c8-minmax-avx512vnnigfni.c index a3aad350a7d..ee9b4269281 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-8x16c8-minmax-avx512vnnigfni.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-8x16c8-minmax-avx512vnnigfni.c @@ -103,9 +103,9 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_8x16c8__avx512vnnigfni( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vsum0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-8x4v-minmax-rvv.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-8x4v-minmax-rvv.c index 25ccbf2f900..dbd10d282f5 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-8x4v-minmax-rvv.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-8x4v-minmax-rvv.c @@ -277,6 +277,41 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_8x4v__rvv( a5 = (const int8_t*) ((uintptr_t) a5 - kc); a6 = (const int8_t*) ((uintptr_t) a6 - kc); a7 = (const int8_t*) ((uintptr_t) a7 - kc); + + vint8m1_t vout80 = __riscv_vncvt_x(vout0, vl); + vint8m1_t vout81 = __riscv_vncvt_x(vout1, vl); + vint8m1_t vout82 = __riscv_vncvt_x(vout2, vl); + vint8m1_t vout83 = __riscv_vncvt_x(vout3, vl); + vint8m1_t vout84 = __riscv_vncvt_x(vout4, vl); + vint8m1_t vout85 = __riscv_vncvt_x(vout5, vl); + vint8m1_t vout86 = __riscv_vncvt_x(vout6, vl); + vint8m1_t vout87 = __riscv_vncvt_x(vout7, vl); + + __riscv_vse8(c0, vout80, vl); + c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); + __riscv_vse8(c1, vout81, vl); + c1 = (int8_t*) ((uintptr_t) c1 + cn_stride); + __riscv_vse8(c2, vout82, vl); + c2 = (int8_t*) ((uintptr_t) c2 + cn_stride); + __riscv_vse8(c3, vout83, vl); + c3 = (int8_t*) ((uintptr_t) c3 + cn_stride); + __riscv_vse8(c4, vout84, vl); + c4 = (int8_t*) ((uintptr_t) c4 + cn_stride); + __riscv_vse8(c5, vout85, vl); + c5 = (int8_t*) ((uintptr_t) c5 + cn_stride); + __riscv_vse8(c6, vout86, vl); + c6 = (int8_t*) ((uintptr_t) c6 + cn_stride); + __riscv_vse8(c7, vout87, vl); + c7 = (int8_t*) ((uintptr_t) c7 + cn_stride); + + a0 = (const int8_t*) ((uintptr_t) a0 - kc); + a1 = (const int8_t*) ((uintptr_t) a1 - kc); + a2 = (const int8_t*) ((uintptr_t) a2 - kc); + a3 = (const int8_t*) ((uintptr_t) a3 - kc); + a4 = (const int8_t*) ((uintptr_t) a4 - kc); + a5 = (const int8_t*) ((uintptr_t) a5 - kc); + a6 = (const int8_t*) ((uintptr_t) a6 - kc); + a7 = (const int8_t*) ((uintptr_t) a7 - kc); } while (nc != 0); } diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-9x16c4-minmax-avx512skx-madd-prfm.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-9x16c4-minmax-avx512skx-madd-prfm.c index c07cfc20dd1..5f2ed02fe67 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-9x16c4-minmax-avx512skx-madd-prfm.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-9x16c4-minmax-avx512skx-madd-prfm.c @@ -111,7 +111,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_9x16c4__avx512skx_madd_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vacc0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-9x16c4-minmax-avx512skx-madd.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-9x16c4-minmax-avx512skx-madd.c index ce4a56cb202..3c61f2d0e8e 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-9x16c4-minmax-avx512skx-madd.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-9x16c4-minmax-avx512skx-madd.c @@ -110,7 +110,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_9x16c4__avx512skx_madd( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vacc0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-9x16c4-minmax-avx512vnni-prfm.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-9x16c4-minmax-avx512vnni-prfm.c index 9b18624fe1c..0da916a1fa2 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-9x16c4-minmax-avx512vnni-prfm.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-9x16c4-minmax-avx512vnni-prfm.c @@ -111,7 +111,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_9x16c4__avx512vnni_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vacc0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-9x16c4-minmax-avx512vnni.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-9x16c4-minmax-avx512vnni.c index 18859670f8a..8ed1a674d8d 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-9x16c4-minmax-avx512vnni.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-9x16c4-minmax-avx512vnni.c @@ -110,7 +110,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_9x16c4__avx512vnni( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vacc0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-9x16c4-minmax-avx512vnnigfni-prfm.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-9x16c4-minmax-avx512vnnigfni-prfm.c index ba495802b5c..341d6cc3554 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-9x16c4-minmax-avx512vnnigfni-prfm.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-9x16c4-minmax-avx512vnnigfni-prfm.c @@ -111,9 +111,9 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_9x16c4__avx512vnnigfni_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vacc0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-9x16c4-minmax-avx512vnnigfni.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-9x16c4-minmax-avx512vnnigfni.c index 9f630fe1bc6..a166bab4f1a 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-9x16c4-minmax-avx512vnnigfni.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-9x16c4-minmax-avx512vnnigfni.c @@ -110,9 +110,9 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_9x16c4__avx512vnnigfni( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vacc0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-9x16c8-minmax-avx512skx-madd-prfm.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-9x16c8-minmax-avx512skx-madd-prfm.c index 8dffe24a85a..8901beea3f5 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-9x16c8-minmax-avx512skx-madd-prfm.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-9x16c8-minmax-avx512skx-madd-prfm.c @@ -111,7 +111,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_9x16c8__avx512skx_madd_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vsum0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-9x16c8-minmax-avx512skx-madd.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-9x16c8-minmax-avx512skx-madd.c index 1210f0c17d8..ec87dc52fd3 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-9x16c8-minmax-avx512skx-madd.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-9x16c8-minmax-avx512skx-madd.c @@ -110,7 +110,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_9x16c8__avx512skx_madd( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vsum0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-9x16c8-minmax-avx512vnni-prfm.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-9x16c8-minmax-avx512vnni-prfm.c index 3cdf7943792..be7b86b4fd0 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-9x16c8-minmax-avx512vnni-prfm.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-9x16c8-minmax-avx512vnni-prfm.c @@ -111,7 +111,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_9x16c8__avx512vnni_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vsum0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-9x16c8-minmax-avx512vnni.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-9x16c8-minmax-avx512vnni.c index 7c51c732aea..ccb806f6068 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-9x16c8-minmax-avx512vnni.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-9x16c8-minmax-avx512vnni.c @@ -110,7 +110,7 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_9x16c8__avx512vnni( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vsum0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-9x16c8-minmax-avx512vnnigfni-prfm.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-9x16c8-minmax-avx512vnnigfni-prfm.c index 218181e70e3..c86f7ea8561 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-9x16c8-minmax-avx512vnnigfni-prfm.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-9x16c8-minmax-avx512vnnigfni-prfm.c @@ -111,9 +111,9 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_9x16c8__avx512vnnigfni_prfm( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vsum0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-9x16c8-minmax-avx512vnnigfni.c b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-9x16c8-minmax-avx512vnnigfni.c index 7b5c0d738e9..10e2f6c49a2 100644 --- a/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-9x16c8-minmax-avx512vnnigfni.c +++ b/src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-9x16c8-minmax-avx512vnnigfni.c @@ -110,9 +110,9 @@ void xnn_qd8_f32_qc4w_gemm_minmax_ukernel_9x16c8__avx512vnnigfni( // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); __m512i vsum0x0123456789ABCDEF = _mm512_mullo_epi32(vksum0123456789ABCDEF, vinput_zero_point0); diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x4v-minmax-rvv.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x4v-minmax-rvv.c index 0fc52e5aa65..7c8ed81e1c9 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x4v-minmax-rvv.c +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x4v-minmax-rvv.c @@ -48,7 +48,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4v__rvv( vint32m4_t vksum = __riscv_vle32_v_i32m4((const int32_t*)w, vl); const int32_t vinput_zero_point0 = quantization_params[0].zero_point; - vint32m4_t vacc0 = __riscv_vmul_vx_i32m4(vksum, vinput_zero_point0, vl); + vint32m4_t vacc0 = __riscv_vmul(vksum, vinput_zero_point0, vl); w = (const int32_t*) w + nr; @@ -61,36 +61,43 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4v__rvv( w = (const int8_t*) w + nr; - vacc0 = __riscv_vwmacc_vx_i32m4(vacc0, va0, vb0, vl); + vacc0 = __riscv_vwmacc(vacc0, va0, vb0, vl); k -= sizeof(int8_t); } while (k != 0); // i32 -> f32 - vfloat32m4_t vout0 = __riscv_vfcvt_f_x_v_f32m4(vacc0, vl); + vfloat32m4_t vout0 = __riscv_vfcvt_f(vacc0, vl); // vout * input_scale const float vinput_scale0 = quantization_params[0].inv_scale; - vout0 = __riscv_vfmul_vf_f32m4(vout0, vinput_scale0, vl); + vout0 = __riscv_vfmul(vout0, vinput_scale0, vl); const vfloat32m4_t vfilter_output_scale = __riscv_vle32_v_f32m4((const float*) w, vl); w = (const float*) w + nr; - vout0 = __riscv_vfmul_vv_f32m4(vout0, vfilter_output_scale, vl); + vout0 = __riscv_vfmul(vout0, vfilter_output_scale, vl); - const vfloat32m4_t vbias = __riscv_vle32_v_f32m4((const float*) w, vl); + const vfloat32m4_t vbias = __riscv_vle32_v_f32m4((const float*) w, vl); w = (const float*) w + nr; - vout0 = __riscv_vfadd_vv_f32m4(vout0, vbias, vl); + vout0 = __riscv_vfadd(vout0, vbias, vl); const float vmin = params->scalar.min; - vout0 = __riscv_vfmax_vf_f32m4(vout0, vmin, vl); + vout0 = __riscv_vfmax(vout0, vmin, vl); const float vmax = params->scalar.max; - vout0 = __riscv_vfmin_vf_f32m4(vout0, vmax, vl); + vout0 = __riscv_vfmin(vout0, vmax, vl); // store 1 x vl results to c - __riscv_vse32_v_f32m4(c0, vout0, vl); + __riscv_vse32(c0, vout0, vl); c0 = (float*) ((uintptr_t) c0 + cn_stride); a0 = (const int8_t*) ((uintptr_t) a0 - kc); + + vint8m1_t vout80 = __riscv_vncvt_x(vout0, vl); + + __riscv_vse8(c0, vout80, vl); + c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); + + a0 = (const int8_t*) ((uintptr_t) a0 - kc); } while (nc != 0); } diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x4v-minmax-rvv.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x4v-minmax-rvv.c index 57da64b09d9..39b130de4d8 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x4v-minmax-rvv.c +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x4v-minmax-rvv.c @@ -55,8 +55,8 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x4v__rvv( vint32m4_t vksum = __riscv_vle32_v_i32m4((const int32_t*)w, vl); const int32_t vinput_zero_point0 = quantization_params[0].zero_point; const int32_t vinput_zero_point1 = quantization_params[1].zero_point; - vint32m4_t vacc0 = __riscv_vmul_vx_i32m4(vksum, vinput_zero_point0, vl); - vint32m4_t vacc1 = __riscv_vmul_vx_i32m4(vksum, vinput_zero_point1, vl); + vint32m4_t vacc0 = __riscv_vmul(vksum, vinput_zero_point0, vl); + vint32m4_t vacc1 = __riscv_vmul(vksum, vinput_zero_point1, vl); w = (const int32_t*) w + nr; @@ -70,47 +70,58 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x4v__rvv( w = (const int8_t*) w + nr; - vacc0 = __riscv_vwmacc_vx_i32m4(vacc0, va0, vb0, vl); - vacc1 = __riscv_vwmacc_vx_i32m4(vacc1, va1, vb0, vl); + vacc0 = __riscv_vwmacc(vacc0, va0, vb0, vl); + vacc1 = __riscv_vwmacc(vacc1, va1, vb0, vl); k -= sizeof(int8_t); } while (k != 0); // i32 -> f32 - vfloat32m4_t vout0 = __riscv_vfcvt_f_x_v_f32m4(vacc0, vl); - vfloat32m4_t vout1 = __riscv_vfcvt_f_x_v_f32m4(vacc1, vl); + vfloat32m4_t vout0 = __riscv_vfcvt_f(vacc0, vl); + vfloat32m4_t vout1 = __riscv_vfcvt_f(vacc1, vl); // vout * input_scale const float vinput_scale0 = quantization_params[0].inv_scale; const float vinput_scale1 = quantization_params[1].inv_scale; - vout0 = __riscv_vfmul_vf_f32m4(vout0, vinput_scale0, vl); - vout1 = __riscv_vfmul_vf_f32m4(vout1, vinput_scale1, vl); + vout0 = __riscv_vfmul(vout0, vinput_scale0, vl); + vout1 = __riscv_vfmul(vout1, vinput_scale1, vl); const vfloat32m4_t vfilter_output_scale = __riscv_vle32_v_f32m4((const float*) w, vl); w = (const float*) w + nr; - vout0 = __riscv_vfmul_vv_f32m4(vout0, vfilter_output_scale, vl); - vout1 = __riscv_vfmul_vv_f32m4(vout1, vfilter_output_scale, vl); + vout0 = __riscv_vfmul(vout0, vfilter_output_scale, vl); + vout1 = __riscv_vfmul(vout1, vfilter_output_scale, vl); - const vfloat32m4_t vbias = __riscv_vle32_v_f32m4((const float*) w, vl); + const vfloat32m4_t vbias = __riscv_vle32_v_f32m4((const float*) w, vl); w = (const float*) w + nr; - vout0 = __riscv_vfadd_vv_f32m4(vout0, vbias, vl); - vout1 = __riscv_vfadd_vv_f32m4(vout1, vbias, vl); + vout0 = __riscv_vfadd(vout0, vbias, vl); + vout1 = __riscv_vfadd(vout1, vbias, vl); const float vmin = params->scalar.min; - vout0 = __riscv_vfmax_vf_f32m4(vout0, vmin, vl); - vout1 = __riscv_vfmax_vf_f32m4(vout1, vmin, vl); + vout0 = __riscv_vfmax(vout0, vmin, vl); + vout1 = __riscv_vfmax(vout1, vmin, vl); const float vmax = params->scalar.max; - vout0 = __riscv_vfmin_vf_f32m4(vout0, vmax, vl); - vout1 = __riscv_vfmin_vf_f32m4(vout1, vmax, vl); + vout0 = __riscv_vfmin(vout0, vmax, vl); + vout1 = __riscv_vfmin(vout1, vmax, vl); // store 2 x vl results to c - __riscv_vse32_v_f32m4(c0, vout0, vl); + __riscv_vse32(c0, vout0, vl); c0 = (float*) ((uintptr_t) c0 + cn_stride); - __riscv_vse32_v_f32m4(c1, vout1, vl); + __riscv_vse32(c1, vout1, vl); c1 = (float*) ((uintptr_t) c1 + cn_stride); a0 = (const int8_t*) ((uintptr_t) a0 - kc); a1 = (const int8_t*) ((uintptr_t) a1 - kc); + + vint8m1_t vout80 = __riscv_vncvt_x(vout0, vl); + vint8m1_t vout81 = __riscv_vncvt_x(vout1, vl); + + __riscv_vse8(c0, vout80, vl); + c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); + __riscv_vse8(c1, vout81, vl); + c1 = (int8_t*) ((uintptr_t) c1 + cn_stride); + + a0 = (const int8_t*) ((uintptr_t) a0 - kc); + a1 = (const int8_t*) ((uintptr_t) a1 - kc); } while (nc != 0); } diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x4v-minmax-rvv.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x4v-minmax-rvv.c index 6ac7a1f65b5..2c1490fbdfa 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x4v-minmax-rvv.c +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x4v-minmax-rvv.c @@ -62,9 +62,9 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x4v__rvv( const int32_t vinput_zero_point0 = quantization_params[0].zero_point; const int32_t vinput_zero_point1 = quantization_params[1].zero_point; const int32_t vinput_zero_point2 = quantization_params[2].zero_point; - vint32m4_t vacc0 = __riscv_vmul_vx_i32m4(vksum, vinput_zero_point0, vl); - vint32m4_t vacc1 = __riscv_vmul_vx_i32m4(vksum, vinput_zero_point1, vl); - vint32m4_t vacc2 = __riscv_vmul_vx_i32m4(vksum, vinput_zero_point2, vl); + vint32m4_t vacc0 = __riscv_vmul(vksum, vinput_zero_point0, vl); + vint32m4_t vacc1 = __riscv_vmul(vksum, vinput_zero_point1, vl); + vint32m4_t vacc2 = __riscv_vmul(vksum, vinput_zero_point2, vl); w = (const int32_t*) w + nr; @@ -79,58 +79,73 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x4v__rvv( w = (const int8_t*) w + nr; - vacc0 = __riscv_vwmacc_vx_i32m4(vacc0, va0, vb0, vl); - vacc1 = __riscv_vwmacc_vx_i32m4(vacc1, va1, vb0, vl); - vacc2 = __riscv_vwmacc_vx_i32m4(vacc2, va2, vb0, vl); + vacc0 = __riscv_vwmacc(vacc0, va0, vb0, vl); + vacc1 = __riscv_vwmacc(vacc1, va1, vb0, vl); + vacc2 = __riscv_vwmacc(vacc2, va2, vb0, vl); k -= sizeof(int8_t); } while (k != 0); // i32 -> f32 - vfloat32m4_t vout0 = __riscv_vfcvt_f_x_v_f32m4(vacc0, vl); - vfloat32m4_t vout1 = __riscv_vfcvt_f_x_v_f32m4(vacc1, vl); - vfloat32m4_t vout2 = __riscv_vfcvt_f_x_v_f32m4(vacc2, vl); + vfloat32m4_t vout0 = __riscv_vfcvt_f(vacc0, vl); + vfloat32m4_t vout1 = __riscv_vfcvt_f(vacc1, vl); + vfloat32m4_t vout2 = __riscv_vfcvt_f(vacc2, vl); // vout * input_scale const float vinput_scale0 = quantization_params[0].inv_scale; const float vinput_scale1 = quantization_params[1].inv_scale; const float vinput_scale2 = quantization_params[2].inv_scale; - vout0 = __riscv_vfmul_vf_f32m4(vout0, vinput_scale0, vl); - vout1 = __riscv_vfmul_vf_f32m4(vout1, vinput_scale1, vl); - vout2 = __riscv_vfmul_vf_f32m4(vout2, vinput_scale2, vl); + vout0 = __riscv_vfmul(vout0, vinput_scale0, vl); + vout1 = __riscv_vfmul(vout1, vinput_scale1, vl); + vout2 = __riscv_vfmul(vout2, vinput_scale2, vl); const vfloat32m4_t vfilter_output_scale = __riscv_vle32_v_f32m4((const float*) w, vl); w = (const float*) w + nr; - vout0 = __riscv_vfmul_vv_f32m4(vout0, vfilter_output_scale, vl); - vout1 = __riscv_vfmul_vv_f32m4(vout1, vfilter_output_scale, vl); - vout2 = __riscv_vfmul_vv_f32m4(vout2, vfilter_output_scale, vl); + vout0 = __riscv_vfmul(vout0, vfilter_output_scale, vl); + vout1 = __riscv_vfmul(vout1, vfilter_output_scale, vl); + vout2 = __riscv_vfmul(vout2, vfilter_output_scale, vl); - const vfloat32m4_t vbias = __riscv_vle32_v_f32m4((const float*) w, vl); + const vfloat32m4_t vbias = __riscv_vle32_v_f32m4((const float*) w, vl); w = (const float*) w + nr; - vout0 = __riscv_vfadd_vv_f32m4(vout0, vbias, vl); - vout1 = __riscv_vfadd_vv_f32m4(vout1, vbias, vl); - vout2 = __riscv_vfadd_vv_f32m4(vout2, vbias, vl); + vout0 = __riscv_vfadd(vout0, vbias, vl); + vout1 = __riscv_vfadd(vout1, vbias, vl); + vout2 = __riscv_vfadd(vout2, vbias, vl); const float vmin = params->scalar.min; - vout0 = __riscv_vfmax_vf_f32m4(vout0, vmin, vl); - vout1 = __riscv_vfmax_vf_f32m4(vout1, vmin, vl); - vout2 = __riscv_vfmax_vf_f32m4(vout2, vmin, vl); + vout0 = __riscv_vfmax(vout0, vmin, vl); + vout1 = __riscv_vfmax(vout1, vmin, vl); + vout2 = __riscv_vfmax(vout2, vmin, vl); const float vmax = params->scalar.max; - vout0 = __riscv_vfmin_vf_f32m4(vout0, vmax, vl); - vout1 = __riscv_vfmin_vf_f32m4(vout1, vmax, vl); - vout2 = __riscv_vfmin_vf_f32m4(vout2, vmax, vl); + vout0 = __riscv_vfmin(vout0, vmax, vl); + vout1 = __riscv_vfmin(vout1, vmax, vl); + vout2 = __riscv_vfmin(vout2, vmax, vl); // store 3 x vl results to c - __riscv_vse32_v_f32m4(c0, vout0, vl); + __riscv_vse32(c0, vout0, vl); c0 = (float*) ((uintptr_t) c0 + cn_stride); - __riscv_vse32_v_f32m4(c1, vout1, vl); + __riscv_vse32(c1, vout1, vl); c1 = (float*) ((uintptr_t) c1 + cn_stride); - __riscv_vse32_v_f32m4(c2, vout2, vl); + __riscv_vse32(c2, vout2, vl); c2 = (float*) ((uintptr_t) c2 + cn_stride); a0 = (const int8_t*) ((uintptr_t) a0 - kc); a1 = (const int8_t*) ((uintptr_t) a1 - kc); a2 = (const int8_t*) ((uintptr_t) a2 - kc); + + vint8m1_t vout80 = __riscv_vncvt_x(vout0, vl); + vint8m1_t vout81 = __riscv_vncvt_x(vout1, vl); + vint8m1_t vout82 = __riscv_vncvt_x(vout2, vl); + + __riscv_vse8(c0, vout80, vl); + c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); + __riscv_vse8(c1, vout81, vl); + c1 = (int8_t*) ((uintptr_t) c1 + cn_stride); + __riscv_vse8(c2, vout82, vl); + c2 = (int8_t*) ((uintptr_t) c2 + cn_stride); + + a0 = (const int8_t*) ((uintptr_t) a0 - kc); + a1 = (const int8_t*) ((uintptr_t) a1 - kc); + a2 = (const int8_t*) ((uintptr_t) a2 - kc); } while (nc != 0); } diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x4v-minmax-rvv.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x4v-minmax-rvv.c index ce2352a5f02..1bad32b2bfd 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x4v-minmax-rvv.c +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x4v-minmax-rvv.c @@ -69,10 +69,10 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x4v__rvv( const int32_t vinput_zero_point1 = quantization_params[1].zero_point; const int32_t vinput_zero_point2 = quantization_params[2].zero_point; const int32_t vinput_zero_point3 = quantization_params[3].zero_point; - vint32m4_t vacc0 = __riscv_vmul_vx_i32m4(vksum, vinput_zero_point0, vl); - vint32m4_t vacc1 = __riscv_vmul_vx_i32m4(vksum, vinput_zero_point1, vl); - vint32m4_t vacc2 = __riscv_vmul_vx_i32m4(vksum, vinput_zero_point2, vl); - vint32m4_t vacc3 = __riscv_vmul_vx_i32m4(vksum, vinput_zero_point3, vl); + vint32m4_t vacc0 = __riscv_vmul(vksum, vinput_zero_point0, vl); + vint32m4_t vacc1 = __riscv_vmul(vksum, vinput_zero_point1, vl); + vint32m4_t vacc2 = __riscv_vmul(vksum, vinput_zero_point2, vl); + vint32m4_t vacc3 = __riscv_vmul(vksum, vinput_zero_point3, vl); w = (const int32_t*) w + nr; @@ -88,69 +88,88 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x4v__rvv( w = (const int8_t*) w + nr; - vacc0 = __riscv_vwmacc_vx_i32m4(vacc0, va0, vb0, vl); - vacc1 = __riscv_vwmacc_vx_i32m4(vacc1, va1, vb0, vl); - vacc2 = __riscv_vwmacc_vx_i32m4(vacc2, va2, vb0, vl); - vacc3 = __riscv_vwmacc_vx_i32m4(vacc3, va3, vb0, vl); + vacc0 = __riscv_vwmacc(vacc0, va0, vb0, vl); + vacc1 = __riscv_vwmacc(vacc1, va1, vb0, vl); + vacc2 = __riscv_vwmacc(vacc2, va2, vb0, vl); + vacc3 = __riscv_vwmacc(vacc3, va3, vb0, vl); k -= sizeof(int8_t); } while (k != 0); // i32 -> f32 - vfloat32m4_t vout0 = __riscv_vfcvt_f_x_v_f32m4(vacc0, vl); - vfloat32m4_t vout1 = __riscv_vfcvt_f_x_v_f32m4(vacc1, vl); - vfloat32m4_t vout2 = __riscv_vfcvt_f_x_v_f32m4(vacc2, vl); - vfloat32m4_t vout3 = __riscv_vfcvt_f_x_v_f32m4(vacc3, vl); + vfloat32m4_t vout0 = __riscv_vfcvt_f(vacc0, vl); + vfloat32m4_t vout1 = __riscv_vfcvt_f(vacc1, vl); + vfloat32m4_t vout2 = __riscv_vfcvt_f(vacc2, vl); + vfloat32m4_t vout3 = __riscv_vfcvt_f(vacc3, vl); // vout * input_scale const float vinput_scale0 = quantization_params[0].inv_scale; const float vinput_scale1 = quantization_params[1].inv_scale; const float vinput_scale2 = quantization_params[2].inv_scale; const float vinput_scale3 = quantization_params[3].inv_scale; - vout0 = __riscv_vfmul_vf_f32m4(vout0, vinput_scale0, vl); - vout1 = __riscv_vfmul_vf_f32m4(vout1, vinput_scale1, vl); - vout2 = __riscv_vfmul_vf_f32m4(vout2, vinput_scale2, vl); - vout3 = __riscv_vfmul_vf_f32m4(vout3, vinput_scale3, vl); + vout0 = __riscv_vfmul(vout0, vinput_scale0, vl); + vout1 = __riscv_vfmul(vout1, vinput_scale1, vl); + vout2 = __riscv_vfmul(vout2, vinput_scale2, vl); + vout3 = __riscv_vfmul(vout3, vinput_scale3, vl); const vfloat32m4_t vfilter_output_scale = __riscv_vle32_v_f32m4((const float*) w, vl); w = (const float*) w + nr; - vout0 = __riscv_vfmul_vv_f32m4(vout0, vfilter_output_scale, vl); - vout1 = __riscv_vfmul_vv_f32m4(vout1, vfilter_output_scale, vl); - vout2 = __riscv_vfmul_vv_f32m4(vout2, vfilter_output_scale, vl); - vout3 = __riscv_vfmul_vv_f32m4(vout3, vfilter_output_scale, vl); + vout0 = __riscv_vfmul(vout0, vfilter_output_scale, vl); + vout1 = __riscv_vfmul(vout1, vfilter_output_scale, vl); + vout2 = __riscv_vfmul(vout2, vfilter_output_scale, vl); + vout3 = __riscv_vfmul(vout3, vfilter_output_scale, vl); - const vfloat32m4_t vbias = __riscv_vle32_v_f32m4((const float*) w, vl); + const vfloat32m4_t vbias = __riscv_vle32_v_f32m4((const float*) w, vl); w = (const float*) w + nr; - vout0 = __riscv_vfadd_vv_f32m4(vout0, vbias, vl); - vout1 = __riscv_vfadd_vv_f32m4(vout1, vbias, vl); - vout2 = __riscv_vfadd_vv_f32m4(vout2, vbias, vl); - vout3 = __riscv_vfadd_vv_f32m4(vout3, vbias, vl); + vout0 = __riscv_vfadd(vout0, vbias, vl); + vout1 = __riscv_vfadd(vout1, vbias, vl); + vout2 = __riscv_vfadd(vout2, vbias, vl); + vout3 = __riscv_vfadd(vout3, vbias, vl); const float vmin = params->scalar.min; - vout0 = __riscv_vfmax_vf_f32m4(vout0, vmin, vl); - vout1 = __riscv_vfmax_vf_f32m4(vout1, vmin, vl); - vout2 = __riscv_vfmax_vf_f32m4(vout2, vmin, vl); - vout3 = __riscv_vfmax_vf_f32m4(vout3, vmin, vl); + vout0 = __riscv_vfmax(vout0, vmin, vl); + vout1 = __riscv_vfmax(vout1, vmin, vl); + vout2 = __riscv_vfmax(vout2, vmin, vl); + vout3 = __riscv_vfmax(vout3, vmin, vl); const float vmax = params->scalar.max; - vout0 = __riscv_vfmin_vf_f32m4(vout0, vmax, vl); - vout1 = __riscv_vfmin_vf_f32m4(vout1, vmax, vl); - vout2 = __riscv_vfmin_vf_f32m4(vout2, vmax, vl); - vout3 = __riscv_vfmin_vf_f32m4(vout3, vmax, vl); + vout0 = __riscv_vfmin(vout0, vmax, vl); + vout1 = __riscv_vfmin(vout1, vmax, vl); + vout2 = __riscv_vfmin(vout2, vmax, vl); + vout3 = __riscv_vfmin(vout3, vmax, vl); // store 4 x vl results to c - __riscv_vse32_v_f32m4(c0, vout0, vl); + __riscv_vse32(c0, vout0, vl); c0 = (float*) ((uintptr_t) c0 + cn_stride); - __riscv_vse32_v_f32m4(c1, vout1, vl); + __riscv_vse32(c1, vout1, vl); c1 = (float*) ((uintptr_t) c1 + cn_stride); - __riscv_vse32_v_f32m4(c2, vout2, vl); + __riscv_vse32(c2, vout2, vl); c2 = (float*) ((uintptr_t) c2 + cn_stride); - __riscv_vse32_v_f32m4(c3, vout3, vl); + __riscv_vse32(c3, vout3, vl); c3 = (float*) ((uintptr_t) c3 + cn_stride); a0 = (const int8_t*) ((uintptr_t) a0 - kc); a1 = (const int8_t*) ((uintptr_t) a1 - kc); a2 = (const int8_t*) ((uintptr_t) a2 - kc); a3 = (const int8_t*) ((uintptr_t) a3 - kc); + + vint8m1_t vout80 = __riscv_vncvt_x(vout0, vl); + vint8m1_t vout81 = __riscv_vncvt_x(vout1, vl); + vint8m1_t vout82 = __riscv_vncvt_x(vout2, vl); + vint8m1_t vout83 = __riscv_vncvt_x(vout3, vl); + + __riscv_vse8(c0, vout80, vl); + c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); + __riscv_vse8(c1, vout81, vl); + c1 = (int8_t*) ((uintptr_t) c1 + cn_stride); + __riscv_vse8(c2, vout82, vl); + c2 = (int8_t*) ((uintptr_t) c2 + cn_stride); + __riscv_vse8(c3, vout83, vl); + c3 = (int8_t*) ((uintptr_t) c3 + cn_stride); + + a0 = (const int8_t*) ((uintptr_t) a0 - kc); + a1 = (const int8_t*) ((uintptr_t) a1 - kc); + a2 = (const int8_t*) ((uintptr_t) a2 - kc); + a3 = (const int8_t*) ((uintptr_t) a3 - kc); } while (nc != 0); } diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-5x4v-minmax-rvv.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-5x4v-minmax-rvv.c index 401285d3a35..46bc0ca9bb4 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-5x4v-minmax-rvv.c +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-5x4v-minmax-rvv.c @@ -76,11 +76,11 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_5x4v__rvv( const int32_t vinput_zero_point2 = quantization_params[2].zero_point; const int32_t vinput_zero_point3 = quantization_params[3].zero_point; const int32_t vinput_zero_point4 = quantization_params[4].zero_point; - vint32m4_t vacc0 = __riscv_vmul_vx_i32m4(vksum, vinput_zero_point0, vl); - vint32m4_t vacc1 = __riscv_vmul_vx_i32m4(vksum, vinput_zero_point1, vl); - vint32m4_t vacc2 = __riscv_vmul_vx_i32m4(vksum, vinput_zero_point2, vl); - vint32m4_t vacc3 = __riscv_vmul_vx_i32m4(vksum, vinput_zero_point3, vl); - vint32m4_t vacc4 = __riscv_vmul_vx_i32m4(vksum, vinput_zero_point4, vl); + vint32m4_t vacc0 = __riscv_vmul(vksum, vinput_zero_point0, vl); + vint32m4_t vacc1 = __riscv_vmul(vksum, vinput_zero_point1, vl); + vint32m4_t vacc2 = __riscv_vmul(vksum, vinput_zero_point2, vl); + vint32m4_t vacc3 = __riscv_vmul(vksum, vinput_zero_point3, vl); + vint32m4_t vacc4 = __riscv_vmul(vksum, vinput_zero_point4, vl); w = (const int32_t*) w + nr; @@ -97,21 +97,21 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_5x4v__rvv( w = (const int8_t*) w + nr; - vacc0 = __riscv_vwmacc_vx_i32m4(vacc0, va0, vb0, vl); - vacc1 = __riscv_vwmacc_vx_i32m4(vacc1, va1, vb0, vl); - vacc2 = __riscv_vwmacc_vx_i32m4(vacc2, va2, vb0, vl); - vacc3 = __riscv_vwmacc_vx_i32m4(vacc3, va3, vb0, vl); - vacc4 = __riscv_vwmacc_vx_i32m4(vacc4, va4, vb0, vl); + vacc0 = __riscv_vwmacc(vacc0, va0, vb0, vl); + vacc1 = __riscv_vwmacc(vacc1, va1, vb0, vl); + vacc2 = __riscv_vwmacc(vacc2, va2, vb0, vl); + vacc3 = __riscv_vwmacc(vacc3, va3, vb0, vl); + vacc4 = __riscv_vwmacc(vacc4, va4, vb0, vl); k -= sizeof(int8_t); } while (k != 0); // i32 -> f32 - vfloat32m4_t vout0 = __riscv_vfcvt_f_x_v_f32m4(vacc0, vl); - vfloat32m4_t vout1 = __riscv_vfcvt_f_x_v_f32m4(vacc1, vl); - vfloat32m4_t vout2 = __riscv_vfcvt_f_x_v_f32m4(vacc2, vl); - vfloat32m4_t vout3 = __riscv_vfcvt_f_x_v_f32m4(vacc3, vl); - vfloat32m4_t vout4 = __riscv_vfcvt_f_x_v_f32m4(vacc4, vl); + vfloat32m4_t vout0 = __riscv_vfcvt_f(vacc0, vl); + vfloat32m4_t vout1 = __riscv_vfcvt_f(vacc1, vl); + vfloat32m4_t vout2 = __riscv_vfcvt_f(vacc2, vl); + vfloat32m4_t vout3 = __riscv_vfcvt_f(vacc3, vl); + vfloat32m4_t vout4 = __riscv_vfcvt_f(vacc4, vl); // vout * input_scale const float vinput_scale0 = quantization_params[0].inv_scale; @@ -119,51 +119,51 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_5x4v__rvv( const float vinput_scale2 = quantization_params[2].inv_scale; const float vinput_scale3 = quantization_params[3].inv_scale; const float vinput_scale4 = quantization_params[4].inv_scale; - vout0 = __riscv_vfmul_vf_f32m4(vout0, vinput_scale0, vl); - vout1 = __riscv_vfmul_vf_f32m4(vout1, vinput_scale1, vl); - vout2 = __riscv_vfmul_vf_f32m4(vout2, vinput_scale2, vl); - vout3 = __riscv_vfmul_vf_f32m4(vout3, vinput_scale3, vl); - vout4 = __riscv_vfmul_vf_f32m4(vout4, vinput_scale4, vl); + vout0 = __riscv_vfmul(vout0, vinput_scale0, vl); + vout1 = __riscv_vfmul(vout1, vinput_scale1, vl); + vout2 = __riscv_vfmul(vout2, vinput_scale2, vl); + vout3 = __riscv_vfmul(vout3, vinput_scale3, vl); + vout4 = __riscv_vfmul(vout4, vinput_scale4, vl); const vfloat32m4_t vfilter_output_scale = __riscv_vle32_v_f32m4((const float*) w, vl); w = (const float*) w + nr; - vout0 = __riscv_vfmul_vv_f32m4(vout0, vfilter_output_scale, vl); - vout1 = __riscv_vfmul_vv_f32m4(vout1, vfilter_output_scale, vl); - vout2 = __riscv_vfmul_vv_f32m4(vout2, vfilter_output_scale, vl); - vout3 = __riscv_vfmul_vv_f32m4(vout3, vfilter_output_scale, vl); - vout4 = __riscv_vfmul_vv_f32m4(vout4, vfilter_output_scale, vl); + vout0 = __riscv_vfmul(vout0, vfilter_output_scale, vl); + vout1 = __riscv_vfmul(vout1, vfilter_output_scale, vl); + vout2 = __riscv_vfmul(vout2, vfilter_output_scale, vl); + vout3 = __riscv_vfmul(vout3, vfilter_output_scale, vl); + vout4 = __riscv_vfmul(vout4, vfilter_output_scale, vl); - const vfloat32m4_t vbias = __riscv_vle32_v_f32m4((const float*) w, vl); + const vfloat32m4_t vbias = __riscv_vle32_v_f32m4((const float*) w, vl); w = (const float*) w + nr; - vout0 = __riscv_vfadd_vv_f32m4(vout0, vbias, vl); - vout1 = __riscv_vfadd_vv_f32m4(vout1, vbias, vl); - vout2 = __riscv_vfadd_vv_f32m4(vout2, vbias, vl); - vout3 = __riscv_vfadd_vv_f32m4(vout3, vbias, vl); - vout4 = __riscv_vfadd_vv_f32m4(vout4, vbias, vl); + vout0 = __riscv_vfadd(vout0, vbias, vl); + vout1 = __riscv_vfadd(vout1, vbias, vl); + vout2 = __riscv_vfadd(vout2, vbias, vl); + vout3 = __riscv_vfadd(vout3, vbias, vl); + vout4 = __riscv_vfadd(vout4, vbias, vl); const float vmin = params->scalar.min; - vout0 = __riscv_vfmax_vf_f32m4(vout0, vmin, vl); - vout1 = __riscv_vfmax_vf_f32m4(vout1, vmin, vl); - vout2 = __riscv_vfmax_vf_f32m4(vout2, vmin, vl); - vout3 = __riscv_vfmax_vf_f32m4(vout3, vmin, vl); - vout4 = __riscv_vfmax_vf_f32m4(vout4, vmin, vl); + vout0 = __riscv_vfmax(vout0, vmin, vl); + vout1 = __riscv_vfmax(vout1, vmin, vl); + vout2 = __riscv_vfmax(vout2, vmin, vl); + vout3 = __riscv_vfmax(vout3, vmin, vl); + vout4 = __riscv_vfmax(vout4, vmin, vl); const float vmax = params->scalar.max; - vout0 = __riscv_vfmin_vf_f32m4(vout0, vmax, vl); - vout1 = __riscv_vfmin_vf_f32m4(vout1, vmax, vl); - vout2 = __riscv_vfmin_vf_f32m4(vout2, vmax, vl); - vout3 = __riscv_vfmin_vf_f32m4(vout3, vmax, vl); - vout4 = __riscv_vfmin_vf_f32m4(vout4, vmax, vl); + vout0 = __riscv_vfmin(vout0, vmax, vl); + vout1 = __riscv_vfmin(vout1, vmax, vl); + vout2 = __riscv_vfmin(vout2, vmax, vl); + vout3 = __riscv_vfmin(vout3, vmax, vl); + vout4 = __riscv_vfmin(vout4, vmax, vl); // store 5 x vl results to c - __riscv_vse32_v_f32m4(c0, vout0, vl); + __riscv_vse32(c0, vout0, vl); c0 = (float*) ((uintptr_t) c0 + cn_stride); - __riscv_vse32_v_f32m4(c1, vout1, vl); + __riscv_vse32(c1, vout1, vl); c1 = (float*) ((uintptr_t) c1 + cn_stride); - __riscv_vse32_v_f32m4(c2, vout2, vl); + __riscv_vse32(c2, vout2, vl); c2 = (float*) ((uintptr_t) c2 + cn_stride); - __riscv_vse32_v_f32m4(c3, vout3, vl); + __riscv_vse32(c3, vout3, vl); c3 = (float*) ((uintptr_t) c3 + cn_stride); - __riscv_vse32_v_f32m4(c4, vout4, vl); + __riscv_vse32(c4, vout4, vl); c4 = (float*) ((uintptr_t) c4 + cn_stride); a0 = (const int8_t*) ((uintptr_t) a0 - kc); @@ -171,6 +171,29 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_5x4v__rvv( a2 = (const int8_t*) ((uintptr_t) a2 - kc); a3 = (const int8_t*) ((uintptr_t) a3 - kc); a4 = (const int8_t*) ((uintptr_t) a4 - kc); + + vint8m1_t vout80 = __riscv_vncvt_x(vout0, vl); + vint8m1_t vout81 = __riscv_vncvt_x(vout1, vl); + vint8m1_t vout82 = __riscv_vncvt_x(vout2, vl); + vint8m1_t vout83 = __riscv_vncvt_x(vout3, vl); + vint8m1_t vout84 = __riscv_vncvt_x(vout4, vl); + + __riscv_vse8(c0, vout80, vl); + c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); + __riscv_vse8(c1, vout81, vl); + c1 = (int8_t*) ((uintptr_t) c1 + cn_stride); + __riscv_vse8(c2, vout82, vl); + c2 = (int8_t*) ((uintptr_t) c2 + cn_stride); + __riscv_vse8(c3, vout83, vl); + c3 = (int8_t*) ((uintptr_t) c3 + cn_stride); + __riscv_vse8(c4, vout84, vl); + c4 = (int8_t*) ((uintptr_t) c4 + cn_stride); + + a0 = (const int8_t*) ((uintptr_t) a0 - kc); + a1 = (const int8_t*) ((uintptr_t) a1 - kc); + a2 = (const int8_t*) ((uintptr_t) a2 - kc); + a3 = (const int8_t*) ((uintptr_t) a3 - kc); + a4 = (const int8_t*) ((uintptr_t) a4 - kc); } while (nc != 0); } diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-6x4v-minmax-rvv.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-6x4v-minmax-rvv.c index 31ca9e5a8e8..29f5a9678f7 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-6x4v-minmax-rvv.c +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-6x4v-minmax-rvv.c @@ -83,12 +83,12 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_6x4v__rvv( const int32_t vinput_zero_point3 = quantization_params[3].zero_point; const int32_t vinput_zero_point4 = quantization_params[4].zero_point; const int32_t vinput_zero_point5 = quantization_params[5].zero_point; - vint32m4_t vacc0 = __riscv_vmul_vx_i32m4(vksum, vinput_zero_point0, vl); - vint32m4_t vacc1 = __riscv_vmul_vx_i32m4(vksum, vinput_zero_point1, vl); - vint32m4_t vacc2 = __riscv_vmul_vx_i32m4(vksum, vinput_zero_point2, vl); - vint32m4_t vacc3 = __riscv_vmul_vx_i32m4(vksum, vinput_zero_point3, vl); - vint32m4_t vacc4 = __riscv_vmul_vx_i32m4(vksum, vinput_zero_point4, vl); - vint32m4_t vacc5 = __riscv_vmul_vx_i32m4(vksum, vinput_zero_point5, vl); + vint32m4_t vacc0 = __riscv_vmul(vksum, vinput_zero_point0, vl); + vint32m4_t vacc1 = __riscv_vmul(vksum, vinput_zero_point1, vl); + vint32m4_t vacc2 = __riscv_vmul(vksum, vinput_zero_point2, vl); + vint32m4_t vacc3 = __riscv_vmul(vksum, vinput_zero_point3, vl); + vint32m4_t vacc4 = __riscv_vmul(vksum, vinput_zero_point4, vl); + vint32m4_t vacc5 = __riscv_vmul(vksum, vinput_zero_point5, vl); w = (const int32_t*) w + nr; @@ -106,23 +106,23 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_6x4v__rvv( w = (const int8_t*) w + nr; - vacc0 = __riscv_vwmacc_vx_i32m4(vacc0, va0, vb0, vl); - vacc1 = __riscv_vwmacc_vx_i32m4(vacc1, va1, vb0, vl); - vacc2 = __riscv_vwmacc_vx_i32m4(vacc2, va2, vb0, vl); - vacc3 = __riscv_vwmacc_vx_i32m4(vacc3, va3, vb0, vl); - vacc4 = __riscv_vwmacc_vx_i32m4(vacc4, va4, vb0, vl); - vacc5 = __riscv_vwmacc_vx_i32m4(vacc5, va5, vb0, vl); + vacc0 = __riscv_vwmacc(vacc0, va0, vb0, vl); + vacc1 = __riscv_vwmacc(vacc1, va1, vb0, vl); + vacc2 = __riscv_vwmacc(vacc2, va2, vb0, vl); + vacc3 = __riscv_vwmacc(vacc3, va3, vb0, vl); + vacc4 = __riscv_vwmacc(vacc4, va4, vb0, vl); + vacc5 = __riscv_vwmacc(vacc5, va5, vb0, vl); k -= sizeof(int8_t); } while (k != 0); // i32 -> f32 - vfloat32m4_t vout0 = __riscv_vfcvt_f_x_v_f32m4(vacc0, vl); - vfloat32m4_t vout1 = __riscv_vfcvt_f_x_v_f32m4(vacc1, vl); - vfloat32m4_t vout2 = __riscv_vfcvt_f_x_v_f32m4(vacc2, vl); - vfloat32m4_t vout3 = __riscv_vfcvt_f_x_v_f32m4(vacc3, vl); - vfloat32m4_t vout4 = __riscv_vfcvt_f_x_v_f32m4(vacc4, vl); - vfloat32m4_t vout5 = __riscv_vfcvt_f_x_v_f32m4(vacc5, vl); + vfloat32m4_t vout0 = __riscv_vfcvt_f(vacc0, vl); + vfloat32m4_t vout1 = __riscv_vfcvt_f(vacc1, vl); + vfloat32m4_t vout2 = __riscv_vfcvt_f(vacc2, vl); + vfloat32m4_t vout3 = __riscv_vfcvt_f(vacc3, vl); + vfloat32m4_t vout4 = __riscv_vfcvt_f(vacc4, vl); + vfloat32m4_t vout5 = __riscv_vfcvt_f(vacc5, vl); // vout * input_scale const float vinput_scale0 = quantization_params[0].inv_scale; @@ -131,58 +131,58 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_6x4v__rvv( const float vinput_scale3 = quantization_params[3].inv_scale; const float vinput_scale4 = quantization_params[4].inv_scale; const float vinput_scale5 = quantization_params[5].inv_scale; - vout0 = __riscv_vfmul_vf_f32m4(vout0, vinput_scale0, vl); - vout1 = __riscv_vfmul_vf_f32m4(vout1, vinput_scale1, vl); - vout2 = __riscv_vfmul_vf_f32m4(vout2, vinput_scale2, vl); - vout3 = __riscv_vfmul_vf_f32m4(vout3, vinput_scale3, vl); - vout4 = __riscv_vfmul_vf_f32m4(vout4, vinput_scale4, vl); - vout5 = __riscv_vfmul_vf_f32m4(vout5, vinput_scale5, vl); + vout0 = __riscv_vfmul(vout0, vinput_scale0, vl); + vout1 = __riscv_vfmul(vout1, vinput_scale1, vl); + vout2 = __riscv_vfmul(vout2, vinput_scale2, vl); + vout3 = __riscv_vfmul(vout3, vinput_scale3, vl); + vout4 = __riscv_vfmul(vout4, vinput_scale4, vl); + vout5 = __riscv_vfmul(vout5, vinput_scale5, vl); const vfloat32m4_t vfilter_output_scale = __riscv_vle32_v_f32m4((const float*) w, vl); w = (const float*) w + nr; - vout0 = __riscv_vfmul_vv_f32m4(vout0, vfilter_output_scale, vl); - vout1 = __riscv_vfmul_vv_f32m4(vout1, vfilter_output_scale, vl); - vout2 = __riscv_vfmul_vv_f32m4(vout2, vfilter_output_scale, vl); - vout3 = __riscv_vfmul_vv_f32m4(vout3, vfilter_output_scale, vl); - vout4 = __riscv_vfmul_vv_f32m4(vout4, vfilter_output_scale, vl); - vout5 = __riscv_vfmul_vv_f32m4(vout5, vfilter_output_scale, vl); - - const vfloat32m4_t vbias = __riscv_vle32_v_f32m4((const float*) w, vl); + vout0 = __riscv_vfmul(vout0, vfilter_output_scale, vl); + vout1 = __riscv_vfmul(vout1, vfilter_output_scale, vl); + vout2 = __riscv_vfmul(vout2, vfilter_output_scale, vl); + vout3 = __riscv_vfmul(vout3, vfilter_output_scale, vl); + vout4 = __riscv_vfmul(vout4, vfilter_output_scale, vl); + vout5 = __riscv_vfmul(vout5, vfilter_output_scale, vl); + + const vfloat32m4_t vbias = __riscv_vle32_v_f32m4((const float*) w, vl); w = (const float*) w + nr; - vout0 = __riscv_vfadd_vv_f32m4(vout0, vbias, vl); - vout1 = __riscv_vfadd_vv_f32m4(vout1, vbias, vl); - vout2 = __riscv_vfadd_vv_f32m4(vout2, vbias, vl); - vout3 = __riscv_vfadd_vv_f32m4(vout3, vbias, vl); - vout4 = __riscv_vfadd_vv_f32m4(vout4, vbias, vl); - vout5 = __riscv_vfadd_vv_f32m4(vout5, vbias, vl); + vout0 = __riscv_vfadd(vout0, vbias, vl); + vout1 = __riscv_vfadd(vout1, vbias, vl); + vout2 = __riscv_vfadd(vout2, vbias, vl); + vout3 = __riscv_vfadd(vout3, vbias, vl); + vout4 = __riscv_vfadd(vout4, vbias, vl); + vout5 = __riscv_vfadd(vout5, vbias, vl); const float vmin = params->scalar.min; - vout0 = __riscv_vfmax_vf_f32m4(vout0, vmin, vl); - vout1 = __riscv_vfmax_vf_f32m4(vout1, vmin, vl); - vout2 = __riscv_vfmax_vf_f32m4(vout2, vmin, vl); - vout3 = __riscv_vfmax_vf_f32m4(vout3, vmin, vl); - vout4 = __riscv_vfmax_vf_f32m4(vout4, vmin, vl); - vout5 = __riscv_vfmax_vf_f32m4(vout5, vmin, vl); + vout0 = __riscv_vfmax(vout0, vmin, vl); + vout1 = __riscv_vfmax(vout1, vmin, vl); + vout2 = __riscv_vfmax(vout2, vmin, vl); + vout3 = __riscv_vfmax(vout3, vmin, vl); + vout4 = __riscv_vfmax(vout4, vmin, vl); + vout5 = __riscv_vfmax(vout5, vmin, vl); const float vmax = params->scalar.max; - vout0 = __riscv_vfmin_vf_f32m4(vout0, vmax, vl); - vout1 = __riscv_vfmin_vf_f32m4(vout1, vmax, vl); - vout2 = __riscv_vfmin_vf_f32m4(vout2, vmax, vl); - vout3 = __riscv_vfmin_vf_f32m4(vout3, vmax, vl); - vout4 = __riscv_vfmin_vf_f32m4(vout4, vmax, vl); - vout5 = __riscv_vfmin_vf_f32m4(vout5, vmax, vl); + vout0 = __riscv_vfmin(vout0, vmax, vl); + vout1 = __riscv_vfmin(vout1, vmax, vl); + vout2 = __riscv_vfmin(vout2, vmax, vl); + vout3 = __riscv_vfmin(vout3, vmax, vl); + vout4 = __riscv_vfmin(vout4, vmax, vl); + vout5 = __riscv_vfmin(vout5, vmax, vl); // store 6 x vl results to c - __riscv_vse32_v_f32m4(c0, vout0, vl); + __riscv_vse32(c0, vout0, vl); c0 = (float*) ((uintptr_t) c0 + cn_stride); - __riscv_vse32_v_f32m4(c1, vout1, vl); + __riscv_vse32(c1, vout1, vl); c1 = (float*) ((uintptr_t) c1 + cn_stride); - __riscv_vse32_v_f32m4(c2, vout2, vl); + __riscv_vse32(c2, vout2, vl); c2 = (float*) ((uintptr_t) c2 + cn_stride); - __riscv_vse32_v_f32m4(c3, vout3, vl); + __riscv_vse32(c3, vout3, vl); c3 = (float*) ((uintptr_t) c3 + cn_stride); - __riscv_vse32_v_f32m4(c4, vout4, vl); + __riscv_vse32(c4, vout4, vl); c4 = (float*) ((uintptr_t) c4 + cn_stride); - __riscv_vse32_v_f32m4(c5, vout5, vl); + __riscv_vse32(c5, vout5, vl); c5 = (float*) ((uintptr_t) c5 + cn_stride); a0 = (const int8_t*) ((uintptr_t) a0 - kc); @@ -191,6 +191,33 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_6x4v__rvv( a3 = (const int8_t*) ((uintptr_t) a3 - kc); a4 = (const int8_t*) ((uintptr_t) a4 - kc); a5 = (const int8_t*) ((uintptr_t) a5 - kc); + + vint8m1_t vout80 = __riscv_vncvt_x(vout0, vl); + vint8m1_t vout81 = __riscv_vncvt_x(vout1, vl); + vint8m1_t vout82 = __riscv_vncvt_x(vout2, vl); + vint8m1_t vout83 = __riscv_vncvt_x(vout3, vl); + vint8m1_t vout84 = __riscv_vncvt_x(vout4, vl); + vint8m1_t vout85 = __riscv_vncvt_x(vout5, vl); + + __riscv_vse8(c0, vout80, vl); + c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); + __riscv_vse8(c1, vout81, vl); + c1 = (int8_t*) ((uintptr_t) c1 + cn_stride); + __riscv_vse8(c2, vout82, vl); + c2 = (int8_t*) ((uintptr_t) c2 + cn_stride); + __riscv_vse8(c3, vout83, vl); + c3 = (int8_t*) ((uintptr_t) c3 + cn_stride); + __riscv_vse8(c4, vout84, vl); + c4 = (int8_t*) ((uintptr_t) c4 + cn_stride); + __riscv_vse8(c5, vout85, vl); + c5 = (int8_t*) ((uintptr_t) c5 + cn_stride); + + a0 = (const int8_t*) ((uintptr_t) a0 - kc); + a1 = (const int8_t*) ((uintptr_t) a1 - kc); + a2 = (const int8_t*) ((uintptr_t) a2 - kc); + a3 = (const int8_t*) ((uintptr_t) a3 - kc); + a4 = (const int8_t*) ((uintptr_t) a4 - kc); + a5 = (const int8_t*) ((uintptr_t) a5 - kc); } while (nc != 0); } diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-7x4v-minmax-rvv.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-7x4v-minmax-rvv.c index 798154781f5..dea20ae75a0 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-7x4v-minmax-rvv.c +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-7x4v-minmax-rvv.c @@ -90,13 +90,13 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_7x4v__rvv( const int32_t vinput_zero_point4 = quantization_params[4].zero_point; const int32_t vinput_zero_point5 = quantization_params[5].zero_point; const int32_t vinput_zero_point6 = quantization_params[6].zero_point; - vint32m4_t vacc0 = __riscv_vmul_vx_i32m4(vksum, vinput_zero_point0, vl); - vint32m4_t vacc1 = __riscv_vmul_vx_i32m4(vksum, vinput_zero_point1, vl); - vint32m4_t vacc2 = __riscv_vmul_vx_i32m4(vksum, vinput_zero_point2, vl); - vint32m4_t vacc3 = __riscv_vmul_vx_i32m4(vksum, vinput_zero_point3, vl); - vint32m4_t vacc4 = __riscv_vmul_vx_i32m4(vksum, vinput_zero_point4, vl); - vint32m4_t vacc5 = __riscv_vmul_vx_i32m4(vksum, vinput_zero_point5, vl); - vint32m4_t vacc6 = __riscv_vmul_vx_i32m4(vksum, vinput_zero_point6, vl); + vint32m4_t vacc0 = __riscv_vmul(vksum, vinput_zero_point0, vl); + vint32m4_t vacc1 = __riscv_vmul(vksum, vinput_zero_point1, vl); + vint32m4_t vacc2 = __riscv_vmul(vksum, vinput_zero_point2, vl); + vint32m4_t vacc3 = __riscv_vmul(vksum, vinput_zero_point3, vl); + vint32m4_t vacc4 = __riscv_vmul(vksum, vinput_zero_point4, vl); + vint32m4_t vacc5 = __riscv_vmul(vksum, vinput_zero_point5, vl); + vint32m4_t vacc6 = __riscv_vmul(vksum, vinput_zero_point6, vl); w = (const int32_t*) w + nr; @@ -115,25 +115,25 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_7x4v__rvv( w = (const int8_t*) w + nr; - vacc0 = __riscv_vwmacc_vx_i32m4(vacc0, va0, vb0, vl); - vacc1 = __riscv_vwmacc_vx_i32m4(vacc1, va1, vb0, vl); - vacc2 = __riscv_vwmacc_vx_i32m4(vacc2, va2, vb0, vl); - vacc3 = __riscv_vwmacc_vx_i32m4(vacc3, va3, vb0, vl); - vacc4 = __riscv_vwmacc_vx_i32m4(vacc4, va4, vb0, vl); - vacc5 = __riscv_vwmacc_vx_i32m4(vacc5, va5, vb0, vl); - vacc6 = __riscv_vwmacc_vx_i32m4(vacc6, va6, vb0, vl); + vacc0 = __riscv_vwmacc(vacc0, va0, vb0, vl); + vacc1 = __riscv_vwmacc(vacc1, va1, vb0, vl); + vacc2 = __riscv_vwmacc(vacc2, va2, vb0, vl); + vacc3 = __riscv_vwmacc(vacc3, va3, vb0, vl); + vacc4 = __riscv_vwmacc(vacc4, va4, vb0, vl); + vacc5 = __riscv_vwmacc(vacc5, va5, vb0, vl); + vacc6 = __riscv_vwmacc(vacc6, va6, vb0, vl); k -= sizeof(int8_t); } while (k != 0); // i32 -> f32 - vfloat32m4_t vout0 = __riscv_vfcvt_f_x_v_f32m4(vacc0, vl); - vfloat32m4_t vout1 = __riscv_vfcvt_f_x_v_f32m4(vacc1, vl); - vfloat32m4_t vout2 = __riscv_vfcvt_f_x_v_f32m4(vacc2, vl); - vfloat32m4_t vout3 = __riscv_vfcvt_f_x_v_f32m4(vacc3, vl); - vfloat32m4_t vout4 = __riscv_vfcvt_f_x_v_f32m4(vacc4, vl); - vfloat32m4_t vout5 = __riscv_vfcvt_f_x_v_f32m4(vacc5, vl); - vfloat32m4_t vout6 = __riscv_vfcvt_f_x_v_f32m4(vacc6, vl); + vfloat32m4_t vout0 = __riscv_vfcvt_f(vacc0, vl); + vfloat32m4_t vout1 = __riscv_vfcvt_f(vacc1, vl); + vfloat32m4_t vout2 = __riscv_vfcvt_f(vacc2, vl); + vfloat32m4_t vout3 = __riscv_vfcvt_f(vacc3, vl); + vfloat32m4_t vout4 = __riscv_vfcvt_f(vacc4, vl); + vfloat32m4_t vout5 = __riscv_vfcvt_f(vacc5, vl); + vfloat32m4_t vout6 = __riscv_vfcvt_f(vacc6, vl); // vout * input_scale const float vinput_scale0 = quantization_params[0].inv_scale; @@ -143,65 +143,65 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_7x4v__rvv( const float vinput_scale4 = quantization_params[4].inv_scale; const float vinput_scale5 = quantization_params[5].inv_scale; const float vinput_scale6 = quantization_params[6].inv_scale; - vout0 = __riscv_vfmul_vf_f32m4(vout0, vinput_scale0, vl); - vout1 = __riscv_vfmul_vf_f32m4(vout1, vinput_scale1, vl); - vout2 = __riscv_vfmul_vf_f32m4(vout2, vinput_scale2, vl); - vout3 = __riscv_vfmul_vf_f32m4(vout3, vinput_scale3, vl); - vout4 = __riscv_vfmul_vf_f32m4(vout4, vinput_scale4, vl); - vout5 = __riscv_vfmul_vf_f32m4(vout5, vinput_scale5, vl); - vout6 = __riscv_vfmul_vf_f32m4(vout6, vinput_scale6, vl); + vout0 = __riscv_vfmul(vout0, vinput_scale0, vl); + vout1 = __riscv_vfmul(vout1, vinput_scale1, vl); + vout2 = __riscv_vfmul(vout2, vinput_scale2, vl); + vout3 = __riscv_vfmul(vout3, vinput_scale3, vl); + vout4 = __riscv_vfmul(vout4, vinput_scale4, vl); + vout5 = __riscv_vfmul(vout5, vinput_scale5, vl); + vout6 = __riscv_vfmul(vout6, vinput_scale6, vl); const vfloat32m4_t vfilter_output_scale = __riscv_vle32_v_f32m4((const float*) w, vl); w = (const float*) w + nr; - vout0 = __riscv_vfmul_vv_f32m4(vout0, vfilter_output_scale, vl); - vout1 = __riscv_vfmul_vv_f32m4(vout1, vfilter_output_scale, vl); - vout2 = __riscv_vfmul_vv_f32m4(vout2, vfilter_output_scale, vl); - vout3 = __riscv_vfmul_vv_f32m4(vout3, vfilter_output_scale, vl); - vout4 = __riscv_vfmul_vv_f32m4(vout4, vfilter_output_scale, vl); - vout5 = __riscv_vfmul_vv_f32m4(vout5, vfilter_output_scale, vl); - vout6 = __riscv_vfmul_vv_f32m4(vout6, vfilter_output_scale, vl); - - const vfloat32m4_t vbias = __riscv_vle32_v_f32m4((const float*) w, vl); + vout0 = __riscv_vfmul(vout0, vfilter_output_scale, vl); + vout1 = __riscv_vfmul(vout1, vfilter_output_scale, vl); + vout2 = __riscv_vfmul(vout2, vfilter_output_scale, vl); + vout3 = __riscv_vfmul(vout3, vfilter_output_scale, vl); + vout4 = __riscv_vfmul(vout4, vfilter_output_scale, vl); + vout5 = __riscv_vfmul(vout5, vfilter_output_scale, vl); + vout6 = __riscv_vfmul(vout6, vfilter_output_scale, vl); + + const vfloat32m4_t vbias = __riscv_vle32_v_f32m4((const float*) w, vl); w = (const float*) w + nr; - vout0 = __riscv_vfadd_vv_f32m4(vout0, vbias, vl); - vout1 = __riscv_vfadd_vv_f32m4(vout1, vbias, vl); - vout2 = __riscv_vfadd_vv_f32m4(vout2, vbias, vl); - vout3 = __riscv_vfadd_vv_f32m4(vout3, vbias, vl); - vout4 = __riscv_vfadd_vv_f32m4(vout4, vbias, vl); - vout5 = __riscv_vfadd_vv_f32m4(vout5, vbias, vl); - vout6 = __riscv_vfadd_vv_f32m4(vout6, vbias, vl); + vout0 = __riscv_vfadd(vout0, vbias, vl); + vout1 = __riscv_vfadd(vout1, vbias, vl); + vout2 = __riscv_vfadd(vout2, vbias, vl); + vout3 = __riscv_vfadd(vout3, vbias, vl); + vout4 = __riscv_vfadd(vout4, vbias, vl); + vout5 = __riscv_vfadd(vout5, vbias, vl); + vout6 = __riscv_vfadd(vout6, vbias, vl); const float vmin = params->scalar.min; - vout0 = __riscv_vfmax_vf_f32m4(vout0, vmin, vl); - vout1 = __riscv_vfmax_vf_f32m4(vout1, vmin, vl); - vout2 = __riscv_vfmax_vf_f32m4(vout2, vmin, vl); - vout3 = __riscv_vfmax_vf_f32m4(vout3, vmin, vl); - vout4 = __riscv_vfmax_vf_f32m4(vout4, vmin, vl); - vout5 = __riscv_vfmax_vf_f32m4(vout5, vmin, vl); - vout6 = __riscv_vfmax_vf_f32m4(vout6, vmin, vl); + vout0 = __riscv_vfmax(vout0, vmin, vl); + vout1 = __riscv_vfmax(vout1, vmin, vl); + vout2 = __riscv_vfmax(vout2, vmin, vl); + vout3 = __riscv_vfmax(vout3, vmin, vl); + vout4 = __riscv_vfmax(vout4, vmin, vl); + vout5 = __riscv_vfmax(vout5, vmin, vl); + vout6 = __riscv_vfmax(vout6, vmin, vl); const float vmax = params->scalar.max; - vout0 = __riscv_vfmin_vf_f32m4(vout0, vmax, vl); - vout1 = __riscv_vfmin_vf_f32m4(vout1, vmax, vl); - vout2 = __riscv_vfmin_vf_f32m4(vout2, vmax, vl); - vout3 = __riscv_vfmin_vf_f32m4(vout3, vmax, vl); - vout4 = __riscv_vfmin_vf_f32m4(vout4, vmax, vl); - vout5 = __riscv_vfmin_vf_f32m4(vout5, vmax, vl); - vout6 = __riscv_vfmin_vf_f32m4(vout6, vmax, vl); + vout0 = __riscv_vfmin(vout0, vmax, vl); + vout1 = __riscv_vfmin(vout1, vmax, vl); + vout2 = __riscv_vfmin(vout2, vmax, vl); + vout3 = __riscv_vfmin(vout3, vmax, vl); + vout4 = __riscv_vfmin(vout4, vmax, vl); + vout5 = __riscv_vfmin(vout5, vmax, vl); + vout6 = __riscv_vfmin(vout6, vmax, vl); // store 7 x vl results to c - __riscv_vse32_v_f32m4(c0, vout0, vl); + __riscv_vse32(c0, vout0, vl); c0 = (float*) ((uintptr_t) c0 + cn_stride); - __riscv_vse32_v_f32m4(c1, vout1, vl); + __riscv_vse32(c1, vout1, vl); c1 = (float*) ((uintptr_t) c1 + cn_stride); - __riscv_vse32_v_f32m4(c2, vout2, vl); + __riscv_vse32(c2, vout2, vl); c2 = (float*) ((uintptr_t) c2 + cn_stride); - __riscv_vse32_v_f32m4(c3, vout3, vl); + __riscv_vse32(c3, vout3, vl); c3 = (float*) ((uintptr_t) c3 + cn_stride); - __riscv_vse32_v_f32m4(c4, vout4, vl); + __riscv_vse32(c4, vout4, vl); c4 = (float*) ((uintptr_t) c4 + cn_stride); - __riscv_vse32_v_f32m4(c5, vout5, vl); + __riscv_vse32(c5, vout5, vl); c5 = (float*) ((uintptr_t) c5 + cn_stride); - __riscv_vse32_v_f32m4(c6, vout6, vl); + __riscv_vse32(c6, vout6, vl); c6 = (float*) ((uintptr_t) c6 + cn_stride); a0 = (const int8_t*) ((uintptr_t) a0 - kc); @@ -211,6 +211,37 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_7x4v__rvv( a4 = (const int8_t*) ((uintptr_t) a4 - kc); a5 = (const int8_t*) ((uintptr_t) a5 - kc); a6 = (const int8_t*) ((uintptr_t) a6 - kc); + + vint8m1_t vout80 = __riscv_vncvt_x(vout0, vl); + vint8m1_t vout81 = __riscv_vncvt_x(vout1, vl); + vint8m1_t vout82 = __riscv_vncvt_x(vout2, vl); + vint8m1_t vout83 = __riscv_vncvt_x(vout3, vl); + vint8m1_t vout84 = __riscv_vncvt_x(vout4, vl); + vint8m1_t vout85 = __riscv_vncvt_x(vout5, vl); + vint8m1_t vout86 = __riscv_vncvt_x(vout6, vl); + + __riscv_vse8(c0, vout80, vl); + c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); + __riscv_vse8(c1, vout81, vl); + c1 = (int8_t*) ((uintptr_t) c1 + cn_stride); + __riscv_vse8(c2, vout82, vl); + c2 = (int8_t*) ((uintptr_t) c2 + cn_stride); + __riscv_vse8(c3, vout83, vl); + c3 = (int8_t*) ((uintptr_t) c3 + cn_stride); + __riscv_vse8(c4, vout84, vl); + c4 = (int8_t*) ((uintptr_t) c4 + cn_stride); + __riscv_vse8(c5, vout85, vl); + c5 = (int8_t*) ((uintptr_t) c5 + cn_stride); + __riscv_vse8(c6, vout86, vl); + c6 = (int8_t*) ((uintptr_t) c6 + cn_stride); + + a0 = (const int8_t*) ((uintptr_t) a0 - kc); + a1 = (const int8_t*) ((uintptr_t) a1 - kc); + a2 = (const int8_t*) ((uintptr_t) a2 - kc); + a3 = (const int8_t*) ((uintptr_t) a3 - kc); + a4 = (const int8_t*) ((uintptr_t) a4 - kc); + a5 = (const int8_t*) ((uintptr_t) a5 - kc); + a6 = (const int8_t*) ((uintptr_t) a6 - kc); } while (nc != 0); } diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-8x4v-minmax-rvv.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-8x4v-minmax-rvv.c index 696c3c60f48..c9cabf0ad29 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-8x4v-minmax-rvv.c +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-8x4v-minmax-rvv.c @@ -97,14 +97,14 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_8x4v__rvv( const int32_t vinput_zero_point5 = quantization_params[5].zero_point; const int32_t vinput_zero_point6 = quantization_params[6].zero_point; const int32_t vinput_zero_point7 = quantization_params[7].zero_point; - vint32m4_t vacc0 = __riscv_vmul_vx_i32m4(vksum, vinput_zero_point0, vl); - vint32m4_t vacc1 = __riscv_vmul_vx_i32m4(vksum, vinput_zero_point1, vl); - vint32m4_t vacc2 = __riscv_vmul_vx_i32m4(vksum, vinput_zero_point2, vl); - vint32m4_t vacc3 = __riscv_vmul_vx_i32m4(vksum, vinput_zero_point3, vl); - vint32m4_t vacc4 = __riscv_vmul_vx_i32m4(vksum, vinput_zero_point4, vl); - vint32m4_t vacc5 = __riscv_vmul_vx_i32m4(vksum, vinput_zero_point5, vl); - vint32m4_t vacc6 = __riscv_vmul_vx_i32m4(vksum, vinput_zero_point6, vl); - vint32m4_t vacc7 = __riscv_vmul_vx_i32m4(vksum, vinput_zero_point7, vl); + vint32m4_t vacc0 = __riscv_vmul(vksum, vinput_zero_point0, vl); + vint32m4_t vacc1 = __riscv_vmul(vksum, vinput_zero_point1, vl); + vint32m4_t vacc2 = __riscv_vmul(vksum, vinput_zero_point2, vl); + vint32m4_t vacc3 = __riscv_vmul(vksum, vinput_zero_point3, vl); + vint32m4_t vacc4 = __riscv_vmul(vksum, vinput_zero_point4, vl); + vint32m4_t vacc5 = __riscv_vmul(vksum, vinput_zero_point5, vl); + vint32m4_t vacc6 = __riscv_vmul(vksum, vinput_zero_point6, vl); + vint32m4_t vacc7 = __riscv_vmul(vksum, vinput_zero_point7, vl); w = (const int32_t*) w + nr; @@ -124,27 +124,27 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_8x4v__rvv( w = (const int8_t*) w + nr; - vacc0 = __riscv_vwmacc_vx_i32m4(vacc0, va0, vb0, vl); - vacc1 = __riscv_vwmacc_vx_i32m4(vacc1, va1, vb0, vl); - vacc2 = __riscv_vwmacc_vx_i32m4(vacc2, va2, vb0, vl); - vacc3 = __riscv_vwmacc_vx_i32m4(vacc3, va3, vb0, vl); - vacc4 = __riscv_vwmacc_vx_i32m4(vacc4, va4, vb0, vl); - vacc5 = __riscv_vwmacc_vx_i32m4(vacc5, va5, vb0, vl); - vacc6 = __riscv_vwmacc_vx_i32m4(vacc6, va6, vb0, vl); - vacc7 = __riscv_vwmacc_vx_i32m4(vacc7, va7, vb0, vl); + vacc0 = __riscv_vwmacc(vacc0, va0, vb0, vl); + vacc1 = __riscv_vwmacc(vacc1, va1, vb0, vl); + vacc2 = __riscv_vwmacc(vacc2, va2, vb0, vl); + vacc3 = __riscv_vwmacc(vacc3, va3, vb0, vl); + vacc4 = __riscv_vwmacc(vacc4, va4, vb0, vl); + vacc5 = __riscv_vwmacc(vacc5, va5, vb0, vl); + vacc6 = __riscv_vwmacc(vacc6, va6, vb0, vl); + vacc7 = __riscv_vwmacc(vacc7, va7, vb0, vl); k -= sizeof(int8_t); } while (k != 0); // i32 -> f32 - vfloat32m4_t vout0 = __riscv_vfcvt_f_x_v_f32m4(vacc0, vl); - vfloat32m4_t vout1 = __riscv_vfcvt_f_x_v_f32m4(vacc1, vl); - vfloat32m4_t vout2 = __riscv_vfcvt_f_x_v_f32m4(vacc2, vl); - vfloat32m4_t vout3 = __riscv_vfcvt_f_x_v_f32m4(vacc3, vl); - vfloat32m4_t vout4 = __riscv_vfcvt_f_x_v_f32m4(vacc4, vl); - vfloat32m4_t vout5 = __riscv_vfcvt_f_x_v_f32m4(vacc5, vl); - vfloat32m4_t vout6 = __riscv_vfcvt_f_x_v_f32m4(vacc6, vl); - vfloat32m4_t vout7 = __riscv_vfcvt_f_x_v_f32m4(vacc7, vl); + vfloat32m4_t vout0 = __riscv_vfcvt_f(vacc0, vl); + vfloat32m4_t vout1 = __riscv_vfcvt_f(vacc1, vl); + vfloat32m4_t vout2 = __riscv_vfcvt_f(vacc2, vl); + vfloat32m4_t vout3 = __riscv_vfcvt_f(vacc3, vl); + vfloat32m4_t vout4 = __riscv_vfcvt_f(vacc4, vl); + vfloat32m4_t vout5 = __riscv_vfcvt_f(vacc5, vl); + vfloat32m4_t vout6 = __riscv_vfcvt_f(vacc6, vl); + vfloat32m4_t vout7 = __riscv_vfcvt_f(vacc7, vl); // vout * input_scale const float vinput_scale0 = quantization_params[0].inv_scale; @@ -155,72 +155,72 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_8x4v__rvv( const float vinput_scale5 = quantization_params[5].inv_scale; const float vinput_scale6 = quantization_params[6].inv_scale; const float vinput_scale7 = quantization_params[7].inv_scale; - vout0 = __riscv_vfmul_vf_f32m4(vout0, vinput_scale0, vl); - vout1 = __riscv_vfmul_vf_f32m4(vout1, vinput_scale1, vl); - vout2 = __riscv_vfmul_vf_f32m4(vout2, vinput_scale2, vl); - vout3 = __riscv_vfmul_vf_f32m4(vout3, vinput_scale3, vl); - vout4 = __riscv_vfmul_vf_f32m4(vout4, vinput_scale4, vl); - vout5 = __riscv_vfmul_vf_f32m4(vout5, vinput_scale5, vl); - vout6 = __riscv_vfmul_vf_f32m4(vout6, vinput_scale6, vl); - vout7 = __riscv_vfmul_vf_f32m4(vout7, vinput_scale7, vl); + vout0 = __riscv_vfmul(vout0, vinput_scale0, vl); + vout1 = __riscv_vfmul(vout1, vinput_scale1, vl); + vout2 = __riscv_vfmul(vout2, vinput_scale2, vl); + vout3 = __riscv_vfmul(vout3, vinput_scale3, vl); + vout4 = __riscv_vfmul(vout4, vinput_scale4, vl); + vout5 = __riscv_vfmul(vout5, vinput_scale5, vl); + vout6 = __riscv_vfmul(vout6, vinput_scale6, vl); + vout7 = __riscv_vfmul(vout7, vinput_scale7, vl); const vfloat32m4_t vfilter_output_scale = __riscv_vle32_v_f32m4((const float*) w, vl); w = (const float*) w + nr; - vout0 = __riscv_vfmul_vv_f32m4(vout0, vfilter_output_scale, vl); - vout1 = __riscv_vfmul_vv_f32m4(vout1, vfilter_output_scale, vl); - vout2 = __riscv_vfmul_vv_f32m4(vout2, vfilter_output_scale, vl); - vout3 = __riscv_vfmul_vv_f32m4(vout3, vfilter_output_scale, vl); - vout4 = __riscv_vfmul_vv_f32m4(vout4, vfilter_output_scale, vl); - vout5 = __riscv_vfmul_vv_f32m4(vout5, vfilter_output_scale, vl); - vout6 = __riscv_vfmul_vv_f32m4(vout6, vfilter_output_scale, vl); - vout7 = __riscv_vfmul_vv_f32m4(vout7, vfilter_output_scale, vl); - - const vfloat32m4_t vbias = __riscv_vle32_v_f32m4((const float*) w, vl); + vout0 = __riscv_vfmul(vout0, vfilter_output_scale, vl); + vout1 = __riscv_vfmul(vout1, vfilter_output_scale, vl); + vout2 = __riscv_vfmul(vout2, vfilter_output_scale, vl); + vout3 = __riscv_vfmul(vout3, vfilter_output_scale, vl); + vout4 = __riscv_vfmul(vout4, vfilter_output_scale, vl); + vout5 = __riscv_vfmul(vout5, vfilter_output_scale, vl); + vout6 = __riscv_vfmul(vout6, vfilter_output_scale, vl); + vout7 = __riscv_vfmul(vout7, vfilter_output_scale, vl); + + const vfloat32m4_t vbias = __riscv_vle32_v_f32m4((const float*) w, vl); w = (const float*) w + nr; - vout0 = __riscv_vfadd_vv_f32m4(vout0, vbias, vl); - vout1 = __riscv_vfadd_vv_f32m4(vout1, vbias, vl); - vout2 = __riscv_vfadd_vv_f32m4(vout2, vbias, vl); - vout3 = __riscv_vfadd_vv_f32m4(vout3, vbias, vl); - vout4 = __riscv_vfadd_vv_f32m4(vout4, vbias, vl); - vout5 = __riscv_vfadd_vv_f32m4(vout5, vbias, vl); - vout6 = __riscv_vfadd_vv_f32m4(vout6, vbias, vl); - vout7 = __riscv_vfadd_vv_f32m4(vout7, vbias, vl); + vout0 = __riscv_vfadd(vout0, vbias, vl); + vout1 = __riscv_vfadd(vout1, vbias, vl); + vout2 = __riscv_vfadd(vout2, vbias, vl); + vout3 = __riscv_vfadd(vout3, vbias, vl); + vout4 = __riscv_vfadd(vout4, vbias, vl); + vout5 = __riscv_vfadd(vout5, vbias, vl); + vout6 = __riscv_vfadd(vout6, vbias, vl); + vout7 = __riscv_vfadd(vout7, vbias, vl); const float vmin = params->scalar.min; - vout0 = __riscv_vfmax_vf_f32m4(vout0, vmin, vl); - vout1 = __riscv_vfmax_vf_f32m4(vout1, vmin, vl); - vout2 = __riscv_vfmax_vf_f32m4(vout2, vmin, vl); - vout3 = __riscv_vfmax_vf_f32m4(vout3, vmin, vl); - vout4 = __riscv_vfmax_vf_f32m4(vout4, vmin, vl); - vout5 = __riscv_vfmax_vf_f32m4(vout5, vmin, vl); - vout6 = __riscv_vfmax_vf_f32m4(vout6, vmin, vl); - vout7 = __riscv_vfmax_vf_f32m4(vout7, vmin, vl); + vout0 = __riscv_vfmax(vout0, vmin, vl); + vout1 = __riscv_vfmax(vout1, vmin, vl); + vout2 = __riscv_vfmax(vout2, vmin, vl); + vout3 = __riscv_vfmax(vout3, vmin, vl); + vout4 = __riscv_vfmax(vout4, vmin, vl); + vout5 = __riscv_vfmax(vout5, vmin, vl); + vout6 = __riscv_vfmax(vout6, vmin, vl); + vout7 = __riscv_vfmax(vout7, vmin, vl); const float vmax = params->scalar.max; - vout0 = __riscv_vfmin_vf_f32m4(vout0, vmax, vl); - vout1 = __riscv_vfmin_vf_f32m4(vout1, vmax, vl); - vout2 = __riscv_vfmin_vf_f32m4(vout2, vmax, vl); - vout3 = __riscv_vfmin_vf_f32m4(vout3, vmax, vl); - vout4 = __riscv_vfmin_vf_f32m4(vout4, vmax, vl); - vout5 = __riscv_vfmin_vf_f32m4(vout5, vmax, vl); - vout6 = __riscv_vfmin_vf_f32m4(vout6, vmax, vl); - vout7 = __riscv_vfmin_vf_f32m4(vout7, vmax, vl); + vout0 = __riscv_vfmin(vout0, vmax, vl); + vout1 = __riscv_vfmin(vout1, vmax, vl); + vout2 = __riscv_vfmin(vout2, vmax, vl); + vout3 = __riscv_vfmin(vout3, vmax, vl); + vout4 = __riscv_vfmin(vout4, vmax, vl); + vout5 = __riscv_vfmin(vout5, vmax, vl); + vout6 = __riscv_vfmin(vout6, vmax, vl); + vout7 = __riscv_vfmin(vout7, vmax, vl); // store 8 x vl results to c - __riscv_vse32_v_f32m4(c0, vout0, vl); + __riscv_vse32(c0, vout0, vl); c0 = (float*) ((uintptr_t) c0 + cn_stride); - __riscv_vse32_v_f32m4(c1, vout1, vl); + __riscv_vse32(c1, vout1, vl); c1 = (float*) ((uintptr_t) c1 + cn_stride); - __riscv_vse32_v_f32m4(c2, vout2, vl); + __riscv_vse32(c2, vout2, vl); c2 = (float*) ((uintptr_t) c2 + cn_stride); - __riscv_vse32_v_f32m4(c3, vout3, vl); + __riscv_vse32(c3, vout3, vl); c3 = (float*) ((uintptr_t) c3 + cn_stride); - __riscv_vse32_v_f32m4(c4, vout4, vl); + __riscv_vse32(c4, vout4, vl); c4 = (float*) ((uintptr_t) c4 + cn_stride); - __riscv_vse32_v_f32m4(c5, vout5, vl); + __riscv_vse32(c5, vout5, vl); c5 = (float*) ((uintptr_t) c5 + cn_stride); - __riscv_vse32_v_f32m4(c6, vout6, vl); + __riscv_vse32(c6, vout6, vl); c6 = (float*) ((uintptr_t) c6 + cn_stride); - __riscv_vse32_v_f32m4(c7, vout7, vl); + __riscv_vse32(c7, vout7, vl); c7 = (float*) ((uintptr_t) c7 + cn_stride); a0 = (const int8_t*) ((uintptr_t) a0 - kc); @@ -231,6 +231,41 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_8x4v__rvv( a5 = (const int8_t*) ((uintptr_t) a5 - kc); a6 = (const int8_t*) ((uintptr_t) a6 - kc); a7 = (const int8_t*) ((uintptr_t) a7 - kc); + + vint8m1_t vout80 = __riscv_vncvt_x(vout0, vl); + vint8m1_t vout81 = __riscv_vncvt_x(vout1, vl); + vint8m1_t vout82 = __riscv_vncvt_x(vout2, vl); + vint8m1_t vout83 = __riscv_vncvt_x(vout3, vl); + vint8m1_t vout84 = __riscv_vncvt_x(vout4, vl); + vint8m1_t vout85 = __riscv_vncvt_x(vout5, vl); + vint8m1_t vout86 = __riscv_vncvt_x(vout6, vl); + vint8m1_t vout87 = __riscv_vncvt_x(vout7, vl); + + __riscv_vse8(c0, vout80, vl); + c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); + __riscv_vse8(c1, vout81, vl); + c1 = (int8_t*) ((uintptr_t) c1 + cn_stride); + __riscv_vse8(c2, vout82, vl); + c2 = (int8_t*) ((uintptr_t) c2 + cn_stride); + __riscv_vse8(c3, vout83, vl); + c3 = (int8_t*) ((uintptr_t) c3 + cn_stride); + __riscv_vse8(c4, vout84, vl); + c4 = (int8_t*) ((uintptr_t) c4 + cn_stride); + __riscv_vse8(c5, vout85, vl); + c5 = (int8_t*) ((uintptr_t) c5 + cn_stride); + __riscv_vse8(c6, vout86, vl); + c6 = (int8_t*) ((uintptr_t) c6 + cn_stride); + __riscv_vse8(c7, vout87, vl); + c7 = (int8_t*) ((uintptr_t) c7 + cn_stride); + + a0 = (const int8_t*) ((uintptr_t) a0 - kc); + a1 = (const int8_t*) ((uintptr_t) a1 - kc); + a2 = (const int8_t*) ((uintptr_t) a2 - kc); + a3 = (const int8_t*) ((uintptr_t) a3 - kc); + a4 = (const int8_t*) ((uintptr_t) a4 - kc); + a5 = (const int8_t*) ((uintptr_t) a5 - kc); + a6 = (const int8_t*) ((uintptr_t) a6 - kc); + a7 = (const int8_t*) ((uintptr_t) a7 - kc); } while (nc != 0); } diff --git a/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-10x16c4-minmax-avx512vnni-prfm.c b/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-10x16c4-minmax-avx512vnni-prfm.c index 7838c2b61bc..0c561d8ee3f 100644 --- a/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-10x16c4-minmax-avx512vnni-prfm.c +++ b/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-10x16c4-minmax-avx512vnni-prfm.c @@ -91,7 +91,7 @@ void xnn_qd8_f32_qc8w_igemm_minmax_ukernel_10x16c4__avx512vnni_prfm( const __m512 vinput_inv_scale = _mm512_set1_ps(quantization_params->inv_scale); const __m512 voutput_min = _mm512_set1_ps(params->scalar.min); const __m512 voutput_max = _mm512_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(vinput_zero_point); + // XNN_FORCE_REALIZATION(vinput_zero_point); // XNN_FORCE_REALIZATION(vinput_inv_scale); // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); diff --git a/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-10x16c4-minmax-avx512vnni.c b/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-10x16c4-minmax-avx512vnni.c index 6bf76954754..052a0efefb2 100644 --- a/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-10x16c4-minmax-avx512vnni.c +++ b/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-10x16c4-minmax-avx512vnni.c @@ -90,7 +90,7 @@ void xnn_qd8_f32_qc8w_igemm_minmax_ukernel_10x16c4__avx512vnni( const __m512 vinput_inv_scale = _mm512_set1_ps(quantization_params->inv_scale); const __m512 voutput_min = _mm512_set1_ps(params->scalar.min); const __m512 voutput_max = _mm512_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(vinput_zero_point); + // XNN_FORCE_REALIZATION(vinput_zero_point); // XNN_FORCE_REALIZATION(vinput_inv_scale); // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); diff --git a/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-12x16c4-minmax-avx512vnni-prfm.c b/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-12x16c4-minmax-avx512vnni-prfm.c index 7bed1ec6987..acbf9947df4 100644 --- a/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-12x16c4-minmax-avx512vnni-prfm.c +++ b/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-12x16c4-minmax-avx512vnni-prfm.c @@ -99,7 +99,7 @@ void xnn_qd8_f32_qc8w_igemm_minmax_ukernel_12x16c4__avx512vnni_prfm( const __m512 vinput_inv_scale = _mm512_set1_ps(quantization_params->inv_scale); const __m512 voutput_min = _mm512_set1_ps(params->scalar.min); const __m512 voutput_max = _mm512_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(vinput_zero_point); + // XNN_FORCE_REALIZATION(vinput_zero_point); // XNN_FORCE_REALIZATION(vinput_inv_scale); // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); diff --git a/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-12x16c4-minmax-avx512vnni.c b/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-12x16c4-minmax-avx512vnni.c index 18e9849f614..88a21a598bd 100644 --- a/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-12x16c4-minmax-avx512vnni.c +++ b/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-12x16c4-minmax-avx512vnni.c @@ -98,7 +98,7 @@ void xnn_qd8_f32_qc8w_igemm_minmax_ukernel_12x16c4__avx512vnni( const __m512 vinput_inv_scale = _mm512_set1_ps(quantization_params->inv_scale); const __m512 voutput_min = _mm512_set1_ps(params->scalar.min); const __m512 voutput_max = _mm512_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(vinput_zero_point); + // XNN_FORCE_REALIZATION(vinput_zero_point); // XNN_FORCE_REALIZATION(vinput_inv_scale); // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); diff --git a/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-14x16c4-minmax-avx512vnni-prfm.c b/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-14x16c4-minmax-avx512vnni-prfm.c index 5f58187e337..0d1bfbd5384 100644 --- a/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-14x16c4-minmax-avx512vnni-prfm.c +++ b/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-14x16c4-minmax-avx512vnni-prfm.c @@ -107,7 +107,7 @@ void xnn_qd8_f32_qc8w_igemm_minmax_ukernel_14x16c4__avx512vnni_prfm( const __m512 vinput_inv_scale = _mm512_set1_ps(quantization_params->inv_scale); const __m512 voutput_min = _mm512_set1_ps(params->scalar.min); const __m512 voutput_max = _mm512_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(vinput_zero_point); + // XNN_FORCE_REALIZATION(vinput_zero_point); // XNN_FORCE_REALIZATION(vinput_inv_scale); // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); diff --git a/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-14x16c4-minmax-avx512vnni.c b/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-14x16c4-minmax-avx512vnni.c index 580472f1881..dac64007fb0 100644 --- a/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-14x16c4-minmax-avx512vnni.c +++ b/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-14x16c4-minmax-avx512vnni.c @@ -106,7 +106,7 @@ void xnn_qd8_f32_qc8w_igemm_minmax_ukernel_14x16c4__avx512vnni( const __m512 vinput_inv_scale = _mm512_set1_ps(quantization_params->inv_scale); const __m512 voutput_min = _mm512_set1_ps(params->scalar.min); const __m512 voutput_max = _mm512_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(vinput_zero_point); + // XNN_FORCE_REALIZATION(vinput_zero_point); // XNN_FORCE_REALIZATION(vinput_inv_scale); // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); diff --git a/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-1x16c4-minmax-avx512vnni-prfm.c b/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-1x16c4-minmax-avx512vnni-prfm.c index 54b74daaf97..28a7adb3c14 100644 --- a/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-1x16c4-minmax-avx512vnni-prfm.c +++ b/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-1x16c4-minmax-avx512vnni-prfm.c @@ -55,7 +55,7 @@ void xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x16c4__avx512vnni_prfm( const __m512 vinput_inv_scale = _mm512_set1_ps(quantization_params->inv_scale); const __m512 voutput_min = _mm512_set1_ps(params->scalar.min); const __m512 voutput_max = _mm512_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(vinput_zero_point); + // XNN_FORCE_REALIZATION(vinput_zero_point); // XNN_FORCE_REALIZATION(vinput_inv_scale); // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); diff --git a/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-1x16c4-minmax-avx512vnni.c b/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-1x16c4-minmax-avx512vnni.c index af17aea1ca9..eceb57bdab2 100644 --- a/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-1x16c4-minmax-avx512vnni.c +++ b/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-1x16c4-minmax-avx512vnni.c @@ -54,7 +54,7 @@ void xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x16c4__avx512vnni( const __m512 vinput_inv_scale = _mm512_set1_ps(quantization_params->inv_scale); const __m512 voutput_min = _mm512_set1_ps(params->scalar.min); const __m512 voutput_max = _mm512_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(vinput_zero_point); + // XNN_FORCE_REALIZATION(vinput_zero_point); // XNN_FORCE_REALIZATION(vinput_inv_scale); // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); diff --git a/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x16c4-minmax-avx512vnni-prfm.c b/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x16c4-minmax-avx512vnni-prfm.c index c20af82c915..b651a4865d9 100644 --- a/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x16c4-minmax-avx512vnni-prfm.c +++ b/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x16c4-minmax-avx512vnni-prfm.c @@ -67,7 +67,7 @@ void xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x16c4__avx512vnni_prfm( const __m512 vinput_inv_scale = _mm512_set1_ps(quantization_params->inv_scale); const __m512 voutput_min = _mm512_set1_ps(params->scalar.min); const __m512 voutput_max = _mm512_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(vinput_zero_point); + // XNN_FORCE_REALIZATION(vinput_zero_point); // XNN_FORCE_REALIZATION(vinput_inv_scale); // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); diff --git a/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x16c4-minmax-avx512vnni.c b/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x16c4-minmax-avx512vnni.c index 8051b23c49d..7b250a9a167 100644 --- a/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x16c4-minmax-avx512vnni.c +++ b/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x16c4-minmax-avx512vnni.c @@ -66,7 +66,7 @@ void xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x16c4__avx512vnni( const __m512 vinput_inv_scale = _mm512_set1_ps(quantization_params->inv_scale); const __m512 voutput_min = _mm512_set1_ps(params->scalar.min); const __m512 voutput_max = _mm512_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(vinput_zero_point); + // XNN_FORCE_REALIZATION(vinput_zero_point); // XNN_FORCE_REALIZATION(vinput_inv_scale); // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); diff --git a/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-5x16c4-minmax-avx512vnni-prfm.c b/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-5x16c4-minmax-avx512vnni-prfm.c index 91050d71691..6e100e2e2e5 100644 --- a/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-5x16c4-minmax-avx512vnni-prfm.c +++ b/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-5x16c4-minmax-avx512vnni-prfm.c @@ -71,7 +71,7 @@ void xnn_qd8_f32_qc8w_igemm_minmax_ukernel_5x16c4__avx512vnni_prfm( const __m512 vinput_inv_scale = _mm512_set1_ps(quantization_params->inv_scale); const __m512 voutput_min = _mm512_set1_ps(params->scalar.min); const __m512 voutput_max = _mm512_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(vinput_zero_point); + // XNN_FORCE_REALIZATION(vinput_zero_point); // XNN_FORCE_REALIZATION(vinput_inv_scale); // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); diff --git a/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-5x16c4-minmax-avx512vnni.c b/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-5x16c4-minmax-avx512vnni.c index f934e3535e0..04e4f730cf8 100644 --- a/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-5x16c4-minmax-avx512vnni.c +++ b/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-5x16c4-minmax-avx512vnni.c @@ -70,7 +70,7 @@ void xnn_qd8_f32_qc8w_igemm_minmax_ukernel_5x16c4__avx512vnni( const __m512 vinput_inv_scale = _mm512_set1_ps(quantization_params->inv_scale); const __m512 voutput_min = _mm512_set1_ps(params->scalar.min); const __m512 voutput_max = _mm512_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(vinput_zero_point); + // XNN_FORCE_REALIZATION(vinput_zero_point); // XNN_FORCE_REALIZATION(vinput_inv_scale); // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); diff --git a/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-7x16c4-minmax-avx512vnni-prfm.c b/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-7x16c4-minmax-avx512vnni-prfm.c index 029d471b981..071b9c3c853 100644 --- a/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-7x16c4-minmax-avx512vnni-prfm.c +++ b/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-7x16c4-minmax-avx512vnni-prfm.c @@ -79,7 +79,7 @@ void xnn_qd8_f32_qc8w_igemm_minmax_ukernel_7x16c4__avx512vnni_prfm( const __m512 vinput_inv_scale = _mm512_set1_ps(quantization_params->inv_scale); const __m512 voutput_min = _mm512_set1_ps(params->scalar.min); const __m512 voutput_max = _mm512_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(vinput_zero_point); + // XNN_FORCE_REALIZATION(vinput_zero_point); // XNN_FORCE_REALIZATION(vinput_inv_scale); // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); diff --git a/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-7x16c4-minmax-avx512vnni.c b/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-7x16c4-minmax-avx512vnni.c index a987a2e009f..d0c6e9474f0 100644 --- a/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-7x16c4-minmax-avx512vnni.c +++ b/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-7x16c4-minmax-avx512vnni.c @@ -78,7 +78,7 @@ void xnn_qd8_f32_qc8w_igemm_minmax_ukernel_7x16c4__avx512vnni( const __m512 vinput_inv_scale = _mm512_set1_ps(quantization_params->inv_scale); const __m512 voutput_min = _mm512_set1_ps(params->scalar.min); const __m512 voutput_max = _mm512_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(vinput_zero_point); + // XNN_FORCE_REALIZATION(vinput_zero_point); // XNN_FORCE_REALIZATION(vinput_inv_scale); // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); diff --git a/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-8x16c4-minmax-avx512vnni-prfm.c b/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-8x16c4-minmax-avx512vnni-prfm.c index 0cd45404a08..1468283ab5c 100644 --- a/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-8x16c4-minmax-avx512vnni-prfm.c +++ b/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-8x16c4-minmax-avx512vnni-prfm.c @@ -83,7 +83,7 @@ void xnn_qd8_f32_qc8w_igemm_minmax_ukernel_8x16c4__avx512vnni_prfm( const __m512 vinput_inv_scale = _mm512_set1_ps(quantization_params->inv_scale); const __m512 voutput_min = _mm512_set1_ps(params->scalar.min); const __m512 voutput_max = _mm512_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(vinput_zero_point); + // XNN_FORCE_REALIZATION(vinput_zero_point); // XNN_FORCE_REALIZATION(vinput_inv_scale); // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); diff --git a/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-8x16c4-minmax-avx512vnni.c b/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-8x16c4-minmax-avx512vnni.c index 9926fe79178..bd943df6fa9 100644 --- a/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-8x16c4-minmax-avx512vnni.c +++ b/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-8x16c4-minmax-avx512vnni.c @@ -82,7 +82,7 @@ void xnn_qd8_f32_qc8w_igemm_minmax_ukernel_8x16c4__avx512vnni( const __m512 vinput_inv_scale = _mm512_set1_ps(quantization_params->inv_scale); const __m512 voutput_min = _mm512_set1_ps(params->scalar.min); const __m512 voutput_max = _mm512_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(vinput_zero_point); + // XNN_FORCE_REALIZATION(vinput_zero_point); // XNN_FORCE_REALIZATION(vinput_inv_scale); // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); diff --git a/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-9x16c4-minmax-avx512vnni-prfm.c b/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-9x16c4-minmax-avx512vnni-prfm.c index 580fa323fb2..5804c9dac12 100644 --- a/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-9x16c4-minmax-avx512vnni-prfm.c +++ b/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-9x16c4-minmax-avx512vnni-prfm.c @@ -87,7 +87,7 @@ void xnn_qd8_f32_qc8w_igemm_minmax_ukernel_9x16c4__avx512vnni_prfm( const __m512 vinput_inv_scale = _mm512_set1_ps(quantization_params->inv_scale); const __m512 voutput_min = _mm512_set1_ps(params->scalar.min); const __m512 voutput_max = _mm512_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(vinput_zero_point); + // XNN_FORCE_REALIZATION(vinput_zero_point); // XNN_FORCE_REALIZATION(vinput_inv_scale); // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); diff --git a/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-9x16c4-minmax-avx512vnni.c b/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-9x16c4-minmax-avx512vnni.c index e4528e2346e..b00bef4e597 100644 --- a/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-9x16c4-minmax-avx512vnni.c +++ b/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-9x16c4-minmax-avx512vnni.c @@ -86,7 +86,7 @@ void xnn_qd8_f32_qc8w_igemm_minmax_ukernel_9x16c4__avx512vnni( const __m512 vinput_inv_scale = _mm512_set1_ps(quantization_params->inv_scale); const __m512 voutput_min = _mm512_set1_ps(params->scalar.min); const __m512 voutput_max = _mm512_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(vinput_zero_point); + // XNN_FORCE_REALIZATION(vinput_zero_point); // XNN_FORCE_REALIZATION(vinput_inv_scale); // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); diff --git a/src/qs8-gemm/MRx16c4-avx512vnni.c.in b/src/qs8-gemm/MRx16c4-avx512vnni.c.in index 30b02695c86..3950cdde070 100644 --- a/src/qs8-gemm/MRx16c4-avx512vnni.c.in +++ b/src/qs8-gemm/MRx16c4-avx512vnni.c.in @@ -78,7 +78,7 @@ void xnn_${DATATYPE_SPEC}_gemm_minmax${REQUANTIZATION_SPEC}_ukernel_${MR}x16c4__ $if DATATYPE not in ["QD8", "QC4"]: const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); $if DATATYPE in ["QD8", "QC4"]: $for M in range(MR): const __m512i vinput_zero_point${M} = _mm512_set1_epi32((int) quantization_params[${M}].zero_point); @@ -91,10 +91,10 @@ void xnn_${DATATYPE_SPEC}_gemm_minmax${REQUANTIZATION_SPEC}_ukernel_${MR}x16c4__ const __m512i vmask = _mm512_set1_epi8(0x0F); $else: const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); $if GFNI: const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); $else: $if DATATYPE != "QC8": const __m512 vscale = _mm512_set1_ps(params->${PARAMS_STRUCT}.scale); diff --git a/src/qs8-gemm/MRx16c8-avx512vnni.c.in b/src/qs8-gemm/MRx16c8-avx512vnni.c.in index 00eb9d82d49..bfc9fdcc22a 100644 --- a/src/qs8-gemm/MRx16c8-avx512vnni.c.in +++ b/src/qs8-gemm/MRx16c8-avx512vnni.c.in @@ -113,7 +113,7 @@ void xnn_${DATATYPE_SPEC}_gemm_minmax${REQUANTIZATION_SPEC}_ukernel_${MR}x16c8__ // XNN_FORCE_REALIZATION(voutput_max); $else: const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->${PARAMS_STRUCT}.output_max - (int32_t) params->${PARAMS_STRUCT}.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->${PARAMS_STRUCT}.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->${PARAMS_STRUCT}.output_min); @@ -125,10 +125,10 @@ void xnn_${DATATYPE_SPEC}_gemm_minmax${REQUANTIZATION_SPEC}_ukernel_${MR}x16c8__ const __m512i vmask = _mm512_set1_epi8(0x0F); $else: const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); $if GFNI: const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { $if DATATYPE in ["QD8", "QC4"]: const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); diff --git a/src/qs8-gemm/MRx4c8-ssevnni.c.in b/src/qs8-gemm/MRx4c8-ssevnni.c.in index 7eb45cc91df..d3f5c1ca71d 100644 --- a/src/qs8-gemm/MRx4c8-ssevnni.c.in +++ b/src/qs8-gemm/MRx4c8-ssevnni.c.in @@ -94,7 +94,7 @@ void xnn_${DATATYPE_SPEC}_gemm_minmax${REQUANTIZATION_SPEC}_ukernel_${MR}x4c8__$ const __m128 voutput_max = _mm_set1_ps(params->scalar.max); $else: const __m128i vsign_mask = _mm_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m128 voutput_max_less_zero_point = _mm_set1_ps((int32_t) params->${PARAMS_STRUCT}.output_max - (int32_t) params->${PARAMS_STRUCT}.output_zero_point); const __m128i voutput_zero_point = _mm_set1_epi32(params->${PARAMS_STRUCT}.output_zero_point); const __m128i voutput_min = _mm_set1_epi16(params->${PARAMS_STRUCT}.output_min); @@ -103,10 +103,10 @@ void xnn_${DATATYPE_SPEC}_gemm_minmax${REQUANTIZATION_SPEC}_ukernel_${MR}x4c8__$ const __m128i vmask = _mm_set1_epi8(0x0F); $else: const __m128i vmask = _mm_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); $if GFNI: const __m128i vshl4 = _mm_set1_epi64x(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { $if DATATYPE in ["QD8_F16", "QD8_F32", "QC4_F16", "QC4_F32"]: const __m128i vksum0123 = _mm_load_si128(w); diff --git a/src/qs8-gemm/MRx8c4-avxvnni.c.in b/src/qs8-gemm/MRx8c4-avxvnni.c.in index 84cad42c488..6a04216cf55 100644 --- a/src/qs8-gemm/MRx8c4-avxvnni.c.in +++ b/src/qs8-gemm/MRx8c4-avxvnni.c.in @@ -89,13 +89,13 @@ void xnn_${DATATYPE_SPEC}_gemm_minmax${REQUANTIZATION_SPEC}_ukernel_${MR}x8c4__$ // XNN_FORCE_REALIZATION(voutput_max); $if DATATYPE == "QC4": const __m256i vvalue_mask = _mm256_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vvalue_mask); + // XNN_FORCE_REALIZATION(vvalue_mask); $if GFNI: const __m256i vshl4 = _mm256_set1_epi64x(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); $else: const __m256i vsign_mask =_mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); $if DATATYPE != "QC8": const __m256 vscale = _mm256_set1_ps(params->${PARAMS_STRUCT}.scale); // XNN_FORCE_REALIZATION(vscale); diff --git a/src/qs8-gemm/MRx8c8-avx512vnni.c.in b/src/qs8-gemm/MRx8c8-avx512vnni.c.in index 1c582b51c45..5c0b6544c88 100644 --- a/src/qs8-gemm/MRx8c8-avx512vnni.c.in +++ b/src/qs8-gemm/MRx8c8-avx512vnni.c.in @@ -82,7 +82,7 @@ void xnn_${DATATYPE_SPEC}_gemm_minmax${REQUANTIZATION_SPEC}_ukernel_${MR}x16c8__ // XNN_FORCE_REALIZATION(voutput_max); $else: const __m128i vsign_mask = _mm_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->${PARAMS_STRUCT}.output_max - (int32_t) params->${PARAMS_STRUCT}.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->${PARAMS_STRUCT}.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->${PARAMS_STRUCT}.output_min); @@ -91,10 +91,10 @@ void xnn_${DATATYPE_SPEC}_gemm_minmax${REQUANTIZATION_SPEC}_ukernel_${MR}x16c8__ // XNN_FORCE_REALIZATION(voutput_min); $if DATATYPE in ["QC4", "QS8_QC4"]: const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); $if GFNI: const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { $if DATATYPE in ["QD8", "QC4"]: const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); diff --git a/src/qs8-gemm/MRx8c8-avxvnni.c.in b/src/qs8-gemm/MRx8c8-avxvnni.c.in index f9953131a5c..25aa9ff6f62 100644 --- a/src/qs8-gemm/MRx8c8-avxvnni.c.in +++ b/src/qs8-gemm/MRx8c8-avxvnni.c.in @@ -108,7 +108,7 @@ void xnn_${DATATYPE_SPEC}_gemm_minmax${REQUANTIZATION_SPEC}_ukernel_${MR}x8c8__$ $else: $if VARIANT != "AVXVNNIINT8": const __m256i vsign_mask = ${_MM256_SET1_EPI8("0x80")}; - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->${PARAMS_STRUCT}.output_max - (int32_t) params->${PARAMS_STRUCT}.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->${PARAMS_STRUCT}.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->${PARAMS_STRUCT}.output_min); diff --git a/src/qs8-igemm/MRx16c4-avx512vnni.c.in b/src/qs8-igemm/MRx16c4-avx512vnni.c.in index ecb2ebe378f..b2a7fe01857 100644 --- a/src/qs8-igemm/MRx16c4-avx512vnni.c.in +++ b/src/qs8-igemm/MRx16c4-avx512vnni.c.in @@ -77,19 +77,19 @@ void xnn_${DATATYPE_SPEC}_igemm_minmax${REQUANTIZATION_SPEC}_ukernel_${MR}x16c4_ const __m512 vinput_inv_scale = _mm512_set1_ps(quantization_params->inv_scale); const __m512 voutput_min = _mm512_set1_ps(params->scalar.min); const __m512 voutput_max = _mm512_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(vinput_zero_point); + // XNN_FORCE_REALIZATION(vinput_zero_point); // XNN_FORCE_REALIZATION(vinput_inv_scale); // XNN_FORCE_REALIZATION(voutput_min); // XNN_FORCE_REALIZATION(voutput_max); $if DATATYPE == "QC4": const __m256i vmask = _mm256_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); $if GFNI: const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); $else: const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); $if DATATYPE != "QC8": const __m512 vscale = _mm512_set1_ps(params->${PARAMS_STRUCT}.scale); // XNN_FORCE_REALIZATION(vscale); diff --git a/src/qs8-igemm/MRx16c8-avx512vnni.c.in b/src/qs8-igemm/MRx16c8-avx512vnni.c.in index c53c4fa857d..d97dd3219bb 100644 --- a/src/qs8-igemm/MRx16c8-avx512vnni.c.in +++ b/src/qs8-igemm/MRx16c8-avx512vnni.c.in @@ -83,13 +83,13 @@ void xnn_${DATATYPE_SPEC}_igemm_minmax${REQUANTIZATION_SPEC}_ukernel_${MR}x16c8_ // XNN_FORCE_REALIZATION(voutput_max); $if DATATYPE == "QC4": const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); $if GFNI: const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); $else: const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); $if DATATYPE != "QC8": const __m512 vscale = _mm512_set1_ps(params->${PARAMS_STRUCT}.scale); // XNN_FORCE_REALIZATION(vscale); @@ -98,7 +98,7 @@ void xnn_${DATATYPE_SPEC}_igemm_minmax${REQUANTIZATION_SPEC}_ukernel_${MR}x16c8_ const __m128i voutput_min = _mm_set1_epi8(params->${PARAMS_STRUCT}.output_min); // XNN_FORCE_REALIZATION(voutput_max_less_zero_point); // XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); + // XNN_FORCE_REALIZATION(voutput_min); do { $if DATATYPE in ["QD8", "QC4"]: const __m512i vksum0123456789ABCDEF = _mm512_load_epi32(w); diff --git a/src/qs8-igemm/MRx8c8-avxvnni.c.in b/src/qs8-igemm/MRx8c8-avxvnni.c.in index 2fb577474d6..3c6da749fb5 100644 --- a/src/qs8-igemm/MRx8c8-avxvnni.c.in +++ b/src/qs8-igemm/MRx8c8-avxvnni.c.in @@ -97,14 +97,14 @@ void xnn_${DATATYPE_SPEC}_igemm_minmax${REQUANTIZATION_SPEC}_ukernel_${MR}x8c8__ // XNN_FORCE_REALIZATION(voutput_max); $if DATATYPE in ["QC4_F16", "QC4_F32"]: const __m256i vmask = _mm256_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); $if GFNI: const __m256i vshl4 = _mm256_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); $else: $if VARIANT != "AVXVNNIINT8": const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); $if DATATYPE != "QC8": const __m256 vscale = _mm256_load_ps(params->${PARAMS_STRUCT}.scale); // XNN_FORCE_REALIZATION(vscale); diff --git a/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-1x8c8-minmax-avx2-madd.c b/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-1x8c8-minmax-avx2-madd.c index 036c85d7079..f55cc7abe6c 100644 --- a/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-1x8c8-minmax-avx2-madd.c +++ b/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-1x8c8-minmax-avx2-madd.c @@ -48,7 +48,7 @@ void xnn_qs8_qc2w_gemm_minmax_fp32_ukernel_1x8c8__avx2_madd( int8_t* c0 = c; const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-1x8c8-minmax-avx256skx-madd.c b/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-1x8c8-minmax-avx256skx-madd.c index e23fb777843..c36c00748e0 100644 --- a/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-1x8c8-minmax-avx256skx-madd.c +++ b/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-1x8c8-minmax-avx256skx-madd.c @@ -48,7 +48,7 @@ void xnn_qs8_qc2w_gemm_minmax_fp32_ukernel_1x8c8__avx256skx_madd( int8_t* c0 = c; const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-1x8c8-minmax-fp32-avxvnni.c b/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-1x8c8-minmax-fp32-avxvnni.c index 827fba4fe78..feecd3a87ab 100644 --- a/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-1x8c8-minmax-fp32-avxvnni.c +++ b/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-1x8c8-minmax-fp32-avxvnni.c @@ -48,7 +48,7 @@ void xnn_qs8_qc2w_gemm_minmax_fp32_ukernel_1x8c8__avxvnni( int8_t* c0 = c; const __m256i vsign_mask = _mm256_gf2p8affine_epi64_epi8(_mm256_setzero_si256(), _mm256_setzero_si256(), 0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-2x8c8-minmax-avx2-madd.c b/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-2x8c8-minmax-avx2-madd.c index d393b576a23..72e38d4ea4e 100644 --- a/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-2x8c8-minmax-avx2-madd.c +++ b/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-2x8c8-minmax-avx2-madd.c @@ -54,7 +54,7 @@ void xnn_qs8_qc2w_gemm_minmax_fp32_ukernel_2x8c8__avx2_madd( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-2x8c8-minmax-avx256skx-madd.c b/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-2x8c8-minmax-avx256skx-madd.c index 47ea111adad..f06d2c3aacb 100644 --- a/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-2x8c8-minmax-avx256skx-madd.c +++ b/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-2x8c8-minmax-avx256skx-madd.c @@ -54,7 +54,7 @@ void xnn_qs8_qc2w_gemm_minmax_fp32_ukernel_2x8c8__avx256skx_madd( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-2x8c8-minmax-fp32-avxvnni.c b/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-2x8c8-minmax-fp32-avxvnni.c index a3973d3cf4c..feea5830f4f 100644 --- a/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-2x8c8-minmax-fp32-avxvnni.c +++ b/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-2x8c8-minmax-fp32-avxvnni.c @@ -54,7 +54,7 @@ void xnn_qs8_qc2w_gemm_minmax_fp32_ukernel_2x8c8__avxvnni( } const __m256i vsign_mask = _mm256_gf2p8affine_epi64_epi8(_mm256_setzero_si256(), _mm256_setzero_si256(), 0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-3x8c8-minmax-avx2-madd.c b/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-3x8c8-minmax-avx2-madd.c index 4207f3858d8..5c111b64a64 100644 --- a/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-3x8c8-minmax-avx2-madd.c +++ b/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-3x8c8-minmax-avx2-madd.c @@ -60,7 +60,7 @@ void xnn_qs8_qc2w_gemm_minmax_fp32_ukernel_3x8c8__avx2_madd( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-3x8c8-minmax-avx256skx-madd.c b/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-3x8c8-minmax-avx256skx-madd.c index 62ea1b722f5..d89fcdf86a5 100644 --- a/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-3x8c8-minmax-avx256skx-madd.c +++ b/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-3x8c8-minmax-avx256skx-madd.c @@ -60,7 +60,7 @@ void xnn_qs8_qc2w_gemm_minmax_fp32_ukernel_3x8c8__avx256skx_madd( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-3x8c8-minmax-fp32-avxvnni.c b/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-3x8c8-minmax-fp32-avxvnni.c index 92d6cb08e34..20313659e89 100644 --- a/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-3x8c8-minmax-fp32-avxvnni.c +++ b/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-3x8c8-minmax-fp32-avxvnni.c @@ -60,7 +60,7 @@ void xnn_qs8_qc2w_gemm_minmax_fp32_ukernel_3x8c8__avxvnni( } const __m256i vsign_mask = _mm256_gf2p8affine_epi64_epi8(_mm256_setzero_si256(), _mm256_setzero_si256(), 0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-4x8c8-minmax-avx2-madd.c b/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-4x8c8-minmax-avx2-madd.c index 62dfcf869e2..e54d919378c 100644 --- a/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-4x8c8-minmax-avx2-madd.c +++ b/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-4x8c8-minmax-avx2-madd.c @@ -66,7 +66,7 @@ void xnn_qs8_qc2w_gemm_minmax_fp32_ukernel_4x8c8__avx2_madd( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-4x8c8-minmax-avx256skx-madd.c b/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-4x8c8-minmax-avx256skx-madd.c index b4fc9a130be..92f6f91d8af 100644 --- a/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-4x8c8-minmax-avx256skx-madd.c +++ b/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-4x8c8-minmax-avx256skx-madd.c @@ -66,7 +66,7 @@ void xnn_qs8_qc2w_gemm_minmax_fp32_ukernel_4x8c8__avx256skx_madd( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-4x8c8-minmax-fp32-avxvnni.c b/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-4x8c8-minmax-fp32-avxvnni.c index 9fe7a987e75..f65a6c74180 100644 --- a/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-4x8c8-minmax-fp32-avxvnni.c +++ b/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-4x8c8-minmax-fp32-avxvnni.c @@ -66,7 +66,7 @@ void xnn_qs8_qc2w_gemm_minmax_fp32_ukernel_4x8c8__avxvnni( } const __m256i vsign_mask = _mm256_gf2p8affine_epi64_epi8(_mm256_setzero_si256(), _mm256_setzero_si256(), 0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-5x8c8-minmax-avx2-madd.c b/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-5x8c8-minmax-avx2-madd.c index ef5c7f0f955..5753854c472 100644 --- a/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-5x8c8-minmax-avx2-madd.c +++ b/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-5x8c8-minmax-avx2-madd.c @@ -72,7 +72,7 @@ void xnn_qs8_qc2w_gemm_minmax_fp32_ukernel_5x8c8__avx2_madd( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-5x8c8-minmax-avx256skx-madd.c b/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-5x8c8-minmax-avx256skx-madd.c index abdfc05f660..da61717b878 100644 --- a/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-5x8c8-minmax-avx256skx-madd.c +++ b/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-5x8c8-minmax-avx256skx-madd.c @@ -72,7 +72,7 @@ void xnn_qs8_qc2w_gemm_minmax_fp32_ukernel_5x8c8__avx256skx_madd( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-5x8c8-minmax-fp32-avxvnni.c b/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-5x8c8-minmax-fp32-avxvnni.c index bb9faeff6f9..d56689101e7 100644 --- a/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-5x8c8-minmax-fp32-avxvnni.c +++ b/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-5x8c8-minmax-fp32-avxvnni.c @@ -72,7 +72,7 @@ void xnn_qs8_qc2w_gemm_minmax_fp32_ukernel_5x8c8__avxvnni( } const __m256i vsign_mask = _mm256_gf2p8affine_epi64_epi8(_mm256_setzero_si256(), _mm256_setzero_si256(), 0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-6x8c8-minmax-avx2-madd.c b/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-6x8c8-minmax-avx2-madd.c index f97baa2bb5e..9fc3a921f01 100644 --- a/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-6x8c8-minmax-avx2-madd.c +++ b/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-6x8c8-minmax-avx2-madd.c @@ -78,7 +78,7 @@ void xnn_qs8_qc2w_gemm_minmax_fp32_ukernel_6x8c8__avx2_madd( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-6x8c8-minmax-avx256skx-madd.c b/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-6x8c8-minmax-avx256skx-madd.c index f18f1b880da..13825e90332 100644 --- a/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-6x8c8-minmax-avx256skx-madd.c +++ b/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-6x8c8-minmax-avx256skx-madd.c @@ -78,7 +78,7 @@ void xnn_qs8_qc2w_gemm_minmax_fp32_ukernel_6x8c8__avx256skx_madd( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-6x8c8-minmax-fp32-avxvnni.c b/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-6x8c8-minmax-fp32-avxvnni.c index ba05584d1ae..46afb3f452a 100644 --- a/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-6x8c8-minmax-fp32-avxvnni.c +++ b/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-6x8c8-minmax-fp32-avxvnni.c @@ -78,7 +78,7 @@ void xnn_qs8_qc2w_gemm_minmax_fp32_ukernel_6x8c8__avxvnni( } const __m256i vsign_mask = _mm256_gf2p8affine_epi64_epi8(_mm256_setzero_si256(), _mm256_setzero_si256(), 0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-7x8c8-minmax-avx2-madd.c b/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-7x8c8-minmax-avx2-madd.c index 383f9b48874..50443990d5d 100644 --- a/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-7x8c8-minmax-avx2-madd.c +++ b/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-7x8c8-minmax-avx2-madd.c @@ -84,7 +84,7 @@ void xnn_qs8_qc2w_gemm_minmax_fp32_ukernel_7x8c8__avx2_madd( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-7x8c8-minmax-avx256skx-madd.c b/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-7x8c8-minmax-avx256skx-madd.c index 572f65c78ac..8d4a5fd724d 100644 --- a/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-7x8c8-minmax-avx256skx-madd.c +++ b/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-7x8c8-minmax-avx256skx-madd.c @@ -84,7 +84,7 @@ void xnn_qs8_qc2w_gemm_minmax_fp32_ukernel_7x8c8__avx256skx_madd( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-7x8c8-minmax-fp32-avxvnni.c b/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-7x8c8-minmax-fp32-avxvnni.c index 49a6784158c..9a21ffae92e 100644 --- a/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-7x8c8-minmax-fp32-avxvnni.c +++ b/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-7x8c8-minmax-fp32-avxvnni.c @@ -84,7 +84,7 @@ void xnn_qs8_qc2w_gemm_minmax_fp32_ukernel_7x8c8__avxvnni( } const __m256i vsign_mask = _mm256_gf2p8affine_epi64_epi8(_mm256_setzero_si256(), _mm256_setzero_si256(), 0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-8x8c8-minmax-avx2-madd.c b/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-8x8c8-minmax-avx2-madd.c index 6211f80cfaa..f9ac5b666aa 100644 --- a/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-8x8c8-minmax-avx2-madd.c +++ b/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-8x8c8-minmax-avx2-madd.c @@ -90,7 +90,7 @@ void xnn_qs8_qc2w_gemm_minmax_fp32_ukernel_8x8c8__avx2_madd( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-8x8c8-minmax-avx256skx-madd.c b/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-8x8c8-minmax-avx256skx-madd.c index 70cd97aada5..3e1df554c95 100644 --- a/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-8x8c8-minmax-avx256skx-madd.c +++ b/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-8x8c8-minmax-avx256skx-madd.c @@ -90,7 +90,7 @@ void xnn_qs8_qc2w_gemm_minmax_fp32_ukernel_8x8c8__avx256skx_madd( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-8x8c8-minmax-fp32-avxvnni.c b/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-8x8c8-minmax-fp32-avxvnni.c index 0768a7b9f3f..2cd29b205f2 100644 --- a/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-8x8c8-minmax-fp32-avxvnni.c +++ b/src/qs8-qc2w-gemm/gen/qs8-qc2w-gemm-8x8c8-minmax-fp32-avxvnni.c @@ -90,7 +90,7 @@ void xnn_qs8_qc2w_gemm_minmax_fp32_ukernel_8x8c8__avxvnni( } const __m256i vsign_mask = _mm256_gf2p8affine_epi64_epi8(_mm256_setzero_si256(), _mm256_setzero_si256(), 0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-10x16c8-minmax-fp32-avx512skx-madd-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-10x16c8-minmax-fp32-avx512skx-madd-prfm.c index 59f17478d7d..7efa3d99232 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-10x16c8-minmax-fp32-avx512skx-madd-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-10x16c8-minmax-fp32-avx512skx-madd-prfm.c @@ -103,7 +103,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_10x16c8__avx512skx_madd_prfm( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); @@ -111,7 +111,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_10x16c8__avx512skx_madd_prfm( // XNN_FORCE_REALIZATION(voutput_zero_point); // XNN_FORCE_REALIZATION(voutput_min); const __m512i vmask = _mm512_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { __m512i vacc0x01234567 = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) w)); __m512i vacc0x89ABCDEF = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) ((const int32_t*) w + 8))); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-10x16c8-minmax-fp32-avx512skx-madd.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-10x16c8-minmax-fp32-avx512skx-madd.c index 87361aa8767..be5d136d934 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-10x16c8-minmax-fp32-avx512skx-madd.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-10x16c8-minmax-fp32-avx512skx-madd.c @@ -102,7 +102,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_10x16c8__avx512skx_madd( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); @@ -110,7 +110,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_10x16c8__avx512skx_madd( // XNN_FORCE_REALIZATION(voutput_zero_point); // XNN_FORCE_REALIZATION(voutput_min); const __m512i vmask = _mm512_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { __m512i vacc0x01234567 = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) w)); __m512i vacc0x89ABCDEF = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) ((const int32_t*) w + 8))); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-10x16c8-minmax-fp32-avx512vnnigfni-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-10x16c8-minmax-fp32-avx512vnnigfni-prfm.c index 93e48791fe2..ec3abc6f77c 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-10x16c8-minmax-fp32-avx512vnnigfni-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-10x16c8-minmax-fp32-avx512vnnigfni-prfm.c @@ -103,7 +103,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_10x16c8__avx512vnnigfni_prfm( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); @@ -111,9 +111,9 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_10x16c8__avx512vnnigfni_prfm( // XNN_FORCE_REALIZATION(voutput_zero_point); // XNN_FORCE_REALIZATION(voutput_min); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { __m512i vacc0x01234567 = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) w)); __m512i vacc0x89ABCDEF = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) ((const int32_t*) w + 8))); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-10x16c8-minmax-fp32-avx512vnnigfni.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-10x16c8-minmax-fp32-avx512vnnigfni.c index 2e22b92140b..af5986ca055 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-10x16c8-minmax-fp32-avx512vnnigfni.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-10x16c8-minmax-fp32-avx512vnnigfni.c @@ -102,7 +102,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_10x16c8__avx512vnnigfni( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); @@ -110,9 +110,9 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_10x16c8__avx512vnnigfni( // XNN_FORCE_REALIZATION(voutput_zero_point); // XNN_FORCE_REALIZATION(voutput_min); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { __m512i vacc0x01234567 = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) w)); __m512i vacc0x89ABCDEF = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) ((const int32_t*) w + 8))); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-12x16c8-minmax-fp32-avx512skx-madd-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-12x16c8-minmax-fp32-avx512skx-madd-prfm.c index cfb3957c4b7..8fa16ee712b 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-12x16c8-minmax-fp32-avx512skx-madd-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-12x16c8-minmax-fp32-avx512skx-madd-prfm.c @@ -115,7 +115,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_12x16c8__avx512skx_madd_prfm( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); @@ -123,7 +123,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_12x16c8__avx512skx_madd_prfm( // XNN_FORCE_REALIZATION(voutput_zero_point); // XNN_FORCE_REALIZATION(voutput_min); const __m512i vmask = _mm512_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { __m512i vacc0x01234567 = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) w)); __m512i vacc0x89ABCDEF = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) ((const int32_t*) w + 8))); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-12x16c8-minmax-fp32-avx512skx-madd.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-12x16c8-minmax-fp32-avx512skx-madd.c index 18f88da5774..f4f76248aa0 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-12x16c8-minmax-fp32-avx512skx-madd.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-12x16c8-minmax-fp32-avx512skx-madd.c @@ -114,7 +114,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_12x16c8__avx512skx_madd( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); @@ -122,7 +122,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_12x16c8__avx512skx_madd( // XNN_FORCE_REALIZATION(voutput_zero_point); // XNN_FORCE_REALIZATION(voutput_min); const __m512i vmask = _mm512_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { __m512i vacc0x01234567 = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) w)); __m512i vacc0x89ABCDEF = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) ((const int32_t*) w + 8))); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-12x16c8-minmax-fp32-avx512vnnigfni-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-12x16c8-minmax-fp32-avx512vnnigfni-prfm.c index 9174ffcbb9b..b2c694c6648 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-12x16c8-minmax-fp32-avx512vnnigfni-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-12x16c8-minmax-fp32-avx512vnnigfni-prfm.c @@ -115,7 +115,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_12x16c8__avx512vnnigfni_prfm( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); @@ -123,9 +123,9 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_12x16c8__avx512vnnigfni_prfm( // XNN_FORCE_REALIZATION(voutput_zero_point); // XNN_FORCE_REALIZATION(voutput_min); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { __m512i vacc0x01234567 = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) w)); __m512i vacc0x89ABCDEF = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) ((const int32_t*) w + 8))); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-12x16c8-minmax-fp32-avx512vnnigfni.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-12x16c8-minmax-fp32-avx512vnnigfni.c index 23f9e045e9c..22378efd925 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-12x16c8-minmax-fp32-avx512vnnigfni.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-12x16c8-minmax-fp32-avx512vnnigfni.c @@ -114,7 +114,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_12x16c8__avx512vnnigfni( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); @@ -122,9 +122,9 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_12x16c8__avx512vnnigfni( // XNN_FORCE_REALIZATION(voutput_zero_point); // XNN_FORCE_REALIZATION(voutput_min); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { __m512i vacc0x01234567 = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) w)); __m512i vacc0x89ABCDEF = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) ((const int32_t*) w + 8))); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-14x16c8-minmax-fp32-avx512skx-madd-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-14x16c8-minmax-fp32-avx512skx-madd-prfm.c index 228e8d3f6eb..6ab311b0aa6 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-14x16c8-minmax-fp32-avx512skx-madd-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-14x16c8-minmax-fp32-avx512skx-madd-prfm.c @@ -127,7 +127,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_14x16c8__avx512skx_madd_prfm( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); @@ -135,7 +135,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_14x16c8__avx512skx_madd_prfm( // XNN_FORCE_REALIZATION(voutput_zero_point); // XNN_FORCE_REALIZATION(voutput_min); const __m512i vmask = _mm512_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { __m512i vacc0x01234567 = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) w)); __m512i vacc0x89ABCDEF = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) ((const int32_t*) w + 8))); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-14x16c8-minmax-fp32-avx512skx-madd.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-14x16c8-minmax-fp32-avx512skx-madd.c index 85a0e381461..9e291d94524 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-14x16c8-minmax-fp32-avx512skx-madd.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-14x16c8-minmax-fp32-avx512skx-madd.c @@ -126,7 +126,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_14x16c8__avx512skx_madd( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); @@ -134,7 +134,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_14x16c8__avx512skx_madd( // XNN_FORCE_REALIZATION(voutput_zero_point); // XNN_FORCE_REALIZATION(voutput_min); const __m512i vmask = _mm512_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { __m512i vacc0x01234567 = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) w)); __m512i vacc0x89ABCDEF = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) ((const int32_t*) w + 8))); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-14x16c8-minmax-fp32-avx512vnnigfni-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-14x16c8-minmax-fp32-avx512vnnigfni-prfm.c index 4564b50280d..6f59e52a7b1 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-14x16c8-minmax-fp32-avx512vnnigfni-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-14x16c8-minmax-fp32-avx512vnnigfni-prfm.c @@ -127,7 +127,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_14x16c8__avx512vnnigfni_prfm( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); @@ -135,9 +135,9 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_14x16c8__avx512vnnigfni_prfm( // XNN_FORCE_REALIZATION(voutput_zero_point); // XNN_FORCE_REALIZATION(voutput_min); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { __m512i vacc0x01234567 = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) w)); __m512i vacc0x89ABCDEF = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) ((const int32_t*) w + 8))); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-14x16c8-minmax-fp32-avx512vnnigfni.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-14x16c8-minmax-fp32-avx512vnnigfni.c index 49727c00306..ecf42e8cc44 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-14x16c8-minmax-fp32-avx512vnnigfni.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-14x16c8-minmax-fp32-avx512vnnigfni.c @@ -126,7 +126,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_14x16c8__avx512vnnigfni( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); @@ -134,9 +134,9 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_14x16c8__avx512vnnigfni( // XNN_FORCE_REALIZATION(voutput_zero_point); // XNN_FORCE_REALIZATION(voutput_min); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { __m512i vacc0x01234567 = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) w)); __m512i vacc0x89ABCDEF = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) ((const int32_t*) w + 8))); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x16c8-minmax-fp32-avx512skx-madd-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x16c8-minmax-fp32-avx512skx-madd-prfm.c index 22c17da532b..6f763e946ae 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x16c8-minmax-fp32-avx512skx-madd-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x16c8-minmax-fp32-avx512skx-madd-prfm.c @@ -49,7 +49,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_1x16c8__avx512skx_madd_prfm( int8_t* c0 = c; const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); @@ -57,7 +57,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_1x16c8__avx512skx_madd_prfm( // XNN_FORCE_REALIZATION(voutput_zero_point); // XNN_FORCE_REALIZATION(voutput_min); const __m512i vmask = _mm512_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { __m512i vacc0x01234567 = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) w)); __m512i vacc0x89ABCDEF = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) ((const int32_t*) w + 8))); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x16c8-minmax-fp32-avx512skx-madd.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x16c8-minmax-fp32-avx512skx-madd.c index e4af448486e..f6938913f1d 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x16c8-minmax-fp32-avx512skx-madd.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x16c8-minmax-fp32-avx512skx-madd.c @@ -48,7 +48,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_1x16c8__avx512skx_madd( int8_t* c0 = c; const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); @@ -56,7 +56,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_1x16c8__avx512skx_madd( // XNN_FORCE_REALIZATION(voutput_zero_point); // XNN_FORCE_REALIZATION(voutput_min); const __m512i vmask = _mm512_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { __m512i vacc0x01234567 = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) w)); __m512i vacc0x89ABCDEF = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) ((const int32_t*) w + 8))); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x16c8-minmax-fp32-avx512vnnigfni-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x16c8-minmax-fp32-avx512vnnigfni-prfm.c index 18801c46977..9183cee6273 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x16c8-minmax-fp32-avx512vnnigfni-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x16c8-minmax-fp32-avx512vnnigfni-prfm.c @@ -49,7 +49,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_1x16c8__avx512vnnigfni_prfm( int8_t* c0 = c; const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); @@ -57,9 +57,9 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_1x16c8__avx512vnnigfni_prfm( // XNN_FORCE_REALIZATION(voutput_zero_point); // XNN_FORCE_REALIZATION(voutput_min); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { __m512i vacc0x01234567 = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) w)); __m512i vacc0x89ABCDEF = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) ((const int32_t*) w + 8))); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x16c8-minmax-fp32-avx512vnnigfni.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x16c8-minmax-fp32-avx512vnnigfni.c index 2f945a51418..25b9d384842 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x16c8-minmax-fp32-avx512vnnigfni.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x16c8-minmax-fp32-avx512vnnigfni.c @@ -48,7 +48,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_1x16c8__avx512vnnigfni( int8_t* c0 = c; const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); @@ -56,9 +56,9 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_1x16c8__avx512vnnigfni( // XNN_FORCE_REALIZATION(voutput_zero_point); // XNN_FORCE_REALIZATION(voutput_min); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { __m512i vacc0x01234567 = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) w)); __m512i vacc0x89ABCDEF = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) ((const int32_t*) w + 8))); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x4c8-minmax-avx-madd-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x4c8-minmax-avx-madd-prfm.c index 612bc7349c6..3b4f064fe89 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x4c8-minmax-avx-madd-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x4c8-minmax-avx-madd-prfm.c @@ -49,12 +49,12 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_1x4c8__avx_madd_prfm( int8_t* c0 = c; const __m128i vsign_mask = _mm_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m128 voutput_max_less_zero_point = _mm_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m128i voutput_zero_point = _mm_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi16(params->fp32_scalar.output_min); const __m128i vmask = _mm_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { __m128i vacc0x01 = _mm_cvtepu32_epi64(_mm_loadu_si64(w)); __m128i vacc0x23 = _mm_cvtepu32_epi64(_mm_loadu_si64(((const int32_t*) w + 2))); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x4c8-minmax-avx-madd.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x4c8-minmax-avx-madd.c index a1c66e92e63..e543aa9247a 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x4c8-minmax-avx-madd.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x4c8-minmax-avx-madd.c @@ -48,12 +48,12 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_1x4c8__avx_madd( int8_t* c0 = c; const __m128i vsign_mask = _mm_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m128 voutput_max_less_zero_point = _mm_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m128i voutput_zero_point = _mm_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi16(params->fp32_scalar.output_min); const __m128i vmask = _mm_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { __m128i vacc0x01 = _mm_cvtepu32_epi64(_mm_loadu_si64(w)); __m128i vacc0x23 = _mm_cvtepu32_epi64(_mm_loadu_si64(((const int32_t*) w + 2))); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x4c8-minmax-ssse3-madd-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x4c8-minmax-ssse3-madd-prfm.c index 83508518d7f..c7b8d958161 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x4c8-minmax-ssse3-madd-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x4c8-minmax-ssse3-madd-prfm.c @@ -49,12 +49,12 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_1x4c8__ssse3_madd_prfm( int8_t* c0 = c; const __m128i vsign_mask = _mm_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m128 voutput_max_less_zero_point = _mm_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m128i voutput_zero_point = _mm_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi16(params->fp32_scalar.output_min); const __m128i vmask = _mm_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m128i vksum0123 = _mm_load_si128(w); __m128i vacc0x01 = _mm_unpacklo_epi32(vksum0123, _mm_setzero_si128()); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x4c8-minmax-ssse3-madd.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x4c8-minmax-ssse3-madd.c index 6ccd28abe90..d998058cc8a 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x4c8-minmax-ssse3-madd.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x4c8-minmax-ssse3-madd.c @@ -48,12 +48,12 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_1x4c8__ssse3_madd( int8_t* c0 = c; const __m128i vsign_mask = _mm_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m128 voutput_max_less_zero_point = _mm_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m128i voutput_zero_point = _mm_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi16(params->fp32_scalar.output_min); const __m128i vmask = _mm_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m128i vksum0123 = _mm_load_si128(w); __m128i vacc0x01 = _mm_unpacklo_epi32(vksum0123, _mm_setzero_si128()); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x8c8-minmax-avx2-madd-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x8c8-minmax-avx2-madd-prfm.c index 88c42cfb3c4..5e8846e3374 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x8c8-minmax-avx2-madd-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x8c8-minmax-avx2-madd-prfm.c @@ -49,7 +49,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_1x8c8__avx2_madd_prfm( int8_t* c0 = c; const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x8c8-minmax-avx2-madd.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x8c8-minmax-avx2-madd.c index b799c1de957..fe76552ce05 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x8c8-minmax-avx2-madd.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x8c8-minmax-avx2-madd.c @@ -48,7 +48,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_1x8c8__avx2_madd( int8_t* c0 = c; const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x8c8-minmax-avxvnni-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x8c8-minmax-avxvnni-prfm.c index 1e337f4f3f3..936b15c0bca 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x8c8-minmax-avxvnni-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x8c8-minmax-avxvnni-prfm.c @@ -49,7 +49,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_1x8c8__avxvnni_prfm( int8_t* c0 = c; const __m256i vsign_mask = _mm256_gf2p8affine_epi64_epi8(_mm256_setzero_si256(), _mm256_setzero_si256(), 0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x8c8-minmax-avxvnni.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x8c8-minmax-avxvnni.c index e580cef7e88..913315a45cd 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x8c8-minmax-avxvnni.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x8c8-minmax-avxvnni.c @@ -48,7 +48,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_1x8c8__avxvnni( int8_t* c0 = c; const __m256i vsign_mask = _mm256_gf2p8affine_epi64_epi8(_mm256_setzero_si256(), _mm256_setzero_si256(), 0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x8c8-minmax-fp32-avx256skx-madd-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x8c8-minmax-fp32-avx256skx-madd-prfm.c index 622cd838e94..7e4eec86b17 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x8c8-minmax-fp32-avx256skx-madd-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x8c8-minmax-fp32-avx256skx-madd-prfm.c @@ -49,7 +49,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_1x8c8__avx256skx_madd_prfm( int8_t* c0 = c; const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x8c8-minmax-fp32-avx256skx-madd.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x8c8-minmax-fp32-avx256skx-madd.c index b29217fa2d5..defae02ae3e 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x8c8-minmax-fp32-avx256skx-madd.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x8c8-minmax-fp32-avx256skx-madd.c @@ -48,7 +48,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_1x8c8__avx256skx_madd( int8_t* c0 = c; const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x8c8-minmax-fp32-avx256vnni-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x8c8-minmax-fp32-avx256vnni-prfm.c index d523e80d725..7f3cc6f775e 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x8c8-minmax-fp32-avx256vnni-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x8c8-minmax-fp32-avx256vnni-prfm.c @@ -49,7 +49,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_1x8c8__avx256vnni_prfm( int8_t* c0 = c; const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x8c8-minmax-fp32-avx256vnni.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x8c8-minmax-fp32-avx256vnni.c index 9a6f1853f21..3151d9a2a14 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x8c8-minmax-fp32-avx256vnni.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-1x8c8-minmax-fp32-avx256vnni.c @@ -48,7 +48,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_1x8c8__avx256vnni( int8_t* c0 = c; const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-2x4c8-minmax-avx-madd-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-2x4c8-minmax-avx-madd-prfm.c index 8901f10ebe2..62ae0a64fca 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-2x4c8-minmax-avx-madd-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-2x4c8-minmax-avx-madd-prfm.c @@ -55,12 +55,12 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_2x4c8__avx_madd_prfm( } const __m128i vsign_mask = _mm_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m128 voutput_max_less_zero_point = _mm_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m128i voutput_zero_point = _mm_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi16(params->fp32_scalar.output_min); const __m128i vmask = _mm_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { __m128i vacc0x01 = _mm_cvtepu32_epi64(_mm_loadu_si64(w)); __m128i vacc0x23 = _mm_cvtepu32_epi64(_mm_loadu_si64(((const int32_t*) w + 2))); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-2x4c8-minmax-avx-madd.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-2x4c8-minmax-avx-madd.c index c6f239fd817..a1a08d7044d 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-2x4c8-minmax-avx-madd.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-2x4c8-minmax-avx-madd.c @@ -54,12 +54,12 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_2x4c8__avx_madd( } const __m128i vsign_mask = _mm_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m128 voutput_max_less_zero_point = _mm_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m128i voutput_zero_point = _mm_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi16(params->fp32_scalar.output_min); const __m128i vmask = _mm_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { __m128i vacc0x01 = _mm_cvtepu32_epi64(_mm_loadu_si64(w)); __m128i vacc0x23 = _mm_cvtepu32_epi64(_mm_loadu_si64(((const int32_t*) w + 2))); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-2x4c8-minmax-ssse3-madd-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-2x4c8-minmax-ssse3-madd-prfm.c index 4ab1c128e90..51c04134dae 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-2x4c8-minmax-ssse3-madd-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-2x4c8-minmax-ssse3-madd-prfm.c @@ -55,12 +55,12 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_2x4c8__ssse3_madd_prfm( } const __m128i vsign_mask = _mm_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m128 voutput_max_less_zero_point = _mm_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m128i voutput_zero_point = _mm_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi16(params->fp32_scalar.output_min); const __m128i vmask = _mm_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m128i vksum0123 = _mm_load_si128(w); __m128i vacc0x01 = _mm_unpacklo_epi32(vksum0123, _mm_setzero_si128()); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-2x4c8-minmax-ssse3-madd.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-2x4c8-minmax-ssse3-madd.c index 33f5e661641..b929479edfd 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-2x4c8-minmax-ssse3-madd.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-2x4c8-minmax-ssse3-madd.c @@ -54,12 +54,12 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_2x4c8__ssse3_madd( } const __m128i vsign_mask = _mm_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m128 voutput_max_less_zero_point = _mm_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m128i voutput_zero_point = _mm_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi16(params->fp32_scalar.output_min); const __m128i vmask = _mm_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m128i vksum0123 = _mm_load_si128(w); __m128i vacc0x01 = _mm_unpacklo_epi32(vksum0123, _mm_setzero_si128()); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-2x8c8-minmax-avx2-madd-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-2x8c8-minmax-avx2-madd-prfm.c index c26a452369c..e1e5deec94c 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-2x8c8-minmax-avx2-madd-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-2x8c8-minmax-avx2-madd-prfm.c @@ -55,7 +55,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_2x8c8__avx2_madd_prfm( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-2x8c8-minmax-avx2-madd.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-2x8c8-minmax-avx2-madd.c index f368272e7cd..7d2eeff3d2a 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-2x8c8-minmax-avx2-madd.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-2x8c8-minmax-avx2-madd.c @@ -54,7 +54,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_2x8c8__avx2_madd( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-2x8c8-minmax-avxvnni-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-2x8c8-minmax-avxvnni-prfm.c index 6aeeb9ede39..07bf03b3d68 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-2x8c8-minmax-avxvnni-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-2x8c8-minmax-avxvnni-prfm.c @@ -55,7 +55,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_2x8c8__avxvnni_prfm( } const __m256i vsign_mask = _mm256_gf2p8affine_epi64_epi8(_mm256_setzero_si256(), _mm256_setzero_si256(), 0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-2x8c8-minmax-avxvnni.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-2x8c8-minmax-avxvnni.c index 923ba32ed52..c49825fd455 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-2x8c8-minmax-avxvnni.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-2x8c8-minmax-avxvnni.c @@ -54,7 +54,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_2x8c8__avxvnni( } const __m256i vsign_mask = _mm256_gf2p8affine_epi64_epi8(_mm256_setzero_si256(), _mm256_setzero_si256(), 0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-2x8c8-minmax-fp32-avx256skx-madd-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-2x8c8-minmax-fp32-avx256skx-madd-prfm.c index 1cd51347644..9423ce20fc2 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-2x8c8-minmax-fp32-avx256skx-madd-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-2x8c8-minmax-fp32-avx256skx-madd-prfm.c @@ -55,7 +55,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_2x8c8__avx256skx_madd_prfm( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-2x8c8-minmax-fp32-avx256skx-madd.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-2x8c8-minmax-fp32-avx256skx-madd.c index b80ff6db2c7..a825caaf04b 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-2x8c8-minmax-fp32-avx256skx-madd.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-2x8c8-minmax-fp32-avx256skx-madd.c @@ -54,7 +54,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_2x8c8__avx256skx_madd( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-2x8c8-minmax-fp32-avx256vnni-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-2x8c8-minmax-fp32-avx256vnni-prfm.c index 463f4f3e11f..c98b06731a2 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-2x8c8-minmax-fp32-avx256vnni-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-2x8c8-minmax-fp32-avx256vnni-prfm.c @@ -55,7 +55,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_2x8c8__avx256vnni_prfm( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-2x8c8-minmax-fp32-avx256vnni.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-2x8c8-minmax-fp32-avx256vnni.c index b2e1376fb7c..01c90fe30fc 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-2x8c8-minmax-fp32-avx256vnni.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-2x8c8-minmax-fp32-avx256vnni.c @@ -54,7 +54,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_2x8c8__avx256vnni( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-3x4c8-minmax-avx-madd-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-3x4c8-minmax-avx-madd-prfm.c index 59afa136313..a4d7e517ec1 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-3x4c8-minmax-avx-madd-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-3x4c8-minmax-avx-madd-prfm.c @@ -61,12 +61,12 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_3x4c8__avx_madd_prfm( } const __m128i vsign_mask = _mm_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m128 voutput_max_less_zero_point = _mm_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m128i voutput_zero_point = _mm_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi16(params->fp32_scalar.output_min); const __m128i vmask = _mm_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { __m128i vacc0x01 = _mm_cvtepu32_epi64(_mm_loadu_si64(w)); __m128i vacc0x23 = _mm_cvtepu32_epi64(_mm_loadu_si64(((const int32_t*) w + 2))); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-3x4c8-minmax-avx-madd.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-3x4c8-minmax-avx-madd.c index d3ce4c4fdd9..095848ae755 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-3x4c8-minmax-avx-madd.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-3x4c8-minmax-avx-madd.c @@ -60,12 +60,12 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_3x4c8__avx_madd( } const __m128i vsign_mask = _mm_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m128 voutput_max_less_zero_point = _mm_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m128i voutput_zero_point = _mm_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi16(params->fp32_scalar.output_min); const __m128i vmask = _mm_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { __m128i vacc0x01 = _mm_cvtepu32_epi64(_mm_loadu_si64(w)); __m128i vacc0x23 = _mm_cvtepu32_epi64(_mm_loadu_si64(((const int32_t*) w + 2))); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-3x4c8-minmax-ssse3-madd-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-3x4c8-minmax-ssse3-madd-prfm.c index ca0e0989a6e..49122b678cd 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-3x4c8-minmax-ssse3-madd-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-3x4c8-minmax-ssse3-madd-prfm.c @@ -61,12 +61,12 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_3x4c8__ssse3_madd_prfm( } const __m128i vsign_mask = _mm_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m128 voutput_max_less_zero_point = _mm_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m128i voutput_zero_point = _mm_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi16(params->fp32_scalar.output_min); const __m128i vmask = _mm_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m128i vksum0123 = _mm_load_si128(w); __m128i vacc0x01 = _mm_unpacklo_epi32(vksum0123, _mm_setzero_si128()); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-3x4c8-minmax-ssse3-madd.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-3x4c8-minmax-ssse3-madd.c index 207144a4c00..8ddcb994698 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-3x4c8-minmax-ssse3-madd.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-3x4c8-minmax-ssse3-madd.c @@ -60,12 +60,12 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_3x4c8__ssse3_madd( } const __m128i vsign_mask = _mm_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m128 voutput_max_less_zero_point = _mm_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m128i voutput_zero_point = _mm_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi16(params->fp32_scalar.output_min); const __m128i vmask = _mm_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m128i vksum0123 = _mm_load_si128(w); __m128i vacc0x01 = _mm_unpacklo_epi32(vksum0123, _mm_setzero_si128()); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-3x8c8-minmax-avx2-madd-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-3x8c8-minmax-avx2-madd-prfm.c index 80aa3edbf1e..f427c4bf2f3 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-3x8c8-minmax-avx2-madd-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-3x8c8-minmax-avx2-madd-prfm.c @@ -61,7 +61,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_3x8c8__avx2_madd_prfm( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-3x8c8-minmax-avx2-madd.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-3x8c8-minmax-avx2-madd.c index 5aeb2bcad12..b11fecc0348 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-3x8c8-minmax-avx2-madd.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-3x8c8-minmax-avx2-madd.c @@ -60,7 +60,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_3x8c8__avx2_madd( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-3x8c8-minmax-avxvnni-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-3x8c8-minmax-avxvnni-prfm.c index eeef949c7a1..c1a74e4a4f5 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-3x8c8-minmax-avxvnni-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-3x8c8-minmax-avxvnni-prfm.c @@ -61,7 +61,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_3x8c8__avxvnni_prfm( } const __m256i vsign_mask = _mm256_gf2p8affine_epi64_epi8(_mm256_setzero_si256(), _mm256_setzero_si256(), 0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-3x8c8-minmax-avxvnni.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-3x8c8-minmax-avxvnni.c index 8e4db35a79c..2da59706be9 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-3x8c8-minmax-avxvnni.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-3x8c8-minmax-avxvnni.c @@ -60,7 +60,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_3x8c8__avxvnni( } const __m256i vsign_mask = _mm256_gf2p8affine_epi64_epi8(_mm256_setzero_si256(), _mm256_setzero_si256(), 0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-3x8c8-minmax-fp32-avx256skx-madd-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-3x8c8-minmax-fp32-avx256skx-madd-prfm.c index 2d57562d577..86b980fde82 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-3x8c8-minmax-fp32-avx256skx-madd-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-3x8c8-minmax-fp32-avx256skx-madd-prfm.c @@ -61,7 +61,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_3x8c8__avx256skx_madd_prfm( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-3x8c8-minmax-fp32-avx256skx-madd.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-3x8c8-minmax-fp32-avx256skx-madd.c index 6e86768dfac..8f5d81ad258 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-3x8c8-minmax-fp32-avx256skx-madd.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-3x8c8-minmax-fp32-avx256skx-madd.c @@ -60,7 +60,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_3x8c8__avx256skx_madd( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-3x8c8-minmax-fp32-avx256vnni-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-3x8c8-minmax-fp32-avx256vnni-prfm.c index 1be6ab02ee8..d1723d656a4 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-3x8c8-minmax-fp32-avx256vnni-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-3x8c8-minmax-fp32-avx256vnni-prfm.c @@ -61,7 +61,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_3x8c8__avx256vnni_prfm( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-3x8c8-minmax-fp32-avx256vnni.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-3x8c8-minmax-fp32-avx256vnni.c index 49599c986a3..3b29d540f06 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-3x8c8-minmax-fp32-avx256vnni.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-3x8c8-minmax-fp32-avx256vnni.c @@ -60,7 +60,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_3x8c8__avx256vnni( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-4x4c8-minmax-avx-madd-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-4x4c8-minmax-avx-madd-prfm.c index 511660e920e..21b4a21fe1c 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-4x4c8-minmax-avx-madd-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-4x4c8-minmax-avx-madd-prfm.c @@ -67,12 +67,12 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_4x4c8__avx_madd_prfm( } const __m128i vsign_mask = _mm_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m128 voutput_max_less_zero_point = _mm_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m128i voutput_zero_point = _mm_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi16(params->fp32_scalar.output_min); const __m128i vmask = _mm_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { __m128i vacc0x01 = _mm_cvtepu32_epi64(_mm_loadu_si64(w)); __m128i vacc0x23 = _mm_cvtepu32_epi64(_mm_loadu_si64(((const int32_t*) w + 2))); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-4x4c8-minmax-avx-madd.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-4x4c8-minmax-avx-madd.c index 7f6ac63a314..6652360841a 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-4x4c8-minmax-avx-madd.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-4x4c8-minmax-avx-madd.c @@ -66,12 +66,12 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_4x4c8__avx_madd( } const __m128i vsign_mask = _mm_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m128 voutput_max_less_zero_point = _mm_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m128i voutput_zero_point = _mm_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi16(params->fp32_scalar.output_min); const __m128i vmask = _mm_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { __m128i vacc0x01 = _mm_cvtepu32_epi64(_mm_loadu_si64(w)); __m128i vacc0x23 = _mm_cvtepu32_epi64(_mm_loadu_si64(((const int32_t*) w + 2))); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-4x4c8-minmax-ssse3-madd-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-4x4c8-minmax-ssse3-madd-prfm.c index a83ddd8e945..4a20ed336ee 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-4x4c8-minmax-ssse3-madd-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-4x4c8-minmax-ssse3-madd-prfm.c @@ -67,12 +67,12 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_4x4c8__ssse3_madd_prfm( } const __m128i vsign_mask = _mm_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m128 voutput_max_less_zero_point = _mm_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m128i voutput_zero_point = _mm_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi16(params->fp32_scalar.output_min); const __m128i vmask = _mm_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m128i vksum0123 = _mm_load_si128(w); __m128i vacc0x01 = _mm_unpacklo_epi32(vksum0123, _mm_setzero_si128()); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-4x4c8-minmax-ssse3-madd.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-4x4c8-minmax-ssse3-madd.c index d5420ba1489..b5e7dbc6fed 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-4x4c8-minmax-ssse3-madd.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-4x4c8-minmax-ssse3-madd.c @@ -66,12 +66,12 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_4x4c8__ssse3_madd( } const __m128i vsign_mask = _mm_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m128 voutput_max_less_zero_point = _mm_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m128i voutput_zero_point = _mm_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi16(params->fp32_scalar.output_min); const __m128i vmask = _mm_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m128i vksum0123 = _mm_load_si128(w); __m128i vacc0x01 = _mm_unpacklo_epi32(vksum0123, _mm_setzero_si128()); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-4x8c8-minmax-avx2-madd-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-4x8c8-minmax-avx2-madd-prfm.c index 75c9a7387e8..2b36a097ecf 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-4x8c8-minmax-avx2-madd-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-4x8c8-minmax-avx2-madd-prfm.c @@ -67,7 +67,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_4x8c8__avx2_madd_prfm( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-4x8c8-minmax-avx2-madd.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-4x8c8-minmax-avx2-madd.c index 5bee1fc2e8e..b1027a20849 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-4x8c8-minmax-avx2-madd.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-4x8c8-minmax-avx2-madd.c @@ -66,7 +66,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_4x8c8__avx2_madd( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-4x8c8-minmax-avxvnni-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-4x8c8-minmax-avxvnni-prfm.c index 69e22d6ae49..df36b67e64e 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-4x8c8-minmax-avxvnni-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-4x8c8-minmax-avxvnni-prfm.c @@ -67,7 +67,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_4x8c8__avxvnni_prfm( } const __m256i vsign_mask = _mm256_gf2p8affine_epi64_epi8(_mm256_setzero_si256(), _mm256_setzero_si256(), 0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-4x8c8-minmax-avxvnni.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-4x8c8-minmax-avxvnni.c index df911ddd1ae..dc77db2249a 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-4x8c8-minmax-avxvnni.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-4x8c8-minmax-avxvnni.c @@ -66,7 +66,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_4x8c8__avxvnni( } const __m256i vsign_mask = _mm256_gf2p8affine_epi64_epi8(_mm256_setzero_si256(), _mm256_setzero_si256(), 0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-4x8c8-minmax-fp32-avx256skx-madd-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-4x8c8-minmax-fp32-avx256skx-madd-prfm.c index 7d2ee1a6763..4aa7c5ed5ea 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-4x8c8-minmax-fp32-avx256skx-madd-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-4x8c8-minmax-fp32-avx256skx-madd-prfm.c @@ -67,7 +67,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_4x8c8__avx256skx_madd_prfm( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-4x8c8-minmax-fp32-avx256skx-madd.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-4x8c8-minmax-fp32-avx256skx-madd.c index 0a9ad5f582e..e2a87e466ec 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-4x8c8-minmax-fp32-avx256skx-madd.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-4x8c8-minmax-fp32-avx256skx-madd.c @@ -66,7 +66,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_4x8c8__avx256skx_madd( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-4x8c8-minmax-fp32-avx256vnni-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-4x8c8-minmax-fp32-avx256vnni-prfm.c index 32209dd412a..fb2fafdd260 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-4x8c8-minmax-fp32-avx256vnni-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-4x8c8-minmax-fp32-avx256vnni-prfm.c @@ -67,7 +67,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_4x8c8__avx256vnni_prfm( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-4x8c8-minmax-fp32-avx256vnni.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-4x8c8-minmax-fp32-avx256vnni.c index 2be7b0b3250..c5a62ceac2d 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-4x8c8-minmax-fp32-avx256vnni.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-4x8c8-minmax-fp32-avx256vnni.c @@ -66,7 +66,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_4x8c8__avx256vnni( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x16c8-minmax-fp32-avx512skx-madd-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x16c8-minmax-fp32-avx512skx-madd-prfm.c index 273ab6ec309..ed90ca1ba5d 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x16c8-minmax-fp32-avx512skx-madd-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x16c8-minmax-fp32-avx512skx-madd-prfm.c @@ -73,7 +73,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_5x16c8__avx512skx_madd_prfm( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); @@ -81,7 +81,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_5x16c8__avx512skx_madd_prfm( // XNN_FORCE_REALIZATION(voutput_zero_point); // XNN_FORCE_REALIZATION(voutput_min); const __m512i vmask = _mm512_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { __m512i vacc0x01234567 = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) w)); __m512i vacc0x89ABCDEF = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) ((const int32_t*) w + 8))); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x16c8-minmax-fp32-avx512skx-madd.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x16c8-minmax-fp32-avx512skx-madd.c index 47eeb2df59c..da400451424 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x16c8-minmax-fp32-avx512skx-madd.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x16c8-minmax-fp32-avx512skx-madd.c @@ -72,7 +72,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_5x16c8__avx512skx_madd( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); @@ -80,7 +80,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_5x16c8__avx512skx_madd( // XNN_FORCE_REALIZATION(voutput_zero_point); // XNN_FORCE_REALIZATION(voutput_min); const __m512i vmask = _mm512_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { __m512i vacc0x01234567 = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) w)); __m512i vacc0x89ABCDEF = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) ((const int32_t*) w + 8))); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x16c8-minmax-fp32-avx512vnnigfni-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x16c8-minmax-fp32-avx512vnnigfni-prfm.c index a8388b595a3..423f2c8fa01 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x16c8-minmax-fp32-avx512vnnigfni-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x16c8-minmax-fp32-avx512vnnigfni-prfm.c @@ -73,7 +73,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_5x16c8__avx512vnnigfni_prfm( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); @@ -81,9 +81,9 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_5x16c8__avx512vnnigfni_prfm( // XNN_FORCE_REALIZATION(voutput_zero_point); // XNN_FORCE_REALIZATION(voutput_min); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { __m512i vacc0x01234567 = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) w)); __m512i vacc0x89ABCDEF = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) ((const int32_t*) w + 8))); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x16c8-minmax-fp32-avx512vnnigfni.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x16c8-minmax-fp32-avx512vnnigfni.c index 2a3b000702d..c69e1d6ec8a 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x16c8-minmax-fp32-avx512vnnigfni.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x16c8-minmax-fp32-avx512vnnigfni.c @@ -72,7 +72,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_5x16c8__avx512vnnigfni( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); @@ -80,9 +80,9 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_5x16c8__avx512vnnigfni( // XNN_FORCE_REALIZATION(voutput_zero_point); // XNN_FORCE_REALIZATION(voutput_min); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { __m512i vacc0x01234567 = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) w)); __m512i vacc0x89ABCDEF = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) ((const int32_t*) w + 8))); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x4c8-minmax-avx-madd-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x4c8-minmax-avx-madd-prfm.c index bed9aed9be0..0b2c26f4eed 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x4c8-minmax-avx-madd-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x4c8-minmax-avx-madd-prfm.c @@ -73,12 +73,12 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_5x4c8__avx_madd_prfm( } const __m128i vsign_mask = _mm_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m128 voutput_max_less_zero_point = _mm_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m128i voutput_zero_point = _mm_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi16(params->fp32_scalar.output_min); const __m128i vmask = _mm_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { __m128i vacc0x01 = _mm_cvtepu32_epi64(_mm_loadu_si64(w)); __m128i vacc0x23 = _mm_cvtepu32_epi64(_mm_loadu_si64(((const int32_t*) w + 2))); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x4c8-minmax-avx-madd.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x4c8-minmax-avx-madd.c index 588df7d00b0..8f7b6df5666 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x4c8-minmax-avx-madd.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x4c8-minmax-avx-madd.c @@ -72,12 +72,12 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_5x4c8__avx_madd( } const __m128i vsign_mask = _mm_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m128 voutput_max_less_zero_point = _mm_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m128i voutput_zero_point = _mm_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi16(params->fp32_scalar.output_min); const __m128i vmask = _mm_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { __m128i vacc0x01 = _mm_cvtepu32_epi64(_mm_loadu_si64(w)); __m128i vacc0x23 = _mm_cvtepu32_epi64(_mm_loadu_si64(((const int32_t*) w + 2))); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x4c8-minmax-ssse3-madd-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x4c8-minmax-ssse3-madd-prfm.c index 27a80a93f28..920f9cb040e 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x4c8-minmax-ssse3-madd-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x4c8-minmax-ssse3-madd-prfm.c @@ -73,12 +73,12 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_5x4c8__ssse3_madd_prfm( } const __m128i vsign_mask = _mm_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m128 voutput_max_less_zero_point = _mm_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m128i voutput_zero_point = _mm_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi16(params->fp32_scalar.output_min); const __m128i vmask = _mm_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m128i vksum0123 = _mm_load_si128(w); __m128i vacc0x01 = _mm_unpacklo_epi32(vksum0123, _mm_setzero_si128()); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x4c8-minmax-ssse3-madd.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x4c8-minmax-ssse3-madd.c index 5b14cc0eabf..c9160884322 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x4c8-minmax-ssse3-madd.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x4c8-minmax-ssse3-madd.c @@ -72,12 +72,12 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_5x4c8__ssse3_madd( } const __m128i vsign_mask = _mm_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m128 voutput_max_less_zero_point = _mm_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m128i voutput_zero_point = _mm_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi16(params->fp32_scalar.output_min); const __m128i vmask = _mm_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m128i vksum0123 = _mm_load_si128(w); __m128i vacc0x01 = _mm_unpacklo_epi32(vksum0123, _mm_setzero_si128()); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x8c8-minmax-avx2-madd-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x8c8-minmax-avx2-madd-prfm.c index 6ce142c410c..22f1c274bb0 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x8c8-minmax-avx2-madd-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x8c8-minmax-avx2-madd-prfm.c @@ -73,7 +73,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_5x8c8__avx2_madd_prfm( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x8c8-minmax-avx2-madd.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x8c8-minmax-avx2-madd.c index c08c380d85a..1c572a80ec5 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x8c8-minmax-avx2-madd.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x8c8-minmax-avx2-madd.c @@ -72,7 +72,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_5x8c8__avx2_madd( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x8c8-minmax-avxvnni-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x8c8-minmax-avxvnni-prfm.c index 37aceb80d8e..cb6e982d3b1 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x8c8-minmax-avxvnni-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x8c8-minmax-avxvnni-prfm.c @@ -73,7 +73,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_5x8c8__avxvnni_prfm( } const __m256i vsign_mask = _mm256_gf2p8affine_epi64_epi8(_mm256_setzero_si256(), _mm256_setzero_si256(), 0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x8c8-minmax-avxvnni.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x8c8-minmax-avxvnni.c index af1da0b7911..610a8d9295d 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x8c8-minmax-avxvnni.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x8c8-minmax-avxvnni.c @@ -72,7 +72,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_5x8c8__avxvnni( } const __m256i vsign_mask = _mm256_gf2p8affine_epi64_epi8(_mm256_setzero_si256(), _mm256_setzero_si256(), 0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x8c8-minmax-fp32-avx256skx-madd-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x8c8-minmax-fp32-avx256skx-madd-prfm.c index c6969c3f71b..1af9ee14cee 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x8c8-minmax-fp32-avx256skx-madd-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x8c8-minmax-fp32-avx256skx-madd-prfm.c @@ -73,7 +73,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_5x8c8__avx256skx_madd_prfm( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x8c8-minmax-fp32-avx256skx-madd.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x8c8-minmax-fp32-avx256skx-madd.c index ea20df08693..8437f40c39f 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x8c8-minmax-fp32-avx256skx-madd.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x8c8-minmax-fp32-avx256skx-madd.c @@ -72,7 +72,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_5x8c8__avx256skx_madd( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x8c8-minmax-fp32-avx256vnni-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x8c8-minmax-fp32-avx256vnni-prfm.c index f55e06c4e4c..7c78f25a26a 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x8c8-minmax-fp32-avx256vnni-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x8c8-minmax-fp32-avx256vnni-prfm.c @@ -73,7 +73,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_5x8c8__avx256vnni_prfm( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x8c8-minmax-fp32-avx256vnni.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x8c8-minmax-fp32-avx256vnni.c index e9dbdb59528..4a688981881 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x8c8-minmax-fp32-avx256vnni.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-5x8c8-minmax-fp32-avx256vnni.c @@ -72,7 +72,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_5x8c8__avx256vnni( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-6x4c8-minmax-avx-madd-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-6x4c8-minmax-avx-madd-prfm.c index 6fb74bd41b2..7ce0f8beb2f 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-6x4c8-minmax-avx-madd-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-6x4c8-minmax-avx-madd-prfm.c @@ -79,12 +79,12 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_6x4c8__avx_madd_prfm( } const __m128i vsign_mask = _mm_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m128 voutput_max_less_zero_point = _mm_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m128i voutput_zero_point = _mm_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi16(params->fp32_scalar.output_min); const __m128i vmask = _mm_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { __m128i vacc0x01 = _mm_cvtepu32_epi64(_mm_loadu_si64(w)); __m128i vacc0x23 = _mm_cvtepu32_epi64(_mm_loadu_si64(((const int32_t*) w + 2))); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-6x4c8-minmax-avx-madd.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-6x4c8-minmax-avx-madd.c index a3bb410715f..ed549124580 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-6x4c8-minmax-avx-madd.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-6x4c8-minmax-avx-madd.c @@ -78,12 +78,12 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_6x4c8__avx_madd( } const __m128i vsign_mask = _mm_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m128 voutput_max_less_zero_point = _mm_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m128i voutput_zero_point = _mm_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi16(params->fp32_scalar.output_min); const __m128i vmask = _mm_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { __m128i vacc0x01 = _mm_cvtepu32_epi64(_mm_loadu_si64(w)); __m128i vacc0x23 = _mm_cvtepu32_epi64(_mm_loadu_si64(((const int32_t*) w + 2))); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-6x4c8-minmax-ssse3-madd-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-6x4c8-minmax-ssse3-madd-prfm.c index e2e5c6c38a3..a0c4e0bfcf5 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-6x4c8-minmax-ssse3-madd-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-6x4c8-minmax-ssse3-madd-prfm.c @@ -79,12 +79,12 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_6x4c8__ssse3_madd_prfm( } const __m128i vsign_mask = _mm_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m128 voutput_max_less_zero_point = _mm_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m128i voutput_zero_point = _mm_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi16(params->fp32_scalar.output_min); const __m128i vmask = _mm_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m128i vksum0123 = _mm_load_si128(w); __m128i vacc0x01 = _mm_unpacklo_epi32(vksum0123, _mm_setzero_si128()); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-6x4c8-minmax-ssse3-madd.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-6x4c8-minmax-ssse3-madd.c index 96b8081b40a..e1601658f51 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-6x4c8-minmax-ssse3-madd.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-6x4c8-minmax-ssse3-madd.c @@ -78,12 +78,12 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_6x4c8__ssse3_madd( } const __m128i vsign_mask = _mm_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m128 voutput_max_less_zero_point = _mm_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m128i voutput_zero_point = _mm_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi16(params->fp32_scalar.output_min); const __m128i vmask = _mm_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { const __m128i vksum0123 = _mm_load_si128(w); __m128i vacc0x01 = _mm_unpacklo_epi32(vksum0123, _mm_setzero_si128()); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-6x8c8-minmax-avx2-madd-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-6x8c8-minmax-avx2-madd-prfm.c index d1ec44834a8..ddcc4793752 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-6x8c8-minmax-avx2-madd-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-6x8c8-minmax-avx2-madd-prfm.c @@ -79,7 +79,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_6x8c8__avx2_madd_prfm( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-6x8c8-minmax-avx2-madd.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-6x8c8-minmax-avx2-madd.c index 9156b49341c..f8510d5b86d 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-6x8c8-minmax-avx2-madd.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-6x8c8-minmax-avx2-madd.c @@ -78,7 +78,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_6x8c8__avx2_madd( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-6x8c8-minmax-avxvnni-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-6x8c8-minmax-avxvnni-prfm.c index fba995b3504..961b1b03bcb 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-6x8c8-minmax-avxvnni-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-6x8c8-minmax-avxvnni-prfm.c @@ -79,7 +79,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_6x8c8__avxvnni_prfm( } const __m256i vsign_mask = _mm256_gf2p8affine_epi64_epi8(_mm256_setzero_si256(), _mm256_setzero_si256(), 0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-6x8c8-minmax-avxvnni.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-6x8c8-minmax-avxvnni.c index bafc559610b..8ea6bb53991 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-6x8c8-minmax-avxvnni.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-6x8c8-minmax-avxvnni.c @@ -78,7 +78,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_6x8c8__avxvnni( } const __m256i vsign_mask = _mm256_gf2p8affine_epi64_epi8(_mm256_setzero_si256(), _mm256_setzero_si256(), 0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-6x8c8-minmax-fp32-avx256skx-madd-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-6x8c8-minmax-fp32-avx256skx-madd-prfm.c index fac83358303..671e76129f3 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-6x8c8-minmax-fp32-avx256skx-madd-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-6x8c8-minmax-fp32-avx256skx-madd-prfm.c @@ -79,7 +79,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_6x8c8__avx256skx_madd_prfm( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-6x8c8-minmax-fp32-avx256skx-madd.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-6x8c8-minmax-fp32-avx256skx-madd.c index d99350f97bb..5da079e9dfa 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-6x8c8-minmax-fp32-avx256skx-madd.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-6x8c8-minmax-fp32-avx256skx-madd.c @@ -78,7 +78,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_6x8c8__avx256skx_madd( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-6x8c8-minmax-fp32-avx256vnni-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-6x8c8-minmax-fp32-avx256vnni-prfm.c index 77fa19b12e1..fa15833b9fe 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-6x8c8-minmax-fp32-avx256vnni-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-6x8c8-minmax-fp32-avx256vnni-prfm.c @@ -79,7 +79,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_6x8c8__avx256vnni_prfm( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-6x8c8-minmax-fp32-avx256vnni.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-6x8c8-minmax-fp32-avx256vnni.c index a7621558ab9..36aeeaec1f7 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-6x8c8-minmax-fp32-avx256vnni.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-6x8c8-minmax-fp32-avx256vnni.c @@ -78,7 +78,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_6x8c8__avx256vnni( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-7x16c8-minmax-fp32-avx512skx-madd-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-7x16c8-minmax-fp32-avx512skx-madd-prfm.c index f8445ce0af3..9b9d5228b2f 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-7x16c8-minmax-fp32-avx512skx-madd-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-7x16c8-minmax-fp32-avx512skx-madd-prfm.c @@ -85,7 +85,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_7x16c8__avx512skx_madd_prfm( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); @@ -93,7 +93,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_7x16c8__avx512skx_madd_prfm( // XNN_FORCE_REALIZATION(voutput_zero_point); // XNN_FORCE_REALIZATION(voutput_min); const __m512i vmask = _mm512_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { __m512i vacc0x01234567 = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) w)); __m512i vacc0x89ABCDEF = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) ((const int32_t*) w + 8))); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-7x16c8-minmax-fp32-avx512skx-madd.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-7x16c8-minmax-fp32-avx512skx-madd.c index dd8717e8cb4..3ae10808208 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-7x16c8-minmax-fp32-avx512skx-madd.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-7x16c8-minmax-fp32-avx512skx-madd.c @@ -84,7 +84,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_7x16c8__avx512skx_madd( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); @@ -92,7 +92,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_7x16c8__avx512skx_madd( // XNN_FORCE_REALIZATION(voutput_zero_point); // XNN_FORCE_REALIZATION(voutput_min); const __m512i vmask = _mm512_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { __m512i vacc0x01234567 = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) w)); __m512i vacc0x89ABCDEF = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) ((const int32_t*) w + 8))); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-7x16c8-minmax-fp32-avx512vnnigfni-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-7x16c8-minmax-fp32-avx512vnnigfni-prfm.c index 16a6eed3e93..918d1d406d2 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-7x16c8-minmax-fp32-avx512vnnigfni-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-7x16c8-minmax-fp32-avx512vnnigfni-prfm.c @@ -85,7 +85,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_7x16c8__avx512vnnigfni_prfm( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); @@ -93,9 +93,9 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_7x16c8__avx512vnnigfni_prfm( // XNN_FORCE_REALIZATION(voutput_zero_point); // XNN_FORCE_REALIZATION(voutput_min); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { __m512i vacc0x01234567 = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) w)); __m512i vacc0x89ABCDEF = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) ((const int32_t*) w + 8))); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-7x16c8-minmax-fp32-avx512vnnigfni.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-7x16c8-minmax-fp32-avx512vnnigfni.c index 234c5266765..dc8c8df2835 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-7x16c8-minmax-fp32-avx512vnnigfni.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-7x16c8-minmax-fp32-avx512vnnigfni.c @@ -84,7 +84,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_7x16c8__avx512vnnigfni( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); @@ -92,9 +92,9 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_7x16c8__avx512vnnigfni( // XNN_FORCE_REALIZATION(voutput_zero_point); // XNN_FORCE_REALIZATION(voutput_min); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { __m512i vacc0x01234567 = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) w)); __m512i vacc0x89ABCDEF = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) ((const int32_t*) w + 8))); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-7x8c8-minmax-avx2-madd-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-7x8c8-minmax-avx2-madd-prfm.c index 2c74d66bb59..7f5bbdd75f2 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-7x8c8-minmax-avx2-madd-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-7x8c8-minmax-avx2-madd-prfm.c @@ -85,7 +85,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_7x8c8__avx2_madd_prfm( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-7x8c8-minmax-avx2-madd.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-7x8c8-minmax-avx2-madd.c index e53aa17500a..8a4d57c60a4 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-7x8c8-minmax-avx2-madd.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-7x8c8-minmax-avx2-madd.c @@ -84,7 +84,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_7x8c8__avx2_madd( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-7x8c8-minmax-avxvnni-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-7x8c8-minmax-avxvnni-prfm.c index d509fc7a4dd..61a39a3f1d9 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-7x8c8-minmax-avxvnni-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-7x8c8-minmax-avxvnni-prfm.c @@ -85,7 +85,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_7x8c8__avxvnni_prfm( } const __m256i vsign_mask = _mm256_gf2p8affine_epi64_epi8(_mm256_setzero_si256(), _mm256_setzero_si256(), 0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-7x8c8-minmax-avxvnni.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-7x8c8-minmax-avxvnni.c index 383bc97d79a..abd7fdc4853 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-7x8c8-minmax-avxvnni.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-7x8c8-minmax-avxvnni.c @@ -84,7 +84,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_7x8c8__avxvnni( } const __m256i vsign_mask = _mm256_gf2p8affine_epi64_epi8(_mm256_setzero_si256(), _mm256_setzero_si256(), 0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-7x8c8-minmax-fp32-avx256skx-madd-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-7x8c8-minmax-fp32-avx256skx-madd-prfm.c index 8d78a13013e..6e7c5ed773e 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-7x8c8-minmax-fp32-avx256skx-madd-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-7x8c8-minmax-fp32-avx256skx-madd-prfm.c @@ -85,7 +85,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_7x8c8__avx256skx_madd_prfm( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-7x8c8-minmax-fp32-avx256skx-madd.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-7x8c8-minmax-fp32-avx256skx-madd.c index 017b951af54..b51f72c7547 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-7x8c8-minmax-fp32-avx256skx-madd.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-7x8c8-minmax-fp32-avx256skx-madd.c @@ -84,7 +84,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_7x8c8__avx256skx_madd( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-7x8c8-minmax-fp32-avx256vnni-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-7x8c8-minmax-fp32-avx256vnni-prfm.c index 30efa160da4..3d04bbe0201 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-7x8c8-minmax-fp32-avx256vnni-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-7x8c8-minmax-fp32-avx256vnni-prfm.c @@ -85,7 +85,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_7x8c8__avx256vnni_prfm( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-7x8c8-minmax-fp32-avx256vnni.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-7x8c8-minmax-fp32-avx256vnni.c index 3b3545cf2d0..6e898cc0c8d 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-7x8c8-minmax-fp32-avx256vnni.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-7x8c8-minmax-fp32-avx256vnni.c @@ -84,7 +84,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_7x8c8__avx256vnni( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-8x16c8-minmax-fp32-avx512skx-madd-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-8x16c8-minmax-fp32-avx512skx-madd-prfm.c index 307baa1cd76..d3b624c7394 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-8x16c8-minmax-fp32-avx512skx-madd-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-8x16c8-minmax-fp32-avx512skx-madd-prfm.c @@ -91,7 +91,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_8x16c8__avx512skx_madd_prfm( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); @@ -99,7 +99,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_8x16c8__avx512skx_madd_prfm( // XNN_FORCE_REALIZATION(voutput_zero_point); // XNN_FORCE_REALIZATION(voutput_min); const __m512i vmask = _mm512_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { __m512i vacc0x01234567 = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) w)); __m512i vacc0x89ABCDEF = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) ((const int32_t*) w + 8))); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-8x16c8-minmax-fp32-avx512skx-madd.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-8x16c8-minmax-fp32-avx512skx-madd.c index 5309343155e..35b416be5b8 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-8x16c8-minmax-fp32-avx512skx-madd.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-8x16c8-minmax-fp32-avx512skx-madd.c @@ -90,7 +90,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_8x16c8__avx512skx_madd( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); @@ -98,7 +98,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_8x16c8__avx512skx_madd( // XNN_FORCE_REALIZATION(voutput_zero_point); // XNN_FORCE_REALIZATION(voutput_min); const __m512i vmask = _mm512_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { __m512i vacc0x01234567 = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) w)); __m512i vacc0x89ABCDEF = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) ((const int32_t*) w + 8))); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-8x16c8-minmax-fp32-avx512vnnigfni-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-8x16c8-minmax-fp32-avx512vnnigfni-prfm.c index 9c7400e04fc..a9d84113b75 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-8x16c8-minmax-fp32-avx512vnnigfni-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-8x16c8-minmax-fp32-avx512vnnigfni-prfm.c @@ -91,7 +91,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_8x16c8__avx512vnnigfni_prfm( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); @@ -99,9 +99,9 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_8x16c8__avx512vnnigfni_prfm( // XNN_FORCE_REALIZATION(voutput_zero_point); // XNN_FORCE_REALIZATION(voutput_min); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { __m512i vacc0x01234567 = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) w)); __m512i vacc0x89ABCDEF = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) ((const int32_t*) w + 8))); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-8x16c8-minmax-fp32-avx512vnnigfni.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-8x16c8-minmax-fp32-avx512vnnigfni.c index 099dad8f0fd..0ac086a9ecb 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-8x16c8-minmax-fp32-avx512vnnigfni.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-8x16c8-minmax-fp32-avx512vnnigfni.c @@ -90,7 +90,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_8x16c8__avx512vnnigfni( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); @@ -98,9 +98,9 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_8x16c8__avx512vnnigfni( // XNN_FORCE_REALIZATION(voutput_zero_point); // XNN_FORCE_REALIZATION(voutput_min); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { __m512i vacc0x01234567 = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) w)); __m512i vacc0x89ABCDEF = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) ((const int32_t*) w + 8))); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-8x8c8-minmax-avx2-madd-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-8x8c8-minmax-avx2-madd-prfm.c index 2911155694a..6015c8451a8 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-8x8c8-minmax-avx2-madd-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-8x8c8-minmax-avx2-madd-prfm.c @@ -91,7 +91,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_8x8c8__avx2_madd_prfm( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-8x8c8-minmax-avx2-madd.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-8x8c8-minmax-avx2-madd.c index d64991fda8a..f195a9fc10a 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-8x8c8-minmax-avx2-madd.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-8x8c8-minmax-avx2-madd.c @@ -90,7 +90,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_8x8c8__avx2_madd( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-8x8c8-minmax-avxvnni-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-8x8c8-minmax-avxvnni-prfm.c index aec11366a2d..bf578808d32 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-8x8c8-minmax-avxvnni-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-8x8c8-minmax-avxvnni-prfm.c @@ -91,7 +91,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_8x8c8__avxvnni_prfm( } const __m256i vsign_mask = _mm256_gf2p8affine_epi64_epi8(_mm256_setzero_si256(), _mm256_setzero_si256(), 0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-8x8c8-minmax-avxvnni.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-8x8c8-minmax-avxvnni.c index f171178ecb4..85c50be01f3 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-8x8c8-minmax-avxvnni.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-8x8c8-minmax-avxvnni.c @@ -90,7 +90,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_8x8c8__avxvnni( } const __m256i vsign_mask = _mm256_gf2p8affine_epi64_epi8(_mm256_setzero_si256(), _mm256_setzero_si256(), 0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-8x8c8-minmax-fp32-avx256skx-madd-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-8x8c8-minmax-fp32-avx256skx-madd-prfm.c index 1affe966cb7..7ffdc3beb40 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-8x8c8-minmax-fp32-avx256skx-madd-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-8x8c8-minmax-fp32-avx256skx-madd-prfm.c @@ -91,7 +91,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_8x8c8__avx256skx_madd_prfm( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-8x8c8-minmax-fp32-avx256skx-madd.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-8x8c8-minmax-fp32-avx256skx-madd.c index de8db850d9a..fbfb156f0c2 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-8x8c8-minmax-fp32-avx256skx-madd.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-8x8c8-minmax-fp32-avx256skx-madd.c @@ -90,7 +90,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_8x8c8__avx256skx_madd( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-8x8c8-minmax-fp32-avx256vnni-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-8x8c8-minmax-fp32-avx256vnni-prfm.c index 64361ec1991..7fa008dc41a 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-8x8c8-minmax-fp32-avx256vnni-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-8x8c8-minmax-fp32-avx256vnni-prfm.c @@ -91,7 +91,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_8x8c8__avx256vnni_prfm( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-8x8c8-minmax-fp32-avx256vnni.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-8x8c8-minmax-fp32-avx256vnni.c index cddb562caa5..615518ed45e 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-8x8c8-minmax-fp32-avx256vnni.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-8x8c8-minmax-fp32-avx256vnni.c @@ -90,7 +90,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_8x8c8__avx256vnni( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-9x16c8-minmax-fp32-avx512skx-madd-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-9x16c8-minmax-fp32-avx512skx-madd-prfm.c index 0a6464efe85..ba383b40a38 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-9x16c8-minmax-fp32-avx512skx-madd-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-9x16c8-minmax-fp32-avx512skx-madd-prfm.c @@ -97,7 +97,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_9x16c8__avx512skx_madd_prfm( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); @@ -105,7 +105,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_9x16c8__avx512skx_madd_prfm( // XNN_FORCE_REALIZATION(voutput_zero_point); // XNN_FORCE_REALIZATION(voutput_min); const __m512i vmask = _mm512_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { __m512i vacc0x01234567 = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) w)); __m512i vacc0x89ABCDEF = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) ((const int32_t*) w + 8))); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-9x16c8-minmax-fp32-avx512skx-madd.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-9x16c8-minmax-fp32-avx512skx-madd.c index efb4e72e159..557b31eec2a 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-9x16c8-minmax-fp32-avx512skx-madd.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-9x16c8-minmax-fp32-avx512skx-madd.c @@ -96,7 +96,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_9x16c8__avx512skx_madd( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); @@ -104,7 +104,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_9x16c8__avx512skx_madd( // XNN_FORCE_REALIZATION(voutput_zero_point); // XNN_FORCE_REALIZATION(voutput_min); const __m512i vmask = _mm512_set1_epi8(0x0F); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); do { __m512i vacc0x01234567 = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) w)); __m512i vacc0x89ABCDEF = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) ((const int32_t*) w + 8))); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-9x16c8-minmax-fp32-avx512vnnigfni-prfm.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-9x16c8-minmax-fp32-avx512vnnigfni-prfm.c index 3e70f5e58df..c65e59a038a 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-9x16c8-minmax-fp32-avx512vnnigfni-prfm.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-9x16c8-minmax-fp32-avx512vnnigfni-prfm.c @@ -97,7 +97,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_9x16c8__avx512vnnigfni_prfm( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); @@ -105,9 +105,9 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_9x16c8__avx512vnnigfni_prfm( // XNN_FORCE_REALIZATION(voutput_zero_point); // XNN_FORCE_REALIZATION(voutput_min); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { __m512i vacc0x01234567 = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) w)); __m512i vacc0x89ABCDEF = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) ((const int32_t*) w + 8))); diff --git a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-9x16c8-minmax-fp32-avx512vnnigfni.c b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-9x16c8-minmax-fp32-avx512vnnigfni.c index f5f2ca30876..722b5c8a992 100644 --- a/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-9x16c8-minmax-fp32-avx512vnnigfni.c +++ b/src/qs8-qc4w-gemm/gen/qs8-qc4w-gemm-9x16c8-minmax-fp32-avx512vnnigfni.c @@ -96,7 +96,7 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_9x16c8__avx512vnnigfni( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); @@ -104,9 +104,9 @@ void xnn_qs8_qc4w_gemm_minmax_fp32_ukernel_9x16c8__avx512vnnigfni( // XNN_FORCE_REALIZATION(voutput_zero_point); // XNN_FORCE_REALIZATION(voutput_min); const __m512i vmask = _mm512_set1_epi8(0xF0); - XNN_FORCE_REALIZATION(vmask); + // XNN_FORCE_REALIZATION(vmask); const __m512i vshl4 = _mm512_set1_epi64(0x01020408); - XNN_FORCE_REALIZATION(vshl4); + // XNN_FORCE_REALIZATION(vshl4); do { __m512i vacc0x01234567 = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) w)); __m512i vacc0x89ABCDEF = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) ((const int32_t*) w + 8))); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-10x16c4-minmax-fp32-avx512vnni-prfm.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-10x16c4-minmax-fp32-avx512vnni-prfm.c index b158766ff9d..515f07b3339 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-10x16c4-minmax-fp32-avx512vnni-prfm.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-10x16c4-minmax-fp32-avx512vnni-prfm.c @@ -103,7 +103,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_10x16c4__avx512vnni_prfm( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-10x16c4-minmax-fp32-avx512vnni.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-10x16c4-minmax-fp32-avx512vnni.c index 9e796672f44..64e4fbd789b 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-10x16c4-minmax-fp32-avx512vnni.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-10x16c4-minmax-fp32-avx512vnni.c @@ -102,7 +102,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_10x16c4__avx512vnni( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-10x16c8-minmax-fp32-avx512vnni-prfm.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-10x16c8-minmax-fp32-avx512vnni-prfm.c index e2001c036aa..062265995a8 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-10x16c8-minmax-fp32-avx512vnni-prfm.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-10x16c8-minmax-fp32-avx512vnni-prfm.c @@ -103,7 +103,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_10x16c8__avx512vnni_prfm( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-10x16c8-minmax-fp32-avx512vnni.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-10x16c8-minmax-fp32-avx512vnni.c index c4b52381c1b..507427314d7 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-10x16c8-minmax-fp32-avx512vnni.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-10x16c8-minmax-fp32-avx512vnni.c @@ -102,7 +102,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_10x16c8__avx512vnni( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-10x8c8-minmax-fp32-avx256vnni-prfm.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-10x8c8-minmax-fp32-avx256vnni-prfm.c index 6ede64a7f5f..c8f00b57503 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-10x8c8-minmax-fp32-avx256vnni-prfm.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-10x8c8-minmax-fp32-avx256vnni-prfm.c @@ -103,7 +103,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_10x8c8__avx256vnni_prfm( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-10x8c8-minmax-fp32-avx256vnni.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-10x8c8-minmax-fp32-avx256vnni.c index d0d1cf53a2c..1fb99f3bc7b 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-10x8c8-minmax-fp32-avx256vnni.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-10x8c8-minmax-fp32-avx256vnni.c @@ -102,7 +102,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_10x8c8__avx256vnni( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-12x16c4-minmax-fp32-avx512vnni-prfm.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-12x16c4-minmax-fp32-avx512vnni-prfm.c index 2e1a63ae0ae..fcfbd8cee38 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-12x16c4-minmax-fp32-avx512vnni-prfm.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-12x16c4-minmax-fp32-avx512vnni-prfm.c @@ -115,7 +115,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_12x16c4__avx512vnni_prfm( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-12x16c4-minmax-fp32-avx512vnni.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-12x16c4-minmax-fp32-avx512vnni.c index bebd0c26322..04620501996 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-12x16c4-minmax-fp32-avx512vnni.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-12x16c4-minmax-fp32-avx512vnni.c @@ -114,7 +114,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_12x16c4__avx512vnni( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-12x16c8-minmax-fp32-avx512vnni-prfm.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-12x16c8-minmax-fp32-avx512vnni-prfm.c index a3827702600..15961fb6af6 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-12x16c8-minmax-fp32-avx512vnni-prfm.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-12x16c8-minmax-fp32-avx512vnni-prfm.c @@ -115,7 +115,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_12x16c8__avx512vnni_prfm( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-12x16c8-minmax-fp32-avx512vnni.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-12x16c8-minmax-fp32-avx512vnni.c index ec28ef418ce..23c3700605d 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-12x16c8-minmax-fp32-avx512vnni.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-12x16c8-minmax-fp32-avx512vnni.c @@ -114,7 +114,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_12x16c8__avx512vnni( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-12x8c8-minmax-fp32-avx256vnni-prfm.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-12x8c8-minmax-fp32-avx256vnni-prfm.c index ce245219cd2..46b34bd914d 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-12x8c8-minmax-fp32-avx256vnni-prfm.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-12x8c8-minmax-fp32-avx256vnni-prfm.c @@ -115,7 +115,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_12x8c8__avx256vnni_prfm( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-12x8c8-minmax-fp32-avx256vnni.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-12x8c8-minmax-fp32-avx256vnni.c index 6bb830884c5..d4242f75ec4 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-12x8c8-minmax-fp32-avx256vnni.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-12x8c8-minmax-fp32-avx256vnni.c @@ -114,7 +114,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_12x8c8__avx256vnni( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-14x16c4-minmax-fp32-avx512vnni-prfm.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-14x16c4-minmax-fp32-avx512vnni-prfm.c index 109bc29ff5d..2206ad2958d 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-14x16c4-minmax-fp32-avx512vnni-prfm.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-14x16c4-minmax-fp32-avx512vnni-prfm.c @@ -127,7 +127,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_14x16c4__avx512vnni_prfm( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-14x16c4-minmax-fp32-avx512vnni.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-14x16c4-minmax-fp32-avx512vnni.c index b2d0a9b78e6..b4e6768b2fd 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-14x16c4-minmax-fp32-avx512vnni.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-14x16c4-minmax-fp32-avx512vnni.c @@ -126,7 +126,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_14x16c4__avx512vnni( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-14x16c8-minmax-fp32-avx512vnni-prfm.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-14x16c8-minmax-fp32-avx512vnni-prfm.c index a2bf4fccef6..5421d2f052d 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-14x16c8-minmax-fp32-avx512vnni-prfm.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-14x16c8-minmax-fp32-avx512vnni-prfm.c @@ -127,7 +127,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_14x16c8__avx512vnni_prfm( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-14x16c8-minmax-fp32-avx512vnni.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-14x16c8-minmax-fp32-avx512vnni.c index b7083875468..70836374f82 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-14x16c8-minmax-fp32-avx512vnni.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-14x16c8-minmax-fp32-avx512vnni.c @@ -126,7 +126,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_14x16c8__avx512vnni( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-14x8c8-minmax-fp32-avx256vnni-prfm.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-14x8c8-minmax-fp32-avx256vnni-prfm.c index 0579e380410..89da9afb608 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-14x8c8-minmax-fp32-avx256vnni-prfm.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-14x8c8-minmax-fp32-avx256vnni-prfm.c @@ -127,7 +127,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_14x8c8__avx256vnni_prfm( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-14x8c8-minmax-fp32-avx256vnni.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-14x8c8-minmax-fp32-avx256vnni.c index 397aad667cf..48b7379ff7a 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-14x8c8-minmax-fp32-avx256vnni.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-14x8c8-minmax-fp32-avx256vnni.c @@ -126,7 +126,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_14x8c8__avx256vnni( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c4-minmax-fp32-avx512vnni-prfm.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c4-minmax-fp32-avx512vnni-prfm.c index 544c5254f16..ee01984db64 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c4-minmax-fp32-avx512vnni-prfm.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c4-minmax-fp32-avx512vnni-prfm.c @@ -49,7 +49,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__avx512vnni_prfm( int8_t* c0 = c; const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c4-minmax-fp32-avx512vnni.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c4-minmax-fp32-avx512vnni.c index 519004132c1..3c117fdd508 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c4-minmax-fp32-avx512vnni.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c4-minmax-fp32-avx512vnni.c @@ -48,7 +48,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__avx512vnni( int8_t* c0 = c; const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c8-minmax-fp32-avx512vnni-prfm.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c8-minmax-fp32-avx512vnni-prfm.c index 93753c3d57c..f3e2e09be04 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c8-minmax-fp32-avx512vnni-prfm.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c8-minmax-fp32-avx512vnni-prfm.c @@ -49,7 +49,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c8__avx512vnni_prfm( int8_t* c0 = c; const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c8-minmax-fp32-avx512vnni.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c8-minmax-fp32-avx512vnni.c index 19c2d674e2c..29e9ef92582 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c8-minmax-fp32-avx512vnni.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c8-minmax-fp32-avx512vnni.c @@ -48,7 +48,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c8__avx512vnni( int8_t* c0 = c; const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-avx256vnni-prfm.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-avx256vnni-prfm.c index 61a58e6da03..a9755747159 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-avx256vnni-prfm.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-avx256vnni-prfm.c @@ -49,7 +49,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__avx256vnni_prfm( int8_t* c0 = c; const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-avx256vnni.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-avx256vnni.c index f5a6a8809a9..fdc35832ff3 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-avx256vnni.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-avx256vnni.c @@ -48,7 +48,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__avx256vnni( int8_t* c0 = c; const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-avxvnni-prfm.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-avxvnni-prfm.c index 77a7ec5bd6f..d0de0ca6638 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-avxvnni-prfm.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-avxvnni-prfm.c @@ -49,7 +49,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__avxvnni_prfm( int8_t* c0 = c; const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-avxvnni.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-avxvnni.c index 0227796756a..306030f7fc2 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-avxvnni.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-avxvnni.c @@ -48,7 +48,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__avxvnni( int8_t* c0 = c; const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x8c8-minmax-fp32-avxvnni-prfm.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x8c8-minmax-fp32-avxvnni-prfm.c index 103339be863..a3221571e10 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x8c8-minmax-fp32-avxvnni-prfm.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x8c8-minmax-fp32-avxvnni-prfm.c @@ -55,7 +55,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x8c8__avxvnni_prfm( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x8c8-minmax-fp32-avxvnni.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x8c8-minmax-fp32-avxvnni.c index ee151542ecd..77f68b6a239 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x8c8-minmax-fp32-avxvnni.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x8c8-minmax-fp32-avxvnni.c @@ -54,7 +54,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x8c8__avxvnni( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x8c8-minmax-fp32-avxvnni-prfm.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x8c8-minmax-fp32-avxvnni-prfm.c index d14454dae79..517b2b73386 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x8c8-minmax-fp32-avxvnni-prfm.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x8c8-minmax-fp32-avxvnni-prfm.c @@ -61,7 +61,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x8c8__avxvnni_prfm( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x8c8-minmax-fp32-avxvnni.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x8c8-minmax-fp32-avxvnni.c index 31337bcc003..dbe6f47f35f 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x8c8-minmax-fp32-avxvnni.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x8c8-minmax-fp32-avxvnni.c @@ -60,7 +60,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x8c8__avxvnni( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16c4-minmax-fp32-avx512vnni-prfm.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16c4-minmax-fp32-avx512vnni-prfm.c index 29d74d054f8..49a799a33be 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16c4-minmax-fp32-avx512vnni-prfm.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16c4-minmax-fp32-avx512vnni-prfm.c @@ -67,7 +67,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c4__avx512vnni_prfm( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16c4-minmax-fp32-avx512vnni.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16c4-minmax-fp32-avx512vnni.c index f6af90909ec..a95a0820b00 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16c4-minmax-fp32-avx512vnni.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16c4-minmax-fp32-avx512vnni.c @@ -66,7 +66,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c4__avx512vnni( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8c8-minmax-fp32-avxvnni-prfm.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8c8-minmax-fp32-avxvnni-prfm.c index 15fd611fc0b..f91eba8a0ae 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8c8-minmax-fp32-avxvnni-prfm.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8c8-minmax-fp32-avxvnni-prfm.c @@ -67,7 +67,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x8c8__avxvnni_prfm( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8c8-minmax-fp32-avxvnni.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8c8-minmax-fp32-avxvnni.c index 26a82b9586c..d5180184d56 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8c8-minmax-fp32-avxvnni.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8c8-minmax-fp32-avxvnni.c @@ -66,7 +66,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x8c8__avxvnni( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-5x16c4-minmax-fp32-avx512vnni-prfm.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-5x16c4-minmax-fp32-avx512vnni-prfm.c index 119be1ba768..cf57f47ce46 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-5x16c4-minmax-fp32-avx512vnni-prfm.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-5x16c4-minmax-fp32-avx512vnni-prfm.c @@ -73,7 +73,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_5x16c4__avx512vnni_prfm( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-5x16c4-minmax-fp32-avx512vnni.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-5x16c4-minmax-fp32-avx512vnni.c index 2e7546d6306..b9d9d4bbe27 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-5x16c4-minmax-fp32-avx512vnni.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-5x16c4-minmax-fp32-avx512vnni.c @@ -72,7 +72,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_5x16c4__avx512vnni( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-5x16c8-minmax-fp32-avx512vnni-prfm.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-5x16c8-minmax-fp32-avx512vnni-prfm.c index db644bd72c6..d7650512c75 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-5x16c8-minmax-fp32-avx512vnni-prfm.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-5x16c8-minmax-fp32-avx512vnni-prfm.c @@ -73,7 +73,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_5x16c8__avx512vnni_prfm( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-5x16c8-minmax-fp32-avx512vnni.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-5x16c8-minmax-fp32-avx512vnni.c index 15ca61d7681..cf9db1e0b3d 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-5x16c8-minmax-fp32-avx512vnni.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-5x16c8-minmax-fp32-avx512vnni.c @@ -72,7 +72,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_5x16c8__avx512vnni( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-5x8c8-minmax-fp32-avx256vnni-prfm.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-5x8c8-minmax-fp32-avx256vnni-prfm.c index 2e3b14323b9..fc876ca4053 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-5x8c8-minmax-fp32-avx256vnni-prfm.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-5x8c8-minmax-fp32-avx256vnni-prfm.c @@ -73,7 +73,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_5x8c8__avx256vnni_prfm( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-5x8c8-minmax-fp32-avx256vnni.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-5x8c8-minmax-fp32-avx256vnni.c index 772078b1d94..c8b6d161b39 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-5x8c8-minmax-fp32-avx256vnni.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-5x8c8-minmax-fp32-avx256vnni.c @@ -72,7 +72,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_5x8c8__avx256vnni( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-5x8c8-minmax-fp32-avxvnni-prfm.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-5x8c8-minmax-fp32-avxvnni-prfm.c index 1e2a97d8a53..c8c8de821f5 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-5x8c8-minmax-fp32-avxvnni-prfm.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-5x8c8-minmax-fp32-avxvnni-prfm.c @@ -73,7 +73,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_5x8c8__avxvnni_prfm( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-5x8c8-minmax-fp32-avxvnni.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-5x8c8-minmax-fp32-avxvnni.c index 9d0c40f893c..79b46d7bb90 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-5x8c8-minmax-fp32-avxvnni.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-5x8c8-minmax-fp32-avxvnni.c @@ -72,7 +72,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_5x8c8__avxvnni( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-6x8c8-minmax-fp32-avxvnni-prfm.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-6x8c8-minmax-fp32-avxvnni-prfm.c index 3bf4ae71097..9cb6b67ac17 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-6x8c8-minmax-fp32-avxvnni-prfm.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-6x8c8-minmax-fp32-avxvnni-prfm.c @@ -79,7 +79,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_6x8c8__avxvnni_prfm( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-6x8c8-minmax-fp32-avxvnni.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-6x8c8-minmax-fp32-avxvnni.c index f86ecfe5148..8db203ffc0b 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-6x8c8-minmax-fp32-avxvnni.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-6x8c8-minmax-fp32-avxvnni.c @@ -78,7 +78,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_6x8c8__avxvnni( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-7x16c4-minmax-fp32-avx512vnni-prfm.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-7x16c4-minmax-fp32-avx512vnni-prfm.c index 5ad19b94f78..c8ca16eaeed 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-7x16c4-minmax-fp32-avx512vnni-prfm.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-7x16c4-minmax-fp32-avx512vnni-prfm.c @@ -85,7 +85,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_7x16c4__avx512vnni_prfm( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-7x16c4-minmax-fp32-avx512vnni.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-7x16c4-minmax-fp32-avx512vnni.c index 972238c2ea0..d9f57ffe19a 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-7x16c4-minmax-fp32-avx512vnni.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-7x16c4-minmax-fp32-avx512vnni.c @@ -84,7 +84,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_7x16c4__avx512vnni( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-7x16c8-minmax-fp32-avx512vnni-prfm.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-7x16c8-minmax-fp32-avx512vnni-prfm.c index 56b03e16f0b..38cca17d164 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-7x16c8-minmax-fp32-avx512vnni-prfm.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-7x16c8-minmax-fp32-avx512vnni-prfm.c @@ -85,7 +85,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_7x16c8__avx512vnni_prfm( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-7x16c8-minmax-fp32-avx512vnni.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-7x16c8-minmax-fp32-avx512vnni.c index 5a55147131b..c132ee77b90 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-7x16c8-minmax-fp32-avx512vnni.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-7x16c8-minmax-fp32-avx512vnni.c @@ -84,7 +84,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_7x16c8__avx512vnni( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-7x8c8-minmax-fp32-avx256vnni-prfm.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-7x8c8-minmax-fp32-avx256vnni-prfm.c index c760327f1b8..5271301f66d 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-7x8c8-minmax-fp32-avx256vnni-prfm.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-7x8c8-minmax-fp32-avx256vnni-prfm.c @@ -85,7 +85,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_7x8c8__avx256vnni_prfm( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-7x8c8-minmax-fp32-avx256vnni.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-7x8c8-minmax-fp32-avx256vnni.c index 4436b1321c4..c983ff3fe68 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-7x8c8-minmax-fp32-avx256vnni.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-7x8c8-minmax-fp32-avx256vnni.c @@ -84,7 +84,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_7x8c8__avx256vnni( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-7x8c8-minmax-fp32-avxvnni-prfm.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-7x8c8-minmax-fp32-avxvnni-prfm.c index f909c7617ad..8d2a95c0242 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-7x8c8-minmax-fp32-avxvnni-prfm.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-7x8c8-minmax-fp32-avxvnni-prfm.c @@ -85,7 +85,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_7x8c8__avxvnni_prfm( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-7x8c8-minmax-fp32-avxvnni.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-7x8c8-minmax-fp32-avxvnni.c index e8224d3a700..40ff4f5386e 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-7x8c8-minmax-fp32-avxvnni.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-7x8c8-minmax-fp32-avxvnni.c @@ -84,7 +84,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_7x8c8__avxvnni( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-8x16c4-minmax-fp32-avx512vnni-prfm.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-8x16c4-minmax-fp32-avx512vnni-prfm.c index fd21ee44a99..75e60a8c627 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-8x16c4-minmax-fp32-avx512vnni-prfm.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-8x16c4-minmax-fp32-avx512vnni-prfm.c @@ -91,7 +91,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_8x16c4__avx512vnni_prfm( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-8x16c4-minmax-fp32-avx512vnni.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-8x16c4-minmax-fp32-avx512vnni.c index 54e76c1145e..a9f1f96d688 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-8x16c4-minmax-fp32-avx512vnni.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-8x16c4-minmax-fp32-avx512vnni.c @@ -90,7 +90,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_8x16c4__avx512vnni( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-8x16c8-minmax-fp32-avx512vnni-prfm.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-8x16c8-minmax-fp32-avx512vnni-prfm.c index 38a274aad0f..92157098b80 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-8x16c8-minmax-fp32-avx512vnni-prfm.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-8x16c8-minmax-fp32-avx512vnni-prfm.c @@ -91,7 +91,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_8x16c8__avx512vnni_prfm( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-8x16c8-minmax-fp32-avx512vnni.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-8x16c8-minmax-fp32-avx512vnni.c index a0f87986916..492b67a4d1f 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-8x16c8-minmax-fp32-avx512vnni.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-8x16c8-minmax-fp32-avx512vnni.c @@ -90,7 +90,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_8x16c8__avx512vnni( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-8x8c8-minmax-fp32-avx256vnni-prfm.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-8x8c8-minmax-fp32-avx256vnni-prfm.c index 358561e465c..2feaf4251f8 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-8x8c8-minmax-fp32-avx256vnni-prfm.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-8x8c8-minmax-fp32-avx256vnni-prfm.c @@ -91,7 +91,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_8x8c8__avx256vnni_prfm( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-8x8c8-minmax-fp32-avx256vnni.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-8x8c8-minmax-fp32-avx256vnni.c index 56b1945234a..e7712a12e96 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-8x8c8-minmax-fp32-avx256vnni.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-8x8c8-minmax-fp32-avx256vnni.c @@ -90,7 +90,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_8x8c8__avx256vnni( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-8x8c8-minmax-fp32-avxvnni-prfm.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-8x8c8-minmax-fp32-avxvnni-prfm.c index 4c9a6c1970a..0bb6a7666fd 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-8x8c8-minmax-fp32-avxvnni-prfm.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-8x8c8-minmax-fp32-avxvnni-prfm.c @@ -91,7 +91,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_8x8c8__avxvnni_prfm( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-8x8c8-minmax-fp32-avxvnni.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-8x8c8-minmax-fp32-avxvnni.c index 20a737fcba4..9d2c8a85dd4 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-8x8c8-minmax-fp32-avxvnni.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-8x8c8-minmax-fp32-avxvnni.c @@ -90,7 +90,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_8x8c8__avxvnni( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-9x16c4-minmax-fp32-avx512vnni-prfm.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-9x16c4-minmax-fp32-avx512vnni-prfm.c index a6127665677..24dfb94a6b2 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-9x16c4-minmax-fp32-avx512vnni-prfm.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-9x16c4-minmax-fp32-avx512vnni-prfm.c @@ -97,7 +97,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_9x16c4__avx512vnni_prfm( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-9x16c4-minmax-fp32-avx512vnni.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-9x16c4-minmax-fp32-avx512vnni.c index 9f6559df5ce..e5dc002289e 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-9x16c4-minmax-fp32-avx512vnni.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-9x16c4-minmax-fp32-avx512vnni.c @@ -96,7 +96,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_9x16c4__avx512vnni( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-9x16c8-minmax-fp32-avx512vnni-prfm.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-9x16c8-minmax-fp32-avx512vnni-prfm.c index 6cc8a02da66..6abdd92b144 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-9x16c8-minmax-fp32-avx512vnni-prfm.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-9x16c8-minmax-fp32-avx512vnni-prfm.c @@ -97,7 +97,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_9x16c8__avx512vnni_prfm( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-9x16c8-minmax-fp32-avx512vnni.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-9x16c8-minmax-fp32-avx512vnni.c index 239fff3feff..392bffcbdb9 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-9x16c8-minmax-fp32-avx512vnni.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-9x16c8-minmax-fp32-avx512vnni.c @@ -96,7 +96,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_9x16c8__avx512vnni( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-9x8c8-minmax-fp32-avx256vnni-prfm.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-9x8c8-minmax-fp32-avx256vnni-prfm.c index f0da75d6ecd..bef4730729d 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-9x8c8-minmax-fp32-avx256vnni-prfm.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-9x8c8-minmax-fp32-avx256vnni-prfm.c @@ -97,7 +97,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_9x8c8__avx256vnni_prfm( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-9x8c8-minmax-fp32-avx256vnni.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-9x8c8-minmax-fp32-avx256vnni.c index cade38f7cd5..cdba9262646 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-9x8c8-minmax-fp32-avx256vnni.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-9x8c8-minmax-fp32-avx256vnni.c @@ -96,7 +96,7 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_9x8c8__avx256vnni( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-10x16c4-minmax-avx512vnni-prfm.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-10x16c4-minmax-avx512vnni-prfm.c index dc6aeca1d47..2e26782b811 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-10x16c4-minmax-avx512vnni-prfm.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-10x16c4-minmax-avx512vnni-prfm.c @@ -86,7 +86,7 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_10x16c4__avx512vnni_prfm( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-10x16c4-minmax-avx512vnni.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-10x16c4-minmax-avx512vnni.c index 20f1ca3e795..8bd1f09fbe6 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-10x16c4-minmax-avx512vnni.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-10x16c4-minmax-avx512vnni.c @@ -85,7 +85,7 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_10x16c4__avx512vnni( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-10x16c8-minmax-avx512vnni-prfm.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-10x16c8-minmax-avx512vnni-prfm.c index fb07ee611f5..8a1b9195174 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-10x16c8-minmax-avx512vnni-prfm.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-10x16c8-minmax-avx512vnni-prfm.c @@ -86,13 +86,13 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_10x16c8__avx512vnni_prfm( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); // XNN_FORCE_REALIZATION(voutput_max_less_zero_point); // XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); + // XNN_FORCE_REALIZATION(voutput_min); do { __m512i vacc0x01234567 = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) w)); __m512i vacc0x89ABCDEF = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) ((const int32_t*) w + 8))); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-10x16c8-minmax-avx512vnni.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-10x16c8-minmax-avx512vnni.c index ca7e1401f9d..e1f20dad161 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-10x16c8-minmax-avx512vnni.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-10x16c8-minmax-avx512vnni.c @@ -85,13 +85,13 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_10x16c8__avx512vnni( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); // XNN_FORCE_REALIZATION(voutput_max_less_zero_point); // XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); + // XNN_FORCE_REALIZATION(voutput_min); do { __m512i vacc0x01234567 = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) w)); __m512i vacc0x89ABCDEF = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) ((const int32_t*) w + 8))); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-10x8c8-minmax-fp32-avx256vnni-prfm.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-10x8c8-minmax-fp32-avx256vnni-prfm.c index 4478ee76be8..116932e5ce3 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-10x8c8-minmax-fp32-avx256vnni-prfm.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-10x8c8-minmax-fp32-avx256vnni-prfm.c @@ -85,7 +85,7 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_10x8c8__avx256vnni_prfm( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-10x8c8-minmax-fp32-avx256vnni.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-10x8c8-minmax-fp32-avx256vnni.c index 79054705f6c..28ba610621d 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-10x8c8-minmax-fp32-avx256vnni.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-10x8c8-minmax-fp32-avx256vnni.c @@ -84,7 +84,7 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_10x8c8__avx256vnni( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-12x16c4-minmax-avx512vnni-prfm.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-12x16c4-minmax-avx512vnni-prfm.c index dc7b7f29ac5..573c5f27c95 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-12x16c4-minmax-avx512vnni-prfm.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-12x16c4-minmax-avx512vnni-prfm.c @@ -94,7 +94,7 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_12x16c4__avx512vnni_prfm( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-12x16c4-minmax-avx512vnni.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-12x16c4-minmax-avx512vnni.c index e641c21bf87..1d8463bd83c 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-12x16c4-minmax-avx512vnni.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-12x16c4-minmax-avx512vnni.c @@ -93,7 +93,7 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_12x16c4__avx512vnni( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-12x16c8-minmax-avx512vnni-prfm.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-12x16c8-minmax-avx512vnni-prfm.c index 8c758a844ab..402dd235b07 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-12x16c8-minmax-avx512vnni-prfm.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-12x16c8-minmax-avx512vnni-prfm.c @@ -94,13 +94,13 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_12x16c8__avx512vnni_prfm( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); // XNN_FORCE_REALIZATION(voutput_max_less_zero_point); // XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); + // XNN_FORCE_REALIZATION(voutput_min); do { __m512i vacc0x01234567 = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) w)); __m512i vacc0x89ABCDEF = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) ((const int32_t*) w + 8))); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-12x16c8-minmax-avx512vnni.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-12x16c8-minmax-avx512vnni.c index 6e8af4e93c0..ba2b56b9555 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-12x16c8-minmax-avx512vnni.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-12x16c8-minmax-avx512vnni.c @@ -93,13 +93,13 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_12x16c8__avx512vnni( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); // XNN_FORCE_REALIZATION(voutput_max_less_zero_point); // XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); + // XNN_FORCE_REALIZATION(voutput_min); do { __m512i vacc0x01234567 = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) w)); __m512i vacc0x89ABCDEF = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) ((const int32_t*) w + 8))); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-12x8c8-minmax-fp32-avx256vnni-prfm.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-12x8c8-minmax-fp32-avx256vnni-prfm.c index 119bd45678a..fe933120282 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-12x8c8-minmax-fp32-avx256vnni-prfm.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-12x8c8-minmax-fp32-avx256vnni-prfm.c @@ -93,7 +93,7 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_12x8c8__avx256vnni_prfm( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-12x8c8-minmax-fp32-avx256vnni.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-12x8c8-minmax-fp32-avx256vnni.c index 2dd82ff3796..a6441b60b22 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-12x8c8-minmax-fp32-avx256vnni.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-12x8c8-minmax-fp32-avx256vnni.c @@ -92,7 +92,7 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_12x8c8__avx256vnni( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-14x16c4-minmax-avx512vnni-prfm.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-14x16c4-minmax-avx512vnni-prfm.c index bdddb71ca5a..2bbde5383f7 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-14x16c4-minmax-avx512vnni-prfm.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-14x16c4-minmax-avx512vnni-prfm.c @@ -102,7 +102,7 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_14x16c4__avx512vnni_prfm( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-14x16c4-minmax-avx512vnni.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-14x16c4-minmax-avx512vnni.c index 6f5bff185d0..f181fe8553c 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-14x16c4-minmax-avx512vnni.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-14x16c4-minmax-avx512vnni.c @@ -101,7 +101,7 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_14x16c4__avx512vnni( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-14x16c8-minmax-avx512vnni-prfm.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-14x16c8-minmax-avx512vnni-prfm.c index 9e95f13f664..0641f810085 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-14x16c8-minmax-avx512vnni-prfm.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-14x16c8-minmax-avx512vnni-prfm.c @@ -102,13 +102,13 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_14x16c8__avx512vnni_prfm( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); // XNN_FORCE_REALIZATION(voutput_max_less_zero_point); // XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); + // XNN_FORCE_REALIZATION(voutput_min); do { __m512i vacc0x01234567 = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) w)); __m512i vacc0x89ABCDEF = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) ((const int32_t*) w + 8))); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-14x16c8-minmax-avx512vnni.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-14x16c8-minmax-avx512vnni.c index 4c69efa5740..26c2e06507d 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-14x16c8-minmax-avx512vnni.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-14x16c8-minmax-avx512vnni.c @@ -101,13 +101,13 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_14x16c8__avx512vnni( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); // XNN_FORCE_REALIZATION(voutput_max_less_zero_point); // XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); + // XNN_FORCE_REALIZATION(voutput_min); do { __m512i vacc0x01234567 = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) w)); __m512i vacc0x89ABCDEF = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) ((const int32_t*) w + 8))); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-14x8c8-minmax-fp32-avx256vnni-prfm.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-14x8c8-minmax-fp32-avx256vnni-prfm.c index 5ac7e52750a..cc3adaf7c55 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-14x8c8-minmax-fp32-avx256vnni-prfm.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-14x8c8-minmax-fp32-avx256vnni-prfm.c @@ -101,7 +101,7 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_14x8c8__avx256vnni_prfm( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-14x8c8-minmax-fp32-avx256vnni.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-14x8c8-minmax-fp32-avx256vnni.c index d9fbc7f8fcb..68b2cb5ca52 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-14x8c8-minmax-fp32-avx256vnni.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-14x8c8-minmax-fp32-avx256vnni.c @@ -100,7 +100,7 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_14x8c8__avx256vnni( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x16c4-minmax-avx512vnni-prfm.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x16c4-minmax-avx512vnni-prfm.c index cbfeca06628..37f6703f32d 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x16c4-minmax-avx512vnni-prfm.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x16c4-minmax-avx512vnni-prfm.c @@ -50,7 +50,7 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c4__avx512vnni_prfm( int8_t* c0 = c; const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x16c4-minmax-avx512vnni.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x16c4-minmax-avx512vnni.c index 41430a4a436..b18c2343f9f 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x16c4-minmax-avx512vnni.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x16c4-minmax-avx512vnni.c @@ -49,7 +49,7 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c4__avx512vnni( int8_t* c0 = c; const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x16c8-minmax-avx512vnni-prfm.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x16c8-minmax-avx512vnni-prfm.c index b514a27513e..fd2bb440143 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x16c8-minmax-avx512vnni-prfm.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x16c8-minmax-avx512vnni-prfm.c @@ -50,13 +50,13 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c8__avx512vnni_prfm( int8_t* c0 = c; const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); // XNN_FORCE_REALIZATION(voutput_max_less_zero_point); // XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); + // XNN_FORCE_REALIZATION(voutput_min); do { __m512i vacc0x01234567 = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) w)); __m512i vacc0x89ABCDEF = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) ((const int32_t*) w + 8))); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x16c8-minmax-avx512vnni.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x16c8-minmax-avx512vnni.c index 2e9b505c9e7..793e6ccb656 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x16c8-minmax-avx512vnni.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x16c8-minmax-avx512vnni.c @@ -49,13 +49,13 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c8__avx512vnni( int8_t* c0 = c; const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); // XNN_FORCE_REALIZATION(voutput_max_less_zero_point); // XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); + // XNN_FORCE_REALIZATION(voutput_min); do { __m512i vacc0x01234567 = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) w)); __m512i vacc0x89ABCDEF = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) ((const int32_t*) w + 8))); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x8c8-minmax-fp32-avx256vnni-prfm.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x8c8-minmax-fp32-avx256vnni-prfm.c index 63e1d357560..b46d1861774 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x8c8-minmax-fp32-avx256vnni-prfm.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x8c8-minmax-fp32-avx256vnni-prfm.c @@ -49,7 +49,7 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__avx256vnni_prfm( int8_t* c0 = c; const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x8c8-minmax-fp32-avx256vnni.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x8c8-minmax-fp32-avx256vnni.c index 835118664e5..044a97161d3 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x8c8-minmax-fp32-avx256vnni.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x8c8-minmax-fp32-avx256vnni.c @@ -48,7 +48,7 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__avx256vnni( int8_t* c0 = c; const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x8c8-minmax-fp32-avxvnni-prfm.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x8c8-minmax-fp32-avxvnni-prfm.c index 50f25500f3a..dbafee527c6 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x8c8-minmax-fp32-avxvnni-prfm.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x8c8-minmax-fp32-avxvnni-prfm.c @@ -49,7 +49,7 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__avxvnni_prfm( int8_t* c0 = c; const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x8c8-minmax-fp32-avxvnni.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x8c8-minmax-fp32-avxvnni.c index 9d5c68bb36c..d9413bfd580 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x8c8-minmax-fp32-avxvnni.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x8c8-minmax-fp32-avxvnni.c @@ -48,7 +48,7 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__avxvnni( int8_t* c0 = c; const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-2x8c8-minmax-fp32-avxvnni-prfm.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-2x8c8-minmax-fp32-avxvnni-prfm.c index afc6a43eeeb..b427ff211f8 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-2x8c8-minmax-fp32-avxvnni-prfm.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-2x8c8-minmax-fp32-avxvnni-prfm.c @@ -53,7 +53,7 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x8c8__avxvnni_prfm( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-2x8c8-minmax-fp32-avxvnni.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-2x8c8-minmax-fp32-avxvnni.c index a829dee8cb8..c592bb799bd 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-2x8c8-minmax-fp32-avxvnni.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-2x8c8-minmax-fp32-avxvnni.c @@ -52,7 +52,7 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x8c8__avxvnni( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-3x8c8-minmax-fp32-avxvnni-prfm.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-3x8c8-minmax-fp32-avxvnni-prfm.c index 2479cc83208..33206c46078 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-3x8c8-minmax-fp32-avxvnni-prfm.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-3x8c8-minmax-fp32-avxvnni-prfm.c @@ -57,7 +57,7 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x8c8__avxvnni_prfm( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-3x8c8-minmax-fp32-avxvnni.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-3x8c8-minmax-fp32-avxvnni.c index 25bdd53bbcd..628363384e9 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-3x8c8-minmax-fp32-avxvnni.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-3x8c8-minmax-fp32-avxvnni.c @@ -56,7 +56,7 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x8c8__avxvnni( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-avx512vnni-prfm.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-avx512vnni-prfm.c index ca9f48f00a4..d740ac197f9 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-avx512vnni-prfm.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-avx512vnni-prfm.c @@ -62,7 +62,7 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x16c4__avx512vnni_prfm( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-avx512vnni.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-avx512vnni.c index 03a462317f4..f1a11a466c4 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-avx512vnni.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-avx512vnni.c @@ -61,7 +61,7 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x16c4__avx512vnni( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8c8-minmax-fp32-avxvnni-prfm.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8c8-minmax-fp32-avxvnni-prfm.c index 16f3990f06a..36dfa4921ca 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8c8-minmax-fp32-avxvnni-prfm.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8c8-minmax-fp32-avxvnni-prfm.c @@ -61,7 +61,7 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x8c8__avxvnni_prfm( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8c8-minmax-fp32-avxvnni.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8c8-minmax-fp32-avxvnni.c index 9588e6e6d5c..a20c974bd05 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8c8-minmax-fp32-avxvnni.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8c8-minmax-fp32-avxvnni.c @@ -60,7 +60,7 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x8c8__avxvnni( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-5x16c4-minmax-avx512vnni-prfm.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-5x16c4-minmax-avx512vnni-prfm.c index 615d7f9bb12..c318e7ef56a 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-5x16c4-minmax-avx512vnni-prfm.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-5x16c4-minmax-avx512vnni-prfm.c @@ -66,7 +66,7 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_5x16c4__avx512vnni_prfm( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-5x16c4-minmax-avx512vnni.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-5x16c4-minmax-avx512vnni.c index e171f13f25f..bef8c5ec9f0 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-5x16c4-minmax-avx512vnni.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-5x16c4-minmax-avx512vnni.c @@ -65,7 +65,7 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_5x16c4__avx512vnni( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-5x16c8-minmax-avx512vnni-prfm.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-5x16c8-minmax-avx512vnni-prfm.c index aec7080f94a..d84a3e1c388 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-5x16c8-minmax-avx512vnni-prfm.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-5x16c8-minmax-avx512vnni-prfm.c @@ -66,13 +66,13 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_5x16c8__avx512vnni_prfm( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); // XNN_FORCE_REALIZATION(voutput_max_less_zero_point); // XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); + // XNN_FORCE_REALIZATION(voutput_min); do { __m512i vacc0x01234567 = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) w)); __m512i vacc0x89ABCDEF = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) ((const int32_t*) w + 8))); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-5x16c8-minmax-avx512vnni.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-5x16c8-minmax-avx512vnni.c index 20e48ab803a..b64ab0d66f2 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-5x16c8-minmax-avx512vnni.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-5x16c8-minmax-avx512vnni.c @@ -65,13 +65,13 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_5x16c8__avx512vnni( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); // XNN_FORCE_REALIZATION(voutput_max_less_zero_point); // XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); + // XNN_FORCE_REALIZATION(voutput_min); do { __m512i vacc0x01234567 = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) w)); __m512i vacc0x89ABCDEF = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) ((const int32_t*) w + 8))); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-5x8c8-minmax-fp32-avx256vnni-prfm.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-5x8c8-minmax-fp32-avx256vnni-prfm.c index c86694efbf8..487e58f7106 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-5x8c8-minmax-fp32-avx256vnni-prfm.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-5x8c8-minmax-fp32-avx256vnni-prfm.c @@ -65,7 +65,7 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_5x8c8__avx256vnni_prfm( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-5x8c8-minmax-fp32-avx256vnni.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-5x8c8-minmax-fp32-avx256vnni.c index 67d42748264..48db949387f 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-5x8c8-minmax-fp32-avx256vnni.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-5x8c8-minmax-fp32-avx256vnni.c @@ -64,7 +64,7 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_5x8c8__avx256vnni( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-5x8c8-minmax-fp32-avxvnni-prfm.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-5x8c8-minmax-fp32-avxvnni-prfm.c index 28239d52da5..d893e869457 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-5x8c8-minmax-fp32-avxvnni-prfm.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-5x8c8-minmax-fp32-avxvnni-prfm.c @@ -65,7 +65,7 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_5x8c8__avxvnni_prfm( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-5x8c8-minmax-fp32-avxvnni.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-5x8c8-minmax-fp32-avxvnni.c index 25404db9eea..6697a57f116 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-5x8c8-minmax-fp32-avxvnni.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-5x8c8-minmax-fp32-avxvnni.c @@ -64,7 +64,7 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_5x8c8__avxvnni( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-6x8c8-minmax-fp32-avxvnni-prfm.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-6x8c8-minmax-fp32-avxvnni-prfm.c index 9586fb92514..f82ff68f38d 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-6x8c8-minmax-fp32-avxvnni-prfm.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-6x8c8-minmax-fp32-avxvnni-prfm.c @@ -69,7 +69,7 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_6x8c8__avxvnni_prfm( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-6x8c8-minmax-fp32-avxvnni.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-6x8c8-minmax-fp32-avxvnni.c index 16b72ccb363..0b384292bf1 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-6x8c8-minmax-fp32-avxvnni.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-6x8c8-minmax-fp32-avxvnni.c @@ -68,7 +68,7 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_6x8c8__avxvnni( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-7x16c4-minmax-avx512vnni-prfm.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-7x16c4-minmax-avx512vnni-prfm.c index 0a5a146ce4b..1f68737b0a6 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-7x16c4-minmax-avx512vnni-prfm.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-7x16c4-minmax-avx512vnni-prfm.c @@ -74,7 +74,7 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_7x16c4__avx512vnni_prfm( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-7x16c4-minmax-avx512vnni.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-7x16c4-minmax-avx512vnni.c index d266b67361d..00c4dbc4bc4 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-7x16c4-minmax-avx512vnni.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-7x16c4-minmax-avx512vnni.c @@ -73,7 +73,7 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_7x16c4__avx512vnni( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-7x16c8-minmax-avx512vnni-prfm.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-7x16c8-minmax-avx512vnni-prfm.c index 780018067d8..e68410acc2f 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-7x16c8-minmax-avx512vnni-prfm.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-7x16c8-minmax-avx512vnni-prfm.c @@ -74,13 +74,13 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_7x16c8__avx512vnni_prfm( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); // XNN_FORCE_REALIZATION(voutput_max_less_zero_point); // XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); + // XNN_FORCE_REALIZATION(voutput_min); do { __m512i vacc0x01234567 = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) w)); __m512i vacc0x89ABCDEF = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) ((const int32_t*) w + 8))); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-7x16c8-minmax-avx512vnni.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-7x16c8-minmax-avx512vnni.c index 1cbc5c83811..ae751132706 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-7x16c8-minmax-avx512vnni.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-7x16c8-minmax-avx512vnni.c @@ -73,13 +73,13 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_7x16c8__avx512vnni( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); // XNN_FORCE_REALIZATION(voutput_max_less_zero_point); // XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); + // XNN_FORCE_REALIZATION(voutput_min); do { __m512i vacc0x01234567 = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) w)); __m512i vacc0x89ABCDEF = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) ((const int32_t*) w + 8))); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-7x8c8-minmax-fp32-avx256vnni-prfm.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-7x8c8-minmax-fp32-avx256vnni-prfm.c index d16e3dc5f57..451c41492ed 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-7x8c8-minmax-fp32-avx256vnni-prfm.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-7x8c8-minmax-fp32-avx256vnni-prfm.c @@ -73,7 +73,7 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_7x8c8__avx256vnni_prfm( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-7x8c8-minmax-fp32-avx256vnni.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-7x8c8-minmax-fp32-avx256vnni.c index 07e6fb1789f..8f7e78e3ca6 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-7x8c8-minmax-fp32-avx256vnni.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-7x8c8-minmax-fp32-avx256vnni.c @@ -72,7 +72,7 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_7x8c8__avx256vnni( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-7x8c8-minmax-fp32-avxvnni-prfm.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-7x8c8-minmax-fp32-avxvnni-prfm.c index 22ecd198f8a..b8d95a621de 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-7x8c8-minmax-fp32-avxvnni-prfm.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-7x8c8-minmax-fp32-avxvnni-prfm.c @@ -73,7 +73,7 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_7x8c8__avxvnni_prfm( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-7x8c8-minmax-fp32-avxvnni.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-7x8c8-minmax-fp32-avxvnni.c index f436f5378e5..243208254d5 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-7x8c8-minmax-fp32-avxvnni.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-7x8c8-minmax-fp32-avxvnni.c @@ -72,7 +72,7 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_7x8c8__avxvnni( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-8x16c4-minmax-avx512vnni-prfm.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-8x16c4-minmax-avx512vnni-prfm.c index 6ae28b0de15..94e92a6cbcf 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-8x16c4-minmax-avx512vnni-prfm.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-8x16c4-minmax-avx512vnni-prfm.c @@ -78,7 +78,7 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_8x16c4__avx512vnni_prfm( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-8x16c4-minmax-avx512vnni.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-8x16c4-minmax-avx512vnni.c index ed133158cc2..f2ffe54c21e 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-8x16c4-minmax-avx512vnni.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-8x16c4-minmax-avx512vnni.c @@ -77,7 +77,7 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_8x16c4__avx512vnni( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-8x16c8-minmax-avx512vnni-prfm.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-8x16c8-minmax-avx512vnni-prfm.c index 91ddaf544b2..38e844be90f 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-8x16c8-minmax-avx512vnni-prfm.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-8x16c8-minmax-avx512vnni-prfm.c @@ -78,13 +78,13 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_8x16c8__avx512vnni_prfm( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); // XNN_FORCE_REALIZATION(voutput_max_less_zero_point); // XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); + // XNN_FORCE_REALIZATION(voutput_min); do { __m512i vacc0x01234567 = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) w)); __m512i vacc0x89ABCDEF = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) ((const int32_t*) w + 8))); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-8x16c8-minmax-avx512vnni.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-8x16c8-minmax-avx512vnni.c index 77845b4d8df..5e3c39541e0 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-8x16c8-minmax-avx512vnni.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-8x16c8-minmax-avx512vnni.c @@ -77,13 +77,13 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_8x16c8__avx512vnni( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); // XNN_FORCE_REALIZATION(voutput_max_less_zero_point); // XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); + // XNN_FORCE_REALIZATION(voutput_min); do { __m512i vacc0x01234567 = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) w)); __m512i vacc0x89ABCDEF = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) ((const int32_t*) w + 8))); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-8x8c8-minmax-fp32-avx256vnni-prfm.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-8x8c8-minmax-fp32-avx256vnni-prfm.c index ac03710e7fc..28ac4636c8d 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-8x8c8-minmax-fp32-avx256vnni-prfm.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-8x8c8-minmax-fp32-avx256vnni-prfm.c @@ -77,7 +77,7 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_8x8c8__avx256vnni_prfm( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-8x8c8-minmax-fp32-avx256vnni.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-8x8c8-minmax-fp32-avx256vnni.c index 5d98870239f..206969adbb9 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-8x8c8-minmax-fp32-avx256vnni.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-8x8c8-minmax-fp32-avx256vnni.c @@ -76,7 +76,7 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_8x8c8__avx256vnni( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-8x8c8-minmax-fp32-avxvnni-prfm.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-8x8c8-minmax-fp32-avxvnni-prfm.c index 9924e034ad2..081729e666c 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-8x8c8-minmax-fp32-avxvnni-prfm.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-8x8c8-minmax-fp32-avxvnni-prfm.c @@ -77,7 +77,7 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_8x8c8__avxvnni_prfm( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-8x8c8-minmax-fp32-avxvnni.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-8x8c8-minmax-fp32-avxvnni.c index 12f1afc1261..3a251b0ce47 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-8x8c8-minmax-fp32-avxvnni.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-8x8c8-minmax-fp32-avxvnni.c @@ -76,7 +76,7 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_8x8c8__avxvnni( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-9x16c4-minmax-avx512vnni-prfm.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-9x16c4-minmax-avx512vnni-prfm.c index 893d81d54cb..9cfe4d0b469 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-9x16c4-minmax-avx512vnni-prfm.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-9x16c4-minmax-avx512vnni-prfm.c @@ -82,7 +82,7 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_9x16c4__avx512vnni_prfm( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-9x16c4-minmax-avx512vnni.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-9x16c4-minmax-avx512vnni.c index 56b9bdb96eb..1d13313c212 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-9x16c4-minmax-avx512vnni.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-9x16c4-minmax-avx512vnni.c @@ -81,7 +81,7 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_9x16c4__avx512vnni( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-9x16c8-minmax-avx512vnni-prfm.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-9x16c8-minmax-avx512vnni-prfm.c index b96a2fc2078..afe19221223 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-9x16c8-minmax-avx512vnni-prfm.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-9x16c8-minmax-avx512vnni-prfm.c @@ -82,13 +82,13 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_9x16c8__avx512vnni_prfm( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); // XNN_FORCE_REALIZATION(voutput_max_less_zero_point); // XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); + // XNN_FORCE_REALIZATION(voutput_min); do { __m512i vacc0x01234567 = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) w)); __m512i vacc0x89ABCDEF = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) ((const int32_t*) w + 8))); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-9x16c8-minmax-avx512vnni.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-9x16c8-minmax-avx512vnni.c index c015e4c86f9..a696814c9c6 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-9x16c8-minmax-avx512vnni.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-9x16c8-minmax-avx512vnni.c @@ -81,13 +81,13 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_9x16c8__avx512vnni( } const __m512i vsign_mask = _mm512_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m512 voutput_max_less_zero_point = _mm512_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m512i voutput_zero_point = _mm512_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); // XNN_FORCE_REALIZATION(voutput_max_less_zero_point); // XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); + // XNN_FORCE_REALIZATION(voutput_min); do { __m512i vacc0x01234567 = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) w)); __m512i vacc0x89ABCDEF = _mm512_cvtepu32_epi64(_mm256_load_si256((const __m256i*) ((const int32_t*) w + 8))); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-9x8c8-minmax-fp32-avx256vnni-prfm.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-9x8c8-minmax-fp32-avx256vnni-prfm.c index a429762d659..03dec67faa4 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-9x8c8-minmax-fp32-avx256vnni-prfm.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-9x8c8-minmax-fp32-avx256vnni-prfm.c @@ -81,7 +81,7 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_9x8c8__avx256vnni_prfm( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-9x8c8-minmax-fp32-avx256vnni.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-9x8c8-minmax-fp32-avx256vnni.c index ea7c8a752a8..8c9da413304 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-9x8c8-minmax-fp32-avx256vnni.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-9x8c8-minmax-fp32-avx256vnni.c @@ -80,7 +80,7 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_9x8c8__avx256vnni( } const __m256i vsign_mask = _mm256_set1_epi8(0x80); - XNN_FORCE_REALIZATION(vsign_mask); + // XNN_FORCE_REALIZATION(vsign_mask); const __m256 voutput_max_less_zero_point = _mm256_set1_ps((int32_t) params->fp32_scalar.output_max - (int32_t) params->fp32_scalar.output_zero_point); const __m256i voutput_zero_point = _mm256_set1_epi32(params->fp32_scalar.output_zero_point); const __m128i voutput_min = _mm_set1_epi8(params->fp32_scalar.output_min); diff --git a/test/qd8-f32-qc4w-gemm-minmax-2.cc b/test/qd8-f32-qc4w-gemm-minmax-2.cc index 9d1ba868ab5..a06a2b160a9 100644 --- a/test/qd8-f32-qc4w-gemm-minmax-2.cc +++ b/test/qd8-f32-qc4w-gemm-minmax-2.cc @@ -337,7 +337,7 @@ std::vector CreateTests1( std::string kbs = std::to_string(k_block); std::string kb2s = std::to_string(k_block * 2); std::string akbs = std::to_string(adj_k_block); - nr = nr * xnn_init_hardware_config()->vlenb / sizeof(int32_t); + nr = nr * xnn_init_hardware_config()->vlenb / sizeof(float); std::string nrs = std::to_string(nr); const GemmMicrokernelTester tester = GemmMicrokernelTester() diff --git a/test/qd8-f32-qc4w-gemm-minmax-3.cc b/test/qd8-f32-qc4w-gemm-minmax-3.cc index 5dfd462ac00..2579ff35439 100644 --- a/test/qd8-f32-qc4w-gemm-minmax-3.cc +++ b/test/qd8-f32-qc4w-gemm-minmax-3.cc @@ -337,7 +337,7 @@ std::vector CreateTests1( std::string kbs = std::to_string(k_block); std::string kb2s = std::to_string(k_block * 2); std::string akbs = std::to_string(adj_k_block); - nr = nr * xnn_init_hardware_config()->vlenb / sizeof(int32_t); + nr = nr * xnn_init_hardware_config()->vlenb / sizeof(float); std::string nrs = std::to_string(nr); const GemmMicrokernelTester tester = GemmMicrokernelTester() diff --git a/test/qd8-f32-qc4w-gemm-minmax-4.cc b/test/qd8-f32-qc4w-gemm-minmax-4.cc index 949c6b644cc..23ca0761638 100644 --- a/test/qd8-f32-qc4w-gemm-minmax-4.cc +++ b/test/qd8-f32-qc4w-gemm-minmax-4.cc @@ -337,7 +337,7 @@ std::vector CreateTests1( std::string kbs = std::to_string(k_block); std::string kb2s = std::to_string(k_block * 2); std::string akbs = std::to_string(adj_k_block); - nr = nr * xnn_init_hardware_config()->vlenb / sizeof(int32_t); + nr = nr * xnn_init_hardware_config()->vlenb / sizeof(float); std::string nrs = std::to_string(nr); const GemmMicrokernelTester tester = GemmMicrokernelTester() diff --git a/test/qd8-f32-qc4w-gemm-minmax.cc b/test/qd8-f32-qc4w-gemm-minmax.cc index e89fd7cf3fb..17f7c19a858 100644 --- a/test/qd8-f32-qc4w-gemm-minmax.cc +++ b/test/qd8-f32-qc4w-gemm-minmax.cc @@ -337,7 +337,7 @@ std::vector CreateTests1( std::string kbs = std::to_string(k_block); std::string kb2s = std::to_string(k_block * 2); std::string akbs = std::to_string(adj_k_block); - nr = nr * xnn_init_hardware_config()->vlenb / sizeof(int32_t); + nr = nr * xnn_init_hardware_config()->vlenb / sizeof(float); std::string nrs = std::to_string(nr); const GemmMicrokernelTester tester = GemmMicrokernelTester()