From ce57bfe4aa2fca037018e8cd47aa1f6c1e58304d Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Thu, 16 Apr 2026 14:09:10 -0400 Subject: [PATCH] perf: add native AVX2 uint64/int64 mul kernel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, mul for batch<[u]int64_t, avx2> fell through to the AVX kernel, which has no integer multiply and therefore fell through again to SSE4.1 — splitting each 256-bit register into two 128-bit halves (vextracti128/vinserti128) and running the mul_epu32 sequence twice. Add a sizeof(T)==8 specialization that uses _mm256_mul_epu32 directly, mirroring the SSE4.1 pattern with 256-bit intrinsics. The new kernel generates 8 ymm ops: 2 vpshufd, 3 vpmuludq, 2 vpaddq, 1 vpsllq — with no lane splitting. AVX512F (without DQ) also benefits, since it forwards to the AVX2 kernel. --- include/xsimd/arch/xsimd_avx2.hpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp index 28545e75a..cfa22d18b 100644 --- a/include/xsimd/arch/xsimd_avx2.hpp +++ b/include/xsimd/arch/xsimd_avx2.hpp @@ -912,6 +912,16 @@ namespace xsimd { return _mm256_mullo_epi32(self, other); } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm256_add_epi64( + _mm256_mul_epu32(self, other), + _mm256_slli_epi64( + _mm256_add_epi64( + _mm256_mul_epu32(other, _mm256_shuffle_epi32(self, _MM_SHUFFLE(2, 3, 0, 1))), + _mm256_mul_epu32(self, _mm256_shuffle_epi32(other, _MM_SHUFFLE(2, 3, 0, 1)))), + 32)); + } else { return mul(self, other, avx {});