From ce57bfe4aa2fca037018e8cd47aa1f6c1e58304d Mon Sep 17 00:00:00 2001
From: Marco Barbone <mbarbone@flatironinstitute.org>
Date: Thu, 16 Apr 2026 14:09:10 -0400
Subject: [PATCH] perf: add native AVX2 uint64/int64 mul kernel
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously batch<[u]int64_t, avx2> mul fell through to the AVX kernel. AVX has
no integer mul, so that kernel in turn fell through to SSE4.1 — splitting each
256-bit register into two 128-bit halves (vextracti128/vinserti128) and running
the mul_epu32 sequence twice.

Add a sizeof(T)==8 specialization using _mm256_mul_epu32 directly, mirroring
the SSE4.1 pattern with 256-bit intrinsics. Generates 8 ymm ops: 2 vpshufd,
3 vpmuludq, 2 vpaddq, 1 vpsllq — no lane splitting.

AVX512F (without DQ) also benefits since it forwards to the AVX2 kernel.
---
 include/xsimd/arch/xsimd_avx2.hpp | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp
index 28545e75a..cfa22d18b 100644
--- a/include/xsimd/arch/xsimd_avx2.hpp
+++ b/include/xsimd/arch/xsimd_avx2.hpp
@@ -912,6 +912,16 @@ namespace xsimd
             {
                 return _mm256_mullo_epi32(self, other);
             }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                return _mm256_add_epi64(
+                    _mm256_mul_epu32(self, other),
+                    _mm256_slli_epi64(
+                        _mm256_add_epi64(
+                            _mm256_mul_epu32(other, _mm256_shuffle_epi32(self, _MM_SHUFFLE(2, 3, 0, 1))),
+                            _mm256_mul_epu32(self, _mm256_shuffle_epi32(other, _MM_SHUFFLE(2, 3, 0, 1)))),
+                        32));
+            }
             else
             {
                 return mul(self, other, avx {});