From 2af87f5681c9157ccf17849a0da2dcb2eb33a1b5 Mon Sep 17 00:00:00 2001
From: Marco Barbone <mbarbone@flatironinstitute.org>
Date: Mon, 27 Apr 2026 18:13:55 -0400
Subject: [PATCH] perf: collapse same-type aligned batch_bool_constant load to
 a select
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The cross-type ``load_masked`` overload for ``batch_bool_constant`` builds
the result via a per-lane scalar buffer:

    for (i = 0; i < size; ++i)
        buffer[i] = mask[i] ? T(mem[i]) : T(0);

GCC -O3 -DNDEBUG folds this for wide types (4-lane f32: 4 instructions
``movd + pinsrq``) but not for narrow types — for a 16-lane uint8_t mask
on SSE4.2 it emits ~50 asm lines of stack ``mov``/``shl``/``and``/``or``
round-trips through ``-0x18(%rsp)``.

Add a same-type, aligned-mode overload that lowers to ``select``
against the constant mask. Aligned mode guarantees the whole vector
lives in one alignment unit (``A::alignment() >= sizeof(batch<T, A>)``
on every common-fallback arch), so an unconditional load cannot fault
on inactive lanes. The new overload is more specialized than the
existing ``T_in, T_out, alignment`` template, so it wins overload
resolution for the same-type aligned case while leaving cross-type and
unaligned paths untouched.

Codegen probe (``g++ -O3 -DNDEBUG -msse4.2``):

  function                            before  after
  load_aligned_const_u8 (mixed mask)  ~50 inst  2 inst (``pand mem, k``)
  load_aligned_const_f32 (T,F,T,F)    4 inst    2 inst (``pxor + blendps``)

Tests: 6 of the 7 multi-arch builds (SSE2, SSE4.1, AVX2, AVX-512 via
sde64, RVV via qemu, emulated256) pass the full ``test_xsimd`` suite.
The seventh, AArch64 via qemu, shows 8 test failures in ``[basic api]
store_as(bool*, batch_bool)``; these are pre-existing, reproduce on
pristine master, and are unrelated to this change.
---
 .../xsimd/arch/common/xsimd_common_memory.hpp  | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)
diff --git a/include/xsimd/arch/common/xsimd_common_memory.hpp b/include/xsimd/arch/common/xsimd_common_memory.hpp
index c8038334a..3b2f271b2 100644
--- a/include/xsimd/arch/common/xsimd_common_memory.hpp
+++ b/include/xsimd/arch/common/xsimd_common_memory.hpp
@@ -374,6 +374,24 @@ namespace xsimd
             return batch<T_out, A>::load(buffer.data(), aligned_mode {});
         }
 
+        // Same-type, aligned compile-time mask: lower to ``select`` against
+        // the constant mask. Aligned mode guarantees the whole vector lives
+        // inside one alignment unit, so an unconditional load cannot fault on
+        // inactive lanes. Collapses to one ``pand mem, const_mask`` (or one
+        // masked-blend) per call site, instead of the per-lane stack-buffer
+        // round-trip the cross-type generic overload above emits — which the
+        // compiler folds for wide types (f32/f64 → 4 inst) but NOT for narrow
+        // types like uint8_t (~50 inst of stack ``mov``/``shl``/``and``/``or``
+        // round-trips on SSE4.2 -O3 -DNDEBUG).
+        template <class A, class T, bool... Values>
+        XSIMD_INLINE batch<T, A>
+        load_masked(T const* mem, batch_bool_constant<T, A, Values...> mask, convert<T>, aligned_mode, requires_arch<common>) noexcept
+        {
+            // Load every lane up front; the select below zeroes the inactive ones.
+            const batch<T, A> loaded = batch<T, A>::load_aligned(mem);
+            return select(mask.as_batch_bool(), loaded, batch<T, A>(T(0)));
+        }
+
         template <class A, class T_in, class T_out, bool... Values, class alignment>
         XSIMD_INLINE void
         store_masked(T_out* mem, batch<T_in, A> const& src, batch_bool_constant<T_in, A, Values...>, alignment, requires_arch<common>) noexcept