diff --git a/include/xsimd/arch/common/xsimd_common_memory.hpp b/include/xsimd/arch/common/xsimd_common_memory.hpp
index c8038334a..3b2f271b2 100644
--- a/include/xsimd/arch/common/xsimd_common_memory.hpp
+++ b/include/xsimd/arch/common/xsimd_common_memory.hpp
@@ -374,6 +374,36 @@ namespace xsimd
             return batch<T_out, A>::load(buffer.data(), aligned_mode {});
         }
 
+        // Same-type, aligned load under a compile-time mask: perform one full
+        // aligned load and ``select`` it against zero with the constant mask,
+        // so inactive lanes come back as T(0).
+        //
+        // Why the unconditional load is acceptable: an aligned vector never
+        // straddles an alignment unit, so if any lane is mapped the inactive
+        // lanes are mapped too and the load cannot fault on them.
+        // NOTE(review): this reasoning needs at least one dereferenceable
+        // lane; confirm callers never reach this overload with an all-false
+        // mask and a pointer that must not be read.
+        //
+        // Why a dedicated overload: this form lowers to a single
+        // ``pand mem, const_mask`` (or one masked blend) per call site. The
+        // generic cross-type overload goes through a per-lane stack buffer,
+        // which the compiler folds for wide types (f32/f64 → 4 inst) but NOT
+        // for narrow ones such as uint8_t (~50 inst of stack
+        // ``mov``/``shl``/``and``/``or`` on SSE4.2 -O3 -DNDEBUG).
+        //
+        // ``mask`` is forwarded to ``select`` as a batch_bool_constant so the
+        // constant-mask overload is used and the lane pattern stays visible
+        // at compile time.
+        template <class A, class T, bool... Values>
+        XSIMD_INLINE batch<T, A>
+        load_masked(T const* mem, batch_bool_constant<T, A, Values...> mask, convert<T>, aligned_mode, requires_arch<common>) noexcept
+        {
+            return select(mask,
+                          batch<T, A>::load_aligned(mem),
+                          batch<T, A>(T(0)));
+        }
+
         template <class A, class T_in, class T_out, bool... Values, class alignment>
         XSIMD_INLINE void
         store_masked(T_out* mem, batch<T_in, A> const& src, batch_bool_constant<T_in, A, Values...>, alignment, requires_arch<common>) noexcept