From 5af4b923e1a475c0589381351257e8cdb6ce5ca5 Mon Sep 17 00:00:00 2001
From: serge-sans-paille
Date: Sun, 5 Oct 2025 22:03:57 +0200
Subject: [PATCH] Extend #1172 approach to avx512

---
Notes:
    Commentary only; it sits between the "---" separator and the diffstat,
    so it is not part of the change itself.

    What the hunk does to the bool-array loaders load_unaligned and
    load_aligned in xsimd_avx512bw.hpp:

    * Before: both overloads were enabled only when batch_bool::size == 64.
      They read one __m512i from mem and returned
      _mm512_cmpgt_epu8_mask(bool_val, zero).

    * After: the enable_if condition is a "::value" trait instead of the
      size test, and the body picks a branch on batch_bool::size through
      XSIMD_IF_CONSTEXPR:
          64 : __m512i load, _mm512_cmpgt_epu8_mask
          32 : __m256i load, _mm512_cvtepu8_epi16, _mm512_cmpgt_epu16_mask
          16 : __m128i load, _mm512_cvtepu8_epi32, _mm512_cmpgt_epu32_mask
           8 : _mm_loadl_epi64, _mm512_cvtepu8_epi64, _mm512_cmpgt_epu64_mask
      Any other size reaches assert(false && "unexpected batch size") and
      returns {}.

    * Every branch compares the (widened) bytes against
      _mm512_setzero_si512() with an unsigned greater-than, so a non-zero
      byte in mem produces a set bit in the resulting mask.

    * Every result is cast to mask_type, an alias for
      batch_bool::register_type, before being returned.

    * The two overloads differ only in the load intrinsic used for sizes
      64/32/16 (loadu_* vs. load_*); the size == 8 branch uses
      _mm_loadl_epi64 in both.

    NOTE(review): template argument lists look stripped in this copy of the
    patch (e.g. "template ::value, void>::type>", "batch_bool::size",
    "requires_arch"), and the From: line carries no address. Presumably an
    extraction artefact - confirm against the upstream commit before applying.

    NOTE(review): the (mask_type) casts presumably keep all branches
    well-formed when XSIMD_IF_CONSTEXPR is not a true "if constexpr" -
    confirm against the macro definition.

 include/xsimd/arch/xsimd_avx512bw.hpp | 60 ++++++++++++++++++++++++---
 1 file changed, 54 insertions(+), 6 deletions(-)

diff --git a/include/xsimd/arch/xsimd_avx512bw.hpp b/include/xsimd/arch/xsimd_avx512bw.hpp
index 14858e90c..e15e4433a 100644
--- a/include/xsimd/arch/xsimd_avx512bw.hpp
+++ b/include/xsimd/arch/xsimd_avx512bw.hpp
@@ -316,18 +316,66 @@ namespace xsimd
         }
 
         // load
-        template ::size == 64, void>::type>
+        template ::value, void>::type>
         XSIMD_INLINE batch_bool load_unaligned(bool const* mem, batch_bool, requires_arch) noexcept
         {
-            __m512i bool_val = _mm512_loadu_si512((__m512i const*)mem);
-            return _mm512_cmpgt_epu8_mask(bool_val, _mm512_setzero_si512());
+            using mask_type = typename batch_bool::register_type;
+            XSIMD_IF_CONSTEXPR(batch_bool::size == 64)
+            {
+                __m512i bool_val = _mm512_loadu_si512((__m512i const*)mem);
+                return (mask_type)_mm512_cmpgt_epu8_mask(bool_val, _mm512_setzero_si512());
+            }
+            else XSIMD_IF_CONSTEXPR(batch_bool::size == 32)
+            {
+                __m256i bpack = _mm256_loadu_si256((__m256i const*)mem);
+                return (mask_type)_mm512_cmpgt_epu16_mask(_mm512_cvtepu8_epi16(bpack), _mm512_setzero_si512());
+            }
+            else XSIMD_IF_CONSTEXPR(batch_bool::size == 16)
+            {
+                __m128i bpack = _mm_loadu_si128((__m128i const*)mem);
+                return (mask_type)_mm512_cmpgt_epu32_mask(_mm512_cvtepu8_epi32(bpack), _mm512_setzero_si512());
+            }
+            else XSIMD_IF_CONSTEXPR(batch_bool::size == 8)
+            {
+                __m128i bpack = _mm_loadl_epi64((__m128i const*)mem);
+                return (mask_type)_mm512_cmpgt_epu64_mask(_mm512_cvtepu8_epi64(bpack), _mm512_setzero_si512());
+            }
+            else
+            {
+                assert(false && "unexpected batch size");
+                return {};
+            }
         }
 
-        template ::size == 64, void>::type>
+        template ::value, void>::type>
         XSIMD_INLINE batch_bool load_aligned(bool const* mem, batch_bool, requires_arch) noexcept
         {
-            __m512i bool_val = _mm512_load_si512((__m512i const*)mem);
-            return _mm512_cmpgt_epu8_mask(bool_val, _mm512_setzero_si512());
+            using mask_type = typename batch_bool::register_type;
+            XSIMD_IF_CONSTEXPR(batch_bool::size == 64)
+            {
+                __m512i bool_val = _mm512_load_si512((__m512i const*)mem);
+                return (mask_type)_mm512_cmpgt_epu8_mask(bool_val, _mm512_setzero_si512());
+            }
+            else XSIMD_IF_CONSTEXPR(batch_bool::size == 32)
+            {
+                __m256i bpack = _mm256_load_si256((__m256i const*)mem);
+                return (mask_type)_mm512_cmpgt_epu16_mask(_mm512_cvtepu8_epi16(bpack), _mm512_setzero_si512());
+            }
+            else XSIMD_IF_CONSTEXPR(batch_bool::size == 16)
+            {
+                __m128i bpack = _mm_load_si128((__m128i const*)mem);
+                return (mask_type)_mm512_cmpgt_epu32_mask(_mm512_cvtepu8_epi32(bpack), _mm512_setzero_si512());
+            }
+            else XSIMD_IF_CONSTEXPR(batch_bool::size == 8)
+            {
+                __m128i bpack = _mm_loadl_epi64((__m128i const*)mem);
+                return (mask_type)_mm512_cmpgt_epu64_mask(_mm512_cvtepu8_epi64(bpack), _mm512_setzero_si512());
+            }
+            else
+            {
+                assert(false && "unexpected batch size");
+                return {};
+            }
         }
 
         // max