From 60f7c95c02a31d182784073ead50c51265b4240a Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Sun, 5 Oct 2025 21:02:26 +0200 Subject: [PATCH 1/2] Simplify load implementation from #1172 - Split some implementation that lived in sse4_1 while sse2 was a good home - Avoid auxiliary function --- include/xsimd/arch/xsimd_avx2.hpp | 80 +++++++++++++---------------- include/xsimd/arch/xsimd_sse2.hpp | 20 ++++++++ include/xsimd/arch/xsimd_sse4_1.hpp | 78 ++++++++++++---------------- 3 files changed, 89 insertions(+), 89 deletions(-) diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp index 6db583ff8..5addf79a1 100644 --- a/include/xsimd/arch/xsimd_avx2.hpp +++ b/include/xsimd/arch/xsimd_avx2.hpp @@ -572,64 +572,56 @@ namespace xsimd } // load_unaligned - namespace detail + + template ::value, void>::type> + XSIMD_INLINE batch_bool load_unaligned(bool const* mem, batch_bool, requires_arch) noexcept { - template - XSIMD_INLINE __m256i load_bool_avx2(bool const* mem) noexcept + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return { _mm256_sub_epi8(_mm256_set1_epi8(0), _mm256_loadu_si256((__m256i const*)mem)) }; + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + auto bpack = _mm_loadu_si128((__m128i const*)mem); + return { _mm256_sub_epi16(_mm256_set1_epi8(0), _mm256_cvtepu8_epi16(bpack)) }; + } + // GCC <12 have missing or buggy unaligned load intrinsics; use memcpy to work around this. + // GCC/Clang/MSVC will turn it into the correct load. + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { - XSIMD_IF_CONSTEXPR(sizeof(T) == 1) - { - return _mm256_sub_epi8(_mm256_set1_epi8(0), _mm256_loadu_si256((__m256i const*)mem)); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) - { - auto bpack = _mm_loadu_si128((__m128i const*)mem); - return _mm256_sub_epi16(_mm256_set1_epi8(0), _mm256_cvtepu8_epi16(bpack)); - } - // GCC <12 have missing or buggy unaligned load intrinsics; use memcpy to work around this. - // GCC/Clang/MSVC will turn it into the correct load. - else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) - { #if defined(__x86_64__) - uint64_t tmp; - memcpy(&tmp, mem, sizeof(tmp)); - auto val = _mm_cvtsi64_si128(tmp); + uint64_t tmp; + memcpy(&tmp, mem, sizeof(tmp)); + auto val = _mm_cvtsi64_si128(tmp); #else - __m128i val; - memcpy(&val, mem, sizeof(uint64_t)); + __m128i val; + memcpy(&val, mem, sizeof(uint64_t)); #endif - return _mm256_sub_epi32(_mm256_set1_epi8(0), _mm256_cvtepu8_epi32(val)); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) - { - uint32_t tmp; - memcpy(&tmp, mem, sizeof(tmp)); - return _mm256_sub_epi64(_mm256_set1_epi8(0), _mm256_cvtepu8_epi64(_mm_cvtsi32_si128(tmp))); - } - else - { - assert(false && "unsupported arch/op combination"); - return __m256i {}; - } + return { _mm256_sub_epi32(_mm256_set1_epi8(0), _mm256_cvtepu8_epi32(val)) }; + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + uint32_t tmp; + memcpy(&tmp, mem, sizeof(tmp)); + return { _mm256_sub_epi64(_mm256_set1_epi8(0), _mm256_cvtepu8_epi64(_mm_cvtsi32_si128(tmp))) }; + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; } - } - - template ::value, void>::type> - XSIMD_INLINE batch_bool load_unaligned(bool const* mem, batch_bool, requires_arch) noexcept - { - return batch_bool(detail::load_bool_avx2(mem)); } template - XSIMD_INLINE batch_bool load_unaligned(bool const* mem, batch_bool, requires_arch) noexcept + XSIMD_INLINE batch_bool load_unaligned(bool const* mem, batch_bool, requires_arch r) noexcept { - return batch_bool(_mm256_castsi256_ps(detail::load_bool_avx2(mem))); + return { _mm256_castsi256_ps(load_unaligned(mem, batch_bool {}, r).data) }; } template - XSIMD_INLINE batch_bool load_unaligned(bool const* mem, batch_bool, requires_arch) noexcept + XSIMD_INLINE batch_bool load_unaligned(bool const* mem, batch_bool, requires_arch r) noexcept { - return batch_bool(_mm256_castsi256_pd(detail::load_bool_avx2(mem))); + return { _mm256_castsi256_pd(load_unaligned(mem, batch_bool {}, r).data) }; } // mask diff --git a/include/xsimd/arch/xsimd_sse2.hpp b/include/xsimd/arch/xsimd_sse2.hpp index 01787de44..22f3cdf99 100644 --- a/include/xsimd/arch/xsimd_sse2.hpp +++ b/include/xsimd/arch/xsimd_sse2.hpp @@ -1043,6 +1043,26 @@ namespace xsimd return _mm_loadu_pd(mem); } + // load batch_bool + + template + XSIMD_INLINE batch_bool load_unaligned(bool const* mem, batch_bool, requires_arch) noexcept + { + return _mm_sub_epi8(_mm_set1_epi8(0), _mm_loadu_si128((__m128i const*)mem)); + } + + template + XSIMD_INLINE batch_bool load_unaligned(bool const* mem, batch_bool, requires_arch r) noexcept + { + return { load_unaligned(mem, batch_bool {}, r).data }; + } + + template + XSIMD_INLINE batch_bool load_unaligned(bool const* mem, batch_bool, requires_arch r) noexcept + { + return { load_unaligned(mem, batch_bool {}, r).data }; + } + // load_complex namespace detail { diff --git a/include/xsimd/arch/xsimd_sse4_1.hpp b/include/xsimd/arch/xsimd_sse4_1.hpp index ae01c8d02..b453d57cb 100644 --- a/include/xsimd/arch/xsimd_sse4_1.hpp +++ b/include/xsimd/arch/xsimd_sse4_1.hpp @@ -123,65 +123,53 @@ namespace xsimd } // load_unaligned - namespace detail + + template ::value && sizeof(T) > 1), void>::type> + XSIMD_INLINE batch_bool load_unaligned(bool const* mem, batch_bool, requires_arch) noexcept { - template - XSIMD_INLINE __m128i load_bool_sse4_1(bool const* mem) noexcept + // GCC <12 have missing or buggy unaligned load intrinsics; use memcpy to work around this. + // GCC/Clang/MSVC will turn it into the correct load. + XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { - XSIMD_IF_CONSTEXPR(sizeof(T) == 1) - { - return _mm_sub_epi8(_mm_set1_epi8(0), _mm_loadu_si128((__m128i const*)mem)); - } - // GCC <12 have missing or buggy unaligned load intrinsics; use memcpy to work around this. - // GCC/Clang/MSVC will turn it into the correct load. - else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) - { #if defined(__x86_64__) - uint64_t tmp; - memcpy(&tmp, mem, sizeof(tmp)); - auto val = _mm_cvtsi64_si128(tmp); + uint64_t tmp; + memcpy(&tmp, mem, sizeof(tmp)); + auto val = _mm_cvtsi64_si128(tmp); #else - __m128i val; - memcpy(&val, mem, sizeof(uint64_t)); + __m128i val; + memcpy(&val, mem, sizeof(uint64_t)); #endif - return _mm_sub_epi16(_mm_set1_epi8(0), _mm_cvtepu8_epi16(val)); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) - { - uint32_t tmp; - memcpy(&tmp, mem, sizeof(tmp)); - return _mm_sub_epi32(_mm_set1_epi8(0), _mm_cvtepu8_epi32(_mm_cvtsi32_si128(tmp))); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) - { - uint16_t tmp; - memcpy(&tmp, mem, sizeof(tmp)); - return _mm_sub_epi64(_mm_set1_epi8(0), _mm_cvtepu8_epi64(_mm_cvtsi32_si128((uint32_t)tmp))); - } - else - { - assert(false && "unsupported arch/op combination"); - return __m128i {}; - } + return { _mm_sub_epi16(_mm_set1_epi8(0), _mm_cvtepu8_epi16(val)) }; + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + uint32_t tmp; + memcpy(&tmp, mem, sizeof(tmp)); + return { _mm_sub_epi32(_mm_set1_epi8(0), _mm_cvtepu8_epi32(_mm_cvtsi32_si128(tmp))) }; + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + uint16_t tmp; + memcpy(&tmp, mem, sizeof(tmp)); + return { _mm_sub_epi64(_mm_set1_epi8(0), _mm_cvtepu8_epi64(_mm_cvtsi32_si128((uint32_t)tmp))) }; + } + else + { + assert(false && "unsupported arch/op combination"); + return __m128i {}; } - } - - template ::value, void>::type> - XSIMD_INLINE batch_bool load_unaligned(bool const* mem, batch_bool, requires_arch) noexcept - { - return batch_bool(detail::load_bool_sse4_1(mem)); } template - XSIMD_INLINE batch_bool load_unaligned(bool const* mem, batch_bool, requires_arch) noexcept + XSIMD_INLINE batch_bool load_unaligned(bool const* mem, batch_bool, requires_arch r) noexcept { - return batch_bool(_mm_castsi128_ps(detail::load_bool_sse4_1(mem))); + return { _mm_castsi128_ps(load_unaligned(mem, batch_bool {}, r)) }; } template - XSIMD_INLINE batch_bool load_unaligned(bool const* mem, batch_bool, requires_arch) noexcept + XSIMD_INLINE batch_bool load_unaligned(bool const* mem, batch_bool, requires_arch r) noexcept { - return batch_bool(_mm_castsi128_pd(detail::load_bool_sse4_1(mem))); + return { _mm_castsi128_pd(load_unaligned(mem, batch_bool {}, r)) }; } // max From 51e514b8f3fe5dab6ac0b67a6744d79a0197032a Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Sun, 5 Oct 2025 22:22:23 +0200 Subject: [PATCH 2/2] Fix conversion warning for avx512vbmi2 mask --- include/xsimd/arch/xsimd_avx512vbmi2.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/xsimd/arch/xsimd_avx512vbmi2.hpp b/include/xsimd/arch/xsimd_avx512vbmi2.hpp index c5a55560f..d6f56307b 100644 --- a/include/xsimd/arch/xsimd_avx512vbmi2.hpp +++ b/include/xsimd/arch/xsimd_avx512vbmi2.hpp @@ -28,12 +28,12 @@ namespace xsimd template XSIMD_INLINE batch compress(batch const& self, batch_bool const& mask, requires_arch) noexcept { - return _mm512_maskz_compress_epi16(mask.mask(), self); + return _mm512_maskz_compress_epi16((__mmask32)mask.mask(), self); } template XSIMD_INLINE batch compress(batch const& self, batch_bool const& mask, requires_arch) noexcept { - return _mm512_maskz_compress_epi16(mask.mask(), self); + return _mm512_maskz_compress_epi16((__mmask32)mask.mask(), self); } template XSIMD_INLINE batch compress(batch const& self, batch_bool const& mask, requires_arch) noexcept @@ -50,12 +50,12 @@ namespace xsimd template XSIMD_INLINE batch expand(batch const& self, batch_bool const& mask, requires_arch) noexcept { - return _mm512_maskz_expand_epi16(mask.mask(), self); + return _mm512_maskz_expand_epi16((__mmask32)mask.mask(), self); } template XSIMD_INLINE batch expand(batch const& self, batch_bool const& mask, requires_arch) noexcept { - return _mm512_maskz_expand_epi16(mask.mask(), self); + return _mm512_maskz_expand_epi16((__mmask32)mask.mask(), self); } template XSIMD_INLINE batch expand(batch const& self, batch_bool const& mask, requires_arch) noexcept