diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp index 87e5dafae..a9e30b4ab 100644 --- a/include/xsimd/arch/xsimd_avx2.hpp +++ b/include/xsimd/arch/xsimd_avx2.hpp @@ -573,7 +573,7 @@ namespace xsimd // load_unaligned - template ::value>::type> + template ::value>::type> XSIMD_INLINE batch_bool load_unaligned(bool const* mem, batch_bool, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) diff --git a/include/xsimd/arch/xsimd_neon.hpp b/include/xsimd/arch/xsimd_neon.hpp index ad2a25246..27c532d9b 100644 --- a/include/xsimd/arch/xsimd_neon.hpp +++ b/include/xsimd/arch/xsimd_neon.hpp @@ -574,51 +574,44 @@ namespace xsimd } /* batch bool version */ - template = 0> + template = 0> XSIMD_INLINE batch_bool load_unaligned(bool const* mem, batch_bool, requires_arch) noexcept { auto vmem = load_unaligned((unsigned char const*)mem, convert {}, A {}); return { 0 - vmem.data }; } - template = 0> + template = 0> XSIMD_INLINE batch_bool load_aligned(bool const* mem, batch_bool t, requires_arch r) noexcept { return load_unaligned(mem, t, r); } - template = 0> + template = 0> XSIMD_INLINE batch_bool load_unaligned(bool const* mem, batch_bool, requires_arch) noexcept { uint16x8_t vmem = vmovl_u8(vld1_u8((unsigned char const*)mem)); return { 0 - vmem }; } - template = 0> + template = 0> XSIMD_INLINE batch_bool load_aligned(bool const* mem, batch_bool t, requires_arch r) noexcept { return load_unaligned(mem, t, r); } - template = 0> + template = 0> XSIMD_INLINE batch_bool load_unaligned(bool const* mem, batch_bool, requires_arch) noexcept { uint8x8_t tmp = vreinterpret_u8_u32(vset_lane_u32(*(unsigned int*)mem, vdup_n_u32(0), 0)); return { 0 - vmovl_u16(vget_low_u16(vmovl_u8(tmp))) }; } - template = 0> + template = 0> XSIMD_INLINE batch_bool load_aligned(bool const* mem, batch_bool t, requires_arch r) noexcept { return load_unaligned(mem, t, r); } - template - XSIMD_INLINE batch_bool load_aligned(bool const* mem, batch_bool t, requires_arch r) noexcept - { - uint8x8_t tmp = vreinterpret_u8_u32(vset_lane_u32(*(unsigned int*)mem, vdup_n_u32(0), 0)); - return { 0 - vmovl_u16(vget_low_u16(vmovl_u8(tmp))) }; - } - /********* * store * *********/ diff --git a/include/xsimd/arch/xsimd_sse4_1.hpp b/include/xsimd/arch/xsimd_sse4_1.hpp index 130d50476..1a64fc878 100644 --- a/include/xsimd/arch/xsimd_sse4_1.hpp +++ b/include/xsimd/arch/xsimd_sse4_1.hpp @@ -125,7 +125,7 @@ namespace xsimd // load_unaligned - template ::value && sizeof(T) > 1)>::type> + template ::value && sizeof(T) > 1)>::type> XSIMD_INLINE batch_bool load_unaligned(bool const* mem, batch_bool, requires_arch) noexcept { // GCC <12 have missing or buggy unaligned load intrinsics; use memcpy to work around this.