Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 36 additions & 44 deletions include/xsimd/arch/xsimd_avx2.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -572,64 +572,56 @@ namespace xsimd
}

// load_unaligned<batch_bool>
namespace detail

template <class T, class A, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
XSIMD_INLINE batch_bool<T, A> load_unaligned(bool const* mem, batch_bool<T, A>, requires_arch<avx2>) noexcept
{
template <class T>
XSIMD_INLINE __m256i load_bool_avx2(bool const* mem) noexcept
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
return { _mm256_sub_epi8(_mm256_set1_epi8(0), _mm256_loadu_si256((__m256i const*)mem)) };
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
auto bpack = _mm_loadu_si128((__m128i const*)mem);
return { _mm256_sub_epi16(_mm256_set1_epi8(0), _mm256_cvtepu8_epi16(bpack)) };
}
// GCC <12 have missing or buggy unaligned load intrinsics; use memcpy to work around this.
// GCC/Clang/MSVC will turn it into the correct load.
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
return _mm256_sub_epi8(_mm256_set1_epi8(0), _mm256_loadu_si256((__m256i const*)mem));
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
auto bpack = _mm_loadu_si128((__m128i const*)mem);
return _mm256_sub_epi16(_mm256_set1_epi8(0), _mm256_cvtepu8_epi16(bpack));
}
// GCC <12 have missing or buggy unaligned load intrinsics; use memcpy to work around this.
// GCC/Clang/MSVC will turn it into the correct load.
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
#if defined(__x86_64__)
uint64_t tmp;
memcpy(&tmp, mem, sizeof(tmp));
auto val = _mm_cvtsi64_si128(tmp);
uint64_t tmp;
memcpy(&tmp, mem, sizeof(tmp));
auto val = _mm_cvtsi64_si128(tmp);
#else
__m128i val;
memcpy(&val, mem, sizeof(uint64_t));
__m128i val;
memcpy(&val, mem, sizeof(uint64_t));
#endif
return _mm256_sub_epi32(_mm256_set1_epi8(0), _mm256_cvtepu8_epi32(val));
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
{
uint32_t tmp;
memcpy(&tmp, mem, sizeof(tmp));
return _mm256_sub_epi64(_mm256_set1_epi8(0), _mm256_cvtepu8_epi64(_mm_cvtsi32_si128(tmp)));
}
else
{
assert(false && "unsupported arch/op combination");
return __m256i {};
}
return { _mm256_sub_epi32(_mm256_set1_epi8(0), _mm256_cvtepu8_epi32(val)) };
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
{
uint32_t tmp;
memcpy(&tmp, mem, sizeof(tmp));
return { _mm256_sub_epi64(_mm256_set1_epi8(0), _mm256_cvtepu8_epi64(_mm_cvtsi32_si128(tmp))) };
}
else
{
assert(false && "unsupported arch/op combination");
return {};
}
}

template <class T, class A, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
XSIMD_INLINE batch_bool<T, A> load_unaligned(bool const* mem, batch_bool<T, A>, requires_arch<avx2>) noexcept
{
return batch_bool<T, A>(detail::load_bool_avx2<T>(mem));
}

template <class A>
XSIMD_INLINE batch_bool<float, A> load_unaligned(bool const* mem, batch_bool<float, A>, requires_arch<avx2>) noexcept
XSIMD_INLINE batch_bool<float, A> load_unaligned(bool const* mem, batch_bool<float, A>, requires_arch<avx2> r) noexcept
{
return batch_bool<float, A>(_mm256_castsi256_ps(detail::load_bool_avx2<float>(mem)));
return { _mm256_castsi256_ps(load_unaligned(mem, batch_bool<uint32_t, A> {}, r).data) };
}

template <class A>
XSIMD_INLINE batch_bool<double, A> load_unaligned(bool const* mem, batch_bool<double, A>, requires_arch<avx2>) noexcept
XSIMD_INLINE batch_bool<double, A> load_unaligned(bool const* mem, batch_bool<double, A>, requires_arch<avx2> r) noexcept
{
return batch_bool<double, A>(_mm256_castsi256_pd(detail::load_bool_avx2<double>(mem)));
return { _mm256_castsi256_pd(load_unaligned(mem, batch_bool<uint64_t, A> {}, r).data) };
}

// mask
Expand Down
8 changes: 4 additions & 4 deletions include/xsimd/arch/xsimd_avx512vbmi2.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,12 @@ namespace xsimd
template <class A>
XSIMD_INLINE batch<int16_t, A> compress(batch<int16_t, A> const& self, batch_bool<int16_t, A> const& mask, requires_arch<avx512vbmi2>) noexcept
{
return _mm512_maskz_compress_epi16(mask.mask(), self);
return _mm512_maskz_compress_epi16((__mmask32)mask.mask(), self);
}
template <class A>
XSIMD_INLINE batch<uint16_t, A> compress(batch<uint16_t, A> const& self, batch_bool<uint16_t, A> const& mask, requires_arch<avx512vbmi2>) noexcept
{
return _mm512_maskz_compress_epi16(mask.mask(), self);
return _mm512_maskz_compress_epi16((__mmask32)mask.mask(), self);
}
template <class A>
XSIMD_INLINE batch<int8_t, A> compress(batch<int8_t, A> const& self, batch_bool<int8_t, A> const& mask, requires_arch<avx512vbmi2>) noexcept
Expand All @@ -50,12 +50,12 @@ namespace xsimd
template <class A>
XSIMD_INLINE batch<int16_t, A> expand(batch<int16_t, A> const& self, batch_bool<int16_t, A> const& mask, requires_arch<avx512vbmi2>) noexcept
{
return _mm512_maskz_expand_epi16(mask.mask(), self);
return _mm512_maskz_expand_epi16((__mmask32)mask.mask(), self);
}
template <class A>
XSIMD_INLINE batch<uint16_t, A> expand(batch<uint16_t, A> const& self, batch_bool<uint16_t, A> const& mask, requires_arch<avx512vbmi2>) noexcept
{
return _mm512_maskz_expand_epi16(mask.mask(), self);
return _mm512_maskz_expand_epi16((__mmask32)mask.mask(), self);
}
template <class A>
XSIMD_INLINE batch<int8_t, A> expand(batch<int8_t, A> const& self, batch_bool<int8_t, A> const& mask, requires_arch<avx512vbmi2>) noexcept
Expand Down
20 changes: 20 additions & 0 deletions include/xsimd/arch/xsimd_sse2.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1043,6 +1043,26 @@ namespace xsimd
return _mm_loadu_pd(mem);
}

// load batch_bool

template <class A>
XSIMD_INLINE batch_bool<char, A> load_unaligned(bool const* mem, batch_bool<char, A>, requires_arch<sse2>) noexcept
{
return _mm_sub_epi8(_mm_set1_epi8(0), _mm_loadu_si128((__m128i const*)mem));
}

template <class A>
XSIMD_INLINE batch_bool<unsigned char, A> load_unaligned(bool const* mem, batch_bool<unsigned char, A>, requires_arch<sse2> r) noexcept
{
return { load_unaligned(mem, batch_bool<char, A> {}, r).data };
}

template <class A>
XSIMD_INLINE batch_bool<signed char, A> load_unaligned(bool const* mem, batch_bool<signed char, A>, requires_arch<sse2> r) noexcept
{
return { load_unaligned(mem, batch_bool<char, A> {}, r).data };
}

// load_complex
namespace detail
{
Expand Down
78 changes: 33 additions & 45 deletions include/xsimd/arch/xsimd_sse4_1.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -123,65 +123,53 @@ namespace xsimd
}

// load_unaligned<batch_bool>
namespace detail

template <class T, class A, class = typename std::enable_if<(std::is_integral<T>::value && sizeof(T) > 1), void>::type>
XSIMD_INLINE batch_bool<T, A> load_unaligned(bool const* mem, batch_bool<T, A>, requires_arch<sse4_1>) noexcept
{
template <class T>
XSIMD_INLINE __m128i load_bool_sse4_1(bool const* mem) noexcept
// GCC <12 have missing or buggy unaligned load intrinsics; use memcpy to work around this.
// GCC/Clang/MSVC will turn it into the correct load.
XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
return _mm_sub_epi8(_mm_set1_epi8(0), _mm_loadu_si128((__m128i const*)mem));
}
// GCC <12 have missing or buggy unaligned load intrinsics; use memcpy to work around this.
// GCC/Clang/MSVC will turn it into the correct load.
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
#if defined(__x86_64__)
uint64_t tmp;
memcpy(&tmp, mem, sizeof(tmp));
auto val = _mm_cvtsi64_si128(tmp);
uint64_t tmp;
memcpy(&tmp, mem, sizeof(tmp));
auto val = _mm_cvtsi64_si128(tmp);
#else
__m128i val;
memcpy(&val, mem, sizeof(uint64_t));
__m128i val;
memcpy(&val, mem, sizeof(uint64_t));
#endif
return _mm_sub_epi16(_mm_set1_epi8(0), _mm_cvtepu8_epi16(val));
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
uint32_t tmp;
memcpy(&tmp, mem, sizeof(tmp));
return _mm_sub_epi32(_mm_set1_epi8(0), _mm_cvtepu8_epi32(_mm_cvtsi32_si128(tmp)));
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
{
uint16_t tmp;
memcpy(&tmp, mem, sizeof(tmp));
return _mm_sub_epi64(_mm_set1_epi8(0), _mm_cvtepu8_epi64(_mm_cvtsi32_si128((uint32_t)tmp)));
}
else
{
assert(false && "unsupported arch/op combination");
return __m128i {};
}
return { _mm_sub_epi16(_mm_set1_epi8(0), _mm_cvtepu8_epi16(val)) };
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
uint32_t tmp;
memcpy(&tmp, mem, sizeof(tmp));
return { _mm_sub_epi32(_mm_set1_epi8(0), _mm_cvtepu8_epi32(_mm_cvtsi32_si128(tmp))) };
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
{
uint16_t tmp;
memcpy(&tmp, mem, sizeof(tmp));
return { _mm_sub_epi64(_mm_set1_epi8(0), _mm_cvtepu8_epi64(_mm_cvtsi32_si128((uint32_t)tmp))) };
}
else
{
assert(false && "unsupported arch/op combination");
return __m128i {};
}
}

template <class T, class A, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
XSIMD_INLINE batch_bool<T, A> load_unaligned(bool const* mem, batch_bool<T, A>, requires_arch<sse4_1>) noexcept
{
return batch_bool<T, A>(detail::load_bool_sse4_1<T>(mem));
}

template <class A>
XSIMD_INLINE batch_bool<float, A> load_unaligned(bool const* mem, batch_bool<float, A>, requires_arch<sse4_1>) noexcept
XSIMD_INLINE batch_bool<float, A> load_unaligned(bool const* mem, batch_bool<float, A>, requires_arch<sse4_1> r) noexcept
{
return batch_bool<float, A>(_mm_castsi128_ps(detail::load_bool_sse4_1<float>(mem)));
return { _mm_castsi128_ps(load_unaligned(mem, batch_bool<uint32_t, A> {}, r)) };
}

template <class A>
XSIMD_INLINE batch_bool<double, A> load_unaligned(bool const* mem, batch_bool<double, A>, requires_arch<sse4_1>) noexcept
XSIMD_INLINE batch_bool<double, A> load_unaligned(bool const* mem, batch_bool<double, A>, requires_arch<sse4_1> r) noexcept
{
return batch_bool<double, A>(_mm_castsi128_pd(detail::load_bool_sse4_1<double>(mem)));
return { _mm_castsi128_pd(load_unaligned(mem, batch_bool<uint64_t, A> {}, r)) };
}

// max
Expand Down