diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index d74111efb..03a914bda 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -32,6 +32,7 @@ jobs: - { compiler: 'clang', version: '18', flags: 'avx512' } - { compiler: 'clang', version: '18', flags: 'avx_128' } - { compiler: 'clang', version: '18', flags: 'avx2_128' } + - { compiler: 'clang', version: '18', flags: 'avx512vl_128' } - { compiler: 'clang', version: '18', flags: 'avx512vl_256' } steps: - name: Setup compiler @@ -97,6 +98,10 @@ jobs: if [[ '${{ matrix.sys.flags }}' == 'avx512' ]]; then CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=skylake-avx512" fi + if [[ '${{ matrix.sys.flags }}' == 'avx512vl_128' ]]; then + CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=skylake-avx512" + CXXFLAGS="$CXX_FLAGS -DXSIMD_DEFAULT_ARCH=avx512vl_128" + fi if [[ '${{ matrix.sys.flags }}' == 'avx512vl_256' ]]; then CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=skylake-avx512" CXXFLAGS="$CXX_FLAGS -DXSIMD_DEFAULT_ARCH=avx512vl_256" diff --git a/include/xsimd/arch/xsimd_avx512f.hpp b/include/xsimd/arch/xsimd_avx512f.hpp index 16a8e7ba0..6a7316722 100644 --- a/include/xsimd/arch/xsimd_avx512f.hpp +++ b/include/xsimd/arch/xsimd_avx512f.hpp @@ -1518,15 +1518,40 @@ namespace xsimd { // Adapted from https://github.com/serge-sans-paille/fast-bitset-from-bool-array // Generate a bitset from an array of boolean. 
// Adapted from https://github.com/serge-sans-paille/fast-bitset-from-bool-array
//
// Generate a bitset from an array of N booleans: bit j of the returned value
// is unpacked[j], for j in [0, N). Each input byte is expected to hold 0 or 1.
//
// How it works: the N bytes are copied into an unsigned integer `data`, so
// that byte j sits at bit 8*j (little-endian layout). `data` is then
// multiplied by a constant whose set bits are at positions 8*(N-1) - 7*j for
// j in [0, N). The partial product of byte j with "its" magic bit lands at
// bit 8*(N-1) + j, so the most significant byte of the N-byte product holds
// the packed bits in order. All partial products hit distinct bit positions,
// hence no carry can disturb that byte.
//
// The magic constant therefore depends on N: it is the top N bytes of the
// 64-bit constant, NOT its low terms. Reusing the low terms
// (0x80 + 0x4000 + ...) for N == 4 or N == 2 would leave the result shifted
// left by 8 - N bits (e.g. {1,0,0,0} would yield 0x10 instead of 0x01).
template <std::size_t N>
XSIMD_INLINE unsigned char tobitset(unsigned char unpacked[N])
{
    static_assert(N == 8 || N == 4 || N == 2, "valid pack size");
    XSIMD_IF_CONSTEXPR(N == 8)
    {
        uint64_t data;
        memcpy(&data, unpacked, sizeof(uint64_t));

        // bits 7, 14, 21, 28, 35, 42, 49, 56
        const uint64_t magic = (0x80 + 0x4000 + 0x200000 + 0x10000000 + 0x0800000000 + 0x040000000000 + 0x02000000000000 + 0x0100000000000000);

        unsigned char res = ((data * magic) >> 56) & 0xFF;
        return res;
    }
    else XSIMD_IF_CONSTEXPR(N == 4)
    {
        uint32_t data;
        memcpy(&data, unpacked, sizeof(uint32_t));

        // bits 3, 10, 17, 24 (top four bytes of the 64-bit constant)
        const uint32_t magic = (0x08 + 0x0400 + 0x020000 + 0x01000000);

        unsigned char res = ((data * magic) >> 24) & 0xFF;
        return res;
    }
    else XSIMD_IF_CONSTEXPR(N == 2)
    {
        uint16_t data;
        memcpy(&data, unpacked, sizeof(uint16_t));

        // bits 1, 8 (top two bytes of the 64-bit constant)
        const uint32_t magic = (0x02 + 0x0100);

        // Multiply in 32 bits explicitly: uint16_t operands would be
        // promoted to int anyway; the final mask keeps only bits 8..15.
        unsigned char res = ((static_cast<uint32_t>(data) * magic) >> 8) & 0xFF;
        return res;
    }
}
Guelton * + * Copyright (c) Marco Barbone * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_AVX512VL_128_HPP +#define XSIMD_AVX512VL_128_HPP + +#include "../types/xsimd_avx512vl_register.hpp" +#include "../types/xsimd_batch_constant.hpp" + +#include + +namespace xsimd +{ + namespace kernel + { + using namespace types; + + namespace detail + { + template + XSIMD_INLINE batch_bool compare_int_avx512vl_128(batch const& self, batch const& other) noexcept + { + using register_type = typename batch_bool::register_type; + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + // shifting to take sign into account + uint64_t mask_low0 = _mm_cmp_epi32_mask((batch(self.data) & batch(0x000000FF)) << 24, + (batch(other.data) & batch(0x000000FF)) << 24, + Cmp); + uint64_t mask_low1 = _mm_cmp_epi32_mask((batch(self.data) & batch(0x0000FF00)) << 16, + (batch(other.data) & batch(0x0000FF00)) << 16, + Cmp); + uint64_t mask_high0 = _mm_cmp_epi32_mask((batch(self.data) & batch(0x00FF0000)) << 8, + (batch(other.data) & batch(0x00FF0000)) << 8, + Cmp); + uint64_t mask_high1 = _mm_cmp_epi32_mask((batch(self.data) & batch(0xFF000000)), + (batch(other.data) & batch(0xFF000000)), + Cmp); + uint64_t mask = 0; + for (unsigned i = 0; i < 8; ++i) + { + mask |= (mask_low0 & (uint64_t(1) << i)) << (3 * i + 0); + mask |= (mask_low1 & (uint64_t(1) << i)) << (3 * i + 1); + mask |= (mask_high0 & (uint64_t(1) << i)) << (3 * i + 2); + mask |= (mask_high1 & (uint64_t(1) << i)) << (3 * i + 3); + } + return (register_type)mask; + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + // shifting to take sign into account + uint16_t mask_low = _mm_cmp_epi32_mask((batch(self.data) & batch(0x0000FFFF)) << 16, + (batch(other.data) & batch(0x0000FFFF)) << 16, + Cmp); + uint16_t mask_high = 
_mm_cmp_epi32_mask((batch(self.data) & batch(0xFFFF0000)), + (batch(other.data) & batch(0xFFFF0000)), + Cmp); + return static_cast(morton(mask_low, mask_high)); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return (register_type)_mm_cmp_epi32_mask(self, other, Cmp); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return (register_type)_mm_cmp_epi64_mask(self, other, Cmp); + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + uint64_t mask_low0 = _mm_cmp_epu32_mask((batch(self.data) & batch(0x000000FF)), (batch(other.data) & batch(0x000000FF)), Cmp); + uint64_t mask_low1 = _mm_cmp_epu32_mask((batch(self.data) & batch(0x0000FF00)), (batch(other.data) & batch(0x0000FF00)), Cmp); + uint64_t mask_high0 = _mm_cmp_epu32_mask((batch(self.data) & batch(0x00FF0000)), (batch(other.data) & batch(0x00FF0000)), Cmp); + uint64_t mask_high1 = _mm_cmp_epu32_mask((batch(self.data) & batch(0xFF000000)), (batch(other.data) & batch(0xFF000000)), Cmp); + uint64_t mask = 0; + for (unsigned i = 0; i < 8; ++i) + { + mask |= (mask_low0 & (uint64_t(1) << i)) << (3 * i + 0); + mask |= (mask_low1 & (uint64_t(1) << i)) << (3 * i + 1); + mask |= (mask_high0 & (uint64_t(1) << i)) << (3 * i + 2); + mask |= (mask_high1 & (uint64_t(1) << i)) << (3 * i + 3); + } + return (register_type)mask; + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + uint16_t mask_low = _mm_cmp_epu32_mask((batch(self.data) & batch(0x0000FFFF)), (batch(other.data) & batch(0x0000FFFF)), Cmp); + uint16_t mask_high = _mm_cmp_epu32_mask((batch(self.data) & batch(0xFFFF0000)), (batch(other.data) & batch(0xFFFF0000)), Cmp); + return static_cast(morton(mask_low, mask_high)); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return (register_type)_mm_cmp_epu32_mask(self, other, Cmp); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return (register_type)_mm_cmp_epu64_mask(self, other, Cmp); + } + } + } + } + + // load mask + template + XSIMD_INLINE batch_bool load_unaligned(bool const* mem, batch_bool, 
requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + constexpr auto size = batch_bool::size; + constexpr auto chunk_size = size >= 8 ? 8 : (size >= 4 ? 4 : 2); + constexpr auto iter = size / chunk_size; + static_assert((size % chunk_size) == 0, "incorrect size of bool batch"); + register_type mask = 0; + for (std::size_t i = 0; i < iter; ++i) + { + unsigned char block = detail::tobitset((unsigned char*)mem + i * chunk_size); + mask |= (register_type(block) << (i * chunk_size)); + } + return mask; + } + + // from bool + template + XSIMD_INLINE batch from_bool(batch_bool const& self, requires_arch) noexcept + { + return select(self, batch(1), batch(0)); + } + + // from_mask + template + XSIMD_INLINE batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept + { + assert(mask == (mask & ((uint64_t(1) << batch_bool::size) - 1)) && "inbound mask"); + return static_cast::register_type>(mask & ((uint64_t(1) << batch_bool::size) - 1)); + } + + // mask + template + XSIMD_INLINE uint64_t mask(batch_bool const& self, requires_arch) noexcept + { + return self.data & ((uint64_t(1) << batch_bool::size) - 1); + } + + // batch_bool_cast + template + XSIMD_INLINE batch_bool batch_bool_cast(batch_bool const& self, batch_bool const&, requires_arch) noexcept + { + return self.data; + } + + // set + template + XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept + { + static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); + using register_type = typename batch_bool::register_type; + register_type r = 0; + unsigned shift = 0; + (void)std::initializer_list { (r |= register_type(values ? 1 : 0) << (shift++))... 
}; + return r; + } + + // store + template + XSIMD_INLINE void store(batch_bool const& self, bool* mem, requires_arch) noexcept + { + constexpr auto size = batch_bool::size; + for (std::size_t i = 0; i < size; ++i) + mem[i] = (self.data >> i) & 0x1; + } + + // abs + template + XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept + { + return _mm_abs_epi64(self); + } + + // load masked + template + XSIMD_INLINE batch load_masked(int32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept + { + constexpr auto imm_mask = mask.mask(); + return _mm_mask_loadu_epi32(_mm_setzero_si128(), imm_mask, mem); + } + template + XSIMD_INLINE batch load_masked(uint32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept + { + constexpr auto imm_mask = mask.mask(); + return _mm_mask_loadu_epi32(_mm_setzero_si128(), imm_mask, mem); + } + + // store masked + template + XSIMD_INLINE void store_masked(uint32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + { + _mm_mask_storeu_epi32(mem, mask.mask(), src); + } + template + XSIMD_INLINE void store_masked(int32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + { + _mm_mask_storeu_epi32(mem, mask.mask(), src); + } + + template + XSIMD_INLINE void store_masked(uint64_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + { + _mm_mask_storeu_epi64(mem, mask.mask(), src); + } + + template + XSIMD_INLINE void store_masked(int64_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + { + _mm_mask_storeu_epi64(mem, mask.mask(), src); + } + template + XSIMD_INLINE void store_masked(float* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + { + _mm_mask_storeu_ps(mem, mask.mask(), src); + } + + template + XSIMD_INLINE void store_masked(double* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + { + 
_mm_mask_storeu_pd(mem, mask.mask(), src); + } + + // max + template + XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_max_epi64(self, other); + } + template + XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_max_epu64(self, other); + } + + // min + template + XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_min_epi64(self, other); + } + template + XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_min_epu64(self, other); + } + + // insert + template + XSIMD_INLINE batch insert(batch const& self, float val, index, requires_arch) noexcept + { + + int32_t tmp = bit_cast(val); + return _mm_castsi128_ps(_mm_mask_set1_epi32(_mm_castps_si128(self), __mmask8(1 << (I & 7)), tmp)); + } + + template + XSIMD_INLINE batch insert(batch const& self, double val, index, requires_arch) noexcept + { + int64_t tmp = bit_cast(val); + return _mm_castsi128_pd(_mm_mask_set1_epi64(_mm_castpd_si128(self), __mmask8(1 << (I & 3)), tmp)); + } + + template ::value>> + XSIMD_INLINE batch insert(batch const& self, T val, index pos, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_mask_set1_epi32(self, __mmask8(1 << (I & 7)), val); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm_mask_set1_epi64(self, __mmask8(1 << (I & 3)), val); + } + else + { + return insert(self, val, pos, common {}); + } + } + + // isnan + template + XSIMD_INLINE batch_bool isnan(batch const& self, requires_arch) noexcept + { + return (typename batch_bool::register_type)_mm_cmp_ps_mask(self, self, _CMP_UNORD_Q); + } + template + XSIMD_INLINE batch_bool isnan(batch const& self, requires_arch) noexcept + { + return (typename batch_bool::register_type)_mm_cmp_pd_mask(self, self, _CMP_UNORD_Q); + } + + // rotl + template ::value>> + XSIMD_INLINE batch rotl(batch const& 
self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_rolv_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm_rolv_epi64(self, other); + } + else + { + return rotl(self, other, avx2_128 {}); + } + } + template ::value>> + XSIMD_INLINE batch rotl(batch const& self, int32_t other, requires_arch) noexcept + { + return rotl(self, batch(other), A {}); + } + template ::value>> + XSIMD_INLINE batch rotl(batch const& self, requires_arch) noexcept + { + constexpr auto bits = std::numeric_limits::digits + std::numeric_limits::is_signed; + static_assert(count < bits, "Count must be less than the number of bits in T"); + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_rol_epi32(self, count); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm_rol_epi64(self, count); + } + else + { + return rotl(self, avx2_128 {}); + } + } + + // rotr + template ::value>> + XSIMD_INLINE batch rotr(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(std::is_unsigned::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_rorv_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm_rorv_epi64(self, other); + } + } + return rotr(self, other, avx2_128 {}); + } + template ::value>> + XSIMD_INLINE batch rotr(batch const& self, int32_t other, requires_arch) noexcept + { + return rotr(self, batch(other), A {}); + } + + template ::value>> + XSIMD_INLINE batch rotr(batch const& self, requires_arch) noexcept + { + constexpr auto bits = std::numeric_limits::digits + std::numeric_limits::is_signed; + static_assert(count < bits, "Count must be less than the number of bits in T"); + XSIMD_IF_CONSTEXPR(std::is_unsigned::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_ror_epi32(self, count); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm_ror_epi64(self, count); + } + } + return rotr(self, avx2_128 {}); + } + 
+ // all + template + XSIMD_INLINE bool all(batch_bool const& self, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + constexpr register_type bitmask = (register_type(1) << batch_bool::size) - 1; + return (self.data & bitmask) == bitmask; + } + + // any + template + XSIMD_INLINE bool any(batch_bool const& self, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + constexpr register_type bitmask = (register_type(1) << batch_bool::size) - 1; + return (self.data & bitmask) != 0; + } + + // eq + template + XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + { + return (typename batch_bool::register_type)_mm_cmp_ps_mask(self, other, _CMP_EQ_OQ); + } + template + XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + { + return (typename batch_bool::register_type)_mm_cmp_pd_mask(self, other, _CMP_EQ_OQ); + } + + template ::value>> + XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::compare_int_avx512vl_128(self, other); + } + template + XSIMD_INLINE batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + return register_type(~self.data ^ other.data); + } + + // neq + template + XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + { + return (typename batch_bool::register_type)_mm_cmp_ps_mask(self, other, _CMP_NEQ_OQ); + } + template + XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + { + return (typename batch_bool::register_type)_mm_cmp_pd_mask(self, other, _CMP_NEQ_OQ); + } + + template ::value>> + XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + { + return (~(self == other)); + } + template + XSIMD_INLINE batch_bool neq(batch_bool const& 
self, batch_bool const& other, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + return register_type(self.data ^ other.data); + } + + // gt + template + XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept + { + return (typename batch_bool::register_type)_mm_cmp_ps_mask(self, other, _CMP_GT_OQ); + } + template + XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept + { + return (typename batch_bool::register_type)_mm_cmp_pd_mask(self, other, _CMP_GT_OQ); + } + template ::value>> + XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::compare_int_avx512vl_128(self, other); + } + + // ge + template + XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept + { + return (typename batch_bool::register_type)_mm_cmp_ps_mask(self, other, _CMP_GE_OQ); + } + template + XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept + { + return (typename batch_bool::register_type)_mm_cmp_pd_mask(self, other, _CMP_GE_OQ); + } + + template ::value>> + XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::compare_int_avx512vl_128(self, other); + } + + // lt + template + XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + return (typename batch_bool::register_type)_mm_cmp_ps_mask(self, other, _CMP_LT_OQ); + } + template + XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + return (typename batch_bool::register_type)_mm_cmp_pd_mask(self, other, _CMP_LT_OQ); + } + + template ::value>> + XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::compare_int_avx512vl_128(self, other); + } + + // le + template + XSIMD_INLINE batch_bool le(batch const& self, batch 
const& other, requires_arch) noexcept + { + return (typename batch_bool::register_type)_mm_cmp_ps_mask(self, other, _CMP_LE_OQ); + } + template + XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept + { + return (typename batch_bool::register_type)_mm_cmp_pd_mask(self, other, _CMP_LE_OQ); + } + + template ::value>> + XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::compare_int_avx512vl_128(self, other); + } + + // select + template + XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return _mm_mask_blend_ps(cond, false_br, true_br); + } + template + XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return _mm_mask_blend_pd(cond, false_br, true_br); + } + template ::value>> + XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + batch_bool batch_cond = batch_bool::from_mask(cond.mask()); + return _mm_blendv_epi8(false_br, true_br, batch_cond); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + batch_bool batch_cond = batch_bool::from_mask(cond.mask()); + return _mm_blendv_epi8(false_br, true_br, batch_cond); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_mask_blend_epi32(cond, false_br, true_br); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm_mask_blend_epi64(cond, false_br, true_br); + } + } + template + XSIMD_INLINE batch select(batch_bool_constant const&, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return select(batch_bool { Values... 
}, true_br, false_br, avx512vl_128 {}); + } + + // reciprocal + template + XSIMD_INLINE batch + reciprocal(batch const& self, + kernel::requires_arch) noexcept + { + return _mm_rcp14_ps(self); + } + + template + XSIMD_INLINE batch + reciprocal(batch const& self, + kernel::requires_arch) noexcept + { + return _mm_rcp14_pd(self); + } + + // bitwise_and + template + XSIMD_INLINE batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + return register_type(self.data & other.data); + } + + // bitwise_andnot + template + XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + return register_type(self.data & ~other.data); + } + + // bitwise_not + template + XSIMD_INLINE batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + return register_type(~self.data); + } + + // bitwise_or + template + XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + return register_type(self.data | other.data); + } + + // bitwise_xor + template + XSIMD_INLINE batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + return register_type(self.data ^ other.data); + } + + // sadd + template ::value>> + XSIMD_INLINE batch sadd(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + auto mask = other < 0; + auto self_pos_branch = min(std::numeric_limits::max() - other, self); + auto self_neg_branch = max(std::numeric_limits::min() - other, self); + return other + select(mask, self_neg_branch, self_pos_branch); + } + else + { + 
const auto diffmax = std::numeric_limits::max() - self; + const auto mindiff = min(diffmax, other); + return self + mindiff; + } + } + + } +} + +#endif diff --git a/include/xsimd/arch/xsimd_avx512vl_256.hpp b/include/xsimd/arch/xsimd_avx512vl_256.hpp index c302a37a7..a5ea546bc 100644 --- a/include/xsimd/arch/xsimd_avx512vl_256.hpp +++ b/include/xsimd/arch/xsimd_avx512vl_256.hpp @@ -119,13 +119,14 @@ namespace xsimd { using register_type = typename batch_bool::register_type; constexpr auto size = batch_bool::size; - constexpr auto iter = size / 4; - static_assert((size % 4) == 0, "incorrect size of bool batch"); + constexpr auto chunk_size = size >= 8 ? 8 : 4; + constexpr auto iter = size / chunk_size; + static_assert((size % chunk_size) == 0, "incorrect size of bool batch"); register_type mask = 0; for (std::size_t i = 0; i < iter; ++i) { - unsigned char block = detail::tobitset((unsigned char*)mem + i * 4); - mask |= (register_type(block) << (i * 4)); + unsigned char block = detail::tobitset((unsigned char*)mem + i * chunk_size); + mask |= (register_type(block) << (i * chunk_size)); } return mask; } diff --git a/include/xsimd/arch/xsimd_isa.hpp b/include/xsimd/arch/xsimd_isa.hpp index 5bee73b1a..cf88f64d7 100644 --- a/include/xsimd/arch/xsimd_isa.hpp +++ b/include/xsimd/arch/xsimd_isa.hpp @@ -75,6 +75,7 @@ #if XSIMD_WITH_AVX512VL #include "./xsimd_avx512vl.hpp" +#include "./xsimd_avx512vl_128.hpp" #include "./xsimd_avx512vl_256.hpp" #endif diff --git a/include/xsimd/config/xsimd_arch.hpp b/include/xsimd/config/xsimd_arch.hpp index 1084d3faa..b3995912e 100644 --- a/include/xsimd/config/xsimd_arch.hpp +++ b/include/xsimd/config/xsimd_arch.hpp @@ -163,7 +163,7 @@ namespace xsimd using all_x86_architectures = arch_list< avx512vnni, avx512vbmi2, avx512vbmi, avx512ifma, avx512pf, avx512vnni, avx512bw, avx512er, avx512dq, avx512vl, avx512cd, avx512f, - avxvnni, avx512vl_256, fma3, avx2, fma3, avx, avx2_128, avx_128, fma4, fma3, + avxvnni, avx512vl_256, fma3, avx2, fma3, 
avx, avx512vl_128, avx2_128, avx_128, fma4, fma3, sse4_2, sse4_1, /*sse4a,*/ ssse3, sse3, sse2>; using all_sve_architectures = arch_list, detail::sve<256>, detail::sve<128>>; diff --git a/include/xsimd/config/xsimd_cpu_features_x86.hpp b/include/xsimd/config/xsimd_cpu_features_x86.hpp index adeefae56..dc98b549b 100644 --- a/include/xsimd/config/xsimd_cpu_features_x86.hpp +++ b/include/xsimd/config/xsimd_cpu_features_x86.hpp @@ -895,7 +895,9 @@ namespace xsimd inline bool avx512vl() const noexcept { return avx512_enabled() && leaf7().all_bits_set(); } - inline bool avx512vl_256() const noexcept { return avx512_enabled() && osxsave() && leaf7().all_bits_set(); } + inline bool avx512vl_128() const noexcept { return avx512vl() && osxsave(); } + + inline bool avx512vl_256() const noexcept { return avx512vl_128(); } inline bool avx512vbmi() const noexcept { return avx512_enabled() && leaf7().all_bits_set(); } diff --git a/include/xsimd/types/xsimd_avx512vl_register.hpp b/include/xsimd/types/xsimd_avx512vl_register.hpp index 9b209ee66..c73c2a963 100644 --- a/include/xsimd/types/xsimd_avx512vl_register.hpp +++ b/include/xsimd/types/xsimd_avx512vl_register.hpp @@ -29,6 +29,18 @@ namespace xsimd static constexpr char const* name() noexcept { return "avx512vl"; } }; + /** + * @ingroup architectures + * + * AVX512VL instructions extension for 128 bits registers + */ + struct avx512vl_128 : avx2_128 + { + static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512VL; } + static constexpr bool available() noexcept { return true; } + static constexpr char const* name() noexcept { return "avx512vl/128"; } + }; + /** * @ingroup architectures * @@ -57,6 +69,13 @@ namespace xsimd XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512vl, avx512cd); + template + struct get_bool_simd_register + { + using type = simd_avx512_bool_register; + }; + XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512vl_128, avx2_128); + template struct get_bool_simd_register {