diff --git a/include/xsimd/arch/common/xsimd_common_details.hpp b/include/xsimd/arch/common/xsimd_common_details.hpp index 0a8abc419..03088eec4 100644 --- a/include/xsimd/arch/common/xsimd_common_details.hpp +++ b/include/xsimd/arch/common/xsimd_common_details.hpp @@ -90,6 +90,9 @@ namespace xsimd XSIMD_INLINE std::pair, batch> sincos(batch const& self) noexcept; template XSIMD_INLINE batch sqrt(batch const& self) noexcept; + template + XSIMD_INLINE typename std::enable_if::value, batch>::type + swizzle(batch const& x, batch_constant mask) noexcept; template XSIMD_INLINE batch tan(batch const& self) noexcept; template diff --git a/include/xsimd/arch/common/xsimd_common_memory.hpp b/include/xsimd/arch/common/xsimd_common_memory.hpp index 812ad0bca..773177233 100644 --- a/include/xsimd/arch/common/xsimd_common_memory.hpp +++ b/include/xsimd/arch/common/xsimd_common_memory.hpp @@ -341,7 +341,7 @@ namespace xsimd } }; - return swizzle(self, make_batch_constant, rotate_generator, A>(), A {}); + return swizzle(self, make_batch_constant, rotate_generator, A>()); } template @@ -362,7 +362,7 @@ namespace xsimd } }; - return swizzle(self, make_batch_constant, rotate_generator, A>(), A {}); + return swizzle(self, make_batch_constant, rotate_generator, A>()); } template @@ -611,6 +611,15 @@ namespace xsimd return batch::load_aligned(out_buffer); } + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept + { + constexpr size_t size = batch::size; + alignas(A::alignment()) T self_buffer[size]; + store_aligned(&self_buffer[0], self); + return { self_buffer[Is]... }; + } + template XSIMD_INLINE batch, A> swizzle(batch, A> const& self, batch mask, requires_arch) noexcept { diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp index 3ab8517a9..0a17ce512 100644 --- a/include/xsimd/arch/xsimd_avx2.hpp +++ b/include/xsimd/arch/xsimd_avx2.hpp @@ -657,9 +657,35 @@ namespace xsimd // rotate_left template + XSIMD_INLINE batch rotate_left(batch const& self, requires_arch) noexcept + { + auto other = _mm256_permute2x128_si256(self, self, 0x1); + if (N < 16) + { + return _mm256_alignr_epi8(other, self, N); + } + else + { + return _mm256_alignr_epi8(self, other, N - 16); + } + } + template + XSIMD_INLINE batch rotate_left(batch const& self, requires_arch) noexcept + { + return bitwise_cast(rotate_left(bitwise_cast(self), avx2 {})); + } + template XSIMD_INLINE batch rotate_left(batch const& self, requires_arch) noexcept { - return _mm256_alignr_epi8(self, self, N); + auto other = _mm256_permute2x128_si256(self, self, 0x1); + if (N < 8) + { + return _mm256_alignr_epi8(other, self, 2 * N); + } + else + { + return _mm256_alignr_epi8(self, other, 2 * (N - 8)); + } } template XSIMD_INLINE batch rotate_left(batch const& self, requires_arch) noexcept @@ -876,6 +902,7 @@ namespace xsimd } // swizzle (dynamic mask) + template XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept { diff --git a/include/xsimd/arch/xsimd_avx512bw.hpp b/include/xsimd/arch/xsimd_avx512bw.hpp index 832cba6a1..0395853dc 100644 --- a/include/xsimd/arch/xsimd_avx512bw.hpp +++ b/include/xsimd/arch/xsimd_avx512bw.hpp @@ -429,18 +429,6 @@ namespace xsimd return detail::compare_int_avx512bw(self, other); } - // rotate_left - template - XSIMD_INLINE batch rotate_left(batch const& self, requires_arch) noexcept - { - return _mm512_alignr_epi8(self, self, N); - } - template - XSIMD_INLINE batch rotate_left(batch const& self, requires_arch) noexcept - { - return bitwise_cast(rotate_left(bitwise_cast(self), avx512bw {})); - } - // sadd template ::value, void>::type> XSIMD_INLINE batch sadd(batch const& self, batch const& other, requires_arch) noexcept diff --git a/include/xsimd/arch/xsimd_avx512f.hpp b/include/xsimd/arch/xsimd_avx512f.hpp index dc46cd867..26947dffc 100644 --- a/include/xsimd/arch/xsimd_avx512f.hpp +++ b/include/xsimd/arch/xsimd_avx512f.hpp @@ -32,6 +32,8 @@ namespace xsimd XSIMD_INLINE batch incr_if(batch const& self, Mask const& mask, requires_arch) noexcept; template XSIMD_INLINE batch insert(batch const& self, T val, index, requires_arch) noexcept; + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept; template XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept; template diff --git a/include/xsimd/arch/xsimd_neon.hpp b/include/xsimd/arch/xsimd_neon.hpp index ac3cdacd0..17642268d 100644 --- a/include/xsimd/arch/xsimd_neon.hpp +++ b/include/xsimd/arch/xsimd_neon.hpp @@ -2771,10 +2771,11 @@ namespace xsimd XSIMD_INLINE batch rotate_left(batch const& a, requires_arch) noexcept { using register_type = typename batch::register_type; + // Adding modulo to avoid warning. const detail::neon_dispatcher::binary dispatcher = { - std::make_tuple(wrap::rotate_left_u8, wrap::rotate_left_s8, wrap::rotate_left_u16, wrap::rotate_left_s16, - wrap::rotate_left_u32, wrap::rotate_left_s32, wrap::rotate_left_u64, wrap::rotate_left_s64, - wrap::rotate_left_f32) + std::make_tuple(wrap::rotate_left_u8, wrap::rotate_left_s8, wrap::rotate_left_u16, wrap::rotate_left_s16, + wrap::rotate_left_u32, wrap::rotate_left_s32, wrap::rotate_left_u64, wrap::rotate_left_s64, + wrap::rotate_left_f32) }; return dispatcher.apply(register_type(a), register_type(a)); } diff --git a/include/xsimd/arch/xsimd_ssse3.hpp b/include/xsimd/arch/xsimd_ssse3.hpp index b38e398b4..b612d438e 100644 --- a/include/xsimd/arch/xsimd_ssse3.hpp +++ b/include/xsimd/arch/xsimd_ssse3.hpp @@ -107,11 +107,22 @@ namespace xsimd // rotate_left template - XSIMD_INLINE batch rotate_left(batch const& self, requires_arch) noexcept + XSIMD_INLINE batch rotate_left(batch const& self, requires_arch) noexcept { return _mm_alignr_epi8(self, self, N); } template + XSIMD_INLINE batch rotate_left(batch const& self, requires_arch) noexcept + { + return bitwise_cast(rotate_left(bitwise_cast(self), ssse3 {})); + } + + template + XSIMD_INLINE batch rotate_left(batch const& self, requires_arch) noexcept + { + return _mm_alignr_epi8(self, self, 2 * N); + } + template XSIMD_INLINE batch rotate_left(batch const& self, requires_arch) noexcept { return bitwise_cast(rotate_left(bitwise_cast(self), ssse3 {})); diff --git a/test/test_batch_manip.cpp b/test/test_batch_manip.cpp index 000491cd8..34c9c1032 100644 --- a/test/test_batch_manip.cpp +++ b/test/test_batch_manip.cpp @@ -20,7 +20,7 @@ namespace xsimd struct init_swizzle_base { using swizzle_vector_type = std::array; - swizzle_vector_type lhs_in, exped_reverse, exped_fill, exped_dup, exped_ror, exped_rol; + swizzle_vector_type lhs_in, exped_reverse, exped_fill, exped_dup, exped_ror, exped_rol, exped_rol2; template std::vector create_swizzle_vectors() @@ -42,12 +42,14 @@ namespace xsimd exped_dup[i] = lhs_in[2 * (i / 2)]; exped_ror[i] = lhs_in[(i - 1) % N]; exped_rol[i] = lhs_in[(i + 1) % N]; + exped_rol2[i] = lhs_in[(i + N - 1) % N]; } vects.push_back(std::move(exped_reverse)); vects.push_back(std::move(exped_fill)); vects.push_back(std::move(exped_dup)); vects.push_back(std::move(exped_ror)); vects.push_back(std::move(exped_rol)); + vects.push_back(std::move(exped_rol2)); return vects; } @@ -176,6 +178,20 @@ struct swizzle_test CHECK_BATCH_EQ(b_res, b_exped); } + void rotate_left_inv() + { + xsimd::init_swizzle_base swizzle_base; + auto swizzle_vecs = swizzle_base.create_swizzle_vectors(); + auto v_lhs = swizzle_vecs[0]; + auto v_exped = swizzle_vecs[6]; + + B b_lhs = B::load_unaligned(v_lhs.data()); + B b_exped = B::load_unaligned(v_exped.data()); + + B b_res = xsimd::rotate_left(b_lhs); + CHECK_BATCH_EQ(b_res, b_exped); + } + void swizzle_reverse() { xsimd::init_swizzle_base swizzle_base; @@ -248,6 +264,7 @@ TEST_CASE_TEMPLATE("[swizzle]", B, BATCH_SWIZZLE_TYPES) SUBCASE("rotate") { Test.rotate_left(); + Test.rotate_left_inv(); Test.rotate_right(); } diff --git a/test/test_shuffle.cpp b/test/test_shuffle.cpp index 1c7c9884b..bc89aafd5 100644 --- a/test/test_shuffle.cpp +++ b/test/test_shuffle.cpp @@ -561,13 +561,13 @@ struct shuffle_test CHECK_BATCH_EQ(b_res, b_ref); } - void swizzle() + void shuffle() { B b_lhs = B::load_unaligned(lhs.data()); B b_rhs = B::load_unaligned(rhs.data()); { - struct swizzle_lo_generator + struct shuffle_lo_generator { static constexpr size_t get(size_t index, size_t size) { @@ -580,13 +580,13 @@ struct shuffle_test ref[i] = lhs[size - i - 1]; B b_ref = B::load_unaligned(ref.data()); - INFO("swizzle first batch"); - B b_res = xsimd::shuffle(b_lhs, b_rhs, xsimd::make_batch_constant()); + INFO("shuffle first batch"); + B b_res = xsimd::shuffle(b_lhs, b_rhs, xsimd::make_batch_constant()); CHECK_BATCH_EQ(b_res, b_ref); } { - struct swizzle_hi_generator + struct shuffle_hi_generator { static constexpr size_t get(size_t index, size_t size) { @@ -599,8 +599,8 @@ struct shuffle_test ref[i] = rhs[size - i - 1]; B b_ref = B::load_unaligned(ref.data()); - INFO("swizzle second batch"); - B b_res = xsimd::shuffle(b_lhs, b_rhs, xsimd::make_batch_constant()); + INFO("shuffle second batch"); + B b_res = xsimd::shuffle(b_lhs, b_rhs, xsimd::make_batch_constant()); CHECK_BATCH_EQ(b_res, b_ref); } } @@ -709,9 +709,9 @@ TEST_CASE_TEMPLATE("[shuffle]", B, BATCH_FLOAT_TYPES, xsimd::batch, xs { Test.select(); } - SUBCASE("swizzle") + SUBCASE("shuffle") { - Test.swizzle(); + Test.shuffle(); } SUBCASE("transpose") { @@ -733,12 +733,12 @@ TEST_CASE_TEMPLATE("[small integer transpose]", B, xsimd::batch, xsimd } #if (XSIMD_WITH_SSE2 && !XSIMD_WITH_AVX) -TEST_CASE_TEMPLATE("[small integer swizzle]", B, xsimd::batch, xsimd::batch) +TEST_CASE_TEMPLATE("[small integer shuffle]", B, xsimd::batch, xsimd::batch) { shuffle_test Test; - SUBCASE("swizzle") + SUBCASE("shuffle") { - Test.swizzle(); + Test.shuffle(); } } #endif diff --git a/test/test_utils.hpp b/test/test_utils.hpp index 4b56dc01b..80914f331 100644 --- a/test/test_utils.hpp +++ b/test/test_utils.hpp @@ -591,13 +591,7 @@ namespace xsimd #define BATCH_TYPES BATCH_INT_TYPES, BATCH_FLOAT_TYPES #define BATCH_MATH_TYPES xsimd::batch, BATCH_FLOAT_TYPES -#if !XSIMD_WITH_AVX || XSIMD_WITH_AVX2 -#define BATCH_SWIZZLE_TAIL , xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch -#else -#define BATCH_SWIZZLE_TAIL -#endif - -#define BATCH_SWIZZLE_TYPES BATCH_FLOAT_TYPES, BATCH_COMPLEX_TYPES BATCH_SWIZZLE_TAIL +#define BATCH_SWIZZLE_TYPES BATCH_FLOAT_TYPES, BATCH_COMPLEX_TYPES, BATCH_INT_TYPES /******************** * conversion utils *