diff --git a/include/xsimd/arch/xsimd_rvv.hpp b/include/xsimd/arch/xsimd_rvv.hpp index 12ae39280..42b7e2f51 100644 --- a/include/xsimd/arch/xsimd_rvv.hpp +++ b/include/xsimd/arch/xsimd_rvv.hpp @@ -287,7 +287,7 @@ namespace xsimd namespace kernel { - namespace detail + namespace detail_rvv { template using rvv_reg_t = types::detail::rvv_reg_t; @@ -380,17 +380,17 @@ namespace xsimd * Scalar to vector * ********************/ - namespace detail + namespace detail_rvv { template - XSIMD_INLINE detail::rvv_reg_t broadcast(T arg) noexcept + XSIMD_INLINE rvv_reg_t broadcast(T arg) noexcept { // A bit of a dance, here, because rvvmv_splat has no other // argument from which to deduce type, and T=char is not // supported. project_num_t arg_not_char(arg); - const auto splat = detail::rvvmv_splat(arg_not_char); - return detail::rvv_reg_t(splat.get_bytes(), types::detail::XSIMD_RVV_BITCAST); + const auto splat = rvvmv_splat(arg_not_char); + return rvv_reg_t(splat.get_bytes(), types::detail::XSIMD_RVV_BITCAST); } } @@ -398,14 +398,14 @@ namespace xsimd template XSIMD_INLINE batch broadcast(T arg, requires_arch) noexcept { - return detail::broadcast(arg); + return detail_rvv::broadcast(arg); } /********* * Load * *********/ - namespace detail + namespace detail_rvv { XSIMD_RVV_OVERLOAD(rvvle, (__riscv_vle XSIMD_RVV_S _v_ XSIMD_RVV_TSM), , vec(T const*)) XSIMD_RVV_OVERLOAD(rvvse, (__riscv_vse XSIMD_RVV_S _v_ XSIMD_RVV_TSM), , void(T*, vec)) @@ -414,7 +414,7 @@ namespace xsimd template = 0> XSIMD_INLINE batch load_aligned(T const* src, convert, requires_arch) noexcept { - return detail::rvvle(reinterpret_cast const*>(src)); + return detail_rvv::rvvle(reinterpret_cast const*>(src)); } template = 0> @@ -424,7 +424,7 @@ namespace xsimd } // load_complex - namespace detail + namespace detail_rvv { template = types::detail::rvv_width_m1, int> = 0> XSIMD_INLINE rvv_reg_t rvvabut(rvv_reg_t const& lo, rvv_reg_t const& hi) noexcept @@ -464,16 +464,23 @@ namespace xsimd return __riscv_vslidedown(vv, vv.vl / 2, vv.vl); } + } + + // Must be in detail::load_complex for use by common memory. + // ODR violation are prevented because the size of the register is encoded + // in batch. + namespace detail + { template = 0> XSIMD_INLINE batch, A> load_complex(batch const& lo, batch const& hi, requires_arch) noexcept { - const auto real_index = vindex, 0, 1>(); - const auto imag_index = vindex, 1, 1>(); - const auto index = rvvabut, A::width>(real_index, imag_index); - const auto input = rvvabut(lo.data, hi.data); - const rvv_reg_t result = __riscv_vrgather(input, index, index.vl); + const auto real_index = detail_rvv::vindex, 0, 1>(); + const auto imag_index = detail_rvv::vindex, 1, 1>(); + const auto index = detail_rvv::rvvabut, A::width>(real_index, imag_index); + const auto input = detail_rvv::rvvabut(lo.data, hi.data); + const detail_rvv::rvv_reg_t result = __riscv_vrgather(input, index, index.vl); - return { rvvget_lo(result), rvvget_hi(result) }; + return { detail_rvv::rvvget_lo(result), detail_rvv::rvvget_hi(result) }; } } @@ -484,7 +491,7 @@ namespace xsimd template = 0> XSIMD_INLINE void store_aligned(T* dst, batch const& src, requires_arch) noexcept { - detail::rvvse(reinterpret_cast*>(dst), src); + detail_rvv::rvvse(reinterpret_cast*>(dst), src); } template = 0> @@ -497,7 +504,7 @@ namespace xsimd * scatter/gather * ******************/ - namespace detail + namespace detail_rvv { template using rvv_enable_sg_t = std::enable_if_t<(sizeof(T) == sizeof(U) && (sizeof(T) == 4 || sizeof(T) == 8)), int>; @@ -510,34 +517,34 @@ namespace xsimd } // scatter - template = 0> + template = 0> XSIMD_INLINE void scatter(batch const& vals, T* dst, batch const& index, kernel::requires_arch) noexcept { using UU = as_unsigned_integer_t; - const auto uindex = detail::rvv_to_unsigned_batch(index); + const auto uindex = detail_rvv::rvv_to_unsigned_batch(index); auto* base = reinterpret_cast*>(dst); // or rvvsuxei - const auto bi = detail::rvvmul_splat(uindex, sizeof(T)); - detail::rvvsoxei(base, bi, vals); + const auto bi = detail_rvv::rvvmul_splat(uindex, sizeof(T)); + detail_rvv::rvvsoxei(base, bi, vals); } // gather - template = 0> + template = 0> XSIMD_INLINE batch gather(batch const&, T const* src, batch const& index, kernel::requires_arch) noexcept { using UU = as_unsigned_integer_t; - const auto uindex = detail::rvv_to_unsigned_batch(index); + const auto uindex = detail_rvv::rvv_to_unsigned_batch(index); auto const* base = reinterpret_cast const*>(src); // or rvvluxei - const auto bi = detail::rvvmul_splat(uindex, sizeof(T)); - return detail::rvvloxei(base, bi); + const auto bi = detail_rvv::rvvmul_splat(uindex, sizeof(T)); + return detail_rvv::rvvloxei(base, bi); } /************** * Arithmetic * **************/ - namespace detail + namespace detail_rvv { XSIMD_RVV_OVERLOAD3(rvvadd, (__riscv_vadd), @@ -621,56 +628,56 @@ namespace xsimd template = 0> XSIMD_INLINE batch add(batch const& lhs, batch const& rhs, requires_arch) noexcept { - return detail::rvvadd(lhs, rhs); + return detail_rvv::rvvadd(lhs, rhs); } // sadd template = 0> XSIMD_INLINE batch sadd(batch const& lhs, batch const& rhs, requires_arch) noexcept { - return detail::rvvsadd(lhs, rhs); + return detail_rvv::rvvsadd(lhs, rhs); } // sub template = 0> XSIMD_INLINE batch sub(batch const& lhs, batch const& rhs, requires_arch) noexcept { - return detail::rvvsub(lhs, rhs); + return detail_rvv::rvvsub(lhs, rhs); } // ssub template = 0> XSIMD_INLINE batch ssub(batch const& lhs, batch const& rhs, requires_arch) noexcept { - return detail::rvvssub(lhs, rhs); + return detail_rvv::rvvssub(lhs, rhs); } // mul template = 0> XSIMD_INLINE batch mul(batch const& lhs, batch const& rhs, requires_arch) noexcept { - return detail::rvvmul(lhs, rhs); + return detail_rvv::rvvmul(lhs, rhs); } // div template = 0> XSIMD_INLINE batch div(batch const& lhs, batch const& rhs, requires_arch) noexcept { - return detail::rvvdiv(lhs, rhs); + return detail_rvv::rvvdiv(lhs, rhs); } // max template = 0> XSIMD_INLINE batch max(batch const& lhs, batch const& rhs, requires_arch) noexcept { - return detail::rvvmax(lhs, rhs); + return detail_rvv::rvvmax(lhs, rhs); } // min template = 0> XSIMD_INLINE batch min(batch const& lhs, batch const& rhs, requires_arch) noexcept { - return detail::rvvmin(lhs, rhs); + return detail_rvv::rvvmin(lhs, rhs); } // neg @@ -678,15 +685,15 @@ namespace xsimd XSIMD_INLINE batch neg(batch const& arg, requires_arch) noexcept { using S = as_signed_integer_t; - const auto as_signed = detail::rvvreinterpret(arg); - const auto result = detail::rvvneg(as_signed); - return detail::rvvreinterpret(result); + const auto as_signed = detail_rvv::rvvreinterpret(arg); + const auto result = detail_rvv::rvvneg(as_signed); + return detail_rvv::rvvreinterpret(result); } template = 0> XSIMD_INLINE batch neg(batch const& arg, requires_arch) noexcept { - return detail::rvvneg(arg); + return detail_rvv::rvvneg(arg); } // abs @@ -699,23 +706,23 @@ namespace xsimd template = 0> XSIMD_INLINE batch abs(batch const& arg, requires_arch) noexcept { - return detail::rvvabs(arg); + return detail_rvv::rvvabs(arg); } // fma: x * y + z template = 0> XSIMD_INLINE batch fma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { - // also detail::rvvmadd(x, y, z); - return detail::rvvmacc(z, x, y); + // also detail_rvv::rvvmadd(x, y, z); + return detail_rvv::rvvmacc(z, x, y); } // fnma: z - x * y template = 0> XSIMD_INLINE batch fnma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { - // also detail::rvvnmsub(x, y, z); - return detail::rvvnmsac(z, x, y); + // also detail_rvv::rvvnmsub(x, y, z); + return detail_rvv::rvvnmsac(z, x, y); } // fms: x * y - z @@ -740,7 +747,7 @@ namespace xsimd * Logical operations * **********************/ - namespace detail + namespace detail_rvv { XSIMD_RVV_OVERLOAD_INTS(rvvand, (__riscv_vand), , vec(vec, vec)) XSIMD_RVV_OVERLOAD_INTS(rvvor, (__riscv_vor), , vec(vec, vec)) @@ -758,118 +765,118 @@ namespace xsimd template = 0> XSIMD_INLINE batch bitwise_and(batch const& lhs, batch const& rhs, requires_arch) noexcept { - return detail::rvvand(lhs, rhs); + return detail_rvv::rvvand(lhs, rhs); } template = 0> XSIMD_INLINE batch bitwise_and(batch const& lhs, batch const& rhs, requires_arch) noexcept { - const auto lhs_bits = detail::rvv_to_unsigned_batch(lhs); - const auto rhs_bits = detail::rvv_to_unsigned_batch(rhs); - const auto result_bits = detail::rvvand(lhs_bits, rhs_bits); - return detail::rvvreinterpret(result_bits); + const auto lhs_bits = detail_rvv::rvv_to_unsigned_batch(lhs); + const auto rhs_bits = detail_rvv::rvv_to_unsigned_batch(rhs); + const auto result_bits = detail_rvv::rvvand(lhs_bits, rhs_bits); + return detail_rvv::rvvreinterpret(result_bits); } template = 0> XSIMD_INLINE batch_bool bitwise_and(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { - return detail::rvvmand(lhs, rhs); + return detail_rvv::rvvmand(lhs, rhs); } // bitwise_andnot template = 0> XSIMD_INLINE batch bitwise_andnot(batch const& lhs, batch const& rhs, requires_arch) noexcept { - const auto not_rhs = detail::rvvnot(rhs); - return detail::rvvand(lhs, not_rhs); + const auto not_rhs = detail_rvv::rvvnot(rhs); + return detail_rvv::rvvand(lhs, not_rhs); } template = 0> XSIMD_INLINE batch bitwise_andnot(batch const& lhs, batch const& rhs, requires_arch) noexcept { - const auto lhs_bits = detail::rvv_to_unsigned_batch(lhs); - const auto rhs_bits = detail::rvv_to_unsigned_batch(rhs); - const auto not_rhs = detail::rvvnot(rhs_bits); - const auto result_bits = detail::rvvand(lhs_bits, not_rhs); - return detail::rvvreinterpret(result_bits); + const auto lhs_bits = detail_rvv::rvv_to_unsigned_batch(lhs); + const auto rhs_bits = detail_rvv::rvv_to_unsigned_batch(rhs); + const auto not_rhs = detail_rvv::rvvnot(rhs_bits); + const auto result_bits = detail_rvv::rvvand(lhs_bits, not_rhs); + return detail_rvv::rvvreinterpret(result_bits); } template = 0> XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { - return detail::rvvmandn(lhs, rhs); + return detail_rvv::rvvmandn(lhs, rhs); } // bitwise_or template = 0> XSIMD_INLINE batch bitwise_or(batch const& lhs, batch const& rhs, requires_arch) noexcept { - return detail::rvvor(lhs, rhs); + return detail_rvv::rvvor(lhs, rhs); } template = 0> XSIMD_INLINE batch bitwise_or(batch const& lhs, batch const& rhs, requires_arch) noexcept { - const auto lhs_bits = detail::rvv_to_unsigned_batch(lhs); - const auto rhs_bits = detail::rvv_to_unsigned_batch(rhs); - const auto result_bits = detail::rvvor(lhs_bits, rhs_bits); - return detail::rvvreinterpret(result_bits); + const auto lhs_bits = detail_rvv::rvv_to_unsigned_batch(lhs); + const auto rhs_bits = detail_rvv::rvv_to_unsigned_batch(rhs); + const auto result_bits = detail_rvv::rvvor(lhs_bits, rhs_bits); + return detail_rvv::rvvreinterpret(result_bits); } template = 0> XSIMD_INLINE batch_bool bitwise_or(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { - return detail::rvvmor(lhs, rhs); + return detail_rvv::rvvmor(lhs, rhs); } // bitwise_xor template = 0> XSIMD_INLINE batch bitwise_xor(batch const& lhs, batch const& rhs, requires_arch) noexcept { - return detail::rvvxor(lhs, rhs); + return detail_rvv::rvvxor(lhs, rhs); } template = 0> XSIMD_INLINE batch bitwise_xor(batch const& lhs, batch const& rhs, requires_arch) noexcept { - const auto lhs_bits = detail::rvv_to_unsigned_batch(lhs); - const auto rhs_bits = detail::rvv_to_unsigned_batch(rhs); - const auto result_bits = detail::rvvxor(lhs_bits, rhs_bits); - return detail::rvvreinterpret(result_bits); + const auto lhs_bits = detail_rvv::rvv_to_unsigned_batch(lhs); + const auto rhs_bits = detail_rvv::rvv_to_unsigned_batch(rhs); + const auto result_bits = detail_rvv::rvvxor(lhs_bits, rhs_bits); + return detail_rvv::rvvreinterpret(result_bits); } template = 0> XSIMD_INLINE batch_bool bitwise_xor(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { - return detail::rvvmxor(lhs, rhs); + return detail_rvv::rvvmxor(lhs, rhs); } // bitwise_not template = 0> XSIMD_INLINE batch bitwise_not(batch const& arg, requires_arch) noexcept { - return detail::rvvnot(arg); + return detail_rvv::rvvnot(arg); } template = 0> XSIMD_INLINE batch bitwise_not(batch const& arg, requires_arch) noexcept { - const auto arg_bits = detail::rvv_to_unsigned_batch(arg); - const auto result_bits = detail::rvvnot(arg_bits); - return detail::rvvreinterpret(result_bits); + const auto arg_bits = detail_rvv::rvv_to_unsigned_batch(arg); + const auto result_bits = detail_rvv::rvvnot(arg_bits); + return detail_rvv::rvvreinterpret(result_bits); } template = 0> XSIMD_INLINE batch_bool bitwise_not(batch_bool const& arg, requires_arch) noexcept { - return detail::rvvmnot(arg); + return detail_rvv::rvvmnot(arg); } /********** * Shifts * **********/ - namespace detail + namespace detail_rvv { XSIMD_RVV_OVERLOAD_INTS(rvvsll_splat, (__riscv_vsll), , vec(vec, size_t)) XSIMD_RVV_OVERLOAD_INTS(rvvsll, (__riscv_vsll), , vec(vec, uvec)) @@ -887,13 +894,13 @@ namespace xsimd { constexpr size_t size = sizeof(typename batch::value_type) * 8; assert(0 <= n && static_cast(n) < size && "index in bounds"); - return detail::rvvsll_splat(arg, n); + return detail_rvv::rvvsll_splat(arg, n); } template = 0> XSIMD_INLINE batch bitwise_lshift(batch const& lhs, batch const& rhs, requires_arch) noexcept { - return detail::rvvsll(lhs, detail::rvv_to_unsigned_batch(rhs)); + return detail_rvv::rvvsll(lhs, detail_rvv::rvv_to_unsigned_batch(rhs)); } // bitwise_rshift @@ -902,20 +909,20 @@ namespace xsimd { constexpr size_t size = sizeof(typename batch::value_type) * 8; assert(0 <= n && static_cast(n) < size && "index in bounds"); - return detail::rvvsr_splat(arg, n); + return detail_rvv::rvvsr_splat(arg, n); } template = 0> XSIMD_INLINE batch bitwise_rshift(batch const& lhs, batch const& rhs, requires_arch) noexcept { - return detail::rvvsr(lhs, detail::rvv_to_unsigned_batch(rhs)); + return detail_rvv::rvvsr(lhs, detail_rvv::rvv_to_unsigned_batch(rhs)); } /************** * Reductions * **************/ - namespace detail + namespace detail_rvv { XSIMD_RVV_OVERLOAD3(rvvredsum, (__riscv_vredsum), @@ -942,34 +949,34 @@ namespace xsimd template XSIMD_INLINE T reduce_scalar(rvv_reg_t const& arg) { - return detail::rvvmv_lane0(rvv_reg_t(arg.get_bytes(), types::detail::XSIMD_RVV_BITCAST)); + return detail_rvv::rvvmv_lane0(rvv_reg_t(arg.get_bytes(), types::detail::XSIMD_RVV_BITCAST)); } } // reduce_add template ::value_type, detail::enable_arithmetic_t = 0> XSIMD_INLINE V reduce_add(batch const& arg, requires_arch) noexcept { - const auto zero = detail::broadcast(T(0)); - const auto r = detail::rvvredsum(arg, zero); - return detail::reduce_scalar(r); + const auto zero = detail_rvv::broadcast(T(0)); + const auto r = detail_rvv::rvvredsum(arg, zero); + return detail_rvv::reduce_scalar(r); } // reduce_max template = 0> XSIMD_INLINE T reduce_max(batch const& arg, requires_arch) noexcept { - const auto lowest = detail::broadcast(std::numeric_limits::lowest()); - const auto r = detail::rvvredmax(arg, lowest); - return detail::reduce_scalar(r); + const auto lowest = detail_rvv::broadcast(std::numeric_limits::lowest()); + const auto r = detail_rvv::rvvredmax(arg, lowest); + return detail_rvv::reduce_scalar(r); } // reduce_min template = 0> XSIMD_INLINE T reduce_min(batch const& arg, requires_arch) noexcept { - const auto max = detail::broadcast(std::numeric_limits::max()); - const auto r = detail::rvvredmin(arg, max); - return detail::reduce_scalar(r); + const auto max = detail_rvv::broadcast(std::numeric_limits::max()); + const auto r = detail_rvv::rvvredmin(arg, max); + return detail_rvv::reduce_scalar(r); } // haddp @@ -994,61 +1001,61 @@ namespace xsimd template = 0> XSIMD_INLINE batch_bool eq(batch const& lhs, batch const& rhs, requires_arch) noexcept { - return detail::rvvmseq(lhs, rhs); + return detail_rvv::rvvmseq(lhs, rhs); } template = 0> XSIMD_INLINE batch_bool eq(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { - const auto neq_result = detail::rvvmxor(lhs, rhs); - return detail::rvvmnot(neq_result); + const auto neq_result = detail_rvv::rvvmxor(lhs, rhs); + return detail_rvv::rvvmnot(neq_result); } // neq template = 0> XSIMD_INLINE batch_bool neq(batch const& lhs, batch const& rhs, requires_arch) noexcept { - return detail::rvvmsne(lhs, rhs); + return detail_rvv::rvvmsne(lhs, rhs); } template = 0> XSIMD_INLINE batch_bool neq(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { - return detail::rvvmxor(lhs, rhs); + return detail_rvv::rvvmxor(lhs, rhs); } // lt template = 0> XSIMD_INLINE batch_bool lt(batch const& lhs, batch const& rhs, requires_arch) noexcept { - return detail::rvvmslt(lhs, rhs); + return detail_rvv::rvvmslt(lhs, rhs); } // le template = 0> XSIMD_INLINE batch_bool le(batch const& lhs, batch const& rhs, requires_arch) noexcept { - return detail::rvvmsle(lhs, rhs); + return detail_rvv::rvvmsle(lhs, rhs); } // gt template = 0> XSIMD_INLINE batch_bool gt(batch const& lhs, batch const& rhs, requires_arch) noexcept { - return detail::rvvmsgt(lhs, rhs); + return detail_rvv::rvvmsgt(lhs, rhs); } // ge template = 0> XSIMD_INLINE batch_bool ge(batch const& lhs, batch const& rhs, requires_arch) noexcept { - return detail::rvvmsge(lhs, rhs); + return detail_rvv::rvvmsge(lhs, rhs); } /************* * Selection * *************/ - namespace detail + namespace detail_rvv { XSIMD_RVV_OVERLOAD(rvvcompress, (__riscv_vcompress_tu), , vec(vec, vec, bvec)) } @@ -1057,13 +1064,13 @@ namespace xsimd XSIMD_INLINE batch compress(batch const& x, batch_bool const& mask, requires_arch) noexcept { auto zero = broadcast(T(0), rvv {}); - return detail::rvvcompress(zero, x, mask); + return detail_rvv::rvvcompress(zero, x, mask); } /*************** * Permutation * ***************/ - namespace detail + namespace detail_rvv { XSIMD_RVV_OVERLOAD(rvvrgather, (__riscv_vrgather), , vec(vec, uvec)) XSIMD_RVV_OVERLOAD(rvvslideup, (__riscv_vslideup), , vec(vec, vec, size_t)) @@ -1076,7 +1083,7 @@ namespace xsimd { static_assert(batch::size == sizeof...(idx), "invalid swizzle indices"); const batch indices { idx... }; - return detail::rvvrgather(arg, indices); + return detail_rvv::rvvrgather(arg, indices); } template @@ -1098,15 +1105,15 @@ namespace xsimd template = 0> XSIMD_INLINE batch extract_pair(batch const& lhs, batch const& rhs, size_t n, requires_arch) noexcept { - const auto tmp = detail::rvvslidedown(rhs, n); - return detail::rvvslideup(tmp, lhs, lhs.size - n); + const auto tmp = detail_rvv::rvvslidedown(rhs, n); + return detail_rvv::rvvslideup(tmp, lhs, lhs.size - n); } // select template = 0> XSIMD_INLINE batch select(batch_bool const& cond, batch const& a, batch const& b, requires_arch) noexcept { - return detail::rvvmerge(b, a, cond); + return detail_rvv::rvvmerge(b, a, cond); } template @@ -1119,22 +1126,22 @@ namespace xsimd template = 0> XSIMD_INLINE batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept { - const auto index = detail::vindex, 0, -1>(); - const auto mask = detail::pmask8(0xaa); - return detail::rvvmerge(detail::rvvrgather(lhs, index), - detail::rvvrgather(rhs, index), - mask); + const auto index = detail_rvv::vindex, 0, -1>(); + const auto mask = detail_rvv::pmask8(0xaa); + return detail_rvv::rvvmerge(detail_rvv::rvvrgather(lhs, index), + detail_rvv::rvvrgather(rhs, index), + mask); } // zip_hi template = 0> XSIMD_INLINE batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) noexcept { - const auto index = detail::vindex, batch::size / 2, -1>(); - const auto mask = detail::pmask8(0xaa); - return detail::rvvmerge(detail::rvvrgather(lhs, index), - detail::rvvrgather(rhs, index), - mask); + const auto index = detail_rvv::vindex, batch::size / 2, -1>(); + const auto mask = detail_rvv::pmask8(0xaa); + return detail_rvv::rvvmerge(detail_rvv::rvvrgather(lhs, index), + detail_rvv::rvvrgather(rhs, index), + mask); } // store_complex @@ -1158,7 +1165,7 @@ namespace xsimd * Floating-point arithmetic * *****************************/ - namespace detail + namespace detail_rvv { XSIMD_RVV_OVERLOAD_FLOATS(rvvfsqrt, (__riscv_vfsqrt), , vec(vec)) XSIMD_RVV_OVERLOAD_FLOATS(rvvfrec7, (__riscv_vfrec7), , vec(vec)) @@ -1169,7 +1176,7 @@ namespace xsimd template = 0> XSIMD_INLINE batch rsqrt(batch const& arg, requires_arch) noexcept { - auto approx = detail::rvvfrsqrt7(arg); + auto approx = detail_rvv::rvvfrsqrt7(arg); approx = approx * (1.5 - (0.5 * arg * approx * approx)); return approx; } @@ -1178,14 +1185,14 @@ namespace xsimd template = 0> XSIMD_INLINE batch sqrt(batch const& arg, requires_arch) noexcept { - return detail::rvvfsqrt(arg); + return detail_rvv::rvvfsqrt(arg); } // reciprocal template = 0> XSIMD_INLINE batch reciprocal(const batch& arg, requires_arch) noexcept { - return detail::rvvfrec7(arg); + return detail_rvv::rvvfrec7(arg); } /****************************** @@ -1193,7 +1200,7 @@ namespace xsimd ******************************/ // fast_cast - namespace detail + namespace detail_rvv { XSIMD_RVV_OVERLOAD2(rvvfcvt_rtz, // truncating conversion, like C. (__riscv_vfcvt_rtz_x), @@ -1252,7 +1259,7 @@ namespace xsimd using U = as_unsigned_integer_t; const auto values = set(batch {}, rvv {}, static_cast(args)...); const auto zero = broadcast(U(0), rvv {}); - detail::rvv_bool_t result = detail::rvvmsne(values, zero); + detail_rvv::rvv_bool_t result = detail_rvv::rvvmsne(values, zero); return result; } @@ -1260,65 +1267,65 @@ namespace xsimd template = 0> XSIMD_INLINE T first(batch const& arg, requires_arch) noexcept { - return detail::rvvmv_lane0(arg); + return detail_rvv::rvvmv_lane0(arg); } template = 0> XSIMD_INLINE std::complex first(batch, A> const& arg, requires_arch) noexcept { - return std::complex { detail::rvvmv_lane0(arg.real()), detail::rvvmv_lane0(arg.imag()) }; + return std::complex { detail_rvv::rvvmv_lane0(arg.real()), detail_rvv::rvvmv_lane0(arg.imag()) }; } // insert template = 0> XSIMD_INLINE batch insert(batch const& arg, T val, index, requires_arch) noexcept { - const auto mask = detail::pmask(uint64_t(1) << I); - return detail::rvvmerge_splat(arg, val, mask); + const auto mask = detail_rvv::pmask(uint64_t(1) << I); + return detail_rvv::rvvmerge_splat(arg, val, mask); } // get template = 0> XSIMD_INLINE T get(batch const& arg, size_t i, requires_arch) noexcept { - const auto tmp = detail::rvvslidedown(arg, i); - return detail::rvvmv_lane0(tmp); + const auto tmp = detail_rvv::rvvslidedown(arg, i); + return detail_rvv::rvvmv_lane0(tmp); } template = 0> XSIMD_INLINE std::complex get(batch, A> const& arg, size_t i, requires_arch) noexcept { - const auto tmpr = detail::rvvslidedown(arg.real(), i); - const auto tmpi = detail::rvvslidedown(arg.imag(), i); - return std::complex { detail::rvvmv_lane0(tmpr), detail::rvvmv_lane0(tmpi) }; + const auto tmpr = detail_rvv::rvvslidedown(arg.real(), i); + const auto tmpi = detail_rvv::rvvslidedown(arg.imag(), i); + return std::complex { detail_rvv::rvvmv_lane0(tmpr), detail_rvv::rvvmv_lane0(tmpi) }; } // all template = 0> XSIMD_INLINE bool all(batch_bool const& arg, requires_arch) noexcept { - return detail::rvvcpop(arg) == batch_bool::size; + return detail_rvv::rvvcpop(arg) == batch_bool::size; } // any template = 0> XSIMD_INLINE bool any(batch_bool const& arg, requires_arch) noexcept { - return detail::rvvcpop(arg) > 0; + return detail_rvv::rvvcpop(arg) > 0; } // bitwise_cast template = 0, detail::enable_arithmetic_t = 0> XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept { - return detail::rvv_reg_t(arg.data.get_bytes(), types::detail::XSIMD_RVV_BITCAST); + return detail_rvv::rvv_reg_t(arg.data.get_bytes(), types::detail::XSIMD_RVV_BITCAST); } // batch_bool_cast template = 0> XSIMD_INLINE batch_bool batch_bool_cast(batch_bool const& arg, batch_bool const&, requires_arch) noexcept { - using intermediate_t = typename detail::rvv_bool_t; + using intermediate_t = typename detail_rvv::rvv_bool_t; return intermediate_t(arg.data); } @@ -1327,10 +1334,10 @@ namespace xsimd XSIMD_INLINE batch from_bool(batch_bool const& arg, requires_arch) noexcept { const auto zero = broadcast(T(0), rvv {}); - return detail::rvvmerge_splat(zero, T(1), arg); + return detail_rvv::rvvmerge_splat(zero, T(1), arg); } - namespace detail + namespace detail_rvv { template XSIMD_INLINE vuint8m1_t rvvslidedownbytes(vuint8m1_t arg, size_t i) @@ -1366,16 +1373,16 @@ namespace xsimd { const auto zero = broadcast(uint8_t(0), rvv {}); const auto bytes = arg.data.get_bytes(); - return detail::rvvreinterpret(detail::rvvslideup(zero, bytes, N)); + return detail_rvv::rvvreinterpret(detail_rvv::rvvslideup(zero, bytes, N)); } // slide_right template = 0> XSIMD_INLINE batch slide_right(batch const& arg, requires_arch) noexcept { - using reg_t = detail::rvv_reg_t; + using reg_t = detail_rvv::rvv_reg_t; const auto bytes = arg.data.get_bytes(); - return reg_t(detail::rvvslidedownbytes(bytes, N), types::detail::XSIMD_RVV_BITCAST); + return reg_t(detail_rvv::rvvslidedownbytes(bytes, N), types::detail::XSIMD_RVV_BITCAST); } // isnan @@ -1385,7 +1392,7 @@ namespace xsimd return !(arg == arg); } - namespace detail + namespace detail_rvv { template using rvv_as_signed_integer_t = as_signed_integer_t>; @@ -1404,11 +1411,11 @@ namespace xsimd } // nearbyint_as_int - template > + template > XSIMD_INLINE batch nearbyint_as_int(batch const& arg, requires_arch) noexcept { // Reference rounds ties to nearest even - return detail::rvvfcvt_default(arg); + return detail_rvv::rvvfcvt_default(arg); } // round @@ -1417,7 +1424,7 @@ namespace xsimd { // Round ties away from zero. const auto mask = abs(arg) < constants::maxflint>(); - return select(mask, to_float(detail::rvvfcvt_afz(arg)), arg, rvv {}); + return select(mask, to_float(detail_rvv::rvvfcvt_afz(arg)), arg, rvv {}); } // nearbyint @@ -1426,7 +1433,7 @@ namespace xsimd { // Round according to current rounding mode. const auto mask = abs(arg) < constants::maxflint>(); - return select(mask, to_float(detail::rvvfcvt_default(arg)), arg, rvv {}); + return select(mask, to_float(detail_rvv::rvvfcvt_default(arg)), arg, rvv {}); } // mask @@ -1439,12 +1446,12 @@ namespace xsimd XSIMD_IF_CONSTEXPR((8 * sizeof(T)) >= batch_bool::size) { // (A) Easy case: the number of slots fits in T. - const auto zero = detail::broadcast, types::detail::rvv_width_m1>(T(0)); - auto ones = detail::broadcast, A::width>(1); - auto iota = detail::rvvid(as_unsigned_integer_t {}); - auto upowers = detail::rvvsll(ones, iota); + const auto zero = detail_rvv::broadcast, types::detail::rvv_width_m1>(T(0)); + auto ones = detail_rvv::broadcast, A::width>(1); + auto iota = detail_rvv::rvvid(as_unsigned_integer_t {}); + auto upowers = detail_rvv::rvvsll(ones, iota); auto r = __riscv_vredor(self.data.as_mask(), upowers, (typename decltype(zero)::register_type)zero, batch_bool::size); - return detail::reduce_scalar>(r); + return detail_rvv::reduce_scalar>(r); } else XSIMD_IF_CONSTEXPR((2 * 8 * sizeof(T)) == batch_bool::size) { @@ -1460,20 +1467,20 @@ namespace xsimd }; // The low part is similar to the approach in (A). - const auto zero = detail::broadcast, types::detail::rvv_width_m1>(T(0)); - auto ones = detail::broadcast, A::width>(1); - auto iota = detail::rvvid(as_unsigned_integer_t {}); - auto upowers = detail::rvvsll(ones, iota); + const auto zero = detail_rvv::broadcast, types::detail::rvv_width_m1>(T(0)); + auto ones = detail_rvv::broadcast, A::width>(1); + auto iota = detail_rvv::rvvid(as_unsigned_integer_t {}); + auto upowers = detail_rvv::rvvsll(ones, iota); auto low_mask = self & make_batch_bool_constant(); auto r_low = __riscv_vredor(low_mask.data.as_mask(), upowers, (typename decltype(zero)::register_type)zero, batch_bool::size); // The high part requires to slide the upower filter to match the high mask. - upowers = detail::rvvslideup(upowers, upowers, 8 * sizeof(T)); + upowers = detail_rvv::rvvslideup(upowers, upowers, 8 * sizeof(T)); auto high_mask = self & make_batch_bool_constant(); auto r_high = __riscv_vredor(high_mask.data.as_mask(), upowers, (typename decltype(zero)::register_type)zero, batch_bool::size); // Concatenate the two parts. - return (uint64_t)detail::reduce_scalar>(r_low) | ((uint64_t)detail::reduce_scalar>(r_high) << (8 * sizeof(T))); + return (uint64_t)detail_rvv::reduce_scalar>(r_low) | ((uint64_t)detail_rvv::reduce_scalar>(r_high) << (8 * sizeof(T))); } else { diff --git a/include/xsimd/arch/xsimd_sve.hpp b/include/xsimd/arch/xsimd_sve.hpp index 05109dfd1..841ed35f6 100644 --- a/include/xsimd/arch/xsimd_sve.hpp +++ b/include/xsimd/arch/xsimd_sve.hpp @@ -16,8 +16,16 @@ #include #include +#include "../config/xsimd_config.hpp" +#include "../config/xsimd_macros.hpp" #include "../types/xsimd_sve_register.hpp" +// Define a inline namespace with the explicit SVE vector size to avoid ODR violation +// When dynamically dispatching between different SVE sizes. +// While most code is safe from ODR violation as the size is already encoded in the +// register (and hence batch) types, utilities can quickly fall prone to this issue. +#define XSIMD_SVE_NAMESPACE XSIMD_CONCAT(sve, XSIMD_SVE_BITS) + namespace xsimd { template @@ -25,54 +33,57 @@ namespace xsimd namespace kernel { - namespace detail - { - using xsimd::index; - using xsimd::types::detail::sve_vector_type; - - // predicate creation - XSIMD_INLINE svbool_t sve_ptrue_impl(index<1>) noexcept { return svptrue_b8(); } - XSIMD_INLINE svbool_t sve_ptrue_impl(index<2>) noexcept { return svptrue_b16(); } - XSIMD_INLINE svbool_t sve_ptrue_impl(index<4>) noexcept { return svptrue_b32(); } - XSIMD_INLINE svbool_t sve_ptrue_impl(index<8>) noexcept { return svptrue_b64(); } - - template - svbool_t sve_ptrue() noexcept { return sve_ptrue_impl(index {}); } - - // predicate loading - template - svbool_t sve_pmask() noexcept { return svdupq_b64(M0, M1); } - template - svbool_t sve_pmask() noexcept { return svdupq_b32(M0, M1, M2, M3); } - template - svbool_t sve_pmask() noexcept { return svdupq_b16(M0, M1, M2, M3, M4, M5, M6, M7); } - template - svbool_t sve_pmask() noexcept { return svdupq_b8(M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, M10, M11, M12, M13, M14, M15); } - - // count active lanes in a predicate - XSIMD_INLINE uint64_t sve_pcount_impl(svbool_t p, index<1>) noexcept { return svcntp_b8(p, p); } - XSIMD_INLINE uint64_t sve_pcount_impl(svbool_t p, index<2>) noexcept { return svcntp_b16(p, p); } - XSIMD_INLINE uint64_t sve_pcount_impl(svbool_t p, index<4>) noexcept { return svcntp_b32(p, p); } - XSIMD_INLINE uint64_t sve_pcount_impl(svbool_t p, index<8>) noexcept { return svcntp_b64(p, p); } - - template - XSIMD_INLINE uint64_t sve_pcount(svbool_t p) noexcept { return sve_pcount_impl(p, index {}); } - - // enable for signed integers or floating points - template - using sve_enable_signed_int_or_floating_point_t = std::enable_if_t::value, int>; - - // `sizeless` is the matching sizeless SVE type. xsimd stores SVE - // vectors as fixed-size attributed types (arm_sve_vector_bits), - // which clang treats as implicitly convertible to every sizeless - // SVE type — including multi-vector tuples — making the overloaded - // svreinterpret_*/svsel/etc. intrinsics ambiguous. Static-casting - // to `sizeless` first collapses the overload set to the single - // 1-vector candidate. - template - using sve_sizeless_t = xsimd::types::detail::sizeless_sve_vector_type; - } // namespace detail + namespace detail_sve + { + inline namespace XSIMD_SVE_NAMESPACE + { + using xsimd::index; + using xsimd::types::detail::sve_vector_type; + + // predicate creation + XSIMD_INLINE svbool_t ptrue_impl(index<1>) noexcept { return svptrue_b8(); } + XSIMD_INLINE svbool_t ptrue_impl(index<2>) noexcept { return svptrue_b16(); } + XSIMD_INLINE svbool_t ptrue_impl(index<4>) noexcept { return svptrue_b32(); } + XSIMD_INLINE svbool_t ptrue_impl(index<8>) noexcept { return svptrue_b64(); } + + template + XSIMD_INLINE svbool_t ptrue() noexcept { return ptrue_impl(index {}); } + + // predicate loading + template + XSIMD_INLINE svbool_t pmask() noexcept { return svdupq_b64(M0, M1); } + template + XSIMD_INLINE svbool_t pmask() noexcept { return svdupq_b32(M0, M1, M2, M3); } + template + XSIMD_INLINE svbool_t pmask() noexcept { return svdupq_b16(M0, M1, M2, M3, M4, M5, M6, M7); } + template + XSIMD_INLINE svbool_t pmask() noexcept { return svdupq_b8(M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, M10, M11, M12, M13, M14, M15); } + + // count active lanes in a predicate + XSIMD_INLINE uint64_t pcount_impl(svbool_t p, index<1>) noexcept { return svcntp_b8(p, p); } + XSIMD_INLINE uint64_t pcount_impl(svbool_t p, index<2>) noexcept { return svcntp_b16(p, p); } + XSIMD_INLINE uint64_t pcount_impl(svbool_t p, index<4>) noexcept { return svcntp_b32(p, p); } + XSIMD_INLINE uint64_t pcount_impl(svbool_t p, index<8>) noexcept { return svcntp_b64(p, p); } + + template + XSIMD_INLINE uint64_t pcount(svbool_t p) noexcept { return pcount_impl(p, index {}); } + + // enable for signed integers or floating points + template + using enable_signed_int_or_floating_point_t = std::enable_if_t::value, int>; + + // `sizeless` is the matching sizeless SVE type. xsimd stores SVE + // vectors as fixed-size attributed types (arm_sve_vector_bits), + // which clang treats as implicitly convertible to every sizeless + // SVE type — including multi-vector tuples — making the overloaded + // svreinterpret_*/svsel/etc. intrinsics ambiguous. Static-casting + // to `sizeless` first collapses the overload set to the single + // 1-vector candidate. + template + using sizeless_t = xsimd::types::detail::sizeless_sve_vector_type; + } // namespace XSIMD_SVE_NAMESPACE + } // namespace detail_sve /********* * Load * @@ -81,7 +92,7 @@ namespace xsimd template = 0> XSIMD_INLINE batch load_aligned(T const* src, convert, requires_arch) noexcept { - return svld1(detail::sve_ptrue(), reinterpret_cast const*>(src)); + return svld1(detail_sve::ptrue(), reinterpret_cast const*>(src)); } template = 0> @@ -94,7 +105,7 @@ namespace xsimd template = 0> XSIMD_INLINE batch load_masked(T const* mem, batch_bool_constant, Mode, requires_arch) noexcept { - return svld1(detail::sve_pmask(), reinterpret_cast const*>(mem)); + return svld1(detail_sve::pmask(), reinterpret_cast const*>(mem)); } // load_complex @@ -102,7 +113,7 @@ namespace xsimd XSIMD_INLINE batch, A> load_complex_aligned(std::complex const* mem, convert>, requires_arch) noexcept { const T* buf = reinterpret_cast(mem); - const auto tmp = svld2(detail::sve_ptrue(), buf); + const auto tmp = svld2(detail_sve::ptrue(), buf); const auto real = svget2(tmp, 0); const auto imag = svget2(tmp, 1); return batch, A> { real, imag }; @@ -121,7 +132,7 @@ namespace xsimd template = 0> XSIMD_INLINE void store_aligned(T* dst, batch const& src, requires_arch) noexcept { - svst1(detail::sve_ptrue(), reinterpret_cast*>(dst), src); + svst1(detail_sve::ptrue(), reinterpret_cast*>(dst), src); } template = 0> @@ -139,7 +150,7 @@ namespace xsimd tmp = svset2(tmp, 0, src.real()); tmp = svset2(tmp, 1, src.imag()); T* buf = reinterpret_cast(dst); - svst2(detail::sve_ptrue(), buf, tmp); + svst2(detail_sve::ptrue(), buf, tmp); } template = 0> @@ -152,24 +163,24 @@ namespace xsimd * scatter/gather * ******************/ - namespace detail + namespace detail_sve { template - using sve_enable_sg_t = std::enable_if_t<(sizeof(T) == sizeof(U) && (sizeof(T) == 4 || sizeof(T) == 8)), int>; + using enable_sg_t = std::enable_if_t<(sizeof(T) == sizeof(U) && (sizeof(T) == 4 || sizeof(T) == 8)), int>; } // scatter - template = 0> + template = 0> XSIMD_INLINE void scatter(batch const& src, T* dst, batch const& index, kernel::requires_arch) noexcept { - svst1_scatter_index(detail::sve_ptrue(), dst, index.data, src.data); + svst1_scatter_index(detail_sve::ptrue(), dst, index.data, src.data); } // gather - template = 0> + template = 0> XSIMD_INLINE batch gather(batch const&, T const* src, batch const& index, kernel::requires_arch) noexcept { - return svld1_gather_index(detail::sve_ptrue(), src, index.data); + return svld1_gather_index(detail_sve::ptrue(), src, index.data); } /******************** @@ -251,7 +262,7 @@ namespace xsimd template = 0> XSIMD_INLINE batch add(batch const& lhs, batch const& rhs, requires_arch) noexcept { - return svadd_x(detail::sve_ptrue(), lhs, rhs); + return svadd_x(detail_sve::ptrue(), lhs, rhs); } // sadd @@ -265,7 +276,7 @@ namespace xsimd template = 0> XSIMD_INLINE batch sub(batch const& lhs, batch const& rhs, requires_arch) noexcept { - return svsub_x(detail::sve_ptrue(), lhs, rhs); + return svsub_x(detail_sve::ptrue(), lhs, rhs); } // ssub @@ -279,59 +290,59 @@ namespace xsimd template = 0> XSIMD_INLINE batch mul(batch const& lhs, batch const& rhs, requires_arch) noexcept { - return svmul_x(detail::sve_ptrue(), lhs, rhs); + return svmul_x(detail_sve::ptrue(), lhs, rhs); } // div template = 4, int> = 0> XSIMD_INLINE batch div(batch const& lhs, batch const& rhs, requires_arch) noexcept { - return svdiv_x(detail::sve_ptrue(), lhs, rhs); + return svdiv_x(detail_sve::ptrue(), lhs, rhs); } // max template = 0> XSIMD_INLINE batch max(batch const& lhs, batch const& rhs, requires_arch) noexcept { - return svmax_x(detail::sve_ptrue(), lhs, rhs); + return svmax_x(detail_sve::ptrue(), lhs, rhs); } // min template = 0> XSIMD_INLINE batch min(batch const& lhs, batch const& rhs, requires_arch) noexcept { - return svmin_x(detail::sve_ptrue(), lhs, rhs); + return svmin_x(detail_sve::ptrue(), lhs, rhs); } // neg template = 0> XSIMD_INLINE batch neg(batch const& arg, requires_arch) noexcept { - return svreinterpret_u8(svneg_x(detail::sve_ptrue(), svreinterpret_s8(static_cast>(arg)))); + return svreinterpret_u8(svneg_x(detail_sve::ptrue(), svreinterpret_s8(static_cast>(arg)))); } template = 0> XSIMD_INLINE batch neg(batch const& arg, requires_arch) noexcept { - return svreinterpret_u16(svneg_x(detail::sve_ptrue(), svreinterpret_s16(static_cast>(arg)))); + return svreinterpret_u16(svneg_x(detail_sve::ptrue(), svreinterpret_s16(static_cast>(arg)))); } template = 0> XSIMD_INLINE batch neg(batch const& arg, requires_arch) noexcept { - return svreinterpret_u32(svneg_x(detail::sve_ptrue(), svreinterpret_s32(static_cast>(arg)))); + return svreinterpret_u32(svneg_x(detail_sve::ptrue(), svreinterpret_s32(static_cast>(arg)))); } template = 0> XSIMD_INLINE batch neg(batch const& arg, requires_arch) noexcept { - return svreinterpret_u64(svneg_x(detail::sve_ptrue(), svreinterpret_s64(static_cast>(arg)))); + return svreinterpret_u64(svneg_x(detail_sve::ptrue(), svreinterpret_s64(static_cast>(arg)))); } template = 0> XSIMD_INLINE batch neg(batch const& arg, requires_arch) noexcept { - return svneg_x(detail::sve_ptrue(), arg); + return svneg_x(detail_sve::ptrue(), arg); } // abs @@ -344,21 +355,21 @@ namespace xsimd template = 0> XSIMD_INLINE batch abs(batch const& arg, requires_arch) noexcept { - return svabs_x(detail::sve_ptrue(), arg); + return svabs_x(detail_sve::ptrue(), arg); } // fma: x * y + z template = 0> XSIMD_INLINE batch fma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { - return svmad_x(detail::sve_ptrue(), x, y, z); + return svmad_x(detail_sve::ptrue(), x, y, z); } // fnma: z - x * y template = 0> XSIMD_INLINE batch fnma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { - return svmsb_x(detail::sve_ptrue(), x, y, z); + return svmsb_x(detail_sve::ptrue(), x, y, z); } // fms: x * y - z @@ -383,191 +394,194 @@ namespace xsimd template = 0> XSIMD_INLINE batch bitwise_and(batch const& lhs, batch const& rhs, requires_arch) noexcept { - return svand_x(detail::sve_ptrue(), lhs, rhs); + return svand_x(detail_sve::ptrue(), lhs, rhs); } template XSIMD_INLINE batch bitwise_and(batch const& lhs, batch const& rhs, requires_arch) noexcept { - const auto lhs_bits = svreinterpret_u32(static_cast>(lhs)); - const auto rhs_bits = svreinterpret_u32(static_cast>(rhs)); - const auto result_bits = svand_x(detail::sve_ptrue(), lhs_bits, rhs_bits); + const auto lhs_bits = svreinterpret_u32(static_cast>(lhs)); + const auto rhs_bits = svreinterpret_u32(static_cast>(rhs)); + const auto result_bits = svand_x(detail_sve::ptrue(), lhs_bits, rhs_bits); return svreinterpret_f32(result_bits); } template XSIMD_INLINE batch bitwise_and(batch const& lhs, batch const& rhs, requires_arch) noexcept { - const auto lhs_bits = svreinterpret_u64(static_cast>(lhs)); - const auto rhs_bits = svreinterpret_u64(static_cast>(rhs)); - const auto result_bits = svand_x(detail::sve_ptrue(), lhs_bits, rhs_bits); + const auto lhs_bits = svreinterpret_u64(static_cast>(lhs)); + const auto rhs_bits = svreinterpret_u64(static_cast>(rhs)); + const auto result_bits = svand_x(detail_sve::ptrue(), lhs_bits, rhs_bits); return svreinterpret_f64(result_bits); } template = 0> XSIMD_INLINE batch_bool bitwise_and(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { - return svand_z(detail::sve_ptrue(), lhs, rhs); + return svand_z(detail_sve::ptrue(), lhs, rhs); } // bitwise_andnot template = 0> XSIMD_INLINE batch bitwise_andnot(batch const& lhs, batch const& rhs, requires_arch) noexcept { - return svbic_x(detail::sve_ptrue(), lhs, rhs); + return svbic_x(detail_sve::ptrue(), lhs, rhs); } template XSIMD_INLINE batch bitwise_andnot(batch const& lhs, batch const& rhs, requires_arch) noexcept { - const auto lhs_bits = svreinterpret_u32(static_cast>(lhs)); - const auto rhs_bits = svreinterpret_u32(static_cast>(rhs)); - const auto result_bits = svbic_x(detail::sve_ptrue(), lhs_bits, rhs_bits); + const auto lhs_bits = svreinterpret_u32(static_cast>(lhs)); + const auto rhs_bits = svreinterpret_u32(static_cast>(rhs)); + const auto result_bits = svbic_x(detail_sve::ptrue(), lhs_bits, rhs_bits); return svreinterpret_f32(result_bits); } template XSIMD_INLINE batch bitwise_andnot(batch const& lhs, batch const& rhs, requires_arch) noexcept { - const auto lhs_bits = svreinterpret_u64(static_cast>(lhs)); - const auto rhs_bits = svreinterpret_u64(static_cast>(rhs)); - const auto result_bits = svbic_x(detail::sve_ptrue(), lhs_bits, rhs_bits); + const auto lhs_bits = svreinterpret_u64(static_cast>(lhs)); + const auto rhs_bits = svreinterpret_u64(static_cast>(rhs)); + const auto result_bits = svbic_x(detail_sve::ptrue(), lhs_bits, rhs_bits); return svreinterpret_f64(result_bits); } template = 0> XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { - return svbic_z(detail::sve_ptrue(), lhs, rhs); + return svbic_z(detail_sve::ptrue(), lhs, rhs); } // bitwise_or template = 0> XSIMD_INLINE batch bitwise_or(batch const& lhs, batch const& rhs, requires_arch) noexcept { - return svorr_x(detail::sve_ptrue(), lhs, rhs); + return svorr_x(detail_sve::ptrue(), lhs, rhs); } template XSIMD_INLINE batch bitwise_or(batch const& lhs, batch const& rhs, requires_arch) noexcept { - const auto lhs_bits = svreinterpret_u32(static_cast>(lhs)); - const auto rhs_bits = svreinterpret_u32(static_cast>(rhs)); - const auto result_bits = svorr_x(detail::sve_ptrue(), lhs_bits, rhs_bits); + const auto lhs_bits = svreinterpret_u32(static_cast>(lhs)); + const auto rhs_bits = svreinterpret_u32(static_cast>(rhs)); + const auto result_bits = svorr_x(detail_sve::ptrue(), lhs_bits, rhs_bits); return svreinterpret_f32(result_bits); } template XSIMD_INLINE batch bitwise_or(batch const& lhs, batch const& rhs, requires_arch) noexcept { - const auto lhs_bits = svreinterpret_u64(static_cast>(lhs)); - const auto rhs_bits = svreinterpret_u64(static_cast>(rhs)); - const auto result_bits = svorr_x(detail::sve_ptrue(), lhs_bits, rhs_bits); + const auto lhs_bits = svreinterpret_u64(static_cast>(lhs)); + const auto rhs_bits = svreinterpret_u64(static_cast>(rhs)); + const auto result_bits = svorr_x(detail_sve::ptrue(), lhs_bits, rhs_bits); return svreinterpret_f64(result_bits); } template = 0> XSIMD_INLINE batch_bool bitwise_or(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { - return svorr_z(detail::sve_ptrue(), lhs, rhs); + return svorr_z(detail_sve::ptrue(), lhs, rhs); } // bitwise_xor template = 0> XSIMD_INLINE batch bitwise_xor(batch const& lhs, batch const& rhs, requires_arch) noexcept { - return sveor_x(detail::sve_ptrue(), lhs, rhs); + return sveor_x(detail_sve::ptrue(), lhs, rhs); } template XSIMD_INLINE batch bitwise_xor(batch const& lhs, batch const& rhs, requires_arch) noexcept { - const auto lhs_bits = svreinterpret_u32(static_cast>(lhs)); - const auto rhs_bits = svreinterpret_u32(static_cast>(rhs)); - const auto result_bits = sveor_x(detail::sve_ptrue(), lhs_bits, rhs_bits); + const auto lhs_bits = svreinterpret_u32(static_cast>(lhs)); + const auto rhs_bits = svreinterpret_u32(static_cast>(rhs)); + const auto result_bits = sveor_x(detail_sve::ptrue(), lhs_bits, rhs_bits); return svreinterpret_f32(result_bits); } template XSIMD_INLINE batch bitwise_xor(batch const& lhs, batch const& rhs, requires_arch) noexcept { - const auto lhs_bits = svreinterpret_u64(static_cast>(lhs)); - const auto rhs_bits = svreinterpret_u64(static_cast>(rhs)); - const auto result_bits = sveor_x(detail::sve_ptrue(), lhs_bits, rhs_bits); + const auto lhs_bits = svreinterpret_u64(static_cast>(lhs)); + const auto rhs_bits = svreinterpret_u64(static_cast>(rhs)); + const auto result_bits = sveor_x(detail_sve::ptrue(), lhs_bits, rhs_bits); return svreinterpret_f64(result_bits); } template = 0> XSIMD_INLINE batch_bool bitwise_xor(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { - return sveor_z(detail::sve_ptrue(), lhs, rhs); + return sveor_z(detail_sve::ptrue(), lhs, rhs); } // bitwise_not template = 0> XSIMD_INLINE batch bitwise_not(batch const& arg, requires_arch) noexcept { - return svnot_x(detail::sve_ptrue(), arg); + return svnot_x(detail_sve::ptrue(), arg); } template XSIMD_INLINE batch bitwise_not(batch const& arg, requires_arch) noexcept { - const auto arg_bits = svreinterpret_u32(static_cast>(arg)); - const auto result_bits = svnot_x(detail::sve_ptrue(), arg_bits); + const auto arg_bits = svreinterpret_u32(static_cast>(arg)); + const auto result_bits = svnot_x(detail_sve::ptrue(), arg_bits); return svreinterpret_f32(result_bits); } template XSIMD_INLINE batch bitwise_not(batch const& arg, requires_arch) noexcept { - const auto arg_bits = svreinterpret_u64(static_cast>(arg)); - const auto result_bits = svnot_x(detail::sve_ptrue(), arg_bits); + const auto arg_bits = svreinterpret_u64(static_cast>(arg)); + const auto result_bits = svnot_x(detail_sve::ptrue(), arg_bits); return svreinterpret_f64(result_bits); } template = 0> XSIMD_INLINE batch_bool bitwise_not(batch_bool const& arg, requires_arch) noexcept { - return svnot_z(detail::sve_ptrue(), arg); + return svnot_z(detail_sve::ptrue(), arg); } /********** * Shifts * **********/ - namespace detail + namespace detail_sve { - template - XSIMD_INLINE batch sve_to_unsigned_batch_impl(batch const& arg, index<1>) noexcept + inline namespace XSIMD_SVE_NAMESPACE { - return svreinterpret_u8(static_cast>(arg)); - } + template + XSIMD_INLINE batch to_unsigned_batch_impl(batch const& arg, index<1>) noexcept + { + return svreinterpret_u8(static_cast>(arg)); + } - template - XSIMD_INLINE batch sve_to_unsigned_batch_impl(batch const& arg, index<2>) noexcept - { - return svreinterpret_u16(static_cast>(arg)); - } + template + XSIMD_INLINE batch to_unsigned_batch_impl(batch const& arg, index<2>) noexcept + { + return svreinterpret_u16(static_cast>(arg)); + } - template - XSIMD_INLINE batch sve_to_unsigned_batch_impl(batch const& arg, index<4>) noexcept - { - return svreinterpret_u32(static_cast>(arg)); - } + template + XSIMD_INLINE batch to_unsigned_batch_impl(batch const& arg, index<4>) noexcept + { + return svreinterpret_u32(static_cast>(arg)); + } - template - XSIMD_INLINE batch sve_to_unsigned_batch_impl(batch const& arg, index<8>) noexcept - { - return svreinterpret_u64(static_cast>(arg)); - } + template + XSIMD_INLINE batch to_unsigned_batch_impl(batch const& arg, index<8>) noexcept + { + return svreinterpret_u64(static_cast>(arg)); + } - template > - XSIMD_INLINE batch sve_to_unsigned_batch(batch const& arg) noexcept - { - return sve_to_unsigned_batch_impl(arg, index {}); - } - } // namespace detail + template > + XSIMD_INLINE batch to_unsigned_batch(batch const& arg) noexcept + { + return to_unsigned_batch_impl(arg, index {}); + } + } // namespace XSIMD_SVE_NAMESPACE + } // namespace detail_sve // bitwise_lshift template = 0> @@ -575,13 +589,13 @@ namespace xsimd { constexpr std::size_t size = sizeof(typename batch::value_type) * 8; assert(0 <= n && static_cast(n) < size && "index in bounds"); - return svlsl_x(detail::sve_ptrue(), arg, n); + return svlsl_x(detail_sve::ptrue(), arg, n); } template = 0> XSIMD_INLINE batch bitwise_lshift(batch const& lhs, batch const& rhs, requires_arch) noexcept { - return svlsl_x(detail::sve_ptrue(), lhs, detail::sve_to_unsigned_batch(rhs)); + return svlsl_x(detail_sve::ptrue(), lhs, detail_sve::to_unsigned_batch(rhs)); } // bitwise_rshift @@ -590,13 +604,13 @@ namespace xsimd { constexpr std::size_t size = sizeof(typename batch::value_type) * 8; assert(0 <= n && static_cast(n) < size && "index in bounds"); - return svlsr_x(detail::sve_ptrue(), arg, static_cast(n)); + return svlsr_x(detail_sve::ptrue(), arg, static_cast(n)); } template = 0> XSIMD_INLINE batch bitwise_rshift(batch const& lhs, batch const& rhs, requires_arch) noexcept { - return svlsr_x(detail::sve_ptrue(), lhs, rhs); + return svlsr_x(detail_sve::ptrue(), lhs, rhs); } template = 0> @@ -604,13 +618,13 @@ namespace xsimd { constexpr std::size_t size = sizeof(typename batch::value_type) * 8; assert(0 <= n && static_cast(n) < size && "index in bounds"); - return svasr_x(detail::sve_ptrue(), arg, static_cast>(n)); + return svasr_x(detail_sve::ptrue(), arg, static_cast>(n)); } template = 0> XSIMD_INLINE batch bitwise_rshift(batch const& lhs, batch const& rhs, requires_arch) noexcept { - return svasr_x(detail::sve_ptrue(), lhs, detail::sve_to_unsigned_batch(rhs)); + return svasr_x(detail_sve::ptrue(), lhs, detail_sve::to_unsigned_batch(rhs)); } /************** @@ -622,21 +636,21 @@ namespace xsimd XSIMD_INLINE V reduce_add(batch const& arg, requires_arch) noexcept { // sve integer reduction results are promoted to 64 bits - return static_cast(svaddv(detail::sve_ptrue(), arg)); + return static_cast(svaddv(detail_sve::ptrue(), arg)); } // reduce_max template = 0> XSIMD_INLINE T reduce_max(batch const& arg, requires_arch) noexcept { - return svmaxv(detail::sve_ptrue(), arg); + return svmaxv(detail_sve::ptrue(), arg); } // reduce_min template = 0> XSIMD_INLINE T reduce_min(batch const& arg, requires_arch) noexcept { - return svminv(detail::sve_ptrue(), arg); + return svminv(detail_sve::ptrue(), arg); } // haddp @@ -649,7 +663,7 @@ namespace xsimd { sums[i] = reduce_add(row[i], sve {}); } - return svld1(detail::sve_ptrue(), sums); + return svld1(detail_sve::ptrue(), sums); } /*************** @@ -660,55 +674,55 @@ namespace xsimd template = 0> XSIMD_INLINE batch_bool eq(batch const& lhs, batch const& rhs, requires_arch) noexcept { - return svcmpeq(detail::sve_ptrue(), lhs, rhs); + return svcmpeq(detail_sve::ptrue(), lhs, rhs); } template = 0> XSIMD_INLINE batch_bool eq(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { - const auto neq_result = sveor_z(detail::sve_ptrue(), lhs, rhs); - return svnot_z(detail::sve_ptrue(), neq_result); + const auto neq_result = sveor_z(detail_sve::ptrue(), lhs, rhs); + return svnot_z(detail_sve::ptrue(), neq_result); } // neq template = 0> XSIMD_INLINE batch_bool neq(batch const& lhs, batch const& rhs, requires_arch) noexcept { - return svcmpne(detail::sve_ptrue(), lhs, rhs); + return svcmpne(detail_sve::ptrue(), lhs, rhs); } template = 0> XSIMD_INLINE batch_bool neq(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { - return sveor_z(detail::sve_ptrue(), lhs, rhs); + return sveor_z(detail_sve::ptrue(), lhs, rhs); } // lt template = 0> XSIMD_INLINE batch_bool lt(batch const& lhs, batch const& rhs, requires_arch) noexcept { - return svcmplt(detail::sve_ptrue(), lhs, rhs); + return svcmplt(detail_sve::ptrue(), lhs, rhs); } // le template = 0> XSIMD_INLINE batch_bool le(batch const& lhs, batch const& rhs, requires_arch) noexcept { - return svcmple(detail::sve_ptrue(), lhs, rhs); + return svcmple(detail_sve::ptrue(), lhs, rhs); } // gt template = 0> XSIMD_INLINE batch_bool gt(batch const& lhs, batch const& rhs, requires_arch) noexcept { - return svcmpgt(detail::sve_ptrue(), lhs, rhs); + return svcmpgt(detail_sve::ptrue(), lhs, rhs); } // ge template = 0> XSIMD_INLINE batch_bool ge(batch const& lhs, batch const& rhs, requires_arch) noexcept { - return svcmpge(detail::sve_ptrue(), lhs, rhs); + return svcmpge(detail_sve::ptrue(), lhs, rhs); } /*************** @@ -761,55 +775,58 @@ namespace xsimd *************/ // extract_pair - namespace detail + namespace detail_sve { - template - XSIMD_INLINE batch sve_extract_pair(batch const&, batch const& /*rhs*/, std::size_t, std::index_sequence<>) noexcept + inline namespace XSIMD_SVE_NAMESPACE { - assert(false && "extract_pair out of bounds"); - return batch {}; - } - - template - XSIMD_INLINE batch sve_extract_pair(batch const& lhs, batch const& rhs, std::size_t n, std::index_sequence) noexcept - { - if (n == I) - { - return svext(rhs, lhs, I); - } - else + template + XSIMD_INLINE batch extract_pair(batch const&, batch const& /*rhs*/, std::size_t, std::index_sequence<>) noexcept { - return sve_extract_pair(lhs, rhs, n, std::index_sequence()); + assert(false && "extract_pair out of bounds"); + return batch {}; } - } - template - XSIMD_INLINE batch sve_extract_pair_impl(batch const& lhs, batch const& rhs, std::size_t n, std::index_sequence<0, Is...>) noexcept - { - if (n == 0) + template + XSIMD_INLINE batch extract_pair(batch const& lhs, batch const& rhs, std::size_t n, std::index_sequence) noexcept { - return rhs; + if (n == I) + { + return svext(rhs, lhs, I); + } + else + { + return extract_pair(lhs, rhs, n, std::index_sequence()); + } } - else + + template + XSIMD_INLINE batch extract_pair_impl(batch const& lhs, batch const& rhs, std::size_t n, std::index_sequence<0, Is...>) noexcept { - return sve_extract_pair(lhs, rhs, n, std::index_sequence()); + if (n == 0) + { + return rhs; + } + else + { + return extract_pair(lhs, rhs, n, std::index_sequence()); + } } - } - } + } // namespace XSIMD_SVE_NAMESPACE + } // namespace detail_sve template = 0> XSIMD_INLINE batch extract_pair(batch const& lhs, batch const& rhs, std::size_t n, requires_arch) noexcept { constexpr std::size_t size = batch::size; assert(n < size && "index in bounds"); - return detail::sve_extract_pair_impl(lhs, rhs, n, std::make_index_sequence()); + return detail_sve::extract_pair_impl(lhs, rhs, n, std::make_index_sequence()); } // select template = 0> XSIMD_INLINE batch select(batch_bool const& cond, batch const& a, batch const& b, requires_arch) noexcept { - return svsel(cond, static_cast>(a), static_cast>(b)); + return svsel(cond, static_cast>(a), static_cast>(b)); } template @@ -847,7 +864,7 @@ namespace xsimd template = 0> XSIMD_INLINE batch sqrt(batch const& arg, requires_arch) noexcept { - return svsqrt_x(detail::sve_ptrue(), arg); + return svsqrt_x(detail_sve::ptrue(), arg); } // reciprocal @@ -862,44 +879,47 @@ namespace xsimd ******************************/ // fast_cast - namespace detail + namespace detail_sve { - template = 0> - XSIMD_INLINE batch fast_cast(batch const& arg, batch const&, requires_arch) noexcept + inline namespace XSIMD_SVE_NAMESPACE { - return svcvt_f32_x(detail::sve_ptrue(), arg); - } + template = 0> + XSIMD_INLINE batch fast_cast(batch const& arg, batch const&, requires_arch) noexcept + { + return svcvt_f32_x(detail_sve::ptrue(), arg); + } - template = 0> - XSIMD_INLINE batch fast_cast(batch const& arg, batch const&, requires_arch) noexcept - { - return svcvt_f64_x(detail::sve_ptrue(), arg); - } + template = 0> + XSIMD_INLINE batch fast_cast(batch const& arg, batch const&, requires_arch) noexcept + { + return svcvt_f64_x(detail_sve::ptrue(), arg); + } - template - XSIMD_INLINE batch fast_cast(batch const& arg, batch const&, requires_arch) noexcept - { - return svcvt_s32_x(detail::sve_ptrue(), arg); - } + template + XSIMD_INLINE batch fast_cast(batch const& arg, batch const&, requires_arch) noexcept + { + return svcvt_s32_x(detail_sve::ptrue(), arg); + } - template - XSIMD_INLINE batch fast_cast(batch const& arg, batch const&, requires_arch) noexcept - { - return svcvt_u32_x(detail::sve_ptrue(), arg); - } + template + XSIMD_INLINE batch fast_cast(batch const& arg, batch const&, requires_arch) noexcept + { + return svcvt_u32_x(detail_sve::ptrue(), arg); + } - template - XSIMD_INLINE batch fast_cast(batch const& arg, batch const&, requires_arch) noexcept - { - return svcvt_s64_x(detail::sve_ptrue(), arg); - } + template + XSIMD_INLINE batch fast_cast(batch const& arg, batch const&, requires_arch) noexcept + { + return svcvt_s64_x(detail_sve::ptrue(), arg); + } - template - XSIMD_INLINE batch fast_cast(batch const& arg, batch const&, requires_arch) noexcept - { - return svcvt_u64_x(detail::sve_ptrue(), arg); - } - } + template + XSIMD_INLINE batch fast_cast(batch const& arg, batch const&, requires_arch) noexcept + { + return svcvt_u64_x(detail_sve::ptrue(), arg); + } + } // namespace XSIMD_SVE_NAMESPACE + } // namespace detail_sve /********* * Miscs * @@ -909,46 +929,49 @@ namespace xsimd template XSIMD_INLINE batch set(batch const&, requires_arch, Args... args) noexcept { - return detail::sve_vector_type { args... }; + return detail_sve::sve_vector_type { args... }; } template XSIMD_INLINE batch, A> set(batch, A> const&, requires_arch, Args... args_complex) noexcept { - return batch>(detail::sve_vector_type { args_complex.real()... }, - detail::sve_vector_type { args_complex.imag()... }); + return batch>(detail_sve::sve_vector_type { args_complex.real()... }, + detail_sve::sve_vector_type { args_complex.imag()... }); } template XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Args... args) noexcept { using U = as_unsigned_integer_t; - const auto values = detail::sve_vector_type { static_cast(args)... }; + const auto values = detail_sve::sve_vector_type { static_cast(args)... }; const auto zero = broadcast(static_cast(0), sve {}); - return svcmpne(detail::sve_ptrue(), values, zero); + return svcmpne(detail_sve::ptrue(), values, zero); } // insert - namespace detail + namespace detail_sve { - // generate index sequence (iota) - XSIMD_INLINE svuint8_t sve_iota_impl(index<1>) noexcept { return svindex_u8(0, 1); } - XSIMD_INLINE svuint16_t sve_iota_impl(index<2>) noexcept { return svindex_u16(0, 1); } - XSIMD_INLINE svuint32_t sve_iota_impl(index<4>) noexcept { return svindex_u32(0, 1); } - XSIMD_INLINE svuint64_t sve_iota_impl(index<8>) noexcept { return svindex_u64(0, 1); } + inline namespace XSIMD_SVE_NAMESPACE + { + // generate index sequence (iota) + XSIMD_INLINE svuint8_t iota_impl(index<1>) noexcept { return svindex_u8(0, 1); } + XSIMD_INLINE svuint16_t iota_impl(index<2>) noexcept { return svindex_u16(0, 1); } + XSIMD_INLINE svuint32_t iota_impl(index<4>) noexcept { return svindex_u32(0, 1); } + XSIMD_INLINE svuint64_t iota_impl(index<8>) noexcept { return svindex_u64(0, 1); } - template >> - XSIMD_INLINE V sve_iota() noexcept { return sve_iota_impl(index {}); } - } // namespace detail + template >> + XSIMD_INLINE V iota() noexcept { return iota_impl(index {}); } + } // namespace XSIMD_SVE_NAMESPACE + } // namespace detail_sve template = 0> XSIMD_INLINE batch insert(batch const& arg, T val, index, requires_arch) noexcept { // create a predicate with only the I-th lane activated - const auto iota = detail::sve_iota(); - const auto index_predicate = svcmpeq(detail::sve_ptrue(), iota, static_cast>(I)); - return svsel(index_predicate, static_cast>(broadcast(val, sve {})), static_cast>(arg)); + const auto iota = detail_sve::iota(); + const auto index_predicate = svcmpeq(detail_sve::ptrue(), iota, static_cast>(I)); + return svsel(index_predicate, static_cast>(broadcast(val, sve {})), static_cast>(arg)); } // first @@ -962,7 +985,7 @@ namespace xsimd template = 0> XSIMD_INLINE bool all(batch_bool const& arg, requires_arch) noexcept { - return detail::sve_pcount(arg) == batch_bool::size; + return detail_sve::pcount(arg) == batch_bool::size; } // any @@ -976,61 +999,61 @@ namespace xsimd template = 0, detail::enable_sized_unsigned_t = 0> XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept { - return svreinterpret_u8(static_cast>(arg)); + return svreinterpret_u8(static_cast>(arg)); } template = 0, detail::enable_sized_signed_t = 0> XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept { - return svreinterpret_s8(static_cast>(arg)); + return svreinterpret_s8(static_cast>(arg)); } template = 0, detail::enable_sized_unsigned_t = 0> XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept { - return svreinterpret_u16(static_cast>(arg)); + return svreinterpret_u16(static_cast>(arg)); } template = 0, detail::enable_sized_signed_t = 0> XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept { - return svreinterpret_s16(static_cast>(arg)); + return svreinterpret_s16(static_cast>(arg)); } template = 0, detail::enable_sized_unsigned_t = 0> XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept { - return svreinterpret_u32(static_cast>(arg)); + return svreinterpret_u32(static_cast>(arg)); } template = 0, detail::enable_sized_signed_t = 0> XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept { - return svreinterpret_s32(static_cast>(arg)); + return svreinterpret_s32(static_cast>(arg)); } template = 0, detail::enable_sized_unsigned_t = 0> XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept { - return svreinterpret_u64(static_cast>(arg)); + return svreinterpret_u64(static_cast>(arg)); } template = 0, detail::enable_sized_signed_t = 0> XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept { - return svreinterpret_s64(static_cast>(arg)); + return svreinterpret_s64(static_cast>(arg)); } template = 0> XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept { - return svreinterpret_f32(static_cast>(arg)); + return svreinterpret_f32(static_cast>(arg)); } template = 0> XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept { - return svreinterpret_f64(static_cast>(arg)); + return svreinterpret_f64(static_cast>(arg)); } // batch_bool_cast @@ -1048,71 +1071,77 @@ namespace xsimd } // slide_left - namespace detail + namespace detail_sve { - template - struct sve_slider_left + inline namespace XSIMD_SVE_NAMESPACE { - template - XSIMD_INLINE batch operator()(batch const& arg) noexcept + template + struct slider_left { - using u8_vector = batch; - const auto left = svdup_n_u8(0); - const auto right = bitwise_cast(arg, u8_vector {}, sve {}).data; - const u8_vector result(svext(left, right, u8_vector::size - N)); - return bitwise_cast(result, batch {}, sve {}); - } - }; - - template <> - struct sve_slider_left<0> - { - template - XSIMD_INLINE batch operator()(batch const& arg) noexcept + template + XSIMD_INLINE batch operator()(batch const& arg) noexcept + { + using u8_vector = batch; + const auto left = svdup_n_u8(0); + const auto right = bitwise_cast(arg, u8_vector {}, sve {}).data; + const u8_vector result(svext(left, right, u8_vector::size - N)); + return bitwise_cast(result, batch {}, sve {}); + } + }; + + template <> + struct slider_left<0> { - return arg; - } - }; - } // namespace detail + template + XSIMD_INLINE batch operator()(batch const& arg) noexcept + { + return arg; + } + }; + } // namespace XSIMD_SVE_NAMESPACE + } // namespace detail_sve template = 0> XSIMD_INLINE batch slide_left(batch const& arg, requires_arch) noexcept { - return detail::sve_slider_left()(arg); + return detail_sve::slider_left()(arg); } // slide_right - namespace detail + namespace detail_sve { - template - struct sve_slider_right + inline namespace XSIMD_SVE_NAMESPACE { - template - XSIMD_INLINE batch operator()(batch const& arg) noexcept + template + struct slider_right { - using u8_vector = batch; - const auto left = bitwise_cast(arg, u8_vector {}, sve {}).data; - const auto right = svdup_n_u8(0); - const u8_vector result(svext(left, right, N)); - return bitwise_cast(result, batch {}, sve {}); - } - }; - - template <> - struct sve_slider_right::size> - { - template - XSIMD_INLINE batch operator()(batch const&) noexcept + template + XSIMD_INLINE batch operator()(batch const& arg) noexcept + { + using u8_vector = batch; + const auto left = bitwise_cast(arg, u8_vector {}, sve {}).data; + const auto right = svdup_n_u8(0); + const u8_vector result(svext(left, right, N)); + return bitwise_cast(result, batch {}, sve {}); + } + }; + + template <> + struct slider_right::size> { - return batch {}; - } - }; - } // namespace detail + template + XSIMD_INLINE batch operator()(batch const&) noexcept + { + return batch {}; + } + }; + } // namespace XSIMD_SVE_NAMESPACE + } // namespace detail_sve template = 0> XSIMD_INLINE batch slide_right(batch const& arg, requires_arch) noexcept { - return detail::sve_slider_right()(arg); + return detail_sve::slider_right()(arg); } // isnan @@ -1126,29 +1155,29 @@ namespace xsimd template = 0> XSIMD_INLINE batch nearbyint(batch const& arg, requires_arch) noexcept { - return svrintx_x(detail::sve_ptrue(), arg); + return svrintx_x(detail_sve::ptrue(), arg); } // nearbyint_as_int template XSIMD_INLINE batch nearbyint_as_int(batch const& arg, requires_arch) noexcept { - const auto nearest = svrintx_x(detail::sve_ptrue(), arg); - return svcvt_s32_x(detail::sve_ptrue(), nearest); + const auto nearest = svrintx_x(detail_sve::ptrue(), arg); + return svcvt_s32_x(detail_sve::ptrue(), nearest); } template XSIMD_INLINE batch nearbyint_as_int(batch const& arg, requires_arch) noexcept { - const auto nearest = svrintx_x(detail::sve_ptrue(), arg); - return svcvt_s64_x(detail::sve_ptrue(), nearest); + const auto nearest = svrintx_x(detail_sve::ptrue(), arg); + return svcvt_s64_x(detail_sve::ptrue(), nearest); } // ldexp template = 0> XSIMD_INLINE batch ldexp(const batch& x, const batch, A>& exp, requires_arch) noexcept { - return svscale_x(detail::sve_ptrue(), x, exp); + return svscale_x(detail_sve::ptrue(), x, exp); } } // namespace kernel