AVX swizzle seems a bit slow

Hi team,

I benchmarked the swizzles and they seem a bit slow, so I made some effort to optimize them, in particular the compile-time case where the permutation is known. I added some patterns that are very common in scientific computing and that I think are worth hardcoding. I have not looked at AVX512 or SSE yet (I see that there is #1086), but if this is merged I will have a look.

Results:

![Image](https://github.com/user-attachments/assets/03c03cbf-4930-4df0-a1f0-650b956d08a5)

Code: https://github.com/DiamonDinoia/cpp-learning/blob/master/xsimd/swizzles.cpp


Proposal:
```cpp
/**
 * Compile-time swizzle of an 8 x float AVX register.
 *
 * Element i of the result is element Vi of `self`, for every Vi in [0, 8).
 * The permutation is classified at compile time and the cheapest AVX-only
 * instruction sequence that implements it is selected:
 *
 *   - identity                                  -> no instruction
 *   - whole 128-bit halves copied/swapped       -> vperm2f128
 *   - in-lane, same pattern in both halves      -> vpermilps (immediate)
 *   - in-lane, different pattern per half       -> vpermilps (variable)
 *   - each result half reads one source half    -> vperm2f128 + vpermilps
 *   - anything else                             -> 2 x vperm2f128,
 *                                                  2 x vpermilps, vblendps
 *
 * Only AVX intrinsics are used (no AVX2), so the function is usable on
 * AVX-only targets.
 *
 * @tparam V0..V7 source index for result elements 0..7, each in [0, 8).
 * @param  self   input register.
 * @return the permuted register.
 */
template <uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
static inline __m256 swizzle_const_opt(__m256 self) noexcept {
    static_assert(V0 < 8 && V1 < 8 && V2 < 8 && V3 < 8 && V4 < 8 && V5 < 8 && V6 < 8 && V7 < 8,
                  "swizzle indices must be in [0, 8)");

    constexpr bool is_identity = (V0 == 0 && V1 == 1 && V2 == 2 && V3 == 3 && V4 == 4 && V5 == 5 && V6 == 6 && V7 == 7);
    // Each 128-bit half of the result reads from a single 128-bit half of the input.
    constexpr bool is_single_src = (V0 / 4 == V1 / 4 && V1 / 4 == V2 / 4 && V2 / 4 == V3 / 4)
                                && (V4 / 4 == V5 / 4 && V5 / 4 == V6 / 4 && V6 / 4 == V7 / 4);
    // No element crosses the 128-bit lane boundary.
    constexpr bool is_in_lane = is_single_src && (V0 / 4 == 0) && (V4 / 4 == 1);
    // Positions inside each half are left in their natural order.
    constexpr bool is_lane_order = (V0 % 4 == 0 && V1 % 4 == 1 && V2 % 4 == 2 && V3 % 4 == 3
                                 && V4 % 4 == 0 && V5 % 4 == 1 && V6 % 4 == 2 && V7 % 4 == 3);
    // Both halves apply the same in-lane pattern, so one immediate describes both.
    constexpr bool is_same_pattern = (V0 % 4 == V4 % 4 && V1 % 4 == V5 % 4 && V2 % 4 == V6 % 4 && V3 % 4 == V7 % 4);

    // vperm2f128 control: bits [1:0] pick the source half for the low result
    // half, bits [5:4] for the high result half (0 = self.lo, 1 = self.hi).
    constexpr int lane_sel = static_cast<int>(V0 / 4) | (static_cast<int>(V4 / 4) << 4);
    // vpermilps immediate built from the low-half pattern.
    constexpr int lane_imm = _MM_SHUFFLE(static_cast<int>(V3 % 4), static_cast<int>(V2 % 4),
                                         static_cast<int>(V1 % 4), static_cast<int>(V0 % 4));
    // vblendps control: bit i set means result element i comes from the upper input half.
    constexpr int blend_imm = static_cast<int>(V0 / 4) | (static_cast<int>(V1 / 4) << 1)
                            | (static_cast<int>(V2 / 4) << 2) | (static_cast<int>(V3 / 4) << 3)
                            | (static_cast<int>(V4 / 4) << 4) | (static_cast<int>(V5 / 4) << 5)
                            | (static_cast<int>(V6 / 4) << 6) | (static_cast<int>(V7 / 4) << 7);

    XSIMD_IF_CONSTEXPR (is_identity) {
        return self;
    } else XSIMD_IF_CONSTEXPR (is_single_src && is_lane_order) {
        // Pure 128-bit moves: duplicate the low half, duplicate the high half, or swap halves.
        return _mm256_permute2f128_ps(self, self, lane_sel);
    } else XSIMD_IF_CONSTEXPR (is_in_lane && is_same_pattern) {
        // Covers e.g. the per-lane reverse <3,2,1,0,7,6,5,4> in one instruction.
        return _mm256_permute_ps(self, lane_imm);
    } else {
        // In-lane selectors; vpermilps only looks at the low two bits of each index.
        const __m256i idx = _mm256_setr_epi32(static_cast<int>(V0 % 4), static_cast<int>(V1 % 4),
                                              static_cast<int>(V2 % 4), static_cast<int>(V3 % 4),
                                              static_cast<int>(V4 % 4), static_cast<int>(V5 % 4),
                                              static_cast<int>(V6 % 4), static_cast<int>(V7 % 4));
        XSIMD_IF_CONSTEXPR (is_in_lane) {
            return _mm256_permutevar_ps(self, idx);
        } else XSIMD_IF_CONSTEXPR (is_single_src) {
            // Route the required source half into each result half, then
            // shuffle in-lane. Covers e.g. the pair duplication <0,0,1,1,2,2,3,3>.
            const __m256 routed = _mm256_permute2f128_ps(self, self, lane_sel);
            return _mm256_permutevar_ps(routed, idx);
        } else {
            // General case: shuffle a copy made of two low halves and a copy
            // made of two high halves, then pick per element.
            const __m256 lo_lo = _mm256_permute2f128_ps(self, self, 0x00);
            const __m256 hi_hi = _mm256_permute2f128_ps(self, self, 0x11);
            const __m256 from_lo = _mm256_permutevar_ps(lo_lo, idx);
            const __m256 from_hi = _mm256_permutevar_ps(hi_hi, idx);
            return _mm256_blend_ps(from_lo, from_hi, blend_imm);
        }
    }
}

/**
 * Compile-time swizzle of a 4 x double AVX register.
 *
 * Element i of the result is element Vi of `self`, for every Vi in [0, 4).
 * The permutation is classified at compile time and the cheapest AVX-only
 * instruction sequence that implements it is selected:
 *
 *   - identity                                  -> no instruction
 *   - in-lane (e.g. <0,0,2,2>, <1,1,3,3>,
 *     <1,0,3,2>)                                -> vpermilpd
 *   - whole 128-bit halves copied/swapped       -> vperm2f128
 *   - each result half reads one source half
 *     (e.g. <0,0,1,1>)                          -> vperm2f128 + vpermilpd
 *   - anything else                             -> 2 x vperm2f128,
 *                                                  2 x vpermilpd, vblendpd
 *
 * Only AVX intrinsics are used (no AVX2), so the function is usable on
 * AVX-only targets.
 *
 * @tparam V0..V3 source index for result elements 0..3, each in [0, 4).
 * @param  self   input register.
 * @return the permuted register.
 */
template <uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
static inline __m256d swizzle_const_opt(__m256d self) noexcept {
    static_assert(V0 < 4 && V1 < 4 && V2 < 4 && V3 < 4, "swizzle indices must be in [0, 4)");

    constexpr bool is_identity = (V0 == 0 && V1 == 1 && V2 == 2 && V3 == 3);
    // Each 128-bit half of the result reads from a single 128-bit half of the input.
    constexpr bool is_single_src = (V0 / 2 == V1 / 2) && (V2 / 2 == V3 / 2);
    // No element crosses the 128-bit lane boundary.
    constexpr bool is_in_lane = is_single_src && (V0 / 2 == 0) && (V2 / 2 == 1);
    // Positions inside each half are left in their natural order.
    constexpr bool is_lane_order = (V0 % 2 == 0 && V1 % 2 == 1 && V2 % 2 == 0 && V3 % 2 == 1);

    // vperm2f128 control: bits [1:0] pick the source half for the low result
    // half, bits [5:4] for the high result half (0 = self.lo, 1 = self.hi).
    constexpr int lane_sel = static_cast<int>(V0 / 2) | (static_cast<int>(V2 / 2) << 4);
    // vpermilpd control: bit i picks the even (0) or odd (1) element of the
    // lane that result element i lives in.
    constexpr int elem_imm = static_cast<int>(V0 % 2) | (static_cast<int>(V1 % 2) << 1)
                           | (static_cast<int>(V2 % 2) << 2) | (static_cast<int>(V3 % 2) << 3);
    // vblendpd control: bit i set means result element i comes from the upper input half.
    constexpr int blend_imm = static_cast<int>(V0 / 2) | (static_cast<int>(V1 / 2) << 1)
                            | (static_cast<int>(V2 / 2) << 2) | (static_cast<int>(V3 / 2) << 3);

    XSIMD_IF_CONSTEXPR (is_identity) {
        return self;
    } else XSIMD_IF_CONSTEXPR (is_in_lane) {
        // Exact in-lane patterns only; a parity test on the indices alone
        // would also match lane-crossing permutations such as <0,0,0,0>.
        return _mm256_permute_pd(self, elem_imm);
    } else XSIMD_IF_CONSTEXPR (is_single_src && is_lane_order) {
        // Pure 128-bit moves: duplicate the low half, duplicate the high half, or swap halves.
        return _mm256_permute2f128_pd(self, self, lane_sel);
    } else XSIMD_IF_CONSTEXPR (is_single_src) {
        // Route the required source half into each result half, then shuffle in-lane.
        const __m256d routed = _mm256_permute2f128_pd(self, self, lane_sel);
        return _mm256_permute_pd(routed, elem_imm);
    } else {
        // General case: shuffle a copy made of two low halves and a copy made
        // of two high halves, then pick per element.
        const __m256d lo_lo = _mm256_permute2f128_pd(self, self, 0x00);
        const __m256d hi_hi = _mm256_permute2f128_pd(self, self, 0x11);
        const __m256d from_lo = _mm256_permute_pd(lo_lo, elem_imm);
        const __m256d from_hi = _mm256_permute_pd(hi_hi, elem_imm);
        return _mm256_blend_pd(from_lo, from_hi, blend_imm);
    }
}

```


Provide feedback

Saved searches

Use saved searches to filter your results more quickly

AVX swizzle seems a bit slow #1138

Metadata

Assignees

Labels

Type

Fields

Projects

Milestone

Relationships

Development

AVX swizzle seems a bit slow #1138

Description

Metadata

Metadata

Assignees

Labels

Type

Fields

Projects

Milestone

Relationships

Development

Issue actions