From 0b2852fb59e6d3988b0d79230558baa62f9ba28d Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Thu, 13 Sep 2018 15:28:53 +0200 Subject: [PATCH 01/77] Move OOR_MARK definition to property.h --- src/fec_base.h | 2 -- src/property.h | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/fec_base.h b/src/fec_base.h index 13ac78d9..741d480f 100644 --- a/src/fec_base.h +++ b/src/fec_base.h @@ -74,8 +74,6 @@ static inline uint64_t hrtime_usec(timeval begin) return 1000000 * (tv.tv_sec - begin.tv_sec) + tv.tv_usec - begin.tv_usec; } -#define OOR_MARK 1 - enum class FecType { /** Systematic code * diff --git a/src/property.h b/src/property.h index 766acea7..2ecdc65e 100644 --- a/src/property.h +++ b/src/property.h @@ -40,6 +40,8 @@ namespace quadiron { +#define OOR_MARK 1 + /** Ancillary data attached to values. * * A property carries extra-information (whose interpretation is left to the From 897734065e649f309417398c64f88c0c3347ca92 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Mon, 15 Oct 2018 11:37:09 +0200 Subject: [PATCH 02/77] FFT2n: specialize butterfly operations --- src/fft_2n.h | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/src/fft_2n.h b/src/fft_2n.h index f3da7174..8a63bbb2 100644 --- a/src/fft_2n.h +++ b/src/fft_2n.h @@ -602,6 +602,65 @@ void Radix2::ifft(vec::Buffers& output, vec::Buffers& input) this->gf->mul_vec_to_vecp(*(this->vec_inv_n), output, output); } +#ifdef QUADIRON_USE_SIMD + +/* Operations are vectorized by SIMD */ +template <> +void Radix2::butterfly_ct_two_layers_step( + vec::Buffers& buf, + unsigned start, + unsigned m); +template <> +void Radix2::butterfly_ct_step( + vec::Buffers& buf, + uint16_t r, + unsigned start, + unsigned m, + unsigned step); +template <> +void Radix2::butterfly_gs_step( + vec::Buffers& buf, + uint16_t coef, + unsigned start, + unsigned m, + unsigned step); +template <> +void Radix2::butterfly_gs_step_simple( + vec::Buffers& buf, + uint16_t 
coef, + unsigned start, + unsigned m, + unsigned step); + +template <> +void Radix2::butterfly_ct_two_layers_step( + vec::Buffers& buf, + unsigned start, + unsigned m); +template <> +void Radix2::butterfly_ct_step( + vec::Buffers& buf, + uint32_t r, + unsigned start, + unsigned m, + unsigned step); +template <> +void Radix2::butterfly_gs_step( + vec::Buffers& buf, + uint32_t coef, + unsigned start, + unsigned m, + unsigned step); +template <> +void Radix2::butterfly_gs_step_simple( + vec::Buffers& buf, + uint32_t coef, + unsigned start, + unsigned m, + unsigned step); + +#endif // #ifdef QUADIRON_USE_SIMD + } // namespace fft } // namespace quadiron From d301e489e5dd5a108772d17004cda2f99c1f24f3 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Fri, 5 Oct 2018 13:45:34 +0200 Subject: [PATCH 03/77] CMakeLists: add fft_2n.cpp file --- src/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index aff9c341..3ff590af 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -31,6 +31,7 @@ include(GNUInstallDirs) set(LIB_SRC ${SOURCE_DIR}/core.cpp ${SOURCE_DIR}/fec_vectorisation.cpp + ${SOURCE_DIR}/fft_2n.cpp ${SOURCE_DIR}/misc.cpp ${SOURCE_DIR}/gf_nf4.cpp ${SOURCE_DIR}/gf_ring.cpp From c7c67434f3edb4554030a7e1c0b28e3f6356d3be Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Fri, 5 Oct 2018 13:41:40 +0200 Subject: [PATCH 04/77] FFT2n.cpp: implement specialized operations --- src/fec_vectorisation.cpp | 2 +- src/fft_2n.cpp | 272 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 273 insertions(+), 1 deletion(-) create mode 100644 src/fft_2n.cpp diff --git a/src/fec_vectorisation.cpp b/src/fec_vectorisation.cpp index 2564ed7d..8684e1ab 100644 --- a/src/fec_vectorisation.cpp +++ b/src/fec_vectorisation.cpp @@ -32,7 +32,7 @@ #include "fec_rs_fnt.h" /* - * The file includes vectorized operations used by FEC classes + * The file includes specialized operations used by FEC classes */ #ifdef QUADIRON_USE_SIMD diff --git 
a/src/fft_2n.cpp b/src/fft_2n.cpp new file mode 100644 index 00000000..f3d8847f --- /dev/null +++ b/src/fft_2n.cpp @@ -0,0 +1,272 @@ +/* -*- mode: c++ -*- */ +/* + * Copyright 2017-2018 Scality + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "fft_2n.h" + +/* + * The file includes vectorized operations used by Radix2 classes + */ + +#ifdef QUADIRON_USE_SIMD + +#include "simd.h" + +namespace quadiron { +namespace fft { + +template <> +void Radix2::butterfly_ct_two_layers_step( + vec::Buffers& buf, + unsigned start, + unsigned m) +{ + const unsigned ratio = simd::countof(); + const size_t len = this->pkt_size; + const size_t vec_len = len / ratio; + const size_t last_len = len - vec_len * ratio; + const unsigned coefIndex = start * this->n / m / 2; + const uint16_t r1 = vec_W[coefIndex]; + const uint16_t r2 = vec_W[coefIndex / 2]; + const uint16_t r3 = vec_W[coefIndex / 2 + this->n / 4]; + + // perform vector operations + simd::butterfly_ct_two_layers_step( + buf, r1, r2, r3, start, m, vec_len, card); + + // for last elements, perform as non-SIMD method + if (last_len > 0) { + const unsigned step = m << 2; + size_t offset = vec_len * ratio; + // --------- + // First layer + // --------- + const uint16_t r1 = W->get(start * this->n / m / 2); + // first pair + butterfly_ct_step_slow(buf, r1, start, m, step, offset); + // second pair + butterfly_ct_step_slow(buf, r1, start + 2 * m, m, step, offset); + // --------- + // Second layer + // --------- + // first pair + const uint16_t r2 = W->get(start * this->n / m / 4); + butterfly_ct_step_slow(buf, r2, start, 2 * m, step, offset); + // second pair + const uint16_t r3 = W->get((start + m) * this->n / m / 4); + butterfly_ct_step_slow(buf, r3, start + m, 2 * m, step, offset); + } +} + +template <> +void Radix2::butterfly_ct_step( + vec::Buffers& buf, + uint16_t r, + unsigned start, + unsigned m, + unsigned step) +{ + const unsigned ratio = simd::countof(); + const size_t len = this->pkt_size; + const size_t vec_len = len / ratio; + const size_t last_len = len - vec_len * ratio; + + // perform vector operations + simd::butterfly_ct_step(buf, r, start, m, step, vec_len, card); + + // for last elements, perform as non-SIMD method + if (last_len > 0) 
{ + size_t offset = vec_len * ratio; + butterfly_ct_step_slow(buf, r, start, m, step, offset); + } +} + +template <> +void Radix2::butterfly_gs_step( + vec::Buffers& buf, + uint16_t coef, + unsigned start, + unsigned m, + unsigned step) +{ + const unsigned ratio = simd::countof(); + const size_t len = this->pkt_size; + const size_t vec_len = len / ratio; + const size_t last_len = len - vec_len * ratio; + + // perform vector operations + simd::butterfly_gs_step(buf, coef, start, m, vec_len, card); + + // for last elements, perform as non-SIMD method + if (last_len > 0) { + size_t offset = vec_len * ratio; + butterfly_gs_step_slow(buf, coef, start, m, step, offset); + } +} + +template <> +void Radix2::butterfly_gs_step_simple( + vec::Buffers& buf, + uint16_t coef, + unsigned start, + unsigned m, + unsigned step) +{ + const unsigned ratio = simd::countof(); + const size_t len = this->pkt_size; + const size_t vec_len = len / ratio; + const size_t last_len = len - vec_len * ratio; + + // perform vector operations + simd::butterfly_gs_step_simple(buf, coef, start, m, vec_len, card); + + // for last elements, perform as non-SIMD method + if (last_len > 0) { + size_t offset = vec_len * ratio; + butterfly_gs_step_simple_slow(buf, coef, start, m, step, offset); + } +} + +template <> +void Radix2::butterfly_ct_two_layers_step( + vec::Buffers& buf, + unsigned start, + unsigned m) +{ + const unsigned ratio = simd::countof(); + const size_t len = this->pkt_size; + const size_t vec_len = len / ratio; + const size_t last_len = len - vec_len * ratio; + const unsigned coefIndex = start * this->n / m / 2; + const uint32_t r1 = vec_W[coefIndex]; + const uint32_t r2 = vec_W[coefIndex / 2]; + const uint32_t r3 = vec_W[coefIndex / 2 + this->n / 4]; + + // perform vector operations + simd::butterfly_ct_two_layers_step( + buf, r1, r2, r3, start, m, vec_len, card); + + // for last elements, perform as non-SIMD method + if (last_len > 0) { + const unsigned step = m << 2; + size_t offset = 
vec_len * ratio; + // --------- + // First layer + // --------- + const uint32_t r1 = W->get(start * this->n / m / 2); + // first pair + butterfly_ct_step_slow(buf, r1, start, m, step, offset); + // second pair + butterfly_ct_step_slow(buf, r1, start + 2 * m, m, step, offset); + // --------- + // Second layer + // --------- + // first pair + const uint32_t r2 = W->get(start * this->n / m / 4); + butterfly_ct_step_slow(buf, r2, start, 2 * m, step, offset); + // second pair + const uint32_t r3 = W->get((start + m) * this->n / m / 4); + butterfly_ct_step_slow(buf, r3, start + m, 2 * m, step, offset); + } +} + +template <> +void Radix2::butterfly_ct_step( + vec::Buffers& buf, + uint32_t r, + unsigned start, + unsigned m, + unsigned step) +{ + const unsigned ratio = simd::countof(); + const size_t len = this->pkt_size; + const size_t vec_len = len / ratio; + const size_t last_len = len - vec_len * ratio; + + // perform vector operations + simd::butterfly_ct_step(buf, r, start, m, step, vec_len, card); + + // for last elements, perform as non-SIMD method + if (last_len > 0) { + size_t offset = vec_len * ratio; + butterfly_ct_step_slow(buf, r, start, m, step, offset); + } +} + +template <> +void Radix2::butterfly_gs_step( + vec::Buffers& buf, + uint32_t coef, + unsigned start, + unsigned m, + unsigned step) +{ + const unsigned ratio = simd::countof(); + const size_t len = this->pkt_size; + const size_t vec_len = len / ratio; + const size_t last_len = len - vec_len * ratio; + + // perform vector operations + simd::butterfly_gs_step(buf, coef, start, m, vec_len, card); + + // for last elements, perform as non-SIMD method + if (last_len > 0) { + size_t offset = vec_len * ratio; + butterfly_gs_step_slow(buf, coef, start, m, step, offset); + } +} + +template <> +void Radix2::butterfly_gs_step_simple( + vec::Buffers& buf, + uint32_t coef, + unsigned start, + unsigned m, + unsigned step) +{ + const unsigned ratio = simd::countof(); + const size_t len = this->pkt_size; + const 
size_t vec_len = len / ratio; + const size_t last_len = len - vec_len * ratio; + + // perform vector operations + simd::butterfly_gs_step_simple(buf, coef, start, m, vec_len, card); + + // for last elements, perform as non-SIMD method + if (last_len > 0) { + size_t offset = vec_len * ratio; + butterfly_gs_step_simple_slow(buf, coef, start, m, step, offset); + } +} + +} // namespace fft +} // namespace quadiron + +#endif // #ifdef QUADIRON_USE_SIMD From ecde06e965b9240e9d238039fbb5219a89217a97 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Fri, 5 Oct 2018 13:43:47 +0200 Subject: [PATCH 05/77] SIMD 128 u16 & u32: update --- src/simd_128_u16.h | 46 -------- src/simd_128_u32.h | 259 +++++++++++++++++++++++++++++++++++++++------ 2 files changed, 228 insertions(+), 77 deletions(-) diff --git a/src/simd_128_u16.h b/src/simd_128_u16.h index a177cfb3..e13d8756 100644 --- a/src/simd_128_u16.h +++ b/src/simd_128_u16.h @@ -271,52 +271,6 @@ mul_two_bufs(aint16* src, aint16* dest, size_t len, aint16 card = F3) } } -/* - * buf1[i] = buf1[i] + coef * buf2[i] - * buf2[i] = buf1[i] - coef * buf2[i] - */ -inline void butterfly_ct( - uint16_t coef, - aint16* buf1, - aint16* buf2, - size_t len, - uint32_t card = F3) -{ - const m128i _coef = _mm_set1_epi16(coef); - m128i* _buf1 = reinterpret_cast(buf1); - m128i* _buf2 = reinterpret_cast(buf2); - - for (size_t i = 0; i < len; ++i) { - m128i a = mul(_coef, _buf2[i], card); - _buf2[i] = sub(_buf1[i], a, card); - _buf1[i] = add(_buf1[i], a, card); - } -} - -/* - * buf1[i] = buf1[i] + buf2[i] - * buf2[i] = coef * (buf1[i] - buf2[i]) - */ -inline void butterfly_gs( - uint16_t coef, - aint16* buf1, - aint16* buf2, - size_t len, - uint16_t card = F3) -{ - const m128i _coef = _mm_set1_epi16(coef); - m128i* _buf1 = reinterpret_cast(buf1); - m128i* _buf2 = reinterpret_cast(buf2); - - for (size_t i = 0; i < len; ++i) { - m128i a = _buf1[i]; - m128i b = _buf2[i]; - m128i c = sub(a, b, card); - _buf1[i] = add(a, b, card); - _buf2[i] = mul(_coef, c, 
card); - } -} - inline void encode_post_process( vec::Buffers& output, std::vector& props, diff --git a/src/simd_128_u32.h b/src/simd_128_u32.h index 5039f0f8..80936a6e 100644 --- a/src/simd_128_u32.h +++ b/src/simd_128_u32.h @@ -156,6 +156,17 @@ inline m128i mul_f4(m128i a, m128i b) return mod_after_multiply_f4(c); } +inline m128i mul_f4_simple(m128i a, m128i b) +{ + m128i _a = _mm_load_si128(&a); + m128i _b = _mm_load_si128(&b); + + m128i c = _mm_mullo_epi32(_a, _b); + + // Modulo + return mod_after_multiply_f4(c); +} + inline m128i mul_f3(m128i a, m128i b) { m128i _a = _mm_load_si128(&a); @@ -174,6 +185,17 @@ inline m128i mul_f3(m128i a, m128i b) return mod_after_multiply_f3(c); } +inline m128i mul_f3_simple(m128i a, m128i b) +{ + m128i _a = _mm_load_si128(&a); + m128i _b = _mm_load_si128(&b); + + m128i c = _mm_mullo_epi32(_a, _b); + + // Modulo + return mod_after_multiply_f3(c); +} + /** Perform multiplication of two numbers a, b whose elements are of GF(card) * where `card` is a prime Fermat number, i.e. card = Fx with x < 5 * Currently, it supports only for F3 and F4 @@ -186,6 +208,14 @@ inline m128i mul(m128i a, m128i b, aint32 card) return mul_f3(a, b); } +inline m128i mul_simple(m128i a, m128i b, aint32 card) +{ + assert(card == F4 || card == F3); + if (card == F4) + return mul_f4_simple(a, b); + return mul_f3_simple(a, b); +} + /** Apply an element-wise negation to a buffer */ inline void neg(size_t len, aint32* buf, aint32 card = F4) @@ -314,49 +344,216 @@ mul_two_bufs(aint32* src, aint32* dest, size_t len, aint32 card = F4) } } -/* - * buf1[i] = buf1[i] + coef * buf2[i] - * buf2[i] = buf1[i] - coef * buf2[i] - */ -inline void butterfly_ct( - uint32_t coef, - aint32* buf1, - aint32* buf2, +// outputA = inputA + inputB +// outputB = inputA - inputB +inline void butterfly_step( + m128i* inputA, + m128i* inputB, + m128i* outputA, + m128i* outputB, + uint32_t _card) +{ + const m128i card = (_card == F3) ? 
F3_m128i : F4_m128i; + const m128i card_1 = (_card == F3) ? F3minus1_m128i : F4minus1_m128i; + + // -------------------------------------- + // outputB = inputA - inputB + // -------------------------------------- + m128i a = _mm_load_si128(inputA); + m128i b = _mm_load_si128(inputB); + m128i cmp_1 = _mm_cmpgt_epi32(b, a); + m128i res_1 = _mm_add_epi32(a, _mm_and_si128(card, cmp_1)); + + _mm_store_si128(outputB, _mm_sub_epi32(res_1, b)); + + // -------------------------------------- + // outputA = symbA + symbB + // -------------------------------------- + m128i res_2 = _mm_add_epi32(a, b); + // modulo + m128i cmp_2 = _mm_cmpgt_epi32(res_2, card_1); + m128i c = _mm_sub_epi32(res_2, _mm_and_si128(card, cmp_2)); + + _mm_store_si128(outputA, c); +} + +// for each pair (P, Q) = (buf[i], buf[i + m]): +// P = P + Q +// Q = P - Q +inline void butterfly_ct_1( + vec::Buffers& buf, + unsigned start, + unsigned m, + unsigned step, size_t len, uint32_t card = F4) { - const m128i _coef = _mm_set1_epi32(coef); - m128i* _buf1 = reinterpret_cast(buf1); - m128i* _buf2 = reinterpret_cast(buf2); + for (int i = start; i < buf.get_n(); i += step) { + uint32_t* a = buf.get(i); + uint32_t* b = buf.get(i + m); + m128i* _a = reinterpret_cast(a); + m128i* _b = reinterpret_cast(b); + // perform butterfly operation for Cooley-Tukey FFT algorithm + for (size_t j = 0; j < len; ++j) { + butterfly_step(&(_a[j]), &(_b[j]), &(_a[j]), &(_b[j]), card); + } + } +} - for (size_t i = 0; i < len; ++i) { - m128i a = mul(_coef, _buf2[i], card); - _buf2[i] = sub(_buf1[i], a, card); - _buf1[i] = add(_buf1[i], a, card); +// for each pair (P, Q) = (buf[i], buf[i + m]): +// P = P - Q +// Q = P + Q +inline void butterfly_ct_2( + vec::Buffers& buf, + unsigned start, + unsigned m, + unsigned step, + size_t len, + uint32_t card = F4) +{ + for (int i = start; i < buf.get_n(); i += step) { + uint32_t* a = buf.get(i); + uint32_t* b = buf.get(i + m); + m128i* _a = reinterpret_cast(a); + m128i* _b = reinterpret_cast(b); 
+ // perform butterfly operation for Cooley-Tukey FFT algorithm + for (size_t j = 0; j < len; ++j) { + butterfly_step(&(_a[j]), &(_b[j]), &(_b[j]), &(_a[j]), card); + } } } -/* - * buf1[i] = buf1[i] + buf2[i] - * buf2[i] = coef * (buf1[i] - buf2[i]) - */ -inline void butterfly_gs( +// output = coef * input +inline void +butterfly_mul(m128i* coef, m128i* input, m128i* output, uint32_t _card) +{ + const m128i card = (_card == F3) ? F3_m128i : F4_m128i; + const m128i card_2 = (_card == F3) ? F3minus2_m128i : F4minus2_m128i; + + // -------------------------------------- + // compute coef * symbB + // -------------------------------------- + m128i _coef = _mm_load_si128(coef); + m128i b = _mm_load_si128(input); + m128i res = _mm_mullo_epi32(_coef, b); + // modulo + m128i lo = _mm_and_si128(res, card_2); + m128i res_shift = + (_card == F3) ? _mm_srli_si128(res, 1) : _mm_srli_si128(res, 2); + m128i hi = _mm_and_si128(res_shift, card_2); + + m128i cmp_1 = _mm_cmpgt_epi32(hi, lo); + m128i _lo = _mm_add_epi32(lo, _mm_and_si128(card, cmp_1)); + + m128i res_2 = _mm_sub_epi32(_lo, hi); + + _mm_store_si128(output, res_2); +} + +// symbA = symbA + coef * symbB +// symbB = symbA - coef * symbB +inline void +butterfly_ct_3_step(m128i* coef, m128i* symbA, m128i* symbB, uint32_t _card) +{ + // -------------------------------------- + // compute coef * symbB + // -------------------------------------- + m128i coef_x_symbB; + butterfly_mul(coef, symbB, &coef_x_symbB, _card); + // -------------------------------------- + // symbA = symbA + coef_x_symbB + // symbB = symbA - coef_x_symbB + // -------------------------------------- + butterfly_step(symbA, &coef_x_symbB, symbA, symbB, _card); +} + +// for each pair (P, Q) = (buf[i], buf[i + m]): +// P = P + c * Q +// Q = P - c * Q +inline void butterfly_ct_3( uint32_t coef, - aint32* buf1, - aint32* buf2, + vec::Buffers& buf, + unsigned start, + unsigned m, + unsigned step, size_t len, uint32_t card = F4) { - const m128i _coef = 
_mm_set1_epi32(coef); - m128i* _buf1 = reinterpret_cast(buf1); - m128i* _buf2 = reinterpret_cast(buf2); + m128i _coef = _mm_set1_epi32(coef); + for (int i = start; i < buf.get_n(); i += step) { + uint32_t* a = buf.get(i); + uint32_t* b = buf.get(i + m); + m128i* _a = reinterpret_cast(a); + m128i* _b = reinterpret_cast(b); + // perform butterfly operation for Cooley-Tukey FFT algorithm + for (size_t j = 0; j < len; ++j) { + butterfly_ct_3_step(&_coef, &(_a[j]), &(_b[j]), card); + } + } +} - for (size_t i = 0; i < len; ++i) { - m128i a = _buf1[i]; - m128i b = _buf2[i]; - m128i c = sub(a, b, card); - _buf1[i] = add(a, b, card); - _buf2[i] = mul(_coef, c, card); +// for each pair (P, Q) = (buf[i], buf[i + m]): +// P = Q + P +// Q = Q - P +inline void butterfly_gs_2( + vec::Buffers& buf, + unsigned start, + unsigned m, + unsigned step, + size_t len, + uint32_t card = F4) +{ + for (int i = start; i < buf.get_n(); i += step) { + uint32_t* a = buf.get(i); + uint32_t* b = buf.get(i + m); + m128i* _a = reinterpret_cast(a); + m128i* _b = reinterpret_cast(b); + // perform butterfly operation for Cooley-Tukey FFT algorithm + for (size_t j = 0; j < len; ++j) { + butterfly_step(&(_b[j]), &(_a[j]), &(_a[j]), &(_b[j]), card); + } + } +} + +// symbA = symbA + symbB +// symbB = coef * (symbA - symbB) +inline void +butterfly_gs_3_step(m128i* coef, m128i* symbA, m128i* symbB, uint32_t _card) +{ + // -------------------------------------- + // symbA = symbA + symbB + // symbB = symbA - symbB + // -------------------------------------- + butterfly_step(symbA, symbB, symbA, symbB, _card); + + // -------------------------------------- + // symbB = coef * symbB + // -------------------------------------- + butterfly_mul(coef, symbB, symbB, _card); +} + +// for each pair (P, Q) = (buf[i], buf[i + m]): +// P = P + Q +// Q = c * (P - Q) +inline void butterfly_gs_3( + uint32_t coef, + vec::Buffers& buf, + unsigned start, + unsigned m, + unsigned step, + size_t len, + uint32_t card = F4) +{ + 
m128i _coef = _mm_set1_epi32(coef); + for (int i = start; i < buf.get_n(); i += step) { + uint32_t* a = buf.get(i); + uint32_t* b = buf.get(i + m); + m128i* _a = reinterpret_cast(a); + m128i* _b = reinterpret_cast(b); + // perform butterfly operation for Cooley-Tukey FFT algorithm + for (size_t j = 0; j < len; ++j) { + butterfly_gs_3_step(&_coef, &(_a[j]), &(_b[j]), card); + } } } From 23f7ec61e62d4b00ffd8693806eb6d8916411fb6 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Fri, 5 Oct 2018 13:43:57 +0200 Subject: [PATCH 06/77] SIMD 256 u16 & u32: update --- src/simd_256_u16.h | 753 ++++++++++++++++++++++++++++++++---------- src/simd_256_u32.h | 805 +++++++++++++++++++++++++++++++++------------ 2 files changed, 1161 insertions(+), 397 deletions(-) diff --git a/src/simd_256_u16.h b/src/simd_256_u16.h index 974e136e..74d35d24 100644 --- a/src/simd_256_u16.h +++ b/src/simd_256_u16.h @@ -38,142 +38,599 @@ namespace quadiron { namespace simd { -/** Perform a%card where a is a addition of two numbers whose elements are - * symbols of GF(card) */ -inline m256i mod_after_add(m256i a, aint16 card) -{ - const m256i _card = _mm256_set1_epi16(card); - const m256i _card_minus_1 = _mm256_set1_epi16(card - 1); +#define F3_u16 _mm256_set1_epi16(257) +#define F3m1_u16 _mm256_set1_epi16(256) - m256i cmp = _mm256_cmpgt_epi16(a, _card_minus_1); - m256i b = _mm256_sub_epi16(a, _mm256_and_si256(_card, cmp)); +/* ==================== Essential Operations =================== */ +// Following functions are used for AVX2 w/ u16 only - return b; +inline m256i SET1(uint16_t val) +{ + return _mm256_set1_epi16(val); } - -/** Perform addition of two numbers a, b whose elements are of GF(card) */ -inline m256i add(m256i a, m256i b, aint16 card = F3) +inline m256i ADD16(m256i x, m256i y) +{ + return _mm256_add_epi16(x, y); +} +inline m256i SUB16(m256i x, m256i y) { - m256i _a = _mm256_load_si256(&a); - m256i _b = _mm256_load_si256(&b); - m256i c = _mm256_add_epi16(_a, _b); + return 
_mm256_sub_epi16(x, y); +} +inline m256i MUL16(m256i x, m256i y) +{ + return _mm256_mullo_epi16(x, y); +} - // Modulo - return mod_after_add(c, card); +inline m256i CMPEQ16(m256i x, m256i y) +{ + return _mm256_cmpeq_epi16(x, y); +} +inline m256i CMPGT16(m256i x, m256i y) +{ + return _mm256_cmpgt_epi16(x, y); +} +inline m256i MINU16(m256i x, m256i y) +{ + return _mm256_min_epu16(x, y); } -/** Perform subtraction of a by b where a, b whose elements are symbols of - * GF(card) - * sub(a, b) = a - b if a >= b, or - * card + a - b, otherwise - */ -inline m256i sub(m256i a, m256i b, aint16 card) +#define MASK8_LO (_mm256_set1_epi16(0x80)) +#define BLEND8(x, y, mask) (_mm256_blendv_epi8(x, y, mask)) + +// z = x + y mod q +// Input are loaded to registers +// Output is register +inline m256i ADD_MOD(m256i x, m256i y, uint16_t q) { - const m256i _card = _mm256_set1_epi16(card); + m256i res = ADD16(x, y); + return MINU16(res, SUB16(res, F3_u16)); +} - m256i _a = _mm256_load_si256(&a); - m256i _b = _mm256_load_si256(&b); +// z = x - y mod q => z = q + x - y mod q +// Input are loaded to registers +// Output is register +inline m256i SUB_MOD(m256i x, m256i y, uint16_t q) +{ + m256i res = SUB16(x, y); + return MINU16(res, SUB16(ADD16(x, F3_u16), y)); +} - m256i cmp = _mm256_cmpgt_epi16(_b, _a); - m256i _a1 = _mm256_add_epi16(_a, _mm256_and_si256(_card, cmp)); +// y = 0 - x mod q => y = q - x mod q +// Input are loaded to registers +// Output is register +inline m256i NEG_MOD(m256i x, uint16_t q) +{ + m256i res = SUB16(F3_u16, x); + return MINU16(res, SUB16(res, F3_u16)); +} - return _mm256_sub_epi16(_a1, _b); +// z = x * y mod q +// Input are loaded to registers +// Output is register +// Note: we assume that at least `x` or `y` is less than `q-1` so it's +// not necessary to verify overflow on multiplying elements +inline m256i MUL_MOD(m256i x, m256i y, uint16_t q) +{ + m256i res = MUL16(x, y); + m256i lo = BLEND8(ZERO, res, MASK8_LO); + m256i hi = BLEND8(ZERO, SHIFTR_1(res), 
MASK8_LO); + return SUB_MOD(lo, hi, q); } -/** Negate `a` - * @return 0 if (a == 0), else card - a - */ -inline m256i neg(m256i a, aint16 card = F3) +// z = x * y mod q +// Input are loaded to registers +// Output is register +inline m256i MULFULL_MOD(m256i x, m256i y, uint16_t q) { - const m256i _card = _mm256_set1_epi16(card); - m256i _a = _mm256_load_si256(&a); - m256i _b = _mm256_setzero_si256(); + m256i res = MUL16(x, y); - m256i cmp = _mm256_cmpgt_epi16(_a, _b); + // filter elements of both of a & b = card-1 + m256i cmp = AND(CMPEQ16(x, F3m1_u16), CMPEQ16(y, F3m1_u16)); + res = ADD16(res, AND(ONE, cmp)); - return _mm256_sub_epi16(_mm256_and_si256(cmp, _card), _a); + m256i lo = BLEND8(ZERO, res, MASK8_LO); + m256i hi = BLEND8(ZERO, SHIFTR_1(res), MASK8_LO); + return SUB_MOD(lo, hi, q); } -inline m256i mod_after_multiply(m256i a) +// butterfly CT with r == 1 +inline void BUTTERFLY_1(m256i* x, m256i* y, uint16_t q) { - const m256i mask = _mm256_set1_epi16(F3 - 2); - - m256i lo = _mm256_and_si256(a, mask); + m256i add = ADD_MOD(*x, *y, q); + *y = SUB_MOD(*x, *y, q); + *x = add; +} - m256i a_shift = _mm256_srli_si256(a, 1); - m256i hi = _mm256_and_si256(a_shift, mask); +// butterfly CT with r == q - 1 +inline void BUTTERFLY_2(m256i* x, m256i* y, uint16_t q) +{ + m256i add = ADD_MOD(*x, *y, q); + *x = SUB_MOD(*x, *y, q); + *y = add; +} - m256i cmp = _mm256_cmpgt_epi16(hi, lo); - m256i _lo = _mm256_add_epi16(lo, _mm256_and_si256(F3_m256i_u16, cmp)); +// butterfly CT with 1 < r < q - 1 +inline void BUTTERFLY_3(m256i c, m256i* x, m256i* y, uint16_t q) +{ + m256i z = MUL_MOD(c, *y, q); + *y = SUB_MOD(*x, z, q); + *x = ADD_MOD(*x, z, q); +} - return _mm256_sub_epi16(_lo, hi); +// butterfly GS w/ r = q - 1 +inline void BUTTERFLY_4(m256i* x, m256i* y, uint16_t q) +{ + m256i add = ADD_MOD(*x, *y, q); + *y = SUB_MOD(*y, *x, q); + *x = add; } -inline m256i mul(m256i a, m256i b) +// butterfly GS w/ 1 < r < q - 1 +// x = x + y mod q +// y = z * (x - y) mod q +inline void 
BUTTERFLY_5(m256i c, m256i* x, m256i* y, uint16_t q) { - m256i _a = _mm256_load_si256(&a); - m256i _b = _mm256_load_si256(&b); + m256i sub = SUB_MOD(*x, *y, q); + *x = ADD_MOD(*x, *y, q); + *y = MUL_MOD(c, sub, q); +} - m256i c = _mm256_mullo_epi16(_a, _b); +/** + * Vectorized butterly CT step + * + * For each pair (P, Q) = (buf[i], buf[i + m]) for step = 2 * m and coef `r` + * P = P + r * Q + * Q = P - r * Q + * + * @param buf - working buffers + * @param r - coefficient + * @param start - index of buffer among `m` ones + * @param m - current group size + * @param len - number of vectors per buffer + * @param card - modulo cardinal + */ +inline void butterfly_ct_step( + vec::Buffers& buf, + uint16_t r, + unsigned start, + unsigned m, + size_t len, + uint16_t card) +{ + const unsigned step = m << 1; + m256i c = SET1(r); + +#define BUTTERFLY_CT(x, y) \ + (EITHER( \ + r == 1, \ + BUTTERFLY_1(x, y, card), \ + EITHER( \ + r < card - 1, \ + BUTTERFLY_3(c, x, y, card), \ + BUTTERFLY_2(x, y, card)))); + + const size_t end = len - 1; + const unsigned bufs_nb = buf.get_n(); + // #pragma omp parallel for + // #pragma unroll + const std::vector& mem = buf.get_mem(); + for (unsigned i = start; i < bufs_nb; i += step) { + m256i x1, y1; + m256i x2, y2; + m256i* __restrict p = reinterpret_cast(mem[i]); + m256i* __restrict q = reinterpret_cast(mem[i + m]); + + // #pragma omp parallel for + size_t j = 0; + // #pragma unroll + for (; j < end; j += 2) { + x1 = LOAD(p + j); + y1 = LOAD(q + j); + x2 = LOAD(p + j + 1); + y2 = LOAD(q + j + 1); + + BUTTERFLY_CT(&x1, &y1); + BUTTERFLY_CT(&x2, &y2); + + // Store back to memory + STORE(p + j, x1); + STORE(p + j + 1, x2); + STORE(q + j, y1); + STORE(q + j + 1, y2); + } + for (; j < len; ++j) { + x1 = LOAD(p + j); + y1 = LOAD(q + j); - // filter elements of both of a & b = card-1 - m256i cmp = _mm256_and_si256( - _mm256_cmpeq_epi16(_a, F3minus1_m256i_u16), - _mm256_cmpeq_epi16(_b, F3minus1_m256i_u16)); + BUTTERFLY_CT(&x1, &y1); - const m256i 
one = _mm256_set1_epi16(1); - c = _mm256_add_epi16(c, _mm256_and_si256(one, cmp)); + // Store back to memory + STORE(p + j, x1); + STORE(q + j, y1); + } + } +} - // Modulo - return mod_after_multiply(c); +/** + * Vectorized butterly CT on two-layers at a time + * + * For each quadruple + * (P, Q, R, S) = (buf[i], buf[i + m], buf[i + 2 * m], buf[i + 3 * m]) + * First layer: butterfly on (P, Q) and (R, S) for step = 2 * m + * coef r1 = W[start * n / (2 * m)] + * P = P + r1 * Q + * Q = P - r1 * Q + * R = R + r1 * S + * S = R - r1 * S + * Second layer: butterfly on (P, R) and (Q, S) for step = 4 * m + * coef r2 = W[start * n / (4 * m)] + * coef r3 = W[(start + m) * n / (4 * m)] + * P = P + r2 * R + * R = P - r2 * R + * Q = Q + r3 * S + * S = Q - r3 * S + * + * @param buf - working buffers + * @param r1 - coefficient for the 1st layer + * @param r2 - 1st coefficient for the 2nd layer + * @param r3 - 2nd coefficient for the 2nd layer + * @param start - index of buffer among `m` ones + * @param m - current group size + * @param len - number of vectors per buffer + * @param card - modulo cardinal + */ +inline void butterfly_ct_two_layers_step( + vec::Buffers& buf, + uint16_t r1, + uint16_t r2, + uint16_t r3, + unsigned start, + unsigned m, + size_t len, + uint16_t card) +{ + const unsigned step = m << 2; + m256i c1 = SET1(r1); + m256i c2 = SET1(r2); + m256i c3 = SET1(r3); + +#define BUTTERFLY_R1(c, x, y) \ + (EITHER( \ + r1 == 1, \ + BUTTERFLY_1(x, y, card), \ + EITHER( \ + r1 < card - 1, \ + BUTTERFLY_3(c, x, y, card), \ + BUTTERFLY_2(x, y, card)))); +#define BUTTERFLY_R2(c, x, y) \ + (EITHER( \ + r2 == 1, \ + BUTTERFLY_1(x, y, card), \ + EITHER( \ + r2 < card - 1, \ + BUTTERFLY_3(c, x, y, card), \ + BUTTERFLY_2(x, y, card)))); +#define BUTTERFLY_R3(c, x, y) \ + (EITHER( \ + r3 == 1, \ + BUTTERFLY_1(x, y, card), \ + EITHER( \ + r3 < card - 1, \ + BUTTERFLY_3(c, x, y, card), \ + BUTTERFLY_2(x, y, card)))); + + const size_t end = len - 1; + const unsigned bufs_nb = 
buf.get_n(); + // #pragma omp parallel for + // #pragma unroll + const std::vector& mem = buf.get_mem(); + for (unsigned i = start; i < bufs_nb; i += step) { + m256i x1, y1, u1, v1; + m256i x2, y2, u2, v2; + m256i* __restrict p = reinterpret_cast(mem[i]); + m256i* __restrict q = reinterpret_cast(mem[i + m]); + m256i* __restrict r = reinterpret_cast(mem[i + 2 * m]); + m256i* __restrict s = reinterpret_cast(mem[i + 3 * m]); + + // #pragma omp parallel for + size_t j = 0; + // #pragma unroll + for (; j < end; j += 2) { + // First layer (c1, x, y) & (c1, u, v) + x1 = LOAD(p + j); + y1 = LOAD(q + j); + x2 = LOAD(p + j + 1); + y2 = LOAD(q + j + 1); + + u1 = LOAD(r + j); + v1 = LOAD(s + j); + u2 = LOAD(r + j + 1); + v2 = LOAD(s + j + 1); + + BUTTERFLY_R1(c1, &x1, &y1); + BUTTERFLY_R1(c1, &x2, &y2); + + BUTTERFLY_R1(c1, &u1, &v1); + BUTTERFLY_R1(c1, &u2, &v2); + + // Second layer (c2, x, u) & (c3, y, v) + BUTTERFLY_R2(c2, &x1, &u1); + BUTTERFLY_R2(c2, &x2, &u2); + + BUTTERFLY_R3(c3, &y1, &v1); + BUTTERFLY_R3(c3, &y2, &v2); + + // Store back to memory + STORE(p + j, x1); + STORE(p + j + 1, x2); + STORE(q + j, y1); + STORE(q + j + 1, y2); + + STORE(r + j, u1); + STORE(r + j + 1, u2); + STORE(s + j, v1); + STORE(s + j + 1, v2); + } + for (; j < len; ++j) { + // First layer (c1, x, y) & (c1, u, v) + x1 = LOAD(p + j); + y1 = LOAD(q + j); + u1 = LOAD(r + j); + v1 = LOAD(s + j); + + BUTTERFLY_R1(c1, &x1, &y1); + BUTTERFLY_R1(c1, &u1, &v1); + // Second layer (c2, x, u) & (c3, y, v) + BUTTERFLY_R2(c2, &x1, &u1); + BUTTERFLY_R3(c3, &y1, &v1); + // Store back to memory + STORE(p + j, x1); + STORE(q + j, y1); + STORE(r + j, u1); + STORE(s + j, v1); + } + } } -/** Perform multiplication of two numbers a, b whose elements are of GF(card) - * where `card` is a prime Fermat number, i.e. 
card = Fx with x < 5 - * Currently, it supports only for F3 +/** + * Vectorized butterly GS step + * + * For each pair (P, Q) = (buf[i], buf[i + m]) for step = 2 * m and coef `r` + * P = P + Q + * Q = r * (P - Q) + * + * @param buf - working buffers + * @param r - coefficient + * @param start - index of buffer among `m` ones + * @param m - current group size + * @param len - number of vectors per buffer + * @param card - modulo cardinal */ -inline m256i mul(m256i a, m256i b, aint16 card) +inline void butterfly_gs_step( + vec::Buffers& buf, + uint16_t r, + unsigned start, + unsigned m, + size_t len, + uint16_t card) { - // FIXME: generalize card - assert(card == F3); - return mul(a, b); + const unsigned step = m << 1; + m256i c = SET1(r); + +#define BUTTERFLY_GS(x, y) \ + (EITHER( \ + r == 1, \ + BUTTERFLY_1(x, y, card), \ + EITHER( \ + r < card - 1, \ + BUTTERFLY_5(c, x, y, card), \ + BUTTERFLY_4(x, y, card)))); + + const size_t end = len - 1; + const unsigned bufs_nb = buf.get_n(); + // #pragma omp parallel for + // #pragma unroll + const std::vector& mem = buf.get_mem(); + for (unsigned i = start; i < bufs_nb; i += step) { + m256i x1, y1; + m256i x2, y2; + m256i* __restrict p = reinterpret_cast(mem[i]); + m256i* __restrict q = reinterpret_cast(mem[i + m]); + + // #pragma omp parallel for + size_t j = 0; + // #pragma unroll + for (; j < end; j += 2) { + x1 = LOAD(p + j); + y1 = LOAD(q + j); + x2 = LOAD(p + j + 1); + y2 = LOAD(q + j + 1); + + BUTTERFLY_GS(&x1, &y1); + BUTTERFLY_GS(&x2, &y2); + + // Store back to memory + STORE(p + j, x1); + STORE(p + j + 1, x2); + STORE(q + j, y1); + STORE(q + j + 1, y2); + } + for (; j < len; ++j) { + x1 = LOAD(p + j); + y1 = LOAD(q + j); + + BUTTERFLY_GS(&x1, &y1); + + // Store back to memory + STORE(p + j, x1); + STORE(q + j, y1); + } + } } -/** Apply an element-wise negation to a buffer +/** + * Vectorized butterly GS step + * + * For each pair (P, Q) = (buf[i], buf[i + m]) for step = 2 * m and coef `r` + * Q = r * Q + * + * 
@param buf - working buffers + * @param r - coefficient + * @param start - index of buffer among `m` ones + * @param m - current group size + * @param len - number of vectors per buffer + * @param card - modulo cardinal */ -inline void neg(size_t len, aint16* buf, aint16 card = F3) +inline void butterfly_gs_step_simple( + vec::Buffers& buf, + uint16_t r, + unsigned start, + unsigned m, + size_t len, + uint16_t card) { - m256i* _buf = reinterpret_cast(buf); - unsigned ratio = sizeof(*_buf) / sizeof(*buf); - size_t _len = len / ratio; - size_t _last_len = len - _len * ratio; + const unsigned step = m << 1; + m256i c = SET1(r); + +#define BUTTERFLY_GS_S(x) \ + (EITHER( \ + r == 1, \ + (x), \ + EITHER(r < card - 1, MUL_MOD(c, x, card), NEG_MOD(x, card)))); + + const size_t end = len - 1; + const unsigned bufs_nb = buf.get_n(); + // #pragma omp parallel for + // #pragma unroll + const std::vector& mem = buf.get_mem(); + for (unsigned i = start; i < bufs_nb; i += step) { + m256i x1, y1; + m256i x2, y2; + m256i* __restrict p = reinterpret_cast(mem[i]); + m256i* __restrict q = reinterpret_cast(mem[i + m]); + + // #pragma omp parallel for + size_t j = 0; + // #pragma unroll + for (; j < end; j += 2) { + x1 = LOAD(p + j); + x2 = LOAD(p + j + 1); + + y1 = BUTTERFLY_GS_S(x1); + y2 = BUTTERFLY_GS_S(x2); + + // Store back to memory + STORE(q + j, y1); + STORE(q + j + 1, y2); + } + for (; j < len; ++j) { + x1 = LOAD(p + j); - size_t i; - for (i = 0; i < _len; i++) { - _buf[i] = neg(_buf[i], card); + y1 = BUTTERFLY_GS_S(x1); + + // Store back to memory + STORE(q + j, y1); + } } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - if (buf[i]) - buf[i] = card - buf[i]; +} + +inline void add_props_16( + Properties& props, + m256i threshold, + m256i mask, + m256i symb, + off_t offset) +{ + const m256i b = CMPEQ16(threshold, symb); + const m256i c = AND(mask, b); + uint32_t d = MVMSK8(c); + const unsigned element_size = sizeof(uint16_t); + while (d > 0) { + unsigned 
byte_idx = __builtin_ctz(d); + off_t _offset = offset + byte_idx / element_size; + props.add(_offset, OOR_MARK); + d ^= 1 << byte_idx; + } +} + +inline void encode_post_process( + vec::Buffers& output, + std::vector& props, + off_t offset, + unsigned code_len, + uint16_t threshold, + size_t vecs_nb) +{ + const unsigned element_size = sizeof(uint16_t); + const unsigned vec_size = ALIGN_SIZE / element_size; + const uint16_t max = 1 << (element_size * 8 - 1); + const m256i _threshold = SET1(threshold); + const m256i mask_hi = SET1(max); + + // #pragma unroll + const std::vector& mem = output.get_mem(); + for (unsigned frag_id = 0; frag_id < code_len; ++frag_id) { + m256i* __restrict buf = reinterpret_cast(mem[frag_id]); + + size_t vec_id = 0; + size_t end = vecs_nb - 3; + // #pragma unroll + for (; vec_id < end; vec_id += 4) { + m256i a1 = LOAD(buf + vec_id); + m256i a2 = LOAD(buf + vec_id + 1); + m256i a3 = LOAD(buf + vec_id + 2); + m256i a4 = LOAD(buf + vec_id + 3); + + if (TESTZ(a1, _threshold) == 0) { + const off_t curr_offset = offset + vec_id * vec_size; + add_props_16( + props[frag_id], _threshold, mask_hi, a1, curr_offset); + } + if (TESTZ(a2, _threshold) == 0) { + const off_t curr_offset = offset + (vec_id + 1) * vec_size; + add_props_16( + props[frag_id], _threshold, mask_hi, a2, curr_offset); + } + if (TESTZ(a3, _threshold) == 0) { + const off_t curr_offset = offset + (vec_id + 2) * vec_size; + add_props_16( + props[frag_id], _threshold, mask_hi, a3, curr_offset); + } + if (TESTZ(a4, _threshold) == 0) { + const off_t curr_offset = offset + (vec_id + 3) * vec_size; + add_props_16( + props[frag_id], _threshold, mask_hi, a4, curr_offset); + } + } + for (; vec_id < vecs_nb; ++vec_id) { + m256i a = LOAD(buf + vec_id); + uint16_t c = TESTZ(a, _threshold); + if (c == 0) { + const off_t curr_offset = offset + vec_id * vec_size; + add_props_16( + props[frag_id], _threshold, mask_hi, a, curr_offset); + } } } } +/* ==================== Operations =================== 
*/ /** Perform a multiplication of a coefficient `a` to each element of `src` and * add result to correspondent element of `dest` + * + * @note: 1 < `a` < card - 1 */ inline void mul_coef_to_buf( - const aint16 a, + const uint16_t a, aint16* src, aint16* dest, size_t len, - aint16 card = F3) + uint16_t card) { - const m256i coef = _mm256_set1_epi16(a); + const m256i coef = SET1(a); - m256i* _src = reinterpret_cast(src); - m256i* _dest = reinterpret_cast(dest); + m256i* __restrict _src = reinterpret_cast(src); + m256i* __restrict _dest = reinterpret_cast(dest); const unsigned ratio = sizeof(*_src) / sizeof(*src); const size_t _len = len / ratio; const size_t _last_len = len - _len * ratio; @@ -181,22 +638,21 @@ inline void mul_coef_to_buf( size_t i; for (i = 0; i < _len; i++) { // perform multiplication - _dest[i] = mul(coef, _src[i], card); + _dest[i] = MUL_MOD(coef, _src[i], card); } if (_last_len > 0) { - uint32_t coef_doubled = (uint32_t)a; + uint32_t coef_32 = (uint32_t)a; for (i = _len * ratio; i < len; i++) { // perform multiplication - dest[i] = (aint16)((coef_doubled * src[i]) % card); + dest[i] = (aint16)((coef_32 * src[i]) % card); } } } -inline void -add_two_bufs(aint16* src, aint16* dest, size_t len, aint16 card = F3) +inline void add_two_bufs(aint16* src, aint16* dest, size_t len, aint16 card) { - m256i* _src = reinterpret_cast(src); - m256i* _dest = reinterpret_cast(dest); + m256i* __restrict _src = reinterpret_cast(src); + m256i* __restrict _dest = reinterpret_cast(dest); const unsigned ratio = sizeof(*_src) / sizeof(*src); const size_t _len = len / ratio; const size_t _last_len = len - _len * ratio; @@ -204,7 +660,7 @@ add_two_bufs(aint16* src, aint16* dest, size_t len, aint16 card = F3) size_t i; for (i = 0; i < _len; i++) { // perform addition - _dest[i] = add(_src[i], _dest[i], card); + _dest[i] = ADD_MOD(_src[i], _dest[i], card); } if (_last_len > 0) { for (i = _len * ratio; i < len; i++) { @@ -215,16 +671,12 @@ add_two_bufs(aint16* src, aint16* 
dest, size_t len, aint16 card = F3) } } -inline void sub_two_bufs( - aint16* bufa, - aint16* bufb, - aint16* res, - size_t len, - aint16 card = F3) +inline void +sub_two_bufs(aint16* bufa, aint16* bufb, aint16* res, size_t len, aint16 card) { - m256i* _bufa = reinterpret_cast(bufa); - m256i* _bufb = reinterpret_cast(bufb); - m256i* _res = reinterpret_cast(res); + m256i* __restrict _bufa = reinterpret_cast(bufa); + m256i* __restrict _bufb = reinterpret_cast(bufb); + m256i* __restrict _res = reinterpret_cast(res); const unsigned ratio = sizeof(*_bufa) / sizeof(*bufa); const size_t _len = len / ratio; const size_t _last_len = len - _len * ratio; @@ -232,7 +684,7 @@ inline void sub_two_bufs( size_t i; for (i = 0; i < _len; i++) { // perform subtraction - _res[i] = sub(_bufa[i], _bufb[i], card); + _res[i] = SUB_MOD(_bufa[i], _bufb[i], card); } if (_last_len > 0) { for (i = _len * ratio; i < len; i++) { @@ -245,11 +697,10 @@ inline void sub_two_bufs( } } -inline void -mul_two_bufs(aint16* src, aint16* dest, size_t len, aint16 card = F3) +inline void mul_two_bufs(aint16* src, aint16* dest, size_t len, aint16 card) { - m256i* _src = reinterpret_cast(src); - m256i* _dest = reinterpret_cast(dest); + m256i* __restrict _src = reinterpret_cast(src); + m256i* __restrict _dest = reinterpret_cast(dest); const unsigned ratio = sizeof(*_src) / sizeof(*src); const size_t _len = len / ratio; const size_t _last_len = len - _len * ratio; @@ -257,93 +708,33 @@ mul_two_bufs(aint16* src, aint16* dest, size_t len, aint16 card = F3) size_t i; for (i = 0; i < _len; i++) { // perform multiplicaton - _dest[i] = mul(_src[i], _dest[i], card); + _dest[i] = MULFULL_MOD(_src[i], _dest[i], card); } if (_last_len > 0) { for (i = _len * ratio; i < len; i++) { // perform multiplicaton - dest[i] = (uint32_t(src[i]) * dest[i]) % card; + dest[i] = uint16_t((uint64_t(src[i]) * dest[i]) % card); } } } -/* - * buf1[i] = buf1[i] + coef * buf2[i] - * buf2[i] = buf1[i] - coef * buf2[i] +/** Apply an element-wise 
negation to a buffer */ -inline void butterfly_ct( - uint16_t coef, - aint16* buf1, - aint16* buf2, - size_t len, - uint16_t card = F3) +inline void neg(size_t len, aint16* buf, aint16 card) { - const m256i _coef = _mm256_set1_epi16(coef); - m256i* _buf1 = reinterpret_cast(buf1); - m256i* _buf2 = reinterpret_cast(buf2); - - for (size_t i = 0; i < len; ++i) { - m256i a = mul(_coef, _buf2[i], card); - _buf2[i] = sub(_buf1[i], a, card); - _buf1[i] = add(_buf1[i], a, card); - } -} + m256i* _buf = reinterpret_cast(buf); + unsigned ratio = sizeof(*_buf) / sizeof(*buf); + size_t _len = len / ratio; + size_t _last_len = len - _len * ratio; -/* - * buf1[i] = buf1[i] + buf2[i] - * buf2[i] = coef * (buf1[i] - buf2[i]) - */ -inline void butterfly_gs( - uint16_t coef, - aint16* buf1, - aint16* buf2, - size_t len, - uint16_t card = F3) -{ - const m256i _coef = _mm256_set1_epi16(coef); - m256i* _buf1 = reinterpret_cast(buf1); - m256i* _buf2 = reinterpret_cast(buf2); - - for (size_t i = 0; i < len; ++i) { - m256i a = _buf1[i]; - m256i b = _buf2[i]; - m256i c = sub(a, b, card); - _buf1[i] = add(a, b, card); - _buf2[i] = mul(_coef, c, card); + size_t i; + for (i = 0; i < _len; i++) { + _buf[i] = NEG_MOD(_buf[i], card); } -} - -inline void encode_post_process( - vec::Buffers& output, - std::vector& props, - off_t offset, - unsigned code_len, - uint16_t threshold, - size_t vecs_nb) -{ - const unsigned vec_size = simd::countof(); - - const m256i _threshold = _mm256_set1_epi16(threshold); - uint16_t max = 1 << (sizeof(uint16_t) * 8 - 1); - const m256i mask_hi = _mm256_set1_epi16(max); - const unsigned element_size = sizeof(uint16_t); - - for (unsigned frag_id = 0; frag_id < code_len; ++frag_id) { - uint16_t* chunk = output.get(frag_id); - m256i* buf = reinterpret_cast(chunk); - for (unsigned vec_id = 0; vec_id < vecs_nb; ++vec_id) { - const m256i a = _mm256_load_si256(&(buf[vec_id])); - const m256i b = _mm256_cmpeq_epi16(_threshold, a); - const m256i c = _mm256_and_si256(mask_hi, b); - 
uint32_t d = _mm256_movemask_epi8(c); - - while (d > 0) { - unsigned byte_idx = __builtin_ctz(d); - unsigned element_idx = byte_idx / element_size; - off_t _offset = offset + vec_id * vec_size + element_idx; - props[frag_id].add(_offset, 1); - d ^= 1 << byte_idx; - } + if (_last_len > 0) { + for (i = _len * ratio; i < len; i++) { + if (buf[i]) + buf[i] = card - buf[i]; } } } diff --git a/src/simd_256_u32.h b/src/simd_256_u32.h index 9c76c89b..5302a472 100644 --- a/src/simd_256_u32.h +++ b/src/simd_256_u32.h @@ -38,212 +38,644 @@ namespace quadiron { namespace simd { -/* ==================== Essential Operations =================== */ +#define F4_u32 _mm256_set1_epi32(65537) +#define F4m1_u32 _mm256_set1_epi32(65536) +#define F3_u32 _mm256_set1_epi32(257) +#define F3m1_u32 _mm256_set1_epi32(256) -/** Perform a%card where a is a addition of two numbers whose elements are - * symbols of GF(card) */ -inline m256i mod_after_add(m256i a, aint32 card) -{ - const m256i _card = _mm256_set1_epi32(card); - const m256i _card_minus_1 = _mm256_set1_epi32(card - 1); +#define CARD(q) (EITHER(q == F3, F3_u32, F4_u32)) +#define CARD_M_1(q) (EITHER(q == F3, F3m1_u32, F4m1_u32)) - m256i cmp = _mm256_cmpgt_epi32(a, _card_minus_1); - m256i b = _mm256_sub_epi32(a, _mm256_and_si256(_card, cmp)); +/* ==================== Essential Operations =================== */ +// Following functions are used for AVX2 w/ u32 only - return b; +inline m256i SET1(uint32_t val) +{ + return _mm256_set1_epi32(val); } - -/** Perform addition of two numbers a, b whose elements are of GF(card) */ -inline m256i add(m256i a, m256i b, aint32 card) +inline m256i ADD32(m256i x, m256i y) { - m256i _a = _mm256_load_si256(&a); - m256i _b = _mm256_load_si256(&b); - m256i c = _mm256_add_epi32(_a, _b); + return _mm256_add_epi32(x, y); +} +inline m256i SUB32(m256i x, m256i y) +{ + return _mm256_sub_epi32(x, y); +} +inline m256i MUL32(m256i x, m256i y) +{ + return _mm256_mullo_epi32(x, y); +} - // Modulo - return 
mod_after_add(c, card); +inline m256i CMPEQ32(m256i x, m256i y) +{ + return _mm256_cmpeq_epi32(x, y); +} +inline m256i CMPGT32(m256i x, m256i y) +{ + return _mm256_cmpgt_epi32(x, y); +} +inline m256i MINU32(m256i x, m256i y) +{ + return _mm256_min_epu32(x, y); } +#define BLEND16(x, y, imm8) (_mm256_blend_epi16(x, y, imm8)) -/** Perform subtraction of a by b where a, b whose elements are symbols of - * GF(card) - * sub(a, b) = a - b if a >= b, or - * card + a - b, otherwise - */ -inline m256i sub(m256i a, m256i b, aint32 card) +// z = x + y mod q +// Input are loaded to registers +// Output is register +inline m256i ADD_MOD(m256i x, m256i y, uint32_t q) { - const m256i _card = _mm256_set1_epi32(card); + m256i res = ADD32(x, y); + return MINU32(res, SUB32(res, CARD(q))); +} - m256i _a = _mm256_load_si256(&a); - m256i _b = _mm256_load_si256(&b); +// z = x - y mod q => z = q + x - y mod q +// Input are loaded to registers +// Output is register +inline m256i SUB_MOD(m256i x, m256i y, uint32_t q) +{ + m256i res = SUB32(x, y); + return MINU32(res, ADD32(res, CARD(q))); +} - m256i cmp = _mm256_cmpgt_epi32(_b, _a); - m256i _a1 = _mm256_add_epi32(_a, _mm256_and_si256(_card, cmp)); +// y = 0 - x mod q => y = q - x mod q +// Input are loaded to registers +// Output is register +inline m256i NEG_MOD(m256i x, uint32_t q) +{ + m256i res = SUB32(CARD(q), x); + return MINU32(res, SUB32(res, CARD(q))); +} - return _mm256_sub_epi32(_a1, _b); +// z = x * y mod q +// Input are loaded to registers +// Output is register +// Note: we assume that at least `x` or `y` is less than `q-1` so it's +// not necessary to verify overflow on multiplying elements +inline m256i MUL_MOD(m256i x, m256i y, uint32_t q) +{ + m256i res = MUL32(x, y); + m256i lo = BLEND16(ZERO, res, 0x55); + m256i hi = BLEND16(ZERO, SHIFTR_2(res), 0x55); + return SUB_MOD(lo, hi, q); } -/** Negate `a` - * @return 0 if (a == 0), else card - a - */ -inline m256i neg(m256i a, aint32 card = F4) +inline void MUL_MOD(m256i x, 
m256i y, m256i* z, uint32_t q) +{ + m256i res = MUL32(x, y); + m256i lo = BLEND16(ZERO, res, 0x55); + m256i hi = BLEND16(ZERO, SHIFTR_2(res), 0x55); + *z = SUB_MOD(lo, hi, q); +} +// z = x * y mod q +// Input are loaded to registers +// Output is register +inline m256i MULFULL_MOD(m256i x, m256i y, uint32_t q) { - const m256i _card = _mm256_set1_epi32(card); - m256i _a = _mm256_load_si256(&a); - m256i _b = _mm256_setzero_si256(); + m256i res = MUL32(x, y); - m256i cmp = _mm256_cmpgt_epi32(_a, _b); + // filter elements of both of a & b = card-1 + m256i cmp = AND(CMPEQ32(x, CARD_M_1(q)), CMPEQ32(y, CARD_M_1(q))); + res = (q == F3) ? XOR(res, AND(F4_u32, cmp)) : ADD32(res, AND(ONE, cmp)); - return _mm256_sub_epi32(_mm256_and_si256(cmp, _card), _a); + m256i lo = BLEND16(ZERO, res, 0x55); + m256i hi = SHIFTR_2(BLEND16(ZERO, res, 0xAA)); + return SUB_MOD(lo, hi, q); } -/** Perform a%card where a is a multiplication of two numbers whose elements are - * symbols of GF(F4) - * - * We find v in a = u * card + v - * a is expressed also as: a = hi * (card-1) + lo - * where hi and lo is 16-bit for F4 (or 8-bit for F3) high and low parts of a - * hence, v = (lo - hi) % F4 - * v = lo - hi, if lo >= hi - * or - * F4 + lo - hi, otherwise - */ -inline m256i mod_after_multiply_f4(m256i a) +// butterfly CT with r == 1 +inline void BUTTERFLY_1(m256i* x, m256i* y, uint32_t q) { - const m256i mask = _mm256_set1_epi32(F4 - 2); - - m256i lo = _mm256_and_si256(a, mask); - - m256i a_shift = _mm256_srli_si256(a, 2); - m256i hi = _mm256_and_si256(a_shift, mask); + m256i add = ADD_MOD(*x, *y, q); + *y = SUB_MOD(*x, *y, q); + *x = add; +} - m256i cmp = _mm256_cmpgt_epi32(hi, lo); - m256i _lo = _mm256_add_epi32(lo, _mm256_and_si256(F4_m256i, cmp)); +// butterfly CT with r == q - 1 +inline void BUTTERFLY_2(m256i* x, m256i* y, uint32_t q) +{ + m256i add = ADD_MOD(*x, *y, q); + *x = SUB_MOD(*x, *y, q); + *y = add; +} - return _mm256_sub_epi32(_lo, hi); +// butterfly CT with 1 < r < q - 1 +inline 
void BUTTERFLY_3(m256i c, m256i* x, m256i* y, uint32_t q) +{ + m256i z = MUL_MOD(c, *y, q); + *y = SUB_MOD(*x, z, q); + *x = ADD_MOD(*x, z, q); } -inline m256i mod_after_multiply_f3(m256i a) +// butterfly GS w/ r = q - 1 +inline void BUTTERFLY_4(m256i* x, m256i* y, uint32_t q) { - const m256i mask = _mm256_set1_epi32(F3 - 2); + m256i add = ADD_MOD(*x, *y, q); + *y = SUB_MOD(*y, *x, q); + *x = add; +} - m256i lo = _mm256_and_si256(a, mask); +// butterfly GS w/ 1 < r < q - 1 +// x = x + y mod q +// y = z * (x - y) mod q +inline void BUTTERFLY_5(m256i c, m256i* x, m256i* y, uint32_t q) +{ + m256i sub = SUB_MOD(*x, *y, q); + *x = ADD_MOD(*x, *y, q); + *y = MUL_MOD(c, sub, q); +} - m256i a_shift = _mm256_srli_si256(a, 1); - m256i hi = _mm256_and_si256(a_shift, mask); +/** + * Vectorized butterly CT step + * + * For each pair (P, Q) = (buf[i], buf[i + m]) for step = 2 * m and coef `r` + * P = P + r * Q + * Q = P - r * Q + * + * @param buf - working buffers + * @param r - coefficient + * @param start - index of buffer among `m` ones + * @param m - current group size + * @param len - number of vectors per buffer + * @param card - modulo cardinal + */ +inline void butterfly_ct_step( + vec::Buffers& buf, + uint32_t r, + unsigned start, + unsigned m, + size_t len, + uint32_t card) +{ + const unsigned step = m << 1; + m256i c = SET1(r); + +#define BUTTERFLY_CT(x, y) \ + (EITHER( \ + r == 1, \ + BUTTERFLY_1(x, y, card), \ + EITHER( \ + r < card - 1, \ + BUTTERFLY_3(c, x, y, card), \ + BUTTERFLY_2(x, y, card)))); + + const size_t end = len - 1; + const unsigned bufs_nb = buf.get_n(); + // #pragma omp parallel for + // #pragma unroll + const std::vector& mem = buf.get_mem(); + for (unsigned i = start; i < bufs_nb; i += step) { + m256i x1, y1; + m256i x2, y2; + m256i* __restrict p = reinterpret_cast(mem[i]); + m256i* __restrict q = reinterpret_cast(mem[i + m]); + + // #pragma omp parallel for + size_t j = 0; + // #pragma unroll + for (; j < end; j += 2) { + x1 = LOAD(p + j); + y1 
= LOAD(q + j); + x2 = LOAD(p + j + 1); + y2 = LOAD(q + j + 1); + + BUTTERFLY_CT(&x1, &y1); + BUTTERFLY_CT(&x2, &y2); + + // Store back to memory + STORE(p + j, x1); + STORE(p + j + 1, x2); + STORE(q + j, y1); + STORE(q + j + 1, y2); + } + for (; j < len; ++j) { + x1 = LOAD(p + j); + y1 = LOAD(q + j); - m256i cmp = _mm256_cmpgt_epi32(hi, lo); - m256i _lo = _mm256_add_epi32(lo, _mm256_and_si256(F3_m256i, cmp)); + BUTTERFLY_CT(&x1, &y1); - return _mm256_sub_epi32(_lo, hi); + // Store back to memory + STORE(p + j, x1); + STORE(q + j, y1); + } + } } -inline m256i mul_f4(m256i a, m256i b) +/** + * Vectorized butterly CT on two-layers at a time + * + * For each quadruple + * (P, Q, R, S) = (buf[i], buf[i + m], buf[i + 2 * m], buf[i + 3 * m]) + * First layer: butterfly on (P, Q) and (R, S) for step = 2 * m + * coef r1 = W[start * n / (2 * m)] + * P = P + r1 * Q + * Q = P - r1 * Q + * R = R + r1 * S + * S = R - r1 * S + * Second layer: butterfly on (P, R) and (Q, S) for step = 4 * m + * coef r2 = W[start * n / (4 * m)] + * coef r3 = W[(start + m) * n / (4 * m)] + * P = P + r2 * R + * R = P - r2 * R + * Q = Q + r3 * S + * S = Q - r3 * S + * + * @param buf - working buffers + * @param r1 - coefficient for the 1st layer + * @param r2 - 1st coefficient for the 2nd layer + * @param r3 - 2nd coefficient for the 2nd layer + * @param start - index of buffer among `m` ones + * @param m - current group size + * @param len - number of vectors per buffer + * @param card - modulo cardinal + */ +inline void butterfly_ct_two_layers_step( + vec::Buffers& buf, + uint32_t r1, + uint32_t r2, + uint32_t r3, + unsigned start, + unsigned m, + size_t len, + uint32_t card) { - m256i _a = _mm256_load_si256(&a); - m256i _b = _mm256_load_si256(&b); - - m256i c = _mm256_mullo_epi32(_a, _b); + const unsigned step = m << 2; + m256i c1 = SET1(r1); + m256i c2 = SET1(r2); + m256i c3 = SET1(r3); + +#define BUTTERFLY_R1(c, x, y) \ + (EITHER( \ + r1 == 1, \ + BUTTERFLY_1(x, y, card), \ + EITHER( \ + r1 < card 
- 1, \ + BUTTERFLY_3(c, x, y, card), \ + BUTTERFLY_2(x, y, card)))); +#define BUTTERFLY_R2(c, x, y) \ + (EITHER( \ + r2 == 1, \ + BUTTERFLY_1(x, y, card), \ + EITHER( \ + r2 < card - 1, \ + BUTTERFLY_3(c, x, y, card), \ + BUTTERFLY_2(x, y, card)))); +#define BUTTERFLY_R3(c, x, y) \ + (EITHER( \ + r3 == 1, \ + BUTTERFLY_1(x, y, card), \ + EITHER( \ + r3 < card - 1, \ + BUTTERFLY_3(c, x, y, card), \ + BUTTERFLY_2(x, y, card)))); + + const size_t end = len - 1; + const unsigned bufs_nb = buf.get_n(); + // #pragma omp parallel for + // #pragma unroll + const std::vector& mem = buf.get_mem(); + for (unsigned i = start; i < bufs_nb; i += step) { + m256i x1, y1, u1, v1; + m256i x2, y2, u2, v2; + m256i* __restrict p = reinterpret_cast(mem[i]); + m256i* __restrict q = reinterpret_cast(mem[i + m]); + m256i* __restrict r = reinterpret_cast(mem[i + 2 * m]); + m256i* __restrict s = reinterpret_cast(mem[i + 3 * m]); + + // #pragma omp parallel for + size_t j = 0; + // #pragma unroll + for (; j < end; j += 2) { + // First layer (c1, x, y) & (c1, u, v) + x1 = LOAD(p + j); + y1 = LOAD(q + j); + x2 = LOAD(p + j + 1); + y2 = LOAD(q + j + 1); + + u1 = LOAD(r + j); + v1 = LOAD(s + j); + u2 = LOAD(r + j + 1); + v2 = LOAD(s + j + 1); + + BUTTERFLY_R1(c1, &x1, &y1); + BUTTERFLY_R1(c1, &x2, &y2); + + BUTTERFLY_R1(c1, &u1, &v1); + BUTTERFLY_R1(c1, &u2, &v2); + + // Second layer (c2, x, u) & (c3, y, v) + BUTTERFLY_R2(c2, &x1, &u1); + BUTTERFLY_R2(c2, &x2, &u2); + + BUTTERFLY_R3(c3, &y1, &v1); + BUTTERFLY_R3(c3, &y2, &v2); + + // Store back to memory + STORE(p + j, x1); + STORE(p + j + 1, x2); + STORE(q + j, y1); + STORE(q + j + 1, y2); + + STORE(r + j, u1); + STORE(r + j + 1, u2); + STORE(s + j, v1); + STORE(s + j + 1, v2); + } + for (; j < len; ++j) { + // First layer (c1, x, y) & (c1, u, v) + x1 = LOAD(p + j); + y1 = LOAD(q + j); + u1 = LOAD(r + j); + v1 = LOAD(s + j); + + BUTTERFLY_R1(c1, &x1, &y1); + BUTTERFLY_R1(c1, &u1, &v1); + // Second layer (c2, x, u) & (c3, y, v) + BUTTERFLY_R2(c2, 
&x1, &u1); + BUTTERFLY_R3(c3, &y1, &v1); + // Store back to memory + STORE(p + j, x1); + STORE(q + j, y1); + STORE(r + j, u1); + STORE(s + j, v1); + } + } +} - // filter elements of both of a & b = card-1 - m256i cmp = _mm256_and_si256( - _mm256_cmpeq_epi32(_a, F4minus1_m256i), - _mm256_cmpeq_epi32(_b, F4minus1_m256i)); +/** + * Vectorized butterly GS step + * + * For each pair (P, Q) = (buf[i], buf[i + m]) for step = 2 * m and coef `r` + * P = P + Q + * Q = r * (P - Q) + * + * @param buf - working buffers + * @param r - coefficient + * @param start - index of buffer among `m` ones + * @param m - current group size + * @param len - number of vectors per buffer + * @param card - modulo cardinal + */ +inline void butterfly_gs_step( + vec::Buffers& buf, + uint32_t r, + unsigned start, + unsigned m, + size_t len, + uint32_t card) +{ + const unsigned step = m << 1; + m256i c = SET1(r); + +#define BUTTERFLY_GS(x, y) \ + (EITHER( \ + r == 1, \ + BUTTERFLY_1(x, y, card), \ + EITHER( \ + r < card - 1, \ + BUTTERFLY_5(c, x, y, card), \ + BUTTERFLY_4(x, y, card)))); + + const size_t end = len - 3; + const unsigned bufs_nb = buf.get_n(); + // #pragma omp parallel for + // #pragma unroll + const std::vector& mem = buf.get_mem(); + for (unsigned i = start; i < bufs_nb; i += step) { + m256i x1, x2, x3, x4; + m256i y1, y2, y3, y4; + m256i* __restrict p = reinterpret_cast(mem[i]); + m256i* __restrict q = reinterpret_cast(mem[i + m]); + + // #pragma omp parallel for + size_t j = 0; + // #pragma unroll + for (; j < end; j += 4) { + x1 = LOAD(p + j); + x2 = LOAD(p + j + 1); + x3 = LOAD(p + j + 2); + x4 = LOAD(p + j + 3); + y1 = LOAD(q + j); + y2 = LOAD(q + j + 1); + y3 = LOAD(q + j + 2); + y4 = LOAD(q + j + 3); + + BUTTERFLY_GS(&x1, &y1); + BUTTERFLY_GS(&x2, &y2); + BUTTERFLY_GS(&x3, &y3); + BUTTERFLY_GS(&x4, &y4); + + // Store back to memory + STORE(p + j, x1); + STORE(p + j + 1, x2); + STORE(p + j + 2, x3); + STORE(p + j + 3, x4); + STORE(q + j, y1); + STORE(q + j + 1, y2); + 
STORE(q + j + 2, y3); + STORE(q + j + 3, y4); + } + for (; j < len; ++j) { + x1 = LOAD(p + j); + y1 = LOAD(q + j); - const m256i one = _mm256_set1_epi32(1); - c = _mm256_add_epi32(c, _mm256_and_si256(one, cmp)); + BUTTERFLY_GS(&x1, &y1); - // Modulo - return mod_after_multiply_f4(c); + // Store back to memory + STORE(p + j, x1); + STORE(q + j, y1); + } + } } -inline m256i mul_f3(m256i a, m256i b) +/** + * Vectorized butterly GS step + * + * For each pair (P, Q) = (buf[i], buf[i + m]) for step = 2 * m and coef `r` + * Q = r * P + * + * @param buf - working buffers + * @param r - coefficient + * @param start - index of buffer among `m` ones + * @param m - current group size + * @param len - number of vectors per buffer + * @param card - modulo cardinal + */ +inline void butterfly_gs_step_simple( + vec::Buffers& buf, + uint32_t r, + unsigned start, + unsigned m, + size_t len, + uint32_t card) { - m256i _a = _mm256_load_si256(&a); - m256i _b = _mm256_load_si256(&b); - - m256i c = _mm256_mullo_epi32(_a, _b); - - // filter elements of both of a & b = card-1 - m256i cmp = _mm256_and_si256( - _mm256_cmpeq_epi32(_a, F3minus1_m256i), - _mm256_cmpeq_epi32(_b, F3minus1_m256i)); + const unsigned step = m << 1; + m256i c = SET1(r); + +#define BUTTERFLY_GS_S(x) \ + (EITHER( \ + r == 1, \ + (x), \ + EITHER(r < card - 1, MUL_MOD(c, x, card), NEG_MOD(x, card)))); + + const size_t end = len - 1; + const unsigned bufs_nb = buf.get_n(); + // #pragma omp parallel for + // #pragma unroll + const std::vector& mem = buf.get_mem(); + for (unsigned i = start; i < bufs_nb; i += step) { + m256i x1, y1; + m256i x2, y2; + m256i* __restrict p = reinterpret_cast(mem[i]); + m256i* __restrict q = reinterpret_cast(mem[i + m]); + + // #pragma omp parallel for + size_t j = 0; + // #pragma unroll + for (; j < end; j += 2) { + x1 = LOAD(p + j); + x2 = LOAD(p + j + 1); + + y1 = BUTTERFLY_GS_S(x1); + y2 = BUTTERFLY_GS_S(x2); + + // Store back to memory + STORE(q + j, y1); + STORE(q + j + 1, y2); + } + for 
(; j < len; ++j) { + x1 = LOAD(p + j); - c = _mm256_xor_si256(c, _mm256_and_si256(F4_m256i, cmp)); + y1 = BUTTERFLY_GS_S(x1); - // Modulo - return mod_after_multiply_f3(c); + // Store back to memory + STORE(q + j, y1); + } + } } -/** Perform multiplication of two numbers a, b whose elements are of GF(card) - * where `card` is a prime Fermat number, i.e. card = Fx with x < 5 - * Currently, it supports only for F3 and F4 - */ -inline m256i mul(m256i a, m256i b, aint32 card) +inline void add_props( + Properties& props, + m256i threshold, + m256i mask, + m256i symb, + off_t offset) { - assert(card == F4 || card == F3); - if (card == F4) - return mul_f4(a, b); - return mul_f3(a, b); + const m256i b = CMPEQ32(threshold, symb); + const m256i c = AND(mask, b); + uint32_t d = MVMSK8(c); + const unsigned element_size = sizeof(uint32_t); + while (d > 0) { + unsigned byte_idx = __builtin_ctz(d); + off_t _offset = offset + byte_idx / element_size; + props.add(_offset, OOR_MARK); + d ^= 1 << byte_idx; + } } -/** Apply an element-wise negation to a buffer - */ -inline void neg(size_t len, aint32* buf, aint32 card = F4) +inline void encode_post_process( + vec::Buffers& output, + std::vector& props, + off_t offset, + unsigned code_len, + uint32_t threshold, + size_t vecs_nb) { - m256i* _buf = reinterpret_cast(buf); - unsigned ratio = sizeof(*_buf) / sizeof(*buf); - size_t _len = len / ratio; - size_t _last_len = len - _len * ratio; + const unsigned element_size = sizeof(uint32_t); + const unsigned vec_size = ALIGN_SIZE / element_size; + const uint32_t max = 1 << (element_size * 8 - 1); + const m256i _threshold = SET1(threshold); + const m256i mask_hi = SET1(max); - size_t i; - for (i = 0; i < _len; i++) { - _buf[i] = neg(_buf[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - if (buf[i]) - buf[i] = card - buf[i]; + // #pragma unroll + const std::vector& mem = output.get_mem(); + for (unsigned frag_id = 0; frag_id < code_len; ++frag_id) { + m256i* 
__restrict buf = reinterpret_cast(mem[frag_id]); + + size_t vec_id = 0; + size_t end = vecs_nb - 3; + // #pragma unroll + for (; vec_id < end; vec_id += 4) { + m256i a1 = LOAD(buf + vec_id); + m256i a2 = LOAD(buf + vec_id + 1); + m256i a3 = LOAD(buf + vec_id + 2); + m256i a4 = LOAD(buf + vec_id + 3); + + if (TESTZ(a1, _threshold) == 0) { + const off_t curr_offset = offset + vec_id * vec_size; + add_props(props[frag_id], _threshold, mask_hi, a1, curr_offset); + } + if (TESTZ(a2, _threshold) == 0) { + const off_t curr_offset = offset + (vec_id + 1) * vec_size; + add_props(props[frag_id], _threshold, mask_hi, a2, curr_offset); + } + if (TESTZ(a3, _threshold) == 0) { + const off_t curr_offset = offset + (vec_id + 2) * vec_size; + add_props(props[frag_id], _threshold, mask_hi, a3, curr_offset); + } + if (TESTZ(a4, _threshold) == 0) { + const off_t curr_offset = offset + (vec_id + 3) * vec_size; + add_props(props[frag_id], _threshold, mask_hi, a4, curr_offset); + } + } + for (; vec_id < vecs_nb; ++vec_id) { + m256i a = LOAD(buf + vec_id); + uint32_t c = TESTZ(a, _threshold); + if (c == 0) { + const off_t curr_offset = offset + vec_id * vec_size; + add_props(props[frag_id], _threshold, mask_hi, a, curr_offset); + } } } } +/* ==================== Operations =================== */ /** Perform a multiplication of a coefficient `a` to each element of `src` and * add result to correspondent element of `dest` + * + * @note: 1 < `a` < card - 1 */ inline void mul_coef_to_buf( - const aint32 a, + const uint32_t a, aint32* src, aint32* dest, size_t len, - aint32 card = F4) + uint32_t card) { - const m256i coef = _mm256_set1_epi32(a); + const m256i coef = SET1(a); - m256i* _src = reinterpret_cast(src); - m256i* _dest = reinterpret_cast(dest); + m256i* __restrict _src = reinterpret_cast(src); + m256i* __restrict _dest = reinterpret_cast(dest); const unsigned ratio = sizeof(*_src) / sizeof(*src); const size_t _len = len / ratio; const size_t _last_len = len - _len * ratio; - size_t i; 
- for (i = 0; i < _len; i++) { + size_t i = 0; + size_t end = _len - 3; + for (; i < end; i += 4) { // perform multiplication - _dest[i] = mul(coef, _src[i], card); + MUL_MOD(coef, _src[i], _dest + i, card); + MUL_MOD(coef, _src[i + 1], _dest + i + 1, card); + MUL_MOD(coef, _src[i + 2], _dest + i + 2, card); + MUL_MOD(coef, _src[i + 3], _dest + i + 3, card); + } + for (; i < _len; ++i) { + MUL_MOD(coef, _src[i], _dest + i, card); } + if (_last_len > 0) { uint64_t coef_64 = (uint64_t)a; - for (i = _len * ratio; i < len; i++) { + for (size_t i = _len * ratio; i < len; i++) { // perform multiplication dest[i] = (aint32)((coef_64 * src[i]) % card); } } } -inline void -add_two_bufs(aint32* src, aint32* dest, size_t len, aint32 card = F4) +inline void add_two_bufs(aint32* src, aint32* dest, size_t len, aint32 card) { - m256i* _src = reinterpret_cast(src); - m256i* _dest = reinterpret_cast(dest); + m256i* __restrict _src = reinterpret_cast(src); + m256i* __restrict _dest = reinterpret_cast(dest); const unsigned ratio = sizeof(*_src) / sizeof(*src); const size_t _len = len / ratio; const size_t _last_len = len - _len * ratio; @@ -251,7 +683,7 @@ add_two_bufs(aint32* src, aint32* dest, size_t len, aint32 card = F4) size_t i; for (i = 0; i < _len; i++) { // perform addition - _dest[i] = add(_src[i], _dest[i], card); + _dest[i] = ADD_MOD(_src[i], _dest[i], card); } if (_last_len > 0) { for (i = _len * ratio; i < len; i++) { @@ -269,9 +701,9 @@ inline void sub_two_bufs( size_t len, aint32 card = F4) { - m256i* _bufa = reinterpret_cast(bufa); - m256i* _bufb = reinterpret_cast(bufb); - m256i* _res = reinterpret_cast(res); + m256i* __restrict _bufa = reinterpret_cast(bufa); + m256i* __restrict _bufb = reinterpret_cast(bufb); + m256i* __restrict _res = reinterpret_cast(res); const unsigned ratio = sizeof(*_bufa) / sizeof(*bufa); const size_t _len = len / ratio; const size_t _last_len = len - _len * ratio; @@ -279,7 +711,7 @@ inline void sub_two_bufs( size_t i; for (i = 0; i < 
_len; i++) { // perform subtraction - _res[i] = sub(_bufa[i], _bufb[i], card); + _res[i] = SUB_MOD(_bufa[i], _bufb[i], card); } if (_last_len > 0) { for (i = _len * ratio; i < len; i++) { @@ -292,11 +724,10 @@ inline void sub_two_bufs( } } -inline void -mul_two_bufs(aint32* src, aint32* dest, size_t len, aint32 card = F4) +inline void mul_two_bufs(aint32* src, aint32* dest, size_t len, aint32 card) { - m256i* _src = reinterpret_cast(src); - m256i* _dest = reinterpret_cast(dest); + m256i* __restrict _src = reinterpret_cast(src); + m256i* __restrict _dest = reinterpret_cast(dest); const unsigned ratio = sizeof(*_src) / sizeof(*src); const size_t _len = len / ratio; const size_t _last_len = len - _len * ratio; @@ -304,7 +735,7 @@ mul_two_bufs(aint32* src, aint32* dest, size_t len, aint32 card = F4) size_t i; for (i = 0; i < _len; i++) { // perform multiplicaton - _dest[i] = mul(_src[i], _dest[i], card); + _dest[i] = MULFULL_MOD(_src[i], _dest[i], card); } if (_last_len > 0) { for (i = _len * ratio; i < len; i++) { @@ -314,81 +745,23 @@ mul_two_bufs(aint32* src, aint32* dest, size_t len, aint32 card = F4) } } -/* - * buf1[i] = buf1[i] + coef * buf2[i] - * buf2[i] = buf1[i] - coef * buf2[i] - */ -inline void butterfly_ct( - uint32_t coef, - aint32* buf1, - aint32* buf2, - size_t len, - uint32_t card = F4) -{ - const m256i _coef = _mm256_set1_epi32(coef); - m256i* _buf1 = reinterpret_cast(buf1); - m256i* _buf2 = reinterpret_cast(buf2); - - for (size_t i = 0; i < len; ++i) { - m256i a = mul(_coef, _buf2[i], card); - _buf2[i] = sub(_buf1[i], a, card); - _buf1[i] = add(_buf1[i], a, card); - } -} - -/* - * buf1[i] = buf1[i] + buf2[i] - * buf2[i] = coef * (buf1[i] - buf2[i]) +/** Apply an element-wise negation to a buffer */ -inline void butterfly_gs( - uint32_t coef, - aint32* buf1, - aint32* buf2, - size_t len, - uint32_t card = F4) +inline void neg(size_t len, aint32* buf, aint32 card = F4) { - const m256i _coef = _mm256_set1_epi32(coef); - m256i* _buf1 = 
reinterpret_cast(buf1); - m256i* _buf2 = reinterpret_cast(buf2); + m256i* _buf = reinterpret_cast(buf); + unsigned ratio = sizeof(*_buf) / sizeof(*buf); + size_t _len = len / ratio; + size_t _last_len = len - _len * ratio; - for (size_t i = 0; i < len; ++i) { - m256i a = add(_buf1[i], _buf2[i], card); - _buf2[i] = mul(_coef, sub(_buf1[i], _buf2[i], card), card); - _buf1[i] = a; + size_t i; + for (i = 0; i < _len; i++) { + _buf[i] = NEG_MOD(_buf[i], card); } -} - -inline void encode_post_process( - vec::Buffers& output, - std::vector& props, - off_t offset, - unsigned code_len, - uint32_t threshold, - size_t vecs_nb) -{ - const unsigned vec_size = simd::countof(); - - const m256i _threshold = _mm256_set1_epi32(threshold); - const uint32_t max = 1 << (sizeof(uint32_t) * 8 - 1); - const m256i mask_hi = _mm256_set1_epi32(max); - const unsigned element_size = sizeof(uint32_t); - - for (unsigned frag_id = 0; frag_id < code_len; ++frag_id) { - uint32_t* chunk = output.get(frag_id); - m256i* buf = reinterpret_cast(chunk); - for (unsigned vec_id = 0; vec_id < vecs_nb; ++vec_id) { - const m256i a = _mm256_load_si256(&(buf[vec_id])); - const m256i b = _mm256_cmpeq_epi32(_threshold, a); - const m256i c = _mm256_and_si256(mask_hi, b); - uint32_t d = _mm256_movemask_epi8(c); - - while (d > 0) { - unsigned byte_idx = __builtin_ctz(d); - unsigned element_idx = byte_idx / element_size; - off_t _offset = offset + vec_id * vec_size + element_idx; - props[frag_id].add(_offset, 1); - d ^= 1 << byte_idx; - } + if (_last_len > 0) { + for (i = _len * ratio; i < len; i++) { + if (buf[i]) + buf[i] = card - buf[i]; } } } @@ -408,7 +781,7 @@ inline __uint128_t add(__uint128_t a, __uint128_t b) { m256i _a = _mm256_castsi128_si256((m128i)a); m256i _b = _mm256_castsi128_si256((m128i)b); - m256i res = add(_a, _b, F4); + m256i res = ADD_MOD(_a, _b, F4); return m256i_to_uint128(res); } @@ -416,7 +789,7 @@ inline __uint128_t sub(__uint128_t a, __uint128_t b) { m256i _a = 
_mm256_castsi128_si256((m128i)a); m256i _b = _mm256_castsi128_si256((m128i)b); - m256i res = sub(_a, _b, F4); + m256i res = SUB_MOD(_a, _b, F4); return m256i_to_uint128(res); } @@ -424,7 +797,7 @@ inline __uint128_t mul(__uint128_t a, __uint128_t b) { m256i _a = _mm256_castsi128_si256((m128i)a); m256i _b = _mm256_castsi128_si256((m128i)b); - m256i res = mul(_a, _b, F4); + m256i res = MULFULL_MOD(_a, _b, F4); return m256i_to_uint128(res); } @@ -446,7 +819,7 @@ inline void hadamard_mul(int n, aint128* _x, aint128* _y) // multiply y to the first half of `x` for (i = 0; i < len_256; i++) { - x[i] = mul(x[i], y[i], F4); + x[i] = MULFULL_MOD(x[i], y[i], F4); } if (last_len > 0) { From 8e7fb47b699a8cb99125b8fb4aa6c2cb6a470291 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Fri, 5 Oct 2018 13:42:31 +0200 Subject: [PATCH 07/77] SIMD 256 u16 u32: remove useless files --- src/simd_256_u16.h | 745 ---------------------------------------- src/simd_256_u32.h | 839 --------------------------------------------- 2 files changed, 1584 deletions(-) delete mode 100644 src/simd_256_u16.h delete mode 100644 src/simd_256_u32.h diff --git a/src/simd_256_u16.h b/src/simd_256_u16.h deleted file mode 100644 index 74d35d24..00000000 --- a/src/simd_256_u16.h +++ /dev/null @@ -1,745 +0,0 @@ -/* - * Copyright 2017-2018 Scality - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. 
Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __QUAD_SIMD_256_U16_H__ -#define __QUAD_SIMD_256_U16_H__ - -#include - -#include "simd/simd.h" - -namespace quadiron { -namespace simd { - -#define F3_u16 _mm256_set1_epi16(257) -#define F3m1_u16 _mm256_set1_epi16(256) - -/* ==================== Essential Operations =================== */ -// Following functions are used for AVX2 w/ u16 only - -inline m256i SET1(uint16_t val) -{ - return _mm256_set1_epi16(val); -} -inline m256i ADD16(m256i x, m256i y) -{ - return _mm256_add_epi16(x, y); -} -inline m256i SUB16(m256i x, m256i y) -{ - return _mm256_sub_epi16(x, y); -} -inline m256i MUL16(m256i x, m256i y) -{ - return _mm256_mullo_epi16(x, y); -} - -inline m256i CMPEQ16(m256i x, m256i y) -{ - return _mm256_cmpeq_epi16(x, y); -} -inline m256i CMPGT16(m256i x, m256i y) -{ - return _mm256_cmpgt_epi16(x, y); -} -inline m256i MINU16(m256i x, m256i y) -{ - return _mm256_min_epu16(x, y); -} - -#define MASK8_LO (_mm256_set1_epi16(0x80)) -#define BLEND8(x, y, mask) 
(_mm256_blendv_epi8(x, y, mask)) - -// z = x + y mod q -// Input are loaded to registers -// Output is register -inline m256i ADD_MOD(m256i x, m256i y, uint16_t q) -{ - m256i res = ADD16(x, y); - return MINU16(res, SUB16(res, F3_u16)); -} - -// z = x - y mod q => z = q + x - y mod q -// Input are loaded to registers -// Output is register -inline m256i SUB_MOD(m256i x, m256i y, uint16_t q) -{ - m256i res = SUB16(x, y); - return MINU16(res, SUB16(ADD16(x, F3_u16), y)); -} - -// y = 0 - x mod q => y = q - x mod q -// Input are loaded to registers -// Output is register -inline m256i NEG_MOD(m256i x, uint16_t q) -{ - m256i res = SUB16(F3_u16, x); - return MINU16(res, SUB16(res, F3_u16)); -} - -// z = x * y mod q -// Input are loaded to registers -// Output is register -// Note: we assume that at least `x` or `y` is less than `q-1` so it's -// not necessary to verify overflow on multiplying elements -inline m256i MUL_MOD(m256i x, m256i y, uint16_t q) -{ - m256i res = MUL16(x, y); - m256i lo = BLEND8(ZERO, res, MASK8_LO); - m256i hi = BLEND8(ZERO, SHIFTR_1(res), MASK8_LO); - return SUB_MOD(lo, hi, q); -} - -// z = x * y mod q -// Input are loaded to registers -// Output is register -inline m256i MULFULL_MOD(m256i x, m256i y, uint16_t q) -{ - m256i res = MUL16(x, y); - - // filter elements of both of a & b = card-1 - m256i cmp = AND(CMPEQ16(x, F3m1_u16), CMPEQ16(y, F3m1_u16)); - res = ADD16(res, AND(ONE, cmp)); - - m256i lo = BLEND8(ZERO, res, MASK8_LO); - m256i hi = BLEND8(ZERO, SHIFTR_1(res), MASK8_LO); - return SUB_MOD(lo, hi, q); -} - -// butterfly CT with r == 1 -inline void BUTTERFLY_1(m256i* x, m256i* y, uint16_t q) -{ - m256i add = ADD_MOD(*x, *y, q); - *y = SUB_MOD(*x, *y, q); - *x = add; -} - -// butterfly CT with r == q - 1 -inline void BUTTERFLY_2(m256i* x, m256i* y, uint16_t q) -{ - m256i add = ADD_MOD(*x, *y, q); - *x = SUB_MOD(*x, *y, q); - *y = add; -} - -// butterfly CT with 1 < r < q - 1 -inline void BUTTERFLY_3(m256i c, m256i* x, m256i* y, uint16_t q) 
-{ - m256i z = MUL_MOD(c, *y, q); - *y = SUB_MOD(*x, z, q); - *x = ADD_MOD(*x, z, q); -} - -// butterfly GS w/ r = q - 1 -inline void BUTTERFLY_4(m256i* x, m256i* y, uint16_t q) -{ - m256i add = ADD_MOD(*x, *y, q); - *y = SUB_MOD(*y, *x, q); - *x = add; -} - -// butterfly GS w/ 1 < r < q - 1 -// x = x + y mod q -// y = z * (x - y) mod q -inline void BUTTERFLY_5(m256i c, m256i* x, m256i* y, uint16_t q) -{ - m256i sub = SUB_MOD(*x, *y, q); - *x = ADD_MOD(*x, *y, q); - *y = MUL_MOD(c, sub, q); -} - -/** - * Vectorized butterly CT step - * - * For each pair (P, Q) = (buf[i], buf[i + m]) for step = 2 * m and coef `r` - * P = P + r * Q - * Q = P - r * Q - * - * @param buf - working buffers - * @param r - coefficient - * @param start - index of buffer among `m` ones - * @param m - current group size - * @param len - number of vectors per buffer - * @param card - modulo cardinal - */ -inline void butterfly_ct_step( - vec::Buffers& buf, - uint16_t r, - unsigned start, - unsigned m, - size_t len, - uint16_t card) -{ - const unsigned step = m << 1; - m256i c = SET1(r); - -#define BUTTERFLY_CT(x, y) \ - (EITHER( \ - r == 1, \ - BUTTERFLY_1(x, y, card), \ - EITHER( \ - r < card - 1, \ - BUTTERFLY_3(c, x, y, card), \ - BUTTERFLY_2(x, y, card)))); - - const size_t end = len - 1; - const unsigned bufs_nb = buf.get_n(); - // #pragma omp parallel for - // #pragma unroll - const std::vector& mem = buf.get_mem(); - for (unsigned i = start; i < bufs_nb; i += step) { - m256i x1, y1; - m256i x2, y2; - m256i* __restrict p = reinterpret_cast(mem[i]); - m256i* __restrict q = reinterpret_cast(mem[i + m]); - - // #pragma omp parallel for - size_t j = 0; - // #pragma unroll - for (; j < end; j += 2) { - x1 = LOAD(p + j); - y1 = LOAD(q + j); - x2 = LOAD(p + j + 1); - y2 = LOAD(q + j + 1); - - BUTTERFLY_CT(&x1, &y1); - BUTTERFLY_CT(&x2, &y2); - - // Store back to memory - STORE(p + j, x1); - STORE(p + j + 1, x2); - STORE(q + j, y1); - STORE(q + j + 1, y2); - } - for (; j < len; ++j) { - x1 = 
LOAD(p + j); - y1 = LOAD(q + j); - - BUTTERFLY_CT(&x1, &y1); - - // Store back to memory - STORE(p + j, x1); - STORE(q + j, y1); - } - } -} - -/** - * Vectorized butterly CT on two-layers at a time - * - * For each quadruple - * (P, Q, R, S) = (buf[i], buf[i + m], buf[i + 2 * m], buf[i + 3 * m]) - * First layer: butterfly on (P, Q) and (R, S) for step = 2 * m - * coef r1 = W[start * n / (2 * m)] - * P = P + r1 * Q - * Q = P - r1 * Q - * R = R + r1 * S - * S = R - r1 * S - * Second layer: butterfly on (P, R) and (Q, S) for step = 4 * m - * coef r2 = W[start * n / (4 * m)] - * coef r3 = W[(start + m) * n / (4 * m)] - * P = P + r2 * R - * R = P - r2 * R - * Q = Q + r3 * S - * S = Q - r3 * S - * - * @param buf - working buffers - * @param r1 - coefficient for the 1st layer - * @param r2 - 1st coefficient for the 2nd layer - * @param r3 - 2nd coefficient for the 2nd layer - * @param start - index of buffer among `m` ones - * @param m - current group size - * @param len - number of vectors per buffer - * @param card - modulo cardinal - */ -inline void butterfly_ct_two_layers_step( - vec::Buffers& buf, - uint16_t r1, - uint16_t r2, - uint16_t r3, - unsigned start, - unsigned m, - size_t len, - uint16_t card) -{ - const unsigned step = m << 2; - m256i c1 = SET1(r1); - m256i c2 = SET1(r2); - m256i c3 = SET1(r3); - -#define BUTTERFLY_R1(c, x, y) \ - (EITHER( \ - r1 == 1, \ - BUTTERFLY_1(x, y, card), \ - EITHER( \ - r1 < card - 1, \ - BUTTERFLY_3(c, x, y, card), \ - BUTTERFLY_2(x, y, card)))); -#define BUTTERFLY_R2(c, x, y) \ - (EITHER( \ - r2 == 1, \ - BUTTERFLY_1(x, y, card), \ - EITHER( \ - r2 < card - 1, \ - BUTTERFLY_3(c, x, y, card), \ - BUTTERFLY_2(x, y, card)))); -#define BUTTERFLY_R3(c, x, y) \ - (EITHER( \ - r3 == 1, \ - BUTTERFLY_1(x, y, card), \ - EITHER( \ - r3 < card - 1, \ - BUTTERFLY_3(c, x, y, card), \ - BUTTERFLY_2(x, y, card)))); - - const size_t end = len - 1; - const unsigned bufs_nb = buf.get_n(); - // #pragma omp parallel for - // #pragma unroll - const 
std::vector& mem = buf.get_mem(); - for (unsigned i = start; i < bufs_nb; i += step) { - m256i x1, y1, u1, v1; - m256i x2, y2, u2, v2; - m256i* __restrict p = reinterpret_cast(mem[i]); - m256i* __restrict q = reinterpret_cast(mem[i + m]); - m256i* __restrict r = reinterpret_cast(mem[i + 2 * m]); - m256i* __restrict s = reinterpret_cast(mem[i + 3 * m]); - - // #pragma omp parallel for - size_t j = 0; - // #pragma unroll - for (; j < end; j += 2) { - // First layer (c1, x, y) & (c1, u, v) - x1 = LOAD(p + j); - y1 = LOAD(q + j); - x2 = LOAD(p + j + 1); - y2 = LOAD(q + j + 1); - - u1 = LOAD(r + j); - v1 = LOAD(s + j); - u2 = LOAD(r + j + 1); - v2 = LOAD(s + j + 1); - - BUTTERFLY_R1(c1, &x1, &y1); - BUTTERFLY_R1(c1, &x2, &y2); - - BUTTERFLY_R1(c1, &u1, &v1); - BUTTERFLY_R1(c1, &u2, &v2); - - // Second layer (c2, x, u) & (c3, y, v) - BUTTERFLY_R2(c2, &x1, &u1); - BUTTERFLY_R2(c2, &x2, &u2); - - BUTTERFLY_R3(c3, &y1, &v1); - BUTTERFLY_R3(c3, &y2, &v2); - - // Store back to memory - STORE(p + j, x1); - STORE(p + j + 1, x2); - STORE(q + j, y1); - STORE(q + j + 1, y2); - - STORE(r + j, u1); - STORE(r + j + 1, u2); - STORE(s + j, v1); - STORE(s + j + 1, v2); - } - for (; j < len; ++j) { - // First layer (c1, x, y) & (c1, u, v) - x1 = LOAD(p + j); - y1 = LOAD(q + j); - u1 = LOAD(r + j); - v1 = LOAD(s + j); - - BUTTERFLY_R1(c1, &x1, &y1); - BUTTERFLY_R1(c1, &u1, &v1); - // Second layer (c2, x, u) & (c3, y, v) - BUTTERFLY_R2(c2, &x1, &u1); - BUTTERFLY_R3(c3, &y1, &v1); - // Store back to memory - STORE(p + j, x1); - STORE(q + j, y1); - STORE(r + j, u1); - STORE(s + j, v1); - } - } -} - -/** - * Vectorized butterly GS step - * - * For each pair (P, Q) = (buf[i], buf[i + m]) for step = 2 * m and coef `r` - * P = P + Q - * Q = r * (P - Q) - * - * @param buf - working buffers - * @param r - coefficient - * @param start - index of buffer among `m` ones - * @param m - current group size - * @param len - number of vectors per buffer - * @param card - modulo cardinal - */ -inline void 
butterfly_gs_step( - vec::Buffers& buf, - uint16_t r, - unsigned start, - unsigned m, - size_t len, - uint16_t card) -{ - const unsigned step = m << 1; - m256i c = SET1(r); - -#define BUTTERFLY_GS(x, y) \ - (EITHER( \ - r == 1, \ - BUTTERFLY_1(x, y, card), \ - EITHER( \ - r < card - 1, \ - BUTTERFLY_5(c, x, y, card), \ - BUTTERFLY_4(x, y, card)))); - - const size_t end = len - 1; - const unsigned bufs_nb = buf.get_n(); - // #pragma omp parallel for - // #pragma unroll - const std::vector& mem = buf.get_mem(); - for (unsigned i = start; i < bufs_nb; i += step) { - m256i x1, y1; - m256i x2, y2; - m256i* __restrict p = reinterpret_cast(mem[i]); - m256i* __restrict q = reinterpret_cast(mem[i + m]); - - // #pragma omp parallel for - size_t j = 0; - // #pragma unroll - for (; j < end; j += 2) { - x1 = LOAD(p + j); - y1 = LOAD(q + j); - x2 = LOAD(p + j + 1); - y2 = LOAD(q + j + 1); - - BUTTERFLY_GS(&x1, &y1); - BUTTERFLY_GS(&x2, &y2); - - // Store back to memory - STORE(p + j, x1); - STORE(p + j + 1, x2); - STORE(q + j, y1); - STORE(q + j + 1, y2); - } - for (; j < len; ++j) { - x1 = LOAD(p + j); - y1 = LOAD(q + j); - - BUTTERFLY_GS(&x1, &y1); - - // Store back to memory - STORE(p + j, x1); - STORE(q + j, y1); - } - } -} - -/** - * Vectorized butterly GS step - * - * For each pair (P, Q) = (buf[i], buf[i + m]) for step = 2 * m and coef `r` - * Q = r * Q - * - * @param buf - working buffers - * @param r - coefficient - * @param start - index of buffer among `m` ones - * @param m - current group size - * @param len - number of vectors per buffer - * @param card - modulo cardinal - */ -inline void butterfly_gs_step_simple( - vec::Buffers& buf, - uint16_t r, - unsigned start, - unsigned m, - size_t len, - uint16_t card) -{ - const unsigned step = m << 1; - m256i c = SET1(r); - -#define BUTTERFLY_GS_S(x) \ - (EITHER( \ - r == 1, \ - (x), \ - EITHER(r < card - 1, MUL_MOD(c, x, card), NEG_MOD(x, card)))); - - const size_t end = len - 1; - const unsigned bufs_nb = buf.get_n(); - 
// #pragma omp parallel for - // #pragma unroll - const std::vector& mem = buf.get_mem(); - for (unsigned i = start; i < bufs_nb; i += step) { - m256i x1, y1; - m256i x2, y2; - m256i* __restrict p = reinterpret_cast(mem[i]); - m256i* __restrict q = reinterpret_cast(mem[i + m]); - - // #pragma omp parallel for - size_t j = 0; - // #pragma unroll - for (; j < end; j += 2) { - x1 = LOAD(p + j); - x2 = LOAD(p + j + 1); - - y1 = BUTTERFLY_GS_S(x1); - y2 = BUTTERFLY_GS_S(x2); - - // Store back to memory - STORE(q + j, y1); - STORE(q + j + 1, y2); - } - for (; j < len; ++j) { - x1 = LOAD(p + j); - - y1 = BUTTERFLY_GS_S(x1); - - // Store back to memory - STORE(q + j, y1); - } - } -} - -inline void add_props_16( - Properties& props, - m256i threshold, - m256i mask, - m256i symb, - off_t offset) -{ - const m256i b = CMPEQ16(threshold, symb); - const m256i c = AND(mask, b); - uint32_t d = MVMSK8(c); - const unsigned element_size = sizeof(uint16_t); - while (d > 0) { - unsigned byte_idx = __builtin_ctz(d); - off_t _offset = offset + byte_idx / element_size; - props.add(_offset, OOR_MARK); - d ^= 1 << byte_idx; - } -} - -inline void encode_post_process( - vec::Buffers& output, - std::vector& props, - off_t offset, - unsigned code_len, - uint16_t threshold, - size_t vecs_nb) -{ - const unsigned element_size = sizeof(uint16_t); - const unsigned vec_size = ALIGN_SIZE / element_size; - const uint16_t max = 1 << (element_size * 8 - 1); - const m256i _threshold = SET1(threshold); - const m256i mask_hi = SET1(max); - - // #pragma unroll - const std::vector& mem = output.get_mem(); - for (unsigned frag_id = 0; frag_id < code_len; ++frag_id) { - m256i* __restrict buf = reinterpret_cast(mem[frag_id]); - - size_t vec_id = 0; - size_t end = vecs_nb - 3; - // #pragma unroll - for (; vec_id < end; vec_id += 4) { - m256i a1 = LOAD(buf + vec_id); - m256i a2 = LOAD(buf + vec_id + 1); - m256i a3 = LOAD(buf + vec_id + 2); - m256i a4 = LOAD(buf + vec_id + 3); - - if (TESTZ(a1, _threshold) == 0) { 
- const off_t curr_offset = offset + vec_id * vec_size; - add_props_16( - props[frag_id], _threshold, mask_hi, a1, curr_offset); - } - if (TESTZ(a2, _threshold) == 0) { - const off_t curr_offset = offset + (vec_id + 1) * vec_size; - add_props_16( - props[frag_id], _threshold, mask_hi, a2, curr_offset); - } - if (TESTZ(a3, _threshold) == 0) { - const off_t curr_offset = offset + (vec_id + 2) * vec_size; - add_props_16( - props[frag_id], _threshold, mask_hi, a3, curr_offset); - } - if (TESTZ(a4, _threshold) == 0) { - const off_t curr_offset = offset + (vec_id + 3) * vec_size; - add_props_16( - props[frag_id], _threshold, mask_hi, a4, curr_offset); - } - } - for (; vec_id < vecs_nb; ++vec_id) { - m256i a = LOAD(buf + vec_id); - uint16_t c = TESTZ(a, _threshold); - if (c == 0) { - const off_t curr_offset = offset + vec_id * vec_size; - add_props_16( - props[frag_id], _threshold, mask_hi, a, curr_offset); - } - } - } -} - -/* ==================== Operations =================== */ -/** Perform a multiplication of a coefficient `a` to each element of `src` and - * add result to correspondent element of `dest` - * - * @note: 1 < `a` < card - 1 - */ -inline void mul_coef_to_buf( - const uint16_t a, - aint16* src, - aint16* dest, - size_t len, - uint16_t card) -{ - const m256i coef = SET1(a); - - m256i* __restrict _src = reinterpret_cast(src); - m256i* __restrict _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform multiplication - _dest[i] = MUL_MOD(coef, _src[i], card); - } - if (_last_len > 0) { - uint32_t coef_32 = (uint32_t)a; - for (i = _len * ratio; i < len; i++) { - // perform multiplication - dest[i] = (aint16)((coef_32 * src[i]) % card); - } - } -} - -inline void add_two_bufs(aint16* src, aint16* dest, size_t len, aint16 card) -{ - m256i* __restrict _src = reinterpret_cast(src); - m256i* 
__restrict _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform addition - _dest[i] = ADD_MOD(_src[i], _dest[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform addition - aint16 tmp = src[i] + dest[i]; - dest[i] = (tmp >= card) ? (tmp - card) : tmp; - } - } -} - -inline void -sub_two_bufs(aint16* bufa, aint16* bufb, aint16* res, size_t len, aint16 card) -{ - m256i* __restrict _bufa = reinterpret_cast(bufa); - m256i* __restrict _bufb = reinterpret_cast(bufb); - m256i* __restrict _res = reinterpret_cast(res); - const unsigned ratio = sizeof(*_bufa) / sizeof(*bufa); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform subtraction - _res[i] = SUB_MOD(_bufa[i], _bufb[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform subtraction - if (bufa[i] >= bufb[i]) - res[i] = bufa[i] - bufb[i]; - else - res[i] = card - (bufb[i] - bufa[i]); - } - } -} - -inline void mul_two_bufs(aint16* src, aint16* dest, size_t len, aint16 card) -{ - m256i* __restrict _src = reinterpret_cast(src); - m256i* __restrict _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform multiplicaton - _dest[i] = MULFULL_MOD(_src[i], _dest[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform multiplicaton - dest[i] = uint16_t((uint64_t(src[i]) * dest[i]) % card); - } - } -} - -/** Apply an element-wise negation to a buffer - */ -inline void neg(size_t len, aint16* buf, aint16 card) -{ - m256i* _buf = reinterpret_cast(buf); - unsigned ratio = sizeof(*_buf) / 
sizeof(*buf); - size_t _len = len / ratio; - size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - _buf[i] = NEG_MOD(_buf[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - if (buf[i]) - buf[i] = card - buf[i]; - } - } -} - -} // namespace simd -} // namespace quadiron - -#endif diff --git a/src/simd_256_u32.h b/src/simd_256_u32.h deleted file mode 100644 index 5302a472..00000000 --- a/src/simd_256_u32.h +++ /dev/null @@ -1,839 +0,0 @@ -/* - * Copyright 2017-2018 Scality - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __QUAD_SIMD_256_U32_H__ -#define __QUAD_SIMD_256_U32_H__ - -#include - -#include "simd/simd.h" - -namespace quadiron { -namespace simd { - -#define F4_u32 _mm256_set1_epi32(65537) -#define F4m1_u32 _mm256_set1_epi32(65536) -#define F3_u32 _mm256_set1_epi32(257) -#define F3m1_u32 _mm256_set1_epi32(256) - -#define CARD(q) (EITHER(q == F3, F3_u32, F4_u32)) -#define CARD_M_1(q) (EITHER(q == F3, F3m1_u32, F4m1_u32)) - -/* ==================== Essential Operations =================== */ -// Following functions are used for AVX2 w/ u32 only - -inline m256i SET1(uint32_t val) -{ - return _mm256_set1_epi32(val); -} -inline m256i ADD32(m256i x, m256i y) -{ - return _mm256_add_epi32(x, y); -} -inline m256i SUB32(m256i x, m256i y) -{ - return _mm256_sub_epi32(x, y); -} -inline m256i MUL32(m256i x, m256i y) -{ - return _mm256_mullo_epi32(x, y); -} - -inline m256i CMPEQ32(m256i x, m256i y) -{ - return _mm256_cmpeq_epi32(x, y); -} -inline m256i CMPGT32(m256i x, m256i y) -{ - return _mm256_cmpgt_epi32(x, y); -} -inline m256i MINU32(m256i x, m256i y) -{ - return _mm256_min_epu32(x, y); -} -#define BLEND16(x, y, imm8) (_mm256_blend_epi16(x, y, imm8)) - -// z = x + y mod q -// Input are loaded to registers -// Output is register -inline m256i ADD_MOD(m256i x, m256i y, uint32_t q) -{ - m256i res = ADD32(x, y); - return MINU32(res, SUB32(res, CARD(q))); -} - -// z = x - y mod q => z = q + x - y mod q -// Input are loaded to registers -// 
Output is register -inline m256i SUB_MOD(m256i x, m256i y, uint32_t q) -{ - m256i res = SUB32(x, y); - return MINU32(res, ADD32(res, CARD(q))); -} - -// y = 0 - x mod q => y = q - x mod q -// Input are loaded to registers -// Output is register -inline m256i NEG_MOD(m256i x, uint32_t q) -{ - m256i res = SUB32(CARD(q), x); - return MINU32(res, SUB32(res, CARD(q))); -} - -// z = x * y mod q -// Input are loaded to registers -// Output is register -// Note: we assume that at least `x` or `y` is less than `q-1` so it's -// not necessary to verify overflow on multiplying elements -inline m256i MUL_MOD(m256i x, m256i y, uint32_t q) -{ - m256i res = MUL32(x, y); - m256i lo = BLEND16(ZERO, res, 0x55); - m256i hi = BLEND16(ZERO, SHIFTR_2(res), 0x55); - return SUB_MOD(lo, hi, q); -} - -inline void MUL_MOD(m256i x, m256i y, m256i* z, uint32_t q) -{ - m256i res = MUL32(x, y); - m256i lo = BLEND16(ZERO, res, 0x55); - m256i hi = BLEND16(ZERO, SHIFTR_2(res), 0x55); - *z = SUB_MOD(lo, hi, q); -} -// z = x * y mod q -// Input are loaded to registers -// Output is register -inline m256i MULFULL_MOD(m256i x, m256i y, uint32_t q) -{ - m256i res = MUL32(x, y); - - // filter elements of both of a & b = card-1 - m256i cmp = AND(CMPEQ32(x, CARD_M_1(q)), CMPEQ32(y, CARD_M_1(q))); - res = (q == F3) ? 
XOR(res, AND(F4_u32, cmp)) : ADD32(res, AND(ONE, cmp)); - - m256i lo = BLEND16(ZERO, res, 0x55); - m256i hi = SHIFTR_2(BLEND16(ZERO, res, 0xAA)); - return SUB_MOD(lo, hi, q); -} - -// butterfly CT with r == 1 -inline void BUTTERFLY_1(m256i* x, m256i* y, uint32_t q) -{ - m256i add = ADD_MOD(*x, *y, q); - *y = SUB_MOD(*x, *y, q); - *x = add; -} - -// butterfly CT with r == q - 1 -inline void BUTTERFLY_2(m256i* x, m256i* y, uint32_t q) -{ - m256i add = ADD_MOD(*x, *y, q); - *x = SUB_MOD(*x, *y, q); - *y = add; -} - -// butterfly CT with 1 < r < q - 1 -inline void BUTTERFLY_3(m256i c, m256i* x, m256i* y, uint32_t q) -{ - m256i z = MUL_MOD(c, *y, q); - *y = SUB_MOD(*x, z, q); - *x = ADD_MOD(*x, z, q); -} - -// butterfly GS w/ r = q - 1 -inline void BUTTERFLY_4(m256i* x, m256i* y, uint32_t q) -{ - m256i add = ADD_MOD(*x, *y, q); - *y = SUB_MOD(*y, *x, q); - *x = add; -} - -// butterfly GS w/ 1 < r < q - 1 -// x = x + y mod q -// y = z * (x - y) mod q -inline void BUTTERFLY_5(m256i c, m256i* x, m256i* y, uint32_t q) -{ - m256i sub = SUB_MOD(*x, *y, q); - *x = ADD_MOD(*x, *y, q); - *y = MUL_MOD(c, sub, q); -} - -/** - * Vectorized butterly CT step - * - * For each pair (P, Q) = (buf[i], buf[i + m]) for step = 2 * m and coef `r` - * P = P + r * Q - * Q = P - r * Q - * - * @param buf - working buffers - * @param r - coefficient - * @param start - index of buffer among `m` ones - * @param m - current group size - * @param len - number of vectors per buffer - * @param card - modulo cardinal - */ -inline void butterfly_ct_step( - vec::Buffers& buf, - uint32_t r, - unsigned start, - unsigned m, - size_t len, - uint32_t card) -{ - const unsigned step = m << 1; - m256i c = SET1(r); - -#define BUTTERFLY_CT(x, y) \ - (EITHER( \ - r == 1, \ - BUTTERFLY_1(x, y, card), \ - EITHER( \ - r < card - 1, \ - BUTTERFLY_3(c, x, y, card), \ - BUTTERFLY_2(x, y, card)))); - - const size_t end = len - 1; - const unsigned bufs_nb = buf.get_n(); - // #pragma omp parallel for - // #pragma unroll - 
const std::vector& mem = buf.get_mem(); - for (unsigned i = start; i < bufs_nb; i += step) { - m256i x1, y1; - m256i x2, y2; - m256i* __restrict p = reinterpret_cast(mem[i]); - m256i* __restrict q = reinterpret_cast(mem[i + m]); - - // #pragma omp parallel for - size_t j = 0; - // #pragma unroll - for (; j < end; j += 2) { - x1 = LOAD(p + j); - y1 = LOAD(q + j); - x2 = LOAD(p + j + 1); - y2 = LOAD(q + j + 1); - - BUTTERFLY_CT(&x1, &y1); - BUTTERFLY_CT(&x2, &y2); - - // Store back to memory - STORE(p + j, x1); - STORE(p + j + 1, x2); - STORE(q + j, y1); - STORE(q + j + 1, y2); - } - for (; j < len; ++j) { - x1 = LOAD(p + j); - y1 = LOAD(q + j); - - BUTTERFLY_CT(&x1, &y1); - - // Store back to memory - STORE(p + j, x1); - STORE(q + j, y1); - } - } -} - -/** - * Vectorized butterly CT on two-layers at a time - * - * For each quadruple - * (P, Q, R, S) = (buf[i], buf[i + m], buf[i + 2 * m], buf[i + 3 * m]) - * First layer: butterfly on (P, Q) and (R, S) for step = 2 * m - * coef r1 = W[start * n / (2 * m)] - * P = P + r1 * Q - * Q = P - r1 * Q - * R = R + r1 * S - * S = R - r1 * S - * Second layer: butterfly on (P, R) and (Q, S) for step = 4 * m - * coef r2 = W[start * n / (4 * m)] - * coef r3 = W[(start + m) * n / (4 * m)] - * P = P + r2 * R - * R = P - r2 * R - * Q = Q + r3 * S - * S = Q - r3 * S - * - * @param buf - working buffers - * @param r1 - coefficient for the 1st layer - * @param r2 - 1st coefficient for the 2nd layer - * @param r3 - 2nd coefficient for the 2nd layer - * @param start - index of buffer among `m` ones - * @param m - current group size - * @param len - number of vectors per buffer - * @param card - modulo cardinal - */ -inline void butterfly_ct_two_layers_step( - vec::Buffers& buf, - uint32_t r1, - uint32_t r2, - uint32_t r3, - unsigned start, - unsigned m, - size_t len, - uint32_t card) -{ - const unsigned step = m << 2; - m256i c1 = SET1(r1); - m256i c2 = SET1(r2); - m256i c3 = SET1(r3); - -#define BUTTERFLY_R1(c, x, y) \ - (EITHER( \ - r1 == 
1, \ - BUTTERFLY_1(x, y, card), \ - EITHER( \ - r1 < card - 1, \ - BUTTERFLY_3(c, x, y, card), \ - BUTTERFLY_2(x, y, card)))); -#define BUTTERFLY_R2(c, x, y) \ - (EITHER( \ - r2 == 1, \ - BUTTERFLY_1(x, y, card), \ - EITHER( \ - r2 < card - 1, \ - BUTTERFLY_3(c, x, y, card), \ - BUTTERFLY_2(x, y, card)))); -#define BUTTERFLY_R3(c, x, y) \ - (EITHER( \ - r3 == 1, \ - BUTTERFLY_1(x, y, card), \ - EITHER( \ - r3 < card - 1, \ - BUTTERFLY_3(c, x, y, card), \ - BUTTERFLY_2(x, y, card)))); - - const size_t end = len - 1; - const unsigned bufs_nb = buf.get_n(); - // #pragma omp parallel for - // #pragma unroll - const std::vector& mem = buf.get_mem(); - for (unsigned i = start; i < bufs_nb; i += step) { - m256i x1, y1, u1, v1; - m256i x2, y2, u2, v2; - m256i* __restrict p = reinterpret_cast(mem[i]); - m256i* __restrict q = reinterpret_cast(mem[i + m]); - m256i* __restrict r = reinterpret_cast(mem[i + 2 * m]); - m256i* __restrict s = reinterpret_cast(mem[i + 3 * m]); - - // #pragma omp parallel for - size_t j = 0; - // #pragma unroll - for (; j < end; j += 2) { - // First layer (c1, x, y) & (c1, u, v) - x1 = LOAD(p + j); - y1 = LOAD(q + j); - x2 = LOAD(p + j + 1); - y2 = LOAD(q + j + 1); - - u1 = LOAD(r + j); - v1 = LOAD(s + j); - u2 = LOAD(r + j + 1); - v2 = LOAD(s + j + 1); - - BUTTERFLY_R1(c1, &x1, &y1); - BUTTERFLY_R1(c1, &x2, &y2); - - BUTTERFLY_R1(c1, &u1, &v1); - BUTTERFLY_R1(c1, &u2, &v2); - - // Second layer (c2, x, u) & (c3, y, v) - BUTTERFLY_R2(c2, &x1, &u1); - BUTTERFLY_R2(c2, &x2, &u2); - - BUTTERFLY_R3(c3, &y1, &v1); - BUTTERFLY_R3(c3, &y2, &v2); - - // Store back to memory - STORE(p + j, x1); - STORE(p + j + 1, x2); - STORE(q + j, y1); - STORE(q + j + 1, y2); - - STORE(r + j, u1); - STORE(r + j + 1, u2); - STORE(s + j, v1); - STORE(s + j + 1, v2); - } - for (; j < len; ++j) { - // First layer (c1, x, y) & (c1, u, v) - x1 = LOAD(p + j); - y1 = LOAD(q + j); - u1 = LOAD(r + j); - v1 = LOAD(s + j); - - BUTTERFLY_R1(c1, &x1, &y1); - BUTTERFLY_R1(c1, &u1, &v1); - 
// Second layer (c2, x, u) & (c3, y, v) - BUTTERFLY_R2(c2, &x1, &u1); - BUTTERFLY_R3(c3, &y1, &v1); - // Store back to memory - STORE(p + j, x1); - STORE(q + j, y1); - STORE(r + j, u1); - STORE(s + j, v1); - } - } -} - -/** - * Vectorized butterly GS step - * - * For each pair (P, Q) = (buf[i], buf[i + m]) for step = 2 * m and coef `r` - * P = P + Q - * Q = r * (P - Q) - * - * @param buf - working buffers - * @param r - coefficient - * @param start - index of buffer among `m` ones - * @param m - current group size - * @param len - number of vectors per buffer - * @param card - modulo cardinal - */ -inline void butterfly_gs_step( - vec::Buffers& buf, - uint32_t r, - unsigned start, - unsigned m, - size_t len, - uint32_t card) -{ - const unsigned step = m << 1; - m256i c = SET1(r); - -#define BUTTERFLY_GS(x, y) \ - (EITHER( \ - r == 1, \ - BUTTERFLY_1(x, y, card), \ - EITHER( \ - r < card - 1, \ - BUTTERFLY_5(c, x, y, card), \ - BUTTERFLY_4(x, y, card)))); - - const size_t end = len - 3; - const unsigned bufs_nb = buf.get_n(); - // #pragma omp parallel for - // #pragma unroll - const std::vector& mem = buf.get_mem(); - for (unsigned i = start; i < bufs_nb; i += step) { - m256i x1, x2, x3, x4; - m256i y1, y2, y3, y4; - m256i* __restrict p = reinterpret_cast(mem[i]); - m256i* __restrict q = reinterpret_cast(mem[i + m]); - - // #pragma omp parallel for - size_t j = 0; - // #pragma unroll - for (; j < end; j += 4) { - x1 = LOAD(p + j); - x2 = LOAD(p + j + 1); - x3 = LOAD(p + j + 2); - x4 = LOAD(p + j + 3); - y1 = LOAD(q + j); - y2 = LOAD(q + j + 1); - y3 = LOAD(q + j + 2); - y4 = LOAD(q + j + 3); - - BUTTERFLY_GS(&x1, &y1); - BUTTERFLY_GS(&x2, &y2); - BUTTERFLY_GS(&x3, &y3); - BUTTERFLY_GS(&x4, &y4); - - // Store back to memory - STORE(p + j, x1); - STORE(p + j + 1, x2); - STORE(p + j + 2, x3); - STORE(p + j + 3, x4); - STORE(q + j, y1); - STORE(q + j + 1, y2); - STORE(q + j + 2, y3); - STORE(q + j + 3, y4); - } - for (; j < len; ++j) { - x1 = LOAD(p + j); - y1 = LOAD(q 
+ j); - - BUTTERFLY_GS(&x1, &y1); - - // Store back to memory - STORE(p + j, x1); - STORE(q + j, y1); - } - } -} - -/** - * Vectorized butterly GS step - * - * For each pair (P, Q) = (buf[i], buf[i + m]) for step = 2 * m and coef `r` - * Q = r * P - * - * @param buf - working buffers - * @param r - coefficient - * @param start - index of buffer among `m` ones - * @param m - current group size - * @param len - number of vectors per buffer - * @param card - modulo cardinal - */ -inline void butterfly_gs_step_simple( - vec::Buffers& buf, - uint32_t r, - unsigned start, - unsigned m, - size_t len, - uint32_t card) -{ - const unsigned step = m << 1; - m256i c = SET1(r); - -#define BUTTERFLY_GS_S(x) \ - (EITHER( \ - r == 1, \ - (x), \ - EITHER(r < card - 1, MUL_MOD(c, x, card), NEG_MOD(x, card)))); - - const size_t end = len - 1; - const unsigned bufs_nb = buf.get_n(); - // #pragma omp parallel for - // #pragma unroll - const std::vector& mem = buf.get_mem(); - for (unsigned i = start; i < bufs_nb; i += step) { - m256i x1, y1; - m256i x2, y2; - m256i* __restrict p = reinterpret_cast(mem[i]); - m256i* __restrict q = reinterpret_cast(mem[i + m]); - - // #pragma omp parallel for - size_t j = 0; - // #pragma unroll - for (; j < end; j += 2) { - x1 = LOAD(p + j); - x2 = LOAD(p + j + 1); - - y1 = BUTTERFLY_GS_S(x1); - y2 = BUTTERFLY_GS_S(x2); - - // Store back to memory - STORE(q + j, y1); - STORE(q + j + 1, y2); - } - for (; j < len; ++j) { - x1 = LOAD(p + j); - - y1 = BUTTERFLY_GS_S(x1); - - // Store back to memory - STORE(q + j, y1); - } - } -} - -inline void add_props( - Properties& props, - m256i threshold, - m256i mask, - m256i symb, - off_t offset) -{ - const m256i b = CMPEQ32(threshold, symb); - const m256i c = AND(mask, b); - uint32_t d = MVMSK8(c); - const unsigned element_size = sizeof(uint32_t); - while (d > 0) { - unsigned byte_idx = __builtin_ctz(d); - off_t _offset = offset + byte_idx / element_size; - props.add(_offset, OOR_MARK); - d ^= 1 << byte_idx; - } -} - 
-inline void encode_post_process( - vec::Buffers& output, - std::vector& props, - off_t offset, - unsigned code_len, - uint32_t threshold, - size_t vecs_nb) -{ - const unsigned element_size = sizeof(uint32_t); - const unsigned vec_size = ALIGN_SIZE / element_size; - const uint32_t max = 1 << (element_size * 8 - 1); - const m256i _threshold = SET1(threshold); - const m256i mask_hi = SET1(max); - - // #pragma unroll - const std::vector& mem = output.get_mem(); - for (unsigned frag_id = 0; frag_id < code_len; ++frag_id) { - m256i* __restrict buf = reinterpret_cast(mem[frag_id]); - - size_t vec_id = 0; - size_t end = vecs_nb - 3; - // #pragma unroll - for (; vec_id < end; vec_id += 4) { - m256i a1 = LOAD(buf + vec_id); - m256i a2 = LOAD(buf + vec_id + 1); - m256i a3 = LOAD(buf + vec_id + 2); - m256i a4 = LOAD(buf + vec_id + 3); - - if (TESTZ(a1, _threshold) == 0) { - const off_t curr_offset = offset + vec_id * vec_size; - add_props(props[frag_id], _threshold, mask_hi, a1, curr_offset); - } - if (TESTZ(a2, _threshold) == 0) { - const off_t curr_offset = offset + (vec_id + 1) * vec_size; - add_props(props[frag_id], _threshold, mask_hi, a2, curr_offset); - } - if (TESTZ(a3, _threshold) == 0) { - const off_t curr_offset = offset + (vec_id + 2) * vec_size; - add_props(props[frag_id], _threshold, mask_hi, a3, curr_offset); - } - if (TESTZ(a4, _threshold) == 0) { - const off_t curr_offset = offset + (vec_id + 3) * vec_size; - add_props(props[frag_id], _threshold, mask_hi, a4, curr_offset); - } - } - for (; vec_id < vecs_nb; ++vec_id) { - m256i a = LOAD(buf + vec_id); - uint32_t c = TESTZ(a, _threshold); - if (c == 0) { - const off_t curr_offset = offset + vec_id * vec_size; - add_props(props[frag_id], _threshold, mask_hi, a, curr_offset); - } - } - } -} - -/* ==================== Operations =================== */ -/** Perform a multiplication of a coefficient `a` to each element of `src` and - * add result to correspondent element of `dest` - * - * @note: 1 < `a` < card - 1 - 
*/ -inline void mul_coef_to_buf( - const uint32_t a, - aint32* src, - aint32* dest, - size_t len, - uint32_t card) -{ - const m256i coef = SET1(a); - - m256i* __restrict _src = reinterpret_cast(src); - m256i* __restrict _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i = 0; - size_t end = _len - 3; - for (; i < end; i += 4) { - // perform multiplication - MUL_MOD(coef, _src[i], _dest + i, card); - MUL_MOD(coef, _src[i + 1], _dest + i + 1, card); - MUL_MOD(coef, _src[i + 2], _dest + i + 2, card); - MUL_MOD(coef, _src[i + 3], _dest + i + 3, card); - } - for (; i < _len; ++i) { - MUL_MOD(coef, _src[i], _dest + i, card); - } - - if (_last_len > 0) { - uint64_t coef_64 = (uint64_t)a; - for (size_t i = _len * ratio; i < len; i++) { - // perform multiplication - dest[i] = (aint32)((coef_64 * src[i]) % card); - } - } -} - -inline void add_two_bufs(aint32* src, aint32* dest, size_t len, aint32 card) -{ - m256i* __restrict _src = reinterpret_cast(src); - m256i* __restrict _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform addition - _dest[i] = ADD_MOD(_src[i], _dest[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform addition - aint32 tmp = src[i] + dest[i]; - dest[i] = (tmp >= card) ? 
(tmp - card) : tmp; - } - } -} - -inline void sub_two_bufs( - aint32* bufa, - aint32* bufb, - aint32* res, - size_t len, - aint32 card = F4) -{ - m256i* __restrict _bufa = reinterpret_cast(bufa); - m256i* __restrict _bufb = reinterpret_cast(bufb); - m256i* __restrict _res = reinterpret_cast(res); - const unsigned ratio = sizeof(*_bufa) / sizeof(*bufa); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform subtraction - _res[i] = SUB_MOD(_bufa[i], _bufb[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform subtraction - if (bufa[i] >= bufb[i]) - res[i] = bufa[i] - bufb[i]; - else - res[i] = card - (bufb[i] - bufa[i]); - } - } -} - -inline void mul_two_bufs(aint32* src, aint32* dest, size_t len, aint32 card) -{ - m256i* __restrict _src = reinterpret_cast(src); - m256i* __restrict _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform multiplicaton - _dest[i] = MULFULL_MOD(_src[i], _dest[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform multiplicaton - dest[i] = uint32_t((uint64_t(src[i]) * dest[i]) % card); - } - } -} - -/** Apply an element-wise negation to a buffer - */ -inline void neg(size_t len, aint32* buf, aint32 card = F4) -{ - m256i* _buf = reinterpret_cast(buf); - unsigned ratio = sizeof(*_buf) / sizeof(*buf); - size_t _len = len / ratio; - size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - _buf[i] = NEG_MOD(_buf[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - if (buf[i]) - buf[i] = card - buf[i]; - } - } -} - -/* ==================== Operations for NF4 =================== */ -typedef __m128i m128i; - -/** Return aint128 integer from a _m128i register */ 
-inline aint128 m256i_to_uint128(m256i v) -{ - aint128 hi, lo; - _mm256_storeu2_m128i((m128i*)&hi, (m128i*)&lo, v); - return lo; // NOLINT(clang-analyzer-core.uninitialized.UndefReturn) -} - -inline __uint128_t add(__uint128_t a, __uint128_t b) -{ - m256i _a = _mm256_castsi128_si256((m128i)a); - m256i _b = _mm256_castsi128_si256((m128i)b); - m256i res = ADD_MOD(_a, _b, F4); - return m256i_to_uint128(res); -} - -inline __uint128_t sub(__uint128_t a, __uint128_t b) -{ - m256i _a = _mm256_castsi128_si256((m128i)a); - m256i _b = _mm256_castsi128_si256((m128i)b); - m256i res = SUB_MOD(_a, _b, F4); - return m256i_to_uint128(res); -} - -inline __uint128_t mul(__uint128_t a, __uint128_t b) -{ - m256i _a = _mm256_castsi128_si256((m128i)a); - m256i _b = _mm256_castsi128_si256((m128i)b); - m256i res = MULFULL_MOD(_a, _b, F4); - return m256i_to_uint128(res); -} - -/** Store low 128-bit part of `reg` to memory */ -inline void store_low(aint128* address, m256i reg) -{ - _mm_store_si128((m128i*)address, _mm256_castsi256_si128(reg)); -} - -inline void hadamard_mul(int n, aint128* _x, aint128* _y) -{ - int i; - m256i* x = reinterpret_cast(_x); - m256i* y = reinterpret_cast(_y); - - const unsigned ratio = sizeof(*x) / sizeof(*_x); - const int len_256 = n / ratio; - const int last_len = n - len_256 * ratio; - - // multiply y to the first half of `x` - for (i = 0; i < len_256; i++) { - x[i] = MULFULL_MOD(x[i], y[i], F4); - } - - if (last_len > 0) { - // add last _y[] to x - for (i = len_256 * ratio; i < n; i++) { - m256i _x_p = _mm256_castsi128_si256((m128i)_x[i]); - m256i _y_p = _mm256_castsi128_si256((m128i)_y[i]); - - store_low(_x + i, mul(_x_p, _y_p, F4)); - } - } -} - -} // namespace simd -} // namespace quadiron - -#endif From 559f733ad14d3cb7e6968a1b9a0ae57132f65bf4 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Fri, 5 Oct 2018 13:42:45 +0200 Subject: [PATCH 08/77] SIMD 128 u16 u32: remove useless files --- src/simd_128_u16.h | 312 ---------------------- src/simd_128_u32.h | 
639 --------------------------------------------- 2 files changed, 951 deletions(-) delete mode 100644 src/simd_128_u16.h delete mode 100644 src/simd_128_u32.h diff --git a/src/simd_128_u16.h b/src/simd_128_u16.h deleted file mode 100644 index e13d8756..00000000 --- a/src/simd_128_u16.h +++ /dev/null @@ -1,312 +0,0 @@ -/* - * Copyright 2017-2018 Scality - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ - -#ifndef __QUAD_SIMD_128_U16_H__ -#define __QUAD_SIMD_128_U16_H__ - -#include - -#include "property.h" -#include "simd/simd.h" - -namespace quadiron { -namespace simd { - -/* ==================== Essential Operations =================== */ - -/** Perform a%card where a is a addition of two numbers whose elements are - * symbols of GF(card) */ -inline m128i mod_after_add(m128i a, aint16 card) -{ - const m128i _card = _mm_set1_epi16(card); - const m128i _card_minus_1 = _mm_set1_epi16(card - 1); - - m128i cmp = _mm_cmpgt_epi16(a, _card_minus_1); - m128i b = _mm_sub_epi16(a, _mm_and_si128(_card, cmp)); - - return b; -} - -/** Perform addition of two numbers a, b whose elements are of GF(card) */ -inline m128i add(m128i a, m128i b, aint16 card) -{ - m128i _a = _mm_load_si128(&a); - m128i _b = _mm_load_si128(&b); - m128i c = _mm_add_epi16(_a, _b); - - // Modulo - return mod_after_add(c, card); -} - -/** Perform subtraction of a by b where a, b whose elements are symbols of - * GF(card) - * sub(a, b) = a - b if a >= b, or - * card + a - b, otherwise - */ -inline m128i sub(m128i a, m128i b, aint16 card) -{ - const m128i _card = _mm_set1_epi16(card); - - m128i _a = _mm_load_si128(&a); - m128i _b = _mm_load_si128(&b); - - m128i cmp = _mm_cmpgt_epi16(_b, _a); - m128i _a1 = _mm_add_epi16(_a, _mm_and_si128(_card, cmp)); - - return _mm_sub_epi16(_a1, _b); -} - -/** Negate `a` - * @return 0 if (a == 0), else card - a - */ -inline m128i neg(m128i a, aint16 card = F3) -{ - const m128i _card = _mm_set1_epi16(card); - m128i _a = _mm_load_si128(&a); - m128i _b = _mm_setzero_si128(); - - m128i cmp = _mm_cmpgt_epi16(_a, _b); - - return _mm_sub_epi16(_mm_and_si128(cmp, _card), _a); -} - -inline m128i mod_after_multiply(m128i a) -{ - const m128i mask = _mm_set1_epi16(F3 - 2); - - m128i lo = _mm_and_si128(a, mask); - - m128i a_shift = _mm_srli_si128(a, 1); - m128i hi = _mm_and_si128(a_shift, mask); - - m128i cmp = _mm_cmpgt_epi16(hi, lo); - m128i _lo = _mm_add_epi16(lo, 
_mm_and_si128(F3_m128i_u16, cmp)); - - return _mm_sub_epi16(_lo, hi); -} - -inline m128i mul(m128i a, m128i b) -{ - m128i _a = _mm_load_si128(&a); - m128i _b = _mm_load_si128(&b); - - m128i c = _mm_mullo_epi16(_a, _b); - - // filter elements of both of a & b = card-1 - m128i cmp = _mm_and_si128( - _mm_cmpeq_epi16(_a, F3minus1_m128i_u16), - _mm_cmpeq_epi16(_b, F3minus1_m128i_u16)); - - const m128i one = _mm_set1_epi16(1); - c = _mm_add_epi16(c, _mm_and_si128(one, cmp)); - - // Modulo - return mod_after_multiply(c); -} - -/** Perform multiplication of two numbers a, b whose elements are of GF(card) - * where `card` is a prime Fermat number, i.e. card = Fx with x < 5 - * Currently, it supports only for F3 - */ -inline m128i mul(m128i a, m128i b, aint16 card) -{ - // FIXME: generalize card - assert(card == F3); - return mul(a, b); -} - -/** Apply an element-wise negation to a buffer - */ -inline void neg(size_t len, aint16* buf, aint16 card = F3) -{ - m128i* _buf = reinterpret_cast(buf); - unsigned ratio = sizeof(*_buf) / sizeof(*buf); - size_t _len = len / ratio; - size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - _buf[i] = neg(_buf[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - if (buf[i]) - buf[i] = card - buf[i]; - } - } -} - -/** Perform a multiplication of a coefficient `a` to each element of `src` and - * add result to correspondent element of `dest` - */ -inline void mul_coef_to_buf( - const aint16 a, - aint16* src, - aint16* dest, - size_t len, - aint16 card = F3) -{ - const m128i coef = _mm_set1_epi16(a); - - m128i* _src = reinterpret_cast(src); - m128i* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform multiplication - _dest[i] = mul(coef, _src[i], card); - } - if (_last_len > 0) { - uint32_t coef_doubled = 
(uint32_t)a; - for (i = _len * ratio; i < len; i++) { - // perform multiplication - dest[i] = (aint16)((coef_doubled * src[i]) % card); - } - } -} - -inline void -add_two_bufs(aint16* src, aint16* dest, size_t len, aint16 card = F3) -{ - m128i* _src = reinterpret_cast(src); - m128i* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform addition - _dest[i] = add(_src[i], _dest[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform addition - aint16 tmp = src[i] + dest[i]; - dest[i] = (tmp >= card) ? (tmp - card) : tmp; - } - } -} - -inline void sub_two_bufs( - aint16* bufa, - aint16* bufb, - aint16* res, - size_t len, - aint16 card = F3) -{ - m128i* _bufa = reinterpret_cast(bufa); - m128i* _bufb = reinterpret_cast(bufb); - m128i* _res = reinterpret_cast(res); - const unsigned ratio = sizeof(*_bufa) / sizeof(*bufa); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform subtraction - _res[i] = sub(_bufa[i], _bufb[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform subtraction - if (bufa[i] >= bufb[i]) - res[i] = bufa[i] - bufb[i]; - else - res[i] = card - (bufb[i] - bufa[i]); - } - } -} - -inline void -mul_two_bufs(aint16* src, aint16* dest, size_t len, aint16 card = F3) -{ - m128i* _src = reinterpret_cast(src); - m128i* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform multiplicaton - _dest[i] = mul(_src[i], _dest[i], F3); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform multiplicaton - // dest[i] = uint32_t(src[i]) * 
uint32_t(dest[i]) % card; - dest[i] = uint16_t((uint32_t(src[i]) * dest[i]) % card); - } - } -} - -inline void encode_post_process( - vec::Buffers& output, - std::vector& props, - off_t offset, - unsigned code_len, - uint16_t threshold, - size_t vecs_nb) -{ - const unsigned vec_size = simd::countof(); - - const m128i _threshold = _mm_set1_epi16(threshold); - uint16_t max = 1 << (sizeof(uint16_t) * 8 - 1); - const m128i mask_hi = _mm_set1_epi16(max); - const unsigned element_size = sizeof(uint16_t); - - for (unsigned frag_id = 0; frag_id < code_len; ++frag_id) { - uint16_t* chunk = output.get(frag_id); - m128i* buf = reinterpret_cast(chunk); - for (unsigned vec_id = 0; vec_id < vecs_nb; ++vec_id) { - const m128i a = _mm_load_si128(&(buf[vec_id])); - const m128i b = _mm_cmpeq_epi16(_threshold, a); - const m128i c = _mm_and_si128(mask_hi, b); - uint16_t d = _mm_movemask_epi8(c); - - while (d > 0) { - unsigned byte_idx = __builtin_ctz(d); - unsigned element_idx = byte_idx / element_size; - off_t _offset = offset + vec_id * vec_size + element_idx; - props[frag_id].add(_offset, 1); - d ^= 1 << byte_idx; - } - } - } -} - -} // namespace simd -} // namespace quadiron - -#endif diff --git a/src/simd_128_u32.h b/src/simd_128_u32.h deleted file mode 100644 index 80936a6e..00000000 --- a/src/simd_128_u32.h +++ /dev/null @@ -1,639 +0,0 @@ -/* - * Copyright 2017-2018 Scality - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. 
Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __QUAD_SIMD_128_U32_H__ -#define __QUAD_SIMD_128_U32_H__ - -#include - -#include "simd/simd.h" - -namespace quadiron { -namespace simd { - -/* ==================== Essential Operations =================== */ - -/** Perform a%card where a is a addition of two numbers whose elements are - * symbols of GF(card) */ -inline m128i mod_after_add(m128i a, aint32 card) -{ - const m128i _card = _mm_set1_epi32(card); - const m128i _card_minus_1 = _mm_set1_epi32(card - 1); - - m128i cmp = _mm_cmpgt_epi32(a, _card_minus_1); - m128i b = _mm_sub_epi32(a, _mm_and_si128(_card, cmp)); - - return b; -} - -/** Perform addition of two numbers a, b whose elements are of GF(card) */ -inline m128i add(m128i a, m128i b, aint32 card) -{ - m128i _a = _mm_load_si128(&a); - m128i _b = _mm_load_si128(&b); - m128i c = _mm_add_epi32(_a, _b); - - // Modulo - return mod_after_add(c, card); -} - -/** Perform subtraction of a by b where a, b whose elements are symbols of - * GF(card) - * sub(a, b) = a - 
b if a >= b, or - * card + a - b, otherwise - */ -inline m128i sub(m128i a, m128i b, aint32 card) -{ - const m128i _card = _mm_set1_epi32(card); - - m128i _a = _mm_load_si128(&a); - m128i _b = _mm_load_si128(&b); - - m128i cmp = _mm_cmpgt_epi32(_b, _a); - m128i _a1 = _mm_add_epi32(_a, _mm_and_si128(_card, cmp)); - - return _mm_sub_epi32(_a1, _b); -} - -/** Negate `a` - * @return 0 if (a == 0), else card - a - */ -inline m128i neg(m128i a, aint32 card = F4) -{ - const m128i _card = _mm_set1_epi32(card); - - m128i _a = _mm_load_si128(&a); - m128i _b = _mm_setzero_si128(); - m128i cmp = _mm_cmpgt_epi32(_a, _b); - - return _mm_sub_epi32(_mm_and_si128(cmp, _card), _a); -} - -/** Perform a%card where a is a multiplication of two numbers whose elements are - * symbols of GF(F4) - * - * We find v in a = u * card + v - * a is expressed also as: a = hi * (card-1) + lo - * where hi and lo is 16-bit for F4 (or 8-bit for F3) high and low parts of a - * hence, v = (lo - hi) % F4 - * v = lo - hi, if lo >= hi - * or - * F4 + lo - hi, otherwise - */ -inline m128i mod_after_multiply_f4(m128i a) -{ - const m128i mask = _mm_set1_epi32(F4 - 2); - - m128i lo = _mm_and_si128(a, mask); - - m128i a_shift = _mm_srli_si128(a, 2); - m128i hi = _mm_and_si128(a_shift, mask); - - m128i cmp = _mm_cmpgt_epi32(hi, lo); - m128i _lo = _mm_add_epi32(lo, _mm_and_si128(F4_m128i, cmp)); - - return _mm_sub_epi32(_lo, hi); -} - -inline m128i mod_after_multiply_f3(m128i a) -{ - const m128i mask = _mm_set1_epi32(F3 - 2); - - m128i lo = _mm_and_si128(a, mask); - - m128i a_shift = _mm_srli_si128(a, 1); - m128i hi = _mm_and_si128(a_shift, mask); - - m128i cmp = _mm_cmpgt_epi32(hi, lo); - m128i _lo = _mm_add_epi32(lo, _mm_and_si128(F3_m128i, cmp)); - - return _mm_sub_epi32(_lo, hi); -} - -inline m128i mul_f4(m128i a, m128i b) -{ - m128i _a = _mm_load_si128(&a); - m128i _b = _mm_load_si128(&b); - - m128i c = _mm_mullo_epi32(_a, _b); - - // filter elements of both of a & b = card-1 - m128i cmp = _mm_and_si128( - 
_mm_cmpeq_epi32(_a, F4minus1_m128i), - _mm_cmpeq_epi32(_b, F4minus1_m128i)); - - const m128i one = _mm_set1_epi32(1); - c = _mm_add_epi32(c, _mm_and_si128(one, cmp)); - - // Modulo - return mod_after_multiply_f4(c); -} - -inline m128i mul_f4_simple(m128i a, m128i b) -{ - m128i _a = _mm_load_si128(&a); - m128i _b = _mm_load_si128(&b); - - m128i c = _mm_mullo_epi32(_a, _b); - - // Modulo - return mod_after_multiply_f4(c); -} - -inline m128i mul_f3(m128i a, m128i b) -{ - m128i _a = _mm_load_si128(&a); - m128i _b = _mm_load_si128(&b); - - m128i c = _mm_mullo_epi32(_a, _b); - - // filter elements of both of a & b = card-1 - m128i cmp = _mm_and_si128( - _mm_cmpeq_epi32(_a, F3minus1_m128i), - _mm_cmpeq_epi32(_b, F3minus1_m128i)); - - c = _mm_xor_si128(c, _mm_and_si128(F4_m128i, cmp)); - - // Modulo - return mod_after_multiply_f3(c); -} - -inline m128i mul_f3_simple(m128i a, m128i b) -{ - m128i _a = _mm_load_si128(&a); - m128i _b = _mm_load_si128(&b); - - m128i c = _mm_mullo_epi32(_a, _b); - - // Modulo - return mod_after_multiply_f3(c); -} - -/** Perform multiplication of two numbers a, b whose elements are of GF(card) - * where `card` is a prime Fermat number, i.e. 
card = Fx with x < 5 - * Currently, it supports only for F3 and F4 - */ -inline m128i mul(m128i a, m128i b, aint32 card) -{ - assert(card == F4 || card == F3); - if (card == F4) - return mul_f4(a, b); - return mul_f3(a, b); -} - -inline m128i mul_simple(m128i a, m128i b, aint32 card) -{ - assert(card == F4 || card == F3); - if (card == F4) - return mul_f4_simple(a, b); - return mul_f3_simple(a, b); -} - -/** Apply an element-wise negation to a buffer - */ -inline void neg(size_t len, aint32* buf, aint32 card = F4) -{ - m128i* _buf = reinterpret_cast(buf); - unsigned ratio = sizeof(*_buf) / sizeof(*buf); - size_t _len = len / ratio; - size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - _buf[i] = neg(_buf[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - if (buf[i] > 0) - buf[i] = card - buf[i]; - } - } -} - -/** Perform a multiplication of a coefficient `a` to each element of `src` and - * add result to correspondent element of `dest` - */ -inline void mul_coef_to_buf( - const aint32 a, - aint32* src, - aint32* dest, - size_t len, - aint32 card = F4) -{ - const m128i coef = _mm_set1_epi32(a); - - m128i* _src = reinterpret_cast(src); - m128i* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform multiplication - _dest[i] = mul(coef, _src[i], card); - } - if (_last_len > 0) { - uint64_t coef_64 = (uint64_t)a; - for (i = _len * ratio; i < len; i++) { - // perform multiplication - dest[i] = (aint32)((coef_64 * src[i]) % card); - } - } -} - -inline void -add_two_bufs(aint32* src, aint32* dest, size_t len, aint32 card = F4) -{ - m128i* _src = reinterpret_cast(src); - m128i* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * 
ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform addition - _dest[i] = add(_src[i], _dest[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform addition - aint32 tmp = src[i] + dest[i]; - dest[i] = (tmp >= card) ? (tmp - card) : tmp; - } - } -} - -inline void sub_two_bufs( - aint32* bufa, - aint32* bufb, - aint32* res, - size_t len, - aint32 card = F4) -{ - m128i* _bufa = reinterpret_cast(bufa); - m128i* _bufb = reinterpret_cast(bufb); - m128i* _res = reinterpret_cast(res); - const unsigned ratio = sizeof(*_bufa) / sizeof(*bufa); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform subtraction - _res[i] = sub(_bufa[i], _bufb[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform subtraction - if (bufa[i] >= bufb[i]) - res[i] = bufa[i] - bufb[i]; - else - res[i] = card - (bufb[i] - bufa[i]); - } - } -} - -inline void -mul_two_bufs(aint32* src, aint32* dest, size_t len, aint32 card = F4) -{ - m128i* _src = reinterpret_cast(src); - m128i* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform multiplicaton - _dest[i] = mul(_src[i], _dest[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform multiplicaton - dest[i] = uint32_t((uint64_t(src[i]) * dest[i]) % card); - } - } -} - -// outputA = inputA + inputB -// outputB = inputA - inputB -inline void butterfly_step( - m128i* inputA, - m128i* inputB, - m128i* outputA, - m128i* outputB, - uint32_t _card) -{ - const m128i card = (_card == F3) ? F3_m128i : F4_m128i; - const m128i card_1 = (_card == F3) ? 
F3minus1_m128i : F4minus1_m128i; - - // -------------------------------------- - // outputB = inputA - inputB - // -------------------------------------- - m128i a = _mm_load_si128(inputA); - m128i b = _mm_load_si128(inputB); - m128i cmp_1 = _mm_cmpgt_epi32(b, a); - m128i res_1 = _mm_add_epi32(a, _mm_and_si128(card, cmp_1)); - - _mm_store_si128(outputB, _mm_sub_epi32(res_1, b)); - - // -------------------------------------- - // outputA = symbA + symbB - // -------------------------------------- - m128i res_2 = _mm_add_epi32(a, b); - // modulo - m128i cmp_2 = _mm_cmpgt_epi32(res_2, card_1); - m128i c = _mm_sub_epi32(res_2, _mm_and_si128(card, cmp_2)); - - _mm_store_si128(outputA, c); -} - -// for each pair (P, Q) = (buf[i], buf[i + m]): -// P = P + Q -// Q = P - Q -inline void butterfly_ct_1( - vec::Buffers& buf, - unsigned start, - unsigned m, - unsigned step, - size_t len, - uint32_t card = F4) -{ - for (int i = start; i < buf.get_n(); i += step) { - uint32_t* a = buf.get(i); - uint32_t* b = buf.get(i + m); - m128i* _a = reinterpret_cast(a); - m128i* _b = reinterpret_cast(b); - // perform butterfly operation for Cooley-Tukey FFT algorithm - for (size_t j = 0; j < len; ++j) { - butterfly_step(&(_a[j]), &(_b[j]), &(_a[j]), &(_b[j]), card); - } - } -} - -// for each pair (P, Q) = (buf[i], buf[i + m]): -// P = P - Q -// Q = P + Q -inline void butterfly_ct_2( - vec::Buffers& buf, - unsigned start, - unsigned m, - unsigned step, - size_t len, - uint32_t card = F4) -{ - for (int i = start; i < buf.get_n(); i += step) { - uint32_t* a = buf.get(i); - uint32_t* b = buf.get(i + m); - m128i* _a = reinterpret_cast(a); - m128i* _b = reinterpret_cast(b); - // perform butterfly operation for Cooley-Tukey FFT algorithm - for (size_t j = 0; j < len; ++j) { - butterfly_step(&(_a[j]), &(_b[j]), &(_b[j]), &(_a[j]), card); - } - } -} - -// output = coef * input -inline void -butterfly_mul(m128i* coef, m128i* input, m128i* output, uint32_t _card) -{ - const m128i card = (_card == F3) ? 
F3_m128i : F4_m128i; - const m128i card_2 = (_card == F3) ? F3minus2_m128i : F4minus2_m128i; - - // -------------------------------------- - // compute coef * symbB - // -------------------------------------- - m128i _coef = _mm_load_si128(coef); - m128i b = _mm_load_si128(input); - m128i res = _mm_mullo_epi32(_coef, b); - // modulo - m128i lo = _mm_and_si128(res, card_2); - m128i res_shift = - (_card == F3) ? _mm_srli_si128(res, 1) : _mm_srli_si128(res, 2); - m128i hi = _mm_and_si128(res_shift, card_2); - - m128i cmp_1 = _mm_cmpgt_epi32(hi, lo); - m128i _lo = _mm_add_epi32(lo, _mm_and_si128(card, cmp_1)); - - m128i res_2 = _mm_sub_epi32(_lo, hi); - - _mm_store_si128(output, res_2); -} - -// symbA = symbA + coef * symbB -// symbB = symbA - coef * symbB -inline void -butterfly_ct_3_step(m128i* coef, m128i* symbA, m128i* symbB, uint32_t _card) -{ - // -------------------------------------- - // compute coef * symbB - // -------------------------------------- - m128i coef_x_symbB; - butterfly_mul(coef, symbB, &coef_x_symbB, _card); - // -------------------------------------- - // symbA = symbA + coef_x_symbB - // symbB = symbA - coef_x_symbB - // -------------------------------------- - butterfly_step(symbA, &coef_x_symbB, symbA, symbB, _card); -} - -// for each pair (P, Q) = (buf[i], buf[i + m]): -// P = P + c * Q -// Q = P - c * Q -inline void butterfly_ct_3( - uint32_t coef, - vec::Buffers& buf, - unsigned start, - unsigned m, - unsigned step, - size_t len, - uint32_t card = F4) -{ - m128i _coef = _mm_set1_epi32(coef); - for (int i = start; i < buf.get_n(); i += step) { - uint32_t* a = buf.get(i); - uint32_t* b = buf.get(i + m); - m128i* _a = reinterpret_cast(a); - m128i* _b = reinterpret_cast(b); - // perform butterfly operation for Cooley-Tukey FFT algorithm - for (size_t j = 0; j < len; ++j) { - butterfly_ct_3_step(&_coef, &(_a[j]), &(_b[j]), card); - } - } -} - -// for each pair (P, Q) = (buf[i], buf[i + m]): -// P = Q + P -// Q = Q - P -inline void 
butterfly_gs_2( - vec::Buffers& buf, - unsigned start, - unsigned m, - unsigned step, - size_t len, - uint32_t card = F4) -{ - for (int i = start; i < buf.get_n(); i += step) { - uint32_t* a = buf.get(i); - uint32_t* b = buf.get(i + m); - m128i* _a = reinterpret_cast(a); - m128i* _b = reinterpret_cast(b); - // perform butterfly operation for Cooley-Tukey FFT algorithm - for (size_t j = 0; j < len; ++j) { - butterfly_step(&(_b[j]), &(_a[j]), &(_a[j]), &(_b[j]), card); - } - } -} - -// symbA = symbA + symbB -// symbB = coef * (symbA - symbB) -inline void -butterfly_gs_3_step(m128i* coef, m128i* symbA, m128i* symbB, uint32_t _card) -{ - // -------------------------------------- - // symbA = symbA + symbB - // symbB = symbA - symbB - // -------------------------------------- - butterfly_step(symbA, symbB, symbA, symbB, _card); - - // -------------------------------------- - // symbB = coef * symbB - // -------------------------------------- - butterfly_mul(coef, symbB, symbB, _card); -} - -// for each pair (P, Q) = (buf[i], buf[i + m]): -// P = P + Q -// Q = c * (P - Q) -inline void butterfly_gs_3( - uint32_t coef, - vec::Buffers& buf, - unsigned start, - unsigned m, - unsigned step, - size_t len, - uint32_t card = F4) -{ - m128i _coef = _mm_set1_epi32(coef); - for (int i = start; i < buf.get_n(); i += step) { - uint32_t* a = buf.get(i); - uint32_t* b = buf.get(i + m); - m128i* _a = reinterpret_cast(a); - m128i* _b = reinterpret_cast(b); - // perform butterfly operation for Cooley-Tukey FFT algorithm - for (size_t j = 0; j < len; ++j) { - butterfly_gs_3_step(&_coef, &(_a[j]), &(_b[j]), card); - } - } -} - -inline void encode_post_process( - vec::Buffers& output, - std::vector& props, - off_t offset, - unsigned code_len, - uint32_t threshold, - size_t vecs_nb) -{ - const unsigned vec_size = simd::countof(); - - const m128i _threshold = _mm_set1_epi32(threshold); - const uint32_t max = 1 << (sizeof(uint32_t) * 8 - 1); - const m128i mask_hi = _mm_set1_epi32(max); - const 
unsigned element_size = sizeof(uint32_t); - - for (unsigned frag_id = 0; frag_id < code_len; ++frag_id) { - uint32_t* chunk = output.get(frag_id); - m128i* buf = reinterpret_cast(chunk); - for (unsigned vec_id = 0; vec_id < vecs_nb; ++vec_id) { - const m128i a = _mm_load_si128(&(buf[vec_id])); - const m128i b = _mm_cmpeq_epi32(_threshold, a); - const m128i c = _mm_and_si128(mask_hi, b); - uint16_t d = _mm_movemask_epi8(c); - - while (d > 0) { - unsigned byte_idx = __builtin_ctz(d); - unsigned element_idx = byte_idx / element_size; - off_t _offset = offset + vec_id * vec_size + element_idx; - props[frag_id].add(_offset, 1); - d ^= 1 << byte_idx; - } - } - } -} - -/* ==================== Operations for NF4 =================== */ - -/** Return aint128 integer from a _m128i register */ -static inline aint128 m128i_to_uint128(m128i v) -{ - aint128 i; - _mm_store_si128((m128i*)&i, v); - - return i; // NOLINT(clang-analyzer-core.uninitialized.UndefReturn) -} - -inline __uint128_t add(__uint128_t a, __uint128_t b) -{ - m128i res = add((m128i)a, (m128i)b, F4); - return m128i_to_uint128(res); -} - -inline __uint128_t sub(__uint128_t a, __uint128_t b) -{ - m128i res = sub((m128i)a, (m128i)b, F4); - return m128i_to_uint128(res); -} - -inline __uint128_t mul(__uint128_t a, __uint128_t b) -{ - m128i res = mul((m128i)a, (m128i)b, F4); - return m128i_to_uint128(res); -} - -inline void hadamard_mul(int n, aint128* _x, aint128* _y) -{ - int i; - m128i* x = reinterpret_cast(_x); - m128i* y = reinterpret_cast(_y); - - // multiply y to `x` - for (i = 0; i < n; i++) { - x[i] = mul(x[i], y[i], F4); - } -} - -} // namespace simd -} // namespace quadiron - -#endif From 08767746f3f251f4acf47db3aa2acb116e3e2aee Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Fri, 5 Oct 2018 13:44:03 +0200 Subject: [PATCH 09/77] SIMD Main file including necessary files 1. 
Essential operations - simd_128.h contains essential wrappers of SIMD operations on SSE - simd_256.h contains essential wrappers of SIMD operations on AVX 2. Basic operations - simd_basic.h contain basic operations used in following cases, and also operations for RingModN 3. Vectorized operations - simd_fnt.h contains vectorized operations dedicated for FNT - simd_nf4.h contains vectorized operations dedicated for nf4 --- src/simd.h | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/simd.h b/src/simd.h index 8309bfff..d70fcb2f 100644 --- a/src/simd.h +++ b/src/simd.h @@ -39,31 +39,35 @@ const unsigned F4 = 65537; const unsigned F3 = 257; -typedef uint8_t aint8 __attribute__((aligned(quadiron::simd::ALIGNMENT))); -typedef uint16_t aint16 __attribute__((aligned(quadiron::simd::ALIGNMENT))); -typedef uint32_t aint32 __attribute__((aligned(quadiron::simd::ALIGNMENT))); -typedef uint64_t aint64 __attribute__((aligned(quadiron::simd::ALIGNMENT))); -typedef __uint128_t aint128 __attribute__((aligned(quadiron::simd::ALIGNMENT))); - namespace quadiron { -/** The namespace simd contains functions for GF-NF4 that are accelerated by - * using SIMD operations over 128bits +/** The namespace simd contains functions accelerated by + * using SIMD operations over 128bits and 256bits * - * It supports operations on 32-bit numbers + * It supports operations on 16-bit and 32-bit numbers */ namespace simd { +#define EITHER(x, a, b) (((x)) ? 
(a) : (b)) + // Vectorized operations are implemented in appropriated headers simd*.h } // namespace simd } // namespace quadiron +// Include essential operations that use SIMD functions #if defined(__AVX2__) #include "simd_256.h" #elif defined(__SSE4_1__) #include "simd_128.h" #endif +// Include basic operations +#include "simd_basic.h" + +// Include accelerated operations dedicated for FNT +#include "simd_fnt.h" + +// Include accelerated operations dedicated for NF4 #include "simd_nf4.h" #endif // #ifdef QUADIRON_USE_SIMD From 4f97fa69d8e55a5a522860084e87758a8bf10d56 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Fri, 5 Oct 2018 13:44:10 +0200 Subject: [PATCH 10/77] SIMD 128: essential operations for SSE --- src/simd_128.h | 134 +++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 123 insertions(+), 11 deletions(-) diff --git a/src/simd_128.h b/src/simd_128.h index 0f6b8857..afbfe70f 100644 --- a/src/simd_128.h +++ b/src/simd_128.h @@ -33,19 +33,131 @@ #include -typedef __m128i m128i; +namespace quadiron { +namespace simd { -// Disable `cert-err58-cpp` on these: AFAIK they cannot throw. -// (probably a false positive present in Clang 5 and fixed in Clang 6). 
-const m128i F4_m128i = _mm_set1_epi32(65537); // NOLINT(cert-err58-cpp) -const m128i F4minus1_m128i = _mm_set1_epi32(65536); // NOLINT(cert-err58-cpp) -const m128i F3_m128i = _mm_set1_epi32(257); // NOLINT(cert-err58-cpp) -const m128i F3minus1_m128i = _mm_set1_epi32(256); // NOLINT(cert-err58-cpp) +typedef __m128i VecType; +typedef uint32_t MaskIntType; -const m128i F3_m128i_u16 = _mm_set1_epi16(257); // NOLINT(cert-err58-cpp) -const m128i F3minus1_m128i_u16 = _mm_set1_epi16(256); // NOLINT(cert-err58-cpp) +#define F4_u32 _mm_set1_epi32(65537) +#define F4m1_u32 _mm_set1_epi32(65536) +#define F3_u32 _mm_set1_epi32(257) +#define F3m1_u32 _mm_set1_epi32(256) -#include "simd_128_u16.h" -#include "simd_128_u32.h" +#define F3_u16 _mm_set1_epi16(257) +#define F3m1_u16 _mm_set1_epi16(256) + +#define CARD(q) (EITHER(q == F3, F3_u32, F4_u32)) +#define CARD_M_1(q) (EITHER(q == F3, F3m1_u32, F4m1_u32)) + +/* ============= Essential Operations for SSE w/ both u16 & u32 ============ */ + +#define ZERO (_mm_setzero_si128()) +#define ONE16 (_mm_set1_epi16(1)) +#define ONE32 (_mm_set1_epi32(1)) + +inline VecType LOAD(VecType* address) +{ + return _mm_load_si128(address); +} +inline void STORE(VecType* address, VecType reg) +{ + _mm_store_si128(address, reg); +} + +inline VecType AND(VecType x, VecType y) +{ + return _mm_and_si128(x, y); +} +inline VecType XOR(VecType x, VecType y) +{ + return _mm_xor_si128(x, y); +} +inline VecType SHIFTR_1(VecType x) +{ + return _mm_srli_si128(x, 1); +} +inline VecType SHIFTR_2(VecType x) +{ + return _mm_srli_si128(x, 2); +} +inline uint16_t MVMSK8(VecType x) +{ + return _mm_movemask_epi8(x); +} +inline uint16_t TESTZ(VecType x, VecType y) +{ + return _mm_testz_si128(x, y); +} + +/* ================= Essential Operations for SSE w/ u32 ================= */ + +inline VecType SET1(uint32_t val) +{ + return _mm_set1_epi32(val); +} +inline VecType ADD32(VecType x, VecType y) +{ + return _mm_add_epi32(x, y); +} +inline VecType SUB32(VecType x,
VecType y) +{ + return _mm_sub_epi32(x, y); +} +inline VecType MUL32(VecType x, VecType y) +{ + return _mm_mullo_epi32(x, y); +} + +inline VecType CMPEQ32(VecType x, VecType y) +{ + return _mm_cmpeq_epi32(x, y); +} +inline VecType CMPGT32(VecType x, VecType y) +{ + return _mm_cmpgt_epi32(x, y); +} +inline VecType MINU32(VecType x, VecType y) +{ + return _mm_min_epu32(x, y); +} +#define MASK8_LO (_mm_set1_epi16(0x80)) +#define BLEND8(x, y, mask) (_mm_blendv_epi8(x, y, mask)) +#define BLEND16(x, y, imm8) (_mm_blend_epi16(x, y, imm8)) + +/* ================= Essential Operations for SSE w/ u16 ================= */ + +inline VecType SET1(uint16_t val) +{ + return _mm_set1_epi16(val); +} +inline VecType ADD16(VecType x, VecType y) +{ + return _mm_add_epi16(x, y); +} +inline VecType SUB16(VecType x, VecType y) +{ + return _mm_sub_epi16(x, y); +} +inline VecType MUL16(VecType x, VecType y) +{ + return _mm_mullo_epi16(x, y); +} + +inline VecType CMPEQ16(VecType x, VecType y) +{ + return _mm_cmpeq_epi16(x, y); +} +inline VecType CMPGT16(VecType x, VecType y) +{ + return _mm_cmpgt_epi16(x, y); +} +inline VecType MINU16(VecType x, VecType y) +{ + return _mm_min_epu16(x, y); +} + +} // namespace simd +} // namespace quadiron #endif From 7dc72c1ce36ec640b136c27746fc66b3db0baf25 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Fri, 5 Oct 2018 13:44:16 +0200 Subject: [PATCH 11/77] SIMD 256: essential operations for AVX --- src/simd_256.h | 144 +++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 129 insertions(+), 15 deletions(-) diff --git a/src/simd_256.h b/src/simd_256.h index 2dc49cc4..d06f3218 100644 --- a/src/simd_256.h +++ b/src/simd_256.h @@ -33,19 +33,6 @@ #include -typedef __m256i m256i; - -// Disable `cert-err58-cpp` on these: AFAIK they cannot throw. -// (probably a false positive present in Clang 5 and fixed in Clang 6).
-const m256i F4_m256i = _mm256_set1_epi32(65537); // NOLINT(cert-err58-cpp) -const m256i F4minus1_m256i = _mm256_set1_epi32(65536); // NOLINT(cert-err58-cpp) -const m256i F3_m256i = _mm256_set1_epi32(257); // NOLINT(cert-err58-cpp) -const m256i F3minus1_m256i = _mm256_set1_epi32(256); // NOLINT(cert-err58-cpp) - -const m256i F3_m256i_u16 = _mm256_set1_epi16(257); // NOLINT(cert-err58-cpp) -// NOLINTNEXTLINE(cert-err58-cpp) -const m256i F3minus1_m256i_u16 = _mm256_set1_epi16(256); - /* GCC doesn't include the split store intrinsics so define them here. */ #if defined(__GNUC__) && !defined(__clang__) @@ -58,7 +45,134 @@ _mm256_storeu2_m128i(__m128i* const hi, __m128i* const lo, const __m256i a) #endif /* defined(__GNUC__) */ -#include "simd_256_u16.h" -#include "simd_256_u32.h" +namespace quadiron { +namespace simd { + +typedef __m256i VecType; +typedef __m128i HalfVecType; +typedef __uint128_t NF4Type; +typedef uint32_t MaskIntType; + +#define F4_u32 _mm256_set1_epi32(65537) +#define F4m1_u32 _mm256_set1_epi32(65536) +#define F3_u32 _mm256_set1_epi32(257) +#define F3m1_u32 _mm256_set1_epi32(256) + +#define F3_u16 _mm256_set1_epi16(257) +#define F3m1_u16 _mm256_set1_epi16(256) + +#define CARD(q) (EITHER(q == F3, F3_u32, F4_u32)) +#define CARD_M_1(q) (EITHER(q == F3, F3m1_u32, F4m1_u32)) + +/* ============= Essential Operations for AVX2 w/ both u16 & u32 ============ */ + +#define ZERO (_mm256_setzero_si256()) +#define ONE16 (_mm256_set1_epi16(1)) +#define ONE32 (_mm256_set1_epi32(1)) + +inline VecType LOAD(VecType* address) +{ + return _mm256_load_si256(address); +} +inline void STORE(VecType* address, VecType reg) +{ + _mm256_store_si256(address, reg); +} + +inline VecType AND(VecType x, VecType y) +{ + return _mm256_and_si256(x, y); +} +inline VecType XOR(VecType x, VecType y) +{ + return _mm256_xor_si256(x, y); +} +inline VecType SHIFTR_1(VecType x) +{ + return _mm256_srli_si256(x, 1); +} +inline VecType SHIFTR_2(VecType x) +{ + return _mm256_srli_si256(x, 2); +} 
+inline uint32_t MVMSK8(VecType x) +{ + return _mm256_movemask_epi8(x); +} +inline uint32_t TESTZ(VecType x, VecType y) +{ + return _mm256_testz_si256(x, y); +} + +/* ================= Essential Operations for AVX2 w/ u32 ================= */ + +inline VecType SET1(uint32_t val) +{ + return _mm256_set1_epi32(val); +} +inline VecType ADD32(VecType x, VecType y) +{ + return _mm256_add_epi32(x, y); +} +inline VecType SUB32(VecType x, VecType y) +{ + return _mm256_sub_epi32(x, y); +} +inline VecType MUL32(VecType x, VecType y) +{ + return _mm256_mullo_epi32(x, y); +} + +inline VecType CMPEQ32(VecType x, VecType y) +{ + return _mm256_cmpeq_epi32(x, y); +} +inline VecType CMPGT32(VecType x, VecType y) +{ + return _mm256_cmpgt_epi32(x, y); +} +inline VecType MINU32(VecType x, VecType y) +{ + return _mm256_min_epu32(x, y); +} + +#define MASK8_LO (_mm256_set1_epi16(0x80)) +#define BLEND8(x, y, mask) (_mm256_blendv_epi8(x, y, mask)) +#define BLEND16(x, y, imm8) (_mm256_blend_epi16(x, y, imm8)) + +/* ================= Essential Operations for AVX2 w/ u16 ================= */ + +inline VecType SET1(uint16_t val) +{ + return _mm256_set1_epi16(val); +} +inline VecType ADD16(VecType x, VecType y) +{ + return _mm256_add_epi16(x, y); +} +inline VecType SUB16(VecType x, VecType y) +{ + return _mm256_sub_epi16(x, y); +} +inline VecType MUL16(VecType x, VecType y) +{ + return _mm256_mullo_epi16(x, y); +} + +inline VecType CMPEQ16(VecType x, VecType y) +{ + return _mm256_cmpeq_epi16(x, y); +} +inline VecType CMPGT16(VecType x, VecType y) +{ + return _mm256_cmpgt_epi16(x, y); +} +inline VecType MINU16(VecType x, VecType y) +{ + return _mm256_min_epu16(x, y); +} + +} // namespace simd +} // namespace quadiron #endif From fceff91f8700735848eb63c4f29ed57185867bf9 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Fri, 5 Oct 2018 13:44:21 +0200 Subject: [PATCH 12/77] SIMD Basic: includes basic Operations It implements basic operations that will be used everywhere. 
It includes also operations for RingModN --- src/simd_basic.h | 404 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 404 insertions(+) create mode 100644 src/simd_basic.h diff --git a/src/simd_basic.h b/src/simd_basic.h new file mode 100644 index 00000000..7585734d --- /dev/null +++ b/src/simd_basic.h @@ -0,0 +1,404 @@ +/* + * Copyright 2017-2018 Scality + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __QUAD_SIMD_BASIC_H__ +#define __QUAD_SIMD_BASIC_H__ + +#include + +namespace quadiron { +namespace simd { + +/* ================= Basic Operations for u32 ================= */ + +/** + * Modular addition for packed unsigned 32-bit integers + * + * @param x input register + * @param y input register + * @param q modulo + * @return (x + y) mod q + */ +inline VecType ADD_MOD(VecType x, VecType y, uint32_t q) +{ + VecType res = ADD32(x, y); + return MINU32(res, SUB32(res, CARD(q))); +} + +/** + * Modular subtraction for packed unsigned 32-bit integers + * + * @param x input register + * @param y input register + * @param q modulo + * @return (x - y) mod q + */ +inline VecType SUB_MOD(VecType x, VecType y, uint32_t q) +{ + VecType res = SUB32(x, y); + return MINU32(res, ADD32(res, CARD(q))); +} + +/** + * Modular negation for packed unsigned 32-bit integers + * + * @param x input register + * @param q modulo + * @return (-x) mod q + */ +inline VecType NEG_MOD(VecType x, uint32_t q) +{ + VecType res = SUB32(CARD(q), x); + return MINU32(res, SUB32(res, CARD(q))); +} + +/** + * Modular multiplication for packed unsigned 32-bit integers + * + * @note We assume that at least `x` or `y` is less than `q-1` so it's + * not necessary to verify overflow on multiplying elements + * + * @param x input register + * @param y input register + * @param q modulo + * @return (x * y) mod q + */ +inline VecType MUL_MOD(VecType x, VecType y, uint32_t q) +{ + VecType res = MUL32(x, y); + VecType lo = + (q == F3) ? BLEND8(ZERO, res, MASK8_LO) : BLEND16(ZERO, res, 0x55); + VecType hi = (q == F3) ? 
BLEND8(ZERO, SHIFTR_1(res), MASK8_LO) + : BLEND16(ZERO, SHIFTR_2(res), 0x55); + return SUB_MOD(lo, hi, q); +} + +/** + * Modular general multiplication for packed unsigned 32-bit integers + * + * @note It's necessary to verify overflow on multiplying elements + * + * @param x input register + * @param y input register + * @param q modulo + * @return (x * y) mod q + */ +inline VecType MULFULL_MOD(VecType x, VecType y, uint32_t q) +{ + VecType res = MUL32(x, y); + + // filter elements of both of a & b = card-1 + VecType cmp = AND(CMPEQ32(x, CARD_M_1(q)), CMPEQ32(y, CARD_M_1(q))); + res = (q == F3) ? XOR(res, AND(F4_u32, cmp)) : ADD32(res, AND(ONE32, cmp)); + + VecType lo = + (q == F3) ? BLEND8(ZERO, res, MASK8_LO) : BLEND16(ZERO, res, 0x55); + VecType hi = (q == F3) ? BLEND8(ZERO, SHIFTR_1(res), MASK8_LO) + : BLEND16(ZERO, SHIFTR_2(res), 0x55); + return SUB_MOD(lo, hi, q); +} + +/** + * Update property for a given register for packed unsigned 32-bit integers + * + * @param props properties bound to fragments + * @param threshold register storing max value in its elements + * @param mask a specific mask + * @param symb input register + * @param offset offset in the data fragments + * @param max a dummy variable + */ +inline void ADD_PROPS( + Properties& props, + VecType threshold, + VecType mask, + VecType symb, + off_t offset, + uint32_t max) +{ + const VecType b = CMPEQ32(threshold, symb); + const VecType c = AND(mask, b); + MaskIntType d = MVMSK8(c); + const unsigned element_size = sizeof(uint32_t); + while (d > 0) { + unsigned byte_idx = __builtin_ctz(d); + off_t _offset = offset + byte_idx / element_size; + props.add(_offset, OOR_MARK); + d ^= 1 << byte_idx; + } +} + +/* ================= Basic Operations for u16 ================= */ + +/** + * Modular addition for packed unsigned 16-bit integers + * + * @param x input register + * @param y input register + * @param q modulo + * @return (x + y) mod q + */ +inline VecType ADD_MOD(VecType x, VecType y, uint16_t q) 
+{ + VecType res = ADD16(x, y); + return MINU16(res, SUB16(res, F3_u16)); +} + +/** + * Modular subtraction for packed unsigned 16-bit integers + * + * @param x input register + * @param y input register + * @param q modulo + * @return (x - y) mod q + */ +inline VecType SUB_MOD(VecType x, VecType y, uint16_t q) +{ + VecType res = SUB16(x, y); + return MINU16(res, SUB16(ADD16(x, F3_u16), y)); +} + +/** + * Modular negation for packed unsigned 16-bit integers + * + * @param x input register + * @param q modulo + * @return (-x) mod q + */ +inline VecType NEG_MOD(VecType x, uint16_t q) +{ + VecType res = SUB16(F3_u16, x); + return MINU16(res, SUB16(res, F3_u16)); +} + +/** + * Modular multiplication for packed unsigned 16-bit integers + * + * @note We assume that at least `x` or `y` is less than `q-1` so it's + * not necessary to verify overflow on multiplying elements + * + * @param x input register + * @param y input register + * @param q modulo + * @return (x * y) mod q + */ +inline VecType MUL_MOD(VecType x, VecType y, uint16_t q) +{ + VecType res = MUL16(x, y); + VecType lo = BLEND8(ZERO, res, MASK8_LO); + VecType hi = BLEND8(ZERO, SHIFTR_1(res), MASK8_LO); + return SUB_MOD(lo, hi, q); +} + +/** + * Modular general multiplication for packed unsigned 16-bit integers + * + * @note It's necessary to verify overflow on multiplying elements + * + * @param x input register + * @param y input register + * @param q modulo + * @return (x * y) mod q + */ +inline VecType MULFULL_MOD(VecType x, VecType y, uint16_t q) +{ + VecType res = MUL16(x, y); + + // filter elements of both of a & b = card-1 + VecType cmp = AND(CMPEQ16(x, F3m1_u16), CMPEQ16(y, F3m1_u16)); + res = ADD16(res, AND(ONE16, cmp)); + + VecType lo = BLEND8(ZERO, res, MASK8_LO); + VecType hi = BLEND8(ZERO, SHIFTR_1(res), MASK8_LO); + return SUB_MOD(lo, hi, q); +} + +/** + * Update property for a given register for packed unsigned 16-bit integers + * + * @param props properties bound to fragments + * @param
threshold register storing max value in its elements + * @param mask a specific mask + * @param symb input register + * @param offset offset in the data fragments + * @param max a dummy variable + */ +inline void ADD_PROPS( + Properties& props, + VecType threshold, + VecType mask, + VecType symb, + off_t offset, + uint16_t max) +{ + const VecType b = CMPEQ16(threshold, symb); + const VecType c = AND(mask, b); + MaskIntType d = MVMSK8(c); + const unsigned element_size = sizeof(uint16_t); + while (d > 0) { + unsigned byte_idx = __builtin_ctz(d); + off_t _offset = offset + byte_idx / element_size; + props.add(_offset, OOR_MARK); + d ^= 1 << byte_idx; + } +} + +/* ==================== Operations for RingModN =================== */ +/** Perform a multiplication of a coefficient `a` to each element of `src` and + * add result to correspondent element of `dest` + * + * @note: 1 < `a` < card - 1 + */ +template +inline void mul_coef_to_buf(const T a, T* src, T* dest, size_t len, T card) +{ + const VecType coef = SET1(a); + + VecType* __restrict _src = reinterpret_cast(src); + VecType* __restrict _dest = reinterpret_cast(dest); + const unsigned ratio = sizeof(*_src) / sizeof(*src); + const size_t _len = len / ratio; + const size_t _last_len = len - _len * ratio; + + size_t i = 0; + size_t end = (_len > 3) ? 
_len - 3 : 0; + for (; i < end; i += 4) { + _dest[i] = MUL_MOD(coef, _src[i], card); + _dest[i + 1] = MUL_MOD(coef, _src[i + 1], card); + _dest[i + 2] = MUL_MOD(coef, _src[i + 2], card); + _dest[i + 3] = MUL_MOD(coef, _src[i + 3], card); + } + for (; i < _len; ++i) { + _dest[i] = MUL_MOD(coef, _src[i], card); + } + + if (_last_len > 0) { + DoubleSizeVal coef_double = DoubleSizeVal(a); + for (size_t i = _len * ratio; i < len; i++) { + dest[i] = (T)((coef_double * src[i]) % card); + } + } +} + +template +inline void add_two_bufs(T* src, T* dest, size_t len, T card) +{ + VecType* __restrict _src = reinterpret_cast(src); + VecType* __restrict _dest = reinterpret_cast(dest); + const unsigned ratio = sizeof(*_src) / sizeof(*src); + const size_t _len = len / ratio; + const size_t _last_len = len - _len * ratio; + + size_t i; + for (i = 0; i < _len; i++) { + _dest[i] = ADD_MOD(_src[i], _dest[i], card); + } + if (_last_len > 0) { + for (i = _len * ratio; i < len; i++) { + T tmp = src[i] + dest[i]; + dest[i] = (tmp >= card) ? 
(tmp - card) : tmp; + } + } +} + +template +inline void sub_two_bufs(T* bufa, T* bufb, T* res, size_t len, T card) +{ + VecType* __restrict _bufa = reinterpret_cast(bufa); + VecType* __restrict _bufb = reinterpret_cast(bufb); + VecType* __restrict _res = reinterpret_cast(res); + const unsigned ratio = sizeof(*_bufa) / sizeof(*bufa); + const size_t _len = len / ratio; + const size_t _last_len = len - _len * ratio; + + size_t i; + for (i = 0; i < _len; i++) { + // perform subtraction + _res[i] = SUB_MOD(_bufa[i], _bufb[i], card); + } + if (_last_len > 0) { + for (i = _len * ratio; i < len; i++) { + // perform subtraction + if (bufa[i] >= bufb[i]) + res[i] = bufa[i] - bufb[i]; + else + res[i] = card - (bufb[i] - bufa[i]); + } + } +} + +template +inline void mul_two_bufs(T* src, T* dest, size_t len, T card) +{ + VecType* __restrict _src = reinterpret_cast(src); + VecType* __restrict _dest = reinterpret_cast(dest); + const unsigned ratio = sizeof(*_src) / sizeof(*src); + const size_t _len = len / ratio; + const size_t _last_len = len - _len * ratio; + + size_t i; + for (i = 0; i < _len; i++) { + // perform multiplication + _dest[i] = MULFULL_MOD(_src[i], _dest[i], card); + } + if (_last_len > 0) { + for (i = _len * ratio; i < len; i++) { + // perform multiplication + dest[i] = T((DoubleSizeVal(src[i]) * dest[i]) % card); + } + } +} + +/** Apply an element-wise negation to a buffer + */ +template +inline void neg(size_t len, T* buf, T card) +{ + VecType* _buf = reinterpret_cast(buf); + unsigned ratio = sizeof(*_buf) / sizeof(*buf); + size_t _len = len / ratio; + size_t _last_len = len - _len * ratio; + + size_t i; + for (i = 0; i < _len; i++) { + _buf[i] = NEG_MOD(_buf[i], card); + } + if (_last_len > 0) { + for (i = _len * ratio; i < len; i++) { + if (buf[i]) + buf[i] = card - buf[i]; + } + } +} + +} // namespace simd +} // namespace quadiron + +#endif From ec70d623d4b84a071c04cac79aaa6145914f0500 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Fri, 5 Oct 2018 13:44:27
+0200 Subject: [PATCH 13/77] SIMD NF4 contains vectorized operations for NF4 --- src/simd_nf4.h | 233 +++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 208 insertions(+), 25 deletions(-) diff --git a/src/simd_nf4.h b/src/simd_nf4.h index 7d517430..b86e284e 100644 --- a/src/simd_nf4.h +++ b/src/simd_nf4.h @@ -38,37 +38,36 @@ namespace quadiron { namespace simd { -#ifdef __AVX2__ -typedef __m128i m128i; +typedef uint32_t aint32 __attribute__((aligned(ALIGNMENT))); +typedef __uint128_t NF4Type; -/** Return aint128 integer from a _m128i register */ -static inline aint128 m128i_to_uint128(m128i v) +/** Return NF4Type integer from a _m128i register */ +static inline NF4Type m128i_to_uint128(__m128i v) { - aint128 i; - _mm_store_si128((m128i*)&i, v); + NF4Type i; + _mm_store_si128((__m128i*)&i, v); return i; // NOLINT(clang-analyzer-core.uninitialized.UndefReturn) } -#endif // #ifdef __AVX2__ -inline aint128 expand16(uint16_t* arr, int n) +inline NF4Type expand16(uint16_t* arr, int n) { // since n <= 4 uint16_t _arr[4] __attribute__((aligned(ALIGNMENT))) = {0, 0, 0, 0}; std::copy_n(arr, n, _arr); - m128i b = _mm_set_epi16(0, 0, 0, 0, _arr[3], _arr[2], _arr[1], _arr[0]); + __m128i b = _mm_set_epi16(0, 0, 0, 0, _arr[3], _arr[2], _arr[1], _arr[0]); return m128i_to_uint128(b); } -inline aint128 expand32(uint32_t* arr, int n) +inline NF4Type expand32(uint32_t* arr, int n) { // since n <= 4 uint32_t _arr[4] __attribute__((aligned(simd::ALIGNMENT))) = {0, 0, 0, 0}; std::copy_n(arr, n, _arr); - m128i b = _mm_set_epi32(_arr[3], _arr[2], _arr[1], _arr[0]); + __m128i b = _mm_set_epi32(_arr[3], _arr[2], _arr[1], _arr[0]); return m128i_to_uint128(b); } @@ -76,9 +75,9 @@ inline aint128 expand32(uint32_t* arr, int n) inline GroupedValues<__uint128_t> unpack(__uint128_t a, int n) { uint16_t ai[8]; - aint128 values; + NF4Type values; - m128i _a = _mm_loadu_si128((m128i*)&a); + __m128i _a = _mm_loadu_si128((__m128i*)&a); ai[0] = _mm_extract_epi16(_a, 0); ai[1] = 
_mm_extract_epi16(_a, 1); ai[2] = _mm_extract_epi16(_a, 2); @@ -91,8 +90,8 @@ inline GroupedValues<__uint128_t> unpack(__uint128_t a, int n) const uint32_t flag = ai[1] | (!!ai[3] << 1u) | (!!ai[5] << 2u) | (!!ai[7] << 3u); - m128i val = _mm_set_epi16(0, 0, 0, 0, ai[6], ai[4], ai[2], ai[0]); - _mm_store_si128((m128i*)&values, val); + __m128i val = _mm_set_epi16(0, 0, 0, 0, ai[6], ai[4], ai[2], ai[0]); + _mm_store_si128((__m128i*)&values, val); GroupedValues<__uint128_t> b = {values, flag}; @@ -102,9 +101,9 @@ inline GroupedValues<__uint128_t> unpack(__uint128_t a, int n) inline void unpack(__uint128_t a, GroupedValues<__uint128_t>& b, int n) { uint16_t ai[8]; - aint128 values; + NF4Type values; - m128i _a = _mm_loadu_si128((m128i*)&a); + __m128i _a = _mm_loadu_si128((__m128i*)&a); ai[0] = _mm_extract_epi16(_a, 0); ai[1] = _mm_extract_epi16(_a, 1); ai[2] = _mm_extract_epi16(_a, 2); @@ -117,17 +116,17 @@ inline void unpack(__uint128_t a, GroupedValues<__uint128_t>& b, int n) const uint32_t flag = ai[1] | (!!ai[3] << 1u) | (!!ai[5] << 2u) | (!!ai[7] << 3u); - m128i val = _mm_set_epi16(0, 0, 0, 0, ai[6], ai[4], ai[2], ai[0]); - _mm_store_si128((m128i*)&values, val); + __m128i val = _mm_set_epi16(0, 0, 0, 0, ai[6], ai[4], ai[2], ai[0]); + _mm_store_si128((__m128i*)&values, val); b.flag = flag; b.values = values; // NOLINT(clang-analyzer-core.uninitialized.Assign) } -inline aint128 pack(__uint128_t a) +inline NF4Type pack(__uint128_t a) { - m128i _a = _mm_loadu_si128((m128i*)&a); - m128i b = _mm_set_epi32( + __m128i _a = _mm_loadu_si128((__m128i*)&a); + __m128i b = _mm_set_epi32( _mm_extract_epi16(_a, 3), _mm_extract_epi16(_a, 2), _mm_extract_epi16(_a, 1), @@ -136,10 +135,10 @@ inline aint128 pack(__uint128_t a) return m128i_to_uint128(b); } -inline aint128 pack(__uint128_t a, uint32_t flag) +inline NF4Type pack(__uint128_t a, uint32_t flag) { aint32 b0, b1, b2, b3; - m128i _a = _mm_loadu_si128((m128i*)&a); + __m128i _a = _mm_loadu_si128((__m128i*)&a); if (flag & 1) b0 = 
65536; @@ -161,11 +160,195 @@ inline aint128 pack(__uint128_t a, uint32_t flag) else b3 = _mm_extract_epi16(_a, 3); - m128i b = _mm_set_epi32(b3, b2, b1, b0); + __m128i b = _mm_set_epi32(b3, b2, b1, b0); return m128i_to_uint128(b); } +/* ================= Basic operations for NF4 ================= */ + +#if defined(__AVX2__) + +inline VecType CAST_TO_DOUBLE(HalfVecType x) +{ + return _mm256_castsi128_si256(x); +} + +inline void STORE_LOW(HalfVecType* address, VecType reg) +{ + _mm_store_si128(address, _mm256_castsi256_si128(reg)); +} + +inline NF4Type add(NF4Type a, NF4Type b) +{ + HalfVecType res; + VecType _a = CAST_TO_DOUBLE((HalfVecType)a); + VecType _b = CAST_TO_DOUBLE((HalfVecType)b); + STORE_LOW(&res, ADD_MOD(_a, _b, F4)); + return (NF4Type)res; +} + +inline NF4Type sub(NF4Type a, NF4Type b) +{ + HalfVecType res; + VecType _a = CAST_TO_DOUBLE((HalfVecType)a); + VecType _b = CAST_TO_DOUBLE((HalfVecType)b); + STORE_LOW(&res, SUB_MOD(_a, _b, F4)); + return (NF4Type)res; +} + +inline NF4Type mul(NF4Type a, NF4Type b) +{ + HalfVecType res; + VecType _a = CAST_TO_DOUBLE((HalfVecType)a); + VecType _b = CAST_TO_DOUBLE((HalfVecType)b); + STORE_LOW(&res, MULFULL_MOD(_a, _b, F4)); + return (NF4Type)res; +} + +inline void +add_buf_to_two_bufs_rem(unsigned n, NF4Type* x, NF4Type* x_half, NF4Type* y) +{ + // add last _y[] to x and x_next + HalfVecType* _x = reinterpret_cast(x); + HalfVecType* _x_half = reinterpret_cast(x_half); + HalfVecType* _y = reinterpret_cast(y); + for (unsigned i = 0; i < n; ++i) { + VecType _x_p = CAST_TO_DOUBLE(_x[i]); + VecType _x_next_p = CAST_TO_DOUBLE(_x_half[i]); + VecType _y_p = CAST_TO_DOUBLE(_y[i]); + + STORE_LOW(_x + i, ADD_MOD(_x_p, _y_p, F4)); + STORE_LOW(_x_half + i, ADD_MOD(_x_next_p, _y_p, F4)); + } +} + +inline void hadamard_mul_rem(unsigned n, NF4Type* x, NF4Type* y) +{ + HalfVecType* _x = reinterpret_cast(x); + HalfVecType* _y = reinterpret_cast(y); + for (unsigned i = 0; i < n; ++i) { + VecType _x_p = CAST_TO_DOUBLE(_x[i]); + 
VecType _y_p = CAST_TO_DOUBLE(_y[i]); + + STORE_LOW(_x + i, MULFULL_MOD(_x_p, _y_p, F4)); + } +} + +inline void +hadamard_mul_doubled_rem(unsigned n, NF4Type* x, NF4Type* x_half, NF4Type* y) +{ + HalfVecType* _x = reinterpret_cast(x); + HalfVecType* _x_half = reinterpret_cast(x_half); + HalfVecType* _y = reinterpret_cast(y); + for (unsigned i = 0; i < n; ++i) { + VecType _x_p = CAST_TO_DOUBLE(_x[i]); + VecType _x_next_p = CAST_TO_DOUBLE(_x_half[i]); + VecType _y_p = CAST_TO_DOUBLE(_y[i]); + + STORE_LOW(_x + i, MULFULL_MOD(_x_p, _y_p, F4)); + STORE_LOW(_x_half + i, MULFULL_MOD(_x_next_p, _y_p, F4)); + } +} + +#elif defined(__SSE4_1__) + +inline NF4Type add(NF4Type a, NF4Type b) +{ + VecType res; + STORE(&res, ADD_MOD((VecType)a, (VecType)b, F4)); + return (NF4Type)res; +} + +inline NF4Type sub(NF4Type a, NF4Type b) +{ + VecType res; + STORE(&res, SUB_MOD((VecType)a, (VecType)b, F4)); + return (NF4Type)res; +} + +inline NF4Type mul(NF4Type a, NF4Type b) +{ + VecType res; + STORE(&res, MULFULL_MOD((VecType)a, (VecType)b, F4)); + return (NF4Type)res; +} + +inline void +add_buf_to_two_bufs_rem(unsigned n, NF4Type* x, NF4Type* x_half, NF4Type* y) +{ + // do nothing +} + +inline void hadamard_mul_rem(unsigned n, NF4Type* x, NF4Type* y) +{ + // do nothing +} + +inline void +hadamard_mul_doubled_rem(unsigned n, NF4Type* x, NF4Type* x_half, NF4Type* y) +{ + // do nothing +} + +#endif + +/* ==================== Operations for NF4 =================== */ + +/** Add buffer `y` to two halves of `x`. 
`x` is of length `n` */ +inline void add_buf_to_two_bufs(unsigned n, NF4Type* _x, NF4Type* _y) +{ + unsigned i; + VecType* x = reinterpret_cast(_x); + VecType* y = reinterpret_cast(_y); + + const unsigned ratio = sizeof(*x) / sizeof(*_x); + const unsigned half_len = n / 2; + const unsigned vec_len = half_len / ratio; + const unsigned num_len = vec_len * ratio; + const unsigned rem_len = half_len - num_len; + + NF4Type* x_half = _x + half_len; + VecType* x_next = reinterpret_cast(x_half); + + // add y to the first half of `x` + for (i = 0; i < vec_len; ++i) { + x[i] = ADD_MOD(x[i], y[i], F4); + } + + // add y to the second half of `x` + for (i = 0; i < vec_len; ++i) { + x_next[i] = ADD_MOD(x_next[i], y[i], F4); + } + + if (rem_len > 0) { + add_buf_to_two_bufs_rem( + rem_len, _x + num_len, x_half + num_len, _y + num_len); + } +} + +inline void hadamard_mul(unsigned n, NF4Type* _x, NF4Type* _y) +{ + unsigned i; + VecType* x = reinterpret_cast(_x); + VecType* y = reinterpret_cast(_y); + + const unsigned ratio = sizeof(*x) / sizeof(*_x); + const unsigned vec_len = n / ratio; + const unsigned num_len = vec_len * ratio; + const unsigned rem_len = n - num_len; + + // multiply y to the first half of `x` + for (i = 0; i < vec_len; ++i) { + x[i] = MULFULL_MOD(x[i], y[i], F4); + } + + if (rem_len > 0) { + // add last _y[] to x + hadamard_mul_rem(rem_len, _x + num_len, _y + num_len); + } +} + } // namespace simd } // namespace quadiron From 542c81080427b1d8640f0bcb4e86bea1a0faafe4 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Fri, 5 Oct 2018 13:44:34 +0200 Subject: [PATCH 14/77] SIMD FNT: vectorised operations for FNT --- src/simd_fnt.h | 555 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 555 insertions(+) create mode 100644 src/simd_fnt.h diff --git a/src/simd_fnt.h b/src/simd_fnt.h new file mode 100644 index 00000000..fc92b2d0 --- /dev/null +++ b/src/simd_fnt.h @@ -0,0 +1,555 @@ +/* + * Copyright 2017-2018 Scality + * + * Redistribution and use in source 
and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __QUAD_SIMD_FNT_H__ +#define __QUAD_SIMD_FNT_H__ + +#include + +namespace quadiron { +namespace simd { + +/* ================= Vectorized Operations ================= */ + +// butterfly CT with r == 1 +template +inline void BUTTERFLY_1(VecType* x, VecType* y, T q) +{ + VecType add = ADD_MOD(*x, *y, q); + *y = SUB_MOD(*x, *y, q); + *x = add; +} + +// butterfly CT with r == q - 1 +template +inline void BUTTERFLY_2(VecType* x, VecType* y, T q) +{ + VecType add = ADD_MOD(*x, *y, q); + *x = SUB_MOD(*x, *y, q); + *y = add; +} + +// butterfly CT with 1 < r < q - 1 +template +inline void BUTTERFLY_3(VecType c, VecType* x, VecType* y, T q) +{ + VecType z = MUL_MOD(c, *y, q); + *y = SUB_MOD(*x, z, q); + *x = ADD_MOD(*x, z, q); +} + +template +inline void BUTTERFLY_CT(T rp1, VecType c, VecType* x, VecType* y, T q) +{ + if (rp1 == 2) { + BUTTERFLY_1(x, y, q); + } else if (rp1 < q) { + BUTTERFLY_3(c, x, y, q); + } else { + BUTTERFLY_2(x, y, q); + } +} + +// butterfly GS w/ r = q - 1 +template +inline void BUTTERFLY_4(VecType* x, VecType* y, T q) +{ + VecType add = ADD_MOD(*x, *y, q); + *y = SUB_MOD(*y, *x, q); + *x = add; +} + +// butterfly GS w/ 1 < r < q - 1 +// x = x + y mod q +// y = z * (x - y) mod q +template +inline void BUTTERFLY_5(VecType c, VecType* x, VecType* y, T q) +{ + VecType sub = SUB_MOD(*x, *y, q); + *x = ADD_MOD(*x, *y, q); + *y = MUL_MOD(c, sub, q); +} + +template +inline void BUTTERFLY_GS(T rp1, VecType c, VecType* x, VecType* y, T q) +{ + if (rp1 == 2) { + BUTTERFLY_1(x, y, q); + } else if (rp1 < q) { + BUTTERFLY_5(c, x, y, q); + } else { + BUTTERFLY_4(x, y, q); + } +} + +template +inline VecType BUTTERFLY_GS_SIMPLE(T rp1, VecType c, VecType x, T q) +{ + if (rp1 == 2) { + return x; + } else if (rp1 < q) { + return MUL_MOD(c, x, q); + } else { + return NEG_MOD(x, q); + } +} + +/** + * Vectorized butterly CT step + * + * For each pair (P, Q) = (buf[i], buf[i + m]) for step = 2 * m and coef `r` + * P = P + r * Q + * Q = P - r * Q + * + * 
@param buf - working buffers + * @param r - coefficient + * @param start - index of buffer among `m` ones + * @param m - current group size + * @param step - next loop + * @param len - number of vectors per buffer + * @param card - modulo cardinal + */ +template +inline void butterfly_ct_step( + vec::Buffers& buf, + T r, + unsigned start, + unsigned m, + unsigned step, + size_t len, + T card) +{ + if (len == 0) { + return; + } + const T rp1 = r + 1; + VecType c = SET1(r); + + const size_t end = (len > 1) ? len - 1 : 0; + const unsigned bufs_nb = buf.get_n(); + // #pragma omp parallel for + // #pragma unroll + const std::vector& mem = buf.get_mem(); + for (unsigned i = start; i < bufs_nb; i += step) { + VecType x1, y1; + VecType x2, y2; + VecType* __restrict p = reinterpret_cast(mem[i]); + VecType* __restrict q = reinterpret_cast(mem[i + m]); + + // #pragma omp parallel for + size_t j = 0; + // #pragma unroll + for (; j < end; j += 2) { + x1 = LOAD(p + j); + y1 = LOAD(q + j); + + BUTTERFLY_CT(rp1, c, &x1, &y1, card); + + x2 = LOAD(p + j + 1); + y2 = LOAD(q + j + 1); + + BUTTERFLY_CT(rp1, c, &x2, &y2, card); + + // Store back to memory + STORE(p + j, x1); + STORE(p + j + 1, x2); + STORE(q + j, y1); + STORE(q + j + 1, y2); + } + for (; j < len; ++j) { + x1 = LOAD(p + j); + y1 = LOAD(q + j); + + BUTTERFLY_CT(rp1, c, &x1, &y1, card); + + // Store back to memory + STORE(p + j, x1); + STORE(q + j, y1); + } + } +} + +template +inline static void do_butterfly_ct_2_layers( + const std::vector& mem, + T r1, + T r2, + T r3, + unsigned start, + unsigned m, + size_t len, + T card) +{ + const T r1p1 = r1 + 1; + const T r2p1 = r2 + 1; + const T r3p1 = r3 + 1; + + VecType c1 = SET1(r1); + VecType c2 = SET1(r2); + VecType c3 = SET1(r3); + + VecType* __restrict p = reinterpret_cast(mem[start]); + VecType* __restrict q = reinterpret_cast(mem[start + m]); + VecType* __restrict r = reinterpret_cast(mem[start + 2 * m]); + VecType* __restrict s = reinterpret_cast(mem[start + 3 * m]); + + 
// #pragma omp parallel for + size_t j = 0; + const size_t end = (len > 1) ? len - 1 : 0; + // #pragma unroll + while (j < end) { + // First layer (c1, x, y) & (c1, u, v) + VecType x1 = LOAD(p); + VecType x2 = LOAD(p + 1); + VecType y1 = LOAD(q); + VecType y2 = LOAD(q + 1); + + BUTTERFLY_CT(r1p1, c1, &x1, &y1, card); + BUTTERFLY_CT(r1p1, c1, &x2, &y2, card); + + VecType u1 = LOAD(r); + VecType u2 = LOAD(r + 1); + VecType v1 = LOAD(s); + VecType v2 = LOAD(s + 1); + + BUTTERFLY_CT(r1p1, c1, &u1, &v1, card); + BUTTERFLY_CT(r1p1, c1, &u2, &v2, card); + + // Second layer (c2, x, u) & (c3, y, v) + BUTTERFLY_CT(r2p1, c2, &x1, &u1, card); + BUTTERFLY_CT(r2p1, c2, &x2, &u2, card); + + BUTTERFLY_CT(r3p1, c3, &y1, &v1, card); + BUTTERFLY_CT(r3p1, c3, &y2, &v2, card); + + // Store back to memory + STORE(p, x1); + STORE(p + 1, x2); + STORE(q, y1); + STORE(q + 1, y2); + + STORE(r, u1); + STORE(r + 1, u2); + STORE(s, v1); + STORE(s + 1, v2); + p = p + 2; + q = q + 2; + r = r + 2; + s = s + 2; + j = j + 2; + }; + + for (; j < len; ++j) { + // First layer (c1, x, y) & (c1, u, v) + VecType x1 = LOAD(p + j); + VecType y1 = LOAD(q + j); + VecType u1 = LOAD(r + j); + VecType v1 = LOAD(s + j); + + // BUTTERFLY_3_test(c1, &x1, &y1, &u1, &v1, card); + BUTTERFLY_CT(r1p1, c1, &x1, &y1, card); + BUTTERFLY_CT(r1p1, c1, &u1, &v1, card); + BUTTERFLY_CT(r2p1, c2, &x1, &u1, card); + BUTTERFLY_CT(r3p1, c3, &y1, &v1, card); + + // Store back to memory + STORE(p + j, x1); + STORE(q + j, y1); + STORE(r + j, u1); + STORE(s + j, v1); + } +} + +/** + * Vectorized butterly CT on two-layers at a time + * + * For each quadruple + * (P, Q, R, S) = (buf[i], buf[i + m], buf[i + 2 * m], buf[i + 3 * m]) + * First layer: butterfly on (P, Q) and (R, S) for step = 2 * m + * coef r1 = W[start * n / (2 * m)] + * P = P + r1 * Q + * Q = P - r1 * Q + * R = R + r1 * S + * S = R - r1 * S + * Second layer: butterfly on (P, R) and (Q, S) for step = 4 * m + * coef r2 = W[start * n / (4 * m)] + * coef r3 = W[(start + m) * n 
/ (4 * m)] + * P = P + r2 * R + * R = P - r2 * R + * Q = Q + r3 * S + * S = Q - r3 * S + * + * @param buf - working buffers + * @param r1 - coefficient for the 1st layer + * @param r2 - 1st coefficient for the 2nd layer + * @param r3 - 2nd coefficient for the 2nd layer + * @param start - index of buffer among `m` ones + * @param m - current group size + * @param len - number of vectors per buffer + * @param card - modulo cardinal + */ +template +inline void butterfly_ct_two_layers_step( + vec::Buffers& buf, + T r1, + T r2, + T r3, + unsigned start, + unsigned m, + size_t len, + T card) +{ + if (len == 0) { + return; + } + const unsigned step = m << 2; + const unsigned bufs_nb = buf.get_n(); + + // #pragma omp parallel for + // #pragma unroll + const std::vector& mem = buf.get_mem(); + for (unsigned i = start; i < bufs_nb; i += step) { + do_butterfly_ct_2_layers(mem, r1, r2, r3, i, m, len, card); + } +} + +/** + * Vectorized butterly GS step + * + * For each pair (P, Q) = (buf[i], buf[i + m]) for step = 2 * m and coef `r` + * P = P + Q + * Q = r * (P - Q) + * + * @param buf - working buffers + * @param r - coefficient + * @param start - index of buffer among `m` ones + * @param m - current group size + * @param len - number of vectors per buffer + * @param card - modulo cardinal + */ +template +inline void butterfly_gs_step( + vec::Buffers& buf, + T r, + unsigned start, + unsigned m, + size_t len, + T card) +{ + if (len == 0) { + return; + } + const unsigned step = m << 1; + const T rp1 = r + 1; + VecType c = SET1(r); + + const size_t end = (len > 3) ? 
len - 3 : 0; + const unsigned bufs_nb = buf.get_n(); + // #pragma omp parallel for + // #pragma unroll + const std::vector& mem = buf.get_mem(); + for (unsigned i = start; i < bufs_nb; i += step) { + VecType x1, x2, x3, x4; + VecType y1, y2, y3, y4; + VecType* __restrict p = reinterpret_cast(mem[i]); + VecType* __restrict q = reinterpret_cast(mem[i + m]); + + // #pragma omp parallel for + size_t j = 0; + // #pragma unroll + for (; j < end; j += 4) { + x1 = LOAD(p + j); + x2 = LOAD(p + j + 1); + x3 = LOAD(p + j + 2); + x4 = LOAD(p + j + 3); + y1 = LOAD(q + j); + y2 = LOAD(q + j + 1); + y3 = LOAD(q + j + 2); + y4 = LOAD(q + j + 3); + + BUTTERFLY_GS(rp1, c, &x1, &y1, card); + BUTTERFLY_GS(rp1, c, &x2, &y2, card); + BUTTERFLY_GS(rp1, c, &x3, &y3, card); + BUTTERFLY_GS(rp1, c, &x4, &y4, card); + + // Store back to memory + STORE(p + j, x1); + STORE(p + j + 1, x2); + STORE(p + j + 2, x3); + STORE(p + j + 3, x4); + STORE(q + j, y1); + STORE(q + j + 1, y2); + STORE(q + j + 2, y3); + STORE(q + j + 3, y4); + } + for (; j < len; ++j) { + x1 = LOAD(p + j); + y1 = LOAD(q + j); + + BUTTERFLY_GS(rp1, c, &x1, &y1, card); + + // Store back to memory + STORE(p + j, x1); + STORE(q + j, y1); + } + } +} + +/** + * Vectorized butterly GS step + * + * For each pair (P, Q) = (buf[i], buf[i + m]) for step = 2 * m and coef `r` + * Q = r * P + * + * @param buf - working buffers + * @param r - coefficient + * @param start - index of buffer among `m` ones + * @param m - current group size + * @param len - number of vectors per buffer + * @param card - modulo cardinal + */ +template +inline void butterfly_gs_step_simple( + vec::Buffers& buf, + T r, + unsigned start, + unsigned m, + size_t len, + T card) +{ + if (len == 0) { + return; + } + const unsigned step = m << 1; + const T rp1 = r + 1; + VecType c = SET1(r); + + const size_t end = (len > 1) ? 
len - 1 : 0; + const unsigned bufs_nb = buf.get_n(); + // #pragma omp parallel for + // #pragma unroll + const std::vector& mem = buf.get_mem(); + for (unsigned i = start; i < bufs_nb; i += step) { + VecType x1, y1; + VecType x2, y2; + VecType* __restrict p = reinterpret_cast(mem[i]); + VecType* __restrict q = reinterpret_cast(mem[i + m]); + + // #pragma omp parallel for + size_t j = 0; + // #pragma unroll + for (; j < end; j += 2) { + x1 = LOAD(p + j); + x2 = LOAD(p + j + 1); + + y1 = BUTTERFLY_GS_SIMPLE(rp1, c, x1, card); + y2 = BUTTERFLY_GS_SIMPLE(rp1, c, x2, card); + + // Store back to memory + STORE(q + j, y1); + STORE(q + j + 1, y2); + } + for (; j < len; ++j) { + x1 = LOAD(p + j); + + y1 = BUTTERFLY_GS_SIMPLE(rp1, c, x1, card); + + // Store back to memory + STORE(q + j, y1); + } + } +} + +template +inline void encode_post_process( + vec::Buffers& output, + std::vector& props, + off_t offset, + unsigned code_len, + T threshold, + size_t vecs_nb) +{ + const unsigned element_size = sizeof(T); + const unsigned vec_size = countof(); + const T max = 1 << (element_size * 8 - 1); + const VecType _threshold = SET1(threshold); + const VecType mask_hi = SET1(max); + + // #pragma unroll + const std::vector& mem = output.get_mem(); + for (unsigned frag_id = 0; frag_id < code_len; ++frag_id) { + VecType* __restrict buf = reinterpret_cast(mem[frag_id]); + + size_t vec_id = 0; + size_t end = (vecs_nb > 3) ? 
vecs_nb - 3 : 0; + // #pragma unroll + for (; vec_id < end; vec_id += 4) { + VecType a1 = LOAD(buf + vec_id); + VecType a2 = LOAD(buf + vec_id + 1); + VecType a3 = LOAD(buf + vec_id + 2); + VecType a4 = LOAD(buf + vec_id + 3); + + if (TESTZ(a1, _threshold) == 0) { + const off_t curr_offset = offset + vec_id * vec_size; + ADD_PROPS( + props[frag_id], _threshold, mask_hi, a1, curr_offset, max); + } + if (TESTZ(a2, _threshold) == 0) { + const off_t curr_offset = offset + (vec_id + 1) * vec_size; + ADD_PROPS( + props[frag_id], _threshold, mask_hi, a2, curr_offset, max); + } + if (TESTZ(a3, _threshold) == 0) { + const off_t curr_offset = offset + (vec_id + 2) * vec_size; + ADD_PROPS( + props[frag_id], _threshold, mask_hi, a3, curr_offset, max); + } + if (TESTZ(a4, _threshold) == 0) { + const off_t curr_offset = offset + (vec_id + 3) * vec_size; + ADD_PROPS( + props[frag_id], _threshold, mask_hi, a4, curr_offset, max); + } + } + for (; vec_id < vecs_nb; ++vec_id) { + VecType a = LOAD(buf + vec_id); + uint32_t c = TESTZ(a, _threshold); + if (c == 0) { + const off_t curr_offset = offset + vec_id * vec_size; + ADD_PROPS( + props[frag_id], _threshold, mask_hi, a, curr_offset, max); + } + } + } +} + +} // namespace simd +} // namespace quadiron + +#endif From c100aabe74a24b979ece784f55ebb890e4052f2c Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Fri, 26 Oct 2018 15:25:55 +0200 Subject: [PATCH 15/77] SIMD 256: update --- src/simd_256.h | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/src/simd_256.h b/src/simd_256.h index d06f3218..e18faa44 100644 --- a/src/simd_256.h +++ b/src/simd_256.h @@ -53,6 +53,8 @@ typedef __m128i HalfVecType; typedef __uint128_t NF4Type; typedef uint32_t MaskIntType; +/* ============= Constant variable ============ */ + #define F4_u32 _mm256_set1_epi32(65537) #define F4m1_u32 _mm256_set1_epi32(65536) #define F3_u32 _mm256_set1_epi32(257) @@ -61,15 +63,24 @@ typedef uint32_t MaskIntType; #define F3_u16 
_mm256_set1_epi16(257) #define F3m1_u16 _mm256_set1_epi16(256) -#define CARD(q) (EITHER(q == F3, F3_u32, F4_u32)) -#define CARD_M_1(q) (EITHER(q == F3, F3m1_u32, F4m1_u32)) - -/* ============= Essential Operations for AVX2 w/ both u16 & u32 ============ */ - #define ZERO (_mm256_setzero_si256()) #define ONE16 (_mm256_set1_epi16(1)) #define ONE32 (_mm256_set1_epi32(1)) +/* ============= Essential Operations for AVX2 w/ both u16 & u32 ============ */ + +template +inline VecType CARD(T q) +{ + return (q == F3) ? F3_u32 : F4_u32; +} + +template +inline VecType CARD_M_1(T q) +{ + return (q == F3) ? F3m1_u32 : F4m1_u32; +} + inline VecType LOAD(VecType* address) { return _mm256_load_si256(address); From d00d648b66ece56784f4efb8d1efe12cda5e8a1c Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Fri, 26 Oct 2018 15:41:14 +0200 Subject: [PATCH 16/77] SIMD: use auto for return type of MVMSK8 --- src/simd_128.h | 1 - src/simd_256.h | 1 - src/simd_basic.h | 4 ++-- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/simd_128.h b/src/simd_128.h index afbfe70f..21a2b82d 100644 --- a/src/simd_128.h +++ b/src/simd_128.h @@ -37,7 +37,6 @@ namespace quadiron { namespace simd { typedef __m128i VecType; -typedef uint32_t MaskIntType; #define F4_u32 _mm_set1_epi32(65537) #define F4m1_u32 _mm_set1_epi32(65536) diff --git a/src/simd_256.h b/src/simd_256.h index e18faa44..c2099ee2 100644 --- a/src/simd_256.h +++ b/src/simd_256.h @@ -51,7 +51,6 @@ namespace simd { typedef __m256i VecType; typedef __m128i HalfVecType; typedef __uint128_t NF4Type; -typedef uint32_t MaskIntType; /* ============= Constant variable ============ */ diff --git a/src/simd_basic.h b/src/simd_basic.h index 7585734d..0f142ea4 100644 --- a/src/simd_basic.h +++ b/src/simd_basic.h @@ -145,7 +145,7 @@ inline void ADD_PROPS( { const VecType b = CMPEQ32(threshold, symb); const VecType c = AND(mask, b); - MaskIntType d = MVMSK8(c); + auto d = MVMSK8(c); const unsigned element_size = sizeof(uint32_t); while (d > 
0) { unsigned byte_idx = __builtin_ctz(d); @@ -260,7 +260,7 @@ inline void ADD_PROPS( { const VecType b = CMPEQ16(threshold, symb); const VecType c = AND(mask, b); - MaskIntType d = MVMSK8(c); + auto d = MVMSK8(c); const unsigned element_size = sizeof(uint16_t); while (d > 0) { unsigned byte_idx = __builtin_ctz(d); From fd2197b29ca1e3a4b3e75751d2db9dff2e15cce0 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Fri, 26 Oct 2018 15:52:30 +0200 Subject: [PATCH 17/77] SIMD: move CARD & CARD_M_1 to simd_basic.h --- src/simd.h | 2 -- src/simd_128.h | 14 +++++++------- src/simd_256.h | 12 ------------ src/simd_basic.h | 12 ++++++++++++ 4 files changed, 19 insertions(+), 21 deletions(-) diff --git a/src/simd.h b/src/simd.h index d70fcb2f..41e4935e 100644 --- a/src/simd.h +++ b/src/simd.h @@ -47,8 +47,6 @@ namespace quadiron { */ namespace simd { -#define EITHER(x, a, b) (((x)) ? (a) : (b)) - // Vectorized operations are implemented in appropriated headers simd*.h } // namespace simd diff --git a/src/simd_128.h b/src/simd_128.h index 21a2b82d..9ad4051c 100644 --- a/src/simd_128.h +++ b/src/simd_128.h @@ -38,6 +38,8 @@ namespace simd { typedef __m128i VecType; +/* ============= Constant variable ============ */ + #define F4_u32 _mm_set1_epi32(65537) #define F4m1_u32 _mm_set1_epi32(65536) #define F3_u32 _mm_set1_epi32(257) @@ -46,15 +48,12 @@ typedef __m128i VecType; #define F3_u16 _mm_set1_epi16(257) #define F3m1_u16 _mm_set1_epi16(256) -#define CARD(q) (EITHER(q == F3, F3_u32, F4_u32)) -#define CARD_M_1(q) (EITHER(q == F3, F3m1_u32, F4m1_u32)) - -/* ============= Essential Operations for AVX2 w/ both u16 & u32 ============ */ - #define ZERO (_mm_setzero_si128()) #define ONE16 (_mm_set1_epi16(1)) #define ONE32 (_mm_set1_epi32(1)) +/* ============= Essential Operations for SSE w/ both u16 & u32 ============ */ + inline VecType LOAD(VecType* address) { return _mm_load_si128(address); @@ -89,7 +88,7 @@ inline uint16_t TESTZ(VecType x, VecType y) return _mm_testz_si128(x, y); } 
-/* ================= Essential Operations for AVX2 w/ u32 ================= */ +/* ================= Essential Operations for SSE w/ u32 ================= */ inline VecType SET1(uint32_t val) { @@ -120,11 +119,12 @@ inline VecType MINU32(VecType x, VecType y) { return _mm_min_epu32(x, y); } + #define MASK8_LO (_mm_set1_epi16(0x80)) #define BLEND8(x, y, mask) (_mm_blendv_epi8(x, y, mask)) #define BLEND16(x, y, imm8) (_mm_blend_epi16(x, y, imm8)) -/* ================= Essential Operations for AVX2 w/ u16 ================= */ +/* ================= Essential Operations for SSE w/ u16 ================= */ inline VecType SET1(uint16_t val) { diff --git a/src/simd_256.h b/src/simd_256.h index c2099ee2..09253a16 100644 --- a/src/simd_256.h +++ b/src/simd_256.h @@ -68,18 +68,6 @@ typedef __uint128_t NF4Type; /* ============= Essential Operations for AVX2 w/ both u16 & u32 ============ */ -template -inline VecType CARD(T q) -{ - return (q == F3) ? F3_u32 : F4_u32; -} - -template -inline VecType CARD_M_1(T q) -{ - return (q == F3) ? F3m1_u32 : F4m1_u32; -} - inline VecType LOAD(VecType* address) { return _mm256_load_si256(address); diff --git a/src/simd_basic.h b/src/simd_basic.h index 0f142ea4..f014d6d7 100644 --- a/src/simd_basic.h +++ b/src/simd_basic.h @@ -36,6 +36,18 @@ namespace quadiron { namespace simd { +template +inline VecType CARD(T q) +{ + return (q == F3) ? F3_u32 : F4_u32; +} + +template +inline VecType CARD_M_1(T q) +{ + return (q == F3) ? 
F3m1_u32 : F4m1_u32; +} + /* ================= Basic Operations for u32 ================= */ /** From 9e11defe181fe639add590caf848683fa96ef122 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Fri, 26 Oct 2018 15:55:21 +0200 Subject: [PATCH 18/77] SIMD 128 & 256: move MASK8_LO to const variable groups --- src/simd_128.h | 3 ++- src/simd_256.h | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/simd_128.h b/src/simd_128.h index 9ad4051c..5dd4c332 100644 --- a/src/simd_128.h +++ b/src/simd_128.h @@ -52,6 +52,8 @@ typedef __m128i VecType; #define ONE16 (_mm_set1_epi16(1)) #define ONE32 (_mm_set1_epi32(1)) +#define MASK8_LO (_mm_set1_epi16(0x80)) + /* ============= Essential Operations for SSE w/ both u16 & u32 ============ */ inline VecType LOAD(VecType* address) @@ -120,7 +122,6 @@ inline VecType MINU32(VecType x, VecType y) return _mm_min_epu32(x, y); } -#define MASK8_LO (_mm_set1_epi16(0x80)) #define BLEND8(x, y, mask) (_mm_blendv_epi8(x, y, mask)) #define BLEND16(x, y, imm8) (_mm_blend_epi16(x, y, imm8)) diff --git a/src/simd_256.h b/src/simd_256.h index 09253a16..c82f810a 100644 --- a/src/simd_256.h +++ b/src/simd_256.h @@ -66,6 +66,8 @@ typedef __uint128_t NF4Type; #define ONE16 (_mm256_set1_epi16(1)) #define ONE32 (_mm256_set1_epi32(1)) +#define MASK8_LO (_mm256_set1_epi16(0x80)) + /* ============= Essential Operations for AVX2 w/ both u16 & u32 ============ */ inline VecType LOAD(VecType* address) @@ -134,7 +136,6 @@ inline VecType MINU32(VecType x, VecType y) return _mm256_min_epu32(x, y); } -#define MASK8_LO (_mm256_set1_epi16(0x80)) #define BLEND8(x, y, mask) (_mm256_blendv_epi8(x, y, mask)) #define BLEND16(x, y, imm8) (_mm256_blend_epi16(x, y, imm8)) From a23a0abe891ac2974f9b81031056aa552be6c7a8 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Fri, 26 Oct 2018 16:01:00 +0200 Subject: [PATCH 19/77] SIMD: use macro for shiftr --- src/simd_128.h | 10 ++-------- src/simd_256.h | 10 ++-------- src/simd_basic.h | 12 ++++++------ 3 files 
changed, 10 insertions(+), 22 deletions(-) diff --git a/src/simd_128.h b/src/simd_128.h index 5dd4c332..6ce80d5d 100644 --- a/src/simd_128.h +++ b/src/simd_128.h @@ -73,14 +73,6 @@ inline VecType XOR(VecType x, VecType y) { return _mm_xor_si128(x, y); } -inline VecType SHIFTR_1(VecType x) -{ - return _mm_srli_si128(x, 1); -} -inline VecType SHIFTR_2(VecType x) -{ - return _mm_srli_si128(x, 2); -} inline uint16_t MVMSK8(VecType x) { return _mm_movemask_epi8(x); @@ -90,6 +82,8 @@ inline uint16_t TESTZ(VecType x, VecType y) return _mm_testz_si128(x, y); } +#define SHIFTR(x, imm8) (_mm_srli_si128(x, imm8)) + /* ================= Essential Operations for SSE w/ u32 ================= */ inline VecType SET1(uint32_t val) diff --git a/src/simd_256.h b/src/simd_256.h index c82f810a..27efd934 100644 --- a/src/simd_256.h +++ b/src/simd_256.h @@ -87,14 +87,6 @@ inline VecType XOR(VecType x, VecType y) { return _mm256_xor_si256(x, y); } -inline VecType SHIFTR_1(VecType x) -{ - return _mm256_srli_si256(x, 1); -} -inline VecType SHIFTR_2(VecType x) -{ - return _mm256_srli_si256(x, 2); -} inline uint32_t MVMSK8(VecType x) { return _mm256_movemask_epi8(x); @@ -104,6 +96,8 @@ inline uint32_t TESTZ(VecType x, VecType y) return _mm256_testz_si256(x, y); } +#define SHIFTR(x, imm8) (_mm256_srli_si256(x, imm8)) + /* ================= Essential Operations for AVX2 w/ u32 ================= */ inline VecType SET1(uint32_t val) diff --git a/src/simd_basic.h b/src/simd_basic.h index f014d6d7..262add1c 100644 --- a/src/simd_basic.h +++ b/src/simd_basic.h @@ -107,8 +107,8 @@ inline VecType MUL_MOD(VecType x, VecType y, uint32_t q) VecType res = MUL32(x, y); VecType lo = (q == F3) ? BLEND8(ZERO, res, MASK8_LO) : BLEND16(ZERO, res, 0x55); - VecType hi = (q == F3) ? BLEND8(ZERO, SHIFTR_1(res), MASK8_LO) - : BLEND16(ZERO, SHIFTR_2(res), 0x55); + VecType hi = (q == F3) ? 
BLEND8(ZERO, SHIFTR(res, 1), MASK8_LO) + : BLEND16(ZERO, SHIFTR(res, 2), 0x55); return SUB_MOD(lo, hi, q); } @@ -132,8 +132,8 @@ inline VecType MULFULL_MOD(VecType x, VecType y, uint32_t q) VecType lo = (q == F3) ? BLEND8(ZERO, res, MASK8_LO) : BLEND16(ZERO, res, 0x55); - VecType hi = (q == F3) ? BLEND8(ZERO, SHIFTR_1(res), MASK8_LO) - : BLEND16(ZERO, SHIFTR_2(res), 0x55); + VecType hi = (q == F3) ? BLEND8(ZERO, SHIFTR(res, 1), MASK8_LO) + : BLEND16(ZERO, SHIFTR(res, 2), 0x55); return SUB_MOD(lo, hi, q); } @@ -225,7 +225,7 @@ inline VecType MUL_MOD(VecType x, VecType y, uint16_t q) { VecType res = MUL16(x, y); VecType lo = BLEND8(ZERO, res, MASK8_LO); - VecType hi = BLEND8(ZERO, SHIFTR_1(res), MASK8_LO); + VecType hi = BLEND8(ZERO, SHIFTR(res, 1), MASK8_LO); return SUB_MOD(lo, hi, q); } @@ -248,7 +248,7 @@ inline VecType MULFULL_MOD(VecType x, VecType y, uint16_t q) res = ADD16(res, AND(ONE16, cmp)); VecType lo = BLEND8(ZERO, res, MASK8_LO); - VecType hi = BLEND8(ZERO, SHIFTR_1(res), MASK8_LO); + VecType hi = BLEND8(ZERO, SHIFTR(res, 1), MASK8_LO); return SUB_MOD(lo, hi, q); } From 373afd18ccf416ecf2a7ed556ff294b01085cde1 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Fri, 26 Oct 2018 17:13:36 +0200 Subject: [PATCH 20/77] SIMD 128: use template functions --- src/simd_128.h | 86 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 51 insertions(+), 35 deletions(-) diff --git a/src/simd_128.h b/src/simd_128.h index 6ce80d5d..2361b6f3 100644 --- a/src/simd_128.h +++ b/src/simd_128.h @@ -83,70 +83,86 @@ inline uint16_t TESTZ(VecType x, VecType y) } #define SHIFTR(x, imm8) (_mm_srli_si128(x, imm8)) +#define BLEND8(x, y, mask) (_mm_blendv_epi8(x, y, mask)) +#define BLEND16(x, y, imm8) (_mm_blend_epi16(x, y, imm8)) -/* ================= Essential Operations for SSE w/ u32 ================= */ +/* ================= Essential Operations for SSE ================= */ +template +inline VecType SET1(T val); +template <> inline VecType SET1(uint32_t val) { return 
_mm_set1_epi32(val); } -inline VecType ADD32(VecType x, VecType y) -{ - return _mm_add_epi32(x, y); -} -inline VecType SUB32(VecType x, VecType y) -{ - return _mm_sub_epi32(x, y); -} -inline VecType MUL32(VecType x, VecType y) +template <> +inline VecType SET1(uint16_t val) { - return _mm_mullo_epi32(x, y); + return _mm_set1_epi16(val); } -inline VecType CMPEQ32(VecType x, VecType y) -{ - return _mm_cmpeq_epi32(x, y); -} -inline VecType CMPGT32(VecType x, VecType y) +template +inline VecType ADD(VecType x, VecType y); +template <> +inline VecType ADD(VecType x, VecType y) { - return _mm_cmpgt_epi32(x, y); + return _mm_add_epi32(x, y); } -inline VecType MINU32(VecType x, VecType y) +template <> +inline VecType ADD(VecType x, VecType y) { - return _mm_min_epu32(x, y); + return _mm_add_epi16(x, y); } -#define BLEND8(x, y, mask) (_mm_blendv_epi8(x, y, mask)) -#define BLEND16(x, y, imm8) (_mm_blend_epi16(x, y, imm8)) - -/* ================= Essential Operations for SSE w/ u16 ================= */ - -inline VecType SET1(uint16_t val) +template +inline VecType SUB(VecType x, VecType y); +template <> +inline VecType SUB(VecType x, VecType y) { - return _mm_set1_epi16(val); + return _mm_sub_epi32(x, y); } -inline VecType ADD16(VecType x, VecType y) +template <> +inline VecType SUB(VecType x, VecType y) { - return _mm_add_epi16(x, y); + return _mm_sub_epi16(x, y); } -inline VecType SUB16(VecType x, VecType y) + +template +inline VecType MUL(VecType x, VecType y); +template <> +inline VecType MUL(VecType x, VecType y) { - return _mm_sub_epi16(x, y); + return _mm_mullo_epi32(x, y); } -inline VecType MUL16(VecType x, VecType y) +template <> +inline VecType MUL(VecType x, VecType y) { return _mm_mullo_epi16(x, y); } -inline VecType CMPEQ16(VecType x, VecType y) +template +inline VecType CMPEQ(VecType x, VecType y); +template <> +inline VecType CMPEQ(VecType x, VecType y) +{ + return _mm_cmpeq_epi32(x, y); +} +template <> +inline VecType CMPEQ(VecType x, VecType y) { return 
_mm_cmpeq_epi16(x, y); } -inline VecType CMPGT16(VecType x, VecType y) + + +template +inline VecType MIN(VecType x, VecType y); +template <> +inline VecType MIN(VecType x, VecType y) { - return _mm_cmpgt_epi16(x, y); + return _mm_min_epu32(x, y); } -inline VecType MINU16(VecType x, VecType y) +template <> +inline VecType MIN(VecType x, VecType y) { return _mm_min_epu16(x, y); } From ec0991a5f0cb49ee041a203d4311bfb7e03cd88e Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Fri, 26 Oct 2018 17:13:48 +0200 Subject: [PATCH 21/77] SIMD 256: use template functions --- src/simd_256.h | 85 +++++++++++++++++++++++++++++--------------------- 1 file changed, 50 insertions(+), 35 deletions(-) diff --git a/src/simd_256.h b/src/simd_256.h index 27efd934..b3ab8e78 100644 --- a/src/simd_256.h +++ b/src/simd_256.h @@ -97,70 +97,85 @@ inline uint32_t TESTZ(VecType x, VecType y) } #define SHIFTR(x, imm8) (_mm256_srli_si256(x, imm8)) +#define BLEND8(x, y, mask) (_mm256_blendv_epi8(x, y, mask)) +#define BLEND16(x, y, imm8) (_mm256_blend_epi16(x, y, imm8)) -/* ================= Essential Operations for AVX2 w/ u32 ================= */ +/* ================= Essential Operations for AVX2 ================= */ +template +inline VecType SET1(T val); +template <> inline VecType SET1(uint32_t val) { return _mm256_set1_epi32(val); } -inline VecType ADD32(VecType x, VecType y) -{ - return _mm256_add_epi32(x, y); -} -inline VecType SUB32(VecType x, VecType y) -{ - return _mm256_sub_epi32(x, y); -} -inline VecType MUL32(VecType x, VecType y) +template <> +inline VecType SET1(uint16_t val) { - return _mm256_mullo_epi32(x, y); + return _mm256_set1_epi16(val); } -inline VecType CMPEQ32(VecType x, VecType y) -{ - return _mm256_cmpeq_epi32(x, y); -} -inline VecType CMPGT32(VecType x, VecType y) +template +inline VecType ADD(VecType x, VecType y); +template <> +inline VecType ADD(VecType x, VecType y) { - return _mm256_cmpgt_epi32(x, y); + return _mm256_add_epi32(x, y); } -inline VecType MINU32(VecType 
x, VecType y) +template <> +inline VecType ADD(VecType x, VecType y) { - return _mm256_min_epu32(x, y); + return _mm256_add_epi16(x, y); } -#define BLEND8(x, y, mask) (_mm256_blendv_epi8(x, y, mask)) -#define BLEND16(x, y, imm8) (_mm256_blend_epi16(x, y, imm8)) - -/* ================= Essential Operations for AVX2 w/ u16 ================= */ - -inline VecType SET1(uint16_t val) +template +inline VecType SUB(VecType x, VecType y); +template <> +inline VecType SUB(VecType x, VecType y) { - return _mm256_set1_epi16(val); + return _mm256_sub_epi32(x, y); } -inline VecType ADD16(VecType x, VecType y) +template <> +inline VecType SUB(VecType x, VecType y) { - return _mm256_add_epi16(x, y); + return _mm256_sub_epi16(x, y); } -inline VecType SUB16(VecType x, VecType y) + +template +inline VecType MUL(VecType x, VecType y); +template <> +inline VecType MUL(VecType x, VecType y) { - return _mm256_sub_epi16(x, y); + return _mm256_mullo_epi32(x, y); } -inline VecType MUL16(VecType x, VecType y) +template <> +inline VecType MUL(VecType x, VecType y) { return _mm256_mullo_epi16(x, y); } -inline VecType CMPEQ16(VecType x, VecType y) +template +inline VecType CMPEQ(VecType x, VecType y); +template <> +inline VecType CMPEQ(VecType x, VecType y) +{ + return _mm256_cmpeq_epi32(x, y); +} +template <> +inline VecType CMPEQ(VecType x, VecType y) { return _mm256_cmpeq_epi16(x, y); } -inline VecType CMPGT16(VecType x, VecType y) + +template +inline VecType MIN(VecType x, VecType y); +template <> +inline VecType MIN(VecType x, VecType y) { - return _mm256_cmpgt_epi16(x, y); + return _mm256_min_epu32(x, y); } -inline VecType MINU16(VecType x, VecType y) +template <> +inline VecType MIN(VecType x, VecType y) { return _mm256_min_epu16(x, y); } From fde40cc98f86adaed56d082630e0d33fb207c8b0 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Fri, 26 Oct 2018 17:20:54 +0200 Subject: [PATCH 22/77] SIMD Basic: use templated essential functions --- src/simd_basic.h | 161 
++++++++--------------------------------------- 1 file changed, 26 insertions(+), 135 deletions(-) diff --git a/src/simd_basic.h b/src/simd_basic.h index 262add1c..04f0dee3 100644 --- a/src/simd_basic.h +++ b/src/simd_basic.h @@ -48,20 +48,21 @@ inline VecType CARD_M_1(T q) return (q == F3) ? F3m1_u32 : F4m1_u32; } -/* ================= Basic Operations for u32 ================= */ +/* ================= Basic Operations ================= */ /** - * Modular addition for packed unsigned 32-bit integers + * Modular addition * * @param x input register * @param y input register * @param q modulo * @return (x + y) mod q */ -inline VecType ADD_MOD(VecType x, VecType y, uint32_t q) +template +inline VecType ADD_MOD(VecType x, VecType y, T q) { - VecType res = ADD32(x, y); - return MINU32(res, SUB32(res, CARD(q))); + VecType res = ADD(x, y); + return MIN(res, SUB(res, CARD(q))); } /** @@ -72,10 +73,11 @@ inline VecType ADD_MOD(VecType x, VecType y, uint32_t q) * @param q modulo * @return (x - y) mod q */ -inline VecType SUB_MOD(VecType x, VecType y, uint32_t q) +template +inline VecType SUB_MOD(VecType x, VecType y, T q) { - VecType res = SUB32(x, y); - return MINU32(res, ADD32(res, CARD(q))); + VecType res = SUB(x, y); + return MIN(res, ADD(res, CARD(q))); } /** @@ -85,10 +87,11 @@ inline VecType SUB_MOD(VecType x, VecType y, uint32_t q) * @param q modulo * @return (-x) mod q */ -inline VecType NEG_MOD(VecType x, uint32_t q) +template +inline VecType NEG_MOD(VecType x, T q) { - VecType res = SUB32(CARD(q), x); - return MINU32(res, SUB32(res, CARD(q))); + VecType res = SUB(CARD(q), x); + return MIN(res, SUB(res, CARD(q))); } /** @@ -102,9 +105,10 @@ inline VecType NEG_MOD(VecType x, uint32_t q) * @param q modulo * @return (x * y) mod q */ -inline VecType MUL_MOD(VecType x, VecType y, uint32_t q) +template +inline VecType MUL_MOD(VecType x, VecType y, T q) { - VecType res = MUL32(x, y); + VecType res = MUL(x, y); VecType lo = (q == F3) ? 
BLEND8(ZERO, res, MASK8_LO) : BLEND16(ZERO, res, 0x55); VecType hi = (q == F3) ? BLEND8(ZERO, SHIFTR(res, 1), MASK8_LO) @@ -122,13 +126,14 @@ inline VecType MUL_MOD(VecType x, VecType y, uint32_t q) * @param q modulo * @return (x * y) mod q */ -inline VecType MULFULL_MOD(VecType x, VecType y, uint32_t q) +template +inline VecType MULFULL_MOD(VecType x, VecType y, T q) { - VecType res = MUL32(x, y); + VecType res = MUL(x, y); // filter elements of both of a & b = card-1 - VecType cmp = AND(CMPEQ32(x, CARD_M_1(q)), CMPEQ32(y, CARD_M_1(q))); - res = (q == F3) ? XOR(res, AND(F4_u32, cmp)) : ADD32(res, AND(ONE32, cmp)); + VecType cmp = AND(CMPEQ(x, CARD_M_1(q)), CMPEQ(y, CARD_M_1(q))); + res = (q == F3) ? XOR(res, AND(F4_u32, cmp)) : ADD(res, AND(ONE32, cmp)); VecType lo = (q == F3) ? BLEND8(ZERO, res, MASK8_LO) : BLEND16(ZERO, res, 0x55); @@ -147,133 +152,19 @@ inline VecType MULFULL_MOD(VecType x, VecType y, uint32_t q) * @param offset offset in the data fragments * @param max a dummy variable */ +template inline void ADD_PROPS( Properties& props, VecType threshold, VecType mask, VecType symb, off_t offset, - uint32_t max) -{ - const VecType b = CMPEQ32(threshold, symb); - const VecType c = AND(mask, b); - auto d = MVMSK8(c); - const unsigned element_size = sizeof(uint32_t); - while (d > 0) { - unsigned byte_idx = __builtin_ctz(d); - off_t _offset = offset + byte_idx / element_size; - props.add(_offset, OOR_MARK); - d ^= 1 << byte_idx; - } -} - -/* ================= Basic Operations for u16 ================= */ - -/** - * Modular addition for packed unsigned 16-bit integers - * - * @param x input register - * @param y input register - * @param q modulo - * @return (x + y) mod q - */ -inline VecType ADD_MOD(VecType x, VecType y, uint16_t q) -{ - VecType res = ADD16(x, y); - return MINU16(res, SUB16(res, F3_u16)); -} - -/** - * Modular subtraction for packed unsigned 16-bit integers - * - * @param x input register - * @param y input register - * @param q modulo - * 
@return (x - y) mod q - */ -inline VecType SUB_MOD(VecType x, VecType y, uint16_t q) -{ - VecType res = SUB16(x, y); - return MINU16(res, SUB16(ADD16(x, F3_u16), y)); -} - -/** - * Modular negation for packed unsigned 16-bit integers - * - * @param x input register - * @param q modulo - * @return (-x) mod q - */ -inline VecType NEG_MOD(VecType x, uint16_t q) -{ - VecType res = SUB16(F3_u16, x); - return MINU16(res, SUB16(res, F3_u16)); -} - -/** - * Modular multiplication for packed unsigned 16-bit integers - * - * @note We assume that at least `x` or `y` is less than `q-1` so it's - * not necessary to verify overflow on multiplying elements - * - * @param x input register - * @param y input register - * @param q modulo - * @return (x * y) mod q - */ -inline VecType MUL_MOD(VecType x, VecType y, uint16_t q) -{ - VecType res = MUL16(x, y); - VecType lo = BLEND8(ZERO, res, MASK8_LO); - VecType hi = BLEND8(ZERO, SHIFTR(res, 1), MASK8_LO); - return SUB_MOD(lo, hi, q); -} - -/** - * Modular general multiplication for packed unsigned 16-bit integers - * - * @note It's necessary to verify overflow on multiplying elements - * - * @param x input register - * @param y input register - * @param q modulo - * @return (x * y) mod q - */ -inline VecType MULFULL_MOD(VecType x, VecType y, uint16_t q) -{ - VecType res = MUL16(x, y); - - // filter elements of both of a & b = card-1 - VecType cmp = AND(CMPEQ16(x, F3m1_u16), CMPEQ16(y, F3m1_u16)); - res = ADD16(res, AND(ONE16, cmp)); - - VecType lo = BLEND8(ZERO, res, MASK8_LO); - VecType hi = BLEND8(ZERO, SHIFTR(res, 1), MASK8_LO); - return SUB_MOD(lo, hi, q); -} - -/** - * Update property for a given register for packed unsigned 32-bit integers - * - * @param props properties bound to fragments - * @param threshold register storing max value in its elements - * @param mask a specific mask - * @param symb input register - * @param offset offset in the data fragments - * @param max a dummy variable - */ -inline void ADD_PROPS( - 
Properties& props, - VecType threshold, - VecType mask, - VecType symb, - off_t offset, - uint16_t max) + T max) { - const VecType b = CMPEQ16(threshold, symb); + const VecType b = CMPEQ(threshold, symb); const VecType c = AND(mask, b); auto d = MVMSK8(c); - const unsigned element_size = sizeof(uint16_t); + const unsigned element_size = sizeof(T); while (d > 0) { unsigned byte_idx = __builtin_ctz(d); off_t _offset = offset + byte_idx / element_size; From 70ace14733e1e00b06f66ba4709b8786bc3565f6 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Fri, 26 Oct 2018 17:23:40 +0200 Subject: [PATCH 23/77] SIMD Basic: use const & curly braces --- src/simd_basic.h | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/src/simd_basic.h b/src/simd_basic.h index 04f0dee3..85aa5f16 100644 --- a/src/simd_basic.h +++ b/src/simd_basic.h @@ -61,7 +61,7 @@ inline VecType CARD_M_1(T q) template inline VecType ADD_MOD(VecType x, VecType y, T q) { - VecType res = ADD(x, y); + const VecType res = ADD(x, y); return MIN(res, SUB(res, CARD(q))); } @@ -76,7 +76,7 @@ inline VecType ADD_MOD(VecType x, VecType y, T q) template inline VecType SUB_MOD(VecType x, VecType y, T q) { - VecType res = SUB(x, y); + const VecType res = SUB(x, y); return MIN(res, ADD(res, CARD(q))); } @@ -90,7 +90,7 @@ inline VecType SUB_MOD(VecType x, VecType y, T q) template inline VecType NEG_MOD(VecType x, T q) { - VecType res = SUB(CARD(q), x); + const VecType res = SUB(CARD(q), x); return MIN(res, SUB(res, CARD(q))); } @@ -108,10 +108,10 @@ inline VecType NEG_MOD(VecType x, T q) template inline VecType MUL_MOD(VecType x, VecType y, T q) { - VecType res = MUL(x, y); - VecType lo = + const VecType res = MUL(x, y); + const VecType lo = (q == F3) ? BLEND8(ZERO, res, MASK8_LO) : BLEND16(ZERO, res, 0x55); - VecType hi = (q == F3) ? BLEND8(ZERO, SHIFTR(res, 1), MASK8_LO) + const VecType hi = (q == F3) ? 
BLEND8(ZERO, SHIFTR(res, 1), MASK8_LO) : BLEND16(ZERO, SHIFTR(res, 2), 0x55); return SUB_MOD(lo, hi, q); } @@ -132,12 +132,12 @@ inline VecType MULFULL_MOD(VecType x, VecType y, T q) VecType res = MUL(x, y); // filter elements of both of a & b = card-1 - VecType cmp = AND(CMPEQ(x, CARD_M_1(q)), CMPEQ(y, CARD_M_1(q))); + const VecType cmp = AND(CMPEQ(x, CARD_M_1(q)), CMPEQ(y, CARD_M_1(q))); res = (q == F3) ? XOR(res, AND(F4_u32, cmp)) : ADD(res, AND(ONE32, cmp)); - VecType lo = + const VecType lo = (q == F3) ? BLEND8(ZERO, res, MASK8_LO) : BLEND16(ZERO, res, 0x55); - VecType hi = (q == F3) ? BLEND8(ZERO, SHIFTR(res, 1), MASK8_LO) + const VecType hi = (q == F3) ? BLEND8(ZERO, SHIFTR(res, 1), MASK8_LO) : BLEND16(ZERO, SHIFTR(res, 2), 0x55); return SUB_MOD(lo, hi, q); } @@ -166,8 +166,8 @@ inline void ADD_PROPS( auto d = MVMSK8(c); const unsigned element_size = sizeof(T); while (d > 0) { - unsigned byte_idx = __builtin_ctz(d); - off_t _offset = offset + byte_idx / element_size; + const unsigned byte_idx = __builtin_ctz(d); + const size_t _offset = offset + byte_idx / element_size; props.add(_offset, OOR_MARK); d ^= 1 << byte_idx; } @@ -191,7 +191,7 @@ inline void mul_coef_to_buf(const T a, T* src, T* dest, size_t len, T card) const size_t _last_len = len - _len * ratio; size_t i = 0; - size_t end = (_len > 3) ? _len - 3 : 0; + const size_t end = (_len > 3) ? 
_len - 3 : 0; for (; i < end; i += 4) { _dest[i] = MUL_MOD(coef, _src[i], card); _dest[i + 1] = MUL_MOD(coef, _src[i + 1], card); @@ -203,7 +203,7 @@ inline void mul_coef_to_buf(const T a, T* src, T* dest, size_t len, T card) } if (_last_len > 0) { - DoubleSizeVal coef_double = DoubleSizeVal(a); + const DoubleSizeVal coef_double = DoubleSizeVal(a); for (size_t i = _len * ratio; i < len; i++) { dest[i] = (T)((coef_double * src[i]) % card); } @@ -225,7 +225,7 @@ inline void add_two_bufs(T* src, T* dest, size_t len, T card) } if (_last_len > 0) { for (i = _len * ratio; i < len; i++) { - T tmp = src[i] + dest[i]; + const T tmp = src[i] + dest[i]; dest[i] = (tmp >= card) ? (tmp - card) : tmp; } } @@ -249,10 +249,11 @@ inline void sub_two_bufs(T* bufa, T* bufb, T* res, size_t len, T card) if (_last_len > 0) { for (i = _len * ratio; i < len; i++) { // perform subtraction - if (bufa[i] >= bufb[i]) + if (bufa[i] >= bufb[i]) { res[i] = bufa[i] - bufb[i]; - else + } else { res[i] = card - (bufb[i] - bufa[i]); + } } } } From 49c428733b2e382cb1c62814591d12190da23c9d Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Mon, 29 Oct 2018 11:32:00 +0100 Subject: [PATCH 24/77] SIMD FNT: get rid of refactored butterfly functions --- src/simd_fnt.h | 107 +++++++++++++++++++++++-------------------------- 1 file changed, 50 insertions(+), 57 deletions(-) diff --git a/src/simd_fnt.h b/src/simd_fnt.h index fc92b2d0..0fdb28af 100644 --- a/src/simd_fnt.h +++ b/src/simd_fnt.h @@ -38,77 +38,70 @@ namespace simd { /* ================= Vectorized Operations ================= */ -// butterfly CT with r == 1 -template -inline void BUTTERFLY_1(VecType* x, VecType* y, T q) -{ - VecType add = ADD_MOD(*x, *y, q); - *y = SUB_MOD(*x, *y, q); - *x = add; -} - -// butterfly CT with r == q - 1 -template -inline void BUTTERFLY_2(VecType* x, VecType* y, T q) -{ - VecType add = ADD_MOD(*x, *y, q); - *x = SUB_MOD(*x, *y, q); - *y = add; -} - -// butterfly CT with 1 < r < q - 1 -template -inline void 
BUTTERFLY_3(VecType c, VecType* x, VecType* y, T q) -{ - VecType z = MUL_MOD(c, *y, q); - *y = SUB_MOD(*x, z, q); - *x = ADD_MOD(*x, z, q); -} - +/** + * Butterfly Cooley-Tukey operation + * + * x <- x + r * y + * y <- x - r * y + * + * @param rp1 coefficient `r` plus one + * @param c a register stores coefficient `r` + * @param x working register + * @param y working register + * @param q modular + */ template inline void BUTTERFLY_CT(T rp1, VecType c, VecType* x, VecType* y, T q) { - if (rp1 == 2) { - BUTTERFLY_1(x, y, q); - } else if (rp1 < q) { - BUTTERFLY_3(c, x, y, q); - } else { - BUTTERFLY_2(x, y, q); + VecType z = (rp1 == 2) ? *y : MUL_MOD(c, *y, q); + if (rp1 < q) { + *y = SUB_MOD(*x, z, q); + *x = ADD_MOD(*x, z, q); + } else { // i.e. r == q - 1 + *y = ADD_MOD(*x, z, q); + *x = SUB_MOD(*x, z, q); } } -// butterfly GS w/ r = q - 1 -template -inline void BUTTERFLY_4(VecType* x, VecType* y, T q) -{ - VecType add = ADD_MOD(*x, *y, q); - *y = SUB_MOD(*y, *x, q); - *x = add; -} - -// butterfly GS w/ 1 < r < q - 1 -// x = x + y mod q -// y = z * (x - y) mod q -template -inline void BUTTERFLY_5(VecType c, VecType* x, VecType* y, T q) -{ - VecType sub = SUB_MOD(*x, *y, q); - *x = ADD_MOD(*x, *y, q); - *y = MUL_MOD(c, sub, q); -} - +/** + * Butterfly Genteleman-Sande operation + * + * x <- x + y + * y <- r * (x - y) + * + * @param rp1 coefficient `r` plus one + * @param c a register stores coefficient `r` + * @param x working register + * @param y working register + * @param q modular + */ template inline void BUTTERFLY_GS(T rp1, VecType c, VecType* x, VecType* y, T q) { + VecType add = ADD_MOD(*x, *y, q); if (rp1 == 2) { - BUTTERFLY_1(x, y, q); + *y = SUB_MOD(*x, *y, q); } else if (rp1 < q) { - BUTTERFLY_5(c, x, y, q); - } else { - BUTTERFLY_4(x, y, q); + VecType sub = SUB_MOD(*x, *y, q); + *y = MUL_MOD(c, sub, q); + } else { // i.e. 
r == q - 1 + *y = SUB_MOD(*y, *x, q); } + *x = add; } +/** + * Butterfly Genteleman-Sande simple operation where y = 0 + * + * x <- x, i.e. no operation + * y <- r * x + * + * @param rp1 coefficient `r` plus one + * @param c a register stores coefficient `r` + * @param x working register + * @param q modular + * @return r * x + */ template inline VecType BUTTERFLY_GS_SIMPLE(T rp1, VecType c, VecType x, T q) { From 692f7ff1d9dae5944b5ea8568a960a1c679c99e7 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Mon, 29 Oct 2018 12:55:26 +0100 Subject: [PATCH 25/77] SIMD 128: add function is_all_zeros --- src/simd_128.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/simd_128.h b/src/simd_128.h index 2361b6f3..48de4175 100644 --- a/src/simd_128.h +++ b/src/simd_128.h @@ -81,6 +81,10 @@ inline uint16_t TESTZ(VecType x, VecType y) { return _mm_testz_si128(x, y); } +inline int is_all_zeros(VecType x) +{ + return _mm_testc_si128(ZERO, y); +} #define SHIFTR(x, imm8) (_mm_srli_si128(x, imm8)) #define BLEND8(x, y, mask) (_mm_blendv_epi8(x, y, mask)) From 324f470a32a6168480ae4d958fe456881e845b23 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Mon, 29 Oct 2018 12:55:40 +0100 Subject: [PATCH 26/77] SIMD 256: add function is_all_zeros --- src/simd_256.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/simd_256.h b/src/simd_256.h index b3ab8e78..e542b6ec 100644 --- a/src/simd_256.h +++ b/src/simd_256.h @@ -95,6 +95,10 @@ inline uint32_t TESTZ(VecType x, VecType y) { return _mm256_testz_si256(x, y); } +inline int is_all_zeros(VecType x) +{ + return _mm256_testc_si256(ZERO, x); +} #define SHIFTR(x, imm8) (_mm256_srli_si256(x, imm8)) #define BLEND8(x, y, mask) (_mm256_blendv_epi8(x, y, mask)) From e7cfeafeb132582e9fd769bf2fd7c791797d95d4 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Mon, 29 Oct 2018 12:55:57 +0100 Subject: [PATCH 27/77] SIMD Basic: refactor MULFULL_MOD --- src/simd_basic.h | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git 
a/src/simd_basic.h b/src/simd_basic.h index 85aa5f16..67729e78 100644 --- a/src/simd_basic.h +++ b/src/simd_basic.h @@ -129,17 +129,16 @@ inline VecType MUL_MOD(VecType x, VecType y, T q) template inline VecType MULFULL_MOD(VecType x, VecType y, T q) { - VecType res = MUL(x, y); + const VecType res = MUL_MOD(x, y, q); // filter elements of both of a & b = card-1 const VecType cmp = AND(CMPEQ(x, CARD_M_1(q)), CMPEQ(y, CARD_M_1(q))); - res = (q == F3) ? XOR(res, AND(F4_u32, cmp)) : ADD(res, AND(ONE32, cmp)); - const VecType lo = - (q == F3) ? BLEND8(ZERO, res, MASK8_LO) : BLEND16(ZERO, res, 0x55); - const VecType hi = (q == F3) ? BLEND8(ZERO, SHIFTR(res, 1), MASK8_LO) - : BLEND16(ZERO, SHIFTR(res, 2), 0x55); - return SUB_MOD(lo, hi, q); + if (is_all_zeros(cmp) == 1) { + return res; + } + return (q == F3) ? XOR(res, AND(F4_u32, cmp)) : + ADD(res, AND(ONE32, cmp)); } /** From f28fccc454e0001ba917926e23b25edad93fb4a2 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Mon, 29 Oct 2018 13:02:23 +0100 Subject: [PATCH 28/77] SIMD 128: fix is_all_zeros --- src/simd_128.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/simd_128.h b/src/simd_128.h index 48de4175..0baac264 100644 --- a/src/simd_128.h +++ b/src/simd_128.h @@ -83,7 +83,7 @@ inline uint16_t TESTZ(VecType x, VecType y) } inline int is_all_zeros(VecType x) { - return _mm_testc_si128(ZERO, y); + return _mm_testc_si128(ZERO, x); } #define SHIFTR(x, imm8) (_mm_srli_si128(x, imm8)) From f65910a4cd4ec1a6f531f25af4459914c270257d Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Mon, 29 Oct 2018 16:34:20 +0100 Subject: [PATCH 29/77] SIMD basic: use const --- src/simd_basic.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/simd_basic.h b/src/simd_basic.h index 67729e78..f7cee150 100644 --- a/src/simd_basic.h +++ b/src/simd_basic.h @@ -285,9 +285,9 @@ template inline void neg(size_t len, T* buf, T card) { VecType* _buf = reinterpret_cast(buf); - unsigned ratio = sizeof(*_buf) / 
sizeof(*buf); - size_t _len = len / ratio; - size_t _last_len = len - _len * ratio; + const unsigned ratio = sizeof(*_buf) / sizeof(*buf); + const size_t _len = len / ratio; + const size_t _last_len = len - _len * ratio; size_t i; for (i = 0; i < _len; i++) { From 163151a545bd50676392a60bcec39006abe3d8b0 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Mon, 29 Oct 2018 16:37:46 +0100 Subject: [PATCH 30/77] SIMD FNT: fix typo & remove unnecessary comments --- src/simd_fnt.h | 26 ++++---------------------- 1 file changed, 4 insertions(+), 22 deletions(-) diff --git a/src/simd_fnt.h b/src/simd_fnt.h index 0fdb28af..05d4d7d0 100644 --- a/src/simd_fnt.h +++ b/src/simd_fnt.h @@ -115,7 +115,7 @@ inline VecType BUTTERFLY_GS_SIMPLE(T rp1, VecType c, VecType x, T q) } /** - * Vectorized butterly CT step + * Vectorized butterfly CT step * * For each pair (P, Q) = (buf[i], buf[i + m]) for step = 2 * m and coef `r` * P = P + r * Q @@ -147,8 +147,6 @@ inline void butterfly_ct_step( const size_t end = (len > 1) ? len - 1 : 0; const unsigned bufs_nb = buf.get_n(); - // #pragma omp parallel for - // #pragma unroll const std::vector& mem = buf.get_mem(); for (unsigned i = start; i < bufs_nb; i += step) { VecType x1, y1; @@ -156,9 +154,7 @@ inline void butterfly_ct_step( VecType* __restrict p = reinterpret_cast(mem[i]); VecType* __restrict q = reinterpret_cast(mem[i + m]); - // #pragma omp parallel for size_t j = 0; - // #pragma unroll for (; j < end; j += 2) { x1 = LOAD(p + j); y1 = LOAD(q + j); @@ -213,10 +209,8 @@ inline static void do_butterfly_ct_2_layers( VecType* __restrict r = reinterpret_cast(mem[start + 2 * m]); VecType* __restrict s = reinterpret_cast(mem[start + 3 * m]); - // #pragma omp parallel for size_t j = 0; const size_t end = (len > 1) ? 
len - 1 : 0; - // #pragma unroll while (j < end) { // First layer (c1, x, y) & (c1, u, v) VecType x1 = LOAD(p); @@ -281,7 +275,7 @@ inline static void do_butterfly_ct_2_layers( } /** - * Vectorized butterly CT on two-layers at a time + * Vectorized butterfly CT on two-layers at a time * * For each quadruple * (P, Q, R, S) = (buf[i], buf[i + m], buf[i + 2 * m], buf[i + 3 * m]) @@ -325,8 +319,6 @@ inline void butterfly_ct_two_layers_step( const unsigned step = m << 2; const unsigned bufs_nb = buf.get_n(); - // #pragma omp parallel for - // #pragma unroll const std::vector& mem = buf.get_mem(); for (unsigned i = start; i < bufs_nb; i += step) { do_butterfly_ct_2_layers(mem, r1, r2, r3, i, m, len, card); @@ -334,7 +326,7 @@ inline void butterfly_ct_two_layers_step( } /** - * Vectorized butterly GS step + * Vectorized butterfly GS step * * For each pair (P, Q) = (buf[i], buf[i + m]) for step = 2 * m and coef `r` * P = P + Q @@ -365,8 +357,6 @@ inline void butterfly_gs_step( const size_t end = (len > 3) ? len - 3 : 0; const unsigned bufs_nb = buf.get_n(); - // #pragma omp parallel for - // #pragma unroll const std::vector& mem = buf.get_mem(); for (unsigned i = start; i < bufs_nb; i += step) { VecType x1, x2, x3, x4; @@ -374,9 +364,7 @@ inline void butterfly_gs_step( VecType* __restrict p = reinterpret_cast(mem[i]); VecType* __restrict q = reinterpret_cast(mem[i + m]); - // #pragma omp parallel for size_t j = 0; - // #pragma unroll for (; j < end; j += 4) { x1 = LOAD(p + j); x2 = LOAD(p + j + 1); @@ -416,7 +404,7 @@ inline void butterfly_gs_step( } /** - * Vectorized butterly GS step + * Vectorized butterfly GS step * * For each pair (P, Q) = (buf[i], buf[i + m]) for step = 2 * m and coef `r` * Q = r * P @@ -446,8 +434,6 @@ inline void butterfly_gs_step_simple( const size_t end = (len > 1) ? 
len - 1 : 0; const unsigned bufs_nb = buf.get_n(); - // #pragma omp parallel for - // #pragma unroll const std::vector& mem = buf.get_mem(); for (unsigned i = start; i < bufs_nb; i += step) { VecType x1, y1; @@ -455,9 +441,7 @@ inline void butterfly_gs_step_simple( VecType* __restrict p = reinterpret_cast(mem[i]); VecType* __restrict q = reinterpret_cast(mem[i + m]); - // #pragma omp parallel for size_t j = 0; - // #pragma unroll for (; j < end; j += 2) { x1 = LOAD(p + j); x2 = LOAD(p + j + 1); @@ -495,14 +479,12 @@ inline void encode_post_process( const VecType _threshold = SET1(threshold); const VecType mask_hi = SET1(max); - // #pragma unroll const std::vector& mem = output.get_mem(); for (unsigned frag_id = 0; frag_id < code_len; ++frag_id) { VecType* __restrict buf = reinterpret_cast(mem[frag_id]); size_t vec_id = 0; size_t end = (vecs_nb > 3) ? vecs_nb - 3 : 0; - // #pragma unroll for (; vec_id < end; vec_id += 4) { VecType a1 = LOAD(buf + vec_id); VecType a2 = LOAD(buf + vec_id + 1); From 83c97ea4d4af31de447f4a070e780379059e7026 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Mon, 29 Oct 2018 16:42:31 +0100 Subject: [PATCH 31/77] SIMD 256: remove NF4Type --- src/simd_256.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/simd_256.h b/src/simd_256.h index e542b6ec..85a696a3 100644 --- a/src/simd_256.h +++ b/src/simd_256.h @@ -50,7 +50,6 @@ namespace simd { typedef __m256i VecType; typedef __m128i HalfVecType; -typedef __uint128_t NF4Type; /* ============= Constant variable ============ */ From 435f89d8b1442f8abe6c4023ca07d1ff8d80bfb7 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Mon, 29 Oct 2018 16:42:51 +0100 Subject: [PATCH 32/77] SIMD NF4: remove NF4Type --- src/simd_nf4.h | 61 +++++++++++++++++++++++++------------------------- 1 file changed, 30 insertions(+), 31 deletions(-) diff --git a/src/simd_nf4.h b/src/simd_nf4.h index b86e284e..de493220 100644 --- a/src/simd_nf4.h +++ b/src/simd_nf4.h @@ -39,18 +39,17 @@ namespace quadiron { namespace simd 
{ typedef uint32_t aint32 __attribute__((aligned(ALIGNMENT))); -typedef __uint128_t NF4Type; -/** Return NF4Type integer from a _m128i register */ -static inline NF4Type m128i_to_uint128(__m128i v) +/** Return __uint128_t integer from a _m128i register */ +static inline __uint128_t m128i_to_uint128(__m128i v) { - NF4Type i; + __uint128_t i; _mm_store_si128((__m128i*)&i, v); return i; // NOLINT(clang-analyzer-core.uninitialized.UndefReturn) } -inline NF4Type expand16(uint16_t* arr, int n) +inline __uint128_t expand16(uint16_t* arr, int n) { // since n <= 4 uint16_t _arr[4] __attribute__((aligned(ALIGNMENT))) = {0, 0, 0, 0}; @@ -61,7 +60,7 @@ inline NF4Type expand16(uint16_t* arr, int n) return m128i_to_uint128(b); } -inline NF4Type expand32(uint32_t* arr, int n) +inline __uint128_t expand32(uint32_t* arr, int n) { // since n <= 4 uint32_t _arr[4] __attribute__((aligned(simd::ALIGNMENT))) = {0, 0, 0, 0}; @@ -75,7 +74,7 @@ inline NF4Type expand32(uint32_t* arr, int n) inline GroupedValues<__uint128_t> unpack(__uint128_t a, int n) { uint16_t ai[8]; - NF4Type values; + __uint128_t values; __m128i _a = _mm_loadu_si128((__m128i*)&a); ai[0] = _mm_extract_epi16(_a, 0); @@ -101,7 +100,7 @@ inline GroupedValues<__uint128_t> unpack(__uint128_t a, int n) inline void unpack(__uint128_t a, GroupedValues<__uint128_t>& b, int n) { uint16_t ai[8]; - NF4Type values; + __uint128_t values; __m128i _a = _mm_loadu_si128((__m128i*)&a); ai[0] = _mm_extract_epi16(_a, 0); @@ -123,7 +122,7 @@ inline void unpack(__uint128_t a, GroupedValues<__uint128_t>& b, int n) b.values = values; // NOLINT(clang-analyzer-core.uninitialized.Assign) } -inline NF4Type pack(__uint128_t a) +inline __uint128_t pack(__uint128_t a) { __m128i _a = _mm_loadu_si128((__m128i*)&a); __m128i b = _mm_set_epi32( @@ -135,7 +134,7 @@ inline NF4Type pack(__uint128_t a) return m128i_to_uint128(b); } -inline NF4Type pack(__uint128_t a, uint32_t flag) +inline __uint128_t pack(__uint128_t a, uint32_t flag) { aint32 b0, b1, b2, b3; 
__m128i _a = _mm_loadu_si128((__m128i*)&a); @@ -179,35 +178,35 @@ inline void STORE_LOW(HalfVecType* address, VecType reg) _mm_store_si128(address, _mm256_castsi256_si128(reg)); } -inline NF4Type add(NF4Type a, NF4Type b) +inline __uint128_t add(__uint128_t a, __uint128_t b) { HalfVecType res; VecType _a = CAST_TO_DOUBLE((HalfVecType)a); VecType _b = CAST_TO_DOUBLE((HalfVecType)b); STORE_LOW(&res, ADD_MOD(_a, _b, F4)); - return (NF4Type)res; + return (__uint128_t)res; } -inline NF4Type sub(NF4Type a, NF4Type b) +inline __uint128_t sub(__uint128_t a, __uint128_t b) { HalfVecType res; VecType _a = CAST_TO_DOUBLE((HalfVecType)a); VecType _b = CAST_TO_DOUBLE((HalfVecType)b); STORE_LOW(&res, SUB_MOD(_a, _b, F4)); - return (NF4Type)res; + return (__uint128_t)res; } -inline NF4Type mul(NF4Type a, NF4Type b) +inline __uint128_t mul(__uint128_t a, __uint128_t b) { HalfVecType res; VecType _a = CAST_TO_DOUBLE((HalfVecType)a); VecType _b = CAST_TO_DOUBLE((HalfVecType)b); STORE_LOW(&res, MULFULL_MOD(_a, _b, F4)); - return (NF4Type)res; + return (__uint128_t)res; } inline void -add_buf_to_two_bufs_rem(unsigned n, NF4Type* x, NF4Type* x_half, NF4Type* y) +add_buf_to_two_bufs_rem(unsigned n, __uint128_t* x, __uint128_t* x_half, __uint128_t* y) { // add last _y[] to x and x_next HalfVecType* _x = reinterpret_cast(x); @@ -223,7 +222,7 @@ add_buf_to_two_bufs_rem(unsigned n, NF4Type* x, NF4Type* x_half, NF4Type* y) } } -inline void hadamard_mul_rem(unsigned n, NF4Type* x, NF4Type* y) +inline void hadamard_mul_rem(unsigned n, __uint128_t* x, __uint128_t* y) { HalfVecType* _x = reinterpret_cast(x); HalfVecType* _y = reinterpret_cast(y); @@ -236,7 +235,7 @@ inline void hadamard_mul_rem(unsigned n, NF4Type* x, NF4Type* y) } inline void -hadamard_mul_doubled_rem(unsigned n, NF4Type* x, NF4Type* x_half, NF4Type* y) +hadamard_mul_doubled_rem(unsigned n, __uint128_t* x, __uint128_t* x_half, __uint128_t* y) { HalfVecType* _x = reinterpret_cast(x); HalfVecType* _x_half = 
reinterpret_cast(x_half); @@ -253,40 +252,40 @@ hadamard_mul_doubled_rem(unsigned n, NF4Type* x, NF4Type* x_half, NF4Type* y) #elif defined(__SSE4_1__) -inline NF4Type add(NF4Type a, NF4Type b) +inline __uint128_t add(__uint128_t a, __uint128_t b) { VecType res; STORE(&res, ADD_MOD((VecType)a, (VecType)b, F4)); - return (NF4Type)res; + return (__uint128_t)res; } -inline NF4Type sub(NF4Type a, NF4Type b) +inline __uint128_t sub(__uint128_t a, __uint128_t b) { VecType res; STORE(&res, SUB_MOD((VecType)a, (VecType)b, F4)); - return (NF4Type)res; + return (__uint128_t)res; } -inline NF4Type mul(NF4Type a, NF4Type b) +inline __uint128_t mul(__uint128_t a, __uint128_t b) { VecType res; STORE(&res, MULFULL_MOD((VecType)a, (VecType)b, F4)); - return (NF4Type)res; + return (__uint128_t)res; } inline void -add_buf_to_two_bufs_rem(unsigned n, NF4Type* x, NF4Type* x_half, NF4Type* y) +add_buf_to_two_bufs_rem(unsigned n, __uint128_t* x, __uint128_t* x_half, __uint128_t* y) { // do nothing } -inline void hadamard_mul_rem(unsigned n, NF4Type* x, NF4Type* y) +inline void hadamard_mul_rem(unsigned n, __uint128_t* x, __uint128_t* y) { // do nothing } inline void -hadamard_mul_doubled_rem(unsigned n, NF4Type* x, NF4Type* x_half, NF4Type* y) +hadamard_mul_doubled_rem(unsigned n, __uint128_t* x, __uint128_t* x_half, __uint128_t* y) { // do nothing } @@ -296,7 +295,7 @@ hadamard_mul_doubled_rem(unsigned n, NF4Type* x, NF4Type* x_half, NF4Type* y) /* ==================== Operations for NF4 =================== */ /** Add buffer `y` to two halves of `x`. 
`x` is of length `n` */ -inline void add_buf_to_two_bufs(unsigned n, NF4Type* _x, NF4Type* _y) +inline void add_buf_to_two_bufs(unsigned n, __uint128_t* _x, __uint128_t* _y) { unsigned i; VecType* x = reinterpret_cast(_x); @@ -308,7 +307,7 @@ inline void add_buf_to_two_bufs(unsigned n, NF4Type* _x, NF4Type* _y) const unsigned num_len = vec_len * ratio; const unsigned rem_len = half_len - num_len; - NF4Type* x_half = _x + half_len; + __uint128_t* x_half = _x + half_len; VecType* x_next = reinterpret_cast(x_half); // add y to the first half of `x` @@ -327,7 +326,7 @@ inline void add_buf_to_two_bufs(unsigned n, NF4Type* _x, NF4Type* _y) } } -inline void hadamard_mul(unsigned n, NF4Type* _x, NF4Type* _y) +inline void hadamard_mul(unsigned n, __uint128_t* _x, __uint128_t* _y) { unsigned i; VecType* x = reinterpret_cast(_x); From 4bd2d6c88f44617780f794ae1211910e3a0fe31c Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Mon, 29 Oct 2018 17:23:14 +0100 Subject: [PATCH 33/77] SIMD NF4: remove C-style cast --- src/simd_nf4.h | 68 +++++++++++++++++++++++++++++++------------------- 1 file changed, 43 insertions(+), 25 deletions(-) diff --git a/src/simd_nf4.h b/src/simd_nf4.h index de493220..175a760b 100644 --- a/src/simd_nf4.h +++ b/src/simd_nf4.h @@ -168,9 +168,15 @@ inline __uint128_t pack(__uint128_t a, uint32_t flag) #if defined(__AVX2__) -inline VecType CAST_TO_DOUBLE(HalfVecType x) +inline VecType load_to_reg(HalfVecType x) { - return _mm256_castsi128_si256(x); + return _mm256_castsi128_si256(_mm_load_si128(&x)); +} + +inline VecType load_to_reg(__uint128_t x) +{ + const HalfVecType* _x = reinterpret_cast(&x); + return load_to_reg(*_x); } inline void STORE_LOW(HalfVecType* address, VecType reg) @@ -181,28 +187,28 @@ inline void STORE_LOW(HalfVecType* address, VecType reg) inline __uint128_t add(__uint128_t a, __uint128_t b) { HalfVecType res; - VecType _a = CAST_TO_DOUBLE((HalfVecType)a); - VecType _b = CAST_TO_DOUBLE((HalfVecType)b); + VecType _a = load_to_reg(a); + 
VecType _b = load_to_reg(b); STORE_LOW(&res, ADD_MOD(_a, _b, F4)); - return (__uint128_t)res; + return reinterpret_cast<__uint128_t>(res); } inline __uint128_t sub(__uint128_t a, __uint128_t b) { HalfVecType res; - VecType _a = CAST_TO_DOUBLE((HalfVecType)a); - VecType _b = CAST_TO_DOUBLE((HalfVecType)b); + VecType _a = load_to_reg(a); + VecType _b = load_to_reg(b); STORE_LOW(&res, SUB_MOD(_a, _b, F4)); - return (__uint128_t)res; + return reinterpret_cast<__uint128_t>(res); } inline __uint128_t mul(__uint128_t a, __uint128_t b) { HalfVecType res; - VecType _a = CAST_TO_DOUBLE((HalfVecType)a); - VecType _b = CAST_TO_DOUBLE((HalfVecType)b); + VecType _a = load_to_reg(a); + VecType _b = load_to_reg(b); STORE_LOW(&res, MULFULL_MOD(_a, _b, F4)); - return (__uint128_t)res; + return reinterpret_cast<__uint128_t>(res); } inline void @@ -213,9 +219,9 @@ add_buf_to_two_bufs_rem(unsigned n, __uint128_t* x, __uint128_t* x_half, __uint1 HalfVecType* _x_half = reinterpret_cast(x_half); HalfVecType* _y = reinterpret_cast(y); for (unsigned i = 0; i < n; ++i) { - VecType _x_p = CAST_TO_DOUBLE(_x[i]); - VecType _x_next_p = CAST_TO_DOUBLE(_x_half[i]); - VecType _y_p = CAST_TO_DOUBLE(_y[i]); + VecType _x_p = load_to_reg(_x[i]); + VecType _x_next_p = load_to_reg(_x_half[i]); + VecType _y_p = load_to_reg(_y[i]); STORE_LOW(_x + i, ADD_MOD(_x_p, _y_p, F4)); STORE_LOW(_x_half + i, ADD_MOD(_x_next_p, _y_p, F4)); @@ -227,8 +233,8 @@ inline void hadamard_mul_rem(unsigned n, __uint128_t* x, __uint128_t* y) HalfVecType* _x = reinterpret_cast(x); HalfVecType* _y = reinterpret_cast(y); for (unsigned i = 0; i < n; ++i) { - VecType _x_p = CAST_TO_DOUBLE(_x[i]); - VecType _y_p = CAST_TO_DOUBLE(_y[i]); + VecType _x_p = load_to_reg(_x[i]); + VecType _y_p = load_to_reg(_y[i]); STORE_LOW(_x + i, MULFULL_MOD(_x_p, _y_p, F4)); } @@ -241,9 +247,9 @@ hadamard_mul_doubled_rem(unsigned n, __uint128_t* x, __uint128_t* x_half, __uint HalfVecType* _x_half = reinterpret_cast(x_half); HalfVecType* _y = 
reinterpret_cast(y); for (unsigned i = 0; i < n; ++i) { - VecType _x_p = CAST_TO_DOUBLE(_x[i]); - VecType _x_next_p = CAST_TO_DOUBLE(_x_half[i]); - VecType _y_p = CAST_TO_DOUBLE(_y[i]); + VecType _x_p = load_to_reg(_x[i]); + VecType _x_next_p = load_to_reg(_x_half[i]); + VecType _y_p = load_to_reg(_y[i]); STORE_LOW(_x + i, MULFULL_MOD(_x_p, _y_p, F4)); STORE_LOW(_x_half + i, MULFULL_MOD(_x_next_p, _y_p, F4)); @@ -252,25 +258,37 @@ hadamard_mul_doubled_rem(unsigned n, __uint128_t* x, __uint128_t* x_half, __uint #elif defined(__SSE4_1__) +inline VecType load_to_reg(__uint128_t x) +{ + const VecType* _x = reinterpret_cast(&x); + return _mm_load_si128(_x); +} + inline __uint128_t add(__uint128_t a, __uint128_t b) { VecType res; - STORE(&res, ADD_MOD((VecType)a, (VecType)b, F4)); - return (__uint128_t)res; + VecType _a = load_to_reg(a); + VecType _b = load_to_reg(b); + STORE(&res, ADD_MOD(_a, _b, F4)); + return reinterpret_cast<__uint128_t>(res); } inline __uint128_t sub(__uint128_t a, __uint128_t b) { VecType res; - STORE(&res, SUB_MOD((VecType)a, (VecType)b, F4)); - return (__uint128_t)res; + VecType _a = load_to_reg(a); + VecType _b = load_to_reg(b); + STORE(&res, SUB_MOD(_a, _b, F4)); + return reinterpret_cast<__uint128_t>(res); } inline __uint128_t mul(__uint128_t a, __uint128_t b) { VecType res; - STORE(&res, MULFULL_MOD((VecType)a, (VecType)b, F4)); - return (__uint128_t)res; + VecType _a = load_to_reg(a); + VecType _b = load_to_reg(b); + STORE(&res, MULFULL_MOD(_a, _b, F4)); + return reinterpret_cast<__uint128_t>(res); } inline void From f608c75b147e0aa2a5eb7134503f080b23e11cd4 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Tue, 30 Oct 2018 11:04:14 +0100 Subject: [PATCH 34/77] FFT_2n.h: compute simd indices --- src/fft_2n.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/fft_2n.h b/src/fft_2n.h index 8a63bbb2..86626c25 100644 --- a/src/fft_2n.h +++ b/src/fft_2n.h @@ -142,6 +142,11 @@ class Radix2 : public FourierTransform { size_t pkt_size; 
size_t buf_size; + // Indices used for accelerated functions + size_t simd_vec_len; + size_t simd_trailing_len; + size_t simd_offset; + std::unique_ptr rev = nullptr; std::unique_ptr> W = nullptr; std::unique_ptr> inv_W = nullptr; @@ -182,6 +187,12 @@ Radix2::Radix2(const gf::Field& gf, int n, int data_len, size_t pkt_size) rev = std::unique_ptr(new T[n]); init_bitrev(); + + // Indices used for accelerated functions + const unsigned ratio = simd::countof(); + simd_vec_len = this->pkt_size / ratio; + simd_trailing_len = this->pkt_size - simd_vec_len * ratio; + simd_offset = simd_vec_len * ratio; } template From 664cb68bb5baadd18d4f4a242f4b9e103514eddb Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Tue, 30 Oct 2018 11:04:48 +0100 Subject: [PATCH 35/77] FFT_2n.cpp: remove calculation of indices --- src/fft_2n.cpp | 106 ++++++++++++++----------------------------------- 1 file changed, 30 insertions(+), 76 deletions(-) diff --git a/src/fft_2n.cpp b/src/fft_2n.cpp index f3d8847f..efa7abd8 100644 --- a/src/fft_2n.cpp +++ b/src/fft_2n.cpp @@ -48,10 +48,6 @@ void Radix2::butterfly_ct_two_layers_step( unsigned start, unsigned m) { - const unsigned ratio = simd::countof(); - const size_t len = this->pkt_size; - const size_t vec_len = len / ratio; - const size_t last_len = len - vec_len * ratio; const unsigned coefIndex = start * this->n / m / 2; const uint16_t r1 = vec_W[coefIndex]; const uint16_t r2 = vec_W[coefIndex / 2]; @@ -59,29 +55,28 @@ void Radix2::butterfly_ct_two_layers_step( // perform vector operations simd::butterfly_ct_two_layers_step( - buf, r1, r2, r3, start, m, vec_len, card); + buf, r1, r2, r3, start, m, simd_vec_len, card); // for last elements, perform as non-SIMD method - if (last_len > 0) { + if (simd_trailing_len > 0) { const unsigned step = m << 2; - size_t offset = vec_len * ratio; // --------- // First layer // --------- const uint16_t r1 = W->get(start * this->n / m / 2); // first pair - butterfly_ct_step_slow(buf, r1, start, m, step, offset); + 
butterfly_ct_step_slow(buf, r1, start, m, step, simd_offset); // second pair - butterfly_ct_step_slow(buf, r1, start + 2 * m, m, step, offset); + butterfly_ct_step_slow(buf, r1, start + 2 * m, m, step, simd_offset); // --------- // Second layer // --------- // first pair const uint16_t r2 = W->get(start * this->n / m / 4); - butterfly_ct_step_slow(buf, r2, start, 2 * m, step, offset); + butterfly_ct_step_slow(buf, r2, start, 2 * m, step, simd_offset); // second pair const uint16_t r3 = W->get((start + m) * this->n / m / 4); - butterfly_ct_step_slow(buf, r3, start + m, 2 * m, step, offset); + butterfly_ct_step_slow(buf, r3, start + m, 2 * m, step, simd_offset); } } @@ -93,18 +88,12 @@ void Radix2::butterfly_ct_step( unsigned m, unsigned step) { - const unsigned ratio = simd::countof(); - const size_t len = this->pkt_size; - const size_t vec_len = len / ratio; - const size_t last_len = len - vec_len * ratio; - // perform vector operations - simd::butterfly_ct_step(buf, r, start, m, step, vec_len, card); + simd::butterfly_ct_step(buf, r, start, m, step, simd_vec_len, card); // for last elements, perform as non-SIMD method - if (last_len > 0) { - size_t offset = vec_len * ratio; - butterfly_ct_step_slow(buf, r, start, m, step, offset); + if (simd_trailing_len > 0) { + butterfly_ct_step_slow(buf, r, start, m, step, simd_offset); } } @@ -116,18 +105,12 @@ void Radix2::butterfly_gs_step( unsigned m, unsigned step) { - const unsigned ratio = simd::countof(); - const size_t len = this->pkt_size; - const size_t vec_len = len / ratio; - const size_t last_len = len - vec_len * ratio; - // perform vector operations - simd::butterfly_gs_step(buf, coef, start, m, vec_len, card); + simd::butterfly_gs_step(buf, coef, start, m, simd_vec_len, card); // for last elements, perform as non-SIMD method - if (last_len > 0) { - size_t offset = vec_len * ratio; - butterfly_gs_step_slow(buf, coef, start, m, step, offset); + if (simd_trailing_len > 0) { + butterfly_gs_step_slow(buf, coef, 
start, m, step, simd_offset); } } @@ -139,18 +122,12 @@ void Radix2::butterfly_gs_step_simple( unsigned m, unsigned step) { - const unsigned ratio = simd::countof(); - const size_t len = this->pkt_size; - const size_t vec_len = len / ratio; - const size_t last_len = len - vec_len * ratio; - // perform vector operations - simd::butterfly_gs_step_simple(buf, coef, start, m, vec_len, card); + simd::butterfly_gs_step_simple(buf, coef, start, m, simd_vec_len, card); // for last elements, perform as non-SIMD method - if (last_len > 0) { - size_t offset = vec_len * ratio; - butterfly_gs_step_simple_slow(buf, coef, start, m, step, offset); + if (simd_trailing_len > 0) { + butterfly_gs_step_simple_slow(buf, coef, start, m, step, simd_offset); } } @@ -160,10 +137,6 @@ void Radix2::butterfly_ct_two_layers_step( unsigned start, unsigned m) { - const unsigned ratio = simd::countof(); - const size_t len = this->pkt_size; - const size_t vec_len = len / ratio; - const size_t last_len = len - vec_len * ratio; const unsigned coefIndex = start * this->n / m / 2; const uint32_t r1 = vec_W[coefIndex]; const uint32_t r2 = vec_W[coefIndex / 2]; @@ -171,29 +144,28 @@ void Radix2::butterfly_ct_two_layers_step( // perform vector operations simd::butterfly_ct_two_layers_step( - buf, r1, r2, r3, start, m, vec_len, card); + buf, r1, r2, r3, start, m, simd_vec_len, card); // for last elements, perform as non-SIMD method - if (last_len > 0) { + if (simd_trailing_len > 0) { const unsigned step = m << 2; - size_t offset = vec_len * ratio; // --------- // First layer // --------- const uint32_t r1 = W->get(start * this->n / m / 2); // first pair - butterfly_ct_step_slow(buf, r1, start, m, step, offset); + butterfly_ct_step_slow(buf, r1, start, m, step, simd_offset); // second pair - butterfly_ct_step_slow(buf, r1, start + 2 * m, m, step, offset); + butterfly_ct_step_slow(buf, r1, start + 2 * m, m, step, simd_offset); // --------- // Second layer // --------- // first pair const uint32_t r2 = 
W->get(start * this->n / m / 4); - butterfly_ct_step_slow(buf, r2, start, 2 * m, step, offset); + butterfly_ct_step_slow(buf, r2, start, 2 * m, step, simd_offset); // second pair const uint32_t r3 = W->get((start + m) * this->n / m / 4); - butterfly_ct_step_slow(buf, r3, start + m, 2 * m, step, offset); + butterfly_ct_step_slow(buf, r3, start + m, 2 * m, step, simd_offset); } } @@ -205,18 +177,12 @@ void Radix2::butterfly_ct_step( unsigned m, unsigned step) { - const unsigned ratio = simd::countof(); - const size_t len = this->pkt_size; - const size_t vec_len = len / ratio; - const size_t last_len = len - vec_len * ratio; - // perform vector operations - simd::butterfly_ct_step(buf, r, start, m, step, vec_len, card); + simd::butterfly_ct_step(buf, r, start, m, step, simd_vec_len, card); // for last elements, perform as non-SIMD method - if (last_len > 0) { - size_t offset = vec_len * ratio; - butterfly_ct_step_slow(buf, r, start, m, step, offset); + if (simd_trailing_len > 0) { + butterfly_ct_step_slow(buf, r, start, m, step, simd_offset); } } @@ -228,18 +194,12 @@ void Radix2::butterfly_gs_step( unsigned m, unsigned step) { - const unsigned ratio = simd::countof(); - const size_t len = this->pkt_size; - const size_t vec_len = len / ratio; - const size_t last_len = len - vec_len * ratio; - // perform vector operations - simd::butterfly_gs_step(buf, coef, start, m, vec_len, card); + simd::butterfly_gs_step(buf, coef, start, m, simd_vec_len, card); // for last elements, perform as non-SIMD method - if (last_len > 0) { - size_t offset = vec_len * ratio; - butterfly_gs_step_slow(buf, coef, start, m, step, offset); + if (simd_trailing_len > 0) { + butterfly_gs_step_slow(buf, coef, start, m, step, simd_offset); } } @@ -251,18 +211,12 @@ void Radix2::butterfly_gs_step_simple( unsigned m, unsigned step) { - const unsigned ratio = simd::countof(); - const size_t len = this->pkt_size; - const size_t vec_len = len / ratio; - const size_t last_len = len - vec_len * ratio; - // 
perform vector operations - simd::butterfly_gs_step_simple(buf, coef, start, m, vec_len, card); + simd::butterfly_gs_step_simple(buf, coef, start, m, simd_vec_len, card); // for last elements, perform as non-SIMD method - if (last_len > 0) { - size_t offset = vec_len * ratio; - butterfly_gs_step_simple_slow(buf, coef, start, m, step, offset); + if (simd_trailing_len > 0) { + butterfly_gs_step_simple_slow(buf, coef, start, m, step, simd_offset); } } From e657e79d3392a0dcb9ef9b24ac92ac54e1c8f9d1 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Tue, 30 Oct 2018 11:11:37 +0100 Subject: [PATCH 36/77] FFT_2n.h: define butterfly_ct_two_layers_step_slow --- src/fft_2n.h | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/src/fft_2n.h b/src/fft_2n.h index 86626c25..4af68528 100644 --- a/src/fft_2n.h +++ b/src/fft_2n.h @@ -112,6 +112,11 @@ class Radix2 : public FourierTransform { unsigned step); // Only used for non-vectorized elements + void butterfly_ct_two_layers_step_slow( + vec::Buffers& buf, + unsigned start, + unsigned m, + size_t offset = 0); void butterfly_ct_step_slow( vec::Buffers& buf, T coef, @@ -432,6 +437,16 @@ void Radix2::butterfly_ct_two_layers_step( vec::Buffers& buf, unsigned start, unsigned m) +{ + butterfly_ct_two_layers_step_slow(buf, start, m); +} + +template +void Radix2::butterfly_ct_two_layers_step_slow( + vec::Buffers& buf, + unsigned start, + unsigned m, + size_t offset) { const unsigned step = m << 2; // --------- @@ -439,18 +454,18 @@ void Radix2::butterfly_ct_two_layers_step( // --------- const T r1 = W->get(start * this->n / m / 2); // first pair - butterfly_ct_step(buf, r1, start, m, step); + butterfly_ct_step_slow(buf, r1, start, m, step, offset); // second pair - butterfly_ct_step(buf, r1, start + 2 * m, m, step); + butterfly_ct_step_slow(buf, r1, start + 2 * m, m, step, offset); // --------- // Second layer // --------- // first pair const T r2 = W->get(start * this->n / m / 4); - 
butterfly_ct_step(buf, r2, start, 2 * m, step); + butterfly_ct_step_slow(buf, r2, start, 2 * m, step, offset); // second pair const T r3 = W->get((start + m) * this->n / m / 4); - butterfly_ct_step(buf, r3, start + m, 2 * m, step); + butterfly_ct_step_slow(buf, r3, start + m, 2 * m, step, offset); } template From cb734b42d09a42fa47fb53fdd0bfa5f9e4a7c8d7 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Tue, 30 Oct 2018 11:11:58 +0100 Subject: [PATCH 37/77] FFT_2n.cpp: use butterfly_ct_two_layers_step_slow --- src/fft_2n.cpp | 38 ++------------------------------------ 1 file changed, 2 insertions(+), 36 deletions(-) diff --git a/src/fft_2n.cpp b/src/fft_2n.cpp index efa7abd8..f7d91468 100644 --- a/src/fft_2n.cpp +++ b/src/fft_2n.cpp @@ -59,24 +59,7 @@ void Radix2::butterfly_ct_two_layers_step( // for last elements, perform as non-SIMD method if (simd_trailing_len > 0) { - const unsigned step = m << 2; - // --------- - // First layer - // --------- - const uint16_t r1 = W->get(start * this->n / m / 2); - // first pair - butterfly_ct_step_slow(buf, r1, start, m, step, simd_offset); - // second pair - butterfly_ct_step_slow(buf, r1, start + 2 * m, m, step, simd_offset); - // --------- - // Second layer - // --------- - // first pair - const uint16_t r2 = W->get(start * this->n / m / 4); - butterfly_ct_step_slow(buf, r2, start, 2 * m, step, simd_offset); - // second pair - const uint16_t r3 = W->get((start + m) * this->n / m / 4); - butterfly_ct_step_slow(buf, r3, start + m, 2 * m, step, simd_offset); + butterfly_ct_two_layers_step_slow(buf, start, m, simd_offset); } } @@ -148,24 +131,7 @@ void Radix2::butterfly_ct_two_layers_step( // for last elements, perform as non-SIMD method if (simd_trailing_len > 0) { - const unsigned step = m << 2; - // --------- - // First layer - // --------- - const uint32_t r1 = W->get(start * this->n / m / 2); - // first pair - butterfly_ct_step_slow(buf, r1, start, m, step, simd_offset); - // second pair - butterfly_ct_step_slow(buf, r1, 
start + 2 * m, m, step, simd_offset); - // --------- - // Second layer - // --------- - // first pair - const uint32_t r2 = W->get(start * this->n / m / 4); - butterfly_ct_step_slow(buf, r2, start, 2 * m, step, simd_offset); - // second pair - const uint32_t r3 = W->get((start + m) * this->n / m / 4); - butterfly_ct_step_slow(buf, r3, start + m, 2 * m, step, simd_offset); + butterfly_ct_two_layers_step_slow(buf, start, m, simd_offset); } } From 1a11fb0dd2b1f9598993f1287fd8b08972dedf0f Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Tue, 30 Oct 2018 11:16:37 +0100 Subject: [PATCH 38/77] FEC RS FNT: simd indices as member variables --- src/fec_rs_fnt.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/fec_rs_fnt.h b/src/fec_rs_fnt.h index 4bdd5475..55ffb487 100644 --- a/src/fec_rs_fnt.h +++ b/src/fec_rs_fnt.h @@ -60,6 +60,11 @@ class RsFnt : public FecCode { // decoding context used in encoding of systematic FNT std::unique_ptr> enc_context; + // Indices used for accelerated functions + size_t simd_vec_len; + size_t simd_trailing_len; + size_t simd_offset; + public: RsFnt( FecType type, @@ -70,6 +75,12 @@ class RsFnt : public FecCode { : FecCode(type, word_size, n_data, n_parities, pkt_size) { this->fec_init(); + + // Indices used for accelerated functions + const unsigned ratio = simd::countof(); + simd_vec_len = this->pkt_size / ratio; + simd_trailing_len = this->pkt_size - simd_vec_len * ratio; + simd_offset = simd_vec_len * ratio; } inline void check_params() override From e1e9eebaa39e06be6af43d6048b3f199c27ceacb Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Tue, 30 Oct 2018 11:16:59 +0100 Subject: [PATCH 39/77] FEC Vectorisation: use FNT's simd indices --- src/fec_vectorisation.cpp | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/src/fec_vectorisation.cpp b/src/fec_vectorisation.cpp index 8684e1ab..ed82fab8 100644 --- a/src/fec_vectorisation.cpp +++ b/src/fec_vectorisation.cpp @@ -53,20 +53,13 @@ 
void RsFnt::encode_post_process( uint16_t threshold = this->gf->card_minus_one(); unsigned code_len = this->n_outputs; - // number of elements per vector register - unsigned vec_size = simd::countof(); - // number of vector registers per fragment packet - size_t vecs_nb = size / vec_size; - // odd number of elements not vectorized - size_t last_len = size - vecs_nb * vec_size; - simd::encode_post_process( - output, props, offset, code_len, threshold, vecs_nb); + output, props, offset, code_len, threshold, simd_vec_len); - if (last_len > 0) { + if (simd_trailing_len > 0) { for (unsigned i = 0; i < code_len; ++i) { uint16_t* chunk = output.get(i); - for (size_t j = vecs_nb * vec_size; j < size; ++j) { + for (size_t j = simd_offset; j < size; ++j) { if (chunk[j] == threshold) { props[i].add(offset + j, OOR_MARK); } @@ -85,20 +78,13 @@ void RsFnt::encode_post_process( const uint32_t threshold = this->gf->card_minus_one(); const unsigned code_len = this->n_outputs; - // number of elements per vector register - const unsigned vec_size = simd::countof(); - // number of vector registers per fragment packet - const size_t vecs_nb = size / vec_size; - // odd number of elements not vectorized - const size_t last_len = size - vecs_nb * vec_size; - simd::encode_post_process( - output, props, offset, code_len, threshold, vecs_nb); + output, props, offset, code_len, threshold, simd_vec_len); - if (last_len > 0) { + if (simd_trailing_len > 0) { for (unsigned i = 0; i < code_len; ++i) { uint32_t* chunk = output.get(i); - for (size_t j = vecs_nb * vec_size; j < size; ++j) { + for (size_t j = simd_offset; j < size; ++j) { if (chunk[j] == threshold) { props[i].add(offset + j, OOR_MARK); } From 761eef761e37a7cf3850cd6161ba13cc90800417 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Tue, 30 Oct 2018 11:27:04 +0100 Subject: [PATCH 40/77] SIMD Basic: clang-format fix --- src/simd_basic.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/simd_basic.h 
b/src/simd_basic.h index f7cee150..106f620b 100644 --- a/src/simd_basic.h +++ b/src/simd_basic.h @@ -112,7 +112,7 @@ inline VecType MUL_MOD(VecType x, VecType y, T q) const VecType lo = (q == F3) ? BLEND8(ZERO, res, MASK8_LO) : BLEND16(ZERO, res, 0x55); const VecType hi = (q == F3) ? BLEND8(ZERO, SHIFTR(res, 1), MASK8_LO) - : BLEND16(ZERO, SHIFTR(res, 2), 0x55); + : BLEND16(ZERO, SHIFTR(res, 2), 0x55); return SUB_MOD(lo, hi, q); } @@ -137,8 +137,8 @@ inline VecType MULFULL_MOD(VecType x, VecType y, T q) if (is_all_zeros(cmp) == 1) { return res; } - return (q == F3) ? XOR(res, AND(F4_u32, cmp)) : - ADD(res, AND(ONE32, cmp)); + return (q == F3) ? XOR(res, AND(F4_u32, cmp)) + : ADD(res, AND(ONE32, cmp)); } /** From fbc8c77ba0be6d2dcf08a012e21f00e233a2654a Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Tue, 30 Oct 2018 11:27:28 +0100 Subject: [PATCH 41/77] SIMD NF4: clang-format fix --- src/simd_nf4.h | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/src/simd_nf4.h b/src/simd_nf4.h index 175a760b..e60a9880 100644 --- a/src/simd_nf4.h +++ b/src/simd_nf4.h @@ -211,8 +211,11 @@ inline __uint128_t mul(__uint128_t a, __uint128_t b) return reinterpret_cast<__uint128_t>(res); } -inline void -add_buf_to_two_bufs_rem(unsigned n, __uint128_t* x, __uint128_t* x_half, __uint128_t* y) +inline void add_buf_to_two_bufs_rem( + unsigned n, + __uint128_t* x, + __uint128_t* x_half, + __uint128_t* y) { // add last _y[] to x and x_next HalfVecType* _x = reinterpret_cast(x); @@ -240,8 +243,11 @@ inline void hadamard_mul_rem(unsigned n, __uint128_t* x, __uint128_t* y) } } -inline void -hadamard_mul_doubled_rem(unsigned n, __uint128_t* x, __uint128_t* x_half, __uint128_t* y) +inline void hadamard_mul_doubled_rem( + unsigned n, + __uint128_t* x, + __uint128_t* x_half, + __uint128_t* y) { HalfVecType* _x = reinterpret_cast(x); HalfVecType* _x_half = reinterpret_cast(x_half); @@ -291,8 +297,11 @@ inline __uint128_t mul(__uint128_t a, __uint128_t b) 
return reinterpret_cast<__uint128_t>(res); } -inline void -add_buf_to_two_bufs_rem(unsigned n, __uint128_t* x, __uint128_t* x_half, __uint128_t* y) +inline void add_buf_to_two_bufs_rem( + unsigned n, + __uint128_t* x, + __uint128_t* x_half, + __uint128_t* y) { // do nothing } @@ -302,8 +311,11 @@ inline void hadamard_mul_rem(unsigned n, __uint128_t* x, __uint128_t* y) // do nothing } -inline void -hadamard_mul_doubled_rem(unsigned n, __uint128_t* x, __uint128_t* x_half, __uint128_t* y) +inline void hadamard_mul_doubled_rem( + unsigned n, + __uint128_t* x, + __uint128_t* x_half, + __uint128_t* y) { // do nothing } From 114186c185a97506bb9a2f8649e55ea7d566b53d Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Tue, 30 Oct 2018 11:28:38 +0100 Subject: [PATCH 42/77] SIMD: rename LOAD --- src/simd_128.h | 3 +-- src/simd_256.h | 2 +- src/simd_fnt.h | 72 +++++++++++++++++++++++++------------------------- src/simd_nf4.h | 48 ++++++++++++++++----------------- 4 files changed, 62 insertions(+), 63 deletions(-) diff --git a/src/simd_128.h b/src/simd_128.h index 0baac264..e04cf6cd 100644 --- a/src/simd_128.h +++ b/src/simd_128.h @@ -56,7 +56,7 @@ typedef __m128i VecType; /* ============= Essential Operations for SSE w/ both u16 & u32 ============ */ -inline VecType LOAD(VecType* address) +inline VecType LoadToReg(VecType* address) { return _mm_load_si128(address); } @@ -157,7 +157,6 @@ inline VecType CMPEQ(VecType x, VecType y) return _mm_cmpeq_epi16(x, y); } - template inline VecType MIN(VecType x, VecType y); template <> diff --git a/src/simd_256.h b/src/simd_256.h index 85a696a3..80953e92 100644 --- a/src/simd_256.h +++ b/src/simd_256.h @@ -69,7 +69,7 @@ typedef __m128i HalfVecType; /* ============= Essential Operations for AVX2 w/ both u16 & u32 ============ */ -inline VecType LOAD(VecType* address) +inline VecType LoadToReg(VecType* address) { return _mm256_load_si256(address); } diff --git a/src/simd_fnt.h b/src/simd_fnt.h index 05d4d7d0..7e025610 100644 --- 
a/src/simd_fnt.h +++ b/src/simd_fnt.h @@ -156,13 +156,13 @@ inline void butterfly_ct_step( size_t j = 0; for (; j < end; j += 2) { - x1 = LOAD(p + j); - y1 = LOAD(q + j); + x1 = LoadToReg(p + j); + y1 = LoadToReg(q + j); BUTTERFLY_CT(rp1, c, &x1, &y1, card); - x2 = LOAD(p + j + 1); - y2 = LOAD(q + j + 1); + x2 = LoadToReg(p + j + 1); + y2 = LoadToReg(q + j + 1); BUTTERFLY_CT(rp1, c, &x2, &y2, card); @@ -173,8 +173,8 @@ inline void butterfly_ct_step( STORE(q + j + 1, y2); } for (; j < len; ++j) { - x1 = LOAD(p + j); - y1 = LOAD(q + j); + x1 = LoadToReg(p + j); + y1 = LoadToReg(q + j); BUTTERFLY_CT(rp1, c, &x1, &y1, card); @@ -213,18 +213,18 @@ inline static void do_butterfly_ct_2_layers( const size_t end = (len > 1) ? len - 1 : 0; while (j < end) { // First layer (c1, x, y) & (c1, u, v) - VecType x1 = LOAD(p); - VecType x2 = LOAD(p + 1); - VecType y1 = LOAD(q); - VecType y2 = LOAD(q + 1); + VecType x1 = LoadToReg(p); + VecType x2 = LoadToReg(p + 1); + VecType y1 = LoadToReg(q); + VecType y2 = LoadToReg(q + 1); BUTTERFLY_CT(r1p1, c1, &x1, &y1, card); BUTTERFLY_CT(r1p1, c1, &x2, &y2, card); - VecType u1 = LOAD(r); - VecType u2 = LOAD(r + 1); - VecType v1 = LOAD(s); - VecType v2 = LOAD(s + 1); + VecType u1 = LoadToReg(r); + VecType u2 = LoadToReg(r + 1); + VecType v1 = LoadToReg(s); + VecType v2 = LoadToReg(s + 1); BUTTERFLY_CT(r1p1, c1, &u1, &v1, card); BUTTERFLY_CT(r1p1, c1, &u2, &v2, card); @@ -255,10 +255,10 @@ inline static void do_butterfly_ct_2_layers( for (; j < len; ++j) { // First layer (c1, x, y) & (c1, u, v) - VecType x1 = LOAD(p + j); - VecType y1 = LOAD(q + j); - VecType u1 = LOAD(r + j); - VecType v1 = LOAD(s + j); + VecType x1 = LoadToReg(p + j); + VecType y1 = LoadToReg(q + j); + VecType u1 = LoadToReg(r + j); + VecType v1 = LoadToReg(s + j); // BUTTERFLY_3_test(c1, &x1, &y1, &u1, &v1, card); BUTTERFLY_CT(r1p1, c1, &x1, &y1, card); @@ -366,14 +366,14 @@ inline void butterfly_gs_step( size_t j = 0; for (; j < end; j += 4) { - x1 = LOAD(p + j); - x2 = 
LOAD(p + j + 1); - x3 = LOAD(p + j + 2); - x4 = LOAD(p + j + 3); - y1 = LOAD(q + j); - y2 = LOAD(q + j + 1); - y3 = LOAD(q + j + 2); - y4 = LOAD(q + j + 3); + x1 = LoadToReg(p + j); + x2 = LoadToReg(p + j + 1); + x3 = LoadToReg(p + j + 2); + x4 = LoadToReg(p + j + 3); + y1 = LoadToReg(q + j); + y2 = LoadToReg(q + j + 1); + y3 = LoadToReg(q + j + 2); + y4 = LoadToReg(q + j + 3); BUTTERFLY_GS(rp1, c, &x1, &y1, card); BUTTERFLY_GS(rp1, c, &x2, &y2, card); @@ -391,8 +391,8 @@ inline void butterfly_gs_step( STORE(q + j + 3, y4); } for (; j < len; ++j) { - x1 = LOAD(p + j); - y1 = LOAD(q + j); + x1 = LoadToReg(p + j); + y1 = LoadToReg(q + j); BUTTERFLY_GS(rp1, c, &x1, &y1, card); @@ -443,8 +443,8 @@ inline void butterfly_gs_step_simple( size_t j = 0; for (; j < end; j += 2) { - x1 = LOAD(p + j); - x2 = LOAD(p + j + 1); + x1 = LoadToReg(p + j); + x2 = LoadToReg(p + j + 1); y1 = BUTTERFLY_GS_SIMPLE(rp1, c, x1, card); y2 = BUTTERFLY_GS_SIMPLE(rp1, c, x2, card); @@ -454,7 +454,7 @@ inline void butterfly_gs_step_simple( STORE(q + j + 1, y2); } for (; j < len; ++j) { - x1 = LOAD(p + j); + x1 = LoadToReg(p + j); y1 = BUTTERFLY_GS_SIMPLE(rp1, c, x1, card); @@ -486,10 +486,10 @@ inline void encode_post_process( size_t vec_id = 0; size_t end = (vecs_nb > 3) ? 
vecs_nb - 3 : 0; for (; vec_id < end; vec_id += 4) { - VecType a1 = LOAD(buf + vec_id); - VecType a2 = LOAD(buf + vec_id + 1); - VecType a3 = LOAD(buf + vec_id + 2); - VecType a4 = LOAD(buf + vec_id + 3); + VecType a1 = LoadToReg(buf + vec_id); + VecType a2 = LoadToReg(buf + vec_id + 1); + VecType a3 = LoadToReg(buf + vec_id + 2); + VecType a4 = LoadToReg(buf + vec_id + 3); if (TESTZ(a1, _threshold) == 0) { const off_t curr_offset = offset + vec_id * vec_size; @@ -513,7 +513,7 @@ inline void encode_post_process( } } for (; vec_id < vecs_nb; ++vec_id) { - VecType a = LOAD(buf + vec_id); + VecType a = LoadToReg(buf + vec_id); uint32_t c = TESTZ(a, _threshold); if (c == 0) { const off_t curr_offset = offset + vec_id * vec_size; diff --git a/src/simd_nf4.h b/src/simd_nf4.h index e60a9880..9cf558e2 100644 --- a/src/simd_nf4.h +++ b/src/simd_nf4.h @@ -168,15 +168,15 @@ inline __uint128_t pack(__uint128_t a, uint32_t flag) #if defined(__AVX2__) -inline VecType load_to_reg(HalfVecType x) +inline VecType LoadToReg(HalfVecType x) { return _mm256_castsi128_si256(_mm_load_si128(&x)); } -inline VecType load_to_reg(__uint128_t x) +inline VecType LoadToReg(__uint128_t x) { const HalfVecType* _x = reinterpret_cast(&x); - return load_to_reg(*_x); + return LoadToReg(*_x); } inline void STORE_LOW(HalfVecType* address, VecType reg) @@ -187,8 +187,8 @@ inline void STORE_LOW(HalfVecType* address, VecType reg) inline __uint128_t add(__uint128_t a, __uint128_t b) { HalfVecType res; - VecType _a = load_to_reg(a); - VecType _b = load_to_reg(b); + VecType _a = LoadToReg(a); + VecType _b = LoadToReg(b); STORE_LOW(&res, ADD_MOD(_a, _b, F4)); return reinterpret_cast<__uint128_t>(res); } @@ -196,8 +196,8 @@ inline __uint128_t add(__uint128_t a, __uint128_t b) inline __uint128_t sub(__uint128_t a, __uint128_t b) { HalfVecType res; - VecType _a = load_to_reg(a); - VecType _b = load_to_reg(b); + VecType _a = LoadToReg(a); + VecType _b = LoadToReg(b); STORE_LOW(&res, SUB_MOD(_a, _b, F4)); return 
reinterpret_cast<__uint128_t>(res); } @@ -205,8 +205,8 @@ inline __uint128_t sub(__uint128_t a, __uint128_t b) inline __uint128_t mul(__uint128_t a, __uint128_t b) { HalfVecType res; - VecType _a = load_to_reg(a); - VecType _b = load_to_reg(b); + VecType _a = LoadToReg(a); + VecType _b = LoadToReg(b); STORE_LOW(&res, MULFULL_MOD(_a, _b, F4)); return reinterpret_cast<__uint128_t>(res); } @@ -222,9 +222,9 @@ inline void add_buf_to_two_bufs_rem( HalfVecType* _x_half = reinterpret_cast(x_half); HalfVecType* _y = reinterpret_cast(y); for (unsigned i = 0; i < n; ++i) { - VecType _x_p = load_to_reg(_x[i]); - VecType _x_next_p = load_to_reg(_x_half[i]); - VecType _y_p = load_to_reg(_y[i]); + VecType _x_p = LoadToReg(_x[i]); + VecType _x_next_p = LoadToReg(_x_half[i]); + VecType _y_p = LoadToReg(_y[i]); STORE_LOW(_x + i, ADD_MOD(_x_p, _y_p, F4)); STORE_LOW(_x_half + i, ADD_MOD(_x_next_p, _y_p, F4)); @@ -236,8 +236,8 @@ inline void hadamard_mul_rem(unsigned n, __uint128_t* x, __uint128_t* y) HalfVecType* _x = reinterpret_cast(x); HalfVecType* _y = reinterpret_cast(y); for (unsigned i = 0; i < n; ++i) { - VecType _x_p = load_to_reg(_x[i]); - VecType _y_p = load_to_reg(_y[i]); + VecType _x_p = LoadToReg(_x[i]); + VecType _y_p = LoadToReg(_y[i]); STORE_LOW(_x + i, MULFULL_MOD(_x_p, _y_p, F4)); } @@ -253,9 +253,9 @@ inline void hadamard_mul_doubled_rem( HalfVecType* _x_half = reinterpret_cast(x_half); HalfVecType* _y = reinterpret_cast(y); for (unsigned i = 0; i < n; ++i) { - VecType _x_p = load_to_reg(_x[i]); - VecType _x_next_p = load_to_reg(_x_half[i]); - VecType _y_p = load_to_reg(_y[i]); + VecType _x_p = LoadToReg(_x[i]); + VecType _x_next_p = LoadToReg(_x_half[i]); + VecType _y_p = LoadToReg(_y[i]); STORE_LOW(_x + i, MULFULL_MOD(_x_p, _y_p, F4)); STORE_LOW(_x_half + i, MULFULL_MOD(_x_next_p, _y_p, F4)); @@ -264,7 +264,7 @@ inline void hadamard_mul_doubled_rem( #elif defined(__SSE4_1__) -inline VecType load_to_reg(__uint128_t x) +inline VecType LoadToReg(__uint128_t x) { 
const VecType* _x = reinterpret_cast(&x); return _mm_load_si128(_x); @@ -273,8 +273,8 @@ inline VecType load_to_reg(__uint128_t x) inline __uint128_t add(__uint128_t a, __uint128_t b) { VecType res; - VecType _a = load_to_reg(a); - VecType _b = load_to_reg(b); + VecType _a = LoadToReg(a); + VecType _b = LoadToReg(b); STORE(&res, ADD_MOD(_a, _b, F4)); return reinterpret_cast<__uint128_t>(res); } @@ -282,8 +282,8 @@ inline __uint128_t add(__uint128_t a, __uint128_t b) inline __uint128_t sub(__uint128_t a, __uint128_t b) { VecType res; - VecType _a = load_to_reg(a); - VecType _b = load_to_reg(b); + VecType _a = LoadToReg(a); + VecType _b = LoadToReg(b); STORE(&res, SUB_MOD(_a, _b, F4)); return reinterpret_cast<__uint128_t>(res); } @@ -291,8 +291,8 @@ inline __uint128_t sub(__uint128_t a, __uint128_t b) inline __uint128_t mul(__uint128_t a, __uint128_t b) { VecType res; - VecType _a = load_to_reg(a); - VecType _b = load_to_reg(b); + VecType _a = LoadToReg(a); + VecType _b = LoadToReg(b); STORE(&res, MULFULL_MOD(_a, _b, F4)); return reinterpret_cast<__uint128_t>(res); } From 96617b349e33160e13d437aaaf69a5fc711aa71f Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Tue, 30 Oct 2018 12:00:25 +0100 Subject: [PATCH 43/77] SIMD: rename STORE --- src/simd_128.h | 2 +- src/simd_256.h | 2 +- src/simd_fnt.h | 64 +++++++++++++++++++++++++------------------------- src/simd_nf4.h | 6 ++--- 4 files changed, 37 insertions(+), 37 deletions(-) diff --git a/src/simd_128.h b/src/simd_128.h index e04cf6cd..545324d6 100644 --- a/src/simd_128.h +++ b/src/simd_128.h @@ -60,7 +60,7 @@ inline VecType LoadToReg(VecType* address) { return _mm_load_si128(address); } -inline void STORE(VecType* address, VecType reg) +inline void StoreToMem(VecType* address, VecType reg) { _mm_store_si128(address, reg); } diff --git a/src/simd_256.h b/src/simd_256.h index 80953e92..cd8df25b 100644 --- a/src/simd_256.h +++ b/src/simd_256.h @@ -73,7 +73,7 @@ inline VecType LoadToReg(VecType* address) { return 
_mm256_load_si256(address); } -inline void STORE(VecType* address, VecType reg) +inline void StoreToMem(VecType* address, VecType reg) { _mm256_store_si256(address, reg); } diff --git a/src/simd_fnt.h b/src/simd_fnt.h index 7e025610..08b73636 100644 --- a/src/simd_fnt.h +++ b/src/simd_fnt.h @@ -167,10 +167,10 @@ inline void butterfly_ct_step( BUTTERFLY_CT(rp1, c, &x2, &y2, card); // Store back to memory - STORE(p + j, x1); - STORE(p + j + 1, x2); - STORE(q + j, y1); - STORE(q + j + 1, y2); + StoreToMem(p + j, x1); + StoreToMem(p + j + 1, x2); + StoreToMem(q + j, y1); + StoreToMem(q + j + 1, y2); } for (; j < len; ++j) { x1 = LoadToReg(p + j); @@ -179,8 +179,8 @@ inline void butterfly_ct_step( BUTTERFLY_CT(rp1, c, &x1, &y1, card); // Store back to memory - STORE(p + j, x1); - STORE(q + j, y1); + StoreToMem(p + j, x1); + StoreToMem(q + j, y1); } } } @@ -237,15 +237,15 @@ inline static void do_butterfly_ct_2_layers( BUTTERFLY_CT(r3p1, c3, &y2, &v2, card); // Store back to memory - STORE(p, x1); - STORE(p + 1, x2); - STORE(q, y1); - STORE(q + 1, y2); - - STORE(r, u1); - STORE(r + 1, u2); - STORE(s, v1); - STORE(s + 1, v2); + StoreToMem(p, x1); + StoreToMem(p + 1, x2); + StoreToMem(q, y1); + StoreToMem(q + 1, y2); + + StoreToMem(r, u1); + StoreToMem(r + 1, u2); + StoreToMem(s, v1); + StoreToMem(s + 1, v2); p = p + 2; q = q + 2; r = r + 2; @@ -267,10 +267,10 @@ inline static void do_butterfly_ct_2_layers( BUTTERFLY_CT(r3p1, c3, &y1, &v1, card); // Store back to memory - STORE(p + j, x1); - STORE(q + j, y1); - STORE(r + j, u1); - STORE(s + j, v1); + StoreToMem(p + j, x1); + StoreToMem(q + j, y1); + StoreToMem(r + j, u1); + StoreToMem(s + j, v1); } } @@ -381,14 +381,14 @@ inline void butterfly_gs_step( BUTTERFLY_GS(rp1, c, &x4, &y4, card); // Store back to memory - STORE(p + j, x1); - STORE(p + j + 1, x2); - STORE(p + j + 2, x3); - STORE(p + j + 3, x4); - STORE(q + j, y1); - STORE(q + j + 1, y2); - STORE(q + j + 2, y3); - STORE(q + j + 3, y4); + StoreToMem(p + j, x1); + 
StoreToMem(p + j + 1, x2); + StoreToMem(p + j + 2, x3); + StoreToMem(p + j + 3, x4); + StoreToMem(q + j, y1); + StoreToMem(q + j + 1, y2); + StoreToMem(q + j + 2, y3); + StoreToMem(q + j + 3, y4); } for (; j < len; ++j) { x1 = LoadToReg(p + j); @@ -397,8 +397,8 @@ inline void butterfly_gs_step( BUTTERFLY_GS(rp1, c, &x1, &y1, card); // Store back to memory - STORE(p + j, x1); - STORE(q + j, y1); + StoreToMem(p + j, x1); + StoreToMem(q + j, y1); } } } @@ -450,8 +450,8 @@ inline void butterfly_gs_step_simple( y2 = BUTTERFLY_GS_SIMPLE(rp1, c, x2, card); // Store back to memory - STORE(q + j, y1); - STORE(q + j + 1, y2); + StoreToMem(q + j, y1); + StoreToMem(q + j + 1, y2); } for (; j < len; ++j) { x1 = LoadToReg(p + j); @@ -459,7 +459,7 @@ inline void butterfly_gs_step_simple( y1 = BUTTERFLY_GS_SIMPLE(rp1, c, x1, card); // Store back to memory - STORE(q + j, y1); + StoreToMem(q + j, y1); } } } diff --git a/src/simd_nf4.h b/src/simd_nf4.h index 9cf558e2..eac6af80 100644 --- a/src/simd_nf4.h +++ b/src/simd_nf4.h @@ -275,7 +275,7 @@ inline __uint128_t add(__uint128_t a, __uint128_t b) VecType res; VecType _a = LoadToReg(a); VecType _b = LoadToReg(b); - STORE(&res, ADD_MOD(_a, _b, F4)); + StoreToMem(&res, ADD_MOD(_a, _b, F4)); return reinterpret_cast<__uint128_t>(res); } @@ -284,7 +284,7 @@ inline __uint128_t sub(__uint128_t a, __uint128_t b) VecType res; VecType _a = LoadToReg(a); VecType _b = LoadToReg(b); - STORE(&res, SUB_MOD(_a, _b, F4)); + StoreToMem(&res, SUB_MOD(_a, _b, F4)); return reinterpret_cast<__uint128_t>(res); } @@ -293,7 +293,7 @@ inline __uint128_t mul(__uint128_t a, __uint128_t b) VecType res; VecType _a = LoadToReg(a); VecType _b = LoadToReg(b); - STORE(&res, MULFULL_MOD(_a, _b, F4)); + StoreToMem(&res, MULFULL_MOD(_a, _b, F4)); return reinterpret_cast<__uint128_t>(res); } From 96dc58bd359116a1737b94a5a8b9f827798348f8 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Tue, 30 Oct 2018 12:01:08 +0100 Subject: [PATCH 44/77] SIMD: rename AND --- 
src/simd_128.h | 2 +- src/simd_256.h | 2 +- src/simd_basic.h | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/simd_128.h b/src/simd_128.h index 545324d6..ffc16309 100644 --- a/src/simd_128.h +++ b/src/simd_128.h @@ -65,7 +65,7 @@ inline void StoreToMem(VecType* address, VecType reg) _mm_store_si128(address, reg); } -inline VecType AND(VecType x, VecType y) +inline VecType And(VecType x, VecType y) { return _mm_and_si128(x, y); } diff --git a/src/simd_256.h b/src/simd_256.h index cd8df25b..efa167de 100644 --- a/src/simd_256.h +++ b/src/simd_256.h @@ -78,7 +78,7 @@ inline void StoreToMem(VecType* address, VecType reg) _mm256_store_si256(address, reg); } -inline VecType AND(VecType x, VecType y) +inline VecType And(VecType x, VecType y) { return _mm256_and_si256(x, y); } diff --git a/src/simd_basic.h b/src/simd_basic.h index 106f620b..54654ef9 100644 --- a/src/simd_basic.h +++ b/src/simd_basic.h @@ -132,13 +132,13 @@ inline VecType MULFULL_MOD(VecType x, VecType y, T q) const VecType res = MUL_MOD(x, y, q); // filter elements of both of a & b = card-1 - const VecType cmp = AND(CMPEQ(x, CARD_M_1(q)), CMPEQ(y, CARD_M_1(q))); + const VecType cmp = And(CMPEQ(x, CARD_M_1(q)), CMPEQ(y, CARD_M_1(q))); if (is_all_zeros(cmp) == 1) { return res; } - return (q == F3) ? XOR(res, AND(F4_u32, cmp)) - : ADD(res, AND(ONE32, cmp)); + return (q == F3) ? 
XOR(res, And(F4_u32, cmp)) + : ADD(res, And(ONE32, cmp)); } /** @@ -161,7 +161,7 @@ inline void ADD_PROPS( T max) { const VecType b = CMPEQ(threshold, symb); - const VecType c = AND(mask, b); + const VecType c = And(mask, b); auto d = MVMSK8(c); const unsigned element_size = sizeof(T); while (d > 0) { From f2114e325fae25975c10456def2f4f7c85543c0c Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Tue, 30 Oct 2018 12:01:26 +0100 Subject: [PATCH 45/77] SIMD: rename XOR --- src/simd_128.h | 2 +- src/simd_256.h | 2 +- src/simd_basic.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/simd_128.h b/src/simd_128.h index ffc16309..029a87df 100644 --- a/src/simd_128.h +++ b/src/simd_128.h @@ -69,7 +69,7 @@ inline VecType And(VecType x, VecType y) { return _mm_and_si128(x, y); } -inline VecType XOR(VecType x, VecType y) +inline VecType Xor(VecType x, VecType y) { return _mm_xor_si128(x, y); } diff --git a/src/simd_256.h b/src/simd_256.h index efa167de..392fb414 100644 --- a/src/simd_256.h +++ b/src/simd_256.h @@ -82,7 +82,7 @@ inline VecType And(VecType x, VecType y) { return _mm256_and_si256(x, y); } -inline VecType XOR(VecType x, VecType y) +inline VecType Xor(VecType x, VecType y) { return _mm256_xor_si256(x, y); } diff --git a/src/simd_basic.h b/src/simd_basic.h index 54654ef9..c07c592c 100644 --- a/src/simd_basic.h +++ b/src/simd_basic.h @@ -137,7 +137,7 @@ inline VecType MULFULL_MOD(VecType x, VecType y, T q) if (is_all_zeros(cmp) == 1) { return res; } - return (q == F3) ? XOR(res, And(F4_u32, cmp)) + return (q == F3) ? 
Xor(res, And(F4_u32, cmp)) : ADD(res, And(ONE32, cmp)); } From 1df8169f90324052f28d3918cd0976518cd35508 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Tue, 30 Oct 2018 12:02:53 +0100 Subject: [PATCH 46/77] SIMD: rename MVMSK8 --- src/simd_128.h | 2 +- src/simd_256.h | 2 +- src/simd_basic.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/simd_128.h b/src/simd_128.h index 029a87df..47b6edc3 100644 --- a/src/simd_128.h +++ b/src/simd_128.h @@ -73,7 +73,7 @@ inline VecType Xor(VecType x, VecType y) { return _mm_xor_si128(x, y); } -inline uint16_t MVMSK8(VecType x) +inline uint16_t Msb8Mask(VecType x) { return _mm_movemask_epi8(x); } diff --git a/src/simd_256.h b/src/simd_256.h index 392fb414..4fe42fc4 100644 --- a/src/simd_256.h +++ b/src/simd_256.h @@ -86,7 +86,7 @@ inline VecType Xor(VecType x, VecType y) { return _mm256_xor_si256(x, y); } -inline uint32_t MVMSK8(VecType x) +inline uint32_t Msb8Mask(VecType x) { return _mm256_movemask_epi8(x); } diff --git a/src/simd_basic.h b/src/simd_basic.h index c07c592c..71818702 100644 --- a/src/simd_basic.h +++ b/src/simd_basic.h @@ -162,7 +162,7 @@ inline void ADD_PROPS( { const VecType b = CMPEQ(threshold, symb); const VecType c = And(mask, b); - auto d = MVMSK8(c); + auto d = Msb8Mask(c); const unsigned element_size = sizeof(T); while (d > 0) { const unsigned byte_idx = __builtin_ctz(d); From 18399bdee827e0bc1c1400d13566095728857888 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Tue, 30 Oct 2018 12:09:02 +0100 Subject: [PATCH 47/77] SIMD: rename TESTZ --- src/simd_128.h | 2 +- src/simd_256.h | 2 +- src/simd_fnt.h | 10 +++++----- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/simd_128.h b/src/simd_128.h index 47b6edc3..9f24655e 100644 --- a/src/simd_128.h +++ b/src/simd_128.h @@ -77,7 +77,7 @@ inline uint16_t Msb8Mask(VecType x) { return _mm_movemask_epi8(x); } -inline uint16_t TESTZ(VecType x, VecType y) +inline uint16_t AndIsZero(VecType x, VecType y) { return 
_mm_testz_si128(x, y); } diff --git a/src/simd_256.h b/src/simd_256.h index 4fe42fc4..01484616 100644 --- a/src/simd_256.h +++ b/src/simd_256.h @@ -90,7 +90,7 @@ inline uint32_t Msb8Mask(VecType x) { return _mm256_movemask_epi8(x); } -inline uint32_t TESTZ(VecType x, VecType y) +inline uint32_t AndIsZero(VecType x, VecType y) { return _mm256_testz_si256(x, y); } diff --git a/src/simd_fnt.h b/src/simd_fnt.h index 08b73636..53fb8df3 100644 --- a/src/simd_fnt.h +++ b/src/simd_fnt.h @@ -491,22 +491,22 @@ inline void encode_post_process( VecType a3 = LoadToReg(buf + vec_id + 2); VecType a4 = LoadToReg(buf + vec_id + 3); - if (TESTZ(a1, _threshold) == 0) { + if (AndIsZero(a1, _threshold) == 0) { const off_t curr_offset = offset + vec_id * vec_size; ADD_PROPS( props[frag_id], _threshold, mask_hi, a1, curr_offset, max); } - if (TESTZ(a2, _threshold) == 0) { + if (AndIsZero(a2, _threshold) == 0) { const off_t curr_offset = offset + (vec_id + 1) * vec_size; ADD_PROPS( props[frag_id], _threshold, mask_hi, a2, curr_offset, max); } - if (TESTZ(a3, _threshold) == 0) { + if (AndIsZero(a3, _threshold) == 0) { const off_t curr_offset = offset + (vec_id + 2) * vec_size; ADD_PROPS( props[frag_id], _threshold, mask_hi, a3, curr_offset, max); } - if (TESTZ(a4, _threshold) == 0) { + if (AndIsZero(a4, _threshold) == 0) { const off_t curr_offset = offset + (vec_id + 3) * vec_size; ADD_PROPS( props[frag_id], _threshold, mask_hi, a4, curr_offset, max); @@ -514,7 +514,7 @@ inline void encode_post_process( } for (; vec_id < vecs_nb; ++vec_id) { VecType a = LoadToReg(buf + vec_id); - uint32_t c = TESTZ(a, _threshold); + uint32_t c = AndIsZero(a, _threshold); if (c == 0) { const off_t curr_offset = offset + vec_id * vec_size; ADD_PROPS( From a873b88e6191ee93761ed4cbea6f74cd0b636975 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Tue, 30 Oct 2018 12:09:25 +0100 Subject: [PATCH 48/77] SIMD: rename is_all_zeros --- src/simd_128.h | 2 +- src/simd_256.h | 2 +- src/simd_basic.h | 2 +- 3 files 
changed, 3 insertions(+), 3 deletions(-) diff --git a/src/simd_128.h b/src/simd_128.h index 9f24655e..7b241453 100644 --- a/src/simd_128.h +++ b/src/simd_128.h @@ -81,7 +81,7 @@ inline uint16_t AndIsZero(VecType x, VecType y) { return _mm_testz_si128(x, y); } -inline int is_all_zeros(VecType x) +inline int IsZero(VecType x) { return _mm_testc_si128(ZERO, x); } diff --git a/src/simd_256.h b/src/simd_256.h index 01484616..e9048772 100644 --- a/src/simd_256.h +++ b/src/simd_256.h @@ -94,7 +94,7 @@ inline uint32_t AndIsZero(VecType x, VecType y) { return _mm256_testz_si256(x, y); } -inline int is_all_zeros(VecType x) +inline int IsZero(VecType x) { return _mm256_testc_si256(ZERO, x); } diff --git a/src/simd_basic.h b/src/simd_basic.h index 71818702..4b5cf763 100644 --- a/src/simd_basic.h +++ b/src/simd_basic.h @@ -134,7 +134,7 @@ inline VecType MULFULL_MOD(VecType x, VecType y, T q) // filter elements of both of a & b = card-1 const VecType cmp = And(CMPEQ(x, CARD_M_1(q)), CMPEQ(y, CARD_M_1(q))); - if (is_all_zeros(cmp) == 1) { + if (IsZero(cmp) == 1) { return res; } return (q == F3) ? 
Xor(res, And(F4_u32, cmp)) From 02be87fd86a07a1d1c3ae9cfca4c17d46769c24d Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Tue, 30 Oct 2018 12:10:00 +0100 Subject: [PATCH 49/77] SIMD: rename SET1 --- src/simd_128.h | 6 +++--- src/simd_256.h | 6 +++--- src/simd_basic.h | 2 +- src/simd_fnt.h | 16 ++++++++-------- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/simd_128.h b/src/simd_128.h index 7b241453..c453c19b 100644 --- a/src/simd_128.h +++ b/src/simd_128.h @@ -93,14 +93,14 @@ inline int IsZero(VecType x) /* ================= Essential Operations for SSE ================= */ template -inline VecType SET1(T val); +inline VecType SetOne(T val); template <> -inline VecType SET1(uint32_t val) +inline VecType SetOne(uint32_t val) { return _mm_set1_epi32(val); } template <> -inline VecType SET1(uint16_t val) +inline VecType SetOne(uint16_t val) { return _mm_set1_epi16(val); } diff --git a/src/simd_256.h b/src/simd_256.h index e9048772..07be31de 100644 --- a/src/simd_256.h +++ b/src/simd_256.h @@ -106,14 +106,14 @@ inline int IsZero(VecType x) /* ================= Essential Operations for AVX2 ================= */ template -inline VecType SET1(T val); +inline VecType SetOne(T val); template <> -inline VecType SET1(uint32_t val) +inline VecType SetOne(uint32_t val) { return _mm256_set1_epi32(val); } template <> -inline VecType SET1(uint16_t val) +inline VecType SetOne(uint16_t val) { return _mm256_set1_epi16(val); } diff --git a/src/simd_basic.h b/src/simd_basic.h index 4b5cf763..52dd3305 100644 --- a/src/simd_basic.h +++ b/src/simd_basic.h @@ -181,7 +181,7 @@ inline void ADD_PROPS( template inline void mul_coef_to_buf(const T a, T* src, T* dest, size_t len, T card) { - const VecType coef = SET1(a); + const VecType coef = SetOne(a); VecType* __restrict _src = reinterpret_cast(src); VecType* __restrict _dest = reinterpret_cast(dest); diff --git a/src/simd_fnt.h b/src/simd_fnt.h index 53fb8df3..1b1843cd 100644 --- a/src/simd_fnt.h +++ b/src/simd_fnt.h @@ 
-143,7 +143,7 @@ inline void butterfly_ct_step( return; } const T rp1 = r + 1; - VecType c = SET1(r); + VecType c = SetOne(r); const size_t end = (len > 1) ? len - 1 : 0; const unsigned bufs_nb = buf.get_n(); @@ -200,9 +200,9 @@ inline static void do_butterfly_ct_2_layers( const T r2p1 = r2 + 1; const T r3p1 = r3 + 1; - VecType c1 = SET1(r1); - VecType c2 = SET1(r2); - VecType c3 = SET1(r3); + VecType c1 = SetOne(r1); + VecType c2 = SetOne(r2); + VecType c3 = SetOne(r3); VecType* __restrict p = reinterpret_cast(mem[start]); VecType* __restrict q = reinterpret_cast(mem[start + m]); @@ -353,7 +353,7 @@ inline void butterfly_gs_step( } const unsigned step = m << 1; const T rp1 = r + 1; - VecType c = SET1(r); + VecType c = SetOne(r); const size_t end = (len > 3) ? len - 3 : 0; const unsigned bufs_nb = buf.get_n(); @@ -430,7 +430,7 @@ inline void butterfly_gs_step_simple( } const unsigned step = m << 1; const T rp1 = r + 1; - VecType c = SET1(r); + VecType c = SetOne(r); const size_t end = (len > 1) ? 
len - 1 : 0; const unsigned bufs_nb = buf.get_n(); @@ -476,8 +476,8 @@ inline void encode_post_process( const unsigned element_size = sizeof(T); const unsigned vec_size = countof(); const T max = 1 << (element_size * 8 - 1); - const VecType _threshold = SET1(threshold); - const VecType mask_hi = SET1(max); + const VecType _threshold = SetOne(threshold); + const VecType mask_hi = SetOne(max); const std::vector& mem = output.get_mem(); for (unsigned frag_id = 0; frag_id < code_len; ++frag_id) { From 76abc31ebfb6cf6de2bbeee04911c33fe6a95149 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Tue, 30 Oct 2018 12:10:18 +0100 Subject: [PATCH 50/77] SIMD: rename ADD --- src/simd_128.h | 6 +++--- src/simd_256.h | 6 +++--- src/simd_basic.h | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/simd_128.h b/src/simd_128.h index c453c19b..134131df 100644 --- a/src/simd_128.h +++ b/src/simd_128.h @@ -106,14 +106,14 @@ inline VecType SetOne(uint16_t val) } template -inline VecType ADD(VecType x, VecType y); +inline VecType Add(VecType x, VecType y); template <> -inline VecType ADD(VecType x, VecType y) +inline VecType Add(VecType x, VecType y) { return _mm_add_epi32(x, y); } template <> -inline VecType ADD(VecType x, VecType y) +inline VecType Add(VecType x, VecType y) { return _mm_add_epi16(x, y); } diff --git a/src/simd_256.h b/src/simd_256.h index 07be31de..4cde3dae 100644 --- a/src/simd_256.h +++ b/src/simd_256.h @@ -119,14 +119,14 @@ inline VecType SetOne(uint16_t val) } template -inline VecType ADD(VecType x, VecType y); +inline VecType Add(VecType x, VecType y); template <> -inline VecType ADD(VecType x, VecType y) +inline VecType Add(VecType x, VecType y) { return _mm256_add_epi32(x, y); } template <> -inline VecType ADD(VecType x, VecType y) +inline VecType Add(VecType x, VecType y) { return _mm256_add_epi16(x, y); } diff --git a/src/simd_basic.h b/src/simd_basic.h index 52dd3305..7a90e3ec 100644 --- a/src/simd_basic.h +++ b/src/simd_basic.h @@ -61,7 
+61,7 @@ inline VecType CARD_M_1(T q) template inline VecType ADD_MOD(VecType x, VecType y, T q) { - const VecType res = ADD(x, y); + const VecType res = Add(x, y); return MIN(res, SUB(res, CARD(q))); } @@ -77,7 +77,7 @@ template inline VecType SUB_MOD(VecType x, VecType y, T q) { const VecType res = SUB(x, y); - return MIN(res, ADD(res, CARD(q))); + return MIN(res, Add(res, CARD(q))); } /** @@ -138,7 +138,7 @@ inline VecType MULFULL_MOD(VecType x, VecType y, T q) return res; } return (q == F3) ? Xor(res, And(F4_u32, cmp)) - : ADD(res, And(ONE32, cmp)); + : Add(res, And(ONE32, cmp)); } /** From 3a5beca6796a0b33f3d60866e8ac76e35c14c2c8 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Tue, 30 Oct 2018 12:10:37 +0100 Subject: [PATCH 51/77] SIMD: rename SUB --- src/simd_128.h | 6 +++--- src/simd_256.h | 6 +++--- src/simd_basic.h | 8 ++++---- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/simd_128.h b/src/simd_128.h index 134131df..6431099b 100644 --- a/src/simd_128.h +++ b/src/simd_128.h @@ -119,14 +119,14 @@ inline VecType Add(VecType x, VecType y) } template -inline VecType SUB(VecType x, VecType y); +inline VecType Sub(VecType x, VecType y); template <> -inline VecType SUB(VecType x, VecType y) +inline VecType Sub(VecType x, VecType y) { return _mm_sub_epi32(x, y); } template <> -inline VecType SUB(VecType x, VecType y) +inline VecType Sub(VecType x, VecType y) { return _mm_sub_epi16(x, y); } diff --git a/src/simd_256.h b/src/simd_256.h index 4cde3dae..b03f4c8d 100644 --- a/src/simd_256.h +++ b/src/simd_256.h @@ -132,14 +132,14 @@ inline VecType Add(VecType x, VecType y) } template -inline VecType SUB(VecType x, VecType y); +inline VecType Sub(VecType x, VecType y); template <> -inline VecType SUB(VecType x, VecType y) +inline VecType Sub(VecType x, VecType y) { return _mm256_sub_epi32(x, y); } template <> -inline VecType SUB(VecType x, VecType y) +inline VecType Sub(VecType x, VecType y) { return _mm256_sub_epi16(x, y); } diff --git 
a/src/simd_basic.h b/src/simd_basic.h index 7a90e3ec..ea1869e8 100644 --- a/src/simd_basic.h +++ b/src/simd_basic.h @@ -62,7 +62,7 @@ template inline VecType ADD_MOD(VecType x, VecType y, T q) { const VecType res = Add(x, y); - return MIN(res, SUB(res, CARD(q))); + return MIN(res, Sub(res, CARD(q))); } /** @@ -76,7 +76,7 @@ inline VecType ADD_MOD(VecType x, VecType y, T q) template inline VecType SUB_MOD(VecType x, VecType y, T q) { - const VecType res = SUB(x, y); + const VecType res = Sub(x, y); return MIN(res, Add(res, CARD(q))); } @@ -90,8 +90,8 @@ inline VecType SUB_MOD(VecType x, VecType y, T q) template inline VecType NEG_MOD(VecType x, T q) { - const VecType res = SUB(CARD(q), x); - return MIN(res, SUB(res, CARD(q))); + const VecType res = Sub(CARD(q), x); + return MIN(res, Sub(res, CARD(q))); } /** From 5d7d0fd1724a707ab9b669eae1ee35984e354ee3 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Tue, 30 Oct 2018 12:10:59 +0100 Subject: [PATCH 52/77] SIMD: rename MUL --- src/simd_128.h | 6 +++--- src/simd_256.h | 6 +++--- src/simd_basic.h | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/simd_128.h b/src/simd_128.h index 6431099b..0eaa0b28 100644 --- a/src/simd_128.h +++ b/src/simd_128.h @@ -132,14 +132,14 @@ inline VecType Sub(VecType x, VecType y) } template -inline VecType MUL(VecType x, VecType y); +inline VecType Mul(VecType x, VecType y); template <> -inline VecType MUL(VecType x, VecType y) +inline VecType Mul(VecType x, VecType y) { return _mm_mullo_epi32(x, y); } template <> -inline VecType MUL(VecType x, VecType y) +inline VecType Mul(VecType x, VecType y) { return _mm_mullo_epi16(x, y); } diff --git a/src/simd_256.h b/src/simd_256.h index b03f4c8d..f3a73ea1 100644 --- a/src/simd_256.h +++ b/src/simd_256.h @@ -145,14 +145,14 @@ inline VecType Sub(VecType x, VecType y) } template -inline VecType MUL(VecType x, VecType y); +inline VecType Mul(VecType x, VecType y); template <> -inline VecType MUL(VecType x, VecType y) +inline 
VecType Mul(VecType x, VecType y) { return _mm256_mullo_epi32(x, y); } template <> -inline VecType MUL(VecType x, VecType y) +inline VecType Mul(VecType x, VecType y) { return _mm256_mullo_epi16(x, y); } diff --git a/src/simd_basic.h b/src/simd_basic.h index ea1869e8..1db3623f 100644 --- a/src/simd_basic.h +++ b/src/simd_basic.h @@ -108,7 +108,7 @@ inline VecType NEG_MOD(VecType x, T q) template inline VecType MUL_MOD(VecType x, VecType y, T q) { - const VecType res = MUL(x, y); + const VecType res = Mul(x, y); const VecType lo = (q == F3) ? BLEND8(ZERO, res, MASK8_LO) : BLEND16(ZERO, res, 0x55); const VecType hi = (q == F3) ? BLEND8(ZERO, SHIFTR(res, 1), MASK8_LO) From 4ac96507e358762c70e4c8cd9a383cde19a3516d Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Tue, 30 Oct 2018 12:11:38 +0100 Subject: [PATCH 53/77] SIMD: rename CMPEQ --- src/simd_128.h | 6 +++--- src/simd_256.h | 6 +++--- src/simd_basic.h | 5 +++-- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/simd_128.h b/src/simd_128.h index 0eaa0b28..c8d30c90 100644 --- a/src/simd_128.h +++ b/src/simd_128.h @@ -145,14 +145,14 @@ inline VecType Mul(VecType x, VecType y) } template -inline VecType CMPEQ(VecType x, VecType y); +inline VecType CompareEq(VecType x, VecType y); template <> -inline VecType CMPEQ(VecType x, VecType y) +inline VecType CompareEq(VecType x, VecType y) { return _mm_cmpeq_epi32(x, y); } template <> -inline VecType CMPEQ(VecType x, VecType y) +inline VecType CompareEq(VecType x, VecType y) { return _mm_cmpeq_epi16(x, y); } diff --git a/src/simd_256.h b/src/simd_256.h index f3a73ea1..bd06232c 100644 --- a/src/simd_256.h +++ b/src/simd_256.h @@ -158,14 +158,14 @@ inline VecType Mul(VecType x, VecType y) } template -inline VecType CMPEQ(VecType x, VecType y); +inline VecType CompareEq(VecType x, VecType y); template <> -inline VecType CMPEQ(VecType x, VecType y) +inline VecType CompareEq(VecType x, VecType y) { return _mm256_cmpeq_epi32(x, y); } template <> -inline VecType 
CMPEQ(VecType x, VecType y) +inline VecType CompareEq(VecType x, VecType y) { return _mm256_cmpeq_epi16(x, y); } diff --git a/src/simd_basic.h b/src/simd_basic.h index 1db3623f..1cad74e1 100644 --- a/src/simd_basic.h +++ b/src/simd_basic.h @@ -132,7 +132,8 @@ inline VecType MULFULL_MOD(VecType x, VecType y, T q) const VecType res = MUL_MOD(x, y, q); // filter elements of both of a & b = card-1 - const VecType cmp = And(CMPEQ(x, CARD_M_1(q)), CMPEQ(y, CARD_M_1(q))); + const VecType cmp = + And(CompareEq(x, CARD_M_1(q)), CompareEq(y, CARD_M_1(q))); if (IsZero(cmp) == 1) { return res; @@ -160,7 +161,7 @@ inline void ADD_PROPS( off_t offset, T max) { - const VecType b = CMPEQ(threshold, symb); + const VecType b = CompareEq(threshold, symb); const VecType c = And(mask, b); auto d = Msb8Mask(c); const unsigned element_size = sizeof(T); From 91fd3ee3cb80d5979a87e6a113c48fa86a2b1b93 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Tue, 30 Oct 2018 12:14:21 +0100 Subject: [PATCH 54/77] SIMD: rename MIN --- src/simd_128.h | 6 +++--- src/simd_256.h | 6 +++--- src/simd_basic.h | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/simd_128.h b/src/simd_128.h index c8d30c90..fe3ca80c 100644 --- a/src/simd_128.h +++ b/src/simd_128.h @@ -158,14 +158,14 @@ inline VecType CompareEq(VecType x, VecType y) } template -inline VecType MIN(VecType x, VecType y); +inline VecType Min(VecType x, VecType y); template <> -inline VecType MIN(VecType x, VecType y) +inline VecType Min(VecType x, VecType y) { return _mm_min_epu32(x, y); } template <> -inline VecType MIN(VecType x, VecType y) +inline VecType Min(VecType x, VecType y) { return _mm_min_epu16(x, y); } diff --git a/src/simd_256.h b/src/simd_256.h index bd06232c..92c5d5f6 100644 --- a/src/simd_256.h +++ b/src/simd_256.h @@ -171,14 +171,14 @@ inline VecType CompareEq(VecType x, VecType y) } template -inline VecType MIN(VecType x, VecType y); +inline VecType Min(VecType x, VecType y); template <> -inline VecType 
MIN(VecType x, VecType y) +inline VecType Min(VecType x, VecType y) { return _mm256_min_epu32(x, y); } template <> -inline VecType MIN(VecType x, VecType y) +inline VecType Min(VecType x, VecType y) { return _mm256_min_epu16(x, y); } diff --git a/src/simd_basic.h b/src/simd_basic.h index 1cad74e1..3e597e9e 100644 --- a/src/simd_basic.h +++ b/src/simd_basic.h @@ -62,7 +62,7 @@ template inline VecType ADD_MOD(VecType x, VecType y, T q) { const VecType res = Add(x, y); - return MIN(res, Sub(res, CARD(q))); + return Min(res, Sub(res, CARD(q))); } /** @@ -77,7 +77,7 @@ template inline VecType SUB_MOD(VecType x, VecType y, T q) { const VecType res = Sub(x, y); - return MIN(res, Add(res, CARD(q))); + return Min(res, Add(res, CARD(q))); } /** @@ -91,7 +91,7 @@ template inline VecType NEG_MOD(VecType x, T q) { const VecType res = Sub(CARD(q), x); - return MIN(res, Sub(res, CARD(q))); + return Min(res, Sub(res, CARD(q))); } /** From a8f971a9cdfd02ca78b75bf286ef009d2ccab033 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Tue, 30 Oct 2018 12:15:27 +0100 Subject: [PATCH 55/77] SIMD: rename CARD & CARD_M_1 --- src/simd_basic.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/simd_basic.h b/src/simd_basic.h index 3e597e9e..693603b1 100644 --- a/src/simd_basic.h +++ b/src/simd_basic.h @@ -37,13 +37,13 @@ namespace quadiron { namespace simd { template -inline VecType CARD(T q) +inline VecType Card(T q) { return (q == F3) ? F3_u32 : F4_u32; } template -inline VecType CARD_M_1(T q) +inline VecType CardMinusOne(T q) { return (q == F3) ? 
F3m1_u32 : F4m1_u32; } @@ -62,7 +62,7 @@ template inline VecType ADD_MOD(VecType x, VecType y, T q) { const VecType res = Add(x, y); - return Min(res, Sub(res, CARD(q))); + return Min(res, Sub(res, Card(q))); } /** @@ -77,7 +77,7 @@ template inline VecType SUB_MOD(VecType x, VecType y, T q) { const VecType res = Sub(x, y); - return Min(res, Add(res, CARD(q))); + return Min(res, Add(res, Card(q))); } /** @@ -90,8 +90,8 @@ inline VecType SUB_MOD(VecType x, VecType y, T q) template inline VecType NEG_MOD(VecType x, T q) { - const VecType res = Sub(CARD(q), x); - return Min(res, Sub(res, CARD(q))); + const VecType res = Sub(Card(q), x); + return Min(res, Sub(res, Card(q))); } /** @@ -133,7 +133,7 @@ inline VecType MULFULL_MOD(VecType x, VecType y, T q) // filter elements of both of a & b = card-1 const VecType cmp = - And(CompareEq(x, CARD_M_1(q)), CompareEq(y, CARD_M_1(q))); + And(CompareEq(x, CardMinusOne(q)), CompareEq(y, CardMinusOne(q))); if (IsZero(cmp) == 1) { return res; From b0516fc5bec26533f139e5aaf39eb446d1a5f547 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Tue, 30 Oct 2018 12:15:58 +0100 Subject: [PATCH 56/77] SIMD: rename ADD_MOD --- src/simd_basic.h | 4 ++-- src/simd_fnt.h | 6 +++--- src/simd_nf4.h | 12 ++++++------ 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/simd_basic.h b/src/simd_basic.h index 693603b1..d6c64895 100644 --- a/src/simd_basic.h +++ b/src/simd_basic.h @@ -59,7 +59,7 @@ inline VecType CardMinusOne(T q) * @return (x + y) mod q */ template -inline VecType ADD_MOD(VecType x, VecType y, T q) +inline VecType ModAdd(VecType x, VecType y, T q) { const VecType res = Add(x, y); return Min(res, Sub(res, Card(q))); @@ -221,7 +221,7 @@ inline void add_two_bufs(T* src, T* dest, size_t len, T card) size_t i; for (i = 0; i < _len; i++) { - _dest[i] = ADD_MOD(_src[i], _dest[i], card); + _dest[i] = ModAdd(_src[i], _dest[i], card); } if (_last_len > 0) { for (i = _len * ratio; i < len; i++) { diff --git a/src/simd_fnt.h 
b/src/simd_fnt.h index 1b1843cd..ba6767f2 100644 --- a/src/simd_fnt.h +++ b/src/simd_fnt.h @@ -56,9 +56,9 @@ inline void BUTTERFLY_CT(T rp1, VecType c, VecType* x, VecType* y, T q) VecType z = (rp1 == 2) ? *y : MUL_MOD(c, *y, q); if (rp1 < q) { *y = SUB_MOD(*x, z, q); - *x = ADD_MOD(*x, z, q); + *x = ModAdd(*x, z, q); } else { // i.e. r == q - 1 - *y = ADD_MOD(*x, z, q); + *y = ModAdd(*x, z, q); *x = SUB_MOD(*x, z, q); } } @@ -78,7 +78,7 @@ inline void BUTTERFLY_CT(T rp1, VecType c, VecType* x, VecType* y, T q) template inline void BUTTERFLY_GS(T rp1, VecType c, VecType* x, VecType* y, T q) { - VecType add = ADD_MOD(*x, *y, q); + VecType add = ModAdd(*x, *y, q); if (rp1 == 2) { *y = SUB_MOD(*x, *y, q); } else if (rp1 < q) { diff --git a/src/simd_nf4.h b/src/simd_nf4.h index eac6af80..c5dc5150 100644 --- a/src/simd_nf4.h +++ b/src/simd_nf4.h @@ -189,7 +189,7 @@ inline __uint128_t add(__uint128_t a, __uint128_t b) HalfVecType res; VecType _a = LoadToReg(a); VecType _b = LoadToReg(b); - STORE_LOW(&res, ADD_MOD(_a, _b, F4)); + STORE_LOW(&res, ModAdd(_a, _b, F4)); return reinterpret_cast<__uint128_t>(res); } @@ -226,8 +226,8 @@ inline void add_buf_to_two_bufs_rem( VecType _x_next_p = LoadToReg(_x_half[i]); VecType _y_p = LoadToReg(_y[i]); - STORE_LOW(_x + i, ADD_MOD(_x_p, _y_p, F4)); - STORE_LOW(_x_half + i, ADD_MOD(_x_next_p, _y_p, F4)); + STORE_LOW(_x + i, ModAdd(_x_p, _y_p, F4)); + STORE_LOW(_x_half + i, ModAdd(_x_next_p, _y_p, F4)); } } @@ -275,7 +275,7 @@ inline __uint128_t add(__uint128_t a, __uint128_t b) VecType res; VecType _a = LoadToReg(a); VecType _b = LoadToReg(b); - StoreToMem(&res, ADD_MOD(_a, _b, F4)); + StoreToMem(&res, ModAdd(_a, _b, F4)); return reinterpret_cast<__uint128_t>(res); } @@ -342,12 +342,12 @@ inline void add_buf_to_two_bufs(unsigned n, __uint128_t* _x, __uint128_t* _y) // add y to the first half of `x` for (i = 0; i < vec_len; ++i) { - x[i] = ADD_MOD(x[i], y[i], F4); + x[i] = ModAdd(x[i], y[i], F4); } // add y to the second half of `x` for 
(i = 0; i < vec_len; ++i) { - x_next[i] = ADD_MOD(x_next[i], y[i], F4); + x_next[i] = ModAdd(x_next[i], y[i], F4); } if (rem_len > 0) { From 61122ea64bbf45762e52d169a74e647350cb1fe8 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Tue, 30 Oct 2018 12:16:16 +0100 Subject: [PATCH 57/77] SIMD: rename SUB_MOD --- src/simd_basic.h | 6 +++--- src/simd_fnt.h | 10 +++++----- src/simd_nf4.h | 4 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/simd_basic.h b/src/simd_basic.h index d6c64895..ce40725e 100644 --- a/src/simd_basic.h +++ b/src/simd_basic.h @@ -74,7 +74,7 @@ inline VecType ModAdd(VecType x, VecType y, T q) * @return (x - y) mod q */ template -inline VecType SUB_MOD(VecType x, VecType y, T q) +inline VecType ModSub(VecType x, VecType y, T q) { const VecType res = Sub(x, y); return Min(res, Add(res, Card(q))); @@ -113,7 +113,7 @@ inline VecType MUL_MOD(VecType x, VecType y, T q) (q == F3) ? BLEND8(ZERO, res, MASK8_LO) : BLEND16(ZERO, res, 0x55); const VecType hi = (q == F3) ? BLEND8(ZERO, SHIFTR(res, 1), MASK8_LO) : BLEND16(ZERO, SHIFTR(res, 2), 0x55); - return SUB_MOD(lo, hi, q); + return ModSub(lo, hi, q); } /** @@ -244,7 +244,7 @@ inline void sub_two_bufs(T* bufa, T* bufb, T* res, size_t len, T card) size_t i; for (i = 0; i < _len; i++) { // perform subtraction - _res[i] = SUB_MOD(_bufa[i], _bufb[i], card); + _res[i] = ModSub(_bufa[i], _bufb[i], card); } if (_last_len > 0) { for (i = _len * ratio; i < len; i++) { diff --git a/src/simd_fnt.h b/src/simd_fnt.h index ba6767f2..f002c65b 100644 --- a/src/simd_fnt.h +++ b/src/simd_fnt.h @@ -55,11 +55,11 @@ inline void BUTTERFLY_CT(T rp1, VecType c, VecType* x, VecType* y, T q) { VecType z = (rp1 == 2) ? *y : MUL_MOD(c, *y, q); if (rp1 < q) { - *y = SUB_MOD(*x, z, q); + *y = ModSub(*x, z, q); *x = ModAdd(*x, z, q); } else { // i.e. 
r == q - 1 *y = ModAdd(*x, z, q); - *x = SUB_MOD(*x, z, q); + *x = ModSub(*x, z, q); } } @@ -80,12 +80,12 @@ inline void BUTTERFLY_GS(T rp1, VecType c, VecType* x, VecType* y, T q) { VecType add = ModAdd(*x, *y, q); if (rp1 == 2) { - *y = SUB_MOD(*x, *y, q); + *y = ModSub(*x, *y, q); } else if (rp1 < q) { - VecType sub = SUB_MOD(*x, *y, q); + VecType sub = ModSub(*x, *y, q); *y = MUL_MOD(c, sub, q); } else { // i.e. r == q - 1 - *y = SUB_MOD(*y, *x, q); + *y = ModSub(*y, *x, q); } *x = add; } diff --git a/src/simd_nf4.h b/src/simd_nf4.h index c5dc5150..f224cfbf 100644 --- a/src/simd_nf4.h +++ b/src/simd_nf4.h @@ -198,7 +198,7 @@ inline __uint128_t sub(__uint128_t a, __uint128_t b) HalfVecType res; VecType _a = LoadToReg(a); VecType _b = LoadToReg(b); - STORE_LOW(&res, SUB_MOD(_a, _b, F4)); + STORE_LOW(&res, ModSub(_a, _b, F4)); return reinterpret_cast<__uint128_t>(res); } @@ -284,7 +284,7 @@ inline __uint128_t sub(__uint128_t a, __uint128_t b) VecType res; VecType _a = LoadToReg(a); VecType _b = LoadToReg(b); - StoreToMem(&res, SUB_MOD(_a, _b, F4)); + StoreToMem(&res, ModSub(_a, _b, F4)); return reinterpret_cast<__uint128_t>(res); } From 2d86344995931989cf3a8c0b0859ae93856f54fb Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Tue, 30 Oct 2018 12:16:33 +0100 Subject: [PATCH 58/77] SIMD: rename NEG_MOD --- src/simd_basic.h | 4 ++-- src/simd_fnt.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/simd_basic.h b/src/simd_basic.h index ce40725e..60864e4a 100644 --- a/src/simd_basic.h +++ b/src/simd_basic.h @@ -88,7 +88,7 @@ inline VecType ModSub(VecType x, VecType y, T q) * @return (-x) mod q */ template -inline VecType NEG_MOD(VecType x, T q) +inline VecType ModNeg(VecType x, T q) { const VecType res = Sub(Card(q), x); return Min(res, Sub(res, Card(q))); @@ -292,7 +292,7 @@ inline void neg(size_t len, T* buf, T card) size_t i; for (i = 0; i < _len; i++) { - _buf[i] = NEG_MOD(_buf[i], card); + _buf[i] = ModNeg(_buf[i], card); } if (_last_len > 0) 
{ for (i = _len * ratio; i < len; i++) { diff --git a/src/simd_fnt.h b/src/simd_fnt.h index f002c65b..86938b28 100644 --- a/src/simd_fnt.h +++ b/src/simd_fnt.h @@ -110,7 +110,7 @@ inline VecType BUTTERFLY_GS_SIMPLE(T rp1, VecType c, VecType x, T q) } else if (rp1 < q) { return MUL_MOD(c, x, q); } else { - return NEG_MOD(x, q); + return ModNeg(x, q); } } From 7766b5679cd5498fb5b2ac28176745585c2db84a Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Tue, 30 Oct 2018 12:16:57 +0100 Subject: [PATCH 59/77] SIMD: rename MUL_MOD --- src/simd_basic.h | 14 +++++++------- src/simd_fnt.h | 6 +++--- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/simd_basic.h b/src/simd_basic.h index 60864e4a..4fe780cb 100644 --- a/src/simd_basic.h +++ b/src/simd_basic.h @@ -106,7 +106,7 @@ inline VecType ModNeg(VecType x, T q) * @return (x * y) mod q */ template -inline VecType MUL_MOD(VecType x, VecType y, T q) +inline VecType ModMul(VecType x, VecType y, T q) { const VecType res = Mul(x, y); const VecType lo = @@ -129,7 +129,7 @@ inline VecType MUL_MOD(VecType x, VecType y, T q) template inline VecType MULFULL_MOD(VecType x, VecType y, T q) { - const VecType res = MUL_MOD(x, y, q); + const VecType res = ModMul(x, y, q); // filter elements of both of a & b = card-1 const VecType cmp = @@ -193,13 +193,13 @@ inline void mul_coef_to_buf(const T a, T* src, T* dest, size_t len, T card) size_t i = 0; const size_t end = (_len > 3) ? 
_len - 3 : 0; for (; i < end; i += 4) { - _dest[i] = MUL_MOD(coef, _src[i], card); - _dest[i + 1] = MUL_MOD(coef, _src[i + 1], card); - _dest[i + 2] = MUL_MOD(coef, _src[i + 2], card); - _dest[i + 3] = MUL_MOD(coef, _src[i + 3], card); + _dest[i] = ModMul(coef, _src[i], card); + _dest[i + 1] = ModMul(coef, _src[i + 1], card); + _dest[i + 2] = ModMul(coef, _src[i + 2], card); + _dest[i + 3] = ModMul(coef, _src[i + 3], card); } for (; i < _len; ++i) { - _dest[i] = MUL_MOD(coef, _src[i], card); + _dest[i] = ModMul(coef, _src[i], card); } if (_last_len > 0) { diff --git a/src/simd_fnt.h b/src/simd_fnt.h index 86938b28..0d013c46 100644 --- a/src/simd_fnt.h +++ b/src/simd_fnt.h @@ -53,7 +53,7 @@ namespace simd { template inline void BUTTERFLY_CT(T rp1, VecType c, VecType* x, VecType* y, T q) { - VecType z = (rp1 == 2) ? *y : MUL_MOD(c, *y, q); + VecType z = (rp1 == 2) ? *y : ModMul(c, *y, q); if (rp1 < q) { *y = ModSub(*x, z, q); *x = ModAdd(*x, z, q); @@ -83,7 +83,7 @@ inline void BUTTERFLY_GS(T rp1, VecType c, VecType* x, VecType* y, T q) *y = ModSub(*x, *y, q); } else if (rp1 < q) { VecType sub = ModSub(*x, *y, q); - *y = MUL_MOD(c, sub, q); + *y = ModMul(c, sub, q); } else { // i.e. 
r == q - 1 *y = ModSub(*y, *x, q); } @@ -108,7 +108,7 @@ inline VecType BUTTERFLY_GS_SIMPLE(T rp1, VecType c, VecType x, T q) if (rp1 == 2) { return x; } else if (rp1 < q) { - return MUL_MOD(c, x, q); + return ModMul(c, x, q); } else { return ModNeg(x, q); } From 34b6595153a8e5255d26b239f88cb6c2e0d6b634 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Tue, 30 Oct 2018 12:17:51 +0100 Subject: [PATCH 60/77] SIMD: rename MULFULL_MOD --- src/simd_basic.h | 4 ++-- src/simd_nf4.h | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/simd_basic.h b/src/simd_basic.h index 4fe780cb..eefbf370 100644 --- a/src/simd_basic.h +++ b/src/simd_basic.h @@ -127,7 +127,7 @@ inline VecType ModMul(VecType x, VecType y, T q) * @return (x * y) mod q */ template -inline VecType MULFULL_MOD(VecType x, VecType y, T q) +inline VecType ModMulSafe(VecType x, VecType y, T q) { const VecType res = ModMul(x, y, q); @@ -270,7 +270,7 @@ inline void mul_two_bufs(T* src, T* dest, size_t len, T card) size_t i; for (i = 0; i < _len; i++) { // perform multiplicaton - _dest[i] = MULFULL_MOD(_src[i], _dest[i], card); + _dest[i] = ModMulSafe(_src[i], _dest[i], card); } if (_last_len > 0) { for (i = _len * ratio; i < len; i++) { diff --git a/src/simd_nf4.h b/src/simd_nf4.h index f224cfbf..0accb358 100644 --- a/src/simd_nf4.h +++ b/src/simd_nf4.h @@ -207,7 +207,7 @@ inline __uint128_t mul(__uint128_t a, __uint128_t b) HalfVecType res; VecType _a = LoadToReg(a); VecType _b = LoadToReg(b); - STORE_LOW(&res, MULFULL_MOD(_a, _b, F4)); + STORE_LOW(&res, ModMulSafe(_a, _b, F4)); return reinterpret_cast<__uint128_t>(res); } @@ -239,7 +239,7 @@ inline void hadamard_mul_rem(unsigned n, __uint128_t* x, __uint128_t* y) VecType _x_p = LoadToReg(_x[i]); VecType _y_p = LoadToReg(_y[i]); - STORE_LOW(_x + i, MULFULL_MOD(_x_p, _y_p, F4)); + STORE_LOW(_x + i, ModMulSafe(_x_p, _y_p, F4)); } } @@ -257,8 +257,8 @@ inline void hadamard_mul_doubled_rem( VecType _x_next_p = LoadToReg(_x_half[i]); 
VecType _y_p = LoadToReg(_y[i]); - STORE_LOW(_x + i, MULFULL_MOD(_x_p, _y_p, F4)); - STORE_LOW(_x_half + i, MULFULL_MOD(_x_next_p, _y_p, F4)); + STORE_LOW(_x + i, ModMulSafe(_x_p, _y_p, F4)); + STORE_LOW(_x_half + i, ModMulSafe(_x_next_p, _y_p, F4)); } } @@ -293,7 +293,7 @@ inline __uint128_t mul(__uint128_t a, __uint128_t b) VecType res; VecType _a = LoadToReg(a); VecType _b = LoadToReg(b); - StoreToMem(&res, MULFULL_MOD(_a, _b, F4)); + StoreToMem(&res, ModMulSafe(_a, _b, F4)); return reinterpret_cast<__uint128_t>(res); } @@ -369,7 +369,7 @@ inline void hadamard_mul(unsigned n, __uint128_t* _x, __uint128_t* _y) // multiply y to the first half of `x` for (i = 0; i < vec_len; ++i) { - x[i] = MULFULL_MOD(x[i], y[i], F4); + x[i] = ModMulSafe(x[i], y[i], F4); } if (rem_len > 0) { From dca8686c1b6c6609865899ddfe3daec17113bbed Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Tue, 30 Oct 2018 12:18:13 +0100 Subject: [PATCH 61/77] SIMD: rename ADD_PROPS --- src/simd_basic.h | 2 +- src/simd_fnt.h | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/simd_basic.h b/src/simd_basic.h index eefbf370..d33e5b92 100644 --- a/src/simd_basic.h +++ b/src/simd_basic.h @@ -153,7 +153,7 @@ inline VecType ModMulSafe(VecType x, VecType y, T q) * @param max a dummy variable */ template -inline void ADD_PROPS( +inline void AddProps( Properties& props, VecType threshold, VecType mask, diff --git a/src/simd_fnt.h b/src/simd_fnt.h index 0d013c46..4761e945 100644 --- a/src/simd_fnt.h +++ b/src/simd_fnt.h @@ -493,22 +493,22 @@ inline void encode_post_process( if (AndIsZero(a1, _threshold) == 0) { const off_t curr_offset = offset + vec_id * vec_size; - ADD_PROPS( + AddProps( props[frag_id], _threshold, mask_hi, a1, curr_offset, max); } if (AndIsZero(a2, _threshold) == 0) { const off_t curr_offset = offset + (vec_id + 1) * vec_size; - ADD_PROPS( + AddProps( props[frag_id], _threshold, mask_hi, a2, curr_offset, max); } if (AndIsZero(a3, _threshold) == 0) { const off_t 
curr_offset = offset + (vec_id + 2) * vec_size; - ADD_PROPS( + AddProps( props[frag_id], _threshold, mask_hi, a3, curr_offset, max); } if (AndIsZero(a4, _threshold) == 0) { const off_t curr_offset = offset + (vec_id + 3) * vec_size; - ADD_PROPS( + AddProps( props[frag_id], _threshold, mask_hi, a4, curr_offset, max); } } @@ -517,7 +517,7 @@ inline void encode_post_process( uint32_t c = AndIsZero(a, _threshold); if (c == 0) { const off_t curr_offset = offset + vec_id * vec_size; - ADD_PROPS( + AddProps( props[frag_id], _threshold, mask_hi, a, curr_offset, max); } } From 84e0714ca76ed95845aaea1d88e163eb25352bda Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Tue, 30 Oct 2018 12:23:24 +0100 Subject: [PATCH 62/77] SIMD: rename BUTTERFLY_CT --- src/simd_fnt.h | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/src/simd_fnt.h b/src/simd_fnt.h index 4761e945..08376688 100644 --- a/src/simd_fnt.h +++ b/src/simd_fnt.h @@ -51,7 +51,7 @@ namespace simd { * @param q modular */ template -inline void BUTTERFLY_CT(T rp1, VecType c, VecType* x, VecType* y, T q) +inline void ButterflyCT(T rp1, VecType c, VecType* x, VecType* y, T q) { VecType z = (rp1 == 2) ? 
*y : ModMul(c, *y, q); if (rp1 < q) { @@ -159,12 +159,12 @@ inline void butterfly_ct_step( x1 = LoadToReg(p + j); y1 = LoadToReg(q + j); - BUTTERFLY_CT(rp1, c, &x1, &y1, card); + ButterflyCT(rp1, c, &x1, &y1, card); x2 = LoadToReg(p + j + 1); y2 = LoadToReg(q + j + 1); - BUTTERFLY_CT(rp1, c, &x2, &y2, card); + ButterflyCT(rp1, c, &x2, &y2, card); // Store back to memory StoreToMem(p + j, x1); @@ -176,7 +176,7 @@ inline void butterfly_ct_step( x1 = LoadToReg(p + j); y1 = LoadToReg(q + j); - BUTTERFLY_CT(rp1, c, &x1, &y1, card); + ButterflyCT(rp1, c, &x1, &y1, card); // Store back to memory StoreToMem(p + j, x1); @@ -218,23 +218,23 @@ inline static void do_butterfly_ct_2_layers( VecType y1 = LoadToReg(q); VecType y2 = LoadToReg(q + 1); - BUTTERFLY_CT(r1p1, c1, &x1, &y1, card); - BUTTERFLY_CT(r1p1, c1, &x2, &y2, card); + ButterflyCT(r1p1, c1, &x1, &y1, card); + ButterflyCT(r1p1, c1, &x2, &y2, card); VecType u1 = LoadToReg(r); VecType u2 = LoadToReg(r + 1); VecType v1 = LoadToReg(s); VecType v2 = LoadToReg(s + 1); - BUTTERFLY_CT(r1p1, c1, &u1, &v1, card); - BUTTERFLY_CT(r1p1, c1, &u2, &v2, card); + ButterflyCT(r1p1, c1, &u1, &v1, card); + ButterflyCT(r1p1, c1, &u2, &v2, card); // Second layer (c2, x, u) & (c3, y, v) - BUTTERFLY_CT(r2p1, c2, &x1, &u1, card); - BUTTERFLY_CT(r2p1, c2, &x2, &u2, card); + ButterflyCT(r2p1, c2, &x1, &u1, card); + ButterflyCT(r2p1, c2, &x2, &u2, card); - BUTTERFLY_CT(r3p1, c3, &y1, &v1, card); - BUTTERFLY_CT(r3p1, c3, &y2, &v2, card); + ButterflyCT(r3p1, c3, &y1, &v1, card); + ButterflyCT(r3p1, c3, &y2, &v2, card); // Store back to memory StoreToMem(p, x1); @@ -261,10 +261,10 @@ inline static void do_butterfly_ct_2_layers( VecType v1 = LoadToReg(s + j); // BUTTERFLY_3_test(c1, &x1, &y1, &u1, &v1, card); - BUTTERFLY_CT(r1p1, c1, &x1, &y1, card); - BUTTERFLY_CT(r1p1, c1, &u1, &v1, card); - BUTTERFLY_CT(r2p1, c2, &x1, &u1, card); - BUTTERFLY_CT(r3p1, c3, &y1, &v1, card); + ButterflyCT(r1p1, c1, &x1, &y1, card); + ButterflyCT(r1p1, c1, &u1, &v1, 
card); + ButterflyCT(r2p1, c2, &x1, &u1, card); + ButterflyCT(r3p1, c3, &y1, &v1, card); // Store back to memory StoreToMem(p + j, x1); From 51d2f8d2423f7df969be4cd9cdcf15931743341c Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Tue, 30 Oct 2018 12:23:45 +0100 Subject: [PATCH 63/77] SIMD: rename BUTTERFLY_GS --- src/simd_fnt.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/simd_fnt.h b/src/simd_fnt.h index 08376688..c5672884 100644 --- a/src/simd_fnt.h +++ b/src/simd_fnt.h @@ -76,7 +76,7 @@ inline void ButterflyCT(T rp1, VecType c, VecType* x, VecType* y, T q) * @param q modular */ template -inline void BUTTERFLY_GS(T rp1, VecType c, VecType* x, VecType* y, T q) +inline void ButterflyGS(T rp1, VecType c, VecType* x, VecType* y, T q) { VecType add = ModAdd(*x, *y, q); if (rp1 == 2) { @@ -375,10 +375,10 @@ inline void butterfly_gs_step( y3 = LoadToReg(q + j + 2); y4 = LoadToReg(q + j + 3); - BUTTERFLY_GS(rp1, c, &x1, &y1, card); - BUTTERFLY_GS(rp1, c, &x2, &y2, card); - BUTTERFLY_GS(rp1, c, &x3, &y3, card); - BUTTERFLY_GS(rp1, c, &x4, &y4, card); + ButterflyGS(rp1, c, &x1, &y1, card); + ButterflyGS(rp1, c, &x2, &y2, card); + ButterflyGS(rp1, c, &x3, &y3, card); + ButterflyGS(rp1, c, &x4, &y4, card); // Store back to memory StoreToMem(p + j, x1); @@ -394,7 +394,7 @@ inline void butterfly_gs_step( x1 = LoadToReg(p + j); y1 = LoadToReg(q + j); - BUTTERFLY_GS(rp1, c, &x1, &y1, card); + ButterflyGS(rp1, c, &x1, &y1, card); // Store back to memory StoreToMem(p + j, x1); From cc7d37c4cd1a98ea1e1ae9c4084a6b6fa315868f Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Tue, 30 Oct 2018 12:24:11 +0100 Subject: [PATCH 64/77] SIMD: rename BUTTERFLY_GS_SIMPLE --- src/simd_fnt.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/simd_fnt.h b/src/simd_fnt.h index c5672884..885a3d25 100644 --- a/src/simd_fnt.h +++ b/src/simd_fnt.h @@ -103,7 +103,7 @@ inline void ButterflyGS(T rp1, VecType c, VecType* x, VecType* y, T q) * 
@return r * x */ template -inline VecType BUTTERFLY_GS_SIMPLE(T rp1, VecType c, VecType x, T q) +inline VecType ButterflySimpleGS(T rp1, VecType c, VecType x, T q) { if (rp1 == 2) { return x; @@ -446,8 +446,8 @@ inline void butterfly_gs_step_simple( x1 = LoadToReg(p + j); x2 = LoadToReg(p + j + 1); - y1 = BUTTERFLY_GS_SIMPLE(rp1, c, x1, card); - y2 = BUTTERFLY_GS_SIMPLE(rp1, c, x2, card); + y1 = ButterflySimpleGS(rp1, c, x1, card); + y2 = ButterflySimpleGS(rp1, c, x2, card); // Store back to memory StoreToMem(q + j, y1); @@ -456,7 +456,7 @@ inline void butterfly_gs_step_simple( for (; j < len; ++j) { x1 = LoadToReg(p + j); - y1 = BUTTERFLY_GS_SIMPLE(rp1, c, x1, card); + y1 = ButterflySimpleGS(rp1, c, x1, card); // Store back to memory StoreToMem(q + j, y1); From db987642265f9372bd1f870dccd0b9598e530a8b Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Tue, 30 Oct 2018 12:31:55 +0100 Subject: [PATCH 65/77] SIMD: rename STORE_LOW --- src/simd_nf4.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/simd_nf4.h b/src/simd_nf4.h index 0accb358..25b435c0 100644 --- a/src/simd_nf4.h +++ b/src/simd_nf4.h @@ -179,7 +179,7 @@ inline VecType LoadToReg(__uint128_t x) return LoadToReg(*_x); } -inline void STORE_LOW(HalfVecType* address, VecType reg) +inline void StoreLowHalfToMem(HalfVecType* address, VecType reg) { _mm_store_si128(address, _mm256_castsi256_si128(reg)); } @@ -189,7 +189,7 @@ inline __uint128_t add(__uint128_t a, __uint128_t b) HalfVecType res; VecType _a = LoadToReg(a); VecType _b = LoadToReg(b); - STORE_LOW(&res, ModAdd(_a, _b, F4)); + StoreLowHalfToMem(&res, ModAdd(_a, _b, F4)); return reinterpret_cast<__uint128_t>(res); } @@ -198,7 +198,7 @@ inline __uint128_t sub(__uint128_t a, __uint128_t b) HalfVecType res; VecType _a = LoadToReg(a); VecType _b = LoadToReg(b); - STORE_LOW(&res, ModSub(_a, _b, F4)); + StoreLowHalfToMem(&res, ModSub(_a, _b, F4)); return reinterpret_cast<__uint128_t>(res); } @@ -207,7 +207,7 @@ inline 
__uint128_t mul(__uint128_t a, __uint128_t b) HalfVecType res; VecType _a = LoadToReg(a); VecType _b = LoadToReg(b); - STORE_LOW(&res, ModMulSafe(_a, _b, F4)); + StoreLowHalfToMem(&res, ModMulSafe(_a, _b, F4)); return reinterpret_cast<__uint128_t>(res); } @@ -226,8 +226,8 @@ inline void add_buf_to_two_bufs_rem( VecType _x_next_p = LoadToReg(_x_half[i]); VecType _y_p = LoadToReg(_y[i]); - STORE_LOW(_x + i, ModAdd(_x_p, _y_p, F4)); - STORE_LOW(_x_half + i, ModAdd(_x_next_p, _y_p, F4)); + StoreLowHalfToMem(_x + i, ModAdd(_x_p, _y_p, F4)); + StoreLowHalfToMem(_x_half + i, ModAdd(_x_next_p, _y_p, F4)); } } @@ -239,7 +239,7 @@ inline void hadamard_mul_rem(unsigned n, __uint128_t* x, __uint128_t* y) VecType _x_p = LoadToReg(_x[i]); VecType _y_p = LoadToReg(_y[i]); - STORE_LOW(_x + i, ModMulSafe(_x_p, _y_p, F4)); + StoreLowHalfToMem(_x + i, ModMulSafe(_x_p, _y_p, F4)); } } @@ -257,8 +257,8 @@ inline void hadamard_mul_doubled_rem( VecType _x_next_p = LoadToReg(_x_half[i]); VecType _y_p = LoadToReg(_y[i]); - STORE_LOW(_x + i, ModMulSafe(_x_p, _y_p, F4)); - STORE_LOW(_x_half + i, ModMulSafe(_x_next_p, _y_p, F4)); + StoreLowHalfToMem(_x + i, ModMulSafe(_x_p, _y_p, F4)); + StoreLowHalfToMem(_x_half + i, ModMulSafe(_x_next_p, _y_p, F4)); } } From e9de0badd522cafe4e22ca11187cadd57254788f Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Tue, 30 Oct 2018 12:36:04 +0100 Subject: [PATCH 66/77] SIMD: rename macro names --- src/simd_128.h | 16 ++++++++-------- src/simd_256.h | 16 ++++++++-------- src/simd_basic.h | 8 ++++---- 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/src/simd_128.h b/src/simd_128.h index fe3ca80c..6cfbc8e5 100644 --- a/src/simd_128.h +++ b/src/simd_128.h @@ -40,17 +40,17 @@ typedef __m128i VecType; /* ============= Constant variable ============ */ -#define F4_u32 _mm_set1_epi32(65537) -#define F4m1_u32 _mm_set1_epi32(65536) -#define F3_u32 _mm_set1_epi32(257) -#define F3m1_u32 _mm_set1_epi32(256) +#define F4_U32 _mm_set1_epi32(65537) +#define 
F4_MINUS_ONE_U32 _mm_set1_epi32(65536) +#define F3_U32 _mm_set1_epi32(257) +#define F3_MINUS_ONE_U32 _mm_set1_epi32(256) -#define F3_u16 _mm_set1_epi16(257) -#define F3m1_u16 _mm_set1_epi16(256) +#define F3_U16 _mm_set1_epi16(257) +#define F3_MINUS_ONE_U16 _mm_set1_epi16(256) #define ZERO (_mm_setzero_si128()) -#define ONE16 (_mm_set1_epi16(1)) -#define ONE32 (_mm_set1_epi32(1)) +#define ONE_U16 (_mm_set1_epi16(1)) +#define ONE_U32 (_mm_set1_epi32(1)) #define MASK8_LO (_mm_set1_epi16(0x80)) diff --git a/src/simd_256.h b/src/simd_256.h index 92c5d5f6..8b9ae688 100644 --- a/src/simd_256.h +++ b/src/simd_256.h @@ -53,17 +53,17 @@ typedef __m128i HalfVecType; /* ============= Constant variable ============ */ -#define F4_u32 _mm256_set1_epi32(65537) -#define F4m1_u32 _mm256_set1_epi32(65536) -#define F3_u32 _mm256_set1_epi32(257) -#define F3m1_u32 _mm256_set1_epi32(256) +#define F4_U32 _mm256_set1_epi32(65537) +#define F4_MINUS_ONE_U32 _mm256_set1_epi32(65536) +#define F3_U32 _mm256_set1_epi32(257) +#define F3_MINUS_ONE_U32 _mm256_set1_epi32(256) -#define F3_u16 _mm256_set1_epi16(257) -#define F3m1_u16 _mm256_set1_epi16(256) +#define F3_U16 _mm256_set1_epi16(257) +#define F3_MINUS_ONE_U16 _mm256_set1_epi16(256) #define ZERO (_mm256_setzero_si256()) -#define ONE16 (_mm256_set1_epi16(1)) -#define ONE32 (_mm256_set1_epi32(1)) +#define ONE_U16 (_mm256_set1_epi16(1)) +#define ONE_U32 (_mm256_set1_epi32(1)) #define MASK8_LO (_mm256_set1_epi16(0x80)) diff --git a/src/simd_basic.h b/src/simd_basic.h index d33e5b92..de302273 100644 --- a/src/simd_basic.h +++ b/src/simd_basic.h @@ -39,13 +39,13 @@ namespace simd { template inline VecType Card(T q) { - return (q == F3) ? F3_u32 : F4_u32; + return (q == F3) ? F3_U32 : F4_U32; } template inline VecType CardMinusOne(T q) { - return (q == F3) ? F3m1_u32 : F4m1_u32; + return (q == F3) ? 
F3_MINUS_ONE_U32 : F4_MINUS_ONE_U32; } /* ================= Basic Operations ================= */ @@ -138,8 +138,8 @@ inline VecType ModMulSafe(VecType x, VecType y, T q) if (IsZero(cmp) == 1) { return res; } - return (q == F3) ? Xor(res, And(F4_u32, cmp)) - : Add(res, And(ONE32, cmp)); + return (q == F3) ? Xor(res, And(F4_U32, cmp)) + : Add(res, And(ONE_U32, cmp)); } /** From d35c4d31f9b2b7c57439dd8c0d2a435c9cda1d63 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Tue, 30 Oct 2018 13:40:41 +0100 Subject: [PATCH 67/77] SIMD Basic: fix Card & CardMinusOne functions --- src/simd_basic.h | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/simd_basic.h b/src/simd_basic.h index de302273..b6bd7c80 100644 --- a/src/simd_basic.h +++ b/src/simd_basic.h @@ -37,13 +37,27 @@ namespace quadiron { namespace simd { template -inline VecType Card(T q) +inline VecType Card(T q); +template <> +inline VecType Card(uint16_t q) +{ + return F3_U16; +} +template <> +inline VecType Card(uint32_t q) { return (q == F3) ? F3_U32 : F4_U32; } template -inline VecType CardMinusOne(T q) +inline VecType CardMinusOne(T q); +template <> +inline VecType CardMinusOne(uint16_t q) +{ + return F3_MINUS_ONE_U16; +} +template <> +inline VecType CardMinusOne(uint32_t q) { return (q == F3) ? F3_MINUS_ONE_U32 : F4_MINUS_ONE_U32; } From 05d39381440f8c3337fbd2741d824292eb61bc9a Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Tue, 30 Oct 2018 13:41:18 +0100 Subject: [PATCH 68/77] SIMD Basic: refactor get low/high half elements for ModMul --- src/simd_basic.h | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/src/simd_basic.h b/src/simd_basic.h index b6bd7c80..92382217 100644 --- a/src/simd_basic.h +++ b/src/simd_basic.h @@ -62,6 +62,19 @@ inline VecType CardMinusOne(uint32_t q) return (q == F3) ? F3_MINUS_ONE_U32 : F4_MINUS_ONE_U32; } +template +inline VecType GetLowHalf(VecType x, T q) +{ + return (q == F3) ? 
BLEND8(ZERO, x, MASK8_LO) : BLEND16(ZERO, x, 0x55); +} + +template +inline VecType GetHighHalf(VecType x, T q) +{ + return (q == F3) ? BLEND8(ZERO, SHIFTR(x, 1), MASK8_LO) + : BLEND16(ZERO, SHIFTR(x, 2), 0x55); +} + /* ================= Basic Operations ================= */ /** @@ -123,10 +136,8 @@ template inline VecType ModMul(VecType x, VecType y, T q) { const VecType res = Mul(x, y); - const VecType lo = - (q == F3) ? BLEND8(ZERO, res, MASK8_LO) : BLEND16(ZERO, res, 0x55); - const VecType hi = (q == F3) ? BLEND8(ZERO, SHIFTR(res, 1), MASK8_LO) - : BLEND16(ZERO, SHIFTR(res, 2), 0x55); + const VecType lo = GetLowHalf(res, q); + const VecType hi = GetHighHalf(res, q); return ModSub(lo, hi, q); } From 0c275e43126d0393192e97cd8a2ca39791c94801 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Wed, 31 Oct 2018 15:30:33 +0100 Subject: [PATCH 69/77] Core includes only SIMD's allocator It moves typedef for DoubleSize and SignedDoubleSize from arith to core --- src/arith.h | 6 ------ src/core.h | 8 +++++++- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/arith.h b/src/arith.h index 1da85320..9b230677 100644 --- a/src/arith.h +++ b/src/arith.h @@ -41,12 +41,6 @@ namespace quadiron { -template -using DoubleSizeVal = typename DoubleSize::T; - -template -using SignedDoubleSizeVal = typename SignedDoubleSize::T; - /** Base/core arithmetical functions of QuadIron. */ namespace arith { diff --git a/src/core.h b/src/core.h index 5eaf84fe..a9033f90 100644 --- a/src/core.h +++ b/src/core.h @@ -34,7 +34,7 @@ #include #include "big_int.h" -#include "simd/simd.h" +#include "simd/allocator.h" namespace quadiron { @@ -78,6 +78,12 @@ struct SignedDoubleSize<__uint128_t> { typedef Int256 T; }; +template +using DoubleSizeVal = typename DoubleSize::T; + +template +using SignedDoubleSizeVal = typename SignedDoubleSize::T; + /** A group of values stored as one. * * This allows faster processing, as the values can be processed as one. 
From c72dd8af0cfe4c50b4942944545e1bf140eb5944 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Wed, 31 Oct 2018 15:35:45 +0100 Subject: [PATCH 70/77] SIMD: update simd header --- src/simd/simd.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/src/simd/simd.h b/src/simd/simd.h index 9cdcd251..ad02f3fc 100644 --- a/src/simd/simd.h +++ b/src/simd/simd.h @@ -31,6 +31,8 @@ #ifndef __QUAD_SIMD_SIMD_H__ #define __QUAD_SIMD_SIMD_H__ +#include "property.h" + #include "simd/allocator.h" #include "simd/definitions.h" @@ -57,4 +59,31 @@ static constexpr std::size_t countof() } // namespace simd } // namespace quadiron +#ifdef QUADIRON_USE_SIMD + +const unsigned F4 = 65537; +const unsigned F3 = 257; + +// Include essential operations that use SIMD functions +#if defined(__AVX2__) + +#include "simd_256.h" + +#elif defined(__SSE4_1__) + +#include "simd_128.h" + +#endif + +// Include basic operations +#include "simd_basic.h" + +// Include accelerated operations dedicated for FNT +#include "simd_fnt.h" + +// Include accelerated operations dedicated for NF4 +#include "simd_nf4.h" + +#endif // #ifdef QUADIRON_USE_SIMD + #endif From ce91fb25ee470a594cef147931540e0ecaf29137 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Wed, 31 Oct 2018 15:35:58 +0100 Subject: [PATCH 71/77] Remove simd.h --- src/simd.h | 73 ------------------------------------------------------ 1 file changed, 73 deletions(-) delete mode 100644 src/simd.h diff --git a/src/simd.h b/src/simd.h deleted file mode 100644 index 41e4935e..00000000 --- a/src/simd.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright 2017-2018 Scality - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. 
Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ - -#ifndef __QUAD_SIMD_H__ -#define __QUAD_SIMD_H__ - -#ifdef QUADIRON_USE_SIMD - -#include "property.h" -#include "simd/simd.h" - -const unsigned F4 = 65537; -const unsigned F3 = 257; - -namespace quadiron { -/** The namespace simd contains functions accelerated by - * using SIMD operations over 128bits and 256bits - * - * It supports operations on 16-bit and 32-bit numbers - */ -namespace simd { - -// Vectorized operations are implemented in appropriated headers simd*.h - -} // namespace simd -} // namespace quadiron - -// Include essential operations that use SIMD functions -#if defined(__AVX2__) -#include "simd_256.h" -#elif defined(__SSE4_1__) -#include "simd_128.h" -#endif - -// Include basic operations -#include "simd_basic.h" - -// Include accelerated operations dedicated for FNT -#include "simd_fnt.h" - -// Include accelerated operations dedicated for NF4 -#include "simd_nf4.h" - -#endif // #ifdef QUADIRON_USE_SIMD - -#endif From bb838aedf44a7359436d0bd2d5b50155f94c1494 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Wed, 31 Oct 2018 15:38:58 +0100 Subject: [PATCH 72/77] SIMD: move simd_* header to simd dir --- src/{ => simd}/simd_128.h | 0 src/{ => simd}/simd_256.h | 0 src/{ => simd}/simd_basic.h | 0 src/{ => simd}/simd_fnt.h | 0 src/{ => simd}/simd_nf4.h | 0 5 files changed, 0 insertions(+), 0 deletions(-) rename src/{ => simd}/simd_128.h (100%) rename src/{ => simd}/simd_256.h (100%) rename src/{ => simd}/simd_basic.h (100%) rename src/{ => simd}/simd_fnt.h (100%) rename src/{ => simd}/simd_nf4.h (100%) diff --git a/src/simd_128.h b/src/simd/simd_128.h similarity index 100% rename from src/simd_128.h rename to src/simd/simd_128.h diff --git a/src/simd_256.h b/src/simd/simd_256.h similarity index 100% rename from src/simd_256.h rename to src/simd/simd_256.h diff --git a/src/simd_basic.h b/src/simd/simd_basic.h similarity index 100% rename from src/simd_basic.h rename to src/simd/simd_basic.h diff --git a/src/simd_fnt.h b/src/simd/simd_fnt.h 
similarity index 100% rename from src/simd_fnt.h rename to src/simd/simd_fnt.h diff --git a/src/simd_nf4.h b/src/simd/simd_nf4.h similarity index 100% rename from src/simd_nf4.h rename to src/simd/simd_nf4.h From 97d9cf88b37a33dd94ddaab09565bedc9e6aa107 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Wed, 31 Oct 2018 15:39:58 +0100 Subject: [PATCH 73/77] SIMD: remove useless included headers --- src/simd/simd_128.h | 2 -- src/simd/simd_256.h | 2 -- src/simd/simd_basic.h | 2 -- src/simd/simd_fnt.h | 2 -- src/simd/simd_nf4.h | 4 ---- 5 files changed, 12 deletions(-) diff --git a/src/simd/simd_128.h b/src/simd/simd_128.h index 6cfbc8e5..bb33ee4f 100644 --- a/src/simd/simd_128.h +++ b/src/simd/simd_128.h @@ -31,8 +31,6 @@ #ifndef __QUAD_SIMD_128_H__ #define __QUAD_SIMD_128_H__ -#include - namespace quadiron { namespace simd { diff --git a/src/simd/simd_256.h b/src/simd/simd_256.h index 8b9ae688..0723e80f 100644 --- a/src/simd/simd_256.h +++ b/src/simd/simd_256.h @@ -31,8 +31,6 @@ #ifndef __QUAD_SIMD_256_H__ #define __QUAD_SIMD_256_H__ -#include - /* GCC doesn't include the split store intrinsics so define them here. 
*/ #if defined(__GNUC__) && !defined(__clang__) diff --git a/src/simd/simd_basic.h b/src/simd/simd_basic.h index 92382217..ab2301ad 100644 --- a/src/simd/simd_basic.h +++ b/src/simd/simd_basic.h @@ -31,8 +31,6 @@ #ifndef __QUAD_SIMD_BASIC_H__ #define __QUAD_SIMD_BASIC_H__ -#include - namespace quadiron { namespace simd { diff --git a/src/simd/simd_fnt.h b/src/simd/simd_fnt.h index 885a3d25..97467050 100644 --- a/src/simd/simd_fnt.h +++ b/src/simd/simd_fnt.h @@ -31,8 +31,6 @@ #ifndef __QUAD_SIMD_FNT_H__ #define __QUAD_SIMD_FNT_H__ -#include - namespace quadiron { namespace simd { diff --git a/src/simd/simd_nf4.h b/src/simd/simd_nf4.h index 25b435c0..8c3c1d92 100644 --- a/src/simd/simd_nf4.h +++ b/src/simd/simd_nf4.h @@ -31,10 +31,6 @@ #ifndef __QUAD_SIMD_NF4_H__ #define __QUAD_SIMD_NF4_H__ -#include - -#include - namespace quadiron { namespace simd { From 4b26041a29fa2b0b67f39c4c1505b84fa666c159 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Wed, 31 Oct 2018 15:45:12 +0100 Subject: [PATCH 74/77] Buffers includes only SIMD's allocator --- src/vec_buffers.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/vec_buffers.h b/src/vec_buffers.h index d31c69d3..2122d300 100644 --- a/src/vec_buffers.h +++ b/src/vec_buffers.h @@ -38,7 +38,7 @@ #include #include "core.h" -#include "simd/simd.h" +#include "simd/allocator.h" namespace quadiron { namespace vec { From cac0b66dea742a9b9a761aa037d79212e628cd51 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Wed, 31 Oct 2018 15:47:38 +0100 Subject: [PATCH 75/77] Include new SIMD's header --- src/fec_base.h | 2 +- src/fec_vectorisation.cpp | 1 - src/fft_2n.cpp | 2 +- src/gf_nf4.cpp | 2 +- src/gf_ring.cpp | 2 +- 5 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/fec_base.h b/src/fec_base.h index 741d480f..0a417155 100644 --- a/src/fec_base.h +++ b/src/fec_base.h @@ -51,7 +51,7 @@ #ifdef QUADIRON_USE_SIMD -#include "simd.h" +#include "simd/simd.h" #endif // #ifdef QUADIRON_USE_SIMD diff --git 
a/src/fec_vectorisation.cpp b/src/fec_vectorisation.cpp index ed82fab8..3900fd6d 100644 --- a/src/fec_vectorisation.cpp +++ b/src/fec_vectorisation.cpp @@ -37,7 +37,6 @@ #ifdef QUADIRON_USE_SIMD -#include "simd.h" #include "simd/simd.h" namespace quadiron { diff --git a/src/fft_2n.cpp b/src/fft_2n.cpp index f7d91468..6cc1f181 100644 --- a/src/fft_2n.cpp +++ b/src/fft_2n.cpp @@ -37,7 +37,7 @@ #ifdef QUADIRON_USE_SIMD -#include "simd.h" +#include "simd/simd.h" namespace quadiron { namespace fft { diff --git a/src/gf_nf4.cpp b/src/gf_nf4.cpp index 9e7fa4dc..ecbf31b7 100644 --- a/src/gf_nf4.cpp +++ b/src/gf_nf4.cpp @@ -32,7 +32,7 @@ #ifdef QUADIRON_USE_SIMD -#include "simd.h" +#include "simd/simd.h" namespace quadiron { namespace gf { diff --git a/src/gf_ring.cpp b/src/gf_ring.cpp index da1ed530..9120fe01 100644 --- a/src/gf_ring.cpp +++ b/src/gf_ring.cpp @@ -31,7 +31,7 @@ #include "gf_ring.h" #ifdef QUADIRON_USE_SIMD -#include "simd.h" + #include "simd/simd.h" namespace quadiron { From 942d47c48b1e61be0e81b6edd8944bef3403dfaf Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Wed, 31 Oct 2018 15:58:18 +0100 Subject: [PATCH 76/77] Include right headers for simd tests --- test/simd/test_allocator.cpp | 2 +- test/simd/test_definitions.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/simd/test_allocator.cpp b/test/simd/test_allocator.cpp index a1d59034..09edb70f 100644 --- a/test/simd/test_allocator.cpp +++ b/test/simd/test_allocator.cpp @@ -32,7 +32,7 @@ #include -#include "simd/simd.h" +#include "simd/allocator.h" namespace simd = quadiron::simd; diff --git a/test/simd/test_definitions.cpp b/test/simd/test_definitions.cpp index c7a48975..bde45d05 100644 --- a/test/simd/test_definitions.cpp +++ b/test/simd/test_definitions.cpp @@ -29,7 +29,7 @@ */ #include -#include "simd/simd.h" +#include "simd/definitions.h" namespace simd = quadiron::simd; From 880881491c3d31b7d7db93420590dc662953b77c Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Wed, 31 
Oct 2018 15:58:55 +0100 Subject: [PATCH 77/77] SIMD: include headers for simd tests --- src/simd/simd.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/simd/simd.h b/src/simd/simd.h index ad02f3fc..372931cc 100644 --- a/src/simd/simd.h +++ b/src/simd/simd.h @@ -31,7 +31,9 @@ #ifndef __QUAD_SIMD_SIMD_H__ #define __QUAD_SIMD_SIMD_H__ +#include "core.h" #include "property.h" +#include "vec_buffers.h" #include "simd/allocator.h" #include "simd/definitions.h"