diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index aff9c341..3ff590af 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -31,6 +31,7 @@ include(GNUInstallDirs) set(LIB_SRC ${SOURCE_DIR}/core.cpp ${SOURCE_DIR}/fec_vectorisation.cpp + ${SOURCE_DIR}/fft_2n.cpp ${SOURCE_DIR}/misc.cpp ${SOURCE_DIR}/gf_nf4.cpp ${SOURCE_DIR}/gf_ring.cpp diff --git a/src/arith.h b/src/arith.h index 1da85320..9b230677 100644 --- a/src/arith.h +++ b/src/arith.h @@ -41,12 +41,6 @@ namespace quadiron { -template -using DoubleSizeVal = typename DoubleSize::T; - -template -using SignedDoubleSizeVal = typename SignedDoubleSize::T; - /** Base/core arithmetical functions of QuadIron. */ namespace arith { diff --git a/src/core.h b/src/core.h index 5eaf84fe..a9033f90 100644 --- a/src/core.h +++ b/src/core.h @@ -34,7 +34,7 @@ #include #include "big_int.h" -#include "simd/simd.h" +#include "simd/allocator.h" namespace quadiron { @@ -78,6 +78,12 @@ struct SignedDoubleSize<__uint128_t> { typedef Int256 T; }; +template +using DoubleSizeVal = typename DoubleSize::T; + +template +using SignedDoubleSizeVal = typename SignedDoubleSize::T; + /** A group of values stored as one. * * This allows faster processing, as the values can be processed as one. 
diff --git a/src/fec_base.h b/src/fec_base.h index 13ac78d9..0a417155 100644 --- a/src/fec_base.h +++ b/src/fec_base.h @@ -51,7 +51,7 @@ #ifdef QUADIRON_USE_SIMD -#include "simd.h" +#include "simd/simd.h" #endif // #ifdef QUADIRON_USE_SIMD @@ -74,8 +74,6 @@ static inline uint64_t hrtime_usec(timeval begin) return 1000000 * (tv.tv_sec - begin.tv_sec) + tv.tv_usec - begin.tv_usec; } -#define OOR_MARK 1 - enum class FecType { /** Systematic code * diff --git a/src/fec_rs_fnt.h b/src/fec_rs_fnt.h index 4bdd5475..55ffb487 100644 --- a/src/fec_rs_fnt.h +++ b/src/fec_rs_fnt.h @@ -60,6 +60,11 @@ class RsFnt : public FecCode { // decoding context used in encoding of systematic FNT std::unique_ptr> enc_context; + // Indices used for accelerated functions + size_t simd_vec_len; + size_t simd_trailing_len; + size_t simd_offset; + public: RsFnt( FecType type, @@ -70,6 +75,12 @@ class RsFnt : public FecCode { : FecCode(type, word_size, n_data, n_parities, pkt_size) { this->fec_init(); + + // Indices used for accelerated functions + const unsigned ratio = simd::countof(); + simd_vec_len = this->pkt_size / ratio; + simd_trailing_len = this->pkt_size - simd_vec_len * ratio; + simd_offset = simd_vec_len * ratio; } inline void check_params() override diff --git a/src/fec_vectorisation.cpp b/src/fec_vectorisation.cpp index 2564ed7d..3900fd6d 100644 --- a/src/fec_vectorisation.cpp +++ b/src/fec_vectorisation.cpp @@ -32,12 +32,11 @@ #include "fec_rs_fnt.h" /* - * The file includes vectorized operations used by FEC classes + * The file includes specialized operations used by FEC classes */ #ifdef QUADIRON_USE_SIMD -#include "simd.h" #include "simd/simd.h" namespace quadiron { @@ -53,20 +52,13 @@ void RsFnt::encode_post_process( uint16_t threshold = this->gf->card_minus_one(); unsigned code_len = this->n_outputs; - // number of elements per vector register - unsigned vec_size = simd::countof(); - // number of vector registers per fragment packet - size_t vecs_nb = size / vec_size; - // 
odd number of elements not vectorized - size_t last_len = size - vecs_nb * vec_size; - simd::encode_post_process( - output, props, offset, code_len, threshold, vecs_nb); + output, props, offset, code_len, threshold, simd_vec_len); - if (last_len > 0) { + if (simd_trailing_len > 0) { for (unsigned i = 0; i < code_len; ++i) { uint16_t* chunk = output.get(i); - for (size_t j = vecs_nb * vec_size; j < size; ++j) { + for (size_t j = simd_offset; j < size; ++j) { if (chunk[j] == threshold) { props[i].add(offset + j, OOR_MARK); } @@ -85,20 +77,13 @@ void RsFnt::encode_post_process( const uint32_t threshold = this->gf->card_minus_one(); const unsigned code_len = this->n_outputs; - // number of elements per vector register - const unsigned vec_size = simd::countof(); - // number of vector registers per fragment packet - const size_t vecs_nb = size / vec_size; - // odd number of elements not vectorized - const size_t last_len = size - vecs_nb * vec_size; - simd::encode_post_process( - output, props, offset, code_len, threshold, vecs_nb); + output, props, offset, code_len, threshold, simd_vec_len); - if (last_len > 0) { + if (simd_trailing_len > 0) { for (unsigned i = 0; i < code_len; ++i) { uint32_t* chunk = output.get(i); - for (size_t j = vecs_nb * vec_size; j < size; ++j) { + for (size_t j = simd_offset; j < size; ++j) { if (chunk[j] == threshold) { props[i].add(offset + j, OOR_MARK); } diff --git a/src/fft_2n.cpp b/src/fft_2n.cpp new file mode 100644 index 00000000..6cc1f181 --- /dev/null +++ b/src/fft_2n.cpp @@ -0,0 +1,192 @@ +/* -*- mode: c++ -*- */ +/* + * Copyright 2017-2018 Scality + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "fft_2n.h" + +/* + * The file includes vectorized operations used by Radix2 classes + */ + +#ifdef QUADIRON_USE_SIMD + +#include "simd/simd.h" + +namespace quadiron { +namespace fft { + +template <> +void Radix2::butterfly_ct_two_layers_step( + vec::Buffers& buf, + unsigned start, + unsigned m) +{ + const unsigned coefIndex = start * this->n / m / 2; + const uint16_t r1 = vec_W[coefIndex]; + const uint16_t r2 = vec_W[coefIndex / 2]; + const uint16_t r3 = vec_W[coefIndex / 2 + this->n / 4]; + + // perform vector operations + simd::butterfly_ct_two_layers_step( + buf, r1, r2, r3, start, m, simd_vec_len, card); + + // for last elements, perform as non-SIMD method + if (simd_trailing_len > 0) { + butterfly_ct_two_layers_step_slow(buf, start, m, simd_offset); + } +} + +template <> +void Radix2::butterfly_ct_step( + vec::Buffers& buf, + uint16_t r, + unsigned start, + unsigned m, + unsigned step) +{ + // perform vector operations + simd::butterfly_ct_step(buf, r, start, m, step, simd_vec_len, card); + + // for last elements, perform as non-SIMD method + if (simd_trailing_len > 0) { + butterfly_ct_step_slow(buf, r, start, m, step, simd_offset); + } +} + +template <> +void Radix2::butterfly_gs_step( + vec::Buffers& buf, + uint16_t coef, + unsigned start, + unsigned m, + unsigned step) +{ + // perform vector operations + simd::butterfly_gs_step(buf, coef, start, m, simd_vec_len, card); + + // for last elements, perform as non-SIMD method + if (simd_trailing_len > 0) { + butterfly_gs_step_slow(buf, coef, start, m, step, simd_offset); + } +} + +template <> +void Radix2::butterfly_gs_step_simple( + vec::Buffers& buf, + uint16_t coef, + unsigned start, + unsigned m, + unsigned step) +{ + // perform vector operations + simd::butterfly_gs_step_simple(buf, coef, start, m, simd_vec_len, card); + + // for last elements, perform as non-SIMD method + if (simd_trailing_len > 0) { + butterfly_gs_step_simple_slow(buf, coef, start, m, step, simd_offset); + } +} + 
+template <> +void Radix2::butterfly_ct_two_layers_step( + vec::Buffers& buf, + unsigned start, + unsigned m) +{ + const unsigned coefIndex = start * this->n / m / 2; + const uint32_t r1 = vec_W[coefIndex]; + const uint32_t r2 = vec_W[coefIndex / 2]; + const uint32_t r3 = vec_W[coefIndex / 2 + this->n / 4]; + + // perform vector operations + simd::butterfly_ct_two_layers_step( + buf, r1, r2, r3, start, m, simd_vec_len, card); + + // for last elements, perform as non-SIMD method + if (simd_trailing_len > 0) { + butterfly_ct_two_layers_step_slow(buf, start, m, simd_offset); + } +} + +template <> +void Radix2::butterfly_ct_step( + vec::Buffers& buf, + uint32_t r, + unsigned start, + unsigned m, + unsigned step) +{ + // perform vector operations + simd::butterfly_ct_step(buf, r, start, m, step, simd_vec_len, card); + + // for last elements, perform as non-SIMD method + if (simd_trailing_len > 0) { + butterfly_ct_step_slow(buf, r, start, m, step, simd_offset); + } +} + +template <> +void Radix2::butterfly_gs_step( + vec::Buffers& buf, + uint32_t coef, + unsigned start, + unsigned m, + unsigned step) +{ + // perform vector operations + simd::butterfly_gs_step(buf, coef, start, m, simd_vec_len, card); + + // for last elements, perform as non-SIMD method + if (simd_trailing_len > 0) { + butterfly_gs_step_slow(buf, coef, start, m, step, simd_offset); + } +} + +template <> +void Radix2::butterfly_gs_step_simple( + vec::Buffers& buf, + uint32_t coef, + unsigned start, + unsigned m, + unsigned step) +{ + // perform vector operations + simd::butterfly_gs_step_simple(buf, coef, start, m, simd_vec_len, card); + + // for last elements, perform as non-SIMD method + if (simd_trailing_len > 0) { + butterfly_gs_step_simple_slow(buf, coef, start, m, step, simd_offset); + } +} + +} // namespace fft +} // namespace quadiron + +#endif // #ifdef QUADIRON_USE_SIMD diff --git a/src/fft_2n.h b/src/fft_2n.h index f3da7174..4af68528 100644 --- a/src/fft_2n.h +++ b/src/fft_2n.h @@ -112,6 +112,11 
@@ class Radix2 : public FourierTransform { unsigned step); // Only used for non-vectorized elements + void butterfly_ct_two_layers_step_slow( + vec::Buffers& buf, + unsigned start, + unsigned m, + size_t offset = 0); void butterfly_ct_step_slow( vec::Buffers& buf, T coef, @@ -142,6 +147,11 @@ class Radix2 : public FourierTransform { size_t pkt_size; size_t buf_size; + // Indices used for accelerated functions + size_t simd_vec_len; + size_t simd_trailing_len; + size_t simd_offset; + std::unique_ptr rev = nullptr; std::unique_ptr> W = nullptr; std::unique_ptr> inv_W = nullptr; @@ -182,6 +192,12 @@ Radix2::Radix2(const gf::Field& gf, int n, int data_len, size_t pkt_size) rev = std::unique_ptr(new T[n]); init_bitrev(); + + // Indices used for accelerated functions + const unsigned ratio = simd::countof(); + simd_vec_len = this->pkt_size / ratio; + simd_trailing_len = this->pkt_size - simd_vec_len * ratio; + simd_offset = simd_vec_len * ratio; } template @@ -421,6 +437,16 @@ void Radix2::butterfly_ct_two_layers_step( vec::Buffers& buf, unsigned start, unsigned m) +{ + butterfly_ct_two_layers_step_slow(buf, start, m); +} + +template +void Radix2::butterfly_ct_two_layers_step_slow( + vec::Buffers& buf, + unsigned start, + unsigned m, + size_t offset) { const unsigned step = m << 2; // --------- @@ -428,18 +454,18 @@ void Radix2::butterfly_ct_two_layers_step( // --------- const T r1 = W->get(start * this->n / m / 2); // first pair - butterfly_ct_step(buf, r1, start, m, step); + butterfly_ct_step_slow(buf, r1, start, m, step, offset); // second pair - butterfly_ct_step(buf, r1, start + 2 * m, m, step); + butterfly_ct_step_slow(buf, r1, start + 2 * m, m, step, offset); // --------- // Second layer // --------- // first pair const T r2 = W->get(start * this->n / m / 4); - butterfly_ct_step(buf, r2, start, 2 * m, step); + butterfly_ct_step_slow(buf, r2, start, 2 * m, step, offset); // second pair const T r3 = W->get((start + m) * this->n / m / 4); - butterfly_ct_step(buf, 
r3, start + m, 2 * m, step); + butterfly_ct_step_slow(buf, r3, start + m, 2 * m, step, offset); } template @@ -602,6 +628,65 @@ void Radix2::ifft(vec::Buffers& output, vec::Buffers& input) this->gf->mul_vec_to_vecp(*(this->vec_inv_n), output, output); } +#ifdef QUADIRON_USE_SIMD + +/* Operations are vectorized by SIMD */ +template <> +void Radix2::butterfly_ct_two_layers_step( + vec::Buffers& buf, + unsigned start, + unsigned m); +template <> +void Radix2::butterfly_ct_step( + vec::Buffers& buf, + uint16_t r, + unsigned start, + unsigned m, + unsigned step); +template <> +void Radix2::butterfly_gs_step( + vec::Buffers& buf, + uint16_t coef, + unsigned start, + unsigned m, + unsigned step); +template <> +void Radix2::butterfly_gs_step_simple( + vec::Buffers& buf, + uint16_t coef, + unsigned start, + unsigned m, + unsigned step); + +template <> +void Radix2::butterfly_ct_two_layers_step( + vec::Buffers& buf, + unsigned start, + unsigned m); +template <> +void Radix2::butterfly_ct_step( + vec::Buffers& buf, + uint32_t r, + unsigned start, + unsigned m, + unsigned step); +template <> +void Radix2::butterfly_gs_step( + vec::Buffers& buf, + uint32_t coef, + unsigned start, + unsigned m, + unsigned step); +template <> +void Radix2::butterfly_gs_step_simple( + vec::Buffers& buf, + uint32_t coef, + unsigned start, + unsigned m, + unsigned step); + +#endif // #ifdef QUADIRON_USE_SIMD + } // namespace fft } // namespace quadiron diff --git a/src/gf_nf4.cpp b/src/gf_nf4.cpp index 9e7fa4dc..ecbf31b7 100644 --- a/src/gf_nf4.cpp +++ b/src/gf_nf4.cpp @@ -32,7 +32,7 @@ #ifdef QUADIRON_USE_SIMD -#include "simd.h" +#include "simd/simd.h" namespace quadiron { namespace gf { diff --git a/src/gf_ring.cpp b/src/gf_ring.cpp index da1ed530..9120fe01 100644 --- a/src/gf_ring.cpp +++ b/src/gf_ring.cpp @@ -31,7 +31,7 @@ #include "gf_ring.h" #ifdef QUADIRON_USE_SIMD -#include "simd.h" + #include "simd/simd.h" namespace quadiron { diff --git a/src/property.h b/src/property.h index 
766acea7..2ecdc65e 100644 --- a/src/property.h +++ b/src/property.h @@ -40,6 +40,8 @@ namespace quadiron { +#define OOR_MARK 1 + /** Ancillary data attached to values. * * A property carries extra-information (whose interpretation is left to the diff --git a/src/simd.h b/src/simd.h deleted file mode 100644 index 8309bfff..00000000 --- a/src/simd.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright 2017-2018 Scality - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ - -#ifndef __QUAD_SIMD_H__ -#define __QUAD_SIMD_H__ - -#ifdef QUADIRON_USE_SIMD - -#include "property.h" -#include "simd/simd.h" - -const unsigned F4 = 65537; -const unsigned F3 = 257; - -typedef uint8_t aint8 __attribute__((aligned(quadiron::simd::ALIGNMENT))); -typedef uint16_t aint16 __attribute__((aligned(quadiron::simd::ALIGNMENT))); -typedef uint32_t aint32 __attribute__((aligned(quadiron::simd::ALIGNMENT))); -typedef uint64_t aint64 __attribute__((aligned(quadiron::simd::ALIGNMENT))); -typedef __uint128_t aint128 __attribute__((aligned(quadiron::simd::ALIGNMENT))); - -namespace quadiron { -/** The namespace simd contains functions for GF-NF4 that are accelerated by - * using SIMD operations over 128bits - * - * It supports operations on 32-bit numbers - */ -namespace simd { - -// Vectorized operations are implemented in appropriated headers simd*.h - -} // namespace simd -} // namespace quadiron - -#if defined(__AVX2__) -#include "simd_256.h" -#elif defined(__SSE4_1__) -#include "simd_128.h" -#endif - -#include "simd_nf4.h" - -#endif // #ifdef QUADIRON_USE_SIMD - -#endif diff --git a/src/simd/simd.h b/src/simd/simd.h index 9cdcd251..372931cc 100644 --- a/src/simd/simd.h +++ b/src/simd/simd.h @@ -31,6 +31,10 @@ #ifndef __QUAD_SIMD_SIMD_H__ #define __QUAD_SIMD_SIMD_H__ +#include "core.h" +#include "property.h" +#include "vec_buffers.h" + #include "simd/allocator.h" #include "simd/definitions.h" @@ -57,4 +61,31 @@ static constexpr std::size_t countof() } // namespace simd } // namespace quadiron +#ifdef QUADIRON_USE_SIMD + +const unsigned F4 = 65537; +const unsigned F3 = 257; + +// Include essential operations that use SIMD functions +#if defined(__AVX2__) + +#include "simd_256.h" + +#elif defined(__SSE4_1__) + +#include "simd_128.h" + +#endif + +// Include basic operations +#include "simd_basic.h" + +// Include accelerated operations dedicated for FNT +#include "simd_fnt.h" + +// Include accelerated operations dedicated for NF4 +#include "simd_nf4.h" + 
+#endif // #ifdef QUADIRON_USE_SIMD + #endif diff --git a/src/simd/simd_128.h b/src/simd/simd_128.h new file mode 100644 index 00000000..bb33ee4f --- /dev/null +++ b/src/simd/simd_128.h @@ -0,0 +1,174 @@ +/* + * Copyright 2017-2018 Scality + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __QUAD_SIMD_128_H__ +#define __QUAD_SIMD_128_H__ + +namespace quadiron { +namespace simd { + +typedef __m128i VecType; + +/* ============= Constant variable ============ */ + +#define F4_U32 _mm_set1_epi32(65537) +#define F4_MINUS_ONE_U32 _mm_set1_epi32(65536) +#define F3_U32 _mm_set1_epi32(257) +#define F3_MINUS_ONE_U32 _mm_set1_epi32(256) + +#define F3_U16 _mm_set1_epi16(257) +#define F3_MINUS_ONE_U16 _mm_set1_epi16(256) + +#define ZERO (_mm_setzero_si128()) +#define ONE_U16 (_mm_set1_epi16(1)) +#define ONE_U32 (_mm_set1_epi32(1)) + +#define MASK8_LO (_mm_set1_epi16(0x80)) + +/* ============= Essential Operations for SSE w/ both u16 & u32 ============ */ + +inline VecType LoadToReg(VecType* address) +{ + return _mm_load_si128(address); +} +inline void StoreToMem(VecType* address, VecType reg) +{ + _mm_store_si128(address, reg); +} + +inline VecType And(VecType x, VecType y) +{ + return _mm_and_si128(x, y); +} +inline VecType Xor(VecType x, VecType y) +{ + return _mm_xor_si128(x, y); +} +inline uint16_t Msb8Mask(VecType x) +{ + return _mm_movemask_epi8(x); +} +inline uint16_t AndIsZero(VecType x, VecType y) +{ + return _mm_testz_si128(x, y); +} +inline int IsZero(VecType x) +{ + return _mm_testc_si128(ZERO, x); +} + +#define SHIFTR(x, imm8) (_mm_srli_si128(x, imm8)) +#define BLEND8(x, y, mask) (_mm_blendv_epi8(x, y, mask)) +#define BLEND16(x, y, imm8) (_mm_blend_epi16(x, y, imm8)) + +/* ================= Essential Operations for SSE ================= */ + +template +inline VecType SetOne(T val); +template <> +inline VecType SetOne(uint32_t val) +{ + return _mm_set1_epi32(val); +} +template <> +inline VecType SetOne(uint16_t val) +{ + return _mm_set1_epi16(val); +} + +template +inline VecType Add(VecType x, VecType y); +template <> +inline VecType Add(VecType x, VecType y) +{ + return _mm_add_epi32(x, y); +} +template <> +inline VecType Add(VecType x, VecType y) +{ + return _mm_add_epi16(x, y); +} + +template +inline VecType Sub(VecType x, VecType 
y); +template <> +inline VecType Sub(VecType x, VecType y) +{ + return _mm_sub_epi32(x, y); +} +template <> +inline VecType Sub(VecType x, VecType y) +{ + return _mm_sub_epi16(x, y); +} + +template +inline VecType Mul(VecType x, VecType y); +template <> +inline VecType Mul(VecType x, VecType y) +{ + return _mm_mullo_epi32(x, y); +} +template <> +inline VecType Mul(VecType x, VecType y) +{ + return _mm_mullo_epi16(x, y); +} + +template +inline VecType CompareEq(VecType x, VecType y); +template <> +inline VecType CompareEq(VecType x, VecType y) +{ + return _mm_cmpeq_epi32(x, y); +} +template <> +inline VecType CompareEq(VecType x, VecType y) +{ + return _mm_cmpeq_epi16(x, y); +} + +template +inline VecType Min(VecType x, VecType y); +template <> +inline VecType Min(VecType x, VecType y) +{ + return _mm_min_epu32(x, y); +} +template <> +inline VecType Min(VecType x, VecType y) +{ + return _mm_min_epu16(x, y); +} + +} // namespace simd +} // namespace quadiron + +#endif diff --git a/src/simd/simd_256.h b/src/simd/simd_256.h new file mode 100644 index 00000000..0723e80f --- /dev/null +++ b/src/simd/simd_256.h @@ -0,0 +1,187 @@ +/* + * Copyright 2017-2018 Scality + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __QUAD_SIMD_256_H__ +#define __QUAD_SIMD_256_H__ + +/* GCC doesn't include the split store intrinsics so define them here. */ +#if defined(__GNUC__) && !defined(__clang__) + +static inline void __attribute__((__always_inline__)) +_mm256_storeu2_m128i(__m128i* const hi, __m128i* const lo, const __m256i a) +{ + _mm_storeu_si128(lo, _mm256_castsi256_si128(a)); + _mm_storeu_si128(hi, _mm256_extracti128_si256(a, 1)); +} + +#endif /* defined(__GNUC__) */ + +namespace quadiron { +namespace simd { + +typedef __m256i VecType; +typedef __m128i HalfVecType; + +/* ============= Constant variable ============ */ + +#define F4_U32 _mm256_set1_epi32(65537) +#define F4_MINUS_ONE_U32 _mm256_set1_epi32(65536) +#define F3_U32 _mm256_set1_epi32(257) +#define F3_MINUS_ONE_U32 _mm256_set1_epi32(256) + +#define F3_U16 _mm256_set1_epi16(257) +#define F3_MINUS_ONE_U16 _mm256_set1_epi16(256) + +#define ZERO (_mm256_setzero_si256()) +#define ONE_U16 (_mm256_set1_epi16(1)) +#define ONE_U32 (_mm256_set1_epi32(1)) + +#define MASK8_LO (_mm256_set1_epi16(0x80)) + +/* ============= Essential Operations for AVX2 w/ both u16 & u32 ============ */ + +inline VecType LoadToReg(VecType* address) +{ 
+ return _mm256_load_si256(address); +} +inline void StoreToMem(VecType* address, VecType reg) +{ + _mm256_store_si256(address, reg); +} + +inline VecType And(VecType x, VecType y) +{ + return _mm256_and_si256(x, y); +} +inline VecType Xor(VecType x, VecType y) +{ + return _mm256_xor_si256(x, y); +} +inline uint32_t Msb8Mask(VecType x) +{ + return _mm256_movemask_epi8(x); +} +inline uint32_t AndIsZero(VecType x, VecType y) +{ + return _mm256_testz_si256(x, y); +} +inline int IsZero(VecType x) +{ + return _mm256_testc_si256(ZERO, x); +} + +#define SHIFTR(x, imm8) (_mm256_srli_si256(x, imm8)) +#define BLEND8(x, y, mask) (_mm256_blendv_epi8(x, y, mask)) +#define BLEND16(x, y, imm8) (_mm256_blend_epi16(x, y, imm8)) + +/* ================= Essential Operations for AVX2 ================= */ + +template +inline VecType SetOne(T val); +template <> +inline VecType SetOne(uint32_t val) +{ + return _mm256_set1_epi32(val); +} +template <> +inline VecType SetOne(uint16_t val) +{ + return _mm256_set1_epi16(val); +} + +template +inline VecType Add(VecType x, VecType y); +template <> +inline VecType Add(VecType x, VecType y) +{ + return _mm256_add_epi32(x, y); +} +template <> +inline VecType Add(VecType x, VecType y) +{ + return _mm256_add_epi16(x, y); +} + +template +inline VecType Sub(VecType x, VecType y); +template <> +inline VecType Sub(VecType x, VecType y) +{ + return _mm256_sub_epi32(x, y); +} +template <> +inline VecType Sub(VecType x, VecType y) +{ + return _mm256_sub_epi16(x, y); +} + +template +inline VecType Mul(VecType x, VecType y); +template <> +inline VecType Mul(VecType x, VecType y) +{ + return _mm256_mullo_epi32(x, y); +} +template <> +inline VecType Mul(VecType x, VecType y) +{ + return _mm256_mullo_epi16(x, y); +} + +template +inline VecType CompareEq(VecType x, VecType y); +template <> +inline VecType CompareEq(VecType x, VecType y) +{ + return _mm256_cmpeq_epi32(x, y); +} +template <> +inline VecType CompareEq(VecType x, VecType y) +{ + return 
_mm256_cmpeq_epi16(x, y); +} + +template +inline VecType Min(VecType x, VecType y); +template <> +inline VecType Min(VecType x, VecType y) +{ + return _mm256_min_epu32(x, y); +} +template <> +inline VecType Min(VecType x, VecType y) +{ + return _mm256_min_epu16(x, y); +} + +} // namespace simd +} // namespace quadiron + +#endif diff --git a/src/simd/simd_basic.h b/src/simd/simd_basic.h new file mode 100644 index 00000000..ab2301ad --- /dev/null +++ b/src/simd/simd_basic.h @@ -0,0 +1,331 @@ +/* + * Copyright 2017-2018 Scality + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __QUAD_SIMD_BASIC_H__ +#define __QUAD_SIMD_BASIC_H__ + +namespace quadiron { +namespace simd { + +template +inline VecType Card(T q); +template <> +inline VecType Card(uint16_t q) +{ + return F3_U16; +} +template <> +inline VecType Card(uint32_t q) +{ + return (q == F3) ? F3_U32 : F4_U32; +} + +template +inline VecType CardMinusOne(T q); +template <> +inline VecType CardMinusOne(uint16_t q) +{ + return F3_MINUS_ONE_U16; +} +template <> +inline VecType CardMinusOne(uint32_t q) +{ + return (q == F3) ? F3_MINUS_ONE_U32 : F4_MINUS_ONE_U32; +} + +template +inline VecType GetLowHalf(VecType x, T q) +{ + return (q == F3) ? BLEND8(ZERO, x, MASK8_LO) : BLEND16(ZERO, x, 0x55); +} + +template +inline VecType GetHighHalf(VecType x, T q) +{ + return (q == F3) ? 
BLEND8(ZERO, SHIFTR(x, 1), MASK8_LO) + : BLEND16(ZERO, SHIFTR(x, 2), 0x55); +} + +/* ================= Basic Operations ================= */ + +/** + * Modular addition + * + * @param x input register + * @param y input register + * @param q modulo + * @return (x + y) mod q + */ +template +inline VecType ModAdd(VecType x, VecType y, T q) +{ + const VecType res = Add(x, y); + return Min(res, Sub(res, Card(q))); +} + +/** + * Modular subtraction for packed unsigned 32-bit integers + * + * @param x input register + * @param y input register + * @param q modulo + * @return (x - y) mod q + */ +template +inline VecType ModSub(VecType x, VecType y, T q) +{ + const VecType res = Sub(x, y); + return Min(res, Add(res, Card(q))); +} + +/** + * Modular negation for packed unsigned 32-bit integers + * + * @param x input register + * @param q modulo + * @return (-x) mod q + */ +template +inline VecType ModNeg(VecType x, T q) +{ + const VecType res = Sub(Card(q), x); + return Min(res, Sub(res, Card(q))); +} + +/** + * Modular multiplication for packed unsigned 32-bit integers + * + * @note We assume that at least `x` or `y` is less than `q-1` so it's + * not necessary to verify overflow on multiplying elements + * + * @param x input register + * @param y input register + * @param q modulo + * @return (x * y) mod q + */ +template +inline VecType ModMul(VecType x, VecType y, T q) +{ + const VecType res = Mul(x, y); + const VecType lo = GetLowHalf(res, q); + const VecType hi = GetHighHalf(res, q); + return ModSub(lo, hi, q); +} + +/** + * Modular general multiplication for packed unsigned 32-bit integers + * + * @note It's necessary to verify overflow on multiplying elements + * + * @param x input register + * @param y input register + * @param q modulo + * @return (x * y) mod q + */ +template +inline VecType ModMulSafe(VecType x, VecType y, T q) +{ + const VecType res = ModMul(x, y, q); + + // filter elements of both of a & b = card-1 + const VecType cmp = + And(CompareEq(x, 
CardMinusOne(q)), CompareEq(y, CardMinusOne(q))); + + if (IsZero(cmp) == 1) { + return res; + } + return (q == F3) ? Xor(res, And(F4_U32, cmp)) + : Add(res, And(ONE_U32, cmp)); +} + +/** + * Update property for a given register for packed unsigned 32-bit integers + * + * @param props properties bound to fragments + * @param threshold register storing max value in its elements + * @param mask a specific mask + * @param symb input register + * @param offset offset in the data fragments + * @param max a dummy variable + */ +template +inline void AddProps( + Properties& props, + VecType threshold, + VecType mask, + VecType symb, + off_t offset, + T max) +{ + const VecType b = CompareEq(threshold, symb); + const VecType c = And(mask, b); + auto d = Msb8Mask(c); + const unsigned element_size = sizeof(T); + while (d > 0) { + const unsigned byte_idx = __builtin_ctz(d); + const size_t _offset = offset + byte_idx / element_size; + props.add(_offset, OOR_MARK); + d ^= 1 << byte_idx; + } +} + +/* ==================== Operations for RingModN =================== */ +/** Perform a multiplication of a coefficient `a` to each element of `src` and + * add result to correspondent element of `dest` + * + * @note: 1 < `a` < card - 1 + */ +template +inline void mul_coef_to_buf(const T a, T* src, T* dest, size_t len, T card) +{ + const VecType coef = SetOne(a); + + VecType* __restrict _src = reinterpret_cast(src); + VecType* __restrict _dest = reinterpret_cast(dest); + const unsigned ratio = sizeof(*_src) / sizeof(*src); + const size_t _len = len / ratio; + const size_t _last_len = len - _len * ratio; + + size_t i = 0; + const size_t end = (_len > 3) ? 
_len - 3 : 0; + for (; i < end; i += 4) { + _dest[i] = ModMul(coef, _src[i], card); + _dest[i + 1] = ModMul(coef, _src[i + 1], card); + _dest[i + 2] = ModMul(coef, _src[i + 2], card); + _dest[i + 3] = ModMul(coef, _src[i + 3], card); + } + for (; i < _len; ++i) { + _dest[i] = ModMul(coef, _src[i], card); + } + + if (_last_len > 0) { + const DoubleSizeVal coef_double = DoubleSizeVal(a); + for (size_t i = _len * ratio; i < len; i++) { + dest[i] = (T)((coef_double * src[i]) % card); + } + } +} + +template +inline void add_two_bufs(T* src, T* dest, size_t len, T card) +{ + VecType* __restrict _src = reinterpret_cast(src); + VecType* __restrict _dest = reinterpret_cast(dest); + const unsigned ratio = sizeof(*_src) / sizeof(*src); + const size_t _len = len / ratio; + const size_t _last_len = len - _len * ratio; + + size_t i; + for (i = 0; i < _len; i++) { + _dest[i] = ModAdd(_src[i], _dest[i], card); + } + if (_last_len > 0) { + for (i = _len * ratio; i < len; i++) { + const T tmp = src[i] + dest[i]; + dest[i] = (tmp >= card) ? 
(tmp - card) : tmp; + } + } +} + +template +inline void sub_two_bufs(T* bufa, T* bufb, T* res, size_t len, T card) +{ + VecType* __restrict _bufa = reinterpret_cast(bufa); + VecType* __restrict _bufb = reinterpret_cast(bufb); + VecType* __restrict _res = reinterpret_cast(res); + const unsigned ratio = sizeof(*_bufa) / sizeof(*bufa); + const size_t _len = len / ratio; + const size_t _last_len = len - _len * ratio; + + size_t i; + for (i = 0; i < _len; i++) { + // perform subtraction + _res[i] = ModSub(_bufa[i], _bufb[i], card); + } + if (_last_len > 0) { + for (i = _len * ratio; i < len; i++) { + // perform subtraction + if (bufa[i] >= bufb[i]) { + res[i] = bufa[i] - bufb[i]; + } else { + res[i] = card - (bufb[i] - bufa[i]); + } + } + } +} + +template +inline void mul_two_bufs(T* src, T* dest, size_t len, T card) +{ + VecType* __restrict _src = reinterpret_cast(src); + VecType* __restrict _dest = reinterpret_cast(dest); + const unsigned ratio = sizeof(*_src) / sizeof(*src); + const size_t _len = len / ratio; + const size_t _last_len = len - _len * ratio; + + size_t i; + for (i = 0; i < _len; i++) { + // perform multiplication + _dest[i] = ModMulSafe(_src[i], _dest[i], card); + } + if (_last_len > 0) { + for (i = _len * ratio; i < len; i++) { + // perform multiplication + dest[i] = T((DoubleSizeVal(src[i]) * dest[i]) % card); + } + } +} + +/** Apply an element-wise negation to a buffer + */ +template +inline void neg(size_t len, T* buf, T card) +{ + VecType* _buf = reinterpret_cast(buf); + const unsigned ratio = sizeof(*_buf) / sizeof(*buf); + const size_t _len = len / ratio; + const size_t _last_len = len - _len * ratio; + + size_t i; + for (i = 0; i < _len; i++) { + _buf[i] = ModNeg(_buf[i], card); + } + if (_last_len > 0) { + for (i = _len * ratio; i < len; i++) { + if (buf[i]) + buf[i] = card - buf[i]; + } + } +} + +} // namespace simd +} // namespace quadiron + +#endif diff --git a/src/simd/simd_fnt.h b/src/simd/simd_fnt.h new file mode 100644 index
00000000..97467050 --- /dev/null +++ b/src/simd/simd_fnt.h @@ -0,0 +1,528 @@ +/* + * Copyright 2017-2018 Scality + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __QUAD_SIMD_FNT_H__ +#define __QUAD_SIMD_FNT_H__ + +namespace quadiron { +namespace simd { + +/* ================= Vectorized Operations ================= */ + +/** + * Butterfly Cooley-Tukey operation + * + * x <- x + r * y + * y <- x - r * y + * + * @param rp1 coefficient `r` plus one + * @param c a register stores coefficient `r` + * @param x working register + * @param y working register + * @param q modular + */ +template +inline void ButterflyCT(T rp1, VecType c, VecType* x, VecType* y, T q) +{ + VecType z = (rp1 == 2) ? *y : ModMul(c, *y, q); + if (rp1 < q) { + *y = ModSub(*x, z, q); + *x = ModAdd(*x, z, q); + } else { // i.e. r == q - 1 + *y = ModAdd(*x, z, q); + *x = ModSub(*x, z, q); + } +} + +/** + * Butterfly Gentleman-Sande operation + * + * x <- x + y + * y <- r * (x - y) + * + * @param rp1 coefficient `r` plus one + * @param c a register stores coefficient `r` + * @param x working register + * @param y working register + * @param q modular + */ +template +inline void ButterflyGS(T rp1, VecType c, VecType* x, VecType* y, T q) +{ + VecType add = ModAdd(*x, *y, q); + if (rp1 == 2) { + *y = ModSub(*x, *y, q); + } else if (rp1 < q) { + VecType sub = ModSub(*x, *y, q); + *y = ModMul(c, sub, q); + } else { // i.e. r == q - 1 + *y = ModSub(*y, *x, q); + } + *x = add; +} + +/** + * Butterfly Gentleman-Sande simple operation where y = 0 + * + * x <- x, i.e.
no operation + * y <- r * x + * + * @param rp1 coefficient `r` plus one + * @param c a register stores coefficient `r` + * @param x working register + * @param q modular + * @return r * x + */ +template +inline VecType ButterflySimpleGS(T rp1, VecType c, VecType x, T q) +{ + if (rp1 == 2) { + return x; + } else if (rp1 < q) { + return ModMul(c, x, q); + } else { + return ModNeg(x, q); + } +} + +/** + * Vectorized butterfly CT step + * + * For each pair (P, Q) = (buf[i], buf[i + m]) for step = 2 * m and coef `r` + * P = P + r * Q + * Q = P - r * Q + * + * @param buf - working buffers + * @param r - coefficient + * @param start - index of buffer among `m` ones + * @param m - current group size + * @param step - next loop + * @param len - number of vectors per buffer + * @param card - modulo cardinal + */ +template +inline void butterfly_ct_step( + vec::Buffers& buf, + T r, + unsigned start, + unsigned m, + unsigned step, + size_t len, + T card) +{ + if (len == 0) { + return; + } + const T rp1 = r + 1; + VecType c = SetOne(r); + + const size_t end = (len > 1) ? 
len - 1 : 0; + const unsigned bufs_nb = buf.get_n(); + const std::vector& mem = buf.get_mem(); + for (unsigned i = start; i < bufs_nb; i += step) { + VecType x1, y1; + VecType x2, y2; + VecType* __restrict p = reinterpret_cast(mem[i]); + VecType* __restrict q = reinterpret_cast(mem[i + m]); + + size_t j = 0; + for (; j < end; j += 2) { + x1 = LoadToReg(p + j); + y1 = LoadToReg(q + j); + + ButterflyCT(rp1, c, &x1, &y1, card); + + x2 = LoadToReg(p + j + 1); + y2 = LoadToReg(q + j + 1); + + ButterflyCT(rp1, c, &x2, &y2, card); + + // Store back to memory + StoreToMem(p + j, x1); + StoreToMem(p + j + 1, x2); + StoreToMem(q + j, y1); + StoreToMem(q + j + 1, y2); + } + for (; j < len; ++j) { + x1 = LoadToReg(p + j); + y1 = LoadToReg(q + j); + + ButterflyCT(rp1, c, &x1, &y1, card); + + // Store back to memory + StoreToMem(p + j, x1); + StoreToMem(q + j, y1); + } + } +} + +template +inline static void do_butterfly_ct_2_layers( + const std::vector& mem, + T r1, + T r2, + T r3, + unsigned start, + unsigned m, + size_t len, + T card) +{ + const T r1p1 = r1 + 1; + const T r2p1 = r2 + 1; + const T r3p1 = r3 + 1; + + VecType c1 = SetOne(r1); + VecType c2 = SetOne(r2); + VecType c3 = SetOne(r3); + + VecType* __restrict p = reinterpret_cast(mem[start]); + VecType* __restrict q = reinterpret_cast(mem[start + m]); + VecType* __restrict r = reinterpret_cast(mem[start + 2 * m]); + VecType* __restrict s = reinterpret_cast(mem[start + 3 * m]); + + size_t j = 0; + const size_t end = (len > 1) ? 
len - 1 : 0; + while (j < end) { + // First layer (c1, x, y) & (c1, u, v) + VecType x1 = LoadToReg(p); + VecType x2 = LoadToReg(p + 1); + VecType y1 = LoadToReg(q); + VecType y2 = LoadToReg(q + 1); + + ButterflyCT(r1p1, c1, &x1, &y1, card); + ButterflyCT(r1p1, c1, &x2, &y2, card); + + VecType u1 = LoadToReg(r); + VecType u2 = LoadToReg(r + 1); + VecType v1 = LoadToReg(s); + VecType v2 = LoadToReg(s + 1); + + ButterflyCT(r1p1, c1, &u1, &v1, card); + ButterflyCT(r1p1, c1, &u2, &v2, card); + + // Second layer (c2, x, u) & (c3, y, v) + ButterflyCT(r2p1, c2, &x1, &u1, card); + ButterflyCT(r2p1, c2, &x2, &u2, card); + + ButterflyCT(r3p1, c3, &y1, &v1, card); + ButterflyCT(r3p1, c3, &y2, &v2, card); + + // Store back to memory + StoreToMem(p, x1); + StoreToMem(p + 1, x2); + StoreToMem(q, y1); + StoreToMem(q + 1, y2); + + StoreToMem(r, u1); + StoreToMem(r + 1, u2); + StoreToMem(s, v1); + StoreToMem(s + 1, v2); + p = p + 2; + q = q + 2; + r = r + 2; + s = s + 2; + j = j + 2; + }; + + for (; j < len; ++j) { + // First layer (c1, x, y) & (c1, u, v) + VecType x1 = LoadToReg(p + j); + VecType y1 = LoadToReg(q + j); + VecType u1 = LoadToReg(r + j); + VecType v1 = LoadToReg(s + j); + + // BUTTERFLY_3_test(c1, &x1, &y1, &u1, &v1, card); + ButterflyCT(r1p1, c1, &x1, &y1, card); + ButterflyCT(r1p1, c1, &u1, &v1, card); + ButterflyCT(r2p1, c2, &x1, &u1, card); + ButterflyCT(r3p1, c3, &y1, &v1, card); + + // Store back to memory + StoreToMem(p + j, x1); + StoreToMem(q + j, y1); + StoreToMem(r + j, u1); + StoreToMem(s + j, v1); + } +} + +/** + * Vectorized butterfly CT on two-layers at a time + * + * For each quadruple + * (P, Q, R, S) = (buf[i], buf[i + m], buf[i + 2 * m], buf[i + 3 * m]) + * First layer: butterfly on (P, Q) and (R, S) for step = 2 * m + * coef r1 = W[start * n / (2 * m)] + * P = P + r1 * Q + * Q = P - r1 * Q + * R = R + r1 * S + * S = R - r1 * S + * Second layer: butterfly on (P, R) and (Q, S) for step = 4 * m + * coef r2 = W[start * n / (4 * m)] + * coef r3 = 
W[(start + m) * n / (4 * m)] + * P = P + r2 * R + * R = P - r2 * R + * Q = Q + r3 * S + * S = Q - r3 * S + * + * @param buf - working buffers + * @param r1 - coefficient for the 1st layer + * @param r2 - 1st coefficient for the 2nd layer + * @param r3 - 2nd coefficient for the 2nd layer + * @param start - index of buffer among `m` ones + * @param m - current group size + * @param len - number of vectors per buffer + * @param card - modulo cardinal + */ +template +inline void butterfly_ct_two_layers_step( + vec::Buffers& buf, + T r1, + T r2, + T r3, + unsigned start, + unsigned m, + size_t len, + T card) +{ + if (len == 0) { + return; + } + const unsigned step = m << 2; + const unsigned bufs_nb = buf.get_n(); + + const std::vector& mem = buf.get_mem(); + for (unsigned i = start; i < bufs_nb; i += step) { + do_butterfly_ct_2_layers(mem, r1, r2, r3, i, m, len, card); + } +} + +/** + * Vectorized butterfly GS step + * + * For each pair (P, Q) = (buf[i], buf[i + m]) for step = 2 * m and coef `r` + * P = P + Q + * Q = r * (P - Q) + * + * @param buf - working buffers + * @param r - coefficient + * @param start - index of buffer among `m` ones + * @param m - current group size + * @param len - number of vectors per buffer + * @param card - modulo cardinal + */ +template +inline void butterfly_gs_step( + vec::Buffers& buf, + T r, + unsigned start, + unsigned m, + size_t len, + T card) +{ + if (len == 0) { + return; + } + const unsigned step = m << 1; + const T rp1 = r + 1; + VecType c = SetOne(r); + + const size_t end = (len > 3) ? 
len - 3 : 0; + const unsigned bufs_nb = buf.get_n(); + const std::vector& mem = buf.get_mem(); + for (unsigned i = start; i < bufs_nb; i += step) { + VecType x1, x2, x3, x4; + VecType y1, y2, y3, y4; + VecType* __restrict p = reinterpret_cast(mem[i]); + VecType* __restrict q = reinterpret_cast(mem[i + m]); + + size_t j = 0; + for (; j < end; j += 4) { + x1 = LoadToReg(p + j); + x2 = LoadToReg(p + j + 1); + x3 = LoadToReg(p + j + 2); + x4 = LoadToReg(p + j + 3); + y1 = LoadToReg(q + j); + y2 = LoadToReg(q + j + 1); + y3 = LoadToReg(q + j + 2); + y4 = LoadToReg(q + j + 3); + + ButterflyGS(rp1, c, &x1, &y1, card); + ButterflyGS(rp1, c, &x2, &y2, card); + ButterflyGS(rp1, c, &x3, &y3, card); + ButterflyGS(rp1, c, &x4, &y4, card); + + // Store back to memory + StoreToMem(p + j, x1); + StoreToMem(p + j + 1, x2); + StoreToMem(p + j + 2, x3); + StoreToMem(p + j + 3, x4); + StoreToMem(q + j, y1); + StoreToMem(q + j + 1, y2); + StoreToMem(q + j + 2, y3); + StoreToMem(q + j + 3, y4); + } + for (; j < len; ++j) { + x1 = LoadToReg(p + j); + y1 = LoadToReg(q + j); + + ButterflyGS(rp1, c, &x1, &y1, card); + + // Store back to memory + StoreToMem(p + j, x1); + StoreToMem(q + j, y1); + } + } +} + +/** + * Vectorized butterfly GS step + * + * For each pair (P, Q) = (buf[i], buf[i + m]) for step = 2 * m and coef `r` + * Q = r * P + * + * @param buf - working buffers + * @param r - coefficient + * @param start - index of buffer among `m` ones + * @param m - current group size + * @param len - number of vectors per buffer + * @param card - modulo cardinal + */ +template +inline void butterfly_gs_step_simple( + vec::Buffers& buf, + T r, + unsigned start, + unsigned m, + size_t len, + T card) +{ + if (len == 0) { + return; + } + const unsigned step = m << 1; + const T rp1 = r + 1; + VecType c = SetOne(r); + + const size_t end = (len > 1) ? 
len - 1 : 0; + const unsigned bufs_nb = buf.get_n(); + const std::vector& mem = buf.get_mem(); + for (unsigned i = start; i < bufs_nb; i += step) { + VecType x1, y1; + VecType x2, y2; + VecType* __restrict p = reinterpret_cast(mem[i]); + VecType* __restrict q = reinterpret_cast(mem[i + m]); + + size_t j = 0; + for (; j < end; j += 2) { + x1 = LoadToReg(p + j); + x2 = LoadToReg(p + j + 1); + + y1 = ButterflySimpleGS(rp1, c, x1, card); + y2 = ButterflySimpleGS(rp1, c, x2, card); + + // Store back to memory + StoreToMem(q + j, y1); + StoreToMem(q + j + 1, y2); + } + for (; j < len; ++j) { + x1 = LoadToReg(p + j); + + y1 = ButterflySimpleGS(rp1, c, x1, card); + + // Store back to memory + StoreToMem(q + j, y1); + } + } +} + +template +inline void encode_post_process( + vec::Buffers& output, + std::vector& props, + off_t offset, + unsigned code_len, + T threshold, + size_t vecs_nb) +{ + const unsigned element_size = sizeof(T); + const unsigned vec_size = countof(); + const T max = 1 << (element_size * 8 - 1); + const VecType _threshold = SetOne(threshold); + const VecType mask_hi = SetOne(max); + + const std::vector& mem = output.get_mem(); + for (unsigned frag_id = 0; frag_id < code_len; ++frag_id) { + VecType* __restrict buf = reinterpret_cast(mem[frag_id]); + + size_t vec_id = 0; + size_t end = (vecs_nb > 3) ? 
vecs_nb - 3 : 0; + for (; vec_id < end; vec_id += 4) { + VecType a1 = LoadToReg(buf + vec_id); + VecType a2 = LoadToReg(buf + vec_id + 1); + VecType a3 = LoadToReg(buf + vec_id + 2); + VecType a4 = LoadToReg(buf + vec_id + 3); + + if (AndIsZero(a1, _threshold) == 0) { + const off_t curr_offset = offset + vec_id * vec_size; + AddProps( + props[frag_id], _threshold, mask_hi, a1, curr_offset, max); + } + if (AndIsZero(a2, _threshold) == 0) { + const off_t curr_offset = offset + (vec_id + 1) * vec_size; + AddProps( + props[frag_id], _threshold, mask_hi, a2, curr_offset, max); + } + if (AndIsZero(a3, _threshold) == 0) { + const off_t curr_offset = offset + (vec_id + 2) * vec_size; + AddProps( + props[frag_id], _threshold, mask_hi, a3, curr_offset, max); + } + if (AndIsZero(a4, _threshold) == 0) { + const off_t curr_offset = offset + (vec_id + 3) * vec_size; + AddProps( + props[frag_id], _threshold, mask_hi, a4, curr_offset, max); + } + } + for (; vec_id < vecs_nb; ++vec_id) { + VecType a = LoadToReg(buf + vec_id); + uint32_t c = AndIsZero(a, _threshold); + if (c == 0) { + const off_t curr_offset = offset + vec_id * vec_size; + AddProps( + props[frag_id], _threshold, mask_hi, a, curr_offset, max); + } + } + } +} + +} // namespace simd +} // namespace quadiron + +#endif diff --git a/src/simd/simd_nf4.h b/src/simd/simd_nf4.h new file mode 100644 index 00000000..8c3c1d92 --- /dev/null +++ b/src/simd/simd_nf4.h @@ -0,0 +1,380 @@ +/* + * Copyright 2017-2018 Scality + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __QUAD_SIMD_NF4_H__ +#define __QUAD_SIMD_NF4_H__ + +namespace quadiron { +namespace simd { + +typedef uint32_t aint32 __attribute__((aligned(ALIGNMENT))); + +/** Return __uint128_t integer from a _m128i register */ +static inline __uint128_t m128i_to_uint128(__m128i v) +{ + __uint128_t i; + _mm_store_si128((__m128i*)&i, v); + + return i; // NOLINT(clang-analyzer-core.uninitialized.UndefReturn) +} + +inline __uint128_t expand16(uint16_t* arr, int n) +{ + // since n <= 4 + uint16_t _arr[4] __attribute__((aligned(ALIGNMENT))) = {0, 0, 0, 0}; + std::copy_n(arr, n, _arr); + + __m128i b = _mm_set_epi16(0, 0, 0, 0, _arr[3], _arr[2], _arr[1], _arr[0]); + + return m128i_to_uint128(b); +} + +inline __uint128_t expand32(uint32_t* arr, int n) +{ + // since n <= 4 + uint32_t _arr[4] __attribute__((aligned(simd::ALIGNMENT))) = {0, 0, 0, 0}; + std::copy_n(arr, n, _arr); + + __m128i b = _mm_set_epi32(_arr[3], _arr[2], _arr[1], _arr[0]); + + return m128i_to_uint128(b); +} + +inline GroupedValues<__uint128_t> unpack(__uint128_t a, int n) +{ + uint16_t ai[8]; + __uint128_t values; + + __m128i _a = _mm_loadu_si128((__m128i*)&a); + ai[0] = _mm_extract_epi16(_a, 0); + ai[1] = _mm_extract_epi16(_a, 1); + ai[2] = _mm_extract_epi16(_a, 2); + ai[3] = _mm_extract_epi16(_a, 3); + ai[4] = _mm_extract_epi16(_a, 4); + ai[5] = _mm_extract_epi16(_a, 5); + ai[6] = _mm_extract_epi16(_a, 6); + ai[7] = _mm_extract_epi16(_a, 7); + + const uint32_t flag = + ai[1] | (!!ai[3] << 1u) | (!!ai[5] << 2u) | (!!ai[7] << 3u); + + __m128i val = _mm_set_epi16(0, 0, 0, 0, ai[6], ai[4], ai[2], ai[0]); + _mm_store_si128((__m128i*)&values, val); + + GroupedValues<__uint128_t> b = {values, flag}; + + return b; +} + +inline void unpack(__uint128_t a, GroupedValues<__uint128_t>& b, int n) +{ + uint16_t ai[8]; + __uint128_t values; + + __m128i _a = _mm_loadu_si128((__m128i*)&a); + ai[0] = _mm_extract_epi16(_a, 0); + ai[1] = _mm_extract_epi16(_a, 1); + ai[2] = _mm_extract_epi16(_a, 2); + ai[3] = 
_mm_extract_epi16(_a, 3); + ai[4] = _mm_extract_epi16(_a, 4); + ai[5] = _mm_extract_epi16(_a, 5); + ai[6] = _mm_extract_epi16(_a, 6); + ai[7] = _mm_extract_epi16(_a, 7); + + const uint32_t flag = + ai[1] | (!!ai[3] << 1u) | (!!ai[5] << 2u) | (!!ai[7] << 3u); + + __m128i val = _mm_set_epi16(0, 0, 0, 0, ai[6], ai[4], ai[2], ai[0]); + _mm_store_si128((__m128i*)&values, val); + + b.flag = flag; + b.values = values; // NOLINT(clang-analyzer-core.uninitialized.Assign) +} + +inline __uint128_t pack(__uint128_t a) +{ + __m128i _a = _mm_loadu_si128((__m128i*)&a); + __m128i b = _mm_set_epi32( + _mm_extract_epi16(_a, 3), + _mm_extract_epi16(_a, 2), + _mm_extract_epi16(_a, 1), + _mm_extract_epi16(_a, 0)); + + return m128i_to_uint128(b); +} + +inline __uint128_t pack(__uint128_t a, uint32_t flag) +{ + aint32 b0, b1, b2, b3; + __m128i _a = _mm_loadu_si128((__m128i*)&a); + + if (flag & 1) + b0 = 65536; + else + b0 = _mm_extract_epi16(_a, 0); + flag >>= 1; + if (flag & 1) + b1 = 65536; + else + b1 = _mm_extract_epi16(_a, 1); + flag >>= 1; + if (flag & 1) + b2 = 65536; + else + b2 = _mm_extract_epi16(_a, 2); + flag >>= 1; + if (flag & 1) + b3 = 65536; + else + b3 = _mm_extract_epi16(_a, 3); + + __m128i b = _mm_set_epi32(b3, b2, b1, b0); + + return m128i_to_uint128(b); +} + +/* ================= Basic operations for NF4 ================= */ + +#if defined(__AVX2__) + +inline VecType LoadToReg(HalfVecType x) +{ + return _mm256_castsi128_si256(_mm_load_si128(&x)); +} + +inline VecType LoadToReg(__uint128_t x) +{ + const HalfVecType* _x = reinterpret_cast(&x); + return LoadToReg(*_x); +} + +inline void StoreLowHalfToMem(HalfVecType* address, VecType reg) +{ + _mm_store_si128(address, _mm256_castsi256_si128(reg)); +} + +inline __uint128_t add(__uint128_t a, __uint128_t b) +{ + HalfVecType res; + VecType _a = LoadToReg(a); + VecType _b = LoadToReg(b); + StoreLowHalfToMem(&res, ModAdd(_a, _b, F4)); + return reinterpret_cast<__uint128_t>(res); +} + +inline __uint128_t sub(__uint128_t a, 
__uint128_t b) +{ + HalfVecType res; + VecType _a = LoadToReg(a); + VecType _b = LoadToReg(b); + StoreLowHalfToMem(&res, ModSub(_a, _b, F4)); + return reinterpret_cast<__uint128_t>(res); +} + +inline __uint128_t mul(__uint128_t a, __uint128_t b) +{ + HalfVecType res; + VecType _a = LoadToReg(a); + VecType _b = LoadToReg(b); + StoreLowHalfToMem(&res, ModMulSafe(_a, _b, F4)); + return reinterpret_cast<__uint128_t>(res); +} + +inline void add_buf_to_two_bufs_rem( + unsigned n, + __uint128_t* x, + __uint128_t* x_half, + __uint128_t* y) +{ + // add last _y[] to x and x_next + HalfVecType* _x = reinterpret_cast(x); + HalfVecType* _x_half = reinterpret_cast(x_half); + HalfVecType* _y = reinterpret_cast(y); + for (unsigned i = 0; i < n; ++i) { + VecType _x_p = LoadToReg(_x[i]); + VecType _x_next_p = LoadToReg(_x_half[i]); + VecType _y_p = LoadToReg(_y[i]); + + StoreLowHalfToMem(_x + i, ModAdd(_x_p, _y_p, F4)); + StoreLowHalfToMem(_x_half + i, ModAdd(_x_next_p, _y_p, F4)); + } +} + +inline void hadamard_mul_rem(unsigned n, __uint128_t* x, __uint128_t* y) +{ + HalfVecType* _x = reinterpret_cast(x); + HalfVecType* _y = reinterpret_cast(y); + for (unsigned i = 0; i < n; ++i) { + VecType _x_p = LoadToReg(_x[i]); + VecType _y_p = LoadToReg(_y[i]); + + StoreLowHalfToMem(_x + i, ModMulSafe(_x_p, _y_p, F4)); + } +} + +inline void hadamard_mul_doubled_rem( + unsigned n, + __uint128_t* x, + __uint128_t* x_half, + __uint128_t* y) +{ + HalfVecType* _x = reinterpret_cast(x); + HalfVecType* _x_half = reinterpret_cast(x_half); + HalfVecType* _y = reinterpret_cast(y); + for (unsigned i = 0; i < n; ++i) { + VecType _x_p = LoadToReg(_x[i]); + VecType _x_next_p = LoadToReg(_x_half[i]); + VecType _y_p = LoadToReg(_y[i]); + + StoreLowHalfToMem(_x + i, ModMulSafe(_x_p, _y_p, F4)); + StoreLowHalfToMem(_x_half + i, ModMulSafe(_x_next_p, _y_p, F4)); + } +} + +#elif defined(__SSE4_1__) + +inline VecType LoadToReg(__uint128_t x) +{ + const VecType* _x = reinterpret_cast(&x); + return 
_mm_load_si128(_x); +} + +inline __uint128_t add(__uint128_t a, __uint128_t b) +{ + VecType res; + VecType _a = LoadToReg(a); + VecType _b = LoadToReg(b); + StoreToMem(&res, ModAdd(_a, _b, F4)); + return reinterpret_cast<__uint128_t>(res); +} + +inline __uint128_t sub(__uint128_t a, __uint128_t b) +{ + VecType res; + VecType _a = LoadToReg(a); + VecType _b = LoadToReg(b); + StoreToMem(&res, ModSub(_a, _b, F4)); + return reinterpret_cast<__uint128_t>(res); +} + +inline __uint128_t mul(__uint128_t a, __uint128_t b) +{ + VecType res; + VecType _a = LoadToReg(a); + VecType _b = LoadToReg(b); + StoreToMem(&res, ModMulSafe(_a, _b, F4)); + return reinterpret_cast<__uint128_t>(res); +} + +inline void add_buf_to_two_bufs_rem( + unsigned n, + __uint128_t* x, + __uint128_t* x_half, + __uint128_t* y) +{ + // do nothing +} + +inline void hadamard_mul_rem(unsigned n, __uint128_t* x, __uint128_t* y) +{ + // do nothing +} + +inline void hadamard_mul_doubled_rem( + unsigned n, + __uint128_t* x, + __uint128_t* x_half, + __uint128_t* y) +{ + // do nothing +} + +#endif + +/* ==================== Operations for NF4 =================== */ + +/** Add buffer `y` to two halves of `x`. 
`x` is of length `n` */ +inline void add_buf_to_two_bufs(unsigned n, __uint128_t* _x, __uint128_t* _y) +{ + unsigned i; + VecType* x = reinterpret_cast(_x); + VecType* y = reinterpret_cast(_y); + + const unsigned ratio = sizeof(*x) / sizeof(*_x); + const unsigned half_len = n / 2; + const unsigned vec_len = half_len / ratio; + const unsigned num_len = vec_len * ratio; + const unsigned rem_len = half_len - num_len; + + __uint128_t* x_half = _x + half_len; + VecType* x_next = reinterpret_cast(x_half); + + // add y to the first half of `x` + for (i = 0; i < vec_len; ++i) { + x[i] = ModAdd(x[i], y[i], F4); + } + + // add y to the second half of `x` + for (i = 0; i < vec_len; ++i) { + x_next[i] = ModAdd(x_next[i], y[i], F4); + } + + if (rem_len > 0) { + add_buf_to_two_bufs_rem( + rem_len, _x + num_len, x_half + num_len, _y + num_len); + } +} + +inline void hadamard_mul(unsigned n, __uint128_t* _x, __uint128_t* _y) +{ + unsigned i; + VecType* x = reinterpret_cast(_x); + VecType* y = reinterpret_cast(_y); + + const unsigned ratio = sizeof(*x) / sizeof(*_x); + const unsigned vec_len = n / ratio; + const unsigned num_len = vec_len * ratio; + const unsigned rem_len = n - num_len; + + // multiply y to the first half of `x` + for (i = 0; i < vec_len; ++i) { + x[i] = ModMulSafe(x[i], y[i], F4); + } + + if (rem_len > 0) { + // add last _y[] to x + hadamard_mul_rem(rem_len, _x + num_len, _y + num_len); + } +} + +} // namespace simd +} // namespace quadiron + +#endif diff --git a/src/simd_128.h b/src/simd_128.h deleted file mode 100644 index 0f6b8857..00000000 --- a/src/simd_128.h +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright 2017-2018 Scality - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. 
Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __QUAD_SIMD_128_H__ -#define __QUAD_SIMD_128_H__ - -#include - -typedef __m128i m128i; - -// Disable `cert-err58-cpp` on these: AFAIK they cannot throw. -// (probably a false positive present in Clang 5 and fixed in Clang 6). 
-const m128i F4_m128i = _mm_set1_epi32(65537); // NOLINT(cert-err58-cpp) -const m128i F4minus1_m128i = _mm_set1_epi32(65536); // NOLINT(cert-err58-cpp) -const m128i F3_m128i = _mm_set1_epi32(257); // NOLINT(cert-err58-cpp) -const m128i F3minus1_m128i = _mm_set1_epi32(256); // NOLINT(cert-err58-cpp) - -const m128i F3_m128i_u16 = _mm_set1_epi16(257); // NOLINT(cert-err58-cpp) -const m128i F3minus1_m128i_u16 = _mm_set1_epi16(256); // NOLINT(cert-err58-cpp) - -#include "simd_128_u16.h" -#include "simd_128_u32.h" - -#endif diff --git a/src/simd_128_u16.h b/src/simd_128_u16.h deleted file mode 100644 index a177cfb3..00000000 --- a/src/simd_128_u16.h +++ /dev/null @@ -1,358 +0,0 @@ -/* - * Copyright 2017-2018 Scality - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __QUAD_SIMD_128_U16_H__ -#define __QUAD_SIMD_128_U16_H__ - -#include - -#include "property.h" -#include "simd/simd.h" - -namespace quadiron { -namespace simd { - -/* ==================== Essential Operations =================== */ - -/** Perform a%card where a is a addition of two numbers whose elements are - * symbols of GF(card) */ -inline m128i mod_after_add(m128i a, aint16 card) -{ - const m128i _card = _mm_set1_epi16(card); - const m128i _card_minus_1 = _mm_set1_epi16(card - 1); - - m128i cmp = _mm_cmpgt_epi16(a, _card_minus_1); - m128i b = _mm_sub_epi16(a, _mm_and_si128(_card, cmp)); - - return b; -} - -/** Perform addition of two numbers a, b whose elements are of GF(card) */ -inline m128i add(m128i a, m128i b, aint16 card) -{ - m128i _a = _mm_load_si128(&a); - m128i _b = _mm_load_si128(&b); - m128i c = _mm_add_epi16(_a, _b); - - // Modulo - return mod_after_add(c, card); -} - -/** Perform subtraction of a by b where a, b whose elements are symbols of - * GF(card) - * sub(a, b) = a - b if a >= b, or - * card + a - b, otherwise - */ -inline m128i sub(m128i a, m128i b, aint16 card) -{ - const m128i _card = _mm_set1_epi16(card); - - m128i _a = _mm_load_si128(&a); - m128i _b = _mm_load_si128(&b); - - m128i cmp = _mm_cmpgt_epi16(_b, _a); - m128i _a1 = _mm_add_epi16(_a, _mm_and_si128(_card, cmp)); - - return _mm_sub_epi16(_a1, _b); -} - -/** Negate `a` - * @return 0 if (a == 0), else card - a - */ -inline m128i 
neg(m128i a, aint16 card = F3) -{ - const m128i _card = _mm_set1_epi16(card); - m128i _a = _mm_load_si128(&a); - m128i _b = _mm_setzero_si128(); - - m128i cmp = _mm_cmpgt_epi16(_a, _b); - - return _mm_sub_epi16(_mm_and_si128(cmp, _card), _a); -} - -inline m128i mod_after_multiply(m128i a) -{ - const m128i mask = _mm_set1_epi16(F3 - 2); - - m128i lo = _mm_and_si128(a, mask); - - m128i a_shift = _mm_srli_si128(a, 1); - m128i hi = _mm_and_si128(a_shift, mask); - - m128i cmp = _mm_cmpgt_epi16(hi, lo); - m128i _lo = _mm_add_epi16(lo, _mm_and_si128(F3_m128i_u16, cmp)); - - return _mm_sub_epi16(_lo, hi); -} - -inline m128i mul(m128i a, m128i b) -{ - m128i _a = _mm_load_si128(&a); - m128i _b = _mm_load_si128(&b); - - m128i c = _mm_mullo_epi16(_a, _b); - - // filter elements of both of a & b = card-1 - m128i cmp = _mm_and_si128( - _mm_cmpeq_epi16(_a, F3minus1_m128i_u16), - _mm_cmpeq_epi16(_b, F3minus1_m128i_u16)); - - const m128i one = _mm_set1_epi16(1); - c = _mm_add_epi16(c, _mm_and_si128(one, cmp)); - - // Modulo - return mod_after_multiply(c); -} - -/** Perform multiplication of two numbers a, b whose elements are of GF(card) - * where `card` is a prime Fermat number, i.e. 
card = Fx with x < 5 - * Currently, it supports only for F3 - */ -inline m128i mul(m128i a, m128i b, aint16 card) -{ - // FIXME: generalize card - assert(card == F3); - return mul(a, b); -} - -/** Apply an element-wise negation to a buffer - */ -inline void neg(size_t len, aint16* buf, aint16 card = F3) -{ - m128i* _buf = reinterpret_cast(buf); - unsigned ratio = sizeof(*_buf) / sizeof(*buf); - size_t _len = len / ratio; - size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - _buf[i] = neg(_buf[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - if (buf[i]) - buf[i] = card - buf[i]; - } - } -} - -/** Perform a multiplication of a coefficient `a` to each element of `src` and - * add result to correspondent element of `dest` - */ -inline void mul_coef_to_buf( - const aint16 a, - aint16* src, - aint16* dest, - size_t len, - aint16 card = F3) -{ - const m128i coef = _mm_set1_epi16(a); - - m128i* _src = reinterpret_cast(src); - m128i* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform multiplication - _dest[i] = mul(coef, _src[i], card); - } - if (_last_len > 0) { - uint32_t coef_doubled = (uint32_t)a; - for (i = _len * ratio; i < len; i++) { - // perform multiplication - dest[i] = (aint16)((coef_doubled * src[i]) % card); - } - } -} - -inline void -add_two_bufs(aint16* src, aint16* dest, size_t len, aint16 card = F3) -{ - m128i* _src = reinterpret_cast(src); - m128i* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform addition - _dest[i] = add(_src[i], _dest[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform addition - 
aint16 tmp = src[i] + dest[i]; - dest[i] = (tmp >= card) ? (tmp - card) : tmp; - } - } -} - -inline void sub_two_bufs( - aint16* bufa, - aint16* bufb, - aint16* res, - size_t len, - aint16 card = F3) -{ - m128i* _bufa = reinterpret_cast(bufa); - m128i* _bufb = reinterpret_cast(bufb); - m128i* _res = reinterpret_cast(res); - const unsigned ratio = sizeof(*_bufa) / sizeof(*bufa); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform subtraction - _res[i] = sub(_bufa[i], _bufb[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform subtraction - if (bufa[i] >= bufb[i]) - res[i] = bufa[i] - bufb[i]; - else - res[i] = card - (bufb[i] - bufa[i]); - } - } -} - -inline void -mul_two_bufs(aint16* src, aint16* dest, size_t len, aint16 card = F3) -{ - m128i* _src = reinterpret_cast(src); - m128i* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform multiplicaton - _dest[i] = mul(_src[i], _dest[i], F3); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform multiplicaton - // dest[i] = uint32_t(src[i]) * uint32_t(dest[i]) % card; - dest[i] = uint16_t((uint32_t(src[i]) * dest[i]) % card); - } - } -} - -/* - * buf1[i] = buf1[i] + coef * buf2[i] - * buf2[i] = buf1[i] - coef * buf2[i] - */ -inline void butterfly_ct( - uint16_t coef, - aint16* buf1, - aint16* buf2, - size_t len, - uint32_t card = F3) -{ - const m128i _coef = _mm_set1_epi16(coef); - m128i* _buf1 = reinterpret_cast(buf1); - m128i* _buf2 = reinterpret_cast(buf2); - - for (size_t i = 0; i < len; ++i) { - m128i a = mul(_coef, _buf2[i], card); - _buf2[i] = sub(_buf1[i], a, card); - _buf1[i] = add(_buf1[i], a, card); - } -} - -/* - * buf1[i] = buf1[i] + buf2[i] - * buf2[i] = coef * (buf1[i] - buf2[i]) - */ 
-inline void butterfly_gs( - uint16_t coef, - aint16* buf1, - aint16* buf2, - size_t len, - uint16_t card = F3) -{ - const m128i _coef = _mm_set1_epi16(coef); - m128i* _buf1 = reinterpret_cast(buf1); - m128i* _buf2 = reinterpret_cast(buf2); - - for (size_t i = 0; i < len; ++i) { - m128i a = _buf1[i]; - m128i b = _buf2[i]; - m128i c = sub(a, b, card); - _buf1[i] = add(a, b, card); - _buf2[i] = mul(_coef, c, card); - } -} - -inline void encode_post_process( - vec::Buffers& output, - std::vector& props, - off_t offset, - unsigned code_len, - uint16_t threshold, - size_t vecs_nb) -{ - const unsigned vec_size = simd::countof(); - - const m128i _threshold = _mm_set1_epi16(threshold); - uint16_t max = 1 << (sizeof(uint16_t) * 8 - 1); - const m128i mask_hi = _mm_set1_epi16(max); - const unsigned element_size = sizeof(uint16_t); - - for (unsigned frag_id = 0; frag_id < code_len; ++frag_id) { - uint16_t* chunk = output.get(frag_id); - m128i* buf = reinterpret_cast(chunk); - for (unsigned vec_id = 0; vec_id < vecs_nb; ++vec_id) { - const m128i a = _mm_load_si128(&(buf[vec_id])); - const m128i b = _mm_cmpeq_epi16(_threshold, a); - const m128i c = _mm_and_si128(mask_hi, b); - uint16_t d = _mm_movemask_epi8(c); - - while (d > 0) { - unsigned byte_idx = __builtin_ctz(d); - unsigned element_idx = byte_idx / element_size; - off_t _offset = offset + vec_id * vec_size + element_idx; - props[frag_id].add(_offset, 1); - d ^= 1 << byte_idx; - } - } - } -} - -} // namespace simd -} // namespace quadiron - -#endif diff --git a/src/simd_128_u32.h b/src/simd_128_u32.h deleted file mode 100644 index 5039f0f8..00000000 --- a/src/simd_128_u32.h +++ /dev/null @@ -1,442 +0,0 @@ -/* - * Copyright 2017-2018 Scality - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. 
Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ - -#ifndef __QUAD_SIMD_128_U32_H__ -#define __QUAD_SIMD_128_U32_H__ - -#include - -#include "simd/simd.h" - -namespace quadiron { -namespace simd { - -/* ==================== Essential Operations =================== */ - -/** Perform a%card where a is a addition of two numbers whose elements are - * symbols of GF(card) */ -inline m128i mod_after_add(m128i a, aint32 card) -{ - const m128i _card = _mm_set1_epi32(card); - const m128i _card_minus_1 = _mm_set1_epi32(card - 1); - - m128i cmp = _mm_cmpgt_epi32(a, _card_minus_1); - m128i b = _mm_sub_epi32(a, _mm_and_si128(_card, cmp)); - - return b; -} - -/** Perform addition of two numbers a, b whose elements are of GF(card) */ -inline m128i add(m128i a, m128i b, aint32 card) -{ - m128i _a = _mm_load_si128(&a); - m128i _b = _mm_load_si128(&b); - m128i c = _mm_add_epi32(_a, _b); - - // Modulo - return mod_after_add(c, card); -} - -/** Perform subtraction of a by b where a, b whose elements are symbols of - * GF(card) - * sub(a, b) = a - b if a >= b, or - * card + a - b, otherwise - */ -inline m128i sub(m128i a, m128i b, aint32 card) -{ - const m128i _card = _mm_set1_epi32(card); - - m128i _a = _mm_load_si128(&a); - m128i _b = _mm_load_si128(&b); - - m128i cmp = _mm_cmpgt_epi32(_b, _a); - m128i _a1 = _mm_add_epi32(_a, _mm_and_si128(_card, cmp)); - - return _mm_sub_epi32(_a1, _b); -} - -/** Negate `a` - * @return 0 if (a == 0), else card - a - */ -inline m128i neg(m128i a, aint32 card = F4) -{ - const m128i _card = _mm_set1_epi32(card); - - m128i _a = _mm_load_si128(&a); - m128i _b = _mm_setzero_si128(); - m128i cmp = _mm_cmpgt_epi32(_a, _b); - - return _mm_sub_epi32(_mm_and_si128(cmp, _card), _a); -} - -/** Perform a%card where a is a multiplication of two numbers whose elements are - * symbols of GF(F4) - * - * We find v in a = u * card + v - * a is expressed also as: a = hi * (card-1) + lo - * where hi and lo is 16-bit for F4 (or 8-bit for F3) high and low parts of a - * hence, v = (lo - hi) % F4 - * v = lo - hi, if 
lo >= hi - * or - * F4 + lo - hi, otherwise - */ -inline m128i mod_after_multiply_f4(m128i a) -{ - const m128i mask = _mm_set1_epi32(F4 - 2); - - m128i lo = _mm_and_si128(a, mask); - - m128i a_shift = _mm_srli_si128(a, 2); - m128i hi = _mm_and_si128(a_shift, mask); - - m128i cmp = _mm_cmpgt_epi32(hi, lo); - m128i _lo = _mm_add_epi32(lo, _mm_and_si128(F4_m128i, cmp)); - - return _mm_sub_epi32(_lo, hi); -} - -inline m128i mod_after_multiply_f3(m128i a) -{ - const m128i mask = _mm_set1_epi32(F3 - 2); - - m128i lo = _mm_and_si128(a, mask); - - m128i a_shift = _mm_srli_si128(a, 1); - m128i hi = _mm_and_si128(a_shift, mask); - - m128i cmp = _mm_cmpgt_epi32(hi, lo); - m128i _lo = _mm_add_epi32(lo, _mm_and_si128(F3_m128i, cmp)); - - return _mm_sub_epi32(_lo, hi); -} - -inline m128i mul_f4(m128i a, m128i b) -{ - m128i _a = _mm_load_si128(&a); - m128i _b = _mm_load_si128(&b); - - m128i c = _mm_mullo_epi32(_a, _b); - - // filter elements of both of a & b = card-1 - m128i cmp = _mm_and_si128( - _mm_cmpeq_epi32(_a, F4minus1_m128i), - _mm_cmpeq_epi32(_b, F4minus1_m128i)); - - const m128i one = _mm_set1_epi32(1); - c = _mm_add_epi32(c, _mm_and_si128(one, cmp)); - - // Modulo - return mod_after_multiply_f4(c); -} - -inline m128i mul_f3(m128i a, m128i b) -{ - m128i _a = _mm_load_si128(&a); - m128i _b = _mm_load_si128(&b); - - m128i c = _mm_mullo_epi32(_a, _b); - - // filter elements of both of a & b = card-1 - m128i cmp = _mm_and_si128( - _mm_cmpeq_epi32(_a, F3minus1_m128i), - _mm_cmpeq_epi32(_b, F3minus1_m128i)); - - c = _mm_xor_si128(c, _mm_and_si128(F4_m128i, cmp)); - - // Modulo - return mod_after_multiply_f3(c); -} - -/** Perform multiplication of two numbers a, b whose elements are of GF(card) - * where `card` is a prime Fermat number, i.e. 
card = Fx with x < 5 - * Currently, it supports only for F3 and F4 - */ -inline m128i mul(m128i a, m128i b, aint32 card) -{ - assert(card == F4 || card == F3); - if (card == F4) - return mul_f4(a, b); - return mul_f3(a, b); -} - -/** Apply an element-wise negation to a buffer - */ -inline void neg(size_t len, aint32* buf, aint32 card = F4) -{ - m128i* _buf = reinterpret_cast(buf); - unsigned ratio = sizeof(*_buf) / sizeof(*buf); - size_t _len = len / ratio; - size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - _buf[i] = neg(_buf[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - if (buf[i] > 0) - buf[i] = card - buf[i]; - } - } -} - -/** Perform a multiplication of a coefficient `a` to each element of `src` and - * add result to correspondent element of `dest` - */ -inline void mul_coef_to_buf( - const aint32 a, - aint32* src, - aint32* dest, - size_t len, - aint32 card = F4) -{ - const m128i coef = _mm_set1_epi32(a); - - m128i* _src = reinterpret_cast(src); - m128i* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform multiplication - _dest[i] = mul(coef, _src[i], card); - } - if (_last_len > 0) { - uint64_t coef_64 = (uint64_t)a; - for (i = _len * ratio; i < len; i++) { - // perform multiplication - dest[i] = (aint32)((coef_64 * src[i]) % card); - } - } -} - -inline void -add_two_bufs(aint32* src, aint32* dest, size_t len, aint32 card = F4) -{ - m128i* _src = reinterpret_cast(src); - m128i* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform addition - _dest[i] = add(_src[i], _dest[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) 
{ - // perform addition - aint32 tmp = src[i] + dest[i]; - dest[i] = (tmp >= card) ? (tmp - card) : tmp; - } - } -} - -inline void sub_two_bufs( - aint32* bufa, - aint32* bufb, - aint32* res, - size_t len, - aint32 card = F4) -{ - m128i* _bufa = reinterpret_cast(bufa); - m128i* _bufb = reinterpret_cast(bufb); - m128i* _res = reinterpret_cast(res); - const unsigned ratio = sizeof(*_bufa) / sizeof(*bufa); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform subtraction - _res[i] = sub(_bufa[i], _bufb[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform subtraction - if (bufa[i] >= bufb[i]) - res[i] = bufa[i] - bufb[i]; - else - res[i] = card - (bufb[i] - bufa[i]); - } - } -} - -inline void -mul_two_bufs(aint32* src, aint32* dest, size_t len, aint32 card = F4) -{ - m128i* _src = reinterpret_cast(src); - m128i* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform multiplicaton - _dest[i] = mul(_src[i], _dest[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform multiplicaton - dest[i] = uint32_t((uint64_t(src[i]) * dest[i]) % card); - } - } -} - -/* - * buf1[i] = buf1[i] + coef * buf2[i] - * buf2[i] = buf1[i] - coef * buf2[i] - */ -inline void butterfly_ct( - uint32_t coef, - aint32* buf1, - aint32* buf2, - size_t len, - uint32_t card = F4) -{ - const m128i _coef = _mm_set1_epi32(coef); - m128i* _buf1 = reinterpret_cast(buf1); - m128i* _buf2 = reinterpret_cast(buf2); - - for (size_t i = 0; i < len; ++i) { - m128i a = mul(_coef, _buf2[i], card); - _buf2[i] = sub(_buf1[i], a, card); - _buf1[i] = add(_buf1[i], a, card); - } -} - -/* - * buf1[i] = buf1[i] + buf2[i] - * buf2[i] = coef * (buf1[i] - buf2[i]) - */ -inline void butterfly_gs( - 
uint32_t coef, - aint32* buf1, - aint32* buf2, - size_t len, - uint32_t card = F4) -{ - const m128i _coef = _mm_set1_epi32(coef); - m128i* _buf1 = reinterpret_cast(buf1); - m128i* _buf2 = reinterpret_cast(buf2); - - for (size_t i = 0; i < len; ++i) { - m128i a = _buf1[i]; - m128i b = _buf2[i]; - m128i c = sub(a, b, card); - _buf1[i] = add(a, b, card); - _buf2[i] = mul(_coef, c, card); - } -} - -inline void encode_post_process( - vec::Buffers& output, - std::vector& props, - off_t offset, - unsigned code_len, - uint32_t threshold, - size_t vecs_nb) -{ - const unsigned vec_size = simd::countof(); - - const m128i _threshold = _mm_set1_epi32(threshold); - const uint32_t max = 1 << (sizeof(uint32_t) * 8 - 1); - const m128i mask_hi = _mm_set1_epi32(max); - const unsigned element_size = sizeof(uint32_t); - - for (unsigned frag_id = 0; frag_id < code_len; ++frag_id) { - uint32_t* chunk = output.get(frag_id); - m128i* buf = reinterpret_cast(chunk); - for (unsigned vec_id = 0; vec_id < vecs_nb; ++vec_id) { - const m128i a = _mm_load_si128(&(buf[vec_id])); - const m128i b = _mm_cmpeq_epi32(_threshold, a); - const m128i c = _mm_and_si128(mask_hi, b); - uint16_t d = _mm_movemask_epi8(c); - - while (d > 0) { - unsigned byte_idx = __builtin_ctz(d); - unsigned element_idx = byte_idx / element_size; - off_t _offset = offset + vec_id * vec_size + element_idx; - props[frag_id].add(_offset, 1); - d ^= 1 << byte_idx; - } - } - } -} - -/* ==================== Operations for NF4 =================== */ - -/** Return aint128 integer from a _m128i register */ -static inline aint128 m128i_to_uint128(m128i v) -{ - aint128 i; - _mm_store_si128((m128i*)&i, v); - - return i; // NOLINT(clang-analyzer-core.uninitialized.UndefReturn) -} - -inline __uint128_t add(__uint128_t a, __uint128_t b) -{ - m128i res = add((m128i)a, (m128i)b, F4); - return m128i_to_uint128(res); -} - -inline __uint128_t sub(__uint128_t a, __uint128_t b) -{ - m128i res = sub((m128i)a, (m128i)b, F4); - return 
m128i_to_uint128(res); -} - -inline __uint128_t mul(__uint128_t a, __uint128_t b) -{ - m128i res = mul((m128i)a, (m128i)b, F4); - return m128i_to_uint128(res); -} - -inline void hadamard_mul(int n, aint128* _x, aint128* _y) -{ - int i; - m128i* x = reinterpret_cast(_x); - m128i* y = reinterpret_cast(_y); - - // multiply y to `x` - for (i = 0; i < n; i++) { - x[i] = mul(x[i], y[i], F4); - } -} - -} // namespace simd -} // namespace quadiron - -#endif diff --git a/src/simd_256.h b/src/simd_256.h deleted file mode 100644 index 2dc49cc4..00000000 --- a/src/simd_256.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright 2017-2018 Scality - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __QUAD_SIMD_256_H__ -#define __QUAD_SIMD_256_H__ - -#include - -typedef __m256i m256i; - -// Disable `cert-err58-cpp` on these: AFAIK they cannot throw. -// (probably a false positive present in Clang 5 and fixed in Clang 6). -const m256i F4_m256i = _mm256_set1_epi32(65537); // NOLINT(cert-err58-cpp) -const m256i F4minus1_m256i = _mm256_set1_epi32(65536); // NOLINT(cert-err58-cpp) -const m256i F3_m256i = _mm256_set1_epi32(257); // NOLINT(cert-err58-cpp) -const m256i F3minus1_m256i = _mm256_set1_epi32(256); // NOLINT(cert-err58-cpp) - -const m256i F3_m256i_u16 = _mm256_set1_epi16(257); // NOLINT(cert-err58-cpp) -// NOLINTNEXTLINE(cert-err58-cpp) -const m256i F3minus1_m256i_u16 = _mm256_set1_epi16(256); - -/* GCC doesn't include the split store intrinsics so define them here. 
*/ -#if defined(__GNUC__) && !defined(__clang__) - -static inline void __attribute__((__always_inline__)) -_mm256_storeu2_m128i(__m128i* const hi, __m128i* const lo, const __m256i a) -{ - _mm_storeu_si128(lo, _mm256_castsi256_si128(a)); - _mm_storeu_si128(hi, _mm256_extracti128_si256(a, 1)); -} - -#endif /* defined(__GNUC__) */ - -#include "simd_256_u16.h" -#include "simd_256_u32.h" - -#endif diff --git a/src/simd_256_u16.h b/src/simd_256_u16.h deleted file mode 100644 index 974e136e..00000000 --- a/src/simd_256_u16.h +++ /dev/null @@ -1,354 +0,0 @@ -/* - * Copyright 2017-2018 Scality - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __QUAD_SIMD_256_U16_H__ -#define __QUAD_SIMD_256_U16_H__ - -#include - -#include "simd/simd.h" - -namespace quadiron { -namespace simd { - -/** Perform a%card where a is a addition of two numbers whose elements are - * symbols of GF(card) */ -inline m256i mod_after_add(m256i a, aint16 card) -{ - const m256i _card = _mm256_set1_epi16(card); - const m256i _card_minus_1 = _mm256_set1_epi16(card - 1); - - m256i cmp = _mm256_cmpgt_epi16(a, _card_minus_1); - m256i b = _mm256_sub_epi16(a, _mm256_and_si256(_card, cmp)); - - return b; -} - -/** Perform addition of two numbers a, b whose elements are of GF(card) */ -inline m256i add(m256i a, m256i b, aint16 card = F3) -{ - m256i _a = _mm256_load_si256(&a); - m256i _b = _mm256_load_si256(&b); - m256i c = _mm256_add_epi16(_a, _b); - - // Modulo - return mod_after_add(c, card); -} - -/** Perform subtraction of a by b where a, b whose elements are symbols of - * GF(card) - * sub(a, b) = a - b if a >= b, or - * card + a - b, otherwise - */ -inline m256i sub(m256i a, m256i b, aint16 card) -{ - const m256i _card = _mm256_set1_epi16(card); - - m256i _a = _mm256_load_si256(&a); - m256i _b = _mm256_load_si256(&b); - - m256i cmp = _mm256_cmpgt_epi16(_b, _a); - m256i _a1 = _mm256_add_epi16(_a, _mm256_and_si256(_card, cmp)); - - return _mm256_sub_epi16(_a1, _b); -} - -/** Negate `a` - * @return 0 if (a == 0), else card - a - */ -inline m256i neg(m256i a, aint16 card = F3) -{ - const m256i 
_card = _mm256_set1_epi16(card); - m256i _a = _mm256_load_si256(&a); - m256i _b = _mm256_setzero_si256(); - - m256i cmp = _mm256_cmpgt_epi16(_a, _b); - - return _mm256_sub_epi16(_mm256_and_si256(cmp, _card), _a); -} - -inline m256i mod_after_multiply(m256i a) -{ - const m256i mask = _mm256_set1_epi16(F3 - 2); - - m256i lo = _mm256_and_si256(a, mask); - - m256i a_shift = _mm256_srli_si256(a, 1); - m256i hi = _mm256_and_si256(a_shift, mask); - - m256i cmp = _mm256_cmpgt_epi16(hi, lo); - m256i _lo = _mm256_add_epi16(lo, _mm256_and_si256(F3_m256i_u16, cmp)); - - return _mm256_sub_epi16(_lo, hi); -} - -inline m256i mul(m256i a, m256i b) -{ - m256i _a = _mm256_load_si256(&a); - m256i _b = _mm256_load_si256(&b); - - m256i c = _mm256_mullo_epi16(_a, _b); - - // filter elements of both of a & b = card-1 - m256i cmp = _mm256_and_si256( - _mm256_cmpeq_epi16(_a, F3minus1_m256i_u16), - _mm256_cmpeq_epi16(_b, F3minus1_m256i_u16)); - - const m256i one = _mm256_set1_epi16(1); - c = _mm256_add_epi16(c, _mm256_and_si256(one, cmp)); - - // Modulo - return mod_after_multiply(c); -} - -/** Perform multiplication of two numbers a, b whose elements are of GF(card) - * where `card` is a prime Fermat number, i.e. 
card = Fx with x < 5 - * Currently, it supports only for F3 - */ -inline m256i mul(m256i a, m256i b, aint16 card) -{ - // FIXME: generalize card - assert(card == F3); - return mul(a, b); -} - -/** Apply an element-wise negation to a buffer - */ -inline void neg(size_t len, aint16* buf, aint16 card = F3) -{ - m256i* _buf = reinterpret_cast(buf); - unsigned ratio = sizeof(*_buf) / sizeof(*buf); - size_t _len = len / ratio; - size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - _buf[i] = neg(_buf[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - if (buf[i]) - buf[i] = card - buf[i]; - } - } -} - -/** Perform a multiplication of a coefficient `a` to each element of `src` and - * add result to correspondent element of `dest` - */ -inline void mul_coef_to_buf( - const aint16 a, - aint16* src, - aint16* dest, - size_t len, - aint16 card = F3) -{ - const m256i coef = _mm256_set1_epi16(a); - - m256i* _src = reinterpret_cast(src); - m256i* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform multiplication - _dest[i] = mul(coef, _src[i], card); - } - if (_last_len > 0) { - uint32_t coef_doubled = (uint32_t)a; - for (i = _len * ratio; i < len; i++) { - // perform multiplication - dest[i] = (aint16)((coef_doubled * src[i]) % card); - } - } -} - -inline void -add_two_bufs(aint16* src, aint16* dest, size_t len, aint16 card = F3) -{ - m256i* _src = reinterpret_cast(src); - m256i* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform addition - _dest[i] = add(_src[i], _dest[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform addition - 
aint16 tmp = src[i] + dest[i]; - dest[i] = (tmp >= card) ? (tmp - card) : tmp; - } - } -} - -inline void sub_two_bufs( - aint16* bufa, - aint16* bufb, - aint16* res, - size_t len, - aint16 card = F3) -{ - m256i* _bufa = reinterpret_cast(bufa); - m256i* _bufb = reinterpret_cast(bufb); - m256i* _res = reinterpret_cast(res); - const unsigned ratio = sizeof(*_bufa) / sizeof(*bufa); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform subtraction - _res[i] = sub(_bufa[i], _bufb[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform subtraction - if (bufa[i] >= bufb[i]) - res[i] = bufa[i] - bufb[i]; - else - res[i] = card - (bufb[i] - bufa[i]); - } - } -} - -inline void -mul_two_bufs(aint16* src, aint16* dest, size_t len, aint16 card = F3) -{ - m256i* _src = reinterpret_cast(src); - m256i* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform multiplicaton - _dest[i] = mul(_src[i], _dest[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform multiplicaton - dest[i] = (uint32_t(src[i]) * dest[i]) % card; - } - } -} - -/* - * buf1[i] = buf1[i] + coef * buf2[i] - * buf2[i] = buf1[i] - coef * buf2[i] - */ -inline void butterfly_ct( - uint16_t coef, - aint16* buf1, - aint16* buf2, - size_t len, - uint16_t card = F3) -{ - const m256i _coef = _mm256_set1_epi16(coef); - m256i* _buf1 = reinterpret_cast(buf1); - m256i* _buf2 = reinterpret_cast(buf2); - - for (size_t i = 0; i < len; ++i) { - m256i a = mul(_coef, _buf2[i], card); - _buf2[i] = sub(_buf1[i], a, card); - _buf1[i] = add(_buf1[i], a, card); - } -} - -/* - * buf1[i] = buf1[i] + buf2[i] - * buf2[i] = coef * (buf1[i] - buf2[i]) - */ -inline void butterfly_gs( - uint16_t coef, - aint16* buf1, - 
aint16* buf2, - size_t len, - uint16_t card = F3) -{ - const m256i _coef = _mm256_set1_epi16(coef); - m256i* _buf1 = reinterpret_cast(buf1); - m256i* _buf2 = reinterpret_cast(buf2); - - for (size_t i = 0; i < len; ++i) { - m256i a = _buf1[i]; - m256i b = _buf2[i]; - m256i c = sub(a, b, card); - _buf1[i] = add(a, b, card); - _buf2[i] = mul(_coef, c, card); - } -} - -inline void encode_post_process( - vec::Buffers& output, - std::vector& props, - off_t offset, - unsigned code_len, - uint16_t threshold, - size_t vecs_nb) -{ - const unsigned vec_size = simd::countof(); - - const m256i _threshold = _mm256_set1_epi16(threshold); - uint16_t max = 1 << (sizeof(uint16_t) * 8 - 1); - const m256i mask_hi = _mm256_set1_epi16(max); - const unsigned element_size = sizeof(uint16_t); - - for (unsigned frag_id = 0; frag_id < code_len; ++frag_id) { - uint16_t* chunk = output.get(frag_id); - m256i* buf = reinterpret_cast(chunk); - for (unsigned vec_id = 0; vec_id < vecs_nb; ++vec_id) { - const m256i a = _mm256_load_si256(&(buf[vec_id])); - const m256i b = _mm256_cmpeq_epi16(_threshold, a); - const m256i c = _mm256_and_si256(mask_hi, b); - uint32_t d = _mm256_movemask_epi8(c); - - while (d > 0) { - unsigned byte_idx = __builtin_ctz(d); - unsigned element_idx = byte_idx / element_size; - off_t _offset = offset + vec_id * vec_size + element_idx; - props[frag_id].add(_offset, 1); - d ^= 1 << byte_idx; - } - } - } -} - -} // namespace simd -} // namespace quadiron - -#endif diff --git a/src/simd_256_u32.h b/src/simd_256_u32.h deleted file mode 100644 index 9c76c89b..00000000 --- a/src/simd_256_u32.h +++ /dev/null @@ -1,466 +0,0 @@ -/* - * Copyright 2017-2018 Scality - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. 
Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ - -#ifndef __QUAD_SIMD_256_U32_H__ -#define __QUAD_SIMD_256_U32_H__ - -#include - -#include "simd/simd.h" - -namespace quadiron { -namespace simd { - -/* ==================== Essential Operations =================== */ - -/** Perform a%card where a is a addition of two numbers whose elements are - * symbols of GF(card) */ -inline m256i mod_after_add(m256i a, aint32 card) -{ - const m256i _card = _mm256_set1_epi32(card); - const m256i _card_minus_1 = _mm256_set1_epi32(card - 1); - - m256i cmp = _mm256_cmpgt_epi32(a, _card_minus_1); - m256i b = _mm256_sub_epi32(a, _mm256_and_si256(_card, cmp)); - - return b; -} - -/** Perform addition of two numbers a, b whose elements are of GF(card) */ -inline m256i add(m256i a, m256i b, aint32 card) -{ - m256i _a = _mm256_load_si256(&a); - m256i _b = _mm256_load_si256(&b); - m256i c = _mm256_add_epi32(_a, _b); - - // Modulo - return mod_after_add(c, card); -} - -/** Perform subtraction of a by b where a, b whose elements are symbols of - * GF(card) - * sub(a, b) = a - b if a >= b, or - * card + a - b, otherwise - */ -inline m256i sub(m256i a, m256i b, aint32 card) -{ - const m256i _card = _mm256_set1_epi32(card); - - m256i _a = _mm256_load_si256(&a); - m256i _b = _mm256_load_si256(&b); - - m256i cmp = _mm256_cmpgt_epi32(_b, _a); - m256i _a1 = _mm256_add_epi32(_a, _mm256_and_si256(_card, cmp)); - - return _mm256_sub_epi32(_a1, _b); -} - -/** Negate `a` - * @return 0 if (a == 0), else card - a - */ -inline m256i neg(m256i a, aint32 card = F4) -{ - const m256i _card = _mm256_set1_epi32(card); - m256i _a = _mm256_load_si256(&a); - m256i _b = _mm256_setzero_si256(); - - m256i cmp = _mm256_cmpgt_epi32(_a, _b); - - return _mm256_sub_epi32(_mm256_and_si256(cmp, _card), _a); -} - -/** Perform a%card where a is a multiplication of two numbers whose elements are - * symbols of GF(F4) - * - * We find v in a = u * card + v - * a is expressed also as: a = hi * (card-1) + lo - * where hi and lo is 16-bit for F4 (or 8-bit for F3) high and 
low parts of a - * hence, v = (lo - hi) % F4 - * v = lo - hi, if lo >= hi - * or - * F4 + lo - hi, otherwise - */ -inline m256i mod_after_multiply_f4(m256i a) -{ - const m256i mask = _mm256_set1_epi32(F4 - 2); - - m256i lo = _mm256_and_si256(a, mask); - - m256i a_shift = _mm256_srli_si256(a, 2); - m256i hi = _mm256_and_si256(a_shift, mask); - - m256i cmp = _mm256_cmpgt_epi32(hi, lo); - m256i _lo = _mm256_add_epi32(lo, _mm256_and_si256(F4_m256i, cmp)); - - return _mm256_sub_epi32(_lo, hi); -} - -inline m256i mod_after_multiply_f3(m256i a) -{ - const m256i mask = _mm256_set1_epi32(F3 - 2); - - m256i lo = _mm256_and_si256(a, mask); - - m256i a_shift = _mm256_srli_si256(a, 1); - m256i hi = _mm256_and_si256(a_shift, mask); - - m256i cmp = _mm256_cmpgt_epi32(hi, lo); - m256i _lo = _mm256_add_epi32(lo, _mm256_and_si256(F3_m256i, cmp)); - - return _mm256_sub_epi32(_lo, hi); -} - -inline m256i mul_f4(m256i a, m256i b) -{ - m256i _a = _mm256_load_si256(&a); - m256i _b = _mm256_load_si256(&b); - - m256i c = _mm256_mullo_epi32(_a, _b); - - // filter elements of both of a & b = card-1 - m256i cmp = _mm256_and_si256( - _mm256_cmpeq_epi32(_a, F4minus1_m256i), - _mm256_cmpeq_epi32(_b, F4minus1_m256i)); - - const m256i one = _mm256_set1_epi32(1); - c = _mm256_add_epi32(c, _mm256_and_si256(one, cmp)); - - // Modulo - return mod_after_multiply_f4(c); -} - -inline m256i mul_f3(m256i a, m256i b) -{ - m256i _a = _mm256_load_si256(&a); - m256i _b = _mm256_load_si256(&b); - - m256i c = _mm256_mullo_epi32(_a, _b); - - // filter elements of both of a & b = card-1 - m256i cmp = _mm256_and_si256( - _mm256_cmpeq_epi32(_a, F3minus1_m256i), - _mm256_cmpeq_epi32(_b, F3minus1_m256i)); - - c = _mm256_xor_si256(c, _mm256_and_si256(F4_m256i, cmp)); - - // Modulo - return mod_after_multiply_f3(c); -} - -/** Perform multiplication of two numbers a, b whose elements are of GF(card) - * where `card` is a prime Fermat number, i.e. 
card = Fx with x < 5 - * Currently, it supports only for F3 and F4 - */ -inline m256i mul(m256i a, m256i b, aint32 card) -{ - assert(card == F4 || card == F3); - if (card == F4) - return mul_f4(a, b); - return mul_f3(a, b); -} - -/** Apply an element-wise negation to a buffer - */ -inline void neg(size_t len, aint32* buf, aint32 card = F4) -{ - m256i* _buf = reinterpret_cast(buf); - unsigned ratio = sizeof(*_buf) / sizeof(*buf); - size_t _len = len / ratio; - size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - _buf[i] = neg(_buf[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - if (buf[i]) - buf[i] = card - buf[i]; - } - } -} - -/** Perform a multiplication of a coefficient `a` to each element of `src` and - * add result to correspondent element of `dest` - */ -inline void mul_coef_to_buf( - const aint32 a, - aint32* src, - aint32* dest, - size_t len, - aint32 card = F4) -{ - const m256i coef = _mm256_set1_epi32(a); - - m256i* _src = reinterpret_cast(src); - m256i* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform multiplication - _dest[i] = mul(coef, _src[i], card); - } - if (_last_len > 0) { - uint64_t coef_64 = (uint64_t)a; - for (i = _len * ratio; i < len; i++) { - // perform multiplication - dest[i] = (aint32)((coef_64 * src[i]) % card); - } - } -} - -inline void -add_two_bufs(aint32* src, aint32* dest, size_t len, aint32 card = F4) -{ - m256i* _src = reinterpret_cast(src); - m256i* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform addition - _dest[i] = add(_src[i], _dest[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { 
- // perform addition - aint32 tmp = src[i] + dest[i]; - dest[i] = (tmp >= card) ? (tmp - card) : tmp; - } - } -} - -inline void sub_two_bufs( - aint32* bufa, - aint32* bufb, - aint32* res, - size_t len, - aint32 card = F4) -{ - m256i* _bufa = reinterpret_cast(bufa); - m256i* _bufb = reinterpret_cast(bufb); - m256i* _res = reinterpret_cast(res); - const unsigned ratio = sizeof(*_bufa) / sizeof(*bufa); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform subtraction - _res[i] = sub(_bufa[i], _bufb[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform subtraction - if (bufa[i] >= bufb[i]) - res[i] = bufa[i] - bufb[i]; - else - res[i] = card - (bufb[i] - bufa[i]); - } - } -} - -inline void -mul_two_bufs(aint32* src, aint32* dest, size_t len, aint32 card = F4) -{ - m256i* _src = reinterpret_cast(src); - m256i* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform multiplicaton - _dest[i] = mul(_src[i], _dest[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform multiplicaton - dest[i] = uint32_t((uint64_t(src[i]) * dest[i]) % card); - } - } -} - -/* - * buf1[i] = buf1[i] + coef * buf2[i] - * buf2[i] = buf1[i] - coef * buf2[i] - */ -inline void butterfly_ct( - uint32_t coef, - aint32* buf1, - aint32* buf2, - size_t len, - uint32_t card = F4) -{ - const m256i _coef = _mm256_set1_epi32(coef); - m256i* _buf1 = reinterpret_cast(buf1); - m256i* _buf2 = reinterpret_cast(buf2); - - for (size_t i = 0; i < len; ++i) { - m256i a = mul(_coef, _buf2[i], card); - _buf2[i] = sub(_buf1[i], a, card); - _buf1[i] = add(_buf1[i], a, card); - } -} - -/* - * buf1[i] = buf1[i] + buf2[i] - * buf2[i] = coef * (buf1[i] - buf2[i]) - */ -inline void butterfly_gs( - 
uint32_t coef, - aint32* buf1, - aint32* buf2, - size_t len, - uint32_t card = F4) -{ - const m256i _coef = _mm256_set1_epi32(coef); - m256i* _buf1 = reinterpret_cast(buf1); - m256i* _buf2 = reinterpret_cast(buf2); - - for (size_t i = 0; i < len; ++i) { - m256i a = add(_buf1[i], _buf2[i], card); - _buf2[i] = mul(_coef, sub(_buf1[i], _buf2[i], card), card); - _buf1[i] = a; - } -} - -inline void encode_post_process( - vec::Buffers& output, - std::vector& props, - off_t offset, - unsigned code_len, - uint32_t threshold, - size_t vecs_nb) -{ - const unsigned vec_size = simd::countof(); - - const m256i _threshold = _mm256_set1_epi32(threshold); - const uint32_t max = 1 << (sizeof(uint32_t) * 8 - 1); - const m256i mask_hi = _mm256_set1_epi32(max); - const unsigned element_size = sizeof(uint32_t); - - for (unsigned frag_id = 0; frag_id < code_len; ++frag_id) { - uint32_t* chunk = output.get(frag_id); - m256i* buf = reinterpret_cast(chunk); - for (unsigned vec_id = 0; vec_id < vecs_nb; ++vec_id) { - const m256i a = _mm256_load_si256(&(buf[vec_id])); - const m256i b = _mm256_cmpeq_epi32(_threshold, a); - const m256i c = _mm256_and_si256(mask_hi, b); - uint32_t d = _mm256_movemask_epi8(c); - - while (d > 0) { - unsigned byte_idx = __builtin_ctz(d); - unsigned element_idx = byte_idx / element_size; - off_t _offset = offset + vec_id * vec_size + element_idx; - props[frag_id].add(_offset, 1); - d ^= 1 << byte_idx; - } - } - } -} - -/* ==================== Operations for NF4 =================== */ -typedef __m128i m128i; - -/** Return aint128 integer from a _m128i register */ -inline aint128 m256i_to_uint128(m256i v) -{ - aint128 hi, lo; - _mm256_storeu2_m128i((m128i*)&hi, (m128i*)&lo, v); - return lo; // NOLINT(clang-analyzer-core.uninitialized.UndefReturn) -} - -inline __uint128_t add(__uint128_t a, __uint128_t b) -{ - m256i _a = _mm256_castsi128_si256((m128i)a); - m256i _b = _mm256_castsi128_si256((m128i)b); - m256i res = add(_a, _b, F4); - return m256i_to_uint128(res); -} - 
-inline __uint128_t sub(__uint128_t a, __uint128_t b) -{ - m256i _a = _mm256_castsi128_si256((m128i)a); - m256i _b = _mm256_castsi128_si256((m128i)b); - m256i res = sub(_a, _b, F4); - return m256i_to_uint128(res); -} - -inline __uint128_t mul(__uint128_t a, __uint128_t b) -{ - m256i _a = _mm256_castsi128_si256((m128i)a); - m256i _b = _mm256_castsi128_si256((m128i)b); - m256i res = mul(_a, _b, F4); - return m256i_to_uint128(res); -} - -/** Store low 128-bit part of `reg` to memory */ -inline void store_low(aint128* address, m256i reg) -{ - _mm_store_si128((m128i*)address, _mm256_castsi256_si128(reg)); -} - -inline void hadamard_mul(int n, aint128* _x, aint128* _y) -{ - int i; - m256i* x = reinterpret_cast(_x); - m256i* y = reinterpret_cast(_y); - - const unsigned ratio = sizeof(*x) / sizeof(*_x); - const int len_256 = n / ratio; - const int last_len = n - len_256 * ratio; - - // multiply y to the first half of `x` - for (i = 0; i < len_256; i++) { - x[i] = mul(x[i], y[i], F4); - } - - if (last_len > 0) { - // add last _y[] to x - for (i = len_256 * ratio; i < n; i++) { - m256i _x_p = _mm256_castsi128_si256((m128i)_x[i]); - m256i _y_p = _mm256_castsi128_si256((m128i)_y[i]); - - store_low(_x + i, mul(_x_p, _y_p, F4)); - } - } -} - -} // namespace simd -} // namespace quadiron - -#endif diff --git a/src/simd_nf4.h b/src/simd_nf4.h deleted file mode 100644 index 7d517430..00000000 --- a/src/simd_nf4.h +++ /dev/null @@ -1,172 +0,0 @@ -/* - * Copyright 2017-2018 Scality - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. 
Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ - -#ifndef __QUAD_SIMD_NF4_H__ -#define __QUAD_SIMD_NF4_H__ - -#include - -#include - -namespace quadiron { -namespace simd { - -#ifdef __AVX2__ -typedef __m128i m128i; - -/** Return aint128 integer from a _m128i register */ -static inline aint128 m128i_to_uint128(m128i v) -{ - aint128 i; - _mm_store_si128((m128i*)&i, v); - - return i; // NOLINT(clang-analyzer-core.uninitialized.UndefReturn) -} -#endif // #ifdef __AVX2__ - -inline aint128 expand16(uint16_t* arr, int n) -{ - // since n <= 4 - uint16_t _arr[4] __attribute__((aligned(ALIGNMENT))) = {0, 0, 0, 0}; - std::copy_n(arr, n, _arr); - - m128i b = _mm_set_epi16(0, 0, 0, 0, _arr[3], _arr[2], _arr[1], _arr[0]); - - return m128i_to_uint128(b); -} - -inline aint128 expand32(uint32_t* arr, int n) -{ - // since n <= 4 - uint32_t _arr[4] __attribute__((aligned(simd::ALIGNMENT))) = {0, 0, 0, 0}; - std::copy_n(arr, n, _arr); - - m128i b = _mm_set_epi32(_arr[3], _arr[2], _arr[1], _arr[0]); - - return m128i_to_uint128(b); -} - -inline GroupedValues<__uint128_t> unpack(__uint128_t a, int n) -{ - uint16_t ai[8]; - aint128 values; - - m128i _a = _mm_loadu_si128((m128i*)&a); - ai[0] = _mm_extract_epi16(_a, 0); - ai[1] = _mm_extract_epi16(_a, 1); - ai[2] = _mm_extract_epi16(_a, 2); - ai[3] = _mm_extract_epi16(_a, 3); - ai[4] = _mm_extract_epi16(_a, 4); - ai[5] = _mm_extract_epi16(_a, 5); - ai[6] = _mm_extract_epi16(_a, 6); - ai[7] = _mm_extract_epi16(_a, 7); - - const uint32_t flag = - ai[1] | (!!ai[3] << 1u) | (!!ai[5] << 2u) | (!!ai[7] << 3u); - - m128i val = _mm_set_epi16(0, 0, 0, 0, ai[6], ai[4], ai[2], ai[0]); - _mm_store_si128((m128i*)&values, val); - - GroupedValues<__uint128_t> b = {values, flag}; - - return b; -} - -inline void unpack(__uint128_t a, GroupedValues<__uint128_t>& b, int n) -{ - uint16_t ai[8]; - aint128 values; - - m128i _a = _mm_loadu_si128((m128i*)&a); - ai[0] = _mm_extract_epi16(_a, 0); - ai[1] = _mm_extract_epi16(_a, 1); - ai[2] = _mm_extract_epi16(_a, 2); - ai[3] = _mm_extract_epi16(_a, 3); - 
ai[4] = _mm_extract_epi16(_a, 4); - ai[5] = _mm_extract_epi16(_a, 5); - ai[6] = _mm_extract_epi16(_a, 6); - ai[7] = _mm_extract_epi16(_a, 7); - - const uint32_t flag = - ai[1] | (!!ai[3] << 1u) | (!!ai[5] << 2u) | (!!ai[7] << 3u); - - m128i val = _mm_set_epi16(0, 0, 0, 0, ai[6], ai[4], ai[2], ai[0]); - _mm_store_si128((m128i*)&values, val); - - b.flag = flag; - b.values = values; // NOLINT(clang-analyzer-core.uninitialized.Assign) -} - -inline aint128 pack(__uint128_t a) -{ - m128i _a = _mm_loadu_si128((m128i*)&a); - m128i b = _mm_set_epi32( - _mm_extract_epi16(_a, 3), - _mm_extract_epi16(_a, 2), - _mm_extract_epi16(_a, 1), - _mm_extract_epi16(_a, 0)); - - return m128i_to_uint128(b); -} - -inline aint128 pack(__uint128_t a, uint32_t flag) -{ - aint32 b0, b1, b2, b3; - m128i _a = _mm_loadu_si128((m128i*)&a); - - if (flag & 1) - b0 = 65536; - else - b0 = _mm_extract_epi16(_a, 0); - flag >>= 1; - if (flag & 1) - b1 = 65536; - else - b1 = _mm_extract_epi16(_a, 1); - flag >>= 1; - if (flag & 1) - b2 = 65536; - else - b2 = _mm_extract_epi16(_a, 2); - flag >>= 1; - if (flag & 1) - b3 = 65536; - else - b3 = _mm_extract_epi16(_a, 3); - - m128i b = _mm_set_epi32(b3, b2, b1, b0); - - return m128i_to_uint128(b); -} - -} // namespace simd -} // namespace quadiron - -#endif diff --git a/src/vec_buffers.h b/src/vec_buffers.h index d31c69d3..2122d300 100644 --- a/src/vec_buffers.h +++ b/src/vec_buffers.h @@ -38,7 +38,7 @@ #include #include "core.h" -#include "simd/simd.h" +#include "simd/allocator.h" namespace quadiron { namespace vec { diff --git a/test/simd/test_allocator.cpp b/test/simd/test_allocator.cpp index a1d59034..09edb70f 100644 --- a/test/simd/test_allocator.cpp +++ b/test/simd/test_allocator.cpp @@ -32,7 +32,7 @@ #include -#include "simd/simd.h" +#include "simd/allocator.h" namespace simd = quadiron::simd; diff --git a/test/simd/test_definitions.cpp b/test/simd/test_definitions.cpp index c7a48975..bde45d05 100644 --- a/test/simd/test_definitions.cpp +++ 
b/test/simd/test_definitions.cpp @@ -29,7 +29,7 @@ */ #include -#include "simd/simd.h" +#include "simd/definitions.h" namespace simd = quadiron::simd;