From 0b2852fb59e6d3988b0d79230558baa62f9ba28d Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Thu, 13 Sep 2018 15:28:53 +0200
Subject: [PATCH 01/77] Move OOR_MARK definition to property.h

---
 src/fec_base.h | 2 --
 src/property.h | 2 ++
 2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/src/fec_base.h b/src/fec_base.h
index 13ac78d9..741d480f 100644
--- a/src/fec_base.h
+++ b/src/fec_base.h
@@ -74,8 +74,6 @@ static inline uint64_t hrtime_usec(timeval begin)
     return 1000000 * (tv.tv_sec - begin.tv_sec) + tv.tv_usec - begin.tv_usec;
 }
 
-#define OOR_MARK 1
-
 enum class FecType {
     /** Systematic code
      *
diff --git a/src/property.h b/src/property.h
index 766acea7..2ecdc65e 100644
--- a/src/property.h
+++ b/src/property.h
@@ -40,6 +40,8 @@
 
 namespace quadiron {
 
+#define OOR_MARK 1
+
 /** Ancillary data attached to values.
  *
  * A property carries extra-information (whose interpretation is left to the

From 897734065e649f309417398c64f88c0c3347ca92 Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Mon, 15 Oct 2018 11:37:09 +0200
Subject: [PATCH 02/77] FFT2n: specialize butterfly operations

---
 src/fft_2n.h | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 59 insertions(+)

diff --git a/src/fft_2n.h b/src/fft_2n.h
index f3da7174..8a63bbb2 100644
--- a/src/fft_2n.h
+++ b/src/fft_2n.h
@@ -602,6 +602,65 @@ void Radix2<T>::ifft(vec::Buffers<T>& output, vec::Buffers<T>& input)
     this->gf->mul_vec_to_vecp(*(this->vec_inv_n), output, output);
 }
 
+#ifdef QUADIRON_USE_SIMD
+
+/* Operations are vectorized by SIMD */
+template <>
+void Radix2<uint16_t>::butterfly_ct_two_layers_step(
+    vec::Buffers<uint16_t>& buf,
+    unsigned start,
+    unsigned m);
+template <>
+void Radix2<uint16_t>::butterfly_ct_step(
+    vec::Buffers<uint16_t>& buf,
+    uint16_t r,
+    unsigned start,
+    unsigned m,
+    unsigned step);
+template <>
+void Radix2<uint16_t>::butterfly_gs_step(
+    vec::Buffers<uint16_t>& buf,
+    uint16_t coef,
+    unsigned start,
+    unsigned m,
+    unsigned step);
+template <>
+void Radix2<uint16_t>::butterfly_gs_step_simple(
+    vec::Buffers<uint16_t>& buf,
+    uint16_t coef,
+    unsigned start,
+    unsigned m,
+    unsigned step);
+
+template <>
+void Radix2<uint32_t>::butterfly_ct_two_layers_step(
+    vec::Buffers<uint32_t>& buf,
+    unsigned start,
+    unsigned m);
+template <>
+void Radix2<uint32_t>::butterfly_ct_step(
+    vec::Buffers<uint32_t>& buf,
+    uint32_t r,
+    unsigned start,
+    unsigned m,
+    unsigned step);
+template <>
+void Radix2<uint32_t>::butterfly_gs_step(
+    vec::Buffers<uint32_t>& buf,
+    uint32_t coef,
+    unsigned start,
+    unsigned m,
+    unsigned step);
+template <>
+void Radix2<uint32_t>::butterfly_gs_step_simple(
+    vec::Buffers<uint32_t>& buf,
+    uint32_t coef,
+    unsigned start,
+    unsigned m,
+    unsigned step);
+
+#endif // #ifdef QUADIRON_USE_SIMD
+
 } // namespace fft
 } // namespace quadiron
 

From d301e489e5dd5a108772d17004cda2f99c1f24f3 Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Fri, 5 Oct 2018 13:45:34 +0200
Subject: [PATCH 03/77] CMakeLists: add fft_2n.cpp file

---
 src/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index aff9c341..3ff590af 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -31,6 +31,7 @@ include(GNUInstallDirs)
 set(LIB_SRC
   ${SOURCE_DIR}/core.cpp
   ${SOURCE_DIR}/fec_vectorisation.cpp
+  ${SOURCE_DIR}/fft_2n.cpp
   ${SOURCE_DIR}/misc.cpp
   ${SOURCE_DIR}/gf_nf4.cpp
   ${SOURCE_DIR}/gf_ring.cpp

From c7c67434f3edb4554030a7e1c0b28e3f6356d3be Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Fri, 5 Oct 2018 13:41:40 +0200
Subject: [PATCH 04/77] FFT2n.cpp: implement specialized operations

---
 src/fec_vectorisation.cpp |   2 +-
 src/fft_2n.cpp            | 272 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 273 insertions(+), 1 deletion(-)
 create mode 100644 src/fft_2n.cpp

diff --git a/src/fec_vectorisation.cpp b/src/fec_vectorisation.cpp
index 2564ed7d..8684e1ab 100644
--- a/src/fec_vectorisation.cpp
+++ b/src/fec_vectorisation.cpp
@@ -32,7 +32,7 @@
 #include "fec_rs_fnt.h"
 
 /*
- * The file includes vectorized operations used by FEC classes
+ * The file includes specialized operations used by FEC classes
  */
 
 #ifdef QUADIRON_USE_SIMD
diff --git a/src/fft_2n.cpp b/src/fft_2n.cpp
new file mode 100644
index 00000000..f3d8847f
--- /dev/null
+++ b/src/fft_2n.cpp
@@ -0,0 +1,272 @@
+/* -*- mode: c++ -*- */
+/*
+ * Copyright 2017-2018 Scality
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "fft_2n.h"
+
+/*
+ * The file includes vectorized operations used by Radix2 classes
+ */
+
+#ifdef QUADIRON_USE_SIMD
+
+#include "simd.h"
+
+namespace quadiron {
+namespace fft {
+
+template <>
+void Radix2<uint16_t>::butterfly_ct_two_layers_step(
+    vec::Buffers<uint16_t>& buf,
+    unsigned start,
+    unsigned m)
+{
+    const unsigned ratio = simd::countof<uint16_t>();
+    const size_t len = this->pkt_size;
+    const size_t vec_len = len / ratio;
+    const size_t last_len = len - vec_len * ratio;
+    const unsigned coefIndex = start * this->n / m / 2;
+    const uint16_t r1 = vec_W[coefIndex];
+    const uint16_t r2 = vec_W[coefIndex / 2];
+    const uint16_t r3 = vec_W[coefIndex / 2 + this->n / 4];
+
+    // perform vector operations
+    simd::butterfly_ct_two_layers_step(
+        buf, r1, r2, r3, start, m, vec_len, card);
+
+    // for last elements, perform as non-SIMD method
+    if (last_len > 0) {
+        const unsigned step = m << 2;
+        size_t offset = vec_len * ratio;
+        //  ---------
+        // First layer
+        //  ---------
+        const uint16_t r1 = W->get(start * this->n / m / 2);
+        // first pair
+        butterfly_ct_step_slow(buf, r1, start, m, step, offset);
+        // second pair
+        butterfly_ct_step_slow(buf, r1, start + 2 * m, m, step, offset);
+        //  ---------
+        // Second layer
+        //  ---------
+        // first pair
+        const uint16_t r2 = W->get(start * this->n / m / 4);
+        butterfly_ct_step_slow(buf, r2, start, 2 * m, step, offset);
+        // second pair
+        const uint16_t r3 = W->get((start + m) * this->n / m / 4);
+        butterfly_ct_step_slow(buf, r3, start + m, 2 * m, step, offset);
+    }
+}
+
+template <>
+void Radix2<uint16_t>::butterfly_ct_step(
+    vec::Buffers<uint16_t>& buf,
+    uint16_t r,
+    unsigned start,
+    unsigned m,
+    unsigned step)
+{
+    const unsigned ratio = simd::countof<uint16_t>();
+    const size_t len = this->pkt_size;
+    const size_t vec_len = len / ratio;
+    const size_t last_len = len - vec_len * ratio;
+
+    // perform vector operations
+    simd::butterfly_ct_step(buf, r, start, m, step, vec_len, card);
+
+    // for last elements, perform as non-SIMD method
+    if (last_len > 0) {
+        size_t offset = vec_len * ratio;
+        butterfly_ct_step_slow(buf, r, start, m, step, offset);
+    }
+}
+
+template <>
+void Radix2<uint16_t>::butterfly_gs_step(
+    vec::Buffers<uint16_t>& buf,
+    uint16_t coef,
+    unsigned start,
+    unsigned m,
+    unsigned step)
+{
+    const unsigned ratio = simd::countof<uint16_t>();
+    const size_t len = this->pkt_size;
+    const size_t vec_len = len / ratio;
+    const size_t last_len = len - vec_len * ratio;
+
+    // perform vector operations
+    simd::butterfly_gs_step(buf, coef, start, m, vec_len, card);
+
+    // for last elements, perform as non-SIMD method
+    if (last_len > 0) {
+        size_t offset = vec_len * ratio;
+        butterfly_gs_step_slow(buf, coef, start, m, step, offset);
+    }
+}
+
+template <>
+void Radix2<uint16_t>::butterfly_gs_step_simple(
+    vec::Buffers<uint16_t>& buf,
+    uint16_t coef,
+    unsigned start,
+    unsigned m,
+    unsigned step)
+{
+    const unsigned ratio = simd::countof<uint16_t>();
+    const size_t len = this->pkt_size;
+    const size_t vec_len = len / ratio;
+    const size_t last_len = len - vec_len * ratio;
+
+    // perform vector operations
+    simd::butterfly_gs_step_simple(buf, coef, start, m, vec_len, card);
+
+    // for last elements, perform as non-SIMD method
+    if (last_len > 0) {
+        size_t offset = vec_len * ratio;
+        butterfly_gs_step_simple_slow(buf, coef, start, m, step, offset);
+    }
+}
+
+template <>
+void Radix2<uint32_t>::butterfly_ct_two_layers_step(
+    vec::Buffers<uint32_t>& buf,
+    unsigned start,
+    unsigned m)
+{
+    const unsigned ratio = simd::countof<uint32_t>();
+    const size_t len = this->pkt_size;
+    const size_t vec_len = len / ratio;
+    const size_t last_len = len - vec_len * ratio;
+    const unsigned coefIndex = start * this->n / m / 2;
+    const uint32_t r1 = vec_W[coefIndex];
+    const uint32_t r2 = vec_W[coefIndex / 2];
+    const uint32_t r3 = vec_W[coefIndex / 2 + this->n / 4];
+
+    // perform vector operations
+    simd::butterfly_ct_two_layers_step(
+        buf, r1, r2, r3, start, m, vec_len, card);
+
+    // for last elements, perform as non-SIMD method
+    if (last_len > 0) {
+        const unsigned step = m << 2;
+        size_t offset = vec_len * ratio;
+        //  ---------
+        // First layer
+        //  ---------
+        const uint32_t r1 = W->get(start * this->n / m / 2);
+        // first pair
+        butterfly_ct_step_slow(buf, r1, start, m, step, offset);
+        // second pair
+        butterfly_ct_step_slow(buf, r1, start + 2 * m, m, step, offset);
+        //  ---------
+        // Second layer
+        //  ---------
+        // first pair
+        const uint32_t r2 = W->get(start * this->n / m / 4);
+        butterfly_ct_step_slow(buf, r2, start, 2 * m, step, offset);
+        // second pair
+        const uint32_t r3 = W->get((start + m) * this->n / m / 4);
+        butterfly_ct_step_slow(buf, r3, start + m, 2 * m, step, offset);
+    }
+}
+
+template <>
+void Radix2<uint32_t>::butterfly_ct_step(
+    vec::Buffers<uint32_t>& buf,
+    uint32_t r,
+    unsigned start,
+    unsigned m,
+    unsigned step)
+{
+    const unsigned ratio = simd::countof<uint32_t>();
+    const size_t len = this->pkt_size;
+    const size_t vec_len = len / ratio;
+    const size_t last_len = len - vec_len * ratio;
+
+    // perform vector operations
+    simd::butterfly_ct_step(buf, r, start, m, step, vec_len, card);
+
+    // for last elements, perform as non-SIMD method
+    if (last_len > 0) {
+        size_t offset = vec_len * ratio;
+        butterfly_ct_step_slow(buf, r, start, m, step, offset);
+    }
+}
+
+template <>
+void Radix2<uint32_t>::butterfly_gs_step(
+    vec::Buffers<uint32_t>& buf,
+    uint32_t coef,
+    unsigned start,
+    unsigned m,
+    unsigned step)
+{
+    const unsigned ratio = simd::countof<uint32_t>();
+    const size_t len = this->pkt_size;
+    const size_t vec_len = len / ratio;
+    const size_t last_len = len - vec_len * ratio;
+
+    // perform vector operations
+    simd::butterfly_gs_step(buf, coef, start, m, vec_len, card);
+
+    // for last elements, perform as non-SIMD method
+    if (last_len > 0) {
+        size_t offset = vec_len * ratio;
+        butterfly_gs_step_slow(buf, coef, start, m, step, offset);
+    }
+}
+
+template <>
+void Radix2<uint32_t>::butterfly_gs_step_simple(
+    vec::Buffers<uint32_t>& buf,
+    uint32_t coef,
+    unsigned start,
+    unsigned m,
+    unsigned step)
+{
+    const unsigned ratio = simd::countof<uint32_t>();
+    const size_t len = this->pkt_size;
+    const size_t vec_len = len / ratio;
+    const size_t last_len = len - vec_len * ratio;
+
+    // perform vector operations
+    simd::butterfly_gs_step_simple(buf, coef, start, m, vec_len, card);
+
+    // for last elements, perform as non-SIMD method
+    if (last_len > 0) {
+        size_t offset = vec_len * ratio;
+        butterfly_gs_step_simple_slow(buf, coef, start, m, step, offset);
+    }
+}
+
+} // namespace fft
+} // namespace quadiron
+
+#endif // #ifdef QUADIRON_USE_SIMD

From ecde06e965b9240e9d238039fbb5219a89217a97 Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Fri, 5 Oct 2018 13:43:47 +0200
Subject: [PATCH 05/77] SIMD 128 u16 & u32: update

---
 src/simd_128_u16.h |  46 --------
 src/simd_128_u32.h | 259 +++++++++++++++++++++++++++++++++++++++------
 2 files changed, 228 insertions(+), 77 deletions(-)

diff --git a/src/simd_128_u16.h b/src/simd_128_u16.h
index a177cfb3..e13d8756 100644
--- a/src/simd_128_u16.h
+++ b/src/simd_128_u16.h
@@ -271,52 +271,6 @@ mul_two_bufs(aint16* src, aint16* dest, size_t len, aint16 card = F3)
     }
 }
 
-/*
- * buf1[i] = buf1[i] + coef * buf2[i]
- * buf2[i] = buf1[i] - coef * buf2[i]
- */
-inline void butterfly_ct(
-    uint16_t coef,
-    aint16* buf1,
-    aint16* buf2,
-    size_t len,
-    uint32_t card = F3)
-{
-    const m128i _coef = _mm_set1_epi16(coef);
-    m128i* _buf1 = reinterpret_cast<m128i*>(buf1);
-    m128i* _buf2 = reinterpret_cast<m128i*>(buf2);
-
-    for (size_t i = 0; i < len; ++i) {
-        m128i a = mul(_coef, _buf2[i], card);
-        _buf2[i] = sub(_buf1[i], a, card);
-        _buf1[i] = add(_buf1[i], a, card);
-    }
-}
-
-/*
- * buf1[i] = buf1[i] + buf2[i]
- * buf2[i] = coef * (buf1[i] - buf2[i])
- */
-inline void butterfly_gs(
-    uint16_t coef,
-    aint16* buf1,
-    aint16* buf2,
-    size_t len,
-    uint16_t card = F3)
-{
-    const m128i _coef = _mm_set1_epi16(coef);
-    m128i* _buf1 = reinterpret_cast<m128i*>(buf1);
-    m128i* _buf2 = reinterpret_cast<m128i*>(buf2);
-
-    for (size_t i = 0; i < len; ++i) {
-        m128i a = _buf1[i];
-        m128i b = _buf2[i];
-        m128i c = sub(a, b, card);
-        _buf1[i] = add(a, b, card);
-        _buf2[i] = mul(_coef, c, card);
-    }
-}
-
 inline void encode_post_process(
     vec::Buffers<uint16_t>& output,
     std::vector<Properties>& props,
diff --git a/src/simd_128_u32.h b/src/simd_128_u32.h
index 5039f0f8..80936a6e 100644
--- a/src/simd_128_u32.h
+++ b/src/simd_128_u32.h
@@ -156,6 +156,17 @@ inline m128i mul_f4(m128i a, m128i b)
     return mod_after_multiply_f4(c);
 }
 
+inline m128i mul_f4_simple(m128i a, m128i b)
+{
+    m128i _a = _mm_load_si128(&a);
+    m128i _b = _mm_load_si128(&b);
+
+    m128i c = _mm_mullo_epi32(_a, _b);
+
+    // Modulo
+    return mod_after_multiply_f4(c);
+}
+
 inline m128i mul_f3(m128i a, m128i b)
 {
     m128i _a = _mm_load_si128(&a);
@@ -174,6 +185,17 @@ inline m128i mul_f3(m128i a, m128i b)
     return mod_after_multiply_f3(c);
 }
 
+inline m128i mul_f3_simple(m128i a, m128i b)
+{
+    m128i _a = _mm_load_si128(&a);
+    m128i _b = _mm_load_si128(&b);
+
+    m128i c = _mm_mullo_epi32(_a, _b);
+
+    // Modulo
+    return mod_after_multiply_f3(c);
+}
+
 /** Perform multiplication of two numbers a, b whose elements are of GF(card)
  *  where `card` is a prime Fermat number, i.e. card = Fx with x < 5
  *  Currently, it supports only for F3 and F4
@@ -186,6 +208,14 @@ inline m128i mul(m128i a, m128i b, aint32 card)
     return mul_f3(a, b);
 }
 
+inline m128i mul_simple(m128i a, m128i b, aint32 card)
+{
+    assert(card == F4 || card == F3);
+    if (card == F4)
+        return mul_f4_simple(a, b);
+    return mul_f3_simple(a, b);
+}
+
 /** Apply an element-wise negation to a buffer
  */
 inline void neg(size_t len, aint32* buf, aint32 card = F4)
@@ -314,49 +344,216 @@ mul_two_bufs(aint32* src, aint32* dest, size_t len, aint32 card = F4)
     }
 }
 
-/*
- * buf1[i] = buf1[i] + coef * buf2[i]
- * buf2[i] = buf1[i] - coef * buf2[i]
- */
-inline void butterfly_ct(
-    uint32_t coef,
-    aint32* buf1,
-    aint32* buf2,
+// outputA = (inputA + inputB) mod card
+// outputB = (inputA - inputB) mod card
+inline void butterfly_step(
+    m128i* inputA,
+    m128i* inputB,
+    m128i* outputA,
+    m128i* outputB,
+    uint32_t _card)
+{
+    const m128i card = (_card == F3) ? F3_m128i : F4_m128i;
+    const m128i card_1 = (_card == F3) ? F3minus1_m128i : F4minus1_m128i;
+
+    // --------------------------------------
+    // outputB = inputA - inputB (card is added to inputA where inputB > inputA)
+    // --------------------------------------
+    m128i a = _mm_load_si128(inputA);
+    m128i b = _mm_load_si128(inputB);
+    m128i cmp_1 = _mm_cmpgt_epi32(b, a);
+    m128i res_1 = _mm_add_epi32(a, _mm_and_si128(card, cmp_1));
+
+    _mm_store_si128(outputB, _mm_sub_epi32(res_1, b));
+
+    // --------------------------------------
+    // outputA = inputA + inputB (both were loaded before the store above)
+    // --------------------------------------
+    m128i res_2 = _mm_add_epi32(a, b);
+    // modulo: subtract card from the lanes where the sum exceeds card - 1
+    m128i cmp_2 = _mm_cmpgt_epi32(res_2, card_1);
+    m128i c = _mm_sub_epi32(res_2, _mm_and_si128(card, cmp_2));
+
+    _mm_store_si128(outputA, c);
+}
+
+// for each pair (P, Q) = (buf[i], buf[i + m]):
+// P = P + Q
+// Q = P - Q
+inline void butterfly_ct_1(
+    vec::Buffers<uint32_t>& buf,
+    unsigned start,
+    unsigned m,
+    unsigned step,
     size_t len,
     uint32_t card = F4)
 {
-    const m128i _coef = _mm_set1_epi32(coef);
-    m128i* _buf1 = reinterpret_cast<m128i*>(buf1);
-    m128i* _buf2 = reinterpret_cast<m128i*>(buf2);
+    for (int i = start; i < buf.get_n(); i += step) {
+        uint32_t* a = buf.get(i);
+        uint32_t* b = buf.get(i + m);
+        m128i* _a = reinterpret_cast<m128i*>(a);
+        m128i* _b = reinterpret_cast<m128i*>(b);
+        // perform butterfly operation for Cooley-Tukey FFT algorithm
+        for (size_t j = 0; j < len; ++j) {
+            butterfly_step(&(_a[j]), &(_b[j]), &(_a[j]), &(_b[j]), card);
+        }
+    }
+}
 
-    for (size_t i = 0; i < len; ++i) {
-        m128i a = mul(_coef, _buf2[i], card);
-        _buf2[i] = sub(_buf1[i], a, card);
-        _buf1[i] = add(_buf1[i], a, card);
+// for each pair (P, Q) = (buf[i], buf[i + m]):
+// P = P - Q
+// Q = P + Q
+inline void butterfly_ct_2(
+    vec::Buffers<uint32_t>& buf,
+    unsigned start,
+    unsigned m,
+    unsigned step,
+    size_t len,
+    uint32_t card = F4)
+{
+    for (int i = start; i < buf.get_n(); i += step) {
+        uint32_t* a = buf.get(i);
+        uint32_t* b = buf.get(i + m);
+        m128i* _a = reinterpret_cast<m128i*>(a);
+        m128i* _b = reinterpret_cast<m128i*>(b);
+        // perform butterfly operation for Cooley-Tukey FFT algorithm
+        for (size_t j = 0; j < len; ++j) {
+            butterfly_step(&(_a[j]), &(_b[j]), &(_b[j]), &(_a[j]), card);
+        }
     }
 }
 
-/*
- * buf1[i] = buf1[i] + buf2[i]
- * buf2[i] = coef * (buf1[i] - buf2[i])
- */
-inline void butterfly_gs(
+// output = coef * input
+inline void
+butterfly_mul(m128i* coef, m128i* input, m128i* output, uint32_t _card)
+{
+    const m128i card = (_card == F3) ? F3_m128i : F4_m128i;
+    const m128i card_2 = (_card == F3) ? F3minus2_m128i : F4minus2_m128i;
+
+    // --------------------------------------
+    // compute coef * symbB
+    // --------------------------------------
+    m128i _coef = _mm_load_si128(coef);
+    m128i b = _mm_load_si128(input);
+    m128i res = _mm_mullo_epi32(_coef, b);
+    // modulo
+    m128i lo = _mm_and_si128(res, card_2);
+    m128i res_shift =
+        (_card == F3) ? _mm_srli_si128(res, 1) : _mm_srli_si128(res, 2);
+    m128i hi = _mm_and_si128(res_shift, card_2);
+
+    m128i cmp_1 = _mm_cmpgt_epi32(hi, lo);
+    m128i _lo = _mm_add_epi32(lo, _mm_and_si128(card, cmp_1));
+
+    m128i res_2 = _mm_sub_epi32(_lo, hi);
+
+    _mm_store_si128(output, res_2);
+}
+
+// symbA = symbA + coef * symbB
+// symbB = symbA - coef * symbB
+inline void
+butterfly_ct_3_step(m128i* coef, m128i* symbA, m128i* symbB, uint32_t _card)
+{
+    // --------------------------------------
+    // compute coef * symbB
+    // --------------------------------------
+    m128i coef_x_symbB;
+    butterfly_mul(coef, symbB, &coef_x_symbB, _card);
+    // --------------------------------------
+    // symbA = symbA + coef_x_symbB
+    // symbB = symbA - coef_x_symbB
+    // --------------------------------------
+    butterfly_step(symbA, &coef_x_symbB, symbA, symbB, _card);
+}
+
+// for each pair (P, Q) = (buf[i], buf[i + m]):
+// P = P + c * Q
+// Q = P - c * Q
+inline void butterfly_ct_3(
     uint32_t coef,
-    aint32* buf1,
-    aint32* buf2,
+    vec::Buffers<uint32_t>& buf,
+    unsigned start,
+    unsigned m,
+    unsigned step,
     size_t len,
     uint32_t card = F4)
 {
-    const m128i _coef = _mm_set1_epi32(coef);
-    m128i* _buf1 = reinterpret_cast<m128i*>(buf1);
-    m128i* _buf2 = reinterpret_cast<m128i*>(buf2);
+    m128i _coef = _mm_set1_epi32(coef);
+    for (int i = start; i < buf.get_n(); i += step) {
+        uint32_t* a = buf.get(i);
+        uint32_t* b = buf.get(i + m);
+        m128i* _a = reinterpret_cast<m128i*>(a);
+        m128i* _b = reinterpret_cast<m128i*>(b);
+        // perform butterfly operation for Cooley-Tukey FFT algorithm
+        for (size_t j = 0; j < len; ++j) {
+            butterfly_ct_3_step(&_coef, &(_a[j]), &(_b[j]), card);
+        }
+    }
+}
 
-    for (size_t i = 0; i < len; ++i) {
-        m128i a = _buf1[i];
-        m128i b = _buf2[i];
-        m128i c = sub(a, b, card);
-        _buf1[i] = add(a, b, card);
-        _buf2[i] = mul(_coef, c, card);
+// for each pair (P, Q) = (buf[i], buf[i + m]), i = start, start + step, ...:
+// P = Q + P
+// Q = Q - P
+inline void butterfly_gs_2(
+    vec::Buffers<uint32_t>& buf,
+    unsigned start,
+    unsigned m,
+    unsigned step,
+    size_t len,
+    uint32_t card = F4)
+{
+    for (int i = start; i < buf.get_n(); i += step) {
+        uint32_t* a = buf.get(i);
+        uint32_t* b = buf.get(i + m);
+        m128i* _a = reinterpret_cast<m128i*>(a);
+        m128i* _b = reinterpret_cast<m128i*>(b);
+        // perform butterfly operation for Gentleman-Sande (GS) FFT algorithm
+        for (size_t j = 0; j < len; ++j) {
+            butterfly_step(&(_b[j]), &(_a[j]), &(_a[j]), &(_b[j]), card);
+        }
+    }
+}
+
+// symbA = symbA + symbB
+// symbB = coef * (symbA - symbB)
+inline void
+butterfly_gs_3_step(m128i* coef, m128i* symbA, m128i* symbB, uint32_t _card)
+{
+    // --------------------------------------
+    // symbA = symbA + symbB
+    // symbB = symbA - symbB
+    // --------------------------------------
+    butterfly_step(symbA, symbB, symbA, symbB, _card);
+
+    // --------------------------------------
+    // symbB = coef * symbB
+    // --------------------------------------
+    butterfly_mul(coef, symbB, symbB, _card);
+}
+
+// for each pair (P, Q) = (buf[i], buf[i + m]), i = start, start + step, ...:
+// P = P + Q
+// Q = c * (P - Q)
+inline void butterfly_gs_3(
     uint32_t coef,
+    vec::Buffers<uint32_t>& buf,
+    unsigned start,
+    unsigned m,
+    unsigned step,
     size_t len,
     uint32_t card = F4)
 {
+    m128i _coef = _mm_set1_epi32(coef);
+    for (int i = start; i < buf.get_n(); i += step) {
+        uint32_t* a = buf.get(i);
+        uint32_t* b = buf.get(i + m);
+        m128i* _a = reinterpret_cast<m128i*>(a);
+        m128i* _b = reinterpret_cast<m128i*>(b);
+        // perform butterfly operation for Gentleman-Sande (GS) FFT algorithm
+        for (size_t j = 0; j < len; ++j) {
+            butterfly_gs_3_step(&_coef, &(_a[j]), &(_b[j]), card);
+        }
     }
 }
 

From 23f7ec61e62d4b00ffd8693806eb6d8916411fb6 Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Fri, 5 Oct 2018 13:43:57 +0200
Subject: [PATCH 06/77] SIMD 256 u16 & u32: update

---
 src/simd_256_u16.h | 753 ++++++++++++++++++++++++++++++++----------
 src/simd_256_u32.h | 805 +++++++++++++++++++++++++++++++++------------
 2 files changed, 1161 insertions(+), 397 deletions(-)

diff --git a/src/simd_256_u16.h b/src/simd_256_u16.h
index 974e136e..74d35d24 100644
--- a/src/simd_256_u16.h
+++ b/src/simd_256_u16.h
@@ -38,142 +38,599 @@
 namespace quadiron {
 namespace simd {
 
-/** Perform a%card where a is a addition of two numbers whose elements are
- *  symbols of GF(card) */
-inline m256i mod_after_add(m256i a, aint16 card)
-{
-    const m256i _card = _mm256_set1_epi16(card);
-    const m256i _card_minus_1 = _mm256_set1_epi16(card - 1);
+#define F3_u16 _mm256_set1_epi16(257)
+#define F3m1_u16 _mm256_set1_epi16(256)
 
-    m256i cmp = _mm256_cmpgt_epi16(a, _card_minus_1);
-    m256i b = _mm256_sub_epi16(a, _mm256_and_si256(_card, cmp));
+/* ==================== Essential Operations =================== */
+// Following functions are used for AVX2 w/ u16 only
 
-    return b;
+inline m256i SET1(uint16_t val)
+{
+    return _mm256_set1_epi16(val);
 }
-
-/** Perform addition of two numbers a, b whose elements are of GF(card) */
-inline m256i add(m256i a, m256i b, aint16 card = F3)
+inline m256i ADD16(m256i x, m256i y)
+{
+    return _mm256_add_epi16(x, y);
+}
+inline m256i SUB16(m256i x, m256i y)
 {
-    m256i _a = _mm256_load_si256(&a);
-    m256i _b = _mm256_load_si256(&b);
-    m256i c = _mm256_add_epi16(_a, _b);
+    return _mm256_sub_epi16(x, y);
+}
+inline m256i MUL16(m256i x, m256i y)
+{
+    return _mm256_mullo_epi16(x, y);
+}
 
-    // Modulo
-    return mod_after_add(c, card);
+inline m256i CMPEQ16(m256i x, m256i y)
+{
+    return _mm256_cmpeq_epi16(x, y);
+}
+inline m256i CMPGT16(m256i x, m256i y)
+{
+    return _mm256_cmpgt_epi16(x, y);
+}
+inline m256i MINU16(m256i x, m256i y)
+{
+    return _mm256_min_epu16(x, y);
 }
 
-/** Perform subtraction of a by b where a, b whose elements are symbols of
- *  GF(card)
- * sub(a, b) = a - b if a >= b, or
- *             card + a - b, otherwise
- */
-inline m256i sub(m256i a, m256i b, aint16 card)
+#define MASK8_LO (_mm256_set1_epi16(0x80))
+#define BLEND8(x, y, mask) (_mm256_blendv_epi8(x, y, mask))
+
+// z = (x + y) mod q; assumes x, y < q (q is subtracted at most once).
+// Operands and result are register values (nothing is loaded or stored).
+// When x + y < q, `res - q` wraps to a large value and MINU16 keeps res.
+inline m256i ADD_MOD(m256i x, m256i y, uint16_t q)
 {
-    const m256i _card = _mm256_set1_epi16(card);
+    m256i res = ADD16(x, y);
+    return MINU16(res, SUB16(res, SET1(q)));
+}
 
-    m256i _a = _mm256_load_si256(&a);
-    m256i _b = _mm256_load_si256(&b);
+// z = (x - y) mod q, computed as min(x - y, x + q - y) on unsigned lanes.
+// Operands and result are register values (nothing is loaded or stored).
+// When x < y, `x - y` wraps to a large value and MINU16 picks x + q - y.
+inline m256i SUB_MOD(m256i x, m256i y, uint16_t q)
+{
+    m256i res = SUB16(x, y);
+    return MINU16(res, SUB16(ADD16(x, SET1(q)), y));
+}
 
-    m256i cmp = _mm256_cmpgt_epi16(_b, _a);
-    m256i _a1 = _mm256_add_epi16(_a, _mm256_and_si256(_card, cmp));
+// y = (0 - x) mod q, i.e. q - x for x != 0 and 0 for x == 0.
+// Operand and result are register values (nothing is loaded or stored).
+// For x == 0, q - x equals q; the MINU16 with (q - x) - q folds it to 0.
+inline m256i NEG_MOD(m256i x, uint16_t q)
+{
+    m256i res = SUB16(SET1(q), x);
+    return MINU16(res, SUB16(res, SET1(q)));
+}
 
-    return _mm256_sub_epi16(_a1, _b);
+// z = x * y mod q
+// Operands and result are register values (nothing is loaded or stored).
+// lo keeps the low byte of each 16-bit product; hi presumably its high byte
+// (via SHIFTR_1, TODO confirm). Note: assumes at least one of `x`, `y` is less
+// than `q-1`, so overflow of the 16-bit product is not checked (cf. MULFULL_MOD)
+inline m256i MUL_MOD(m256i x, m256i y, uint16_t q)
+{
+    m256i res = MUL16(x, y);
+    m256i lo = BLEND8(ZERO, res, MASK8_LO);
+    m256i hi = BLEND8(ZERO, SHIFTR_1(res), MASK8_LO);
+    return SUB_MOD(lo, hi, q);
 }
 
-/** Negate `a`
- * @return 0 if (a == 0), else card - a
- */
-inline m256i neg(m256i a, aint16 card = F3)
+// z = x * y mod q
+// Input are loaded to registers
+// Output is register
+inline m256i MULFULL_MOD(m256i x, m256i y, uint16_t q)
 {
-    const m256i _card = _mm256_set1_epi16(card);
-    m256i _a = _mm256_load_si256(&a);
-    m256i _b = _mm256_setzero_si256();
+    m256i res = MUL16(x, y);
 
-    m256i cmp = _mm256_cmpgt_epi16(_a, _b);
+    // filter elements of both of a & b = card-1
+    m256i cmp = AND(CMPEQ16(x, F3m1_u16), CMPEQ16(y, F3m1_u16));
+    res = ADD16(res, AND(ONE, cmp));
 
-    return _mm256_sub_epi16(_mm256_and_si256(cmp, _card), _a);
+    m256i lo = BLEND8(ZERO, res, MASK8_LO);
+    m256i hi = BLEND8(ZERO, SHIFTR_1(res), MASK8_LO);
+    return SUB_MOD(lo, hi, q);
 }
 
-inline m256i mod_after_multiply(m256i a)
+// butterfly CT with r == 1
+inline void BUTTERFLY_1(m256i* x, m256i* y, uint16_t q)
 {
-    const m256i mask = _mm256_set1_epi16(F3 - 2);
-
-    m256i lo = _mm256_and_si256(a, mask);
+    m256i add = ADD_MOD(*x, *y, q);
+    *y = SUB_MOD(*x, *y, q);
+    *x = add;
+}
 
-    m256i a_shift = _mm256_srli_si256(a, 1);
-    m256i hi = _mm256_and_si256(a_shift, mask);
+// butterfly CT with r == q - 1
+inline void BUTTERFLY_2(m256i* x, m256i* y, uint16_t q)
+{
+    m256i add = ADD_MOD(*x, *y, q);
+    *x = SUB_MOD(*x, *y, q);
+    *y = add;
+}
 
-    m256i cmp = _mm256_cmpgt_epi16(hi, lo);
-    m256i _lo = _mm256_add_epi16(lo, _mm256_and_si256(F3_m256i_u16, cmp));
+// butterfly CT with 1 < r < q - 1
+inline void BUTTERFLY_3(m256i c, m256i* x, m256i* y, uint16_t q)
+{
+    m256i z = MUL_MOD(c, *y, q);
+    *y = SUB_MOD(*x, z, q);
+    *x = ADD_MOD(*x, z, q);
+}
 
-    return _mm256_sub_epi16(_lo, hi);
+// butterfly GS w/ r = q - 1
+inline void BUTTERFLY_4(m256i* x, m256i* y, uint16_t q)
+{
+    m256i add = ADD_MOD(*x, *y, q);
+    *y = SUB_MOD(*y, *x, q);
+    *x = add;
 }
 
-inline m256i mul(m256i a, m256i b)
+// butterfly GS w/ coefficient c, 1 < r < q - 1
+// x = x + y mod q
+// y = c * (x - y) mod q  (the difference uses the value of x before the update)
+inline void BUTTERFLY_5(m256i c, m256i* x, m256i* y, uint16_t q)
 {
-    m256i _a = _mm256_load_si256(&a);
-    m256i _b = _mm256_load_si256(&b);
+    m256i sub = SUB_MOD(*x, *y, q);
+    *x = ADD_MOD(*x, *y, q);
+    *y = MUL_MOD(c, sub, q);
+}
 
-    m256i c = _mm256_mullo_epi16(_a, _b);
+/**
+ * Vectorized butterfly CT step
+ *
+ * For each pair (P, Q) = (buf[i], buf[i + m]) for step = 2 * m and coef `r`
+ *      P = P + r * Q
+ *      Q = P - r * Q
+ *
+ * @param buf - working buffers
+ * @param r - coefficient
+ * @param start - index of buffer among `m` ones
+ * @param m - current group size
+ * @param len - number of vectors per buffer (0 is accepted and is a no-op)
+ * @param card - modulo cardinal
+ */
+inline void butterfly_ct_step(
+    vec::Buffers<uint16_t>& buf,
+    uint16_t r,
+    unsigned start,
+    unsigned m,
+    size_t len,
+    uint16_t card)
+{
+    const unsigned step = m << 1;
+    m256i c = SET1(r);
+
+#define BUTTERFLY_CT(x, y)                                                     \
+    (EITHER(                                                                   \
+        r == 1,                                                                \
+        BUTTERFLY_1(x, y, card),                                               \
+        EITHER(                                                                \
+            r < card - 1,                                                      \
+            BUTTERFLY_3(c, x, y, card),                                        \
+            BUTTERFLY_2(x, y, card))));
+
+    // Bound of the 2x-unrolled loop: pairs (j, j + 1) fit while j < len - 1.
+    // Clamped to 0 so that `len == 0` cannot wrap around and read out of bounds
+    const size_t end = (len > 0) ? len - 1 : 0;
+    const unsigned bufs_nb = buf.get_n();
+    const std::vector<uint16_t*>& mem = buf.get_mem();
+    for (unsigned i = start; i < bufs_nb; i += step) {
+        m256i x1, y1;
+        m256i x2, y2;
+        m256i* __restrict p = reinterpret_cast<m256i*>(mem[i]);
+        m256i* __restrict q = reinterpret_cast<m256i*>(mem[i + m]);
+
+        // #pragma omp parallel for
+        size_t j = 0;
+        // #pragma unroll
+        for (; j < end; j += 2) {
+            x1 = LOAD(p + j);
+            y1 = LOAD(q + j);
+            x2 = LOAD(p + j + 1);
+            y2 = LOAD(q + j + 1);
+
+            BUTTERFLY_CT(&x1, &y1);
+            BUTTERFLY_CT(&x2, &y2);
+
+            // Store back to memory
+            STORE(p + j, x1);
+            STORE(p + j + 1, x2);
+            STORE(q + j, y1);
+            STORE(q + j + 1, y2);
+        }
+        for (; j < len; ++j) {
+            x1 = LOAD(p + j);
+            y1 = LOAD(q + j);
 
-    // filter elements of both of a & b = card-1
-    m256i cmp = _mm256_and_si256(
-        _mm256_cmpeq_epi16(_a, F3minus1_m256i_u16),
-        _mm256_cmpeq_epi16(_b, F3minus1_m256i_u16));
+            BUTTERFLY_CT(&x1, &y1);
 
-    const m256i one = _mm256_set1_epi16(1);
-    c = _mm256_add_epi16(c, _mm256_and_si256(one, cmp));
+            // Store back to memory
+            STORE(p + j, x1);
+            STORE(q + j, y1);
+        }
+    }
+}
 
-    // Modulo
-    return mod_after_multiply(c);
+/**
+ * Vectorized butterfly CT on two layers at a time
+ *
+ * For each quadruple
+ * (P, Q, R, S) = (buf[i], buf[i + m], buf[i + 2 * m], buf[i + 3 * m])
+ * First layer: butterfly on (P, Q) and (R, S) for step = 2 * m
+ *      coef r1 = W[start * n / (2 * m)]
+ *      P = P + r1 * Q
+ *      Q = P - r1 * Q
+ *      R = R + r1 * S
+ *      S = R - r1 * S
+ * Second layer: butterfly on (P, R) and (Q, S) for step = 4 * m
+ *      coef r2 = W[start * n / (4 * m)]
+ *      coef r3 = W[(start + m) * n / (4 * m)]
+ *      P = P + r2 * R
+ *      R = P - r2 * R
+ *      Q = Q + r3 * S
+ *      S = Q - r3 * S
+ *
+ * @param buf - working buffers
+ * @param r1 - coefficient for the 1st layer
+ * @param r2 - 1st coefficient for the 2nd layer
+ * @param r3 - 2nd coefficient for the 2nd layer
+ * @param start - index of buffer among `m` ones
+ * @param m - current group size
+ * @param len - number of vectors per buffer
+ * @param card - modulo cardinal
+ */
+inline void butterfly_ct_two_layers_step(
+    vec::Buffers<uint16_t>& buf,
+    uint16_t r1,
+    uint16_t r2,
+    uint16_t r3,
+    unsigned start,
+    unsigned m,
+    size_t len,
+    uint16_t card)
+{
+    const unsigned step = m << 2;
+    m256i c1 = SET1(r1);
+    m256i c2 = SET1(r2);
+    m256i c3 = SET1(r3);
+
+#define BUTTERFLY_R1(c, x, y)                                                  \
+    (EITHER(                                                                   \
+        r1 == 1,                                                               \
+        BUTTERFLY_1(x, y, card),                                               \
+        EITHER(                                                                \
+            r1 < card - 1,                                                     \
+            BUTTERFLY_3(c, x, y, card),                                        \
+            BUTTERFLY_2(x, y, card))));
+#define BUTTERFLY_R2(c, x, y)                                                  \
+    (EITHER(                                                                   \
+        r2 == 1,                                                               \
+        BUTTERFLY_1(x, y, card),                                               \
+        EITHER(                                                                \
+            r2 < card - 1,                                                     \
+            BUTTERFLY_3(c, x, y, card),                                        \
+            BUTTERFLY_2(x, y, card))));
+#define BUTTERFLY_R3(c, x, y)                                                  \
+    (EITHER(                                                                   \
+        r3 == 1,                                                               \
+        BUTTERFLY_1(x, y, card),                                               \
+        EITHER(                                                                \
+            r3 < card - 1,                                                     \
+            BUTTERFLY_3(c, x, y, card),                                        \
+            BUTTERFLY_2(x, y, card))));
+
+    const size_t end = len - 1;
+    const unsigned bufs_nb = buf.get_n();
+    // #pragma omp parallel for
+    // #pragma unroll
+    const std::vector<uint16_t*>& mem = buf.get_mem();
+    for (unsigned i = start; i < bufs_nb; i += step) {
+        m256i x1, y1, u1, v1;
+        m256i x2, y2, u2, v2;
+        m256i* __restrict p = reinterpret_cast<m256i*>(mem[i]);
+        m256i* __restrict q = reinterpret_cast<m256i*>(mem[i + m]);
+        m256i* __restrict r = reinterpret_cast<m256i*>(mem[i + 2 * m]);
+        m256i* __restrict s = reinterpret_cast<m256i*>(mem[i + 3 * m]);
+
+        // #pragma omp parallel for
+        size_t j = 0;
+        // #pragma unroll
+        for (; j < end; j += 2) {
+            // First layer (c1, x, y) & (c1, u, v)
+            x1 = LOAD(p + j);
+            y1 = LOAD(q + j);
+            x2 = LOAD(p + j + 1);
+            y2 = LOAD(q + j + 1);
+
+            u1 = LOAD(r + j);
+            v1 = LOAD(s + j);
+            u2 = LOAD(r + j + 1);
+            v2 = LOAD(s + j + 1);
+
+            BUTTERFLY_R1(c1, &x1, &y1);
+            BUTTERFLY_R1(c1, &x2, &y2);
+
+            BUTTERFLY_R1(c1, &u1, &v1);
+            BUTTERFLY_R1(c1, &u2, &v2);
+
+            // Second layer (c2, x, u) & (c3, y, v)
+            BUTTERFLY_R2(c2, &x1, &u1);
+            BUTTERFLY_R2(c2, &x2, &u2);
+
+            BUTTERFLY_R3(c3, &y1, &v1);
+            BUTTERFLY_R3(c3, &y2, &v2);
+
+            // Store back to memory
+            STORE(p + j, x1);
+            STORE(p + j + 1, x2);
+            STORE(q + j, y1);
+            STORE(q + j + 1, y2);
+
+            STORE(r + j, u1);
+            STORE(r + j + 1, u2);
+            STORE(s + j, v1);
+            STORE(s + j + 1, v2);
+        }
+        for (; j < len; ++j) {
+            // First layer (c1, x, y) & (c1, u, v)
+            x1 = LOAD(p + j);
+            y1 = LOAD(q + j);
+            u1 = LOAD(r + j);
+            v1 = LOAD(s + j);
+
+            BUTTERFLY_R1(c1, &x1, &y1);
+            BUTTERFLY_R1(c1, &u1, &v1);
+            // Second layer (c2, x, u) & (c3, y, v)
+            BUTTERFLY_R2(c2, &x1, &u1);
+            BUTTERFLY_R3(c3, &y1, &v1);
+            // Store back to memory
+            STORE(p + j, x1);
+            STORE(q + j, y1);
+            STORE(r + j, u1);
+            STORE(s + j, v1);
+        }
+    }
 }
 
-/** Perform multiplication of two numbers a, b whose elements are of GF(card)
- *  where `card` is a prime Fermat number, i.e. card = Fx with x < 5
- *  Currently, it supports only for F3
+/**
+ * Vectorized butterfly GS step
+ *
+ * For each pair (P, Q) = (buf[i], buf[i + m]) for step = 2 * m and coef `r`
+ *      P = P + Q
+ *      Q = r * (P - Q)
+ *
+ * @param buf - working buffers
+ * @param r - coefficient
+ * @param start - index of buffer among `m` ones
+ * @param m - current group size
+ * @param len - number of vectors per buffer
+ * @param card - modulo cardinal
  */
-inline m256i mul(m256i a, m256i b, aint16 card)
+inline void butterfly_gs_step(
+    vec::Buffers<uint16_t>& buf,
+    uint16_t r,
+    unsigned start,
+    unsigned m,
+    size_t len,
+    uint16_t card)
 {
-    // FIXME: generalize card
-    assert(card == F3);
-    return mul(a, b);
+    const unsigned step = m << 1;
+    m256i c = SET1(r);
+
+#define BUTTERFLY_GS(x, y)                                                     \
+    (EITHER(                                                                   \
+        r == 1,                                                                \
+        BUTTERFLY_1(x, y, card),                                               \
+        EITHER(                                                                \
+            r < card - 1,                                                      \
+            BUTTERFLY_5(c, x, y, card),                                        \
+            BUTTERFLY_4(x, y, card))));
+
+    const size_t end = len - 1;
+    const unsigned bufs_nb = buf.get_n();
+    // #pragma omp parallel for
+    // #pragma unroll
+    const std::vector<uint16_t*>& mem = buf.get_mem();
+    for (unsigned i = start; i < bufs_nb; i += step) {
+        m256i x1, y1;
+        m256i x2, y2;
+        m256i* __restrict p = reinterpret_cast<m256i*>(mem[i]);
+        m256i* __restrict q = reinterpret_cast<m256i*>(mem[i + m]);
+
+        // #pragma omp parallel for
+        size_t j = 0;
+        // #pragma unroll
+        for (; j < end; j += 2) {
+            x1 = LOAD(p + j);
+            y1 = LOAD(q + j);
+            x2 = LOAD(p + j + 1);
+            y2 = LOAD(q + j + 1);
+
+            BUTTERFLY_GS(&x1, &y1);
+            BUTTERFLY_GS(&x2, &y2);
+
+            // Store back to memory
+            STORE(p + j, x1);
+            STORE(p + j + 1, x2);
+            STORE(q + j, y1);
+            STORE(q + j + 1, y2);
+        }
+        for (; j < len; ++j) {
+            x1 = LOAD(p + j);
+            y1 = LOAD(q + j);
+
+            BUTTERFLY_GS(&x1, &y1);
+
+            // Store back to memory
+            STORE(p + j, x1);
+            STORE(q + j, y1);
+        }
+    }
 }
 
-/** Apply an element-wise negation to a buffer
+/**
+ * Vectorized simplified butterfly GS step
+ *
+ * For each pair (P, Q) = (buf[i], buf[i + m]) for step = 2 * m and coef `r`
+ *      Q = r * P (P is read but left unchanged)
+ *
+ * @param buf - working buffers
+ * @param r - coefficient
+ * @param start - index of buffer among `m` ones
+ * @param m - current group size
+ * @param len - number of vectors per buffer
+ * @param card - modulo cardinal
  */
-inline void neg(size_t len, aint16* buf, aint16 card = F3)
+inline void butterfly_gs_step_simple(
+    vec::Buffers<uint16_t>& buf,
+    uint16_t r,
+    unsigned start,
+    unsigned m,
+    size_t len,
+    uint16_t card)
 {
-    m256i* _buf = reinterpret_cast<m256i*>(buf);
-    unsigned ratio = sizeof(*_buf) / sizeof(*buf);
-    size_t _len = len / ratio;
-    size_t _last_len = len - _len * ratio;
+    const unsigned step = m << 1;
+    m256i c = SET1(r);
+
+#define BUTTERFLY_GS_S(x)                                                      \
+    (EITHER(                                                                   \
+        r == 1,                                                                \
+        (x),                                                                   \
+        EITHER(r < card - 1, MUL_MOD(c, x, card), NEG_MOD(x, card))));
+
+    const size_t end = len - 1;
+    const unsigned bufs_nb = buf.get_n();
+    // #pragma omp parallel for
+    // #pragma unroll
+    const std::vector<uint16_t*>& mem = buf.get_mem();
+    for (unsigned i = start; i < bufs_nb; i += step) {
+        m256i x1, y1;
+        m256i x2, y2;
+        m256i* __restrict p = reinterpret_cast<m256i*>(mem[i]);
+        m256i* __restrict q = reinterpret_cast<m256i*>(mem[i + m]);
+
+        // #pragma omp parallel for
+        size_t j = 0;
+        // #pragma unroll
+        for (; j < end; j += 2) {
+            x1 = LOAD(p + j);
+            x2 = LOAD(p + j + 1);
+
+            y1 = BUTTERFLY_GS_S(x1);
+            y2 = BUTTERFLY_GS_S(x2);
+
+            // Store back to memory
+            STORE(q + j, y1);
+            STORE(q + j + 1, y2);
+        }
+        for (; j < len; ++j) {
+            x1 = LOAD(p + j);
 
-    size_t i;
-    for (i = 0; i < _len; i++) {
-        _buf[i] = neg(_buf[i], card);
+            y1 = BUTTERFLY_GS_S(x1);
+
+            // Store back to memory
+            STORE(q + j, y1);
+        }
     }
-    if (_last_len > 0) {
-        for (i = _len * ratio; i < len; i++) {
-            if (buf[i])
-                buf[i] = card - buf[i];
+}
+
+// Add OOR_MARK to `props` for each element picked by CMPEQ16(threshold, symb)
+inline void add_props_16(
+    Properties& props,
+    m256i threshold,
+    m256i mask,
+    m256i symb,
+    off_t offset)
+{
+    uint32_t d = MVMSK8(AND(mask, CMPEQ16(threshold, symb)));
+    const unsigned element_size = sizeof(uint16_t);
+    while (d > 0) {
+        const unsigned byte_idx = __builtin_ctz(d);
+        off_t _offset = offset + byte_idx / element_size;
+        props.add(_offset, OOR_MARK);
+        // Unsigned shift: byte_idx can be 31 and `1 << 31` overflows `int`
+        d ^= uint32_t{1} << byte_idx;
+    }
+}
+
+// Record OOR_MARK in props[frag_id] for output elements matching `threshold`
+inline void encode_post_process(
+    vec::Buffers<uint16_t>& output,
+    std::vector<Properties>& props,
+    off_t offset,
+    unsigned code_len,
+    uint16_t threshold,
+    size_t vecs_nb)
+{
+    const unsigned element_size = sizeof(uint16_t);
+    const unsigned vec_size = ALIGN_SIZE / element_size;
+    const uint16_t max = 1 << (element_size * 8 - 1);
+    const m256i _threshold = SET1(threshold);
+    const m256i mask_hi = SET1(max);
+
+    const std::vector<uint16_t*>& mem = output.get_mem();
+    for (unsigned frag_id = 0; frag_id < code_len; ++frag_id) {
+        m256i* __restrict buf = reinterpret_cast<m256i*>(mem[frag_id]);
+
+        size_t vec_id = 0;
+        // Clamp: `vecs_nb - 3` would wrap around (size_t) when vecs_nb < 3
+        const size_t end = (vecs_nb > 3) ? vecs_nb - 3 : 0;
+        for (; vec_id < end; vec_id += 4) {
+            m256i a1 = LOAD(buf + vec_id);
+            m256i a2 = LOAD(buf + vec_id + 1);
+            m256i a3 = LOAD(buf + vec_id + 2);
+            m256i a4 = LOAD(buf + vec_id + 3);
+
+            if (TESTZ(a1, _threshold) == 0) {
+                const off_t curr_offset = offset + vec_id * vec_size;
+                add_props_16(
+                    props[frag_id], _threshold, mask_hi, a1, curr_offset);
+            }
+            if (TESTZ(a2, _threshold) == 0) {
+                const off_t curr_offset = offset + (vec_id + 1) * vec_size;
+                add_props_16(
+                    props[frag_id], _threshold, mask_hi, a2, curr_offset);
+            }
+            if (TESTZ(a3, _threshold) == 0) {
+                const off_t curr_offset = offset + (vec_id + 2) * vec_size;
+                add_props_16(
+                    props[frag_id], _threshold, mask_hi, a3, curr_offset);
+            }
+            if (TESTZ(a4, _threshold) == 0) {
+                const off_t curr_offset = offset + (vec_id + 3) * vec_size;
+                add_props_16(
+                    props[frag_id], _threshold, mask_hi, a4, curr_offset);
+            }
+        }
+        for (; vec_id < vecs_nb; ++vec_id) {
+            m256i a = LOAD(buf + vec_id);
+            uint16_t c = TESTZ(a, _threshold);
+            if (c == 0) {
+                const off_t curr_offset = offset + vec_id * vec_size;
+                add_props_16(
+                    props[frag_id], _threshold, mask_hi, a, curr_offset);
+            }
         }
     }
 }
 
+/* ==================== Operations =================== */
 /** Perform a multiplication of a coefficient `a` to each element of `src` and
  *  add result to correspondent element of `dest`
+ *
+ * @note: 1 < `a` < card - 1
  */
 inline void mul_coef_to_buf(
-    const aint16 a,
+    const uint16_t a,
     aint16* src,
     aint16* dest,
     size_t len,
-    aint16 card = F3)
+    uint16_t card)
 {
-    const m256i coef = _mm256_set1_epi16(a);
+    const m256i coef = SET1(a);
 
-    m256i* _src = reinterpret_cast<m256i*>(src);
-    m256i* _dest = reinterpret_cast<m256i*>(dest);
+    m256i* __restrict _src = reinterpret_cast<m256i*>(src);
+    m256i* __restrict _dest = reinterpret_cast<m256i*>(dest);
     const unsigned ratio = sizeof(*_src) / sizeof(*src);
     const size_t _len = len / ratio;
     const size_t _last_len = len - _len * ratio;
@@ -181,22 +638,21 @@ inline void mul_coef_to_buf(
     size_t i;
     for (i = 0; i < _len; i++) {
         // perform multiplication
-        _dest[i] = mul(coef, _src[i], card);
+        _dest[i] = MUL_MOD(coef, _src[i], card);
     }
     if (_last_len > 0) {
-        uint32_t coef_doubled = (uint32_t)a;
+        uint32_t coef_32 = (uint32_t)a;
         for (i = _len * ratio; i < len; i++) {
             // perform multiplication
-            dest[i] = (aint16)((coef_doubled * src[i]) % card);
+            dest[i] = (aint16)((coef_32 * src[i]) % card);
         }
     }
 }
 
-inline void
-add_two_bufs(aint16* src, aint16* dest, size_t len, aint16 card = F3)
+inline void add_two_bufs(aint16* src, aint16* dest, size_t len, aint16 card)
 {
-    m256i* _src = reinterpret_cast<m256i*>(src);
-    m256i* _dest = reinterpret_cast<m256i*>(dest);
+    m256i* __restrict _src = reinterpret_cast<m256i*>(src);
+    m256i* __restrict _dest = reinterpret_cast<m256i*>(dest);
     const unsigned ratio = sizeof(*_src) / sizeof(*src);
     const size_t _len = len / ratio;
     const size_t _last_len = len - _len * ratio;
@@ -204,7 +660,7 @@ add_two_bufs(aint16* src, aint16* dest, size_t len, aint16 card = F3)
     size_t i;
     for (i = 0; i < _len; i++) {
         // perform addition
-        _dest[i] = add(_src[i], _dest[i], card);
+        _dest[i] = ADD_MOD(_src[i], _dest[i], card);
     }
     if (_last_len > 0) {
         for (i = _len * ratio; i < len; i++) {
@@ -215,16 +671,12 @@ add_two_bufs(aint16* src, aint16* dest, size_t len, aint16 card = F3)
     }
 }
 
-inline void sub_two_bufs(
-    aint16* bufa,
-    aint16* bufb,
-    aint16* res,
-    size_t len,
-    aint16 card = F3)
+inline void
+sub_two_bufs(aint16* bufa, aint16* bufb, aint16* res, size_t len, aint16 card)
 {
-    m256i* _bufa = reinterpret_cast<m256i*>(bufa);
-    m256i* _bufb = reinterpret_cast<m256i*>(bufb);
-    m256i* _res = reinterpret_cast<m256i*>(res);
+    m256i* __restrict _bufa = reinterpret_cast<m256i*>(bufa);
+    m256i* __restrict _bufb = reinterpret_cast<m256i*>(bufb);
+    m256i* __restrict _res = reinterpret_cast<m256i*>(res);
     const unsigned ratio = sizeof(*_bufa) / sizeof(*bufa);
     const size_t _len = len / ratio;
     const size_t _last_len = len - _len * ratio;
@@ -232,7 +684,7 @@ inline void sub_two_bufs(
     size_t i;
     for (i = 0; i < _len; i++) {
         // perform subtraction
-        _res[i] = sub(_bufa[i], _bufb[i], card);
+        _res[i] = SUB_MOD(_bufa[i], _bufb[i], card);
     }
     if (_last_len > 0) {
         for (i = _len * ratio; i < len; i++) {
@@ -245,11 +697,10 @@ inline void sub_two_bufs(
     }
 }
 
-inline void
-mul_two_bufs(aint16* src, aint16* dest, size_t len, aint16 card = F3)
+inline void mul_two_bufs(aint16* src, aint16* dest, size_t len, aint16 card)
 {
-    m256i* _src = reinterpret_cast<m256i*>(src);
-    m256i* _dest = reinterpret_cast<m256i*>(dest);
+    m256i* __restrict _src = reinterpret_cast<m256i*>(src);
+    m256i* __restrict _dest = reinterpret_cast<m256i*>(dest);
     const unsigned ratio = sizeof(*_src) / sizeof(*src);
     const size_t _len = len / ratio;
     const size_t _last_len = len - _len * ratio;
@@ -257,93 +708,33 @@ mul_two_bufs(aint16* src, aint16* dest, size_t len, aint16 card = F3)
     size_t i;
     for (i = 0; i < _len; i++) {
         // perform multiplicaton
-        _dest[i] = mul(_src[i], _dest[i], card);
+        _dest[i] = MULFULL_MOD(_src[i], _dest[i], card);
     }
     if (_last_len > 0) {
         for (i = _len * ratio; i < len; i++) {
             // perform multiplicaton
-            dest[i] = (uint32_t(src[i]) * dest[i]) % card;
+            dest[i] = uint16_t((uint64_t(src[i]) * dest[i]) % card);
         }
     }
 }
 
-/*
- * buf1[i] = buf1[i] + coef * buf2[i]
- * buf2[i] = buf1[i] - coef * buf2[i]
+/** Apply an element-wise negation to a buffer
  */
-inline void butterfly_ct(
-    uint16_t coef,
-    aint16* buf1,
-    aint16* buf2,
-    size_t len,
-    uint16_t card = F3)
+inline void neg(size_t len, aint16* buf, aint16 card)
 {
-    const m256i _coef = _mm256_set1_epi16(coef);
-    m256i* _buf1 = reinterpret_cast<m256i*>(buf1);
-    m256i* _buf2 = reinterpret_cast<m256i*>(buf2);
-
-    for (size_t i = 0; i < len; ++i) {
-        m256i a = mul(_coef, _buf2[i], card);
-        _buf2[i] = sub(_buf1[i], a, card);
-        _buf1[i] = add(_buf1[i], a, card);
-    }
-}
+    m256i* _buf = reinterpret_cast<m256i*>(buf);
+    unsigned ratio = sizeof(*_buf) / sizeof(*buf);
+    size_t _len = len / ratio;
+    size_t _last_len = len - _len * ratio;
 
-/*
- * buf1[i] = buf1[i] + buf2[i]
- * buf2[i] = coef * (buf1[i] - buf2[i])
- */
-inline void butterfly_gs(
-    uint16_t coef,
-    aint16* buf1,
-    aint16* buf2,
-    size_t len,
-    uint16_t card = F3)
-{
-    const m256i _coef = _mm256_set1_epi16(coef);
-    m256i* _buf1 = reinterpret_cast<m256i*>(buf1);
-    m256i* _buf2 = reinterpret_cast<m256i*>(buf2);
-
-    for (size_t i = 0; i < len; ++i) {
-        m256i a = _buf1[i];
-        m256i b = _buf2[i];
-        m256i c = sub(a, b, card);
-        _buf1[i] = add(a, b, card);
-        _buf2[i] = mul(_coef, c, card);
+    size_t i;
+    for (i = 0; i < _len; i++) {
+        _buf[i] = NEG_MOD(_buf[i], card);
     }
-}
-
-inline void encode_post_process(
-    vec::Buffers<uint16_t>& output,
-    std::vector<Properties>& props,
-    off_t offset,
-    unsigned code_len,
-    uint16_t threshold,
-    size_t vecs_nb)
-{
-    const unsigned vec_size = simd::countof<uint16_t>();
-
-    const m256i _threshold = _mm256_set1_epi16(threshold);
-    uint16_t max = 1 << (sizeof(uint16_t) * 8 - 1);
-    const m256i mask_hi = _mm256_set1_epi16(max);
-    const unsigned element_size = sizeof(uint16_t);
-
-    for (unsigned frag_id = 0; frag_id < code_len; ++frag_id) {
-        uint16_t* chunk = output.get(frag_id);
-        m256i* buf = reinterpret_cast<m256i*>(chunk);
-        for (unsigned vec_id = 0; vec_id < vecs_nb; ++vec_id) {
-            const m256i a = _mm256_load_si256(&(buf[vec_id]));
-            const m256i b = _mm256_cmpeq_epi16(_threshold, a);
-            const m256i c = _mm256_and_si256(mask_hi, b);
-            uint32_t d = _mm256_movemask_epi8(c);
-
-            while (d > 0) {
-                unsigned byte_idx = __builtin_ctz(d);
-                unsigned element_idx = byte_idx / element_size;
-                off_t _offset = offset + vec_id * vec_size + element_idx;
-                props[frag_id].add(_offset, 1);
-                d ^= 1 << byte_idx;
-            }
+    if (_last_len > 0) {
+        for (i = _len * ratio; i < len; i++) {
+            if (buf[i])
+                buf[i] = card - buf[i];
         }
     }
 }
diff --git a/src/simd_256_u32.h b/src/simd_256_u32.h
index 9c76c89b..5302a472 100644
--- a/src/simd_256_u32.h
+++ b/src/simd_256_u32.h
@@ -38,212 +38,644 @@
 namespace quadiron {
 namespace simd {
 
-/* ==================== Essential Operations =================== */
+#define F4_u32 _mm256_set1_epi32(65537)
+#define F4m1_u32 _mm256_set1_epi32(65536)
+#define F3_u32 _mm256_set1_epi32(257)
+#define F3m1_u32 _mm256_set1_epi32(256)
 
-/** Perform a%card where a is a addition of two numbers whose elements are
- *  symbols of GF(card) */
-inline m256i mod_after_add(m256i a, aint32 card)
-{
-    const m256i _card = _mm256_set1_epi32(card);
-    const m256i _card_minus_1 = _mm256_set1_epi32(card - 1);
+#define CARD(q) (EITHER(q == F3, F3_u32, F4_u32))
+#define CARD_M_1(q) (EITHER(q == F3, F3m1_u32, F4m1_u32))
 
-    m256i cmp = _mm256_cmpgt_epi32(a, _card_minus_1);
-    m256i b = _mm256_sub_epi32(a, _mm256_and_si256(_card, cmp));
+/* ==================== Essential Operations =================== */
+// Following functions are used for AVX2 w/ u32 only
 
-    return b;
+inline m256i SET1(uint32_t val)
+{
+    return _mm256_set1_epi32(val);
 }
-
-/** Perform addition of two numbers a, b whose elements are of GF(card) */
-inline m256i add(m256i a, m256i b, aint32 card)
+inline m256i ADD32(m256i x, m256i y)
 {
-    m256i _a = _mm256_load_si256(&a);
-    m256i _b = _mm256_load_si256(&b);
-    m256i c = _mm256_add_epi32(_a, _b);
+    return _mm256_add_epi32(x, y);
+}
+inline m256i SUB32(m256i x, m256i y)
+{
+    return _mm256_sub_epi32(x, y);
+}
+inline m256i MUL32(m256i x, m256i y)
+{
+    return _mm256_mullo_epi32(x, y);
+}
 
-    // Modulo
-    return mod_after_add(c, card);
+inline m256i CMPEQ32(m256i x, m256i y)
+{
+    return _mm256_cmpeq_epi32(x, y);
+}
+inline m256i CMPGT32(m256i x, m256i y)
+{
+    return _mm256_cmpgt_epi32(x, y);
+}
+inline m256i MINU32(m256i x, m256i y)
+{
+    return _mm256_min_epu32(x, y);
 }
+#define BLEND16(x, y, imm8) (_mm256_blend_epi16(x, y, imm8))
 
-/** Perform subtraction of a by b where a, b whose elements are symbols of
- *  GF(card)
- * sub(a, b) = a - b if a >= b, or
- *             card + a - b, otherwise
- */
-inline m256i sub(m256i a, m256i b, aint32 card)
+// z = x + y mod q
+// Input are loaded to registers
+// Output is register
+inline m256i ADD_MOD(m256i x, m256i y, uint32_t q)
 {
-    const m256i _card = _mm256_set1_epi32(card);
+    m256i res = ADD32(x, y);
+    return MINU32(res, SUB32(res, CARD(q)));
+}
 
-    m256i _a = _mm256_load_si256(&a);
-    m256i _b = _mm256_load_si256(&b);
+// z = x - y mod q => z = q + x - y mod q
+// Input are loaded to registers
+// Output is register
+inline m256i SUB_MOD(m256i x, m256i y, uint32_t q)
+{
+    m256i res = SUB32(x, y);
+    return MINU32(res, ADD32(res, CARD(q)));
+}
 
-    m256i cmp = _mm256_cmpgt_epi32(_b, _a);
-    m256i _a1 = _mm256_add_epi32(_a, _mm256_and_si256(_card, cmp));
+// y = 0 - x mod q => y = q - x mod q
+// Input are loaded to registers
+// Output is register
+inline m256i NEG_MOD(m256i x, uint32_t q)
+{
+    m256i res = SUB32(CARD(q), x);
+    return MINU32(res, SUB32(res, CARD(q)));
+}
 
-    return _mm256_sub_epi32(_a1, _b);
+// z = x * y mod q
+// Input are loaded to registers
+// Output is register
+// Note: we assume that at least `x` or `y` is less than `q-1` so it's
+// not necessary to verify overflow on multiplying elements
+inline m256i MUL_MOD(m256i x, m256i y, uint32_t q)
+{
+    m256i res = MUL32(x, y);
+    m256i lo = BLEND16(ZERO, res, 0x55);
+    m256i hi = BLEND16(ZERO, SHIFTR_2(res), 0x55);
+    return SUB_MOD(lo, hi, q);
 }
 
-/** Negate `a`
- * @return 0 if (a == 0), else card - a
- */
-inline m256i neg(m256i a, aint32 card = F4)
+inline void MUL_MOD(m256i x, m256i y, m256i* z, uint32_t q)
+{
+    m256i res = MUL32(x, y);
+    m256i lo = BLEND16(ZERO, res, 0x55);
+    m256i hi = BLEND16(ZERO, SHIFTR_2(res), 0x55);
+    *z = SUB_MOD(lo, hi, q);
+}
+// z = x * y mod q
+// Input are loaded to registers
+// Output is register
+inline m256i MULFULL_MOD(m256i x, m256i y, uint32_t q)
 {
-    const m256i _card = _mm256_set1_epi32(card);
-    m256i _a = _mm256_load_si256(&a);
-    m256i _b = _mm256_setzero_si256();
+    m256i res = MUL32(x, y);
 
-    m256i cmp = _mm256_cmpgt_epi32(_a, _b);
+    // filter elements of both of a & b = card-1
+    m256i cmp = AND(CMPEQ32(x, CARD_M_1(q)), CMPEQ32(y, CARD_M_1(q)));
+    res = (q == F3) ? XOR(res, AND(F4_u32, cmp)) : ADD32(res, AND(ONE, cmp));
 
-    return _mm256_sub_epi32(_mm256_and_si256(cmp, _card), _a);
+    m256i lo = BLEND16(ZERO, res, 0x55);
+    m256i hi = SHIFTR_2(BLEND16(ZERO, res, 0xAA));
+    return SUB_MOD(lo, hi, q);
 }
 
-/** Perform a%card where a is a multiplication of two numbers whose elements are
- *  symbols of GF(F4)
- *
- *  We find v in a = u * card + v
- *  a is expressed also as: a = hi * (card-1) + lo
- *  where hi and lo is 16-bit for F4 (or 8-bit for F3) high and low parts of a
- *  hence, v = (lo - hi) % F4
- *      v = lo - hi, if lo >= hi
- *          or
- *          F4 + lo - hi, otherwise
- */
-inline m256i mod_after_multiply_f4(m256i a)
+// butterfly CT with r == 1
+inline void BUTTERFLY_1(m256i* x, m256i* y, uint32_t q)
 {
-    const m256i mask = _mm256_set1_epi32(F4 - 2);
-
-    m256i lo = _mm256_and_si256(a, mask);
-
-    m256i a_shift = _mm256_srli_si256(a, 2);
-    m256i hi = _mm256_and_si256(a_shift, mask);
+    m256i add = ADD_MOD(*x, *y, q);
+    *y = SUB_MOD(*x, *y, q);
+    *x = add;
+}
 
-    m256i cmp = _mm256_cmpgt_epi32(hi, lo);
-    m256i _lo = _mm256_add_epi32(lo, _mm256_and_si256(F4_m256i, cmp));
+// butterfly CT with r == q - 1
+inline void BUTTERFLY_2(m256i* x, m256i* y, uint32_t q)
+{
+    m256i add = ADD_MOD(*x, *y, q);
+    *x = SUB_MOD(*x, *y, q);
+    *y = add;
+}
 
-    return _mm256_sub_epi32(_lo, hi);
+// butterfly CT with 1 < r < q - 1
+inline void BUTTERFLY_3(m256i c, m256i* x, m256i* y, uint32_t q)
+{
+    m256i z = MUL_MOD(c, *y, q);
+    *y = SUB_MOD(*x, z, q);
+    *x = ADD_MOD(*x, z, q);
 }
 
-inline m256i mod_after_multiply_f3(m256i a)
+// butterfly GS w/ r = q - 1
+inline void BUTTERFLY_4(m256i* x, m256i* y, uint32_t q)
 {
-    const m256i mask = _mm256_set1_epi32(F3 - 2);
+    m256i add = ADD_MOD(*x, *y, q);
+    *y = SUB_MOD(*y, *x, q);
+    *x = add;
+}
 
-    m256i lo = _mm256_and_si256(a, mask);
+// butterfly GS w/ 1 < r < q - 1
+// x = x + y mod q
+// y = c * (x - y) mod q
+inline void BUTTERFLY_5(m256i c, m256i* x, m256i* y, uint32_t q)
+{
+    m256i sub = SUB_MOD(*x, *y, q);
+    *x = ADD_MOD(*x, *y, q);
+    *y = MUL_MOD(c, sub, q);
+}
 
-    m256i a_shift = _mm256_srli_si256(a, 1);
-    m256i hi = _mm256_and_si256(a_shift, mask);
+/**
+ * Vectorized butterfly CT step
+ *
+ * For each pair (P, Q) = (buf[i], buf[i + m]) for step = 2 * m and coef `r`
+ *      P = P + r * Q
+ *      Q = P - r * Q
+ *
+ * @param buf - working buffers
+ * @param r - coefficient
+ * @param start - index of buffer among `m` ones
+ * @param m - current group size
+ * @param len - number of vectors per buffer
+ * @param card - modulo cardinal
+ */
+inline void butterfly_ct_step(
+    vec::Buffers<uint32_t>& buf,
+    uint32_t r,
+    unsigned start,
+    unsigned m,
+    size_t len,
+    uint32_t card)
+{
+    const unsigned step = m << 1;
+    m256i c = SET1(r);
+
+#define BUTTERFLY_CT(x, y)                                                     \
+    (EITHER(                                                                   \
+        r == 1,                                                                \
+        BUTTERFLY_1(x, y, card),                                               \
+        EITHER(                                                                \
+            r < card - 1,                                                      \
+            BUTTERFLY_3(c, x, y, card),                                        \
+            BUTTERFLY_2(x, y, card))));
+
+    const size_t end = len - 1;
+    const unsigned bufs_nb = buf.get_n();
+    // #pragma omp parallel for
+    // #pragma unroll
+    const std::vector<uint32_t*>& mem = buf.get_mem();
+    for (unsigned i = start; i < bufs_nb; i += step) {
+        m256i x1, y1;
+        m256i x2, y2;
+        m256i* __restrict p = reinterpret_cast<m256i*>(mem[i]);
+        m256i* __restrict q = reinterpret_cast<m256i*>(mem[i + m]);
+
+        // #pragma omp parallel for
+        size_t j = 0;
+        // #pragma unroll
+        for (; j < end; j += 2) {
+            x1 = LOAD(p + j);
+            y1 = LOAD(q + j);
+            x2 = LOAD(p + j + 1);
+            y2 = LOAD(q + j + 1);
+
+            BUTTERFLY_CT(&x1, &y1);
+            BUTTERFLY_CT(&x2, &y2);
+
+            // Store back to memory
+            STORE(p + j, x1);
+            STORE(p + j + 1, x2);
+            STORE(q + j, y1);
+            STORE(q + j + 1, y2);
+        }
+        for (; j < len; ++j) {
+            x1 = LOAD(p + j);
+            y1 = LOAD(q + j);
 
-    m256i cmp = _mm256_cmpgt_epi32(hi, lo);
-    m256i _lo = _mm256_add_epi32(lo, _mm256_and_si256(F3_m256i, cmp));
+            BUTTERFLY_CT(&x1, &y1);
 
-    return _mm256_sub_epi32(_lo, hi);
+            // Store back to memory
+            STORE(p + j, x1);
+            STORE(q + j, y1);
+        }
+    }
 }
 
-inline m256i mul_f4(m256i a, m256i b)
+/**
+ * Vectorized butterfly CT on two-layers at a time
+ *
+ * For each quadruple
+ * (P, Q, R, S) = (buf[i], buf[i + m], buf[i + 2 * m], buf[i + 3 * m])
+ * First layer: butterfly on (P, Q) and (R, S) for step = 2 * m
+ *      coef r1 = W[start * n / (2 * m)]
+ *      P = P + r1 * Q
+ *      Q = P - r1 * Q
+ *      R = R + r1 * S
+ *      S = R - r1 * S
+ * Second layer: butterfly on (P, R) and (Q, S) for step = 4 * m
+ *      coef r2 = W[start * n / (4 * m)]
+ *      coef r3 = W[(start + m) * n / (4 * m)]
+ *      P = P + r2 * R
+ *      R = P - r2 * R
+ *      Q = Q + r3 * S
+ *      S = Q - r3 * S
+ *
+ * @param buf - working buffers
+ * @param r1 - coefficient for the 1st layer
+ * @param r2 - 1st coefficient for the 2nd layer
+ * @param r3 - 2nd coefficient for the 2nd layer
+ * @param start - index of buffer among `m` ones
+ * @param m - current group size
+ * @param len - number of vectors per buffer
+ * @param card - modulo cardinal
+ */
+inline void butterfly_ct_two_layers_step(
+    vec::Buffers<uint32_t>& buf,
+    uint32_t r1,
+    uint32_t r2,
+    uint32_t r3,
+    unsigned start,
+    unsigned m,
+    size_t len,
+    uint32_t card)
 {
-    m256i _a = _mm256_load_si256(&a);
-    m256i _b = _mm256_load_si256(&b);
-
-    m256i c = _mm256_mullo_epi32(_a, _b);
+    const unsigned step = m << 2;
+    m256i c1 = SET1(r1);
+    m256i c2 = SET1(r2);
+    m256i c3 = SET1(r3);
+
+#define BUTTERFLY_R1(c, x, y)                                                  \
+    (EITHER(                                                                   \
+        r1 == 1,                                                               \
+        BUTTERFLY_1(x, y, card),                                               \
+        EITHER(                                                                \
+            r1 < card - 1,                                                     \
+            BUTTERFLY_3(c, x, y, card),                                        \
+            BUTTERFLY_2(x, y, card))));
+#define BUTTERFLY_R2(c, x, y)                                                  \
+    (EITHER(                                                                   \
+        r2 == 1,                                                               \
+        BUTTERFLY_1(x, y, card),                                               \
+        EITHER(                                                                \
+            r2 < card - 1,                                                     \
+            BUTTERFLY_3(c, x, y, card),                                        \
+            BUTTERFLY_2(x, y, card))));
+#define BUTTERFLY_R3(c, x, y)                                                  \
+    (EITHER(                                                                   \
+        r3 == 1,                                                               \
+        BUTTERFLY_1(x, y, card),                                               \
+        EITHER(                                                                \
+            r3 < card - 1,                                                     \
+            BUTTERFLY_3(c, x, y, card),                                        \
+            BUTTERFLY_2(x, y, card))));
+
+    const size_t end = (len > 1) ? len - 1 : 0; // len == 0 must not wrap
+    const unsigned bufs_nb = buf.get_n();
+    // #pragma omp parallel for
+    // #pragma unroll
+    const std::vector<uint32_t*>& mem = buf.get_mem();
+    for (unsigned i = start; i < bufs_nb; i += step) {
+        m256i x1, y1, u1, v1;
+        m256i x2, y2, u2, v2;
+        m256i* __restrict p = reinterpret_cast<m256i*>(mem[i]);
+        m256i* __restrict q = reinterpret_cast<m256i*>(mem[i + m]);
+        m256i* __restrict r = reinterpret_cast<m256i*>(mem[i + 2 * m]);
+        m256i* __restrict s = reinterpret_cast<m256i*>(mem[i + 3 * m]);
+
+        // #pragma omp parallel for
+        size_t j = 0;
+        // #pragma unroll
+        for (; j < end; j += 2) {
+            // First layer (c1, x, y) & (c1, u, v)
+            x1 = LOAD(p + j);
+            y1 = LOAD(q + j);
+            x2 = LOAD(p + j + 1);
+            y2 = LOAD(q + j + 1);
+
+            u1 = LOAD(r + j);
+            v1 = LOAD(s + j);
+            u2 = LOAD(r + j + 1);
+            v2 = LOAD(s + j + 1);
+
+            BUTTERFLY_R1(c1, &x1, &y1);
+            BUTTERFLY_R1(c1, &x2, &y2);
+
+            BUTTERFLY_R1(c1, &u1, &v1);
+            BUTTERFLY_R1(c1, &u2, &v2);
+
+            // Second layer (c2, x, u) & (c3, y, v)
+            BUTTERFLY_R2(c2, &x1, &u1);
+            BUTTERFLY_R2(c2, &x2, &u2);
+
+            BUTTERFLY_R3(c3, &y1, &v1);
+            BUTTERFLY_R3(c3, &y2, &v2);
+
+            // Store back to memory
+            STORE(p + j, x1);
+            STORE(p + j + 1, x2);
+            STORE(q + j, y1);
+            STORE(q + j + 1, y2);
+
+            STORE(r + j, u1);
+            STORE(r + j + 1, u2);
+            STORE(s + j, v1);
+            STORE(s + j + 1, v2);
+        }
+        for (; j < len; ++j) {
+            // First layer (c1, x, y) & (c1, u, v)
+            x1 = LOAD(p + j);
+            y1 = LOAD(q + j);
+            u1 = LOAD(r + j);
+            v1 = LOAD(s + j);
+
+            BUTTERFLY_R1(c1, &x1, &y1);
+            BUTTERFLY_R1(c1, &u1, &v1);
+            // Second layer (c2, x, u) & (c3, y, v)
+            BUTTERFLY_R2(c2, &x1, &u1);
+            BUTTERFLY_R3(c3, &y1, &v1);
+            // Store back to memory
+            STORE(p + j, x1);
+            STORE(q + j, y1);
+            STORE(r + j, u1);
+            STORE(s + j, v1);
+        }
+    }
+}
 
-    // filter elements of both of a & b = card-1
-    m256i cmp = _mm256_and_si256(
-        _mm256_cmpeq_epi32(_a, F4minus1_m256i),
-        _mm256_cmpeq_epi32(_b, F4minus1_m256i));
+/**
+ * Vectorized butterfly GS step
+ *
+ * For each pair (P, Q) = (buf[i], buf[i + m]) for step = 2 * m and coef `r`
+ *      P = P + Q
+ *      Q = r * (P - Q)
+ *
+ * @param buf - working buffers
+ * @param r - coefficient
+ * @param start - index of buffer among `m` ones
+ * @param m - current group size
+ * @param len - number of vectors per buffer
+ * @param card - modulo cardinal
+ */
+inline void butterfly_gs_step(
+    vec::Buffers<uint32_t>& buf,
+    uint32_t r,
+    unsigned start,
+    unsigned m,
+    size_t len,
+    uint32_t card)
+{
+    const unsigned step = m << 1;
+    m256i c = SET1(r);
+
+#define BUTTERFLY_GS(x, y)                                                     \
+    (EITHER(                                                                   \
+        r == 1,                                                                \
+        BUTTERFLY_1(x, y, card),                                               \
+        EITHER(                                                                \
+            r < card - 1,                                                      \
+            BUTTERFLY_5(c, x, y, card),                                        \
+            BUTTERFLY_4(x, y, card))));
+
+    const size_t end = (len > 3) ? len - 3 : 0; // len < 3 must not wrap
+    const unsigned bufs_nb = buf.get_n();
+    // #pragma omp parallel for
+    // #pragma unroll
+    const std::vector<uint32_t*>& mem = buf.get_mem();
+    for (unsigned i = start; i < bufs_nb; i += step) {
+        m256i x1, x2, x3, x4;
+        m256i y1, y2, y3, y4;
+        m256i* __restrict p = reinterpret_cast<m256i*>(mem[i]);
+        m256i* __restrict q = reinterpret_cast<m256i*>(mem[i + m]);
+
+        // #pragma omp parallel for
+        size_t j = 0;
+        // #pragma unroll
+        for (; j < end; j += 4) {
+            x1 = LOAD(p + j);
+            x2 = LOAD(p + j + 1);
+            x3 = LOAD(p + j + 2);
+            x4 = LOAD(p + j + 3);
+            y1 = LOAD(q + j);
+            y2 = LOAD(q + j + 1);
+            y3 = LOAD(q + j + 2);
+            y4 = LOAD(q + j + 3);
+
+            BUTTERFLY_GS(&x1, &y1);
+            BUTTERFLY_GS(&x2, &y2);
+            BUTTERFLY_GS(&x3, &y3);
+            BUTTERFLY_GS(&x4, &y4);
+
+            // Store back to memory
+            STORE(p + j, x1);
+            STORE(p + j + 1, x2);
+            STORE(p + j + 2, x3);
+            STORE(p + j + 3, x4);
+            STORE(q + j, y1);
+            STORE(q + j + 1, y2);
+            STORE(q + j + 2, y3);
+            STORE(q + j + 3, y4);
+        }
+        for (; j < len; ++j) {
+            x1 = LOAD(p + j);
+            y1 = LOAD(q + j);
 
-    const m256i one = _mm256_set1_epi32(1);
-    c = _mm256_add_epi32(c, _mm256_and_si256(one, cmp));
+            BUTTERFLY_GS(&x1, &y1);
 
-    // Modulo
-    return mod_after_multiply_f4(c);
+            // Store back to memory
+            STORE(p + j, x1);
+            STORE(q + j, y1);
+        }
+    }
 }
 
-inline m256i mul_f3(m256i a, m256i b)
+/**
+ * Vectorized butterfly GS step (simple variant: only Q is written)
+ *
+ * For each pair (P, Q) = (buf[i], buf[i + m]) for step = 2 * m and coef `r`
+ *      Q = r * P
+ *
+ * @param buf - working buffers
+ * @param r - coefficient
+ * @param start - index of buffer among `m` ones
+ * @param m - current group size
+ * @param len - number of vectors per buffer
+ * @param card - modulo cardinal
+ */
+inline void butterfly_gs_step_simple(
+    vec::Buffers<uint32_t>& buf,
+    uint32_t r,
+    unsigned start,
+    unsigned m,
+    size_t len,
+    uint32_t card)
 {
-    m256i _a = _mm256_load_si256(&a);
-    m256i _b = _mm256_load_si256(&b);
-
-    m256i c = _mm256_mullo_epi32(_a, _b);
-
-    // filter elements of both of a & b = card-1
-    m256i cmp = _mm256_and_si256(
-        _mm256_cmpeq_epi32(_a, F3minus1_m256i),
-        _mm256_cmpeq_epi32(_b, F3minus1_m256i));
+    const unsigned step = m << 1;
+    m256i c = SET1(r);
+
+#define BUTTERFLY_GS_S(x)                                                      \
+    (EITHER(                                                                   \
+        r == 1,                                                                \
+        (x),                                                                   \
+        EITHER(r < card - 1, MUL_MOD(c, x, card), NEG_MOD(x, card))));
+
+    const size_t end = (len > 1) ? len - 1 : 0; // len == 0 must not wrap
+    const unsigned bufs_nb = buf.get_n();
+    // #pragma omp parallel for
+    // #pragma unroll
+    const std::vector<uint32_t*>& mem = buf.get_mem();
+    for (unsigned i = start; i < bufs_nb; i += step) {
+        m256i x1, y1;
+        m256i x2, y2;
+        m256i* __restrict p = reinterpret_cast<m256i*>(mem[i]);
+        m256i* __restrict q = reinterpret_cast<m256i*>(mem[i + m]);
+
+        // #pragma omp parallel for
+        size_t j = 0;
+        // #pragma unroll
+        for (; j < end; j += 2) {
+            x1 = LOAD(p + j);
+            x2 = LOAD(p + j + 1);
+
+            y1 = BUTTERFLY_GS_S(x1);
+            y2 = BUTTERFLY_GS_S(x2);
+
+            // Store back to memory
+            STORE(q + j, y1);
+            STORE(q + j + 1, y2);
+        }
+        for (; j < len; ++j) {
+            x1 = LOAD(p + j);
 
-    c = _mm256_xor_si256(c, _mm256_and_si256(F4_m256i, cmp));
+            y1 = BUTTERFLY_GS_S(x1);
 
-    // Modulo
-    return mod_after_multiply_f3(c);
+            // Store back to memory
+            STORE(q + j, y1);
+        }
+    }
 }
 
-/** Perform multiplication of two numbers a, b whose elements are of GF(card)
- *  where `card` is a prime Fermat number, i.e. card = Fx with x < 5
- *  Currently, it supports only for F3 and F4
- */
-inline m256i mul(m256i a, m256i b, aint32 card)
+inline void add_props(
+    Properties& props,
+    m256i threshold,
+    m256i mask,
+    m256i symb,
+    off_t offset)
 {
-    assert(card == F4 || card == F3);
-    if (card == F4)
-        return mul_f4(a, b);
-    return mul_f3(a, b);
+    const m256i b = CMPEQ32(threshold, symb);
+    const m256i c = AND(mask, b);
+    uint32_t d = MVMSK8(c);
+    const unsigned element_size = sizeof(uint32_t);
+    while (d > 0) {
+        unsigned byte_idx = __builtin_ctz(d); // lowest set bit of `d`
+        off_t _offset = offset + byte_idx / element_size;
+        props.add(_offset, OOR_MARK);
+        d ^= 1u << byte_idx; // unsigned mask: byte_idx may be 31
+    }
 }
 
-/** Apply an element-wise negation to a buffer
- */
-inline void neg(size_t len, aint32* buf, aint32 card = F4)
+inline void encode_post_process(
+    vec::Buffers<uint32_t>& output,
+    std::vector<Properties>& props,
+    off_t offset,
+    unsigned code_len,
+    uint32_t threshold,
+    size_t vecs_nb)
 {
-    m256i* _buf = reinterpret_cast<m256i*>(buf);
-    unsigned ratio = sizeof(*_buf) / sizeof(*buf);
-    size_t _len = len / ratio;
-    size_t _last_len = len - _len * ratio;
+    const unsigned element_size = sizeof(uint32_t);
+    const unsigned vec_size = ALIGN_SIZE / element_size;
+    const uint32_t max = 1u << (element_size * 8 - 1);
+    const m256i _threshold = SET1(threshold);
+    const m256i mask_hi = SET1(max);
 
-    size_t i;
-    for (i = 0; i < _len; i++) {
-        _buf[i] = neg(_buf[i], card);
-    }
-    if (_last_len > 0) {
-        for (i = _len * ratio; i < len; i++) {
-            if (buf[i])
-                buf[i] = card - buf[i];
+    // #pragma unroll
+    const std::vector<uint32_t*>& mem = output.get_mem();
+    for (unsigned frag_id = 0; frag_id < code_len; ++frag_id) {
+        m256i* __restrict buf = reinterpret_cast<m256i*>(mem[frag_id]);
+
+        size_t vec_id = 0;
+        const size_t end = (vecs_nb > 3) ? vecs_nb - 3 : 0; // no wrap-around
+        // #pragma unroll
+        for (; vec_id < end; vec_id += 4) {
+            m256i a1 = LOAD(buf + vec_id);
+            m256i a2 = LOAD(buf + vec_id + 1);
+            m256i a3 = LOAD(buf + vec_id + 2);
+            m256i a4 = LOAD(buf + vec_id + 3);
+
+            if (TESTZ(a1, _threshold) == 0) {
+                const off_t curr_offset = offset + vec_id * vec_size;
+                add_props(props[frag_id], _threshold, mask_hi, a1, curr_offset);
+            }
+            if (TESTZ(a2, _threshold) == 0) {
+                const off_t curr_offset = offset + (vec_id + 1) * vec_size;
+                add_props(props[frag_id], _threshold, mask_hi, a2, curr_offset);
+            }
+            if (TESTZ(a3, _threshold) == 0) {
+                const off_t curr_offset = offset + (vec_id + 2) * vec_size;
+                add_props(props[frag_id], _threshold, mask_hi, a3, curr_offset);
+            }
+            if (TESTZ(a4, _threshold) == 0) {
+                const off_t curr_offset = offset + (vec_id + 3) * vec_size;
+                add_props(props[frag_id], _threshold, mask_hi, a4, curr_offset);
+            }
+        }
+        for (; vec_id < vecs_nb; ++vec_id) {
+            m256i a = LOAD(buf + vec_id);
+            uint32_t c = TESTZ(a, _threshold);
+            if (c == 0) {
+                const off_t curr_offset = offset + vec_id * vec_size;
+                add_props(props[frag_id], _threshold, mask_hi, a, curr_offset);
+            }
         }
     }
 }
 
+/* ==================== Operations =================== */
 /** Perform a multiplication of a coefficient `a` to each element of `src` and
  *  add result to correspondent element of `dest`
+ *
+ * @note: 1 < `a` < card - 1
  */
 inline void mul_coef_to_buf(
-    const aint32 a,
+    const uint32_t a,
     aint32* src,
     aint32* dest,
     size_t len,
-    aint32 card = F4)
+    uint32_t card)
 {
-    const m256i coef = _mm256_set1_epi32(a);
+    const m256i coef = SET1(a);
 
-    m256i* _src = reinterpret_cast<m256i*>(src);
-    m256i* _dest = reinterpret_cast<m256i*>(dest);
+    m256i* __restrict _src = reinterpret_cast<m256i*>(src);
+    m256i* __restrict _dest = reinterpret_cast<m256i*>(dest);
     const unsigned ratio = sizeof(*_src) / sizeof(*src);
     const size_t _len = len / ratio;
     const size_t _last_len = len - _len * ratio;
 
-    size_t i;
-    for (i = 0; i < _len; i++) {
+    size_t i = 0;
+    const size_t end = (_len > 3) ? _len - 3 : 0; // no wrap-around
+    for (; i < end; i += 4) {
         // perform multiplication
-        _dest[i] = mul(coef, _src[i], card);
+        MUL_MOD(coef, _src[i], _dest + i, card);
+        MUL_MOD(coef, _src[i + 1], _dest + i + 1, card);
+        MUL_MOD(coef, _src[i + 2], _dest + i + 2, card);
+        MUL_MOD(coef, _src[i + 3], _dest + i + 3, card);
+    }
+    for (; i < _len; ++i) {
+        MUL_MOD(coef, _src[i], _dest + i, card);
     }
+
     if (_last_len > 0) {
         uint64_t coef_64 = (uint64_t)a;
-        for (i = _len * ratio; i < len; i++) {
+        for (size_t i = _len * ratio; i < len; i++) {
             // perform multiplication
             dest[i] = (aint32)((coef_64 * src[i]) % card);
         }
     }
 }
 
-inline void
-add_two_bufs(aint32* src, aint32* dest, size_t len, aint32 card = F4)
+inline void add_two_bufs(aint32* src, aint32* dest, size_t len, aint32 card)
 {
-    m256i* _src = reinterpret_cast<m256i*>(src);
-    m256i* _dest = reinterpret_cast<m256i*>(dest);
+    m256i* __restrict _src = reinterpret_cast<m256i*>(src);
+    m256i* __restrict _dest = reinterpret_cast<m256i*>(dest);
     const unsigned ratio = sizeof(*_src) / sizeof(*src);
     const size_t _len = len / ratio;
     const size_t _last_len = len - _len * ratio;
@@ -251,7 +683,7 @@ add_two_bufs(aint32* src, aint32* dest, size_t len, aint32 card = F4)
     size_t i;
     for (i = 0; i < _len; i++) {
         // perform addition
-        _dest[i] = add(_src[i], _dest[i], card);
+        _dest[i] = ADD_MOD(_src[i], _dest[i], card);
     }
     if (_last_len > 0) {
         for (i = _len * ratio; i < len; i++) {
@@ -269,9 +701,9 @@ inline void sub_two_bufs(
     size_t len,
     aint32 card = F4)
 {
-    m256i* _bufa = reinterpret_cast<m256i*>(bufa);
-    m256i* _bufb = reinterpret_cast<m256i*>(bufb);
-    m256i* _res = reinterpret_cast<m256i*>(res);
+    m256i* __restrict _bufa = reinterpret_cast<m256i*>(bufa);
+    m256i* __restrict _bufb = reinterpret_cast<m256i*>(bufb);
+    m256i* __restrict _res = reinterpret_cast<m256i*>(res);
     const unsigned ratio = sizeof(*_bufa) / sizeof(*bufa);
     const size_t _len = len / ratio;
     const size_t _last_len = len - _len * ratio;
@@ -279,7 +711,7 @@ inline void sub_two_bufs(
     size_t i;
     for (i = 0; i < _len; i++) {
         // perform subtraction
-        _res[i] = sub(_bufa[i], _bufb[i], card);
+        _res[i] = SUB_MOD(_bufa[i], _bufb[i], card);
     }
     if (_last_len > 0) {
         for (i = _len * ratio; i < len; i++) {
@@ -292,11 +724,10 @@ inline void sub_two_bufs(
     }
 }
 
-inline void
-mul_two_bufs(aint32* src, aint32* dest, size_t len, aint32 card = F4)
+inline void mul_two_bufs(aint32* src, aint32* dest, size_t len, aint32 card)
 {
-    m256i* _src = reinterpret_cast<m256i*>(src);
-    m256i* _dest = reinterpret_cast<m256i*>(dest);
+    m256i* __restrict _src = reinterpret_cast<m256i*>(src);
+    m256i* __restrict _dest = reinterpret_cast<m256i*>(dest);
     const unsigned ratio = sizeof(*_src) / sizeof(*src);
     const size_t _len = len / ratio;
     const size_t _last_len = len - _len * ratio;
@@ -304,7 +735,7 @@ mul_two_bufs(aint32* src, aint32* dest, size_t len, aint32 card = F4)
     size_t i;
     for (i = 0; i < _len; i++) {
         // perform multiplicaton
-        _dest[i] = mul(_src[i], _dest[i], card);
+        _dest[i] = MULFULL_MOD(_src[i], _dest[i], card);
     }
     if (_last_len > 0) {
         for (i = _len * ratio; i < len; i++) {
@@ -314,81 +745,23 @@ mul_two_bufs(aint32* src, aint32* dest, size_t len, aint32 card = F4)
     }
 }
 
-/*
- * buf1[i] = buf1[i] + coef * buf2[i]
- * buf2[i] = buf1[i] - coef * buf2[i]
- */
-inline void butterfly_ct(
-    uint32_t coef,
-    aint32* buf1,
-    aint32* buf2,
-    size_t len,
-    uint32_t card = F4)
-{
-    const m256i _coef = _mm256_set1_epi32(coef);
-    m256i* _buf1 = reinterpret_cast<m256i*>(buf1);
-    m256i* _buf2 = reinterpret_cast<m256i*>(buf2);
-
-    for (size_t i = 0; i < len; ++i) {
-        m256i a = mul(_coef, _buf2[i], card);
-        _buf2[i] = sub(_buf1[i], a, card);
-        _buf1[i] = add(_buf1[i], a, card);
-    }
-}
-
-/*
- * buf1[i] = buf1[i] + buf2[i]
- * buf2[i] = coef * (buf1[i] - buf2[i])
+/** Apply an element-wise negation to a buffer
  */
-inline void butterfly_gs(
-    uint32_t coef,
-    aint32* buf1,
-    aint32* buf2,
-    size_t len,
-    uint32_t card = F4)
+inline void neg(size_t len, aint32* buf, aint32 card = F4)
 {
-    const m256i _coef = _mm256_set1_epi32(coef);
-    m256i* _buf1 = reinterpret_cast<m256i*>(buf1);
-    m256i* _buf2 = reinterpret_cast<m256i*>(buf2);
+    m256i* _buf = reinterpret_cast<m256i*>(buf);
+    unsigned ratio = sizeof(*_buf) / sizeof(*buf);
+    size_t _len = len / ratio;
+    size_t _last_len = len - _len * ratio;
 
-    for (size_t i = 0; i < len; ++i) {
-        m256i a = add(_buf1[i], _buf2[i], card);
-        _buf2[i] = mul(_coef, sub(_buf1[i], _buf2[i], card), card);
-        _buf1[i] = a;
+    size_t i;
+    for (i = 0; i < _len; i++) {
+        _buf[i] = NEG_MOD(_buf[i], card);
     }
-}
-
-inline void encode_post_process(
-    vec::Buffers<uint32_t>& output,
-    std::vector<Properties>& props,
-    off_t offset,
-    unsigned code_len,
-    uint32_t threshold,
-    size_t vecs_nb)
-{
-    const unsigned vec_size = simd::countof<uint32_t>();
-
-    const m256i _threshold = _mm256_set1_epi32(threshold);
-    const uint32_t max = 1 << (sizeof(uint32_t) * 8 - 1);
-    const m256i mask_hi = _mm256_set1_epi32(max);
-    const unsigned element_size = sizeof(uint32_t);
-
-    for (unsigned frag_id = 0; frag_id < code_len; ++frag_id) {
-        uint32_t* chunk = output.get(frag_id);
-        m256i* buf = reinterpret_cast<m256i*>(chunk);
-        for (unsigned vec_id = 0; vec_id < vecs_nb; ++vec_id) {
-            const m256i a = _mm256_load_si256(&(buf[vec_id]));
-            const m256i b = _mm256_cmpeq_epi32(_threshold, a);
-            const m256i c = _mm256_and_si256(mask_hi, b);
-            uint32_t d = _mm256_movemask_epi8(c);
-
-            while (d > 0) {
-                unsigned byte_idx = __builtin_ctz(d);
-                unsigned element_idx = byte_idx / element_size;
-                off_t _offset = offset + vec_id * vec_size + element_idx;
-                props[frag_id].add(_offset, 1);
-                d ^= 1 << byte_idx;
-            }
+    if (_last_len > 0) {
+        for (i = _len * ratio; i < len; i++) {
+            if (buf[i])
+                buf[i] = card - buf[i];
         }
     }
 }
@@ -408,7 +781,7 @@ inline __uint128_t add(__uint128_t a, __uint128_t b)
 {
     m256i _a = _mm256_castsi128_si256((m128i)a);
     m256i _b = _mm256_castsi128_si256((m128i)b);
-    m256i res = add(_a, _b, F4);
+    m256i res = ADD_MOD(_a, _b, F4);
     return m256i_to_uint128(res);
 }
 
@@ -416,7 +789,7 @@ inline __uint128_t sub(__uint128_t a, __uint128_t b)
 {
     m256i _a = _mm256_castsi128_si256((m128i)a);
     m256i _b = _mm256_castsi128_si256((m128i)b);
-    m256i res = sub(_a, _b, F4);
+    m256i res = SUB_MOD(_a, _b, F4);
     return m256i_to_uint128(res);
 }
 
@@ -424,7 +797,7 @@ inline __uint128_t mul(__uint128_t a, __uint128_t b)
 {
     m256i _a = _mm256_castsi128_si256((m128i)a);
     m256i _b = _mm256_castsi128_si256((m128i)b);
-    m256i res = mul(_a, _b, F4);
+    m256i res = MULFULL_MOD(_a, _b, F4);
     return m256i_to_uint128(res);
 }
 
@@ -446,7 +819,7 @@ inline void hadamard_mul(int n, aint128* _x, aint128* _y)
 
     // multiply y to the first half of `x`
     for (i = 0; i < len_256; i++) {
-        x[i] = mul(x[i], y[i], F4);
+        x[i] = MULFULL_MOD(x[i], y[i], F4);
     }
 
     if (last_len > 0) {

From 8e7fb47b699a8cb99125b8fb4aa6c2cb6a470291 Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Fri, 5 Oct 2018 13:42:31 +0200
Subject: [PATCH 07/77] SIMD 256 u16 u32: remove useless files

---
 src/simd_256_u16.h | 745 ----------------------------------------
 src/simd_256_u32.h | 839 ---------------------------------------------
 2 files changed, 1584 deletions(-)
 delete mode 100644 src/simd_256_u16.h
 delete mode 100644 src/simd_256_u32.h

diff --git a/src/simd_256_u16.h b/src/simd_256_u16.h
deleted file mode 100644
index 74d35d24..00000000
--- a/src/simd_256_u16.h
+++ /dev/null
@@ -1,745 +0,0 @@
-/*
- * Copyright 2017-2018 Scality
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice,
- *    this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- *    this list of conditions and the following disclaimer in the documentation
- *    and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its contributors
- *    may be used to endorse or promote products derived from this software
- *    without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef __QUAD_SIMD_256_U16_H__
-#define __QUAD_SIMD_256_U16_H__
-
-#include <x86intrin.h>
-
-#include "simd/simd.h"
-
-namespace quadiron {
-namespace simd {
-
-#define F3_u16 _mm256_set1_epi16(257)
-#define F3m1_u16 _mm256_set1_epi16(256)
-
-/* ==================== Essential Operations =================== */
-// Following functions are used for AVX2 w/ u16 only
-
-inline m256i SET1(uint16_t val)
-{
-    return _mm256_set1_epi16(val);
-}
-inline m256i ADD16(m256i x, m256i y)
-{
-    return _mm256_add_epi16(x, y);
-}
-inline m256i SUB16(m256i x, m256i y)
-{
-    return _mm256_sub_epi16(x, y);
-}
-inline m256i MUL16(m256i x, m256i y)
-{
-    return _mm256_mullo_epi16(x, y);
-}
-
-inline m256i CMPEQ16(m256i x, m256i y)
-{
-    return _mm256_cmpeq_epi16(x, y);
-}
-inline m256i CMPGT16(m256i x, m256i y)
-{
-    return _mm256_cmpgt_epi16(x, y);
-}
-inline m256i MINU16(m256i x, m256i y)
-{
-    return _mm256_min_epu16(x, y);
-}
-
-#define MASK8_LO (_mm256_set1_epi16(0x80))
-#define BLEND8(x, y, mask) (_mm256_blendv_epi8(x, y, mask))
-
-// z = x + y mod q
-// Input are loaded to registers
-// Output is register
-inline m256i ADD_MOD(m256i x, m256i y, uint16_t q)
-{
-    m256i res = ADD16(x, y);
-    return MINU16(res, SUB16(res, F3_u16));
-}
-
-// z = x - y mod q => z = q + x - y mod q
-// Input are loaded to registers
-// Output is register
-inline m256i SUB_MOD(m256i x, m256i y, uint16_t q)
-{
-    m256i res = SUB16(x, y);
-    return MINU16(res, SUB16(ADD16(x, F3_u16), y));
-}
-
-// y = 0 - x mod q => y = q - x mod q
-// Input are loaded to registers
-// Output is register
-inline m256i NEG_MOD(m256i x, uint16_t q)
-{
-    m256i res = SUB16(F3_u16, x);
-    return MINU16(res, SUB16(res, F3_u16));
-}
-
-// z = x * y mod q
-// Input are loaded to registers
-// Output is register
-// Note: we assume that at least `x` or `y` is less than `q-1` so it's
-// not necessary to verify overflow on multiplying elements
-inline m256i MUL_MOD(m256i x, m256i y, uint16_t q)
-{
-    m256i res = MUL16(x, y);
-    m256i lo = BLEND8(ZERO, res, MASK8_LO);
-    m256i hi = BLEND8(ZERO, SHIFTR_1(res), MASK8_LO);
-    return SUB_MOD(lo, hi, q);
-}
-
-// z = x * y mod q
-// Input are loaded to registers
-// Output is register
-inline m256i MULFULL_MOD(m256i x, m256i y, uint16_t q)
-{
-    m256i res = MUL16(x, y);
-
-    // filter elements of both of a & b = card-1
-    m256i cmp = AND(CMPEQ16(x, F3m1_u16), CMPEQ16(y, F3m1_u16));
-    res = ADD16(res, AND(ONE, cmp));
-
-    m256i lo = BLEND8(ZERO, res, MASK8_LO);
-    m256i hi = BLEND8(ZERO, SHIFTR_1(res), MASK8_LO);
-    return SUB_MOD(lo, hi, q);
-}
-
-// butterfly CT with r == 1
-inline void BUTTERFLY_1(m256i* x, m256i* y, uint16_t q)
-{
-    m256i add = ADD_MOD(*x, *y, q);
-    *y = SUB_MOD(*x, *y, q);
-    *x = add;
-}
-
-// butterfly CT with r == q - 1
-inline void BUTTERFLY_2(m256i* x, m256i* y, uint16_t q)
-{
-    m256i add = ADD_MOD(*x, *y, q);
-    *x = SUB_MOD(*x, *y, q);
-    *y = add;
-}
-
-// butterfly CT with 1 < r < q - 1
-inline void BUTTERFLY_3(m256i c, m256i* x, m256i* y, uint16_t q)
-{
-    m256i z = MUL_MOD(c, *y, q);
-    *y = SUB_MOD(*x, z, q);
-    *x = ADD_MOD(*x, z, q);
-}
-
-// butterfly GS w/ r = q - 1
-inline void BUTTERFLY_4(m256i* x, m256i* y, uint16_t q)
-{
-    m256i add = ADD_MOD(*x, *y, q);
-    *y = SUB_MOD(*y, *x, q);
-    *x = add;
-}
-
-// butterfly GS w/ 1 < r < q - 1
-// x = x + y mod q
-// y = z * (x - y) mod q
-inline void BUTTERFLY_5(m256i c, m256i* x, m256i* y, uint16_t q)
-{
-    m256i sub = SUB_MOD(*x, *y, q);
-    *x = ADD_MOD(*x, *y, q);
-    *y = MUL_MOD(c, sub, q);
-}
-
-/**
- * Vectorized butterly CT step
- *
- * For each pair (P, Q) = (buf[i], buf[i + m]) for step = 2 * m and coef `r`
- *      P = P + r * Q
- *      Q = P - r * Q
- *
- * @param buf - working buffers
- * @param r - coefficient
- * @param start - index of buffer among `m` ones
- * @param m - current group size
- * @param len - number of vectors per buffer
- * @param card - modulo cardinal
- */
-inline void butterfly_ct_step(
-    vec::Buffers<uint16_t>& buf,
-    uint16_t r,
-    unsigned start,
-    unsigned m,
-    size_t len,
-    uint16_t card)
-{
-    const unsigned step = m << 1;
-    m256i c = SET1(r);
-
-#define BUTTERFLY_CT(x, y)                                                     \
-    (EITHER(                                                                   \
-        r == 1,                                                                \
-        BUTTERFLY_1(x, y, card),                                               \
-        EITHER(                                                                \
-            r < card - 1,                                                      \
-            BUTTERFLY_3(c, x, y, card),                                        \
-            BUTTERFLY_2(x, y, card))));
-
-    const size_t end = len - 1;
-    const unsigned bufs_nb = buf.get_n();
-    // #pragma omp parallel for
-    // #pragma unroll
-    const std::vector<uint16_t*>& mem = buf.get_mem();
-    for (unsigned i = start; i < bufs_nb; i += step) {
-        m256i x1, y1;
-        m256i x2, y2;
-        m256i* __restrict p = reinterpret_cast<m256i*>(mem[i]);
-        m256i* __restrict q = reinterpret_cast<m256i*>(mem[i + m]);
-
-        // #pragma omp parallel for
-        size_t j = 0;
-        // #pragma unroll
-        for (; j < end; j += 2) {
-            x1 = LOAD(p + j);
-            y1 = LOAD(q + j);
-            x2 = LOAD(p + j + 1);
-            y2 = LOAD(q + j + 1);
-
-            BUTTERFLY_CT(&x1, &y1);
-            BUTTERFLY_CT(&x2, &y2);
-
-            // Store back to memory
-            STORE(p + j, x1);
-            STORE(p + j + 1, x2);
-            STORE(q + j, y1);
-            STORE(q + j + 1, y2);
-        }
-        for (; j < len; ++j) {
-            x1 = LOAD(p + j);
-            y1 = LOAD(q + j);
-
-            BUTTERFLY_CT(&x1, &y1);
-
-            // Store back to memory
-            STORE(p + j, x1);
-            STORE(q + j, y1);
-        }
-    }
-}
-
-/**
- * Vectorized butterly CT on two-layers at a time
- *
- * For each quadruple
- * (P, Q, R, S) = (buf[i], buf[i + m], buf[i + 2 * m], buf[i + 3 * m])
- * First layer: butterfly on (P, Q) and (R, S) for step = 2 * m
- *      coef r1 = W[start * n / (2 * m)]
- *      P = P + r1 * Q
- *      Q = P - r1 * Q
- *      R = R + r1 * S
- *      S = R - r1 * S
- * Second layer: butterfly on (P, R) and (Q, S) for step = 4 * m
- *      coef r2 = W[start * n / (4 * m)]
- *      coef r3 = W[(start + m) * n / (4 * m)]
- *      P = P + r2 * R
- *      R = P - r2 * R
- *      Q = Q + r3 * S
- *      S = Q - r3 * S
- *
- * @param buf - working buffers
- * @param r1 - coefficient for the 1st layer
- * @param r2 - 1st coefficient for the 2nd layer
- * @param r3 - 2nd coefficient for the 2nd layer
- * @param start - index of buffer among `m` ones
- * @param m - current group size
- * @param len - number of vectors per buffer
- * @param card - modulo cardinal
- */
-inline void butterfly_ct_two_layers_step(
-    vec::Buffers<uint16_t>& buf,
-    uint16_t r1,
-    uint16_t r2,
-    uint16_t r3,
-    unsigned start,
-    unsigned m,
-    size_t len,
-    uint16_t card)
-{
-    const unsigned step = m << 2;
-    m256i c1 = SET1(r1);
-    m256i c2 = SET1(r2);
-    m256i c3 = SET1(r3);
-
-#define BUTTERFLY_R1(c, x, y)                                                  \
-    (EITHER(                                                                   \
-        r1 == 1,                                                               \
-        BUTTERFLY_1(x, y, card),                                               \
-        EITHER(                                                                \
-            r1 < card - 1,                                                     \
-            BUTTERFLY_3(c, x, y, card),                                        \
-            BUTTERFLY_2(x, y, card))));
-#define BUTTERFLY_R2(c, x, y)                                                  \
-    (EITHER(                                                                   \
-        r2 == 1,                                                               \
-        BUTTERFLY_1(x, y, card),                                               \
-        EITHER(                                                                \
-            r2 < card - 1,                                                     \
-            BUTTERFLY_3(c, x, y, card),                                        \
-            BUTTERFLY_2(x, y, card))));
-#define BUTTERFLY_R3(c, x, y)                                                  \
-    (EITHER(                                                                   \
-        r3 == 1,                                                               \
-        BUTTERFLY_1(x, y, card),                                               \
-        EITHER(                                                                \
-            r3 < card - 1,                                                     \
-            BUTTERFLY_3(c, x, y, card),                                        \
-            BUTTERFLY_2(x, y, card))));
-
-    const size_t end = len - 1;
-    const unsigned bufs_nb = buf.get_n();
-    // #pragma omp parallel for
-    // #pragma unroll
-    const std::vector<uint16_t*>& mem = buf.get_mem();
-    for (unsigned i = start; i < bufs_nb; i += step) {
-        m256i x1, y1, u1, v1;
-        m256i x2, y2, u2, v2;
-        m256i* __restrict p = reinterpret_cast<m256i*>(mem[i]);
-        m256i* __restrict q = reinterpret_cast<m256i*>(mem[i + m]);
-        m256i* __restrict r = reinterpret_cast<m256i*>(mem[i + 2 * m]);
-        m256i* __restrict s = reinterpret_cast<m256i*>(mem[i + 3 * m]);
-
-        // #pragma omp parallel for
-        size_t j = 0;
-        // #pragma unroll
-        for (; j < end; j += 2) {
-            // First layer (c1, x, y) & (c1, u, v)
-            x1 = LOAD(p + j);
-            y1 = LOAD(q + j);
-            x2 = LOAD(p + j + 1);
-            y2 = LOAD(q + j + 1);
-
-            u1 = LOAD(r + j);
-            v1 = LOAD(s + j);
-            u2 = LOAD(r + j + 1);
-            v2 = LOAD(s + j + 1);
-
-            BUTTERFLY_R1(c1, &x1, &y1);
-            BUTTERFLY_R1(c1, &x2, &y2);
-
-            BUTTERFLY_R1(c1, &u1, &v1);
-            BUTTERFLY_R1(c1, &u2, &v2);
-
-            // Second layer (c2, x, u) & (c3, y, v)
-            BUTTERFLY_R2(c2, &x1, &u1);
-            BUTTERFLY_R2(c2, &x2, &u2);
-
-            BUTTERFLY_R3(c3, &y1, &v1);
-            BUTTERFLY_R3(c3, &y2, &v2);
-
-            // Store back to memory
-            STORE(p + j, x1);
-            STORE(p + j + 1, x2);
-            STORE(q + j, y1);
-            STORE(q + j + 1, y2);
-
-            STORE(r + j, u1);
-            STORE(r + j + 1, u2);
-            STORE(s + j, v1);
-            STORE(s + j + 1, v2);
-        }
-        for (; j < len; ++j) {
-            // First layer (c1, x, y) & (c1, u, v)
-            x1 = LOAD(p + j);
-            y1 = LOAD(q + j);
-            u1 = LOAD(r + j);
-            v1 = LOAD(s + j);
-
-            BUTTERFLY_R1(c1, &x1, &y1);
-            BUTTERFLY_R1(c1, &u1, &v1);
-            // Second layer (c2, x, u) & (c3, y, v)
-            BUTTERFLY_R2(c2, &x1, &u1);
-            BUTTERFLY_R3(c3, &y1, &v1);
-            // Store back to memory
-            STORE(p + j, x1);
-            STORE(q + j, y1);
-            STORE(r + j, u1);
-            STORE(s + j, v1);
-        }
-    }
-}
-
-/**
- * Vectorized butterly GS step
- *
- * For each pair (P, Q) = (buf[i], buf[i + m]) for step = 2 * m and coef `r`
- *      P = P + Q
- *      Q = r * (P - Q)
- *
- * @param buf - working buffers
- * @param r - coefficient
- * @param start - index of buffer among `m` ones
- * @param m - current group size
- * @param len - number of vectors per buffer
- * @param card - modulo cardinal
- */
-inline void butterfly_gs_step(
-    vec::Buffers<uint16_t>& buf,
-    uint16_t r,
-    unsigned start,
-    unsigned m,
-    size_t len,
-    uint16_t card)
-{
-    const unsigned step = m << 1;
-    m256i c = SET1(r);
-
-#define BUTTERFLY_GS(x, y)                                                     \
-    (EITHER(                                                                   \
-        r == 1,                                                                \
-        BUTTERFLY_1(x, y, card),                                               \
-        EITHER(                                                                \
-            r < card - 1,                                                      \
-            BUTTERFLY_5(c, x, y, card),                                        \
-            BUTTERFLY_4(x, y, card))));
-
-    const size_t end = len - 1;
-    const unsigned bufs_nb = buf.get_n();
-    // #pragma omp parallel for
-    // #pragma unroll
-    const std::vector<uint16_t*>& mem = buf.get_mem();
-    for (unsigned i = start; i < bufs_nb; i += step) {
-        m256i x1, y1;
-        m256i x2, y2;
-        m256i* __restrict p = reinterpret_cast<m256i*>(mem[i]);
-        m256i* __restrict q = reinterpret_cast<m256i*>(mem[i + m]);
-
-        // #pragma omp parallel for
-        size_t j = 0;
-        // #pragma unroll
-        for (; j < end; j += 2) {
-            x1 = LOAD(p + j);
-            y1 = LOAD(q + j);
-            x2 = LOAD(p + j + 1);
-            y2 = LOAD(q + j + 1);
-
-            BUTTERFLY_GS(&x1, &y1);
-            BUTTERFLY_GS(&x2, &y2);
-
-            // Store back to memory
-            STORE(p + j, x1);
-            STORE(p + j + 1, x2);
-            STORE(q + j, y1);
-            STORE(q + j + 1, y2);
-        }
-        for (; j < len; ++j) {
-            x1 = LOAD(p + j);
-            y1 = LOAD(q + j);
-
-            BUTTERFLY_GS(&x1, &y1);
-
-            // Store back to memory
-            STORE(p + j, x1);
-            STORE(q + j, y1);
-        }
-    }
-}
-
-/**
- * Vectorized butterly GS step
- *
- * For each pair (P, Q) = (buf[i], buf[i + m]) for step = 2 * m and coef `r`
- *      Q = r * Q
- *
- * @param buf - working buffers
- * @param r - coefficient
- * @param start - index of buffer among `m` ones
- * @param m - current group size
- * @param len - number of vectors per buffer
- * @param card - modulo cardinal
- */
-inline void butterfly_gs_step_simple(
-    vec::Buffers<uint16_t>& buf,
-    uint16_t r,
-    unsigned start,
-    unsigned m,
-    size_t len,
-    uint16_t card)
-{
-    const unsigned step = m << 1;
-    m256i c = SET1(r);
-
-#define BUTTERFLY_GS_S(x)                                                      \
-    (EITHER(                                                                   \
-        r == 1,                                                                \
-        (x),                                                                   \
-        EITHER(r < card - 1, MUL_MOD(c, x, card), NEG_MOD(x, card))));
-
-    const size_t end = len - 1;
-    const unsigned bufs_nb = buf.get_n();
-    // #pragma omp parallel for
-    // #pragma unroll
-    const std::vector<uint16_t*>& mem = buf.get_mem();
-    for (unsigned i = start; i < bufs_nb; i += step) {
-        m256i x1, y1;
-        m256i x2, y2;
-        m256i* __restrict p = reinterpret_cast<m256i*>(mem[i]);
-        m256i* __restrict q = reinterpret_cast<m256i*>(mem[i + m]);
-
-        // #pragma omp parallel for
-        size_t j = 0;
-        // #pragma unroll
-        for (; j < end; j += 2) {
-            x1 = LOAD(p + j);
-            x2 = LOAD(p + j + 1);
-
-            y1 = BUTTERFLY_GS_S(x1);
-            y2 = BUTTERFLY_GS_S(x2);
-
-            // Store back to memory
-            STORE(q + j, y1);
-            STORE(q + j + 1, y2);
-        }
-        for (; j < len; ++j) {
-            x1 = LOAD(p + j);
-
-            y1 = BUTTERFLY_GS_S(x1);
-
-            // Store back to memory
-            STORE(q + j, y1);
-        }
-    }
-}
-
-inline void add_props_16(
-    Properties& props,
-    m256i threshold,
-    m256i mask,
-    m256i symb,
-    off_t offset)
-{
-    const m256i b = CMPEQ16(threshold, symb);
-    const m256i c = AND(mask, b);
-    uint32_t d = MVMSK8(c);
-    const unsigned element_size = sizeof(uint16_t);
-    while (d > 0) {
-        unsigned byte_idx = __builtin_ctz(d);
-        off_t _offset = offset + byte_idx / element_size;
-        props.add(_offset, OOR_MARK);
-        d ^= 1 << byte_idx;
-    }
-}
-
-inline void encode_post_process(
-    vec::Buffers<uint16_t>& output,
-    std::vector<Properties>& props,
-    off_t offset,
-    unsigned code_len,
-    uint16_t threshold,
-    size_t vecs_nb)
-{
-    const unsigned element_size = sizeof(uint16_t);
-    const unsigned vec_size = ALIGN_SIZE / element_size;
-    const uint16_t max = 1 << (element_size * 8 - 1);
-    const m256i _threshold = SET1(threshold);
-    const m256i mask_hi = SET1(max);
-
-    // #pragma unroll
-    const std::vector<uint16_t*>& mem = output.get_mem();
-    for (unsigned frag_id = 0; frag_id < code_len; ++frag_id) {
-        m256i* __restrict buf = reinterpret_cast<m256i*>(mem[frag_id]);
-
-        size_t vec_id = 0;
-        size_t end = vecs_nb - 3;
-        // #pragma unroll
-        for (; vec_id < end; vec_id += 4) {
-            m256i a1 = LOAD(buf + vec_id);
-            m256i a2 = LOAD(buf + vec_id + 1);
-            m256i a3 = LOAD(buf + vec_id + 2);
-            m256i a4 = LOAD(buf + vec_id + 3);
-
-            if (TESTZ(a1, _threshold) == 0) {
-                const off_t curr_offset = offset + vec_id * vec_size;
-                add_props_16(
-                    props[frag_id], _threshold, mask_hi, a1, curr_offset);
-            }
-            if (TESTZ(a2, _threshold) == 0) {
-                const off_t curr_offset = offset + (vec_id + 1) * vec_size;
-                add_props_16(
-                    props[frag_id], _threshold, mask_hi, a2, curr_offset);
-            }
-            if (TESTZ(a3, _threshold) == 0) {
-                const off_t curr_offset = offset + (vec_id + 2) * vec_size;
-                add_props_16(
-                    props[frag_id], _threshold, mask_hi, a3, curr_offset);
-            }
-            if (TESTZ(a4, _threshold) == 0) {
-                const off_t curr_offset = offset + (vec_id + 3) * vec_size;
-                add_props_16(
-                    props[frag_id], _threshold, mask_hi, a4, curr_offset);
-            }
-        }
-        for (; vec_id < vecs_nb; ++vec_id) {
-            m256i a = LOAD(buf + vec_id);
-            uint16_t c = TESTZ(a, _threshold);
-            if (c == 0) {
-                const off_t curr_offset = offset + vec_id * vec_size;
-                add_props_16(
-                    props[frag_id], _threshold, mask_hi, a, curr_offset);
-            }
-        }
-    }
-}
-
-/* ==================== Operations =================== */
-/** Perform a multiplication of a coefficient `a` to each element of `src` and
- *  add result to correspondent element of `dest`
- *
- * @note: 1 < `a` < card - 1
- */
-inline void mul_coef_to_buf(
-    const uint16_t a,
-    aint16* src,
-    aint16* dest,
-    size_t len,
-    uint16_t card)
-{
-    const m256i coef = SET1(a);
-
-    m256i* __restrict _src = reinterpret_cast<m256i*>(src);
-    m256i* __restrict _dest = reinterpret_cast<m256i*>(dest);
-    const unsigned ratio = sizeof(*_src) / sizeof(*src);
-    const size_t _len = len / ratio;
-    const size_t _last_len = len - _len * ratio;
-
-    size_t i;
-    for (i = 0; i < _len; i++) {
-        // perform multiplication
-        _dest[i] = MUL_MOD(coef, _src[i], card);
-    }
-    if (_last_len > 0) {
-        uint32_t coef_32 = (uint32_t)a;
-        for (i = _len * ratio; i < len; i++) {
-            // perform multiplication
-            dest[i] = (aint16)((coef_32 * src[i]) % card);
-        }
-    }
-}
-
-inline void add_two_bufs(aint16* src, aint16* dest, size_t len, aint16 card)
-{
-    m256i* __restrict _src = reinterpret_cast<m256i*>(src);
-    m256i* __restrict _dest = reinterpret_cast<m256i*>(dest);
-    const unsigned ratio = sizeof(*_src) / sizeof(*src);
-    const size_t _len = len / ratio;
-    const size_t _last_len = len - _len * ratio;
-
-    size_t i;
-    for (i = 0; i < _len; i++) {
-        // perform addition
-        _dest[i] = ADD_MOD(_src[i], _dest[i], card);
-    }
-    if (_last_len > 0) {
-        for (i = _len * ratio; i < len; i++) {
-            // perform addition
-            aint16 tmp = src[i] + dest[i];
-            dest[i] = (tmp >= card) ? (tmp - card) : tmp;
-        }
-    }
-}
-
-inline void
-sub_two_bufs(aint16* bufa, aint16* bufb, aint16* res, size_t len, aint16 card)
-{
-    m256i* __restrict _bufa = reinterpret_cast<m256i*>(bufa);
-    m256i* __restrict _bufb = reinterpret_cast<m256i*>(bufb);
-    m256i* __restrict _res = reinterpret_cast<m256i*>(res);
-    const unsigned ratio = sizeof(*_bufa) / sizeof(*bufa);
-    const size_t _len = len / ratio;
-    const size_t _last_len = len - _len * ratio;
-
-    size_t i;
-    for (i = 0; i < _len; i++) {
-        // perform subtraction
-        _res[i] = SUB_MOD(_bufa[i], _bufb[i], card);
-    }
-    if (_last_len > 0) {
-        for (i = _len * ratio; i < len; i++) {
-            // perform subtraction
-            if (bufa[i] >= bufb[i])
-                res[i] = bufa[i] - bufb[i];
-            else
-                res[i] = card - (bufb[i] - bufa[i]);
-        }
-    }
-}
-
-inline void mul_two_bufs(aint16* src, aint16* dest, size_t len, aint16 card)
-{
-    m256i* __restrict _src = reinterpret_cast<m256i*>(src);
-    m256i* __restrict _dest = reinterpret_cast<m256i*>(dest);
-    const unsigned ratio = sizeof(*_src) / sizeof(*src);
-    const size_t _len = len / ratio;
-    const size_t _last_len = len - _len * ratio;
-
-    size_t i;
-    for (i = 0; i < _len; i++) {
-        // perform multiplicaton
-        _dest[i] = MULFULL_MOD(_src[i], _dest[i], card);
-    }
-    if (_last_len > 0) {
-        for (i = _len * ratio; i < len; i++) {
-            // perform multiplicaton
-            dest[i] = uint16_t((uint64_t(src[i]) * dest[i]) % card);
-        }
-    }
-}
-
-/** Apply an element-wise negation to a buffer
- */
-inline void neg(size_t len, aint16* buf, aint16 card)
-{
-    m256i* _buf = reinterpret_cast<m256i*>(buf);
-    unsigned ratio = sizeof(*_buf) / sizeof(*buf);
-    size_t _len = len / ratio;
-    size_t _last_len = len - _len * ratio;
-
-    size_t i;
-    for (i = 0; i < _len; i++) {
-        _buf[i] = NEG_MOD(_buf[i], card);
-    }
-    if (_last_len > 0) {
-        for (i = _len * ratio; i < len; i++) {
-            if (buf[i])
-                buf[i] = card - buf[i];
-        }
-    }
-}
-
-} // namespace simd
-} // namespace quadiron
-
-#endif
diff --git a/src/simd_256_u32.h b/src/simd_256_u32.h
deleted file mode 100644
index 5302a472..00000000
--- a/src/simd_256_u32.h
+++ /dev/null
@@ -1,839 +0,0 @@
-/*
- * Copyright 2017-2018 Scality
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice,
- *    this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- *    this list of conditions and the following disclaimer in the documentation
- *    and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its contributors
- *    may be used to endorse or promote products derived from this software
- *    without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef __QUAD_SIMD_256_U32_H__
-#define __QUAD_SIMD_256_U32_H__
-
-#include <x86intrin.h>
-
-#include "simd/simd.h"
-
-namespace quadiron {
-namespace simd {
-
-#define F4_u32 _mm256_set1_epi32(65537)
-#define F4m1_u32 _mm256_set1_epi32(65536)
-#define F3_u32 _mm256_set1_epi32(257)
-#define F3m1_u32 _mm256_set1_epi32(256)
-
-#define CARD(q) (EITHER(q == F3, F3_u32, F4_u32))
-#define CARD_M_1(q) (EITHER(q == F3, F3m1_u32, F4m1_u32))
-
-/* ==================== Essential Operations =================== */
-// Following functions are used for AVX2 w/ u32 only
-
-inline m256i SET1(uint32_t val)
-{
-    return _mm256_set1_epi32(val);
-}
-inline m256i ADD32(m256i x, m256i y)
-{
-    return _mm256_add_epi32(x, y);
-}
-inline m256i SUB32(m256i x, m256i y)
-{
-    return _mm256_sub_epi32(x, y);
-}
-inline m256i MUL32(m256i x, m256i y)
-{
-    return _mm256_mullo_epi32(x, y);
-}
-
-inline m256i CMPEQ32(m256i x, m256i y)
-{
-    return _mm256_cmpeq_epi32(x, y);
-}
-inline m256i CMPGT32(m256i x, m256i y)
-{
-    return _mm256_cmpgt_epi32(x, y);
-}
-inline m256i MINU32(m256i x, m256i y)
-{
-    return _mm256_min_epu32(x, y);
-}
-#define BLEND16(x, y, imm8) (_mm256_blend_epi16(x, y, imm8))
-
-// z = x + y mod q
-// Input are loaded to registers
-// Output is register
-inline m256i ADD_MOD(m256i x, m256i y, uint32_t q)
-{
-    m256i res = ADD32(x, y);
-    return MINU32(res, SUB32(res, CARD(q)));
-}
-
-// z = x - y mod q => z = q + x - y mod q
-// Input are loaded to registers
-// Output is register
-inline m256i SUB_MOD(m256i x, m256i y, uint32_t q)
-{
-    m256i res = SUB32(x, y);
-    return MINU32(res, ADD32(res, CARD(q)));
-}
-
-// y = 0 - x mod q => y = q - x mod q
-// Input are loaded to registers
-// Output is register
-inline m256i NEG_MOD(m256i x, uint32_t q)
-{
-    m256i res = SUB32(CARD(q), x);
-    return MINU32(res, SUB32(res, CARD(q)));
-}
-
-// z = x * y mod q
-// Input are loaded to registers
-// Output is register
-// Note: we assume that at least `x` or `y` is less than `q-1` so it's
-// not necessary to verify overflow on multiplying elements
-inline m256i MUL_MOD(m256i x, m256i y, uint32_t q)
-{
-    m256i res = MUL32(x, y);
-    m256i lo = BLEND16(ZERO, res, 0x55);
-    m256i hi = BLEND16(ZERO, SHIFTR_2(res), 0x55);
-    return SUB_MOD(lo, hi, q);
-}
-
-inline void MUL_MOD(m256i x, m256i y, m256i* z, uint32_t q)
-{
-    m256i res = MUL32(x, y);
-    m256i lo = BLEND16(ZERO, res, 0x55);
-    m256i hi = BLEND16(ZERO, SHIFTR_2(res), 0x55);
-    *z = SUB_MOD(lo, hi, q);
-}
-// z = x * y mod q
-// Input are loaded to registers
-// Output is register
-inline m256i MULFULL_MOD(m256i x, m256i y, uint32_t q)
-{
-    m256i res = MUL32(x, y);
-
-    // filter elements of both of a & b = card-1
-    m256i cmp = AND(CMPEQ32(x, CARD_M_1(q)), CMPEQ32(y, CARD_M_1(q)));
-    res = (q == F3) ? XOR(res, AND(F4_u32, cmp)) : ADD32(res, AND(ONE, cmp));
-
-    m256i lo = BLEND16(ZERO, res, 0x55);
-    m256i hi = SHIFTR_2(BLEND16(ZERO, res, 0xAA));
-    return SUB_MOD(lo, hi, q);
-}
-
-// butterfly CT with r == 1
-inline void BUTTERFLY_1(m256i* x, m256i* y, uint32_t q)
-{
-    m256i add = ADD_MOD(*x, *y, q);
-    *y = SUB_MOD(*x, *y, q);
-    *x = add;
-}
-
-// butterfly CT with r == q - 1
-inline void BUTTERFLY_2(m256i* x, m256i* y, uint32_t q)
-{
-    m256i add = ADD_MOD(*x, *y, q);
-    *x = SUB_MOD(*x, *y, q);
-    *y = add;
-}
-
-// butterfly CT with 1 < r < q - 1
-inline void BUTTERFLY_3(m256i c, m256i* x, m256i* y, uint32_t q)
-{
-    m256i z = MUL_MOD(c, *y, q);
-    *y = SUB_MOD(*x, z, q);
-    *x = ADD_MOD(*x, z, q);
-}
-
-// butterfly GS w/ r = q - 1
-inline void BUTTERFLY_4(m256i* x, m256i* y, uint32_t q)
-{
-    m256i add = ADD_MOD(*x, *y, q);
-    *y = SUB_MOD(*y, *x, q);
-    *x = add;
-}
-
-// butterfly GS w/ 1 < r < q - 1
-// x = x + y mod q
-// y = z * (x - y) mod q
-inline void BUTTERFLY_5(m256i c, m256i* x, m256i* y, uint32_t q)
-{
-    m256i sub = SUB_MOD(*x, *y, q);
-    *x = ADD_MOD(*x, *y, q);
-    *y = MUL_MOD(c, sub, q);
-}
-
-/**
- * Vectorized butterly CT step
- *
- * For each pair (P, Q) = (buf[i], buf[i + m]) for step = 2 * m and coef `r`
- *      P = P + r * Q
- *      Q = P - r * Q
- *
- * @param buf - working buffers
- * @param r - coefficient
- * @param start - index of buffer among `m` ones
- * @param m - current group size
- * @param len - number of vectors per buffer
- * @param card - modulo cardinal
- */
-inline void butterfly_ct_step(
-    vec::Buffers<uint32_t>& buf,
-    uint32_t r,
-    unsigned start,
-    unsigned m,
-    size_t len,
-    uint32_t card)
-{
-    const unsigned step = m << 1;
-    m256i c = SET1(r);
-
-#define BUTTERFLY_CT(x, y)                                                     \
-    (EITHER(                                                                   \
-        r == 1,                                                                \
-        BUTTERFLY_1(x, y, card),                                               \
-        EITHER(                                                                \
-            r < card - 1,                                                      \
-            BUTTERFLY_3(c, x, y, card),                                        \
-            BUTTERFLY_2(x, y, card))));
-
-    const size_t end = len - 1;
-    const unsigned bufs_nb = buf.get_n();
-    // #pragma omp parallel for
-    // #pragma unroll
-    const std::vector<uint32_t*>& mem = buf.get_mem();
-    for (unsigned i = start; i < bufs_nb; i += step) {
-        m256i x1, y1;
-        m256i x2, y2;
-        m256i* __restrict p = reinterpret_cast<m256i*>(mem[i]);
-        m256i* __restrict q = reinterpret_cast<m256i*>(mem[i + m]);
-
-        // #pragma omp parallel for
-        size_t j = 0;
-        // #pragma unroll
-        for (; j < end; j += 2) {
-            x1 = LOAD(p + j);
-            y1 = LOAD(q + j);
-            x2 = LOAD(p + j + 1);
-            y2 = LOAD(q + j + 1);
-
-            BUTTERFLY_CT(&x1, &y1);
-            BUTTERFLY_CT(&x2, &y2);
-
-            // Store back to memory
-            STORE(p + j, x1);
-            STORE(p + j + 1, x2);
-            STORE(q + j, y1);
-            STORE(q + j + 1, y2);
-        }
-        for (; j < len; ++j) {
-            x1 = LOAD(p + j);
-            y1 = LOAD(q + j);
-
-            BUTTERFLY_CT(&x1, &y1);
-
-            // Store back to memory
-            STORE(p + j, x1);
-            STORE(q + j, y1);
-        }
-    }
-}
-
-/**
- * Vectorized butterly CT on two-layers at a time
- *
- * For each quadruple
- * (P, Q, R, S) = (buf[i], buf[i + m], buf[i + 2 * m], buf[i + 3 * m])
- * First layer: butterfly on (P, Q) and (R, S) for step = 2 * m
- *      coef r1 = W[start * n / (2 * m)]
- *      P = P + r1 * Q
- *      Q = P - r1 * Q
- *      R = R + r1 * S
- *      S = R - r1 * S
- * Second layer: butterfly on (P, R) and (Q, S) for step = 4 * m
- *      coef r2 = W[start * n / (4 * m)]
- *      coef r3 = W[(start + m) * n / (4 * m)]
- *      P = P + r2 * R
- *      R = P - r2 * R
- *      Q = Q + r3 * S
- *      S = Q - r3 * S
- *
- * @param buf - working buffers
- * @param r1 - coefficient for the 1st layer
- * @param r2 - 1st coefficient for the 2nd layer
- * @param r3 - 2nd coefficient for the 2nd layer
- * @param start - index of buffer among `m` ones
- * @param m - current group size
- * @param len - number of vectors per buffer
- * @param card - modulo cardinal
- */
-inline void butterfly_ct_two_layers_step(
-    vec::Buffers<uint32_t>& buf,
-    uint32_t r1,
-    uint32_t r2,
-    uint32_t r3,
-    unsigned start,
-    unsigned m,
-    size_t len,
-    uint32_t card)
-{
-    const unsigned step = m << 2;
-    m256i c1 = SET1(r1);
-    m256i c2 = SET1(r2);
-    m256i c3 = SET1(r3);
-
-#define BUTTERFLY_R1(c, x, y)                                                  \
-    (EITHER(                                                                   \
-        r1 == 1,                                                               \
-        BUTTERFLY_1(x, y, card),                                               \
-        EITHER(                                                                \
-            r1 < card - 1,                                                     \
-            BUTTERFLY_3(c, x, y, card),                                        \
-            BUTTERFLY_2(x, y, card))));
-#define BUTTERFLY_R2(c, x, y)                                                  \
-    (EITHER(                                                                   \
-        r2 == 1,                                                               \
-        BUTTERFLY_1(x, y, card),                                               \
-        EITHER(                                                                \
-            r2 < card - 1,                                                     \
-            BUTTERFLY_3(c, x, y, card),                                        \
-            BUTTERFLY_2(x, y, card))));
-#define BUTTERFLY_R3(c, x, y)                                                  \
-    (EITHER(                                                                   \
-        r3 == 1,                                                               \
-        BUTTERFLY_1(x, y, card),                                               \
-        EITHER(                                                                \
-            r3 < card - 1,                                                     \
-            BUTTERFLY_3(c, x, y, card),                                        \
-            BUTTERFLY_2(x, y, card))));
-
-    const size_t end = len - 1;
-    const unsigned bufs_nb = buf.get_n();
-    // #pragma omp parallel for
-    // #pragma unroll
-    const std::vector<uint32_t*>& mem = buf.get_mem();
-    for (unsigned i = start; i < bufs_nb; i += step) {
-        m256i x1, y1, u1, v1;
-        m256i x2, y2, u2, v2;
-        m256i* __restrict p = reinterpret_cast<m256i*>(mem[i]);
-        m256i* __restrict q = reinterpret_cast<m256i*>(mem[i + m]);
-        m256i* __restrict r = reinterpret_cast<m256i*>(mem[i + 2 * m]);
-        m256i* __restrict s = reinterpret_cast<m256i*>(mem[i + 3 * m]);
-
-        // #pragma omp parallel for
-        size_t j = 0;
-        // #pragma unroll
-        for (; j < end; j += 2) {
-            // First layer (c1, x, y) & (c1, u, v)
-            x1 = LOAD(p + j);
-            y1 = LOAD(q + j);
-            x2 = LOAD(p + j + 1);
-            y2 = LOAD(q + j + 1);
-
-            u1 = LOAD(r + j);
-            v1 = LOAD(s + j);
-            u2 = LOAD(r + j + 1);
-            v2 = LOAD(s + j + 1);
-
-            BUTTERFLY_R1(c1, &x1, &y1);
-            BUTTERFLY_R1(c1, &x2, &y2);
-
-            BUTTERFLY_R1(c1, &u1, &v1);
-            BUTTERFLY_R1(c1, &u2, &v2);
-
-            // Second layer (c2, x, u) & (c3, y, v)
-            BUTTERFLY_R2(c2, &x1, &u1);
-            BUTTERFLY_R2(c2, &x2, &u2);
-
-            BUTTERFLY_R3(c3, &y1, &v1);
-            BUTTERFLY_R3(c3, &y2, &v2);
-
-            // Store back to memory
-            STORE(p + j, x1);
-            STORE(p + j + 1, x2);
-            STORE(q + j, y1);
-            STORE(q + j + 1, y2);
-
-            STORE(r + j, u1);
-            STORE(r + j + 1, u2);
-            STORE(s + j, v1);
-            STORE(s + j + 1, v2);
-        }
-        for (; j < len; ++j) {
-            // First layer (c1, x, y) & (c1, u, v)
-            x1 = LOAD(p + j);
-            y1 = LOAD(q + j);
-            u1 = LOAD(r + j);
-            v1 = LOAD(s + j);
-
-            BUTTERFLY_R1(c1, &x1, &y1);
-            BUTTERFLY_R1(c1, &u1, &v1);
-            // Second layer (c2, x, u) & (c3, y, v)
-            BUTTERFLY_R2(c2, &x1, &u1);
-            BUTTERFLY_R3(c3, &y1, &v1);
-            // Store back to memory
-            STORE(p + j, x1);
-            STORE(q + j, y1);
-            STORE(r + j, u1);
-            STORE(s + j, v1);
-        }
-    }
-}
-
-/**
- * Vectorized butterly GS step
- *
- * For each pair (P, Q) = (buf[i], buf[i + m]) for step = 2 * m and coef `r`
- *      P = P + Q
- *      Q = r * (P - Q)
- *
- * @param buf - working buffers
- * @param r - coefficient
- * @param start - index of buffer among `m` ones
- * @param m - current group size
- * @param len - number of vectors per buffer
- * @param card - modulo cardinal
- */
-inline void butterfly_gs_step(
-    vec::Buffers<uint32_t>& buf,
-    uint32_t r,
-    unsigned start,
-    unsigned m,
-    size_t len,
-    uint32_t card)
-{
-    const unsigned step = m << 1;
-    m256i c = SET1(r);
-
-#define BUTTERFLY_GS(x, y)                                                     \
-    (EITHER(                                                                   \
-        r == 1,                                                                \
-        BUTTERFLY_1(x, y, card),                                               \
-        EITHER(                                                                \
-            r < card - 1,                                                      \
-            BUTTERFLY_5(c, x, y, card),                                        \
-            BUTTERFLY_4(x, y, card))));
-
-    const size_t end = len - 3;
-    const unsigned bufs_nb = buf.get_n();
-    // #pragma omp parallel for
-    // #pragma unroll
-    const std::vector<uint32_t*>& mem = buf.get_mem();
-    for (unsigned i = start; i < bufs_nb; i += step) {
-        m256i x1, x2, x3, x4;
-        m256i y1, y2, y3, y4;
-        m256i* __restrict p = reinterpret_cast<m256i*>(mem[i]);
-        m256i* __restrict q = reinterpret_cast<m256i*>(mem[i + m]);
-
-        // #pragma omp parallel for
-        size_t j = 0;
-        // #pragma unroll
-        for (; j < end; j += 4) {
-            x1 = LOAD(p + j);
-            x2 = LOAD(p + j + 1);
-            x3 = LOAD(p + j + 2);
-            x4 = LOAD(p + j + 3);
-            y1 = LOAD(q + j);
-            y2 = LOAD(q + j + 1);
-            y3 = LOAD(q + j + 2);
-            y4 = LOAD(q + j + 3);
-
-            BUTTERFLY_GS(&x1, &y1);
-            BUTTERFLY_GS(&x2, &y2);
-            BUTTERFLY_GS(&x3, &y3);
-            BUTTERFLY_GS(&x4, &y4);
-
-            // Store back to memory
-            STORE(p + j, x1);
-            STORE(p + j + 1, x2);
-            STORE(p + j + 2, x3);
-            STORE(p + j + 3, x4);
-            STORE(q + j, y1);
-            STORE(q + j + 1, y2);
-            STORE(q + j + 2, y3);
-            STORE(q + j + 3, y4);
-        }
-        for (; j < len; ++j) {
-            x1 = LOAD(p + j);
-            y1 = LOAD(q + j);
-
-            BUTTERFLY_GS(&x1, &y1);
-
-            // Store back to memory
-            STORE(p + j, x1);
-            STORE(q + j, y1);
-        }
-    }
-}
-
-/**
- * Vectorized butterly GS step
- *
- * For each pair (P, Q) = (buf[i], buf[i + m]) for step = 2 * m and coef `r`
- *      Q = r * P
- *
- * @param buf - working buffers
- * @param r - coefficient
- * @param start - index of buffer among `m` ones
- * @param m - current group size
- * @param len - number of vectors per buffer
- * @param card - modulo cardinal
- */
-inline void butterfly_gs_step_simple(
-    vec::Buffers<uint32_t>& buf,
-    uint32_t r,
-    unsigned start,
-    unsigned m,
-    size_t len,
-    uint32_t card)
-{
-    const unsigned step = m << 1;
-    m256i c = SET1(r);
-
-#define BUTTERFLY_GS_S(x)                                                      \
-    (EITHER(                                                                   \
-        r == 1,                                                                \
-        (x),                                                                   \
-        EITHER(r < card - 1, MUL_MOD(c, x, card), NEG_MOD(x, card))));
-
-    const size_t end = len - 1;
-    const unsigned bufs_nb = buf.get_n();
-    // #pragma omp parallel for
-    // #pragma unroll
-    const std::vector<uint32_t*>& mem = buf.get_mem();
-    for (unsigned i = start; i < bufs_nb; i += step) {
-        m256i x1, y1;
-        m256i x2, y2;
-        m256i* __restrict p = reinterpret_cast<m256i*>(mem[i]);
-        m256i* __restrict q = reinterpret_cast<m256i*>(mem[i + m]);
-
-        // #pragma omp parallel for
-        size_t j = 0;
-        // #pragma unroll
-        for (; j < end; j += 2) {
-            x1 = LOAD(p + j);
-            x2 = LOAD(p + j + 1);
-
-            y1 = BUTTERFLY_GS_S(x1);
-            y2 = BUTTERFLY_GS_S(x2);
-
-            // Store back to memory
-            STORE(q + j, y1);
-            STORE(q + j + 1, y2);
-        }
-        for (; j < len; ++j) {
-            x1 = LOAD(p + j);
-
-            y1 = BUTTERFLY_GS_S(x1);
-
-            // Store back to memory
-            STORE(q + j, y1);
-        }
-    }
-}
-
-inline void add_props(
-    Properties& props,
-    m256i threshold,
-    m256i mask,
-    m256i symb,
-    off_t offset)
-{
-    const m256i b = CMPEQ32(threshold, symb);
-    const m256i c = AND(mask, b);
-    uint32_t d = MVMSK8(c);
-    const unsigned element_size = sizeof(uint32_t);
-    while (d > 0) {
-        unsigned byte_idx = __builtin_ctz(d);
-        off_t _offset = offset + byte_idx / element_size;
-        props.add(_offset, OOR_MARK);
-        d ^= 1 << byte_idx;
-    }
-}
-
-inline void encode_post_process(
-    vec::Buffers<uint32_t>& output,
-    std::vector<Properties>& props,
-    off_t offset,
-    unsigned code_len,
-    uint32_t threshold,
-    size_t vecs_nb)
-{
-    const unsigned element_size = sizeof(uint32_t);
-    const unsigned vec_size = ALIGN_SIZE / element_size;
-    const uint32_t max = 1 << (element_size * 8 - 1);
-    const m256i _threshold = SET1(threshold);
-    const m256i mask_hi = SET1(max);
-
-    // #pragma unroll
-    const std::vector<uint32_t*>& mem = output.get_mem();
-    for (unsigned frag_id = 0; frag_id < code_len; ++frag_id) {
-        m256i* __restrict buf = reinterpret_cast<m256i*>(mem[frag_id]);
-
-        size_t vec_id = 0;
-        size_t end = vecs_nb - 3;
-        // #pragma unroll
-        for (; vec_id < end; vec_id += 4) {
-            m256i a1 = LOAD(buf + vec_id);
-            m256i a2 = LOAD(buf + vec_id + 1);
-            m256i a3 = LOAD(buf + vec_id + 2);
-            m256i a4 = LOAD(buf + vec_id + 3);
-
-            if (TESTZ(a1, _threshold) == 0) {
-                const off_t curr_offset = offset + vec_id * vec_size;
-                add_props(props[frag_id], _threshold, mask_hi, a1, curr_offset);
-            }
-            if (TESTZ(a2, _threshold) == 0) {
-                const off_t curr_offset = offset + (vec_id + 1) * vec_size;
-                add_props(props[frag_id], _threshold, mask_hi, a2, curr_offset);
-            }
-            if (TESTZ(a3, _threshold) == 0) {
-                const off_t curr_offset = offset + (vec_id + 2) * vec_size;
-                add_props(props[frag_id], _threshold, mask_hi, a3, curr_offset);
-            }
-            if (TESTZ(a4, _threshold) == 0) {
-                const off_t curr_offset = offset + (vec_id + 3) * vec_size;
-                add_props(props[frag_id], _threshold, mask_hi, a4, curr_offset);
-            }
-        }
-        for (; vec_id < vecs_nb; ++vec_id) {
-            m256i a = LOAD(buf + vec_id);
-            uint32_t c = TESTZ(a, _threshold);
-            if (c == 0) {
-                const off_t curr_offset = offset + vec_id * vec_size;
-                add_props(props[frag_id], _threshold, mask_hi, a, curr_offset);
-            }
-        }
-    }
-}
-
-/* ==================== Operations =================== */
-/** Perform a multiplication of a coefficient `a` to each element of `src` and
- *  add result to correspondent element of `dest`
- *
- * @note: 1 < `a` < card - 1
- */
-inline void mul_coef_to_buf(
-    const uint32_t a,
-    aint32* src,
-    aint32* dest,
-    size_t len,
-    uint32_t card)
-{
-    const m256i coef = SET1(a);
-
-    m256i* __restrict _src = reinterpret_cast<m256i*>(src);
-    m256i* __restrict _dest = reinterpret_cast<m256i*>(dest);
-    const unsigned ratio = sizeof(*_src) / sizeof(*src);
-    const size_t _len = len / ratio;
-    const size_t _last_len = len - _len * ratio;
-
-    size_t i = 0;
-    size_t end = _len - 3;
-    for (; i < end; i += 4) {
-        // perform multiplication
-        MUL_MOD(coef, _src[i], _dest + i, card);
-        MUL_MOD(coef, _src[i + 1], _dest + i + 1, card);
-        MUL_MOD(coef, _src[i + 2], _dest + i + 2, card);
-        MUL_MOD(coef, _src[i + 3], _dest + i + 3, card);
-    }
-    for (; i < _len; ++i) {
-        MUL_MOD(coef, _src[i], _dest + i, card);
-    }
-
-    if (_last_len > 0) {
-        uint64_t coef_64 = (uint64_t)a;
-        for (size_t i = _len * ratio; i < len; i++) {
-            // perform multiplication
-            dest[i] = (aint32)((coef_64 * src[i]) % card);
-        }
-    }
-}
-
-inline void add_two_bufs(aint32* src, aint32* dest, size_t len, aint32 card)
-{
-    m256i* __restrict _src = reinterpret_cast<m256i*>(src);
-    m256i* __restrict _dest = reinterpret_cast<m256i*>(dest);
-    const unsigned ratio = sizeof(*_src) / sizeof(*src);
-    const size_t _len = len / ratio;
-    const size_t _last_len = len - _len * ratio;
-
-    size_t i;
-    for (i = 0; i < _len; i++) {
-        // perform addition
-        _dest[i] = ADD_MOD(_src[i], _dest[i], card);
-    }
-    if (_last_len > 0) {
-        for (i = _len * ratio; i < len; i++) {
-            // perform addition
-            aint32 tmp = src[i] + dest[i];
-            dest[i] = (tmp >= card) ? (tmp - card) : tmp;
-        }
-    }
-}
-
-inline void sub_two_bufs(
-    aint32* bufa,
-    aint32* bufb,
-    aint32* res,
-    size_t len,
-    aint32 card = F4)
-{
-    m256i* __restrict _bufa = reinterpret_cast<m256i*>(bufa);
-    m256i* __restrict _bufb = reinterpret_cast<m256i*>(bufb);
-    m256i* __restrict _res = reinterpret_cast<m256i*>(res);
-    const unsigned ratio = sizeof(*_bufa) / sizeof(*bufa);
-    const size_t _len = len / ratio;
-    const size_t _last_len = len - _len * ratio;
-
-    size_t i;
-    for (i = 0; i < _len; i++) {
-        // perform subtraction
-        _res[i] = SUB_MOD(_bufa[i], _bufb[i], card);
-    }
-    if (_last_len > 0) {
-        for (i = _len * ratio; i < len; i++) {
-            // perform subtraction
-            if (bufa[i] >= bufb[i])
-                res[i] = bufa[i] - bufb[i];
-            else
-                res[i] = card - (bufb[i] - bufa[i]);
-        }
-    }
-}
-
-inline void mul_two_bufs(aint32* src, aint32* dest, size_t len, aint32 card)
-{
-    m256i* __restrict _src = reinterpret_cast<m256i*>(src);
-    m256i* __restrict _dest = reinterpret_cast<m256i*>(dest);
-    const unsigned ratio = sizeof(*_src) / sizeof(*src);
-    const size_t _len = len / ratio;
-    const size_t _last_len = len - _len * ratio;
-
-    size_t i;
-    for (i = 0; i < _len; i++) {
-        // perform multiplicaton
-        _dest[i] = MULFULL_MOD(_src[i], _dest[i], card);
-    }
-    if (_last_len > 0) {
-        for (i = _len * ratio; i < len; i++) {
-            // perform multiplicaton
-            dest[i] = uint32_t((uint64_t(src[i]) * dest[i]) % card);
-        }
-    }
-}
-
-/** Apply an element-wise negation to a buffer
- */
-inline void neg(size_t len, aint32* buf, aint32 card = F4)
-{
-    m256i* _buf = reinterpret_cast<m256i*>(buf);
-    unsigned ratio = sizeof(*_buf) / sizeof(*buf);
-    size_t _len = len / ratio;
-    size_t _last_len = len - _len * ratio;
-
-    size_t i;
-    for (i = 0; i < _len; i++) {
-        _buf[i] = NEG_MOD(_buf[i], card);
-    }
-    if (_last_len > 0) {
-        for (i = _len * ratio; i < len; i++) {
-            if (buf[i])
-                buf[i] = card - buf[i];
-        }
-    }
-}
-
-/* ==================== Operations for NF4 =================== */
-typedef __m128i m128i;
-
-/** Return aint128 integer from a _m128i register */
-inline aint128 m256i_to_uint128(m256i v)
-{
-    aint128 hi, lo;
-    _mm256_storeu2_m128i((m128i*)&hi, (m128i*)&lo, v);
-    return lo; // NOLINT(clang-analyzer-core.uninitialized.UndefReturn)
-}
-
-inline __uint128_t add(__uint128_t a, __uint128_t b)
-{
-    m256i _a = _mm256_castsi128_si256((m128i)a);
-    m256i _b = _mm256_castsi128_si256((m128i)b);
-    m256i res = ADD_MOD(_a, _b, F4);
-    return m256i_to_uint128(res);
-}
-
-inline __uint128_t sub(__uint128_t a, __uint128_t b)
-{
-    m256i _a = _mm256_castsi128_si256((m128i)a);
-    m256i _b = _mm256_castsi128_si256((m128i)b);
-    m256i res = SUB_MOD(_a, _b, F4);
-    return m256i_to_uint128(res);
-}
-
-inline __uint128_t mul(__uint128_t a, __uint128_t b)
-{
-    m256i _a = _mm256_castsi128_si256((m128i)a);
-    m256i _b = _mm256_castsi128_si256((m128i)b);
-    m256i res = MULFULL_MOD(_a, _b, F4);
-    return m256i_to_uint128(res);
-}
-
-/** Store low 128-bit part of `reg` to memory */
-inline void store_low(aint128* address, m256i reg)
-{
-    _mm_store_si128((m128i*)address, _mm256_castsi256_si128(reg));
-}
-
-inline void hadamard_mul(int n, aint128* _x, aint128* _y)
-{
-    int i;
-    m256i* x = reinterpret_cast<m256i*>(_x);
-    m256i* y = reinterpret_cast<m256i*>(_y);
-
-    const unsigned ratio = sizeof(*x) / sizeof(*_x);
-    const int len_256 = n / ratio;
-    const int last_len = n - len_256 * ratio;
-
-    // multiply y to the first half of `x`
-    for (i = 0; i < len_256; i++) {
-        x[i] = MULFULL_MOD(x[i], y[i], F4);
-    }
-
-    if (last_len > 0) {
-        // add last _y[] to x
-        for (i = len_256 * ratio; i < n; i++) {
-            m256i _x_p = _mm256_castsi128_si256((m128i)_x[i]);
-            m256i _y_p = _mm256_castsi128_si256((m128i)_y[i]);
-
-            store_low(_x + i, mul(_x_p, _y_p, F4));
-        }
-    }
-}
-
-} // namespace simd
-} // namespace quadiron
-
-#endif

From 559f733ad14d3cb7e6968a1b9a0ae57132f65bf4 Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Fri, 5 Oct 2018 13:42:45 +0200
Subject: [PATCH 08/77] SIMD 128 u16 u32: remove useless files

---
 src/simd_128_u16.h | 312 ----------------------
 src/simd_128_u32.h | 639 ---------------------------------------------
 2 files changed, 951 deletions(-)
 delete mode 100644 src/simd_128_u16.h
 delete mode 100644 src/simd_128_u32.h

diff --git a/src/simd_128_u16.h b/src/simd_128_u16.h
deleted file mode 100644
index e13d8756..00000000
--- a/src/simd_128_u16.h
+++ /dev/null
@@ -1,312 +0,0 @@
-/*
- * Copyright 2017-2018 Scality
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice,
- *    this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- *    this list of conditions and the following disclaimer in the documentation
- *    and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its contributors
- *    may be used to endorse or promote products derived from this software
- *    without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef __QUAD_SIMD_128_U16_H__
-#define __QUAD_SIMD_128_U16_H__
-
-#include <x86intrin.h>
-
-#include "property.h"
-#include "simd/simd.h"
-
-namespace quadiron {
-namespace simd {
-
-/* ==================== Essential Operations =================== */
-
-/** Perform a%card where a is a addition of two numbers whose elements are
- *  symbols of GF(card) */
-inline m128i mod_after_add(m128i a, aint16 card)
-{
-    const m128i _card = _mm_set1_epi16(card);
-    const m128i _card_minus_1 = _mm_set1_epi16(card - 1);
-
-    m128i cmp = _mm_cmpgt_epi16(a, _card_minus_1);
-    m128i b = _mm_sub_epi16(a, _mm_and_si128(_card, cmp));
-
-    return b;
-}
-
-/** Perform addition of two numbers a, b whose elements are of GF(card) */
-inline m128i add(m128i a, m128i b, aint16 card)
-{
-    m128i _a = _mm_load_si128(&a);
-    m128i _b = _mm_load_si128(&b);
-    m128i c = _mm_add_epi16(_a, _b);
-
-    // Modulo
-    return mod_after_add(c, card);
-}
-
-/** Perform subtraction of a by b where a, b whose elements are symbols of
- *  GF(card)
- * sub(a, b) = a - b if a >= b, or
- *             card + a - b, otherwise
- */
-inline m128i sub(m128i a, m128i b, aint16 card)
-{
-    const m128i _card = _mm_set1_epi16(card);
-
-    m128i _a = _mm_load_si128(&a);
-    m128i _b = _mm_load_si128(&b);
-
-    m128i cmp = _mm_cmpgt_epi16(_b, _a);
-    m128i _a1 = _mm_add_epi16(_a, _mm_and_si128(_card, cmp));
-
-    return _mm_sub_epi16(_a1, _b);
-}
-
-/** Negate `a`
- * @return 0 if (a == 0), else card - a
- */
-inline m128i neg(m128i a, aint16 card = F3)
-{
-    const m128i _card = _mm_set1_epi16(card);
-    m128i _a = _mm_load_si128(&a);
-    m128i _b = _mm_setzero_si128();
-
-    m128i cmp = _mm_cmpgt_epi16(_a, _b);
-
-    return _mm_sub_epi16(_mm_and_si128(cmp, _card), _a);
-}
-
-inline m128i mod_after_multiply(m128i a)
-{
-    const m128i mask = _mm_set1_epi16(F3 - 2);
-
-    m128i lo = _mm_and_si128(a, mask);
-
-    m128i a_shift = _mm_srli_si128(a, 1);
-    m128i hi = _mm_and_si128(a_shift, mask);
-
-    m128i cmp = _mm_cmpgt_epi16(hi, lo);
-    m128i _lo = _mm_add_epi16(lo, _mm_and_si128(F3_m128i_u16, cmp));
-
-    return _mm_sub_epi16(_lo, hi);
-}
-
-inline m128i mul(m128i a, m128i b)
-{
-    m128i _a = _mm_load_si128(&a);
-    m128i _b = _mm_load_si128(&b);
-
-    m128i c = _mm_mullo_epi16(_a, _b);
-
-    // filter elements of both of a & b = card-1
-    m128i cmp = _mm_and_si128(
-        _mm_cmpeq_epi16(_a, F3minus1_m128i_u16),
-        _mm_cmpeq_epi16(_b, F3minus1_m128i_u16));
-
-    const m128i one = _mm_set1_epi16(1);
-    c = _mm_add_epi16(c, _mm_and_si128(one, cmp));
-
-    // Modulo
-    return mod_after_multiply(c);
-}
-
-/** Perform multiplication of two numbers a, b whose elements are of GF(card)
- *  where `card` is a prime Fermat number, i.e. card = Fx with x < 5
- *  Currently, it supports only for F3
- */
-inline m128i mul(m128i a, m128i b, aint16 card)
-{
-    // FIXME: generalize card
-    assert(card == F3);
-    return mul(a, b);
-}
-
-/** Apply an element-wise negation to a buffer
- */
-inline void neg(size_t len, aint16* buf, aint16 card = F3)
-{
-    m128i* _buf = reinterpret_cast<m128i*>(buf);
-    unsigned ratio = sizeof(*_buf) / sizeof(*buf);
-    size_t _len = len / ratio;
-    size_t _last_len = len - _len * ratio;
-
-    size_t i;
-    for (i = 0; i < _len; i++) {
-        _buf[i] = neg(_buf[i], card);
-    }
-    if (_last_len > 0) {
-        for (i = _len * ratio; i < len; i++) {
-            if (buf[i])
-                buf[i] = card - buf[i];
-        }
-    }
-}
-
-/** Perform a multiplication of a coefficient `a` to each element of `src` and
- *  add result to correspondent element of `dest`
- */
-inline void mul_coef_to_buf(
-    const aint16 a,
-    aint16* src,
-    aint16* dest,
-    size_t len,
-    aint16 card = F3)
-{
-    const m128i coef = _mm_set1_epi16(a);
-
-    m128i* _src = reinterpret_cast<m128i*>(src);
-    m128i* _dest = reinterpret_cast<m128i*>(dest);
-    const unsigned ratio = sizeof(*_src) / sizeof(*src);
-    const size_t _len = len / ratio;
-    const size_t _last_len = len - _len * ratio;
-
-    size_t i;
-    for (i = 0; i < _len; i++) {
-        // perform multiplication
-        _dest[i] = mul(coef, _src[i], card);
-    }
-    if (_last_len > 0) {
-        uint32_t coef_doubled = (uint32_t)a;
-        for (i = _len * ratio; i < len; i++) {
-            // perform multiplication
-            dest[i] = (aint16)((coef_doubled * src[i]) % card);
-        }
-    }
-}
-
-inline void
-add_two_bufs(aint16* src, aint16* dest, size_t len, aint16 card = F3)
-{
-    m128i* _src = reinterpret_cast<m128i*>(src);
-    m128i* _dest = reinterpret_cast<m128i*>(dest);
-    const unsigned ratio = sizeof(*_src) / sizeof(*src);
-    const size_t _len = len / ratio;
-    const size_t _last_len = len - _len * ratio;
-
-    size_t i;
-    for (i = 0; i < _len; i++) {
-        // perform addition
-        _dest[i] = add(_src[i], _dest[i], card);
-    }
-    if (_last_len > 0) {
-        for (i = _len * ratio; i < len; i++) {
-            // perform addition
-            aint16 tmp = src[i] + dest[i];
-            dest[i] = (tmp >= card) ? (tmp - card) : tmp;
-        }
-    }
-}
-
-inline void sub_two_bufs(
-    aint16* bufa,
-    aint16* bufb,
-    aint16* res,
-    size_t len,
-    aint16 card = F3)
-{
-    m128i* _bufa = reinterpret_cast<m128i*>(bufa);
-    m128i* _bufb = reinterpret_cast<m128i*>(bufb);
-    m128i* _res = reinterpret_cast<m128i*>(res);
-    const unsigned ratio = sizeof(*_bufa) / sizeof(*bufa);
-    const size_t _len = len / ratio;
-    const size_t _last_len = len - _len * ratio;
-
-    size_t i;
-    for (i = 0; i < _len; i++) {
-        // perform subtraction
-        _res[i] = sub(_bufa[i], _bufb[i], card);
-    }
-    if (_last_len > 0) {
-        for (i = _len * ratio; i < len; i++) {
-            // perform subtraction
-            if (bufa[i] >= bufb[i])
-                res[i] = bufa[i] - bufb[i];
-            else
-                res[i] = card - (bufb[i] - bufa[i]);
-        }
-    }
-}
-
-inline void
-mul_two_bufs(aint16* src, aint16* dest, size_t len, aint16 card = F3)
-{
-    m128i* _src = reinterpret_cast<m128i*>(src);
-    m128i* _dest = reinterpret_cast<m128i*>(dest);
-    const unsigned ratio = sizeof(*_src) / sizeof(*src);
-    const size_t _len = len / ratio;
-    const size_t _last_len = len - _len * ratio;
-
-    size_t i;
-    for (i = 0; i < _len; i++) {
-        // perform multiplicaton
-        _dest[i] = mul(_src[i], _dest[i], F3);
-    }
-    if (_last_len > 0) {
-        for (i = _len * ratio; i < len; i++) {
-            // perform multiplicaton
-            // dest[i] = uint32_t(src[i]) * uint32_t(dest[i]) % card;
-            dest[i] = uint16_t((uint32_t(src[i]) * dest[i]) % card);
-        }
-    }
-}
-
-inline void encode_post_process(
-    vec::Buffers<uint16_t>& output,
-    std::vector<Properties>& props,
-    off_t offset,
-    unsigned code_len,
-    uint16_t threshold,
-    size_t vecs_nb)
-{
-    const unsigned vec_size = simd::countof<uint16_t>();
-
-    const m128i _threshold = _mm_set1_epi16(threshold);
-    uint16_t max = 1 << (sizeof(uint16_t) * 8 - 1);
-    const m128i mask_hi = _mm_set1_epi16(max);
-    const unsigned element_size = sizeof(uint16_t);
-
-    for (unsigned frag_id = 0; frag_id < code_len; ++frag_id) {
-        uint16_t* chunk = output.get(frag_id);
-        m128i* buf = reinterpret_cast<m128i*>(chunk);
-        for (unsigned vec_id = 0; vec_id < vecs_nb; ++vec_id) {
-            const m128i a = _mm_load_si128(&(buf[vec_id]));
-            const m128i b = _mm_cmpeq_epi16(_threshold, a);
-            const m128i c = _mm_and_si128(mask_hi, b);
-            uint16_t d = _mm_movemask_epi8(c);
-
-            while (d > 0) {
-                unsigned byte_idx = __builtin_ctz(d);
-                unsigned element_idx = byte_idx / element_size;
-                off_t _offset = offset + vec_id * vec_size + element_idx;
-                props[frag_id].add(_offset, 1);
-                d ^= 1 << byte_idx;
-            }
-        }
-    }
-}
-
-} // namespace simd
-} // namespace quadiron
-
-#endif
diff --git a/src/simd_128_u32.h b/src/simd_128_u32.h
deleted file mode 100644
index 80936a6e..00000000
--- a/src/simd_128_u32.h
+++ /dev/null
@@ -1,639 +0,0 @@
-/*
- * Copyright 2017-2018 Scality
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice,
- *    this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- *    this list of conditions and the following disclaimer in the documentation
- *    and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its contributors
- *    may be used to endorse or promote products derived from this software
- *    without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef __QUAD_SIMD_128_U32_H__
-#define __QUAD_SIMD_128_U32_H__
-
-#include <x86intrin.h>
-
-#include "simd/simd.h"
-
-namespace quadiron {
-namespace simd {
-
-/* ==================== Essential Operations =================== */
-
-/** Perform a%card where a is a addition of two numbers whose elements are
- *  symbols of GF(card) */
-inline m128i mod_after_add(m128i a, aint32 card)
-{
-    const m128i _card = _mm_set1_epi32(card);
-    const m128i _card_minus_1 = _mm_set1_epi32(card - 1);
-
-    m128i cmp = _mm_cmpgt_epi32(a, _card_minus_1);
-    m128i b = _mm_sub_epi32(a, _mm_and_si128(_card, cmp));
-
-    return b;
-}
-
-/** Perform addition of two numbers a, b whose elements are of GF(card) */
-inline m128i add(m128i a, m128i b, aint32 card)
-{
-    m128i _a = _mm_load_si128(&a);
-    m128i _b = _mm_load_si128(&b);
-    m128i c = _mm_add_epi32(_a, _b);
-
-    // Modulo
-    return mod_after_add(c, card);
-}
-
-/** Perform subtraction of a by b where a, b whose elements are symbols of
- *  GF(card)
- * sub(a, b) = a - b if a >= b, or
- *             card + a - b, otherwise
- */
-inline m128i sub(m128i a, m128i b, aint32 card)
-{
-    const m128i _card = _mm_set1_epi32(card);
-
-    m128i _a = _mm_load_si128(&a);
-    m128i _b = _mm_load_si128(&b);
-
-    m128i cmp = _mm_cmpgt_epi32(_b, _a);
-    m128i _a1 = _mm_add_epi32(_a, _mm_and_si128(_card, cmp));
-
-    return _mm_sub_epi32(_a1, _b);
-}
-
-/** Negate `a`
- * @return 0 if (a == 0), else card - a
- */
-inline m128i neg(m128i a, aint32 card = F4)
-{
-    const m128i _card = _mm_set1_epi32(card);
-
-    m128i _a = _mm_load_si128(&a);
-    m128i _b = _mm_setzero_si128();
-    m128i cmp = _mm_cmpgt_epi32(_a, _b);
-
-    return _mm_sub_epi32(_mm_and_si128(cmp, _card), _a);
-}
-
-/** Perform a%card where a is a multiplication of two numbers whose elements are
- *  symbols of GF(F4)
- *
- *  We find v in a = u * card + v
- *  a is expressed also as: a = hi * (card-1) + lo
- *  where hi and lo is 16-bit for F4 (or 8-bit for F3) high and low parts of a
- *  hence, v = (lo - hi) % F4
- *      v = lo - hi, if lo >= hi
- *          or
- *          F4 + lo - hi, otherwise
- */
-inline m128i mod_after_multiply_f4(m128i a)
-{
-    const m128i mask = _mm_set1_epi32(F4 - 2);
-
-    m128i lo = _mm_and_si128(a, mask);
-
-    m128i a_shift = _mm_srli_si128(a, 2);
-    m128i hi = _mm_and_si128(a_shift, mask);
-
-    m128i cmp = _mm_cmpgt_epi32(hi, lo);
-    m128i _lo = _mm_add_epi32(lo, _mm_and_si128(F4_m128i, cmp));
-
-    return _mm_sub_epi32(_lo, hi);
-}
-
-inline m128i mod_after_multiply_f3(m128i a)
-{
-    const m128i mask = _mm_set1_epi32(F3 - 2);
-
-    m128i lo = _mm_and_si128(a, mask);
-
-    m128i a_shift = _mm_srli_si128(a, 1);
-    m128i hi = _mm_and_si128(a_shift, mask);
-
-    m128i cmp = _mm_cmpgt_epi32(hi, lo);
-    m128i _lo = _mm_add_epi32(lo, _mm_and_si128(F3_m128i, cmp));
-
-    return _mm_sub_epi32(_lo, hi);
-}
-
-inline m128i mul_f4(m128i a, m128i b)
-{
-    m128i _a = _mm_load_si128(&a);
-    m128i _b = _mm_load_si128(&b);
-
-    m128i c = _mm_mullo_epi32(_a, _b);
-
-    // filter elements of both of a & b = card-1
-    m128i cmp = _mm_and_si128(
-        _mm_cmpeq_epi32(_a, F4minus1_m128i),
-        _mm_cmpeq_epi32(_b, F4minus1_m128i));
-
-    const m128i one = _mm_set1_epi32(1);
-    c = _mm_add_epi32(c, _mm_and_si128(one, cmp));
-
-    // Modulo
-    return mod_after_multiply_f4(c);
-}
-
-inline m128i mul_f4_simple(m128i a, m128i b)
-{
-    m128i _a = _mm_load_si128(&a);
-    m128i _b = _mm_load_si128(&b);
-
-    m128i c = _mm_mullo_epi32(_a, _b);
-
-    // Modulo
-    return mod_after_multiply_f4(c);
-}
-
-inline m128i mul_f3(m128i a, m128i b)
-{
-    m128i _a = _mm_load_si128(&a);
-    m128i _b = _mm_load_si128(&b);
-
-    m128i c = _mm_mullo_epi32(_a, _b);
-
-    // filter elements of both of a & b = card-1
-    m128i cmp = _mm_and_si128(
-        _mm_cmpeq_epi32(_a, F3minus1_m128i),
-        _mm_cmpeq_epi32(_b, F3minus1_m128i));
-
-    c = _mm_xor_si128(c, _mm_and_si128(F4_m128i, cmp));
-
-    // Modulo
-    return mod_after_multiply_f3(c);
-}
-
-inline m128i mul_f3_simple(m128i a, m128i b)
-{
-    m128i _a = _mm_load_si128(&a);
-    m128i _b = _mm_load_si128(&b);
-
-    m128i c = _mm_mullo_epi32(_a, _b);
-
-    // Modulo
-    return mod_after_multiply_f3(c);
-}
-
-/** Perform multiplication of two numbers a, b whose elements are of GF(card)
- *  where `card` is a prime Fermat number, i.e. card = Fx with x < 5
- *  Currently, it supports only for F3 and F4
- */
-inline m128i mul(m128i a, m128i b, aint32 card)
-{
-    assert(card == F4 || card == F3);
-    if (card == F4)
-        return mul_f4(a, b);
-    return mul_f3(a, b);
-}
-
-inline m128i mul_simple(m128i a, m128i b, aint32 card)
-{
-    assert(card == F4 || card == F3);
-    if (card == F4)
-        return mul_f4_simple(a, b);
-    return mul_f3_simple(a, b);
-}
-
-/** Apply an element-wise negation to a buffer
- */
-inline void neg(size_t len, aint32* buf, aint32 card = F4)
-{
-    m128i* _buf = reinterpret_cast<m128i*>(buf);
-    unsigned ratio = sizeof(*_buf) / sizeof(*buf);
-    size_t _len = len / ratio;
-    size_t _last_len = len - _len * ratio;
-
-    size_t i;
-    for (i = 0; i < _len; i++) {
-        _buf[i] = neg(_buf[i], card);
-    }
-    if (_last_len > 0) {
-        for (i = _len * ratio; i < len; i++) {
-            if (buf[i] > 0)
-                buf[i] = card - buf[i];
-        }
-    }
-}
-
-/** Perform a multiplication of a coefficient `a` to each element of `src` and
- *  add result to correspondent element of `dest`
- */
-inline void mul_coef_to_buf(
-    const aint32 a,
-    aint32* src,
-    aint32* dest,
-    size_t len,
-    aint32 card = F4)
-{
-    const m128i coef = _mm_set1_epi32(a);
-
-    m128i* _src = reinterpret_cast<m128i*>(src);
-    m128i* _dest = reinterpret_cast<m128i*>(dest);
-    const unsigned ratio = sizeof(*_src) / sizeof(*src);
-    const size_t _len = len / ratio;
-    const size_t _last_len = len - _len * ratio;
-
-    size_t i;
-    for (i = 0; i < _len; i++) {
-        // perform multiplication
-        _dest[i] = mul(coef, _src[i], card);
-    }
-    if (_last_len > 0) {
-        uint64_t coef_64 = (uint64_t)a;
-        for (i = _len * ratio; i < len; i++) {
-            // perform multiplication
-            dest[i] = (aint32)((coef_64 * src[i]) % card);
-        }
-    }
-}
-
-inline void
-add_two_bufs(aint32* src, aint32* dest, size_t len, aint32 card = F4)
-{
-    m128i* _src = reinterpret_cast<m128i*>(src);
-    m128i* _dest = reinterpret_cast<m128i*>(dest);
-    const unsigned ratio = sizeof(*_src) / sizeof(*src);
-    const size_t _len = len / ratio;
-    const size_t _last_len = len - _len * ratio;
-
-    size_t i;
-    for (i = 0; i < _len; i++) {
-        // perform addition
-        _dest[i] = add(_src[i], _dest[i], card);
-    }
-    if (_last_len > 0) {
-        for (i = _len * ratio; i < len; i++) {
-            // perform addition
-            aint32 tmp = src[i] + dest[i];
-            dest[i] = (tmp >= card) ? (tmp - card) : tmp;
-        }
-    }
-}
-
-inline void sub_two_bufs(
-    aint32* bufa,
-    aint32* bufb,
-    aint32* res,
-    size_t len,
-    aint32 card = F4)
-{
-    m128i* _bufa = reinterpret_cast<m128i*>(bufa);
-    m128i* _bufb = reinterpret_cast<m128i*>(bufb);
-    m128i* _res = reinterpret_cast<m128i*>(res);
-    const unsigned ratio = sizeof(*_bufa) / sizeof(*bufa);
-    const size_t _len = len / ratio;
-    const size_t _last_len = len - _len * ratio;
-
-    size_t i;
-    for (i = 0; i < _len; i++) {
-        // perform subtraction
-        _res[i] = sub(_bufa[i], _bufb[i], card);
-    }
-    if (_last_len > 0) {
-        for (i = _len * ratio; i < len; i++) {
-            // perform subtraction
-            if (bufa[i] >= bufb[i])
-                res[i] = bufa[i] - bufb[i];
-            else
-                res[i] = card - (bufb[i] - bufa[i]);
-        }
-    }
-}
-
-inline void
-mul_two_bufs(aint32* src, aint32* dest, size_t len, aint32 card = F4)
-{
-    m128i* _src = reinterpret_cast<m128i*>(src);
-    m128i* _dest = reinterpret_cast<m128i*>(dest);
-    const unsigned ratio = sizeof(*_src) / sizeof(*src);
-    const size_t _len = len / ratio;
-    const size_t _last_len = len - _len * ratio;
-
-    size_t i;
-    for (i = 0; i < _len; i++) {
-        // perform multiplicaton
-        _dest[i] = mul(_src[i], _dest[i], card);
-    }
-    if (_last_len > 0) {
-        for (i = _len * ratio; i < len; i++) {
-            // perform multiplicaton
-            dest[i] = uint32_t((uint64_t(src[i]) * dest[i]) % card);
-        }
-    }
-}
-
-// outputA = inputA + inputB
-// outputB = inputA - inputB
-inline void butterfly_step(
-    m128i* inputA,
-    m128i* inputB,
-    m128i* outputA,
-    m128i* outputB,
-    uint32_t _card)
-{
-    const m128i card = (_card == F3) ? F3_m128i : F4_m128i;
-    const m128i card_1 = (_card == F3) ? F3minus1_m128i : F4minus1_m128i;
-
-    // --------------------------------------
-    // outputB = inputA - inputB
-    // --------------------------------------
-    m128i a = _mm_load_si128(inputA);
-    m128i b = _mm_load_si128(inputB);
-    m128i cmp_1 = _mm_cmpgt_epi32(b, a);
-    m128i res_1 = _mm_add_epi32(a, _mm_and_si128(card, cmp_1));
-
-    _mm_store_si128(outputB, _mm_sub_epi32(res_1, b));
-
-    // --------------------------------------
-    // outputA = symbA + symbB
-    // --------------------------------------
-    m128i res_2 = _mm_add_epi32(a, b);
-    // modulo
-    m128i cmp_2 = _mm_cmpgt_epi32(res_2, card_1);
-    m128i c = _mm_sub_epi32(res_2, _mm_and_si128(card, cmp_2));
-
-    _mm_store_si128(outputA, c);
-}
-
-// for each pair (P, Q) = (buf[i], buf[i + m]):
-// P = P + Q
-// Q = P - Q
-inline void butterfly_ct_1(
-    vec::Buffers<uint32_t>& buf,
-    unsigned start,
-    unsigned m,
-    unsigned step,
-    size_t len,
-    uint32_t card = F4)
-{
-    for (int i = start; i < buf.get_n(); i += step) {
-        uint32_t* a = buf.get(i);
-        uint32_t* b = buf.get(i + m);
-        m128i* _a = reinterpret_cast<m128i*>(a);
-        m128i* _b = reinterpret_cast<m128i*>(b);
-        // perform butterfly operation for Cooley-Tukey FFT algorithm
-        for (size_t j = 0; j < len; ++j) {
-            butterfly_step(&(_a[j]), &(_b[j]), &(_a[j]), &(_b[j]), card);
-        }
-    }
-}
-
-// for each pair (P, Q) = (buf[i], buf[i + m]):
-// P = P - Q
-// Q = P + Q
-inline void butterfly_ct_2(
-    vec::Buffers<uint32_t>& buf,
-    unsigned start,
-    unsigned m,
-    unsigned step,
-    size_t len,
-    uint32_t card = F4)
-{
-    for (int i = start; i < buf.get_n(); i += step) {
-        uint32_t* a = buf.get(i);
-        uint32_t* b = buf.get(i + m);
-        m128i* _a = reinterpret_cast<m128i*>(a);
-        m128i* _b = reinterpret_cast<m128i*>(b);
-        // perform butterfly operation for Cooley-Tukey FFT algorithm
-        for (size_t j = 0; j < len; ++j) {
-            butterfly_step(&(_a[j]), &(_b[j]), &(_b[j]), &(_a[j]), card);
-        }
-    }
-}
-
-// output = coef * input
-inline void
-butterfly_mul(m128i* coef, m128i* input, m128i* output, uint32_t _card)
-{
-    const m128i card = (_card == F3) ? F3_m128i : F4_m128i;
-    const m128i card_2 = (_card == F3) ? F3minus2_m128i : F4minus2_m128i;
-
-    // --------------------------------------
-    // compute coef * symbB
-    // --------------------------------------
-    m128i _coef = _mm_load_si128(coef);
-    m128i b = _mm_load_si128(input);
-    m128i res = _mm_mullo_epi32(_coef, b);
-    // modulo
-    m128i lo = _mm_and_si128(res, card_2);
-    m128i res_shift =
-        (_card == F3) ? _mm_srli_si128(res, 1) : _mm_srli_si128(res, 2);
-    m128i hi = _mm_and_si128(res_shift, card_2);
-
-    m128i cmp_1 = _mm_cmpgt_epi32(hi, lo);
-    m128i _lo = _mm_add_epi32(lo, _mm_and_si128(card, cmp_1));
-
-    m128i res_2 = _mm_sub_epi32(_lo, hi);
-
-    _mm_store_si128(output, res_2);
-}
-
-// symbA = symbA + coef * symbB
-// symbB = symbA - coef * symbB
-inline void
-butterfly_ct_3_step(m128i* coef, m128i* symbA, m128i* symbB, uint32_t _card)
-{
-    // --------------------------------------
-    // compute coef * symbB
-    // --------------------------------------
-    m128i coef_x_symbB;
-    butterfly_mul(coef, symbB, &coef_x_symbB, _card);
-    // --------------------------------------
-    // symbA = symbA + coef_x_symbB
-    // symbB = symbA - coef_x_symbB
-    // --------------------------------------
-    butterfly_step(symbA, &coef_x_symbB, symbA, symbB, _card);
-}
-
-// for each pair (P, Q) = (buf[i], buf[i + m]):
-// P = P + c * Q
-// Q = P - c * Q
-inline void butterfly_ct_3(
-    uint32_t coef,
-    vec::Buffers<uint32_t>& buf,
-    unsigned start,
-    unsigned m,
-    unsigned step,
-    size_t len,
-    uint32_t card = F4)
-{
-    m128i _coef = _mm_set1_epi32(coef);
-    for (int i = start; i < buf.get_n(); i += step) {
-        uint32_t* a = buf.get(i);
-        uint32_t* b = buf.get(i + m);
-        m128i* _a = reinterpret_cast<m128i*>(a);
-        m128i* _b = reinterpret_cast<m128i*>(b);
-        // perform butterfly operation for Cooley-Tukey FFT algorithm
-        for (size_t j = 0; j < len; ++j) {
-            butterfly_ct_3_step(&_coef, &(_a[j]), &(_b[j]), card);
-        }
-    }
-}
-
-// for each pair (P, Q) = (buf[i], buf[i + m]):
-// P = Q + P
-// Q = Q - P
-inline void butterfly_gs_2(
-    vec::Buffers<uint32_t>& buf,
-    unsigned start,
-    unsigned m,
-    unsigned step,
-    size_t len,
-    uint32_t card = F4)
-{
-    for (int i = start; i < buf.get_n(); i += step) {
-        uint32_t* a = buf.get(i);
-        uint32_t* b = buf.get(i + m);
-        m128i* _a = reinterpret_cast<m128i*>(a);
-        m128i* _b = reinterpret_cast<m128i*>(b);
-        // perform butterfly operation for Cooley-Tukey FFT algorithm
-        for (size_t j = 0; j < len; ++j) {
-            butterfly_step(&(_b[j]), &(_a[j]), &(_a[j]), &(_b[j]), card);
-        }
-    }
-}
-
-// symbA = symbA + symbB
-// symbB = coef * (symbA - symbB)
-inline void
-butterfly_gs_3_step(m128i* coef, m128i* symbA, m128i* symbB, uint32_t _card)
-{
-    // --------------------------------------
-    // symbA = symbA + symbB
-    // symbB = symbA - symbB
-    // --------------------------------------
-    butterfly_step(symbA, symbB, symbA, symbB, _card);
-
-    // --------------------------------------
-    // symbB = coef * symbB
-    // --------------------------------------
-    butterfly_mul(coef, symbB, symbB, _card);
-}
-
-// for each pair (P, Q) = (buf[i], buf[i + m]):
-// P = P + Q
-// Q = c * (P - Q)
-inline void butterfly_gs_3(
-    uint32_t coef,
-    vec::Buffers<uint32_t>& buf,
-    unsigned start,
-    unsigned m,
-    unsigned step,
-    size_t len,
-    uint32_t card = F4)
-{
-    m128i _coef = _mm_set1_epi32(coef);
-    for (int i = start; i < buf.get_n(); i += step) {
-        uint32_t* a = buf.get(i);
-        uint32_t* b = buf.get(i + m);
-        m128i* _a = reinterpret_cast<m128i*>(a);
-        m128i* _b = reinterpret_cast<m128i*>(b);
-        // perform butterfly operation for Cooley-Tukey FFT algorithm
-        for (size_t j = 0; j < len; ++j) {
-            butterfly_gs_3_step(&_coef, &(_a[j]), &(_b[j]), card);
-        }
-    }
-}
-
-inline void encode_post_process(
-    vec::Buffers<uint32_t>& output,
-    std::vector<Properties>& props,
-    off_t offset,
-    unsigned code_len,
-    uint32_t threshold,
-    size_t vecs_nb)
-{
-    const unsigned vec_size = simd::countof<uint32_t>();
-
-    const m128i _threshold = _mm_set1_epi32(threshold);
-    const uint32_t max = 1 << (sizeof(uint32_t) * 8 - 1);
-    const m128i mask_hi = _mm_set1_epi32(max);
-    const unsigned element_size = sizeof(uint32_t);
-
-    for (unsigned frag_id = 0; frag_id < code_len; ++frag_id) {
-        uint32_t* chunk = output.get(frag_id);
-        m128i* buf = reinterpret_cast<m128i*>(chunk);
-        for (unsigned vec_id = 0; vec_id < vecs_nb; ++vec_id) {
-            const m128i a = _mm_load_si128(&(buf[vec_id]));
-            const m128i b = _mm_cmpeq_epi32(_threshold, a);
-            const m128i c = _mm_and_si128(mask_hi, b);
-            uint16_t d = _mm_movemask_epi8(c);
-
-            while (d > 0) {
-                unsigned byte_idx = __builtin_ctz(d);
-                unsigned element_idx = byte_idx / element_size;
-                off_t _offset = offset + vec_id * vec_size + element_idx;
-                props[frag_id].add(_offset, 1);
-                d ^= 1 << byte_idx;
-            }
-        }
-    }
-}
-
-/* ==================== Operations for NF4 =================== */
-
-/** Return aint128 integer from a _m128i register */
-static inline aint128 m128i_to_uint128(m128i v)
-{
-    aint128 i;
-    _mm_store_si128((m128i*)&i, v);
-
-    return i; // NOLINT(clang-analyzer-core.uninitialized.UndefReturn)
-}
-
-inline __uint128_t add(__uint128_t a, __uint128_t b)
-{
-    m128i res = add((m128i)a, (m128i)b, F4);
-    return m128i_to_uint128(res);
-}
-
-inline __uint128_t sub(__uint128_t a, __uint128_t b)
-{
-    m128i res = sub((m128i)a, (m128i)b, F4);
-    return m128i_to_uint128(res);
-}
-
-inline __uint128_t mul(__uint128_t a, __uint128_t b)
-{
-    m128i res = mul((m128i)a, (m128i)b, F4);
-    return m128i_to_uint128(res);
-}
-
-inline void hadamard_mul(int n, aint128* _x, aint128* _y)
-{
-    int i;
-    m128i* x = reinterpret_cast<m128i*>(_x);
-    m128i* y = reinterpret_cast<m128i*>(_y);
-
-    // multiply y to `x`
-    for (i = 0; i < n; i++) {
-        x[i] = mul(x[i], y[i], F4);
-    }
-}
-
-} // namespace simd
-} // namespace quadiron
-
-#endif

From 08767746f3f251f4acf47db3aa2acb116e3e2aee Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Fri, 5 Oct 2018 13:44:03 +0200
Subject: [PATCH 09/77] SIMD Main file including necessary files

1. Essential operations

- simd_128.h contains essential wrappers of SIMD operations on SSE
- simd_256.h contains essential wrappers of SIMD operations on AVX

2. Basic operations

- simd_basic.h contains basic operations used in the following cases, and
also operations for RingModN

3. Vectorized operations

- simd_fnt.h contains vectorized operations dedicated for FNT
- simd_nf4.h contains vectorized operations dedicated for nf4
---
 src/simd.h | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/src/simd.h b/src/simd.h
index 8309bfff..d70fcb2f 100644
--- a/src/simd.h
+++ b/src/simd.h
@@ -39,31 +39,35 @@
 const unsigned F4 = 65537;
 const unsigned F3 = 257;
 
-typedef uint8_t aint8 __attribute__((aligned(quadiron::simd::ALIGNMENT)));
-typedef uint16_t aint16 __attribute__((aligned(quadiron::simd::ALIGNMENT)));
-typedef uint32_t aint32 __attribute__((aligned(quadiron::simd::ALIGNMENT)));
-typedef uint64_t aint64 __attribute__((aligned(quadiron::simd::ALIGNMENT)));
-typedef __uint128_t aint128 __attribute__((aligned(quadiron::simd::ALIGNMENT)));
-
 namespace quadiron {
-/** The namespace simd contains functions for GF-NF4 that are accelerated by
- *  using SIMD operations over 128bits
+/** The namespace simd contains functions accelerated by
+ *  using SIMD operations over 128bits and 256bits
  *
- *  It supports operations on 32-bit numbers
+ *  It supports operations on 16-bit and 32-bit numbers
  */
 namespace simd {
 
+#define EITHER(x, a, b) (((x)) ? (a) : (b))
+
 // Vectorized operations are implemented in appropriated headers simd*.h
 
 } // namespace simd
 } // namespace quadiron
 
+// Include essential operations that use SIMD functions
 #if defined(__AVX2__)
 #include "simd_256.h"
 #elif defined(__SSE4_1__)
 #include "simd_128.h"
 #endif
 
+// Include basic operations
+#include "simd_basic.h"
+
+// Include accelerated operations dedicated for FNT
+#include "simd_fnt.h"
+
+// Include accelerated operations dedicated for NF4
 #include "simd_nf4.h"
 
 #endif // #ifdef QUADIRON_USE_SIMD

From 4f97fa69d8e55a5a522860084e87758a8bf10d56 Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Fri, 5 Oct 2018 13:44:10 +0200
Subject: [PATCH 10/77] SIMD 128: essential operations for SSE

---
 src/simd_128.h | 134 +++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 123 insertions(+), 11 deletions(-)

diff --git a/src/simd_128.h b/src/simd_128.h
index 0f6b8857..afbfe70f 100644
--- a/src/simd_128.h
+++ b/src/simd_128.h
@@ -33,19 +33,131 @@
 
 #include <x86intrin.h>
 
-typedef __m128i m128i;
+namespace quadiron {
+namespace simd {
 
-// Disable `cert-err58-cpp` on these: AFAIK they cannot throw.
-// (probably a false positive present in Clang 5 and fixed in Clang 6).
-const m128i F4_m128i = _mm_set1_epi32(65537);       // NOLINT(cert-err58-cpp)
-const m128i F4minus1_m128i = _mm_set1_epi32(65536); // NOLINT(cert-err58-cpp)
-const m128i F3_m128i = _mm_set1_epi32(257);         // NOLINT(cert-err58-cpp)
-const m128i F3minus1_m128i = _mm_set1_epi32(256);   // NOLINT(cert-err58-cpp)
+typedef __m128i VecType;
+typedef uint32_t MaskIntType;
 
-const m128i F3_m128i_u16 = _mm_set1_epi16(257);       // NOLINT(cert-err58-cpp)
-const m128i F3minus1_m128i_u16 = _mm_set1_epi16(256); // NOLINT(cert-err58-cpp)
+#define F4_u32 _mm_set1_epi32(65537)
+#define F4m1_u32 _mm_set1_epi32(65536)
+#define F3_u32 _mm_set1_epi32(257)
+#define F3m1_u32 _mm_set1_epi32(256)
 
-#include "simd_128_u16.h"
-#include "simd_128_u32.h"
+#define F3_u16 _mm_set1_epi16(257)
+#define F3m1_u16 _mm_set1_epi16(256)
+
+#define CARD(q) (EITHER(q == F3, F3_u32, F4_u32))
+#define CARD_M_1(q) (EITHER(q == F3, F3m1_u32, F4m1_u32))
+
+/* ============= Essential Operations for SSE w/ both u16 & u32 ============= */
+
+#define ZERO (_mm_setzero_si128())
+#define ONE16 (_mm_set1_epi16(1))
+#define ONE32 (_mm_set1_epi32(1))
+
+inline VecType LOAD(VecType* address)
+{
+    return _mm_load_si128(address);
+}
+inline void STORE(VecType* address, VecType reg)
+{
+    _mm_store_si128(address, reg);
+}
+
+inline VecType AND(VecType x, VecType y)
+{
+    return _mm_and_si128(x, y);
+}
+inline VecType XOR(VecType x, VecType y)
+{
+    return _mm_xor_si128(x, y);
+}
+inline VecType SHIFTR_1(VecType x)
+{
+    return _mm_srli_si128(x, 1);
+}
+inline VecType SHIFTR_2(VecType x)
+{
+    return _mm_srli_si128(x, 2);
+}
+inline uint16_t MVMSK8(VecType x)
+{
+    return _mm_movemask_epi8(x);
+}
+inline uint16_t TESTZ(VecType x, VecType y)
+{
+    return _mm_testz_si128(x, y);
+}
+
+/* ================= Essential Operations for SSE w/ u32 ================== */
+
+inline VecType SET1(uint32_t val)
+{
+    return _mm_set1_epi32(val);
+}
+inline VecType ADD32(VecType x, VecType y)
+{
+    return _mm_add_epi32(x, y);
+}
+inline VecType SUB32(VecType x, VecType y)
+{
+    return _mm_sub_epi32(x, y);
+}
+inline VecType MUL32(VecType x, VecType y)
+{
+    return _mm_mullo_epi32(x, y);
+}
+
+inline VecType CMPEQ32(VecType x, VecType y)
+{
+    return _mm_cmpeq_epi32(x, y);
+}
+inline VecType CMPGT32(VecType x, VecType y)
+{
+    return _mm_cmpgt_epi32(x, y);
+}
+inline VecType MINU32(VecType x, VecType y)
+{
+    return _mm_min_epu32(x, y);
+}
+#define MASK8_LO (_mm_set1_epi16(0x80))
+#define BLEND8(x, y, mask) (_mm_blendv_epi8(x, y, mask))
+#define BLEND16(x, y, imm8) (_mm_blend_epi16(x, y, imm8))
+
+/* ================= Essential Operations for SSE w/ u16 ================== */
+
+inline VecType SET1(uint16_t val)
+{
+    return _mm_set1_epi16(val);
+}
+inline VecType ADD16(VecType x, VecType y)
+{
+    return _mm_add_epi16(x, y);
+}
+inline VecType SUB16(VecType x, VecType y)
+{
+    return _mm_sub_epi16(x, y);
+}
+inline VecType MUL16(VecType x, VecType y)
+{
+    return _mm_mullo_epi16(x, y);
+}
+
+inline VecType CMPEQ16(VecType x, VecType y)
+{
+    return _mm_cmpeq_epi16(x, y);
+}
+inline VecType CMPGT16(VecType x, VecType y)
+{
+    return _mm_cmpgt_epi16(x, y);
+}
+inline VecType MINU16(VecType x, VecType y)
+{
+    return _mm_min_epu16(x, y);
+}
+
+} // namespace simd
+} // namespace quadiron
 
 #endif

From 7dc72c1ce36ec640b136c27746fc66b3db0baf25 Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Fri, 5 Oct 2018 13:44:16 +0200
Subject: [PATCH 11/77] SIMD 256: essential operations for AVX

---
 src/simd_256.h | 144 +++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 129 insertions(+), 15 deletions(-)

diff --git a/src/simd_256.h b/src/simd_256.h
index 2dc49cc4..d06f3218 100644
--- a/src/simd_256.h
+++ b/src/simd_256.h
@@ -33,19 +33,6 @@
 
 #include <x86intrin.h>
 
-typedef __m256i m256i;
-
-// Disable `cert-err58-cpp` on these: AFAIK they cannot throw.
-// (probably a false positive present in Clang 5 and fixed in Clang 6).
-const m256i F4_m256i = _mm256_set1_epi32(65537);       // NOLINT(cert-err58-cpp)
-const m256i F4minus1_m256i = _mm256_set1_epi32(65536); // NOLINT(cert-err58-cpp)
-const m256i F3_m256i = _mm256_set1_epi32(257);         // NOLINT(cert-err58-cpp)
-const m256i F3minus1_m256i = _mm256_set1_epi32(256);   // NOLINT(cert-err58-cpp)
-
-const m256i F3_m256i_u16 = _mm256_set1_epi16(257); // NOLINT(cert-err58-cpp)
-// NOLINTNEXTLINE(cert-err58-cpp)
-const m256i F3minus1_m256i_u16 = _mm256_set1_epi16(256);
-
 /* GCC doesn't include the split store intrinsics so define them here. */
 #if defined(__GNUC__) && !defined(__clang__)
 
@@ -58,7 +45,134 @@ _mm256_storeu2_m128i(__m128i* const hi, __m128i* const lo, const __m256i a)
 
 #endif /* defined(__GNUC__) */
 
-#include "simd_256_u16.h"
-#include "simd_256_u32.h"
+namespace quadiron {
+namespace simd {
+
+typedef __m256i VecType;
+typedef __m128i HalfVecType;
+typedef __uint128_t NF4Type;
+typedef uint32_t MaskIntType;
+
+#define F4_u32 _mm256_set1_epi32(65537)
+#define F4m1_u32 _mm256_set1_epi32(65536)
+#define F3_u32 _mm256_set1_epi32(257)
+#define F3m1_u32 _mm256_set1_epi32(256)
+
+#define F3_u16 _mm256_set1_epi16(257)
+#define F3m1_u16 _mm256_set1_epi16(256)
+
+#define CARD(q) (EITHER(q == F3, F3_u32, F4_u32))
+#define CARD_M_1(q) (EITHER(q == F3, F3m1_u32, F4m1_u32))
+
+/* ============= Essential Operations for AVX2 w/ both u16 & u32 ============ */
+
+#define ZERO (_mm256_setzero_si256())
+#define ONE16 (_mm256_set1_epi16(1))
+#define ONE32 (_mm256_set1_epi32(1))
+
+inline VecType LOAD(VecType* address)
+{
+    return _mm256_load_si256(address);
+}
+inline void STORE(VecType* address, VecType reg)
+{
+    _mm256_store_si256(address, reg);
+}
+
+inline VecType AND(VecType x, VecType y)
+{
+    return _mm256_and_si256(x, y);
+}
+inline VecType XOR(VecType x, VecType y)
+{
+    return _mm256_xor_si256(x, y);
+}
+inline VecType SHIFTR_1(VecType x)
+{
+    return _mm256_srli_si256(x, 1);
+}
+inline VecType SHIFTR_2(VecType x)
+{
+    return _mm256_srli_si256(x, 2);
+}
+inline uint32_t MVMSK8(VecType x)
+{
+    return _mm256_movemask_epi8(x);
+}
+inline uint32_t TESTZ(VecType x, VecType y)
+{
+    return _mm256_testz_si256(x, y);
+}
+
+/* ================= Essential Operations for AVX2 w/ u32 ================= */
+
+inline VecType SET1(uint32_t val)
+{
+    return _mm256_set1_epi32(val);
+}
+inline VecType ADD32(VecType x, VecType y)
+{
+    return _mm256_add_epi32(x, y);
+}
+inline VecType SUB32(VecType x, VecType y)
+{
+    return _mm256_sub_epi32(x, y);
+}
+inline VecType MUL32(VecType x, VecType y)
+{
+    return _mm256_mullo_epi32(x, y);
+}
+
+inline VecType CMPEQ32(VecType x, VecType y)
+{
+    return _mm256_cmpeq_epi32(x, y);
+}
+inline VecType CMPGT32(VecType x, VecType y)
+{
+    return _mm256_cmpgt_epi32(x, y);
+}
+inline VecType MINU32(VecType x, VecType y)
+{
+    return _mm256_min_epu32(x, y);
+}
+
+#define MASK8_LO (_mm256_set1_epi16(0x80))
+#define BLEND8(x, y, mask) (_mm256_blendv_epi8(x, y, mask))
+#define BLEND16(x, y, imm8) (_mm256_blend_epi16(x, y, imm8))
+
+/* ================= Essential Operations for AVX2 w/ u16 ================= */
+
+inline VecType SET1(uint16_t val)
+{
+    return _mm256_set1_epi16(val);
+}
+inline VecType ADD16(VecType x, VecType y)
+{
+    return _mm256_add_epi16(x, y);
+}
+inline VecType SUB16(VecType x, VecType y)
+{
+    return _mm256_sub_epi16(x, y);
+}
+inline VecType MUL16(VecType x, VecType y)
+{
+    return _mm256_mullo_epi16(x, y);
+}
+
+inline VecType CMPEQ16(VecType x, VecType y)
+{
+    return _mm256_cmpeq_epi16(x, y);
+}
+inline VecType CMPGT16(VecType x, VecType y)
+{
+    return _mm256_cmpgt_epi16(x, y);
+}
+inline VecType MINU16(VecType x, VecType y)
+{
+    return _mm256_min_epu16(x, y);
+}
+
+} // namespace simd
+} // namespace quadiron
 
 #endif

From fceff91f8700735848eb63c4f29ed57185867bf9 Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Fri, 5 Oct 2018 13:44:21 +0200
Subject: [PATCH 12/77] SIMD Basic: includes basic Operations

It implements basic operations that will be used everywhere.
It includes also operations for RingModN
---
 src/simd_basic.h | 404 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 404 insertions(+)
 create mode 100644 src/simd_basic.h

diff --git a/src/simd_basic.h b/src/simd_basic.h
new file mode 100644
index 00000000..7585734d
--- /dev/null
+++ b/src/simd_basic.h
@@ -0,0 +1,404 @@
+/*
+ * Copyright 2017-2018 Scality
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __QUAD_SIMD_BASIC_H__
+#define __QUAD_SIMD_BASIC_H__
+
+#include <x86intrin.h>
+
+namespace quadiron {
+namespace simd {
+
+/* ================= Basic Operations for u32 ================= */
+
+/**
+ * Modular addition for packed unsigned 32-bit integers
+ *
+ * @param x input register
+ * @param y input register
+ * @param q modulo
+ * @return (x + y) mod q
+ */
+inline VecType ADD_MOD(VecType x, VecType y, uint32_t q)
+{
+    VecType res = ADD32(x, y);
+    return MINU32(res, SUB32(res, CARD(q)));
+}
+
+/**
+ * Modular subtraction for packed unsigned 32-bit integers
+ *
+ * @param x input register
+ * @param y input register
+ * @param q modulo
+ * @return (x - y) mod q
+ */
+inline VecType SUB_MOD(VecType x, VecType y, uint32_t q)
+{
+    VecType res = SUB32(x, y);
+    return MINU32(res, ADD32(res, CARD(q)));
+}
+
+/**
+ * Modular negation for packed unsigned 32-bit integers
+ *
+ * @param x input register
+ * @param q modulo
+ * @return (-x) mod q
+ */
+inline VecType NEG_MOD(VecType x, uint32_t q)
+{
+    VecType res = SUB32(CARD(q), x);
+    return MINU32(res, SUB32(res, CARD(q)));
+}
+
+/**
+ * Modular multiplication for packed unsigned 32-bit integers
+ *
+ * @note We assume that at least `x` or `y` is less than `q-1` so it's
+ * not necessary to verify overflow on multiplying elements
+ *
+ * @param x input register
+ * @param y input register
+ * @param q modulo
+ * @return (x * y) mod q
+ */
+inline VecType MUL_MOD(VecType x, VecType y, uint32_t q)
+{
+    VecType res = MUL32(x, y);
+    VecType lo =
+        (q == F3) ? BLEND8(ZERO, res, MASK8_LO) : BLEND16(ZERO, res, 0x55);
+    VecType hi = (q == F3) ? BLEND8(ZERO, SHIFTR_1(res), MASK8_LO)
+                           : BLEND16(ZERO, SHIFTR_2(res), 0x55);
+    return SUB_MOD(lo, hi, q);
+}
+
+/**
+ * Modular general multiplication for packed unsigned 32-bit integers
+ *
+ * @note It's necessary to verify overflow on multiplying elements
+ *
+ * @param x input register
+ * @param y input register
+ * @param q modulo
+ * @return (x * y) mod q
+ */
+inline VecType MULFULL_MOD(VecType x, VecType y, uint32_t q)
+{
+    VecType res = MUL32(x, y);
+
+    // filter elements of both of a & b = card-1
+    VecType cmp = AND(CMPEQ32(x, CARD_M_1(q)), CMPEQ32(y, CARD_M_1(q)));
+    res = (q == F3) ? XOR(res, AND(F4_u32, cmp)) : ADD32(res, AND(ONE32, cmp));
+
+    VecType lo =
+        (q == F3) ? BLEND8(ZERO, res, MASK8_LO) : BLEND16(ZERO, res, 0x55);
+    VecType hi = (q == F3) ? BLEND8(ZERO, SHIFTR_1(res), MASK8_LO)
+                           : BLEND16(ZERO, SHIFTR_2(res), 0x55);
+    return SUB_MOD(lo, hi, q);
+}
+
+/**
+ * Mark elements of `symb` equal to `threshold` (packed unsigned 32-bit)
+ *
+ * @param props properties receiving an OOR_MARK per selected byte
+ * @param threshold register compared for equality against `symb`
+ * @param mask ANDed with the comparison result before taking the byte mask
+ * @param symb input register
+ * @param offset base offset; the matching element index is added to it
+ * @param max unused in the body; only its type picks this overload
+ */
+inline void ADD_PROPS(
+    Properties& props,
+    VecType threshold,
+    VecType mask,
+    VecType symb,
+    off_t offset,
+    uint32_t max)
+{
+    const VecType b = CMPEQ32(threshold, symb);
+    const VecType c = AND(mask, b);
+    MaskIntType d = MVMSK8(c);
+    const unsigned element_size = sizeof(uint32_t);
+    while (d > 0) {
+        unsigned byte_idx = __builtin_ctz(d);
+        off_t _offset = offset + byte_idx / element_size;
+        props.add(_offset, OOR_MARK);
+        d &= d - 1; // clear lowest set bit; avoids signed `1 << 31`
+    }
+}
+
+/* ================= Basic Operations for u16 ================= */
+
+/**
+ * Modular addition for packed unsigned 16-bit integers
+ *
+ * @param x input register
+ * @param y input register
+ * @param q modulo
+ * @return (x + y) mod q
+ */
+inline VecType ADD_MOD(VecType x, VecType y, uint16_t q)
+{
+    VecType res = ADD16(x, y);
+    return MINU16(res, SUB16(res, F3_u16));
+}
+
+/**
+ * Modular subtraction for packed unsigned 16-bit integers
+ *
+ * @param x input register
+ * @param y input register
+ * @param q modulo
+ * @return (x - y) mod q
+ */
+inline VecType SUB_MOD(VecType x, VecType y, uint16_t q)
+{
+    VecType res = SUB16(x, y);
+    return MINU16(res, SUB16(ADD16(x, F3_u16), y));
+}
+
+/**
+ * Modular negation for packed unsigned 16-bit integers
+ *
+ * @param x input register
+ * @param q modulo
+ * @return (-x) mod q
+ */
+inline VecType NEG_MOD(VecType x, uint16_t q)
+{
+    VecType res = SUB16(F3_u16, x);
+    return MINU16(res, SUB16(res, F3_u16));
+}
+
+/**
+ * Modular multiplication for packed unsigned 16-bit integers
+ *
+ * @note We assume that at least `x` or `y` is less than `q-1` so it's
+ * not necessary to verify overflow on multiplying elements
+ *
+ * @param x input register
+ * @param y input register
+ * @param q modulo
+ * @return (x * y) mod q
+ */
+inline VecType MUL_MOD(VecType x, VecType y, uint16_t q)
+{
+    VecType res = MUL16(x, y);
+    VecType lo = BLEND8(ZERO, res, MASK8_LO);
+    VecType hi = BLEND8(ZERO, SHIFTR_1(res), MASK8_LO);
+    return SUB_MOD(lo, hi, q);
+}
+
+/**
+ * Modular general multiplication for packed unsigned 16-bit integers
+ *
+ * @note It's necessary to verify overflow on multiplying elements
+ *
+ * @param x input register
+ * @param y input register
+ * @param q modulo
+ * @return (x * y) mod q
+ */
+inline VecType MULFULL_MOD(VecType x, VecType y, uint16_t q)
+{
+    VecType res = MUL16(x, y);
+
+    // filter elements of both of a & b = card-1
+    VecType cmp = AND(CMPEQ16(x, F3m1_u16), CMPEQ16(y, F3m1_u16));
+    res = ADD16(res, AND(ONE16, cmp));
+
+    VecType lo = BLEND8(ZERO, res, MASK8_LO);
+    VecType hi = BLEND8(ZERO, SHIFTR_1(res), MASK8_LO);
+    return SUB_MOD(lo, hi, q);
+}
+
+/**
+ * Mark elements of `symb` equal to `threshold` (packed unsigned 16-bit)
+ *
+ * @param props properties receiving an OOR_MARK per selected byte
+ * @param threshold register compared for equality against `symb`
+ * @param mask ANDed with the comparison result before taking the byte mask
+ * @param symb input register
+ * @param offset base offset; the matching element index is added to it
+ * @param max unused in the body; only its type picks this overload
+ */
+inline void ADD_PROPS(
+    Properties& props,
+    VecType threshold,
+    VecType mask,
+    VecType symb,
+    off_t offset,
+    uint16_t max)
+{
+    const VecType b = CMPEQ16(threshold, symb);
+    const VecType c = AND(mask, b);
+    MaskIntType d = MVMSK8(c);
+    const unsigned element_size = sizeof(uint16_t);
+    while (d > 0) {
+        unsigned byte_idx = __builtin_ctz(d);
+        off_t _offset = offset + byte_idx / element_size;
+        props.add(_offset, OOR_MARK);
+        d &= d - 1; // clear lowest set bit; avoids signed `1 << 31`
+    }
+}
+
+/* ==================== Operations for RingModN =================== */
+/** Multiply each element of `src` by the coefficient `a` (modulo `card`) and
+ *  store the result in the corresponding element of `dest`
+ *
+ * @note: 1 < `a` < card - 1
+ */
+template <typename T>
+inline void mul_coef_to_buf(const T a, T* src, T* dest, size_t len, T card)
+{
+    const VecType coef = SET1(a);
+
+    VecType* __restrict _src = reinterpret_cast<VecType*>(src);
+    VecType* __restrict _dest = reinterpret_cast<VecType*>(dest);
+    const unsigned ratio = sizeof(*_src) / sizeof(*src);
+    const size_t _len = len / ratio;
+    const size_t _last_len = len - _len * ratio;
+
+    size_t i = 0;
+    size_t end = (_len > 3) ? _len - 3 : 0;
+    for (; i < end; i += 4) {
+        _dest[i] = MUL_MOD(coef, _src[i], card);
+        _dest[i + 1] = MUL_MOD(coef, _src[i + 1], card);
+        _dest[i + 2] = MUL_MOD(coef, _src[i + 2], card);
+        _dest[i + 3] = MUL_MOD(coef, _src[i + 3], card);
+    }
+    for (; i < _len; ++i) {
+        _dest[i] = MUL_MOD(coef, _src[i], card);
+    }
+
+    if (_last_len > 0) {
+        DoubleSizeVal<T> coef_double = DoubleSizeVal<T>(a);
+        for (size_t i = _len * ratio; i < len; i++) {
+            dest[i] = (T)((coef_double * src[i]) % card);
+        }
+    }
+}
+
+template <typename T>
+inline void add_two_bufs(T* src, T* dest, size_t len, T card)
+{
+    VecType* __restrict _src = reinterpret_cast<VecType*>(src);
+    VecType* __restrict _dest = reinterpret_cast<VecType*>(dest);
+    const unsigned ratio = sizeof(*_src) / sizeof(*src);
+    const size_t _len = len / ratio;
+    const size_t _last_len = len - _len * ratio;
+
+    size_t i;
+    for (i = 0; i < _len; i++) {
+        _dest[i] = ADD_MOD(_src[i], _dest[i], card);
+    }
+    if (_last_len > 0) {
+        for (i = _len * ratio; i < len; i++) {
+            T tmp = src[i] + dest[i];
+            dest[i] = (tmp >= card) ? (tmp - card) : tmp;
+        }
+    }
+}
+
+template <typename T>
+inline void sub_two_bufs(T* bufa, T* bufb, T* res, size_t len, T card)
+{
+    VecType* __restrict _bufa = reinterpret_cast<VecType*>(bufa);
+    VecType* __restrict _bufb = reinterpret_cast<VecType*>(bufb);
+    VecType* __restrict _res = reinterpret_cast<VecType*>(res);
+    const unsigned ratio = sizeof(*_bufa) / sizeof(*bufa);
+    const size_t _len = len / ratio;
+    const size_t _last_len = len - _len * ratio;
+
+    size_t i;
+    for (i = 0; i < _len; i++) {
+        // perform subtraction
+        _res[i] = SUB_MOD(_bufa[i], _bufb[i], card);
+    }
+    if (_last_len > 0) {
+        for (i = _len * ratio; i < len; i++) {
+            // perform subtraction
+            if (bufa[i] >= bufb[i])
+                res[i] = bufa[i] - bufb[i];
+            else
+                res[i] = card - (bufb[i] - bufa[i]);
+        }
+    }
+}
+
+template <typename T>
+inline void mul_two_bufs(T* src, T* dest, size_t len, T card)
+{
+    VecType* __restrict _src = reinterpret_cast<VecType*>(src);
+    VecType* __restrict _dest = reinterpret_cast<VecType*>(dest);
+    const unsigned ratio = sizeof(*_src) / sizeof(*src);
+    const size_t _len = len / ratio;
+    const size_t _last_len = len - _len * ratio;
+
+    size_t i;
+    for (i = 0; i < _len; i++) {
+        // dest[i] = (src[i] * dest[i]) mod card, one register at a time
+        _dest[i] = MULFULL_MOD(_src[i], _dest[i], card);
+    }
+    if (_last_len > 0) {
+        for (i = _len * ratio; i < len; i++) {
+            // scalar tail for the elements that do not fill a register
+            dest[i] = T((DoubleSizeVal<T>(src[i]) * dest[i]) % card);
+        }
+    }
+}
+
+/** Apply an element-wise negation to a buffer
+ */
+template <typename T>
+inline void neg(size_t len, T* buf, T card)
+{
+    VecType* _buf = reinterpret_cast<VecType*>(buf);
+    unsigned ratio = sizeof(*_buf) / sizeof(*buf);
+    size_t _len = len / ratio;
+    size_t _last_len = len - _len * ratio;
+
+    size_t i;
+    for (i = 0; i < _len; i++) {
+        _buf[i] = NEG_MOD(_buf[i], card);
+    }
+    if (_last_len > 0) {
+        for (i = _len * ratio; i < len; i++) {
+            if (buf[i])
+                buf[i] = card - buf[i];
+        }
+    }
+}
+
+} // namespace simd
+} // namespace quadiron
+
+#endif

From ec70d623d4b84a071c04cac79aaa6145914f0500 Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Fri, 5 Oct 2018 13:44:27 +0200
Subject: [PATCH 13/77] SIMD NF4 contains vectorized operations for NF4

---
 src/simd_nf4.h | 233 +++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 208 insertions(+), 25 deletions(-)

diff --git a/src/simd_nf4.h b/src/simd_nf4.h
index 7d517430..b86e284e 100644
--- a/src/simd_nf4.h
+++ b/src/simd_nf4.h
@@ -38,37 +38,36 @@
 namespace quadiron {
 namespace simd {
 
-#ifdef __AVX2__
-typedef __m128i m128i;
+typedef uint32_t aint32 __attribute__((aligned(ALIGNMENT)));
+typedef __uint128_t NF4Type;
 
-/** Return aint128 integer from a _m128i register */
-static inline aint128 m128i_to_uint128(m128i v)
+/** Return NF4Type integer from a _m128i register */
+static inline NF4Type m128i_to_uint128(__m128i v)
 {
-    aint128 i;
-    _mm_store_si128((m128i*)&i, v);
+    NF4Type i;
+    _mm_store_si128((__m128i*)&i, v);
 
     return i; // NOLINT(clang-analyzer-core.uninitialized.UndefReturn)
 }
-#endif // #ifdef __AVX2__
 
-inline aint128 expand16(uint16_t* arr, int n)
+inline NF4Type expand16(uint16_t* arr, int n)
 {
     // since n <= 4
     uint16_t _arr[4] __attribute__((aligned(ALIGNMENT))) = {0, 0, 0, 0};
     std::copy_n(arr, n, _arr);
 
-    m128i b = _mm_set_epi16(0, 0, 0, 0, _arr[3], _arr[2], _arr[1], _arr[0]);
+    __m128i b = _mm_set_epi16(0, 0, 0, 0, _arr[3], _arr[2], _arr[1], _arr[0]);
 
     return m128i_to_uint128(b);
 }
 
-inline aint128 expand32(uint32_t* arr, int n)
+inline NF4Type expand32(uint32_t* arr, int n)
 {
     // since n <= 4
     uint32_t _arr[4] __attribute__((aligned(simd::ALIGNMENT))) = {0, 0, 0, 0};
     std::copy_n(arr, n, _arr);
 
-    m128i b = _mm_set_epi32(_arr[3], _arr[2], _arr[1], _arr[0]);
+    __m128i b = _mm_set_epi32(_arr[3], _arr[2], _arr[1], _arr[0]);
 
     return m128i_to_uint128(b);
 }
@@ -76,9 +75,9 @@ inline aint128 expand32(uint32_t* arr, int n)
 inline GroupedValues<__uint128_t> unpack(__uint128_t a, int n)
 {
     uint16_t ai[8];
-    aint128 values;
+    NF4Type values;
 
-    m128i _a = _mm_loadu_si128((m128i*)&a);
+    __m128i _a = _mm_loadu_si128((__m128i*)&a);
     ai[0] = _mm_extract_epi16(_a, 0);
     ai[1] = _mm_extract_epi16(_a, 1);
     ai[2] = _mm_extract_epi16(_a, 2);
@@ -91,8 +90,8 @@ inline GroupedValues<__uint128_t> unpack(__uint128_t a, int n)
     const uint32_t flag =
         ai[1] | (!!ai[3] << 1u) | (!!ai[5] << 2u) | (!!ai[7] << 3u);
 
-    m128i val = _mm_set_epi16(0, 0, 0, 0, ai[6], ai[4], ai[2], ai[0]);
-    _mm_store_si128((m128i*)&values, val);
+    __m128i val = _mm_set_epi16(0, 0, 0, 0, ai[6], ai[4], ai[2], ai[0]);
+    _mm_store_si128((__m128i*)&values, val);
 
     GroupedValues<__uint128_t> b = {values, flag};
 
@@ -102,9 +101,9 @@ inline GroupedValues<__uint128_t> unpack(__uint128_t a, int n)
 inline void unpack(__uint128_t a, GroupedValues<__uint128_t>& b, int n)
 {
     uint16_t ai[8];
-    aint128 values;
+    NF4Type values;
 
-    m128i _a = _mm_loadu_si128((m128i*)&a);
+    __m128i _a = _mm_loadu_si128((__m128i*)&a);
     ai[0] = _mm_extract_epi16(_a, 0);
     ai[1] = _mm_extract_epi16(_a, 1);
     ai[2] = _mm_extract_epi16(_a, 2);
@@ -117,17 +116,17 @@ inline void unpack(__uint128_t a, GroupedValues<__uint128_t>& b, int n)
     const uint32_t flag =
         ai[1] | (!!ai[3] << 1u) | (!!ai[5] << 2u) | (!!ai[7] << 3u);
 
-    m128i val = _mm_set_epi16(0, 0, 0, 0, ai[6], ai[4], ai[2], ai[0]);
-    _mm_store_si128((m128i*)&values, val);
+    __m128i val = _mm_set_epi16(0, 0, 0, 0, ai[6], ai[4], ai[2], ai[0]);
+    _mm_store_si128((__m128i*)&values, val);
 
     b.flag = flag;
     b.values = values; // NOLINT(clang-analyzer-core.uninitialized.Assign)
 }
 
-inline aint128 pack(__uint128_t a)
+inline NF4Type pack(__uint128_t a)
 {
-    m128i _a = _mm_loadu_si128((m128i*)&a);
-    m128i b = _mm_set_epi32(
+    __m128i _a = _mm_loadu_si128((__m128i*)&a);
+    __m128i b = _mm_set_epi32(
         _mm_extract_epi16(_a, 3),
         _mm_extract_epi16(_a, 2),
         _mm_extract_epi16(_a, 1),
@@ -136,10 +135,10 @@ inline aint128 pack(__uint128_t a)
     return m128i_to_uint128(b);
 }
 
-inline aint128 pack(__uint128_t a, uint32_t flag)
+inline NF4Type pack(__uint128_t a, uint32_t flag)
 {
     aint32 b0, b1, b2, b3;
-    m128i _a = _mm_loadu_si128((m128i*)&a);
+    __m128i _a = _mm_loadu_si128((__m128i*)&a);
 
     if (flag & 1)
         b0 = 65536;
@@ -161,11 +160,195 @@ inline aint128 pack(__uint128_t a, uint32_t flag)
     else
         b3 = _mm_extract_epi16(_a, 3);
 
-    m128i b = _mm_set_epi32(b3, b2, b1, b0);
+    __m128i b = _mm_set_epi32(b3, b2, b1, b0);
 
     return m128i_to_uint128(b);
 }
 
+/* ================= Basic operations for NF4 ================= */
+
+#if defined(__AVX2__)
+
+inline VecType CAST_TO_DOUBLE(HalfVecType x)
+{
+    return _mm256_castsi128_si256(x);
+}
+
+inline void STORE_LOW(HalfVecType* address, VecType reg)
+{
+    _mm_store_si128(address, _mm256_castsi256_si128(reg));
+}
+
+inline NF4Type add(NF4Type a, NF4Type b)
+{
+    HalfVecType res;
+    VecType _a = CAST_TO_DOUBLE((HalfVecType)a);
+    VecType _b = CAST_TO_DOUBLE((HalfVecType)b);
+    STORE_LOW(&res, ADD_MOD(_a, _b, F4));
+    return (NF4Type)res;
+}
+
+inline NF4Type sub(NF4Type a, NF4Type b)
+{
+    HalfVecType res;
+    VecType _a = CAST_TO_DOUBLE((HalfVecType)a);
+    VecType _b = CAST_TO_DOUBLE((HalfVecType)b);
+    STORE_LOW(&res, SUB_MOD(_a, _b, F4));
+    return (NF4Type)res;
+}
+
+inline NF4Type mul(NF4Type a, NF4Type b)
+{
+    HalfVecType res;
+    VecType _a = CAST_TO_DOUBLE((HalfVecType)a);
+    VecType _b = CAST_TO_DOUBLE((HalfVecType)b);
+    STORE_LOW(&res, MULFULL_MOD(_a, _b, F4));
+    return (NF4Type)res;
+}
+
+inline void
+add_buf_to_two_bufs_rem(unsigned n, NF4Type* x, NF4Type* x_half, NF4Type* y)
+{
+    // add y[i] to both x[i] and x_half[i], one HalfVecType at a time
+    HalfVecType* _x = reinterpret_cast<HalfVecType*>(x);
+    HalfVecType* _x_half = reinterpret_cast<HalfVecType*>(x_half);
+    HalfVecType* _y = reinterpret_cast<HalfVecType*>(y);
+    for (unsigned i = 0; i < n; ++i) {
+        VecType _x_p = CAST_TO_DOUBLE(_x[i]);
+        VecType _x_next_p = CAST_TO_DOUBLE(_x_half[i]);
+        VecType _y_p = CAST_TO_DOUBLE(_y[i]);
+
+        STORE_LOW(_x + i, ADD_MOD(_x_p, _y_p, F4));
+        STORE_LOW(_x_half + i, ADD_MOD(_x_next_p, _y_p, F4));
+    }
+}
+
+inline void hadamard_mul_rem(unsigned n, NF4Type* x, NF4Type* y)
+{
+    HalfVecType* _x = reinterpret_cast<HalfVecType*>(x);
+    HalfVecType* _y = reinterpret_cast<HalfVecType*>(y);
+    for (unsigned i = 0; i < n; ++i) {
+        VecType _x_p = CAST_TO_DOUBLE(_x[i]);
+        VecType _y_p = CAST_TO_DOUBLE(_y[i]);
+
+        STORE_LOW(_x + i, MULFULL_MOD(_x_p, _y_p, F4));
+    }
+}
+
+inline void
+hadamard_mul_doubled_rem(unsigned n, NF4Type* x, NF4Type* x_half, NF4Type* y)
+{
+    HalfVecType* _x = reinterpret_cast<HalfVecType*>(x);
+    HalfVecType* _x_half = reinterpret_cast<HalfVecType*>(x_half);
+    HalfVecType* _y = reinterpret_cast<HalfVecType*>(y);
+    for (unsigned i = 0; i < n; ++i) {
+        VecType _x_p = CAST_TO_DOUBLE(_x[i]);
+        VecType _x_next_p = CAST_TO_DOUBLE(_x_half[i]);
+        VecType _y_p = CAST_TO_DOUBLE(_y[i]);
+
+        STORE_LOW(_x + i, MULFULL_MOD(_x_p, _y_p, F4));
+        STORE_LOW(_x_half + i, MULFULL_MOD(_x_next_p, _y_p, F4));
+    }
+}
+
+#elif defined(__SSE4_1__)
+
+inline NF4Type add(NF4Type a, NF4Type b)
+{
+    VecType res;
+    STORE(&res, ADD_MOD((VecType)a, (VecType)b, F4));
+    return (NF4Type)res;
+}
+
+inline NF4Type sub(NF4Type a, NF4Type b)
+{
+    VecType res;
+    STORE(&res, SUB_MOD((VecType)a, (VecType)b, F4));
+    return (NF4Type)res;
+}
+
+inline NF4Type mul(NF4Type a, NF4Type b)
+{
+    VecType res;
+    STORE(&res, MULFULL_MOD((VecType)a, (VecType)b, F4));
+    return (NF4Type)res;
+}
+
+inline void
+add_buf_to_two_bufs_rem(unsigned n, NF4Type* x, NF4Type* x_half, NF4Type* y)
+{
+    // do nothing
+}
+
+inline void hadamard_mul_rem(unsigned n, NF4Type* x, NF4Type* y)
+{
+    // do nothing
+}
+
+inline void
+hadamard_mul_doubled_rem(unsigned n, NF4Type* x, NF4Type* x_half, NF4Type* y)
+{
+    // do nothing
+}
+
+#endif
+
+/* ==================== Operations for NF4 =================== */
+
+/** Add the n / 2 values of `y` to each half of `x` (`n` values), using F4 */
+inline void add_buf_to_two_bufs(unsigned n, NF4Type* _x, NF4Type* _y)
+{
+    unsigned i;
+    VecType* x = reinterpret_cast<VecType*>(_x);
+    VecType* y = reinterpret_cast<VecType*>(_y);
+
+    const unsigned ratio = sizeof(*x) / sizeof(*_x); // NF4 values per vector
+    const unsigned half_len = n / 2;
+    const unsigned vec_len = half_len / ratio;   // whole vectors per half
+    const unsigned num_len = vec_len * ratio;    // values covered by them
+    const unsigned rem_len = half_len - num_len; // values left per half
+
+    NF4Type* x_half = _x + half_len;
+    VecType* x_next = reinterpret_cast<VecType*>(x_half);
+
+    // add y to the first half of `x`
+    for (i = 0; i < vec_len; ++i) {
+        x[i] = ADD_MOD(x[i], y[i], F4);
+    }
+
+    // add y to the second half of `x` (x_next is dereferenced as a VecType)
+    for (i = 0; i < vec_len; ++i) {
+        x_next[i] = ADD_MOD(x_next[i], y[i], F4);
+    }
+
+    if (rem_len > 0) { // values that do not fill a whole vector
+        add_buf_to_two_bufs_rem(
+            rem_len, _x + num_len, x_half + num_len, _y + num_len);
+    }
+}
+
+inline void hadamard_mul(unsigned n, NF4Type* _x, NF4Type* _y)
+{
+    unsigned i;
+    VecType* x = reinterpret_cast<VecType*>(_x);
+    VecType* y = reinterpret_cast<VecType*>(_y);
+
+    const unsigned ratio = sizeof(*x) / sizeof(*_x); // NF4 values per vector
+    const unsigned vec_len = n / ratio;              // whole vectors
+    const unsigned num_len = vec_len * ratio;        // values covered by them
+    const unsigned rem_len = n - num_len;            // values left over
+
+    // multiply `x` by `y` element-wise, one whole vector at a time
+    for (i = 0; i < vec_len; ++i) {
+        x[i] = MULFULL_MOD(x[i], y[i], F4);
+    }
+
+    if (rem_len > 0) {
+        // multiply the trailing values that do not fill a whole vector
+        hadamard_mul_rem(rem_len, _x + num_len, _y + num_len);
+    }
+}
+
 } // namespace simd
 } // namespace quadiron
 

From 542c81080427b1d8640f0bcb4e86bea1a0faafe4 Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Fri, 5 Oct 2018 13:44:34 +0200
Subject: [PATCH 14/77] SIMD FNT: vectorised operations for FNT

---
 src/simd_fnt.h | 555 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 555 insertions(+)
 create mode 100644 src/simd_fnt.h

diff --git a/src/simd_fnt.h b/src/simd_fnt.h
new file mode 100644
index 00000000..fc92b2d0
--- /dev/null
+++ b/src/simd_fnt.h
@@ -0,0 +1,555 @@
+/*
+ * Copyright 2017-2018 Scality
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __QUAD_SIMD_FNT_H__
+#define __QUAD_SIMD_FNT_H__
+
+#include <x86intrin.h>
+
+namespace quadiron {
+namespace simd {
+
+/* ================= Vectorized Operations ================= */
+
+// butterfly CT with r == 1
+template <typename T>
+inline void BUTTERFLY_1(VecType* x, VecType* y, T q)
+{
+    VecType add = ADD_MOD(*x, *y, q);
+    *y = SUB_MOD(*x, *y, q);
+    *x = add;
+}
+
+// butterfly CT with r == q - 1
+template <typename T>
+inline void BUTTERFLY_2(VecType* x, VecType* y, T q)
+{
+    VecType add = ADD_MOD(*x, *y, q);
+    *x = SUB_MOD(*x, *y, q);
+    *y = add;
+}
+
+// butterfly CT with 1 < r < q - 1
+template <typename T>
+inline void BUTTERFLY_3(VecType c, VecType* x, VecType* y, T q)
+{
+    VecType z = MUL_MOD(c, *y, q);
+    *y = SUB_MOD(*x, z, q);
+    *x = ADD_MOD(*x, z, q);
+}
+
+template <typename T>
+inline void BUTTERFLY_CT(T rp1, VecType c, VecType* x, VecType* y, T q)
+{
+    if (rp1 == 2) { // callers pass rp1 = r + 1, so this is r == 1
+        BUTTERFLY_1(x, y, q);
+    } else if (rp1 < q) { // remaining r with r + 1 < q: multiply by c
+        BUTTERFLY_3(c, x, y, q);
+    } else { // rp1 >= q: handled as the r == q - 1 case
+        BUTTERFLY_2(x, y, q);
+    }
+}
+
+// butterfly GS w/ r = q - 1
+template <typename T>
+inline void BUTTERFLY_4(VecType* x, VecType* y, T q)
+{
+    VecType add = ADD_MOD(*x, *y, q);
+    *y = SUB_MOD(*y, *x, q);
+    *x = add;
+}
+
+// butterfly GS w/ 1 < r < q - 1
+// x = x + y mod q
+// y = c * (x - y) mod q, using the x read before it is updated
+template <typename T>
+inline void BUTTERFLY_5(VecType c, VecType* x, VecType* y, T q)
+{
+    VecType sub = SUB_MOD(*x, *y, q);
+    *x = ADD_MOD(*x, *y, q);
+    *y = MUL_MOD(c, sub, q);
+}
+
+template <typename T>
+inline void BUTTERFLY_GS(T rp1, VecType c, VecType* x, VecType* y, T q)
+{
+    if (rp1 == 2) { // callers pass rp1 = r + 1, so this is r == 1
+        BUTTERFLY_1(x, y, q);
+    } else if (rp1 < q) { // remaining r with r + 1 < q: multiply by c
+        BUTTERFLY_5(c, x, y, q);
+    } else { // rp1 >= q: handled as the r == q - 1 case
+        BUTTERFLY_4(x, y, q);
+    }
+}
+
+template <typename T>
+inline VecType BUTTERFLY_GS_SIMPLE(T rp1, VecType c, VecType x, T q)
+{
+    if (rp1 == 2) {
+        return x;
+    } else if (rp1 < q) {
+        return MUL_MOD(c, x, q);
+    } else {
+        return NEG_MOD(x, q);
+    }
+}
+
+/**
+ * Vectorized butterfly CT step
+ *
+ * For each pair (P, Q) = (buf[i], buf[i + m]) for step = 2 * m and coef `r`
+ *      P = P + r * Q
+ *      Q = P - r * Q   (with the P read before the update above)
+ *
+ * @param buf - working buffers
+ * @param r - coefficient
+ * @param start - index of buffer among `m` ones
+ * @param m - current group size
+ * @param step - stride of the loop over the buffer index `i`
+ * @param len - number of vectors per buffer
+ * @param card - modulo cardinal
+ */
+template <typename T>
+inline void butterfly_ct_step(
+    vec::Buffers<T>& buf,
+    T r,
+    unsigned start,
+    unsigned m,
+    unsigned step,
+    size_t len,
+    T card)
+{
+    if (len == 0) {
+        return;
+    }
+    const T rp1 = r + 1; // BUTTERFLY_CT selects its variant from r + 1
+    VecType c = SET1(r);
+
+    const size_t end = (len > 1) ? len - 1 : 0; // bound for the 2-vector loop
+    const unsigned bufs_nb = buf.get_n();
+    // #pragma omp parallel for
+    // #pragma unroll
+    const std::vector<T*>& mem = buf.get_mem();
+    for (unsigned i = start; i < bufs_nb; i += step) {
+        VecType x1, y1;
+        VecType x2, y2;
+        VecType* __restrict p = reinterpret_cast<VecType*>(mem[i]);
+        VecType* __restrict q = reinterpret_cast<VecType*>(mem[i + m]);
+
+        // #pragma omp parallel for
+        size_t j = 0;
+        // #pragma unroll
+        for (; j < end; j += 2) {
+            x1 = LOAD(p + j);
+            y1 = LOAD(q + j);
+
+            BUTTERFLY_CT(rp1, c, &x1, &y1, card);
+
+            x2 = LOAD(p + j + 1);
+            y2 = LOAD(q + j + 1);
+
+            BUTTERFLY_CT(rp1, c, &x2, &y2, card);
+
+            // Store back to memory
+            STORE(p + j, x1);
+            STORE(p + j + 1, x2);
+            STORE(q + j, y1);
+            STORE(q + j + 1, y2);
+        }
+        for (; j < len; ++j) { // vector left over when `len` is odd
+            x1 = LOAD(p + j);
+            y1 = LOAD(q + j);
+
+            BUTTERFLY_CT(rp1, c, &x1, &y1, card);
+
+            // Store back to memory
+            STORE(p + j, x1);
+            STORE(q + j, y1);
+        }
+    }
+}
+
+/**
+ * Two fused butterfly CT layers on the quadruple of buffers
+ * (mem[start], mem[start + m], mem[start + 2 * m], mem[start + 3 * m]),
+ * each read as `len` vectors; r1 is used by layer 1, r2 and r3 by layer 2.
+ */
+template <typename T>
+inline static void do_butterfly_ct_2_layers(
+    const std::vector<T*>& mem,
+    T r1,
+    T r2,
+    T r3,
+    unsigned start,
+    unsigned m,
+    size_t len,
+    T card)
+{
+    const T r1p1 = r1 + 1;
+    const T r2p1 = r2 + 1;
+    const T r3p1 = r3 + 1;
+
+    VecType c1 = SET1(r1);
+    VecType c2 = SET1(r2);
+    VecType c3 = SET1(r3);
+
+    VecType* __restrict p = reinterpret_cast<VecType*>(mem[start]);
+    VecType* __restrict q = reinterpret_cast<VecType*>(mem[start + m]);
+    VecType* __restrict r = reinterpret_cast<VecType*>(mem[start + 2 * m]);
+    VecType* __restrict s = reinterpret_cast<VecType*>(mem[start + 3 * m]);
+
+    // p, q, r and s are cursors that always point at vector `j`
+    size_t j = 0;
+    const size_t end = (len > 1) ? len - 1 : 0;
+    for (; j < end; j += 2, p += 2, q += 2, r += 2, s += 2) {
+        // First layer (c1, x, y) & (c1, u, v)
+        VecType x1 = LOAD(p);
+        VecType x2 = LOAD(p + 1);
+        VecType y1 = LOAD(q);
+        VecType y2 = LOAD(q + 1);
+
+        BUTTERFLY_CT(r1p1, c1, &x1, &y1, card);
+        BUTTERFLY_CT(r1p1, c1, &x2, &y2, card);
+
+        VecType u1 = LOAD(r);
+        VecType u2 = LOAD(r + 1);
+        VecType v1 = LOAD(s);
+        VecType v2 = LOAD(s + 1);
+
+        BUTTERFLY_CT(r1p1, c1, &u1, &v1, card);
+        BUTTERFLY_CT(r1p1, c1, &u2, &v2, card);
+
+        // Second layer (c2, x, u) & (c3, y, v)
+        BUTTERFLY_CT(r2p1, c2, &x1, &u1, card);
+        BUTTERFLY_CT(r2p1, c2, &x2, &u2, card);
+
+        BUTTERFLY_CT(r3p1, c3, &y1, &v1, card);
+        BUTTERFLY_CT(r3p1, c3, &y2, &v2, card);
+
+        // Store back to memory
+        STORE(p, x1);
+        STORE(p + 1, x2);
+        STORE(q, y1);
+        STORE(q + 1, y2);
+
+        STORE(r, u1);
+        STORE(r + 1, u2);
+        STORE(s, v1);
+        STORE(s + 1, v2);
+    }
+
+    // Remaining vector when `len` is odd. The cursors were already advanced
+    // by `j` above, so they are used as is: indexing them with `+ j` again
+    // would address vector 2 * j, outside the `len` vectors of each buffer.
+    for (; j < len; ++j, ++p, ++q, ++r, ++s) {
+        VecType x1 = LOAD(p);
+        VecType y1 = LOAD(q);
+        VecType u1 = LOAD(r);
+        VecType v1 = LOAD(s);
+
+        BUTTERFLY_CT(r1p1, c1, &x1, &y1, card);
+        BUTTERFLY_CT(r1p1, c1, &u1, &v1, card);
+        BUTTERFLY_CT(r2p1, c2, &x1, &u1, card);
+        BUTTERFLY_CT(r3p1, c3, &y1, &v1, card);
+
+        // Store back to memory
+        STORE(p, x1);
+        STORE(q, y1);
+        STORE(r, u1);
+        STORE(s, v1);
+    }
+}
+
+/**
+ * Vectorized butterfly CT on two-layers at a time
+ *
+ * For each quadruple
+ * (P, Q, R, S) = (buf[i], buf[i + m], buf[i + 2 * m], buf[i + 3 * m])
+ * First layer: butterfly on (P, Q) and (R, S) for step = 2 * m
+ *      coef r1 = W[start * n / (2 * m)]
+ *      P = P + r1 * Q
+ *      Q = P - r1 * Q
+ *      R = R + r1 * S
+ *      S = R - r1 * S
+ * Second layer: butterfly on (P, R) and (Q, S) for step = 4 * m
+ *      coef r2 = W[start * n / (4 * m)]
+ *      coef r3 = W[(start + m) * n / (4 * m)]
+ *      P = P + r2 * R
+ *      R = P - r2 * R
+ *      Q = Q + r3 * S
+ *      S = Q - r3 * S
+ *
+ * @param buf - working buffers
+ * @param r1 - coefficient for the 1st layer
+ * @param r2 - 1st coefficient for the 2nd layer
+ * @param r3 - 2nd coefficient for the 2nd layer
+ * @param start - index of buffer among `m` ones
+ * @param m - current group size
+ * @param len - number of vectors per buffer
+ * @param card - modulo cardinal
+ */
+template <typename T>
+inline void butterfly_ct_two_layers_step(
+    vec::Buffers<T>& buf,
+    T r1,
+    T r2,
+    T r3,
+    unsigned start,
+    unsigned m,
+    size_t len,
+    T card)
+{
+    if (len == 0) {
+        return;
+    }
+    const unsigned step = m << 2; // 4 * m: distance between two quadruples
+    const unsigned bufs_nb = buf.get_n();
+
+    // #pragma omp parallel for
+    // #pragma unroll
+    const std::vector<T*>& mem = buf.get_mem();
+    for (unsigned i = start; i < bufs_nb; i += step) {
+        do_butterfly_ct_2_layers(mem, r1, r2, r3, i, m, len, card);
+    }
+}
+
+/**
+ * Vectorized butterfly GS step
+ *
+ * For each pair (P, Q) = (buf[i], buf[i + m]) for step = 2 * m and coef `r`
+ *      P = P + Q
+ *      Q = r * (P - Q)   (with the P read before the update above)
+ *
+ * @param buf - working buffers
+ * @param r - coefficient
+ * @param start - index of buffer among `m` ones
+ * @param m - current group size
+ * @param len - number of vectors per buffer
+ * @param card - modulo cardinal
+ */
+template <typename T>
+inline void butterfly_gs_step(
+    vec::Buffers<T>& buf,
+    T r,
+    unsigned start,
+    unsigned m,
+    size_t len,
+    T card)
+{
+    if (len == 0) {
+        return;
+    }
+    const unsigned step = m << 1; // 2 * m: distance between two pairs
+    const T rp1 = r + 1; // BUTTERFLY_GS selects its variant from r + 1
+    VecType c = SET1(r);
+
+    const size_t end = (len > 3) ? len - 3 : 0; // bound for the 4-vector loop
+    const unsigned bufs_nb = buf.get_n();
+    // #pragma omp parallel for
+    // #pragma unroll
+    const std::vector<T*>& mem = buf.get_mem();
+    for (unsigned i = start; i < bufs_nb; i += step) {
+        VecType x1, x2, x3, x4;
+        VecType y1, y2, y3, y4;
+        VecType* __restrict p = reinterpret_cast<VecType*>(mem[i]);
+        VecType* __restrict q = reinterpret_cast<VecType*>(mem[i + m]);
+
+        // #pragma omp parallel for
+        size_t j = 0;
+        // #pragma unroll
+        for (; j < end; j += 4) {
+            x1 = LOAD(p + j);
+            x2 = LOAD(p + j + 1);
+            x3 = LOAD(p + j + 2);
+            x4 = LOAD(p + j + 3);
+            y1 = LOAD(q + j);
+            y2 = LOAD(q + j + 1);
+            y3 = LOAD(q + j + 2);
+            y4 = LOAD(q + j + 3);
+
+            BUTTERFLY_GS(rp1, c, &x1, &y1, card);
+            BUTTERFLY_GS(rp1, c, &x2, &y2, card);
+            BUTTERFLY_GS(rp1, c, &x3, &y3, card);
+            BUTTERFLY_GS(rp1, c, &x4, &y4, card);
+
+            // Store back to memory
+            STORE(p + j, x1);
+            STORE(p + j + 1, x2);
+            STORE(p + j + 2, x3);
+            STORE(p + j + 3, x4);
+            STORE(q + j, y1);
+            STORE(q + j + 1, y2);
+            STORE(q + j + 2, y3);
+            STORE(q + j + 3, y4);
+        }
+        for (; j < len; ++j) { // up to three vectors left over
+            x1 = LOAD(p + j);
+            y1 = LOAD(q + j);
+
+            BUTTERFLY_GS(rp1, c, &x1, &y1, card);
+
+            // Store back to memory
+            STORE(p + j, x1);
+            STORE(q + j, y1);
+        }
+    }
+}
+
+/**
+ * Vectorized simplified butterfly GS step: P is only read, Q is overwritten
+ *
+ * For each pair (P, Q) = (buf[i], buf[i + m]) for step = 2 * m and coef `r`
+ *      Q = r * P
+ *
+ * @param buf - working buffers
+ * @param r - coefficient
+ * @param start - index of buffer among `m` ones
+ * @param m - current group size
+ * @param len - number of vectors per buffer
+ * @param card - modulo cardinal
+ */
+template <typename T>
+inline void butterfly_gs_step_simple(
+    vec::Buffers<T>& buf,
+    T r,
+    unsigned start,
+    unsigned m,
+    size_t len,
+    T card)
+{
+    if (len == 0) {
+        return;
+    }
+    const unsigned step = m << 1; // 2 * m: distance between two pairs
+    const T rp1 = r + 1; // BUTTERFLY_GS_SIMPLE selects its variant from r + 1
+    VecType c = SET1(r);
+
+    const size_t end = (len > 1) ? len - 1 : 0; // bound for the 2-vector loop
+    const unsigned bufs_nb = buf.get_n();
+    // #pragma omp parallel for
+    // #pragma unroll
+    const std::vector<T*>& mem = buf.get_mem();
+    for (unsigned i = start; i < bufs_nb; i += step) {
+        VecType x1, y1;
+        VecType x2, y2;
+        VecType* __restrict p = reinterpret_cast<VecType*>(mem[i]);
+        VecType* __restrict q = reinterpret_cast<VecType*>(mem[i + m]);
+
+        // #pragma omp parallel for
+        size_t j = 0;
+        // #pragma unroll
+        for (; j < end; j += 2) {
+            x1 = LOAD(p + j);
+            x2 = LOAD(p + j + 1);
+
+            y1 = BUTTERFLY_GS_SIMPLE(rp1, c, x1, card);
+            y2 = BUTTERFLY_GS_SIMPLE(rp1, c, x2, card);
+
+            // Store back to memory
+            STORE(q + j, y1);
+            STORE(q + j + 1, y2);
+        }
+        for (; j < len; ++j) { // vector left over when `len` is odd
+            x1 = LOAD(p + j);
+
+            y1 = BUTTERFLY_GS_SIMPLE(rp1, c, x1, card);
+
+            // Store back to memory
+            STORE(q + j, y1);
+        }
+    }
+}
+
+template <typename T>
+inline void encode_post_process(
+    vec::Buffers<T>& output,
+    std::vector<Properties>& props,
+    off_t offset,
+    unsigned code_len,
+    T threshold,
+    size_t vecs_nb)
+{
+    const unsigned element_size = sizeof(T);
+    const unsigned vec_size = countof<T>(); // scales a vector index below
+    const T max = 1 << (element_size * 8 - 1); // only the top bit of T set
+    const VecType _threshold = SET1(threshold);
+    const VecType mask_hi = SET1(max);
+
+    // scan the first `code_len` buffers, `vecs_nb` vectors each
+    const std::vector<T*>& mem = output.get_mem();
+    for (unsigned frag_id = 0; frag_id < code_len; ++frag_id) {
+        VecType* __restrict buf = reinterpret_cast<VecType*>(mem[frag_id]);
+
+        size_t vec_id = 0;
+        size_t end = (vecs_nb > 3) ? vecs_nb - 3 : 0; // 4-vector loop bound
+        // a zero TESTZ result is the cue to hand the vector to ADD_PROPS
+        for (; vec_id < end; vec_id += 4) {
+            VecType a1 = LOAD(buf + vec_id);
+            VecType a2 = LOAD(buf + vec_id + 1);
+            VecType a3 = LOAD(buf + vec_id + 2);
+            VecType a4 = LOAD(buf + vec_id + 3);
+
+            if (TESTZ(a1, _threshold) == 0) {
+                const off_t curr_offset = offset + vec_id * vec_size;
+                ADD_PROPS(
+                    props[frag_id], _threshold, mask_hi, a1, curr_offset, max);
+            }
+            if (TESTZ(a2, _threshold) == 0) {
+                const off_t curr_offset = offset + (vec_id + 1) * vec_size;
+                ADD_PROPS(
+                    props[frag_id], _threshold, mask_hi, a2, curr_offset, max);
+            }
+            if (TESTZ(a3, _threshold) == 0) {
+                const off_t curr_offset = offset + (vec_id + 2) * vec_size;
+                ADD_PROPS(
+                    props[frag_id], _threshold, mask_hi, a3, curr_offset, max);
+            }
+            if (TESTZ(a4, _threshold) == 0) {
+                const off_t curr_offset = offset + (vec_id + 3) * vec_size;
+                ADD_PROPS(
+                    props[frag_id], _threshold, mask_hi, a4, curr_offset, max);
+            }
+        }
+        for (; vec_id < vecs_nb; ++vec_id) { // up to three vectors left over
+            VecType a = LOAD(buf + vec_id);
+            uint32_t c = TESTZ(a, _threshold);
+            if (c == 0) {
+                const off_t curr_offset = offset + vec_id * vec_size;
+                ADD_PROPS(
+                    props[frag_id], _threshold, mask_hi, a, curr_offset, max);
+            }
+        }
+    }
+}
+
+} // namespace simd
+} // namespace quadiron
+
+#endif

From c100aabe74a24b979ece784f55ebb890e4052f2c Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Fri, 26 Oct 2018 15:25:55 +0200
Subject: [PATCH 15/77] SIMD 256: update

---
 src/simd_256.h | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/src/simd_256.h b/src/simd_256.h
index d06f3218..e18faa44 100644
--- a/src/simd_256.h
+++ b/src/simd_256.h
@@ -53,6 +53,8 @@ typedef __m128i HalfVecType;
 typedef __uint128_t NF4Type;
 typedef uint32_t MaskIntType;
 
+/* ============= Constant variable  ============ */
+
 #define F4_u32 _mm256_set1_epi32(65537)
 #define F4m1_u32 _mm256_set1_epi32(65536)
 #define F3_u32 _mm256_set1_epi32(257)
@@ -61,15 +63,24 @@ typedef uint32_t MaskIntType;
 #define F3_u16 _mm256_set1_epi16(257)
 #define F3m1_u16 _mm256_set1_epi16(256)
 
-#define CARD(q) (EITHER(q == F3, F3_u32, F4_u32))
-#define CARD_M_1(q) (EITHER(q == F3, F3m1_u32, F4m1_u32))
-
-/* ============= Essential Operations for AVX2 w/ both u16 & u32 ============ */
-
 #define ZERO (_mm256_setzero_si256())
 #define ONE16 (_mm256_set1_epi16(1))
 #define ONE32 (_mm256_set1_epi32(1))
 
+/* ============= Essential Operations for AVX2 w/ both u16 & u32 ============ */
+
+template <typename T>
+inline VecType CARD(T q)
+{
+    return (q == F3) ? F3_u32 : F4_u32;
+}
+
+template <typename T>
+inline VecType CARD_M_1(T q)
+{
+    return (q == F3) ? F3m1_u32 : F4m1_u32;
+}
+
 inline VecType LOAD(VecType* address)
 {
     return _mm256_load_si256(address);

From d00d648b66ece56784f4efb8d1efe12cda5e8a1c Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Fri, 26 Oct 2018 15:41:14 +0200
Subject: [PATCH 16/77] SIMD: use auto for return type of MVMSK8

---
 src/simd_128.h   | 1 -
 src/simd_256.h   | 1 -
 src/simd_basic.h | 4 ++--
 3 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/simd_128.h b/src/simd_128.h
index afbfe70f..21a2b82d 100644
--- a/src/simd_128.h
+++ b/src/simd_128.h
@@ -37,7 +37,6 @@ namespace quadiron {
 namespace simd {
 
 typedef __m128i VecType;
-typedef uint32_t MaskIntType;
 
 #define F4_u32 _mm_set1_epi32(65537)
 #define F4m1_u32 _mm_set1_epi32(65536)
diff --git a/src/simd_256.h b/src/simd_256.h
index e18faa44..c2099ee2 100644
--- a/src/simd_256.h
+++ b/src/simd_256.h
@@ -51,7 +51,6 @@ namespace simd {
 typedef __m256i VecType;
 typedef __m128i HalfVecType;
 typedef __uint128_t NF4Type;
-typedef uint32_t MaskIntType;
 
 /* ============= Constant variable  ============ */
 
diff --git a/src/simd_basic.h b/src/simd_basic.h
index 7585734d..0f142ea4 100644
--- a/src/simd_basic.h
+++ b/src/simd_basic.h
@@ -145,7 +145,7 @@ inline void ADD_PROPS(
 {
     const VecType b = CMPEQ32(threshold, symb);
     const VecType c = AND(mask, b);
-    MaskIntType d = MVMSK8(c);
+    auto d = MVMSK8(c);
     const unsigned element_size = sizeof(uint32_t);
     while (d > 0) {
         unsigned byte_idx = __builtin_ctz(d);
@@ -260,7 +260,7 @@ inline void ADD_PROPS(
 {
     const VecType b = CMPEQ16(threshold, symb);
     const VecType c = AND(mask, b);
-    MaskIntType d = MVMSK8(c);
+    auto d = MVMSK8(c);
     const unsigned element_size = sizeof(uint16_t);
     while (d > 0) {
         unsigned byte_idx = __builtin_ctz(d);

From fd2197b29ca1e3a4b3e75751d2db9dff2e15cce0 Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Fri, 26 Oct 2018 15:52:30 +0200
Subject: [PATCH 17/77] SIMD: move CARD & CARD_M_1 to simd_basic.h

---
 src/simd.h       |  2 --
 src/simd_128.h   | 14 +++++++-------
 src/simd_256.h   | 12 ------------
 src/simd_basic.h | 12 ++++++++++++
 4 files changed, 19 insertions(+), 21 deletions(-)

diff --git a/src/simd.h b/src/simd.h
index d70fcb2f..41e4935e 100644
--- a/src/simd.h
+++ b/src/simd.h
@@ -47,8 +47,6 @@ namespace quadiron {
  */
 namespace simd {
 
-#define EITHER(x, a, b) (((x)) ? (a) : (b))
-
 // Vectorized operations are implemented in appropriated headers simd*.h
 
 } // namespace simd
diff --git a/src/simd_128.h b/src/simd_128.h
index 21a2b82d..9ad4051c 100644
--- a/src/simd_128.h
+++ b/src/simd_128.h
@@ -38,6 +38,8 @@ namespace simd {
 
 typedef __m128i VecType;
 
+/* ============= Constant variable  ============ */
+
 #define F4_u32 _mm_set1_epi32(65537)
 #define F4m1_u32 _mm_set1_epi32(65536)
 #define F3_u32 _mm_set1_epi32(257)
@@ -46,15 +48,12 @@ typedef __m128i VecType;
 #define F3_u16 _mm_set1_epi16(257)
 #define F3m1_u16 _mm_set1_epi16(256)
 
-#define CARD(q) (EITHER(q == F3, F3_u32, F4_u32))
-#define CARD_M_1(q) (EITHER(q == F3, F3m1_u32, F4m1_u32))
-
-/* ============= Essential Operations for AVX2 w/ both u16 & u32 ============ */
-
 #define ZERO (_mm_setzero_si128())
 #define ONE16 (_mm_set1_epi16(1))
 #define ONE32 (_mm_set1_epi32(1))
 
+/* ============= Essential Operations for SSE w/ both u16 & u32 ============ */
+
 inline VecType LOAD(VecType* address)
 {
     return _mm_load_si128(address);
@@ -89,7 +88,7 @@ inline uint16_t TESTZ(VecType x, VecType y)
     return _mm_testz_si128(x, y);
 }
 
-/* ================= Essential Operations for AVX2 w/ u32 ================= */
+/* ================= Essential Operations for SSE w/ u32 ================= */
 
 inline VecType SET1(uint32_t val)
 {
@@ -120,11 +119,12 @@ inline VecType MINU32(VecType x, VecType y)
 {
     return _mm_min_epu32(x, y);
 }
+
 #define MASK8_LO (_mm_set1_epi16(0x80))
 #define BLEND8(x, y, mask) (_mm_blendv_epi8(x, y, mask))
 #define BLEND16(x, y, imm8) (_mm_blend_epi16(x, y, imm8))
 
-/* ================= Essential Operations for AVX2 w/ u16 ================= */
+/* ================= Essential Operations for SSE w/ u16 ================= */
 
 inline VecType SET1(uint16_t val)
 {
diff --git a/src/simd_256.h b/src/simd_256.h
index c2099ee2..09253a16 100644
--- a/src/simd_256.h
+++ b/src/simd_256.h
@@ -68,18 +68,6 @@ typedef __uint128_t NF4Type;
 
 /* ============= Essential Operations for AVX2 w/ both u16 & u32 ============ */
 
-template <typename T>
-inline VecType CARD(T q)
-{
-    return (q == F3) ? F3_u32 : F4_u32;
-}
-
-template <typename T>
-inline VecType CARD_M_1(T q)
-{
-    return (q == F3) ? F3m1_u32 : F4m1_u32;
-}
-
 inline VecType LOAD(VecType* address)
 {
     return _mm256_load_si256(address);
diff --git a/src/simd_basic.h b/src/simd_basic.h
index 0f142ea4..f014d6d7 100644
--- a/src/simd_basic.h
+++ b/src/simd_basic.h
@@ -36,6 +36,18 @@
 namespace quadiron {
 namespace simd {
 
+template <typename T>
+inline VecType CARD(T q)
+{
+    return (q == F3) ? F3_u32 : F4_u32;
+}
+
+template <typename T>
+inline VecType CARD_M_1(T q)
+{
+    return (q == F3) ? F3m1_u32 : F4m1_u32;
+}
+
 /* ================= Basic Operations for u32 ================= */
 
 /**

From 9e11defe181fe639add590caf848683fa96ef122 Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Fri, 26 Oct 2018 15:55:21 +0200
Subject: [PATCH 18/77] SIMD 128 & 256: move MASK8_LO to const variable groups

---
 src/simd_128.h | 3 ++-
 src/simd_256.h | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/simd_128.h b/src/simd_128.h
index 9ad4051c..5dd4c332 100644
--- a/src/simd_128.h
+++ b/src/simd_128.h
@@ -52,6 +52,8 @@ typedef __m128i VecType;
 #define ONE16 (_mm_set1_epi16(1))
 #define ONE32 (_mm_set1_epi32(1))
 
+#define MASK8_LO (_mm_set1_epi16(0x80))
+
 /* ============= Essential Operations for SSE w/ both u16 & u32 ============ */
 
 inline VecType LOAD(VecType* address)
@@ -120,7 +122,6 @@ inline VecType MINU32(VecType x, VecType y)
     return _mm_min_epu32(x, y);
 }
 
-#define MASK8_LO (_mm_set1_epi16(0x80))
 #define BLEND8(x, y, mask) (_mm_blendv_epi8(x, y, mask))
 #define BLEND16(x, y, imm8) (_mm_blend_epi16(x, y, imm8))
 
diff --git a/src/simd_256.h b/src/simd_256.h
index 09253a16..c82f810a 100644
--- a/src/simd_256.h
+++ b/src/simd_256.h
@@ -66,6 +66,8 @@ typedef __uint128_t NF4Type;
 #define ONE16 (_mm256_set1_epi16(1))
 #define ONE32 (_mm256_set1_epi32(1))
 
+#define MASK8_LO (_mm256_set1_epi16(0x80))
+
 /* ============= Essential Operations for AVX2 w/ both u16 & u32 ============ */
 
 inline VecType LOAD(VecType* address)
@@ -134,7 +136,6 @@ inline VecType MINU32(VecType x, VecType y)
     return _mm256_min_epu32(x, y);
 }
 
-#define MASK8_LO (_mm256_set1_epi16(0x80))
 #define BLEND8(x, y, mask) (_mm256_blendv_epi8(x, y, mask))
 #define BLEND16(x, y, imm8) (_mm256_blend_epi16(x, y, imm8))
 

From a23a0abe891ac2974f9b81031056aa552be6c7a8 Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Fri, 26 Oct 2018 16:01:00 +0200
Subject: [PATCH 19/77] SIMD: use macro for shiftr

---
 src/simd_128.h   | 10 ++--------
 src/simd_256.h   | 10 ++--------
 src/simd_basic.h | 12 ++++++------
 3 files changed, 10 insertions(+), 22 deletions(-)

diff --git a/src/simd_128.h b/src/simd_128.h
index 5dd4c332..6ce80d5d 100644
--- a/src/simd_128.h
+++ b/src/simd_128.h
@@ -73,14 +73,6 @@ inline VecType XOR(VecType x, VecType y)
 {
     return _mm_xor_si128(x, y);
 }
-inline VecType SHIFTR_1(VecType x)
-{
-    return _mm_srli_si128(x, 1);
-}
-inline VecType SHIFTR_2(VecType x)
-{
-    return _mm_srli_si128(x, 2);
-}
 inline uint16_t MVMSK8(VecType x)
 {
     return _mm_movemask_epi8(x);
@@ -90,6 +82,8 @@ inline uint16_t TESTZ(VecType x, VecType y)
     return _mm_testz_si128(x, y);
 }
 
+#define SHIFTR(x, imm8) (_mm_srli_si128(x, imm8))
+
 /* ================= Essential Operations for SSE w/ u32 ================= */
 
 inline VecType SET1(uint32_t val)
diff --git a/src/simd_256.h b/src/simd_256.h
index c82f810a..27efd934 100644
--- a/src/simd_256.h
+++ b/src/simd_256.h
@@ -87,14 +87,6 @@ inline VecType XOR(VecType x, VecType y)
 {
     return _mm256_xor_si256(x, y);
 }
-inline VecType SHIFTR_1(VecType x)
-{
-    return _mm256_srli_si256(x, 1);
-}
-inline VecType SHIFTR_2(VecType x)
-{
-    return _mm256_srli_si256(x, 2);
-}
 inline uint32_t MVMSK8(VecType x)
 {
     return _mm256_movemask_epi8(x);
@@ -104,6 +96,8 @@ inline uint32_t TESTZ(VecType x, VecType y)
     return _mm256_testz_si256(x, y);
 }
 
+#define SHIFTR(x, imm8) (_mm256_srli_si256(x, imm8))
+
 /* ================= Essential Operations for AVX2 w/ u32 ================= */
 
 inline VecType SET1(uint32_t val)
diff --git a/src/simd_basic.h b/src/simd_basic.h
index f014d6d7..262add1c 100644
--- a/src/simd_basic.h
+++ b/src/simd_basic.h
@@ -107,8 +107,8 @@ inline VecType MUL_MOD(VecType x, VecType y, uint32_t q)
     VecType res = MUL32(x, y);
     VecType lo =
         (q == F3) ? BLEND8(ZERO, res, MASK8_LO) : BLEND16(ZERO, res, 0x55);
-    VecType hi = (q == F3) ? BLEND8(ZERO, SHIFTR_1(res), MASK8_LO)
-                           : BLEND16(ZERO, SHIFTR_2(res), 0x55);
+    VecType hi = (q == F3) ? BLEND8(ZERO, SHIFTR(res, 1), MASK8_LO)
+                           : BLEND16(ZERO, SHIFTR(res, 2), 0x55);
     return SUB_MOD(lo, hi, q);
 }
 
@@ -132,8 +132,8 @@ inline VecType MULFULL_MOD(VecType x, VecType y, uint32_t q)
 
     VecType lo =
         (q == F3) ? BLEND8(ZERO, res, MASK8_LO) : BLEND16(ZERO, res, 0x55);
-    VecType hi = (q == F3) ? BLEND8(ZERO, SHIFTR_1(res), MASK8_LO)
-                           : BLEND16(ZERO, SHIFTR_2(res), 0x55);
+    VecType hi = (q == F3) ? BLEND8(ZERO, SHIFTR(res, 1), MASK8_LO)
+                           : BLEND16(ZERO, SHIFTR(res, 2), 0x55);
     return SUB_MOD(lo, hi, q);
 }
 
@@ -225,7 +225,7 @@ inline VecType MUL_MOD(VecType x, VecType y, uint16_t q)
 {
     VecType res = MUL16(x, y);
     VecType lo = BLEND8(ZERO, res, MASK8_LO);
-    VecType hi = BLEND8(ZERO, SHIFTR_1(res), MASK8_LO);
+    VecType hi = BLEND8(ZERO, SHIFTR(res, 1), MASK8_LO);
     return SUB_MOD(lo, hi, q);
 }
 
@@ -248,7 +248,7 @@ inline VecType MULFULL_MOD(VecType x, VecType y, uint16_t q)
     res = ADD16(res, AND(ONE16, cmp));
 
     VecType lo = BLEND8(ZERO, res, MASK8_LO);
-    VecType hi = BLEND8(ZERO, SHIFTR_1(res), MASK8_LO);
+    VecType hi = BLEND8(ZERO, SHIFTR(res, 1), MASK8_LO);
     return SUB_MOD(lo, hi, q);
 }
 

From 373afd18ccf416ecf2a7ed556ff294b01085cde1 Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Fri, 26 Oct 2018 17:13:36 +0200
Subject: [PATCH 20/77] SIMD 128: use template functions

---
 src/simd_128.h | 86 ++++++++++++++++++++++++++++++--------------------
 1 file changed, 51 insertions(+), 35 deletions(-)

diff --git a/src/simd_128.h b/src/simd_128.h
index 6ce80d5d..2361b6f3 100644
--- a/src/simd_128.h
+++ b/src/simd_128.h
@@ -83,70 +83,86 @@ inline uint16_t TESTZ(VecType x, VecType y)
 }
 
 #define SHIFTR(x, imm8) (_mm_srli_si128(x, imm8))
+#define BLEND8(x, y, mask) (_mm_blendv_epi8(x, y, mask))
+#define BLEND16(x, y, imm8) (_mm_blend_epi16(x, y, imm8))
 
-/* ================= Essential Operations for SSE w/ u32 ================= */
+/* ================= Essential Operations for SSE ================= */
 
+template <typename T>
+inline VecType SET1(T val);
+template <>
 inline VecType SET1(uint32_t val)
 {
     return _mm_set1_epi32(val);
 }
-inline VecType ADD32(VecType x, VecType y)
-{
-    return _mm_add_epi32(x, y);
-}
-inline VecType SUB32(VecType x, VecType y)
-{
-    return _mm_sub_epi32(x, y);
-}
-inline VecType MUL32(VecType x, VecType y)
+template <>
+inline VecType SET1(uint16_t val)
 {
-    return _mm_mullo_epi32(x, y);
+    return _mm_set1_epi16(val);
 }
 
-inline VecType CMPEQ32(VecType x, VecType y)
-{
-    return _mm_cmpeq_epi32(x, y);
-}
-inline VecType CMPGT32(VecType x, VecType y)
+template <typename T>
+inline VecType ADD(VecType x, VecType y);
+template <>
+inline VecType ADD<uint32_t>(VecType x, VecType y)
 {
-    return _mm_cmpgt_epi32(x, y);
+    return _mm_add_epi32(x, y);
 }
-inline VecType MINU32(VecType x, VecType y)
+template <>
+inline VecType ADD<uint16_t>(VecType x, VecType y)
 {
-    return _mm_min_epu32(x, y);
+    return _mm_add_epi16(x, y);
 }
 
-#define BLEND8(x, y, mask) (_mm_blendv_epi8(x, y, mask))
-#define BLEND16(x, y, imm8) (_mm_blend_epi16(x, y, imm8))
-
-/* ================= Essential Operations for SSE w/ u16 ================= */
-
-inline VecType SET1(uint16_t val)
+template <typename T>
+inline VecType SUB(VecType x, VecType y);
+template <>
+inline VecType SUB<uint32_t>(VecType x, VecType y)
 {
-    return _mm_set1_epi16(val);
+    return _mm_sub_epi32(x, y);
 }
-inline VecType ADD16(VecType x, VecType y)
+template <>
+inline VecType SUB<uint16_t>(VecType x, VecType y)
 {
-    return _mm_add_epi16(x, y);
+    return _mm_sub_epi16(x, y);
 }
-inline VecType SUB16(VecType x, VecType y)
+
+template <typename T>
+inline VecType MUL(VecType x, VecType y);
+template <>
+inline VecType MUL<uint32_t>(VecType x, VecType y)
 {
-    return _mm_sub_epi16(x, y);
+    return _mm_mullo_epi32(x, y);
 }
-inline VecType MUL16(VecType x, VecType y)
+template <>
+inline VecType MUL<uint16_t>(VecType x, VecType y)
 {
     return _mm_mullo_epi16(x, y);
 }
 
-inline VecType CMPEQ16(VecType x, VecType y)
+template <typename T>
+inline VecType CMPEQ(VecType x, VecType y);
+template <>
+inline VecType CMPEQ<uint32_t>(VecType x, VecType y)
+{
+    return _mm_cmpeq_epi32(x, y);
+}
+template <>
+inline VecType CMPEQ<uint16_t>(VecType x, VecType y)
 {
     return _mm_cmpeq_epi16(x, y);
 }
-inline VecType CMPGT16(VecType x, VecType y)
+
+
+template <typename T>
+inline VecType MIN(VecType x, VecType y);
+template <>
+inline VecType MIN<uint32_t>(VecType x, VecType y)
 {
-    return _mm_cmpgt_epi16(x, y);
+    return _mm_min_epu32(x, y);
 }
-inline VecType MINU16(VecType x, VecType y)
+template <>
+inline VecType MIN<uint16_t>(VecType x, VecType y)
 {
     return _mm_min_epu16(x, y);
 }

From ec0991a5f0cb49ee041a203d4311bfb7e03cd88e Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Fri, 26 Oct 2018 17:13:48 +0200
Subject: [PATCH 21/77] SIMD 256: use template functions

---
 src/simd_256.h | 85 +++++++++++++++++++++++++++++---------------------
 1 file changed, 50 insertions(+), 35 deletions(-)

diff --git a/src/simd_256.h b/src/simd_256.h
index 27efd934..b3ab8e78 100644
--- a/src/simd_256.h
+++ b/src/simd_256.h
@@ -97,70 +97,85 @@ inline uint32_t TESTZ(VecType x, VecType y)
 }
 
 #define SHIFTR(x, imm8) (_mm256_srli_si256(x, imm8))
+#define BLEND8(x, y, mask) (_mm256_blendv_epi8(x, y, mask))
+#define BLEND16(x, y, imm8) (_mm256_blend_epi16(x, y, imm8))
 
-/* ================= Essential Operations for AVX2 w/ u32 ================= */
+/* ================= Essential Operations for AVX2 ================= */
 
+template <typename T>
+inline VecType SET1(T val);
+template <>
 inline VecType SET1(uint32_t val)
 {
     return _mm256_set1_epi32(val);
 }
-inline VecType ADD32(VecType x, VecType y)
-{
-    return _mm256_add_epi32(x, y);
-}
-inline VecType SUB32(VecType x, VecType y)
-{
-    return _mm256_sub_epi32(x, y);
-}
-inline VecType MUL32(VecType x, VecType y)
+template <>
+inline VecType SET1(uint16_t val)
 {
-    return _mm256_mullo_epi32(x, y);
+    return _mm256_set1_epi16(val);
 }
 
-inline VecType CMPEQ32(VecType x, VecType y)
-{
-    return _mm256_cmpeq_epi32(x, y);
-}
-inline VecType CMPGT32(VecType x, VecType y)
+template <typename T>
+inline VecType ADD(VecType x, VecType y);
+template <>
+inline VecType ADD<uint32_t>(VecType x, VecType y)
 {
-    return _mm256_cmpgt_epi32(x, y);
+    return _mm256_add_epi32(x, y);
 }
-inline VecType MINU32(VecType x, VecType y)
+template <>
+inline VecType ADD<uint16_t>(VecType x, VecType y)
 {
-    return _mm256_min_epu32(x, y);
+    return _mm256_add_epi16(x, y);
 }
 
-#define BLEND8(x, y, mask) (_mm256_blendv_epi8(x, y, mask))
-#define BLEND16(x, y, imm8) (_mm256_blend_epi16(x, y, imm8))
-
-/* ================= Essential Operations for AVX2 w/ u16 ================= */
-
-inline VecType SET1(uint16_t val)
+template <typename T>
+inline VecType SUB(VecType x, VecType y);
+template <>
+inline VecType SUB<uint32_t>(VecType x, VecType y)
 {
-    return _mm256_set1_epi16(val);
+    return _mm256_sub_epi32(x, y);
 }
-inline VecType ADD16(VecType x, VecType y)
+template <>
+inline VecType SUB<uint16_t>(VecType x, VecType y)
 {
-    return _mm256_add_epi16(x, y);
+    return _mm256_sub_epi16(x, y);
 }
-inline VecType SUB16(VecType x, VecType y)
+
+template <typename T>
+inline VecType MUL(VecType x, VecType y);
+template <>
+inline VecType MUL<uint32_t>(VecType x, VecType y)
 {
-    return _mm256_sub_epi16(x, y);
+    return _mm256_mullo_epi32(x, y);
 }
-inline VecType MUL16(VecType x, VecType y)
+template <>
+inline VecType MUL<uint16_t>(VecType x, VecType y)
 {
     return _mm256_mullo_epi16(x, y);
 }
 
-inline VecType CMPEQ16(VecType x, VecType y)
+template <typename T>
+inline VecType CMPEQ(VecType x, VecType y);
+template <>
+inline VecType CMPEQ<uint32_t>(VecType x, VecType y)
+{
+    return _mm256_cmpeq_epi32(x, y);
+}
+template <>
+inline VecType CMPEQ<uint16_t>(VecType x, VecType y)
 {
     return _mm256_cmpeq_epi16(x, y);
 }
-inline VecType CMPGT16(VecType x, VecType y)
+
+template <typename T>
+inline VecType MIN(VecType x, VecType y);
+template <>
+inline VecType MIN<uint32_t>(VecType x, VecType y)
 {
-    return _mm256_cmpgt_epi16(x, y);
+    return _mm256_min_epu32(x, y);
 }
-inline VecType MINU16(VecType x, VecType y)
+template <>
+inline VecType MIN<uint16_t>(VecType x, VecType y)
 {
     return _mm256_min_epu16(x, y);
 }

From fde40cc98f86adaed56d082630e0d33fb207c8b0 Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Fri, 26 Oct 2018 17:20:54 +0200
Subject: [PATCH 22/77] SIMD Basic: use templated essential functions

---
 src/simd_basic.h | 161 ++++++++---------------------------------------
 1 file changed, 26 insertions(+), 135 deletions(-)

diff --git a/src/simd_basic.h b/src/simd_basic.h
index 262add1c..04f0dee3 100644
--- a/src/simd_basic.h
+++ b/src/simd_basic.h
@@ -48,20 +48,21 @@ inline VecType CARD_M_1(T q)
     return (q == F3) ? F3m1_u32 : F4m1_u32;
 }
 
-/* ================= Basic Operations for u32 ================= */
+/* ================= Basic Operations ================= */
 
 /**
- * Modular addition for packed unsigned 32-bit integers
+ * Modular addition
  *
  * @param x input register
  * @param y input register
  * @param q modulo
  * @return (x + y) mod q
  */
-inline VecType ADD_MOD(VecType x, VecType y, uint32_t q)
+template <typename T>
+inline VecType ADD_MOD(VecType x, VecType y, T q)
 {
-    VecType res = ADD32(x, y);
-    return MINU32(res, SUB32(res, CARD(q)));
+    VecType res = ADD<T>(x, y);
+    return MIN<T>(res, SUB<T>(res, CARD(q)));
 }
 
 /**
@@ -72,10 +73,11 @@ inline VecType ADD_MOD(VecType x, VecType y, uint32_t q)
  * @param q modulo
  * @return (x - y) mod q
  */
-inline VecType SUB_MOD(VecType x, VecType y, uint32_t q)
+template <typename T>
+inline VecType SUB_MOD(VecType x, VecType y, T q)
 {
-    VecType res = SUB32(x, y);
-    return MINU32(res, ADD32(res, CARD(q)));
+    VecType res = SUB<T>(x, y);
+    return MIN<T>(res, ADD<T>(res, CARD(q)));
 }
 
 /**
@@ -85,10 +87,11 @@ inline VecType SUB_MOD(VecType x, VecType y, uint32_t q)
  * @param q modulo
  * @return (-x) mod q
  */
-inline VecType NEG_MOD(VecType x, uint32_t q)
+template <typename T>
+inline VecType NEG_MOD(VecType x, T q)
 {
-    VecType res = SUB32(CARD(q), x);
-    return MINU32(res, SUB32(res, CARD(q)));
+    VecType res = SUB<T>(CARD(q), x);
+    return MIN<T>(res, SUB<T>(res, CARD(q)));
 }
 
 /**
@@ -102,9 +105,10 @@ inline VecType NEG_MOD(VecType x, uint32_t q)
  * @param q modulo
  * @return (x * y) mod q
  */
-inline VecType MUL_MOD(VecType x, VecType y, uint32_t q)
+template <typename T>
+inline VecType MUL_MOD(VecType x, VecType y, T q)
 {
-    VecType res = MUL32(x, y);
+    VecType res = MUL<T>(x, y);
     VecType lo =
         (q == F3) ? BLEND8(ZERO, res, MASK8_LO) : BLEND16(ZERO, res, 0x55);
     VecType hi = (q == F3) ? BLEND8(ZERO, SHIFTR(res, 1), MASK8_LO)
@@ -122,13 +126,14 @@ inline VecType MUL_MOD(VecType x, VecType y, uint32_t q)
  * @param q modulo
  * @return (x * y) mod q
  */
-inline VecType MULFULL_MOD(VecType x, VecType y, uint32_t q)
+template <typename T>
+inline VecType MULFULL_MOD(VecType x, VecType y, T q)
 {
-    VecType res = MUL32(x, y);
+    VecType res = MUL<T>(x, y);
 
     // filter elements of both of a & b = card-1
-    VecType cmp = AND(CMPEQ32(x, CARD_M_1(q)), CMPEQ32(y, CARD_M_1(q)));
-    res = (q == F3) ? XOR(res, AND(F4_u32, cmp)) : ADD32(res, AND(ONE32, cmp));
+    VecType cmp = AND(CMPEQ<T>(x, CARD_M_1(q)), CMPEQ<T>(y, CARD_M_1(q)));
+    res = (q == F3) ? XOR(res, AND(F4_u32, cmp)) : ADD<T>(res, AND(ONE32, cmp));
 
     VecType lo =
         (q == F3) ? BLEND8(ZERO, res, MASK8_LO) : BLEND16(ZERO, res, 0x55);
@@ -147,133 +152,19 @@ inline VecType MULFULL_MOD(VecType x, VecType y, uint32_t q)
  * @param offset offset in the data fragments
  * @param max a dummy variable
  */
+template <typename T>
 inline void ADD_PROPS(
     Properties& props,
     VecType threshold,
     VecType mask,
     VecType symb,
     off_t offset,
-    uint32_t max)
-{
-    const VecType b = CMPEQ32(threshold, symb);
-    const VecType c = AND(mask, b);
-    auto d = MVMSK8(c);
-    const unsigned element_size = sizeof(uint32_t);
-    while (d > 0) {
-        unsigned byte_idx = __builtin_ctz(d);
-        off_t _offset = offset + byte_idx / element_size;
-        props.add(_offset, OOR_MARK);
-        d ^= 1 << byte_idx;
-    }
-}
-
-/* ================= Basic Operations for u16 ================= */
-
-/**
- * Modular addition for packed unsigned 16-bit integers
- *
- * @param x input register
- * @param y input register
- * @param q modulo
- * @return (x + y) mod q
- */
-inline VecType ADD_MOD(VecType x, VecType y, uint16_t q)
-{
-    VecType res = ADD16(x, y);
-    return MINU16(res, SUB16(res, F3_u16));
-}
-
-/**
- * Modular subtraction for packed unsigned 16-bit integers
- *
- * @param x input register
- * @param y input register
- * @param q modulo
- * @return (x - y) mod q
- */
-inline VecType SUB_MOD(VecType x, VecType y, uint16_t q)
-{
-    VecType res = SUB16(x, y);
-    return MINU16(res, SUB16(ADD16(x, F3_u16), y));
-}
-
-/**
- * Modular negation for packed unsigned 16-bit integers
- *
- * @param x input register
- * @param q modulo
- * @return (-x) mod q
- */
-inline VecType NEG_MOD(VecType x, uint16_t q)
-{
-    VecType res = SUB16(F3_u16, x);
-    return MINU16(res, SUB16(res, F3_u16));
-}
-
-/**
- * Modular multiplication for packed unsigned 16-bit integers
- *
- * @note We assume that at least `x` or `y` is less than `q-1` so it's
- * not necessary to verify overflow on multiplying elements
- *
- * @param x input register
- * @param y input register
- * @param q modulo
- * @return (x * y) mod q
- */
-inline VecType MUL_MOD(VecType x, VecType y, uint16_t q)
-{
-    VecType res = MUL16(x, y);
-    VecType lo = BLEND8(ZERO, res, MASK8_LO);
-    VecType hi = BLEND8(ZERO, SHIFTR(res, 1), MASK8_LO);
-    return SUB_MOD(lo, hi, q);
-}
-
-/**
- * Modular general multiplication for packed unsigned 16-bit integers
- *
- * @note It's necessary to verify overflow on multiplying elements
- *
- * @param x input register
- * @param y input register
- * @param q modulo
- * @return (x * y) mod q
- */
-inline VecType MULFULL_MOD(VecType x, VecType y, uint16_t q)
-{
-    VecType res = MUL16(x, y);
-
-    // filter elements of both of a & b = card-1
-    VecType cmp = AND(CMPEQ16(x, F3m1_u16), CMPEQ16(y, F3m1_u16));
-    res = ADD16(res, AND(ONE16, cmp));
-
-    VecType lo = BLEND8(ZERO, res, MASK8_LO);
-    VecType hi = BLEND8(ZERO, SHIFTR(res, 1), MASK8_LO);
-    return SUB_MOD(lo, hi, q);
-}
-
-/**
- * Update property for a given register for packed unsigned 32-bit integers
- *
- * @param props properties bound to fragments
- * @param threshold register storing max value in its elements
- * @param mask a specific mask
- * @param symb input register
- * @param offset offset in the data fragments
- * @param max a dummy variable
- */
-inline void ADD_PROPS(
-    Properties& props,
-    VecType threshold,
-    VecType mask,
-    VecType symb,
-    off_t offset,
-    uint16_t max)
+    T max)
 {
-    const VecType b = CMPEQ16(threshold, symb);
+    const VecType b = CMPEQ<T>(threshold, symb);
     const VecType c = AND(mask, b);
     auto d = MVMSK8(c);
-    const unsigned element_size = sizeof(uint16_t);
+    const unsigned element_size = sizeof(T);
     while (d > 0) {
         unsigned byte_idx = __builtin_ctz(d);
         off_t _offset = offset + byte_idx / element_size;

From 70ace14733e1e00b06f66ba4709b8786bc3565f6 Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Fri, 26 Oct 2018 17:23:40 +0200
Subject: [PATCH 23/77] SIMD Basic: use const & curly braces

---
 src/simd_basic.h | 33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)

diff --git a/src/simd_basic.h b/src/simd_basic.h
index 04f0dee3..85aa5f16 100644
--- a/src/simd_basic.h
+++ b/src/simd_basic.h
@@ -61,7 +61,7 @@ inline VecType CARD_M_1(T q)
 template <typename T>
 inline VecType ADD_MOD(VecType x, VecType y, T q)
 {
-    VecType res = ADD<T>(x, y);
+    const VecType res = ADD<T>(x, y);
     return MIN<T>(res, SUB<T>(res, CARD(q)));
 }
 
@@ -76,7 +76,7 @@ inline VecType ADD_MOD(VecType x, VecType y, T q)
 template <typename T>
 inline VecType SUB_MOD(VecType x, VecType y, T q)
 {
-    VecType res = SUB<T>(x, y);
+    const VecType res = SUB<T>(x, y);
     return MIN<T>(res, ADD<T>(res, CARD(q)));
 }
 
@@ -90,7 +90,7 @@ inline VecType SUB_MOD(VecType x, VecType y, T q)
 template <typename T>
 inline VecType NEG_MOD(VecType x, T q)
 {
-    VecType res = SUB<T>(CARD(q), x);
+    const VecType res = SUB<T>(CARD(q), x);
     return MIN<T>(res, SUB<T>(res, CARD(q)));
 }
 
@@ -108,10 +108,10 @@ inline VecType NEG_MOD(VecType x, T q)
 template <typename T>
 inline VecType MUL_MOD(VecType x, VecType y, T q)
 {
-    VecType res = MUL<T>(x, y);
-    VecType lo =
+    const VecType res = MUL<T>(x, y);
+    const VecType lo =
         (q == F3) ? BLEND8(ZERO, res, MASK8_LO) : BLEND16(ZERO, res, 0x55);
-    VecType hi = (q == F3) ? BLEND8(ZERO, SHIFTR(res, 1), MASK8_LO)
+    const VecType hi = (q == F3) ? BLEND8(ZERO, SHIFTR(res, 1), MASK8_LO)
                            : BLEND16(ZERO, SHIFTR(res, 2), 0x55);
     return SUB_MOD(lo, hi, q);
 }
@@ -132,12 +132,12 @@ inline VecType MULFULL_MOD(VecType x, VecType y, T q)
     VecType res = MUL<T>(x, y);
 
     // filter elements of both of a & b = card-1
-    VecType cmp = AND(CMPEQ<T>(x, CARD_M_1(q)), CMPEQ<T>(y, CARD_M_1(q)));
+    const VecType cmp = AND(CMPEQ<T>(x, CARD_M_1(q)), CMPEQ<T>(y, CARD_M_1(q)));
     res = (q == F3) ? XOR(res, AND(F4_u32, cmp)) : ADD<T>(res, AND(ONE32, cmp));
 
-    VecType lo =
+    const VecType lo =
         (q == F3) ? BLEND8(ZERO, res, MASK8_LO) : BLEND16(ZERO, res, 0x55);
-    VecType hi = (q == F3) ? BLEND8(ZERO, SHIFTR(res, 1), MASK8_LO)
+    const VecType hi = (q == F3) ? BLEND8(ZERO, SHIFTR(res, 1), MASK8_LO)
                            : BLEND16(ZERO, SHIFTR(res, 2), 0x55);
     return SUB_MOD(lo, hi, q);
 }
@@ -166,8 +166,8 @@ inline void ADD_PROPS(
     auto d = MVMSK8(c);
     const unsigned element_size = sizeof(T);
     while (d > 0) {
-        unsigned byte_idx = __builtin_ctz(d);
-        off_t _offset = offset + byte_idx / element_size;
+        const unsigned byte_idx = __builtin_ctz(d);
+        const size_t _offset = offset + byte_idx / element_size;
         props.add(_offset, OOR_MARK);
         d ^= 1 << byte_idx;
     }
@@ -191,7 +191,7 @@ inline void mul_coef_to_buf(const T a, T* src, T* dest, size_t len, T card)
     const size_t _last_len = len - _len * ratio;
 
     size_t i = 0;
-    size_t end = (_len > 3) ? _len - 3 : 0;
+    const size_t end = (_len > 3) ? _len - 3 : 0;
     for (; i < end; i += 4) {
         _dest[i] = MUL_MOD(coef, _src[i], card);
         _dest[i + 1] = MUL_MOD(coef, _src[i + 1], card);
@@ -203,7 +203,7 @@ inline void mul_coef_to_buf(const T a, T* src, T* dest, size_t len, T card)
     }
 
     if (_last_len > 0) {
-        DoubleSizeVal<T> coef_double = DoubleSizeVal<T>(a);
+        const DoubleSizeVal<T> coef_double = DoubleSizeVal<T>(a);
         for (size_t i = _len * ratio; i < len; i++) {
             dest[i] = (T)((coef_double * src[i]) % card);
         }
@@ -225,7 +225,7 @@ inline void add_two_bufs(T* src, T* dest, size_t len, T card)
     }
     if (_last_len > 0) {
         for (i = _len * ratio; i < len; i++) {
-            T tmp = src[i] + dest[i];
+            const T tmp = src[i] + dest[i];
             dest[i] = (tmp >= card) ? (tmp - card) : tmp;
         }
     }
@@ -249,10 +249,11 @@ inline void sub_two_bufs(T* bufa, T* bufb, T* res, size_t len, T card)
     if (_last_len > 0) {
         for (i = _len * ratio; i < len; i++) {
             // perform subtraction
-            if (bufa[i] >= bufb[i])
+            if (bufa[i] >= bufb[i]) {
                 res[i] = bufa[i] - bufb[i];
-            else
+            } else {
                 res[i] = card - (bufb[i] - bufa[i]);
+            }
         }
     }
 }

From 49c428733b2e382cb1c62814591d12190da23c9d Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Mon, 29 Oct 2018 11:32:00 +0100
Subject: [PATCH 24/77] SIMD FNT: get rid of refactored butterfly functions

---
 src/simd_fnt.h | 107 +++++++++++++++++++++++--------------------------
 1 file changed, 50 insertions(+), 57 deletions(-)

diff --git a/src/simd_fnt.h b/src/simd_fnt.h
index fc92b2d0..0fdb28af 100644
--- a/src/simd_fnt.h
+++ b/src/simd_fnt.h
@@ -38,77 +38,70 @@ namespace simd {
 
 /* ================= Vectorized Operations ================= */
 
-// butterfly CT with r == 1
-template <typename T>
-inline void BUTTERFLY_1(VecType* x, VecType* y, T q)
-{
-    VecType add = ADD_MOD(*x, *y, q);
-    *y = SUB_MOD(*x, *y, q);
-    *x = add;
-}
-
-// butterfly CT with r == q - 1
-template <typename T>
-inline void BUTTERFLY_2(VecType* x, VecType* y, T q)
-{
-    VecType add = ADD_MOD(*x, *y, q);
-    *x = SUB_MOD(*x, *y, q);
-    *y = add;
-}
-
-// butterfly CT with 1 < r < q - 1
-template <typename T>
-inline void BUTTERFLY_3(VecType c, VecType* x, VecType* y, T q)
-{
-    VecType z = MUL_MOD(c, *y, q);
-    *y = SUB_MOD(*x, z, q);
-    *x = ADD_MOD(*x, z, q);
-}
-
+/**
+ * Butterfly Cooley-Tukey operation
+ *
+ * x <- x + r * y
+ * y <- x - r * y
+ *
+ * @param rp1 coefficient `r` plus one
+ * @param c a register storing coefficient `r`
+ * @param x working register
+ * @param y working register
+ * @param q modulus
+ */
 template <typename T>
 inline void BUTTERFLY_CT(T rp1, VecType c, VecType* x, VecType* y, T q)
 {
-    if (rp1 == 2) {
-        BUTTERFLY_1(x, y, q);
-    } else if (rp1 < q) {
-        BUTTERFLY_3(c, x, y, q);
-    } else {
-        BUTTERFLY_2(x, y, q);
+    VecType z = (rp1 == 2) ? *y : MUL_MOD(c, *y, q);
+    if (rp1 < q) {
+        *y = SUB_MOD(*x, z, q);
+        *x = ADD_MOD(*x, z, q);
+    } else { // i.e. r == q - 1
+        *y = ADD_MOD(*x, z, q);
+        *x = SUB_MOD(*x, z, q);
     }
 }
 
-// butterfly GS w/ r = q - 1
-template <typename T>
-inline void BUTTERFLY_4(VecType* x, VecType* y, T q)
-{
-    VecType add = ADD_MOD(*x, *y, q);
-    *y = SUB_MOD(*y, *x, q);
-    *x = add;
-}
-
-// butterfly GS w/ 1 < r < q - 1
-// x = x + y mod q
-// y = z * (x - y) mod q
-template <typename T>
-inline void BUTTERFLY_5(VecType c, VecType* x, VecType* y, T q)
-{
-    VecType sub = SUB_MOD(*x, *y, q);
-    *x = ADD_MOD(*x, *y, q);
-    *y = MUL_MOD(c, sub, q);
-}
-
+/**
+ * Butterfly Gentleman-Sande operation
+ *
+ * x <- x + y
+ * y <- r * (x - y)
+ *
+ * @param rp1 coefficient `r` plus one
+ * @param c a register storing coefficient `r`
+ * @param x working register
+ * @param y working register
+ * @param q modulus
+ */
 template <typename T>
 inline void BUTTERFLY_GS(T rp1, VecType c, VecType* x, VecType* y, T q)
 {
+    VecType add = ADD_MOD(*x, *y, q);
     if (rp1 == 2) {
-        BUTTERFLY_1(x, y, q);
+        *y = SUB_MOD(*x, *y, q);
     } else if (rp1 < q) {
-        BUTTERFLY_5(c, x, y, q);
-    } else {
-        BUTTERFLY_4(x, y, q);
+        VecType sub = SUB_MOD(*x, *y, q);
+        *y = MUL_MOD(c, sub, q);
+    } else { // i.e. r == q - 1
+        *y = SUB_MOD(*y, *x, q);
     }
+    *x = add;
 }
 
+/**
+ * Butterfly Gentleman-Sande simple operation where y = 0
+ *
+ * x <- x, i.e. no operation
+ * y <- r * x
+ *
+ * @param rp1 coefficient `r` plus one
+ * @param c a register storing coefficient `r`
+ * @param x working register
+ * @param q modulus
+ * @return r * x
+ */
 template <typename T>
 inline VecType BUTTERFLY_GS_SIMPLE(T rp1, VecType c, VecType x, T q)
 {

From 692f7ff1d9dae5944b5ea8568a960a1c679c99e7 Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Mon, 29 Oct 2018 12:55:26 +0100
Subject: [PATCH 25/77] SIMD 128: add function is_all_zeros

---
 src/simd_128.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/simd_128.h b/src/simd_128.h
index 2361b6f3..48de4175 100644
--- a/src/simd_128.h
+++ b/src/simd_128.h
@@ -81,6 +81,10 @@ inline uint16_t TESTZ(VecType x, VecType y)
 {
     return _mm_testz_si128(x, y);
 }
+inline int is_all_zeros(VecType x)
+{
+    return _mm_testc_si128(ZERO, y);
+}
 
 #define SHIFTR(x, imm8) (_mm_srli_si128(x, imm8))
 #define BLEND8(x, y, mask) (_mm_blendv_epi8(x, y, mask))

From 324f470a32a6168480ae4d958fe456881e845b23 Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Mon, 29 Oct 2018 12:55:40 +0100
Subject: [PATCH 26/77] SIMD 256: add function is_all_zeros

---
 src/simd_256.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/simd_256.h b/src/simd_256.h
index b3ab8e78..e542b6ec 100644
--- a/src/simd_256.h
+++ b/src/simd_256.h
@@ -95,6 +95,10 @@ inline uint32_t TESTZ(VecType x, VecType y)
 {
     return _mm256_testz_si256(x, y);
 }
+inline int is_all_zeros(VecType x)
+{
+    return _mm256_testc_si256(ZERO, x);
+}
 
 #define SHIFTR(x, imm8) (_mm256_srli_si256(x, imm8))
 #define BLEND8(x, y, mask) (_mm256_blendv_epi8(x, y, mask))

From e7cfeafeb132582e9fd769bf2fd7c791797d95d4 Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Mon, 29 Oct 2018 12:55:57 +0100
Subject: [PATCH 27/77] SIMD Basic: refactor MULFULL_MOD

---
 src/simd_basic.h | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/src/simd_basic.h b/src/simd_basic.h
index 85aa5f16..67729e78 100644
--- a/src/simd_basic.h
+++ b/src/simd_basic.h
@@ -129,17 +129,16 @@ inline VecType MUL_MOD(VecType x, VecType y, T q)
 template <typename T>
 inline VecType MULFULL_MOD(VecType x, VecType y, T q)
 {
-    VecType res = MUL<T>(x, y);
+    const VecType res = MUL_MOD(x, y, q);
 
     // filter elements of both of a & b = card-1
     const VecType cmp = AND(CMPEQ<T>(x, CARD_M_1(q)), CMPEQ<T>(y, CARD_M_1(q)));
-    res = (q == F3) ? XOR(res, AND(F4_u32, cmp)) : ADD<T>(res, AND(ONE32, cmp));
 
-    const VecType lo =
-        (q == F3) ? BLEND8(ZERO, res, MASK8_LO) : BLEND16(ZERO, res, 0x55);
-    const VecType hi = (q == F3) ? BLEND8(ZERO, SHIFTR(res, 1), MASK8_LO)
-                           : BLEND16(ZERO, SHIFTR(res, 2), 0x55);
-    return SUB_MOD(lo, hi, q);
+    if (is_all_zeros(cmp) == 1) {
+        return res;
+    }
+    return (q == F3) ? XOR(res, AND(F4_u32, cmp)) :
+                       ADD<T>(res, AND(ONE32, cmp));
 }
 
 /**

From f28fccc454e0001ba917926e23b25edad93fb4a2 Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Mon, 29 Oct 2018 13:02:23 +0100
Subject: [PATCH 28/77] SIMD 128: fix is_all_zeros

---
 src/simd_128.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/simd_128.h b/src/simd_128.h
index 48de4175..0baac264 100644
--- a/src/simd_128.h
+++ b/src/simd_128.h
@@ -83,7 +83,7 @@ inline uint16_t TESTZ(VecType x, VecType y)
 }
 inline int is_all_zeros(VecType x)
 {
-    return _mm_testc_si128(ZERO, y);
+    return _mm_testc_si128(ZERO, x);
 }
 
 #define SHIFTR(x, imm8) (_mm_srli_si128(x, imm8))

From f65910a4cd4ec1a6f531f25af4459914c270257d Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Mon, 29 Oct 2018 16:34:20 +0100
Subject: [PATCH 29/77] SIMD basic: make locals in neg() const

---
 src/simd_basic.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/simd_basic.h b/src/simd_basic.h
index 67729e78..f7cee150 100644
--- a/src/simd_basic.h
+++ b/src/simd_basic.h
@@ -285,9 +285,9 @@ template <typename T>
 inline void neg(size_t len, T* buf, T card)
 {
     VecType* _buf = reinterpret_cast<VecType*>(buf);
-    unsigned ratio = sizeof(*_buf) / sizeof(*buf);
-    size_t _len = len / ratio;
-    size_t _last_len = len - _len * ratio;
+    const unsigned ratio = sizeof(*_buf) / sizeof(*buf);
+    const size_t _len = len / ratio;
+    const size_t _last_len = len - _len * ratio;
 
     size_t i;
     for (i = 0; i < _len; i++) {

From 163151a545bd50676392a60bcec39006abe3d8b0 Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Mon, 29 Oct 2018 16:37:46 +0100
Subject: [PATCH 30/77] SIMD FNT: fix "butterly" typo & remove commented-out pragmas

---
 src/simd_fnt.h | 26 ++++----------------------
 1 file changed, 4 insertions(+), 22 deletions(-)

diff --git a/src/simd_fnt.h b/src/simd_fnt.h
index 0fdb28af..05d4d7d0 100644
--- a/src/simd_fnt.h
+++ b/src/simd_fnt.h
@@ -115,7 +115,7 @@ inline VecType BUTTERFLY_GS_SIMPLE(T rp1, VecType c, VecType x, T q)
 }
 
 /**
- * Vectorized butterly CT step
+ * Vectorized butterfly CT step
  *
  * For each pair (P, Q) = (buf[i], buf[i + m]) for step = 2 * m and coef `r`
  *      P = P + r * Q
@@ -147,8 +147,6 @@ inline void butterfly_ct_step(
 
     const size_t end = (len > 1) ? len - 1 : 0;
     const unsigned bufs_nb = buf.get_n();
-    // #pragma omp parallel for
-    // #pragma unroll
     const std::vector<T*>& mem = buf.get_mem();
     for (unsigned i = start; i < bufs_nb; i += step) {
         VecType x1, y1;
@@ -156,9 +154,7 @@ inline void butterfly_ct_step(
         VecType* __restrict p = reinterpret_cast<VecType*>(mem[i]);
         VecType* __restrict q = reinterpret_cast<VecType*>(mem[i + m]);
 
-        // #pragma omp parallel for
         size_t j = 0;
-        // #pragma unroll
         for (; j < end; j += 2) {
             x1 = LOAD(p + j);
             y1 = LOAD(q + j);
@@ -213,10 +209,8 @@ inline static void do_butterfly_ct_2_layers(
     VecType* __restrict r = reinterpret_cast<VecType*>(mem[start + 2 * m]);
     VecType* __restrict s = reinterpret_cast<VecType*>(mem[start + 3 * m]);
 
-    // #pragma omp parallel for
     size_t j = 0;
     const size_t end = (len > 1) ? len - 1 : 0;
-    // #pragma unroll
     while (j < end) {
         // First layer (c1, x, y) & (c1, u, v)
         VecType x1 = LOAD(p);
@@ -281,7 +275,7 @@ inline static void do_butterfly_ct_2_layers(
 }
 
 /**
- * Vectorized butterly CT on two-layers at a time
+ * Vectorized butterfly CT on two-layers at a time
  *
  * For each quadruple
  * (P, Q, R, S) = (buf[i], buf[i + m], buf[i + 2 * m], buf[i + 3 * m])
@@ -325,8 +319,6 @@ inline void butterfly_ct_two_layers_step(
     const unsigned step = m << 2;
     const unsigned bufs_nb = buf.get_n();
 
-    // #pragma omp parallel for
-    // #pragma unroll
     const std::vector<T*>& mem = buf.get_mem();
     for (unsigned i = start; i < bufs_nb; i += step) {
         do_butterfly_ct_2_layers(mem, r1, r2, r3, i, m, len, card);
@@ -334,7 +326,7 @@ inline void butterfly_ct_two_layers_step(
 }
 
 /**
- * Vectorized butterly GS step
+ * Vectorized butterfly GS step
  *
  * For each pair (P, Q) = (buf[i], buf[i + m]) for step = 2 * m and coef `r`
  *      P = P + Q
@@ -365,8 +357,6 @@ inline void butterfly_gs_step(
 
     const size_t end = (len > 3) ? len - 3 : 0;
     const unsigned bufs_nb = buf.get_n();
-    // #pragma omp parallel for
-    // #pragma unroll
     const std::vector<T*>& mem = buf.get_mem();
     for (unsigned i = start; i < bufs_nb; i += step) {
         VecType x1, x2, x3, x4;
@@ -374,9 +364,7 @@ inline void butterfly_gs_step(
         VecType* __restrict p = reinterpret_cast<VecType*>(mem[i]);
         VecType* __restrict q = reinterpret_cast<VecType*>(mem[i + m]);
 
-        // #pragma omp parallel for
         size_t j = 0;
-        // #pragma unroll
         for (; j < end; j += 4) {
             x1 = LOAD(p + j);
             x2 = LOAD(p + j + 1);
@@ -416,7 +404,7 @@ inline void butterfly_gs_step(
 }
 
 /**
- * Vectorized butterly GS step
+ * Vectorized butterfly GS step
  *
  * For each pair (P, Q) = (buf[i], buf[i + m]) for step = 2 * m and coef `r`
  *      Q = r * P
@@ -446,8 +434,6 @@ inline void butterfly_gs_step_simple(
 
     const size_t end = (len > 1) ? len - 1 : 0;
     const unsigned bufs_nb = buf.get_n();
-    // #pragma omp parallel for
-    // #pragma unroll
     const std::vector<T*>& mem = buf.get_mem();
     for (unsigned i = start; i < bufs_nb; i += step) {
         VecType x1, y1;
@@ -455,9 +441,7 @@ inline void butterfly_gs_step_simple(
         VecType* __restrict p = reinterpret_cast<VecType*>(mem[i]);
         VecType* __restrict q = reinterpret_cast<VecType*>(mem[i + m]);
 
-        // #pragma omp parallel for
         size_t j = 0;
-        // #pragma unroll
         for (; j < end; j += 2) {
             x1 = LOAD(p + j);
             x2 = LOAD(p + j + 1);
@@ -495,14 +479,12 @@ inline void encode_post_process(
     const VecType _threshold = SET1(threshold);
     const VecType mask_hi = SET1(max);
 
-    // #pragma unroll
     const std::vector<T*>& mem = output.get_mem();
     for (unsigned frag_id = 0; frag_id < code_len; ++frag_id) {
         VecType* __restrict buf = reinterpret_cast<VecType*>(mem[frag_id]);
 
         size_t vec_id = 0;
         size_t end = (vecs_nb > 3) ? vecs_nb - 3 : 0;
-        // #pragma unroll
         for (; vec_id < end; vec_id += 4) {
             VecType a1 = LOAD(buf + vec_id);
             VecType a2 = LOAD(buf + vec_id + 1);

From 83c97ea4d4af31de447f4a070e780379059e7026 Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Mon, 29 Oct 2018 16:42:31 +0100
Subject: [PATCH 31/77] SIMD 256: remove NF4Type

---
 src/simd_256.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/simd_256.h b/src/simd_256.h
index e542b6ec..85a696a3 100644
--- a/src/simd_256.h
+++ b/src/simd_256.h
@@ -50,7 +50,6 @@ namespace simd {
 
 typedef __m256i VecType;
 typedef __m128i HalfVecType;
-typedef __uint128_t NF4Type;
 
 /* ============= Constant variable  ============ */
 

From 435f89d8b1442f8abe6c4023ca07d1ff8d80bfb7 Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Mon, 29 Oct 2018 16:42:51 +0100
Subject: [PATCH 32/77] SIMD NF4: remove NF4Type

---
 src/simd_nf4.h | 61 +++++++++++++++++++++++++-------------------------
 1 file changed, 30 insertions(+), 31 deletions(-)

diff --git a/src/simd_nf4.h b/src/simd_nf4.h
index b86e284e..de493220 100644
--- a/src/simd_nf4.h
+++ b/src/simd_nf4.h
@@ -39,18 +39,17 @@ namespace quadiron {
 namespace simd {
 
 typedef uint32_t aint32 __attribute__((aligned(ALIGNMENT)));
-typedef __uint128_t NF4Type;
 
-/** Return NF4Type integer from a _m128i register */
-static inline NF4Type m128i_to_uint128(__m128i v)
+/** Return __uint128_t integer from a _m128i register */
+static inline __uint128_t m128i_to_uint128(__m128i v)
 {
-    NF4Type i;
+    __uint128_t i;
     _mm_store_si128((__m128i*)&i, v);
 
     return i; // NOLINT(clang-analyzer-core.uninitialized.UndefReturn)
 }
 
-inline NF4Type expand16(uint16_t* arr, int n)
+inline __uint128_t expand16(uint16_t* arr, int n)
 {
     // since n <= 4
     uint16_t _arr[4] __attribute__((aligned(ALIGNMENT))) = {0, 0, 0, 0};
@@ -61,7 +60,7 @@ inline NF4Type expand16(uint16_t* arr, int n)
     return m128i_to_uint128(b);
 }
 
-inline NF4Type expand32(uint32_t* arr, int n)
+inline __uint128_t expand32(uint32_t* arr, int n)
 {
     // since n <= 4
     uint32_t _arr[4] __attribute__((aligned(simd::ALIGNMENT))) = {0, 0, 0, 0};
@@ -75,7 +74,7 @@ inline NF4Type expand32(uint32_t* arr, int n)
 inline GroupedValues<__uint128_t> unpack(__uint128_t a, int n)
 {
     uint16_t ai[8];
-    NF4Type values;
+    __uint128_t values;
 
     __m128i _a = _mm_loadu_si128((__m128i*)&a);
     ai[0] = _mm_extract_epi16(_a, 0);
@@ -101,7 +100,7 @@ inline GroupedValues<__uint128_t> unpack(__uint128_t a, int n)
 inline void unpack(__uint128_t a, GroupedValues<__uint128_t>& b, int n)
 {
     uint16_t ai[8];
-    NF4Type values;
+    __uint128_t values;
 
     __m128i _a = _mm_loadu_si128((__m128i*)&a);
     ai[0] = _mm_extract_epi16(_a, 0);
@@ -123,7 +122,7 @@ inline void unpack(__uint128_t a, GroupedValues<__uint128_t>& b, int n)
     b.values = values; // NOLINT(clang-analyzer-core.uninitialized.Assign)
 }
 
-inline NF4Type pack(__uint128_t a)
+inline __uint128_t pack(__uint128_t a)
 {
     __m128i _a = _mm_loadu_si128((__m128i*)&a);
     __m128i b = _mm_set_epi32(
@@ -135,7 +134,7 @@ inline NF4Type pack(__uint128_t a)
     return m128i_to_uint128(b);
 }
 
-inline NF4Type pack(__uint128_t a, uint32_t flag)
+inline __uint128_t pack(__uint128_t a, uint32_t flag)
 {
     aint32 b0, b1, b2, b3;
     __m128i _a = _mm_loadu_si128((__m128i*)&a);
@@ -179,35 +178,35 @@ inline void STORE_LOW(HalfVecType* address, VecType reg)
     _mm_store_si128(address, _mm256_castsi256_si128(reg));
 }
 
-inline NF4Type add(NF4Type a, NF4Type b)
+inline __uint128_t add(__uint128_t a, __uint128_t b)
 {
     HalfVecType res;
     VecType _a = CAST_TO_DOUBLE((HalfVecType)a);
     VecType _b = CAST_TO_DOUBLE((HalfVecType)b);
     STORE_LOW(&res, ADD_MOD(_a, _b, F4));
-    return (NF4Type)res;
+    return (__uint128_t)res;
 }
 
-inline NF4Type sub(NF4Type a, NF4Type b)
+inline __uint128_t sub(__uint128_t a, __uint128_t b)
 {
     HalfVecType res;
     VecType _a = CAST_TO_DOUBLE((HalfVecType)a);
     VecType _b = CAST_TO_DOUBLE((HalfVecType)b);
     STORE_LOW(&res, SUB_MOD(_a, _b, F4));
-    return (NF4Type)res;
+    return (__uint128_t)res;
 }
 
-inline NF4Type mul(NF4Type a, NF4Type b)
+inline __uint128_t mul(__uint128_t a, __uint128_t b)
 {
     HalfVecType res;
     VecType _a = CAST_TO_DOUBLE((HalfVecType)a);
     VecType _b = CAST_TO_DOUBLE((HalfVecType)b);
     STORE_LOW(&res, MULFULL_MOD(_a, _b, F4));
-    return (NF4Type)res;
+    return (__uint128_t)res;
 }
 
 inline void
-add_buf_to_two_bufs_rem(unsigned n, NF4Type* x, NF4Type* x_half, NF4Type* y)
+add_buf_to_two_bufs_rem(unsigned n, __uint128_t* x, __uint128_t* x_half, __uint128_t* y)
 {
     // add last _y[] to x and x_next
     HalfVecType* _x = reinterpret_cast<HalfVecType*>(x);
@@ -223,7 +222,7 @@ add_buf_to_two_bufs_rem(unsigned n, NF4Type* x, NF4Type* x_half, NF4Type* y)
     }
 }
 
-inline void hadamard_mul_rem(unsigned n, NF4Type* x, NF4Type* y)
+inline void hadamard_mul_rem(unsigned n, __uint128_t* x, __uint128_t* y)
 {
     HalfVecType* _x = reinterpret_cast<HalfVecType*>(x);
     HalfVecType* _y = reinterpret_cast<HalfVecType*>(y);
@@ -236,7 +235,7 @@ inline void hadamard_mul_rem(unsigned n, NF4Type* x, NF4Type* y)
 }
 
 inline void
-hadamard_mul_doubled_rem(unsigned n, NF4Type* x, NF4Type* x_half, NF4Type* y)
+hadamard_mul_doubled_rem(unsigned n, __uint128_t* x, __uint128_t* x_half, __uint128_t* y)
 {
     HalfVecType* _x = reinterpret_cast<HalfVecType*>(x);
     HalfVecType* _x_half = reinterpret_cast<HalfVecType*>(x_half);
@@ -253,40 +252,40 @@ hadamard_mul_doubled_rem(unsigned n, NF4Type* x, NF4Type* x_half, NF4Type* y)
 
 #elif defined(__SSE4_1__)
 
-inline NF4Type add(NF4Type a, NF4Type b)
+inline __uint128_t add(__uint128_t a, __uint128_t b)
 {
     VecType res;
     STORE(&res, ADD_MOD((VecType)a, (VecType)b, F4));
-    return (NF4Type)res;
+    return (__uint128_t)res;
 }
 
-inline NF4Type sub(NF4Type a, NF4Type b)
+inline __uint128_t sub(__uint128_t a, __uint128_t b)
 {
     VecType res;
     STORE(&res, SUB_MOD((VecType)a, (VecType)b, F4));
-    return (NF4Type)res;
+    return (__uint128_t)res;
 }
 
-inline NF4Type mul(NF4Type a, NF4Type b)
+inline __uint128_t mul(__uint128_t a, __uint128_t b)
 {
     VecType res;
     STORE(&res, MULFULL_MOD((VecType)a, (VecType)b, F4));
-    return (NF4Type)res;
+    return (__uint128_t)res;
 }
 
 inline void
-add_buf_to_two_bufs_rem(unsigned n, NF4Type* x, NF4Type* x_half, NF4Type* y)
+add_buf_to_two_bufs_rem(unsigned n, __uint128_t* x, __uint128_t* x_half, __uint128_t* y)
 {
     // do nothing
 }
 
-inline void hadamard_mul_rem(unsigned n, NF4Type* x, NF4Type* y)
+inline void hadamard_mul_rem(unsigned n, __uint128_t* x, __uint128_t* y)
 {
     // do nothing
 }
 
 inline void
-hadamard_mul_doubled_rem(unsigned n, NF4Type* x, NF4Type* x_half, NF4Type* y)
+hadamard_mul_doubled_rem(unsigned n, __uint128_t* x, __uint128_t* x_half, __uint128_t* y)
 {
     // do nothing
 }
@@ -296,7 +295,7 @@ hadamard_mul_doubled_rem(unsigned n, NF4Type* x, NF4Type* x_half, NF4Type* y)
 /* ==================== Operations for NF4 =================== */
 
 /** Add buffer `y` to two halves of `x`. `x` is of length `n` */
-inline void add_buf_to_two_bufs(unsigned n, NF4Type* _x, NF4Type* _y)
+inline void add_buf_to_two_bufs(unsigned n, __uint128_t* _x, __uint128_t* _y)
 {
     unsigned i;
     VecType* x = reinterpret_cast<VecType*>(_x);
@@ -308,7 +307,7 @@ inline void add_buf_to_two_bufs(unsigned n, NF4Type* _x, NF4Type* _y)
     const unsigned num_len = vec_len * ratio;
     const unsigned rem_len = half_len - num_len;
 
-    NF4Type* x_half = _x + half_len;
+    __uint128_t* x_half = _x + half_len;
     VecType* x_next = reinterpret_cast<VecType*>(x_half);
 
     // add y to the first half of `x`
@@ -327,7 +326,7 @@ inline void add_buf_to_two_bufs(unsigned n, NF4Type* _x, NF4Type* _y)
     }
 }
 
-inline void hadamard_mul(unsigned n, NF4Type* _x, NF4Type* _y)
+inline void hadamard_mul(unsigned n, __uint128_t* _x, __uint128_t* _y)
 {
     unsigned i;
     VecType* x = reinterpret_cast<VecType*>(_x);

From 4bd2d6c88f44617780f794ae1211910e3a0fe31c Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Mon, 29 Oct 2018 17:23:14 +0100
Subject: [PATCH 33/77] SIMD NF4: remove C-style cast

---
 src/simd_nf4.h | 68 +++++++++++++++++++++++++++++++-------------------
 1 file changed, 43 insertions(+), 25 deletions(-)

diff --git a/src/simd_nf4.h b/src/simd_nf4.h
index de493220..175a760b 100644
--- a/src/simd_nf4.h
+++ b/src/simd_nf4.h
@@ -168,9 +168,15 @@ inline __uint128_t pack(__uint128_t a, uint32_t flag)
 
 #if defined(__AVX2__)
 
-inline VecType CAST_TO_DOUBLE(HalfVecType x)
+inline VecType load_to_reg(HalfVecType x)
 {
-    return _mm256_castsi128_si256(x);
+    return _mm256_castsi128_si256(_mm_load_si128(&x));
+}
+
+inline VecType load_to_reg(__uint128_t x)
+{
+    const HalfVecType* _x = reinterpret_cast<const HalfVecType*>(&x);
+    return load_to_reg(*_x);
 }
 
 inline void STORE_LOW(HalfVecType* address, VecType reg)
@@ -181,28 +187,28 @@ inline void STORE_LOW(HalfVecType* address, VecType reg)
 inline __uint128_t add(__uint128_t a, __uint128_t b)
 {
     HalfVecType res;
-    VecType _a = CAST_TO_DOUBLE((HalfVecType)a);
-    VecType _b = CAST_TO_DOUBLE((HalfVecType)b);
+    VecType _a = load_to_reg(a);
+    VecType _b = load_to_reg(b);
     STORE_LOW(&res, ADD_MOD(_a, _b, F4));
-    return (__uint128_t)res;
+    return reinterpret_cast<__uint128_t>(res);
 }
 
 inline __uint128_t sub(__uint128_t a, __uint128_t b)
 {
     HalfVecType res;
-    VecType _a = CAST_TO_DOUBLE((HalfVecType)a);
-    VecType _b = CAST_TO_DOUBLE((HalfVecType)b);
+    VecType _a = load_to_reg(a);
+    VecType _b = load_to_reg(b);
     STORE_LOW(&res, SUB_MOD(_a, _b, F4));
-    return (__uint128_t)res;
+    return reinterpret_cast<__uint128_t>(res);
 }
 
 inline __uint128_t mul(__uint128_t a, __uint128_t b)
 {
     HalfVecType res;
-    VecType _a = CAST_TO_DOUBLE((HalfVecType)a);
-    VecType _b = CAST_TO_DOUBLE((HalfVecType)b);
+    VecType _a = load_to_reg(a);
+    VecType _b = load_to_reg(b);
     STORE_LOW(&res, MULFULL_MOD(_a, _b, F4));
-    return (__uint128_t)res;
+    return reinterpret_cast<__uint128_t>(res);
 }
 
 inline void
@@ -213,9 +219,9 @@ add_buf_to_two_bufs_rem(unsigned n, __uint128_t* x, __uint128_t* x_half, __uint1
     HalfVecType* _x_half = reinterpret_cast<HalfVecType*>(x_half);
     HalfVecType* _y = reinterpret_cast<HalfVecType*>(y);
     for (unsigned i = 0; i < n; ++i) {
-        VecType _x_p = CAST_TO_DOUBLE(_x[i]);
-        VecType _x_next_p = CAST_TO_DOUBLE(_x_half[i]);
-        VecType _y_p = CAST_TO_DOUBLE(_y[i]);
+        VecType _x_p = load_to_reg(_x[i]);
+        VecType _x_next_p = load_to_reg(_x_half[i]);
+        VecType _y_p = load_to_reg(_y[i]);
 
         STORE_LOW(_x + i, ADD_MOD(_x_p, _y_p, F4));
         STORE_LOW(_x_half + i, ADD_MOD(_x_next_p, _y_p, F4));
@@ -227,8 +233,8 @@ inline void hadamard_mul_rem(unsigned n, __uint128_t* x, __uint128_t* y)
     HalfVecType* _x = reinterpret_cast<HalfVecType*>(x);
     HalfVecType* _y = reinterpret_cast<HalfVecType*>(y);
     for (unsigned i = 0; i < n; ++i) {
-        VecType _x_p = CAST_TO_DOUBLE(_x[i]);
-        VecType _y_p = CAST_TO_DOUBLE(_y[i]);
+        VecType _x_p = load_to_reg(_x[i]);
+        VecType _y_p = load_to_reg(_y[i]);
 
         STORE_LOW(_x + i, MULFULL_MOD(_x_p, _y_p, F4));
     }
@@ -241,9 +247,9 @@ hadamard_mul_doubled_rem(unsigned n, __uint128_t* x, __uint128_t* x_half, __uint
     HalfVecType* _x_half = reinterpret_cast<HalfVecType*>(x_half);
     HalfVecType* _y = reinterpret_cast<HalfVecType*>(y);
     for (unsigned i = 0; i < n; ++i) {
-        VecType _x_p = CAST_TO_DOUBLE(_x[i]);
-        VecType _x_next_p = CAST_TO_DOUBLE(_x_half[i]);
-        VecType _y_p = CAST_TO_DOUBLE(_y[i]);
+        VecType _x_p = load_to_reg(_x[i]);
+        VecType _x_next_p = load_to_reg(_x_half[i]);
+        VecType _y_p = load_to_reg(_y[i]);
 
         STORE_LOW(_x + i, MULFULL_MOD(_x_p, _y_p, F4));
         STORE_LOW(_x_half + i, MULFULL_MOD(_x_next_p, _y_p, F4));
@@ -252,25 +258,37 @@ hadamard_mul_doubled_rem(unsigned n, __uint128_t* x, __uint128_t* x_half, __uint
 
 #elif defined(__SSE4_1__)
 
+inline VecType load_to_reg(__uint128_t x)
+{
+    const VecType* _x = reinterpret_cast<const VecType*>(&x);
+    return _mm_load_si128(_x);
+}
+
 inline __uint128_t add(__uint128_t a, __uint128_t b)
 {
     VecType res;
-    STORE(&res, ADD_MOD((VecType)a, (VecType)b, F4));
-    return (__uint128_t)res;
+    VecType _a = load_to_reg(a);
+    VecType _b = load_to_reg(b);
+    STORE(&res, ADD_MOD(_a, _b, F4));
+    return reinterpret_cast<__uint128_t>(res);
 }
 
 inline __uint128_t sub(__uint128_t a, __uint128_t b)
 {
     VecType res;
-    STORE(&res, SUB_MOD((VecType)a, (VecType)b, F4));
-    return (__uint128_t)res;
+    VecType _a = load_to_reg(a);
+    VecType _b = load_to_reg(b);
+    STORE(&res, SUB_MOD(_a, _b, F4));
+    return reinterpret_cast<__uint128_t>(res);
 }
 
 inline __uint128_t mul(__uint128_t a, __uint128_t b)
 {
     VecType res;
-    STORE(&res, MULFULL_MOD((VecType)a, (VecType)b, F4));
-    return (__uint128_t)res;
+    VecType _a = load_to_reg(a);
+    VecType _b = load_to_reg(b);
+    STORE(&res, MULFULL_MOD(_a, _b, F4));
+    return reinterpret_cast<__uint128_t>(res);
 }
 
 inline void

From f608c75b147e0aa2a5eb7134503f080b23e11cd4 Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Tue, 30 Oct 2018 11:04:14 +0100
Subject: [PATCH 34/77] FFT_2n.h: precompute SIMD indices in Radix2 constructor

---
 src/fft_2n.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/src/fft_2n.h b/src/fft_2n.h
index 8a63bbb2..86626c25 100644
--- a/src/fft_2n.h
+++ b/src/fft_2n.h
@@ -142,6 +142,11 @@ class Radix2 : public FourierTransform<T> {
     size_t pkt_size;
     size_t buf_size;
 
+    // Indices used for accelerated functions
+    size_t simd_vec_len;
+    size_t simd_trailing_len;
+    size_t simd_offset;
+
     std::unique_ptr<T[]> rev = nullptr;
     std::unique_ptr<vec::Vector<T>> W = nullptr;
     std::unique_ptr<vec::Vector<T>> inv_W = nullptr;
@@ -182,6 +187,12 @@ Radix2<T>::Radix2(const gf::Field<T>& gf, int n, int data_len, size_t pkt_size)
 
     rev = std::unique_ptr<T[]>(new T[n]);
     init_bitrev();
+
+    // Indices used for accelerated functions
+    const unsigned ratio = simd::countof<T>();
+    simd_vec_len = this->pkt_size / ratio;
+    simd_trailing_len = this->pkt_size - simd_vec_len * ratio;
+    simd_offset = simd_vec_len * ratio;
 }
 
 template <typename T>

From 664cb68bb5baadd18d4f4a242f4b9e103514eddb Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Tue, 30 Oct 2018 11:04:48 +0100
Subject: [PATCH 35/77] FFT_2n.cpp: remove calculation of indices

---
 src/fft_2n.cpp | 106 ++++++++++++++-----------------------------------
 1 file changed, 30 insertions(+), 76 deletions(-)

diff --git a/src/fft_2n.cpp b/src/fft_2n.cpp
index f3d8847f..efa7abd8 100644
--- a/src/fft_2n.cpp
+++ b/src/fft_2n.cpp
@@ -48,10 +48,6 @@ void Radix2<uint16_t>::butterfly_ct_two_layers_step(
     unsigned start,
     unsigned m)
 {
-    const unsigned ratio = simd::countof<uint16_t>();
-    const size_t len = this->pkt_size;
-    const size_t vec_len = len / ratio;
-    const size_t last_len = len - vec_len * ratio;
     const unsigned coefIndex = start * this->n / m / 2;
     const uint16_t r1 = vec_W[coefIndex];
     const uint16_t r2 = vec_W[coefIndex / 2];
@@ -59,29 +55,28 @@ void Radix2<uint16_t>::butterfly_ct_two_layers_step(
 
     // perform vector operations
     simd::butterfly_ct_two_layers_step(
-        buf, r1, r2, r3, start, m, vec_len, card);
+        buf, r1, r2, r3, start, m, simd_vec_len, card);
 
     // for last elements, perform as non-SIMD method
-    if (last_len > 0) {
+    if (simd_trailing_len > 0) {
         const unsigned step = m << 2;
-        size_t offset = vec_len * ratio;
         //  ---------
         // First layer
         //  ---------
         const uint16_t r1 = W->get(start * this->n / m / 2);
         // first pair
-        butterfly_ct_step_slow(buf, r1, start, m, step, offset);
+        butterfly_ct_step_slow(buf, r1, start, m, step, simd_offset);
         // second pair
-        butterfly_ct_step_slow(buf, r1, start + 2 * m, m, step, offset);
+        butterfly_ct_step_slow(buf, r1, start + 2 * m, m, step, simd_offset);
         //  ---------
         // Second layer
         //  ---------
         // first pair
         const uint16_t r2 = W->get(start * this->n / m / 4);
-        butterfly_ct_step_slow(buf, r2, start, 2 * m, step, offset);
+        butterfly_ct_step_slow(buf, r2, start, 2 * m, step, simd_offset);
         // second pair
         const uint16_t r3 = W->get((start + m) * this->n / m / 4);
-        butterfly_ct_step_slow(buf, r3, start + m, 2 * m, step, offset);
+        butterfly_ct_step_slow(buf, r3, start + m, 2 * m, step, simd_offset);
     }
 }
 
@@ -93,18 +88,12 @@ void Radix2<uint16_t>::butterfly_ct_step(
     unsigned m,
     unsigned step)
 {
-    const unsigned ratio = simd::countof<uint16_t>();
-    const size_t len = this->pkt_size;
-    const size_t vec_len = len / ratio;
-    const size_t last_len = len - vec_len * ratio;
-
     // perform vector operations
-    simd::butterfly_ct_step(buf, r, start, m, step, vec_len, card);
+    simd::butterfly_ct_step(buf, r, start, m, step, simd_vec_len, card);
 
     // for last elements, perform as non-SIMD method
-    if (last_len > 0) {
-        size_t offset = vec_len * ratio;
-        butterfly_ct_step_slow(buf, r, start, m, step, offset);
+    if (simd_trailing_len > 0) {
+        butterfly_ct_step_slow(buf, r, start, m, step, simd_offset);
     }
 }
 
@@ -116,18 +105,12 @@ void Radix2<uint16_t>::butterfly_gs_step(
     unsigned m,
     unsigned step)
 {
-    const unsigned ratio = simd::countof<uint16_t>();
-    const size_t len = this->pkt_size;
-    const size_t vec_len = len / ratio;
-    const size_t last_len = len - vec_len * ratio;
-
     // perform vector operations
-    simd::butterfly_gs_step(buf, coef, start, m, vec_len, card);
+    simd::butterfly_gs_step(buf, coef, start, m, simd_vec_len, card);
 
     // for last elements, perform as non-SIMD method
-    if (last_len > 0) {
-        size_t offset = vec_len * ratio;
-        butterfly_gs_step_slow(buf, coef, start, m, step, offset);
+    if (simd_trailing_len > 0) {
+        butterfly_gs_step_slow(buf, coef, start, m, step, simd_offset);
     }
 }
 
@@ -139,18 +122,12 @@ void Radix2<uint16_t>::butterfly_gs_step_simple(
     unsigned m,
     unsigned step)
 {
-    const unsigned ratio = simd::countof<uint16_t>();
-    const size_t len = this->pkt_size;
-    const size_t vec_len = len / ratio;
-    const size_t last_len = len - vec_len * ratio;
-
     // perform vector operations
-    simd::butterfly_gs_step_simple(buf, coef, start, m, vec_len, card);
+    simd::butterfly_gs_step_simple(buf, coef, start, m, simd_vec_len, card);
 
     // for last elements, perform as non-SIMD method
-    if (last_len > 0) {
-        size_t offset = vec_len * ratio;
-        butterfly_gs_step_simple_slow(buf, coef, start, m, step, offset);
+    if (simd_trailing_len > 0) {
+        butterfly_gs_step_simple_slow(buf, coef, start, m, step, simd_offset);
     }
 }
 
@@ -160,10 +137,6 @@ void Radix2<uint32_t>::butterfly_ct_two_layers_step(
     unsigned start,
     unsigned m)
 {
-    const unsigned ratio = simd::countof<uint32_t>();
-    const size_t len = this->pkt_size;
-    const size_t vec_len = len / ratio;
-    const size_t last_len = len - vec_len * ratio;
     const unsigned coefIndex = start * this->n / m / 2;
     const uint32_t r1 = vec_W[coefIndex];
     const uint32_t r2 = vec_W[coefIndex / 2];
@@ -171,29 +144,28 @@ void Radix2<uint32_t>::butterfly_ct_two_layers_step(
 
     // perform vector operations
     simd::butterfly_ct_two_layers_step(
-        buf, r1, r2, r3, start, m, vec_len, card);
+        buf, r1, r2, r3, start, m, simd_vec_len, card);
 
     // for last elements, perform as non-SIMD method
-    if (last_len > 0) {
+    if (simd_trailing_len > 0) {
         const unsigned step = m << 2;
-        size_t offset = vec_len * ratio;
         //  ---------
         // First layer
         //  ---------
         const uint32_t r1 = W->get(start * this->n / m / 2);
         // first pair
-        butterfly_ct_step_slow(buf, r1, start, m, step, offset);
+        butterfly_ct_step_slow(buf, r1, start, m, step, simd_offset);
         // second pair
-        butterfly_ct_step_slow(buf, r1, start + 2 * m, m, step, offset);
+        butterfly_ct_step_slow(buf, r1, start + 2 * m, m, step, simd_offset);
         //  ---------
         // Second layer
         //  ---------
         // first pair
         const uint32_t r2 = W->get(start * this->n / m / 4);
-        butterfly_ct_step_slow(buf, r2, start, 2 * m, step, offset);
+        butterfly_ct_step_slow(buf, r2, start, 2 * m, step, simd_offset);
         // second pair
         const uint32_t r3 = W->get((start + m) * this->n / m / 4);
-        butterfly_ct_step_slow(buf, r3, start + m, 2 * m, step, offset);
+        butterfly_ct_step_slow(buf, r3, start + m, 2 * m, step, simd_offset);
     }
 }
 
@@ -205,18 +177,12 @@ void Radix2<uint32_t>::butterfly_ct_step(
     unsigned m,
     unsigned step)
 {
-    const unsigned ratio = simd::countof<uint32_t>();
-    const size_t len = this->pkt_size;
-    const size_t vec_len = len / ratio;
-    const size_t last_len = len - vec_len * ratio;
-
     // perform vector operations
-    simd::butterfly_ct_step(buf, r, start, m, step, vec_len, card);
+    simd::butterfly_ct_step(buf, r, start, m, step, simd_vec_len, card);
 
     // for last elements, perform as non-SIMD method
-    if (last_len > 0) {
-        size_t offset = vec_len * ratio;
-        butterfly_ct_step_slow(buf, r, start, m, step, offset);
+    if (simd_trailing_len > 0) {
+        butterfly_ct_step_slow(buf, r, start, m, step, simd_offset);
     }
 }
 
@@ -228,18 +194,12 @@ void Radix2<uint32_t>::butterfly_gs_step(
     unsigned m,
     unsigned step)
 {
-    const unsigned ratio = simd::countof<uint32_t>();
-    const size_t len = this->pkt_size;
-    const size_t vec_len = len / ratio;
-    const size_t last_len = len - vec_len * ratio;
-
     // perform vector operations
-    simd::butterfly_gs_step(buf, coef, start, m, vec_len, card);
+    simd::butterfly_gs_step(buf, coef, start, m, simd_vec_len, card);
 
     // for last elements, perform as non-SIMD method
-    if (last_len > 0) {
-        size_t offset = vec_len * ratio;
-        butterfly_gs_step_slow(buf, coef, start, m, step, offset);
+    if (simd_trailing_len > 0) {
+        butterfly_gs_step_slow(buf, coef, start, m, step, simd_offset);
     }
 }
 
@@ -251,18 +211,12 @@ void Radix2<uint32_t>::butterfly_gs_step_simple(
     unsigned m,
     unsigned step)
 {
-    const unsigned ratio = simd::countof<uint32_t>();
-    const size_t len = this->pkt_size;
-    const size_t vec_len = len / ratio;
-    const size_t last_len = len - vec_len * ratio;
-
     // perform vector operations
-    simd::butterfly_gs_step_simple(buf, coef, start, m, vec_len, card);
+    simd::butterfly_gs_step_simple(buf, coef, start, m, simd_vec_len, card);
 
     // for last elements, perform as non-SIMD method
-    if (last_len > 0) {
-        size_t offset = vec_len * ratio;
-        butterfly_gs_step_simple_slow(buf, coef, start, m, step, offset);
+    if (simd_trailing_len > 0) {
+        butterfly_gs_step_simple_slow(buf, coef, start, m, step, simd_offset);
     }
 }
 

From e657e79d3392a0dcb9ef9b24ac92ac54e1c8f9d1 Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Tue, 30 Oct 2018 11:11:37 +0100
Subject: [PATCH 36/77] FFT_2n.h: define butterfly_ct_two_layers_step_slow

---
 src/fft_2n.h | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/src/fft_2n.h b/src/fft_2n.h
index 86626c25..4af68528 100644
--- a/src/fft_2n.h
+++ b/src/fft_2n.h
@@ -112,6 +112,11 @@ class Radix2 : public FourierTransform<T> {
         unsigned step);
 
     // Only used for non-vectorized elements
+    void butterfly_ct_two_layers_step_slow(
+        vec::Buffers<T>& buf,
+        unsigned start,
+        unsigned m,
+        size_t offset = 0);
     void butterfly_ct_step_slow(
         vec::Buffers<T>& buf,
         T coef,
@@ -432,6 +437,16 @@ void Radix2<T>::butterfly_ct_two_layers_step(
     vec::Buffers<T>& buf,
     unsigned start,
     unsigned m)
+{
+    butterfly_ct_two_layers_step_slow(buf, start, m);
+}
+
+template <typename T>
+void Radix2<T>::butterfly_ct_two_layers_step_slow(
+    vec::Buffers<T>& buf,
+    unsigned start,
+    unsigned m,
+    size_t offset)
 {
     const unsigned step = m << 2;
     //  ---------
@@ -439,18 +454,18 @@ void Radix2<T>::butterfly_ct_two_layers_step(
     //  ---------
     const T r1 = W->get(start * this->n / m / 2);
     // first pair
-    butterfly_ct_step(buf, r1, start, m, step);
+    butterfly_ct_step_slow(buf, r1, start, m, step, offset);
     // second pair
-    butterfly_ct_step(buf, r1, start + 2 * m, m, step);
+    butterfly_ct_step_slow(buf, r1, start + 2 * m, m, step, offset);
     //  ---------
     // Second layer
     //  ---------
     // first pair
     const T r2 = W->get(start * this->n / m / 4);
-    butterfly_ct_step(buf, r2, start, 2 * m, step);
+    butterfly_ct_step_slow(buf, r2, start, 2 * m, step, offset);
     // second pair
     const T r3 = W->get((start + m) * this->n / m / 4);
-    butterfly_ct_step(buf, r3, start + m, 2 * m, step);
+    butterfly_ct_step_slow(buf, r3, start + m, 2 * m, step, offset);
 }
 
 template <typename T>

From cb734b42d09a42fa47fb53fdd0bfa5f9e4a7c8d7 Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Tue, 30 Oct 2018 11:11:58 +0100
Subject: [PATCH 37/77] FFT_2n.cpp: use butterfly_ct_two_layers_step_slow

---
 src/fft_2n.cpp | 38 ++------------------------------------
 1 file changed, 2 insertions(+), 36 deletions(-)

diff --git a/src/fft_2n.cpp b/src/fft_2n.cpp
index efa7abd8..f7d91468 100644
--- a/src/fft_2n.cpp
+++ b/src/fft_2n.cpp
@@ -59,24 +59,7 @@ void Radix2<uint16_t>::butterfly_ct_two_layers_step(
 
     // for last elements, perform as non-SIMD method
     if (simd_trailing_len > 0) {
-        const unsigned step = m << 2;
-        //  ---------
-        // First layer
-        //  ---------
-        const uint16_t r1 = W->get(start * this->n / m / 2);
-        // first pair
-        butterfly_ct_step_slow(buf, r1, start, m, step, simd_offset);
-        // second pair
-        butterfly_ct_step_slow(buf, r1, start + 2 * m, m, step, simd_offset);
-        //  ---------
-        // Second layer
-        //  ---------
-        // first pair
-        const uint16_t r2 = W->get(start * this->n / m / 4);
-        butterfly_ct_step_slow(buf, r2, start, 2 * m, step, simd_offset);
-        // second pair
-        const uint16_t r3 = W->get((start + m) * this->n / m / 4);
-        butterfly_ct_step_slow(buf, r3, start + m, 2 * m, step, simd_offset);
+        butterfly_ct_two_layers_step_slow(buf, start, m, simd_offset);
     }
 }
 
@@ -148,24 +131,7 @@ void Radix2<uint32_t>::butterfly_ct_two_layers_step(
 
     // for last elements, perform as non-SIMD method
     if (simd_trailing_len > 0) {
-        const unsigned step = m << 2;
-        //  ---------
-        // First layer
-        //  ---------
-        const uint32_t r1 = W->get(start * this->n / m / 2);
-        // first pair
-        butterfly_ct_step_slow(buf, r1, start, m, step, simd_offset);
-        // second pair
-        butterfly_ct_step_slow(buf, r1, start + 2 * m, m, step, simd_offset);
-        //  ---------
-        // Second layer
-        //  ---------
-        // first pair
-        const uint32_t r2 = W->get(start * this->n / m / 4);
-        butterfly_ct_step_slow(buf, r2, start, 2 * m, step, simd_offset);
-        // second pair
-        const uint32_t r3 = W->get((start + m) * this->n / m / 4);
-        butterfly_ct_step_slow(buf, r3, start + m, 2 * m, step, simd_offset);
+        butterfly_ct_two_layers_step_slow(buf, start, m, simd_offset);
     }
 }
 

From 1a11fb0dd2b1f9598993f1287fd8b08972dedf0f Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Tue, 30 Oct 2018 11:16:37 +0100
Subject: [PATCH 38/77] FEC RS FNT: simd indices as member variables

---
 src/fec_rs_fnt.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/src/fec_rs_fnt.h b/src/fec_rs_fnt.h
index 4bdd5475..55ffb487 100644
--- a/src/fec_rs_fnt.h
+++ b/src/fec_rs_fnt.h
@@ -60,6 +60,11 @@ class RsFnt : public FecCode<T> {
     // decoding context used in encoding of systematic FNT
     std::unique_ptr<DecodeContext<T>> enc_context;
 
+    // Indices used for accelerated functions
+    size_t simd_vec_len;
+    size_t simd_trailing_len;
+    size_t simd_offset;
+
   public:
     RsFnt(
         FecType type,
@@ -70,6 +75,12 @@ class RsFnt : public FecCode<T> {
         : FecCode<T>(type, word_size, n_data, n_parities, pkt_size)
     {
         this->fec_init();
+
+        // Indices used for accelerated functions
+        const unsigned ratio = simd::countof<T>();
+        simd_vec_len = this->pkt_size / ratio;
+        simd_trailing_len = this->pkt_size - simd_vec_len * ratio;
+        simd_offset = simd_vec_len * ratio;
     }
 
     inline void check_params() override

From e1e9eebaa39e06be6af43d6048b3f199c27ceacb Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Tue, 30 Oct 2018 11:16:59 +0100
Subject: [PATCH 39/77] FEC Vectorisation: use FNT's simd indices

---
 src/fec_vectorisation.cpp | 26 ++++++--------------------
 1 file changed, 6 insertions(+), 20 deletions(-)

diff --git a/src/fec_vectorisation.cpp b/src/fec_vectorisation.cpp
index 8684e1ab..ed82fab8 100644
--- a/src/fec_vectorisation.cpp
+++ b/src/fec_vectorisation.cpp
@@ -53,20 +53,13 @@ void RsFnt<uint16_t>::encode_post_process(
     uint16_t threshold = this->gf->card_minus_one();
     unsigned code_len = this->n_outputs;
 
-    // number of elements per vector register
-    unsigned vec_size = simd::countof<uint16_t>();
-    // number of vector registers per fragment packet
-    size_t vecs_nb = size / vec_size;
-    // odd number of elements not vectorized
-    size_t last_len = size - vecs_nb * vec_size;
-
     simd::encode_post_process(
-        output, props, offset, code_len, threshold, vecs_nb);
+        output, props, offset, code_len, threshold, simd_vec_len);
 
-    if (last_len > 0) {
+    if (simd_trailing_len > 0) {
         for (unsigned i = 0; i < code_len; ++i) {
             uint16_t* chunk = output.get(i);
-            for (size_t j = vecs_nb * vec_size; j < size; ++j) {
+            for (size_t j = simd_offset; j < size; ++j) {
                 if (chunk[j] == threshold) {
                     props[i].add(offset + j, OOR_MARK);
                 }
@@ -85,20 +78,13 @@ void RsFnt<uint32_t>::encode_post_process(
     const uint32_t threshold = this->gf->card_minus_one();
     const unsigned code_len = this->n_outputs;
 
-    // number of elements per vector register
-    const unsigned vec_size = simd::countof<uint32_t>();
-    // number of vector registers per fragment packet
-    const size_t vecs_nb = size / vec_size;
-    // odd number of elements not vectorized
-    const size_t last_len = size - vecs_nb * vec_size;
-
     simd::encode_post_process(
-        output, props, offset, code_len, threshold, vecs_nb);
+        output, props, offset, code_len, threshold, simd_vec_len);
 
-    if (last_len > 0) {
+    if (simd_trailing_len > 0) {
         for (unsigned i = 0; i < code_len; ++i) {
             uint32_t* chunk = output.get(i);
-            for (size_t j = vecs_nb * vec_size; j < size; ++j) {
+            for (size_t j = simd_offset; j < size; ++j) {
                 if (chunk[j] == threshold) {
                     props[i].add(offset + j, OOR_MARK);
                 }

From 761eef761e37a7cf3850cd6161ba13cc90800417 Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Tue, 30 Oct 2018 11:27:04 +0100
Subject: [PATCH 40/77] SIMD Basic: clang-format fix

---
 src/simd_basic.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/simd_basic.h b/src/simd_basic.h
index f7cee150..106f620b 100644
--- a/src/simd_basic.h
+++ b/src/simd_basic.h
@@ -112,7 +112,7 @@ inline VecType MUL_MOD(VecType x, VecType y, T q)
     const VecType lo =
         (q == F3) ? BLEND8(ZERO, res, MASK8_LO) : BLEND16(ZERO, res, 0x55);
     const VecType hi = (q == F3) ? BLEND8(ZERO, SHIFTR(res, 1), MASK8_LO)
-                           : BLEND16(ZERO, SHIFTR(res, 2), 0x55);
+                                 : BLEND16(ZERO, SHIFTR(res, 2), 0x55);
     return SUB_MOD(lo, hi, q);
 }
 
@@ -137,8 +137,8 @@ inline VecType MULFULL_MOD(VecType x, VecType y, T q)
     if (is_all_zeros(cmp) == 1) {
         return res;
     }
-    return (q == F3) ? XOR(res, AND(F4_u32, cmp)) :
-                       ADD<T>(res, AND(ONE32, cmp));
+    return (q == F3) ? XOR(res, AND(F4_u32, cmp))
+                     : ADD<T>(res, AND(ONE32, cmp));
 }
 
 /**

From fbc8c77ba0be6d2dcf08a012e21f00e233a2654a Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Tue, 30 Oct 2018 11:27:28 +0100
Subject: [PATCH 41/77] SIMD NF4: clang-format fix

---
 src/simd_nf4.h | 28 ++++++++++++++++++++--------
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/src/simd_nf4.h b/src/simd_nf4.h
index 175a760b..e60a9880 100644
--- a/src/simd_nf4.h
+++ b/src/simd_nf4.h
@@ -211,8 +211,11 @@ inline __uint128_t mul(__uint128_t a, __uint128_t b)
     return reinterpret_cast<__uint128_t>(res);
 }
 
-inline void
-add_buf_to_two_bufs_rem(unsigned n, __uint128_t* x, __uint128_t* x_half, __uint128_t* y)
+inline void add_buf_to_two_bufs_rem(
+    unsigned n,
+    __uint128_t* x,
+    __uint128_t* x_half,
+    __uint128_t* y)
 {
     // add last _y[] to x and x_next
     HalfVecType* _x = reinterpret_cast<HalfVecType*>(x);
@@ -240,8 +243,11 @@ inline void hadamard_mul_rem(unsigned n, __uint128_t* x, __uint128_t* y)
     }
 }
 
-inline void
-hadamard_mul_doubled_rem(unsigned n, __uint128_t* x, __uint128_t* x_half, __uint128_t* y)
+inline void hadamard_mul_doubled_rem(
+    unsigned n,
+    __uint128_t* x,
+    __uint128_t* x_half,
+    __uint128_t* y)
 {
     HalfVecType* _x = reinterpret_cast<HalfVecType*>(x);
     HalfVecType* _x_half = reinterpret_cast<HalfVecType*>(x_half);
@@ -291,8 +297,11 @@ inline __uint128_t mul(__uint128_t a, __uint128_t b)
     return reinterpret_cast<__uint128_t>(res);
 }
 
-inline void
-add_buf_to_two_bufs_rem(unsigned n, __uint128_t* x, __uint128_t* x_half, __uint128_t* y)
+inline void add_buf_to_two_bufs_rem(
+    unsigned n,
+    __uint128_t* x,
+    __uint128_t* x_half,
+    __uint128_t* y)
 {
     // do nothing
 }
@@ -302,8 +311,11 @@ inline void hadamard_mul_rem(unsigned n, __uint128_t* x, __uint128_t* y)
     // do nothing
 }
 
-inline void
-hadamard_mul_doubled_rem(unsigned n, __uint128_t* x, __uint128_t* x_half, __uint128_t* y)
+inline void hadamard_mul_doubled_rem(
+    unsigned n,
+    __uint128_t* x,
+    __uint128_t* x_half,
+    __uint128_t* y)
 {
     // do nothing
 }

From 114186c185a97506bb9a2f8649e55ea7d566b53d Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Tue, 30 Oct 2018 11:28:38 +0100
Subject: [PATCH 42/77] SIMD: rename LOAD and load_to_reg to LoadToReg

---
 src/simd_128.h |  3 +--
 src/simd_256.h |  2 +-
 src/simd_fnt.h | 72 +++++++++++++++++++++++++-------------------------
 src/simd_nf4.h | 48 ++++++++++++++++-----------------
 4 files changed, 62 insertions(+), 63 deletions(-)

diff --git a/src/simd_128.h b/src/simd_128.h
index 0baac264..e04cf6cd 100644
--- a/src/simd_128.h
+++ b/src/simd_128.h
@@ -56,7 +56,7 @@ typedef __m128i VecType;
 
 /* ============= Essential Operations for SSE w/ both u16 & u32 ============ */
 
-inline VecType LOAD(VecType* address)
+inline VecType LoadToReg(VecType* address)
 {
     return _mm_load_si128(address);
 }
@@ -157,7 +157,6 @@ inline VecType CMPEQ<uint16_t>(VecType x, VecType y)
     return _mm_cmpeq_epi16(x, y);
 }
 
-
 template <typename T>
 inline VecType MIN(VecType x, VecType y);
 template <>
diff --git a/src/simd_256.h b/src/simd_256.h
index 85a696a3..80953e92 100644
--- a/src/simd_256.h
+++ b/src/simd_256.h
@@ -69,7 +69,7 @@ typedef __m128i HalfVecType;
 
 /* ============= Essential Operations for AVX2 w/ both u16 & u32 ============ */
 
-inline VecType LOAD(VecType* address)
+inline VecType LoadToReg(VecType* address)
 {
     return _mm256_load_si256(address);
 }
diff --git a/src/simd_fnt.h b/src/simd_fnt.h
index 05d4d7d0..7e025610 100644
--- a/src/simd_fnt.h
+++ b/src/simd_fnt.h
@@ -156,13 +156,13 @@ inline void butterfly_ct_step(
 
         size_t j = 0;
         for (; j < end; j += 2) {
-            x1 = LOAD(p + j);
-            y1 = LOAD(q + j);
+            x1 = LoadToReg(p + j);
+            y1 = LoadToReg(q + j);
 
             BUTTERFLY_CT(rp1, c, &x1, &y1, card);
 
-            x2 = LOAD(p + j + 1);
-            y2 = LOAD(q + j + 1);
+            x2 = LoadToReg(p + j + 1);
+            y2 = LoadToReg(q + j + 1);
 
             BUTTERFLY_CT(rp1, c, &x2, &y2, card);
 
@@ -173,8 +173,8 @@ inline void butterfly_ct_step(
             STORE(q + j + 1, y2);
         }
         for (; j < len; ++j) {
-            x1 = LOAD(p + j);
-            y1 = LOAD(q + j);
+            x1 = LoadToReg(p + j);
+            y1 = LoadToReg(q + j);
 
             BUTTERFLY_CT(rp1, c, &x1, &y1, card);
 
@@ -213,18 +213,18 @@ inline static void do_butterfly_ct_2_layers(
     const size_t end = (len > 1) ? len - 1 : 0;
     while (j < end) {
         // First layer (c1, x, y) & (c1, u, v)
-        VecType x1 = LOAD(p);
-        VecType x2 = LOAD(p + 1);
-        VecType y1 = LOAD(q);
-        VecType y2 = LOAD(q + 1);
+        VecType x1 = LoadToReg(p);
+        VecType x2 = LoadToReg(p + 1);
+        VecType y1 = LoadToReg(q);
+        VecType y2 = LoadToReg(q + 1);
 
         BUTTERFLY_CT(r1p1, c1, &x1, &y1, card);
         BUTTERFLY_CT(r1p1, c1, &x2, &y2, card);
 
-        VecType u1 = LOAD(r);
-        VecType u2 = LOAD(r + 1);
-        VecType v1 = LOAD(s);
-        VecType v2 = LOAD(s + 1);
+        VecType u1 = LoadToReg(r);
+        VecType u2 = LoadToReg(r + 1);
+        VecType v1 = LoadToReg(s);
+        VecType v2 = LoadToReg(s + 1);
 
         BUTTERFLY_CT(r1p1, c1, &u1, &v1, card);
         BUTTERFLY_CT(r1p1, c1, &u2, &v2, card);
@@ -255,10 +255,10 @@ inline static void do_butterfly_ct_2_layers(
 
     for (; j < len; ++j) {
         // First layer (c1, x, y) & (c1, u, v)
-        VecType x1 = LOAD(p + j);
-        VecType y1 = LOAD(q + j);
-        VecType u1 = LOAD(r + j);
-        VecType v1 = LOAD(s + j);
+        VecType x1 = LoadToReg(p + j);
+        VecType y1 = LoadToReg(q + j);
+        VecType u1 = LoadToReg(r + j);
+        VecType v1 = LoadToReg(s + j);
 
         // BUTTERFLY_3_test(c1, &x1, &y1, &u1, &v1, card);
         BUTTERFLY_CT(r1p1, c1, &x1, &y1, card);
@@ -366,14 +366,14 @@ inline void butterfly_gs_step(
 
         size_t j = 0;
         for (; j < end; j += 4) {
-            x1 = LOAD(p + j);
-            x2 = LOAD(p + j + 1);
-            x3 = LOAD(p + j + 2);
-            x4 = LOAD(p + j + 3);
-            y1 = LOAD(q + j);
-            y2 = LOAD(q + j + 1);
-            y3 = LOAD(q + j + 2);
-            y4 = LOAD(q + j + 3);
+            x1 = LoadToReg(p + j);
+            x2 = LoadToReg(p + j + 1);
+            x3 = LoadToReg(p + j + 2);
+            x4 = LoadToReg(p + j + 3);
+            y1 = LoadToReg(q + j);
+            y2 = LoadToReg(q + j + 1);
+            y3 = LoadToReg(q + j + 2);
+            y4 = LoadToReg(q + j + 3);
 
             BUTTERFLY_GS(rp1, c, &x1, &y1, card);
             BUTTERFLY_GS(rp1, c, &x2, &y2, card);
@@ -391,8 +391,8 @@ inline void butterfly_gs_step(
             STORE(q + j + 3, y4);
         }
         for (; j < len; ++j) {
-            x1 = LOAD(p + j);
-            y1 = LOAD(q + j);
+            x1 = LoadToReg(p + j);
+            y1 = LoadToReg(q + j);
 
             BUTTERFLY_GS(rp1, c, &x1, &y1, card);
 
@@ -443,8 +443,8 @@ inline void butterfly_gs_step_simple(
 
         size_t j = 0;
         for (; j < end; j += 2) {
-            x1 = LOAD(p + j);
-            x2 = LOAD(p + j + 1);
+            x1 = LoadToReg(p + j);
+            x2 = LoadToReg(p + j + 1);
 
             y1 = BUTTERFLY_GS_SIMPLE(rp1, c, x1, card);
             y2 = BUTTERFLY_GS_SIMPLE(rp1, c, x2, card);
@@ -454,7 +454,7 @@ inline void butterfly_gs_step_simple(
             STORE(q + j + 1, y2);
         }
         for (; j < len; ++j) {
-            x1 = LOAD(p + j);
+            x1 = LoadToReg(p + j);
 
             y1 = BUTTERFLY_GS_SIMPLE(rp1, c, x1, card);
 
@@ -486,10 +486,10 @@ inline void encode_post_process(
         size_t vec_id = 0;
         size_t end = (vecs_nb > 3) ? vecs_nb - 3 : 0;
         for (; vec_id < end; vec_id += 4) {
-            VecType a1 = LOAD(buf + vec_id);
-            VecType a2 = LOAD(buf + vec_id + 1);
-            VecType a3 = LOAD(buf + vec_id + 2);
-            VecType a4 = LOAD(buf + vec_id + 3);
+            VecType a1 = LoadToReg(buf + vec_id);
+            VecType a2 = LoadToReg(buf + vec_id + 1);
+            VecType a3 = LoadToReg(buf + vec_id + 2);
+            VecType a4 = LoadToReg(buf + vec_id + 3);
 
             if (TESTZ(a1, _threshold) == 0) {
                 const off_t curr_offset = offset + vec_id * vec_size;
@@ -513,7 +513,7 @@ inline void encode_post_process(
             }
         }
         for (; vec_id < vecs_nb; ++vec_id) {
-            VecType a = LOAD(buf + vec_id);
+            VecType a = LoadToReg(buf + vec_id);
             uint32_t c = TESTZ(a, _threshold);
             if (c == 0) {
                 const off_t curr_offset = offset + vec_id * vec_size;
diff --git a/src/simd_nf4.h b/src/simd_nf4.h
index e60a9880..9cf558e2 100644
--- a/src/simd_nf4.h
+++ b/src/simd_nf4.h
@@ -168,15 +168,15 @@ inline __uint128_t pack(__uint128_t a, uint32_t flag)
 
 #if defined(__AVX2__)
 
-inline VecType load_to_reg(HalfVecType x)
+inline VecType LoadToReg(HalfVecType x)
 {
     return _mm256_castsi128_si256(_mm_load_si128(&x));
 }
 
-inline VecType load_to_reg(__uint128_t x)
+inline VecType LoadToReg(__uint128_t x)
 {
     const HalfVecType* _x = reinterpret_cast<const HalfVecType*>(&x);
-    return load_to_reg(*_x);
+    return LoadToReg(*_x);
 }
 
 inline void STORE_LOW(HalfVecType* address, VecType reg)
@@ -187,8 +187,8 @@ inline void STORE_LOW(HalfVecType* address, VecType reg)
 inline __uint128_t add(__uint128_t a, __uint128_t b)
 {
     HalfVecType res;
-    VecType _a = load_to_reg(a);
-    VecType _b = load_to_reg(b);
+    VecType _a = LoadToReg(a);
+    VecType _b = LoadToReg(b);
     STORE_LOW(&res, ADD_MOD(_a, _b, F4));
     return reinterpret_cast<__uint128_t>(res);
 }
@@ -196,8 +196,8 @@ inline __uint128_t add(__uint128_t a, __uint128_t b)
 inline __uint128_t sub(__uint128_t a, __uint128_t b)
 {
     HalfVecType res;
-    VecType _a = load_to_reg(a);
-    VecType _b = load_to_reg(b);
+    VecType _a = LoadToReg(a);
+    VecType _b = LoadToReg(b);
     STORE_LOW(&res, SUB_MOD(_a, _b, F4));
     return reinterpret_cast<__uint128_t>(res);
 }
@@ -205,8 +205,8 @@ inline __uint128_t sub(__uint128_t a, __uint128_t b)
 inline __uint128_t mul(__uint128_t a, __uint128_t b)
 {
     HalfVecType res;
-    VecType _a = load_to_reg(a);
-    VecType _b = load_to_reg(b);
+    VecType _a = LoadToReg(a);
+    VecType _b = LoadToReg(b);
     STORE_LOW(&res, MULFULL_MOD(_a, _b, F4));
     return reinterpret_cast<__uint128_t>(res);
 }
@@ -222,9 +222,9 @@ inline void add_buf_to_two_bufs_rem(
     HalfVecType* _x_half = reinterpret_cast<HalfVecType*>(x_half);
     HalfVecType* _y = reinterpret_cast<HalfVecType*>(y);
     for (unsigned i = 0; i < n; ++i) {
-        VecType _x_p = load_to_reg(_x[i]);
-        VecType _x_next_p = load_to_reg(_x_half[i]);
-        VecType _y_p = load_to_reg(_y[i]);
+        VecType _x_p = LoadToReg(_x[i]);
+        VecType _x_next_p = LoadToReg(_x_half[i]);
+        VecType _y_p = LoadToReg(_y[i]);
 
         STORE_LOW(_x + i, ADD_MOD(_x_p, _y_p, F4));
         STORE_LOW(_x_half + i, ADD_MOD(_x_next_p, _y_p, F4));
@@ -236,8 +236,8 @@ inline void hadamard_mul_rem(unsigned n, __uint128_t* x, __uint128_t* y)
     HalfVecType* _x = reinterpret_cast<HalfVecType*>(x);
     HalfVecType* _y = reinterpret_cast<HalfVecType*>(y);
     for (unsigned i = 0; i < n; ++i) {
-        VecType _x_p = load_to_reg(_x[i]);
-        VecType _y_p = load_to_reg(_y[i]);
+        VecType _x_p = LoadToReg(_x[i]);
+        VecType _y_p = LoadToReg(_y[i]);
 
         STORE_LOW(_x + i, MULFULL_MOD(_x_p, _y_p, F4));
     }
@@ -253,9 +253,9 @@ inline void hadamard_mul_doubled_rem(
     HalfVecType* _x_half = reinterpret_cast<HalfVecType*>(x_half);
     HalfVecType* _y = reinterpret_cast<HalfVecType*>(y);
     for (unsigned i = 0; i < n; ++i) {
-        VecType _x_p = load_to_reg(_x[i]);
-        VecType _x_next_p = load_to_reg(_x_half[i]);
-        VecType _y_p = load_to_reg(_y[i]);
+        VecType _x_p = LoadToReg(_x[i]);
+        VecType _x_next_p = LoadToReg(_x_half[i]);
+        VecType _y_p = LoadToReg(_y[i]);
 
         STORE_LOW(_x + i, MULFULL_MOD(_x_p, _y_p, F4));
         STORE_LOW(_x_half + i, MULFULL_MOD(_x_next_p, _y_p, F4));
@@ -264,7 +264,7 @@ inline void hadamard_mul_doubled_rem(
 
 #elif defined(__SSE4_1__)
 
-inline VecType load_to_reg(__uint128_t x)
+inline VecType LoadToReg(__uint128_t x)
 {
     const VecType* _x = reinterpret_cast<const VecType*>(&x);
     return _mm_load_si128(_x);
@@ -273,8 +273,8 @@ inline VecType load_to_reg(__uint128_t x)
 inline __uint128_t add(__uint128_t a, __uint128_t b)
 {
     VecType res;
-    VecType _a = load_to_reg(a);
-    VecType _b = load_to_reg(b);
+    VecType _a = LoadToReg(a);
+    VecType _b = LoadToReg(b);
     STORE(&res, ADD_MOD(_a, _b, F4));
     return reinterpret_cast<__uint128_t>(res);
 }
@@ -282,8 +282,8 @@ inline __uint128_t add(__uint128_t a, __uint128_t b)
 inline __uint128_t sub(__uint128_t a, __uint128_t b)
 {
     VecType res;
-    VecType _a = load_to_reg(a);
-    VecType _b = load_to_reg(b);
+    VecType _a = LoadToReg(a);
+    VecType _b = LoadToReg(b);
     STORE(&res, SUB_MOD(_a, _b, F4));
     return reinterpret_cast<__uint128_t>(res);
 }
@@ -291,8 +291,8 @@ inline __uint128_t sub(__uint128_t a, __uint128_t b)
 inline __uint128_t mul(__uint128_t a, __uint128_t b)
 {
     VecType res;
-    VecType _a = load_to_reg(a);
-    VecType _b = load_to_reg(b);
+    VecType _a = LoadToReg(a);
+    VecType _b = LoadToReg(b);
     STORE(&res, MULFULL_MOD(_a, _b, F4));
     return reinterpret_cast<__uint128_t>(res);
 }

From 96617b349e33160e13d437aaaf69a5fc711aa71f Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Tue, 30 Oct 2018 12:00:25 +0100
Subject: [PATCH 43/77] SIMD: rename STORE to StoreToMem

---
 src/simd_128.h |  2 +-
 src/simd_256.h |  2 +-
 src/simd_fnt.h | 64 +++++++++++++++++++++++++-------------------------
 src/simd_nf4.h |  6 ++---
 4 files changed, 37 insertions(+), 37 deletions(-)

diff --git a/src/simd_128.h b/src/simd_128.h
index e04cf6cd..545324d6 100644
--- a/src/simd_128.h
+++ b/src/simd_128.h
@@ -60,7 +60,7 @@ inline VecType LoadToReg(VecType* address)
 {
     return _mm_load_si128(address);
 }
-inline void STORE(VecType* address, VecType reg)
+inline void StoreToMem(VecType* address, VecType reg)
 {
     _mm_store_si128(address, reg);
 }
diff --git a/src/simd_256.h b/src/simd_256.h
index 80953e92..cd8df25b 100644
--- a/src/simd_256.h
+++ b/src/simd_256.h
@@ -73,7 +73,7 @@ inline VecType LoadToReg(VecType* address)
 {
     return _mm256_load_si256(address);
 }
-inline void STORE(VecType* address, VecType reg)
+inline void StoreToMem(VecType* address, VecType reg)
 {
     _mm256_store_si256(address, reg);
 }
diff --git a/src/simd_fnt.h b/src/simd_fnt.h
index 7e025610..08b73636 100644
--- a/src/simd_fnt.h
+++ b/src/simd_fnt.h
@@ -167,10 +167,10 @@ inline void butterfly_ct_step(
             BUTTERFLY_CT(rp1, c, &x2, &y2, card);
 
             // Store back to memory
-            STORE(p + j, x1);
-            STORE(p + j + 1, x2);
-            STORE(q + j, y1);
-            STORE(q + j + 1, y2);
+            StoreToMem(p + j, x1);
+            StoreToMem(p + j + 1, x2);
+            StoreToMem(q + j, y1);
+            StoreToMem(q + j + 1, y2);
         }
         for (; j < len; ++j) {
             x1 = LoadToReg(p + j);
@@ -179,8 +179,8 @@ inline void butterfly_ct_step(
             BUTTERFLY_CT(rp1, c, &x1, &y1, card);
 
             // Store back to memory
-            STORE(p + j, x1);
-            STORE(q + j, y1);
+            StoreToMem(p + j, x1);
+            StoreToMem(q + j, y1);
         }
     }
 }
@@ -237,15 +237,15 @@ inline static void do_butterfly_ct_2_layers(
         BUTTERFLY_CT(r3p1, c3, &y2, &v2, card);
 
         // Store back to memory
-        STORE(p, x1);
-        STORE(p + 1, x2);
-        STORE(q, y1);
-        STORE(q + 1, y2);
-
-        STORE(r, u1);
-        STORE(r + 1, u2);
-        STORE(s, v1);
-        STORE(s + 1, v2);
+        StoreToMem(p, x1);
+        StoreToMem(p + 1, x2);
+        StoreToMem(q, y1);
+        StoreToMem(q + 1, y2);
+
+        StoreToMem(r, u1);
+        StoreToMem(r + 1, u2);
+        StoreToMem(s, v1);
+        StoreToMem(s + 1, v2);
         p = p + 2;
         q = q + 2;
         r = r + 2;
@@ -267,10 +267,10 @@ inline static void do_butterfly_ct_2_layers(
         BUTTERFLY_CT(r3p1, c3, &y1, &v1, card);
 
         // Store back to memory
-        STORE(p + j, x1);
-        STORE(q + j, y1);
-        STORE(r + j, u1);
-        STORE(s + j, v1);
+        StoreToMem(p + j, x1);
+        StoreToMem(q + j, y1);
+        StoreToMem(r + j, u1);
+        StoreToMem(s + j, v1);
     }
 }
 
@@ -381,14 +381,14 @@ inline void butterfly_gs_step(
             BUTTERFLY_GS(rp1, c, &x4, &y4, card);
 
             // Store back to memory
-            STORE(p + j, x1);
-            STORE(p + j + 1, x2);
-            STORE(p + j + 2, x3);
-            STORE(p + j + 3, x4);
-            STORE(q + j, y1);
-            STORE(q + j + 1, y2);
-            STORE(q + j + 2, y3);
-            STORE(q + j + 3, y4);
+            StoreToMem(p + j, x1);
+            StoreToMem(p + j + 1, x2);
+            StoreToMem(p + j + 2, x3);
+            StoreToMem(p + j + 3, x4);
+            StoreToMem(q + j, y1);
+            StoreToMem(q + j + 1, y2);
+            StoreToMem(q + j + 2, y3);
+            StoreToMem(q + j + 3, y4);
         }
         for (; j < len; ++j) {
             x1 = LoadToReg(p + j);
@@ -397,8 +397,8 @@ inline void butterfly_gs_step(
             BUTTERFLY_GS(rp1, c, &x1, &y1, card);
 
             // Store back to memory
-            STORE(p + j, x1);
-            STORE(q + j, y1);
+            StoreToMem(p + j, x1);
+            StoreToMem(q + j, y1);
         }
     }
 }
@@ -450,8 +450,8 @@ inline void butterfly_gs_step_simple(
             y2 = BUTTERFLY_GS_SIMPLE(rp1, c, x2, card);
 
             // Store back to memory
-            STORE(q + j, y1);
-            STORE(q + j + 1, y2);
+            StoreToMem(q + j, y1);
+            StoreToMem(q + j + 1, y2);
         }
         for (; j < len; ++j) {
             x1 = LoadToReg(p + j);
@@ -459,7 +459,7 @@ inline void butterfly_gs_step_simple(
             y1 = BUTTERFLY_GS_SIMPLE(rp1, c, x1, card);
 
             // Store back to memory
-            STORE(q + j, y1);
+            StoreToMem(q + j, y1);
         }
     }
 }
diff --git a/src/simd_nf4.h b/src/simd_nf4.h
index 9cf558e2..eac6af80 100644
--- a/src/simd_nf4.h
+++ b/src/simd_nf4.h
@@ -275,7 +275,7 @@ inline __uint128_t add(__uint128_t a, __uint128_t b)
     VecType res;
     VecType _a = LoadToReg(a);
     VecType _b = LoadToReg(b);
-    STORE(&res, ADD_MOD(_a, _b, F4));
+    StoreToMem(&res, ADD_MOD(_a, _b, F4));
     return reinterpret_cast<__uint128_t>(res);
 }
 
@@ -284,7 +284,7 @@ inline __uint128_t sub(__uint128_t a, __uint128_t b)
     VecType res;
     VecType _a = LoadToReg(a);
     VecType _b = LoadToReg(b);
-    STORE(&res, SUB_MOD(_a, _b, F4));
+    StoreToMem(&res, SUB_MOD(_a, _b, F4));
     return reinterpret_cast<__uint128_t>(res);
 }
 
@@ -293,7 +293,7 @@ inline __uint128_t mul(__uint128_t a, __uint128_t b)
     VecType res;
     VecType _a = LoadToReg(a);
     VecType _b = LoadToReg(b);
-    STORE(&res, MULFULL_MOD(_a, _b, F4));
+    StoreToMem(&res, MULFULL_MOD(_a, _b, F4));
     return reinterpret_cast<__uint128_t>(res);
 }
 

From 96dc58bd359116a1737b94a5a8b9f827798348f8 Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Tue, 30 Oct 2018 12:01:08 +0100
Subject: [PATCH 44/77] SIMD: rename AND to And

---
 src/simd_128.h   | 2 +-
 src/simd_256.h   | 2 +-
 src/simd_basic.h | 8 ++++----
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/simd_128.h b/src/simd_128.h
index 545324d6..ffc16309 100644
--- a/src/simd_128.h
+++ b/src/simd_128.h
@@ -65,7 +65,7 @@ inline void StoreToMem(VecType* address, VecType reg)
     _mm_store_si128(address, reg);
 }
 
-inline VecType AND(VecType x, VecType y)
+inline VecType And(VecType x, VecType y)
 {
     return _mm_and_si128(x, y);
 }
diff --git a/src/simd_256.h b/src/simd_256.h
index cd8df25b..efa167de 100644
--- a/src/simd_256.h
+++ b/src/simd_256.h
@@ -78,7 +78,7 @@ inline void StoreToMem(VecType* address, VecType reg)
     _mm256_store_si256(address, reg);
 }
 
-inline VecType AND(VecType x, VecType y)
+inline VecType And(VecType x, VecType y)
 {
     return _mm256_and_si256(x, y);
 }
diff --git a/src/simd_basic.h b/src/simd_basic.h
index 106f620b..54654ef9 100644
--- a/src/simd_basic.h
+++ b/src/simd_basic.h
@@ -132,13 +132,13 @@ inline VecType MULFULL_MOD(VecType x, VecType y, T q)
     const VecType res = MUL_MOD(x, y, q);
 
     // filter elements of both of a & b = card-1
-    const VecType cmp = AND(CMPEQ<T>(x, CARD_M_1(q)), CMPEQ<T>(y, CARD_M_1(q)));
+    const VecType cmp = And(CMPEQ<T>(x, CARD_M_1(q)), CMPEQ<T>(y, CARD_M_1(q)));
 
     if (is_all_zeros(cmp) == 1) {
         return res;
     }
-    return (q == F3) ? XOR(res, AND(F4_u32, cmp))
-                     : ADD<T>(res, AND(ONE32, cmp));
+    return (q == F3) ? XOR(res, And(F4_u32, cmp))
+                     : ADD<T>(res, And(ONE32, cmp));
 }
 
 /**
@@ -161,7 +161,7 @@ inline void ADD_PROPS(
     T max)
 {
     const VecType b = CMPEQ<T>(threshold, symb);
-    const VecType c = AND(mask, b);
+    const VecType c = And(mask, b);
     auto d = MVMSK8(c);
     const unsigned element_size = sizeof(T);
     while (d > 0) {

From f2114e325fae25975c10456def2f4f7c85543c0c Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Tue, 30 Oct 2018 12:01:26 +0100
Subject: [PATCH 45/77] SIMD: rename XOR to Xor

---
 src/simd_128.h   | 2 +-
 src/simd_256.h   | 2 +-
 src/simd_basic.h | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/simd_128.h b/src/simd_128.h
index ffc16309..029a87df 100644
--- a/src/simd_128.h
+++ b/src/simd_128.h
@@ -69,7 +69,7 @@ inline VecType And(VecType x, VecType y)
 {
     return _mm_and_si128(x, y);
 }
-inline VecType XOR(VecType x, VecType y)
+inline VecType Xor(VecType x, VecType y)
 {
     return _mm_xor_si128(x, y);
 }
diff --git a/src/simd_256.h b/src/simd_256.h
index efa167de..392fb414 100644
--- a/src/simd_256.h
+++ b/src/simd_256.h
@@ -82,7 +82,7 @@ inline VecType And(VecType x, VecType y)
 {
     return _mm256_and_si256(x, y);
 }
-inline VecType XOR(VecType x, VecType y)
+inline VecType Xor(VecType x, VecType y)
 {
     return _mm256_xor_si256(x, y);
 }
diff --git a/src/simd_basic.h b/src/simd_basic.h
index 54654ef9..c07c592c 100644
--- a/src/simd_basic.h
+++ b/src/simd_basic.h
@@ -137,7 +137,7 @@ inline VecType MULFULL_MOD(VecType x, VecType y, T q)
     if (is_all_zeros(cmp) == 1) {
         return res;
     }
-    return (q == F3) ? XOR(res, And(F4_u32, cmp))
+    return (q == F3) ? Xor(res, And(F4_u32, cmp))
                      : ADD<T>(res, And(ONE32, cmp));
 }
 

From 1df8169f90324052f28d3918cd0976518cd35508 Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Tue, 30 Oct 2018 12:02:53 +0100
Subject: [PATCH 46/77] SIMD: rename MVMSK8 to Msb8Mask

---
 src/simd_128.h   | 2 +-
 src/simd_256.h   | 2 +-
 src/simd_basic.h | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/simd_128.h b/src/simd_128.h
index 029a87df..47b6edc3 100644
--- a/src/simd_128.h
+++ b/src/simd_128.h
@@ -73,7 +73,7 @@ inline VecType Xor(VecType x, VecType y)
 {
     return _mm_xor_si128(x, y);
 }
-inline uint16_t MVMSK8(VecType x)
+inline uint16_t Msb8Mask(VecType x)
 {
     return _mm_movemask_epi8(x);
 }
diff --git a/src/simd_256.h b/src/simd_256.h
index 392fb414..4fe42fc4 100644
--- a/src/simd_256.h
+++ b/src/simd_256.h
@@ -86,7 +86,7 @@ inline VecType Xor(VecType x, VecType y)
 {
     return _mm256_xor_si256(x, y);
 }
-inline uint32_t MVMSK8(VecType x)
+inline uint32_t Msb8Mask(VecType x)
 {
     return _mm256_movemask_epi8(x);
 }
diff --git a/src/simd_basic.h b/src/simd_basic.h
index c07c592c..71818702 100644
--- a/src/simd_basic.h
+++ b/src/simd_basic.h
@@ -162,7 +162,7 @@ inline void ADD_PROPS(
 {
     const VecType b = CMPEQ<T>(threshold, symb);
     const VecType c = And(mask, b);
-    auto d = MVMSK8(c);
+    auto d = Msb8Mask(c);
     const unsigned element_size = sizeof(T);
     while (d > 0) {
         const unsigned byte_idx = __builtin_ctz(d);

From 18399bdee827e0bc1c1400d13566095728857888 Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Tue, 30 Oct 2018 12:09:02 +0100
Subject: [PATCH 47/77] SIMD: rename TESTZ to AndIsZero

---
 src/simd_128.h |  2 +-
 src/simd_256.h |  2 +-
 src/simd_fnt.h | 10 +++++-----
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/simd_128.h b/src/simd_128.h
index 47b6edc3..9f24655e 100644
--- a/src/simd_128.h
+++ b/src/simd_128.h
@@ -77,7 +77,7 @@ inline uint16_t Msb8Mask(VecType x)
 {
     return _mm_movemask_epi8(x);
 }
-inline uint16_t TESTZ(VecType x, VecType y)
+inline uint16_t AndIsZero(VecType x, VecType y)
 {
     return _mm_testz_si128(x, y);
 }
diff --git a/src/simd_256.h b/src/simd_256.h
index 4fe42fc4..01484616 100644
--- a/src/simd_256.h
+++ b/src/simd_256.h
@@ -90,7 +90,7 @@ inline uint32_t Msb8Mask(VecType x)
 {
     return _mm256_movemask_epi8(x);
 }
-inline uint32_t TESTZ(VecType x, VecType y)
+inline uint32_t AndIsZero(VecType x, VecType y)
 {
     return _mm256_testz_si256(x, y);
 }
diff --git a/src/simd_fnt.h b/src/simd_fnt.h
index 08b73636..53fb8df3 100644
--- a/src/simd_fnt.h
+++ b/src/simd_fnt.h
@@ -491,22 +491,22 @@ inline void encode_post_process(
             VecType a3 = LoadToReg(buf + vec_id + 2);
             VecType a4 = LoadToReg(buf + vec_id + 3);
 
-            if (TESTZ(a1, _threshold) == 0) {
+            if (AndIsZero(a1, _threshold) == 0) {
                 const off_t curr_offset = offset + vec_id * vec_size;
                 ADD_PROPS(
                     props[frag_id], _threshold, mask_hi, a1, curr_offset, max);
             }
-            if (TESTZ(a2, _threshold) == 0) {
+            if (AndIsZero(a2, _threshold) == 0) {
                 const off_t curr_offset = offset + (vec_id + 1) * vec_size;
                 ADD_PROPS(
                     props[frag_id], _threshold, mask_hi, a2, curr_offset, max);
             }
-            if (TESTZ(a3, _threshold) == 0) {
+            if (AndIsZero(a3, _threshold) == 0) {
                 const off_t curr_offset = offset + (vec_id + 2) * vec_size;
                 ADD_PROPS(
                     props[frag_id], _threshold, mask_hi, a3, curr_offset, max);
             }
-            if (TESTZ(a4, _threshold) == 0) {
+            if (AndIsZero(a4, _threshold) == 0) {
                 const off_t curr_offset = offset + (vec_id + 3) * vec_size;
                 ADD_PROPS(
                     props[frag_id], _threshold, mask_hi, a4, curr_offset, max);
@@ -514,7 +514,7 @@ inline void encode_post_process(
         }
         for (; vec_id < vecs_nb; ++vec_id) {
             VecType a = LoadToReg(buf + vec_id);
-            uint32_t c = TESTZ(a, _threshold);
+            uint32_t c = AndIsZero(a, _threshold);
             if (c == 0) {
                 const off_t curr_offset = offset + vec_id * vec_size;
                 ADD_PROPS(

From a873b88e6191ee93761ed4cbea6f74cd0b636975 Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Tue, 30 Oct 2018 12:09:25 +0100
Subject: [PATCH 48/77] SIMD: rename is_all_zeros

---
 src/simd_128.h   | 2 +-
 src/simd_256.h   | 2 +-
 src/simd_basic.h | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/simd_128.h b/src/simd_128.h
index 9f24655e..7b241453 100644
--- a/src/simd_128.h
+++ b/src/simd_128.h
@@ -81,7 +81,7 @@ inline uint16_t AndIsZero(VecType x, VecType y)
 {
     return _mm_testz_si128(x, y);
 }
-inline int is_all_zeros(VecType x)
+inline int IsZero(VecType x)
 {
     return _mm_testc_si128(ZERO, x);
 }
diff --git a/src/simd_256.h b/src/simd_256.h
index 01484616..e9048772 100644
--- a/src/simd_256.h
+++ b/src/simd_256.h
@@ -94,7 +94,7 @@ inline uint32_t AndIsZero(VecType x, VecType y)
 {
     return _mm256_testz_si256(x, y);
 }
-inline int is_all_zeros(VecType x)
+inline int IsZero(VecType x)
 {
     return _mm256_testc_si256(ZERO, x);
 }
diff --git a/src/simd_basic.h b/src/simd_basic.h
index 71818702..4b5cf763 100644
--- a/src/simd_basic.h
+++ b/src/simd_basic.h
@@ -134,7 +134,7 @@ inline VecType MULFULL_MOD(VecType x, VecType y, T q)
     // filter elements of both of a & b = card-1
     const VecType cmp = And(CMPEQ<T>(x, CARD_M_1(q)), CMPEQ<T>(y, CARD_M_1(q)));
 
-    if (is_all_zeros(cmp) == 1) {
+    if (IsZero(cmp) == 1) {
         return res;
     }
     return (q == F3) ? Xor(res, And(F4_u32, cmp))

From 02be87fd86a07a1d1c3ae9cfca4c17d46769c24d Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Tue, 30 Oct 2018 12:10:00 +0100
Subject: [PATCH 49/77] SIMD: rename SET1

---
 src/simd_128.h   |  6 +++---
 src/simd_256.h   |  6 +++---
 src/simd_basic.h |  2 +-
 src/simd_fnt.h   | 16 ++++++++--------
 4 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/src/simd_128.h b/src/simd_128.h
index 7b241453..c453c19b 100644
--- a/src/simd_128.h
+++ b/src/simd_128.h
@@ -93,14 +93,14 @@ inline int IsZero(VecType x)
 /* ================= Essential Operations for SSE ================= */
 
 template <typename T>
-inline VecType SET1(T val);
+inline VecType SetOne(T val);
 template <>
-inline VecType SET1(uint32_t val)
+inline VecType SetOne(uint32_t val)
 {
     return _mm_set1_epi32(val);
 }
 template <>
-inline VecType SET1(uint16_t val)
+inline VecType SetOne(uint16_t val)
 {
     return _mm_set1_epi16(val);
 }
diff --git a/src/simd_256.h b/src/simd_256.h
index e9048772..07be31de 100644
--- a/src/simd_256.h
+++ b/src/simd_256.h
@@ -106,14 +106,14 @@ inline int IsZero(VecType x)
 /* ================= Essential Operations for AVX2 ================= */
 
 template <typename T>
-inline VecType SET1(T val);
+inline VecType SetOne(T val);
 template <>
-inline VecType SET1(uint32_t val)
+inline VecType SetOne(uint32_t val)
 {
     return _mm256_set1_epi32(val);
 }
 template <>
-inline VecType SET1(uint16_t val)
+inline VecType SetOne(uint16_t val)
 {
     return _mm256_set1_epi16(val);
 }
diff --git a/src/simd_basic.h b/src/simd_basic.h
index 4b5cf763..52dd3305 100644
--- a/src/simd_basic.h
+++ b/src/simd_basic.h
@@ -181,7 +181,7 @@ inline void ADD_PROPS(
 template <typename T>
 inline void mul_coef_to_buf(const T a, T* src, T* dest, size_t len, T card)
 {
-    const VecType coef = SET1(a);
+    const VecType coef = SetOne(a);
 
     VecType* __restrict _src = reinterpret_cast<VecType*>(src);
     VecType* __restrict _dest = reinterpret_cast<VecType*>(dest);
diff --git a/src/simd_fnt.h b/src/simd_fnt.h
index 53fb8df3..1b1843cd 100644
--- a/src/simd_fnt.h
+++ b/src/simd_fnt.h
@@ -143,7 +143,7 @@ inline void butterfly_ct_step(
         return;
     }
     const T rp1 = r + 1;
-    VecType c = SET1(r);
+    VecType c = SetOne(r);
 
     const size_t end = (len > 1) ? len - 1 : 0;
     const unsigned bufs_nb = buf.get_n();
@@ -200,9 +200,9 @@ inline static void do_butterfly_ct_2_layers(
     const T r2p1 = r2 + 1;
     const T r3p1 = r3 + 1;
 
-    VecType c1 = SET1(r1);
-    VecType c2 = SET1(r2);
-    VecType c3 = SET1(r3);
+    VecType c1 = SetOne(r1);
+    VecType c2 = SetOne(r2);
+    VecType c3 = SetOne(r3);
 
     VecType* __restrict p = reinterpret_cast<VecType*>(mem[start]);
     VecType* __restrict q = reinterpret_cast<VecType*>(mem[start + m]);
@@ -353,7 +353,7 @@ inline void butterfly_gs_step(
     }
     const unsigned step = m << 1;
     const T rp1 = r + 1;
-    VecType c = SET1(r);
+    VecType c = SetOne(r);
 
     const size_t end = (len > 3) ? len - 3 : 0;
     const unsigned bufs_nb = buf.get_n();
@@ -430,7 +430,7 @@ inline void butterfly_gs_step_simple(
     }
     const unsigned step = m << 1;
     const T rp1 = r + 1;
-    VecType c = SET1(r);
+    VecType c = SetOne(r);
 
     const size_t end = (len > 1) ? len - 1 : 0;
     const unsigned bufs_nb = buf.get_n();
@@ -476,8 +476,8 @@ inline void encode_post_process(
     const unsigned element_size = sizeof(T);
     const unsigned vec_size = countof<T>();
     const T max = 1 << (element_size * 8 - 1);
-    const VecType _threshold = SET1(threshold);
-    const VecType mask_hi = SET1(max);
+    const VecType _threshold = SetOne(threshold);
+    const VecType mask_hi = SetOne(max);
 
     const std::vector<T*>& mem = output.get_mem();
     for (unsigned frag_id = 0; frag_id < code_len; ++frag_id) {

From 76abc31ebfb6cf6de2bbeee04911c33fe6a95149 Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Tue, 30 Oct 2018 12:10:18 +0100
Subject: [PATCH 50/77] SIMD: rename ADD

---
 src/simd_128.h   | 6 +++---
 src/simd_256.h   | 6 +++---
 src/simd_basic.h | 6 +++---
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/simd_128.h b/src/simd_128.h
index c453c19b..134131df 100644
--- a/src/simd_128.h
+++ b/src/simd_128.h
@@ -106,14 +106,14 @@ inline VecType SetOne(uint16_t val)
 }
 
 template <typename T>
-inline VecType ADD(VecType x, VecType y);
+inline VecType Add(VecType x, VecType y);
 template <>
-inline VecType ADD<uint32_t>(VecType x, VecType y)
+inline VecType Add<uint32_t>(VecType x, VecType y)
 {
     return _mm_add_epi32(x, y);
 }
 template <>
-inline VecType ADD<uint16_t>(VecType x, VecType y)
+inline VecType Add<uint16_t>(VecType x, VecType y)
 {
     return _mm_add_epi16(x, y);
 }
diff --git a/src/simd_256.h b/src/simd_256.h
index 07be31de..4cde3dae 100644
--- a/src/simd_256.h
+++ b/src/simd_256.h
@@ -119,14 +119,14 @@ inline VecType SetOne(uint16_t val)
 }
 
 template <typename T>
-inline VecType ADD(VecType x, VecType y);
+inline VecType Add(VecType x, VecType y);
 template <>
-inline VecType ADD<uint32_t>(VecType x, VecType y)
+inline VecType Add<uint32_t>(VecType x, VecType y)
 {
     return _mm256_add_epi32(x, y);
 }
 template <>
-inline VecType ADD<uint16_t>(VecType x, VecType y)
+inline VecType Add<uint16_t>(VecType x, VecType y)
 {
     return _mm256_add_epi16(x, y);
 }
diff --git a/src/simd_basic.h b/src/simd_basic.h
index 52dd3305..7a90e3ec 100644
--- a/src/simd_basic.h
+++ b/src/simd_basic.h
@@ -61,7 +61,7 @@ inline VecType CARD_M_1(T q)
 template <typename T>
 inline VecType ADD_MOD(VecType x, VecType y, T q)
 {
-    const VecType res = ADD<T>(x, y);
+    const VecType res = Add<T>(x, y);
     return MIN<T>(res, SUB<T>(res, CARD(q)));
 }
 
@@ -77,7 +77,7 @@ template <typename T>
 inline VecType SUB_MOD(VecType x, VecType y, T q)
 {
     const VecType res = SUB<T>(x, y);
-    return MIN<T>(res, ADD<T>(res, CARD(q)));
+    return MIN<T>(res, Add<T>(res, CARD(q)));
 }
 
 /**
@@ -138,7 +138,7 @@ inline VecType MULFULL_MOD(VecType x, VecType y, T q)
         return res;
     }
     return (q == F3) ? Xor(res, And(F4_u32, cmp))
-                     : ADD<T>(res, And(ONE32, cmp));
+                     : Add<T>(res, And(ONE32, cmp));
 }
 
 /**

From 3a5beca6796a0b33f3d60866e8ac76e35c14c2c8 Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Tue, 30 Oct 2018 12:10:37 +0100
Subject: [PATCH 51/77] SIMD: rename SUB

---
 src/simd_128.h   | 6 +++---
 src/simd_256.h   | 6 +++---
 src/simd_basic.h | 8 ++++----
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/simd_128.h b/src/simd_128.h
index 134131df..6431099b 100644
--- a/src/simd_128.h
+++ b/src/simd_128.h
@@ -119,14 +119,14 @@ inline VecType Add<uint16_t>(VecType x, VecType y)
 }
 
 template <typename T>
-inline VecType SUB(VecType x, VecType y);
+inline VecType Sub(VecType x, VecType y);
 template <>
-inline VecType SUB<uint32_t>(VecType x, VecType y)
+inline VecType Sub<uint32_t>(VecType x, VecType y)
 {
     return _mm_sub_epi32(x, y);
 }
 template <>
-inline VecType SUB<uint16_t>(VecType x, VecType y)
+inline VecType Sub<uint16_t>(VecType x, VecType y)
 {
     return _mm_sub_epi16(x, y);
 }
diff --git a/src/simd_256.h b/src/simd_256.h
index 4cde3dae..b03f4c8d 100644
--- a/src/simd_256.h
+++ b/src/simd_256.h
@@ -132,14 +132,14 @@ inline VecType Add<uint16_t>(VecType x, VecType y)
 }
 
 template <typename T>
-inline VecType SUB(VecType x, VecType y);
+inline VecType Sub(VecType x, VecType y);
 template <>
-inline VecType SUB<uint32_t>(VecType x, VecType y)
+inline VecType Sub<uint32_t>(VecType x, VecType y)
 {
     return _mm256_sub_epi32(x, y);
 }
 template <>
-inline VecType SUB<uint16_t>(VecType x, VecType y)
+inline VecType Sub<uint16_t>(VecType x, VecType y)
 {
     return _mm256_sub_epi16(x, y);
 }
diff --git a/src/simd_basic.h b/src/simd_basic.h
index 7a90e3ec..ea1869e8 100644
--- a/src/simd_basic.h
+++ b/src/simd_basic.h
@@ -62,7 +62,7 @@ template <typename T>
 inline VecType ADD_MOD(VecType x, VecType y, T q)
 {
     const VecType res = Add<T>(x, y);
-    return MIN<T>(res, SUB<T>(res, CARD(q)));
+    return MIN<T>(res, Sub<T>(res, CARD(q)));
 }
 
 /**
@@ -76,7 +76,7 @@ inline VecType ADD_MOD(VecType x, VecType y, T q)
 template <typename T>
 inline VecType SUB_MOD(VecType x, VecType y, T q)
 {
-    const VecType res = SUB<T>(x, y);
+    const VecType res = Sub<T>(x, y);
     return MIN<T>(res, Add<T>(res, CARD(q)));
 }
 
@@ -90,8 +90,8 @@ inline VecType SUB_MOD(VecType x, VecType y, T q)
 template <typename T>
 inline VecType NEG_MOD(VecType x, T q)
 {
-    const VecType res = SUB<T>(CARD(q), x);
-    return MIN<T>(res, SUB<T>(res, CARD(q)));
+    const VecType res = Sub<T>(CARD(q), x);
+    return MIN<T>(res, Sub<T>(res, CARD(q)));
 }
 
 /**

From 5d7d0fd1724a707ab9b669eae1ee35984e354ee3 Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Tue, 30 Oct 2018 12:10:59 +0100
Subject: [PATCH 52/77] SIMD: rename MUL

---
 src/simd_128.h   | 6 +++---
 src/simd_256.h   | 6 +++---
 src/simd_basic.h | 2 +-
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/simd_128.h b/src/simd_128.h
index 6431099b..0eaa0b28 100644
--- a/src/simd_128.h
+++ b/src/simd_128.h
@@ -132,14 +132,14 @@ inline VecType Sub<uint16_t>(VecType x, VecType y)
 }
 
 template <typename T>
-inline VecType MUL(VecType x, VecType y);
+inline VecType Mul(VecType x, VecType y);
 template <>
-inline VecType MUL<uint32_t>(VecType x, VecType y)
+inline VecType Mul<uint32_t>(VecType x, VecType y)
 {
     return _mm_mullo_epi32(x, y);
 }
 template <>
-inline VecType MUL<uint16_t>(VecType x, VecType y)
+inline VecType Mul<uint16_t>(VecType x, VecType y)
 {
     return _mm_mullo_epi16(x, y);
 }
diff --git a/src/simd_256.h b/src/simd_256.h
index b03f4c8d..f3a73ea1 100644
--- a/src/simd_256.h
+++ b/src/simd_256.h
@@ -145,14 +145,14 @@ inline VecType Sub<uint16_t>(VecType x, VecType y)
 }
 
 template <typename T>
-inline VecType MUL(VecType x, VecType y);
+inline VecType Mul(VecType x, VecType y);
 template <>
-inline VecType MUL<uint32_t>(VecType x, VecType y)
+inline VecType Mul<uint32_t>(VecType x, VecType y)
 {
     return _mm256_mullo_epi32(x, y);
 }
 template <>
-inline VecType MUL<uint16_t>(VecType x, VecType y)
+inline VecType Mul<uint16_t>(VecType x, VecType y)
 {
     return _mm256_mullo_epi16(x, y);
 }
diff --git a/src/simd_basic.h b/src/simd_basic.h
index ea1869e8..1db3623f 100644
--- a/src/simd_basic.h
+++ b/src/simd_basic.h
@@ -108,7 +108,7 @@ inline VecType NEG_MOD(VecType x, T q)
 template <typename T>
 inline VecType MUL_MOD(VecType x, VecType y, T q)
 {
-    const VecType res = MUL<T>(x, y);
+    const VecType res = Mul<T>(x, y);
     const VecType lo =
         (q == F3) ? BLEND8(ZERO, res, MASK8_LO) : BLEND16(ZERO, res, 0x55);
     const VecType hi = (q == F3) ? BLEND8(ZERO, SHIFTR(res, 1), MASK8_LO)

From 4ac96507e358762c70e4c8cd9a383cde19a3516d Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Tue, 30 Oct 2018 12:11:38 +0100
Subject: [PATCH 53/77] SIMD: rename CMPEQ

---
 src/simd_128.h   | 6 +++---
 src/simd_256.h   | 6 +++---
 src/simd_basic.h | 5 +++--
 3 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/src/simd_128.h b/src/simd_128.h
index 0eaa0b28..c8d30c90 100644
--- a/src/simd_128.h
+++ b/src/simd_128.h
@@ -145,14 +145,14 @@ inline VecType Mul<uint16_t>(VecType x, VecType y)
 }
 
 template <typename T>
-inline VecType CMPEQ(VecType x, VecType y);
+inline VecType CompareEq(VecType x, VecType y);
 template <>
-inline VecType CMPEQ<uint32_t>(VecType x, VecType y)
+inline VecType CompareEq<uint32_t>(VecType x, VecType y)
 {
     return _mm_cmpeq_epi32(x, y);
 }
 template <>
-inline VecType CMPEQ<uint16_t>(VecType x, VecType y)
+inline VecType CompareEq<uint16_t>(VecType x, VecType y)
 {
     return _mm_cmpeq_epi16(x, y);
 }
diff --git a/src/simd_256.h b/src/simd_256.h
index f3a73ea1..bd06232c 100644
--- a/src/simd_256.h
+++ b/src/simd_256.h
@@ -158,14 +158,14 @@ inline VecType Mul<uint16_t>(VecType x, VecType y)
 }
 
 template <typename T>
-inline VecType CMPEQ(VecType x, VecType y);
+inline VecType CompareEq(VecType x, VecType y);
 template <>
-inline VecType CMPEQ<uint32_t>(VecType x, VecType y)
+inline VecType CompareEq<uint32_t>(VecType x, VecType y)
 {
     return _mm256_cmpeq_epi32(x, y);
 }
 template <>
-inline VecType CMPEQ<uint16_t>(VecType x, VecType y)
+inline VecType CompareEq<uint16_t>(VecType x, VecType y)
 {
     return _mm256_cmpeq_epi16(x, y);
 }
diff --git a/src/simd_basic.h b/src/simd_basic.h
index 1db3623f..1cad74e1 100644
--- a/src/simd_basic.h
+++ b/src/simd_basic.h
@@ -132,7 +132,8 @@ inline VecType MULFULL_MOD(VecType x, VecType y, T q)
     const VecType res = MUL_MOD(x, y, q);
 
     // filter elements of both of a & b = card-1
-    const VecType cmp = And(CMPEQ<T>(x, CARD_M_1(q)), CMPEQ<T>(y, CARD_M_1(q)));
+    const VecType cmp =
+        And(CompareEq<T>(x, CARD_M_1(q)), CompareEq<T>(y, CARD_M_1(q)));
 
     if (IsZero(cmp) == 1) {
         return res;
@@ -160,7 +161,7 @@ inline void ADD_PROPS(
     off_t offset,
     T max)
 {
-    const VecType b = CMPEQ<T>(threshold, symb);
+    const VecType b = CompareEq<T>(threshold, symb);
     const VecType c = And(mask, b);
     auto d = Msb8Mask(c);
     const unsigned element_size = sizeof(T);

From 91fd3ee3cb80d5979a87e6a113c48fa86a2b1b93 Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Tue, 30 Oct 2018 12:14:21 +0100
Subject: [PATCH 54/77] SIMD: rename MIN

---
 src/simd_128.h   | 6 +++---
 src/simd_256.h   | 6 +++---
 src/simd_basic.h | 6 +++---
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/simd_128.h b/src/simd_128.h
index c8d30c90..fe3ca80c 100644
--- a/src/simd_128.h
+++ b/src/simd_128.h
@@ -158,14 +158,14 @@ inline VecType CompareEq<uint16_t>(VecType x, VecType y)
 }
 
 template <typename T>
-inline VecType MIN(VecType x, VecType y);
+inline VecType Min(VecType x, VecType y);
 template <>
-inline VecType MIN<uint32_t>(VecType x, VecType y)
+inline VecType Min<uint32_t>(VecType x, VecType y)
 {
     return _mm_min_epu32(x, y);
 }
 template <>
-inline VecType MIN<uint16_t>(VecType x, VecType y)
+inline VecType Min<uint16_t>(VecType x, VecType y)
 {
     return _mm_min_epu16(x, y);
 }
diff --git a/src/simd_256.h b/src/simd_256.h
index bd06232c..92c5d5f6 100644
--- a/src/simd_256.h
+++ b/src/simd_256.h
@@ -171,14 +171,14 @@ inline VecType CompareEq<uint16_t>(VecType x, VecType y)
 }
 
 template <typename T>
-inline VecType MIN(VecType x, VecType y);
+inline VecType Min(VecType x, VecType y);
 template <>
-inline VecType MIN<uint32_t>(VecType x, VecType y)
+inline VecType Min<uint32_t>(VecType x, VecType y)
 {
     return _mm256_min_epu32(x, y);
 }
 template <>
-inline VecType MIN<uint16_t>(VecType x, VecType y)
+inline VecType Min<uint16_t>(VecType x, VecType y)
 {
     return _mm256_min_epu16(x, y);
 }
diff --git a/src/simd_basic.h b/src/simd_basic.h
index 1cad74e1..3e597e9e 100644
--- a/src/simd_basic.h
+++ b/src/simd_basic.h
@@ -62,7 +62,7 @@ template <typename T>
 inline VecType ADD_MOD(VecType x, VecType y, T q)
 {
     const VecType res = Add<T>(x, y);
-    return MIN<T>(res, Sub<T>(res, CARD(q)));
+    return Min<T>(res, Sub<T>(res, CARD(q)));
 }
 
 /**
@@ -77,7 +77,7 @@ template <typename T>
 inline VecType SUB_MOD(VecType x, VecType y, T q)
 {
     const VecType res = Sub<T>(x, y);
-    return MIN<T>(res, Add<T>(res, CARD(q)));
+    return Min<T>(res, Add<T>(res, CARD(q)));
 }
 
 /**
@@ -91,7 +91,7 @@ template <typename T>
 inline VecType NEG_MOD(VecType x, T q)
 {
     const VecType res = Sub<T>(CARD(q), x);
-    return MIN<T>(res, Sub<T>(res, CARD(q)));
+    return Min<T>(res, Sub<T>(res, CARD(q)));
 }
 
 /**

From a8f971a9cdfd02ca78b75bf286ef009d2ccab033 Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Tue, 30 Oct 2018 12:15:27 +0100
Subject: [PATCH 55/77] SIMD: rename CARD & CARD_M_1

---
 src/simd_basic.h | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/simd_basic.h b/src/simd_basic.h
index 3e597e9e..693603b1 100644
--- a/src/simd_basic.h
+++ b/src/simd_basic.h
@@ -37,13 +37,13 @@ namespace quadiron {
 namespace simd {
 
 template <typename T>
-inline VecType CARD(T q)
+inline VecType Card(T q)
 {
     return (q == F3) ? F3_u32 : F4_u32;
 }
 
 template <typename T>
-inline VecType CARD_M_1(T q)
+inline VecType CardMinusOne(T q)
 {
     return (q == F3) ? F3m1_u32 : F4m1_u32;
 }
@@ -62,7 +62,7 @@ template <typename T>
 inline VecType ADD_MOD(VecType x, VecType y, T q)
 {
     const VecType res = Add<T>(x, y);
-    return Min<T>(res, Sub<T>(res, CARD(q)));
+    return Min<T>(res, Sub<T>(res, Card(q)));
 }
 
 /**
@@ -77,7 +77,7 @@ template <typename T>
 inline VecType SUB_MOD(VecType x, VecType y, T q)
 {
     const VecType res = Sub<T>(x, y);
-    return Min<T>(res, Add<T>(res, CARD(q)));
+    return Min<T>(res, Add<T>(res, Card(q)));
 }
 
 /**
@@ -90,8 +90,8 @@ inline VecType SUB_MOD(VecType x, VecType y, T q)
 template <typename T>
 inline VecType NEG_MOD(VecType x, T q)
 {
-    const VecType res = Sub<T>(CARD(q), x);
-    return Min<T>(res, Sub<T>(res, CARD(q)));
+    const VecType res = Sub<T>(Card(q), x);
+    return Min<T>(res, Sub<T>(res, Card(q)));
 }
 
 /**
@@ -133,7 +133,7 @@ inline VecType MULFULL_MOD(VecType x, VecType y, T q)
 
     // filter elements of both of a & b = card-1
     const VecType cmp =
-        And(CompareEq<T>(x, CARD_M_1(q)), CompareEq<T>(y, CARD_M_1(q)));
+        And(CompareEq<T>(x, CardMinusOne(q)), CompareEq<T>(y, CardMinusOne(q)));
 
     if (IsZero(cmp) == 1) {
         return res;

From b0516fc5bec26533f139e5aaf39eb446d1a5f547 Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Tue, 30 Oct 2018 12:15:58 +0100
Subject: [PATCH 56/77] SIMD: rename ADD_MOD

---
 src/simd_basic.h |  4 ++--
 src/simd_fnt.h   |  6 +++---
 src/simd_nf4.h   | 12 ++++++------
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/src/simd_basic.h b/src/simd_basic.h
index 693603b1..d6c64895 100644
--- a/src/simd_basic.h
+++ b/src/simd_basic.h
@@ -59,7 +59,7 @@ inline VecType CardMinusOne(T q)
  * @return (x + y) mod q
  */
 template <typename T>
-inline VecType ADD_MOD(VecType x, VecType y, T q)
+inline VecType ModAdd(VecType x, VecType y, T q)
 {
     const VecType res = Add<T>(x, y);
     return Min<T>(res, Sub<T>(res, Card(q)));
@@ -221,7 +221,7 @@ inline void add_two_bufs(T* src, T* dest, size_t len, T card)
 
     size_t i;
     for (i = 0; i < _len; i++) {
-        _dest[i] = ADD_MOD(_src[i], _dest[i], card);
+        _dest[i] = ModAdd(_src[i], _dest[i], card);
     }
     if (_last_len > 0) {
         for (i = _len * ratio; i < len; i++) {
diff --git a/src/simd_fnt.h b/src/simd_fnt.h
index 1b1843cd..ba6767f2 100644
--- a/src/simd_fnt.h
+++ b/src/simd_fnt.h
@@ -56,9 +56,9 @@ inline void BUTTERFLY_CT(T rp1, VecType c, VecType* x, VecType* y, T q)
     VecType z = (rp1 == 2) ? *y : MUL_MOD(c, *y, q);
     if (rp1 < q) {
         *y = SUB_MOD(*x, z, q);
-        *x = ADD_MOD(*x, z, q);
+        *x = ModAdd(*x, z, q);
     } else { // i.e. r == q - 1
-        *y = ADD_MOD(*x, z, q);
+        *y = ModAdd(*x, z, q);
         *x = SUB_MOD(*x, z, q);
     }
 }
@@ -78,7 +78,7 @@ inline void BUTTERFLY_CT(T rp1, VecType c, VecType* x, VecType* y, T q)
 template <typename T>
 inline void BUTTERFLY_GS(T rp1, VecType c, VecType* x, VecType* y, T q)
 {
-    VecType add = ADD_MOD(*x, *y, q);
+    VecType add = ModAdd(*x, *y, q);
     if (rp1 == 2) {
         *y = SUB_MOD(*x, *y, q);
     } else if (rp1 < q) {
diff --git a/src/simd_nf4.h b/src/simd_nf4.h
index eac6af80..c5dc5150 100644
--- a/src/simd_nf4.h
+++ b/src/simd_nf4.h
@@ -189,7 +189,7 @@ inline __uint128_t add(__uint128_t a, __uint128_t b)
     HalfVecType res;
     VecType _a = LoadToReg(a);
     VecType _b = LoadToReg(b);
-    STORE_LOW(&res, ADD_MOD(_a, _b, F4));
+    STORE_LOW(&res, ModAdd(_a, _b, F4));
     return reinterpret_cast<__uint128_t>(res);
 }
 
@@ -226,8 +226,8 @@ inline void add_buf_to_two_bufs_rem(
         VecType _x_next_p = LoadToReg(_x_half[i]);
         VecType _y_p = LoadToReg(_y[i]);
 
-        STORE_LOW(_x + i, ADD_MOD(_x_p, _y_p, F4));
-        STORE_LOW(_x_half + i, ADD_MOD(_x_next_p, _y_p, F4));
+        STORE_LOW(_x + i, ModAdd(_x_p, _y_p, F4));
+        STORE_LOW(_x_half + i, ModAdd(_x_next_p, _y_p, F4));
     }
 }
 
@@ -275,7 +275,7 @@ inline __uint128_t add(__uint128_t a, __uint128_t b)
     VecType res;
     VecType _a = LoadToReg(a);
     VecType _b = LoadToReg(b);
-    StoreToMem(&res, ADD_MOD(_a, _b, F4));
+    StoreToMem(&res, ModAdd(_a, _b, F4));
     return reinterpret_cast<__uint128_t>(res);
 }
 
@@ -342,12 +342,12 @@ inline void add_buf_to_two_bufs(unsigned n, __uint128_t* _x, __uint128_t* _y)
 
     // add y to the first half of `x`
     for (i = 0; i < vec_len; ++i) {
-        x[i] = ADD_MOD(x[i], y[i], F4);
+        x[i] = ModAdd(x[i], y[i], F4);
     }
 
     // add y to the second half of `x`
     for (i = 0; i < vec_len; ++i) {
-        x_next[i] = ADD_MOD(x_next[i], y[i], F4);
+        x_next[i] = ModAdd(x_next[i], y[i], F4);
     }
 
     if (rem_len > 0) {

From 61122ea64bbf45762e52d169a74e647350cb1fe8 Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Tue, 30 Oct 2018 12:16:16 +0100
Subject: [PATCH 57/77] SIMD: rename SUB_MOD

---
 src/simd_basic.h |  6 +++---
 src/simd_fnt.h   | 10 +++++-----
 src/simd_nf4.h   |  4 ++--
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/simd_basic.h b/src/simd_basic.h
index d6c64895..ce40725e 100644
--- a/src/simd_basic.h
+++ b/src/simd_basic.h
@@ -74,7 +74,7 @@ inline VecType ModAdd(VecType x, VecType y, T q)
  * @return (x - y) mod q
  */
 template <typename T>
-inline VecType SUB_MOD(VecType x, VecType y, T q)
+inline VecType ModSub(VecType x, VecType y, T q)
 {
     const VecType res = Sub<T>(x, y);
     return Min<T>(res, Add<T>(res, Card(q)));
@@ -113,7 +113,7 @@ inline VecType MUL_MOD(VecType x, VecType y, T q)
         (q == F3) ? BLEND8(ZERO, res, MASK8_LO) : BLEND16(ZERO, res, 0x55);
     const VecType hi = (q == F3) ? BLEND8(ZERO, SHIFTR(res, 1), MASK8_LO)
                                  : BLEND16(ZERO, SHIFTR(res, 2), 0x55);
-    return SUB_MOD(lo, hi, q);
+    return ModSub(lo, hi, q);
 }
 
 /**
@@ -244,7 +244,7 @@ inline void sub_two_bufs(T* bufa, T* bufb, T* res, size_t len, T card)
     size_t i;
     for (i = 0; i < _len; i++) {
         // perform subtraction
-        _res[i] = SUB_MOD(_bufa[i], _bufb[i], card);
+        _res[i] = ModSub(_bufa[i], _bufb[i], card);
     }
     if (_last_len > 0) {
         for (i = _len * ratio; i < len; i++) {
diff --git a/src/simd_fnt.h b/src/simd_fnt.h
index ba6767f2..f002c65b 100644
--- a/src/simd_fnt.h
+++ b/src/simd_fnt.h
@@ -55,11 +55,11 @@ inline void BUTTERFLY_CT(T rp1, VecType c, VecType* x, VecType* y, T q)
 {
     VecType z = (rp1 == 2) ? *y : MUL_MOD(c, *y, q);
     if (rp1 < q) {
-        *y = SUB_MOD(*x, z, q);
+        *y = ModSub(*x, z, q);
         *x = ModAdd(*x, z, q);
     } else { // i.e. r == q - 1
         *y = ModAdd(*x, z, q);
-        *x = SUB_MOD(*x, z, q);
+        *x = ModSub(*x, z, q);
     }
 }
 
@@ -80,12 +80,12 @@ inline void BUTTERFLY_GS(T rp1, VecType c, VecType* x, VecType* y, T q)
 {
     VecType add = ModAdd(*x, *y, q);
     if (rp1 == 2) {
-        *y = SUB_MOD(*x, *y, q);
+        *y = ModSub(*x, *y, q);
     } else if (rp1 < q) {
-        VecType sub = SUB_MOD(*x, *y, q);
+        VecType sub = ModSub(*x, *y, q);
         *y = MUL_MOD(c, sub, q);
     } else { // i.e. r == q - 1
-        *y = SUB_MOD(*y, *x, q);
+        *y = ModSub(*y, *x, q);
     }
     *x = add;
 }
diff --git a/src/simd_nf4.h b/src/simd_nf4.h
index c5dc5150..f224cfbf 100644
--- a/src/simd_nf4.h
+++ b/src/simd_nf4.h
@@ -198,7 +198,7 @@ inline __uint128_t sub(__uint128_t a, __uint128_t b)
     HalfVecType res;
     VecType _a = LoadToReg(a);
     VecType _b = LoadToReg(b);
-    STORE_LOW(&res, SUB_MOD(_a, _b, F4));
+    STORE_LOW(&res, ModSub(_a, _b, F4));
     return reinterpret_cast<__uint128_t>(res);
 }
 
@@ -284,7 +284,7 @@ inline __uint128_t sub(__uint128_t a, __uint128_t b)
     VecType res;
     VecType _a = LoadToReg(a);
     VecType _b = LoadToReg(b);
-    StoreToMem(&res, SUB_MOD(_a, _b, F4));
+    StoreToMem(&res, ModSub(_a, _b, F4));
     return reinterpret_cast<__uint128_t>(res);
 }
 

From 2d86344995931989cf3a8c0b0859ae93856f54fb Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Tue, 30 Oct 2018 12:16:33 +0100
Subject: [PATCH 58/77] SIMD: rename NEG_MOD

---
 src/simd_basic.h | 4 ++--
 src/simd_fnt.h   | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/simd_basic.h b/src/simd_basic.h
index ce40725e..60864e4a 100644
--- a/src/simd_basic.h
+++ b/src/simd_basic.h
@@ -88,7 +88,7 @@ inline VecType ModSub(VecType x, VecType y, T q)
  * @return (-x) mod q
  */
 template <typename T>
-inline VecType NEG_MOD(VecType x, T q)
+inline VecType ModNeg(VecType x, T q)
 {
     const VecType res = Sub<T>(Card(q), x);
     return Min<T>(res, Sub<T>(res, Card(q)));
@@ -292,7 +292,7 @@ inline void neg(size_t len, T* buf, T card)
 
     size_t i;
     for (i = 0; i < _len; i++) {
-        _buf[i] = NEG_MOD(_buf[i], card);
+        _buf[i] = ModNeg(_buf[i], card);
     }
     if (_last_len > 0) {
         for (i = _len * ratio; i < len; i++) {
diff --git a/src/simd_fnt.h b/src/simd_fnt.h
index f002c65b..86938b28 100644
--- a/src/simd_fnt.h
+++ b/src/simd_fnt.h
@@ -110,7 +110,7 @@ inline VecType BUTTERFLY_GS_SIMPLE(T rp1, VecType c, VecType x, T q)
     } else if (rp1 < q) {
         return MUL_MOD(c, x, q);
     } else {
-        return NEG_MOD(x, q);
+        return ModNeg(x, q);
     }
 }
 

From 7766b5679cd5498fb5b2ac28176745585c2db84a Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Tue, 30 Oct 2018 12:16:57 +0100
Subject: [PATCH 59/77] SIMD: rename MUL_MOD

---
 src/simd_basic.h | 14 +++++++-------
 src/simd_fnt.h   |  6 +++---
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/simd_basic.h b/src/simd_basic.h
index 60864e4a..4fe780cb 100644
--- a/src/simd_basic.h
+++ b/src/simd_basic.h
@@ -106,7 +106,7 @@ inline VecType ModNeg(VecType x, T q)
  * @return (x * y) mod q
  */
 template <typename T>
-inline VecType MUL_MOD(VecType x, VecType y, T q)
+inline VecType ModMul(VecType x, VecType y, T q)
 {
     const VecType res = Mul<T>(x, y);
     const VecType lo =
@@ -129,7 +129,7 @@ inline VecType MUL_MOD(VecType x, VecType y, T q)
 template <typename T>
 inline VecType MULFULL_MOD(VecType x, VecType y, T q)
 {
-    const VecType res = MUL_MOD(x, y, q);
+    const VecType res = ModMul(x, y, q);
 
     // filter elements of both of a & b = card-1
     const VecType cmp =
@@ -193,13 +193,13 @@ inline void mul_coef_to_buf(const T a, T* src, T* dest, size_t len, T card)
     size_t i = 0;
     const size_t end = (_len > 3) ? _len - 3 : 0;
     for (; i < end; i += 4) {
-        _dest[i] = MUL_MOD(coef, _src[i], card);
-        _dest[i + 1] = MUL_MOD(coef, _src[i + 1], card);
-        _dest[i + 2] = MUL_MOD(coef, _src[i + 2], card);
-        _dest[i + 3] = MUL_MOD(coef, _src[i + 3], card);
+        _dest[i] = ModMul(coef, _src[i], card);
+        _dest[i + 1] = ModMul(coef, _src[i + 1], card);
+        _dest[i + 2] = ModMul(coef, _src[i + 2], card);
+        _dest[i + 3] = ModMul(coef, _src[i + 3], card);
     }
     for (; i < _len; ++i) {
-        _dest[i] = MUL_MOD(coef, _src[i], card);
+        _dest[i] = ModMul(coef, _src[i], card);
     }
 
     if (_last_len > 0) {
diff --git a/src/simd_fnt.h b/src/simd_fnt.h
index 86938b28..0d013c46 100644
--- a/src/simd_fnt.h
+++ b/src/simd_fnt.h
@@ -53,7 +53,7 @@ namespace simd {
 template <typename T>
 inline void BUTTERFLY_CT(T rp1, VecType c, VecType* x, VecType* y, T q)
 {
-    VecType z = (rp1 == 2) ? *y : MUL_MOD(c, *y, q);
+    VecType z = (rp1 == 2) ? *y : ModMul(c, *y, q);
     if (rp1 < q) {
         *y = ModSub(*x, z, q);
         *x = ModAdd(*x, z, q);
@@ -83,7 +83,7 @@ inline void BUTTERFLY_GS(T rp1, VecType c, VecType* x, VecType* y, T q)
         *y = ModSub(*x, *y, q);
     } else if (rp1 < q) {
         VecType sub = ModSub(*x, *y, q);
-        *y = MUL_MOD(c, sub, q);
+        *y = ModMul(c, sub, q);
     } else { // i.e. r == q - 1
         *y = ModSub(*y, *x, q);
     }
@@ -108,7 +108,7 @@ inline VecType BUTTERFLY_GS_SIMPLE(T rp1, VecType c, VecType x, T q)
     if (rp1 == 2) {
         return x;
     } else if (rp1 < q) {
-        return MUL_MOD(c, x, q);
+        return ModMul(c, x, q);
     } else {
         return ModNeg(x, q);
     }

From 34b6595153a8e5255d26b239f88cb6c2e0d6b634 Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Tue, 30 Oct 2018 12:17:51 +0100
Subject: [PATCH 60/77] SIMD: rename MULFULL_MOD

---
 src/simd_basic.h |  4 ++--
 src/simd_nf4.h   | 12 ++++++------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/simd_basic.h b/src/simd_basic.h
index 4fe780cb..eefbf370 100644
--- a/src/simd_basic.h
+++ b/src/simd_basic.h
@@ -127,7 +127,7 @@ inline VecType ModMul(VecType x, VecType y, T q)
  * @return (x * y) mod q
  */
 template <typename T>
-inline VecType MULFULL_MOD(VecType x, VecType y, T q)
+inline VecType ModMulSafe(VecType x, VecType y, T q)
 {
     const VecType res = ModMul(x, y, q);
 
@@ -270,7 +270,7 @@ inline void mul_two_bufs(T* src, T* dest, size_t len, T card)
     size_t i;
     for (i = 0; i < _len; i++) {
         // perform multiplicaton
-        _dest[i] = MULFULL_MOD(_src[i], _dest[i], card);
+        _dest[i] = ModMulSafe(_src[i], _dest[i], card);
     }
     if (_last_len > 0) {
         for (i = _len * ratio; i < len; i++) {
diff --git a/src/simd_nf4.h b/src/simd_nf4.h
index f224cfbf..0accb358 100644
--- a/src/simd_nf4.h
+++ b/src/simd_nf4.h
@@ -207,7 +207,7 @@ inline __uint128_t mul(__uint128_t a, __uint128_t b)
     HalfVecType res;
     VecType _a = LoadToReg(a);
     VecType _b = LoadToReg(b);
-    STORE_LOW(&res, MULFULL_MOD(_a, _b, F4));
+    STORE_LOW(&res, ModMulSafe(_a, _b, F4));
     return reinterpret_cast<__uint128_t>(res);
 }
 
@@ -239,7 +239,7 @@ inline void hadamard_mul_rem(unsigned n, __uint128_t* x, __uint128_t* y)
         VecType _x_p = LoadToReg(_x[i]);
         VecType _y_p = LoadToReg(_y[i]);
 
-        STORE_LOW(_x + i, MULFULL_MOD(_x_p, _y_p, F4));
+        STORE_LOW(_x + i, ModMulSafe(_x_p, _y_p, F4));
     }
 }
 
@@ -257,8 +257,8 @@ inline void hadamard_mul_doubled_rem(
         VecType _x_next_p = LoadToReg(_x_half[i]);
         VecType _y_p = LoadToReg(_y[i]);
 
-        STORE_LOW(_x + i, MULFULL_MOD(_x_p, _y_p, F4));
-        STORE_LOW(_x_half + i, MULFULL_MOD(_x_next_p, _y_p, F4));
+        STORE_LOW(_x + i, ModMulSafe(_x_p, _y_p, F4));
+        STORE_LOW(_x_half + i, ModMulSafe(_x_next_p, _y_p, F4));
     }
 }
 
@@ -293,7 +293,7 @@ inline __uint128_t mul(__uint128_t a, __uint128_t b)
     VecType res;
     VecType _a = LoadToReg(a);
     VecType _b = LoadToReg(b);
-    StoreToMem(&res, MULFULL_MOD(_a, _b, F4));
+    StoreToMem(&res, ModMulSafe(_a, _b, F4));
     return reinterpret_cast<__uint128_t>(res);
 }
 
@@ -369,7 +369,7 @@ inline void hadamard_mul(unsigned n, __uint128_t* _x, __uint128_t* _y)
 
     // multiply y to the first half of `x`
     for (i = 0; i < vec_len; ++i) {
-        x[i] = MULFULL_MOD(x[i], y[i], F4);
+        x[i] = ModMulSafe(x[i], y[i], F4);
     }
 
     if (rem_len > 0) {

From dca8686c1b6c6609865899ddfe3daec17113bbed Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Tue, 30 Oct 2018 12:18:13 +0100
Subject: [PATCH 61/77] SIMD: rename ADD_PROPS

---
 src/simd_basic.h |  2 +-
 src/simd_fnt.h   | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/simd_basic.h b/src/simd_basic.h
index eefbf370..d33e5b92 100644
--- a/src/simd_basic.h
+++ b/src/simd_basic.h
@@ -153,7 +153,7 @@ inline VecType ModMulSafe(VecType x, VecType y, T q)
  * @param max a dummy variable
  */
 template <typename T>
-inline void ADD_PROPS(
+inline void AddProps(
     Properties& props,
     VecType threshold,
     VecType mask,
diff --git a/src/simd_fnt.h b/src/simd_fnt.h
index 0d013c46..4761e945 100644
--- a/src/simd_fnt.h
+++ b/src/simd_fnt.h
@@ -493,22 +493,22 @@ inline void encode_post_process(
 
             if (AndIsZero(a1, _threshold) == 0) {
                 const off_t curr_offset = offset + vec_id * vec_size;
-                ADD_PROPS(
+                AddProps(
                     props[frag_id], _threshold, mask_hi, a1, curr_offset, max);
             }
             if (AndIsZero(a2, _threshold) == 0) {
                 const off_t curr_offset = offset + (vec_id + 1) * vec_size;
-                ADD_PROPS(
+                AddProps(
                     props[frag_id], _threshold, mask_hi, a2, curr_offset, max);
             }
             if (AndIsZero(a3, _threshold) == 0) {
                 const off_t curr_offset = offset + (vec_id + 2) * vec_size;
-                ADD_PROPS(
+                AddProps(
                     props[frag_id], _threshold, mask_hi, a3, curr_offset, max);
             }
             if (AndIsZero(a4, _threshold) == 0) {
                 const off_t curr_offset = offset + (vec_id + 3) * vec_size;
-                ADD_PROPS(
+                AddProps(
                     props[frag_id], _threshold, mask_hi, a4, curr_offset, max);
             }
         }
@@ -517,7 +517,7 @@ inline void encode_post_process(
             uint32_t c = AndIsZero(a, _threshold);
             if (c == 0) {
                 const off_t curr_offset = offset + vec_id * vec_size;
-                ADD_PROPS(
+                AddProps(
                     props[frag_id], _threshold, mask_hi, a, curr_offset, max);
             }
         }

From 84e0714ca76ed95845aaea1d88e163eb25352bda Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Tue, 30 Oct 2018 12:23:24 +0100
Subject: [PATCH 62/77] SIMD: rename BUTTERFLY_CT

---
 src/simd_fnt.h | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/src/simd_fnt.h b/src/simd_fnt.h
index 4761e945..08376688 100644
--- a/src/simd_fnt.h
+++ b/src/simd_fnt.h
@@ -51,7 +51,7 @@ namespace simd {
  * @param q modular
  */
 template <typename T>
-inline void BUTTERFLY_CT(T rp1, VecType c, VecType* x, VecType* y, T q)
+inline void ButterflyCT(T rp1, VecType c, VecType* x, VecType* y, T q)
 {
     VecType z = (rp1 == 2) ? *y : ModMul(c, *y, q);
     if (rp1 < q) {
@@ -159,12 +159,12 @@ inline void butterfly_ct_step(
             x1 = LoadToReg(p + j);
             y1 = LoadToReg(q + j);
 
-            BUTTERFLY_CT(rp1, c, &x1, &y1, card);
+            ButterflyCT(rp1, c, &x1, &y1, card);
 
             x2 = LoadToReg(p + j + 1);
             y2 = LoadToReg(q + j + 1);
 
-            BUTTERFLY_CT(rp1, c, &x2, &y2, card);
+            ButterflyCT(rp1, c, &x2, &y2, card);
 
             // Store back to memory
             StoreToMem(p + j, x1);
@@ -176,7 +176,7 @@ inline void butterfly_ct_step(
             x1 = LoadToReg(p + j);
             y1 = LoadToReg(q + j);
 
-            BUTTERFLY_CT(rp1, c, &x1, &y1, card);
+            ButterflyCT(rp1, c, &x1, &y1, card);
 
             // Store back to memory
             StoreToMem(p + j, x1);
@@ -218,23 +218,23 @@ inline static void do_butterfly_ct_2_layers(
         VecType y1 = LoadToReg(q);
         VecType y2 = LoadToReg(q + 1);
 
-        BUTTERFLY_CT(r1p1, c1, &x1, &y1, card);
-        BUTTERFLY_CT(r1p1, c1, &x2, &y2, card);
+        ButterflyCT(r1p1, c1, &x1, &y1, card);
+        ButterflyCT(r1p1, c1, &x2, &y2, card);
 
         VecType u1 = LoadToReg(r);
         VecType u2 = LoadToReg(r + 1);
         VecType v1 = LoadToReg(s);
         VecType v2 = LoadToReg(s + 1);
 
-        BUTTERFLY_CT(r1p1, c1, &u1, &v1, card);
-        BUTTERFLY_CT(r1p1, c1, &u2, &v2, card);
+        ButterflyCT(r1p1, c1, &u1, &v1, card);
+        ButterflyCT(r1p1, c1, &u2, &v2, card);
 
         // Second layer (c2, x, u) & (c3, y, v)
-        BUTTERFLY_CT(r2p1, c2, &x1, &u1, card);
-        BUTTERFLY_CT(r2p1, c2, &x2, &u2, card);
+        ButterflyCT(r2p1, c2, &x1, &u1, card);
+        ButterflyCT(r2p1, c2, &x2, &u2, card);
 
-        BUTTERFLY_CT(r3p1, c3, &y1, &v1, card);
-        BUTTERFLY_CT(r3p1, c3, &y2, &v2, card);
+        ButterflyCT(r3p1, c3, &y1, &v1, card);
+        ButterflyCT(r3p1, c3, &y2, &v2, card);
 
         // Store back to memory
         StoreToMem(p, x1);
@@ -261,10 +261,10 @@ inline static void do_butterfly_ct_2_layers(
         VecType v1 = LoadToReg(s + j);
 
         // BUTTERFLY_3_test(c1, &x1, &y1, &u1, &v1, card);
-        BUTTERFLY_CT(r1p1, c1, &x1, &y1, card);
-        BUTTERFLY_CT(r1p1, c1, &u1, &v1, card);
-        BUTTERFLY_CT(r2p1, c2, &x1, &u1, card);
-        BUTTERFLY_CT(r3p1, c3, &y1, &v1, card);
+        ButterflyCT(r1p1, c1, &x1, &y1, card);
+        ButterflyCT(r1p1, c1, &u1, &v1, card);
+        ButterflyCT(r2p1, c2, &x1, &u1, card);
+        ButterflyCT(r3p1, c3, &y1, &v1, card);
 
         // Store back to memory
         StoreToMem(p + j, x1);

From 51d2f8d2423f7df969be4cd9cdcf15931743341c Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Tue, 30 Oct 2018 12:23:45 +0100
Subject: [PATCH 63/77] SIMD: rename BUTTERFLY_GS

---
 src/simd_fnt.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/simd_fnt.h b/src/simd_fnt.h
index 08376688..c5672884 100644
--- a/src/simd_fnt.h
+++ b/src/simd_fnt.h
@@ -76,7 +76,7 @@ inline void ButterflyCT(T rp1, VecType c, VecType* x, VecType* y, T q)
  * @param q modular
  */
 template <typename T>
-inline void BUTTERFLY_GS(T rp1, VecType c, VecType* x, VecType* y, T q)
+inline void ButterflyGS(T rp1, VecType c, VecType* x, VecType* y, T q)
 {
     VecType add = ModAdd(*x, *y, q);
     if (rp1 == 2) {
@@ -375,10 +375,10 @@ inline void butterfly_gs_step(
             y3 = LoadToReg(q + j + 2);
             y4 = LoadToReg(q + j + 3);
 
-            BUTTERFLY_GS(rp1, c, &x1, &y1, card);
-            BUTTERFLY_GS(rp1, c, &x2, &y2, card);
-            BUTTERFLY_GS(rp1, c, &x3, &y3, card);
-            BUTTERFLY_GS(rp1, c, &x4, &y4, card);
+            ButterflyGS(rp1, c, &x1, &y1, card);
+            ButterflyGS(rp1, c, &x2, &y2, card);
+            ButterflyGS(rp1, c, &x3, &y3, card);
+            ButterflyGS(rp1, c, &x4, &y4, card);
 
             // Store back to memory
             StoreToMem(p + j, x1);
@@ -394,7 +394,7 @@ inline void butterfly_gs_step(
             x1 = LoadToReg(p + j);
             y1 = LoadToReg(q + j);
 
-            BUTTERFLY_GS(rp1, c, &x1, &y1, card);
+            ButterflyGS(rp1, c, &x1, &y1, card);
 
             // Store back to memory
             StoreToMem(p + j, x1);

From cc7d37c4cd1a98ea1e1ae9c4084a6b6fa315868f Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Tue, 30 Oct 2018 12:24:11 +0100
Subject: [PATCH 64/77] SIMD: rename BUTTERFLY_GS_SIMPLE

---
 src/simd_fnt.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/simd_fnt.h b/src/simd_fnt.h
index c5672884..885a3d25 100644
--- a/src/simd_fnt.h
+++ b/src/simd_fnt.h
@@ -103,7 +103,7 @@ inline void ButterflyGS(T rp1, VecType c, VecType* x, VecType* y, T q)
  * @return r * x
  */
 template <typename T>
-inline VecType BUTTERFLY_GS_SIMPLE(T rp1, VecType c, VecType x, T q)
+inline VecType ButterflySimpleGS(T rp1, VecType c, VecType x, T q)
 {
     if (rp1 == 2) {
         return x;
@@ -446,8 +446,8 @@ inline void butterfly_gs_step_simple(
             x1 = LoadToReg(p + j);
             x2 = LoadToReg(p + j + 1);
 
-            y1 = BUTTERFLY_GS_SIMPLE(rp1, c, x1, card);
-            y2 = BUTTERFLY_GS_SIMPLE(rp1, c, x2, card);
+            y1 = ButterflySimpleGS(rp1, c, x1, card);
+            y2 = ButterflySimpleGS(rp1, c, x2, card);
 
             // Store back to memory
             StoreToMem(q + j, y1);
@@ -456,7 +456,7 @@ inline void butterfly_gs_step_simple(
         for (; j < len; ++j) {
             x1 = LoadToReg(p + j);
 
-            y1 = BUTTERFLY_GS_SIMPLE(rp1, c, x1, card);
+            y1 = ButterflySimpleGS(rp1, c, x1, card);
 
             // Store back to memory
             StoreToMem(q + j, y1);

From db987642265f9372bd1f870dccd0b9598e530a8b Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Tue, 30 Oct 2018 12:31:55 +0100
Subject: [PATCH 65/77] SIMD: rename STORE_LOW

---
 src/simd_nf4.h | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/simd_nf4.h b/src/simd_nf4.h
index 0accb358..25b435c0 100644
--- a/src/simd_nf4.h
+++ b/src/simd_nf4.h
@@ -179,7 +179,7 @@ inline VecType LoadToReg(__uint128_t x)
     return LoadToReg(*_x);
 }
 
-inline void STORE_LOW(HalfVecType* address, VecType reg)
+inline void StoreLowHalfToMem(HalfVecType* address, VecType reg)
 {
     _mm_store_si128(address, _mm256_castsi256_si128(reg));
 }
@@ -189,7 +189,7 @@ inline __uint128_t add(__uint128_t a, __uint128_t b)
     HalfVecType res;
     VecType _a = LoadToReg(a);
     VecType _b = LoadToReg(b);
-    STORE_LOW(&res, ModAdd(_a, _b, F4));
+    StoreLowHalfToMem(&res, ModAdd(_a, _b, F4));
     return reinterpret_cast<__uint128_t>(res);
 }
 
@@ -198,7 +198,7 @@ inline __uint128_t sub(__uint128_t a, __uint128_t b)
     HalfVecType res;
     VecType _a = LoadToReg(a);
     VecType _b = LoadToReg(b);
-    STORE_LOW(&res, ModSub(_a, _b, F4));
+    StoreLowHalfToMem(&res, ModSub(_a, _b, F4));
     return reinterpret_cast<__uint128_t>(res);
 }
 
@@ -207,7 +207,7 @@ inline __uint128_t mul(__uint128_t a, __uint128_t b)
     HalfVecType res;
     VecType _a = LoadToReg(a);
     VecType _b = LoadToReg(b);
-    STORE_LOW(&res, ModMulSafe(_a, _b, F4));
+    StoreLowHalfToMem(&res, ModMulSafe(_a, _b, F4));
     return reinterpret_cast<__uint128_t>(res);
 }
 
@@ -226,8 +226,8 @@ inline void add_buf_to_two_bufs_rem(
         VecType _x_next_p = LoadToReg(_x_half[i]);
         VecType _y_p = LoadToReg(_y[i]);
 
-        STORE_LOW(_x + i, ModAdd(_x_p, _y_p, F4));
-        STORE_LOW(_x_half + i, ModAdd(_x_next_p, _y_p, F4));
+        StoreLowHalfToMem(_x + i, ModAdd(_x_p, _y_p, F4));
+        StoreLowHalfToMem(_x_half + i, ModAdd(_x_next_p, _y_p, F4));
     }
 }
 
@@ -239,7 +239,7 @@ inline void hadamard_mul_rem(unsigned n, __uint128_t* x, __uint128_t* y)
         VecType _x_p = LoadToReg(_x[i]);
         VecType _y_p = LoadToReg(_y[i]);
 
-        STORE_LOW(_x + i, ModMulSafe(_x_p, _y_p, F4));
+        StoreLowHalfToMem(_x + i, ModMulSafe(_x_p, _y_p, F4));
     }
 }
 
@@ -257,8 +257,8 @@ inline void hadamard_mul_doubled_rem(
         VecType _x_next_p = LoadToReg(_x_half[i]);
         VecType _y_p = LoadToReg(_y[i]);
 
-        STORE_LOW(_x + i, ModMulSafe(_x_p, _y_p, F4));
-        STORE_LOW(_x_half + i, ModMulSafe(_x_next_p, _y_p, F4));
+        StoreLowHalfToMem(_x + i, ModMulSafe(_x_p, _y_p, F4));
+        StoreLowHalfToMem(_x_half + i, ModMulSafe(_x_next_p, _y_p, F4));
     }
 }
 

From e9de0badd522cafe4e22ca11187cadd57254788f Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Tue, 30 Oct 2018 12:36:04 +0100
Subject: [PATCH 66/77] SIMD: rename macro names

---
 src/simd_128.h   | 16 ++++++++--------
 src/simd_256.h   | 16 ++++++++--------
 src/simd_basic.h |  8 ++++----
 3 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/src/simd_128.h b/src/simd_128.h
index fe3ca80c..6cfbc8e5 100644
--- a/src/simd_128.h
+++ b/src/simd_128.h
@@ -40,17 +40,17 @@ typedef __m128i VecType;
 
 /* ============= Constant variable  ============ */
 
-#define F4_u32 _mm_set1_epi32(65537)
-#define F4m1_u32 _mm_set1_epi32(65536)
-#define F3_u32 _mm_set1_epi32(257)
-#define F3m1_u32 _mm_set1_epi32(256)
+#define F4_U32 _mm_set1_epi32(65537)
+#define F4_MINUS_ONE_U32 _mm_set1_epi32(65536)
+#define F3_U32 _mm_set1_epi32(257)
+#define F3_MINUS_ONE_U32 _mm_set1_epi32(256)
 
-#define F3_u16 _mm_set1_epi16(257)
-#define F3m1_u16 _mm_set1_epi16(256)
+#define F3_U16 _mm_set1_epi16(257)
+#define F3_MINUS_ONE_U16 _mm_set1_epi16(256)
 
 #define ZERO (_mm_setzero_si128())
-#define ONE16 (_mm_set1_epi16(1))
-#define ONE32 (_mm_set1_epi32(1))
+#define ONE_U16 (_mm_set1_epi16(1))
+#define ONE_U32 (_mm_set1_epi32(1))
 
 #define MASK8_LO (_mm_set1_epi16(0x80))
 
diff --git a/src/simd_256.h b/src/simd_256.h
index 92c5d5f6..8b9ae688 100644
--- a/src/simd_256.h
+++ b/src/simd_256.h
@@ -53,17 +53,17 @@ typedef __m128i HalfVecType;
 
 /* ============= Constant variable  ============ */
 
-#define F4_u32 _mm256_set1_epi32(65537)
-#define F4m1_u32 _mm256_set1_epi32(65536)
-#define F3_u32 _mm256_set1_epi32(257)
-#define F3m1_u32 _mm256_set1_epi32(256)
+#define F4_U32 _mm256_set1_epi32(65537)
+#define F4_MINUS_ONE_U32 _mm256_set1_epi32(65536)
+#define F3_U32 _mm256_set1_epi32(257)
+#define F3_MINUS_ONE_U32 _mm256_set1_epi32(256)
 
-#define F3_u16 _mm256_set1_epi16(257)
-#define F3m1_u16 _mm256_set1_epi16(256)
+#define F3_U16 _mm256_set1_epi16(257)
+#define F3_MINUS_ONE_U16 _mm256_set1_epi16(256)
 
 #define ZERO (_mm256_setzero_si256())
-#define ONE16 (_mm256_set1_epi16(1))
-#define ONE32 (_mm256_set1_epi32(1))
+#define ONE_U16 (_mm256_set1_epi16(1))
+#define ONE_U32 (_mm256_set1_epi32(1))
 
 #define MASK8_LO (_mm256_set1_epi16(0x80))
 
diff --git a/src/simd_basic.h b/src/simd_basic.h
index d33e5b92..de302273 100644
--- a/src/simd_basic.h
+++ b/src/simd_basic.h
@@ -39,13 +39,13 @@ namespace simd {
 template <typename T>
 inline VecType Card(T q)
 {
-    return (q == F3) ? F3_u32 : F4_u32;
+    return (q == F3) ? F3_U32 : F4_U32;
 }
 
 template <typename T>
 inline VecType CardMinusOne(T q)
 {
-    return (q == F3) ? F3m1_u32 : F4m1_u32;
+    return (q == F3) ? F3_MINUS_ONE_U32 : F4_MINUS_ONE_U32;
 }
 
 /* ================= Basic Operations ================= */
@@ -138,8 +138,8 @@ inline VecType ModMulSafe(VecType x, VecType y, T q)
     if (IsZero(cmp) == 1) {
         return res;
     }
-    return (q == F3) ? Xor(res, And(F4_u32, cmp))
-                     : Add<T>(res, And(ONE32, cmp));
+    return (q == F3) ? Xor(res, And(F4_U32, cmp))
+                     : Add<T>(res, And(ONE_U32, cmp));
 }
 
 /**

From d35c4d31f9b2b7c57439dd8c0d2a435c9cda1d63 Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Tue, 30 Oct 2018 13:40:41 +0100
Subject: [PATCH 67/77] SIMD Basic: fix Card & CardMinusOne functions

---
 src/simd_basic.h | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/src/simd_basic.h b/src/simd_basic.h
index de302273..b6bd7c80 100644
--- a/src/simd_basic.h
+++ b/src/simd_basic.h
@@ -37,13 +37,27 @@ namespace quadiron {
 namespace simd {
 
 template <typename T>
-inline VecType Card(T q)
+inline VecType Card(T q);
+template <>
+inline VecType Card<uint16_t>(uint16_t q)
+{
+    return F3_U16;
+}
+template <>
+inline VecType Card<uint32_t>(uint32_t q)
 {
     return (q == F3) ? F3_U32 : F4_U32;
 }
 
 template <typename T>
-inline VecType CardMinusOne(T q)
+inline VecType CardMinusOne(T q);
+template <>
+inline VecType CardMinusOne<uint16_t>(uint16_t q)
+{
+    return F3_MINUS_ONE_U16;
+}
+template <>
+inline VecType CardMinusOne<uint32_t>(uint32_t q)
 {
     return (q == F3) ? F3_MINUS_ONE_U32 : F4_MINUS_ONE_U32;
 }

From 05d39381440f8c3337fbd2741d824292eb61bc9a Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Tue, 30 Oct 2018 13:41:18 +0100
Subject: [PATCH 68/77] SIMD Basic: refactor get low/high half elements for
 ModMul

---
 src/simd_basic.h | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/src/simd_basic.h b/src/simd_basic.h
index b6bd7c80..92382217 100644
--- a/src/simd_basic.h
+++ b/src/simd_basic.h
@@ -62,6 +62,19 @@ inline VecType CardMinusOne<uint32_t>(uint32_t q)
     return (q == F3) ? F3_MINUS_ONE_U32 : F4_MINUS_ONE_U32;
 }
 
+template <typename T>
+inline VecType GetLowHalf(VecType x, T q)
+{
+    return (q == F3) ? BLEND8(ZERO, x, MASK8_LO) : BLEND16(ZERO, x, 0x55);
+}
+
+template <typename T>
+inline VecType GetHighHalf(VecType x, T q)
+{
+    return (q == F3) ? BLEND8(ZERO, SHIFTR(x, 1), MASK8_LO)
+                     : BLEND16(ZERO, SHIFTR(x, 2), 0x55);
+}
+
 /* ================= Basic Operations ================= */
 
 /**
@@ -123,10 +136,8 @@ template <typename T>
 inline VecType ModMul(VecType x, VecType y, T q)
 {
     const VecType res = Mul<T>(x, y);
-    const VecType lo =
-        (q == F3) ? BLEND8(ZERO, res, MASK8_LO) : BLEND16(ZERO, res, 0x55);
-    const VecType hi = (q == F3) ? BLEND8(ZERO, SHIFTR(res, 1), MASK8_LO)
-                                 : BLEND16(ZERO, SHIFTR(res, 2), 0x55);
+    const VecType lo = GetLowHalf(res, q);
+    const VecType hi = GetHighHalf(res, q);
     return ModSub(lo, hi, q);
 }
 

From 0c275e43126d0393192e97cd8a2ca39791c94801 Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Wed, 31 Oct 2018 15:30:33 +0100
Subject: [PATCH 69/77] Core includes only SIMD's allocator

It moves the DoubleSizeVal and SignedDoubleSizeVal type aliases from arith.h to core.h.
---
 src/arith.h | 6 ------
 src/core.h  | 8 +++++++-
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/arith.h b/src/arith.h
index 1da85320..9b230677 100644
--- a/src/arith.h
+++ b/src/arith.h
@@ -41,12 +41,6 @@
 
 namespace quadiron {
 
-template <typename T>
-using DoubleSizeVal = typename DoubleSize<T>::T;
-
-template <typename T>
-using SignedDoubleSizeVal = typename SignedDoubleSize<T>::T;
-
 /** Base/core arithmetical functions of QuadIron. */
 namespace arith {
 
diff --git a/src/core.h b/src/core.h
index 5eaf84fe..a9033f90 100644
--- a/src/core.h
+++ b/src/core.h
@@ -34,7 +34,7 @@
 #include <random>
 
 #include "big_int.h"
-#include "simd/simd.h"
+#include "simd/allocator.h"
 
 namespace quadiron {
 
@@ -78,6 +78,12 @@ struct SignedDoubleSize<__uint128_t> {
     typedef Int256 T;
 };
 
+template <typename T>
+using DoubleSizeVal = typename DoubleSize<T>::T;
+
+template <typename T>
+using SignedDoubleSizeVal = typename SignedDoubleSize<T>::T;
+
 /** A group of values stored as one.
  *
  * This allows faster processing, as the values can be processed as one.

From c72dd8af0cfe4c50b4942944545e1bf140eb5944 Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Wed, 31 Oct 2018 15:35:45 +0100
Subject: [PATCH 70/77] SIMD: update simd header

---
 src/simd/simd.h | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/src/simd/simd.h b/src/simd/simd.h
index 9cdcd251..ad02f3fc 100644
--- a/src/simd/simd.h
+++ b/src/simd/simd.h
@@ -31,6 +31,8 @@
 #ifndef __QUAD_SIMD_SIMD_H__
 #define __QUAD_SIMD_SIMD_H__
 
+#include "property.h"
+
 #include "simd/allocator.h"
 #include "simd/definitions.h"
 
@@ -57,4 +59,31 @@ static constexpr std::size_t countof()
 } // namespace simd
 } // namespace quadiron
 
+#ifdef QUADIRON_USE_SIMD
+
+const unsigned F4 = 65537;
+const unsigned F3 = 257;
+
+// Include essential operations that use SIMD functions
+#if defined(__AVX2__)
+
+#include "simd_256.h"
+
+#elif defined(__SSE4_1__)
+
+#include "simd_128.h"
+
+#endif
+
+// Include basic operations
+#include "simd_basic.h"
+
+// Include accelerated operations dedicated for FNT
+#include "simd_fnt.h"
+
+// Include accelerated operations dedicated for NF4
+#include "simd_nf4.h"
+
+#endif // #ifdef QUADIRON_USE_SIMD
+
 #endif

From ce91fb25ee470a594cef147931540e0ecaf29137 Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Wed, 31 Oct 2018 15:35:58 +0100
Subject: [PATCH 71/77] Remove simd.h

---
 src/simd.h | 73 ------------------------------------------------------
 1 file changed, 73 deletions(-)
 delete mode 100644 src/simd.h

diff --git a/src/simd.h b/src/simd.h
deleted file mode 100644
index 41e4935e..00000000
--- a/src/simd.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright 2017-2018 Scality
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice,
- *    this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- *    this list of conditions and the following disclaimer in the documentation
- *    and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its contributors
- *    may be used to endorse or promote products derived from this software
- *    without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef __QUAD_SIMD_H__
-#define __QUAD_SIMD_H__
-
-#ifdef QUADIRON_USE_SIMD
-
-#include "property.h"
-#include "simd/simd.h"
-
-const unsigned F4 = 65537;
-const unsigned F3 = 257;
-
-namespace quadiron {
-/** The namespace simd contains functions accelerated by
- *  using SIMD operations over 128bits and 256bits
- *
- *  It supports operations on 16-bit and 32-bit numbers
- */
-namespace simd {
-
-// Vectorized operations are implemented in appropriated headers simd*.h
-
-} // namespace simd
-} // namespace quadiron
-
-// Include essential operations that use SIMD functions
-#if defined(__AVX2__)
-#include "simd_256.h"
-#elif defined(__SSE4_1__)
-#include "simd_128.h"
-#endif
-
-// Include basic operations
-#include "simd_basic.h"
-
-// Include accelerated operations dedicated for FNT
-#include "simd_fnt.h"
-
-// Include accelerated operations dedicated for NF4
-#include "simd_nf4.h"
-
-#endif // #ifdef QUADIRON_USE_SIMD
-
-#endif

From bb838aedf44a7359436d0bd2d5b50155f94c1494 Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Wed, 31 Oct 2018 15:38:58 +0100
Subject: [PATCH 72/77] SIMD: move simd_* header to simd dir

---
 src/{ => simd}/simd_128.h   | 0
 src/{ => simd}/simd_256.h   | 0
 src/{ => simd}/simd_basic.h | 0
 src/{ => simd}/simd_fnt.h   | 0
 src/{ => simd}/simd_nf4.h   | 0
 5 files changed, 0 insertions(+), 0 deletions(-)
 rename src/{ => simd}/simd_128.h (100%)
 rename src/{ => simd}/simd_256.h (100%)
 rename src/{ => simd}/simd_basic.h (100%)
 rename src/{ => simd}/simd_fnt.h (100%)
 rename src/{ => simd}/simd_nf4.h (100%)

diff --git a/src/simd_128.h b/src/simd/simd_128.h
similarity index 100%
rename from src/simd_128.h
rename to src/simd/simd_128.h
diff --git a/src/simd_256.h b/src/simd/simd_256.h
similarity index 100%
rename from src/simd_256.h
rename to src/simd/simd_256.h
diff --git a/src/simd_basic.h b/src/simd/simd_basic.h
similarity index 100%
rename from src/simd_basic.h
rename to src/simd/simd_basic.h
diff --git a/src/simd_fnt.h b/src/simd/simd_fnt.h
similarity index 100%
rename from src/simd_fnt.h
rename to src/simd/simd_fnt.h
diff --git a/src/simd_nf4.h b/src/simd/simd_nf4.h
similarity index 100%
rename from src/simd_nf4.h
rename to src/simd/simd_nf4.h

From 97d9cf88b37a33dd94ddaab09565bedc9e6aa107 Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Wed, 31 Oct 2018 15:39:58 +0100
Subject: [PATCH 73/77] SIMD: remove useless included headers

---
 src/simd/simd_128.h   | 2 --
 src/simd/simd_256.h   | 2 --
 src/simd/simd_basic.h | 2 --
 src/simd/simd_fnt.h   | 2 --
 src/simd/simd_nf4.h   | 4 ----
 5 files changed, 12 deletions(-)

diff --git a/src/simd/simd_128.h b/src/simd/simd_128.h
index 6cfbc8e5..bb33ee4f 100644
--- a/src/simd/simd_128.h
+++ b/src/simd/simd_128.h
@@ -31,8 +31,6 @@
 #ifndef __QUAD_SIMD_128_H__
 #define __QUAD_SIMD_128_H__
 
-#include <x86intrin.h>
-
 namespace quadiron {
 namespace simd {
 
diff --git a/src/simd/simd_256.h b/src/simd/simd_256.h
index 8b9ae688..0723e80f 100644
--- a/src/simd/simd_256.h
+++ b/src/simd/simd_256.h
@@ -31,8 +31,6 @@
 #ifndef __QUAD_SIMD_256_H__
 #define __QUAD_SIMD_256_H__
 
-#include <x86intrin.h>
-
 /* GCC doesn't include the split store intrinsics so define them here. */
 #if defined(__GNUC__) && !defined(__clang__)
 
diff --git a/src/simd/simd_basic.h b/src/simd/simd_basic.h
index 92382217..ab2301ad 100644
--- a/src/simd/simd_basic.h
+++ b/src/simd/simd_basic.h
@@ -31,8 +31,6 @@
 #ifndef __QUAD_SIMD_BASIC_H__
 #define __QUAD_SIMD_BASIC_H__
 
-#include <x86intrin.h>
-
 namespace quadiron {
 namespace simd {
 
diff --git a/src/simd/simd_fnt.h b/src/simd/simd_fnt.h
index 885a3d25..97467050 100644
--- a/src/simd/simd_fnt.h
+++ b/src/simd/simd_fnt.h
@@ -31,8 +31,6 @@
 #ifndef __QUAD_SIMD_FNT_H__
 #define __QUAD_SIMD_FNT_H__
 
-#include <x86intrin.h>
-
 namespace quadiron {
 namespace simd {
 
diff --git a/src/simd/simd_nf4.h b/src/simd/simd_nf4.h
index 25b435c0..8c3c1d92 100644
--- a/src/simd/simd_nf4.h
+++ b/src/simd/simd_nf4.h
@@ -31,10 +31,6 @@
 #ifndef __QUAD_SIMD_NF4_H__
 #define __QUAD_SIMD_NF4_H__
 
-#include <x86intrin.h>
-
-#include <simd/simd.h>
-
 namespace quadiron {
 namespace simd {
 

From 4b26041a29fa2b0b67f39c4c1505b84fa666c159 Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Wed, 31 Oct 2018 15:45:12 +0100
Subject: [PATCH 74/77] Buffers includes only SIMD's allocator

---
 src/vec_buffers.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/vec_buffers.h b/src/vec_buffers.h
index d31c69d3..2122d300 100644
--- a/src/vec_buffers.h
+++ b/src/vec_buffers.h
@@ -38,7 +38,7 @@
 #include <vector>
 
 #include "core.h"
-#include "simd/simd.h"
+#include "simd/allocator.h"
 
 namespace quadiron {
 namespace vec {

From cac0b66dea742a9b9a761aa037d79212e628cd51 Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Wed, 31 Oct 2018 15:47:38 +0100
Subject: [PATCH 75/77] Include new SIMD's header

---
 src/fec_base.h            | 2 +-
 src/fec_vectorisation.cpp | 1 -
 src/fft_2n.cpp            | 2 +-
 src/gf_nf4.cpp            | 2 +-
 src/gf_ring.cpp           | 2 +-
 5 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/fec_base.h b/src/fec_base.h
index 741d480f..0a417155 100644
--- a/src/fec_base.h
+++ b/src/fec_base.h
@@ -51,7 +51,7 @@
 
 #ifdef QUADIRON_USE_SIMD
 
-#include "simd.h"
+#include "simd/simd.h"
 
 #endif // #ifdef QUADIRON_USE_SIMD
 
diff --git a/src/fec_vectorisation.cpp b/src/fec_vectorisation.cpp
index ed82fab8..3900fd6d 100644
--- a/src/fec_vectorisation.cpp
+++ b/src/fec_vectorisation.cpp
@@ -37,7 +37,6 @@
 
 #ifdef QUADIRON_USE_SIMD
 
-#include "simd.h"
 #include "simd/simd.h"
 
 namespace quadiron {
diff --git a/src/fft_2n.cpp b/src/fft_2n.cpp
index f7d91468..6cc1f181 100644
--- a/src/fft_2n.cpp
+++ b/src/fft_2n.cpp
@@ -37,7 +37,7 @@
 
 #ifdef QUADIRON_USE_SIMD
 
-#include "simd.h"
+#include "simd/simd.h"
 
 namespace quadiron {
 namespace fft {
diff --git a/src/gf_nf4.cpp b/src/gf_nf4.cpp
index 9e7fa4dc..ecbf31b7 100644
--- a/src/gf_nf4.cpp
+++ b/src/gf_nf4.cpp
@@ -32,7 +32,7 @@
 
 #ifdef QUADIRON_USE_SIMD
 
-#include "simd.h"
+#include "simd/simd.h"
 
 namespace quadiron {
 namespace gf {
diff --git a/src/gf_ring.cpp b/src/gf_ring.cpp
index da1ed530..9120fe01 100644
--- a/src/gf_ring.cpp
+++ b/src/gf_ring.cpp
@@ -31,7 +31,7 @@
 #include "gf_ring.h"
 
 #ifdef QUADIRON_USE_SIMD
-#include "simd.h"
+
 #include "simd/simd.h"
 
 namespace quadiron {

From 942d47c48b1e61be0e81b6edd8944bef3403dfaf Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Wed, 31 Oct 2018 15:58:18 +0100
Subject: [PATCH 76/77] Include right headers for simd tests

---
 test/simd/test_allocator.cpp   | 2 +-
 test/simd/test_definitions.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/simd/test_allocator.cpp b/test/simd/test_allocator.cpp
index a1d59034..09edb70f 100644
--- a/test/simd/test_allocator.cpp
+++ b/test/simd/test_allocator.cpp
@@ -32,7 +32,7 @@
 
 #include <gtest/gtest.h>
 
-#include "simd/simd.h"
+#include "simd/allocator.h"
 
 namespace simd = quadiron::simd;
 
diff --git a/test/simd/test_definitions.cpp b/test/simd/test_definitions.cpp
index c7a48975..bde45d05 100644
--- a/test/simd/test_definitions.cpp
+++ b/test/simd/test_definitions.cpp
@@ -29,7 +29,7 @@
  */
 #include <gtest/gtest.h>
 
-#include "simd/simd.h"
+#include "simd/definitions.h"
 
 namespace simd = quadiron::simd;
 

From 880881491c3d31b7d7db93420590dc662953b77c Mon Sep 17 00:00:00 2001
From: Lam Pham-Sy <lam.pham-sy@scality.com>
Date: Wed, 31 Oct 2018 15:58:55 +0100
Subject: [PATCH 77/77] SIMD: include headers for simd tests

---
 src/simd/simd.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/simd/simd.h b/src/simd/simd.h
index ad02f3fc..372931cc 100644
--- a/src/simd/simd.h
+++ b/src/simd/simd.h
@@ -31,7 +31,9 @@
 #ifndef __QUAD_SIMD_SIMD_H__
 #define __QUAD_SIMD_SIMD_H__
 
+#include "core.h"
 #include "property.h"
+#include "vec_buffers.h"
 
 #include "simd/allocator.h"
 #include "simd/definitions.h"