diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index aff9c341..3ff590af 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -31,6 +31,7 @@ include(GNUInstallDirs) set(LIB_SRC ${SOURCE_DIR}/core.cpp ${SOURCE_DIR}/fec_vectorisation.cpp + ${SOURCE_DIR}/fft_2n.cpp ${SOURCE_DIR}/misc.cpp ${SOURCE_DIR}/gf_nf4.cpp ${SOURCE_DIR}/gf_ring.cpp diff --git a/src/arith.h b/src/arith.h index 1da85320..9b230677 100644 --- a/src/arith.h +++ b/src/arith.h @@ -41,12 +41,6 @@ namespace quadiron { -template -using DoubleSizeVal = typename DoubleSize::T; - -template -using SignedDoubleSizeVal = typename SignedDoubleSize::T; - /** Base/core arithmetical functions of QuadIron. */ namespace arith { diff --git a/src/core.h b/src/core.h index 5eaf84fe..a9033f90 100644 --- a/src/core.h +++ b/src/core.h @@ -34,7 +34,7 @@ #include #include "big_int.h" -#include "simd/simd.h" +#include "simd/allocator.h" namespace quadiron { @@ -78,6 +78,12 @@ struct SignedDoubleSize<__uint128_t> { typedef Int256 T; }; +template +using DoubleSizeVal = typename DoubleSize::T; + +template +using SignedDoubleSizeVal = typename SignedDoubleSize::T; + /** A group of values stored as one. * * This allows faster processing, as the values can be processed as one. 
diff --git a/src/fec_base.h b/src/fec_base.h index 13ac78d9..0a417155 100644 --- a/src/fec_base.h +++ b/src/fec_base.h @@ -51,7 +51,7 @@ #ifdef QUADIRON_USE_SIMD -#include "simd.h" +#include "simd/simd.h" #endif // #ifdef QUADIRON_USE_SIMD @@ -74,8 +74,6 @@ static inline uint64_t hrtime_usec(timeval begin) return 1000000 * (tv.tv_sec - begin.tv_sec) + tv.tv_usec - begin.tv_usec; } -#define OOR_MARK 1 - enum class FecType { /** Systematic code * diff --git a/src/fec_rs_fnt.h b/src/fec_rs_fnt.h index 4bdd5475..55ffb487 100644 --- a/src/fec_rs_fnt.h +++ b/src/fec_rs_fnt.h @@ -60,6 +60,11 @@ class RsFnt : public FecCode { // decoding context used in encoding of systematic FNT std::unique_ptr> enc_context; + // Indices used for accelerated functions + size_t simd_vec_len; + size_t simd_trailing_len; + size_t simd_offset; + public: RsFnt( FecType type, @@ -70,6 +75,12 @@ class RsFnt : public FecCode { : FecCode(type, word_size, n_data, n_parities, pkt_size) { this->fec_init(); + + // Indices used for accelerated functions + const unsigned ratio = simd::countof(); + simd_vec_len = this->pkt_size / ratio; + simd_trailing_len = this->pkt_size - simd_vec_len * ratio; + simd_offset = simd_vec_len * ratio; } inline void check_params() override diff --git a/src/fec_vectorisation.cpp b/src/fec_vectorisation.cpp index 2564ed7d..3900fd6d 100644 --- a/src/fec_vectorisation.cpp +++ b/src/fec_vectorisation.cpp @@ -32,12 +32,11 @@ #include "fec_rs_fnt.h" /* - * The file includes vectorized operations used by FEC classes + * The file includes specialized operations used by FEC classes */ #ifdef QUADIRON_USE_SIMD -#include "simd.h" #include "simd/simd.h" namespace quadiron { @@ -53,20 +52,13 @@ void RsFnt::encode_post_process( uint16_t threshold = this->gf->card_minus_one(); unsigned code_len = this->n_outputs; - // number of elements per vector register - unsigned vec_size = simd::countof(); - // number of vector registers per fragment packet - size_t vecs_nb = size / vec_size; - // 
odd number of elements not vectorized - size_t last_len = size - vecs_nb * vec_size; - simd::encode_post_process( - output, props, offset, code_len, threshold, vecs_nb); + output, props, offset, code_len, threshold, simd_vec_len); - if (last_len > 0) { + if (simd_trailing_len > 0) { for (unsigned i = 0; i < code_len; ++i) { uint16_t* chunk = output.get(i); - for (size_t j = vecs_nb * vec_size; j < size; ++j) { + for (size_t j = simd_offset; j < size; ++j) { if (chunk[j] == threshold) { props[i].add(offset + j, OOR_MARK); } @@ -85,20 +77,13 @@ void RsFnt::encode_post_process( const uint32_t threshold = this->gf->card_minus_one(); const unsigned code_len = this->n_outputs; - // number of elements per vector register - const unsigned vec_size = simd::countof(); - // number of vector registers per fragment packet - const size_t vecs_nb = size / vec_size; - // odd number of elements not vectorized - const size_t last_len = size - vecs_nb * vec_size; - simd::encode_post_process( - output, props, offset, code_len, threshold, vecs_nb); + output, props, offset, code_len, threshold, simd_vec_len); - if (last_len > 0) { + if (simd_trailing_len > 0) { for (unsigned i = 0; i < code_len; ++i) { uint32_t* chunk = output.get(i); - for (size_t j = vecs_nb * vec_size; j < size; ++j) { + for (size_t j = simd_offset; j < size; ++j) { if (chunk[j] == threshold) { props[i].add(offset + j, OOR_MARK); } diff --git a/src/fft_2n.cpp b/src/fft_2n.cpp new file mode 100644 index 00000000..6cc1f181 --- /dev/null +++ b/src/fft_2n.cpp @@ -0,0 +1,192 @@ +/* -*- mode: c++ -*- */ +/* + * Copyright 2017-2018 Scality + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "fft_2n.h" + +/* + * The file includes vectorized operations used by Radix2 classes + */ + +#ifdef QUADIRON_USE_SIMD + +#include "simd/simd.h" + +namespace quadiron { +namespace fft { + +template <> +void Radix2::butterfly_ct_two_layers_step( + vec::Buffers& buf, + unsigned start, + unsigned m) +{ + const unsigned coefIndex = start * this->n / m / 2; + const uint16_t r1 = vec_W[coefIndex]; + const uint16_t r2 = vec_W[coefIndex / 2]; + const uint16_t r3 = vec_W[coefIndex / 2 + this->n / 4]; + + // perform vector operations + simd::butterfly_ct_two_layers_step( + buf, r1, r2, r3, start, m, simd_vec_len, card); + + // for last elements, perform as non-SIMD method + if (simd_trailing_len > 0) { + butterfly_ct_two_layers_step_slow(buf, start, m, simd_offset); + } +} + +template <> +void Radix2::butterfly_ct_step( + vec::Buffers& buf, + uint16_t r, + unsigned start, + unsigned m, + unsigned step) +{ + // perform vector operations + simd::butterfly_ct_step(buf, r, start, m, step, simd_vec_len, card); + + // for last elements, perform as non-SIMD method + if (simd_trailing_len > 0) { + butterfly_ct_step_slow(buf, r, start, m, step, simd_offset); + } +} + +template <> +void Radix2::butterfly_gs_step( + vec::Buffers& buf, + uint16_t coef, + unsigned start, + unsigned m, + unsigned step) +{ + // perform vector operations + simd::butterfly_gs_step(buf, coef, start, m, simd_vec_len, card); + + // for last elements, perform as non-SIMD method + if (simd_trailing_len > 0) { + butterfly_gs_step_slow(buf, coef, start, m, step, simd_offset); + } +} + +template <> +void Radix2::butterfly_gs_step_simple( + vec::Buffers& buf, + uint16_t coef, + unsigned start, + unsigned m, + unsigned step) +{ + // perform vector operations + simd::butterfly_gs_step_simple(buf, coef, start, m, simd_vec_len, card); + + // for last elements, perform as non-SIMD method + if (simd_trailing_len > 0) { + butterfly_gs_step_simple_slow(buf, coef, start, m, step, simd_offset); + } +} + 
+template <> +void Radix2::butterfly_ct_two_layers_step( + vec::Buffers& buf, + unsigned start, + unsigned m) +{ + const unsigned coefIndex = start * this->n / m / 2; + const uint32_t r1 = vec_W[coefIndex]; + const uint32_t r2 = vec_W[coefIndex / 2]; + const uint32_t r3 = vec_W[coefIndex / 2 + this->n / 4]; + + // perform vector operations + simd::butterfly_ct_two_layers_step( + buf, r1, r2, r3, start, m, simd_vec_len, card); + + // for last elements, perform as non-SIMD method + if (simd_trailing_len > 0) { + butterfly_ct_two_layers_step_slow(buf, start, m, simd_offset); + } +} + +template <> +void Radix2::butterfly_ct_step( + vec::Buffers& buf, + uint32_t r, + unsigned start, + unsigned m, + unsigned step) +{ + // perform vector operations + simd::butterfly_ct_step(buf, r, start, m, step, simd_vec_len, card); + + // for last elements, perform as non-SIMD method + if (simd_trailing_len > 0) { + butterfly_ct_step_slow(buf, r, start, m, step, simd_offset); + } +} + +template <> +void Radix2::butterfly_gs_step( + vec::Buffers& buf, + uint32_t coef, + unsigned start, + unsigned m, + unsigned step) +{ + // perform vector operations + simd::butterfly_gs_step(buf, coef, start, m, simd_vec_len, card); + + // for last elements, perform as non-SIMD method + if (simd_trailing_len > 0) { + butterfly_gs_step_slow(buf, coef, start, m, step, simd_offset); + } +} + +template <> +void Radix2::butterfly_gs_step_simple( + vec::Buffers& buf, + uint32_t coef, + unsigned start, + unsigned m, + unsigned step) +{ + // perform vector operations + simd::butterfly_gs_step_simple(buf, coef, start, m, simd_vec_len, card); + + // for last elements, perform as non-SIMD method + if (simd_trailing_len > 0) { + butterfly_gs_step_simple_slow(buf, coef, start, m, step, simd_offset); + } +} + +} // namespace fft +} // namespace quadiron + +#endif // #ifdef QUADIRON_USE_SIMD diff --git a/src/fft_2n.h b/src/fft_2n.h index f3da7174..4af68528 100644 --- a/src/fft_2n.h +++ b/src/fft_2n.h @@ -112,6 +112,11 
@@ class Radix2 : public FourierTransform { unsigned step); // Only used for non-vectorized elements + void butterfly_ct_two_layers_step_slow( + vec::Buffers& buf, + unsigned start, + unsigned m, + size_t offset = 0); void butterfly_ct_step_slow( vec::Buffers& buf, T coef, @@ -142,6 +147,11 @@ class Radix2 : public FourierTransform { size_t pkt_size; size_t buf_size; + // Indices used for accelerated functions + size_t simd_vec_len; + size_t simd_trailing_len; + size_t simd_offset; + std::unique_ptr rev = nullptr; std::unique_ptr> W = nullptr; std::unique_ptr> inv_W = nullptr; @@ -182,6 +192,12 @@ Radix2::Radix2(const gf::Field& gf, int n, int data_len, size_t pkt_size) rev = std::unique_ptr(new T[n]); init_bitrev(); + + // Indices used for accelerated functions + const unsigned ratio = simd::countof(); + simd_vec_len = this->pkt_size / ratio; + simd_trailing_len = this->pkt_size - simd_vec_len * ratio; + simd_offset = simd_vec_len * ratio; } template @@ -421,6 +437,16 @@ void Radix2::butterfly_ct_two_layers_step( vec::Buffers& buf, unsigned start, unsigned m) +{ + butterfly_ct_two_layers_step_slow(buf, start, m); +} + +template +void Radix2::butterfly_ct_two_layers_step_slow( + vec::Buffers& buf, + unsigned start, + unsigned m, + size_t offset) { const unsigned step = m << 2; // --------- @@ -428,18 +454,18 @@ void Radix2::butterfly_ct_two_layers_step( // --------- const T r1 = W->get(start * this->n / m / 2); // first pair - butterfly_ct_step(buf, r1, start, m, step); + butterfly_ct_step_slow(buf, r1, start, m, step, offset); // second pair - butterfly_ct_step(buf, r1, start + 2 * m, m, step); + butterfly_ct_step_slow(buf, r1, start + 2 * m, m, step, offset); // --------- // Second layer // --------- // first pair const T r2 = W->get(start * this->n / m / 4); - butterfly_ct_step(buf, r2, start, 2 * m, step); + butterfly_ct_step_slow(buf, r2, start, 2 * m, step, offset); // second pair const T r3 = W->get((start + m) * this->n / m / 4); - butterfly_ct_step(buf, 
r3, start + m, 2 * m, step); + butterfly_ct_step_slow(buf, r3, start + m, 2 * m, step, offset); } template @@ -602,6 +628,65 @@ void Radix2::ifft(vec::Buffers& output, vec::Buffers& input) this->gf->mul_vec_to_vecp(*(this->vec_inv_n), output, output); } +#ifdef QUADIRON_USE_SIMD + +/* Operations are vectorized by SIMD */ +template <> +void Radix2::butterfly_ct_two_layers_step( + vec::Buffers& buf, + unsigned start, + unsigned m); +template <> +void Radix2::butterfly_ct_step( + vec::Buffers& buf, + uint16_t r, + unsigned start, + unsigned m, + unsigned step); +template <> +void Radix2::butterfly_gs_step( + vec::Buffers& buf, + uint16_t coef, + unsigned start, + unsigned m, + unsigned step); +template <> +void Radix2::butterfly_gs_step_simple( + vec::Buffers& buf, + uint16_t coef, + unsigned start, + unsigned m, + unsigned step); + +template <> +void Radix2::butterfly_ct_two_layers_step( + vec::Buffers& buf, + unsigned start, + unsigned m); +template <> +void Radix2::butterfly_ct_step( + vec::Buffers& buf, + uint32_t r, + unsigned start, + unsigned m, + unsigned step); +template <> +void Radix2::butterfly_gs_step( + vec::Buffers& buf, + uint32_t coef, + unsigned start, + unsigned m, + unsigned step); +template <> +void Radix2::butterfly_gs_step_simple( + vec::Buffers& buf, + uint32_t coef, + unsigned start, + unsigned m, + unsigned step); + +#endif // #ifdef QUADIRON_USE_SIMD + } // namespace fft } // namespace quadiron diff --git a/src/gf_nf4.cpp b/src/gf_nf4.cpp index 9e7fa4dc..ecbf31b7 100644 --- a/src/gf_nf4.cpp +++ b/src/gf_nf4.cpp @@ -32,7 +32,7 @@ #ifdef QUADIRON_USE_SIMD -#include "simd.h" +#include "simd/simd.h" namespace quadiron { namespace gf { diff --git a/src/gf_ring.cpp b/src/gf_ring.cpp index da1ed530..9120fe01 100644 --- a/src/gf_ring.cpp +++ b/src/gf_ring.cpp @@ -31,7 +31,7 @@ #include "gf_ring.h" #ifdef QUADIRON_USE_SIMD -#include "simd.h" + #include "simd/simd.h" namespace quadiron { diff --git a/src/property.h b/src/property.h index 
766acea7..2ecdc65e 100644 --- a/src/property.h +++ b/src/property.h @@ -40,6 +40,8 @@ namespace quadiron { +#define OOR_MARK 1 + /** Ancillary data attached to values. * * A property carries extra-information (whose interpretation is left to the diff --git a/src/simd.h b/src/simd.h deleted file mode 100644 index 8309bfff..00000000 --- a/src/simd.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright 2017-2018 Scality - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ - -#ifndef __QUAD_SIMD_H__ -#define __QUAD_SIMD_H__ - -#ifdef QUADIRON_USE_SIMD - -#include "property.h" -#include "simd/simd.h" - -const unsigned F4 = 65537; -const unsigned F3 = 257; - -typedef uint8_t aint8 __attribute__((aligned(quadiron::simd::ALIGNMENT))); -typedef uint16_t aint16 __attribute__((aligned(quadiron::simd::ALIGNMENT))); -typedef uint32_t aint32 __attribute__((aligned(quadiron::simd::ALIGNMENT))); -typedef uint64_t aint64 __attribute__((aligned(quadiron::simd::ALIGNMENT))); -typedef __uint128_t aint128 __attribute__((aligned(quadiron::simd::ALIGNMENT))); - -namespace quadiron { -/** The namespace simd contains functions for GF-NF4 that are accelerated by - * using SIMD operations over 128bits - * - * It supports operations on 32-bit numbers - */ -namespace simd { - -// Vectorized operations are implemented in appropriated headers simd*.h - -} // namespace simd -} // namespace quadiron - -#if defined(__AVX2__) -#include "simd_256.h" -#elif defined(__SSE4_1__) -#include "simd_128.h" -#endif - -#include "simd_nf4.h" - -#endif // #ifdef QUADIRON_USE_SIMD - -#endif diff --git a/src/simd/simd.h b/src/simd/simd.h index 9cdcd251..372931cc 100644 --- a/src/simd/simd.h +++ b/src/simd/simd.h @@ -31,6 +31,10 @@ #ifndef __QUAD_SIMD_SIMD_H__ #define __QUAD_SIMD_SIMD_H__ +#include "core.h" +#include "property.h" +#include "vec_buffers.h" + #include "simd/allocator.h" #include "simd/definitions.h" @@ -57,4 +61,31 @@ static constexpr std::size_t countof() } // namespace simd } // namespace quadiron +#ifdef QUADIRON_USE_SIMD + +const unsigned F4 = 65537; +const unsigned F3 = 257; + +// Include essential operations that use SIMD functions +#if defined(__AVX2__) + +#include "simd_256.h" + +#elif defined(__SSE4_1__) + +#include "simd_128.h" + +#endif + +// Include basic operations +#include "simd_basic.h" + +// Include accelerated operations dedicated for FNT +#include "simd_fnt.h" + +// Include accelerated operations dedicated for NF4 +#include "simd_nf4.h" + 
+#endif // #ifdef QUADIRON_USE_SIMD + #endif diff --git a/src/simd/simd_128.h b/src/simd/simd_128.h new file mode 100644 index 00000000..bb33ee4f --- /dev/null +++ b/src/simd/simd_128.h @@ -0,0 +1,174 @@ +/* + * Copyright 2017-2018 Scality + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __QUAD_SIMD_128_H__ +#define __QUAD_SIMD_128_H__ + +namespace quadiron { +namespace simd { + +typedef __m128i VecType; + +/* ============= Constant variable ============ */ + +#define F4_U32 _mm_set1_epi32(65537) +#define F4_MINUS_ONE_U32 _mm_set1_epi32(65536) +#define F3_U32 _mm_set1_epi32(257) +#define F3_MINUS_ONE_U32 _mm_set1_epi32(256) + +#define F3_U16 _mm_set1_epi16(257) +#define F3_MINUS_ONE_U16 _mm_set1_epi16(256) + +#define ZERO (_mm_setzero_si128()) +#define ONE_U16 (_mm_set1_epi16(1)) +#define ONE_U32 (_mm_set1_epi32(1)) + +#define MASK8_LO (_mm_set1_epi16(0x80)) + +/* ============= Essential Operations for SSE w/ both u16 & u32 ============ */ + +inline VecType LoadToReg(VecType* address) +{ + return _mm_load_si128(address); +} +inline void StoreToMem(VecType* address, VecType reg) +{ + _mm_store_si128(address, reg); +} + +inline VecType And(VecType x, VecType y) +{ + return _mm_and_si128(x, y); +} +inline VecType Xor(VecType x, VecType y) +{ + return _mm_xor_si128(x, y); +} +inline uint16_t Msb8Mask(VecType x) +{ + return _mm_movemask_epi8(x); +} +inline uint16_t AndIsZero(VecType x, VecType y) +{ + return _mm_testz_si128(x, y); +} +inline int IsZero(VecType x) +{ + return _mm_testc_si128(ZERO, x); +} + +#define SHIFTR(x, imm8) (_mm_srli_si128(x, imm8)) +#define BLEND8(x, y, mask) (_mm_blendv_epi8(x, y, mask)) +#define BLEND16(x, y, imm8) (_mm_blend_epi16(x, y, imm8)) + +/* ================= Essential Operations for SSE ================= */ + +template +inline VecType SetOne(T val); +template <> +inline VecType SetOne(uint32_t val) +{ + return _mm_set1_epi32(val); +} +template <> +inline VecType SetOne(uint16_t val) +{ + return _mm_set1_epi16(val); +} + +template +inline VecType Add(VecType x, VecType y); +template <> +inline VecType Add(VecType x, VecType y) +{ + return _mm_add_epi32(x, y); +} +template <> +inline VecType Add(VecType x, VecType y) +{ + return _mm_add_epi16(x, y); +} + +template +inline VecType Sub(VecType x, VecType 
y); +template <> +inline VecType Sub(VecType x, VecType y) +{ + return _mm_sub_epi32(x, y); +} +template <> +inline VecType Sub(VecType x, VecType y) +{ + return _mm_sub_epi16(x, y); +} + +template +inline VecType Mul(VecType x, VecType y); +template <> +inline VecType Mul(VecType x, VecType y) +{ + return _mm_mullo_epi32(x, y); +} +template <> +inline VecType Mul(VecType x, VecType y) +{ + return _mm_mullo_epi16(x, y); +} + +template +inline VecType CompareEq(VecType x, VecType y); +template <> +inline VecType CompareEq(VecType x, VecType y) +{ + return _mm_cmpeq_epi32(x, y); +} +template <> +inline VecType CompareEq(VecType x, VecType y) +{ + return _mm_cmpeq_epi16(x, y); +} + +template +inline VecType Min(VecType x, VecType y); +template <> +inline VecType Min(VecType x, VecType y) +{ + return _mm_min_epu32(x, y); +} +template <> +inline VecType Min(VecType x, VecType y) +{ + return _mm_min_epu16(x, y); +} + +} // namespace simd +} // namespace quadiron + +#endif diff --git a/src/simd/simd_256.h b/src/simd/simd_256.h new file mode 100644 index 00000000..0723e80f --- /dev/null +++ b/src/simd/simd_256.h @@ -0,0 +1,187 @@ +/* + * Copyright 2017-2018 Scality + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __QUAD_SIMD_256_H__ +#define __QUAD_SIMD_256_H__ + +/* GCC doesn't include the split store intrinsics so define them here. */ +#if defined(__GNUC__) && !defined(__clang__) + +static inline void __attribute__((__always_inline__)) +_mm256_storeu2_m128i(__m128i* const hi, __m128i* const lo, const __m256i a) +{ + _mm_storeu_si128(lo, _mm256_castsi256_si128(a)); + _mm_storeu_si128(hi, _mm256_extracti128_si256(a, 1)); +} + +#endif /* defined(__GNUC__) */ + +namespace quadiron { +namespace simd { + +typedef __m256i VecType; +typedef __m128i HalfVecType; + +/* ============= Constant variable ============ */ + +#define F4_U32 _mm256_set1_epi32(65537) +#define F4_MINUS_ONE_U32 _mm256_set1_epi32(65536) +#define F3_U32 _mm256_set1_epi32(257) +#define F3_MINUS_ONE_U32 _mm256_set1_epi32(256) + +#define F3_U16 _mm256_set1_epi16(257) +#define F3_MINUS_ONE_U16 _mm256_set1_epi16(256) + +#define ZERO (_mm256_setzero_si256()) +#define ONE_U16 (_mm256_set1_epi16(1)) +#define ONE_U32 (_mm256_set1_epi32(1)) + +#define MASK8_LO (_mm256_set1_epi16(0x80)) + +/* ============= Essential Operations for AVX2 w/ both u16 & u32 ============ */ + +inline VecType LoadToReg(VecType* address) +{ 
+ return _mm256_load_si256(address); +} +inline void StoreToMem(VecType* address, VecType reg) +{ + _mm256_store_si256(address, reg); +} + +inline VecType And(VecType x, VecType y) +{ + return _mm256_and_si256(x, y); +} +inline VecType Xor(VecType x, VecType y) +{ + return _mm256_xor_si256(x, y); +} +inline uint32_t Msb8Mask(VecType x) +{ + return _mm256_movemask_epi8(x); +} +inline uint32_t AndIsZero(VecType x, VecType y) +{ + return _mm256_testz_si256(x, y); +} +inline int IsZero(VecType x) +{ + return _mm256_testc_si256(ZERO, x); +} + +#define SHIFTR(x, imm8) (_mm256_srli_si256(x, imm8)) +#define BLEND8(x, y, mask) (_mm256_blendv_epi8(x, y, mask)) +#define BLEND16(x, y, imm8) (_mm256_blend_epi16(x, y, imm8)) + +/* ================= Essential Operations for AVX2 ================= */ + +template +inline VecType SetOne(T val); +template <> +inline VecType SetOne(uint32_t val) +{ + return _mm256_set1_epi32(val); +} +template <> +inline VecType SetOne(uint16_t val) +{ + return _mm256_set1_epi16(val); +} + +template +inline VecType Add(VecType x, VecType y); +template <> +inline VecType Add(VecType x, VecType y) +{ + return _mm256_add_epi32(x, y); +} +template <> +inline VecType Add(VecType x, VecType y) +{ + return _mm256_add_epi16(x, y); +} + +template +inline VecType Sub(VecType x, VecType y); +template <> +inline VecType Sub(VecType x, VecType y) +{ + return _mm256_sub_epi32(x, y); +} +template <> +inline VecType Sub(VecType x, VecType y) +{ + return _mm256_sub_epi16(x, y); +} + +template +inline VecType Mul(VecType x, VecType y); +template <> +inline VecType Mul(VecType x, VecType y) +{ + return _mm256_mullo_epi32(x, y); +} +template <> +inline VecType Mul(VecType x, VecType y) +{ + return _mm256_mullo_epi16(x, y); +} + +template +inline VecType CompareEq(VecType x, VecType y); +template <> +inline VecType CompareEq(VecType x, VecType y) +{ + return _mm256_cmpeq_epi32(x, y); +} +template <> +inline VecType CompareEq(VecType x, VecType y) +{ + return 
_mm256_cmpeq_epi16(x, y); +} + +template +inline VecType Min(VecType x, VecType y); +template <> +inline VecType Min(VecType x, VecType y) +{ + return _mm256_min_epu32(x, y); +} +template <> +inline VecType Min(VecType x, VecType y) +{ + return _mm256_min_epu16(x, y); +} + +} // namespace simd +} // namespace quadiron + +#endif diff --git a/src/simd/simd_basic.h b/src/simd/simd_basic.h new file mode 100644 index 00000000..ab2301ad --- /dev/null +++ b/src/simd/simd_basic.h @@ -0,0 +1,331 @@ +/* + * Copyright 2017-2018 Scality + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __QUAD_SIMD_BASIC_H__ +#define __QUAD_SIMD_BASIC_H__ + +namespace quadiron { +namespace simd { + +template +inline VecType Card(T q); +template <> +inline VecType Card(uint16_t q) +{ + return F3_U16; +} +template <> +inline VecType Card(uint32_t q) +{ + return (q == F3) ? F3_U32 : F4_U32; +} + +template +inline VecType CardMinusOne(T q); +template <> +inline VecType CardMinusOne(uint16_t q) +{ + return F3_MINUS_ONE_U16; +} +template <> +inline VecType CardMinusOne(uint32_t q) +{ + return (q == F3) ? F3_MINUS_ONE_U32 : F4_MINUS_ONE_U32; +} + +template +inline VecType GetLowHalf(VecType x, T q) +{ + return (q == F3) ? BLEND8(ZERO, x, MASK8_LO) : BLEND16(ZERO, x, 0x55); +} + +template +inline VecType GetHighHalf(VecType x, T q) +{ + return (q == F3) ? 
BLEND8(ZERO, SHIFTR(x, 1), MASK8_LO) + : BLEND16(ZERO, SHIFTR(x, 2), 0x55); +} + +/* ================= Basic Operations ================= */ + +/** + * Modular addition + * + * @param x input register + * @param y input register + * @param q modulo + * @return (x + y) mod q + */ +template +inline VecType ModAdd(VecType x, VecType y, T q) +{ + const VecType res = Add(x, y); + return Min(res, Sub(res, Card(q))); +} + +/** + * Modular subtraction for packed unsigned 32-bit integers + * + * @param x input register + * @param y input register + * @param q modulo + * @return (x - y) mod q + */ +template +inline VecType ModSub(VecType x, VecType y, T q) +{ + const VecType res = Sub(x, y); + return Min(res, Add(res, Card(q))); +} + +/** + * Modular negation for packed unsigned 32-bit integers + * + * @param x input register + * @param q modulo + * @return (-x) mod q + */ +template +inline VecType ModNeg(VecType x, T q) +{ + const VecType res = Sub(Card(q), x); + return Min(res, Sub(res, Card(q))); +} + +/** + * Modular multiplication for packed unsigned 32-bit integers + * + * @note We assume that at least `x` or `y` is less than `q-1` so it's + * not necessary to verify overflow on multiplying elements + * + * @param x input register + * @param y input register + * @param q modulo + * @return (x * y) mod q + */ +template +inline VecType ModMul(VecType x, VecType y, T q) +{ + const VecType res = Mul(x, y); + const VecType lo = GetLowHalf(res, q); + const VecType hi = GetHighHalf(res, q); + return ModSub(lo, hi, q); +} + +/** + * Modular general multiplication for packed unsigned 32-bit integers + * + * @note It's necessary to verify overflow on multiplying elements + * + * @param x input register + * @param y input register + * @param q modulo + * @return (x * y) mod q + */ +template +inline VecType ModMulSafe(VecType x, VecType y, T q) +{ + const VecType res = ModMul(x, y, q); + + // filter elements of both of a & b = card-1 + const VecType cmp = + And(CompareEq(x, 
CardMinusOne(q)), CompareEq(y, CardMinusOne(q))); + + if (IsZero(cmp) == 1) { + return res; + } + return (q == F3) ? Xor(res, And(F4_U32, cmp)) + : Add(res, And(ONE_U32, cmp)); +} + +/** + * Update property for a given register for packed unsigned 32-bit integers + * + * @param props properties bound to fragments + * @param threshold register storing max value in its elements + * @param mask a specific mask + * @param symb input register + * @param offset offset in the data fragments + * @param max a dummy variable + */ +template +inline void AddProps( + Properties& props, + VecType threshold, + VecType mask, + VecType symb, + off_t offset, + T max) +{ + const VecType b = CompareEq(threshold, symb); + const VecType c = And(mask, b); + auto d = Msb8Mask(c); + const unsigned element_size = sizeof(T); + while (d > 0) { + const unsigned byte_idx = __builtin_ctz(d); + const size_t _offset = offset + byte_idx / element_size; + props.add(_offset, OOR_MARK); + d ^= 1 << byte_idx; + } +} + +/* ==================== Operations for RingModN =================== */ +/** Perform a multiplication of a coefficient `a` to each element of `src` and + * add result to correspondent element of `dest` + * + * @note: 1 < `a` < card - 1 + */ +template +inline void mul_coef_to_buf(const T a, T* src, T* dest, size_t len, T card) +{ + const VecType coef = SetOne(a); + + VecType* __restrict _src = reinterpret_cast(src); + VecType* __restrict _dest = reinterpret_cast(dest); + const unsigned ratio = sizeof(*_src) / sizeof(*src); + const size_t _len = len / ratio; + const size_t _last_len = len - _len * ratio; + + size_t i = 0; + const size_t end = (_len > 3) ? 
_len - 3 : 0; + for (; i < end; i += 4) { + _dest[i] = ModMul(coef, _src[i], card); + _dest[i + 1] = ModMul(coef, _src[i + 1], card); + _dest[i + 2] = ModMul(coef, _src[i + 2], card); + _dest[i + 3] = ModMul(coef, _src[i + 3], card); + } + for (; i < _len; ++i) { + _dest[i] = ModMul(coef, _src[i], card); + } + + if (_last_len > 0) { + const DoubleSizeVal coef_double = DoubleSizeVal(a); + for (size_t i = _len * ratio; i < len; i++) { + dest[i] = (T)((coef_double * src[i]) % card); + } + } +} + +template +inline void add_two_bufs(T* src, T* dest, size_t len, T card) +{ + VecType* __restrict _src = reinterpret_cast(src); + VecType* __restrict _dest = reinterpret_cast(dest); + const unsigned ratio = sizeof(*_src) / sizeof(*src); + const size_t _len = len / ratio; + const size_t _last_len = len - _len * ratio; + + size_t i; + for (i = 0; i < _len; i++) { + _dest[i] = ModAdd(_src[i], _dest[i], card); + } + if (_last_len > 0) { + for (i = _len * ratio; i < len; i++) { + const T tmp = src[i] + dest[i]; + dest[i] = (tmp >= card) ? 
(tmp - card) : tmp; + } + } +} + +template +inline void sub_two_bufs(T* bufa, T* bufb, T* res, size_t len, T card) +{ + VecType* __restrict _bufa = reinterpret_cast(bufa); + VecType* __restrict _bufb = reinterpret_cast(bufb); + VecType* __restrict _res = reinterpret_cast(res); + const unsigned ratio = sizeof(*_bufa) / sizeof(*bufa); + const size_t _len = len / ratio; + const size_t _last_len = len - _len * ratio; + + size_t i; + for (i = 0; i < _len; i++) { + // perform subtraction + _res[i] = ModSub(_bufa[i], _bufb[i], card); + } + if (_last_len > 0) { + for (i = _len * ratio; i < len; i++) { + // perform subtraction + if (bufa[i] >= bufb[i]) { + res[i] = bufa[i] - bufb[i]; + } else { + res[i] = card - (bufb[i] - bufa[i]); + } + } + } +} + +template +inline void mul_two_bufs(T* src, T* dest, size_t len, T card) +{ + VecType* __restrict _src = reinterpret_cast(src); + VecType* __restrict _dest = reinterpret_cast(dest); + const unsigned ratio = sizeof(*_src) / sizeof(*src); + const size_t _len = len / ratio; + const size_t _last_len = len - _len * ratio; + + size_t i; + for (i = 0; i < _len; i++) { + // perform multiplication + _dest[i] = ModMulSafe(_src[i], _dest[i], card); + } + if (_last_len > 0) { + for (i = _len * ratio; i < len; i++) { + // perform multiplication + dest[i] = T((DoubleSizeVal(src[i]) * dest[i]) % card); + } + } +} + +/** Apply an element-wise negation to a buffer + */ +template +inline void neg(size_t len, T* buf, T card) +{ + VecType* _buf = reinterpret_cast(buf); + const unsigned ratio = sizeof(*_buf) / sizeof(*buf); + const size_t _len = len / ratio; + const size_t _last_len = len - _len * ratio; + + size_t i; + for (i = 0; i < _len; i++) { + _buf[i] = ModNeg(_buf[i], card); + } + if (_last_len > 0) { + for (i = _len * ratio; i < len; i++) { + if (buf[i]) + buf[i] = card - buf[i]; + } + } +} + +} // namespace simd +} // namespace quadiron + +#endif diff --git a/src/simd/simd_fnt.h b/src/simd/simd_fnt.h new file mode 100644 index
00000000..97467050 --- /dev/null +++ b/src/simd/simd_fnt.h @@ -0,0 +1,528 @@ +/* + * Copyright 2017-2018 Scality + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __QUAD_SIMD_FNT_H__ +#define __QUAD_SIMD_FNT_H__ + +namespace quadiron { +namespace simd { + +/* ================= Vectorized Operations ================= */ + +/** + * Butterfly Cooley-Tukey operation + * + * x <- x + r * y + * y <- x - r * y + * + * @param rp1 coefficient `r` plus one + * @param c a register stores coefficient `r` + * @param x working register + * @param y working register + * @param q modular + */ +template +inline void ButterflyCT(T rp1, VecType c, VecType* x, VecType* y, T q) +{ + VecType z = (rp1 == 2) ? *y : ModMul(c, *y, q); + if (rp1 < q) { + *y = ModSub(*x, z, q); + *x = ModAdd(*x, z, q); + } else { // i.e. r == q - 1 + *y = ModAdd(*x, z, q); + *x = ModSub(*x, z, q); + } +} + +/** + * Butterfly Gentleman-Sande operation + * + * x <- x + y + * y <- r * (x - y) + * + * @param rp1 coefficient `r` plus one + * @param c a register stores coefficient `r` + * @param x working register + * @param y working register + * @param q modular + */ +template +inline void ButterflyGS(T rp1, VecType c, VecType* x, VecType* y, T q) +{ + VecType add = ModAdd(*x, *y, q); + if (rp1 == 2) { + *y = ModSub(*x, *y, q); + } else if (rp1 < q) { + VecType sub = ModSub(*x, *y, q); + *y = ModMul(c, sub, q); + } else { // i.e. r == q - 1 + *y = ModSub(*y, *x, q); + } + *x = add; +} + +/** + * Butterfly Gentleman-Sande simple operation where y = 0 + * + * x <- x, i.e.
no operation + * y <- r * x + * + * @param rp1 coefficient `r` plus one + * @param c a register stores coefficient `r` + * @param x working register + * @param q modular + * @return r * x + */ +template +inline VecType ButterflySimpleGS(T rp1, VecType c, VecType x, T q) +{ + if (rp1 == 2) { + return x; + } else if (rp1 < q) { + return ModMul(c, x, q); + } else { + return ModNeg(x, q); + } +} + +/** + * Vectorized butterfly CT step + * + * For each pair (P, Q) = (buf[i], buf[i + m]) for step = 2 * m and coef `r` + * P = P + r * Q + * Q = P - r * Q + * + * @param buf - working buffers + * @param r - coefficient + * @param start - index of buffer among `m` ones + * @param m - current group size + * @param step - next loop + * @param len - number of vectors per buffer + * @param card - modulo cardinal + */ +template +inline void butterfly_ct_step( + vec::Buffers& buf, + T r, + unsigned start, + unsigned m, + unsigned step, + size_t len, + T card) +{ + if (len == 0) { + return; + } + const T rp1 = r + 1; + VecType c = SetOne(r); + + const size_t end = (len > 1) ? 
len - 1 : 0; + const unsigned bufs_nb = buf.get_n(); + const std::vector& mem = buf.get_mem(); + for (unsigned i = start; i < bufs_nb; i += step) { + VecType x1, y1; + VecType x2, y2; + VecType* __restrict p = reinterpret_cast(mem[i]); + VecType* __restrict q = reinterpret_cast(mem[i + m]); + + size_t j = 0; + for (; j < end; j += 2) { + x1 = LoadToReg(p + j); + y1 = LoadToReg(q + j); + + ButterflyCT(rp1, c, &x1, &y1, card); + + x2 = LoadToReg(p + j + 1); + y2 = LoadToReg(q + j + 1); + + ButterflyCT(rp1, c, &x2, &y2, card); + + // Store back to memory + StoreToMem(p + j, x1); + StoreToMem(p + j + 1, x2); + StoreToMem(q + j, y1); + StoreToMem(q + j + 1, y2); + } + for (; j < len; ++j) { + x1 = LoadToReg(p + j); + y1 = LoadToReg(q + j); + + ButterflyCT(rp1, c, &x1, &y1, card); + + // Store back to memory + StoreToMem(p + j, x1); + StoreToMem(q + j, y1); + } + } +} + +template +inline static void do_butterfly_ct_2_layers( + const std::vector& mem, + T r1, + T r2, + T r3, + unsigned start, + unsigned m, + size_t len, + T card) +{ + const T r1p1 = r1 + 1; + const T r2p1 = r2 + 1; + const T r3p1 = r3 + 1; + + VecType c1 = SetOne(r1); + VecType c2 = SetOne(r2); + VecType c3 = SetOne(r3); + + VecType* __restrict p = reinterpret_cast(mem[start]); + VecType* __restrict q = reinterpret_cast(mem[start + m]); + VecType* __restrict r = reinterpret_cast(mem[start + 2 * m]); + VecType* __restrict s = reinterpret_cast(mem[start + 3 * m]); + + size_t j = 0; + const size_t end = (len > 1) ? 
len - 1 : 0; + while (j < end) { + // First layer (c1, x, y) & (c1, u, v) + VecType x1 = LoadToReg(p); + VecType x2 = LoadToReg(p + 1); + VecType y1 = LoadToReg(q); + VecType y2 = LoadToReg(q + 1); + + ButterflyCT(r1p1, c1, &x1, &y1, card); + ButterflyCT(r1p1, c1, &x2, &y2, card); + + VecType u1 = LoadToReg(r); + VecType u2 = LoadToReg(r + 1); + VecType v1 = LoadToReg(s); + VecType v2 = LoadToReg(s + 1); + + ButterflyCT(r1p1, c1, &u1, &v1, card); + ButterflyCT(r1p1, c1, &u2, &v2, card); + + // Second layer (c2, x, u) & (c3, y, v) + ButterflyCT(r2p1, c2, &x1, &u1, card); + ButterflyCT(r2p1, c2, &x2, &u2, card); + + ButterflyCT(r3p1, c3, &y1, &v1, card); + ButterflyCT(r3p1, c3, &y2, &v2, card); + + // Store back to memory + StoreToMem(p, x1); + StoreToMem(p + 1, x2); + StoreToMem(q, y1); + StoreToMem(q + 1, y2); + + StoreToMem(r, u1); + StoreToMem(r + 1, u2); + StoreToMem(s, v1); + StoreToMem(s + 1, v2); + p = p + 2; + q = q + 2; + r = r + 2; + s = s + 2; + j = j + 2; + }; + + for (; j < len; ++j) { + // First layer (c1, x, y) & (c1, u, v) + VecType x1 = LoadToReg(p + j); + VecType y1 = LoadToReg(q + j); + VecType u1 = LoadToReg(r + j); + VecType v1 = LoadToReg(s + j); + + // BUTTERFLY_3_test(c1, &x1, &y1, &u1, &v1, card); + ButterflyCT(r1p1, c1, &x1, &y1, card); + ButterflyCT(r1p1, c1, &u1, &v1, card); + ButterflyCT(r2p1, c2, &x1, &u1, card); + ButterflyCT(r3p1, c3, &y1, &v1, card); + + // Store back to memory + StoreToMem(p + j, x1); + StoreToMem(q + j, y1); + StoreToMem(r + j, u1); + StoreToMem(s + j, v1); + } +} + +/** + * Vectorized butterfly CT on two-layers at a time + * + * For each quadruple + * (P, Q, R, S) = (buf[i], buf[i + m], buf[i + 2 * m], buf[i + 3 * m]) + * First layer: butterfly on (P, Q) and (R, S) for step = 2 * m + * coef r1 = W[start * n / (2 * m)] + * P = P + r1 * Q + * Q = P - r1 * Q + * R = R + r1 * S + * S = R - r1 * S + * Second layer: butterfly on (P, R) and (Q, S) for step = 4 * m + * coef r2 = W[start * n / (4 * m)] + * coef r3 = 
W[(start + m) * n / (4 * m)] + * P = P + r2 * R + * R = P - r2 * R + * Q = Q + r3 * S + * S = Q - r3 * S + * + * @param buf - working buffers + * @param r1 - coefficient for the 1st layer + * @param r2 - 1st coefficient for the 2nd layer + * @param r3 - 2nd coefficient for the 2nd layer + * @param start - index of buffer among `m` ones + * @param m - current group size + * @param len - number of vectors per buffer + * @param card - modulo cardinal + */ +template +inline void butterfly_ct_two_layers_step( + vec::Buffers& buf, + T r1, + T r2, + T r3, + unsigned start, + unsigned m, + size_t len, + T card) +{ + if (len == 0) { + return; + } + const unsigned step = m << 2; + const unsigned bufs_nb = buf.get_n(); + + const std::vector& mem = buf.get_mem(); + for (unsigned i = start; i < bufs_nb; i += step) { + do_butterfly_ct_2_layers(mem, r1, r2, r3, i, m, len, card); + } +} + +/** + * Vectorized butterfly GS step + * + * For each pair (P, Q) = (buf[i], buf[i + m]) for step = 2 * m and coef `r` + * P = P + Q + * Q = r * (P - Q) + * + * @param buf - working buffers + * @param r - coefficient + * @param start - index of buffer among `m` ones + * @param m - current group size + * @param len - number of vectors per buffer + * @param card - modulo cardinal + */ +template +inline void butterfly_gs_step( + vec::Buffers& buf, + T r, + unsigned start, + unsigned m, + size_t len, + T card) +{ + if (len == 0) { + return; + } + const unsigned step = m << 1; + const T rp1 = r + 1; + VecType c = SetOne(r); + + const size_t end = (len > 3) ? 
len - 3 : 0; + const unsigned bufs_nb = buf.get_n(); + const std::vector& mem = buf.get_mem(); + for (unsigned i = start; i < bufs_nb; i += step) { + VecType x1, x2, x3, x4; + VecType y1, y2, y3, y4; + VecType* __restrict p = reinterpret_cast(mem[i]); + VecType* __restrict q = reinterpret_cast(mem[i + m]); + + size_t j = 0; + for (; j < end; j += 4) { + x1 = LoadToReg(p + j); + x2 = LoadToReg(p + j + 1); + x3 = LoadToReg(p + j + 2); + x4 = LoadToReg(p + j + 3); + y1 = LoadToReg(q + j); + y2 = LoadToReg(q + j + 1); + y3 = LoadToReg(q + j + 2); + y4 = LoadToReg(q + j + 3); + + ButterflyGS(rp1, c, &x1, &y1, card); + ButterflyGS(rp1, c, &x2, &y2, card); + ButterflyGS(rp1, c, &x3, &y3, card); + ButterflyGS(rp1, c, &x4, &y4, card); + + // Store back to memory + StoreToMem(p + j, x1); + StoreToMem(p + j + 1, x2); + StoreToMem(p + j + 2, x3); + StoreToMem(p + j + 3, x4); + StoreToMem(q + j, y1); + StoreToMem(q + j + 1, y2); + StoreToMem(q + j + 2, y3); + StoreToMem(q + j + 3, y4); + } + for (; j < len; ++j) { + x1 = LoadToReg(p + j); + y1 = LoadToReg(q + j); + + ButterflyGS(rp1, c, &x1, &y1, card); + + // Store back to memory + StoreToMem(p + j, x1); + StoreToMem(q + j, y1); + } + } +} + +/** + * Vectorized butterfly GS step + * + * For each pair (P, Q) = (buf[i], buf[i + m]) for step = 2 * m and coef `r` + * Q = r * P + * + * @param buf - working buffers + * @param r - coefficient + * @param start - index of buffer among `m` ones + * @param m - current group size + * @param len - number of vectors per buffer + * @param card - modulo cardinal + */ +template +inline void butterfly_gs_step_simple( + vec::Buffers& buf, + T r, + unsigned start, + unsigned m, + size_t len, + T card) +{ + if (len == 0) { + return; + } + const unsigned step = m << 1; + const T rp1 = r + 1; + VecType c = SetOne(r); + + const size_t end = (len > 1) ? 
len - 1 : 0; + const unsigned bufs_nb = buf.get_n(); + const std::vector& mem = buf.get_mem(); + for (unsigned i = start; i < bufs_nb; i += step) { + VecType x1, y1; + VecType x2, y2; + VecType* __restrict p = reinterpret_cast(mem[i]); + VecType* __restrict q = reinterpret_cast(mem[i + m]); + + size_t j = 0; + for (; j < end; j += 2) { + x1 = LoadToReg(p + j); + x2 = LoadToReg(p + j + 1); + + y1 = ButterflySimpleGS(rp1, c, x1, card); + y2 = ButterflySimpleGS(rp1, c, x2, card); + + // Store back to memory + StoreToMem(q + j, y1); + StoreToMem(q + j + 1, y2); + } + for (; j < len; ++j) { + x1 = LoadToReg(p + j); + + y1 = ButterflySimpleGS(rp1, c, x1, card); + + // Store back to memory + StoreToMem(q + j, y1); + } + } +} + +template +inline void encode_post_process( + vec::Buffers& output, + std::vector& props, + off_t offset, + unsigned code_len, + T threshold, + size_t vecs_nb) +{ + const unsigned element_size = sizeof(T); + const unsigned vec_size = countof(); + const T max = 1 << (element_size * 8 - 1); + const VecType _threshold = SetOne(threshold); + const VecType mask_hi = SetOne(max); + + const std::vector& mem = output.get_mem(); + for (unsigned frag_id = 0; frag_id < code_len; ++frag_id) { + VecType* __restrict buf = reinterpret_cast(mem[frag_id]); + + size_t vec_id = 0; + size_t end = (vecs_nb > 3) ? 
vecs_nb - 3 : 0; + for (; vec_id < end; vec_id += 4) { + VecType a1 = LoadToReg(buf + vec_id); + VecType a2 = LoadToReg(buf + vec_id + 1); + VecType a3 = LoadToReg(buf + vec_id + 2); + VecType a4 = LoadToReg(buf + vec_id + 3); + + if (AndIsZero(a1, _threshold) == 0) { + const off_t curr_offset = offset + vec_id * vec_size; + AddProps( + props[frag_id], _threshold, mask_hi, a1, curr_offset, max); + } + if (AndIsZero(a2, _threshold) == 0) { + const off_t curr_offset = offset + (vec_id + 1) * vec_size; + AddProps( + props[frag_id], _threshold, mask_hi, a2, curr_offset, max); + } + if (AndIsZero(a3, _threshold) == 0) { + const off_t curr_offset = offset + (vec_id + 2) * vec_size; + AddProps( + props[frag_id], _threshold, mask_hi, a3, curr_offset, max); + } + if (AndIsZero(a4, _threshold) == 0) { + const off_t curr_offset = offset + (vec_id + 3) * vec_size; + AddProps( + props[frag_id], _threshold, mask_hi, a4, curr_offset, max); + } + } + for (; vec_id < vecs_nb; ++vec_id) { + VecType a = LoadToReg(buf + vec_id); + uint32_t c = AndIsZero(a, _threshold); + if (c == 0) { + const off_t curr_offset = offset + vec_id * vec_size; + AddProps( + props[frag_id], _threshold, mask_hi, a, curr_offset, max); + } + } + } +} + +} // namespace simd +} // namespace quadiron + +#endif diff --git a/src/simd/simd_nf4.h b/src/simd/simd_nf4.h new file mode 100644 index 00000000..8c3c1d92 --- /dev/null +++ b/src/simd/simd_nf4.h @@ -0,0 +1,380 @@ +/* + * Copyright 2017-2018 Scality + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __QUAD_SIMD_NF4_H__ +#define __QUAD_SIMD_NF4_H__ + +namespace quadiron { +namespace simd { + +typedef uint32_t aint32 __attribute__((aligned(ALIGNMENT))); + +/** Return __uint128_t integer from a _m128i register */ +static inline __uint128_t m128i_to_uint128(__m128i v) +{ + __uint128_t i; + _mm_store_si128((__m128i*)&i, v); + + return i; // NOLINT(clang-analyzer-core.uninitialized.UndefReturn) +} + +inline __uint128_t expand16(uint16_t* arr, int n) +{ + // since n <= 4 + uint16_t _arr[4] __attribute__((aligned(ALIGNMENT))) = {0, 0, 0, 0}; + std::copy_n(arr, n, _arr); + + __m128i b = _mm_set_epi16(0, 0, 0, 0, _arr[3], _arr[2], _arr[1], _arr[0]); + + return m128i_to_uint128(b); +} + +inline __uint128_t expand32(uint32_t* arr, int n) +{ + // since n <= 4 + uint32_t _arr[4] __attribute__((aligned(simd::ALIGNMENT))) = {0, 0, 0, 0}; + std::copy_n(arr, n, _arr); + + __m128i b = _mm_set_epi32(_arr[3], _arr[2], _arr[1], _arr[0]); + + return m128i_to_uint128(b); +} + +inline GroupedValues<__uint128_t> unpack(__uint128_t a, int n) +{ + uint16_t ai[8]; + __uint128_t values; + + __m128i _a = _mm_loadu_si128((__m128i*)&a); + ai[0] = _mm_extract_epi16(_a, 0); + ai[1] = _mm_extract_epi16(_a, 1); + ai[2] = _mm_extract_epi16(_a, 2); + ai[3] = _mm_extract_epi16(_a, 3); + ai[4] = _mm_extract_epi16(_a, 4); + ai[5] = _mm_extract_epi16(_a, 5); + ai[6] = _mm_extract_epi16(_a, 6); + ai[7] = _mm_extract_epi16(_a, 7); + + const uint32_t flag = + ai[1] | (!!ai[3] << 1u) | (!!ai[5] << 2u) | (!!ai[7] << 3u); + + __m128i val = _mm_set_epi16(0, 0, 0, 0, ai[6], ai[4], ai[2], ai[0]); + _mm_store_si128((__m128i*)&values, val); + + GroupedValues<__uint128_t> b = {values, flag}; + + return b; +} + +inline void unpack(__uint128_t a, GroupedValues<__uint128_t>& b, int n) +{ + uint16_t ai[8]; + __uint128_t values; + + __m128i _a = _mm_loadu_si128((__m128i*)&a); + ai[0] = _mm_extract_epi16(_a, 0); + ai[1] = _mm_extract_epi16(_a, 1); + ai[2] = _mm_extract_epi16(_a, 2); + ai[3] = 
_mm_extract_epi16(_a, 3); + ai[4] = _mm_extract_epi16(_a, 4); + ai[5] = _mm_extract_epi16(_a, 5); + ai[6] = _mm_extract_epi16(_a, 6); + ai[7] = _mm_extract_epi16(_a, 7); + + const uint32_t flag = + ai[1] | (!!ai[3] << 1u) | (!!ai[5] << 2u) | (!!ai[7] << 3u); + + __m128i val = _mm_set_epi16(0, 0, 0, 0, ai[6], ai[4], ai[2], ai[0]); + _mm_store_si128((__m128i*)&values, val); + + b.flag = flag; + b.values = values; // NOLINT(clang-analyzer-core.uninitialized.Assign) +} + +inline __uint128_t pack(__uint128_t a) +{ + __m128i _a = _mm_loadu_si128((__m128i*)&a); + __m128i b = _mm_set_epi32( + _mm_extract_epi16(_a, 3), + _mm_extract_epi16(_a, 2), + _mm_extract_epi16(_a, 1), + _mm_extract_epi16(_a, 0)); + + return m128i_to_uint128(b); +} + +inline __uint128_t pack(__uint128_t a, uint32_t flag) +{ + aint32 b0, b1, b2, b3; + __m128i _a = _mm_loadu_si128((__m128i*)&a); + + if (flag & 1) + b0 = 65536; + else + b0 = _mm_extract_epi16(_a, 0); + flag >>= 1; + if (flag & 1) + b1 = 65536; + else + b1 = _mm_extract_epi16(_a, 1); + flag >>= 1; + if (flag & 1) + b2 = 65536; + else + b2 = _mm_extract_epi16(_a, 2); + flag >>= 1; + if (flag & 1) + b3 = 65536; + else + b3 = _mm_extract_epi16(_a, 3); + + __m128i b = _mm_set_epi32(b3, b2, b1, b0); + + return m128i_to_uint128(b); +} + +/* ================= Basic operations for NF4 ================= */ + +#if defined(__AVX2__) + +inline VecType LoadToReg(HalfVecType x) +{ + return _mm256_castsi128_si256(_mm_load_si128(&x)); +} + +inline VecType LoadToReg(__uint128_t x) +{ + const HalfVecType* _x = reinterpret_cast(&x); + return LoadToReg(*_x); +} + +inline void StoreLowHalfToMem(HalfVecType* address, VecType reg) +{ + _mm_store_si128(address, _mm256_castsi256_si128(reg)); +} + +inline __uint128_t add(__uint128_t a, __uint128_t b) +{ + HalfVecType res; + VecType _a = LoadToReg(a); + VecType _b = LoadToReg(b); + StoreLowHalfToMem(&res, ModAdd(_a, _b, F4)); + return reinterpret_cast<__uint128_t>(res); +} + +inline __uint128_t sub(__uint128_t a, 
__uint128_t b) +{ + HalfVecType res; + VecType _a = LoadToReg(a); + VecType _b = LoadToReg(b); + StoreLowHalfToMem(&res, ModSub(_a, _b, F4)); + return reinterpret_cast<__uint128_t>(res); +} + +inline __uint128_t mul(__uint128_t a, __uint128_t b) +{ + HalfVecType res; + VecType _a = LoadToReg(a); + VecType _b = LoadToReg(b); + StoreLowHalfToMem(&res, ModMulSafe(_a, _b, F4)); + return reinterpret_cast<__uint128_t>(res); +} + +inline void add_buf_to_two_bufs_rem( + unsigned n, + __uint128_t* x, + __uint128_t* x_half, + __uint128_t* y) +{ + // add last _y[] to x and x_next + HalfVecType* _x = reinterpret_cast(x); + HalfVecType* _x_half = reinterpret_cast(x_half); + HalfVecType* _y = reinterpret_cast(y); + for (unsigned i = 0; i < n; ++i) { + VecType _x_p = LoadToReg(_x[i]); + VecType _x_next_p = LoadToReg(_x_half[i]); + VecType _y_p = LoadToReg(_y[i]); + + StoreLowHalfToMem(_x + i, ModAdd(_x_p, _y_p, F4)); + StoreLowHalfToMem(_x_half + i, ModAdd(_x_next_p, _y_p, F4)); + } +} + +inline void hadamard_mul_rem(unsigned n, __uint128_t* x, __uint128_t* y) +{ + HalfVecType* _x = reinterpret_cast(x); + HalfVecType* _y = reinterpret_cast(y); + for (unsigned i = 0; i < n; ++i) { + VecType _x_p = LoadToReg(_x[i]); + VecType _y_p = LoadToReg(_y[i]); + + StoreLowHalfToMem(_x + i, ModMulSafe(_x_p, _y_p, F4)); + } +} + +inline void hadamard_mul_doubled_rem( + unsigned n, + __uint128_t* x, + __uint128_t* x_half, + __uint128_t* y) +{ + HalfVecType* _x = reinterpret_cast(x); + HalfVecType* _x_half = reinterpret_cast(x_half); + HalfVecType* _y = reinterpret_cast(y); + for (unsigned i = 0; i < n; ++i) { + VecType _x_p = LoadToReg(_x[i]); + VecType _x_next_p = LoadToReg(_x_half[i]); + VecType _y_p = LoadToReg(_y[i]); + + StoreLowHalfToMem(_x + i, ModMulSafe(_x_p, _y_p, F4)); + StoreLowHalfToMem(_x_half + i, ModMulSafe(_x_next_p, _y_p, F4)); + } +} + +#elif defined(__SSE4_1__) + +inline VecType LoadToReg(__uint128_t x) +{ + const VecType* _x = reinterpret_cast(&x); + return 
_mm_load_si128(_x); +} + +inline __uint128_t add(__uint128_t a, __uint128_t b) +{ + VecType res; + VecType _a = LoadToReg(a); + VecType _b = LoadToReg(b); + StoreToMem(&res, ModAdd(_a, _b, F4)); + return reinterpret_cast<__uint128_t>(res); +} + +inline __uint128_t sub(__uint128_t a, __uint128_t b) +{ + VecType res; + VecType _a = LoadToReg(a); + VecType _b = LoadToReg(b); + StoreToMem(&res, ModSub(_a, _b, F4)); + return reinterpret_cast<__uint128_t>(res); +} + +inline __uint128_t mul(__uint128_t a, __uint128_t b) +{ + VecType res; + VecType _a = LoadToReg(a); + VecType _b = LoadToReg(b); + StoreToMem(&res, ModMulSafe(_a, _b, F4)); + return reinterpret_cast<__uint128_t>(res); +} + +inline void add_buf_to_two_bufs_rem( + unsigned n, + __uint128_t* x, + __uint128_t* x_half, + __uint128_t* y) +{ + // do nothing +} + +inline void hadamard_mul_rem(unsigned n, __uint128_t* x, __uint128_t* y) +{ + // do nothing +} + +inline void hadamard_mul_doubled_rem( + unsigned n, + __uint128_t* x, + __uint128_t* x_half, + __uint128_t* y) +{ + // do nothing +} + +#endif + +/* ==================== Operations for NF4 =================== */ + +/** Add buffer `y` to two halves of `x`. 
`x` is of length `n` */ +inline void add_buf_to_two_bufs(unsigned n, __uint128_t* _x, __uint128_t* _y) +{ + unsigned i; + VecType* x = reinterpret_cast(_x); + VecType* y = reinterpret_cast(_y); + + const unsigned ratio = sizeof(*x) / sizeof(*_x); + const unsigned half_len = n / 2; + const unsigned vec_len = half_len / ratio; + const unsigned num_len = vec_len * ratio; + const unsigned rem_len = half_len - num_len; + + __uint128_t* x_half = _x + half_len; + VecType* x_next = reinterpret_cast(x_half); + + // add y to the first half of `x` + for (i = 0; i < vec_len; ++i) { + x[i] = ModAdd(x[i], y[i], F4); + } + + // add y to the second half of `x` + for (i = 0; i < vec_len; ++i) { + x_next[i] = ModAdd(x_next[i], y[i], F4); + } + + if (rem_len > 0) { + add_buf_to_two_bufs_rem( + rem_len, _x + num_len, x_half + num_len, _y + num_len); + } +} + +inline void hadamard_mul(unsigned n, __uint128_t* _x, __uint128_t* _y) +{ + unsigned i; + VecType* x = reinterpret_cast(_x); + VecType* y = reinterpret_cast(_y); + + const unsigned ratio = sizeof(*x) / sizeof(*_x); + const unsigned vec_len = n / ratio; + const unsigned num_len = vec_len * ratio; + const unsigned rem_len = n - num_len; + + // multiply y to the first half of `x` + for (i = 0; i < vec_len; ++i) { + x[i] = ModMulSafe(x[i], y[i], F4); + } + + if (rem_len > 0) { + // add last _y[] to x + hadamard_mul_rem(rem_len, _x + num_len, _y + num_len); + } +} + +} // namespace simd +} // namespace quadiron + +#endif diff --git a/src/simd_128.h b/src/simd_128.h deleted file mode 100644 index 0f6b8857..00000000 --- a/src/simd_128.h +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright 2017-2018 Scality - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. 
Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __QUAD_SIMD_128_H__ -#define __QUAD_SIMD_128_H__ - -#include - -typedef __m128i m128i; - -// Disable `cert-err58-cpp` on these: AFAIK they cannot throw. -// (probably a false positive present in Clang 5 and fixed in Clang 6). 
-const m128i F4_m128i = _mm_set1_epi32(65537); // NOLINT(cert-err58-cpp) -const m128i F4minus1_m128i = _mm_set1_epi32(65536); // NOLINT(cert-err58-cpp) -const m128i F3_m128i = _mm_set1_epi32(257); // NOLINT(cert-err58-cpp) -const m128i F3minus1_m128i = _mm_set1_epi32(256); // NOLINT(cert-err58-cpp) - -const m128i F3_m128i_u16 = _mm_set1_epi16(257); // NOLINT(cert-err58-cpp) -const m128i F3minus1_m128i_u16 = _mm_set1_epi16(256); // NOLINT(cert-err58-cpp) - -#include "simd_128_u16.h" -#include "simd_128_u32.h" - -#endif diff --git a/src/simd_128_u16.h b/src/simd_128_u16.h deleted file mode 100644 index a177cfb3..00000000 --- a/src/simd_128_u16.h +++ /dev/null @@ -1,358 +0,0 @@ -/* - * Copyright 2017-2018 Scality - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __QUAD_SIMD_128_U16_H__ -#define __QUAD_SIMD_128_U16_H__ - -#include - -#include "property.h" -#include "simd/simd.h" - -namespace quadiron { -namespace simd { - -/* ==================== Essential Operations =================== */ - -/** Perform a%card where a is a addition of two numbers whose elements are - * symbols of GF(card) */ -inline m128i mod_after_add(m128i a, aint16 card) -{ - const m128i _card = _mm_set1_epi16(card); - const m128i _card_minus_1 = _mm_set1_epi16(card - 1); - - m128i cmp = _mm_cmpgt_epi16(a, _card_minus_1); - m128i b = _mm_sub_epi16(a, _mm_and_si128(_card, cmp)); - - return b; -} - -/** Perform addition of two numbers a, b whose elements are of GF(card) */ -inline m128i add(m128i a, m128i b, aint16 card) -{ - m128i _a = _mm_load_si128(&a); - m128i _b = _mm_load_si128(&b); - m128i c = _mm_add_epi16(_a, _b); - - // Modulo - return mod_after_add(c, card); -} - -/** Perform subtraction of a by b where a, b whose elements are symbols of - * GF(card) - * sub(a, b) = a - b if a >= b, or - * card + a - b, otherwise - */ -inline m128i sub(m128i a, m128i b, aint16 card) -{ - const m128i _card = _mm_set1_epi16(card); - - m128i _a = _mm_load_si128(&a); - m128i _b = _mm_load_si128(&b); - - m128i cmp = _mm_cmpgt_epi16(_b, _a); - m128i _a1 = _mm_add_epi16(_a, _mm_and_si128(_card, cmp)); - - return _mm_sub_epi16(_a1, _b); -} - -/** Negate `a` - * @return 0 if (a == 0), else card - a - */ -inline m128i 
neg(m128i a, aint16 card = F3) -{ - const m128i _card = _mm_set1_epi16(card); - m128i _a = _mm_load_si128(&a); - m128i _b = _mm_setzero_si128(); - - m128i cmp = _mm_cmpgt_epi16(_a, _b); - - return _mm_sub_epi16(_mm_and_si128(cmp, _card), _a); -} - -inline m128i mod_after_multiply(m128i a) -{ - const m128i mask = _mm_set1_epi16(F3 - 2); - - m128i lo = _mm_and_si128(a, mask); - - m128i a_shift = _mm_srli_si128(a, 1); - m128i hi = _mm_and_si128(a_shift, mask); - - m128i cmp = _mm_cmpgt_epi16(hi, lo); - m128i _lo = _mm_add_epi16(lo, _mm_and_si128(F3_m128i_u16, cmp)); - - return _mm_sub_epi16(_lo, hi); -} - -inline m128i mul(m128i a, m128i b) -{ - m128i _a = _mm_load_si128(&a); - m128i _b = _mm_load_si128(&b); - - m128i c = _mm_mullo_epi16(_a, _b); - - // filter elements of both of a & b = card-1 - m128i cmp = _mm_and_si128( - _mm_cmpeq_epi16(_a, F3minus1_m128i_u16), - _mm_cmpeq_epi16(_b, F3minus1_m128i_u16)); - - const m128i one = _mm_set1_epi16(1); - c = _mm_add_epi16(c, _mm_and_si128(one, cmp)); - - // Modulo - return mod_after_multiply(c); -} - -/** Perform multiplication of two numbers a, b whose elements are of GF(card) - * where `card` is a prime Fermat number, i.e. 
card = Fx with x < 5 - * Currently, it supports only for F3 - */ -inline m128i mul(m128i a, m128i b, aint16 card) -{ - // FIXME: generalize card - assert(card == F3); - return mul(a, b); -} - -/** Apply an element-wise negation to a buffer - */ -inline void neg(size_t len, aint16* buf, aint16 card = F3) -{ - m128i* _buf = reinterpret_cast(buf); - unsigned ratio = sizeof(*_buf) / sizeof(*buf); - size_t _len = len / ratio; - size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - _buf[i] = neg(_buf[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - if (buf[i]) - buf[i] = card - buf[i]; - } - } -} - -/** Perform a multiplication of a coefficient `a` to each element of `src` and - * add result to correspondent element of `dest` - */ -inline void mul_coef_to_buf( - const aint16 a, - aint16* src, - aint16* dest, - size_t len, - aint16 card = F3) -{ - const m128i coef = _mm_set1_epi16(a); - - m128i* _src = reinterpret_cast(src); - m128i* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform multiplication - _dest[i] = mul(coef, _src[i], card); - } - if (_last_len > 0) { - uint32_t coef_doubled = (uint32_t)a; - for (i = _len * ratio; i < len; i++) { - // perform multiplication - dest[i] = (aint16)((coef_doubled * src[i]) % card); - } - } -} - -inline void -add_two_bufs(aint16* src, aint16* dest, size_t len, aint16 card = F3) -{ - m128i* _src = reinterpret_cast(src); - m128i* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform addition - _dest[i] = add(_src[i], _dest[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform addition - 
aint16 tmp = src[i] + dest[i]; - dest[i] = (tmp >= card) ? (tmp - card) : tmp; - } - } -} - -inline void sub_two_bufs( - aint16* bufa, - aint16* bufb, - aint16* res, - size_t len, - aint16 card = F3) -{ - m128i* _bufa = reinterpret_cast(bufa); - m128i* _bufb = reinterpret_cast(bufb); - m128i* _res = reinterpret_cast(res); - const unsigned ratio = sizeof(*_bufa) / sizeof(*bufa); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform subtraction - _res[i] = sub(_bufa[i], _bufb[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform subtraction - if (bufa[i] >= bufb[i]) - res[i] = bufa[i] - bufb[i]; - else - res[i] = card - (bufb[i] - bufa[i]); - } - } -} - -inline void -mul_two_bufs(aint16* src, aint16* dest, size_t len, aint16 card = F3) -{ - m128i* _src = reinterpret_cast(src); - m128i* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform multiplicaton - _dest[i] = mul(_src[i], _dest[i], F3); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform multiplicaton - // dest[i] = uint32_t(src[i]) * uint32_t(dest[i]) % card; - dest[i] = uint16_t((uint32_t(src[i]) * dest[i]) % card); - } - } -} - -/* - * buf1[i] = buf1[i] + coef * buf2[i] - * buf2[i] = buf1[i] - coef * buf2[i] - */ -inline void butterfly_ct( - uint16_t coef, - aint16* buf1, - aint16* buf2, - size_t len, - uint32_t card = F3) -{ - const m128i _coef = _mm_set1_epi16(coef); - m128i* _buf1 = reinterpret_cast(buf1); - m128i* _buf2 = reinterpret_cast(buf2); - - for (size_t i = 0; i < len; ++i) { - m128i a = mul(_coef, _buf2[i], card); - _buf2[i] = sub(_buf1[i], a, card); - _buf1[i] = add(_buf1[i], a, card); - } -} - -/* - * buf1[i] = buf1[i] + buf2[i] - * buf2[i] = coef * (buf1[i] - buf2[i]) - */ 
-inline void butterfly_gs( - uint16_t coef, - aint16* buf1, - aint16* buf2, - size_t len, - uint16_t card = F3) -{ - const m128i _coef = _mm_set1_epi16(coef); - m128i* _buf1 = reinterpret_cast(buf1); - m128i* _buf2 = reinterpret_cast(buf2); - - for (size_t i = 0; i < len; ++i) { - m128i a = _buf1[i]; - m128i b = _buf2[i]; - m128i c = sub(a, b, card); - _buf1[i] = add(a, b, card); - _buf2[i] = mul(_coef, c, card); - } -} - -inline void encode_post_process( - vec::Buffers& output, - std::vector& props, - off_t offset, - unsigned code_len, - uint16_t threshold, - size_t vecs_nb) -{ - const unsigned vec_size = simd::countof(); - - const m128i _threshold = _mm_set1_epi16(threshold); - uint16_t max = 1 << (sizeof(uint16_t) * 8 - 1); - const m128i mask_hi = _mm_set1_epi16(max); - const unsigned element_size = sizeof(uint16_t); - - for (unsigned frag_id = 0; frag_id < code_len; ++frag_id) { - uint16_t* chunk = output.get(frag_id); - m128i* buf = reinterpret_cast(chunk); - for (unsigned vec_id = 0; vec_id < vecs_nb; ++vec_id) { - const m128i a = _mm_load_si128(&(buf[vec_id])); - const m128i b = _mm_cmpeq_epi16(_threshold, a); - const m128i c = _mm_and_si128(mask_hi, b); - uint16_t d = _mm_movemask_epi8(c); - - while (d > 0) { - unsigned byte_idx = __builtin_ctz(d); - unsigned element_idx = byte_idx / element_size; - off_t _offset = offset + vec_id * vec_size + element_idx; - props[frag_id].add(_offset, 1); - d ^= 1 << byte_idx; - } - } - } -} - -} // namespace simd -} // namespace quadiron - -#endif diff --git a/src/simd_128_u32.h b/src/simd_128_u32.h deleted file mode 100644 index 5039f0f8..00000000 --- a/src/simd_128_u32.h +++ /dev/null @@ -1,442 +0,0 @@ -/* - * Copyright 2017-2018 Scality - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. 
Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ - -#ifndef __QUAD_SIMD_128_U32_H__ -#define __QUAD_SIMD_128_U32_H__ - -#include - -#include "simd/simd.h" - -namespace quadiron { -namespace simd { - -/* ==================== Essential Operations =================== */ - -/** Perform a%card where a is a addition of two numbers whose elements are - * symbols of GF(card) */ -inline m128i mod_after_add(m128i a, aint32 card) -{ - const m128i _card = _mm_set1_epi32(card); - const m128i _card_minus_1 = _mm_set1_epi32(card - 1); - - m128i cmp = _mm_cmpgt_epi32(a, _card_minus_1); - m128i b = _mm_sub_epi32(a, _mm_and_si128(_card, cmp)); - - return b; -} - -/** Perform addition of two numbers a, b whose elements are of GF(card) */ -inline m128i add(m128i a, m128i b, aint32 card) -{ - m128i _a = _mm_load_si128(&a); - m128i _b = _mm_load_si128(&b); - m128i c = _mm_add_epi32(_a, _b); - - // Modulo - return mod_after_add(c, card); -} - -/** Perform subtraction of a by b where a, b whose elements are symbols of - * GF(card) - * sub(a, b) = a - b if a >= b, or - * card + a - b, otherwise - */ -inline m128i sub(m128i a, m128i b, aint32 card) -{ - const m128i _card = _mm_set1_epi32(card); - - m128i _a = _mm_load_si128(&a); - m128i _b = _mm_load_si128(&b); - - m128i cmp = _mm_cmpgt_epi32(_b, _a); - m128i _a1 = _mm_add_epi32(_a, _mm_and_si128(_card, cmp)); - - return _mm_sub_epi32(_a1, _b); -} - -/** Negate `a` - * @return 0 if (a == 0), else card - a - */ -inline m128i neg(m128i a, aint32 card = F4) -{ - const m128i _card = _mm_set1_epi32(card); - - m128i _a = _mm_load_si128(&a); - m128i _b = _mm_setzero_si128(); - m128i cmp = _mm_cmpgt_epi32(_a, _b); - - return _mm_sub_epi32(_mm_and_si128(cmp, _card), _a); -} - -/** Perform a%card where a is a multiplication of two numbers whose elements are - * symbols of GF(F4) - * - * We find v in a = u * card + v - * a is expressed also as: a = hi * (card-1) + lo - * where hi and lo is 16-bit for F4 (or 8-bit for F3) high and low parts of a - * hence, v = (lo - hi) % F4 - * v = lo - hi, if 
lo >= hi - * or - * F4 + lo - hi, otherwise - */ -inline m128i mod_after_multiply_f4(m128i a) -{ - const m128i mask = _mm_set1_epi32(F4 - 2); - - m128i lo = _mm_and_si128(a, mask); - - m128i a_shift = _mm_srli_si128(a, 2); - m128i hi = _mm_and_si128(a_shift, mask); - - m128i cmp = _mm_cmpgt_epi32(hi, lo); - m128i _lo = _mm_add_epi32(lo, _mm_and_si128(F4_m128i, cmp)); - - return _mm_sub_epi32(_lo, hi); -} - -inline m128i mod_after_multiply_f3(m128i a) -{ - const m128i mask = _mm_set1_epi32(F3 - 2); - - m128i lo = _mm_and_si128(a, mask); - - m128i a_shift = _mm_srli_si128(a, 1); - m128i hi = _mm_and_si128(a_shift, mask); - - m128i cmp = _mm_cmpgt_epi32(hi, lo); - m128i _lo = _mm_add_epi32(lo, _mm_and_si128(F3_m128i, cmp)); - - return _mm_sub_epi32(_lo, hi); -} - -inline m128i mul_f4(m128i a, m128i b) -{ - m128i _a = _mm_load_si128(&a); - m128i _b = _mm_load_si128(&b); - - m128i c = _mm_mullo_epi32(_a, _b); - - // filter elements of both of a & b = card-1 - m128i cmp = _mm_and_si128( - _mm_cmpeq_epi32(_a, F4minus1_m128i), - _mm_cmpeq_epi32(_b, F4minus1_m128i)); - - const m128i one = _mm_set1_epi32(1); - c = _mm_add_epi32(c, _mm_and_si128(one, cmp)); - - // Modulo - return mod_after_multiply_f4(c); -} - -inline m128i mul_f3(m128i a, m128i b) -{ - m128i _a = _mm_load_si128(&a); - m128i _b = _mm_load_si128(&b); - - m128i c = _mm_mullo_epi32(_a, _b); - - // filter elements of both of a & b = card-1 - m128i cmp = _mm_and_si128( - _mm_cmpeq_epi32(_a, F3minus1_m128i), - _mm_cmpeq_epi32(_b, F3minus1_m128i)); - - c = _mm_xor_si128(c, _mm_and_si128(F4_m128i, cmp)); - - // Modulo - return mod_after_multiply_f3(c); -} - -/** Perform multiplication of two numbers a, b whose elements are of GF(card) - * where `card` is a prime Fermat number, i.e. 
card = Fx with x < 5 - * Currently, it supports only for F3 and F4 - */ -inline m128i mul(m128i a, m128i b, aint32 card) -{ - assert(card == F4 || card == F3); - if (card == F4) - return mul_f4(a, b); - return mul_f3(a, b); -} - -/** Apply an element-wise negation to a buffer - */ -inline void neg(size_t len, aint32* buf, aint32 card = F4) -{ - m128i* _buf = reinterpret_cast(buf); - unsigned ratio = sizeof(*_buf) / sizeof(*buf); - size_t _len = len / ratio; - size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - _buf[i] = neg(_buf[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - if (buf[i] > 0) - buf[i] = card - buf[i]; - } - } -} - -/** Perform a multiplication of a coefficient `a` to each element of `src` and - * add result to correspondent element of `dest` - */ -inline void mul_coef_to_buf( - const aint32 a, - aint32* src, - aint32* dest, - size_t len, - aint32 card = F4) -{ - const m128i coef = _mm_set1_epi32(a); - - m128i* _src = reinterpret_cast(src); - m128i* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform multiplication - _dest[i] = mul(coef, _src[i], card); - } - if (_last_len > 0) { - uint64_t coef_64 = (uint64_t)a; - for (i = _len * ratio; i < len; i++) { - // perform multiplication - dest[i] = (aint32)((coef_64 * src[i]) % card); - } - } -} - -inline void -add_two_bufs(aint32* src, aint32* dest, size_t len, aint32 card = F4) -{ - m128i* _src = reinterpret_cast(src); - m128i* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform addition - _dest[i] = add(_src[i], _dest[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) 
{ - // perform addition - aint32 tmp = src[i] + dest[i]; - dest[i] = (tmp >= card) ? (tmp - card) : tmp; - } - } -} - -inline void sub_two_bufs( - aint32* bufa, - aint32* bufb, - aint32* res, - size_t len, - aint32 card = F4) -{ - m128i* _bufa = reinterpret_cast(bufa); - m128i* _bufb = reinterpret_cast(bufb); - m128i* _res = reinterpret_cast(res); - const unsigned ratio = sizeof(*_bufa) / sizeof(*bufa); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform subtraction - _res[i] = sub(_bufa[i], _bufb[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform subtraction - if (bufa[i] >= bufb[i]) - res[i] = bufa[i] - bufb[i]; - else - res[i] = card - (bufb[i] - bufa[i]); - } - } -} - -inline void -mul_two_bufs(aint32* src, aint32* dest, size_t len, aint32 card = F4) -{ - m128i* _src = reinterpret_cast(src); - m128i* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform multiplicaton - _dest[i] = mul(_src[i], _dest[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform multiplicaton - dest[i] = uint32_t((uint64_t(src[i]) * dest[i]) % card); - } - } -} - -/* - * buf1[i] = buf1[i] + coef * buf2[i] - * buf2[i] = buf1[i] - coef * buf2[i] - */ -inline void butterfly_ct( - uint32_t coef, - aint32* buf1, - aint32* buf2, - size_t len, - uint32_t card = F4) -{ - const m128i _coef = _mm_set1_epi32(coef); - m128i* _buf1 = reinterpret_cast(buf1); - m128i* _buf2 = reinterpret_cast(buf2); - - for (size_t i = 0; i < len; ++i) { - m128i a = mul(_coef, _buf2[i], card); - _buf2[i] = sub(_buf1[i], a, card); - _buf1[i] = add(_buf1[i], a, card); - } -} - -/* - * buf1[i] = buf1[i] + buf2[i] - * buf2[i] = coef * (buf1[i] - buf2[i]) - */ -inline void butterfly_gs( - 
uint32_t coef, - aint32* buf1, - aint32* buf2, - size_t len, - uint32_t card = F4) -{ - const m128i _coef = _mm_set1_epi32(coef); - m128i* _buf1 = reinterpret_cast(buf1); - m128i* _buf2 = reinterpret_cast(buf2); - - for (size_t i = 0; i < len; ++i) { - m128i a = _buf1[i]; - m128i b = _buf2[i]; - m128i c = sub(a, b, card); - _buf1[i] = add(a, b, card); - _buf2[i] = mul(_coef, c, card); - } -} - -inline void encode_post_process( - vec::Buffers& output, - std::vector& props, - off_t offset, - unsigned code_len, - uint32_t threshold, - size_t vecs_nb) -{ - const unsigned vec_size = simd::countof(); - - const m128i _threshold = _mm_set1_epi32(threshold); - const uint32_t max = 1 << (sizeof(uint32_t) * 8 - 1); - const m128i mask_hi = _mm_set1_epi32(max); - const unsigned element_size = sizeof(uint32_t); - - for (unsigned frag_id = 0; frag_id < code_len; ++frag_id) { - uint32_t* chunk = output.get(frag_id); - m128i* buf = reinterpret_cast(chunk); - for (unsigned vec_id = 0; vec_id < vecs_nb; ++vec_id) { - const m128i a = _mm_load_si128(&(buf[vec_id])); - const m128i b = _mm_cmpeq_epi32(_threshold, a); - const m128i c = _mm_and_si128(mask_hi, b); - uint16_t d = _mm_movemask_epi8(c); - - while (d > 0) { - unsigned byte_idx = __builtin_ctz(d); - unsigned element_idx = byte_idx / element_size; - off_t _offset = offset + vec_id * vec_size + element_idx; - props[frag_id].add(_offset, 1); - d ^= 1 << byte_idx; - } - } - } -} - -/* ==================== Operations for NF4 =================== */ - -/** Return aint128 integer from a _m128i register */ -static inline aint128 m128i_to_uint128(m128i v) -{ - aint128 i; - _mm_store_si128((m128i*)&i, v); - - return i; // NOLINT(clang-analyzer-core.uninitialized.UndefReturn) -} - -inline __uint128_t add(__uint128_t a, __uint128_t b) -{ - m128i res = add((m128i)a, (m128i)b, F4); - return m128i_to_uint128(res); -} - -inline __uint128_t sub(__uint128_t a, __uint128_t b) -{ - m128i res = sub((m128i)a, (m128i)b, F4); - return 
m128i_to_uint128(res); -} - -inline __uint128_t mul(__uint128_t a, __uint128_t b) -{ - m128i res = mul((m128i)a, (m128i)b, F4); - return m128i_to_uint128(res); -} - -inline void hadamard_mul(int n, aint128* _x, aint128* _y) -{ - int i; - m128i* x = reinterpret_cast(_x); - m128i* y = reinterpret_cast(_y); - - // multiply y to `x` - for (i = 0; i < n; i++) { - x[i] = mul(x[i], y[i], F4); - } -} - -} // namespace simd -} // namespace quadiron - -#endif diff --git a/src/simd_256.h b/src/simd_256.h deleted file mode 100644 index 2dc49cc4..00000000 --- a/src/simd_256.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright 2017-2018 Scality - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __QUAD_SIMD_256_H__ -#define __QUAD_SIMD_256_H__ - -#include - -typedef __m256i m256i; - -// Disable `cert-err58-cpp` on these: AFAIK they cannot throw. -// (probably a false positive present in Clang 5 and fixed in Clang 6). -const m256i F4_m256i = _mm256_set1_epi32(65537); // NOLINT(cert-err58-cpp) -const m256i F4minus1_m256i = _mm256_set1_epi32(65536); // NOLINT(cert-err58-cpp) -const m256i F3_m256i = _mm256_set1_epi32(257); // NOLINT(cert-err58-cpp) -const m256i F3minus1_m256i = _mm256_set1_epi32(256); // NOLINT(cert-err58-cpp) - -const m256i F3_m256i_u16 = _mm256_set1_epi16(257); // NOLINT(cert-err58-cpp) -// NOLINTNEXTLINE(cert-err58-cpp) -const m256i F3minus1_m256i_u16 = _mm256_set1_epi16(256); - -/* GCC doesn't include the split store intrinsics so define them here. 
*/ -#if defined(__GNUC__) && !defined(__clang__) - -static inline void __attribute__((__always_inline__)) -_mm256_storeu2_m128i(__m128i* const hi, __m128i* const lo, const __m256i a) -{ - _mm_storeu_si128(lo, _mm256_castsi256_si128(a)); - _mm_storeu_si128(hi, _mm256_extracti128_si256(a, 1)); -} - -#endif /* defined(__GNUC__) */ - -#include "simd_256_u16.h" -#include "simd_256_u32.h" - -#endif diff --git a/src/simd_256_u16.h b/src/simd_256_u16.h deleted file mode 100644 index 974e136e..00000000 --- a/src/simd_256_u16.h +++ /dev/null @@ -1,354 +0,0 @@ -/* - * Copyright 2017-2018 Scality - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __QUAD_SIMD_256_U16_H__ -#define __QUAD_SIMD_256_U16_H__ - -#include - -#include "simd/simd.h" - -namespace quadiron { -namespace simd { - -/** Perform a%card where a is a addition of two numbers whose elements are - * symbols of GF(card) */ -inline m256i mod_after_add(m256i a, aint16 card) -{ - const m256i _card = _mm256_set1_epi16(card); - const m256i _card_minus_1 = _mm256_set1_epi16(card - 1); - - m256i cmp = _mm256_cmpgt_epi16(a, _card_minus_1); - m256i b = _mm256_sub_epi16(a, _mm256_and_si256(_card, cmp)); - - return b; -} - -/** Perform addition of two numbers a, b whose elements are of GF(card) */ -inline m256i add(m256i a, m256i b, aint16 card = F3) -{ - m256i _a = _mm256_load_si256(&a); - m256i _b = _mm256_load_si256(&b); - m256i c = _mm256_add_epi16(_a, _b); - - // Modulo - return mod_after_add(c, card); -} - -/** Perform subtraction of a by b where a, b whose elements are symbols of - * GF(card) - * sub(a, b) = a - b if a >= b, or - * card + a - b, otherwise - */ -inline m256i sub(m256i a, m256i b, aint16 card) -{ - const m256i _card = _mm256_set1_epi16(card); - - m256i _a = _mm256_load_si256(&a); - m256i _b = _mm256_load_si256(&b); - - m256i cmp = _mm256_cmpgt_epi16(_b, _a); - m256i _a1 = _mm256_add_epi16(_a, _mm256_and_si256(_card, cmp)); - - return _mm256_sub_epi16(_a1, _b); -} - -/** Negate `a` - * @return 0 if (a == 0), else card - a - */ -inline m256i neg(m256i a, aint16 card = F3) -{ - const m256i 
_card = _mm256_set1_epi16(card); - m256i _a = _mm256_load_si256(&a); - m256i _b = _mm256_setzero_si256(); - - m256i cmp = _mm256_cmpgt_epi16(_a, _b); - - return _mm256_sub_epi16(_mm256_and_si256(cmp, _card), _a); -} - -inline m256i mod_after_multiply(m256i a) -{ - const m256i mask = _mm256_set1_epi16(F3 - 2); - - m256i lo = _mm256_and_si256(a, mask); - - m256i a_shift = _mm256_srli_si256(a, 1); - m256i hi = _mm256_and_si256(a_shift, mask); - - m256i cmp = _mm256_cmpgt_epi16(hi, lo); - m256i _lo = _mm256_add_epi16(lo, _mm256_and_si256(F3_m256i_u16, cmp)); - - return _mm256_sub_epi16(_lo, hi); -} - -inline m256i mul(m256i a, m256i b) -{ - m256i _a = _mm256_load_si256(&a); - m256i _b = _mm256_load_si256(&b); - - m256i c = _mm256_mullo_epi16(_a, _b); - - // filter elements of both of a & b = card-1 - m256i cmp = _mm256_and_si256( - _mm256_cmpeq_epi16(_a, F3minus1_m256i_u16), - _mm256_cmpeq_epi16(_b, F3minus1_m256i_u16)); - - const m256i one = _mm256_set1_epi16(1); - c = _mm256_add_epi16(c, _mm256_and_si256(one, cmp)); - - // Modulo - return mod_after_multiply(c); -} - -/** Perform multiplication of two numbers a, b whose elements are of GF(card) - * where `card` is a prime Fermat number, i.e. 
card = Fx with x < 5 - * Currently, it supports only for F3 - */ -inline m256i mul(m256i a, m256i b, aint16 card) -{ - // FIXME: generalize card - assert(card == F3); - return mul(a, b); -} - -/** Apply an element-wise negation to a buffer - */ -inline void neg(size_t len, aint16* buf, aint16 card = F3) -{ - m256i* _buf = reinterpret_cast(buf); - unsigned ratio = sizeof(*_buf) / sizeof(*buf); - size_t _len = len / ratio; - size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - _buf[i] = neg(_buf[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - if (buf[i]) - buf[i] = card - buf[i]; - } - } -} - -/** Perform a multiplication of a coefficient `a` to each element of `src` and - * add result to correspondent element of `dest` - */ -inline void mul_coef_to_buf( - const aint16 a, - aint16* src, - aint16* dest, - size_t len, - aint16 card = F3) -{ - const m256i coef = _mm256_set1_epi16(a); - - m256i* _src = reinterpret_cast(src); - m256i* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform multiplication - _dest[i] = mul(coef, _src[i], card); - } - if (_last_len > 0) { - uint32_t coef_doubled = (uint32_t)a; - for (i = _len * ratio; i < len; i++) { - // perform multiplication - dest[i] = (aint16)((coef_doubled * src[i]) % card); - } - } -} - -inline void -add_two_bufs(aint16* src, aint16* dest, size_t len, aint16 card = F3) -{ - m256i* _src = reinterpret_cast(src); - m256i* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform addition - _dest[i] = add(_src[i], _dest[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform addition - 
aint16 tmp = src[i] + dest[i]; - dest[i] = (tmp >= card) ? (tmp - card) : tmp; - } - } -} - -inline void sub_two_bufs( - aint16* bufa, - aint16* bufb, - aint16* res, - size_t len, - aint16 card = F3) -{ - m256i* _bufa = reinterpret_cast(bufa); - m256i* _bufb = reinterpret_cast(bufb); - m256i* _res = reinterpret_cast(res); - const unsigned ratio = sizeof(*_bufa) / sizeof(*bufa); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform subtraction - _res[i] = sub(_bufa[i], _bufb[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform subtraction - if (bufa[i] >= bufb[i]) - res[i] = bufa[i] - bufb[i]; - else - res[i] = card - (bufb[i] - bufa[i]); - } - } -} - -inline void -mul_two_bufs(aint16* src, aint16* dest, size_t len, aint16 card = F3) -{ - m256i* _src = reinterpret_cast(src); - m256i* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform multiplicaton - _dest[i] = mul(_src[i], _dest[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform multiplicaton - dest[i] = (uint32_t(src[i]) * dest[i]) % card; - } - } -} - -/* - * buf1[i] = buf1[i] + coef * buf2[i] - * buf2[i] = buf1[i] - coef * buf2[i] - */ -inline void butterfly_ct( - uint16_t coef, - aint16* buf1, - aint16* buf2, - size_t len, - uint16_t card = F3) -{ - const m256i _coef = _mm256_set1_epi16(coef); - m256i* _buf1 = reinterpret_cast(buf1); - m256i* _buf2 = reinterpret_cast(buf2); - - for (size_t i = 0; i < len; ++i) { - m256i a = mul(_coef, _buf2[i], card); - _buf2[i] = sub(_buf1[i], a, card); - _buf1[i] = add(_buf1[i], a, card); - } -} - -/* - * buf1[i] = buf1[i] + buf2[i] - * buf2[i] = coef * (buf1[i] - buf2[i]) - */ -inline void butterfly_gs( - uint16_t coef, - aint16* buf1, - 
aint16* buf2, - size_t len, - uint16_t card = F3) -{ - const m256i _coef = _mm256_set1_epi16(coef); - m256i* _buf1 = reinterpret_cast(buf1); - m256i* _buf2 = reinterpret_cast(buf2); - - for (size_t i = 0; i < len; ++i) { - m256i a = _buf1[i]; - m256i b = _buf2[i]; - m256i c = sub(a, b, card); - _buf1[i] = add(a, b, card); - _buf2[i] = mul(_coef, c, card); - } -} - -inline void encode_post_process( - vec::Buffers& output, - std::vector& props, - off_t offset, - unsigned code_len, - uint16_t threshold, - size_t vecs_nb) -{ - const unsigned vec_size = simd::countof(); - - const m256i _threshold = _mm256_set1_epi16(threshold); - uint16_t max = 1 << (sizeof(uint16_t) * 8 - 1); - const m256i mask_hi = _mm256_set1_epi16(max); - const unsigned element_size = sizeof(uint16_t); - - for (unsigned frag_id = 0; frag_id < code_len; ++frag_id) { - uint16_t* chunk = output.get(frag_id); - m256i* buf = reinterpret_cast(chunk); - for (unsigned vec_id = 0; vec_id < vecs_nb; ++vec_id) { - const m256i a = _mm256_load_si256(&(buf[vec_id])); - const m256i b = _mm256_cmpeq_epi16(_threshold, a); - const m256i c = _mm256_and_si256(mask_hi, b); - uint32_t d = _mm256_movemask_epi8(c); - - while (d > 0) { - unsigned byte_idx = __builtin_ctz(d); - unsigned element_idx = byte_idx / element_size; - off_t _offset = offset + vec_id * vec_size + element_idx; - props[frag_id].add(_offset, 1); - d ^= 1 << byte_idx; - } - } - } -} - -} // namespace simd -} // namespace quadiron - -#endif diff --git a/src/simd_256_u32.h b/src/simd_256_u32.h deleted file mode 100644 index 9c76c89b..00000000 --- a/src/simd_256_u32.h +++ /dev/null @@ -1,466 +0,0 @@ -/* - * Copyright 2017-2018 Scality - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. 
Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ - -#ifndef __QUAD_SIMD_256_U32_H__ -#define __QUAD_SIMD_256_U32_H__ - -#include - -#include "simd/simd.h" - -namespace quadiron { -namespace simd { - -/* ==================== Essential Operations =================== */ - -/** Perform a%card where a is a addition of two numbers whose elements are - * symbols of GF(card) */ -inline m256i mod_after_add(m256i a, aint32 card) -{ - const m256i _card = _mm256_set1_epi32(card); - const m256i _card_minus_1 = _mm256_set1_epi32(card - 1); - - m256i cmp = _mm256_cmpgt_epi32(a, _card_minus_1); - m256i b = _mm256_sub_epi32(a, _mm256_and_si256(_card, cmp)); - - return b; -} - -/** Perform addition of two numbers a, b whose elements are of GF(card) */ -inline m256i add(m256i a, m256i b, aint32 card) -{ - m256i _a = _mm256_load_si256(&a); - m256i _b = _mm256_load_si256(&b); - m256i c = _mm256_add_epi32(_a, _b); - - // Modulo - return mod_after_add(c, card); -} - -/** Perform subtraction of a by b where a, b whose elements are symbols of - * GF(card) - * sub(a, b) = a - b if a >= b, or - * card + a - b, otherwise - */ -inline m256i sub(m256i a, m256i b, aint32 card) -{ - const m256i _card = _mm256_set1_epi32(card); - - m256i _a = _mm256_load_si256(&a); - m256i _b = _mm256_load_si256(&b); - - m256i cmp = _mm256_cmpgt_epi32(_b, _a); - m256i _a1 = _mm256_add_epi32(_a, _mm256_and_si256(_card, cmp)); - - return _mm256_sub_epi32(_a1, _b); -} - -/** Negate `a` - * @return 0 if (a == 0), else card - a - */ -inline m256i neg(m256i a, aint32 card = F4) -{ - const m256i _card = _mm256_set1_epi32(card); - m256i _a = _mm256_load_si256(&a); - m256i _b = _mm256_setzero_si256(); - - m256i cmp = _mm256_cmpgt_epi32(_a, _b); - - return _mm256_sub_epi32(_mm256_and_si256(cmp, _card), _a); -} - -/** Perform a%card where a is a multiplication of two numbers whose elements are - * symbols of GF(F4) - * - * We find v in a = u * card + v - * a is expressed also as: a = hi * (card-1) + lo - * where hi and lo is 16-bit for F4 (or 8-bit for F3) high and 
low parts of a - * hence, v = (lo - hi) % F4 - * v = lo - hi, if lo >= hi - * or - * F4 + lo - hi, otherwise - */ -inline m256i mod_after_multiply_f4(m256i a) -{ - const m256i mask = _mm256_set1_epi32(F4 - 2); - - m256i lo = _mm256_and_si256(a, mask); - - m256i a_shift = _mm256_srli_si256(a, 2); - m256i hi = _mm256_and_si256(a_shift, mask); - - m256i cmp = _mm256_cmpgt_epi32(hi, lo); - m256i _lo = _mm256_add_epi32(lo, _mm256_and_si256(F4_m256i, cmp)); - - return _mm256_sub_epi32(_lo, hi); -} - -inline m256i mod_after_multiply_f3(m256i a) -{ - const m256i mask = _mm256_set1_epi32(F3 - 2); - - m256i lo = _mm256_and_si256(a, mask); - - m256i a_shift = _mm256_srli_si256(a, 1); - m256i hi = _mm256_and_si256(a_shift, mask); - - m256i cmp = _mm256_cmpgt_epi32(hi, lo); - m256i _lo = _mm256_add_epi32(lo, _mm256_and_si256(F3_m256i, cmp)); - - return _mm256_sub_epi32(_lo, hi); -} - -inline m256i mul_f4(m256i a, m256i b) -{ - m256i _a = _mm256_load_si256(&a); - m256i _b = _mm256_load_si256(&b); - - m256i c = _mm256_mullo_epi32(_a, _b); - - // filter elements of both of a & b = card-1 - m256i cmp = _mm256_and_si256( - _mm256_cmpeq_epi32(_a, F4minus1_m256i), - _mm256_cmpeq_epi32(_b, F4minus1_m256i)); - - const m256i one = _mm256_set1_epi32(1); - c = _mm256_add_epi32(c, _mm256_and_si256(one, cmp)); - - // Modulo - return mod_after_multiply_f4(c); -} - -inline m256i mul_f3(m256i a, m256i b) -{ - m256i _a = _mm256_load_si256(&a); - m256i _b = _mm256_load_si256(&b); - - m256i c = _mm256_mullo_epi32(_a, _b); - - // filter elements of both of a & b = card-1 - m256i cmp = _mm256_and_si256( - _mm256_cmpeq_epi32(_a, F3minus1_m256i), - _mm256_cmpeq_epi32(_b, F3minus1_m256i)); - - c = _mm256_xor_si256(c, _mm256_and_si256(F4_m256i, cmp)); - - // Modulo - return mod_after_multiply_f3(c); -} - -/** Perform multiplication of two numbers a, b whose elements are of GF(card) - * where `card` is a prime Fermat number, i.e. 
card = Fx with x < 5 - * Currently, it supports only for F3 and F4 - */ -inline m256i mul(m256i a, m256i b, aint32 card) -{ - assert(card == F4 || card == F3); - if (card == F4) - return mul_f4(a, b); - return mul_f3(a, b); -} - -/** Apply an element-wise negation to a buffer - */ -inline void neg(size_t len, aint32* buf, aint32 card = F4) -{ - m256i* _buf = reinterpret_cast(buf); - unsigned ratio = sizeof(*_buf) / sizeof(*buf); - size_t _len = len / ratio; - size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - _buf[i] = neg(_buf[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - if (buf[i]) - buf[i] = card - buf[i]; - } - } -} - -/** Perform a multiplication of a coefficient `a` to each element of `src` and - * add result to correspondent element of `dest` - */ -inline void mul_coef_to_buf( - const aint32 a, - aint32* src, - aint32* dest, - size_t len, - aint32 card = F4) -{ - const m256i coef = _mm256_set1_epi32(a); - - m256i* _src = reinterpret_cast(src); - m256i* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform multiplication - _dest[i] = mul(coef, _src[i], card); - } - if (_last_len > 0) { - uint64_t coef_64 = (uint64_t)a; - for (i = _len * ratio; i < len; i++) { - // perform multiplication - dest[i] = (aint32)((coef_64 * src[i]) % card); - } - } -} - -inline void -add_two_bufs(aint32* src, aint32* dest, size_t len, aint32 card = F4) -{ - m256i* _src = reinterpret_cast(src); - m256i* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform addition - _dest[i] = add(_src[i], _dest[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { 
- // perform addition - aint32 tmp = src[i] + dest[i]; - dest[i] = (tmp >= card) ? (tmp - card) : tmp; - } - } -} - -inline void sub_two_bufs( - aint32* bufa, - aint32* bufb, - aint32* res, - size_t len, - aint32 card = F4) -{ - m256i* _bufa = reinterpret_cast(bufa); - m256i* _bufb = reinterpret_cast(bufb); - m256i* _res = reinterpret_cast(res); - const unsigned ratio = sizeof(*_bufa) / sizeof(*bufa); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform subtraction - _res[i] = sub(_bufa[i], _bufb[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform subtraction - if (bufa[i] >= bufb[i]) - res[i] = bufa[i] - bufb[i]; - else - res[i] = card - (bufb[i] - bufa[i]); - } - } -} - -inline void -mul_two_bufs(aint32* src, aint32* dest, size_t len, aint32 card = F4) -{ - m256i* _src = reinterpret_cast(src); - m256i* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform multiplicaton - _dest[i] = mul(_src[i], _dest[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform multiplicaton - dest[i] = uint32_t((uint64_t(src[i]) * dest[i]) % card); - } - } -} - -/* - * buf1[i] = buf1[i] + coef * buf2[i] - * buf2[i] = buf1[i] - coef * buf2[i] - */ -inline void butterfly_ct( - uint32_t coef, - aint32* buf1, - aint32* buf2, - size_t len, - uint32_t card = F4) -{ - const m256i _coef = _mm256_set1_epi32(coef); - m256i* _buf1 = reinterpret_cast(buf1); - m256i* _buf2 = reinterpret_cast(buf2); - - for (size_t i = 0; i < len; ++i) { - m256i a = mul(_coef, _buf2[i], card); - _buf2[i] = sub(_buf1[i], a, card); - _buf1[i] = add(_buf1[i], a, card); - } -} - -/* - * buf1[i] = buf1[i] + buf2[i] - * buf2[i] = coef * (buf1[i] - buf2[i]) - */ -inline void butterfly_gs( - 
uint32_t coef, - aint32* buf1, - aint32* buf2, - size_t len, - uint32_t card = F4) -{ - const m256i _coef = _mm256_set1_epi32(coef); - m256i* _buf1 = reinterpret_cast(buf1); - m256i* _buf2 = reinterpret_cast(buf2); - - for (size_t i = 0; i < len; ++i) { - m256i a = add(_buf1[i], _buf2[i], card); - _buf2[i] = mul(_coef, sub(_buf1[i], _buf2[i], card), card); - _buf1[i] = a; - } -} - -inline void encode_post_process( - vec::Buffers& output, - std::vector& props, - off_t offset, - unsigned code_len, - uint32_t threshold, - size_t vecs_nb) -{ - const unsigned vec_size = simd::countof(); - - const m256i _threshold = _mm256_set1_epi32(threshold); - const uint32_t max = 1 << (sizeof(uint32_t) * 8 - 1); - const m256i mask_hi = _mm256_set1_epi32(max); - const unsigned element_size = sizeof(uint32_t); - - for (unsigned frag_id = 0; frag_id < code_len; ++frag_id) { - uint32_t* chunk = output.get(frag_id); - m256i* buf = reinterpret_cast(chunk); - for (unsigned vec_id = 0; vec_id < vecs_nb; ++vec_id) { - const m256i a = _mm256_load_si256(&(buf[vec_id])); - const m256i b = _mm256_cmpeq_epi32(_threshold, a); - const m256i c = _mm256_and_si256(mask_hi, b); - uint32_t d = _mm256_movemask_epi8(c); - - while (d > 0) { - unsigned byte_idx = __builtin_ctz(d); - unsigned element_idx = byte_idx / element_size; - off_t _offset = offset + vec_id * vec_size + element_idx; - props[frag_id].add(_offset, 1); - d ^= 1 << byte_idx; - } - } - } -} - -/* ==================== Operations for NF4 =================== */ -typedef __m128i m128i; - -/** Return aint128 integer from a _m128i register */ -inline aint128 m256i_to_uint128(m256i v) -{ - aint128 hi, lo; - _mm256_storeu2_m128i((m128i*)&hi, (m128i*)&lo, v); - return lo; // NOLINT(clang-analyzer-core.uninitialized.UndefReturn) -} - -inline __uint128_t add(__uint128_t a, __uint128_t b) -{ - m256i _a = _mm256_castsi128_si256((m128i)a); - m256i _b = _mm256_castsi128_si256((m128i)b); - m256i res = add(_a, _b, F4); - return m256i_to_uint128(res); -} - 
-inline __uint128_t sub(__uint128_t a, __uint128_t b) -{ - m256i _a = _mm256_castsi128_si256((m128i)a); - m256i _b = _mm256_castsi128_si256((m128i)b); - m256i res = sub(_a, _b, F4); - return m256i_to_uint128(res); -} - -inline __uint128_t mul(__uint128_t a, __uint128_t b) -{ - m256i _a = _mm256_castsi128_si256((m128i)a); - m256i _b = _mm256_castsi128_si256((m128i)b); - m256i res = mul(_a, _b, F4); - return m256i_to_uint128(res); -} - -/** Store low 128-bit part of `reg` to memory */ -inline void store_low(aint128* address, m256i reg) -{ - _mm_store_si128((m128i*)address, _mm256_castsi256_si128(reg)); -} - -inline void hadamard_mul(int n, aint128* _x, aint128* _y) -{ - int i; - m256i* x = reinterpret_cast(_x); - m256i* y = reinterpret_cast(_y); - - const unsigned ratio = sizeof(*x) / sizeof(*_x); - const int len_256 = n / ratio; - const int last_len = n - len_256 * ratio; - - // multiply y to the first half of `x` - for (i = 0; i < len_256; i++) { - x[i] = mul(x[i], y[i], F4); - } - - if (last_len > 0) { - // add last _y[] to x - for (i = len_256 * ratio; i < n; i++) { - m256i _x_p = _mm256_castsi128_si256((m128i)_x[i]); - m256i _y_p = _mm256_castsi128_si256((m128i)_y[i]); - - store_low(_x + i, mul(_x_p, _y_p, F4)); - } - } -} - -} // namespace simd -} // namespace quadiron - -#endif diff --git a/src/simd_nf4.h b/src/simd_nf4.h deleted file mode 100644 index 7d517430..00000000 --- a/src/simd_nf4.h +++ /dev/null @@ -1,172 +0,0 @@ -/* - * Copyright 2017-2018 Scality - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. 
Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ - -#ifndef __QUAD_SIMD_NF4_H__ -#define __QUAD_SIMD_NF4_H__ - -#include - -#include - -namespace quadiron { -namespace simd { - -#ifdef __AVX2__ -typedef __m128i m128i; - -/** Return aint128 integer from a _m128i register */ -static inline aint128 m128i_to_uint128(m128i v) -{ - aint128 i; - _mm_store_si128((m128i*)&i, v); - - return i; // NOLINT(clang-analyzer-core.uninitialized.UndefReturn) -} -#endif // #ifdef __AVX2__ - -inline aint128 expand16(uint16_t* arr, int n) -{ - // since n <= 4 - uint16_t _arr[4] __attribute__((aligned(ALIGNMENT))) = {0, 0, 0, 0}; - std::copy_n(arr, n, _arr); - - m128i b = _mm_set_epi16(0, 0, 0, 0, _arr[3], _arr[2], _arr[1], _arr[0]); - - return m128i_to_uint128(b); -} - -inline aint128 expand32(uint32_t* arr, int n) -{ - // since n <= 4 - uint32_t _arr[4] __attribute__((aligned(simd::ALIGNMENT))) = {0, 0, 0, 0}; - std::copy_n(arr, n, _arr); - - m128i b = _mm_set_epi32(_arr[3], _arr[2], _arr[1], _arr[0]); - - return m128i_to_uint128(b); -} - -inline GroupedValues<__uint128_t> unpack(__uint128_t a, int n) -{ - uint16_t ai[8]; - aint128 values; - - m128i _a = _mm_loadu_si128((m128i*)&a); - ai[0] = _mm_extract_epi16(_a, 0); - ai[1] = _mm_extract_epi16(_a, 1); - ai[2] = _mm_extract_epi16(_a, 2); - ai[3] = _mm_extract_epi16(_a, 3); - ai[4] = _mm_extract_epi16(_a, 4); - ai[5] = _mm_extract_epi16(_a, 5); - ai[6] = _mm_extract_epi16(_a, 6); - ai[7] = _mm_extract_epi16(_a, 7); - - const uint32_t flag = - ai[1] | (!!ai[3] << 1u) | (!!ai[5] << 2u) | (!!ai[7] << 3u); - - m128i val = _mm_set_epi16(0, 0, 0, 0, ai[6], ai[4], ai[2], ai[0]); - _mm_store_si128((m128i*)&values, val); - - GroupedValues<__uint128_t> b = {values, flag}; - - return b; -} - -inline void unpack(__uint128_t a, GroupedValues<__uint128_t>& b, int n) -{ - uint16_t ai[8]; - aint128 values; - - m128i _a = _mm_loadu_si128((m128i*)&a); - ai[0] = _mm_extract_epi16(_a, 0); - ai[1] = _mm_extract_epi16(_a, 1); - ai[2] = _mm_extract_epi16(_a, 2); - ai[3] = _mm_extract_epi16(_a, 3); - 
ai[4] = _mm_extract_epi16(_a, 4); - ai[5] = _mm_extract_epi16(_a, 5); - ai[6] = _mm_extract_epi16(_a, 6); - ai[7] = _mm_extract_epi16(_a, 7); - - const uint32_t flag = - ai[1] | (!!ai[3] << 1u) | (!!ai[5] << 2u) | (!!ai[7] << 3u); - - m128i val = _mm_set_epi16(0, 0, 0, 0, ai[6], ai[4], ai[2], ai[0]); - _mm_store_si128((m128i*)&values, val); - - b.flag = flag; - b.values = values; // NOLINT(clang-analyzer-core.uninitialized.Assign) -} - -inline aint128 pack(__uint128_t a) -{ - m128i _a = _mm_loadu_si128((m128i*)&a); - m128i b = _mm_set_epi32( - _mm_extract_epi16(_a, 3), - _mm_extract_epi16(_a, 2), - _mm_extract_epi16(_a, 1), - _mm_extract_epi16(_a, 0)); - - return m128i_to_uint128(b); -} - -inline aint128 pack(__uint128_t a, uint32_t flag) -{ - aint32 b0, b1, b2, b3; - m128i _a = _mm_loadu_si128((m128i*)&a); - - if (flag & 1) - b0 = 65536; - else - b0 = _mm_extract_epi16(_a, 0); - flag >>= 1; - if (flag & 1) - b1 = 65536; - else - b1 = _mm_extract_epi16(_a, 1); - flag >>= 1; - if (flag & 1) - b2 = 65536; - else - b2 = _mm_extract_epi16(_a, 2); - flag >>= 1; - if (flag & 1) - b3 = 65536; - else - b3 = _mm_extract_epi16(_a, 3); - - m128i b = _mm_set_epi32(b3, b2, b1, b0); - - return m128i_to_uint128(b); -} - -} // namespace simd -} // namespace quadiron - -#endif diff --git a/src/vec_buffers.h b/src/vec_buffers.h index d31c69d3..2122d300 100644 --- a/src/vec_buffers.h +++ b/src/vec_buffers.h @@ -38,7 +38,7 @@ #include #include "core.h" -#include "simd/simd.h" +#include "simd/allocator.h" namespace quadiron { namespace vec { diff --git a/test/simd/test_allocator.cpp b/test/simd/test_allocator.cpp index a1d59034..09edb70f 100644 --- a/test/simd/test_allocator.cpp +++ b/test/simd/test_allocator.cpp @@ -32,7 +32,7 @@ #include -#include "simd/simd.h" +#include "simd/allocator.h" namespace simd = quadiron::simd; diff --git a/test/simd/test_definitions.cpp b/test/simd/test_definitions.cpp index c7a48975..bde45d05 100644 --- a/test/simd/test_definitions.cpp +++ 
b/test/simd/test_definitions.cpp @@ -29,7 +29,7 @@ */ #include -#include "simd/simd.h" +#include "simd/definitions.h" namespace simd = quadiron::simd;