diff --git a/src/avx2.rs b/src/avx2.rs
index 105e879..f8eb103 100644
--- a/src/avx2.rs
+++ b/src/avx2.rs
@@ -1,7 +1,7 @@
 mod fdct;
 mod ycbcr;
 
-use crate::encoder::Operations;
+use crate::encoder::{AlignedBlock, Operations};
 pub use fdct::fdct_avx2;
 pub use ycbcr::*;
 
@@ -9,7 +9,7 @@ pub(crate) struct AVX2Operations;
 
 impl Operations for AVX2Operations {
     #[inline(always)]
-    fn fdct(data: &mut [i16; 64]) {
+    fn fdct(data: &mut AlignedBlock) {
         fdct_avx2(data);
     }
 }
diff --git a/src/avx2/fdct.rs b/src/avx2/fdct.rs
index f4a067a..061f098 100644
--- a/src/avx2/fdct.rs
+++ b/src/avx2/fdct.rs
@@ -25,6 +25,8 @@ use core::arch::x86_64::{
     _mm256_unpacklo_epi16, _mm256_unpacklo_epi32,
 };
 
+use crate::encoder::AlignedBlock;
+
 const CONST_BITS: i32 = 13;
 const PASS1_BITS: i32 = 2;
 
@@ -57,14 +59,14 @@ const DESCALE_P1: i32 = CONST_BITS - PASS1_BITS;
 const DESCALE_P2: i32 = CONST_BITS + PASS1_BITS;
 
 #[inline(always)]
-pub fn fdct_avx2(data: &mut [i16; 64]) {
+pub fn fdct_avx2(data: &mut AlignedBlock) { // NOTE(review): assumes caller verified AVX2 — confirm
     unsafe {
         fdct_avx2_internal(data);
     }
 }
 
 #[target_feature(enable = "avx2")]
-fn fdct_avx2_internal(data: &mut [i16; 64]) {
+fn fdct_avx2_internal(data: &mut AlignedBlock) {
     #[target_feature(enable = "avx2")]
     #[allow(non_snake_case)]
     #[inline]
@@ -420,6 +422,8 @@ fn fdct_avx2_internal(data: &mut [i16; 64]) {
         (t1, t2, t3, t4)
     }
 
+    let data = &mut data.data;
+
     let ymm4 = avx_load(&data[0..16]);
     let ymm5 = avx_load(&data[16..32]);
     let ymm6 = avx_load(&data[32..48]);
diff --git a/src/encoder.rs b/src/encoder.rs
index e908ea4..bb3fa4c 100644
--- a/src/encoder.rs
+++ b/src/encoder.rs
@@ -34,6 +34,24 @@ pub enum JpegColorType {
     Ycck,
 }
 
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+#[repr(C, align(32))] // 32-byte aligned: the size of one 256-bit AVX2 vector
+pub(crate) struct AlignedBlock {
+    pub data: [i16; 64],
+}
+
+impl AlignedBlock {
+    pub const fn new(data: [i16; 64]) -> Self {
+        Self { data } // plain wrap; only the alignment differs from the bare array
+    }
+}
+
+impl Default for AlignedBlock {
+    fn default() -> Self {
+        Self::new([0; 64]) // all zeros; std has no `Default` for `[i16; 64]`
+    }
+}
+
 impl JpegColorType {
     pub(crate) fn get_num_components(self) -> usize {
         use JpegColorType::*;
@@ -752,7 +770,7 @@ impl<W: JfifWrite> Encoder<W> {
 
                             OP::fdct(&mut block);
 
-                            let mut q_block = [0i16; 64];
+                            let mut q_block = AlignedBlock::default();
 
                             OP::quantize_block(
                                 &block,
@@ -767,7 +785,7 @@ impl<W: JfifWrite> Encoder<W> {
                                 &self.huffman_tables[component.ac_huffman_table as usize].1,
                             )?;
 
-                            prev_dc[i] = q_block[0];
+                            prev_dc[i] = q_block.data[0];
                         }
                     }
                 }
@@ -827,7 +845,7 @@ impl<W: JfifWrite> Encoder<W> {
                     &self.huffman_tables[component.ac_huffman_table as usize].1,
                 )?;
 
-                prev_dc = block[0];
+                prev_dc = block.data[0];
 
                 if restart_interval > 0 {
                     if restarts_to_go == 0 {
@@ -883,12 +901,12 @@ impl<W: JfifWrite> Encoder<W> {
                 }
 
                 self.writer.write_dc(
-                    block[0],
+                    block.data[0],
                     prev_dc,
                     &self.huffman_tables[component.dc_huffman_table as usize].0,
                 )?;
 
-                prev_dc = block[0];
+                prev_dc = block.data[0];
 
                 if restart_interval > 0 {
                     if restarts_to_go == 0 {
@@ -960,7 +978,7 @@ impl<W: JfifWrite> Encoder<W> {
         &mut self,
         image: &I,
         q_tables: &[QuantizationTable; 2],
-    ) -> [Vec<[i16; 64]>; 4] {
+    ) -> [Vec<AlignedBlock>; 4] {
         let width = image.width();
         let height = image.height();
 
@@ -1022,7 +1040,7 @@ impl<W: JfifWrite> Encoder<W> {
 
                     OP::fdct(&mut block);
 
-                    let mut q_block = [0i16; 64];
+                    let mut q_block = AlignedBlock::default();
 
                     OP::quantize_block(
                         &block,
@@ -1037,7 +1055,7 @@ impl<W: JfifWrite> Encoder<W> {
         blocks
     }
 
-    fn init_block_buffers(&mut self, buffer_size: usize) -> [Vec<[i16; 64]>; 4] {
+    fn init_block_buffers(&mut self, buffer_size: usize) -> [Vec<AlignedBlock>; 4] {
         // To simplify the code and to give the compiler more infos to optimize stuff we always initialize 4 components
         // Resource overhead should be minimal because an empty Vec doesn't allocate
 
@@ -1065,7 +1083,7 @@ impl<W: JfifWrite> Encoder<W> {
     }
 
     // Create new huffman tables optimized for this image
-    fn optimize_huffman_table(&mut self, blocks: &[Vec<[i16; 64]>; 4]) {
+    fn optimize_huffman_table(&mut self, blocks: &[Vec<AlignedBlock>; 4]) {
         // TODO: Find out if it's possible to reuse some code from the writer
 
         let max_tables = self.components.len().min(2) as u8;
@@ -1088,7 +1106,7 @@ impl<W: JfifWrite> Encoder<W> {
                     debug_assert!(!blocks[i].is_empty());
 
                     for block in &blocks[i] {
-                        let value = block[0];
+                        let value = block.data[0];
                         let diff = value - prev_dc;
                         let num_bits = get_num_bits(diff);
 
@@ -1120,7 +1138,7 @@ impl<W: JfifWrite> Encoder<W> {
                             for block in &blocks[i] {
                                 let mut zero_run = 0;
 
-                                for &value in &block[start..end] {
+                                for &value in &block.data[start..end] {
                                     if value == 0 {
                                         zero_run += 1;
                                     } else {
@@ -1146,7 +1164,7 @@ impl<W: JfifWrite> Encoder<W> {
                         for block in &blocks[i] {
                             let mut zero_run = 0;
 
-                            for &value in &block[1..] {
+                            for &value in &block.data[1..] {
                                 if value == 0 {
                                     zero_run += 1;
                                 } else {
@@ -1208,7 +1226,7 @@ fn get_block(
     col_stride: usize,
     row_stride: usize,
     width: usize,
-) -> [i16; 64] {
+) -> AlignedBlock {
     let mut block = [0i16; 64];
 
     for y in 0..8 {
@@ -1220,7 +1238,7 @@ fn get_block(
         }
     }
 
-    block
+    AlignedBlock::new(block)
 }
 
 fn get_num_bits(mut value: i16) -> u8 {
@@ -1240,15 +1258,15 @@ fn get_num_bits(mut value: i16) -> u8 {
 
 pub(crate) trait Operations {
     #[inline(always)]
-    fn fdct(data: &mut [i16; 64]) {
+    fn fdct(data: &mut AlignedBlock) {
         fdct(data);
     }
 
     #[inline(always)]
-    fn quantize_block(block: &[i16; 64], q_block: &mut [i16; 64], table: &QuantizationTable) {
+    fn quantize_block(block: &AlignedBlock, q_block: &mut AlignedBlock, table: &QuantizationTable) {
         for i in 0..64 {
             let z = ZIGZAG[i] as usize & 0x3f;
-            q_block[i] = table.quantize(block[z], z);
+            q_block.data[i] = table.quantize(block.data[z], z);
         }
     }
 }
diff --git a/src/fdct.rs b/src/fdct.rs
index 956c46e..ff76a16 100644
--- a/src/fdct.rs
+++ b/src/fdct.rs
@@ -71,6 +71,8 @@
  * scaled fixed-point arithmetic, with a minimal number of shifts.
  */
 
+use crate::encoder::AlignedBlock;
+
 const CONST_BITS: i32 = 13;
 const PASS1_BITS: i32 = 2;
 
@@ -102,7 +104,9 @@ fn into_el(v: i32) -> i16 {
 
 #[allow(clippy::erasing_op)]
 #[allow(clippy::identity_op)]
-pub fn fdct(data: &mut [i16; 64]) {
+pub fn fdct(data: &mut AlignedBlock) {
+    let data = &mut data.data;
+
     /* Pass 1: process rows. */
     /* Note results are scaled up by sqrt(8) compared to a true DCT; */
     /* furthermore, we scale the results by 2**PASS1_BITS. */
@@ -238,6 +242,8 @@ mod tests {
 
     // Inputs and outputs are taken from libjpegs jpeg_fdct_islow for a typical image
 
+    use crate::encoder::AlignedBlock;
+
     use super::fdct;
 
     const INPUT1: [i16; 64] = [
@@ -269,12 +275,12 @@ mod tests {
 
     #[test]
     pub fn test_fdct_libjpeg() {
-        let mut i1 = INPUT1.clone();
+        let mut i1 = AlignedBlock::new(INPUT1); // arrays are `Copy`; no `.clone()` needed
         fdct(&mut i1);
-        assert_eq!(i1, OUTPUT1);
+        assert_eq!(i1.data, OUTPUT1);
 
-        let mut i2 = INPUT2.clone();
+        let mut i2 = AlignedBlock::new(INPUT2);
         fdct(&mut i2);
-        assert_eq!(i2, OUTPUT2);
+        assert_eq!(i2.data, OUTPUT2);
     }
 }
diff --git a/src/writer.rs b/src/writer.rs
index d0dc9cc..5812033 100644
--- a/src/writer.rs
+++ b/src/writer.rs
@@ -1,5 +1,5 @@
 use crate::EncodingError;
-use crate::encoder::Component;
+use crate::encoder::{AlignedBlock, Component};
 use crate::huffman::{CodingClass, HuffmanTable};
 use crate::marker::{Marker, SOFType};
 use crate::quantization::QuantizationTable;
@@ -330,12 +330,12 @@ impl<W: JfifWrite> JfifWriter<W> {
 
     pub fn write_block(
         &mut self,
-        block: &[i16; 64],
+        block: &AlignedBlock,
         prev_dc: i16,
         dc_table: &HuffmanTable,
         ac_table: &HuffmanTable,
     ) -> Result<(), EncodingError> {
-        self.write_dc(block[0], prev_dc, dc_table)?;
+        self.write_dc(block.data[0], prev_dc, dc_table)?;
         self.write_ac_block(block, 1, 64, ac_table)
     }
 
@@ -355,14 +355,14 @@ impl<W: JfifWrite> JfifWriter<W> {
 
     pub fn write_ac_block(
         &mut self,
-        block: &[i16; 64],
+        block: &AlignedBlock,
         start: usize,
         end: usize,
         ac_table: &HuffmanTable,
     ) -> Result<(), EncodingError> {
         let mut zero_run = 0;
 
-        for &value in &block[start..end] {
+        for &value in &block.data[start..end] {
             if value == 0 {
                 zero_run += 1;
             } else {