diff --git a/src/avx2.rs b/src/avx2.rs index 105e879..f8eb103 100644 --- a/src/avx2.rs +++ b/src/avx2.rs @@ -1,7 +1,7 @@ mod fdct; mod ycbcr; -use crate::encoder::Operations; +use crate::encoder::{AlignedBlock, Operations}; pub use fdct::fdct_avx2; pub use ycbcr::*; @@ -9,7 +9,7 @@ pub(crate) struct AVX2Operations; impl Operations for AVX2Operations { #[inline(always)] - fn fdct(data: &mut [i16; 64]) { + fn fdct(data: &mut AlignedBlock) { fdct_avx2(data); } } diff --git a/src/avx2/fdct.rs b/src/avx2/fdct.rs index f4a067a..061f098 100644 --- a/src/avx2/fdct.rs +++ b/src/avx2/fdct.rs @@ -25,6 +25,8 @@ use core::arch::x86_64::{ _mm256_unpacklo_epi16, _mm256_unpacklo_epi32, }; +use crate::encoder::AlignedBlock; + const CONST_BITS: i32 = 13; const PASS1_BITS: i32 = 2; @@ -57,14 +59,14 @@ const DESCALE_P1: i32 = CONST_BITS - PASS1_BITS; const DESCALE_P2: i32 = CONST_BITS + PASS1_BITS; #[inline(always)] -pub fn fdct_avx2(data: &mut [i16; 64]) { +pub fn fdct_avx2(data: &mut AlignedBlock) { unsafe { fdct_avx2_internal(data); } } #[target_feature(enable = "avx2")] -fn fdct_avx2_internal(data: &mut [i16; 64]) { +fn fdct_avx2_internal(data: &mut AlignedBlock) { #[target_feature(enable = "avx2")] #[allow(non_snake_case)] #[inline] @@ -420,6 +422,8 @@ fn fdct_avx2_internal(data: &mut [i16; 64]) { (t1, t2, t3, t4) } + let data = &mut data.data; + let ymm4 = avx_load(&data[0..16]); let ymm5 = avx_load(&data[16..32]); let ymm6 = avx_load(&data[32..48]); diff --git a/src/encoder.rs b/src/encoder.rs index e908ea4..bb3fa4c 100644 --- a/src/encoder.rs +++ b/src/encoder.rs @@ -34,6 +34,24 @@ pub enum JpegColorType { Ycck, } +#[derive(Copy, Clone)] +#[repr(C, align(32))] +pub(crate) struct AlignedBlock { + pub data: [i16; 64], +} + +impl AlignedBlock { + pub const fn new(data: [i16; 64]) -> Self { + AlignedBlock { data } + } +} + +impl Default for AlignedBlock { + fn default() -> Self { + AlignedBlock { data: [0i16; 64] } + } +} + impl JpegColorType { pub(crate) fn get_num_components(self) -> usize { use JpegColorType::*; @@ -752,7 +770,7 @@ impl Encoder { OP::fdct(&mut block); - let mut q_block = [0i16; 64]; + let mut q_block = AlignedBlock::default(); OP::quantize_block( &block, @@ -767,7 +785,7 @@ impl Encoder { &self.huffman_tables[component.ac_huffman_table as usize].1, )?; - prev_dc[i] = q_block[0]; + prev_dc[i] = q_block.data[0]; } } } @@ -827,7 +845,7 @@ impl Encoder { &self.huffman_tables[component.ac_huffman_table as usize].1, )?; - prev_dc = block[0]; + prev_dc = block.data[0]; if restart_interval > 0 { if restarts_to_go == 0 { @@ -883,12 +901,12 @@ impl Encoder { } self.writer.write_dc( - block[0], + block.data[0], prev_dc, &self.huffman_tables[component.dc_huffman_table as usize].0, )?; - prev_dc = block[0]; + prev_dc = block.data[0]; if restart_interval > 0 { if restarts_to_go == 0 { @@ -960,7 +978,7 @@ impl Encoder { &mut self, image: &I, q_tables: &[QuantizationTable; 2], - ) -> [Vec<[i16; 64]>; 4] { + ) -> [Vec; 4] { let width = image.width(); let height = image.height(); @@ -1022,7 +1040,7 @@ impl Encoder { OP::fdct(&mut block); - let mut q_block = [0i16; 64]; + let mut q_block = AlignedBlock::default(); OP::quantize_block( &block, @@ -1037,7 +1055,7 @@ impl Encoder { blocks } - fn init_block_buffers(&mut self, buffer_size: usize) -> [Vec<[i16; 64]>; 4] { + fn init_block_buffers(&mut self, buffer_size: usize) -> [Vec; 4] { // To simplify the code and to give the compiler more infos to optimize stuff we always initialize 4 components // Resource overhead should be minimal because an empty Vec doesn't allocate @@ -1065,7 +1083,7 @@ impl Encoder { } // Create new huffman tables optimized for this image - fn optimize_huffman_table(&mut self, blocks: &[Vec<[i16; 64]>; 4]) { + fn optimize_huffman_table(&mut self, blocks: &[Vec; 4]) { // TODO: Find out if it's possible to reuse some code from the writer let max_tables = self.components.len().min(2) as u8; @@ -1088,7 +1106,7 @@ impl Encoder { debug_assert!(!blocks[i].is_empty()); for block in &blocks[i] { - let value = block[0]; + let value = block.data[0]; let diff = value - prev_dc; let num_bits = get_num_bits(diff); @@ -1120,7 +1138,7 @@ impl Encoder { for block in &blocks[i] { let mut zero_run = 0; - for &value in &block[start..end] { + for &value in &block.data[start..end] { if value == 0 { zero_run += 1; } else { @@ -1146,7 +1164,7 @@ impl Encoder { for block in &blocks[i] { let mut zero_run = 0; - for &value in &block[1..] { + for &value in &block.data[1..] { if value == 0 { zero_run += 1; } else { @@ -1208,7 +1226,7 @@ fn get_block( col_stride: usize, row_stride: usize, width: usize, -) -> [i16; 64] { +) -> AlignedBlock { let mut block = [0i16; 64]; for y in 0..8 { @@ -1220,7 +1238,7 @@ fn get_block( } } - block + AlignedBlock::new(block) } fn get_num_bits(mut value: i16) -> u8 { @@ -1240,15 +1258,15 @@ fn get_num_bits(mut value: i16) -> u8 { pub(crate) trait Operations { #[inline(always)] - fn fdct(data: &mut [i16; 64]) { + fn fdct(data: &mut AlignedBlock) { fdct(data); } #[inline(always)] - fn quantize_block(block: &[i16; 64], q_block: &mut [i16; 64], table: &QuantizationTable) { + fn quantize_block(block: &AlignedBlock, q_block: &mut AlignedBlock, table: &QuantizationTable) { for i in 0..64 { let z = ZIGZAG[i] as usize & 0x3f; - q_block[i] = table.quantize(block[z], z); + q_block.data[i] = table.quantize(block.data[z], z); } } } diff --git a/src/fdct.rs b/src/fdct.rs index 956c46e..ff76a16 100644 --- a/src/fdct.rs +++ b/src/fdct.rs @@ -71,6 +71,8 @@ * scaled fixed-point arithmetic, with a minimal number of shifts. */ +use crate::encoder::AlignedBlock; + const CONST_BITS: i32 = 13; const PASS1_BITS: i32 = 2; @@ -102,7 +104,9 @@ fn into_el(v: i32) -> i16 { #[allow(clippy::erasing_op)] #[allow(clippy::identity_op)] -pub fn fdct(data: &mut [i16; 64]) { +pub fn fdct(data: &mut AlignedBlock) { + let data = &mut data.data; + /* Pass 1: process rows. */ /* Note results are scaled up by sqrt(8) compared to a true DCT; */ /* furthermore, we scale the results by 2**PASS1_BITS. */ @@ -238,6 +242,8 @@ mod tests { // Inputs and outputs are taken from libjpegs jpeg_fdct_islow for a typical image + use crate::encoder::AlignedBlock; + use super::fdct; const INPUT1: [i16; 64] = [ @@ -269,12 +275,12 @@ mod tests { #[test] pub fn test_fdct_libjpeg() { - let mut i1 = INPUT1.clone(); + let mut i1 = AlignedBlock::new(INPUT1.clone()); fdct(&mut i1); - assert_eq!(i1, OUTPUT1); + assert_eq!(i1.data, OUTPUT1); - let mut i2 = INPUT2.clone(); + let mut i2 = AlignedBlock::new(INPUT2.clone()); fdct(&mut i2); - assert_eq!(i2, OUTPUT2); + assert_eq!(i2.data, OUTPUT2); } } diff --git a/src/writer.rs b/src/writer.rs index d0dc9cc..5812033 100644 --- a/src/writer.rs +++ b/src/writer.rs @@ -1,5 +1,5 @@ use crate::EncodingError; -use crate::encoder::Component; +use crate::encoder::{AlignedBlock, Component}; use crate::huffman::{CodingClass, HuffmanTable}; use crate::marker::{Marker, SOFType}; use crate::quantization::QuantizationTable; @@ -330,12 +330,12 @@ impl JfifWriter { pub fn write_block( &mut self, - block: &[i16; 64], + block: &AlignedBlock, prev_dc: i16, dc_table: &HuffmanTable, ac_table: &HuffmanTable, ) -> Result<(), EncodingError> { - self.write_dc(block[0], prev_dc, dc_table)?; + self.write_dc(block.data[0], prev_dc, dc_table)?; self.write_ac_block(block, 1, 64, ac_table) } @@ -355,14 +355,14 @@ impl JfifWriter { pub fn write_ac_block( &mut self, - block: &[i16; 64], + block: &AlignedBlock, start: usize, end: usize, ac_table: &HuffmanTable, ) -> Result<(), EncodingError> { let mut zero_run = 0; - for &value in &block[start..end] { + for &value in &block.data[start..end] { if value == 0 { zero_run += 1; } else {