From 19dcfc884e10cf209bbd8019d6183fc505403e22 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Sun, 16 Feb 2025 15:13:37 +0000 Subject: [PATCH] Remove RGBA codepaths using std::simd types since they regress performance on Windows, see #567 --- src/filter.rs | 71 --------------------------------------------------- 1 file changed, 71 deletions(-) diff --git a/src/filter.rs b/src/filter.rs index c33cc205..c79031cb 100644 --- a/src/filter.rs +++ b/src/filter.rs @@ -43,23 +43,6 @@ mod simd { out.into() } - /// Functionally equivalent to `simd::paeth_predictor` but does not temporarily convert - /// the SIMD elements to `i16`. - fn paeth_predictor_u8( - a: Simd, - b: Simd, - c: Simd, - ) -> Simd - where - LaneCount: SupportedLaneCount, - { - let mut out = [0; N]; - for i in 0..N { - out[i] = super::filter_paeth_stbi(a[i].into(), b[i].into(), c[i].into()); - } - out.into() - } - /// Memory of previous pixels (as needed to unfilter `Filter::Paeth`). /// See also https://www.w3.org/TR/png/#filter-byte-positions #[derive(Default)] @@ -98,24 +81,6 @@ mod simd { state.a = x.cast::(); } - /// Computes the Paeth predictor without converting `u8` to `i16`. - /// - /// See `simd::paeth_step`. - fn paeth_step_u8( - state: &mut PaethState, - b: Simd, - x: &mut Simd, - ) where - LaneCount: SupportedLaneCount, - { - // Calculating the new value of the current pixel. - *x += paeth_predictor_u8(state.a, b, state.c); - - // Preparing for the next step. - state.c = b; - state.a = *x; - } - fn load3(src: &[u8]) -> u8x4 { u8x4::from_array([src[0], src[1], src[2], 0]) } @@ -155,30 +120,6 @@ mod simd { store3(x, curr_row); } - /// Undoes `Filter::Paeth` for `BytesPerPixel::Four` and `BytesPerPixel::Eight`. - /// - /// This function calculates the Paeth predictor entirely in `Simd` - /// without converting to an intermediate `Simd`. Doing so avoids - /// paying a small performance penalty converting between types. - pub fn unfilter_paeth_u8(prev_row: &[u8], curr_row: &mut [u8]) - where - LaneCount: SupportedLaneCount, - { - debug_assert_eq!(prev_row.len(), curr_row.len()); - debug_assert_eq!(prev_row.len() % N, 0); - assert!(matches!(N, 4 | 8)); - - let mut state = PaethState::::default(); - for (prev_row, curr_row) in prev_row.chunks_exact(N).zip(curr_row.chunks_exact_mut(N)) { - let b = Simd::from_slice(prev_row); - let mut x = Simd::from_slice(curr_row); - - paeth_step_u8(&mut state, b, &mut x); - - curr_row[..N].copy_from_slice(&x.to_array()[..N]); - } - } - fn load6(src: &[u8]) -> u8x8 { u8x8::from_array([src[0], src[1], src[2], src[3], src[4], src[5], 0, 0]) } @@ -804,12 +745,6 @@ pub(crate) fn unfilter( } } BytesPerPixel::Four => { - #[cfg(all(feature = "unstable", target_arch = "x86_64"))] - { - simd::unfilter_paeth_u8::<4>(previous, current); - return; - } - let mut a_bpp = [0; 4]; let mut c_bpp = [0; 4]; for (chunk, b_bpp) in current.chunks_exact_mut(4).zip(previous.chunks_exact(4)) @@ -860,12 +795,6 @@ pub(crate) fn unfilter( } } BytesPerPixel::Eight => { - #[cfg(all(feature = "unstable", target_arch = "x86_64"))] - { - simd::unfilter_paeth_u8::<8>(previous, current); - return; - } - let mut a_bpp = [0; 8]; let mut c_bpp = [0; 8]; for (chunk, b_bpp) in current.chunks_exact_mut(8).zip(previous.chunks_exact(8))