From 03be367b8ec87aeaea1539f6f7a5ef206bf3d405 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Sun, 29 Sep 2024 14:12:12 +0100 Subject: [PATCH] Add multiversioning for filtering and make it an optional, opt-in feature --- Cargo.toml | 3 ++- src/filter.rs | 25 +++++++++++++++++-------- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index e92ef4a3..4813e8af 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,7 +26,7 @@ crc32fast = "1.2.0" fdeflate = "0.3.3" flate2 = "1.0.11" miniz_oxide = { version = "0.8", features = ["simd"] } -multiversion = "0.7.4" +multiversion = { version = "0.7.4", optional = true } [dev-dependencies] byteorder = "1.5.0" @@ -41,6 +41,7 @@ term = "0.7" [features] unstable = [] benchmarks = [] +multiversioning = ["multiversion"] [[bench]] path = "benches/decoder.rs" diff --git a/src/filter.rs b/src/filter.rs index 41d3462d..5c310f22 100644 --- a/src/filter.rs +++ b/src/filter.rs @@ -1,5 +1,6 @@ use core::convert::TryInto; +#[cfg(feature = "multiversioning")] use multiversion::multiversion; use crate::common::BytesPerPixel; @@ -14,6 +15,7 @@ mod simd { use std::simd::cmp::{SimdOrd, SimdPartialEq, SimdPartialOrd}; use std::simd::num::{SimdInt, SimdUint}; use std::simd::{u8x4, u8x8, LaneCount, Simd, SimdElement, SupportedLaneCount}; + #[cfg(feature = "multiversioning")] use multiversion::multiversion; /// This is an equivalent of the `PaethPredictor` function from @@ -170,10 +172,10 @@ mod simd { dest[0..3].copy_from_slice(&src.to_array()[0..3]) } - #[multiversion(targets( + #[cfg_attr(feature = "multiversioning", multiversion(targets( "x86_64+sse+sse2+sse3+sse4.1+ssse3", // SSE4.1 is enough because our vectors are 128bit "arm+neon", // 32-bit ARM only; 64-bit always has NEON - ))] + )))] /// Undoes `FilterType::Paeth` for `BytesPerPixel::Three`. pub fn unfilter_paeth3(mut prev_row: &[u8], mut curr_row: &mut [u8]) { debug_assert_eq!(prev_row.len(), curr_row.len()); @@ -205,10 +207,10 @@ mod simd { store3(x, curr_row); } - #[multiversion(targets( + #[cfg_attr(feature = "multiversioning", multiversion(targets( "x86_64+sse+sse2+sse3+sse4.1+ssse3", // SSE4.1 is enough because our vectors are 128bit "arm+neon", // 32-bit ARM only; 64-bit always has NEON - ))] + )))] /// Undoes `FilterType::Paeth` for `BytesPerPixel::Four` and `BytesPerPixel::Eight`. /// /// This function calculates the Paeth predictor entirely in `Simd` @@ -241,10 +243,10 @@ mod simd { dest[0..6].copy_from_slice(&src.to_array()[0..6]) } - #[multiversion(targets( + #[cfg_attr(feature = "multiversioning", multiversion(targets( "x86_64+sse+sse2+sse3+sse4.1+ssse3", // x86-64-v2, higher levels provide no benefit "arm+neon", // 32-bit ARM only; 64-bit always has NEON - ))] + )))] /// Undoes `FilterType::Paeth` for `BytesPerPixel::Six`. pub fn unfilter_paeth6(mut prev_row: &[u8], mut curr_row: &mut [u8]) { debug_assert_eq!(prev_row.len(), curr_row.len()); @@ -396,10 +398,10 @@ fn filter_paeth(a: u8, b: u8, c: u8) -> u8 { } } -#[multiversion(targets( +#[cfg_attr(feature = "multiversioning", multiversion(targets( "x86_64+sse+sse2+sse3+sse4.1+ssse3", // SSE4.1 is enough because our vectors are 128bit "arm+neon", // 32-bit ARM only; 64-bit always has NEON -))] +)))] pub(crate) fn unfilter( mut filter: FilterType, tbpp: BytesPerPixel, @@ -912,6 +914,13 @@ pub(crate) fn unfilter( } } +#[cfg_attr(feature = "multiversioning", multiversion(targets( + // SSE4.1 only gives a +15% boost to Paeth, not worth the bloat. + // AVX regresses fast filters but speeds up slow ones. Another +10% to Paeth. Not worth it? + // AVX2 makes everything go BRRRRRRRRRR, with more than double performance for Paeth + "x86_64+sse+sse2+sse3+sse4.1+sse4.2+ssse3+avx+avx2+fma", + "arm+neon", // 32-bit ARM only; 64-bit always has NEON +)))] fn filter_internal( method: FilterType, bpp: usize,