diff --git a/Cargo.toml b/Cargo.toml index 08d475bc..4813e8af 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,6 +26,7 @@ crc32fast = "1.2.0" fdeflate = "0.3.3" flate2 = "1.0.11" miniz_oxide = { version = "0.8", features = ["simd"] } +multiversion = { version = "0.7.4", optional = true } [dev-dependencies] byteorder = "1.5.0" @@ -40,6 +41,7 @@ term = "0.7" [features] unstable = [] benchmarks = [] +multiversioning = ["multiversion"] [[bench]] path = "benches/decoder.rs" @@ -52,6 +54,12 @@ name = "unfilter" harness = false required-features = ["benchmarks"] +[[bench]] +path = "benches/filter.rs" +name = "filter" +harness = false +required-features = ["benchmarks"] + [[bench]] path = "benches/expand_paletted.rs" name = "expand_paletted" diff --git a/benches/filter.rs b/benches/filter.rs new file mode 100644 index 00000000..c975dce2 --- /dev/null +++ b/benches/filter.rs @@ -0,0 +1,59 @@ +//! Usage example: +//! +//! ``` +//! $ alias bench="rustup run nightly cargo bench" +//! $ bench --bench=filter --features=benchmarks,unstable -- --save-baseline my_baseline +//! ... tweak something, say the Sub filter ... +//! $ bench --bench=filter --features=benchmarks,unstable -- filter=Sub --baseline my_baseline +//! 
``` + +use criterion::{criterion_group, criterion_main, Criterion, Throughput}; +use png::benchable_apis; +use png::FilterType; +use rand::Rng; + +fn filter_all(c: &mut Criterion) { + let bpps = [1, 2, 3, 4, 6, 8]; + let filters = [ + FilterType::Sub, + FilterType::Up, + FilterType::Avg, + FilterType::Paeth, + ]; + for &filter in filters.iter() { + for &bpp in bpps.iter() { + bench_filter(c, filter, bpp); + } + } +} + +criterion_group!(benches, filter_all); +criterion_main!(benches); + +fn bench_filter(c: &mut Criterion, filter: FilterType, bpp: u8) { + let mut group = c.benchmark_group("filter"); + + fn get_random_bytes<R: Rng>(rng: &mut R, n: usize) -> Vec<u8> { + use rand::Fill; + let mut result = vec![0u8; n]; + result.as_mut_slice().try_fill(rng).unwrap(); + result + } + let mut rng = rand::thread_rng(); + let row_size = 4096 * (bpp as usize); + let two_rows = get_random_bytes(&mut rng, row_size * 2); + let mut out = vec![0; row_size]; + + group.throughput(Throughput::Bytes(row_size as u64)); + group.bench_with_input( + format!("filter={filter:?}/bpp={bpp}"), + &two_rows, + |b, two_rows| { + let (prev_row, curr_row) = two_rows.split_at(row_size); + let mut curr_row = curr_row.to_vec(); + b.iter(|| { + benchable_apis::filter(filter, bpp, prev_row, curr_row.as_mut_slice(), &mut out) + }); + }, + ); +} diff --git a/src/benchable_apis.rs b/src/benchable_apis.rs index 17b0b0d6..c8142eb0 100644 --- a/src/benchable_apis.rs +++ b/src/benchable_apis.rs @@ -12,6 +12,12 @@ pub fn unfilter(filter: FilterType, tbpp: u8, previous: &[u8], current: &mut [u8 crate::filter::unfilter(filter, tbpp, previous, current) } +pub fn filter(filter: FilterType, tbpp: u8, previous: &[u8], current: &[u8], output: &mut [u8]) { + let tbpp = BytesPerPixel::from_usize(tbpp as usize); + let adaptive = crate::AdaptiveFilterType::NonAdaptive; + crate::filter::filter(filter, adaptive, tbpp, previous, current, output); +} + pub use crate::decoder::transform::{create_transform_fn, TransformFn}; pub fn 
create_info_from_plte_trns_bitdepth<'a>( diff --git a/src/filter.rs b/src/filter.rs index 9290a040..7cfddef3 100644 --- a/src/filter.rs +++ b/src/filter.rs @@ -1,5 +1,8 @@ use core::convert::TryInto; +#[cfg(feature = "multiversioning")] +use multiversion::multiversion; + use crate::common::BytesPerPixel; /// SIMD helpers for `fn unfilter` @@ -9,6 +12,8 @@ use crate::common::BytesPerPixel; /// feature of Rust gets stabilized. #[cfg(feature = "unstable")] mod simd { + #[cfg(feature = "multiversioning")] + use multiversion::multiversion; use std::simd::cmp::{SimdOrd, SimdPartialEq, SimdPartialOrd}; use std::simd::num::{SimdInt, SimdUint}; use std::simd::{u8x4, u8x8, LaneCount, Simd, SimdElement, SupportedLaneCount}; @@ -167,6 +172,10 @@ mod simd { dest[0..3].copy_from_slice(&src.to_array()[0..3]) } + #[cfg_attr(feature = "multiversioning", multiversion(targets( + "x86_64+sse+sse2+sse3+sse4.1+ssse3", // SSE4.1 is enough because our vectors are 128bit + "arm+neon", // 32-bit ARM only; 64-bit always has NEON + )))] /// Undoes `FilterType::Paeth` for `BytesPerPixel::Three`. pub fn unfilter_paeth3(mut prev_row: &[u8], mut curr_row: &mut [u8]) { debug_assert_eq!(prev_row.len(), curr_row.len()); @@ -198,6 +207,10 @@ mod simd { store3(x, curr_row); } + #[cfg_attr(feature = "multiversioning", multiversion(targets( + "x86_64+sse+sse2+sse3+sse4.1+ssse3", // SSE4.1 is enough because our vectors are 128bit + "arm+neon", // 32-bit ARM only; 64-bit always has NEON + )))] /// Undoes `FilterType::Paeth` for `BytesPerPixel::Four` and `BytesPerPixel::Eight`. /// /// This function calculates the Paeth predictor entirely in `Simd` @@ -230,6 +243,10 @@ mod simd { dest[0..6].copy_from_slice(&src.to_array()[0..6]) } + #[cfg_attr(feature = "multiversioning", multiversion(targets( + "x86_64+sse+sse2+sse3+sse4.1+ssse3", // x86-64-v2, higher levels provide no benefit + "arm+neon", // 32-bit ARM only; 64-bit always has NEON + )))] /// Undoes `FilterType::Paeth` for `BytesPerPixel::Six`. 
pub fn unfilter_paeth6(mut prev_row: &[u8], mut curr_row: &mut [u8]) { debug_assert_eq!(prev_row.len(), curr_row.len()); @@ -381,6 +398,10 @@ fn filter_paeth(a: u8, b: u8, c: u8) -> u8 { } } +#[cfg_attr(feature = "multiversioning", multiversion(targets( + "x86_64+sse+sse2+sse3+sse4.1+ssse3", // SSE4.1 is enough because our vectors are 128bit + "arm+neon", // 32-bit ARM only; 64-bit always has NEON +)))] pub(crate) fn unfilter( mut filter: FilterType, tbpp: BytesPerPixel, @@ -893,6 +914,13 @@ pub(crate) fn unfilter( } } +#[cfg_attr(feature = "multiversioning", multiversion(targets( + // SSE4.1 only gives a +15% boost to Paeth, not worth the bloat. + // AVX regresses fast filters but speeds up slow ones. Another +10% to Paeth. Not worth it? + // AVX2 makes everything go BRRRRRRRRRR, with more than double performance for Paeth + "x86_64+sse+sse2+sse3+sse4.1+sse4.2+ssse3+avx+avx2+fma", + "arm+neon", // 32-bit ARM only; 64-bit always has NEON +)))] fn filter_internal( method: FilterType, bpp: usize,