Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Function multiversioning for a performance boost #515

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ crc32fast = "1.2.0"
fdeflate = "0.3.3"
flate2 = "1.0.11"
miniz_oxide = { version = "0.8", features = ["simd"] }
multiversion = { version = "0.7.4", optional = true }

[dev-dependencies]
byteorder = "1.5.0"
Expand All @@ -40,6 +41,7 @@ term = "0.7"
[features]
unstable = []
benchmarks = []
multiversioning = ["multiversion"]

[[bench]]
path = "benches/decoder.rs"
Expand All @@ -52,6 +54,12 @@ name = "unfilter"
harness = false
required-features = ["benchmarks"]

[[bench]]
path = "benches/filter.rs"
name = "filter"
harness = false
required-features = ["benchmarks"]

[[bench]]
path = "benches/expand_paletted.rs"
name = "expand_paletted"
Expand Down
59 changes: 59 additions & 0 deletions benches/filter.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
//! Usage example:
//!
//! ```
//! $ alias bench="rustup run nightly cargo bench"
//! $ bench --bench=filter --features=benchmarks,unstable -- --save-baseline my_baseline
//! ... tweak something, say the Sub filter ...
//! $ bench --bench=filter --features=benchmarks,unstable -- filter=Sub --baseline my_baseline
//! ```

use criterion::{criterion_group, criterion_main, Criterion, Throughput};
use png::benchable_apis;
use png::FilterType;
use rand::Rng;

fn filter_all(c: &mut Criterion) {
let bpps = [1, 2, 3, 4, 6, 8];
let filters = [
FilterType::Sub,
FilterType::Up,
FilterType::Avg,
FilterType::Paeth,
];
for &filter in filters.iter() {
for &bpp in bpps.iter() {
bench_filter(c, filter, bpp);
}
}
}

// Register `filter_all` as the only target of the `benches` group, then let
// criterion generate the `main` entry point that runs that group.
criterion_group!(benches, filter_all);
criterion_main!(benches);

/// Benchmarks `benchable_apis::filter` for one `filter` / `bpp` combination.
///
/// Two rows of random bytes are generated (each `4096 * bpp` bytes long); the
/// first half is used as the previous row and the second half as the current
/// row. Throughput is reported in bytes of a single row.
fn bench_filter(c: &mut Criterion, filter: FilterType, bpp: u8) {
    /// Returns a vector of `n` bytes filled from `rng`.
    fn get_random_bytes<R: Rng>(rng: &mut R, n: usize) -> Vec<u8> {
        use rand::Fill;
        let mut result = vec![0u8; n];
        result.as_mut_slice().try_fill(rng).unwrap();
        result
    }

    let mut group = c.benchmark_group("filter");

    let mut rng = rand::thread_rng();
    let row_size = 4096 * usize::from(bpp);
    let two_rows = get_random_bytes(&mut rng, row_size * 2);
    // Output buffer reused across iterations so allocation is not measured.
    let mut out = vec![0u8; row_size];

    group.throughput(Throughput::Bytes(row_size as u64));
    group.bench_with_input(
        format!("filter={filter:?}/bpp={bpp}"),
        &two_rows,
        |b, two_rows| {
            let (prev_row, curr_row) = two_rows.split_at(row_size);
            // `benchable_apis::filter` only reads the current row, so the
            // borrowed slice is passed directly; no mutable copy is needed.
            b.iter(|| benchable_apis::filter(filter, bpp, prev_row, curr_row, &mut out));
        },
    );
    // Finish the group explicitly rather than relying on the drop fallback.
    group.finish();
}
6 changes: 6 additions & 0 deletions src/benchable_apis.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,12 @@ pub fn unfilter(filter: FilterType, tbpp: u8, previous: &[u8], current: &mut [u8
crate::filter::unfilter(filter, tbpp, previous, current)
}

/// Benchmark entry point that forwards to `crate::filter::filter`.
///
/// `tbpp` is converted to a `BytesPerPixel` via `BytesPerPixel::from_usize`,
/// and adaptive filtering is disabled (`AdaptiveFilterType::NonAdaptive`), so
/// the requested `filter` is passed through as-is. `previous`, `current` and
/// `output` are handed to the inner function unchanged; presumably the
/// filtered row is written into `output` (confirm against
/// `crate::filter::filter`).
pub fn filter(filter: FilterType, tbpp: u8, previous: &[u8], current: &[u8], output: &mut [u8]) {
    crate::filter::filter(
        filter,
        crate::AdaptiveFilterType::NonAdaptive,
        BytesPerPixel::from_usize(usize::from(tbpp)),
        previous,
        current,
        output,
    );
}

pub use crate::decoder::transform::{create_transform_fn, TransformFn};

pub fn create_info_from_plte_trns_bitdepth<'a>(
Expand Down
28 changes: 28 additions & 0 deletions src/filter.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
use core::convert::TryInto;

#[cfg(feature = "multiversioning")]
use multiversion::multiversion;

use crate::common::BytesPerPixel;

/// SIMD helpers for `fn unfilter`
Expand All @@ -9,6 +12,8 @@ use crate::common::BytesPerPixel;
/// feature of Rust gets stabilized.
#[cfg(feature = "unstable")]
mod simd {
#[cfg(feature = "multiversioning")]
use multiversion::multiversion;
use std::simd::cmp::{SimdOrd, SimdPartialEq, SimdPartialOrd};
use std::simd::num::{SimdInt, SimdUint};
use std::simd::{u8x4, u8x8, LaneCount, Simd, SimdElement, SupportedLaneCount};
Expand Down Expand Up @@ -167,6 +172,10 @@ mod simd {
dest[0..3].copy_from_slice(&src.to_array()[0..3])
}

#[cfg_attr(feature = "multiversioning", multiversion(targets(
"x86_64+sse+sse2+sse3+sse4.1+ssse3", // SSE4.1 is enough because our vectors are 128bit
"arm+neon", // 32-bit ARM only; 64-bit always has NEON
)))]
/// Undoes `FilterType::Paeth` for `BytesPerPixel::Three`.
pub fn unfilter_paeth3(mut prev_row: &[u8], mut curr_row: &mut [u8]) {
debug_assert_eq!(prev_row.len(), curr_row.len());
Expand Down Expand Up @@ -198,6 +207,10 @@ mod simd {
store3(x, curr_row);
}

#[cfg_attr(feature = "multiversioning", multiversion(targets(
"x86_64+sse+sse2+sse3+sse4.1+ssse3", // SSE4.1 is enough because our vectors are 128bit
"arm+neon", // 32-bit ARM only; 64-bit always has NEON
)))]
/// Undoes `FilterType::Paeth` for `BytesPerPixel::Four` and `BytesPerPixel::Eight`.
///
/// This function calculates the Paeth predictor entirely in `Simd<u8, N>`
Expand Down Expand Up @@ -230,6 +243,10 @@ mod simd {
dest[0..6].copy_from_slice(&src.to_array()[0..6])
}

#[cfg_attr(feature = "multiversioning", multiversion(targets(
"x86_64+sse+sse2+sse3+sse4.1+ssse3", // x86-64-v2, higher levels provide no benefit
"arm+neon", // 32-bit ARM only; 64-bit always has NEON
)))]
/// Undoes `FilterType::Paeth` for `BytesPerPixel::Six`.
pub fn unfilter_paeth6(mut prev_row: &[u8], mut curr_row: &mut [u8]) {
debug_assert_eq!(prev_row.len(), curr_row.len());
Expand Down Expand Up @@ -381,6 +398,10 @@ fn filter_paeth(a: u8, b: u8, c: u8) -> u8 {
}
}

#[cfg_attr(feature = "multiversioning", multiversion(targets(
"x86_64+sse+sse2+sse3+sse4.1+ssse3", // SSE4.1 is enough because our vectors are 128bit
"arm+neon", // 32-bit ARM only; 64-bit always has NEON
)))]
pub(crate) fn unfilter(
mut filter: FilterType,
tbpp: BytesPerPixel,
Expand Down Expand Up @@ -893,6 +914,13 @@ pub(crate) fn unfilter(
}
}

#[cfg_attr(feature = "multiversioning", multiversion(targets(
// SSE4.1 only gives a +15% boost to Paeth, which is not worth the code bloat.
// AVX regresses the fast filters but speeds up the slow ones (another +10% to Paeth);
// it is unclear whether that is worth it.
// AVX2 speeds up everything, with more than double the performance for Paeth.
"x86_64+sse+sse2+sse3+sse4.1+sse4.2+ssse3+avx+avx2+fma",
"arm+neon", // 32-bit ARM only; 64-bit always has NEON
)))]
fn filter_internal(
method: FilterType,
bpp: usize,
Expand Down
Loading