diff --git a/Cargo.lock b/Cargo.lock
index e09e4065..c1cecbda 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -103,6 +103,12 @@ dependencies = [
  "arrayvec",
 ]
 
+[[package]]
+name = "az"
+version = "1.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7b7e4c2464d97fe331d41de9d5db0def0a96f4d823b8b32a2efd503578988973"
+
 [[package]]
 name = "bindgen"
 version = "0.69.5"
@@ -279,6 +285,13 @@ version = "0.7.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97"
 
+[[package]]
+name = "coeffs"
+version = "0.1.0"
+dependencies = [
+ "rug",
+]
+
 [[package]]
 name = "color_quant"
 version = "1.1.0"
@@ -454,6 +467,16 @@ version = "0.3.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"
 
+[[package]]
+name = "gmp-mpfr-sys"
+version = "1.6.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b0205cd82059bc63b63cf516d714352a30c44f2c74da9961dfda2617ae6b5918"
+dependencies = [
+ "libc",
+ "windows-sys 0.52.0",
+]
+
 [[package]]
 name = "half"
 version = "2.4.1"
@@ -655,6 +678,12 @@ dependencies = [
  "windows-targets",
 ]
 
+[[package]]
+name = "libm"
+version = "0.2.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8355be11b20d696c8f18f6cc018c4e372165b1fa8126cef092399c9951984ffa"
+
 [[package]]
 name = "linux-raw-sys"
 version = "0.4.14"
@@ -1058,6 +1087,18 @@ version = "0.8.50"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "57397d16646700483b67d2dd6511d79318f9d057fdbd21a4066aeac8b41d310a"
 
+[[package]]
+name = "rug"
+version = "1.26.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "97ae2c1089ec0575193eb9222881310cc1ed8bce3646ef8b81b44b518595b79d"
+dependencies = [
+ "az",
+ "gmp-mpfr-sys",
+ "libc",
+ "libm",
+]
+
 [[package]]
 name = "rustc-hash"
 version = "1.1.0"
@@ -1515,7 +1556,7 @@ dependencies = [
 
 [[package]]
 name = "yuvutils-rs"
-version = "0.5.8"
+version = "0.5.9"
 dependencies = [
  "num-traits",
  "rayon",
diff --git a/Cargo.toml b/Cargo.toml
index e08183ad..94351395 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,8 +1,8 @@
-workspace = { members = ["app"] }
+workspace = { members = ["app", "coeffs"] }
 
 [package]
 name = "yuvutils-rs"
-version = "0.5.8"
+version = "0.5.9"
 edition = "2021"
 description = "High performance utilities for YUV format handling and conversion."
readme = "README.md"
diff --git a/README.md b/README.md
index f804e205..f335b74b 100644
--- a/README.md
+++ b/README.md
@@ -78,11 +78,11 @@ Tests performed on the image 5763x3842
 |------------------------|:----------:|:---------:|
 | utils RGB->YUV 4:2:0   |   3.16ms   |  3.53ms   |
 | libyuv RGB->YUV 4:2:0  |   3.58ms   |  33.87ms  |
-| utils RGBA->YUV 4:2:0  |   4.07ms   |  5.47ms   |
+| utils RGBA->YUV 4:2:0  |   4.04ms   |  5.47ms   |
 | libyuv RGBA->YUV 4:2:0 |   4.87ms   |  23.48ms  |
-| utils RGBA->YUV 4:2:2  |   4.46ms   |  7.08ms   |
+| utils RGBA->YUV 4:2:2  |   4.37ms   |  7.08ms   |
 | libyuv RGBA->YUV 4:2:2 |   5.90ms   |  35.23ms  |
-| utils RGBA->YUV 4:4:4  |   4.66ms   |  7.97ms   |
+| utils RGBA->YUV 4:4:4  |   4.49ms   |  7.97ms   |
 
 ### Decoding
diff --git a/app/benches/yuv8/main.rs b/app/benches/yuv8/main.rs
index 8dd9c501..5f8188fe 100644
--- a/app/benches/yuv8/main.rs
+++ b/app/benches/yuv8/main.rs
@@ -34,10 +34,10 @@ use yuv_sys::{
     rs_NV21ToABGR, rs_RGB24ToI420,
 };
 use yuvutils_rs::{
-    rgb_to_yuv420, rgb_to_yuv422, rgb_to_yuv444, rgb_to_yuv_nv12, rgba_to_yuv420, rgba_to_yuv422,
-    rgba_to_yuv444, yuv420_to_rgb, yuv420_to_rgba, yuv422_to_rgba, yuv444_to_rgba,
-    yuv_nv12_to_rgba, YuvBiPlanarImageMut, YuvChromaSubsampling, YuvPlanarImageMut, YuvRange,
-    YuvStandardMatrix,
+    gbr_to_rgba, rgb_to_gbr, rgb_to_yuv420, rgb_to_yuv422, rgb_to_yuv444, rgb_to_yuv_nv12,
+    rgba_to_yuv420, rgba_to_yuv422, rgba_to_yuv444, yuv420_to_rgb, yuv420_to_rgba, yuv422_to_rgba,
+    yuv444_to_rgba, yuv_nv12_to_rgba, YuvBiPlanarImageMut, YuvChromaSubsampling, YuvPlanarImageMut,
+    YuvRange, YuvStandardMatrix,
 };
 
 pub fn criterion_benchmark(c: &mut Criterion) {
@@ -53,9 +53,14 @@ pub fn criterion_benchmark(c: &mut Criterion) {
     let mut planar_image =
         YuvPlanarImageMut::<u8>::alloc(dimensions.0, dimensions.1, YuvChromaSubsampling::Yuv420);
 
+    let mut gbr_image =
+        YuvPlanarImageMut::<u8>::alloc(dimensions.0, dimensions.1, YuvChromaSubsampling::Yuv444);
+
     let mut bi_planar_image =
         YuvBiPlanarImageMut::<u8>::alloc(dimensions.0, dimensions.1, YuvChromaSubsampling::Yuv420);
 
+    rgb_to_gbr(&mut gbr_image, &src_bytes, stride as u32, YuvRange::Limited).unwrap();
+
     rgb_to_yuv420(
         &mut planar_image,
         &src_bytes,
@@ -79,6 +84,20 @@ pub fn criterion_benchmark(c: &mut Criterion) {
     let fixed_planar = planar_image.to_fixed();
     let rgba_image = img.to_rgba8();
 
+    let fixed_gbr = gbr_image.to_fixed();
+
+    c.bench_function("yuvutils GBR -> RGBA Limited", |b| {
+        let mut rgb_bytes = vec![0u8; dimensions.0 as usize * 4 * dimensions.1 as usize];
+        b.iter(|| {
+            gbr_to_rgba(
+                &fixed_gbr,
+                &mut rgb_bytes,
+                dimensions.0 * 4,
+                YuvRange::Limited,
+            )
+            .unwrap();
+        })
+    });
 
     c.bench_function("yuvutils RGB -> YUV 4:2:0", |b| {
         let mut test_planar = YuvPlanarImageMut::<u8>::alloc(
diff --git a/app/src/main.rs b/app/src/main.rs
index 7dd1b7be..b64d8481 100644
--- a/app/src/main.rs
+++ b/app/src/main.rs
@@ -32,11 +32,12 @@ use std::io::Read;
 use std::time::Instant;
 use yuv_sys::{rs_I420ToRGB24, rs_NV12ToRGB24, rs_NV21ToABGR, rs_NV21ToRGB24};
 use yuvutils_rs::{
-    rgb_to_yuv420, rgb_to_yuv420_p16, rgb_to_yuv422, rgb_to_yuv422_p16, rgb_to_yuv444,
-    rgb_to_yuv_nv12, yuv420_p16_to_rgb16, yuv420_to_rgb, yuv420_to_yuyv422, yuv422_p16_to_rgb16,
-    yuv422_to_rgb, yuv444_to_rgb, yuv_nv12_to_rgb, yuv_nv12_to_rgba, yuyv422_to_yuv420,
-    BufferStoreMut, YuvBiPlanarImageMut, YuvBytesPacking, YuvChromaSubsampling, YuvEndianness,
-    YuvPackedImage, YuvPackedImageMut, YuvPlanarImageMut, YuvRange, YuvStandardMatrix,
+    gbr_to_rgb, rgb_to_gbr, rgb_to_yuv420, rgb_to_yuv420_p16, rgb_to_yuv422, rgb_to_yuv422_p16,
+    rgb_to_yuv444, rgb_to_yuv_nv12, yuv420_p16_to_rgb16, yuv420_to_rgb, yuv420_to_yuyv422,
+    yuv422_p16_to_rgb16, yuv422_to_rgb, yuv444_to_rgb, yuv_nv12_to_rgb, yuv_nv12_to_rgba,
+    yuyv422_to_yuv420, BufferStoreMut, YuvBiPlanarImageMut, YuvBytesPacking, YuvChromaSubsampling,
+    YuvEndianness, YuvPackedImage, YuvPackedImageMut, YuvPlanarImageMut, YuvRange,
+    YuvStandardMatrix,
 };
 
 fn read_file_bytes(file_path: &str) -> Result<Vec<u8>, String> {
@@ -94,20 +95,17 @@ fn main() {
         YuvBiPlanarImageMut::<u8>::alloc(width as u32, height as u32, YuvChromaSubsampling::Yuv420);
 
     let mut planar_image =
-        YuvPlanarImageMut::<u8>::alloc(width as u32, height as u32, YuvChromaSubsampling::Yuv422);
+        YuvPlanarImageMut::<u8>::alloc(width as u32, height as u32, YuvChromaSubsampling::Yuv420);
 
-    let mut bytes_16: Vec<u16> = src_bytes.iter().map(|&x| (x as u16) << 4).collect();
+    // let mut bytes_16: Vec<u16> = src_bytes.iter().map(|&x| (x as u16) << 4).collect();
 
     let start_time = Instant::now();
-    rgb_to_yuv422_p16(
+    rgb_to_yuv420(
         &mut planar_image,
-        &bytes_16,
+        &src_bytes,
         rgba_stride as u32,
-        12,
         YuvRange::Limited,
         YuvStandardMatrix::Bt601,
-        YuvEndianness::LittleEndian,
-        YuvBytesPacking::LeastSignificantBytes,
     )
     .unwrap();
     // bytes_16.fill(0);
@@ -260,15 +258,12 @@ fn main() {
     // let rgba_stride = width as usize * 4;
     // let mut rgba = vec![0u8; height as usize * rgba_stride];
 
-    yuv422_p16_to_rgb16(
+    yuv420_to_rgb(
         &fixed_planar,
-        &mut bytes_16,
+        &mut rgba,
         rgba_stride as u32,
-        12,
         YuvRange::Limited,
         YuvStandardMatrix::Bt601,
-        YuvEndianness::LittleEndian,
-        YuvBytesPacking::LeastSignificantBytes,
     )
     .unwrap();
 
@@ -309,7 +304,7 @@ fn main() {
     //     chunk[2] = b;
     // });
 
-    rgba = bytes_16.iter().map(|&x| (x >> 4) as u8).collect();
+    // rgba = bytes_16.iter().map(|&x| (x >> 4) as u8).collect();
 
     image::save_buffer(
         "converted_sharp15.jpg",
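The demo and benchmark changes above exercise the new GBR (identity-matrix) entry points. A minimal usage sketch, pieced together from the benchmark code shown earlier in this diff; the width * 3 RGB stride and the round trip through to_fixed() are assumptions of this example, not additional API:

    use yuvutils_rs::{gbr_to_rgba, rgb_to_gbr, YuvChromaSubsampling, YuvPlanarImageMut, YuvRange};

    fn rgb_roundtrip_through_gbr(rgb: &[u8], width: u32, height: u32) -> Vec<u8> {
        // GBR is always 4:4:4: one full-resolution plane per channel (G, B, R).
        let mut gbr = YuvPlanarImageMut::<u8>::alloc(width, height, YuvChromaSubsampling::Yuv444);
        rgb_to_gbr(&mut gbr, rgb, width * 3, YuvRange::Limited).unwrap();

        // Convert back, expanding the limited range while interleaving to RGBA.
        let mut rgba = vec![0u8; width as usize * 4 * height as usize];
        gbr_to_rgba(&gbr.to_fixed(), &mut rgba, width * 4, YuvRange::Limited).unwrap();
        rgba
    }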
diff --git a/coeffs/src/main.rs b/coeffs/src/main.rs
index 54faf70d..8de53d8d 100644
--- a/coeffs/src/main.rs
+++ b/coeffs/src/main.rs
@@ -182,31 +182,31 @@ fn get_forward_coeffs_integral(
     CbCrForwardTransform {
         yr: (yr * Float::with_val(BITS, prec))
-            .to_i32_saturating_round(Round::Down)
+            .to_i32_saturating_round(Round::Nearest)
             .unwrap(),
         yg: (yg * Float::with_val(BITS, prec))
-            .to_i32_saturating_round(Round::Down)
+            .to_i32_saturating_round(Round::Nearest)
             .unwrap(),
         yb: (yb * Float::with_val(BITS, prec))
-            .to_i32_saturating_round(Round::Down)
+            .to_i32_saturating_round(Round::Nearest)
             .unwrap(),
         cb_r: (cb_r * Float::with_val(BITS, prec))
-            .to_i32_saturating_round(Round::Down)
+            .to_i32_saturating_round(Round::Nearest)
             .unwrap(),
         cb_g: (cb_g * Float::with_val(BITS, prec))
-            .to_i32_saturating_round(Round::Down)
+            .to_i32_saturating_round(Round::Nearest)
             .unwrap(),
         cb_b: (cb_b * Float::with_val(BITS, prec))
-            .to_i32_saturating_round(Round::Down)
+            .to_i32_saturating_round(Round::Nearest)
             .unwrap(),
         cr_r: (cr_r * Float::with_val(BITS, prec))
-            .to_i32_saturating_round(Round::Down)
+            .to_i32_saturating_round(Round::Nearest)
             .unwrap(),
         cr_g: (cr_g * Float::with_val(BITS, prec))
-            .to_i32_saturating_round(Round::Down)
+            .to_i32_saturating_round(Round::Nearest)
             .unwrap(),
         cr_b: (cr_b * Float::with_val(BITS, prec))
-            .to_i32_saturating_round(Round::Down)
+            .to_i32_saturating_round(Round::Nearest)
             .unwrap(),
     }
 }
@@ -300,21 +300,16 @@ pub fn get_inverse_transform_integral(
         * range_uv();
     let prec = (1 << prec) as f32;
     let y_coeff = y_coef * Float::with_val(BITS, prec);
-    println!("Y Coeff {}", y_coeff);
     let cr_coeff = cr_coeff * Float::with_val(BITS, prec);
-    println!("Cr Coeff {}", cr_coeff);
     let cb_coeff = cb_coeff * Float::with_val(BITS, prec);
-    println!("Cb Coeff {}", cr_coeff);
     let g_coeff_1 = g_coeff_1 * Float::with_val(BITS, prec);
-    println!("G Coeff 1 {}", g_coeff_1);
     let g_coeff_2 = g_coeff_2 * Float::with_val(BITS, prec);
-    println!("G Coeff 2 {}", g_coeff_2);
 
     CbCrInverseTransform::new(
-        y_coeff.to_i32_saturating_round(Round::Down).unwrap(),
-        cr_coeff.to_i32_saturating_round(Round::Down).unwrap(),
-        cb_coeff.to_i32_saturating_round(Round::Down).unwrap(),
-        g_coeff_1.to_i32_saturating_round(Round::Down).unwrap(),
-        g_coeff_2.to_i32_saturating_round(Round::Down).unwrap(),
+        y_coeff.to_i32_saturating_round(Round::Nearest).unwrap(),
+        cr_coeff.to_i32_saturating_round(Round::Nearest).unwrap(),
+        cb_coeff.to_i32_saturating_round(Round::Nearest).unwrap(),
+        g_coeff_1.to_i32_saturating_round(Round::Nearest).unwrap(),
+        g_coeff_2.to_i32_saturating_round(Round::Nearest).unwrap(),
     )
 }
 
@@ -369,15 +364,16 @@ impl YuvStandardMatrix {
 }
 
 fn main() {
-    let transform = get_forward_coeffs(0.2220f32, 0.0713f32, 8, YuvRange::Limited);
-    println!("Precise {:?}", transform);
-    let integral = get_forward_coeffs_integral(0.2220f32, 0.0713f32, 8, YuvRange::Limited, 13);
-    println!("Integral {:?}", integral);
+    let kr_kb = YuvStandardMatrix::Bt2020.get_kr_kb();
+    let range = YuvRange::Full;
+    let bit_depth = 12;
+    let transform = get_forward_coeffs(kr_kb.kr, kr_kb.kb, bit_depth, range);
+    println!("Precise {:?};", transform);
+    let integral = get_forward_coeffs_integral(kr_kb.kr, kr_kb.kb, bit_depth, range, 13);
+    println!("Integral {:?};", integral);
 
-    let kr_kb = YuvStandardMatrix::Bt601.get_kr_kb();
-    let inverse = get_inverse_transform(kr_kb.kr, kr_kb.kb, 8, YuvRange::Limited);
+    let inverse = get_inverse_transform(kr_kb.kr, kr_kb.kb, bit_depth, range);
     println!("Inverse {:?}", inverse);
-    let inverse_integral =
-        get_inverse_transform_integral(kr_kb.kr, kr_kb.kb, 8, 13, YuvRange::Limited);
+    let inverse_integral = get_inverse_transform_integral(kr_kb.kr, kr_kb.kb, bit_depth, 13, range);
     println!("Inverse Integral {:?};", inverse_integral);
 }
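The switch from Round::Down to Round::Nearest in the generator above is what produces the scattered ±1 adjustments in src/built_coefficients.rs later in this diff. A worked example at the 13-bit precision the tables use (plain f64 here purely for illustration; the generator itself works with rug's arbitrary-precision floats):

    fn main() {
        // BT.601 green luma weight: Kg = 1 - Kr - Kb = 1 - 0.299 - 0.114 = 0.587.
        let yg = 0.587f64 * (1 << 13) as f64; // 4808.704 in Q13
        assert_eq!(yg.floor() as i32, 4808); // Round::Down, the old table value
        assert_eq!(yg.round() as i32, 4809); // Round::Nearest, the new table value
    }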
diff --git a/src/avx2/gbr_to_rgb.rs b/src/avx2/gbr_to_rgb.rs
new file mode 100644
index 00000000..37c065cc
--- /dev/null
+++ b/src/avx2/gbr_to_rgb.rs
@@ -0,0 +1,230 @@
+/*
+ * Copyright (c) Radzivon Bartoshyk, 11/2024. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+use crate::avx2::avx2_utils::{_mm256_store_interleaved_epi8, avx2_pack_u16, avx2_store_u8_rgb};
+use crate::yuv_support::YuvSourceChannels;
+#[cfg(target_arch = "x86")]
+use std::arch::x86::*;
+#[cfg(target_arch = "x86_64")]
+use std::arch::x86_64::*;
+
+pub(crate) fn avx_yuv_to_rgba_row_full<const DESTINATION_CHANNELS: u8>(
+    g_plane: &[u8],
+    b_plane: &[u8],
+    r_plane: &[u8],
+    rgba: &mut [u8],
+    start_cx: usize,
+    width: usize,
+) -> usize {
+    unsafe {
+        avx_yuv_to_rgba_row_full_impl::<DESTINATION_CHANNELS>(
+            g_plane, b_plane, r_plane, rgba, start_cx, width,
+        )
+    }
+}
+
+#[target_feature(enable = "avx2")]
+unsafe fn avx_yuv_to_rgba_row_full_impl<const DESTINATION_CHANNELS: u8>(
+    g_plane: &[u8],
+    b_plane: &[u8],
+    r_plane: &[u8],
+    rgba: &mut [u8],
+    start_cx: usize,
+    width: usize,
+) -> usize {
+    let mut cx = start_cx;
+
+    let destination_channels: YuvSourceChannels = DESTINATION_CHANNELS.into();
+
+    let v_alpha = _mm256_set1_epi8(255u8 as i8);
+
+    while cx + 32 < width {
+        let g_values = _mm256_loadu_si256(g_plane.get_unchecked(cx..).as_ptr() as *const _);
+        let b_values = _mm256_loadu_si256(b_plane.get_unchecked(cx..).as_ptr() as *const _);
+        let r_values = _mm256_loadu_si256(r_plane.get_unchecked(cx..).as_ptr() as *const _);
+
+        let dst_shift = cx * destination_channels.get_channels_count();
+        let rgba_ptr = rgba.get_unchecked_mut(dst_shift..);
+
+        match destination_channels {
+            YuvSourceChannels::Rgb => {
+                avx2_store_u8_rgb(rgba_ptr.as_mut_ptr(), r_values, g_values, b_values);
+            }
+            YuvSourceChannels::Bgr => {
+                avx2_store_u8_rgb(rgba_ptr.as_mut_ptr(), b_values, g_values, r_values);
+            }
+            YuvSourceChannels::Rgba => {
+                _mm256_store_interleaved_epi8(
+                    rgba_ptr.as_mut_ptr(),
+                    r_values,
+                    g_values,
+                    b_values,
+                    v_alpha,
+                );
+            }
+            YuvSourceChannels::Bgra => {
+                _mm256_store_interleaved_epi8(
+                    rgba_ptr.as_mut_ptr(),
+                    b_values,
+                    g_values,
+                    r_values,
+                    v_alpha,
+                );
+            }
+        }
+
+        cx += 32;
+    }
+
+    cx
+}
+
+pub(crate) fn avx_yuv_to_rgba_row_limited<const DESTINATION_CHANNELS: u8>(
+    g_plane: &[u8],
+    b_plane: &[u8],
+    r_plane: &[u8],
+    rgba: &mut [u8],
+    start_cx: usize,
+    width: usize,
+    y_bias: i32,
+    y_coeff: i32,
+) -> usize {
+    unsafe {
+        avx_yuv_to_rgba_row_limited_impl::<DESTINATION_CHANNELS>(
+            g_plane, b_plane, r_plane, rgba, start_cx, width, y_bias, y_coeff,
+        )
+    }
+}
+
+#[target_feature(enable = "avx2")]
+unsafe fn avx_yuv_to_rgba_row_limited_impl<const DESTINATION_CHANNELS: u8>(
+    g_plane: &[u8],
+    b_plane: &[u8],
+    r_plane: &[u8],
+    rgba: &mut [u8],
+    start_cx: usize,
+    width: usize,
+    y_bias: i32,
+    y_coeff: i32,
+) -> usize {
+    let mut cx = start_cx;
+
+    let destination_channels: YuvSourceChannels = DESTINATION_CHANNELS.into();
+
+    let v_alpha = _mm256_set1_epi8(255u8 as i8);
+
+    const V_SCALE: i32 = 2;
+
+    let vy_coeff = _mm256_set1_epi16(y_coeff as i16);
+    let vy_bias = _mm256_set1_epi8(y_bias as i8);
+
+    while cx + 32 < width {
+        let g_values0 = _mm256_subs_epu8(
+            _mm256_loadu_si256(g_plane.get_unchecked(cx..).as_ptr() as *const _),
+            vy_bias,
+        );
+        let b_values0 = _mm256_subs_epu8(
+            _mm256_loadu_si256(b_plane.get_unchecked(cx..).as_ptr() as *const _),
+            vy_bias,
+        );
+        let r_values0 = _mm256_subs_epu8(
+            _mm256_loadu_si256(r_plane.get_unchecked(cx..).as_ptr() as *const _),
+            vy_bias,
+        );
+
+        let rl_hi = _mm256_mulhrs_epi16(
+            _mm256_slli_epi16::<V_SCALE>(_mm256_cvtepu8_epi16(_mm256_castsi256_si128(r_values0))),
+            vy_coeff,
+        );
+        let gl_hi = _mm256_mulhrs_epi16(
+            _mm256_slli_epi16::<V_SCALE>(_mm256_cvtepu8_epi16(_mm256_castsi256_si128(g_values0))),
+            vy_coeff,
+        );
+        let bl_hi = _mm256_mulhrs_epi16(
+            _mm256_slli_epi16::<V_SCALE>(_mm256_cvtepu8_epi16(_mm256_castsi256_si128(b_values0))),
+            vy_coeff,
+        );
+
+        let rl_lo = _mm256_mulhrs_epi16(
+            _mm256_slli_epi16::<V_SCALE>(_mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(
+                r_values0,
+            ))),
+            vy_coeff,
+        );
+        let gl_lo = _mm256_mulhrs_epi16(
+            _mm256_slli_epi16::<V_SCALE>(_mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(
+                g_values0,
+            ))),
+            vy_coeff,
+        );
+        let bl_lo = _mm256_mulhrs_epi16(
+            _mm256_slli_epi16::<V_SCALE>(_mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(
+                b_values0,
+            ))),
+            vy_coeff,
+        );
+
+        let r_values = avx2_pack_u16(rl_lo, rl_hi);
+        let g_values = avx2_pack_u16(gl_lo, gl_hi);
+        let b_values = avx2_pack_u16(bl_lo, bl_hi);
+
+        let dst_shift = cx * destination_channels.get_channels_count();
+        let rgba_ptr = rgba.get_unchecked_mut(dst_shift..);
+
+        match destination_channels {
+            YuvSourceChannels::Rgb => {
+                avx2_store_u8_rgb(rgba_ptr.as_mut_ptr(), r_values, g_values, b_values);
+            }
+            YuvSourceChannels::Bgr => {
+                avx2_store_u8_rgb(rgba_ptr.as_mut_ptr(), b_values, g_values, r_values);
+            }
+            YuvSourceChannels::Rgba => {
+                _mm256_store_interleaved_epi8(
+                    rgba_ptr.as_mut_ptr(),
+                    r_values,
+                    g_values,
+                    b_values,
+                    v_alpha,
+                );
+            }
+            YuvSourceChannels::Bgra => {
+                _mm256_store_interleaved_epi8(
+                    rgba_ptr.as_mut_ptr(),
+                    b_values,
+                    g_values,
+                    r_values,
+                    v_alpha,
+                );
+            }
+        }
+
+        cx += 32;
+    }
+
+    cx
+}
diff --git a/src/avx2/mod.rs b/src/avx2/mod.rs
index e3811da1..781ceadd 100644
--- a/src/avx2/mod.rs
+++ b/src/avx2/mod.rs
@@ -29,6 +29,7 @@
 #![deny(unreachable_code, unreachable_pub)]
 mod avx2_utils;
 mod avx2_ycgco;
+mod gbr_to_rgb;
 mod rgb_to_nv;
 mod rgb_to_y;
 mod rgb_to_ycgco;
@@ -47,6 +48,7 @@ mod yuv_to_yuv2;
 mod yuy2_to_rgb;
 mod yuy2_to_yuv;
 
+pub(crate) use gbr_to_rgb::{avx_yuv_to_rgba_row_full, avx_yuv_to_rgba_row_limited};
 pub(crate) use rgb_to_nv::avx2_rgba_to_nv;
 pub(crate) use rgb_to_y::avx2_rgb_to_y_row;
 pub(crate) use rgb_to_ycgco::avx2_rgb_to_ycgco_row;
diff --git a/src/avx2/rgb_to_ycgco.rs b/src/avx2/rgb_to_ycgco.rs
index 1e5e9dfb..b37e4827 100644
--- a/src/avx2/rgb_to_ycgco.rs
+++ b/src/avx2/rgb_to_ycgco.rs
@@ -66,7 +66,7 @@ pub(crate) unsafe fn avx2_rgb_to_ycgco_row
     let v_g_coeff_2 = _mm256_set1_epi16(-(transform.g_coeff_2 as i16));
     let v_alpha = _mm256_set1_epi8(255u8 as i8);
     let zeros = _mm256_setzero_si256();
-    let rounding_const = _mm256_set1_epi16(1 << 5);
+    let rounding_const = _mm256_set1_epi16((1 << 5) - 1);
 
     for x in (_yuy2_x..max_x_32).step_by(32) {
         let yuy2_offset = x * 4;
diff --git a/src/built_coefficients.rs b/src/built_coefficients.rs
index 51a819b4..10579edb 100644
--- a/src/built_coefficients.rs
+++ b/src/built_coefficients.rs
@@ -31,49 +31,49 @@ use crate::{YuvRange, YuvStandardMatrix};
 
 static FORWARD_BT601_FULL_8_13PREC: CbCrForwardTransform<i32> = CbCrForwardTransform {
     yr: 2449,
-    yg: 4808,
-    yb: 933,
-    cb_r: -1383,
+    yg: 4809,
+    yb: 934,
+    cb_r: -1382,
     cb_g: -2714,
     cb_b: 4096,
     cr_r: 4096,
     cr_g: -3430,
-    cr_b: -667,
+    cr_b: -666,
 };
 
 static FORWARD_BT601_LIMITED_8_13PREC: CbCrForwardTransform<i32> = CbCrForwardTransform {
-    yr: 2103,
-    yg: 4129,
+    yr: 2104,
+    yg: 4130,
     yb: 802,
-    cb_r: -1215,
+    cb_r: -1214,
     cb_g: -2384,
     cb_b: 3598,
     cr_r: 3598,
     cr_g: -3013,
-    cr_b: -586,
+    cr_b: -585,
 };
 
 static FORWARD_BT709_LIMITED_8_13PREC: CbCrForwardTransform<i32> = CbCrForwardTransform {
-    yr: 1495,
-    yg: 5031,
-    yb: 507,
-    cb_r: -825,
+    yr: 1496,
+    yg: 5032,
+    yb: 508,
+    cb_r: -824,
     cb_g: -2774,
     cb_b: 3598,
     cr_r: 3598,
-    cr_g: -3269,
+    cr_g: -3268,
     cr_b: -330,
 };
 
 static FORWARD_BT709_FULL_8_13PREC: CbCrForwardTransform<i32> = CbCrForwardTransform {
-    yr: 1741,
-    yg: 5858,
+    yr: 1742,
+    yg: 5859,
     yb: 591,
     cb_r: -939,
-    cb_g: -3158,
+    cb_g: -3157,
     cb_b: 4096,
     cr_r: 4096,
-    cr_g: -3721,
+    cr_g: -3720,
     cr_b: -376,
 };
 
@@ -82,70 +82,70 @@ static FORWARD_BT2020_LIMITED_8_13PREC: CbCrForwardTransform<i32> = CbCrForwardT
     yg: 4770,
     yb: 417,
     cb_r: -1005,
-    cb_g: -2594,
+    cb_g: -2593,
     cb_b: 3598,
     cr_r: 3598,
     cr_g: -3309,
-    cr_b: -290,
+    cr_b: -289,
 };
 
 static FORWARD_BT2020_FULL_8_13PREC: CbCrForwardTransform<i32> = CbCrForwardTransform {
     yr: 2152,
     yg: 5554,
-    yb: 485,
+    yb: 486,
     cb_r: -1144,
-    cb_g: -2953,
+    cb_g: -2952,
     cb_b: 4096,
     cr_r: 4096,
     cr_g: -3767,
-    cr_b: -330,
+    cr_b: -329,
 };
 
 static FORWARD_SMPTE240_LIMITED_8_13PREC: CbCrForwardTransform<i32> = CbCrForwardTransform {
     yr: 612,
-    yg: 4931,
-    yb: 1491,
-    cb_r: -398,
+    yg: 4932,
+    yb: 1492,
+    cb_r: -397,
     cb_g: -3201,
     cb_b: 3598,
     cr_r: 3598,
     cr_g: -2763,
-    cr_b: -836,
+    cr_b: -835,
 };
 
 static FORWARD_SMPTE240_FULL_8_13PREC: CbCrForwardTransform<i32> = CbCrForwardTransform {
-    yr: 712,
-    yg: 5742,
-    yb: 1736,
-    cb_r: -453,
+    yr: 713,
+    yg: 5743,
+    yb: 1737,
+    cb_r: -452,
     cb_g: -3644,
     cb_b: 4096,
     cr_r: 4096,
     cr_g: -3145,
-    cr_b: -952,
+    cr_b: -951,
 };
 
 static FORWARD_BT470_FULL_8_13PREC: CbCrForwardTransform<i32> = CbCrForwardTransform {
-    yr: 1818,
+    yr: 1819,
     yg: 5789,
     yb: 584,
-    cb_r: -980,
+    cb_r: -979,
     cb_g: -3117,
     cb_b: 4096,
     cr_r: 4096,
     cr_g: -3721,
-    cr_b: -376,
+    cr_b: -375,
 };
 
 static FORWARD_BT470_LIMITED_8_13PREC: CbCrForwardTransform<i32> = CbCrForwardTransform {
-    yr: 1561,
-    yg: 4971,
-    yb: 501,
-    cb_r: -861,
+    yr: 1562,
+    yg: 4972,
+    yb: 502,
+    cb_r: -860,
     cb_g: -2738,
     cb_b: 3598,
     cr_r: 3598,
-    cr_g: -3269,
+    cr_g: -3268,
     cr_b: -330,
 };
 
@@ -190,10 +190,10 @@ pub(crate) fn get_built_forward_transform(
 }
 
 static INVERSE_BT601_LIMITED_8_PREC13: CbCrInverseTransform<i32> = CbCrInverseTransform {
-    y_coef: 9538,
-    cr_coef: 13074,
+    y_coef: 9539,
+    cr_coef: 13075,
     cb_coef: 16525,
-    g_coeff_1: 6659,
+    g_coeff_1: 6660,
     g_coeff_2: 3209,
 };
 
@@ -206,75 +206,75 @@ static INVERSE_BT601_FULL_8_PREC13: CbCrInverseTransform<i32> = CbCrInverseTrans
 };
 
 static INVERSE_BT709_LIMITED_8_PREC13: CbCrInverseTransform<i32> = CbCrInverseTransform {
-    y_coef: 9538,
+    y_coef: 9539,
     cr_coef: 14686,
-    cb_coef: 17304,
-    g_coeff_1: 4365,
-    g_coeff_2: 1746,
+    cb_coef: 17305,
+    g_coeff_1: 4366,
+    g_coeff_2: 1747,
 };
 
 static INVERSE_BT709_FULL_8_PREC13: CbCrInverseTransform<i32> = CbCrInverseTransform {
     y_coef: 8192,
-    cr_coef: 12900,
+    cr_coef: 12901,
     cb_coef: 15201,
-    g_coeff_1: 3834,
-    g_coeff_2: 1534,
+    g_coeff_1: 3835,
+    g_coeff_2: 1535,
 };
 
 static INVERSE_BT2020_LIMITED_8_PREC13: CbCrInverseTransform<i32> = CbCrInverseTransform {
-    y_coef: 9538,
-    cr_coef: 13751,
+    y_coef: 9539,
+    cr_coef: 13752,
     cb_coef: 17545,
     g_coeff_1: 5328,
-    g_coeff_2: 1534,
+    g_coeff_2: 1535,
 };
 
 static INVERSE_BT2020_FULL_8_PREC13: CbCrInverseTransform<i32> = CbCrInverseTransform {
     y_coef: 8192,
-    cr_coef: 12079,
+    cr_coef: 12080,
     cb_coef: 15412,
-    g_coeff_1: 4680,
+    g_coeff_1: 4681,
     g_coeff_2: 1348,
 };
 
 static INVERSE_SMPTE240_LIMITED_8_PREC13: CbCrInverseTransform<i32> = CbCrInverseTransform {
-    y_coef: 9538,
-    cr_coef: 17028,
+    y_coef: 9539,
+    cr_coef: 17029,
     cb_coef: 14697,
     g_coeff_1: 2113,
-    g_coeff_2: 4444,
+    g_coeff_2: 4445,
 };
 
 static INVERSE_SMPTE240_FULL_8_PREC13: CbCrInverseTransform<i32> = CbCrInverseTransform {
     y_coef: 8192,
-    cr_coef: 14958,
-    cb_coef: 12910,
+    cr_coef: 14959,
+    cb_coef: 12911,
     g_coeff_1: 1856,
     g_coeff_2: 3904,
 };
 
 static INVERSE_BT470_LIMITED_8_PREC13: CbCrInverseTransform<i32> = CbCrInverseTransform {
-    y_coef: 9538,
-    cr_coef: 14510,
-    cb_coef: 17321,
+    y_coef: 9539,
+    cr_coef: 14511,
+    cb_coef: 17322,
     g_coeff_1: 4558,
-    g_coeff_2: 1747,
+    g_coeff_2: 1748,
 };
 
 static INVERSE_BT470_FULL_8_PREC13: CbCrInverseTransform<i32> = CbCrInverseTransform {
     y_coef: 8192,
-    cr_coef: 12746,
-    cb_coef: 15215,
+    cr_coef: 12747,
+    cb_coef: 15216,
     g_coeff_1: 4004,
     g_coeff_2: 1535,
 };
 
 static INVERSE_BT601_LIMITED_10_PREC13: CbCrInverseTransform<i32> = CbCrInverseTransform {
-    y_coef: 9566,
+    y_coef: 9567,
     cr_coef: 13113,
-    cb_coef: 16573,
+    cb_coef: 16574,
     g_coeff_1: 6679,
-    g_coeff_2: 3218,
+    g_coeff_2: 3219,
 };
 
@@ -286,40 +286,41 @@ static INVERSE_BT601_FULL_10_PREC13: CbCrInverseTransform<i32> = CbCrInverseTran
 };
 
 static INVERSE_BT709_LIMITED_10_PREC13: CbCrInverseTransform<i32> = CbCrInverseTransform {
-    y_coef: 9566,
+    y_coef: 9567,
     cr_coef: 14729,
-    cb_coef: 17355,
+    cb_coef: 17356,
     g_coeff_1: 4378,
     g_coeff_2: 1752,
 };
 
 static INVERSE_BT709_FULL_10_PREC13: CbCrInverseTransform<i32> = CbCrInverseTransform {
     y_coef: 8192,
-    cr_coef: 12900,
+    cr_coef: 12901,
     cb_coef: 15201,
-    g_coeff_1: 3834,
-    g_coeff_2: 1534,
+    g_coeff_1: 3835,
+    g_coeff_2: 1535,
 };
 
 static INVERSE_BT2020_LIMITED_10_PREC13: CbCrInverseTransform<i32> = CbCrInverseTransform {
-    y_coef: 9566,
+    y_coef: 9567,
     cr_coef: 13792,
     cb_coef: 17597,
-    g_coeff_1: 5343,
+    g_coeff_1: 5344,
     g_coeff_2: 1539,
 };
+
 static INVERSE_BT2020_FULL_10_PREC13: CbCrInverseTransform<i32> = CbCrInverseTransform {
     y_coef: 8192,
-    cr_coef: 12079,
+    cr_coef: 12080,
     cb_coef: 15412,
-    g_coeff_1: 4680,
+    g_coeff_1: 4681,
     g_coeff_2: 1348,
 };
 
 static INVERSE_BT601_LIMITED_12_PREC13: CbCrInverseTransform<i32> = CbCrInverseTransform {
-    y_coef: 9573,
-    cr_coef: 13122,
-    cb_coef: 16585,
+    y_coef: 9574,
+    cr_coef: 13123,
+    cb_coef: 16586,
     g_coeff_1: 6684,
     g_coeff_2: 3221,
 };
 
@@ -333,33 +334,34 @@ static INVERSE_BT601_FULL_12_PREC13: CbCrInverseTransform<i32> = CbCrInverseTran
 };
 
 static INVERSE_BT709_LIMITED_12_PREC13: CbCrInverseTransform<i32> = CbCrInverseTransform {
-    y_coef: 9573,
+    y_coef: 9574,
     cr_coef: 14740,
     cb_coef: 17368,
-    g_coeff_1: 4381,
+    g_coeff_1: 4382,
     g_coeff_2: 1753,
 };
 
 static INVERSE_BT709_FULL_12_PREC13: CbCrInverseTransform<i32> = CbCrInverseTransform {
     y_coef: 8192,
-    cr_coef: 12900,
+    cr_coef: 12901,
     cb_coef: 15201,
-    g_coeff_1: 3834,
-    g_coeff_2: 1534,
+    g_coeff_1: 3835,
+    g_coeff_2: 1535,
 };
 
 static INVERSE_BT2020_LIMITED_12_PREC13: CbCrInverseTransform<i32> = CbCrInverseTransform {
-    y_coef: 9573,
+    y_coef: 9574,
     cr_coef: 13802,
-    cb_coef: 17609,
-    g_coeff_1: 5347,
+    cb_coef: 17610,
+    g_coeff_1: 5348,
     g_coeff_2: 1540,
 };
+
 static INVERSE_BT2020_FULL_12_PREC13: CbCrInverseTransform<i32> = CbCrInverseTransform {
     y_coef: 8192,
-    cr_coef: 12079,
+    cr_coef: 12080,
     cb_coef: 15412,
-    g_coeff_1: 4680,
+    g_coeff_1: 4681,
     g_coeff_2: 1348,
 };
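Together with the nearest-rounded tables above, the rounding bias added before the final right shift changes from 1 << (PRECISION - 1) to (1 << (PRECISION - 1)) - 1 throughout the fixed-point paths below, so an exact midpoint now rounds down rather than up. A small self-check of that behaviour at PRECISION = 13 with an 8-bit depth, mirroring qrshr from src/numerics.rs further down (standalone sketch, not the crate's own code):

    fn qrshr13(val: i32) -> i32 {
        let rounding = (1 << 12) - 1; // 4095 instead of 4096
        ((val + rounding) >> 13).clamp(0, 255)
    }

    fn main() {
        assert_eq!(qrshr13(8192), 1); // an exact integer is unaffected
        assert_eq!(qrshr13(4096), 0); // an exact half now rounds down
        assert_eq!(qrshr13(4097), 1); // anything past the half still rounds up
    }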
diff --git a/src/from_identity.rs b/src/from_identity.rs
index 43b2c63e..8e442ac1 100644
--- a/src/from_identity.rs
+++ b/src/from_identity.rs
@@ -27,7 +27,13 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 #![forbid(unsafe_code)]
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+use crate::avx2::{avx_yuv_to_rgba_row_full, avx_yuv_to_rgba_row_limited};
+#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
+use crate::neon::{yuv_to_rgba_row_full, yuv_to_rgba_row_limited, yuv_to_rgba_row_limited_rdm};
 use crate::numerics::qrshr;
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+use crate::sse::{sse_yuv_to_rgba_row_full, sse_yuv_to_rgba_row_limited};
 use crate::yuv_error::check_rgba_destination;
 use crate::yuv_support::{get_yuv_range, YuvSourceChannels};
 use crate::{YuvChromaSubsampling, YuvError, YuvPlanarImage, YuvRange};
@@ -37,8 +43,183 @@ use rayon::iter::{IndexedParallelIterator, ParallelIterator};
 #[cfg(feature = "rayon")]
 use rayon::prelude::{ParallelSlice, ParallelSliceMut};
 use std::fmt::Debug;
+use std::marker::PhantomData;
 use std::mem::size_of;
 
+struct WideRowGbrProcessor<T> {
+    _phantom: PhantomData<T>,
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    _use_sse: bool,
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    _use_avx: bool,
+}
+
+impl<T> Default for WideRowGbrProcessor<T> {
+    fn default() -> Self {
+        WideRowGbrProcessor {
+            _phantom: PhantomData,
+            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+            _use_sse: std::arch::is_x86_feature_detected!("sse4.1"),
+            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+            _use_avx: std::arch::is_x86_feature_detected!("avx2"),
+        }
+    }
+}
+
+struct WideRowGbrLimitedProcessor<T> {
+    _phantom: PhantomData<T>,
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    _use_sse: bool,
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    _use_avx: bool,
+    #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
+    _use_rdm: bool,
+}
+
+impl<T> Default for WideRowGbrLimitedProcessor<T> {
+    fn default() -> Self {
+        WideRowGbrLimitedProcessor {
+            _phantom: PhantomData,
+            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+            _use_sse: std::arch::is_x86_feature_detected!("sse4.1"),
+            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+            _use_avx: std::arch::is_x86_feature_detected!("avx2"),
+            #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
+            _use_rdm: std::arch::is_aarch64_feature_detected!("rdm"),
+        }
+    }
+}
+
+trait FullRangeWideRow<V> {
+    fn handle_row<const CHANNELS: u8>(
+        &self,
+        g_plane: &[V],
+        b_plane: &[V],
+        r_plane: &[V],
+        rgba: &mut [V],
+        start_cx: usize,
+        width: usize,
+    ) -> usize;
+}
+
+trait LimitedRangeWideRow<V> {
+    fn handle_row<const CHANNELS: u8>(
+        &self,
+        g_plane: &[V],
+        b_plane: &[V],
+        r_plane: &[V],
+        rgba: &mut [V],
+        start_cx: usize,
+        width: usize,
+        y_bias: i32,
+        y_coeff: i32,
+    ) -> usize;
+}
+
+impl FullRangeWideRow<u8> for WideRowGbrProcessor<u8> {
+    fn handle_row<const CHANNELS: u8>(
+        &self,
+        _g_plane: &[u8],
+        _b_plane: &[u8],
+        _r_plane: &[u8],
+        _rgba: &mut [u8],
+        _start_cx: usize,
+        _width: usize,
+    ) -> usize {
+        let mut _cx = _start_cx;
+        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+        {
+            if self._use_avx {
+                _cx = avx_yuv_to_rgba_row_full::<CHANNELS>(
+                    _g_plane, _b_plane, _r_plane, _rgba, _cx, _width,
+                );
+            }
+            if self._use_sse {
+                _cx = sse_yuv_to_rgba_row_full::<CHANNELS>(
+                    _g_plane, _b_plane, _r_plane, _rgba, _cx, _width,
+                );
+            }
+        }
+        #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
+        {
+            _cx = yuv_to_rgba_row_full::<CHANNELS>(_g_plane, _b_plane, _r_plane, _rgba, _cx, _width);
+        }
+        _cx
+    }
+}
+
+impl FullRangeWideRow<u16> for WideRowGbrProcessor<u16> {
+    fn handle_row<const CHANNELS: u8>(
+        &self,
+        _g_plane: &[u16],
+        _b_plane: &[u16],
+        _r_plane: &[u16],
+        _rgba: &mut [u16],
+        _start_cx: usize,
+        _width: usize,
+    ) -> usize {
+        0
+    }
+}
+
+impl LimitedRangeWideRow<u8> for WideRowGbrLimitedProcessor<u8> {
+    fn handle_row<const CHANNELS: u8>(
+        &self,
+        _g_plane: &[u8],
+        _b_plane: &[u8],
+        _r_plane: &[u8],
+        _rgba: &mut [u8],
+        _start_cx: usize,
+        _width: usize,
+        _y_bias: i32,
+        _y_coeff: i32,
+    ) -> usize {
+        let mut _cx = _start_cx;
+        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+        {
+            if self._use_avx {
+                _cx = avx_yuv_to_rgba_row_limited::<CHANNELS>(
+                    _g_plane, _b_plane, _r_plane, _rgba, _cx, _width, _y_bias, _y_coeff,
+                );
+            }
+            if self._use_sse {
+                _cx = sse_yuv_to_rgba_row_limited::<CHANNELS>(
+                    _g_plane, _b_plane, _r_plane, _rgba, _cx, _width, _y_bias, _y_coeff,
+                );
+            }
+        }
+        #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
+        {
+            if self._use_rdm {
+                _cx = yuv_to_rgba_row_limited_rdm::<CHANNELS>(
+                    _g_plane, _b_plane, _r_plane, _rgba, _cx, _width, _y_bias, _y_coeff,
+                );
+            } else {
+                _cx = yuv_to_rgba_row_limited::<CHANNELS, 13>(
+                    _g_plane, _b_plane, _r_plane, _rgba, _cx, _width, _y_bias, _y_coeff,
+                );
+            }
+        }
+        _cx
+    }
+}
+
+impl LimitedRangeWideRow<u16> for WideRowGbrLimitedProcessor<u16> {
+    fn handle_row<const CHANNELS: u8>(
+        &self,
+        _g_plane: &[u16],
+        _b_plane: &[u16],
+        _r_plane: &[u16],
+        _rgba: &mut [u16],
+        _start_cx: usize,
+        _width: usize,
+        _y_bias: i32,
+        _y_coeff: i32,
+    ) -> usize {
+        0
+    }
+}
+
 #[inline]
 fn gbr_to_rgbx_impl<
     V: Copy + AsPrimitive<i32> + 'static + Sized + Debug + Send + Sync,
@@ -52,6 +233,8 @@ fn gbr_to_rgbx_impl<
 ) -> Result<(), YuvError>
 where
     i32: AsPrimitive<V>,
+    WideRowGbrProcessor<V>: FullRangeWideRow<V>,
+    WideRowGbrLimitedProcessor<V>: LimitedRangeWideRow<V>,
 {
     let destination_channels: YuvSourceChannels = CHANNELS.into();
     let channels = destination_channels.get_channels_count();
@@ -115,41 +298,65 @@ where
             let y_bias = range.bias_y as i32;
 
             let iter = y_iter.zip(u_iter).zip(v_iter).zip(rgb_iter);
+
+            let wide_handler = WideRowGbrLimitedProcessor::<V>::default();
+
             iter.for_each(|(((y_src, u_src), v_src), rgb)| {
                 let y_src = &y_src[0..image.width as usize];
 
+                let cx = wide_handler.handle_row::<CHANNELS>(
+                    y_src,
+                    u_src,
+                    v_src,
+                    rgb,
+                    0,
+                    image.width as usize,
+                    y_bias,
+                    y_coef,
+                );
+
                 let rgb_chunks = rgb.chunks_exact_mut(channels);
 
                 for (((&y_src, &u_src), &v_src), rgb_dst) in
-                    y_src.iter().zip(u_src).zip(v_src).zip(rgb_chunks)
+                    y_src.iter().zip(u_src).zip(v_src).zip(rgb_chunks).skip(cx)
                 {
-                    rgb_dst[0] =
+                    rgb_dst[destination_channels.get_r_channel_offset()] =
                         qrshr::<PRECISION, BIT_DEPTH>((v_src.as_() - y_bias) * y_coef).as_();
-                    rgb_dst[1] =
+                    rgb_dst[destination_channels.get_g_channel_offset()] =
                        qrshr::<PRECISION, BIT_DEPTH>((y_src.as_() - y_bias) * y_coef).as_();
-                    rgb_dst[2] =
+                    rgb_dst[destination_channels.get_b_channel_offset()] =
                        qrshr::<PRECISION, BIT_DEPTH>((u_src.as_() - y_bias) * y_coef).as_();
                     if channels == 4 {
-                        rgb_dst[3] = max_value.as_();
+                        rgb_dst[destination_channels.get_a_channel_offset()] = max_value.as_();
                     }
                 }
            });
        }
        YuvRange::Full => {
+            let wide_handler = WideRowGbrProcessor::<V>::default();
            let iter = y_iter.zip(u_iter).zip(v_iter).zip(rgb_iter);
            iter.for_each(|(((y_src, u_src), v_src), rgb)| {
                let y_src = &y_src[0..image.width as usize];
 
+                let cx = wide_handler.handle_row::<CHANNELS>(
+                    y_src,
+                    u_src,
+                    v_src,
+                    rgb,
+                    0,
+                    image.width as usize,
+                );
+
                let rgb_chunks = rgb.chunks_exact_mut(channels);
 
                for (((&y_src, &u_src), &v_src), rgb_dst) in
-                    y_src.iter().zip(u_src).zip(v_src).zip(rgb_chunks)
+                    y_src.iter().zip(u_src).zip(v_src).zip(rgb_chunks).skip(cx)
                {
-                    rgb_dst[0] = v_src;
-                    rgb_dst[1] = y_src;
-                    rgb_dst[2] = u_src;
+                    rgb_dst[destination_channels.get_r_channel_offset()] = v_src;
+                    rgb_dst[destination_channels.get_g_channel_offset()] = y_src;
+                    rgb_dst[destination_channels.get_b_channel_offset()] = u_src;
                    if channels == 4 {
-                        rgb_dst[3] = max_value.as_();
+                        rgb_dst[destination_channels.get_a_channel_offset()] = max_value.as_();
                    }
                }
            });
diff --git a/src/from_identity_alpha.rs b/src/from_identity_alpha.rs
index 0b0587a9..fca8ebf6 100644
--- a/src/from_identity_alpha.rs
+++ b/src/from_identity_alpha.rs
@@ -127,13 +127,13 @@ where
                 .zip(rgb_chunks)
                 .zip(a_src)
             {
-                rgb_dst[0] =
+                rgb_dst[destination_channels.get_r_channel_offset()] =
                     qrshr::<PRECISION, BIT_DEPTH>((v_src.as_() - y_bias) * y_coef).as_();
-                rgb_dst[1] =
+                rgb_dst[destination_channels.get_g_channel_offset()] =
                     qrshr::<PRECISION, BIT_DEPTH>((y_src.as_() - y_bias) * y_coef).as_();
-                rgb_dst[2] =
+                rgb_dst[destination_channels.get_b_channel_offset()] =
                     qrshr::<PRECISION, BIT_DEPTH>((u_src.as_() - y_bias) * y_coef).as_();
-                rgb_dst[3] = a_src;
+                rgb_dst[destination_channels.get_a_channel_offset()] = a_src;
             }
         });
     }
@@ -150,10 +150,10 @@ where
                 .zip(rgb_chunks)
                 .zip(a_src)
             {
-                rgb_dst[0] = v_src;
-                rgb_dst[1] = y_src;
-                rgb_dst[2] = u_src;
-                rgb_dst[3] = a_src;
+                rgb_dst[destination_channels.get_r_channel_offset()] = v_src;
+                rgb_dst[destination_channels.get_g_channel_offset()] = y_src;
+                rgb_dst[destination_channels.get_b_channel_offset()] = u_src;
+                rgb_dst[destination_channels.get_a_channel_offset()] = a_src;
             }
         });
     }
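The limited-range identity path above is a per-channel range expansion: subtract the luma bias, multiply by the Q13 stretch factor, then shift back with rounding. Worked numbers for 8-bit video range (bias 16, range 219), using y_coef = 9539 from the INVERSE_BT601_LIMITED_8_PREC13 table earlier in this diff:

    fn main() {
        let y_coef = 9539; // ≈ 255/219 in Q13
        let expand = |v: i32| (((v - 16) * y_coef + 4095) >> 13).clamp(0, 255);
        assert_eq!(expand(16), 0);    // video black -> full-range black
        assert_eq!(expand(235), 255); // video white -> full-range white
        assert_eq!(expand(126), 128); // mid grey lands on (almost) mid grey
    }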
diff --git a/src/neon/gbr_to_rgb.rs b/src/neon/gbr_to_rgb.rs
new file mode 100644
index 00000000..ac255642
--- /dev/null
+++ b/src/neon/gbr_to_rgb.rs
@@ -0,0 +1,270 @@
+/*
+ * Copyright (c) Radzivon Bartoshyk, 11/2024. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+use crate::neon::neon_simd_support::vmullnq_s16;
+use crate::yuv_support::YuvSourceChannels;
+use std::arch::aarch64::*;
+
+pub(crate) fn yuv_to_rgba_row_limited_rdm<const DESTINATION_CHANNELS: u8>(
+    g_plane: &[u8],
+    b_plane: &[u8],
+    r_plane: &[u8],
+    rgba: &mut [u8],
+    start_cx: usize,
+    width: usize,
+    y_bias: i32,
+    y_coeff: i32,
+) -> usize {
+    unsafe {
+        yuv_to_rgba_row_limited_impl_rdm::<DESTINATION_CHANNELS>(
+            g_plane, b_plane, r_plane, rgba, start_cx, width, y_bias, y_coeff,
+        )
+    }
+}
+
+#[target_feature(enable = "rdm")]
+unsafe fn yuv_to_rgba_row_limited_impl_rdm<const DESTINATION_CHANNELS: u8>(
+    g_plane: &[u8],
+    b_plane: &[u8],
+    r_plane: &[u8],
+    rgba: &mut [u8],
+    start_cx: usize,
+    width: usize,
+    y_bias: i32,
+    y_coeff: i32,
+) -> usize {
+    let mut cx = start_cx;
+
+    let destination_channels: YuvSourceChannels = DESTINATION_CHANNELS.into();
+
+    let v_alpha = vdupq_n_u8(255u8);
+
+    const V_SCALE: i32 = 2;
+
+    let vy_coeff = vdupq_n_s16(y_coeff as i16);
+    let vy_bias = vdupq_n_u8(y_bias as u8);
+
+    while cx + 16 < width {
+        let g_values0 = vqsubq_u8(
+            vld1q_u8(g_plane.get_unchecked(cx..).as_ptr() as *const _),
+            vy_bias,
+        );
+        let b_values0 = vqsubq_u8(
+            vld1q_u8(b_plane.get_unchecked(cx..).as_ptr() as *const _),
+            vy_bias,
+        );
+        let r_values0 = vqsubq_u8(
+            vld1q_u8(r_plane.get_unchecked(cx..).as_ptr() as *const _),
+            vy_bias,
+        );
+
+        let rl_lo = vqrdmulhq_s16(
+            vreinterpretq_s16_u16(vshll_n_u8::<V_SCALE>(vget_low_u8(r_values0))),
+            vy_coeff,
+        );
+        let gl_lo = vqrdmulhq_s16(
+            vreinterpretq_s16_u16(vshll_n_u8::<V_SCALE>(vget_low_u8(g_values0))),
+            vy_coeff,
+        );
+        let bl_lo = vqrdmulhq_s16(
+            vreinterpretq_s16_u16(vshll_n_u8::<V_SCALE>(vget_low_u8(b_values0))),
+            vy_coeff,
+        );
+
+        let rl_hi = vqrdmulhq_s16(
+            vreinterpretq_s16_u16(vshll_high_n_u8::<V_SCALE>(r_values0)),
+            vy_coeff,
+        );
+        let gl_hi = vqrdmulhq_s16(
+            vreinterpretq_s16_u16(vshll_high_n_u8::<V_SCALE>(g_values0)),
+            vy_coeff,
+        );
+        let bl_hi = vqrdmulhq_s16(
+            vreinterpretq_s16_u16(vshll_high_n_u8::<V_SCALE>(b_values0)),
+            vy_coeff,
+        );
+
+        let r_values = vcombine_u8(vqmovun_s16(rl_lo), vqmovun_s16(rl_hi));
+        let g_values = vcombine_u8(vqmovun_s16(gl_lo), vqmovun_s16(gl_hi));
+        let b_values = vcombine_u8(vqmovun_s16(bl_lo), vqmovun_s16(bl_hi));
+
+        let dst_shift = cx * destination_channels.get_channels_count();
+        let rgba_ptr = rgba.get_unchecked_mut(dst_shift..);
+
+        match destination_channels {
+            YuvSourceChannels::Rgb => {
+                let dst_pack: uint8x16x3_t = uint8x16x3_t(r_values, g_values, b_values);
+                vst3q_u8(rgba_ptr.as_mut_ptr(), dst_pack);
+            }
+            YuvSourceChannels::Bgr => {
+                let dst_pack: uint8x16x3_t = uint8x16x3_t(b_values, g_values, r_values);
+                vst3q_u8(rgba_ptr.as_mut_ptr(), dst_pack);
+            }
+            YuvSourceChannels::Rgba => {
+                let dst_pack: uint8x16x4_t = uint8x16x4_t(r_values, g_values, b_values, v_alpha);
+                vst4q_u8(rgba_ptr.as_mut_ptr(), dst_pack);
+            }
+            YuvSourceChannels::Bgra => {
+                let dst_pack: uint8x16x4_t = uint8x16x4_t(b_values, g_values, r_values, v_alpha);
+                vst4q_u8(rgba_ptr.as_mut_ptr(), dst_pack);
+            }
+        }
+
+        cx += 16;
+    }
+
+    cx
+}
+
+pub(crate) fn yuv_to_rgba_row_limited<const DESTINATION_CHANNELS: u8, const PRECISION: i32>(
+    g_plane: &[u8],
+    b_plane: &[u8],
+    r_plane: &[u8],
+    rgba: &mut [u8],
+    start_cx: usize,
+    width: usize,
+    y_bias: i32,
+    y_coeff: i32,
+) -> usize {
+    unsafe {
+        let mut cx = start_cx;
+
+        let destination_channels: YuvSourceChannels = DESTINATION_CHANNELS.into();
+
+        let v_alpha = vdupq_n_u8(255u8);
+
+        let vy_coeff = vdupq_n_u16(y_coeff as u16);
+        let vy_bias = vdupq_n_u8(y_bias as u8);
+
+        while cx + 16 < width {
+            let g_values0 = vqsubq_u8(
+                vld1q_u8(g_plane.get_unchecked(cx..).as_ptr() as *const _),
+                vy_bias,
+            );
+            let b_values0 = vqsubq_u8(
+                vld1q_u8(b_plane.get_unchecked(cx..).as_ptr() as *const _),
+                vy_bias,
+            );
+            let r_values0 = vqsubq_u8(
+                vld1q_u8(r_plane.get_unchecked(cx..).as_ptr() as *const _),
+                vy_bias,
+            );
+
+            let rl_lo = vmullnq_s16::<PRECISION>(vmovl_u8(vget_low_u8(r_values0)), vy_coeff);
+            let gl_lo = vmullnq_s16::<PRECISION>(vmovl_u8(vget_low_u8(g_values0)), vy_coeff);
+            let bl_lo = vmullnq_s16::<PRECISION>(vmovl_u8(vget_low_u8(b_values0)), vy_coeff);
+
+            let rl_hi = vmullnq_s16::<PRECISION>(vmovl_high_u8(r_values0), vy_coeff);
+            let gl_hi = vmullnq_s16::<PRECISION>(vmovl_high_u8(g_values0), vy_coeff);
+            let bl_hi = vmullnq_s16::<PRECISION>(vmovl_high_u8(b_values0), vy_coeff);
+
+            let r_values = vcombine_u8(vqmovn_u16(rl_lo), vqmovn_u16(rl_hi));
+            let g_values = vcombine_u8(vqmovn_u16(gl_lo), vqmovn_u16(gl_hi));
+            let b_values = vcombine_u8(vqmovn_u16(bl_lo), vqmovn_u16(bl_hi));
+
+            let dst_shift = cx * destination_channels.get_channels_count();
+            let rgba_ptr = rgba.get_unchecked_mut(dst_shift..);
+
+            match destination_channels {
+                YuvSourceChannels::Rgb => {
+                    let dst_pack: uint8x16x3_t = uint8x16x3_t(r_values, g_values, b_values);
+                    vst3q_u8(rgba_ptr.as_mut_ptr(), dst_pack);
+                }
+                YuvSourceChannels::Bgr => {
+                    let dst_pack: uint8x16x3_t = uint8x16x3_t(b_values, g_values, r_values);
+                    vst3q_u8(rgba_ptr.as_mut_ptr(), dst_pack);
+                }
+                YuvSourceChannels::Rgba => {
+                    let dst_pack: uint8x16x4_t =
+                        uint8x16x4_t(r_values, g_values, b_values, v_alpha);
+                    vst4q_u8(rgba_ptr.as_mut_ptr(), dst_pack);
+                }
+                YuvSourceChannels::Bgra => {
+                    let dst_pack: uint8x16x4_t =
+                        uint8x16x4_t(b_values, g_values, r_values, v_alpha);
+                    vst4q_u8(rgba_ptr.as_mut_ptr(), dst_pack);
+                }
+            }
+
+            cx += 16;
+        }
+
+        cx
+    }
+}
+
+pub(crate) fn yuv_to_rgba_row_full<const DESTINATION_CHANNELS: u8>(
+    g_plane: &[u8],
+    b_plane: &[u8],
+    r_plane: &[u8],
+    rgba: &mut [u8],
+    start_cx: usize,
+    width: usize,
+) -> usize {
+    unsafe {
+        let mut cx = start_cx;
+
+        let destination_channels: YuvSourceChannels = DESTINATION_CHANNELS.into();
+
+        let v_alpha = vdupq_n_u8(255u8);
+
+        while cx + 16 < width {
+            let g_values = vld1q_u8(g_plane.get_unchecked(cx..).as_ptr() as *const _);
+            let b_values = vld1q_u8(b_plane.get_unchecked(cx..).as_ptr() as *const _);
+            let r_values = vld1q_u8(r_plane.get_unchecked(cx..).as_ptr() as *const _);
+
+            let dst_shift = cx * destination_channels.get_channels_count();
+            let rgba_ptr = rgba.get_unchecked_mut(dst_shift..);
+
+            match destination_channels {
+                YuvSourceChannels::Rgb => {
+                    let dst_pack: uint8x16x3_t = uint8x16x3_t(r_values, g_values, b_values);
+                    vst3q_u8(rgba_ptr.as_mut_ptr(), dst_pack);
+                }
+                YuvSourceChannels::Bgr => {
+                    let dst_pack: uint8x16x3_t = uint8x16x3_t(b_values, g_values, r_values);
+                    vst3q_u8(rgba_ptr.as_mut_ptr(), dst_pack);
+                }
+                YuvSourceChannels::Rgba => {
+                    let dst_pack: uint8x16x4_t =
+                        uint8x16x4_t(r_values, g_values, b_values, v_alpha);
+                    vst4q_u8(rgba_ptr.as_mut_ptr(), dst_pack);
+                }
+                YuvSourceChannels::Bgra => {
+                    let dst_pack: uint8x16x4_t =
+                        uint8x16x4_t(b_values, g_values, r_values, v_alpha);
+                    vst4q_u8(rgba_ptr.as_mut_ptr(), dst_pack);
+                }
+            }
+
+            cx += 16;
+        }
+
+        cx
+    }
+}
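A note on the fixed-point trick in the RDM path above: vqrdmulhq_s16(a, b) is a rounding Q15 multiply, roughly (2*a*b + (1 << 15)) >> 16 per lane. Widening each 8-bit sample with vshll_n_u8::<V_SCALE> pre-scales it by 1 << 2, so the net effect is a rounded multiply with a 13-bit shift, matching the Q13 precision of the coefficient tables. In scalar form (illustrative values only):

    fn main() {
        let x: i32 = 219;       // sample with the bias already subtracted
        let y_coef: i32 = 9539; // Q13 limited-range stretch
        let a = x << 2;         // what vshll_n_u8::<2> does per lane
        let rdm = (2 * a * y_coef + (1 << 15)) >> 16; // one vqrdmulhq_s16 lane
        let q13 = (x * y_coef + (1 << 12)) >> 13;     // plain Q13 rounding multiply
        assert_eq!(rdm, q13); // both give 255: the formulations agree exactly
    }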
diff --git a/src/neon/mod.rs b/src/neon/mod.rs
index 5cfd22be..3fd5bdb0 100644
--- a/src/neon/mod.rs
+++ b/src/neon/mod.rs
@@ -27,6 +27,7 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 #![deny(unreachable_code, unreachable_pub)]
+mod gbr_to_rgb;
 mod neon_simd_support;
 mod neon_ycgco;
 mod neon_ycgco_r;
@@ -39,6 +40,7 @@ mod rgba_to_yuv;
 mod rgba_to_yuv420;
 mod y_p16_to_rgba16;
 mod y_to_rgb;
+mod y_to_rgb_alpha;
 mod ycgco_to_rgb;
 mod ycgco_to_rgb_alpha;
 mod ycgcor_to_rgb;
@@ -57,7 +59,10 @@ mod yuv_to_yuy2;
 mod yuy2_to_rgb;
 mod yuy2_to_yuv;
 
-pub(crate) use rgb_to_y::neon_rgb_to_y_row;
+pub(crate) use gbr_to_rgb::{
+    yuv_to_rgba_row_full, yuv_to_rgba_row_limited, yuv_to_rgba_row_limited_rdm,
+};
+pub(crate) use rgb_to_y::{neon_rgb_to_y_rdm, neon_rgb_to_y_row};
 pub(crate) use rgb_to_ycgco::neon_rgb_to_ycgco_row;
 pub(crate) use rgb_to_ycgco_r::neon_rgb_to_ycgcor_row;
 pub(crate) use rgb_to_yuv_p16::{neon_rgba_to_yuv_p16, neon_rgba_to_yuv_p16_rdm};
@@ -66,6 +71,7 @@ pub(crate) use rgba_to_yuv::{neon_rgba_to_yuv, neon_rgba_to_yuv_rdm};
 pub(crate) use rgba_to_yuv420::{neon_rgba_to_yuv420, neon_rgba_to_yuv_rdm420};
 pub(crate) use y_p16_to_rgba16::{neon_y_p16_to_rgba16_rdm, neon_y_p16_to_rgba16_row};
 pub(crate) use y_to_rgb::{neon_y_to_rgb_row, neon_y_to_rgb_row_rdm};
+pub(crate) use y_to_rgb_alpha::{neon_y_to_rgb_alpha_row, neon_y_to_rgb_row_alpha_rdm};
 pub(crate) use ycgco_to_rgb::neon_ycgco_to_rgb_row;
 pub(crate) use ycgco_to_rgb_alpha::neon_ycgco_to_rgb_alpha_row;
 pub(crate) use ycgcor_to_rgb::neon_ycgcor_to_rgb_row;
diff --git a/src/neon/neon_simd_support.rs b/src/neon/neon_simd_support.rs
index 0d79deb8..e07a8a74 100644
--- a/src/neon/neon_simd_support.rs
+++ b/src/neon/neon_simd_support.rs
@@ -94,6 +94,13 @@ pub(crate) unsafe fn vmullq_laneq_s16(
     )
 }
 
+#[inline(always)]
+pub(crate) unsafe fn vmullnq_s16<const PRECISION: i32>(v: uint16x8_t, q: uint16x8_t) -> uint16x8_t {
+    let hi = vmull_high_u16(q, v);
+    let lo = vmull_u16(vget_low_u16(q), vget_low_u16(v));
+    vcombine_u16(vrshrn_n_u32::<PRECISION>(lo), vrshrn_n_u32::<PRECISION>(hi))
+}
+
 #[inline(always)]
 pub(crate) unsafe fn neon_div_by_255(v: uint16x8_t) -> uint8x8_t {
     let addition = vdupq_n_u16(127);
diff --git a/src/neon/rgb_to_y.rs b/src/neon/rgb_to_y.rs
index 5232264d..26a309be 100644
--- a/src/neon/rgb_to_y.rs
+++ b/src/neon/rgb_to_y.rs
@@ -31,7 +31,7 @@ use crate::yuv_support::{CbCrForwardTransform, YuvChromaRange, YuvSourceChannels
 use std::arch::aarch64::*;
 
 #[target_feature(enable = "rdm")]
-pub(crate) unsafe fn neon_rgb_to_y_row<const ORIGIN_CHANNELS: u8>(
+pub(crate) unsafe fn neon_rgb_to_y_rdm<const ORIGIN_CHANNELS: u8>(
     transform: &CbCrForwardTransform<i32>,
     range: &YuvChromaRange,
     y_plane: *mut u8,
@@ -51,7 +51,6 @@ pub(crate) unsafe fn neon_rgb_to_y_row<const ORIGIN_CHANNELS: u8>(
     let v_yr = vdupq_n_s16(transform.yr as i16);
     let v_yg = vdupq_n_s16(transform.yg as i16);
     let v_yb = vdupq_n_s16(transform.yb as i16);
-    let v_zeros = vdupq_n_s16(0i16);
 
     let i_bias_y = vdupq_n_s16(range.bias_y as i16);
     let i_cap_y = vdupq_n_u16(range.range_y as u16 + range.bias_y as u16);
@@ -96,7 +95,6 @@ pub(crate) unsafe fn neon_rgb_to_y_row<const ORIGIN_CHANNELS: u8>(
         let mut y_high = vqrdmlahq_s16(y_bias, r_high, v_yr);
         y_high = vqrdmlahq_s16(y_high, g_high, v_yg);
         y_high = vqrdmlahq_s16(y_high, b_high, v_yb);
-        y_high = vmaxq_s16(y_high, v_zeros);
 
         let y_high = vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(y_high, i_bias_y)), i_cap_y);
 
@@ -107,7 +105,6 @@ pub(crate) unsafe fn neon_rgb_to_y_row<const ORIGIN_CHANNELS: u8>(
         let mut y_low = vqrdmlahq_s16(y_bias, r_low, v_yr);
         y_low = vqrdmlahq_s16(y_low, g_low, v_yg);
         y_low = vqrdmlahq_s16(y_low, b_low, v_yb);
-        y_low = vmaxq_s16(y_low, v_zeros);
 
         let y_low = vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(y_low, i_bias_y)), i_cap_y);
 
@@ -119,3 +116,131 @@
 
     cx
 }
+
+pub(crate) unsafe fn neon_rgb_to_y_row<const ORIGIN_CHANNELS: u8, const PRECISION: i32>(
+    transform: &CbCrForwardTransform<i32>,
+    range: &YuvChromaRange,
+    y_plane: *mut u8,
+    rgba: &[u8],
+    start_cx: usize,
+    width: usize,
+) -> usize {
+    let source_channels: YuvSourceChannels = ORIGIN_CHANNELS.into();
+    let channels = source_channels.get_channels_count();
+
+    let bias_y = range.bias_y as i32 * (1 << PRECISION) + ((1 << (PRECISION - 1)) - 1);
+
+    let y_ptr = y_plane;
+    let rgba_ptr = rgba.as_ptr();
+
+    let y_bias = vdupq_n_s32(bias_y);
+    let weights_arr: [i16; 8] = [
+        transform.yr as i16,
+        transform.yg as i16,
+        transform.yb as i16,
+        transform.cb_r as i16,
+        transform.cb_g as i16,
+        transform.cb_b as i16,
+        transform.cr_r as i16,
+        transform.cr_g as i16,
+    ];
+    let v_weights = vld1q_s16(weights_arr.as_ptr());
+
+    let i_bias_y = vdupq_n_s16(range.bias_y as i16);
+    let i_cap_y = vdupq_n_u16(range.range_y as u16 + range.bias_y as u16);
+
+    let mut cx = start_cx;
+    while cx + 16 < width {
+        let r_values_u8: uint8x16_t;
+        let g_values_u8: uint8x16_t;
+        let b_values_u8: uint8x16_t;
+
+        match source_channels {
+            YuvSourceChannels::Rgb | YuvSourceChannels::Bgr => {
+                let rgb_values = vld3q_u8(rgba_ptr.add(cx * channels));
+                if source_channels == YuvSourceChannels::Rgb {
+                    r_values_u8 = rgb_values.0;
+                    g_values_u8 = rgb_values.1;
+                    b_values_u8 = rgb_values.2;
+                } else {
+                    r_values_u8 = rgb_values.2;
+                    g_values_u8 = rgb_values.1;
+                    b_values_u8 = rgb_values.0;
+                }
+            }
+            YuvSourceChannels::Rgba => {
+                let rgb_values = vld4q_u8(rgba_ptr.add(cx * channels));
+                r_values_u8 = rgb_values.0;
+                g_values_u8 = rgb_values.1;
+                b_values_u8 = rgb_values.2;
+            }
+            YuvSourceChannels::Bgra => {
+                let rgb_values = vld4q_u8(rgba_ptr.add(cx * channels));
+                r_values_u8 = rgb_values.2;
+                g_values_u8 = rgb_values.1;
+                b_values_u8 = rgb_values.0;
+            }
+        }
+
+        let r_high = vreinterpretq_s16_u16(vmovl_high_u8(r_values_u8));
+        let g_high = vreinterpretq_s16_u16(vmovl_high_u8(g_values_u8));
+        let b_high = vreinterpretq_s16_u16(vmovl_high_u8(b_values_u8));
+
+        let r_h_low = vget_low_s16(r_high);
+        let g_h_low = vget_low_s16(g_high);
+        let b_h_low = vget_low_s16(b_high);
+
+        let mut y_h_high = vmlal_high_laneq_s16::<0>(y_bias, r_high, v_weights);
+        y_h_high = vmlal_high_laneq_s16::<1>(y_h_high, g_high, v_weights);
+        y_h_high = vmlal_high_laneq_s16::<2>(y_h_high, b_high, v_weights);
+
+        let mut y_h_low = vmlal_laneq_s16::<0>(y_bias, r_h_low, v_weights);
+        y_h_low = vmlal_laneq_s16::<1>(y_h_low, g_h_low, v_weights);
+        y_h_low = vmlal_laneq_s16::<2>(y_h_low, b_h_low, v_weights);
+
+        let y_high = vminq_u16(
+            vreinterpretq_u16_s16(vmaxq_s16(
+                vcombine_s16(
+                    vshrn_n_s32::<PRECISION>(y_h_low),
+                    vshrn_n_s32::<PRECISION>(y_h_high),
+                ),
+                i_bias_y,
+            )),
+            i_cap_y,
+        );
+
+        let r_low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(r_values_u8)));
+        let g_low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(g_values_u8)));
+        let b_low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b_values_u8)));
+
+        let r_l_low = vget_low_s16(r_low);
+        let g_l_low = vget_low_s16(g_low);
+        let b_l_low = vget_low_s16(b_low);
+
+        let mut y_l_high = vmlal_high_laneq_s16::<0>(y_bias, r_low, v_weights);
+        y_l_high = vmlal_high_laneq_s16::<1>(y_l_high, g_low, v_weights);
+        y_l_high = vmlal_high_laneq_s16::<2>(y_l_high, b_low, v_weights);
+
+        let mut y_l_low = vmlal_laneq_s16::<0>(y_bias, r_l_low, v_weights);
+        y_l_low = vmlal_laneq_s16::<1>(y_l_low, g_l_low, v_weights);
+        y_l_low = vmlal_laneq_s16::<2>(y_l_low, b_l_low, v_weights);
+
+        let y_low = vminq_u16(
+            vreinterpretq_u16_s16(vmaxq_s16(
+                vcombine_s16(
+                    vshrn_n_s32::<PRECISION>(y_l_low),
+                    vshrn_n_s32::<PRECISION>(y_l_high),
+                ),
+                i_bias_y,
+            )),
+            i_cap_y,
+        );
+
+        let y = vcombine_u8(vmovn_u16(y_low), vmovn_u16(y_high));
+        vst1q_u8(y_ptr.add(cx), y);
+
+        cx += 16;
+    }
+
+    cx
+}
diff --git a/src/neon/rgb_to_yuv_p16.rs b/src/neon/rgb_to_yuv_p16.rs
index 33192f63..ad2dec18 100644
--- a/src/neon/rgb_to_yuv_p16.rs
+++ b/src/neon/rgb_to_yuv_p16.rs
@@ -58,7 +58,7 @@ pub(crate) unsafe fn neon_rgba_to_yuv_p16<
     let bytes_position: YuvBytesPacking = BYTES_POSITION.into();
     let channels = source_channels.get_channels_count();
 
-    let rounding_const_bias: i32 = 1 << (PRECISION - 1);
+    let rounding_const_bias: i32 = (1 << (PRECISION - 1)) - 1;
     let bias_y = range.bias_y as i32 * (1 << PRECISION) + rounding_const_bias;
     let bias_uv = range.bias_uv as i32 * (1 << PRECISION) + rounding_const_bias;
diff --git a/src/neon/rgba_to_nv.rs b/src/neon/rgba_to_nv.rs
index 536c8319..82b0cba1 100644
--- a/src/neon/rgba_to_nv.rs
+++ b/src/neon/rgba_to_nv.rs
@@ -256,7 +256,7 @@ pub(crate) unsafe fn neon_rgbx_to_nv_row<
     let chroma_subsampling: YuvChromaSubsampling = SAMPLING.into();
     let source_channels: YuvSourceChannels = ORIGIN_CHANNELS.into();
     let channels = source_channels.get_channels_count();
-    let rounding_const_bias: i32 = 1 << (PRECISION - 1);
+    let rounding_const_bias: i32 = (1 << (PRECISION - 1)) - 1;
     let bias_y = range.bias_y as i32 * (1 << PRECISION) + rounding_const_bias;
     let bias_uv = range.bias_uv as i32 * (1 << PRECISION) + rounding_const_bias;
diff --git a/src/neon/rgba_to_yuv.rs b/src/neon/rgba_to_yuv.rs
index 4e434a1d..bc8e5530 100644
--- a/src/neon/rgba_to_yuv.rs
+++ b/src/neon/rgba_to_yuv.rs
@@ -240,7 +240,7 @@ pub(crate) unsafe fn neon_rgba_to_yuv<
     let source_channels: YuvSourceChannels = ORIGIN_CHANNELS.into();
     let channels = source_channels.get_channels_count();
 
-    let rounding_const_bias: i32 = 1 << (PRECISION - 1);
+    let rounding_const_bias: i32 = (1 << (PRECISION - 1)) - 1;
     let bias_y = range.bias_y as i32 * (1 << PRECISION) + rounding_const_bias;
     let bias_uv = range.bias_uv as i32 * (1 << PRECISION) + rounding_const_bias;
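The new non-RDM luma row above packs all eight forward coefficients into a single int16x8_t and selects them per multiply with vmlal_laneq_s16, instead of keeping one vdupq_n_s16 register per coefficient. A minimal standalone illustration of the same laneq pattern (aarch64 only; the weight values are the BT.601 full-range luma row from the tables above, the rest is arbitrary):

    #[cfg(target_arch = "aarch64")]
    fn main() {
        use std::arch::aarch64::*;
        unsafe {
            let weights: [i16; 8] = [2449, 4809, 934, 0, 0, 0, 0, 0];
            let v_weights = vld1q_s16(weights.as_ptr());
            let r = vdup_n_s16(100); // four identical "red" samples
            // acc += r * weights[LANE], with the lane chosen at compile time.
            let acc = vmlal_laneq_s16::<0>(vdupq_n_s32(0), r, v_weights);
            assert_eq!(vgetq_lane_s32::<0>(acc), 100 * 2449);
        }
    }

    #[cfg(not(target_arch = "aarch64"))]
    fn main() {}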
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +use crate::neon::neon_simd_support::vmullq_laneq_s16; +use crate::yuv_support::{CbCrInverseTransform, YuvChromaRange, YuvSourceChannels}; +use std::arch::aarch64::*; + +#[target_feature(enable = "rdm")] +pub(crate) unsafe fn neon_y_to_rgb_row_alpha_rdm( + range: &YuvChromaRange, + transform: &CbCrInverseTransform, + y_plane: &[u8], + a_plane: &[u8], + rgba: &mut [u8], + start_cx: usize, + width: usize, +) -> usize { + let destination_channels: YuvSourceChannels = DESTINATION_CHANNELS.into(); + assert!( + destination_channels == YuvSourceChannels::Rgba + || destination_channels == YuvSourceChannels::Bgra + ); + let channels = destination_channels.get_channels_count(); + + let y_ptr = y_plane.as_ptr(); + let rgba_ptr = rgba.as_mut_ptr(); + + let y_corr = vdupq_n_u8(range.bias_y as u8); + + let mut cx = start_cx; + + const V_SCALE: i32 = 2; + + while cx + 16 < width { + let y_values = vqsubq_u8(vld1q_u8(y_ptr.add(cx)), y_corr); + + let y_high = vqrdmulhq_n_s16( + vreinterpretq_s16_u16(vshll_high_n_u8::(y_values)), + transform.y_coef as i16, + ); + + let r_high = vqmovun_s16(y_high); + + let y_low = vqrdmulhq_n_s16( + vreinterpretq_s16_u16(vshll_n_u8::(vget_low_u8(y_values))), + transform.y_coef as i16, + ); + + let r_low = vqmovun_s16(y_low); + + let r_values = vcombine_u8(r_low, r_high); + + let dst_shift = cx * channels; + + let a_vals = vld1q_u8(a_plane.get_unchecked(cx..).as_ptr()); + + match destination_channels { + YuvSourceChannels::Rgb | YuvSourceChannels::Bgr => { + unreachable!(); + } + YuvSourceChannels::Rgba => { + let dst_pack: uint8x16x4_t = uint8x16x4_t(r_values, r_values, r_values, a_vals); + vst4q_u8(rgba_ptr.add(dst_shift), dst_pack); + } + YuvSourceChannels::Bgra => { + let dst_pack: uint8x16x4_t = uint8x16x4_t(r_values, r_values, r_values, a_vals); + vst4q_u8(rgba_ptr.add(dst_shift), dst_pack); + } + } + + cx += 16; + } + + cx +} + +pub(crate) unsafe fn neon_y_to_rgb_alpha_row< + const PRECISION: i32, + const DESTINATION_CHANNELS: u8, +>( + range: &YuvChromaRange, + transform: &CbCrInverseTransform, + y_plane: &[u8], + a_plane: &[u8], + rgba: &mut [u8], + start_cx: usize, + width: usize, +) -> usize { + let destination_channels: YuvSourceChannels = DESTINATION_CHANNELS.into(); + assert!( + destination_channels == YuvSourceChannels::Rgba + || destination_channels == YuvSourceChannels::Bgra + ); + let channels = destination_channels.get_channels_count(); + + let y_ptr = y_plane.as_ptr(); + let rgba_ptr = rgba.as_mut_ptr(); + + let y_corr = vdupq_n_u8(range.bias_y as u8); + let v_luma_coeff = vdupq_n_s16(transform.y_coef as i16); + + let mut cx = start_cx; + + while cx + 16 < width { + let y_values = vqsubq_u8(vld1q_u8(y_ptr.add(cx)), y_corr); + + let y_high = + vmullq_laneq_s16::<0>(vreinterpretq_s16_u16(vmovl_high_u8(y_values)), v_luma_coeff); + + let r_high = vqmovun_s16(vcombine_s16( + vrshrn_n_s32::(y_high.0), + vrshrn_n_s32::(y_high.1), + )); + + let y_low = vmullq_laneq_s16::<0>( + 
diff --git a/src/numerics.rs b/src/numerics.rs
index d1e18af7..9eacbd57 100644
--- a/src/numerics.rs
+++ b/src/numerics.rs
@@ -33,7 +33,7 @@ use std::ops::Shr;
 #[inline(always)]
 /// Saturating rounding shift right against bit depth
 pub(crate) fn qrshr<const PRECISION: i32, const BIT_DEPTH: usize>(val: i32) -> i32 {
-    let rounding: i32 = 1 << (PRECISION - 1);
+    let rounding: i32 = (1 << (PRECISION - 1)) - 1;
     let max_value: i32 = (1 << BIT_DEPTH) - 1;
     ((val + rounding) >> PRECISION).min(max_value).max(0)
 }
diff --git a/src/rgb_to_nv_p16.rs b/src/rgb_to_nv_p16.rs
index 95efebb4..0c30757c 100644
--- a/src/rgb_to_nv_p16.rs
+++ b/src/rgb_to_nv_p16.rs
@@ -87,7 +87,7 @@ fn rgbx_to_yuv_bi_planar_10_impl<
     let transform_precise =
         get_forward_transform(max_range, range.range_y, range.range_uv, kr_kb.kr, kr_kb.kb);
     let transform = transform_precise.to_integers(PRECISION as u32);
-    const ROUNDING_CONST_BIAS: i32 = 1 << (PRECISION - 1);
+    const ROUNDING_CONST_BIAS: i32 = (1 << (PRECISION - 1)) - 1;
     let bias_y = range.bias_y as i32 * (1 << PRECISION) + ROUNDING_CONST_BIAS;
     let bias_uv = range.bias_uv as i32 * (1 << PRECISION) + ROUNDING_CONST_BIAS;
diff --git a/src/rgb_to_y.rs b/src/rgb_to_y.rs
index 410275b4..0bc6e515 100644
--- a/src/rgb_to_y.rs
+++ b/src/rgb_to_y.rs
@@ -36,7 +36,7 @@ use crate::avx512bw::avx512_row_rgb_to_y;
 use crate::built_coefficients::get_built_forward_transform;
 use crate::images::YuvGrayImageMut;
 #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
-use crate::neon::neon_rgb_to_y_row;
+use crate::neon::{neon_rgb_to_y_rdm, neon_rgb_to_y_row};
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
 use crate::sse::sse_rgb_to_y;
 use crate::yuv_error::check_rgba_destination;
@@ -84,8 +84,7 @@ fn rgbx_to_y(
         );
         transform_precise.to_integers(PRECISION as u32)
     };
 
-    let precision_scale = (1 << PRECISION) as f32;
-    let bias_y = ((chroma_range.bias_y as f32 + 0.5f32) * precision_scale) as i32;
+    let bias_y = chroma_range.bias_y as i32 * (1 << PRECISION) + ((1 << (PRECISION - 1)) - 1);
     let i_bias_y = chroma_range.bias_y as i32;
     let i_cap_y = chroma_range.range_y as i32 + i_bias_y;
@@ -166,7 +165,16 @@ fn rgbx_to_y(
     #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
     unsafe {
         if is_rdm_available {
-            _cx = neon_rgb_to_y_row::<ORIGIN_CHANNELS>(
+            _cx = neon_rgb_to_y_rdm::<ORIGIN_CHANNELS>(
+                &transform,
+                &chroma_range,
+                y_plane.as_mut_ptr(),
+                rgba,
+                _cx,
+                gray_image.width as usize,
+            );
+        } else {
+            _cx = neon_rgb_to_y_row::<ORIGIN_CHANNELS>(
                 &transform,
                 &chroma_range,
                 y_plane.as_mut_ptr(),
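The rgb_to_y.rs hunk folds the range offset and the new rounding term into a single integer bias constant, matching the layout the other converters in this patch already use. A quick numeric check (illustration only), assuming 8-bit limited range (luma offset 16) and PRECISION = 13:

    // Illustration (not part of the patch): the folded luma bias constant
    // for 8-bit limited range, assuming bias_y = 16 and PRECISION = 13.
    fn main() {
        const PRECISION: i32 = 13;
        let rounding_const_bias = (1 << (PRECISION - 1)) - 1; // 4095
        let bias_y = 16 * (1 << PRECISION) + rounding_const_bias;
        assert_eq!(bias_y, 135_167);
        // For black input the weighted RGB sum is 0, so the final shift
        // recovers the range offset exactly:
        assert_eq!(bias_y >> PRECISION, 16);
    }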
diff --git a/src/rgb_to_yuv_p16.rs b/src/rgb_to_yuv_p16.rs
index 66032b94..37d75150 100644
--- a/src/rgb_to_yuv_p16.rs
+++ b/src/rgb_to_yuv_p16.rs
@@ -95,7 +95,7 @@ fn rgbx_to_yuv_ant<
     const PRECISION: i32 = 13;
     let transform = transform_precise.to_integers(PRECISION as u32);
-    const ROUNDING_CONST_BIAS: i32 = 1 << (PRECISION - 1);
+    const ROUNDING_CONST_BIAS: i32 = (1 << (PRECISION - 1)) - 1;
     let bias_y = range.bias_y as i32 * (1 << PRECISION) + ROUNDING_CONST_BIAS;
     let bias_uv = range.bias_uv as i32 * (1 << PRECISION) + ROUNDING_CONST_BIAS;
diff --git a/src/rgba_to_nv.rs b/src/rgba_to_nv.rs
index 27c706cd..ac057a06 100644
--- a/src/rgba_to_nv.rs
+++ b/src/rgba_to_nv.rs
@@ -77,7 +77,7 @@ fn rgbx_to_nv<
         transform_precise.to_integers(PRECISION as u32)
     };
-    const ROUNDING_CONST_BIAS: i32 = 1 << (PRECISION - 1);
+    const ROUNDING_CONST_BIAS: i32 = (1 << (PRECISION - 1)) - 1;
     let bias_y = chroma_range.bias_y as i32 * (1 << PRECISION) + ROUNDING_CONST_BIAS;
     let bias_uv = chroma_range.bias_uv as i32 * (1 << PRECISION) + ROUNDING_CONST_BIAS;
diff --git a/src/rgba_to_yuv.rs b/src/rgba_to_yuv.rs
index e071b1e2..13befbc1 100644
--- a/src/rgba_to_yuv.rs
+++ b/src/rgba_to_yuv.rs
@@ -84,7 +84,7 @@ fn rgbx_to_yuv8(
         transform_precise.to_integers(PRECISION as u32)
     };
-    const ROUNDING_CONST_BIAS: i32 = 1 << (PRECISION - 1);
+    const ROUNDING_CONST_BIAS: i32 = (1 << (PRECISION - 1)) - 1;
     let bias_y = chroma_range.bias_y as i32 * (1 << PRECISION) + ROUNDING_CONST_BIAS;
     let bias_uv = chroma_range.bias_uv as i32 * (1 << PRECISION) + ROUNDING_CONST_BIAS;
diff --git a/src/sharpyuv/sharp_rgba_to_yuv.rs b/src/sharpyuv/sharp_rgba_to_yuv.rs
index 34091407..608e93cd 100644
--- a/src/sharpyuv/sharp_rgba_to_yuv.rs
+++ b/src/sharpyuv/sharp_rgba_to_yuv.rs
@@ -54,7 +54,7 @@ fn sharpen_row420<
-    let rounding_const_bias: i32 = 1 << (PRECISION - 1);
+    let rounding_const_bias: i32 = (1 << (PRECISION - 1)) - 1;
diff --git a/src/sse/gbr_to_rgb.rs b/src/sse/gbr_to_rgb.rs
new file mode 100644
--- /dev/null
+++ b/src/sse/gbr_to_rgb.rs
+pub(crate) fn sse_yuv_to_rgba_row_full<const DESTINATION_CHANNELS: u8>(
+    g_plane: &[u8],
+    b_plane: &[u8],
+    r_plane: &[u8],
+    rgba: &mut [u8],
+    start_cx: usize,
+    width: usize,
+) -> usize {
+    unsafe {
+        sse_yuv_to_rgba_row_full_impl::<DESTINATION_CHANNELS>(
+            g_plane, b_plane, r_plane, rgba, start_cx, width,
+        )
+    }
+}
+
+#[target_feature(enable = "sse4.1")]
+unsafe fn sse_yuv_to_rgba_row_full_impl<const DESTINATION_CHANNELS: u8>(
+    g_plane: &[u8],
+    b_plane: &[u8],
+    r_plane: &[u8],
+    rgba: &mut [u8],
+    start_cx: usize,
+    width: usize,
+) -> usize {
+    let mut cx = start_cx;
+
+    let destination_channels: YuvSourceChannels = DESTINATION_CHANNELS.into();
+
+    let v_alpha = _mm_set1_epi8(255u8 as i8);
+
+    while cx + 16 < width {
+        let g_values = _mm_loadu_si128(g_plane.get_unchecked(cx..).as_ptr() as *const _);
+        let b_values = _mm_loadu_si128(b_plane.get_unchecked(cx..).as_ptr() as *const _);
+        let r_values = _mm_loadu_si128(r_plane.get_unchecked(cx..).as_ptr() as *const _);
+
+        let dst_shift = cx * destination_channels.get_channels_count();
+        let rgba_ptr = rgba.get_unchecked_mut(dst_shift..);
+
+        match destination_channels {
+            YuvSourceChannels::Rgb => {
+                sse_store_rgb_u8(rgba_ptr.as_mut_ptr(), r_values, g_values, b_values);
+            }
+            YuvSourceChannels::Bgr => {
+                sse_store_rgb_u8(rgba_ptr.as_mut_ptr(), b_values, g_values, r_values);
+            }
+            YuvSourceChannels::Rgba => {
+                sse_store_rgba(rgba_ptr.as_mut_ptr(), r_values, g_values, b_values, v_alpha);
+            }
+            YuvSourceChannels::Bgra => {
+                sse_store_rgba(rgba_ptr.as_mut_ptr(), b_values, g_values, r_values, v_alpha);
+            }
+        }
+
+        cx += 16;
+    }
+
+    cx
+}
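The full-range path above is a pure plane interleave: full-range GBR values already match RGB one-to-one, so no scaling is needed. A scalar sketch of the same mapping (illustrative helper, not part of the crate's API):

    // Scalar sketch of the full-range mapping: GBR planes interleave
    // straight into RGBA with alpha forced to 255.
    fn gbr_to_rgba_scalar(g: &[u8], b: &[u8], r: &[u8], rgba: &mut [u8]) {
        for (((g, b), r), dst) in g.iter().zip(b).zip(r).zip(rgba.chunks_exact_mut(4)) {
            dst[0] = *r;
            dst[1] = *g;
            dst[2] = *b;
            dst[3] = 255;
        }
    }

    fn main() {
        let (g, b, r) = ([10u8, 20], [30u8, 40], [50u8, 60]);
        let mut rgba = [0u8; 8];
        gbr_to_rgba_scalar(&g, &b, &r, &mut rgba);
        assert_eq!(rgba, [50, 10, 30, 255, 60, 20, 40, 255]);
    }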
+
+pub(crate) fn sse_yuv_to_rgba_row_limited<const DESTINATION_CHANNELS: u8>(
+    g_plane: &[u8],
+    b_plane: &[u8],
+    r_plane: &[u8],
+    rgba: &mut [u8],
+    start_cx: usize,
+    width: usize,
+    y_bias: i32,
+    y_coeff: i32,
+) -> usize {
+    unsafe {
+        sse_yuv_to_rgba_row_limited_impl::<DESTINATION_CHANNELS>(
+            g_plane, b_plane, r_plane, rgba, start_cx, width, y_bias, y_coeff,
+        )
+    }
+}
+
+#[target_feature(enable = "sse4.1")]
+unsafe fn sse_yuv_to_rgba_row_limited_impl<const DESTINATION_CHANNELS: u8>(
+    g_plane: &[u8],
+    b_plane: &[u8],
+    r_plane: &[u8],
+    rgba: &mut [u8],
+    start_cx: usize,
+    width: usize,
+    y_bias: i32,
+    y_coeff: i32,
+) -> usize {
+    let mut cx = start_cx;
+
+    let destination_channels: YuvSourceChannels = DESTINATION_CHANNELS.into();
+
+    let v_alpha = _mm_set1_epi8(255u8 as i8);
+
+    const V_SCALE: i32 = 2;
+
+    let vy_coeff = _mm_set1_epi16(y_coeff as i16);
+    let vy_bias = _mm_set1_epi8(y_bias as i8);
+
+    while cx + 16 < width {
+        let g_values0 = _mm_subs_epu8(
+            _mm_loadu_si128(g_plane.get_unchecked(cx..).as_ptr() as *const _),
+            vy_bias,
+        );
+        let b_values0 = _mm_subs_epu8(
+            _mm_loadu_si128(b_plane.get_unchecked(cx..).as_ptr() as *const _),
+            vy_bias,
+        );
+        let r_values0 = _mm_subs_epu8(
+            _mm_loadu_si128(r_plane.get_unchecked(cx..).as_ptr() as *const _),
+            vy_bias,
+        );
+
+        let rl_hi = _mm_mulhrs_epi16(
+            _mm_slli_epi16::<V_SCALE>(_mm_unpackhi_epi8(r_values0, _mm_setzero_si128())),
+            vy_coeff,
+        );
+        let gl_hi = _mm_mulhrs_epi16(
+            _mm_slli_epi16::<V_SCALE>(_mm_unpackhi_epi8(g_values0, _mm_setzero_si128())),
+            vy_coeff,
+        );
+        let bl_hi = _mm_mulhrs_epi16(
+            _mm_slli_epi16::<V_SCALE>(_mm_unpackhi_epi8(b_values0, _mm_setzero_si128())),
+            vy_coeff,
+        );
+
+        let rl_lo = _mm_mulhrs_epi16(
+            _mm_slli_epi16::<V_SCALE>(_mm_unpacklo_epi8(r_values0, _mm_setzero_si128())),
+            vy_coeff,
+        );
+        let gl_lo = _mm_mulhrs_epi16(
+            _mm_slli_epi16::<V_SCALE>(_mm_unpacklo_epi8(g_values0, _mm_setzero_si128())),
+            vy_coeff,
+        );
+        let bl_lo = _mm_mulhrs_epi16(
+            _mm_slli_epi16::<V_SCALE>(_mm_unpacklo_epi8(b_values0, _mm_setzero_si128())),
+            vy_coeff,
+        );
+
+        let r_values = _mm_packus_epi16(rl_lo, rl_hi);
+        let g_values = _mm_packus_epi16(gl_lo, gl_hi);
+        let b_values = _mm_packus_epi16(bl_lo, bl_hi);
+
+        let dst_shift = cx * destination_channels.get_channels_count();
+        let rgba_ptr = rgba.get_unchecked_mut(dst_shift..);
+
+        match destination_channels {
+            YuvSourceChannels::Rgb => {
+                sse_store_rgb_u8(rgba_ptr.as_mut_ptr(), r_values, g_values, b_values);
+            }
+            YuvSourceChannels::Bgr => {
+                sse_store_rgb_u8(rgba_ptr.as_mut_ptr(), b_values, g_values, r_values);
+            }
+            YuvSourceChannels::Rgba => {
+                sse_store_rgba(rgba_ptr.as_mut_ptr(), r_values, g_values, b_values, v_alpha);
+            }
+            YuvSourceChannels::Bgra => {
+                sse_store_rgba(rgba_ptr.as_mut_ptr(), b_values, g_values, r_values, v_alpha);
+            }
+        }
+
+        cx += 16;
+    }
+
+    cx
+}
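The limited-range path widens each bias-subtracted sample, pre-shifts by V_SCALE = 2, and rescales with `_mm_mulhrs_epi16` (note the high halves now go through `_mm_unpackhi_epi8`; both halves reading the low lane, as the garbled source had it, would duplicate pixels). A scalar model of that intrinsic and of the pre-shift trick; the Q13 coefficient value is an assumed example, roughly 255/219 in Q13:

    // Scalar model (illustration only) of `_mm_mulhrs_epi16` on one lane:
    // (a * b + 2^14) >> 15, a Q15 multiply with round-to-nearest.
    fn mulhrs(a: i16, b: i16) -> i16 {
        (((a as i32) * (b as i32) + (1 << 14)) >> 15) as i16
    }

    fn main() {
        const V_SCALE: i32 = 2; // same pre-shift as the SSE path above
        let y_coeff: i16 = 9539; // assumed Q13 scale, about 255/219 * 8192
        let v: i16 = 235 - 16; // brightest limited-range luma after bias subtraction
        // With the pre-shift, mulhrs computes (v * coeff + 2^12) >> 13:
        assert_eq!(mulhrs(v << V_SCALE, y_coeff), 255);
        assert_eq!(mulhrs(16 << V_SCALE, y_coeff), 19); // 16 * 255/219 is about 18.6
    }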
diff --git a/src/sse/mod.rs b/src/sse/mod.rs
index 0441f8a7..7a2f9240 100644
--- a/src/sse/mod.rs
+++ b/src/sse/mod.rs
@@ -27,6 +27,7 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 #![deny(unreachable_code, unreachable_pub)]
+mod gbr_to_rgb;
 mod rgb_to_nv;
 mod rgb_to_y;
 mod rgb_to_ycgco;
@@ -52,6 +53,7 @@ mod yuv_to_yuy2;
 mod yuy2_to_rgb;
 mod yuy2_to_yuv;
 
+pub(crate) use gbr_to_rgb::{sse_yuv_to_rgba_row_full, sse_yuv_to_rgba_row_limited};
 pub(crate) use rgb_to_nv::sse_rgba_to_nv_row;
 pub(crate) use rgb_to_y::sse_rgb_to_y;
 pub(crate) use rgb_to_ycgco::sse_rgb_to_ycgco_row;
diff --git a/src/sse/rgb_to_ycgco.rs b/src/sse/rgb_to_ycgco.rs
index fce2d1ab..36a3228b 100644
--- a/src/sse/rgb_to_ycgco.rs
+++ b/src/sse/rgb_to_ycgco.rs
@@ -66,7 +66,7 @@ pub(crate) unsafe fn sse_rgb_to_ycgco_row<
     let r = _mm_adds_epi16(b, co);
     let zeros = _mm_setzero_si128();
-    let rounding_const = _mm_set1_epi16(1 << 5);
+    let rounding_const = _mm_set1_epi16((1 << 5) - 1);
     (
         _mm_srai_epi16::<6>(_mm_adds_epi16(_mm_max_epi16(r, zeros), rounding_const)),
         _mm_srai_epi16::<6>(_mm_adds_epi16(_mm_max_epi16(g, zeros), rounding_const)),
diff --git a/src/sse/ycgco_to_rgb.rs b/src/sse/ycgco_to_rgb.rs
index 73347468..afbd364d 100644
--- a/src/sse/ycgco_to_rgb.rs
+++ b/src/sse/ycgco_to_rgb.rs
@@ -77,7 +77,7 @@ pub(crate) unsafe fn sse_ycgco_to_rgb_row<
     let v_g_coeff_1 = _mm_set1_epi16(-(transform.g_coeff_1 as i16));
     let v_g_coeff_2 = _mm_set1_epi16(-(transform.g_coeff_2 as i16));
     let v_alpha = _mm_set1_epi8(255u8 as i8);
-    let rounding_const = _mm_set1_epi16(1 << 5);
+    let rounding_const = _mm_set1_epi16((1 << 5) - 1);
     let zeros = _mm_setzero_si128();
diff --git a/src/to_identity.rs b/src/to_identity.rs
index d5f59b86..5184550a 100644
--- a/src/to_identity.rs
+++ b/src/to_identity.rs
@@ -101,12 +101,21 @@ where
                 .zip(v_dst.iter_mut())
                 .zip(rgb_chunks)
             {
-                *v_dst =
-                    qrshr::<PRECISION, BIT_DEPTH>(rgb_dst[0].as_() * y_coef + y_bias).as_();
-                *y_dst =
-                    qrshr::<PRECISION, BIT_DEPTH>(rgb_dst[1].as_() * y_coef + y_bias).as_();
-                *u_dst =
-                    qrshr::<PRECISION, BIT_DEPTH>(rgb_dst[2].as_() * y_coef + y_bias).as_();
+                *v_dst = qrshr::<PRECISION, BIT_DEPTH>(
+                    rgb_dst[destination_channels.get_r_channel_offset()].as_() * y_coef
+                        + y_bias,
+                )
+                .as_();
+                *y_dst = qrshr::<PRECISION, BIT_DEPTH>(
+                    rgb_dst[destination_channels.get_g_channel_offset()].as_() * y_coef
+                        + y_bias,
+                )
+                .as_();
+                *u_dst = qrshr::<PRECISION, BIT_DEPTH>(
+                    rgb_dst[destination_channels.get_b_channel_offset()].as_() * y_coef
+                        + y_bias,
+                )
+                .as_();
             }
         }
     }
@@ -120,9 +129,9 @@ where
             .zip(v_dst.iter_mut())
             .zip(rgb_chunks)
         {
-            *v_dst = rgb_dst[0];
-            *y_dst = rgb_dst[1];
-            *u_dst = rgb_dst[2];
+            *v_dst = rgb_dst[destination_channels.get_r_channel_offset()];
+            *y_dst = rgb_dst[destination_channels.get_g_channel_offset()];
+            *u_dst = rgb_dst[destination_channels.get_b_channel_offset()];
         }
     }
 }
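The to_identity.rs hunk replaces hard-coded `[0]`, `[1]`, `[2]` source indices with per-layout channel offsets, which matters as soon as the source is BGR-ordered. A self-contained sketch with assumed offset values; the real lookup lives in `YuvSourceChannels::get_r_channel_offset` and friends:

    // Illustration (not the crate's API): per-layout channel offsets,
    // with assumed values matching common RGBA/BGRA memory order.
    #[derive(Clone, Copy)]
    enum SourceChannels {
        Rgba,
        Bgra,
    }

    impl SourceChannels {
        fn r_offset(self) -> usize {
            match self {
                SourceChannels::Rgba => 0,
                SourceChannels::Bgra => 2,
            }
        }
        fn b_offset(self) -> usize {
            match self {
                SourceChannels::Rgba => 2,
                SourceChannels::Bgra => 0,
            }
        }
    }

    fn main() {
        let bgra_pixel = [0x10u8, 0x20, 0x30, 0xFF]; // B, G, R, A
        // Hard-coded `pixel[0]` would read blue here; the offset lookup
        // reads the actual red channel regardless of layout.
        assert_eq!(bgra_pixel[SourceChannels::Bgra.r_offset()], 0x30);
        assert_eq!(bgra_pixel[SourceChannels::Bgra.b_offset()], 0x10);
    }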
diff --git a/src/y_with_alpha_to_rgb.rs b/src/y_with_alpha_to_rgb.rs
index 906d46a4..940d3a0c 100644
--- a/src/y_with_alpha_to_rgb.rs
+++ b/src/y_with_alpha_to_rgb.rs
@@ -27,6 +27,8 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 use crate::built_coefficients::get_built_inverse_transform;
+#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
+use crate::neon::{neon_y_to_rgb_alpha_row, neon_y_to_rgb_row_alpha_rdm};
 use crate::numerics::qrshr;
 use crate::yuv_error::check_rgba_destination;
 use crate::yuv_support::*;
@@ -37,6 +39,80 @@ use rayon::iter::{IndexedParallelIterator, ParallelIterator};
 #[cfg(feature = "rayon")]
 use rayon::prelude::{ParallelSlice, ParallelSliceMut};
 use std::fmt::Debug;
+use std::marker::PhantomData;
+
+struct WideRowProcessor<V> {
+    _phantom: PhantomData<V>,
+    #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
+    _use_rdm: bool,
+}
+
+impl<V> Default for WideRowProcessor<V> {
+    fn default() -> Self {
+        WideRowProcessor {
+            _phantom: PhantomData,
+            #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
+            _use_rdm: std::arch::is_aarch64_feature_detected!("rdm"),
+        }
+    }
+}
+
+trait ProcessRowHandler<V> {
+    fn handle_row<const DESTINATION_CHANNELS: u8>(
+        &self,
+        range: &YuvChromaRange,
+        transform: &CbCrInverseTransform<i32>,
+        y_plane: &[V],
+        a_plane: &[V],
+        rgba: &mut [V],
+        start_cx: usize,
+        width: usize,
+    ) -> usize;
+}
+
+impl ProcessRowHandler<u16> for WideRowProcessor<u16> {
+    fn handle_row<const DESTINATION_CHANNELS: u8>(
+        &self,
+        _range: &YuvChromaRange,
+        _transform: &CbCrInverseTransform<i32>,
+        _y_plane: &[u16],
+        _a_plane: &[u16],
+        _rgba: &mut [u16],
+        _start_cx: usize,
+        _width: usize,
+    ) -> usize {
+        0
+    }
+}
+
+impl ProcessRowHandler<u8> for WideRowProcessor<u8> {
+    fn handle_row<const DESTINATION_CHANNELS: u8>(
+        &self,
+        _range: &YuvChromaRange,
+        _transform: &CbCrInverseTransform<i32>,
+        _y_plane: &[u8],
+        _a_plane: &[u8],
+        _rgba: &mut [u8],
+        _start_cx: usize,
+        _width: usize,
+    ) -> usize {
+        let mut _cx = _start_cx;
+        #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
+        unsafe {
+            let neon_wide_row_handler = if self._use_rdm {
+                neon_y_to_rgb_row_alpha_rdm::<DESTINATION_CHANNELS>
+            } else {
+                neon_y_to_rgb_alpha_row::<13, DESTINATION_CHANNELS>
+            };
+
+            let offset =
+                neon_wide_row_handler(_range, _transform, _y_plane, _a_plane, _rgba, _cx, _width);
+            _cx = offset;
+        }
+
+        _cx
+    }
+}
 
 // Chroma subsampling always assumed as 400
 #[inline]
@@ -53,6 +129,7 @@ fn y_with_alpha_to_rgbx<
 ) -> Result<(), YuvError>
 where
     i32: AsPrimitive<V>,
+    WideRowProcessor<V>: ProcessRowHandler<V>,
 {
     let destination_channels: YuvSourceChannels = DESTINATION_CHANNELS.into();
     let channels = destination_channels.get_channels_count();
@@ -109,14 +186,29 @@ where
     }
 
     if range == YuvRange::Limited {
+        let handler = WideRowProcessor::<V>::default();
         iter.zip(y_iter)
            .zip(a_iter)
            .for_each(|((rgba, y_plane), a_plane)| {
                let y_plane = &y_plane[0..image.width as usize];
+                let mut _cx = 0usize;
+
+                let offset = handler.handle_row::<DESTINATION_CHANNELS>(
+                    &chroma_range,
+                    &inverse_transform,
+                    y_plane,
+                    a_plane,
+                    rgba,
+                    _cx,
+                    image.width as usize,
+                );
+                _cx = offset;
+
                 for ((y_src, a_src), rgba) in y_plane
                     .iter()
                     .zip(a_plane)
                     .zip(rgba.chunks_exact_mut(channels))
+                    .skip(_cx)
                 {
                     let y_value = (y_src.as_() - bias_y) * y_coef;
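The `WideRowProcessor` added above detects RDM once at construction, then dispatches to the matching NEON row kernel on every call. A reduced sketch of the same pattern with simplified names and stub kernels, not the library's actual API:

    // Reduced sketch (illustration only): detect the CPU feature once,
    // then pick a row kernel and reuse it for every row.
    fn detect_rdm() -> bool {
        #[cfg(target_arch = "aarch64")]
        {
            std::arch::is_aarch64_feature_detected!("rdm")
        }
        #[cfg(not(target_arch = "aarch64"))]
        {
            false
        }
    }

    struct RowProcessor {
        kernel: fn(&[u8], &mut [u8]) -> usize,
    }

    impl Default for RowProcessor {
        fn default() -> Self {
            let kernel: fn(&[u8], &mut [u8]) -> usize =
                if detect_rdm() { rdm_kernel } else { generic_kernel };
            RowProcessor { kernel }
        }
    }

    // Stub kernels standing in for the NEON row functions; each returns how
    // many pixels it handled so the scalar tail can resume from there.
    fn rdm_kernel(_y: &[u8], _out: &mut [u8]) -> usize { 0 }
    fn generic_kernel(_y: &[u8], _out: &mut [u8]) -> usize { 0 }

    fn main() {
        let processor = RowProcessor::default();
        let y = [128u8; 64];
        let mut out = [0u8; 256];
        let handled = (processor.kernel)(&y, &mut out);
        assert_eq!(handled, 0); // stubs handle nothing; a scalar loop would finish
    }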
diff --git a/src/ycgco_r_to_rgb.rs b/src/ycgco_r_to_rgb.rs
index f5ec7bea..6d931508 100644
--- a/src/ycgco_r_to_rgb.rs
+++ b/src/ycgco_r_to_rgb.rs
@@ -67,7 +67,7 @@ fn ycgco_r_type_ro_rgbx<
-    let rounding_const = 1 << (PRECISION - 1);
+    let rounding_const = (1 << (PRECISION - 1)) - 1;
diff --git a/src/yuv_support.rs b/src/yuv_support.rs
--- a/src/yuv_support.rs
+++ b/src/yuv_support.rs
@@ ... @@
     /// Integral transformation adds an error not less than 1%
     pub fn to_integers(self, precision: u32) -> CbCrInverseTransform<i32> {
         let precision_scale: i32 = 1i32 << (precision as i32);
-        let cr_coef = (self.cr_coef * precision_scale as f32) as i32;
-        let cb_coef = (self.cb_coef * precision_scale as f32) as i32;
-        let y_coef = (self.y_coef * precision_scale as f32) as i32;
-        let g_coef_1 = (self.g_coeff_1 * precision_scale as f32) as i32;
-        let g_coef_2 = (self.g_coeff_2 * precision_scale as f32) as i32;
+        let cr_coef = (self.cr_coef * precision_scale as f32).round() as i32;
+        let cb_coef = (self.cb_coef * precision_scale as f32).round() as i32;
+        let y_coef = (self.y_coef * precision_scale as f32).round() as i32;
+        let g_coef_1 = (self.g_coeff_1 * precision_scale as f32).round() as i32;
+        let g_coef_2 = (self.g_coeff_2 * precision_scale as f32).round() as i32;
         CbCrInverseTransform::<i32> {
             y_coef,
             cr_coef,
@@ -114,15 +114,15 @@ impl ToIntegerTransform for CbCrForwardTransform<f32> {
     fn to_integers(&self, precision: u32) -> CbCrForwardTransform<i32> {
         let scale = (1 << precision) as f32;
         CbCrForwardTransform::<i32> {
-            yr: (self.yr * scale) as i32,
-            yg: (self.yg * scale) as i32,
-            yb: (self.yb * scale) as i32,
-            cb_r: (self.cb_r * scale) as i32,
-            cb_g: (self.cb_g * scale) as i32,
-            cb_b: (self.cb_b * scale) as i32,
-            cr_r: (self.cr_r * scale) as i32,
-            cr_g: (self.cr_g * scale) as i32,
-            cr_b: (self.cr_b * scale) as i32,
+            yr: (self.yr * scale).round() as i32,
+            yg: (self.yg * scale).round() as i32,
+            yb: (self.yb * scale).round() as i32,
+            cb_r: (self.cb_r * scale).round() as i32,
+            cb_g: (self.cb_g * scale).round() as i32,
+            cb_b: (self.cb_b * scale).round() as i32,
+            cr_r: (self.cr_r * scale).round() as i32,
+            cr_g: (self.cr_g * scale).round() as i32,
+            cr_b: (self.cr_b * scale).round() as i32,
         }
     }
 }
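Switching coefficient quantization from truncation to `.round()` halves the worst-case fixed-point error of each coefficient. A worked check (illustration only) at the 13-bit precision used throughout the patch, with the BT.601 green weight as the example:

    // Illustration (not part of the patch): truncation vs. rounding when
    // quantizing the BT.601 green luma weight to Q13 fixed point.
    fn main() {
        let precision_scale = (1 << 13) as f32;
        let yg = 0.587f32; // BT.601 green weight
        let truncated = (yg * precision_scale) as i32;
        let rounded = (yg * precision_scale).round() as i32;
        assert_eq!(truncated, 4808); // 0.587 * 8192 = 4808.704, floor
        assert_eq!(rounded, 4809); // nearest integer
        // Back-converted: 4808/8192 is off by ~8.6e-5, 4809/8192 by ~3.6e-5.
    }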