Skip to content

Commit

Permalink
Rounding and bugfixes
Browse files Browse the repository at this point in the history
  • Loading branch information
awxkee committed Sep 4, 2024
1 parent b678812 commit fbab0cb
Show file tree
Hide file tree
Showing 13 changed files with 49 additions and 43 deletions.
11 changes: 6 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ Supports:
- [x] YCgCo-R
- [x] YUY2
- [x] Identity ( GBR )
- [x] Sharp YUV

# SIMD

Expand Down Expand Up @@ -65,11 +66,11 @@ yuv422_to_rgb(&y_plane, y_stride,

```rust
rgb_to_ycgco420(&mut y_plane, y_stride,
&mut cg_plane, cg_width,
&mut co_plane, co_width,
&rgb, rgb_stride,
width, height,
YuvRange::TV);
&mut cg_plane, cg_width,
&mut co_plane, co_width,
&rgb, rgb_stride,
width, height,
YuvRange::TV);
```

### YCgCo to RGB
Expand Down
2 changes: 1 addition & 1 deletion src/avx2/yuv_nv_to_rgba.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ pub unsafe fn avx2_yuv_nv_to_rgba_row<
let rounding_const = _mm256_set1_epi16(1 << 5);

while cx + 32 < width {
let y_values = _mm256_subs_epi8(
let y_values = _mm256_subs_epu8(
_mm256_loadu_si256(y_ptr.add(y_offset + cx) as *const __m256i),
y_corr,
);
Expand Down
2 changes: 1 addition & 1 deletion src/avx2/yuv_to_rgba.rs
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ pub unsafe fn avx2_yuv_to_rgba_row<const DESTINATION_CHANNELS: u8, const SAMPLIN
let rounding_const = _mm256_set1_epi16(1 << 5);

while cx + 32 < width {
let y_values = _mm256_subs_epi8(
let y_values = _mm256_subs_epu8(
_mm256_loadu_si256(y_ptr.add(y_offset + cx) as *const __m256i),
y_corr,
);
Expand Down
2 changes: 1 addition & 1 deletion src/avx2/yuv_to_rgba_alpha.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ pub unsafe fn avx2_yuv_to_rgba_alpha<const DESTINATION_CHANNELS: u8, const SAMPL
let rounding_const = _mm256_set1_epi16(1 << 5);

while cx + 32 < width {
let y_values = _mm256_subs_epi8(
let y_values = _mm256_subs_epu8(
_mm256_loadu_si256(y_ptr.add(y_offset + cx) as *const __m256i),
y_corr,
);
Expand Down
2 changes: 1 addition & 1 deletion src/avx512bw/yuv_nv_to_rgba.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ pub unsafe fn avx512_yuv_nv_to_rgba<
let v_alpha = _mm512_set1_epi8(255u8 as i8);

while cx + 32 < width {
let y_values = _mm512_subs_epi8(
let y_values = _mm512_subs_epu8(
_mm512_loadu_si512(y_ptr.add(y_offset + cx) as *const i32),
y_corr,
);
Expand Down
2 changes: 1 addition & 1 deletion src/avx512bw/yuv_to_rgba.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ pub unsafe fn avx512_yuv_to_rgba<const DESTINATION_CHANNELS: u8, const SAMPLING:
let v_alpha = _mm512_set1_epi8(255u8 as i8);

while cx + 64 < width {
let y_values = _mm512_subs_epi8(
let y_values = _mm512_subs_epu8(
_mm512_loadu_si512(y_ptr.add(y_offset + cx) as *const i32),
y_corr,
);
Expand Down
2 changes: 1 addition & 1 deletion src/avx512bw/yuv_to_rgba_alpha.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ pub unsafe fn avx512_yuv_to_rgba_alpha<const DESTINATION_CHANNELS: u8, const SAM
let v_g_coeff_2 = _mm512_set1_epi16(-1 * transform.g_coeff_2 as i16);

while cx + 64 < width {
let y_values = _mm512_subs_epi8(
let y_values = _mm512_subs_epu8(
_mm512_loadu_si512(y_ptr.add(y_offset + cx) as *const i32),
y_corr,
);
Expand Down
15 changes: 8 additions & 7 deletions src/rgb_to_yuv_p16.rs
Original file line number Diff line number Diff line change
Expand Up @@ -66,9 +66,10 @@ fn rgbx_to_yuv_impl<
kr_kb.kb,
);
let transform = transform_precise.to_integers(8);
let precision_scale = (1 << 8) as f32;
let bias_y = ((range.bias_y as f32 + 0.5f32) * precision_scale) as i32;
let bias_uv = ((range.bias_uv as f32 + 0.5f32) * precision_scale) as i32;
const PRECISION: i32 = 8;
const ROUNDING_CONST_BIAS: i32 = 1 << (PRECISION - 1);
let bias_y = range.bias_y as i32 * (1 << PRECISION) + ROUNDING_CONST_BIAS;
let bias_uv = range.bias_uv as i32 * (1 << PRECISION) + ROUNDING_CONST_BIAS;

let iterator_step = match chroma_subsampling {
YuvChromaSample::YUV420 => 2usize,
Expand Down Expand Up @@ -152,9 +153,9 @@ fn rgbx_to_yuv_impl<
let r = unsafe { src.add(src_chans.get_r_channel_offset()).read_unaligned() } as i32;
let g = unsafe { src.add(src_chans.get_g_channel_offset()).read_unaligned() } as i32;
let b = unsafe { src.add(src_chans.get_b_channel_offset()).read_unaligned() } as i32;
let y_0 = (r * transform.yr + g * transform.yg + b * transform.yb + bias_y) >> 8;
let cb = (r * transform.cb_r + g * transform.cb_g + b * transform.cb_b + bias_uv) >> 8;
let cr = (r * transform.cr_r + g * transform.cr_g + b * transform.cr_b + bias_uv) >> 8;
let y_0 = (r * transform.yr + g * transform.yg + b * transform.yb + bias_y) >> PRECISION;
let cb = (r * transform.cb_r + g * transform.cb_g + b * transform.cb_b + bias_uv) >> PRECISION;
let cr = (r * transform.cr_r + g * transform.cr_g + b * transform.cr_b + bias_uv) >> PRECISION;
unsafe {
y_st_ptr.add(x).write_unaligned(transform_integer::<
ENDIANNESS,
Expand Down Expand Up @@ -199,7 +200,7 @@ fn rgbx_to_yuv_impl<
unsafe { src.add(src_chans.get_b_channel_offset()).read_unaligned() }
as i32;
let y_1 =
(r * transform.yr + g * transform.yg + b * transform.yb + bias_y) >> 8;
(r * transform.yr + g * transform.yg + b * transform.yb + bias_y) >> PRECISION;
unsafe {
y_st_ptr.add(x + 1).write_unaligned(transform_integer::<
ENDIANNESS,
Expand Down
22 changes: 13 additions & 9 deletions src/rgba_to_nv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,11 @@ fn rgbx_to_nv<const ORIGIN_CHANNELS: u8, const UV_ORDER: u8, const SAMPLING: u8>
kr_kb.kr,
kr_kb.kb,
);
let transform = transform_precise.to_integers(8);
const ROUNDING_CONST_BIAS: i32 = 1 << 7;
let bias_y = range.bias_y as i32 * (1 << 8) + ROUNDING_CONST_BIAS;
let bias_uv = range.bias_uv as i32 * (1 << 8) + ROUNDING_CONST_BIAS;
const PRECISION: i32 = 8;
let transform = transform_precise.to_integers(PRECISION as u32);
const ROUNDING_CONST_BIAS: i32 = 1 << (PRECISION - 1);
let bias_y = range.bias_y as i32 * (1 << PRECISION) + ROUNDING_CONST_BIAS;
let bias_uv = range.bias_uv as i32 * (1 << PRECISION) + ROUNDING_CONST_BIAS;

let iterator_step = match chroma_subsampling {
YuvChromaSample::YUV420 => 2usize,
Expand Down Expand Up @@ -135,9 +136,12 @@ fn rgbx_to_nv<const ORIGIN_CHANNELS: u8, const UV_ORDER: u8, const SAMPLING: u8>
as i32;
let b = unsafe { *source_slice.get_unchecked(source_channels.get_b_channel_offset()) }
as i32;
let y_0 = (r * transform.yr + g * transform.yg + b * transform.yb + bias_y) >> 8;
let cb = (r * transform.cb_r + g * transform.cb_g + b * transform.cb_b + bias_uv) >> 8;
let cr = (r * transform.cr_r + g * transform.cr_g + b * transform.cr_b + bias_uv) >> 8;
let y_0 =
(r * transform.yr + g * transform.yg + b * transform.yb + bias_y) >> PRECISION;
let cb = (r * transform.cb_r + g * transform.cb_g + b * transform.cb_b + bias_uv)
>> PRECISION;
let cr = (r * transform.cr_r + g * transform.cr_g + b * transform.cr_b + bias_uv)
>> PRECISION;
unsafe {
*y_plane.get_unchecked_mut(y_offset + x) = y_0.clamp(i_bias_y, i_cap_y) as u8;
}
Expand All @@ -164,8 +168,8 @@ fn rgbx_to_nv<const ORIGIN_CHANNELS: u8, const UV_ORDER: u8, const SAMPLING: u8>
let b = unsafe {
*source_slice.get_unchecked(source_channels.get_b_channel_offset())
} as i32;
let y_1 =
(r * transform.yr + g * transform.yg + b * transform.yb + bias_y) >> 8;
let y_1 = (r * transform.yr + g * transform.yg + b * transform.yb + bias_y)
>> PRECISION;
unsafe {
*y_plane.get_unchecked_mut(y_offset + next_x) =
y_1.clamp(i_bias_y, i_cap_y) as u8;
Expand Down
7 changes: 4 additions & 3 deletions src/sse/rgba_to_yuv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,10 +45,11 @@ pub unsafe fn sse_rgba_to_yuv_row<const ORIGIN_CHANNELS: u8, const SAMPLING: u8>

let mut cx = start_cx;
let mut uv_x = start_ux;
const PRECISION: i32 = 8;

const ROUNDING_CONST_BIAS: i32 = 1 << 7;
let bias_y = range.bias_y as i32 * (1 << 8) + ROUNDING_CONST_BIAS;
let bias_uv = range.bias_uv as i32 * (1 << 8) + ROUNDING_CONST_BIAS;
const ROUNDING_CONST_BIAS: i32 = 1 << (PRECISION - 1);
let bias_y = range.bias_y as i32 * (1 << PRECISION) + ROUNDING_CONST_BIAS;
let bias_uv = range.bias_uv as i32 * (1 << PRECISION) + ROUNDING_CONST_BIAS;

let zeros = _mm_setzero_si128();

Expand Down
2 changes: 1 addition & 1 deletion src/sse/yuv_nv_to_rgba.rs
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ pub unsafe fn sse_yuv_nv_to_rgba<
let distribute_shuffle = _mm_setr_epi8(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7);

while cx + 16 < width {
let y_values = _mm_subs_epi8(
let y_values = _mm_subs_epu8(
_mm_loadu_si128(y_ptr.add(y_offset + cx) as *const __m128i),
y_corr,
);
Expand Down
21 changes: 10 additions & 11 deletions src/sse/yuv_to_rgba.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@ pub unsafe fn sse_yuv_to_rgba_row<const DESTINATION_CHANNELS: u8, const SAMPLING
let v_luma_coeff = _mm_set1_epi16(transform.y_coef as i16);
let v_cr_coeff = _mm_set1_epi16(transform.cr_coef as i16);
let v_cb_coeff = _mm_set1_epi16(transform.cb_coef as i16);
let v_min_values = _mm_setzero_si128();
let v_g_coeff_1 = _mm_set1_epi16(-1 * transform.g_coeff_1 as i16);
let v_g_coeff_2 = _mm_set1_epi16(-1 * transform.g_coeff_2 as i16);
let v_alpha = _mm_set1_epi8(255u8 as i8);
Expand All @@ -58,7 +57,7 @@ pub unsafe fn sse_yuv_to_rgba_row<const DESTINATION_CHANNELS: u8, const SAMPLING
let zeros = _mm_setzero_si128();

while cx + 16 < width {
let y_values = _mm_subs_epi8(
let y_values = _mm_subs_epu8(
_mm_loadu_si128(y_ptr.add(y_offset + cx) as *const __m128i),
y_corr,
);
Expand Down Expand Up @@ -96,14 +95,14 @@ pub unsafe fn sse_yuv_to_rgba_row<const DESTINATION_CHANNELS: u8, const SAMPLING
let r_high = _mm_srai_epi16::<6>(_mm_adds_epi16(
_mm_max_epi16(
_mm_adds_epi16(y_high, _mm_mullo_epi16(v_high, v_cr_coeff)),
v_min_values,
zeros,
),
rounding_const,
));
let b_high = _mm_srai_epi16::<6>(_mm_adds_epi16(
_mm_max_epi16(
_mm_adds_epi16(y_high, _mm_mullo_epi16(u_high, v_cb_coeff)),
v_min_values,
zeros,
),
rounding_const,
));
Expand All @@ -116,7 +115,7 @@ pub unsafe fn sse_yuv_to_rgba_row<const DESTINATION_CHANNELS: u8, const SAMPLING
_mm_mullo_epi16(u_high, v_g_coeff_2),
),
),
v_min_values,
zeros,
),
rounding_const,
));
Expand All @@ -128,14 +127,14 @@ pub unsafe fn sse_yuv_to_rgba_row<const DESTINATION_CHANNELS: u8, const SAMPLING
let r_low = _mm_srai_epi16::<6>(_mm_adds_epi16(
_mm_max_epi16(
_mm_adds_epi16(y_low, _mm_mullo_epi16(v_low, v_cr_coeff)),
v_min_values,
zeros,
),
rounding_const,
));
let b_low = _mm_srai_epi16::<6>(_mm_adds_epi16(
_mm_max_epi16(
_mm_adds_epi16(y_low, _mm_mullo_epi16(u_low, v_cb_coeff)),
v_min_values,
zeros,
),
rounding_const,
));
Expand All @@ -148,7 +147,7 @@ pub unsafe fn sse_yuv_to_rgba_row<const DESTINATION_CHANNELS: u8, const SAMPLING
_mm_mullo_epi16(u_low, v_g_coeff_2),
),
),
v_min_values,
zeros,
),
rounding_const,
));
Expand Down Expand Up @@ -230,14 +229,14 @@ pub unsafe fn sse_yuv_to_rgba_row<const DESTINATION_CHANNELS: u8, const SAMPLING
let r_low = _mm_srai_epi16::<6>(_mm_adds_epi16(
_mm_max_epi16(
_mm_adds_epi16(y_low, _mm_mullo_epi16(v_low, v_cr_coeff)),
v_min_values,
zeros,
),
rounding_const,
));
let b_low = _mm_srai_epi16::<6>(_mm_adds_epi16(
_mm_max_epi16(
_mm_adds_epi16(y_low, _mm_mullo_epi16(u_low, v_cb_coeff)),
v_min_values,
zeros,
),
rounding_const,
));
Expand All @@ -250,7 +249,7 @@ pub unsafe fn sse_yuv_to_rgba_row<const DESTINATION_CHANNELS: u8, const SAMPLING
_mm_mullo_epi16(u_low, v_g_coeff_2),
),
),
v_min_values,
zeros,
),
rounding_const,
));
Expand Down
2 changes: 1 addition & 1 deletion src/sse/yuv_to_rgba_alpha.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ pub unsafe fn sse_yuv_to_rgba_alpha_row<const DESTINATION_CHANNELS: u8, const SA
let zeros = _mm_setzero_si128();

while cx + 16 < width {
let y_values = _mm_subs_epi8(
let y_values = _mm_subs_epu8(
_mm_loadu_si128(y_ptr.add(y_offset + cx) as *const __m128i),
y_corr,
);
Expand Down

0 comments on commit fbab0cb

Please sign in to comment.