From bbe1dc4adbf1aeef616242139d6601a7fdd0bed0 Mon Sep 17 00:00:00 2001 From: Ali <> Date: Thu, 4 Aug 2022 01:44:51 +0400 Subject: [PATCH] Workaround for armv7 --- .../PublicHeaders/ImageDCT/ImageDCT.h | 3 + .../AnimationCache/ImageDCT/Sources/DCT.cpp | 120 +--- .../AnimationCache/ImageDCT/Sources/DCT.h | 3 + .../ImageDCT/Sources/DCTCommon.h | 1 - .../ImageDCT/Sources/DCT_Neon.c | 622 ------------------ .../ImageDCT/Sources/ImageDCT.mm | 4 + .../ImageDCT/Sources/YuvConversion.m | 62 +- .../Sources/AnimationCache.swift | 2 +- .../AnimationCache/Sources/ImageData.swift | 6 +- 9 files changed, 81 insertions(+), 742 deletions(-) diff --git a/submodules/TelegramUI/Components/AnimationCache/ImageDCT/PublicHeaders/ImageDCT/ImageDCT.h b/submodules/TelegramUI/Components/AnimationCache/ImageDCT/PublicHeaders/ImageDCT/ImageDCT.h index 7181f9ea560..75df7359c0a 100644 --- a/submodules/TelegramUI/Components/AnimationCache/ImageDCT/PublicHeaders/ImageDCT/ImageDCT.h +++ b/submodules/TelegramUI/Components/AnimationCache/ImageDCT/PublicHeaders/ImageDCT/ImageDCT.h @@ -26,8 +26,11 @@ typedef NS_ENUM(NSUInteger, ImageDCTTableType) { - (void)forwardWithPixels:(uint8_t const * _Nonnull)pixels coefficients:(int16_t * _Nonnull)coefficients width:(NSInteger)width height:(NSInteger)height bytesPerRow:(NSInteger)bytesPerRow __attribute__((objc_direct)); - (void)inverseWithCoefficients:(int16_t const * _Nonnull)coefficients pixels:(uint8_t * _Nonnull)pixels width:(NSInteger)width height:(NSInteger)height coefficientsPerRow:(NSInteger)coefficientsPerRow bytesPerRow:(NSInteger)bytesPerRow __attribute__((objc_direct)); + +#if defined(__aarch64__) - (void)forward4x4:(int16_t const * _Nonnull)normalizedCoefficients coefficients:(int16_t * _Nonnull)coefficients width:(NSInteger)width height:(NSInteger)height __attribute__((objc_direct)); - (void)inverse4x4Add:(int16_t const * _Nonnull)coefficients normalizedCoefficients:(int16_t * _Nonnull)normalizedCoefficients width:(NSInteger)width height:(NSInteger)height __attribute__((objc_direct)); +#endif @end diff --git a/submodules/TelegramUI/Components/AnimationCache/ImageDCT/Sources/DCT.cpp b/submodules/TelegramUI/Components/AnimationCache/ImageDCT/Sources/DCT.cpp index 537d3ad186a..e918a229aa2 100644 --- a/submodules/TelegramUI/Components/AnimationCache/ImageDCT/Sources/DCT.cpp +++ b/submodules/TelegramUI/Components/AnimationCache/ImageDCT/Sources/DCT.cpp @@ -353,7 +353,6 @@ void performInverseDct(int16_t const * coefficients, uint8_t *pixels, int width, int16_t element = coefficients[acOffset]; acOffset++; coefficientBlock[zigZagInv[blockY * DCTSIZE + blockX]] = element; - //coefficientBlock[zigZagInv[blockY * DCTSIZE + blockX]] = coefficients[(y + blockY) * coefficientsPerRow + (x + blockX)]; } } @@ -368,66 +367,6 @@ void performInverseDct(int16_t const * coefficients, uint8_t *pixels, int width, } } -void matrix_multiply_4x4_neon(float32_t *A, float32_t *B, float32_t *C) { - // these are the columns A - float32x4_t A0; - float32x4_t A1; - float32x4_t A2; - float32x4_t A3; - - // these are the columns B - float32x4_t B0; - float32x4_t B1; - float32x4_t B2; - float32x4_t B3; - - // these are the columns C - float32x4_t C0; - float32x4_t C1; - float32x4_t C2; - float32x4_t C3; - - A0 = vld1q_f32(A); - A1 = vld1q_f32(A+4); - A2 = vld1q_f32(A+8); - A3 = vld1q_f32(A+12); - - // Zero accumulators for C values - C0 = vmovq_n_f32(0); - C1 = vmovq_n_f32(0); - C2 = vmovq_n_f32(0); - C3 = vmovq_n_f32(0); - - // Multiply accumulate in 4x1 blocks, i.e. each column in C - B0 = vld1q_f32(B); - C0 = vfmaq_laneq_f32(C0, A0, B0, 0); - C0 = vfmaq_laneq_f32(C0, A1, B0, 1); - C0 = vfmaq_laneq_f32(C0, A2, B0, 2); - C0 = vfmaq_laneq_f32(C0, A3, B0, 3); - vst1q_f32(C, C0); - - B1 = vld1q_f32(B+4); - C1 = vfmaq_laneq_f32(C1, A0, B1, 0); - C1 = vfmaq_laneq_f32(C1, A1, B1, 1); - C1 = vfmaq_laneq_f32(C1, A2, B1, 2); - C1 = vfmaq_laneq_f32(C1, A3, B1, 3); - vst1q_f32(C+4, C1); - - B2 = vld1q_f32(B+8); - C2 = vfmaq_laneq_f32(C2, A0, B2, 0); - C2 = vfmaq_laneq_f32(C2, A1, B2, 1); - C2 = vfmaq_laneq_f32(C2, A2, B2, 2); - C2 = vfmaq_laneq_f32(C2, A3, B2, 3); - vst1q_f32(C+8, C2); - - B3 = vld1q_f32(B+12); - C3 = vfmaq_laneq_f32(C3, A0, B3, 0); - C3 = vfmaq_laneq_f32(C3, A1, B3, 1); - C3 = vfmaq_laneq_f32(C3, A2, B3, 2); - C3 = vfmaq_laneq_f32(C3, A3, B3, 3); - vst1q_f32(C+12, C3); -} - typedef int16_t tran_low_t; typedef int32_t tran_high_t; typedef int16_t tran_coef_t; @@ -483,30 +422,6 @@ static inline tran_high_t fdct_round_shift(tran_high_t input) { return rv; } -void fdct4x4_float(const int16_t *input, tran_low_t *output) { - float inputFloat[4 * 4]; - for (int i = 0; i < 4 * 4; i++) { - inputFloat[i] = (float)input[i]; - } - float outputFloat[4 * 4]; - - int i, j, u, v; - for (u = 0; u < 4; ++u) { - for (v = 0; v < 4; ++v) { - outputFloat[u * 4 + v] = 0; - for (i = 0; i < 4; i++) { - for (j = 0; j < 4; j++) { - outputFloat[u * 4 + v] += inputFloat[i * 4 + j] * cos(M_PI/((float)4)*(i+1./2.)*u)*cos(M_PI/((float)4)*(j+1./2.)*v); - } - } - } - } - - for (int i = 0; i < 4 * 4; i++) { - output[i] = (float)outputFloat[i]; - } -} - void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) { // The 2D transform is done with two passes which are actually pretty // similar. In the first one, we transform the columns and transpose @@ -636,14 +551,11 @@ void vpx_idct4x4_16_add_c(const tran_low_t *input, tran_low_t *dest, int stride) idct4_c(temp_in, temp_out); for (j = 0; j < 4; ++j) { dest[j * stride + i] = ROUND_POWER_OF_TWO(temp_out[j], 4); - //dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4)); } } } -static inline int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) { - return vld1q_s16(buf); -} +#if defined(__aarch64__) static inline void transpose_s16_4x4q(int16x8_t *a0, int16x8_t *a1) { // Swap 32 bit elements. Goes from: @@ -752,23 +664,18 @@ inline void vpx_idct4x4_16_add_neon(const int16x8_t &top64, const int16x8_t &bot vst1_s16(dest + destRowIncrement * 1, vget_high_s16(a[0])); vst1_s16(dest + destRowIncrement * 2, vget_high_s16(a[1])); vst1_s16(dest + destRowIncrement * 3, vget_low_s16(a[1])); - - //vst1q_s16(dest, a[0]); - //dest += 2 * 4; - //vst1_s16(dest, vget_high_s16(a[1])); - //dest += 4; - //vst1_s16(dest, vget_low_s16(a[1])); } +#endif + static int dct4x4QuantDC = 58; static int dct4x4QuantAC = 58; +#if defined(__aarch64__) + void performForward4x4Dct(int16_t const *normalizedCoefficients, int16_t *coefficients, int width, int height, DCTELEM *divisors) { DCTELEM block[4 * 4]; DCTELEM coefBlock[4 * 4]; - - //int acOffset = (width / 4) * (height / 4); - for (int y = 0; y < height; y += 4) { for (int x = 0; x < width; x += 4) { for (int blockY = 0; blockY < 4; blockY++) { @@ -791,20 +698,9 @@ void performForward4x4Dct(int16_t const *normalizedCoefficients, int16_t *coeffi } } - //coefficients[(y / 4) * (width / 4) + x / 4] = coefBlock[0]; - for (int blockY = 0; blockY < 4; blockY++) { for (int blockX = 0; blockX < 4; blockX++) { - /*if (blockX == 0 && blockY == 0) { - continue; - }*/ - coefficients[(y + blockY) * width + (x + blockX)] = coefBlock[zigZag4x4Inv[blockY * 4 + blockX]]; - //coefficients[acOffset] = coefBlock[zigZag4x4Inv[blockY * 4 + blockX]]; - //acOffset++; - //coefficients[(y + blockY) * width + (x + blockX)] = coefBlock[blockY * 4 + blockX]; - //int targetIndex = (blockY * 4 + blockX) * (width / 4 * height / 4) + blockIndex; - //coefficients[targetIndex] = coefBlock[zigZag4x4Inv[blockY * 4 + blockX]]; } } } @@ -845,6 +741,8 @@ void performInverse4x4DctAdd(int16_t const *coefficients, int16_t *normalizedCoe } } +#endif + } namespace dct { @@ -912,6 +810,8 @@ void DCT::inverse(int16_t const *coefficients, uint8_t *pixels, int width, int h performInverseDct(coefficients, pixels, width, height, coefficientsPerRow, bytesPerRow, _internal->auxiliaryData, (IFAST_MULT_TYPE *)_internal->inverseDctData.data()); } +#if defined(__aarch64__) + void DCT::forward4x4(int16_t const *normalizedCoefficients, int16_t *coefficients, int width, int height) { performForward4x4Dct(normalizedCoefficients, coefficients, width, height, (DCTELEM *)_internal->forwardDctData.data()); } @@ -920,4 +820,6 @@ void DCT::inverse4x4Add(int16_t const *coefficients, int16_t *normalizedCoeffici performInverse4x4DctAdd(coefficients, normalizedCoefficients, width, height, _internal->auxiliaryData, (IFAST_MULT_TYPE *)_internal->inverseDctData.data()); } +#endif + } diff --git a/submodules/TelegramUI/Components/AnimationCache/ImageDCT/Sources/DCT.h b/submodules/TelegramUI/Components/AnimationCache/ImageDCT/Sources/DCT.h index 29d6943d6c7..1d5004cf1e0 100644 --- a/submodules/TelegramUI/Components/AnimationCache/ImageDCT/Sources/DCT.h +++ b/submodules/TelegramUI/Components/AnimationCache/ImageDCT/Sources/DCT.h @@ -30,8 +30,11 @@ class DCT { void forward(uint8_t const *pixels, int16_t *coefficients, int width, int height, int bytesPerRow); void inverse(int16_t const *coefficients, uint8_t *pixels, int width, int height, int coefficientsPerRow, int bytesPerRow); + +#if defined(__aarch64__) void forward4x4(int16_t const *normalizedCoefficients, int16_t *coefficients, int width, int height); void inverse4x4Add(int16_t const *coefficients, int16_t *normalizedCoefficients, int width, int height); +#endif private: DCTInternal *_internal; diff --git a/submodules/TelegramUI/Components/AnimationCache/ImageDCT/Sources/DCTCommon.h b/submodules/TelegramUI/Components/AnimationCache/ImageDCT/Sources/DCTCommon.h index b900f4a3e19..b57f76e8dea 100644 --- a/submodules/TelegramUI/Components/AnimationCache/ImageDCT/Sources/DCTCommon.h +++ b/submodules/TelegramUI/Components/AnimationCache/ImageDCT/Sources/DCTCommon.h @@ -18,7 +18,6 @@ struct DctAuxiliaryData *createDctAuxiliaryData(); void freeDctAuxiliaryData(struct DctAuxiliaryData *data); void dct_jpeg_idct_ifast(struct DctAuxiliaryData *auxiliaryData, void *dct_table, JCOEFPTR coef_block, JSAMPROW output_buf); -void dct_jpeg_idct_ifast_normalized(struct DctAuxiliaryData *auxiliaryData, void *dct_table, JCOEFPTR coef_block, JCOEFPTR output_buf); void dct_jpeg_fdct_ifast(DCTELEM *data); #ifdef __cplusplus diff --git a/submodules/TelegramUI/Components/AnimationCache/ImageDCT/Sources/DCT_Neon.c b/submodules/TelegramUI/Components/AnimationCache/ImageDCT/Sources/DCT_Neon.c index d7a4fe25040..e0ce94fadd8 100644 --- a/submodules/TelegramUI/Components/AnimationCache/ImageDCT/Sources/DCT_Neon.c +++ b/submodules/TelegramUI/Components/AnimationCache/ImageDCT/Sources/DCT_Neon.c @@ -695,626 +695,4 @@ void dct_jpeg_idct_ifast(struct DctAuxiliaryData *auxiliaryData, void *dct_table vst1q_lane_u64((uint64_t *)outptr7, vreinterpretq_u64_u8(rows_37), 1); } -void dct_jpeg_idct_ifast_normalized_neon(struct DctAuxiliaryData *auxiliaryData, void *dct_table, JCOEFPTR coef_block, JCOEFPTR output_buf) -{ - IFAST_MULT_TYPE *quantptr = dct_table; - - /* Load DCT coefficients. */ - int16x8_t row0 = vld1q_s16(coef_block + 0 * DCTSIZE); - int16x8_t row1 = vld1q_s16(coef_block + 1 * DCTSIZE); - int16x8_t row2 = vld1q_s16(coef_block + 2 * DCTSIZE); - int16x8_t row3 = vld1q_s16(coef_block + 3 * DCTSIZE); - int16x8_t row4 = vld1q_s16(coef_block + 4 * DCTSIZE); - int16x8_t row5 = vld1q_s16(coef_block + 5 * DCTSIZE); - int16x8_t row6 = vld1q_s16(coef_block + 6 * DCTSIZE); - int16x8_t row7 = vld1q_s16(coef_block + 7 * DCTSIZE); - - /* Load quantization table values for DC coefficients. */ - int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE); - /* Dequantize DC coefficients. */ - row0 = vmulq_s16(row0, quant_row0); - - /* Construct bitmap to test if all AC coefficients are 0. */ - int16x8_t bitmap = vorrq_s16(row1, row2); - bitmap = vorrq_s16(bitmap, row3); - bitmap = vorrq_s16(bitmap, row4); - bitmap = vorrq_s16(bitmap, row5); - bitmap = vorrq_s16(bitmap, row6); - bitmap = vorrq_s16(bitmap, row7); - - int64_t left_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 0); - int64_t right_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 1); - - /* Load IDCT conversion constants. */ - const int16x4_t consts = vld1_s16(jsimd_idct_ifast_neon_consts); - - if (left_ac_bitmap == 0 && right_ac_bitmap == 0) { - /* All AC coefficients are zero. - * Compute DC values and duplicate into vectors. - */ - int16x8_t dcval = row0; - row1 = dcval; - row2 = dcval; - row3 = dcval; - row4 = dcval; - row5 = dcval; - row6 = dcval; - row7 = dcval; - } else if (left_ac_bitmap == 0) { - /* AC coefficients are zero for columns 0, 1, 2, and 3. - * Use DC values for these columns. - */ - int16x4_t dcval = vget_low_s16(row0); - - /* Commence regular fast IDCT computation for columns 4, 5, 6, and 7. */ - - /* Load quantization table. */ - int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4); - int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4); - int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4); - int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE + 4); - int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE + 4); - int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4); - int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4); - - /* Even part: dequantize DCT coefficients. */ - int16x4_t tmp0 = vget_high_s16(row0); - int16x4_t tmp1 = vmul_s16(vget_high_s16(row2), quant_row2); - int16x4_t tmp2 = vmul_s16(vget_high_s16(row4), quant_row4); - int16x4_t tmp3 = vmul_s16(vget_high_s16(row6), quant_row6); - - int16x4_t tmp10 = vadd_s16(tmp0, tmp2); /* phase 3 */ - int16x4_t tmp11 = vsub_s16(tmp0, tmp2); - - int16x4_t tmp13 = vadd_s16(tmp1, tmp3); /* phases 5-3 */ - int16x4_t tmp1_sub_tmp3 = vsub_s16(tmp1, tmp3); - int16x4_t tmp12 = vqdmulh_lane_s16(tmp1_sub_tmp3, consts, 1); - tmp12 = vadd_s16(tmp12, tmp1_sub_tmp3); - tmp12 = vsub_s16(tmp12, tmp13); - - tmp0 = vadd_s16(tmp10, tmp13); /* phase 2 */ - tmp3 = vsub_s16(tmp10, tmp13); - tmp1 = vadd_s16(tmp11, tmp12); - tmp2 = vsub_s16(tmp11, tmp12); - - /* Odd part: dequantize DCT coefficients. */ - int16x4_t tmp4 = vmul_s16(vget_high_s16(row1), quant_row1); - int16x4_t tmp5 = vmul_s16(vget_high_s16(row3), quant_row3); - int16x4_t tmp6 = vmul_s16(vget_high_s16(row5), quant_row5); - int16x4_t tmp7 = vmul_s16(vget_high_s16(row7), quant_row7); - - int16x4_t z13 = vadd_s16(tmp6, tmp5); /* phase 6 */ - int16x4_t neg_z10 = vsub_s16(tmp5, tmp6); - int16x4_t z11 = vadd_s16(tmp4, tmp7); - int16x4_t z12 = vsub_s16(tmp4, tmp7); - - tmp7 = vadd_s16(z11, z13); /* phase 5 */ - int16x4_t z11_sub_z13 = vsub_s16(z11, z13); - tmp11 = vqdmulh_lane_s16(z11_sub_z13, consts, 1); - tmp11 = vadd_s16(tmp11, z11_sub_z13); - - int16x4_t z10_add_z12 = vsub_s16(z12, neg_z10); - int16x4_t z5 = vqdmulh_lane_s16(z10_add_z12, consts, 2); - z5 = vadd_s16(z5, z10_add_z12); - tmp10 = vqdmulh_lane_s16(z12, consts, 0); - tmp10 = vadd_s16(tmp10, z12); - tmp10 = vsub_s16(tmp10, z5); - tmp12 = vqdmulh_lane_s16(neg_z10, consts, 3); - tmp12 = vadd_s16(tmp12, vadd_s16(neg_z10, neg_z10)); - tmp12 = vadd_s16(tmp12, z5); - - tmp6 = vsub_s16(tmp12, tmp7); /* phase 2 */ - tmp5 = vsub_s16(tmp11, tmp6); - tmp4 = vadd_s16(tmp10, tmp5); - - row0 = vcombine_s16(dcval, vadd_s16(tmp0, tmp7)); - row7 = vcombine_s16(dcval, vsub_s16(tmp0, tmp7)); - row1 = vcombine_s16(dcval, vadd_s16(tmp1, tmp6)); - row6 = vcombine_s16(dcval, vsub_s16(tmp1, tmp6)); - row2 = vcombine_s16(dcval, vadd_s16(tmp2, tmp5)); - row5 = vcombine_s16(dcval, vsub_s16(tmp2, tmp5)); - row4 = vcombine_s16(dcval, vadd_s16(tmp3, tmp4)); - row3 = vcombine_s16(dcval, vsub_s16(tmp3, tmp4)); - } else if (right_ac_bitmap == 0) { - /* AC coefficients are zero for columns 4, 5, 6, and 7. - * Use DC values for these columns. - */ - int16x4_t dcval = vget_high_s16(row0); - - /* Commence regular fast IDCT computation for columns 0, 1, 2, and 3. */ - - /* Load quantization table. */ - int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE); - int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE); - int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE); - int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE); - int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE); - int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE); - int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE); - - /* Even part: dequantize DCT coefficients. */ - int16x4_t tmp0 = vget_low_s16(row0); - int16x4_t tmp1 = vmul_s16(vget_low_s16(row2), quant_row2); - int16x4_t tmp2 = vmul_s16(vget_low_s16(row4), quant_row4); - int16x4_t tmp3 = vmul_s16(vget_low_s16(row6), quant_row6); - - int16x4_t tmp10 = vadd_s16(tmp0, tmp2); /* phase 3 */ - int16x4_t tmp11 = vsub_s16(tmp0, tmp2); - - int16x4_t tmp13 = vadd_s16(tmp1, tmp3); /* phases 5-3 */ - int16x4_t tmp1_sub_tmp3 = vsub_s16(tmp1, tmp3); - int16x4_t tmp12 = vqdmulh_lane_s16(tmp1_sub_tmp3, consts, 1); - tmp12 = vadd_s16(tmp12, tmp1_sub_tmp3); - tmp12 = vsub_s16(tmp12, tmp13); - - tmp0 = vadd_s16(tmp10, tmp13); /* phase 2 */ - tmp3 = vsub_s16(tmp10, tmp13); - tmp1 = vadd_s16(tmp11, tmp12); - tmp2 = vsub_s16(tmp11, tmp12); - - /* Odd part: dequantize DCT coefficients. */ - int16x4_t tmp4 = vmul_s16(vget_low_s16(row1), quant_row1); - int16x4_t tmp5 = vmul_s16(vget_low_s16(row3), quant_row3); - int16x4_t tmp6 = vmul_s16(vget_low_s16(row5), quant_row5); - int16x4_t tmp7 = vmul_s16(vget_low_s16(row7), quant_row7); - - int16x4_t z13 = vadd_s16(tmp6, tmp5); /* phase 6 */ - int16x4_t neg_z10 = vsub_s16(tmp5, tmp6); - int16x4_t z11 = vadd_s16(tmp4, tmp7); - int16x4_t z12 = vsub_s16(tmp4, tmp7); - - tmp7 = vadd_s16(z11, z13); /* phase 5 */ - int16x4_t z11_sub_z13 = vsub_s16(z11, z13); - tmp11 = vqdmulh_lane_s16(z11_sub_z13, consts, 1); - tmp11 = vadd_s16(tmp11, z11_sub_z13); - - int16x4_t z10_add_z12 = vsub_s16(z12, neg_z10); - int16x4_t z5 = vqdmulh_lane_s16(z10_add_z12, consts, 2); - z5 = vadd_s16(z5, z10_add_z12); - tmp10 = vqdmulh_lane_s16(z12, consts, 0); - tmp10 = vadd_s16(tmp10, z12); - tmp10 = vsub_s16(tmp10, z5); - tmp12 = vqdmulh_lane_s16(neg_z10, consts, 3); - tmp12 = vadd_s16(tmp12, vadd_s16(neg_z10, neg_z10)); - tmp12 = vadd_s16(tmp12, z5); - - tmp6 = vsub_s16(tmp12, tmp7); /* phase 2 */ - tmp5 = vsub_s16(tmp11, tmp6); - tmp4 = vadd_s16(tmp10, tmp5); - - row0 = vcombine_s16(vadd_s16(tmp0, tmp7), dcval); - row7 = vcombine_s16(vsub_s16(tmp0, tmp7), dcval); - row1 = vcombine_s16(vadd_s16(tmp1, tmp6), dcval); - row6 = vcombine_s16(vsub_s16(tmp1, tmp6), dcval); - row2 = vcombine_s16(vadd_s16(tmp2, tmp5), dcval); - row5 = vcombine_s16(vsub_s16(tmp2, tmp5), dcval); - row4 = vcombine_s16(vadd_s16(tmp3, tmp4), dcval); - row3 = vcombine_s16(vsub_s16(tmp3, tmp4), dcval); - } else { - /* Some AC coefficients are non-zero; full IDCT calculation required. */ - - /* Load quantization table. */ - int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE); - int16x8_t quant_row2 = vld1q_s16(quantptr + 2 * DCTSIZE); - int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE); - int16x8_t quant_row4 = vld1q_s16(quantptr + 4 * DCTSIZE); - int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE); - int16x8_t quant_row6 = vld1q_s16(quantptr + 6 * DCTSIZE); - int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE); - - /* Even part: dequantize DCT coefficients. */ - int16x8_t tmp0 = row0; - int16x8_t tmp1 = vmulq_s16(row2, quant_row2); - int16x8_t tmp2 = vmulq_s16(row4, quant_row4); - int16x8_t tmp3 = vmulq_s16(row6, quant_row6); - - int16x8_t tmp10 = vaddq_s16(tmp0, tmp2); /* phase 3 */ - int16x8_t tmp11 = vsubq_s16(tmp0, tmp2); - - int16x8_t tmp13 = vaddq_s16(tmp1, tmp3); /* phases 5-3 */ - int16x8_t tmp1_sub_tmp3 = vsubq_s16(tmp1, tmp3); - int16x8_t tmp12 = vqdmulhq_lane_s16(tmp1_sub_tmp3, consts, 1); - tmp12 = vaddq_s16(tmp12, tmp1_sub_tmp3); - tmp12 = vsubq_s16(tmp12, tmp13); - - tmp0 = vaddq_s16(tmp10, tmp13); /* phase 2 */ - tmp3 = vsubq_s16(tmp10, tmp13); - tmp1 = vaddq_s16(tmp11, tmp12); - tmp2 = vsubq_s16(tmp11, tmp12); - - /* Odd part: dequantize DCT coefficients. */ - int16x8_t tmp4 = vmulq_s16(row1, quant_row1); - int16x8_t tmp5 = vmulq_s16(row3, quant_row3); - int16x8_t tmp6 = vmulq_s16(row5, quant_row5); - int16x8_t tmp7 = vmulq_s16(row7, quant_row7); - - int16x8_t z13 = vaddq_s16(tmp6, tmp5); /* phase 6 */ - int16x8_t neg_z10 = vsubq_s16(tmp5, tmp6); - int16x8_t z11 = vaddq_s16(tmp4, tmp7); - int16x8_t z12 = vsubq_s16(tmp4, tmp7); - - tmp7 = vaddq_s16(z11, z13); /* phase 5 */ - int16x8_t z11_sub_z13 = vsubq_s16(z11, z13); - tmp11 = vqdmulhq_lane_s16(z11_sub_z13, consts, 1); - tmp11 = vaddq_s16(tmp11, z11_sub_z13); - - int16x8_t z10_add_z12 = vsubq_s16(z12, neg_z10); - int16x8_t z5 = vqdmulhq_lane_s16(z10_add_z12, consts, 2); - z5 = vaddq_s16(z5, z10_add_z12); - tmp10 = vqdmulhq_lane_s16(z12, consts, 0); - tmp10 = vaddq_s16(tmp10, z12); - tmp10 = vsubq_s16(tmp10, z5); - tmp12 = vqdmulhq_lane_s16(neg_z10, consts, 3); - tmp12 = vaddq_s16(tmp12, vaddq_s16(neg_z10, neg_z10)); - tmp12 = vaddq_s16(tmp12, z5); - - tmp6 = vsubq_s16(tmp12, tmp7); /* phase 2 */ - tmp5 = vsubq_s16(tmp11, tmp6); - tmp4 = vaddq_s16(tmp10, tmp5); - - row0 = vaddq_s16(tmp0, tmp7); - row7 = vsubq_s16(tmp0, tmp7); - row1 = vaddq_s16(tmp1, tmp6); - row6 = vsubq_s16(tmp1, tmp6); - row2 = vaddq_s16(tmp2, tmp5); - row5 = vsubq_s16(tmp2, tmp5); - row4 = vaddq_s16(tmp3, tmp4); - row3 = vsubq_s16(tmp3, tmp4); - } - - /* Transpose rows to work on columns in pass 2. */ - int16x8x2_t rows_01 = vtrnq_s16(row0, row1); - int16x8x2_t rows_23 = vtrnq_s16(row2, row3); - int16x8x2_t rows_45 = vtrnq_s16(row4, row5); - int16x8x2_t rows_67 = vtrnq_s16(row6, row7); - - int32x4x2_t rows_0145_l = vtrnq_s32(vreinterpretq_s32_s16(rows_01.val[0]), - vreinterpretq_s32_s16(rows_45.val[0])); - int32x4x2_t rows_0145_h = vtrnq_s32(vreinterpretq_s32_s16(rows_01.val[1]), - vreinterpretq_s32_s16(rows_45.val[1])); - int32x4x2_t rows_2367_l = vtrnq_s32(vreinterpretq_s32_s16(rows_23.val[0]), - vreinterpretq_s32_s16(rows_67.val[0])); - int32x4x2_t rows_2367_h = vtrnq_s32(vreinterpretq_s32_s16(rows_23.val[1]), - vreinterpretq_s32_s16(rows_67.val[1])); - - int32x4x2_t cols_04 = vzipq_s32(rows_0145_l.val[0], rows_2367_l.val[0]); - int32x4x2_t cols_15 = vzipq_s32(rows_0145_h.val[0], rows_2367_h.val[0]); - int32x4x2_t cols_26 = vzipq_s32(rows_0145_l.val[1], rows_2367_l.val[1]); - int32x4x2_t cols_37 = vzipq_s32(rows_0145_h.val[1], rows_2367_h.val[1]); - - int16x8_t col0 = vreinterpretq_s16_s32(cols_04.val[0]); - int16x8_t col1 = vreinterpretq_s16_s32(cols_15.val[0]); - int16x8_t col2 = vreinterpretq_s16_s32(cols_26.val[0]); - int16x8_t col3 = vreinterpretq_s16_s32(cols_37.val[0]); - int16x8_t col4 = vreinterpretq_s16_s32(cols_04.val[1]); - int16x8_t col5 = vreinterpretq_s16_s32(cols_15.val[1]); - int16x8_t col6 = vreinterpretq_s16_s32(cols_26.val[1]); - int16x8_t col7 = vreinterpretq_s16_s32(cols_37.val[1]); - - /* 1-D IDCT, pass 2 */ - - /* Even part */ - int16x8_t tmp10 = vaddq_s16(col0, col4); - int16x8_t tmp11 = vsubq_s16(col0, col4); - - int16x8_t tmp13 = vaddq_s16(col2, col6); - int16x8_t col2_sub_col6 = vsubq_s16(col2, col6); - int16x8_t tmp12 = vqdmulhq_lane_s16(col2_sub_col6, consts, 1); - tmp12 = vaddq_s16(tmp12, col2_sub_col6); - tmp12 = vsubq_s16(tmp12, tmp13); - - int16x8_t tmp0 = vaddq_s16(tmp10, tmp13); - int16x8_t tmp3 = vsubq_s16(tmp10, tmp13); - int16x8_t tmp1 = vaddq_s16(tmp11, tmp12); - int16x8_t tmp2 = vsubq_s16(tmp11, tmp12); - - /* Odd part */ - int16x8_t z13 = vaddq_s16(col5, col3); - int16x8_t neg_z10 = vsubq_s16(col3, col5); - int16x8_t z11 = vaddq_s16(col1, col7); - int16x8_t z12 = vsubq_s16(col1, col7); - - int16x8_t tmp7 = vaddq_s16(z11, z13); /* phase 5 */ - int16x8_t z11_sub_z13 = vsubq_s16(z11, z13); - tmp11 = vqdmulhq_lane_s16(z11_sub_z13, consts, 1); - tmp11 = vaddq_s16(tmp11, z11_sub_z13); - - int16x8_t z10_add_z12 = vsubq_s16(z12, neg_z10); - int16x8_t z5 = vqdmulhq_lane_s16(z10_add_z12, consts, 2); - z5 = vaddq_s16(z5, z10_add_z12); - tmp10 = vqdmulhq_lane_s16(z12, consts, 0); - tmp10 = vaddq_s16(tmp10, z12); - tmp10 = vsubq_s16(tmp10, z5); - tmp12 = vqdmulhq_lane_s16(neg_z10, consts, 3); - tmp12 = vaddq_s16(tmp12, vaddq_s16(neg_z10, neg_z10)); - tmp12 = vaddq_s16(tmp12, z5); - - int16x8_t tmp6 = vsubq_s16(tmp12, tmp7); /* phase 2 */ - int16x8_t tmp5 = vsubq_s16(tmp11, tmp6); - int16x8_t tmp4 = vaddq_s16(tmp10, tmp5); - - col0 = vaddq_s16(tmp0, tmp7); - col7 = vsubq_s16(tmp0, tmp7); - col1 = vaddq_s16(tmp1, tmp6); - col6 = vsubq_s16(tmp1, tmp6); - col2 = vaddq_s16(tmp2, tmp5); - col5 = vsubq_s16(tmp2, tmp5); - col4 = vaddq_s16(tmp3, tmp4); - col3 = vsubq_s16(tmp3, tmp4); - - /* Scale down by a factor of 8, narrowing to 8-bit. */ - int8x16_t cols_01_s8 = vcombine_s8(vqshrn_n_s16(col0, PASS1_BITS + 3), - vqshrn_n_s16(col1, PASS1_BITS + 3)); - int8x16_t cols_45_s8 = vcombine_s8(vqshrn_n_s16(col4, PASS1_BITS + 3), - vqshrn_n_s16(col5, PASS1_BITS + 3)); - int8x16_t cols_23_s8 = vcombine_s8(vqshrn_n_s16(col2, PASS1_BITS + 3), - vqshrn_n_s16(col3, PASS1_BITS + 3)); - int8x16_t cols_67_s8 = vcombine_s8(vqshrn_n_s16(col6, PASS1_BITS + 3), - vqshrn_n_s16(col7, PASS1_BITS + 3)); - /* Clamp to range [0-255]. */ - uint8x16_t cols_01 = - vreinterpretq_u8_s8 - (vaddq_s8(cols_01_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE)))); - uint8x16_t cols_45 = - vreinterpretq_u8_s8 - (vaddq_s8(cols_45_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE)))); - uint8x16_t cols_23 = - vreinterpretq_u8_s8 - (vaddq_s8(cols_23_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE)))); - uint8x16_t cols_67 = - vreinterpretq_u8_s8 - (vaddq_s8(cols_67_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE)))); - - /* Transpose block to prepare for store. */ - uint32x4x2_t cols_0415 = vzipq_u32(vreinterpretq_u32_u8(cols_01), - vreinterpretq_u32_u8(cols_45)); - uint32x4x2_t cols_2637 = vzipq_u32(vreinterpretq_u32_u8(cols_23), - vreinterpretq_u32_u8(cols_67)); - - uint8x16x2_t cols_0145 = vtrnq_u8(vreinterpretq_u8_u32(cols_0415.val[0]), - vreinterpretq_u8_u32(cols_0415.val[1])); - uint8x16x2_t cols_2367 = vtrnq_u8(vreinterpretq_u8_u32(cols_2637.val[0]), - vreinterpretq_u8_u32(cols_2637.val[1])); - uint16x8x2_t rows_0426 = vtrnq_u16(vreinterpretq_u16_u8(cols_0145.val[0]), - vreinterpretq_u16_u8(cols_2367.val[0])); - uint16x8x2_t rows_1537 = vtrnq_u16(vreinterpretq_u16_u8(cols_0145.val[1]), - vreinterpretq_u16_u8(cols_2367.val[1])); - - uint8x16_t rows_04 = vreinterpretq_u8_u16(rows_0426.val[0]); - uint8x16_t rows_15 = vreinterpretq_u8_u16(rows_1537.val[0]); - uint8x16_t rows_26 = vreinterpretq_u8_u16(rows_0426.val[1]); - uint8x16_t rows_37 = vreinterpretq_u8_u16(rows_1537.val[1]); - - JCOEFPTR outptr0 = output_buf + DCTSIZE * 0; - JCOEFPTR outptr1 = output_buf + DCTSIZE * 1; - JCOEFPTR outptr2 = output_buf + DCTSIZE * 2; - JCOEFPTR outptr3 = output_buf + DCTSIZE * 3; - JCOEFPTR outptr4 = output_buf + DCTSIZE * 4; - JCOEFPTR outptr5 = output_buf + DCTSIZE * 5; - JCOEFPTR outptr6 = output_buf + DCTSIZE * 6; - JCOEFPTR outptr7 = output_buf + DCTSIZE * 7; - - /* Store DCT block to memory. */ - vst1q_lane_u64((uint64_t *)outptr0, vreinterpretq_u64_u16(rows_04), 0); - vst1q_lane_u64((uint64_t *)outptr1, vreinterpretq_u64_u16(rows_15), 0); - vst1q_lane_u64((uint64_t *)outptr2, vreinterpretq_u64_u16(rows_26), 0); - vst1q_lane_u64((uint64_t *)outptr3, vreinterpretq_u64_u16(rows_37), 0); - vst1q_lane_u64((uint64_t *)outptr4, vreinterpretq_u64_u16(rows_04), 1); - vst1q_lane_u64((uint64_t *)outptr5, vreinterpretq_u64_u16(rows_15), 1); - vst1q_lane_u64((uint64_t *)outptr6, vreinterpretq_u64_u16(rows_26), 1); - vst1q_lane_u64((uint64_t *)outptr7, vreinterpretq_u64_u16(rows_37), 1); -} - -void dct_jpeg_idct_ifast_normalized(struct DctAuxiliaryData *auxiliaryData, void *dct_table, JCOEFPTR coef_block, JCOEFPTR output_buf) { - DCTELEM tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - DCTELEM tmp10, tmp11, tmp12, tmp13; - DCTELEM z5, z10, z11, z12, z13; - JCOEFPTR inptr; - IFAST_MULT_TYPE *quantptr; - int *wsptr; - JCOEFPTR outptr; - int ctr; - int workspace[DCTSIZE2]; /* buffers data between passes */ - - /* Pass 1: process columns from input, store into work array. */ - - inptr = coef_block; - quantptr = dct_table; - wsptr = workspace; - for (ctr = DCTSIZE; ctr > 0; ctr--) { - /* Due to quantization, we will usually find that many of the input - * coefficients are zero, especially the AC terms. We can exploit this - * by short-circuiting the IDCT calculation for any column in which all - * the AC terms are zero. In that case each output is equal to the - * DC coefficient (with scale factor as needed). - * With typical images and quantization tables, half or more of the - * column DCT calculations can be simplified this way. - */ - - if (inptr[DCTSIZE * 1] == 0 && inptr[DCTSIZE * 2] == 0 && - inptr[DCTSIZE * 3] == 0 && inptr[DCTSIZE * 4] == 0 && - inptr[DCTSIZE * 5] == 0 && inptr[DCTSIZE * 6] == 0 && - inptr[DCTSIZE * 7] == 0) { - /* AC terms all zero */ - int dcval = (int)DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]); - - wsptr[DCTSIZE * 0] = dcval; - wsptr[DCTSIZE * 1] = dcval; - wsptr[DCTSIZE * 2] = dcval; - wsptr[DCTSIZE * 3] = dcval; - wsptr[DCTSIZE * 4] = dcval; - wsptr[DCTSIZE * 5] = dcval; - wsptr[DCTSIZE * 6] = dcval; - wsptr[DCTSIZE * 7] = dcval; - - inptr++; /* advance pointers to next column */ - quantptr++; - wsptr++; - continue; - } - - /* Even part */ - - tmp0 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]); - tmp1 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]); - tmp2 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]); - tmp3 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]); - - tmp10 = tmp0 + tmp2; /* phase 3 */ - tmp11 = tmp0 - tmp2; - - tmp13 = tmp1 + tmp3; /* phases 5-3 */ - tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; /* 2*c4 */ - - tmp0 = tmp10 + tmp13; /* phase 2 */ - tmp3 = tmp10 - tmp13; - tmp1 = tmp11 + tmp12; - tmp2 = tmp11 - tmp12; - - /* Odd part */ - - tmp4 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]); - tmp5 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]); - tmp6 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]); - tmp7 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]); - - z13 = tmp6 + tmp5; /* phase 6 */ - z10 = tmp6 - tmp5; - z11 = tmp4 + tmp7; - z12 = tmp4 - tmp7; - - tmp7 = z11 + z13; /* phase 5 */ - tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */ - - z5 = MULTIPLY(z10 + z12, FIX_1_847759065); /* 2*c2 */ - tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; /* 2*(c2-c6) */ - tmp12 = MULTIPLY(z10, -FIX_2_613125930) + z5; /* -2*(c2+c6) */ - - tmp6 = tmp12 - tmp7; /* phase 2 */ - tmp5 = tmp11 - tmp6; - tmp4 = tmp10 + tmp5; - - wsptr[DCTSIZE * 0] = (int)(tmp0 + tmp7); - wsptr[DCTSIZE * 7] = (int)(tmp0 - tmp7); - wsptr[DCTSIZE * 1] = (int)(tmp1 + tmp6); - wsptr[DCTSIZE * 6] = (int)(tmp1 - tmp6); - wsptr[DCTSIZE * 2] = (int)(tmp2 + tmp5); - wsptr[DCTSIZE * 5] = (int)(tmp2 - tmp5); - wsptr[DCTSIZE * 4] = (int)(tmp3 + tmp4); - wsptr[DCTSIZE * 3] = (int)(tmp3 - tmp4); - - inptr++; /* advance pointers to next column */ - quantptr++; - wsptr++; - } - - /* Pass 2: process rows from work array, store into output array. */ - /* Note that we must descale the results by a factor of 8 == 2**3, */ - /* and also undo the PASS1_BITS scaling. */ - - wsptr = workspace; - for (ctr = 0; ctr < DCTSIZE; ctr++) { - outptr = output_buf + ctr * DCTSIZE; - /* Rows of zeroes can be exploited in the same way as we did with columns. - * However, the column calculation has created many nonzero AC terms, so - * the simplification applies less often (typically 5% to 10% of the time). - * On machines with very fast multiplication, it's possible that the - * test takes more time than it's worth. In that case this section - * may be commented out. - */ - -#ifndef NO_ZERO_ROW_TEST - if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && wsptr[4] == 0 && - wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) { - /* AC terms all zero */ - //JSAMPLE dcval = range_limit[IDESCALE(wsptr[0], PASS1_BITS + 3) & RANGE_MASK]; - JCOEF dcval = wsptr[0]; - - outptr[0] = dcval; - outptr[1] = dcval; - outptr[2] = dcval; - outptr[3] = dcval; - outptr[4] = dcval; - outptr[5] = dcval; - outptr[6] = dcval; - outptr[7] = dcval; - - wsptr += DCTSIZE; /* advance pointer to next row */ - continue; - } -#endif - - /* Even part */ - - tmp10 = ((DCTELEM)wsptr[0] + (DCTELEM)wsptr[4]); - tmp11 = ((DCTELEM)wsptr[0] - (DCTELEM)wsptr[4]); - - tmp13 = ((DCTELEM)wsptr[2] + (DCTELEM)wsptr[6]); - tmp12 = - MULTIPLY((DCTELEM)wsptr[2] - (DCTELEM)wsptr[6], FIX_1_414213562) - tmp13; - - tmp0 = tmp10 + tmp13; - tmp3 = tmp10 - tmp13; - tmp1 = tmp11 + tmp12; - tmp2 = tmp11 - tmp12; - - /* Odd part */ - - z13 = (DCTELEM)wsptr[5] + (DCTELEM)wsptr[3]; - z10 = (DCTELEM)wsptr[5] - (DCTELEM)wsptr[3]; - z11 = (DCTELEM)wsptr[1] + (DCTELEM)wsptr[7]; - z12 = (DCTELEM)wsptr[1] - (DCTELEM)wsptr[7]; - - tmp7 = z11 + z13; /* phase 5 */ - tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */ - - z5 = MULTIPLY(z10 + z12, FIX_1_847759065); /* 2*c2 */ - tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; /* 2*(c2-c6) */ - tmp12 = MULTIPLY(z10, -FIX_2_613125930) + z5; /* -2*(c2+c6) */ - - tmp6 = tmp12 - tmp7; /* phase 2 */ - tmp5 = tmp11 - tmp6; - tmp4 = tmp10 + tmp5; - - /* Final output stage: scale down by a factor of 8 and range-limit */ - - /*outptr[0] = - range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS + 3) & RANGE_MASK]; - outptr[7] = - range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS + 3) & RANGE_MASK]; - outptr[1] = - range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS + 3) & RANGE_MASK]; - outptr[6] = - range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS + 3) & RANGE_MASK]; - outptr[2] = - range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS + 3) & RANGE_MASK]; - outptr[5] = - range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS + 3) & RANGE_MASK]; - outptr[4] = - range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS + 3) & RANGE_MASK]; - outptr[3] = - range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS + 3) & RANGE_MASK];*/ - - outptr[0] = IDESCALE(tmp0 + tmp7, PASS1_BITS + 3); - outptr[7] = IDESCALE(tmp0 - tmp7, PASS1_BITS + 3); - outptr[1] = IDESCALE(tmp1 + tmp6, PASS1_BITS + 3); - outptr[6] = IDESCALE(tmp1 - tmp6, PASS1_BITS + 3); - outptr[2] = IDESCALE(tmp2 + tmp5, PASS1_BITS + 3); - outptr[5] = IDESCALE(tmp2 - tmp5, PASS1_BITS + 3); - outptr[4] = IDESCALE(tmp3 + tmp4, PASS1_BITS + 3); - outptr[3] = IDESCALE(tmp3 - tmp4, PASS1_BITS + 3); - - /*outptr[0] = tmp0 + tmp7; - outptr[7] = tmp0 - tmp7; - outptr[1] = tmp1 + tmp6; - outptr[6] = tmp1 - tmp6; - outptr[2] = tmp2 + tmp5; - outptr[5] = tmp2 - tmp5; - outptr[4] = tmp3 + tmp4; - outptr[3] = tmp3 - tmp4;*/ - - wsptr += DCTSIZE; /* advance pointer to next row */ - } -} - #endif diff --git a/submodules/TelegramUI/Components/AnimationCache/ImageDCT/Sources/ImageDCT.mm b/submodules/TelegramUI/Components/AnimationCache/ImageDCT/Sources/ImageDCT.mm index 589b36e74c2..342c71018c2 100644 --- a/submodules/TelegramUI/Components/AnimationCache/ImageDCT/Sources/ImageDCT.mm +++ b/submodules/TelegramUI/Components/AnimationCache/ImageDCT/Sources/ImageDCT.mm @@ -78,6 +78,8 @@ - (void)inverseWithCoefficients:(int16_t const * _Nonnull)coefficients pixels:(u _dct->inverse(coefficients, pixels, (int)width, (int)height, (int)coefficientsPerRow, (int)bytesPerRow); } +#if defined(__aarch64__) + - (void)forward4x4:(int16_t const * _Nonnull)normalizedCoefficients coefficients:(int16_t * _Nonnull)coefficients width:(NSInteger)width height:(NSInteger)height { _dct->forward4x4(normalizedCoefficients, coefficients, (int)width, (int)height); } @@ -86,4 +88,6 @@ - (void)inverse4x4Add:(int16_t const * _Nonnull)coefficients normalizedCoefficie _dct->inverse4x4Add(coefficients, normalizedCoefficients, (int)width, (int)height); } +#endif + @end diff --git a/submodules/TelegramUI/Components/AnimationCache/ImageDCT/Sources/YuvConversion.m b/submodules/TelegramUI/Components/AnimationCache/ImageDCT/Sources/YuvConversion.m index 03567796d7a..4d00b0e92a8 100644 --- a/submodules/TelegramUI/Components/AnimationCache/ImageDCT/Sources/YuvConversion.m +++ b/submodules/TelegramUI/Components/AnimationCache/ImageDCT/Sources/YuvConversion.m @@ -96,6 +96,9 @@ void combineYUVAPlanesIntoARGB(uint8_t *argb, uint8_t const *inY, uint8_t const error = vImageConvert_420Yp8_Cb8_Cr8ToARGB8888(&srcYp, &srcCb, &srcCr, &destArgb, &info, permuteMap, 255, kvImageDoNotTile); error = vImageOverwriteChannels_ARGB8888(&srcA, &destArgb, &destArgb, 1 << 0, kvImageDoNotTile); + if (error != kvImageNoError) { + } + //error = vImageOverwriteChannels_ARGB8888(&srcYp, &destArgb, &destArgb, 1 << 1, kvImageDoNotTile); //error = vImageOverwriteChannels_ARGB8888(&srcYp, &destArgb, &destArgb, 1 << 2, kvImageDoNotTile); //error = vImageOverwriteChannels_ARGB8888(&srcYp, &destArgb, &destArgb, 1 << 3, kvImageDoNotTile); @@ -118,11 +121,13 @@ void scaleImagePlane(uint8_t *outPlane, int outWidth, int outHeight, int outByte } void convertUInt8toInt16(uint8_t const *source, int16_t *dest, int length) { +#if defined(__aarch64__) + #if DEBUG assert(!((intptr_t)source % sizeof(uint64_t))); assert(!((intptr_t)dest % sizeof(uint64_t))); #endif - + for (int i = 0; i < length; i += 8 * 4) { #pragma unroll for (int j = 0; j < 4; j++) { @@ -137,9 +142,15 @@ void convertUInt8toInt16(uint8_t const *source, int16_t *dest, int length) { dest[i] = (int16_t)source[i]; } } +#else + for (int i = 0; i < length; i++) { + dest[i] = (int16_t)source[i]; + } +#endif } void convertInt16toUInt8(int16_t const *source, uint8_t *dest, int length) { +#if defined(__aarch64__) for (int i = 0; i < length; i += 8) { int16x8_t lhs16 = vld1q_s16(&source[i]); int8x8_t lhs = vqmovun_s16(lhs16); @@ -158,9 +169,22 @@ void convertInt16toUInt8(int16_t const *source, uint8_t *dest, int length) { dest[i] = (int8_t)result; } } +#else + for (int i = 0; i < length; i++) { + int16_t result = source[i]; + if (result < 0) { + result = 0; + } + if (result > 255) { + result = 255; + } + dest[i] = (int8_t)result; + } +#endif } void subtractArraysInt16(int16_t const *a, int16_t const *b, int16_t *dest, int length) { +#if defined(__aarch64__) for (int i = 0; i < length; i += 8) { int16x8_t lhs = vld1q_s16((int16_t *)&a[i]); int16x8_t rhs = vld1q_s16((int16_t *)&b[i]); @@ -172,9 +196,15 @@ void subtractArraysInt16(int16_t const *a, int16_t const *b, int16_t *dest, int dest[i] = a[i] - b[i]; } } +#else + for (int i = 0; i < length; i++) { + dest[i] = a[i] - b[i]; + } +#endif } void addArraysInt16(int16_t const *a, int16_t const *b, int16_t *dest, int length) { +#if defined(__aarch64__) for (int i = 0; i < length; i += 8 * 4) { #pragma unroll for (int j = 0; j < 4; j++) { @@ -189,9 +219,15 @@ void addArraysInt16(int16_t const *a, int16_t const *b, int16_t *dest, int lengt dest[i] = a[i] - b[i]; } } +#else + for (int i = 0; i < length; i++) { + dest[i] = a[i] - b[i]; + } +#endif } void subtractArraysUInt8Int16(uint8_t const *a, int16_t const *b, uint8_t *dest, int length) { +#if defined(__aarch64__) for (int i = 0; i < length; i += 8) { uint8x8_t lhs8 = vld1_u8(&a[i]); int16x8_t lhs = vreinterpretq_s16_u16(vmovl_u8(lhs8)); @@ -214,12 +250,9 @@ void subtractArraysUInt8Int16(uint8_t const *a, int16_t const *b, uint8_t *dest, dest[i] = (int8_t)result; } } -} - -void addArraysUInt8Int16(uint8_t const *a, int16_t const *b, uint8_t *dest, int length) { -#if false +#else for (int i = 0; i < length; i++) { - int16_t result = ((int16_t)a[i]) + b[i]; + int16_t result = ((int16_t)a[i]) - b[i]; if (result < 0) { result = 0; } @@ -228,7 +261,11 @@ void addArraysUInt8Int16(uint8_t const *a, int16_t const *b, uint8_t *dest, int } dest[i] = (int8_t)result; } -#else +#endif +} + +void addArraysUInt8Int16(uint8_t const *a, int16_t const *b, uint8_t *dest, int length) { +#if defined(__aarch64__) for (int i = 0; i < length; i += 8) { uint8x8_t lhs8 = vld1_u8(&a[i]); int16x8_t lhs = vreinterpretq_s16_u16(vmovl_u8(lhs8)); @@ -251,5 +288,16 @@ void addArraysUInt8Int16(uint8_t const *a, int16_t const *b, uint8_t *dest, int dest[i] = (int8_t)result; } } +#else + for (int i = 0; i < length; i++) { + int16_t result = ((int16_t)a[i]) + b[i]; + if (result < 0) { + result = 0; + } + if (result > 255) { + result = 255; + } + dest[i] = (int8_t)result; + } #endif } diff --git a/submodules/TelegramUI/Components/AnimationCache/Sources/AnimationCache.swift b/submodules/TelegramUI/Components/AnimationCache/Sources/AnimationCache.swift index 865156eb50c..4435f007ba9 100644 --- a/submodules/TelegramUI/Components/AnimationCache/Sources/AnimationCache.swift +++ b/submodules/TelegramUI/Components/AnimationCache/Sources/AnimationCache.swift @@ -449,7 +449,7 @@ private final class AnimationCacheItemWriterImpl: AnimationCacheItemWriter { self.differenceCoefficients = differenceCoefficients } - #if DEBUG && false + #if !arch(arm64) var insertKeyframe = insertKeyframe insertKeyframe = true #endif diff --git a/submodules/TelegramUI/Components/AnimationCache/Sources/ImageData.swift b/submodules/TelegramUI/Components/AnimationCache/Sources/ImageData.swift index c4744e410e3..352867087ed 100644 --- a/submodules/TelegramUI/Components/AnimationCache/Sources/ImageData.swift +++ b/submodules/TelegramUI/Components/AnimationCache/Sources/ImageData.swift @@ -627,6 +627,7 @@ extension DctCoefficientsYUVA420 { } func dct4x4(dctData: DctData, target: DctCoefficientsYUVA420) { + #if arch(arm64) precondition(self.yPlane.width == target.yPlane.width && self.yPlane.height == target.yPlane.height) for i in 0 ..< 4 { @@ -655,15 +656,15 @@ extension DctCoefficientsYUVA420 { targetPlane.data.withUnsafeMutableBytes { bytes in let coefficients = bytes.baseAddress!.assumingMemoryBound(to: Int16.self) - //memcpy(coefficients, sourceCoefficients, sourceBytes.count) - dctData.deltaDct.forward4x4(sourceCoefficients, coefficients: coefficients, width: sourcePlane.width, height: sourcePlane.height) } } } + #endif } func idct4x4Add(dctData: DctData, target: DctCoefficientsYUVA420) { + #if arch(arm64) precondition(self.yPlane.width == target.yPlane.width && self.yPlane.height == target.yPlane.height) for i in 0 ..< 4 { @@ -698,6 +699,7 @@ extension DctCoefficientsYUVA420 { } } } + #endif } func subtract(other: DctCoefficientsYUVA420) {