Skip to content

Commit

Permalink
Workaround for armv7
Browse files Browse the repository at this point in the history
  • Loading branch information
Ali committed Aug 3, 2022
1 parent 551674a commit bbe1dc4
Show file tree
Hide file tree
Showing 9 changed files with 81 additions and 742 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,11 @@ typedef NS_ENUM(NSUInteger, ImageDCTTableType) {

- (void)forwardWithPixels:(uint8_t const * _Nonnull)pixels coefficients:(int16_t * _Nonnull)coefficients width:(NSInteger)width height:(NSInteger)height bytesPerRow:(NSInteger)bytesPerRow __attribute__((objc_direct));
- (void)inverseWithCoefficients:(int16_t const * _Nonnull)coefficients pixels:(uint8_t * _Nonnull)pixels width:(NSInteger)width height:(NSInteger)height coefficientsPerRow:(NSInteger)coefficientsPerRow bytesPerRow:(NSInteger)bytesPerRow __attribute__((objc_direct));

#if defined(__aarch64__)
- (void)forward4x4:(int16_t const * _Nonnull)normalizedCoefficients coefficients:(int16_t * _Nonnull)coefficients width:(NSInteger)width height:(NSInteger)height __attribute__((objc_direct));
- (void)inverse4x4Add:(int16_t const * _Nonnull)coefficients normalizedCoefficients:(int16_t * _Nonnull)normalizedCoefficients width:(NSInteger)width height:(NSInteger)height __attribute__((objc_direct));
#endif

@end

Expand Down
120 changes: 11 additions & 109 deletions submodules/TelegramUI/Components/AnimationCache/ImageDCT/Sources/DCT.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -353,7 +353,6 @@ void performInverseDct(int16_t const * coefficients, uint8_t *pixels, int width,
int16_t element = coefficients[acOffset];
acOffset++;
coefficientBlock[zigZagInv[blockY * DCTSIZE + blockX]] = element;
//coefficientBlock[zigZagInv[blockY * DCTSIZE + blockX]] = coefficients[(y + blockY) * coefficientsPerRow + (x + blockX)];
}
}

Expand All @@ -368,66 +367,6 @@ void performInverseDct(int16_t const * coefficients, uint8_t *pixels, int width,
}
}

void matrix_multiply_4x4_neon(float32_t *A, float32_t *B, float32_t *C) {
// these are the columns A
float32x4_t A0;
float32x4_t A1;
float32x4_t A2;
float32x4_t A3;

// these are the columns B
float32x4_t B0;
float32x4_t B1;
float32x4_t B2;
float32x4_t B3;

// these are the columns C
float32x4_t C0;
float32x4_t C1;
float32x4_t C2;
float32x4_t C3;

A0 = vld1q_f32(A);
A1 = vld1q_f32(A+4);
A2 = vld1q_f32(A+8);
A3 = vld1q_f32(A+12);

// Zero accumulators for C values
C0 = vmovq_n_f32(0);
C1 = vmovq_n_f32(0);
C2 = vmovq_n_f32(0);
C3 = vmovq_n_f32(0);

// Multiply accumulate in 4x1 blocks, i.e. each column in C
B0 = vld1q_f32(B);
C0 = vfmaq_laneq_f32(C0, A0, B0, 0);
C0 = vfmaq_laneq_f32(C0, A1, B0, 1);
C0 = vfmaq_laneq_f32(C0, A2, B0, 2);
C0 = vfmaq_laneq_f32(C0, A3, B0, 3);
vst1q_f32(C, C0);

B1 = vld1q_f32(B+4);
C1 = vfmaq_laneq_f32(C1, A0, B1, 0);
C1 = vfmaq_laneq_f32(C1, A1, B1, 1);
C1 = vfmaq_laneq_f32(C1, A2, B1, 2);
C1 = vfmaq_laneq_f32(C1, A3, B1, 3);
vst1q_f32(C+4, C1);

B2 = vld1q_f32(B+8);
C2 = vfmaq_laneq_f32(C2, A0, B2, 0);
C2 = vfmaq_laneq_f32(C2, A1, B2, 1);
C2 = vfmaq_laneq_f32(C2, A2, B2, 2);
C2 = vfmaq_laneq_f32(C2, A3, B2, 3);
vst1q_f32(C+8, C2);

B3 = vld1q_f32(B+12);
C3 = vfmaq_laneq_f32(C3, A0, B3, 0);
C3 = vfmaq_laneq_f32(C3, A1, B3, 1);
C3 = vfmaq_laneq_f32(C3, A2, B3, 2);
C3 = vfmaq_laneq_f32(C3, A3, B3, 3);
vst1q_f32(C+12, C3);
}

typedef int16_t tran_low_t;
typedef int32_t tran_high_t;
typedef int16_t tran_coef_t;
Expand Down Expand Up @@ -483,30 +422,6 @@ static inline tran_high_t fdct_round_shift(tran_high_t input) {
return rv;
}

void fdct4x4_float(const int16_t *input, tran_low_t *output) {
float inputFloat[4 * 4];
for (int i = 0; i < 4 * 4; i++) {
inputFloat[i] = (float)input[i];
}
float outputFloat[4 * 4];

int i, j, u, v;
for (u = 0; u < 4; ++u) {
for (v = 0; v < 4; ++v) {
outputFloat[u * 4 + v] = 0;
for (i = 0; i < 4; i++) {
for (j = 0; j < 4; j++) {
outputFloat[u * 4 + v] += inputFloat[i * 4 + j] * cos(M_PI/((float)4)*(i+1./2.)*u)*cos(M_PI/((float)4)*(j+1./2.)*v);
}
}
}
}

for (int i = 0; i < 4 * 4; i++) {
output[i] = (float)outputFloat[i];
}
}

void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
// The 2D transform is done with two passes which are actually pretty
// similar. In the first one, we transform the columns and transpose
Expand Down Expand Up @@ -636,14 +551,11 @@ void vpx_idct4x4_16_add_c(const tran_low_t *input, tran_low_t *dest, int stride)
idct4_c(temp_in, temp_out);
for (j = 0; j < 4; ++j) {
dest[j * stride + i] = ROUND_POWER_OF_TWO(temp_out[j], 4);
//dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4));
}
}
}

static inline int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) {
return vld1q_s16(buf);
}
#if defined(__aarch64__)

static inline void transpose_s16_4x4q(int16x8_t *a0, int16x8_t *a1) {
// Swap 32 bit elements. Goes from:
Expand Down Expand Up @@ -752,23 +664,18 @@ inline void vpx_idct4x4_16_add_neon(const int16x8_t &top64, const int16x8_t &bot
vst1_s16(dest + destRowIncrement * 1, vget_high_s16(a[0]));
vst1_s16(dest + destRowIncrement * 2, vget_high_s16(a[1]));
vst1_s16(dest + destRowIncrement * 3, vget_low_s16(a[1]));

//vst1q_s16(dest, a[0]);
//dest += 2 * 4;
//vst1_s16(dest, vget_high_s16(a[1]));
//dest += 4;
//vst1_s16(dest, vget_low_s16(a[1]));
}

#endif

static int dct4x4QuantDC = 58;
static int dct4x4QuantAC = 58;

#if defined(__aarch64__)

void performForward4x4Dct(int16_t const *normalizedCoefficients, int16_t *coefficients, int width, int height, DCTELEM *divisors) {
DCTELEM block[4 * 4];
DCTELEM coefBlock[4 * 4];

//int acOffset = (width / 4) * (height / 4);

for (int y = 0; y < height; y += 4) {
for (int x = 0; x < width; x += 4) {
for (int blockY = 0; blockY < 4; blockY++) {
Expand All @@ -791,20 +698,9 @@ void performForward4x4Dct(int16_t const *normalizedCoefficients, int16_t *coeffi
}
}

//coefficients[(y / 4) * (width / 4) + x / 4] = coefBlock[0];

for (int blockY = 0; blockY < 4; blockY++) {
for (int blockX = 0; blockX < 4; blockX++) {
/*if (blockX == 0 && blockY == 0) {
continue;
}*/

coefficients[(y + blockY) * width + (x + blockX)] = coefBlock[zigZag4x4Inv[blockY * 4 + blockX]];
//coefficients[acOffset] = coefBlock[zigZag4x4Inv[blockY * 4 + blockX]];
//acOffset++;
//coefficients[(y + blockY) * width + (x + blockX)] = coefBlock[blockY * 4 + blockX];
//int targetIndex = (blockY * 4 + blockX) * (width / 4 * height / 4) + blockIndex;
//coefficients[targetIndex] = coefBlock[zigZag4x4Inv[blockY * 4 + blockX]];
}
}
}
Expand Down Expand Up @@ -845,6 +741,8 @@ void performInverse4x4DctAdd(int16_t const *coefficients, int16_t *normalizedCoe
}
}

#endif

}

namespace dct {
Expand Down Expand Up @@ -912,6 +810,8 @@ void DCT::inverse(int16_t const *coefficients, uint8_t *pixels, int width, int h
performInverseDct(coefficients, pixels, width, height, coefficientsPerRow, bytesPerRow, _internal->auxiliaryData, (IFAST_MULT_TYPE *)_internal->inverseDctData.data());
}

#if defined(__aarch64__)

void DCT::forward4x4(int16_t const *normalizedCoefficients, int16_t *coefficients, int width, int height) {
performForward4x4Dct(normalizedCoefficients, coefficients, width, height, (DCTELEM *)_internal->forwardDctData.data());
}
Expand All @@ -920,4 +820,6 @@ void DCT::inverse4x4Add(int16_t const *coefficients, int16_t *normalizedCoeffici
performInverse4x4DctAdd(coefficients, normalizedCoefficients, width, height, _internal->auxiliaryData, (IFAST_MULT_TYPE *)_internal->inverseDctData.data());
}

#endif

}
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,11 @@ class DCT {

void forward(uint8_t const *pixels, int16_t *coefficients, int width, int height, int bytesPerRow);
void inverse(int16_t const *coefficients, uint8_t *pixels, int width, int height, int coefficientsPerRow, int bytesPerRow);

#if defined(__aarch64__)
void forward4x4(int16_t const *normalizedCoefficients, int16_t *coefficients, int width, int height);
void inverse4x4Add(int16_t const *coefficients, int16_t *normalizedCoefficients, int width, int height);
#endif

private:
DCTInternal *_internal;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ struct DctAuxiliaryData *createDctAuxiliaryData();
void freeDctAuxiliaryData(struct DctAuxiliaryData *data);

void dct_jpeg_idct_ifast(struct DctAuxiliaryData *auxiliaryData, void *dct_table, JCOEFPTR coef_block, JSAMPROW output_buf);
void dct_jpeg_idct_ifast_normalized(struct DctAuxiliaryData *auxiliaryData, void *dct_table, JCOEFPTR coef_block, JCOEFPTR output_buf);
void dct_jpeg_fdct_ifast(DCTELEM *data);

#ifdef __cplusplus
Expand Down
Loading

0 comments on commit bbe1dc4

Please sign in to comment.