Emoji 1.1 WIP

2025-12-23 06:35:51 +00:00 · 2022-08-01 02:22:35 +04:00
parent 47239681a7
commit 095b9d5058
17 changed files with 395 additions and 279 deletions
--- a/submodules/TelegramUI/Components/AnimationCache/ImageDCT/Sources/DCT.cpp
+++ b/submodules/TelegramUI/Components/AnimationCache/ImageDCT/Sources/DCT.cpp
@@ -727,7 +727,7 @@ static inline void transpose_idct4x4_16_bd8(int16x8_t *const a) {
    idct4x4_16_kernel_bd8(a);
 }

-inline void vpx_idct4x4_16_add_neon(const int16x8_t &top64, const int16x8_t &bottom64, int16_t *dest, int16_t multiplier) {
+inline void vpx_idct4x4_16_add_neon(const int16x8_t &top64, const int16x8_t &bottom64, const int16x4_t &current0, const int16x4_t &current1, const int16x4_t &current2, const int16x4_t &current3, int16_t multiplier, int16_t *dest, int destRowIncrement) {
    int16x8_t a[2];
    
    assert(!((intptr_t)dest % sizeof(uint32_t)));
@@ -745,11 +745,19 @@ inline void vpx_idct4x4_16_add_neon(const int16x8_t &top64, const int16x8_t &bot
    a[0] = vrshrq_n_s16(a[0], 4);
    a[1] = vrshrq_n_s16(a[1], 4);
    
-    vst1q_s16(dest, a[0]);
-    dest += 2 * 4;
-    vst1_s16(dest, vget_high_s16(a[1]));
-    dest += 4;
-    vst1_s16(dest, vget_low_s16(a[1]));
+    a[0] = vaddq_s16(a[0], vcombine_s16(current0, current1));
+    a[1] = vaddq_s16(a[1], vcombine_s16(current3, current2));
+    
+    vst1_s16(dest + destRowIncrement * 0, vget_low_s16(a[0]));
+    vst1_s16(dest + destRowIncrement * 1, vget_high_s16(a[0]));
+    vst1_s16(dest + destRowIncrement * 2, vget_high_s16(a[1]));
+    vst1_s16(dest + destRowIncrement * 3, vget_low_s16(a[1]));
+    
+    //vst1q_s16(dest, a[0]);
+    //dest += 2 * 4;
+    //vst1_s16(dest, vget_high_s16(a[1]));
+    //dest += 4;
+    //vst1_s16(dest, vget_low_s16(a[1]));
 }

 static int dct4x4QuantDC = 58;
@@ -803,11 +811,14 @@ void performForward4x4Dct(int16_t const *normalizedCoefficients, int16_t *coeffi
    }
 }

-void performInverse4x4Dct(int16_t const * coefficients, int16_t *normalizedCoefficients, int width, int height, DctAuxiliaryData *auxiliaryData, IFAST_MULT_TYPE *ifmtbl) {
-    DCTELEM resultBlock[4 * 4];
-    
+void performInverse4x4DctAdd(int16_t const *coefficients, int16_t *normalizedCoefficients, int width, int height, DctAuxiliaryData *auxiliaryData, IFAST_MULT_TYPE *ifmtbl) {
    for (int y = 0; y < height; y += 4) {
        for (int x = 0; x < width; x += 4) {
+            int16x4_t current0 = vld1_s16(&normalizedCoefficients[(y + 0) * width + x]);
+            int16x4_t current1 = vld1_s16(&normalizedCoefficients[(y + 1) * width + x]);
+            int16x4_t current2 = vld1_s16(&normalizedCoefficients[(y + 2) * width + x]);
+            int16x4_t current3 = vld1_s16(&normalizedCoefficients[(y + 3) * width + x]);
+            
            uint32x2_t sa = vld1_u32((uint32_t *)&coefficients[(y + 0) * width + x]);
            uint32x2_t sb = vld1_u32((uint32_t *)&coefficients[(y + 1) * width + x]);
            uint32x2_t sc = vld1_u32((uint32_t *)&coefficients[(y + 2) * width + x]);
@@ -829,34 +840,7 @@ void performInverse4x4Dct(int16_t const * coefficients, int16_t *normalizedCoeff
            int16x8_t top64 = vreinterpretq_s16_u16(qtop16);
            int16x8_t bottom64 = vreinterpretq_s16_u16(qbottom16);
            
-            /*DCTELEM coefficientBlock[4 * 4];
-            
-            for (int blockY = 0; blockY < 4; blockY++) {
-                for (int blockX = 0; blockX < 4; blockX++) {
-                    coefficientBlock[zigZag4x4Inv[blockY * 4 + blockX]] = coefficients[(y + blockY) * width + (x + blockX)];
-                }
-            }
-            
-            top64 = vreinterpretq_s16_u64(vld1q_u64((uint64_t *)&coefficientBlock[0]));
-            bottom64 = vreinterpretq_s16_u64(vld1q_u64((uint64_t *)&coefficientBlock[8]));*/
-            
-            vpx_idct4x4_16_add_neon(top64, bottom64, resultBlock, dct4x4QuantAC);
-            
-            uint32x2_t a = vld1_u32((uint32_t *)&resultBlock[4 * 0]);
-            uint32x2_t b = vld1_u32((uint32_t *)&resultBlock[4 * 1]);
-            uint32x2_t c = vld1_u32((uint32_t *)&resultBlock[4 * 2]);
-            uint32x2_t d = vld1_u32((uint32_t *)&resultBlock[4 * 3]);
-            
-            vst1_u32((uint32_t *)&normalizedCoefficients[(y + 0) * width + x], a);
-            vst1_u32((uint32_t *)&normalizedCoefficients[(y + 1) * width + x], b);
-            vst1_u32((uint32_t *)&normalizedCoefficients[(y + 2) * width + x], c);
-            vst1_u32((uint32_t *)&normalizedCoefficients[(y + 3) * width + x], d);
-            
-            /*for (int blockY = 0; blockY < 4; blockY++) {
-                for (int blockX = 0; blockX < 4; blockX++) {
-                    normalizedCoefficients[(y + blockY) * width + (x + blockX)] = resultBlock[blockY * 4 + blockX];
-                }
-            }*/
+            vpx_idct4x4_16_add_neon(top64, bottom64, current0, current1, current2, current3, dct4x4QuantAC, normalizedCoefficients + y * width + x, width);
        }
    }
 }
@@ -932,8 +916,8 @@ void DCT::forward4x4(int16_t const *normalizedCoefficients, int16_t *coefficient
    performForward4x4Dct(normalizedCoefficients, coefficients, width, height, (DCTELEM *)_internal->forwardDctData.data());
 }

-void DCT::inverse4x4(int16_t const *coefficients, int16_t *normalizedCoefficients, int width, int height) {
-    performInverse4x4Dct(coefficients, normalizedCoefficients, width, height, _internal->auxiliaryData, (IFAST_MULT_TYPE *)_internal->inverseDctData.data());
+void DCT::inverse4x4Add(int16_t const *coefficients, int16_t *normalizedCoefficients, int width, int height) {
+    performInverse4x4DctAdd(coefficients, normalizedCoefficients, width, height, _internal->auxiliaryData, (IFAST_MULT_TYPE *)_internal->inverseDctData.data());
 }

 }