Entity input: improved animation cache and rendering

2025-12-22 22:25:57 +00:00 · 2022-06-24 02:06:02 +01:00
parent 0f1b382265
commit c112bc5146
37 changed files with 2557 additions and 383 deletions
--- a/submodules/TelegramUI/Components/AnimationCache/BUILD
+++ b/submodules/TelegramUI/Components/AnimationCache/BUILD
@@ -13,6 +13,7 @@ swift_library(
        "//submodules/SSignalKit/SwiftSignalKit:SwiftSignalKit",
        "//submodules/CryptoUtils:CryptoUtils",
        "//submodules/ManagedFile:ManagedFile",
+        "//submodules/TelegramUI/Components/AnimationCache/DCT:DCT",
    ],
    visibility = [
        "//visibility:public",
--- a/submodules/TelegramUI/Components/AnimationCache/DCT/BUILD
+++ b/submodules/TelegramUI/Components/AnimationCache/DCT/BUILD
@@ -0,0 +1,23 @@
+
+objc_library(
+    name = "DCT",
+    enable_modules = True,
+    module_name = "DCT",
+    srcs = glob([
+        "Sources/**/*.m",
+        "Sources/**/*.h",
+    ]),
+    hdrs = glob([
+        "PublicHeaders/**/*.h",
+    ]),
+    includes = [
+        "PublicHeaders",
+    ],
+    sdk_frameworks = [
+        "Foundation",
+        "Accelerate",
+    ],
+    visibility = [
+        "//visibility:public",
+    ],
+)
--- a/submodules/TelegramUI/Components/AnimationCache/DCT/PublicHeaders/DCT/DCT.h
+++ b/submodules/TelegramUI/Components/AnimationCache/DCT/PublicHeaders/DCT/DCT.h
@@ -0,0 +1,14 @@
+#ifndef DctImageTransform_h
+#define DctImageTransform_h
+
+#import <Foundation/Foundation.h>
+
+#import <DCT/YuvConversion.h>
+
+NSData *generateForwardDctData(int quality);
+void performForwardDct(uint8_t const *pixels, int16_t *coefficients, int width, int height, int bytesPerRow, NSData *dctData);
+
+NSData *generateInverseDctData(int quality);
+void performInverseDct(int16_t const *coefficients, uint8_t *pixels, int width, int height, int coefficientsPerRow, int bytesPerRow, NSData *idctData);
+
+#endif /* DctImageTransform_h */
--- a/submodules/TelegramUI/Components/AnimationCache/DCT/PublicHeaders/DCT/YuvConversion.h
+++ b/submodules/TelegramUI/Components/AnimationCache/DCT/PublicHeaders/DCT/YuvConversion.h
@@ -0,0 +1,9 @@
+#ifndef YuvConversion_h
+#define YuvConversion_h
+
+#import <Foundation/Foundation.h>
+
+void splitRGBAIntoYUVAPlanes(uint8_t const *argb, uint8_t *outY, uint8_t *outU, uint8_t *outV, uint8_t *outA, int width, int height, int bytesPerRow);
+void combineYUVAPlanesIntoARBB(uint8_t *argb, uint8_t const *inY, uint8_t const *inU, uint8_t const *inV, uint8_t const *inA, int width, int height, int bytesPerRow);
+
+#endif /* YuvConversion_h */
--- a/submodules/TelegramUI/Components/AnimationCache/DCT/Sources/DCT.m
+++ b/submodules/TelegramUI/Components/AnimationCache/DCT/Sources/DCT.m
@@ -0,0 +1,991 @@
+#import <DCT/DCT.h>
+
+typedef long JLONG;
+
+typedef unsigned char JSAMPLE;
+#define GETJSAMPLE(value)  ((int)(value))
+
+#define MAXJSAMPLE      255
+#define CENTERJSAMPLE   128
+
+typedef short JCOEF;
+
+typedef unsigned int JDIMENSION;
+
+#define JPEG_MAX_DIMENSION  65500L  /* a tad under 64K to prevent overflows */
+
+#define MULTIPLIER  short       /* prefer 16-bit with SIMD for parallelism */
+
+typedef MULTIPLIER IFAST_MULT_TYPE;  /* 16 bits is OK, use short if faster */
+#define IFAST_SCALE_BITS  2          /* fractional bits in scale factors */
+
+/* Various constants determining the sizes of things.
+ * All of these are specified by the JPEG standard, so don't change them
+ * if you want to be compatible.
+ */
+
+#define DCTSIZE             8   /* The basic DCT block is 8x8 samples */
+#define DCTSIZE2            64  /* DCTSIZE squared; # of elements in a block */
+#define NUM_QUANT_TBLS      4   /* Quantization tables are numbered 0..3 */
+#define NUM_HUFF_TBLS       4   /* Huffman tables are numbered 0..3 */
+#define NUM_ARITH_TBLS      16  /* Arith-coding tables are numbered 0..15 */
+#define MAX_COMPS_IN_SCAN   4   /* JPEG limit on # of components in one scan */
+#define MAX_SAMP_FACTOR     4   /* JPEG limit on sampling factors */
+/* Unfortunately, some bozo at Adobe saw no reason to be bound by the standard;
+ * the PostScript DCT filter can emit files with many more than 10 blocks/MCU.
+ * If you happen to run across such a file, you can up D_MAX_BLOCKS_IN_MCU
+ * to handle it.  We even let you do this from the jconfig.h file.  However,
+ * we strongly discourage changing C_MAX_BLOCKS_IN_MCU; just because Adobe
+ * sometimes emits noncompliant files doesn't mean you should too.
+ */
+#define C_MAX_BLOCKS_IN_MCU   10 /* compressor's limit on blocks per MCU */
+#ifndef D_MAX_BLOCKS_IN_MCU
+#define D_MAX_BLOCKS_IN_MCU   10 /* decompressor's limit on blocks per MCU */
+#endif
+
+
+/* Data structures for images (arrays of samples and of DCT coefficients).
+ */
+
+typedef JSAMPLE *JSAMPROW;      /* ptr to one image row of pixel samples. */
+typedef JSAMPROW *JSAMPARRAY;   /* ptr to some rows (a 2-D sample array) */
+typedef JSAMPARRAY *JSAMPIMAGE; /* a 3-D sample array: top index is color */
+
+typedef JCOEF JBLOCK[DCTSIZE2]; /* one block of coefficients */
+typedef JBLOCK *JBLOCKROW;      /* pointer to one row of coefficient blocks */
+typedef JBLOCKROW *JBLOCKARRAY;         /* a 2-D array of coefficient blocks */
+typedef JBLOCKARRAY *JBLOCKIMAGE;       /* a 3-D array of coefficient blocks */
+
+typedef JCOEF *JCOEFPTR;        /* useful in a couple of places */
+
+#include <arm_neon.h>
+
+/* jsimd_idct_ifast_neon() performs dequantization and a fast, not so accurate
+ * inverse DCT (Discrete Cosine Transform) on one block of coefficients.  It
+ * uses the same calculations and produces exactly the same output as IJG's
+ * original jpeg_idct_ifast() function, which can be found in jidctfst.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ *    0.082392200 =  2688 * 2^-15
+ *    0.414213562 = 13568 * 2^-15
+ *    0.847759065 = 27776 * 2^-15
+ *    0.613125930 = 20096 * 2^-15
+ *
+ * See jidctfst.c for further details of the IDCT algorithm.  Where possible,
+ * the variable names and comments here in jsimd_idct_ifast_neon() match up
+ * with those in jpeg_idct_ifast().
+ */
+
+#define PASS1_BITS  2
+
+#define F_0_082  2688
+#define F_0_414  13568
+#define F_0_847  27776
+#define F_0_613  20096
+
+
+__attribute__((aligned(16))) static const int16_t jsimd_idct_ifast_neon_consts[] = {
+  F_0_082, F_0_414, F_0_847, F_0_613
+};
+
+#define F_0_382  12544
+#define F_0_541  17792
+#define F_0_707  23168
+#define F_0_306  9984
+
+
+__attribute__((aligned(16))) static const int16_t jsimd_fdct_ifast_neon_consts[] = {
+  F_0_382, F_0_541, F_0_707, F_0_306
+};
+
+typedef short DCTELEM;          /* prefer 16 bit with SIMD for parallelism */
+typedef unsigned short UDCTELEM;
+typedef unsigned int UDCTELEM2;
+
+static void jsimd_fdct_ifast_neon(DCTELEM *data) {
+  /* In-place fast forward DCT of one 8x8 block (64 DCTELEMs at data).  The
+   * block is loaded with de-interleaving loads, then vuzp transposes it so
+   * each vector holds one column of samples and all rows are processed at once.
+   */
+  int16x8x4_t data1 = vld4q_s16(data);
+  int16x8x4_t data2 = vld4q_s16(data + 4 * DCTSIZE);
+
+  int16x8x2_t cols_04 = vuzpq_s16(data1.val[0], data2.val[0]);
+  int16x8x2_t cols_15 = vuzpq_s16(data1.val[1], data2.val[1]);
+  int16x8x2_t cols_26 = vuzpq_s16(data1.val[2], data2.val[2]);
+  int16x8x2_t cols_37 = vuzpq_s16(data1.val[3], data2.val[3]);
+  /* vuzp val[0] holds the even lanes (column k), val[1] the odd lanes (column k + 4). */
+  int16x8_t col0 = cols_04.val[0];
+  int16x8_t col1 = cols_15.val[0];
+  int16x8_t col2 = cols_26.val[0];
+  int16x8_t col3 = cols_37.val[0];
+  int16x8_t col4 = cols_04.val[1];
+  int16x8_t col5 = cols_15.val[1];
+  int16x8_t col6 = cols_26.val[1];
+  int16x8_t col7 = cols_37.val[1];
+
+  /* Pass 1: process rows. */
+
+  /* Load DCT conversion constants: lanes 0..3 = F_0_382, F_0_541, F_0_707, F_0_306. */
+  const int16x4_t consts = vld1_s16(jsimd_fdct_ifast_neon_consts);
+  /* Butterfly: sums and differences of mirrored columns (0/7, 1/6, 2/5, 3/4). */
+  int16x8_t tmp0 = vaddq_s16(col0, col7);
+  int16x8_t tmp7 = vsubq_s16(col0, col7);
+  int16x8_t tmp1 = vaddq_s16(col1, col6);
+  int16x8_t tmp6 = vsubq_s16(col1, col6);
+  int16x8_t tmp2 = vaddq_s16(col2, col5);
+  int16x8_t tmp5 = vsubq_s16(col2, col5);
+  int16x8_t tmp3 = vaddq_s16(col3, col4);
+  int16x8_t tmp4 = vsubq_s16(col3, col4);
+
+  /* Even part */
+  int16x8_t tmp10 = vaddq_s16(tmp0, tmp3);    /* phase 2 */
+  int16x8_t tmp13 = vsubq_s16(tmp0, tmp3);
+  int16x8_t tmp11 = vaddq_s16(tmp1, tmp2);
+  int16x8_t tmp12 = vsubq_s16(tmp1, tmp2);
+
+  col0 = vaddq_s16(tmp10, tmp11);             /* phase 3 */
+  col4 = vsubq_s16(tmp10, tmp11);
+  /* vqdmulh yields the saturated (2 * a * b) >> 16, i.e. a * F_x * 2^-15. */
+  int16x8_t z1 = vqdmulhq_lane_s16(vaddq_s16(tmp12, tmp13), consts, 2);
+  col2 = vaddq_s16(tmp13, z1);                /* phase 5 */
+  col6 = vsubq_s16(tmp13, z1);
+
+  /* Odd part */
+  tmp10 = vaddq_s16(tmp4, tmp5);              /* phase 2 */
+  tmp11 = vaddq_s16(tmp5, tmp6);
+  tmp12 = vaddq_s16(tmp6, tmp7);
+
+  int16x8_t z5 = vqdmulhq_lane_s16(vsubq_s16(tmp10, tmp12), consts, 0);
+  int16x8_t z2 = vqdmulhq_lane_s16(tmp10, consts, 1);
+  z2 = vaddq_s16(z2, z5);
+  int16x8_t z4 = vqdmulhq_lane_s16(tmp12, consts, 3);
+  z5 = vaddq_s16(tmp12, z5);
+  z4 = vaddq_s16(z4, z5);
+  int16x8_t z3 = vqdmulhq_lane_s16(tmp11, consts, 2);
+
+  int16x8_t z11 = vaddq_s16(tmp7, z3);        /* phase 5 */
+  int16x8_t z13 = vsubq_s16(tmp7, z3);
+
+  col5 = vaddq_s16(z13, z2);                  /* phase 6 */
+  col3 = vsubq_s16(z13, z2);
+  col1 = vaddq_s16(z11, z4);
+  col7 = vsubq_s16(z11, z4);
+
+  /* Transpose to work on columns in pass 2. */
+  int16x8x2_t cols_01 = vtrnq_s16(col0, col1);
+  int16x8x2_t cols_23 = vtrnq_s16(col2, col3);
+  int16x8x2_t cols_45 = vtrnq_s16(col4, col5);
+  int16x8x2_t cols_67 = vtrnq_s16(col6, col7);
+
+  int32x4x2_t cols_0145_l = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[0]),
+                                      vreinterpretq_s32_s16(cols_45.val[0]));
+  int32x4x2_t cols_0145_h = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[1]),
+                                      vreinterpretq_s32_s16(cols_45.val[1]));
+  int32x4x2_t cols_2367_l = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[0]),
+                                      vreinterpretq_s32_s16(cols_67.val[0]));
+  int32x4x2_t cols_2367_h = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[1]),
+                                      vreinterpretq_s32_s16(cols_67.val[1]));
+
+  int32x4x2_t rows_04 = vzipq_s32(cols_0145_l.val[0], cols_2367_l.val[0]);
+  int32x4x2_t rows_15 = vzipq_s32(cols_0145_h.val[0], cols_2367_h.val[0]);
+  int32x4x2_t rows_26 = vzipq_s32(cols_0145_l.val[1], cols_2367_l.val[1]);
+  int32x4x2_t rows_37 = vzipq_s32(cols_0145_h.val[1], cols_2367_h.val[1]);
+
+  int16x8_t row0 = vreinterpretq_s16_s32(rows_04.val[0]);
+  int16x8_t row1 = vreinterpretq_s16_s32(rows_15.val[0]);
+  int16x8_t row2 = vreinterpretq_s16_s32(rows_26.val[0]);
+  int16x8_t row3 = vreinterpretq_s16_s32(rows_37.val[0]);
+  int16x8_t row4 = vreinterpretq_s16_s32(rows_04.val[1]);
+  int16x8_t row5 = vreinterpretq_s16_s32(rows_15.val[1]);
+  int16x8_t row6 = vreinterpretq_s16_s32(rows_26.val[1]);
+  int16x8_t row7 = vreinterpretq_s16_s32(rows_37.val[1]);
+
+  /* Pass 2: process columns. */
+  /* Same butterfly network as pass 1, now applied to the transposed vectors. */
+  tmp0 = vaddq_s16(row0, row7);
+  tmp7 = vsubq_s16(row0, row7);
+  tmp1 = vaddq_s16(row1, row6);
+  tmp6 = vsubq_s16(row1, row6);
+  tmp2 = vaddq_s16(row2, row5);
+  tmp5 = vsubq_s16(row2, row5);
+  tmp3 = vaddq_s16(row3, row4);
+  tmp4 = vsubq_s16(row3, row4);
+
+  /* Even part */
+  tmp10 = vaddq_s16(tmp0, tmp3);              /* phase 2 */
+  tmp13 = vsubq_s16(tmp0, tmp3);
+  tmp11 = vaddq_s16(tmp1, tmp2);
+  tmp12 = vsubq_s16(tmp1, tmp2);
+
+  row0 = vaddq_s16(tmp10, tmp11);             /* phase 3 */
+  row4 = vsubq_s16(tmp10, tmp11);
+
+  z1 = vqdmulhq_lane_s16(vaddq_s16(tmp12, tmp13), consts, 2);
+  row2 = vaddq_s16(tmp13, z1);                /* phase 5 */
+  row6 = vsubq_s16(tmp13, z1);
+
+  /* Odd part */
+  tmp10 = vaddq_s16(tmp4, tmp5);              /* phase 2 */
+  tmp11 = vaddq_s16(tmp5, tmp6);
+  tmp12 = vaddq_s16(tmp6, tmp7);
+
+  z5 = vqdmulhq_lane_s16(vsubq_s16(tmp10, tmp12), consts, 0);
+  z2 = vqdmulhq_lane_s16(tmp10, consts, 1);
+  z2 = vaddq_s16(z2, z5);
+  z4 = vqdmulhq_lane_s16(tmp12, consts, 3);
+  z5 = vaddq_s16(tmp12, z5);
+  z4 = vaddq_s16(z4, z5);
+  z3 = vqdmulhq_lane_s16(tmp11, consts, 2);
+
+  z11 = vaddq_s16(tmp7, z3);                  /* phase 5 */
+  z13 = vsubq_s16(tmp7, z3);
+
+  row5 = vaddq_s16(z13, z2);                  /* phase 6 */
+  row3 = vsubq_s16(z13, z2);
+  row1 = vaddq_s16(z11, z4);
+  row7 = vsubq_s16(z11, z4);
+  /* Write the transformed block back over the input, one 8-element row per store. */
+  vst1q_s16(data + 0 * DCTSIZE, row0);
+  vst1q_s16(data + 1 * DCTSIZE, row1);
+  vst1q_s16(data + 2 * DCTSIZE, row2);
+  vst1q_s16(data + 3 * DCTSIZE, row3);
+  vst1q_s16(data + 4 * DCTSIZE, row4);
+  vst1q_s16(data + 5 * DCTSIZE, row5);
+  vst1q_s16(data + 6 * DCTSIZE, row6);
+  vst1q_s16(data + 7 * DCTSIZE, row7);
+}
+
+static void jsimd_idct_ifast_neon(void *dct_table, JCOEFPTR coef_block,
+                           JSAMPROW output_buf)
+{
+  IFAST_MULT_TYPE *quantptr = dct_table;      /* read as 8 rows of 8 int16 multipliers */
+  /* Dequantizes the 8x8 block at coef_block and writes 64 sample bytes to output_buf. */
+  /* Load DCT coefficients. */
+  int16x8_t row0 = vld1q_s16(coef_block + 0 * DCTSIZE);
+  int16x8_t row1 = vld1q_s16(coef_block + 1 * DCTSIZE);
+  int16x8_t row2 = vld1q_s16(coef_block + 2 * DCTSIZE);
+  int16x8_t row3 = vld1q_s16(coef_block + 3 * DCTSIZE);
+  int16x8_t row4 = vld1q_s16(coef_block + 4 * DCTSIZE);
+  int16x8_t row5 = vld1q_s16(coef_block + 5 * DCTSIZE);
+  int16x8_t row6 = vld1q_s16(coef_block + 6 * DCTSIZE);
+  int16x8_t row7 = vld1q_s16(coef_block + 7 * DCTSIZE);
+
+  /* Load quantization table values for DC coefficients. */
+  int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE);
+  /* Dequantize DC coefficients. */
+  row0 = vmulq_s16(row0, quant_row0);
+
+  /* Construct bitmap to test if all AC coefficients are 0. */
+  int16x8_t bitmap = vorrq_s16(row1, row2);
+  bitmap = vorrq_s16(bitmap, row3);
+  bitmap = vorrq_s16(bitmap, row4);
+  bitmap = vorrq_s16(bitmap, row5);
+  bitmap = vorrq_s16(bitmap, row6);
+  bitmap = vorrq_s16(bitmap, row7);
+  /* 64-bit lane 0 covers columns 0-3 of rows 1-7, lane 1 covers columns 4-7. */
+  int64_t left_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 0);
+  int64_t right_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 1);
+
+  /* Load IDCT conversion constants. */
+  const int16x4_t consts = vld1_s16(jsimd_idct_ifast_neon_consts);
+
+  if (left_ac_bitmap == 0 && right_ac_bitmap == 0) {
+    /* All AC coefficients are zero.
+     * Compute DC values and duplicate into vectors.
+     */
+    int16x8_t dcval = row0;
+    row1 = dcval;
+    row2 = dcval;
+    row3 = dcval;
+    row4 = dcval;
+    row5 = dcval;
+    row6 = dcval;
+    row7 = dcval;
+  } else if (left_ac_bitmap == 0) {
+    /* AC coefficients are zero for columns 0, 1, 2, and 3.
+     * Use DC values for these columns.
+     */
+    int16x4_t dcval = vget_low_s16(row0);
+
+    /* Commence regular fast IDCT computation for columns 4, 5, 6, and 7. */
+
+    /* Load quantization table. */
+    int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4);
+    int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4);
+    int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4);
+    int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE + 4);
+    int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE + 4);
+    int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4);
+    int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4);
+    /* The arithmetic below mirrors the full-width branch further down, on the high four lanes. */
+    /* Even part: dequantize DCT coefficients. */
+    int16x4_t tmp0 = vget_high_s16(row0);
+    int16x4_t tmp1 = vmul_s16(vget_high_s16(row2), quant_row2);
+    int16x4_t tmp2 = vmul_s16(vget_high_s16(row4), quant_row4);
+    int16x4_t tmp3 = vmul_s16(vget_high_s16(row6), quant_row6);
+
+    int16x4_t tmp10 = vadd_s16(tmp0, tmp2);   /* phase 3 */
+    int16x4_t tmp11 = vsub_s16(tmp0, tmp2);
+
+    int16x4_t tmp13 = vadd_s16(tmp1, tmp3);   /* phases 5-3 */
+    int16x4_t tmp1_sub_tmp3 = vsub_s16(tmp1, tmp3);
+    int16x4_t tmp12 = vqdmulh_lane_s16(tmp1_sub_tmp3, consts, 1);
+    tmp12 = vadd_s16(tmp12, tmp1_sub_tmp3);
+    tmp12 = vsub_s16(tmp12, tmp13);
+
+    tmp0 = vadd_s16(tmp10, tmp13);            /* phase 2 */
+    tmp3 = vsub_s16(tmp10, tmp13);
+    tmp1 = vadd_s16(tmp11, tmp12);
+    tmp2 = vsub_s16(tmp11, tmp12);
+
+    /* Odd part: dequantize DCT coefficients. */
+    int16x4_t tmp4 = vmul_s16(vget_high_s16(row1), quant_row1);
+    int16x4_t tmp5 = vmul_s16(vget_high_s16(row3), quant_row3);
+    int16x4_t tmp6 = vmul_s16(vget_high_s16(row5), quant_row5);
+    int16x4_t tmp7 = vmul_s16(vget_high_s16(row7), quant_row7);
+
+    int16x4_t z13 = vadd_s16(tmp6, tmp5);     /* phase 6 */
+    int16x4_t neg_z10 = vsub_s16(tmp5, tmp6);
+    int16x4_t z11 = vadd_s16(tmp4, tmp7);
+    int16x4_t z12 = vsub_s16(tmp4, tmp7);
+
+    tmp7 = vadd_s16(z11, z13);                /* phase 5 */
+    int16x4_t z11_sub_z13 = vsub_s16(z11, z13);
+    tmp11 = vqdmulh_lane_s16(z11_sub_z13, consts, 1);
+    tmp11 = vadd_s16(tmp11, z11_sub_z13);
+
+    int16x4_t z10_add_z12 = vsub_s16(z12, neg_z10);
+    int16x4_t z5 = vqdmulh_lane_s16(z10_add_z12, consts, 2);
+    z5 = vadd_s16(z5, z10_add_z12);
+    tmp10 = vqdmulh_lane_s16(z12, consts, 0);
+    tmp10 = vadd_s16(tmp10, z12);
+    tmp10 = vsub_s16(tmp10, z5);
+    tmp12 = vqdmulh_lane_s16(neg_z10, consts, 3);
+    tmp12 = vadd_s16(tmp12, vadd_s16(neg_z10, neg_z10));
+    tmp12 = vadd_s16(tmp12, z5);
+
+    tmp6 = vsub_s16(tmp12, tmp7);             /* phase 2 */
+    tmp5 = vsub_s16(tmp11, tmp6);
+    tmp4 = vadd_s16(tmp10, tmp5);
+
+    row0 = vcombine_s16(dcval, vadd_s16(tmp0, tmp7));
+    row7 = vcombine_s16(dcval, vsub_s16(tmp0, tmp7));
+    row1 = vcombine_s16(dcval, vadd_s16(tmp1, tmp6));
+    row6 = vcombine_s16(dcval, vsub_s16(tmp1, tmp6));
+    row2 = vcombine_s16(dcval, vadd_s16(tmp2, tmp5));
+    row5 = vcombine_s16(dcval, vsub_s16(tmp2, tmp5));
+    row4 = vcombine_s16(dcval, vadd_s16(tmp3, tmp4));
+    row3 = vcombine_s16(dcval, vsub_s16(tmp3, tmp4));
+  } else if (right_ac_bitmap == 0) {
+    /* AC coefficients are zero for columns 4, 5, 6, and 7.
+     * Use DC values for these columns.
+     */
+    int16x4_t dcval = vget_high_s16(row0);
+
+    /* Commence regular fast IDCT computation for columns 0, 1, 2, and 3. */
+
+    /* Load quantization table. */
+    int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE);
+    int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE);
+    int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE);
+    int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE);
+    int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE);
+    int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE);
+    int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE);
+    /* The arithmetic below mirrors the full-width branch further down, on the low four lanes. */
+    /* Even part: dequantize DCT coefficients. */
+    int16x4_t tmp0 = vget_low_s16(row0);
+    int16x4_t tmp1 = vmul_s16(vget_low_s16(row2), quant_row2);
+    int16x4_t tmp2 = vmul_s16(vget_low_s16(row4), quant_row4);
+    int16x4_t tmp3 = vmul_s16(vget_low_s16(row6), quant_row6);
+
+    int16x4_t tmp10 = vadd_s16(tmp0, tmp2);   /* phase 3 */
+    int16x4_t tmp11 = vsub_s16(tmp0, tmp2);
+
+    int16x4_t tmp13 = vadd_s16(tmp1, tmp3);   /* phases 5-3 */
+    int16x4_t tmp1_sub_tmp3 = vsub_s16(tmp1, tmp3);
+    int16x4_t tmp12 = vqdmulh_lane_s16(tmp1_sub_tmp3, consts, 1);
+    tmp12 = vadd_s16(tmp12, tmp1_sub_tmp3);
+    tmp12 = vsub_s16(tmp12, tmp13);
+
+    tmp0 = vadd_s16(tmp10, tmp13);            /* phase 2 */
+    tmp3 = vsub_s16(tmp10, tmp13);
+    tmp1 = vadd_s16(tmp11, tmp12);
+    tmp2 = vsub_s16(tmp11, tmp12);
+
+    /* Odd part: dequantize DCT coefficients. */
+    int16x4_t tmp4 = vmul_s16(vget_low_s16(row1), quant_row1);
+    int16x4_t tmp5 = vmul_s16(vget_low_s16(row3), quant_row3);
+    int16x4_t tmp6 = vmul_s16(vget_low_s16(row5), quant_row5);
+    int16x4_t tmp7 = vmul_s16(vget_low_s16(row7), quant_row7);
+
+    int16x4_t z13 = vadd_s16(tmp6, tmp5);     /* phase 6 */
+    int16x4_t neg_z10 = vsub_s16(tmp5, tmp6);
+    int16x4_t z11 = vadd_s16(tmp4, tmp7);
+    int16x4_t z12 = vsub_s16(tmp4, tmp7);
+
+    tmp7 = vadd_s16(z11, z13);                /* phase 5 */
+    int16x4_t z11_sub_z13 = vsub_s16(z11, z13);
+    tmp11 = vqdmulh_lane_s16(z11_sub_z13, consts, 1);
+    tmp11 = vadd_s16(tmp11, z11_sub_z13);
+
+    int16x4_t z10_add_z12 = vsub_s16(z12, neg_z10);
+    int16x4_t z5 = vqdmulh_lane_s16(z10_add_z12, consts, 2);
+    z5 = vadd_s16(z5, z10_add_z12);
+    tmp10 = vqdmulh_lane_s16(z12, consts, 0);
+    tmp10 = vadd_s16(tmp10, z12);
+    tmp10 = vsub_s16(tmp10, z5);
+    tmp12 = vqdmulh_lane_s16(neg_z10, consts, 3);
+    tmp12 = vadd_s16(tmp12, vadd_s16(neg_z10, neg_z10));
+    tmp12 = vadd_s16(tmp12, z5);
+
+    tmp6 = vsub_s16(tmp12, tmp7);             /* phase 2 */
+    tmp5 = vsub_s16(tmp11, tmp6);
+    tmp4 = vadd_s16(tmp10, tmp5);
+
+    row0 = vcombine_s16(vadd_s16(tmp0, tmp7), dcval);
+    row7 = vcombine_s16(vsub_s16(tmp0, tmp7), dcval);
+    row1 = vcombine_s16(vadd_s16(tmp1, tmp6), dcval);
+    row6 = vcombine_s16(vsub_s16(tmp1, tmp6), dcval);
+    row2 = vcombine_s16(vadd_s16(tmp2, tmp5), dcval);
+    row5 = vcombine_s16(vsub_s16(tmp2, tmp5), dcval);
+    row4 = vcombine_s16(vadd_s16(tmp3, tmp4), dcval);
+    row3 = vcombine_s16(vsub_s16(tmp3, tmp4), dcval);
+  } else {
+    /* Some AC coefficients are non-zero; full IDCT calculation required. */
+
+    /* Load quantization table. */
+    int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE);
+    int16x8_t quant_row2 = vld1q_s16(quantptr + 2 * DCTSIZE);
+    int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE);
+    int16x8_t quant_row4 = vld1q_s16(quantptr + 4 * DCTSIZE);
+    int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE);
+    int16x8_t quant_row6 = vld1q_s16(quantptr + 6 * DCTSIZE);
+    int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE);
+
+    /* Even part: dequantize DCT coefficients. */
+    int16x8_t tmp0 = row0;
+    int16x8_t tmp1 = vmulq_s16(row2, quant_row2);
+    int16x8_t tmp2 = vmulq_s16(row4, quant_row4);
+    int16x8_t tmp3 = vmulq_s16(row6, quant_row6);
+
+    int16x8_t tmp10 = vaddq_s16(tmp0, tmp2);   /* phase 3 */
+    int16x8_t tmp11 = vsubq_s16(tmp0, tmp2);
+    /* tmp12 = 1.414 * (tmp1 - tmp3) - tmp13; 1.414 is 0.414 (lane 1) plus the value itself. */
+    int16x8_t tmp13 = vaddq_s16(tmp1, tmp3);   /* phases 5-3 */
+    int16x8_t tmp1_sub_tmp3 = vsubq_s16(tmp1, tmp3);
+    int16x8_t tmp12 = vqdmulhq_lane_s16(tmp1_sub_tmp3, consts, 1);
+    tmp12 = vaddq_s16(tmp12, tmp1_sub_tmp3);
+    tmp12 = vsubq_s16(tmp12, tmp13);
+
+    tmp0 = vaddq_s16(tmp10, tmp13);            /* phase 2 */
+    tmp3 = vsubq_s16(tmp10, tmp13);
+    tmp1 = vaddq_s16(tmp11, tmp12);
+    tmp2 = vsubq_s16(tmp11, tmp12);
+
+    /* Odd part: dequantize DCT coefficients. */
+    int16x8_t tmp4 = vmulq_s16(row1, quant_row1);
+    int16x8_t tmp5 = vmulq_s16(row3, quant_row3);
+    int16x8_t tmp6 = vmulq_s16(row5, quant_row5);
+    int16x8_t tmp7 = vmulq_s16(row7, quant_row7);
+
+    int16x8_t z13 = vaddq_s16(tmp6, tmp5);     /* phase 6 */
+    int16x8_t neg_z10 = vsubq_s16(tmp5, tmp6);
+    int16x8_t z11 = vaddq_s16(tmp4, tmp7);
+    int16x8_t z12 = vsubq_s16(tmp4, tmp7);
+    /* tmp11 = 1.414 * (z11 - z13), built as 0.414 * x (lane 1) + x. */
+    tmp7 = vaddq_s16(z11, z13);                /* phase 5 */
+    int16x8_t z11_sub_z13 = vsubq_s16(z11, z13);
+    tmp11 = vqdmulhq_lane_s16(z11_sub_z13, consts, 1);
+    tmp11 = vaddq_s16(tmp11, z11_sub_z13);
+    /* z5 = 1.847 * (z10 + z12); tmp10 = 1.082 * z12 - z5; tmp12 = 2.613 * neg_z10 + z5. */
+    int16x8_t z10_add_z12 = vsubq_s16(z12, neg_z10);
+    int16x8_t z5 = vqdmulhq_lane_s16(z10_add_z12, consts, 2);
+    z5 = vaddq_s16(z5, z10_add_z12);
+    tmp10 = vqdmulhq_lane_s16(z12, consts, 0);
+    tmp10 = vaddq_s16(tmp10, z12);
+    tmp10 = vsubq_s16(tmp10, z5);
+    tmp12 = vqdmulhq_lane_s16(neg_z10, consts, 3);
+    tmp12 = vaddq_s16(tmp12, vaddq_s16(neg_z10, neg_z10));
+    tmp12 = vaddq_s16(tmp12, z5);
+
+    tmp6 = vsubq_s16(tmp12, tmp7);             /* phase 2 */
+    tmp5 = vsubq_s16(tmp11, tmp6);
+    tmp4 = vaddq_s16(tmp10, tmp5);
+
+    row0 = vaddq_s16(tmp0, tmp7);
+    row7 = vsubq_s16(tmp0, tmp7);
+    row1 = vaddq_s16(tmp1, tmp6);
+    row6 = vsubq_s16(tmp1, tmp6);
+    row2 = vaddq_s16(tmp2, tmp5);
+    row5 = vsubq_s16(tmp2, tmp5);
+    row4 = vaddq_s16(tmp3, tmp4);
+    row3 = vsubq_s16(tmp3, tmp4);
+  }
+
+  /* Transpose rows to work on columns in pass 2. */
+  int16x8x2_t rows_01 = vtrnq_s16(row0, row1);
+  int16x8x2_t rows_23 = vtrnq_s16(row2, row3);
+  int16x8x2_t rows_45 = vtrnq_s16(row4, row5);
+  int16x8x2_t rows_67 = vtrnq_s16(row6, row7);
+
+  int32x4x2_t rows_0145_l = vtrnq_s32(vreinterpretq_s32_s16(rows_01.val[0]),
+                                      vreinterpretq_s32_s16(rows_45.val[0]));
+  int32x4x2_t rows_0145_h = vtrnq_s32(vreinterpretq_s32_s16(rows_01.val[1]),
+                                      vreinterpretq_s32_s16(rows_45.val[1]));
+  int32x4x2_t rows_2367_l = vtrnq_s32(vreinterpretq_s32_s16(rows_23.val[0]),
+                                      vreinterpretq_s32_s16(rows_67.val[0]));
+  int32x4x2_t rows_2367_h = vtrnq_s32(vreinterpretq_s32_s16(rows_23.val[1]),
+                                      vreinterpretq_s32_s16(rows_67.val[1]));
+
+  int32x4x2_t cols_04 = vzipq_s32(rows_0145_l.val[0], rows_2367_l.val[0]);
+  int32x4x2_t cols_15 = vzipq_s32(rows_0145_h.val[0], rows_2367_h.val[0]);
+  int32x4x2_t cols_26 = vzipq_s32(rows_0145_l.val[1], rows_2367_l.val[1]);
+  int32x4x2_t cols_37 = vzipq_s32(rows_0145_h.val[1], rows_2367_h.val[1]);
+
+  int16x8_t col0 = vreinterpretq_s16_s32(cols_04.val[0]);
+  int16x8_t col1 = vreinterpretq_s16_s32(cols_15.val[0]);
+  int16x8_t col2 = vreinterpretq_s16_s32(cols_26.val[0]);
+  int16x8_t col3 = vreinterpretq_s16_s32(cols_37.val[0]);
+  int16x8_t col4 = vreinterpretq_s16_s32(cols_04.val[1]);
+  int16x8_t col5 = vreinterpretq_s16_s32(cols_15.val[1]);
+  int16x8_t col6 = vreinterpretq_s16_s32(cols_26.val[1]);
+  int16x8_t col7 = vreinterpretq_s16_s32(cols_37.val[1]);
+
+  /* 1-D IDCT, pass 2 */
+
+  /* Even part */
+  int16x8_t tmp10 = vaddq_s16(col0, col4);
+  int16x8_t tmp11 = vsubq_s16(col0, col4);
+
+  int16x8_t tmp13 = vaddq_s16(col2, col6);
+  int16x8_t col2_sub_col6 = vsubq_s16(col2, col6);
+  int16x8_t tmp12 = vqdmulhq_lane_s16(col2_sub_col6, consts, 1);
+  tmp12 = vaddq_s16(tmp12, col2_sub_col6);
+  tmp12 = vsubq_s16(tmp12, tmp13);
+
+  int16x8_t tmp0 = vaddq_s16(tmp10, tmp13);
+  int16x8_t tmp3 = vsubq_s16(tmp10, tmp13);
+  int16x8_t tmp1 = vaddq_s16(tmp11, tmp12);
+  int16x8_t tmp2 = vsubq_s16(tmp11, tmp12);
+
+  /* Odd part */
+  int16x8_t z13 = vaddq_s16(col5, col3);
+  int16x8_t neg_z10 = vsubq_s16(col3, col5);
+  int16x8_t z11 = vaddq_s16(col1, col7);
+  int16x8_t z12 = vsubq_s16(col1, col7);
+
+  int16x8_t tmp7 = vaddq_s16(z11, z13);      /* phase 5 */
+  int16x8_t z11_sub_z13 = vsubq_s16(z11, z13);
+  tmp11 = vqdmulhq_lane_s16(z11_sub_z13, consts, 1);
+  tmp11 = vaddq_s16(tmp11, z11_sub_z13);
+
+  int16x8_t z10_add_z12 = vsubq_s16(z12, neg_z10);
+  int16x8_t z5 = vqdmulhq_lane_s16(z10_add_z12, consts, 2);
+  z5 = vaddq_s16(z5, z10_add_z12);
+  tmp10 = vqdmulhq_lane_s16(z12, consts, 0);
+  tmp10 = vaddq_s16(tmp10, z12);
+  tmp10 = vsubq_s16(tmp10, z5);
+  tmp12 = vqdmulhq_lane_s16(neg_z10, consts, 3);
+  tmp12 = vaddq_s16(tmp12, vaddq_s16(neg_z10, neg_z10));
+  tmp12 = vaddq_s16(tmp12, z5);
+
+  int16x8_t tmp6 = vsubq_s16(tmp12, tmp7);   /* phase 2 */
+  int16x8_t tmp5 = vsubq_s16(tmp11, tmp6);
+  int16x8_t tmp4 = vaddq_s16(tmp10, tmp5);
+
+  col0 = vaddq_s16(tmp0, tmp7);
+  col7 = vsubq_s16(tmp0, tmp7);
+  col1 = vaddq_s16(tmp1, tmp6);
+  col6 = vsubq_s16(tmp1, tmp6);
+  col2 = vaddq_s16(tmp2, tmp5);
+  col5 = vsubq_s16(tmp2, tmp5);
+  col4 = vaddq_s16(tmp3, tmp4);
+  col3 = vsubq_s16(tmp3, tmp4);
+
+  /* Scale down by 2^(PASS1_BITS + 3) with a saturating shift, narrowing to signed 8-bit. */
+  int8x16_t cols_01_s8 = vcombine_s8(vqshrn_n_s16(col0, PASS1_BITS + 3),
+                                     vqshrn_n_s16(col1, PASS1_BITS + 3));
+  int8x16_t cols_45_s8 = vcombine_s8(vqshrn_n_s16(col4, PASS1_BITS + 3),
+                                     vqshrn_n_s16(col5, PASS1_BITS + 3));
+  int8x16_t cols_23_s8 = vcombine_s8(vqshrn_n_s16(col2, PASS1_BITS + 3),
+                                     vqshrn_n_s16(col3, PASS1_BITS + 3));
+  int8x16_t cols_67_s8 = vcombine_s8(vqshrn_n_s16(col6, PASS1_BITS + 3),
+                                     vqshrn_n_s16(col7, PASS1_BITS + 3));
+  /* Add CENTERJSAMPLE modulo 256: maps the saturated [-128, 127] onto [0, 255]. */
+  uint8x16_t cols_01 =
+    vreinterpretq_u8_s8
+      (vaddq_s8(cols_01_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
+  uint8x16_t cols_45 =
+    vreinterpretq_u8_s8
+      (vaddq_s8(cols_45_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
+  uint8x16_t cols_23 =
+    vreinterpretq_u8_s8
+      (vaddq_s8(cols_23_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
+  uint8x16_t cols_67 =
+    vreinterpretq_u8_s8
+      (vaddq_s8(cols_67_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
+
+  /* Transpose block to prepare for store. */
+  uint32x4x2_t cols_0415 = vzipq_u32(vreinterpretq_u32_u8(cols_01),
+                                     vreinterpretq_u32_u8(cols_45));
+  uint32x4x2_t cols_2637 = vzipq_u32(vreinterpretq_u32_u8(cols_23),
+                                     vreinterpretq_u32_u8(cols_67));
+
+  uint8x16x2_t cols_0145 = vtrnq_u8(vreinterpretq_u8_u32(cols_0415.val[0]),
+                                    vreinterpretq_u8_u32(cols_0415.val[1]));
+  uint8x16x2_t cols_2367 = vtrnq_u8(vreinterpretq_u8_u32(cols_2637.val[0]),
+                                    vreinterpretq_u8_u32(cols_2637.val[1]));
+  uint16x8x2_t rows_0426 = vtrnq_u16(vreinterpretq_u16_u8(cols_0145.val[0]),
+                                     vreinterpretq_u16_u8(cols_2367.val[0]));
+  uint16x8x2_t rows_1537 = vtrnq_u16(vreinterpretq_u16_u8(cols_0145.val[1]),
+                                     vreinterpretq_u16_u8(cols_2367.val[1]));
+
+  uint8x16_t rows_04 = vreinterpretq_u8_u16(rows_0426.val[0]);
+  uint8x16_t rows_15 = vreinterpretq_u8_u16(rows_1537.val[0]);
+  uint8x16_t rows_26 = vreinterpretq_u8_u16(rows_0426.val[1]);
+  uint8x16_t rows_37 = vreinterpretq_u8_u16(rows_1537.val[1]);
+  /* Output rows are packed back to back: row k starts at output_buf + DCTSIZE * k. */
+  JSAMPROW outptr0 = output_buf + DCTSIZE * 0;
+  JSAMPROW outptr1 = output_buf + DCTSIZE * 1;
+  JSAMPROW outptr2 = output_buf + DCTSIZE * 2;
+  JSAMPROW outptr3 = output_buf + DCTSIZE * 3;
+  JSAMPROW outptr4 = output_buf + DCTSIZE * 4;
+  JSAMPROW outptr5 = output_buf + DCTSIZE * 5;
+  JSAMPROW outptr6 = output_buf + DCTSIZE * 6;
+  JSAMPROW outptr7 = output_buf + DCTSIZE * 7;
+
+  /* Store the sample block: one 64-bit lane (8 bytes) per output row. */
+  vst1q_lane_u64((uint64_t *)outptr0, vreinterpretq_u64_u8(rows_04), 0);
+  vst1q_lane_u64((uint64_t *)outptr1, vreinterpretq_u64_u8(rows_15), 0);
+  vst1q_lane_u64((uint64_t *)outptr2, vreinterpretq_u64_u8(rows_26), 0);
+  vst1q_lane_u64((uint64_t *)outptr3, vreinterpretq_u64_u8(rows_37), 0);
+  vst1q_lane_u64((uint64_t *)outptr4, vreinterpretq_u64_u8(rows_04), 1);
+  vst1q_lane_u64((uint64_t *)outptr5, vreinterpretq_u64_u8(rows_15), 1);
+  vst1q_lane_u64((uint64_t *)outptr6, vreinterpretq_u64_u8(rows_26), 1);
+  vst1q_lane_u64((uint64_t *)outptr7, vreinterpretq_u64_u8(rows_37), 1);
+}
+
+static int flss(uint16_t val) {
+  int bit;
+  /* Find last set: returns the 1-based position of the highest set bit of val, or 0 if val is 0. */
+  bit = 16;
+
+  if (!val)
+    return 0;
+  /* Binary search in steps of 8, 4, 2, 1: if the top part is empty, lower bit and shift val up. */
+  if (!(val & 0xff00)) {
+    bit -= 8;
+    val <<= 8;
+  }
+  if (!(val & 0xf000)) {
+    bit -= 4;
+    val <<= 4;
+  }
+  if (!(val & 0xc000)) {
+    bit -= 2;
+    val <<= 2;
+  }
+  if (!(val & 0x8000)) {
+    bit -= 1;
+    val <<= 1;
+  }
+
+  return bit;
+}
+
+static int compute_reciprocal(uint16_t divisor, DCTELEM *dtbl) {
+  UDCTELEM2 fq, fr;
+  UDCTELEM c;
+  int b, r;
+  /* Fills dtbl[DCTSIZE2 * 0..3] with reciprocal, correction, scale and shift for divisor. */
+  if (divisor == 1) {
+    /* divisor == 1 means unquantized, so these reciprocal/correction/shift
+     * values will cause the C quantization algorithm to act like the
+     * identity function.  Since only the C quantization algorithm is used in
+     * these cases, the scale value is irrelevant.
+     */
+    dtbl[DCTSIZE2 * 0] = (DCTELEM)1;                        /* reciprocal */
+    dtbl[DCTSIZE2 * 1] = (DCTELEM)0;                        /* correction */
+    dtbl[DCTSIZE2 * 2] = (DCTELEM)1;                        /* scale */
+    dtbl[DCTSIZE2 * 3] = -(DCTELEM)(sizeof(DCTELEM) * 8);   /* shift */
+    return 0;
+  }
+  /* b is the zero-based index of divisor's highest set bit; r is the fixed-point shift (16 + b). */
+  b = flss(divisor) - 1;
+  r  = sizeof(DCTELEM) * 8 + b;
+
+  fq = ((UDCTELEM2)1 << r) / divisor;
+  fr = ((UDCTELEM2)1 << r) % divisor;
+
+  c = divisor / 2;                      /* for rounding */
+
+  if (fr == 0) {                        /* divisor is power of two */
+    /* fq will be one bit too large to fit in DCTELEM, so adjust */
+    fq >>= 1;
+    r--;
+  } else if (fr <= (divisor / 2U)) {    /* fractional part is < 0.5 */
+    c++;
+  } else {                              /* fractional part is > 0.5 */
+    fq++;
+  }
+
+  dtbl[DCTSIZE2 * 0] = (DCTELEM)fq;     /* reciprocal */
+  dtbl[DCTSIZE2 * 1] = (DCTELEM)c;      /* correction + roundfactor */
+#ifdef WITH_SIMD
+  dtbl[DCTSIZE2 * 2] = (DCTELEM)(1 << (sizeof(DCTELEM) * 8 * 2 - r)); /* scale */
+#else
+  dtbl[DCTSIZE2 * 2] = 1;               /* NOTE(review): WITH_SIMD is not defined in the visible source, so this branch is presumably the one built - confirm */
+#endif
+  dtbl[DCTSIZE2 * 3] = (DCTELEM)r - sizeof(DCTELEM) * 8; /* shift */
+  /* Returns 0 when the final shift r fits in 16 bits, 1 otherwise. */
+  if (r <= 16) return 0;
+  else return 1;
+}
+
+#define DESCALE(x, n)  RIGHT_SHIFT(x, n)
+
+
+/* Multiply a DCTELEM variable by a JLONG constant, and immediately
+ * descale to yield a DCTELEM result.
+ */
+
+#define MULTIPLY(var, const)  ((DCTELEM)DESCALE((var) * (const), CONST_BITS))
+#define MULTIPLY16V16(var1, var2)  ((var1) * (var2))
+
+/* Base luminance quantization table in natural (row-major) order; the values
+ * match the sample luminance table from Annex K of the JPEG specification.
+ * jpeg_set_quality() passes it to jpeg_add_quant_table() as the table that is
+ * scaled by the quality-derived percentage.
+ */
+static DCTELEM std_luminance_quant_tbl[DCTSIZE2] = {
+  16,  11,  10,  16,  24,  40,  51,  61,
+  12,  12,  14,  19,  26,  58,  60,  55,
+  14,  13,  16,  24,  40,  57,  69,  56,
+  14,  17,  22,  29,  51,  87,  80,  62,
+  18,  22,  37,  56,  68, 109, 103,  77,
+  24,  35,  55,  64,  81, 104, 113,  92,
+  49,  64,  78,  87, 103, 121, 120, 101,
+  72,  92,  95,  98, 112, 100, 103,  99
+};
+
+/* Translate a user-facing quality rating into the percentage by which the
+ * base quantization table is scaled.
+ *
+ * `quality` is expected in 0 (terrible) .. 100 (very good); values outside
+ * 1..100 are clamped into that range.
+ *
+ * Mapping:
+ *   quality  1..49   ->  5000 / quality   (table is scaled up)
+ *   quality 50..100  ->  200 - 2*quality  (100% at 50, 0% at 100)
+ * A result of 0 makes jpeg_add_quant_table clamp every entry to 1, i.e.
+ * minimum quantization loss.
+ */
+static int jpeg_quality_scaling(int quality)
+{
+  int clamped = quality;
+
+  /* The lower bound is 1, not 0, so the division below can never be by zero. */
+  if (clamped < 1) {
+    clamped = 1;
+  } else if (clamped > 100) {
+    clamped = 100;
+  }
+
+  return (clamped < 50) ? (5000 / clamped) : (200 - clamped * 2);
+}
+
+/* Fill `qtable` with `basicTable` scaled by `scale_factor`.
+ *
+ * qtable        Output table, DCTSIZE2 entries.
+ * basicTable    Input base table, DCTSIZE2 entries.
+ * scale_factor  Scaling percentage (100 reproduces the base table).
+ * forceBaseline When true, entries are additionally limited to 255 for JPEG
+ *               baseline compatibility.
+ *
+ * Every entry is rounded to the nearest integer and clamped to 1..32767.
+ */
+static void jpeg_add_quant_table(DCTELEM *qtable, DCTELEM *basicTable, int scale_factor, bool forceBaseline)
+{
+  for (int i = 0; i < DCTSIZE2; i++) {
+    /* +50 before dividing by 100 rounds to nearest instead of truncating. */
+    long temp = ((long)basicTable[i] * scale_factor + 50L) / 100L;
+
+    /* limit the values to the valid range */
+    if (temp <= 0L) {
+      temp = 1L;
+    }
+    if (temp > 32767L) {
+      temp = 32767L;            /* max quantizer needed for 12 bits */
+    }
+    if (forceBaseline && temp > 255L) {
+      temp = 255L;              /* limit to baseline range if requested */
+    }
+
+    /* The store happens for every entry; it is not part of the baseline
+     * clamp above. The value is within 1..32767 at this point.
+     */
+    qtable[i] = (DCTELEM)temp;
+  }
+}
+
+/* Build the quantization table for a user-facing `quality` rating (0..100).
+ *
+ * The rating is converted to a scaling percentage, which is then applied to
+ * std_luminance_quant_tbl; the result is written to `qtable` (DCTSIZE2
+ * entries). Baseline clamping to 255 is not requested.
+ */
+static void jpeg_set_quality(DCTELEM *qtable, int quality)
+{
+  const int scalePercent = jpeg_quality_scaling(quality);
+
+  jpeg_add_quant_table(qtable, std_luminance_quant_tbl, scalePercent, false);
+}
+
+/* Build the divisor table used by quantize() from a quantization table.
+ *
+ * dtbl    Output; compute_reciprocal() writes four sub-tables of DCTSIZE2
+ *         entries each, so the buffer must hold 4 * DCTSIZE2 DCTELEMs.
+ * qtable  Input quantization table, DCTSIZE2 entries.
+ *
+ * Each quantizer is multiplied by the matching `aanscales` factor before the
+ * reciprocal is computed.
+ * NOTE(review): the descale by CONST_BITS - 3 rather than CONST_BITS
+ * presumably accounts for a factor of 8 in the forward DCT output — confirm
+ * against the DCT implementation.
+ */
+static void getDivisors(DCTELEM *dtbl, DCTELEM *qtable) {
+/* These macros are defined here but are also relied on by code later in the
+ * file (DESCALE expands to RIGHT_SHIFT), so they must stay.
+ */
+#define CONST_BITS  14
+#define RIGHT_SHIFT(x, shft)    ((x) >> (shft))
+    
+    static const int16_t aanscales[DCTSIZE2] = {
+      /* precomputed values scaled up by 14 bits */
+      16384, 22725, 21407, 19266, 16384, 12873,  8867,  4520,
+      22725, 31521, 29692, 26722, 22725, 17855, 12299,  6270,
+      21407, 29692, 27969, 25172, 21407, 16819, 11585,  5906,
+      19266, 26722, 25172, 22654, 19266, 15137, 10426,  5315,
+      16384, 22725, 21407, 19266, 16384, 12873,  8867,  4520,
+      12873, 17855, 16819, 15137, 12873, 10114,  6967,  3552,
+       8867, 12299, 11585, 10426,  8867,  6967,  4799,  2446,
+       4520,  6270,  5906,  5315,  4520,  3552,  2446,  1247
+    };
+
+    for (int i = 0; i < DCTSIZE2; i++) {
+      const JLONG scaledQuantizer =
+          DESCALE(MULTIPLY16V16((JLONG)qtable[i], (JLONG)aanscales[i]),
+                  CONST_BITS - 3);
+
+      /* The return value is deliberately ignored: quantize() is the only
+       * quantizer in this file, so there is nothing to select between.
+       */
+      (void)compute_reciprocal((uint16_t)scaledQuantizer, &dtbl[i]);
+    }
+}
+
+/* Quantize one block of DCT output.
+ *
+ * coef_block  Output, DCTSIZE2 quantized coefficients.
+ * divisors    Table laid out as sub-tables of DCTSIZE2 entries; sub-table 0
+ *             holds reciprocals, 1 corrections and 3 shifts.
+ * workspace   Input, DCTSIZE2 DCT coefficients.
+ *
+ * Instead of dividing, each magnitude has the correction added, is multiplied
+ * by the reciprocal and shifted right; the sign is restored afterwards.
+ */
+static void quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{
+  for (int i = 0; i < DCTSIZE2; i++) {
+    DCTELEM value = workspace[i];
+    const UDCTELEM recip = divisors[i + DCTSIZE2 * 0];
+    const UDCTELEM corr = divisors[i + DCTSIZE2 * 1];
+    const int shift = divisors[i + DCTSIZE2 * 3];
+    const int isNegative = (value < 0);
+    UDCTELEM2 product;
+
+    /* Work on the magnitude so that rounding is symmetric around zero. */
+    if (isNegative) {
+      value = -value;
+    }
+
+    product = (UDCTELEM2)(value + corr) * recip;
+    product >>= shift + sizeof(DCTELEM) * 8;
+    value = (DCTELEM)product;
+
+    if (isNegative) {
+      value = -value;
+    }
+
+    coef_block[i] = (JCOEF)value;
+  }
+}
+
+/// Builds the table that performForwardDct() needs for a given quality.
+///
+/// @param quality Quality rating, 0..100 (clamped by jpeg_quality_scaling).
+/// @return Data holding 4 * DCTSIZE2 DCTELEM values, filled by getDivisors().
+NSData *generateForwardDctData(int quality) {
+    DCTELEM quantizationTable[DCTSIZE2];
+    jpeg_set_quality(quantizationTable, quality);
+    
+    NSMutableData *divisorTable = [[NSMutableData alloc] initWithLength:DCTSIZE2 * 4 * sizeof(DCTELEM)];
+    getDivisors((DCTELEM *)divisorTable.mutableBytes, quantizationTable);
+    
+    return divisorTable;
+}
+
+/// Builds the multiplier table that performInverseDct() hands to the inverse
+/// DCT for a given quality.
+///
+/// Each quantizer is multiplied by the matching `aanscales` factor and
+/// descaled by CONST_BITS - IFAST_SCALE_BITS.
+///
+/// @param quality Quality rating, 0..100 (clamped by jpeg_quality_scaling).
+/// @return Data holding DCTSIZE2 IFAST_MULT_TYPE values.
+NSData *generateInverseDctData(int quality) {
+#define CONST_BITS  14
+    static const int16_t aanscales[DCTSIZE2] = {
+        /* precomputed values scaled up by 14 bits */
+        16384, 22725, 21407, 19266, 16384, 12873,  8867,  4520,
+        22725, 31521, 29692, 26722, 22725, 17855, 12299,  6270,
+        21407, 29692, 27969, 25172, 21407, 16819, 11585,  5906,
+        19266, 26722, 25172, 22654, 19266, 15137, 10426,  5315,
+        16384, 22725, 21407, 19266, 16384, 12873,  8867,  4520,
+        12873, 17855, 16819, 15137, 12873, 10114,  6967,  3552,
+        8867, 12299, 11585, 10426,  8867,  6967,  4799,  2446,
+        4520,  6270,  5906,  5315,  4520,  3552,  2446,  1247
+    };
+    
+    DCTELEM quantizationTable[DCTSIZE2];
+    jpeg_set_quality(quantizationTable, quality);
+    
+    NSMutableData *multiplierData = [[NSMutableData alloc] initWithLength:DCTSIZE2 * sizeof(IFAST_MULT_TYPE)];
+    IFAST_MULT_TYPE *multipliers = (IFAST_MULT_TYPE *)multiplierData.mutableBytes;
+    
+    for (int index = 0; index < DCTSIZE2; index++) {
+        const JLONG scaledQuantizer = MULTIPLY16V16((JLONG)quantizationTable[index], (JLONG)aanscales[index]);
+        multipliers[index] = (IFAST_MULT_TYPE)DESCALE(scaledQuantizer, CONST_BITS - IFAST_SCALE_BITS);
+    }
+    
+    return multiplierData;
+}
+
+/* Zigzag scan table: entry k is the natural (row-major) index inside an 8x8
+ * block of the k-th coefficient in zigzag order.
+ *
+ * performForwardDct() reads quantized coefficients through this table, so the
+ * k-th stored value of a block is coefBlock[zigZagInv[k]].
+ * Despite the different name, the contents are identical to `zigZag`.
+ */
+static const int zigZagInv[DCTSIZE2] = {
+    0,1,8,16,9,2,3,10,
+    17,24,32,25,18,11,4,5,
+    12,19,26,33,40,48,41,34,
+    27,20,13,6,7,14,21,28,
+    35,42,49,56,57,50,43,36,
+    29,22,15,23,30,37,44,51,
+    58,59,52,45,38,31,39,46,
+    53,60,61,54,47,55,62,63
+};
+
+/* Zigzag scan table: entry k is the natural (row-major) index inside an 8x8
+ * block of the k-th coefficient in zigzag order.
+ *
+ * performInverseDct() writes through this table (coefficientBlock[zigZag[k]]
+ * receives the k-th stored value), which undoes the reordering applied by
+ * performForwardDct(). The contents are identical to `zigZagInv`.
+ */
+static const int zigZag[DCTSIZE2] = {
+    0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5, 12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13, 6, 7, 14, 21, 28, 35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63
+};
+
+/// Transforms a single 8-bit plane into quantized DCT coefficients, one 8x8
+/// block at a time.
+///
+/// For every block the samples are level-shifted by CENTERJSAMPLE, run
+/// through jsimd_fdct_ifast_neon(), quantized, and stored in zigzag order at
+/// the block's own position in `coefficients`.
+///
+/// @param pixels Source plane; rows are `bytesPerRow` bytes apart.
+/// @param coefficients Destination coefficients.
+/// @param width Plane width; blocks are read in full, so this is assumed to
+///   be a multiple of DCTSIZE (NOTE(review): not checked here).
+/// @param height Plane height; same assumption as `width`.
+/// @param bytesPerRow Row stride of `pixels`. NOTE(review): the same value is
+///   also used as the row stride, in elements, of `coefficients`, while
+///   performInverseDct() takes a separate `coefficientsPerRow` — confirm that
+///   callers size the coefficient buffer accordingly.
+/// @param dctData Table produced by generateForwardDctData().
+void performForwardDct(uint8_t const *pixels, int16_t *coefficients, int width, int height, int bytesPerRow, NSData *dctData) {
+    DCTELEM *divisorTable = (DCTELEM *)dctData.bytes;
+    
+    DCTELEM sampleBlock[DCTSIZE2];
+    JCOEF quantizedBlock[DCTSIZE2];
+    
+    for (int top = 0; top < height; top += DCTSIZE) {
+        for (int left = 0; left < width; left += DCTSIZE) {
+            /* Gather the block and centre the samples around zero. */
+            for (int row = 0; row < DCTSIZE; row++) {
+                const uint8_t *sourceRow = pixels + (top + row) * bytesPerRow + left;
+                DCTELEM *blockRow = sampleBlock + row * DCTSIZE;
+                for (int col = 0; col < DCTSIZE; col++) {
+                    blockRow[col] = ((DCTELEM)sourceRow[col]) - CENTERJSAMPLE;
+                }
+            }
+            
+            jsimd_fdct_ifast_neon(sampleBlock);
+            quantize(quantizedBlock, divisorTable, sampleBlock);
+            
+            /* Scatter the block back, reordering into zigzag order. */
+            for (int row = 0; row < DCTSIZE; row++) {
+                int16_t *targetRow = coefficients + (top + row) * bytesPerRow + left;
+                for (int col = 0; col < DCTSIZE; col++) {
+                    targetRow[col] = quantizedBlock[zigZagInv[row * DCTSIZE + col]];
+                }
+            }
+        }
+    }
+}
+
+/// Reconstructs a single 8-bit plane from quantized DCT coefficients, one 8x8
+/// block at a time.
+///
+/// For every block the stored values are put back into natural order via
+/// `zigZag`, passed to jsimd_idct_ifast_neon() together with the multiplier
+/// table, and the resulting samples are copied to the block's position in
+/// `pixels`.
+///
+/// @param coefficients Source coefficients; rows are `coefficientsPerRow`
+///   elements apart.
+/// @param pixels Destination plane; rows are `bytesPerRow` bytes apart.
+/// @param width Plane width; blocks are processed in full, so this is assumed
+///   to be a multiple of DCTSIZE (NOTE(review): not checked here).
+/// @param height Plane height; same assumption as `width`.
+/// @param coefficientsPerRow Row stride of `coefficients`, in elements.
+/// @param bytesPerRow Row stride of `pixels`, in bytes.
+/// @param idctData Table produced by generateInverseDctData().
+void performInverseDct(int16_t const *coefficients, uint8_t *pixels, int width, int height, int coefficientsPerRow, int bytesPerRow, NSData *idctData) {
+    IFAST_MULT_TYPE *multiplierTable = (IFAST_MULT_TYPE *)idctData.bytes;
+    
+    DCTELEM naturalOrderBlock[DCTSIZE2];
+    JSAMPLE sampleBlock[DCTSIZE2];
+    
+    for (int top = 0; top < height; top += DCTSIZE) {
+        for (int left = 0; left < width; left += DCTSIZE) {
+            /* Gather the block, undoing the zigzag reordering. */
+            for (int row = 0; row < DCTSIZE; row++) {
+                const int16_t *sourceRow = coefficients + (top + row) * coefficientsPerRow + left;
+                for (int col = 0; col < DCTSIZE; col++) {
+                    naturalOrderBlock[zigZag[row * DCTSIZE + col]] = sourceRow[col];
+                }
+            }
+            
+            jsimd_idct_ifast_neon(multiplierTable, naturalOrderBlock, sampleBlock);
+            
+            /* Copy the reconstructed samples into the destination plane. */
+            for (int row = 0; row < DCTSIZE; row++) {
+                uint8_t *targetRow = pixels + (top + row) * bytesPerRow + left;
+                const JSAMPLE *blockRow = sampleBlock + row * DCTSIZE;
+                for (int col = 0; col < DCTSIZE; col++) {
+                    targetRow[col] = blockRow[col];
+                }
+            }
+        }
+    }
+}
--- a/submodules/TelegramUI/Components/AnimationCache/DCT/Sources/YuvConversion.m
+++ b/submodules/TelegramUI/Components/AnimationCache/DCT/Sources/YuvConversion.m
@@ -0,0 +1,99 @@
+#import <DCT/YuvConversion.h>
+
+#import <Foundation/Foundation.h>
+#import <Accelerate/Accelerate.h>
+
+// Channel permutation handed to both vImage YpCbCr conversions in this file.
+// { 3, 2, 1, 0 } reverses the channel order relative to vImage's ARGB layout,
+// i.e. the interleaved buffers are treated as BGRA in memory.
+static uint8_t permuteMap[4] = { 3, 2, 1, 0};
+
+/// Splits an interleaved 4-channel 8-bit image into planar Y, U, V
+/// (4:2:0 subsampled) and full-resolution alpha.
+///
+/// The conversion uses the ITU-R 709-2 matrix with the pixel range
+/// { 0, 128, 255, 255, 255, 1, 255, 0 }, generated once and reused.
+///
+/// Note on plane naming: the buffer passed to vImage as the Cb destination is
+/// `outV` and the Cr destination is `outU`. combineYUVAPlanesIntoARBB() uses
+/// the same assignment, so a round trip is consistent.
+///
+/// @param argb Source pixels; rows are `bytesPerRow` bytes apart.
+/// @param outY Luma plane, `width` x `height`, tightly packed.
+/// @param outU Chroma plane, `width / 2` x `height / 2`, tightly packed.
+/// @param outV Chroma plane, `width / 2` x `height / 2`, tightly packed.
+/// @param outA Alpha plane, `width` x `height`, tightly packed; taken from
+///   channel index 3 of the source.
+/// @param width Image width in pixels.
+/// @param height Image height in pixels.
+/// @param bytesPerRow Row stride of `argb` in bytes.
+///
+/// If the YpCbCr conversion reports an error the function returns without
+/// extracting alpha.
+void splitRGBAIntoYUVAPlanes(uint8_t const *argb, uint8_t *outY, uint8_t *outU, uint8_t *outV, uint8_t *outA, int width, int height, int bytesPerRow) {
+    static vImage_ARGBToYpCbCr conversionInfo;
+    static dispatch_once_t conversionInfoToken;
+    dispatch_once(&conversionInfoToken, ^{
+        vImage_YpCbCrPixelRange range = (vImage_YpCbCrPixelRange){ 0, 128, 255, 255, 255, 1, 255, 0 };
+        vImageConvert_ARGBToYpCbCr_GenerateConversion(kvImage_ARGBToYpCbCrMatrix_ITU_R_709_2, &range, &conversionInfo, kvImageARGB8888, kvImage420Yp8_Cb8_Cr8, 0);
+    });
+    
+    vImage_Buffer source = { .data = (void *)argb, .height = height, .width = width, .rowBytes = bytesPerRow };
+    vImage_Buffer lumaTarget = { .data = outY, .height = height, .width = width, .rowBytes = width };
+    vImage_Buffer alphaTarget = { .data = outA, .height = height, .width = width, .rowBytes = width };
+    
+    // Chroma planes are subsampled by two in both directions.
+    vImage_Buffer crTarget = { .data = outU, .height = height / 2, .width = width / 2, .rowBytes = width / 2 };
+    vImage_Buffer cbTarget = { .data = outV, .height = height / 2, .width = width / 2, .rowBytes = width / 2 };
+    
+    if (vImageConvert_ARGB8888To420Yp8_Cb8_Cr8(&source, &lumaTarget, &cbTarget, &crTarget, &conversionInfo, permuteMap, kvImageDoNotTile) != kvImageNoError) {
+        return;
+    }
+    
+    vImageExtractChannel_ARGB8888(&source, &alphaTarget, 3, kvImageDoNotTile);
+}
+
+/// Rebuilds an interleaved 4-channel 8-bit image from planar Y, U, V
+/// (4:2:0 subsampled) and full-resolution alpha.
+///
+/// The conversion uses the ITU-R 709-2 matrix with the pixel range
+/// { 0, 128, 255, 255, 255, 1, 255, 0 }, generated once and reused. The
+/// colour conversion writes an opaque alpha of 255, which is then replaced by
+/// the values from `inA`.
+///
+/// Note on plane naming: `inV` is passed to vImage as the Cb source and `inU`
+/// as the Cr source, mirroring splitRGBAIntoYUVAPlanes().
+///
+/// @param argb Destination pixels; rows are `bytesPerRow` bytes apart.
+/// @param inY Luma plane, `width` x `height`, tightly packed.
+/// @param inU Chroma plane, `width / 2` x `height / 2`, tightly packed.
+/// @param inV Chroma plane, `width / 2` x `height / 2`, tightly packed.
+/// @param inA Alpha plane, `width` x `height`, tightly packed.
+/// @param width Image width in pixels.
+/// @param height Image height in pixels.
+/// @param bytesPerRow Row stride of `argb` in bytes.
+///
+/// If the colour conversion reports an error the function returns without
+/// touching the alpha channel, matching splitRGBAIntoYUVAPlanes().
+void combineYUVAPlanesIntoARBB(uint8_t *argb, uint8_t const *inY, uint8_t const *inU, uint8_t const *inV, uint8_t const *inA, int width, int height, int bytesPerRow) {
+    static vImage_YpCbCrToARGB info;
+    static dispatch_once_t onceToken;
+    dispatch_once(&onceToken, ^{
+        vImage_YpCbCrPixelRange pixelRange = (vImage_YpCbCrPixelRange){ 0, 128, 255, 255, 255, 1, 255, 0 };
+        vImageConvert_YpCbCrToARGB_GenerateConversion(kvImage_YpCbCrToARGBMatrix_ITU_R_709_2, &pixelRange, &info, kvImage420Yp8_Cb8_Cr8, kvImageARGB8888, 0);
+    });
+    
+    vImage_Buffer destArgb;
+    destArgb.data = (void *)argb;
+    destArgb.width = width;
+    destArgb.height = height;
+    destArgb.rowBytes = bytesPerRow;
+    
+    vImage_Buffer srcYp;
+    srcYp.data = (void *)inY;
+    srcYp.width = width;
+    srcYp.height = height;
+    srcYp.rowBytes = width;
+    
+    vImage_Buffer srcCr;
+    srcCr.data = (void *)inU;
+    srcCr.width = width / 2;
+    srcCr.height = height / 2;
+    srcCr.rowBytes = width / 2;
+    
+    vImage_Buffer srcCb;
+    srcCb.data = (void *)inV;
+    srcCb.width = width / 2;
+    srcCb.height = height / 2;
+    srcCb.rowBytes = width / 2;
+    
+    vImage_Buffer srcA;
+    srcA.data = (void *)inA;
+    srcA.width = width;
+    srcA.height = height;
+    srcA.rowBytes = width;
+    
+    vImage_Error error = vImageConvert_420Yp8_Cb8_Cr8ToARGB8888(&srcYp, &srcCb, &srcCr, &destArgb, &info, permuteMap, 255, kvImageDoNotTile);
+    if (error != kvImageNoError) {
+        // Without valid colour data there is nothing meaningful to merge
+        // alpha into.
+        return;
+    }
+    
+    // Copy mask bit 0 selects the last of the four interleaved channels,
+    // which is the same byte splitRGBAIntoYUVAPlanes() extracts as alpha
+    // (channel index 3).
+    vImageOverwriteChannels_ARGB8888(&srcA, &destArgb, &destArgb, 1 << 0, kvImageDoNotTile);
+}
--- a/submodules/TelegramUI/Components/AnimationCache/Sources/AnimationCache.swift
+++ b/submodules/TelegramUI/Components/AnimationCache/Sources/AnimationCache.swift
@@ -3,6 +3,7 @@ import UIKit
 import SwiftSignalKit
 import CryptoUtils
 import ManagedFile
+import Compression

 public final class AnimationCacheItemFrame {
    public enum Format {
@@ -25,19 +26,51 @@ public final class AnimationCacheItemFrame {
 public final class AnimationCacheItem {
    public let numFrames: Int
    private let getFrameImpl: (Int) -> AnimationCacheItemFrame?
+    private let getFrameIndexImpl: (Double) -> Int
    
-    public init(numFrames: Int, getFrame: @escaping (Int) -> AnimationCacheItemFrame?) {
+    public init(numFrames: Int, getFrame: @escaping (Int) -> AnimationCacheItemFrame?, getFrameIndexImpl: @escaping (Double) -> Int) {
        self.numFrames = numFrames
        self.getFrameImpl = getFrame
+        self.getFrameIndexImpl = getFrameIndexImpl
    }
    
    public func getFrame(index: Int) -> AnimationCacheItemFrame? {
        return self.getFrameImpl(index)
    }
+    
+    public func getFrame(at duration: Double) -> AnimationCacheItemFrame? {
+        let index = self.getFrameIndexImpl(duration)
+        return self.getFrameImpl(index)
+    }
+}
+
+/// Describes the pixel buffer handed to the drawing block of
+/// `AnimationCacheItemWriter.add(with:proposedWidth:proposedHeight:duration:)`.
+public struct AnimationCacheItemDrawingSurface {
+    /// Start of the pixel bytes the drawing block renders into.
+    public let argb: UnsafeMutablePointer<UInt8>
+    /// Surface width in pixels.
+    public let width: Int
+    /// Surface height in pixels.
+    public let height: Int
+    /// Row stride of the buffer in bytes.
+    public let bytesPerRow: Int
+    /// Total size of the buffer in bytes.
+    public let length: Int
+    
+    // Internal initializer: surfaces are only created by the cache writer.
+    init(
+        argb: UnsafeMutablePointer<UInt8>,
+        width: Int,
+        height: Int,
+        bytesPerRow: Int,
+        length: Int
+    ) {
+        self.argb = argb
+        self.width = width
+        self.height = height
+        self.bytesPerRow = bytesPerRow
+        self.length = length
+    }
 }

 public protocol AnimationCacheItemWriter: AnyObject {
-    func add(bytes: UnsafeRawPointer, length: Int, width: Int, height: Int, bytesPerRow: Int, duration: Double)
+    var queue: Queue { get }
+    var isCancelled: Bool { get }
+    
+    func add(with drawingBlock: (AnimationCacheItemDrawingSurface) -> Void, proposedWidth: Int, proposedHeight: Int, duration: Double)
    func finish()
 }

@@ -53,7 +86,8 @@ public final class AnimationCacheItemResult {

 public protocol AnimationCache: AnyObject {
    func get(sourceId: String, size: CGSize, fetch: @escaping (CGSize, AnimationCacheItemWriter) -> Disposable) -> Signal<AnimationCacheItemResult, NoError>
-    func getSynchronously(sourceId: String, size: CGSize) -> AnimationCacheItem?
+    func getFirstFrameSynchronously(sourceId: String, size: CGSize) -> AnimationCacheItem?
+    func getFirstFrame(queue: Queue, sourceId: String, size: CGSize, completion: @escaping (AnimationCacheItem?) -> Void) -> Disposable
 }

 private func md5Hash(_ string: String) -> String {
@@ -80,11 +114,82 @@ private func itemSubpath(hashString: String) -> (directory: String, fileName: St
    return (directory, hashString)
 }

+/// Rounds `numToRound` up to the next multiple of `multiple`.
+///
+/// Values that are already a multiple are returned unchanged, as is every
+/// value when `multiple` is 0.
+private func roundUp(_ numToRound: Int, multiple: Int) -> Int {
+    guard multiple != 0 else {
+        return numToRound
+    }
+    
+    let remainder = numToRound % multiple
+    return remainder == 0 ? numToRound : numToRound + multiple - remainder
+}
+
+/// Compresses `data` with LZFSE.
+///
+/// - Parameters:
+///   - data: Bytes to compress.
+///   - addSizeHeader: When `true`, the result starts with the uncompressed
+///     size as a 4-byte `UInt32` in host byte order, followed by the
+///     compressed payload.
+/// - Returns: The compressed bytes, or `nil` when the input is empty, the
+///   scratch buffer cannot be allocated, the size does not fit the header, or
+///   the encoder produces no output.
+private func compressData(data: Data, addSizeHeader: Bool = false) -> Data? {
+    let algorithm: compression_algorithm = COMPRESSION_LZFSE
+    
+    // The header stores the size as UInt32; refuse input that would not fit
+    // instead of trapping on the conversion.
+    guard let headerValue = UInt32(exactly: data.count) ?? (addSizeHeader ? nil : 0) else {
+        return nil
+    }
+    
+    guard let scratchData = malloc(compression_encode_scratch_buffer_size(algorithm)) else {
+        return nil
+    }
+    defer {
+        free(scratchData)
+    }
+    
+    let headerSize = addSizeHeader ? 4 : 0
+    // Extra slack so that incompressible input still fits.
+    var compressedData = Data(count: headerSize + data.count + 16 * 1024)
+    let resultSize = compressedData.withUnsafeMutableBytes { buffer -> Int in
+        guard let bytes = buffer.baseAddress?.assumingMemoryBound(to: UInt8.self) else {
+            return 0
+        }
+        
+        if addSizeHeader {
+            var decompressedSize: UInt32 = headerValue
+            memcpy(bytes, &decompressedSize, 4)
+        }
+        
+        return data.withUnsafeBytes { sourceBuffer -> Int in
+            // An empty Data exposes no base address; treat it as a failure
+            // rather than force-unwrapping.
+            guard let sourceBytes = sourceBuffer.baseAddress?.assumingMemoryBound(to: UInt8.self) else {
+                return 0
+            }
+            return compression_encode_buffer(bytes.advanced(by: headerSize), buffer.count - headerSize, sourceBytes, sourceBuffer.count, scratchData, algorithm)
+        }
+    }
+    
+    if resultSize <= 0 {
+        return nil
+    }
+    compressedData.count = headerSize + resultSize
+    return compressedData
+}
+
+/// Decompresses an LZFSE payload stored inside `data`.
+///
+/// - Parameters:
+///   - data: Buffer containing the compressed payload.
+///   - range: Byte offsets of the payload, relative to the start of `data`.
+///   - decompressedSize: Capacity of the output buffer; the result is
+///     truncated to the number of bytes actually produced.
+/// - Returns: The decompressed bytes, or `nil` when the arguments are out of
+///   bounds, the scratch buffer cannot be allocated, or the decoder produces
+///   no output.
+private func decompressData(data: Data, range: Range<Int>, decompressedSize: Int) -> Data? {
+    let algorithm: compression_algorithm = COMPRESSION_LZFSE
+    
+    // The inputs come from a cache file on disk; reject sizes and ranges that
+    // would read outside `data` or trap while allocating.
+    guard decompressedSize > 0, !range.isEmpty, range.lowerBound >= 0, range.upperBound <= data.count else {
+        return nil
+    }
+    
+    guard let scratchData = malloc(compression_decode_scratch_buffer_size(algorithm)) else {
+        return nil
+    }
+    defer {
+        free(scratchData)
+    }
+    
+    var decompressedFrameData = Data(count: decompressedSize)
+    let resultSize = decompressedFrameData.withUnsafeMutableBytes { buffer -> Int in
+        guard let bytes = buffer.baseAddress?.assumingMemoryBound(to: UInt8.self) else {
+            return 0
+        }
+        return data.withUnsafeBytes { sourceBuffer -> Int in
+            guard let sourceBytes = sourceBuffer.baseAddress?.assumingMemoryBound(to: UInt8.self) else {
+                return 0
+            }
+            return compression_decode_buffer(bytes, buffer.count, sourceBytes.advanced(by: range.lowerBound), range.upperBound - range.lowerBound, scratchData, algorithm)
+        }
+    }
+    
+    if resultSize <= 0 {
+        return nil
+    }
+    if decompressedFrameData.count != resultSize {
+        decompressedFrameData.count = resultSize
+    }
+    return decompressedFrameData
+}
+
 private final class AnimationCacheItemWriterImpl: AnimationCacheItemWriter {
-    private struct ParameterSet: Equatable {
-        var width: Int
-        var height: Int
-        var bytesPerRow: Int
+    struct CompressedResult {
+        var animationPath: String
+        var firstFramePath: String
    }
    
    private struct FrameMetadata {
@@ -93,10 +198,19 @@ private final class AnimationCacheItemWriterImpl: AnimationCacheItemWriter {
        var duration: Double
    }
    
-    private let file: ManagedFile
-    private let completion: (Bool) -> Void
+    let queue: Queue
+    var isCancelled: Bool = false
    
-    private var currentParameterSet: ParameterSet?
+    private let decompressedPath: String
+    private let compressedPath: String
+    private let firstFramePath: String
+    private var file: ManagedFile?
+    private let completion: (CompressedResult?) -> Void
+    
+    private var currentSurface: ImageARGB?
+    private var currentYUVASurface: ImageYUVA420?
+    private var currentDctData: DctData?
+    private var currentDctCoefficients: DctCoefficientsYUVA420?
    private var contentLengthOffset: Int?
    private var isFailed: Bool = false
    private var isFinished: Bool = false
@@ -104,44 +218,141 @@ private final class AnimationCacheItemWriterImpl: AnimationCacheItemWriter {
    private var frames: [FrameMetadata] = []
    private var contentLength: Int = 0
    
+    private let dctQuality: Int
+    
    private let lock = Lock()
    
-    init?(tempPath: String, completion: @escaping (Bool) -> Void) {
-        guard let file = ManagedFile(queue: nil, path: tempPath, mode: .readwrite) else {
+    init?(queue: Queue, allocateTempFile: @escaping () -> String, completion: @escaping (CompressedResult?) -> Void) {
+        self.dctQuality = 67
+        
+        self.queue = queue
+        self.decompressedPath = allocateTempFile()
+        self.compressedPath = allocateTempFile()
+        self.firstFramePath = allocateTempFile()
+        
+        guard let file = ManagedFile(queue: nil, path: self.decompressedPath, mode: .readwrite) else {
            return nil
        }
        self.file = file
        self.completion = completion
    }
    
-    func add(bytes: UnsafeRawPointer, length: Int, width: Int, height: Int, bytesPerRow: Int, duration: Double) {
+    /// Renders one frame through `drawingBlock`, converts it to YUVA 4:2:0,
+    /// applies the DCT and appends the coefficient planes to the temp file.
+    ///
+    /// - Parameters:
+    ///   - drawingBlock: Called synchronously, under the writer's lock, with
+    ///     the surface to draw into.
+    ///   - proposedWidth: Requested frame width; rounded up to a multiple of 16.
+    ///   - proposedHeight: Requested frame height; rounded up to a multiple of 16.
+    ///   - duration: Duration stored with the frame's metadata.
+    ///
+    /// Every frame must resolve to the same rounded size as the first one;
+    /// otherwise the writer is marked as failed. Calls made after failure or
+    /// after `finish()` are ignored.
+    func add(with drawingBlock: (AnimationCacheItemDrawingSurface) -> Void, proposedWidth: Int, proposedHeight: Int, duration: Double) {
+        if self.isFailed || self.isFinished {
+            return
+        }
+        
         self.lock.locked {
-            if self.isFailed {
+            guard !self.isFailed, !self.isFinished, let file = self.file else {
                 return
             }
             
-            let parameterSet = ParameterSet(width: width, height: height, bytesPerRow: bytesPerRow)
-            if let currentParameterSet = self.currentParameterSet {
-                if currentParameterSet != parameterSet {
+            // Both dimensions are padded independently; the height must come
+            // from proposedHeight so non-square frames keep their aspect.
+            let width = roundUp(proposedWidth, multiple: 16)
+            let height = roundUp(proposedHeight, multiple: 16)
+            
+            var isFirstFrame = false
+            
+            let surface: ImageARGB
+            if let current = self.currentSurface {
+                if current.argbPlane.width == width && current.argbPlane.height == height {
+                    surface = current
+                } else {
                     self.isFailed = true
                     return
                 }
             } else {
-                self.currentParameterSet = parameterSet
+                isFirstFrame = true
                 
-                self.file.write(1 as UInt32)
-                
-                self.file.write(UInt32(parameterSet.width))
-                self.file.write(UInt32(parameterSet.height))
-                self.file.write(UInt32(parameterSet.bytesPerRow))
-                
-                self.contentLengthOffset = Int(self.file.position())
-                self.file.write(0 as UInt32)
+                surface = ImageARGB(width: width, height: height)
+                self.currentSurface = surface
             }
             
-            self.frames.append(FrameMetadata(offset: Int(self.file.position()), length: length, duration: duration))
-            let _ = self.file.write(bytes, count: length)
-            self.contentLength += length
+            let yuvaSurface: ImageYUVA420
+            if let current = self.currentYUVASurface {
+                if current.yPlane.width == width && current.yPlane.height == height {
+                    yuvaSurface = current
+                } else {
+                    self.isFailed = true
+                    return
+                }
+            } else {
+                yuvaSurface = ImageYUVA420(width: width, height: height)
+                self.currentYUVASurface = yuvaSurface
+            }
+            
+            let dctCoefficients: DctCoefficientsYUVA420
+            if let current = self.currentDctCoefficients {
+                if current.yPlane.width == width && current.yPlane.height == height {
+                    dctCoefficients = current
+                } else {
+                    self.isFailed = true
+                    return
+                }
+            } else {
+                dctCoefficients = DctCoefficientsYUVA420(width: width, height: height)
+                self.currentDctCoefficients = dctCoefficients
+            }
+            
+            let dctData: DctData
+            if let current = self.currentDctData, current.quality == self.dctQuality {
+                dctData = current
+            } else {
+                dctData = DctData(quality: self.dctQuality)
+                self.currentDctData = dctData
+            }
+            
+            surface.argbPlane.data.withUnsafeMutableBytes { bytes -> Void in
+                drawingBlock(AnimationCacheItemDrawingSurface(
+                    argb: bytes.baseAddress!.assumingMemoryBound(to: UInt8.self),
+                    width: width,
+                    height: height,
+                    bytesPerRow: surface.argbPlane.bytesPerRow,
+                    length: bytes.count
+                ))
+            }
+            
+            surface.toYUVA420(target: yuvaSurface)
+            yuvaSurface.dct(dctData: dctData, target: dctCoefficients)
+            
+            if isFirstFrame {
+                // File header: format version, dimensions, DCT quality and a
+                // placeholder for the content length patched in finish().
+                file.write(2 as UInt32)
+                
+                file.write(UInt32(dctCoefficients.yPlane.width))
+                file.write(UInt32(dctCoefficients.yPlane.height))
+                file.write(UInt32(dctData.quality))
+            
+                self.contentLengthOffset = Int(file.position())
+                file.write(0 as UInt32)
+            }
+            
+            let framePosition = Int(file.position())
+            assert(framePosition >= 0)
+            var frameLength = 0
+            
+            // Planes are stored back to back in Y, U, V, A order.
+            for i in 0 ..< 4 {
+                let dctPlane: DctCoefficientPlane
+                switch i {
+                case 0:
+                    dctPlane = dctCoefficients.yPlane
+                case 1:
+                    dctPlane = dctCoefficients.uPlane
+                case 2:
+                    dctPlane = dctCoefficients.vPlane
+                case 3:
+                    dctPlane = dctCoefficients.aPlane
+                default:
+                    preconditionFailure()
+                }
+                
+                dctPlane.data.withUnsafeBytes { bytes in
+                    let _ = file.write(bytes.baseAddress!.assumingMemoryBound(to: UInt8.self), count: bytes.count)
+                }
+                frameLength += dctPlane.data.count
+            }
+            
+            self.frames.append(FrameMetadata(offset: framePosition, length: frameLength, duration: duration))
+            
+            self.contentLength += frameLength
         }
     }
    
@@ -152,27 +363,96 @@ private final class AnimationCacheItemWriterImpl: AnimationCacheItemWriter {
                self.isFinished = true
                shouldComplete = true
                
-                guard let contentLengthOffset = self.contentLengthOffset else {
+                guard let contentLengthOffset = self.contentLengthOffset, let file = self.file else {
+                    self.isFailed = true
+                    return
+                }
+                assert(contentLengthOffset >= 0)
+                
+                let metadataPosition = file.position()
+                file.seek(position: Int64(contentLengthOffset))
+                file.write(UInt32(self.contentLength))
+                
+                file.seek(position: metadataPosition)
+                file.write(UInt32(self.frames.count))
+                for frame in self.frames {
+                    file.write(UInt32(frame.offset))
+                    file.write(UInt32(frame.length))
+                    file.write(Float32(frame.duration))
+                }
+                
+                if !self.frames.isEmpty, let dctCoefficients = self.currentDctCoefficients, let dctData = self.currentDctData {
+                    var firstFrameData = Data(capacity: 4 * 5 + self.frames[0].length)
+                    
+                    writeUInt32(data: &firstFrameData, value: 2 as UInt32)
+                    writeUInt32(data: &firstFrameData, value: UInt32(dctCoefficients.yPlane.width))
+                    writeUInt32(data: &firstFrameData, value: UInt32(dctCoefficients.yPlane.height))
+                    writeUInt32(data: &firstFrameData, value: UInt32(dctData.quality))
+                    
+                    writeUInt32(data: &firstFrameData, value: UInt32(self.frames[0].length))
+                    let firstFrameStart = 4 * 5
+                    
+                    file.seek(position: Int64(self.frames[0].offset))
+                    firstFrameData.count += self.frames[0].length
+                    firstFrameData.withUnsafeMutableBytes { bytes in
+                        let _ = file.read(bytes.baseAddress!.advanced(by: 4 * 5), self.frames[0].length)
+                    }
+                    
+                    writeUInt32(data: &firstFrameData, value: UInt32(1))
+                    writeUInt32(data: &firstFrameData, value: UInt32(firstFrameStart))
+                    writeUInt32(data: &firstFrameData, value: UInt32(self.frames[0].length))
+                    writeFloat32(data: &firstFrameData, value: Float32(1.0))
+                    
+                    guard let compressedFirstFrameData = compressData(data: firstFrameData, addSizeHeader: true) else {
+                        self.isFailed = true
+                        return
+                    }
+                    guard let _ = try? compressedFirstFrameData.write(to: URL(fileURLWithPath: self.firstFramePath)) else {
+                        self.isFailed = true
+                        return
+                    }
+                } else {
                    self.isFailed = true
                    return
                }
                
-                let metadataPosition = self.file.position()
-                self.file.seek(position: Int64(contentLengthOffset))
-                self.file.write(UInt32(self.contentLength))
-                
-                self.file.seek(position: metadataPosition)
-                self.file.write(UInt32(self.frames.count))
-                for frame in self.frames {
-                    self.file.write(UInt32(frame.offset))
-                    self.file.write(UInt32(frame.length))
-                    self.file.write(Float32(frame.duration))
+                if !self.isFailed {
+                    self.file = nil
+                    
+                    file._unsafeClose()
+                    
+                    guard let uncompressedData = try? Data(contentsOf: URL(fileURLWithPath: self.decompressedPath), options: .alwaysMapped) else {
+                        self.isFailed = true
+                        return
+                    }
+                    guard let compressedData = compressData(data: uncompressedData) else {
+                        self.isFailed = true
+                        return
+                    }
+                    guard let compressedFile = ManagedFile(queue: nil, path: self.compressedPath, mode: .readwrite) else {
+                        self.isFailed = true
+                        return
+                    }
+                    compressedFile.write(Int32(uncompressedData.count))
+                    let _ = compressedFile.write(compressedData)
+                    compressedFile._unsafeClose()
                }
            }
        }
        
        if shouldComplete {
-            self.completion(!self.isFailed)
+            let _ = try? FileManager.default.removeItem(atPath: self.decompressedPath)
+            
+            if !self.isFailed {
+                self.completion(CompressedResult(
+                    animationPath: self.compressedPath,
+                    firstFramePath: self.firstFramePath
+                ))
+            } else {
+                let _ = try? FileManager.default.removeItem(atPath: self.compressedPath)
+                let _ = try? FileManager.default.removeItem(atPath: self.firstFramePath)
+                self.completion(nil)
+            }
        }
    }
 }
@@ -185,12 +465,34 @@ private final class AnimationCacheItemAccessor {
    
    private let data: Data
    private let frameMapping: [Int: FrameInfo]
-    private let format: AnimationCacheItemFrame.Format
+    private let durationMapping: [Double]
+    private let totalDuration: Double
    
-    init(data: Data, frameMapping: [Int: FrameInfo], format: AnimationCacheItemFrame.Format) {
+    private var currentYUVASurface: ImageYUVA420
+    private var currentDctData: DctData
+    private var currentDctCoefficients: DctCoefficientsYUVA420
+    
+    init(data: Data, frameMapping: [FrameInfo], width: Int, height: Int, dctQuality: Int) {
        self.data = data
-        self.frameMapping = frameMapping
-        self.format = format
+        
+        var resultFrameMapping: [Int: FrameInfo] = [:]
+        var durationMapping: [Double] = []
+        var totalDuration: Double = 0.0
+        
+        for i in 0 ..< frameMapping.count {
+            let frame = frameMapping[i]
+            resultFrameMapping[i] = frame
+            totalDuration += frame.duration
+            durationMapping.append(totalDuration)
+        }
+        
+        self.frameMapping = resultFrameMapping
+        self.durationMapping = durationMapping
+        self.totalDuration = totalDuration
+        
+        self.currentYUVASurface = ImageYUVA420(width: width, height: height)
+        self.currentDctData = DctData(quality: dctQuality)
+        self.currentDctCoefficients = DctCoefficientsYUVA420(width: width, height: height)
    }
    
    func getFrame(index: Int) -> AnimationCacheItemFrame? {
@@ -198,7 +500,56 @@ private final class AnimationCacheItemAccessor {
            return nil
        }
        
-        return AnimationCacheItemFrame(data: data, range: frameInfo.range, format: self.format, duration: frameInfo.duration)
+        let currentSurface = ImageARGB(width: self.currentYUVASurface.yPlane.width, height: self.currentYUVASurface.yPlane.height)
+        
+        var frameDataOffset = 0
+        let frameLength = frameInfo.range.upperBound - frameInfo.range.lowerBound
+        for i in 0 ..< 4 {
+            let dctPlane: DctCoefficientPlane
+            switch i {
+            case 0:
+                dctPlane = self.currentDctCoefficients.yPlane
+            case 1:
+                dctPlane = self.currentDctCoefficients.uPlane
+            case 2:
+                dctPlane = self.currentDctCoefficients.vPlane
+            case 3:
+                dctPlane = self.currentDctCoefficients.aPlane
+            default:
+                preconditionFailure()
+            }
+            
+            if frameDataOffset + dctPlane.data.count > frameLength {
+                break
+            }
+            
+            dctPlane.data.withUnsafeMutableBytes { targetBuffer -> Void in
+                self.data.copyBytes(to: targetBuffer.baseAddress!.assumingMemoryBound(to: UInt8.self), from: (frameInfo.range.lowerBound + frameDataOffset) ..< (frameInfo.range.lowerBound + frameDataOffset + targetBuffer.count))
+            }
+            
+            frameDataOffset += dctPlane.data.count
+        }
+        
+        self.currentDctCoefficients.idct(dctData: self.currentDctData, target: self.currentYUVASurface)
+        self.currentYUVASurface.toARGB(target: currentSurface)
+        
+        return AnimationCacheItemFrame(data: currentSurface.argbPlane.data, range: 0 ..< currentSurface.argbPlane.data.count, format: .rgba(width: currentSurface.argbPlane.width, height: currentSurface.argbPlane.height, bytesPerRow: currentSurface.argbPlane.bytesPerRow), duration: frameInfo.duration)
+    }
+    
+    /// Maps a playback timestamp to the index of the frame that should be visible at that time.
+    ///
+    /// `durationMapping[i]` holds the cumulative end time of frame `i` (the initializer appends the
+    /// running total after adding each frame's duration), so frame `i` covers the interval
+    /// `[durationMapping[i - 1], durationMapping[i])`, with frame 0 starting at 0.
+    ///
+    /// - Parameter duration: Elapsed playback time; it is wrapped around `totalDuration`.
+    /// - Returns: The frame index, or 0 when the total duration is zero or there is at most one frame.
+    func getFrameIndex(duration: Double) -> Int {
+        if self.totalDuration == 0.0 {
+            return 0
+        }
+        if self.durationMapping.count <= 1 {
+            return 0
+        }
+        let normalizedDuration = duration.truncatingRemainder(dividingBy: self.totalDuration)
+        // The first cumulative end time strictly greater than the timestamp identifies the frame.
+        // Scanning from index 0 keeps frame 0 to its own duration and makes the last frame reachable.
+        if let frameIndex = self.durationMapping.firstIndex(where: { normalizedDuration < $0 }) {
+            return frameIndex
+        }
+        // Only reached when no comparison succeeds (for example a NaN timestamp).
+        return self.durationMapping.count - 1
    }
 }

@@ -213,10 +564,54 @@ private func readUInt32(data: Data, offset: Int) -> UInt32 {
    return value
 }

+/// Reads a `Float32` stored in host byte order at `offset` bytes into `data`.
+/// The caller must guarantee that `offset + 4` does not exceed `data.count`; no bounds check is made here.
+private func readFloat32(data: Data, offset: Int) -> Float32 {
+    var result: Float32 = 0
+    data.withUnsafeBytes { source -> Void in
+        withUnsafeMutableBytes(of: &result) { destination -> Void in
+            memcpy(destination.baseAddress!, source.baseAddress!.advanced(by: offset), MemoryLayout<Float32>.size)
+        }
+    }
+    
+    return result
+}
+
+/// Appends the four bytes of `value`, in host byte order, to the end of `data`.
+private func writeUInt32(data: inout Data, value: UInt32) {
+    withUnsafeBytes(of: value) { rawValue -> Void in
+        data.append(contentsOf: rawValue)
+    }
+}
+
+/// Appends the four bytes of `value`, in host byte order, to the end of `data`.
+private func writeFloat32(data: inout Data, value: Float32) {
+    withUnsafeBytes(of: value) { rawValue -> Void in
+        data.append(contentsOf: rawValue)
+    }
+}
+
 private func loadItem(path: String) -> AnimationCacheItem? {
-    guard let data = try? Data(contentsOf: URL(fileURLWithPath: path), options: .alwaysMapped) else {
+    guard let compressedData = try? Data(contentsOf: URL(fileURLWithPath: path), options: .alwaysMapped) else {
        return nil
    }
+    
+    if compressedData.count < 4 {
+        return nil
+    }
+    let decompressedSize = readUInt32(data: compressedData, offset: 0)
+    
+    if decompressedSize <= 0 || decompressedSize > 20 * 1024 * 1024 {
+        return nil
+    }
+    guard let data = decompressData(data: compressedData, range: 4 ..< compressedData.count, decompressedSize: Int(decompressedSize)) else {
+        return nil
+    }
+    
    let dataLength = data.count
    
    var offset = 0
@@ -226,7 +621,7 @@ private func loadItem(path: String) -> AnimationCacheItem? {
    }
    let formatVersion = readUInt32(data: data, offset: offset)
    offset += 4
-    if formatVersion != 1 {
+    if formatVersion != 2 {
        return nil
    }
    
@@ -245,7 +640,7 @@ private func loadItem(path: String) -> AnimationCacheItem? {
    guard dataLength >= offset + 4 else {
        return nil
    }
-    let bytesPerRow = readUInt32(data: data, offset: offset)
+    let dctQuality = readUInt32(data: data, offset: offset)
    offset += 4
    
    guard dataLength >= offset + 4 else {
@@ -262,8 +657,8 @@ private func loadItem(path: String) -> AnimationCacheItem? {
    let numFrames = readUInt32(data: data, offset: offset)
    offset += 4
    
-    var frameMapping: [Int: AnimationCacheItemAccessor.FrameInfo] = [:]
-    for i in 0 ..< Int(numFrames) {
+    var frameMapping: [AnimationCacheItemAccessor.FrameInfo] = []
+    for _ in 0 ..< Int(numFrames) {
        guard dataLength >= offset + 4 + 4 + 4 else {
            return nil
        }
@@ -272,16 +667,18 @@ private func loadItem(path: String) -> AnimationCacheItem? {
        offset += 4
        let frameLength = readUInt32(data: data, offset: offset)
        offset += 4
-        let frameDuration = readUInt32(data: data, offset: offset)
+        let frameDuration = readFloat32(data: data, offset: offset)
        offset += 4
        
-        frameMapping[i] = AnimationCacheItemAccessor.FrameInfo(range: Int(frameStart) ..< Int(frameStart + frameLength), duration: Double(frameDuration))
+        frameMapping.append(AnimationCacheItemAccessor.FrameInfo(range: Int(frameStart) ..< Int(frameStart + frameLength), duration: Double(frameDuration)))
    }
    
-    let itemAccessor = AnimationCacheItemAccessor(data: data, frameMapping: frameMapping, format: .rgba(width: Int(width), height: Int(height), bytesPerRow: Int(bytesPerRow)))
+    let itemAccessor = AnimationCacheItemAccessor(data: data, frameMapping: frameMapping, width: Int(width), height: Int(height), dctQuality: Int(dctQuality))
    
    return AnimationCacheItem(numFrames: Int(numFrames), getFrame: { index in
        return itemAccessor.getFrame(index: index)
+    }, getFrameIndexImpl: { duration in
+        return itemAccessor.getFrameIndex(duration: duration)
    })
 }

@@ -300,10 +697,14 @@ public final class AnimationCacheImpl: AnimationCache {
        private let basePath: String
        private let allocateTempFile: () -> String
        
+        private let fetchQueues: [Queue]
+        private var nextFetchQueueIndex: Int = 0
+        
        private var itemContexts: [String: ItemContext] = [:]
        
        init(queue: Queue, basePath: String, allocateTempFile: @escaping () -> String) {
            self.queue = queue
+            self.fetchQueues = (0 ..< 2).map { _ in Queue() }
            self.basePath = basePath
            self.allocateTempFile = allocateTempFile
        }
@@ -315,9 +716,10 @@ public final class AnimationCacheImpl: AnimationCache {
            let sourceIdPath = itemSubpath(hashString: md5Hash(sourceId + "-\(Int(size.width))x\(Int(size.height))"))
            let itemDirectoryPath = "\(self.basePath)/\(sourceIdPath.directory)"
            let itemPath = "\(itemDirectoryPath)/\(sourceIdPath.fileName)"
+            let itemFirstFramePath = "\(itemDirectoryPath)/\(sourceIdPath.fileName)-f"
            
-            if FileManager.default.fileExists(atPath: itemPath) {
-                updateResult(AnimationCacheItemResult(item: loadItem(path: itemPath), isFinal: true))
+            if FileManager.default.fileExists(atPath: itemPath), let item = loadItem(path: itemPath) {
+                updateResult(AnimationCacheItemResult(item: item, isFinal: true))
                
                return EmptyDisposable
            }
@@ -338,8 +740,7 @@ public final class AnimationCacheImpl: AnimationCache {
            updateResult(AnimationCacheItemResult(item: nil, isFinal: false))
            
            if beginFetch {
-                let tempPath = self.allocateTempFile()
-                guard let writer = AnimationCacheItemWriterImpl(tempPath: tempPath, completion: { [weak self, weak itemContext] success in
+                guard let writer = AnimationCacheItemWriterImpl(queue: self.fetchQueues[self.nextFetchQueueIndex % self.fetchQueues.count], allocateTempFile: self.allocateTempFile, completion: { [weak self, weak itemContext] result in
                    queue.async {
                        guard let strongSelf = self, let itemContext = itemContext, itemContext === strongSelf.itemContexts[sourceId] else {
                            return
@@ -347,13 +748,18 @@ public final class AnimationCacheImpl: AnimationCache {
                        
                        strongSelf.itemContexts.removeValue(forKey: sourceId)
                        
-                        guard success else {
+                        guard let result = result else {
                            return
                        }
                        guard let _ = try? FileManager.default.createDirectory(at: URL(fileURLWithPath: itemDirectoryPath), withIntermediateDirectories: true, attributes: nil) else {
                            return
                        }
-                        guard let _ = try? FileManager.default.moveItem(atPath: tempPath, toPath: itemPath) else {
+                        let _ = try? FileManager.default.removeItem(atPath: itemPath)
+                        guard let _ = try? FileManager.default.moveItem(atPath: result.animationPath, toPath: itemPath) else {
+                            return
+                        }
+                        let _ = try? FileManager.default.removeItem(atPath: itemFirstFramePath)
+                        guard let _ = try? FileManager.default.moveItem(atPath: result.firstFramePath, toPath: itemFirstFramePath) else {
                            return
                        }
                        guard let item = loadItem(path: itemPath) else {
@@ -368,9 +774,14 @@ public final class AnimationCacheImpl: AnimationCache {
                    return EmptyDisposable
                }
                
-                let fetchDisposable = fetch(size, writer)
+                let fetchDisposable = MetaDisposable()
+                fetchDisposable.set(fetch(size, writer))
                
-                itemContext.disposable.set(ActionDisposable {
+                itemContext.disposable.set(ActionDisposable { [weak writer] in
+                    if let writer = writer {
+                        writer.isCancelled = true
+                    }
+                    
                    fetchDisposable.dispose()
                })
            }
@@ -389,25 +800,43 @@ public final class AnimationCacheImpl: AnimationCache {
            }
        }
        
-        func getSynchronously(sourceId: String, size: CGSize) -> AnimationCacheItem? {
+        static func getFirstFrameSynchronously(basePath: String, sourceId: String, size: CGSize) -> AnimationCacheItem? {
            let sourceIdPath = itemSubpath(hashString: md5Hash(sourceId + "-\(Int(size.width))x\(Int(size.height))"))
-            let itemDirectoryPath = "\(self.basePath)/\(sourceIdPath.directory)"
-            let itemPath = "\(itemDirectoryPath)/\(sourceIdPath.fileName)"
+            let itemDirectoryPath = "\(basePath)/\(sourceIdPath.directory)"
+            let itemFirstFramePath = "\(itemDirectoryPath)/\(sourceIdPath.fileName)-f"
            
-            if FileManager.default.fileExists(atPath: itemPath) {
-                return loadItem(path: itemPath)
+            if FileManager.default.fileExists(atPath: itemFirstFramePath) {
+                return loadItem(path: itemFirstFramePath)
            } else {
                return nil
            }
        }
+        
+        /// Looks up the stored first-frame file (the item path with a `-f` suffix) for `sourceId` at `size`
+        /// and passes the loaded item, or `nil` when the file is missing or fails to load, to `completion`.
+        /// `completion` is invoked before this function returns; the returned disposable is always `EmptyDisposable`.
+        static func getFirstFrame(basePath: String, sourceId: String, size: CGSize, completion: @escaping (AnimationCacheItem?) -> Void) -> Disposable {
+            let hashedPath = itemSubpath(hashString: md5Hash(sourceId + "-\(Int(size.width))x\(Int(size.height))"))
+            let firstFramePath = "\(basePath)/\(hashedPath.directory)/\(hashedPath.fileName)-f"
+            
+            var loadedItem: AnimationCacheItem?
+            if FileManager.default.fileExists(atPath: firstFramePath) {
+                loadedItem = loadItem(path: firstFramePath)
+            }
+            completion(loadedItem)
+            
+            return EmptyDisposable
+        }
    }
    
    private let queue: Queue
+    private let basePath: String
    private let impl: QueueLocalObject<Impl>
    
    public init(basePath: String, allocateTempFile: @escaping () -> String) {
        let queue = Queue()
        self.queue = queue
+        self.basePath = basePath
        self.impl = QueueLocalObject(queue: queue, generate: {
            return Impl(queue: queue, basePath: basePath, allocateTempFile: allocateTempFile)
        })
@@ -431,9 +860,18 @@ public final class AnimationCacheImpl: AnimationCache {
        |> runOn(self.queue)
    }
    
-    public func getSynchronously(sourceId: String, size: CGSize) -> AnimationCacheItem? {
-        return self.impl.syncWith { impl -> AnimationCacheItem? in
-            return impl.getSynchronously(sourceId: sourceId, size: size)
+    /// Loads the stored first frame for `sourceId` at `size` on the calling thread.
+    /// The lookup goes through the static `Impl` helper using `basePath`; it does not touch `self.impl`.
+    public func getFirstFrameSynchronously(sourceId: String, size: CGSize) -> AnimationCacheItem? {
+        let cacheRoot = self.basePath
+        let firstFrame = Impl.getFirstFrameSynchronously(basePath: cacheRoot, sourceId: sourceId, size: size)
+        return firstFrame
+    }
+    
+    /// Schedules a first-frame lookup for `sourceId` at `size` on the supplied `queue`.
+    /// `completion` is called from that queue with the loaded item or `nil`.
+    /// The returned disposable wraps whatever the static `Impl.getFirstFrame` helper returns.
+    public func getFirstFrame(queue: Queue, sourceId: String, size: CGSize, completion: @escaping (AnimationCacheItem?) -> Void) -> Disposable {
+        let basePath = self.basePath
+        let lookupDisposable = MetaDisposable()
+        
+        queue.async {
+            let lookup = Impl.getFirstFrame(basePath: basePath, sourceId: sourceId, size: size, completion: completion)
+            lookupDisposable.set(lookup)
        }
+        
+        return lookupDisposable
    }
 }
--- a/submodules/TelegramUI/Components/AnimationCache/Sources/ImageData.swift
+++ b/submodules/TelegramUI/Components/AnimationCache/Sources/ImageData.swift
@@ -0,0 +1,231 @@
+import Foundation
+import UIKit
+import DCT
+
+/// A single pixel buffer with `components` bytes per pixel and no row padding
+/// (`bytesPerRow` is exactly `width * components`).
+final class ImagePlane {
+    let width: Int
+    let height: Int
+    let bytesPerRow: Int
+    let components: Int
+    var data: Data
+    
+    /// Allocates backing storage of `width * components * height` bytes.
+    init(width: Int, height: Int, components: Int) {
+        let rowLength = width * components
+        
+        self.width = width
+        self.height = height
+        self.components = components
+        self.bytesPerRow = rowLength
+        self.data = Data(count: rowLength * height)
+    }
+}
+
+/// An image stored as one interleaved plane with four bytes per pixel.
+final class ImageARGB {
+    let argbPlane: ImagePlane
+    
+    /// Creates the backing four-component plane of the given dimensions.
+    init(width: Int, height: Int) {
+        let interleavedPlane = ImagePlane(width: width, height: height, components: 4)
+        self.argbPlane = interleavedPlane
+    }
+}
+
+/// An image stored as four single-component planes: full-size Y and A planes,
+/// and U and V planes at half the width and half the height (integer division).
+final class ImageYUVA420 {
+    let yPlane: ImagePlane
+    let uPlane: ImagePlane
+    let vPlane: ImagePlane
+    let aPlane: ImagePlane
+    
+    /// Allocates all four planes for an image of `width` x `height` pixels.
+    init(width: Int, height: Int) {
+        let halfWidth = width / 2
+        let halfHeight = height / 2
+        
+        self.yPlane = ImagePlane(width: width, height: height, components: 1)
+        self.uPlane = ImagePlane(width: halfWidth, height: halfHeight, components: 1)
+        self.vPlane = ImagePlane(width: halfWidth, height: halfHeight, components: 1)
+        self.aPlane = ImagePlane(width: width, height: height, components: 1)
+    }
+}
+
+/// Storage for the DCT coefficients of one plane, two bytes per coefficient
+/// (the C API in DCT.h exchanges coefficients as `int16_t`).
+final class DctCoefficientPlane {
+    let width: Int
+    let height: Int
+    var data: Data
+    
+    /// Allocates `width * 2 * height` bytes of coefficient storage.
+    init(width: Int, height: Int) {
+        let bytesPerCoefficient = 2
+        
+        self.width = width
+        self.height = height
+        self.data = Data(count: width * bytesPerCoefficient * height)
+    }
+}
+
+/// DCT coefficient storage laid out like `ImageYUVA420`: full-size Y and A planes,
+/// and U and V planes at half the width and half the height (integer division).
+final class DctCoefficientsYUVA420 {
+    let yPlane: DctCoefficientPlane
+    let uPlane: DctCoefficientPlane
+    let vPlane: DctCoefficientPlane
+    let aPlane: DctCoefficientPlane
+    
+    /// Allocates coefficient planes for an image of `width` x `height` pixels.
+    init(width: Int, height: Int) {
+        let halfWidth = width / 2
+        let halfHeight = height / 2
+        
+        self.yPlane = DctCoefficientPlane(width: width, height: height)
+        self.uPlane = DctCoefficientPlane(width: halfWidth, height: halfHeight)
+        self.vPlane = DctCoefficientPlane(width: halfWidth, height: halfHeight)
+        self.aPlane = DctCoefficientPlane(width: width, height: height)
+    }
+}
+
+extension ImageARGB {
+    /// Splits this image into the four planes of `target` by calling `splitRGBAIntoYUVAPlanes`.
+    /// `target` must have the same luma dimensions as this image, otherwise the precondition traps.
+    func toYUVA420(target: ImageYUVA420) {
+        let source = self.argbPlane
+        precondition(source.width == target.yPlane.width && source.height == target.yPlane.height)
+        
+        let width = Int32(source.width)
+        let height = Int32(source.height)
+        let bytesPerRow = Int32(source.bytesPerRow)
+        
+        // Every buffer has to stay pinned for the duration of the C call, hence the nesting.
+        source.data.withUnsafeBytes { argbBytes -> Void in
+            target.yPlane.data.withUnsafeMutableBytes { yBytes -> Void in
+                target.uPlane.data.withUnsafeMutableBytes { uBytes -> Void in
+                    target.vPlane.data.withUnsafeMutableBytes { vBytes -> Void in
+                        target.aPlane.data.withUnsafeMutableBytes { aBytes -> Void in
+                            let argb = argbBytes.baseAddress!.assumingMemoryBound(to: UInt8.self)
+                            let y = yBytes.baseAddress!.assumingMemoryBound(to: UInt8.self)
+                            let u = uBytes.baseAddress!.assumingMemoryBound(to: UInt8.self)
+                            let v = vBytes.baseAddress!.assumingMemoryBound(to: UInt8.self)
+                            let a = aBytes.baseAddress!.assumingMemoryBound(to: UInt8.self)
+                            
+                            splitRGBAIntoYUVAPlanes(argb, y, u, v, a, width, height, bytesPerRow)
+                        }
+                    }
+                }
+            }
+        }
+    }
+    
+    /// Convenience variant that allocates a matching `ImageYUVA420` and fills it.
+    func toYUVA420() -> ImageYUVA420 {
+        let converted = ImageYUVA420(width: self.argbPlane.width, height: self.argbPlane.height)
+        self.toYUVA420(target: converted)
+        return converted
+    }
+}
+
+extension ImageYUVA420 {
+    /// Merges the four planes of this image into `target` by calling `combineYUVAPlanesIntoARBB`.
+    /// `target` must have the same dimensions as the Y plane, otherwise the precondition traps.
+    func toARGB(target: ImageARGB) {
+        let destination = target.argbPlane
+        precondition(self.yPlane.width == destination.width && self.yPlane.height == destination.height)
+        
+        let width = Int32(destination.width)
+        let height = Int32(destination.height)
+        let bytesPerRow = Int32(destination.bytesPerRow)
+        
+        // Every buffer has to stay pinned for the duration of the C call, hence the nesting.
+        self.yPlane.data.withUnsafeBytes { yBytes -> Void in
+            self.uPlane.data.withUnsafeBytes { uBytes -> Void in
+                self.vPlane.data.withUnsafeBytes { vBytes -> Void in
+                    self.aPlane.data.withUnsafeBytes { aBytes -> Void in
+                        destination.data.withUnsafeMutableBytes { argbBytes -> Void in
+                            let argb = argbBytes.baseAddress!.assumingMemoryBound(to: UInt8.self)
+                            let y = yBytes.baseAddress!.assumingMemoryBound(to: UInt8.self)
+                            let u = uBytes.baseAddress!.assumingMemoryBound(to: UInt8.self)
+                            let v = vBytes.baseAddress!.assumingMemoryBound(to: UInt8.self)
+                            let a = aBytes.baseAddress!.assumingMemoryBound(to: UInt8.self)
+                            
+                            combineYUVAPlanesIntoARBB(argb, y, u, v, a, width, height, bytesPerRow)
+                        }
+                    }
+                }
+            }
+        }
+    }
+    
+    /// Convenience variant that allocates a matching `ImageARGB` and fills it.
+    func toARGB() -> ImageARGB {
+        let converted = ImageARGB(width: self.yPlane.width, height: self.yPlane.height)
+        self.toARGB(target: converted)
+        return converted
+    }
+}
+
+/// Holds the forward and inverse DCT tables generated for one quality setting.
+final class DctData {
+    let quality: Int
+    let dctData: Data
+    let idctData: Data
+    
+    /// Generates both tables through the DCT module; traps if either generator returns nil.
+    init(quality: Int) {
+        let nativeQuality = Int32(quality)
+        
+        self.quality = quality
+        self.dctData = generateForwardDctData(nativeQuality)!
+        self.idctData = generateInverseDctData(nativeQuality)!
+    }
+}
+
+extension ImageYUVA420 {
+    /// Runs the forward DCT over the Y, U, V and A planes, writing the coefficients into `target`.
+    ///
+    /// - Parameters:
+    ///   - dctData: Tables produced by `generateForwardDctData`; only `dctData.dctData` is used here.
+    ///   - target: Coefficient storage whose Y plane must match this image's Y plane dimensions.
+    func dct(dctData: DctData, target: DctCoefficientsYUVA420) {
+        precondition(self.yPlane.width == target.yPlane.width && self.yPlane.height == target.yPlane.height)
+        
+        // Explicit pairing of pixel planes with their coefficient planes, processed in Y, U, V, A order.
+        let planePairs: [(ImagePlane, DctCoefficientPlane)] = [
+            (self.yPlane, target.yPlane),
+            (self.uPlane, target.uPlane),
+            (self.vPlane, target.vPlane),
+            (self.aPlane, target.aPlane)
+        ]
+        
+        for (sourcePlane, targetPlane) in planePairs {
+            sourcePlane.data.withUnsafeBytes { sourceBytes -> Void in
+                let sourcePixels = sourceBytes.baseAddress!.assumingMemoryBound(to: UInt8.self)
+                
+                targetPlane.data.withUnsafeMutableBytes { bytes -> Void in
+                    // DCT.h declares the coefficient buffer as `int16_t *`, so bind to the signed type.
+                    let coefficients = bytes.baseAddress!.assumingMemoryBound(to: Int16.self)
+                    
+                    performForwardDct(sourcePixels, coefficients, Int32(sourcePlane.width), Int32(sourcePlane.height), Int32(sourcePlane.bytesPerRow), dctData.dctData)
+                }
+            }
+        }
+    }
+    
+    /// Convenience variant that allocates matching coefficient storage and fills it.
+    func dct(dctData: DctData) -> DctCoefficientsYUVA420 {
+        let results = DctCoefficientsYUVA420(width: self.yPlane.width, height: self.yPlane.height)
+        self.dct(dctData: dctData, target: results)
+        return results
+    }
+}
+
+extension DctCoefficientsYUVA420 {
+    /// Runs the inverse DCT over the Y, U, V and A coefficient planes, writing pixels into `target`.
+    ///
+    /// - Parameters:
+    ///   - dctData: Tables produced by `generateInverseDctData`; only `dctData.idctData` is used here.
+    ///   - target: Image whose Y plane must match this object's Y plane dimensions.
+    func idct(dctData: DctData, target: ImageYUVA420) {
+        precondition(self.yPlane.width == target.yPlane.width && self.yPlane.height == target.yPlane.height)
+        
+        // Explicit pairing of coefficient planes with their pixel planes, processed in Y, U, V, A order.
+        let planePairs: [(DctCoefficientPlane, ImagePlane)] = [
+            (self.yPlane, target.yPlane),
+            (self.uPlane, target.uPlane),
+            (self.vPlane, target.vPlane),
+            (self.aPlane, target.aPlane)
+        ]
+        
+        for (sourcePlane, targetPlane) in planePairs {
+            sourcePlane.data.withUnsafeBytes { sourceBytes -> Void in
+                // DCT.h declares the coefficient buffer as `int16_t const *`, so bind to the signed type.
+                let coefficients = sourceBytes.baseAddress!.assumingMemoryBound(to: Int16.self)
+                
+                targetPlane.data.withUnsafeMutableBytes { bytes -> Void in
+                    let pixels = bytes.baseAddress!.assumingMemoryBound(to: UInt8.self)
+                    
+                    // Argument order follows DCT.h: (..., coefficientsPerRow, bytesPerRow, idctData).
+                    // Both values equal the plane width for these single-component planes.
+                    performInverseDct(coefficients, pixels, Int32(sourcePlane.width), Int32(sourcePlane.height), Int32(sourcePlane.width), Int32(targetPlane.bytesPerRow), dctData.idctData)
+                }
+            }
+        }
+    }
+    
+    /// Convenience variant that allocates a matching `ImageYUVA420` and fills it.
+    func idct(dctData: DctData) -> ImageYUVA420 {
+        let resultImage = ImageYUVA420(width: self.yPlane.width, height: self.yPlane.height)
+        self.idct(dctData: dctData, target: resultImage)
+        return resultImage
+    }
+}