Entity input: improved animation cache and rendering

This commit is contained in:
Ali
2022-06-24 02:06:02 +01:00
parent 0f1b382265
commit c112bc5146
37 changed files with 2557 additions and 383 deletions

View File

@@ -13,6 +13,7 @@ swift_library(
"//submodules/SSignalKit/SwiftSignalKit:SwiftSignalKit",
"//submodules/CryptoUtils:CryptoUtils",
"//submodules/ManagedFile:ManagedFile",
"//submodules/TelegramUI/Components/AnimationCache/DCT:DCT",
],
visibility = [
"//visibility:public",

View File

@@ -0,0 +1,23 @@
# Objective-C library target that exposes the DCT helpers as the "DCT" module.
objc_library(
name = "DCT",
enable_modules = True,
module_name = "DCT",
# Implementation files and any private headers living next to them.
srcs = glob([
"Sources/**/*.m",
"Sources/**/*.h",
]),
# Headers made available to dependents of this target.
hdrs = glob([
"PublicHeaders/**/*.h",
]),
# Adds the public header directory to the include search path.
includes = [
"PublicHeaders",
],
# System frameworks linked into the target.
sdk_frameworks = [
"Foundation",
"Accelerate",
],
visibility = [
"//visibility:public",
],
)

View File

@@ -0,0 +1,14 @@
#ifndef DctImageTransform_h
#define DctImageTransform_h
#import <Foundation/Foundation.h>
#import <DCT/YuvConversion.h>
/* Builds the quantization divisor tables used by the forward transform for the
 * given quality. The implementation clamps quality to 1...100 and returns
 * 4 * 64 16-bit entries (reciprocal, correction, scale and shift tables).
 */
NSData *generateForwardDctData(int quality);
/* Forward-transforms `pixels` into `coefficients`.
 * NOTE(review): presumably `dctData` must be the object returned by
 * generateForwardDctData() and `bytesPerRow` is the stride of `pixels` - confirm
 * against the implementation.
 */
void performForwardDct(uint8_t const *pixels, int16_t *coefficients, int width, int height, int bytesPerRow, NSData *dctData);
/* Builds the 64-entry 16-bit dequantization multiplier table used by the inverse
 * transform for the given quality (clamped to 1...100 by the implementation).
 */
NSData *generateInverseDctData(int quality);
/* Inverse-transforms `coefficients` back into `pixels`.
 * NOTE(review): presumably `idctData` must be the object returned by
 * generateInverseDctData(), and `coefficientsPerRow` / `bytesPerRow` are the
 * strides of the two buffers - confirm against the implementation.
 */
void performInverseDct(int16_t const *coefficients, uint8_t *pixels, int width, int height, int coefficientsPerRow, int bytesPerRow, NSData *idctData);
#endif /* DctImageTransform_h */

View File

@@ -0,0 +1,9 @@
#ifndef YuvConversion_h
#define YuvConversion_h
#import <Foundation/Foundation.h>
/* Splits an interleaved 4-channel image into separate Y, U, V and A planes.
 * NOTE(review): the function name says RGBA while the parameter is called
 * `argb`; the actual channel order and the expected plane sizes are not
 * visible from this header - confirm against the implementation.
 */
void splitRGBAIntoYUVAPlanes(uint8_t const *argb, uint8_t *outY, uint8_t *outU, uint8_t *outV, uint8_t *outA, int width, int height, int bytesPerRow);
/* Recombines separate Y, U, V and A planes into an interleaved 4-channel image.
 * NOTE(review): "ARBB" in the name looks like a typo for "ARGB"; it is part of
 * the public interface, so it is left unchanged here.
 */
void combineYUVAPlanesIntoARBB(uint8_t *argb, uint8_t const *inY, uint8_t const *inU, uint8_t const *inV, uint8_t const *inA, int width, int height, int bytesPerRow);
#endif /* YuvConversion_h */

View File

@@ -0,0 +1,991 @@
#import <DCT/DCT.h>
/* Basic scalar types and limits used by the DCT code below. */
typedef long JLONG; /* wide integer used for intermediate products */
typedef unsigned char JSAMPLE; /* one 8-bit pixel sample */
#define GETJSAMPLE(value) ((int)(value))
#define MAXJSAMPLE 255
#define CENTERJSAMPLE 128 /* added back to samples at the end of the inverse DCT */
typedef short JCOEF; /* one 16-bit DCT coefficient */
typedef unsigned int JDIMENSION;
#define JPEG_MAX_DIMENSION 65500L /* a tad under 64K to prevent overflows */
#define MULTIPLIER short /* prefer 16-bit with SIMD for parallelism */
typedef MULTIPLIER IFAST_MULT_TYPE; /* 16 bits is OK, use short if faster */
#define IFAST_SCALE_BITS 2 /* fractional bits in scale factors */
/* Various constants determining the sizes of things.
* All of these are specified by the JPEG standard, so don't change them
* if you want to be compatible.
*/
#define DCTSIZE 8 /* The basic DCT block is 8x8 samples */
#define DCTSIZE2 64 /* DCTSIZE squared; # of elements in a block */
#define NUM_QUANT_TBLS 4 /* Quantization tables are numbered 0..3 */
#define NUM_HUFF_TBLS 4 /* Huffman tables are numbered 0..3 */
#define NUM_ARITH_TBLS 16 /* Arith-coding tables are numbered 0..15 */
#define MAX_COMPS_IN_SCAN 4 /* JPEG limit on # of components in one scan */
#define MAX_SAMP_FACTOR 4 /* JPEG limit on sampling factors */
/* Unfortunately, some bozo at Adobe saw no reason to be bound by the standard;
* the PostScript DCT filter can emit files with many more than 10 blocks/MCU.
* If you happen to run across such a file, you can up D_MAX_BLOCKS_IN_MCU
* to handle it. We even let you do this from the jconfig.h file. However,
* we strongly discourage changing C_MAX_BLOCKS_IN_MCU; just because Adobe
* sometimes emits noncompliant files doesn't mean you should too.
*/
#define C_MAX_BLOCKS_IN_MCU 10 /* compressor's limit on blocks per MCU */
#ifndef D_MAX_BLOCKS_IN_MCU
#define D_MAX_BLOCKS_IN_MCU 10 /* decompressor's limit on blocks per MCU */
#endif
/* Data structures for images (arrays of samples and of DCT coefficients).
*/
typedef JSAMPLE *JSAMPROW; /* ptr to one image row of pixel samples. */
typedef JSAMPROW *JSAMPARRAY; /* ptr to some rows (a 2-D sample array) */
typedef JSAMPARRAY *JSAMPIMAGE; /* a 3-D sample array: top index is color */
typedef JCOEF JBLOCK[DCTSIZE2]; /* one block of coefficients */
typedef JBLOCK *JBLOCKROW; /* pointer to one row of coefficient blocks */
typedef JBLOCKROW *JBLOCKARRAY; /* a 2-D array of coefficient blocks */
typedef JBLOCKARRAY *JBLOCKIMAGE; /* a 3-D array of coefficient blocks */
typedef JCOEF *JCOEFPTR; /* useful in a couple of places */
#include <arm_neon.h>
/* jsimd_idct_ifast_neon() performs dequantization and a fast, not so accurate
* inverse DCT (Discrete Cosine Transform) on one block of coefficients. It
* uses the same calculations and produces exactly the same output as IJG's
* original jpeg_idct_ifast() function, which can be found in jidctfst.c.
*
* Scaled integer constants are used to avoid floating-point arithmetic:
* 0.082392200 = 2688 * 2^-15
* 0.414213562 = 13568 * 2^-15
* 0.847759065 = 27776 * 2^-15
* 0.613125930 = 20096 * 2^-15
*
* See jidctfst.c for further details of the IDCT algorithm. Where possible,
* the variable names and comments here in jsimd_idct_ifast_neon() match up
* with those in jpeg_idct_ifast().
*/
/* Extra bits of precision carried through the inverse DCT; removed (together
 * with 3 more bits) by the final narrowing shift in jsimd_idct_ifast_neon().
 */
#define PASS1_BITS 2
/* Inverse-DCT constants, scaled by 2^15 (see the table in the comment above). */
#define F_0_082 2688
#define F_0_414 13568
#define F_0_847 27776
#define F_0_613 20096
__attribute__((aligned(16))) static const int16_t jsimd_idct_ifast_neon_consts[] = {
F_0_082, F_0_414, F_0_847, F_0_613
};
/* Forward-DCT constants, also scaled by 2^15 for use with vqdmulh:
 * 0.382683433 ~= 12544 * 2^-15
 * 0.541196100 ~= 17792 * 2^-15
 * 0.707106781 ~= 23168 * 2^-15
 * 0.306562965 ~= 9984 * 2^-15
 */
#define F_0_382 12544
#define F_0_541 17792
#define F_0_707 23168
#define F_0_306 9984
__attribute__((aligned(16))) static const int16_t jsimd_fdct_ifast_neon_consts[] = {
F_0_382, F_0_541, F_0_707, F_0_306
};
typedef short DCTELEM; /* prefer 16 bit with SIMD for parallelism */
typedef unsigned short UDCTELEM; /* unsigned counterpart of DCTELEM */
typedef unsigned int UDCTELEM2; /* wider unsigned type for DCTELEM products */
/* Fast integer forward DCT of one 8x8 block, performed in place.
 *
 * `data` must point to DCTSIZE2 (64) 16-bit values; they are read, transformed
 * in two 1-D passes (with a transpose in between) and the 64 results are
 * written back to the same buffer, one 8-element row per store.
 * All arithmetic is done on 16-bit lanes; the fractional multiplies use
 * vqdmulh with the 2^15-scaled constants in jsimd_fdct_ifast_neon_consts.
 */
static void jsimd_fdct_ifast_neon(DCTELEM *data) {
/* Load an 8x8 block of samples into Neon registers. De-interleaving loads
* are used, followed by vuzp to transpose the block such that we have a
* column of samples per vector - allowing all rows to be processed at once.
*/
int16x8x4_t data1 = vld4q_s16(data);
int16x8x4_t data2 = vld4q_s16(data + 4 * DCTSIZE);
int16x8x2_t cols_04 = vuzpq_s16(data1.val[0], data2.val[0]);
int16x8x2_t cols_15 = vuzpq_s16(data1.val[1], data2.val[1]);
int16x8x2_t cols_26 = vuzpq_s16(data1.val[2], data2.val[2]);
int16x8x2_t cols_37 = vuzpq_s16(data1.val[3], data2.val[3]);
int16x8_t col0 = cols_04.val[0];
int16x8_t col1 = cols_15.val[0];
int16x8_t col2 = cols_26.val[0];
int16x8_t col3 = cols_37.val[0];
int16x8_t col4 = cols_04.val[1];
int16x8_t col5 = cols_15.val[1];
int16x8_t col6 = cols_26.val[1];
int16x8_t col7 = cols_37.val[1];
/* Pass 1: process rows. */
/* Load DCT conversion constants. */
const int16x4_t consts = vld1_s16(jsimd_fdct_ifast_neon_consts);
/* Butterfly stage: sums and differences of mirrored columns. */
int16x8_t tmp0 = vaddq_s16(col0, col7);
int16x8_t tmp7 = vsubq_s16(col0, col7);
int16x8_t tmp1 = vaddq_s16(col1, col6);
int16x8_t tmp6 = vsubq_s16(col1, col6);
int16x8_t tmp2 = vaddq_s16(col2, col5);
int16x8_t tmp5 = vsubq_s16(col2, col5);
int16x8_t tmp3 = vaddq_s16(col3, col4);
int16x8_t tmp4 = vsubq_s16(col3, col4);
/* Even part */
int16x8_t tmp10 = vaddq_s16(tmp0, tmp3); /* phase 2 */
int16x8_t tmp13 = vsubq_s16(tmp0, tmp3);
int16x8_t tmp11 = vaddq_s16(tmp1, tmp2);
int16x8_t tmp12 = vsubq_s16(tmp1, tmp2);
col0 = vaddq_s16(tmp10, tmp11); /* phase 3 */
col4 = vsubq_s16(tmp10, tmp11);
/* Lane 2 of consts is F_0_707. */
int16x8_t z1 = vqdmulhq_lane_s16(vaddq_s16(tmp12, tmp13), consts, 2);
col2 = vaddq_s16(tmp13, z1); /* phase 5 */
col6 = vsubq_s16(tmp13, z1);
/* Odd part */
tmp10 = vaddq_s16(tmp4, tmp5); /* phase 2 */
tmp11 = vaddq_s16(tmp5, tmp6);
tmp12 = vaddq_s16(tmp6, tmp7);
/* Lanes 0, 1 and 3 of consts are F_0_382, F_0_541 and F_0_306. */
int16x8_t z5 = vqdmulhq_lane_s16(vsubq_s16(tmp10, tmp12), consts, 0);
int16x8_t z2 = vqdmulhq_lane_s16(tmp10, consts, 1);
z2 = vaddq_s16(z2, z5);
int16x8_t z4 = vqdmulhq_lane_s16(tmp12, consts, 3);
/* Adding tmp12 itself supplies the integer part of the multiplier
* (constant lane 3 only holds the fractional 0.306...).
*/
z5 = vaddq_s16(tmp12, z5);
z4 = vaddq_s16(z4, z5);
int16x8_t z3 = vqdmulhq_lane_s16(tmp11, consts, 2);
int16x8_t z11 = vaddq_s16(tmp7, z3); /* phase 5 */
int16x8_t z13 = vsubq_s16(tmp7, z3);
col5 = vaddq_s16(z13, z2); /* phase 6 */
col3 = vsubq_s16(z13, z2);
col1 = vaddq_s16(z11, z4);
col7 = vsubq_s16(z11, z4);
/* Transpose to work on columns in pass 2. */
int16x8x2_t cols_01 = vtrnq_s16(col0, col1);
int16x8x2_t cols_23 = vtrnq_s16(col2, col3);
int16x8x2_t cols_45 = vtrnq_s16(col4, col5);
int16x8x2_t cols_67 = vtrnq_s16(col6, col7);
int32x4x2_t cols_0145_l = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[0]),
vreinterpretq_s32_s16(cols_45.val[0]));
int32x4x2_t cols_0145_h = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[1]),
vreinterpretq_s32_s16(cols_45.val[1]));
int32x4x2_t cols_2367_l = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[0]),
vreinterpretq_s32_s16(cols_67.val[0]));
int32x4x2_t cols_2367_h = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[1]),
vreinterpretq_s32_s16(cols_67.val[1]));
int32x4x2_t rows_04 = vzipq_s32(cols_0145_l.val[0], cols_2367_l.val[0]);
int32x4x2_t rows_15 = vzipq_s32(cols_0145_h.val[0], cols_2367_h.val[0]);
int32x4x2_t rows_26 = vzipq_s32(cols_0145_l.val[1], cols_2367_l.val[1]);
int32x4x2_t rows_37 = vzipq_s32(cols_0145_h.val[1], cols_2367_h.val[1]);
int16x8_t row0 = vreinterpretq_s16_s32(rows_04.val[0]);
int16x8_t row1 = vreinterpretq_s16_s32(rows_15.val[0]);
int16x8_t row2 = vreinterpretq_s16_s32(rows_26.val[0]);
int16x8_t row3 = vreinterpretq_s16_s32(rows_37.val[0]);
int16x8_t row4 = vreinterpretq_s16_s32(rows_04.val[1]);
int16x8_t row5 = vreinterpretq_s16_s32(rows_15.val[1]);
int16x8_t row6 = vreinterpretq_s16_s32(rows_26.val[1]);
int16x8_t row7 = vreinterpretq_s16_s32(rows_37.val[1]);
/* Pass 2: process columns. */
/* Same sequence of operations as pass 1, applied to the transposed data. */
tmp0 = vaddq_s16(row0, row7);
tmp7 = vsubq_s16(row0, row7);
tmp1 = vaddq_s16(row1, row6);
tmp6 = vsubq_s16(row1, row6);
tmp2 = vaddq_s16(row2, row5);
tmp5 = vsubq_s16(row2, row5);
tmp3 = vaddq_s16(row3, row4);
tmp4 = vsubq_s16(row3, row4);
/* Even part */
tmp10 = vaddq_s16(tmp0, tmp3); /* phase 2 */
tmp13 = vsubq_s16(tmp0, tmp3);
tmp11 = vaddq_s16(tmp1, tmp2);
tmp12 = vsubq_s16(tmp1, tmp2);
row0 = vaddq_s16(tmp10, tmp11); /* phase 3 */
row4 = vsubq_s16(tmp10, tmp11);
z1 = vqdmulhq_lane_s16(vaddq_s16(tmp12, tmp13), consts, 2);
row2 = vaddq_s16(tmp13, z1); /* phase 5 */
row6 = vsubq_s16(tmp13, z1);
/* Odd part */
tmp10 = vaddq_s16(tmp4, tmp5); /* phase 2 */
tmp11 = vaddq_s16(tmp5, tmp6);
tmp12 = vaddq_s16(tmp6, tmp7);
z5 = vqdmulhq_lane_s16(vsubq_s16(tmp10, tmp12), consts, 0);
z2 = vqdmulhq_lane_s16(tmp10, consts, 1);
z2 = vaddq_s16(z2, z5);
z4 = vqdmulhq_lane_s16(tmp12, consts, 3);
z5 = vaddq_s16(tmp12, z5);
z4 = vaddq_s16(z4, z5);
z3 = vqdmulhq_lane_s16(tmp11, consts, 2);
z11 = vaddq_s16(tmp7, z3); /* phase 5 */
z13 = vsubq_s16(tmp7, z3);
row5 = vaddq_s16(z13, z2); /* phase 6 */
row3 = vsubq_s16(z13, z2);
row1 = vaddq_s16(z11, z4);
row7 = vsubq_s16(z11, z4);
/* Write the transformed block back over the input, row by row. */
vst1q_s16(data + 0 * DCTSIZE, row0);
vst1q_s16(data + 1 * DCTSIZE, row1);
vst1q_s16(data + 2 * DCTSIZE, row2);
vst1q_s16(data + 3 * DCTSIZE, row3);
vst1q_s16(data + 4 * DCTSIZE, row4);
vst1q_s16(data + 5 * DCTSIZE, row5);
vst1q_s16(data + 6 * DCTSIZE, row6);
vst1q_s16(data + 7 * DCTSIZE, row7);
}
/* Dequantizes one 8x8 coefficient block and applies the fast integer inverse DCT.
 *
 * dct_table:  DCTSIZE2 16-bit multipliers (read as IFAST_MULT_TYPE), one per
 *             coefficient, in the same row-major order as coef_block.
 * coef_block: DCTSIZE2 16-bit quantized coefficients.
 * output_buf: destination for the 64 resulting 8-bit samples; the eight rows
 *             are written contiguously, DCTSIZE bytes apart.
 *
 * Pass 1 works on columns and has three shortcuts based on which AC
 * coefficients are zero; pass 2 works on the transposed result. The final
 * step narrows to 8 bits with saturation and re-centres around CENTERJSAMPLE.
 */
static void jsimd_idct_ifast_neon(void *dct_table, JCOEFPTR coef_block,
JSAMPROW output_buf)
{
IFAST_MULT_TYPE *quantptr = dct_table;
/* Load DCT coefficients. */
int16x8_t row0 = vld1q_s16(coef_block + 0 * DCTSIZE);
int16x8_t row1 = vld1q_s16(coef_block + 1 * DCTSIZE);
int16x8_t row2 = vld1q_s16(coef_block + 2 * DCTSIZE);
int16x8_t row3 = vld1q_s16(coef_block + 3 * DCTSIZE);
int16x8_t row4 = vld1q_s16(coef_block + 4 * DCTSIZE);
int16x8_t row5 = vld1q_s16(coef_block + 5 * DCTSIZE);
int16x8_t row6 = vld1q_s16(coef_block + 6 * DCTSIZE);
int16x8_t row7 = vld1q_s16(coef_block + 7 * DCTSIZE);
/* Load quantization table values for DC coefficients. */
int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE);
/* Dequantize DC coefficients. */
row0 = vmulq_s16(row0, quant_row0);
/* Construct bitmap to test if all AC coefficients are 0. */
/* Rows 1..7 are OR-ed together while still quantized: a zero coefficient
* stays zero after dequantization, so the test is valid before multiplying.
*/
int16x8_t bitmap = vorrq_s16(row1, row2);
bitmap = vorrq_s16(bitmap, row3);
bitmap = vorrq_s16(bitmap, row4);
bitmap = vorrq_s16(bitmap, row5);
bitmap = vorrq_s16(bitmap, row6);
bitmap = vorrq_s16(bitmap, row7);
/* 64-bit lane 0 covers columns 0-3, lane 1 covers columns 4-7. */
int64_t left_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 0);
int64_t right_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 1);
/* Load IDCT conversion constants. */
const int16x4_t consts = vld1_s16(jsimd_idct_ifast_neon_consts);
if (left_ac_bitmap == 0 && right_ac_bitmap == 0) {
/* All AC coefficients are zero.
* Compute DC values and duplicate into vectors.
*/
int16x8_t dcval = row0;
row1 = dcval;
row2 = dcval;
row3 = dcval;
row4 = dcval;
row5 = dcval;
row6 = dcval;
row7 = dcval;
} else if (left_ac_bitmap == 0) {
/* AC coefficients are zero for columns 0, 1, 2, and 3.
* Use DC values for these columns.
*/
int16x4_t dcval = vget_low_s16(row0);
/* Commence regular fast IDCT computation for columns 4, 5, 6, and 7. */
/* Load quantization table. */
/* The "+ 4" offset selects the multipliers for columns 4-7 of each row. */
int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4);
int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4);
int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4);
int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE + 4);
int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE + 4);
int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4);
int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4);
/* Even part: dequantize DCT coefficients. */
int16x4_t tmp0 = vget_high_s16(row0);
int16x4_t tmp1 = vmul_s16(vget_high_s16(row2), quant_row2);
int16x4_t tmp2 = vmul_s16(vget_high_s16(row4), quant_row4);
int16x4_t tmp3 = vmul_s16(vget_high_s16(row6), quant_row6);
int16x4_t tmp10 = vadd_s16(tmp0, tmp2); /* phase 3 */
int16x4_t tmp11 = vsub_s16(tmp0, tmp2);
int16x4_t tmp13 = vadd_s16(tmp1, tmp3); /* phases 5-3 */
int16x4_t tmp1_sub_tmp3 = vsub_s16(tmp1, tmp3);
int16x4_t tmp12 = vqdmulh_lane_s16(tmp1_sub_tmp3, consts, 1);
tmp12 = vadd_s16(tmp12, tmp1_sub_tmp3);
tmp12 = vsub_s16(tmp12, tmp13);
tmp0 = vadd_s16(tmp10, tmp13); /* phase 2 */
tmp3 = vsub_s16(tmp10, tmp13);
tmp1 = vadd_s16(tmp11, tmp12);
tmp2 = vsub_s16(tmp11, tmp12);
/* Odd part: dequantize DCT coefficients. */
int16x4_t tmp4 = vmul_s16(vget_high_s16(row1), quant_row1);
int16x4_t tmp5 = vmul_s16(vget_high_s16(row3), quant_row3);
int16x4_t tmp6 = vmul_s16(vget_high_s16(row5), quant_row5);
int16x4_t tmp7 = vmul_s16(vget_high_s16(row7), quant_row7);
int16x4_t z13 = vadd_s16(tmp6, tmp5); /* phase 6 */
int16x4_t neg_z10 = vsub_s16(tmp5, tmp6);
int16x4_t z11 = vadd_s16(tmp4, tmp7);
int16x4_t z12 = vsub_s16(tmp4, tmp7);
tmp7 = vadd_s16(z11, z13); /* phase 5 */
int16x4_t z11_sub_z13 = vsub_s16(z11, z13);
tmp11 = vqdmulh_lane_s16(z11_sub_z13, consts, 1);
tmp11 = vadd_s16(tmp11, z11_sub_z13);
int16x4_t z10_add_z12 = vsub_s16(z12, neg_z10);
int16x4_t z5 = vqdmulh_lane_s16(z10_add_z12, consts, 2);
z5 = vadd_s16(z5, z10_add_z12);
tmp10 = vqdmulh_lane_s16(z12, consts, 0);
tmp10 = vadd_s16(tmp10, z12);
tmp10 = vsub_s16(tmp10, z5);
tmp12 = vqdmulh_lane_s16(neg_z10, consts, 3);
tmp12 = vadd_s16(tmp12, vadd_s16(neg_z10, neg_z10));
tmp12 = vadd_s16(tmp12, z5);
tmp6 = vsub_s16(tmp12, tmp7); /* phase 2 */
tmp5 = vsub_s16(tmp11, tmp6);
tmp4 = vadd_s16(tmp10, tmp5);
/* Low half (columns 0-3) is the DC value, high half the computed result. */
row0 = vcombine_s16(dcval, vadd_s16(tmp0, tmp7));
row7 = vcombine_s16(dcval, vsub_s16(tmp0, tmp7));
row1 = vcombine_s16(dcval, vadd_s16(tmp1, tmp6));
row6 = vcombine_s16(dcval, vsub_s16(tmp1, tmp6));
row2 = vcombine_s16(dcval, vadd_s16(tmp2, tmp5));
row5 = vcombine_s16(dcval, vsub_s16(tmp2, tmp5));
row4 = vcombine_s16(dcval, vadd_s16(tmp3, tmp4));
row3 = vcombine_s16(dcval, vsub_s16(tmp3, tmp4));
} else if (right_ac_bitmap == 0) {
/* AC coefficients are zero for columns 4, 5, 6, and 7.
* Use DC values for these columns.
*/
int16x4_t dcval = vget_high_s16(row0);
/* Commence regular fast IDCT computation for columns 0, 1, 2, and 3. */
/* Load quantization table. */
int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE);
int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE);
int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE);
int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE);
int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE);
int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE);
int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE);
/* Even part: dequantize DCT coefficients. */
int16x4_t tmp0 = vget_low_s16(row0);
int16x4_t tmp1 = vmul_s16(vget_low_s16(row2), quant_row2);
int16x4_t tmp2 = vmul_s16(vget_low_s16(row4), quant_row4);
int16x4_t tmp3 = vmul_s16(vget_low_s16(row6), quant_row6);
int16x4_t tmp10 = vadd_s16(tmp0, tmp2); /* phase 3 */
int16x4_t tmp11 = vsub_s16(tmp0, tmp2);
int16x4_t tmp13 = vadd_s16(tmp1, tmp3); /* phases 5-3 */
int16x4_t tmp1_sub_tmp3 = vsub_s16(tmp1, tmp3);
int16x4_t tmp12 = vqdmulh_lane_s16(tmp1_sub_tmp3, consts, 1);
tmp12 = vadd_s16(tmp12, tmp1_sub_tmp3);
tmp12 = vsub_s16(tmp12, tmp13);
tmp0 = vadd_s16(tmp10, tmp13); /* phase 2 */
tmp3 = vsub_s16(tmp10, tmp13);
tmp1 = vadd_s16(tmp11, tmp12);
tmp2 = vsub_s16(tmp11, tmp12);
/* Odd part: dequantize DCT coefficients. */
int16x4_t tmp4 = vmul_s16(vget_low_s16(row1), quant_row1);
int16x4_t tmp5 = vmul_s16(vget_low_s16(row3), quant_row3);
int16x4_t tmp6 = vmul_s16(vget_low_s16(row5), quant_row5);
int16x4_t tmp7 = vmul_s16(vget_low_s16(row7), quant_row7);
int16x4_t z13 = vadd_s16(tmp6, tmp5); /* phase 6 */
int16x4_t neg_z10 = vsub_s16(tmp5, tmp6);
int16x4_t z11 = vadd_s16(tmp4, tmp7);
int16x4_t z12 = vsub_s16(tmp4, tmp7);
tmp7 = vadd_s16(z11, z13); /* phase 5 */
int16x4_t z11_sub_z13 = vsub_s16(z11, z13);
tmp11 = vqdmulh_lane_s16(z11_sub_z13, consts, 1);
tmp11 = vadd_s16(tmp11, z11_sub_z13);
int16x4_t z10_add_z12 = vsub_s16(z12, neg_z10);
int16x4_t z5 = vqdmulh_lane_s16(z10_add_z12, consts, 2);
z5 = vadd_s16(z5, z10_add_z12);
tmp10 = vqdmulh_lane_s16(z12, consts, 0);
tmp10 = vadd_s16(tmp10, z12);
tmp10 = vsub_s16(tmp10, z5);
tmp12 = vqdmulh_lane_s16(neg_z10, consts, 3);
tmp12 = vadd_s16(tmp12, vadd_s16(neg_z10, neg_z10));
tmp12 = vadd_s16(tmp12, z5);
tmp6 = vsub_s16(tmp12, tmp7); /* phase 2 */
tmp5 = vsub_s16(tmp11, tmp6);
tmp4 = vadd_s16(tmp10, tmp5);
/* Low half is the computed result, high half (columns 4-7) the DC value. */
row0 = vcombine_s16(vadd_s16(tmp0, tmp7), dcval);
row7 = vcombine_s16(vsub_s16(tmp0, tmp7), dcval);
row1 = vcombine_s16(vadd_s16(tmp1, tmp6), dcval);
row6 = vcombine_s16(vsub_s16(tmp1, tmp6), dcval);
row2 = vcombine_s16(vadd_s16(tmp2, tmp5), dcval);
row5 = vcombine_s16(vsub_s16(tmp2, tmp5), dcval);
row4 = vcombine_s16(vadd_s16(tmp3, tmp4), dcval);
row3 = vcombine_s16(vsub_s16(tmp3, tmp4), dcval);
} else {
/* Some AC coefficients are non-zero; full IDCT calculation required. */
/* Load quantization table. */
int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE);
int16x8_t quant_row2 = vld1q_s16(quantptr + 2 * DCTSIZE);
int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE);
int16x8_t quant_row4 = vld1q_s16(quantptr + 4 * DCTSIZE);
int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE);
int16x8_t quant_row6 = vld1q_s16(quantptr + 6 * DCTSIZE);
int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE);
/* Even part: dequantize DCT coefficients. */
int16x8_t tmp0 = row0;
int16x8_t tmp1 = vmulq_s16(row2, quant_row2);
int16x8_t tmp2 = vmulq_s16(row4, quant_row4);
int16x8_t tmp3 = vmulq_s16(row6, quant_row6);
int16x8_t tmp10 = vaddq_s16(tmp0, tmp2); /* phase 3 */
int16x8_t tmp11 = vsubq_s16(tmp0, tmp2);
int16x8_t tmp13 = vaddq_s16(tmp1, tmp3); /* phases 5-3 */
int16x8_t tmp1_sub_tmp3 = vsubq_s16(tmp1, tmp3);
int16x8_t tmp12 = vqdmulhq_lane_s16(tmp1_sub_tmp3, consts, 1);
tmp12 = vaddq_s16(tmp12, tmp1_sub_tmp3);
tmp12 = vsubq_s16(tmp12, tmp13);
tmp0 = vaddq_s16(tmp10, tmp13); /* phase 2 */
tmp3 = vsubq_s16(tmp10, tmp13);
tmp1 = vaddq_s16(tmp11, tmp12);
tmp2 = vsubq_s16(tmp11, tmp12);
/* Odd part: dequantize DCT coefficients. */
int16x8_t tmp4 = vmulq_s16(row1, quant_row1);
int16x8_t tmp5 = vmulq_s16(row3, quant_row3);
int16x8_t tmp6 = vmulq_s16(row5, quant_row5);
int16x8_t tmp7 = vmulq_s16(row7, quant_row7);
int16x8_t z13 = vaddq_s16(tmp6, tmp5); /* phase 6 */
int16x8_t neg_z10 = vsubq_s16(tmp5, tmp6);
int16x8_t z11 = vaddq_s16(tmp4, tmp7);
int16x8_t z12 = vsubq_s16(tmp4, tmp7);
tmp7 = vaddq_s16(z11, z13); /* phase 5 */
int16x8_t z11_sub_z13 = vsubq_s16(z11, z13);
tmp11 = vqdmulhq_lane_s16(z11_sub_z13, consts, 1);
tmp11 = vaddq_s16(tmp11, z11_sub_z13);
int16x8_t z10_add_z12 = vsubq_s16(z12, neg_z10);
int16x8_t z5 = vqdmulhq_lane_s16(z10_add_z12, consts, 2);
z5 = vaddq_s16(z5, z10_add_z12);
tmp10 = vqdmulhq_lane_s16(z12, consts, 0);
tmp10 = vaddq_s16(tmp10, z12);
tmp10 = vsubq_s16(tmp10, z5);
tmp12 = vqdmulhq_lane_s16(neg_z10, consts, 3);
tmp12 = vaddq_s16(tmp12, vaddq_s16(neg_z10, neg_z10));
tmp12 = vaddq_s16(tmp12, z5);
tmp6 = vsubq_s16(tmp12, tmp7); /* phase 2 */
tmp5 = vsubq_s16(tmp11, tmp6);
tmp4 = vaddq_s16(tmp10, tmp5);
row0 = vaddq_s16(tmp0, tmp7);
row7 = vsubq_s16(tmp0, tmp7);
row1 = vaddq_s16(tmp1, tmp6);
row6 = vsubq_s16(tmp1, tmp6);
row2 = vaddq_s16(tmp2, tmp5);
row5 = vsubq_s16(tmp2, tmp5);
row4 = vaddq_s16(tmp3, tmp4);
row3 = vsubq_s16(tmp3, tmp4);
}
/* Transpose rows to work on columns in pass 2. */
int16x8x2_t rows_01 = vtrnq_s16(row0, row1);
int16x8x2_t rows_23 = vtrnq_s16(row2, row3);
int16x8x2_t rows_45 = vtrnq_s16(row4, row5);
int16x8x2_t rows_67 = vtrnq_s16(row6, row7);
int32x4x2_t rows_0145_l = vtrnq_s32(vreinterpretq_s32_s16(rows_01.val[0]),
vreinterpretq_s32_s16(rows_45.val[0]));
int32x4x2_t rows_0145_h = vtrnq_s32(vreinterpretq_s32_s16(rows_01.val[1]),
vreinterpretq_s32_s16(rows_45.val[1]));
int32x4x2_t rows_2367_l = vtrnq_s32(vreinterpretq_s32_s16(rows_23.val[0]),
vreinterpretq_s32_s16(rows_67.val[0]));
int32x4x2_t rows_2367_h = vtrnq_s32(vreinterpretq_s32_s16(rows_23.val[1]),
vreinterpretq_s32_s16(rows_67.val[1]));
int32x4x2_t cols_04 = vzipq_s32(rows_0145_l.val[0], rows_2367_l.val[0]);
int32x4x2_t cols_15 = vzipq_s32(rows_0145_h.val[0], rows_2367_h.val[0]);
int32x4x2_t cols_26 = vzipq_s32(rows_0145_l.val[1], rows_2367_l.val[1]);
int32x4x2_t cols_37 = vzipq_s32(rows_0145_h.val[1], rows_2367_h.val[1]);
int16x8_t col0 = vreinterpretq_s16_s32(cols_04.val[0]);
int16x8_t col1 = vreinterpretq_s16_s32(cols_15.val[0]);
int16x8_t col2 = vreinterpretq_s16_s32(cols_26.val[0]);
int16x8_t col3 = vreinterpretq_s16_s32(cols_37.val[0]);
int16x8_t col4 = vreinterpretq_s16_s32(cols_04.val[1]);
int16x8_t col5 = vreinterpretq_s16_s32(cols_15.val[1]);
int16x8_t col6 = vreinterpretq_s16_s32(cols_26.val[1]);
int16x8_t col7 = vreinterpretq_s16_s32(cols_37.val[1]);
/* 1-D IDCT, pass 2 */
/* Even part */
int16x8_t tmp10 = vaddq_s16(col0, col4);
int16x8_t tmp11 = vsubq_s16(col0, col4);
int16x8_t tmp13 = vaddq_s16(col2, col6);
int16x8_t col2_sub_col6 = vsubq_s16(col2, col6);
int16x8_t tmp12 = vqdmulhq_lane_s16(col2_sub_col6, consts, 1);
tmp12 = vaddq_s16(tmp12, col2_sub_col6);
tmp12 = vsubq_s16(tmp12, tmp13);
int16x8_t tmp0 = vaddq_s16(tmp10, tmp13);
int16x8_t tmp3 = vsubq_s16(tmp10, tmp13);
int16x8_t tmp1 = vaddq_s16(tmp11, tmp12);
int16x8_t tmp2 = vsubq_s16(tmp11, tmp12);
/* Odd part */
int16x8_t z13 = vaddq_s16(col5, col3);
int16x8_t neg_z10 = vsubq_s16(col3, col5);
int16x8_t z11 = vaddq_s16(col1, col7);
int16x8_t z12 = vsubq_s16(col1, col7);
int16x8_t tmp7 = vaddq_s16(z11, z13); /* phase 5 */
int16x8_t z11_sub_z13 = vsubq_s16(z11, z13);
tmp11 = vqdmulhq_lane_s16(z11_sub_z13, consts, 1);
tmp11 = vaddq_s16(tmp11, z11_sub_z13);
int16x8_t z10_add_z12 = vsubq_s16(z12, neg_z10);
int16x8_t z5 = vqdmulhq_lane_s16(z10_add_z12, consts, 2);
z5 = vaddq_s16(z5, z10_add_z12);
tmp10 = vqdmulhq_lane_s16(z12, consts, 0);
tmp10 = vaddq_s16(tmp10, z12);
tmp10 = vsubq_s16(tmp10, z5);
tmp12 = vqdmulhq_lane_s16(neg_z10, consts, 3);
tmp12 = vaddq_s16(tmp12, vaddq_s16(neg_z10, neg_z10));
tmp12 = vaddq_s16(tmp12, z5);
int16x8_t tmp6 = vsubq_s16(tmp12, tmp7); /* phase 2 */
int16x8_t tmp5 = vsubq_s16(tmp11, tmp6);
int16x8_t tmp4 = vaddq_s16(tmp10, tmp5);
col0 = vaddq_s16(tmp0, tmp7);
col7 = vsubq_s16(tmp0, tmp7);
col1 = vaddq_s16(tmp1, tmp6);
col6 = vsubq_s16(tmp1, tmp6);
col2 = vaddq_s16(tmp2, tmp5);
col5 = vsubq_s16(tmp2, tmp5);
col4 = vaddq_s16(tmp3, tmp4);
col3 = vsubq_s16(tmp3, tmp4);
/* Shift right by PASS1_BITS + 3 (5 bits in total) and narrow to 8-bit.
* The narrowing saturates, so every value ends up in [-128, 127].
*/
int8x16_t cols_01_s8 = vcombine_s8(vqshrn_n_s16(col0, PASS1_BITS + 3),
vqshrn_n_s16(col1, PASS1_BITS + 3));
int8x16_t cols_45_s8 = vcombine_s8(vqshrn_n_s16(col4, PASS1_BITS + 3),
vqshrn_n_s16(col5, PASS1_BITS + 3));
int8x16_t cols_23_s8 = vcombine_s8(vqshrn_n_s16(col2, PASS1_BITS + 3),
vqshrn_n_s16(col3, PASS1_BITS + 3));
int8x16_t cols_67_s8 = vcombine_s8(vqshrn_n_s16(col6, PASS1_BITS + 3),
vqshrn_n_s16(col7, PASS1_BITS + 3));
/* Re-centre to the range [0-255]: adding CENTERJSAMPLE (128) with wrapping
* 8-bit arithmetic and reinterpreting as unsigned maps [-128, 127] onto
* [0, 255]; the clamping itself was done by the saturating narrow above.
*/
uint8x16_t cols_01 =
vreinterpretq_u8_s8
(vaddq_s8(cols_01_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
uint8x16_t cols_45 =
vreinterpretq_u8_s8
(vaddq_s8(cols_45_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
uint8x16_t cols_23 =
vreinterpretq_u8_s8
(vaddq_s8(cols_23_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
uint8x16_t cols_67 =
vreinterpretq_u8_s8
(vaddq_s8(cols_67_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
/* Transpose block to prepare for store. */
uint32x4x2_t cols_0415 = vzipq_u32(vreinterpretq_u32_u8(cols_01),
vreinterpretq_u32_u8(cols_45));
uint32x4x2_t cols_2637 = vzipq_u32(vreinterpretq_u32_u8(cols_23),
vreinterpretq_u32_u8(cols_67));
uint8x16x2_t cols_0145 = vtrnq_u8(vreinterpretq_u8_u32(cols_0415.val[0]),
vreinterpretq_u8_u32(cols_0415.val[1]));
uint8x16x2_t cols_2367 = vtrnq_u8(vreinterpretq_u8_u32(cols_2637.val[0]),
vreinterpretq_u8_u32(cols_2637.val[1]));
uint16x8x2_t rows_0426 = vtrnq_u16(vreinterpretq_u16_u8(cols_0145.val[0]),
vreinterpretq_u16_u8(cols_2367.val[0]));
uint16x8x2_t rows_1537 = vtrnq_u16(vreinterpretq_u16_u8(cols_0145.val[1]),
vreinterpretq_u16_u8(cols_2367.val[1]));
uint8x16_t rows_04 = vreinterpretq_u8_u16(rows_0426.val[0]);
uint8x16_t rows_15 = vreinterpretq_u8_u16(rows_1537.val[0]);
uint8x16_t rows_26 = vreinterpretq_u8_u16(rows_0426.val[1]);
uint8x16_t rows_37 = vreinterpretq_u8_u16(rows_1537.val[1]);
/* The eight output rows are laid out back to back, DCTSIZE bytes apart. */
JSAMPROW outptr0 = output_buf + DCTSIZE * 0;
JSAMPROW outptr1 = output_buf + DCTSIZE * 1;
JSAMPROW outptr2 = output_buf + DCTSIZE * 2;
JSAMPROW outptr3 = output_buf + DCTSIZE * 3;
JSAMPROW outptr4 = output_buf + DCTSIZE * 4;
JSAMPROW outptr5 = output_buf + DCTSIZE * 5;
JSAMPROW outptr6 = output_buf + DCTSIZE * 6;
JSAMPROW outptr7 = output_buf + DCTSIZE * 7;
/* Store DCT block to memory. */
/* Each 128-bit vector holds two rows: lane 0 is row k, lane 1 is row k + 4. */
vst1q_lane_u64((uint64_t *)outptr0, vreinterpretq_u64_u8(rows_04), 0);
vst1q_lane_u64((uint64_t *)outptr1, vreinterpretq_u64_u8(rows_15), 0);
vst1q_lane_u64((uint64_t *)outptr2, vreinterpretq_u64_u8(rows_26), 0);
vst1q_lane_u64((uint64_t *)outptr3, vreinterpretq_u64_u8(rows_37), 0);
vst1q_lane_u64((uint64_t *)outptr4, vreinterpretq_u64_u8(rows_04), 1);
vst1q_lane_u64((uint64_t *)outptr5, vreinterpretq_u64_u8(rows_15), 1);
vst1q_lane_u64((uint64_t *)outptr6, vreinterpretq_u64_u8(rows_26), 1);
vst1q_lane_u64((uint64_t *)outptr7, vreinterpretq_u64_u8(rows_37), 1);
}
/* Returns the 1-based position of the most significant set bit of `val`
 * (equivalently, the number of bits needed to represent it), or 0 when
 * `val` is zero.
 */
static int flss(uint16_t val) {
    int width = 0;

    /* Count how many times the value can be halved before it reaches zero. */
    for (; val != 0; val >>= 1) {
        width++;
    }
    return width;
}
/* Precomputes the values that let quantize() divide by `divisor` using a
 * multiply and a shift instead of an integer division.
 *
 * Four entries are written, each DCTSIZE2 elements apart, so that `dtbl`
 * can point at one column of four consecutive 64-entry tables:
 *   dtbl[DCTSIZE2 * 0]  reciprocal
 *   dtbl[DCTSIZE2 * 1]  correction (rounding term added before multiplying)
 *   dtbl[DCTSIZE2 * 2]  scale
 *   dtbl[DCTSIZE2 * 3]  shift
 *
 * Returns 1 when the combined shift `r` exceeds 16 bits, 0 otherwise
 * (including the unquantized case).
 *
 * A divisor of 0 is handled like a divisor of 1. The caller derives the
 * divisor with a truncating shift, which can produce 0 for the smallest
 * table entries (for example at quality 100); dividing by it below would be
 * undefined behaviour.
 */
static int compute_reciprocal(uint16_t divisor, DCTELEM *dtbl) {
    UDCTELEM2 fq, fr;
    UDCTELEM c;
    int b, r;

    if (divisor <= 1) {
        /* divisor == 1 means unquantized, so these reciprocal/correction/shift
         * values will cause the C quantization algorithm to act like the
         * identity function. Since only the C quantization algorithm is used in
         * these cases, the scale value is irrelevant.
         * divisor == 0 is not a usable quantizer and is treated the same way.
         */
        dtbl[DCTSIZE2 * 0] = (DCTELEM)1; /* reciprocal */
        dtbl[DCTSIZE2 * 1] = (DCTELEM)0; /* correction */
        dtbl[DCTSIZE2 * 2] = (DCTELEM)1; /* scale */
        dtbl[DCTSIZE2 * 3] = -(DCTELEM)(sizeof(DCTELEM) * 8); /* shift */
        return 0;
    }

    /* b is the zero-based index of the divisor's highest set bit. */
    b = flss(divisor) - 1;
    r = sizeof(DCTELEM) * 8 + b;

    fq = ((UDCTELEM2)1 << r) / divisor;
    fr = ((UDCTELEM2)1 << r) % divisor;

    c = divisor / 2; /* for rounding */

    if (fr == 0) { /* divisor is power of two */
        /* fq will be one bit too large to fit in DCTELEM, so adjust */
        fq >>= 1;
        r--;
    } else if (fr <= (divisor / 2U)) { /* fractional part is < 0.5 */
        c++;
    } else { /* fractional part is > 0.5 */
        fq++;
    }

    dtbl[DCTSIZE2 * 0] = (DCTELEM)fq; /* reciprocal */
    dtbl[DCTSIZE2 * 1] = (DCTELEM)c; /* correction + roundfactor */
#ifdef WITH_SIMD
    dtbl[DCTSIZE2 * 2] = (DCTELEM)(1 << (sizeof(DCTELEM) * 8 * 2 - r)); /* scale */
#else
    dtbl[DCTSIZE2 * 2] = 1;
#endif
    dtbl[DCTSIZE2 * 3] = (DCTELEM)r - sizeof(DCTELEM) * 8; /* shift */

    if (r <= 16) return 0;
    else return 1;
}
/* DESCALE is a plain right shift here, with no rounding term added first.
 * RIGHT_SHIFT itself is #defined further down, inside getDivisors(), so
 * DESCALE can only be expanded after that point in the file.
 */
#define DESCALE(x, n) RIGHT_SHIFT(x, n)
/* Multiply a DCTELEM variable by a JLONG constant, and immediately
* descale to yield a DCTELEM result.
*/
#define MULTIPLY(var, const) ((DCTELEM)DESCALE((var) * (const), CONST_BITS))
/* Plain product of the two operands; callers cast them to JLONG first. */
#define MULTIPLY16V16(var1, var2) ((var1) * (var2))
/* Base 8x8 quantization table, in row-major order. jpeg_set_quality() passes
 * it to jpeg_add_quant_table(), which scales every entry by the
 * quality-derived percentage.
 */
static DCTELEM std_luminance_quant_tbl[DCTSIZE2] = {
16, 11, 10, 16, 24, 40, 51, 61,
12, 12, 14, 19, 26, 58, 60, 55,
14, 13, 16, 24, 40, 57, 69, 56,
14, 17, 22, 29, 51, 87, 80, 62,
18, 22, 37, 56, 68, 109, 103, 77,
24, 35, 55, 64, 81, 104, 113, 92,
49, 64, 78, 87, 103, 121, 120, 101,
72, 92, 95, 98, 112, 100, 103, 99
};
/* Maps a user-facing quality rating to the percentage by which the base
 * quantization table is scaled.
 *
 * `quality` is clamped to 1..100 (so 0 can never be used as a divisor).
 *   - below 50:  5000 / quality   (scaling grows as quality drops)
 *   - 50..100:   200 - 2*quality  (100 at quality 50, 0 at quality 100)
 */
static int jpeg_quality_scaling(int quality)
{
    int clamped = quality;

    if (clamped < 1) {
        clamped = 1;
    } else if (clamped > 100) {
        clamped = 100;
    }

    return (clamped < 50) ? (5000 / clamped) : (200 - clamped * 2);
}
/* Fills `qtable` with `basicTable` scaled by `scale_factor` percent.
 *
 * Each entry is rounded to the nearest integer and limited to at least 1.
 * The upper limit is 255 when `forceBaseline` is set and 32767 otherwise.
 * Both tables hold DCTSIZE2 entries.
 */
static void jpeg_add_quant_table(DCTELEM *qtable, DCTELEM *basicTable, int scale_factor, bool forceBaseline)
{
    /* 32767 is the max quantizer needed for 12 bits; 255 is the baseline limit. */
    const long upperLimit = forceBaseline ? 255L : 32767L;

    for (int index = 0; index < DCTSIZE2; index++) {
        long scaled = ((long)basicTable[index] * scale_factor + 50L) / 100L;

        if (scaled <= 0L) {
            scaled = 1L;
        } else if (scaled > upperLimit) {
            scaled = upperLimit;
        }
        qtable[index] = (uint16_t)scaled;
    }
}
/* Fills `qtable` (DCTSIZE2 entries) for the given 0..100 quality rating,
 * using the default luminance table as the base. The result is not limited
 * to the baseline 1..255 range.
 */
static void jpeg_set_quality(DCTELEM *qtable, int quality)
{
    /* Translate the user rating into a percentage scaling of the base table. */
    const int scalePercent = jpeg_quality_scaling(quality);

    jpeg_add_quant_table(qtable, std_luminance_quant_tbl, scalePercent, false);
}
/* Builds the forward-DCT divisor tables for quantize().
 *
 * dtbl:   output, 4 * DCTSIZE2 entries laid out as four consecutive
 *         64-entry tables (reciprocal, correction, scale, shift); entry i of
 *         each table is filled by compute_reciprocal().
 * qtable: input, DCTSIZE2 quantization values in row-major order.
 *
 * Each quantization value is combined with the matching scale factor from
 * `aanscales` (which is scaled up by CONST_BITS bits) and shifted down by
 * CONST_BITS - 3, leaving the divisor scaled up by 8.
 */
static void getDivisors(DCTELEM *dtbl, DCTELEM *qtable) {
/* These macros are intentionally left defined: DESCALE() expands to
 * RIGHT_SHIFT(), and code later in the file relies on both.
 */
#define CONST_BITS 14
#define RIGHT_SHIFT(x, shft) ((x) >> (shft))
    static const int16_t aanscales[DCTSIZE2] = {
        /* precomputed values scaled up by 14 bits */
        16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520,
        22725, 31521, 29692, 26722, 22725, 17855, 12299, 6270,
        21407, 29692, 27969, 25172, 21407, 16819, 11585, 5906,
        19266, 26722, 25172, 22654, 19266, 15137, 10426, 5315,
        16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520,
        12873, 17855, 16819, 15137, 12873, 10114, 6967, 3552,
        8867, 12299, 11585, 10426, 8867, 6967, 4799, 2446,
        4520, 6270, 5906, 5315, 4520, 3552, 2446, 1247
    };

    for (int i = 0; i < DCTSIZE2; i++) {
        /* The return value is deliberately ignored: no alternative quantizer
         * is selected here, so it carries no information this function needs.
         */
        (void)compute_reciprocal(
            DESCALE(MULTIPLY16V16((JLONG)qtable[i],
                                  (JLONG)aanscales[i]),
                    CONST_BITS - 3), &dtbl[i]);
    }
}
/* Quantizes one block of DCTSIZE2 coefficients.
 *
 * coef_block: output, receives the quantized coefficients.
 * divisors:   four consecutive 64-entry tables; the reciprocal, correction
 *             and shift tables (indices 0, 1 and 3) are used here.
 * workspace:  input, the DCTSIZE2 transformed values to quantize.
 *
 * Each value is quantized by magnitude - (|v| + correction) * reciprocal,
 * shifted right - and the original sign is restored afterwards.
 */
static void quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
{
    const DCTELEM *reciprocals = divisors + DCTSIZE2 * 0;
    const DCTELEM *corrections = divisors + DCTSIZE2 * 1;
    const DCTELEM *shifts = divisors + DCTSIZE2 * 3;

    for (int k = 0; k < DCTSIZE2; k++) {
        DCTELEM value = workspace[k];
        const UDCTELEM recip = reciprocals[k];
        const UDCTELEM corr = corrections[k];
        const int shift = shifts[k];
        const int isNegative = (value < 0);
        UDCTELEM2 product;

        /* Work on the magnitude so the shift below rounds symmetrically. */
        if (isNegative) {
            value = -value;
        }

        product = (UDCTELEM2)(value + corr) * recip;
        product >>= shift + sizeof(DCTELEM) * 8;
        value = (DCTELEM)product;

        if (isNegative) {
            value = -value;
        }
        coef_block[k] = (JCOEF)value;
    }
}
/*
 * Creates the opaque divisor table that performForwardDct() expects.
 *
 * quality - user quality rating passed on to jpeg_set_quality().
 * Returns a buffer of DCTSIZE2 * 4 DCTELEM entries filled by getDivisors().
 */
NSData *generateForwardDctData(int quality) {
    DCTELEM quantTable[DCTSIZE2];
    jpeg_set_quality(quantTable, quality);

    NSMutableData *table = [[NSMutableData alloc] initWithLength:DCTSIZE2 * 4 * sizeof(DCTELEM)];
    getDivisors((DCTELEM *)table.mutableBytes, quantTable);
    return table;
}
/*
 * Creates the opaque multiplier table that performInverseDct() expects.
 *
 * Each entry is the quantization value for that coefficient position multiplied
 * by the matching AAN scale factor and descaled by
 * (CONST_BITS - IFAST_SCALE_BITS) bits.
 *
 * quality - user quality rating passed on to jpeg_set_quality().
 * Returns a buffer of DCTSIZE2 IFAST_MULT_TYPE entries.
 */
NSData *generateInverseDctData(int quality) {
#define CONST_BITS 14
    static const int16_t aanscales[DCTSIZE2] = {
        /* precomputed values scaled up by 14 bits */
        16384, 22725, 21407, 19266, 16384, 12873,  8867,  4520,
        22725, 31521, 29692, 26722, 22725, 17855, 12299,  6270,
        21407, 29692, 27969, 25172, 21407, 16819, 11585,  5906,
        19266, 26722, 25172, 22654, 19266, 15137, 10426,  5315,
        16384, 22725, 21407, 19266, 16384, 12873,  8867,  4520,
        12873, 17855, 16819, 15137, 12873, 10114,  6967,  3552,
         8867, 12299, 11585, 10426,  8867,  6967,  4799,  2446,
         4520,  6270,  5906,  5315,  4520,  3552,  2446,  1247
    };

    DCTELEM quantTable[DCTSIZE2];
    jpeg_set_quality(quantTable, quality);

    NSMutableData *table = [[NSMutableData alloc] initWithLength:DCTSIZE2 * sizeof(IFAST_MULT_TYPE)];
    IFAST_MULT_TYPE *multipliers = (IFAST_MULT_TYPE *)table.mutableBytes;
    for (int position = 0; position < DCTSIZE2; position++) {
        multipliers[position] = (IFAST_MULT_TYPE)
            DESCALE(MULTIPLY16V16((JLONG)quantTable[position],
                                  (JLONG)aanscales[position]),
                    CONST_BITS - IFAST_SCALE_BITS);
    }
    return table;
}
/*
 * Index permutation used by performForwardDct(): the coefficient stored at
 * raster position k of a block is coefBlock[zigZagInv[k]].
 * NOTE: this table holds exactly the same 64 values as zigZag below.
 */
static const int zigZagInv[DCTSIZE2] = {
0,1,8,16,9,2,3,10,
17,24,32,25,18,11,4,5,
12,19,26,33,40,48,41,34,
27,20,13,6,7,14,21,28,
35,42,49,56,57,50,43,36,
29,22,15,23,30,37,44,51,
58,59,52,45,38,31,39,46,
53,60,61,54,47,55,62,63
};
/*
 * Index permutation used by performInverseDct(): the value stored at raster
 * position k of a block is written back to coefficientBlock[zigZag[k]].
 * Because the forward pass reads through the same sequence, the two lookups
 * undo each other.
 */
static const int zigZag[DCTSIZE2] = {
0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5, 12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13, 6, 7, 14, 21, 28, 35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63
};
/*
 * Transforms a single 8-bit plane into quantized DCT coefficients, one
 * DCTSIZE x DCTSIZE block at a time.
 *
 * pixels       - source samples, `bytesPerRow` bytes per row.
 * coefficients - destination; indexed with the same row stride
 *                (`bytesPerRow`, counted in int16_t elements) as `pixels`.
 * width/height - plane size. Blocks are always read in full, so both are
 *                assumed to be multiples of DCTSIZE (not checked here).
 * dctData      - table created by generateForwardDctData().
 */
void performForwardDct(uint8_t const *pixels, int16_t *coefficients, int width, int height, int bytesPerRow, NSData *dctData) {
    DCTELEM *divisors = (DCTELEM *)dctData.bytes;
    DCTELEM samples[DCTSIZE2];
    JCOEF quantized[DCTSIZE2];

    for (int originY = 0; originY < height; originY += DCTSIZE) {
        for (int originX = 0; originX < width; originX += DCTSIZE) {
            /* Gather the block and centre the samples around zero. */
            for (int k = 0; k < DCTSIZE2; k++) {
                const int row = k / DCTSIZE;
                const int column = k % DCTSIZE;
                samples[k] = ((DCTELEM)pixels[(originY + row) * bytesPerRow + (originX + column)]) - CENTERJSAMPLE;
            }

            jsimd_fdct_ifast_neon(samples);
            quantize(quantized, divisors, samples);

            /* Scatter the coefficients back in permuted order. */
            for (int k = 0; k < DCTSIZE2; k++) {
                const int row = k / DCTSIZE;
                const int column = k % DCTSIZE;
                coefficients[(originY + row) * bytesPerRow + (originX + column)] = quantized[zigZagInv[k]];
            }
        }
    }
}
/*
 * Reconstructs a single 8-bit plane from quantized DCT coefficients, one
 * DCTSIZE x DCTSIZE block at a time.
 *
 * coefficients       - source, `coefficientsPerRow` int16_t elements per row.
 * pixels             - destination samples, `bytesPerRow` bytes per row.
 * width/height       - plane size. Blocks are always processed in full, so
 *                      both are assumed to be multiples of DCTSIZE (not
 *                      checked here).
 * idctData           - table created by generateInverseDctData().
 */
void performInverseDct(int16_t const *coefficients, uint8_t *pixels, int width, int height, int coefficientsPerRow, int bytesPerRow, NSData *idctData) {
    IFAST_MULT_TYPE *multipliers = (IFAST_MULT_TYPE *)idctData.bytes;
    DCTELEM blockCoefficients[DCTSIZE2];
    JSAMPLE blockSamples[DCTSIZE2];

    for (int originY = 0; originY < height; originY += DCTSIZE) {
        for (int originX = 0; originX < width; originX += DCTSIZE) {
            /* Undo the permutation applied by the forward transform. */
            for (int k = 0; k < DCTSIZE2; k++) {
                const int row = k / DCTSIZE;
                const int column = k % DCTSIZE;
                blockCoefficients[zigZag[k]] = coefficients[(originY + row) * coefficientsPerRow + (originX + column)];
            }

            jsimd_idct_ifast_neon(multipliers, blockCoefficients, blockSamples);

            for (int k = 0; k < DCTSIZE2; k++) {
                const int row = k / DCTSIZE;
                const int column = k % DCTSIZE;
                pixels[(originY + row) * bytesPerRow + (originX + column)] = blockSamples[k];
            }
        }
    }
}

View File

@@ -0,0 +1,99 @@
#import <DCT/YuvConversion.h>
#import <Foundation/Foundation.h>
#import <Accelerate/Accelerate.h>
static uint8_t permuteMap[4] = { 3, 2, 1, 0};
/*
 * Splits an interleaved 4-channel image into separate Y, chroma and alpha
 * planes using vImage.
 *
 * argb        - source pixels, `bytesPerRow` bytes per row; the channel order
 *               handed to vImage is remapped through `permuteMap`.
 * outY, outA  - full-resolution planes, `width` bytes per row.
 * outU, outV  - half-resolution planes, `width / 2` bytes per row. Note that
 *               the Cr output is written to `outU` and the Cb output to
 *               `outV`; combineYUVAPlanesIntoARBB() reads them back the same
 *               way.
 *
 * If the colour conversion fails the function returns without extracting
 * alpha.
 */
void splitRGBAIntoYUVAPlanes(uint8_t const *argb, uint8_t *outY, uint8_t *outU, uint8_t *outV, uint8_t *outA, int width, int height, int bytesPerRow) {
    static vImage_ARGBToYpCbCr conversion;
    static dispatch_once_t onceToken;
    dispatch_once(&onceToken, ^{
        vImage_YpCbCrPixelRange pixelRange = (vImage_YpCbCrPixelRange){ 0, 128, 255, 255, 255, 1, 255, 0 };
        vImageConvert_ARGBToYpCbCr_GenerateConversion(kvImage_ARGBToYpCbCrMatrix_ITU_R_709_2, &pixelRange, &conversion, kvImageARGB8888, kvImage420Yp8_Cb8_Cr8, 0);
    });

    const int chromaWidth = width / 2;
    const int chromaHeight = height / 2;

    vImage_Buffer source = { .data = (void *)argb, .height = height, .width = width, .rowBytes = bytesPerRow };
    vImage_Buffer lumaPlane = { .data = outY, .height = height, .width = width, .rowBytes = width };
    vImage_Buffer crPlane = { .data = outU, .height = chromaHeight, .width = chromaWidth, .rowBytes = chromaWidth };
    vImage_Buffer cbPlane = { .data = outV, .height = chromaHeight, .width = chromaWidth, .rowBytes = chromaWidth };
    vImage_Buffer alphaPlane = { .data = outA, .height = height, .width = width, .rowBytes = width };

    vImage_Error status = vImageConvert_ARGB8888To420Yp8_Cb8_Cr8(&source, &lumaPlane, &cbPlane, &crPlane, &conversion, permuteMap, kvImageDoNotTile);
    if (status != kvImageNoError) {
        return;
    }

    /* Channel index 3 of the source is copied out as the alpha plane. */
    vImageExtractChannel_ARGB8888(&source, &alphaPlane, 3, kvImageDoNotTile);
}
/*
 * Rebuilds an interleaved 4-channel image from separate Y, chroma and alpha
 * planes using vImage. This is the inverse of splitRGBAIntoYUVAPlanes().
 *
 * argb        - destination pixels, `bytesPerRow` bytes per row; the channel
 *               order is remapped through `permuteMap`.
 * inY, inA    - full-resolution planes, `width` bytes per row.
 * inU, inV    - half-resolution planes, `width / 2` bytes per row. `inU` is
 *               fed to vImage as Cr and `inV` as Cb, mirroring how
 *               splitRGBAIntoYUVAPlanes() writes them.
 *
 * If the colour conversion fails the function returns without merging alpha,
 * matching the early return in splitRGBAIntoYUVAPlanes().
 */
void combineYUVAPlanesIntoARBB(uint8_t *argb, uint8_t const *inY, uint8_t const *inU, uint8_t const *inV, uint8_t const *inA, int width, int height, int bytesPerRow) {
    static vImage_YpCbCrToARGB info;
    static dispatch_once_t onceToken;
    dispatch_once(&onceToken, ^{
        vImage_YpCbCrPixelRange pixelRange = (vImage_YpCbCrPixelRange){ 0, 128, 255, 255, 255, 1, 255, 0 };
        vImageConvert_YpCbCrToARGB_GenerateConversion(kvImage_YpCbCrToARGBMatrix_ITU_R_709_2, &pixelRange, &info, kvImage420Yp8_Cb8_Cr8, kvImageARGB8888, 0);
    });

    vImage_Buffer destArgb;
    destArgb.data = (void *)argb;
    destArgb.width = width;
    destArgb.height = height;
    destArgb.rowBytes = bytesPerRow;

    vImage_Buffer srcYp;
    srcYp.data = (void *)inY;
    srcYp.width = width;
    srcYp.height = height;
    srcYp.rowBytes = width;

    vImage_Buffer srcCr;
    srcCr.data = (void *)inU;
    srcCr.width = width / 2;
    srcCr.height = height / 2;
    srcCr.rowBytes = width / 2;

    vImage_Buffer srcCb;
    srcCb.data = (void *)inV;
    srcCb.width = width / 2;
    srcCb.height = height / 2;
    srcCb.rowBytes = width / 2;

    vImage_Buffer srcA;
    srcA.data = (void *)inA;
    srcA.width = width;
    srcA.height = height;
    srcA.rowBytes = width;

    vImage_Error error = vImageConvert_420Yp8_Cb8_Cr8ToARGB8888(&srcYp, &srcCb, &srcCr, &destArgb, &info, permuteMap, 255, kvImageDoNotTile);
    if (error != kvImageNoError) {
        /* Colour data was not produced; do not touch the destination further. */
        return;
    }

    /* Copy mask bit 0 selects the channel that receives the alpha plane. */
    vImageOverwriteChannels_ARGB8888(&srcA, &destArgb, &destArgb, 1 << 0, kvImageDoNotTile);
}

View File

@@ -3,6 +3,7 @@ import UIKit
import SwiftSignalKit
import CryptoUtils
import ManagedFile
import Compression
public final class AnimationCacheItemFrame {
public enum Format {
@@ -25,19 +26,51 @@ public final class AnimationCacheItemFrame {
public final class AnimationCacheItem {
public let numFrames: Int
private let getFrameImpl: (Int) -> AnimationCacheItemFrame?
private let getFrameIndexImpl: (Double) -> Int
public init(numFrames: Int, getFrame: @escaping (Int) -> AnimationCacheItemFrame?) {
public init(numFrames: Int, getFrame: @escaping (Int) -> AnimationCacheItemFrame?, getFrameIndexImpl: @escaping (Double) -> Int) {
self.numFrames = numFrames
self.getFrameImpl = getFrame
self.getFrameIndexImpl = getFrameIndexImpl
}
public func getFrame(index: Int) -> AnimationCacheItemFrame? {
return self.getFrameImpl(index)
}
public func getFrame(at duration: Double) -> AnimationCacheItemFrame? {
let index = self.getFrameIndexImpl(duration)
return self.getFrameImpl(index)
}
}
/// A mutable pixel buffer handed to a drawing block while a frame is being
/// added to an animation cache item.
///
/// Instances can only be created inside the module: the struct relies on the
/// synthesized memberwise initializer, which has internal access and takes
/// `argb`, `width`, `height`, `bytesPerRow` and `length` in that order.
public struct AnimationCacheItemDrawingSurface {
    /// Start of the pixel storage the drawing block writes into.
    public let argb: UnsafeMutablePointer<UInt8>
    /// Surface width in pixels.
    public let width: Int
    /// Surface height in pixels.
    public let height: Int
    /// Number of bytes between the starts of two consecutive rows.
    public let bytesPerRow: Int
    /// Total number of bytes available at `argb`.
    public let length: Int
}
public protocol AnimationCacheItemWriter: AnyObject {
func add(bytes: UnsafeRawPointer, length: Int, width: Int, height: Int, bytesPerRow: Int, duration: Double)
var queue: Queue { get }
var isCancelled: Bool { get }
func add(with drawingBlock: (AnimationCacheItemDrawingSurface) -> Void, proposedWidth: Int, proposedHeight: Int, duration: Double)
func finish()
}
@@ -53,7 +86,8 @@ public final class AnimationCacheItemResult {
public protocol AnimationCache: AnyObject {
func get(sourceId: String, size: CGSize, fetch: @escaping (CGSize, AnimationCacheItemWriter) -> Disposable) -> Signal<AnimationCacheItemResult, NoError>
func getSynchronously(sourceId: String, size: CGSize) -> AnimationCacheItem?
func getFirstFrameSynchronously(sourceId: String, size: CGSize) -> AnimationCacheItem?
func getFirstFrame(queue: Queue, sourceId: String, size: CGSize, completion: @escaping (AnimationCacheItem?) -> Void) -> Disposable
}
private func md5Hash(_ string: String) -> String {
@@ -80,11 +114,82 @@ private func itemSubpath(hashString: String) -> (directory: String, fileName: St
return (directory, hashString)
}
/// Rounds `numToRound` up to the nearest multiple of `multiple`.
///
/// - Parameters:
///   - numToRound: The value to round. Negative values are rounded towards
///     zero-or-above multiples (for example `-5` with a multiple of `16`
///     gives `0`).
///   - multiple: The step to round to; expected to be non-negative. A value
///     of `0` disables rounding.
/// - Returns: The smallest multiple of `multiple` that is greater than or
///   equal to `numToRound`, or `numToRound` itself when `multiple` is `0`.
private func roundUp(_ numToRound: Int, multiple: Int) -> Int {
    guard multiple != 0 else {
        return numToRound
    }
    let remainder = numToRound % multiple
    if remainder == 0 {
        return numToRound
    }
    if remainder < 0 {
        // `%` keeps the sign of the dividend, so for negative inputs dropping
        // the remainder already moves the value up to the next multiple.
        return numToRound - remainder
    }
    return numToRound + multiple - remainder
}
/// Compresses `data` with LZFSE.
///
/// - Parameters:
///   - data: The bytes to compress.
///   - addSizeHeader: When `true`, the result starts with the uncompressed
///     byte count stored as a 4-byte `UInt32` in native byte order, followed
///     by the compressed payload.
/// - Returns: The compressed bytes, or `nil` when `data` is empty, its size
///   does not fit the 4-byte header, scratch memory cannot be allocated, or
///   the encoder produces no output (for example because the result does not
///   fit into the `data.count + 16 KiB` output buffer).
private func compressData(data: Data, addSizeHeader: Bool = false) -> Data? {
    // An empty `Data` exposes no base address, so there is nothing to encode.
    guard !data.isEmpty else {
        return nil
    }

    var sizeHeader: UInt32 = 0
    if addSizeHeader {
        guard let exactSize = UInt32(exactly: data.count) else {
            return nil
        }
        sizeHeader = exactSize
    }

    let algorithm: compression_algorithm = COMPRESSION_LZFSE
    guard let scratchData = malloc(compression_encode_scratch_buffer_size(algorithm)) else {
        return nil
    }
    defer {
        free(scratchData)
    }

    let headerSize = addSizeHeader ? 4 : 0
    var compressedData = Data(count: headerSize + data.count + 16 * 1024)
    let resultSize = compressedData.withUnsafeMutableBytes { buffer -> Int in
        guard let bytes = buffer.baseAddress?.assumingMemoryBound(to: UInt8.self) else {
            return 0
        }

        if addSizeHeader {
            var header = sizeHeader
            memcpy(bytes, &header, 4)
        }

        return data.withUnsafeBytes { sourceBuffer -> Int in
            guard let sourceBytes = sourceBuffer.baseAddress?.assumingMemoryBound(to: UInt8.self) else {
                return 0
            }
            return compression_encode_buffer(bytes.advanced(by: headerSize), buffer.count - headerSize, sourceBytes, sourceBuffer.count, scratchData, algorithm)
        }
    }

    if resultSize <= 0 {
        return nil
    }
    compressedData.count = headerSize + resultSize
    return compressedData
}
/// Decompresses an LZFSE payload stored inside `data`.
///
/// - Parameters:
///   - data: The buffer containing the compressed payload.
///   - range: Byte offsets (relative to the start of `data`) of the payload.
///   - decompressedSize: Capacity of the output buffer; the result is
///     truncated to the number of bytes actually produced.
/// - Returns: The decompressed bytes, or `nil` when `range` is empty or lies
///   outside `data`, `decompressedSize` is not positive, scratch memory
///   cannot be allocated, or the decoder produces no output.
private func decompressData(data: Data, range: Range<Int>, decompressedSize: Int) -> Data? {
    // The payload comes from disk, so never trust the offsets blindly:
    // reading outside `data` would be an out-of-bounds memory access.
    guard decompressedSize > 0, !range.isEmpty, range.lowerBound >= 0, range.upperBound <= data.count else {
        return nil
    }

    let algorithm: compression_algorithm = COMPRESSION_LZFSE
    guard let scratchData = malloc(compression_decode_scratch_buffer_size(algorithm)) else {
        return nil
    }
    defer {
        free(scratchData)
    }

    var decompressedFrameData = Data(count: decompressedSize)
    let resultSize = decompressedFrameData.withUnsafeMutableBytes { buffer -> Int in
        guard let bytes = buffer.baseAddress?.assumingMemoryBound(to: UInt8.self) else {
            return 0
        }
        return data.withUnsafeBytes { sourceBuffer -> Int in
            guard let sourceBytes = sourceBuffer.baseAddress?.assumingMemoryBound(to: UInt8.self) else {
                return 0
            }
            return compression_decode_buffer(bytes, buffer.count, sourceBytes.advanced(by: range.lowerBound), range.count, scratchData, algorithm)
        }
    }

    if resultSize <= 0 {
        return nil
    }
    if decompressedFrameData.count != resultSize {
        decompressedFrameData.count = resultSize
    }
    return decompressedFrameData
}
private final class AnimationCacheItemWriterImpl: AnimationCacheItemWriter {
private struct ParameterSet: Equatable {
var width: Int
var height: Int
var bytesPerRow: Int
struct CompressedResult {
var animationPath: String
var firstFramePath: String
}
private struct FrameMetadata {
@@ -93,10 +198,19 @@ private final class AnimationCacheItemWriterImpl: AnimationCacheItemWriter {
var duration: Double
}
private let file: ManagedFile
private let completion: (Bool) -> Void
let queue: Queue
var isCancelled: Bool = false
private var currentParameterSet: ParameterSet?
private let decompressedPath: String
private let compressedPath: String
private let firstFramePath: String
private var file: ManagedFile?
private let completion: (CompressedResult?) -> Void
private var currentSurface: ImageARGB?
private var currentYUVASurface: ImageYUVA420?
private var currentDctData: DctData?
private var currentDctCoefficients: DctCoefficientsYUVA420?
private var contentLengthOffset: Int?
private var isFailed: Bool = false
private var isFinished: Bool = false
@@ -104,44 +218,141 @@ private final class AnimationCacheItemWriterImpl: AnimationCacheItemWriter {
private var frames: [FrameMetadata] = []
private var contentLength: Int = 0
private let dctQuality: Int
private let lock = Lock()
init?(tempPath: String, completion: @escaping (Bool) -> Void) {
guard let file = ManagedFile(queue: nil, path: tempPath, mode: .readwrite) else {
init?(queue: Queue, allocateTempFile: @escaping () -> String, completion: @escaping (CompressedResult?) -> Void) {
self.dctQuality = 67
self.queue = queue
self.decompressedPath = allocateTempFile()
self.compressedPath = allocateTempFile()
self.firstFramePath = allocateTempFile()
guard let file = ManagedFile(queue: nil, path: self.decompressedPath, mode: .readwrite) else {
return nil
}
self.file = file
self.completion = completion
}
func add(bytes: UnsafeRawPointer, length: Int, width: Int, height: Int, bytesPerRow: Int, duration: Double) {
func add(with drawingBlock: (AnimationCacheItemDrawingSurface) -> Void, proposedWidth: Int, proposedHeight: Int, duration: Double) {
if self.isFailed || self.isFinished {
return
}
self.lock.locked {
if self.isFailed {
guard !self.isFailed, !self.isFinished, let file = self.file else {
return
}
let parameterSet = ParameterSet(width: width, height: height, bytesPerRow: bytesPerRow)
if let currentParameterSet = self.currentParameterSet {
if currentParameterSet != parameterSet {
// Round each proposed dimension up to a multiple of 16 independently.
// (Presumably 16 keeps the half-resolution chroma planes a whole number of
// DCT blocks - confirm against DCTSIZE.) The height must be derived from
// `proposedHeight`; using the width here would force every frame to be square.
let width = roundUp(proposedWidth, multiple: 16)
let height = roundUp(proposedHeight, multiple: 16)
var isFirstFrame = false
let surface: ImageARGB
if let current = self.currentSurface {
if current.argbPlane.width == width && current.argbPlane.height == height {
surface = current
} else {
self.isFailed = true
return
}
} else {
self.currentParameterSet = parameterSet
isFirstFrame = true
self.file.write(1 as UInt32)
self.file.write(UInt32(parameterSet.width))
self.file.write(UInt32(parameterSet.height))
self.file.write(UInt32(parameterSet.bytesPerRow))
self.contentLengthOffset = Int(self.file.position())
self.file.write(0 as UInt32)
surface = ImageARGB(width: width, height: height)
self.currentSurface = surface
}
self.frames.append(FrameMetadata(offset: Int(self.file.position()), length: length, duration: duration))
let _ = self.file.write(bytes, count: length)
self.contentLength += length
let yuvaSurface: ImageYUVA420
if let current = self.currentYUVASurface {
if current.yPlane.width == width && current.yPlane.height == height {
yuvaSurface = current
} else {
self.isFailed = true
return
}
} else {
yuvaSurface = ImageYUVA420(width: width, height: height)
self.currentYUVASurface = yuvaSurface
}
let dctCoefficients: DctCoefficientsYUVA420
if let current = self.currentDctCoefficients {
if current.yPlane.width == width && current.yPlane.height == height {
dctCoefficients = current
} else {
self.isFailed = true
return
}
} else {
dctCoefficients = DctCoefficientsYUVA420(width: width, height: height)
self.currentDctCoefficients = dctCoefficients
}
let dctData: DctData
if let current = self.currentDctData, current.quality == self.dctQuality {
dctData = current
} else {
dctData = DctData(quality: self.dctQuality)
self.currentDctData = dctData
}
surface.argbPlane.data.withUnsafeMutableBytes { bytes -> Void in
drawingBlock(AnimationCacheItemDrawingSurface(
argb: bytes.baseAddress!.assumingMemoryBound(to: UInt8.self),
width: width,
height: height,
bytesPerRow: surface.argbPlane.bytesPerRow,
length: bytes.count
))
}
surface.toYUVA420(target: yuvaSurface)
yuvaSurface.dct(dctData: dctData, target: dctCoefficients)
if isFirstFrame {
file.write(2 as UInt32)
file.write(UInt32(dctCoefficients.yPlane.width))
file.write(UInt32(dctCoefficients.yPlane.height))
file.write(UInt32(dctData.quality))
self.contentLengthOffset = Int(file.position())
file.write(0 as UInt32)
}
let framePosition = Int(file.position())
assert(framePosition >= 0)
var frameLength = 0
for i in 0 ..< 4 {
let dctPlane: DctCoefficientPlane
switch i {
case 0:
dctPlane = dctCoefficients.yPlane
case 1:
dctPlane = dctCoefficients.uPlane
case 2:
dctPlane = dctCoefficients.vPlane
case 3:
dctPlane = dctCoefficients.aPlane
default:
preconditionFailure()
}
dctPlane.data.withUnsafeBytes { bytes in
let _ = file.write(bytes.baseAddress!.assumingMemoryBound(to: UInt8.self), count: bytes.count)
}
frameLength += dctPlane.data.count
}
self.frames.append(FrameMetadata(offset: framePosition, length: frameLength, duration: duration))
self.contentLength += frameLength
}
}
@@ -152,27 +363,96 @@ private final class AnimationCacheItemWriterImpl: AnimationCacheItemWriter {
self.isFinished = true
shouldComplete = true
guard let contentLengthOffset = self.contentLengthOffset else {
guard let contentLengthOffset = self.contentLengthOffset, let file = self.file else {
self.isFailed = true
return
}
assert(contentLengthOffset >= 0)
let metadataPosition = file.position()
file.seek(position: Int64(contentLengthOffset))
file.write(UInt32(self.contentLength))
file.seek(position: metadataPosition)
file.write(UInt32(self.frames.count))
for frame in self.frames {
file.write(UInt32(frame.offset))
file.write(UInt32(frame.length))
file.write(Float32(frame.duration))
}
if !self.frames.isEmpty, let dctCoefficients = self.currentDctCoefficients, let dctData = self.currentDctData {
var firstFrameData = Data(capacity: 4 * 5 + self.frames[0].length)
writeUInt32(data: &firstFrameData, value: 2 as UInt32)
writeUInt32(data: &firstFrameData, value: UInt32(dctCoefficients.yPlane.width))
writeUInt32(data: &firstFrameData, value: UInt32(dctCoefficients.yPlane.height))
writeUInt32(data: &firstFrameData, value: UInt32(dctData.quality))
writeUInt32(data: &firstFrameData, value: UInt32(self.frames[0].length))
let firstFrameStart = 4 * 5
file.seek(position: Int64(self.frames[0].offset))
firstFrameData.count += self.frames[0].length
firstFrameData.withUnsafeMutableBytes { bytes in
let _ = file.read(bytes.baseAddress!.advanced(by: 4 * 5), self.frames[0].length)
}
writeUInt32(data: &firstFrameData, value: UInt32(1))
writeUInt32(data: &firstFrameData, value: UInt32(firstFrameStart))
writeUInt32(data: &firstFrameData, value: UInt32(self.frames[0].length))
writeFloat32(data: &firstFrameData, value: Float32(1.0))
guard let compressedFirstFrameData = compressData(data: firstFrameData, addSizeHeader: true) else {
self.isFailed = true
return
}
guard let _ = try? compressedFirstFrameData.write(to: URL(fileURLWithPath: self.firstFramePath)) else {
self.isFailed = true
return
}
} else {
self.isFailed = true
return
}
let metadataPosition = self.file.position()
self.file.seek(position: Int64(contentLengthOffset))
self.file.write(UInt32(self.contentLength))
self.file.seek(position: metadataPosition)
self.file.write(UInt32(self.frames.count))
for frame in self.frames {
self.file.write(UInt32(frame.offset))
self.file.write(UInt32(frame.length))
self.file.write(Float32(frame.duration))
if !self.isFailed {
self.file = nil
file._unsafeClose()
guard let uncompressedData = try? Data(contentsOf: URL(fileURLWithPath: self.decompressedPath), options: .alwaysMapped) else {
self.isFailed = true
return
}
guard let compressedData = compressData(data: uncompressedData) else {
self.isFailed = true
return
}
guard let compressedFile = ManagedFile(queue: nil, path: self.compressedPath, mode: .readwrite) else {
self.isFailed = true
return
}
compressedFile.write(Int32(uncompressedData.count))
let _ = compressedFile.write(compressedData)
compressedFile._unsafeClose()
}
}
}
if shouldComplete {
self.completion(!self.isFailed)
let _ = try? FileManager.default.removeItem(atPath: self.decompressedPath)
if !self.isFailed {
self.completion(CompressedResult(
animationPath: self.compressedPath,
firstFramePath: self.firstFramePath
))
} else {
let _ = try? FileManager.default.removeItem(atPath: self.compressedPath)
let _ = try? FileManager.default.removeItem(atPath: self.firstFramePath)
self.completion(nil)
}
}
}
}
@@ -185,12 +465,34 @@ private final class AnimationCacheItemAccessor {
private let data: Data
private let frameMapping: [Int: FrameInfo]
private let format: AnimationCacheItemFrame.Format
private let durationMapping: [Double]
private let totalDuration: Double
init(data: Data, frameMapping: [Int: FrameInfo], format: AnimationCacheItemFrame.Format) {
private var currentYUVASurface: ImageYUVA420
private var currentDctData: DctData
private var currentDctCoefficients: DctCoefficientsYUVA420
init(data: Data, frameMapping: [FrameInfo], width: Int, height: Int, dctQuality: Int) {
self.data = data
self.frameMapping = frameMapping
self.format = format
var resultFrameMapping: [Int: FrameInfo] = [:]
var durationMapping: [Double] = []
var totalDuration: Double = 0.0
for i in 0 ..< frameMapping.count {
let frame = frameMapping[i]
resultFrameMapping[i] = frame
totalDuration += frame.duration
durationMapping.append(totalDuration)
}
self.frameMapping = resultFrameMapping
self.durationMapping = durationMapping
self.totalDuration = totalDuration
self.currentYUVASurface = ImageYUVA420(width: width, height: height)
self.currentDctData = DctData(quality: dctQuality)
self.currentDctCoefficients = DctCoefficientsYUVA420(width: width, height: height)
}
func getFrame(index: Int) -> AnimationCacheItemFrame? {
@@ -198,7 +500,56 @@ private final class AnimationCacheItemAccessor {
return nil
}
return AnimationCacheItemFrame(data: data, range: frameInfo.range, format: self.format, duration: frameInfo.duration)
let currentSurface = ImageARGB(width: self.currentYUVASurface.yPlane.width, height: self.currentYUVASurface.yPlane.height)
var frameDataOffset = 0
let frameLength = frameInfo.range.upperBound - frameInfo.range.lowerBound
for i in 0 ..< 4 {
let dctPlane: DctCoefficientPlane
switch i {
case 0:
dctPlane = self.currentDctCoefficients.yPlane
case 1:
dctPlane = self.currentDctCoefficients.uPlane
case 2:
dctPlane = self.currentDctCoefficients.vPlane
case 3:
dctPlane = self.currentDctCoefficients.aPlane
default:
preconditionFailure()
}
if frameDataOffset + dctPlane.data.count > frameLength {
break
}
dctPlane.data.withUnsafeMutableBytes { targetBuffer -> Void in
self.data.copyBytes(to: targetBuffer.baseAddress!.assumingMemoryBound(to: UInt8.self), from: (frameInfo.range.lowerBound + frameDataOffset) ..< (frameInfo.range.lowerBound + frameDataOffset + targetBuffer.count))
}
frameDataOffset += dctPlane.data.count
}
self.currentDctCoefficients.idct(dctData: self.currentDctData, target: self.currentYUVASurface)
self.currentYUVASurface.toARGB(target: currentSurface)
return AnimationCacheItemFrame(data: currentSurface.argbPlane.data, range: 0 ..< currentSurface.argbPlane.data.count, format: .rgba(width: currentSurface.argbPlane.width, height: currentSurface.argbPlane.height, bytesPerRow: currentSurface.argbPlane.bytesPerRow), duration: frameInfo.duration)
}
/// Maps a playback timestamp to the index of the frame visible at that time.
///
/// `durationMapping[i]` holds the cumulative duration up to and including
/// frame `i`, i.e. the time at which frame `i` ends. The timestamp is wrapped
/// into `0 ..< totalDuration`, and the first frame whose end time lies after
/// it is returned.
///
/// - Parameter duration: Elapsed playback time, in the same unit as the
///   stored frame durations.
/// - Returns: A frame index; `0` when the item has at most one frame or no
///   total duration.
func getFrameIndex(duration: Double) -> Int {
    if self.totalDuration == 0.0 {
        return 0
    }
    if self.durationMapping.count <= 1 {
        return 0
    }

    let normalizedDuration = duration.truncatingRemainder(dividingBy: self.totalDuration)
    // Frame `i` is on screen until its cumulative end time, so the matching
    // index itself is the answer (not the previous one).
    if let index = self.durationMapping.firstIndex(where: { normalizedDuration < $0 }) {
        return index
    }
    return self.durationMapping.count - 1
}
}
@@ -213,10 +564,54 @@ private func readUInt32(data: Data, offset: Int) -> UInt32 {
return value
}
/// Reads a 4-byte `Float32` in native byte order from `data`.
///
/// - Parameters:
///   - data: The buffer to read from.
///   - offset: Byte offset, relative to the start of `data`, of the value.
///     The caller must ensure `offset + 4` does not exceed the buffer length;
///     no bounds check is performed here.
/// - Returns: The decoded value.
private func readFloat32(data: Data, offset: Int) -> Float32 {
    var result: Float32 = 0
    data.withUnsafeBytes { source -> Void in
        memcpy(&result, source.baseAddress!.advanced(by: offset), 4)
    }
    return result
}
/// Appends `value` to `data` as 4 bytes in native byte order.
///
/// - Parameters:
///   - data: The buffer to grow.
///   - value: The value to append.
private func writeUInt32(data: inout Data, value: UInt32) {
    withUnsafeBytes(of: value) { rawValue -> Void in
        data.append(contentsOf: rawValue)
    }
}
/// Appends `value` to `data` as 4 bytes in native byte order.
///
/// - Parameters:
///   - data: The buffer to grow.
///   - value: The value to append.
private func writeFloat32(data: inout Data, value: Float32) {
    withUnsafeBytes(of: value) { rawValue -> Void in
        data.append(contentsOf: rawValue)
    }
}
private func loadItem(path: String) -> AnimationCacheItem? {
guard let data = try? Data(contentsOf: URL(fileURLWithPath: path), options: .alwaysMapped) else {
guard let compressedData = try? Data(contentsOf: URL(fileURLWithPath: path), options: .alwaysMapped) else {
return nil
}
if compressedData.count < 4 {
return nil
}
let decompressedSize = readUInt32(data: compressedData, offset: 0)
if decompressedSize <= 0 || decompressedSize > 20 * 1024 * 1024 {
return nil
}
guard let data = decompressData(data: compressedData, range: 4 ..< compressedData.count, decompressedSize: Int(decompressedSize)) else {
return nil
}
let dataLength = data.count
var offset = 0
@@ -226,7 +621,7 @@ private func loadItem(path: String) -> AnimationCacheItem? {
}
let formatVersion = readUInt32(data: data, offset: offset)
offset += 4
if formatVersion != 1 {
if formatVersion != 2 {
return nil
}
@@ -245,7 +640,7 @@ private func loadItem(path: String) -> AnimationCacheItem? {
guard dataLength >= offset + 4 else {
return nil
}
let bytesPerRow = readUInt32(data: data, offset: offset)
let dctQuality = readUInt32(data: data, offset: offset)
offset += 4
guard dataLength >= offset + 4 else {
@@ -262,8 +657,8 @@ private func loadItem(path: String) -> AnimationCacheItem? {
let numFrames = readUInt32(data: data, offset: offset)
offset += 4
var frameMapping: [Int: AnimationCacheItemAccessor.FrameInfo] = [:]
for i in 0 ..< Int(numFrames) {
var frameMapping: [AnimationCacheItemAccessor.FrameInfo] = []
for _ in 0 ..< Int(numFrames) {
guard dataLength >= offset + 4 + 4 + 4 else {
return nil
}
@@ -272,16 +667,18 @@ private func loadItem(path: String) -> AnimationCacheItem? {
offset += 4
let frameLength = readUInt32(data: data, offset: offset)
offset += 4
let frameDuration = readUInt32(data: data, offset: offset)
let frameDuration = readFloat32(data: data, offset: offset)
offset += 4
frameMapping[i] = AnimationCacheItemAccessor.FrameInfo(range: Int(frameStart) ..< Int(frameStart + frameLength), duration: Double(frameDuration))
frameMapping.append(AnimationCacheItemAccessor.FrameInfo(range: Int(frameStart) ..< Int(frameStart + frameLength), duration: Double(frameDuration)))
}
let itemAccessor = AnimationCacheItemAccessor(data: data, frameMapping: frameMapping, format: .rgba(width: Int(width), height: Int(height), bytesPerRow: Int(bytesPerRow)))
let itemAccessor = AnimationCacheItemAccessor(data: data, frameMapping: frameMapping, width: Int(width), height: Int(height), dctQuality: Int(dctQuality))
return AnimationCacheItem(numFrames: Int(numFrames), getFrame: { index in
return itemAccessor.getFrame(index: index)
}, getFrameIndexImpl: { duration in
return itemAccessor.getFrameIndex(duration: duration)
})
}
@@ -300,10 +697,14 @@ public final class AnimationCacheImpl: AnimationCache {
private let basePath: String
private let allocateTempFile: () -> String
private let fetchQueues: [Queue]
private var nextFetchQueueIndex: Int = 0
private var itemContexts: [String: ItemContext] = [:]
init(queue: Queue, basePath: String, allocateTempFile: @escaping () -> String) {
self.queue = queue
self.fetchQueues = (0 ..< 2).map { _ in Queue() }
self.basePath = basePath
self.allocateTempFile = allocateTempFile
}
@@ -315,9 +716,10 @@ public final class AnimationCacheImpl: AnimationCache {
let sourceIdPath = itemSubpath(hashString: md5Hash(sourceId + "-\(Int(size.width))x\(Int(size.height))"))
let itemDirectoryPath = "\(self.basePath)/\(sourceIdPath.directory)"
let itemPath = "\(itemDirectoryPath)/\(sourceIdPath.fileName)"
let itemFirstFramePath = "\(itemDirectoryPath)/\(sourceIdPath.fileName)-f"
if FileManager.default.fileExists(atPath: itemPath) {
updateResult(AnimationCacheItemResult(item: loadItem(path: itemPath), isFinal: true))
if FileManager.default.fileExists(atPath: itemPath), let item = loadItem(path: itemPath) {
updateResult(AnimationCacheItemResult(item: item, isFinal: true))
return EmptyDisposable
}
@@ -338,8 +740,7 @@ public final class AnimationCacheImpl: AnimationCache {
updateResult(AnimationCacheItemResult(item: nil, isFinal: false))
if beginFetch {
let tempPath = self.allocateTempFile()
guard let writer = AnimationCacheItemWriterImpl(tempPath: tempPath, completion: { [weak self, weak itemContext] success in
guard let writer = AnimationCacheItemWriterImpl(queue: self.fetchQueues[self.nextFetchQueueIndex % self.fetchQueues.count], allocateTempFile: self.allocateTempFile, completion: { [weak self, weak itemContext] result in
queue.async {
guard let strongSelf = self, let itemContext = itemContext, itemContext === strongSelf.itemContexts[sourceId] else {
return
@@ -347,13 +748,18 @@ public final class AnimationCacheImpl: AnimationCache {
strongSelf.itemContexts.removeValue(forKey: sourceId)
guard success else {
guard let result = result else {
return
}
guard let _ = try? FileManager.default.createDirectory(at: URL(fileURLWithPath: itemDirectoryPath), withIntermediateDirectories: true, attributes: nil) else {
return
}
guard let _ = try? FileManager.default.moveItem(atPath: tempPath, toPath: itemPath) else {
let _ = try? FileManager.default.removeItem(atPath: itemPath)
guard let _ = try? FileManager.default.moveItem(atPath: result.animationPath, toPath: itemPath) else {
return
}
let _ = try? FileManager.default.removeItem(atPath: itemFirstFramePath)
guard let _ = try? FileManager.default.moveItem(atPath: result.firstFramePath, toPath: itemFirstFramePath) else {
return
}
guard let item = loadItem(path: itemPath) else {
@@ -368,9 +774,14 @@ public final class AnimationCacheImpl: AnimationCache {
return EmptyDisposable
}
let fetchDisposable = fetch(size, writer)
let fetchDisposable = MetaDisposable()
fetchDisposable.set(fetch(size, writer))
itemContext.disposable.set(ActionDisposable {
itemContext.disposable.set(ActionDisposable { [weak writer] in
if let writer = writer {
writer.isCancelled = true
}
fetchDisposable.dispose()
})
}
@@ -389,25 +800,43 @@ public final class AnimationCacheImpl: AnimationCache {
}
}
func getSynchronously(sourceId: String, size: CGSize) -> AnimationCacheItem? {
static func getFirstFrameSynchronously(basePath: String, sourceId: String, size: CGSize) -> AnimationCacheItem? {
let sourceIdPath = itemSubpath(hashString: md5Hash(sourceId + "-\(Int(size.width))x\(Int(size.height))"))
let itemDirectoryPath = "\(self.basePath)/\(sourceIdPath.directory)"
let itemPath = "\(itemDirectoryPath)/\(sourceIdPath.fileName)"
let itemDirectoryPath = "\(basePath)/\(sourceIdPath.directory)"
let itemFirstFramePath = "\(itemDirectoryPath)/\(sourceIdPath.fileName)-f"
if FileManager.default.fileExists(atPath: itemPath) {
return loadItem(path: itemPath)
if FileManager.default.fileExists(atPath: itemFirstFramePath) {
return loadItem(path: itemFirstFramePath)
} else {
return nil
}
}
/// Loads the cached first-frame item for `sourceId` at `size` and reports it
/// through `completion`.
///
/// The lookup is performed synchronously on the calling thread: `completion`
/// is invoked before this function returns, with `nil` when the first-frame
/// file does not exist or cannot be loaded.
///
/// - Returns: An empty disposable; there is nothing to cancel.
static func getFirstFrame(basePath: String, sourceId: String, size: CGSize, completion: @escaping (AnimationCacheItem?) -> Void) -> Disposable {
    let sourceIdPath = itemSubpath(hashString: md5Hash(sourceId + "-\(Int(size.width))x\(Int(size.height))"))
    let itemDirectoryPath = "\(basePath)/\(sourceIdPath.directory)"
    let itemFirstFramePath = "\(itemDirectoryPath)/\(sourceIdPath.fileName)-f"

    var firstFrameItem: AnimationCacheItem?
    if FileManager.default.fileExists(atPath: itemFirstFramePath) {
        firstFrameItem = loadItem(path: itemFirstFramePath)
    }

    completion(firstFrameItem)
    return EmptyDisposable
}
}
private let queue: Queue
private let basePath: String
private let impl: QueueLocalObject<Impl>
public init(basePath: String, allocateTempFile: @escaping () -> String) {
let queue = Queue()
self.queue = queue
self.basePath = basePath
self.impl = QueueLocalObject(queue: queue, generate: {
return Impl(queue: queue, basePath: basePath, allocateTempFile: allocateTempFile)
})
@@ -431,9 +860,18 @@ public final class AnimationCacheImpl: AnimationCache {
|> runOn(self.queue)
}
public func getSynchronously(sourceId: String, size: CGSize) -> AnimationCacheItem? {
return self.impl.syncWith { impl -> AnimationCacheItem? in
return impl.getSynchronously(sourceId: sourceId, size: size)
public func getFirstFrameSynchronously(sourceId: String, size: CGSize) -> AnimationCacheItem? {
return Impl.getFirstFrameSynchronously(basePath: self.basePath, sourceId: sourceId, size: size)
}
/// Asynchronously fetches the cached first frame for `sourceId` at `size`.
///
/// The lookup is dispatched with `queue.async`; `completion` is invoked from inside that
/// dispatched block by `Impl.getFirstFrame`.
///
/// - Parameters:
///   - queue: Queue on which the file lookup is performed.
///   - sourceId: Identifier of the animation source.
///   - size: Requested item size.
///   - completion: Receives the loaded item, or `nil` when no first frame is available.
/// - Returns: A disposable that wraps whatever `Impl.getFirstFrame` returns once the block has run.
public func getFirstFrame(queue: Queue, sourceId: String, size: CGSize, completion: @escaping (AnimationCacheItem?) -> Void) -> Disposable {
    // Capture the path by value so the dispatched block does not need `self`.
    let cacheRoot = self.basePath
    let result = MetaDisposable()
    queue.async {
        let lookup = Impl.getFirstFrame(basePath: cacheRoot, sourceId: sourceId, size: size, completion: completion)
        result.set(lookup)
    }
    return result
}
}

View File

@@ -0,0 +1,231 @@
import Foundation
import UIKit
import DCT
/// A tightly packed pixel plane with one byte per component, backed by a `Data` buffer.
final class ImagePlane {
    /// Plane width in pixels.
    let width: Int
    /// Plane height in pixels.
    let height: Int
    /// Number of bytes in one row (`width * components`; rows carry no padding).
    let bytesPerRow: Int
    /// Number of one-byte components stored per pixel.
    let components: Int
    /// Pixel storage of `bytesPerRow * height` bytes.
    var data: Data

    /// Allocates storage for a `width` x `height` plane with `components` bytes per pixel.
    init(width: Int, height: Int, components: Int) {
        let rowLength = width * components
        self.width = width
        self.height = height
        self.components = components
        self.bytesPerRow = rowLength
        self.data = Data(count: rowLength * height)
    }
}
/// An interleaved four-component image held in a single plane.
final class ImageARGB {
    /// The interleaved pixel plane (4 bytes per pixel).
    let argbPlane: ImagePlane

    /// Allocates a `width` x `height` image with four components per pixel.
    init(width: Int, height: Int) {
        let interleavedPlane = ImagePlane(width: width, height: height, components: 4)
        self.argbPlane = interleavedPlane
    }
}
/// A planar image with full-resolution Y and A planes and half-resolution U and V planes.
final class ImageYUVA420 {
    /// Full-resolution Y plane.
    let yPlane: ImagePlane
    /// U plane at half width and half height.
    let uPlane: ImagePlane
    /// V plane at half width and half height.
    let vPlane: ImagePlane
    /// Full-resolution A plane.
    let aPlane: ImagePlane

    /// Allocates the four planes for a `width` x `height` image.
    ///
    /// The U and V dimensions use integer division, so odd sizes round down.
    init(width: Int, height: Int) {
        let halfWidth = width / 2
        let halfHeight = height / 2

        self.yPlane = ImagePlane(width: width, height: height, components: 1)
        self.uPlane = ImagePlane(width: halfWidth, height: halfHeight, components: 1)
        self.vPlane = ImagePlane(width: halfWidth, height: halfHeight, components: 1)
        self.aPlane = ImagePlane(width: width, height: height, components: 1)
    }
}
/// Storage for one plane of DCT coefficients, two bytes per coefficient.
final class DctCoefficientPlane {
    /// Number of coefficients per row.
    let width: Int
    /// Number of coefficient rows.
    let height: Int
    /// Coefficient storage of `width * 2 * height` bytes.
    var data: Data

    /// Allocates storage for `width` x `height` coefficients.
    init(width: Int, height: Int) {
        let bytesPerCoefficient = 2
        self.width = width
        self.height = height
        self.data = Data(count: width * bytesPerCoefficient * height)
    }
}
/// DCT coefficient planes laid out like `ImageYUVA420`: full-size Y and A, half-size U and V.
final class DctCoefficientsYUVA420 {
    /// Coefficients of the full-resolution Y plane.
    let yPlane: DctCoefficientPlane
    /// Coefficients of the half-resolution U plane.
    let uPlane: DctCoefficientPlane
    /// Coefficients of the half-resolution V plane.
    let vPlane: DctCoefficientPlane
    /// Coefficients of the full-resolution A plane.
    let aPlane: DctCoefficientPlane

    /// Allocates coefficient storage for a `width` x `height` image.
    ///
    /// The U and V dimensions use integer division, so odd sizes round down.
    init(width: Int, height: Int) {
        let halfWidth = width / 2
        let halfHeight = height / 2

        self.yPlane = DctCoefficientPlane(width: width, height: height)
        self.uPlane = DctCoefficientPlane(width: halfWidth, height: halfHeight)
        self.vPlane = DctCoefficientPlane(width: halfWidth, height: halfHeight)
        self.aPlane = DctCoefficientPlane(width: width, height: height)
    }
}
extension ImageARGB {
    /// Splits this interleaved image into the Y, U, V and A planes of `target`.
    ///
    /// The conversion itself is done by the C routine `splitRGBAIntoYUVAPlanes`.
    /// `target` must have the same luma dimensions as this image; every plane buffer must be
    /// non-empty, because the base addresses are force-unwrapped.
    ///
    /// - Parameter target: Destination image whose plane contents are overwritten.
    func toYUVA420(target: ImageYUVA420) {
        let source = self.argbPlane
        precondition(source.width == target.yPlane.width && source.height == target.yPlane.height)

        // Each closure pins one buffer and binds its pointer before descending to the next.
        source.data.withUnsafeBytes { (argbBytes: UnsafeRawBufferPointer) -> Void in
            let argb = argbBytes.baseAddress!.assumingMemoryBound(to: UInt8.self)
            target.yPlane.data.withUnsafeMutableBytes { (yBytes: UnsafeMutableRawBufferPointer) -> Void in
                let y = yBytes.baseAddress!.assumingMemoryBound(to: UInt8.self)
                target.uPlane.data.withUnsafeMutableBytes { (uBytes: UnsafeMutableRawBufferPointer) -> Void in
                    let u = uBytes.baseAddress!.assumingMemoryBound(to: UInt8.self)
                    target.vPlane.data.withUnsafeMutableBytes { (vBytes: UnsafeMutableRawBufferPointer) -> Void in
                        let v = vBytes.baseAddress!.assumingMemoryBound(to: UInt8.self)
                        target.aPlane.data.withUnsafeMutableBytes { (aBytes: UnsafeMutableRawBufferPointer) -> Void in
                            let a = aBytes.baseAddress!.assumingMemoryBound(to: UInt8.self)
                            splitRGBAIntoYUVAPlanes(
                                argb, y, u, v, a,
                                Int32(source.width),
                                Int32(source.height),
                                Int32(source.bytesPerRow)
                            )
                        }
                    }
                }
            }
        }
    }

    /// Allocates a matching `ImageYUVA420` and fills it from this image.
    ///
    /// - Returns: A new planar image with the same width and height as this one.
    func toYUVA420() -> ImageYUVA420 {
        let converted = ImageYUVA420(width: self.argbPlane.width, height: self.argbPlane.height)
        self.toYUVA420(target: converted)
        return converted
    }
}
extension ImageYUVA420 {
    /// Recombines the Y, U, V and A planes into the interleaved plane of `target`.
    ///
    /// The conversion itself is done by the C routine `combineYUVAPlanesIntoARBB`.
    /// `target` must have the same dimensions as this image's Y plane; every plane buffer must be
    /// non-empty, because the base addresses are force-unwrapped.
    ///
    /// - Parameter target: Destination image whose pixel contents are overwritten.
    func toARGB(target: ImageARGB) {
        let destination = target.argbPlane
        precondition(self.yPlane.width == destination.width && self.yPlane.height == destination.height)

        // Each closure pins one buffer and binds its pointer before descending to the next.
        self.yPlane.data.withUnsafeBytes { (yBytes: UnsafeRawBufferPointer) -> Void in
            self.uPlane.data.withUnsafeBytes { (uBytes: UnsafeRawBufferPointer) -> Void in
                self.vPlane.data.withUnsafeBytes { (vBytes: UnsafeRawBufferPointer) -> Void in
                    self.aPlane.data.withUnsafeBytes { (aBytes: UnsafeRawBufferPointer) -> Void in
                        destination.data.withUnsafeMutableBytes { (argbBytes: UnsafeMutableRawBufferPointer) -> Void in
                            let argb = argbBytes.baseAddress!.assumingMemoryBound(to: UInt8.self)
                            let y = yBytes.baseAddress!.assumingMemoryBound(to: UInt8.self)
                            let u = uBytes.baseAddress!.assumingMemoryBound(to: UInt8.self)
                            let v = vBytes.baseAddress!.assumingMemoryBound(to: UInt8.self)
                            let a = aBytes.baseAddress!.assumingMemoryBound(to: UInt8.self)
                            combineYUVAPlanesIntoARBB(
                                argb, y, u, v, a,
                                Int32(destination.width),
                                Int32(destination.height),
                                Int32(destination.bytesPerRow)
                            )
                        }
                    }
                }
            }
        }
    }

    /// Allocates a matching `ImageARGB` and fills it from this image.
    ///
    /// - Returns: A new interleaved image with the same width and height as the Y plane.
    func toARGB() -> ImageARGB {
        let converted = ImageARGB(width: self.yPlane.width, height: self.yPlane.height)
        self.toARGB(target: converted)
        return converted
    }
}
/// Forward and inverse DCT tables generated once for a given quality setting.
final class DctData {
    /// Quality value the tables were generated for.
    let quality: Int
    /// Opaque table passed to `performForwardDct`.
    let dctData: Data
    /// Opaque table passed to `performInverseDct`.
    let idctData: Data

    /// Generates both tables for `quality`.
    ///
    /// Traps if either C generator returns nil or if `quality` does not fit in `Int32`.
    init(quality: Int) {
        let nativeQuality = Int32(quality)
        let forwardTable = generateForwardDctData(nativeQuality)!
        let inverseTable = generateInverseDctData(nativeQuality)!

        self.quality = quality
        self.dctData = forwardTable
        self.idctData = inverseTable
    }
}
extension ImageYUVA420 {
    /// Runs the forward DCT over all four planes and writes the coefficients into `target`.
    ///
    /// Planes are processed in Y, U, V, A order, each paired with the same-named plane of `target`.
    /// `target` must have the same luma dimensions as this image; every plane buffer must be
    /// non-empty, because the base addresses are force-unwrapped.
    ///
    /// - Parameters:
    ///   - dctData: Tables produced for the desired quality; only `dctData.dctData` is used here.
    ///   - target: Destination coefficient planes, overwritten in place.
    func dct(dctData: DctData, target: DctCoefficientsYUVA420) {
        precondition(self.yPlane.width == target.yPlane.width && self.yPlane.height == target.yPlane.height)

        // Explicit pairing replaces an index-driven switch with an unreachable default.
        let planePairs: [(ImagePlane, DctCoefficientPlane)] = [
            (self.yPlane, target.yPlane),
            (self.uPlane, target.uPlane),
            (self.vPlane, target.vPlane),
            (self.aPlane, target.aPlane)
        ]

        for (sourcePlane, targetPlane) in planePairs {
            sourcePlane.data.withUnsafeBytes { (sourceBytes: UnsafeRawBufferPointer) -> Void in
                let sourcePixels = sourceBytes.baseAddress!.assumingMemoryBound(to: UInt8.self)
                targetPlane.data.withUnsafeMutableBytes { (targetBytes: UnsafeMutableRawBufferPointer) -> Void in
                    // The C prototype takes `int16_t *coefficients`, so bind the buffer as signed 16-bit.
                    let coefficients = targetBytes.baseAddress!.assumingMemoryBound(to: Int16.self)
                    performForwardDct(
                        sourcePixels,
                        coefficients,
                        Int32(sourcePlane.width),
                        Int32(sourcePlane.height),
                        Int32(sourcePlane.bytesPerRow),
                        dctData.dctData
                    )
                }
            }
        }
    }

    /// Allocates matching coefficient planes and fills them with the forward DCT of this image.
    ///
    /// - Parameter dctData: Tables produced for the desired quality.
    /// - Returns: Newly allocated coefficients sized from this image's Y plane.
    func dct(dctData: DctData) -> DctCoefficientsYUVA420 {
        let results = DctCoefficientsYUVA420(width: self.yPlane.width, height: self.yPlane.height)
        self.dct(dctData: dctData, target: results)
        return results
    }
}
extension DctCoefficientsYUVA420 {
    /// Runs the inverse DCT over all four coefficient planes and writes the pixels into `target`.
    ///
    /// Planes are processed in Y, U, V, A order, each paired with the same-named plane of `target`.
    /// `target` must have the same luma dimensions as these coefficients; every plane buffer must
    /// be non-empty, because the base addresses are force-unwrapped.
    ///
    /// - Parameters:
    ///   - dctData: Tables produced for the desired quality; only `dctData.idctData` is used here.
    ///   - target: Destination image planes, overwritten in place.
    func idct(dctData: DctData, target: ImageYUVA420) {
        precondition(self.yPlane.width == target.yPlane.width && self.yPlane.height == target.yPlane.height)

        // Explicit pairing replaces an index-driven switch with an unreachable default.
        let planePairs: [(DctCoefficientPlane, ImagePlane)] = [
            (self.yPlane, target.yPlane),
            (self.uPlane, target.uPlane),
            (self.vPlane, target.vPlane),
            (self.aPlane, target.aPlane)
        ]

        for (sourcePlane, targetPlane) in planePairs {
            sourcePlane.data.withUnsafeBytes { (sourceBytes: UnsafeRawBufferPointer) -> Void in
                // The C prototype takes `int16_t const *coefficients`, so bind the buffer as signed 16-bit.
                let coefficients = sourceBytes.baseAddress!.assumingMemoryBound(to: Int16.self)
                targetPlane.data.withUnsafeMutableBytes { (targetBytes: UnsafeMutableRawBufferPointer) -> Void in
                    let pixels = targetBytes.baseAddress!.assumingMemoryBound(to: UInt8.self)
                    // Argument order follows the header: coefficientsPerRow first, then bytesPerRow.
                    // Both equal the plane width today because every image plane has one component.
                    performInverseDct(
                        coefficients,
                        pixels,
                        Int32(sourcePlane.width),
                        Int32(sourcePlane.height),
                        Int32(sourcePlane.width),
                        Int32(targetPlane.bytesPerRow),
                        dctData.idctData
                    )
                }
            }
        }
    }

    /// Allocates a matching image and fills it with the inverse DCT of these coefficients.
    ///
    /// - Parameter dctData: Tables produced for the desired quality.
    /// - Returns: A newly allocated planar image sized from the Y coefficient plane.
    func idct(dctData: DctData) -> ImageYUVA420 {
        let resultImage = ImageYUVA420(width: self.yPlane.width, height: self.yPlane.height)
        self.idct(dctData: dctData, target: resultImage)
        return resultImage
    }
}