diff --git a/submodules/RLottie/RLottie_Xcode.xcodeproj/project.pbxproj b/submodules/RLottie/RLottie_Xcode.xcodeproj/project.pbxproj index 4d98cd0094..dcd6faf62b 100644 --- a/submodules/RLottie/RLottie_Xcode.xcodeproj/project.pbxproj +++ b/submodules/RLottie/RLottie_Xcode.xcodeproj/project.pbxproj @@ -116,6 +116,7 @@ D010E17A22C237CF009324D4 /* vbrush.cpp in Sources */ = {isa = PBXBuildFile; fileRef = D010E06B22C237CE009324D4 /* vbrush.cpp */; }; D010E17F22C23C42009324D4 /* LottieInstance.mm in Sources */ = {isa = PBXBuildFile; fileRef = D010E17E22C23C42009324D4 /* LottieInstance.mm */; }; D010E18122C23D57009324D4 /* LottieInstance.h in Headers */ = {isa = PBXBuildFile; fileRef = D010E18022C23D57009324D4 /* LottieInstance.h */; settings = {ATTRIBUTES = (Public, ); }; }; + D05B16B022CC29F100B85A94 /* pixman-arma64-neon-asm.h in Headers */ = {isa = PBXBuildFile; fileRef = D05B16AE22CC29F100B85A94 /* pixman-arma64-neon-asm.h */; }; /* End PBXBuildFile section */ /* Begin PBXFileReference section */ @@ -232,6 +233,8 @@ D010E06B22C237CE009324D4 /* vbrush.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = vbrush.cpp; sourceTree = ""; }; D010E17E22C23C42009324D4 /* LottieInstance.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = LottieInstance.mm; sourceTree = ""; }; D010E18022C23D57009324D4 /* LottieInstance.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = LottieInstance.h; sourceTree = ""; }; + D05B16AD22CC29F100B85A94 /* pixman-arma64-neon-asm.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = "pixman-arma64-neon-asm.S"; sourceTree = ""; }; + D05B16AE22CC29F100B85A94 /* pixman-arma64-neon-asm.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "pixman-arma64-neon-asm.h"; sourceTree = ""; }; /* End PBXFileReference section */ /* Begin PBXFrameworksBuildPhase section */ @@ -453,6 +456,8 @@ D010E03122C237CE009324D4 /* pixman */ = { isa = PBXGroup; children = ( + D05B16AE22CC29F100B85A94 /* pixman-arma64-neon-asm.h */, + D05B16AD22CC29F100B85A94 /* pixman-arma64-neon-asm.S */, D010E03322C237CE009324D4 /* pixman-arm-neon-asm.h */, D010E03422C237CE009324D4 /* vregion.h */, D010E03522C237CE009324D4 /* pixman-arm-neon-asm.S */, @@ -518,6 +523,7 @@ D010E12D22C237CE009324D4 /* rapidjson.h in Headers */, D010E11E22C237CE009324D4 /* strtod.h in Headers */, D010E14A22C237CF009324D4 /* config.h in Headers */, + D05B16B022CC29F100B85A94 /* pixman-arma64-neon-asm.h in Headers */, D010E17422C237CF009324D4 /* vstackallocator.h in Headers */, D010E10C22C237CE009324D4 /* lottieparser.h in Headers */, D010E15A22C237CF009324D4 /* vpathmesure.h in Headers */, diff --git a/submodules/RLottie/Sources/rlottie/src/vector/pixman/pixman-arma64-neon-asm.S b/submodules/RLottie/Sources/rlottie/src/vector/pixman/pixman-arma64-neon-asm.S new file mode 100644 index 0000000000..7705eee0a5 --- /dev/null +++ b/submodules/RLottie/Sources/rlottie/src/vector/pixman/pixman-arma64-neon-asm.S @@ -0,0 +1,418 @@ +/* + * Copyright © 2009 Nokia Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is 
furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com) + */ + +/* + * This file contains implementations of NEON optimized pixel processing + * functions. There is no full and detailed tutorial, but some functions + * (those which are exposing some new or interesting features) are + * extensively commented and can be used as examples. + * + * You may want to have a look at the comments for following functions: + * - pixman_composite_over_8888_0565_asm_neon + * - pixman_composite_over_n_8_0565_asm_neon + */ + +/* Prevent the stack from becoming executable for no reason... */ +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + +.text +.arch armv8-a +.altmacro +.p2align 2 + +/* Supplementary macro for setting function attributes */ +.macro pixman_asm_function fname +.func fname +.global fname +#ifdef __ELF__ +.hidden fname +.type fname, %function +#endif +fname: +.endm + +#include "pixman-arma64-neon-asm.h" + +/* Global configuration options and preferences */ + +/* + * The code can optionally make use of unaligned memory accesses to improve + * performance of handling leading/trailing pixels for each scanline. + * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0 for + * example in linux if unaligned memory accesses are not configured to + * generate.exceptions. + */ +.set RESPECT_STRICT_ALIGNMENT, 1 + +/* + * Set default prefetch type. There is a choice between the following options: + * + * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work + * as NOP to workaround some HW bugs or for whatever other reason) + * + * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where + * advanced prefetch intruduces heavy overhead) + * + * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8 + * which can run ARM and NEON instructions simultaneously so that extra ARM + * instructions do not add (many) extra cycles, but improve prefetch efficiency) + * + * Note: some types of function can't support advanced prefetch and fallback + * to simple one (those which handle 24bpp pixels) + */ +.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED + +/* Prefetch distance in pixels for simple prefetch */ +.set PREFETCH_DISTANCE_SIMPLE, 64 + +/* + * Implementation of pixman_composite_over_8888_0565_asm_neon + * + * This function takes a8r8g8b8 source buffer, r5g6b5 destination buffer and + * performs OVER compositing operation. Function fast_composite_over_8888_0565 + * from pixman-fast-path.c does the same in C and can be used as a reference. + * + * First we need to have some NEON assembly code which can do the actual + * operation on the pixels and provide it to the template macro. 
+ * + * Template macro quite conveniently takes care of emitting all the necessary + * code for memory reading and writing (including quite tricky cases of + * handling unaligned leading/trailing pixels), so we only need to deal with + * the data in NEON registers. + * + * NEON registers allocation in general is recommented to be the following: + * v0, v1, v2, v3 - contain loaded source pixel data + * v4, v5, v6, v7 - contain loaded destination pixels (if they are needed) + * v24, v25, v26, v27 - contain loading mask pixel data (if mask is used) + * v28, v29, v30, v31 - place for storing the result (destination pixels) + * + * As can be seen above, four 64-bit NEON registers are used for keeping + * intermediate pixel data and up to 8 pixels can be processed in one step + * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp). + * + * This particular function uses the following registers allocation: + * v0, v1, v2, v3 - contain loaded source pixel data + * v4, v5 - contain loaded destination pixels (they are needed) + * v28, v29 - place for storing the result (destination pixels) + */ + +/* + * Step one. We need to have some code to do some arithmetics on pixel data. + * This is implemented as a pair of macros: '*_head' and '*_tail'. When used + * back-to-back, they take pixel data from {v0, v1, v2, v3} and {v4, v5}, + * perform all the needed calculations and write the result to {v28, v29}. + * The rationale for having two macros and not just one will be explained + * later. In practice, any single monolitic function which does the work can + * be split into two parts in any arbitrary way without affecting correctness. + * + * There is one special trick here too. Common template macro can optionally + * make our life a bit easier by doing R, G, B, A color components + * deinterleaving for 32bpp pixel formats (and this feature is used in + * 'pixman_composite_over_8888_0565_asm_neon' function). So it means that + * instead of having 8 packed pixels in {v0, v1, v2, v3} registers, we + * actually use v0 register for blue channel (a vector of eight 8-bit + * values), v1 register for green, v2 for red and v3 for alpha. This + * simple conversion can be also done with a few NEON instructions: + * + * Packed to planar conversion: // vuzp8 is a wrapper macro + * vuzp8 v0, v1 + * vuzp8 v2, v3 + * vuzp8 v1, v3 + * vuzp8 v0, v2 + * + * Planar to packed conversion: // vzip8 is a wrapper macro + * vzip8 v0, v2 + * vzip8 v1, v3 + * vzip8 v2, v3 + * vzip8 v0, v1 + * + * But pixel can be loaded directly in planar format using LD4 / b NEON + * instruction. It is 1 cycle slower than LD1 / s, so this is not always + * desirable, that's why deinterleaving is optional. 
+ * + * But anyway, here is the code: + */ + +.macro pixman_composite_out_reverse_8888_8888_process_pixblock_head + mvn v24.8b, v3.8b /* get inverted alpha */ + /* do alpha blending */ + umull v8.8h, v24.8b, v4.8b + umull v9.8h, v24.8b, v5.8b + umull v10.8h, v24.8b, v6.8b + umull v11.8h, v24.8b, v7.8b +.endm + +.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail + urshr v14.8h, v8.8h, #8 + urshr v15.8h, v9.8h, #8 + urshr v16.8h, v10.8h, #8 + urshr v17.8h, v11.8h, #8 + raddhn v28.8b, v14.8h, v8.8h + raddhn v29.8b, v15.8h, v9.8h + raddhn v30.8b, v16.8h, v10.8h + raddhn v31.8b, v17.8h, v11.8h +.endm + +/******************************************************************************/ + +.macro pixman_composite_over_8888_8888_process_pixblock_head + pixman_composite_out_reverse_8888_8888_process_pixblock_head +.endm + +.macro pixman_composite_over_8888_8888_process_pixblock_tail + pixman_composite_out_reverse_8888_8888_process_pixblock_tail + uqadd v28.8b, v0.8b, v28.8b + uqadd v29.8b, v1.8b, v29.8b + uqadd v30.8b, v2.8b, v30.8b + uqadd v31.8b, v3.8b, v31.8b +.endm + +.macro pixman_composite_over_8888_8888_process_pixblock_tail_head + ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 + urshr v14.8h, v8.8h, #8 + PF add PF_X, PF_X, #8 + PF tst PF_CTL, #0xF + urshr v15.8h, v9.8h, #8 + urshr v16.8h, v10.8h, #8 + urshr v17.8h, v11.8h, #8 + PF beq 10f + PF add PF_X, PF_X, #8 + PF sub PF_CTL, PF_CTL, #1 +10: + raddhn v28.8b, v14.8h, v8.8h + raddhn v29.8b, v15.8h, v9.8h + PF cmp PF_X, ORIG_W + raddhn v30.8b, v16.8h, v10.8h + raddhn v31.8b, v17.8h, v11.8h + uqadd v28.8b, v0.8b, v28.8b + uqadd v29.8b, v1.8b, v29.8b + uqadd v30.8b, v2.8b, v30.8b + uqadd v31.8b, v3.8b, v31.8b + fetch_src_pixblock + PF lsl DUMMY, PF_X, #src_bpp_shift + PF prfm PREFETCH_MODE, [PF_SRC, DUMMY] + mvn v22.8b, v3.8b + PF lsl DUMMY, PF_X, #dst_bpp_shift + PF prfm PREFETCH_MODE, [PF_DST, DUMMY] + st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 + PF ble 10f + PF sub PF_X, PF_X, ORIG_W +10: + umull v8.8h, v22.8b, v4.8b + PF ble 10f + PF subs PF_CTL, PF_CTL, #0x10 +10: + umull v9.8h, v22.8b, v5.8b + PF ble 10f + PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift + PF ldrsb DUMMY, [PF_SRC, DUMMY] + PF add PF_SRC, PF_SRC, #1 +10: + umull v10.8h, v22.8b, v6.8b + PF ble 10f + PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift + PF ldrsb DUMMY, [PF_DST, DUMMY] + PF add PF_DST, PF_DST, #1 +10: + umull v11.8h, v22.8b, v7.8b +.endm + +generate_composite_function \ + pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + default_init, \ + default_cleanup, \ + pixman_composite_over_8888_8888_process_pixblock_head, \ + pixman_composite_over_8888_8888_process_pixblock_tail, \ + pixman_composite_over_8888_8888_process_pixblock_tail_head + +generate_composite_function_single_scanline \ + pixman_composite_scanline_over_asm_neon, 32, 0, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + default_init, \ + default_cleanup, \ + pixman_composite_over_8888_8888_process_pixblock_head, \ + pixman_composite_over_8888_8888_process_pixblock_tail, \ + pixman_composite_over_8888_8888_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_over_n_8888_process_pixblock_head + /* deinterleaved source pixels in {v0, v1, v2, v3} */ + /* inverted alpha in {v24} */ + /* destination pixels 
in {v4, v5, v6, v7} */ + umull v8.8h, v24.8b, v4.8b + umull v9.8h, v24.8b, v5.8b + umull v10.8h, v24.8b, v6.8b + umull v11.8h, v24.8b, v7.8b +.endm + +.macro pixman_composite_over_n_8888_process_pixblock_tail + urshr v14.8h, v8.8h, #8 + urshr v15.8h, v9.8h, #8 + urshr v16.8h, v10.8h, #8 + urshr v17.8h, v11.8h, #8 + raddhn v28.8b, v14.8h, v8.8h + raddhn v29.8b, v15.8h, v9.8h + raddhn v30.8b, v16.8h, v10.8h + raddhn v31.8b, v17.8h, v11.8h + uqadd v28.8b, v0.8b, v28.8b + uqadd v29.8b, v1.8b, v29.8b + uqadd v30.8b, v2.8b, v30.8b + uqadd v31.8b, v3.8b, v31.8b +.endm + +.macro pixman_composite_over_n_8888_process_pixblock_tail_head + urshr v14.8h, v8.8h, #8 + urshr v15.8h, v9.8h, #8 + urshr v16.8h, v10.8h, #8 + urshr v17.8h, v11.8h, #8 + raddhn v28.8b, v14.8h, v8.8h + raddhn v29.8b, v15.8h, v9.8h + raddhn v30.8b, v16.8h, v10.8h + raddhn v31.8b, v17.8h, v11.8h + ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 + uqadd v28.8b, v0.8b, v28.8b + PF add PF_X, PF_X, #8 + PF tst PF_CTL, #0x0F + PF beq 10f + PF add PF_X, PF_X, #8 + PF sub PF_CTL, PF_CTL, #1 +10: + uqadd v29.8b, v1.8b, v29.8b + uqadd v30.8b, v2.8b, v30.8b + uqadd v31.8b, v3.8b, v31.8b + PF cmp PF_X, ORIG_W + umull v8.8h, v24.8b, v4.8b + PF lsl DUMMY, PF_X, #dst_bpp_shift + PF prfm PREFETCH_MODE, [PF_DST, DUMMY] + umull v9.8h, v24.8b, v5.8b + PF ble 10f + PF sub PF_X, PF_X, ORIG_W +10: + umull v10.8h, v24.8b, v6.8b + PF subs PF_CTL, PF_CTL, #0x10 + umull v11.8h, v24.8b, v7.8b + PF ble 10f + PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift + PF ldrsb DUMMY, [PF_DST, DUMMY] + PF add PF_DST, PF_DST, #1 +10: + st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 +.endm + +.macro pixman_composite_over_n_8888_init + mov v3.s[0], w4 + dup v0.8b, v3.b[0] + dup v1.8b, v3.b[1] + dup v2.8b, v3.b[2] + dup v3.8b, v3.b[3] + mvn v24.8b, v3.8b /* get inverted alpha */ +.endm + +generate_composite_function \ + pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_over_n_8888_init, \ + default_cleanup, \ + pixman_composite_over_8888_8888_process_pixblock_head, \ + pixman_composite_over_8888_8888_process_pixblock_tail, \ + pixman_composite_over_n_8888_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_src_n_8888_process_pixblock_head +.endm + +.macro pixman_composite_src_n_8888_process_pixblock_tail +.endm + +.macro pixman_composite_src_n_8888_process_pixblock_tail_head + st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [DST_W], #32 +.endm + +.macro pixman_composite_src_n_8888_init + mov v0.s[0], w4 + dup v3.2s, v0.s[0] + dup v2.2s, v0.s[0] + dup v1.2s, v0.s[0] + dup v0.2s, v0.s[0] +.endm + +.macro pixman_composite_src_n_8888_cleanup +.endm + +generate_composite_function \ + pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \ + FLAG_DST_WRITEONLY, \ + 8, /* number of pixels, processed in a single block */ \ + 0, /* prefetch distance */ \ + pixman_composite_src_n_8888_init, \ + pixman_composite_src_n_8888_cleanup, \ + pixman_composite_src_n_8888_process_pixblock_head, \ + pixman_composite_src_n_8888_process_pixblock_tail, \ + pixman_composite_src_n_8888_process_pixblock_tail_head, \ + 0, /* dst_w_basereg */ \ + 0, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 0 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_src_8888_8888_process_pixblock_head +.endm + +.macro 
pixman_composite_src_8888_8888_process_pixblock_tail +.endm + +.macro pixman_composite_src_8888_8888_process_pixblock_tail_head + st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [DST_W], #32 + fetch_src_pixblock + cache_preload 8, 8 +.endm + +generate_composite_function \ + pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \ + FLAG_DST_WRITEONLY, \ + 8, /* number of pixels, processed in a single block */ \ + 10, /* prefetch distance */ \ + default_init, \ + default_cleanup, \ + pixman_composite_src_8888_8888_process_pixblock_head, \ + pixman_composite_src_8888_8888_process_pixblock_tail, \ + pixman_composite_src_8888_8888_process_pixblock_tail_head, \ + 0, /* dst_w_basereg */ \ + 0, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 0 /* mask_basereg */ + +/******************************************************************************/ diff --git a/submodules/RLottie/Sources/rlottie/src/vector/pixman/pixman-arma64-neon-asm.h b/submodules/RLottie/Sources/rlottie/src/vector/pixman/pixman-arma64-neon-asm.h new file mode 100644 index 0000000000..1103084a3e --- /dev/null +++ b/submodules/RLottie/Sources/rlottie/src/vector/pixman/pixman-arma64-neon-asm.h @@ -0,0 +1,1223 @@ +/* + * Copyright © 2009 Nokia Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com) + */ + +/* + * This file contains a macro ('generate_composite_function') which can + * construct 2D image processing functions, based on a common template. + * Any combinations of source, destination and mask images with 8bpp, + * 16bpp, 24bpp, 32bpp color formats are supported. + * + * This macro takes care of: + * - handling of leading and trailing unaligned pixels + * - doing most of the work related to L2 cache preload + * - encourages the use of software pipelining for better instructions + * scheduling + * + * The user of this macro has to provide some configuration parameters + * (bit depths for the images, prefetch distance, etc.) and a set of + * macros, which should implement basic code chunks responsible for + * pixels processing. See 'pixman-armv8-neon-asm.S' file for the usage + * examples. + * + * TODO: + * - try overlapped pixel method (from Ian Rickards) when processing + * exactly two blocks of pixels + * - maybe add an option to do reverse scanline processing + */ + +/* + * Bit flags for 'generate_composite_function' macro which are used + * to tune generated functions behavior. 
+ */ +.set FLAG_DST_WRITEONLY, 0 +.set FLAG_DST_READWRITE, 1 +.set FLAG_DEINTERLEAVE_32BPP, 2 + +/* + * Constants for selecting preferable prefetch type. + */ +.set PREFETCH_TYPE_NONE, 0 /* No prefetch at all */ +.set PREFETCH_TYPE_SIMPLE, 1 /* A simple, fixed-distance-ahead prefetch */ +.set PREFETCH_TYPE_ADVANCED, 2 /* Advanced fine-grained prefetch */ + +/* + * prefetch mode + * available modes are: + * pldl1keep + * pldl1strm + * pldl2keep + * pldl2strm + * pldl3keep + * pldl3strm + */ +#define PREFETCH_MODE pldl1keep + +/* + * Definitions of supplementary pixld/pixst macros (for partial load/store of + * pixel data). + */ + +.macro pixldst1 op, elem_size, reg1, mem_operand, abits + op {v®1&.&elem_size}, [&mem_operand&], #8 +.endm + +.macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits + op {v®1&.&elem_size, v®2&.&elem_size}, [&mem_operand&], #16 +.endm + +.macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits + op {v®1&.&elem_size, v®2&.&elem_size, v®3&.&elem_size, v®4&.&elem_size}, [&mem_operand&], #32 +.endm + +.macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits, bytes + op {v®1&.&elem_size}[idx], [&mem_operand&], #&bytes& +.endm + +.macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand + op {v®1&.&elem_size, v®2&.&elem_size, v®3&.&elem_size}, [&mem_operand&], #24 +.endm + +.macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand + op {v®1&.&elem_size, v®2&.&elem_size, v®3&.&elem_size}[idx], [&mem_operand&], #3 +.endm + +.macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits +.if numbytes == 32 + .if elem_size==32 + pixldst4 op, 2s, %(basereg+4), %(basereg+5), \ + %(basereg+6), %(basereg+7), mem_operand, abits + .elseif elem_size==16 + pixldst4 op, 4h, %(basereg+4), %(basereg+5), \ + %(basereg+6), %(basereg+7), mem_operand, abits + .else + pixldst4 op, 8b, %(basereg+4), %(basereg+5), \ + %(basereg+6), %(basereg+7), mem_operand, abits + .endif +.elseif numbytes == 16 + .if elem_size==32 + pixldst2 op, 2s, %(basereg+2), %(basereg+3), mem_operand, abits + .elseif elem_size==16 + pixldst2 op, 4h, %(basereg+2), %(basereg+3), mem_operand, abits + .else + pixldst2 op, 8b, %(basereg+2), %(basereg+3), mem_operand, abits + .endif +.elseif numbytes == 8 + .if elem_size==32 + pixldst1 op, 2s, %(basereg+1), mem_operand, abits + .elseif elem_size==16 + pixldst1 op, 4h, %(basereg+1), mem_operand, abits + .else + pixldst1 op, 8b, %(basereg+1), mem_operand, abits + .endif +.elseif numbytes == 4 + .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32) + pixldst0 op, s, %(basereg+0), 1, mem_operand, abits, 4 + .elseif elem_size == 16 + pixldst0 op, h, %(basereg+0), 2, mem_operand, abits, 2 + pixldst0 op, h, %(basereg+0), 3, mem_operand, abits, 2 + .else + pixldst0 op, b, %(basereg+0), 4, mem_operand, abits, 1 + pixldst0 op, b, %(basereg+0), 5, mem_operand, abits, 1 + pixldst0 op, b, %(basereg+0), 6, mem_operand, abits, 1 + pixldst0 op, b, %(basereg+0), 7, mem_operand, abits, 1 + .endif +.elseif numbytes == 2 + .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16) + pixldst0 op, h, %(basereg+0), 1, mem_operand, abits, 2 + .else + pixldst0 op, b, %(basereg+0), 2, mem_operand, abits, 1 + pixldst0 op, b, %(basereg+0), 3, mem_operand, abits, 1 + .endif +.elseif numbytes == 1 + pixldst0 op, b, %(basereg+0), 1, mem_operand, abits, 1 +.else + .error "unsupported size: numbytes" +.endif +.endm + +.macro pixld numpix, bpp, basereg, mem_operand, abits=0 +.if bpp > 0 +.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) + pixldst4 ld4, 
8b, %(basereg+4), %(basereg+5), \ + %(basereg+6), %(basereg+7), mem_operand, abits +.elseif (bpp == 24) && (numpix == 8) + pixldst3 ld3, 8b, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand +.elseif (bpp == 24) && (numpix == 4) + pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand + pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand + pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand + pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand +.elseif (bpp == 24) && (numpix == 2) + pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand + pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand +.elseif (bpp == 24) && (numpix == 1) + pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand +.else + pixldst %(numpix * bpp / 8), ld1, %(bpp), basereg, mem_operand, abits +.endif +.endif +.endm + +.macro pixst numpix, bpp, basereg, mem_operand, abits=0 +.if bpp > 0 +.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) + pixldst4 st4, 8b, %(basereg+4), %(basereg+5), \ + %(basereg+6), %(basereg+7), mem_operand, abits +.elseif (bpp == 24) && (numpix == 8) + pixldst3 st3, 8b, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand +.elseif (bpp == 24) && (numpix == 4) + pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand + pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand + pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand + pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand +.elseif (bpp == 24) && (numpix == 2) + pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand + pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand +.elseif (bpp == 24) && (numpix == 1) + pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand +.else + pixldst %(numpix * bpp / 8), st1, %(bpp), basereg, mem_operand, abits +.endif +.endif +.endm + +.macro pixld_a numpix, bpp, basereg, mem_operand +.if (bpp * numpix) <= 128 + pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix) +.else + pixld numpix, bpp, basereg, mem_operand, 128 +.endif +.endm + +.macro pixst_a numpix, bpp, basereg, mem_operand +.if (bpp * numpix) <= 128 + pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix) +.else + pixst numpix, bpp, basereg, mem_operand, 128 +.endif +.endm + +/* + * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X register + * aliases to be defined) + */ +.macro pixld1_s elem_size, reg1, mem_operand +.if elem_size == 16 + asr TMP1, VX, #16 + adds VX, VX, UNIT_X + bmi 55f +5: subs VX, VX, SRC_WIDTH_FIXED + bpl 5b +55: + add TMP1, mem_operand, TMP1, lsl #1 + asr TMP2, VX, #16 + adds VX, VX, UNIT_X + bmi 55f +5: subs VX, VX, SRC_WIDTH_FIXED + bpl 5b +55: + add TMP2, mem_operand, TMP2, lsl #1 + ld1 {v®1&.h}[0], [TMP1] + asr TMP1, VX, #16 + adds VX, VX, UNIT_X + bmi 55f +5: subs VX, VX, SRC_WIDTH_FIXED + bpl 5b +55: + add TMP1, mem_operand, TMP1, lsl #1 + ld1 {v®1&.h}[1], [TMP2] + asr TMP2, VX, #16 + adds VX, VX, UNIT_X + bmi 55f +5: subs VX, VX, SRC_WIDTH_FIXED + bpl 5b +55: + add TMP2, mem_operand, TMP2, lsl #1 + ld1 {v®1&.h}[2], [TMP1] + ld1 {v®1&.h}[3], [TMP2] +.elseif elem_size == 32 + asr TMP1, VX, #16 + adds VX, VX, UNIT_X + bmi 55f +5: subs VX, VX, SRC_WIDTH_FIXED + bpl 5b +55: + add TMP1, mem_operand, TMP1, lsl #2 + asr TMP2, VX, #16 + adds VX, VX, UNIT_X + bmi 55f +5: subs VX, VX, SRC_WIDTH_FIXED 
+ bpl 5b +55: + add TMP2, mem_operand, TMP2, lsl #2 + ld1 {v®1&.s}[0], [TMP1] + ld1 {v®1&.s}[1], [TMP2] +.else + .error "unsupported" +.endif +.endm + +.macro pixld2_s elem_size, reg1, reg2, mem_operand +.if 0 /* elem_size == 32 */ + mov TMP1, VX, asr #16 + add VX, VX, UNIT_X, asl #1 + add TMP1, mem_operand, TMP1, asl #2 + mov TMP2, VX, asr #16 + sub VX, VX, UNIT_X + add TMP2, mem_operand, TMP2, asl #2 + ld1 {v®1&.s}[0], [TMP1] + mov TMP1, VX, asr #16 + add VX, VX, UNIT_X, asl #1 + add TMP1, mem_operand, TMP1, asl #2 + ld1 {v®2&.s}[0], [TMP2, :32] + mov TMP2, VX, asr #16 + add VX, VX, UNIT_X + add TMP2, mem_operand, TMP2, asl #2 + ld1 {v®1&.s}[1], [TMP1] + ld1 {v®2&.s}[1], [TMP2] +.else + pixld1_s elem_size, reg1, mem_operand + pixld1_s elem_size, reg2, mem_operand +.endif +.endm + +.macro pixld0_s elem_size, reg1, idx, mem_operand +.if elem_size == 16 + asr TMP1, VX, #16 + adds VX, VX, UNIT_X + bmi 55f +5: subs VX, VX, SRC_WIDTH_FIXED + bpl 5b +55: + add TMP1, mem_operand, TMP1, lsl #1 + ld1 {v®1&.h}[idx], [TMP1] +.elseif elem_size == 32 + asr DUMMY, VX, #16 + mov TMP1, DUMMY + adds VX, VX, UNIT_X + bmi 55f +5: subs VX, VX, SRC_WIDTH_FIXED + bpl 5b +55: + add TMP1, mem_operand, TMP1, lsl #2 + ld1 {v®1&.s}[idx], [TMP1] +.endif +.endm + +.macro pixld_s_internal numbytes, elem_size, basereg, mem_operand +.if numbytes == 32 + pixld2_s elem_size, %(basereg+4), %(basereg+5), mem_operand + pixld2_s elem_size, %(basereg+6), %(basereg+7), mem_operand + pixdeinterleave elem_size, %(basereg+4) +.elseif numbytes == 16 + pixld2_s elem_size, %(basereg+2), %(basereg+3), mem_operand +.elseif numbytes == 8 + pixld1_s elem_size, %(basereg+1), mem_operand +.elseif numbytes == 4 + .if elem_size == 32 + pixld0_s elem_size, %(basereg+0), 1, mem_operand + .elseif elem_size == 16 + pixld0_s elem_size, %(basereg+0), 2, mem_operand + pixld0_s elem_size, %(basereg+0), 3, mem_operand + .else + pixld0_s elem_size, %(basereg+0), 4, mem_operand + pixld0_s elem_size, %(basereg+0), 5, mem_operand + pixld0_s elem_size, %(basereg+0), 6, mem_operand + pixld0_s elem_size, %(basereg+0), 7, mem_operand + .endif +.elseif numbytes == 2 + .if elem_size == 16 + pixld0_s elem_size, %(basereg+0), 1, mem_operand + .else + pixld0_s elem_size, %(basereg+0), 2, mem_operand + pixld0_s elem_size, %(basereg+0), 3, mem_operand + .endif +.elseif numbytes == 1 + pixld0_s elem_size, %(basereg+0), 1, mem_operand +.else + .error "unsupported size: numbytes" +.endif +.endm + +.macro pixld_s numpix, bpp, basereg, mem_operand +.if bpp > 0 + pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand +.endif +.endm + +.macro vuzp8 reg1, reg2 + umov DUMMY, v16.d[0] + uzp1 v16.8b, v®1&.8b, v®2&.8b + uzp2 v®2&.8b, v®1&.8b, v®2&.8b + mov v®1&.8b, v16.8b + mov v16.d[0], DUMMY +.endm + +.macro vzip8 reg1, reg2 + umov DUMMY, v16.d[0] + zip1 v16.8b, v®1&.8b, v®2&.8b + zip2 v®2&.8b, v®1&.8b, v®2&.8b + mov v®1&.8b, v16.8b + mov v16.d[0], DUMMY +.endm + +/* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */ +.macro pixdeinterleave bpp, basereg +.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0) + vuzp8 %(basereg+0), %(basereg+1) + vuzp8 %(basereg+2), %(basereg+3) + vuzp8 %(basereg+1), %(basereg+3) + vuzp8 %(basereg+0), %(basereg+2) +.endif +.endm + +/* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */ +.macro pixinterleave bpp, basereg +.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0) + vzip8 %(basereg+0), %(basereg+2) + vzip8 %(basereg+1), %(basereg+3) + vzip8 %(basereg+2), %(basereg+3) + vzip8 
%(basereg+0), %(basereg+1) +.endif +.endm + +/* + * This is a macro for implementing cache preload. The main idea is that + * cache preload logic is mostly independent from the rest of pixels + * processing code. It starts at the top left pixel and moves forward + * across pixels and can jump across scanlines. Prefetch distance is + * handled in an 'incremental' way: it starts from 0 and advances to the + * optimal distance over time. After reaching optimal prefetch distance, + * it is kept constant. There are some checks which prevent prefetching + * unneeded pixel lines below the image (but it still can prefetch a bit + * more data on the right side of the image - not a big issue and may + * be actually helpful when rendering text glyphs). Additional trick is + * the use of LDR instruction for prefetch instead of PLD when moving to + * the next line, the point is that we have a high chance of getting TLB + * miss in this case, and PLD would be useless. + * + * This sounds like it may introduce a noticeable overhead (when working with + * fully cached data). But in reality, due to having a separate pipeline and + * instruction queue for NEON unit in ARM Cortex-A8, normal ARM code can + * execute simultaneously with NEON and be completely shadowed by it. Thus + * we get no performance overhead at all (*). This looks like a very nice + * feature of Cortex-A8, if used wisely. We don't have a hardware prefetcher, + * but still can implement some rather advanced prefetch logic in software + * for almost zero cost! + * + * (*) The overhead of the prefetcher is visible when running some trivial + * pixels processing like simple copy. Anyway, having prefetch is a must + * when working with the graphics data. + */ +.macro PF a, x:vararg +.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED) + a x +.endif +.endm + +.macro cache_preload std_increment, boost_increment +.if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0) +.if std_increment != 0 + PF add PF_X, PF_X, #std_increment +.endif + PF tst PF_CTL, #0xF + PF beq 71f + PF add PF_X, PF_X, #boost_increment + PF sub PF_CTL, PF_CTL, #1 +71: + PF cmp PF_X, ORIG_W +.if src_bpp_shift >= 0 + PF lsl DUMMY, PF_X, #src_bpp_shift + PF prfm PREFETCH_MODE, [PF_SRC, DUMMY] +.endif +.if dst_r_bpp != 0 + PF lsl DUMMY, PF_X, #dst_bpp_shift + PF prfm PREFETCH_MODE, [PF_DST, DUMMY] +.endif +.if mask_bpp_shift >= 0 + PF lsl DUMMY, PF_X, #mask_bpp_shift + PF prfm PREFETCH_MODE, [PF_MASK, DUMMY] +.endif + PF ble 71f + PF sub PF_X, PF_X, ORIG_W + PF subs PF_CTL, PF_CTL, #0x10 +71: + PF ble 72f +.if src_bpp_shift >= 0 + PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift + PF ldrsb DUMMY, [PF_SRC, DUMMY] + PF add PF_SRC, PF_SRC, #1 +.endif +.if dst_r_bpp != 0 + PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift + PF ldrsb DUMMY, [PF_DST, DUMMY] + PF add PF_DST, PF_DST, #1 +.endif +.if mask_bpp_shift >= 0 + PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift + PF ldrsb DUMMY, [PF_MASK, DUMMY] + PF add PF_MASK, PF_MASK, #1 +.endif +72: +.endif +.endm + +.macro cache_preload_simple +.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE) +.if src_bpp > 0 + prfm PREFETCH_MODE, [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)] +.endif +.if dst_r_bpp > 0 + prfm PREFETCH_MODE, [DST_R, #(PREFETCH_DISTANCE_SIMPLE * dst_r_bpp / 8)] +.endif +.if mask_bpp > 0 + prfm PREFETCH_MODE, [MASK, #(PREFETCH_DISTANCE_SIMPLE * mask_bpp / 8)] +.endif +.endif +.endm + +.macro fetch_mask_pixblock + pixld pixblock_size, mask_bpp, \ + (mask_basereg - pixblock_size * mask_bpp / 64), MASK +.endm + +/* + * Macro which 
is used to process leading pixels until destination + * pointer is properly aligned (at 16 bytes boundary). When destination + * buffer uses 16bpp format, this is unnecessary, or even pointless. + */ +.macro ensure_destination_ptr_alignment process_pixblock_head, \ + process_pixblock_tail, \ + process_pixblock_tail_head +.if dst_w_bpp != 24 + tst DST_R, #0xF + beq 52f +.irp lowbit, 1, 2, 4, 8, 16 +local skip1 +.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp)) +.if lowbit < 16 /* we don't need more than 16-byte alignment */ + tst DST_R, #lowbit + beq 51f +.endif + pixld_src (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC + pixld (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK +.if dst_r_bpp > 0 + pixld_a (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R +.else + add DST_R, DST_R, #lowbit +.endif + PF add PF_X, PF_X, #(lowbit * 8 / dst_w_bpp) + sub W, W, #(lowbit * 8 / dst_w_bpp) +51: +.endif +.endr + pixdeinterleave src_bpp, src_basereg + pixdeinterleave mask_bpp, mask_basereg + pixdeinterleave dst_r_bpp, dst_r_basereg + + process_pixblock_head + cache_preload 0, pixblock_size + cache_preload_simple + process_pixblock_tail + + pixinterleave dst_w_bpp, dst_w_basereg + +.irp lowbit, 1, 2, 4, 8, 16 +.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp)) +.if lowbit < 16 /* we don't need more than 16-byte alignment */ + tst DST_W, #lowbit + beq 51f +.endif + pixst_a (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W +51: +.endif +.endr +.endif +52: +.endm + +/* + * Special code for processing up to (pixblock_size - 1) remaining + * trailing pixels. As SIMD processing performs operation on + * pixblock_size pixels, anything smaller than this has to be loaded + * and stored in a special way. Loading and storing of pixel data is + * performed in such a way that we fill some 'slots' in the NEON + * registers (some slots naturally are unused), then perform compositing + * operation as usual. In the end, the data is taken from these 'slots' + * and saved to memory. 
+ * + * cache_preload_flag - allows to suppress prefetch if + * set to 0 + * dst_aligned_flag - selects whether destination buffer + * is aligned + */ +.macro process_trailing_pixels cache_preload_flag, \ + dst_aligned_flag, \ + process_pixblock_head, \ + process_pixblock_tail, \ + process_pixblock_tail_head + tst W, #(pixblock_size - 1) + beq 52f +.irp chunk_size, 16, 8, 4, 2, 1 +.if pixblock_size > chunk_size + tst W, #chunk_size + beq 51f + pixld_src chunk_size, src_bpp, src_basereg, SRC + pixld chunk_size, mask_bpp, mask_basereg, MASK +.if dst_aligned_flag != 0 + pixld_a chunk_size, dst_r_bpp, dst_r_basereg, DST_R +.else + pixld chunk_size, dst_r_bpp, dst_r_basereg, DST_R +.endif +.if cache_preload_flag != 0 + PF add PF_X, PF_X, #chunk_size +.endif +51: +.endif +.endr + pixdeinterleave src_bpp, src_basereg + pixdeinterleave mask_bpp, mask_basereg + pixdeinterleave dst_r_bpp, dst_r_basereg + + process_pixblock_head +.if cache_preload_flag != 0 + cache_preload 0, pixblock_size + cache_preload_simple +.endif + process_pixblock_tail + pixinterleave dst_w_bpp, dst_w_basereg +.irp chunk_size, 16, 8, 4, 2, 1 +.if pixblock_size > chunk_size + tst W, #chunk_size + beq 51f +.if dst_aligned_flag != 0 + pixst_a chunk_size, dst_w_bpp, dst_w_basereg, DST_W +.else + pixst chunk_size, dst_w_bpp, dst_w_basereg, DST_W +.endif +51: +.endif +.endr +52: +.endm + +/* + * Macro, which performs all the needed operations to switch to the next + * scanline and start the next loop iteration unless all the scanlines + * are already processed. + */ +.macro advance_to_next_scanline start_of_loop_label + mov W, ORIG_W + add DST_W, DST_W, DST_STRIDE, lsl #dst_bpp_shift +.if src_bpp != 0 + add SRC, SRC, SRC_STRIDE, lsl #src_bpp_shift +.endif +.if mask_bpp != 0 + add MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift +.endif +.if (dst_w_bpp != 24) + sub DST_W, DST_W, W, lsl #dst_bpp_shift +.endif +.if (src_bpp != 24) && (src_bpp != 0) + sub SRC, SRC, W, lsl #src_bpp_shift +.endif +.if (mask_bpp != 24) && (mask_bpp != 0) + sub MASK, MASK, W, lsl #mask_bpp_shift +.endif + subs H, H, #1 + mov DST_R, DST_W + bge start_of_loop_label +.endm + +/* + * Registers are allocated in the following way by default: + * v0, v1, v2, v3 - reserved for loading source pixel data + * v4, v5, v6, v7 - reserved for loading destination pixel data + * v24, v25, v26, v27 - reserved for loading mask pixel data + * v28, v29, v30, v31 - final destination pixel data for writeback to memory + */ +.macro generate_composite_function fname, \ + src_bpp_, \ + mask_bpp_, \ + dst_w_bpp_, \ + flags, \ + pixblock_size_, \ + prefetch_distance, \ + init, \ + cleanup, \ + process_pixblock_head, \ + process_pixblock_tail, \ + process_pixblock_tail_head, \ + dst_w_basereg_ = 28, \ + dst_r_basereg_ = 4, \ + src_basereg_ = 0, \ + mask_basereg_ = 24 + + pixman_asm_function fname + stp x29, x30, [sp, -16]! + mov x29, sp + sub sp, sp, 232 /* push all registers */ + sub x29, x29, 64 + st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32 + st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32 + stp x8, x9, [x29, -80] + stp x10, x11, [x29, -96] + stp x12, x13, [x29, -112] + stp x14, x15, [x29, -128] + stp x16, x17, [x29, -144] + stp x18, x19, [x29, -160] + stp x20, x21, [x29, -176] + stp x22, x23, [x29, -192] + stp x24, x25, [x29, -208] + stp x26, x27, [x29, -224] + str x28, [x29, -232] + +/* + * Select prefetch type for this function. If prefetch distance is + * set to 0 or one of the color formats is 24bpp, SIMPLE prefetch + * has to be used instead of ADVANCED. 
+ */ + .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT +.if prefetch_distance == 0 + .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE +.elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \ + ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24)) + .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE +.endif + +/* + * Make some macro arguments globally visible and accessible + * from other macros + */ + .set src_bpp, src_bpp_ + .set mask_bpp, mask_bpp_ + .set dst_w_bpp, dst_w_bpp_ + .set pixblock_size, pixblock_size_ + .set dst_w_basereg, dst_w_basereg_ + .set dst_r_basereg, dst_r_basereg_ + .set src_basereg, src_basereg_ + .set mask_basereg, mask_basereg_ + + .macro pixld_src x:vararg + pixld x + .endm + .macro fetch_src_pixblock + pixld_src pixblock_size, src_bpp, \ + (src_basereg - pixblock_size * src_bpp / 64), SRC + .endm +/* + * Assign symbolic names to registers + */ + W .req x0 /* width (is updated during processing) */ + H .req x1 /* height (is updated during processing) */ + DST_W .req x2 /* destination buffer pointer for writes */ + DST_STRIDE .req x3 /* destination image stride */ + SRC .req x4 /* source buffer pointer */ + SRC_STRIDE .req x5 /* source image stride */ + MASK .req x6 /* mask pointer */ + MASK_STRIDE .req x7 /* mask stride */ + + DST_R .req x8 /* destination buffer pointer for reads */ + + PF_CTL .req x9 /* combined lines counter and prefetch */ + /* distance increment counter */ + PF_X .req x10 /* pixel index in a scanline for current */ + /* pretetch position */ + PF_SRC .req x11 /* pointer to source scanline start */ + /* for prefetch purposes */ + PF_DST .req x12 /* pointer to destination scanline start */ + /* for prefetch purposes */ + PF_MASK .req x13 /* pointer to mask scanline start */ + /* for prefetch purposes */ + + ORIG_W .req x14 /* saved original width */ + DUMMY .req x15 /* temporary register */ + + sxtw x0, w0 + sxtw x1, w1 + sxtw x3, w3 + sxtw x5, w5 + sxtw x7, w7 + + .set mask_bpp_shift, -1 +.if src_bpp == 32 + .set src_bpp_shift, 2 +.elseif src_bpp == 24 + .set src_bpp_shift, 0 +.elseif src_bpp == 16 + .set src_bpp_shift, 1 +.elseif src_bpp == 8 + .set src_bpp_shift, 0 +.elseif src_bpp == 0 + .set src_bpp_shift, -1 +.else + .error "requested src bpp (src_bpp) is not supported" +.endif +.if mask_bpp == 32 + .set mask_bpp_shift, 2 +.elseif mask_bpp == 24 + .set mask_bpp_shift, 0 +.elseif mask_bpp == 8 + .set mask_bpp_shift, 0 +.elseif mask_bpp == 0 + .set mask_bpp_shift, -1 +.else + .error "requested mask bpp (mask_bpp) is not supported" +.endif +.if dst_w_bpp == 32 + .set dst_bpp_shift, 2 +.elseif dst_w_bpp == 24 + .set dst_bpp_shift, 0 +.elseif dst_w_bpp == 16 + .set dst_bpp_shift, 1 +.elseif dst_w_bpp == 8 + .set dst_bpp_shift, 0 +.else + .error "requested dst bpp (dst_w_bpp) is not supported" +.endif + +.if (((flags) & FLAG_DST_READWRITE) != 0) + .set dst_r_bpp, dst_w_bpp +.else + .set dst_r_bpp, 0 +.endif +.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0) + .set DEINTERLEAVE_32BPP_ENABLED, 1 +.else + .set DEINTERLEAVE_32BPP_ENABLED, 0 +.endif + +.if prefetch_distance < 0 || prefetch_distance > 15 + .error "invalid prefetch distance (prefetch_distance)" +.endif + + PF mov PF_X, #0 + mov DST_R, DST_W + +.if src_bpp == 24 + sub SRC_STRIDE, SRC_STRIDE, W + sub SRC_STRIDE, SRC_STRIDE, W, lsl #1 +.endif +.if mask_bpp == 24 + sub MASK_STRIDE, MASK_STRIDE, W + sub MASK_STRIDE, MASK_STRIDE, W, lsl #1 +.endif +.if dst_w_bpp == 24 + sub DST_STRIDE, DST_STRIDE, W + sub DST_STRIDE, DST_STRIDE, W, lsl #1 +.endif + +/* + * Setup advanced prefetcher 
initial state + */ + PF mov PF_SRC, SRC + PF mov PF_DST, DST_R + PF mov PF_MASK, MASK + /* PF_CTL = prefetch_distance | ((h - 1) << 4) */ + PF lsl DUMMY, H, #4 + PF mov PF_CTL, DUMMY + PF add PF_CTL, PF_CTL, #(prefetch_distance - 0x10) + + init + subs H, H, #1 + mov ORIG_W, W + blt 9f + cmp W, #(pixblock_size * 2) + blt 800f +/* + * This is the start of the pipelined loop, which if optimized for + * long scanlines + */ +0: + ensure_destination_ptr_alignment process_pixblock_head, \ + process_pixblock_tail, \ + process_pixblock_tail_head + + /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */ + pixld_a pixblock_size, dst_r_bpp, \ + (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R + fetch_src_pixblock + pixld pixblock_size, mask_bpp, \ + (mask_basereg - pixblock_size * mask_bpp / 64), MASK + PF add PF_X, PF_X, #pixblock_size + process_pixblock_head + cache_preload 0, pixblock_size + cache_preload_simple + subs W, W, #(pixblock_size * 2) + blt 200f + +100: + process_pixblock_tail_head + cache_preload_simple + subs W, W, #pixblock_size + bge 100b + +200: + process_pixblock_tail + pixst_a pixblock_size, dst_w_bpp, \ + (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W + + /* Process the remaining trailing pixels in the scanline */ + process_trailing_pixels 1, 1, \ + process_pixblock_head, \ + process_pixblock_tail, \ + process_pixblock_tail_head + advance_to_next_scanline 0b + + cleanup +1000: + /* pop all registers */ + sub x29, x29, 64 + ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 + ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 + ldp x8, x9, [x29, -80] + ldp x10, x11, [x29, -96] + ldp x12, x13, [x29, -112] + ldp x14, x15, [x29, -128] + ldp x16, x17, [x29, -144] + ldp x18, x19, [x29, -160] + ldp x20, x21, [x29, -176] + ldp x22, x23, [x29, -192] + ldp x24, x25, [x29, -208] + ldp x26, x27, [x29, -224] + ldr x28, [x29, -232] + mov sp, x29 + ldp x29, x30, [sp], 16 + ret /* exit */ +/* + * This is the start of the loop, designed to process images with small width + * (less than pixblock_size * 2 pixels). In this case neither pipelining + * nor prefetch are used. 
+ */ +800: + /* Process exactly pixblock_size pixels if needed */ + tst W, #pixblock_size + beq 100f + pixld pixblock_size, dst_r_bpp, \ + (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R + fetch_src_pixblock + pixld pixblock_size, mask_bpp, \ + (mask_basereg - pixblock_size * mask_bpp / 64), MASK + process_pixblock_head + process_pixblock_tail + pixst pixblock_size, dst_w_bpp, \ + (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W +100: + /* Process the remaining trailing pixels in the scanline */ + process_trailing_pixels 0, 0, \ + process_pixblock_head, \ + process_pixblock_tail, \ + process_pixblock_tail_head + advance_to_next_scanline 800b +9: + cleanup + /* pop all registers */ + sub x29, x29, 64 + ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 + ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 + ldp x8, x9, [x29, -80] + ldp x10, x11, [x29, -96] + ldp x12, x13, [x29, -112] + ldp x14, x15, [x29, -128] + ldp x16, x17, [x29, -144] + ldp x18, x19, [x29, -160] + ldp x20, x21, [x29, -176] + ldp x22, x23, [x29, -192] + ldp x24, x25, [x29, -208] + ldp x26, x27, [x29, -224] + ldr x28, [x29, -232] + mov sp, x29 + ldp x29, x30, [sp], 16 + ret /* exit */ + + .purgem fetch_src_pixblock + .purgem pixld_src + + .unreq SRC + .unreq MASK + .unreq DST_R + .unreq DST_W + .unreq ORIG_W + .unreq W + .unreq H + .unreq SRC_STRIDE + .unreq DST_STRIDE + .unreq MASK_STRIDE + .unreq PF_CTL + .unreq PF_X + .unreq PF_SRC + .unreq PF_DST + .unreq PF_MASK + .unreq DUMMY + .endfunc +.endm + +/* + * A simplified variant of function generation template for a single + * scanline processing (for implementing pixman combine functions) + */ +.macro generate_composite_function_scanline use_nearest_scaling, \ + fname, \ + src_bpp_, \ + mask_bpp_, \ + dst_w_bpp_, \ + flags, \ + pixblock_size_, \ + init, \ + cleanup, \ + process_pixblock_head, \ + process_pixblock_tail, \ + process_pixblock_tail_head, \ + dst_w_basereg_ = 28, \ + dst_r_basereg_ = 4, \ + src_basereg_ = 0, \ + mask_basereg_ = 24 + + pixman_asm_function fname + .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE + +/* + * Make some macro arguments globally visible and accessible + * from other macros + */ + .set src_bpp, src_bpp_ + .set mask_bpp, mask_bpp_ + .set dst_w_bpp, dst_w_bpp_ + .set pixblock_size, pixblock_size_ + .set dst_w_basereg, dst_w_basereg_ + .set dst_r_basereg, dst_r_basereg_ + .set src_basereg, src_basereg_ + .set mask_basereg, mask_basereg_ + +.if use_nearest_scaling != 0 + /* + * Assign symbolic names to registers for nearest scaling + */ + W .req x0 + DST_W .req x1 + SRC .req x2 + VX .req x3 + UNIT_X .req x4 + SRC_WIDTH_FIXED .req x5 + MASK .req x6 + TMP1 .req x8 + TMP2 .req x9 + DST_R .req x10 + DUMMY .req x30 + + .macro pixld_src x:vararg + pixld_s x + .endm + + sxtw x0, w0 + sxtw x3, w3 + sxtw x4, w4 + sxtw x5, w5 + + stp x29, x30, [sp, -16]! + mov x29, sp + sub sp, sp, 88 + sub x29, x29, 64 + st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 + st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 + stp x8, x9, [x29, -80] + str x10, [x29, -88] +.else + /* + * Assign symbolic names to registers + */ + W .req x0 /* width (is updated during processing) */ + DST_W .req x1 /* destination buffer pointer for writes */ + SRC .req x2 /* source buffer pointer */ + MASK .req x3 /* mask pointer */ + DST_R .req x4 /* destination buffer pointer for reads */ + DUMMY .req x30 + + .macro pixld_src x:vararg + pixld x + .endm + + sxtw x0, w0 + + stp x29, x30, [sp, -16]! 
+ mov x29, sp + sub sp, sp, 64 + sub x29, x29, 64 + st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 + st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 +.endif + +.if (((flags) & FLAG_DST_READWRITE) != 0) + .set dst_r_bpp, dst_w_bpp +.else + .set dst_r_bpp, 0 +.endif +.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0) + .set DEINTERLEAVE_32BPP_ENABLED, 1 +.else + .set DEINTERLEAVE_32BPP_ENABLED, 0 +.endif + + .macro fetch_src_pixblock + pixld_src pixblock_size, src_bpp, \ + (src_basereg - pixblock_size * src_bpp / 64), SRC + .endm + + init + mov DST_R, DST_W + + cmp W, #pixblock_size + blt 800f + + ensure_destination_ptr_alignment process_pixblock_head, \ + process_pixblock_tail, \ + process_pixblock_tail_head + + subs W, W, #pixblock_size + blt 700f + + /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */ + pixld_a pixblock_size, dst_r_bpp, \ + (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R + fetch_src_pixblock + pixld pixblock_size, mask_bpp, \ + (mask_basereg - pixblock_size * mask_bpp / 64), MASK + process_pixblock_head + subs W, W, #pixblock_size + blt 200f +100: + process_pixblock_tail_head + subs W, W, #pixblock_size + bge 100b +200: + process_pixblock_tail + pixst_a pixblock_size, dst_w_bpp, \ + (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W +700: + /* Process the remaining trailing pixels in the scanline (dst aligned) */ + process_trailing_pixels 0, 1, \ + process_pixblock_head, \ + process_pixblock_tail, \ + process_pixblock_tail_head + + cleanup +.if use_nearest_scaling != 0 + sub x29, x29, 64 + ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 + ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 + ldp x8, x9, [x29, -80] + ldr x10, [x29, -96] + mov sp, x29 + ldp x29, x30, [sp], 16 + ret /* exit */ +.else + sub x29, x29, 64 + ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 + ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 + mov sp, x29 + ldp x29, x30, [sp], 16 + ret /* exit */ +.endif +800: + /* Process the remaining trailing pixels in the scanline (dst unaligned) */ + process_trailing_pixels 0, 0, \ + process_pixblock_head, \ + process_pixblock_tail, \ + process_pixblock_tail_head + + cleanup +.if use_nearest_scaling != 0 + sub x29, x29, 64 + ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 + ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 + ldp x8, x9, [x29, -80] + ldr x10, [x29, -88] + mov sp, x29 + ldp x29, x30, [sp], 16 + ret /* exit */ + + .unreq DUMMY + .unreq DST_R + .unreq SRC + .unreq W + .unreq VX + .unreq UNIT_X + .unreq TMP1 + .unreq TMP2 + .unreq DST_W + .unreq MASK + .unreq SRC_WIDTH_FIXED + +.else + sub x29, x29, 64 + ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 + ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 + mov sp, x29 + ldp x29, x30, [sp], 16 + ret /* exit */ + + .unreq DUMMY + .unreq SRC + .unreq MASK + .unreq DST_R + .unreq DST_W + .unreq W +.endif + + .purgem fetch_src_pixblock + .purgem pixld_src + + .endfunc +.endm + +.macro generate_composite_function_single_scanline x:vararg + generate_composite_function_scanline 0, x +.endm + +.macro generate_composite_function_nearest_scanline x:vararg + generate_composite_function_scanline 1, x +.endm + +/* Default prologue/epilogue, nothing special needs to be done */ + +.macro default_init +.endm + +.macro default_cleanup +.endm + +/* + * Prologue/epilogue variant which additionally saves/restores v8-v15 + * registers (they need to be saved/restored by callee according to ABI). + * This is required if the code needs to use all the NEON registers. 
+ */ + +.macro default_init_need_all_regs +.endm + +.macro default_cleanup_need_all_regs +.endm + +/******************************************************************************/
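For reference, here is a minimal sketch of how the entry points generated by this patch might be declared and called from C. The prototypes are not part of the patch: they are inferred from the register-assignment comments in generate_composite_function (x0..x7 = W, H, DST_W, DST_STRIDE, SRC, SRC_STRIDE, MASK, MASK_STRIDE, with strides apparently counted in pixels given the `lsl #dst_bpp_shift` scaling) and from the single-scanline variant (x0..x3 = W, DST_W, SRC, MASK). The helper name blend_row_over is hypothetical, and the .S file has to be assembled for arm64 and linked for these symbols to resolve.

    #include <stdint.h>

    /* Full-image OVER compositing: assumed prototype, matching the x0..x5
       argument registers consumed by generate_composite_function. */
    void pixman_composite_over_8888_8888_asm_neon(int32_t w, int32_t h,
                                                  uint32_t *dst, int32_t dst_stride,
                                                  uint32_t *src, int32_t src_stride);

    /* Single-scanline OVER compositing: assumed prototype, matching the
       x0..x2 registers consumed by generate_composite_function_single_scanline. */
    void pixman_composite_scanline_over_asm_neon(int32_t w, uint32_t *dst,
                                                 const uint32_t *src);

    /* Hypothetical helper: composite one a8r8g8b8 scanline over another. */
    static inline void blend_row_over(uint32_t *dst, const uint32_t *src, int32_t w)
    {
        pixman_composite_scanline_over_asm_neon(w, dst, src);
    }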