/*
 * ARMv8 NEON optimizations for libjpeg-turbo
 *
 * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
 *                          All Rights Reserved.
 * Author:  Siarhei Siamashka <siarhei.siamashka@nokia.com>
 * Copyright (C) 2013-2014, Linaro Limited.  All Rights Reserved.
 * Author:  Ragesh Radhakrishnan <ragesh.r@linaro.org>
 * Copyright (C) 2014-2016, D. R. Commander.  All Rights Reserved.
 * Copyright (C) 2015-2016, 2018, Matthieu Darbois.  All Rights Reserved.
 * Copyright (C) 2016, Siarhei Siamashka.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software.  If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack, "", %progbits  /* mark stack as non-executable */
#endif

#if defined(__APPLE__)
.section __DATA, __const
#else
.section .rodata, "a", %progbits
#endif

/* Constants for jsimd_idct_islow_neon() */

#define F_0_298  2446   /* FIX(0.298631336) */
#define F_0_390  3196   /* FIX(0.390180644) */
#define F_0_541  4433   /* FIX(0.541196100) */
#define F_0_765  6270   /* FIX(0.765366865) */
#define F_0_899  7373   /* FIX(0.899976223) */
#define F_1_175  9633   /* FIX(1.175875602) */
#define F_1_501  12299  /* FIX(1.501321110) */
#define F_1_847  15137  /* FIX(1.847759065) */
#define F_1_961  16069  /* FIX(1.961570560) */
#define F_2_053  16819  /* FIX(2.053119869) */
#define F_2_562  20995  /* FIX(2.562915447) */
#define F_3_072  25172  /* FIX(3.072711026) */

.balign 16
Ljsimd_idct_islow_neon_consts:
  .short F_0_298
  .short -F_0_390
  .short F_0_541
  .short F_0_765
  .short -F_0_899
  .short F_1_175
  .short F_1_501
  .short -F_1_847
  .short -F_1_961
  .short F_2_053
  .short -F_2_562
  .short F_3_072
  .short 0  /* padding */
  .short 0
  .short 0
  .short 0

#undef F_0_298
#undef F_0_390
#undef F_0_541
#undef F_0_765
#undef F_0_899
#undef F_1_175
#undef F_1_501
#undef F_1_847
#undef F_1_961
#undef F_2_053
#undef F_2_562
#undef F_3_072

/* Constants for jsimd_idct_ifast_neon() */

.balign 16
Ljsimd_idct_ifast_neon_consts:
  .short (277 * 128 - 256 * 128)  /* XFIX_1_082392200 */
  .short (362 * 128 - 256 * 128)  /* XFIX_1_414213562 */
  .short (473 * 128 - 256 * 128)  /* XFIX_1_847759065 */
  .short (669 * 128 - 512 * 128)  /* XFIX_2_613125930 */
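
/*
 * Added note on the fixed-point convention (a sketch; the FIX() macro
 * itself lives in the C sources, not in this file):
 *
 *   #define CONST_BITS  13
 *   #define FIX(x)  ((int32_t)((x) * (1 << CONST_BITS) + 0.5))
 *
 * so F_0_541 = FIX(0.541196100) = 4433, and every product is later
 * descaled by a right shift of CONST_BITS (or more) bits.  The ifast
 * constants above follow a different convention: they are Q15 fractions
 * for SQDMULH, which computes (2 * a * b) >> 16, i.e. multiplication by
 * b / 2^15; see the notes before jsimd_idct_ifast_neon below.
 */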

/* Constants for jsimd_idct_4x4_neon() and jsimd_idct_2x2_neon() */

#define CONST_BITS  13

#define FIX_0_211164243  (1730)   /* FIX(0.211164243) */
#define FIX_0_509795579  (4176)   /* FIX(0.509795579) */
#define FIX_0_601344887  (4926)   /* FIX(0.601344887) */
#define FIX_0_720959822  (5906)   /* FIX(0.720959822) */
#define FIX_0_765366865  (6270)   /* FIX(0.765366865) */
#define FIX_0_850430095  (6967)   /* FIX(0.850430095) */
#define FIX_0_899976223  (7373)   /* FIX(0.899976223) */
#define FIX_1_061594337  (8697)   /* FIX(1.061594337) */
#define FIX_1_272758580  (10426)  /* FIX(1.272758580) */
#define FIX_1_451774981  (11893)  /* FIX(1.451774981) */
#define FIX_1_847759065  (15137)  /* FIX(1.847759065) */
#define FIX_2_172734803  (17799)  /* FIX(2.172734803) */
#define FIX_2_562915447  (20995)  /* FIX(2.562915447) */
#define FIX_3_624509785  (29692)  /* FIX(3.624509785) */

.balign 16
Ljsimd_idct_4x4_neon_consts:
  .short FIX_1_847759065        /* v0.h[0] */
  .short -FIX_0_765366865       /* v0.h[1] */
  .short -FIX_0_211164243       /* v0.h[2] */
  .short FIX_1_451774981        /* v0.h[3] */
  .short -FIX_2_172734803       /* v1.h[0] */
  .short FIX_1_061594337        /* v1.h[1] */
  .short -FIX_0_509795579       /* v1.h[2] */
  .short -FIX_0_601344887       /* v1.h[3] */
  .short FIX_0_899976223        /* v2.h[0] */
  .short FIX_2_562915447        /* v2.h[1] */
  .short 1 << (CONST_BITS + 1)  /* v2.h[2] */
  .short 0                      /* v2.h[3] */

.balign 8
Ljsimd_idct_2x2_neon_consts:
  .short -FIX_0_720959822  /* v14.h[0] */
  .short FIX_0_850430095   /* v14.h[1] */
  .short -FIX_1_272758580  /* v14.h[2] */
  .short FIX_3_624509785   /* v14.h[3] */

/* Constants for jsimd_ycc_*_neon() */

.balign 16
Ljsimd_ycc_rgb_neon_consts:
  .short 0,      0,      0,      0
  .short 22971, -11277, -23401,  29033
  .short -128,  -128,   -128,   -128
  .short -128,  -128,   -128,   -128

/* Constants for jsimd_*_ycc_neon() */

.balign 16
Ljsimd_rgb_ycc_neon_consts:
  .short 19595, 38470, 7471,  11059
  .short 21709, 32768, 27439, 5329
  .short 32767, 128,   32767, 128
  .short 32767, 128,   32767, 128

/* Constants for jsimd_fdct_islow_neon() */

#define F_0_298  2446   /* FIX(0.298631336) */
#define F_0_390  3196   /* FIX(0.390180644) */
#define F_0_541  4433   /* FIX(0.541196100) */
#define F_0_765  6270   /* FIX(0.765366865) */
#define F_0_899  7373   /* FIX(0.899976223) */
#define F_1_175  9633   /* FIX(1.175875602) */
#define F_1_501  12299  /* FIX(1.501321110) */
#define F_1_847  15137  /* FIX(1.847759065) */
#define F_1_961  16069  /* FIX(1.961570560) */
#define F_2_053  16819  /* FIX(2.053119869) */
#define F_2_562  20995  /* FIX(2.562915447) */
#define F_3_072  25172  /* FIX(3.072711026) */

.balign 16
Ljsimd_fdct_islow_neon_consts:
  .short F_0_298
  .short -F_0_390
  .short F_0_541
  .short F_0_765
  .short -F_0_899
  .short F_1_175
  .short F_1_501
  .short -F_1_847
  .short -F_1_961
  .short F_2_053
  .short -F_2_562
  .short F_3_072
  .short 0  /* padding */
  .short 0
  .short 0
  .short 0

#undef F_0_298
#undef F_0_390
#undef F_0_541
#undef F_0_765
#undef F_0_899
#undef F_1_175
#undef F_1_501
#undef F_1_847
#undef F_1_961
#undef F_2_053
#undef F_2_562
#undef F_3_072

/* Constants for jsimd_fdct_ifast_neon() */

.balign 16
Ljsimd_fdct_ifast_neon_consts:
  .short (98 * 128)               /* XFIX_0_382683433 */
  .short (139 * 128)              /* XFIX_0_541196100 */
  .short (181 * 128)              /* XFIX_0_707106781 */
  .short (334 * 128 - 256 * 128)  /* XFIX_1_306562965 */
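
/*
 * Added note on the table below (commentary, not part of the original
 * source): each 16-byte row is a TBL permutation that replicates the last
 * valid pixel when the input width is not a multiple of 16.  Row "diff N"
 * behaves like
 *
 *   for (i = 0; i < 16; i++)
 *     idx[i] = i < 16 - N ? i : 15 - N;    -- i.e. idx[i] = min(i, 15 - N)
 *
 * so the downsampling code can always consume full 16-byte vectors.
 */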

/* Constants for jsimd_h2*_downsample_neon() */

.balign 16
Ljsimd_h2_downsample_neon_consts:
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F  /* diff 0 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E  /* diff 1 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D  /* diff 2 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C  /* diff 3 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
        0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B  /* diff 4 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
        0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A  /* diff 5 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
        0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09  /* diff 6 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
        0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08  /* diff 7 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
        0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07  /* diff 8 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, \
        0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06  /* diff 9 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, \
        0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05  /* diff 10 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, \
        0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04  /* diff 11 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, \
        0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03  /* diff 12 */
  .byte 0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, \
        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02  /* diff 13 */
  .byte 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, \
        0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01  /* diff 14 */
  .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00  /* diff 15 */

/* Constants for jsimd_huff_encode_one_block_neon() */

.balign 16
Ljsimd_huff_encode_one_block_neon_consts:
  .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
  .byte 0,   1,   2,   3,   16,  17,  32,  33, \
        18,  19,  4,   5,   6,   7,   20,  21   /* L0 => L3 : 4 lines OK */
  .byte 34,  35,  48,  49,  255, 255, 50,  51, \
        36,  37,  22,  23,  8,   9,   10,  11   /* L0 => L3 : 4 lines OK */
  .byte 8,   9,   22,  23,  36,  37,  50,  51, \
        255, 255, 255, 255, 255, 255, 52,  53   /* L1 => L4 : 4 lines OK */
  .byte 54,  55,  40,  41,  26,  27,  12,  13, \
        14,  15,  28,  29,  42,  43,  56,  57   /* L0 => L3 : 4 lines OK */
  .byte 6,   7,   20,  21,  34,  35,  48,  49, \
        50,  51,  36,  37,  22,  23,  8,   9    /* L4 => L7 : 4 lines OK */
  .byte 42,  43,  28,  29,  14,  15,  30,  31, \
        44,  45,  58,  59,  255, 255, 255, 255  /* L1 => L4 : 4 lines OK */
  .byte 255, 255, 255, 255, 56,  57,  42,  43, \
        28,  29,  14,  15,  30,  31,  44,  45   /* L3 => L6 : 4 lines OK */
  .byte 26,  27,  40,  41,  42,  43,  28,  29, \
        14,  15,  30,  31,  44,  45,  46,  47   /* L5 => L7 : 3 lines OK */
  .byte 255, 255, 255, 255, 0,   1,   255, 255, \
        255, 255, 255, 255, 255, 255, 255, 255  /* L4 : 1 line OK */
  .byte 255, 255, 255, 255, 255, 255, 255, 255, \
        0,   1,   16,  17,  2,   3,   255, 255  /* L5 => L6 : 2 lines OK */
  .byte 255, 255, 255, 255, 255, 255, 255, 255, \
        255, 255, 255, 255, 8,   9,   22,  23   /* L5 => L6 : 2 lines OK */
  .byte 4,   5,   6,   7,   255, 255, 255, 255, \
        255, 255, 255, 255, 255, 255, 255, 255  /* L7 : 1 line OK */
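
/*
 * Added commentary on the shuffle tables above: the first row is a
 * per-lane bit mask (1 << n in both 8-byte halves), and the remaining
 * rows are TBL index vectors that gather 16-bit coefficient pairs into
 * zig-zag order a few lines at a time (hence the "Lx => Ly" comments).
 * An index of 255 is deliberately out of range: AArch64 TBL writes 0 for
 * out-of-range indices, so those lanes come out zero and can be filled
 * from another table row in a later step.
 */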

.text


#define RESPECT_STRICT_ALIGNMENT  1


/*****************************************************************************/

/* Supplementary macro for setting function attributes */
.macro asm_function fname
#ifdef __APPLE__
    .private_extern _\fname
    .globl _\fname
_\fname:
#else
    .global \fname
#ifdef __ELF__
    .hidden \fname
    .type \fname, %function
#endif
\fname:
#endif
.endm

/* Get symbol location */
.macro get_symbol_loc reg, symbol
#ifdef __APPLE__
    adrp            \reg, \symbol@PAGE
    add             \reg, \reg, \symbol@PAGEOFF
#else
    adrp            \reg, \symbol
    add             \reg, \reg, :lo12:\symbol
#endif
.endm

/* Transpose elements of a single 128-bit register */
.macro transpose_single x0, x1, xi, xilen, literal
    ins             \xi\xilen[0], \x0\xilen[0]
    ins             \x1\xilen[0], \x0\xilen[1]
    trn1            \x0\literal, \x0\literal, \x1\literal
    trn2            \x1\literal, \xi\literal, \x1\literal
.endm

/* Transpose elements of 2 different registers */
.macro transpose x0, x1, xi, xilen, literal
    mov             \xi\xilen, \x0\xilen
    trn1            \x0\literal, \x0\literal, \x1\literal
    trn2            \x1\literal, \xi\literal, \x1\literal
.endm

/* Transpose a block of 4x4 coefficients in four 64-bit registers */
.macro transpose_4x4_32 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
    mov             \xi\xilen, \x0\xilen
    trn1            \x0\x0len, \x0\x0len, \x2\x2len
    trn2            \x2\x2len, \xi\x0len, \x2\x2len
    mov             \xi\xilen, \x1\xilen
    trn1            \x1\x1len, \x1\x1len, \x3\x3len
    trn2            \x3\x3len, \xi\x1len, \x3\x3len
.endm

.macro transpose_4x4_16 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
    mov             \xi\xilen, \x0\xilen
    trn1            \x0\x0len, \x0\x0len, \x1\x1len
    trn2            \x1\x2len, \xi\x0len, \x1\x2len
    mov             \xi\xilen, \x2\xilen
    trn1            \x2\x2len, \x2\x2len, \x3\x3len
    trn2            \x3\x2len, \xi\x1len, \x3\x3len
.endm

.macro transpose_4x4 x0, x1, x2, x3, x5
    transpose_4x4_16 \x0, .4h, \x1, .4h, \x2, .4h, \x3, .4h, \x5, .16b
    transpose_4x4_32 \x0, .2s, \x1, .2s, \x2, .2s, \x3, .2s, \x5, .16b
.endm

.macro transpose_8x8 l0, l1, l2, l3, l4, l5, l6, l7, t0, t1, t2, t3
    trn1            \t0\().8h, \l0\().8h, \l1\().8h
    trn1            \t1\().8h, \l2\().8h, \l3\().8h
    trn1            \t2\().8h, \l4\().8h, \l5\().8h
    trn1            \t3\().8h, \l6\().8h, \l7\().8h
    trn2            \l1\().8h, \l0\().8h, \l1\().8h
    trn2            \l3\().8h, \l2\().8h, \l3\().8h
    trn2            \l5\().8h, \l4\().8h, \l5\().8h
    trn2            \l7\().8h, \l6\().8h, \l7\().8h

    trn1            \l4\().4s, \t2\().4s, \t3\().4s
    trn2            \t3\().4s, \t2\().4s, \t3\().4s
    trn1            \t2\().4s, \t0\().4s, \t1\().4s
    trn2            \l2\().4s, \t0\().4s, \t1\().4s
    trn1            \t0\().4s, \l1\().4s, \l3\().4s
    trn2            \l3\().4s, \l1\().4s, \l3\().4s
    trn2            \t1\().4s, \l5\().4s, \l7\().4s
    trn1            \l5\().4s, \l5\().4s, \l7\().4s

    trn2            \l6\().2d, \l2\().2d, \t3\().2d
    trn1            \l0\().2d, \t2\().2d, \l4\().2d
    trn1            \l1\().2d, \t0\().2d, \l5\().2d
    trn2            \l7\().2d, \l3\().2d, \t1\().2d
    trn1            \l2\().2d, \l2\().2d, \t3\().2d
    trn2            \l4\().2d, \t2\().2d, \l4\().2d
    trn1            \l3\().2d, \l3\().2d, \t1\().2d
    trn2            \l5\().2d, \t0\().2d, \l5\().2d
.endm
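
/*
 * Added commentary: the three rounds of TRN1/TRN2 in transpose_8x8 (on
 * 16-bit, then 32-bit, then 64-bit elements) amount to a plain 8x8 matrix
 * transpose of the int16_t rows held in l0..l7:
 *
 *   int16_t in[8][8], out[8][8];
 *   for (int i = 0; i < 8; i++)
 *     for (int j = 0; j < 8; j++)
 *       out[i][j] = in[j][i];
 *
 * t0..t3 are scratch registers; the result is left in l0..l7.
 */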

#define CENTERJSAMPLE  128

/*****************************************************************************/

/*
 * Perform dequantization and inverse DCT on one block of coefficients.
 *
 * GLOBAL(void)
 * jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
 *                       JSAMPARRAY output_buf, JDIMENSION output_col)
 */

#define CONST_BITS  13
#define PASS1_BITS  2

#define XFIX_P_0_298  v0.h[0]
#define XFIX_N_0_390  v0.h[1]
#define XFIX_P_0_541  v0.h[2]
#define XFIX_P_0_765  v0.h[3]
#define XFIX_N_0_899  v0.h[4]
#define XFIX_P_1_175  v0.h[5]
#define XFIX_P_1_501  v0.h[6]
#define XFIX_N_1_847  v0.h[7]
#define XFIX_N_1_961  v1.h[0]
#define XFIX_P_2_053  v1.h[1]
#define XFIX_N_2_562  v1.h[2]
#define XFIX_P_3_072  v1.h[3]

asm_function jsimd_idct_islow_neon
    DCT_TABLE       .req x0
    COEF_BLOCK      .req x1
    OUTPUT_BUF      .req x2
    OUTPUT_COL      .req x3
    TMP1            .req x0
    TMP2            .req x1
    TMP3            .req x9
    TMP4            .req x10
    TMP5            .req x11
    TMP6            .req x12
    TMP7            .req x13
    TMP8            .req x14

    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
       guarantee that the upper (unused) 32 bits of x3 are valid.  This
       instruction ensures that those bits are set to zero. */
    uxtw            x3, w3

    sub             sp, sp, #64
    get_symbol_loc  x15, Ljsimd_idct_islow_neon_consts
    mov             x10, sp
    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], #32
    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], #32
    ld1             {v0.8h, v1.8h}, [x15]
    ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [COEF_BLOCK], #64
    ld1             {v18.8h, v19.8h, v20.8h, v21.8h}, [DCT_TABLE], #64
    ld1             {v6.8h, v7.8h, v8.8h, v9.8h}, [COEF_BLOCK], #64
    ld1             {v22.8h, v23.8h, v24.8h, v25.8h}, [DCT_TABLE], #64

    cmeq            v16.8h, v3.8h, #0
    cmeq            v26.8h, v4.8h, #0
    cmeq            v27.8h, v5.8h, #0
    cmeq            v28.8h, v6.8h, #0
    cmeq            v29.8h, v7.8h, #0
    cmeq            v30.8h, v8.8h, #0
    cmeq            v31.8h, v9.8h, #0

    and             v10.16b, v16.16b, v26.16b
    and             v11.16b, v27.16b, v28.16b
    and             v12.16b, v29.16b, v30.16b
    and             v13.16b, v31.16b, v10.16b
    and             v14.16b, v11.16b, v12.16b
    mul             v2.8h, v2.8h, v18.8h
    and             v15.16b, v13.16b, v14.16b
    shl             v10.8h, v2.8h, #(PASS1_BITS)
    sqxtn           v16.8b, v15.8h
    mov             TMP1, v16.d[0]
    mvn             TMP2, TMP1

    cbnz            TMP2, 2f
    /* case all AC coefficients are zero */
    dup             v2.2d, v10.d[0]
    dup             v6.2d, v10.d[1]
    mov             v3.16b, v2.16b
    mov             v7.16b, v6.16b
    mov             v4.16b, v2.16b
    mov             v8.16b, v6.16b
    mov             v5.16b, v2.16b
    mov             v9.16b, v6.16b
1:
    /* for this transpose, we should organise the data like this:
     * 00, 01, 02, 03, 40, 41, 42, 43
     * 10, 11, 12, 13, 50, 51, 52, 53
     * 20, 21, 22, 23, 60, 61, 62, 63
     * 30, 31, 32, 33, 70, 71, 72, 73
     * 04, 05, 06, 07, 44, 45, 46, 47
     * 14, 15, 16, 17, 54, 55, 56, 57
     * 24, 25, 26, 27, 64, 65, 66, 67
     * 34, 35, 36, 37, 74, 75, 76, 77
     */
    trn1            v28.8h, v2.8h, v3.8h
    trn1            v29.8h, v4.8h, v5.8h
    trn1            v30.8h, v6.8h, v7.8h
    trn1            v31.8h, v8.8h, v9.8h
    trn2            v16.8h, v2.8h, v3.8h
    trn2            v17.8h, v4.8h, v5.8h
    trn2            v18.8h, v6.8h, v7.8h
    trn2            v19.8h, v8.8h, v9.8h
    trn1            v2.4s, v28.4s, v29.4s
    trn1            v6.4s, v30.4s, v31.4s
    trn1            v3.4s, v16.4s, v17.4s
    trn1            v7.4s, v18.4s, v19.4s
    trn2            v4.4s, v28.4s, v29.4s
    trn2            v8.4s, v30.4s, v31.4s
    trn2            v5.4s, v16.4s, v17.4s
    trn2            v9.4s, v18.4s, v19.4s
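
    /*
     * For reference, the even part computes the following (paraphrased
     * from the scalar jpeg_idct_islow() in jidctint.c; variable names
     * match the comments below):
     *
     *   z2 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
     *   z3 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
     *   z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
     *   tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065);
     *   tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
     *   z2 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
     *   z3 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
     *   tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS);
     *   tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS);
     *   tmp10 = tmp0 + tmp3;  tmp13 = tmp0 - tmp3;
     *   tmp11 = tmp1 + tmp2;  tmp12 = tmp1 - tmp2;
     *
     * Each quantity is computed twice, once for the low four columns (the
     * "l" comments) and once for the high four (the "h" comments).
     */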
    /* Even part: reverse the even part of the forward DCT. */
    add             v18.8h, v4.8h, v8.8h    /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
    add             v22.8h, v2.8h, v6.8h    /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    smull2          v19.4s, v18.8h, XFIX_P_0_541  /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
    sub             v26.8h, v2.8h, v6.8h    /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    smull           v18.4s, v18.4h, XFIX_P_0_541  /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
    sshll2          v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
    mov             v21.16b, v19.16b        /* tmp3 = z1 */
    mov             v20.16b, v18.16b        /* tmp3 = z1 */
    smlal2          v19.4s, v8.8h, XFIX_N_1_847  /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
    smlal           v18.4s, v8.4h, XFIX_N_1_847  /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
    sshll2          v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
    smlal2          v21.4s, v4.8h, XFIX_P_0_765  /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
    smlal           v20.4s, v4.4h, XFIX_P_0_765  /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
    sshll           v22.4s, v22.4h, #(CONST_BITS)  /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
    sshll           v26.4s, v26.4h, #(CONST_BITS)  /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
    add             v2.4s, v22.4s, v20.4s   /* tmp10l tmp10 = tmp0 + tmp3; */
    sub             v6.4s, v22.4s, v20.4s   /* tmp13l tmp13 = tmp0 - tmp3; */
    add             v8.4s, v26.4s, v18.4s   /* tmp11l tmp11 = tmp1 + tmp2; */
    sub             v4.4s, v26.4s, v18.4s   /* tmp12l tmp12 = tmp1 - tmp2; */
    add             v28.4s, v23.4s, v21.4s  /* tmp10h tmp10 = tmp0 + tmp3; */
    sub             v31.4s, v23.4s, v21.4s  /* tmp13h tmp13 = tmp0 - tmp3; */
    add             v29.4s, v27.4s, v19.4s  /* tmp11h tmp11 = tmp1 + tmp2; */
    sub             v30.4s, v27.4s, v19.4s  /* tmp12h tmp12 = tmp1 - tmp2; */
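
    /*
     * For reference, the odd part computes (again paraphrased from
     * jpeg_idct_islow() in jidctint.c):
     *
     *   tmp0 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
     *   tmp1 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
     *   tmp2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
     *   tmp3 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
     *   z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
     *   z3 = tmp0 + tmp2;  z4 = tmp1 + tmp3;
     *   z5 = MULTIPLY(z3 + z4, FIX_1_175875602);
     *   tmp0 = MULTIPLY(tmp0, FIX_0_298631336);
     *   tmp1 = MULTIPLY(tmp1, FIX_2_053119869);
     *   tmp2 = MULTIPLY(tmp2, FIX_3_072711026);
     *   tmp3 = MULTIPLY(tmp3, FIX_1_501321110);
     *   z1 = MULTIPLY(z1, -FIX_0_899976223);
     *   z2 = MULTIPLY(z2, -FIX_2_562915447);
     *   z3 = MULTIPLY(z3, -FIX_1_961570560) + z5;
     *   z4 = MULTIPLY(z4, -FIX_0_390180644) + z5;
     *   tmp0 += z1 + z3;  tmp1 += z2 + z4;
     *   tmp2 += z2 + z3;  tmp3 += z1 + z4;
     */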

    /* Odd part per figure 8; the matrix is unitary and hence its
     * transpose is its inverse.  i0..i3 are y7, y5, y3, y1 respectively.
     */

    add             v22.8h, v9.8h, v5.8h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    add             v24.8h, v7.8h, v3.8h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    add             v18.8h, v9.8h, v3.8h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    add             v20.8h, v7.8h, v5.8h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    add             v26.8h, v22.8h, v24.8h  /* z5 = z3 + z4 */

    smull2          v11.4s, v9.8h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
    smull2          v13.4s, v7.8h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
    smull2          v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
    smull2          v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
    smull2          v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
    smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
    smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
    smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
    smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */

    smull           v10.4s, v9.4h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
    smull           v12.4s, v7.4h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
    smull           v14.4s, v5.4h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
    smull           v16.4s, v3.4h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
    smull           v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
    smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
    smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
    smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
    smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */

    add             v23.4s, v23.4s, v27.4s  /* z3 += z5 */
    add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
    add             v25.4s, v25.4s, v27.4s  /* z4 += z5 */
    add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */

    add             v11.4s, v11.4s, v19.4s  /* tmp0 += z1 */
    add             v10.4s, v10.4s, v18.4s  /* tmp0 += z1 */
    add             v13.4s, v13.4s, v21.4s  /* tmp1 += z2 */
    add             v12.4s, v12.4s, v20.4s  /* tmp1 += z2 */
    add             v15.4s, v15.4s, v21.4s  /* tmp2 += z2 */
    add             v14.4s, v14.4s, v20.4s  /* tmp2 += z2 */
    add             v17.4s, v17.4s, v19.4s  /* tmp3 += z1 */
    add             v16.4s, v16.4s, v18.4s  /* tmp3 += z1 */

    add             v11.4s, v11.4s, v23.4s  /* tmp0 += z3 */
    add             v10.4s, v10.4s, v22.4s  /* tmp0 += z3 */
    add             v13.4s, v13.4s, v25.4s  /* tmp1 += z4 */
    add             v12.4s, v12.4s, v24.4s  /* tmp1 += z4 */
    add             v17.4s, v17.4s, v25.4s  /* tmp3 += z4 */
    add             v16.4s, v16.4s, v24.4s  /* tmp3 += z4 */
    add             v15.4s, v15.4s, v23.4s  /* tmp2 += z3 */
    add             v14.4s, v14.4s, v22.4s  /* tmp2 += z3 */
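
    /*
     * For reference, the scalar final output stage is (jidctint.c,
     * paraphrased):
     *
     *   outptr[0] = DESCALE(tmp10 + tmp3, CONST_BITS + PASS1_BITS + 3);
     *   outptr[7] = DESCALE(tmp10 - tmp3, CONST_BITS + PASS1_BITS + 3);
     *   outptr[1] = DESCALE(tmp11 + tmp2, ...);  outptr[6] = DESCALE(tmp11 - tmp2, ...);
     *   outptr[2] = DESCALE(tmp12 + tmp1, ...);  outptr[5] = DESCALE(tmp12 - tmp1, ...);
     *   outptr[3] = DESCALE(tmp13 + tmp0, ...);  outptr[4] = DESCALE(tmp13 - tmp0, ...);
     *
     * with DESCALE(x, n) = (x + (1 << (n - 1))) >> n, a rounding right
     * shift, followed by a range-limit table lookup.  Here
     * CONST_BITS + PASS1_BITS + 3 = 18 exceeds the shift range of a single
     * narrowing instruction, so it is split into a truncating SHRN #16
     * plus a rounding, saturating SQRSHRN #2 further below.
     */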

    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */

    add             v18.4s, v2.4s, v16.4s   /* tmp10 + tmp3 */
    add             v19.4s, v28.4s, v17.4s  /* tmp10 + tmp3 */
    sub             v20.4s, v2.4s, v16.4s   /* tmp10 - tmp3 */
    sub             v21.4s, v28.4s, v17.4s  /* tmp10 - tmp3 */
    add             v22.4s, v8.4s, v14.4s   /* tmp11 + tmp2 */
    add             v23.4s, v29.4s, v15.4s  /* tmp11 + tmp2 */
    sub             v24.4s, v8.4s, v14.4s   /* tmp11 - tmp2 */
    sub             v25.4s, v29.4s, v15.4s  /* tmp11 - tmp2 */
    add             v26.4s, v4.4s, v12.4s   /* tmp12 + tmp1 */
    add             v27.4s, v30.4s, v13.4s  /* tmp12 + tmp1 */
    sub             v28.4s, v4.4s, v12.4s   /* tmp12 - tmp1 */
    sub             v29.4s, v30.4s, v13.4s  /* tmp12 - tmp1 */
    add             v14.4s, v6.4s, v10.4s   /* tmp13 + tmp0 */
    add             v15.4s, v31.4s, v11.4s  /* tmp13 + tmp0 */
    sub             v16.4s, v6.4s, v10.4s   /* tmp13 - tmp0 */
    sub             v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */

    shrn            v2.4h, v18.4s, #16  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
    shrn            v9.4h, v20.4s, #16  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
    shrn            v3.4h, v22.4s, #16  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
    shrn            v8.4h, v24.4s, #16  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
    shrn            v4.4h, v26.4s, #16  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
    shrn            v7.4h, v28.4s, #16  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
    shrn            v5.4h, v14.4s, #16  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
    shrn            v6.4h, v16.4s, #16  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
    shrn2           v2.8h, v19.4s, #16  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
    shrn2           v9.8h, v21.4s, #16  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
    shrn2           v3.8h, v23.4s, #16  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
    shrn2           v8.8h, v25.4s, #16  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
    shrn2           v4.8h, v27.4s, #16  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
    shrn2           v7.8h, v29.4s, #16  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
    shrn2           v5.8h, v15.4s, #16  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
    shrn2           v6.8h, v17.4s, #16  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
    movi            v0.16b, #(CENTERJSAMPLE)
    /* Prepare pointers (dual-issue with NEON instructions) */
    ldp             TMP1, TMP2, [OUTPUT_BUF], 16
    sqrshrn         v28.8b, v2.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
    ldp             TMP3, TMP4, [OUTPUT_BUF], 16
    sqrshrn         v29.8b, v3.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
    add             TMP1, TMP1, OUTPUT_COL
    sqrshrn         v30.8b, v4.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
    add             TMP2, TMP2, OUTPUT_COL
    sqrshrn         v31.8b, v5.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
    add             TMP3, TMP3, OUTPUT_COL
    sqrshrn2        v28.16b, v6.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
    add             TMP4, TMP4, OUTPUT_COL
    sqrshrn2        v29.16b, v7.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
    ldp             TMP5, TMP6, [OUTPUT_BUF], 16
    sqrshrn2        v30.16b, v8.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
    ldp             TMP7, TMP8, [OUTPUT_BUF], 16
    sqrshrn2        v31.16b, v9.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
    add             TMP5, TMP5, OUTPUT_COL
    add             v16.16b, v28.16b, v0.16b
    add             TMP6, TMP6, OUTPUT_COL
    add             v18.16b, v29.16b, v0.16b
    add             TMP7, TMP7, OUTPUT_COL
    add             v20.16b, v30.16b, v0.16b
    add             TMP8, TMP8, OUTPUT_COL
    add             v22.16b, v31.16b, v0.16b
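
    /*
     * Added note, a sketch of the per-sample arithmetic performed by the
     * SHRN/SQRSHRN/ADD sequence above (int32_t x is one descaled DCT
     * output):
     *
     *   int16_t w  = (int16_t)(x >> 16);            -- SHRN #16 (truncating)
     *   int8_t  s  = sat8((w + 2) >> 2);            -- SQRSHRN #2 (round + saturate)
     *   uint8_t px = (uint8_t)(s + CENTERJSAMPLE);  -- ADD v0 (mod-256 wraparound)
     *
     * The wraparound of the 8-bit ADD maps the signed range [-128, 127]
     * onto [0, 255], which is the range limiting that the scalar code
     * does with a lookup table.
     */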

    /* Transpose the final 8-bit samples */
    trn1            v28.16b, v16.16b, v18.16b
    trn1            v30.16b, v20.16b, v22.16b
    trn2            v29.16b, v16.16b, v18.16b
    trn2            v31.16b, v20.16b, v22.16b

    trn1            v16.8h, v28.8h, v30.8h
    trn2            v18.8h, v28.8h, v30.8h
    trn1            v20.8h, v29.8h, v31.8h
    trn2            v22.8h, v29.8h, v31.8h

    uzp1            v28.4s, v16.4s, v18.4s
    uzp2            v30.4s, v16.4s, v18.4s
    uzp1            v29.4s, v20.4s, v22.4s
    uzp2            v31.4s, v20.4s, v22.4s

    /* Store results to the output buffer */
    st1             {v28.d}[0], [TMP1]
    st1             {v29.d}[0], [TMP2]
    st1             {v28.d}[1], [TMP3]
    st1             {v29.d}[1], [TMP4]
    st1             {v30.d}[0], [TMP5]
    st1             {v31.d}[0], [TMP6]
    st1             {v30.d}[1], [TMP7]
    st1             {v31.d}[1], [TMP8]
    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32
    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32
    blr             x30

.balign 16
2:
    mul             v3.8h, v3.8h, v19.8h
    mul             v4.8h, v4.8h, v20.8h
    mul             v5.8h, v5.8h, v21.8h
    add             TMP4, xzr, TMP2, LSL #32
    mul             v6.8h, v6.8h, v22.8h
    mul             v7.8h, v7.8h, v23.8h
    adds            TMP3, xzr, TMP2, LSR #32
    mul             v8.8h, v8.8h, v24.8h
    mul             v9.8h, v9.8h, v25.8h
    b.ne            3f
    /* Right AC coef is zero */
    dup             v15.2d, v10.d[1]
    /* Even part: reverse the even part of the forward DCT. */
    add             v18.4h, v4.4h, v8.4h    /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
    add             v22.4h, v2.4h, v6.4h    /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    sub             v26.4h, v2.4h, v6.4h    /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    smull           v18.4s, v18.4h, XFIX_P_0_541  /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
    sshll           v22.4s, v22.4h, #(CONST_BITS)  /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
    mov             v20.16b, v18.16b        /* tmp3 = z1 */
    sshll           v26.4s, v26.4h, #(CONST_BITS)  /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
    smlal           v18.4s, v8.4h, XFIX_N_1_847  /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
    smlal           v20.4s, v4.4h, XFIX_P_0_765  /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
    add             v2.4s, v22.4s, v20.4s   /* tmp10l tmp10 = tmp0 + tmp3; */
    sub             v6.4s, v22.4s, v20.4s   /* tmp13l tmp13 = tmp0 - tmp3; */
    add             v8.4s, v26.4s, v18.4s   /* tmp11l tmp11 = tmp1 + tmp2; */
    sub             v4.4s, v26.4s, v18.4s   /* tmp12l tmp12 = tmp1 - tmp2; */
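
    /*
     * Added commentary on the dispatch above: TMP2 holds the complement
     * of a per-column "all AC coefficients are zero" byte mask (built
     * with the CMEQ/AND/SQXTN sequence near the top of the function).
     * Conceptually:
     *
     *   if (TMP2 == 0)              -- cbnz TMP2, 2f  (not taken)
     *     all-DC shortcut;
     *   else if (TMP2 >> 32 == 0)   -- adds TMP3, xzr, TMP2, LSR #32; b.ne 3f
     *     columns 4..7 are DC-only ("Right AC coef is zero", this path);
     *   else if (TMP2 << 32 == 0)   -- cbnz TMP4, 4f  (not taken)
     *     columns 0..3 are DC-only (label 3);
     *   else
     *     full 8-column pass 1 (label 4);
     *
     * The half-zero paths run the IDCT on only four columns and fill the
     * remaining half from the DC-only values (the DUP of v10 above).
     */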
698 */ 699 700 add v22.4h, v9.4h, v5.4h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ 701 add v24.4h, v7.4h, v3.4h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ 702 add v18.4h, v9.4h, v3.4h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ 703 add v20.4h, v7.4h, v5.4h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ 704 add v26.4h, v22.4h, v24.4h /* z5 = z3 + z4 */ 705 706 smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */ 707 smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */ 708 smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */ 709 smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */ 710 smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */ 711 smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */ 712 smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */ 713 smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */ 714 smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */ 715 716 add v22.4s, v22.4s, v26.4s /* z3 += z5 */ 717 add v24.4s, v24.4s, v26.4s /* z4 += z5 */ 718 719 add v10.4s, v10.4s, v18.4s /* tmp0 += z1 */ 720 add v12.4s, v12.4s, v20.4s /* tmp1 += z2 */ 721 add v14.4s, v14.4s, v20.4s /* tmp2 += z2 */ 722 add v16.4s, v16.4s, v18.4s /* tmp3 += z1 */ 723 724 add v10.4s, v10.4s, v22.4s /* tmp0 += z3 */ 725 add v12.4s, v12.4s, v24.4s /* tmp1 += z4 */ 726 add v16.4s, v16.4s, v24.4s /* tmp3 += z4 */ 727 add v14.4s, v14.4s, v22.4s /* tmp2 += z3 */ 728 729 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ 730 731 add v18.4s, v2.4s, v16.4s /* tmp10 + tmp3 */ 732 sub v20.4s, v2.4s, v16.4s /* tmp10 - tmp3 */ 733 add v22.4s, v8.4s, v14.4s /* tmp11 + tmp2 */ 734 sub v24.4s, v8.4s, v14.4s /* tmp11 - tmp2 */ 735 add v26.4s, v4.4s, v12.4s /* tmp12 + tmp1 */ 736 sub v28.4s, v4.4s, v12.4s /* tmp12 - tmp1 */ 737 add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */ 738 sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */ 739 740 rshrn v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */ 741 rshrn v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */ 742 rshrn v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */ 743 rshrn v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */ 744 rshrn2 v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */ 745 rshrn2 v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */ 746 rshrn2 v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */ 747 rshrn2 v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */ 748 mov v6.16b, v15.16b 749 mov v7.16b, v15.16b 750 mov v8.16b, v15.16b 751 mov v9.16b, v15.16b 752 b 1b 753 754.balign 16 7553: 756 cbnz TMP4, 4f 757 /* Left 

.balign 16
3:
    cbnz            TMP4, 4f
    /* Left AC coef is zero */
    dup             v14.2d, v10.d[0]
    /* Even part: reverse the even part of the forward DCT. */
    add             v18.8h, v4.8h, v8.8h    /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
    add             v22.8h, v2.8h, v6.8h    /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    smull2          v19.4s, v18.8h, XFIX_P_0_541  /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
    sub             v26.8h, v2.8h, v6.8h    /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    sshll2          v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
    mov             v21.16b, v19.16b        /* tmp3 = z1 */
    smlal2          v19.4s, v8.8h, XFIX_N_1_847  /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
    sshll2          v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
    smlal2          v21.4s, v4.8h, XFIX_P_0_765  /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
    add             v28.4s, v23.4s, v21.4s  /* tmp10h tmp10 = tmp0 + tmp3; */
    sub             v31.4s, v23.4s, v21.4s  /* tmp13h tmp13 = tmp0 - tmp3; */
    add             v29.4s, v27.4s, v19.4s  /* tmp11h tmp11 = tmp1 + tmp2; */
    sub             v30.4s, v27.4s, v19.4s  /* tmp12h tmp12 = tmp1 - tmp2; */

    /* Odd part per figure 8; the matrix is unitary and hence its
     * transpose is its inverse.  i0..i3 are y7, y5, y3, y1 respectively.
     */

    add             v22.8h, v9.8h, v5.8h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    add             v24.8h, v7.8h, v3.8h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    add             v18.8h, v9.8h, v3.8h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    add             v20.8h, v7.8h, v5.8h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    add             v26.8h, v22.8h, v24.8h  /* z5 = z3 + z4 */

    smull2          v11.4s, v9.8h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
    smull2          v13.4s, v7.8h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
    smull2          v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
    smull2          v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
    smull2          v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
    smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
    smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
    smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
    smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */

    add             v23.4s, v23.4s, v27.4s  /* z3 += z5 */
    add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
    add             v25.4s, v25.4s, v27.4s  /* z4 += z5 */
    add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */

    add             v11.4s, v11.4s, v19.4s  /* tmp0 += z1 */
    add             v13.4s, v13.4s, v21.4s  /* tmp1 += z2 */
    add             v15.4s, v15.4s, v21.4s  /* tmp2 += z2 */
    add             v17.4s, v17.4s, v19.4s  /* tmp3 += z1 */

    add             v11.4s, v11.4s, v23.4s  /* tmp0 += z3 */
    add             v13.4s, v13.4s, v25.4s  /* tmp1 += z4 */
    add             v17.4s, v17.4s, v25.4s  /* tmp3 += z4 */
    add             v15.4s, v15.4s, v23.4s  /* tmp2 += z3 */

    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */

    add             v19.4s, v28.4s, v17.4s  /* tmp10 + tmp3 */
    sub             v21.4s, v28.4s, v17.4s  /* tmp10 - tmp3 */
    add             v23.4s, v29.4s, v15.4s  /* tmp11 + tmp2 */
    sub             v25.4s, v29.4s, v15.4s  /* tmp11 - tmp2 */
    add             v27.4s, v30.4s, v13.4s  /* tmp12 + tmp1 */
    sub             v29.4s, v30.4s, v13.4s  /* tmp12 - tmp1 */
    add             v15.4s, v31.4s, v11.4s  /* tmp13 + tmp0 */
    sub             v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */

    mov             v2.16b, v14.16b
    mov             v3.16b, v14.16b
    mov             v4.16b, v14.16b
    mov             v5.16b, v14.16b
    rshrn           v6.4h, v19.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
    rshrn           v7.4h, v23.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
    rshrn           v8.4h, v27.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
    rshrn           v9.4h, v15.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
    rshrn2          v6.8h, v17.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
    rshrn2          v7.8h, v29.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
    rshrn2          v8.8h, v25.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
    rshrn2          v9.8h, v21.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
    b               1b

.balign 16
4:
    /* "No" AC coef is zero */
    /* Even part: reverse the even part of the forward DCT. */
    add             v18.8h, v4.8h, v8.8h    /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
    add             v22.8h, v2.8h, v6.8h    /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    smull2          v19.4s, v18.8h, XFIX_P_0_541  /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
    sub             v26.8h, v2.8h, v6.8h    /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    smull           v18.4s, v18.4h, XFIX_P_0_541  /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
    sshll2          v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
    mov             v21.16b, v19.16b        /* tmp3 = z1 */
    mov             v20.16b, v18.16b        /* tmp3 = z1 */
    smlal2          v19.4s, v8.8h, XFIX_N_1_847  /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
    smlal           v18.4s, v8.4h, XFIX_N_1_847  /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
    sshll2          v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
    smlal2          v21.4s, v4.8h, XFIX_P_0_765  /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
    smlal           v20.4s, v4.4h, XFIX_P_0_765  /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
    sshll           v22.4s, v22.4h, #(CONST_BITS)  /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
    sshll           v26.4s, v26.4h, #(CONST_BITS)  /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
    add             v2.4s, v22.4s, v20.4s   /* tmp10l tmp10 = tmp0 + tmp3; */
    sub             v6.4s, v22.4s, v20.4s   /* tmp13l tmp13 = tmp0 - tmp3; */
    add             v8.4s, v26.4s, v18.4s   /* tmp11l tmp11 = tmp1 + tmp2; */
    sub             v4.4s, v26.4s, v18.4s   /* tmp12l tmp12 = tmp1 - tmp2; */
    add             v28.4s, v23.4s, v21.4s  /* tmp10h tmp10 = tmp0 + tmp3; */
    sub             v31.4s, v23.4s, v21.4s  /* tmp13h tmp13 = tmp0 - tmp3; */
    add             v29.4s, v27.4s, v19.4s  /* tmp11h tmp11 = tmp1 + tmp2; */
    sub             v30.4s, v27.4s, v19.4s  /* tmp12h tmp12 = tmp1 - tmp2; */

    /* Odd part per figure 8; the matrix is unitary and hence its
     * transpose is its inverse.  i0..i3 are y7, y5, y3, y1 respectively.
     */

    add             v22.8h, v9.8h, v5.8h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    add             v24.8h, v7.8h, v3.8h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    add             v18.8h, v9.8h, v3.8h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    add             v20.8h, v7.8h, v5.8h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    add             v26.8h, v22.8h, v24.8h  /* z5 = z3 + z4 */

    smull2          v11.4s, v9.8h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
    smull2          v13.4s, v7.8h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
    smull2          v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
    smull2          v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
    smull2          v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
    smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
    smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
    smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
    smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */

    smull           v10.4s, v9.4h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
    smull           v12.4s, v7.4h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
    smull           v14.4s, v5.4h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
    smull           v16.4s, v3.4h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
    smull           v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
    smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
    smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
    smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
    smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */

    add             v23.4s, v23.4s, v27.4s  /* z3 += z5 */
    add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
    add             v25.4s, v25.4s, v27.4s  /* z4 += z5 */
    add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */

    add             v11.4s, v11.4s, v19.4s  /* tmp0 += z1 */
    add             v10.4s, v10.4s, v18.4s  /* tmp0 += z1 */
    add             v13.4s, v13.4s, v21.4s  /* tmp1 += z2 */
    add             v12.4s, v12.4s, v20.4s  /* tmp1 += z2 */
    add             v15.4s, v15.4s, v21.4s  /* tmp2 += z2 */
    add             v14.4s, v14.4s, v20.4s  /* tmp2 += z2 */
    add             v17.4s, v17.4s, v19.4s  /* tmp3 += z1 */
    add             v16.4s, v16.4s, v18.4s  /* tmp3 += z1 */

    add             v11.4s, v11.4s, v23.4s  /* tmp0 += z3 */
    add             v10.4s, v10.4s, v22.4s  /* tmp0 += z3 */
    add             v13.4s, v13.4s, v25.4s  /* tmp1 += z4 */
    add             v12.4s, v12.4s, v24.4s  /* tmp1 += z4 */
    add             v17.4s, v17.4s, v25.4s  /* tmp3 += z4 */
    add             v16.4s, v16.4s, v24.4s  /* tmp3 += z4 */
    add             v15.4s, v15.4s, v23.4s  /* tmp2 += z3 */
    add             v14.4s, v14.4s, v22.4s  /* tmp2 += z3 */

    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */

    add             v18.4s, v2.4s, v16.4s   /* tmp10 + tmp3 */
    add             v19.4s, v28.4s, v17.4s  /* tmp10 + tmp3 */
    sub             v20.4s, v2.4s, v16.4s   /* tmp10 - tmp3 */
    sub             v21.4s, v28.4s, v17.4s  /* tmp10 - tmp3 */
    add             v22.4s, v8.4s, v14.4s   /* tmp11 + tmp2 */
    add             v23.4s, v29.4s, v15.4s  /* tmp11 + tmp2 */
    sub             v24.4s, v8.4s, v14.4s   /* tmp11 - tmp2 */
    sub             v25.4s, v29.4s, v15.4s  /* tmp11 - tmp2 */
    add             v26.4s, v4.4s, v12.4s   /* tmp12 + tmp1 */
    add             v27.4s, v30.4s, v13.4s  /* tmp12 + tmp1 */
    sub             v28.4s, v4.4s, v12.4s   /* tmp12 - tmp1 */
    sub             v29.4s, v30.4s, v13.4s  /* tmp12 - tmp1 */
    add             v14.4s, v6.4s, v10.4s   /* tmp13 + tmp0 */
    add             v15.4s, v31.4s, v11.4s  /* tmp13 + tmp0 */
    sub             v16.4s, v6.4s, v10.4s   /* tmp13 - tmp0 */
    sub             v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */

    rshrn           v2.4h, v18.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
    rshrn           v3.4h, v22.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
    rshrn           v4.4h, v26.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
    rshrn           v5.4h, v14.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
    rshrn           v6.4h, v19.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
    rshrn           v7.4h, v23.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
    rshrn           v8.4h, v27.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
    rshrn           v9.4h, v15.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
    rshrn2          v2.8h, v16.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
    rshrn2          v3.8h, v28.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
    rshrn2          v4.8h, v24.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
    rshrn2          v5.8h, v20.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
    rshrn2          v6.8h, v17.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
    rshrn2          v7.8h, v29.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
    rshrn2          v8.8h, v25.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
    rshrn2          v9.8h, v21.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
    b               1b

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4
    .unreq          TMP5
    .unreq          TMP6
    .unreq          TMP7
    .unreq          TMP8

#undef CENTERJSAMPLE
#undef CONST_BITS
#undef PASS1_BITS
#undef XFIX_P_0_298
#undef XFIX_N_0_390
#undef XFIX_P_0_541
#undef XFIX_P_0_765
#undef XFIX_N_0_899
#undef XFIX_P_1_175
#undef XFIX_P_1_501
#undef XFIX_N_1_847
#undef XFIX_N_1_961
#undef XFIX_P_2_053
#undef XFIX_N_2_562
#undef XFIX_P_3_072


/*****************************************************************************/

/*
 * jsimd_idct_ifast_neon
 *
 * This function contains a fast, not so accurate integer implementation of
 * the inverse DCT (Discrete Cosine Transform).  It uses the same
 * calculations and produces exactly the same output as IJG's original
 * 'jpeg_idct_ifast' function from jidctfst.c.
 *
 * Normally a 1-D AAN IDCT needs 5 multiplications and 29 additions, but in
 * the ARM NEON case some extra additions are required, because the SQDMULH
 * instruction can't handle constants larger than 1.  Expressions like
 * "x * 1.082392200" therefore have to be converted to
 * "x * 0.082392200 + x", which introduces an extra addition.  Overall,
 * there are 6 extra additions per 1-D IDCT pass, for a total of
 * 5 SQDMULH and 35 ADD/SUB instructions.
 */
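
/*
 * Added sketch of the constant trick: SQDMULH computes (2 * a * b) >> 16
 * per lane, i.e. it multiplies by b / 2^15, so only multipliers in
 * (-1, 1) can be encoded directly.  The table at
 * Ljsimd_idct_ifast_neon_consts therefore stores just the fractional
 * parts, and the integer parts are re-added explicitly:
 *
 *   x * 1.414213562  ->  x + sqdmulh(x, F(0.414213562))
 *   x * 2.613125930  ->  2 * x + sqdmulh(x, F(0.613125930))
 *
 * where F(v) is v scaled to Q15 (entries such as 362 * 128 - 256 * 128
 * keep the 8-bit precision of jidctfst.c's constants).
 */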

#define XFIX_1_082392200  v0.h[0]
#define XFIX_1_414213562  v0.h[1]
#define XFIX_1_847759065  v0.h[2]
#define XFIX_2_613125930  v0.h[3]

asm_function jsimd_idct_ifast_neon

    DCT_TABLE       .req x0
    COEF_BLOCK      .req x1
    OUTPUT_BUF      .req x2
    OUTPUT_COL      .req x3
    TMP1            .req x0
    TMP2            .req x1
    TMP3            .req x9
    TMP4            .req x10
    TMP5            .req x11
    TMP6            .req x12
    TMP7            .req x13
    TMP8            .req x14

    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
       guarantee that the upper (unused) 32 bits of x3 are valid.  This
       instruction ensures that those bits are set to zero. */
    uxtw            x3, w3

    /* Load and dequantize coefficients into NEON registers
     * with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | d16     | d17     ( v16.8h )
     *   1 | d18     | d19     ( v17.8h )
     *   2 | d20     | d21     ( v18.8h )
     *   3 | d22     | d23     ( v19.8h )
     *   4 | d24     | d25     ( v20.8h )
     *   5 | d26     | d27     ( v21.8h )
     *   6 | d28     | d29     ( v22.8h )
     *   7 | d30     | d31     ( v23.8h )
     */
    /* Save NEON registers used in fast IDCT */
    get_symbol_loc  TMP5, Ljsimd_idct_ifast_neon_consts
    ld1             {v16.8h, v17.8h}, [COEF_BLOCK], 32
    ld1             {v0.8h, v1.8h}, [DCT_TABLE], 32
    ld1             {v18.8h, v19.8h}, [COEF_BLOCK], 32
    mul             v16.8h, v16.8h, v0.8h
    ld1             {v2.8h, v3.8h}, [DCT_TABLE], 32
    mul             v17.8h, v17.8h, v1.8h
    ld1             {v20.8h, v21.8h}, [COEF_BLOCK], 32
    mul             v18.8h, v18.8h, v2.8h
    ld1             {v0.8h, v1.8h}, [DCT_TABLE], 32
    mul             v19.8h, v19.8h, v3.8h
    ld1             {v22.8h, v23.8h}, [COEF_BLOCK], 32
    mul             v20.8h, v20.8h, v0.8h
    ld1             {v2.8h, v3.8h}, [DCT_TABLE], 32
    mul             v22.8h, v22.8h, v2.8h
    mul             v21.8h, v21.8h, v1.8h
    ld1             {v0.4h}, [TMP5]  /* load constants */
    mul             v23.8h, v23.8h, v3.8h

    /* 1-D IDCT, pass 1 */
    sub             v2.8h, v18.8h, v22.8h
    add             v22.8h, v18.8h, v22.8h
    sub             v1.8h, v19.8h, v21.8h
    add             v21.8h, v19.8h, v21.8h
    sub             v5.8h, v17.8h, v23.8h
    add             v23.8h, v17.8h, v23.8h
    sqdmulh         v4.8h, v2.8h, XFIX_1_414213562
    sqdmulh         v6.8h, v1.8h, XFIX_2_613125930
    add             v3.8h, v1.8h, v1.8h
    sub             v1.8h, v5.8h, v1.8h
    add             v18.8h, v2.8h, v4.8h
    sqdmulh         v4.8h, v1.8h, XFIX_1_847759065
    sub             v2.8h, v23.8h, v21.8h
    add             v3.8h, v3.8h, v6.8h
    sqdmulh         v6.8h, v2.8h, XFIX_1_414213562
    add             v1.8h, v1.8h, v4.8h
    sqdmulh         v4.8h, v5.8h, XFIX_1_082392200
    sub             v18.8h, v18.8h, v22.8h
    add             v2.8h, v2.8h, v6.8h
    sub             v6.8h, v16.8h, v20.8h
    add             v20.8h, v16.8h, v20.8h
    add             v17.8h, v5.8h, v4.8h
    add             v5.8h, v6.8h, v18.8h
    sub             v18.8h, v6.8h, v18.8h
    add             v6.8h, v23.8h, v21.8h
    add             v16.8h, v20.8h, v22.8h
    sub             v3.8h, v6.8h, v3.8h
    sub             v20.8h, v20.8h, v22.8h
    sub             v3.8h, v3.8h, v1.8h
    sub             v1.8h, v17.8h, v1.8h
    add             v2.8h, v3.8h, v2.8h
    sub             v23.8h, v16.8h, v6.8h
    add             v1.8h, v1.8h, v2.8h
    add             v16.8h, v16.8h, v6.8h
    add             v22.8h, v5.8h, v3.8h
    sub             v17.8h, v5.8h, v3.8h
    sub             v21.8h, v18.8h, v2.8h
    add             v18.8h, v18.8h, v2.8h
    sub             v19.8h, v20.8h, v1.8h
    add             v20.8h, v20.8h, v1.8h
    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v28, v29, v30, v31
    /* 1-D IDCT, pass 2 */
    sub             v2.8h, v18.8h, v22.8h
    add             v22.8h, v18.8h, v22.8h
    sub             v1.8h, v19.8h, v21.8h
    add             v21.8h, v19.8h, v21.8h
    sub             v5.8h, v17.8h, v23.8h
    add             v23.8h, v17.8h, v23.8h
    sqdmulh         v4.8h, v2.8h, XFIX_1_414213562
    sqdmulh         v6.8h, v1.8h, XFIX_2_613125930
    add             v3.8h, v1.8h, v1.8h
    sub             v1.8h, v5.8h, v1.8h
    add             v18.8h, v2.8h, v4.8h
    sqdmulh         v4.8h, v1.8h, XFIX_1_847759065
    sub             v2.8h, v23.8h, v21.8h
    add             v3.8h, v3.8h, v6.8h
    sqdmulh         v6.8h, v2.8h, XFIX_1_414213562
    add             v1.8h, v1.8h, v4.8h
    sqdmulh         v4.8h, v5.8h, XFIX_1_082392200
    sub             v18.8h, v18.8h, v22.8h
    add             v2.8h, v2.8h, v6.8h
    sub             v6.8h, v16.8h, v20.8h
    add             v20.8h, v16.8h, v20.8h
    add             v17.8h, v5.8h, v4.8h
    add             v5.8h, v6.8h, v18.8h
    sub             v18.8h, v6.8h, v18.8h
    add             v6.8h, v23.8h, v21.8h
    add             v16.8h, v20.8h, v22.8h
    sub             v3.8h, v6.8h, v3.8h
    sub             v20.8h, v20.8h, v22.8h
    sub             v3.8h, v3.8h, v1.8h
    sub             v1.8h, v17.8h, v1.8h
    add             v2.8h, v3.8h, v2.8h
    sub             v23.8h, v16.8h, v6.8h
    add             v1.8h, v1.8h, v2.8h
    add             v16.8h, v16.8h, v6.8h
    add             v22.8h, v5.8h, v3.8h
    sub             v17.8h, v5.8h, v3.8h
    sub             v21.8h, v18.8h, v2.8h
    add             v18.8h, v18.8h, v2.8h
    sub             v19.8h, v20.8h, v1.8h
    add             v20.8h, v20.8h, v1.8h
    /* Descale to 8-bit and range limit */
    movi            v0.16b, #0x80
    /* Prepare pointers (dual-issue with NEON instructions) */
    ldp             TMP1, TMP2, [OUTPUT_BUF], 16
    sqshrn          v28.8b, v16.8h, #5
    ldp             TMP3, TMP4, [OUTPUT_BUF], 16
    sqshrn          v29.8b, v17.8h, #5
    add             TMP1, TMP1, OUTPUT_COL
    sqshrn          v30.8b, v18.8h, #5
    add             TMP2, TMP2, OUTPUT_COL
    sqshrn          v31.8b, v19.8h, #5
    add             TMP3, TMP3, OUTPUT_COL
    sqshrn2         v28.16b, v20.8h, #5
    add             TMP4, TMP4, OUTPUT_COL
    sqshrn2         v29.16b, v21.8h, #5
    ldp             TMP5, TMP6, [OUTPUT_BUF], 16
    sqshrn2         v30.16b, v22.8h, #5
    ldp             TMP7, TMP8, [OUTPUT_BUF], 16
    sqshrn2         v31.16b, v23.8h, #5
    add             TMP5, TMP5, OUTPUT_COL
    add             v16.16b, v28.16b, v0.16b
    add             TMP6, TMP6, OUTPUT_COL
    add             v18.16b, v29.16b, v0.16b
    add             TMP7, TMP7, OUTPUT_COL
    add             v20.16b, v30.16b, v0.16b
    add             TMP8, TMP8, OUTPUT_COL
    add             v22.16b, v31.16b, v0.16b

    /* Transpose the final 8-bit samples */
    trn1            v28.16b, v16.16b, v18.16b
    trn1            v30.16b, v20.16b, v22.16b
    trn2            v29.16b, v16.16b, v18.16b
    trn2            v31.16b, v20.16b, v22.16b

    trn1            v16.8h, v28.8h, v30.8h
    trn2            v18.8h, v28.8h, v30.8h
    trn1            v20.8h, v29.8h, v31.8h
    trn2            v22.8h, v29.8h, v31.8h

    uzp1            v28.4s, v16.4s, v18.4s
    uzp2            v30.4s, v16.4s, v18.4s
    uzp1            v29.4s, v20.4s, v22.4s
    uzp2            v31.4s, v20.4s, v22.4s

    /* Store results to the output buffer */
    st1             {v28.d}[0], [TMP1]
    st1             {v29.d}[0], [TMP2]
    st1             {v28.d}[1], [TMP3]
    st1             {v29.d}[1], [TMP4]
    st1             {v30.d}[0], [TMP5]
    st1             {v31.d}[0], [TMP6]
    st1             {v30.d}[1], [TMP7]
    st1             {v31.d}[1], [TMP8]
    blr             x30

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4
    .unreq          TMP5
    .unreq          TMP6
    .unreq          TMP7
    .unreq          TMP8
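
/*
 * Added note on the descale above: jpeg_idct_ifast's outputs carry a
 * scale factor of 2^5 (PASS1_BITS + 3), so SQSHRN #5 removes that factor
 * and saturates to 8 bits in one step; adding 0x80 (CENTERJSAMPLE) then
 * recenters the samples, with the 8-bit wraparound providing the range
 * limit.  Per sample, roughly:
 *
 *   uint8_t px = (uint8_t)(sat8(x >> 5) + 128);
 */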

/*****************************************************************************/

/*
 * jsimd_idct_4x4_neon
 *
 * This function contains inverse-DCT code for getting reduced-size
 * 4x4 pixels output from an 8x8 DCT block.  It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
 * function from jpeg-6b (jidctred.c).
 *
 * NOTE: jpeg-8 has an improved implementation of the 4x4 inverse-DCT, which
 *       requires far fewer arithmetic operations and hence should be faster.
 *       The primary purpose of this particular NEON optimized function is
 *       bit exact compatibility with jpeg-6b.
 *
 * TODO: slightly better instruction scheduling could be achieved by
 *       expanding the idct_helper/transpose_4x4 macros and reordering
 *       instructions, but readability would suffer somewhat.
 */

.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
    smull           v28.4s, \x4, v2.h[2]
    smlal           v28.4s, \x8, v0.h[0]
    smlal           v28.4s, \x14, v0.h[1]

    smull           v26.4s, \x16, v1.h[2]
    smlal           v26.4s, \x12, v1.h[3]
    smlal           v26.4s, \x10, v2.h[0]
    smlal           v26.4s, \x6, v2.h[1]

    smull           v30.4s, \x4, v2.h[2]
    smlsl           v30.4s, \x8, v0.h[0]
    smlsl           v30.4s, \x14, v0.h[1]

    smull           v24.4s, \x16, v0.h[2]
    smlal           v24.4s, \x12, v0.h[3]
    smlal           v24.4s, \x10, v1.h[0]
    smlal           v24.4s, \x6, v1.h[1]

    add             v20.4s, v28.4s, v26.4s
    sub             v28.4s, v28.4s, v26.4s

  .if \shift > 16
    srshr           v20.4s, v20.4s, #\shift
    srshr           v28.4s, v28.4s, #\shift
    xtn             \y26, v20.4s
    xtn             \y29, v28.4s
  .else
    rshrn           \y26, v20.4s, #\shift
    rshrn           \y29, v28.4s, #\shift
  .endif

    add             v20.4s, v30.4s, v24.4s
    sub             v30.4s, v30.4s, v24.4s

  .if \shift > 16
    srshr           v20.4s, v20.4s, #\shift
    srshr           v30.4s, v30.4s, #\shift
    xtn             \y27, v20.4s
    xtn             \y28, v30.4s
  .else
    rshrn           \y27, v20.4s, #\shift
    rshrn           \y28, v30.4s, #\shift
  .endif
.endm
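
/*
 * A C model of idct_helper, derived from the instructions above (names
 * p0..p3 are ad hoc; v0.h[*]..v2.h[*] are the constants loaded from
 * Ljsimd_idct_4x4_neon_consts):
 *
 *   p0 = x4 * v2.h[2] + x8 * v0.h[0] + x14 * v0.h[1];
 *   p1 = x16 * v1.h[2] + x12 * v1.h[3] + x10 * v2.h[0] + x6 * v2.h[1];
 *   p2 = x4 * v2.h[2] - x8 * v0.h[0] - x14 * v0.h[1];
 *   p3 = x16 * v0.h[2] + x12 * v0.h[3] + x10 * v1.h[0] + x6 * v1.h[1];
 *   y26 = DESCALE(p0 + p1, shift);  y29 = DESCALE(p0 - p1, shift);
 *   y27 = DESCALE(p2 + p3, shift);  y28 = DESCALE(p2 - p3, shift);
 *
 * In pass 1, x4..x16 hold rows 0..3 and 5..7 of one four-column half of
 * the block; v2.h[2] = 1 << (CONST_BITS + 1) implements the DC left shift
 * of jpeg_idct_4x4() in jidctred.c as a multiplication.
 */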

asm_function jsimd_idct_4x4_neon

    DCT_TABLE       .req x0
    COEF_BLOCK      .req x1
    OUTPUT_BUF      .req x2
    OUTPUT_COL      .req x3
    TMP1            .req x0
    TMP2            .req x1
    TMP3            .req x2
    TMP4            .req x15

    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
       guarantee that the upper (unused) 32 bits of x3 are valid.  This
       instruction ensures that those bits are set to zero. */
    uxtw            x3, w3

    /* Save all used NEON registers */
    sub             sp, sp, 64
    mov             x9, sp
    /* Load constants (v3.4h is just used for padding) */
    get_symbol_loc  TMP4, Ljsimd_idct_4x4_neon_consts
    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
    ld1             {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | v4.4h  | v5.4h
     *   1 | v6.4h  | v7.4h
     *   2 | v8.4h  | v9.4h
     *   3 | v10.4h | v11.4h
     *   4 | -      | -
     *   5 | v12.4h | v13.4h
     *   6 | v14.4h | v15.4h
     *   7 | v16.4h | v17.4h
     */
    ld1             {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
    ld1             {v8.4h, v9.4h, v10.4h, v11.4h}, [COEF_BLOCK], 32
    add             COEF_BLOCK, COEF_BLOCK, #16
    ld1             {v12.4h, v13.4h, v14.4h, v15.4h}, [COEF_BLOCK], 32
    ld1             {v16.4h, v17.4h}, [COEF_BLOCK], 16
    /* dequantize */
    ld1             {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
    mul             v4.4h, v4.4h, v18.4h
    mul             v5.4h, v5.4h, v19.4h
    ins             v4.d[1], v5.d[0]  /* 128 bit q4 */
    ld1             {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32
    mul             v6.4h, v6.4h, v20.4h
    mul             v7.4h, v7.4h, v21.4h
    ins             v6.d[1], v7.d[0]  /* 128 bit q6 */
    mul             v8.4h, v8.4h, v22.4h
    mul             v9.4h, v9.4h, v23.4h
    ins             v8.d[1], v9.d[0]  /* 128 bit q8 */
    add             DCT_TABLE, DCT_TABLE, #16
    ld1             {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32
    mul             v10.4h, v10.4h, v24.4h
    mul             v11.4h, v11.4h, v25.4h
    ins             v10.d[1], v11.d[0]  /* 128 bit q10 */
    mul             v12.4h, v12.4h, v26.4h
    mul             v13.4h, v13.4h, v27.4h
    ins             v12.d[1], v13.d[0]  /* 128 bit q12 */
    ld1             {v30.4h, v31.4h}, [DCT_TABLE], 16
    mul             v14.4h, v14.4h, v28.4h
    mul             v15.4h, v15.4h, v29.4h
    ins             v14.d[1], v15.d[0]  /* 128 bit q14 */
    mul             v16.4h, v16.4h, v30.4h
    mul             v17.4h, v17.4h, v31.4h
    ins             v16.d[1], v17.d[0]  /* 128 bit q16 */

    /* Pass 1 */
    idct_helper     v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, \
                    v4.4h, v6.4h, v8.4h, v10.4h
    transpose_4x4   v4, v6, v8, v10, v3
    ins             v10.d[1], v11.d[0]
    idct_helper     v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, \
                    v5.4h, v7.4h, v9.4h, v11.4h
    transpose_4x4   v5, v7, v9, v11, v3
    ins             v10.d[1], v11.d[0]

    /* Pass 2 */
    idct_helper     v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, \
                    v26.4h, v27.4h, v28.4h, v29.4h
    transpose_4x4   v26, v27, v28, v29, v3

    /* Range limit */
    movi            v30.8h, #0x80
    ins             v26.d[1], v27.d[0]
    ins             v28.d[1], v29.d[0]
    add             v26.8h, v26.8h, v30.8h
    add             v28.8h, v28.8h, v30.8h
    sqxtun          v26.8b, v26.8h
    sqxtun          v27.8b, v28.8h

    /* Store results to the output buffer */
    ldp             TMP1, TMP2, [OUTPUT_BUF], 16
    ldp             TMP3, TMP4, [OUTPUT_BUF]
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    add             TMP3, TMP3, OUTPUT_COL
    add             TMP4, TMP4, OUTPUT_COL
    /* Range limit */
    movi v30.8h, #0x80
    ins v26.d[1], v27.d[0]
    ins v28.d[1], v29.d[0]
    add v26.8h, v26.8h, v30.8h
    add v28.8h, v28.8h, v30.8h
    sqxtun v26.8b, v26.8h
    sqxtun v27.8b, v28.8h

    /* Store results to the output buffer */
    ldp TMP1, TMP2, [OUTPUT_BUF], 16
    ldp TMP3, TMP4, [OUTPUT_BUF]
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL
    add TMP3, TMP3, OUTPUT_COL
    add TMP4, TMP4, OUTPUT_COL

#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
    /* We can use many fewer instructions on little-endian systems if the
     * OS kernel is not configured to trap unaligned memory accesses
     */
    st1 {v26.s}[0], [TMP1], 4
    st1 {v27.s}[0], [TMP3], 4
    st1 {v26.s}[1], [TMP2], 4
    st1 {v27.s}[1], [TMP4], 4
#else
    st1 {v26.b}[0], [TMP1], 1
    st1 {v27.b}[0], [TMP3], 1
    st1 {v26.b}[1], [TMP1], 1
    st1 {v27.b}[1], [TMP3], 1
    st1 {v26.b}[2], [TMP1], 1
    st1 {v27.b}[2], [TMP3], 1
    st1 {v26.b}[3], [TMP1], 1
    st1 {v27.b}[3], [TMP3], 1

    st1 {v26.b}[4], [TMP2], 1
    st1 {v27.b}[4], [TMP4], 1
    st1 {v26.b}[5], [TMP2], 1
    st1 {v27.b}[5], [TMP4], 1
    st1 {v26.b}[6], [TMP2], 1
    st1 {v27.b}[6], [TMP4], 1
    st1 {v26.b}[7], [TMP2], 1
    st1 {v27.b}[7], [TMP4], 1
#endif

    /* vpop {v8.4h - v15.4h} ; not available */
    ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    blr x30

    .unreq DCT_TABLE
    .unreq COEF_BLOCK
    .unreq OUTPUT_BUF
    .unreq OUTPUT_COL
    .unreq TMP1
    .unreq TMP2
    .unreq TMP3
    .unreq TMP4

.purgem idct_helper


/*****************************************************************************/

/*
 * jsimd_idct_2x2_neon
 *
 * This function contains inverse-DCT code for getting reduced-size
 * 2x2 pixels output from an 8x8 DCT block.  It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
 * function from jpeg-6b (jidctred.c).
 *
 * NOTE: jpeg-8 has an improved implementation of the 2x2 inverse-DCT, which
 * requires far fewer arithmetic operations and hence should be faster.
 * The primary purpose of this particular NEON optimized function is
 * bit-exact compatibility with jpeg-6b.
 */
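
/* A minimal C sketch of what one idct_helper invocation below computes for
 * a 4-coefficient column slice (illustrative; F1, F3, F5, F7 stand for the
 * Q13 fixed-point constants preloaded into the v14 lanes):
 *
 *   void idct_2x2_column(int16_t c0, int16_t c1, int16_t c3, int16_t c5,
 *                        int16_t c7, int shift, int16_t out[2])
 *   {
 *     int32_t even = (int32_t)c0 << 15;      // sshll #15
 *     int32_t odd  = F1 * c1 + F3 * c3 +     // smull/smlal against
 *                    F5 * c5 + F7 * c7;      //   the v14 lanes
 *     int32_t round = 1 << (shift - 1);
 *     out[0] = (int16_t)((even + odd + round) >> shift);  // rshrn
 *     out[1] = (int16_t)((even - odd + round) >> shift);
 *   }
 */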
.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
    sshll v15.4s, \x4, #15
    smull v26.4s, \x6, v14.h[3]
    smlal v26.4s, \x10, v14.h[2]
    smlal v26.4s, \x12, v14.h[1]
    smlal v26.4s, \x16, v14.h[0]

    add v20.4s, v15.4s, v26.4s
    sub v15.4s, v15.4s, v26.4s

  .if \shift > 16
    srshr v20.4s, v20.4s, #\shift
    srshr v15.4s, v15.4s, #\shift
    xtn \y26, v20.4s
    xtn \y27, v15.4s
  .else
    rshrn \y26, v20.4s, #\shift
    rshrn \y27, v15.4s, #\shift
  .endif
.endm

asm_function jsimd_idct_2x2_neon

    DCT_TABLE  .req x0
    COEF_BLOCK .req x1
    OUTPUT_BUF .req x2
    OUTPUT_COL .req x3
    TMP1       .req x0
    TMP2       .req x15

    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
       guarantee that the upper (unused) 32 bits of x3 are valid.  This
       instruction ensures that those bits are set to zero. */
    uxtw x3, w3

    /* vpush {v8.4h - v15.4h} ; not available */
    sub sp, sp, 64
    mov x9, sp

    /* Load constants */
    get_symbol_loc TMP2, Ljsimd_idct_2x2_neon_consts
    st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
    st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
    ld1 {v14.4h}, [TMP2]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  | v4.4h  | v5.4h
     *   1  | v6.4h  | v7.4h
     *   2  | -      | -
     *   3  | v10.4h | v11.4h
     *   4  | -      | -
     *   5  | v12.4h | v13.4h
     *   6  | -      | -
     *   7  | v16.4h | v17.4h
     */
    ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
    add COEF_BLOCK, COEF_BLOCK, #16
    ld1 {v10.4h, v11.4h}, [COEF_BLOCK], 16
    add COEF_BLOCK, COEF_BLOCK, #16
    ld1 {v12.4h, v13.4h}, [COEF_BLOCK], 16
    add COEF_BLOCK, COEF_BLOCK, #16
    ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16
    /* Dequantize */
    ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
    mul v4.4h, v4.4h, v18.4h
    mul v5.4h, v5.4h, v19.4h
    ins v4.d[1], v5.d[0]
    mul v6.4h, v6.4h, v20.4h
    mul v7.4h, v7.4h, v21.4h
    ins v6.d[1], v7.d[0]
    add DCT_TABLE, DCT_TABLE, #16
    ld1 {v24.4h, v25.4h}, [DCT_TABLE], 16
    mul v10.4h, v10.4h, v24.4h
    mul v11.4h, v11.4h, v25.4h
    ins v10.d[1], v11.d[0]
    add DCT_TABLE, DCT_TABLE, #16
    ld1 {v26.4h, v27.4h}, [DCT_TABLE], 16
    mul v12.4h, v12.4h, v26.4h
    mul v13.4h, v13.4h, v27.4h
    ins v12.d[1], v13.d[0]
    add DCT_TABLE, DCT_TABLE, #16
    ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16
    mul v16.4h, v16.4h, v30.4h
    mul v17.4h, v17.4h, v31.4h
    ins v16.d[1], v17.d[0]

    /* Pass 1 */
#if 0
    idct_helper v4.4h, v6.4h, v10.4h, v12.4h, v16.4h, 13, v4.4h, v6.4h
    transpose_4x4 v4.4h, v6.4h, v8.4h, v10.4h
    idct_helper v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h
    transpose_4x4 v5.4h, v7.4h, v9.4h, v11.4h
#else
    smull v26.4s, v6.4h, v14.h[3]
    smlal v26.4s, v10.4h, v14.h[2]
    smlal v26.4s, v12.4h, v14.h[1]
    smlal v26.4s, v16.4h, v14.h[0]
    smull v24.4s, v7.4h, v14.h[3]
    smlal v24.4s, v11.4h, v14.h[2]
    smlal v24.4s, v13.4h, v14.h[1]
    smlal v24.4s, v17.4h, v14.h[0]
    sshll v15.4s, v4.4h, #15
    sshll v30.4s, v5.4h, #15
    add v20.4s, v15.4s, v26.4s
    sub v15.4s, v15.4s, v26.4s
    rshrn v4.4h, v20.4s, #13
    rshrn v6.4h, v15.4s, #13
    add v20.4s, v30.4s, v24.4s
    sub v15.4s, v30.4s, v24.4s
    rshrn v5.4h, v20.4s, #13
    rshrn v7.4h, v15.4s, #13
    ins v4.d[1], v5.d[0]
    ins v6.d[1], v7.d[0]
    transpose v4, v6, v3, .16b, .8h
    transpose v6, v10, v3, .16b, .4s
    ins v11.d[0], v10.d[1]
    ins v7.d[0], v6.d[1]
#endif

    /* Pass 2 */
    idct_helper v4.4h, v6.4h, v10.4h, v7.4h, v11.4h, 20, v26.4h, v27.4h

    /* Range limit */
    movi v30.8h, #0x80
    ins v26.d[1], v27.d[0]
    add v26.8h, v26.8h, v30.8h
    sqxtun v30.8b, v26.8h
    ins v26.d[0], v30.d[0]
    sqxtun v27.8b, v26.8h

    /* Store results to the output buffer */
    ldp TMP1, TMP2, [OUTPUT_BUF]
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL

    st1 {v26.b}[0], [TMP1], 1
    st1 {v27.b}[4], [TMP1], 1
    st1 {v26.b}[1], [TMP2], 1
    st1 {v27.b}[5], [TMP2], 1

    ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    blr x30

    .unreq DCT_TABLE
    .unreq COEF_BLOCK
    .unreq OUTPUT_BUF
    .unreq OUTPUT_COL
    .unreq TMP1
    .unreq TMP2

.purgem idct_helper


/*****************************************************************************/

/*
 * jsimd_ycc_extrgb_convert_neon
 * jsimd_ycc_extbgr_convert_neon
 * jsimd_ycc_extrgbx_convert_neon
 * jsimd_ycc_extbgrx_convert_neon
 * jsimd_ycc_extxbgr_convert_neon
 * jsimd_ycc_extxrgb_convert_neon
 *
 * Colorspace conversion YCbCr -> RGB
 */
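
/* What the stage1/stage2 macros below compute, as a minimal per-pixel C
 * sketch (the constants match the "multiply by" comments in the code;
 * clamp() abbreviates the [0, 255] saturation performed by sqxtun):
 *
 *   void ycc_to_rgb(int y, int cb, int cr, unsigned char rgb[3])
 *   {
 *     cb -= 128;  cr -= 128;                       // uaddw with v2 (-128)
 *     int r = y + ((22971 * cr + (1 << 13)) >> 14);              // 1.40200
 *     int g = y + ((-11277 * cb - 23401 * cr + (1 << 14)) >> 15);
 *     int b = y + ((29033 * cb + (1 << 13)) >> 14);              // 1.77200
 *     rgb[0] = clamp(r);  rgb[1] = clamp(g);  rgb[2] = clamp(b);
 *   }
 */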
.macro do_load size
  .if \size == 8
    ld1 {v4.8b}, [U], 8
    ld1 {v5.8b}, [V], 8
    ld1 {v0.8b}, [Y], 8
    prfm pldl1keep, [U, #64]
    prfm pldl1keep, [V, #64]
    prfm pldl1keep, [Y, #64]
  .elseif \size == 4
    ld1 {v4.b}[0], [U], 1
    ld1 {v4.b}[1], [U], 1
    ld1 {v4.b}[2], [U], 1
    ld1 {v4.b}[3], [U], 1
    ld1 {v5.b}[0], [V], 1
    ld1 {v5.b}[1], [V], 1
    ld1 {v5.b}[2], [V], 1
    ld1 {v5.b}[3], [V], 1
    ld1 {v0.b}[0], [Y], 1
    ld1 {v0.b}[1], [Y], 1
    ld1 {v0.b}[2], [Y], 1
    ld1 {v0.b}[3], [Y], 1
  .elseif \size == 2
    ld1 {v4.b}[4], [U], 1
    ld1 {v4.b}[5], [U], 1
    ld1 {v5.b}[4], [V], 1
    ld1 {v5.b}[5], [V], 1
    ld1 {v0.b}[4], [Y], 1
    ld1 {v0.b}[5], [Y], 1
  .elseif \size == 1
    ld1 {v4.b}[6], [U], 1
    ld1 {v5.b}[6], [V], 1
    ld1 {v0.b}[6], [Y], 1
  .else
    .error unsupported macroblock size
  .endif
.endm

.macro do_store bpp, size, fast_st3
  .if \bpp == 24
    .if \size == 8
      .if \fast_st3 == 1
        st3 {v10.8b, v11.8b, v12.8b}, [RGB], 24
      .else
        st1 {v10.b}[0], [RGB], #1
        st1 {v11.b}[0], [RGB], #1
        st1 {v12.b}[0], [RGB], #1

        st1 {v10.b}[1], [RGB], #1
        st1 {v11.b}[1], [RGB], #1
        st1 {v12.b}[1], [RGB], #1

        st1 {v10.b}[2], [RGB], #1
        st1 {v11.b}[2], [RGB], #1
        st1 {v12.b}[2], [RGB], #1

        st1 {v10.b}[3], [RGB], #1
        st1 {v11.b}[3], [RGB], #1
        st1 {v12.b}[3], [RGB], #1

        st1 {v10.b}[4], [RGB], #1
        st1 {v11.b}[4], [RGB], #1
        st1 {v12.b}[4], [RGB], #1

        st1 {v10.b}[5], [RGB], #1
        st1 {v11.b}[5], [RGB], #1
        st1 {v12.b}[5], [RGB], #1

        st1 {v10.b}[6], [RGB], #1
        st1 {v11.b}[6], [RGB], #1
        st1 {v12.b}[6], [RGB], #1

        st1 {v10.b}[7], [RGB], #1
        st1 {v11.b}[7], [RGB], #1
        st1 {v12.b}[7], [RGB], #1
      .endif
    .elseif \size == 4
      st3 {v10.b, v11.b, v12.b}[0], [RGB], 3
      st3 {v10.b, v11.b, v12.b}[1], [RGB], 3
      st3 {v10.b, v11.b, v12.b}[2], [RGB], 3
      st3 {v10.b, v11.b, v12.b}[3], [RGB], 3
    .elseif \size == 2
      st3 {v10.b, v11.b, v12.b}[4], [RGB], 3
      st3 {v10.b, v11.b, v12.b}[5], [RGB], 3
    .elseif \size == 1
      st3 {v10.b, v11.b, v12.b}[6], [RGB], 3
    .else
      .error unsupported macroblock size
    .endif
  .elseif \bpp == 32
    .if \size == 8
      st4 {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], 32
    .elseif \size == 4
      st4 {v10.b, v11.b, v12.b, v13.b}[0], [RGB], 4
      st4 {v10.b, v11.b, v12.b, v13.b}[1], [RGB], 4
      st4 {v10.b, v11.b, v12.b, v13.b}[2], [RGB], 4
      st4 {v10.b, v11.b, v12.b, v13.b}[3], [RGB], 4
    .elseif \size == 2
      st4 {v10.b, v11.b, v12.b, v13.b}[4], [RGB], 4
      st4 {v10.b, v11.b, v12.b, v13.b}[5], [RGB], 4
    .elseif \size == 1
      st4 {v10.b, v11.b, v12.b, v13.b}[6], [RGB], 4
    .else
      .error unsupported macroblock size
    .endif
  .elseif \bpp == 16
    .if \size == 8
      st1 {v25.8h}, [RGB], 16
    .elseif \size == 4
      st1 {v25.4h}, [RGB], 8
    .elseif \size == 2
      st1 {v25.h}[4], [RGB], 2
      st1 {v25.h}[5], [RGB], 2
    .elseif \size == 1
      st1 {v25.h}[6], [RGB], 2
    .else
      .error unsupported macroblock size
    .endif
  .else
    .error unsupported bpp
  .endif
.endm
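
/* For the rgb565 case, the sqshlu/sri sequences in the stage2 macros below
 * pack the three channels into one RGB565 halfword per pixel.  The net
 * effect, as a minimal C sketch (illustrative only):
 *
 *   unsigned short pack_rgb565(unsigned char r, unsigned char g,
 *                              unsigned char b)
 *   {
 *     return (unsigned short)(((r >> 3) << 11) |   // 5 bits of red
 *                             ((g >> 2) << 5) |    // 6 bits of green
 *                             (b >> 3));           // 5 bits of blue
 *   }
 */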
.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, \
                                           g_offs, gsize, b_offs, bsize, \
                                           defsize, fast_st3

/*
 * 2-stage pipelined YCbCr->RGB conversion
 */

.macro do_yuv_to_rgb_stage1
    uaddw v6.8h, v2.8h, v4.8b      /* v6.8h = u - 128 */
    uaddw v8.8h, v2.8h, v5.8b      /* v8.8h = v - 128 */
    smull v20.4s, v6.4h, v1.h[1]   /* multiply by -11277 */
    smlal v20.4s, v8.4h, v1.h[2]   /* multiply by -23401 */
    smull2 v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
    smlal2 v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
    smull v24.4s, v8.4h, v1.h[0]   /* multiply by 22971 */
    smull2 v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
    smull v28.4s, v6.4h, v1.h[3]   /* multiply by 29033 */
    smull2 v30.4s, v6.8h, v1.h[3]  /* multiply by 29033 */
.endm

.macro do_yuv_to_rgb_stage2
    rshrn v20.4h, v20.4s, #15
    rshrn2 v20.8h, v22.4s, #15
    rshrn v24.4h, v24.4s, #14
    rshrn2 v24.8h, v26.4s, #14
    rshrn v28.4h, v28.4s, #14
    rshrn2 v28.8h, v30.4s, #14
    uaddw v20.8h, v20.8h, v0.8b
    uaddw v24.8h, v24.8h, v0.8b
    uaddw v28.8h, v28.8h, v0.8b
  .if \bpp != 16
    sqxtun v1\g_offs\defsize, v20.8h
    sqxtun v1\r_offs\defsize, v24.8h
    sqxtun v1\b_offs\defsize, v28.8h
  .else
    sqshlu v21.8h, v20.8h, #8
    sqshlu v25.8h, v24.8h, #8
    sqshlu v29.8h, v28.8h, #8
    sri v25.8h, v21.8h, #5
    sri v25.8h, v29.8h, #11
  .endif
.endm

.macro do_yuv_to_rgb_stage2_store_load_stage1 fast_st3
    rshrn v20.4h, v20.4s, #15
    rshrn v24.4h, v24.4s, #14
    rshrn v28.4h, v28.4s, #14
    ld1 {v4.8b}, [U], 8
    rshrn2 v20.8h, v22.4s, #15
    rshrn2 v24.8h, v26.4s, #14
    rshrn2 v28.8h, v30.4s, #14
    ld1 {v5.8b}, [V], 8
    uaddw v20.8h, v20.8h, v0.8b
    uaddw v24.8h, v24.8h, v0.8b
    uaddw v28.8h, v28.8h, v0.8b
  .if \bpp != 16  /**************** rgb24/rgb32 *****************************/
    sqxtun v1\g_offs\defsize, v20.8h
    ld1 {v0.8b}, [Y], 8
    sqxtun v1\r_offs\defsize, v24.8h
    prfm pldl1keep, [U, #64]
    prfm pldl1keep, [V, #64]
    prfm pldl1keep, [Y, #64]
    sqxtun v1\b_offs\defsize, v28.8h
    uaddw v6.8h, v2.8h, v4.8b      /* v6.8h = u - 128 */
    uaddw v8.8h, v2.8h, v5.8b      /* v8.8h = v - 128 */
    smull v20.4s, v6.4h, v1.h[1]   /* multiply by -11277 */
    smlal v20.4s, v8.4h, v1.h[2]   /* multiply by -23401 */
    smull2 v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
    smlal2 v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
    smull v24.4s, v8.4h, v1.h[0]   /* multiply by 22971 */
    smull2 v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
  .else  /**************************** rgb565 *******************************/
    sqshlu v21.8h, v20.8h, #8
    sqshlu v25.8h, v24.8h, #8
    sqshlu v29.8h, v28.8h, #8
    uaddw v6.8h, v2.8h, v4.8b      /* v6.8h = u - 128 */
    uaddw v8.8h, v2.8h, v5.8b      /* v8.8h = v - 128 */
    ld1 {v0.8b}, [Y], 8
    smull v20.4s, v6.4h, v1.h[1]   /* multiply by -11277 */
    smlal v20.4s, v8.4h, v1.h[2]   /* multiply by -23401 */
    smull2 v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
    smlal2 v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
    sri v25.8h, v21.8h, #5
    smull v24.4s, v8.4h, v1.h[0]   /* multiply by 22971 */
    smull2 v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
    prfm pldl1keep, [U, #64]
    prfm pldl1keep, [V, #64]
    prfm pldl1keep, [Y, #64]
    sri v25.8h, v29.8h, #11
  .endif
    do_store \bpp, 8, \fast_st3
    smull v28.4s, v6.4h, v1.h[3]   /* multiply by 29033 */
    smull2 v30.4s, v6.8h, v1.h[3]  /* multiply by 29033 */
.endm
.macro do_yuv_to_rgb
    do_yuv_to_rgb_stage1
    do_yuv_to_rgb_stage2
.endm

.if \fast_st3 == 1
asm_function jsimd_ycc_\colorid\()_convert_neon
.else
asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
.endif
    OUTPUT_WIDTH .req w0
    INPUT_BUF    .req x1
    INPUT_ROW    .req w2
    OUTPUT_BUF   .req x3
    NUM_ROWS     .req w4

    INPUT_BUF0   .req x5
    INPUT_BUF1   .req x6
    INPUT_BUF2   .req x1

    RGB          .req x7
    Y            .req x9
    U            .req x10
    V            .req x11
    N            .req w15

    sub sp, sp, 64
    mov x9, sp

    /* Load constants into v0.4h, v1.4h, v2.8h (v0.4h is just used for
       padding) */
    get_symbol_loc x15, Ljsimd_ycc_rgb_neon_consts

    /* Save NEON registers */
    st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
    st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
    ld1 {v0.4h, v1.4h}, [x15], 16
    ld1 {v2.8h}, [x15]

    ldr INPUT_BUF0, [INPUT_BUF]
    ldr INPUT_BUF1, [INPUT_BUF, #8]
    ldr INPUT_BUF2, [INPUT_BUF, #16]
    .unreq INPUT_BUF

    /* Initially set v10.16b and v13.16b to 0xFF */
    movi v10.16b, #255
    movi v13.16b, #255

    /* Outer loop over scanlines */
    cmp NUM_ROWS, #1
    b.lt 9f
0:
    ldr Y, [INPUT_BUF0, INPUT_ROW, uxtw #3]
    ldr U, [INPUT_BUF1, INPUT_ROW, uxtw #3]
    mov N, OUTPUT_WIDTH
    ldr V, [INPUT_BUF2, INPUT_ROW, uxtw #3]
    add INPUT_ROW, INPUT_ROW, #1
    ldr RGB, [OUTPUT_BUF], #8

    /* Inner loop over pixels */
    subs N, N, #8
    b.lt 3f
    do_load 8
    do_yuv_to_rgb_stage1
    subs N, N, #8
    b.lt 2f
1:
    do_yuv_to_rgb_stage2_store_load_stage1 \fast_st3
    subs N, N, #8
    b.ge 1b
2:
    do_yuv_to_rgb_stage2
    do_store \bpp, 8, \fast_st3
    tst N, #7
    b.eq 8f
3:
    tst N, #4
    b.eq 3f
    do_load 4
3:
    tst N, #2
    b.eq 4f
    do_load 2
4:
    tst N, #1
    b.eq 5f
    do_load 1
5:
    do_yuv_to_rgb
    tst N, #4
    b.eq 6f
    do_store \bpp, 4, \fast_st3
6:
    tst N, #2
    b.eq 7f
    do_store \bpp, 2, \fast_st3
7:
    tst N, #1
    b.eq 8f
    do_store \bpp, 1, \fast_st3
8:
    subs NUM_ROWS, NUM_ROWS, #1
    b.gt 0b
9:
    /* Restore all registers and return */
    ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    br x30
    .unreq OUTPUT_WIDTH
    .unreq INPUT_ROW
    .unreq OUTPUT_BUF
    .unreq NUM_ROWS
    .unreq INPUT_BUF0
    .unreq INPUT_BUF1
    .unreq INPUT_BUF2
    .unreq RGB
    .unreq Y
    .unreq U
    .unreq V
    .unreq N

.purgem do_yuv_to_rgb
.purgem do_yuv_to_rgb_stage1
.purgem do_yuv_to_rgb_stage2
.purgem do_yuv_to_rgb_stage2_store_load_stage1

.endm

/*--------------------------------- id ----- bpp R  rsize G  gsize B  bsize defsize fast_st3 */
generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, .4h,  1, .4h,  2, .4h,  .8b,    1
generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h,  1, .4h,  0, .4h,  .8b,    1
generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h,  1, .4h,  2, .4h,  .8b,    1
generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h,  1, .4h,  0, .4h,  .8b,    1
generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h,  2, .4h,  1, .4h,  .8b,    1
generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h,  2, .4h,  3, .4h,  .8b,    1
generate_jsimd_ycc_rgb_convert_neon rgb565,  16, 0, .4h,  0, .4h,  0, .4h,  .8b,    1

generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, .4h,  1, .4h,  2, .4h,  .8b,    0
generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h,  1, .4h,  0, .4h,  .8b,    0

.purgem do_load
.purgem do_store


/*****************************************************************************/

/*
 * jsimd_extrgb_ycc_convert_neon
 * jsimd_extbgr_ycc_convert_neon
 * jsimd_extrgbx_ycc_convert_neon
 * jsimd_extbgrx_ycc_convert_neon
 * jsimd_extxbgr_ycc_convert_neon
 * jsimd_extxrgb_ycc_convert_neon
 *
 * Colorspace conversion RGB -> YCbCr
 */
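
/* A minimal C sketch of the per-pixel math implemented by the stage1/stage2
 * macros below (the usual libjpeg jccolor.c constants, scaled by 2^16; the
 * Cb/Cr rows carry a 128 offset plus a 32767 correction term, which is why
 * shrn (truncating) rather than rshrn (rounding) is used for them):
 *
 *   void rgb_to_ycc(int r, int g, int b, unsigned char out[3])
 *   {
 *     out[0] = (unsigned char)                                       // Y
 *       ((19595 * r + 38470 * g + 7471 * b + 32768) >> 16);
 *     out[1] = (unsigned char)                                       // Cb
 *       (((128 << 16) + 32767 - 11059 * r - 21709 * g + 32768 * b) >> 16);
 *     out[2] = (unsigned char)                                       // Cr
 *       (((128 << 16) + 32767 + 32768 * r - 27439 * g - 5329 * b) >> 16);
 *   }
 */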
.macro do_store size
  .if \size == 8
    st1 {v20.8b}, [Y], #8
    st1 {v21.8b}, [U], #8
    st1 {v22.8b}, [V], #8
  .elseif \size == 4
    st1 {v20.b}[0], [Y], #1
    st1 {v20.b}[1], [Y], #1
    st1 {v20.b}[2], [Y], #1
    st1 {v20.b}[3], [Y], #1
    st1 {v21.b}[0], [U], #1
    st1 {v21.b}[1], [U], #1
    st1 {v21.b}[2], [U], #1
    st1 {v21.b}[3], [U], #1
    st1 {v22.b}[0], [V], #1
    st1 {v22.b}[1], [V], #1
    st1 {v22.b}[2], [V], #1
    st1 {v22.b}[3], [V], #1
  .elseif \size == 2
    st1 {v20.b}[4], [Y], #1
    st1 {v20.b}[5], [Y], #1
    st1 {v21.b}[4], [U], #1
    st1 {v21.b}[5], [U], #1
    st1 {v22.b}[4], [V], #1
    st1 {v22.b}[5], [V], #1
  .elseif \size == 1
    st1 {v20.b}[6], [Y], #1
    st1 {v21.b}[6], [U], #1
    st1 {v22.b}[6], [V], #1
  .else
    .error unsupported macroblock size
  .endif
.endm

.macro do_load bpp, size, fast_ld3
  .if \bpp == 24
    .if \size == 8
      .if \fast_ld3 == 1
        ld3 {v10.8b, v11.8b, v12.8b}, [RGB], #24
      .else
        ld1 {v10.b}[0], [RGB], #1
        ld1 {v11.b}[0], [RGB], #1
        ld1 {v12.b}[0], [RGB], #1

        ld1 {v10.b}[1], [RGB], #1
        ld1 {v11.b}[1], [RGB], #1
        ld1 {v12.b}[1], [RGB], #1

        ld1 {v10.b}[2], [RGB], #1
        ld1 {v11.b}[2], [RGB], #1
        ld1 {v12.b}[2], [RGB], #1

        ld1 {v10.b}[3], [RGB], #1
        ld1 {v11.b}[3], [RGB], #1
        ld1 {v12.b}[3], [RGB], #1

        ld1 {v10.b}[4], [RGB], #1
        ld1 {v11.b}[4], [RGB], #1
        ld1 {v12.b}[4], [RGB], #1

        ld1 {v10.b}[5], [RGB], #1
        ld1 {v11.b}[5], [RGB], #1
        ld1 {v12.b}[5], [RGB], #1

        ld1 {v10.b}[6], [RGB], #1
        ld1 {v11.b}[6], [RGB], #1
        ld1 {v12.b}[6], [RGB], #1

        ld1 {v10.b}[7], [RGB], #1
        ld1 {v11.b}[7], [RGB], #1
        ld1 {v12.b}[7], [RGB], #1
      .endif
      prfm pldl1keep, [RGB, #128]
    .elseif \size == 4
      ld3 {v10.b, v11.b, v12.b}[0], [RGB], #3
      ld3 {v10.b, v11.b, v12.b}[1], [RGB], #3
      ld3 {v10.b, v11.b, v12.b}[2], [RGB], #3
      ld3 {v10.b, v11.b, v12.b}[3], [RGB], #3
    .elseif \size == 2
      ld3 {v10.b, v11.b, v12.b}[4], [RGB], #3
      ld3 {v10.b, v11.b, v12.b}[5], [RGB], #3
    .elseif \size == 1
      ld3 {v10.b, v11.b, v12.b}[6], [RGB], #3
    .else
      .error unsupported macroblock size
    .endif
  .elseif \bpp == 32
    .if \size == 8
      ld4 {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], #32
      prfm pldl1keep, [RGB, #128]
    .elseif \size == 4
      ld4 {v10.b, v11.b, v12.b, v13.b}[0], [RGB], #4
      ld4 {v10.b, v11.b, v12.b, v13.b}[1], [RGB], #4
      ld4 {v10.b, v11.b, v12.b, v13.b}[2], [RGB], #4
      ld4 {v10.b, v11.b, v12.b, v13.b}[3], [RGB], #4
    .elseif \size == 2
      ld4 {v10.b, v11.b, v12.b, v13.b}[4], [RGB], #4
      ld4 {v10.b, v11.b, v12.b, v13.b}[5], [RGB], #4
    .elseif \size == 1
      ld4 {v10.b, v11.b, v12.b, v13.b}[6], [RGB], #4
    .else
      .error unsupported macroblock size
    .endif
  .else
    .error unsupported bpp
  .endif
.endm

.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, \
                                           b_offs, fast_ld3

/*
 * 2-stage pipelined RGB->YCbCr conversion
 */

.macro do_rgb_to_yuv_stage1
    ushll v4.8h, v1\r_offs\().8b, #0  /* r = v4 */
    ushll v6.8h, v1\g_offs\().8b, #0  /* g = v6 */
    ushll v8.8h, v1\b_offs\().8b, #0  /* b = v8 */
    rev64 v18.4s, v1.4s
    rev64 v26.4s, v1.4s
    rev64 v28.4s, v1.4s
    rev64 v30.4s, v1.4s
    umull v14.4s, v4.4h, v0.h[0]
    umull2 v16.4s, v4.8h, v0.h[0]
    umlsl v18.4s, v4.4h, v0.h[3]
    umlsl2 v26.4s, v4.8h, v0.h[3]
    umlal v28.4s, v4.4h, v0.h[5]
    umlal2 v30.4s, v4.8h, v0.h[5]
    umlal v14.4s, v6.4h, v0.h[1]
    umlal2 v16.4s, v6.8h, v0.h[1]
    umlsl v18.4s, v6.4h, v0.h[4]
    umlsl2 v26.4s, v6.8h, v0.h[4]
    umlsl v28.4s, v6.4h, v0.h[6]
    umlsl2 v30.4s, v6.8h, v0.h[6]
    umlal v14.4s, v8.4h, v0.h[2]
    umlal2 v16.4s, v8.8h, v0.h[2]
    umlal v18.4s, v8.4h, v0.h[5]
    umlal2 v26.4s, v8.8h, v0.h[5]
    umlsl v28.4s, v8.4h, v0.h[7]
    umlsl2 v30.4s, v8.8h, v0.h[7]
.endm

.macro do_rgb_to_yuv_stage2
    rshrn v20.4h, v14.4s, #16
    shrn v22.4h, v18.4s, #16
    shrn v24.4h, v28.4s, #16
    rshrn2 v20.8h, v16.4s, #16
    shrn2 v22.8h, v26.4s, #16
    shrn2 v24.8h, v30.4s, #16
    xtn v20.8b, v20.8h  /* v20 = y */
    xtn v21.8b, v22.8h  /* v21 = u */
    xtn v22.8b, v24.8h  /* v22 = v */
.endm

.macro do_rgb_to_yuv
    do_rgb_to_yuv_stage1
    do_rgb_to_yuv_stage2
.endm

/* TODO: expand macros and interleave instructions, in case some in-order
 * ARM64 processor can actually dual-issue LOAD/STORE with ALU */
.macro do_rgb_to_yuv_stage2_store_load_stage1 fast_ld3
    do_rgb_to_yuv_stage2
    do_load \bpp, 8, \fast_ld3
    st1 {v20.8b}, [Y], #8
    st1 {v21.8b}, [U], #8
    st1 {v22.8b}, [V], #8
    do_rgb_to_yuv_stage1
.endm

.if \fast_ld3 == 1
asm_function jsimd_\colorid\()_ycc_convert_neon
.else
asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
.endif
    OUTPUT_WIDTH .req w0
    INPUT_BUF    .req x1
    OUTPUT_BUF   .req x2
    OUTPUT_ROW   .req w3
    NUM_ROWS     .req w4

    OUTPUT_BUF0  .req x5
    OUTPUT_BUF1  .req x6
    OUTPUT_BUF2  .req x2  /* OUTPUT_BUF */

    RGB          .req x7
    Y            .req x9
    U            .req x10
    V            .req x11
    N            .req w12

    /* Load constants into v0.8h, v1.8h */
    get_symbol_loc x13, Ljsimd_rgb_ycc_neon_consts
    ld1 {v0.8h, v1.8h}, [x13]

    ldr OUTPUT_BUF0, [OUTPUT_BUF]
    ldr OUTPUT_BUF1, [OUTPUT_BUF, #8]
    ldr OUTPUT_BUF2, [OUTPUT_BUF, #16]
    .unreq OUTPUT_BUF

    /* Save NEON registers */
    sub sp, sp, #64
    mov x9, sp
    st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
    st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32

    /* Outer loop over scanlines */
    cmp NUM_ROWS, #1
    b.lt 9f
0:
    ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, uxtw #3]
    ldr U, [OUTPUT_BUF1, OUTPUT_ROW, uxtw #3]
    mov N, OUTPUT_WIDTH
    ldr V, [OUTPUT_BUF2, OUTPUT_ROW, uxtw #3]
    add OUTPUT_ROW, OUTPUT_ROW, #1
    ldr RGB, [INPUT_BUF], #8

    /* Inner loop over pixels */
    subs N, N, #8
    b.lt 3f
    do_load \bpp, 8, \fast_ld3
    do_rgb_to_yuv_stage1
    subs N, N, #8
    b.lt 2f
1:
    do_rgb_to_yuv_stage2_store_load_stage1 \fast_ld3
    subs N, N, #8
    b.ge 1b
2:
    do_rgb_to_yuv_stage2
    do_store 8
    tst N, #7
    b.eq 8f
3:
    tbz N, #2, 3f
    do_load \bpp, 4, \fast_ld3
3:
    tbz N, #1, 4f
    do_load \bpp, 2, \fast_ld3
4:
    tbz N, #0, 5f
    do_load \bpp, 1, \fast_ld3
5:
    do_rgb_to_yuv
    tbz N, #2, 6f
    do_store 4
6:
    tbz N, #1, 7f
    do_store 2
7:
    tbz N, #0, 8f
    do_store 1
8:
    subs NUM_ROWS, NUM_ROWS, #1
    b.gt 0b
9:
    /* Restore all registers and return */
    ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    br x30

    .unreq OUTPUT_WIDTH
    .unreq OUTPUT_ROW
    .unreq INPUT_BUF
    .unreq NUM_ROWS
    .unreq OUTPUT_BUF0
    .unreq OUTPUT_BUF1
    .unreq OUTPUT_BUF2
    .unreq RGB
    .unreq Y
    .unreq U
    .unreq V
    .unreq N

.purgem do_rgb_to_yuv
.purgem do_rgb_to_yuv_stage1
.purgem do_rgb_to_yuv_stage2
.purgem do_rgb_to_yuv_stage2_store_load_stage1

.endm

/*--------------------------------- id ----- bpp R  G  B  Fast LD3 */
generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2, 1
generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0, 1
generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2, 1
generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0, 1
generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1, 1
generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3, 1

generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2, 0
generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0, 0

.purgem do_load
.purgem do_store


/*****************************************************************************/

/*
 * Load data into workspace, applying unsigned->signed conversion
 *
 * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
 * rid of the intermediate ST1 stores
 */
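
/* A minimal C sketch of what jsimd_convsamp_neon below computes: widen each
 * 8-bit sample to 16 bits and subtract CENTERJSAMPLE (128), one usubl per
 * 8-sample row:
 *
 *   void convsamp(const unsigned char *sample_data[8],
 *                 unsigned int start_col, short workspace[64])
 *   {
 *     for (int row = 0; row < 8; row++)
 *       for (int col = 0; col < 8; col++)
 *         workspace[row * 8 + col] =
 *           (short)sample_data[row][start_col + col] - 128;
 *   }
 */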
asm_function jsimd_convsamp_neon
    SAMPLE_DATA .req x0
    START_COL   .req x1
    WORKSPACE   .req x2
    TMP1        .req x9
    TMP2        .req x10
    TMP3        .req x11
    TMP4        .req x12
    TMP5        .req x13
    TMP6        .req x14
    TMP7        .req x15
    TMP8        .req x4
    TMPDUP      .req w3

    /* START_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
       guarantee that the upper (unused) 32 bits of x1 are valid.  This
       instruction ensures that those bits are set to zero. */
    uxtw x1, w1

    mov TMPDUP, #128
    ldp TMP1, TMP2, [SAMPLE_DATA], 16
    ldp TMP3, TMP4, [SAMPLE_DATA], 16
    dup v0.8b, TMPDUP
    add TMP1, TMP1, START_COL
    add TMP2, TMP2, START_COL
    ldp TMP5, TMP6, [SAMPLE_DATA], 16
    add TMP3, TMP3, START_COL
    add TMP4, TMP4, START_COL
    ldp TMP7, TMP8, [SAMPLE_DATA], 16
    add TMP5, TMP5, START_COL
    add TMP6, TMP6, START_COL
    ld1 {v16.8b}, [TMP1]
    add TMP7, TMP7, START_COL
    add TMP8, TMP8, START_COL
    ld1 {v17.8b}, [TMP2]
    usubl v16.8h, v16.8b, v0.8b
    ld1 {v18.8b}, [TMP3]
    usubl v17.8h, v17.8b, v0.8b
    ld1 {v19.8b}, [TMP4]
    usubl v18.8h, v18.8b, v0.8b
    ld1 {v20.8b}, [TMP5]
    usubl v19.8h, v19.8b, v0.8b
    ld1 {v21.8b}, [TMP6]
    st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [WORKSPACE], 64
    usubl v20.8h, v20.8b, v0.8b
    ld1 {v22.8b}, [TMP7]
    usubl v21.8h, v21.8b, v0.8b
    ld1 {v23.8b}, [TMP8]
    usubl v22.8h, v22.8b, v0.8b
    usubl v23.8h, v23.8b, v0.8b
    st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [WORKSPACE], 64

    br x30

    .unreq SAMPLE_DATA
    .unreq START_COL
    .unreq WORKSPACE
    .unreq TMP1
    .unreq TMP2
    .unreq TMP3
    .unreq TMP4
    .unreq TMP5
    .unreq TMP6
    .unreq TMP7
    .unreq TMP8
    .unreq TMPDUP

/*****************************************************************************/

/*
 * jsimd_fdct_islow_neon
 *
 * This function contains a slow-but-accurate integer implementation of the
 * forward DCT (Discrete Cosine Transform).  The following code is based
 * directly on IJG's original jfdctint.c; see jfdctint.c for more details.
 *
 * TODO: can be combined with 'jsimd_convsamp_neon' to get
 * rid of a bunch of LD1 instructions
 */

#define CONST_BITS 13
#define PASS1_BITS 2

#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
#define DESCALE_P2 (CONST_BITS + PASS1_BITS)

#define XFIX_P_0_298 v0.h[0]
#define XFIX_N_0_390 v0.h[1]
#define XFIX_P_0_541 v0.h[2]
#define XFIX_P_0_765 v0.h[3]
#define XFIX_N_0_899 v0.h[4]
#define XFIX_P_1_175 v0.h[5]
#define XFIX_P_1_501 v0.h[6]
#define XFIX_N_1_847 v0.h[7]
#define XFIX_N_1_961 v1.h[0]
#define XFIX_P_2_053 v1.h[1]
#define XFIX_N_2_562 v1.h[2]
#define XFIX_P_3_072 v1.h[3]

asm_function jsimd_fdct_islow_neon

    DATA .req x0
    TMP  .req x9

    /* Load constants */
    get_symbol_loc TMP, Ljsimd_fdct_islow_neon_consts
    ld1 {v0.8h, v1.8h}, [TMP]

    /* Save NEON registers */
    sub sp, sp, #64
    mov x10, sp
    st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], 32
    st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], 32

    /* Load all DATA into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  | d16    | d17    | v16.8h
     *   1  | d18    | d19    | v17.8h
     *   2  | d20    | d21    | v18.8h
     *   3  | d22    | d23    | v19.8h
     *   4  | d24    | d25    | v20.8h
     *   5  | d26    | d27    | v21.8h
     *   6  | d28    | d29    | v22.8h
     *   7  | d30    | d31    | v23.8h
     */

    ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
    ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
    sub DATA, DATA, #64

    /* Transpose */
    transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4
    /* 1-D FDCT */
    add v24.8h, v16.8h, v23.8h  /* tmp0 = dataptr[0] + dataptr[7]; */
    sub v31.8h, v16.8h, v23.8h  /* tmp7 = dataptr[0] - dataptr[7]; */
    add v25.8h, v17.8h, v22.8h  /* tmp1 = dataptr[1] + dataptr[6]; */
    sub v30.8h, v17.8h, v22.8h  /* tmp6 = dataptr[1] - dataptr[6]; */
    add v26.8h, v18.8h, v21.8h  /* tmp2 = dataptr[2] + dataptr[5]; */
    sub v29.8h, v18.8h, v21.8h  /* tmp5 = dataptr[2] - dataptr[5]; */
    add v27.8h, v19.8h, v20.8h  /* tmp3 = dataptr[3] + dataptr[4]; */
    sub v28.8h, v19.8h, v20.8h  /* tmp4 = dataptr[3] - dataptr[4]; */

    /* even part */

    add v8.8h, v24.8h, v27.8h   /* tmp10 = tmp0 + tmp3; */
    sub v9.8h, v24.8h, v27.8h   /* tmp13 = tmp0 - tmp3; */
    add v10.8h, v25.8h, v26.8h  /* tmp11 = tmp1 + tmp2; */
    sub v11.8h, v25.8h, v26.8h  /* tmp12 = tmp1 - tmp2; */

    add v16.8h, v8.8h, v10.8h   /* tmp10 + tmp11 */
    sub v20.8h, v8.8h, v10.8h   /* tmp10 - tmp11 */

    add v18.8h, v11.8h, v9.8h   /* tmp12 + tmp13 */

    shl v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM)LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS); */
    shl v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM)LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS); */

    smull2 v24.4s, v18.8h, XFIX_P_0_541  /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
    smull v18.4s, v18.4h, XFIX_P_0_541   /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
    mov v22.16b, v18.16b
    mov v25.16b, v24.16b

    smlal v18.4s, v9.4h, XFIX_P_0_765    /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
    smlal2 v24.4s, v9.8h, XFIX_P_0_765   /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
    smlal v22.4s, v11.4h, XFIX_N_1_847   /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
    smlal2 v25.4s, v11.8h, XFIX_N_1_847  /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */

    rshrn v18.4h, v18.4s, #DESCALE_P1
    rshrn v22.4h, v22.4s, #DESCALE_P1
    rshrn2 v18.8h, v24.4s, #DESCALE_P1   /* dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
    rshrn2 v22.8h, v25.4s, #DESCALE_P1   /* dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */

    /* Odd part */

    add v8.8h, v28.8h, v31.8h   /* z1 = tmp4 + tmp7; */
    add v9.8h, v29.8h, v30.8h   /* z2 = tmp5 + tmp6; */
    add v10.8h, v28.8h, v30.8h  /* z3 = tmp4 + tmp6; */
    add v11.8h, v29.8h, v31.8h  /* z4 = tmp5 + tmp7; */
    smull v4.4s, v10.4h, XFIX_P_1_175   /* z5 lo = z3 lo * XFIX_P_1_175 */
    smull2 v5.4s, v10.8h, XFIX_P_1_175
    smlal v4.4s, v11.4h, XFIX_P_1_175   /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
    smlal2 v5.4s, v11.8h, XFIX_P_1_175

    smull2 v24.4s, v28.8h, XFIX_P_0_298
    smull2 v25.4s, v29.8h, XFIX_P_2_053
    smull2 v26.4s, v30.8h, XFIX_P_3_072
    smull2 v27.4s, v31.8h, XFIX_P_1_501
    smull v28.4s, v28.4h, XFIX_P_0_298  /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
    smull v29.4s, v29.4h, XFIX_P_2_053  /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
    smull v30.4s, v30.4h, XFIX_P_3_072  /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
    smull v31.4s, v31.4h, XFIX_P_1_501  /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */

    smull2 v12.4s, v8.8h, XFIX_N_0_899
    smull2 v13.4s, v9.8h, XFIX_N_2_562
    smull2 v14.4s, v10.8h, XFIX_N_1_961
    smull2 v15.4s, v11.8h, XFIX_N_0_390
    smull v8.4s, v8.4h, XFIX_N_0_899    /* z1 = MULTIPLY(z1, -FIX_0_899976223); */
    smull v9.4s, v9.4h, XFIX_N_2_562    /* z2 = MULTIPLY(z2, -FIX_2_562915447); */
    smull v10.4s, v10.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560); */
    smull v11.4s, v11.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644); */

    add v10.4s, v10.4s, v4.4s   /* z3 += z5 */
    add v14.4s, v14.4s, v5.4s
    add v11.4s, v11.4s, v4.4s   /* z4 += z5 */
    add v15.4s, v15.4s, v5.4s

    add v28.4s, v28.4s, v8.4s   /* tmp4 += z1 */
    add v24.4s, v24.4s, v12.4s
    add v29.4s, v29.4s, v9.4s   /* tmp5 += z2 */
    add v25.4s, v25.4s, v13.4s
    add v30.4s, v30.4s, v10.4s  /* tmp6 += z3 */
    add v26.4s, v26.4s, v14.4s
    add v31.4s, v31.4s, v11.4s  /* tmp7 += z4 */
    add v27.4s, v27.4s, v15.4s

    add v28.4s, v28.4s, v10.4s  /* tmp4 += z3 */
    add v24.4s, v24.4s, v14.4s
    add v29.4s, v29.4s, v11.4s  /* tmp5 += z4 */
    add v25.4s, v25.4s, v15.4s
    add v30.4s, v30.4s, v9.4s   /* tmp6 += z2 */
    add v26.4s, v26.4s, v13.4s
    add v31.4s, v31.4s, v8.4s   /* tmp7 += z1 */
    add v27.4s, v27.4s, v12.4s

    rshrn v23.4h, v28.4s, #DESCALE_P1
    rshrn v21.4h, v29.4s, #DESCALE_P1
    rshrn v19.4h, v30.4s, #DESCALE_P1
    rshrn v17.4h, v31.4s, #DESCALE_P1
    rshrn2 v23.8h, v24.4s, #DESCALE_P1  /* dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
    rshrn2 v21.8h, v25.4s, #DESCALE_P1  /* dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
    rshrn2 v19.8h, v26.4s, #DESCALE_P1  /* dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
    rshrn2 v17.8h, v27.4s, #DESCALE_P1  /* dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */

    /* Transpose */
    transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4

    /* 1-D FDCT */
    add v24.8h, v16.8h, v23.8h  /* tmp0 = dataptr[0] + dataptr[7]; */
    sub v31.8h, v16.8h, v23.8h  /* tmp7 = dataptr[0] - dataptr[7]; */
    add v25.8h, v17.8h, v22.8h  /* tmp1 = dataptr[1] + dataptr[6]; */
    sub v30.8h, v17.8h, v22.8h  /* tmp6 = dataptr[1] - dataptr[6]; */
    add v26.8h, v18.8h, v21.8h  /* tmp2 = dataptr[2] + dataptr[5]; */
    sub v29.8h, v18.8h, v21.8h  /* tmp5 = dataptr[2] - dataptr[5]; */
    add v27.8h, v19.8h, v20.8h  /* tmp3 = dataptr[3] + dataptr[4]; */
    sub v28.8h, v19.8h, v20.8h  /* tmp4 = dataptr[3] - dataptr[4]; */

    /* even part */
    add v8.8h, v24.8h, v27.8h   /* tmp10 = tmp0 + tmp3; */
    sub v9.8h, v24.8h, v27.8h   /* tmp13 = tmp0 - tmp3; */
    add v10.8h, v25.8h, v26.8h  /* tmp11 = tmp1 + tmp2; */
    sub v11.8h, v25.8h, v26.8h  /* tmp12 = tmp1 - tmp2; */

    add v16.8h, v8.8h, v10.8h   /* tmp10 + tmp11 */
    sub v20.8h, v8.8h, v10.8h   /* tmp10 - tmp11 */

    add v18.8h, v11.8h, v9.8h   /* tmp12 + tmp13 */

    srshr v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM)DESCALE(tmp10 + tmp11, PASS1_BITS); */
    srshr v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM)DESCALE(tmp10 - tmp11, PASS1_BITS); */

    smull2 v24.4s, v18.8h, XFIX_P_0_541  /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
    smull v18.4s, v18.4h, XFIX_P_0_541   /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
    mov v22.16b, v18.16b
    mov v25.16b, v24.16b

    smlal v18.4s, v9.4h, XFIX_P_0_765    /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
    smlal2 v24.4s, v9.8h, XFIX_P_0_765   /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
    smlal v22.4s, v11.4h, XFIX_N_1_847   /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
    smlal2 v25.4s, v11.8h, XFIX_N_1_847  /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */

    rshrn v18.4h, v18.4s, #DESCALE_P2
    rshrn v22.4h, v22.4s, #DESCALE_P2
    rshrn2 v18.8h, v24.4s, #DESCALE_P2   /* dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS+PASS1_BITS); */
    rshrn2 v22.8h, v25.4s, #DESCALE_P2   /* dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS+PASS1_BITS); */
    /* Odd part */
    add v8.8h, v28.8h, v31.8h   /* z1 = tmp4 + tmp7; */
    add v9.8h, v29.8h, v30.8h   /* z2 = tmp5 + tmp6; */
    add v10.8h, v28.8h, v30.8h  /* z3 = tmp4 + tmp6; */
    add v11.8h, v29.8h, v31.8h  /* z4 = tmp5 + tmp7; */

    smull v4.4s, v10.4h, XFIX_P_1_175   /* z5 lo = z3 lo * XFIX_P_1_175 */
    smull2 v5.4s, v10.8h, XFIX_P_1_175
    smlal v4.4s, v11.4h, XFIX_P_1_175   /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
    smlal2 v5.4s, v11.8h, XFIX_P_1_175

    smull2 v24.4s, v28.8h, XFIX_P_0_298
    smull2 v25.4s, v29.8h, XFIX_P_2_053
    smull2 v26.4s, v30.8h, XFIX_P_3_072
    smull2 v27.4s, v31.8h, XFIX_P_1_501
    smull v28.4s, v28.4h, XFIX_P_0_298  /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
    smull v29.4s, v29.4h, XFIX_P_2_053  /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
    smull v30.4s, v30.4h, XFIX_P_3_072  /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
    smull v31.4s, v31.4h, XFIX_P_1_501  /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */

    smull2 v12.4s, v8.8h, XFIX_N_0_899
    smull2 v13.4s, v9.8h, XFIX_N_2_562
    smull2 v14.4s, v10.8h, XFIX_N_1_961
    smull2 v15.4s, v11.8h, XFIX_N_0_390
    smull v8.4s, v8.4h, XFIX_N_0_899    /* z1 = MULTIPLY(z1, -FIX_0_899976223); */
    smull v9.4s, v9.4h, XFIX_N_2_562    /* z2 = MULTIPLY(z2, -FIX_2_562915447); */
    smull v10.4s, v10.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560); */
    smull v11.4s, v11.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644); */

    add v10.4s, v10.4s, v4.4s
    add v14.4s, v14.4s, v5.4s
    add v11.4s, v11.4s, v4.4s
    add v15.4s, v15.4s, v5.4s

    add v28.4s, v28.4s, v8.4s   /* tmp4 += z1 */
    add v24.4s, v24.4s, v12.4s
    add v29.4s, v29.4s, v9.4s   /* tmp5 += z2 */
    add v25.4s, v25.4s, v13.4s
    add v30.4s, v30.4s, v10.4s  /* tmp6 += z3 */
    add v26.4s, v26.4s, v14.4s
    add v31.4s, v31.4s, v11.4s  /* tmp7 += z4 */
    add v27.4s, v27.4s, v15.4s

    add v28.4s, v28.4s, v10.4s  /* tmp4 += z3 */
    add v24.4s, v24.4s, v14.4s
    add v29.4s, v29.4s, v11.4s  /* tmp5 += z4 */
    add v25.4s, v25.4s, v15.4s
    add v30.4s, v30.4s, v9.4s   /* tmp6 += z2 */
    add v26.4s, v26.4s, v13.4s
    add v31.4s, v31.4s, v8.4s   /* tmp7 += z1 */
    add v27.4s, v27.4s, v12.4s

    rshrn v23.4h, v28.4s, #DESCALE_P2
    rshrn v21.4h, v29.4s, #DESCALE_P2
    rshrn v19.4h, v30.4s, #DESCALE_P2
    rshrn v17.4h, v31.4s, #DESCALE_P2
    rshrn2 v23.8h, v24.4s, #DESCALE_P2  /* dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS+PASS1_BITS); */
    rshrn2 v21.8h, v25.4s, #DESCALE_P2  /* dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS+PASS1_BITS); */
    rshrn2 v19.8h, v26.4s, #DESCALE_P2  /* dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS+PASS1_BITS); */
    rshrn2 v17.8h, v27.4s, #DESCALE_P2  /* dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS+PASS1_BITS); */

    /* store results */
    st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
    st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]

    /* Restore NEON registers */
    ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32

    br x30

    .unreq DATA
    .unreq TMP

#undef XFIX_P_0_298
#undef XFIX_N_0_390
#undef XFIX_P_0_541
#undef XFIX_P_0_765
#undef XFIX_N_0_899
#undef XFIX_P_1_175
#undef XFIX_P_1_501
#undef XFIX_N_1_847
#undef XFIX_N_1_961
#undef XFIX_P_2_053
#undef XFIX_N_2_562
#undef XFIX_P_3_072
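
/* The srshr/rshrn #DESCALE_P1 and #DESCALE_P2 shifts above are the NEON
 * form of libjpeg's DESCALE macro, i.e. a rounding right shift:
 *
 *   #define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
 */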


/*****************************************************************************/

/*
 * jsimd_fdct_ifast_neon
 *
 * This function contains a fast, not so accurate integer implementation of
 * the forward DCT (Discrete Cosine Transform).  It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast'
 * function from jfdctfst.c.
 *
 * TODO: can be combined with 'jsimd_convsamp_neon' to get
 * rid of a bunch of LD1 instructions
 */

#undef XFIX_0_541196100
#define XFIX_0_382683433 v0.h[0]
#define XFIX_0_541196100 v0.h[1]
#define XFIX_0_707106781 v0.h[2]
#define XFIX_1_306562965 v0.h[3]

asm_function jsimd_fdct_ifast_neon

    DATA .req x0
    TMP  .req x9

    /* Load constants */
    get_symbol_loc TMP, Ljsimd_fdct_ifast_neon_consts
    ld1 {v0.4h}, [TMP]

    /* Load all DATA into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  | d16    | d17    | v16.8h
     *   1  | d18    | d19    | v17.8h
     *   2  | d20    | d21    | v18.8h
     *   3  | d22    | d23    | v19.8h
     *   4  | d24    | d25    | v20.8h
     *   5  | d26    | d27    | v21.8h
     *   6  | d28    | d29    | v22.8h
     *   7  | d30    | d31    | v23.8h
     */

    ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
    ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
    mov TMP, #2
    sub DATA, DATA, #64
1:
    /* Transpose */
    transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v1, v2, v3, v4
    subs TMP, TMP, #1
    /* 1-D FDCT */
    add v4.8h, v19.8h, v20.8h
    sub v20.8h, v19.8h, v20.8h
    sub v28.8h, v18.8h, v21.8h
    add v18.8h, v18.8h, v21.8h
    sub v29.8h, v17.8h, v22.8h
    add v17.8h, v17.8h, v22.8h
    sub v21.8h, v16.8h, v23.8h
    add v16.8h, v16.8h, v23.8h
    sub v6.8h, v17.8h, v18.8h
    sub v7.8h, v16.8h, v4.8h
    add v5.8h, v17.8h, v18.8h
    add v6.8h, v6.8h, v7.8h
    add v4.8h, v16.8h, v4.8h
    sqdmulh v6.8h, v6.8h, XFIX_0_707106781
    add v19.8h, v20.8h, v28.8h
    add v16.8h, v4.8h, v5.8h
    sub v20.8h, v4.8h, v5.8h
    add v5.8h, v28.8h, v29.8h
    add v29.8h, v29.8h, v21.8h
    sqdmulh v5.8h, v5.8h, XFIX_0_707106781
    sub v28.8h, v19.8h, v29.8h
    add v18.8h, v7.8h, v6.8h
    sqdmulh v28.8h, v28.8h, XFIX_0_382683433
    sub v22.8h, v7.8h, v6.8h
    sqdmulh v19.8h, v19.8h, XFIX_0_541196100
    sqdmulh v7.8h, v29.8h, XFIX_1_306562965
    add v6.8h, v21.8h, v5.8h
    sub v5.8h, v21.8h, v5.8h
    add v29.8h, v29.8h, v28.8h
    add v19.8h, v19.8h, v28.8h
    add v29.8h, v29.8h, v7.8h
    add v21.8h, v5.8h, v19.8h
    sub v19.8h, v5.8h, v19.8h
    add v17.8h, v6.8h, v29.8h
    sub v23.8h, v6.8h, v29.8h

    b.ne 1b

    /* store results */
    st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
    st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]

    br x30

    .unreq DATA
    .unreq TMP
#undef XFIX_0_382683433
#undef XFIX_0_541196100
#undef XFIX_0_707106781
#undef XFIX_1_306562965


/*****************************************************************************/

/*
 * GLOBAL(void)
 * jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors,
 *                     DCTELEM *workspace);
 */
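
/* A minimal C sketch of the per-coefficient math implemented below.  The
 * divisors block holds 64-entry tables of reciprocals, corrections, and
 * shift counts (the layout assumed by the CORRECTION/SHIFT offsets):
 *
 *   short quantize_one(short value, unsigned short recip,
 *                      unsigned short corr, unsigned short shift)
 *   {
 *     int sign = value >> 15;                  // 0 or -1 (sshr #15)
 *     unsigned int mag = (unsigned int)abs(value) + corr;
 *     mag = (mag * recip) >> 16;               // umull + shrn #16
 *     mag >>= shift;                           // ushl by negated shift
 *     return (short)((mag ^ sign) - sign);     // restore sign (eor/sub)
 *   }
 */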
asm_function jsimd_quantize_neon

    COEF_BLOCK .req x0
    DIVISORS   .req x1
    WORKSPACE  .req x2

    RECIPROCAL .req DIVISORS
    CORRECTION .req x9
    SHIFT      .req x10
    LOOP_COUNT .req x11

    mov LOOP_COUNT, #2
    add CORRECTION, DIVISORS, #(64 * 2)
    add SHIFT, DIVISORS, #(64 * 6)
1:
    subs LOOP_COUNT, LOOP_COUNT, #1
    ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [WORKSPACE], 64
    ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [CORRECTION], 64
    abs v20.8h, v0.8h
    abs v21.8h, v1.8h
    abs v22.8h, v2.8h
    abs v23.8h, v3.8h
    ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [RECIPROCAL], 64
    add v20.8h, v20.8h, v4.8h  /* add correction */
    add v21.8h, v21.8h, v5.8h
    add v22.8h, v22.8h, v6.8h
    add v23.8h, v23.8h, v7.8h
    umull v4.4s, v20.4h, v28.4h  /* multiply by reciprocal */
    umull2 v16.4s, v20.8h, v28.8h
    umull v5.4s, v21.4h, v29.4h
    umull2 v17.4s, v21.8h, v29.8h
    umull v6.4s, v22.4h, v30.4h  /* multiply by reciprocal */
    umull2 v18.4s, v22.8h, v30.8h
    umull v7.4s, v23.4h, v31.4h
    umull2 v19.4s, v23.8h, v31.8h
    ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [SHIFT], 64
    shrn v4.4h, v4.4s, #16
    shrn v5.4h, v5.4s, #16
    shrn v6.4h, v6.4s, #16
    shrn v7.4h, v7.4s, #16
    shrn2 v4.8h, v16.4s, #16
    shrn2 v5.8h, v17.4s, #16
    shrn2 v6.8h, v18.4s, #16
    shrn2 v7.8h, v19.4s, #16
    neg v24.8h, v24.8h
    neg v25.8h, v25.8h
    neg v26.8h, v26.8h
    neg v27.8h, v27.8h
    sshr v0.8h, v0.8h, #15  /* extract sign */
    sshr v1.8h, v1.8h, #15
    sshr v2.8h, v2.8h, #15
    sshr v3.8h, v3.8h, #15
    ushl v4.8h, v4.8h, v24.8h  /* shift */
    ushl v5.8h, v5.8h, v25.8h
    ushl v6.8h, v6.8h, v26.8h
    ushl v7.8h, v7.8h, v27.8h

    eor v4.16b, v4.16b, v0.16b  /* restore sign */
    eor v5.16b, v5.16b, v1.16b
    eor v6.16b, v6.16b, v2.16b
    eor v7.16b, v7.16b, v3.16b
    sub v4.8h, v4.8h, v0.8h
    sub v5.8h, v5.8h, v1.8h
    sub v6.8h, v6.8h, v2.8h
    sub v7.8h, v7.8h, v3.8h
    st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [COEF_BLOCK], 64

    b.ne 1b

    br x30  /* return */

    .unreq COEF_BLOCK
    .unreq DIVISORS
    .unreq WORKSPACE
    .unreq RECIPROCAL
    .unreq CORRECTION
    .unreq SHIFT
    .unreq LOOP_COUNT


/*****************************************************************************/

/*
 * Downsample pixel values of a single component.
 * This version handles the common case of 2:1 horizontal and 1:1 vertical,
 * without smoothing.
 *
 * GLOBAL(void)
 * jsimd_h2v1_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
 *                            JDIMENSION v_samp_factor,
 *                            JDIMENSION width_in_blocks,
 *                            JSAMPARRAY input_data, JSAMPARRAY output_data);
 */
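
/* A minimal C sketch of the h2v1 downsampling performed below, matching
 * h2v1_downsample in jcsample.c: average horizontal pairs with an
 * alternating 0/1 bias (the 0x00010000 pattern replicated into v16):
 *
 *   void h2v1_downsample_row(const unsigned char *in, unsigned char *out,
 *                            int out_cols)
 *   {
 *     int bias = 0;                       // 0, 1, 0, 1, ... per column
 *     for (int j = 0; j < out_cols; j++) {
 *       out[j] = (unsigned char)((in[2 * j] + in[2 * j + 1] + bias) >> 1);
 *       bias ^= 1;
 *     }
 *   }
 */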
asm_function jsimd_h2v1_downsample_neon
    IMAGE_WIDTH .req x0
    MAX_V_SAMP  .req x1
    V_SAMP      .req x2
    BLOCK_WIDTH .req x3
    INPUT_DATA  .req x4
    OUTPUT_DATA .req x5
    OUTPTR      .req x9
    INPTR       .req x10
    TMP1        .req x11
    TMP2        .req x12
    TMP3        .req x13
    TMPDUP      .req w15

    mov TMPDUP, #0x10000
    lsl TMP2, BLOCK_WIDTH, #4
    sub TMP2, TMP2, IMAGE_WIDTH
    get_symbol_loc TMP3, Ljsimd_h2_downsample_neon_consts
    add TMP3, TMP3, TMP2, lsl #4
    dup v16.4s, TMPDUP
    ld1 {v18.16b}, [TMP3]

1:  /* row loop */
    ldr INPTR, [INPUT_DATA], #8
    ldr OUTPTR, [OUTPUT_DATA], #8
    subs TMP1, BLOCK_WIDTH, #1
    b.eq 3f
2:  /* columns */
    ld1 {v0.16b}, [INPTR], #16
    mov v4.16b, v16.16b
    subs TMP1, TMP1, #1
    uadalp v4.8h, v0.16b
    shrn v6.8b, v4.8h, #1
    st1 {v6.8b}, [OUTPTR], #8
    b.ne 2b
3:  /* last columns */
    ld1 {v0.16b}, [INPTR]
    mov v4.16b, v16.16b
    subs V_SAMP, V_SAMP, #1
    /* expand right */
    tbl v2.16b, {v0.16b}, v18.16b
    uadalp v4.8h, v2.16b
    shrn v6.8b, v4.8h, #1
    st1 {v6.8b}, [OUTPTR], #8
    b.ne 1b

    br x30

    .unreq IMAGE_WIDTH
    .unreq MAX_V_SAMP
    .unreq V_SAMP
    .unreq BLOCK_WIDTH
    .unreq INPUT_DATA
    .unreq OUTPUT_DATA
    .unreq OUTPTR
    .unreq INPTR
    .unreq TMP1
    .unreq TMP2
    .unreq TMP3
    .unreq TMPDUP


/*****************************************************************************/

/*
 * Downsample pixel values of a single component.
 * This version handles the common case of 2:1 horizontal and 2:1 vertical,
 * without smoothing.
 *
 * GLOBAL(void)
 * jsimd_h2v2_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
 *                            JDIMENSION v_samp_factor,
 *                            JDIMENSION width_in_blocks,
 *                            JSAMPARRAY input_data, JSAMPARRAY output_data);
 */
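
/* The h2v2 variant below averages a 2x2 neighborhood instead, with a 1/2
 * alternating bias (the 0x00020001 pattern built in TMPDUP), matching
 * h2v2_downsample in jcsample.c.  Per output column, with in0/in1 the two
 * input rows:
 *
 *   out[j] = (unsigned char)((in0[2 * j] + in0[2 * j + 1] +
 *                             in1[2 * j] + in1[2 * j + 1] + bias) >> 2);
 *   bias ^= 3;                            // 1, 2, 1, 2, ...
 */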
.balign 16
asm_function jsimd_h2v2_downsample_neon
    IMAGE_WIDTH .req x0
    MAX_V_SAMP  .req x1
    V_SAMP      .req x2
    BLOCK_WIDTH .req x3
    INPUT_DATA  .req x4
    OUTPUT_DATA .req x5
    OUTPTR      .req x9
    INPTR0      .req x10
    INPTR1      .req x14
    TMP1        .req x11
    TMP2        .req x12
    TMP3        .req x13
    TMPDUP      .req w15

    mov TMPDUP, #1
    lsl TMP2, BLOCK_WIDTH, #4
    lsl TMPDUP, TMPDUP, #17
    sub TMP2, TMP2, IMAGE_WIDTH
    get_symbol_loc TMP3, Ljsimd_h2_downsample_neon_consts
    orr TMPDUP, TMPDUP, #1
    add TMP3, TMP3, TMP2, lsl #4
    dup v16.4s, TMPDUP
    ld1 {v18.16b}, [TMP3]

1:  /* row loop */
    ldr INPTR0, [INPUT_DATA], #8
    ldr OUTPTR, [OUTPUT_DATA], #8
    ldr INPTR1, [INPUT_DATA], #8
    subs TMP1, BLOCK_WIDTH, #1
    b.eq 3f
2:  /* columns */
    ld1 {v0.16b}, [INPTR0], #16
    ld1 {v1.16b}, [INPTR1], #16
    mov v4.16b, v16.16b
    subs TMP1, TMP1, #1
    uadalp v4.8h, v0.16b
    uadalp v4.8h, v1.16b
    shrn v6.8b, v4.8h, #2
    st1 {v6.8b}, [OUTPTR], #8
    b.ne 2b
3:  /* last columns */
    ld1 {v0.16b}, [INPTR0], #16
    ld1 {v1.16b}, [INPTR1], #16
    mov v4.16b, v16.16b
    subs V_SAMP, V_SAMP, #1
    /* expand right */
    tbl v2.16b, {v0.16b}, v18.16b
    tbl v3.16b, {v1.16b}, v18.16b
    uadalp v4.8h, v2.16b
    uadalp v4.8h, v3.16b
    shrn v6.8b, v4.8h, #2
    st1 {v6.8b}, [OUTPTR], #8
    b.ne 1b

    br x30

    .unreq IMAGE_WIDTH
    .unreq MAX_V_SAMP
    .unreq V_SAMP
    .unreq BLOCK_WIDTH
    .unreq INPUT_DATA
    .unreq OUTPUT_DATA
    .unreq OUTPTR
    .unreq INPTR0
    .unreq INPTR1
    .unreq TMP1
    .unreq TMP2
    .unreq TMP3
    .unreq TMPDUP


/*****************************************************************************/

/*
 * GLOBAL(JOCTET *)
 * jsimd_huff_encode_one_block(working_state *state, JOCTET *buffer,
 *                             JCOEFPTR block, int last_dc_val,
 *                             c_derived_tbl *dctbl, c_derived_tbl *actbl)
 */
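
/* The put_bits/emit_byte macros below implement the JPEG entropy coder's
 * bit buffer, including 0xFF byte stuffing.  A minimal C sketch of the two
 * primitives (put_buffer/put_bits mirror the PUT_BUFFER/PUT_BITS registers;
 * buffer is pre-decremented, as in the code below):
 *
 *   static unsigned long put_buffer;   // PUT_BUFFER (x6)
 *   static int put_bits;               // PUT_BITS (x7)
 *
 *   static void put_code(unsigned long code, int size)   // put_bits
 *   {
 *     put_buffer = (put_buffer << size) | code;          // lsl + orr
 *     put_bits += size;
 *   }
 *
 *   static void emit_one_byte(unsigned char **bufp)      // emit_byte
 *   {
 *     put_bits -= 8;
 *     unsigned char c = (unsigned char)(put_buffer >> put_bits);
 *     *++(*bufp) = c;                  // strb w19, [BUFFER, #1]!
 *     if (c == 0xFF)                   // avoid creating a JPEG marker:
 *       *++(*bufp) = 0;                //   stuff a zero byte
 *   }
 */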
    BUFFER     .req x1
    PUT_BUFFER .req x6
    PUT_BITS   .req x7
    PUT_BITSw  .req w7

.macro emit_byte
    sub PUT_BITS, PUT_BITS, #0x8
    lsr x19, PUT_BUFFER, PUT_BITS
    uxtb w19, w19
    strb w19, [BUFFER, #1]!
    cmp w19, #0xff
    b.ne 14f
    strb wzr, [BUFFER, #1]!
14:
.endm

.macro put_bits CODE, SIZE
    lsl PUT_BUFFER, PUT_BUFFER, \SIZE
    add PUT_BITS, PUT_BITS, \SIZE
    orr PUT_BUFFER, PUT_BUFFER, \CODE
.endm

.macro checkbuf31
    cmp PUT_BITS, #0x20
    b.lt 31f
    emit_byte
    emit_byte
    emit_byte
    emit_byte
31:
.endm

.macro checkbuf47
    cmp PUT_BITS, #0x30
    b.lt 47f
    emit_byte
    emit_byte
    emit_byte
    emit_byte
    emit_byte
    emit_byte
47:
.endm

.macro generate_jsimd_huff_encode_one_block fast_tbl

.if \fast_tbl == 1
asm_function jsimd_huff_encode_one_block_neon
.else
asm_function jsimd_huff_encode_one_block_neon_slowtbl
.endif
    sub sp, sp, 272
    sub BUFFER, BUFFER, #0x1  /* BUFFER=buffer-- */
    /* Save ARM registers */
    stp x19, x20, [sp]
    get_symbol_loc x15, Ljsimd_huff_encode_one_block_neon_consts
    ldr PUT_BUFFER, [x0, #0x10]
    ldr PUT_BITSw, [x0, #0x18]
    ldrsh w12, [x2]  /* load DC coeff in w12 */
    /* prepare data */
  .if \fast_tbl == 1
    ld1 {v23.16b}, [x15], #16
    ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x15], #64
    ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x15], #64
    ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x15], #64
    ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #64
    ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [x2], #64
    sub w12, w12, w3  /* last_dc_val, not used afterwards */
    /* ZigZag 8x8 */
    tbl v0.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v0.16b
    tbl v1.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v1.16b
    tbl v2.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v2.16b
    tbl v3.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v3.16b
    tbl v4.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v4.16b
    tbl v5.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v5.16b
    tbl v6.16b, {v27.16b, v28.16b, v29.16b, v30.16b}, v6.16b
    tbl v7.16b, {v29.16b, v30.16b, v31.16b}, v7.16b
    ins v0.h[0], w12
    tbx v1.16b, {v28.16b}, v16.16b
    tbx v2.16b, {v29.16b, v30.16b}, v17.16b
    tbx v5.16b, {v29.16b, v30.16b}, v18.16b
    tbx v6.16b, {v31.16b}, v19.16b
  .else
    add x13, x2, #0x22
    sub w12, w12, w3  /* last_dc_val, not used afterwards */
    ld1 {v23.16b}, [x15]
    add x14, x2, #0x18
    add x3, x2, #0x36
    ins v0.h[0], w12
    add x9, x2, #0x2
    ld1 {v1.h}[0], [x13]
    add x15, x2, #0x30
    ld1 {v2.h}[0], [x14]
    add x19, x2, #0x26
    ld1 {v3.h}[0], [x3]
    add x20, x2, #0x28
    ld1 {v0.h}[1], [x9]
    add x12, x2, #0x10
    ld1 {v1.h}[1], [x15]
    add x13, x2, #0x40
    ld1 {v2.h}[1], [x19]
    add x14, x2, #0x34
    ld1 {v3.h}[1], [x20]
    add x3, x2, #0x1a
    ld1 {v0.h}[2], [x12]
    add x9, x2, #0x20
    ld1 {v1.h}[2], [x13]
    add x15, x2, #0x32
    ld1 {v2.h}[2], [x14]
    add x19, x2, #0x42
    ld1 {v3.h}[2], [x3]
    add x20, x2, #0xc
    ld1 {v0.h}[3], [x9]
    add x12, x2, #0x12
    ld1 {v1.h}[3], [x15]
    add x13, x2, #0x24
    ld1 {v2.h}[3], [x19]
    add x14, x2, #0x50
    ld1 {v3.h}[3], [x20]
    add x3, x2, #0xe
    ld1 {v0.h}[4], [x12]
    add x9, x2, #0x4
    ld1 {v1.h}[4], [x13]
    add x15, x2, #0x16
    ld1 {v2.h}[4], [x14]
    add x19, x2, #0x60
    ld1 {v3.h}[4], [x3]
    add x20, x2, #0x1c
    ld1 {v0.h}[5], [x9]
    add x12, x2, #0x6
    ld1 {v1.h}[5], [x15]
    add x13, x2, #0x8
    ld1 {v2.h}[5], [x19]
    add x14, x2, #0x52
    ld1 {v3.h}[5], [x20]
    add x3, x2, #0x2a
    ld1 {v0.h}[6], [x12]
    add x9, x2, #0x14
    ld1 {v1.h}[6], [x13]
    add x15, x2, #0xa
    ld1 {v2.h}[6], [x14]
    add x19, x2, #0x44
    ld1 {v3.h}[6], [x3]
    add x20, x2, #0x38
    ld1 {v0.h}[7], [x9]
    add x12, x2, #0x46
    ld1 {v1.h}[7], [x15]
    add x13, x2, #0x3a
    ld1 {v2.h}[7], [x19]
    add x14, x2, #0x74
    ld1 {v3.h}[7], [x20]
    add x3, x2, #0x6a
    ld1 {v4.h}[0], [x12]
    add x9, x2, #0x54
    ld1 {v5.h}[0], [x13]
    add x15, x2, #0x2c
    ld1 {v6.h}[0], [x14]
    add x19, x2, #0x76
    ld1 {v7.h}[0], [x3]
    add x20, x2, #0x78
    ld1 {v4.h}[1], [x9]
    add x12, x2, #0x62
    ld1 {v5.h}[1], [x15]
    add x13, x2, #0x1e
    ld1 {v6.h}[1], [x19]
    add x14, x2, #0x68
    ld1 {v7.h}[1], [x20]
    add x3, x2, #0x7a
    ld1 {v4.h}[2], [x12]
    add x9, x2, #0x70
    ld1 {v5.h}[2], [x13]
    add x15, x2, #0x2e
    ld1 {v6.h}[2], [x14]
    add x19, x2, #0x5a
    ld1 {v7.h}[2], [x3]
    add x20, x2, #0x6c
    ld1 {v4.h}[3], [x9]
    add x12, x2, #0x72
    ld1 {v5.h}[3], [x15]
    add x13, x2, #0x3c
    ld1 {v6.h}[3], [x19]
    add x14, x2, #0x4c
    ld1 {v7.h}[3], [x20]
    add x3, x2, #0x5e
    ld1 {v4.h}[4], [x12]
    add x9, x2, #0x64
    ld1 {v5.h}[4], [x13]
    add x15, x2, #0x4a
    ld1 {v6.h}[4], [x14]
    add x19, x2, #0x3e
    ld1 {v7.h}[4], [x3]
    add x20, x2, #0x6e
    ld1 {v4.h}[5], [x9]
    add x12, x2, #0x56
    ld1 {v5.h}[5], [x15]
    add x13, x2, #0x58
    ld1 {v6.h}[5], [x19]
    add x14, x2, #0x4e
    ld1 {v7.h}[5], [x20]
    add x3, x2, #0x7c
    ld1 {v4.h}[6], [x12]
    add x9, x2, #0x48
    ld1 {v5.h}[6], [x13]
    add x15, x2, #0x66
    ld1 {v6.h}[6], [x14]
    add x19, x2, #0x5c
    ld1 {v7.h}[6], [x3]
    add x20, x2, #0x7e
    ld1 {v4.h}[7], [x9]
    ld1 {v5.h}[7], [x15]
    ld1 {v6.h}[7], [x19]
    ld1 {v7.h}[7], [x20]
  .endif
    cmlt v24.8h, v0.8h, #0
    cmlt v25.8h, v1.8h, #0
    cmlt v26.8h, v2.8h, #0
    cmlt v27.8h, v3.8h, #0
    cmlt v28.8h, v4.8h, #0
    cmlt v29.8h, v5.8h, #0
    cmlt v30.8h, v6.8h, #0
    cmlt v31.8h, v7.8h, #0
    abs v0.8h, v0.8h
    abs v1.8h, v1.8h
    abs v2.8h, v2.8h
    abs v3.8h, v3.8h
    abs v4.8h, v4.8h
    abs v5.8h, v5.8h
    abs v6.8h, v6.8h
    abs v7.8h, v7.8h
    eor v24.16b, v24.16b, v0.16b
    eor v25.16b, v25.16b, v1.16b
    eor v26.16b, v26.16b, v2.16b
    eor v27.16b, v27.16b, v3.16b
    eor v28.16b, v28.16b, v4.16b
    eor v29.16b, v29.16b, v5.16b
    eor v30.16b, v30.16b, v6.16b
    eor v31.16b, v31.16b, v7.16b
    cmeq v16.8h, v0.8h, #0
    cmeq v17.8h, v1.8h, #0
    cmeq v18.8h, v2.8h, #0
    cmeq v19.8h, v3.8h, #0
    cmeq v20.8h, v4.8h, #0
    cmeq v21.8h, v5.8h, #0
    cmeq v22.8h, v6.8h, #0
    xtn v16.8b, v16.8h
    xtn v18.8b, v18.8h
    xtn v20.8b, v20.8h
    xtn v22.8b, v22.8h
    umov w14, v0.h[0]
    xtn2 v16.16b, v17.8h
    umov w13, v24.h[0]
    xtn2 v18.16b, v19.8h
    clz w14, w14
    xtn2 v20.16b, v21.8h
    lsl w13, w13, w14
    cmeq v17.8h, v7.8h, #0
    sub w12, w14, #32
    xtn2 v22.16b, v17.8h
    lsr w13, w13, w14
    and v16.16b, v16.16b, v23.16b
    neg w12, w12
    and v18.16b, v18.16b, v23.16b
    add x3, x4, #0x400  /* x3 = dctbl->ehufsi */
    and v20.16b, v20.16b, v23.16b
    add x15, sp, #0x90  /* x15 = t2 */
    and v22.16b, v22.16b, v23.16b
    ldr w10, [x4, x12, lsl #2]
    addp v16.16b, v16.16b, v18.16b
    ldrb w11, [x3, x12]
    addp v20.16b, v20.16b, v22.16b
    checkbuf47
    addp v16.16b, v16.16b, v20.16b
    put_bits x10, x11
    addp v16.16b, v16.16b, v18.16b
    checkbuf47
    umov x9, v16.D[0]
    put_bits x13, x12
    cnt v17.8b, v16.8b
    mvn x9, x9
    addv B18, v17.8b
    add x4, x5, #0x400  /* x4 = actbl->ehufsi */
    umov w12, v18.b[0]
    lsr x9, x9, #0x1  /* clear AC coeff */
    ldr w13, [x5, #0x3c0]  /* x13 = actbl->ehufco[0xf0] */
    rbit x9, x9  /* x9 = index0 */
    ldrb w14, [x4, #0xf0]  /* x14 = actbl->ehufsi[0xf0] */
    cmp w12, #(64-8)
    add x11, sp, #16
    b.lt 4f
    cbz x9, 6f
    st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
    st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
    st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
    st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
1:
    clz x2, x9
    add x15, x15, x2, lsl #1
    lsl x9, x9, x2
    ldrh w20, [x15, #-126]
2:
    cmp x2, #0x10
    b.lt 3f
    sub x2, x2, #0x10
    checkbuf47
    put_bits x13, x14
    b 2b
3:
    clz w20, w20
    ldrh w3, [x15, #2]!
    sub w11, w20, #32
    lsl w3, w3, w20
    neg w11, w11
    lsr w3, w3, w20
    add x2, x11, x2, lsl #4
    lsl x9, x9, #0x1
    ldr w12, [x5, x2, lsl #2]
    ldrb w10, [x4, x2]
    checkbuf31
    put_bits x12, x10
    put_bits x3, x11
    cbnz x9, 1b
    b 6f
4:
    movi v21.8h, #0x0010
    clz v0.8h, v0.8h
    clz v1.8h, v1.8h
    clz v2.8h, v2.8h
    clz v3.8h, v3.8h
    clz v4.8h, v4.8h
    clz v5.8h, v5.8h
    clz v6.8h, v6.8h
    clz v7.8h, v7.8h
    ushl v24.8h, v24.8h, v0.8h
    ushl v25.8h, v25.8h, v1.8h
    ushl v26.8h, v26.8h, v2.8h
    ushl v27.8h, v27.8h, v3.8h
    ushl v28.8h, v28.8h, v4.8h
    ushl v29.8h, v29.8h, v5.8h
    ushl v30.8h, v30.8h, v6.8h
    ushl v31.8h, v31.8h, v7.8h
    neg v0.8h, v0.8h
    neg v1.8h, v1.8h
    neg v2.8h, v2.8h
    neg v3.8h, v3.8h
    neg v4.8h, v4.8h
    neg v5.8h, v5.8h
    neg v6.8h, v6.8h
    neg v7.8h, v7.8h
    ushl v24.8h, v24.8h, v0.8h
    ushl v25.8h, v25.8h, v1.8h
    ushl v26.8h, v26.8h, v2.8h
    ushl v27.8h, v27.8h, v3.8h
    ushl v28.8h, v28.8h, v4.8h
    ushl v29.8h, v29.8h, v5.8h
    ushl v30.8h, v30.8h, v6.8h
    ushl v31.8h, v31.8h, v7.8h
    add v0.8h, v21.8h, v0.8h
    add v1.8h, v21.8h, v1.8h
    add v2.8h, v21.8h, v2.8h
    add v3.8h, v21.8h, v3.8h
    add v4.8h, v21.8h, v4.8h
    add v5.8h, v21.8h, v5.8h
    add v6.8h, v21.8h, v6.8h
    add v7.8h, v21.8h, v7.8h
    st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
    st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
    st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
    st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
1:
    clz x2, x9
    add x15, x15, x2, lsl #1
    lsl x9, x9, x2
    ldrh w11, [x15, #-126]
2:
    cmp x2, #0x10
    b.lt 3f
    sub x2, x2, #0x10
    checkbuf47
    put_bits x13, x14
    b 2b
3:
    ldrh w3, [x15, #2]!
    add x2, x11, x2, lsl #4
    lsl x9, x9, #0x1
    ldr w12, [x5, x2, lsl #2]
    ldrb w10, [x4, x2]
    checkbuf31
    put_bits x12, x10
    put_bits x3, x11
    cbnz x9, 1b
6:
    add x13, sp, #0x10e
    cmp x15, x13
    b.hs 1f
    ldr w12, [x5]
    ldrb w14, [x4]
    checkbuf47
    put_bits x12, x14
1:
    str PUT_BUFFER, [x0, #0x10]
    str PUT_BITSw, [x0, #0x18]
    ldp x19, x20, [sp], 16
    add x0, BUFFER, #0x1
    add sp, sp, 256
    br x30

.endm

generate_jsimd_huff_encode_one_block 1
generate_jsimd_huff_encode_one_block 0

    .unreq BUFFER
    .unreq PUT_BUFFER
    .unreq PUT_BITS
    .unreq PUT_BITSw

.purgem emit_byte
.purgem put_bits
.purgem checkbuf31
.purgem checkbuf47