/*
 * Armv7 Neon optimizations for libjpeg-turbo
 *
 * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
 *                          All Rights Reserved.
 * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
 * Copyright (C) 2014, Siarhei Siamashka.  All Rights Reserved.
 * Copyright (C) 2014, Linaro Limited.  All Rights Reserved.
 * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
 * Copyright (C) 2015-2016, 2018, Matthieu Darbois.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack, "", %progbits  /* mark stack as non-executable */
#endif

.text
.fpu neon
.arch armv7a
.object_arch armv4
.arm
.syntax unified


/*****************************************************************************/

/* Supplementary macro for setting function attributes */
.macro asm_function fname
#ifdef __APPLE__
    .private_extern _\fname
    .globl _\fname
_\fname:
#else
    .global \fname
#ifdef __ELF__
    .hidden \fname
    .type \fname, %function
#endif
\fname:
#endif
.endm


#define CENTERJSAMPLE  128

/*****************************************************************************/

/*
 * Perform dequantization and inverse DCT on one block of coefficients.
 *
 * GLOBAL(void)
 * jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
 *                       JSAMPARRAY output_buf, JDIMENSION output_col)
 */

#define FIX_0_298631336  (2446)
#define FIX_0_390180644  (3196)
#define FIX_0_541196100  (4433)
#define FIX_0_765366865  (6270)
#define FIX_0_899976223  (7373)
#define FIX_1_175875602  (9633)
#define FIX_1_501321110  (12299)
#define FIX_1_847759065  (15137)
#define FIX_1_961570560  (16069)
#define FIX_2_053119869  (16819)
#define FIX_2_562915447  (20995)
#define FIX_3_072711026  (25172)

#define FIX_1_175875602_MINUS_1_961570560  (FIX_1_175875602 - FIX_1_961570560)
#define FIX_1_175875602_MINUS_0_390180644  (FIX_1_175875602 - FIX_0_390180644)
#define FIX_0_541196100_MINUS_1_847759065  (FIX_0_541196100 - FIX_1_847759065)
#define FIX_3_072711026_MINUS_2_562915447  (FIX_3_072711026 - FIX_2_562915447)
#define FIX_0_298631336_MINUS_0_899976223  (FIX_0_298631336 - FIX_0_899976223)
#define FIX_1_501321110_MINUS_0_899976223  (FIX_1_501321110 - FIX_0_899976223)
#define FIX_2_053119869_MINUS_2_562915447  (FIX_2_053119869 - FIX_2_562915447)
#define FIX_0_541196100_PLUS_0_765366865   (FIX_0_541196100 + FIX_0_765366865)
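
/*
 * These multiplication constants match jidctint.c: each is the real AAN
 * constant in 13-bit fixed point (CONST_BITS == 13).  A minimal sketch of
 * the derivation, shown here only for reference (the FIX() macro itself
 * lives in the C code, not in this file):
 *
 *   #define CONST_BITS  13
 *   #define FIX(x)  ((JLONG)((x) * (1 << CONST_BITS) + 0.5))
 *
 * e.g. FIX(0.298631336) == 2446 and FIX(1.175875602) == 9633.
 */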

/*
 * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
 * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
 */
#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) { \
  DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \
  JLONG q1, q2, q3, q4, q5, q6, q7; \
  JLONG tmp11_plus_tmp2, tmp11_minus_tmp2; \
  \
  /* 1-D iDCT input data */ \
  row0 = xrow0; \
  row1 = xrow1; \
  row2 = xrow2; \
  row3 = xrow3; \
  row4 = xrow4; \
  row5 = xrow5; \
  row6 = xrow6; \
  row7 = xrow7; \
  \
  q5 = row7 + row3; \
  q4 = row5 + row1; \
  q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \
       MULTIPLY(q4, FIX_1_175875602); \
  q7 = MULTIPLY(q5, FIX_1_175875602) + \
       MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \
  q2 = MULTIPLY(row2, FIX_0_541196100) + \
       MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \
  q4 = q6; \
  q3 = ((JLONG)row0 - (JLONG)row4) << 13; \
  q6 += MULTIPLY(row5, -FIX_2_562915447) + \
        MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \
  /* now we can use q1 (reloadable constants have been used up) */ \
  q1 = q3 + q2; \
  q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \
        MULTIPLY(row1, -FIX_0_899976223); \
  q5 = q7; \
  q1 = q1 + q6; \
  q7 += MULTIPLY(row7, -FIX_0_899976223) + \
        MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \
  \
  /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \
  tmp11_plus_tmp2 = q1; \
  row1 = 0; \
  \
  q1 = q1 - q6; \
  q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \
        MULTIPLY(row3, -FIX_2_562915447); \
  q1 = q1 - q6; \
  q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \
       MULTIPLY(row6, FIX_0_541196100); \
  q3 = q3 - q2; \
  \
  /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \
  tmp11_minus_tmp2 = q1; \
  \
  q1 = ((JLONG)row0 + (JLONG)row4) << 13; \
  q2 = q1 + q6; \
  q1 = q1 - q6; \
  \
  /* pick up the results */ \
  tmp0 = q4; \
  tmp1 = q5; \
  tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \
  tmp3 = q7; \
  tmp10 = q2; \
  tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \
  tmp12 = q3; \
  tmp13 = q1; \
}
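
/*
 * DCTELEM, JLONG, and MULTIPLY() above come from the libjpeg-turbo C
 * headers.  For reference, a minimal sketch of what the macro relies on
 * (an assumption about the relevant definition, not a verbatim copy):
 *
 *   #define MULTIPLY(var, const)  ((var) * (const))
 *
 * The outputs (tmp0..tmp3, tmp10..tmp13) remain scaled by 2^13 and are
 * descaled later, which is what the vrshrn/vshrn instructions do in the
 * Neon code below.
 */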

#define XFIX_0_899976223                    d0[0]
#define XFIX_0_541196100                    d0[1]
#define XFIX_2_562915447                    d0[2]
#define XFIX_0_298631336_MINUS_0_899976223  d0[3]
#define XFIX_1_501321110_MINUS_0_899976223  d1[0]
#define XFIX_2_053119869_MINUS_2_562915447  d1[1]
#define XFIX_0_541196100_PLUS_0_765366865   d1[2]
#define XFIX_1_175875602                    d1[3]
#define XFIX_1_175875602_MINUS_0_390180644  d2[0]
#define XFIX_0_541196100_MINUS_1_847759065  d2[1]
#define XFIX_3_072711026_MINUS_2_562915447  d2[2]
#define XFIX_1_175875602_MINUS_1_961570560  d2[3]

.balign 16
jsimd_idct_islow_neon_consts:
  .short FIX_0_899976223                    /* d0[0] */
  .short FIX_0_541196100                    /* d0[1] */
  .short FIX_2_562915447                    /* d0[2] */
  .short FIX_0_298631336_MINUS_0_899976223  /* d0[3] */
  .short FIX_1_501321110_MINUS_0_899976223  /* d1[0] */
  .short FIX_2_053119869_MINUS_2_562915447  /* d1[1] */
  .short FIX_0_541196100_PLUS_0_765366865   /* d1[2] */
  .short FIX_1_175875602                    /* d1[3] */
  /* reloadable constants */
  .short FIX_1_175875602_MINUS_0_390180644  /* d2[0] */
  .short FIX_0_541196100_MINUS_1_847759065  /* d2[1] */
  .short FIX_3_072711026_MINUS_2_562915447  /* d2[2] */
  .short FIX_1_175875602_MINUS_1_961570560  /* d2[3] */

asm_function jsimd_idct_islow_neon

    DCT_TABLE       .req r0
    COEF_BLOCK      .req r1
    OUTPUT_BUF      .req r2
    OUTPUT_COL      .req r3
    TMP1            .req r0
    TMP2            .req r1
    TMP3            .req r2
    TMP4            .req ip

    ROW0L           .req d16
    ROW0R           .req d17
    ROW1L           .req d18
    ROW1R           .req d19
    ROW2L           .req d20
    ROW2R           .req d21
    ROW3L           .req d22
    ROW3R           .req d23
    ROW4L           .req d24
    ROW4R           .req d25
    ROW5L           .req d26
    ROW5R           .req d27
    ROW6L           .req d28
    ROW6R           .req d29
    ROW7L           .req d30
    ROW7R           .req d31

    /* Load and dequantize coefficients into Neon registers
     * with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | d16     | d17     ( q8  )
     *   1 | d18     | d19     ( q9  )
     *   2 | d20     | d21     ( q10 )
     *   3 | d22     | d23     ( q11 )
     *   4 | d24     | d25     ( q12 )
     *   5 | d26     | d27     ( q13 )
     *   6 | d28     | d29     ( q14 )
     *   7 | d30     | d31     ( q15 )
     */
    adr             ip, jsimd_idct_islow_neon_consts
    vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vld1.16         {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
    vmul.s16        q8, q8, q0
    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16        q9, q9, q1
    vld1.16         {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
    vmul.s16        q10, q10, q2
    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vmul.s16        q11, q11, q3
    vld1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]
    vmul.s16        q12, q12, q0
    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16        q14, q14, q2
    vmul.s16        q13, q13, q1
    vld1.16         {d0, d1, d2, d3}, [ip, :128]  /* load constants */
    add             ip, ip, #16
    vmul.s16        q15, q15, q3
    vpush           {d8 - d15}                    /* save Neon registers */
    /* 1-D IDCT, pass 1, left 4x8 half */
    vadd.s16        d4, ROW7L, ROW3L
    vadd.s16        d5, ROW5L, ROW1L
    vmull.s16       q6, d4, XFIX_1_175875602_MINUS_1_961570560
    vmlal.s16       q6, d5, XFIX_1_175875602
    vmull.s16       q7, d4, XFIX_1_175875602
    /* Check for the zero coefficients in the right 4x8 half */
    push            {r4, r5}
    vmlal.s16       q7, d5, XFIX_1_175875602_MINUS_0_390180644
    vsubl.s16       q3, ROW0L, ROW4L
    ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
    vmull.s16       q2, ROW2L, XFIX_0_541196100
    vmlal.s16       q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065
    orr             r0, r4, r5
    vmov            q4, q6
    vmlsl.s16       q6, ROW5L, XFIX_2_562915447
    ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
    vshl.s32        q3, q3, #13
    orr             r0, r0, r4
    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
    orr             r0, r0, r5
    vadd.s32        q1, q3, q2
    ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
    vmov            q5, q7
    vadd.s32        q1, q1, q6
    orr             r0, r0, r4
    vmlsl.s16       q7, ROW7L, XFIX_0_899976223
    orr             r0, r0, r5
    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
    vrshrn.s32      ROW1L, q1, #11
    ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
    vsub.s32        q1, q1, q6
    vmlal.s16       q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447
    orr             r0, r0, r4
    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
    orr             r0, r0, r5
    vsub.s32        q1, q1, q6
    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
    ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
    vmlal.s16       q6, ROW6L, XFIX_0_541196100
    vsub.s32        q3, q3, q2
    orr             r0, r0, r4
    vrshrn.s32      ROW6L, q1, #11
    orr             r0, r0, r5
    vadd.s32        q1, q3, q5
    ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
    vsub.s32        q3, q3, q5
    vaddl.s16       q5, ROW0L, ROW4L
    orr             r0, r0, r4
    vrshrn.s32      ROW2L, q1, #11
    orr             r0, r0, r5
    vrshrn.s32      ROW5L, q3, #11
    ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
    vshl.s32        q5, q5, #13
    vmlal.s16       q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223
    orr             r0, r0, r4
    vadd.s32        q2, q5, q6
    orrs            r0, r0, r5
    vsub.s32        q1, q5, q6
    vadd.s32        q6, q2, q7
    ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
    vsub.s32        q2, q2, q7
    vadd.s32        q5, q1, q4
    orr             r0, r4, r5
    vsub.s32        q3, q1, q4
    pop             {r4, r5}
    vrshrn.s32      ROW7L, q2, #11
    vrshrn.s32      ROW3L, q5, #11
    vrshrn.s32      ROW0L, q6, #11
    vrshrn.s32      ROW4L, q3, #11

    beq             3f  /* Go to do some special handling for the sparse
                           right 4x8 half */
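
    /* The interleaved ldrd/orr sequence above ORs together the raw int16
     * coefficients of the right 4x8 half: the branch to 3f is taken when
     * columns 4-7 of rows 1-7 are all zero, and r0 keeps columns 4-7 of
     * row 0 for a further check at 3:.  Roughly equivalent scalar logic
     * (a sketch; 'coef' is the int16_t coefficient block):
     *
     *   const uint32_t *coef32 = (const uint32_t *)coef;
     *   uint32_t rows_1_7 = 0;
     *   for (int row = 1; row < 8; row++)
     *     rows_1_7 |= coef32[row * 4 + 2] | coef32[row * 4 + 3];
     *   uint32_t row_0 = coef32[2] | coef32[3];
     *   if (rows_1_7 == 0)
     *     goto sparse_handling;  // label 3 below; row_0 decides further
     */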

    /* 1-D IDCT, pass 1, right 4x8 half */
    vld1.s16        {d2}, [ip, :64]    /* reload constants */
    vadd.s16        d10, ROW7R, ROW3R
    vadd.s16        d8, ROW5R, ROW1R
    /* Transpose left 4x8 half */
    vtrn.16         ROW6L, ROW7L
    vmull.s16       q6, d10, XFIX_1_175875602_MINUS_1_961570560
    vmlal.s16       q6, d8, XFIX_1_175875602
    vtrn.16         ROW2L, ROW3L
    vmull.s16       q7, d10, XFIX_1_175875602
    vmlal.s16       q7, d8, XFIX_1_175875602_MINUS_0_390180644
    vtrn.16         ROW0L, ROW1L
    vsubl.s16       q3, ROW0R, ROW4R
    vmull.s16       q2, ROW2R, XFIX_0_541196100
    vmlal.s16       q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
    vtrn.16         ROW4L, ROW5L
    vmov            q4, q6
    vmlsl.s16       q6, ROW5R, XFIX_2_562915447
    vmlal.s16       q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447
    vtrn.32         ROW1L, ROW3L
    vshl.s32        q3, q3, #13
    vmlsl.s16       q4, ROW1R, XFIX_0_899976223
    vtrn.32         ROW4L, ROW6L
    vadd.s32        q1, q3, q2
    vmov            q5, q7
    vadd.s32        q1, q1, q6
    vtrn.32         ROW0L, ROW2L
    vmlsl.s16       q7, ROW7R, XFIX_0_899976223
    vmlal.s16       q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223
    vrshrn.s32      ROW1R, q1, #11
    vtrn.32         ROW5L, ROW7L
    vsub.s32        q1, q1, q6
    vmlal.s16       q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
    vmlsl.s16       q5, ROW3R, XFIX_2_562915447
    vsub.s32        q1, q1, q6
    vmull.s16       q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865
    vmlal.s16       q6, ROW6R, XFIX_0_541196100
    vsub.s32        q3, q3, q2
    vrshrn.s32      ROW6R, q1, #11
    vadd.s32        q1, q3, q5
    vsub.s32        q3, q3, q5
    vaddl.s16       q5, ROW0R, ROW4R
    vrshrn.s32      ROW2R, q1, #11
    vrshrn.s32      ROW5R, q3, #11
    vshl.s32        q5, q5, #13
    vmlal.s16       q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
    vadd.s32        q2, q5, q6
    vsub.s32        q1, q5, q6
    vadd.s32        q6, q2, q7
    vsub.s32        q2, q2, q7
    vadd.s32        q5, q1, q4
    vsub.s32        q3, q1, q4
    vrshrn.s32      ROW7R, q2, #11
    vrshrn.s32      ROW3R, q5, #11
    vrshrn.s32      ROW0R, q6, #11
    vrshrn.s32      ROW4R, q3, #11
    /* Transpose right 4x8 half */
    vtrn.16         ROW6R, ROW7R
    vtrn.16         ROW2R, ROW3R
    vtrn.16         ROW0R, ROW1R
    vtrn.16         ROW4R, ROW5R
    vtrn.32         ROW1R, ROW3R
    vtrn.32         ROW4R, ROW6R
    vtrn.32         ROW0R, ROW2R
    vtrn.32         ROW5R, ROW7R

1:  /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
    vld1.s16        {d2}, [ip, :64]    /* reload constants */
    vmull.s16       q6, ROW1R, XFIX_1_175875602                    /* ROW5L <-> ROW1R */
    vmlal.s16       q6, ROW1L, XFIX_1_175875602
    vmlal.s16       q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560  /* ROW7L <-> ROW3R */
    vmlal.s16       q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
    vmull.s16       q7, ROW3R, XFIX_1_175875602                    /* ROW7L <-> ROW3R */
    vmlal.s16       q7, ROW3L, XFIX_1_175875602
    vmlal.s16       q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644  /* ROW5L <-> ROW1R */
    vmlal.s16       q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
    vsubl.s16       q3, ROW0L, ROW0R                               /* ROW4L <-> ROW0R */
    vmull.s16       q2, ROW2L, XFIX_0_541196100
    vmlal.s16       q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065  /* ROW6L <-> ROW2R */
    vmov            q4, q6
    vmlsl.s16       q6, ROW1R, XFIX_2_562915447                    /* ROW5L <-> ROW1R */
    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
    vshl.s32        q3, q3, #13
    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
    vadd.s32        q1, q3, q2
    vmov            q5, q7
    vadd.s32        q1, q1, q6
    vmlsl.s16       q7, ROW3R, XFIX_0_899976223                    /* ROW7L <-> ROW3R */
    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
    vshrn.s32       ROW1L, q1, #16
    vsub.s32        q1, q1, q6
    vmlal.s16       q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447  /* ROW5L <-> ROW1R */
    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
    vsub.s32        q1, q1, q6
    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
    vmlal.s16       q6, ROW2R, XFIX_0_541196100                    /* ROW6L <-> ROW2R */
    vsub.s32        q3, q3, q2
    vshrn.s32       ROW2R, q1, #16                                 /* ROW6L <-> ROW2R */
    vadd.s32        q1, q3, q5
    vsub.s32        q3, q3, q5
    vaddl.s16       q5, ROW0L, ROW0R                               /* ROW4L <-> ROW0R */
    vshrn.s32       ROW2L, q1, #16
    vshrn.s32       ROW1R, q3, #16                                 /* ROW5L <-> ROW1R */
    vshl.s32        q5, q5, #13
    vmlal.s16       q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223  /* ROW7L <-> ROW3R */
    vadd.s32        q2, q5, q6
    vsub.s32        q1, q5, q6
    vadd.s32        q6, q2, q7
    vsub.s32        q2, q2, q7
    vadd.s32        q5, q1, q4
    vsub.s32        q3, q1, q4
    vshrn.s32       ROW3R, q2, #16                                 /* ROW7L <-> ROW3R */
    vshrn.s32       ROW3L, q5, #16
    vshrn.s32       ROW0L, q6, #16
    vshrn.s32       ROW0R, q3, #16                                 /* ROW4L <-> ROW0R */
    /* 1-D IDCT, pass 2, right 4x8 half */
    vld1.s16        {d2}, [ip, :64]    /* reload constants */
    vmull.s16       q6, ROW5R, XFIX_1_175875602
    vmlal.s16       q6, ROW5L, XFIX_1_175875602                    /* ROW5L <-> ROW1R */
    vmlal.s16       q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560
    vmlal.s16       q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560  /* ROW7L <-> ROW3R */
    vmull.s16       q7, ROW7R, XFIX_1_175875602
    vmlal.s16       q7, ROW7L, XFIX_1_175875602                    /* ROW7L <-> ROW3R */
    vmlal.s16       q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644
    vmlal.s16       q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644  /* ROW5L <-> ROW1R */
    vsubl.s16       q3, ROW4L, ROW4R                               /* ROW4L <-> ROW0R */
    vmull.s16       q2, ROW6L, XFIX_0_541196100                    /* ROW6L <-> ROW2R */
    vmlal.s16       q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
    vmov            q4, q6
    vmlsl.s16       q6, ROW5R, XFIX_2_562915447
    vmlal.s16       q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447  /* ROW7L <-> ROW3R */
    vshl.s32        q3, q3, #13
    vmlsl.s16       q4, ROW5L, XFIX_0_899976223                    /* ROW5L <-> ROW1R */
    vadd.s32        q1, q3, q2
    vmov            q5, q7
    vadd.s32        q1, q1, q6
    vmlsl.s16       q7, ROW7R, XFIX_0_899976223
    vmlal.s16       q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223  /* ROW5L <-> ROW1R */
    vshrn.s32       ROW5L, q1, #16                                 /* ROW5L <-> ROW1R */
    vsub.s32        q1, q1, q6
    vmlal.s16       q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
    vmlsl.s16       q5, ROW7L, XFIX_2_562915447                    /* ROW7L <-> ROW3R */
    vsub.s32        q1, q1, q6
    vmull.s16       q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865   /* ROW6L <-> ROW2R */
    vmlal.s16       q6, ROW6R, XFIX_0_541196100
    vsub.s32        q3, q3, q2
    vshrn.s32       ROW6R, q1, #16
    vadd.s32        q1, q3, q5
    vsub.s32        q3, q3, q5
    vaddl.s16       q5, ROW4L, ROW4R                               /* ROW4L <-> ROW0R */
    vshrn.s32       ROW6L, q1, #16                                 /* ROW6L <-> ROW2R */
    vshrn.s32       ROW5R, q3, #16
    vshl.s32        q5, q5, #13
    vmlal.s16       q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
    vadd.s32        q2, q5, q6
    vsub.s32        q1, q5, q6
    vadd.s32        q6, q2, q7
    vsub.s32        q2, q2, q7
    vadd.s32        q5, q1, q4
    vsub.s32        q3, q1, q4
    vshrn.s32       ROW7R, q2, #16
    vshrn.s32       ROW7L, q5, #16                                 /* ROW7L <-> ROW3R */
    vshrn.s32       ROW4L, q6, #16                                 /* ROW4L <-> ROW0R */
    vshrn.s32       ROW4R, q3, #16
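
    /* Descale bookkeeping, for reference (a sketch assuming jidctint.c's
     * conventions, CONST_BITS == 13 and PASS1_BITS == 2): pass 1 above
     * shifted right by CONST_BITS - PASS1_BITS == 11 (vrshrn #11); pass 2
     * just kept the upper 16 bits of each 32-bit result (vshrn #16), so
     * the vqrshrn #2 in the epilogue below completes a total descale of
     * CONST_BITS + PASS1_BITS + 3 == 18 bits, roughly:
     *
     *   sample = range_limit(DESCALE(v, 18) + CENTERJSAMPLE);
     */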

2:  /* Descale to 8-bit and range limit */
    vqrshrn.s16     d16, q8, #2
    vqrshrn.s16     d17, q9, #2
    vqrshrn.s16     d18, q10, #2
    vqrshrn.s16     d19, q11, #2
    vpop            {d8 - d15}    /* restore Neon registers */
    vqrshrn.s16     d20, q12, #2
    /* Transpose the final 8-bit samples and do signed->unsigned conversion */
    vtrn.16         q8, q9
    vqrshrn.s16     d21, q13, #2
    vqrshrn.s16     d22, q14, #2
    vmov.u8         q0, #(CENTERJSAMPLE)
    vqrshrn.s16     d23, q15, #2
    vtrn.8          d16, d17
    vtrn.8          d18, d19
    vadd.u8         q8, q8, q0
    vadd.u8         q9, q9, q0
    vtrn.16         q10, q11
    /* Store results to the output buffer */
    ldmia           OUTPUT_BUF!, {TMP1, TMP2}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    vst1.8          {d16}, [TMP1]
    vtrn.8          d20, d21
    vst1.8          {d17}, [TMP2]
    ldmia           OUTPUT_BUF!, {TMP1, TMP2}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    vst1.8          {d18}, [TMP1]
    vadd.u8         q10, q10, q0
    vst1.8          {d19}, [TMP2]
    ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    add             TMP3, TMP3, OUTPUT_COL
    add             TMP4, TMP4, OUTPUT_COL
    vtrn.8          d22, d23
    vst1.8          {d20}, [TMP1]
    vadd.u8         q11, q11, q0
    vst1.8          {d21}, [TMP2]
    vst1.8          {d22}, [TMP3]
    vst1.8          {d23}, [TMP4]
    bx              lr

3:  /* Left 4x8 half is done, right 4x8 half contains mostly zeros */

    /* Transpose left 4x8 half */
    vtrn.16         ROW6L, ROW7L
    vtrn.16         ROW2L, ROW3L
    vtrn.16         ROW0L, ROW1L
    vtrn.16         ROW4L, ROW5L
    vshl.s16        ROW0R, ROW0R, #2  /* PASS1_BITS */
    vtrn.32         ROW1L, ROW3L
    vtrn.32         ROW4L, ROW6L
    vtrn.32         ROW0L, ROW2L
    vtrn.32         ROW5L, ROW7L

    cmp             r0, #0
    beq             4f  /* Right 4x8 half has all zeros, go to 'sparse'
                           second pass */

    /* Only row 0 is non-zero for the right 4x8 half */
    vdup.s16        ROW1R, ROW0R[1]
    vdup.s16        ROW2R, ROW0R[2]
    vdup.s16        ROW3R, ROW0R[3]
    vdup.s16        ROW4R, ROW0R[0]
    vdup.s16        ROW5R, ROW0R[1]
    vdup.s16        ROW6R, ROW0R[2]
    vdup.s16        ROW7R, ROW0R[3]
    vdup.s16        ROW0R, ROW0R[0]
    b               1b  /* Go to 'normal' second pass */

4:  /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
    vld1.s16        {d2}, [ip, :64]    /* reload constants */
    vmull.s16       q6, ROW1L, XFIX_1_175875602
    vmlal.s16       q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
    vmull.s16       q7, ROW3L, XFIX_1_175875602
    vmlal.s16       q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
    vmull.s16       q2, ROW2L, XFIX_0_541196100
    vshll.s16       q3, ROW0L, #13
    vmov            q4, q6
    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
    vadd.s32        q1, q3, q2
    vmov            q5, q7
    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
    vadd.s32        q1, q1, q6
    vadd.s32        q6, q6, q6
    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
    vshrn.s32       ROW1L, q1, #16
    vsub.s32        q1, q1, q6
    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
    vsub.s32        q3, q3, q2
    vshrn.s32       ROW2R, q1, #16    /* ROW6L <-> ROW2R */
    vadd.s32        q1, q3, q5
    vsub.s32        q3, q3, q5
    vshll.s16       q5, ROW0L, #13
    vshrn.s32       ROW2L, q1, #16
    vshrn.s32       ROW1R, q3, #16    /* ROW5L <-> ROW1R */
    vadd.s32        q2, q5, q6
    vsub.s32        q1, q5, q6
    vadd.s32        q6, q2, q7
    vsub.s32        q2, q2, q7
    vadd.s32        q5, q1, q4
    vsub.s32        q3, q1, q4
    vshrn.s32       ROW3R, q2, #16    /* ROW7L <-> ROW3R */
    vshrn.s32       ROW3L, q5, #16
    vshrn.s32       ROW0L, q6, #16
    vshrn.s32       ROW0R, q3, #16    /* ROW4L <-> ROW0R */
    /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
    vld1.s16        {d2}, [ip, :64]    /* reload constants */
    vmull.s16       q6, ROW5L, XFIX_1_175875602
    vmlal.s16       q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560
    vmull.s16       q7, ROW7L, XFIX_1_175875602
    vmlal.s16       q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644
    vmull.s16       q2, ROW6L, XFIX_0_541196100
    vshll.s16       q3, ROW4L, #13
    vmov            q4, q6
    vmlal.s16       q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447
    vmlsl.s16       q4, ROW5L, XFIX_0_899976223
    vadd.s32        q1, q3, q2
    vmov            q5, q7
    vmlal.s16       q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223
    vadd.s32        q1, q1, q6
    vadd.s32        q6, q6, q6
    vmlsl.s16       q5, ROW7L, XFIX_2_562915447
    vshrn.s32       ROW5L, q1, #16    /* ROW5L <-> ROW1R */
    vsub.s32        q1, q1, q6
    vmull.s16       q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865
    vsub.s32        q3, q3, q2
    vshrn.s32       ROW6R, q1, #16
    vadd.s32        q1, q3, q5
    vsub.s32        q3, q3, q5
    vshll.s16       q5, ROW4L, #13
    vshrn.s32       ROW6L, q1, #16    /* ROW6L <-> ROW2R */
    vshrn.s32       ROW5R, q3, #16
    vadd.s32        q2, q5, q6
    vsub.s32        q1, q5, q6
    vadd.s32        q6, q2, q7
    vsub.s32        q2, q2, q7
    vadd.s32        q5, q1, q4
    vsub.s32        q3, q1, q4
    vshrn.s32       ROW7R, q2, #16
    vshrn.s32       ROW7L, q5, #16    /* ROW7L <-> ROW3R */
    vshrn.s32       ROW4L, q6, #16    /* ROW4L <-> ROW0R */
    vshrn.s32       ROW4R, q3, #16
    b               2b  /* Go to epilogue */

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4

    .unreq          ROW0L
    .unreq          ROW0R
    .unreq          ROW1L
    .unreq          ROW1R
    .unreq          ROW2L
    .unreq          ROW2R
    .unreq          ROW3L
    .unreq          ROW3R
    .unreq          ROW4L
    .unreq          ROW4R
    .unreq          ROW5L
    .unreq          ROW5R
    .unreq          ROW6L
    .unreq          ROW6R
    .unreq          ROW7L
    .unreq          ROW7R


/*****************************************************************************/

/*
 * jsimd_idct_ifast_neon
 *
 * This function contains a fast, not so accurate integer implementation of
 * the inverse DCT (Discrete Cosine Transform).  It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
 * function from jidctfst.c
 *
 * Normally 1-D AAN DCT needs 5 multiplications and 29 additions.
 * But in Arm Neon case some extra additions are required because VQDMULH
 * instruction can't handle the constants larger than 1.  So the expressions
 * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
 * which introduces an extra addition.  Overall, there are 6 extra additions
 * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions.
 */
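
/*
 * For reference, a minimal sketch of the VQDMULH trick described above
 * (the helper name is mine, not part of the original source): VQDMULH.S16
 * returns the high half of the doubled product, i.e. (x * c * 2) >> 16,
 * so a Q15 constant c stands for c / 32768.  Only the fractional part of
 * each AAN constant is stored below; the integer part is re-added with
 * plain VADD instructions:
 *
 *   int16_t vqdmulh_s16(int16_t x, int16_t c) {
 *     return (int16_t)(((int32_t)x * c * 2) >> 16);
 *   }
 *   // x * 1.082392200 ~= x + vqdmulh_s16(x, XFIX_1_082392200)
 *   // x * 2.613125930 ~= 2 * x + vqdmulh_s16(x, XFIX_2_613125930)
 */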

#define XFIX_1_082392200  d0[0]
#define XFIX_1_414213562  d0[1]
#define XFIX_1_847759065  d0[2]
#define XFIX_2_613125930  d0[3]

.balign 16
jsimd_idct_ifast_neon_consts:
  .short (277 * 128 - 256 * 128)  /* XFIX_1_082392200 */
  .short (362 * 128 - 256 * 128)  /* XFIX_1_414213562 */
  .short (473 * 128 - 256 * 128)  /* XFIX_1_847759065 */
  .short (669 * 128 - 512 * 128)  /* XFIX_2_613125930 */

asm_function jsimd_idct_ifast_neon

    DCT_TABLE       .req r0
    COEF_BLOCK      .req r1
    OUTPUT_BUF      .req r2
    OUTPUT_COL      .req r3
    TMP1            .req r0
    TMP2            .req r1
    TMP3            .req r2
    TMP4            .req ip

    /* Load and dequantize coefficients into Neon registers
     * with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | d16     | d17     ( q8  )
     *   1 | d18     | d19     ( q9  )
     *   2 | d20     | d21     ( q10 )
     *   3 | d22     | d23     ( q11 )
     *   4 | d24     | d25     ( q12 )
     *   5 | d26     | d27     ( q13 )
     *   6 | d28     | d29     ( q14 )
     *   7 | d30     | d31     ( q15 )
     */
    adr             ip, jsimd_idct_ifast_neon_consts
    vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vld1.16         {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
    vmul.s16        q8, q8, q0
    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16        q9, q9, q1
    vld1.16         {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
    vmul.s16        q10, q10, q2
    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vmul.s16        q11, q11, q3
    vld1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]
    vmul.s16        q12, q12, q0
    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16        q14, q14, q2
    vmul.s16        q13, q13, q1
    vld1.16         {d0}, [ip, :64]    /* load constants */
    vmul.s16        q15, q15, q3
    vpush           {d8 - d13}         /* save Neon registers */
    /* 1-D IDCT, pass 1 */
    vsub.s16        q2, q10, q14
    vadd.s16        q14, q10, q14
    vsub.s16        q1, q11, q13
    vadd.s16        q13, q11, q13
    vsub.s16        q5, q9, q15
    vadd.s16        q15, q9, q15
    vqdmulh.s16     q4, q2, XFIX_1_414213562
    vqdmulh.s16     q6, q1, XFIX_2_613125930
    vadd.s16        q3, q1, q1
    vsub.s16        q1, q5, q1
    vadd.s16        q10, q2, q4
    vqdmulh.s16     q4, q1, XFIX_1_847759065
    vsub.s16        q2, q15, q13
    vadd.s16        q3, q3, q6
    vqdmulh.s16     q6, q2, XFIX_1_414213562
    vadd.s16        q1, q1, q4
    vqdmulh.s16     q4, q5, XFIX_1_082392200
    vsub.s16        q10, q10, q14
    vadd.s16        q2, q2, q6
    vsub.s16        q6, q8, q12
    vadd.s16        q12, q8, q12
    vadd.s16        q9, q5, q4
    vadd.s16        q5, q6, q10
    vsub.s16        q10, q6, q10
    vadd.s16        q6, q15, q13
    vadd.s16        q8, q12, q14
    vsub.s16        q3, q6, q3
    vsub.s16        q12, q12, q14
    vsub.s16        q3, q3, q1
    vsub.s16        q1, q9, q1
    vadd.s16        q2, q3, q2
    vsub.s16        q15, q8, q6
    vadd.s16        q1, q1, q2
    vadd.s16        q8, q8, q6
    vadd.s16        q14, q5, q3
    vsub.s16        q9, q5, q3
    vsub.s16        q13, q10, q2
    vadd.s16        q10, q10, q2
    /* Transpose */
    vtrn.16         q8, q9
    vsub.s16        q11, q12, q1
    vtrn.16         q14, q15
    vadd.s16        q12, q12, q1
    vtrn.16         q10, q11
    vtrn.16         q12, q13
    vtrn.32         q9, q11
    vtrn.32         q12, q14
    vtrn.32         q8, q10
    vtrn.32         q13, q15
    vswp            d28, d21
    vswp            d26, d19
    /* 1-D IDCT, pass 2 */
    vsub.s16        q2, q10, q14
    vswp            d30, d23
    vadd.s16        q14, q10, q14
    vswp            d24, d17
    vsub.s16        q1, q11, q13
    vadd.s16        q13, q11, q13
    vsub.s16        q5, q9, q15
    vadd.s16        q15, q9, q15
    vqdmulh.s16     q4, q2, XFIX_1_414213562
    vqdmulh.s16     q6, q1, XFIX_2_613125930
    vadd.s16        q3, q1, q1
    vsub.s16        q1, q5, q1
    vadd.s16        q10, q2, q4
    vqdmulh.s16     q4, q1, XFIX_1_847759065
    vsub.s16        q2, q15, q13
    vadd.s16        q3, q3, q6
    vqdmulh.s16     q6, q2, XFIX_1_414213562
    vadd.s16        q1, q1, q4
    vqdmulh.s16     q4, q5, XFIX_1_082392200
    vsub.s16        q10, q10, q14
    vadd.s16        q2, q2, q6
    vsub.s16        q6, q8, q12
    vadd.s16        q12, q8, q12
    vadd.s16        q9, q5, q4
    vadd.s16        q5, q6, q10
    vsub.s16        q10, q6, q10
    vadd.s16        q6, q15, q13
    vadd.s16        q8, q12, q14
    vsub.s16        q3, q6, q3
    vsub.s16        q12, q12, q14
    vsub.s16        q3, q3, q1
    vsub.s16        q1, q9, q1
    vadd.s16        q2, q3, q2
    vsub.s16        q15, q8, q6
    vadd.s16        q1, q1, q2
    vadd.s16        q8, q8, q6
    vadd.s16        q14, q5, q3
    vsub.s16        q9, q5, q3
    vsub.s16        q13, q10, q2
    vpop            {d8 - d13}    /* restore Neon registers */
    vadd.s16        q10, q10, q2
    vsub.s16        q11, q12, q1
    vadd.s16        q12, q12, q1
    /* Descale to 8-bit and range limit */
    vmov.u8         q0, #0x80
    vqshrn.s16      d16, q8, #5
    vqshrn.s16      d17, q9, #5
    vqshrn.s16      d18, q10, #5
    vqshrn.s16      d19, q11, #5
    vqshrn.s16      d20, q12, #5
    vqshrn.s16      d21, q13, #5
    vqshrn.s16      d22, q14, #5
    vqshrn.s16      d23, q15, #5
    vadd.u8         q8, q8, q0
    vadd.u8         q9, q9, q0
    vadd.u8         q10, q10, q0
    vadd.u8         q11, q11, q0
    /* Transpose the final 8-bit samples */
    vtrn.16         q8, q9
    vtrn.16         q10, q11
    vtrn.32         q8, q10
    vtrn.32         q9, q11
    vtrn.8          d16, d17
    vtrn.8          d18, d19
    /* Store results to the output buffer */
    ldmia           OUTPUT_BUF!, {TMP1, TMP2}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    vst1.8          {d16}, [TMP1]
    vst1.8          {d17}, [TMP2]
    ldmia           OUTPUT_BUF!, {TMP1, TMP2}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    vst1.8          {d18}, [TMP1]
    vtrn.8          d20, d21
    vst1.8          {d19}, [TMP2]
    ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    add             TMP3, TMP3, OUTPUT_COL
    add             TMP4, TMP4, OUTPUT_COL
    vst1.8          {d20}, [TMP1]
    vtrn.8          d22, d23
    vst1.8          {d21}, [TMP2]
    vst1.8          {d22}, [TMP3]
    vst1.8          {d23}, [TMP4]
    bx              lr

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4


/*****************************************************************************/

/*
 * jsimd_extrgb_ycc_convert_neon
 * jsimd_extbgr_ycc_convert_neon
 * jsimd_extrgbx_ycc_convert_neon
 * jsimd_extbgrx_ycc_convert_neon
 * jsimd_extxbgr_ycc_convert_neon
 * jsimd_extxrgb_ycc_convert_neon
 *
 * Colorspace conversion RGB -> YCbCr
 */
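
/*
 * For reference, a scalar sketch of the conversion these functions perform
 * (the standard JFIF equations; the tables further below hold the
 * coefficients scaled by 2^16, with the +128 chroma bias and rounding
 * folded into the accumulator start value):
 *
 *   // r, g, b = source components, widened to 32-bit unsigned
 *   uint8_t y  = (19595 * r + 38470 * g + 7471 * b + 32768) >> 16;
 *   uint8_t cb = ((128 << 16) + 32767 - 11059 * r - 21709 * g
 *                 + 32768 * b) >> 16;
 *   uint8_t cr = ((128 << 16) + 32767 + 32768 * r - 27439 * g
 *                 - 5329 * b) >> 16;
 */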

.macro do_store size
  .if \size == 8
    vst1.8          {d20}, [Y]!
    vst1.8          {d21}, [U]!
    vst1.8          {d22}, [V]!
  .elseif \size == 4
    vst1.8          {d20[0]}, [Y]!
    vst1.8          {d20[1]}, [Y]!
    vst1.8          {d20[2]}, [Y]!
    vst1.8          {d20[3]}, [Y]!
    vst1.8          {d21[0]}, [U]!
    vst1.8          {d21[1]}, [U]!
    vst1.8          {d21[2]}, [U]!
    vst1.8          {d21[3]}, [U]!
    vst1.8          {d22[0]}, [V]!
    vst1.8          {d22[1]}, [V]!
    vst1.8          {d22[2]}, [V]!
    vst1.8          {d22[3]}, [V]!
  .elseif \size == 2
    vst1.8          {d20[4]}, [Y]!
    vst1.8          {d20[5]}, [Y]!
    vst1.8          {d21[4]}, [U]!
    vst1.8          {d21[5]}, [U]!
    vst1.8          {d22[4]}, [V]!
    vst1.8          {d22[5]}, [V]!
  .elseif \size == 1
    vst1.8          {d20[6]}, [Y]!
    vst1.8          {d21[6]}, [U]!
    vst1.8          {d22[6]}, [V]!
  .else
    .error unsupported macroblock size
  .endif
.endm

.macro do_load bpp, size
  .if \bpp == 24
    .if \size == 8
      vld3.8        {d10, d11, d12}, [RGB]!
      pld           [RGB, #128]
    .elseif \size == 4
      vld3.8        {d10[0], d11[0], d12[0]}, [RGB]!
      vld3.8        {d10[1], d11[1], d12[1]}, [RGB]!
      vld3.8        {d10[2], d11[2], d12[2]}, [RGB]!
      vld3.8        {d10[3], d11[3], d12[3]}, [RGB]!
    .elseif \size == 2
      vld3.8        {d10[4], d11[4], d12[4]}, [RGB]!
      vld3.8        {d10[5], d11[5], d12[5]}, [RGB]!
    .elseif \size == 1
      vld3.8        {d10[6], d11[6], d12[6]}, [RGB]!
    .else
      .error unsupported macroblock size
    .endif
  .elseif \bpp == 32
    .if \size == 8
      vld4.8        {d10, d11, d12, d13}, [RGB]!
      pld           [RGB, #128]
    .elseif \size == 4
      vld4.8        {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
      vld4.8        {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
      vld4.8        {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
      vld4.8        {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
    .elseif \size == 2
      vld4.8        {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
      vld4.8        {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
    .elseif \size == 1
      vld4.8        {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
    .else
      .error unsupported macroblock size
    .endif
  .else
    .error unsupported bpp
  .endif
.endm
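
/*
 * The \size variants above let the inner loop below deal with the
 * OUTPUT_WIDTH % 8 leftover pixels: they are packed into fixed lanes of
 * one partially filled vector, converted once, and stored with the same
 * lane pattern.  Roughly (a control-flow sketch, not literal code):
 *
 *   if (remaining & 4)  do_load(bpp, 4);  // pixels go to lanes 0-3
 *   if (remaining & 2)  do_load(bpp, 2);  // pixels go to lanes 4-5
 *   if (remaining & 1)  do_load(bpp, 1);  // pixel goes to lane 6
 *   do_rgb_to_yuv();
 *   // do_store(4), do_store(2), do_store(1) mirror the same pattern
 */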

.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs

/*
 * 2-stage pipelined RGB->YCbCr conversion
 */

.macro do_rgb_to_yuv_stage1
    vmovl.u8        q2, d1\r_offs  /* r = { d4, d5 } */
    vmovl.u8        q3, d1\g_offs  /* g = { d6, d7 } */
    vmovl.u8        q4, d1\b_offs  /* b = { d8, d9 } */
    vmull.u16       q7, d4, d0[0]
    vmlal.u16       q7, d6, d0[1]
    vmlal.u16       q7, d8, d0[2]
    vmull.u16       q8, d5, d0[0]
    vmlal.u16       q8, d7, d0[1]
    vmlal.u16       q8, d9, d0[2]
    vrev64.32       q9, q1
    vrev64.32       q13, q1
    vmlsl.u16       q9, d4, d0[3]
    vmlsl.u16       q9, d6, d1[0]
    vmlal.u16       q9, d8, d1[1]
    vmlsl.u16       q13, d5, d0[3]
    vmlsl.u16       q13, d7, d1[0]
    vmlal.u16       q13, d9, d1[1]
    vrev64.32       q14, q1
    vrev64.32       q15, q1
    vmlal.u16       q14, d4, d1[1]
    vmlsl.u16       q14, d6, d1[2]
    vmlsl.u16       q14, d8, d1[3]
    vmlal.u16       q15, d5, d1[1]
    vmlsl.u16       q15, d7, d1[2]
    vmlsl.u16       q15, d9, d1[3]
.endm

.macro do_rgb_to_yuv_stage2
    vrshrn.u32      d20, q7, #16
    vrshrn.u32      d21, q8, #16
    vshrn.u32       d22, q9, #16
    vshrn.u32       d23, q13, #16
    vshrn.u32       d24, q14, #16
    vshrn.u32       d25, q15, #16
    vmovn.u16       d20, q10       /* d20 = y */
    vmovn.u16       d21, q11       /* d21 = u */
    vmovn.u16       d22, q12       /* d22 = v */
.endm

.macro do_rgb_to_yuv
    do_rgb_to_yuv_stage1
    do_rgb_to_yuv_stage2
.endm

.macro do_rgb_to_yuv_stage2_store_load_stage1
    vrshrn.u32      d20, q7, #16
    vrshrn.u32      d21, q8, #16
    vshrn.u32       d22, q9, #16
    vrev64.32       q9, q1
    vshrn.u32       d23, q13, #16
    vrev64.32       q13, q1
    vshrn.u32       d24, q14, #16
    vshrn.u32       d25, q15, #16
    do_load         \bpp, 8
    vmovn.u16       d20, q10       /* d20 = y */
    vmovl.u8        q2, d1\r_offs  /* r = { d4, d5 } */
    vmovn.u16       d21, q11       /* d21 = u */
    vmovl.u8        q3, d1\g_offs  /* g = { d6, d7 } */
    vmovn.u16       d22, q12       /* d22 = v */
    vmovl.u8        q4, d1\b_offs  /* b = { d8, d9 } */
    vmull.u16       q7, d4, d0[0]
    vmlal.u16       q7, d6, d0[1]
    vmlal.u16       q7, d8, d0[2]
    vst1.8          {d20}, [Y]!
    vmull.u16       q8, d5, d0[0]
    vmlal.u16       q8, d7, d0[1]
    vmlal.u16       q8, d9, d0[2]
    vmlsl.u16       q9, d4, d0[3]
    vmlsl.u16       q9, d6, d1[0]
    vmlal.u16       q9, d8, d1[1]
    vst1.8          {d21}, [U]!
    vmlsl.u16       q13, d5, d0[3]
    vmlsl.u16       q13, d7, d1[0]
    vmlal.u16       q13, d9, d1[1]
    vrev64.32       q14, q1
    vrev64.32       q15, q1
    vmlal.u16       q14, d4, d1[1]
    vmlsl.u16       q14, d6, d1[2]
    vmlsl.u16       q14, d8, d1[3]
    vst1.8          {d22}, [V]!
    vmlal.u16       q15, d5, d1[1]
    vmlsl.u16       q15, d7, d1[2]
    vmlsl.u16       q15, d9, d1[3]
.endm
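
/*
 * Layout of the constant table below (my reading, not original
 * commentary): d0 and d1 hold the JFIF coefficients scaled by 2^16
 * (Y: 19595/38470/7471; Cb/Cr: 11059/21709/32768/27439/5329), and d2/d3
 * hold {32767, 128} pairs so that vrev64.32 of q1 seeds each 32-bit
 * chroma accumulator with (128 << 16) + 32767: the +128 bias plus
 * rounding for the truncating vshrn #16 in stage 2.
 */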

.balign 16
jsimd_\colorid\()_ycc_neon_consts:
  .short 19595, 38470, 7471, 11059
  .short 21709, 32768, 27439, 5329
  .short 32767, 128, 32767, 128
  .short 32767, 128, 32767, 128

asm_function jsimd_\colorid\()_ycc_convert_neon
    OUTPUT_WIDTH    .req r0
    INPUT_BUF       .req r1
    OUTPUT_BUF      .req r2
    OUTPUT_ROW      .req r3
    NUM_ROWS        .req r4

    OUTPUT_BUF0     .req r5
    OUTPUT_BUF1     .req r6
    OUTPUT_BUF2     .req OUTPUT_BUF

    RGB             .req r7
    Y               .req r8
    U               .req r9
    V               .req r10
    N               .req ip

    /* Load constants to d0, d1, d2, d3 */
    adr             ip, jsimd_\colorid\()_ycc_neon_consts
    vld1.16         {d0, d1, d2, d3}, [ip, :128]

    /* Save Arm registers and handle input arguments */
    push            {r4, r5, r6, r7, r8, r9, r10, lr}
    ldr             NUM_ROWS, [sp, #(4 * 8)]
    ldr             OUTPUT_BUF0, [OUTPUT_BUF]
    ldr             OUTPUT_BUF1, [OUTPUT_BUF, #4]
    ldr             OUTPUT_BUF2, [OUTPUT_BUF, #8]
    .unreq          OUTPUT_BUF

    /* Save Neon registers */
    vpush           {d8 - d15}

    /* Outer loop over scanlines */
    cmp             NUM_ROWS, #1
    blt             9f
0:
    ldr             Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2]
    ldr             U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2]
    mov             N, OUTPUT_WIDTH
    ldr             V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2]
    add             OUTPUT_ROW, OUTPUT_ROW, #1
    ldr             RGB, [INPUT_BUF], #4

    /* Inner loop over pixels */
    subs            N, N, #8
    blt             3f
    do_load         \bpp, 8
    do_rgb_to_yuv_stage1
    subs            N, N, #8
    blt             2f
1:
    do_rgb_to_yuv_stage2_store_load_stage1
    subs            N, N, #8
    bge             1b
2:
    do_rgb_to_yuv_stage2
    do_store        8
    tst             N, #7
    beq             8f
3:
    tst             N, #4
    beq             3f
    do_load         \bpp, 4
3:
    tst             N, #2
    beq             4f
    do_load         \bpp, 2
4:
    tst             N, #1
    beq             5f
    do_load         \bpp, 1
5:
    do_rgb_to_yuv
    tst             N, #4
    beq             6f
    do_store        4
6:
    tst             N, #2
    beq             7f
    do_store        2
7:
    tst             N, #1
    beq             8f
    do_store        1
8:
    subs            NUM_ROWS, NUM_ROWS, #1
    bgt             0b
9:
    /* Restore all registers and return */
    vpop            {d8 - d15}
    pop             {r4, r5, r6, r7, r8, r9, r10, pc}

    .unreq          OUTPUT_WIDTH
    .unreq          OUTPUT_ROW
    .unreq          INPUT_BUF
    .unreq          NUM_ROWS
    .unreq          OUTPUT_BUF0
    .unreq          OUTPUT_BUF1
    .unreq          OUTPUT_BUF2
    .unreq          RGB
    .unreq          Y
    .unreq          U
    .unreq          V
    .unreq          N

.purgem do_rgb_to_yuv
.purgem do_rgb_to_yuv_stage1
.purgem do_rgb_to_yuv_stage2
.purgem do_rgb_to_yuv_stage2_store_load_stage1

.endm

/*--------------------------------- id ----- bpp R  G  B */
generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2
generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0
generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2
generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0
generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1
generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3

.purgem do_load
.purgem do_store