1/* 2 * ARMv7 NEON optimizations for libjpeg-turbo 3 * 4 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies). 5 * All rights reserved. 6 * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com> 7 * Copyright (C) 2014 Siarhei Siamashka. All Rights Reserved. 8 * Copyright (C) 2014 Linaro Limited. All Rights Reserved. 9 * 10 * This software is provided 'as-is', without any express or implied 11 * warranty. In no event will the authors be held liable for any damages 12 * arising from the use of this software. 13 * 14 * Permission is granted to anyone to use this software for any purpose, 15 * including commercial applications, and to alter it and redistribute it 16 * freely, subject to the following restrictions: 17 * 18 * 1. The origin of this software must not be misrepresented; you must not 19 * claim that you wrote the original software. If you use this software 20 * in a product, an acknowledgment in the product documentation would be 21 * appreciated but is not required. 22 * 2. Altered source versions must be plainly marked as such, and must not be 23 * misrepresented as being the original software. 24 * 3. This notice may not be removed or altered from any source distribution. 25 */ 26 27#if defined(__linux__) && defined(__ELF__) 28.section .note.GNU-stack,"",%progbits /* mark stack as non-executable */ 29#endif 30 31.text 32.fpu neon 33.arch armv7a 34.object_arch armv4 35.arm 36 37 38#define RESPECT_STRICT_ALIGNMENT 1 39 40 41/*****************************************************************************/ 42 43/* Supplementary macro for setting function attributes */ 44.macro asm_function fname 45#ifdef __APPLE__ 46 .globl _\fname 47_\fname: 48#else 49 .global \fname 50#ifdef __ELF__ 51 .hidden \fname 52 .type \fname, %function 53#endif 54\fname: 55#endif 56.endm 57 58/* Transpose a block of 4x4 coefficients in four 64-bit registers */ 59.macro transpose_4x4 x0, x1, x2, x3 60 vtrn.16 \x0, \x1 61 vtrn.16 \x2, \x3 62 vtrn.32 \x0, \x2 63 vtrn.32 \x1, \x3 64.endm 65 66 67#define CENTERJSAMPLE 128 68 69/*****************************************************************************/ 70 71/* 72 * Perform dequantization and inverse DCT on one block of coefficients. 73 * 74 * GLOBAL(void) 75 * jsimd_idct_islow_neon (void * dct_table, JCOEFPTR coef_block, 76 * JSAMPARRAY output_buf, JDIMENSION output_col) 77 */ 78 79#define FIX_0_298631336 (2446) 80#define FIX_0_390180644 (3196) 81#define FIX_0_541196100 (4433) 82#define FIX_0_765366865 (6270) 83#define FIX_0_899976223 (7373) 84#define FIX_1_175875602 (9633) 85#define FIX_1_501321110 (12299) 86#define FIX_1_847759065 (15137) 87#define FIX_1_961570560 (16069) 88#define FIX_2_053119869 (16819) 89#define FIX_2_562915447 (20995) 90#define FIX_3_072711026 (25172) 91 92#define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560) 93#define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644) 94#define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065) 95#define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447) 96#define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223) 97#define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223) 98#define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447) 99#define FIX_0_541196100_PLUS_0_765366865 (FIX_0_541196100 + FIX_0_765366865) 100 101/* 102 * Reference SIMD-friendly 1-D ISLOW iDCT C implementation. 
103 * Uses some ideas from the comments in 'simd/jiss2int-64.asm' 104 */ 105#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) \ 106{ \ 107 DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \ 108 INT32 q1, q2, q3, q4, q5, q6, q7; \ 109 INT32 tmp11_plus_tmp2, tmp11_minus_tmp2; \ 110 \ 111 /* 1-D iDCT input data */ \ 112 row0 = xrow0; \ 113 row1 = xrow1; \ 114 row2 = xrow2; \ 115 row3 = xrow3; \ 116 row4 = xrow4; \ 117 row5 = xrow5; \ 118 row6 = xrow6; \ 119 row7 = xrow7; \ 120 \ 121 q5 = row7 + row3; \ 122 q4 = row5 + row1; \ 123 q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \ 124 MULTIPLY(q4, FIX_1_175875602); \ 125 q7 = MULTIPLY(q5, FIX_1_175875602) + \ 126 MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \ 127 q2 = MULTIPLY(row2, FIX_0_541196100) + \ 128 MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \ 129 q4 = q6; \ 130 q3 = ((INT32) row0 - (INT32) row4) << 13; \ 131 q6 += MULTIPLY(row5, -FIX_2_562915447) + \ 132 MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \ 133 /* now we can use q1 (reloadable constants have been used up) */ \ 134 q1 = q3 + q2; \ 135 q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \ 136 MULTIPLY(row1, -FIX_0_899976223); \ 137 q5 = q7; \ 138 q1 = q1 + q6; \ 139 q7 += MULTIPLY(row7, -FIX_0_899976223) + \ 140 MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \ 141 \ 142 /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \ 143 tmp11_plus_tmp2 = q1; \ 144 row1 = 0; \ 145 \ 146 q1 = q1 - q6; \ 147 q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \ 148 MULTIPLY(row3, -FIX_2_562915447); \ 149 q1 = q1 - q6; \ 150 q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \ 151 MULTIPLY(row6, FIX_0_541196100); \ 152 q3 = q3 - q2; \ 153 \ 154 /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \ 155 tmp11_minus_tmp2 = q1; \ 156 \ 157 q1 = ((INT32) row0 + (INT32) row4) << 13; \ 158 q2 = q1 + q6; \ 159 q1 = q1 - q6; \ 160 \ 161 /* pick up the results */ \ 162 tmp0 = q4; \ 163 tmp1 = q5; \ 164 tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \ 165 tmp3 = q7; \ 166 tmp10 = q2; \ 167 tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \ 168 tmp12 = q3; \ 169 tmp13 = q1; \ 170} 171 172#define XFIX_0_899976223 d0[0] 173#define XFIX_0_541196100 d0[1] 174#define XFIX_2_562915447 d0[2] 175#define XFIX_0_298631336_MINUS_0_899976223 d0[3] 176#define XFIX_1_501321110_MINUS_0_899976223 d1[0] 177#define XFIX_2_053119869_MINUS_2_562915447 d1[1] 178#define XFIX_0_541196100_PLUS_0_765366865 d1[2] 179#define XFIX_1_175875602 d1[3] 180#define XFIX_1_175875602_MINUS_0_390180644 d2[0] 181#define XFIX_0_541196100_MINUS_1_847759065 d2[1] 182#define XFIX_3_072711026_MINUS_2_562915447 d2[2] 183#define XFIX_1_175875602_MINUS_1_961570560 d2[3] 184 185.balign 16 186jsimd_idct_islow_neon_consts: 187 .short FIX_0_899976223 /* d0[0] */ 188 .short FIX_0_541196100 /* d0[1] */ 189 .short FIX_2_562915447 /* d0[2] */ 190 .short FIX_0_298631336_MINUS_0_899976223 /* d0[3] */ 191 .short FIX_1_501321110_MINUS_0_899976223 /* d1[0] */ 192 .short FIX_2_053119869_MINUS_2_562915447 /* d1[1] */ 193 .short FIX_0_541196100_PLUS_0_765366865 /* d1[2] */ 194 .short FIX_1_175875602 /* d1[3] */ 195 /* reloadable constants */ 196 .short FIX_1_175875602_MINUS_0_390180644 /* d2[0] */ 197 .short FIX_0_541196100_MINUS_1_847759065 /* d2[1] */ 198 .short FIX_3_072711026_MINUS_2_562915447 /* d2[2] */ 199 .short FIX_1_175875602_MINUS_1_961570560 /* d2[3] */ 200 201asm_function jsimd_idct_islow_neon 202 203 DCT_TABLE .req r0 204 COEF_BLOCK .req r1 
205 OUTPUT_BUF .req r2 206 OUTPUT_COL .req r3 207 TMP1 .req r0 208 TMP2 .req r1 209 TMP3 .req r2 210 TMP4 .req ip 211 212 ROW0L .req d16 213 ROW0R .req d17 214 ROW1L .req d18 215 ROW1R .req d19 216 ROW2L .req d20 217 ROW2R .req d21 218 ROW3L .req d22 219 ROW3R .req d23 220 ROW4L .req d24 221 ROW4R .req d25 222 ROW5L .req d26 223 ROW5R .req d27 224 ROW6L .req d28 225 ROW6R .req d29 226 ROW7L .req d30 227 ROW7R .req d31 228 229 /* Load and dequantize coefficients into NEON registers 230 * with the following allocation: 231 * 0 1 2 3 | 4 5 6 7 232 * ---------+-------- 233 * 0 | d16 | d17 ( q8 ) 234 * 1 | d18 | d19 ( q9 ) 235 * 2 | d20 | d21 ( q10 ) 236 * 3 | d22 | d23 ( q11 ) 237 * 4 | d24 | d25 ( q12 ) 238 * 5 | d26 | d27 ( q13 ) 239 * 6 | d28 | d29 ( q14 ) 240 * 7 | d30 | d31 ( q15 ) 241 */ 242 adr ip, jsimd_idct_islow_neon_consts 243 vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]! 244 vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]! 245 vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]! 246 vmul.s16 q8, q8, q0 247 vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]! 248 vmul.s16 q9, q9, q1 249 vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]! 250 vmul.s16 q10, q10, q2 251 vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]! 252 vmul.s16 q11, q11, q3 253 vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128] 254 vmul.s16 q12, q12, q0 255 vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]! 256 vmul.s16 q14, q14, q2 257 vmul.s16 q13, q13, q1 258 vld1.16 {d0, d1, d2, d3}, [ip, :128] /* load constants */ 259 add ip, ip, #16 260 vmul.s16 q15, q15, q3 261 vpush {d8-d15} /* save NEON registers */ 262 /* 1-D IDCT, pass 1, left 4x8 half */ 263 vadd.s16 d4, ROW7L, ROW3L 264 vadd.s16 d5, ROW5L, ROW1L 265 vmull.s16 q6, d4, XFIX_1_175875602_MINUS_1_961570560 266 vmlal.s16 q6, d5, XFIX_1_175875602 267 vmull.s16 q7, d4, XFIX_1_175875602 268 /* Check for the zero coefficients in the right 4x8 half */ 269 push {r4, r5} 270 vmlal.s16 q7, d5, XFIX_1_175875602_MINUS_0_390180644 271 vsubl.s16 q3, ROW0L, ROW4L 272 ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))] 273 vmull.s16 q2, ROW2L, XFIX_0_541196100 274 vmlal.s16 q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065 275 orr r0, r4, r5 276 vmov q4, q6 277 vmlsl.s16 q6, ROW5L, XFIX_2_562915447 278 ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))] 279 vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447 280 vshl.s32 q3, q3, #13 281 orr r0, r0, r4 282 vmlsl.s16 q4, ROW1L, XFIX_0_899976223 283 orr r0, r0, r5 284 vadd.s32 q1, q3, q2 285 ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))] 286 vmov q5, q7 287 vadd.s32 q1, q1, q6 288 orr r0, r0, r4 289 vmlsl.s16 q7, ROW7L, XFIX_0_899976223 290 orr r0, r0, r5 291 vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223 292 vrshrn.s32 ROW1L, q1, #11 293 ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))] 294 vsub.s32 q1, q1, q6 295 vmlal.s16 q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447 296 orr r0, r0, r4 297 vmlsl.s16 q5, ROW3L, XFIX_2_562915447 298 orr r0, r0, r5 299 vsub.s32 q1, q1, q6 300 vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865 301 ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))] 302 vmlal.s16 q6, ROW6L, XFIX_0_541196100 303 vsub.s32 q3, q3, q2 304 orr r0, r0, r4 305 vrshrn.s32 ROW6L, q1, #11 306 orr r0, r0, r5 307 vadd.s32 q1, q3, q5 308 ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))] 309 vsub.s32 q3, q3, q5 310 vaddl.s16 q5, ROW0L, ROW4L 311 orr r0, r0, r4 312 vrshrn.s32 ROW2L, q1, #11 313 orr r0, r0, r5 314 vrshrn.s32 ROW5L, q3, #11 315 ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))] 316 vshl.s32 q5, q5, #13 317 vmlal.s16 q4, ROW7L, 
XFIX_0_298631336_MINUS_0_899976223 318 orr r0, r0, r4 319 vadd.s32 q2, q5, q6 320 orrs r0, r0, r5 321 vsub.s32 q1, q5, q6 322 vadd.s32 q6, q2, q7 323 ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))] 324 vsub.s32 q2, q2, q7 325 vadd.s32 q5, q1, q4 326 orr r0, r4, r5 327 vsub.s32 q3, q1, q4 328 pop {r4, r5} 329 vrshrn.s32 ROW7L, q2, #11 330 vrshrn.s32 ROW3L, q5, #11 331 vrshrn.s32 ROW0L, q6, #11 332 vrshrn.s32 ROW4L, q3, #11 333 334 beq 3f /* Go to do some special handling for the sparse right 4x8 half */ 335 336 /* 1-D IDCT, pass 1, right 4x8 half */ 337 vld1.s16 {d2}, [ip, :64] /* reload constants */ 338 vadd.s16 d10, ROW7R, ROW3R 339 vadd.s16 d8, ROW5R, ROW1R 340 /* Transpose left 4x8 half */ 341 vtrn.16 ROW6L, ROW7L 342 vmull.s16 q6, d10, XFIX_1_175875602_MINUS_1_961570560 343 vmlal.s16 q6, d8, XFIX_1_175875602 344 vtrn.16 ROW2L, ROW3L 345 vmull.s16 q7, d10, XFIX_1_175875602 346 vmlal.s16 q7, d8, XFIX_1_175875602_MINUS_0_390180644 347 vtrn.16 ROW0L, ROW1L 348 vsubl.s16 q3, ROW0R, ROW4R 349 vmull.s16 q2, ROW2R, XFIX_0_541196100 350 vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065 351 vtrn.16 ROW4L, ROW5L 352 vmov q4, q6 353 vmlsl.s16 q6, ROW5R, XFIX_2_562915447 354 vmlal.s16 q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447 355 vtrn.32 ROW1L, ROW3L 356 vshl.s32 q3, q3, #13 357 vmlsl.s16 q4, ROW1R, XFIX_0_899976223 358 vtrn.32 ROW4L, ROW6L 359 vadd.s32 q1, q3, q2 360 vmov q5, q7 361 vadd.s32 q1, q1, q6 362 vtrn.32 ROW0L, ROW2L 363 vmlsl.s16 q7, ROW7R, XFIX_0_899976223 364 vmlal.s16 q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223 365 vrshrn.s32 ROW1R, q1, #11 366 vtrn.32 ROW5L, ROW7L 367 vsub.s32 q1, q1, q6 368 vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447 369 vmlsl.s16 q5, ROW3R, XFIX_2_562915447 370 vsub.s32 q1, q1, q6 371 vmull.s16 q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865 372 vmlal.s16 q6, ROW6R, XFIX_0_541196100 373 vsub.s32 q3, q3, q2 374 vrshrn.s32 ROW6R, q1, #11 375 vadd.s32 q1, q3, q5 376 vsub.s32 q3, q3, q5 377 vaddl.s16 q5, ROW0R, ROW4R 378 vrshrn.s32 ROW2R, q1, #11 379 vrshrn.s32 ROW5R, q3, #11 380 vshl.s32 q5, q5, #13 381 vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223 382 vadd.s32 q2, q5, q6 383 vsub.s32 q1, q5, q6 384 vadd.s32 q6, q2, q7 385 vsub.s32 q2, q2, q7 386 vadd.s32 q5, q1, q4 387 vsub.s32 q3, q1, q4 388 vrshrn.s32 ROW7R, q2, #11 389 vrshrn.s32 ROW3R, q5, #11 390 vrshrn.s32 ROW0R, q6, #11 391 vrshrn.s32 ROW4R, q3, #11 392 /* Transpose right 4x8 half */ 393 vtrn.16 ROW6R, ROW7R 394 vtrn.16 ROW2R, ROW3R 395 vtrn.16 ROW0R, ROW1R 396 vtrn.16 ROW4R, ROW5R 397 vtrn.32 ROW1R, ROW3R 398 vtrn.32 ROW4R, ROW6R 399 vtrn.32 ROW0R, ROW2R 400 vtrn.32 ROW5R, ROW7R 401 4021: /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */ 403 vld1.s16 {d2}, [ip, :64] /* reload constants */ 404 vmull.s16 q6, ROW1R, XFIX_1_175875602 /* ROW5L <-> ROW1R */ 405 vmlal.s16 q6, ROW1L, XFIX_1_175875602 406 vmlal.s16 q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */ 407 vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560 408 vmull.s16 q7, ROW3R, XFIX_1_175875602 /* ROW7L <-> ROW3R */ 409 vmlal.s16 q7, ROW3L, XFIX_1_175875602 410 vmlal.s16 q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */ 411 vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644 412 vsubl.s16 q3, ROW0L, ROW0R /* ROW4L <-> ROW0R */ 413 vmull.s16 q2, ROW2L, XFIX_0_541196100 414 vmlal.s16 q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L <-> ROW2R */ 415 vmov q4, q6 416 vmlsl.s16 q6, ROW1R, XFIX_2_562915447 /* ROW5L <-> ROW1R */ 417 vmlal.s16 q6, ROW3L, 
XFIX_3_072711026_MINUS_2_562915447 418 vshl.s32 q3, q3, #13 419 vmlsl.s16 q4, ROW1L, XFIX_0_899976223 420 vadd.s32 q1, q3, q2 421 vmov q5, q7 422 vadd.s32 q1, q1, q6 423 vmlsl.s16 q7, ROW3R, XFIX_0_899976223 /* ROW7L <-> ROW3R */ 424 vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223 425 vshrn.s32 ROW1L, q1, #16 426 vsub.s32 q1, q1, q6 427 vmlal.s16 q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L <-> ROW1R */ 428 vmlsl.s16 q5, ROW3L, XFIX_2_562915447 429 vsub.s32 q1, q1, q6 430 vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865 431 vmlal.s16 q6, ROW2R, XFIX_0_541196100 /* ROW6L <-> ROW2R */ 432 vsub.s32 q3, q3, q2 433 vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */ 434 vadd.s32 q1, q3, q5 435 vsub.s32 q3, q3, q5 436 vaddl.s16 q5, ROW0L, ROW0R /* ROW4L <-> ROW0R */ 437 vshrn.s32 ROW2L, q1, #16 438 vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */ 439 vshl.s32 q5, q5, #13 440 vmlal.s16 q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L <-> ROW3R */ 441 vadd.s32 q2, q5, q6 442 vsub.s32 q1, q5, q6 443 vadd.s32 q6, q2, q7 444 vsub.s32 q2, q2, q7 445 vadd.s32 q5, q1, q4 446 vsub.s32 q3, q1, q4 447 vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */ 448 vshrn.s32 ROW3L, q5, #16 449 vshrn.s32 ROW0L, q6, #16 450 vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */ 451 /* 1-D IDCT, pass 2, right 4x8 half */ 452 vld1.s16 {d2}, [ip, :64] /* reload constants */ 453 vmull.s16 q6, ROW5R, XFIX_1_175875602 454 vmlal.s16 q6, ROW5L, XFIX_1_175875602 /* ROW5L <-> ROW1R */ 455 vmlal.s16 q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560 456 vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */ 457 vmull.s16 q7, ROW7R, XFIX_1_175875602 458 vmlal.s16 q7, ROW7L, XFIX_1_175875602 /* ROW7L <-> ROW3R */ 459 vmlal.s16 q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644 460 vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */ 461 vsubl.s16 q3, ROW4L, ROW4R /* ROW4L <-> ROW0R */ 462 vmull.s16 q2, ROW6L, XFIX_0_541196100 /* ROW6L <-> ROW2R */ 463 vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065 464 vmov q4, q6 465 vmlsl.s16 q6, ROW5R, XFIX_2_562915447 466 vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L <-> ROW3R */ 467 vshl.s32 q3, q3, #13 468 vmlsl.s16 q4, ROW5L, XFIX_0_899976223 /* ROW5L <-> ROW1R */ 469 vadd.s32 q1, q3, q2 470 vmov q5, q7 471 vadd.s32 q1, q1, q6 472 vmlsl.s16 q7, ROW7R, XFIX_0_899976223 473 vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L <-> ROW1R */ 474 vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */ 475 vsub.s32 q1, q1, q6 476 vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447 477 vmlsl.s16 q5, ROW7L, XFIX_2_562915447 /* ROW7L <-> ROW3R */ 478 vsub.s32 q1, q1, q6 479 vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L <-> ROW2R */ 480 vmlal.s16 q6, ROW6R, XFIX_0_541196100 481 vsub.s32 q3, q3, q2 482 vshrn.s32 ROW6R, q1, #16 483 vadd.s32 q1, q3, q5 484 vsub.s32 q3, q3, q5 485 vaddl.s16 q5, ROW4L, ROW4R /* ROW4L <-> ROW0R */ 486 vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */ 487 vshrn.s32 ROW5R, q3, #16 488 vshl.s32 q5, q5, #13 489 vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223 490 vadd.s32 q2, q5, q6 491 vsub.s32 q1, q5, q6 492 vadd.s32 q6, q2, q7 493 vsub.s32 q2, q2, q7 494 vadd.s32 q5, q1, q4 495 vsub.s32 q3, q1, q4 496 vshrn.s32 ROW7R, q2, #16 497 vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */ 498 vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */ 499 vshrn.s32 ROW4R, q3, #16 500 5012: /* Descale to 8-bit and range limit */ 502 vqrshrn.s16 d16, q8, #2 503 vqrshrn.s16 d17, q9, #2 504 
vqrshrn.s16 d18, q10, #2 505 vqrshrn.s16 d19, q11, #2 506 vpop {d8-d15} /* restore NEON registers */ 507 vqrshrn.s16 d20, q12, #2 508 /* Transpose the final 8-bit samples and do signed->unsigned conversion */ 509 vtrn.16 q8, q9 510 vqrshrn.s16 d21, q13, #2 511 vqrshrn.s16 d22, q14, #2 512 vmov.u8 q0, #(CENTERJSAMPLE) 513 vqrshrn.s16 d23, q15, #2 514 vtrn.8 d16, d17 515 vtrn.8 d18, d19 516 vadd.u8 q8, q8, q0 517 vadd.u8 q9, q9, q0 518 vtrn.16 q10, q11 519 /* Store results to the output buffer */ 520 ldmia OUTPUT_BUF!, {TMP1, TMP2} 521 add TMP1, TMP1, OUTPUT_COL 522 add TMP2, TMP2, OUTPUT_COL 523 vst1.8 {d16}, [TMP1] 524 vtrn.8 d20, d21 525 vst1.8 {d17}, [TMP2] 526 ldmia OUTPUT_BUF!, {TMP1, TMP2} 527 add TMP1, TMP1, OUTPUT_COL 528 add TMP2, TMP2, OUTPUT_COL 529 vst1.8 {d18}, [TMP1] 530 vadd.u8 q10, q10, q0 531 vst1.8 {d19}, [TMP2] 532 ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4} 533 add TMP1, TMP1, OUTPUT_COL 534 add TMP2, TMP2, OUTPUT_COL 535 add TMP3, TMP3, OUTPUT_COL 536 add TMP4, TMP4, OUTPUT_COL 537 vtrn.8 d22, d23 538 vst1.8 {d20}, [TMP1] 539 vadd.u8 q11, q11, q0 540 vst1.8 {d21}, [TMP2] 541 vst1.8 {d22}, [TMP3] 542 vst1.8 {d23}, [TMP4] 543 bx lr 544 5453: /* Left 4x8 half is done, right 4x8 half contains mostly zeros */ 546 547 /* Transpose left 4x8 half */ 548 vtrn.16 ROW6L, ROW7L 549 vtrn.16 ROW2L, ROW3L 550 vtrn.16 ROW0L, ROW1L 551 vtrn.16 ROW4L, ROW5L 552 vshl.s16 ROW0R, ROW0R, #2 /* PASS1_BITS */ 553 vtrn.32 ROW1L, ROW3L 554 vtrn.32 ROW4L, ROW6L 555 vtrn.32 ROW0L, ROW2L 556 vtrn.32 ROW5L, ROW7L 557 558 cmp r0, #0 559 beq 4f /* Right 4x8 half has all zeros, go to 'sparse' second pass */ 560 561 /* Only row 0 is non-zero for the right 4x8 half */ 562 vdup.s16 ROW1R, ROW0R[1] 563 vdup.s16 ROW2R, ROW0R[2] 564 vdup.s16 ROW3R, ROW0R[3] 565 vdup.s16 ROW4R, ROW0R[0] 566 vdup.s16 ROW5R, ROW0R[1] 567 vdup.s16 ROW6R, ROW0R[2] 568 vdup.s16 ROW7R, ROW0R[3] 569 vdup.s16 ROW0R, ROW0R[0] 570 b 1b /* Go to 'normal' second pass */ 571 5724: /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */ 573 vld1.s16 {d2}, [ip, :64] /* reload constants */ 574 vmull.s16 q6, ROW1L, XFIX_1_175875602 575 vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560 576 vmull.s16 q7, ROW3L, XFIX_1_175875602 577 vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644 578 vmull.s16 q2, ROW2L, XFIX_0_541196100 579 vshll.s16 q3, ROW0L, #13 580 vmov q4, q6 581 vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447 582 vmlsl.s16 q4, ROW1L, XFIX_0_899976223 583 vadd.s32 q1, q3, q2 584 vmov q5, q7 585 vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223 586 vadd.s32 q1, q1, q6 587 vadd.s32 q6, q6, q6 588 vmlsl.s16 q5, ROW3L, XFIX_2_562915447 589 vshrn.s32 ROW1L, q1, #16 590 vsub.s32 q1, q1, q6 591 vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865 592 vsub.s32 q3, q3, q2 593 vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */ 594 vadd.s32 q1, q3, q5 595 vsub.s32 q3, q3, q5 596 vshll.s16 q5, ROW0L, #13 597 vshrn.s32 ROW2L, q1, #16 598 vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */ 599 vadd.s32 q2, q5, q6 600 vsub.s32 q1, q5, q6 601 vadd.s32 q6, q2, q7 602 vsub.s32 q2, q2, q7 603 vadd.s32 q5, q1, q4 604 vsub.s32 q3, q1, q4 605 vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */ 606 vshrn.s32 ROW3L, q5, #16 607 vshrn.s32 ROW0L, q6, #16 608 vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */ 609 /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */ 610 vld1.s16 {d2}, [ip, :64] /* reload constants */ 611 vmull.s16 q6, ROW5L, XFIX_1_175875602 612 vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560 
613 vmull.s16 q7, ROW7L, XFIX_1_175875602 614 vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644 615 vmull.s16 q2, ROW6L, XFIX_0_541196100 616 vshll.s16 q3, ROW4L, #13 617 vmov q4, q6 618 vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447 619 vmlsl.s16 q4, ROW5L, XFIX_0_899976223 620 vadd.s32 q1, q3, q2 621 vmov q5, q7 622 vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223 623 vadd.s32 q1, q1, q6 624 vadd.s32 q6, q6, q6 625 vmlsl.s16 q5, ROW7L, XFIX_2_562915447 626 vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */ 627 vsub.s32 q1, q1, q6 628 vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865 629 vsub.s32 q3, q3, q2 630 vshrn.s32 ROW6R, q1, #16 631 vadd.s32 q1, q3, q5 632 vsub.s32 q3, q3, q5 633 vshll.s16 q5, ROW4L, #13 634 vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */ 635 vshrn.s32 ROW5R, q3, #16 636 vadd.s32 q2, q5, q6 637 vsub.s32 q1, q5, q6 638 vadd.s32 q6, q2, q7 639 vsub.s32 q2, q2, q7 640 vadd.s32 q5, q1, q4 641 vsub.s32 q3, q1, q4 642 vshrn.s32 ROW7R, q2, #16 643 vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */ 644 vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */ 645 vshrn.s32 ROW4R, q3, #16 646 b 2b /* Go to epilogue */ 647 648 .unreq DCT_TABLE 649 .unreq COEF_BLOCK 650 .unreq OUTPUT_BUF 651 .unreq OUTPUT_COL 652 .unreq TMP1 653 .unreq TMP2 654 .unreq TMP3 655 .unreq TMP4 656 657 .unreq ROW0L 658 .unreq ROW0R 659 .unreq ROW1L 660 .unreq ROW1R 661 .unreq ROW2L 662 .unreq ROW2R 663 .unreq ROW3L 664 .unreq ROW3R 665 .unreq ROW4L 666 .unreq ROW4R 667 .unreq ROW5L 668 .unreq ROW5R 669 .unreq ROW6L 670 .unreq ROW6R 671 .unreq ROW7L 672 .unreq ROW7R 673 674 675/*****************************************************************************/ 676 677/* 678 * jsimd_idct_ifast_neon 679 * 680 * This function contains a fast, not so accurate integer implementation of 681 * the inverse DCT (Discrete Cosine Transform). It uses the same calculations 682 * and produces exactly the same output as IJG's original 'jpeg_idct_ifast' 683 * function from jidctfst.c 684 * 685 * Normally 1-D AAN DCT needs 5 multiplications and 29 additions. 686 * But in ARM NEON case some extra additions are required because VQDMULH 687 * instruction can't handle the constants larger than 1. So the expressions 688 * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x", 689 * which introduces an extra addition. Overall, there are 6 extra additions 690 * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions. 
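 *
 * As a concrete example (using the constants defined below): VQDMULH.S16
 * computes roughly (a * b) >> 15, and the halfword stored for
 * XFIX_1_082392200 is (277 * 128 - 256 * 128) = 2688, so
 *
 *    x * 1.082392200  ~=  x + VQDMULH(x, 2688)
 *
 * because 2688 / 32768 ~= 0.082392200. The one constant larger than 2 is
 * handled the same way, except that "x + x" is added instead of "x":
 * x * 2.613125930 ~= x + x + VQDMULH(x, 20096).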
691 */ 692 693#define XFIX_1_082392200 d0[0] 694#define XFIX_1_414213562 d0[1] 695#define XFIX_1_847759065 d0[2] 696#define XFIX_2_613125930 d0[3] 697 698.balign 16 699jsimd_idct_ifast_neon_consts: 700 .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */ 701 .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */ 702 .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */ 703 .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */ 704 705asm_function jsimd_idct_ifast_neon 706 707 DCT_TABLE .req r0 708 COEF_BLOCK .req r1 709 OUTPUT_BUF .req r2 710 OUTPUT_COL .req r3 711 TMP1 .req r0 712 TMP2 .req r1 713 TMP3 .req r2 714 TMP4 .req ip 715 716 /* Load and dequantize coefficients into NEON registers 717 * with the following allocation: 718 * 0 1 2 3 | 4 5 6 7 719 * ---------+-------- 720 * 0 | d16 | d17 ( q8 ) 721 * 1 | d18 | d19 ( q9 ) 722 * 2 | d20 | d21 ( q10 ) 723 * 3 | d22 | d23 ( q11 ) 724 * 4 | d24 | d25 ( q12 ) 725 * 5 | d26 | d27 ( q13 ) 726 * 6 | d28 | d29 ( q14 ) 727 * 7 | d30 | d31 ( q15 ) 728 */ 729 adr ip, jsimd_idct_ifast_neon_consts 730 vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]! 731 vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]! 732 vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]! 733 vmul.s16 q8, q8, q0 734 vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]! 735 vmul.s16 q9, q9, q1 736 vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]! 737 vmul.s16 q10, q10, q2 738 vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]! 739 vmul.s16 q11, q11, q3 740 vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128] 741 vmul.s16 q12, q12, q0 742 vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]! 743 vmul.s16 q14, q14, q2 744 vmul.s16 q13, q13, q1 745 vld1.16 {d0}, [ip, :64] /* load constants */ 746 vmul.s16 q15, q15, q3 747 vpush {d8-d13} /* save NEON registers */ 748 /* 1-D IDCT, pass 1 */ 749 vsub.s16 q2, q10, q14 750 vadd.s16 q14, q10, q14 751 vsub.s16 q1, q11, q13 752 vadd.s16 q13, q11, q13 753 vsub.s16 q5, q9, q15 754 vadd.s16 q15, q9, q15 755 vqdmulh.s16 q4, q2, XFIX_1_414213562 756 vqdmulh.s16 q6, q1, XFIX_2_613125930 757 vadd.s16 q3, q1, q1 758 vsub.s16 q1, q5, q1 759 vadd.s16 q10, q2, q4 760 vqdmulh.s16 q4, q1, XFIX_1_847759065 761 vsub.s16 q2, q15, q13 762 vadd.s16 q3, q3, q6 763 vqdmulh.s16 q6, q2, XFIX_1_414213562 764 vadd.s16 q1, q1, q4 765 vqdmulh.s16 q4, q5, XFIX_1_082392200 766 vsub.s16 q10, q10, q14 767 vadd.s16 q2, q2, q6 768 vsub.s16 q6, q8, q12 769 vadd.s16 q12, q8, q12 770 vadd.s16 q9, q5, q4 771 vadd.s16 q5, q6, q10 772 vsub.s16 q10, q6, q10 773 vadd.s16 q6, q15, q13 774 vadd.s16 q8, q12, q14 775 vsub.s16 q3, q6, q3 776 vsub.s16 q12, q12, q14 777 vsub.s16 q3, q3, q1 778 vsub.s16 q1, q9, q1 779 vadd.s16 q2, q3, q2 780 vsub.s16 q15, q8, q6 781 vadd.s16 q1, q1, q2 782 vadd.s16 q8, q8, q6 783 vadd.s16 q14, q5, q3 784 vsub.s16 q9, q5, q3 785 vsub.s16 q13, q10, q2 786 vadd.s16 q10, q10, q2 787 /* Transpose */ 788 vtrn.16 q8, q9 789 vsub.s16 q11, q12, q1 790 vtrn.16 q14, q15 791 vadd.s16 q12, q12, q1 792 vtrn.16 q10, q11 793 vtrn.16 q12, q13 794 vtrn.32 q9, q11 795 vtrn.32 q12, q14 796 vtrn.32 q8, q10 797 vtrn.32 q13, q15 798 vswp d28, d21 799 vswp d26, d19 800 /* 1-D IDCT, pass 2 */ 801 vsub.s16 q2, q10, q14 802 vswp d30, d23 803 vadd.s16 q14, q10, q14 804 vswp d24, d17 805 vsub.s16 q1, q11, q13 806 vadd.s16 q13, q11, q13 807 vsub.s16 q5, q9, q15 808 vadd.s16 q15, q9, q15 809 vqdmulh.s16 q4, q2, XFIX_1_414213562 810 vqdmulh.s16 q6, q1, XFIX_2_613125930 811 vadd.s16 q3, q1, q1 812 vsub.s16 q1, q5, q1 813 vadd.s16 q10, q2, q4 814 vqdmulh.s16 q4, q1, XFIX_1_847759065 815 vsub.s16 q2, q15, q13 816 vadd.s16 
q3, q3, q6 817 vqdmulh.s16 q6, q2, XFIX_1_414213562 818 vadd.s16 q1, q1, q4 819 vqdmulh.s16 q4, q5, XFIX_1_082392200 820 vsub.s16 q10, q10, q14 821 vadd.s16 q2, q2, q6 822 vsub.s16 q6, q8, q12 823 vadd.s16 q12, q8, q12 824 vadd.s16 q9, q5, q4 825 vadd.s16 q5, q6, q10 826 vsub.s16 q10, q6, q10 827 vadd.s16 q6, q15, q13 828 vadd.s16 q8, q12, q14 829 vsub.s16 q3, q6, q3 830 vsub.s16 q12, q12, q14 831 vsub.s16 q3, q3, q1 832 vsub.s16 q1, q9, q1 833 vadd.s16 q2, q3, q2 834 vsub.s16 q15, q8, q6 835 vadd.s16 q1, q1, q2 836 vadd.s16 q8, q8, q6 837 vadd.s16 q14, q5, q3 838 vsub.s16 q9, q5, q3 839 vsub.s16 q13, q10, q2 840 vpop {d8-d13} /* restore NEON registers */ 841 vadd.s16 q10, q10, q2 842 vsub.s16 q11, q12, q1 843 vadd.s16 q12, q12, q1 844 /* Descale to 8-bit and range limit */ 845 vmov.u8 q0, #0x80 846 vqshrn.s16 d16, q8, #5 847 vqshrn.s16 d17, q9, #5 848 vqshrn.s16 d18, q10, #5 849 vqshrn.s16 d19, q11, #5 850 vqshrn.s16 d20, q12, #5 851 vqshrn.s16 d21, q13, #5 852 vqshrn.s16 d22, q14, #5 853 vqshrn.s16 d23, q15, #5 854 vadd.u8 q8, q8, q0 855 vadd.u8 q9, q9, q0 856 vadd.u8 q10, q10, q0 857 vadd.u8 q11, q11, q0 858 /* Transpose the final 8-bit samples */ 859 vtrn.16 q8, q9 860 vtrn.16 q10, q11 861 vtrn.32 q8, q10 862 vtrn.32 q9, q11 863 vtrn.8 d16, d17 864 vtrn.8 d18, d19 865 /* Store results to the output buffer */ 866 ldmia OUTPUT_BUF!, {TMP1, TMP2} 867 add TMP1, TMP1, OUTPUT_COL 868 add TMP2, TMP2, OUTPUT_COL 869 vst1.8 {d16}, [TMP1] 870 vst1.8 {d17}, [TMP2] 871 ldmia OUTPUT_BUF!, {TMP1, TMP2} 872 add TMP1, TMP1, OUTPUT_COL 873 add TMP2, TMP2, OUTPUT_COL 874 vst1.8 {d18}, [TMP1] 875 vtrn.8 d20, d21 876 vst1.8 {d19}, [TMP2] 877 ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4} 878 add TMP1, TMP1, OUTPUT_COL 879 add TMP2, TMP2, OUTPUT_COL 880 add TMP3, TMP3, OUTPUT_COL 881 add TMP4, TMP4, OUTPUT_COL 882 vst1.8 {d20}, [TMP1] 883 vtrn.8 d22, d23 884 vst1.8 {d21}, [TMP2] 885 vst1.8 {d22}, [TMP3] 886 vst1.8 {d23}, [TMP4] 887 bx lr 888 889 .unreq DCT_TABLE 890 .unreq COEF_BLOCK 891 .unreq OUTPUT_BUF 892 .unreq OUTPUT_COL 893 .unreq TMP1 894 .unreq TMP2 895 .unreq TMP3 896 .unreq TMP4 897 898 899/*****************************************************************************/ 900 901/* 902 * jsimd_idct_4x4_neon 903 * 904 * This function contains inverse-DCT code for getting reduced-size 905 * 4x4 pixels output from an 8x8 DCT block. It uses the same calculations 906 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4' 907 * function from jpeg-6b (jidctred.c). 908 * 909 * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which 910 * requires much less arithmetic operations and hence should be faster. 911 * The primary purpose of this particular NEON optimized function is 912 * bit exact compatibility with jpeg-6b. 913 * 914 * TODO: a bit better instructions scheduling can be achieved by expanding 915 * idct_helper/transpose_4x4 macros and reordering instructions, 916 * but readability will suffer somewhat. 
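 *
 * A note on the descaling in 'idct_helper' below: the first pass descales
 * by 12 bits and the second pass by 19 bits (see the two idct_helper
 * invocations). Since the narrowing shift VRSHRN.I32 only accepts shift
 * immediates in the range 1..16, the macro switches to a VRSHR.S32 +
 * VMOVN.S32 pair whenever the requested shift exceeds 16.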
917 */ 918 919#define CONST_BITS 13 920 921#define FIX_0_211164243 (1730) /* FIX(0.211164243) */ 922#define FIX_0_509795579 (4176) /* FIX(0.509795579) */ 923#define FIX_0_601344887 (4926) /* FIX(0.601344887) */ 924#define FIX_0_720959822 (5906) /* FIX(0.720959822) */ 925#define FIX_0_765366865 (6270) /* FIX(0.765366865) */ 926#define FIX_0_850430095 (6967) /* FIX(0.850430095) */ 927#define FIX_0_899976223 (7373) /* FIX(0.899976223) */ 928#define FIX_1_061594337 (8697) /* FIX(1.061594337) */ 929#define FIX_1_272758580 (10426) /* FIX(1.272758580) */ 930#define FIX_1_451774981 (11893) /* FIX(1.451774981) */ 931#define FIX_1_847759065 (15137) /* FIX(1.847759065) */ 932#define FIX_2_172734803 (17799) /* FIX(2.172734803) */ 933#define FIX_2_562915447 (20995) /* FIX(2.562915447) */ 934#define FIX_3_624509785 (29692) /* FIX(3.624509785) */ 935 936.balign 16 937jsimd_idct_4x4_neon_consts: 938 .short FIX_1_847759065 /* d0[0] */ 939 .short -FIX_0_765366865 /* d0[1] */ 940 .short -FIX_0_211164243 /* d0[2] */ 941 .short FIX_1_451774981 /* d0[3] */ 942 .short -FIX_2_172734803 /* d1[0] */ 943 .short FIX_1_061594337 /* d1[1] */ 944 .short -FIX_0_509795579 /* d1[2] */ 945 .short -FIX_0_601344887 /* d1[3] */ 946 .short FIX_0_899976223 /* d2[0] */ 947 .short FIX_2_562915447 /* d2[1] */ 948 .short 1 << (CONST_BITS+1) /* d2[2] */ 949 .short 0 /* d2[3] */ 950 951.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29 952 vmull.s16 q14, \x4, d2[2] 953 vmlal.s16 q14, \x8, d0[0] 954 vmlal.s16 q14, \x14, d0[1] 955 956 vmull.s16 q13, \x16, d1[2] 957 vmlal.s16 q13, \x12, d1[3] 958 vmlal.s16 q13, \x10, d2[0] 959 vmlal.s16 q13, \x6, d2[1] 960 961 vmull.s16 q15, \x4, d2[2] 962 vmlsl.s16 q15, \x8, d0[0] 963 vmlsl.s16 q15, \x14, d0[1] 964 965 vmull.s16 q12, \x16, d0[2] 966 vmlal.s16 q12, \x12, d0[3] 967 vmlal.s16 q12, \x10, d1[0] 968 vmlal.s16 q12, \x6, d1[1] 969 970 vadd.s32 q10, q14, q13 971 vsub.s32 q14, q14, q13 972 973.if \shift > 16 974 vrshr.s32 q10, q10, #\shift 975 vrshr.s32 q14, q14, #\shift 976 vmovn.s32 \y26, q10 977 vmovn.s32 \y29, q14 978.else 979 vrshrn.s32 \y26, q10, #\shift 980 vrshrn.s32 \y29, q14, #\shift 981.endif 982 983 vadd.s32 q10, q15, q12 984 vsub.s32 q15, q15, q12 985 986.if \shift > 16 987 vrshr.s32 q10, q10, #\shift 988 vrshr.s32 q15, q15, #\shift 989 vmovn.s32 \y27, q10 990 vmovn.s32 \y28, q15 991.else 992 vrshrn.s32 \y27, q10, #\shift 993 vrshrn.s32 \y28, q15, #\shift 994.endif 995 996.endm 997 998asm_function jsimd_idct_4x4_neon 999 1000 DCT_TABLE .req r0 1001 COEF_BLOCK .req r1 1002 OUTPUT_BUF .req r2 1003 OUTPUT_COL .req r3 1004 TMP1 .req r0 1005 TMP2 .req r1 1006 TMP3 .req r2 1007 TMP4 .req ip 1008 1009 vpush {d8-d15} 1010 1011 /* Load constants (d3 is just used for padding) */ 1012 adr TMP4, jsimd_idct_4x4_neon_consts 1013 vld1.16 {d0, d1, d2, d3}, [TMP4, :128] 1014 1015 /* Load all COEF_BLOCK into NEON registers with the following allocation: 1016 * 0 1 2 3 | 4 5 6 7 1017 * ---------+-------- 1018 * 0 | d4 | d5 1019 * 1 | d6 | d7 1020 * 2 | d8 | d9 1021 * 3 | d10 | d11 1022 * 4 | - | - 1023 * 5 | d12 | d13 1024 * 6 | d14 | d15 1025 * 7 | d16 | d17 1026 */ 1027 vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK, :128]! 1028 vld1.16 {d8, d9, d10, d11}, [COEF_BLOCK, :128]! 1029 add COEF_BLOCK, COEF_BLOCK, #16 1030 vld1.16 {d12, d13, d14, d15}, [COEF_BLOCK, :128]! 1031 vld1.16 {d16, d17}, [COEF_BLOCK, :128]! 1032 /* dequantize */ 1033 vld1.16 {d18, d19, d20, d21}, [DCT_TABLE, :128]! 1034 vmul.s16 q2, q2, q9 1035 vld1.16 {d22, d23, d24, d25}, [DCT_TABLE, :128]! 
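    /* Note: the quantization table is consumed in the same pattern as the
     * coefficients above; row 4 is skipped (the "add DCT_TABLE, DCT_TABLE,
     * #16" below) because coefficient row 4 is never loaded by this function.
     */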
    vmul.s16        q3, q3, q10
    vmul.s16        q4, q4, q11
    add             DCT_TABLE, DCT_TABLE, #16
    vld1.16         {d26, d27, d28, d29}, [DCT_TABLE, :128]!
    vmul.s16        q5, q5, q12
    vmul.s16        q6, q6, q13
    vld1.16         {d30, d31}, [DCT_TABLE, :128]!
    vmul.s16        q7, q7, q14
    vmul.s16        q8, q8, q15

    /* Pass 1 */
    idct_helper     d4, d6, d8, d10, d12, d14, d16, 12, d4, d6, d8, d10
    transpose_4x4   d4, d6, d8, d10
    idct_helper     d5, d7, d9, d11, d13, d15, d17, 12, d5, d7, d9, d11
    transpose_4x4   d5, d7, d9, d11

    /* Pass 2 */
    idct_helper     d4, d6, d8, d10, d7, d9, d11, 19, d26, d27, d28, d29
    transpose_4x4   d26, d27, d28, d29

    /* Range limit */
    vmov.u16        q15, #0x80
    vadd.s16        q13, q13, q15
    vadd.s16        q14, q14, q15
    vqmovun.s16     d26, q13
    vqmovun.s16     d27, q14

    /* Store results to the output buffer */
    ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    add             TMP3, TMP3, OUTPUT_COL
    add             TMP4, TMP4, OUTPUT_COL

#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
    /* We can use far fewer instructions on little-endian systems if the
     * OS kernel is not configured to trap unaligned memory accesses
     */
    vst1.32         {d26[0]}, [TMP1]!
    vst1.32         {d27[0]}, [TMP3]!
    vst1.32         {d26[1]}, [TMP2]!
    vst1.32         {d27[1]}, [TMP4]!
#else
    vst1.8          {d26[0]}, [TMP1]!
    vst1.8          {d27[0]}, [TMP3]!
    vst1.8          {d26[1]}, [TMP1]!
    vst1.8          {d27[1]}, [TMP3]!
    vst1.8          {d26[2]}, [TMP1]!
    vst1.8          {d27[2]}, [TMP3]!
    vst1.8          {d26[3]}, [TMP1]!
    vst1.8          {d27[3]}, [TMP3]!

    vst1.8          {d26[4]}, [TMP2]!
    vst1.8          {d27[4]}, [TMP4]!
    vst1.8          {d26[5]}, [TMP2]!
    vst1.8          {d27[5]}, [TMP4]!
    vst1.8          {d26[6]}, [TMP2]!
    vst1.8          {d27[6]}, [TMP4]!
    vst1.8          {d26[7]}, [TMP2]!
    vst1.8          {d27[7]}, [TMP4]!
#endif

    vpop            {d8-d15}
    bx              lr

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4

.purgem idct_helper


/*****************************************************************************/

/*
 * jsimd_idct_2x2_neon
 *
 * This function contains inverse-DCT code for getting reduced-size
 * 2x2 pixels output from an 8x8 DCT block. It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
 * function from jpeg-6b (jidctred.c).
 *
 * NOTE: jpeg-8 has an improved implementation of the 2x2 inverse DCT, which
 *       requires far fewer arithmetic operations and hence should be faster.
 *       The primary purpose of this particular NEON-optimized function is
 *       bit-exact compatibility with jpeg-6b.
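 *
 * Writing c0, c1, c3, c5, c7 for the dequantized coefficients of one
 * column (rows 2, 4 and 6 are not used), each 1-D pass below computes
 *
 *    tmp  = 3.624509785 * c1 - 1.272758580 * c3
 *           + 0.850430095 * c5 - 0.720959822 * c7    (13-bit fixed point)
 *    out0 = (c0 * 2^15 + tmp) >> shift
 *    out1 = (c0 * 2^15 - tmp) >> shift
 *
 * with a rounding shift of 13 in the first pass and 20 in the second.
 * This is the arithmetic performed by the 'idct_helper' macro (the first
 * pass below is open-coded but does the same thing).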
1127 */ 1128 1129.balign 8 1130jsimd_idct_2x2_neon_consts: 1131 .short -FIX_0_720959822 /* d0[0] */ 1132 .short FIX_0_850430095 /* d0[1] */ 1133 .short -FIX_1_272758580 /* d0[2] */ 1134 .short FIX_3_624509785 /* d0[3] */ 1135 1136.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27 1137 vshll.s16 q14, \x4, #15 1138 vmull.s16 q13, \x6, d0[3] 1139 vmlal.s16 q13, \x10, d0[2] 1140 vmlal.s16 q13, \x12, d0[1] 1141 vmlal.s16 q13, \x16, d0[0] 1142 1143 vadd.s32 q10, q14, q13 1144 vsub.s32 q14, q14, q13 1145 1146.if \shift > 16 1147 vrshr.s32 q10, q10, #\shift 1148 vrshr.s32 q14, q14, #\shift 1149 vmovn.s32 \y26, q10 1150 vmovn.s32 \y27, q14 1151.else 1152 vrshrn.s32 \y26, q10, #\shift 1153 vrshrn.s32 \y27, q14, #\shift 1154.endif 1155 1156.endm 1157 1158asm_function jsimd_idct_2x2_neon 1159 1160 DCT_TABLE .req r0 1161 COEF_BLOCK .req r1 1162 OUTPUT_BUF .req r2 1163 OUTPUT_COL .req r3 1164 TMP1 .req r0 1165 TMP2 .req ip 1166 1167 vpush {d8-d15} 1168 1169 /* Load constants */ 1170 adr TMP2, jsimd_idct_2x2_neon_consts 1171 vld1.16 {d0}, [TMP2, :64] 1172 1173 /* Load all COEF_BLOCK into NEON registers with the following allocation: 1174 * 0 1 2 3 | 4 5 6 7 1175 * ---------+-------- 1176 * 0 | d4 | d5 1177 * 1 | d6 | d7 1178 * 2 | - | - 1179 * 3 | d10 | d11 1180 * 4 | - | - 1181 * 5 | d12 | d13 1182 * 6 | - | - 1183 * 7 | d16 | d17 1184 */ 1185 vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK, :128]! 1186 add COEF_BLOCK, COEF_BLOCK, #16 1187 vld1.16 {d10, d11}, [COEF_BLOCK, :128]! 1188 add COEF_BLOCK, COEF_BLOCK, #16 1189 vld1.16 {d12, d13}, [COEF_BLOCK, :128]! 1190 add COEF_BLOCK, COEF_BLOCK, #16 1191 vld1.16 {d16, d17}, [COEF_BLOCK, :128]! 1192 /* Dequantize */ 1193 vld1.16 {d18, d19, d20, d21}, [DCT_TABLE, :128]! 1194 vmul.s16 q2, q2, q9 1195 vmul.s16 q3, q3, q10 1196 add DCT_TABLE, DCT_TABLE, #16 1197 vld1.16 {d24, d25}, [DCT_TABLE, :128]! 1198 vmul.s16 q5, q5, q12 1199 add DCT_TABLE, DCT_TABLE, #16 1200 vld1.16 {d26, d27}, [DCT_TABLE, :128]! 1201 vmul.s16 q6, q6, q13 1202 add DCT_TABLE, DCT_TABLE, #16 1203 vld1.16 {d30, d31}, [DCT_TABLE, :128]! 1204 vmul.s16 q8, q8, q15 1205 1206 /* Pass 1 */ 1207#if 0 1208 idct_helper d4, d6, d10, d12, d16, 13, d4, d6 1209 transpose_4x4 d4, d6, d8, d10 1210 idct_helper d5, d7, d11, d13, d17, 13, d5, d7 1211 transpose_4x4 d5, d7, d9, d11 1212#else 1213 vmull.s16 q13, d6, d0[3] 1214 vmlal.s16 q13, d10, d0[2] 1215 vmlal.s16 q13, d12, d0[1] 1216 vmlal.s16 q13, d16, d0[0] 1217 vmull.s16 q12, d7, d0[3] 1218 vmlal.s16 q12, d11, d0[2] 1219 vmlal.s16 q12, d13, d0[1] 1220 vmlal.s16 q12, d17, d0[0] 1221 vshll.s16 q14, d4, #15 1222 vshll.s16 q15, d5, #15 1223 vadd.s32 q10, q14, q13 1224 vsub.s32 q14, q14, q13 1225 vrshrn.s32 d4, q10, #13 1226 vrshrn.s32 d6, q14, #13 1227 vadd.s32 q10, q15, q12 1228 vsub.s32 q14, q15, q12 1229 vrshrn.s32 d5, q10, #13 1230 vrshrn.s32 d7, q14, #13 1231 vtrn.16 q2, q3 1232 vtrn.32 q3, q5 1233#endif 1234 1235 /* Pass 2 */ 1236 idct_helper d4, d6, d10, d7, d11, 20, d26, d27 1237 1238 /* Range limit */ 1239 vmov.u16 q15, #0x80 1240 vadd.s16 q13, q13, q15 1241 vqmovun.s16 d26, q13 1242 vqmovun.s16 d27, q13 1243 1244 /* Store results to the output buffer */ 1245 ldmia OUTPUT_BUF, {TMP1, TMP2} 1246 add TMP1, TMP1, OUTPUT_COL 1247 add TMP2, TMP2, OUTPUT_COL 1248 1249 vst1.8 {d26[0]}, [TMP1]! 1250 vst1.8 {d27[4]}, [TMP1]! 1251 vst1.8 {d26[1]}, [TMP2]! 1252 vst1.8 {d27[5]}, [TMP2]! 
1253 1254 vpop {d8-d15} 1255 bx lr 1256 1257 .unreq DCT_TABLE 1258 .unreq COEF_BLOCK 1259 .unreq OUTPUT_BUF 1260 .unreq OUTPUT_COL 1261 .unreq TMP1 1262 .unreq TMP2 1263 1264.purgem idct_helper 1265 1266 1267/*****************************************************************************/ 1268 1269/* 1270 * jsimd_ycc_extrgb_convert_neon 1271 * jsimd_ycc_extbgr_convert_neon 1272 * jsimd_ycc_extrgbx_convert_neon 1273 * jsimd_ycc_extbgrx_convert_neon 1274 * jsimd_ycc_extxbgr_convert_neon 1275 * jsimd_ycc_extxrgb_convert_neon 1276 * 1277 * Colorspace conversion YCbCr -> RGB 1278 */ 1279 1280 1281.macro do_load size 1282 .if \size == 8 1283 vld1.8 {d4}, [U, :64]! 1284 vld1.8 {d5}, [V, :64]! 1285 vld1.8 {d0}, [Y, :64]! 1286 pld [U, #64] 1287 pld [V, #64] 1288 pld [Y, #64] 1289 .elseif \size == 4 1290 vld1.8 {d4[0]}, [U]! 1291 vld1.8 {d4[1]}, [U]! 1292 vld1.8 {d4[2]}, [U]! 1293 vld1.8 {d4[3]}, [U]! 1294 vld1.8 {d5[0]}, [V]! 1295 vld1.8 {d5[1]}, [V]! 1296 vld1.8 {d5[2]}, [V]! 1297 vld1.8 {d5[3]}, [V]! 1298 vld1.8 {d0[0]}, [Y]! 1299 vld1.8 {d0[1]}, [Y]! 1300 vld1.8 {d0[2]}, [Y]! 1301 vld1.8 {d0[3]}, [Y]! 1302 .elseif \size == 2 1303 vld1.8 {d4[4]}, [U]! 1304 vld1.8 {d4[5]}, [U]! 1305 vld1.8 {d5[4]}, [V]! 1306 vld1.8 {d5[5]}, [V]! 1307 vld1.8 {d0[4]}, [Y]! 1308 vld1.8 {d0[5]}, [Y]! 1309 .elseif \size == 1 1310 vld1.8 {d4[6]}, [U]! 1311 vld1.8 {d5[6]}, [V]! 1312 vld1.8 {d0[6]}, [Y]! 1313 .else 1314 .error unsupported macroblock size 1315 .endif 1316.endm 1317 1318.macro do_store bpp, size 1319 .if \bpp == 24 1320 .if \size == 8 1321 vst3.8 {d10, d11, d12}, [RGB]! 1322 .elseif \size == 4 1323 vst3.8 {d10[0], d11[0], d12[0]}, [RGB]! 1324 vst3.8 {d10[1], d11[1], d12[1]}, [RGB]! 1325 vst3.8 {d10[2], d11[2], d12[2]}, [RGB]! 1326 vst3.8 {d10[3], d11[3], d12[3]}, [RGB]! 1327 .elseif \size == 2 1328 vst3.8 {d10[4], d11[4], d12[4]}, [RGB]! 1329 vst3.8 {d10[5], d11[5], d12[5]}, [RGB]! 1330 .elseif \size == 1 1331 vst3.8 {d10[6], d11[6], d12[6]}, [RGB]! 1332 .else 1333 .error unsupported macroblock size 1334 .endif 1335 .elseif \bpp == 32 1336 .if \size == 8 1337 vst4.8 {d10, d11, d12, d13}, [RGB]! 1338 .elseif \size == 4 1339 vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]! 1340 vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]! 1341 vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]! 1342 vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]! 1343 .elseif \size == 2 1344 vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]! 1345 vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]! 1346 .elseif \size == 1 1347 vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]! 1348 .else 1349 .error unsupported macroblock size 1350 .endif 1351 .elseif \bpp == 16 1352 .if \size == 8 1353 vst1.16 {q15}, [RGB]! 1354 .elseif \size == 4 1355 vst1.16 {d30}, [RGB]! 1356 .elseif \size == 2 1357 vst1.16 {d31[0]}, [RGB]! 1358 vst1.16 {d31[1]}, [RGB]! 1359 .elseif \size == 1 1360 vst1.16 {d31[2]}, [RGB]! 
        .else
            .error unsupported macroblock size
        .endif
    .else
        .error unsupported bpp
    .endif
.endm

.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs

/*
 * 2-stage pipelined YCbCr->RGB conversion
 */

.macro do_yuv_to_rgb_stage1
    vaddw.u8        q3, q1, d4     /* q3 = u - 128 */
    vaddw.u8        q4, q1, d5     /* q4 = v - 128 */
    vmull.s16       q10, d6, d1[1] /* multiply by -11277 */
    vmlal.s16       q10, d8, d1[2] /* multiply by -23401 */
    vmull.s16       q11, d7, d1[1] /* multiply by -11277 */
    vmlal.s16       q11, d9, d1[2] /* multiply by -23401 */
    vmull.s16       q12, d8, d1[0] /* multiply by 22971 */
    vmull.s16       q13, d9, d1[0] /* multiply by 22971 */
    vmull.s16       q14, d6, d1[3] /* multiply by 29033 */
    vmull.s16       q15, d7, d1[3] /* multiply by 29033 */
.endm

.macro do_yuv_to_rgb_stage2
    vrshrn.s32      d20, q10, #15
    vrshrn.s32      d21, q11, #15
    vrshrn.s32      d24, q12, #14
    vrshrn.s32      d25, q13, #14
    vrshrn.s32      d28, q14, #14
    vrshrn.s32      d29, q15, #14
    vaddw.u8        q11, q10, d0
    vaddw.u8        q12, q12, d0
    vaddw.u8        q14, q14, d0
.if \bpp != 16
    vqmovun.s16     d1\g_offs, q11
    vqmovun.s16     d1\r_offs, q12
    vqmovun.s16     d1\b_offs, q14
.else /* rgb565 */
    vqshlu.s16      q13, q11, #8
    vqshlu.s16      q15, q12, #8
    vqshlu.s16      q14, q14, #8
    vsri.u16        q15, q13, #5
    vsri.u16        q15, q14, #11
.endif
.endm

.macro do_yuv_to_rgb_stage2_store_load_stage1
    /* "do_yuv_to_rgb_stage2" and "store" */
    vrshrn.s32      d20, q10, #15
    /* "load" and "do_yuv_to_rgb_stage1" */
    pld             [U, #64]
    vrshrn.s32      d21, q11, #15
    pld             [V, #64]
    vrshrn.s32      d24, q12, #14
    vrshrn.s32      d25, q13, #14
    vld1.8          {d4}, [U, :64]!
    vrshrn.s32      d28, q14, #14
    vld1.8          {d5}, [V, :64]!
    vrshrn.s32      d29, q15, #14
    vaddw.u8        q3, q1, d4     /* q3 = u - 128 */
    vaddw.u8        q4, q1, d5     /* q4 = v - 128 */
    vaddw.u8        q11, q10, d0
    vmull.s16       q10, d6, d1[1] /* multiply by -11277 */
    vmlal.s16       q10, d8, d1[2] /* multiply by -23401 */
    vaddw.u8        q12, q12, d0
    vaddw.u8        q14, q14, d0
.if \bpp != 16 /**************** rgb24/rgb32 *********************************/
    vqmovun.s16     d1\g_offs, q11
    pld             [Y, #64]
    vqmovun.s16     d1\r_offs, q12
    vld1.8          {d0}, [Y, :64]!
    vqmovun.s16     d1\b_offs, q14
    vmull.s16       q11, d7, d1[1] /* multiply by -11277 */
    vmlal.s16       q11, d9, d1[2] /* multiply by -23401 */
    do_store        \bpp, 8
    vmull.s16       q12, d8, d1[0] /* multiply by 22971 */
    vmull.s16       q13, d9, d1[0] /* multiply by 22971 */
    vmull.s16       q14, d6, d1[3] /* multiply by 29033 */
    vmull.s16       q15, d7, d1[3] /* multiply by 29033 */
.else /**************************** rgb565 ***********************************/
    vqshlu.s16      q13, q11, #8
    pld             [Y, #64]
    vqshlu.s16      q15, q12, #8
    vqshlu.s16      q14, q14, #8
    vld1.8          {d0}, [Y, :64]!
    vmull.s16       q11, d7, d1[1]
    vmlal.s16       q11, d9, d1[2]
    vsri.u16        q15, q13, #5
    vmull.s16       q12, d8, d1[0]
    vsri.u16        q15, q14, #11
    vmull.s16       q13, d9, d1[0]
    vmull.s16       q14, d6, d1[3]
    do_store        \bpp, 8
    vmull.s16       q15, d7, d1[3]
.endif
.endm

.macro do_yuv_to_rgb
    do_yuv_to_rgb_stage1
    do_yuv_to_rgb_stage2
.endm

/* Apple gas crashes on adrl, work around that by using adr.
 * But this requires a copy of these constants for each function.
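 *
 * The table holds the fixed-point coefficients of the usual JPEG
 * YCbCr->RGB equations,
 *
 *    R = Y                        + 1.40200 * (Cr - 128)
 *    G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
 *    B = Y + 1.77200 * (Cb - 128)
 *
 * scaled for VMULL.S16/VRSHRN: 22971 ~= 1.40200 * 2^14 (descaled by 14),
 * -11277 ~= -0.34414 * 2^15 and -23401 ~= -0.71414 * 2^15 (descaled by 15),
 * and 29033 ~= 1.77200 * 2^14 (descaled by 14). The two rows of -128 are
 * the Cb/Cr bias applied with VADDW.U8 in stage 1.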
1469 */ 1470 1471.balign 16 1472jsimd_ycc_\colorid\()_neon_consts: 1473 .short 0, 0, 0, 0 1474 .short 22971, -11277, -23401, 29033 1475 .short -128, -128, -128, -128 1476 .short -128, -128, -128, -128 1477 1478asm_function jsimd_ycc_\colorid\()_convert_neon 1479 OUTPUT_WIDTH .req r0 1480 INPUT_BUF .req r1 1481 INPUT_ROW .req r2 1482 OUTPUT_BUF .req r3 1483 NUM_ROWS .req r4 1484 1485 INPUT_BUF0 .req r5 1486 INPUT_BUF1 .req r6 1487 INPUT_BUF2 .req INPUT_BUF 1488 1489 RGB .req r7 1490 Y .req r8 1491 U .req r9 1492 V .req r10 1493 N .req ip 1494 1495 /* Load constants to d1, d2, d3 (d0 is just used for padding) */ 1496 adr ip, jsimd_ycc_\colorid\()_neon_consts 1497 vld1.16 {d0, d1, d2, d3}, [ip, :128] 1498 1499 /* Save ARM registers and handle input arguments */ 1500 push {r4, r5, r6, r7, r8, r9, r10, lr} 1501 ldr NUM_ROWS, [sp, #(4 * 8)] 1502 ldr INPUT_BUF0, [INPUT_BUF] 1503 ldr INPUT_BUF1, [INPUT_BUF, #4] 1504 ldr INPUT_BUF2, [INPUT_BUF, #8] 1505 .unreq INPUT_BUF 1506 1507 /* Save NEON registers */ 1508 vpush {d8-d15} 1509 1510 /* Initially set d10, d11, d12, d13 to 0xFF */ 1511 vmov.u8 q5, #255 1512 vmov.u8 q6, #255 1513 1514 /* Outer loop over scanlines */ 1515 cmp NUM_ROWS, #1 1516 blt 9f 15170: 1518 ldr Y, [INPUT_BUF0, INPUT_ROW, lsl #2] 1519 ldr U, [INPUT_BUF1, INPUT_ROW, lsl #2] 1520 mov N, OUTPUT_WIDTH 1521 ldr V, [INPUT_BUF2, INPUT_ROW, lsl #2] 1522 add INPUT_ROW, INPUT_ROW, #1 1523 ldr RGB, [OUTPUT_BUF], #4 1524 1525 /* Inner loop over pixels */ 1526 subs N, N, #8 1527 blt 3f 1528 do_load 8 1529 do_yuv_to_rgb_stage1 1530 subs N, N, #8 1531 blt 2f 15321: 1533 do_yuv_to_rgb_stage2_store_load_stage1 1534 subs N, N, #8 1535 bge 1b 15362: 1537 do_yuv_to_rgb_stage2 1538 do_store \bpp, 8 1539 tst N, #7 1540 beq 8f 15413: 1542 tst N, #4 1543 beq 3f 1544 do_load 4 15453: 1546 tst N, #2 1547 beq 4f 1548 do_load 2 15494: 1550 tst N, #1 1551 beq 5f 1552 do_load 1 15535: 1554 do_yuv_to_rgb 1555 tst N, #4 1556 beq 6f 1557 do_store \bpp, 4 15586: 1559 tst N, #2 1560 beq 7f 1561 do_store \bpp, 2 15627: 1563 tst N, #1 1564 beq 8f 1565 do_store \bpp, 1 15668: 1567 subs NUM_ROWS, NUM_ROWS, #1 1568 bgt 0b 15699: 1570 /* Restore all registers and return */ 1571 vpop {d8-d15} 1572 pop {r4, r5, r6, r7, r8, r9, r10, pc} 1573 1574 .unreq OUTPUT_WIDTH 1575 .unreq INPUT_ROW 1576 .unreq OUTPUT_BUF 1577 .unreq NUM_ROWS 1578 .unreq INPUT_BUF0 1579 .unreq INPUT_BUF1 1580 .unreq INPUT_BUF2 1581 .unreq RGB 1582 .unreq Y 1583 .unreq U 1584 .unreq V 1585 .unreq N 1586 1587.purgem do_yuv_to_rgb 1588.purgem do_yuv_to_rgb_stage1 1589.purgem do_yuv_to_rgb_stage2 1590.purgem do_yuv_to_rgb_stage2_store_load_stage1 1591 1592.endm 1593 1594/*--------------------------------- id ----- bpp R G B */ 1595generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, 1, 2 1596generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, 1, 0 1597generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2 1598generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0 1599generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1 1600generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3 1601generate_jsimd_ycc_rgb_convert_neon rgb565, 16, 0, 0, 0 1602 1603.purgem do_load 1604.purgem do_store 1605 1606 1607/*****************************************************************************/ 1608 1609/* 1610 * jsimd_extrgb_ycc_convert_neon 1611 * jsimd_extbgr_ycc_convert_neon 1612 * jsimd_extrgbx_ycc_convert_neon 1613 * jsimd_extbgrx_ycc_convert_neon 1614 * jsimd_extxbgr_ycc_convert_neon 1615 * jsimd_extxrgb_ycc_convert_neon 1616 * 1617 * Colorspace conversion RGB -> 
YCbCr 1618 */ 1619 1620.macro do_store size 1621 .if \size == 8 1622 vst1.8 {d20}, [Y]! 1623 vst1.8 {d21}, [U]! 1624 vst1.8 {d22}, [V]! 1625 .elseif \size == 4 1626 vst1.8 {d20[0]}, [Y]! 1627 vst1.8 {d20[1]}, [Y]! 1628 vst1.8 {d20[2]}, [Y]! 1629 vst1.8 {d20[3]}, [Y]! 1630 vst1.8 {d21[0]}, [U]! 1631 vst1.8 {d21[1]}, [U]! 1632 vst1.8 {d21[2]}, [U]! 1633 vst1.8 {d21[3]}, [U]! 1634 vst1.8 {d22[0]}, [V]! 1635 vst1.8 {d22[1]}, [V]! 1636 vst1.8 {d22[2]}, [V]! 1637 vst1.8 {d22[3]}, [V]! 1638 .elseif \size == 2 1639 vst1.8 {d20[4]}, [Y]! 1640 vst1.8 {d20[5]}, [Y]! 1641 vst1.8 {d21[4]}, [U]! 1642 vst1.8 {d21[5]}, [U]! 1643 vst1.8 {d22[4]}, [V]! 1644 vst1.8 {d22[5]}, [V]! 1645 .elseif \size == 1 1646 vst1.8 {d20[6]}, [Y]! 1647 vst1.8 {d21[6]}, [U]! 1648 vst1.8 {d22[6]}, [V]! 1649 .else 1650 .error unsupported macroblock size 1651 .endif 1652.endm 1653 1654.macro do_load bpp, size 1655 .if \bpp == 24 1656 .if \size == 8 1657 vld3.8 {d10, d11, d12}, [RGB]! 1658 pld [RGB, #128] 1659 .elseif \size == 4 1660 vld3.8 {d10[0], d11[0], d12[0]}, [RGB]! 1661 vld3.8 {d10[1], d11[1], d12[1]}, [RGB]! 1662 vld3.8 {d10[2], d11[2], d12[2]}, [RGB]! 1663 vld3.8 {d10[3], d11[3], d12[3]}, [RGB]! 1664 .elseif \size == 2 1665 vld3.8 {d10[4], d11[4], d12[4]}, [RGB]! 1666 vld3.8 {d10[5], d11[5], d12[5]}, [RGB]! 1667 .elseif \size == 1 1668 vld3.8 {d10[6], d11[6], d12[6]}, [RGB]! 1669 .else 1670 .error unsupported macroblock size 1671 .endif 1672 .elseif \bpp == 32 1673 .if \size == 8 1674 vld4.8 {d10, d11, d12, d13}, [RGB]! 1675 pld [RGB, #128] 1676 .elseif \size == 4 1677 vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]! 1678 vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]! 1679 vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]! 1680 vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]! 1681 .elseif \size == 2 1682 vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]! 1683 vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]! 1684 .elseif \size == 1 1685 vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]! 
1686 .else 1687 .error unsupported macroblock size 1688 .endif 1689 .else 1690 .error unsupported bpp 1691 .endif 1692.endm 1693 1694.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs 1695 1696/* 1697 * 2 stage pipelined RGB->YCbCr conversion 1698 */ 1699 1700.macro do_rgb_to_yuv_stage1 1701 vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */ 1702 vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */ 1703 vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */ 1704 vmull.u16 q7, d4, d0[0] 1705 vmlal.u16 q7, d6, d0[1] 1706 vmlal.u16 q7, d8, d0[2] 1707 vmull.u16 q8, d5, d0[0] 1708 vmlal.u16 q8, d7, d0[1] 1709 vmlal.u16 q8, d9, d0[2] 1710 vrev64.32 q9, q1 1711 vrev64.32 q13, q1 1712 vmlsl.u16 q9, d4, d0[3] 1713 vmlsl.u16 q9, d6, d1[0] 1714 vmlal.u16 q9, d8, d1[1] 1715 vmlsl.u16 q13, d5, d0[3] 1716 vmlsl.u16 q13, d7, d1[0] 1717 vmlal.u16 q13, d9, d1[1] 1718 vrev64.32 q14, q1 1719 vrev64.32 q15, q1 1720 vmlal.u16 q14, d4, d1[1] 1721 vmlsl.u16 q14, d6, d1[2] 1722 vmlsl.u16 q14, d8, d1[3] 1723 vmlal.u16 q15, d5, d1[1] 1724 vmlsl.u16 q15, d7, d1[2] 1725 vmlsl.u16 q15, d9, d1[3] 1726.endm 1727 1728.macro do_rgb_to_yuv_stage2 1729 vrshrn.u32 d20, q7, #16 1730 vrshrn.u32 d21, q8, #16 1731 vshrn.u32 d22, q9, #16 1732 vshrn.u32 d23, q13, #16 1733 vshrn.u32 d24, q14, #16 1734 vshrn.u32 d25, q15, #16 1735 vmovn.u16 d20, q10 /* d20 = y */ 1736 vmovn.u16 d21, q11 /* d21 = u */ 1737 vmovn.u16 d22, q12 /* d22 = v */ 1738.endm 1739 1740.macro do_rgb_to_yuv 1741 do_rgb_to_yuv_stage1 1742 do_rgb_to_yuv_stage2 1743.endm 1744 1745.macro do_rgb_to_yuv_stage2_store_load_stage1 1746 vrshrn.u32 d20, q7, #16 1747 vrshrn.u32 d21, q8, #16 1748 vshrn.u32 d22, q9, #16 1749 vrev64.32 q9, q1 1750 vshrn.u32 d23, q13, #16 1751 vrev64.32 q13, q1 1752 vshrn.u32 d24, q14, #16 1753 vshrn.u32 d25, q15, #16 1754 do_load \bpp, 8 1755 vmovn.u16 d20, q10 /* d20 = y */ 1756 vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */ 1757 vmovn.u16 d21, q11 /* d21 = u */ 1758 vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */ 1759 vmovn.u16 d22, q12 /* d22 = v */ 1760 vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */ 1761 vmull.u16 q7, d4, d0[0] 1762 vmlal.u16 q7, d6, d0[1] 1763 vmlal.u16 q7, d8, d0[2] 1764 vst1.8 {d20}, [Y]! 1765 vmull.u16 q8, d5, d0[0] 1766 vmlal.u16 q8, d7, d0[1] 1767 vmlal.u16 q8, d9, d0[2] 1768 vmlsl.u16 q9, d4, d0[3] 1769 vmlsl.u16 q9, d6, d1[0] 1770 vmlal.u16 q9, d8, d1[1] 1771 vst1.8 {d21}, [U]! 1772 vmlsl.u16 q13, d5, d0[3] 1773 vmlsl.u16 q13, d7, d1[0] 1774 vmlal.u16 q13, d9, d1[1] 1775 vrev64.32 q14, q1 1776 vrev64.32 q15, q1 1777 vmlal.u16 q14, d4, d1[1] 1778 vmlsl.u16 q14, d6, d1[2] 1779 vmlsl.u16 q14, d8, d1[3] 1780 vst1.8 {d22}, [V]! 
1781 vmlal.u16 q15, d5, d1[1] 1782 vmlsl.u16 q15, d7, d1[2] 1783 vmlsl.u16 q15, d9, d1[3] 1784.endm 1785 1786.balign 16 1787jsimd_\colorid\()_ycc_neon_consts: 1788 .short 19595, 38470, 7471, 11059 1789 .short 21709, 32768, 27439, 5329 1790 .short 32767, 128, 32767, 128 1791 .short 32767, 128, 32767, 128 1792 1793asm_function jsimd_\colorid\()_ycc_convert_neon 1794 OUTPUT_WIDTH .req r0 1795 INPUT_BUF .req r1 1796 OUTPUT_BUF .req r2 1797 OUTPUT_ROW .req r3 1798 NUM_ROWS .req r4 1799 1800 OUTPUT_BUF0 .req r5 1801 OUTPUT_BUF1 .req r6 1802 OUTPUT_BUF2 .req OUTPUT_BUF 1803 1804 RGB .req r7 1805 Y .req r8 1806 U .req r9 1807 V .req r10 1808 N .req ip 1809 1810 /* Load constants to d0, d1, d2, d3 */ 1811 adr ip, jsimd_\colorid\()_ycc_neon_consts 1812 vld1.16 {d0, d1, d2, d3}, [ip, :128] 1813 1814 /* Save ARM registers and handle input arguments */ 1815 push {r4, r5, r6, r7, r8, r9, r10, lr} 1816 ldr NUM_ROWS, [sp, #(4 * 8)] 1817 ldr OUTPUT_BUF0, [OUTPUT_BUF] 1818 ldr OUTPUT_BUF1, [OUTPUT_BUF, #4] 1819 ldr OUTPUT_BUF2, [OUTPUT_BUF, #8] 1820 .unreq OUTPUT_BUF 1821 1822 /* Save NEON registers */ 1823 vpush {d8-d15} 1824 1825 /* Outer loop over scanlines */ 1826 cmp NUM_ROWS, #1 1827 blt 9f 18280: 1829 ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2] 1830 ldr U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2] 1831 mov N, OUTPUT_WIDTH 1832 ldr V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2] 1833 add OUTPUT_ROW, OUTPUT_ROW, #1 1834 ldr RGB, [INPUT_BUF], #4 1835 1836 /* Inner loop over pixels */ 1837 subs N, N, #8 1838 blt 3f 1839 do_load \bpp, 8 1840 do_rgb_to_yuv_stage1 1841 subs N, N, #8 1842 blt 2f 18431: 1844 do_rgb_to_yuv_stage2_store_load_stage1 1845 subs N, N, #8 1846 bge 1b 18472: 1848 do_rgb_to_yuv_stage2 1849 do_store 8 1850 tst N, #7 1851 beq 8f 18523: 1853 tst N, #4 1854 beq 3f 1855 do_load \bpp, 4 18563: 1857 tst N, #2 1858 beq 4f 1859 do_load \bpp, 2 18604: 1861 tst N, #1 1862 beq 5f 1863 do_load \bpp, 1 18645: 1865 do_rgb_to_yuv 1866 tst N, #4 1867 beq 6f 1868 do_store 4 18696: 1870 tst N, #2 1871 beq 7f 1872 do_store 2 18737: 1874 tst N, #1 1875 beq 8f 1876 do_store 1 18778: 1878 subs NUM_ROWS, NUM_ROWS, #1 1879 bgt 0b 18809: 1881 /* Restore all registers and return */ 1882 vpop {d8-d15} 1883 pop {r4, r5, r6, r7, r8, r9, r10, pc} 1884 1885 .unreq OUTPUT_WIDTH 1886 .unreq OUTPUT_ROW 1887 .unreq INPUT_BUF 1888 .unreq NUM_ROWS 1889 .unreq OUTPUT_BUF0 1890 .unreq OUTPUT_BUF1 1891 .unreq OUTPUT_BUF2 1892 .unreq RGB 1893 .unreq Y 1894 .unreq U 1895 .unreq V 1896 .unreq N 1897 1898.purgem do_rgb_to_yuv 1899.purgem do_rgb_to_yuv_stage1 1900.purgem do_rgb_to_yuv_stage2 1901.purgem do_rgb_to_yuv_stage2_store_load_stage1 1902 1903.endm 1904 1905/*--------------------------------- id ----- bpp R G B */ 1906generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2 1907generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0 1908generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2 1909generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0 1910generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1 1911generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3 1912 1913.purgem do_load 1914.purgem do_store 1915 1916 1917/*****************************************************************************/ 1918 1919/* 1920 * Load data into workspace, applying unsigned->signed conversion 1921 * 1922 * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get 1923 * rid of VST1.16 instructions 1924 */ 1925 1926asm_function jsimd_convsamp_neon 1927 SAMPLE_DATA .req r0 1928 START_COL .req r1 1929 WORKSPACE .req r2 1930 TMP1 .req r3 1931 TMP2 
asm_function jsimd_convsamp_neon
    SAMPLE_DATA     .req r0
    START_COL       .req r1
    WORKSPACE       .req r2
    TMP1            .req r3
    TMP2            .req r4
    TMP3            .req r5
    TMP4            .req ip

    push            {r4, r5}
    vmov.u8         d0, #128

    ldmia           SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
    add             TMP1, TMP1, START_COL
    add             TMP2, TMP2, START_COL
    add             TMP3, TMP3, START_COL
    add             TMP4, TMP4, START_COL
    vld1.8          {d16}, [TMP1]
    vsubl.u8        q8, d16, d0
    vld1.8          {d18}, [TMP2]
    vsubl.u8        q9, d18, d0
    vld1.8          {d20}, [TMP3]
    vsubl.u8        q10, d20, d0
    vld1.8          {d22}, [TMP4]
    ldmia           SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
    vsubl.u8        q11, d22, d0
    vst1.16         {d16, d17, d18, d19}, [WORKSPACE, :128]!
    add             TMP1, TMP1, START_COL
    add             TMP2, TMP2, START_COL
    vst1.16         {d20, d21, d22, d23}, [WORKSPACE, :128]!
    add             TMP3, TMP3, START_COL
    add             TMP4, TMP4, START_COL
    vld1.8          {d24}, [TMP1]
    vsubl.u8        q12, d24, d0
    vld1.8          {d26}, [TMP2]
    vsubl.u8        q13, d26, d0
    vld1.8          {d28}, [TMP3]
    vsubl.u8        q14, d28, d0
    vld1.8          {d30}, [TMP4]
    vsubl.u8        q15, d30, d0
    vst1.16         {d24, d25, d26, d27}, [WORKSPACE, :128]!
    vst1.16         {d28, d29, d30, d31}, [WORKSPACE, :128]!
    pop             {r4, r5}
    bx              lr

    .unreq          SAMPLE_DATA
    .unreq          START_COL
    .unreq          WORKSPACE
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4


/*****************************************************************************/

/*
 * jsimd_fdct_ifast_neon
 *
 * This function contains a fast, not so accurate integer implementation of
 * the forward DCT (Discrete Cosine Transform). It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast'
 * function from jfdctfst.c
 *
 * TODO: can be combined with 'jsimd_convsamp_neon' to get
 *       rid of a bunch of VLD1.16 instructions
 */

#define XFIX_0_382683433 d0[0]
#define XFIX_0_541196100 d0[1]
#define XFIX_0_707106781 d0[2]
#define XFIX_1_306562965 d0[3]

.balign 16
jsimd_fdct_ifast_neon_consts:
    .short (98 * 128)              /* XFIX_0_382683433 */
    .short (139 * 128)             /* XFIX_0_541196100 */
    .short (181 * 128)             /* XFIX_0_707106781 */
    .short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */

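/*
 * The constants above are Q15-style fixed-point approximations of the
 * jfdctfst.c multipliers (e.g. 98 * 128 = 12544 ~= 0.382683433 * 32768);
 * for 1.306562965 only the fractional part is stored, and the code keeps
 * the unscaled term and adds the VQDMULH result to it.  VQDMULH.S16
 * computes (a * b * 2) >> 16, so the scalar equivalent of each such
 * multiply is roughly (a sketch; 'mul_fix' is an illustrative name, not a
 * library symbol):
 *
 *   #include <stdint.h>
 *
 *   static int16_t mul_fix (int16_t a, int16_t c)
 *   {
 *     return (int16_t) (((int32_t) a * c * 2) >> 16);  // ~= a * (c / 32768.0)
 *   }
 */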
asm_function jsimd_fdct_ifast_neon

    DATA            .req r0
    TMP             .req ip

    vpush           {d8-d15}

    /* Load constants */
    adr             TMP, jsimd_fdct_ifast_neon_consts
    vld1.16         {d0}, [TMP, :64]

    /* Load all DATA into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | d16     | d17    | q8
     *   1 | d18     | d19    | q9
     *   2 | d20     | d21    | q10
     *   3 | d22     | d23    | q11
     *   4 | d24     | d25    | q12
     *   5 | d26     | d27    | q13
     *   6 | d28     | d29    | q14
     *   7 | d30     | d31    | q15
     */

    vld1.16         {d16, d17, d18, d19}, [DATA, :128]!
    vld1.16         {d20, d21, d22, d23}, [DATA, :128]!
    vld1.16         {d24, d25, d26, d27}, [DATA, :128]!
    vld1.16         {d28, d29, d30, d31}, [DATA, :128]
    sub             DATA, DATA, #(128 - 32)

    mov             TMP, #2
1:
    /* Transpose */
    vtrn.16         q12, q13
    vtrn.16         q10, q11
    vtrn.16         q8, q9
    vtrn.16         q14, q15
    vtrn.32         q9, q11
    vtrn.32         q13, q15
    vtrn.32         q8, q10
    vtrn.32         q12, q14
    vswp            d30, d23
    vswp            d24, d17
    vswp            d26, d19
    /* 1-D FDCT */
    vadd.s16        q2, q11, q12
    vswp            d28, d21
    vsub.s16        q12, q11, q12
    vsub.s16        q6, q10, q13
    vadd.s16        q10, q10, q13
    vsub.s16        q7, q9, q14
    vadd.s16        q9, q9, q14
    vsub.s16        q1, q8, q15
    vadd.s16        q8, q8, q15
    vsub.s16        q4, q9, q10
    vsub.s16        q5, q8, q2
    vadd.s16        q3, q9, q10
    vadd.s16        q4, q4, q5
    vadd.s16        q2, q8, q2
    vqdmulh.s16     q4, q4, XFIX_0_707106781
    vadd.s16        q11, q12, q6
    vadd.s16        q8, q2, q3
    vsub.s16        q12, q2, q3
    vadd.s16        q3, q6, q7
    vadd.s16        q7, q7, q1
    vqdmulh.s16     q3, q3, XFIX_0_707106781
    vsub.s16        q6, q11, q7
    vadd.s16        q10, q5, q4
    vqdmulh.s16     q6, q6, XFIX_0_382683433
    vsub.s16        q14, q5, q4
    vqdmulh.s16     q11, q11, XFIX_0_541196100
    vqdmulh.s16     q5, q7, XFIX_1_306562965
    vadd.s16        q4, q1, q3
    vsub.s16        q3, q1, q3
    vadd.s16        q7, q7, q6
    vadd.s16        q11, q11, q6
    vadd.s16        q7, q7, q5
    vadd.s16        q13, q3, q11
    vsub.s16        q11, q3, q11
    vadd.s16        q9, q4, q7
    vsub.s16        q15, q4, q7
    subs            TMP, TMP, #1
    bne             1b

    /* store results */
    vst1.16         {d16, d17, d18, d19}, [DATA, :128]!
    vst1.16         {d20, d21, d22, d23}, [DATA, :128]!
    vst1.16         {d24, d25, d26, d27}, [DATA, :128]!
    vst1.16         {d28, d29, d30, d31}, [DATA, :128]

    vpop            {d8-d15}
    bx              lr

    .unreq          DATA
    .unreq          TMP


/*****************************************************************************/

/*
 * GLOBAL(void)
 * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM * divisors,
 *                      DCTELEM * workspace);
 *
 * Note: the code uses two-stage pipelining in order to improve instruction
 *       scheduling and eliminate stalls (this provides ~15% better
 *       performance for this function on both ARM Cortex-A8 and
 *       ARM Cortex-A9 when compared to the non-pipelined variant).
 *       The instructions which belong to the second stage use different
 *       indentation for better readability.
 */
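/*
 * A scalar sketch of the per-coefficient computation (variable names are
 * illustrative; the divisor-table offsets match the CORRECTION and SHIFT
 * pointers set up below, i.e. +64 and +192 DCTELEMs from 'divisors'):
 *
 *   int i;
 *   for (i = 0; i < DCTSIZE2; i++) {
 *     int      sign = workspace[i] >> 15;                        // 0 or -1
 *     uint16_t mag  = (uint16_t) ((workspace[i] ^ sign) - sign)  // |coef|
 *                     + (uint16_t) divisors[DCTSIZE2 + i];       // correction
 *     uint16_t q    = (uint16_t) (((uint32_t) mag *
 *                                  (uint16_t) divisors[i]) >> 16)
 *                     >> divisors[DCTSIZE2 * 3 + i];             // extra shift
 *     coef_block[i] = (JCOEF) ((q ^ sign) - sign);               // put sign back
 *   }
 */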
asm_function jsimd_quantize_neon

    COEF_BLOCK      .req r0
    DIVISORS        .req r1
    WORKSPACE       .req r2

    RECIPROCAL      .req DIVISORS
    CORRECTION      .req r3
    SHIFT           .req ip
    LOOP_COUNT      .req r4

    vld1.16         {d0, d1, d2, d3}, [WORKSPACE, :128]!
    vabs.s16        q12, q0
    add             CORRECTION, DIVISORS, #(64 * 2)
    add             SHIFT, DIVISORS, #(64 * 6)
    vld1.16         {d20, d21, d22, d23}, [CORRECTION, :128]!
    vabs.s16        q13, q1
    vld1.16         {d16, d17, d18, d19}, [RECIPROCAL, :128]!
    vadd.u16        q12, q12, q10  /* add correction */
    vadd.u16        q13, q13, q11
    vmull.u16       q10, d24, d16  /* multiply by reciprocal */
    vmull.u16       q11, d25, d17
    vmull.u16       q8, d26, d18
    vmull.u16       q9, d27, d19
    vld1.16         {d24, d25, d26, d27}, [SHIFT, :128]!
    vshrn.u32       d20, q10, #16
    vshrn.u32       d21, q11, #16
    vshrn.u32       d22, q8, #16
    vshrn.u32       d23, q9, #16
    vneg.s16        q12, q12
    vneg.s16        q13, q13
    vshr.s16        q2, q0, #15    /* extract sign */
    vshr.s16        q3, q1, #15
    vshl.u16        q14, q10, q12  /* shift */
    vshl.u16        q15, q11, q13

    push            {r4, r5}
    mov             LOOP_COUNT, #3
1:
    vld1.16         {d0, d1, d2, d3}, [WORKSPACE, :128]!
      veor.u16        q14, q14, q2 /* restore sign */
    vabs.s16        q12, q0
    vld1.16         {d20, d21, d22, d23}, [CORRECTION, :128]!
    vabs.s16        q13, q1
      veor.u16        q15, q15, q3
    vld1.16         {d16, d17, d18, d19}, [RECIPROCAL, :128]!
    vadd.u16        q12, q12, q10  /* add correction */
    vadd.u16        q13, q13, q11
    vmull.u16       q10, d24, d16  /* multiply by reciprocal */
    vmull.u16       q11, d25, d17
    vmull.u16       q8, d26, d18
    vmull.u16       q9, d27, d19
      vsub.u16        q14, q14, q2
    vld1.16         {d24, d25, d26, d27}, [SHIFT, :128]!
      vsub.u16        q15, q15, q3
    vshrn.u32       d20, q10, #16
    vshrn.u32       d21, q11, #16
      vst1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
    vshrn.u32       d22, q8, #16
    vshrn.u32       d23, q9, #16
    vneg.s16        q12, q12
    vneg.s16        q13, q13
    vshr.s16        q2, q0, #15    /* extract sign */
    vshr.s16        q3, q1, #15
    vshl.u16        q14, q10, q12  /* shift */
    vshl.u16        q15, q11, q13
    subs            LOOP_COUNT, LOOP_COUNT, #1
    bne             1b
    pop             {r4, r5}

    veor.u16        q14, q14, q2   /* restore sign */
    veor.u16        q15, q15, q3
    vsub.u16        q14, q14, q2
    vsub.u16        q15, q15, q3
    vst1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]!

    bx              lr             /* return */

    .unreq          COEF_BLOCK
    .unreq          DIVISORS
    .unreq          WORKSPACE
    .unreq          RECIPROCAL
    .unreq          CORRECTION
    .unreq          SHIFT
    .unreq          LOOP_COUNT


/*****************************************************************************/

/*
 * GLOBAL(void)
 * jsimd_h2v1_fancy_upsample_neon (int max_v_samp_factor,
 *                                 JDIMENSION downsampled_width,
 *                                 JSAMPARRAY input_data,
 *                                 JSAMPARRAY * output_data_ptr);
 *
 * Note: the use of unaligned writes is the main remaining bottleneck in
 *       this code, which could potentially be addressed to gain up to
 *       tens of percent better performance on Cortex-A8/Cortex-A9.
 */

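/*
 * The macros below implement the standard libjpeg "fancy" (triangle filter)
 * h2v1 upsampling.  As a scalar reference for one row (a sketch; 'in',
 * 'out' and 'width' stand for the downsampled input row, the output row and
 * the downsampled width):
 *
 *   out[0] = in[0];
 *   out[2 * width - 1] = in[width - 1];
 *   for (i = 0; i < width; i++) {
 *     if (i > 0)
 *       out[2 * i]     = (uint8_t) ((3 * in[i] + in[i - 1] + 1) >> 2);
 *     if (i < width - 1)
 *       out[2 * i + 1] = (uint8_t) ((3 * in[i] + in[i + 1] + 2) >> 2);
 *   }
 *
 * The +2 bias corresponds to the rounding vrshrn shifts below, while the +1
 * bias is added explicitly from q15 before the truncating vshrn shifts.
 */
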
/*
 * Upsample 16 source pixels to 32 destination pixels. The new 16 source
 * pixels are loaded to q0. The previous 16 source pixels are in q1. The
 * shifted-by-one source pixels are constructed in q2 by using q0 and q1.
 * Register d28 is used for multiplication by 3. Register q15 is used
 * for adding +1 bias.
 */
.macro upsample16 OUTPTR, INPTR
    vld1.8          {q0}, [\INPTR]!
    vmovl.u8        q8, d0
    vext.8          q2, q1, q0, #15
    vmovl.u8        q9, d1
    vaddw.u8        q10, q15, d4
    vaddw.u8        q11, q15, d5
    vmlal.u8        q8, d4, d28
    vmlal.u8        q9, d5, d28
    vmlal.u8        q10, d0, d28
    vmlal.u8        q11, d1, d28
    vmov            q1, q0         /* backup source pixels to q1 */
    vrshrn.u16      d6, q8, #2
    vrshrn.u16      d7, q9, #2
    vshrn.u16       d8, q10, #2
    vshrn.u16       d9, q11, #2
    vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
.endm

/*
 * Upsample 32 source pixels to 64 destination pixels. Compared to the
 * 'upsample16' macro, the roles of the q0 and q1 registers are reversed
 * between the even and odd groups of 16 pixels, which is why no
 * "vmov q1, q0" instruction is needed. This unrolling also allows loads and
 * stores to be reordered to hide the multiplication latency and reduce
 * stalls.
 */
.macro upsample32 OUTPTR, INPTR
    /* even 16 pixels group */
    vld1.8          {q0}, [\INPTR]!
    vmovl.u8        q8, d0
    vext.8          q2, q1, q0, #15
    vmovl.u8        q9, d1
    vaddw.u8        q10, q15, d4
    vaddw.u8        q11, q15, d5
    vmlal.u8        q8, d4, d28
    vmlal.u8        q9, d5, d28
    vmlal.u8        q10, d0, d28
    vmlal.u8        q11, d1, d28
    /* odd 16 pixels group */
    vld1.8          {q1}, [\INPTR]!
    vrshrn.u16      d6, q8, #2
    vrshrn.u16      d7, q9, #2
    vshrn.u16       d8, q10, #2
    vshrn.u16       d9, q11, #2
    vmovl.u8        q8, d2
    vext.8          q2, q0, q1, #15
    vmovl.u8        q9, d3
    vaddw.u8        q10, q15, d4
    vaddw.u8        q11, q15, d5
    vmlal.u8        q8, d4, d28
    vmlal.u8        q9, d5, d28
    vmlal.u8        q10, d2, d28
    vmlal.u8        q11, d3, d28
    vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
    vrshrn.u16      d6, q8, #2
    vrshrn.u16      d7, q9, #2
    vshrn.u16       d8, q10, #2
    vshrn.u16       d9, q11, #2
    vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
.endm

/*
 * Upsample a row of WIDTH pixels from INPTR to OUTPTR.
 */
.macro upsample_row OUTPTR, INPTR, WIDTH, TMP1
    /* special case for the first and last pixels */
    sub             \WIDTH, \WIDTH, #1
    add             \OUTPTR, \OUTPTR, #1
    ldrb            \TMP1, [\INPTR, \WIDTH]
    strb            \TMP1, [\OUTPTR, \WIDTH, asl #1]
    ldrb            \TMP1, [\INPTR], #1
    strb            \TMP1, [\OUTPTR, #-1]
    vmov.8          d3[7], \TMP1

    subs            \WIDTH, \WIDTH, #32
    blt             5f
0:  /* process 32 pixels per iteration */
    upsample32      \OUTPTR, \INPTR
    subs            \WIDTH, \WIDTH, #32
    bge             0b
5:
    adds            \WIDTH, \WIDTH, #16
    blt             1f
0:  /* process 16 pixels if needed */
    upsample16      \OUTPTR, \INPTR
    subs            \WIDTH, \WIDTH, #16
1:
    adds            \WIDTH, \WIDTH, #16
    beq             9f

    /* load the remaining 1-15 pixels */
    add             \INPTR, \INPTR, \WIDTH
    tst             \WIDTH, #1
    beq             2f
    sub             \INPTR, \INPTR, #1
    vld1.8          {d0[0]}, [\INPTR]
2:
    tst             \WIDTH, #2
    beq             2f
    vext.8          d0, d0, d0, #6
    sub             \INPTR, \INPTR, #1
    vld1.8          {d0[1]}, [\INPTR]
    sub             \INPTR, \INPTR, #1
    vld1.8          {d0[0]}, [\INPTR]
2:
    tst             \WIDTH, #4
    beq             2f
    vrev64.32       d0, d0
    sub             \INPTR, \INPTR, #1
    vld1.8          {d0[3]}, [\INPTR]
    sub             \INPTR, \INPTR, #1
    vld1.8          {d0[2]}, [\INPTR]
    sub             \INPTR, \INPTR, #1
    vld1.8          {d0[1]}, [\INPTR]
    sub             \INPTR, \INPTR, #1
    vld1.8          {d0[0]}, [\INPTR]
2:
    tst             \WIDTH, #8
    beq             2f
    vmov            d1, d0
    sub             \INPTR, \INPTR, #8
    vld1.8          {d0}, [\INPTR]
2:  /* upsample the remaining pixels */
    vmovl.u8        q8, d0
    vext.8          q2, q1, q0, #15
    vmovl.u8        q9, d1
    vaddw.u8        q10, q15, d4
    vaddw.u8        q11, q15, d5
    vmlal.u8        q8, d4, d28
    vmlal.u8        q9, d5, d28
    vmlal.u8        q10, d0, d28
    vmlal.u8        q11, d1, d28
    vrshrn.u16      d10, q8, #2
    vrshrn.u16      d12, q9, #2
    vshrn.u16       d11, q10, #2
    vshrn.u16       d13, q11, #2
    vzip.8          d10, d11
    vzip.8          d12, d13
    /* store the remaining pixels */
    tst             \WIDTH, #8
    beq             2f
    vst1.8          {d10, d11}, [\OUTPTR]!
    vmov            q5, q6
2:
    tst             \WIDTH, #4
    beq             2f
    vst1.8          {d10}, [\OUTPTR]!
    vmov            d10, d11
2:
    tst             \WIDTH, #2
    beq             2f
    vst1.8          {d10[0]}, [\OUTPTR]!
    vst1.8          {d10[1]}, [\OUTPTR]!
    vst1.8          {d10[2]}, [\OUTPTR]!
    vst1.8          {d10[3]}, [\OUTPTR]!
    vext.8          d10, d10, d10, #4
2:
    tst             \WIDTH, #1
    beq             2f
    vst1.8          {d10[0]}, [\OUTPTR]!
    vst1.8          {d10[1]}, [\OUTPTR]!
2:
9:
.endm

asm_function jsimd_h2v1_fancy_upsample_neon

    MAX_V_SAMP_FACTOR .req r0
    DOWNSAMPLED_WIDTH .req r1
    INPUT_DATA        .req r2
    OUTPUT_DATA_PTR   .req r3
    OUTPUT_DATA       .req OUTPUT_DATA_PTR

    OUTPTR            .req r4
    INPTR             .req r5
    WIDTH             .req ip
    TMP               .req lr

    push            {r4, r5, r6, lr}
    vpush           {d8-d15}

    ldr             OUTPUT_DATA, [OUTPUT_DATA_PTR]
    cmp             MAX_V_SAMP_FACTOR, #0
    ble             99f

    /* initialize constants */
    vmov.u8         d28, #3
    vmov.u16        q15, #1
11:
    ldr             INPTR, [INPUT_DATA], #4
    ldr             OUTPTR, [OUTPUT_DATA], #4
    mov             WIDTH, DOWNSAMPLED_WIDTH
    upsample_row    OUTPTR, INPTR, WIDTH, TMP
    subs            MAX_V_SAMP_FACTOR, MAX_V_SAMP_FACTOR, #1
    bgt             11b

99:
    vpop            {d8-d15}
    pop             {r4, r5, r6, pc}

    .unreq          MAX_V_SAMP_FACTOR
    .unreq          DOWNSAMPLED_WIDTH
    .unreq          INPUT_DATA
    .unreq          OUTPUT_DATA_PTR
    .unreq          OUTPUT_DATA

    .unreq          OUTPTR
    .unreq          INPTR
    .unreq          WIDTH
    .unreq          TMP


.purgem upsample16
.purgem upsample32
.purgem upsample_row