/*
 * Copyright (C) 2012 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <machine/cpu-features.h>
#include <machine/asm.h>

/*
    r0 = dst
    r1 = y0 base pointer
    r2 = y1 base pointer
    r3 = y2 base pointer
    sp = coeffs
    sp+4 = length / 2
*/

ENTRY(rsdIntrinsicConvolve3x3_K)
        push {r4-r8, r10, r11, lr}
        vpush {q4-q7}

        /* Get the coeffs pointer from the stack and load the
           coefficients into the q0, q1 NEON registers */
        ldr r4, [sp, #32+64]
        vld1.16 {q0, q1}, [r4]

        /* Get the count from the stack */
        ldr r4, [sp, #36+64]

        /* Load the frequently used immediate in a register */
        mov r5, #8

1:
        /* Load and post-increment the address by r5=#8 */
        vld1.8 {q13}, [r1], r5
        vld1.8 {q14}, [r2], r5
        vld1.8 {q15}, [r3], r5

        /* Prefetch data that will be used two iterations from now */
        PLD (r1, r5)
        PLD (r2, r5)
        PLD (r3, r5)

        vmovl.u8 q2, d26
        vmovl.u8 q3, d27
        vmovl.u8 q4, d28
        vmovl.u8 q5, d29
        vmovl.u8 q6, d30
        vmovl.u8 q7, d31

/*
        The two-pixel source array is
        d4,  d5,  d6,  d7
        d8,  d9,  d10, d11
        d12, d13, d14, d15
*/

        vmull.s16 q8, d4, d0[0]
        vmlal.s16 q8, d5, d0[1]
        vmlal.s16 q8, d6, d0[2]
        vmlal.s16 q8, d8, d0[3]
        vmlal.s16 q8, d9, d1[0]
        vmlal.s16 q8, d10, d1[1]
        vmlal.s16 q8, d12, d1[2]
        vmlal.s16 q8, d13, d1[3]
        vmlal.s16 q8, d14, d2[0]

        vmull.s16 q9, d5, d0[0]
        vmlal.s16 q9, d6, d0[1]
        vmlal.s16 q9, d7, d0[2]
        vmlal.s16 q9, d9, d0[3]
        vmlal.s16 q9, d10, d1[0]
        vmlal.s16 q9, d11, d1[1]
        vmlal.s16 q9, d13, d1[2]
        vmlal.s16 q9, d14, d1[3]
        vmlal.s16 q9, d15, d2[0]

        vshrn.i32 d16, q8, #8
        vshrn.i32 d17, q9, #8

        vqmovun.s16 d16, q8
        vst1.8 d16, [r0]!

        /* Are we done yet? */
        subs r4, r4, #1
        bne 1b

        /* We're done, bye! */
        vpop {q4-q7}
        pop {r4-r8, r10, r11, lr}
        bx lr
END(rsdIntrinsicConvolve3x3_K)

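/*
   Reference sketch (not part of the build): approximately what the 3x3 kernel
   above computes for one output pixel, written in plain C. The 16-bit taps are
   treated as fixed point with 8 fractional bits, which is why the accumulator
   is narrowed with >>8 (vshrn) and then saturated (vqmovun). The name
   convolve3x3_ref and the row-pointer layout are illustrative assumptions, not
   part of the RenderScript API.

   #include <stdint.h>

   // y0/y1/y2 point at the rows above, at and below the output row; each pixel
   // is 4 bytes (RGBA). coeffs holds 9 signed fixed-point taps (8 fraction bits).
   static void convolve3x3_ref(uint8_t dst[4], const uint8_t *y0, const uint8_t *y1,
                               const uint8_t *y2, const int16_t coeffs[9]) {
       const uint8_t *rows[3] = { y0, y1, y2 };
       for (int c = 0; c < 4; c++) {                 // per channel
           int32_t acc = 0;
           for (int r = 0; r < 3; r++)
               for (int x = 0; x < 3; x++)
                   acc += rows[r][x * 4 + c] * coeffs[r * 3 + x];
           acc >>= 8;                                // drop the fractional bits
           if (acc < 0) acc = 0;                     // saturate like vqmovun
           if (acc > 255) acc = 255;
           dst[c] = (uint8_t)acc;
       }
   }
*/
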
/*
    r0 = dst
    r1 = src
    r2 = matrix
    r3 = length
*/
ENTRY(rsdIntrinsicColorMatrix4x4_K)
        stmfd sp!, {r4, lr}
        vpush {q4-q7}

        vld1.16 {q2}, [r2]!
        vld1.16 {q3}, [r2]!

1:
        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!

        vmovl.u8 q12, d0  /* R */
        vmovl.u8 q13, d1  /* G */
        vmovl.u8 q14, d2  /* B */
        vmovl.u8 q15, d3  /* A */

        vmull.s16 q8, d24, d4[0]
        vmull.s16 q9, d24, d4[1]
        vmull.s16 q10, d24, d4[2]
        vmull.s16 q11, d24, d4[3]

        vmlal.s16 q8, d26, d5[0]
        vmlal.s16 q9, d26, d5[1]
        vmlal.s16 q10, d26, d5[2]
        vmlal.s16 q11, d26, d5[3]

        vmlal.s16 q8, d28, d6[0]
        vmlal.s16 q9, d28, d6[1]
        vmlal.s16 q10, d28, d6[2]
        vmlal.s16 q11, d28, d6[3]

        vmlal.s16 q8, d30, d7[0]
        vmlal.s16 q9, d30, d7[1]
        vmlal.s16 q10, d30, d7[2]
        vmlal.s16 q11, d30, d7[3]

        vshrn.i32 d24, q8, #8
        vshrn.i32 d26, q9, #8
        vshrn.i32 d28, q10, #8
        vshrn.i32 d30, q11, #8

        vqmovun.s16 d0, q12
        vqmovun.s16 d1, q13
        vqmovun.s16 d2, q14
        vqmovun.s16 d3, q15

        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!

        subs r3, r3, #1
        bne 1b

        vpop {q4-q7}
        ldmfd sp!, {r4, lr}
        bx lr
END(rsdIntrinsicColorMatrix4x4_K)

/*
    r0 = dst
    r1 = src
    r2 = matrix
    r3 = length
*/
ENTRY(rsdIntrinsicColorMatrix3x3_K)
        stmfd sp!, {r4, lr}
        vpush {q4-q7}

        vld1.16 {q2}, [r2]!
        vld1.16 {q3}, [r2]!

1:
        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!

        vmovl.u8 q12, d0
        vmovl.u8 q13, d1
        vmovl.u8 q14, d2

        vmull.s16 q8, d24, d4[0]
        vmull.s16 q9, d24, d4[1]
        vmull.s16 q10, d24, d4[2]

        vmlal.s16 q8, d26, d5[0]
        vmlal.s16 q9, d26, d5[1]
        vmlal.s16 q10, d26, d5[2]

        vmlal.s16 q8, d28, d6[0]
        vmlal.s16 q9, d28, d6[1]
        vmlal.s16 q10, d28, d6[2]

        vshrn.i32 d24, q8, #8
        vshrn.i32 d26, q9, #8
        vshrn.i32 d28, q10, #8

        vqmovun.s16 d0, q12
        vqmovun.s16 d1, q13
        vqmovun.s16 d2, q14

        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!

        subs r3, r3, #1
        bne 1b

        vpop {q4-q7}
        ldmfd sp!, {r4, lr}
        bx lr
END(rsdIntrinsicColorMatrix3x3_K)

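/*
   Reference sketch (not part of the build): the per-pixel math shared by the
   color-matrix kernels, in plain C. The matrix is stored column-major as
   16-bit fixed-point values with 8 fractional bits, each output channel is a
   dot product, and the result is narrowed with >>8 then saturated. The 3x3 and
   dot-product variants use a subset of these rows. colormatrix4x4_ref is an
   illustrative name, not an exported symbol.

   #include <stdint.h>

   static void colormatrix4x4_ref(uint8_t out[4], const uint8_t in[4],
                                  const int16_t m[16]) {   // m[col * 4 + row]
       for (int row = 0; row < 4; row++) {
           int32_t acc = 0;
           for (int col = 0; col < 4; col++)
               acc += (int32_t)in[col] * m[col * 4 + row];
           acc >>= 8;
           out[row] = acc < 0 ? 0 : (acc > 255 ? 255 : (uint8_t)acc);
       }
   }
*/
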
/*
    r0 = dst
    r1 = src
    r2 = matrix
    r3 = length
*/
ENTRY(rsdIntrinsicColorMatrixDot_K)
        stmfd sp!, {r4, lr}
        vpush {q4-q7}

        vld1.16 {q2}, [r2]!
        vld1.16 {q3}, [r2]!

1:
        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!

        vmovl.u8 q12, d0
        vmovl.u8 q13, d1
        vmovl.u8 q14, d2

        vmull.s16 q8, d24, d4[0]
        vmlal.s16 q8, d26, d5[0]
        vmlal.s16 q8, d28, d6[0]
        vshrn.i32 d24, q8, #8
        vqmovun.s16 d0, q12
        vmov.u8 d1, d0
        vmov.u8 d2, d0

        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!

        subs r3, r3, #1
        bne 1b

        vpop {q4-q7}
        ldmfd sp!, {r4, lr}
        bx lr
END(rsdIntrinsicColorMatrixDot_K)

/*
static void OneVF(float4 *out, const uchar *ptrIn, int iStride,
                  const float* gPtr, int iradius, int x1, int x2)

    r0 = out
    r1 = pin
    r2 = stride
    r3 = gptr
    r4 = sp, ct
    r5 = sp+4, x1
    r6 = sp+8, x2
*/
ENTRY(rsdIntrinsicBlurVFU4_K)
        push {r4-r8, r10, r11, lr}
        vpush {q4-q7}

        ldr r4, [sp, #32+64]
        ldr r5, [sp, #32+64 + 4]
        ldr r6, [sp, #32+64 + 8]

1:
        veor q10, q10, q10          /* float4 blurredPixel = 0; */
        veor q11, q11, q11          /* float4 blurredPixel = 0; */
        add r7, r1, r5, lsl #2      /* const uchar *pi = ptrIn + x1 * 4; */
        mov r10, r3

        mov r11, r4

2:
        vld1.32 {d2}, [r7]
        vmovl.u8 q1, d2
        vmovl.u16 q3, d2
        vmovl.u16 q4, d3
        vcvt.f32.s32 q3, q3
        vcvt.f32.s32 q4, q4
        vld1.32 {d0[0]}, [r10]!
        add r7, r7, r2
        vmla.f32 q10, q3, d0[0]
        vmla.f32 q11, q4, d0[0]
        subs r11, r11, #1
        bne 2b

        vst1.32 {q10}, [r0]!
        vst1.32 {q11}, [r0]!
        add r5, r5, #2
        cmp r5, r6
        bne 1b

        vpop {q4-q7}
        pop {r4-r8, r10, r11, lr}
        bx lr
END(rsdIntrinsicBlurVFU4_K)

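/*
   Reference sketch (not part of the build): the vertical blur pass above in
   plain C, one RGBA pixel at a time (the kernel does two per iteration). It
   walks ct source rows, weights each by one gaussian coefficient and
   accumulates into a float4 written to the intermediate buffer.
   blur_vert_ref is an illustrative name.

   #include <stdint.h>

   static void blur_vert_ref(float *out, const uint8_t *in, int stride,
                             const float *gauss, int ct, int x1, int x2) {
       for (int x = x1; x < x2; x++) {
           float acc[4] = { 0, 0, 0, 0 };
           const uint8_t *pi = in + x * 4;
           for (int r = 0; r < ct; r++) {
               for (int c = 0; c < 4; c++)
                   acc[c] += pi[c] * gauss[r];
               pi += stride;
           }
           for (int c = 0; c < 4; c++)
               *out++ = acc[c];          // one float4 per output pixel
       }
   }
*/
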
/*
static void OneVF(float4 *out, const uchar *ptrIn, int iStride,
                  const float* gPtr, int iradius, int x1, int x2)

    r0 = out
    r1 = pin
    r2 = gptr
    r3 = ct
    r4 = sp, x1
    r5 = sp+4, x2
*/
ENTRY(rsdIntrinsicBlurHFU4_K)
        push {r4-r8, r10, r11, lr}
        vpush {q4-q7}

        ldr r4, [sp, #32+64]
        ldr r5, [sp, #32+64 + 4]

1:
        add r7, r1, r4, lsl #4      /* const uchar *pi = ptrIn + x1 * 4; */
        mov r10, r2
        mov r11, r3

        vld1.32 {q1}, [r7]!
        vld1.32 {d6[0]}, [r10]!
        vmul.f32 q0, q1, d6[0]
        sub r11, r11, #1

2:
        vld1.32 {q1}, [r7]!
        vld1.32 {q2}, [r7]!
        vld1.32 {d6}, [r10]!
        vmla.f32 q0, q1, d6[0]
        vmla.f32 q0, q2, d6[1]
        subs r11, r11, #2
        bne 2b

        vcvt.s32.f32 q0, q0
        vmovn.u32 d0, q0
        vmovn.u16 d0, q0

        vst1.32 {d0[0]}, [r0]!
        add r4, r4, #1
        cmp r4, r5
        bne 1b

        vpop {q4-q7}
        pop {r4-r8, r10, r11, lr}
        bx lr
END(rsdIntrinsicBlurHFU4_K)

ENTRY(rsdIntrinsicBlurHFU1_K)
        push {r4-r8, r10, r11, lr}
        vpush {q4-q7}

        ldr r4, [sp, #32+64]
        ldr r5, [sp, #32+64 + 4]

1:
        add r7, r1, r4, lsl #2      /* const uchar *pi = ptrIn + x1 * 4; */
        mov r10, r2
        mov r11, r3

        veor q0, q0

2:
        vld1.32 {q1}, [r7]
        add r7, r7, #4
        vld1.32 {d4[0]}, [r10]!
        vmla.f32 q0, q1, d4[0]
        subs r11, r11, #1
        bne 2b

        vcvt.s32.f32 q0, q0
        vmovn.u32 d0, q0
        vmovn.u16 d0, q0

        vst1.32 {d0[0]}, [r0]!
        add r4, r4, #4
        cmp r4, r5
        bne 1b

        vpop {q4-q7}
        pop {r4-r8, r10, r11, lr}
        bx lr
END(rsdIntrinsicBlurHFU1_K)

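/*
   Reference sketch (not part of the build): the horizontal pass above in plain
   C. It reads ct float4 samples produced by the vertical pass, weights them
   with the gaussian coefficients and converts the sum back to 8-bit RGBA.
   blur_horz_ref is an illustrative name; the clamp is shown explicitly even
   though the kernel's vmovn narrowing relies on the coefficients summing to
   about 1.0 so the result already fits in a byte.

   #include <stdint.h>

   static void blur_horz_ref(uint8_t *out, const float *in, const float *gauss,
                             int ct, int x1, int x2) {
       for (int x = x1; x < x2; x++) {
           const float *pf = in + x * 4;          // float4 elements, 16 bytes apart
           float acc[4] = { 0, 0, 0, 0 };
           for (int i = 0; i < ct; i++)
               for (int c = 0; c < 4; c++)
                   acc[c] += pf[i * 4 + c] * gauss[i];
           for (int c = 0; c < 4; c++) {
               int v = (int)acc[c];               // vcvt.s32.f32 truncates toward zero
               *out++ = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
           }
       }
   }
*/
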
/*
    Function called with the following arguments: dst, Y, vu, len, YuvCoeff
    r0 = dst
    r1 = Y
    r2 = VU
    r3 = length (pixels / 8)
    ---- Args below will be in the stack ----
    sp = YuvCoeff

    This function converts 8 pixels per iteration
*/
ENTRY(rsdIntrinsicYuv_K)
        push {r4, r5, lr}               @ preserve clobbered int registers
        vpush {Q4-Q7}                   @ preserve the V registers we clobber

        mov r5, #16                     @ integer 16 in r5; used as a post-increment value

        ldr r4, [sp, #64+12]            @ load the coeffs address into r4 (16*4 + 4*3)
        vld1.16 {Q2}, [r4]!             @ load the multipliers from the coeffs matrix (r4) into Q2
        vld1.8 {d6[]}, [r4], r5         @ load the Y offset (16) from the coeffs matrix (r4) into d6
        vld1.8 {d8[]}, [r4], r5         @ load the V and U offset (128) from the coeffs matrix (r4) into d8

        mov r4, #8                      @ integer 8 in r4; used as a post-increment value

        vdup.8 d3, d5[1]                @ d3 = 255 (alpha) from the multipliers line in
                                        @ the coeffs matrix (Q2)

1:
        vld1.8 {d10}, [r1]!             @ get Y (r1 -> Y)
        vld2.8 {d12, d14}, [r2], r4     @ split V from U (r2 -> VU) and advance the pointer by 8 (r4)
        pld [r1, #64]                   @ preload data from y (r1) + 64 for subsequent loops
        pld [r2, #64]                   @ preload data from vu (r2) + 64 for subsequent loops

        vsubl.u8 Q5, d10, d6            @ widen Y to 16 bit and subtract 16 (n to n+7)
        vmull.s16 Q8, d10, d4[0]        @ Y(n..n+3) * 298 = Q8 (32 bit)
        vmull.s16 Q11, d11, d4[0]       @ Y(n+4..n+7) * 298 = Q11 (32 bit)

        vsubl.u8 Q5, d12, d8            @ widen V to 16 bit and subtract 128 = Q5 // V(n..n+3)
        vsubl.u8 Q6, d14, d8            @ widen U to 16 bit and subtract 128 = Q6 // U(n..n+3)
        vmov.u16 d11, d10               @ copy V to d11
        vmov.u16 d13, d12               @ copy U to d13
        vzip.u16 d10, d11               @ Q5 = V(n, n, n+1, n+1) V(n+2, n+2, n+3, n+3)
        vzip.u16 d12, d13               @ Q6 = U(n, n, n+1, n+1) U(n+2, n+2, n+3, n+3)

        vmov Q9, Q8                     @ copy Q8 (Y: n..n+3) to Q9
        vmov Q10, Q8                    @ copy Q8 (Y: n..n+3) to Q10
        vmov Q12, Q11                   @ copy Q11 (Y: n+4..n+7) to Q12
        vmov Q13, Q11                   @ copy Q11 (Y: n+4..n+7) to Q13

        @                 R    G    B
        @ Pixel(0-3)      Q8,  Q9,  Q10
        @ Pixel(4-7)      Q11, Q12, Q13
        @

        @ Pixel(0-3)
        vmlal.s16 Q8, d10, d4[1]        @ R : Q8  = Q8(Y-16)  + (V-128) * 409
        vmlal.s16 Q9, d10, d5[0]        @ G : Q9  = Q9(Y-16)  + (V-128) * (-208)
        vmlal.s16 Q9, d12, d4[2]        @                     + (U-128) * (-100)
        vmlal.s16 Q10, d12, d4[3]       @ B : Q10 = Q10(Y-16) + (U-128) * 516

        @ Pixel(4-7)
        vmlal.s16 Q11, d11, d4[1]       @ R : Q11 = Q11(Y-16) + (V-128) * 409
        vmlal.s16 Q12, d11, d5[0]       @ G : Q12 = Q12(Y-16) + (V-128) * (-208)
        vmlal.s16 Q12, d13, d4[2]       @                     + (U-128) * (-100)
        vmlal.s16 Q13, d13, d4[3]       @ B : Q13 = Q13(Y-16) + (U-128) * 516

        @ Pixel(0-3)
        vrshrn.i32 d16, Q8, #8          @ d16 : R rounded, shifted right by 8, narrowed to 16 bit
        vrshrn.i32 d18, Q9, #8          @ d18 : G rounded, shifted right by 8, narrowed to 16 bit
        vrshrn.i32 d20, Q10, #8         @ d20 : B rounded, shifted right by 8, narrowed to 16 bit

        @ Pixel(4-7)
        vrshrn.i32 d17, Q11, #8         @ d17 : R rounded, shifted right by 8, narrowed to 16 bit
        vrshrn.i32 d19, Q12, #8         @ d19 : G rounded, shifted right by 8, narrowed to 16 bit
        vrshrn.i32 d21, Q13, #8         @ d21 : B rounded, shifted right by 8, narrowed to 16 bit

        vqmovun.s16 d0, Q8              @ r = d0 (saturated, unsigned, narrowed to 8 bit)
        vqmovun.s16 d1, Q9              @ g = d1 (saturated, unsigned, narrowed to 8 bit)
        vqmovun.s16 d2, Q10             @ b = d2 (saturated, unsigned, narrowed to 8 bit)

        subs r3, r3, #1                 @ check the length (r3)
        vst4.8 {d0, d1, d2, d3}, [r0]!  @ write out 8 RGBA values to dst (r0)

        bne 1b                          @ if not done with the length, loop

        vpop {Q4-Q7}                    @ restore the V registers
        pop {r4, r5, lr}                @ restore the int registers
        bx lr
END(rsdIntrinsicYuv_K)

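/*
   Reference sketch (not part of the build): the fixed-point YUV -> RGBA math
   used by rsdIntrinsicYuv_K above and rsdIntrinsicYuv2_K below, for a single
   pixel, assuming the coefficient table described in the comments (298, 409,
   -208, -100, 516, with biases 16 and 128). yuv_to_rgba_ref is an
   illustrative name.

   #include <stdint.h>

   static void yuv_to_rgba_ref(uint8_t out[4], uint8_t y, uint8_t u, uint8_t v) {
       int32_t yy = (y - 16) * 298;
       int32_t r = (yy + (v - 128) * 409 + 128) >> 8;         // vrshrn rounds, hence +128
       int32_t g = (yy + (v - 128) * -208 + (u - 128) * -100 + 128) >> 8;
       int32_t b = (yy + (u - 128) * 516 + 128) >> 8;
       out[0] = r < 0 ? 0 : (r > 255 ? 255 : (uint8_t)r);     // vqmovun saturation
       out[1] = g < 0 ? 0 : (g > 255 ? 255 : (uint8_t)g);
       out[2] = b < 0 ? 0 : (b > 255 ? 255 : (uint8_t)b);
       out[3] = 255;                                          // alpha from the table
   }
*/
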
/*
    Function called with the following arguments: dst, Y, v, u, len, YuvCoeff
    r0 = dst
    r1 = Y
    r2 = V
    r3 = U
    ---- Args below will be in the stack ----
    sp = length (pixels / 8)
    sp+4 = YuvCoeff

    This function converts 8 pixels per iteration
*/
ENTRY(rsdIntrinsicYuv2_K)
        push {r4, r5, r6, lr}           @ preserve clobbered int registers
        vpush {Q4-Q7}                   @ preserve the V registers we clobber

        mov r5, #16                     @ integer 16 in r5; used as a post-increment value

        ldr r4, [sp, #64+16+4]          @ load the coeffs address into r4 (16*4 + 4*4 + 4)
        ldr r6, [sp, #64+16]            @ load the length into r6 (16*4 + 4*4)
        vld1.16 {Q2}, [r4]!             @ load the multipliers from the coeffs matrix (r4) into Q2
        vld1.8 {d6[]}, [r4], r5         @ load the Y offset (16) from the coeffs matrix (r4) into d6
        vld1.8 {d8[]}, [r4], r5         @ load the V and U offset (128) from the coeffs matrix (r4) into d8

        mov r4, #4                      @ integer 4 in r4; used as a post-increment value

        vdup.8 d3, d5[1]                @ d3 = 255 (alpha) from the multipliers line in
                                        @ the coeffs matrix (Q2)

1:
        vld1.8 {d10}, [r1]!             @ get Y (r1 -> Y)
        vld1.8 {d12}, [r3], r4          @ load chroma bytes from r3 and advance the pointer by 4 (r4)
        vld1.8 {d14}, [r2], r4          @ load chroma bytes from r2 and advance the pointer by 4 (r4)
        pld [r1, #64]                   @ preload data from y (r1) + 64 for subsequent loops
        pld [r2, #64]                   @ preload data from the chroma plane (r2) + 64 for subsequent loops

        vsubl.u8 Q5, d10, d6            @ widen Y to 16 bit and subtract 16 (n to n+7)
        vmull.s16 Q8, d10, d4[0]        @ Y(n..n+3) * 298 = Q8 (32 bit)
        vmull.s16 Q11, d11, d4[0]       @ Y(n+4..n+7) * 298 = Q11 (32 bit)

        vsubl.u8 Q5, d12, d8            @ widen V to 16 bit and subtract 128 = Q5 // V(n..n+3)
        vsubl.u8 Q6, d14, d8            @ widen U to 16 bit and subtract 128 = Q6 // U(n..n+3)
        vmov.u16 d11, d10               @ copy V to d11
        vmov.u16 d13, d12               @ copy U to d13
        vzip.u16 d10, d11               @ Q5 = V(n, n, n+1, n+1) V(n+2, n+2, n+3, n+3)
        vzip.u16 d12, d13               @ Q6 = U(n, n, n+1, n+1) U(n+2, n+2, n+3, n+3)

        vmov Q9, Q8                     @ copy Q8 (Y: n..n+3) to Q9
        vmov Q10, Q8                    @ copy Q8 (Y: n..n+3) to Q10
        vmov Q12, Q11                   @ copy Q11 (Y: n+4..n+7) to Q12
        vmov Q13, Q11                   @ copy Q11 (Y: n+4..n+7) to Q13

        @                 R    G    B
        @ Pixel(0-3)      Q8,  Q9,  Q10
        @ Pixel(4-7)      Q11, Q12, Q13
        @

        @ Pixel(0-3)
        vmlal.s16 Q8, d10, d4[1]        @ R : Q8  = Q8(Y-16)  + (V-128) * 409
        vmlal.s16 Q9, d10, d5[0]        @ G : Q9  = Q9(Y-16)  + (V-128) * (-208)
        vmlal.s16 Q9, d12, d4[2]        @                     + (U-128) * (-100)
        vmlal.s16 Q10, d12, d4[3]       @ B : Q10 = Q10(Y-16) + (U-128) * 516

        @ Pixel(4-7)
        vmlal.s16 Q11, d11, d4[1]       @ R : Q11 = Q11(Y-16) + (V-128) * 409
        vmlal.s16 Q12, d11, d5[0]       @ G : Q12 = Q12(Y-16) + (V-128) * (-208)
        vmlal.s16 Q12, d13, d4[2]       @                     + (U-128) * (-100)
        vmlal.s16 Q13, d13, d4[3]       @ B : Q13 = Q13(Y-16) + (U-128) * 516

        @ Pixel(0-3)
        vrshrn.i32 d16, Q8, #8          @ d16 : R rounded, shifted right by 8, narrowed to 16 bit
        vrshrn.i32 d18, Q9, #8          @ d18 : G rounded, shifted right by 8, narrowed to 16 bit
        vrshrn.i32 d20, Q10, #8         @ d20 : B rounded, shifted right by 8, narrowed to 16 bit

        @ Pixel(4-7)
        vrshrn.i32 d17, Q11, #8         @ d17 : R rounded, shifted right by 8, narrowed to 16 bit
        vrshrn.i32 d19, Q12, #8         @ d19 : G rounded, shifted right by 8, narrowed to 16 bit
        vrshrn.i32 d21, Q13, #8         @ d21 : B rounded, shifted right by 8, narrowed to 16 bit

        vqmovun.s16 d0, Q8              @ r = d0 (saturated, unsigned, narrowed to 8 bit)
        vqmovun.s16 d1, Q9              @ g = d1 (saturated, unsigned, narrowed to 8 bit)
        vqmovun.s16 d2, Q10             @ b = d2 (saturated, unsigned, narrowed to 8 bit)

        subs r6, r6, #1                 @ check the length (r6)
        vst4.8 {d0, d1, d2, d3}, [r0]!  @ write out 8 RGBA values to dst (r0)

        bne 1b                          @ if not done with the length, loop

        vpop {Q4-Q7}                    @ restore the V registers
        pop {r4, r5, r6, lr}            @ restore the int registers
        bx lr
END(rsdIntrinsicYuv2_K)

/* Convolve 5x5 */

/*
    r0 = dst
    r1 = y0 base pointer
    r2 = y1 base pointer
    r3 = y2 base pointer
    r4 = y3 base pointer
    r5 = y4 base pointer
    r6 = coeffs
    r7 = length
*/
ENTRY(rsdIntrinsicConvolve5x5_K)
        push {r4-r7, lr}
        vpush {q4-q7}

        /* load y3 in r4 */
        ldr r4, [sp, #20 + 64]

        /* load y4 in r5 */
        ldr r5, [sp, #24 + 64]

        /* Load the coefficients pointer */
        ldr r6, [sp, #28 + 64]

        /* Create the coefficients vector */
        vld1.16 {d0, d1, d2, d3}, [r6]!
        vld1.16 {d4, d5, d6}, [r6]

        vmov.u32 q15, #0x7f

        /* load the count */
        ldr r6, [sp, #32 + 64]

        /* Load the frequently used immediate in a register */
        mov r7, #8

1:
        /* Load the y base pointers in Qregs and post-increment the address by r7=#8 */
        vld1.8 {d24, d25, d26}, [r1], r7    @ y0 ( y - 2 )
        vld1.8 {d27, d28, d29}, [r2], r7    @ y1 ( y - 1 )

        /* Prefetch data that will be used two iterations from now */
        PLD (r1, r7)
        PLD (r2, r7)

        /* Promote the 8 bit channels to 16 bit */
        vmovl.u8 q9, d24
        vmovl.u8 q10, d25
        vmovl.u8 q11, d26
        vmovl.u8 q12, d27
        vmovl.u8 q13, d28
        vmovl.u8 q14, d29

/*
        d18, d19, d20, d21, d22, d23,
        d24, d25
*/
        vmull.s16 q4, d18, d0[0]
        vmlal.s16 q4, d19, d0[1]
        vmlal.s16 q4, d20, d0[2]
        vmlal.s16 q4, d21, d0[3]
        vmlal.s16 q4, d22, d1[0]

        vmlal.s16 q4, d24, d1[1]
        vmlal.s16 q4, d25, d1[2]
        vmlal.s16 q4, d26, d1[3]
        vmlal.s16 q4, d27, d2[0]
        vmlal.s16 q4, d28, d2[1]

        vmull.s16 q5, d19, d0[0]
        vmlal.s16 q5, d20, d0[1]
        vmlal.s16 q5, d21, d0[2]
        vmlal.s16 q5, d22, d0[3]
        vmlal.s16 q5, d23, d1[0]

        vmlal.s16 q5, d25, d1[1]
        vmlal.s16 q5, d26, d1[2]
        vmlal.s16 q5, d27, d1[3]
        vmlal.s16 q5, d28, d2[0]
        vmlal.s16 q5, d29, d2[1]

        /* Next 2 rows */
        /* Load the y base pointers in Qregs and post-increment the address by r7=#8 */
        vld1.8 {d24, d25, d26}, [r3], r7    @ y2 ( y )
        vld1.8 {d27, d28, d29}, [r4], r7    @ y3 ( y + 1 )

        /* Prefetch data that will be used two iterations from now */
        PLD (r3, r7)
        PLD (r4, r7)

        /* Promote the 8 bit channels to 16 bit */
        vmovl.u8 q9, d24
        vmovl.u8 q10, d25
        vmovl.u8 q11, d26
        vmovl.u8 q12, d27
        vmovl.u8 q13, d28
        vmovl.u8 q14, d29

/*
        d18, d19, d20, d21, d22, d23,
        d24, d25
*/
        vmlal.s16 q4, d18, d2[2]
        vmlal.s16 q4, d19, d2[3]
        vmlal.s16 q4, d20, d3[0]
        vmlal.s16 q4, d21, d3[1]
        vmlal.s16 q4, d22, d3[2]

        vmlal.s16 q4, d24, d3[3]
        vmlal.s16 q4, d25, d4[0]
        vmlal.s16 q4, d26, d4[1]
        vmlal.s16 q4, d27, d4[2]
        vmlal.s16 q4, d28, d4[3]

        vmlal.s16 q5, d19, d2[2]
        vmlal.s16 q5, d20, d2[3]
        vmlal.s16 q5, d21, d3[0]
        vmlal.s16 q5, d22, d3[1]
        vmlal.s16 q5, d23, d3[2]

        vmlal.s16 q5, d25, d3[3]
        vmlal.s16 q5, d26, d4[0]
        vmlal.s16 q5, d27, d4[1]
        vmlal.s16 q5, d28, d4[2]
        vmlal.s16 q5, d29, d4[3]

        /* Last row */
        /* Load the y base pointer in Qregs and post-increment the address by r7=#8 */
        vld1.8 {d24, d25, d26}, [r5], r7    @ y4 ( y + 2 )

        /* Prefetch data that will be used two iterations from now */
        PLD (r5, r7)

        /* Promote the 8 bit channels to 16 bit */
        vmovl.u8 q9, d24
        vmovl.u8 q10, d25
        vmovl.u8 q11, d26

/*
        d18, d19, d20, d21, d22, d23,
        d24, d25
*/
        vmlal.s16 q4, d18, d5[0]
        vmlal.s16 q4, d19, d5[1]
        vmlal.s16 q4, d20, d5[2]
        vmlal.s16 q4, d21, d5[3]
        vmlal.s16 q4, d22, d6[0]

        vmlal.s16 q5, d19, d5[0]
        vmlal.s16 q5, d20, d5[1]
        vmlal.s16 q5, d21, d5[2]
        vmlal.s16 q5, d22, d5[3]
        vmlal.s16 q5, d23, d6[0]

        vadd.i32 q4, q4, q15
        vadd.i32 q5, q5, q15

        /* Narrow it to a d-reg 32 -> 16 bit */
        vrshrn.i32 d8, q4, #8
        vrshrn.i32 d9, q5, #8

        /* Pack 16 -> 8 bit, saturate, put two pixels into a D reg */
        vqmovun.s16 d8, q4

        vst1.8 d8, [r0]!    @ store the output and advance r0

        /* Are we done? */
        subs r6, r6, #1
        bne 1b

        /* Yup, bye */
        vpop {q4-q7}
        pop {r4-r7, lr}
        bx lr

END(rsdIntrinsicConvolve5x5_K)

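/*
   Reference sketch (not part of the build): the 5x5 kernel follows the same
   pattern as the 3x3 one, with 25 fixed-point taps spread over five source
   rows and the 0x7f bias added before the rounding narrow, exactly as the
   vadd.i32/vrshrn sequence above does. conv5x5_ref is an illustrative name
   and processes a single output pixel.

   #include <stdint.h>

   static void conv5x5_ref(uint8_t dst[4], const uint8_t *rows[5],
                           const int16_t coeffs[25]) {
       for (int c = 0; c < 4; c++) {
           int32_t acc = 0x7f;                        // matches the vadd.i32 q15 bias
           for (int r = 0; r < 5; r++)
               for (int x = 0; x < 5; x++)
                   acc += rows[r][x * 4 + c] * coeffs[r * 5 + x];
           acc = (acc + 0x80) >> 8;                   // vrshrn.i32 #8 rounds
           dst[c] = acc < 0 ? 0 : (acc > 255 ? 255 : (uint8_t)acc);
       }
   }
*/
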
/*
    dst = src + dst * (1.0 - src.a)

    r0 = dst
    r1 = src
    r2 = length
*/
ENTRY(rsdIntrinsicBlendSrcOver_K)
        .save {r4, lr}
        stmfd sp!, {r4, lr}
        vpush {q4-q7}

        mov r4, #255
        vdup.16 q7, r4

        mov r4, r0
1:

        /* src */
        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
        vshll.u8 q12, d0, #8
        vshll.u8 q13, d1, #8
        vshll.u8 q14, d2, #8
        vmovl.u8 q6, d3
        vsub.i16 q6, q7, q6    // q6 = 1 - src.a
        vshll.u8 q15, d3, #8

        /* dst */
        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
        vmovl.u8 q8, d0
        vmovl.u8 q9, d1
        vmovl.u8 q10, d2
        vmovl.u8 q11, d3

        vmla.i16 q12, q8, q6
        vmla.i16 q13, q9, q6
        vmla.i16 q14, q10, q6
        vmla.i16 q15, q11, q6

        vshrn.i16 d0, q12, #8
        vshrn.i16 d1, q13, #8
        vshrn.i16 d2, q14, #8
        vshrn.i16 d3, q15, #8
        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!

        subs r2, r2, #1
        bne 1b

        vpop {q4-q7}
        ldmfd sp!, {r4, lr}
        bx lr
END(rsdIntrinsicBlendSrcOver_K)

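/*
   Reference sketch (not part of the build): the 8-bit src-over blend computed
   by the loop above, per channel. The blend kernels below follow the same
   widen / multiply / narrow pattern and differ only in the blend equation.
   Like the kernel, the sketch approximates division by 255 with a shift by 8
   and assumes premultiplied-alpha inputs (otherwise the 16-bit lanes in the
   kernel could overflow). blend_srcover_ref is an illustrative name.

   #include <stdint.h>

   static void blend_srcover_ref(uint8_t dst[4], const uint8_t src[4]) {
       uint16_t inv_sa = 255 - src[3];                 // q6 = 1 - src.a
       for (int c = 0; c < 4; c++) {
           // (src << 8) + dst * (255 - src.a), then >> 8, as in vshll/vmla/vshrn
           uint32_t v = ((uint32_t)src[c] << 8) + (uint32_t)dst[c] * inv_sa;
           dst[c] = (uint8_t)(v >> 8);
       }
   }
*/
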
/*
    dst = dst + src * (1.0 - dst.a)

    r0 = dst
    r1 = src
    r2 = length
*/
ENTRY(rsdIntrinsicBlendDstOver_K)
        .save {r4, lr}
        stmfd sp!, {r4, lr}
        vpush {q4-q7}

        mov r4, #255
        vdup.16 q7, r4

        mov r4, r0
1:

        /* src */
        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
        vmovl.u8 q12, d0
        vmovl.u8 q13, d1
        vmovl.u8 q14, d2
        vmovl.u8 q15, d3

        /* dst */
        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
        vshll.u8 q8, d0, #8
        vshll.u8 q9, d1, #8
        vshll.u8 q10, d2, #8
        vmovl.u8 q6, d3
        vsub.i16 q6, q7, q6    // q6 = 1 - dst.a
        vshll.u8 q11, d3, #8

        vmla.i16 q8, q12, q6
        vmla.i16 q9, q13, q6
        vmla.i16 q10, q14, q6
        vmla.i16 q11, q15, q6

        vshrn.i16 d0, q8, #8
        vshrn.i16 d1, q9, #8
        vshrn.i16 d2, q10, #8
        vshrn.i16 d3, q11, #8
        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!

        subs r2, r2, #1
        bne 1b

        vpop {q4-q7}
        ldmfd sp!, {r4, lr}
        bx lr
END(rsdIntrinsicBlendDstOver_K)

/*
    dst = src * dst.a

    r0 = dst
    r1 = src
    r2 = length
*/
ENTRY(rsdIntrinsicBlendSrcIn_K)
        .save {r4, lr}
        stmfd sp!, {r4, lr}
        vpush {q4-q7}

        mov r4, r0
1:

        /* src */
        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
        vmovl.u8 q12, d0
        vmovl.u8 q13, d1
        vmovl.u8 q14, d2
        vmovl.u8 q15, d3

        /* dst */
        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
        //vmovl.u8 q8, d0
        //vmovl.u8 q9, d1
        //vmovl.u8 q10, d2
        vmovl.u8 q11, d3

        vmul.i16 q12, q12, q11
        vmul.i16 q13, q13, q11
        vmul.i16 q14, q14, q11
        vmul.i16 q15, q15, q11

        vshrn.i16 d0, q12, #8
        vshrn.i16 d1, q13, #8
        vshrn.i16 d2, q14, #8
        vshrn.i16 d3, q15, #8
        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!

        subs r2, r2, #1
        bne 1b

        vpop {q4-q7}
        ldmfd sp!, {r4, lr}
        bx lr
END(rsdIntrinsicBlendSrcIn_K)

/*
    dst = dst * src.a

    r0 = dst
    r1 = src
    r2 = length
*/
ENTRY(rsdIntrinsicBlendDstIn_K)
        .save {r4, lr}
        stmfd sp!, {r4, lr}
        vpush {q4-q7}

        mov r4, r0
1:

        /* src */
        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
        //vmovl.u8 q12, d0
        //vmovl.u8 q13, d1
        //vmovl.u8 q14, d2
        vmovl.u8 q15, d3

        /* dst */
        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
        vmovl.u8 q8, d0
        vmovl.u8 q9, d1
        vmovl.u8 q10, d2
        vmovl.u8 q11, d3

        vmul.i16 q8, q8, q15
        vmul.i16 q9, q9, q15
        vmul.i16 q10, q10, q15
        vmul.i16 q11, q11, q15

        vshrn.i16 d0, q8, #8
        vshrn.i16 d1, q9, #8
        vshrn.i16 d2, q10, #8
        vshrn.i16 d3, q11, #8
        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!

        subs r2, r2, #1
        bne 1b

        vpop {q4-q7}
        ldmfd sp!, {r4, lr}
        bx lr
END(rsdIntrinsicBlendDstIn_K)

/*
    dst = src * (1.0 - dst.a)

    r0 = dst
    r1 = src
    r2 = length
*/
ENTRY(rsdIntrinsicBlendSrcOut_K)
        .save {r4, lr}
        stmfd sp!, {r4, lr}
        vpush {q4-q7}

        mov r4, #255
        vdup.16 q7, r4

        mov r4, r0
1:

        /* src */
        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
        vmovl.u8 q12, d0
        vmovl.u8 q13, d1
        vmovl.u8 q14, d2
        vmovl.u8 q15, d3

        /* dst */
        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
        //vmovl.u8 q8, d0
        //vmovl.u8 q9, d1
        //vmovl.u8 q10, d2
        vmovl.u8 q11, d3

        vsub.i16 q6, q7, q11    // q6 = 1 - dst.a
        vmul.i16 q12, q12, q6
        vmul.i16 q13, q13, q6
        vmul.i16 q14, q14, q6
        vmul.i16 q15, q15, q6

        vshrn.i16 d0, q12, #8
        vshrn.i16 d1, q13, #8
        vshrn.i16 d2, q14, #8
        vshrn.i16 d3, q15, #8
        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!

        subs r2, r2, #1
        bne 1b

        vpop {q4-q7}
        ldmfd sp!, {r4, lr}
        bx lr
END(rsdIntrinsicBlendSrcOut_K)

/*
    dst = dst * (1.0 - src.a)

    r0 = dst
    r1 = src
    r2 = length
*/
ENTRY(rsdIntrinsicBlendDstOut_K)
        .save {r4, lr}
        stmfd sp!, {r4, lr}
        vpush {q4-q7}

        mov r4, #255
        vdup.16 q7, r4

        mov r4, r0
1:

        /* src */
        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
        //vmovl.u8 q12, d0
        //vmovl.u8 q13, d1
        //vmovl.u8 q14, d2
        vmovl.u8 q15, d3

        /* dst */
        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
        vmovl.u8 q8, d0
        vmovl.u8 q9, d1
        vmovl.u8 q10, d2
        vmovl.u8 q11, d3

        vsub.i16 q6, q7, q15    // q6 = 1 - src.a
        vmul.i16 q12, q8, q6
        vmul.i16 q13, q9, q6
        vmul.i16 q14, q10, q6
        vmul.i16 q15, q11, q6

        vshrn.i16 d0, q12, #8
        vshrn.i16 d1, q13, #8
        vshrn.i16 d2, q14, #8
        vshrn.i16 d3, q15, #8
        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!

        subs r2, r2, #1
        bne 1b

        vpop {q4-q7}
        ldmfd sp!, {r4, lr}
        bx lr
END(rsdIntrinsicBlendDstOut_K)

/*
    dst.rgb = src.rgb * dst.a + (1.0 - src.a) * dst.rgb
    dst.a = dst.a

    r0 = dst
    r1 = src
    r2 = length
*/
ENTRY(rsdIntrinsicBlendSrcAtop_K)
        .save {r4, lr}
        stmfd sp!, {r4, lr}
        vpush {q4-q7}

        mov r4, #255
        vdup.16 q7, r4

        mov r4, r0
1:

        /* src */
        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
        vmovl.u8 q12, d0
        vmovl.u8 q13, d1
        vmovl.u8 q14, d2
        vmovl.u8 q15, d3

        /* dst */
        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
        vmovl.u8 q8, d0
        vmovl.u8 q9, d1
        vmovl.u8 q10, d2
        vmovl.u8 q11, d3

        vsub.i16 q6, q7, q15    // q6 = 1 - src.a
        vmul.i16 q8, q8, q6
        vmul.i16 q9, q9, q6
        vmul.i16 q10, q10, q6

        vmla.i16 q8, q12, q11
        vmla.i16 q9, q13, q11
        vmla.i16 q10, q14, q11

        vshrn.i16 d0, q8, #8
        vshrn.i16 d1, q9, #8
        vshrn.i16 d2, q10, #8
        //vshrn.i16 d3, q15, #8
        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!

        subs r2, r2, #1
        bne 1b

        vpop {q4-q7}
        ldmfd sp!, {r4, lr}
        bx lr
END(rsdIntrinsicBlendSrcAtop_K)

/*
    dst = dst.rgb * src.a + (1.0 - dst.a) * src.rgb
    dst.a = src.a

    r0 = dst
    r1 = src
    r2 = length
*/
ENTRY(rsdIntrinsicBlendDstAtop_K)
        .save {r4, lr}
        stmfd sp!, {r4, lr}
        vpush {q4-q7}

        mov r4, #255
        vdup.16 q7, r4

        mov r4, r0
1:

        /* src */
        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
        vmovl.u8 q12, d0
        vmovl.u8 q13, d1
        vmovl.u8 q14, d2
        vmovl.u8 q15, d3

        /* dst */
        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
        vmovl.u8 q8, d0
        vmovl.u8 q9, d1
        vmovl.u8 q10, d2
        vmovl.u8 q11, d3

        vsub.i16 q6, q7, q11    // q6 = 1 - dst.a
        vmul.i16 q12, q12, q6
        vmul.i16 q13, q13, q6
        vmul.i16 q14, q14, q6

        vmla.i16 q12, q8, q15
        vmla.i16 q13, q9, q15
        vmla.i16 q14, q10, q15

        vshrn.i16 d0, q12, #8
        vshrn.i16 d1, q13, #8
        vshrn.i16 d2, q14, #8
        //vshrn.i16 d3, q15, #8
        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!

        subs r2, r2, #1
        bne 1b

        vpop {q4-q7}
        ldmfd sp!, {r4, lr}
        bx lr
END(rsdIntrinsicBlendDstAtop_K)

/*
    dst = dst ^ src

    r0 = dst
    r1 = src
    r2 = length
*/
ENTRY(rsdIntrinsicBlendXor_K)
        .save {r4, lr}
        stmfd sp!, {r4, lr}
        vpush {q4-q7}

        mov r4, #255
        vdup.16 q7, r4

        mov r4, r0
1:

        /* src */
        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
        vmov.u8 d4, d0
        vmov.u8 d5, d1
        vmov.u8 d6, d2
        vmov.u8 d7, d3

        /* dst */
        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!

        veor d0, d0, d4
        veor d1, d1, d5
        veor d2, d2, d6
        veor d3, d3, d7

        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!

        subs r2, r2, #1
        bne 1b

        vpop {q4-q7}
        ldmfd sp!, {r4, lr}
        bx lr
END(rsdIntrinsicBlendXor_K)

/*
    dst = dst * src

    r0 = dst
    r1 = src
    r2 = length
*/
ENTRY(rsdIntrinsicBlendMultiply_K)
        .save {r4, lr}
        stmfd sp!, {r4, lr}
        vpush {q4-q7}

        mov r4, #255
        vdup.16 q7, r4

        mov r4, r0
1:

        /* src */
        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
        vmovl.u8 q12, d0
        vmovl.u8 q13, d1
        vmovl.u8 q14, d2
        vmovl.u8 q15, d3

        /* dst */
        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
        vmovl.u8 q8, d0
        vmovl.u8 q9, d1
        vmovl.u8 q10, d2
        vmovl.u8 q11, d3

        vmul.i16 q8, q8, q12
        vmul.i16 q9, q9, q13
        vmul.i16 q10, q10, q14
        vmul.i16 q11, q11, q15

        vshrn.i16 d0, q8, #8
        vshrn.i16 d1, q9, #8
        vshrn.i16 d2, q10, #8
        vshrn.i16 d3, q11, #8
        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!

        subs r2, r2, #1
        bne 1b

        vpop {q4-q7}
        ldmfd sp!, {r4, lr}
        bx lr
END(rsdIntrinsicBlendMultiply_K)

/*
    dst = min(src + dst, 1.0)

    r0 = dst
    r1 = src
    r2 = length
*/
ENTRY(rsdIntrinsicBlendAdd_K)
        .save {r4, lr}
        stmfd sp!, {r4, lr}
        vpush {q4-q7}

        mov r4, #255
        vdup.16 q7, r4

        mov r4, r0
1:

        /* src */
        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
        vmovl.u8 q12, d0
        vmovl.u8 q13, d1
        vmovl.u8 q14, d2
        vmovl.u8 q15, d3

        /* dst */
        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
        vmovl.u8 q8, d0
        vmovl.u8 q9, d1
        vmovl.u8 q10, d2
        vmovl.u8 q11, d3

        vadd.i16 q8, q8, q12
        vadd.i16 q9, q9, q13
        vadd.i16 q10, q10, q14
        vadd.i16 q11, q11, q15

        vqmovun.s16 d0, q8
        vqmovun.s16 d1, q9
        vqmovun.s16 d2, q10
        vqmovun.s16 d3, q11
        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!

        subs r2, r2, #1
        bne 1b

        vpop {q4-q7}
        ldmfd sp!, {r4, lr}
        bx lr
END(rsdIntrinsicBlendAdd_K)

/*
    dst = max(dst - src, 0.0)

    r0 = dst
    r1 = src
    r2 = length
*/
ENTRY(rsdIntrinsicBlendSub_K)
        .save {r4, lr}
        stmfd sp!, {r4, lr}
        vpush {q4-q7}

        mov r4, #255
        vdup.16 q7, r4

        mov r4, r0
1:

        /* src */
        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
        vmovl.u8 q12, d0
        vmovl.u8 q13, d1
        vmovl.u8 q14, d2
        vmovl.u8 q15, d3

        /* dst */
        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
        vmovl.u8 q8, d0
        vmovl.u8 q9, d1
        vmovl.u8 q10, d2
        vmovl.u8 q11, d3

        vsub.i16 q8, q8, q12
        vsub.i16 q9, q9, q13
        vsub.i16 q10, q10, q14
        vsub.i16 q11, q11, q15

        vqmovun.s16 d0, q8
        vqmovun.s16 d1, q9
        vqmovun.s16 d2, q10
        vqmovun.s16 d3, q11
        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!

        subs r2, r2, #1
        bne 1b

        vpop {q4-q7}
        ldmfd sp!, {r4, lr}
        bx lr
END(rsdIntrinsicBlendSub_K)

/* 3D LUT */

/*
    r0 = dst
    r1 = src
    r2 = cube base pointer
    r3 = cube Y stride
    r4 = cube Z stride
    r5 = count
    r10 = * constants

    d0  / q0  = weight 1 p1
    d1        = weight 2 p1

    d2  / q1  = weight 1 p2
    d3        = weight 2 p2

    d4  / q2  = src1
    d5        = src2

    d6  / q3  = baseCoord
    d7        = baseCoord

    d8  / q4  = coord1 p1
    d9        =

    d10 / q5  = coord1 p2
    d11       =

    d12 / q6  =
    d13       =

    d14 / q7  =
    d15       =

    d16 / q8  = x0 y0 z0
    d17       = x1 y0 z0
    d18 / q9  = x0 y1 z0
    d19       = x1 y1 z0
    d20 / q10 = x0 y0 z1
    d21       = x1 y0 z1
    d22 / q11 = x0 y1 z1
    d23       = x1 y1 z1

    d24 / q12 = alpha mask
    d25       = current pixel alpha
    d26 / q13 = 4, y stride
    d27       = z stride, 0
    d28 / q14 = 0x8000
    d29       = 0x7fff
    d30 / q15 = 0, 0, 0, 0xffff

    d31       = coordMult
*/

ENTRY(rsdIntrinsic3DLUT_K)
        push {r4-r8, r10, r11, lr}
        vpush {q4-q7}

        /* load the Z stride in r4 */
        ldr r4, [sp, #32 + 64]

        /* load the count */
        ldr r5, [sp, #36 + 64]

        vmov.u16 d28, #0x8000
        vmov.u16 d29, #0x7fff
        vmov.u32 d24, #0xff000000

        /* load the constants using r10 */
        ldr r10, [sp, #40 + 64]
        vld1.32 {d31}, [r10]!
        vld1.32 {d30}, [r10]!

        mov r6, #4
        vmov d26, r6, r3
        mov r6, #0
        vmov d27, r4, r6

        add r8, r3, r4

1:
        vld1.8 {d4}, [r1]!
        vand.u8 d25, d4, d24
        vmovl.u8 q2, d4

        vmull.u16 q3, d4, d31
        vshr.u32 q4, q3, #15    // coord1 p1
        vmovn.u32 d1, q3
        vand.u16 d1, d29        // weight 2
        vsub.u16 d0, d28, d1    // weight 1
        vmul.u32 q4, q4, q13    // q4 = x*4, y*ystride, z*zstride, 0

        vmull.u16 q3, d5, d31
        vshr.u32 q5, q3, #15    // coord1 p2
        vmovn.u32 d3, q3
        vand.u16 d3, d29        // weight 2
        vsub.u16 d2, d28, d3    // weight 1
        vmul.u32 q5, q5, q13    // q5 = x*4, y*ystride, z*zstride, 0

        vpadd.u32 d8, d8, d9
        vpadd.u32 d9, d10, d11
        vpadd.u32 d8, d8, d9
        vmov r6, r7, d8         // base pointers

        add r6, r6, r2
        add r7, r7, r2

        vld1.8 {d16}, [r6]
        add r11, r6, r3
        vld1.8 {d18}, [r11]
        add r11, r6, r4
        vld1.8 {d20}, [r11]
        add r11, r6, r8
        vld1.8 {d22}, [r11]

        vmovl.u8 q8, d16
        vmovl.u8 q9, d18
        vmovl.u8 q10, d20
        vmovl.u8 q11, d22

        vmull.u16 q6, d16, d0[0]
        vmlal.u16 q6, d17, d1[0]
        vshrn.u32 d16, q6, #7
        vmull.u16 q6, d18, d0[0]
        vmlal.u16 q6, d19, d1[0]
        vshrn.u32 d18, q6, #7
        vmull.u16 q6, d20, d0[0]
        vmlal.u16 q6, d21, d1[0]
        vshrn.u32 d20, q6, #7
        vmull.u16 q6, d22, d0[0]
        vmlal.u16 q6, d23, d1[0]
        vshrn.u32 d22, q6, #7

        vmull.u16 q6, d16, d0[1]
        vmlal.u16 q6, d18, d1[1]
        vshrn.u32 d16, q6, #15
        vmull.u16 q6, d20, d0[1]
        vmlal.u16 q6, d22, d1[1]
        vshrn.u32 d18, q6, #15

        vmull.u16 q6, d16, d0[2]
        vmlal.u16 q6, d18, d1[2]
        vshrn.u32 d14, q6, #15

        vld1.8 {d16}, [r7]
        add r11, r7, r3
        vld1.8 {d18}, [r11]
        add r11, r7, r4
        vld1.8 {d20}, [r11]
        add r11, r7, r8
        vld1.8 {d22}, [r11]
        vmovl.u8 q8, d16
        vmovl.u8 q9, d18
        vmovl.u8 q10, d20
        vmovl.u8 q11, d22

        vmull.u16 q6, d16, d2[0]
        vmlal.u16 q6, d17, d3[0]
        vshrn.u32 d16, q6, #7
        vmull.u16 q6, d18, d2[0]
        vmlal.u16 q6, d19, d3[0]
        vshrn.u32 d18, q6, #7
        vmull.u16 q6, d20, d2[0]
        vmlal.u16 q6, d21, d3[0]
        vshrn.u32 d20, q6, #7
        vmull.u16 q6, d22, d2[0]
        vmlal.u16 q6, d23, d3[0]
        vshrn.u32 d22, q6, #7

        vmull.u16 q6, d16, d2[1]
        vmlal.u16 q6, d18, d3[1]
        vshrn.u32 d16, q6, #15
        vmull.u16 q6, d20, d2[1]
        vmlal.u16 q6, d22, d3[1]
        vshrn.u32 d18, q6, #15

        vmull.u16 q6, d16, d2[2]
        vmlal.u16 q6, d18, d3[2]
        vshrn.u32 d15, q6, #15

        vrshrn.u16 d14, q7, #8

        vbic.u8 d14, d14, d24   // mix in alpha
        vorr.u8 d14, d14, d25
        vst1.32 {d14}, [r0]!

        /* Are we done? */
        subs r5, r5, #1
        bne 1b

        /* Yup, bye */
        vpop {q4-q7}
        pop {r4-r8, r10, r11, lr}
        bx lr

END(rsdIntrinsic3DLUT_K)
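
/*
   Reference sketch (not part of the build): the trilinear 3D LUT lookup that
   rsdIntrinsic3DLUT_K implements in 16-bit fixed point, written with floats
   for clarity. coordMult scales an 8-bit channel into LUT space; the x step of
   4 bytes, the y/z strides and the preserved alpha mirror the register map
   above. lut3d_ref and its parameters are illustrative assumptions.

   #include <stdint.h>

   static void lut3d_ref(uint8_t out[4], const uint8_t in[4], const uint8_t *cube,
                         int ystride, int zstride, const float coordMult[3]) {
       float f[3]; int i0[3]; float w[3];
       for (int c = 0; c < 3; c++) {
           f[c] = in[c] * coordMult[c];       // position in LUT space
           i0[c] = (int)f[c];                 // lattice cell
           w[c] = f[c] - i0[c];               // interpolation weight
       }
       const uint8_t *base = cube + i0[0] * 4 + i0[1] * ystride + i0[2] * zstride;
       for (int c = 0; c < 3; c++) {
           float v = 0.0f;
           for (int k = 0; k < 8; k++) {      // 8 corners of the cell
               const uint8_t *p = base + (k & 1) * 4 + ((k >> 1) & 1) * ystride
                                       + ((k >> 2) & 1) * zstride;
               float wk = ((k & 1) ? w[0] : 1 - w[0])
                        * (((k >> 1) & 1) ? w[1] : 1 - w[1])
                        * (((k >> 2) & 1) ? w[2] : 1 - w[2]);
               v += wk * p[c];
           }
           out[c] = (uint8_t)(v + 0.5f);
       }
       out[3] = in[3];                        // alpha passes through
   }
*/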