/*
 * Copyright © 2021, VideoLAN and dav1d authors
 * Copyright © 2021, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"
#include "src/arm/asm-offsets.h"

#define GRAIN_WIDTH 82
#define GRAIN_HEIGHT 73

#define SUB_GRAIN_WIDTH 44
#define SUB_GRAIN_HEIGHT 38

.macro increment_seed steps, shift=1
        lsr             r11, r2,  #3
        lsr             r12, r2,  #12
        lsr             lr,  r2,  #1
        eor             r11, r2,  r11      // (r >> 0) ^ (r >> 3)
        eor             r12, r12, lr       // (r >> 12) ^ (r >> 1)
        eor             r11, r11, r12      // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1)
.if \shift
        lsr             r2,  r2,  #\steps
.endif
        and             r11, r11, #((1 << \steps) - 1) // bit
.if \shift
        orr             r2,  r2,  r11, lsl #(16 - \steps) // *state
.else
        orr             r2,  r2,  r11, lsl #16            // *state
.endif
.endm

.macro read_rand dest, bits, age
        ubfx            \dest, r2, #16 - \bits - \age, #\bits
.endm

.macro read_shift_rand dest, bits
        ubfx            \dest, r2, #17 - \bits, #\bits
        lsr             r2,  r2,  #1
.endm
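// The three macros above implement the AV1 grain LFSR. A rough C equivalent
// of a single step (a sketch for reference, not the exact dav1d source):
//
//   static unsigned get_random_number(const int bits, unsigned *const state) {
//       const unsigned r = *state;
//       const unsigned bit = ((r >> 0) ^ (r >> 1) ^ (r >> 3) ^ (r >> 12)) & 1;
//       *state = (r >> 1) | (bit << 15);
//       return (*state >> (16 - bits)) & ((1 << bits) - 1);
//   }
//
// increment_seed runs \steps such iterations at once, keeping the state in
// r2 so that read_rand can extract the individual 11 bit outputs afterwards.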
// special calling convention:
// r2 holds seed
// r3 holds dav1d_gaussian_sequence
// clobbers r11-r12
// returns in d0-d1
function get_gaussian_neon
        push            {r5-r6,lr}
        increment_seed  4
        read_rand       r5,  11,  3
        read_rand       r6,  11,  2
        add             r5,  r3,  r5,  lsl #1
        add             r6,  r3,  r6,  lsl #1
        vld1.16         {d0[0]}, [r5]
        read_rand       r5,  11,  1
        vld1.16         {d0[1]}, [r6]
        add             r5,  r3,  r5,  lsl #1
        read_rand       r6,  11,  0
        increment_seed  4
        add             r6,  r3,  r6,  lsl #1
        vld1.16         {d0[2]}, [r5]
        read_rand       r5,  11,  3
        vld1.16         {d0[3]}, [r6]
        add             r5,  r3,  r5,  lsl #1
        read_rand       r6,  11,  2
        vld1.16         {d1[0]}, [r5]
        add             r6,  r3,  r6,  lsl #1
        read_rand       r5,  11,  1
        vld1.16         {d1[1]}, [r6]
        read_rand       r6,  11,  0
        add             r5,  r3,  r5,  lsl #1
        add             r6,  r3,  r6,  lsl #1
        vld1.16         {d1[2]}, [r5]
        vld1.16         {d1[3]}, [r6]
        pop             {r5-r6,pc}
endfunc

function get_grain_2_neon
        push            {r11,lr}
        increment_seed  2
        read_rand       r11, 11,  1
        read_rand       r12, 11,  0
        add             r11, r3,  r11, lsl #1
        add             r12, r3,  r12, lsl #1
        vld1.16         {d0[0]}, [r11]
        vld1.16         {d0[1]}, [r12]
        vrshl.s16       d0,  d0,  d30
        pop             {r11,pc}
endfunc

.macro get_grain_2 dst
        bl              get_grain_2_neon
.ifnc \dst, d0
        vmov            \dst, d0
.endif
.endm

function get_grain_4_neon
        push            {r11,lr}
        increment_seed  4
        read_rand       r11, 11,  3
        read_rand       r12, 11,  2
        add             r11, r3,  r11, lsl #1
        add             r12, r3,  r12, lsl #1
        vld1.16         {d0[0]}, [r11]
        read_rand       r11, 11,  1
        vld1.16         {d0[1]}, [r12]
        read_rand       r12, 11,  0
        add             r11, r3,  r11, lsl #1
        add             r12, r3,  r12, lsl #1
        vld1.16         {d0[2]}, [r11]
        vld1.16         {d0[3]}, [r12]
        vrshl.s16       d0,  d0,  d30
        pop             {r11,pc}
endfunc

.macro get_grain_4 dst
        bl              get_grain_4_neon
.ifnc \dst, d0
        vmov            \dst, d0
.endif
.endm

// r1 holds the number of entries to produce
// r6, r8 and r10 hold the previous output entries
// q0 holds the vector of produced entries
// q1 holds the input vector of sums from above
.macro output_lag n
function output_lag\n\()_neon
        push            {r0, lr}
.if \n == 1
        mvn             lr,  r5            // grain_min = ~grain_max
.else
        mov             r0,  #1
        mov             lr,  #1
        sub             r7,  r7,  #1
        sub             r9,  r9,  #1
        lsl             r0,  r0,  r7
        lsl             lr,  lr,  r9
        add             r7,  r7,  #1
        add             r9,  r9,  #1
.endif
1:
        read_shift_rand r12, 11
        vmov.32         r11, d2[0]
        lsl             r12, r12, #1
        vext.8          q0,  q0,  q0,  #2
        ldrsh           r12, [r3, r12]
.if \n == 1
        mla             r11, r6,  r4,  r11 // sum (above) + *coeff * prev output
        add             r6,  r11, r8       // 1 << (ar_coeff_shift - 1)
        add             r12, r12, r10
        asr             r6,  r6,  r7       // >> ar_coeff_shift
        asr             r12, r12, r9       // >> (4 - bitdepth_min_8 + grain_scale_shift)
        add             r6,  r6,  r12
        cmp             r6,  r5
.elseif \n == 2
        mla             r11, r8,  r4,  r11 // sum (above) + *coeff * prev output 1
        mla             r11, r6,  r10, r11 // += *coeff * prev output 2
        mov             r8,  r6
        add             r6,  r11, r0       // 1 << (ar_coeff_shift - 1)
        add             r12, r12, lr       // 1 << (4 - bitdepth_min_8 + grain_scale_shift - 1)
        asr             r6,  r6,  r7       // >> ar_coeff_shift
        asr             r12, r12, r9       // >> (4 - bitdepth_min_8 + grain_scale_shift)
        add             r6,  r6,  r12
        push            {lr}
        cmp             r6,  r5
        mvn             lr,  r5            // grain_min = ~grain_max
.else
        push            {r1-r3}
        sbfx            r1,  r4,  #0,  #8
        sbfx            r2,  r4,  #8,  #8
        sbfx            r3,  r4,  #16, #8
        mla             r11, r10, r1,  r11 // sum (above) + *coeff * prev output 1
        mla             r11, r8,  r2,  r11 // sum (above) + *coeff * prev output 2
        mla             r11, r6,  r3,  r11 // += *coeff * prev output 3
        pop             {r1-r3}
        mov             r10, r8
        mov             r8,  r6

        add             r6,  r11, r0       // 1 << (ar_coeff_shift - 1)
        add             r12, r12, lr       // 1 << (4 - bitdepth_min_8 + grain_scale_shift - 1)
        asr             r6,  r6,  r7       // >> ar_coeff_shift
        asr             r12, r12, r9       // >> (4 - bitdepth_min_8 + grain_scale_shift)
        add             r6,  r6,  r12
        push            {lr}
        cmp             r6,  r5
        mvn             lr,  r5            // grain_min = ~grain_max
.endif
        it              gt
        movgt           r6,  r5
        cmp             r6,  lr
        it              lt
        movlt           r6,  lr
.if \n >= 2
        pop             {lr}
.endif
        subs            r1,  r1,  #1
        vext.8          q1,  q1,  q1,  #4
        vmov.16         d1[3], r6
        bgt             1b
        pop             {r0, pc}
endfunc
.endm

output_lag 1
output_lag 2
output_lag 3
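// output_lagN turns a vector of precomputed "sums from above" into actual
// grain values one entry at a time; the recursion over previous outputs in
// the same row is inherently serial. Per entry this computes, roughly (a
// sketch, not the exact dav1d C source):
//
//   sum += ar_coeff * prev_output[1..n];
//   grain  = round2(sum, ar_coeff_shift)
//          + round2(dav1d_gaussian_sequence[get_random_number(11)],
//                   4 - bitdepth_min_8 + grain_scale_shift);
//   output = iclip(grain, grain_min, grain_max);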
function sum_lag1_above_neon
        sub             r12, r0,  #1*GRAIN_WIDTH*2 - 16
        vld1.16         {q10}, [r12] // load top right

        vext.8          q0,  q8,  q9,  #14 // top left, top mid
        vext.8          q1,  q9,  q10, #2  // top mid, top right

        vmull.s16       q2,  d18, d28
        vmlal.s16       q2,  d0,  d27
        vmlal.s16       q2,  d2,  d29
        vmull.s16       q3,  d19, d28
        vmlal.s16       q3,  d1,  d27
        vmlal.s16       q3,  d3,  d29

        vmov            q8,  q9
        vmov            q9,  q10

        bx              lr
endfunc

.macro sum_lag_n_body lag, type, uv_layout, edge, elems, uv_coeff
.ifc \lag\()_\edge, lag3_left
        bl              sum_lag3_left_above_neon
.else
        bl              sum_\lag\()_above_neon
.endif
.ifc \type, uv_420
        vpush           {q6-q7}
        add             r12, r11, #GRAIN_WIDTH*2
        vld1.16         {q0, q1},  [r11]!
        vld1.16         {q6, q7},  [r12]!
        vpadd.i16       d0,  d0,  d1
        vpadd.i16       d1,  d2,  d3
        vpadd.i16       d12, d12, d13
        vpadd.i16       d13, d14, d15
        vadd.i16        q0,  q0,  q6
        vpop            {q6-q7}
        vrshr.s16       q0,  q0,  #2
.endif
.ifc \type, uv_422
        vld1.16         {q0, q1},  [r11]!
        vpadd.i16       d0,  d0,  d1
        vpadd.i16       d1,  d2,  d3
        vrshr.s16       q0,  q0,  #1
.endif
.ifc \type, uv_444
        vld1.16         {q0}, [r11]!
.endif
.if \uv_layout
.ifnb \uv_coeff
        vdup.8          d13, \uv_coeff
        vmovl.s8        q6,  d13
.endif
        vmlal.s16       q2,  d0,  d13
        vmlal.s16       q3,  d1,  d13
.endif
.if \uv_layout && \elems == 8
        b               sum_\lag\()_y_\edge\()_start
.elseif \uv_layout == 444 && \elems == 7
        b               sum_\lag\()_y_\edge\()_start
.elseif \uv_layout == 422 && \elems == 1
        b               sum_\lag\()_uv_420_\edge\()_start
.else
sum_\lag\()_\type\()_\edge\()_start:
        push            {r11}
.if \elems > 4
.ifc \edge, left
        increment_seed  4
        read_rand       r11, 11,  3
        read_rand       r12, 11,  2
        add             r11, r3,  r11, lsl #1
        add             r12, r3,  r12, lsl #1
        vld1.16         {d1[1]}, [r11]
        read_rand       r11, 11,  1
        vld1.16         {d1[2]}, [r12]
        add             r11, r3,  r11, lsl #1
        vld1.16         {d1[3]}, [r11]
        lsl             r2,  r2,  #1 // shift back the state as if we'd done increment_seed with shift=0
        vrshl.s16       d1,  d1,  d30
        vext.8          q2,  q2,  q2,  #12
.ifc \lag, lag3
        vmov.s16        r10, d1[1]
.endif
.ifnc \lag, lag1
        vmov.s16        r8,  d1[2]
.endif
        vmov.s16        r6,  d1[3]

        vmov            q1,  q2
        mov             r1,  #1
        bl              output_\lag\()_neon
.else
        increment_seed  4,   shift=0
        vmov            q1,  q2
        mov             r1,  #4
        bl              output_\lag\()_neon
.endif

        increment_seed  4,   shift=0
        vmov            q1,  q3
.ifc \edge, right
        mov             r1,  #3
        bl              output_\lag\()_neon
        read_shift_rand r12, 11
        add             r12, r3,  r12, lsl #1
        vld1.16         {d2[0]}, [r12]
        vrshl.s16       d2,  d2,  d30
        vext.8          q0,  q0,  q1,  #2
.else
        mov             r1,  #4
        bl              output_\lag\()_neon
.endif
.else
        // elems == 1
        increment_seed  4,   shift=0
        vmov            q1,  q2
        mov             r1,  #1
        bl              output_\lag\()_neon
        lsr             r2,  r2,  #3

        read_rand       r11, 11,  2
        read_rand       r12, 11,  1
        add             r11, r3,  r11, lsl #1
        add             r12, r3,  r12, lsl #1
        vld1.16         {d2[0]}, [r11]
        read_rand       r11, 11,  0
        vld1.16         {d2[1]}, [r12]
        add             r11, r3,  r11, lsl #1
        vld1.16         {d2[2]}, [r11]
        vrshl.s16       d2,  d2,  d30
        vext.8          q0,  q0,  q1,  #14
.endif
        vst1.16         {q0}, [r0]!
        pop             {r11}
        pop             {r1, pc}
.endif
.endm
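// For the chroma layouts, the luma grain feeding the cross-plane coefficient
// is downsampled on the fly above. In rough C terms (a sketch matching the
// rounding of the vpadd/vadd/vrshr sequences):
//
//   uv_420: luma = (l[2*x] + l[2*x+1] + l[2*x+stride] + l[2*x+1+stride] + 2) >> 2
//   uv_422: luma = (l[2*x] + l[2*x+1] + 1) >> 1
//   uv_444: luma = l[x]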
.macro sum_lag1_func type, uv_layout, edge, elems=8
function sum_\type\()_lag1_\edge\()_neon
        push            {r1, lr}
.ifc \edge, left
        sub             r12, r0,  #1*GRAIN_WIDTH*2
        vld1.8          {q9}, [r12] // load the previous block right above
.endif
        sum_lag_n_body  lag1, \type, \uv_layout, \edge, \elems
endfunc
.endm

sum_lag1_func y,      0,   left
sum_lag1_func y,      0,   mid
sum_lag1_func y,      0,   right, 7
sum_lag1_func uv_444, 444, left
sum_lag1_func uv_444, 444, mid
sum_lag1_func uv_444, 444, right, 7
sum_lag1_func uv_422, 422, left
sum_lag1_func uv_422, 422, mid
sum_lag1_func uv_422, 422, right, 1
sum_lag1_func uv_420, 420, left
sum_lag1_func uv_420, 420, mid
sum_lag1_func uv_420, 420, right, 1


function sum_lag2_above_neon
        push            {lr}
        sub             r12, r0,  #2*GRAIN_WIDTH*2 - 16
        sub             lr,  r0,  #1*GRAIN_WIDTH*2 - 16
        vld1.16         {q10}, [r12] // load top right
        vld1.16         {q13}, [lr]

        vdup.8          d10, d28[0]
        vext.8          q0,  q8,  q9,  #12 // top left, top mid
        vdup.8          d12, d28[1]
        vext.8          q1,  q8,  q9,  #14
        vdup.8          d14, d28[3]
        vext.8          q4,  q9,  q10, #2  // top mid, top right
        vmovl.s8        q5,  d10
        vmovl.s8        q6,  d12
        vmovl.s8        q7,  d14

        vmull.s16       q2,  d0,  d10
        vmlal.s16       q2,  d2,  d12
        vmlal.s16       q2,  d8,  d14
        vmull.s16       q3,  d1,  d10
        vmlal.s16       q3,  d3,  d12
        vmlal.s16       q3,  d9,  d14

        vdup.8          d10, d28[4]
        vext.8          q0,  q9,  q10, #4  // top mid, top right
        vdup.8          d12, d28[5]
        vext.8          q1,  q11, q12, #12 // top left, top mid
        vdup.8          d14, d28[6]
        vext.8          q4,  q11, q12, #14
        vmovl.s8        q5,  d10
        vmovl.s8        q6,  d12
        vmovl.s8        q7,  d14

        vmlal.s16       q2,  d0,  d10
        vmlal.s16       q2,  d2,  d12
        vmlal.s16       q2,  d8,  d14
        vmlal.s16       q3,  d1,  d10
        vmlal.s16       q3,  d3,  d12
        vmlal.s16       q3,  d9,  d14

        vdup.8          d10, d29[0]
        vext.8          q0,  q12, q13, #2  // top mid, top right
        vdup.8          d12, d29[1]
        vext.8          q1,  q12, q13, #4

        vdup.8          d14, d28[2]
        vdup.8          d8,  d28[7]

        vmovl.s8        q5,  d10
        vmovl.s8        q6,  d12
        vmovl.s8        q7,  d14
        vmovl.s8        q4,  d8

        vmlal.s16       q2,  d0,  d10
        vmlal.s16       q2,  d2,  d12
        vmlal.s16       q2,  d18, d14
        vmlal.s16       q2,  d24, d8
        vmlal.s16       q3,  d1,  d10
        vmlal.s16       q3,  d3,  d12
        vmlal.s16       q3,  d19, d14
        vmlal.s16       q3,  d25, d8

        vmov            q8,  q9
        vmov            q9,  q10

        vmov            q11, q12
        vmov            q12, q13

        pop             {pc}
endfunc

.macro sum_lag2_func type, uv_layout, edge, elems=8
function sum_\type\()_lag2_\edge\()_neon
        push            {r1, lr}
.ifc \edge, left
        sub             r12, r0,  #2*GRAIN_WIDTH*2
        sub             lr,  r0,  #1*GRAIN_WIDTH*2
        vld1.16         {q9},  [r12] // load the previous block right above
        vld1.16         {q12}, [lr]
.endif
        sum_lag_n_body  lag2, \type, \uv_layout, \edge, \elems, uv_coeff=d29[4]
endfunc
.endm

sum_lag2_func y,      0,   left
sum_lag2_func y,      0,   mid
sum_lag2_func y,      0,   right, 7
sum_lag2_func uv_444, 444, left
sum_lag2_func uv_444, 444, mid
sum_lag2_func uv_444, 444, right, 7
sum_lag2_func uv_422, 422, left
sum_lag2_func uv_422, 422, mid
sum_lag2_func uv_422, 422, right, 1
sum_lag2_func uv_420, 420, left
sum_lag2_func uv_420, 420, mid
sum_lag2_func uv_420, 420, right, 1
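// Coefficient layout for lag 2 (as implied by the lane indices above): the
// ten above-row coefficients ar_coeffs[0..9] live as bytes in d28/d29[0..1]
// and are widened on the fly, while the two current-row coefficients
// ar_coeffs[10..11] are extracted to GPRs in the lag2 entry points further
// below and applied by output_lag2_neon. d29[4] is the extra cross-plane
// luma coefficient used for the chroma variants.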
function sum_lag3_left_above_neon
        // A separate codepath for the left edge, to avoid reading outside
        // of the edge of the buffer.
        sub             r12, r0,  #3*GRAIN_WIDTH*2
        vld1.8          {q11, q12}, [r12]
        vext.8          q12, q11, q12, #10
        vext.8          q11, q11, q11, #10
        b               sum_lag3_above_start
endfunc

function sum_lag3_above_neon
        movw            r12, #(3*GRAIN_WIDTH + 3)*2
        sub             r12, r0,  r12
        vld1.8          {q11, q12}, [r12]

sum_lag3_above_start:
        vdup.8          d12, d26[0]
        vext.8          q1,  q11, q12, #2
        vdup.8          d14, d26[1]
        vext.8          q4,  q11, q12, #4
        vdup.8          d16, d26[2]
        vext.8          q5,  q11, q12, #6
        vdup.8          d18, d26[3]
        vmovl.s8        q6,  d12
        vmovl.s8        q7,  d14
        vmovl.s8        q8,  d16
        vmovl.s8        q9,  d18

        movw            r12, #(2*GRAIN_WIDTH + 3)*2
        sub             r12, r0,  r12

        vmull.s16       q2,  d22, d12
        vmlal.s16       q2,  d2,  d14
        vmlal.s16       q2,  d8,  d16
        vmlal.s16       q2,  d10, d18
        vmull.s16       q3,  d23, d12
        vmlal.s16       q3,  d3,  d14
        vmlal.s16       q3,  d9,  d16
        vmlal.s16       q3,  d11, d18

        vdup.8          d12, d26[4]
        vext.8          q0,  q11, q12, #8
        vdup.8          d14, d26[5]
        vext.8          q1,  q11, q12, #10
        vdup.8          d16, d26[6]
        vext.8          q4,  q11, q12, #12
        vld1.8          {q11, q12}, [r12]
        vdup.8          d18, d26[7]
        vmovl.s8        q6,  d12
        vmovl.s8        q7,  d14
        vmovl.s8        q8,  d16
        vmovl.s8        q9,  d18

        vmlal.s16       q2,  d0,  d12
        vmlal.s16       q2,  d2,  d14
        vmlal.s16       q2,  d8,  d16
        vmlal.s16       q2,  d22, d18
        vmlal.s16       q3,  d1,  d12
        vmlal.s16       q3,  d3,  d14
        vmlal.s16       q3,  d9,  d16
        vmlal.s16       q3,  d23, d18

        vdup.8          d12, d27[0]
        vext.8          q0,  q11, q12, #2
        vdup.8          d14, d27[1]
        vext.8          q1,  q11, q12, #4
        vdup.8          d16, d27[2]
        vext.8          q4,  q11, q12, #6
        vdup.8          d18, d27[3]
        vext.8          q5,  q11, q12, #8
        vmovl.s8        q6,  d12
        vmovl.s8        q7,  d14
        vmovl.s8        q8,  d16
        vmovl.s8        q9,  d18

        sub             r12, r0,  #(1*GRAIN_WIDTH + 3)*2

        vmlal.s16       q2,  d0,  d12
        vmlal.s16       q2,  d2,  d14
        vmlal.s16       q2,  d8,  d16
        vmlal.s16       q2,  d10, d18
        vmlal.s16       q3,  d1,  d12
        vmlal.s16       q3,  d3,  d14
        vmlal.s16       q3,  d9,  d16
        vmlal.s16       q3,  d11, d18

        vdup.8          d12, d27[4]
        vext.8          q0,  q11, q12, #10
        vdup.8          d14, d27[5]
        vext.8          q1,  q11, q12, #12
        vld1.8          {q11, q12}, [r12]
        vdup.8          d16, d27[6]
        vdup.8          d18, d27[7]
        vmovl.s8        q6,  d12
        vmovl.s8        q7,  d14
        vext.8          q5,  q11, q12, #2
        vmovl.s8        q8,  d16
        vmovl.s8        q9,  d18

        vmlal.s16       q2,  d0,  d12
        vmlal.s16       q2,  d2,  d14
        vmlal.s16       q2,  d22, d16
        vmlal.s16       q2,  d10, d18
        vmlal.s16       q3,  d1,  d12
        vmlal.s16       q3,  d3,  d14
        vmlal.s16       q3,  d23, d16
        vmlal.s16       q3,  d11, d18

        vdup.8          d12, d28[0]
        vext.8          q0,  q11, q12, #4
        vdup.8          d14, d28[1]
        vext.8          q1,  q11, q12, #6
        vdup.8          d16, d28[2]
        vext.8          q4,  q11, q12, #8
        vdup.8          d18, d28[3]
        vext.8          q5,  q11, q12, #10
        vmovl.s8        q6,  d12
        vmovl.s8        q7,  d14
        vmovl.s8        q8,  d16
        vmovl.s8        q9,  d18

        vmlal.s16       q2,  d0,  d12
        vmlal.s16       q2,  d2,  d14
        vmlal.s16       q2,  d8,  d16
        vmlal.s16       q2,  d10, d18
        vmlal.s16       q3,  d1,  d12
        vmlal.s16       q3,  d3,  d14
        vmlal.s16       q3,  d9,  d16
        vmlal.s16       q3,  d11, d18

        vdup.8          d12, d28[4]
        vext.8          q0,  q11, q12, #12
        vmovl.s8        q6,  d12

        vmlal.s16       q2,  d0,  d12
        vmlal.s16       q3,  d1,  d12

        bx              lr
endfunc

.macro sum_lag3_func type, uv_layout, edge, elems=8
function sum_\type\()_lag3_\edge\()_neon
        push            {r1, lr}
        sum_lag_n_body  lag3, \type, \uv_layout, \edge, \elems, uv_coeff=d29[0]
endfunc
.endm
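// For lag 3 there are 21 above-row coefficients (ar_coeffs[0..20], three
// rows of seven, kept as bytes in d26-d27 and d28[0..4]) consumed above,
// plus three current-row coefficients (ar_coeffs[21..23], packed into a
// single GPR in the lag3 setup code) applied by output_lag3_neon. d29[0]
// holds the extra cross-plane luma coefficient for the chroma variants.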
sum_lag3_func y,      0,   left
sum_lag3_func y,      0,   mid
sum_lag3_func y,      0,   right, 7
sum_lag3_func uv_444, 444, left
sum_lag3_func uv_444, 444, mid
sum_lag3_func uv_444, 444, right, 7
sum_lag3_func uv_422, 422, left
sum_lag3_func uv_422, 422, mid
sum_lag3_func uv_422, 422, right, 1
sum_lag3_func uv_420, 420, left
sum_lag3_func uv_420, 420, mid
sum_lag3_func uv_420, 420, right, 1

function generate_grain_rows_neon
        push            {r10-r11,lr}
1:
        mov             r10, #80
2:
        bl              get_gaussian_neon
        vrshl.s16       q0,  q0,  q15
        subs            r10, r10, #8
        vst1.16         {q0}, [r0]!
        bgt             2b
        get_grain_2     d0
        subs            r1,  r1,  #1
        vst1.32         {d0[0]}, [r0]!
        bgt             1b
        pop             {r10-r11,pc}
endfunc

function generate_grain_rows_44_neon
        push            {r10-r11,lr}
1:
        mov             r10, #40
2:
        bl              get_gaussian_neon
        vrshl.s16       q0,  q0,  q15
        subs            r10, r10, #8
        vst1.16         {q0}, [r0]!
        bgt             2b
        get_grain_4     d0
        subs            r1,  r1,  #1
        vst1.16         {d0}, [r0]
        add             r0,  r0,  #GRAIN_WIDTH*2-80
        bgt             1b
        pop             {r10-r11,pc}
endfunc

function gen_grain_uv_444_lag0_neon
        vld1.16         {q3}, [r11]!
gen_grain_uv_lag0_8_start:
        push            {r11,lr}
        bl              get_gaussian_neon
        vrshl.s16       q0,  q0,  q15
gen_grain_uv_lag0_8_add:
        vand            q3,  q3,  q1
        vmull.s16       q2,  d6,  d22
        vmull.s16       q3,  d7,  d22
        vrshl.s32       q2,  q2,  q12
        vrshl.s32       q3,  q3,  q12
        vqmovn.s32      d4,  q2
        vqmovn.s32      d5,  q3
        vqadd.s16       q2,  q2,  q0
        vmin.s16        q2,  q2,  q9
        vmax.s16        q2,  q2,  q10
        vst1.16         {q2}, [r0]!
        pop             {r11,pc}
endfunc

function gen_grain_uv_420_lag0_8_neon
        add             r12, r11, #GRAIN_WIDTH*2
        vld1.16         {q2, q3},  [r11]!
        vld1.16         {q4, q5},  [r12]
        vpadd.i16       d4,  d4,  d5
        vpadd.i16       d5,  d6,  d7
        vpadd.i16       d8,  d8,  d9
        vpadd.i16       d9,  d10, d11
        vadd.i16        q2,  q2,  q4
        vrshr.s16       q3,  q2,  #2
        b               gen_grain_uv_lag0_8_start
endfunc

function gen_grain_uv_422_lag0_8_neon
        vld1.16         {q2, q3},  [r11]!
        vpadd.i16       d4,  d4,  d5
        vpadd.i16       d5,  d6,  d7
        vrshr.s16       q3,  q2,  #1
        b               gen_grain_uv_lag0_8_start
endfunc

function gen_grain_uv_420_lag0_4_neon
        add             r12, r11, #GRAIN_WIDTH*2
        vld1.16         {q2}, [r11]
        vld1.16         {q0}, [r12]
        add             r11, r11, #32
        vpadd.i16       d4,  d4,  d5
        vpadd.i16       d0,  d0,  d1
        vadd.i16        d4,  d4,  d0
        vrshr.s16       d6,  d4,  #2
        push            {r11,lr}
        get_grain_4     d0
        b               gen_grain_uv_lag0_8_add
endfunc

function gen_grain_uv_422_lag0_4_neon
        vld1.16         {q2}, [r11]
        add             r11, r11, #32
        vpadd.i16       d4,  d4,  d5
        vrshr.s16       d6,  d4,  #1
        push            {r11,lr}
        get_grain_4     d0
        b               gen_grain_uv_lag0_8_add
endfunc

.macro gen_grain_82 type
function generate_grain_\type\()_16bpc_neon, export=1
        push            {r4-r11,lr}

.ifc \type, uv_444
        ldr             r4,  [sp, #36]
        mov             r12, r3
        mov             lr,  #28
        add             r11, r1,  #3*GRAIN_WIDTH*2
        mov             r1,  r2
        mul             r12, r12, lr
        clz             lr,  r4
.else
        clz             lr,  r2
.endif
        movrel          r3,  X(gaussian_sequence)
        sub             lr,  lr,  #24 // -bitdepth_min_8
        ldr             r2,  [r1, #FGD_SEED]
        ldr             r9,  [r1, #FGD_GRAIN_SCALE_SHIFT]
.ifc \type, y
        add             r4,  r1,  #FGD_AR_COEFFS_Y
.else
        add             r4,  r1,  #FGD_AR_COEFFS_UV
.endif
        add             r9,  r9,  lr  // grain_scale_shift - bitdepth_min_8
        adr             r5,  L(gen_grain_\type\()_tbl)
        ldr             r6,  [r1, #FGD_AR_COEFF_LAG]
        add             r9,  r9,  #4
        ldr             r6,  [r5, r6, lsl #2]
        vdup.16         q15, r9       // 4 - bitdepth_min_8 + data->grain_scale_shift
        add             r5,  r5,  r6
        vneg.s16        q15, q15
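// For the chroma planes the RNG seed is the luma seed XORed with a
// plane-specific constant (0xb524 when uv == 0, 0x49d8 otherwise, as
// selected below); r12 additionally carries uv * 28, the byte offset of
// this plane's entries within ar_coeffs_uv.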
.ifc \type, uv_444
        push            {lr}
        cmp             r12, #0
        movw            r10, #0x49d8
        movw            lr,  #0xb524
        // Intentionally using a separate register instead of moveq with an
        // immediate constant, to avoid armv8 deprecated it instruction forms.
        it              eq
        moveq           r10, lr
        add             r4,  r4,  r12 // Add offset to ar_coeffs_uv[1]
        eor             r2,  r2,  r10
        pop             {lr}
.endif

        ldr             r7,  [r1, #FGD_AR_COEFF_SHIFT]
        neg             lr,  lr       // bitdepth_min_8
        mov             r8,  #1
        mov             r10, #1
        lsl             r8,  r8,  r7  // 1 << ar_coeff_shift
        lsl             r10, r10, r9  // 1 << (4 - bitdepth_min_8 + data->grain_scale_shift)
        lsr             r8,  r8,  #1  // 1 << (ar_coeff_shift - 1)
        lsr             r10, r10, #1  // 1 << (4 - bitdepth_min_8 + data->grain_scale_shift - 1)

        bx              r5

        .align 2
L(gen_grain_\type\()_tbl):
        .word L(generate_grain_\type\()_lag0) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
        .word L(generate_grain_\type\()_lag1) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
        .word L(generate_grain_\type\()_lag2) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
        .word L(generate_grain_\type\()_lag3) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB

L(generate_grain_\type\()_lag0):
.ifc \type, y
        mov             r1,  #GRAIN_HEIGHT
        bl              generate_grain_rows_neon
.else
        mov             r5,  #128
        lsl             r5,  r5,  lr  // 128 << bitdepth_min_8
        sub             r5,  r5,  #1  // (128 << bitdepth_min_8) - 1
        mvn             r6,  r5       // grain_min = ~grain_max

        mov             r1,  #3
        bl              generate_grain_rows_neon
        mov             r1,  #GRAIN_HEIGHT-3

        vdup.32         q12, r7
        vld1.8          {d22[]}, [r4] // ar_coeffs_uv[0]
        vmov.i8         q0,  #0
        vmov.i8         q1,  #255
        vdup.16         q9,  r5
        vdup.16         q10, r6
        vext.8          q13, q0,  q1,  #10
        vext.8          q14, q1,  q0,  #2
        vneg.s32        q12, q12
        vmovl.s8        q11, d22

1:
        vmov            q1,  q13
        bl              gen_grain_uv_444_lag0_neon // 8
        vmov.i8         q1,  #255
        bl              gen_grain_uv_444_lag0_neon // 16
        bl              gen_grain_uv_444_lag0_neon // 24
        bl              gen_grain_uv_444_lag0_neon // 32
        bl              gen_grain_uv_444_lag0_neon // 40
        bl              gen_grain_uv_444_lag0_neon // 48
        bl              gen_grain_uv_444_lag0_neon // 56
        bl              gen_grain_uv_444_lag0_neon // 64
        bl              gen_grain_uv_444_lag0_neon // 72
        vmov            q1,  q14
        bl              gen_grain_uv_444_lag0_neon // 80
        get_grain_2     d16
        subs            r1,  r1,  #1
        add             r11, r11, #4
        vst1.32         {d16[0]}, [r0]!
        bgt             1b
.endif
        pop             {r4-r11,pc}

L(generate_grain_\type\()_lag1):
        vpush           {q4-q7}
        mov             r5,  #128
        lsl             r5,  r5,  lr  // 128 << bitdepth_min_8
        sub             r5,  r5,  #1  // (128 << bitdepth_min_8) - 1
        vld1.8          {d27[]}, [r4]! // ar_coeffs_y[0]
        vld1.8          {d28[]}, [r4]! // ar_coeffs_y[1]
        vld1.8          {d29[]}, [r4]  // ar_coeffs_y[2]
.ifc \type, y
        ldrsb           r4,  [r4, #1]  // ar_coeffs_y[3]
.else
        add             r4,  r4,  #2
.endif

        mov             r1,  #3
.ifc \type, uv_444
        vld1.8          {d13[]}, [r4]  // ar_coeffs_uv[4]
        ldrsb           r4,  [r4, #-1] // ar_coeffs_uv[3]
.endif
        bl              generate_grain_rows_neon
        vmovl.s8        q13, d27
        vmovl.s8        q12, d29
        vmovl.s8        q14, d28
        vmov            d29, d24
.ifc \type, uv_444
        vmovl.s8        q6,  d13
.endif

        mov             r1,  #GRAIN_HEIGHT - 3
1:
        bl              sum_\type\()_lag1_left_neon  // 8
        bl              sum_\type\()_lag1_mid_neon   // 16
        bl              sum_\type\()_lag1_mid_neon   // 24
        bl              sum_\type\()_lag1_mid_neon   // 32
        bl              sum_\type\()_lag1_mid_neon   // 40
        bl              sum_\type\()_lag1_mid_neon   // 48
        bl              sum_\type\()_lag1_mid_neon   // 56
        bl              sum_\type\()_lag1_mid_neon   // 64
        bl              sum_\type\()_lag1_mid_neon   // 72
        bl              sum_\type\()_lag1_right_neon // 80
        get_grain_2     d16
        subs            r1,  r1,  #1
.ifc \type, uv_444
        add             r11, r11, #4
.endif
        vst1.32         {d16[0]}, [r0]!
        bgt             1b

        vpop            {q4-q7}
        pop             {r4-r11,pc}
L(generate_grain_\type\()_lag2):
        vpush           {q4-q7}
        mov             r5,  #128
        lsl             r5,  r5,  lr  // 128 << bitdepth_min_8
        sub             r5,  r5,  #1  // (128 << bitdepth_min_8) - 1
        vld1.8          {d28,d29}, [r4] // ar_coeffs_y[0-11], ar_coeffs_uv[0-12]

        vmov.s8         r4,  d29[2]
        vmov.s8         r10, d29[3]

        mov             r1,  #3
        bl              generate_grain_rows_neon

        mov             r1,  #GRAIN_HEIGHT - 3
1:
        bl              sum_\type\()_lag2_left_neon  // 8
        bl              sum_\type\()_lag2_mid_neon   // 16
        bl              sum_\type\()_lag2_mid_neon   // 24
        bl              sum_\type\()_lag2_mid_neon   // 32
        bl              sum_\type\()_lag2_mid_neon   // 40
        bl              sum_\type\()_lag2_mid_neon   // 48
        bl              sum_\type\()_lag2_mid_neon   // 56
        bl              sum_\type\()_lag2_mid_neon   // 64
        bl              sum_\type\()_lag2_mid_neon   // 72
        bl              sum_\type\()_lag2_right_neon // 80
        get_grain_2     d16
        subs            r1,  r1,  #1
.ifc \type, uv_444
        add             r11, r11, #4
.endif
        vst1.32         {d16[0]}, [r0]!
        bgt             1b

        vpop            {q4-q7}
        pop             {r4-r11,pc}

L(generate_grain_\type\()_lag3):
        vpush           {q4-q7}
        mov             r5,  #128
        lsl             r5,  r5,  lr  // 128 << bitdepth_min_8
        sub             r5,  r5,  #1  // (128 << bitdepth_min_8) - 1
        vld1.8          {q13, q14}, [r4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24]

        vmov.u8         r4,  d28[5]
        vmov.u8         r10, d28[6]
        vmov.u8         r12, d28[7]

        orr             r4,  r4,  r10, lsl #8
        orr             r4,  r4,  r12, lsl #16

        mov             r1,  #3
        vpush           {d26}
        bl              generate_grain_rows_neon
        vpop            {d26}

        mov             r1,  #GRAIN_HEIGHT - 3
1:
        bl              sum_\type\()_lag3_left_neon  // 8
        bl              sum_\type\()_lag3_mid_neon   // 16
        bl              sum_\type\()_lag3_mid_neon   // 24
        bl              sum_\type\()_lag3_mid_neon   // 32
        bl              sum_\type\()_lag3_mid_neon   // 40
        bl              sum_\type\()_lag3_mid_neon   // 48
        bl              sum_\type\()_lag3_mid_neon   // 56
        bl              sum_\type\()_lag3_mid_neon   // 64
        bl              sum_\type\()_lag3_mid_neon   // 72
        bl              sum_\type\()_lag3_right_neon // 80
        get_grain_2     d16
        subs            r1,  r1,  #1
.ifc \type, uv_444
        add             r11, r11, #4
.endif
        vst1.32         {d16[0]}, [r0]!
        bgt             1b

        vpop            {q4-q7}
        pop             {r4-r11,pc}
endfunc
.endm

gen_grain_82 y
gen_grain_82 uv_444

.macro set_height dst, type
.ifc \type, uv_420
        mov             \dst,  #SUB_GRAIN_HEIGHT-3
.else
        mov             \dst,  #GRAIN_HEIGHT-3
.endif
.endm

.macro increment_y_ptr reg, type
.ifc \type, uv_420
        add             \reg,  \reg,  #2*GRAIN_WIDTH*2-(6*32)
.else
        sub             \reg,  \reg,  #6*32-GRAIN_WIDTH*2
.endif
.endm
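// The 4:2:0/4:2:2 grain planes are only 44x38 entries (SUB_GRAIN_WIDTH x
// SUB_GRAIN_HEIGHT). Each chroma row consumes 6*32 bytes of luma grain while
// iterating, so increment_y_ptr advances the luma pointer by two full rows
// (4:2:0, vertically subsampled) or one row (4:2:2), minus what the row loop
// already consumed.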
.macro gen_grain_44 type
function generate_grain_\type\()_16bpc_neon, export=1
        push            {r4-r11,lr}

        ldr             r4,  [sp, #36]
        mov             r12, r3
        movw            r11, #(3*GRAIN_WIDTH-3)*2
        mov             lr,  #28
        add             r11, r1,  r11
        mov             r1,  r2
        mul             r12, r12, lr
        clz             lr,  r4

        movrel          r3,  X(gaussian_sequence)
        sub             lr,  lr,  #24 // -bitdepth_min_8
        ldr             r2,  [r1, #FGD_SEED]
        ldr             r9,  [r1, #FGD_GRAIN_SCALE_SHIFT]
        add             r4,  r1,  #FGD_AR_COEFFS_UV
        add             r9,  r9,  lr  // grain_scale_shift - bitdepth_min_8
        adr             r5,  L(gen_grain_\type\()_tbl)
        ldr             r6,  [r1, #FGD_AR_COEFF_LAG]
        add             r9,  r9,  #4
        ldr             r6,  [r5, r6, lsl #2]
        vdup.16         q15, r9       // 4 - bitdepth_min_8 + data->grain_scale_shift
        add             r5,  r5,  r6
        vneg.s16        q15, q15

        push            {lr}
        cmp             r12, #0
        movw            r10, #0x49d8
        movw            lr,  #0xb524
        // Intentionally using a separate register instead of moveq with an
        // immediate constant, to avoid armv8 deprecated it instruction forms.
        it              eq
        moveq           r10, lr
        add             r4,  r4,  r12 // Add offset to ar_coeffs_uv[1]
        eor             r2,  r2,  r10
        pop             {lr}

        ldr             r7,  [r1, #FGD_AR_COEFF_SHIFT]
        neg             lr,  lr       // bitdepth_min_8
        mov             r8,  #1
        mov             r10, #1
        lsl             r8,  r8,  r7  // 1 << ar_coeff_shift
        lsl             r10, r10, r9  // 1 << (4 - bitdepth_min_8 + data->grain_scale_shift)
        lsr             r8,  r8,  #1  // 1 << (ar_coeff_shift - 1)
        lsr             r10, r10, #1  // 1 << (4 - bitdepth_min_8 + data->grain_scale_shift - 1)
        bx              r5

        .align 2
L(gen_grain_\type\()_tbl):
        .word L(generate_grain_\type\()_lag0) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
        .word L(generate_grain_\type\()_lag1) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
        .word L(generate_grain_\type\()_lag2) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
        .word L(generate_grain_\type\()_lag3) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB

L(generate_grain_\type\()_lag0):
.ifc \type, uv_420
        vpush           {q4-q5}
.endif
        mov             r5,  #128
        lsl             r5,  r5,  lr  // 128 << bitdepth_min_8
        sub             r5,  r5,  #1  // (128 << bitdepth_min_8) - 1
        mvn             r6,  r5       // grain_min = ~grain_max

        mov             r1,  #3
        bl              generate_grain_rows_44_neon
        set_height      r1,  \type

        vdup.32         q12, r7
        vld1.8          {d22[]}, [r4] // ar_coeffs_uv[0]
        vmov.i8         q0,  #0
        vmov.i8         q1,  #255
        vdup.16         q9,  r5
        vdup.16         q10, r6
        vext.8          q13, q0,  q1,  #10
        vext.8          q14, q1,  q0,  #14
        vneg.s32        q12, q12
        vmovl.s8        q11, d22

1:
        vmov            q1,  q13
        bl              gen_grain_\type\()_lag0_8_neon // 8
        vmov.i8         q1,  #255
        bl              gen_grain_\type\()_lag0_8_neon // 16
        bl              gen_grain_\type\()_lag0_8_neon // 24
        bl              gen_grain_\type\()_lag0_8_neon // 32
        bl              gen_grain_\type\()_lag0_8_neon // 40
        vmov            q1,  q14
        bl              gen_grain_\type\()_lag0_4_neon // 44
        subs            r1,  r1,  #1
        increment_y_ptr r11, \type
        add             r0,  r0,  #GRAIN_WIDTH*2-6*16
        bgt             1b

.ifc \type, uv_420
        vpop            {q4-q5}
.endif
        pop             {r4-r11,pc}
L(generate_grain_\type\()_lag1):
        vpush           {q4-q7}
        mov             r5,  #128
        lsl             r5,  r5,  lr  // 128 << bitdepth_min_8
        sub             r5,  r5,  #1  // (128 << bitdepth_min_8) - 1
        vld1.8          {d27[]}, [r4]! // ar_coeffs_uv[0]
        vld1.8          {d28[]}, [r4]! // ar_coeffs_uv[1]
        vld1.8          {d29[]}, [r4]  // ar_coeffs_uv[2]
        add             r4,  r4,  #2

        mov             r1,  #3
        vld1.8          {d13[]}, [r4]  // ar_coeffs_uv[4]
        ldrsb           r4,  [r4, #-1] // ar_coeffs_uv[3]
        bl              generate_grain_rows_44_neon
        vmovl.s8        q13, d27
        vmovl.s8        q12, d29
        vmovl.s8        q14, d28
        vmov            d29, d24
        vmovl.s8        q6,  d13

        set_height      r1,  \type
1:
        bl              sum_\type\()_lag1_left_neon  // 8
        bl              sum_\type\()_lag1_mid_neon   // 16
        bl              sum_\type\()_lag1_mid_neon   // 24
        bl              sum_\type\()_lag1_mid_neon   // 32
        bl              sum_\type\()_lag1_mid_neon   // 40
        bl              sum_\type\()_lag1_right_neon // 44
        subs            r1,  r1,  #1
        increment_y_ptr r11, \type
        add             r0,  r0,  #GRAIN_WIDTH*2-6*16
        bgt             1b

        vpop            {q4-q7}
        pop             {r4-r11,pc}

L(generate_grain_\type\()_lag2):
        vpush           {q4-q7}
        mov             r5,  #128
        lsl             r5,  r5,  lr  // 128 << bitdepth_min_8
        sub             r5,  r5,  #1  // (128 << bitdepth_min_8) - 1
        vld1.8          {d28,d29}, [r4] // ar_coeffs_uv[0-12]

        vmov.s8         r4,  d29[2]
        vmov.s8         r10, d29[3]

        mov             r1,  #3
        bl              generate_grain_rows_44_neon

        set_height      r1,  \type
1:
        bl              sum_\type\()_lag2_left_neon  // 8
        bl              sum_\type\()_lag2_mid_neon   // 16
        bl              sum_\type\()_lag2_mid_neon   // 24
        bl              sum_\type\()_lag2_mid_neon   // 32
        bl              sum_\type\()_lag2_mid_neon   // 40
        bl              sum_\type\()_lag2_right_neon // 44
        subs            r1,  r1,  #1
        increment_y_ptr r11, \type
        add             r0,  r0,  #GRAIN_WIDTH*2-6*16
        bgt             1b

        vpop            {q4-q7}
        pop             {r4-r11,pc}

L(generate_grain_\type\()_lag3):
        vpush           {q4-q7}
        mov             r5,  #128
        lsl             r5,  r5,  lr  // 128 << bitdepth_min_8
        sub             r5,  r5,  #1  // (128 << bitdepth_min_8) - 1
        vld1.8          {q13, q14}, [r4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24]

        vmov.u8         r4,  d28[5]
        vmov.u8         r10, d28[6]
        vmov.u8         r12, d28[7]

        orr             r4,  r4,  r10, lsl #8
        orr             r4,  r4,  r12, lsl #16

        mov             r1,  #3
        bl              generate_grain_rows_44_neon

        set_height      r1,  \type
1:
        bl              sum_\type\()_lag3_left_neon  // 8
        bl              sum_\type\()_lag3_mid_neon   // 16
        bl              sum_\type\()_lag3_mid_neon   // 24
        bl              sum_\type\()_lag3_mid_neon   // 32
        bl              sum_\type\()_lag3_mid_neon   // 40
        bl              sum_\type\()_lag3_right_neon // 44
        subs            r1,  r1,  #1
        increment_y_ptr r11, \type
        add             r0,  r0,  #GRAIN_WIDTH*2-6*16
        bgt             1b

        vpop            {q4-q7}
        pop             {r4-r11,pc}
endfunc
.endm

gen_grain_44 uv_420
gen_grain_44 uv_422

.macro gather_interleaved dst1, dst2, src1, src2, src3, src4, off
        vmov.u16        r11, \src1[0+\off]
        vmov.u16        r12, \src3[0+\off]
        add             r11, r11, r3
        vmov.u16        lr,  \src1[2+\off]
        add             r12, r12, r3
        vld1.8          {\dst1[0+\off]}, [r11]
        vmov.u16        r11, \src3[2+\off]
        add             lr,  lr,  r3
        vld1.8          {\dst2[0+\off]}, [r12]
        vmov.u16        r12, \src2[0+\off]
        add             r11, r11, r3
        vld1.8          {\dst1[2+\off]}, [lr]
        vmov.u16        lr,  \src4[0+\off]
        add             r12, r12, r3
        vld1.8          {\dst2[2+\off]}, [r11]
        vmov.u16        r11, \src2[2+\off]
        add             lr,  lr,  r3
        vld1.8          {\dst1[4+\off]}, [r12]
        vmov.u16        r12, \src4[2+\off]
        add             r11, r11, r3
        vld1.8          {\dst2[4+\off]}, [lr]
        add             r12, r12, r3
        vld1.8          {\dst1[6+\off]}, [r11]
        vld1.8          {\dst2[6+\off]}, [r12]
.endm
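// NEON has no 16 bit indexed table lookup, so the scaling LUT gather goes
// through general purpose registers: effectively dst[i] = scaling[src[i]]
// for every lane, with index moves and loads interleaved to hide latency.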
.macro gather dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, src7, src8
        gather_interleaved \dst1, \dst3, \src1, \src2, \src5, \src6, 0
        gather_interleaved \dst1, \dst3, \src1, \src2, \src5, \src6, 1
        gather_interleaved \dst2, \dst4, \src3, \src4, \src7, \src8, 0
        gather_interleaved \dst2, \dst4, \src3, \src4, \src7, \src8, 1
.endm

function gather32_neon
        push            {r11-r12,lr}
        gather          d8,  d9,  d10, d11, d0,  d1,  d2,  d3,  d4,  d5,  d6,  d7
        pop             {r11-r12,pc}
endfunc

function gather16_neon
        push            {r11-r12,lr}
        gather_interleaved d8,  d9,  d0,  d1,  d2,  d3,  0
        gather_interleaved d8,  d9,  d0,  d1,  d2,  d3,  1
        pop             {r11-r12,pc}
endfunc

const overlap_coeffs_0, align=4
        .short 27, 17, 0,  0
        .short 17, 27, 32, 32
endconst

const overlap_coeffs_1, align=4
        .short 23, 0,  0,  0
        .short 22, 32, 32, 32
endconst

.macro calc_offset offx, offy, src, sx, sy
        and             \offy, \src,  #0xF     // randval & 0xF
        lsr             \offx, \src,  #4       // randval >> 4
.if \sy == 0
        add             \offy, \offy, \offy    // 2 * (randval & 0xF)
.endif
.if \sx == 0
        add             \offx, \offx, \offx    // 2 * (randval >> 4)
.endif
.endm

.macro add_offset dst, offx, offy, src, stride
        mla             \dst,  \stride, \offy, \src // grain_lut += grain_stride * offy
        add             \dst,  \dst,  \offx,  lsl #1 // grain_lut += offx
.endm
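// Each 32x32 block picks its grain window from an 8 bit random value:
// offx = randval >> 4 and offy = randval & 0xF, each doubled when the
// corresponding dimension is not subsampled. add_offset then turns that
// into a byte address: grain_lut + offy * grain_stride + offx * 2.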
// void dav1d_fgy_32x32_16bpc_neon(pixel *const dst, const pixel *const src,
//                                 const ptrdiff_t stride,
//                                 const uint8_t scaling[SCALING_SIZE],
//                                 const int scaling_shift,
//                                 const entry grain_lut[][GRAIN_WIDTH],
//                                 const int offsets[][2],
//                                 const int h, const ptrdiff_t clip,
//                                 const ptrdiff_t type,
//                                 const int bitdepth_max);
function fgy_32x32_16bpc_neon, export=1
        push            {r4-r11,lr}
        vpush           {q4-q7}
        ldrd            r4,  r5,  [sp, #100]   // scaling_shift, grain_lut
        ldrd            r6,  r7,  [sp, #108]   // offsets, h
        ldr             r8,  [sp, #116]        // clip
        mov             r9,  #GRAIN_WIDTH*2    // grain_lut stride
        ldr             r10, [sp, #124]        // bitdepth_max

        eor             r4,  r4,  #15          // 15 - scaling_shift
        vdup.16         q6,  r10               // bitdepth_max
        clz             r10, r10
        vdup.16         q13, r4                // 15 - scaling_shift
        rsb             r10, r10, #24          // bitdepth_min_8
        cmp             r8,  #0
        vdup.16         q12, r10               // bitdepth_min_8

        movrel_local    r12, overlap_coeffs_0

        beq             1f
        // clip
        vmov.i16        q14, #16
        vmov.i16        q15, #235
        vshl.s16        q14, q14, q12
        vshl.s16        q15, q15, q12
        b               2f
1:
        // no clip
        vmov.i16        q14, #0
        vmov            q15, q6
2:
        vshr.u16        q6,  q6,  #1           // grain_max

        vld1.16         {d24, d25}, [r12, :128] // overlap_coeffs

        add             r5,  r5,  #18          // grain_lut += 9
        add             r5,  r5,  r9,  lsl #3  // grain_lut += 8 * grain_stride
        add             r5,  r5,  r9           // grain_lut += grain_stride

        ldr             r10, [r6, #8]          // offsets[1][0]
        calc_offset     r10, r4,  r10, 0,  0
        add_offset      r4,  r10, r4,  r5,  r9
        ldr             r10, [r6, #4]          // offsets[0][1]
        calc_offset     r10, r11, r10, 0,  0
        add_offset      r11, r10, r11, r5,  r9
        ldr             r10, [r6, #12]         // offsets[1][1]
        calc_offset     r10, r8,  r10, 0,  0
        add_offset      r8,  r10, r8,  r5,  r9
        ldr             r6,  [r6]              // offsets[0][0]
        calc_offset     r6,  lr,  r6,  0,  0
        add_offset      r5,  r6,  lr,  r5,  r9

        add             r4,  r4,  #32*2        // grain_lut += FG_BLOCK_SIZE * bx
        add             r6,  r11, r9,  lsl #5  // grain_lut += grain_stride * FG_BLOCK_SIZE * by

        ldr             r10, [sp, #120]        // type
        adr             r11, L(fgy_loop_tbl)

        tst             r10, #1
        ldr             r10, [r11, r10, lsl #2]

        add             r8,  r8,  r9,  lsl #5  // grain_lut += grain_stride * FG_BLOCK_SIZE * by
        add             r8,  r8,  #32*2        // grain_lut += FG_BLOCK_SIZE * bx

        add             r11, r11, r10

        beq             1f
        // y overlap
        vdup.16         d14, d24[0]
        vdup.16         d15, d24[1]
        mov             r10, r7                // backup actual h
        mov             r7,  #2
1:
        sub             r2,  r2,  #32          // src_stride -= 32
        sub             r9,  r9,  #32          // grain_stride -= 32
        bx              r11
endfunc

function fgy_loop_neon
L(fgy_loop_tbl):
        .word L(loop_00) - L(fgy_loop_tbl) + CONFIG_THUMB
        .word L(loop_01) - L(fgy_loop_tbl) + CONFIG_THUMB
        .word L(loop_10) - L(fgy_loop_tbl) + CONFIG_THUMB
        .word L(loop_11) - L(fgy_loop_tbl) + CONFIG_THUMB

.macro fgy ox, oy
L(loop_\ox\oy):
1:
.if \ox
        vld1.16         {d0},  [r4],  r9       // grain_lut old
.endif
.if \oy
        vld1.16         {q2, q3},  [r6]!       // grain_lut top
.endif
.if \ox && \oy
        vld1.16         {d2},  [r8],  r9       // grain_lut top old
.endif
.if \oy
        vld1.16         {q4, q5},  [r6],  r9   // grain_lut top
.endif
.if !\ox && !\oy
        vld1.16         {q0, q1},  [r1, :128]! // src
.endif
        vld1.16         {q8, q9},  [r5]!       // grain_lut
.if !\ox && !\oy
        vld1.16         {q2, q3},  [r1, :128], r2 // src
.endif
.if !\oy
        vmvn.i16        q5,  #0xf000           // 0x0fff
.endif
        vld1.16         {q10, q11}, [r5],  r9  // grain_lut

.if \ox
        add             r4,  r4,  #32
        vmull.s16       q0,  d0,  d24
        vmlal.s16       q0,  d16, d25
.endif

.if \oy
.if \ox
        add             r8,  r8,  #32
        vmull.s16       q1,  d2,  d24
        vmlal.s16       q1,  d4,  d25
        vqrshrn.s32     d16, q0,  #5
        vmvn            d0,  d12               // grain_min
        vqrshrn.s32     d4,  q1,  #5
        vmin.s16        d16, d16, d12
        vmin.s16        d4,  d4,  d12
        vmax.s16        d16, d16, d0
        vmax.s16        d4,  d4,  d0
.endif

        vmull.s16       q0,  d4,  d14
        vmull.s16       q1,  d5,  d14
        vmull.s16       q2,  d6,  d14
        vmull.s16       q3,  d7,  d14
        vmlal.s16       q0,  d16, d15
        vmlal.s16       q1,  d17, d15
        vmlal.s16       q2,  d18, d15
        vmlal.s16       q3,  d19, d15
        vmull.s16       q8,  d20, d15
        vmull.s16       q9,  d21, d15
        vmull.s16       q10, d22, d15
        vmull.s16       q11, d23, d15
        vmlal.s16       q8,  d8,  d14
        vmlal.s16       q9,  d9,  d14
        vmlal.s16       q10, d10, d14
        vmlal.s16       q11, d11, d14
        vmvn            q4,  q6                // grain_min
        vqrshrn.s32     d0,  q0,  #5
        vqrshrn.s32     d1,  q1,  #5
        vqrshrn.s32     d2,  q2,  #5
        vqrshrn.s32     d3,  q3,  #5
        vqrshrn.s32     d4,  q8,  #5
        vqrshrn.s32     d5,  q9,  #5
        vqrshrn.s32     d6,  q10, #5
        vqrshrn.s32     d7,  q11, #5
        vmin.s16        q8,  q0,  q6
        vmin.s16        q9,  q1,  q6
        vld1.16         {q0, q1},  [r1, :128]! // src
        vmin.s16        q10, q2,  q6
        vmin.s16        q11, q3,  q6
        vmax.s16        q8,  q8,  q4
        vmax.s16        q9,  q9,  q4
        vld1.16         {q2, q3},  [r1, :128], r2 // src
        vmvn.i16        q5,  #0xf000           // 0x0fff
        vmax.s16        q10, q10, q4
        vmax.s16        q11, q11, q4
.elseif \ox
        vmvn            d4,  d12               // grain_min
        vqrshrn.s32     d16, q0,  #5
        vld1.16         {q0, q1},  [r1, :128]! // src
        vmin.s16        d16, d16, d12
        vmax.s16        d16, d16, d4
        vld1.16         {q2, q3},  [r1, :128], r2 // src
.endif

        // Make sure that uninitialized pixels out of range past the right
        // edge are in range; their actual values shouldn't matter.
        vand            q0,  q0,  q5
        vand            q1,  q1,  q5
        vand            q2,  q2,  q5
        vand            q3,  q3,  q5

        bl              gather32_neon
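// At this point d8-d11 hold the gathered scaling bytes and q8-q11 the
// (possibly overlap-blended) grain. The block below computes, per pixel,
// roughly: noise = round2(scaling[src] * grain, data->scaling_shift);
// dst = iclip(src + noise, clip_min, clip_max) - with the shift folded
// into vqrdmulh via scaling << (15 - scaling_shift).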
.if \ox || \oy
        vpush           {q6-q7}
.endif

        vmovl.u8        q6,  d8   // scaling
        vmovl.u8        q7,  d9
        vmovl.u8        q4,  d10
        vmovl.u8        q5,  d11

        vshl.u16        q6,  q6,  q13 // scaling << (15 - scaling_shift)
        vshl.u16        q7,  q7,  q13
        vshl.u16        q4,  q4,  q13
        vshl.u16        q5,  q5,  q13

        vqrdmulh.s16    q8,  q8,  q6  // round2((scaling << (15 - scaling_shift)) * grain, 15)
        vqrdmulh.s16    q9,  q9,  q7
        vqrdmulh.s16    q10, q10, q4
        vqrdmulh.s16    q11, q11, q5

.if \ox || \oy
        vpop            {q6-q7}
.endif

        vqadd.s16       q0,  q0,  q8  // *src + noise
        vqadd.s16       q1,  q1,  q9
        vqadd.s16       q2,  q2,  q10
        vqadd.s16       q3,  q3,  q11

        vmax.s16        q0,  q0,  q14
        vmax.s16        q1,  q1,  q14
        vmax.s16        q2,  q2,  q14
        vmax.s16        q3,  q3,  q14
        vmin.s16        q0,  q0,  q15
        vmin.s16        q1,  q1,  q15
        vmin.s16        q2,  q2,  q15
        vmin.s16        q3,  q3,  q15

        vst1.16         {q0, q1},  [r0, :128]! // dst
        subs            r7,  r7,  #1
.if \oy
        vdup.16         d14, d25[0]
        vdup.16         d15, d25[1]
.endif
        vst1.16         {q2, q3},  [r0, :128], r2 // dst
        bgt             1b

.if \oy
        cmp             r10, #2
        sub             r7,  r10, #2           // restore actual remaining h
        bgt             L(loop_\ox\()0)
.endif
        vpop            {q4-q7}
        pop             {r4-r11,pc}
.endm

        fgy             0, 0
        fgy             0, 1
        fgy             1, 0
        fgy             1, 1
endfunc

// void dav1d_fguv_32x32_420_16bpc_neon(pixel *const dst,
//                                      const pixel *const src,
//                                      const ptrdiff_t stride,
//                                      const uint8_t scaling[SCALING_SIZE],
//                                      const Dav1dFilmGrainData *const data,
//                                      const entry grain_lut[][GRAIN_WIDTH],
//                                      const pixel *const luma_row,
//                                      const ptrdiff_t luma_stride,
//                                      const int offsets[][2],
//                                      const ptrdiff_t h, const ptrdiff_t uv,
//                                      const ptrdiff_t is_id,
//                                      const ptrdiff_t type,
//                                      const int bitdepth_max);
.macro fguv layout, sx, sy
function fguv_32x32_\layout\()_16bpc_neon, export=1
        push            {r4-r11,lr}
        vpush           {q4-q7}
        ldrd            r4,  r5,  [sp, #100]   // data, grain_lut
        ldrd            r10, r11, [sp, #124]   // uv, is_id
        ldr             r6,  [sp, #136]        // bitdepth_max

        clz             r7,  r6
        rsb             r7,  r7,  #24          // bitdepth_min_8

        // !csfl
        add             r10, r4,  r10, lsl #2  // + 4*uv
        add             r12, r10, #FGD_UV_LUMA_MULT
        add             lr,  r10, #FGD_UV_MULT
        ldrh            r10, [r10, #FGD_UV_OFFSET] // uv_offset
        vld1.16         {d30[]},  [r12]        // uv_luma_mult
        lsl             r10, r10, r7           // uv_offset << bitdepth_min_8
        vld1.16         {d30[1]}, [lr]         // uv_mult

        ldr             lr,  [r4, #FGD_SCALING_SHIFT]
        ldr             r12, [r4, #FGD_CLIP_TO_RESTRICTED_RANGE]
        eor             lr,  lr,  #15          // 15 - scaling_shift

        vmov.16         d30[2], r10            // uv_offset << bitdepth_min_8

        cmp             r12, #0
        vdup.16         q13, lr                // 15 - scaling_shift

        beq             1f
        // clip
        cmp             r11, #0
        mov             r8,  #16
        mov             r9,  #240
        lsl             r8,  r8,  r7
        lsl             r9,  r9,  r7
        beq             2f
        // is_id
        mov             r9,  #235
        lsl             r9,  r9,  r7
        b               2f
1:
        // no clip
        mov             r8,  #0
        mov             r9,  r6                // bitdepth_max
2:
        vmov.16         d30[3], r6             // bitdepth_max
        vdup.16         d31, r8                // clip_min

        mov             r10, #GRAIN_WIDTH*2    // grain_lut stride

.if \sy
        mov             r6,  #23
        mov             r7,  #22
.else
        mov             r6,  #27
        mov             r7,  #17
.endif
        vmov.16         d31[1], r9             // clip_max

        ldrd            r8,  r9,  [sp, #116]   // offsets, h

        add             r5,  r5,  #(2*(3 + (2 >> \sx)*3)) // grain_lut += 9 or 6
.if \sy
        add             r5,  r5,  r10, lsl #2  // grain_lut += 4 * grain_stride
        add             r5,  r5,  r10, lsl #1  // grain_lut += 2 * grain_stride
.else
        add             r5,  r5,  r10, lsl #3  // grain_lut += 8 * grain_stride
        add             r5,  r5,  r10          // grain_lut += grain_stride
.endif
        vmov.16         d31[2], r6             // overlap y [0]

        ldr             r12, [r8, #8]          // offsets[1][0]
        calc_offset     r12, r4,  r12, \sx, \sy
        add_offset      r4,  r12, r4,  r5,  r10

        ldr             r12, [r8, #4]          // offsets[0][1]
        calc_offset     r12, lr,  r12, \sx, \sy
        add_offset      lr,  r12, lr,  r5,  r10

        ldr             r12, [r8, #12]         // offsets[1][1]
        calc_offset     r12, r11, r12, \sx, \sy
        add_offset      r11, r12, r11, r5,  r10

        ldr             r8,  [r8]              // offsets[0][0]
        calc_offset     r8,  r12, r8,  \sx, \sy
        add_offset      r5,  r8,  r12, r5,  r10

        vmov.16         d31[3], r7             // overlap y [1]

        add             r4,  r4,  #2*(32 >> \sx)      // grain_lut += FG_BLOCK_SIZE * bx
        add             r8,  lr,  r10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
        add             r11, r11, r10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
        add             r11, r11, #2*(32 >> \sx)      // grain_lut += FG_BLOCK_SIZE * bx

        movrel_local    r12, overlap_coeffs_\sx
        ldr             lr,  [sp, #132]        // type
        ldrd            r6,  r7,  [sp, #108]   // luma_row, luma_stride

        vld1.16         {d24, d25}, [r12, :128] // overlap_coeffs

        movrel_local    r12, L(fguv_loop_sx\sx\()_tbl)
#if CONFIG_THUMB
        // This uses movrel_local instead of adr above, because the target
        // can be out of range for adr. But movrel_local leaves the thumb bit
        // set on COFF (but probably wouldn't if building for thumb on ELF),
        // thus try to clear the bit for robustness.
        bic             r12, r12, #1
#endif

        tst             lr,  #1
        ldr             lr,  [r12, lr, lsl #2]

        add             r12, r12, lr

        beq             1f
        // y overlap
        sub             lr,  r9,  #(2 >> \sy)  // backup remaining h
        mov             r9,  #(2 >> \sy)

1:
.if \sy
        add             r7,  r7,  r7           // luma_stride *= 2
.endif
        sub             r7,  r7,  #32          // luma_stride -= 32

        bx              r12
endfunc
.endm

fguv 420, 1, 1
fguv 422, 1, 0
fguv 444, 0, 0
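// The csfl variants below do chroma scaling from luma: the scaling LUT is
// indexed with the (downsampled) luma value directly. The !csfl variants
// instead index it with roughly
//   val = iclip_pixel(((luma * uv_luma_mult + src * uv_mult) >> 6)
//                     + (uv_offset << bitdepth_min_8))
// using the multipliers loaded into d30 above (a sketch of the reference
// behaviour, not the exact dav1d C source).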
function fguv_loop_sx0_neon
L(fguv_loop_sx0_tbl):
        .word L(fguv_loop_sx0_csfl0_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx0_csfl0_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx0_csfl0_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx0_csfl0_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx0_csfl1_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx0_csfl1_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx0_csfl1_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx0_csfl1_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB

.macro fguv_loop_sx0 csfl, ox, oy
L(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
        sub             r2,  r2,  #32          // src_stride -= 32
        sub             r10, r10, #32          // grain_stride -= 32
.if \oy
        mov             r12, lr
.endif
L(fguv_loop_sx0_csfl\csfl\()_\ox\oy\()_loopstart):
1:
.if \ox
        vld1.16         {d0},  [r4],  r10      // grain_lut old
.endif
.if \oy
        vld1.16         {q2, q3},  [r8]!       // grain_lut top
.endif
.if \ox && \oy
        vld1.16         {d2},  [r11], r10      // grain_lut top old
.endif
.if !\ox && !\oy
        vld1.16         {q0, q1},  [r6, :128]! // luma
.endif
        vld1.16         {q8, q9},  [r5]!       // grain_lut
.if \oy
        vld1.16         {q4, q5},  [r8],  r10  // grain_lut top
.endif
.if !\ox && !\oy
        vld1.16         {q2, q3},  [r6, :128], r7 // luma
.endif
.if \oy
        vdup.16         d28, d31[2]            // overlap y coeff
        vdup.16         d29, d31[3]            // overlap y coeff
.endif
        vld1.16         {q10, q11}, [r5],  r10 // grain_lut

.if \ox
        vdup.16         q7,  d30[3]            // bitdepth_max
        add             r4,  r4,  #32
        vmull.s16       q0,  d0,  d24
        vshr.u16        q7,  q7,  #1           // grain_max
        vmlal.s16       q0,  d16, d25
        vmvn            q6,  q7                // grain_min
.endif
.if \oy
.if \ox
        add             r11, r11, #32
        vmull.s16       q1,  d2,  d24
        vmlal.s16       q1,  d4,  d25
        vqrshrn.s32     d16, q0,  #5
        vqrshrn.s32     d4,  q1,  #5
        vmin.s16        d4,  d4,  d14
        vmin.s16        d16, d16, d14
        vmax.s16        d4,  d4,  d12
        vmax.s16        d16, d16, d12
.endif

        vmull.s16       q0,  d4,  d28
        vmull.s16       q1,  d5,  d28
        vmull.s16       q2,  d6,  d28
        vmull.s16       q3,  d7,  d28
.if !\ox
        vdup.16         q7,  d30[3]            // bitdepth_max
.endif
        vmlal.s16       q0,  d16, d29
        vmlal.s16       q1,  d17, d29
        vmlal.s16       q2,  d18, d29
        vmlal.s16       q3,  d19, d29
.if !\ox
        vshr.u16        q7,  q7,  #1           // grain_max
.endif
        vmull.s16       q8,  d20, d29
        vmull.s16       q9,  d21, d29
        vmull.s16       q10, d22, d29
        vmull.s16       q11, d23, d29
.if !\ox
        vmvn            q6,  q7                // grain_min
.endif
        vmlal.s16       q8,  d8,  d28
        vmlal.s16       q9,  d9,  d28
        vmlal.s16       q10, d10, d28
        vmlal.s16       q11, d11, d28
        vqrshrn.s32     d0,  q0,  #5
        vqrshrn.s32     d1,  q1,  #5
        vqrshrn.s32     d2,  q2,  #5
        vqrshrn.s32     d3,  q3,  #5
        vqrshrn.s32     d4,  q8,  #5
        vqrshrn.s32     d5,  q9,  #5
        vqrshrn.s32     d6,  q10, #5
        vqrshrn.s32     d7,  q11, #5
        vmin.s16        q8,  q0,  q7
        vmin.s16        q9,  q1,  q7
        vld1.16         {q0, q1},  [r6, :128]! // luma
        vmin.s16        q10, q2,  q7
        vmin.s16        q11, q3,  q7
        vmax.s16        q8,  q8,  q6
        vmax.s16        q9,  q9,  q6
        vld1.16         {q2, q3},  [r6, :128], r7 // luma
        vmax.s16        q10, q10, q6
        vmax.s16        q11, q11, q6
.elseif \ox
        vqrshrn.s32     d16, q0,  #5
        vld1.16         {q0, q1},  [r6, :128]! // luma
        vmin.s16        d16, d16, d14
        vld1.16         {q2, q3},  [r6, :128], r7 // luma
        vmax.s16        d16, d16, d12
.endif
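// The edge overlap blending above follows the usual film grain formula,
// roughly: grain = iclip(round2(old * w0 + grain * w1, 5),
// grain_min, grain_max), with (w0, w1) pairs like (27, 17)/(17, 27), or
// (23, 22) when the dimension is subsampled (see overlap_coeffs_0/1).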
.if !\csfl
        vdup.16         d28, d30[0]            // uv_luma_mult
        vld1.16         {q4, q5},  [r1, :128]! // src
        vdup.16         d29, d30[1]            // uv_mult
        vmull.s16       q6,  d0,  d28
        vmull.s16       q7,  d1,  d28
        vmull.s16       q0,  d2,  d28
        vmull.s16       q1,  d3,  d28
        vmlal.s16       q6,  d8,  d29
        vmlal.s16       q7,  d9,  d29
        vmlal.s16       q0,  d10, d29
        vmlal.s16       q1,  d11, d29
        vld1.16         {q4, q5},  [r1, :128]  // src
        sub             r1,  r1,  #32
        vshrn.s32       d12, q6,  #6
        vshrn.s32       d13, q7,  #6
        vshrn.s32       d14, q0,  #6
        vshrn.s32       d15, q1,  #6
        vmull.s16       q0,  d4,  d28
        vmull.s16       q1,  d5,  d28
        vmull.s16       q2,  d6,  d28
        vmull.s16       q3,  d7,  d28
        vmlal.s16       q0,  d8,  d29
        vmlal.s16       q1,  d9,  d29
        vmlal.s16       q2,  d10, d29
        vmlal.s16       q3,  d11, d29
        vdup.16         q14, d30[2]            // uv_offset
        vshrn.s32       d0,  q0,  #6
        vshrn.s32       d1,  q1,  #6
        vshrn.s32       d2,  q2,  #6
        vshrn.s32       d3,  q3,  #6
        vdup.16         q4,  d30[3]            // bitdepth_max
        vmov.i16        q5,  #0
        vadd.i16        q6,  q6,  q14
        vadd.i16        q7,  q7,  q14
        vadd.i16        q2,  q0,  q14
        vadd.i16        q3,  q1,  q14
        vmin.s16        q0,  q6,  q4
        vmin.s16        q1,  q7,  q4
        vmin.s16        q2,  q2,  q4
        vmin.s16        q3,  q3,  q4
        vmax.s16        q0,  q0,  q5
        vmax.s16        q1,  q1,  q5
        vmax.s16        q2,  q2,  q5
        vmax.s16        q3,  q3,  q5
.else
        vdup.16         q14, d30[3]            // bitdepth_max
        // Make sure that uninitialized pixels out of range past the right
        // edge are in range; their actual values shouldn't matter.
        vand            q0,  q0,  q14
        vand            q1,  q1,  q14
        vand            q2,  q2,  q14
        vand            q3,  q3,  q14
.endif

        bl              gather32_neon

        vld1.16         {q0, q1},  [r1, :128]! // src

        vmovl.u8        q6,  d8   // scaling
        vmovl.u8        q7,  d9
        vmovl.u8        q4,  d10
        vmovl.u8        q5,  d11

        vld1.16         {q2, q3},  [r1, :128], r2 // src

        vshl.u16        q6,  q6,  q13 // scaling << (15 - scaling_shift)
        vshl.u16        q7,  q7,  q13
        vshl.u16        q4,  q4,  q13
        vshl.u16        q5,  q5,  q13

        vqrdmulh.s16    q8,  q8,  q6  // round2((scaling << (15 - scaling_shift)) * grain, 15)
        vqrdmulh.s16    q9,  q9,  q7
        vqrdmulh.s16    q10, q10, q4
        vqrdmulh.s16    q11, q11, q5


        vdup.16         q4,  d31[0]            // clip_min
        vdup.16         q5,  d31[1]            // clip_max

        vqadd.s16       q0,  q0,  q8  // *src + noise
        vqadd.s16       q1,  q1,  q9
        vqadd.s16       q2,  q2,  q10
        vqadd.s16       q3,  q3,  q11

.if \oy
        vmov.32         lr,  d25[0]   // first 2 16 bit coeffs from overlap x
.endif

        vmax.s16        q0,  q0,  q4
        vmax.s16        q1,  q1,  q4
        vmax.s16        q2,  q2,  q4
        vmax.s16        q3,  q3,  q4
        vmin.s16        q0,  q0,  q5
        vmin.s16        q1,  q1,  q5
        vmin.s16        q2,  q2,  q5
        vmin.s16        q3,  q3,  q5
        vst1.16         {q0, q1},  [r0, :128]! // dst

        subs            r9,  r9,  #1
.if \oy
        vmov.32         d31[1], lr  // new coeffs for overlap y
.endif

        vst1.16         {q2, q3},  [r0, :128], r2 // dst
        bgt             1b

.if \oy
        cmp             r12, #0
        mov             r9,  r12    // restore actual remaining h
        bgt             L(fguv_loop_sx0_csfl\csfl\()_\ox\()0_loopstart)
.endif
        b               9f
.endm
        fguv_loop_sx0   0, 0, 0
        fguv_loop_sx0   0, 0, 1
        fguv_loop_sx0   0, 1, 0
        fguv_loop_sx0   0, 1, 1
        fguv_loop_sx0   1, 0, 0
        fguv_loop_sx0   1, 0, 1
        fguv_loop_sx0   1, 1, 0
        fguv_loop_sx0   1, 1, 1

9:
        vpop            {q4-q7}
        pop             {r4-r11,pc}
endfunc

function fguv_loop_sx1_neon
L(fguv_loop_sx1_tbl):
        .word L(fguv_loop_sx1_csfl0_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx1_csfl0_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx1_csfl0_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx1_csfl0_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx1_csfl1_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx1_csfl1_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx1_csfl1_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx1_csfl1_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB

.macro fguv_loop_sx1 csfl, ox, oy
L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
.if \oy
        mov             r12, lr
.endif
1:
.if \ox
        vld1.16         {d0},  [r4],  r10      // grain_lut old
.endif
.if \ox && \oy
        vld1.16         {d2},  [r11], r10      // grain_lut top old
.endif
.if \oy
        vld1.16         {q2, q3},  [r8],  r10  // grain_lut top
.endif
.if !\ox && !\oy
        vld1.16         {q0, q1},  [r6, :128]! // luma
.endif
        vld1.16         {q8, q9},  [r5],  r10  // grain_lut
.if \oy
        vdup.16         d28, d31[2]            // overlap y coeff
        vdup.16         d29, d31[3]            // overlap y coeff
.endif
.if !\ox && !\oy
        vld1.16         {q2, q3},  [r6, :128], r7 // luma
.endif

.if \ox
        vdup.16         q7,  d30[3]            // bitdepth_max
        vmull.s16       q0,  d0,  d24
        vshr.u16        q7,  q7,  #1           // grain_max
        vmlal.s16       q0,  d16, d25
        vmvn            q6,  q7                // grain_min
.endif

.if \oy
.if \ox
        vmull.s16       q1,  d2,  d24
        vmlal.s16       q1,  d4,  d25
        vqrshrn.s32     d16, q0,  #5
        vqrshrn.s32     d4,  q1,  #5
        vmin.s16        d4,  d4,  d14
        vmin.s16        d16, d16, d14
        vmax.s16        d4,  d4,  d12
        vmax.s16        d16, d16, d12
.endif

        vmull.s16       q0,  d4,  d28
        vmull.s16       q1,  d5,  d28
        vmull.s16       q2,  d6,  d28
        vmull.s16       q3,  d7,  d28
.if !\ox
        vdup.16         q7,  d30[3]            // bitdepth_max
.endif
        vmlal.s16       q0,  d16, d29
        vmlal.s16       q1,  d17, d29
        vmlal.s16       q2,  d18, d29
        vmlal.s16       q3,  d19, d29
.if !\ox
        vshr.u16        q7,  q7,  #1           // grain_max
.endif
        vqrshrn.s32     d16, q0,  #5
        vqrshrn.s32     d17, q1,  #5
        vqrshrn.s32     d18, q2,  #5
        vqrshrn.s32     d19, q3,  #5
.if !\ox
        vmvn            q6,  q7                // grain_min
.endif
        vld1.16         {q0, q1},  [r6, :128]! // luma
        vmin.s16        q8,  q8,  q7
        vmin.s16        q9,  q9,  q7
        vmax.s16        q8,  q8,  q6
        vmax.s16        q9,  q9,  q6
        vld1.16         {q2, q3},  [r6, :128], r7 // luma
.elseif \ox
        vqrshrn.s32     d16, q0,  #5
        vld1.16         {q0, q1},  [r6, :128]! // luma
        vmin.s16        d16, d16, d14
        vld1.16         {q2, q3},  [r6, :128], r7 // luma
        vmax.s16        d16, d16, d12
.endif
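// With horizontal subsampling, two luma samples map to each chroma sample;
// the vpadd/vrshr pairs below compute luma = (l[2*x] + l[2*x+1] + 1) >> 1
// before the (optional) !csfl combination with the chroma source.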
        vpadd.i16       d0,  d0,  d1
        vpadd.i16       d1,  d2,  d3
        vpadd.i16       d2,  d4,  d5
        vpadd.i16       d3,  d6,  d7
        vrshr.u16       q0,  q0,  #1
        vrshr.u16       q1,  q1,  #1
.if !\csfl
        vdup.16         d28, d30[0]            // uv_luma_mult
        vld1.16         {q2, q3},  [r1, :128], r2 // src
        vdup.16         d29, d30[1]            // uv_mult
        vmull.s16       q6,  d0,  d28
        vmull.s16       q7,  d1,  d28
        vmull.s16       q0,  d2,  d28
        vmull.s16       q1,  d3,  d28
        vmlal.s16       q6,  d4,  d29
        vmlal.s16       q7,  d5,  d29
        vmlal.s16       q0,  d6,  d29
        vmlal.s16       q1,  d7,  d29
        vshrn.s32       d12, q6,  #6
        vshrn.s32       d13, q7,  #6
        vshrn.s32       d14, q0,  #6
        vshrn.s32       d15, q1,  #6
        vdup.16         q14, d30[2]            // uv_offset
        vdup.16         q4,  d30[3]            // bitdepth_max
        vmov.i16        q5,  #0
        vadd.i16        q6,  q6,  q14
        vadd.i16        q7,  q7,  q14
        vmin.s16        q0,  q6,  q4
        vmin.s16        q1,  q7,  q4
        vmax.s16        q0,  q0,  q5
        vmax.s16        q1,  q1,  q5
.else
        vdup.16         q14, d30[3]            // bitdepth_max
        vld1.16         {q2, q3},  [r1, :128], r2 // src

        // Make sure that uninitialized pixels out of range past the right
        // edge are in range; their actual values shouldn't matter.
        vand            q0,  q0,  q14
        vand            q1,  q1,  q14
.endif

        bl              gather16_neon

        vmovl.u8        q6,  d8   // scaling
        vmovl.u8        q7,  d9

        vshl.u16        q6,  q6,  q13 // scaling << (15 - scaling_shift)
        vshl.u16        q7,  q7,  q13

        vqrdmulh.s16    q8,  q8,  q6  // round2((scaling << (15 - scaling_shift)) * grain, 15)
        vqrdmulh.s16    q9,  q9,  q7


        vdup.16         q4,  d31[0]            // clip_min
        vdup.16         q5,  d31[1]            // clip_max

        vqadd.s16       q0,  q2,  q8  // *src + noise
        vqadd.s16       q1,  q3,  q9

.if \oy
        // Swap the two last coefficients of d31, place them first in d28
        vrev64.16       d28, d31
.endif

        vmax.s16        q0,  q0,  q4
        vmax.s16        q1,  q1,  q4
        vmin.s16        q0,  q0,  q5
        vmin.s16        q1,  q1,  q5

        subs            r9,  r9,  #1
.if \oy
        // Take the first two 16 bit coefficients of d28 and place them at the
        // end of d31
        vtrn.32         d31, d28
.endif

        vst1.16         {q0, q1},  [r0, :128], r2 // dst
        bgt             1b

.if \oy
        cmp             r12, #0
        mov             r9,  r12    // restore actual remaining h
        bgt             L(fguv_loop_sx1_csfl\csfl\()_\ox\()0)
.endif

        b               9f
.endm
        fguv_loop_sx1   0, 0, 0
        fguv_loop_sx1   0, 0, 1
        fguv_loop_sx1   0, 1, 0
        fguv_loop_sx1   0, 1, 1
        fguv_loop_sx1   1, 0, 0
        fguv_loop_sx1   1, 0, 1
        fguv_loop_sx1   1, 1, 0
        fguv_loop_sx1   1, 1, 1

9:
        vpop            {q4-q7}
        pop             {r4-r11,pc}
endfunc