/*
 * Copyright © 2021, VideoLAN and dav1d authors
 * Copyright © 2021, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"
#include "src/arm/asm-offsets.h"

#define GRAIN_WIDTH 82
#define GRAIN_HEIGHT 73

#define SUB_GRAIN_WIDTH 44
#define SUB_GRAIN_HEIGHT 38

.macro increment_seed steps, shift=1
        lsr             w11, w2, #3
        lsr             w12, w2, #12
        lsr             w13, w2, #1
        eor             w11, w2, w11   // (r >> 0) ^ (r >> 3)
        eor             w12, w12, w13  // (r >> 12) ^ (r >> 1)
        eor             w11, w11, w12  // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1)
.if \shift
        lsr             w2, w2, #\steps
.endif
        and             w11, w11, #((1 << \steps) - 1) // bit
.if \shift
        orr             w2, w2, w11, lsl #(16 - \steps) // *state
.else
        orr             w2, w2, w11, lsl #16            // *state
.endif
.endm

.macro read_rand dest, bits, age
        ubfx            \dest, x2, #16 - \bits - \age, #\bits
.endm

.macro read_shift_rand dest, bits
        ubfx            \dest, x2, #17 - \bits, #\bits
        lsr             w2, w2, #1
.endm

// special calling convention:
// w2 holds seed
// x3 holds dav1d_gaussian_sequence
// clobbers x11-x15
// returns in v0.8h
function get_gaussian_neon
        increment_seed  4
        read_rand       x14, 11, 3
        read_rand       x15, 11, 2
        add             x14, x3, x14, lsl #1
        add             x15, x3, x15, lsl #1
        ld1             {v0.h}[0], [x14]
        read_rand       x14, 11, 1
        ld1             {v0.h}[1], [x15]
        add             x14, x3, x14, lsl #1
        read_rand       x15, 11, 0
        increment_seed  4
        add             x15, x3, x15, lsl #1
        ld1             {v0.h}[2], [x14]
        read_rand       x14, 11, 3
        ld1             {v0.h}[3], [x15]
        add             x14, x3, x14, lsl #1
        read_rand       x15, 11, 2
        ld1             {v0.h}[4], [x14]
        add             x15, x3, x15, lsl #1
        read_rand       x14, 11, 1
        ld1             {v0.h}[5], [x15]
        read_rand       x15, 11, 0
        add             x14, x3, x14, lsl #1
        add             x15, x3, x15, lsl #1
        ld1             {v0.h}[6], [x14]
        ld1             {v0.h}[7], [x15]
        ret
endfunc
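
// For orientation, a rough scalar C model of the PRNG that the macros above
// vectorize (a sketch following the AV1 film grain LFSR as in the C reference
// code; it is illustrative only and not part of this file's build):
//
//   static inline int get_random_number(const int bits, unsigned *const state) {
//       const unsigned r = *state;
//       const unsigned bit = ((r >> 0) ^ (r >> 1) ^ (r >> 3) ^ (r >> 12)) & 1;
//       *state = (r >> 1) | (bit << 15);
//       return (*state >> (16 - bits)) & ((1 << bits) - 1);
//   }
//
// increment_seed folds several such updates (its "steps" argument) into one
// pass, read_rand/read_shift_rand extract the 11-bit indices (the "age"
// argument selects which of the batched updates an index belongs to), and
// get_gaussian_neon gathers the corresponding dav1d_gaussian_sequence entries
// into v0.8h.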

.macro store_grain_row r0, r1, r2, r3, r4, r5
        st1             {\r0\().16b,\r1\().16b}, [x0], #32
        st1             {\r2\().16b,\r3\().16b}, [x0], #32
        st1             {\r4\().16b}, [x0], #16
        st1             {\r5\().h}[0], [x0], #2
.endm

function get_grain_2_neon
        increment_seed  2
        read_rand       x14, 11, 1
        read_rand       x15, 11, 0
        add             x14, x3, x14, lsl #1
        add             x15, x3, x15, lsl #1
        ld1             {v0.h}[0], [x14]
        ld1             {v0.h}[1], [x15]
        srshl           v0.4h, v0.4h, v31.4h
        ret
endfunc

.macro get_grain_2 dst
        bl              get_grain_2_neon
.ifnc \dst, v0
        mov             \dst\().8b, v0.8b
.endif
.endm

function get_grain_4_neon
        increment_seed  4
        read_rand       x14, 11, 3
        read_rand       x15, 11, 2
        add             x14, x3, x14, lsl #1
        add             x15, x3, x15, lsl #1
        ld1             {v0.h}[0], [x14]
        read_rand       x14, 11, 1
        ld1             {v0.h}[1], [x15]
        add             x14, x3, x14, lsl #1
        read_rand       x15, 11, 0
        add             x15, x3, x15, lsl #1
        ld1             {v0.h}[2], [x14]
        ld1             {v0.h}[3], [x15]
        srshl           v0.4h, v0.4h, v31.4h
        ret
endfunc

.macro get_grain_4 dst
        bl              get_grain_4_neon
.ifnc \dst, v0
        mov             \dst\().8b, v0.8b
.endif
.endm

// w15 holds the number of entries to produce
// w14, w16 and w17 hold the previous output entries
// v0 holds the vector of produced entries
// v1 holds the input vector of sums from above
.macro output_lag n
function output_lag\n\()_neon
1:
        read_shift_rand x13, 11
        mov             w11, v1.s[0]
        ldrsh           w12, [x3, x13, lsl #1]
        ext             v0.16b, v0.16b, v0.16b, #2
.if \n == 1
        madd            w11, w14, w4, w11  // sum (above) + *coeff * prev output
.elseif \n == 2
        madd            w11, w16, w4, w11  // sum (above) + *coeff * prev output 1
        madd            w11, w14, w17, w11 // += *coeff * prev output 2
        mov             w16, w14
.else
        madd            w11, w17, w4, w11  // sum (above) + *coeff * prev output 1
        madd            w11, w16, w20, w11 // sum (above) + *coeff * prev output 2
        madd            w11, w14, w21, w11 // += *coeff * prev output 3
        mov             w17, w16
        mov             w16, w14
.endif
        add             w14, w11, w8  // 1 << (ar_coeff_shift - 1)
        add             w12, w12, w10 // 1 << (4 - bitdepth_min_8 + grain_scale_shift - 1)
        asr             w14, w14, w7  // >> ar_coeff_shift
        asr             w12, w12, w9  // >> (4 - bitdepth_min_8 + grain_scale_shift)
        add             w14, w14, w12
        cmp             w14, w5
        csel            w14, w14, w5, le
        cmp             w14, w6
        csel            w14, w14, w6, ge
        subs            w15, w15, #1
        ext             v1.16b, v1.16b, v1.16b, #4
        ins             v0.h[7], w14
        b.gt            1b
        ret
endfunc
.endm

output_lag 1
output_lag 2
output_lag 3

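// In pseudo-C, each iteration of output_lag1/2/3_neon above computes roughly
// the following (a sketch only; "sum" is the lane of v1 carrying the
// contribution from the rows above, prev1..prev3 are the previous outputs kept
// in w14/w16/w17, and the coefficient registers are loaded by the callers):
//
//   int grain = sum + coeff1 * prev1;   // + coeff2 * prev2 (+ coeff3 * prev3)
//   grain = (grain + (1 << (ar_coeff_shift - 1))) >> ar_coeff_shift;
//   grain += (gauss + (1 << (grain_shift - 1))) >> grain_shift;
//   grain = clamp(grain, grain_min, grain_max);   // w6 .. w5
//
// where gauss is a fresh dav1d_gaussian_sequence sample and
// grain_shift = 4 - bitdepth_min_8 + data->grain_scale_shift.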

function sum_lag1_above_neon
        sub             x12, x0, #1*GRAIN_WIDTH*2 - 16
        ld1             {v18.8h}, [x12] // load top right

        ext             v0.16b, v16.16b, v17.16b, #14 // top left, top mid
        ext             v1.16b, v17.16b, v18.16b, #2  // top mid, top right

        smull           v4.4s, v17.4h, v28.4h
        smlal           v4.4s, v0.4h, v27.4h
        smlal           v4.4s, v1.4h, v29.4h
        smull2          v5.4s, v17.8h, v28.8h
        smlal2          v5.4s, v0.8h, v27.8h
        smlal2          v5.4s, v1.8h, v29.8h

        mov             v16.16b, v17.16b
        mov             v17.16b, v18.16b

        ret
endfunc

.macro sum_lag_n_body lag, type, uv_layout, edge, elems, uv_coeff
        bl              sum_\lag\()_above_neon
.ifc \type, uv_420
        add             x12, x19, #GRAIN_WIDTH*2
        ld1             {v22.8h, v23.8h}, [x19], #32
        ld1             {v24.8h, v25.8h}, [x12]
        addp            v22.8h, v22.8h, v23.8h
        addp            v23.8h, v24.8h, v25.8h
        add             v22.8h, v22.8h, v23.8h
        srshr           v0.8h, v22.8h, #2
.endif
.ifc \type, uv_422
        ld1             {v22.8h, v23.8h}, [x19], #32
        addp            v22.8h, v22.8h, v23.8h
        srshr           v0.8h, v22.8h, #1
.endif
.ifc \type, uv_444
        ld1             {v0.8h}, [x19], #16
.endif
.if \uv_layout
.ifnb \uv_coeff
        dup             v1.8b, \uv_coeff
        sxtl            v1.8h, v1.8b
        smlal           v4.4s, v0.4h, v1.4h
        smlal2          v5.4s, v0.8h, v1.8h
.else
        smlal           v4.4s, v0.4h, v30.4h
        smlal2          v5.4s, v0.8h, v30.8h
.endif
.endif
.if \uv_layout && \elems == 8
        b               sum_\lag\()_y_\edge\()_start
.elseif \uv_layout == 444 && \elems == 7
        b               sum_\lag\()_y_\edge\()_start
.elseif \uv_layout == 422 && \elems == 1
        b               sum_\lag\()_uv_420_\edge\()_start
.else
sum_\lag\()_\type\()_\edge\()_start:
.if \elems > 4
.ifc \edge, left
        increment_seed  4
        read_rand       x12, 11, 3
        read_rand       x13, 11, 2
        read_rand       x14, 11, 1
        add             x12, x3, x12, lsl #1
        add             x13, x3, x13, lsl #1
        add             x14, x3, x14, lsl #1
        ld1             {v0.h}[5], [x12]
        ld1             {v0.h}[6], [x13]
        ld1             {v0.h}[7], [x14]
        lsl             x2, x2, #1 // shift back the state as if we'd done increment_seed with shift=0
        srshl           v0.8h, v0.8h, v31.8h
        ext             v4.16b, v4.16b, v4.16b, #12
.ifc \lag, lag3
        smov            w17, v0.h[5]
.endif
.ifnc \lag, lag1
        smov            w16, v0.h[6]
.endif
        smov            w14, v0.h[7]

        mov             v1.16b, v4.16b
        mov             w15, #1
        bl              output_\lag\()_neon
.else
        increment_seed  4, shift=0
        mov             v1.16b, v4.16b
        mov             w15, #4
        bl              output_\lag\()_neon
.endif

        increment_seed  4, shift=0
        mov             v1.16b, v5.16b
.ifc \edge, right
        mov             w15, #3
        bl              output_\lag\()_neon
        read_shift_rand x15, 11
        add             x15, x3, x15, lsl #1
        ld1             {v1.h}[0], [x15]
        srshl           v1.4h, v1.4h, v31.4h
        ext             v0.16b, v0.16b, v1.16b, #2
.else
        mov             w15, #4
        bl              output_\lag\()_neon
.endif
.else
        // elems == 1
        increment_seed  4, shift=0
        mov             v1.16b, v4.16b
        mov             w15, #1
        bl              output_\lag\()_neon
        lsr             w2, w2, #3

        read_rand       x12, 11, 2
        read_rand       x13, 11, 1
        read_rand       x14, 11, 0
        add             x12, x3, x12, lsl #1
        add             x13, x3, x13, lsl #1
        add             x14, x3, x14, lsl #1
        ld1             {v1.h}[0], [x12]
        ld1             {v1.h}[1], [x13]
        ld1             {v1.h}[2], [x14]
        srshl           v1.4h, v1.4h, v31.4h
        ext             v0.16b, v0.16b, v1.16b, #14
.endif
        st1             {v0.8h}, [x0], #16
        ldr             x30, [sp], #16
        AARCH64_VALIDATE_LINK_REGISTER
        ret
.endif
.endm

.macro sum_lag1_func type, uv_layout, edge, elems=8
function sum_\type\()_lag1_\edge\()_neon
        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-16]!
333.ifc \edge, left 334 sub x12, x0, #1*GRAIN_WIDTH*2 335 ld1 {v17.8h}, [x12] // load the previous block right above 336.endif 337 sum_lag_n_body lag1, \type, \uv_layout, \edge, \elems 338endfunc 339.endm 340 341sum_lag1_func y, 0, left 342sum_lag1_func y, 0, mid 343sum_lag1_func y, 0, right, 7 344sum_lag1_func uv_444, 444, left 345sum_lag1_func uv_444, 444, mid 346sum_lag1_func uv_444, 444, right, 7 347sum_lag1_func uv_422, 422, left 348sum_lag1_func uv_422, 422, mid 349sum_lag1_func uv_422, 422, right, 1 350sum_lag1_func uv_420, 420, left 351sum_lag1_func uv_420, 420, mid 352sum_lag1_func uv_420, 420, right, 1 353 354 355function sum_lag2_above_neon 356 sub x12, x0, #2*GRAIN_WIDTH*2 - 16 357 sub x13, x0, #1*GRAIN_WIDTH*2 - 16 358 ld1 {v18.8h}, [x12] // load top right 359 ld1 {v21.8h}, [x13] 360 361 dup v26.8b, v30.b[0] 362 ext v22.16b, v16.16b, v17.16b, #12 // top left, top mid 363 dup v27.8b, v30.b[1] 364 ext v23.16b, v16.16b, v17.16b, #14 365 sxtl v26.8h, v26.8b 366 dup v28.8b, v30.b[3] 367 ext v0.16b, v17.16b, v18.16b, #2 // top mid, top right 368 sxtl v27.8h, v27.8b 369 dup v29.8b, v30.b[4] 370 ext v1.16b, v17.16b, v18.16b, #4 371 sxtl v28.8h, v28.8b 372 sxtl v29.8h, v29.8b 373 374 smull v4.4s, v22.4h, v26.4h 375 smlal v4.4s, v23.4h, v27.4h 376 smlal v4.4s, v0.4h, v28.4h 377 smlal v4.4s, v1.4h, v29.4h 378 smull2 v5.4s, v22.8h, v26.8h 379 smlal2 v5.4s, v23.8h, v27.8h 380 smlal2 v5.4s, v0.8h, v28.8h 381 smlal2 v5.4s, v1.8h, v29.8h 382 383 dup v26.16b, v30.b[5] 384 ext v22.16b, v19.16b, v20.16b, #12 // top left, top mid 385 dup v27.16b, v30.b[6] 386 ext v23.16b, v19.16b, v20.16b, #14 387 sxtl v26.8h, v26.8b 388 dup v28.16b, v30.b[8] 389 ext v0.16b, v20.16b, v21.16b, #2 // top mid, top right 390 sxtl v27.8h, v27.8b 391 dup v29.16b, v30.b[9] 392 ext v1.16b, v20.16b, v21.16b, #4 393 sxtl v28.8h, v28.8b 394 sxtl v29.8h, v29.8b 395 396 smlal v4.4s, v22.4h, v26.4h 397 smlal v4.4s, v23.4h, v27.4h 398 smlal v4.4s, v0.4h, v28.4h 399 smlal v4.4s, v1.4h, v29.4h 400 smlal2 v5.4s, v22.8h, v26.8h 401 smlal2 v5.4s, v23.8h, v27.8h 402 smlal2 v5.4s, v0.8h, v28.8h 403 smlal2 v5.4s, v1.8h, v29.8h 404 405 dup v26.16b, v30.b[2] 406 dup v27.16b, v30.b[7] 407 sxtl v26.8h, v26.8b 408 sxtl v27.8h, v27.8b 409 410 smlal v4.4s, v17.4h, v26.4h 411 smlal v4.4s, v20.4h, v27.4h 412 smlal2 v5.4s, v17.8h, v26.8h 413 smlal2 v5.4s, v20.8h, v27.8h 414 mov v16.16b, v17.16b 415 mov v17.16b, v18.16b 416 417 mov v19.16b, v20.16b 418 mov v20.16b, v21.16b 419 ret 420endfunc 421 422.macro sum_lag2_func type, uv_layout, edge, elems=8 423function sum_\type\()_lag2_\edge\()_neon 424 AARCH64_SIGN_LINK_REGISTER 425 str x30, [sp, #-16]! 
426.ifc \edge, left 427 sub x12, x0, #2*GRAIN_WIDTH*2 428 sub x13, x0, #1*GRAIN_WIDTH*2 429 ld1 {v17.8h}, [x12] // load the previous block right above 430 ld1 {v20.8h}, [x13] 431.endif 432 sum_lag_n_body lag2, \type, \uv_layout, \edge, \elems, v30.b[12] 433endfunc 434.endm 435 436sum_lag2_func y, 0, left 437sum_lag2_func y, 0, mid 438sum_lag2_func y, 0, right, 7 439sum_lag2_func uv_444, 444, left 440sum_lag2_func uv_444, 444, mid 441sum_lag2_func uv_444, 444, right, 7 442sum_lag2_func uv_422, 422, left 443sum_lag2_func uv_422, 422, mid 444sum_lag2_func uv_422, 422, right, 1 445sum_lag2_func uv_420, 420, left 446sum_lag2_func uv_420, 420, mid 447sum_lag2_func uv_420, 420, right, 1 448 449 450function sum_lag3_above_neon 451 sub x11, x0, #3*GRAIN_WIDTH*2 - 16 452 sub x12, x0, #2*GRAIN_WIDTH*2 - 16 453 sub x13, x0, #1*GRAIN_WIDTH*2 - 16 454 ld1 {v15.8h}, [x11] // load top right 455 ld1 {v18.8h}, [x12] 456 ld1 {v21.8h}, [x13] 457 458 dup v22.8b, v29.b[0] 459 ext v8.16b, v13.16b, v14.16b, #10 // top left, top mid 460 dup v23.8b, v29.b[1] 461 ext v9.16b, v13.16b, v14.16b, #12 462 sxtl v22.8h, v22.8b 463 dup v24.8b, v29.b[2] 464 sxtl v23.8h, v23.8b 465 dup v25.8b, v29.b[3] 466 ext v10.16b, v13.16b, v14.16b, #14 467 sxtl v24.8h, v24.8b 468 dup v26.8b, v29.b[4] 469 ext v11.16b, v14.16b, v15.16b, #2 // top mid, top right 470 sxtl v25.8h, v25.8b 471 dup v27.8b, v29.b[5] 472 ext v12.16b, v14.16b, v15.16b, #4 473 sxtl v26.8h, v26.8b 474 dup v28.8b, v29.b[6] 475 ext v13.16b, v14.16b, v15.16b, #6 476 sxtl v27.8h, v27.8b 477 sxtl v28.8h, v28.8b 478 479 smull v4.4s, v8.4h, v22.4h 480 smlal v4.4s, v9.4h, v23.4h 481 smlal v4.4s, v10.4h, v24.4h 482 smlal v4.4s, v11.4h, v26.4h 483 smlal v4.4s, v12.4h, v27.4h 484 smlal v4.4s, v13.4h, v28.4h 485 smlal v4.4s, v14.4h, v25.4h 486 smull2 v5.4s, v8.8h, v22.8h 487 smlal2 v5.4s, v9.8h, v23.8h 488 smlal2 v5.4s, v10.8h, v24.8h 489 smlal2 v5.4s, v11.8h, v26.8h 490 smlal2 v5.4s, v12.8h, v27.8h 491 smlal2 v5.4s, v13.8h, v28.8h 492 smlal2 v5.4s, v14.8h, v25.8h 493 494 dup v22.8b, v29.b[7] 495 ext v8.16b, v16.16b, v17.16b, #10 // top left, top mid 496 dup v23.8b, v29.b[8] 497 ext v9.16b, v16.16b, v17.16b, #12 498 sxtl v22.8h, v22.8b 499 dup v24.8b, v29.b[9] 500 sxtl v23.8h, v23.8b 501 dup v25.8b, v29.b[10] 502 ext v10.16b, v16.16b, v17.16b, #14 503 sxtl v24.8h, v24.8b 504 dup v26.8b, v29.b[11] 505 ext v11.16b, v17.16b, v18.16b, #2 // top mid, top right 506 sxtl v25.8h, v25.8b 507 dup v27.8b, v29.b[12] 508 ext v12.16b, v17.16b, v18.16b, #4 509 sxtl v26.8h, v26.8b 510 dup v28.8b, v29.b[13] 511 ext v13.16b, v17.16b, v18.16b, #6 512 sxtl v27.8h, v27.8b 513 sxtl v28.8h, v28.8b 514 515 smlal v4.4s, v8.4h, v22.4h 516 smlal v4.4s, v9.4h, v23.4h 517 smlal v4.4s, v10.4h, v24.4h 518 smlal v4.4s, v11.4h, v26.4h 519 smlal v4.4s, v12.4h, v27.4h 520 smlal v4.4s, v13.4h, v28.4h 521 smlal v4.4s, v17.4h, v25.4h 522 smlal2 v5.4s, v8.8h, v22.8h 523 smlal2 v5.4s, v9.8h, v23.8h 524 smlal2 v5.4s, v10.8h, v24.8h 525 smlal2 v5.4s, v11.8h, v26.8h 526 smlal2 v5.4s, v12.8h, v27.8h 527 smlal2 v5.4s, v13.8h, v28.8h 528 smlal2 v5.4s, v17.8h, v25.8h 529 530 dup v22.8b, v29.b[14] 531 ext v8.16b, v19.16b, v20.16b, #10 // top left, top mid 532 dup v23.8b, v29.b[15] 533 ext v9.16b, v19.16b, v20.16b, #12 534 sxtl v22.8h, v22.8b 535 dup v24.8b, v30.b[0] 536 sxtl v23.8h, v23.8b 537 dup v25.8b, v30.b[1] 538 ext v10.16b, v19.16b, v20.16b, #14 539 sxtl v24.8h, v24.8b 540 dup v26.8b, v30.b[2] 541 ext v11.16b, v20.16b, v21.16b, #2 // top mid, top right 542 sxtl v25.8h, v25.8b 543 dup v27.8b, v30.b[3] 544 ext v12.16b, 
v20.16b, v21.16b, #4 545 sxtl v26.8h, v26.8b 546 dup v28.8b, v30.b[4] 547 ext v13.16b, v20.16b, v21.16b, #6 548 sxtl v27.8h, v27.8b 549 sxtl v28.8h, v28.8b 550 551 smlal v4.4s, v8.4h, v22.4h 552 smlal v4.4s, v9.4h, v23.4h 553 smlal v4.4s, v10.4h, v24.4h 554 smlal v4.4s, v11.4h, v26.4h 555 smlal v4.4s, v12.4h, v27.4h 556 smlal v4.4s, v13.4h, v28.4h 557 smlal v4.4s, v20.4h, v25.4h 558 mov v16.16b, v17.16b 559 mov v17.16b, v18.16b 560 smlal2 v5.4s, v8.8h, v22.8h 561 smlal2 v5.4s, v9.8h, v23.8h 562 smlal2 v5.4s, v10.8h, v24.8h 563 smlal2 v5.4s, v11.8h, v26.8h 564 smlal2 v5.4s, v12.8h, v27.8h 565 smlal2 v5.4s, v13.8h, v28.8h 566 smlal2 v5.4s, v20.8h, v25.8h 567 568 mov v13.16b, v14.16b 569 mov v14.16b, v15.16b 570 571 mov v19.16b, v20.16b 572 mov v20.16b, v21.16b 573 ret 574endfunc 575 576.macro sum_lag3_func type, uv_layout, edge, elems=8 577function sum_\type\()_lag3_\edge\()_neon 578 AARCH64_SIGN_LINK_REGISTER 579 str x30, [sp, #-16]! 580.ifc \edge, left 581 sub x11, x0, #3*GRAIN_WIDTH*2 582 sub x12, x0, #2*GRAIN_WIDTH*2 583 sub x13, x0, #1*GRAIN_WIDTH*2 584 ld1 {v14.8h}, [x11] // load the previous block right above 585 ld1 {v17.8h}, [x12] 586 ld1 {v20.8h}, [x13] 587.endif 588 sum_lag_n_body lag3, \type, \uv_layout, \edge, \elems, v30.b[8] 589endfunc 590.endm 591 592sum_lag3_func y, 0, left 593sum_lag3_func y, 0, mid 594sum_lag3_func y, 0, right, 7 595sum_lag3_func uv_444, 444, left 596sum_lag3_func uv_444, 444, mid 597sum_lag3_func uv_444, 444, right, 7 598sum_lag3_func uv_422, 422, left 599sum_lag3_func uv_422, 422, mid 600sum_lag3_func uv_422, 422, right, 1 601sum_lag3_func uv_420, 420, left 602sum_lag3_func uv_420, 420, mid 603sum_lag3_func uv_420, 420, right, 1 604 605function generate_grain_rows_neon 606 AARCH64_SIGN_LINK_REGISTER 607 str x30, [sp, #-16]! 6081: 609 mov w16, #80 6102: 611 bl get_gaussian_neon 612 srshl v0.8h, v0.8h, v31.8h 613 subs w16, w16, #8 614 st1 {v0.8h}, [x0], #16 615 b.gt 2b 616 get_grain_2 v0 617 subs w1, w1, #1 618 st1 {v0.s}[0], [x0], #4 619 b.gt 1b 620 ldr x30, [sp], #16 621 AARCH64_VALIDATE_LINK_REGISTER 622 ret 623endfunc 624 625function generate_grain_rows_44_neon 626 AARCH64_SIGN_LINK_REGISTER 627 str x30, [sp, #-16]! 6281: 629 mov w16, #40 6302: 631 bl get_gaussian_neon 632 srshl v0.8h, v0.8h, v31.8h 633 subs w16, w16, #8 634 st1 {v0.8h}, [x0], #16 635 b.gt 2b 636 get_grain_4 v0 637 subs w1, w1, #1 638 st1 {v0.4h}, [x0] 639 add x0, x0, #GRAIN_WIDTH*2-80 640 b.gt 1b 641 ldr x30, [sp], #16 642 AARCH64_VALIDATE_LINK_REGISTER 643 ret 644endfunc 645 646function gen_grain_uv_444_lag0_neon 647 AARCH64_SIGN_LINK_REGISTER 648 str x30, [sp, #-16]! 649 ld1 {v4.8h}, [x19], #16 650gen_grain_uv_lag0_8_start: 651 bl get_gaussian_neon 652 srshl v0.8h, v0.8h, v31.8h 653gen_grain_uv_lag0_8_add: 654 and v4.16b, v4.16b, v1.16b 655 smull v2.4s, v4.4h, v27.4h 656 smull2 v3.4s, v4.8h, v27.8h 657 srshl v2.4s, v2.4s, v28.4s 658 srshl v3.4s, v3.4s, v28.4s 659 sqxtn v2.4h, v2.4s 660 sqxtn2 v2.8h, v3.4s 661 sqadd v2.8h, v2.8h, v0.8h 662 smin v2.8h, v2.8h, v25.8h 663 smax v2.8h, v2.8h, v26.8h 664 st1 {v2.8h}, [x0], #16 665 ldr x30, [sp], #16 666 AARCH64_VALIDATE_LINK_REGISTER 667 ret 668endfunc 669 670function gen_grain_uv_420_lag0_8_neon 671 AARCH64_SIGN_LINK_REGISTER 672 add x12, x19, #GRAIN_WIDTH*2 673 str x30, [sp, #-16]! 
        ld1             {v16.8h, v17.8h}, [x19], #32
        ld1             {v18.8h, v19.8h}, [x12]
        addp            v16.8h, v16.8h, v17.8h
        addp            v17.8h, v18.8h, v19.8h
        add             v16.8h, v16.8h, v17.8h
        srshr           v4.8h, v16.8h, #2
        b               gen_grain_uv_lag0_8_start
endfunc

function gen_grain_uv_422_lag0_8_neon
        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-16]!
        ld1             {v16.8h, v17.8h}, [x19], #32
        addp            v16.8h, v16.8h, v17.8h
        srshr           v4.8h, v16.8h, #1
        b               gen_grain_uv_lag0_8_start
endfunc

function gen_grain_uv_420_lag0_4_neon
        add             x12, x19, #GRAIN_WIDTH*2
        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-16]!
        ld1             {v16.4h, v17.4h}, [x19]
        ld1             {v18.4h, v19.4h}, [x12]
        add             x19, x19, #32
        addp            v16.4h, v16.4h, v17.4h
        addp            v17.4h, v18.4h, v19.4h
        add             v16.4h, v16.4h, v17.4h
        srshr           v4.4h, v16.4h, #2
        get_grain_4     v0
        b               gen_grain_uv_lag0_8_add
endfunc

function gen_grain_uv_422_lag0_4_neon
        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-16]!
        ld1             {v16.4h, v17.4h}, [x19]
        add             x19, x19, #32
        addp            v16.4h, v16.4h, v17.4h
        srshr           v4.4h, v16.4h, #1
        get_grain_4     v0
        b               gen_grain_uv_lag0_8_add
endfunc

.macro gen_grain_82 type
function generate_grain_\type\()_16bpc_neon, export=1
        AARCH64_SIGN_LINK_REGISTER
        stp             x30, x19, [sp, #-96]!

.ifc \type, uv_444
        mov             w13, w3
        mov             w14, #28
        add             x19, x1, #3*GRAIN_WIDTH*2
        mov             x1, x2
        mul             w13, w13, w14
        clz             w15, w4
.else
        clz             w15, w2
.endif
        movrel          x3, X(gaussian_sequence)
        sub             w15, w15, #24 // -bitdepth_min_8
        ldr             w2, [x1, #FGD_SEED]
        ldr             w9, [x1, #FGD_GRAIN_SCALE_SHIFT]
.ifc \type, y
        add             x4, x1, #FGD_AR_COEFFS_Y
.else
        add             x4, x1, #FGD_AR_COEFFS_UV
.endif
        add             w9, w9, w15 // grain_scale_shift - bitdepth_min_8
        adr             x16, L(gen_grain_\type\()_tbl)
        ldr             w17, [x1, #FGD_AR_COEFF_LAG]
        add             w9, w9, #4
        ldrh            w17, [x16, w17, uxtw #1]
        dup             v31.8h, w9 // 4 - bitdepth_min_8 + data->grain_scale_shift
        sub             x16, x16, w17, uxtw
        neg             v31.8h, v31.8h

.ifc \type, uv_444
        cmp             w13, #0
        mov             w11, #0x49d8
        mov             w14, #0xb524
        add             x4, x4, w13, uxtw // Add offset to ar_coeffs_uv[1]
        csel            w11, w11, w14, ne
.endif

        ldr             w7, [x1, #FGD_AR_COEFF_SHIFT]
        neg             w15, w15 // bitdepth_min_8
        mov             w8, #1
        mov             w10, #1
        lsl             w8, w8, w7   // 1 << ar_coeff_shift
        lsl             w10, w10, w9 // 1 << (4 + data->grain_scale_shift)
        lsr             w8, w8, #1   // 1 << (ar_coeff_shift - 1)
        lsr             w10, w10, #1 // 1 << (4 + data->grain_scale_shift - 1)
        mov             w5, #128
        lsl             w5, w5, w15 // 128 << bitdepth_min_8
        neg             w6, w5      // -(128 << bitdepth_min_8)
        sub             w5, w5, #1  // (128 << bitdepth_min_8) - 1

.ifc \type, uv_444
        eor             w2, w2, w11
.endif

        br              x16

L(generate_grain_\type\()_lag0):
        AARCH64_VALID_JUMP_TARGET
.ifc \type, y
        mov             w1, #GRAIN_HEIGHT
        bl              generate_grain_rows_neon
.else
        dup             v28.4s, w7
        ld1r            {v27.8b}, [x4] // ar_coeffs_uv[0]
        movi            v0.16b, #0
        movi            v1.16b, #255
        dup             v25.8h, w5
        dup             v26.8h, w6
        ext             v29.16b, v0.16b, v1.16b, #10
        ext             v30.16b, v1.16b, v0.16b, #2
        neg             v28.4s, v28.4s
        sxtl            v27.8h, v27.8b

        mov             w1, #3
        bl              generate_grain_rows_neon
        mov             w1, #GRAIN_HEIGHT-3
1:
        mov             v1.16b, v29.16b
        bl              gen_grain_uv_444_lag0_neon // 8
        movi            v1.16b, #255
        bl              gen_grain_uv_444_lag0_neon // 16
        bl              gen_grain_uv_444_lag0_neon // 24
        bl              gen_grain_uv_444_lag0_neon // 32
        bl              gen_grain_uv_444_lag0_neon // 40
        bl              gen_grain_uv_444_lag0_neon // 48
bl gen_grain_uv_444_lag0_neon // 56 808 bl gen_grain_uv_444_lag0_neon // 64 809 bl gen_grain_uv_444_lag0_neon // 72 810 mov v1.16b, v30.16b 811 bl gen_grain_uv_444_lag0_neon // 80 812 get_grain_2 v16 813 subs w1, w1, #1 814 add x19, x19, #4 815 st1 {v16.s}[0], [x0], #4 816 b.gt 1b 817.endif 818 ldp x30, x19, [sp], #96 819 AARCH64_VALIDATE_LINK_REGISTER 820 ret 821 822L(generate_grain_\type\()_lag1): 823 AARCH64_VALID_JUMP_TARGET 824 ld1r {v27.8b}, [x4], #1 // ar_coeffs_y[0] 825 ld1r {v28.8b}, [x4], #1 // ar_coeffs_y[1] 826 ld1r {v29.8b}, [x4] // ar_coeffs_y[2] 827.ifc \type, y 828 ldrsb w4, [x4, #1] // ar_coeffs_y[3] 829.else 830 add x4, x4, #2 831.endif 832 833 mov w1, #3 834.ifc \type, uv_444 835 ld1r {v30.8b}, [x4] // ar_coeffs_uv[4] 836 ldursb w4, [x4, #-1] // ar_coeffs_uv[3] 837.endif 838 bl generate_grain_rows_neon 839 sxtl v27.8h, v27.8b 840 sxtl v28.8h, v28.8b 841 sxtl v29.8h, v29.8b 842.ifc \type, uv_444 843 sxtl v30.8h, v30.8b 844.endif 845 846 mov w1, #GRAIN_HEIGHT - 3 8471: 848 bl sum_\type\()_lag1_left_neon // 8 849 bl sum_\type\()_lag1_mid_neon // 16 850 bl sum_\type\()_lag1_mid_neon // 24 851 bl sum_\type\()_lag1_mid_neon // 32 852 bl sum_\type\()_lag1_mid_neon // 40 853 bl sum_\type\()_lag1_mid_neon // 48 854 bl sum_\type\()_lag1_mid_neon // 56 855 bl sum_\type\()_lag1_mid_neon // 64 856 bl sum_\type\()_lag1_mid_neon // 72 857 bl sum_\type\()_lag1_right_neon // 80 858 get_grain_2 v16 859 subs w1, w1, #1 860.ifc \type, uv_444 861 add x19, x19, #4 862.endif 863 st1 {v16.s}[0], [x0], #4 864 b.gt 1b 865 866 ldp x30, x19, [sp], #96 867 AARCH64_VALIDATE_LINK_REGISTER 868 ret 869 870L(generate_grain_\type\()_lag2): 871 AARCH64_VALID_JUMP_TARGET 872 ld1 {v30.16b}, [x4] // ar_coeffs_y[0-11], ar_coeffs_uv[0-12] 873 874 smov w4, v30.b[10] 875 smov w17, v30.b[11] 876 877 mov w1, #3 878 bl generate_grain_rows_neon 879 880 mov w1, #GRAIN_HEIGHT - 3 8811: 882 bl sum_\type\()_lag2_left_neon // 8 883 bl sum_\type\()_lag2_mid_neon // 16 884 bl sum_\type\()_lag2_mid_neon // 24 885 bl sum_\type\()_lag2_mid_neon // 32 886 bl sum_\type\()_lag2_mid_neon // 40 887 bl sum_\type\()_lag2_mid_neon // 48 888 bl sum_\type\()_lag2_mid_neon // 56 889 bl sum_\type\()_lag2_mid_neon // 64 890 bl sum_\type\()_lag2_mid_neon // 72 891 bl sum_\type\()_lag2_right_neon // 80 892 get_grain_2 v16 893 subs w1, w1, #1 894.ifc \type, uv_444 895 add x19, x19, #4 896.endif 897 st1 {v16.s}[0], [x0], #4 898 b.gt 1b 899 900 ldp x30, x19, [sp], #96 901 AARCH64_VALIDATE_LINK_REGISTER 902 ret 903 904L(generate_grain_\type\()_lag3): 905 AARCH64_VALID_JUMP_TARGET 906 ld1 {v29.16b, v30.16b}, [x4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24] 907 stp d8, d9, [sp, #16] 908 stp d10, d11, [sp, #32] 909 stp d12, d13, [sp, #48] 910 stp d14, d15, [sp, #64] 911 stp x20, x21, [sp, #80] 912 913 smov w4, v30.b[5] 914 smov w20, v30.b[6] 915 smov w21, v30.b[7] 916 917 mov w1, #3 918 bl generate_grain_rows_neon 919 920 mov w1, #GRAIN_HEIGHT - 3 9211: 922 bl sum_\type\()_lag3_left_neon // 8 923 bl sum_\type\()_lag3_mid_neon // 16 924 bl sum_\type\()_lag3_mid_neon // 24 925 bl sum_\type\()_lag3_mid_neon // 32 926 bl sum_\type\()_lag3_mid_neon // 40 927 bl sum_\type\()_lag3_mid_neon // 48 928 bl sum_\type\()_lag3_mid_neon // 56 929 bl sum_\type\()_lag3_mid_neon // 64 930 bl sum_\type\()_lag3_mid_neon // 72 931 bl sum_\type\()_lag3_right_neon // 80 932 get_grain_2 v16 933 subs w1, w1, #1 934.ifc \type, uv_444 935 add x19, x19, #4 936.endif 937 st1 {v16.s}[0], [x0], #4 938 b.gt 1b 939 940 ldp x20, x21, [sp, #80] 941 ldp d14, d15, [sp, #64] 942 ldp d12, 
d13, [sp, #48]
        ldp             d10, d11, [sp, #32]
        ldp             d8, d9, [sp, #16]
        ldp             x30, x19, [sp], #96
        AARCH64_VALIDATE_LINK_REGISTER
        ret

L(gen_grain_\type\()_tbl):
        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0)
        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1)
        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2)
        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3)
endfunc
.endm

gen_grain_82 y
gen_grain_82 uv_444

.macro set_height dst, type
.ifc \type, uv_420
        mov             \dst, #SUB_GRAIN_HEIGHT-3
.else
        mov             \dst, #GRAIN_HEIGHT-3
.endif
.endm

.macro increment_y_ptr reg, type
.ifc \type, uv_420
        add             \reg, \reg, #2*GRAIN_WIDTH*2-(6*32)
.else
        sub             \reg, \reg, #6*32-GRAIN_WIDTH*2
.endif
.endm

.macro gen_grain_44 type
function generate_grain_\type\()_16bpc_neon, export=1
        AARCH64_SIGN_LINK_REGISTER
        stp             x30, x19, [sp, #-96]!

        mov             w13, w3
        mov             w14, #28
        add             x19, x1, #(3*GRAIN_WIDTH-3)*2
        mov             x1, x2
        mul             w13, w13, w14
        clz             w15, w4

        movrel          x3, X(gaussian_sequence)
        sub             w15, w15, #24 // -bitdepth_min_8
        ldr             w2, [x1, #FGD_SEED]
        ldr             w9, [x1, #FGD_GRAIN_SCALE_SHIFT]
        add             x4, x1, #FGD_AR_COEFFS_UV
        add             w9, w9, w15 // grain_scale_shift - bitdepth_min_8
        adr             x16, L(gen_grain_\type\()_tbl)
        ldr             w17, [x1, #FGD_AR_COEFF_LAG]
        add             w9, w9, #4
        ldrh            w17, [x16, w17, uxtw #1]
        dup             v31.8h, w9 // 4 - bitdepth_min_8 + data->grain_scale_shift
        sub             x16, x16, w17, uxtw
        neg             v31.8h, v31.8h

        cmp             w13, #0
        mov             w11, #0x49d8
        mov             w14, #0xb524
        add             x4, x4, w13, uxtw // Add offset to ar_coeffs_uv[1]
        csel            w11, w11, w14, ne

        ldr             w7, [x1, #FGD_AR_COEFF_SHIFT]
        neg             w15, w15 // bitdepth_min_8
        mov             w8, #1
        mov             w10, #1
        lsl             w8, w8, w7   // 1 << ar_coeff_shift
        lsl             w10, w10, w9 // 1 << (4 + data->grain_scale_shift)
        lsr             w8, w8, #1   // 1 << (ar_coeff_shift - 1)
        lsr             w10, w10, #1 // 1 << (4 + data->grain_scale_shift - 1)
        mov             w5, #128
        lsl             w5, w5, w15 // 128 << bitdepth_min_8
        neg             w6, w5      // -(128 << bitdepth_min_8)
        sub             w5, w5, #1  // (128 << bitdepth_min_8) - 1

        eor             w2, w2, w11

        br              x16

L(generate_grain_\type\()_lag0):
        AARCH64_VALID_JUMP_TARGET
        dup             v28.4s, w7
        ld1r            {v27.8b}, [x4] // ar_coeffs_uv[0]
        movi            v0.16b, #0
        movi            v1.16b, #255
        dup             v25.8h, w5
        dup             v26.8h, w6
        ext             v29.16b, v0.16b, v1.16b, #10
        ext             v30.16b, v1.16b, v0.16b, #14
        neg             v28.4s, v28.4s
        sxtl            v27.8h, v27.8b

        mov             w1, #3
        bl              generate_grain_rows_44_neon
        set_height      w1, \type
1:
        mov             v1.16b, v29.16b
        bl              gen_grain_\type\()_lag0_8_neon // 8
        movi            v1.16b, #255
        bl              gen_grain_\type\()_lag0_8_neon // 16
        bl              gen_grain_\type\()_lag0_8_neon // 24
        bl              gen_grain_\type\()_lag0_8_neon // 32
        bl              gen_grain_\type\()_lag0_8_neon // 40
        mov             v1.16b, v30.16b
        bl              gen_grain_\type\()_lag0_4_neon // 44
        subs            w1, w1, #1
        increment_y_ptr x19, \type
        add             x0, x0, #GRAIN_WIDTH*2-6*16
        b.gt            1b

        ldp             x30, x19, [sp], #96
        AARCH64_VALIDATE_LINK_REGISTER
        ret
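
// Per output sample, the lag0 path above boils down to the following pseudo-C
// sketch (the 2x2/2x1 luma averaging itself is done in the
// gen_grain_uv_420/422_lag0_*_neon helpers further up; round2() denotes a
// rounding right shift and the names are illustrative):
//
//   int luma = round2(sum_of_collocated_luma_grain, 1 + sy); // sy = 1 for 4:2:0
//   int g = round2(luma * ar_coeffs_uv[0], ar_coeff_shift) + gauss_grain;
//   uv_grain = clamp(g, grain_min, grain_max);
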
L(generate_grain_\type\()_lag1):
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v27.8b}, [x4], #1 // ar_coeffs_uv[0]
        ld1r            {v28.8b}, [x4], #1 // ar_coeffs_uv[1]
        ld1r            {v29.8b}, [x4]     // ar_coeffs_uv[2]
        add             x4, x4, #2

        mov             w1, #3
        ld1r            {v30.8b}, [x4] // ar_coeffs_uv[4]
        ldursb          w4, [x4, #-1]  // ar_coeffs_uv[3]
        bl              generate_grain_rows_44_neon

        sxtl            v27.8h, v27.8b
        sxtl            v28.8h, v28.8b
        sxtl            v29.8h, v29.8b
        sxtl            v30.8h, v30.8b
        set_height      w1, \type
1:
        bl              sum_\type\()_lag1_left_neon  // 8
        bl              sum_\type\()_lag1_mid_neon   // 16
        bl              sum_\type\()_lag1_mid_neon   // 24
        bl              sum_\type\()_lag1_mid_neon   // 32
        bl              sum_\type\()_lag1_mid_neon   // 40
        bl              sum_\type\()_lag1_right_neon // 44
        subs            w1, w1, #1
        increment_y_ptr x19, \type
        add             x0, x0, #GRAIN_WIDTH*2-6*16
        b.gt            1b

        ldp             x30, x19, [sp], #96
        AARCH64_VALIDATE_LINK_REGISTER
        ret

L(generate_grain_\type\()_lag2):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v30.16b}, [x4] // ar_coeffs_uv[0-12]

        smov            w4, v30.b[10]
        smov            w17, v30.b[11]

        mov             w1, #3
        bl              generate_grain_rows_44_neon

        set_height      w1, \type
1:
        bl              sum_\type\()_lag2_left_neon  // 8
        bl              sum_\type\()_lag2_mid_neon   // 16
        bl              sum_\type\()_lag2_mid_neon   // 24
        bl              sum_\type\()_lag2_mid_neon   // 32
        bl              sum_\type\()_lag2_mid_neon   // 40
        bl              sum_\type\()_lag2_right_neon // 44
        subs            w1, w1, #1
        increment_y_ptr x19, \type
        add             x0, x0, #GRAIN_WIDTH*2-6*16
        b.gt            1b

        ldp             x30, x19, [sp], #96
        AARCH64_VALIDATE_LINK_REGISTER
        ret

L(generate_grain_\type\()_lag3):
        AARCH64_VALID_JUMP_TARGET
        ldr             q29, [x4]      // ar_coeffs_uv[0-15]
        ldr             q30, [x4, #16] // ar_coeffs_uv[16-24]
        stp             d8, d9, [sp, #16]
        stp             d10, d11, [sp, #32]
        stp             d12, d13, [sp, #48]
        stp             d14, d15, [sp, #64]
        stp             x20, x21, [sp, #80]

        smov            w4, v30.b[5]
        smov            w20, v30.b[6]
        smov            w21, v30.b[7]

        mov             w1, #3
        bl              generate_grain_rows_44_neon

        set_height      w1, \type
1:
        bl              sum_\type\()_lag3_left_neon  // 8
        bl              sum_\type\()_lag3_mid_neon   // 16
        bl              sum_\type\()_lag3_mid_neon   // 24
        bl              sum_\type\()_lag3_mid_neon   // 32
        bl              sum_\type\()_lag3_mid_neon   // 40
        bl              sum_\type\()_lag3_right_neon // 44
        subs            w1, w1, #1
        increment_y_ptr x19, \type
        add             x0, x0, #GRAIN_WIDTH*2-6*16
        b.gt            1b

        ldp             x20, x21, [sp, #80]
        ldp             d14, d15, [sp, #64]
        ldp             d12, d13, [sp, #48]
        ldp             d10, d11, [sp, #32]
        ldp             d8, d9, [sp, #16]
        ldp             x30, x19, [sp], #96
        AARCH64_VALIDATE_LINK_REGISTER
        ret

L(gen_grain_\type\()_tbl):
        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0)
        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1)
        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2)
        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3)
endfunc
.endm

gen_grain_44 uv_420
gen_grain_44 uv_422
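
// The gather helpers below are just a vectorized LUT lookup; in C terms a
// sketch of what they do is (with "scaling" being the table whose base
// address is in x3 and "src" holding pixel values already masked/clamped to
// the valid range):
//
//   for (int i = 0; i < n; i++)
//       dst[i] = scaling[src[i]];
//
// Lanes are processed in an interleaved order so that address computation and
// loads can overlap; gather32_neon/gather16_neon return the combined result
// in v6 (and v7).
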
.macro gather_interleaved dst1, dst2, src1, src2, off
        umov            w14, \src1[0]
        umov            w15, \src2[1]
        umov            w16, \src1[2]
        add             x14, x14, x3
        umov            w17, \src2[3]
        add             x15, x15, x3
        ld1             {\dst1}[0+\off], [x14]
        umov            w14, \src1[4]
        add             x16, x16, x3
        ld1             {\dst2}[1+\off], [x15]
        umov            w15, \src2[5]
        add             x17, x17, x3
        ld1             {\dst1}[2+\off], [x16]
        umov            w16, \src1[6]
        add             x14, x14, x3
        ld1             {\dst2}[3+\off], [x17]
        umov            w17, \src2[7]
        add             x15, x15, x3
        ld1             {\dst1}[4+\off], [x14]
        add             x16, x16, x3
        ld1             {\dst2}[5+\off], [x15]
        add             x17, x17, x3
        ld1             {\dst1}[6+\off], [x16]
        ld1             {\dst2}[7+\off], [x17]
.endm

.macro gather dst1, dst2, src1, src2, src3, src4
        gather_interleaved \dst1, \dst2, \src1, \src3, 0
        gather_interleaved \dst2, \dst1, \src3, \src1, 0
        gather_interleaved \dst1, \dst2, \src2, \src4, 8
        gather_interleaved \dst2, \dst1, \src4, \src2, 8
.endm

function gather32_neon
        gather          v6.b, v7.b, v0.h, v1.h, v2.h, v3.h
        ret
endfunc

function gather16_neon
        gather_interleaved v6.b, v7.b, v0.h, v1.h, 0
        gather_interleaved v7.b, v6.b, v1.h, v0.h, 0
        ins             v6.d[1], v7.d[0]
        ret
endfunc

const overlap_coeffs_0, align=4
        .short 27, 17, 0, 0
        .short 17, 27, 32, 32
endconst

const overlap_coeffs_1, align=4
        .short 23, 0, 0, 0
        .short 22, 32, 32, 32
endconst

.macro calc_offset offx, offy, src, sx, sy
        and             \offy, \src, #0xF // randval & 0xF
        lsr             \offx, \src, #4   // randval >> 4
.if \sy == 0
        add             \offy, \offy, \offy // 2 * (randval & 0xF)
.endif
.if \sx == 0
        add             \offx, \offx, \offx // 2 * (randval >> 4)
.endif
.endm

.macro add_offset dst, offx, offy, src, stride
        madd            \dst, \stride, \offy, \src // grain_lut += grain_stride * offy
        add             \dst, \dst, \offx, uxtw #1 // grain_lut += offx
.endm

// void dav1d_fgy_32x32_16bpc_neon(pixel *const dst, const pixel *const src,
//                                 const ptrdiff_t stride,
//                                 const uint8_t scaling[SCALING_SIZE],
//                                 const int scaling_shift,
//                                 const entry grain_lut[][GRAIN_WIDTH],
//                                 const int offsets[][2],
//                                 const int h, const ptrdiff_t clip,
//                                 const ptrdiff_t type,
//                                 const int bitdepth_max);
function fgy_32x32_16bpc_neon, export=1
        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-80]!
        stp             d8, d9, [sp, #16]
        stp             d10, d11, [sp, #32]
        stp             d12, d13, [sp, #48]
        str             d14, [sp, #64]
        eor             w4, w4, #15    // 15 - scaling_shift
        ldr             w11, [x6, #8]  // offsets[1][0]
        ldr             w13, [x6, #4]  // offsets[0][1]
        ldr             w15, [x6, #12] // offsets[1][1]
        ldr             w10, [sp, #96] // bitdepth_max
        ldr             w6, [x6]       // offsets[0][0]
        dup             v26.8h, w10    // bitdepth_max
        clz             w10, w10
        ldr             w8, [sp, #80]  // clip
        sub             w10, w10, #24  // -bitdepth_min_8
        mov             x9, #GRAIN_WIDTH*2 // grain_lut stride
        neg             w10, w10       // bitdepth_min_8

        dup             v29.8h, w4  // 15 - scaling_shift
        dup             v27.8h, w10 // bitdepth_min_8

        movrel          x16, overlap_coeffs_0

        cbz             w8, 1f
        // clip
        movi            v30.8h, #16
        movi            v31.8h, #235
        sshl            v30.8h, v30.8h, v27.8h
        sshl            v31.8h, v31.8h, v27.8h
        b               2f
1:
        // no clip
        movi            v30.8h, #0
        mov             v31.16b, v26.16b // bitdepth_max
2:

        ushr            v26.8h, v26.8h, #1 // grain_max
        not             v25.16b, v26.16b   // grain_min

        ld1             {v27.4h, v28.4h}, [x16] // overlap_coeffs

        add             x5, x5, #18        // grain_lut += 9
        add             x5, x5, x9, lsl #3 // grain_lut += 8 * grain_stride
        add             x5, x5, x9         // grain_lut += grain_stride

        calc_offset     w11, w12, w11, 0, 0
        calc_offset     w13, w14, w13, 0, 0
        calc_offset     w15, w16, w15, 0, 0
        calc_offset     w6, w10, w6, 0, 0

        add_offset      x12, w11, x12, x5, x9
        add_offset      x14, w13, x14, x5, x9
        add_offset      x16, w15, x16, x5, x9
        add_offset      x5, w6, x10, x5, x9

        ldr             w11, [sp, #88] // type
        adr             x13, L(fgy_loop_tbl)

        add             x4, x12, #32*2      // grain_lut += FG_BLOCK_SIZE * bx
        add             x6, x14, x9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by

        tst             w11, #1
        ldrh            w11, [x13, w11, uxtw #1]

        add             x8, x16, x9, lsl #5 // grain_lut +=
grain_stride * FG_BLOCK_SIZE * by 1318 add x8, x8, #32*2 // grain_lut += FG_BLOCK_SIZE * bx 1319 1320 sub x11, x13, w11, uxtw 1321 1322 b.eq 1f 1323 // y overlap 1324 dup v8.8h, v27.h[0] 1325 dup v9.8h, v27.h[1] 1326 mov w10, w7 // backup actual h 1327 mov w7, #2 13281: 1329 br x11 1330endfunc 1331 1332function fgy_loop_neon 1333.macro fgy ox, oy 1334L(loop_\ox\oy): 1335 AARCH64_VALID_JUMP_TARGET 13361: 1337 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2 // src 1338.if \ox 1339 ld1 {v20.4h}, [x4], x9 // grain_lut old 1340.endif 1341.if \oy 1342 ld1 {v21.8h, v22.8h, v23.8h, v24.8h}, [x6], x9 // grain_lut top 1343.endif 1344.if \ox && \oy 1345 ld1 {v14.4h}, [x8], x9 // grain_lut top old 1346.endif 1347 mvni v4.8h, #0xf0, lsl #8 // 0x0fff 1348 ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x9 // grain_lut 1349 1350 // Make sure that uninitialized pixels out of range past the right 1351 // edge are in range; their actual values shouldn't matter. 1352 and v0.16b, v0.16b, v4.16b 1353 and v1.16b, v1.16b, v4.16b 1354 and v2.16b, v2.16b, v4.16b 1355 and v3.16b, v3.16b, v4.16b 1356 bl gather32_neon 1357 1358.if \ox 1359 smull v20.4s, v20.4h, v27.4h 1360 smlal v20.4s, v16.4h, v28.4h 1361.endif 1362 1363.if \oy 1364.if \ox 1365 smull v14.4s, v14.4h, v27.4h 1366 smlal v14.4s, v21.4h, v28.4h 1367 sqrshrn v20.4h, v20.4s, #5 1368 sqrshrn v14.4h, v14.4s, #5 1369 smin v20.4h, v20.4h, v26.4h 1370 smin v14.4h, v14.4h, v26.4h 1371 smax v20.4h, v20.4h, v25.4h 1372 smax v14.4h, v14.4h, v25.4h 1373.endif 1374 1375.if \ox 1376 smull v10.4s, v20.4h, v9.4h 1377.else 1378 smull v10.4s, v16.4h, v9.4h 1379.endif 1380 smull2 v11.4s, v16.8h, v9.8h 1381 smull v12.4s, v17.4h, v9.4h 1382 smull2 v13.4s, v17.8h, v9.8h 1383 smull v16.4s, v18.4h, v9.4h 1384 smull2 v17.4s, v18.8h, v9.8h 1385 smull v18.4s, v19.4h, v9.4h 1386 smull2 v19.4s, v19.8h, v9.8h 1387.if \ox 1388 smlal v10.4s, v14.4h, v8.4h 1389.else 1390 smlal v10.4s, v21.4h, v8.4h 1391.endif 1392 smlal2 v11.4s, v21.8h, v8.8h 1393 smlal v12.4s, v22.4h, v8.4h 1394 smlal2 v13.4s, v22.8h, v8.8h 1395 smlal v16.4s, v23.4h, v8.4h 1396 smlal2 v17.4s, v23.8h, v8.8h 1397 smlal v18.4s, v24.4h, v8.4h 1398 smlal2 v19.4s, v24.8h, v8.8h 1399 sqrshrn v10.4h, v10.4s, #5 1400 sqrshrn2 v10.8h, v11.4s, #5 1401 sqrshrn v11.4h, v12.4s, #5 1402 sqrshrn2 v11.8h, v13.4s, #5 1403 sqrshrn v12.4h, v16.4s, #5 1404 sqrshrn2 v12.8h, v17.4s, #5 1405 sqrshrn v13.4h, v18.4s, #5 1406 sqrshrn2 v13.8h, v19.4s, #5 1407 smin v16.8h, v10.8h, v26.8h 1408 smin v17.8h, v11.8h, v26.8h 1409 smin v18.8h, v12.8h, v26.8h 1410 smin v19.8h, v13.8h, v26.8h 1411 smax v16.8h, v16.8h, v25.8h 1412 smax v17.8h, v17.8h, v25.8h 1413 smax v18.8h, v18.8h, v25.8h 1414 smax v19.8h, v19.8h, v25.8h 1415.endif 1416 1417 uxtl v4.8h, v6.8b // scaling 1418.if \ox && !\oy 1419 sqrshrn v20.4h, v20.4s, #5 1420.endif 1421 uxtl2 v5.8h, v6.16b 1422.if \ox && !\oy 1423 smin v20.4h, v20.4h, v26.4h 1424.endif 1425 uxtl v6.8h, v7.8b 1426.if \ox && !\oy 1427 smax v20.4h, v20.4h, v25.4h 1428.endif 1429 uxtl2 v7.8h, v7.16b 1430.if \ox && !\oy 1431 ins v16.d[0], v20.d[0] 1432.endif 1433 ushl v4.8h, v4.8h, v29.8h // scaling << (15 - scaling_shift) 1434 ushl v5.8h, v5.8h, v29.8h 1435 ushl v6.8h, v6.8h, v29.8h 1436 ushl v7.8h, v7.8h, v29.8h 1437 1438 sqrdmulh v20.8h, v16.8h, v4.8h // round2((scaling << (15 - scaling_shift) * grain, 15) 1439 sqrdmulh v21.8h, v17.8h, v5.8h 1440 sqrdmulh v22.8h, v18.8h, v6.8h 1441 sqrdmulh v23.8h, v19.8h, v7.8h 1442 1443 usqadd v0.8h, v20.8h // *src + noise 1444 usqadd v1.8h, v21.8h 1445 usqadd v2.8h, v22.8h 1446 usqadd v3.8h, 
v23.8h 1447 1448 umax v0.8h, v0.8h, v30.8h 1449 umax v1.8h, v1.8h, v30.8h 1450 umax v2.8h, v2.8h, v30.8h 1451 umax v3.8h, v3.8h, v30.8h 1452 umin v0.8h, v0.8h, v31.8h 1453 umin v1.8h, v1.8h, v31.8h 1454 umin v2.8h, v2.8h, v31.8h 1455 umin v3.8h, v3.8h, v31.8h 1456 1457 subs w7, w7, #1 1458.if \oy 1459 dup v8.8h, v28.h[0] 1460 dup v9.8h, v28.h[1] 1461.endif 1462 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x2 // dst 1463 b.gt 1b 1464 1465.if \oy 1466 cmp w10, #2 1467 sub w7, w10, #2 // restore actual remaining h 1468 b.gt L(loop_\ox\()0) 1469.endif 1470 ldr d14, [sp, #64] 1471 ldp d12, d13, [sp, #48] 1472 ldp d10, d11, [sp, #32] 1473 ldp d8, d9, [sp, #16] 1474 ldr x30, [sp], #80 1475 AARCH64_VALIDATE_LINK_REGISTER 1476 ret 1477.endm 1478 1479 fgy 0, 0 1480 fgy 0, 1 1481 fgy 1, 0 1482 fgy 1, 1 1483 1484L(fgy_loop_tbl): 1485 .hword L(fgy_loop_tbl) - L(loop_00) 1486 .hword L(fgy_loop_tbl) - L(loop_01) 1487 .hword L(fgy_loop_tbl) - L(loop_10) 1488 .hword L(fgy_loop_tbl) - L(loop_11) 1489endfunc 1490 1491// void dav1d_fguv_32x32_420_16bpc_neon(pixel *const dst, 1492// const pixel *const src, 1493// const ptrdiff_t stride, 1494// const uint8_t scaling[SCALING_SIZE], 1495// const Dav1dFilmGrainData *const data, 1496// const entry grain_lut[][GRAIN_WIDTH], 1497// const pixel *const luma_row, 1498// const ptrdiff_t luma_stride, 1499// const int offsets[][2], 1500// const ptrdiff_t h, const ptrdiff_t uv, 1501// const ptrdiff_t is_id, 1502// const ptrdiff_t type, 1503// const int bitdepth_max); 1504.macro fguv layout, sx, sy 1505function fguv_32x32_\layout\()_16bpc_neon, export=1 1506 AARCH64_SIGN_LINK_REGISTER 1507 str x30, [sp, #-80]! 1508 stp d8, d9, [sp, #16] 1509 stp d10, d11, [sp, #32] 1510 stp d12, d13, [sp, #48] 1511 stp d14, d15, [sp, #64] 1512 1513 ldp x8, x9, [sp, #80] // offsets, h 1514 ldp x10, x11, [sp, #96] // uv, is_id 1515 ldr w16, [sp, #120] // bitdepth_max 1516 1517 ldr w13, [x4, #FGD_SCALING_SHIFT] 1518 ldr w12, [x4, #FGD_CLIP_TO_RESTRICTED_RANGE] 1519 dup v23.8h, w16 // bitdepth_max 1520 clz w16, w16 1521 eor w13, w13, #15 // 15 - scaling_shift 1522 sub w16, w16, #24 // -bitdepth_min_8 1523 1524 // !csfl 1525 add x10, x4, x10, lsl #2 // + 4*uv 1526 add x14, x10, #FGD_UV_LUMA_MULT 1527 add x15, x10, #FGD_UV_MULT 1528 add x10, x10, #FGD_UV_OFFSET 1529 neg w16, w16 // bitdepth_min_8 1530 ld1r {v8.8h}, [x14] // uv_luma_mult 1531 ld1r {v24.8h}, [x10] // uv_offset 1532 ld1r {v9.8h}, [x15] // uv_mult 1533 1534 dup v29.8h, w13 // 15 - scaling_shift 1535 dup v27.8h, w16 // bitdepth_min_8 1536 1537 cbz w12, 1f 1538 // clip 1539 movi v30.8h, #16 1540 movi v31.8h, #240 1541 sshl v30.8h, v30.8h, v27.8h 1542 sshl v31.8h, v31.8h, v27.8h 1543 cbz w11, 2f 1544 // is_id 1545 movi v31.8h, #235 1546 sshl v31.8h, v31.8h, v27.8h 1547 b 2f 15481: 1549 // no clip 1550 movi v30.8h, #0 1551 mov v31.16b, v23.16b // bitdepth_max 15522: 1553 1554 ushr v15.8h, v23.8h, #1 // grain_max 1555 sshl v24.8h, v24.8h, v27.8h // uv_offset << bitdepth_min_8 1556 not v14.16b, v15.16b // grain_min 1557 1558 ldr w12, [x8, #8] // offsets[1][0] 1559 ldr w14, [x8, #4] // offsets[0][1] 1560 ldr w16, [x8, #12] // offsets[1][1] 1561 ldr w8, [x8] // offsets[0][0] 1562 1563 mov x10, #GRAIN_WIDTH*2 // grain_lut stride 1564 1565 add x5, x5, #(2*(3 + (2 >> \sx)*3)) // grain_lut += 9 or 6 1566.if \sy 1567 add x5, x5, x10, lsl #2 // grain_lut += 4 * grain_stride 1568 add x5, x5, x10, lsl #1 // grain_lut += 2 * grain_stride 1569.else 1570 add x5, x5, x10, lsl #3 // grain_lut += 8 * grain_stride 1571 add x5, x5, x10 // grain_lut += 
grain_stride 1572.endif 1573 1574 calc_offset w12, w13, w12, \sx, \sy 1575 calc_offset w14, w15, w14, \sx, \sy 1576 calc_offset w16, w17, w16, \sx, \sy 1577 calc_offset w8, w11, w8, \sx, \sy 1578 1579 add_offset x13, w12, x13, x5, x10 1580 add_offset x15, w14, x15, x5, x10 1581 add_offset x17, w16, x17, x5, x10 1582 add_offset x5, w8, x11, x5, x10 1583 1584 add x4, x13, #2*(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx 1585 add x8, x15, x10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by 1586 add x11, x17, x10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by 1587 add x11, x11, #2*(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx 1588 1589 ldr w13, [sp, #112] // type 1590 1591 movrel x16, overlap_coeffs_\sx 1592 adr x14, L(fguv_loop_sx\sx\()_tbl) 1593 1594 ld1 {v27.4h, v28.4h}, [x16] // overlap_coeffs 1595 tst w13, #1 1596 ldrh w13, [x14, w13, uxtw #1] 1597 1598 b.eq 1f 1599 // y overlap 1600 sub w12, w9, #(2 >> \sy) // backup remaining h 1601 mov w9, #(2 >> \sy) 1602 16031: 1604 sub x13, x14, w13, uxtw 1605 1606.if \sy 1607 movi v25.8h, #23 1608 movi v26.8h, #22 1609.else 1610 movi v25.8h, #27 1611 movi v26.8h, #17 1612.endif 1613 1614.if \sy 1615 add x7, x7, x7 // luma_stride *= 2 1616.endif 1617 1618 br x13 1619endfunc 1620.endm 1621 1622fguv 420, 1, 1 1623fguv 422, 1, 0 1624fguv 444, 0, 0 1625 1626function fguv_loop_sx0_neon 1627.macro fguv_loop_sx0 csfl, ox, oy 1628L(fguv_loop_sx0_csfl\csfl\()_\ox\oy): 1629 AARCH64_VALID_JUMP_TARGET 16301: 1631.if \ox 1632 ld1 {v4.4h}, [x4], x10 // grain_lut old 1633.endif 1634.if \oy 1635 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x8], x10 // grain_lut top 1636.endif 1637.if \ox && \oy 1638 ld1 {v5.4h}, [x11], x10 // grain_lut top old 1639.endif 1640 ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x10 // grain_lut 1641 1642.if \ox 1643 smull v4.4s, v4.4h, v27.4h 1644 smlal v4.4s, v16.4h, v28.4h 1645.endif 1646 1647.if \oy 1648.if \ox 1649 smull v5.4s, v5.4h, v27.4h 1650 smlal v5.4s, v0.4h, v28.4h 1651 sqrshrn v4.4h, v4.4s, #5 1652 sqrshrn v5.4h, v5.4s, #5 1653 smin v4.4h, v4.4h, v15.4h 1654 smin v5.4h, v5.4h, v15.4h 1655 smax v4.4h, v4.4h, v14.4h 1656 smax v5.4h, v5.4h, v14.4h 1657 ins v16.d[0], v4.d[0] 1658 ins v0.d[0], v5.d[0] 1659.endif 1660 1661 smull v6.4s, v16.4h, v26.4h 1662 smull2 v7.4s, v16.8h, v26.8h 1663 smull v10.4s, v17.4h, v26.4h 1664 smull2 v11.4s, v17.8h, v26.8h 1665 smull v16.4s, v18.4h, v26.4h 1666 smull2 v17.4s, v18.8h, v26.8h 1667 smull v18.4s, v19.4h, v26.4h 1668 smull2 v19.4s, v19.8h, v26.8h 1669 smlal v6.4s, v0.4h, v25.4h 1670 smlal2 v7.4s, v0.8h, v25.8h 1671 smlal v10.4s, v1.4h, v25.4h 1672 smlal2 v11.4s, v1.8h, v25.8h 1673 smlal v16.4s, v2.4h, v25.4h 1674 smlal2 v17.4s, v2.8h, v25.8h 1675 smlal v18.4s, v3.4h, v25.4h 1676 smlal2 v19.4s, v3.8h, v25.8h 1677 sqrshrn v6.4h, v6.4s, #5 1678 sqrshrn2 v6.8h, v7.4s, #5 1679 sqrshrn v7.4h, v10.4s, #5 1680 sqrshrn2 v7.8h, v11.4s, #5 1681 sqrshrn v10.4h, v16.4s, #5 1682 sqrshrn2 v10.8h, v17.4s, #5 1683 sqrshrn v11.4h, v18.4s, #5 1684 sqrshrn2 v11.8h, v19.4s, #5 1685.endif 1686 1687.if \ox && !\oy 1688 sqrshrn v4.4h, v4.4s, #5 1689 smin v4.4h, v4.4h, v15.4h 1690.endif 1691 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7 // luma 1692.if \oy 1693 smin v16.8h, v6.8h, v15.8h 1694 smin v17.8h, v7.8h, v15.8h 1695 smin v18.8h, v10.8h, v15.8h 1696 smin v19.8h, v11.8h, v15.8h 1697 smax v16.8h, v16.8h, v14.8h 1698 smax v17.8h, v17.8h, v14.8h 1699 smax v18.8h, v18.8h, v14.8h 1700 smax v19.8h, v19.8h, v14.8h 1701.endif 1702 1703.if \ox && !\oy 1704 smax v4.4h, v4.4h, v14.4h 1705.endif 
1706 ld1 {v10.8h, v11.8h, v12.8h, v13.8h}, [x1], x2 // src 1707.if \ox && !\oy 1708 ins v16.d[0], v4.d[0] 1709.endif 1710 1711.if !\csfl 1712 smull v4.4s, v0.4h, v8.4h 1713 smull2 v5.4s, v0.8h, v8.8h 1714 smull v6.4s, v1.4h, v8.4h 1715 smull2 v7.4s, v1.8h, v8.8h 1716 smull v0.4s, v2.4h, v8.4h 1717 smull2 v1.4s, v2.8h, v8.8h 1718 smull v2.4s, v3.4h, v8.4h 1719 smull2 v3.4s, v3.8h, v8.8h 1720 smlal v4.4s, v10.4h, v9.4h 1721 smlal2 v5.4s, v10.8h, v9.8h 1722 smlal v6.4s, v11.4h, v9.4h 1723 smlal2 v7.4s, v11.8h, v9.8h 1724 smlal v0.4s, v12.4h, v9.4h 1725 smlal2 v1.4s, v12.8h, v9.8h 1726 smlal v2.4s, v13.4h, v9.4h 1727 smlal2 v3.4s, v13.8h, v9.8h 1728 shrn v4.4h, v4.4s, #6 1729 shrn2 v4.8h, v5.4s, #6 1730 shrn v5.4h, v6.4s, #6 1731 shrn2 v5.8h, v7.4s, #6 1732 shrn v6.4h, v0.4s, #6 1733 shrn2 v6.8h, v1.4s, #6 1734 shrn v7.4h, v2.4s, #6 1735 shrn2 v7.8h, v3.4s, #6 1736 add v0.8h, v4.8h, v24.8h 1737 add v1.8h, v5.8h, v24.8h 1738 add v2.8h, v6.8h, v24.8h 1739 add v3.8h, v7.8h, v24.8h 1740 movi v20.8h, #0 1741 smin v0.8h, v0.8h, v23.8h 1742 smin v1.8h, v1.8h, v23.8h 1743 smin v2.8h, v2.8h, v23.8h 1744 smin v3.8h, v3.8h, v23.8h 1745 smax v0.8h, v0.8h, v20.8h 1746 smax v1.8h, v1.8h, v20.8h 1747 smax v2.8h, v2.8h, v20.8h 1748 smax v3.8h, v3.8h, v20.8h 1749.else 1750 // Make sure that uninitialized pixels out of range past the right 1751 // edge are in range; their actual values shouldn't matter. 1752 and v0.16b, v0.16b, v23.16b 1753 and v1.16b, v1.16b, v23.16b 1754 and v2.16b, v2.16b, v23.16b 1755 and v3.16b, v3.16b, v23.16b 1756.endif 1757 1758 bl gather32_neon 1759 1760 uxtl v4.8h, v6.8b // scaling 1761 uxtl2 v5.8h, v6.16b 1762 uxtl v6.8h, v7.8b 1763 uxtl2 v7.8h, v7.16b 1764 1765 ushl v4.8h, v4.8h, v29.8h // scaling << (15 - scaling_shift) 1766 ushl v5.8h, v5.8h, v29.8h 1767 ushl v6.8h, v6.8h, v29.8h 1768 ushl v7.8h, v7.8h, v29.8h 1769 1770 sqrdmulh v16.8h, v16.8h, v4.8h // round2((scaling << (15 - scaling_shift) * grain, 15) 1771 sqrdmulh v17.8h, v17.8h, v5.8h 1772 sqrdmulh v18.8h, v18.8h, v6.8h 1773 sqrdmulh v19.8h, v19.8h, v7.8h 1774 1775 usqadd v10.8h, v16.8h // *src + noise 1776 usqadd v11.8h, v17.8h 1777 usqadd v12.8h, v18.8h 1778 usqadd v13.8h, v19.8h 1779 1780 umax v0.8h, v10.8h, v30.8h 1781 umax v1.8h, v11.8h, v30.8h 1782 umax v2.8h, v12.8h, v30.8h 1783 umax v3.8h, v13.8h, v30.8h 1784 umin v0.8h, v0.8h, v31.8h 1785 umin v1.8h, v1.8h, v31.8h 1786 umin v2.8h, v2.8h, v31.8h 1787 umin v3.8h, v3.8h, v31.8h 1788 1789 subs w9, w9, #1 1790.if \oy 1791 dup v25.8h, v28.h[0] 1792 dup v26.8h, v28.h[1] 1793.endif 1794 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x2 // dst 1795 b.gt 1b 1796 1797.if \oy 1798 cmp w12, #0 1799 mov w9, w12 // restore actual remaining h 1800 b.gt L(fguv_loop_sx0_csfl\csfl\()_\ox\()0) 1801.endif 1802 b 9f 1803.endm 1804 fguv_loop_sx0 0, 0, 0 1805 fguv_loop_sx0 0, 0, 1 1806 fguv_loop_sx0 0, 1, 0 1807 fguv_loop_sx0 0, 1, 1 1808 fguv_loop_sx0 1, 0, 0 1809 fguv_loop_sx0 1, 0, 1 1810 fguv_loop_sx0 1, 1, 0 1811 fguv_loop_sx0 1, 1, 1 1812 18139: 1814 ldp d14, d15, [sp, #64] 1815 ldp d12, d13, [sp, #48] 1816 ldp d10, d11, [sp, #32] 1817 ldp d8, d9, [sp, #16] 1818 ldr x30, [sp], #80 1819 AARCH64_VALIDATE_LINK_REGISTER 1820 ret 1821 1822L(fguv_loop_sx0_tbl): 1823 .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_00) 1824 .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_01) 1825 .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_10) 1826 .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_11) 1827 .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_00) 1828 .hword L(fguv_loop_sx0_tbl) - 
L(fguv_loop_sx0_csfl1_01) 1829 .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_10) 1830 .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_11) 1831endfunc 1832 1833function fguv_loop_sx1_neon 1834.macro fguv_loop_sx1 csfl, ox, oy 1835L(fguv_loop_sx1_csfl\csfl\()_\ox\oy): 1836 AARCH64_VALID_JUMP_TARGET 18371: 1838.if \ox 1839 ld1 {v18.4h}, [x4], x10 // grain_lut old 1840.endif 1841.if \oy 1842 ld1 {v20.8h, v21.8h}, [x8], x10 // grain_lut top 1843.endif 1844.if \ox && \oy 1845 ld1 {v19.4h}, [x11], x10 // grain_lut top old 1846.endif 1847 ld1 {v16.8h, v17.8h}, [x5], x10 // grain_lut 1848 1849.if \ox 1850 smull v18.4s, v18.4h, v27.4h 1851 smlal v18.4s, v16.4h, v28.4h 1852.endif 1853 1854.if \oy 1855.if \ox 1856 smull v19.4s, v19.4h, v27.4h 1857 smlal v19.4s, v20.4h, v28.4h 1858 sqrshrn v18.4h, v18.4s, #5 1859 sqrshrn v19.4h, v19.4s, #5 1860 smin v18.4h, v18.4h, v15.4h 1861 smin v19.4h, v19.4h, v15.4h 1862 smax v18.4h, v18.4h, v14.4h 1863 smax v19.4h, v19.4h, v14.4h 1864 ins v16.d[0], v18.d[0] 1865 ins v20.d[0], v19.d[0] 1866.endif 1867 1868 smull v0.4s, v16.4h, v26.4h 1869 smull2 v1.4s, v16.8h, v26.8h 1870 smull v2.4s, v17.4h, v26.4h 1871 smull2 v3.4s, v17.8h, v26.8h 1872 smlal v0.4s, v20.4h, v25.4h 1873 smlal2 v1.4s, v20.8h, v25.8h 1874 smlal v2.4s, v21.4h, v25.4h 1875 smlal2 v3.4s, v21.8h, v25.8h 1876 sqrshrn v16.4h, v0.4s, #5 1877 sqrshrn2 v16.8h, v1.4s, #5 1878 sqrshrn v17.4h, v2.4s, #5 1879 sqrshrn2 v17.8h, v3.4s, #5 1880.endif 1881 1882.if \ox && !\oy 1883 sqrshrn v18.4h, v18.4s, #5 1884 smin v18.4h, v18.4h, v15.4h 1885.endif 1886 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7 // luma 1887.if \oy 1888 smin v16.8h, v16.8h, v15.8h 1889 smin v17.8h, v17.8h, v15.8h 1890 smax v16.8h, v16.8h, v14.8h 1891 smax v17.8h, v17.8h, v14.8h 1892.endif 1893 1894.if \ox && !\oy 1895 smax v18.4h, v18.4h, v14.4h 1896.endif 1897 ld1 {v10.8h, v11.8h}, [x1], x2 // src 1898.if \ox && !\oy 1899 ins v16.d[0], v18.d[0] 1900.endif 1901 addp v0.8h, v0.8h, v1.8h 1902 addp v1.8h, v2.8h, v3.8h 1903 urshr v0.8h, v0.8h, #1 1904 urshr v1.8h, v1.8h, #1 1905.if !\csfl 1906 smull v2.4s, v0.4h, v8.4h 1907 smull2 v3.4s, v0.8h, v8.8h 1908 smull v0.4s, v1.4h, v8.4h 1909 smull2 v1.4s, v1.8h, v8.8h 1910 smlal v2.4s, v10.4h, v9.4h 1911 smlal2 v3.4s, v10.8h, v9.8h 1912 smlal v0.4s, v11.4h, v9.4h 1913 smlal2 v1.4s, v11.8h, v9.8h 1914 shrn v2.4h, v2.4s, #6 1915 shrn2 v2.8h, v3.4s, #6 1916 shrn v3.4h, v0.4s, #6 1917 shrn2 v3.8h, v1.4s, #6 1918 add v0.8h, v2.8h, v24.8h 1919 add v1.8h, v3.8h, v24.8h 1920 movi v2.8h, #0 1921 smin v0.8h, v0.8h, v23.8h 1922 smin v1.8h, v1.8h, v23.8h 1923 smax v0.8h, v0.8h, v2.8h 1924 smax v1.8h, v1.8h, v2.8h 1925.else 1926 // Make sure that uninitialized pixels out of range past the right 1927 // edge are in range; their actual values shouldn't matter. 
1928 and v0.16b, v0.16b, v23.16b 1929 and v1.16b, v1.16b, v23.16b 1930.endif 1931 1932 bl gather16_neon 1933 1934 uxtl v4.8h, v6.8b // scaling 1935 uxtl2 v5.8h, v6.16b 1936 1937 ushl v4.8h, v4.8h, v29.8h // scaling << (15 - scaling_shift) 1938 ushl v5.8h, v5.8h, v29.8h 1939 1940 sqrdmulh v16.8h, v16.8h, v4.8h // round2((scaling << (15 - scaling_shift) * grain, 15) 1941 sqrdmulh v17.8h, v17.8h, v5.8h 1942 1943 usqadd v10.8h, v16.8h // *src + noise 1944 usqadd v11.8h, v17.8h 1945 1946 umax v0.8h, v10.8h, v30.8h 1947 umax v1.8h, v11.8h, v30.8h 1948 umin v0.8h, v0.8h, v31.8h 1949 umin v1.8h, v1.8h, v31.8h 1950 1951.if \oy 1952 mov v16.16b, v25.16b 1953.endif 1954 subs w9, w9, #1 1955.if \oy 1956 mov v25.16b, v26.16b 1957 mov v26.16b, v16.16b 1958.endif 1959 st1 {v0.8h, v1.8h}, [x0], x2 // dst 1960 b.gt 1b 1961 1962.if \oy 1963 cmp w12, #0 1964 mov w9, w12 // restore actual remaining h 1965 b.gt L(fguv_loop_sx1_csfl\csfl\()_\ox\()0) 1966.endif 1967 1968 b 9f 1969.endm 1970 fguv_loop_sx1 0, 0, 0 1971 fguv_loop_sx1 0, 0, 1 1972 fguv_loop_sx1 0, 1, 0 1973 fguv_loop_sx1 0, 1, 1 1974 fguv_loop_sx1 1, 0, 0 1975 fguv_loop_sx1 1, 0, 1 1976 fguv_loop_sx1 1, 1, 0 1977 fguv_loop_sx1 1, 1, 1 1978 19799: 1980 ldp d14, d15, [sp, #64] 1981 ldp d12, d13, [sp, #48] 1982 ldp d10, d11, [sp, #32] 1983 ldp d8, d9, [sp, #16] 1984 ldr x30, [sp], #80 1985 AARCH64_VALIDATE_LINK_REGISTER 1986 ret 1987 1988L(fguv_loop_sx1_tbl): 1989 .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_00) 1990 .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_01) 1991 .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_10) 1992 .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_11) 1993 .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_00) 1994 .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_01) 1995 .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_10) 1996 .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_11) 1997endfunc 1998