1/* 2 * Copyright © 2021, VideoLAN and dav1d authors 3 * Copyright © 2021, Martin Storsjo 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions are met: 8 * 9 * 1. Redistributions of source code must retain the above copyright notice, this 10 * list of conditions and the following disclaimer. 11 * 12 * 2. Redistributions in binary form must reproduce the above copyright notice, 13 * this list of conditions and the following disclaimer in the documentation 14 * and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 */

#include "src/arm/asm.S"
#include "util.S"
#include "src/arm/asm-offsets.h"

// Luma grain LUT is 82x73; chroma subsampled variants use a 44x38 LUT.
#define GRAIN_WIDTH 82
#define GRAIN_HEIGHT 73

#define SUB_GRAIN_WIDTH 44
#define SUB_GRAIN_HEIGHT 38

// Advance the pseudo-random state in w2 by \steps bits of the
// x^16 + x^13 + x^12 + x^10 + 1 style LFSR update visible below
// ((r>>0)^(r>>3)^(r>>12)^(r>>1) feedback). With shift=0 the new bits are
// inserted without shifting the state down (callers shift later).
// Clobbers w11-w13.
.macro increment_seed steps, shift=1
        lsr             w11, w2,  #3
        lsr             w12, w2,  #12
        lsr             w13, w2,  #1
        eor             w11, w2,  w11                      // (r >> 0) ^ (r >> 3)
        eor             w12, w12, w13                      // (r >> 12) ^ (r >> 1)
        eor             w11, w11, w12                      // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1)
.if \shift
        lsr             w2,  w2,  #\steps
.endif
        and             w11, w11, #((1 << \steps) - 1)     // bit
.if \shift
        orr             w2,  w2,  w11, lsl #(16 - \steps)  // *state
.else
        orr             w2,  w2,  w11, lsl #16             // *state
.endif
.endm

// Extract \bits bits from the state in x2, \age steps back from the most
// recently generated bits, without modifying the state.
.macro read_rand dest, bits, age
        ubfx            \dest, x2, #16 - \bits - \age, #\bits
.endm

// Extract \bits bits from the state in x2 and consume one state bit.
.macro read_shift_rand dest, bits
        ubfx            \dest, x2, #17 - \bits, #\bits
        lsr             w2, w2, #1
.endm

// special calling convention:
// w2 holds seed
// x3 holds dav1d_gaussian_sequence
// clobbers x11-x15
// returns in v0.8h
// Produces 8 gaussian-sequence samples (8 x 11-bit table lookups driven by
// the LFSR state) into v0.8h.
function get_gaussian_neon
        increment_seed  4
        read_rand       x14, 11, 3
        read_rand       x15, 11, 2
        add             x14, x3, x14, lsl #1
        add             x15, x3, x15, lsl #1
        ld1             {v0.h}[0], [x14]
        read_rand       x14, 11, 1
        ld1             {v0.h}[1], [x15]
        add             x14, x3, x14, lsl #1
        read_rand       x15, 11, 0
        increment_seed  4
        add             x15, x3, x15, lsl #1
        ld1             {v0.h}[2], [x14]
        read_rand       x14, 11, 3
        ld1             {v0.h}[3], [x15]
        add             x14, x3, x14, lsl #1
        read_rand       x15, 11, 2
        ld1             {v0.h}[4], [x14]
        add             x15, x3, x15, lsl #1
        read_rand       x14, 11, 1
        ld1             {v0.h}[5], [x15]
        read_rand       x15, 11, 0
        add             x14, x3, x14, lsl #1
        add             x15, x3, x15, lsl #1
        ld1             {v0.h}[6], [x14]
        ld1             {v0.h}[7], [x15]
        ret
endfunc

// Fill one full-width grain row (82 = 5*16 + 2 entries): ten 8-sample
// get_gaussian_neon batches, rounded-shifted by v31 and narrowed to bytes
// into \r0-\r4, plus 2 final entries into \r5. \r5 is also used as scratch.
.macro get_grain_row r0, r1, r2, r3, r4, r5
        bl              get_gaussian_neon
        srshl           \r5\().8h, v0.8h, v31.8h
        xtn             \r0\().8b, \r5\().8h
        bl              get_gaussian_neon
        srshl           \r5\().8h, v0.8h, v31.8h
        xtn2            \r0\().16b, \r5\().8h
        bl              get_gaussian_neon
        srshl           \r5\().8h, v0.8h, v31.8h
        xtn             \r1\().8b, \r5\().8h
        bl              get_gaussian_neon
        srshl           \r5\().8h, v0.8h, v31.8h
        xtn2            \r1\().16b, \r5\().8h
        bl              get_gaussian_neon
        srshl           \r5\().8h, v0.8h, v31.8h
        xtn             \r2\().8b, \r5\().8h
        bl              get_gaussian_neon
        srshl           \r5\().8h, v0.8h, v31.8h
        xtn2            \r2\().16b, \r5\().8h
        bl              get_gaussian_neon
        srshl           \r5\().8h, v0.8h, v31.8h
        xtn             \r3\().8b, \r5\().8h
        bl              get_gaussian_neon
        srshl           \r5\().8h, v0.8h, v31.8h
        xtn2            \r3\().16b, \r5\().8h
        bl              get_gaussian_neon
        srshl           \r5\().8h, v0.8h, v31.8h
        xtn             \r4\().8b, \r5\().8h
        bl              get_gaussian_neon
        srshl           \r5\().8h, v0.8h, v31.8h
        xtn2            \r4\().16b, \r5\().8h
        increment_seed  2
        read_rand       x14, 11, 1
        read_rand       x15, 11, 0
        add             x14, x3, x14, lsl #1
        add             x15, x3, x15, lsl #1
        ld1             {\r5\().h}[0], [x14]
        ld1             {\r5\().h}[1], [x15]
        srshl           v0.4h, \r5\().4h, v31.4h
        xtn             \r5\().8b, v0.8h
.endm

// Store an 82-entry row (16+16+16+16+16 bytes + one 2-byte tail) at x0,
// advancing x0 past the row.
.macro store_grain_row r0, r1, r2, r3, r4, r5
        st1             {\r0\().16b,\r1\().16b}, [x0], #32
        st1             {\r2\().16b,\r3\().16b}, [x0], #32
        st1             {\r4\().16b}, [x0], #16
        st1             {\r5\().h}[0], [x0], #2
.endm

// Fill one subsampled-width grain row (44 = 2*16 + 8 + 4 entries) into
// \r0, \r1 and \r2. \r2 doubles as scratch for the first five batches.
.macro get_grain_row_44 r0, r1, r2
        bl              get_gaussian_neon
        srshl           \r2\().8h, v0.8h, v31.8h
        xtn             \r0\().8b, \r2\().8h
        bl              get_gaussian_neon
        srshl           \r2\().8h, v0.8h, v31.8h
        xtn2            \r0\().16b, \r2\().8h
        bl              get_gaussian_neon
        srshl           \r2\().8h, v0.8h, v31.8h
        xtn             \r1\().8b, \r2\().8h
        bl              get_gaussian_neon
        srshl           \r2\().8h, v0.8h, v31.8h
        xtn2            \r1\().16b, \r2\().8h
        bl              get_gaussian_neon
        srshl           \r2\().8h, v0.8h, v31.8h
        xtn             \r2\().8b, \r2\().8h

        increment_seed  4
        read_rand       x14, 11, 3
        read_rand       x15, 11, 2
        add             x14, x3, x14, lsl #1
        add             x15, x3, x15, lsl #1
        ld1             {v0.h}[0], [x14]
        read_rand       x14, 11, 1
        ld1             {v0.h}[1], [x15]
        read_rand       x15, 11, 0
        add             x14, x3, x14, lsl #1
        add             x15, x3, x15, lsl #1
        ld1             {v0.h}[2], [x14]
        ld1             {v0.h}[3], [x15]
        srshl           v0.4h, v0.4h, v31.4h
        xtn2            \r2\().16b, v0.8h
.endm

// Store a 44-entry row at x0 and step x0 to the next row
// (GRAIN_WIDTH stride; rows of the 44-wide LUT live in an 82-wide buffer).
.macro store_grain_row_44 r0, r1, r2
        st1             {\r0\().16b,\r1\().16b}, [x0], #32
        st1             {\r2\().16b}, [x0]
        add             x0, x0, #GRAIN_WIDTH-32
.endm

// Produce 2 grain values (same convention as get_gaussian_neon);
// result in v0.8b lanes 0-1.
function get_grain_2_neon
        increment_seed  2
        read_rand       x14, 11, 1
        read_rand       x15, 11, 0
        add             x14, x3, x14, lsl #1
        add             x15, x3, x15, lsl #1
        ld1             {v0.h}[0], [x14]
        ld1             {v0.h}[1], [x15]
        srshl           v0.4h, v0.4h, v31.4h
        xtn             v0.8b, v0.8h
        ret
endfunc

.macro get_grain_2 dst
        bl              get_grain_2_neon
.ifnc \dst, v0
        mov             \dst\().8b, v0.8b
.endif
.endm

// Serial AR filter tail: combine the vectorized "above" sums with the
// just-produced left neighbors, which must be computed one output at a time.
// w15 holds the number of entries to produce
// w14, w16 and w17 hold the previous output entries
// v0 holds the vector of produced entries
// v1 holds the input vector of sums from above
// Also uses: w4 (and w20/w21 for lag3) as AR coefficients, w7/w8 rounding for
// ar_coeff_shift, w9/w10 rounding for grain_scale_shift, w5/w6 clip limits.
.macro output_lag n
function output_lag\n\()_neon
1:
        read_shift_rand x13, 11
        mov             w11, v1.s[0]
        ldrsh           w12, [x3, x13, lsl #1]
        ext             v0.16b, v0.16b, v0.16b, #1
.if \n == 1
        madd            w11, w14, w4, w11        // sum (above) + *coeff * prev output
.elseif \n == 2
        madd            w11, w16, w4, w11        // sum (above) + *coeff * prev output 1
        madd            w11, w14, w17, w11       // += *coeff * prev output 2
        mov             w16, w14
.else
        madd            w11, w17, w4, w11        // sum (above) + *coeff * prev output 1
        madd            w11, w16, w20, w11       // sum (above) + *coeff * prev output 2
        madd            w11, w14, w21, w11       // += *coeff * prev output 3
        mov             w17, w16
        mov             w16, w14
.endif
        add             w14, w11, w8             // 1 << (ar_coeff_shift - 1)
        add             w12, w12, w10            // 1 << (4 + grain_scale_shift - 1)
        asr             w14, w14, w7             // >> ar_coeff_shift
        asr             w12, w12, w9             // >> (4 + grain_scale_shift)
        add             w14, w14, w12
        cmp             w14, w5
        csel            w14, w14, w5, le         // clamp to max (127)
        cmp             w14, w6
        csel            w14, w14, w6, ge         // clamp to min (-128)
        subs            w15, w15, #1
        ext             v1.16b, v1.16b, v1.16b, #4
        ins             v0.b[15], w14
        b.gt            1b
        ret
endfunc
.endm

output_lag 1
output_lag 2
output_lag 3


// Lag-1 weighted sum of the row above: v0/v3/v1 hold the left-shifted, mid
// and right-shifted copies of the previous row; v27/v28/v29 hold the splatted
// AR coefficients. Accumulates 16 lanes of 32-bit sums into v4-v7.
function sum_lag1_above_neon
        smull           v2.8h, v3.8b,  v28.8b
        smull2          v3.8h, v3.16b, v28.16b
        smull           v4.8h, v0.8b,  v27.8b
        smull2          v5.8h, v0.16b, v27.16b
        smull           v6.8h, v1.8b,  v29.8b
        smull2          v7.8h, v1.16b, v29.16b
        saddl           v0.4s, v2.4h,  v4.4h
        saddl2          v1.4s, v2.8h,  v4.8h
        saddl           v2.4s, v3.4h,  v5.4h
        saddl2          v3.4s, v3.8h,  v5.8h
        saddw           v4.4s, v0.4s,  v6.4h
        saddw2          v5.4s, v1.4s,  v6.8h
        saddw           v6.4s, v2.4s,  v7.4h
        saddw2          v7.4s, v3.4s,  v7.8h
        ret
endfunc

// Shared body for all sum_*_lag{1,2,3}_{left,mid,right} functions:
// 1) sum the rows above (v4-v7), 2) for chroma, add the (possibly
// downsampled) collocated luma scaled by the uv coefficient, 3) run the
// serial output filter in groups of 4 via output_lag\n_neon.
// For edge==left, 3 fresh gaussian entries seed v0's top lanes; for
// elems==9/15 (right edges) the tail entries come straight from the table.
// Identical tails are shared by branching to another variant's _start label.
.macro sum_lag_n_body lag, type, uv_layout, edge, elems, store, uv_coeff
        bl              sum_\lag\()_above_neon
.ifc \type, uv_420
        // 2x2 box average of the collocated luma (x19) into v0
        add             x12, x19, #GRAIN_WIDTH
        ld1             {v22.16b, v23.16b}, [x19], #32
        ld1             {v24.16b, v25.16b}, [x12]
        saddlp          v22.8h, v22.16b
        saddlp          v23.8h, v23.16b
        saddlp          v24.8h, v24.16b
        saddlp          v25.8h, v25.16b
        add             v22.8h, v22.8h, v24.8h
        add             v23.8h, v23.8h, v25.8h
        rshrn           v0.8b,  v22.8h, #2
        rshrn2          v0.16b, v23.8h, #2
.endif
.ifc \type, uv_422
        // horizontal pair average of the collocated luma into v0
        ld1             {v22.16b, v23.16b}, [x19], #32
        saddlp          v22.8h, v22.16b
        saddlp          v23.8h, v23.16b
        rshrn           v0.8b,  v22.8h, #1
        rshrn2          v0.16b, v23.8h, #1
.endif
.ifc \type, uv_444
        ld1             {v0.16b}, [x19], #16
.endif
.if \uv_layout
.ifnb \uv_coeff
        dup             v1.16b, \uv_coeff
        smull           v2.8h,  v0.8b,  v1.8b
        smull2          v3.8h,  v0.16b, v1.16b
.else
        smull           v2.8h,  v0.8b,  v30.8b
        smull2          v3.8h,  v0.16b, v30.16b
.endif
        saddw           v4.4s,  v4.4s,  v2.4h
        saddw2          v5.4s,  v5.4s,  v2.8h
        saddw           v6.4s,  v6.4s,  v3.4h
        saddw2          v7.4s,  v7.4s,  v3.8h
.endif
.if \uv_layout && \elems == 16
        b               sum_\lag\()_y_\edge\()_start
.elseif \uv_layout == 444 && \elems == 15
        b               sum_\lag\()_y_\edge\()_start
.elseif \uv_layout == 422 && \elems == 9
        b               sum_\lag\()_uv_420_\edge\()_start
.else
sum_\lag\()_\type\()_\edge\()_start:
.ifc \edge, left
        increment_seed  4
        read_rand       x12, 11, 3
        read_rand       x13, 11, 2
        read_rand       x14, 11, 1
        add             x12, x3, x12, lsl #1
        add             x13, x3, x13, lsl #1
        add             x14, x3, x14, lsl #1
        ld1             {v0.h}[5], [x12]
        ld1             {v0.h}[6], [x13]
        ld1             {v0.h}[7], [x14]
        lsl             x2, x2, #1 // shift back the state as if we'd done increment_seed with shift=0
        srshl           v0.8h, v0.8h, v31.8h
        xtn2            v0.16b, v0.8h
        ext             v4.16b, v4.16b, v4.16b, #12
.ifc \lag, lag3
        smov            w17, v0.b[13]
.endif
.ifnc \lag, lag1
        smov            w16, v0.b[14]
.endif
        smov            w14, v0.b[15]

        mov             v1.16b, v4.16b
        mov             w15, #1
        bl              output_\lag\()_neon
.else
        increment_seed  4, shift=0
        mov             v1.16b, v4.16b
        mov             w15, #4
        bl              output_\lag\()_neon
.endif

        increment_seed  4, shift=0
        mov             v1.16b, v5.16b
        mov             w15, #4
        bl              output_\lag\()_neon

        increment_seed  4, shift=0
        mov             v1.16b, v6.16b
.if \elems == 9
        mov             w15, #1
        bl              output_\lag\()_neon
        lsr             w2, w2, #3

        read_rand       x12, 11, 2
        read_rand       x13, 11, 1
        read_rand       x14, 11, 0
        add             x12, x3, x12, lsl #1
        add             x13, x3, x13, lsl #1
        add             x14, x3, x14, lsl #1
        ld1             {v1.h}[0], [x12]
        ld1             {v1.h}[1], [x13]
        ld1             {v1.h}[2], [x14]
        srshl           v1.4h, v1.4h, v31.4h
        xtn             v1.8b, v1.8h
        ext             v0.16b, v0.16b, v1.16b, #7
.else
        mov             w15, #4
        bl              output_\lag\()_neon

        increment_seed  4, shift=0
        mov             v1.16b, v7.16b

.ifc \edge, right
        mov             w15, #3
        bl              output_\lag\()_neon
        read_shift_rand x15, 11
        add             x15, x3, x15, lsl #1
        ld1             {v1.h}[0], [x15]
        srshl           v1.4h, v1.4h, v31.4h
        ext             v0.16b, v0.16b, v1.16b, #1
.else
        mov             w15, #4
        bl              output_\lag\()_neon
.endif
.endif
.if \store
        st1             {v0.16b}, [x0], #16
.endif
        ldr             x30, [sp], #16
        AARCH64_VALIDATE_LINK_REGISTER
        ret
.endif
.endm

// Instantiate one lag-1 AR-sum function; lag1 callers pass the above-row
// slices in v0/v3/v1 themselves (see the sum_lag1 macro), so no store here.
.macro sum_lag1_func type, uv_layout, edge, elems=16
function sum_\type\()_lag1_\edge\()_neon
        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-16]!
        sum_lag_n_body  lag1, \type, \uv_layout, \edge, \elems, store=0
endfunc
.endm

sum_lag1_func y,      0,   left
sum_lag1_func y,      0,   mid
sum_lag1_func y,      0,   right, 15
sum_lag1_func uv_444, 444, left
sum_lag1_func uv_444, 444, mid
sum_lag1_func uv_444, 444, right, 15
sum_lag1_func uv_422, 422, left
sum_lag1_func uv_422, 422, mid
sum_lag1_func uv_422, 422, right, 9
sum_lag1_func uv_420, 420, left
sum_lag1_func uv_420, 420, mid
sum_lag1_func uv_420, 420, right, 9

// Build the shifted above-row windows (left/mid/right by one byte) and run
// one 16-entry lag-1 step; result lands in \dst.
.macro sum_lag1 type, dst, left, mid, right, edge=mid
        mov             v3.16b, \mid\().16b
        ext             v0.16b, \left\().16b, \mid\().16b, #15
        ext             v1.16b, \mid\().16b, \right\().16b, #1
        bl              sum_\type\()_lag1_\edge\()_neon
        mov             \dst\().16b, v0.16b
.endm

.macro sum_y_lag1 dst, left, mid, right, edge=mid
        sum_lag1        y, \dst, \left, \mid, \right, \edge
.endm

.macro sum_uv_444_lag1 dst, left, mid, right, edge=mid
        sum_lag1        uv_444, \dst, \left, \mid, \right, \edge
.endm

.macro sum_uv_422_lag1 dst, left, mid, right, edge=mid
        sum_lag1        uv_422, \dst, \left, \mid, \right, \edge
.endm

.macro sum_uv_420_lag1 dst, left, mid, right, edge=mid
        sum_lag1        uv_420, \dst, \left, \mid, \right, \edge
.endm


// Lag-2 weighted sum of the two rows above the output pointer x0.
// v16/v17 and v19/v20 hold the sliding previous/current 16-byte windows of
// the two rows; the next windows are loaded from x0 - {2,1}*GRAIN_WIDTH + 16.
// Coefficients come from v30.b[0..9]; 32-bit sums accumulate in v4-v7 and
// the row windows are shifted along for the next call.
function sum_lag2_above_neon
        sub             x12, x0, #2*GRAIN_WIDTH - 16
        sub             x13, x0, #1*GRAIN_WIDTH - 16
        ld1             {v18.16b}, [x12] // load top right
        ld1             {v21.16b}, [x13]

        ext             v22.16b, v16.16b, v17.16b, #14 // top left, top mid
        dup             v26.16b, v30.b[0]
        ext             v23.16b, v16.16b, v17.16b, #15
        dup             v27.16b, v30.b[1]
        ext             v0.16b,  v17.16b, v18.16b, #1  // top mid, top right
        dup             v28.16b, v30.b[3]
        ext             v1.16b,  v17.16b, v18.16b, #2
        dup             v29.16b, v30.b[4]

        smull           v2.8h,  v22.8b,  v26.8b
        smull2          v3.8h,  v22.16b, v26.16b
        smull           v4.8h,  v23.8b,  v27.8b
        smull2          v5.8h,  v23.16b, v27.16b
        smull           v6.8h,  v0.8b,   v28.8b
        smull2          v7.8h,  v0.16b,  v28.16b
        smull           v0.8h,  v1.8b,   v29.8b
        smull2          v1.8h,  v1.16b,  v29.16b
        saddl           v22.4s, v2.4h,   v4.4h
        saddl2          v23.4s, v2.8h,   v4.8h
        saddl           v26.4s, v3.4h,   v5.4h
        saddl2          v27.4s, v3.8h,   v5.8h
        saddl           v2.4s,  v0.4h,   v6.4h
        saddl2          v3.4s,  v0.8h,   v6.8h
        saddl           v6.4s,  v1.4h,   v7.4h
        saddl2          v7.4s,  v1.8h,   v7.8h
        add             v4.4s,  v22.4s,  v2.4s
        add             v5.4s,  v23.4s,  v3.4s
        add             v6.4s,  v26.4s,  v6.4s
        add             v7.4s,  v27.4s,  v7.4s

        ext             v22.16b, v19.16b, v20.16b, #14 // top left, top mid
        dup             v26.16b, v30.b[5]
        ext             v23.16b, v19.16b, v20.16b, #15
        dup             v27.16b, v30.b[6]
        ext             v0.16b,  v20.16b, v21.16b, #1  // top mid, top right
        dup             v28.16b, v30.b[8]
        ext             v1.16b,  v20.16b, v21.16b, #2
        dup             v29.16b, v30.b[9]

        smull           v2.8h,  v22.8b,  v26.8b
        smull2          v3.8h,  v22.16b, v26.16b
        smull           v22.8h, v23.8b,  v27.8b
        smull2          v23.8h, v23.16b, v27.16b
        smull           v26.8h, v0.8b,   v28.8b
        smull2          v27.8h, v0.16b,  v28.16b
        smull           v28.8h, v1.8b,   v29.8b
        smull2          v29.8h, v1.16b,  v29.16b
        saddl           v0.4s,  v2.4h,   v22.4h
        saddl2          v1.4s,  v2.8h,   v22.8h
        saddl           v2.4s,  v3.4h,   v23.4h
        saddl2          v3.4s,  v3.8h,   v23.8h
        saddl           v22.4s, v26.4h,  v28.4h
        saddl2          v23.4s, v26.8h,  v28.8h
        saddl           v26.4s, v27.4h,  v29.4h
        saddl2          v27.4s, v27.8h,  v29.8h
        add             v0.4s,  v0.4s,   v22.4s
        add             v1.4s,  v1.4s,   v23.4s
        add             v2.4s,  v2.4s,   v26.4s
        add             v3.4s,  v3.4s,   v27.4s
        // directly-above taps (offset 0) of each row, coeffs v30.b[2]/[7]
        dup             v26.16b, v30.b[2]
        dup             v27.16b, v30.b[7]
        smull           v22.8h, v17.8b,  v26.8b
        smull2          v23.8h, v17.16b, v26.16b
        smull           v24.8h, v20.8b,  v27.8b
        smull2          v25.8h, v20.16b, v27.16b
        add             v4.4s,  v4.4s,   v0.4s
        add             v5.4s,  v5.4s,   v1.4s
        add             v6.4s,  v6.4s,   v2.4s
        add             v7.4s,  v7.4s,   v3.4s

        // slide the row windows for the next 16-column step
        mov             v16.16b, v17.16b
        mov             v17.16b, v18.16b

        saddl           v0.4s,  v22.4h,  v24.4h
        saddl2          v1.4s,  v22.8h,  v24.8h
        saddl           v2.4s,  v23.4h,  v25.4h
        saddl2          v3.4s,  v23.8h,  v25.8h
        mov             v19.16b, v20.16b
        mov             v20.16b, v21.16b
        add             v4.4s,  v4.4s,   v0.4s
        add             v5.4s,  v5.4s,   v1.4s
        add             v6.4s,  v6.4s,   v2.4s
        add             v7.4s,  v7.4s,   v3.4s
        ret
endfunc
// Instantiate one lag-2 AR-sum function. For the left edge, (re)load the row
// windows directly above the output pointer; each call stores its 16 results.
.macro sum_lag2_func type, uv_layout, edge, elems=16
function sum_\type\()_lag2_\edge\()_neon
        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-16]!
.ifc \edge, left
        sub             x12, x0, #2*GRAIN_WIDTH
        sub             x13, x0, #1*GRAIN_WIDTH
        ld1             {v17.16b}, [x12] // load the previous block right above
        ld1             {v20.16b}, [x13]
.endif
        sum_lag_n_body  lag2, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=v30.b[12]
endfunc
.endm

sum_lag2_func y,      0,   left
sum_lag2_func y,      0,   mid
sum_lag2_func y,      0,   right, 15
sum_lag2_func uv_444, 444, left
sum_lag2_func uv_444, 444, mid
sum_lag2_func uv_444, 444, right, 15
sum_lag2_func uv_422, 422, left
sum_lag2_func uv_422, 422, mid
sum_lag2_func uv_422, 422, right, 9
sum_lag2_func uv_420, 420, left
sum_lag2_func uv_420, 420, mid
sum_lag2_func uv_420, 420, right, 9


// Lag-3 weighted sum of the three rows above the output pointer x0, one
// 7-tap section per row (offsets -3..+3). Row windows: v13/v14, v16/v17,
// v19/v20; next windows loaded from x0 - {3,2,1}*GRAIN_WIDTH + 16.
// Coefficients v29.b[0..15] and v30.b[0..4]; accumulates into v4-v7 and
// shifts the windows along. Uses v8-v15, so callers save d8-d15.
function sum_lag3_above_neon
        sub             x11, x0, #3*GRAIN_WIDTH - 16
        sub             x12, x0, #2*GRAIN_WIDTH - 16
        sub             x13, x0, #1*GRAIN_WIDTH - 16
        ld1             {v15.16b}, [x11] // load top right
        ld1             {v18.16b}, [x12]
        ld1             {v21.16b}, [x13]

        ext             v8.16b,  v13.16b, v14.16b, #13 // top left, top mid
        dup             v22.16b, v29.b[0]
        ext             v9.16b,  v13.16b, v14.16b, #14
        dup             v23.16b, v29.b[1]
        ext             v10.16b, v13.16b, v14.16b, #15
        dup             v24.16b, v29.b[2]
        dup             v25.16b, v29.b[3]
        ext             v11.16b, v14.16b, v15.16b, #1  // top mid, top right
        dup             v26.16b, v29.b[4]
        ext             v12.16b, v14.16b, v15.16b, #2
        dup             v27.16b, v29.b[5]
        ext             v13.16b, v14.16b, v15.16b, #3
        dup             v28.16b, v29.b[6]

        smull           v0.8h,  v8.8b,   v22.8b
        smull2          v1.8h,  v8.16b,  v22.16b
        smull           v2.8h,  v9.8b,   v23.8b
        smull2          v3.8h,  v9.16b,  v23.16b
        smull           v8.8h,  v10.8b,  v24.8b
        smull2          v9.8h,  v10.16b, v24.16b
        smull           v10.8h, v11.8b,  v26.8b
        smull2          v11.8h, v11.16b, v26.16b
        saddl           v22.4s, v0.4h,   v2.4h
        saddl2          v23.4s, v0.8h,   v2.8h
        saddl           v24.4s, v1.4h,   v3.4h
        saddl2          v26.4s, v1.8h,   v3.8h
        saddl           v0.4s,  v8.4h,   v10.4h
        saddl2          v1.4s,  v8.8h,   v10.8h
        saddl           v2.4s,  v9.4h,   v11.4h
        saddl2          v3.4s,  v9.8h,   v11.8h
        smull           v8.8h,  v12.8b,  v27.8b
        smull2          v9.8h,  v12.16b, v27.16b
        smull           v10.8h, v13.8b,  v28.8b
        smull2          v11.8h, v13.16b, v28.16b
        smull           v12.8h, v14.8b,  v25.8b
        smull2          v13.8h, v14.16b, v25.16b
        add             v4.4s,  v22.4s,  v0.4s
        add             v5.4s,  v23.4s,  v1.4s
        add             v6.4s,  v24.4s,  v2.4s
        add             v7.4s,  v26.4s,  v3.4s
        saddl           v0.4s,  v8.4h,   v10.4h
        saddl2          v1.4s,  v8.8h,   v10.8h
        saddl           v2.4s,  v9.4h,   v11.4h
        saddl2          v3.4s,  v9.8h,   v11.8h
        add             v4.4s,  v4.4s,   v0.4s
        add             v5.4s,  v5.4s,   v1.4s
        add             v6.4s,  v6.4s,   v2.4s
        add             v7.4s,  v7.4s,   v3.4s
        saddw           v4.4s,  v4.4s,   v12.4h
        saddw2          v5.4s,  v5.4s,   v12.8h
        saddw           v6.4s,  v6.4s,   v13.4h
        saddw2          v7.4s,  v7.4s,   v13.8h

        ext             v8.16b,  v16.16b, v17.16b, #13 // top left, top mid
        dup             v22.16b, v29.b[7]
        ext             v9.16b,  v16.16b, v17.16b, #14
        dup             v23.16b, v29.b[8]
        ext             v10.16b, v16.16b, v17.16b, #15
        dup             v24.16b, v29.b[9]
        dup             v25.16b, v29.b[10]
        ext             v11.16b, v17.16b, v18.16b, #1  // top mid, top right
        dup             v26.16b, v29.b[11]
        ext             v12.16b, v17.16b, v18.16b, #2
        dup             v27.16b, v29.b[12]
        ext             v13.16b, v17.16b, v18.16b, #3
        dup             v28.16b, v29.b[13]

        smull           v0.8h,  v8.8b,   v22.8b
        smull2          v1.8h,  v8.16b,  v22.16b
        smull           v2.8h,  v9.8b,   v23.8b
        smull2          v3.8h,  v9.16b,  v23.16b
        smull           v8.8h,  v10.8b,  v24.8b
        smull2          v9.8h,  v10.16b, v24.16b
        smull           v10.8h, v11.8b,  v26.8b
        smull2          v11.8h, v11.16b, v26.16b
        saddl           v22.4s, v0.4h,   v2.4h
        saddl2          v23.4s, v0.8h,   v2.8h
        saddl           v24.4s, v1.4h,   v3.4h
        saddl2          v26.4s, v1.8h,   v3.8h
        saddl           v0.4s,  v8.4h,   v10.4h
        saddl2          v1.4s,  v8.8h,   v10.8h
        saddl           v2.4s,  v9.4h,   v11.4h
        saddl2          v3.4s,  v9.8h,   v11.8h
        smull           v8.8h,  v12.8b,  v27.8b
        smull2          v9.8h,  v12.16b, v27.16b
        smull           v10.8h, v13.8b,  v28.8b
        smull2          v11.8h, v13.16b, v28.16b
        smull           v12.8h, v17.8b,  v25.8b
        smull2          v13.8h, v17.16b, v25.16b
        add             v22.4s, v22.4s,  v0.4s
        add             v23.4s, v23.4s,  v1.4s
        add             v24.4s, v24.4s,  v2.4s
        add             v26.4s, v26.4s,  v3.4s
        saddl           v0.4s,  v8.4h,   v10.4h
        saddl2          v1.4s,  v8.8h,   v10.8h
        saddl           v2.4s,  v9.4h,   v11.4h
        saddl2          v3.4s,  v9.8h,   v11.8h
        add             v4.4s,  v4.4s,   v22.4s
        add             v5.4s,  v5.4s,   v23.4s
        add             v6.4s,  v6.4s,   v24.4s
        add             v7.4s,  v7.4s,   v26.4s
        add             v4.4s,  v4.4s,   v0.4s
        add             v5.4s,  v5.4s,   v1.4s
        add             v6.4s,  v6.4s,   v2.4s
        add             v7.4s,  v7.4s,   v3.4s
        saddw           v4.4s,  v4.4s,   v12.4h
        saddw2          v5.4s,  v5.4s,   v12.8h
        saddw           v6.4s,  v6.4s,   v13.4h
        saddw2          v7.4s,  v7.4s,   v13.8h

        ext             v8.16b,  v19.16b, v20.16b, #13 // top left, top mid
        dup             v22.16b, v29.b[14]
        ext             v9.16b,  v19.16b, v20.16b, #14
        dup             v23.16b, v29.b[15]
        ext             v10.16b, v19.16b, v20.16b, #15
        dup             v24.16b, v30.b[0]
        dup             v25.16b, v30.b[1]
        ext             v11.16b, v20.16b, v21.16b, #1  // top mid, top right
        dup             v26.16b, v30.b[2]
        ext             v12.16b, v20.16b, v21.16b, #2
        dup             v27.16b, v30.b[3]
        ext             v13.16b, v20.16b, v21.16b, #3
        dup             v28.16b, v30.b[4]

        smull           v0.8h,  v8.8b,   v22.8b
        smull2          v1.8h,  v8.16b,  v22.16b
        smull           v2.8h,  v9.8b,   v23.8b
        smull2          v3.8h,  v9.16b,  v23.16b
        smull           v8.8h,  v10.8b,  v24.8b
        smull2          v9.8h,  v10.16b, v24.16b
        smull           v10.8h, v11.8b,  v26.8b
        smull2          v11.8h, v11.16b, v26.16b
        saddl           v22.4s, v0.4h,   v2.4h
        saddl2          v23.4s, v0.8h,   v2.8h
        saddl           v24.4s, v1.4h,   v3.4h
        saddl2          v26.4s, v1.8h,   v3.8h
        saddl           v0.4s,  v8.4h,   v10.4h
        saddl2          v1.4s,  v8.8h,   v10.8h
        saddl           v2.4s,  v9.4h,   v11.4h
        saddl2          v3.4s,  v9.8h,   v11.8h
        smull           v8.8h,  v12.8b,  v27.8b
        smull2          v9.8h,  v12.16b, v27.16b
        smull           v10.8h, v13.8b,  v28.8b
        smull2          v11.8h, v13.16b, v28.16b
        smull           v12.8h, v20.8b,  v25.8b
        smull2          v19.8h, v20.16b, v25.16b
        add             v22.4s, v22.4s,  v0.4s
        add             v23.4s, v23.4s,  v1.4s
        add             v24.4s, v24.4s,  v2.4s
        add             v26.4s, v26.4s,  v3.4s
        saddl           v0.4s,  v8.4h,   v10.4h
        saddl2          v1.4s,  v8.8h,   v10.8h
        saddl           v2.4s,  v9.4h,   v11.4h
        saddl2          v3.4s,  v9.8h,   v11.8h
        add             v4.4s,  v4.4s,   v22.4s
        add             v5.4s,  v5.4s,   v23.4s
        add             v6.4s,  v6.4s,   v24.4s
        add             v7.4s,  v7.4s,   v26.4s
        // slide the row windows for the next 16-column step
        mov             v13.16b, v14.16b
        mov             v14.16b, v15.16b
        add             v4.4s,  v4.4s,   v0.4s
        add             v5.4s,  v5.4s,   v1.4s
        add             v6.4s,  v6.4s,   v2.4s
        add             v7.4s,  v7.4s,   v3.4s
        mov             v16.16b, v17.16b
        mov             v17.16b, v18.16b
        saddw           v4.4s,  v4.4s,   v12.4h
        saddw2          v5.4s,  v5.4s,   v12.8h
        saddw           v6.4s,  v6.4s,   v19.4h
        saddw2          v7.4s,  v7.4s,   v19.8h

        mov             v19.16b, v20.16b
        mov             v20.16b, v21.16b
        ret
endfunc

// Instantiate one lag-3 AR-sum function; left edge reloads the three row
// windows directly above the output pointer.
.macro sum_lag3_func type, uv_layout, edge, elems=16
function sum_\type\()_lag3_\edge\()_neon
        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-16]!
.ifc \edge, left
        sub             x11, x0, #3*GRAIN_WIDTH
        sub             x12, x0, #2*GRAIN_WIDTH
        sub             x13, x0, #1*GRAIN_WIDTH
        ld1             {v14.16b}, [x11] // load the previous block right above
        ld1             {v17.16b}, [x12]
        ld1             {v20.16b}, [x13]
.endif
        sum_lag_n_body  lag3, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=v30.b[8]
endfunc
.endm

sum_lag3_func y,      0,   left
sum_lag3_func y,      0,   mid
sum_lag3_func y,      0,   right, 15
sum_lag3_func uv_444, 444, left
sum_lag3_func uv_444, 444, mid
sum_lag3_func uv_444, 444, right, 15
sum_lag3_func uv_422, 422, left
sum_lag3_func uv_422, 422, mid
sum_lag3_func uv_422, 422, right, 9
sum_lag3_func uv_420, 420, left
sum_lag3_func uv_420, 420, mid
sum_lag3_func uv_420, 420, right, 9

// Generate and store w1 rows of plain (lag-0 style) grain into x0.
function generate_grain_rows_neon
        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-16]!
1:
        get_grain_row   v16, v17, v18, v19, v20, v21
        subs            w1, w1, #1
        store_grain_row v16, v17, v18, v19, v20, v21
        b.gt            1b
        ldr             x30, [sp], #16
        AARCH64_VALIDATE_LINK_REGISTER
        ret
endfunc

// Generate and store w1 rows of 44-wide grain into x0.
function generate_grain_rows_44_neon
        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-16]!
1:
        get_grain_row_44 v16, v17, v18
        subs            w1, w1, #1
        store_grain_row_44 v16, v17, v18
        b.gt            1b
        ldr             x30, [sp], #16
        AARCH64_VALIDATE_LINK_REGISTER
        ret
endfunc

// Function wrapper around get_grain_row, so it can be bl'd (the macro
// itself contains bl calls); row returned in v16-v21.
function get_grain_row_neon
        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-16]!
        get_grain_row   v16, v17, v18, v19, v20, v21
        ldr             x30, [sp], #16
        AARCH64_VALIDATE_LINK_REGISTER
        ret
endfunc

// Function wrapper around get_grain_row_44; row returned in v16-v18.
function get_grain_row_44_neon
        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-16]!
        get_grain_row_44 v16, v17, v18
        ldr             x30, [sp], #16
        AARCH64_VALIDATE_LINK_REGISTER
        ret
endfunc

// Lag-0 chroma: v2 = sat8(v1 + ((v0 * coeff(v27)) >> shift via srshl v28)).
// v0 = (masked/averaged) luma, v1 = chroma grain. The 420/422 variants below
// fall into add_coeff_lag0_start after preparing v0.
function add_uv_444_coeff_lag0_neon
add_coeff_lag0_start:
        smull           v2.8h,  v0.8b,  v27.8b
        smull2          v3.8h,  v0.16b, v27.16b
        srshl           v2.8h,  v2.8h,  v28.8h
        srshl           v3.8h,  v3.8h,  v28.8h
        saddw           v2.8h,  v2.8h,  v1.8b
        saddw2          v3.8h,  v3.8h,  v1.16b
        sqxtn           v2.8b,  v2.8h
        sqxtn2          v2.16b, v3.8h
        ret
endfunc

// 420 variant: 2x2 box-average two luma rows (x19, x12) into v0 (masked by
// the edge mask already in v0), then share the common tail.
function add_uv_420_coeff_lag0_neon
        ld1             {v4.16b, v5.16b}, [x19], #32
        ld1             {v6.16b, v7.16b}, [x12], #32
        saddlp          v4.8h,  v4.16b
        saddlp          v5.8h,  v5.16b
        saddlp          v6.8h,  v6.16b
        saddlp          v7.8h,  v7.16b
        add             v4.8h,  v4.8h,  v6.8h
        add             v5.8h,  v5.8h,  v7.8h
        rshrn           v4.8b,  v4.8h,  #2
        rshrn2          v4.16b, v5.8h,  #2
        and             v0.16b, v4.16b, v0.16b
        b               add_coeff_lag0_start
endfunc

// 422 variant: horizontal pair-average one luma row into v0 (masked), then
// share the common tail.
function add_uv_422_coeff_lag0_neon
        ld1             {v4.16b, v5.16b}, [x19], #32
        saddlp          v4.8h,  v4.16b
        saddlp          v5.8h,  v5.16b
        rshrn           v4.8b,  v4.8h,  #1
        rshrn2          v4.16b, v5.8h,  #1
        and             v0.16b, v4.16b, v0.16b
        b               add_coeff_lag0_start
endfunc

// Entry points for the full-width (82x73) grain LUTs:
//   generate_grain_y_8bpc_neon(buf, data)
//   generate_grain_uv_444_8bpc_neon(buf, buf_y, data, uv)
// (argument meanings inferred from the FGD_* loads and x19 luma pointer —
// confirm against the C prototype in fg_apply/film_grain headers.)
// Dispatches on data->ar_coeff_lag via the .hword table.
.macro gen_grain_82 type
function generate_grain_\type\()_8bpc_neon, export=1
        AARCH64_SIGN_LINK_REGISTER
        stp             x30, x19, [sp, #-96]!

.ifc \type, uv_444
        mov             w13, w3
        mov             w14, #28
        add             x19, x1, #3*GRAIN_WIDTH
        mov             x1,  x2
        mul             w13, w13, w14
.endif
        movrel          x3, X(gaussian_sequence)
        ldr             w2, [x1, #FGD_SEED]
        ldr             w9, [x1, #FGD_GRAIN_SCALE_SHIFT]
.ifc \type, y
        add             x4, x1, #FGD_AR_COEFFS_Y
.else
        add             x4, x1, #FGD_AR_COEFFS_UV
.endif
        adr             x16, L(gen_grain_\type\()_tbl)
        ldr             w17, [x1, #FGD_AR_COEFF_LAG]
        add             w9,  w9, #4
        ldrh            w17, [x16, w17, uxtw #1]
        dup             v31.8h, w9 // 4 + data->grain_scale_shift
        sub             x16, x16, w17, uxtw
        neg             v31.8h, v31.8h

.ifc \type, uv_444
        // seed ^= uv ? 0x49d8 : 0xb524
        cmp             w13, #0
        mov             w11, #0x49d8
        mov             w14, #0xb524
        add             x4, x4, w13, uxtw // Add offset to ar_coeffs_uv[1]
        csel            w11, w11, w14, ne
.endif

        ldr             w7,  [x1, #FGD_AR_COEFF_SHIFT]
        mov             w8,  #1
        mov             w10, #1
        lsl             w8,  w8,  w7  // 1 << ar_coeff_shift
        lsl             w10, w10, w9  // 1 << (4 + data->grain_scale_shift)
        lsr             w8,  w8,  #1  // 1 << (ar_coeff_shift - 1)
        lsr             w10, w10, #1  // 1 << (4 + data->grain_scale_shift - 1)
        mov             w5,  #127     // grain clip max
        mov             w6,  #-128    // grain clip min

.ifc \type, uv_444
        eor             w2, w2, w11
.endif

        br              x16

L(generate_grain_\type\()_lag0):
        AARCH64_VALID_JUMP_TARGET
.ifc \type, y
        mov             w1, #GRAIN_HEIGHT
        bl              generate_grain_rows_neon
.else
        // chroma lag0 still mixes in the collocated luma, with edge masks
        // v29 (first 3 columns zeroed... last 3? derived from the ext below)
        // and v30 selecting valid lanes.
        dup             v28.8h, w7
        ld1r            {v27.16b}, [x4] // ar_coeffs_uv[0]
        movi            v0.16b, #0
        movi            v1.16b, #255
        ext             v29.16b, v0.16b, v1.16b, #13
        ext             v30.16b, v1.16b, v0.16b, #1
        neg             v28.8h, v28.8h

        mov             w1, #3
        bl              generate_grain_rows_neon
        mov             w1, #GRAIN_HEIGHT-3
1:
        ld1             {v22.16b, v23.16b, v24.16b, v25.16b}, [x19], #64
        bl              get_grain_row_neon
        and             v0.16b, v22.16b, v29.16b
        mov             v1.16b, v16.16b
        bl              add_uv_444_coeff_lag0_neon
        mov             v0.16b, v23.16b
        mov             v1.16b, v17.16b
        mov             v16.16b, v2.16b
        bl              add_uv_444_coeff_lag0_neon
        ld1             {v26.16b}, [x19], #16
        mov             v0.16b, v24.16b
        mov             v1.16b, v18.16b
        mov             v17.16b, v2.16b
        bl              add_uv_444_coeff_lag0_neon
        add             x19, x19, #2
        mov             v0.16b, v25.16b
        mov             v1.16b, v19.16b
        mov             v18.16b, v2.16b
        bl              add_uv_444_coeff_lag0_neon
        and             v0.16b, v26.16b, v30.16b
        mov             v1.16b, v20.16b
        mov             v19.16b, v2.16b
        bl              add_uv_444_coeff_lag0_neon
        mov             v20.16b, v2.16b
        subs            w1, w1, #1
        store_grain_row v16, v17, v18, v19, v20, v21
        b.gt            1b
.endif
        ldp             x30, x19, [sp], #96
        AARCH64_VALIDATE_LINK_REGISTER
        ret

L(generate_grain_\type\()_lag1):
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v27.16b}, [x4], #1 // ar_coeffs_y[0]
        ld1r            {v28.16b}, [x4], #1 // ar_coeffs_y[1]
        ld1r            {v29.16b}, [x4]     // ar_coeffs_y[2]
.ifc \type, y
        ldrsb           w4, [x4, #1]        // ar_coeffs_y[3]
.else
        add             x4, x4, #2
.endif

        mov             w1, #3
.ifc \type, uv_444
        ld1r            {v30.16b}, [x4]     // ar_coeffs_uv[4]
        ldursb          w4, [x4, #-1]       // ar_coeffs_uv[3]
.endif
        bl              generate_grain_rows_neon

        mov             w1, #GRAIN_HEIGHT - 3
1:
        // 5 x 16 = 80 filtered entries per row, + 2 unfiltered from the table
        sum_\type\()_lag1 v22, v16, v16, v17, left
        sum_\type\()_lag1 v23, v16, v17, v18
        sum_\type\()_lag1 v24, v17, v18, v19
        sum_\type\()_lag1 v25, v18, v19, v20
        sum_\type\()_lag1 v20, v19, v20, v21, right
        get_grain_2     v21
        subs            w1, w1, #1
.ifc \type, uv_444
        add             x19, x19, #2
.endif
        store_grain_row v22, v23, v24, v25, v20, v21
        mov             v16.16b, v22.16b
        mov             v17.16b, v23.16b
        mov             v18.16b, v24.16b
        mov             v19.16b, v25.16b
        b.gt            1b

        ldp             x30, x19, [sp], #96
        AARCH64_VALIDATE_LINK_REGISTER
        ret

L(generate_grain_\type\()_lag2):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v30.16b}, [x4] // ar_coeffs_y[0-11], ar_coeffs_uv[0-12]

        smov            w4,  v30.b[10]  // left-neighbor AR coefficients for output_lag2
        smov            w17, v30.b[11]

        mov             w1, #3
        bl              generate_grain_rows_neon

        mov             w1, #GRAIN_HEIGHT - 3
1:
        // the sum_*_lag2 functions store their 16 outputs themselves
        bl              sum_\type\()_lag2_left_neon
        bl              sum_\type\()_lag2_mid_neon
        bl              sum_\type\()_lag2_mid_neon
        bl              sum_\type\()_lag2_mid_neon
        bl              sum_\type\()_lag2_right_neon
        get_grain_2     v16
        subs            w1, w1, #1
.ifc \type, uv_444
        add             x19, x19, #2
.endif
        st1             {v16.h}[0], [x0], #2
        b.gt            1b

        ldp             x30, x19, [sp], #96
        AARCH64_VALIDATE_LINK_REGISTER
        ret

L(generate_grain_\type\()_lag3):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v29.16b, v30.16b}, [x4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24]
        stp             d8,  d9,  [sp, #16]      // sum_lag3_above_neon uses v8-v15
        stp             d10, d11, [sp, #32]
        stp             d12, d13, [sp, #48]
        stp             d14, d15, [sp, #64]
        stp             x20, x21, [sp, #80]

        smov            w4,  v30.b[5]            // left-neighbor AR coefficients for output_lag3
        smov            w20, v30.b[6]
        smov            w21, v30.b[7]

        mov             w1, #3
        bl              generate_grain_rows_neon

        mov             w1, #GRAIN_HEIGHT - 3
1:
        bl              sum_\type\()_lag3_left_neon
        bl              sum_\type\()_lag3_mid_neon
        bl              sum_\type\()_lag3_mid_neon
        bl              sum_\type\()_lag3_mid_neon
        bl              sum_\type\()_lag3_right_neon
        get_grain_2     v16
        subs            w1, w1, #1
.ifc \type, uv_444
        add             x19, x19, #2
.endif
        st1             {v16.h}[0], [x0], #2
        b.gt            1b

        ldp             x20, x21, [sp, #80]
        ldp             d14, d15, [sp, #64]
        ldp             d12, d13, [sp, #48]
        ldp             d10, d11, [sp, #32]
        ldp             d8,  d9,  [sp, #16]
        ldp             x30, x19, [sp], #96
        AARCH64_VALIDATE_LINK_REGISTER
        ret

L(gen_grain_\type\()_tbl):
        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0)
        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1)
        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2)
        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3)
endfunc
.endm

gen_grain_82 y
gen_grain_82 uv_444

// Row count for the subsampled variants: 420 uses the 38-row sub LUT.
.macro set_height dst, type
.ifc \type, uv_420
        mov             \dst,  #SUB_GRAIN_HEIGHT-3
.else
        mov             \dst,  #GRAIN_HEIGHT-3
.endif
.endm

// Advance the luma pointer by 2 rows (420) or 1 row (422) past the
// 3*32 bytes the row loop consumed.
.macro increment_y_ptr reg, type
.ifc \type, uv_420
        add             \reg, \reg, #2*GRAIN_WIDTH-(3*32)
.else
        sub             \reg, \reg, #3*32-GRAIN_WIDTH
.endif
.endm

// Entry points for the subsampled (44-wide) chroma grain LUTs:
//   generate_grain_uv_420_8bpc_neon / generate_grain_uv_422_8bpc_neon
// Same dispatch/setup structure as gen_grain_82's uv_444 path.
.macro gen_grain_44 type
function generate_grain_\type\()_8bpc_neon, export=1
        AARCH64_SIGN_LINK_REGISTER
        stp             x30, x19, [sp, #-96]!

        mov             w13, w3
        mov             w14, #28
        add             x19, x1, #3*GRAIN_WIDTH-3
        mov             x1,  x2
        mul             w13, w13, w14

        movrel          x3, X(gaussian_sequence)
        ldr             w2, [x1, #FGD_SEED]
        ldr             w9, [x1, #FGD_GRAIN_SCALE_SHIFT]
        add             x4, x1, #FGD_AR_COEFFS_UV
        adr             x16, L(gen_grain_\type\()_tbl)
        ldr             w17, [x1, #FGD_AR_COEFF_LAG]
        add             w9,  w9, #4
        ldrh            w17, [x16, w17, uxtw #1]
        dup             v31.8h, w9 // 4 + data->grain_scale_shift
        sub             x16, x16, w17, uxtw
        neg             v31.8h, v31.8h

        // seed ^= uv ? 0x49d8 : 0xb524
        cmp             w13, #0
        mov             w11, #0x49d8
        mov             w14, #0xb524
        add             x4, x4, w13, uxtw // Add offset to ar_coeffs_uv[1]
        csel            w11, w11, w14, ne

        ldr             w7,  [x1, #FGD_AR_COEFF_SHIFT]
        mov             w8,  #1
        mov             w10, #1
        lsl             w8,  w8,  w7  // 1 << ar_coeff_shift
        lsl             w10, w10, w9  // 1 << (4 + data->grain_scale_shift)
        lsr             w8,  w8,  #1  // 1 << (ar_coeff_shift - 1)
        lsr             w10, w10, #1  // 1 << (4 + data->grain_scale_shift - 1)
        mov             w5,  #127     // grain clip max
        mov             w6,  #-128    // grain clip min

        eor             w2, w2, w11

        br              x16

L(generate_grain_\type\()_lag0):
        AARCH64_VALID_JUMP_TARGET
        dup             v28.8h, w7
        ld1r            {v27.16b}, [x4] // ar_coeffs_uv[0]
        movi            v0.16b, #0
        movi            v1.16b, #255
        ext             v29.16b, v0.16b, v1.16b, #13
        ext             v30.16b, v1.16b, v0.16b, #7
        neg             v28.8h, v28.8h

        mov             w1, #3
        bl              generate_grain_rows_44_neon
        set_height      w1, \type
1:
        bl              get_grain_row_44_neon
.ifc \type, uv_420
        add             x12, x19, #GRAIN_WIDTH
.endif
        mov             v0.16b, v29.16b
        mov             v1.16b, v16.16b
        bl              add_\type\()_coeff_lag0_neon
        movi            v0.16b, #255
        mov             v1.16b, v17.16b
        mov             v16.16b, v2.16b
        bl              add_\type\()_coeff_lag0_neon
        mov             v0.16b, v30.16b
        mov             v1.16b, v18.16b
        mov             v17.16b, v2.16b
        bl              add_\type\()_coeff_lag0_neon
        mov             v18.16b, v2.16b
        subs            w1, w1, #1
        increment_y_ptr x19, \type
        store_grain_row_44 v16, v17, v18
        b.gt            1b

        ldp             x30, x19, [sp], #96
        AARCH64_VALIDATE_LINK_REGISTER
        ret

L(generate_grain_\type\()_lag1):
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v27.16b}, [x4], #1 // ar_coeffs_uv[0]
        ld1r            {v28.16b}, [x4], #1 // ar_coeffs_uv[1]
        ld1r            {v29.16b}, [x4]     // ar_coeffs_uv[2]
        add             x4, x4, #2

        mov             w1, #3
        ld1r            {v30.16b}, [x4]     // ar_coeffs_uv[4]
        ldursb          w4, [x4, #-1]       // ar_coeffs_uv[3]
        bl              generate_grain_rows_44_neon

        set_height      w1, \type
1:
        sum_\type\()_lag1 v20, v16, v16, v17, left
        sum_\type\()_lag1 v21, v16, v17, v18
        sum_\type\()_lag1 v18, v17, v18, v18, right
        subs            w1, w1, #1
        increment_y_ptr x19, \type
        store_grain_row_44 v20, v21, v18
        mov             v16.16b, v20.16b
        mov             v17.16b, v21.16b
        b.gt            1b

        ldp             x30, x19, [sp], #96
        AARCH64_VALIDATE_LINK_REGISTER
        ret

L(generate_grain_\type\()_lag2):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v30.16b}, [x4] // ar_coeffs_uv[0-12]

        smov            w4,  v30.b[10]  // left-neighbor AR coefficients for output_lag2
        smov            w17, v30.b[11]

        mov             w1, #3
        bl              generate_grain_rows_44_neon

        set_height      w1, \type
1:
        bl              sum_\type\()_lag2_left_neon
        bl              sum_\type\()_lag2_mid_neon
        bl              sum_\type\()_lag2_right_neon
        subs            w1, w1, #1
        increment_y_ptr x19, \type
        add             x0, x0, #GRAIN_WIDTH-48
        b.gt            1b

        ldp             x30, x19, [sp], #96
        AARCH64_VALIDATE_LINK_REGISTER
        ret

L(generate_grain_\type\()_lag3):
        AARCH64_VALID_JUMP_TARGET
        ldr             q29, [x4]       // ar_coeffs_uv[0-15]
        ldr             q30, [x4, #16]  // ar_coeffs_uv[16-24]
        stp             d8,  d9,  [sp, #16] // sum_lag3_above_neon uses v8-v15
        stp             d10, d11, [sp, #32]
        stp             d12, d13, [sp, #48]
        stp             d14, d15, [sp, #64]
        stp             x20, x21, [sp, #80]

        smov            w4,  v30.b[5]   // left-neighbor AR coefficients for output_lag3
        smov            w20, v30.b[6]
        smov            w21, v30.b[7]

        mov             w1, #3
        bl              generate_grain_rows_44_neon

        set_height      w1, \type
1:
        bl              sum_\type\()_lag3_left_neon
        bl              sum_\type\()_lag3_mid_neon
        bl              sum_\type\()_lag3_right_neon
        subs            w1, w1, #1
        increment_y_ptr x19, \type
        add             x0, x0, #GRAIN_WIDTH-48
        b.gt            1b

        ldp             x20, x21, [sp, #80]
        ldp             d14, d15, [sp, #64]
        ldp             d12, d13, [sp, #48]
        ldp             d10, d11, [sp, #32]
        ldp             d8,  d9,  [sp, #16]
        ldp             x30, x19, [sp], #96
        AARCH64_VALIDATE_LINK_REGISTER
        ret

L(gen_grain_\type\()_tbl):
        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0)
        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1)
        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2)
        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3)
endfunc
.endm

gen_grain_44 uv_420
gen_grain_44 uv_422

// Table-gather: use byte indices in \src1/\src2 (added to base x3) to load
// single bytes into the even (\off=0) or odd (\off=1) lanes of \dst1/\dst2.
// Loads/address adds are interleaved to hide latency. Clobbers x14-x17.
.macro gather_interleaved dst1, dst2, src1, src2, off
        umov            w14, \src1[0+\off]
        umov            w15, \src2[8+\off]
        umov            w16, \src1[2+\off]
        add             x14, x14, x3
        umov            w17, \src2[10+\off]
        add             x15, x15, x3
        ld1             {\dst1}[0+\off], [x14]
        umov            w14, \src1[4+\off]
        add             x16, x16, x3
        ld1             {\dst2}[8+\off], [x15]
        umov            w15, \src2[12+\off]
        add             x17, x17, x3
        ld1             {\dst1}[2+\off], [x16]
        umov            w16, \src1[6+\off]
        add             x14, x14, x3
        ld1             {\dst2}[10+\off], [x17]
        umov            w17, \src2[14+\off]
        add             x15, x15, x3
        ld1             {\dst1}[4+\off], [x14]
        add             x16, x16, x3
        ld1             {\dst2}[12+\off], [x15]
        add             x17, x17, x3
        ld1             {\dst1}[6+\off], [x16]
        ld1             {\dst2}[14+\off], [x17]
.endm

// Gather all 32 lanes of \dst1/\dst2 from indices in \src1/\src2.
.macro gather dst1, dst2, src1, src2
        gather_interleaved \dst1, \dst2, \src1, \src2, 0
        gather_interleaved \dst2, \dst1, \src2, \src1, 0
        gather_interleaved \dst1, \dst2, \src1, \src2, 1
        gather_interleaved \dst2, \dst1, \src2, \src1, 1
.endm

// 32-byte gather: indices v0/v1 -> results v4/v5.
function gather32_neon
        gather          v4.b, v5.b, v0.b, v1.b
        ret
endfunc

// NOTE(review): SOURCE is truncated mid-definition here; the remainder of
// gather16_neon is not visible and is left exactly as found.
function gather16_neon
        gather_interleaved v4.b, v5.b, v0.b, v0.b, 0
        gather_interleaved v4.b,
v5.b, v0.b, v0.b, 1 1329 ins v4.d[1], v5.d[1] 1330 ret 1331endfunc 1332 1333const overlap_coeffs_0, align=4 1334 .byte 27, 17, 0, 0, 0, 0, 0, 0 1335 .byte 17, 27, 32, 32, 32, 32, 32, 32 1336endconst 1337 1338const overlap_coeffs_1, align=4 1339 .byte 23, 0, 0, 0, 0, 0, 0, 0 1340 .byte 22, 32, 32, 32, 32, 32, 32, 32 1341endconst 1342 1343.macro calc_offset offx, offy, src, sx, sy 1344 and \offy, \src, #0xF // randval & 0xF 1345 lsr \offx, \src, #4 // randval >> 4 1346.if \sy == 0 1347 add \offy, \offy, \offy // 2 * (randval & 0xF) 1348.endif 1349.if \sx == 0 1350 add \offx, \offx, \offx // 2 * (randval >> 4) 1351.endif 1352.endm 1353 1354.macro add_offset dst, offx, offy, src, stride 1355 madd \dst, \stride, \offy, \src // grain_lut += grain_stride * offy 1356 add \dst, \dst, \offx, uxtw // grain_lut += offx 1357.endm 1358 1359// void dav1d_fgy_32x32_8bpc_neon(pixel *const dst, const pixel *const src, 1360// const ptrdiff_t stride, 1361// const uint8_t scaling[SCALING_SIZE], 1362// const int scaling_shift, 1363// const entry grain_lut[][GRAIN_WIDTH], 1364// const int offsets[][2], 1365// const int h, const ptrdiff_t clip, 1366// const ptrdiff_t type); 1367function fgy_32x32_8bpc_neon, export=1 1368 AARCH64_SIGN_LINK_REGISTER 1369 str x30, [sp, #-16]! 
1370 ldr w11, [x6, #8] // offsets[1][0] 1371 ldr w13, [x6, #4] // offsets[0][1] 1372 ldr w15, [x6, #12] // offsets[1][1] 1373 ldr w6, [x6] // offsets[0][0] 1374 ldr w8, [sp, #16] // clip 1375 mov x9, #GRAIN_WIDTH // grain_lut stride 1376 1377 neg w4, w4 1378 dup v29.8h, w4 // -scaling_shift 1379 1380 movrel x16, overlap_coeffs_0 1381 1382 cbz w8, 1f 1383 // clip 1384 movi v30.16b, #16 1385 movi v31.16b, #235 1386 b 2f 13871: 1388 // no clip 1389 movi v30.16b, #0 1390 movi v31.16b, #255 13912: 1392 1393 ld1 {v27.8b, v28.8b}, [x16] // overlap_coeffs 1394 1395 add x5, x5, #9 // grain_lut += 9 1396 add x5, x5, x9, lsl #3 // grain_lut += 8 * grain_stride 1397 add x5, x5, x9 // grain_lut += grain_stride 1398 1399 calc_offset w11, w12, w11, 0, 0 1400 calc_offset w13, w14, w13, 0, 0 1401 calc_offset w15, w16, w15, 0, 0 1402 calc_offset w6, w10, w6, 0, 0 1403 1404 add_offset x12, w11, x12, x5, x9 1405 add_offset x14, w13, x14, x5, x9 1406 add_offset x16, w15, x16, x5, x9 1407 add_offset x5, w6, x10, x5, x9 1408 1409 ldr w11, [sp, #24] // type 1410 adr x13, L(fgy_loop_tbl) 1411 1412 add x4, x12, #32 // grain_lut += FG_BLOCK_SIZE * bx 1413 add x6, x14, x9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by 1414 1415 tst w11, #1 1416 ldrh w11, [x13, w11, uxtw #1] 1417 1418 add x8, x16, x9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by 1419 add x8, x8, #32 // grain_lut += FG_BLOCK_SIZE * bx 1420 1421 sub x11, x13, w11, uxtw 1422 1423 b.eq 1f 1424 // y overlap 1425 dup v6.16b, v27.b[0] 1426 dup v7.16b, v27.b[1] 1427 mov w10, w7 // backup actual h 1428 mov w7, #2 14291: 1430 br x11 1431endfunc 1432 1433function fgy_loop_neon 1434.macro fgy ox, oy 1435L(loop_\ox\oy): 1436 AARCH64_VALID_JUMP_TARGET 14371: 1438 ld1 {v0.16b, v1.16b}, [x1], x2 // src 1439.if \ox 1440 ld1 {v20.8b}, [x4], x9 // grain_lut old 1441.endif 1442.if \oy 1443 ld1 {v22.16b, v23.16b}, [x6], x9 // grain_lut top 1444.endif 1445.if \ox && \oy 1446 ld1 {v21.8b}, [x8], x9 // grain_lut top old 
1447.endif 1448 ld1 {v18.16b, v19.16b}, [x5], x9 // grain_lut 1449 1450 bl gather32_neon 1451 1452.if \ox 1453 smull v20.8h, v20.8b, v27.8b 1454 smlal v20.8h, v18.8b, v28.8b 1455.endif 1456 1457.if \oy 1458.if \ox 1459 smull v21.8h, v21.8b, v27.8b 1460 smlal v21.8h, v22.8b, v28.8b 1461 sqrshrn v20.8b, v20.8h, #5 1462 sqrshrn v21.8b, v21.8h, #5 1463.endif 1464 1465.if \ox 1466 smull v16.8h, v20.8b, v7.8b 1467.else 1468 smull v16.8h, v18.8b, v7.8b 1469.endif 1470 smull2 v17.8h, v18.16b, v7.16b 1471 smull v18.8h, v19.8b, v7.8b 1472 smull2 v19.8h, v19.16b, v7.16b 1473.if \ox 1474 smlal v16.8h, v21.8b, v6.8b 1475.else 1476 smlal v16.8h, v22.8b, v6.8b 1477.endif 1478 smlal2 v17.8h, v22.16b, v6.16b 1479 smlal v18.8h, v23.8b, v6.8b 1480 smlal2 v19.8h, v23.16b, v6.16b 1481 sqrshrn v22.8b, v16.8h, #5 1482 sqrshrn2 v22.16b, v17.8h, #5 1483 sqrshrn v23.8b, v18.8h, #5 1484 sqrshrn2 v23.16b, v19.8h, #5 1485.endif 1486 1487 // sxtl of grain 1488.if \oy 1489 sxtl v16.8h, v22.8b 1490 sxtl2 v17.8h, v22.16b 1491 sxtl v18.8h, v23.8b 1492 sxtl2 v19.8h, v23.16b 1493.elseif \ox 1494 sqrshrn v20.8b, v20.8h, #5 1495 sxtl2 v17.8h, v18.16b 1496 sxtl v18.8h, v19.8b 1497 sxtl2 v19.8h, v19.16b 1498 sxtl v16.8h, v20.8b 1499.else 1500 sxtl v16.8h, v18.8b 1501 sxtl2 v17.8h, v18.16b 1502 sxtl v18.8h, v19.8b 1503 sxtl2 v19.8h, v19.16b 1504.endif 1505 1506 uxtl v2.8h, v4.8b // scaling 1507 uxtl2 v3.8h, v4.16b 1508 uxtl v4.8h, v5.8b 1509 uxtl2 v5.8h, v5.16b 1510 1511 mul v16.8h, v16.8h, v2.8h // scaling * grain 1512 mul v17.8h, v17.8h, v3.8h 1513 mul v18.8h, v18.8h, v4.8h 1514 mul v19.8h, v19.8h, v5.8h 1515 1516 srshl v16.8h, v16.8h, v29.8h // round2(scaling * grain, scaling_shift) 1517 srshl v17.8h, v17.8h, v29.8h 1518 srshl v18.8h, v18.8h, v29.8h 1519 srshl v19.8h, v19.8h, v29.8h 1520 1521 uaddw v16.8h, v16.8h, v0.8b // *src + noise 1522 uaddw2 v17.8h, v17.8h, v0.16b 1523 uaddw v18.8h, v18.8h, v1.8b 1524 uaddw2 v19.8h, v19.8h, v1.16b 1525 1526 sqxtun v0.8b, v16.8h 1527 sqxtun2 v0.16b, v17.8h 1528 
sqxtun v1.8b, v18.8h 1529 sqxtun2 v1.16b, v19.8h 1530 1531 umax v0.16b, v0.16b, v30.16b 1532 umax v1.16b, v1.16b, v30.16b 1533 umin v0.16b, v0.16b, v31.16b 1534 umin v1.16b, v1.16b, v31.16b 1535 1536 subs w7, w7, #1 1537.if \oy 1538 dup v6.16b, v28.b[0] 1539 dup v7.16b, v28.b[1] 1540.endif 1541 st1 {v0.16b, v1.16b}, [x0], x2 // dst 1542 b.gt 1b 1543 1544.if \oy 1545 cmp w10, #2 1546 sub w7, w10, #2 // restore actual remaining h 1547 b.gt L(loop_\ox\()0) 1548.endif 1549 ldr x30, [sp], #16 1550 AARCH64_VALIDATE_LINK_REGISTER 1551 ret 1552.endm 1553 1554 fgy 0, 0 1555 fgy 0, 1 1556 fgy 1, 0 1557 fgy 1, 1 1558 1559L(fgy_loop_tbl): 1560 .hword L(fgy_loop_tbl) - L(loop_00) 1561 .hword L(fgy_loop_tbl) - L(loop_01) 1562 .hword L(fgy_loop_tbl) - L(loop_10) 1563 .hword L(fgy_loop_tbl) - L(loop_11) 1564endfunc 1565 1566// void dav1d_fguv_32x32_420_8bpc_neon(pixel *const dst, 1567// const pixel *const src, 1568// const ptrdiff_t stride, 1569// const uint8_t scaling[SCALING_SIZE], 1570// const Dav1dFilmGrainData *const data, 1571// const entry grain_lut[][GRAIN_WIDTH], 1572// const pixel *const luma_row, 1573// const ptrdiff_t luma_stride, 1574// const int offsets[][2], 1575// const ptrdiff_t h, const ptrdiff_t uv, 1576// const ptrdiff_t is_id, 1577// const ptrdiff_t type); 1578.macro fguv layout, sx, sy 1579function fguv_32x32_\layout\()_8bpc_neon, export=1 1580 AARCH64_SIGN_LINK_REGISTER 1581 str x30, [sp, #-32]! 
1582 str d8, [sp, #16] 1583 ldp x8, x9, [sp, #32] // offsets, h 1584 ldp x10, x11, [sp, #48] // uv, is_id 1585 1586 ldr w13, [x4, #FGD_SCALING_SHIFT] 1587 ldr w12, [x4, #FGD_CLIP_TO_RESTRICTED_RANGE] 1588 neg w13, w13 // -scaling_shift 1589 1590 // !csfl 1591 add x10, x4, x10, lsl #2 // + 4*uv 1592 add x14, x10, #FGD_UV_LUMA_MULT 1593 add x15, x10, #FGD_UV_MULT 1594 add x10, x10, #FGD_UV_OFFSET 1595 ld1 {v8.h}[0], [x14] // uv_luma_mult 1596 ld1r {v24.8h}, [x10] // uv_offset 1597 ld1 {v8.h}[1], [x15] // uv_mult 1598 1599 dup v29.8h, w13 // -scaling_shift 1600 1601 cbz w12, 1f 1602 // clip 1603 movi v30.16b, #16 1604 movi v31.16b, #240 1605 cbz w11, 2f 1606 // is_id 1607 movi v31.16b, #235 1608 b 2f 16091: 1610 // no clip 1611 movi v30.16b, #0 1612 movi v31.16b, #255 16132: 1614 1615 ldr w12, [x8, #8] // offsets[1][0] 1616 ldr w14, [x8, #4] // offsets[0][1] 1617 ldr w16, [x8, #12] // offsets[1][1] 1618 ldr w8, [x8] // offsets[0][0] 1619 1620 mov x10, #GRAIN_WIDTH // grain_lut stride 1621 1622 add x5, x5, #(3 + (2 >> \sx)*3) // grain_lut += 9 or 6 1623.if \sy 1624 add x5, x5, x10, lsl #2 // grain_lut += 4 * grain_stride 1625 add x5, x5, x10, lsl #1 // grain_lut += 2 * grain_stride 1626.else 1627 add x5, x5, x10, lsl #3 // grain_lut += 8 * grain_stride 1628 add x5, x5, x10 // grain_lut += grain_stride 1629.endif 1630 1631 calc_offset w12, w13, w12, \sx, \sy 1632 calc_offset w14, w15, w14, \sx, \sy 1633 calc_offset w16, w17, w16, \sx, \sy 1634 calc_offset w8, w11, w8, \sx, \sy 1635 1636 add_offset x13, w12, x13, x5, x10 1637 add_offset x15, w14, x15, x5, x10 1638 add_offset x17, w16, x17, x5, x10 1639 add_offset x5, w8, x11, x5, x10 1640 1641 add x4, x13, #(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx 1642 add x8, x15, x10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by 1643 add x11, x17, x10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by 1644 add x11, x11, #(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx 1645 1646 ldr w13, [sp, #64] 
// type 1647 1648 movrel x16, overlap_coeffs_\sx 1649 adr x14, L(fguv_loop_sx\sx\()_tbl) 1650 1651 ld1 {v27.8b, v28.8b}, [x16] // overlap_coeffs 1652 tst w13, #1 1653 ldrh w13, [x14, w13, uxtw #1] 1654 1655 b.eq 1f 1656 // y overlap 1657 sub w12, w9, #(2 >> \sy) // backup remaining h 1658 mov w9, #(2 >> \sy) 1659 16601: 1661 sub x13, x14, w13, uxtw 1662 1663.if \sy 1664 movi v25.16b, #23 1665 movi v26.16b, #22 1666.else 1667 movi v25.16b, #27 1668 movi v26.16b, #17 1669.endif 1670 1671.if \sy 1672 add x7, x7, x7 // luma_stride *= 2 1673.endif 1674 1675 br x13 1676endfunc 1677.endm 1678 1679fguv 420, 1, 1 1680fguv 422, 1, 0 1681fguv 444, 0, 0 1682 1683function fguv_loop_sx0_neon 1684.macro fguv_loop_sx0 csfl, ox, oy 1685L(fguv_loop_sx0_csfl\csfl\()_\ox\oy): 1686 AARCH64_VALID_JUMP_TARGET 16871: 1688 ld1 {v0.16b, v1.16b}, [x6], x7 // luma 1689 ld1 {v6.16b, v7.16b}, [x1], x2 // src 1690.if \ox 1691 ld1 {v20.8b}, [x4], x10 // grain_lut old 1692.endif 1693.if \oy 1694 ld1 {v22.16b, v23.16b}, [x8], x10 // grain_lut top 1695.endif 1696.if \ox && \oy 1697 ld1 {v21.8b}, [x11], x10 // grain_lut top old 1698.endif 1699 ld1 {v18.16b, v19.16b}, [x5], x10 // grain_lut 1700 1701.if !\csfl 1702 uxtl v2.8h, v0.8b 1703 uxtl2 v3.8h, v0.16b 1704 uxtl v4.8h, v1.8b 1705 uxtl2 v5.8h, v1.16b 1706 uxtl v0.8h, v6.8b 1707 uxtl2 v1.8h, v6.16b 1708 uxtl v16.8h, v7.8b 1709 uxtl2 v17.8h, v7.16b 1710 mul v2.8h, v2.8h, v8.h[0] 1711 mul v3.8h, v3.8h, v8.h[0] 1712 mul v4.8h, v4.8h, v8.h[0] 1713 mul v5.8h, v5.8h, v8.h[0] 1714 mul v0.8h, v0.8h, v8.h[1] 1715 mul v1.8h, v1.8h, v8.h[1] 1716 mul v16.8h, v16.8h, v8.h[1] 1717 mul v17.8h, v17.8h, v8.h[1] 1718 sqadd v2.8h, v2.8h, v0.8h 1719 sqadd v3.8h, v3.8h, v1.8h 1720 sqadd v4.8h, v4.8h, v16.8h 1721 sqadd v5.8h, v5.8h, v17.8h 1722 sshr v2.8h, v2.8h, #6 1723 sshr v3.8h, v3.8h, #6 1724 sshr v4.8h, v4.8h, #6 1725 sshr v5.8h, v5.8h, #6 1726 add v2.8h, v2.8h, v24.8h 1727 add v3.8h, v3.8h, v24.8h 1728 add v4.8h, v4.8h, v24.8h 1729 add v5.8h, v5.8h, v24.8h 1730 
sqxtun v0.8b, v2.8h 1731 sqxtun2 v0.16b, v3.8h 1732 sqxtun v1.8b, v4.8h 1733 sqxtun2 v1.16b, v5.8h 1734.endif 1735 1736 bl gather32_neon 1737 1738.if \ox 1739 smull v20.8h, v20.8b, v27.8b 1740 smlal v20.8h, v18.8b, v28.8b 1741.endif 1742 1743.if \oy 1744.if \ox 1745 smull v21.8h, v21.8b, v27.8b 1746 smlal v21.8h, v22.8b, v28.8b 1747 sqrshrn v20.8b, v20.8h, #5 1748 sqrshrn v21.8b, v21.8h, #5 1749.endif 1750 1751.if \ox 1752 smull v16.8h, v20.8b, v26.8b 1753.else 1754 smull v16.8h, v18.8b, v26.8b 1755.endif 1756 smull2 v17.8h, v18.16b, v26.16b 1757 smull v18.8h, v19.8b, v26.8b 1758 smull2 v19.8h, v19.16b, v26.16b 1759.if \ox 1760 smlal v16.8h, v21.8b, v25.8b 1761.else 1762 smlal v16.8h, v22.8b, v25.8b 1763.endif 1764 smlal2 v17.8h, v22.16b, v25.16b 1765 smlal v18.8h, v23.8b, v25.8b 1766 smlal2 v19.8h, v23.16b, v25.16b 1767 sqrshrn v22.8b, v16.8h, #5 1768 sqrshrn2 v22.16b, v17.8h, #5 1769 sqrshrn v23.8b, v18.8h, #5 1770 sqrshrn2 v23.16b, v19.8h, #5 1771.endif 1772 1773 // sxtl of grain 1774.if \oy 1775 sxtl v16.8h, v22.8b 1776 sxtl2 v17.8h, v22.16b 1777 sxtl v18.8h, v23.8b 1778 sxtl2 v19.8h, v23.16b 1779.elseif \ox 1780 sqrshrn v20.8b, v20.8h, #5 1781 sxtl2 v17.8h, v18.16b 1782 sxtl v18.8h, v19.8b 1783 sxtl2 v19.8h, v19.16b 1784 sxtl v16.8h, v20.8b 1785.else 1786 sxtl v16.8h, v18.8b 1787 sxtl2 v17.8h, v18.16b 1788 sxtl v18.8h, v19.8b 1789 sxtl2 v19.8h, v19.16b 1790.endif 1791 1792 uxtl v2.8h, v4.8b // scaling 1793 uxtl2 v3.8h, v4.16b 1794 uxtl v4.8h, v5.8b 1795 uxtl2 v5.8h, v5.16b 1796 1797 mul v16.8h, v16.8h, v2.8h // scaling * grain 1798 mul v17.8h, v17.8h, v3.8h 1799 mul v18.8h, v18.8h, v4.8h 1800 mul v19.8h, v19.8h, v5.8h 1801 1802 srshl v16.8h, v16.8h, v29.8h // round2(scaling * grain, scaling_shift) 1803 srshl v17.8h, v17.8h, v29.8h 1804 srshl v18.8h, v18.8h, v29.8h 1805 srshl v19.8h, v19.8h, v29.8h 1806 1807 uaddw v16.8h, v16.8h, v6.8b // *src + noise 1808 uaddw2 v17.8h, v17.8h, v6.16b 1809 uaddw v18.8h, v18.8h, v7.8b 1810 uaddw2 v19.8h, v19.8h, v7.16b 1811 
1812 sqxtun v0.8b, v16.8h 1813 sqxtun2 v0.16b, v17.8h 1814 sqxtun v1.8b, v18.8h 1815 sqxtun2 v1.16b, v19.8h 1816 1817 umax v0.16b, v0.16b, v30.16b 1818 umax v1.16b, v1.16b, v30.16b 1819 umin v0.16b, v0.16b, v31.16b 1820 umin v1.16b, v1.16b, v31.16b 1821 1822 subs w9, w9, #1 1823.if \oy 1824 dup v25.16b, v28.b[0] 1825 dup v26.16b, v28.b[1] 1826.endif 1827 st1 {v0.16b, v1.16b}, [x0], x2 // dst 1828 b.gt 1b 1829 1830.if \oy 1831 cmp w12, #0 1832 mov w9, w12 // restore actual remaining h 1833 b.gt L(fguv_loop_sx0_csfl\csfl\()_\ox\()0) 1834.endif 1835 b 9f 1836.endm 1837 fguv_loop_sx0 0, 0, 0 1838 fguv_loop_sx0 0, 0, 1 1839 fguv_loop_sx0 0, 1, 0 1840 fguv_loop_sx0 0, 1, 1 1841 fguv_loop_sx0 1, 0, 0 1842 fguv_loop_sx0 1, 0, 1 1843 fguv_loop_sx0 1, 1, 0 1844 fguv_loop_sx0 1, 1, 1 1845 18469: 1847 ldr d8, [sp, #16] 1848 ldr x30, [sp], #32 1849 AARCH64_VALIDATE_LINK_REGISTER 1850 ret 1851 1852L(fguv_loop_sx0_tbl): 1853 .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_00) 1854 .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_01) 1855 .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_10) 1856 .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_11) 1857 .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_00) 1858 .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_01) 1859 .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_10) 1860 .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_11) 1861endfunc 1862 1863function fguv_loop_sx1_neon 1864.macro fguv_loop_sx1 csfl, ox, oy 1865L(fguv_loop_sx1_csfl\csfl\()_\ox\oy): 1866 AARCH64_VALID_JUMP_TARGET 18671: 1868 ld1 {v0.16b, v1.16b}, [x6], x7 // luma 1869 ld1 {v6.16b}, [x1], x2 // src 1870.if \ox 1871 ld1 {v20.8b}, [x4], x10 // grain_lut old 1872.endif 1873.if \oy 1874 ld1 {v22.16b}, [x8], x10 // grain_lut top 1875.endif 1876.if \ox && \oy 1877 ld1 {v21.8b}, [x11], x10 // grain_lut top old 1878.endif 1879 ld1 {v18.16b}, [x5], x10 // grain_lut 1880 1881 uaddlp v2.8h, v0.16b 1882 uaddlp v3.8h, v1.16b 1883.if \csfl 1884 rshrn 
v0.8b, v2.8h, #1 1885 rshrn2 v0.16b, v3.8h, #1 1886.else 1887 urshr v2.8h, v2.8h, #1 1888 urshr v3.8h, v3.8h, #1 1889 uxtl v0.8h, v6.8b 1890 uxtl2 v1.8h, v6.16b 1891 mul v2.8h, v2.8h, v8.h[0] 1892 mul v3.8h, v3.8h, v8.h[0] 1893 mul v0.8h, v0.8h, v8.h[1] 1894 mul v1.8h, v1.8h, v8.h[1] 1895 sqadd v2.8h, v2.8h, v0.8h 1896 sqadd v3.8h, v3.8h, v1.8h 1897 sshr v2.8h, v2.8h, #6 1898 sshr v3.8h, v3.8h, #6 1899 add v2.8h, v2.8h, v24.8h 1900 add v3.8h, v3.8h, v24.8h 1901 sqxtun v0.8b, v2.8h 1902 sqxtun2 v0.16b, v3.8h 1903.endif 1904 1905 bl gather16_neon 1906 1907.if \ox 1908 smull v20.8h, v20.8b, v27.8b 1909 smlal v20.8h, v18.8b, v28.8b 1910.endif 1911 1912.if \oy 1913.if \ox 1914 smull v21.8h, v21.8b, v27.8b 1915 smlal v21.8h, v22.8b, v28.8b 1916 sqrshrn v20.8b, v20.8h, #5 1917 sqrshrn v21.8b, v21.8h, #5 1918.endif 1919 1920.if \ox 1921 smull v16.8h, v20.8b, v26.8b 1922.else 1923 smull v16.8h, v18.8b, v26.8b 1924.endif 1925 smull2 v17.8h, v18.16b, v26.16b 1926.if \ox 1927 smlal v16.8h, v21.8b, v25.8b 1928.else 1929 smlal v16.8h, v22.8b, v25.8b 1930.endif 1931 smlal2 v17.8h, v22.16b, v25.16b 1932 sqrshrn v22.8b, v16.8h, #5 1933 sqrshrn2 v22.16b, v17.8h, #5 1934.endif 1935 1936 // sxtl of grain 1937.if \oy 1938 sxtl v16.8h, v22.8b 1939 sxtl2 v17.8h, v22.16b 1940.elseif \ox 1941 sqrshrn v20.8b, v20.8h, #5 1942 sxtl2 v17.8h, v18.16b 1943 sxtl v16.8h, v20.8b 1944.else 1945 sxtl v16.8h, v18.8b 1946 sxtl2 v17.8h, v18.16b 1947.endif 1948 1949 uxtl v2.8h, v4.8b // scaling 1950 uxtl2 v3.8h, v4.16b 1951 1952 mul v16.8h, v16.8h, v2.8h // scaling * grain 1953 mul v17.8h, v17.8h, v3.8h 1954 1955 srshl v16.8h, v16.8h, v29.8h // round2(scaling * grain, scaling_shift) 1956 srshl v17.8h, v17.8h, v29.8h 1957 1958 uaddw v16.8h, v16.8h, v6.8b // *src + noise 1959 uaddw2 v17.8h, v17.8h, v6.16b 1960 1961 sqxtun v0.8b, v16.8h 1962 sqxtun2 v0.16b, v17.8h 1963 1964 umax v0.16b, v0.16b, v30.16b 1965 umin v0.16b, v0.16b, v31.16b 1966 1967.if \oy 1968 mov v16.16b, v25.16b 1969.endif 1970 subs w9, w9, 
#1 1971.if \oy 1972 mov v25.16b, v26.16b 1973 mov v26.16b, v16.16b 1974.endif 1975 st1 {v0.16b}, [x0], x2 // dst 1976 b.gt 1b 1977 1978.if \oy 1979 cmp w12, #0 1980 mov w9, w12 // restore actual remaining h 1981 b.gt L(fguv_loop_sx1_csfl\csfl\()_\ox\()0) 1982.endif 1983 1984 b 9f 1985.endm 1986 fguv_loop_sx1 0, 0, 0 1987 fguv_loop_sx1 0, 0, 1 1988 fguv_loop_sx1 0, 1, 0 1989 fguv_loop_sx1 0, 1, 1 1990 fguv_loop_sx1 1, 0, 0 1991 fguv_loop_sx1 1, 0, 1 1992 fguv_loop_sx1 1, 1, 0 1993 fguv_loop_sx1 1, 1, 1 1994 19959: 1996 ldr d8, [sp, #16] 1997 ldr x30, [sp], #32 1998 AARCH64_VALIDATE_LINK_REGISTER 1999 ret 2000 2001L(fguv_loop_sx1_tbl): 2002 .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_00) 2003 .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_01) 2004 .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_10) 2005 .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_11) 2006 .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_00) 2007 .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_01) 2008 .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_10) 2009 .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_11) 2010endfunc 2011