/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2018, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"

const right_ext_mask_buf
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
right_ext_mask:
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
endconst

// void dav1d_wiener_filter7_8bpc_neon(pixel *p, const ptrdiff_t stride,
//                                     const pixel (*left)[4], const pixel *lpf,
//                                     const int w, int h,
//                                     const int16_t filter[2][8],
//                                     const enum LrEdgeFlags edges);
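//
// Informal overview of the code below: the horizontally filtered rows are
// kept as 16 bit intermediates in a stack buffer with a pitch of 384*2 bytes
// per row (sub_sp 384*2*6); x9-x15 hold the row pointers commented as t6-t0.
// wiener_filter7_h_8bpc_neon filters one input row horizontally into the row
// at x14 (t1), wiener_filter7_v_8bpc_neon filters the buffered rows
// vertically into one output row at x0, and wiener_filter7_hv_8bpc_neon does
// both while rotating the row pointers, which is what the main loop uses.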
function wiener_filter7_8bpc_neon, export=1
        AARCH64_SIGN_LINK_REGISTER
        stp             x29, x30, [sp, #-16]!
        mov             x29, sp
        ld1             {v0.8h, v1.8h}, [x6]
        tst             w7, #4 // LR_HAVE_TOP
        sub_sp          384*2*6

        mov             w17, #(1 << 14) - (1 << 2)
        dup             v30.8h, w17
        movi            v31.8h, #8, lsl #8

        // x9  - t6
        // x10 - t5
        // x11 - t4
        // x12 - t3
        // x13 - t2
        // x14 - t1
        // x15 - t0
        mov             x14, sp // t1
        b.eq            L(no_top_7)

        mov             x16, x2 // backup left
        mov             x2, #0
        bl              wiener_filter7_h_8bpc_neon
        add             x3, x3, x1 // lpf += stride
        mov             x9, x14 // t6
        mov             x10, x14 // t5
        add             x14, x14, #384*2 // t1 += 384*2
        bl              wiener_filter7_h_8bpc_neon
        add             x3, x3, x1, lsl #2
        add             x3, x3, x1 // lpf += stride*5
        mov             x11, x14 // t4
        add             x14, x14, #384*2 // t1 += 384*2
        mov             x2, x16 // left
        mov             x16, x3 // backup lpf
        mov             x3, x0 // lpf = p
        bl              wiener_filter7_h_8bpc_neon
        subs            w5, w5, #1 // h--
        mov             x12, x14 // t3
        mov             x13, x14 // t2
        b.eq            L(v1_7)
        add             x3, x3, x1 // src += stride
        add             x14, x14, #384*2 // t1 += 384*2
        bl              wiener_filter7_h_8bpc_neon
        mov             x13, x14 // t2
        subs            w5, w5, #1 // h--
        b.eq            L(v2_7)
        add             x3, x3, x1 // src += stride
        add             x14, x14, #384*2 // t1 += 384*2
        bl              wiener_filter7_h_8bpc_neon
        subs            w5, w5, #1 // h--
        b.eq            L(v3_7)
        add             x3, x3, x1 // src += stride

L(main_7):
        add             x15, x14, #384*2 // t0 = t1 + 384*2
L(main_loop_7):
        bl              wiener_filter7_hv_8bpc_neon
        subs            w5, w5, #1 // h--
        b.ne            L(main_loop_7)
        tst             w7, #8 // LR_HAVE_BOTTOM
        b.eq            L(v3_7)

        mov             x3, x16 // restore lpf
        mov             x2, #0 // left = NULL
        bl              wiener_filter7_hv_8bpc_neon
        bl              wiener_filter7_hv_8bpc_neon
L(v1_7):
        bl              wiener_filter7_v_8bpc_neon

        mov             sp, x29
        ldp             x29, x30, [sp], #16
        AARCH64_VALIDATE_LINK_REGISTER
        ret

L(no_top_7):
        add             x3, x3, x1, lsl #2
        add             x16, x3, x1, lsl #1 // lpf += stride*6, backup
        mov             x3, x0 // lpf = p

        bl              wiener_filter7_h_8bpc_neon
        subs            w5, w5, #1 // h--
        mov             x9, x14 // t6
        mov             x10, x14 // t5
        mov             x11, x14 // t4
        mov             x12, x14 // t3
        mov             x13, x14 // t2
        b.eq            L(v1_7)
        add             x3, x3, x1 // src += stride
        add             x14, x14, #384*2 // t1 += 384*2
        bl              wiener_filter7_h_8bpc_neon
        subs            w5, w5, #1 // h--
        mov             x13, x14 // t2
        b.eq            L(v2_7)
        add             x3, x3, x1 // src += stride
        add             x14, x14, #384*2 // t1 += 384*2
        bl              wiener_filter7_h_8bpc_neon
        subs            w5, w5, #1 // h--
        b.eq            L(v3_7)
        add             x3, x3, x1 // src += stride
        add             x15, x14, #384*2 // t0 = t1 + 384*2
        bl              wiener_filter7_hv_8bpc_neon
        subs            w5, w5, #1 // h--
        b.eq            L(v3_7)
        add             x15, x15, #384*2*4 // t0 += 384*2*4
        bl              wiener_filter7_hv_8bpc_neon
        subs            w5, w5, #1 // h--
        b.ne            L(main_7)
L(v3_7):
        bl              wiener_filter7_v_8bpc_neon
L(v2_7):
        bl              wiener_filter7_v_8bpc_neon
        b               L(v1_7)
endfunc


function wiener_filter7_h_8bpc_neon
        stp             x3, x4, [sp, #-32]!
        str             x14, [sp, #16]

        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
        tst             w7, #1 // LR_HAVE_LEFT
        b.eq            1f
        // LR_HAVE_LEFT
        cbnz            x2, 0f
        // left == NULL
        sub             x3, x3, #3
        ld1             {v3.16b}, [x3], #16
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        ld1             {v3.16b}, [x3], #16
        ld1             {v2.s}[3], [x2], #4
        // Move x3 back to account for the last 3 bytes we loaded earlier,
        // which we'll shift out.
        sub             x3, x3, #3
        ext             v3.16b, v2.16b, v3.16b, #13
        b               2f

1:
        ld1             {v3.16b}, [x3], #16
        // !LR_HAVE_LEFT, fill v2 with the leftmost byte
        // and shift v3 to have 3x the first byte at the front.
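        // The ext below keeps the top 3 bytes of v2 (all copies of the first
        // input byte) in front of the first 13 bytes of v3, i.e. a 3 pixel
        // extension on the left.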
        dup             v2.16b, v3.b[0]
        // Move x3 back to account for the last 3 bytes we loaded before,
        // which we shifted out.
        sub             x3, x3, #3
        ext             v3.16b, v2.16b, v3.16b, #13

2:
        ld1             {v4.8b}, [x3], #8
        uxtl            v2.8h, v3.8b
        uxtl2           v3.8h, v3.16b
        uxtl            v4.8h, v4.8b

        tst             w7, #2 // LR_HAVE_RIGHT
        b.ne            4f

3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             w4, #19
        b.ge            4f // If w >= 19, all used input pixels are valid

        // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie
        // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel.
        sub             w17, w4, #22
        // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the
        // buffer pointer.
        movrel          x6, right_ext_mask, -6
        ldr             b28, [x3, w17, sxtw]
        sub             x6, x6, w4, uxtw #1
        dup             v28.8h, v28.h[0]
        ld1             {v25.16b, v26.16b, v27.16b}, [x6]

        bit             v2.16b, v28.16b, v25.16b
        bit             v3.16b, v28.16b, v26.16b
        bit             v4.16b, v28.16b, v27.16b

4:      // Loop horizontally
        // Interleaving the mul/mla chains actually hurts performance
        // significantly on Cortex A53, thus keeping mul/mla tightly
        // chained like this.
        ext             v17.16b, v2.16b, v3.16b, #4
        ext             v19.16b, v2.16b, v3.16b, #8
        ext             v16.16b, v2.16b, v3.16b, #2
        ext             v20.16b, v2.16b, v3.16b, #10
        ext             v21.16b, v2.16b, v3.16b, #12
        ext             v18.16b, v2.16b, v3.16b, #6
        add             v19.8h, v19.8h, v17.8h
        add             v20.8h, v20.8h, v16.8h
        add             v21.8h, v21.8h, v2.8h
        shl             v22.8h, v18.8h, #7
        mul             v6.8h, v18.8h, v0.h[3]
        mla             v6.8h, v19.8h, v0.h[4]
        mla             v6.8h, v20.8h, v0.h[5]
        mla             v6.8h, v21.8h, v0.h[6]

        ext             v17.16b, v3.16b, v4.16b, #4
        ext             v19.16b, v3.16b, v4.16b, #8
        ext             v16.16b, v3.16b, v4.16b, #2
        ext             v20.16b, v3.16b, v4.16b, #10
        ext             v21.16b, v3.16b, v4.16b, #12
        ext             v18.16b, v3.16b, v4.16b, #6

        add             v19.8h, v19.8h, v17.8h
        add             v20.8h, v20.8h, v16.8h
        add             v21.8h, v21.8h, v3.8h
        shl             v23.8h, v18.8h, #7
        mul             v7.8h, v18.8h, v0.h[3]
        mla             v7.8h, v19.8h, v0.h[4]
        mla             v7.8h, v20.8h, v0.h[5]
        mla             v7.8h, v21.8h, v0.h[6]

        sub             v22.8h, v22.8h, v30.8h
        sub             v23.8h, v23.8h, v30.8h
        sqadd           v6.8h, v6.8h, v22.8h
        sqadd           v7.8h, v7.8h, v23.8h
        sshr            v6.8h, v6.8h, #3
        sshr            v7.8h, v7.8h, #3
        add             v6.8h, v6.8h, v31.8h
        add             v7.8h, v7.8h, v31.8h

        subs            w4, w4, #16

        st1             {v6.8h, v7.8h}, [x14], #32

        b.le            0f
        mov             v2.16b, v4.16b
        ld1             {v4.16b}, [x3], #16
        tst             w7, #2 // LR_HAVE_RIGHT
        uxtl            v3.8h, v4.8b
        uxtl2           v4.8h, v4.16b
        b.ne            4b // If we don't need to pad, just keep filtering.
        b               3b // If we need to pad, check how many pixels we have left.

0:
        ldr             x14, [sp, #16]
        ldp             x3, x4, [sp], #32
        ret
endfunc

function wiener_filter7_v_8bpc_neon
        // Backing up/restoring registers shifted, so that x9 gets the value
        // of x10, etc, afterwards.
        stp             x10, x11, [sp, #-64]!
        stp             x12, x13, [sp, #16]
        stp             x14, x14, [sp, #32]
        stp             x0, x4, [sp, #48]
1:
        ld1             {v20.8h, v21.8h}, [x11], #32
        ld1             {v24.8h, v25.8h}, [x13], #32

        ld1             {v18.8h, v19.8h}, [x10], #32
        add             v24.8h, v24.8h, v20.8h
        ld1             {v26.8h, v27.8h}, [x14], #32

        ld1             {v16.8h, v17.8h}, [x9], #32
        add             v28.8h, v26.8h, v18.8h
        ld1             {v22.8h, v23.8h}, [x12], #32

        add             v16.8h, v26.8h, v16.8h
        add             v25.8h, v25.8h, v21.8h

        smull           v2.4s, v22.4h, v1.h[3]
        smlal           v2.4s, v24.4h, v1.h[4]
        smlal           v2.4s, v28.4h, v1.h[5]
        smlal           v2.4s, v16.4h, v1.h[6]
        add             v29.8h, v27.8h, v19.8h
        smull2          v3.4s, v22.8h, v1.h[3]
        smlal2          v3.4s, v24.8h, v1.h[4]
        smlal2          v3.4s, v28.8h, v1.h[5]
        smlal2          v3.4s, v16.8h, v1.h[6]
        add             v17.8h, v27.8h, v17.8h
        smull           v4.4s, v23.4h, v1.h[3]
        smlal           v4.4s, v25.4h, v1.h[4]
        smlal           v4.4s, v29.4h, v1.h[5]
        smlal           v4.4s, v17.4h, v1.h[6]
        smull2          v5.4s, v23.8h, v1.h[3]
        smlal2          v5.4s, v25.8h, v1.h[4]
        smlal2          v5.4s, v29.8h, v1.h[5]
        smlal2          v5.4s, v17.8h, v1.h[6]
        sqrshrun        v2.4h, v2.4s, #11
        sqrshrun2       v2.8h, v3.4s, #11
        sqrshrun        v3.4h, v4.4s, #11
        sqrshrun2       v3.8h, v5.4s, #11
        sqxtun          v2.8b, v2.8h
        sqxtun2         v2.16b, v3.8h
        subs            w4, w4, #16
        st1             {v2.16b}, [x0], #16
        b.gt            1b

        ldp             x0, x4, [sp, #48]
        ldp             x13, x14, [sp, #32]
        ldp             x11, x12, [sp, #16]
        ldp             x9, x10, [sp], #64

        add             x0, x0, x1
        ret
endfunc

function wiener_filter7_hv_8bpc_neon
        // Backing up/restoring registers shifted, so that x9 gets the value
        // of x10, etc, and x15==x9, afterwards.
        stp             x10, x11, [sp, #-80]!
        stp             x12, x13, [sp, #16]
        stp             x14, x15, [sp, #32]
        stp             x10, x0, [sp, #48]
        stp             x3, x4, [sp, #64]

        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
        tst             w7, #1 // LR_HAVE_LEFT
        b.eq            1f
        // LR_HAVE_LEFT
        cbnz            x2, 0f
        // left == NULL
        sub             x3, x3, #3
        ld1             {v3.16b}, [x3], #16
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        ld1             {v3.16b}, [x3], #16
        ld1             {v2.s}[3], [x2], #4
        // Move x3 back to account for the last 3 bytes we loaded earlier,
        // which we'll shift out.
        sub             x3, x3, #3
        ext             v3.16b, v2.16b, v3.16b, #13
        b               2f
1:
        ld1             {v3.16b}, [x3], #16
        // !LR_HAVE_LEFT, fill v2 with the leftmost byte
        // and shift v3 to have 3x the first byte at the front.
        dup             v2.16b, v3.b[0]
        // Move x3 back to account for the last 3 bytes we loaded before,
        // which we shifted out.
        sub             x3, x3, #3
        ext             v3.16b, v2.16b, v3.16b, #13

2:
        ld1             {v4.8b}, [x3], #8
        uxtl            v2.8h, v3.8b
        uxtl2           v3.8h, v3.16b
        uxtl            v4.8h, v4.8b

        tst             w7, #2 // LR_HAVE_RIGHT
        b.ne            4f

3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             w4, #19
        b.ge            4f // If w >= 19, all used input pixels are valid

        // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie
        // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel.
        sub             w17, w4, #22
        // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the
        // buffer pointer.
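        // For example, with w == 5 the padding pixel is h[7] (= x3[-17]), and
        // the mask below is loaded from right_ext_mask - 6 - 2*5, so its 0xff
        // bytes start at byte 16 of the 48 byte window, i.e. at h[8] == h[w+3];
        // the bit instructions then replace h[w+3..23] with the padding pixel
        // and leave h[0..w+2] untouched.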
        movrel          x6, right_ext_mask, -6
        ldr             b28, [x3, w17, sxtw]
        sub             x6, x6, w4, uxtw #1
        dup             v28.8h, v28.h[0]
        ld1             {v25.16b, v26.16b, v27.16b}, [x6]

        bit             v2.16b, v28.16b, v25.16b
        bit             v3.16b, v28.16b, v26.16b
        bit             v4.16b, v28.16b, v27.16b

4:      // Loop horizontally
        ext             v17.16b, v2.16b, v3.16b, #4
        ext             v19.16b, v2.16b, v3.16b, #8
        ext             v16.16b, v2.16b, v3.16b, #2
        ext             v20.16b, v2.16b, v3.16b, #10
        ext             v21.16b, v2.16b, v3.16b, #12
        ext             v18.16b, v2.16b, v3.16b, #6
        add             v19.8h, v19.8h, v17.8h
        add             v20.8h, v20.8h, v16.8h
        add             v21.8h, v21.8h, v2.8h
        shl             v22.8h, v18.8h, #7
        mul             v6.8h, v18.8h, v0.h[3]
        mla             v6.8h, v19.8h, v0.h[4]
        mla             v6.8h, v20.8h, v0.h[5]
        mla             v6.8h, v21.8h, v0.h[6]

        ext             v17.16b, v3.16b, v4.16b, #4
        ext             v19.16b, v3.16b, v4.16b, #8
        ext             v16.16b, v3.16b, v4.16b, #2
        ext             v20.16b, v3.16b, v4.16b, #10
        ext             v21.16b, v3.16b, v4.16b, #12
        ext             v18.16b, v3.16b, v4.16b, #6

        add             v19.8h, v19.8h, v17.8h
        add             v20.8h, v20.8h, v16.8h
        add             v21.8h, v21.8h, v3.8h
        shl             v23.8h, v18.8h, #7
        mul             v7.8h, v18.8h, v0.h[3]
        mla             v7.8h, v19.8h, v0.h[4]
        mla             v7.8h, v20.8h, v0.h[5]
        mla             v7.8h, v21.8h, v0.h[6]

        ld1             {v20.8h, v21.8h}, [x11], #32

        sub             v22.8h, v22.8h, v30.8h
        sub             v23.8h, v23.8h, v30.8h
        ld1             {v26.8h, v27.8h}, [x13], #32
        sqadd           v6.8h, v6.8h, v22.8h
        sqadd           v7.8h, v7.8h, v23.8h
        ld1             {v18.8h, v19.8h}, [x10], #32
        sshr            v6.8h, v6.8h, #3
        sshr            v7.8h, v7.8h, #3
        ld1             {v28.8h, v29.8h}, [x14], #32
        add             v6.8h, v6.8h, v31.8h
        add             v7.8h, v7.8h, v31.8h

        ld1             {v16.8h, v17.8h}, [x9], #32
        add             v26.8h, v20.8h, v26.8h

        ld1             {v24.8h, v25.8h}, [x12], #32
        add             v28.8h, v18.8h, v28.8h

        add             v16.8h, v16.8h, v6.8h
        add             v27.8h, v21.8h, v27.8h

        smull           v18.4s, v24.4h, v1.h[3]
        smlal           v18.4s, v26.4h, v1.h[4]
        smlal           v18.4s, v28.4h, v1.h[5]
        smlal           v18.4s, v16.4h, v1.h[6]
        add             v29.8h, v19.8h, v29.8h
        smull2          v19.4s, v24.8h, v1.h[3]
        smlal2          v19.4s, v26.8h, v1.h[4]
        smlal2          v19.4s, v28.8h, v1.h[5]
        smlal2          v19.4s, v16.8h, v1.h[6]
        add             v17.8h, v17.8h, v7.8h
        smull           v20.4s, v25.4h, v1.h[3]
        smlal           v20.4s, v27.4h, v1.h[4]
        smlal           v20.4s, v29.4h, v1.h[5]
        smlal           v20.4s, v17.4h, v1.h[6]
        smull2          v21.4s, v25.8h, v1.h[3]
        smlal2          v21.4s, v27.8h, v1.h[4]
        smlal2          v21.4s, v29.8h, v1.h[5]
        smlal2          v21.4s, v17.8h, v1.h[6]
        sqrshrun        v18.4h, v18.4s, #11
        sqrshrun2       v18.8h, v19.4s, #11
        sqrshrun        v19.4h, v20.4s, #11
        sqrshrun2       v19.8h, v21.4s, #11
        st1             {v6.8h, v7.8h}, [x15], #32
        sqxtun          v18.8b, v18.8h
        sqxtun2         v18.16b, v19.8h
        subs            w4, w4, #16

        st1             {v18.16b}, [x0], #16

        b.le            0f
        mov             v2.16b, v4.16b
        ld1             {v4.16b}, [x3], #16
        tst             w7, #2 // LR_HAVE_RIGHT
        uxtl            v3.8h, v4.8b
        uxtl2           v4.8h, v4.16b
        b.ne            4b // If we don't need to pad, just keep filtering.
        b               3b // If we need to pad, check how many pixels we have left.

0:
        ldp             x3, x4, [sp, #64]
        ldp             x15, x0, [sp, #48]
        ldp             x13, x14, [sp, #32]
        ldp             x11, x12, [sp, #16]
        ldp             x9, x10, [sp], #80

        add             x3, x3, x1
        add             x0, x0, x1

        ret
endfunc

// void dav1d_wiener_filter5_8bpc_neon(pixel *p, const ptrdiff_t stride,
//                                     const pixel (*left)[4], const pixel *lpf,
//                                     const int w, int h,
//                                     const int16_t filter[2][8],
//                                     const enum LrEdgeFlags edges);
function wiener_filter5_8bpc_neon, export=1
        AARCH64_SIGN_LINK_REGISTER
        stp             x29, x30, [sp, #-16]!
        mov             x29, sp
        ld1             {v0.8h, v1.8h}, [x6]
        tst             w7, #4 // LR_HAVE_TOP
        sub_sp          384*2*4

        mov             w17, #(1 << 14) - (1 << 2)
        dup             v30.8h, w17
        movi            v31.8h, #8, lsl #8

        // x11 - t4
        // x12 - t3
        // x13 - t2
        // x14 - t1
        // x15 - t0
        mov             x14, sp // t1
        b.eq            L(no_top_5)

        mov             x16, x2 // backup left
        mov             x2, #0
        bl              wiener_filter5_h_8bpc_neon
        add             x3, x3, x1 // lpf += stride
        mov             x11, x14 // t4
        add             x14, x14, #384*2 // t1 += 384*2
        bl              wiener_filter5_h_8bpc_neon
        add             x3, x3, x1, lsl #2
        add             x3, x3, x1 // lpf += stride*5
        mov             x12, x14 // t3
        add             x14, x14, #384*2 // t1 += 384*2
        mov             x2, x16 // left
        mov             x16, x3 // backup lpf
        mov             x3, x0 // lpf = p
        bl              wiener_filter5_h_8bpc_neon
        subs            w5, w5, #1 // h--
        mov             x13, x14 // t2
        b.eq            L(v1_5)
        add             x3, x3, x1 // src += stride
        add             x14, x14, #384*2 // t1 += 384*2
        bl              wiener_filter5_h_8bpc_neon
        subs            w5, w5, #1 // h--
        b.eq            L(v2_5)
        add             x3, x3, x1 // src += stride

L(main_5):
        mov             x15, x11 // t0 = t4
L(main_loop_5):
        bl              wiener_filter5_hv_8bpc_neon
        subs            w5, w5, #1 // h--
        b.ne            L(main_loop_5)
        tst             w7, #8 // LR_HAVE_BOTTOM
        b.eq            L(v2_5)

        mov             x3, x16 // restore lpf
        mov             x2, #0 // left = NULL
        bl              wiener_filter5_hv_8bpc_neon
        bl              wiener_filter5_hv_8bpc_neon
L(end_5):

        mov             sp, x29
        ldp             x29, x30, [sp], #16
        AARCH64_VALIDATE_LINK_REGISTER
        ret

L(no_top_5):
        add             x3, x3, x1, lsl #2
        add             x16, x3, x1, lsl #1 // lpf += stride*6, backup
        mov             x3, x0 // lpf = p

        bl              wiener_filter5_h_8bpc_neon
        subs            w5, w5, #1 // h--
        mov             x11, x14 // t4
        mov             x12, x14 // t3
        mov             x13, x14 // t2
        b.eq            L(v1_5)
        add             x3, x3, x1 // src += stride
        add             x14, x14, #384*2 // t1 += 384*2
        bl              wiener_filter5_h_8bpc_neon
        subs            w5, w5, #1 // h--
        b.eq            L(v2_5)
        add             x3, x3, x1 // src += stride
        add             x15, x14, #384*2 // t0 = t1 + 384*2
        bl              wiener_filter5_hv_8bpc_neon
        subs            w5, w5, #1 // h--
        b.eq            L(v2_5)
        add             x15, x15, #384*2*3 // t0 += 384*2*3
        bl              wiener_filter5_hv_8bpc_neon
        subs            w5, w5, #1 // h--
        b.ne            L(main_5)
L(v2_5):
        bl              wiener_filter5_v_8bpc_neon
        add             x0, x0, x1
        mov             x11, x12
        mov             x12, x13
        mov             x13, x14
L(v1_5):
        bl              wiener_filter5_v_8bpc_neon
        b               L(end_5)
endfunc


function wiener_filter5_h_8bpc_neon
        stp             x3, x4, [sp, #-32]!
        str             x14, [sp, #16]

        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
        tst             w7, #1 // LR_HAVE_LEFT
        b.eq            1f
        // LR_HAVE_LEFT
        cbnz            x2, 0f
        // left == NULL
        sub             x3, x3, #2
        ld1             {v3.16b}, [x3], #16
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        ld1             {v3.16b}, [x3], #16
        ld1             {v2.s}[3], [x2], #4
        // Move x3 back to account for the last 2 bytes we loaded earlier,
        // which we'll shift out.
        sub             x3, x3, #2
        ext             v3.16b, v2.16b, v3.16b, #14
        b               2f

1:
        ld1             {v3.16b}, [x3], #16
        // !LR_HAVE_LEFT, fill v2 with the leftmost byte
        // and shift v3 to have 2x the first byte at the front.
        dup             v2.16b, v3.b[0]
        // Move x3 back to account for the last 2 bytes we loaded before,
        // which we shifted out.
        sub             x3, x3, #2
        ext             v3.16b, v2.16b, v3.16b, #14

2:
        ld1             {v4.8b}, [x3], #8
        uxtl            v2.8h, v3.8b
        uxtl2           v3.8h, v3.16b
        uxtl            v4.8h, v4.8b

        tst             w7, #2 // LR_HAVE_RIGHT
        b.ne            4f

3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             w4, #18
        b.ge            4f // If w >= 18, all used input pixels are valid

        // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // The padding pixel is v2/3/4.h[w+1]. x3 points at the next input, ie
        // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel.
        sub             w17, w4, #23
        // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the
        // buffer pointer.
        movrel          x6, right_ext_mask, -4
        ldr             b28, [x3, w17, sxtw]
        sub             x6, x6, w4, uxtw #1
        dup             v28.8h, v28.h[0]
        ld1             {v25.16b, v26.16b, v27.16b}, [x6]

        bit             v2.16b, v28.16b, v25.16b
        bit             v3.16b, v28.16b, v26.16b
        bit             v4.16b, v28.16b, v27.16b

4:      // Loop horizontally
        // Interleaving the mul/mla chains actually hurts performance
        // significantly on Cortex A53, thus keeping mul/mla tightly
        // chained like this.
        ext             v16.16b, v2.16b, v3.16b, #2
        ext             v18.16b, v2.16b, v3.16b, #6
        ext             v19.16b, v2.16b, v3.16b, #8
        ext             v17.16b, v2.16b, v3.16b, #4
        add             v18.8h, v18.8h, v16.8h
        add             v19.8h, v19.8h, v2.8h
        shl             v22.8h, v17.8h, #7
        mul             v6.8h, v17.8h, v0.h[3]
        mla             v6.8h, v18.8h, v0.h[4]
        mla             v6.8h, v19.8h, v0.h[5]

        ext             v16.16b, v3.16b, v4.16b, #2
        ext             v18.16b, v3.16b, v4.16b, #6
        ext             v19.16b, v3.16b, v4.16b, #8
        ext             v17.16b, v3.16b, v4.16b, #4
        add             v18.8h, v18.8h, v16.8h
        add             v19.8h, v19.8h, v3.8h
        shl             v23.8h, v17.8h, #7
        mul             v7.8h, v17.8h, v0.h[3]
        mla             v7.8h, v18.8h, v0.h[4]
        mla             v7.8h, v19.8h, v0.h[5]

        sub             v22.8h, v22.8h, v30.8h
        sub             v23.8h, v23.8h, v30.8h
        sqadd           v6.8h, v6.8h, v22.8h
        sqadd           v7.8h, v7.8h, v23.8h
        sshr            v6.8h, v6.8h, #3
        sshr            v7.8h, v7.8h, #3
        add             v6.8h, v6.8h, v31.8h
        add             v7.8h, v7.8h, v31.8h

        subs            w4, w4, #16

        st1             {v6.8h, v7.8h}, [x14], #32

        b.le            0f
        mov             v2.16b, v4.16b
        ld1             {v4.16b}, [x3], #16
        tst             w7, #2 // LR_HAVE_RIGHT
        uxtl            v3.8h, v4.8b
        uxtl2           v4.8h, v4.16b
        b.ne            4b // If we don't need to pad, just keep filtering.
        b               3b // If we need to pad, check how many pixels we have left.

0:
        ldr             x14, [sp, #16]
        ldp             x3, x4, [sp], #32
        ret
endfunc

function wiener_filter5_v_8bpc_neon
        stp             x11, x12, [sp, #-48]!
        stp             x13, x14, [sp, #16]
        stp             x0, x4, [sp, #32]
1:
        ld1             {v18.8h, v19.8h}, [x12], #32
        ld1             {v22.8h, v23.8h}, [x14], #32
        ld1             {v16.8h, v17.8h}, [x11], #32

        add             v24.8h, v22.8h, v18.8h
        ld1             {v20.8h, v21.8h}, [x13], #32
        add             v16.8h, v22.8h, v16.8h
        add             v25.8h, v23.8h, v19.8h

        smull           v2.4s, v20.4h, v1.h[3]
        smlal           v2.4s, v24.4h, v1.h[4]
        smlal           v2.4s, v16.4h, v1.h[5]
        add             v17.8h, v23.8h, v17.8h
        smull2          v3.4s, v20.8h, v1.h[3]
        smlal2          v3.4s, v24.8h, v1.h[4]
        smlal2          v3.4s, v16.8h, v1.h[5]
        smull           v4.4s, v21.4h, v1.h[3]
        smlal           v4.4s, v25.4h, v1.h[4]
        smlal           v4.4s, v17.4h, v1.h[5]
        smull2          v5.4s, v21.8h, v1.h[3]
        smlal2          v5.4s, v25.8h, v1.h[4]
        smlal2          v5.4s, v17.8h, v1.h[5]
        sqrshrun        v2.4h, v2.4s, #11
        sqrshrun2       v2.8h, v3.4s, #11
        sqrshrun        v3.4h, v4.4s, #11
        sqrshrun2       v3.8h, v5.4s, #11
        sqxtun          v2.8b, v2.8h
        sqxtun2         v2.16b, v3.8h
        subs            w4, w4, #16
        st1             {v2.16b}, [x0], #16
        b.gt            1b

        ldp             x0, x4, [sp, #32]
        ldp             x13, x14, [sp, #16]
        ldp             x11, x12, [sp], #48

        ret
endfunc

function wiener_filter5_hv_8bpc_neon
        // Backing up/restoring registers shifted, so that x11 gets the value
        // of x12, etc, and x15==x11, afterwards.
        stp             x12, x13, [sp, #-64]!
        stp             x14, x15, [sp, #16]
        stp             x12, x0, [sp, #32]
        stp             x3, x4, [sp, #48]

        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
        tst             w7, #1 // LR_HAVE_LEFT
        b.eq            1f
        // LR_HAVE_LEFT
        cbnz            x2, 0f
        // left == NULL
        sub             x3, x3, #2
        ld1             {v3.16b}, [x3], #16
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        ld1             {v3.16b}, [x3], #16
        ld1             {v2.s}[3], [x2], #4
        // Move x3 back to account for the last 2 bytes we loaded earlier,
        // which we'll shift out.
        sub             x3, x3, #2
        ext             v3.16b, v2.16b, v3.16b, #14
        b               2f
1:
        ld1             {v3.16b}, [x3], #16
        // !LR_HAVE_LEFT, fill v2 with the leftmost byte
        // and shift v3 to have 2x the first byte at the front.
        dup             v2.16b, v3.b[0]
        // Move x3 back to account for the last 2 bytes we loaded before,
        // which we shifted out.
        sub             x3, x3, #2
        ext             v3.16b, v2.16b, v3.16b, #14

2:
        ld1             {v4.8b}, [x3], #8
        uxtl            v2.8h, v3.8b
        uxtl2           v3.8h, v3.16b
        uxtl            v4.8h, v4.8b

        tst             w7, #2 // LR_HAVE_RIGHT
        b.ne            4f

3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             w4, #18
        b.ge            4f // If w >= 18, all used input pixels are valid

        // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // The padding pixel is v2/3/4.h[w+1]. x3 points at the next input, ie
        // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel.
        sub             w17, w4, #23
        // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the
        // buffer pointer.
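        // Compared with the 7-tap variant above, the mask pointer is offset
        // by -4 rather than -6, as only 2 context pixels are carried on each
        // side here, so the padding starts at h[w+2] instead of h[w+3].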
        movrel          x6, right_ext_mask, -4
        ldr             b28, [x3, w17, sxtw]
        sub             x6, x6, w4, uxtw #1
        dup             v28.8h, v28.h[0]
        ld1             {v25.16b, v26.16b, v27.16b}, [x6]

        bit             v2.16b, v28.16b, v25.16b
        bit             v3.16b, v28.16b, v26.16b
        bit             v4.16b, v28.16b, v27.16b

4:      // Loop horizontally

        ext             v16.16b, v2.16b, v3.16b, #2
        ext             v18.16b, v2.16b, v3.16b, #6
        ext             v19.16b, v2.16b, v3.16b, #8
        ext             v17.16b, v2.16b, v3.16b, #4
        add             v18.8h, v18.8h, v16.8h
        add             v19.8h, v19.8h, v2.8h
        shl             v22.8h, v17.8h, #7
        mul             v6.8h, v17.8h, v0.h[3]
        mla             v6.8h, v18.8h, v0.h[4]
        mla             v6.8h, v19.8h, v0.h[5]

        ext             v16.16b, v3.16b, v4.16b, #2
        ext             v18.16b, v3.16b, v4.16b, #6
        ext             v19.16b, v3.16b, v4.16b, #8
        ext             v17.16b, v3.16b, v4.16b, #4
        add             v18.8h, v18.8h, v16.8h
        add             v19.8h, v19.8h, v3.8h
        shl             v23.8h, v17.8h, #7
        mul             v7.8h, v17.8h, v0.h[3]
        mla             v7.8h, v18.8h, v0.h[4]
        mla             v7.8h, v19.8h, v0.h[5]

        ld1             {v18.8h, v19.8h}, [x12], #32

        sub             v22.8h, v22.8h, v30.8h
        sub             v23.8h, v23.8h, v30.8h
        ld1             {v24.8h, v25.8h}, [x14], #32
        sqadd           v6.8h, v6.8h, v22.8h
        sqadd           v7.8h, v7.8h, v23.8h
        ld1             {v16.8h, v17.8h}, [x11], #32
        sshr            v6.8h, v6.8h, #3
        sshr            v7.8h, v7.8h, #3
        ld1             {v20.8h, v21.8h}, [x13], #32
        add             v6.8h, v6.8h, v31.8h
        add             v7.8h, v7.8h, v31.8h

        add             v24.8h, v24.8h, v18.8h
        add             v16.8h, v16.8h, v6.8h

        smull           v18.4s, v20.4h, v1.h[3]
        smlal           v18.4s, v24.4h, v1.h[4]
        smlal           v18.4s, v16.4h, v1.h[5]
        add             v25.8h, v25.8h, v19.8h
        smull2          v19.4s, v20.8h, v1.h[3]
        smlal2          v19.4s, v24.8h, v1.h[4]
        smlal2          v19.4s, v16.8h, v1.h[5]
        add             v17.8h, v17.8h, v7.8h
        smull           v20.4s, v21.4h, v1.h[3]
        smlal           v20.4s, v25.4h, v1.h[4]
        smlal           v20.4s, v17.4h, v1.h[5]
        smull2          v21.4s, v21.8h, v1.h[3]
        smlal2          v21.4s, v25.8h, v1.h[4]
        smlal2          v21.4s, v17.8h, v1.h[5]
        sqrshrun        v18.4h, v18.4s, #11
        sqrshrun2       v18.8h, v19.4s, #11
        sqrshrun        v19.4h, v20.4s, #11
        sqrshrun2       v19.8h, v21.4s, #11
        st1             {v6.8h, v7.8h}, [x15], #32
        sqxtun          v18.8b, v18.8h
        sqxtun2         v18.16b, v19.8h
        subs            w4, w4, #16

        st1             {v18.16b}, [x0], #16

        b.le            0f
        mov             v2.16b, v4.16b
        ld1             {v4.16b}, [x3], #16
        tst             w7, #2 // LR_HAVE_RIGHT
        uxtl            v3.8h, v4.8b
        uxtl2           v4.8h, v4.16b
        b.ne            4b // If we don't need to pad, just keep filtering.
        b               3b // If we need to pad, check how many pixels we have left.

0:
        ldp             x3, x4, [sp, #48]
        ldp             x15, x0, [sp, #32]
        ldp             x13, x14, [sp, #16]
        ldp             x11, x12, [sp], #64

        add             x3, x3, x1
        add             x0, x0, x1

        ret
endfunc

#include "looprestoration_tmpl.S"

// void dav1d_sgr_box3_row_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
//                                     const pixel (*left)[4],
//                                     const pixel *src, const int w,
//                                     const enum LrEdgeFlags edges);
function sgr_box3_row_h_8bpc_neon, export=1
        add             w4, w4, #2 // w += 2

        tst             w5, #1 // LR_HAVE_LEFT
        b.eq            1f
        cbnz            x2, 0f

        // LR_HAVE_LEFT && left == NULL
        sub             x3, x3, #2
        ld1             {v0.16b}, [x3], #16
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        ld1             {v0.16b}, [x3], #16
        ld1             {v1.s}[3], [x2]
        // Move x3 back to account for the last 2 bytes we loaded earlier,
        // which we'll shift out.
        sub             x3, x3, #2
        ext             v0.16b, v1.16b, v0.16b, #14
        b               2f

1:
        ld1             {v0.16b}, [x3], #16
        // !LR_HAVE_LEFT, fill v1 with the leftmost byte
        // and shift v0 to have 2x the first byte at the front.
        dup             v1.16b, v0.b[0]
        // Move x3 back to account for the last 2 bytes we loaded before,
        // which we shifted out.
        sub             x3, x3, #2
        ext             v0.16b, v1.16b, v0.16b, #14

2:
        umull           v1.8h, v0.8b, v0.8b
        umull2          v2.8h, v0.16b, v0.16b

        tst             w5, #2 // LR_HAVE_RIGHT
        b.ne            4f
        // If we'll need to pad the right edge, load that byte to pad with
        // here since we can find it pretty easily from here.
        sub             w13, w4, #(2 + 16 - 2 + 1)
        ldr             b30, [x3, w13, sxtw]
        // Fill v30 with the right padding pixel
        dup             v30.16b, v30.b[0]
3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             w4, #10
        b.ge            4f // If w >= 10, all used input pixels are valid

        // 1 <= w < 10, w pixels valid in v0. For w=9, this ends up called
        // again; it's not strictly needed in those cases (we pad enough here),
        // but keeping the code as simple as possible.

        // Insert padding in v0.b[w] onwards
        movrel          x13, right_ext_mask
        sub             x13, x13, w4, uxtw
        ld1             {v29.16b}, [x13]

        bit             v0.16b, v30.16b, v29.16b

        // Update the precalculated squares
        umull           v1.8h, v0.8b, v0.8b
        umull2          v2.8h, v0.16b, v0.16b

4:      // Loop horizontally
        ext             v16.16b, v0.16b, v0.16b, #1
        ext             v17.16b, v0.16b, v0.16b, #2
        uaddl           v3.8h, v0.8b, v16.8b
        ext             v20.16b, v1.16b, v2.16b, #2
        uaddw           v3.8h, v3.8h, v17.8b

        ext             v21.16b, v1.16b, v2.16b, #4

        uaddl           v26.4s, v1.4h, v20.4h
        uaddl2          v27.4s, v1.8h, v20.8h
        uaddw           v26.4s, v26.4s, v21.4h
        uaddw2          v27.4s, v27.4s, v21.8h

        subs            w4, w4, #8

        st1             {v3.8h}, [x1], #16
        st1             {v26.4s, v27.4s}, [x0], #32

        b.le            9f
        tst             w5, #2 // LR_HAVE_RIGHT
        ld1             {v3.8b}, [x3], #8
        mov             v1.16b, v2.16b
        ext             v0.16b, v0.16b, v3.16b, #8
        umull           v2.8h, v3.8b, v3.8b

        b.ne            4b // If we don't need to pad, just keep summing.
        b               3b // If we need to pad, check how many pixels we have left.

9:
        ret
endfunc

// void dav1d_sgr_box5_row_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
//                                     const pixel (*left)[4],
//                                     const pixel *src, const int w,
//                                     const enum LrEdgeFlags edges);
function sgr_box5_row_h_8bpc_neon, export=1
        add             w4, w4, #2 // w += 2

        tst             w5, #1 // LR_HAVE_LEFT
        b.eq            1f
        cbnz            x2, 0f

        // LR_HAVE_LEFT && left == NULL
        sub             x3, x3, #3
        ld1             {v0.16b}, [x3], #16
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        ld1             {v0.16b}, [x3], #16
        ld1             {v1.s}[3], [x2], #4
        // Move x3 back to account for the last 3 bytes we loaded earlier,
        // which we'll shift out.
        sub             x3, x3, #3
        ext             v0.16b, v1.16b, v0.16b, #13
        b               2f

1:
        ld1             {v0.16b}, [x3], #16
        // !LR_HAVE_LEFT, fill v1 with the leftmost byte
        // and shift v0 to have 3x the first byte at the front.
        dup             v1.16b, v0.b[0]
        // Move x3 back to account for the last 3 bytes we loaded before,
        // which we shifted out.
        sub             x3, x3, #3
        ext             v0.16b, v1.16b, v0.16b, #13

2:
        umull           v1.8h, v0.8b, v0.8b
        umull2          v2.8h, v0.16b, v0.16b

        tst             w5, #2 // LR_HAVE_RIGHT
        b.ne            4f
        // If we'll need to pad the right edge, load that byte to pad with
        // here since we can find it pretty easily from here.
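        // The offset below is w - 2 - 16 + 3 - 1: undo the w += 2 at the top,
        // step back over the 16 bytes already loaded, forward again by the
        // 3 byte left shift, and subtract 1 to index the last pixel, so that
        // x3 + w13 points at the last valid input byte.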
        sub             w13, w4, #(2 + 16 - 3 + 1)
        ldr             b30, [x3, w13, sxtw]
        // Fill v30 with the right padding pixel
        dup             v30.16b, v30.b[0]
3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             w4, #11
        b.ge            4f // If w >= 11, all used input pixels are valid

        // 1 <= w < 11, w+1 pixels valid in v0. For w=9 or w=10,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // Insert padding in v0.b[w+1] onwards; fuse the +1 into the
        // buffer pointer.
        movrel          x13, right_ext_mask, -1
        sub             x13, x13, w4, uxtw
        ld1             {v29.16b}, [x13]

        bit             v0.16b, v30.16b, v29.16b

        // Update the precalculated squares
        umull           v1.8h, v0.8b, v0.8b
        umull2          v2.8h, v0.16b, v0.16b

4:      // Loop horizontally
        ext             v16.16b, v0.16b, v0.16b, #1
        ext             v17.16b, v0.16b, v0.16b, #2
        ext             v18.16b, v0.16b, v0.16b, #3
        ext             v19.16b, v0.16b, v0.16b, #4
        uaddl           v3.8h, v0.8b, v16.8b
        uaddl           v24.8h, v17.8b, v18.8b
        uaddw           v3.8h, v3.8h, v19.8b
        add             v3.8h, v3.8h, v24.8h

        ext             v16.16b, v1.16b, v2.16b, #2
        ext             v17.16b, v1.16b, v2.16b, #4
        ext             v18.16b, v1.16b, v2.16b, #6
        ext             v19.16b, v1.16b, v2.16b, #8

        uaddl           v26.4s, v1.4h, v16.4h
        uaddl2          v27.4s, v1.8h, v16.8h
        uaddl           v16.4s, v17.4h, v18.4h
        uaddl2          v17.4s, v17.8h, v18.8h
        uaddw           v26.4s, v26.4s, v19.4h
        uaddw2          v27.4s, v27.4s, v19.8h
        add             v26.4s, v26.4s, v16.4s
        add             v27.4s, v27.4s, v17.4s

        subs            w4, w4, #8

        st1             {v3.8h}, [x1], #16
        st1             {v26.4s, v27.4s}, [x0], #32

        b.le            9f
        tst             w5, #2 // LR_HAVE_RIGHT
        ld1             {v3.8b}, [x3], #8
        mov             v1.16b, v2.16b
        ext             v0.16b, v0.16b, v3.16b, #8
        umull           v2.8h, v3.8b, v3.8b

        b.ne            4b // If we don't need to pad, just keep summing.
        b               3b // If we need to pad, check how many pixels we have left.

9:
        ret
endfunc

// void dav1d_sgr_box35_row_h_8bpc_neon(int32_t *sumsq3, int16_t *sum3,
//                                      int32_t *sumsq5, int16_t *sum5,
//                                      const pixel (*left)[4],
//                                      const pixel *src, const int w,
//                                      const enum LrEdgeFlags edges);
function sgr_box35_row_h_8bpc_neon, export=1
        add             w6, w6, #2 // w += 2

        tst             w7, #1 // LR_HAVE_LEFT
        b.eq            1f
        cbnz            x4, 0f

        // LR_HAVE_LEFT && left == NULL
        sub             x5, x5, #3
        ld1             {v0.16b}, [x5], #16
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        ld1             {v0.16b}, [x5], #16
        ld1             {v1.s}[3], [x4], #4
        // Move x5 back to account for the last 3 bytes we loaded earlier,
        // which we'll shift out.
        sub             x5, x5, #3
        ext             v0.16b, v1.16b, v0.16b, #13
        b               2f

1:
        ld1             {v0.16b}, [x5], #16
        // !LR_HAVE_LEFT, fill v1 with the leftmost byte
        // and shift v0 to have 3x the first byte at the front.
        dup             v1.16b, v0.b[0]
        // Move x5 back to account for the last 3 bytes we loaded before,
        // which we shifted out.
        sub             x5, x5, #3
        ext             v0.16b, v1.16b, v0.16b, #13

2:
        umull           v1.8h, v0.8b, v0.8b
        umull2          v2.8h, v0.16b, v0.16b

        tst             w7, #2 // LR_HAVE_RIGHT
        b.ne            4f
        // If we'll need to pad the right edge, load that byte to pad with
        // here since we can find it pretty easily from here.
        sub             w13, w6, #(2 + 16 - 3 + 1)
        ldr             b30, [x5, w13, sxtw]
        // Fill v30 with the right padding pixel
        dup             v30.16b, v30.b[0]
3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             w6, #11
        b.ge            4f // If w >= 11, all used input pixels are valid

        // 1 <= w < 11, w+1 pixels valid in v0. For w=9 or w=10,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // Insert padding in v0.b[w+1] onwards; fuse the +1 into the
        // buffer pointer.
        movrel          x13, right_ext_mask, -1
        sub             x13, x13, w6, uxtw
        ld1             {v29.16b}, [x13]

        bit             v0.16b, v30.16b, v29.16b

        // Update the precalculated squares
        umull           v1.8h, v0.8b, v0.8b
        umull2          v2.8h, v0.16b, v0.16b

4:      // Loop horizontally
        ext             v16.16b, v0.16b, v0.16b, #1
        ext             v17.16b, v0.16b, v0.16b, #2
        ext             v19.16b, v0.16b, v0.16b, #4
        ext             v18.16b, v0.16b, v0.16b, #3
        uaddl           v3.8h, v16.8b, v17.8b
        uaddl           v24.8h, v0.8b, v19.8b
        uaddw           v3.8h, v3.8h, v18.8b

        ext             v16.16b, v1.16b, v2.16b, #2
        ext             v17.16b, v1.16b, v2.16b, #4
        ext             v19.16b, v1.16b, v2.16b, #8
        ext             v18.16b, v1.16b, v2.16b, #6

        st1             {v3.8h}, [x1], #16
        add             v3.8h, v3.8h, v24.8h

        uaddl           v26.4s, v16.4h, v17.4h
        uaddl2          v27.4s, v16.8h, v17.8h
        uaddl           v16.4s, v1.4h, v19.4h
        uaddl2          v17.4s, v1.8h, v19.8h
        uaddw           v26.4s, v26.4s, v18.4h
        uaddw2          v27.4s, v27.4s, v18.8h

        st1             {v26.4s, v27.4s}, [x0], #32
        add             v26.4s, v26.4s, v16.4s
        add             v27.4s, v27.4s, v17.4s

        subs            w6, w6, #8

        st1             {v3.8h}, [x3], #16
        st1             {v26.4s, v27.4s}, [x2], #32

        b.le            9f
        tst             w7, #2 // LR_HAVE_RIGHT
        ld1             {v3.8b}, [x5], #8
        mov             v1.16b, v2.16b
        ext             v0.16b, v0.16b, v3.16b, #8
        umull           v2.8h, v3.8b, v3.8b

        b.ne            4b // If we don't need to pad, just keep summing.
        b               3b // If we need to pad, check how many pixels we have left.

9:
        ret
endfunc

sgr_funcs 8