/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2019, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */ 27 28#include "src/arm/asm.S" 29#include "util.S" 30 31// void ipred_dc_128_16bpc_neon(pixel *dst, const ptrdiff_t stride, 32// const pixel *const topleft, 33// const int width, const int height, const int a, 34// const int max_width, const int max_height, 35// const int bitdepth_max); 36function ipred_dc_128_16bpc_neon, export=1 37 ldr w8, [sp] 38 clz w3, w3 39 adr x5, L(ipred_dc_128_tbl) 40 sub w3, w3, #25 41 ldrh w3, [x5, w3, uxtw #1] 42 dup v0.8h, w8 43 sub x5, x5, w3, uxtw 44 add x6, x0, x1 45 lsl x1, x1, #1 46 urshr v0.8h, v0.8h, #1 47 br x5 484: 49 AARCH64_VALID_JUMP_TARGET 50 st1 {v0.4h}, [x0], x1 51 st1 {v0.4h}, [x6], x1 52 subs w4, w4, #4 53 st1 {v0.4h}, [x0], x1 54 st1 {v0.4h}, [x6], x1 55 b.gt 4b 56 ret 578: 58 AARCH64_VALID_JUMP_TARGET 59 st1 {v0.8h}, [x0], x1 60 st1 {v0.8h}, [x6], x1 61 subs w4, w4, #4 62 st1 {v0.8h}, [x0], x1 63 st1 {v0.8h}, [x6], x1 64 b.gt 8b 65 ret 66160: 67 AARCH64_VALID_JUMP_TARGET 68 mov v1.16b, v0.16b 6916: 70 st1 {v0.8h, v1.8h}, [x0], x1 71 st1 {v0.8h, v1.8h}, [x6], x1 72 subs w4, w4, #4 73 st1 {v0.8h, v1.8h}, [x0], x1 74 st1 {v0.8h, v1.8h}, [x6], x1 75 b.gt 16b 76 ret 77320: 78 AARCH64_VALID_JUMP_TARGET 79 mov v1.16b, v0.16b 80 mov v2.16b, v0.16b 81 mov v3.16b, v0.16b 8232: 83 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 84 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 85 subs w4, w4, #4 86 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 87 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 88 b.gt 32b 89 ret 90640: 91 AARCH64_VALID_JUMP_TARGET 92 mov v1.16b, v0.16b 93 mov v2.16b, v0.16b 94 mov v3.16b, v0.16b 95 sub x1, x1, #64 9664: 97 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 98 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 99 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 100 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 101 subs w4, w4, #4 102 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 103 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 104 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 105 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 106 b.gt 64b 
107 ret 108 109L(ipred_dc_128_tbl): 110 .hword L(ipred_dc_128_tbl) - 640b 111 .hword L(ipred_dc_128_tbl) - 320b 112 .hword L(ipred_dc_128_tbl) - 160b 113 .hword L(ipred_dc_128_tbl) - 8b 114 .hword L(ipred_dc_128_tbl) - 4b 115endfunc 116 117// void ipred_v_16bpc_neon(pixel *dst, const ptrdiff_t stride, 118// const pixel *const topleft, 119// const int width, const int height, const int a, 120// const int max_width, const int max_height); 121function ipred_v_16bpc_neon, export=1 122 clz w3, w3 123 adr x5, L(ipred_v_tbl) 124 sub w3, w3, #25 125 ldrh w3, [x5, w3, uxtw #1] 126 add x2, x2, #2 127 sub x5, x5, w3, uxtw 128 add x6, x0, x1 129 lsl x1, x1, #1 130 br x5 13140: 132 AARCH64_VALID_JUMP_TARGET 133 ld1 {v0.4h}, [x2] 1344: 135 st1 {v0.4h}, [x0], x1 136 st1 {v0.4h}, [x6], x1 137 subs w4, w4, #4 138 st1 {v0.4h}, [x0], x1 139 st1 {v0.4h}, [x6], x1 140 b.gt 4b 141 ret 14280: 143 AARCH64_VALID_JUMP_TARGET 144 ld1 {v0.8h}, [x2] 1458: 146 st1 {v0.8h}, [x0], x1 147 st1 {v0.8h}, [x6], x1 148 subs w4, w4, #4 149 st1 {v0.8h}, [x0], x1 150 st1 {v0.8h}, [x6], x1 151 b.gt 8b 152 ret 153160: 154 AARCH64_VALID_JUMP_TARGET 155 ld1 {v0.8h, v1.8h}, [x2] 15616: 157 st1 {v0.8h, v1.8h}, [x0], x1 158 st1 {v0.8h, v1.8h}, [x6], x1 159 subs w4, w4, #4 160 st1 {v0.8h, v1.8h}, [x0], x1 161 st1 {v0.8h, v1.8h}, [x6], x1 162 b.gt 16b 163 ret 164320: 165 AARCH64_VALID_JUMP_TARGET 166 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2] 16732: 168 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 169 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 170 subs w4, w4, #4 171 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 172 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 173 b.gt 32b 174 ret 175640: 176 AARCH64_VALID_JUMP_TARGET 177 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64 178 sub x1, x1, #64 179 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2] 18064: 181 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 182 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 183 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1 184 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1 185 
subs w4, w4, #4 186 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 187 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 188 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1 189 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1 190 b.gt 64b 191 ret 192 193L(ipred_v_tbl): 194 .hword L(ipred_v_tbl) - 640b 195 .hword L(ipred_v_tbl) - 320b 196 .hword L(ipred_v_tbl) - 160b 197 .hword L(ipred_v_tbl) - 80b 198 .hword L(ipred_v_tbl) - 40b 199endfunc 200 201// void ipred_h_16bpc_neon(pixel *dst, const ptrdiff_t stride, 202// const pixel *const topleft, 203// const int width, const int height, const int a, 204// const int max_width, const int max_height); 205function ipred_h_16bpc_neon, export=1 206 clz w3, w3 207 adr x5, L(ipred_h_tbl) 208 sub w3, w3, #25 209 ldrh w3, [x5, w3, uxtw #1] 210 sub x2, x2, #8 211 sub x5, x5, w3, uxtw 212 mov x7, #-8 213 add x6, x0, x1 214 lsl x1, x1, #1 215 br x5 2164: 217 AARCH64_VALID_JUMP_TARGET 218 ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 219 st1 {v3.4h}, [x0], x1 220 st1 {v2.4h}, [x6], x1 221 subs w4, w4, #4 222 st1 {v1.4h}, [x0], x1 223 st1 {v0.4h}, [x6], x1 224 b.gt 4b 225 ret 2268: 227 AARCH64_VALID_JUMP_TARGET 228 ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 229 st1 {v3.8h}, [x0], x1 230 st1 {v2.8h}, [x6], x1 231 subs w4, w4, #4 232 st1 {v1.8h}, [x0], x1 233 st1 {v0.8h}, [x6], x1 234 b.gt 8b 235 ret 23616: 237 AARCH64_VALID_JUMP_TARGET 238 ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 239 str q3, [x0, #16] 240 str q2, [x6, #16] 241 st1 {v3.8h}, [x0], x1 242 st1 {v2.8h}, [x6], x1 243 subs w4, w4, #4 244 str q1, [x0, #16] 245 str q0, [x6, #16] 246 st1 {v1.8h}, [x0], x1 247 st1 {v0.8h}, [x6], x1 248 b.gt 16b 249 ret 25032: 251 AARCH64_VALID_JUMP_TARGET 252 ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 253 str q3, [x0, #16] 254 str q2, [x6, #16] 255 stp q3, q3, [x0, #32] 256 stp q2, q2, [x6, #32] 257 st1 {v3.8h}, [x0], x1 258 st1 {v2.8h}, [x6], x1 259 subs w4, w4, #4 260 str q1, [x0, #16] 261 str q0, [x6, #16] 262 stp q1, q1, [x0, #32] 263 stp q0, q0, [x6, #32] 
264 st1 {v1.8h}, [x0], x1 265 st1 {v0.8h}, [x6], x1 266 b.gt 32b 267 ret 26864: 269 AARCH64_VALID_JUMP_TARGET 270 ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 271 str q3, [x0, #16] 272 str q2, [x6, #16] 273 stp q3, q3, [x0, #32] 274 stp q2, q2, [x6, #32] 275 stp q3, q3, [x0, #64] 276 stp q2, q2, [x6, #64] 277 stp q3, q3, [x0, #96] 278 stp q2, q2, [x6, #96] 279 st1 {v3.8h}, [x0], x1 280 st1 {v2.8h}, [x6], x1 281 subs w4, w4, #4 282 str q1, [x0, #16] 283 str q0, [x6, #16] 284 stp q1, q1, [x0, #32] 285 stp q0, q0, [x6, #32] 286 stp q1, q1, [x0, #64] 287 stp q0, q0, [x6, #64] 288 stp q1, q1, [x0, #96] 289 stp q0, q0, [x6, #96] 290 st1 {v1.8h}, [x0], x1 291 st1 {v0.8h}, [x6], x1 292 b.gt 64b 293 ret 294 295L(ipred_h_tbl): 296 .hword L(ipred_h_tbl) - 64b 297 .hword L(ipred_h_tbl) - 32b 298 .hword L(ipred_h_tbl) - 16b 299 .hword L(ipred_h_tbl) - 8b 300 .hword L(ipred_h_tbl) - 4b 301endfunc 302 303// void ipred_dc_top_16bpc_neon(pixel *dst, const ptrdiff_t stride, 304// const pixel *const topleft, 305// const int width, const int height, const int a, 306// const int max_width, const int max_height); 307function ipred_dc_top_16bpc_neon, export=1 308 clz w3, w3 309 adr x5, L(ipred_dc_top_tbl) 310 sub w3, w3, #25 311 ldrh w3, [x5, w3, uxtw #1] 312 add x2, x2, #2 313 sub x5, x5, w3, uxtw 314 add x6, x0, x1 315 lsl x1, x1, #1 316 br x5 31740: 318 AARCH64_VALID_JUMP_TARGET 319 ld1 {v0.4h}, [x2] 320 addv h0, v0.4h 321 urshr v0.4h, v0.4h, #2 322 dup v0.4h, v0.h[0] 3234: 324 st1 {v0.4h}, [x0], x1 325 st1 {v0.4h}, [x6], x1 326 subs w4, w4, #4 327 st1 {v0.4h}, [x0], x1 328 st1 {v0.4h}, [x6], x1 329 b.gt 4b 330 ret 33180: 332 AARCH64_VALID_JUMP_TARGET 333 ld1 {v0.8h}, [x2] 334 addv h0, v0.8h 335 urshr v0.4h, v0.4h, #3 336 dup v0.8h, v0.h[0] 3378: 338 st1 {v0.8h}, [x0], x1 339 st1 {v0.8h}, [x6], x1 340 subs w4, w4, #4 341 st1 {v0.8h}, [x0], x1 342 st1 {v0.8h}, [x6], x1 343 b.gt 8b 344 ret 345160: 346 AARCH64_VALID_JUMP_TARGET 347 ld1 {v0.8h, v1.8h}, [x2] 348 addp v0.8h, v0.8h, v1.8h 349 
addv h0, v0.8h 350 urshr v2.4h, v0.4h, #4 351 dup v0.8h, v2.h[0] 352 dup v1.8h, v2.h[0] 35316: 354 st1 {v0.8h, v1.8h}, [x0], x1 355 st1 {v0.8h, v1.8h}, [x6], x1 356 subs w4, w4, #4 357 st1 {v0.8h, v1.8h}, [x0], x1 358 st1 {v0.8h, v1.8h}, [x6], x1 359 b.gt 16b 360 ret 361320: 362 AARCH64_VALID_JUMP_TARGET 363 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2] 364 addp v0.8h, v0.8h, v1.8h 365 addp v2.8h, v2.8h, v3.8h 366 addp v0.8h, v0.8h, v2.8h 367 uaddlv s0, v0.8h 368 rshrn v4.4h, v0.4s, #5 369 dup v0.8h, v4.h[0] 370 dup v1.8h, v4.h[0] 371 dup v2.8h, v4.h[0] 372 dup v3.8h, v4.h[0] 37332: 374 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 375 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 376 subs w4, w4, #4 377 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 378 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 379 b.gt 32b 380 ret 381640: 382 AARCH64_VALID_JUMP_TARGET 383 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64 384 addp v0.8h, v0.8h, v1.8h 385 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2] 386 addp v2.8h, v2.8h, v3.8h 387 addp v4.8h, v4.8h, v5.8h 388 addp v6.8h, v6.8h, v7.8h 389 addp v0.8h, v0.8h, v2.8h 390 addp v4.8h, v4.8h, v6.8h 391 addp v0.8h, v0.8h, v4.8h 392 uaddlv s0, v0.8h 393 rshrn v4.4h, v0.4s, #6 394 sub x1, x1, #64 395 dup v0.8h, v4.h[0] 396 dup v1.8h, v4.h[0] 397 dup v2.8h, v4.h[0] 398 dup v3.8h, v4.h[0] 39964: 400 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 401 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 402 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 403 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 404 subs w4, w4, #4 405 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 406 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 407 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 408 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 409 b.gt 64b 410 ret 411 412L(ipred_dc_top_tbl): 413 .hword L(ipred_dc_top_tbl) - 640b 414 .hword L(ipred_dc_top_tbl) - 320b 415 .hword L(ipred_dc_top_tbl) - 160b 416 .hword L(ipred_dc_top_tbl) - 80b 417 .hword L(ipred_dc_top_tbl) - 40b 418endfunc 419 420// void 
ipred_dc_left_16bpc_neon(pixel *dst, const ptrdiff_t stride, 421// const pixel *const topleft, 422// const int width, const int height, const int a, 423// const int max_width, const int max_height); 424function ipred_dc_left_16bpc_neon, export=1 425 sub x2, x2, w4, uxtw #1 426 clz w3, w3 427 clz w7, w4 428 adr x5, L(ipred_dc_left_tbl) 429 sub w3, w3, #20 // 25 leading bits, minus table offset 5 430 sub w7, w7, #25 431 ldrh w3, [x5, w3, uxtw #1] 432 ldrh w7, [x5, w7, uxtw #1] 433 sub x3, x5, w3, uxtw 434 sub x5, x5, w7, uxtw 435 add x6, x0, x1 436 lsl x1, x1, #1 437 br x5 438 439L(ipred_dc_left_h4): 440 AARCH64_VALID_JUMP_TARGET 441 ld1 {v0.4h}, [x2] 442 addv h0, v0.4h 443 urshr v0.4h, v0.4h, #2 444 dup v0.8h, v0.h[0] 445 br x3 446L(ipred_dc_left_w4): 447 AARCH64_VALID_JUMP_TARGET 448 st1 {v0.4h}, [x0], x1 449 st1 {v0.4h}, [x6], x1 450 subs w4, w4, #4 451 st1 {v0.4h}, [x0], x1 452 st1 {v0.4h}, [x6], x1 453 b.gt L(ipred_dc_left_w4) 454 ret 455 456L(ipred_dc_left_h8): 457 AARCH64_VALID_JUMP_TARGET 458 ld1 {v0.8h}, [x2] 459 addv h0, v0.8h 460 urshr v0.4h, v0.4h, #3 461 dup v0.8h, v0.h[0] 462 br x3 463L(ipred_dc_left_w8): 464 AARCH64_VALID_JUMP_TARGET 465 st1 {v0.8h}, [x0], x1 466 st1 {v0.8h}, [x6], x1 467 subs w4, w4, #4 468 st1 {v0.8h}, [x0], x1 469 st1 {v0.8h}, [x6], x1 470 b.gt L(ipred_dc_left_w8) 471 ret 472 473L(ipred_dc_left_h16): 474 AARCH64_VALID_JUMP_TARGET 475 ld1 {v0.8h, v1.8h}, [x2] 476 addp v0.8h, v0.8h, v1.8h 477 addv h0, v0.8h 478 urshr v2.4h, v0.4h, #4 479 dup v0.8h, v2.h[0] 480 dup v1.8h, v2.h[0] 481 br x3 482L(ipred_dc_left_w16): 483 AARCH64_VALID_JUMP_TARGET 484 mov v1.16b, v0.16b 4851: 486 st1 {v0.8h, v1.8h}, [x0], x1 487 st1 {v0.8h, v1.8h}, [x6], x1 488 subs w4, w4, #4 489 st1 {v0.8h, v1.8h}, [x0], x1 490 st1 {v0.8h, v1.8h}, [x6], x1 491 b.gt 1b 492 ret 493 494L(ipred_dc_left_h32): 495 AARCH64_VALID_JUMP_TARGET 496 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2] 497 addp v0.8h, v0.8h, v1.8h 498 addp v2.8h, v2.8h, v3.8h 499 addp v0.8h, v0.8h, v2.8h 500 
uaddlp v0.4s, v0.8h 501 addv s0, v0.4s 502 rshrn v4.4h, v0.4s, #5 503 dup v0.8h, v4.h[0] 504 br x3 505L(ipred_dc_left_w32): 506 AARCH64_VALID_JUMP_TARGET 507 mov v1.16b, v0.16b 508 mov v2.16b, v0.16b 509 mov v3.16b, v0.16b 5101: 511 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 512 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 513 subs w4, w4, #4 514 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 515 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 516 b.gt 1b 517 ret 518 519L(ipred_dc_left_h64): 520 AARCH64_VALID_JUMP_TARGET 521 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64 522 addp v0.8h, v0.8h, v1.8h 523 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2] 524 addp v2.8h, v2.8h, v3.8h 525 addp v4.8h, v4.8h, v5.8h 526 addp v6.8h, v6.8h, v7.8h 527 addp v0.8h, v0.8h, v2.8h 528 addp v4.8h, v4.8h, v6.8h 529 addp v0.8h, v0.8h, v4.8h 530 uaddlv s0, v0.8h 531 rshrn v4.4h, v0.4s, #6 532 dup v0.8h, v4.h[0] 533 br x3 534L(ipred_dc_left_w64): 535 AARCH64_VALID_JUMP_TARGET 536 mov v1.16b, v0.16b 537 mov v2.16b, v0.16b 538 mov v3.16b, v0.16b 539 sub x1, x1, #64 5401: 541 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 542 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 543 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 544 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 545 subs w4, w4, #4 546 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 547 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 548 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 549 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 550 b.gt 1b 551 ret 552 553L(ipred_dc_left_tbl): 554 .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h64) 555 .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h32) 556 .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h16) 557 .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h8) 558 .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h4) 559 .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w64) 560 .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w32) 561 .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w16) 562 .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w8) 563 .hword 
L(ipred_dc_left_tbl) - L(ipred_dc_left_w4) 564endfunc 565 566// void ipred_dc_16bpc_neon(pixel *dst, const ptrdiff_t stride, 567// const pixel *const topleft, 568// const int width, const int height, const int a, 569// const int max_width, const int max_height); 570function ipred_dc_16bpc_neon, export=1 571 sub x2, x2, w4, uxtw #1 572 add w7, w3, w4 // width + height 573 clz w3, w3 574 clz w6, w4 575 dup v16.4s, w7 // width + height 576 adr x5, L(ipred_dc_tbl) 577 rbit w7, w7 // rbit(width + height) 578 sub w3, w3, #20 // 25 leading bits, minus table offset 5 579 sub w6, w6, #25 580 clz w7, w7 // ctz(width + height) 581 ldrh w3, [x5, w3, uxtw #1] 582 ldrh w6, [x5, w6, uxtw #1] 583 neg w7, w7 // -ctz(width + height) 584 sub x3, x5, w3, uxtw 585 sub x5, x5, w6, uxtw 586 ushr v16.4s, v16.4s, #1 // (width + height) >> 1 587 dup v17.4s, w7 // -ctz(width + height) 588 add x6, x0, x1 589 lsl x1, x1, #1 590 br x5 591 592L(ipred_dc_h4): 593 AARCH64_VALID_JUMP_TARGET 594 ld1 {v0.4h}, [x2], #8 595 uaddlv s0, v0.4h 596 add x2, x2, #2 597 br x3 598L(ipred_dc_w4): 599 AARCH64_VALID_JUMP_TARGET 600 ld1 {v1.4h}, [x2] 601 add v0.2s, v0.2s, v16.2s 602 uaddlv s1, v1.4h 603 cmp w4, #4 604 add v0.2s, v0.2s, v1.2s 605 ushl v0.2s, v0.2s, v17.2s 606 b.eq 1f 607 // h = 8/16 608 cmp w4, #16 609 mov w16, #0x6667 610 mov w17, #0xAAAB 611 csel w16, w16, w17, eq 612 dup v16.2s, w16 613 mul v0.2s, v0.2s, v16.2s 614 ushr v0.2s, v0.2s, #17 6151: 616 dup v0.4h, v0.h[0] 6172: 618 st1 {v0.4h}, [x0], x1 619 st1 {v0.4h}, [x6], x1 620 subs w4, w4, #4 621 st1 {v0.4h}, [x0], x1 622 st1 {v0.4h}, [x6], x1 623 b.gt 2b 624 ret 625 626L(ipred_dc_h8): 627 AARCH64_VALID_JUMP_TARGET 628 ld1 {v0.8h}, [x2], #16 629 uaddlv s0, v0.8h 630 add x2, x2, #2 631 br x3 632L(ipred_dc_w8): 633 AARCH64_VALID_JUMP_TARGET 634 ld1 {v1.8h}, [x2] 635 add v0.2s, v0.2s, v16.2s 636 uaddlv s1, v1.8h 637 cmp w4, #8 638 add v0.2s, v0.2s, v1.2s 639 ushl v0.2s, v0.2s, v17.2s 640 b.eq 1f 641 // h = 4/16/32 642 cmp w4, #32 643 mov w16, 
#0x6667 644 mov w17, #0xAAAB 645 csel w16, w16, w17, eq 646 dup v16.2s, w16 647 mul v0.2s, v0.2s, v16.2s 648 ushr v0.2s, v0.2s, #17 6491: 650 dup v0.8h, v0.h[0] 6512: 652 st1 {v0.8h}, [x0], x1 653 st1 {v0.8h}, [x6], x1 654 subs w4, w4, #4 655 st1 {v0.8h}, [x0], x1 656 st1 {v0.8h}, [x6], x1 657 b.gt 2b 658 ret 659 660L(ipred_dc_h16): 661 AARCH64_VALID_JUMP_TARGET 662 ld1 {v0.8h, v1.8h}, [x2], #32 663 addp v0.8h, v0.8h, v1.8h 664 add x2, x2, #2 665 uaddlv s0, v0.8h 666 br x3 667L(ipred_dc_w16): 668 AARCH64_VALID_JUMP_TARGET 669 ld1 {v1.8h, v2.8h}, [x2] 670 add v0.2s, v0.2s, v16.2s 671 addp v1.8h, v1.8h, v2.8h 672 uaddlv s1, v1.8h 673 cmp w4, #16 674 add v0.2s, v0.2s, v1.2s 675 ushl v4.2s, v0.2s, v17.2s 676 b.eq 1f 677 // h = 4/8/32/64 678 tst w4, #(32+16+8) // 16 added to make a consecutive bitmask 679 mov w16, #0x6667 680 mov w17, #0xAAAB 681 csel w16, w16, w17, eq 682 dup v16.2s, w16 683 mul v4.2s, v4.2s, v16.2s 684 ushr v4.2s, v4.2s, #17 6851: 686 dup v0.8h, v4.h[0] 687 dup v1.8h, v4.h[0] 6882: 689 st1 {v0.8h, v1.8h}, [x0], x1 690 st1 {v0.8h, v1.8h}, [x6], x1 691 subs w4, w4, #4 692 st1 {v0.8h, v1.8h}, [x0], x1 693 st1 {v0.8h, v1.8h}, [x6], x1 694 b.gt 2b 695 ret 696 697L(ipred_dc_h32): 698 AARCH64_VALID_JUMP_TARGET 699 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64 700 addp v0.8h, v0.8h, v1.8h 701 addp v2.8h, v2.8h, v3.8h 702 addp v0.8h, v0.8h, v2.8h 703 add x2, x2, #2 704 uaddlv s0, v0.8h 705 br x3 706L(ipred_dc_w32): 707 AARCH64_VALID_JUMP_TARGET 708 ld1 {v1.8h, v2.8h, v3.8h, v4.8h}, [x2] 709 add v0.2s, v0.2s, v16.2s 710 addp v1.8h, v1.8h, v2.8h 711 addp v3.8h, v3.8h, v4.8h 712 addp v1.8h, v1.8h, v3.8h 713 uaddlv s1, v1.8h 714 cmp w4, #32 715 add v0.2s, v0.2s, v1.2s 716 ushl v4.2s, v0.2s, v17.2s 717 b.eq 1f 718 // h = 8/16/64 719 cmp w4, #8 720 mov w16, #0x6667 721 mov w17, #0xAAAB 722 csel w16, w16, w17, eq 723 dup v16.2s, w16 724 mul v4.2s, v4.2s, v16.2s 725 ushr v4.2s, v4.2s, #17 7261: 727 dup v0.8h, v4.h[0] 728 dup v1.8h, v4.h[0] 729 dup v2.8h, v4.h[0] 730 dup 
v3.8h, v4.h[0] 7312: 732 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 733 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 734 subs w4, w4, #4 735 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 736 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 737 b.gt 2b 738 ret 739 740L(ipred_dc_h64): 741 AARCH64_VALID_JUMP_TARGET 742 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64 743 addp v0.8h, v0.8h, v1.8h 744 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64 745 addp v2.8h, v2.8h, v3.8h 746 addp v4.8h, v4.8h, v5.8h 747 addp v6.8h, v6.8h, v7.8h 748 addp v0.8h, v0.8h, v2.8h 749 addp v4.8h, v4.8h, v6.8h 750 addp v0.8h, v0.8h, v4.8h 751 add x2, x2, #2 752 uaddlv s0, v0.8h 753 br x3 754L(ipred_dc_w64): 755 AARCH64_VALID_JUMP_TARGET 756 ld1 {v1.8h, v2.8h, v3.8h, v4.8h}, [x2], #64 757 add v0.2s, v0.2s, v16.2s 758 addp v1.8h, v1.8h, v2.8h 759 ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x2] 760 addp v3.8h, v3.8h, v4.8h 761 addp v20.8h, v20.8h, v21.8h 762 addp v22.8h, v22.8h, v23.8h 763 addp v1.8h, v1.8h, v3.8h 764 addp v20.8h, v20.8h, v22.8h 765 addp v1.8h, v1.8h, v20.8h 766 uaddlv s1, v1.8h 767 cmp w4, #64 768 add v0.2s, v0.2s, v1.2s 769 ushl v4.2s, v0.2s, v17.2s 770 b.eq 1f 771 // h = 16/32 772 cmp w4, #16 773 mov w16, #0x6667 774 mov w17, #0xAAAB 775 csel w16, w16, w17, eq 776 dup v16.2s, w16 777 mul v4.2s, v4.2s, v16.2s 778 ushr v4.2s, v4.2s, #17 7791: 780 sub x1, x1, #64 781 dup v0.8h, v4.h[0] 782 dup v1.8h, v4.h[0] 783 dup v2.8h, v4.h[0] 784 dup v3.8h, v4.h[0] 7852: 786 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 787 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 788 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 789 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 790 subs w4, w4, #4 791 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 792 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 793 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 794 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 795 b.gt 2b 796 ret 797 798L(ipred_dc_tbl): 799 .hword L(ipred_dc_tbl) - L(ipred_dc_h64) 800 .hword L(ipred_dc_tbl) - L(ipred_dc_h32) 801 .hword 
L(ipred_dc_tbl) - L(ipred_dc_h16) 802 .hword L(ipred_dc_tbl) - L(ipred_dc_h8) 803 .hword L(ipred_dc_tbl) - L(ipred_dc_h4) 804 .hword L(ipred_dc_tbl) - L(ipred_dc_w64) 805 .hword L(ipred_dc_tbl) - L(ipred_dc_w32) 806 .hword L(ipred_dc_tbl) - L(ipred_dc_w16) 807 .hword L(ipred_dc_tbl) - L(ipred_dc_w8) 808 .hword L(ipred_dc_tbl) - L(ipred_dc_w4) 809endfunc 810 811// void ipred_paeth_16bpc_neon(pixel *dst, const ptrdiff_t stride, 812// const pixel *const topleft, 813// const int width, const int height, const int a, 814// const int max_width, const int max_height); 815function ipred_paeth_16bpc_neon, export=1 816 clz w9, w3 817 adr x5, L(ipred_paeth_tbl) 818 sub w9, w9, #25 819 ldrh w9, [x5, w9, uxtw #1] 820 ld1r {v4.8h}, [x2] 821 add x8, x2, #2 822 sub x2, x2, #8 823 sub x5, x5, w9, uxtw 824 mov x7, #-8 825 add x6, x0, x1 826 lsl x1, x1, #1 827 br x5 82840: 829 AARCH64_VALID_JUMP_TARGET 830 ld1r {v5.2d}, [x8] 831 sub v6.8h, v5.8h, v4.8h // top - topleft 8324: 833 ld4r {v0.4h, v1.4h, v2.4h, v3.4h}, [x2], x7 834 zip1 v0.2d, v0.2d, v1.2d 835 zip1 v2.2d, v2.2d, v3.2d 836 add v16.8h, v6.8h, v0.8h // base 837 add v17.8h, v6.8h, v2.8h 838 sabd v20.8h, v5.8h, v16.8h // tdiff 839 sabd v21.8h, v5.8h, v17.8h 840 sabd v22.8h, v4.8h, v16.8h // tldiff 841 sabd v23.8h, v4.8h, v17.8h 842 sabd v16.8h, v0.8h, v16.8h // ldiff 843 sabd v17.8h, v2.8h, v17.8h 844 umin v18.8h, v20.8h, v22.8h // min(tdiff, tldiff) 845 umin v19.8h, v21.8h, v23.8h 846 cmge v20.8h, v22.8h, v20.8h // tldiff >= tdiff 847 cmge v21.8h, v23.8h, v21.8h 848 cmge v16.8h, v18.8h, v16.8h // min(tdiff, tldiff) >= ldiff 849 cmge v17.8h, v19.8h, v17.8h 850 bsl v21.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft 851 bsl v20.16b, v5.16b, v4.16b 852 bit v21.16b, v2.16b, v17.16b // ldiff <= min ? left : ... 
853 bit v20.16b, v0.16b, v16.16b 854 st1 {v21.d}[1], [x0], x1 855 st1 {v21.d}[0], [x6], x1 856 subs w4, w4, #4 857 st1 {v20.d}[1], [x0], x1 858 st1 {v20.d}[0], [x6], x1 859 b.gt 4b 860 ret 86180: 862160: 863320: 864640: 865 AARCH64_VALID_JUMP_TARGET 866 ld1 {v5.8h}, [x8], #16 867 mov w9, w3 868 // Set up pointers for four rows in parallel; x0, x6, x5, x10 869 add x5, x0, x1 870 add x10, x6, x1 871 lsl x1, x1, #1 872 sub x1, x1, w3, uxtw #1 8731: 874 ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 8752: 876 sub v6.8h, v5.8h, v4.8h // top - topleft 877 add v16.8h, v6.8h, v0.8h // base 878 add v17.8h, v6.8h, v1.8h 879 add v18.8h, v6.8h, v2.8h 880 add v19.8h, v6.8h, v3.8h 881 sabd v20.8h, v5.8h, v16.8h // tdiff 882 sabd v21.8h, v5.8h, v17.8h 883 sabd v22.8h, v5.8h, v18.8h 884 sabd v23.8h, v5.8h, v19.8h 885 sabd v24.8h, v4.8h, v16.8h // tldiff 886 sabd v25.8h, v4.8h, v17.8h 887 sabd v26.8h, v4.8h, v18.8h 888 sabd v27.8h, v4.8h, v19.8h 889 sabd v16.8h, v0.8h, v16.8h // ldiff 890 sabd v17.8h, v1.8h, v17.8h 891 sabd v18.8h, v2.8h, v18.8h 892 sabd v19.8h, v3.8h, v19.8h 893 umin v28.8h, v20.8h, v24.8h // min(tdiff, tldiff) 894 umin v29.8h, v21.8h, v25.8h 895 umin v30.8h, v22.8h, v26.8h 896 umin v31.8h, v23.8h, v27.8h 897 cmge v20.8h, v24.8h, v20.8h // tldiff >= tdiff 898 cmge v21.8h, v25.8h, v21.8h 899 cmge v22.8h, v26.8h, v22.8h 900 cmge v23.8h, v27.8h, v23.8h 901 cmge v16.8h, v28.8h, v16.8h // min(tdiff, tldiff) >= ldiff 902 cmge v17.8h, v29.8h, v17.8h 903 cmge v18.8h, v30.8h, v18.8h 904 cmge v19.8h, v31.8h, v19.8h 905 bsl v23.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft 906 bsl v22.16b, v5.16b, v4.16b 907 bsl v21.16b, v5.16b, v4.16b 908 bsl v20.16b, v5.16b, v4.16b 909 bit v23.16b, v3.16b, v19.16b // ldiff <= min ? left : ... 
910 bit v22.16b, v2.16b, v18.16b 911 bit v21.16b, v1.16b, v17.16b 912 bit v20.16b, v0.16b, v16.16b 913 st1 {v23.8h}, [x0], #16 914 st1 {v22.8h}, [x6], #16 915 subs w3, w3, #8 916 st1 {v21.8h}, [x5], #16 917 st1 {v20.8h}, [x10], #16 918 b.le 8f 919 ld1 {v5.8h}, [x8], #16 920 b 2b 9218: 922 subs w4, w4, #4 923 b.le 9f 924 // End of horizontal loop, move pointers to next four rows 925 sub x8, x8, w9, uxtw #1 926 add x0, x0, x1 927 add x6, x6, x1 928 // Load the top row as early as possible 929 ld1 {v5.8h}, [x8], #16 930 add x5, x5, x1 931 add x10, x10, x1 932 mov w3, w9 933 b 1b 9349: 935 ret 936 937L(ipred_paeth_tbl): 938 .hword L(ipred_paeth_tbl) - 640b 939 .hword L(ipred_paeth_tbl) - 320b 940 .hword L(ipred_paeth_tbl) - 160b 941 .hword L(ipred_paeth_tbl) - 80b 942 .hword L(ipred_paeth_tbl) - 40b 943endfunc 944 945// void ipred_smooth_16bpc_neon(pixel *dst, const ptrdiff_t stride, 946// const pixel *const topleft, 947// const int width, const int height, const int a, 948// const int max_width, const int max_height); 949function ipred_smooth_16bpc_neon, export=1 950 movrel x10, X(sm_weights) 951 add x11, x10, w4, uxtw 952 add x10, x10, w3, uxtw 953 clz w9, w3 954 adr x5, L(ipred_smooth_tbl) 955 sub x12, x2, w4, uxtw #1 956 sub w9, w9, #25 957 ldrh w9, [x5, w9, uxtw #1] 958 ld1r {v4.8h}, [x12] // bottom 959 add x8, x2, #2 960 sub x5, x5, w9, uxtw 961 add x6, x0, x1 962 lsl x1, x1, #1 963 br x5 96440: 965 AARCH64_VALID_JUMP_TARGET 966 ld1r {v6.2d}, [x8] // top 967 ld1r {v7.2s}, [x10] // weights_hor 968 sub x2, x2, #8 969 mov x7, #-8 970 dup v5.8h, v6.h[3] // right 971 sub v6.8h, v6.8h, v4.8h // top-bottom 972 uxtl v7.8h, v7.8b // weights_hor 973 add v31.4h, v4.4h, v5.4h // bottom+right 9744: 975 ld4r {v0.4h, v1.4h, v2.4h, v3.4h}, [x2], x7 // left 976 ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver 977 ushll v20.4s, v31.4h, #8 // (bottom+right)*256 978 ushll v21.4s, v31.4h, #8 979 ushll v22.4s, v31.4h, #8 980 ushll v23.4s, v31.4h, #8 981 zip1 v1.2d, 
v1.2d, v0.2d // left, flipped 982 zip1 v0.2d, v3.2d, v2.2d 983 zip1 v16.2s, v16.2s, v17.2s // weights_ver 984 zip1 v18.2s, v18.2s, v19.2s 985 sub v0.8h, v0.8h, v5.8h // left-right 986 sub v1.8h, v1.8h, v5.8h 987 uxtl v16.8h, v16.8b // weights_ver 988 uxtl v18.8h, v18.8b 989 smlal v20.4s, v0.4h, v7.4h // += (left-right)*weights_hor 990 smlal2 v21.4s, v0.8h, v7.8h 991 smlal v22.4s, v1.4h, v7.4h 992 smlal2 v23.4s, v1.8h, v7.8h 993 smlal v20.4s, v6.4h, v16.4h // += (top-bottom)*weights_ver 994 smlal2 v21.4s, v6.8h, v16.8h 995 smlal v22.4s, v6.4h, v18.4h 996 smlal2 v23.4s, v6.8h, v18.8h 997 rshrn v20.4h, v20.4s, #9 998 rshrn v21.4h, v21.4s, #9 999 rshrn v22.4h, v22.4s, #9 1000 rshrn v23.4h, v23.4s, #9 1001 st1 {v20.4h}, [x0], x1 1002 st1 {v21.4h}, [x6], x1 1003 subs w4, w4, #4 1004 st1 {v22.4h}, [x0], x1 1005 st1 {v23.4h}, [x6], x1 1006 b.gt 4b 1007 ret 100880: 1009 AARCH64_VALID_JUMP_TARGET 1010 ld1 {v6.8h}, [x8] // top 1011 ld1 {v7.8b}, [x10] // weights_hor 1012 sub x2, x2, #8 1013 mov x7, #-8 1014 dup v5.8h, v6.h[7] // right 1015 sub v6.8h, v6.8h, v4.8h // top-bottom 1016 uxtl v7.8h, v7.8b // weights_hor 1017 add v31.4h, v4.4h, v5.4h // bottom+right 10188: 1019 ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 // left 1020 ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver 1021 ushll v20.4s, v31.4h, #8 // (bottom+right)*256 1022 ushll v21.4s, v31.4h, #8 1023 ushll v22.4s, v31.4h, #8 1024 ushll v23.4s, v31.4h, #8 1025 ushll v24.4s, v31.4h, #8 1026 ushll v25.4s, v31.4h, #8 1027 ushll v26.4s, v31.4h, #8 1028 ushll v27.4s, v31.4h, #8 1029 sub v0.8h, v0.8h, v5.8h // left-right 1030 sub v1.8h, v1.8h, v5.8h 1031 sub v2.8h, v2.8h, v5.8h 1032 sub v3.8h, v3.8h, v5.8h 1033 uxtl v16.8h, v16.8b // weights_ver 1034 uxtl v17.8h, v17.8b 1035 uxtl v18.8h, v18.8b 1036 uxtl v19.8h, v19.8b 1037 smlal v20.4s, v3.4h, v7.4h // += (left-right)*weights_hor 1038 smlal2 v21.4s, v3.8h, v7.8h // (left flipped) 1039 smlal v22.4s, v2.4h, v7.4h 1040 smlal2 v23.4s, v2.8h, v7.8h 1041 smlal 
v24.4s, v1.4h, v7.4h 1042 smlal2 v25.4s, v1.8h, v7.8h 1043 smlal v26.4s, v0.4h, v7.4h 1044 smlal2 v27.4s, v0.8h, v7.8h 1045 smlal v20.4s, v6.4h, v16.4h // += (top-bottom)*weights_ver 1046 smlal2 v21.4s, v6.8h, v16.8h 1047 smlal v22.4s, v6.4h, v17.4h 1048 smlal2 v23.4s, v6.8h, v17.8h 1049 smlal v24.4s, v6.4h, v18.4h 1050 smlal2 v25.4s, v6.8h, v18.8h 1051 smlal v26.4s, v6.4h, v19.4h 1052 smlal2 v27.4s, v6.8h, v19.8h 1053 rshrn v20.4h, v20.4s, #9 1054 rshrn2 v20.8h, v21.4s, #9 1055 rshrn v21.4h, v22.4s, #9 1056 rshrn2 v21.8h, v23.4s, #9 1057 rshrn v22.4h, v24.4s, #9 1058 rshrn2 v22.8h, v25.4s, #9 1059 rshrn v23.4h, v26.4s, #9 1060 rshrn2 v23.8h, v27.4s, #9 1061 st1 {v20.8h}, [x0], x1 1062 st1 {v21.8h}, [x6], x1 1063 subs w4, w4, #4 1064 st1 {v22.8h}, [x0], x1 1065 st1 {v23.8h}, [x6], x1 1066 b.gt 8b 1067 ret 1068160: 1069320: 1070640: 1071 AARCH64_VALID_JUMP_TARGET 1072 add x12, x2, w3, uxtw #1 1073 sub x1, x1, w3, uxtw #1 1074 ld1r {v5.8h}, [x12] // right 1075 sub x2, x2, #4 1076 mov x7, #-4 1077 mov w9, w3 1078 add v31.4h, v4.4h, v5.4h // bottom+right 1079 10801: 1081 ld2r {v0.8h, v1.8h}, [x2], x7 // left 1082 ld2r {v16.8b, v17.8b}, [x11], #2 // weights_ver 1083 sub v0.8h, v0.8h, v5.8h // left-right 1084 sub v1.8h, v1.8h, v5.8h 1085 uxtl v16.8h, v16.8b // weights_ver 1086 uxtl v17.8h, v17.8b 10872: 1088 ld1 {v7.16b}, [x10], #16 // weights_hor 1089 ld1 {v2.8h, v3.8h}, [x8], #32 // top 1090 ushll v20.4s, v31.4h, #8 // (bottom+right)*256 1091 ushll v21.4s, v31.4h, #8 1092 ushll v22.4s, v31.4h, #8 1093 ushll v23.4s, v31.4h, #8 1094 ushll v24.4s, v31.4h, #8 1095 ushll v25.4s, v31.4h, #8 1096 ushll v26.4s, v31.4h, #8 1097 ushll v27.4s, v31.4h, #8 1098 uxtl v6.8h, v7.8b // weights_hor 1099 uxtl2 v7.8h, v7.16b 1100 sub v2.8h, v2.8h, v4.8h // top-bottom 1101 sub v3.8h, v3.8h, v4.8h 1102 smlal v20.4s, v1.4h, v6.4h // += (left-right)*weights_hor 1103 smlal2 v21.4s, v1.8h, v6.8h // (left flipped) 1104 smlal v22.4s, v1.4h, v7.4h 1105 smlal2 v23.4s, v1.8h, v7.8h 1106 smlal 
v24.4s, v0.4h, v6.4h 1107 smlal2 v25.4s, v0.8h, v6.8h 1108 smlal v26.4s, v0.4h, v7.4h 1109 smlal2 v27.4s, v0.8h, v7.8h 1110 smlal v20.4s, v2.4h, v16.4h // += (top-bottom)*weights_ver 1111 smlal2 v21.4s, v2.8h, v16.8h 1112 smlal v22.4s, v3.4h, v16.4h 1113 smlal2 v23.4s, v3.8h, v16.8h 1114 smlal v24.4s, v2.4h, v17.4h 1115 smlal2 v25.4s, v2.8h, v17.8h 1116 smlal v26.4s, v3.4h, v17.4h 1117 smlal2 v27.4s, v3.8h, v17.8h 1118 rshrn v20.4h, v20.4s, #9 1119 rshrn2 v20.8h, v21.4s, #9 1120 rshrn v21.4h, v22.4s, #9 1121 rshrn2 v21.8h, v23.4s, #9 1122 rshrn v22.4h, v24.4s, #9 1123 rshrn2 v22.8h, v25.4s, #9 1124 rshrn v23.4h, v26.4s, #9 1125 rshrn2 v23.8h, v27.4s, #9 1126 subs w3, w3, #16 1127 st1 {v20.8h, v21.8h}, [x0], #32 1128 st1 {v22.8h, v23.8h}, [x6], #32 1129 b.gt 2b 1130 subs w4, w4, #2 1131 b.le 9f 1132 sub x8, x8, w9, uxtw #1 1133 sub x10, x10, w9, uxtw 1134 add x0, x0, x1 1135 add x6, x6, x1 1136 mov w3, w9 1137 b 1b 11389: 1139 ret 1140 1141L(ipred_smooth_tbl): 1142 .hword L(ipred_smooth_tbl) - 640b 1143 .hword L(ipred_smooth_tbl) - 320b 1144 .hword L(ipred_smooth_tbl) - 160b 1145 .hword L(ipred_smooth_tbl) - 80b 1146 .hword L(ipred_smooth_tbl) - 40b 1147endfunc 1148 1149// void ipred_smooth_v_16bpc_neon(pixel *dst, const ptrdiff_t stride, 1150// const pixel *const topleft, 1151// const int width, const int height, const int a, 1152// const int max_width, const int max_height); 1153function ipred_smooth_v_16bpc_neon, export=1 1154 movrel x7, X(sm_weights) 1155 add x7, x7, w4, uxtw 1156 clz w9, w3 1157 adr x5, L(ipred_smooth_v_tbl) 1158 sub x8, x2, w4, uxtw #1 1159 sub w9, w9, #25 1160 ldrh w9, [x5, w9, uxtw #1] 1161 ld1r {v4.8h}, [x8] // bottom 1162 add x2, x2, #2 1163 sub x5, x5, w9, uxtw 1164 add x6, x0, x1 1165 lsl x1, x1, #1 1166 br x5 116740: 1168 AARCH64_VALID_JUMP_TARGET 1169 ld1r {v6.2d}, [x2] // top 1170 sub v6.8h, v6.8h, v4.8h // top-bottom 11714: 1172 ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver 1173 zip1 v16.2s, v16.2s, v17.2s // 
weights_ver 1174 zip1 v18.2s, v18.2s, v19.2s 1175 ushll v16.8h, v16.8b, #7 // weights_ver << 7 1176 ushll v18.8h, v18.8b, #7 1177 sqrdmulh v20.8h, v6.8h, v16.8h // ((top-bottom)*weights_ver + 128) >> 8 1178 sqrdmulh v21.8h, v6.8h, v18.8h 1179 add v20.8h, v20.8h, v4.8h 1180 add v21.8h, v21.8h, v4.8h 1181 st1 {v20.d}[0], [x0], x1 1182 st1 {v20.d}[1], [x6], x1 1183 subs w4, w4, #4 1184 st1 {v21.d}[0], [x0], x1 1185 st1 {v21.d}[1], [x6], x1 1186 b.gt 4b 1187 ret 118880: 1189 AARCH64_VALID_JUMP_TARGET 1190 ld1 {v6.8h}, [x2] // top 1191 sub v6.8h, v6.8h, v4.8h // top-bottom 11928: 1193 ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver 1194 ushll v16.8h, v16.8b, #7 // weights_ver << 7 1195 ushll v17.8h, v17.8b, #7 1196 ushll v18.8h, v18.8b, #7 1197 ushll v19.8h, v19.8b, #7 1198 sqrdmulh v20.8h, v6.8h, v16.8h // ((top-bottom)*weights_ver + 128) >> 8 1199 sqrdmulh v21.8h, v6.8h, v17.8h 1200 sqrdmulh v22.8h, v6.8h, v18.8h 1201 sqrdmulh v23.8h, v6.8h, v19.8h 1202 add v20.8h, v20.8h, v4.8h 1203 add v21.8h, v21.8h, v4.8h 1204 add v22.8h, v22.8h, v4.8h 1205 add v23.8h, v23.8h, v4.8h 1206 st1 {v20.8h}, [x0], x1 1207 st1 {v21.8h}, [x6], x1 1208 subs w4, w4, #4 1209 st1 {v22.8h}, [x0], x1 1210 st1 {v23.8h}, [x6], x1 1211 b.gt 8b 1212 ret 1213160: 1214320: 1215640: 1216 AARCH64_VALID_JUMP_TARGET 1217 // Set up pointers for four rows in parallel; x0, x6, x5, x8 1218 add x5, x0, x1 1219 add x8, x6, x1 1220 lsl x1, x1, #1 1221 sub x1, x1, w3, uxtw #1 1222 mov w9, w3 1223 12241: 1225 ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver 1226 ushll v16.8h, v16.8b, #7 // weights_ver << 7 1227 ushll v17.8h, v17.8b, #7 1228 ushll v18.8h, v18.8b, #7 1229 ushll v19.8h, v19.8b, #7 12302: 1231 ld1 {v2.8h, v3.8h}, [x2], #32 // top 1232 sub v2.8h, v2.8h, v4.8h // top-bottom 1233 sub v3.8h, v3.8h, v4.8h 1234 sqrdmulh v20.8h, v2.8h, v16.8h // ((top-bottom)*weights_ver + 128) >> 8 1235 sqrdmulh v21.8h, v3.8h, v16.8h 1236 sqrdmulh v22.8h, v2.8h, v17.8h 1237 sqrdmulh v23.8h, 
v3.8h, v17.8h 1238 sqrdmulh v24.8h, v2.8h, v18.8h 1239 sqrdmulh v25.8h, v3.8h, v18.8h 1240 sqrdmulh v26.8h, v2.8h, v19.8h 1241 sqrdmulh v27.8h, v3.8h, v19.8h 1242 add v20.8h, v20.8h, v4.8h 1243 add v21.8h, v21.8h, v4.8h 1244 add v22.8h, v22.8h, v4.8h 1245 add v23.8h, v23.8h, v4.8h 1246 add v24.8h, v24.8h, v4.8h 1247 add v25.8h, v25.8h, v4.8h 1248 add v26.8h, v26.8h, v4.8h 1249 add v27.8h, v27.8h, v4.8h 1250 subs w3, w3, #16 1251 st1 {v20.8h, v21.8h}, [x0], #32 1252 st1 {v22.8h, v23.8h}, [x6], #32 1253 st1 {v24.8h, v25.8h}, [x5], #32 1254 st1 {v26.8h, v27.8h}, [x8], #32 1255 b.gt 2b 1256 subs w4, w4, #4 1257 b.le 9f 1258 sub x2, x2, w9, uxtw #1 1259 add x0, x0, x1 1260 add x6, x6, x1 1261 add x5, x5, x1 1262 add x8, x8, x1 1263 mov w3, w9 1264 b 1b 12659: 1266 ret 1267 1268L(ipred_smooth_v_tbl): 1269 .hword L(ipred_smooth_v_tbl) - 640b 1270 .hword L(ipred_smooth_v_tbl) - 320b 1271 .hword L(ipred_smooth_v_tbl) - 160b 1272 .hword L(ipred_smooth_v_tbl) - 80b 1273 .hword L(ipred_smooth_v_tbl) - 40b 1274endfunc 1275 1276// void ipred_smooth_h_16bpc_neon(pixel *dst, const ptrdiff_t stride, 1277// const pixel *const topleft, 1278// const int width, const int height, const int a, 1279// const int max_width, const int max_height); 1280function ipred_smooth_h_16bpc_neon, export=1 1281 movrel x8, X(sm_weights) 1282 add x8, x8, w3, uxtw 1283 clz w9, w3 1284 adr x5, L(ipred_smooth_h_tbl) 1285 add x12, x2, w3, uxtw #1 1286 sub w9, w9, #25 1287 ldrh w9, [x5, w9, uxtw #1] 1288 ld1r {v5.8h}, [x12] // right 1289 sub x5, x5, w9, uxtw 1290 add x6, x0, x1 1291 lsl x1, x1, #1 1292 br x5 129340: 1294 AARCH64_VALID_JUMP_TARGET 1295 ld1r {v7.2s}, [x8] // weights_hor 1296 sub x2, x2, #8 1297 mov x7, #-8 1298 ushll v7.8h, v7.8b, #7 // weights_hor << 7 12994: 1300 ld4r {v0.4h, v1.4h, v2.4h, v3.4h}, [x2], x7 // left 1301 zip1 v1.2d, v1.2d, v0.2d // left, flipped 1302 zip1 v0.2d, v3.2d, v2.2d 1303 sub v0.8h, v0.8h, v5.8h // left-right 1304 sub v1.8h, v1.8h, v5.8h 1305 sqrdmulh v20.8h, v0.8h, 
v7.8h // ((left-right)*weights_hor + 128) >> 8 1306 sqrdmulh v21.8h, v1.8h, v7.8h 1307 add v20.8h, v20.8h, v5.8h 1308 add v21.8h, v21.8h, v5.8h 1309 st1 {v20.d}[0], [x0], x1 1310 st1 {v20.d}[1], [x6], x1 1311 subs w4, w4, #4 1312 st1 {v21.d}[0], [x0], x1 1313 st1 {v21.d}[1], [x6], x1 1314 b.gt 4b 1315 ret 131680: 1317 AARCH64_VALID_JUMP_TARGET 1318 ld1 {v7.8b}, [x8] // weights_hor 1319 sub x2, x2, #8 1320 mov x7, #-8 1321 ushll v7.8h, v7.8b, #7 // weights_hor << 7 13228: 1323 ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 // left 1324 sub v3.8h, v3.8h, v5.8h // left-right 1325 sub v2.8h, v2.8h, v5.8h 1326 sub v1.8h, v1.8h, v5.8h 1327 sub v0.8h, v0.8h, v5.8h 1328 sqrdmulh v20.8h, v3.8h, v7.8h // ((left-right)*weights_hor + 128) >> 8 1329 sqrdmulh v21.8h, v2.8h, v7.8h // (left flipped) 1330 sqrdmulh v22.8h, v1.8h, v7.8h 1331 sqrdmulh v23.8h, v0.8h, v7.8h 1332 add v20.8h, v20.8h, v5.8h 1333 add v21.8h, v21.8h, v5.8h 1334 add v22.8h, v22.8h, v5.8h 1335 add v23.8h, v23.8h, v5.8h 1336 st1 {v20.8h}, [x0], x1 1337 st1 {v21.8h}, [x6], x1 1338 subs w4, w4, #4 1339 st1 {v22.8h}, [x0], x1 1340 st1 {v23.8h}, [x6], x1 1341 b.gt 8b 1342 ret 1343160: 1344320: 1345640: 1346 AARCH64_VALID_JUMP_TARGET 1347 sub x2, x2, #8 1348 mov x7, #-8 1349 // Set up pointers for four rows in parallel; x0, x6, x5, x10 1350 add x5, x0, x1 1351 add x10, x6, x1 1352 lsl x1, x1, #1 1353 sub x1, x1, w3, uxtw #1 1354 mov w9, w3 1355 13561: 1357 ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 // left 1358 sub v0.8h, v0.8h, v5.8h // left-right 1359 sub v1.8h, v1.8h, v5.8h 1360 sub v2.8h, v2.8h, v5.8h 1361 sub v3.8h, v3.8h, v5.8h 13622: 1363 ld1 {v7.16b}, [x8], #16 // weights_hor 1364 ushll v6.8h, v7.8b, #7 // weights_hor << 7 1365 ushll2 v7.8h, v7.16b, #7 1366 sqrdmulh v20.8h, v3.8h, v6.8h // ((left-right)*weights_hor + 128) >> 8 1367 sqrdmulh v21.8h, v3.8h, v7.8h // (left flipped) 1368 sqrdmulh v22.8h, v2.8h, v6.8h 1369 sqrdmulh v23.8h, v2.8h, v7.8h 1370 sqrdmulh v24.8h, v1.8h, v6.8h 1371 sqrdmulh v25.8h, v1.8h, 
v7.8h 1372 sqrdmulh v26.8h, v0.8h, v6.8h 1373 sqrdmulh v27.8h, v0.8h, v7.8h 1374 add v20.8h, v20.8h, v5.8h 1375 add v21.8h, v21.8h, v5.8h 1376 add v22.8h, v22.8h, v5.8h 1377 add v23.8h, v23.8h, v5.8h 1378 add v24.8h, v24.8h, v5.8h 1379 add v25.8h, v25.8h, v5.8h 1380 add v26.8h, v26.8h, v5.8h 1381 add v27.8h, v27.8h, v5.8h 1382 subs w3, w3, #16 1383 st1 {v20.8h, v21.8h}, [x0], #32 1384 st1 {v22.8h, v23.8h}, [x6], #32 1385 st1 {v24.8h, v25.8h}, [x5], #32 1386 st1 {v26.8h, v27.8h}, [x10], #32 1387 b.gt 2b 1388 subs w4, w4, #4 1389 b.le 9f 1390 sub x8, x8, w9, uxtw 1391 add x0, x0, x1 1392 add x6, x6, x1 1393 add x5, x5, x1 1394 add x10, x10, x1 1395 mov w3, w9 1396 b 1b 13979: 1398 ret 1399 1400L(ipred_smooth_h_tbl): 1401 .hword L(ipred_smooth_h_tbl) - 640b 1402 .hword L(ipred_smooth_h_tbl) - 320b 1403 .hword L(ipred_smooth_h_tbl) - 160b 1404 .hword L(ipred_smooth_h_tbl) - 80b 1405 .hword L(ipred_smooth_h_tbl) - 40b 1406endfunc 1407 1408const padding_mask_buf 1409 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 1410 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 1411 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 1412 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 1413 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 1414 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 1415padding_mask: 1416 .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff 1417 .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff 1418 .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff 1419 .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff 1420 .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff 1421 .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff 1422endconst 1423 1424// void ipred_z1_upsample_edge_16bpc_neon(pixel *out, const int hsz, 1425// const pixel *const in, const int end, 1426// const int bitdepth_max); 1427function ipred_z1_upsample_edge_16bpc_neon, export=1 1428 dup v30.8h, w4 // bitdepth_max 1429 movrel x4, padding_mask 1430 ld1 {v0.8h, v1.8h}, 
[x2] // in[] 1431 add x5, x2, w3, uxtw #1 // in[end] 1432 sub x4, x4, w3, uxtw #1 1433 1434 ld1r {v2.8h}, [x5] // padding 1435 ld1 {v3.8h, v4.8h}, [x4] // padding_mask 1436 1437 movi v31.8h, #9 1438 1439 bit v0.16b, v2.16b, v3.16b // padded in[] 1440 bit v1.16b, v2.16b, v4.16b 1441 1442 ext v4.16b, v0.16b, v1.16b, #2 1443 ext v5.16b, v1.16b, v2.16b, #2 1444 ext v6.16b, v0.16b, v1.16b, #4 1445 ext v7.16b, v1.16b, v2.16b, #4 1446 ext v16.16b, v0.16b, v1.16b, #6 1447 ext v17.16b, v1.16b, v2.16b, #6 1448 1449 add v18.8h, v4.8h, v6.8h // in[i+1] + in[i+2] 1450 add v19.8h, v5.8h, v7.8h 1451 add v20.8h, v0.8h, v16.8h 1452 add v21.8h, v1.8h, v17.8h 1453 umull v22.4s, v18.4h, v31.4h // 9*(in[i+1] + in[i+2]) 1454 umull2 v23.4s, v18.8h, v31.8h 1455 umull v24.4s, v19.4h, v31.4h 1456 umull2 v25.4s, v19.8h, v31.8h 1457 usubw v22.4s, v22.4s, v20.4h 1458 usubw2 v23.4s, v23.4s, v20.8h 1459 usubw v24.4s, v24.4s, v21.4h 1460 usubw2 v25.4s, v25.4s, v21.8h 1461 1462 sqrshrun v16.4h, v22.4s, #4 1463 sqrshrun2 v16.8h, v23.4s, #4 1464 sqrshrun v17.4h, v24.4s, #4 1465 sqrshrun2 v17.8h, v25.4s, #4 1466 1467 smin v16.8h, v16.8h, v30.8h 1468 smin v17.8h, v17.8h, v30.8h 1469 1470 zip1 v0.8h, v4.8h, v16.8h 1471 zip2 v1.8h, v4.8h, v16.8h 1472 zip1 v2.8h, v5.8h, v17.8h 1473 zip2 v3.8h, v5.8h, v17.8h 1474 1475 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0] 1476 1477 ret 1478endfunc 1479 1480// void ipred_z2_upsample_edge_16bpc_neon(pixel *out, const int sz, 1481// const pixel *const in, 1482// const int bitdepth_max); 1483function ipred_z2_upsample_edge_16bpc_neon, export=1 1484 dup v30.8h, w3 // bitdepth_max 1485 // Here, sz is 4 or 8, and we produce 2*sz+1 output elements. 
1486 movrel x4, padding_mask 1487 ld1 {v0.8h, v1.8h}, [x2] // in[] 1488 add x5, x2, w1, uxtw #1 // in[sz] 1489 sub x4, x4, w1, uxtw #1 1490 1491 ld1r {v3.8h}, [x2] // in[0] for padding 1492 ld1r {v2.8h}, [x5] // padding 1493 ld1 {v4.8h, v5.8h}, [x4] // padding_mask 1494 1495 movi v31.8h, #9 1496 1497 bit v0.16b, v2.16b, v4.16b // padded in[] 1498 bit v1.16b, v2.16b, v5.16b 1499 1500 ext v4.16b, v3.16b, v0.16b, #14 1501 ext v5.16b, v0.16b, v1.16b, #2 1502 ext v6.16b, v0.16b, v1.16b, #4 1503 1504 add v16.8h, v0.8h, v5.8h // in[i+0] + in[i+1] 1505 add v17.8h, v4.8h, v6.8h // in[i-1] + in[i+2] 1506 umull v18.4s, v16.4h, v31.4h // 9*(in[i+1] + in[i+2]) 1507 umull2 v19.4s, v16.8h, v31.8h 1508 usubw v18.4s, v18.4s, v17.4h 1509 usubw2 v19.4s, v19.4s, v17.8h 1510 1511 sqrshrun v16.4h, v18.4s, #4 1512 sqrshrun2 v16.8h, v19.4s, #4 1513 1514 add x5, x0, #2*16 1515 1516 smin v16.8h, v16.8h, v30.8h 1517 1518 zip1 v4.8h, v0.8h, v16.8h 1519 zip2 v5.8h, v0.8h, v16.8h 1520 1521 st1 {v2.h}[0], [x5] 1522 // In case sz=8, output one single pixel in out[16]. 1523 st1 {v4.8h, v5.8h}, [x0] 1524 1525 ret 1526endfunc 1527 1528const edge_filter 1529 .short 0, 4, 8, 0 1530 .short 0, 5, 6, 0 1531// Leaving out the coeffs for strength=3 1532// .byte 2, 4, 4, 0 1533endconst 1534 1535// void ipred_z1_filter_edge_16bpc_neon(pixel *out, const int sz, 1536// const pixel *const in, const int end, 1537// const int strength); 1538function ipred_z1_filter_edge_16bpc_neon, export=1 1539 cmp w4, #3 1540 b.eq L(fivetap) // if (strength == 3) goto fivetap 1541 1542 movrel x5, edge_filter, -6 1543 add x5, x5, w4, uxtw #3 // edge_filter + 2*((strength - 1)*4 + 1) 1544 1545 ld1 {v31.s}[0], [x5] // kernel[1-2] 1546 1547 ld1 {v0.8h}, [x2], #16 1548 1549 dup v30.8h, v31.h[0] 1550 dup v31.8h, v31.h[1] 15511: 1552 // in[end], is the last valid pixel. We produce 16 pixels out by 1553 // using 18 pixels in - the last pixel used is [17] of the ones 1554 // read/buffered. 
1555 cmp w3, #17 1556 ld1 {v1.8h, v2.8h}, [x2], #32 1557 b.lt 2f 1558 ext v3.16b, v0.16b, v1.16b, #2 1559 ext v4.16b, v1.16b, v2.16b, #2 1560 ext v5.16b, v0.16b, v1.16b, #4 1561 ext v6.16b, v1.16b, v2.16b, #4 1562 mul v16.8h, v0.8h, v30.8h 1563 mla v16.8h, v3.8h, v31.8h 1564 mla v16.8h, v5.8h, v30.8h 1565 mul v17.8h, v1.8h, v30.8h 1566 mla v17.8h, v4.8h, v31.8h 1567 mla v17.8h, v6.8h, v30.8h 1568 subs w1, w1, #16 1569 mov v0.16b, v2.16b 1570 urshr v16.8h, v16.8h, #4 1571 urshr v17.8h, v17.8h, #4 1572 sub w3, w3, #16 1573 st1 {v16.8h, v17.8h}, [x0], #32 1574 b.gt 1b 1575 ret 15762: 1577 // Right padding 1578 1579 // x2[w3-24] is the padding pixel (x2 points 24 pixels ahead) 1580 movrel x5, padding_mask 1581 sub w6, w3, #24 1582 sub x5, x5, w3, uxtw #1 1583 add x6, x2, w6, sxtw #1 1584 1585 ld1 {v3.8h, v4.8h}, [x5] // padding_mask 1586 1587 ld1r {v2.8h}, [x6] 1588 bit v0.16b, v2.16b, v3.16b // Pad v0-v1 1589 bit v1.16b, v2.16b, v4.16b 1590 1591 // Filter one block 1592 ext v3.16b, v0.16b, v1.16b, #2 1593 ext v4.16b, v1.16b, v2.16b, #2 1594 ext v5.16b, v0.16b, v1.16b, #4 1595 ext v6.16b, v1.16b, v2.16b, #4 1596 mul v16.8h, v0.8h, v30.8h 1597 mla v16.8h, v3.8h, v31.8h 1598 mla v16.8h, v5.8h, v30.8h 1599 mul v17.8h, v1.8h, v30.8h 1600 mla v17.8h, v4.8h, v31.8h 1601 mla v17.8h, v6.8h, v30.8h 1602 subs w1, w1, #16 1603 urshr v16.8h, v16.8h, #4 1604 urshr v17.8h, v17.8h, #4 1605 st1 {v16.8h, v17.8h}, [x0], #32 1606 b.le 9f 16075: 1608 // After one block, any remaining output would only be filtering 1609 // padding - thus just store the padding. 1610 subs w1, w1, #16 1611 st1 {v2.16b}, [x0], #16 1612 b.gt 5b 16139: 1614 ret 1615 1616L(fivetap): 1617 sub x2, x2, #2 // topleft -= 1 pixel 1618 movi v29.8h, #2 1619 ld1 {v0.8h}, [x2], #16 1620 movi v30.8h, #4 1621 movi v31.8h, #4 1622 ins v0.h[0], v0.h[1] 16231: 1624 // in[end+1], is the last valid pixel. We produce 16 pixels out by 1625 // using 20 pixels in - the last pixel used is [19] of the ones 1626 // read/buffered. 
1627 cmp w3, #18 1628 ld1 {v1.8h, v2.8h}, [x2], #32 1629 b.lt 2f // if (end + 1 < 19) 1630 ext v3.16b, v0.16b, v1.16b, #2 1631 ext v4.16b, v1.16b, v2.16b, #2 1632 ext v5.16b, v0.16b, v1.16b, #4 1633 ext v6.16b, v1.16b, v2.16b, #4 1634 ext v16.16b, v0.16b, v1.16b, #6 1635 ext v17.16b, v1.16b, v2.16b, #6 1636 ext v18.16b, v0.16b, v1.16b, #8 1637 ext v19.16b, v1.16b, v2.16b, #8 1638 mul v20.8h, v0.8h, v29.8h 1639 mla v20.8h, v3.8h, v30.8h 1640 mla v20.8h, v5.8h, v31.8h 1641 mla v20.8h, v16.8h, v30.8h 1642 mla v20.8h, v18.8h, v29.8h 1643 mul v21.8h, v1.8h, v29.8h 1644 mla v21.8h, v4.8h, v30.8h 1645 mla v21.8h, v6.8h, v31.8h 1646 mla v21.8h, v17.8h, v30.8h 1647 mla v21.8h, v19.8h, v29.8h 1648 subs w1, w1, #16 1649 mov v0.16b, v2.16b 1650 urshr v20.8h, v20.8h, #4 1651 urshr v21.8h, v21.8h, #4 1652 sub w3, w3, #16 1653 st1 {v20.8h, v21.8h}, [x0], #32 1654 b.gt 1b 1655 ret 16562: 1657 // Right padding 1658 1659 // x2[w3+1-24] is the padding pixel (x2 points 24 pixels ahead) 1660 movrel x5, padding_mask, -2 1661 sub w6, w3, #23 1662 sub x5, x5, w3, uxtw #1 1663 add x6, x2, w6, sxtw #1 1664 1665 ld1 {v3.8h, v4.8h, v5.8h}, [x5] // padding_mask 1666 1667 ld1r {v28.8h}, [x6] 1668 bit v0.16b, v28.16b, v3.16b // Pad v0-v2 1669 bit v1.16b, v28.16b, v4.16b 1670 bit v2.16b, v28.16b, v5.16b 16714: 1672 // Filter one block 1673 ext v3.16b, v0.16b, v1.16b, #2 1674 ext v4.16b, v1.16b, v2.16b, #2 1675 ext v5.16b, v0.16b, v1.16b, #4 1676 ext v6.16b, v1.16b, v2.16b, #4 1677 ext v16.16b, v0.16b, v1.16b, #6 1678 ext v17.16b, v1.16b, v2.16b, #6 1679 ext v18.16b, v0.16b, v1.16b, #8 1680 ext v19.16b, v1.16b, v2.16b, #8 1681 mul v20.8h, v0.8h, v29.8h 1682 mla v20.8h, v3.8h, v30.8h 1683 mla v20.8h, v5.8h, v31.8h 1684 mla v20.8h, v16.8h, v30.8h 1685 mla v20.8h, v18.8h, v29.8h 1686 mul v21.8h, v1.8h, v29.8h 1687 mla v21.8h, v4.8h, v30.8h 1688 mla v21.8h, v6.8h, v31.8h 1689 mla v21.8h, v17.8h, v30.8h 1690 mla v21.8h, v19.8h, v29.8h 1691 subs w1, w1, #16 1692 mov v0.16b, v2.16b 1693 mov v1.16b, 
v28.16b 1694 mov v2.16b, v28.16b 1695 urshr v20.8h, v20.8h, #4 1696 urshr v21.8h, v21.8h, #4 1697 sub w3, w3, #16 1698 st1 {v20.8h, v21.8h}, [x0], #32 1699 b.le 9f 1700 // v0-v1[w3+1] is the last valid pixel; if (w3 + 1 > 0) we need to 1701 // filter properly once more - aka (w3 >= 0). 1702 cmp w3, #0 1703 b.ge 4b 17045: 1705 // When w3 <= 0, all remaining pixels in v0-v1 are equal to the 1706 // last valid pixel - thus just output that without filtering. 1707 subs w1, w1, #8 1708 st1 {v28.8h}, [x0], #16 1709 b.gt 5b 17109: 1711 ret 1712endfunc 1713 1714// void ipred_pixel_set_16bpc_neon(pixel *out, const pixel px, 1715// const int n); 1716function ipred_pixel_set_16bpc_neon, export=1 1717 dup v0.8h, w1 17181: 1719 subs w2, w2, #8 1720 st1 {v0.8h}, [x0], #16 1721 b.gt 1b 1722 ret 1723endfunc 1724 1725// void ipred_z1_fill1_16bpc_neon(pixel *dst, const ptrdiff_t stride, 1726// const pixel *const top, 1727// const int width, const int height, 1728// const int dx, const int max_base_x); 1729function ipred_z1_fill1_16bpc_neon, export=1 1730 clz w9, w3 1731 adr x8, L(ipred_z1_fill1_tbl) 1732 sub w9, w9, #25 1733 ldrh w9, [x8, w9, uxtw #1] 1734 add x10, x2, w6, uxtw #1 // top[max_base_x] 1735 sub x8, x8, w9, uxtw 1736 ld1r {v31.8h}, [x10] // padding 1737 mov w7, w5 1738 mov w15, #64 1739 br x8 174040: 1741 AARCH64_VALID_JUMP_TARGET 17424: 1743 lsr w8, w7, #6 // base 1744 and w9, w7, #0x3e // frac 1745 add w7, w7, w5 // xpos += dx 1746 cmp w8, w6 // base >= max_base_x 1747 lsr w10, w7, #6 // base 1748 and w11, w7, #0x3e // frac 1749 b.ge 49f 1750 lsl w8, w8, #1 1751 lsl w10, w10, #1 1752 ldr q0, [x2, w8, uxtw] // top[base] 1753 ldr q2, [x2, w10, uxtw] 1754 dup v4.4h, w9 // frac 1755 dup v5.4h, w11 1756 ext v1.16b, v0.16b, v0.16b, #2 // top[base+1] 1757 ext v3.16b, v2.16b, v2.16b, #2 1758 sub v6.4h, v1.4h, v0.4h // top[base+1]-top[base] 1759 sub v7.4h, v3.4h, v2.4h 1760 ushll v16.4s, v0.4h, #6 // top[base]*64 1761 ushll v17.4s, v2.4h, #6 1762 smlal v16.4s, v6.4h, v4.4h // 
+ top[base+1]*frac 1763 smlal v17.4s, v7.4h, v5.4h 1764 rshrn v16.4h, v16.4s, #6 1765 rshrn v17.4h, v17.4s, #6 1766 st1 {v16.4h}, [x0], x1 1767 add w7, w7, w5 // xpos += dx 1768 subs w4, w4, #2 1769 st1 {v17.4h}, [x0], x1 1770 b.gt 4b 1771 ret 1772 177349: 1774 st1 {v31.4h}, [x0], x1 1775 subs w4, w4, #2 1776 st1 {v31.4h}, [x0], x1 1777 b.gt 49b 1778 ret 1779 178080: 1781 AARCH64_VALID_JUMP_TARGET 17828: 1783 lsr w8, w7, #6 // base 1784 and w9, w7, #0x3e // frac 1785 add w7, w7, w5 // xpos += dx 1786 cmp w8, w6 // base >= max_base_x 1787 lsr w10, w7, #6 // base 1788 and w11, w7, #0x3e // frac 1789 b.ge 89f 1790 add x8, x2, w8, uxtw #1 1791 add x10, x2, w10, uxtw #1 1792 dup v4.8h, w9 // frac 1793 dup v5.8h, w11 1794 ld1 {v0.8h}, [x8] // top[base] 1795 ld1 {v2.8h}, [x10] 1796 sub w9, w15, w9 // 64 - frac 1797 sub w11, w15, w11 1798 ldr h1, [x8, #16] 1799 ldr h3, [x10, #16] 1800 dup v6.8h, w9 // 64 - frac 1801 dup v7.8h, w11 1802 ext v1.16b, v0.16b, v1.16b, #2 // top[base+1] 1803 ext v3.16b, v2.16b, v3.16b, #2 1804 umull v16.4s, v0.4h, v6.4h // top[base]*(64-frac) 1805 umlal v16.4s, v1.4h, v4.4h // + top[base+1]*frac 1806 umull2 v17.4s, v0.8h, v6.8h 1807 umlal2 v17.4s, v1.8h, v4.8h 1808 umull v18.4s, v2.4h, v7.4h 1809 umlal v18.4s, v3.4h, v5.4h 1810 umull2 v19.4s, v2.8h, v7.8h 1811 umlal2 v19.4s, v3.8h, v5.8h 1812 rshrn v16.4h, v16.4s, #6 1813 rshrn2 v16.8h, v17.4s, #6 1814 rshrn v17.4h, v18.4s, #6 1815 rshrn2 v17.8h, v19.4s, #6 1816 st1 {v16.8h}, [x0], x1 1817 add w7, w7, w5 // xpos += dx 1818 subs w4, w4, #2 1819 st1 {v17.8h}, [x0], x1 1820 b.gt 8b 1821 ret 1822 182389: 1824 st1 {v31.8h}, [x0], x1 1825 subs w4, w4, #2 1826 st1 {v31.8h}, [x0], x1 1827 b.gt 89b 1828 ret 1829 1830160: 1831320: 1832640: 1833 AARCH64_VALID_JUMP_TARGET 1834 1835 mov w12, w3 1836 1837 add x13, x0, x1 1838 lsl x1, x1, #1 1839 sub x1, x1, w3, uxtw #1 18401: 1841 lsr w8, w7, #6 // base 1842 and w9, w7, #0x3e // frac 1843 add w7, w7, w5 // xpos += dx 1844 cmp w8, w6 // base >= max_base_x 1845 
lsr w10, w7, #6 // base 1846 and w11, w7, #0x3e // frac 1847 b.ge 169f 1848 add x8, x2, w8, uxtw #1 1849 add x10, x2, w10, uxtw #1 1850 dup v6.8h, w9 // frac 1851 dup v7.8h, w11 1852 ld1 {v0.8h, v1.8h, v2.8h}, [x8], #48 // top[base] 1853 ld1 {v3.8h, v4.8h, v5.8h}, [x10], #48 1854 sub w9, w15, w9 // 64 - frac 1855 sub w11, w15, w11 1856 dup v16.8h, w9 // 64 - frac 1857 dup v17.8h, w11 1858 add w7, w7, w5 // xpos += dx 18592: 1860 ext v18.16b, v0.16b, v1.16b, #2 // top[base+1] 1861 ext v19.16b, v1.16b, v2.16b, #2 1862 ext v20.16b, v3.16b, v4.16b, #2 1863 ext v21.16b, v4.16b, v5.16b, #2 1864 subs w3, w3, #16 1865 umull v22.4s, v0.4h, v16.4h // top[base]*(64-frac) 1866 umlal v22.4s, v18.4h, v6.4h // + top[base+1]*frac 1867 umull2 v23.4s, v0.8h, v16.8h 1868 umlal2 v23.4s, v18.8h, v6.8h 1869 umull v24.4s, v1.4h, v16.4h 1870 umlal v24.4s, v19.4h, v6.4h 1871 umull2 v25.4s, v1.8h, v16.8h 1872 umlal2 v25.4s, v19.8h, v6.8h 1873 umull v26.4s, v3.4h, v17.4h 1874 umlal v26.4s, v20.4h, v7.4h 1875 umull2 v27.4s, v3.8h, v17.8h 1876 umlal2 v27.4s, v20.8h, v7.8h 1877 umull v28.4s, v4.4h, v17.4h 1878 umlal v28.4s, v21.4h, v7.4h 1879 umull2 v29.4s, v4.8h, v17.8h 1880 umlal2 v29.4s, v21.8h, v7.8h 1881 rshrn v22.4h, v22.4s, #6 1882 rshrn2 v22.8h, v23.4s, #6 1883 rshrn v23.4h, v24.4s, #6 1884 rshrn2 v23.8h, v25.4s, #6 1885 rshrn v24.4h, v26.4s, #6 1886 rshrn2 v24.8h, v27.4s, #6 1887 rshrn v25.4h, v28.4s, #6 1888 rshrn2 v25.8h, v29.4s, #6 1889 st1 {v22.8h, v23.8h}, [x0], #32 1890 st1 {v24.8h, v25.8h}, [x13], #32 1891 b.le 3f 1892 mov v0.16b, v2.16b 1893 ld1 {v1.8h, v2.8h}, [x8], #32 // top[base] 1894 mov v3.16b, v5.16b 1895 ld1 {v4.8h, v5.8h}, [x10], #32 1896 b 2b 1897 18983: 1899 subs w4, w4, #2 1900 b.le 9f 1901 add x0, x0, x1 1902 add x13, x13, x1 1903 mov w3, w12 1904 b 1b 19059: 1906 ret 1907 1908169: 1909 st1 {v31.8h}, [x0], #16 1910 subs w3, w3, #8 1911 st1 {v31.8h}, [x13], #16 1912 b.gt 169b 1913 subs w4, w4, #2 1914 b.le 9b 1915 add x0, x0, x1 1916 add x13, x13, x1 1917 mov w3, 
w12 1918 b 169b 1919 1920L(ipred_z1_fill1_tbl): 1921 .hword L(ipred_z1_fill1_tbl) - 640b 1922 .hword L(ipred_z1_fill1_tbl) - 320b 1923 .hword L(ipred_z1_fill1_tbl) - 160b 1924 .hword L(ipred_z1_fill1_tbl) - 80b 1925 .hword L(ipred_z1_fill1_tbl) - 40b 1926endfunc 1927 1928function ipred_z1_fill2_16bpc_neon, export=1 1929 cmp w3, #8 1930 add x10, x2, w6, uxtw // top[max_base_x] 1931 ld1r {v31.16b}, [x10] // padding 1932 mov w7, w5 1933 mov w15, #64 1934 b.eq 8f 1935 19364: // w == 4 1937 lsr w8, w7, #6 // base 1938 and w9, w7, #0x3e // frac 1939 add w7, w7, w5 // xpos += dx 1940 cmp w8, w6 // base >= max_base_x 1941 lsr w10, w7, #6 // base 1942 and w11, w7, #0x3e // frac 1943 b.ge 49f 1944 lsl w8, w8, #1 1945 lsl w10, w10, #1 1946 ldr q0, [x2, w8, uxtw] // top[base] 1947 ldr q2, [x2, w10, uxtw] 1948 dup v4.4h, w9 // frac 1949 dup v5.4h, w11 1950 uzp2 v1.8h, v0.8h, v0.8h // top[base+1] 1951 uzp1 v0.8h, v0.8h, v0.8h // top[base] 1952 uzp2 v3.8h, v2.8h, v2.8h 1953 uzp1 v2.8h, v2.8h, v2.8h 1954 sub v6.4h, v1.4h, v0.4h // top[base+1]-top[base] 1955 sub v7.4h, v3.4h, v2.4h 1956 ushll v16.4s, v0.4h, #6 // top[base]*64 1957 ushll v17.4s, v2.4h, #6 1958 smlal v16.4s, v6.4h, v4.4h // + top[base+1]*frac 1959 smlal v17.4s, v7.4h, v5.4h 1960 rshrn v16.4h, v16.4s, #6 1961 rshrn v17.4h, v17.4s, #6 1962 st1 {v16.4h}, [x0], x1 1963 add w7, w7, w5 // xpos += dx 1964 subs w4, w4, #2 1965 st1 {v17.4h}, [x0], x1 1966 b.gt 4b 1967 ret 1968 196949: 1970 st1 {v31.4h}, [x0], x1 1971 subs w4, w4, #2 1972 st1 {v31.4h}, [x0], x1 1973 b.gt 49b 1974 ret 1975 19768: // w == 8 1977 lsr w8, w7, #6 // base 1978 and w9, w7, #0x3e // frac 1979 add w7, w7, w5 // xpos += dx 1980 cmp w8, w6 // base >= max_base_x 1981 lsr w10, w7, #6 // base 1982 and w11, w7, #0x3e // frac 1983 b.ge 89f 1984 add x8, x2, w8, uxtw #1 1985 add x10, x2, w10, uxtw #1 1986 dup v4.8h, w9 // frac 1987 dup v5.8h, w11 1988 ld1 {v0.8h, v1.8h}, [x8] // top[base] 1989 ld1 {v2.8h, v3.8h}, [x10] 1990 sub w9, w15, w9 // 64 - frac 1991 sub 
w11, w15, w11 1992 dup v6.8h, w9 // 64 - frac 1993 dup v7.8h, w11 1994 uzp2 v20.8h, v0.8h, v1.8h // top[base+1] 1995 uzp1 v0.8h, v0.8h, v1.8h // top[base] 1996 uzp2 v21.8h, v2.8h, v3.8h 1997 uzp1 v2.8h, v2.8h, v3.8h 1998 umull v16.4s, v0.4h, v6.4h // top[base]*(64-frac) 1999 umlal v16.4s, v20.4h, v4.4h // + top[base+1]*frac 2000 umull2 v17.4s, v0.8h, v6.8h 2001 umlal2 v17.4s, v20.8h, v4.8h 2002 umull v18.4s, v2.4h, v7.4h 2003 umlal v18.4s, v21.4h, v5.4h 2004 umull2 v19.4s, v2.8h, v7.8h 2005 umlal2 v19.4s, v21.8h, v5.8h 2006 rshrn v16.4h, v16.4s, #6 2007 rshrn2 v16.8h, v17.4s, #6 2008 rshrn v17.4h, v18.4s, #6 2009 rshrn2 v17.8h, v19.4s, #6 2010 st1 {v16.8h}, [x0], x1 2011 add w7, w7, w5 // xpos += dx 2012 subs w4, w4, #2 2013 st1 {v17.8h}, [x0], x1 2014 b.gt 8b 2015 ret 2016 201789: 2018 st1 {v31.8h}, [x0], x1 2019 subs w4, w4, #2 2020 st1 {v31.8h}, [x0], x1 2021 b.gt 89b 2022 ret 2023endfunc 2024 2025// void ipred_reverse_16bpc_neon(pixel *dst, const pixel *const src, 2026// const int n); 2027function ipred_reverse_16bpc_neon, export=1 2028 sub x1, x1, #16 2029 add x3, x0, #8 2030 mov x4, #16 20311: 2032 ld1 {v0.8h}, [x1] 2033 subs w2, w2, #8 2034 rev64 v0.8h, v0.8h 2035 sub x1, x1, #16 2036 st1 {v0.d}[1], [x0], x4 2037 st1 {v0.d}[0], [x3], x4 2038 b.gt 1b 2039 ret 2040endfunc 2041 2042const increments 2043 .short 0, 1, 2, 3, 4, 5, 6, 7 2044endconst 2045 2046// void ipred_z2_fill1_16bpc_neon(pixel *dst, const ptrdiff_t stride, 2047// const pixel *const top, 2048// const pixel *const left, 2049// const int width, const int height, 2050// const int dx, const int dy); 2051function ipred_z2_fill1_16bpc_neon, export=1 2052 clz w10, w4 2053 adr x9, L(ipred_z2_fill1_tbl) 2054 sub w10, w10, #25 2055 ldrh w10, [x9, w10, uxtw #1] 2056 mov w8, #(1 << 6) // xpos = 1 << 6 2057 sub x9, x9, w10, uxtw 2058 sub w8, w8, w6 // xpos -= dx 2059 2060 movrel x11, increments 2061 ld1 {v31.8h}, [x11] // increments 2062 neg w7, w7 // -dy 2063 2064 br x9 206540: 2066 
AARCH64_VALID_JUMP_TARGET 2067 2068 dup v30.4h, w7 // -dy 2069 movi v17.8b, #1 2070 2071 mul v16.4h, v31.4h, v30.4h // {0,1,2,3}* -dy 2072 movi v25.8h, #0x3e 2073 add v30.4h, v16.4h, v30.4h // -= dy 2074 2075 // Worst case height for w=4 is 16, but we need at least h+1 elements 2076 ld1 {v0.8h, v1.8h, v2.8h}, [x3] // left[] 2077 2078 movi v26.8h, #64 2079 movi v19.16b, #4 2080 2081 shrn v29.8b, v30.8h, #6 // ypos >> 6 2082 and v27.8b, v30.8b, v25.8b // frac_y 2083 2084 add v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 1 2085 2086 movi v23.4h, #1, lsl #8 2087 shl v29.8b, v29.8b, #1 // 2*base_y 2088 zip1 v29.8b, v29.8b, v29.8b // duplicate elements 2089 movi v17.8b, #2 2090 add v29.8b, v29.8b, v23.8b // 2*base, 2*base+1, ... 2091 2092 add v30.8b, v29.8b, v17.8b // base_y + 1 (*2) 2093 add v28.8b, v29.8b, v19.8b // base_y + 2 (*2) 2094 2095 tbl v18.8b, {v0.16b}, v29.8b // left[base_y] 2096 2097 trn1 v30.2d, v30.2d, v28.2d // base_y + 1, base_y + 2 2098 2099 sub v28.4h, v26.4h, v27.4h // 64 - frac_y 2100 2101 trn1 v31.2d, v31.2d, v31.2d // {0,1,2,3,0,1,2,3} 2102 2103 trn1 v27.2d, v27.2d, v27.2d // frac_y 2104 trn1 v28.2d, v28.2d, v28.2d // 64 - frac_y 2105 2106 movi v29.16b, #4 21074: 2108 asr w9, w8, #6 // base_x 2109 dup v16.4h, w8 // xpos 2110 sub w8, w8, w6 // xpos -= dx 2111 cmp w9, #-4 // base_x <= -4 2112 asr w11, w8, #6 // base_x 2113 b.le 49f 2114 2115 lsl w9, w9, #1 2116 lsl w11, w11, #1 2117 2118 dup v17.4h, w8 // xpos 2119 2120 ldr q4, [x2, w9, sxtw] // top[base_x] 2121 ldr q6, [x2, w11, sxtw] 2122 2123 trn1 v16.2d, v16.2d, v17.2d // xpos 2124 2125 // Cut corners here; only doing tbl over v0-v1 here; we only 2126 // seem to need the last pixel, from v2, after skipping to the 2127 // left-only codepath below. 
2128 tbl v19.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+1], left[base_y+2] 2129 2130 sshr v20.8h, v16.8h, #6 // first base_x for each row 2131 2132 ext v5.16b, v4.16b, v4.16b, #2 // top[base_x+1] 2133 ext v7.16b, v6.16b, v6.16b, #2 2134 2135 and v16.16b, v16.16b, v25.16b // frac_x 2136 2137 trn1 v18.2d, v18.2d, v19.2d // left[base_y], left[base_y+1] 2138 2139 trn1 v4.2d, v4.2d, v6.2d // top[base_x] 2140 trn1 v5.2d, v5.2d, v7.2d // top[base_x+1] 2141 2142 sub v17.8h, v26.8h, v16.8h // 64 - frac_x 2143 2144 add v20.8h, v20.8h, v31.8h // actual base_x 2145 2146 umull v21.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) 2147 umlal v21.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y 2148 umull2 v22.4s, v18.8h, v28.8h 2149 umlal2 v22.4s, v19.8h, v27.8h 2150 2151 umull v23.4s, v4.4h, v17.4h // top[base_x]-*(64-frac_x) 2152 umlal v23.4s, v5.4h, v16.4h // + top[base_x+1]*frac_x 2153 umull2 v24.4s, v4.8h, v17.8h 2154 umlal2 v24.4s, v5.8h, v16.8h 2155 2156 cmge v20.8h, v20.8h, #0 2157 2158 rshrn v21.4h, v21.4s, #6 2159 rshrn2 v21.8h, v22.4s, #6 2160 rshrn v22.4h, v23.4s, #6 2161 rshrn2 v22.8h, v24.4s, #6 2162 2163 bit v21.16b, v22.16b, v20.16b 2164 2165 st1 {v21.d}[0], [x0], x1 2166 sub w8, w8, w6 // xpos -= dx 2167 subs w5, w5, #2 2168 st1 {v21.d}[1], [x0], x1 2169 b.le 9f 2170 2171 ext v18.16b, v19.16b, v19.16b, #8 2172 add v30.16b, v30.16b, v29.16b // base_y += 2 (*2) 2173 b 4b 2174 217549: 2176 tbl v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+2] 2177 2178 trn1 v18.2d, v18.2d, v19.2d // left[base_y], left[base_y+1] 2179 2180 umull v20.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) 2181 umlal v20.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y 2182 umull2 v21.4s, v18.8h, v28.8h 2183 umlal2 v21.4s, v19.8h, v27.8h 2184 2185 rshrn v20.4h, v20.4s, #6 2186 rshrn2 v20.8h, v21.4s, #6 2187 2188 st1 {v20.d}[0], [x0], x1 2189 subs w5, w5, #2 2190 st1 {v20.d}[1], [x0], x1 2191 b.le 9f 2192 2193 ext v18.16b, v19.16b, v19.16b, #8 2194 add v30.16b, v30.16b, 
v29.16b // base_y += 2 (*2) 2195 b 49b 2196 21979: 2198 ret 2199 220080: 2201 AARCH64_VALID_JUMP_TARGET 2202 2203 stp d8, d9, [sp, #-0x40]! 2204 stp d10, d11, [sp, #0x10] 2205 stp d12, d13, [sp, #0x20] 2206 stp d14, d15, [sp, #0x30] 2207 2208 dup v18.8h, w7 // -dy 2209 add x3, x3, #2 // Skip past left[0] 2210 2211 mul v16.8h, v31.8h, v18.8h // {0,1,2,3,4,5,6,7}* -dy 2212 movi v25.8h, #0x3e 2213 add v16.8h, v16.8h, v18.8h // -= dy 2214 2215 // Worst case height for w=8 is 32. 2216 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x3] // left[] 2217 ld1r {v15.8h}, [x2] // left[0] == top[0] 2218 2219 movi v26.8h, #64 2220 movi v19.16b, #4 2221 2222 shrn v29.8b, v16.8h, #6 // ypos >> 6 2223 and v27.16b, v16.16b, v25.16b // frac_y 2224 2225 movi v23.8h, #1, lsl #8 2226 shl v29.8b, v29.8b, #1 // 2*base_y 2227 mov v18.16b, v15.16b // left[0] 2228 zip1 v29.16b, v29.16b, v29.16b // duplicate elements 2229 movi v17.16b, #2 2230 add v29.16b, v29.16b, v23.16b // 2*base, 2*base+1, ... 2231 2232 // Cut corners here; for the first row we don't expect to need to 2233 // read outside of v0. 
2234 tbx v18.16b, {v0.16b}, v29.16b // left[base_y] 2235 2236 add v30.16b, v29.16b, v19.16b // base_y + 2 (*2) 2237 add v29.16b, v29.16b, v17.16b // base_y + 1 (*2) 2238 2239 sub v28.8h, v26.8h, v27.8h // 64 - frac_y 2240 2241 movi v24.16b, #4 22428: 2243 asr w9, w8, #6 // base_x 2244 dup v16.8h, w8 // xpos 2245 sub w8, w8, w6 // xpos -= dx 2246 cmp w9, #-16 // base_x <= -16 2247 asr w11, w8, #6 // base_x 2248 b.le 89f 2249 2250 dup v17.8h, w8 // xpos 2251 2252 add x9, x2, w9, sxtw #1 2253 add x11, x2, w11, sxtw #1 2254 2255 ld1 {v4.8h, v5.8h}, [x9] // top[base_x] 2256 mov v19.16b, v15.16b // left[0] 2257 ld1 {v6.8h, v7.8h}, [x11] 2258 2259 tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1] 2260 2261 mov v20.16b, v15.16b // left[0] 2262 2263 sshr v21.8h, v16.8h, #6 // first base_x 2264 sshr v22.8h, v17.8h, #6 2265 2266 tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b // left[base_y+2] 2267 2268 ext v5.16b, v4.16b, v5.16b, #2 // top[base_x+1] 2269 ext v7.16b, v6.16b, v7.16b, #2 2270 2271 and v16.16b, v16.16b, v25.16b // frac_x 2272 and v17.16b, v17.16b, v25.16b 2273 2274 umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) 2275 umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y 2276 2277 sub v8.8h, v26.8h, v16.8h // 64 - frac_x 2278 sub v9.8h, v26.8h, v17.8h 2279 2280 umull2 v11.4s, v18.8h, v28.8h 2281 umlal2 v11.4s, v19.8h, v27.8h 2282 2283 add v21.8h, v21.8h, v31.8h // actual base_x 2284 add v22.8h, v22.8h, v31.8h 2285 2286 umull v12.4s, v19.4h, v28.4h 2287 umlal v12.4s, v20.4h, v27.4h 2288 umull2 v13.4s, v19.8h, v28.8h 2289 umlal2 v13.4s, v20.8h, v27.8h 2290 2291 rshrn v10.4h, v10.4s, #6 2292 rshrn2 v10.8h, v11.4s, #6 2293 rshrn v11.4h, v12.4s, #6 2294 rshrn2 v11.8h, v13.4s, #6 2295 2296 umull v12.4s, v4.4h, v8.4h // top[base_x]-*(64-frac_x) 2297 umlal v12.4s, v5.4h, v16.4h // + top[base_x+1]*frac_x 2298 umull2 v13.4s, v4.8h, v8.8h 2299 umlal2 v13.4s, v5.8h, v16.8h 2300 umull v14.4s, v6.4h, v9.4h 2301 umlal v14.4s, v7.4h, 
v17.4h 2302 umull2 v18.4s, v6.8h, v9.8h 2303 umlal2 v18.4s, v7.8h, v17.8h 2304 2305 cmge v21.8h, v21.8h, #0 2306 cmge v22.8h, v22.8h, #0 2307 2308 rshrn v12.4h, v12.4s, #6 2309 rshrn2 v12.8h, v13.4s, #6 2310 rshrn v13.4h, v14.4s, #6 2311 rshrn2 v13.8h, v18.4s, #6 2312 2313 bit v10.16b, v12.16b, v21.16b 2314 bit v11.16b, v13.16b, v22.16b 2315 2316 st1 {v10.8h}, [x0], x1 2317 subs w5, w5, #2 2318 sub w8, w8, w6 // xpos -= dx 2319 st1 {v11.8h}, [x0], x1 2320 b.le 9f 2321 2322 mov v18.16b, v20.16b 2323 add v29.16b, v29.16b, v24.16b // base_y += 2 (*2) 2324 add v30.16b, v30.16b, v24.16b // base_y += 2 (*2) 2325 b 8b 2326 232789: 2328 mov v19.16b, v15.16b 2329 mov v20.16b, v15.16b 2330 tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1] 2331 tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b // left[base_y+2] 2332 2333 umull v4.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) 2334 umlal v4.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y 2335 umull2 v5.4s, v18.8h, v28.8h 2336 umlal2 v5.4s, v19.8h, v27.8h 2337 umull v6.4s, v19.4h, v28.4h 2338 umlal v6.4s, v20.4h, v27.4h 2339 umull2 v7.4s, v19.8h, v28.8h 2340 umlal2 v7.4s, v20.8h, v27.8h 2341 2342 rshrn v4.4h, v4.4s, #6 2343 rshrn2 v4.8h, v5.4s, #6 2344 rshrn v5.4h, v6.4s, #6 2345 rshrn2 v5.8h, v7.4s, #6 2346 2347 st1 {v4.8h}, [x0], x1 2348 subs w5, w5, #2 2349 st1 {v5.8h}, [x0], x1 2350 b.le 9f 2351 2352 mov v18.16b, v20.16b 2353 add v29.16b, v29.16b, v24.16b // base_y += 2 (*2) 2354 add v30.16b, v30.16b, v24.16b // base_y += 2 (*2) 2355 b 89b 2356 23579: 2358 ldp d14, d15, [sp, #0x30] 2359 ldp d12, d13, [sp, #0x20] 2360 ldp d10, d11, [sp, #0x10] 2361 ldp d8, d9, [sp], 0x40 2362 ret 2363 2364160: 2365320: 2366640: 2367 AARCH64_VALID_JUMP_TARGET 2368 2369 stp d8, d9, [sp, #-0x40]! 
2370 stp d10, d11, [sp, #0x10] 2371 stp d12, d13, [sp, #0x20] 2372 stp d14, d15, [sp, #0x30] 2373 2374 dup v25.8h, w7 // -dy 2375 add x3, x3, #2 // Skip past left[0] 2376 2377 add x13, x0, x1 // alternating row 2378 lsl x1, x1, #1 // stride *= 2 2379 sub x1, x1, w4, uxtw #1 // stride -= width 2380 2381 movi v11.8h, #8 2382 mul v26.8h, v31.8h, v25.8h // {0,1,2,3,4,5,6,7}* -dy 2383 add v26.8h, v26.8h, v25.8h // -= dy 2384 mul v25.8h, v25.8h, v11.8h // -8*dy 2385 2386 // Worst case height is 64, but we can only fit 32 pixels into 2387 // v0-v3 usable within one tbx instruction. As long as base_y is 2388 // up to 32, we use tbx. 2389 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x3] // left[] 2390 ld1r {v15.8h}, [x2] // left[0] == top[0] 2391 2392 mov w12, w4 // orig w 2393 neg w14, w4 // -w 2394 23951: 2396 mov v23.16b, v26.16b // reset ypos 2397 2398 asr w9, w8, #6 // base_x 2399 dup v16.8h, w8 // xpos 2400 sub w8, w8, w6 // xpos -= dx 2401 cmp w9, w14 // base_x <= -2*w 2402 asr w11, w8, #6 // base_x 2403 b.le 169f 2404 2405 dup v17.8h, w8 // xpos 2406 sub w8, w8, w6 // xpos -= dx 2407 2408 add x9, x2, w9, sxtw #1 2409 add x11, x2, w11, sxtw #1 2410 2411 sshr v21.8h, v16.8h, #6 // first base_x 2412 sshr v22.8h, v17.8h, #6 2413 2414 ld1 {v4.8h}, [x9], #16 // top[base_x] 2415 ld1 {v6.8h}, [x11], #16 2416 2417 movi v10.8h, #0x3e 2418 movi v11.8h, #64 2419 2420 and v16.16b, v16.16b, v10.16b // frac_x 2421 and v17.16b, v17.16b, v10.16b 2422 2423 sub v8.8h, v11.8h, v16.8h // 64 - frac_x 2424 sub v9.8h, v11.8h, v17.8h 2425 2426 add v21.8h, v21.8h, v31.8h // actual base_x 2427 add v22.8h, v22.8h, v31.8h 2428 24292: 2430 smov w10, v22.h[0] 2431 2432 shrn v29.8b, v23.8h, #6 // ypos >> 6 2433 movi v12.8h, #64 2434 cmp w10, #0 // base_x (bottom left) >= 0 2435 smov w10, v29.b[0] // base_y[0] 2436 movi v10.8h, #0x3e 2437 2438 b.ge 4f 2439 and v27.16b, v23.16b, v10.16b // frac_y 2440 cmp w10, #(32-3) 2441 2442 mov v18.16b, v15.16b // left[0] 2443 sub v28.8h, v12.8h, v27.8h // 64 - frac_y 
2444 b.gt 22f 2445 244621: 2447 // base_y < 32, using tbx 2448 shl v29.8b, v29.8b, #1 // 2*base_y 2449 movi v11.8h, #1, lsl #8 2450 zip1 v29.16b, v29.16b, v29.16b // duplicate elements 2451 add v29.16b, v29.16b, v11.16b // 2*base, 2*base+1, ... 2452 2453 movi v13.16b, #2 2454 2455 tbx v18.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y] 2456 2457 add v29.16b, v29.16b, v13.16b // base_y + 1 (*2) 2458 mov v19.16b, v15.16b // left[0] 2459 2460 tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1] 2461 2462 add v29.16b, v29.16b, v13.16b // base_y + 2 (*2) 2463 mov v20.16b, v15.16b // left[0] 2464 2465 tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+2] 2466 2467 b 23f 2468 246922: 2470 // base_y >= 32, using separate loads. 2471 smov w15, v29.b[1] 2472 smov w16, v29.b[2] 2473 add x10, x3, w10, sxtw #1 2474 smov w17, v29.b[3] 2475 add x15, x3, w15, sxtw #1 2476 ld3 {v18.h, v19.h, v20.h}[0], [x10] 2477 smov w10, v29.b[4] 2478 add x16, x3, w16, sxtw #1 2479 ld3 {v18.h, v19.h, v20.h}[1], [x15] 2480 smov w15, v29.b[5] 2481 add x17, x3, w17, sxtw #1 2482 ld3 {v18.h, v19.h, v20.h}[2], [x16] 2483 smov w16, v29.b[6] 2484 add x10, x3, w10, sxtw #1 2485 ld3 {v18.h, v19.h, v20.h}[3], [x17] 2486 smov w17, v29.b[7] 2487 add x15, x3, w15, sxtw #1 2488 add x16, x3, w16, sxtw #1 2489 ld3 {v18.h, v19.h, v20.h}[4], [x10] 2490 add x17, x3, w17, sxtw #1 2491 ld3 {v18.h, v19.h, v20.h}[5], [x15] 2492 ld3 {v18.h, v19.h, v20.h}[6], [x16] 2493 ld3 {v18.h, v19.h, v20.h}[7], [x17] 2494 249523: 2496 2497 ld1 {v5.8h}, [x9], #16 // top[base_x] 2498 ld1 {v7.8h}, [x11], #16 2499 2500 add v23.8h, v23.8h, v25.8h // ypos -= 8*dy 2501 2502 umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) 2503 umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y 2504 umull2 v11.4s, v18.8h, v28.8h 2505 umlal2 v11.4s, v19.8h, v27.8h 2506 umull v12.4s, v19.4h, v28.4h 2507 umlal v12.4s, v20.4h, v27.4h 2508 umull2 v13.4s, v19.8h, v28.8h 2509 umlal2 v13.4s, v20.8h, 
v27.8h 2510 2511 ext v18.16b, v4.16b, v5.16b, #2 // top[base_x+1] 2512 ext v19.16b, v6.16b, v7.16b, #2 2513 2514 rshrn v10.4h, v10.4s, #6 2515 rshrn2 v10.8h, v11.4s, #6 2516 rshrn v11.4h, v12.4s, #6 2517 rshrn2 v11.8h, v13.4s, #6 2518 2519 umull v12.4s, v4.4h, v8.4h // top[base_x]-*(64-frac_x) 2520 umlal v12.4s, v18.4h, v16.4h // + top[base_x+1]*frac_x 2521 umull2 v13.4s, v4.8h, v8.8h 2522 umlal2 v13.4s, v18.8h, v16.8h 2523 umull v14.4s, v6.4h, v9.4h 2524 umlal v14.4s, v19.4h, v17.4h 2525 umull2 v20.4s, v6.8h, v9.8h 2526 umlal2 v20.4s, v19.8h, v17.8h 2527 2528 cmge v18.8h, v21.8h, #0 2529 cmge v19.8h, v22.8h, #0 2530 2531 rshrn v12.4h, v12.4s, #6 2532 rshrn2 v12.8h, v13.4s, #6 2533 rshrn v13.4h, v14.4s, #6 2534 rshrn2 v13.8h, v20.4s, #6 2535 2536 bit v10.16b, v12.16b, v18.16b 2537 bit v11.16b, v13.16b, v19.16b 2538 2539 st1 {v10.8h}, [x0], #16 2540 subs w4, w4, #8 2541 st1 {v11.8h}, [x13], #16 2542 b.le 3f 2543 2544 movi v10.8h, #8 2545 mov v4.16b, v5.16b 2546 mov v6.16b, v7.16b 2547 add v21.8h, v21.8h, v10.8h // base_x += 8 2548 add v22.8h, v22.8h, v10.8h 2549 b 2b 2550 25513: 2552 subs w5, w5, #2 2553 b.le 9f 2554 movi v10.8h, #128 2555 add x0, x0, x1 2556 add x13, x13, x1 2557 mov w4, w12 // reset w 2558 add v26.8h, v26.8h, v10.8h // ypos += 2*(1<<6) 2559 b 1b 2560 25614: // The rest of the row only predicted from top[] 2562 ld1 {v5.8h}, [x9], #16 // top[base_x] 2563 ld1 {v7.8h}, [x11], #16 2564 2565 ext v18.16b, v4.16b, v5.16b, #2 // top[base_x+1] 2566 ext v19.16b, v6.16b, v7.16b, #2 2567 2568 umull v12.4s, v4.4h, v8.4h // top[base_x]-*(64-frac_x) 2569 umlal v12.4s, v18.4h, v16.4h // + top[base_x+1]*frac_x 2570 umull2 v13.4s, v4.8h, v8.8h 2571 umlal2 v13.4s, v18.8h, v16.8h 2572 umull v14.4s, v6.4h, v9.4h 2573 umlal v14.4s, v19.4h, v17.4h 2574 umull2 v20.4s, v6.8h, v9.8h 2575 umlal2 v20.4s, v19.8h, v17.8h 2576 2577 rshrn v12.4h, v12.4s, #6 2578 rshrn2 v12.8h, v13.4s, #6 2579 rshrn v13.4h, v14.4s, #6 2580 rshrn2 v13.8h, v20.4s, #6 2581 2582 st1 {v12.8h}, [x0], 
#16 2583 subs w4, w4, #8 2584 st1 {v13.8h}, [x13], #16 2585 b.le 3b 2586 2587 mov v4.16b, v5.16b 2588 mov v6.16b, v7.16b 2589 b 4b 2590 2591169: // The rest of the block only predicted from left[] 2592 add x1, x1, w4, uxtw #1 // restore stride 2593 mov w12, w5 // orig remaining h 25941: 2595 movi v12.8h, #64 2596 movi v10.8h, #0x3e 2597 2598 shrn v29.8b, v23.8h, #6 // ypos >> 6 2599 and v27.16b, v23.16b, v10.16b // frac_y 2600 2601 smov w10, v29.b[0] // base_y[0] 2602 2603 shl v29.8b, v29.8b, #1 // 2*base_y 2604 movi v11.8h, #1, lsl #8 2605 zip1 v29.16b, v29.16b, v29.16b // duplicate elements 2606 add v23.8h, v23.8h, v25.8h // ypos -= 8*dy 2607 add v29.16b, v29.16b, v11.16b // 2*base, 2*base+1, ... 2608 2609 cmp w10, #(32-1) 2610 2611 mov v18.16b, v15.16b // left[0] 2612 movi v21.16b, #2 2613 2614 sub v28.8h, v12.8h, v27.8h // 64 - frac_y 2615 2616 b.gt 31f 2617 2618 tbx v18.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y] 2619 add v29.16b, v29.16b, v21.16b // base_y + 1 (*2) 2620 26212: 2622 // base_y < 32, using tbx. 
2623 smov w10, v29.b[0] // base_y[0] 2624 mov v19.16b, v15.16b // left[0] 2625 cmp w10, #(64-4) 2626 b.gt 32f 2627 tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1] 2628 add v29.16b, v29.16b, v21.16b // base_y + 2 (*2) 2629 mov v20.16b, v15.16b // left[0] 2630 tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+2] 2631 add v29.16b, v29.16b, v21.16b // next base_y 2632 2633 umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) 2634 umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y 2635 umull2 v11.4s, v18.8h, v28.8h 2636 umlal2 v11.4s, v19.8h, v27.8h 2637 umull v12.4s, v19.4h, v28.4h 2638 umlal v12.4s, v20.4h, v27.4h 2639 umull2 v13.4s, v19.8h, v28.8h 2640 umlal2 v13.4s, v20.8h, v27.8h 2641 2642 rshrn v10.4h, v10.4s, #6 2643 rshrn2 v10.8h, v11.4s, #6 2644 rshrn v11.4h, v12.4s, #6 2645 rshrn2 v11.8h, v13.4s, #6 2646 2647 st1 {v10.8h}, [x0], x1 2648 subs w5, w5, #2 2649 st1 {v11.8h}, [x13], x1 2650 b.le 4f 2651 mov v18.16b, v20.16b 2652 b 2b 2653 265431: // base_y >= 32, using separate loads, loading v18 if we had to bail 2655 // in the prologue. 2656 smov w10, v29.b[0] 2657 smov w15, v29.b[2] 2658 movi v21.16b, #2 2659 smov w16, v29.b[4] 2660 add x10, x3, w10, sxtw 2661 smov w17, v29.b[6] 2662 add x15, x3, w15, sxtw 2663 ld1 {v18.h}[0], [x10] 2664 smov w10, v29.b[8] 2665 add x16, x3, w16, sxtw 2666 ld1 {v18.h}[1], [x15] 2667 smov w15, v29.b[10] 2668 add x17, x3, w17, sxtw 2669 ld1 {v18.h}[2], [x16] 2670 smov w16, v29.b[12] 2671 add x10, x3, w10, sxtw 2672 ld1 {v18.h}[3], [x17] 2673 smov w17, v29.b[14] 2674 add x15, x3, w15, sxtw 2675 add x16, x3, w16, sxtw 2676 ld1 {v18.h}[4], [x10] 2677 add x17, x3, w17, sxtw 2678 ld1 {v18.h}[5], [x15] 2679 add v29.16b, v29.16b, v21.16b // next base_y 2680 ld1 {v18.h}[6], [x16] 2681 ld1 {v18.h}[7], [x17] 2682 268332: // base_y >= 32, using separate loads. 2684 cmp w5, #4 2685 b.lt 34f 268633: // h >= 4, preserving v18 from the previous round, loading v19-v22. 
2687 smov w10, v29.b[0] 2688 subs w5, w5, #4 2689 smov w15, v29.b[2] 2690 movi v10.16b, #8 2691 smov w16, v29.b[4] 2692 add x10, x3, w10, sxtw 2693 smov w17, v29.b[6] 2694 add x15, x3, w15, sxtw 2695 ld4 {v19.h, v20.h, v21.h, v22.h}[0], [x10] 2696 smov w10, v29.b[8] 2697 add x16, x3, w16, sxtw 2698 ld4 {v19.h, v20.h, v21.h, v22.h}[1], [x15] 2699 smov w15, v29.b[10] 2700 add x17, x3, w17, sxtw 2701 ld4 {v19.h, v20.h, v21.h, v22.h}[2], [x16] 2702 smov w16, v29.b[12] 2703 add x10, x3, w10, sxtw 2704 ld4 {v19.h, v20.h, v21.h, v22.h}[3], [x17] 2705 smov w17, v29.b[14] 2706 add x15, x3, w15, sxtw 2707 add x16, x3, w16, sxtw 2708 ld4 {v19.h, v20.h, v21.h, v22.h}[4], [x10] 2709 add x17, x3, w17, sxtw 2710 ld4 {v19.h, v20.h, v21.h, v22.h}[5], [x15] 2711 ld4 {v19.h, v20.h, v21.h, v22.h}[6], [x16] 2712 add v29.16b, v29.16b, v10.16b // next base_y 2713 ld4 {v19.h, v20.h, v21.h, v22.h}[7], [x17] 2714 2715 umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) 2716 umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y 2717 umull2 v11.4s, v18.8h, v28.8h 2718 umlal2 v11.4s, v19.8h, v27.8h 2719 umull v12.4s, v19.4h, v28.4h 2720 umlal v12.4s, v20.4h, v27.4h 2721 umull2 v13.4s, v19.8h, v28.8h 2722 umlal2 v13.4s, v20.8h, v27.8h 2723 2724 rshrn v10.4h, v10.4s, #6 2725 rshrn2 v10.8h, v11.4s, #6 2726 rshrn v11.4h, v12.4s, #6 2727 rshrn2 v11.8h, v13.4s, #6 2728 2729 umull v12.4s, v20.4h, v28.4h // left[base_y]*(64-frac_y) 2730 umlal v12.4s, v21.4h, v27.4h // + left[base_y+1]*frac_y 2731 umull2 v13.4s, v20.8h, v28.8h 2732 umlal2 v13.4s, v21.8h, v27.8h 2733 umull v14.4s, v21.4h, v28.4h 2734 umlal v14.4s, v22.4h, v27.4h 2735 umull2 v18.4s, v21.8h, v28.8h 2736 umlal2 v18.4s, v22.8h, v27.8h 2737 2738 rshrn v12.4h, v12.4s, #6 2739 rshrn2 v12.8h, v13.4s, #6 2740 rshrn v13.4h, v14.4s, #6 2741 rshrn2 v13.8h, v18.4s, #6 2742 2743 st1 {v10.8h}, [x0], x1 2744 cmp w5, #2 2745 st1 {v11.8h}, [x13], x1 2746 st1 {v12.8h}, [x0], x1 2747 st1 {v13.8h}, [x13], x1 2748 b.lt 4f 2749 mov v18.16b, v22.16b 
2750 b.gt 33b 2751 275234: // h == 2, preserving v18 from the previous round, loading v19-v20. 2753 smov w10, v29.b[0] 2754 smov w15, v29.b[2] 2755 movi v21.16b, #4 2756 smov w16, v29.b[4] 2757 add x10, x3, w10, sxtw 2758 smov w17, v29.b[6] 2759 add x15, x3, w15, sxtw 2760 ld2 {v19.h, v20.h}[0], [x10] 2761 smov w10, v29.b[8] 2762 add x16, x3, w16, sxtw 2763 ld2 {v19.h, v20.h}[1], [x15] 2764 smov w15, v29.b[10] 2765 add x17, x3, w17, sxtw 2766 ld2 {v19.h, v20.h}[2], [x16] 2767 smov w16, v29.b[12] 2768 add x10, x3, w10, sxtw 2769 ld2 {v19.h, v20.h}[3], [x17] 2770 smov w17, v29.b[14] 2771 add x15, x3, w15, sxtw 2772 add x16, x3, w16, sxtw 2773 ld2 {v19.h, v20.h}[4], [x10] 2774 add x17, x3, w17, sxtw 2775 ld2 {v19.h, v20.h}[5], [x15] 2776 ld2 {v19.h, v20.h}[6], [x16] 2777 add v29.16b, v29.16b, v21.16b // next base_y 2778 ld2 {v19.h, v20.h}[7], [x17] 2779 2780 umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) 2781 umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y 2782 umull2 v11.4s, v18.8h, v28.8h 2783 umlal2 v11.4s, v19.8h, v27.8h 2784 umull v12.4s, v19.4h, v28.4h 2785 umlal v12.4s, v20.4h, v27.4h 2786 umull2 v13.4s, v19.8h, v28.8h 2787 umlal2 v13.4s, v20.8h, v27.8h 2788 2789 rshrn v10.4h, v10.4s, #6 2790 rshrn2 v10.8h, v11.4s, #6 2791 rshrn v11.4h, v12.4s, #6 2792 rshrn2 v11.8h, v13.4s, #6 2793 2794 st1 {v10.8h}, [x0], x1 2795 st1 {v11.8h}, [x13], x1 2796 // The h==2 case only happens once at the end, if at all. 

4:      // Advance to the next 8-pixel-wide column strip (w > 8 case).
        subs            w4, w4, #8
        b.le            9f

        lsr             x1, x1, #1
        msub            x0, x1, x12, x0         // ptr -= h * stride
        msub            x13, x1, x12, x13
        lsl             x1, x1, #1
        add             x0, x0, #16
        add             x13, x13, #16
        mov             w5, w12                 // reset h
        b               1b

9:      // Restore callee-saved SIMD regs (AAPCS64: low 64 bits of v8-v15).
        ldp             d14, d15, [sp, #0x30]
        ldp             d12, d13, [sp, #0x20]
        ldp             d10, d11, [sp, #0x10]
        ldp             d8,  d9,  [sp], 0x40
        ret

L(ipred_z2_fill1_tbl):
        .hword L(ipred_z2_fill1_tbl) - 640b
        .hword L(ipred_z2_fill1_tbl) - 320b
        .hword L(ipred_z2_fill1_tbl) - 160b
        .hword L(ipred_z2_fill1_tbl) -  80b
        .hword L(ipred_z2_fill1_tbl) -  40b
endfunc

// void ipred_z2_fill2_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                                const pixel *const top,
//                                const pixel *const left,
//                                const int width, const int height,
//                                const int dx, const int dy);
// NOTE(review): prototype inferred from register usage in this body
// (x0=dst, x1=stride, x2=top, x3=left, w4=width, w5=height, w6=dx, w7=dy)
// -- confirm against the C-side declaration.
// Variant for an upsampled top edge: top[] holds interleaved even/odd
// samples, hence the uzp1/uzp2 pairs below and xpos starting at 2 << 6.
function ipred_z2_fill2_16bpc_neon, export=1
        cmp             w4, #8
        mov             w8, #(2 << 6)           // xpos = 2 << 6
        sub             w8, w8, w6              // xpos -= dx

        movrel          x11, increments
        ld1             {v31.8h}, [x11]         // increments
        neg             w7, w7                  // -dy
        b.eq            80f

40:     // w == 4 path (w4 != 8 here; upsample implies w <= 8).
        dup             v30.4h, w7              // -dy
        movi            v17.8b, #1

        mul             v16.4h, v31.4h, v30.4h  // {0,1,2,3}* -dy
        movi            v25.8h, #0x3e
        add             v30.4h, v16.4h, v30.4h  // -= dy

        // For upsample_top, w <= 8 and h <= 8; we may need up to h+1 elements
        // from left.
        ld1             {v0.8h, v1.8h}, [x3]    // left[]

        movi            v26.8h, #64
        movi            v19.16b, #4

        shrn            v29.8b, v30.8h, #6      // ypos >> 6
        and             v27.8b, v30.8b, v25.8b  // frac_y

        add             v29.8b, v29.8b, v17.8b  // base_y = (ypos >> 6) + 1

        movi            v23.4h, #1, lsl #8
        shl             v29.8b, v29.8b, #1      // 2*base_y
        zip1            v29.8b, v29.8b, v29.8b  // duplicate elements
        movi            v17.8b, #2
        add             v29.8b, v29.8b, v23.8b  // 2*base, 2*base+1, ...

        add             v30.8b, v29.8b, v17.8b  // base_y + 1 (*2)
        add             v28.8b, v29.8b, v19.8b  // base_y + 2 (*2)

        tbl             v18.8b, {v0.16b}, v29.8b // left[base_y]

        trn1            v30.2d, v30.2d, v28.2d  // base_y + 1, base_y + 2

        sub             v28.4h, v26.4h, v27.4h  // 64 - frac_y

        trn1            v31.2d, v31.2d, v31.2d  // {0,1,2,3,0,1,2,3}

        trn1            v27.2d, v27.2d, v27.2d  // frac_y
        trn1            v28.2d, v28.2d, v28.2d  // 64 - frac_y

        movi            v29.16b, #4
        add             v31.8h, v31.8h, v31.8h  // {0,2,4,6,0,2,4,6}
4:      // Two rows per iteration; blends top- and left-derived predictions.
        asr             w9, w8, #6              // base_x
        dup             v16.4h, w8              // xpos
        sub             w8, w8, w6              // xpos -= dx
        cmp             w9, #-8                 // base_x <= -8
        asr             w11, w8, #6             // base_x
        b.le            49f                     // rest predicted from left[] only

        lsl             w9, w9, #1
        lsl             w11, w11, #1

        dup             v17.4h, w8              // xpos

        ldr             q4, [x2, w9, sxtw]      // top[base_x]
        ldr             q6, [x2, w11, sxtw]

        trn1            v16.2d, v16.2d, v17.2d  // xpos

        tbl             v19.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+1], left[base_y+2]

        sshr            v20.8h, v16.8h, #6      // first base_x for each row

        // top[] is interleaved even/odd (upsampled): deinterleave into
        // consecutive top[base_x] / top[base_x+1] lanes.
        uzp2            v5.8h, v4.8h, v6.8h     // top[base_x+1]
        uzp1            v4.8h, v4.8h, v6.8h     // top[base_x]

        and             v16.16b, v16.16b, v25.16b // frac_x

        trn1            v18.2d, v18.2d, v19.2d  // left[base_y], left[base_y+1]

        sub             v17.8h, v26.8h, v16.8h  // 64 - frac_x

        add             v20.8h, v20.8h, v31.8h  // actual base_x

        umull           v21.4s, v18.4h, v28.4h  // left[base_y]*(64-frac_y)
        umlal           v21.4s, v19.4h, v27.4h  // + left[base_y+1]*frac_y
        umull2          v22.4s, v18.8h, v28.8h
        umlal2          v22.4s, v19.8h, v27.8h

        umull           v23.4s, v4.4h, v17.4h   // top[base_x]-*(64-frac_x)
        umlal           v23.4s, v5.4h, v16.4h   // + top[base_x+1]*frac_x
        umull2          v24.4s, v4.8h, v17.8h
        umlal2          v24.4s, v5.8h, v16.8h

        cmge            v20.8h, v20.8h, #0      // base_x >= 0: take top-pred

        rshrn           v21.4h, v21.4s, #6
        rshrn2          v21.8h, v22.4s, #6
        rshrn           v22.4h, v23.4s, #6
        rshrn2          v22.8h, v24.4s, #6

        bit             v21.16b, v22.16b, v20.16b // select top/left per lane

        st1             {v21.d}[0], [x0], x1
        sub             w8, w8, w6              // xpos -= dx
        subs            w5, w5, #2
        st1             {v21.d}[1], [x0], x1
        b.le            9f

        ext             v18.16b, v19.16b, v19.16b, #8
        add             v30.16b, v30.16b, v29.16b // base_y += 2 (*2)
        b               4b

49:     // Remaining rows predicted from left[] only (base_x off the left edge).
        tbl             v19.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+1], left[base_y+2]

        trn1            v18.2d, v18.2d, v19.2d  // left[base_y], left[base_y+1]

        umull           v20.4s, v18.4h, v28.4h  // left[base_y]*(64-frac_y)
        umlal           v20.4s, v19.4h, v27.4h  // + left[base_y+1]*frac_y
        umull2          v21.4s, v18.8h, v28.8h
        umlal2          v21.4s, v19.8h, v27.8h

        rshrn           v20.4h, v20.4s, #6
        rshrn2          v20.8h, v21.4s, #6

        st1             {v20.d}[0], [x0], x1
        subs            w5, w5, #2
        st1             {v20.d}[1], [x0], x1
        b.le            9f

        ext             v18.16b, v19.16b, v19.16b, #8
        add             v30.16b, v30.16b, v29.16b // base_y += 2 (*2)
        b               49b

9:
        ret

80:     // w == 8 path; needs v8-v15, so save callee-saved SIMD regs.
        stp             d8,  d9,  [sp, #-0x40]!
        stp             d10, d11, [sp, #0x10]
        stp             d12, d13, [sp, #0x20]
        stp             d14, d15, [sp, #0x30]

        dup             v18.8h, w7              // -dy
        movi            v17.8b, #1

        mul             v16.8h, v31.8h, v18.8h  // {0,1,2,3,4,5,6,7}* -dy
        movi            v25.8h, #0x3e
        add             v16.8h, v16.8h, v18.8h  // -= dy

        // For upsample_top, w <= 8 and h <= 8; we may need up to h+1 elements
        // from left.
        ld1             {v0.8h, v1.8h}, [x3]    // left[]

        movi            v26.8h, #64
        movi            v19.16b, #4

        shrn            v29.8b, v16.8h, #6      // ypos >> 6
        and             v27.16b, v16.16b, v25.16b // frac_y

        add             v29.8b, v29.8b, v17.8b  // base_y = (ypos >> 6) + 1

        movi            v23.8h, #1, lsl #8
        shl             v29.8b, v29.8b, #1      // 2*base_y
        zip1            v29.16b, v29.16b, v29.16b // duplicate elements
        movi            v17.16b, #2
        add             v29.16b, v29.16b, v23.16b // 2*base, 2*base+1, ...

        // Cut corners here; for the first row we don't expect to need to
        // read outside of v0.
        tbl             v18.16b, {v0.16b}, v29.16b // left[base_y]

        add             v30.16b, v29.16b, v19.16b // base_y + 2 (*2)
        add             v29.16b, v29.16b, v17.16b // base_y + 1 (*2)

        sub             v28.8h, v26.8h, v27.8h  // 64 - frac_y

        movi            v24.16b, #4
        add             v31.16b, v31.16b, v31.16b // {0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14}
8:      // Two rows per iteration, 8 pixels wide.
        asr             w9, w8, #6              // base_x
        dup             v16.8h, w8              // xpos
        sub             w8, w8, w6              // xpos -= dx
        cmp             w9, #-16                // base_x <= -16
        asr             w11, w8, #6             // base_x
        b.le            89f                     // rest predicted from left[] only

        dup             v17.8h, w8              // xpos

        add             x9,  x2, w9,  sxtw #1
        add             x11, x2, w11, sxtw #1

        ld1             {v4.8h, v5.8h}, [x9]    // top[base_x]
        ld1             {v6.8h, v7.8h}, [x11]

        tbl             v19.16b, {v0.16b, v1.16b}, v29.16b // left[base_y+1]

        sshr            v21.8h, v16.8h, #6      // first base_x
        sshr            v22.8h, v17.8h, #6

        tbl             v20.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+2]

        // Deinterleave the upsampled top edge for both rows.
        uzp2            v2.8h, v4.8h, v5.8h     // top[base_x+1]
        uzp1            v4.8h, v4.8h, v5.8h     // top[base_x]
        uzp2            v3.8h, v6.8h, v7.8h
        uzp1            v6.8h, v6.8h, v7.8h
        mov             v5.16b, v2.16b
        mov             v7.16b, v3.16b

        and             v16.16b, v16.16b, v25.16b // frac_x
        and             v17.16b, v17.16b, v25.16b

        umull           v10.4s, v18.4h, v28.4h  // left[base_y]*(64-frac_y)
        umlal           v10.4s, v19.4h, v27.4h  // + left[base_y+1]*frac_y

        sub             v8.8h, v26.8h, v16.8h   // 64 - frac_x
        sub             v9.8h, v26.8h, v17.8h

        umull2          v11.4s, v18.8h, v28.8h
        umlal2          v11.4s, v19.8h, v27.8h

        add             v21.8h, v21.8h, v31.8h  // actual base_x
        add             v22.8h, v22.8h, v31.8h

        umull           v12.4s, v19.4h, v28.4h
        umlal           v12.4s, v20.4h, v27.4h
        umull2          v13.4s, v19.8h, v28.8h
        umlal2          v13.4s, v20.8h, v27.8h

        rshrn           v10.4h, v10.4s, #6
        rshrn2          v10.8h, v11.4s, #6
        rshrn           v11.4h, v12.4s, #6
        rshrn2          v11.8h, v13.4s, #6

        umull           v12.4s, v4.4h, v8.4h    // top[base_x]-*(64-frac_x)
        umlal           v12.4s, v5.4h, v16.4h   // + top[base_x+1]*frac_x
        umull2          v13.4s, v4.8h, v8.8h
        umlal2          v13.4s, v5.8h, v16.8h
        umull           v14.4s, v6.4h, v9.4h
        umlal           v14.4s, v7.4h, v17.4h
        umull2          v18.4s, v6.8h, v9.8h
        umlal2          v18.4s, v7.8h, v17.8h

        cmge            v21.8h, v21.8h, #0      // base_x >= 0: take top-pred
        cmge            v22.8h, v22.8h, #0

        rshrn           v12.4h, v12.4s, #6
        rshrn2          v12.8h, v13.4s, #6
        rshrn           v13.4h, v14.4s, #6
        rshrn2          v13.8h, v18.4s, #6

        bit             v10.16b, v12.16b, v21.16b
        bit             v11.16b, v13.16b, v22.16b

        st1             {v10.8h}, [x0], x1
        subs            w5, w5, #2
        sub             w8, w8, w6              // xpos -= dx
        st1             {v11.8h}, [x0], x1
        b.le            9f

        mov             v18.16b, v20.16b
        add             v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
        add             v30.16b, v30.16b, v24.16b // base_y += 2 (*2)
        b               8b

89:     // Remaining rows predicted from left[] only.
        tbl             v19.16b, {v0.16b, v1.16b}, v29.16b // left[base_y+1]
        tbl             v20.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+2]

        umull           v4.4s, v18.4h, v28.4h   // left[base_y]*(64-frac_y)
        umlal           v4.4s, v19.4h, v27.4h   // + left[base_y+1]*frac_y
        umull2          v5.4s, v18.8h, v28.8h
        umlal2          v5.4s, v19.8h, v27.8h
        umull           v6.4s, v19.4h, v28.4h
        umlal           v6.4s, v20.4h, v27.4h
        umull2          v7.4s, v19.8h, v28.8h
        umlal2          v7.4s, v20.8h, v27.8h

        rshrn           v4.4h, v4.4s, #6
        rshrn2          v4.8h, v5.4s, #6
        rshrn           v5.4h, v6.4s, #6
        rshrn2          v5.8h, v7.4s, #6

        st1             {v4.8h}, [x0], x1
        subs            w5, w5, #2
        st1             {v5.8h}, [x0], x1
        b.le            9f

        mov             v18.16b, v20.16b
        add             v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
        add             v30.16b, v30.16b, v24.16b // base_y += 2 (*2)
        b               89b

9:
        ldp             d14, d15, [sp, #0x30]
        ldp             d12, d13, [sp, #0x20]
        ldp             d10, d11, [sp, #0x10]
        ldp             d8,  d9,  [sp], 0x40
        ret
endfunc

// void ipred_z2_fill3_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                                const pixel *const top,
//                                const pixel *const left,
//                                const int width, const int height,
//                                const int dx, const int dy);
// NOTE(review): prototype inferred from register usage, same layout as
// fill2 above -- confirm against the C-side declaration.
// Variant for an upsampled left edge (base_y steps by 2), xpos starts
// at 1 << 6.
function ipred_z2_fill3_16bpc_neon, export=1
        cmp             w4, #8
        mov             w8, #(1 << 6)           // xpos = 1 << 6
        sub             w8, w8, w6              // xpos -= dx

        movrel          x11, increments
        ld1             {v31.8h}, [x11]         // increments
        neg             w7, w7                  // -dy
        b.eq            80f

40:     // w == 4 path.
        dup             v30.4h, w7              // -dy
        movi            v17.8b, #1

        mul             v16.4h, v31.4h, v30.4h  // {0,1,2,3}* -dy
        movi            v25.8h, #0x3e
        add             v30.4h, v16.4h, v30.4h  // -= dy

        // For upsample_left, w <= 8 and h <= 8; we may need up to 2*h+1 elements.
        ld1             {v0.8h, v1.8h, v2.8h}, [x3] // left[]

        movi            v26.8h, #64
        movi            v19.16b, #2

        shrn            v29.8b, v30.8h, #6      // ypos >> 6
        and             v27.8b, v30.8b, v25.8b  // frac_y

        add             v29.8b, v29.8b, v19.8b  // base_y = (ypos >> 6) + 2

        movi            v23.4h, #1, lsl #8
        shl             v29.8b, v29.8b, #1      // 2*base_y
        movi            v19.16b, #4
        zip1            v29.8b, v29.8b, v29.8b  // duplicate elements
        movi            v17.8b, #2
        add             v29.8b, v29.8b, v23.8b  // 2*base, 2*base+1, ...

        add             v30.8b, v29.8b, v17.8b  // base_y + 1 (*2)
        add             v28.8b, v29.8b, v19.8b  // base_y + 2 (*2)

        trn1            v31.2d, v31.2d, v31.2d  // {0,1,2,3,0,1,2,3}

        add             v24.8b, v30.8b, v19.8b  // base_y + 3 (*2)

        trn1            v29.2d, v29.2d, v28.2d  // base_y + 0, base_y + 2
        trn1            v30.2d, v30.2d, v24.2d  // base_y + 1, base_y + 3

        sub             v28.4h, v26.4h, v27.4h  // 64 - frac_y

        trn1            v27.2d, v27.2d, v27.2d  // frac_y
        trn1            v28.2d, v28.2d, v28.2d  // 64 - frac_y

        movi            v24.16b, #8
4:      // Two rows per iteration; upsampled left edge steps base_y by 2/row.
        asr             w9, w8, #6              // base_x
        dup             v16.4h, w8              // xpos
        sub             w8, w8, w6              // xpos -= dx
        cmp             w9, #-4                 // base_x <= -4
        asr             w11, w8, #6             // base_x
        b.le            49f                     // rest predicted from left[] only

        lsl             w9, w9, #1
        lsl             w11, w11, #1

        dup             v17.4h, w8              // xpos

        ldr             q4, [x2, w9, sxtw]      // top[base_x]
        ldr             q6, [x2, w11, sxtw]

        trn1            v16.2d, v16.2d, v17.2d  // xpos

        tbl             v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0], left[base_y+2]
        tbl             v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+3]

        sshr            v20.8h, v16.8h, #6      // first base_x for each row

        ext             v5.16b, v4.16b, v4.16b, #2 // top[base_x+1]
        ext             v7.16b, v6.16b, v6.16b, #2

        and             v16.16b, v16.16b, v25.16b // frac_x

        trn1            v4.2d, v4.2d, v6.2d     // top[base_x]
        trn1            v5.2d, v5.2d, v7.2d     // top[base_x+1]

        sub             v17.8h, v26.8h, v16.8h  // 64 - frac_x

        add             v20.8h, v20.8h, v31.8h  // actual base_x

        umull           v21.4s, v18.4h, v28.4h  // left[base_y]*(64-frac_y)
        umlal           v21.4s, v19.4h, v27.4h  // + left[base_y+1]*frac_y
        umull2          v22.4s, v18.8h, v28.8h
        umlal2          v22.4s, v19.8h, v27.8h

        umull           v23.4s, v4.4h, v17.4h   // top[base_x]-*(64-frac_x)
        umlal           v23.4s, v5.4h, v16.4h   // + top[base_x+1]*frac_x
        umull2          v24.4s, v4.8h, v17.8h
        umlal2          v24.4s, v5.8h, v16.8h

        cmge            v20.8h, v20.8h, #0      // base_x >= 0: take top-pred

        rshrn           v21.4h, v21.4s, #6
        rshrn2          v21.8h, v22.4s, #6
        rshrn           v22.4h, v23.4s, #6
        rshrn2          v22.8h, v24.4s, #6

        movi            v24.16b, #8             // reload; v24 was clobbered above

        bit             v21.16b, v22.16b, v20.16b

        st1             {v21.d}[0], [x0], x1
        sub             w8, w8, w6              // xpos -= dx
        subs            w5, w5, #2
        st1             {v21.d}[1], [x0], x1
        b.le            9f

        add             v29.16b, v29.16b, v24.16b // base_y += 4 (*2)
        add             v30.16b, v30.16b, v24.16b // base_y += 4 (*2)
        b               4b

49:     // Remaining rows predicted from left[] only.
        tbl             v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0], left[base_y+2]
        tbl             v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+3]

        umull           v20.4s, v18.4h, v28.4h  // left[base_y]*(64-frac_y)
        umlal           v20.4s, v19.4h, v27.4h  // + left[base_y+1]*frac_y
        umull2          v21.4s, v18.8h, v28.8h
        umlal2          v21.4s, v19.8h, v27.8h

        rshrn           v20.4h, v20.4s, #6
        rshrn2          v20.8h, v21.4s, #6

        st1             {v20.d}[0], [x0], x1
        subs            w5, w5, #2
        st1             {v20.d}[1], [x0], x1
        b.le            9f

        add             v29.16b, v29.16b, v24.16b // base_y += 4 (*2)
        add             v30.16b, v30.16b, v24.16b // base_y += 4 (*2)
        b               49b

9:
        ret

80:     // w == 8 path; needs v8-v15, so save callee-saved SIMD regs.
        stp             d8,  d9,  [sp, #-0x40]!
        stp             d10, d11, [sp, #0x10]
        stp             d12, d13, [sp, #0x20]
        stp             d14, d15, [sp, #0x30]

        dup             v18.8h, w7              // -dy
        movi            v17.16b, #2

        mul             v16.8h, v31.8h, v18.8h  // {0,1,2,3,4,5,6,7}* -dy
        movi            v25.8h, #0x3e
        add             v16.8h, v16.8h, v18.8h  // -= dy

        // For upsample_left, w <= 8 and h <= 8; we may need up to 2*h+1 elements.
        ld1             {v0.8h, v1.8h, v2.8h}, [x3] // left[]

        movi            v26.8h, #64
        movi            v19.16b, #4

        shrn            v29.8b, v16.8h, #6      // ypos >> 6
        and             v27.16b, v16.16b, v25.16b // frac_y

        add             v29.8b, v29.8b, v17.8b  // base_y = (ypos >> 6) + 2

        movi            v23.8h, #1, lsl #8
        shl             v29.8b, v29.8b, #1      // 2*base_y
        // NOTE(review): v15 is never written anywhere in this function, and
        // v18 is overwritten by the tbl at label 8: (and at 89:) before any
        // read -- this mov looks like a dead leftover from the fill1 (tbx)
        // variant; verify and consider removing.
        mov             v18.16b, v15.16b        // left[0]
        zip1            v29.16b, v29.16b, v29.16b // duplicate elements
        add             v29.16b, v29.16b, v23.16b // 2*base, 2*base+1, ...

        add             v30.16b, v29.16b, v17.16b // base_y + 1 (*2)

        sub             v28.8h, v26.8h, v27.8h  // 64 - frac_y

        movi            v24.16b, #4
8:      // Two rows per iteration, 8 pixels wide.
        asr             w9, w8, #6              // base_x
        dup             v16.8h, w8              // xpos
        sub             w8, w8, w6              // xpos -= dx
        cmp             w9, #-16                // base_x <= -16
        asr             w11, w8, #6             // base_x
        b.le            89f                     // rest predicted from left[] only

        dup             v17.8h, w8              // xpos

        add             x9,  x2, w9,  sxtw #1
        add             x11, x2, w11, sxtw #1

        ld1             {v4.8h, v5.8h}, [x9]    // top[base_x]
        ld1             {v6.8h, v7.8h}, [x11]

        tbl             v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0]
        add             v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
        tbl             v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1]
        add             v30.16b, v30.16b, v24.16b

        sshr            v22.8h, v16.8h, #6      // first base_x
        tbl             v20.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+2]
        sshr            v23.8h, v17.8h, #6
        tbl             v21.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+3]

        ext             v5.16b, v4.16b, v5.16b, #2 // top[base_x+1]
        ext             v7.16b, v6.16b, v7.16b, #2

        and             v16.16b, v16.16b, v25.16b // frac_x
        and             v17.16b, v17.16b, v25.16b

        umull           v10.4s, v18.4h, v28.4h  // left[base_y]*(64-frac_y)
        umlal           v10.4s, v19.4h, v27.4h  // + left[base_y+1]*frac_y

        sub             v8.8h, v26.8h, v16.8h   // 64 - frac_x
        sub             v9.8h, v26.8h, v17.8h

        umull2          v11.4s, v18.8h, v28.8h
        umlal2          v11.4s, v19.8h, v27.8h

        add             v22.8h, v22.8h, v31.8h  // actual base_x
        add             v23.8h, v23.8h, v31.8h

        umull           v12.4s, v20.4h, v28.4h
        umlal           v12.4s, v21.4h, v27.4h
        umull2          v13.4s, v20.8h, v28.8h
        umlal2          v13.4s, v21.8h, v27.8h

        rshrn           v10.4h, v10.4s, #6
        rshrn2          v10.8h, v11.4s, #6
        rshrn           v11.4h, v12.4s, #6
        rshrn2          v11.8h, v13.4s, #6

        umull           v12.4s, v4.4h, v8.4h    // top[base_x]-*(64-frac_x)
        umlal           v12.4s, v5.4h, v16.4h   // + top[base_x+1]*frac_x
        umull2          v13.4s, v4.8h, v8.8h
        umlal2          v13.4s, v5.8h, v16.8h
        umull           v14.4s, v6.4h, v9.4h
        umlal           v14.4s, v7.4h, v17.4h
        umull2          v18.4s, v6.8h, v9.8h
        umlal2          v18.4s, v7.8h, v17.8h

        cmge            v22.8h, v22.8h, #0      // base_x >= 0: take top-pred
        cmge            v23.8h, v23.8h, #0

        rshrn           v12.4h, v12.4s, #6
        rshrn2          v12.8h, v13.4s, #6
        rshrn           v13.4h, v14.4s, #6
        rshrn2          v13.8h, v18.4s, #6

        bit             v10.16b, v12.16b, v22.16b
        bit             v11.16b, v13.16b, v23.16b

        st1             {v10.8h}, [x0], x1
        subs            w5, w5, #2
        sub             w8, w8, w6              // xpos -= dx
        st1             {v11.8h}, [x0], x1
        b.le            9f

        add             v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
        add             v30.16b, v30.16b, v24.16b
        b               8b

89:     // Remaining rows predicted from left[] only.
        tbl             v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0]
        add             v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
        tbl             v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1]
        add             v30.16b, v30.16b, v24.16b
        tbl             v20.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+2]
        tbl             v21.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+3]

        umull           v4.4s, v18.4h, v28.4h   // left[base_y]*(64-frac_y)
        umlal           v4.4s, v19.4h, v27.4h   // + left[base_y+1]*frac_y
        umull2          v5.4s, v18.8h, v28.8h
        umlal2          v5.4s, v19.8h, v27.8h
        umull           v6.4s, v20.4h, v28.4h
        umlal           v6.4s, v21.4h, v27.4h
        umull2          v7.4s, v20.8h, v28.8h
        umlal2          v7.4s, v21.8h, v27.8h

        rshrn           v4.4h, v4.4s, #6
        rshrn2          v4.8h, v5.4s, #6
        rshrn           v5.4h, v6.4s, #6
        rshrn2          v5.8h, v7.4s, #6

        st1             {v4.8h}, [x0], x1
        subs            w5, w5, #2
        st1             {v5.8h}, [x0], x1
        b.le            9f

        add             v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
        add             v30.16b, v30.16b, v24.16b
        b               89b

9:
        ldp             d14, d15, [sp, #0x30]
        ldp             d12, d13, [sp, #0x20]
        ldp             d10, d11, [sp, #0x10]
        ldp             d8,  d9,  [sp], 0x40
        ret
endfunc

// void ipred_z3_fill1_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                                const pixel *const left,
//                                const int width, const int height,
//                                const int dy, const int max_base_y);
function ipred_z3_fill1_16bpc_neon, export=1
        clz             w9, w4
        adr             x8, L(ipred_z3_fill1_tbl)
        sub             w9, w9, #25
        ldrh            w9, [x8, w9, uxtw #1]
        add             x10, x2, w6, uxtw #1    // left[max_base_y]
        sub             x8, x8, w9, uxtw
        ld1r            {v31.8h}, [x10]         // padding
        mov             w7, w5
        mov             w15, #64
        add             x13, x0, x1
        lsl             x1, x1, #1
        br              x8

40:
        AARCH64_VALID_JUMP_TARGET
4:
        lsr             w8, w7, #6              // base
        and             w9, w7, #0x3e           // frac
        add             w7, w7, w5              // xpos += dx
        cmp             w8, w6                  // base >= max_base_x
        lsr             w10, w7, #6             // base
        and             w11, w7, #0x3e          // frac
        b.ge            ipred_z3_fill_padding_neon
        lsl             w8, w8, #1
        lsl             w10, w10, #1
        ldr             q0, [x2, w8, uxtw]      // left[base]
        ldr             q2, [x2, w10, uxtw]
        dup             v4.8h, w9               // frac
        dup             v5.8h, w11
        ext             v1.16b, v0.16b, v0.16b, #2 // left[base+1]
        ext             v3.16b, v2.16b, v2.16b, #2
        sub             v6.4h, v1.4h, v0.4h     // top[base+1]-top[base]
        sub             v7.4h, v3.4h, v2.4h
        ushll           v16.4s, v0.4h, #6       // top[base]*64
        ushll           v17.4s, v2.4h, #6
        smlal           v16.4s, v6.4h, v4.4h    // + top[base+1]*frac
        smlal           v17.4s, v7.4h, v5.4h
        rshrn           v16.4h, v16.4s, #6
        rshrn v17.4h,
v17.4s, #6 3473 subs w3, w3, #2 3474 zip1 v18.8h, v16.8h, v17.8h 3475 st1 {v18.s}[0], [x0], x1 3476 st1 {v18.s}[1], [x13], x1 3477 add w7, w7, w5 // xpos += dx 3478 st1 {v18.s}[2], [x0] 3479 st1 {v18.s}[3], [x13] 3480 b.le 9f 3481 sub x0, x0, x1 // ptr -= 4 * (2*stride) 3482 sub x13, x13, x1 3483 add x0, x0, #4 3484 add x13, x13, #4 3485 b 4b 34869: 3487 ret 3488 348980: 3490 AARCH64_VALID_JUMP_TARGET 34918: 3492 lsr w8, w7, #6 // base 3493 and w9, w7, #0x3e // frac 3494 add w7, w7, w5 // xpos += dx 3495 cmp w8, w6 // base >= max_base_x 3496 lsr w10, w7, #6 // base 3497 and w11, w7, #0x3e // frac 3498 b.ge ipred_z3_fill_padding_neon 3499 add x8, x2, w8, uxtw #1 3500 add x10, x2, w10, uxtw #1 3501 dup v4.8h, w9 // frac 3502 dup v5.8h, w11 3503 ld1 {v0.8h}, [x8] // left[base] 3504 ld1 {v2.8h}, [x10] 3505 sub w9, w15, w9 // 64 - frac 3506 sub w11, w15, w11 3507 ldr h1, [x8, #16] 3508 ldr h3, [x10, #16] 3509 dup v6.8h, w9 // 64 - frac 3510 dup v7.8h, w11 3511 ext v1.16b, v0.16b, v1.16b, #2 // left[base+1] 3512 ext v3.16b, v2.16b, v3.16b, #2 3513 umull v16.4s, v0.4h, v6.4h // left[base]*(64-frac) 3514 umlal v16.4s, v1.4h, v4.4h // + left[base+1]*frac 3515 umull2 v17.4s, v0.8h, v6.8h 3516 umlal2 v17.4s, v1.8h, v4.8h 3517 umull v18.4s, v2.4h, v7.4h 3518 umlal v18.4s, v3.4h, v5.4h 3519 umull2 v19.4s, v2.8h, v7.8h 3520 umlal2 v19.4s, v3.8h, v5.8h 3521 rshrn v16.4h, v16.4s, #6 3522 rshrn2 v16.8h, v17.4s, #6 3523 rshrn v17.4h, v18.4s, #6 3524 rshrn2 v17.8h, v19.4s, #6 3525 subs w3, w3, #2 3526 zip1 v18.8h, v16.8h, v17.8h 3527 zip2 v19.8h, v16.8h, v17.8h 3528 add w7, w7, w5 // xpos += dx 3529 st1 {v18.s}[0], [x0], x1 3530 st1 {v18.s}[1], [x13], x1 3531 st1 {v18.s}[2], [x0], x1 3532 st1 {v18.s}[3], [x13], x1 3533 st1 {v19.s}[0], [x0], x1 3534 st1 {v19.s}[1], [x13], x1 3535 st1 {v19.s}[2], [x0], x1 3536 st1 {v19.s}[3], [x13], x1 3537 b.le 9f 3538 sub x0, x0, x1, lsl #2 // ptr -= 4 * (2*stride) 3539 sub x13, x13, x1, lsl #2 3540 add x0, x0, #4 3541 add x13, x13, #4 3542 b 8b 
35439: 3544 ret 3545 3546160: 3547320: 3548640: 3549 AARCH64_VALID_JUMP_TARGET 3550 mov w12, w4 35511: 3552 lsr w8, w7, #6 // base 3553 and w9, w7, #0x3e // frac 3554 add w7, w7, w5 // ypos += dy 3555 cmp w8, w6 // base >= max_base_y 3556 lsr w10, w7, #6 // base 3557 and w11, w7, #0x3e // frac 3558 b.ge ipred_z3_fill_padding_neon 3559 add x8, x2, w8, uxtw #1 3560 add x10, x2, w10, uxtw #1 3561 dup v6.8h, w9 // frac 3562 dup v7.8h, w11 3563 ld1 {v0.8h, v1.8h, v2.8h}, [x8], #48 // left[base] 3564 ld1 {v3.8h, v4.8h, v5.8h}, [x10], #48 3565 sub w9, w15, w9 // 64 - frac 3566 sub w11, w15, w11 3567 dup v16.8h, w9 // 64 - frac 3568 dup v17.8h, w11 3569 add w7, w7, w5 // ypos += dy 35702: 3571 ext v18.16b, v0.16b, v1.16b, #2 // left[base+1] 3572 ext v19.16b, v1.16b, v2.16b, #2 3573 ext v20.16b, v3.16b, v4.16b, #2 3574 ext v21.16b, v4.16b, v5.16b, #2 3575 subs w4, w4, #16 3576 umull v22.4s, v0.4h, v16.4h // left[base]*(64-frac) 3577 umlal v22.4s, v18.4h, v6.4h // + left[base+1]*frac 3578 umull2 v23.4s, v0.8h, v16.8h 3579 umlal2 v23.4s, v18.8h, v6.8h 3580 umull v24.4s, v1.4h, v16.4h 3581 umlal v24.4s, v19.4h, v6.4h 3582 umull2 v25.4s, v1.8h, v16.8h 3583 umlal2 v25.4s, v19.8h, v6.8h 3584 umull v26.4s, v3.4h, v17.4h 3585 umlal v26.4s, v20.4h, v7.4h 3586 umull2 v27.4s, v3.8h, v17.8h 3587 umlal2 v27.4s, v20.8h, v7.8h 3588 umull v28.4s, v4.4h, v17.4h 3589 umlal v28.4s, v21.4h, v7.4h 3590 umull2 v29.4s, v4.8h, v17.8h 3591 umlal2 v29.4s, v21.8h, v7.8h 3592 rshrn v22.4h, v22.4s, #6 3593 rshrn2 v22.8h, v23.4s, #6 3594 rshrn v23.4h, v24.4s, #6 3595 rshrn2 v23.8h, v25.4s, #6 3596 rshrn v24.4h, v26.4s, #6 3597 rshrn2 v24.8h, v27.4s, #6 3598 rshrn v25.4h, v28.4s, #6 3599 rshrn2 v25.8h, v29.4s, #6 3600 zip1 v18.8h, v22.8h, v24.8h 3601 zip2 v19.8h, v22.8h, v24.8h 3602 zip1 v20.8h, v23.8h, v25.8h 3603 zip2 v21.8h, v23.8h, v25.8h 3604 st1 {v18.s}[0], [x0], x1 3605 st1 {v18.s}[1], [x13], x1 3606 st1 {v18.s}[2], [x0], x1 3607 st1 {v18.s}[3], [x13], x1 3608 st1 {v19.s}[0], [x0], x1 3609 st1 
{v19.s}[1], [x13], x1 3610 st1 {v19.s}[2], [x0], x1 3611 st1 {v19.s}[3], [x13], x1 3612 st1 {v20.s}[0], [x0], x1 3613 st1 {v20.s}[1], [x13], x1 3614 st1 {v20.s}[2], [x0], x1 3615 st1 {v20.s}[3], [x13], x1 3616 st1 {v21.s}[0], [x0], x1 3617 st1 {v21.s}[1], [x13], x1 3618 st1 {v21.s}[2], [x0], x1 3619 st1 {v21.s}[3], [x13], x1 3620 b.le 3f 3621 mov v0.16b, v2.16b 3622 ld1 {v1.8h, v2.8h}, [x8], #32 // left[base] 3623 mov v3.16b, v5.16b 3624 ld1 {v4.8h, v5.8h}, [x10], #32 3625 b 2b 3626 36273: 3628 subs w3, w3, #2 3629 b.le 9f 3630 lsr x1, x1, #1 3631 msub x0, x1, x12, x0 // ptr -= h * stride 3632 msub x13, x1, x12, x13 3633 lsl x1, x1, #1 3634 add x0, x0, #4 3635 add x13, x13, #4 3636 mov w4, w12 3637 b 1b 36389: 3639 ret 3640 3641L(ipred_z3_fill1_tbl): 3642 .hword L(ipred_z3_fill1_tbl) - 640b 3643 .hword L(ipred_z3_fill1_tbl) - 320b 3644 .hword L(ipred_z3_fill1_tbl) - 160b 3645 .hword L(ipred_z3_fill1_tbl) - 80b 3646 .hword L(ipred_z3_fill1_tbl) - 40b 3647endfunc 3648 3649function ipred_z3_fill_padding_neon, export=0 3650 cmp w3, #8 3651 adr x8, L(ipred_z3_fill_padding_tbl) 3652 b.gt L(ipred_z3_fill_padding_wide) 3653 // w3 = remaining width, w4 = constant height 3654 mov w12, w4 3655 36561: 3657 // Fill a WxH rectangle with padding. W can be any number; 3658 // this fills the exact width by filling in the largest 3659 // power of two in the remaining width, and repeating. 
3660 clz w9, w3 3661 sub w9, w9, #25 3662 ldrh w9, [x8, w9, uxtw #1] 3663 sub x9, x8, w9, uxtw 3664 br x9 3665 36662: 3667 AARCH64_VALID_JUMP_TARGET 3668 st1 {v31.s}[0], [x0], x1 3669 subs w4, w4, #4 3670 st1 {v31.s}[0], [x13], x1 3671 st1 {v31.s}[0], [x0], x1 3672 st1 {v31.s}[0], [x13], x1 3673 b.gt 2b 3674 subs w3, w3, #2 3675 lsr x1, x1, #1 3676 msub x0, x1, x12, x0 // ptr -= h * stride 3677 msub x13, x1, x12, x13 3678 b.le 9f 3679 lsl x1, x1, #1 3680 add x0, x0, #4 3681 add x13, x13, #4 3682 mov w4, w12 3683 b 1b 3684 36854: 3686 AARCH64_VALID_JUMP_TARGET 3687 st1 {v31.4h}, [x0], x1 3688 subs w4, w4, #4 3689 st1 {v31.4h}, [x13], x1 3690 st1 {v31.4h}, [x0], x1 3691 st1 {v31.4h}, [x13], x1 3692 b.gt 4b 3693 subs w3, w3, #4 3694 lsr x1, x1, #1 3695 msub x0, x1, x12, x0 // ptr -= h * stride 3696 msub x13, x1, x12, x13 3697 b.le 9f 3698 lsl x1, x1, #1 3699 add x0, x0, #8 3700 add x13, x13, #8 3701 mov w4, w12 3702 b 1b 3703 37048: 370516: 370632: 370764: 3708 AARCH64_VALID_JUMP_TARGET 3709 st1 {v31.8h}, [x0], x1 3710 subs w4, w4, #4 3711 st1 {v31.8h}, [x13], x1 3712 st1 {v31.8h}, [x0], x1 3713 st1 {v31.8h}, [x13], x1 3714 b.gt 4b 3715 subs w3, w3, #8 3716 lsr x1, x1, #1 3717 msub x0, x1, x12, x0 // ptr -= h * stride 3718 msub x13, x1, x12, x13 3719 b.le 9f 3720 lsl x1, x1, #1 3721 add x0, x0, #16 3722 add x13, x13, #16 3723 mov w4, w12 3724 b 1b 3725 37269: 3727 ret 3728 3729L(ipred_z3_fill_padding_tbl): 3730 .hword L(ipred_z3_fill_padding_tbl) - 64b 3731 .hword L(ipred_z3_fill_padding_tbl) - 32b 3732 .hword L(ipred_z3_fill_padding_tbl) - 16b 3733 .hword L(ipred_z3_fill_padding_tbl) - 8b 3734 .hword L(ipred_z3_fill_padding_tbl) - 4b 3735 .hword L(ipred_z3_fill_padding_tbl) - 2b 3736 3737L(ipred_z3_fill_padding_wide): 3738 // Fill a WxH rectangle with padding, with W > 8. 
3739 lsr x1, x1, #1 3740 mov w12, w3 3741 sub x1, x1, w3, uxtw #1 37421: 3743 ands w5, w3, #7 3744 b.eq 2f 3745 // If the width isn't aligned to 8, first do one 8 pixel write 3746 // and align the start pointer. 3747 sub w3, w3, w5 3748 st1 {v31.8h}, [x0] 3749 add x0, x0, w5, uxtw #1 37502: 3751 // Fill the rest of the line with aligned 8 pixel writes. 3752 subs w3, w3, #8 3753 st1 {v31.8h}, [x0], #16 3754 b.gt 2b 3755 subs w4, w4, #1 3756 add x0, x0, x1 3757 b.le 9f 3758 mov w3, w12 3759 b 1b 37609: 3761 ret 3762endfunc 3763 3764function ipred_z3_fill2_16bpc_neon, export=1 3765 cmp w4, #8 3766 add x10, x2, w6, uxtw // left[max_base_y] 3767 ld1r {v31.16b}, [x10] // padding 3768 mov w7, w5 3769 mov w15, #64 3770 add x13, x0, x1 3771 lsl x1, x1, #1 3772 b.eq 8f 3773 37744: // h == 4 3775 lsr w8, w7, #6 // base 3776 and w9, w7, #0x3e // frac 3777 add w7, w7, w5 // xpos += dx 3778 cmp w8, w6 // base >= max_base_x 3779 lsr w10, w7, #6 // base 3780 and w11, w7, #0x3e // frac 3781 b.ge ipred_z3_fill_padding_neon 3782 lsl w8, w8, #1 3783 lsl w10, w10, #1 3784 ldr q0, [x2, w8, uxtw] // top[base] 3785 ldr q2, [x2, w10, uxtw] 3786 dup v4.4h, w9 // frac 3787 dup v5.4h, w11 3788 uzp2 v1.8h, v0.8h, v0.8h // top[base+1] 3789 uzp1 v0.8h, v0.8h, v0.8h // top[base] 3790 uzp2 v3.8h, v2.8h, v2.8h 3791 uzp1 v2.8h, v2.8h, v2.8h 3792 sub v6.4h, v1.4h, v0.4h // top[base+1]-top[base] 3793 sub v7.4h, v3.4h, v2.4h 3794 ushll v16.4s, v0.4h, #6 // top[base]*64 3795 ushll v17.4s, v2.4h, #6 3796 smlal v16.4s, v6.4h, v4.4h // + top[base+1]*frac 3797 smlal v17.4s, v7.4h, v5.4h 3798 rshrn v16.4h, v16.4s, #6 3799 rshrn v17.4h, v17.4s, #6 3800 subs w3, w3, #2 3801 zip1 v18.8h, v16.8h, v17.8h 3802 st1 {v18.s}[0], [x0], x1 3803 st1 {v18.s}[1], [x13], x1 3804 add w7, w7, w5 // xpos += dx 3805 st1 {v18.s}[2], [x0] 3806 st1 {v18.s}[3], [x13] 3807 b.le 9f 3808 sub x0, x0, x1 // ptr -= 4 * (2*stride) 3809 sub x13, x13, x1 3810 add x0, x0, #4 3811 add x13, x13, #4 3812 b 4b 38139: 3814 ret 3815 38168: // h 
== 8 3817 lsr w8, w7, #6 // base 3818 and w9, w7, #0x3e // frac 3819 add w7, w7, w5 // xpos += dx 3820 cmp w8, w6 // base >= max_base_x 3821 lsr w10, w7, #6 // base 3822 and w11, w7, #0x3e // frac 3823 b.ge ipred_z3_fill_padding_neon 3824 add x8, x2, w8, uxtw #1 3825 add x10, x2, w10, uxtw #1 3826 dup v4.8h, w9 // frac 3827 dup v5.8h, w11 3828 ld1 {v0.8h, v1.8h}, [x8] // top[base] 3829 ld1 {v2.8h, v3.8h}, [x10] 3830 sub w9, w15, w9 // 64 - frac 3831 sub w11, w15, w11 3832 dup v6.8h, w9 // 64 - frac 3833 dup v7.8h, w11 3834 uzp2 v20.8h, v0.8h, v1.8h // top[base+1] 3835 uzp1 v0.8h, v0.8h, v1.8h // top[base] 3836 uzp2 v21.8h, v2.8h, v3.8h 3837 uzp1 v2.8h, v2.8h, v3.8h 3838 umull v16.4s, v0.4h, v6.4h // top[base]*(64-frac) 3839 umlal v16.4s, v20.4h, v4.4h // + top[base+1]*frac 3840 umull2 v17.4s, v0.8h, v6.8h 3841 umlal2 v17.4s, v20.8h, v4.8h 3842 umull v18.4s, v2.4h, v7.4h 3843 umlal v18.4s, v21.4h, v5.4h 3844 umull2 v19.4s, v2.8h, v7.8h 3845 umlal2 v19.4s, v21.8h, v5.8h 3846 rshrn v16.4h, v16.4s, #6 3847 rshrn2 v16.8h, v17.4s, #6 3848 rshrn v17.4h, v18.4s, #6 3849 rshrn2 v17.8h, v19.4s, #6 3850 subs w3, w3, #2 3851 zip1 v18.8h, v16.8h, v17.8h 3852 zip2 v19.8h, v16.8h, v17.8h 3853 add w7, w7, w5 // xpos += dx 3854 st1 {v18.s}[0], [x0], x1 3855 st1 {v18.s}[1], [x13], x1 3856 st1 {v18.s}[2], [x0], x1 3857 st1 {v18.s}[3], [x13], x1 3858 st1 {v19.s}[0], [x0], x1 3859 st1 {v19.s}[1], [x13], x1 3860 st1 {v19.s}[2], [x0], x1 3861 st1 {v19.s}[3], [x13], x1 3862 b.le 9f 3863 sub x0, x0, x1, lsl #2 // ptr -= 4 * (2*stride) 3864 sub x13, x13, x1, lsl #2 3865 add x0, x0, #4 3866 add x13, x13, #4 3867 b 8b 38689: 3869 ret 3870endfunc 3871 3872 3873// void ipred_filter_16bpc_neon(pixel *dst, const ptrdiff_t stride, 3874// const pixel *const topleft, 3875// const int width, const int height, const int filt_idx, 3876// const int max_width, const int max_height, 3877// const int bitdepth_max); 3878.macro filter_fn bpc 3879function ipred_filter_\bpc\()bpc_neon 3880 and w5, w5, #511 
        // Continuation of the filter_fn \bpc macro body: load the 7 signed
        // 8 bit filter tap vectors for filt_idx (w5) and widen them to 16 bit
        // in v16-v22. w8 holds bitdepth_max (set up by the dispatching
        // ipred_filter_16bpc_neon wrapper below).
        movrel  x6,  X(filter_intra_taps)
        lsl     w5,  w5,  #6
        add     x6,  x6,  w5, uxtw
        ld1     {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32
        clz     w9,  w3
        adr     x5,  L(ipred_filter\bpc\()_tbl)
        ld1     {v20.8b, v21.8b, v22.8b}, [x6]
        sub     w9,  w9,  #26
        ldrh    w9,  [x5, w9, uxtw #1]
        sxtl    v16.8h, v16.8b
        sxtl    v17.8h, v17.8b
        sub     x5,  x5,  w9, uxtw
        sxtl    v18.8h, v18.8b
        sxtl    v19.8h, v19.8b
        add     x6,  x0,  x1
        lsl     x1,  x1,  #1
        sxtl    v20.8h, v20.8b
        sxtl    v21.8h, v21.8b
        sxtl    v22.8h, v22.8b
        dup     v31.8h, w8              // bitdepth_max, upper clamp bound
.if \bpc == 10
        movi    v30.8h, #0              // lower clamp bound (10 bpc path)
.endif
        br      x5
40:
        AARCH64_VALID_JUMP_TARGET
        ldur    d0,  [x2, #2]           // top (0-3)
        sub     x2,  x2,  #4
        mov     x7,  #-4
4:
        ld1     {v1.4h}, [x2], x7       // left (0-1) + topleft (2)
.if \bpc == 10
        // 10 bpc: the 16 bit accumulator cannot overflow, so accumulate
        // directly in 8h lanes.
        mul     v2.8h,  v17.8h, v0.h[0] // p1(top[0]) * filter(1)
        mla     v2.8h,  v18.8h, v0.h[1] // p2(top[1]) * filter(2)
        mla     v2.8h,  v19.8h, v0.h[2] // p3(top[2]) * filter(3)
        mla     v2.8h,  v20.8h, v0.h[3] // p4(top[3]) * filter(4)
        mla     v2.8h,  v16.8h, v1.h[2] // p0(topleft) * filter(0)
        mla     v2.8h,  v21.8h, v1.h[1] // p5(left[0]) * filter(5)
        mla     v2.8h,  v22.8h, v1.h[0] // p6(left[1]) * filter(6)
        srshr   v2.8h,  v2.8h,  #4
        smax    v2.8h,  v2.8h,  v30.8h
.else
        // 12 bpc: widen to 32 bit and narrow back with saturation.
        smull   v2.4s,  v17.4h, v0.h[0] // p1(top[0]) * filter(1)
        smlal   v2.4s,  v18.4h, v0.h[1] // p2(top[1]) * filter(2)
        smlal   v2.4s,  v19.4h, v0.h[2] // p3(top[2]) * filter(3)
        smlal   v2.4s,  v20.4h, v0.h[3] // p4(top[3]) * filter(4)
        smlal   v2.4s,  v16.4h, v1.h[2] // p0(topleft) * filter(0)
        smlal   v2.4s,  v21.4h, v1.h[1] // p5(left[0]) * filter(5)
        smlal   v2.4s,  v22.4h, v1.h[0] // p6(left[1]) * filter(6)
        smull2  v3.4s,  v17.8h, v0.h[0] // p1(top[0]) * filter(1)
        smlal2  v3.4s,  v18.8h, v0.h[1] // p2(top[1]) * filter(2)
        smlal2  v3.4s,  v19.8h, v0.h[2] // p3(top[2]) * filter(3)
        smlal2  v3.4s,  v20.8h, v0.h[3] // p4(top[3]) * filter(4)
        smlal2  v3.4s,  v16.8h, v1.h[2] // p0(topleft) * filter(0)
        smlal2  v3.4s,  v21.8h, v1.h[1] // p5(left[0]) * filter(5)
        smlal2  v3.4s,  v22.8h, v1.h[0] // p6(left[1]) * filter(6)
        sqrshrun  v2.4h, v2.4s, #4
        sqrshrun2 v2.8h, v3.4s, #4
.endif
        smin    v2.8h,  v2.8h,  v31.8h
        subs    w4,  w4,  #2
        st1     {v2.d}[0], [x0], x1
        ext     v0.16b, v2.16b, v2.16b, #8 // move top from [4-7] to [0-3]
        st1     {v2.d}[1], [x6], x1
        b.gt    4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        ldur    q0,  [x2, #2]           // top (0-7)
        sub     x2,  x2,  #4
        mov     x7,  #-4
8:
        ld1     {v1.4h}, [x2], x7       // left (0-1) + topleft (2)
.if \bpc == 10
        mul     v2.8h,  v17.8h, v0.h[0] // p1(top[0]) * filter(1)
        mla     v2.8h,  v18.8h, v0.h[1] // p2(top[1]) * filter(2)
        mla     v2.8h,  v19.8h, v0.h[2] // p3(top[2]) * filter(3)
        mla     v2.8h,  v20.8h, v0.h[3] // p4(top[3]) * filter(4)
        mla     v2.8h,  v16.8h, v1.h[2] // p0(topleft) * filter(0)
        mla     v2.8h,  v21.8h, v1.h[1] // p5(left[0]) * filter(5)
        mla     v2.8h,  v22.8h, v1.h[0] // p6(left[1]) * filter(6)
        mul     v3.8h,  v17.8h, v0.h[4] // p1(top[0]) * filter(1)
        mla     v3.8h,  v18.8h, v0.h[5] // p2(top[1]) * filter(2)
        mla     v3.8h,  v19.8h, v0.h[6] // p3(top[2]) * filter(3)
        srshr   v2.8h,  v2.8h,  #4
        smax    v2.8h,  v2.8h,  v30.8h
        smin    v2.8h,  v2.8h,  v31.8h
        // The right 4x2 half uses the just-computed left half as its left
        // neighbours (v2.h[3]/v2.h[7]) and top[3] as its topleft.
        mla     v3.8h,  v20.8h, v0.h[7] // p4(top[3]) * filter(4)
        mla     v3.8h,  v16.8h, v0.h[3] // p0(topleft) * filter(0)
        mla     v3.8h,  v21.8h, v2.h[3] // p5(left[0]) * filter(5)
        mla     v3.8h,  v22.8h, v2.h[7] // p6(left[1]) * filter(6)
        srshr   v3.8h,  v3.8h,  #4
        smax    v3.8h,  v3.8h,  v30.8h
.else
        smull   v2.4s,  v17.4h, v0.h[0] // p1(top[0]) * filter(1)
        smlal   v2.4s,  v18.4h, v0.h[1] // p2(top[1]) * filter(2)
        smlal   v2.4s,  v19.4h, v0.h[2] // p3(top[2]) * filter(3)
        smlal   v2.4s,  v20.4h, v0.h[3] // p4(top[3]) * filter(4)
        smlal   v2.4s,  v16.4h, v1.h[2] // p0(topleft) * filter(0)
        smlal   v2.4s,  v21.4h, v1.h[1] // p5(left[0]) * filter(5)
        smlal   v2.4s,  v22.4h, v1.h[0] // p6(left[1]) * filter(6)
        smull2  v3.4s,  v17.8h, v0.h[0] // p1(top[0]) * filter(1)
        smlal2  v3.4s,  v18.8h, v0.h[1] // p2(top[1]) * filter(2)
        smlal2  v3.4s,  v19.8h, v0.h[2] // p3(top[2]) * filter(3)
        smlal2  v3.4s,  v20.8h, v0.h[3] // p4(top[3]) * filter(4)
        smlal2  v3.4s,  v16.8h, v1.h[2] // p0(topleft) * filter(0)
        smlal2  v3.4s,  v21.8h, v1.h[1] // p5(left[0]) * filter(5)
        smlal2  v3.4s,  v22.8h, v1.h[0] // p6(left[1]) * filter(6)
        smull   v4.4s,  v17.4h, v0.h[4] // p1(top[0]) * filter(1)
        smlal   v4.4s,  v18.4h, v0.h[5] // p2(top[1]) * filter(2)
        smlal   v4.4s,  v19.4h, v0.h[6] // p3(top[2]) * filter(3)
        sqrshrun  v2.4h, v2.4s, #4
        sqrshrun2 v2.8h, v3.4s, #4
        smin    v2.8h,  v2.8h,  v31.8h
        smlal   v4.4s,  v20.4h, v0.h[7] // p4(top[3]) * filter(4)
        smlal   v4.4s,  v16.4h, v0.h[3] // p0(topleft) * filter(0)
        smlal   v4.4s,  v21.4h, v2.h[3] // p5(left[0]) * filter(5)
        smlal   v4.4s,  v22.4h, v2.h[7] // p6(left[1]) * filter(6)
        smull2  v5.4s,  v17.8h, v0.h[4] // p1(top[0]) * filter(1)
        smlal2  v5.4s,  v18.8h, v0.h[5] // p2(top[1]) * filter(2)
        smlal2  v5.4s,  v19.8h, v0.h[6] // p3(top[2]) * filter(3)
        smlal2  v5.4s,  v20.8h, v0.h[7] // p4(top[3]) * filter(4)
        smlal2  v5.4s,  v16.8h, v0.h[3] // p0(topleft) * filter(0)
        smlal2  v5.4s,  v21.8h, v2.h[3] // p5(left[0]) * filter(5)
        smlal2  v5.4s,  v22.8h, v2.h[7] // p6(left[1]) * filter(6)
        sqrshrun  v3.4h, v4.4s, #4
        sqrshrun2 v3.8h, v5.4s, #4
.endif
        smin    v3.8h,  v3.8h,  v31.8h
        subs    w4,  w4,  #2
        st2     {v2.d, v3.d}[0], [x0], x1
        zip2    v0.2d,  v2.2d,  v3.2d   // row 1 becomes the next top
        st2     {v2.d, v3.d}[1], [x6], x1
        b.gt    8b
        ret
160:
320:
        AARCH64_VALID_JUMP_TARGET
        add     x8,  x2,  #2
        sub     x2,  x2,  #4
        mov     x7,  #-4
        sub     x1,  x1,  w3, uxtw #1
        mov     w9,  w3

1:
        ld1     {v0.4h}, [x2], x7       // left (0-1) + topleft (2)
2:
        // 16 columns x 2 rows per inner iteration, in four 4x2 groups; each
        // group feeds its rightmost column into the next one as left pixels.
        ld1     {v1.8h, v2.8h}, [x8], #32 // top(0-15)
.if \bpc == 10
        mul     v3.8h,  v16.8h, v0.h[2] // p0(topleft) * filter(0)
        mla     v3.8h,  v21.8h, v0.h[1] // p5(left[0]) * filter(5)
        mla     v3.8h,  v22.8h, v0.h[0] // p6(left[1]) * filter(6)
        mla     v3.8h,  v17.8h, v1.h[0] // p1(top[0]) * filter(1)
        mla     v3.8h,  v18.8h, v1.h[1] // p2(top[1]) * filter(2)
        mla     v3.8h,  v19.8h, v1.h[2] // p3(top[2]) * filter(3)
        mla     v3.8h,  v20.8h, v1.h[3] // p4(top[3]) * filter(4)

        mul     v4.8h,  v17.8h, v1.h[4] // p1(top[0]) * filter(1)
        mla     v4.8h,  v18.8h, v1.h[5] // p2(top[1]) * filter(2)
        mla     v4.8h,  v19.8h, v1.h[6] // p3(top[2]) * filter(3)
        srshr   v3.8h,  v3.8h,  #4
        smax    v3.8h,  v3.8h,  v30.8h
        smin    v3.8h,  v3.8h,  v31.8h
        mla     v4.8h,  v20.8h, v1.h[7] // p4(top[3]) * filter(4)
        mla     v4.8h,  v16.8h, v1.h[3] // p0(topleft) * filter(0)
        mla     v4.8h,  v21.8h, v3.h[3] // p5(left[0]) * filter(5)
        mla     v4.8h,  v22.8h, v3.h[7] // p6(left[1]) * filter(6)

        mul     v5.8h,  v17.8h, v2.h[0] // p1(top[0]) * filter(1)
        mla     v5.8h,  v18.8h, v2.h[1] // p2(top[1]) * filter(2)
        mla     v5.8h,  v19.8h, v2.h[2] // p3(top[2]) * filter(3)
        srshr   v4.8h,  v4.8h,  #4
        smax    v4.8h,  v4.8h,  v30.8h
        smin    v4.8h,  v4.8h,  v31.8h
        mla     v5.8h,  v20.8h, v2.h[3] // p4(top[3]) * filter(4)
        mla     v5.8h,  v16.8h, v1.h[7] // p0(topleft) * filter(0)
        mla     v5.8h,  v21.8h, v4.h[3] // p5(left[0]) * filter(5)
        mla     v5.8h,  v22.8h, v4.h[7] // p6(left[1]) * filter(6)

        mul     v6.8h,  v17.8h, v2.h[4] // p1(top[0]) * filter(1)
        mla     v6.8h,  v18.8h, v2.h[5] // p2(top[1]) * filter(2)
        mla     v6.8h,  v19.8h, v2.h[6] // p3(top[2]) * filter(3)
        srshr   v5.8h,  v5.8h,  #4
        smax    v5.8h,  v5.8h,  v30.8h
        smin    v5.8h,  v5.8h,  v31.8h
        mla     v6.8h,  v20.8h, v2.h[7] // p4(top[3]) * filter(4)
        mla     v6.8h,  v16.8h, v2.h[3] // p0(topleft) * filter(0)
        mla     v6.8h,  v21.8h, v5.h[3] // p5(left[0]) * filter(5)
        mla     v6.8h,  v22.8h, v5.h[7] // p6(left[1]) * filter(6)

        subs    w3,  w3,  #16
        srshr   v6.8h,  v6.8h,  #4
        smax    v6.8h,  v6.8h,  v30.8h
.else
        smull   v3.4s,  v16.4h, v0.h[2] // p0(topleft) * filter(0)
        smlal   v3.4s,  v21.4h, v0.h[1] // p5(left[0]) * filter(5)
        smlal   v3.4s,  v22.4h, v0.h[0] // p6(left[1]) * filter(6)
        smlal   v3.4s,  v17.4h, v1.h[0] // p1(top[0]) * filter(1)
        smlal   v3.4s,  v18.4h, v1.h[1] // p2(top[1]) * filter(2)
        smlal   v3.4s,  v19.4h, v1.h[2] // p3(top[2]) * filter(3)
        smlal   v3.4s,  v20.4h, v1.h[3] // p4(top[3]) * filter(4)
        smull2  v4.4s,  v16.8h, v0.h[2] // p0(topleft) * filter(0)
        smlal2  v4.4s,  v21.8h, v0.h[1] // p5(left[0]) * filter(5)
        smlal2  v4.4s,  v22.8h, v0.h[0] // p6(left[1]) * filter(6)
        smlal2  v4.4s,  v17.8h, v1.h[0] // p1(top[0]) * filter(1)
        smlal2  v4.4s,  v18.8h, v1.h[1] // p2(top[1]) * filter(2)
        smlal2  v4.4s,  v19.8h, v1.h[2] // p3(top[2]) * filter(3)
        smlal2  v4.4s,  v20.8h, v1.h[3] // p4(top[3]) * filter(4)

        smull   v5.4s,  v17.4h, v1.h[4] // p1(top[0]) * filter(1)
        smlal   v5.4s,  v18.4h, v1.h[5] // p2(top[1]) * filter(2)
        smlal   v5.4s,  v19.4h, v1.h[6] // p3(top[2]) * filter(3)
        sqrshrun  v3.4h, v3.4s, #4
        sqrshrun2 v3.8h, v4.4s, #4
        smin    v3.8h,  v3.8h,  v31.8h
        smlal   v5.4s,  v20.4h, v1.h[7] // p4(top[3]) * filter(4)
        smlal   v5.4s,  v16.4h, v1.h[3] // p0(topleft) * filter(0)
        smlal   v5.4s,  v21.4h, v3.h[3] // p5(left[0]) * filter(5)
        smlal   v5.4s,  v22.4h, v3.h[7] // p6(left[1]) * filter(6)
        smull2  v6.4s,  v17.8h, v1.h[4] // p1(top[0]) * filter(1)
        smlal2  v6.4s,  v18.8h, v1.h[5] // p2(top[1]) * filter(2)
        smlal2  v6.4s,  v19.8h, v1.h[6] // p3(top[2]) * filter(3)
        smlal2  v6.4s,  v20.8h, v1.h[7] // p4(top[3]) * filter(4)
        smlal2  v6.4s,  v16.8h, v1.h[3] // p0(topleft) * filter(0)
        smlal2  v6.4s,  v21.8h, v3.h[3] // p5(left[0]) * filter(5)
        smlal2  v6.4s,  v22.8h, v3.h[7] // p6(left[1]) * filter(6)

        smull   v24.4s, v17.4h, v2.h[0] // p1(top[0]) * filter(1)
        smlal   v24.4s, v18.4h, v2.h[1] // p2(top[1]) * filter(2)
        smlal   v24.4s, v19.4h, v2.h[2] // p3(top[2]) * filter(3)
        sqrshrun  v4.4h, v5.4s, #4
        sqrshrun2 v4.8h, v6.4s, #4
        smin    v4.8h,  v4.8h,  v31.8h
        smlal   v24.4s, v20.4h, v2.h[3] // p4(top[3]) * filter(4)
        smlal   v24.4s, v16.4h, v1.h[7] // p0(topleft) * filter(0)
        smlal   v24.4s, v21.4h, v4.h[3] // p5(left[0]) * filter(5)
        smlal   v24.4s, v22.4h, v4.h[7] // p6(left[1]) * filter(6)
        smull2  v25.4s, v17.8h, v2.h[0] // p1(top[0]) * filter(1)
        smlal2  v25.4s, v18.8h, v2.h[1] // p2(top[1]) * filter(2)
        smlal2  v25.4s, v19.8h, v2.h[2] // p3(top[2]) * filter(3)
        smlal2  v25.4s, v20.8h, v2.h[3] // p4(top[3]) * filter(4)
        smlal2  v25.4s, v16.8h, v1.h[7] // p0(topleft) * filter(0)
        smlal2  v25.4s, v21.8h, v4.h[3] // p5(left[0]) * filter(5)
        smlal2  v25.4s, v22.8h, v4.h[7] // p6(left[1]) * filter(6)

        smull   v26.4s, v17.4h, v2.h[4] // p1(top[0]) * filter(1)
        smlal   v26.4s, v18.4h, v2.h[5] // p2(top[1]) * filter(2)
        smlal   v26.4s, v19.4h, v2.h[6] // p3(top[2]) * filter(3)
        sqrshrun  v5.4h, v24.4s, #4
        sqrshrun2 v5.8h, v25.4s, #4
        smin    v5.8h,  v5.8h,  v31.8h
        smlal   v26.4s, v20.4h, v2.h[7] // p4(top[3]) * filter(4)
        smlal   v26.4s, v16.4h, v2.h[3] // p0(topleft) * filter(0)
        smlal   v26.4s, v21.4h, v5.h[3] // p5(left[0]) * filter(5)
        smlal   v26.4s, v22.4h, v5.h[7] // p6(left[1]) * filter(6)
        smull2  v27.4s, v17.8h, v2.h[4] // p1(top[0]) * filter(1)
        smlal2  v27.4s, v18.8h, v2.h[5] // p2(top[1]) * filter(2)
        smlal2  v27.4s, v19.8h, v2.h[6] // p3(top[2]) * filter(3)
        smlal2  v27.4s, v20.8h, v2.h[7] // p4(top[3]) * filter(4)
        smlal2  v27.4s, v16.8h, v2.h[3] // p0(topleft) * filter(0)
        smlal2  v27.4s, v21.8h, v5.h[3] // p5(left[0]) * filter(5)
        smlal2  v27.4s, v22.8h, v5.h[7] // p6(left[1]) * filter(6)

        subs    w3,  w3,  #16
        sqrshrun  v6.4h, v26.4s, #4
        sqrshrun2 v6.8h, v27.4s, #4
.endif
        smin    v6.8h,  v6.8h,  v31.8h

        // Carry the rightmost outputs over as topleft/left for the next
        // 16-column group of the same row pair.
        ins     v0.h[2], v2.h[7]
        st4     {v3.d, v4.d, v5.d, v6.d}[0], [x0], #32
        ins     v0.h[0], v6.h[7]
        st4     {v3.d, v4.d, v5.d, v6.d}[1], [x6], #32
        ins     v0.h[1], v6.h[3]
        b.gt    2b
        subs    w4,  w4,  #2
        b.le    9f
        sub     x8,  x6,  w9, uxtw #1   // top = previous output row
        add     x0,  x0,  x1
        add     x6,  x6,  x1
        mov     w3,  w9
        b       1b
9:
        ret

L(ipred_filter\bpc\()_tbl):
        .hword L(ipred_filter\bpc\()_tbl) - 320b
        .hword L(ipred_filter\bpc\()_tbl) - 160b
        .hword L(ipred_filter\bpc\()_tbl) -  80b
        .hword L(ipred_filter\bpc\()_tbl) -  40b
endfunc
.endm

filter_fn 10
filter_fn 12

// Public entry point: pick the 10 or 12 bpc implementation from
// bitdepth_max (the 9th argument, passed on the stack).
function ipred_filter_16bpc_neon, export=1
        ldr     w8,  [sp]
        cmp     w8,  0x3ff
        b.le    ipred_filter_10bpc_neon
        b       ipred_filter_12bpc_neon
endfunc

// void pal_pred_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                          const pixel *const pal, const uint8_t *idx,
//                          const int w, const int h);
function pal_pred_16bpc_neon, export=1
        // v30 = the 8-entry 16 bit palette; idx holds two 4 bit (3 bits used)
        // palette indices per byte. Each index i is turned into the byte pair
        // (2*i, 2*i+1) so that tbl can gather whole 16 bit palette entries.
        ld1     {v30.8h}, [x2]
        clz     w9,  w4
        adr     x6,  L(pal_pred_tbl)
        sub     w9,  w9,  #25
        movi    v29.16b, #7             // mask for the 3 used index bits
        ldrh    w9,  [x6, w9, uxtw #1]
        movi    v31.8h, #1, lsl #8      // 0x100: +1 on the second byte of each pair
        sub     x6,  x6,  w9, uxtw
        br      x6
40:
        AARCH64_VALID_JUMP_TARGET
        add     x2,  x0,  x1
        lsl     x1,  x1,  #1
4:
        ld1     {v1.8b}, [x3], #8
        subs    w5,  w5,  #4
        ushr    v3.8b, v1.8b, #4        // high-nibble indices
        and     v2.8b, v1.8b, v29.8b    // low-nibble indices
        zip1    v1.16b, v2.16b, v3.16b
        // Restructure v1 from a, b, c, ... into 2*a, 2*a+1, 2*b, 2*b+1, 2*c, 2*c+1, ...
        // (continuation of pal_pred_16bpc_neon: double the indices, duplicate
        // them into byte pairs, add 0x100 per halfword to form (2*i, 2*i+1)
        // byte offsets, then gather 16 bit palette entries with tbl.)
        add     v1.16b, v1.16b, v1.16b
        zip1    v0.16b, v1.16b, v1.16b
        zip2    v1.16b, v1.16b, v1.16b
        add     v0.8h,  v0.8h,  v31.8h
        add     v1.8h,  v1.8h,  v31.8h
        tbl     v0.16b, {v30.16b}, v0.16b
        st1     {v0.d}[0], [x0], x1
        tbl     v1.16b, {v30.16b}, v1.16b
        st1     {v0.d}[1], [x2], x1
        st1     {v1.d}[0], [x0], x1
        st1     {v1.d}[1], [x2], x1
        b.gt    4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        add     x2,  x0,  x1
        lsl     x1,  x1,  #1
8:
        // 16 index bytes = 32 pixels = 4 rows of 8.
        ld1     {v2.16b}, [x3], #16
        subs    w5,  w5,  #4
        ushr    v4.16b, v2.16b, #4
        and     v3.16b, v2.16b, v29.16b
        zip1    v2.16b, v3.16b, v4.16b
        zip2    v3.16b, v3.16b, v4.16b
        add     v2.16b, v2.16b, v2.16b
        add     v3.16b, v3.16b, v3.16b
        zip1    v0.16b, v2.16b, v2.16b
        zip2    v1.16b, v2.16b, v2.16b
        zip1    v2.16b, v3.16b, v3.16b
        zip2    v3.16b, v3.16b, v3.16b
        add     v0.8h,  v0.8h,  v31.8h
        add     v1.8h,  v1.8h,  v31.8h
        add     v2.8h,  v2.8h,  v31.8h
        add     v3.8h,  v3.8h,  v31.8h
        tbl     v0.16b, {v30.16b}, v0.16b
        tbl     v1.16b, {v30.16b}, v1.16b
        st1     {v0.8h}, [x0], x1
        tbl     v2.16b, {v30.16b}, v2.16b
        st1     {v1.8h}, [x2], x1
        tbl     v3.16b, {v30.16b}, v3.16b
        st1     {v2.8h}, [x0], x1
        st1     {v3.8h}, [x2], x1
        b.gt    8b
        ret
160:
        AARCH64_VALID_JUMP_TARGET
        add     x2,  x0,  x1
        lsl     x1,  x1,  #1
16:
        // 32 index bytes = 64 pixels = 4 rows of 16.
        ld1     {v4.16b, v5.16b}, [x3], #32
        subs    w5,  w5,  #4
        ushr    v7.16b, v4.16b, #4
        and     v6.16b, v4.16b, v29.16b
        ushr    v3.16b, v5.16b, #4
        and     v2.16b, v5.16b, v29.16b
        zip1    v4.16b, v6.16b, v7.16b
        zip2    v5.16b, v6.16b, v7.16b
        zip1    v6.16b, v2.16b, v3.16b
        zip2    v7.16b, v2.16b, v3.16b
        add     v4.16b, v4.16b, v4.16b
        add     v5.16b, v5.16b, v5.16b
        add     v6.16b, v6.16b, v6.16b
        add     v7.16b, v7.16b, v7.16b
        zip1    v0.16b, v4.16b, v4.16b
        zip2    v1.16b, v4.16b, v4.16b
        zip1    v2.16b, v5.16b, v5.16b
        zip2    v3.16b, v5.16b, v5.16b
        zip1    v4.16b, v6.16b, v6.16b
        zip2    v5.16b, v6.16b, v6.16b
        zip1    v6.16b, v7.16b, v7.16b
        zip2    v7.16b, v7.16b, v7.16b
        add     v0.8h,  v0.8h,  v31.8h
        add     v1.8h,  v1.8h,  v31.8h
        add     v2.8h,  v2.8h,  v31.8h
        add     v3.8h,  v3.8h,  v31.8h
        add     v4.8h,  v4.8h,  v31.8h
        tbl     v0.16b, {v30.16b}, v0.16b
        add     v5.8h,  v5.8h,  v31.8h
        tbl     v1.16b, {v30.16b}, v1.16b
        add     v6.8h,  v6.8h,  v31.8h
        tbl     v2.16b, {v30.16b}, v2.16b
        add     v7.8h,  v7.8h,  v31.8h
        tbl     v3.16b, {v30.16b}, v3.16b
        tbl     v4.16b, {v30.16b}, v4.16b
        tbl     v5.16b, {v30.16b}, v5.16b
        st1     {v0.8h, v1.8h}, [x0], x1
        tbl     v6.16b, {v30.16b}, v6.16b
        st1     {v2.8h, v3.8h}, [x2], x1
        tbl     v7.16b, {v30.16b}, v7.16b
        st1     {v4.8h, v5.8h}, [x0], x1
        st1     {v6.8h, v7.8h}, [x2], x1
        b.gt    16b
        ret
320:
        AARCH64_VALID_JUMP_TARGET
        add     x2,  x0,  x1
        lsl     x1,  x1,  #1
32:
        // 32 index bytes = 64 pixels = 2 rows of 32.
        ld1     {v4.16b, v5.16b}, [x3], #32
        subs    w5,  w5,  #2
        ushr    v7.16b, v4.16b, #4
        and     v6.16b, v4.16b, v29.16b
        ushr    v3.16b, v5.16b, #4
        and     v2.16b, v5.16b, v29.16b
        zip1    v4.16b, v6.16b, v7.16b
        zip2    v5.16b, v6.16b, v7.16b
        zip1    v6.16b, v2.16b, v3.16b
        zip2    v7.16b, v2.16b, v3.16b
        add     v4.16b, v4.16b, v4.16b
        add     v5.16b, v5.16b, v5.16b
        add     v6.16b, v6.16b, v6.16b
        add     v7.16b, v7.16b, v7.16b
        zip1    v0.16b, v4.16b, v4.16b
        zip2    v1.16b, v4.16b, v4.16b
        zip1    v2.16b, v5.16b, v5.16b
        zip2    v3.16b, v5.16b, v5.16b
        zip1    v4.16b, v6.16b, v6.16b
        zip2    v5.16b, v6.16b, v6.16b
        zip1    v6.16b, v7.16b, v7.16b
        zip2    v7.16b, v7.16b, v7.16b
        add     v0.8h,  v0.8h,  v31.8h
        add     v1.8h,  v1.8h,  v31.8h
        add     v2.8h,  v2.8h,  v31.8h
        add     v3.8h,  v3.8h,  v31.8h
        add     v4.8h,  v4.8h,  v31.8h
        tbl     v0.16b, {v30.16b}, v0.16b
        add     v5.8h,  v5.8h,  v31.8h
        tbl     v1.16b, {v30.16b}, v1.16b
        add     v6.8h,  v6.8h,  v31.8h
        tbl     v2.16b, {v30.16b}, v2.16b
        add     v7.8h,  v7.8h,  v31.8h
        tbl     v3.16b, {v30.16b}, v3.16b
        tbl     v4.16b, {v30.16b}, v4.16b
        tbl     v5.16b, {v30.16b}, v5.16b
        st1     {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        tbl     v6.16b, {v30.16b}, v6.16b
        tbl     v7.16b, {v30.16b}, v7.16b
        st1     {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1
        b.gt    32b
        ret
640:
        AARCH64_VALID_JUMP_TARGET
        add     x2,  x0,  #64           // pointer to the second 32 pixels of the row
64:
        // 32 index bytes = 64 pixels = 1 full row.
        ld1     {v4.16b, v5.16b}, [x3], #32
        subs    w5,  w5,  #1
        ushr    v7.16b, v4.16b, #4
        and     v6.16b, v4.16b, v29.16b
        ushr    v3.16b, v5.16b, #4
        and     v2.16b, v5.16b, v29.16b
        zip1    v4.16b, v6.16b, v7.16b
        zip2    v5.16b, v6.16b, v7.16b
        zip1    v6.16b, v2.16b, v3.16b
        zip2    v7.16b, v2.16b, v3.16b
        add     v4.16b, v4.16b, v4.16b
        add     v5.16b, v5.16b, v5.16b
        add     v6.16b, v6.16b, v6.16b
        add     v7.16b, v7.16b, v7.16b
        zip1    v0.16b, v4.16b, v4.16b
        zip2    v1.16b, v4.16b, v4.16b
        zip1    v2.16b, v5.16b, v5.16b
        zip2    v3.16b, v5.16b, v5.16b
        zip1    v4.16b, v6.16b, v6.16b
        zip2    v5.16b, v6.16b, v6.16b
        zip1    v6.16b, v7.16b, v7.16b
        zip2    v7.16b, v7.16b, v7.16b
        add     v0.8h,  v0.8h,  v31.8h
        add     v1.8h,  v1.8h,  v31.8h
        add     v2.8h,  v2.8h,  v31.8h
        add     v3.8h,  v3.8h,  v31.8h
        add     v4.8h,  v4.8h,  v31.8h
        tbl     v0.16b, {v30.16b}, v0.16b
        add     v5.8h,  v5.8h,  v31.8h
        tbl     v1.16b, {v30.16b}, v1.16b
        add     v6.8h,  v6.8h,  v31.8h
        tbl     v2.16b, {v30.16b}, v2.16b
        add     v7.8h,  v7.8h,  v31.8h
        tbl     v3.16b, {v30.16b}, v3.16b
        tbl     v4.16b, {v30.16b}, v4.16b
        tbl     v5.16b, {v30.16b}, v5.16b
        st1     {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        tbl     v6.16b, {v30.16b}, v6.16b
        tbl     v7.16b, {v30.16b}, v7.16b
        st1     {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1
        b.gt    64b
        ret

L(pal_pred_tbl):
        .hword L(pal_pred_tbl) - 640b
        .hword L(pal_pred_tbl) - 320b
        .hword L(pal_pred_tbl) - 160b
        .hword L(pal_pred_tbl) -  80b
        .hword L(pal_pred_tbl) -  40b
endfunc

// void ipred_cfl_128_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                               const pixel *const topleft,
//                               const int width, const int height,
//                               const int16_t *ac, const int alpha,
//                               const int bitdepth_max);
function
ipred_cfl_128_16bpc_neon, export=1
        // CfL prediction with a mid-grey DC base: dc = (bitdepth_max + 1) / 2.
        // x5 = ac buffer, v1.8h = alpha, v0.8h = dc; the L(ipred_cfl_splat_w*)
        // loops below compute dc + apply_sign((ac * alpha + 32) >> 6), clamp
        // to [0, bitdepth_max] and store; they are shared as branch targets
        // with the other cfl entry points in this file (e.g. ipred_cfl_top).
        dup     v31.8h, w7              // bitdepth_max
        clz     w9,  w3
        adr     x7,  L(ipred_cfl_128_tbl)
        sub     w9,  w9,  #26
        ldrh    w9,  [x7, w9, uxtw #1]
        urshr   v0.8h, v31.8h, #1       // dc = (bitdepth_max + 1) >> 1
        dup     v1.8h, w6               // alpha
        sub     x7,  x7,  w9, uxtw
        add     x6,  x0,  x1
        lsl     x1,  x1,  #1
        movi    v30.8h, #0              // lower clamp bound
        br      x7
L(ipred_cfl_splat_w4):
        AARCH64_VALID_JUMP_TARGET
        ld1     {v4.8h, v5.8h}, [x5], #32
        subs    w4,  w4,  #4
        smull   v2.4s, v4.4h, v1.4h     // diff = ac * alpha
        smull2  v3.4s, v4.8h, v1.8h
        smull   v4.4s, v5.4h, v1.4h
        smull2  v5.4s, v5.8h, v1.8h
        cmlt    v16.4s, v2.4s, #0       // sign
        cmlt    v17.4s, v3.4s, #0
        cmlt    v18.4s, v4.4s, #0
        cmlt    v19.4s, v5.4s, #0
        add     v2.4s, v2.4s, v16.4s    // diff + sign
        add     v3.4s, v3.4s, v17.4s
        add     v4.4s, v4.4s, v18.4s
        add     v5.4s, v5.4s, v19.4s
        rshrn   v2.4h, v2.4s, #6        // (diff + sign + 32) >> 6 = apply_sign()
        rshrn2  v2.8h, v3.4s, #6
        rshrn   v3.4h, v4.4s, #6
        rshrn2  v3.8h, v5.4s, #6
        add     v2.8h, v2.8h, v0.8h     // dc + apply_sign()
        add     v3.8h, v3.8h, v0.8h
        smax    v2.8h, v2.8h, v30.8h
        smax    v3.8h, v3.8h, v30.8h
        smin    v2.8h, v2.8h, v31.8h
        smin    v3.8h, v3.8h, v31.8h
        st1     {v2.d}[0], [x0], x1
        st1     {v2.d}[1], [x6], x1
        st1     {v3.d}[0], [x0], x1
        st1     {v3.d}[1], [x6], x1
        b.gt    L(ipred_cfl_splat_w4)
        ret
L(ipred_cfl_splat_w8):
        AARCH64_VALID_JUMP_TARGET
        ld1     {v4.8h, v5.8h}, [x5], #32
        subs    w4,  w4,  #2
        smull   v2.4s, v4.4h, v1.4h     // diff = ac * alpha
        smull2  v3.4s, v4.8h, v1.8h
        smull   v4.4s, v5.4h, v1.4h
        smull2  v5.4s, v5.8h, v1.8h
        cmlt    v16.4s, v2.4s, #0       // sign
        cmlt    v17.4s, v3.4s, #0
        cmlt    v18.4s, v4.4s, #0
        cmlt    v19.4s, v5.4s, #0
        add     v2.4s, v2.4s, v16.4s    // diff + sign
        add     v3.4s, v3.4s, v17.4s
        add     v4.4s, v4.4s, v18.4s
        add     v5.4s, v5.4s, v19.4s
        rshrn   v2.4h, v2.4s, #6        // (diff + sign + 32) >> 6 = apply_sign()
        rshrn2  v2.8h, v3.4s, #6
        rshrn   v3.4h, v4.4s, #6
        rshrn2  v3.8h, v5.4s, #6
        add     v2.8h, v2.8h, v0.8h     // dc + apply_sign()
        add     v3.8h, v3.8h, v0.8h
        smax    v2.8h, v2.8h, v30.8h
        smax    v3.8h, v3.8h, v30.8h
        smin    v2.8h, v2.8h, v31.8h
        smin    v3.8h, v3.8h, v31.8h
        st1     {v2.8h}, [x0], x1
        st1     {v3.8h}, [x6], x1
        b.gt    L(ipred_cfl_splat_w8)
        ret
L(ipred_cfl_splat_w16):
        AARCH64_VALID_JUMP_TARGET
        // Two rows in parallel: x5 walks the first row of ac, x7 the second.
        add     x7,  x5,  w3, uxtw #1
        sub     x1,  x1,  w3, uxtw #1
        mov     w9,  w3
1:
        ld1     {v2.8h, v3.8h}, [x5], #32
        ld1     {v4.8h, v5.8h}, [x7], #32
        subs    w3,  w3,  #16
        smull   v16.4s, v2.4h, v1.4h    // diff = ac * alpha
        smull2  v17.4s, v2.8h, v1.8h
        smull   v18.4s, v3.4h, v1.4h
        smull2  v19.4s, v3.8h, v1.8h
        smull   v2.4s,  v4.4h, v1.4h
        smull2  v3.4s,  v4.8h, v1.8h
        smull   v4.4s,  v5.4h, v1.4h
        smull2  v5.4s,  v5.8h, v1.8h
        cmlt    v20.4s, v16.4s, #0      // sign
        cmlt    v21.4s, v17.4s, #0
        cmlt    v22.4s, v18.4s, #0
        cmlt    v23.4s, v19.4s, #0
        cmlt    v24.4s, v2.4s,  #0
        cmlt    v25.4s, v3.4s,  #0
        cmlt    v26.4s, v4.4s,  #0
        cmlt    v27.4s, v5.4s,  #0
        add     v16.4s, v16.4s, v20.4s  // diff + sign
        add     v17.4s, v17.4s, v21.4s
        add     v18.4s, v18.4s, v22.4s
        add     v19.4s, v19.4s, v23.4s
        add     v2.4s,  v2.4s,  v24.4s
        add     v3.4s,  v3.4s,  v25.4s
        add     v4.4s,  v4.4s,  v26.4s
        add     v5.4s,  v5.4s,  v27.4s
        rshrn   v16.4h, v16.4s, #6      // (diff + sign + 32) >> 6 = apply_sign()
        rshrn2  v16.8h, v17.4s, #6
        rshrn   v17.4h, v18.4s, #6
        rshrn2  v17.8h, v19.4s, #6
        rshrn   v6.4h,  v2.4s,  #6
        rshrn2  v6.8h,  v3.4s,  #6
        rshrn   v7.4h,  v4.4s,  #6
        rshrn2  v7.8h,  v5.4s,  #6
        add     v2.8h,  v16.8h, v0.8h   // dc + apply_sign()
        add     v3.8h,  v17.8h, v0.8h
        add     v4.8h,  v6.8h,  v0.8h
        add     v5.8h,  v7.8h,  v0.8h
        smax    v2.8h,  v2.8h,  v30.8h
        smax    v3.8h,  v3.8h,  v30.8h
        smax    v4.8h,  v4.8h,  v30.8h
        smax    v5.8h,  v5.8h,  v30.8h
        smin    v2.8h,  v2.8h,  v31.8h
        smin    v3.8h,  v3.8h,  v31.8h
        smin    v4.8h,  v4.8h,  v31.8h
        smin    v5.8h,
v5.8h, v31.8h 4536 st1 {v2.8h, v3.8h}, [x0], #32 4537 st1 {v4.8h, v5.8h}, [x6], #32 4538 b.gt 1b 4539 subs w4, w4, #2 4540 add x5, x5, w9, uxtw #1 4541 add x7, x7, w9, uxtw #1 4542 add x0, x0, x1 4543 add x6, x6, x1 4544 mov w3, w9 4545 b.gt 1b 4546 ret 4547 4548L(ipred_cfl_128_tbl): 4549L(ipred_cfl_splat_tbl): 4550 .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16) 4551 .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16) 4552 .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w8) 4553 .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w4) 4554endfunc 4555 4556// void ipred_cfl_top_16bpc_neon(pixel *dst, const ptrdiff_t stride, 4557// const pixel *const topleft, 4558// const int width, const int height, 4559// const int16_t *ac, const int alpha, 4560// const int bitdepth_max); 4561function ipred_cfl_top_16bpc_neon, export=1 4562 dup v31.8h, w7 // bitdepth_max 4563 clz w9, w3 4564 adr x7, L(ipred_cfl_top_tbl) 4565 sub w9, w9, #26 4566 ldrh w9, [x7, w9, uxtw #1] 4567 dup v1.8h, w6 // alpha 4568 add x2, x2, #2 4569 sub x7, x7, w9, uxtw 4570 add x6, x0, x1 4571 lsl x1, x1, #1 4572 movi v30.8h, #0 4573 br x7 45744: 4575 AARCH64_VALID_JUMP_TARGET 4576 ld1 {v0.4h}, [x2] 4577 addv h0, v0.4h 4578 urshr v0.4h, v0.4h, #2 4579 dup v0.8h, v0.h[0] 4580 b L(ipred_cfl_splat_w4) 45818: 4582 AARCH64_VALID_JUMP_TARGET 4583 ld1 {v0.8h}, [x2] 4584 addv h0, v0.8h 4585 urshr v0.4h, v0.4h, #3 4586 dup v0.8h, v0.h[0] 4587 b L(ipred_cfl_splat_w8) 458816: 4589 AARCH64_VALID_JUMP_TARGET 4590 ld1 {v2.8h, v3.8h}, [x2] 4591 addp v0.8h, v2.8h, v3.8h 4592 addv h0, v0.8h 4593 urshr v0.4h, v0.4h, #4 4594 dup v0.8h, v0.h[0] 4595 b L(ipred_cfl_splat_w16) 459632: 4597 AARCH64_VALID_JUMP_TARGET 4598 ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2] 4599 addp v2.8h, v2.8h, v3.8h 4600 addp v4.8h, v4.8h, v5.8h 4601 addp v0.8h, v2.8h, v4.8h 4602 uaddlv s0, v0.8h 4603 rshrn v0.4h, v0.4s, #5 4604 dup v0.8h, v0.h[0] 4605 b L(ipred_cfl_splat_w16) 4606 4607L(ipred_cfl_top_tbl): 4608 .hword L(ipred_cfl_top_tbl) - 32b 4609 
.hword L(ipred_cfl_top_tbl) - 16b 4610 .hword L(ipred_cfl_top_tbl) - 8b 4611 .hword L(ipred_cfl_top_tbl) - 4b 4612endfunc 4613 4614// void ipred_cfl_left_16bpc_neon(pixel *dst, const ptrdiff_t stride, 4615// const pixel *const topleft, 4616// const int width, const int height, 4617// const int16_t *ac, const int alpha, 4618// const int bitdepth_max); 4619function ipred_cfl_left_16bpc_neon, export=1 4620 dup v31.8h, w7 // bitdepth_max 4621 sub x2, x2, w4, uxtw #1 4622 clz w9, w3 4623 clz w8, w4 4624 adr x10, L(ipred_cfl_splat_tbl) 4625 adr x7, L(ipred_cfl_left_tbl) 4626 sub w9, w9, #26 4627 sub w8, w8, #26 4628 ldrh w9, [x10, w9, uxtw #1] 4629 ldrh w8, [x7, w8, uxtw #1] 4630 dup v1.8h, w6 // alpha 4631 sub x9, x10, w9, uxtw 4632 sub x7, x7, w8, uxtw 4633 add x6, x0, x1 4634 lsl x1, x1, #1 4635 movi v30.8h, #0 4636 br x7 4637 4638L(ipred_cfl_left_h4): 4639 AARCH64_VALID_JUMP_TARGET 4640 ld1 {v0.4h}, [x2] 4641 addv h0, v0.4h 4642 urshr v0.4h, v0.4h, #2 4643 dup v0.8h, v0.h[0] 4644 br x9 4645 4646L(ipred_cfl_left_h8): 4647 AARCH64_VALID_JUMP_TARGET 4648 ld1 {v0.8h}, [x2] 4649 addv h0, v0.8h 4650 urshr v0.4h, v0.4h, #3 4651 dup v0.8h, v0.h[0] 4652 br x9 4653 4654L(ipred_cfl_left_h16): 4655 AARCH64_VALID_JUMP_TARGET 4656 ld1 {v2.8h, v3.8h}, [x2] 4657 addp v0.8h, v2.8h, v3.8h 4658 addv h0, v0.8h 4659 urshr v0.4h, v0.4h, #4 4660 dup v0.8h, v0.h[0] 4661 br x9 4662 4663L(ipred_cfl_left_h32): 4664 AARCH64_VALID_JUMP_TARGET 4665 ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2] 4666 addp v2.8h, v2.8h, v3.8h 4667 addp v4.8h, v4.8h, v5.8h 4668 addp v0.8h, v2.8h, v4.8h 4669 uaddlv s0, v0.8h 4670 rshrn v0.4h, v0.4s, #5 4671 dup v0.8h, v0.h[0] 4672 br x9 4673 4674L(ipred_cfl_left_tbl): 4675 .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h32) 4676 .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h16) 4677 .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h8) 4678 .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h4) 4679endfunc 4680 4681// void ipred_cfl_16bpc_neon(pixel *dst, const ptrdiff_t 
stride, 4682// const pixel *const topleft, 4683// const int width, const int height, 4684// const int16_t *ac, const int alpha, 4685// const int bitdepth_max); 4686function ipred_cfl_16bpc_neon, export=1 4687 dup v31.8h, w7 // bitdepth_max 4688 sub x2, x2, w4, uxtw #1 4689 add w8, w3, w4 // width + height 4690 dup v1.8h, w6 // alpha 4691 clz w9, w3 4692 clz w6, w4 4693 dup v16.4s, w8 // width + height 4694 adr x7, L(ipred_cfl_tbl) 4695 rbit w8, w8 // rbit(width + height) 4696 sub w9, w9, #22 // 26 leading bits, minus table offset 4 4697 sub w6, w6, #26 4698 clz w8, w8 // ctz(width + height) 4699 ldrh w9, [x7, w9, uxtw #1] 4700 ldrh w6, [x7, w6, uxtw #1] 4701 neg w8, w8 // -ctz(width + height) 4702 sub x9, x7, w9, uxtw 4703 sub x7, x7, w6, uxtw 4704 ushr v16.4s, v16.4s, #1 // (width + height) >> 1 4705 dup v17.4s, w8 // -ctz(width + height) 4706 add x6, x0, x1 4707 lsl x1, x1, #1 4708 movi v30.8h, #0 4709 br x7 4710 4711L(ipred_cfl_h4): 4712 AARCH64_VALID_JUMP_TARGET 4713 ld1 {v0.4h}, [x2], #8 4714 uaddlv s0, v0.4h 4715 add x2, x2, #2 4716 br x9 4717L(ipred_cfl_w4): 4718 AARCH64_VALID_JUMP_TARGET 4719 ld1 {v2.4h}, [x2] 4720 add v0.2s, v0.2s, v16.2s 4721 uaddlv s2, v2.4h 4722 cmp w4, #4 4723 add v0.2s, v0.2s, v2.2s 4724 ushl v0.2s, v0.2s, v17.2s 4725 b.eq 1f 4726 // h = 8/16 4727 cmp w4, #16 4728 mov w16, #0x6667 4729 mov w17, #0xAAAB 4730 csel w16, w16, w17, eq 4731 dup v16.2s, w16 4732 mul v0.2s, v0.2s, v16.2s 4733 ushr v0.2s, v0.2s, #17 47341: 4735 dup v0.8h, v0.h[0] 4736 b L(ipred_cfl_splat_w4) 4737 4738L(ipred_cfl_h8): 4739 AARCH64_VALID_JUMP_TARGET 4740 ld1 {v0.8h}, [x2], #16 4741 uaddlv s0, v0.8h 4742 add x2, x2, #2 4743 br x9 4744L(ipred_cfl_w8): 4745 AARCH64_VALID_JUMP_TARGET 4746 ld1 {v2.8h}, [x2] 4747 add v0.2s, v0.2s, v16.2s 4748 uaddlv s2, v2.8h 4749 cmp w4, #8 4750 add v0.2s, v0.2s, v2.2s 4751 ushl v0.2s, v0.2s, v17.2s 4752 b.eq 1f 4753 // h = 4/16/32 4754 cmp w4, #32 4755 mov w16, #0x6667 4756 mov w17, #0xAAAB 4757 csel w16, w16, w17, eq 4758 dup 
v16.2s, w16 4759 mul v0.2s, v0.2s, v16.2s 4760 ushr v0.2s, v0.2s, #17 47611: 4762 dup v0.8h, v0.h[0] 4763 b L(ipred_cfl_splat_w8) 4764 4765L(ipred_cfl_h16): 4766 AARCH64_VALID_JUMP_TARGET 4767 ld1 {v2.8h, v3.8h}, [x2], #32 4768 addp v0.8h, v2.8h, v3.8h 4769 add x2, x2, #2 4770 uaddlv s0, v0.8h 4771 br x9 4772L(ipred_cfl_w16): 4773 AARCH64_VALID_JUMP_TARGET 4774 ld1 {v2.8h, v3.8h}, [x2] 4775 add v0.2s, v0.2s, v16.2s 4776 addp v2.8h, v2.8h, v3.8h 4777 uaddlv s2, v2.8h 4778 cmp w4, #16 4779 add v0.2s, v0.2s, v2.2s 4780 ushl v0.2s, v0.2s, v17.2s 4781 b.eq 1f 4782 // h = 4/8/32 4783 tst w4, #(32+16+8) // 16 added to make a consecutive bitmask 4784 mov w16, #0x6667 4785 mov w17, #0xAAAB 4786 csel w16, w16, w17, eq 4787 dup v16.2s, w16 4788 mul v0.2s, v0.2s, v16.2s 4789 ushr v0.2s, v0.2s, #17 47901: 4791 dup v0.8h, v0.h[0] 4792 b L(ipred_cfl_splat_w16) 4793 4794L(ipred_cfl_h32): 4795 AARCH64_VALID_JUMP_TARGET 4796 ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2], #64 4797 addp v2.8h, v2.8h, v3.8h 4798 addp v4.8h, v4.8h, v5.8h 4799 addp v0.8h, v2.8h, v4.8h 4800 add x2, x2, #2 4801 uaddlv s0, v0.8h 4802 br x9 4803L(ipred_cfl_w32): 4804 AARCH64_VALID_JUMP_TARGET 4805 ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2] 4806 add v0.4s, v0.4s, v16.4s 4807 addp v2.8h, v2.8h, v3.8h 4808 addp v4.8h, v4.8h, v5.8h 4809 addp v2.8h, v2.8h, v4.8h 4810 cmp w4, #32 4811 uaddlv s2, v2.8h 4812 add v0.2s, v0.2s, v2.2s 4813 ushl v0.2s, v0.2s, v17.2s 4814 b.eq 1f 4815 // h = 8/16 4816 cmp w4, #8 4817 mov w16, #0x6667 4818 mov w17, #0xAAAB 4819 csel w16, w16, w17, eq 4820 dup v16.2s, w16 4821 mul v0.2s, v0.2s, v16.2s 4822 ushr v0.2s, v0.2s, #17 48231: 4824 dup v0.8h, v0.h[0] 4825 b L(ipred_cfl_splat_w16) 4826 4827L(ipred_cfl_tbl): 4828 .hword L(ipred_cfl_tbl) - L(ipred_cfl_h32) 4829 .hword L(ipred_cfl_tbl) - L(ipred_cfl_h16) 4830 .hword L(ipred_cfl_tbl) - L(ipred_cfl_h8) 4831 .hword L(ipred_cfl_tbl) - L(ipred_cfl_h4) 4832 .hword L(ipred_cfl_tbl) - L(ipred_cfl_w32) 4833 .hword L(ipred_cfl_tbl) - L(ipred_cfl_w16) 4834 
.hword L(ipred_cfl_tbl) - L(ipred_cfl_w8) 4835 .hword L(ipred_cfl_tbl) - L(ipred_cfl_w4) 4836endfunc 4837 4838// void cfl_ac_420_16bpc_neon(int16_t *const ac, const pixel *const ypx, 4839// const ptrdiff_t stride, const int w_pad, 4840// const int h_pad, const int cw, const int ch); 4841function ipred_cfl_ac_420_16bpc_neon, export=1 4842 clz w8, w5 4843 lsl w4, w4, #2 4844 adr x7, L(ipred_cfl_ac_420_tbl) 4845 sub w8, w8, #27 4846 ldrh w8, [x7, w8, uxtw #1] 4847 movi v24.4s, #0 4848 movi v25.4s, #0 4849 movi v26.4s, #0 4850 movi v27.4s, #0 4851 sub x7, x7, w8, uxtw 4852 sub w8, w6, w4 // height - h_pad 4853 rbit w9, w5 // rbit(width) 4854 rbit w10, w6 // rbit(height) 4855 clz w9, w9 // ctz(width) 4856 clz w10, w10 // ctz(height) 4857 add w9, w9, w10 // log2sz 4858 add x10, x1, x2 4859 dup v31.4s, w9 4860 lsl x2, x2, #1 4861 neg v31.4s, v31.4s // -log2sz 4862 br x7 4863 4864L(ipred_cfl_ac_420_w4): 4865 AARCH64_VALID_JUMP_TARGET 48661: // Copy and subsample input 4867 ld1 {v0.8h}, [x1], x2 4868 ld1 {v1.8h}, [x10], x2 4869 ld1 {v2.8h}, [x1], x2 4870 ld1 {v3.8h}, [x10], x2 4871 addp v0.8h, v0.8h, v2.8h 4872 addp v1.8h, v1.8h, v3.8h 4873 add v0.8h, v0.8h, v1.8h 4874 shl v0.8h, v0.8h, #1 4875 subs w8, w8, #2 4876 st1 {v0.8h}, [x0], #16 4877 uaddw v24.4s, v24.4s, v0.4h 4878 uaddw2 v25.4s, v25.4s, v0.8h 4879 b.gt 1b 4880 trn2 v1.2d, v0.2d, v0.2d 4881 trn2 v0.2d, v0.2d, v0.2d 4882L(ipred_cfl_ac_420_w4_hpad): 4883 cbz w4, 3f 48842: // Vertical padding (h_pad > 0) 4885 subs w4, w4, #4 4886 st1 {v0.8h, v1.8h}, [x0], #32 4887 uaddw v24.4s, v24.4s, v0.4h 4888 uaddw2 v25.4s, v25.4s, v0.8h 4889 uaddw v26.4s, v26.4s, v1.4h 4890 uaddw2 v27.4s, v27.4s, v1.8h 4891 b.gt 2b 48923: 4893L(ipred_cfl_ac_420_w4_calc_subtract_dc): 4894 // Aggregate the sums 4895 add v24.4s, v24.4s, v25.4s 4896 add v26.4s, v26.4s, v27.4s 4897 add v0.4s, v24.4s, v26.4s 4898 addv s0, v0.4s // sum 4899 sub x0, x0, w6, uxtw #3 4900 urshl v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >>= log2sz 4901 dup v4.8h, 
v4.h[0] 49026: // Subtract dc from ac 4903 ld1 {v0.8h, v1.8h}, [x0] 4904 subs w6, w6, #4 4905 sub v0.8h, v0.8h, v4.8h 4906 sub v1.8h, v1.8h, v4.8h 4907 st1 {v0.8h, v1.8h}, [x0], #32 4908 b.gt 6b 4909 ret 4910 4911L(ipred_cfl_ac_420_w8): 4912 AARCH64_VALID_JUMP_TARGET 4913 cbnz w3, L(ipred_cfl_ac_420_w8_wpad) 49141: // Copy and subsample input, without padding 4915 ld1 {v0.8h, v1.8h}, [x1], x2 4916 ld1 {v2.8h, v3.8h}, [x10], x2 4917 ld1 {v4.8h, v5.8h}, [x1], x2 4918 addp v0.8h, v0.8h, v1.8h 4919 ld1 {v6.8h, v7.8h}, [x10], x2 4920 addp v2.8h, v2.8h, v3.8h 4921 addp v4.8h, v4.8h, v5.8h 4922 addp v6.8h, v6.8h, v7.8h 4923 add v0.8h, v0.8h, v2.8h 4924 add v4.8h, v4.8h, v6.8h 4925 shl v0.8h, v0.8h, #1 4926 shl v1.8h, v4.8h, #1 4927 subs w8, w8, #2 4928 st1 {v0.8h, v1.8h}, [x0], #32 4929 uaddw v24.4s, v24.4s, v0.4h 4930 uaddw2 v25.4s, v25.4s, v0.8h 4931 uaddw v26.4s, v26.4s, v1.4h 4932 uaddw2 v27.4s, v27.4s, v1.8h 4933 b.gt 1b 4934 mov v0.16b, v1.16b 4935 b L(ipred_cfl_ac_420_w8_hpad) 4936 4937L(ipred_cfl_ac_420_w8_wpad): 49381: // Copy and subsample input, padding 4 4939 ld1 {v0.8h}, [x1], x2 4940 ld1 {v1.8h}, [x10], x2 4941 ld1 {v2.8h}, [x1], x2 4942 ld1 {v3.8h}, [x10], x2 4943 addp v0.8h, v0.8h, v2.8h 4944 addp v1.8h, v1.8h, v3.8h 4945 add v0.8h, v0.8h, v1.8h 4946 shl v0.8h, v0.8h, #1 4947 dup v1.4h, v0.h[3] 4948 dup v3.4h, v0.h[7] 4949 trn2 v2.2d, v0.2d, v0.2d 4950 subs w8, w8, #2 4951 st1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32 4952 uaddw v24.4s, v24.4s, v0.4h 4953 uaddw v25.4s, v25.4s, v1.4h 4954 uaddw v26.4s, v26.4s, v2.4h 4955 uaddw v27.4s, v27.4s, v3.4h 4956 b.gt 1b 4957 trn1 v0.2d, v2.2d, v3.2d 4958 trn1 v1.2d, v2.2d, v3.2d 4959 4960L(ipred_cfl_ac_420_w8_hpad): 4961 cbz w4, 3f 49622: // Vertical padding (h_pad > 0) 4963 subs w4, w4, #4 4964 st1 {v0.8h, v1.8h}, [x0], #32 4965 uaddw v24.4s, v24.4s, v0.4h 4966 uaddw2 v25.4s, v25.4s, v0.8h 4967 uaddw v26.4s, v26.4s, v1.4h 4968 uaddw2 v27.4s, v27.4s, v1.8h 4969 st1 {v0.8h, v1.8h}, [x0], #32 4970 uaddw v24.4s, v24.4s, 
v0.4h 4971 uaddw2 v25.4s, v25.4s, v0.8h 4972 uaddw v26.4s, v26.4s, v1.4h 4973 uaddw2 v27.4s, v27.4s, v1.8h 4974 b.gt 2b 49753: 4976 4977 // Double the height and reuse the w4 summing/subtracting 4978 lsl w6, w6, #1 4979 b L(ipred_cfl_ac_420_w4_calc_subtract_dc) 4980 4981L(ipred_cfl_ac_420_w16): 4982 AARCH64_VALID_JUMP_TARGET 4983 adr x7, L(ipred_cfl_ac_420_w16_tbl) 4984 ldrh w3, [x7, w3, uxtw #1] 4985 sub x7, x7, w3, uxtw 4986 br x7 4987 4988L(ipred_cfl_ac_420_w16_wpad0): 4989 AARCH64_VALID_JUMP_TARGET 49901: // Copy and subsample input, without padding 4991 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2 4992 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2 4993 addp v0.8h, v0.8h, v1.8h 4994 addp v2.8h, v2.8h, v3.8h 4995 addp v4.8h, v4.8h, v5.8h 4996 addp v6.8h, v6.8h, v7.8h 4997 ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x1], x2 4998 add v0.8h, v0.8h, v4.8h 4999 ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x10], x2 5000 add v2.8h, v2.8h, v6.8h 5001 addp v16.8h, v16.8h, v17.8h 5002 addp v18.8h, v18.8h, v19.8h 5003 addp v20.8h, v20.8h, v21.8h 5004 addp v22.8h, v22.8h, v23.8h 5005 add v16.8h, v16.8h, v20.8h 5006 add v18.8h, v18.8h, v22.8h 5007 shl v0.8h, v0.8h, #1 5008 shl v1.8h, v2.8h, #1 5009 shl v2.8h, v16.8h, #1 5010 shl v3.8h, v18.8h, #1 5011 subs w8, w8, #2 5012 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 5013 uaddw v24.4s, v24.4s, v0.4h 5014 uaddw2 v25.4s, v25.4s, v0.8h 5015 uaddw v26.4s, v26.4s, v1.4h 5016 uaddw2 v27.4s, v27.4s, v1.8h 5017 uaddw v24.4s, v24.4s, v2.4h 5018 uaddw2 v25.4s, v25.4s, v2.8h 5019 uaddw v26.4s, v26.4s, v3.4h 5020 uaddw2 v27.4s, v27.4s, v3.8h 5021 b.gt 1b 5022 mov v0.16b, v2.16b 5023 mov v1.16b, v3.16b 5024 b L(ipred_cfl_ac_420_w16_hpad) 5025 5026L(ipred_cfl_ac_420_w16_wpad1): 5027 AARCH64_VALID_JUMP_TARGET 50281: // Copy and subsample input, padding 4 5029 ldr q2, [x1, #32] 5030 ld1 {v0.8h, v1.8h}, [x1], x2 5031 ldr q5, [x10, #32] 5032 ld1 {v3.8h, v4.8h}, [x10], x2 5033 addp v2.8h, v2.8h, v2.8h 5034 addp v0.8h, v0.8h, v1.8h 5035 addp v5.8h, v5.8h, v5.8h 
5036 addp v3.8h, v3.8h, v4.8h 5037 ldr q18, [x1, #32] 5038 add v2.4h, v2.4h, v5.4h 5039 ld1 {v16.8h, v17.8h}, [x1], x2 5040 add v0.8h, v0.8h, v3.8h 5041 ldr q21, [x10, #32] 5042 ld1 {v19.8h, v20.8h}, [x10], x2 5043 addp v18.8h, v18.8h, v18.8h 5044 addp v16.8h, v16.8h, v17.8h 5045 addp v21.8h, v21.8h, v21.8h 5046 addp v19.8h, v19.8h, v20.8h 5047 add v18.4h, v18.4h, v21.4h 5048 add v16.8h, v16.8h, v19.8h 5049 shl v1.4h, v2.4h, #1 5050 shl v0.8h, v0.8h, #1 5051 shl v3.4h, v18.4h, #1 5052 shl v2.8h, v16.8h, #1 5053 dup v4.4h, v1.h[3] 5054 dup v5.4h, v3.h[3] 5055 trn1 v1.2d, v1.2d, v4.2d 5056 trn1 v3.2d, v3.2d, v5.2d 5057 subs w8, w8, #2 5058 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 5059 uaddw v24.4s, v24.4s, v0.4h 5060 uaddw2 v25.4s, v25.4s, v0.8h 5061 uaddw v26.4s, v26.4s, v1.4h 5062 uaddw2 v27.4s, v27.4s, v1.8h 5063 uaddw v24.4s, v24.4s, v2.4h 5064 uaddw2 v25.4s, v25.4s, v2.8h 5065 uaddw v26.4s, v26.4s, v3.4h 5066 uaddw2 v27.4s, v27.4s, v3.8h 5067 b.gt 1b 5068 mov v0.16b, v2.16b 5069 mov v1.16b, v3.16b 5070 b L(ipred_cfl_ac_420_w16_hpad) 5071 5072L(ipred_cfl_ac_420_w16_wpad2): 5073 AARCH64_VALID_JUMP_TARGET 50741: // Copy and subsample input, padding 8 5075 ld1 {v0.8h, v1.8h}, [x1], x2 5076 ld1 {v2.8h, v3.8h}, [x10], x2 5077 ld1 {v4.8h, v5.8h}, [x1], x2 5078 addp v0.8h, v0.8h, v1.8h 5079 ld1 {v6.8h, v7.8h}, [x10], x2 5080 addp v2.8h, v2.8h, v3.8h 5081 addp v4.8h, v4.8h, v5.8h 5082 addp v6.8h, v6.8h, v7.8h 5083 add v0.8h, v0.8h, v2.8h 5084 add v4.8h, v4.8h, v6.8h 5085 shl v0.8h, v0.8h, #1 5086 shl v2.8h, v4.8h, #1 5087 dup v1.8h, v0.h[7] 5088 dup v3.8h, v2.h[7] 5089 subs w8, w8, #2 5090 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 5091 uaddw v24.4s, v24.4s, v0.4h 5092 uaddw2 v25.4s, v25.4s, v0.8h 5093 uaddw v26.4s, v26.4s, v1.4h 5094 uaddw2 v27.4s, v27.4s, v1.8h 5095 uaddw v24.4s, v24.4s, v2.4h 5096 uaddw2 v25.4s, v25.4s, v2.8h 5097 uaddw v26.4s, v26.4s, v3.4h 5098 uaddw2 v27.4s, v27.4s, v3.8h 5099 b.gt 1b 5100 mov v0.16b, v2.16b 5101 mov v1.16b, v3.16b 5102 b 
L(ipred_cfl_ac_420_w16_hpad) 5103 5104L(ipred_cfl_ac_420_w16_wpad3): 5105 AARCH64_VALID_JUMP_TARGET 51061: // Copy and subsample input, padding 12 5107 ld1 {v0.8h}, [x1], x2 5108 ld1 {v2.8h}, [x10], x2 5109 ld1 {v4.8h}, [x1], x2 5110 ld1 {v6.8h}, [x10], x2 5111 addp v0.8h, v0.8h, v4.8h 5112 addp v2.8h, v2.8h, v6.8h 5113 add v0.8h, v0.8h, v2.8h 5114 shl v0.8h, v0.8h, #1 5115 dup v1.8h, v0.h[3] 5116 dup v3.8h, v0.h[7] 5117 trn2 v2.2d, v0.2d, v3.2d 5118 trn1 v0.2d, v0.2d, v1.2d 5119 subs w8, w8, #2 5120 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 5121 uaddw v24.4s, v24.4s, v0.4h 5122 uaddw2 v25.4s, v25.4s, v0.8h 5123 uaddw v26.4s, v26.4s, v1.4h 5124 uaddw2 v27.4s, v27.4s, v1.8h 5125 uaddw v24.4s, v24.4s, v2.4h 5126 uaddw2 v25.4s, v25.4s, v2.8h 5127 uaddw v26.4s, v26.4s, v3.4h 5128 uaddw2 v27.4s, v27.4s, v3.8h 5129 b.gt 1b 5130 mov v0.16b, v2.16b 5131 mov v1.16b, v3.16b 5132 5133L(ipred_cfl_ac_420_w16_hpad): 5134 cbz w4, 3f 51352: // Vertical padding (h_pad > 0) 5136 subs w4, w4, #4 5137 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 5138 uaddw v24.4s, v24.4s, v0.4h 5139 uaddw2 v25.4s, v25.4s, v0.8h 5140 uaddw v26.4s, v26.4s, v1.4h 5141 uaddw2 v27.4s, v27.4s, v1.8h 5142 uaddw v24.4s, v24.4s, v2.4h 5143 uaddw2 v25.4s, v25.4s, v2.8h 5144 uaddw v26.4s, v26.4s, v3.4h 5145 uaddw2 v27.4s, v27.4s, v3.8h 5146 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 5147 uaddw v24.4s, v24.4s, v0.4h 5148 uaddw2 v25.4s, v25.4s, v0.8h 5149 uaddw v26.4s, v26.4s, v1.4h 5150 uaddw2 v27.4s, v27.4s, v1.8h 5151 uaddw v24.4s, v24.4s, v2.4h 5152 uaddw2 v25.4s, v25.4s, v2.8h 5153 uaddw v26.4s, v26.4s, v3.4h 5154 uaddw2 v27.4s, v27.4s, v3.8h 5155 b.gt 2b 51563: 5157 5158 // Quadruple the height and reuse the w4 summing/subtracting 5159 lsl w6, w6, #2 5160 b L(ipred_cfl_ac_420_w4_calc_subtract_dc) 5161 5162L(ipred_cfl_ac_420_tbl): 5163 .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w16) 5164 .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w8) 5165 .hword L(ipred_cfl_ac_420_tbl) - 
L(ipred_cfl_ac_420_w4) 5166 .hword 0 5167 5168L(ipred_cfl_ac_420_w16_tbl): 5169 .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad0) 5170 .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad1) 5171 .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad2) 5172 .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad3) 5173endfunc 5174 5175// void cfl_ac_422_16bpc_neon(int16_t *const ac, const pixel *const ypx, 5176// const ptrdiff_t stride, const int w_pad, 5177// const int h_pad, const int cw, const int ch); 5178function ipred_cfl_ac_422_16bpc_neon, export=1 5179 clz w8, w5 5180 lsl w4, w4, #2 5181 adr x7, L(ipred_cfl_ac_422_tbl) 5182 sub w8, w8, #27 5183 ldrh w8, [x7, w8, uxtw #1] 5184 movi v24.4s, #0 5185 movi v25.4s, #0 5186 movi v26.4s, #0 5187 movi v27.4s, #0 5188 sub x7, x7, w8, uxtw 5189 sub w8, w6, w4 // height - h_pad 5190 rbit w9, w5 // rbit(width) 5191 rbit w10, w6 // rbit(height) 5192 clz w9, w9 // ctz(width) 5193 clz w10, w10 // ctz(height) 5194 add w9, w9, w10 // log2sz 5195 add x10, x1, x2 5196 dup v31.4s, w9 5197 lsl x2, x2, #1 5198 neg v31.4s, v31.4s // -log2sz 5199 br x7 5200 5201L(ipred_cfl_ac_422_w4): 5202 AARCH64_VALID_JUMP_TARGET 52031: // Copy and subsample input 5204 ld1 {v0.8h}, [x1], x2 5205 ld1 {v1.8h}, [x10], x2 5206 ld1 {v2.8h}, [x1], x2 5207 ld1 {v3.8h}, [x10], x2 5208 addp v0.8h, v0.8h, v1.8h 5209 addp v2.8h, v2.8h, v3.8h 5210 shl v0.8h, v0.8h, #2 5211 shl v1.8h, v2.8h, #2 5212 subs w8, w8, #4 5213 st1 {v0.8h, v1.8h}, [x0], #32 5214 uaddw v24.4s, v24.4s, v0.4h 5215 uaddw2 v25.4s, v25.4s, v0.8h 5216 uaddw v26.4s, v26.4s, v1.4h 5217 uaddw2 v27.4s, v27.4s, v1.8h 5218 b.gt 1b 5219 trn2 v0.2d, v1.2d, v1.2d 5220 trn2 v1.2d, v1.2d, v1.2d 5221 b L(ipred_cfl_ac_420_w4_hpad) 5222 5223L(ipred_cfl_ac_422_w8): 5224 AARCH64_VALID_JUMP_TARGET 5225 cbnz w3, L(ipred_cfl_ac_422_w8_wpad) 52261: // Copy and subsample input, without padding 5227 ld1 {v0.8h, v1.8h}, [x1], x2 5228 ld1 {v2.8h, v3.8h}, [x10], x2 
5229 ld1 {v4.8h, v5.8h}, [x1], x2 5230 addp v0.8h, v0.8h, v1.8h 5231 ld1 {v6.8h, v7.8h}, [x10], x2 5232 addp v2.8h, v2.8h, v3.8h 5233 addp v4.8h, v4.8h, v5.8h 5234 addp v6.8h, v6.8h, v7.8h 5235 shl v0.8h, v0.8h, #2 5236 shl v1.8h, v2.8h, #2 5237 shl v2.8h, v4.8h, #2 5238 shl v3.8h, v6.8h, #2 5239 subs w8, w8, #4 5240 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 5241 uaddw v24.4s, v24.4s, v0.4h 5242 uaddw2 v25.4s, v25.4s, v0.8h 5243 uaddw v26.4s, v26.4s, v1.4h 5244 uaddw2 v27.4s, v27.4s, v1.8h 5245 uaddw v24.4s, v24.4s, v2.4h 5246 uaddw2 v25.4s, v25.4s, v2.8h 5247 uaddw v26.4s, v26.4s, v3.4h 5248 uaddw2 v27.4s, v27.4s, v3.8h 5249 b.gt 1b 5250 mov v0.16b, v3.16b 5251 mov v1.16b, v3.16b 5252 b L(ipred_cfl_ac_420_w8_hpad) 5253 5254L(ipred_cfl_ac_422_w8_wpad): 52551: // Copy and subsample input, padding 4 5256 ld1 {v0.8h}, [x1], x2 5257 ld1 {v1.8h}, [x10], x2 5258 ld1 {v2.8h}, [x1], x2 5259 ld1 {v3.8h}, [x10], x2 5260 addp v0.8h, v0.8h, v1.8h 5261 addp v2.8h, v2.8h, v3.8h 5262 shl v0.8h, v0.8h, #2 5263 shl v2.8h, v2.8h, #2 5264 dup v4.4h, v0.h[3] 5265 dup v5.8h, v0.h[7] 5266 dup v6.4h, v2.h[3] 5267 dup v7.8h, v2.h[7] 5268 trn2 v1.2d, v0.2d, v5.2d 5269 trn1 v0.2d, v0.2d, v4.2d 5270 trn2 v3.2d, v2.2d, v7.2d 5271 trn1 v2.2d, v2.2d, v6.2d 5272 subs w8, w8, #4 5273 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 5274 uaddw v24.4s, v24.4s, v0.4h 5275 uaddw2 v25.4s, v25.4s, v0.8h 5276 uaddw v26.4s, v26.4s, v1.4h 5277 uaddw2 v27.4s, v27.4s, v1.8h 5278 uaddw v24.4s, v24.4s, v2.4h 5279 uaddw2 v25.4s, v25.4s, v2.8h 5280 uaddw v26.4s, v26.4s, v3.4h 5281 uaddw2 v27.4s, v27.4s, v3.8h 5282 b.gt 1b 5283 mov v0.16b, v3.16b 5284 mov v1.16b, v3.16b 5285 b L(ipred_cfl_ac_420_w8_hpad) 5286 5287L(ipred_cfl_ac_422_w16): 5288 AARCH64_VALID_JUMP_TARGET 5289 adr x7, L(ipred_cfl_ac_422_w16_tbl) 5290 ldrh w3, [x7, w3, uxtw #1] 5291 sub x7, x7, w3, uxtw 5292 br x7 5293 5294L(ipred_cfl_ac_422_w16_wpad0): 5295 AARCH64_VALID_JUMP_TARGET 52961: // Copy and subsample input, without padding 5297 ld1 {v0.8h, 
v1.8h, v2.8h, v3.8h}, [x1], x2 5298 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2 5299 addp v0.8h, v0.8h, v1.8h 5300 addp v2.8h, v2.8h, v3.8h 5301 addp v4.8h, v4.8h, v5.8h 5302 addp v6.8h, v6.8h, v7.8h 5303 shl v0.8h, v0.8h, #2 5304 shl v1.8h, v2.8h, #2 5305 shl v2.8h, v4.8h, #2 5306 shl v3.8h, v6.8h, #2 5307 subs w8, w8, #2 5308 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 5309 uaddw v24.4s, v24.4s, v0.4h 5310 uaddw2 v25.4s, v25.4s, v0.8h 5311 uaddw v26.4s, v26.4s, v1.4h 5312 uaddw2 v27.4s, v27.4s, v1.8h 5313 uaddw v24.4s, v24.4s, v2.4h 5314 uaddw2 v25.4s, v25.4s, v2.8h 5315 uaddw v26.4s, v26.4s, v3.4h 5316 uaddw2 v27.4s, v27.4s, v3.8h 5317 b.gt 1b 5318 mov v0.16b, v2.16b 5319 mov v1.16b, v3.16b 5320 b L(ipred_cfl_ac_420_w16_hpad) 5321 5322L(ipred_cfl_ac_422_w16_wpad1): 5323 AARCH64_VALID_JUMP_TARGET 53241: // Copy and subsample input, padding 4 5325 ldr q2, [x1, #32] 5326 ld1 {v0.8h, v1.8h}, [x1], x2 5327 ldr q6, [x10, #32] 5328 ld1 {v4.8h, v5.8h}, [x10], x2 5329 addp v2.8h, v2.8h, v2.8h 5330 addp v0.8h, v0.8h, v1.8h 5331 addp v6.8h, v6.8h, v6.8h 5332 addp v4.8h, v4.8h, v5.8h 5333 shl v1.4h, v2.4h, #2 5334 shl v0.8h, v0.8h, #2 5335 shl v3.4h, v6.4h, #2 5336 shl v2.8h, v4.8h, #2 5337 dup v4.4h, v1.h[3] 5338 dup v5.4h, v3.h[3] 5339 trn1 v1.2d, v1.2d, v4.2d 5340 trn1 v3.2d, v3.2d, v5.2d 5341 subs w8, w8, #2 5342 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 5343 uaddw v24.4s, v24.4s, v0.4h 5344 uaddw2 v25.4s, v25.4s, v0.8h 5345 uaddw v26.4s, v26.4s, v1.4h 5346 uaddw2 v27.4s, v27.4s, v1.8h 5347 uaddw v24.4s, v24.4s, v2.4h 5348 uaddw2 v25.4s, v25.4s, v2.8h 5349 uaddw v26.4s, v26.4s, v3.4h 5350 uaddw2 v27.4s, v27.4s, v3.8h 5351 b.gt 1b 5352 mov v0.16b, v2.16b 5353 mov v1.16b, v3.16b 5354 b L(ipred_cfl_ac_420_w16_hpad) 5355 5356L(ipred_cfl_ac_422_w16_wpad2): 5357 AARCH64_VALID_JUMP_TARGET 53581: // Copy and subsample input, padding 8 5359 ld1 {v0.8h, v1.8h}, [x1], x2 5360 ld1 {v2.8h, v3.8h}, [x10], x2 5361 addp v0.8h, v0.8h, v1.8h 5362 addp v2.8h, v2.8h, v3.8h 5363 shl 
v0.8h, v0.8h, #2 5364 shl v2.8h, v2.8h, #2 5365 dup v1.8h, v0.h[7] 5366 dup v3.8h, v2.h[7] 5367 subs w8, w8, #2 5368 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 5369 uaddw v24.4s, v24.4s, v0.4h 5370 uaddw2 v25.4s, v25.4s, v0.8h 5371 uaddw v26.4s, v26.4s, v1.4h 5372 uaddw2 v27.4s, v27.4s, v1.8h 5373 uaddw v24.4s, v24.4s, v2.4h 5374 uaddw2 v25.4s, v25.4s, v2.8h 5375 uaddw v26.4s, v26.4s, v3.4h 5376 uaddw2 v27.4s, v27.4s, v3.8h 5377 b.gt 1b 5378 mov v0.16b, v2.16b 5379 mov v1.16b, v3.16b 5380 b L(ipred_cfl_ac_420_w16_hpad) 5381 5382L(ipred_cfl_ac_422_w16_wpad3): 5383 AARCH64_VALID_JUMP_TARGET 53841: // Copy and subsample input, padding 12 5385 ld1 {v0.8h}, [x1], x2 5386 ld1 {v2.8h}, [x10], x2 5387 addp v0.8h, v0.8h, v0.8h 5388 addp v2.8h, v2.8h, v2.8h 5389 shl v0.4h, v0.4h, #2 5390 shl v2.4h, v2.4h, #2 5391 dup v1.8h, v0.h[3] 5392 dup v3.8h, v2.h[3] 5393 trn1 v0.2d, v0.2d, v1.2d 5394 trn1 v2.2d, v2.2d, v3.2d 5395 subs w8, w8, #2 5396 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 5397 uaddw v24.4s, v24.4s, v0.4h 5398 uaddw2 v25.4s, v25.4s, v0.8h 5399 uaddw v26.4s, v26.4s, v1.4h 5400 uaddw2 v27.4s, v27.4s, v1.8h 5401 uaddw v24.4s, v24.4s, v2.4h 5402 uaddw2 v25.4s, v25.4s, v2.8h 5403 uaddw v26.4s, v26.4s, v3.4h 5404 uaddw2 v27.4s, v27.4s, v3.8h 5405 b.gt 1b 5406 mov v0.16b, v2.16b 5407 mov v1.16b, v3.16b 5408 b L(ipred_cfl_ac_420_w16_hpad) 5409 5410L(ipred_cfl_ac_422_tbl): 5411 .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w16) 5412 .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w8) 5413 .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w4) 5414 .hword 0 5415 5416L(ipred_cfl_ac_422_w16_tbl): 5417 .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad0) 5418 .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad1) 5419 .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad2) 5420 .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad3) 5421endfunc 5422 5423// void cfl_ac_444_16bpc_neon(int16_t *const ac, const pixel 
*const ypx, 5424// const ptrdiff_t stride, const int w_pad, 5425// const int h_pad, const int cw, const int ch); 5426function ipred_cfl_ac_444_16bpc_neon, export=1 5427 clz w8, w5 5428 lsl w4, w4, #2 5429 adr x7, L(ipred_cfl_ac_444_tbl) 5430 sub w8, w8, #26 5431 ldrh w8, [x7, w8, uxtw #1] 5432 movi v24.4s, #0 5433 movi v25.4s, #0 5434 movi v26.4s, #0 5435 movi v27.4s, #0 5436 sub x7, x7, w8, uxtw 5437 sub w8, w6, w4 // height - h_pad 5438 rbit w9, w5 // rbit(width) 5439 rbit w10, w6 // rbit(height) 5440 clz w9, w9 // ctz(width) 5441 clz w10, w10 // ctz(height) 5442 add w9, w9, w10 // log2sz 5443 add x10, x1, x2 5444 dup v31.4s, w9 5445 lsl x2, x2, #1 5446 neg v31.4s, v31.4s // -log2sz 5447 br x7 5448 5449L(ipred_cfl_ac_444_w4): 5450 AARCH64_VALID_JUMP_TARGET 54511: // Copy and expand input 5452 ld1 {v0.4h}, [x1], x2 5453 ld1 {v0.d}[1], [x10], x2 5454 ld1 {v1.4h}, [x1], x2 5455 ld1 {v1.d}[1], [x10], x2 5456 shl v0.8h, v0.8h, #3 5457 shl v1.8h, v1.8h, #3 5458 subs w8, w8, #4 5459 st1 {v0.8h, v1.8h}, [x0], #32 5460 uaddw v24.4s, v24.4s, v0.4h 5461 uaddw2 v25.4s, v25.4s, v0.8h 5462 uaddw v26.4s, v26.4s, v1.4h 5463 uaddw2 v27.4s, v27.4s, v1.8h 5464 b.gt 1b 5465 trn2 v0.2d, v1.2d, v1.2d 5466 trn2 v1.2d, v1.2d, v1.2d 5467 b L(ipred_cfl_ac_420_w4_hpad) 5468 5469L(ipred_cfl_ac_444_w8): 5470 AARCH64_VALID_JUMP_TARGET 54711: // Copy and expand input 5472 ld1 {v0.8h}, [x1], x2 5473 ld1 {v1.8h}, [x10], x2 5474 ld1 {v2.8h}, [x1], x2 5475 shl v0.8h, v0.8h, #3 5476 ld1 {v3.8h}, [x10], x2 5477 shl v1.8h, v1.8h, #3 5478 shl v2.8h, v2.8h, #3 5479 shl v3.8h, v3.8h, #3 5480 subs w8, w8, #4 5481 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 5482 uaddw v24.4s, v24.4s, v0.4h 5483 uaddw2 v25.4s, v25.4s, v0.8h 5484 uaddw v26.4s, v26.4s, v1.4h 5485 uaddw2 v27.4s, v27.4s, v1.8h 5486 uaddw v24.4s, v24.4s, v2.4h 5487 uaddw2 v25.4s, v25.4s, v2.8h 5488 uaddw v26.4s, v26.4s, v3.4h 5489 uaddw2 v27.4s, v27.4s, v3.8h 5490 b.gt 1b 5491 mov v0.16b, v3.16b 5492 mov v1.16b, v3.16b 5493 b 
L(ipred_cfl_ac_420_w8_hpad) 5494 5495L(ipred_cfl_ac_444_w16): 5496 AARCH64_VALID_JUMP_TARGET 5497 cbnz w3, L(ipred_cfl_ac_444_w16_wpad) 54981: // Copy and expand input, without padding 5499 ld1 {v0.8h, v1.8h}, [x1], x2 5500 ld1 {v2.8h, v3.8h}, [x10], x2 5501 shl v0.8h, v0.8h, #3 5502 shl v1.8h, v1.8h, #3 5503 shl v2.8h, v2.8h, #3 5504 shl v3.8h, v3.8h, #3 5505 subs w8, w8, #2 5506 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 5507 uaddw v24.4s, v24.4s, v0.4h 5508 uaddw2 v25.4s, v25.4s, v0.8h 5509 uaddw v26.4s, v26.4s, v1.4h 5510 uaddw2 v27.4s, v27.4s, v1.8h 5511 uaddw v24.4s, v24.4s, v2.4h 5512 uaddw2 v25.4s, v25.4s, v2.8h 5513 uaddw v26.4s, v26.4s, v3.4h 5514 uaddw2 v27.4s, v27.4s, v3.8h 5515 b.gt 1b 5516 mov v0.16b, v2.16b 5517 mov v1.16b, v3.16b 5518 b L(ipred_cfl_ac_420_w16_hpad) 5519 5520L(ipred_cfl_ac_444_w16_wpad): 55211: // Copy and expand input, padding 8 5522 ld1 {v0.8h}, [x1], x2 5523 ld1 {v2.8h}, [x10], x2 5524 shl v0.8h, v0.8h, #3 5525 shl v2.8h, v2.8h, #3 5526 dup v1.8h, v0.h[7] 5527 dup v3.8h, v2.h[7] 5528 subs w8, w8, #2 5529 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 5530 uaddw v24.4s, v24.4s, v0.4h 5531 uaddw2 v25.4s, v25.4s, v0.8h 5532 uaddw v26.4s, v26.4s, v1.4h 5533 uaddw2 v27.4s, v27.4s, v1.8h 5534 uaddw v24.4s, v24.4s, v2.4h 5535 uaddw2 v25.4s, v25.4s, v2.8h 5536 uaddw v26.4s, v26.4s, v3.4h 5537 uaddw2 v27.4s, v27.4s, v3.8h 5538 b.gt 1b 5539 mov v0.16b, v2.16b 5540 mov v1.16b, v3.16b 5541 b L(ipred_cfl_ac_420_w16_hpad) 5542 5543L(ipred_cfl_ac_444_w32): 5544 AARCH64_VALID_JUMP_TARGET 5545 adr x7, L(ipred_cfl_ac_444_w32_tbl) 5546 ldrh w3, [x7, w3, uxtw] // (w3>>1) << 1 5547 lsr x2, x2, #1 // Restore the stride to one line increments 5548 sub x7, x7, w3, uxtw 5549 br x7 5550 5551L(ipred_cfl_ac_444_w32_wpad0): 5552 AARCH64_VALID_JUMP_TARGET 55531: // Copy and expand input, without padding 5554 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2 5555 shl v0.8h, v0.8h, #3 5556 shl v1.8h, v1.8h, #3 5557 shl v2.8h, v2.8h, #3 5558 shl v3.8h, v3.8h, #3 5559 subs 
w8, w8, #1 5560 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 5561 uaddw v24.4s, v24.4s, v0.4h 5562 uaddw2 v25.4s, v25.4s, v0.8h 5563 uaddw v26.4s, v26.4s, v1.4h 5564 uaddw2 v27.4s, v27.4s, v1.8h 5565 uaddw v24.4s, v24.4s, v2.4h 5566 uaddw2 v25.4s, v25.4s, v2.8h 5567 uaddw v26.4s, v26.4s, v3.4h 5568 uaddw2 v27.4s, v27.4s, v3.8h 5569 b.gt 1b 5570 b L(ipred_cfl_ac_444_w32_hpad) 5571 5572L(ipred_cfl_ac_444_w32_wpad2): 5573 AARCH64_VALID_JUMP_TARGET 55741: // Copy and expand input, padding 8 5575 ld1 {v0.8h, v1.8h, v2.8h}, [x1], x2 5576 shl v2.8h, v2.8h, #3 5577 shl v0.8h, v0.8h, #3 5578 shl v1.8h, v1.8h, #3 5579 dup v3.8h, v2.h[7] 5580 subs w8, w8, #1 5581 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 5582 uaddw v24.4s, v24.4s, v0.4h 5583 uaddw2 v25.4s, v25.4s, v0.8h 5584 uaddw v26.4s, v26.4s, v1.4h 5585 uaddw2 v27.4s, v27.4s, v1.8h 5586 uaddw v24.4s, v24.4s, v2.4h 5587 uaddw2 v25.4s, v25.4s, v2.8h 5588 uaddw v26.4s, v26.4s, v3.4h 5589 uaddw2 v27.4s, v27.4s, v3.8h 5590 b.gt 1b 5591 b L(ipred_cfl_ac_444_w32_hpad) 5592 5593L(ipred_cfl_ac_444_w32_wpad4): 5594 AARCH64_VALID_JUMP_TARGET 55951: // Copy and expand input, padding 16 5596 ld1 {v0.8h, v1.8h}, [x1], x2 5597 shl v1.8h, v1.8h, #3 5598 shl v0.8h, v0.8h, #3 5599 dup v2.8h, v1.h[7] 5600 dup v3.8h, v1.h[7] 5601 subs w8, w8, #1 5602 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 5603 uaddw v24.4s, v24.4s, v0.4h 5604 uaddw2 v25.4s, v25.4s, v0.8h 5605 uaddw v26.4s, v26.4s, v1.4h 5606 uaddw2 v27.4s, v27.4s, v1.8h 5607 uaddw v24.4s, v24.4s, v2.4h 5608 uaddw2 v25.4s, v25.4s, v2.8h 5609 uaddw v26.4s, v26.4s, v3.4h 5610 uaddw2 v27.4s, v27.4s, v3.8h 5611 b.gt 1b 5612 b L(ipred_cfl_ac_444_w32_hpad) 5613 5614L(ipred_cfl_ac_444_w32_wpad6): 5615 AARCH64_VALID_JUMP_TARGET 56161: // Copy and expand input, padding 24 5617 ld1 {v0.8h}, [x1], x2 5618 shl v0.8h, v0.8h, #3 5619 dup v1.8h, v0.h[7] 5620 dup v2.8h, v0.h[7] 5621 dup v3.8h, v0.h[7] 5622 subs w8, w8, #1 5623 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 5624 uaddw v24.4s, v24.4s, v0.4h 
5625 uaddw2 v25.4s, v25.4s, v0.8h 5626 uaddw v26.4s, v26.4s, v1.4h 5627 uaddw2 v27.4s, v27.4s, v1.8h 5628 uaddw v24.4s, v24.4s, v2.4h 5629 uaddw2 v25.4s, v25.4s, v2.8h 5630 uaddw v26.4s, v26.4s, v3.4h 5631 uaddw2 v27.4s, v27.4s, v3.8h 5632 b.gt 1b 5633 5634L(ipred_cfl_ac_444_w32_hpad): 5635 cbz w4, 3f 56362: // Vertical padding (h_pad > 0) 5637 subs w4, w4, #2 5638 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 5639 uaddw v24.4s, v24.4s, v0.4h 5640 uaddw2 v25.4s, v25.4s, v0.8h 5641 uaddw v26.4s, v26.4s, v1.4h 5642 uaddw2 v27.4s, v27.4s, v1.8h 5643 uaddw v24.4s, v24.4s, v2.4h 5644 uaddw2 v25.4s, v25.4s, v2.8h 5645 uaddw v26.4s, v26.4s, v3.4h 5646 uaddw2 v27.4s, v27.4s, v3.8h 5647 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 5648 uaddw v24.4s, v24.4s, v0.4h 5649 uaddw2 v25.4s, v25.4s, v0.8h 5650 uaddw v26.4s, v26.4s, v1.4h 5651 uaddw2 v27.4s, v27.4s, v1.8h 5652 uaddw v24.4s, v24.4s, v2.4h 5653 uaddw2 v25.4s, v25.4s, v2.8h 5654 uaddw v26.4s, v26.4s, v3.4h 5655 uaddw2 v27.4s, v27.4s, v3.8h 5656 b.gt 2b 56573: 5658 5659 // Multiply the height by eight and reuse the w4 subtracting 5660 lsl w6, w6, #3 5661 b L(ipred_cfl_ac_420_w4_calc_subtract_dc) 5662 5663L(ipred_cfl_ac_444_tbl): 5664 .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w32) 5665 .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w16) 5666 .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w8) 5667 .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w4) 5668 5669L(ipred_cfl_ac_444_w32_tbl): 5670 .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad0) 5671 .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad2) 5672 .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad4) 5673 .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad6) 5674endfunc 5675