/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2019, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"

// void ipred_dc_128_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                             const pixel *const topleft,
//                             const int width, const int height, const int a,
//                             const int max_width, const int max_height);
function ipred_dc_128_8bpc_neon, export=1
        clz             w3,  w3
        adr             x5,  L(ipred_dc_128_tbl)
        sub             w3,  w3,  #25
        ldrh            w3,  [x5, w3, uxtw #1]
        movi            v0.16b,  #128
        sub             x5,  x5,  w3,  uxtw
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
        br              x5
4:
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.s}[0],  [x0],  x1
        st1             {v0.s}[0],  [x6],  x1
        subs            w4,  w4,  #4
        st1             {v0.s}[0],  [x0],  x1
        st1             {v0.s}[0],  [x6],  x1
        b.gt            4b
        ret
8:
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.8b},  [x0],  x1
        st1             {v0.8b},  [x6],  x1
        subs            w4,  w4,  #4
        st1             {v0.8b},  [x0],  x1
        st1             {v0.8b},  [x6],  x1
        b.gt            8b
        ret
16:
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.16b},  [x0],  x1
        st1             {v0.16b},  [x6],  x1
        subs            w4,  w4,  #4
        st1             {v0.16b},  [x0],  x1
        st1             {v0.16b},  [x6],  x1
        b.gt            16b
        ret
320:
        AARCH64_VALID_JUMP_TARGET
        movi            v1.16b,  #128
32:
        st1             {v0.16b, v1.16b},  [x0],  x1
        st1             {v0.16b, v1.16b},  [x6],  x1
        subs            w4,  w4,  #4
        st1             {v0.16b, v1.16b},  [x0],  x1
        st1             {v0.16b, v1.16b},  [x6],  x1
        b.gt            32b
        ret
640:
        AARCH64_VALID_JUMP_TARGET
        movi            v1.16b,  #128
        movi            v2.16b,  #128
        movi            v3.16b,  #128
64:
        st1             {v0.16b, v1.16b, v2.16b, v3.16b},  [x0],  x1
        st1             {v0.16b, v1.16b, v2.16b, v3.16b},  [x6],  x1
        subs            w4,  w4,  #4
        st1             {v0.16b, v1.16b, v2.16b, v3.16b},  [x0],  x1
        st1             {v0.16b, v1.16b, v2.16b, v3.16b},  [x6],  x1
        b.gt            64b
        ret

L(ipred_dc_128_tbl):
        .hword L(ipred_dc_128_tbl) - 640b
        .hword L(ipred_dc_128_tbl) - 320b
        .hword L(ipred_dc_128_tbl) -  16b
        .hword L(ipred_dc_128_tbl) -   8b
        .hword L(ipred_dc_128_tbl) -   4b
endfunc
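
// A worked example of the jump table dispatch used throughout this file:
// the block width is a power of two in [4, 64], so clz(width) lies in
// [25, 29]. After the sub, e.g. width == 16 yields clz(16) - 25 == 2,
// selecting the third .hword entry; the table entries are therefore
// ordered from the 64-wide case down to the 4-wide case.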

// void ipred_v_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                        const pixel *const topleft,
//                        const int width, const int height, const int a,
//                        const int max_width, const int max_height);
function ipred_v_8bpc_neon, export=1
        clz             w3,  w3
        adr             x5,  L(ipred_v_tbl)
        sub             w3,  w3,  #25
        ldrh            w3,  [x5, w3, uxtw #1]
        add             x2,  x2,  #1
        sub             x5,  x5,  w3,  uxtw
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
        br              x5
40:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.s}[0],  [x2]
4:
        st1             {v0.s}[0],  [x0],  x1
        st1             {v0.s}[0],  [x6],  x1
        subs            w4,  w4,  #4
        st1             {v0.s}[0],  [x0],  x1
        st1             {v0.s}[0],  [x6],  x1
        b.gt            4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8b},  [x2]
8:
        st1             {v0.8b},  [x0],  x1
        st1             {v0.8b},  [x6],  x1
        subs            w4,  w4,  #4
        st1             {v0.8b},  [x0],  x1
        st1             {v0.8b},  [x6],  x1
        b.gt            8b
        ret
160:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.16b},  [x2]
16:
        st1             {v0.16b},  [x0],  x1
        st1             {v0.16b},  [x6],  x1
        subs            w4,  w4,  #4
        st1             {v0.16b},  [x0],  x1
        st1             {v0.16b},  [x6],  x1
        b.gt            16b
        ret
320:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.16b, v1.16b},  [x2]
32:
        st1             {v0.16b, v1.16b},  [x0],  x1
        st1             {v0.16b, v1.16b},  [x6],  x1
        subs            w4,  w4,  #4
        st1             {v0.16b, v1.16b},  [x0],  x1
        st1             {v0.16b, v1.16b},  [x6],  x1
        b.gt            32b
        ret
640:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.16b, v1.16b, v2.16b, v3.16b},  [x2]
64:
        st1             {v0.16b, v1.16b, v2.16b, v3.16b},  [x0],  x1
        st1             {v0.16b, v1.16b, v2.16b, v3.16b},  [x6],  x1
        subs            w4,  w4,  #4
        st1             {v0.16b, v1.16b, v2.16b, v3.16b},  [x0],  x1
        st1             {v0.16b, v1.16b, v2.16b, v3.16b},  [x6],  x1
        b.gt            64b
        ret

L(ipred_v_tbl):
        .hword L(ipred_v_tbl) - 640b
        .hword L(ipred_v_tbl) - 320b
        .hword L(ipred_v_tbl) - 160b
        .hword L(ipred_v_tbl) -  80b
        .hword L(ipred_v_tbl) -  40b
endfunc

// void ipred_h_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                        const pixel *const topleft,
//                        const int width, const int height, const int a,
//                        const int max_width, const int max_height);
function ipred_h_8bpc_neon, export=1
        clz             w3,  w3
        adr             x5,  L(ipred_h_tbl)
        sub             w3,  w3,  #25
        ldrh            w3,  [x5, w3, uxtw #1]
        sub             x2,  x2,  #4
        sub             x5,  x5,  w3,  uxtw
        mov             x7,  #-4
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
        br              x5
4:
        AARCH64_VALID_JUMP_TARGET
        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2],  x7
        st1             {v3.s}[0],  [x0],  x1
        st1             {v2.s}[0],  [x6],  x1
        subs            w4,  w4,  #4
        st1             {v1.s}[0],  [x0],  x1
        st1             {v0.s}[0],  [x6],  x1
        b.gt            4b
        ret
8:
        AARCH64_VALID_JUMP_TARGET
        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2],  x7
        st1             {v3.8b},  [x0],  x1
        st1             {v2.8b},  [x6],  x1
        subs            w4,  w4,  #4
        st1             {v1.8b},  [x0],  x1
        st1             {v0.8b},  [x6],  x1
        b.gt            8b
        ret
16:
        AARCH64_VALID_JUMP_TARGET
        ld4r            {v0.16b, v1.16b, v2.16b, v3.16b},  [x2],  x7
        st1             {v3.16b},  [x0],  x1
        st1             {v2.16b},  [x6],  x1
        subs            w4,  w4,  #4
        st1             {v1.16b},  [x0],  x1
        st1             {v0.16b},  [x6],  x1
        b.gt            16b
        ret
32:
        AARCH64_VALID_JUMP_TARGET
        ld4r            {v0.16b, v1.16b, v2.16b, v3.16b},  [x2],  x7
        str             q3,  [x0, #16]
        str             q2,  [x6, #16]
        st1             {v3.16b},  [x0],  x1
        st1             {v2.16b},  [x6],  x1
        subs            w4,  w4,  #4
        str             q1,  [x0, #16]
        str             q0,  [x6, #16]
        st1             {v1.16b},  [x0],  x1
        st1             {v0.16b},  [x6],  x1
        b.gt            32b
        ret
64:
        AARCH64_VALID_JUMP_TARGET
        ld4r            {v0.16b, v1.16b, v2.16b, v3.16b},  [x2],  x7
        str             q3,  [x0, #16]
        str             q2,  [x6, #16]
        stp             q3,  q3,  [x0, #32]
        stp             q2,  q2,  [x6, #32]
        st1             {v3.16b},  [x0],  x1
        st1             {v2.16b},  [x6],  x1
        subs            w4,  w4,  #4
        str             q1,  [x0, #16]
        str             q0,  [x6, #16]
        stp             q1,  q1,  [x0, #32]
        stp             q0,  q0,  [x6, #32]
        st1             {v1.16b},  [x0],  x1
        st1             {v0.16b},  [x6],  x1
        b.gt            64b
        ret

L(ipred_h_tbl):
        .hword L(ipred_h_tbl) - 64b
        .hword L(ipred_h_tbl) - 32b
        .hword L(ipred_h_tbl) - 16b
        .hword L(ipred_h_tbl) -  8b
        .hword L(ipred_h_tbl) -  4b
endfunc
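
// Note on the ipred_h loops above: the left edge is stored below the
// top-left pixel, so x2 is pre-decremented by 4 and x7 holds a -4
// stride. Each ld4r thus splats topleft[-4..-1] into v0..v3, i.e. v3
// holds the topmost of the four rows, which is why every iteration
// stores v3 first and v0 last.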

// void ipred_dc_top_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                             const pixel *const topleft,
//                             const int width, const int height, const int a,
//                             const int max_width, const int max_height);
function ipred_dc_top_8bpc_neon, export=1
        clz             w3,  w3
        adr             x5,  L(ipred_dc_top_tbl)
        sub             w3,  w3,  #25
        ldrh            w3,  [x5, w3, uxtw #1]
        add             x2,  x2,  #1
        sub             x5,  x5,  w3,  uxtw
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
        br              x5
40:
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v0.2s},  [x2]
        uaddlv          h0,  v0.8b
        rshrn           v0.8b,  v0.8h,  #3
        dup             v0.8b,  v0.b[0]
4:
        st1             {v0.s}[0],  [x0],  x1
        st1             {v0.s}[0],  [x6],  x1
        subs            w4,  w4,  #4
        st1             {v0.s}[0],  [x0],  x1
        st1             {v0.s}[0],  [x6],  x1
        b.gt            4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8b},  [x2]
        uaddlv          h0,  v0.8b
        rshrn           v0.8b,  v0.8h,  #3
        dup             v0.8b,  v0.b[0]
8:
        st1             {v0.8b},  [x0],  x1
        st1             {v0.8b},  [x6],  x1
        subs            w4,  w4,  #4
        st1             {v0.8b},  [x0],  x1
        st1             {v0.8b},  [x6],  x1
        b.gt            8b
        ret
160:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.16b},  [x2]
        uaddlv          h0,  v0.16b
        rshrn           v0.8b,  v0.8h,  #4
        dup             v0.16b,  v0.b[0]
16:
        st1             {v0.16b},  [x0],  x1
        st1             {v0.16b},  [x6],  x1
        subs            w4,  w4,  #4
        st1             {v0.16b},  [x0],  x1
        st1             {v0.16b},  [x6],  x1
        b.gt            16b
        ret
320:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.16b, v1.16b},  [x2]
        uaddlv          h0,  v0.16b
        uaddlv          h1,  v1.16b
        add             v2.4h,  v0.4h,  v1.4h
        rshrn           v2.8b,  v2.8h,  #5
        dup             v0.16b,  v2.b[0]
        dup             v1.16b,  v2.b[0]
32:
        st1             {v0.16b, v1.16b},  [x0],  x1
        st1             {v0.16b, v1.16b},  [x6],  x1
        subs            w4,  w4,  #4
        st1             {v0.16b, v1.16b},  [x0],  x1
        st1             {v0.16b, v1.16b},  [x6],  x1
        b.gt            32b
        ret
640:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.16b, v1.16b, v2.16b, v3.16b},  [x2]
        uaddlv          h0,  v0.16b
        uaddlv          h1,  v1.16b
        uaddlv          h2,  v2.16b
        uaddlv          h3,  v3.16b
        add             v4.4h,  v0.4h,  v1.4h
        add             v5.4h,  v2.4h,  v3.4h
        add             v4.4h,  v4.4h,  v5.4h
        rshrn           v4.8b,  v4.8h,  #6
        dup             v0.16b,  v4.b[0]
        dup             v1.16b,  v4.b[0]
        dup             v2.16b,  v4.b[0]
        dup             v3.16b,  v4.b[0]
64:
        st1             {v0.16b, v1.16b, v2.16b, v3.16b},  [x0],  x1
        st1             {v0.16b, v1.16b, v2.16b, v3.16b},  [x6],  x1
        subs            w4,  w4,  #4
        st1             {v0.16b, v1.16b, v2.16b, v3.16b},  [x0],  x1
        st1             {v0.16b, v1.16b, v2.16b, v3.16b},  [x6],  x1
        b.gt            64b
        ret

L(ipred_dc_top_tbl):
        .hword L(ipred_dc_top_tbl) - 640b
        .hword L(ipred_dc_top_tbl) - 320b
        .hword L(ipred_dc_top_tbl) - 160b
        .hword L(ipred_dc_top_tbl) -  80b
        .hword L(ipred_dc_top_tbl) -  40b
endfunc
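
// Rounding note for the dc_top sums above: for an N-pixel top edge,
// uaddlv produces the plain sum and rshrn by log2(N) computes
// (sum + N/2) >> log2(N), i.e. a correctly rounded average, which dup
// then broadcasts. (In the 4-wide case the four pixels are loaded twice
// via ld1r {v0.2s}, so the 8-lane sum is 2*sum and the shift stays #3.)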

// void ipred_dc_left_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                              const pixel *const topleft,
//                              const int width, const int height, const int a,
//                              const int max_width, const int max_height);
function ipred_dc_left_8bpc_neon, export=1
        sub             x2,  x2,  w4,  uxtw
        clz             w3,  w3
        clz             w7,  w4
        adr             x5,  L(ipred_dc_left_tbl)
        sub             w3,  w3,  #20 // 25 leading bits, minus table offset 5
        sub             w7,  w7,  #25
        ldrh            w3,  [x5, w3, uxtw #1]
        ldrh            w7,  [x5, w7, uxtw #1]
        sub             x3,  x5,  w3,  uxtw
        sub             x5,  x5,  w7,  uxtw
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
        br              x5

L(ipred_dc_left_h4):
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v0.2s},  [x2]
        uaddlv          h0,  v0.8b
        rshrn           v0.8b,  v0.8h,  #3
        dup             v0.16b,  v0.b[0]
        br              x3
L(ipred_dc_left_w4):
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.s}[0],  [x0],  x1
        st1             {v0.s}[0],  [x6],  x1
        subs            w4,  w4,  #4
        st1             {v0.s}[0],  [x0],  x1
        st1             {v0.s}[0],  [x6],  x1
        b.gt            L(ipred_dc_left_w4)
        ret

L(ipred_dc_left_h8):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8b},  [x2]
        uaddlv          h0,  v0.8b
        rshrn           v0.8b,  v0.8h,  #3
        dup             v0.16b,  v0.b[0]
        br              x3
L(ipred_dc_left_w8):
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.8b},  [x0],  x1
        st1             {v0.8b},  [x6],  x1
        subs            w4,  w4,  #4
        st1             {v0.8b},  [x0],  x1
        st1             {v0.8b},  [x6],  x1
        b.gt            L(ipred_dc_left_w8)
        ret

L(ipred_dc_left_h16):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.16b},  [x2]
        uaddlv          h0,  v0.16b
        rshrn           v0.8b,  v0.8h,  #4
        dup             v0.16b,  v0.b[0]
        br              x3
L(ipred_dc_left_w16):
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.16b},  [x0],  x1
        st1             {v0.16b},  [x6],  x1
        subs            w4,  w4,  #4
        st1             {v0.16b},  [x0],  x1
        st1             {v0.16b},  [x6],  x1
        b.gt            L(ipred_dc_left_w16)
        ret

L(ipred_dc_left_h32):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.16b, v1.16b},  [x2]
        uaddlv          h0,  v0.16b
        uaddlv          h1,  v1.16b
        add             v0.4h,  v0.4h,  v1.4h
        rshrn           v0.8b,  v0.8h,  #5
        dup             v0.16b,  v0.b[0]
        br              x3
L(ipred_dc_left_w32):
        AARCH64_VALID_JUMP_TARGET
        mov             v1.16b,  v0.16b
1:
        st1             {v0.16b, v1.16b},  [x0],  x1
        st1             {v0.16b, v1.16b},  [x6],  x1
        subs            w4,  w4,  #4
        st1             {v0.16b, v1.16b},  [x0],  x1
        st1             {v0.16b, v1.16b},  [x6],  x1
        b.gt            1b
        ret

L(ipred_dc_left_h64):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.16b, v1.16b, v2.16b, v3.16b},  [x2]
        uaddlv          h0,  v0.16b
        uaddlv          h1,  v1.16b
        uaddlv          h2,  v2.16b
        uaddlv          h3,  v3.16b
        add             v0.4h,  v0.4h,  v1.4h
        add             v2.4h,  v2.4h,  v3.4h
        add             v0.4h,  v0.4h,  v2.4h
        rshrn           v0.8b,  v0.8h,  #6
        dup             v0.16b,  v0.b[0]
        br              x3
L(ipred_dc_left_w64):
        AARCH64_VALID_JUMP_TARGET
        mov             v1.16b,  v0.16b
        mov             v2.16b,  v0.16b
        mov             v3.16b,  v0.16b
1:
        st1             {v0.16b, v1.16b, v2.16b, v3.16b},  [x0],  x1
        st1             {v0.16b, v1.16b, v2.16b, v3.16b},  [x6],  x1
        subs            w4,  w4,  #4
        st1             {v0.16b, v1.16b, v2.16b, v3.16b},  [x0],  x1
        st1             {v0.16b, v1.16b, v2.16b, v3.16b},  [x6],  x1
        b.gt            1b
        ret

L(ipred_dc_left_tbl):
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h64)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h32)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h16)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h8)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h4)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w64)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w32)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w16)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w8)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w4)
endfunc

// void ipred_dc_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                         const pixel *const topleft,
//                         const int width, const int height, const int a,
//                         const int max_width, const int max_height);
function ipred_dc_8bpc_neon, export=1
        sub             x2,  x2,  w4,  uxtw
        add             w7,  w3,  w4            // width + height
        clz             w3,  w3
        clz             w6,  w4
        dup             v16.8h,  w7             // width + height
        adr             x5,  L(ipred_dc_tbl)
        rbit            w7,  w7                 // rbit(width + height)
        sub             w3,  w3,  #20           // 25 leading bits, minus table offset 5
        sub             w6,  w6,  #25
        clz             w7,  w7                 // ctz(width + height)
        ldrh            w3,  [x5, w3, uxtw #1]
        ldrh            w6,  [x5, w6, uxtw #1]
        neg             w7,  w7                 // -ctz(width + height)
        sub             x3,  x5,  w3,  uxtw
        sub             x5,  x5,  w6,  uxtw
        ushr            v16.8h,  v16.8h,  #1    // (width + height) >> 1
        dup             v17.8h,  w7             // -ctz(width + height)
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
        br              x5
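
// How the division by a non-power-of-two width+height works below, as a
// worked example: dc = (sum + ((w+h)>>1)) / (w+h). The ushl by -ctz(w+h)
// divides out the power-of-two factor; for rectangular blocks a factor
// of 3 or 5 remains, handled by sqdmulh with 0x5556/2 or 0x3334/2
// (sqdmulh computes (2*a*b)>>16, and 0x5556/65536 ~= 1/3,
// 0x3334/65536 ~= 1/5). E.g. w=4, h=8: (sum + 6) >> 2, then
// approximately /3, giving /12 overall.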

L(ipred_dc_h4):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.s}[0],  [x2],  #4
        ins             v0.s[1],  wzr
        uaddlv          h0,  v0.8b
        add             x2,  x2,  #1
        br              x3
L(ipred_dc_w4):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v1.s}[0],  [x2]
        ins             v1.s[1],  wzr
        add             v0.4h,  v0.4h,  v16.4h
        uaddlv          h1,  v1.8b
        cmp             w4,  #4
        add             v0.4h,  v0.4h,  v1.4h
        ushl            v0.4h,  v0.4h,  v17.4h
        b.eq            1f
        // h = 8/16
        mov             w16, #(0x3334/2)
        movk            w16, #(0x5556/2), lsl #16
        add             w17, w4,  w4            // w17 = 2*h = 16 or 32
        lsr             w16, w16, w17
        dup             v16.4h,  w16
        sqdmulh         v0.4h,  v0.4h,  v16.4h
1:
        dup             v0.8b,  v0.b[0]
2:
        st1             {v0.s}[0],  [x0],  x1
        st1             {v0.s}[0],  [x6],  x1
        subs            w4,  w4,  #4
        st1             {v0.s}[0],  [x0],  x1
        st1             {v0.s}[0],  [x6],  x1
        b.gt            2b
        ret

L(ipred_dc_h8):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8b},  [x2],  #8
        uaddlv          h0,  v0.8b
        add             x2,  x2,  #1
        br              x3
L(ipred_dc_w8):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v1.8b},  [x2]
        add             v0.4h,  v0.4h,  v16.4h
        uaddlv          h1,  v1.8b
        cmp             w4,  #8
        add             v0.4h,  v0.4h,  v1.4h
        ushl            v0.4h,  v0.4h,  v17.4h
        b.eq            1f
        // h = 4/16/32
        cmp             w4,  #32
        mov             w16, #(0x3334/2)
        mov             w17, #(0x5556/2)
        csel            w16, w16, w17, eq
        dup             v16.4h,  w16
        sqdmulh         v0.4h,  v0.4h,  v16.4h
1:
        dup             v0.8b,  v0.b[0]
2:
        st1             {v0.8b},  [x0],  x1
        st1             {v0.8b},  [x6],  x1
        subs            w4,  w4,  #4
        st1             {v0.8b},  [x0],  x1
        st1             {v0.8b},  [x6],  x1
        b.gt            2b
        ret

L(ipred_dc_h16):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.16b},  [x2],  #16
        uaddlv          h0,  v0.16b
        add             x2,  x2,  #1
        br              x3
L(ipred_dc_w16):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v1.16b},  [x2]
        add             v0.4h,  v0.4h,  v16.4h
        uaddlv          h1,  v1.16b
        cmp             w4,  #16
        add             v0.4h,  v0.4h,  v1.4h
        ushl            v0.4h,  v0.4h,  v17.4h
        b.eq            1f
        // h = 4/8/32/64
        tst             w4,  #(32+16+8)         // 16 added to make a consecutive bitmask
        mov             w16, #(0x3334/2)
        mov             w17, #(0x5556/2)
        csel            w16, w16, w17, eq
        dup             v16.4h,  w16
        sqdmulh         v0.4h,  v0.4h,  v16.4h
1:
        dup             v0.16b,  v0.b[0]
2:
        st1             {v0.16b},  [x0],  x1
        st1             {v0.16b},  [x6],  x1
        subs            w4,  w4,  #4
        st1             {v0.16b},  [x0],  x1
        st1             {v0.16b},  [x6],  x1
        b.gt            2b
        ret

L(ipred_dc_h32):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.16b, v1.16b},  [x2],  #32
        uaddlv          h0,  v0.16b
        uaddlv          h1,  v1.16b
        add             x2,  x2,  #1
        add             v0.4h,  v0.4h,  v1.4h
        br              x3
L(ipred_dc_w32):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v1.16b, v2.16b},  [x2]
        add             v0.4h,  v0.4h,  v16.4h
        uaddlv          h1,  v1.16b
        uaddlv          h2,  v2.16b
        cmp             w4,  #32
        add             v0.4h,  v0.4h,  v1.4h
        add             v0.4h,  v0.4h,  v2.4h
        ushl            v4.4h,  v0.4h,  v17.4h
        b.eq            1f
        // h = 8/16/64
        cmp             w4,  #8
        mov             w16, #(0x3334/2)
        mov             w17, #(0x5556/2)
        csel            w16, w16, w17, eq
        dup             v16.4h,  w16
        sqdmulh         v4.4h,  v4.4h,  v16.4h
1:
        dup             v0.16b,  v4.b[0]
        dup             v1.16b,  v4.b[0]
2:
        st1             {v0.16b, v1.16b},  [x0],  x1
        st1             {v0.16b, v1.16b},  [x6],  x1
        subs            w4,  w4,  #4
        st1             {v0.16b, v1.16b},  [x0],  x1
        st1             {v0.16b, v1.16b},  [x6],  x1
        b.gt            2b
        ret

L(ipred_dc_h64):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.16b, v1.16b, v2.16b, v3.16b},  [x2],  #64
        uaddlv          h0,  v0.16b
        uaddlv          h1,  v1.16b
        uaddlv          h2,  v2.16b
        uaddlv          h3,  v3.16b
        add             v0.4h,  v0.4h,  v1.4h
        add             v2.4h,  v2.4h,  v3.4h
        add             x2,  x2,  #1
        add             v0.4h,  v0.4h,  v2.4h
        br              x3
L(ipred_dc_w64):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v1.16b, v2.16b, v3.16b, v4.16b},  [x2]
        add             v0.4h,  v0.4h,  v16.4h
        uaddlv          h1,  v1.16b
        uaddlv          h2,  v2.16b
        uaddlv          h3,  v3.16b
        uaddlv          h4,  v4.16b
        add             v1.4h,  v1.4h,  v2.4h
        add             v3.4h,  v3.4h,  v4.4h
        cmp             w4,  #64
        add             v0.4h,  v0.4h,  v1.4h
        add             v0.4h,  v0.4h,  v3.4h
        ushl            v4.4h,  v0.4h,  v17.4h
        b.eq            1f
        // h = 16/32
        mov             w16, #(0x5556/2)
        movk            w16, #(0x3334/2), lsl #16
        lsr             w16, w16, w4
        dup             v16.4h,  w16
        sqdmulh         v4.4h,  v4.4h,  v16.4h
1:
        dup             v0.16b,  v4.b[0]
        dup             v1.16b,  v4.b[0]
        dup             v2.16b,  v4.b[0]
        dup             v3.16b,  v4.b[0]
2:
        st1             {v0.16b, v1.16b, v2.16b, v3.16b},  [x0],  x1
        st1             {v0.16b, v1.16b, v2.16b, v3.16b},  [x6],  x1
        subs            w4,  w4,  #4
        st1             {v0.16b, v1.16b, v2.16b, v3.16b},  [x0],  x1
        st1             {v0.16b, v1.16b, v2.16b, v3.16b},  [x6],  x1
        b.gt            2b
        ret

L(ipred_dc_tbl):
        .hword L(ipred_dc_tbl) - L(ipred_dc_h64)
        .hword L(ipred_dc_tbl) - L(ipred_dc_h32)
        .hword L(ipred_dc_tbl) - L(ipred_dc_h16)
        .hword L(ipred_dc_tbl) - L(ipred_dc_h8)
        .hword L(ipred_dc_tbl) - L(ipred_dc_h4)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w64)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w32)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w16)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w8)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w4)
endfunc

// void ipred_paeth_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                            const pixel *const topleft,
//                            const int width, const int height, const int a,
//                            const int max_width, const int max_height);
function ipred_paeth_8bpc_neon, export=1
        clz             w9,  w3
        adr             x5,  L(ipred_paeth_tbl)
        sub             w9,  w9,  #25
        ldrh            w9,  [x5, w9, uxtw #1]
        ld1r            {v4.16b},  [x2]
        add             x8,  x2,  #1
        sub             x2,  x2,  #4
        sub             x5,  x5,  w9,  uxtw
        mov             x7,  #-4
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
        br              x5
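
// The paeth cases below implement, per output pixel,
//   base = left + top - topleft
// and select whichever of left/top/topleft is closest to base:
// (top - topleft) + left is computed with widening arithmetic and
// saturated back to 8 bits, the three absolute differences
// (ldiff/tdiff/tldiff) are compared, and the bsl/bit pair at the end
// of each iteration picks the winning source.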
40:
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v5.4s},  [x8]
        usubl           v6.8h,  v5.8b,  v4.8b   // top - topleft
4:
        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2],  x7
        zip1            v0.2s,  v0.2s,  v1.2s
        zip1            v2.2s,  v2.2s,  v3.2s
        uaddw           v16.8h,  v6.8h,  v0.8b
        uaddw           v17.8h,  v6.8h,  v2.8b
        sqxtun          v16.8b,  v16.8h         // base
        sqxtun2         v16.16b, v17.8h
        zip1            v0.2d,  v0.2d,  v2.2d
        uabd            v20.16b, v5.16b, v16.16b // tdiff
        uabd            v22.16b, v4.16b, v16.16b // tldiff
        uabd            v16.16b, v0.16b, v16.16b // ldiff
        umin            v18.16b, v20.16b, v22.16b // min(tdiff, tldiff)
        cmhs            v20.16b, v22.16b, v20.16b // tldiff >= tdiff
        cmhs            v16.16b, v18.16b, v16.16b // min(tdiff, tldiff) >= ldiff
        bsl             v20.16b, v5.16b, v4.16b  // tdiff <= tldiff ? top : topleft
        bit             v20.16b, v0.16b, v16.16b // ldiff <= min ? left : ...
        st1             {v20.s}[3],  [x0],  x1
        st1             {v20.s}[2],  [x6],  x1
        subs            w4,  w4,  #4
        st1             {v20.s}[1],  [x0],  x1
        st1             {v20.s}[0],  [x6],  x1
        b.gt            4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v5.2d},  [x8]
        usubl           v6.8h,  v5.8b,  v4.8b   // top - topleft
8:
        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2],  x7
        uaddw           v16.8h,  v6.8h,  v0.8b
        uaddw           v17.8h,  v6.8h,  v1.8b
        uaddw           v18.8h,  v6.8h,  v2.8b
        uaddw           v19.8h,  v6.8h,  v3.8b
        sqxtun          v16.8b,  v16.8h         // base
        sqxtun2         v16.16b, v17.8h
        sqxtun          v18.8b,  v18.8h
        sqxtun2         v18.16b, v19.8h
        zip1            v2.2d,  v2.2d,  v3.2d
        zip1            v0.2d,  v0.2d,  v1.2d
        uabd            v21.16b, v5.16b, v18.16b // tdiff
        uabd            v20.16b, v5.16b, v16.16b
        uabd            v23.16b, v4.16b, v18.16b // tldiff
        uabd            v22.16b, v4.16b, v16.16b
        uabd            v17.16b, v2.16b, v18.16b // ldiff
        uabd            v16.16b, v0.16b, v16.16b
        umin            v19.16b, v21.16b, v23.16b // min(tdiff, tldiff)
        umin            v18.16b, v20.16b, v22.16b
        cmhs            v21.16b, v23.16b, v21.16b // tldiff >= tdiff
        cmhs            v20.16b, v22.16b, v20.16b
        cmhs            v17.16b, v19.16b, v17.16b // min(tdiff, tldiff) >= ldiff
        cmhs            v16.16b, v18.16b, v16.16b
        bsl             v21.16b, v5.16b, v4.16b  // tdiff <= tldiff ? top : topleft
        bsl             v20.16b, v5.16b, v4.16b
        bit             v21.16b, v2.16b, v17.16b // ldiff <= min ? left : ...
        bit             v20.16b, v0.16b, v16.16b
        st1             {v21.d}[1],  [x0],  x1
        st1             {v21.d}[0],  [x6],  x1
        subs            w4,  w4,  #4
        st1             {v20.d}[1],  [x0],  x1
        st1             {v20.d}[0],  [x6],  x1
        b.gt            8b
        ret
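
// For the 16/32/64-wide cases below, four output rows are produced per
// pass (via x0, x6, x5 and x10) while the inner loop walks 16 columns
// at a time; x1 is rewritten as 4*stride - width, so adding it to each
// row pointer after a full pass advances to the next group of four rows.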
160:
320:
640:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v5.16b},  [x8],  #16
        mov             w9,  w3
        // Set up pointers for four rows in parallel; x0, x6, x5, x10
        add             x5,  x0,  x1
        add             x10, x6,  x1
        lsl             x1,  x1,  #1
        sub             x1,  x1,  w3,  uxtw
1:
        ld4r            {v0.16b, v1.16b, v2.16b, v3.16b},  [x2],  x7
2:
        usubl           v6.8h,  v5.8b,  v4.8b   // top - topleft
        usubl2          v7.8h,  v5.16b, v4.16b
        uaddw           v24.8h,  v6.8h,  v0.8b
        uaddw           v25.8h,  v7.8h,  v0.8b
        uaddw           v26.8h,  v6.8h,  v1.8b
        uaddw           v27.8h,  v7.8h,  v1.8b
        uaddw           v28.8h,  v6.8h,  v2.8b
        uaddw           v29.8h,  v7.8h,  v2.8b
        uaddw           v30.8h,  v6.8h,  v3.8b
        uaddw           v31.8h,  v7.8h,  v3.8b
        sqxtun          v17.8b,  v26.8h         // base
        sqxtun2         v17.16b, v27.8h
        sqxtun          v16.8b,  v24.8h
        sqxtun2         v16.16b, v25.8h
        sqxtun          v19.8b,  v30.8h
        sqxtun2         v19.16b, v31.8h
        sqxtun          v18.8b,  v28.8h
        sqxtun2         v18.16b, v29.8h
        uabd            v23.16b, v5.16b, v19.16b // tdiff
        uabd            v22.16b, v5.16b, v18.16b
        uabd            v21.16b, v5.16b, v17.16b
        uabd            v20.16b, v5.16b, v16.16b
        uabd            v27.16b, v4.16b, v19.16b // tldiff
        uabd            v26.16b, v4.16b, v18.16b
        uabd            v25.16b, v4.16b, v17.16b
        uabd            v24.16b, v4.16b, v16.16b
        uabd            v19.16b, v3.16b, v19.16b // ldiff
        uabd            v18.16b, v2.16b, v18.16b
        uabd            v17.16b, v1.16b, v17.16b
        uabd            v16.16b, v0.16b, v16.16b
        umin            v31.16b, v23.16b, v27.16b // min(tdiff, tldiff)
        umin            v30.16b, v22.16b, v26.16b
        umin            v29.16b, v21.16b, v25.16b
        umin            v28.16b, v20.16b, v24.16b
        cmhs            v23.16b, v27.16b, v23.16b // tldiff >= tdiff
        cmhs            v22.16b, v26.16b, v22.16b
        cmhs            v21.16b, v25.16b, v21.16b
        cmhs            v20.16b, v24.16b, v20.16b
        cmhs            v19.16b, v31.16b, v19.16b // min(tdiff, tldiff) >= ldiff
        cmhs            v18.16b, v30.16b, v18.16b
        cmhs            v17.16b, v29.16b, v17.16b
        cmhs            v16.16b, v28.16b, v16.16b
        bsl             v23.16b, v5.16b, v4.16b  // tdiff <= tldiff ? top : topleft
        bsl             v22.16b, v5.16b, v4.16b
        bsl             v21.16b, v5.16b, v4.16b
        bsl             v20.16b, v5.16b, v4.16b
        bit             v23.16b, v3.16b, v19.16b // ldiff <= min ? left : ...
        bit             v22.16b, v2.16b, v18.16b
        bit             v21.16b, v1.16b, v17.16b
        bit             v20.16b, v0.16b, v16.16b
        subs            w3,  w3,  #16
        st1             {v23.16b},  [x0],  #16
        st1             {v22.16b},  [x6],  #16
        st1             {v21.16b},  [x5],  #16
        st1             {v20.16b},  [x10], #16
        b.le            8f
        ld1             {v5.16b},  [x8],  #16
        b               2b
8:
        subs            w4,  w4,  #4
        b.le            9f
        // End of horizontal loop, move pointers to next four rows
        sub             x8,  x8,  w9,  uxtw
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        // Load the top row as early as possible
        ld1             {v5.16b},  [x8],  #16
        add             x5,  x5,  x1
        add             x10, x10, x1
        mov             w3,  w9
        b               1b
9:
        ret

L(ipred_paeth_tbl):
        .hword L(ipred_paeth_tbl) - 640b
        .hword L(ipred_paeth_tbl) - 320b
        .hword L(ipred_paeth_tbl) - 160b
        .hword L(ipred_paeth_tbl) -  80b
        .hword L(ipred_paeth_tbl) -  40b
endfunc

// void ipred_smooth_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                             const pixel *const topleft,
//                             const int width, const int height, const int a,
//                             const int max_width, const int max_height);
function ipred_smooth_8bpc_neon, export=1
        movrel          x10, X(sm_weights)
        add             x11, x10, w4,  uxtw
        add             x10, x10, w3,  uxtw
        clz             w9,  w3
        adr             x5,  L(ipred_smooth_tbl)
        sub             x12, x2,  w4,  uxtw
        sub             w9,  w9,  #25
        ldrh            w9,  [x5, w9, uxtw #1]
        ld1r            {v4.16b},  [x12]        // bottom
        add             x8,  x2,  #1
        sub             x5,  x5,  w9,  uxtw
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
        br              x5
40:
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v6.2s},  [x8]          // top
        ld1r            {v7.2s},  [x10]         // weights_hor
        sub             x2,  x2,  #4
        mov             x7,  #-4
        dup             v5.16b,  v6.b[3]        // right
        usubl           v6.8h,  v6.8b,  v4.8b   // top-bottom
        uxtl            v7.8h,  v7.8b           // weights_hor
4:
        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2],  x7 // left
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x11],  #4 // weights_ver
        shll            v20.8h,  v5.8b,  #8     // right*256
        shll            v21.8h,  v5.8b,  #8
        zip1            v1.2s,  v1.2s,  v0.2s   // left, flipped
        zip1            v0.2s,  v3.2s,  v2.2s
        zip1            v16.2s, v16.2s, v17.2s  // weights_ver
        zip1            v18.2s, v18.2s, v19.2s
        shll            v22.8h,  v4.8b,  #8     // bottom*256
        shll            v23.8h,  v4.8b,  #8
        usubl           v0.8h,  v0.8b,  v5.8b   // left-right
        usubl           v1.8h,  v1.8b,  v5.8b
        uxtl            v16.8h, v16.8b          // weights_ver
        uxtl            v18.8h, v18.8b
        mla             v20.8h,  v0.8h,  v7.8h  // right*256 + (left-right)*weights_hor
        mla             v21.8h,  v1.8h,  v7.8h
        mla             v22.8h,  v6.8h,  v16.8h // bottom*256 + (top-bottom)*weights_ver
        mla             v23.8h,  v6.8h,  v18.8h
        uhadd           v20.8h,  v20.8h,  v22.8h
        uhadd           v21.8h,  v21.8h,  v23.8h
        rshrn           v20.8b,  v20.8h,  #8
        rshrn           v21.8b,  v21.8h,  #8
        st1             {v20.s}[0],  [x0],  x1
        st1             {v20.s}[1],  [x6],  x1
        subs            w4,  w4,  #4
        st1             {v21.s}[0],  [x0],  x1
        st1             {v21.s}[1],  [x6],  x1
        b.gt            4b
        ret
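
// Rounding note for the smooth blends here: both partial sums are kept
// as pixel*256 fixed point (right*256 + (left-right)*weights_hor, and
// bottom*256 + (top-bottom)*weights_ver). uhadd halves their sum with
// truncation and rshrn #8 adds 128 before narrowing, so each output is
// (((hor + ver) >> 1) + 128) >> 8.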
80:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v6.8b},  [x8]          // top
        ld1             {v7.8b},  [x10]         // weights_hor
        sub             x2,  x2,  #4
        mov             x7,  #-4
        dup             v5.16b,  v6.b[7]        // right
        usubl           v6.8h,  v6.8b,  v4.8b   // top-bottom
        uxtl            v7.8h,  v7.8b           // weights_hor
8:
        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2],  x7 // left
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x11],  #4 // weights_ver
        shll            v20.8h,  v5.8b,  #8     // right*256
        shll            v21.8h,  v5.8b,  #8
        shll            v22.8h,  v5.8b,  #8
        shll            v23.8h,  v5.8b,  #8
        usubl           v0.8h,  v0.8b,  v5.8b   // left-right
        usubl           v1.8h,  v1.8b,  v5.8b
        usubl           v2.8h,  v2.8b,  v5.8b
        usubl           v3.8h,  v3.8b,  v5.8b
        shll            v24.8h,  v4.8b,  #8     // bottom*256
        shll            v25.8h,  v4.8b,  #8
        shll            v26.8h,  v4.8b,  #8
        shll            v27.8h,  v4.8b,  #8
        uxtl            v16.8h, v16.8b          // weights_ver
        uxtl            v17.8h, v17.8b
        uxtl            v18.8h, v18.8b
        uxtl            v19.8h, v19.8b
        mla             v20.8h,  v3.8h,  v7.8h  // right*256 + (left-right)*weights_hor
        mla             v21.8h,  v2.8h,  v7.8h  // (left flipped)
        mla             v22.8h,  v1.8h,  v7.8h
        mla             v23.8h,  v0.8h,  v7.8h
        mla             v24.8h,  v6.8h,  v16.8h // bottom*256 + (top-bottom)*weights_ver
        mla             v25.8h,  v6.8h,  v17.8h
        mla             v26.8h,  v6.8h,  v18.8h
        mla             v27.8h,  v6.8h,  v19.8h
        uhadd           v20.8h,  v20.8h,  v24.8h
        uhadd           v21.8h,  v21.8h,  v25.8h
        uhadd           v22.8h,  v22.8h,  v26.8h
        uhadd           v23.8h,  v23.8h,  v27.8h
        rshrn           v20.8b,  v20.8h,  #8
        rshrn           v21.8b,  v21.8h,  #8
        rshrn           v22.8b,  v22.8h,  #8
        rshrn           v23.8b,  v23.8h,  #8
        st1             {v20.8b},  [x0],  x1
        st1             {v21.8b},  [x6],  x1
        subs            w4,  w4,  #4
        st1             {v22.8b},  [x0],  x1
        st1             {v23.8b},  [x6],  x1
        b.gt            8b
        ret
160:
320:
640:
        AARCH64_VALID_JUMP_TARGET
        add             x12, x2,  w3,  uxtw
        sub             x2,  x2,  #2
        mov             x7,  #-2
        ld1r            {v5.16b},  [x12]        // right
        sub             x1,  x1,  w3,  uxtw
        mov             w9,  w3

1:
        ld2r            {v0.8b, v1.8b},  [x2],  x7 // left
        ld2r            {v16.8b, v17.8b},  [x11],  #2 // weights_ver
        usubl           v0.8h,  v0.8b,  v5.8b   // left-right
        usubl           v1.8h,  v1.8b,  v5.8b
        uxtl            v16.8h, v16.8b          // weights_ver
        uxtl            v17.8h, v17.8b
2:
        ld1             {v7.16b},  [x10],  #16  // weights_hor
        ld1             {v3.16b},  [x8],  #16   // top
        shll            v20.8h,  v5.8b,  #8     // right*256
        shll            v21.8h,  v5.8b,  #8
        shll            v22.8h,  v5.8b,  #8
        shll            v23.8h,  v5.8b,  #8
        uxtl            v6.8h,  v7.8b           // weights_hor
        uxtl2           v7.8h,  v7.16b
        usubl           v2.8h,  v3.8b,  v4.8b   // top-bottom
        usubl2          v3.8h,  v3.16b, v4.16b
        mla             v20.8h,  v1.8h,  v6.8h  // right*256 + (left-right)*weights_hor
        mla             v21.8h,  v1.8h,  v7.8h  // (left flipped)
        mla             v22.8h,  v0.8h,  v6.8h
        mla             v23.8h,  v0.8h,  v7.8h
        shll            v24.8h,  v4.8b,  #8     // bottom*256
        shll            v25.8h,  v4.8b,  #8
        shll            v26.8h,  v4.8b,  #8
        shll            v27.8h,  v4.8b,  #8
        mla             v24.8h,  v2.8h,  v16.8h // bottom*256 + (top-bottom)*weights_ver
        mla             v25.8h,  v3.8h,  v16.8h
        mla             v26.8h,  v2.8h,  v17.8h
        mla             v27.8h,  v3.8h,  v17.8h
        uhadd           v20.8h,  v20.8h,  v24.8h
        uhadd           v21.8h,  v21.8h,  v25.8h
        uhadd           v22.8h,  v22.8h,  v26.8h
        uhadd           v23.8h,  v23.8h,  v27.8h
        rshrn           v20.8b,  v20.8h,  #8
        rshrn2          v20.16b, v21.8h,  #8
        rshrn           v22.8b,  v22.8h,  #8
        rshrn2          v22.16b, v23.8h,  #8
        subs            w3,  w3,  #16
        st1             {v20.16b},  [x0],  #16
        st1             {v22.16b},  [x6],  #16
        b.gt            2b
        subs            w4,  w4,  #2
        b.le            9f
        sub             x8,  x8,  w9,  uxtw
        sub             x10, x10, w9,  uxtw
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        mov             w3,  w9
        b               1b
9:
        ret

L(ipred_smooth_tbl):
        .hword L(ipred_smooth_tbl) - 640b
        .hword L(ipred_smooth_tbl) - 320b
        .hword L(ipred_smooth_tbl) - 160b
        .hword L(ipred_smooth_tbl) -  80b
        .hword L(ipred_smooth_tbl) -  40b
endfunc

// void ipred_smooth_v_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                               const pixel *const topleft,
//                               const int width, const int height, const int a,
//                               const int max_width, const int max_height);
function ipred_smooth_v_8bpc_neon, export=1
        movrel          x7,  X(sm_weights)
        add             x7,  x7,  w4,  uxtw
        clz             w9,  w3
        adr             x5,  L(ipred_smooth_v_tbl)
        sub             x8,  x2,  w4,  uxtw
        sub             w9,  w9,  #25
        ldrh            w9,  [x5, w9, uxtw #1]
        ld1r            {v4.16b},  [x8]         // bottom
        add             x2,  x2,  #1
        sub             x5,  x5,  w9,  uxtw
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
        br              x5
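
// The smooth weights come from the shared X(sm_weights) table, which
// stores one row of weights per block dimension beginning at offset n
// for an n-pixel edge; hence the sm_weights + width (weights_hor) and
// sm_weights + height (weights_ver) pointer setup in these prologues.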
40:
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v6.2s},  [x2]          // top
        usubl           v6.8h,  v6.8b,  v4.8b   // top-bottom
4:
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x7],  #4 // weights_ver
        shll            v22.8h,  v4.8b,  #8     // bottom*256
        shll            v23.8h,  v4.8b,  #8
        zip1            v16.2s, v16.2s, v17.2s  // weights_ver
        zip1            v18.2s, v18.2s, v19.2s
        uxtl            v16.8h, v16.8b          // weights_ver
        uxtl            v18.8h, v18.8b
        mla             v22.8h,  v6.8h,  v16.8h // bottom*256 + (top-bottom)*weights_ver
        mla             v23.8h,  v6.8h,  v18.8h
        rshrn           v22.8b,  v22.8h,  #8
        rshrn           v23.8b,  v23.8h,  #8
        st1             {v22.s}[0],  [x0],  x1
        st1             {v22.s}[1],  [x6],  x1
        subs            w4,  w4,  #4
        st1             {v23.s}[0],  [x0],  x1
        st1             {v23.s}[1],  [x6],  x1
        b.gt            4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v6.8b},  [x2]          // top
        usubl           v6.8h,  v6.8b,  v4.8b   // top-bottom
8:
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x7],  #4 // weights_ver
        shll            v24.8h,  v4.8b,  #8     // bottom*256
        shll            v25.8h,  v4.8b,  #8
        shll            v26.8h,  v4.8b,  #8
        shll            v27.8h,  v4.8b,  #8
        uxtl            v16.8h, v16.8b          // weights_ver
        uxtl            v17.8h, v17.8b
        uxtl            v18.8h, v18.8b
        uxtl            v19.8h, v19.8b
        mla             v24.8h,  v6.8h,  v16.8h // bottom*256 + (top-bottom)*weights_ver
        mla             v25.8h,  v6.8h,  v17.8h
        mla             v26.8h,  v6.8h,  v18.8h
        mla             v27.8h,  v6.8h,  v19.8h
        rshrn           v24.8b,  v24.8h,  #8
        rshrn           v25.8b,  v25.8h,  #8
        rshrn           v26.8b,  v26.8h,  #8
        rshrn           v27.8b,  v27.8h,  #8
        st1             {v24.8b},  [x0],  x1
        st1             {v25.8b},  [x6],  x1
        subs            w4,  w4,  #4
        st1             {v26.8b},  [x0],  x1
        st1             {v27.8b},  [x6],  x1
        b.gt            8b
        ret
160:
320:
640:
        AARCH64_VALID_JUMP_TARGET
        // Set up pointers for four rows in parallel; x0, x6, x5, x8
        add             x5,  x0,  x1
        add             x8,  x6,  x1
        lsl             x1,  x1,  #1
        sub             x1,  x1,  w3,  uxtw
        mov             w9,  w3

1:
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x7],  #4 // weights_ver
        uxtl            v16.8h, v16.8b          // weights_ver
        uxtl            v17.8h, v17.8b
        uxtl            v18.8h, v18.8b
        uxtl            v19.8h, v19.8b
2:
        ld1             {v3.16b},  [x2],  #16   // top
        shll            v20.8h,  v4.8b,  #8     // bottom*256
        shll            v21.8h,  v4.8b,  #8
        shll            v22.8h,  v4.8b,  #8
        shll            v23.8h,  v4.8b,  #8
        shll            v24.8h,  v4.8b,  #8
        shll            v25.8h,  v4.8b,  #8
        shll            v26.8h,  v4.8b,  #8
        shll            v27.8h,  v4.8b,  #8
        usubl           v2.8h,  v3.8b,  v4.8b   // top-bottom
        usubl2          v3.8h,  v3.16b, v4.16b
        mla             v20.8h,  v2.8h,  v16.8h // bottom*256 + (top-bottom)*weights_ver
        mla             v21.8h,  v3.8h,  v16.8h
        mla             v22.8h,  v2.8h,  v17.8h
        mla             v23.8h,  v3.8h,  v17.8h
        mla             v24.8h,  v2.8h,  v18.8h
        mla             v25.8h,  v3.8h,  v18.8h
        mla             v26.8h,  v2.8h,  v19.8h
        mla             v27.8h,  v3.8h,  v19.8h
        rshrn           v20.8b,  v20.8h,  #8
        rshrn2          v20.16b, v21.8h,  #8
        rshrn           v22.8b,  v22.8h,  #8
        rshrn2          v22.16b, v23.8h,  #8
        rshrn           v24.8b,  v24.8h,  #8
        rshrn2          v24.16b, v25.8h,  #8
        rshrn           v26.8b,  v26.8h,  #8
        rshrn2          v26.16b, v27.8h,  #8
        subs            w3,  w3,  #16
        st1             {v20.16b},  [x0],  #16
        st1             {v22.16b},  [x6],  #16
        st1             {v24.16b},  [x5],  #16
        st1             {v26.16b},  [x8],  #16
        b.gt            2b
        subs            w4,  w4,  #4
        b.le            9f
        sub             x2,  x2,  w9,  uxtw
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        add             x5,  x5,  x1
        add             x8,  x8,  x1
        mov             w3,  w9
        b               1b
9:
        ret

L(ipred_smooth_v_tbl):
        .hword L(ipred_smooth_v_tbl) - 640b
        .hword L(ipred_smooth_v_tbl) - 320b
        .hword L(ipred_smooth_v_tbl) - 160b
        .hword L(ipred_smooth_v_tbl) -  80b
        .hword L(ipred_smooth_v_tbl) -  40b
endfunc

// void ipred_smooth_h_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                               const pixel *const topleft,
//                               const int width, const int height, const int a,
//                               const int max_width, const int max_height);
function ipred_smooth_h_8bpc_neon, export=1
        movrel          x8,  X(sm_weights)
        add             x8,  x8,  w3,  uxtw
        clz             w9,  w3
        adr             x5,  L(ipred_smooth_h_tbl)
        add             x12, x2,  w3,  uxtw
        sub             w9,  w9,  #25
        ldrh            w9,  [x5, w9, uxtw #1]
        ld1r            {v5.16b},  [x12]        // right
        sub             x5,  x5,  w9,  uxtw
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
        br              x5
40:
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v7.2s},  [x8]          // weights_hor
        sub             x2,  x2,  #4
        mov             x7,  #-4
        uxtl            v7.8h,  v7.8b           // weights_hor
4:
        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2],  x7 // left
        shll            v20.8h,  v5.8b,  #8     // right*256
        shll            v21.8h,  v5.8b,  #8
        zip1            v1.2s,  v1.2s,  v0.2s   // left, flipped
        zip1            v0.2s,  v3.2s,  v2.2s
        usubl           v0.8h,  v0.8b,  v5.8b   // left-right
        usubl           v1.8h,  v1.8b,  v5.8b
        mla             v20.8h,  v0.8h,  v7.8h  // right*256 + (left-right)*weights_hor
        mla             v21.8h,  v1.8h,  v7.8h
        rshrn           v20.8b,  v20.8h,  #8
        rshrn           v21.8b,  v21.8h,  #8
        st1             {v20.s}[0],  [x0],  x1
        st1             {v20.s}[1],  [x6],  x1
        subs            w4,  w4,  #4
        st1             {v21.s}[0],  [x0],  x1
        st1             {v21.s}[1],  [x6],  x1
        b.gt            4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v7.8b},  [x8]          // weights_hor
        sub             x2,  x2,  #4
        mov             x7,  #-4
        uxtl            v7.8h,  v7.8b           // weights_hor
8:
        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2],  x7 // left
        shll            v20.8h,  v5.8b,  #8     // right*256
        shll            v21.8h,  v5.8b,  #8
        shll            v22.8h,  v5.8b,  #8
        shll            v23.8h,  v5.8b,  #8
        usubl           v3.8h,  v3.8b,  v5.8b   // left-right
        usubl           v2.8h,  v2.8b,  v5.8b
        usubl           v1.8h,  v1.8b,  v5.8b
        usubl           v0.8h,  v0.8b,  v5.8b
        mla             v20.8h,  v3.8h,  v7.8h  // right*256 + (left-right)*weights_hor
        mla             v21.8h,  v2.8h,  v7.8h  // (left flipped)
        mla             v22.8h,  v1.8h,  v7.8h
        mla             v23.8h,  v0.8h,  v7.8h
        rshrn           v20.8b,  v20.8h,  #8
        rshrn           v21.8b,  v21.8h,  #8
        rshrn           v22.8b,  v22.8h,  #8
        rshrn           v23.8b,  v23.8h,  #8
        st1             {v20.8b},  [x0],  x1
        st1             {v21.8b},  [x6],  x1
        subs            w4,  w4,  #4
        st1             {v22.8b},  [x0],  x1
        st1             {v23.8b},  [x6],  x1
        b.gt            8b
        ret
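
// As in ipred_h, the left pixels above are fetched with ld4r and a -4
// stride, so v0..v3 hold left[y+3]..left[y] ("left, flipped"): the mla
// for the topmost row of each group uses v3, while the 4-wide path
// instead reorders the registers with zip1 before multiplying.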
160:
320:
640:
        AARCH64_VALID_JUMP_TARGET
        sub             x2,  x2,  #4
        mov             x7,  #-4
        // Set up pointers for four rows in parallel; x0, x6, x5, x10
        add             x5,  x0,  x1
        add             x10, x6,  x1
        lsl             x1,  x1,  #1
        sub             x1,  x1,  w3,  uxtw
        mov             w9,  w3

1:
        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2],  x7 // left
        usubl           v0.8h,  v0.8b,  v5.8b   // left-right
        usubl           v1.8h,  v1.8b,  v5.8b
        usubl           v2.8h,  v2.8b,  v5.8b
        usubl           v3.8h,  v3.8b,  v5.8b
2:
        ld1             {v7.16b},  [x8],  #16   // weights_hor
        shll            v20.8h,  v5.8b,  #8     // right*256
        shll            v21.8h,  v5.8b,  #8
        shll            v22.8h,  v5.8b,  #8
        shll            v23.8h,  v5.8b,  #8
        shll            v24.8h,  v5.8b,  #8
        shll            v25.8h,  v5.8b,  #8
        shll            v26.8h,  v5.8b,  #8
        shll            v27.8h,  v5.8b,  #8
        uxtl            v6.8h,  v7.8b           // weights_hor
        uxtl2           v7.8h,  v7.16b
        mla             v20.8h,  v3.8h,  v6.8h  // right*256 + (left-right)*weights_hor
        mla             v21.8h,  v3.8h,  v7.8h  // (left flipped)
        mla             v22.8h,  v2.8h,  v6.8h
        mla             v23.8h,  v2.8h,  v7.8h
        mla             v24.8h,  v1.8h,  v6.8h
        mla             v25.8h,  v1.8h,  v7.8h
        mla             v26.8h,  v0.8h,  v6.8h
        mla             v27.8h,  v0.8h,  v7.8h
        rshrn           v20.8b,  v20.8h,  #8
        rshrn2          v20.16b, v21.8h,  #8
        rshrn           v22.8b,  v22.8h,  #8
        rshrn2          v22.16b, v23.8h,  #8
        rshrn           v24.8b,  v24.8h,  #8
        rshrn2          v24.16b, v25.8h,  #8
        rshrn           v26.8b,  v26.8h,  #8
        rshrn2          v26.16b, v27.8h,  #8
        subs            w3,  w3,  #16
        st1             {v20.16b},  [x0],  #16
        st1             {v22.16b},  [x6],  #16
        st1             {v24.16b},  [x5],  #16
        st1             {v26.16b},  [x10], #16
        b.gt            2b
        subs            w4,  w4,  #4
        b.le            9f
        sub             x8,  x8,  w9,  uxtw
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        add             x5,  x5,  x1
        add             x10, x10, x1
        mov             w3,  w9
        b               1b
9:
        ret

L(ipred_smooth_h_tbl):
        .hword L(ipred_smooth_h_tbl) - 640b
        .hword L(ipred_smooth_h_tbl) - 320b
        .hword L(ipred_smooth_h_tbl) - 160b
        .hword L(ipred_smooth_h_tbl) -  80b
        .hword L(ipred_smooth_h_tbl) -  40b
endfunc

const padding_mask_buf
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
padding_mask:
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
endconst

// void ipred_z1_upsample_edge_8bpc_neon(pixel *out, const int hsz,
//                                       const pixel *const in, const int end);
function ipred_z1_upsample_edge_8bpc_neon, export=1
        movrel          x4,  padding_mask
        ld1             {v0.16b},  [x2]         // in[]
        add             x5,  x2,  w3,  uxtw     // in[end]
        sub             x4,  x4,  w3,  uxtw

        ld1r            {v1.16b},  [x5]         // padding
        ld1             {v3.16b},  [x4]         // padding_mask

        movi            v31.8h,  #9

        bit             v0.16b,  v1.16b,  v3.16b // padded in[]

        ext             v4.16b,  v0.16b,  v1.16b,  #1
        ext             v5.16b,  v0.16b,  v1.16b,  #2
        ext             v6.16b,  v0.16b,  v1.16b,  #3

        uaddl           v16.8h,  v4.8b,  v5.8b  // in[i+1] + in[i+2]
        uaddl2          v17.8h,  v4.16b, v5.16b
        uaddl           v18.8h,  v0.8b,  v6.8b  // in[i+0] + in[i+3]
        uaddl2          v19.8h,  v0.16b, v6.16b
        mul             v16.8h,  v16.8h, v31.8h // 9*(in[i+1] + in[i+2])
        mul             v17.8h,  v17.8h, v31.8h
        sub             v16.8h,  v16.8h, v18.8h
        sub             v17.8h,  v17.8h, v19.8h

        sqrshrun        v16.8b,  v16.8h,  #4
        sqrshrun2       v16.16b, v17.8h,  #4

        zip1            v0.16b,  v4.16b,  v16.16b
        zip2            v1.16b,  v4.16b,  v16.16b

        st1             {v0.16b, v1.16b},  [x0]

        ret
endfunc
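
// How the padding_mask trick above works, as a worked example: the mask
// is loaded from padding_mask - end, so lanes < end read the trailing
// 0x00 bytes of padding_mask_buf and lanes >= end read 0xff; e.g.
// end == 4 gives 4 zeros followed by 12 0xff bytes, and bit then
// replaces exactly the out-of-range lanes of in[] with the replicated
// last valid pixel. The upsampler itself is the spec's (-1,9,9,-1)/16
// kernel, with sqrshrun providing the rounding and clamping, and
// zip1/zip2 interleaving the filtered samples with the original ones.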

// void ipred_z2_upsample_edge_8bpc_neon(pixel *out, const int sz,
//                                       const pixel *const in);
function ipred_z2_upsample_edge_8bpc_neon, export=1
        // Here, sz is 4 or 8, and we produce 2*sz+1 output elements.
        movrel          x4,  padding_mask
        ld1             {v0.16b},  [x2]         // in[]
        add             x5,  x2,  w1,  uxtw     // in[sz]
        sub             x4,  x4,  w1,  uxtw

        ld1r            {v2.16b},  [x2]         // in[0] for padding
        ld1r            {v1.16b},  [x5]         // padding
        ld1             {v3.16b},  [x4]         // padding_mask

        movi            v31.8h,  #9

        bit             v0.16b,  v1.16b,  v3.16b // padded in[]

        ext             v4.16b,  v2.16b,  v0.16b,  #15
        ext             v5.16b,  v0.16b,  v1.16b,  #1
        ext             v6.16b,  v0.16b,  v1.16b,  #2

        uaddl           v16.8h,  v0.8b,  v5.8b  // in[i+0] + in[i+1]
        uaddl           v18.8h,  v4.8b,  v6.8b  // in[i-1] + in[i+2]
        mul             v16.8h,  v16.8h, v31.8h // 9*(in[i+0] + in[i+1])
        sub             v16.8h,  v16.8h, v18.8h

        sqrshrun        v16.8b,  v16.8h,  #4

        add             x5,  x0,  #16

        zip1            v2.16b,  v0.16b,  v16.16b

        // In case sz=8, output one single pixel in out[16].
        st1             {v1.b}[0],  [x5]
        st1             {v2.16b},  [x0]

        ret
endfunc

const edge_filter
        .byte 0, 4, 8, 0
        .byte 0, 5, 6, 0
// Leaving out the coeffs for strength=3
// .byte 2, 4, 4, 0
endconst

// void ipred_z1_filter_edge_8bpc_neon(pixel *out, const int sz,
//                                     const pixel *const in, const int end,
//                                     const int strength);
function ipred_z1_filter_edge_8bpc_neon, export=1
        cmp             w4,  #3
        b.eq            L(fivetap)              // if (strength == 3) goto fivetap

        movrel          x5,  edge_filter,  -3
        add             x5,  x5,  w4,  uxtw #2  // edge_filter + (strength - 1)*4 + 1

        ld1             {v31.h}[0],  [x5]       // kernel[1-2]

        ld1             {v0.16b},  [x2],  #16

        dup             v30.16b,  v31.b[0]
        dup             v31.16b,  v31.b[1]
1:
        // in[end] is the last valid pixel. We produce 16 pixels out by
        // using 18 pixels in - the last pixel used is [17] of the ones
        // read/buffered.
        cmp             w3,  #17
        ld1             {v1.16b},  [x2],  #16
        b.lt            2f
        ext             v2.16b,  v0.16b,  v1.16b,  #1
        ext             v3.16b,  v0.16b,  v1.16b,  #2
        umull           v4.8h,  v0.8b,  v30.8b
        umlal           v4.8h,  v2.8b,  v31.8b
        umlal           v4.8h,  v3.8b,  v30.8b
        umull2          v5.8h,  v0.16b, v30.16b
        umlal2          v5.8h,  v2.16b, v31.16b
        umlal2          v5.8h,  v3.16b, v30.16b
        subs            w1,  w1,  #16
        mov             v0.16b,  v1.16b
        rshrn           v4.8b,  v4.8h,  #4
        rshrn2          v4.16b,  v5.8h,  #4
        sub             w3,  w3,  #16
        st1             {v4.16b},  [x0],  #16
        b.gt            1b
        ret
2:
        // Right padding

        // x2[w3-32] is the padding pixel (x2 points 32 bytes ahead)
        movrel          x5,  padding_mask
        sub             w6,  w3,  #32
        sub             x5,  x5,  w3,  uxtw
        add             x6,  x2,  w6,  sxtw

        ld1             {v2.16b},  [x5]         // padding_mask

        ld1r            {v1.16b},  [x6]
        bit             v0.16b,  v1.16b,  v2.16b // Pad v0-v1

        // Filter one block
        ext             v2.16b,  v0.16b,  v1.16b,  #1
        ext             v3.16b,  v0.16b,  v1.16b,  #2
        umull           v4.8h,  v0.8b,  v30.8b
        umlal           v4.8h,  v2.8b,  v31.8b
        umlal           v4.8h,  v3.8b,  v30.8b
        umull2          v5.8h,  v0.16b, v30.16b
        umlal2          v5.8h,  v2.16b, v31.16b
        umlal2          v5.8h,  v3.16b, v30.16b
        subs            w1,  w1,  #16
        rshrn           v4.8b,  v4.8h,  #4
        rshrn2          v4.16b,  v5.8h,  #4
        st1             {v4.16b},  [x0],  #16
        b.le            9f
5:
        // After one block, any remaining output would only be filtering
        // padding - thus just store the padding.
        subs            w1,  w1,  #16
        st1             {v1.16b},  [x0],  #16
        b.gt            5b
9:
        ret
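
// Kernel note: the AV1 intra edge filter uses the symmetric kernels
// (0,4,8,4,0)/16 for strength 1 and (0,5,6,5,0)/16 for strength 2, so
// only the two distinct taps per kernel are kept in edge_filter and a
// three-tap filter suffices above. Strength 3, (2,4,4,4,2)/16, is
// genuinely five-tap and is handled below with taps 2/4/4 in
// v29/v30/v31.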

L(fivetap):
        sub             x2,  x2,  #1            // topleft -= 1
        movi            v29.16b,  #2
        ld1             {v0.16b},  [x2],  #16
        movi            v30.16b,  #4
        movi            v31.16b,  #4
        ins             v0.b[0],  v0.b[1]
1:
        // in[end+1] is the last valid pixel. We produce 16 pixels out by
        // using 20 pixels in - the last pixel used is [19] of the ones
        // read/buffered.
        cmp             w3,  #18
        ld1             {v1.16b},  [x2],  #16
        b.lt            2f                      // if (end + 1 < 19)
        ext             v2.16b,  v0.16b,  v1.16b,  #1
        ext             v3.16b,  v0.16b,  v1.16b,  #2
        ext             v4.16b,  v0.16b,  v1.16b,  #3
        ext             v5.16b,  v0.16b,  v1.16b,  #4
        umull           v6.8h,  v0.8b,  v29.8b
        umlal           v6.8h,  v2.8b,  v30.8b
        umlal           v6.8h,  v3.8b,  v31.8b
        umlal           v6.8h,  v4.8b,  v30.8b
        umlal           v6.8h,  v5.8b,  v29.8b
        umull2          v7.8h,  v0.16b, v29.16b
        umlal2          v7.8h,  v2.16b, v30.16b
        umlal2          v7.8h,  v3.16b, v31.16b
        umlal2          v7.8h,  v4.16b, v30.16b
        umlal2          v7.8h,  v5.16b, v29.16b
        subs            w1,  w1,  #16
        mov             v0.16b,  v1.16b
        rshrn           v6.8b,  v6.8h,  #4
        rshrn2          v6.16b,  v7.8h,  #4
        sub             w3,  w3,  #16
        st1             {v6.16b},  [x0],  #16
        b.gt            1b
        ret
2:
        // Right padding

        // x2[w3+1-32] is the padding pixel (x2 points 32 bytes ahead)
        movrel          x5,  padding_mask,  -1
        sub             w6,  w3,  #31
        sub             x5,  x5,  w3,  uxtw
        add             x6,  x2,  w6,  sxtw

        ld1             {v2.16b, v3.16b},  [x5] // padding_mask

        ld1r            {v28.16b},  [x6]
        bit             v0.16b,  v28.16b,  v2.16b // Pad v0-v1
        bit             v1.16b,  v28.16b,  v3.16b
4:
        // Filter one block
        ext             v2.16b,  v0.16b,  v1.16b,  #1
        ext             v3.16b,  v0.16b,  v1.16b,  #2
        ext             v4.16b,  v0.16b,  v1.16b,  #3
        ext             v5.16b,  v0.16b,  v1.16b,  #4
        umull           v6.8h,  v0.8b,  v29.8b
        umlal           v6.8h,  v2.8b,  v30.8b
        umlal           v6.8h,  v3.8b,  v31.8b
        umlal           v6.8h,  v4.8b,  v30.8b
        umlal           v6.8h,  v5.8b,  v29.8b
        umull2          v7.8h,  v0.16b, v29.16b
        umlal2          v7.8h,  v2.16b, v30.16b
        umlal2          v7.8h,  v3.16b, v31.16b
        umlal2          v7.8h,  v4.16b, v30.16b
        umlal2          v7.8h,  v5.16b, v29.16b
        subs            w1,  w1,  #16
        mov             v0.16b,  v1.16b
        mov             v1.16b,  v28.16b
        rshrn           v6.8b,  v6.8h,  #4
        rshrn2          v6.16b,  v7.8h,  #4
        sub             w3,  w3,  #16
        st1             {v6.16b},  [x0],  #16
        b.le            9f
        // v0-v1[w3+1] is the last valid pixel; if (w3 + 1 > 0) we need to
        // filter properly once more - aka (w3 >= 0).
        cmp             w3,  #0
        b.ge            4b
5:
        // When w3 <= 0, all remaining pixels in v0-v1 are equal to the
        // last valid pixel - thus just output that without filtering.
        subs            w1,  w1,  #16
        st1             {v1.16b},  [x0],  #16
        b.gt            5b
9:
        ret
endfunc

// void ipred_pixel_set_8bpc_neon(pixel *out, const pixel px,
//                                const int n);
function ipred_pixel_set_8bpc_neon, export=1
        dup             v0.16b,  w1
1:
        subs            w2,  w2,  #16
        st1             {v0.16b},  [x0],  #16
        b.gt            1b
        ret
endfunc

// void ipred_z1_fill1_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                               const pixel *const top,
//                               const int width, const int height,
//                               const int dx, const int max_base_x);
function ipred_z1_fill1_8bpc_neon, export=1
        clz             w9,  w3
        adr             x8,  L(ipred_z1_fill1_tbl)
        sub             w9,  w9,  #25
        ldrh            w9,  [x8, w9, uxtw #1]
        add             x10, x2,  w6,  uxtw     // top[max_base_x]
        sub             x8,  x8,  w9,  uxtw
        ld1r            {v31.16b},  [x10]       // padding
        mov             w7,  w5
        mov             w15, #64
        br              x8
40:
        AARCH64_VALID_JUMP_TARGET
4:
        lsr             w8,  w7,  #6            // base
        and             w9,  w7,  #0x3e         // frac
        add             w7,  w7,  w5            // xpos += dx
        cmp             w8,  w6                 // base >= max_base_x
        lsr             w10, w7,  #6            // base
        and             w11, w7,  #0x3e         // frac
        b.ge            49f
        ldr             d0,  [x2, w8, uxtw]     // top[base]
        ldr             d2,  [x2, w10, uxtw]
        dup             v4.4h,  w9              // frac
        dup             v5.4h,  w11
        ext             v1.8b,  v0.8b,  v0.8b,  #1 // top[base+1]
        ext             v3.8b,  v2.8b,  v2.8b,  #1
        usubl           v6.8h,  v1.8b,  v0.8b   // top[base+1]-top[base]
        usubl           v7.8h,  v3.8b,  v2.8b
        ushll           v16.8h,  v0.8b,  #6     // top[base]*64
        ushll           v17.8h,  v2.8b,  #6
        mla             v16.4h,  v6.4h,  v4.4h  // + top[base+1]*frac
        mla             v17.4h,  v7.4h,  v5.4h
        rshrn           v16.8b,  v16.8h,  #6
        rshrn           v17.8b,  v17.8h,  #6
        st1             {v16.s}[0],  [x0],  x1
        add             w7,  w7,  w5            // xpos += dx
        subs            w4,  w4,  #2
        st1             {v17.s}[0],  [x0],  x1
        b.gt            4b
        ret

49:
        st1             {v31.s}[0],  [x0],  x1
        subs            w4,  w4,  #2
        st1             {v31.s}[0],  [x0],  x1
        b.gt            49b
        ret

80:
        AARCH64_VALID_JUMP_TARGET
8:
        lsr             w8,  w7,  #6            // base
        and             w9,  w7,  #0x3e         // frac
        add             w7,  w7,  w5            // xpos += dx
        cmp             w8,  w6                 // base >= max_base_x
        lsr             w10, w7,  #6            // base
        and             w11, w7,  #0x3e         // frac
        b.ge            89f
        ldr             q0,  [x2, w8, uxtw]     // top[base]
        ldr             q2,  [x2, w10, uxtw]
        dup             v4.8b,  w9              // frac
        dup             v5.8b,  w11
        sub             w9,  w15, w9            // 64 - frac
        sub             w11, w15, w11
        dup             v6.8b,  w9              // 64 - frac
        dup             v7.8b,  w11
        ext             v1.16b,  v0.16b,  v0.16b,  #1 // top[base+1]
        ext             v3.16b,  v2.16b,  v2.16b,  #1
        umull           v16.8h,  v0.8b,  v6.8b  // top[base]*(64-frac)
        umlal           v16.8h,  v1.8b,  v4.8b  // + top[base+1]*frac
        umull           v17.8h,  v2.8b,  v7.8b
        umlal           v17.8h,  v3.8b,  v5.8b
        rshrn           v16.8b,  v16.8h,  #6
        rshrn           v17.8b,  v17.8h,  #6
        st1             {v16.8b},  [x0],  x1
        add             w7,  w7,  w5            // xpos += dx
        subs            w4,  w4,  #2
        st1             {v17.8b},  [x0],  x1
        b.gt            8b
        ret

89:
        st1             {v31.8b},  [x0],  x1
        subs            w4,  w4,  #2
        st1             {v31.8b},  [x0],  x1
        b.gt            89b
        ret
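
// Positioning math for the z1 fills, as a worked example: xpos is 6.6
// fixed point and advances by dx per row. base = xpos >> 6 indexes
// top[], frac = xpos & 0x3e is the blend weight, and each output pixel
// is (top[base]*(64-frac) + top[base+1]*frac + 32) >> 6; rows with
// base >= max_base_x just store the padding pixel kept in v31.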
160:
320:
640:
        AARCH64_VALID_JUMP_TARGET

        mov             w12, w3

        add             x13, x0,  x1
        lsl             x1,  x1,  #1
        sub             x1,  x1,  w3,  uxtw
1:
        lsr             w8,  w7,  #6            // base
        and             w9,  w7,  #0x3e         // frac
        add             w7,  w7,  w5            // xpos += dx
        cmp             w8,  w6                 // base >= max_base_x
        lsr             w10, w7,  #6            // base
        and             w11, w7,  #0x3e         // frac
        b.ge            169f
        add             x8,  x2,  w8,  uxtw
        add             x10, x2,  w10, uxtw
        dup             v4.16b,  w9             // frac
        dup             v5.16b,  w11
        ld1             {v0.16b, v1.16b},  [x8],  #32 // top[base]
        ld1             {v2.16b, v3.16b},  [x10], #32
        sub             w9,  w15, w9            // 64 - frac
        sub             w11, w15, w11
        dup             v6.16b,  w9             // 64 - frac
        dup             v7.16b,  w11
        add             w7,  w7,  w5            // xpos += dx
2:
        ext             v16.16b,  v0.16b,  v1.16b,  #1 // top[base+1]
        ext             v17.16b,  v2.16b,  v3.16b,  #1
        subs            w3,  w3,  #16
        umull           v18.8h,  v0.8b,  v6.8b  // top[base]*(64-frac)
        umlal           v18.8h,  v16.8b, v4.8b  // + top[base+1]*frac
        umull2          v19.8h,  v0.16b, v6.16b
        umlal2          v19.8h,  v16.16b, v4.16b
        umull           v20.8h,  v2.8b,  v7.8b
        umlal           v20.8h,  v17.8b, v5.8b
        umull2          v21.8h,  v2.16b, v7.16b
        umlal2          v21.8h,  v17.16b, v5.16b
        rshrn           v16.8b,  v18.8h,  #6
        rshrn2          v16.16b, v19.8h,  #6
        rshrn           v17.8b,  v20.8h,  #6
        rshrn2          v17.16b, v21.8h,  #6
        st1             {v16.16b},  [x0],  #16
        st1             {v17.16b},  [x13], #16
        b.le            3f
        mov             v0.16b,  v1.16b
        ld1             {v1.16b},  [x8],  #16   // top[base]
        mov             v2.16b,  v3.16b
        ld1             {v3.16b},  [x10], #16
        b               2b

3:
        subs            w4,  w4,  #2
        b.le            9f
        add             x0,  x0,  x1
        add             x13, x13, x1
        mov             w3,  w12
        b               1b
9:
        ret

169:
        st1             {v31.16b},  [x0],  #16
        subs            w3,  w3,  #16
        st1             {v31.16b},  [x13], #16
        b.gt            169b
        subs            w4,  w4,  #2
        b.le            9b
        add             x0,  x0,  x1
        add             x13, x13, x1
        mov             w3,  w12
        b               169b

L(ipred_z1_fill1_tbl):
        .hword L(ipred_z1_fill1_tbl) - 640b
        .hword L(ipred_z1_fill1_tbl) - 320b
        .hword L(ipred_z1_fill1_tbl) - 160b
        .hword L(ipred_z1_fill1_tbl) -  80b
        .hword L(ipred_z1_fill1_tbl) -  40b
endfunc

function ipred_z1_fill2_8bpc_neon, export=1
        cmp             w3,  #8
        add             x10, x2,  w6,  uxtw     // top[max_base_x]
        ld1r            {v31.16b},  [x10]       // padding
        mov             w7,  w5
        mov             w15, #64
        b.eq            8f

4:      // w == 4
        lsr             w8,  w7,  #6            // base
        and             w9,  w7,  #0x3e         // frac
        add             w7,  w7,  w5            // xpos += dx
        cmp             w8,  w6                 // base >= max_base_x
        lsr             w10, w7,  #6            // base
        and             w11, w7,  #0x3e         // frac
        b.ge            49f
        ldr             d0,  [x2, w8, uxtw]     // top[base]
        ldr             d2,  [x2, w10, uxtw]
        dup             v4.4h,  w9              // frac
        dup             v5.4h,  w11
        uzp2            v1.8b,  v0.8b,  v0.8b   // top[base+1]
        uzp1            v0.8b,  v0.8b,  v0.8b   // top[base]
        uzp2            v3.8b,  v2.8b,  v2.8b
        uzp1            v2.8b,  v2.8b,  v2.8b
        usubl           v6.8h,  v1.8b,  v0.8b   // top[base+1]-top[base]
        usubl           v7.8h,  v3.8b,  v2.8b
        ushll           v16.8h,  v0.8b,  #6     // top[base]*64
        ushll           v17.8h,  v2.8b,  #6
        mla             v16.4h,  v6.4h,  v4.4h  // + top[base+1]*frac
        mla             v17.4h,  v7.4h,  v5.4h
        rshrn           v16.8b,  v16.8h,  #6
        rshrn           v17.8b,  v17.8h,  #6
        st1             {v16.s}[0],  [x0],  x1
        add             w7,  w7,  w5            // xpos += dx
        subs            w4,  w4,  #2
        st1             {v17.s}[0],  [x0],  x1
        b.gt            4b
        ret

49:
        st1             {v31.s}[0],  [x0],  x1
        subs            w4,  w4,  #2
        st1             {v31.s}[0],  [x0],  x1
        b.gt            49b
        ret
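
// This function (ipred_z1_fill2) appears to serve the upsampled-edge
// case, with the same arguments as ipred_z1_fill1 above: top[] then
// holds interleaved pairs, so top[base] and top[base+1] sit in the even
// and odd lanes of one load and are separated with uzp1/uzp2 rather
// than the ext #1 used in fill1.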

8:      // w == 8
        lsr             w8,  w7,  #6            // base
        and             w9,  w7,  #0x3e         // frac
        add             w7,  w7,  w5            // xpos += dx
        cmp             w8,  w6                 // base >= max_base_x
        lsr             w10, w7,  #6            // base
        and             w11, w7,  #0x3e         // frac
        b.ge            89f
        ldr             q0,  [x2, w8, uxtw]     // top[base]
        ldr             q2,  [x2, w10, uxtw]
        dup             v4.8b,  w9              // frac
        dup             v5.8b,  w11
        sub             w9,  w15, w9            // 64 - frac
        sub             w11, w15, w11
        dup             v6.8b,  w9              // 64 - frac
        dup             v7.8b,  w11
        uzp2            v1.16b,  v0.16b,  v0.16b // top[base+1]
        uzp1            v0.16b,  v0.16b,  v0.16b // top[base]
        uzp2            v3.16b,  v2.16b,  v2.16b
        uzp1            v2.16b,  v2.16b,  v2.16b
        umull           v16.8h,  v1.8b,  v4.8b  // top[base+1]*frac
        umlal           v16.8h,  v0.8b,  v6.8b  // + top[base]*(64-frac)
        umull           v17.8h,  v3.8b,  v5.8b
        umlal           v17.8h,  v2.8b,  v7.8b
        rshrn           v16.8b,  v16.8h,  #6
        rshrn           v17.8b,  v17.8h,  #6
        st1             {v16.8b},  [x0],  x1
        add             w7,  w7,  w5            // xpos += dx
        subs            w4,  w4,  #2
        st1             {v17.8b},  [x0],  x1
        b.gt            8b
        ret

89:
        st1             {v31.8b},  [x0],  x1
        subs            w4,  w4,  #2
        st1             {v31.8b},  [x0],  x1
        b.gt            89b
        ret
endfunc

// void ipred_reverse_8bpc_neon(pixel *dst, const pixel *const src,
//                              const int n);
function ipred_reverse_8bpc_neon, export=1
        sub             x1,  x1,  #16
        add             x3,  x0,  #8
        mov             x4,  #16
1:
        ld1             {v0.16b},  [x1]
        subs            w2,  w2,  #16
        rev64           v0.16b,  v0.16b
        sub             x1,  x1,  #16
        st1             {v0.d}[1],  [x0],  x4
        st1             {v0.d}[0],  [x3],  x4
        b.gt            1b
        ret
endfunc

const increments
        .short 0, 1, 2, 3, 4, 5, 6, 7
        .short 8, 9, 10, 11, 12, 13, 14, 15
endconst

// void ipred_z2_fill1_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                               const pixel *const top,
//                               const pixel *const left,
//                               const int width, const int height,
//                               const int dx, const int dy);
function ipred_z2_fill1_8bpc_neon, export=1
        clz             w10, w4
        adr             x9,  L(ipred_z2_fill1_tbl)
        sub             w10, w10, #25
        ldrh            w10, [x9, w10, uxtw #1]
        mov             w8,  #(1 << 6)          // xpos = 1 << 6
        sub             x9,  x9,  w10, uxtw
        sub             w8,  w8,  w6            // xpos -= dx

        movrel          x11, increments
        ld1             {v31.8h},  [x11]        // increments
        neg             w7,  w7                 // -dy

        br              x9
40:
        AARCH64_VALID_JUMP_TARGET

        dup             v30.4h,  w7             // -dy
        movi            v17.8b,  #1

        mul             v16.4h,  v31.4h,  v30.4h // {0,1,2,3}* -dy
        movi            v25.16b,  #0x3e
        add             v30.4h,  v16.4h,  v30.4h // -= dy

        xtn             v31.8b,  v31.8h         // {0,1,2,3}

        // Worst case height for w=4 is 16, but we need at least h+1 elements
        ld1             {v0.16b, v1.16b},  [x3] // left[]

        movi            v26.16b,  #64
        movi            v19.16b,  #2

        xtn             v27.8b,  v30.8h         // (uint8_t)ypos
        shrn            v29.8b,  v30.8h,  #6    // ypos >> 6
        and             v27.8b,  v27.8b,  v25.8b // frac_y

        add             v29.8b,  v29.8b,  v17.8b // base_y = (ypos >> 6) + 1

        add             v30.8b,  v29.8b,  v17.8b // base_y + 1
        add             v28.8b,  v29.8b,  v19.8b // base_y + 2

        tbl             v16.8b,  {v0.16b},  v29.8b // left[base_y]

        trn1            v30.2s,  v30.2s,  v28.2s // base_y + 1, base_y + 2

        sub             v28.8b,  v26.8b,  v27.8b // 64 - frac_y

        trn1            v31.2s,  v31.2s,  v31.2s // {0,1,2,3,0,1,2,3}

        trn1            v27.2s,  v27.2s,  v27.2s // frac_y
        trn1            v28.2s,  v28.2s,  v28.2s // 64 - frac_y

        movi            v29.8b,  #2
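
// The z2 loop below mixes two sources: columns whose base_x is still
// non-negative take the blended top-edge pixels, the rest take the
// blended left-edge pixels. Both candidates are computed, and the
// per-lane cmge of the actual base_x against 0 drives the final bit
// select; once base_x <= -width, every lane would pick left, so the
// code branches to the left-only codepath (49:).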
4:
        asr             w9,  w8,  #6            // base_x
        dup             v6.4h,  w8              // xpos
        sub             w8,  w8,  w6            // xpos -= dx
        cmp             w9,  #-4                // base_x <= -4
        asr             w11, w8,  #6            // base_x
        b.le            49f

        dup             v7.4h,  w8              // xpos

        ldr             d2,  [x2, w9, sxtw]     // top[base_x]
        ldr             d4,  [x2, w11, sxtw]

        trn1            v6.2d,  v6.2d,  v7.2d   // xpos

        // Cut corners here; only doing tbl over v0 here; we only
        // seem to need the last pixel, from v1, after skipping to the
        // left-only codepath below.
        tbl             v17.8b,  {v0.16b},  v30.8b // left[base_y+1], left[base_y+2]

        shrn            v20.8b,  v6.8h,  #6     // first base_x for each row
        xtn             v6.8b,  v6.8h           // (uint8_t)xpos

        ext             v3.8b,  v2.8b,  v2.8b,  #1 // top[base_x+1]
        ext             v5.8b,  v4.8b,  v4.8b,  #1

        and             v6.8b,  v6.8b,  v25.8b  // frac_x

        trn1            v16.2s,  v16.2s,  v17.2s // left[base_y], left[base_y+1]

        trn1            v2.2s,  v2.2s,  v4.2s   // top[base_x]
        trn1            v3.2s,  v3.2s,  v5.2s   // top[base_x+1]

        sub             v7.8b,  v26.8b,  v6.8b  // 64 - frac_x

        add             v20.8b,  v20.8b,  v31.8b // actual base_x

        umull           v16.8h,  v16.8b,  v28.8b // left[base_y]*(64-frac_y)
        umlal           v16.8h,  v17.8b,  v27.8b // + left[base_y+1]*frac_y

        umull           v22.8h,  v2.8b,  v7.8b  // top[base_x]*(64-frac_x)
        umlal           v22.8h,  v3.8b,  v6.8b  // + top[base_x+1]*frac_x

        cmge            v20.8b,  v20.8b,  #0

        rshrn           v16.8b,  v16.8h,  #6
        rshrn           v22.8b,  v22.8h,  #6

        bit             v16.8b,  v22.8b,  v20.8b

        st1             {v16.s}[0],  [x0],  x1
        sub             w8,  w8,  w6            // xpos -= dx
        subs            w5,  w5,  #2
        st1             {v16.s}[1],  [x0],  x1
        b.le            9f

        ext             v16.8b,  v17.8b,  v17.8b,  #4
        add             v30.8b,  v30.8b,  v29.8b // base_y += 2
        b               4b

49:
        tbl             v17.8b,  {v0.16b, v1.16b},  v30.8b // left[base_y+1], left[base_y+2]

        trn1            v16.2s,  v16.2s,  v17.2s // left[base_y], left[base_y+1]

        umull           v18.8h,  v16.8b,  v28.8b // left[base_y]*(64-frac_y)
        umlal           v18.8h,  v17.8b,  v27.8b // + left[base_y+1]*frac_y
        rshrn           v18.8b,  v18.8h,  #6

        st1             {v18.s}[0],  [x0],  x1
        subs            w5,  w5,  #2
        st1             {v18.s}[1],  [x0],  x1
        b.le            9f

        ext             v16.8b,  v17.8b,  v17.8b,  #4
        add             v30.8b,  v30.8b,  v29.8b // base_y += 2
        b               49b

9:
        ret
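
// Note on the left-pixel gathers: the per-column base_y values are kept
// as bytes, and tbl uses them as indices into the concatenation of its
// source registers; an index past the loaded bytes returns 0 for tbl,
// while tbx (used by the 16-wide path further below) leaves such lanes
// unchanged. This is why the buffers are loaded with the worst-case
// height in mind, per the comments above.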
2127 tbl v19.8b, {v0.16b, v1.16b}, v29.8b // left[base_y+1] 2128 2129 shrn v21.8b, v16.8h, #6 // first base_x 2130 shrn2 v21.16b, v17.8h, #6 2131 xtn v16.8b, v16.8h // (uint8_t)xpos 2132 xtn2 v16.16b, v17.8h 2133 2134 tbl v20.8b, {v0.16b, v1.16b}, v30.8b // left[base_y+2] 2135 2136 ext v5.16b, v4.16b, v4.16b, #1 // top[base_x+1] 2137 ext v7.16b, v6.16b, v6.16b, #1 2138 2139 and v16.16b, v16.16b, v25.16b // frac_x 2140 2141 trn1 v4.2d, v4.2d, v6.2d // top[base_x] 2142 trn1 v5.2d, v5.2d, v7.2d // top[base_x+1] 2143 2144 sub v7.16b, v26.16b, v16.16b // 64 - frac_x 2145 2146 add v21.16b, v21.16b, v31.16b // actual base_x 2147 2148 umull v6.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y) 2149 umlal v6.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y 2150 umull v17.8h, v19.8b, v28.8b 2151 umlal v17.8h, v20.8b, v27.8b 2152 2153 umull v22.8h, v4.8b, v7.8b // top[base_x]*(64-frac_x) 2154 umlal v22.8h, v5.8b, v16.8b // + top[base_x+1]*frac_x 2155 umull2 v23.8h, v4.16b, v7.16b 2156 umlal2 v23.8h, v5.16b, v16.16b 2157 2158 cmge v21.16b, v21.16b, #0 2159 2160 rshrn v6.8b, v6.8h, #6 2161 rshrn2 v6.16b, v17.8h, #6 2162 rshrn v22.8b, v22.8h, #6 2163 rshrn2 v22.16b, v23.8h, #6 2164 2165 bit v6.16b, v22.16b, v21.16b 2166 2167 st1 {v6.d}[0], [x0], x1 2168 sub w8, w8, w6 // xpos -= dx 2169 subs w5, w5, #2 2170 st1 {v6.d}[1], [x0], x1 2171 b.le 9f 2172 2173 mov v18.8b, v20.8b 2174 add v29.8b, v29.8b, v24.8b // base_y += 2 2175 add v30.8b, v30.8b, v24.8b // base_y += 2 2176 b 8b 2177 217889: 2179 tbl v19.8b, {v0.16b, v1.16b, v2.16b}, v29.8b // left[base_y+1] 2180 tbl v20.8b, {v0.16b, v1.16b, v2.16b}, v30.8b // left[base_y+2] 2181 2182 umull v6.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y) 2183 umlal v6.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y 2184 umull v17.8h, v19.8b, v28.8b 2185 umlal v17.8h, v20.8b, v27.8b 2186 2187 rshrn v6.8b, v6.8h, #6 2188 rshrn2 v6.16b, v17.8h, #6 2189 2190 st1 {v6.d}[0], [x0], x1 2191 subs w5, w5, #2 2192 st1 {v6.d}[1], [x0], x1 2193 b.le 9f 2194 2195 mov v18.8b, v20.8b 2196 add v29.8b, v29.8b, v24.8b // base_y += 2 2197 add v30.8b, v30.8b, v24.8b // base_y += 2 2198 b 89b 2199 22009: 2201 ret 2202 2203160: 2204 AARCH64_VALID_JUMP_TARGET 2205 2206 stp d8, d9, [sp, #-0x40]! 2207 stp d10, d11, [sp, #0x10] 2208 stp d12, d13, [sp, #0x20] 2209 stp d14, d15, [sp, #0x30] 2210 2211 add x11, x11, #16 // increments 2212 2213 dup v18.8h, w7 // -dy 2214 movi v17.16b, #1 2215 add x3, x3, #1 // Skip past left[0] 2216 2217 ld1 {v14.8h}, [x11] // {8,9,10,11,12,13,14,15} 2218 2219 mul v16.8h, v31.8h, v18.8h // {0,1,2,3,4,5,6,7}* -dy 2220 mul v19.8h, v14.8h, v18.8h // {8,9,10,11,12,13,14,15}* -dy 2221 movi v25.16b, #0x3e 2222 add v16.8h, v16.8h, v18.8h // -= dy 2223 add v18.8h, v19.8h, v18.8h 2224 2225 xtn v31.8b, v31.8h // {0,1,2,3,4,5,6,7} 2226 xtn2 v31.16b, v14.8h // {8,9,10,11,12,13,14,15} 2227 2228 // Worst case height is 64. 2229 ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x3] // left[] 2230 ld1r {v15.16b}, [x2] // left[0] == top[0] 2231 2232 movi v26.16b, #64 2233 movi v19.16b, #2 2234 2235 xtn v27.8b, v16.8h // (uint8_t)ypos 2236 xtn2 v27.16b, v18.8h 2237 shrn v29.8b, v16.8h, #6 // ypos >> 6 2238 shrn2 v29.16b, v18.8h, #6 2239 mov v18.16b, v15.16b // left[0] 2240 and v27.16b, v27.16b, v25.16b // frac_y 2241 2242 // Cut corners here; for the first row we don't expect to need to 2243 // read outside of v0.
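// tbx, unlike tbl, leaves a destination lane unchanged when its index
// is out of range for the table, so prefilling the destination with a
// fallback pixel turns it into a clamped lookup. The w16 path below
// prefills with v15 (left[0], which equals top[0]) before indexing the
// 64 left[] bytes. Per lane, roughly (a hedged sketch; tab_size stands
// in for the number of table bytes and is not a name from this file):
//
//   uint8_t lookup(const uint8_t *tab, int tab_size,
//                  uint8_t idx, uint8_t fallback) {
//       return idx < tab_size ? tab[idx] : fallback;
//   }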
2244 tbx v18.16b, {v0.16b}, v29.16b // left[base_y] 2245 2246 add v30.16b, v29.16b, v19.16b // base_y + 2 2247 add v29.16b, v29.16b, v17.16b // base_y + 1 2248 2249 sub v28.16b, v26.16b, v27.16b // 64 - frac_y 2250 2251 movi v24.16b, #2 // 2 225216: 2253 asr w9, w8, #6 // base_x 2254 dup v16.8h, w8 // xpos 2255 sub w8, w8, w6 // xpos -= dx 2256 cmp w9, #-16 // base_x <= -16 2257 asr w11, w8, #6 // base_x 2258 b.le 169f 2259 2260 dup v17.8h, w8 // xpos 2261 2262 add x9, x2, w9, sxtw 2263 add x11, x2, w11, sxtw 2264 2265 ld1 {v4.16b, v5.16b}, [x9] // top[base_x] 2266 mov v19.16b, v15.16b // left[0] 2267 ld1 {v6.16b, v7.16b}, [x11] 2268 2269 tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1] 2270 2271 mov v20.16b, v15.16b // left[0] 2272 2273 shrn v21.8b, v16.8h, #6 // first base_x 2274 shrn v22.8b, v17.8h, #6 2275 xtn v16.8b, v16.8h // (uint8_t)xpos 2276 xtn v17.8b, v17.8h 2277 2278 tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b // left[base_y+2] 2279 2280 trn1 v21.2d, v21.2d, v21.2d // first base_x 2281 trn1 v22.2d, v22.2d, v22.2d 2282 trn1 v16.2d, v16.2d, v16.2d // (uint8_t)xpos 2283 trn1 v17.2d, v17.2d, v17.2d 2284 2285 ext v5.16b, v4.16b, v5.16b, #1 // top[base_x+1] 2286 ext v7.16b, v6.16b, v7.16b, #1 2287 2288 and v16.16b, v16.16b, v25.16b // frac_x 2289 and v17.16b, v17.16b, v25.16b 2290 2291 umull v10.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y) 2292 umlal v10.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y 2293 2294 sub v8.16b, v26.16b, v16.16b // 64 - frac_x 2295 sub v9.16b, v26.16b, v17.16b 2296 2297 umull2 v11.8h, v18.16b, v28.16b 2298 umlal2 v11.8h, v19.16b, v27.16b 2299 2300 add v21.16b, v21.16b, v31.16b // actual base_x 2301 add v22.16b, v22.16b, v31.16b 2302 2303 umull v12.8h, v19.8b, v28.8b 2304 umlal v12.8h, v20.8b, v27.8b 2305 umull2 v13.8h, v19.16b, v28.16b 2306 umlal2 v13.8h, v20.16b, v27.16b 2307 2308 rshrn v10.8b, v10.8h, #6 2309 rshrn2 v10.16b, v11.8h, #6 2310 rshrn v11.8b, v12.8h, #6 2311 rshrn2 v11.16b, v13.8h, #6 2312 2313 umull v12.8h, v4.8b, v8.8b // top[base_x]*(64-frac_x) 2314 umlal v12.8h, v5.8b, v16.8b // + top[base_x+1]*frac_x 2315 umull2 v13.8h, v4.16b, v8.16b 2316 umlal2 v13.8h, v5.16b, v16.16b 2317 umull v14.8h, v6.8b, v9.8b 2318 umlal v14.8h, v7.8b, v17.8b 2319 umull2 v18.8h, v6.16b, v9.16b 2320 umlal2 v18.8h, v7.16b, v17.16b 2321 2322 cmge v21.16b, v21.16b, #0 2323 cmge v22.16b, v22.16b, #0 2324 2325 rshrn v12.8b, v12.8h, #6 2326 rshrn2 v12.16b, v13.8h, #6 2327 rshrn v13.8b, v14.8h, #6 2328 rshrn2 v13.16b, v18.8h, #6 2329 2330 bit v10.16b, v12.16b, v21.16b 2331 bit v11.16b, v13.16b, v22.16b 2332 2333 st1 {v10.16b}, [x0], x1 2334 subs w5, w5, #2 2335 sub w8, w8, w6 // xpos -= dx 2336 st1 {v11.16b}, [x0], x1 2337 b.le 9f 2338 2339 mov v18.16b, v20.16b 2340 add v29.16b, v29.16b, v24.16b // base_y += 2 2341 add v30.16b, v30.16b, v24.16b // base_y += 2 2342 b 16b 2343 2344169: 2345 mov v19.16b, v15.16b 2346 mov v20.16b, v15.16b 2347 tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1] 2348 tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b // left[base_y+2] 2349 2350 umull v4.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y) 2351 umlal v4.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y 2352 umull2 v5.8h, v18.16b, v28.16b 2353 umlal2 v5.8h, v19.16b, v27.16b 2354 umull v6.8h, v19.8b, v28.8b 2355 umlal v6.8h, v20.8b, v27.8b 2356 umull2 v7.8h, v19.16b, v28.16b 2357 umlal2 v7.8h, v20.16b, v27.16b 2358 2359 rshrn v4.8b, v4.8h, #6 2360 rshrn2 v4.16b, v5.8h, #6 2361 rshrn v5.8b, v6.8h, #6 2362 rshrn2 v5.16b, v7.8h, #6
2363 2364 st1 {v4.16b}, [x0], x1 2365 subs w5, w5, #2 2366 st1 {v5.16b}, [x0], x1 2367 b.le 9f 2368 2369 mov v18.16b, v20.16b 2370 add v29.16b, v29.16b, v24.16b // base_y += 2 2371 add v30.16b, v30.16b, v24.16b // base_y += 2 2372 b 169b 2373 23749: 2375 ldp d14, d15, [sp, #0x30] 2376 ldp d12, d13, [sp, #0x20] 2377 ldp d10, d11, [sp, #0x10] 2378 ldp d8, d9, [sp], 0x40 2379 ret 2380 2381320: 2382640: 2383 AARCH64_VALID_JUMP_TARGET 2384 2385 stp d8, d9, [sp, #-0x40]! 2386 stp d10, d11, [sp, #0x10] 2387 stp d12, d13, [sp, #0x20] 2388 stp d14, d15, [sp, #0x30] 2389 2390 add x11, x11, #16 // increments 2391 2392 dup v25.8h, w7 // -dy 2393 add x3, x3, #1 // Skip past left[0] 2394 2395 ld1 {v14.8h}, [x11] // {8,9,10,11,12,13,14,15} 2396 2397 add x13, x0, x1 // alternating row 2398 lsl x1, x1, #1 // stride *= 2 2399 sub x1, x1, w4, uxtw // stride -= width 2400 2401 movi v11.8h, #8 2402 mul v26.8h, v31.8h, v25.8h // {0,1,2,3,4,5,6,7}* -dy 2403 add v26.8h, v26.8h, v25.8h // -= dy 2404 mul v25.8h, v25.8h, v11.8h // -8*dy 2405 2406 xtn v31.8b, v31.8h // {0,1,2,3,4,5,6,7} 2407 xtn2 v31.16b, v14.8h // {8,9,10,11,12,13,14,15} 2408 2409 // Worst case height is 64. 2410 ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x3] // left[] 2411 ld1r {v15.16b}, [x2] // left[0] == top[0] 2412 2413 mov w12, w4 // orig w 2414 neg w14, w4 // -w 2415 24161: 2417 mov v23.16b, v26.16b // reset ypos 2418 2419 asr w9, w8, #6 // base_x 2420 dup v16.8h, w8 // xpos 2421 sub w8, w8, w6 // xpos -= dx 2422 cmp w9, w14 // base_x <= -w 2423 asr w11, w8, #6 // base_x 2424 b.le 329f 2425 2426 dup v17.8h, w8 // xpos 2427 sub w8, w8, w6 // xpos -= dx 2428 2429 add x9, x2, w9, sxtw 2430 add x11, x2, w11, sxtw 2431 2432 sqshrn v21.8b, v16.8h, #6 // first base_x 2433 sqshrn v22.8b, v17.8h, #6 2434 xtn v16.8b, v16.8h // (uint8_t)xpos 2435 xtn v17.8b, v17.8h 2436 2437 ld1 {v4.16b}, [x9], #16 // top[base_x] 2438 ld1 {v6.16b}, [x11], #16 2439 2440 trn1 v21.2d, v21.2d, v21.2d // first base_x 2441 trn1 v22.2d, v22.2d, v22.2d 2442 trn1 v16.2d, v16.2d, v16.2d // (uint8_t)xpos 2443 trn1 v17.2d, v17.2d, v17.2d 2444 2445 movi v10.16b, #0x3e 2446 movi v11.16b, #64 2447 2448 and v16.16b, v16.16b, v10.16b // frac_x 2449 and v17.16b, v17.16b, v10.16b 2450 2451 sub v8.16b, v11.16b, v16.16b // 64 - frac_x 2452 sub v9.16b, v11.16b, v17.16b 2453 2454 add v21.16b, v21.16b, v31.16b // actual base_x 2455 add v22.16b, v22.16b, v31.16b 2456 24572: 2458 add v13.8h, v23.8h, v25.8h // ypos -= 8*dy 2459 movi v12.16b, #64 2460 movi v20.16b, #2 2461 movi v10.16b, #0x3e 2462 2463 smov w10, v22.b[0] 2464 2465 xtn v27.8b, v23.8h // (uint8_t)ypos 2466 xtn2 v27.16b, v13.8h 2467 shrn v29.8b, v23.8h, #6 // ypos >> 6 2468 shrn2 v29.16b, v13.8h, #6 2469 cmp w10, #0 // base_x (bottom left) >= 0 2470 and v27.16b, v27.16b, v10.16b // frac_y 2471 2472 mov v18.16b, v15.16b // left[0] 2473 2474 b.ge 4f 2475 2476 add v23.8h, v13.8h, v25.8h // ypos -= 8*dy 2477 movi v13.16b, #1 2478 2479 tbx v18.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y] 2480 add v29.16b, v29.16b, v13.16b // base_y + 1 2481 mov v19.16b, v15.16b // left[0] 2482 2483 sub v28.16b, v12.16b, v27.16b // 64 - frac_y 2484 2485 ld1 {v5.16b}, [x9], #16 // top[base_x] 2486 ld1 {v7.16b}, [x11], #16 2487 2488 tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1] 2489 add v29.16b, v29.16b, v13.16b // base_y + 2 2490 2491 mov v20.16b, v15.16b // left[0] 2492 tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+2] 2493 2494 umull v10.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y) 2495 
umlal v10.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y 2496 umull2 v11.8h, v18.16b, v28.16b 2497 umlal2 v11.8h, v19.16b, v27.16b 2498 umull v12.8h, v19.8b, v28.8b 2499 umlal v12.8h, v20.8b, v27.8b 2500 umull2 v13.8h, v19.16b, v28.16b 2501 umlal2 v13.8h, v20.16b, v27.16b 2502 2503 ext v18.16b, v4.16b, v5.16b, #1 // top[base_x+1] 2504 ext v19.16b, v6.16b, v7.16b, #1 2505 2506 rshrn v10.8b, v10.8h, #6 2507 rshrn2 v10.16b, v11.8h, #6 2508 rshrn v11.8b, v12.8h, #6 2509 rshrn2 v11.16b, v13.8h, #6 2510 2511 umull v12.8h, v4.8b, v8.8b // top[base_x]*(64-frac_x) 2512 umlal v12.8h, v18.8b, v16.8b // + top[base_x+1]*frac_x 2513 umull2 v13.8h, v4.16b, v8.16b 2514 umlal2 v13.8h, v18.16b, v16.16b 2515 umull v14.8h, v6.8b, v9.8b 2516 umlal v14.8h, v19.8b, v17.8b 2517 umull2 v20.8h, v6.16b, v9.16b 2518 umlal2 v20.8h, v19.16b, v17.16b 2519 2520 cmge v18.16b, v21.16b, #0 2521 cmge v19.16b, v22.16b, #0 2522 2523 rshrn v12.8b, v12.8h, #6 2524 rshrn2 v12.16b, v13.8h, #6 2525 rshrn v13.8b, v14.8h, #6 2526 rshrn2 v13.16b, v20.8h, #6 2527 2528 bit v10.16b, v12.16b, v18.16b 2529 bit v11.16b, v13.16b, v19.16b 2530 2531 st1 {v10.16b}, [x0], #16 2532 subs w4, w4, #16 2533 st1 {v11.16b}, [x13], #16 2534 b.le 3f 2535 2536 movi v10.16b, #16 2537 mov v4.16b, v5.16b 2538 mov v6.16b, v7.16b 2539 add v21.16b, v21.16b, v10.16b // base_x += 16 2540 add v22.16b, v22.16b, v10.16b 2541 b 2b 2542 25433: 2544 subs w5, w5, #2 2545 b.le 9f 2546 movi v10.8h, #128 2547 add x0, x0, x1 2548 add x13, x13, x1 2549 mov w4, w12 // reset w 2550 add v26.8h, v26.8h, v10.8h // ypos += 2*(1<<6) 2551 b 1b 2552 25534: // The rest of the row only predicted from top[] 2554 ld1 {v5.16b}, [x9], #16 // top[base_x] 2555 ld1 {v7.16b}, [x11], #16 2556 2557 ext v18.16b, v4.16b, v5.16b, #1 // top[base_x+1] 2558 ext v19.16b, v6.16b, v7.16b, #1 2559 2560 umull v12.8h, v4.8b, v8.8b // top[base_x]*(64-frac_x) 2561 umlal v12.8h, v18.8b, v16.8b // + top[base_x+1]*frac_x 2562 umull2 v13.8h, v4.16b, v8.16b 2563 umlal2 v13.8h, v18.16b, v16.16b 2564 umull v14.8h, v6.8b, v9.8b 2565 umlal v14.8h, v19.8b, v17.8b 2566 umull2 v20.8h, v6.16b, v9.16b 2567 umlal2 v20.8h, v19.16b, v17.16b 2568 2569 rshrn v12.8b, v12.8h, #6 2570 rshrn2 v12.16b, v13.8h, #6 2571 rshrn v13.8b, v14.8h, #6 2572 rshrn2 v13.16b, v20.8h, #6 2573 2574 st1 {v12.16b}, [x0], #16 2575 subs w4, w4, #16 2576 st1 {v13.16b}, [x13], #16 2577 b.le 3b 2578 2579 mov v4.16b, v5.16b 2580 mov v6.16b, v7.16b 2581 b 4b 2582 2583329: // The rest of the block only predicted from left[] 2584 add x1, x1, w4, uxtw // restore stride 2585 mov w12, w5 // orig remaining h 25861: 2587 add v13.8h, v23.8h, v25.8h // ypos -= 8*dy 2588 movi v12.16b, #64 2589 movi v10.16b, #0x3e 2590 2591 xtn v27.8b, v23.8h // (uint8_t)ypos 2592 xtn2 v27.16b, v13.8h 2593 shrn v29.8b, v23.8h, #6 // ypos >> 6 2594 shrn2 v29.16b, v13.8h, #6 2595 and v27.16b, v27.16b, v10.16b // frac_y 2596 2597 mov v18.16b, v15.16b // left[0] 2598 add v23.8h, v13.8h, v25.8h // ypos -= 8*dy 2599 movi v21.16b, #1 2600 2601 tbx v18.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y] 2602 add v29.16b, v29.16b, v21.16b // base_y + 1 2603 2604 sub v28.16b, v12.16b, v27.16b // 64 - frac_y 26052: 2606 mov v19.16b, v15.16b // left[0] 2607 tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1] 2608 add v29.16b, v29.16b, v21.16b // base_y + 2 2609 mov v20.16b, v15.16b // left[0] 2610 tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+2] 2611 add v29.16b, v29.16b, v21.16b // next base_y 2612 2613 umull v10.8h, v18.8b, v28.8b //
left[base_y]*(64-frac_y) 2614 umlal v10.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y 2615 umull2 v11.8h, v18.16b, v28.16b 2616 umlal2 v11.8h, v19.16b, v27.16b 2617 umull v12.8h, v19.8b, v28.8b 2618 umlal v12.8h, v20.8b, v27.8b 2619 umull2 v13.8h, v19.16b, v28.16b 2620 umlal2 v13.8h, v20.16b, v27.16b 2621 2622 rshrn v10.8b, v10.8h, #6 2623 rshrn2 v10.16b, v11.8h, #6 2624 rshrn v11.8b, v12.8h, #6 2625 rshrn2 v11.16b, v13.8h, #6 2626 2627 st1 {v10.16b}, [x0], x1 2628 subs w5, w5, #2 2629 st1 {v11.16b}, [x13], x1 2630 b.le 3f 2631 mov v18.16b, v20.16b 2632 b 2b 2633 26343: 2635 subs w4, w4, #16 2636 b.le 9f 2637 2638 lsr x1, x1, #1 2639 msub x0, x1, x12, x0 // ptr -= h * stride 2640 msub x13, x1, x12, x13 2641 lsl x1, x1, #1 2642 add x0, x0, #16 2643 add x13, x13, #16 2644 mov w5, w12 // reset h 2645 b 1b 2646 26479: 2648 ldp d14, d15, [sp, #0x30] 2649 ldp d12, d13, [sp, #0x20] 2650 ldp d10, d11, [sp, #0x10] 2651 ldp d8, d9, [sp], 0x40 2652 ret 2653 2654L(ipred_z2_fill1_tbl): 2655 .hword L(ipred_z2_fill1_tbl) - 640b 2656 .hword L(ipred_z2_fill1_tbl) - 320b 2657 .hword L(ipred_z2_fill1_tbl) - 160b 2658 .hword L(ipred_z2_fill1_tbl) - 80b 2659 .hword L(ipred_z2_fill1_tbl) - 40b 2660endfunc 2661 2662function ipred_z2_fill2_8bpc_neon, export=1 2663 cmp w4, #8 2664 mov w8, #(2 << 6) // xpos = 2 << 6 2665 sub w8, w8, w6 // xpos -= dx 2666 2667 movrel x11, increments 2668 ld1 {v31.8h}, [x11] // increments 2669 neg w7, w7 // -dy 2670 b.eq 80f 2671 267240: 2673 dup v30.4h, w7 // -dy 2674 movi v17.8b, #1 2675 2676 mul v16.4h, v31.4h, v30.4h // {0,1,2,3}* -dy 2677 movi v25.16b, #0x3e 2678 add v30.4h, v16.4h, v30.4h // -= dy 2679 2680 xtn v31.8b, v31.8h // {0,1,2,3} 2681 2682 // For upsample_top, w <= 8 and h <= 8; we may need up to h+1 elements 2683 // from left. 
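// fill2 differs from fill1 above mainly in how the top edge is
// sampled: the edge is upsampled 2x, so xpos starts at 2 << 6, the
// per-lane x offsets are doubled ({0,2,4,6}), and top[base_x] /
// top[base_x+1] land on adjacent even/odd samples gathered with
// uzp1/uzp2 instead of ext. A loose C model of the top-side blend
// (our illustration, not dav1d's reference code):
//
//   int base_x = xpos >> 6;      // advances 2 per output pixel
//   int frac_x = xpos & 0x3e;
//   out = (top[base_x] * (64 - frac_x) +
//          top[base_x + 1] * frac_x + 32) >> 6;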
2684 ld1 {v0.16b}, [x3] // left[] 2685 2686 movi v26.16b, #64 2687 movi v19.16b, #2 2688 2689 xtn v27.8b, v30.8h // (uint8_t)ypos 2690 shrn v29.8b, v30.8h, #6 // ypos >> 6 2691 and v27.8b, v27.8b, v25.8b // frac_y 2692 2693 add v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 1 2694 2695 add v30.8b, v29.8b, v17.8b // base_y + 1 2696 add v28.8b, v29.8b, v19.8b // base_y + 2 2697 2698 tbl v16.8b, {v0.16b}, v29.8b // left[base_y] 2699 2700 trn1 v30.2s, v30.2s, v28.2s // base_y + 1, base_y + 2 2701 2702 sub v28.8b, v26.8b, v27.8b // 64 - frac_y 2703 2704 trn1 v31.2s, v31.2s, v31.2s // {0,1,2,3,0,1,2,3} 2705 2706 trn1 v27.2s, v27.2s, v27.2s // frac_y 2707 trn1 v28.2s, v28.2s, v28.2s // 64 - frac_y 2708 2709 movi v29.8b, #2 2710 add v31.8b, v31.8b, v31.8b // {0,2,4,6,0,2,4,6} 27114: 2712 asr w9, w8, #6 // base_x 2713 dup v6.4h, w8 // xpos 2714 sub w8, w8, w6 // xpos -= dx 2715 cmp w9, #-8 // base_x <= -8 2716 asr w11, w8, #6 // base_x 2717 b.le 49f 2718 2719 dup v7.4h, w8 // xpos 2720 2721 ldr d2, [x2, w9, sxtw] // top[base_x] 2722 ldr d4, [x2, w11, sxtw] 2723 2724 trn1 v6.2d, v6.2d, v7.2d // xpos 2725 2726 tbl v17.8b, {v0.16b}, v30.8b // left[base_y+1], left[base_y+2] 2727 2728 shrn v20.8b, v6.8h, #6 // first base_x for each row 2729 xtn v6.8b, v6.8h // (uint8_t)xpos 2730 2731 uzp2 v3.8b, v2.8b, v4.8b // top[base_x+1] 2732 uzp1 v2.8b, v2.8b, v4.8b // top[base_x] 2733 2734 and v6.8b, v6.8b, v25.8b // frac_x 2735 2736 trn1 v16.2s, v16.2s, v17.2s // left[base_y], left[base_y+1] 2737 2738 sub v7.8b, v26.8b, v6.8b // 64 - frac_x 2739 2740 add v20.8b, v20.8b, v31.8b // actual base_x 2741 2742 umull v16.8h, v16.8b, v28.8b // left[base_y]*(64-frac_y) 2743 umlal v16.8h, v17.8b, v27.8b // + left[base_y+1]*frac_y 2744 2745 umull v22.8h, v2.8b, v7.8b // top[base_x]*(64-frac_x) 2746 umlal v22.8h, v3.8b, v6.8b // + top[base_x+1]*frac_x 2747 2748 cmge v20.8b, v20.8b, #0 2749 2750 rshrn v16.8b, v16.8h, #6 2751 rshrn v22.8b, v22.8h, #6 2752 2753 bit v16.8b, v22.8b, v20.8b 2754 2755 st1 {v16.s}[0], [x0], x1 2756 sub w8, w8, w6 // xpos -= dx 2757 subs w5, w5, #2 2758 st1 {v16.s}[1], [x0], x1 2759 b.le 9f 2760 2761 ext v16.8b, v17.8b, v17.8b, #4 2762 add v30.8b, v30.8b, v29.8b // base_y += 2 2763 b 4b 2764 276549: 2766 tbl v17.8b, {v0.16b}, v30.8b // left[base_y+1], left[base_y+2] 2767 2768 trn1 v16.2s, v16.2s, v17.2s // left[base_y], left[base_y+1] 2769 2770 umull v18.8h, v16.8b, v28.8b // left[base_y]*(64-frac_y) 2771 umlal v18.8h, v17.8b, v27.8b // + left[base_y+1]*frac_y 2772 rshrn v18.8b, v18.8h, #6 2773 2774 st1 {v18.s}[0], [x0], x1 2775 subs w5, w5, #2 2776 st1 {v18.s}[1], [x0], x1 2777 b.le 9f 2778 2779 ext v16.8b, v17.8b, v17.8b, #4 2780 add v30.8b, v30.8b, v29.8b // base_y += 2 2781 b 49b 2782 27839: 2784 ret 2785 278680: 2787 dup v30.8h, w7 // -dy 2788 movi v17.8b, #1 2789 2790 mul v16.8h, v31.8h, v30.8h // {0,1,2,3,4,5,6,7}* -dy 2791 movi v25.16b, #0x3e 2792 add v30.8h, v16.8h, v30.8h // -= dy 2793 2794 xtn v31.8b, v31.8h // {0,1,2,3,4,5,6,7} 2795 2796 // For upsample_top, w <= 8 and h <= 8; we may need up to h+1 elements 2797 // from left.
2798 ld1 {v0.16b}, [x3] // left[] 2799 2800 movi v26.16b, #64 2801 movi v19.16b, #2 2802 2803 xtn v27.8b, v30.8h // (uint8_t)ypos 2804 shrn v29.8b, v30.8h, #6 // ypos >> 6 2805 and v27.8b, v27.8b, v25.8b // frac_y 2806 2807 add v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 1 2808 2809 tbl v18.8b, {v0.16b}, v29.8b // left[base_y] 2810 2811 add v30.8b, v29.8b, v19.8b // base_y + 2 2812 add v29.8b, v29.8b, v17.8b // base_y + 1 2813 2814 sub v28.8b, v26.8b, v27.8b // 64 - frac_y 2815 2816 trn1 v31.2d, v31.2d, v31.2d // {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7} 2817 2818 movi v24.8b, #2 // 2 2819 add v31.16b, v31.16b, v31.16b // {0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14} 28208: 2821 asr w9, w8, #6 // base_x 2822 dup v16.8h, w8 // xpos 2823 sub w8, w8, w6 // xpos -= dx 2824 cmp w9, #-16 // base_x <= -16 2825 asr w11, w8, #6 // base_x 2826 b.le 89f 2827 2828 dup v17.8h, w8 // xpos 2829 2830 ldr q4, [x2, w9, sxtw] // top[base_x] 2831 ldr q6, [x2, w11, sxtw] 2832 2833 tbl v19.8b, {v0.16b}, v29.8b // left[base_y+1] 2834 2835 shrn v21.8b, v16.8h, #6 // first base_x 2836 shrn2 v21.16b, v17.8h, #6 2837 xtn v16.8b, v16.8h // (uint8_t)xpos 2838 xtn2 v16.16b, v17.8h 2839 2840 tbl v20.8b, {v0.16b}, v30.8b // left[base_y+2] 2841 2842 uzp2 v5.16b, v4.16b, v6.16b // top[base_x+1] 2843 uzp1 v4.16b, v4.16b, v6.16b // top[base_x] 2844 2845 and v16.16b, v16.16b, v25.16b // frac_x 2846 2847 sub v7.16b, v26.16b, v16.16b // 64 - frac_x 2848 2849 add v21.16b, v21.16b, v31.16b // actual base_x 2850 2851 umull v6.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y) 2852 umlal v6.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y 2853 umull v17.8h, v19.8b, v28.8b 2854 umlal v17.8h, v20.8b, v27.8b 2855 2856 umull v22.8h, v4.8b, v7.8b // top[base_x]*(64-frac_x) 2857 umlal v22.8h, v5.8b, v16.8b // + top[base_x+1]*frac_x 2858 umull2 v23.8h, v4.16b, v7.16b 2859 umlal2 v23.8h, v5.16b, v16.16b 2860 2861 cmge v21.16b, v21.16b, #0 2862 2863 rshrn v6.8b, v6.8h, #6 2864 rshrn2 v6.16b, v17.8h, #6 2865 rshrn v22.8b, v22.8h, #6 2866 rshrn2 v22.16b, v23.8h, #6 2867 2868 bit v6.16b, v22.16b, v21.16b 2869 2870 st1 {v6.d}[0], [x0], x1 2871 sub w8, w8, w6 // xpos -= dx 2872 subs w5, w5, #2 2873 st1 {v6.d}[1], [x0], x1 2874 b.le 9f 2875 2876 mov v18.8b, v20.8b 2877 add v29.8b, v29.8b, v24.8b // base_y += 2 2878 add v30.8b, v30.8b, v24.8b // base_y += 2 2879 b 8b 2880 288189: 2882 tbl v19.8b, {v0.16b}, v29.8b // left[base_y+1] 2883 tbl v20.8b, {v0.16b}, v30.8b // left[base_y+2] 2884 2885 umull v6.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y) 2886 umlal v6.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y 2887 umull v17.8h, v19.8b, v28.8b 2888 umlal v17.8h, v20.8b, v27.8b 2889 2890 rshrn v6.8b, v6.8h, #6 2891 rshrn2 v6.16b, v17.8h, #6 2892 2893 st1 {v6.d}[0], [x0], x1 2894 subs w5, w5, #2 2895 st1 {v6.d}[1], [x0], x1 2896 b.le 9f 2897 2898 mov v18.8b, v20.8b 2899 add v29.8b, v29.8b, v24.8b // base_y += 2 2900 add v30.8b, v30.8b, v24.8b // base_y += 2 2901 b 89b 2902 29039: 2904 ret 2905endfunc 2906 2907function ipred_z2_fill3_8bpc_neon, export=1 2908 cmp w4, #8 2909 mov w8, #(1 << 6) // xpos = 1 << 6 2910 sub w8, w8, w6 // xpos -= dx 2911 2912 movrel x11, increments 2913 ld1 {v31.8h}, [x11] // increments 2914 neg w7, w7 // -dy 2915 b.eq 80f 2916 291740: 2918 dup v30.4h, w7 // -dy 2919 movi v17.8b, #1 2920 2921 mul v16.4h, v31.4h, v30.4h // {0,1,2,3}* -dy 2922 movi v25.16b, #0x3e 2923 add v30.4h, v16.4h, v30.4h // -= dy 2924 2925 xtn v31.8b, v31.8h // {0,1,2,3} 2926 2927 // For upsample_left, w <= 8 and h <= 8; we may need up to 2*h+1 elements.
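// fill3 is the upsampled-left counterpart: base_y advances 2 per
// output row, so two interleaved index vectors hold (base_y+0,
// base_y+2) and (base_y+1, base_y+3) for a pair of rows, and both
// advance by 4 per two-row iteration. Sketched in C (our
// illustration, not dav1d's reference code):
//
//   int frac_y = ypos & 0x3e;
//   out = (left[base_y] * (64 - frac_y) +
//          left[base_y + 1] * frac_y + 32) >> 6;
//   // next row: base_y += 2 (doubled edge)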
2928 ld1 {v0.16b, v1.16b}, [x3] // left[] 2929 2930 movi v26.16b, #64 2931 movi v19.16b, #2 2932 2933 xtn v27.8b, v30.8h // (uint8_t)ypos 2934 shrn v29.8b, v30.8h, #6 // ypos >> 6 2935 and v27.8b, v27.8b, v25.8b // frac_y 2936 2937 add v29.8b, v29.8b, v19.8b // base_y = (ypos >> 6) + 2 2938 2939 add v30.8b, v29.8b, v17.8b // base_y + 1 2940 add v28.8b, v29.8b, v19.8b // base_y + 2 2941 2942 trn1 v31.2s, v31.2s, v31.2s // {0,1,2,3,0,1,2,3} 2943 2944 add v24.8b, v30.8b, v19.8b // base_y + 3 2945 2946 trn1 v29.2s, v29.2s, v28.2s // base_y + 0, base_y + 2 2947 trn1 v30.2s, v30.2s, v24.2s // base_y + 1, base_y + 3 2948 2949 sub v28.8b, v26.8b, v27.8b // 64 - frac_y 2950 2951 trn1 v27.2s, v27.2s, v27.2s // frac_y 2952 trn1 v28.2s, v28.2s, v28.2s // 64 - frac_y 2953 2954 movi v24.8b, #4 29554: 2956 asr w9, w8, #6 // base_x 2957 dup v6.4h, w8 // xpos 2958 sub w8, w8, w6 // xpos -= dx 2959 cmp w9, #-4 // base_x <= -4 2960 asr w11, w8, #6 // base_x 2961 b.le 49f 2962 2963 dup v7.4h, w8 // xpos 2964 2965 ldr d2, [x2, w9, sxtw] // top[base_x] 2966 ldr d4, [x2, w11, sxtw] 2967 2968 trn1 v6.2d, v6.2d, v7.2d // xpos 2969 2970 tbl v16.8b, {v0.16b, v1.16b}, v29.8b // left[base_y+0], left[base_y+2] 2971 tbl v17.8b, {v0.16b, v1.16b}, v30.8b // left[base_y+1], left[base_y+3] 2972 2973 shrn v20.8b, v6.8h, #6 // first base_x for each row 2974 xtn v6.8b, v6.8h // (uint8_t)xpos 2975 2976 ext v3.8b, v2.8b, v2.8b, #1 // top[base_x+1] 2977 ext v5.8b, v4.8b, v4.8b, #1 2978 2979 and v6.8b, v6.8b, v25.8b // frac_x 2980 2981 trn1 v2.2s, v2.2s, v4.2s // top[base_x] 2982 trn1 v3.2s, v3.2s, v5.2s // top[base_x+1] 2983 2984 sub v7.8b, v26.8b, v6.8b // 64 - frac_x 2985 2986 add v20.8b, v20.8b, v31.8b // actual base_x 2987 2988 umull v16.8h, v16.8b, v28.8b // left[base_y]*(64-frac_y) 2989 umlal v16.8h, v17.8b, v27.8b // + left[base_y+1]*frac_y 2990 2991 umull v22.8h, v2.8b, v7.8b // top[base_x]*(64-frac_x) 2992 umlal v22.8h, v3.8b, v6.8b // + top[base_x+1]*frac_x 2993 2994 cmge v20.8b, v20.8b, #0 2995 2996 rshrn v16.8b, v16.8h, #6 2997 rshrn v22.8b, v22.8h, #6 2998 2999 bit v16.8b, v22.8b, v20.8b 3000 3001 st1 {v16.s}[0], [x0], x1 3002 sub w8, w8, w6 // xpos -= dx 3003 subs w5, w5, #2 3004 st1 {v16.s}[1], [x0], x1 3005 b.le 9f 3006 3007 add v29.8b, v29.8b, v24.8b // base_y += 4 3008 add v30.8b, v30.8b, v24.8b // base_y += 4 3009 b 4b 3010 301149: 3012 tbl v16.8b, {v0.16b, v1.16b}, v29.8b // left[base_y+0], left[base_y+2] 3013 tbl v17.8b, {v0.16b, v1.16b}, v30.8b // left[base_y+1], left[base_y+3] 3014 3015 umull v18.8h, v16.8b, v28.8b // left[base_y]*(64-frac_y) 3016 umlal v18.8h, v17.8b, v27.8b // + left[base_y+1]*frac_y 3017 rshrn v18.8b, v18.8h, #6 3018 3019 st1 {v18.s}[0], [x0], x1 3020 subs w5, w5, #2 3021 st1 {v18.s}[1], [x0], x1 3022 b.le 9f 3023 3024 add v29.8b, v29.8b, v24.8b // base_y += 4 3025 add v30.8b, v30.8b, v24.8b // base_y += 4 3026 b 49b 3027 30289: 3029 ret 3030 303180: 3032 dup v30.8h, w7 // -dy 3033 movi v17.8b, #1 3034 3035 mul v16.8h, v31.8h, v30.8h // {0,1,2,3,4,5,6,7}* -dy 3036 movi v25.16b, #0x3e 3037 add v30.8h, v16.8h, v30.8h // -= dy 3038 3039 xtn v31.8b, v31.8h // {0,1,2,3,4,5,6,7} 3040 3041 // For upsample_left, w <= 8 and h <= 8; we may need up to 2*h+1 elements.
3042 ld1 {v0.16b, v1.16b, v2.16b}, [x3] // left[] 3043 3044 movi v26.16b, #64 3045 movi v19.16b, #2 3046 3047 xtn v27.8b, v30.8h // (uint8_t)ypos 3048 shrn v29.8b, v30.8h, #6 // ypos >> 6 3049 and v27.8b, v27.8b, v25.8b // frac_y 3050 3051 add v29.8b, v29.8b, v19.8b // base_y = (ypos >> 6) + 2 3052 3053 add v28.8b, v29.8b, v17.8b // base_y + 1 3054 add v30.8b, v29.8b, v19.8b // base_y + 2 3055 3056 trn1 v31.2d, v31.2d, v31.2d // {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7} 3057 add v24.8b, v28.8b, v19.8b // base_y + 3 3058 3059 trn1 v29.2d, v29.2d, v30.2d // base_y + 0, base_y + 2 3060 trn1 v30.2d, v28.2d, v24.2d // base_y + 1, base_y + 3 3061 3062 sub v28.8b, v26.8b, v27.8b // 64 - frac_y 3063 3064 movi v24.16b, #4 3065 3066 trn1 v27.2d, v27.2d, v27.2d // frac_y 3067 trn1 v28.2d, v28.2d, v28.2d // 64 - frac_y 30688: 3069 asr w9, w8, #6 // base_x 3070 dup v16.8h, w8 // xpos 3071 sub w8, w8, w6 // xpos -= dx 3072 cmp w9, #-8 // base_x <= -8 3073 asr w11, w8, #6 // base_x 3074 b.le 89f 3075 3076 dup v17.8h, w8 // xpos 3077 3078 ldr q4, [x2, w9, sxtw] // top[base_x] 3079 ldr q6, [x2, w11, sxtw] 3080 3081 tbl v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0], left[base_y+2] 3082 tbl v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+3] 3083 3084 shrn v21.8b, v16.8h, #6 // first base_x 3085 shrn2 v21.16b, v17.8h, #6 3086 xtn v16.8b, v16.8h // (uint8_t)xpos 3087 xtn2 v16.16b, v17.8h 3088 3089 ext v5.16b, v4.16b, v4.16b, #1 // top[base_x+1] 3090 ext v7.16b, v6.16b, v6.16b, #1 3091 3092 and v16.16b, v16.16b, v25.16b // frac_x 3093 3094 trn1 v4.2d, v4.2d, v6.2d // top[base_x] 3095 trn1 v5.2d, v5.2d, v7.2d // top[base_x+1] 3096 3097 sub v7.16b, v26.16b, v16.16b // 64 - frac_x 3098 3099 add v21.16b, v21.16b, v31.16b // actual base_x 3100 3101 umull v6.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y) 3102 umlal v6.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y 3103 umull2 v17.8h, v18.16b, v28.16b 3104 umlal2 v17.8h, v19.16b, v27.16b 3105 3106 umull v22.8h, v4.8b, v7.8b // top[base_x]*(64-frac_x) 3107 umlal v22.8h, v5.8b, v16.8b // + top[base_x+1]*frac_x 3108 umull2 v23.8h, v4.16b, v7.16b 3109 umlal2 v23.8h, v5.16b, v16.16b 3110 3111 cmge v21.16b, v21.16b, #0 3112 3113 rshrn v6.8b, v6.8h, #6 3114 rshrn2 v6.16b, v17.8h, #6 3115 rshrn v22.8b, v22.8h, #6 3116 rshrn2 v22.16b, v23.8h, #6 3117 3118 bit v6.16b, v22.16b, v21.16b 3119 3120 st1 {v6.d}[0], [x0], x1 3121 sub w8, w8, w6 // xpos -= dx 3122 subs w5, w5, #2 3123 st1 {v6.d}[1], [x0], x1 3124 b.le 9f 3125 3126 add v29.16b, v29.16b, v24.16b // base_y += 4 3127 add v30.16b, v30.16b, v24.16b // base_y += 4 3128 b 8b 3129 313089: 3131 tbl v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0], left[base_y+2] 3132 tbl v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+3] 3133 3134 umull v6.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y) 3135 umlal v6.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y 3136 umull2 v17.8h, v18.16b, v28.16b 3137 umlal2 v17.8h, v19.16b, v27.16b 3138 3139 rshrn v6.8b, v6.8h, #6 3140 rshrn2 v6.16b, v17.8h, #6 3141 3142 st1 {v6.d}[0], [x0], x1 3143 subs w5, w5, #2 3144 st1 {v6.d}[1], [x0], x1 3145 b.le 9f 3146 3147 add v29.16b, v29.16b, v24.16b // base_y += 4 3148 add v30.16b, v30.16b, v24.16b // base_y += 4 3149 b 89b 3150 31519: 3152 ret 3153endfunc 3154 3155 3156// void ipred_z3_fill1_8bpc_neon(pixel *dst, const ptrdiff_t stride, 3157// const pixel *const left, 3158// const int width, const int height, 3159// const int dy, const int max_base_y); 3160function ipred_z3_fill1_8bpc_neon,
export=1 3161 cmp w6, #64 3162 clz w9, w3 3163 adr x8, L(ipred_z3_fill1_tbl) 3164 sub w9, w9, #25 3165 ldrh w9, [x8, w9, uxtw #1] 3166 add x10, x2, w6, uxtw // left[max_base_y] 3167 sub x8, x8, w9, uxtw 3168 movrel x11, increments 3169 ld1r {v31.16b}, [x10] // padding 3170 ld1 {v30.8h}, [x11] // increments 3171 mov w7, w5 3172 b.gt L(ipred_z3_fill1_large_h16) 3173 br x8 3174 317540: 3176 AARCH64_VALID_JUMP_TARGET 3177 dup v29.4h, w5 // dy 3178 3179 mul v30.4h, v30.4h, v29.4h // {0,1,2,3}*dy 3180 movi v23.16b, #0x3e 3181 3182 // Worst case max_base_y is width+height-1, for w=4, h=16, <= 32 3183 ld1 {v0.16b, v1.16b}, [x2] // left[] 3184 add v30.4h, v29.4h, v30.4h // ypos 3185 3186 movi v22.16b, #64 3187 movi v20.16b, #1 3188 movi v21.16b, #2 3189 3190 xtn v24.8b, v30.8h // (uint8_t)ypos 3191 uqshrn v26.8b, v30.8h, #6 // base 3192 and v24.8b, v24.8b, v23.8b // frac 3193 3194 mov v4.8b, v31.8b 3195 uqadd v27.8b, v26.8b, v20.8b // base + 1 3196 uqadd v28.8b, v26.8b, v21.8b // base + 2 3197 sub v25.8b, v22.8b, v24.8b // 64 - frac 3198 3199 tbx v4.8b, {v0.16b, v1.16b}, v26.8b // left[base] 3200 3201 trn1 v27.2s, v27.2s, v28.2s // base + 1, base + 2 3202 trn1 v24.2s, v24.2s, v24.2s // frac 3203 trn1 v25.2s, v25.2s, v25.2s // 64 - frac 32041: 3205 mov v5.8b, v31.8b 3206 tbx v5.8b, {v0.16b, v1.16b}, v27.8b // left[base+1], left[base+2] 3207 3208 trn1 v4.2s, v4.2s, v5.2s // left[base], left[base+1] 3209 3210 umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac) 3211 umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac 3212 rshrn v16.8b, v16.8h, #6 3213 st1 {v16.s}[0], [x0], x1 3214 subs w4, w4, #2 3215 st1 {v16.s}[1], [x0], x1 3216 b.le 9f 3217 3218 ext v4.8b, v5.8b, v5.8b, #4 3219 uqadd v27.8b, v27.8b, v21.8b // base += 2 3220 b 1b 3221 32229: 3223 ret 3224 322580: 3226 AARCH64_VALID_JUMP_TARGET 3227 dup v29.8h, w5 // dy 3228 3229 mul v30.8h, v30.8h, v29.8h // {0,1,2,3,4,5,6,7}*dy 3230 movi v23.16b, #0x3e 3231 3232 // Worst case max_base_y is width+height-1, for w=8, h=32, <= 48 3233 ld1 {v0.16b, v1.16b, v2.16b}, [x2] // left[] 3234 add v30.8h, v29.8h, v30.8h // ypos 3235 3236 movi v22.16b, #64 3237 movi v20.16b, #1 3238 movi v21.16b, #2 3239 3240 xtn v24.8b, v30.8h // (uint8_t)ypos 3241 uqshrn v26.8b, v30.8h, #6 // base 3242 and v24.8b, v24.8b, v23.8b // frac 3243 3244 mov v4.8b, v31.8b 3245 uqadd v27.8b, v26.8b, v20.8b // base + 1 3246 uqadd v28.8b, v26.8b, v21.8b // base + 2 3247 sub v25.8b, v22.8b, v24.8b // 64 - frac 3248 3249 tbx v4.8b, {v0.16b, v1.16b, v2.16b}, v26.8b // left[base] 32501: 3251 mov v5.8b, v31.8b 3252 mov v6.8b, v31.8b 3253 tbx v5.8b, {v0.16b, v1.16b, v2.16b}, v27.8b // left[base+1] 3254 tbx v6.8b, {v0.16b, v1.16b, v2.16b}, v28.8b // left[base+2] 3255 3256 umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac) 3257 umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac 3258 umull v17.8h, v5.8b, v25.8b 3259 umlal v17.8h, v6.8b, v24.8b 3260 rshrn v16.8b, v16.8h, #6 3261 rshrn v17.8b, v17.8h, #6 3262 st1 {v16.8b}, [x0], x1 3263 subs w4, w4, #2 3264 st1 {v17.8b}, [x0], x1 3265 b.le 9f 3266 3267 mov v4.8b, v6.8b 3268 uqadd v27.8b, v27.8b, v21.8b // base += 2 3269 uqadd v28.8b, v28.8b, v21.8b // base += 2 3270 b 1b 3271 32729: 3273 ret 3274 3275160: 3276 AARCH64_VALID_JUMP_TARGET 3277 dup v28.8h, w5 // dy 3278 3279 shl v29.8h, v28.8h, #3 // 8*dy 3280 mul v30.8h, v30.8h, v28.8h // {0,1,2,3,4,5,6,7}*dy 3281 movi v23.16b, #0x3e 3282 3283 // This is only executed if we've checked that max_base_y <= 64.
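// The z3 paths clamp the left[] lookup without branching: v31 holds
// left[max_base_y] in every lane (the ld1r above), uqshrn/uqadd
// saturate the base indices so base+1/base+2 cannot wrap around, and
// tbx leaves that padding value in any lane whose index lies outside
// the table loaded below. Per lane, roughly (our sketch; 64 is the
// table size of this particular path):
//
//   int base = ypos >> 6;   // uqshrn: saturates instead of wrapping
//   uint8_t px = base < 64 ? left[base] : left[max_base_y];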
3284 ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] // left[] 3285 add v28.8h, v28.8h, v30.8h // ypos 3286 3287 movi v22.16b, #64 3288 movi v20.16b, #1 3289 movi v21.16b, #2 3290 3291 add v29.8h, v28.8h, v29.8h // ypos + 8*dy 3292 3293 xtn v24.8b, v28.8h // (uint8_t)ypos 3294 xtn2 v24.16b, v29.8h 3295 uqshrn v26.8b, v28.8h, #6 // base 3296 uqshrn2 v26.16b, v29.8h, #6 3297 and v24.16b, v24.16b, v23.16b // frac 3298 3299 mov v4.16b, v31.16b 3300 uqadd v27.16b, v26.16b, v20.16b // base + 1 3301 uqadd v28.16b, v26.16b, v21.16b // base + 2 3302 sub v25.16b, v22.16b, v24.16b // 64 - frac 3303 3304 tbx v4.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v26.16b // left[base] 33051: 3306 mov v5.16b, v31.16b 3307 mov v6.16b, v31.16b 3308 tbx v5.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v27.16b // left[base+1] 3309 tbx v6.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v28.16b // left[base+2] 3310 3311 umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac) 3312 umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac 3313 umull2 v17.8h, v4.16b, v25.16b 3314 umlal2 v17.8h, v5.16b, v24.16b 3315 umull v18.8h, v5.8b, v25.8b 3316 umlal v18.8h, v6.8b, v24.8b 3317 umull2 v19.8h, v5.16b, v25.16b 3318 umlal2 v19.8h, v6.16b, v24.16b 3319 rshrn v16.8b, v16.8h, #6 3320 rshrn2 v16.16b, v17.8h, #6 3321 rshrn v17.8b, v18.8h, #6 3322 rshrn2 v17.16b, v19.8h, #6 3323 st1 {v16.16b}, [x0], x1 3324 subs w4, w4, #2 3325 st1 {v17.16b}, [x0], x1 3326 b.le 9f 3327 3328 mov v4.16b, v6.16b 3329 uqadd v27.16b, v27.16b, v21.16b // base += 2 3330 uqadd v28.16b, v28.16b, v21.16b // base += 2 3331 b 1b 3332 33339: 3334 ret 3335320: 3336640: 3337 AARCH64_VALID_JUMP_TARGET 3338 dup v28.8h, w5 // dy 3339 mov w12, w3 3340 3341 add x13, x0, x1 3342 3343 shl v29.8h, v28.8h, #3 // 8*dy 3344 mul v30.8h, v30.8h, v28.8h // {0,1,2,3,4,5,6,7}*dy 3345 movi v23.16b, #0x3e 3346 3347 lsl x1, x1, #1 3348 sub x1, x1, w3, uxtw 3349 add v30.8h, v28.8h, v30.8h // ypos 3350 3351 // This is only executed if we've checked that max_base_y <= 64. 
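// The 320/640 path below sweeps each pair of output rows 16 pixels at
// a time, stepping ypos by 16*dy per sweep step and rebasing it by
// 2 << 6 when moving down to the next row pair. Ignoring the
// saturating/clamped lookups described above, roughly (our sketch,
// not dav1d's reference code):
//
//   for (int y = 0; y < h; y += 2)          // two rows per pass
//       for (int x = 0; x < w; x++) {       // 16 lanes per step
//           int ypos = (x + 1) * dy + (y << 6);
//           int base = ypos >> 6, frac = ypos & 0x3e;
//           dst[y][x] = (left[base] * (64 - frac) +
//                        left[base + 1] * frac + 32) >> 6;
//       }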
3352 ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] // left[] 3353 3354 movi v22.16b, #64 3355 movi v20.16b, #1 3356 movi v21.16b, #2 3357 33581: 3359 mov v26.16b, v30.16b // reset ypos 3360 33612: 3362 add v27.8h, v26.8h, v29.8h // ypos + 8*dy 3363 uqshrn v16.8b, v26.8h, #6 // base 3364 uqshrn2 v16.16b, v27.8h, #6 3365 xtn v24.8b, v26.8h // (uint8_t)ypos 3366 xtn2 v24.16b, v27.8h 3367 umov w14, v16.b[0] 3368 and v24.16b, v24.16b, v23.16b // frac 3369 3370 uqadd v17.16b, v16.16b, v20.16b // base + 1 3371 cmp w14, w6 // base >= max_base_y 3372 uqadd v18.16b, v16.16b, v21.16b // base + 2 3373 sub v25.16b, v22.16b, v24.16b // 64 - frac 3374 3375 b.ge 4f 3376 3377 mov v4.16b, v31.16b 3378 mov v5.16b, v31.16b 3379 mov v6.16b, v31.16b 3380 tbx v4.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v16.16b // left[base] 3381 tbx v5.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v17.16b // left[base+1] 3382 tbx v6.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v18.16b // left[base+2] 3383 3384 subs w3, w3, #16 3385 umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac) 3386 umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac 3387 umull2 v17.8h, v4.16b, v25.16b 3388 umlal2 v17.8h, v5.16b, v24.16b 3389 umull v18.8h, v5.8b, v25.8b 3390 umlal v18.8h, v6.8b, v24.8b 3391 umull2 v19.8h, v5.16b, v25.16b 3392 umlal2 v19.8h, v6.16b, v24.16b 3393 rshrn v16.8b, v16.8h, #6 3394 rshrn2 v16.16b, v17.8h, #6 3395 rshrn v17.8b, v18.8h, #6 3396 rshrn2 v17.16b, v19.8h, #6 3397 st1 {v16.16b}, [x0], #16 3398 st1 {v17.16b}, [x13], #16 3399 b.le 3f 3400 add v26.8h, v27.8h, v29.8h // ypos += 16*dy 3401 b 2b 3402 34033: 3404 subs w4, w4, #2 3405 b.le 9f 3406 movi v16.8h, #128 3407 add x0, x0, x1 3408 add x13, x13, x1 3409 add v30.8h, v30.8h, v16.8h // ypos = dy + y*(1<<6)*2 3410 mov w3, w12 3411 b 1b 3412 34134: 3414 subs w3, w3, #16 3415 st1 {v31.16b}, [x0], #16 3416 st1 {v31.16b}, [x13], #16 3417 b.gt 4b 3418 b 3b 3419 34209: 3421 ret 3422 3423L(ipred_z3_fill1_large_h16): 3424 // Fallback case for max_base_y > 64; similar to the z1 3425 // implementation. This does the filtering vertically, filling out 3426 // a 2x pixel column at a time. 
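// In C terms, the fallback walks column-major (a hedged sketch; ypos
// starts at dy and advances by dy per column, base/frac are scalar
// per column, and the asm handles two columns per pass):
//
//   for (int x = 0; x < w; x++) {
//       int base = ypos >> 6, frac = ypos & 0x3e;
//       for (int y = 0; y < h; y++)
//           dst[y][x] = (left[base + y] * (64 - frac) +
//                        left[base + y + 1] * frac + 32) >> 6;
//       ypos += dy;
//   }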
3427 mov w15, #64 3428 add x13, x0, x1 3429 lsl x1, x1, #1 3430 3431 mov w12, w4 34321: 3433 lsr w8, w7, #6 // base 3434 and w9, w7, #0x3e // frac 3435 add w7, w7, w5 // ypos += dy 3436 cmp w8, w6 // base >= max_base_y 3437 lsr w10, w7, #6 // base 3438 and w11, w7, #0x3e // frac 3439 b.ge ipred_z3_fill_padding_neon 3440 add x8, x2, w8, uxtw 3441 add x10, x2, w10, uxtw 3442 dup v4.16b, w9 // frac 3443 dup v5.16b, w11 3444 ld1 {v0.16b, v1.16b}, [x8], #32 // left[base] 3445 ld1 {v2.16b, v3.16b}, [x10], #32 3446 sub w9, w15, w9 // 64 - frac 3447 sub w11, w15, w11 3448 dup v6.16b, w9 // 64 - frac 3449 dup v7.16b, w11 3450 add w7, w7, w5 // ypos += dy 34512: 3452 ext v16.16b, v0.16b, v1.16b, #1 // left[base+1] 3453 ext v17.16b, v2.16b, v3.16b, #1 3454 subs w4, w4, #16 3455 umull v18.8h, v16.8b, v4.8b // left[base+1]*frac 3456 umlal v18.8h, v0.8b, v6.8b // + left[base]*(64-frac) 3457 umull2 v19.8h, v16.16b, v4.16b 3458 umlal2 v19.8h, v0.16b, v6.16b 3459 umull v20.8h, v17.8b, v5.8b 3460 umlal v20.8h, v2.8b, v7.8b 3461 umull2 v21.8h, v17.16b, v5.16b 3462 umlal2 v21.8h, v2.16b, v7.16b 3463 rshrn v16.8b, v18.8h, #6 3464 rshrn2 v16.16b, v19.8h, #6 3465 rshrn v17.8b, v20.8h, #6 3466 rshrn2 v17.16b, v21.8h, #6 3467 zip1 v18.16b, v16.16b, v17.16b 3468 zip2 v19.16b, v16.16b, v17.16b 3469 st1 {v18.h}[0], [x0], x1 3470 st1 {v18.h}[1], [x13], x1 3471 st1 {v18.h}[2], [x0], x1 3472 st1 {v18.h}[3], [x13], x1 3473 st1 {v18.h}[4], [x0], x1 3474 st1 {v18.h}[5], [x13], x1 3475 st1 {v18.h}[6], [x0], x1 3476 st1 {v18.h}[7], [x13], x1 3477 st1 {v19.h}[0], [x0], x1 3478 st1 {v19.h}[1], [x13], x1 3479 st1 {v19.h}[2], [x0], x1 3480 st1 {v19.h}[3], [x13], x1 3481 st1 {v19.h}[4], [x0], x1 3482 st1 {v19.h}[5], [x13], x1 3483 st1 {v19.h}[6], [x0], x1 3484 st1 {v19.h}[7], [x13], x1 3485 b.le 3f 3486 mov v0.16b, v1.16b 3487 ld1 {v1.16b}, [x8], #16 // left[base] 3488 mov v2.16b, v3.16b 3489 ld1 {v3.16b}, [x10], #16 3490 b 2b 3491 34923: 3493 subs w3, w3, #2 3494 b.le 9f 3495 lsr x1, x1, #1 3496 msub x0, x1, x12, x0 // ptr -= h * stride 3497 msub x13, x1, x12, x13 3498 lsl x1, x1, #1 3499 add x0, x0, #2 3500 add x13, x13, #2 3501 mov w4, w12 3502 b 1b 35039: 3504 ret 3505 3506L(ipred_z3_fill1_tbl): 3507 .hword L(ipred_z3_fill1_tbl) - 640b 3508 .hword L(ipred_z3_fill1_tbl) - 320b 3509 .hword L(ipred_z3_fill1_tbl) - 160b 3510 .hword L(ipred_z3_fill1_tbl) - 80b 3511 .hword L(ipred_z3_fill1_tbl) - 40b 3512endfunc 3513 3514function ipred_z3_fill_padding_neon, export=0 3515 cmp w3, #16 3516 adr x8, L(ipred_z3_fill_padding_tbl) 3517 b.gt L(ipred_z3_fill_padding_wide) 3518 // w3 = remaining width, w4 = constant height 3519 mov w12, w4 3520 35211: 3522 // Fill a WxH rectangle with padding. W can be any number; 3523 // this fills the exact width by filling in the largest 3524 // power of two in the remaining width, and repeating. 
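// Equivalently, in C (our sketch; fill_rect stands in for the
// 2/4/8/16-wide store loops below, and this narrow path only runs
// for w <= 16):
//
//   while (w > 0) {
//       int p = 1 << (31 - __builtin_clz(w)); // largest power of two <= w
//       fill_rect(dst, stride, p, h);         // store v31, p bytes wide, h rows
//       dst += p;
//       w -= p;
//   }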
clz w9, w3 3526 sub w9, w9, #25 3527 ldrh w9, [x8, w9, uxtw #1] 3528 sub x9, x8, w9, uxtw 3529 br x9 3530 35312: 3532 AARCH64_VALID_JUMP_TARGET 3533 st1 {v31.h}[0], [x0], x1 3534 subs w4, w4, #4 3535 st1 {v31.h}[0], [x13], x1 3536 st1 {v31.h}[0], [x0], x1 3537 st1 {v31.h}[0], [x13], x1 3538 b.gt 2b 3539 subs w3, w3, #2 3540 lsr x1, x1, #1 3541 msub x0, x1, x12, x0 // ptr -= h * stride 3542 msub x13, x1, x12, x13 3543 b.le 9f 3544 lsl x1, x1, #1 3545 add x0, x0, #2 3546 add x13, x13, #2 3547 mov w4, w12 3548 b 1b 3549 35504: 3551 AARCH64_VALID_JUMP_TARGET 3552 st1 {v31.s}[0], [x0], x1 3553 subs w4, w4, #4 3554 st1 {v31.s}[0], [x13], x1 3555 st1 {v31.s}[0], [x0], x1 3556 st1 {v31.s}[0], [x13], x1 3557 b.gt 4b 3558 subs w3, w3, #4 3559 lsr x1, x1, #1 3560 msub x0, x1, x12, x0 // ptr -= h * stride 3561 msub x13, x1, x12, x13 3562 b.le 9f 3563 lsl x1, x1, #1 3564 add x0, x0, #4 3565 add x13, x13, #4 3566 mov w4, w12 3567 b 1b 3568 35698: 3570 AARCH64_VALID_JUMP_TARGET 3571 st1 {v31.8b}, [x0], x1 3572 subs w4, w4, #4 3573 st1 {v31.8b}, [x13], x1 3574 st1 {v31.8b}, [x0], x1 3575 st1 {v31.8b}, [x13], x1 3576 b.gt 8b 3577 subs w3, w3, #8 3578 lsr x1, x1, #1 3579 msub x0, x1, x12, x0 // ptr -= h * stride 3580 msub x13, x1, x12, x13 3581 b.le 9f 3582 lsl x1, x1, #1 3583 add x0, x0, #8 3584 add x13, x13, #8 3585 mov w4, w12 3586 b 1b 3587 358816: 358932: 359064: 3591 AARCH64_VALID_JUMP_TARGET 3592 st1 {v31.16b}, [x0], x1 3593 subs w4, w4, #4 3594 st1 {v31.16b}, [x13], x1 3595 st1 {v31.16b}, [x0], x1 3596 st1 {v31.16b}, [x13], x1 3597 b.gt 16b 3598 subs w3, w3, #16 3599 lsr x1, x1, #1 3600 msub x0, x1, x12, x0 // ptr -= h * stride 3601 msub x13, x1, x12, x13 3602 b.le 9f 3603 lsl x1, x1, #1 3604 add x0, x0, #16 3605 add x13, x13, #16 3606 mov w4, w12 3607 b 1b 3608 36099: 3610 ret 3611 3612L(ipred_z3_fill_padding_tbl): 3613 .hword L(ipred_z3_fill_padding_tbl) - 64b 3614 .hword L(ipred_z3_fill_padding_tbl) - 32b 3615 .hword L(ipred_z3_fill_padding_tbl) - 16b 3616 .hword L(ipred_z3_fill_padding_tbl) - 8b 3617 .hword L(ipred_z3_fill_padding_tbl) - 4b 3618 .hword L(ipred_z3_fill_padding_tbl) - 2b 3619 3620L(ipred_z3_fill_padding_wide): 3621 // Fill a WxH rectangle with padding, with W > 16. 3622 lsr x1, x1, #1 3623 mov w12, w3 3624 sub x1, x1, w3, uxtw 36251: 3626 ands w5, w3, #15 3627 b.eq 2f 3628 // If the width isn't aligned to 16, first do one 16 byte write 3629 // and align the start pointer. 3630 sub w3, w3, w5 3631 st1 {v31.16b}, [x0] 3632 add x0, x0, w5, uxtw 36332: 3634 // Fill the rest of the line with aligned 16 byte writes. 3635 subs w3, w3, #16 3636 st1 {v31.16b}, [x0], #16 3637 b.gt 2b 3638 subs w4, w4, #1 3639 add x0, x0, x1 3640 b.le 9f 3641 mov w3, w12 3642 b 1b 36439: 3644 ret 3645endfunc 3646 3647function ipred_z3_fill2_8bpc_neon, export=1 3648 cmp w3, #8 3649 add x10, x2, w6, uxtw // left[max_base_y] 3650 movrel x11, increments 3651 ld1r {v31.16b}, [x10] // padding 3652 ld1 {v30.8h}, [x11] // increments 3653 b.eq 80f 3654 365540: // w == 4 3656 dup v29.4h, w5 // dy 3657 3658 mul v30.4h, v30.4h, v29.4h // {0,1,2,3}*dy 3659 movi v23.16b, #0x3e 3660 3661 // Worst case max_base_y is 2*(width+height)-2, but width+height <= 16, 3662 // so max_base_y <= 32.
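// Worked out: with upsampling, width and height are both <= 8, so
// 2*(width+height)-2 <= 2*16-2 = 30, and the 32 bytes of left[]
// loaded below cover every in-range index; anything past max_base_y
// still falls back to the padding value via the tbx pattern
// described above.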
3663 ld1 {v0.16b, v1.16b}, [x2] // left[] 3664 add v30.4h, v29.4h, v30.4h // ypos 3665 3666 movi v22.16b, #64 3667 movi v20.16b, #1 3668 movi v21.16b, #2 3669 3670 xtn v24.8b, v30.8h // (uint8_t)ypos 3671 uqshrn v26.8b, v30.8h, #6 // base 3672 and v24.8b, v24.8b, v23.8b // frac 3673 3674 uqadd v27.8b, v26.8b, v20.8b // base + 1 3675 uqadd v28.8b, v26.8b, v21.8b // base + 2 3676 sub v25.8b, v22.8b, v24.8b // 64 - frac 3677 uqadd v29.8b, v27.8b, v21.8b // base + 3 3678 3679 trn1 v24.2s, v24.2s, v24.2s // frac 3680 trn1 v26.2s, v26.2s, v28.2s // base + 0, base + 2 3681 trn1 v27.2s, v27.2s, v29.2s // base + 1, base + 3 3682 trn1 v25.2s, v25.2s, v25.2s // 64 - frac 3683 3684 movi v21.16b, #4 36851: 3686 mov v4.8b, v31.8b 3687 mov v5.8b, v31.8b 3688 tbx v4.8b, {v0.16b, v1.16b}, v26.8b // left[base], left[base+2] 3689 tbx v5.8b, {v0.16b, v1.16b}, v27.8b // left[base+1], left[base+3] 3690 3691 umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac) 3692 umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac 3693 rshrn v16.8b, v16.8h, #6 3694 st1 {v16.s}[0], [x0], x1 3695 subs w4, w4, #2 3696 st1 {v16.s}[1], [x0], x1 3697 b.le 9f 3698 3699 uqadd v26.8b, v26.8b, v21.8b // base += 4 3700 uqadd v27.8b, v27.8b, v21.8b // base += 4 3701 b 1b 3702 37039: 3704 ret 3705 370680: // w == 8 3707 dup v29.8h, w5 // dy 3708 3709 mul v30.8h, v30.8h, v29.8h // {0,1,2,3,4,5,6,7}*dy 3710 movi v23.16b, #0x3e 3711 3712 // Worst case max_base_y is 2*(width+height)-2, but width+height <= 16, 3713 // so max_base_y <= 32. 3714 ld1 {v0.16b, v1.16b}, [x2] // left[] 3715 add v30.8h, v29.8h, v30.8h // ypos 3716 3717 movi v22.16b, #64 3718 movi v20.16b, #1 3719 movi v21.16b, #2 3720 3721 xtn v24.8b, v30.8h // (uint8_t)ypos 3722 uqshrn v26.8b, v30.8h, #6 // base 3723 and v24.8b, v24.8b, v23.8b // frac 3724 3725 uqadd v27.8b, v26.8b, v20.8b // base + 1 3726 uqadd v28.8b, v26.8b, v21.8b // base + 2 3727 sub v25.8b, v22.8b, v24.8b // 64 - frac 3728 uqadd v29.8b, v27.8b, v21.8b // base + 3 3729 3730 trn1 v24.2d, v24.2d, v24.2d // frac 3731 trn1 v26.2d, v26.2d, v28.2d // base + 0, base + 2 3732 trn1 v27.2d, v27.2d, v29.2d // base + 1, base + 3 3733 trn1 v25.2d, v25.2d, v25.2d // 64 - frac 3734 3735 movi v21.16b, #4 37361: 3737 mov v4.16b, v31.16b 3738 mov v5.16b, v31.16b 3739 tbx v4.16b, {v0.16b, v1.16b}, v26.16b // left[base], left[base+2] 3740 tbx v5.16b, {v0.16b, v1.16b}, v27.16b // left[base+1], left[base+3] 3741 3742 umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac) 3743 umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac 3744 umull2 v17.8h, v4.16b, v25.16b 3745 umlal2 v17.8h, v5.16b, v24.16b 3746 rshrn v16.8b, v16.8h, #6 3747 rshrn v17.8b, v17.8h, #6 3748 st1 {v16.8b}, [x0], x1 3749 subs w4, w4, #2 3750 st1 {v17.8b}, [x0], x1 3751 b.le 9f 3752 3753 uqadd v26.16b, v26.16b, v21.16b // base += 4 3754 uqadd v27.16b, v27.16b, v21.16b // base += 4 3755 b 1b 3756 37579: 3758 ret 3759endfunc 3760 3761 3762// void ipred_filter_8bpc_neon(pixel *dst, const ptrdiff_t stride, 3763// const pixel *const topleft, 3764// const int width, const int height, const int filt_idx, 3765// const int max_width, const int max_height); 3766function ipred_filter_8bpc_neon, export=1 3767 and w5, w5, #511 3768 movrel x6, X(filter_intra_taps) 3769 lsl w5, w5, #6 3770 add x6, x6, w5, uxtw 3771 ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32 3772 clz w9, w3 3773 adr x5, L(ipred_filter_tbl) 3774 ld1 {v20.8b, v21.8b, v22.8b}, [x6] 3775 sub w9, w9, #26 3776 ldrh w9, [x5, w9, uxtw #1] 3777 sxtl v16.8h, v16.8b 3778 sxtl v17.8h, v17.8b 3779
sub x5, x5, w9, uxtw 3780 sxtl v18.8h, v18.8b 3781 sxtl v19.8h, v19.8b 3782 add x6, x0, x1 3783 lsl x1, x1, #1 3784 sxtl v20.8h, v20.8b 3785 sxtl v21.8h, v21.8b 3786 sxtl v22.8h, v22.8b 3787 br x5 378840: 3789 AARCH64_VALID_JUMP_TARGET 3790 ldur s0, [x2, #1] // top (0-3) 3791 sub x2, x2, #2 3792 mov x7, #-2 3793 uxtl v0.8h, v0.8b // top (0-3) 37944: 3795 ld1 {v1.s}[0], [x2], x7 // left (0-1) + topleft (2) 3796 mul v2.8h, v17.8h, v0.h[0] // p1(top[0]) * filter(1) 3797 mla v2.8h, v18.8h, v0.h[1] // p2(top[1]) * filter(2) 3798 mla v2.8h, v19.8h, v0.h[2] // p3(top[2]) * filter(3) 3799 uxtl v1.8h, v1.8b // left (0-1) + topleft (2) 3800 mla v2.8h, v20.8h, v0.h[3] // p4(top[3]) * filter(4) 3801 mla v2.8h, v16.8h, v1.h[2] // p0(topleft) * filter(0) 3802 mla v2.8h, v21.8h, v1.h[1] // p5(left[0]) * filter(5) 3803 mla v2.8h, v22.8h, v1.h[0] // p6(left[1]) * filter(6) 3804 sqrshrun v2.8b, v2.8h, #4 3805 subs w4, w4, #2 3806 st1 {v2.s}[0], [x0], x1 3807 uxtl v0.8h, v2.8b 3808 st1 {v2.s}[1], [x6], x1 3809 ext v0.16b, v0.16b, v0.16b, #8 // move top from [4-7] to [0-3] 3810 b.gt 4b 3811 ret 381280: 3813 AARCH64_VALID_JUMP_TARGET 3814 ldur d0, [x2, #1] // top (0-7) 3815 sub x2, x2, #2 3816 mov x7, #-2 3817 uxtl v0.8h, v0.8b // top (0-7) 38188: 3819 ld1 {v1.s}[0], [x2], x7 // left (0-1) + topleft (2) 3820 mul v2.8h, v17.8h, v0.h[0] // p1(top[0]) * filter(1) 3821 mla v2.8h, v18.8h, v0.h[1] // p2(top[1]) * filter(2) 3822 mla v2.8h, v19.8h, v0.h[2] // p3(top[2]) * filter(3) 3823 uxtl v1.8h, v1.8b // left (0-1) + topleft (2) 3824 mla v2.8h, v20.8h, v0.h[3] // p4(top[3]) * filter(4) 3825 mla v2.8h, v16.8h, v1.h[2] // p0(topleft) * filter(0) 3826 mla v2.8h, v21.8h, v1.h[1] // p5(left[0]) * filter(5) 3827 mla v2.8h, v22.8h, v1.h[0] // p6(left[1]) * filter(6) 3828 mul v3.8h, v17.8h, v0.h[4] // p1(top[0]) * filter(1) 3829 mla v3.8h, v18.8h, v0.h[5] // p2(top[1]) * filter(2) 3830 mla v3.8h, v19.8h, v0.h[6] // p3(top[2]) * filter(3) 3831 sqrshrun v2.8b, v2.8h, #4 3832 uxtl v1.8h, v2.8b // first block, in 16 bit 3833 mla v3.8h, v20.8h, v0.h[7] // p4(top[3]) * filter(4) 3834 mla v3.8h, v16.8h, v0.h[3] // p0(topleft) * filter(0) 3835 mla v3.8h, v21.8h, v1.h[3] // p5(left[0]) * filter(5) 3836 mla v3.8h, v22.8h, v1.h[7] // p6(left[1]) * filter(6) 3837 sqrshrun v3.8b, v3.8h, #4 3838 subs w4, w4, #2 3839 st2 {v2.s, v3.s}[0], [x0], x1 3840 zip2 v0.2s, v2.2s, v3.2s 3841 st2 {v2.s, v3.s}[1], [x6], x1 3842 uxtl v0.8h, v0.8b 3843 b.gt 8b 3844 ret 3845160: 3846320: 3847 AARCH64_VALID_JUMP_TARGET 3848 add x8, x2, #1 3849 sub x2, x2, #2 3850 mov x7, #-2 3851 sub x1, x1, w3, uxtw 3852 mov w9, w3 3853 38541: 3855 ld1 {v0.s}[0], [x2], x7 // left (0-1) + topleft (2) 3856 uxtl v0.8h, v0.8b // left (0-1) + topleft (2) 38572: 3858 ld1 {v2.16b}, [x8], #16 // top(0-15) 3859 mul v3.8h, v16.8h, v0.h[2] // p0(topleft) * filter(0) 3860 mla v3.8h, v21.8h, v0.h[1] // p5(left[0]) * filter(5) 3861 uxtl v1.8h, v2.8b // top(0-7) 3862 uxtl2 v2.8h, v2.16b // top(8-15) 3863 mla v3.8h, v22.8h, v0.h[0] // p6(left[1]) * filter(6) 3864 mla v3.8h, v17.8h, v1.h[0] // p1(top[0]) * filter(1) 3865 mla v3.8h, v18.8h, v1.h[1] // p2(top[1]) * filter(2) 3866 mla v3.8h, v19.8h, v1.h[2] // p3(top[2]) * filter(3) 3867 mla v3.8h, v20.8h, v1.h[3] // p4(top[3]) * filter(4) 3868 3869 mul v4.8h, v17.8h, v1.h[4] // p1(top[0]) * filter(1) 3870 mla v4.8h, v18.8h, v1.h[5] // p2(top[1]) * filter(2) 3871 mla v4.8h, v19.8h, v1.h[6] // p3(top[2]) * filter(3) 3872 sqrshrun v3.8b, v3.8h, #4 3873 uxtl v0.8h, v3.8b // first block, in 16 bit 3874 mla v4.8h, v20.8h, v1.h[7] // p4(top[3]) 
* filter(4) 3875 mla v4.8h, v16.8h, v1.h[3] // p0(topleft) * filter(0) 3876 mla v4.8h, v21.8h, v0.h[3] // p5(left[0]) * filter(5) 3877 mla v4.8h, v22.8h, v0.h[7] // p6(left[1]) * filter(6) 3878 3879 mul v5.8h, v17.8h, v2.h[0] // p1(top[0]) * filter(1) 3880 mla v5.8h, v18.8h, v2.h[1] // p2(top[1]) * filter(2) 3881 mla v5.8h, v19.8h, v2.h[2] // p3(top[2]) * filter(3) 3882 sqrshrun v4.8b, v4.8h, #4 3883 uxtl v0.8h, v4.8b // second block, in 16 bit 3884 mla v5.8h, v20.8h, v2.h[3] // p4(top[3]) * filter(4) 3885 mla v5.8h, v16.8h, v1.h[7] // p0(topleft) * filter(0) 3886 mla v5.8h, v21.8h, v0.h[3] // p5(left[0]) * filter(5) 3887 mla v5.8h, v22.8h, v0.h[7] // p6(left[1]) * filter(6) 3888 3889 mul v6.8h, v17.8h, v2.h[4] // p1(top[0]) * filter(1) 3890 mla v6.8h, v18.8h, v2.h[5] // p2(top[1]) * filter(2) 3891 mla v6.8h, v19.8h, v2.h[6] // p3(top[2]) * filter(3) 3892 sqrshrun v5.8b, v5.8h, #4 3893 uxtl v0.8h, v5.8b // third block, in 16 bit 3894 mla v6.8h, v20.8h, v2.h[7] // p4(top[3]) * filter(4) 3895 mla v6.8h, v16.8h, v2.h[3] // p0(topleft) * filter(0) 3896 mla v6.8h, v21.8h, v0.h[3] // p5(left[0]) * filter(5) 3897 mla v6.8h, v22.8h, v0.h[7] // p6(left[1]) * filter(6) 3898 3899 subs w3, w3, #16 3900 sqrshrun v6.8b, v6.8h, #4 3901 3902 st4 {v3.s, v4.s, v5.s, v6.s}[0], [x0], #16 3903 st4 {v3.s, v4.s, v5.s, v6.s}[1], [x6], #16 3904 b.le 8f 3905 ins v0.h[2], v2.h[7] 3906 ins v0.b[0], v6.b[7] 3907 ins v0.b[2], v6.b[3] 3908 b 2b 39098: 3910 subs w4, w4, #2 3911 b.le 9f 3912 sub x8, x6, w9, uxtw 3913 add x0, x0, x1 3914 add x6, x6, x1 3915 mov w3, w9 3916 b 1b 39179: 3918 ret 3919 3920L(ipred_filter_tbl): 3921 .hword L(ipred_filter_tbl) - 320b 3922 .hword L(ipred_filter_tbl) - 160b 3923 .hword L(ipred_filter_tbl) - 80b 3924 .hword L(ipred_filter_tbl) - 40b 3925endfunc 3926 3927// void pal_pred_8bpc_neon(pixel *dst, const ptrdiff_t stride, 3928// const pixel *const pal, const uint8_t *idx, 3929// const int w, const int h); 3930function pal_pred_8bpc_neon, export=1 3931 ld1 {v0.8b}, [x2] 3932 clz w9, w4 3933 adr x6, L(pal_pred_tbl) 3934 sub w9, w9, #25 3935 movi v31.16b, #7 3936 ldrh w9, [x6, w9, uxtw #1] 3937 sub x6, x6, w9, uxtw 3938 add x2, x0, x1 3939 lsl x1, x1, #1 3940 br x6 39414: 3942 AARCH64_VALID_JUMP_TARGET 3943 ld1 {v1.8b}, [x3], #8 3944 subs w5, w5, #4 3945 ushr v3.8b, v1.8b, #4 3946 and v2.8b, v1.8b, v31.8b 3947 zip1 v1.16b, v2.16b, v3.16b 3948 tbl v1.16b, {v0.16b}, v1.16b 3949 st1 {v1.s}[0], [x0], x1 3950 st1 {v1.s}[1], [x2], x1 3951 st1 {v1.s}[2], [x0], x1 3952 st1 {v1.s}[3], [x2], x1 3953 b.gt 4b 3954 ret 39558: 3956 AARCH64_VALID_JUMP_TARGET 3957 ld1 {v1.16b}, [x3], #16 3958 subs w5, w5, #4 3959 ushr v4.16b, v1.16b, #4 3960 and v3.16b, v1.16b, v31.16b 3961 zip1 v1.16b, v3.16b, v4.16b 3962 zip2 v2.16b, v3.16b, v4.16b 3963 tbl v1.16b, {v0.16b}, v1.16b 3964 st1 {v1.d}[0], [x0], x1 3965 tbl v2.16b, {v0.16b}, v2.16b 3966 st1 {v1.d}[1], [x2], x1 3967 st1 {v2.d}[0], [x0], x1 3968 st1 {v2.d}[1], [x2], x1 3969 b.gt 8b 3970 ret 397116: 3972 AARCH64_VALID_JUMP_TARGET 3973 ld1 {v1.16b, v2.16b}, [x3], #32 3974 subs w5, w5, #4 3975 ushr v5.16b, v1.16b, #4 3976 and v4.16b, v1.16b, v31.16b 3977 ushr v7.16b, v2.16b, #4 3978 and v6.16b, v2.16b, v31.16b 3979 zip1 v1.16b, v4.16b, v5.16b 3980 zip2 v2.16b, v4.16b, v5.16b 3981 zip1 v3.16b, v6.16b, v7.16b 3982 tbl v1.16b, {v0.16b}, v1.16b 3983 zip2 v4.16b, v6.16b, v7.16b 3984 tbl v2.16b, {v0.16b}, v2.16b 3985 st1 {v1.16b}, [x0], x1 3986 tbl v3.16b, {v0.16b}, v3.16b 3987 st1 {v2.16b}, [x2], x1 3988 tbl v4.16b, {v0.16b}, v4.16b 3989 st1 {v3.16b}, [x0], x1 3990 st1 
{v4.16b}, [x2], x1 3991 b.gt 16b 3992 ret 399332: 3994 AARCH64_VALID_JUMP_TARGET 3995 ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64 3996 subs w5, w5, #4 3997 ushr v21.16b, v16.16b, #4 3998 and v20.16b, v16.16b, v31.16b 3999 ushr v23.16b, v17.16b, #4 4000 and v22.16b, v17.16b, v31.16b 4001 ushr v25.16b, v18.16b, #4 4002 and v24.16b, v18.16b, v31.16b 4003 ushr v27.16b, v19.16b, #4 4004 and v26.16b, v19.16b, v31.16b 4005 zip1 v16.16b, v20.16b, v21.16b 4006 zip2 v17.16b, v20.16b, v21.16b 4007 zip1 v18.16b, v22.16b, v23.16b 4008 zip2 v19.16b, v22.16b, v23.16b 4009 zip1 v20.16b, v24.16b, v25.16b 4010 zip2 v21.16b, v24.16b, v25.16b 4011 tbl v16.16b, {v0.16b}, v16.16b 4012 zip1 v22.16b, v26.16b, v27.16b 4013 tbl v17.16b, {v0.16b}, v17.16b 4014 zip2 v23.16b, v26.16b, v27.16b 4015 tbl v18.16b, {v0.16b}, v18.16b 4016 tbl v19.16b, {v0.16b}, v19.16b 4017 tbl v20.16b, {v0.16b}, v20.16b 4018 st1 {v16.16b, v17.16b}, [x0], x1 4019 tbl v21.16b, {v0.16b}, v21.16b 4020 st1 {v18.16b, v19.16b}, [x2], x1 4021 tbl v22.16b, {v0.16b}, v22.16b 4022 st1 {v20.16b, v21.16b}, [x0], x1 4023 tbl v23.16b, {v0.16b}, v23.16b 4024 st1 {v22.16b, v23.16b}, [x2], x1 4025 b.gt 32b 4026 ret 402764: 4028 AARCH64_VALID_JUMP_TARGET 4029 ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64 4030 subs w5, w5, #2 4031 ushr v21.16b, v16.16b, #4 4032 and v20.16b, v16.16b, v31.16b 4033 ushr v23.16b, v17.16b, #4 4034 and v22.16b, v17.16b, v31.16b 4035 ushr v25.16b, v18.16b, #4 4036 and v24.16b, v18.16b, v31.16b 4037 ushr v27.16b, v19.16b, #4 4038 and v26.16b, v19.16b, v31.16b 4039 zip1 v16.16b, v20.16b, v21.16b 4040 zip2 v17.16b, v20.16b, v21.16b 4041 zip1 v18.16b, v22.16b, v23.16b 4042 zip2 v19.16b, v22.16b, v23.16b 4043 zip1 v20.16b, v24.16b, v25.16b 4044 zip2 v21.16b, v24.16b, v25.16b 4045 tbl v16.16b, {v0.16b}, v16.16b 4046 zip1 v22.16b, v26.16b, v27.16b 4047 tbl v17.16b, {v0.16b}, v17.16b 4048 zip2 v23.16b, v26.16b, v27.16b 4049 tbl v18.16b, {v0.16b}, v18.16b 4050 tbl v19.16b, {v0.16b}, v19.16b 4051 st1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1 4052 tbl v20.16b, {v0.16b}, v20.16b 4053 tbl v21.16b, {v0.16b}, v21.16b 4054 tbl v22.16b, {v0.16b}, v22.16b 4055 tbl v23.16b, {v0.16b}, v23.16b 4056 st1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x1 4057 b.gt 64b 4058 ret 4059 4060L(pal_pred_tbl): 4061 .hword L(pal_pred_tbl) - 64b 4062 .hword L(pal_pred_tbl) - 32b 4063 .hword L(pal_pred_tbl) - 16b 4064 .hword L(pal_pred_tbl) - 8b 4065 .hword L(pal_pred_tbl) - 4b 4066endfunc 4067 4068// void ipred_cfl_128_8bpc_neon(pixel *dst, const ptrdiff_t stride, 4069// const pixel *const topleft, 4070// const int width, const int height, 4071// const int16_t *ac, const int alpha); 4072function ipred_cfl_128_8bpc_neon, export=1 4073 clz w9, w3 4074 adr x7, L(ipred_cfl_128_tbl) 4075 sub w9, w9, #26 4076 ldrh w9, [x7, w9, uxtw #1] 4077 movi v0.8h, #128 // dc 4078 dup v1.8h, w6 // alpha 4079 sub x7, x7, w9, uxtw 4080 add x6, x0, x1 4081 lsl x1, x1, #1 4082 br x7 4083L(ipred_cfl_splat_w4): 4084 AARCH64_VALID_JUMP_TARGET 4085 ld1 {v2.8h, v3.8h}, [x5], #32 4086 mul v2.8h, v2.8h, v1.8h // diff = ac * alpha 4087 mul v3.8h, v3.8h, v1.8h 4088 cmlt v4.8h, v2.8h, #0 // sign 4089 cmlt v5.8h, v3.8h, #0 4090 add v2.8h, v2.8h, v4.8h // diff + sign 4091 add v3.8h, v3.8h, v5.8h 4092 srshr v2.8h, v2.8h, #6 // (diff + sign + 32) >> 6 = apply_sign() 4093 srshr v3.8h, v3.8h, #6 4094 add v2.8h, v2.8h, v0.8h // dc + apply_sign() 4095 add v3.8h, v3.8h, v0.8h 4096 sqxtun v2.8b, v2.8h // iclip_pixel(dc + apply_sign()) 4097 sqxtun v3.8b, v3.8h 4098 st1 {v2.s}[0], [x0], x1 4099 

// void ipred_cfl_128_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                              const pixel *const topleft,
//                              const int width, const int height,
//                              const int16_t *ac, const int alpha);
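//
// All ipred_cfl_* entry points funnel into the shared
// L(ipred_cfl_splat_wN) loops below. Per pixel they compute, as a
// scalar sketch (illustrative names, not dav1d's C fallback):
//
//     int diff = alpha * ac[x];                    // mul
//     int sign = diff < 0 ? -1 : 0;                // cmlt
//     int px   = dc + ((diff + sign + 32) >> 6);   // add + srshr #6
//     dst[x]   = px < 0 ? 0 : px > 255 ? 255 : px; // sqxtun
//
// The added sign term makes the rounding symmetric for negative diffs
// (AV1's apply_sign()); for ipred_cfl_128 the dc term is simply 128.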
function ipred_cfl_128_8bpc_neon, export=1
        clz             w9, w3
        adr             x7, L(ipred_cfl_128_tbl)
        sub             w9, w9, #26
        ldrh            w9, [x7, w9, uxtw #1]
        movi            v0.8h, #128 // dc
        dup             v1.8h, w6   // alpha
        sub             x7, x7, w9, uxtw
        add             x6, x0, x1
        lsl             x1, x1, #1
        br              x7
L(ipred_cfl_splat_w4):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8h, v3.8h}, [x5], #32
        mul             v2.8h, v2.8h, v1.8h  // diff = ac * alpha
        mul             v3.8h, v3.8h, v1.8h
        cmlt            v4.8h, v2.8h, #0     // sign
        cmlt            v5.8h, v3.8h, #0
        add             v2.8h, v2.8h, v4.8h  // diff + sign
        add             v3.8h, v3.8h, v5.8h
        srshr           v2.8h, v2.8h, #6     // (diff + sign + 32) >> 6 = apply_sign()
        srshr           v3.8h, v3.8h, #6
        add             v2.8h, v2.8h, v0.8h  // dc + apply_sign()
        add             v3.8h, v3.8h, v0.8h
        sqxtun          v2.8b, v2.8h         // iclip_pixel(dc + apply_sign())
        sqxtun          v3.8b, v3.8h
        st1             {v2.s}[0], [x0], x1
        st1             {v2.s}[1], [x6], x1
        subs            w4, w4, #4
        st1             {v3.s}[0], [x0], x1
        st1             {v3.s}[1], [x6], x1
        b.gt            L(ipred_cfl_splat_w4)
        ret
L(ipred_cfl_splat_w8):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x5], #64
        mul             v2.8h, v2.8h, v1.8h  // diff = ac * alpha
        mul             v3.8h, v3.8h, v1.8h
        mul             v4.8h, v4.8h, v1.8h
        mul             v5.8h, v5.8h, v1.8h
        cmlt            v16.8h, v2.8h, #0    // sign
        cmlt            v17.8h, v3.8h, #0
        cmlt            v18.8h, v4.8h, #0
        cmlt            v19.8h, v5.8h, #0
        add             v2.8h, v2.8h, v16.8h // diff + sign
        add             v3.8h, v3.8h, v17.8h
        add             v4.8h, v4.8h, v18.8h
        add             v5.8h, v5.8h, v19.8h
        srshr           v2.8h, v2.8h, #6     // (diff + sign + 32) >> 6 = apply_sign()
        srshr           v3.8h, v3.8h, #6
        srshr           v4.8h, v4.8h, #6
        srshr           v5.8h, v5.8h, #6
        add             v2.8h, v2.8h, v0.8h  // dc + apply_sign()
        add             v3.8h, v3.8h, v0.8h
        add             v4.8h, v4.8h, v0.8h
        add             v5.8h, v5.8h, v0.8h
        sqxtun          v2.8b, v2.8h         // iclip_pixel(dc + apply_sign())
        sqxtun          v3.8b, v3.8h
        sqxtun          v4.8b, v4.8h
        sqxtun          v5.8b, v5.8h
        st1             {v2.8b}, [x0], x1
        st1             {v3.8b}, [x6], x1
        subs            w4, w4, #4
        st1             {v4.8b}, [x0], x1
        st1             {v5.8b}, [x6], x1
        b.gt            L(ipred_cfl_splat_w8)
        ret
L(ipred_cfl_splat_w16):
        AARCH64_VALID_JUMP_TARGET
        add             x7, x5, w3, uxtw #1
        sub             x1, x1, w3, uxtw
        mov             w9, w3
1:
        ld1             {v2.8h, v3.8h}, [x5], #32
        ld1             {v4.8h, v5.8h}, [x7], #32
        mul             v2.8h, v2.8h, v1.8h  // diff = ac * alpha
        mul             v3.8h, v3.8h, v1.8h
        mul             v4.8h, v4.8h, v1.8h
        mul             v5.8h, v5.8h, v1.8h
        cmlt            v16.8h, v2.8h, #0    // sign
        cmlt            v17.8h, v3.8h, #0
        cmlt            v18.8h, v4.8h, #0
        cmlt            v19.8h, v5.8h, #0
        add             v2.8h, v2.8h, v16.8h // diff + sign
        add             v3.8h, v3.8h, v17.8h
        add             v4.8h, v4.8h, v18.8h
        add             v5.8h, v5.8h, v19.8h
        srshr           v2.8h, v2.8h, #6     // (diff + sign + 32) >> 6 = apply_sign()
        srshr           v3.8h, v3.8h, #6
        srshr           v4.8h, v4.8h, #6
        srshr           v5.8h, v5.8h, #6
        add             v2.8h, v2.8h, v0.8h  // dc + apply_sign()
        add             v3.8h, v3.8h, v0.8h
        add             v4.8h, v4.8h, v0.8h
        add             v5.8h, v5.8h, v0.8h
        sqxtun          v2.8b, v2.8h         // iclip_pixel(dc + apply_sign())
        sqxtun          v3.8b, v3.8h
        sqxtun          v4.8b, v4.8h
        sqxtun          v5.8b, v5.8h
        subs            w3, w3, #16
        st1             {v2.8b, v3.8b}, [x0], #16
        st1             {v4.8b, v5.8b}, [x6], #16
        b.gt            1b
        subs            w4, w4, #2
        add             x5, x5, w9, uxtw #1
        add             x7, x7, w9, uxtw #1
        add             x0, x0, x1
        add             x6, x6, x1
        mov             w3, w9
        b.gt            1b
        ret

L(ipred_cfl_128_tbl):
L(ipred_cfl_splat_tbl):
        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16)
        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16)
        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w8)
        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w4)
endfunc

// void ipred_cfl_top_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                              const pixel *const topleft,
//                              const int width, const int height,
//                              const int16_t *ac, const int alpha);
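//
// Each width case below reduces the row above the block to a rounded
// average and then reuses the splat loops. Scalar sketch (illustrative;
// the width == 4 case loads the four pixels twice and divides by 8,
// which yields the same result):
//
//     int sum = 0;
//     for (int x = 0; x < width; x++)
//         sum += topleft[1 + x];                    // uaddlv
//     int dc = (sum + (width >> 1)) >> log2(width); // urshr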
function ipred_cfl_top_8bpc_neon, export=1
        clz             w9, w3
        adr             x7, L(ipred_cfl_top_tbl)
        sub             w9, w9, #26
        ldrh            w9, [x7, w9, uxtw #1]
        dup             v1.8h, w6 // alpha
        add             x2, x2, #1
        sub             x7, x7, w9, uxtw
        add             x6, x0, x1
        lsl             x1, x1, #1
        br              x7
4:
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v0.2s}, [x2]
        uaddlv          h0, v0.8b
        urshr           v0.4h, v0.4h, #3
        dup             v0.8h, v0.h[0]
        b               L(ipred_cfl_splat_w4)
8:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8b}, [x2]
        uaddlv          h0, v0.8b
        urshr           v0.4h, v0.4h, #3
        dup             v0.8h, v0.h[0]
        b               L(ipred_cfl_splat_w8)
16:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.16b}, [x2]
        uaddlv          h0, v0.16b
        urshr           v0.4h, v0.4h, #4
        dup             v0.8h, v0.h[0]
        b               L(ipred_cfl_splat_w16)
32:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.16b, v3.16b}, [x2]
        uaddlv          h2, v2.16b
        uaddlv          h3, v3.16b
        add             v2.4h, v2.4h, v3.4h
        urshr           v2.4h, v2.4h, #5
        dup             v0.8h, v2.h[0]
        b               L(ipred_cfl_splat_w16)

L(ipred_cfl_top_tbl):
        .hword L(ipred_cfl_top_tbl) - 32b
        .hword L(ipred_cfl_top_tbl) - 16b
        .hword L(ipred_cfl_top_tbl) -  8b
        .hword L(ipred_cfl_top_tbl) -  4b
endfunc

// void ipred_cfl_left_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                               const pixel *const topleft,
//                               const int width, const int height,
//                               const int16_t *ac, const int alpha);
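//
// This variant dispatches twice: br x7 jumps to a height-specific block
// below that averages the left border into the DC value, and each block
// then jumps via br x9 into the width-specific L(ipred_cfl_splat_wN)
// loops shared with ipred_cfl_128, so no splat code is duplicated.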
function ipred_cfl_left_8bpc_neon, export=1
        sub             x2, x2, w4, uxtw
        clz             w9, w3
        clz             w8, w4
        adr             x10, L(ipred_cfl_splat_tbl)
        adr             x7, L(ipred_cfl_left_tbl)
        sub             w9, w9, #26
        sub             w8, w8, #26
        ldrh            w9, [x10, w9, uxtw #1]
        ldrh            w8, [x7, w8, uxtw #1]
        dup             v1.8h, w6 // alpha
        sub             x9, x10, w9, uxtw
        sub             x7, x7, w8, uxtw
        add             x6, x0, x1
        lsl             x1, x1, #1
        br              x7

L(ipred_cfl_left_h4):
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v0.2s}, [x2]
        uaddlv          h0, v0.8b
        urshr           v0.4h, v0.4h, #3
        dup             v0.8h, v0.h[0]
        br              x9

L(ipred_cfl_left_h8):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8b}, [x2]
        uaddlv          h0, v0.8b
        urshr           v0.4h, v0.4h, #3
        dup             v0.8h, v0.h[0]
        br              x9

L(ipred_cfl_left_h16):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.16b}, [x2]
        uaddlv          h0, v0.16b
        urshr           v0.4h, v0.4h, #4
        dup             v0.8h, v0.h[0]
        br              x9

L(ipred_cfl_left_h32):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.16b, v3.16b}, [x2]
        uaddlv          h2, v2.16b
        uaddlv          h3, v3.16b
        add             v2.4h, v2.4h, v3.4h
        urshr           v2.4h, v2.4h, #5
        dup             v0.8h, v2.h[0]
        br              x9

L(ipred_cfl_left_tbl):
        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h32)
        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h16)
        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h8)
        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h4)
endfunc

// void ipred_cfl_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                          const pixel *const topleft,
//                          const int width, const int height,
//                          const int16_t *ac, const int alpha);
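//
// Here dc = sum(top row + left column) / (width + height). For square
// blocks the divisor is a power of two, handled by the rounded shift
// below. For rectangular blocks width + height is 3 or 5 times a power
// of two, so after the shift the code multiplies by a 16-bit fixed-point
// reciprocal via sqdmulh (which returns (2*a*b) >> 16). Scalar sketch of
// the idea, using the constants that appear below:
//
//     unsigned dc = (sum + ((w + h) >> 1)) >> ctz(w + h); // 2^k factor
//     if (w != h) // leftover factor is 3 or 5
//         dc = (dc * (leftover == 3 ? 0x5556 : 0x3334)) >> 16;
//
// since 0x5556 ~= 2^16/3 and 0x3334 ~= 2^16/5.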
function ipred_cfl_8bpc_neon, export=1
        sub             x2, x2, w4, uxtw
        add             w8, w3, w4      // width + height
        dup             v1.8h, w6       // alpha
        clz             w9, w3
        clz             w6, w4
        dup             v16.8h, w8      // width + height
        adr             x7, L(ipred_cfl_tbl)
        rbit            w8, w8          // rbit(width + height)
        sub             w9, w9, #22     // 26 leading bits, minus table offset 4
        sub             w6, w6, #26
        clz             w8, w8          // ctz(width + height)
        ldrh            w9, [x7, w9, uxtw #1]
        ldrh            w6, [x7, w6, uxtw #1]
        neg             w8, w8          // -ctz(width + height)
        sub             x9, x7, w9, uxtw
        sub             x7, x7, w6, uxtw
        ushr            v16.8h, v16.8h, #1 // (width + height) >> 1
        dup             v17.8h, w8      // -ctz(width + height)
        add             x6, x0, x1
        lsl             x1, x1, #1
        br              x7

L(ipred_cfl_h4):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.s}[0], [x2], #4
        ins             v0.s[1], wzr
        add             x2, x2, #1
        uaddlv          h0, v0.8b
        br              x9
L(ipred_cfl_w4):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.s}[0], [x2]
        ins             v2.s[1], wzr
        add             v0.4h, v0.4h, v16.4h
        uaddlv          h2, v2.8b
        cmp             w4, #4
        add             v0.4h, v0.4h, v2.4h
        ushl            v0.4h, v0.4h, v17.4h
        b.eq            1f
        // h = 8/16
        mov             w16, #(0x3334/2)
        movk            w16, #(0x5556/2), lsl #16
        add             w17, w4, w4 // w17 = 2*h = 16 or 32
        lsr             w16, w16, w17
        dup             v16.4h, w16
        sqdmulh         v0.4h, v0.4h, v16.4h
1:
        dup             v0.8h, v0.h[0]
        b               L(ipred_cfl_splat_w4)

L(ipred_cfl_h8):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8b}, [x2], #8
        uaddlv          h0, v0.8b
        add             x2, x2, #1
        br              x9
L(ipred_cfl_w8):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8b}, [x2]
        add             v0.4h, v0.4h, v16.4h
        uaddlv          h2, v2.8b
        cmp             w4, #8
        add             v0.4h, v0.4h, v2.4h
        ushl            v0.4h, v0.4h, v17.4h
        b.eq            1f
        // h = 4/16/32
        cmp             w4, #32
        mov             w16, #(0x3334/2)
        mov             w17, #(0x5556/2)
        csel            w16, w16, w17, eq
        dup             v16.4h, w16
        sqdmulh         v0.4h, v0.4h, v16.4h
1:
        dup             v0.8h, v0.h[0]
        b               L(ipred_cfl_splat_w8)

L(ipred_cfl_h16):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.16b}, [x2], #16
        uaddlv          h0, v0.16b
        add             x2, x2, #1
        br              x9
L(ipred_cfl_w16):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.16b}, [x2]
        add             v0.4h, v0.4h, v16.4h
        uaddlv          h2, v2.16b
        cmp             w4, #16
        add             v0.4h, v0.4h, v2.4h
        ushl            v0.4h, v0.4h, v17.4h
        b.eq            1f
        // h = 4/8/32
        cmp             w4, #4
        mov             w16, #(0x3334/2)
        mov             w17, #(0x5556/2)
        csel            w16, w16, w17, eq
        dup             v16.4h, w16
        sqdmulh         v0.4h, v0.4h, v16.4h
1:
        dup             v0.8h, v0.h[0]
        b               L(ipred_cfl_splat_w16)

L(ipred_cfl_h32):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.16b, v3.16b}, [x2], #32
        uaddlv          h2, v2.16b
        uaddlv          h3, v3.16b
        add             x2, x2, #1
        add             v0.4h, v2.4h, v3.4h
        br              x9
L(ipred_cfl_w32):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.16b, v3.16b}, [x2]
        add             v0.4h, v0.4h, v16.4h
        uaddlv          h2, v2.16b
        uaddlv          h3, v3.16b
        cmp             w4, #32
        add             v0.4h, v0.4h, v2.4h
        add             v0.4h, v0.4h, v3.4h
        ushl            v0.4h, v0.4h, v17.4h
        b.eq            1f
        // h = 8/16
        mov             w16, #(0x5556/2)
        movk            w16, #(0x3334/2), lsl #16
        add             w17, w4, w4 // w17 = 2*h = 16 or 32
        lsr             w16, w16, w17
        dup             v16.4h, w16
        sqdmulh         v0.4h, v0.4h, v16.4h
1:
        dup             v0.8h, v0.h[0]
        b               L(ipred_cfl_splat_w16)

L(ipred_cfl_tbl):
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h32)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h16)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h8)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h4)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w32)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w16)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w8)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w4)
endfunc

// void cfl_ac_420_8bpc_neon(int16_t *const ac, const pixel *const ypx,
//                           const ptrdiff_t stride, const int w_pad,
//                           const int h_pad, const int cw, const int ch);
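//
// The cfl_ac_* functions fill the ac buffer with scaled luma samples so
// that every chroma layout carries the same weight: 8x the luma average
// per output sample. For 4:2:0 that is a 2x2 box sum doubled. Scalar
// sketch of one output row, before DC removal (illustrative only):
//
//     for (int x = 0; x < cw; x++)
//         ac[x] = (ypx[2 * x]          + ypx[2 * x + 1] +
//                  ypx[stride + 2 * x] + ypx[stride + 2 * x + 1]) << 1;
//
// The running sums in v16-v19 are then reduced, rounded and shifted by
// log2sz, and the resulting average is subtracted from every sample.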
function ipred_cfl_ac_420_8bpc_neon, export=1
        clz             w8, w5
        lsl             w4, w4, #2
        adr             x7, L(ipred_cfl_ac_420_tbl)
        sub             w8, w8, #27
        ldrh            w8, [x7, w8, uxtw #1]
        movi            v16.8h, #0
        movi            v17.8h, #0
        movi            v18.8h, #0
        movi            v19.8h, #0
        sub             x7, x7, w8, uxtw
        sub             w8, w6, w4  // height - h_pad
        rbit            w9, w5      // rbit(width)
        rbit            w10, w6     // rbit(height)
        clz             w9, w9      // ctz(width)
        clz             w10, w10    // ctz(height)
        add             w9, w9, w10 // log2sz
        add             x10, x1, x2
        dup             v31.4s, w9
        lsl             x2, x2, #1
        neg             v31.4s, v31.4s // -log2sz
        br              x7

L(ipred_cfl_ac_420_w4):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input
        ld1             {v0.8b}, [x1], x2
        ld1             {v1.8b}, [x10], x2
        ld1             {v0.d}[1], [x1], x2
        ld1             {v1.d}[1], [x10], x2
        uaddlp          v0.8h, v0.16b
        uaddlp          v1.8h, v1.16b
        add             v0.8h, v0.8h, v1.8h
        shl             v0.8h, v0.8h, #1
        subs            w8, w8, #2
        st1             {v0.8h}, [x0], #16
        add             v16.8h, v16.8h, v0.8h
        b.gt            1b
        trn2            v1.2d, v0.2d, v0.2d
        trn2            v0.2d, v0.2d, v0.2d
L(ipred_cfl_ac_420_w4_hpad):
        cbz             w4, 3f
2:      // Vertical padding (h_pad > 0)
        subs            w4, w4, #4
        st1             {v0.8h, v1.8h}, [x0], #32
        add             v16.8h, v16.8h, v0.8h
        add             v17.8h, v17.8h, v1.8h
        b.gt            2b
3:
        // Aggregate the sums
        add             v0.8h, v16.8h, v17.8h
        uaddlv          s0, v0.8h // sum
        sub             x0, x0, w6, uxtw #3
        urshl           v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >>= log2sz
        dup             v4.8h, v4.h[0]
6:      // Subtract dc from ac
        ld1             {v0.8h, v1.8h}, [x0]
        subs            w6, w6, #4
        sub             v0.8h, v0.8h, v4.8h
        sub             v1.8h, v1.8h, v4.8h
        st1             {v0.8h, v1.8h}, [x0], #32
        b.gt            6b
        ret

L(ipred_cfl_ac_420_w8):
        AARCH64_VALID_JUMP_TARGET
        cbnz            w3, L(ipred_cfl_ac_420_w8_wpad)
1:      // Copy and subsample input, without padding
        ld1             {v0.16b}, [x1], x2
        ld1             {v1.16b}, [x10], x2
        ld1             {v2.16b}, [x1], x2
        uaddlp          v0.8h, v0.16b
        ld1             {v3.16b}, [x10], x2
        uaddlp          v1.8h, v1.16b
        uaddlp          v2.8h, v2.16b
        uaddlp          v3.8h, v3.16b
        add             v0.8h, v0.8h, v1.8h
        add             v2.8h, v2.8h, v3.8h
        shl             v0.8h, v0.8h, #1
        shl             v1.8h, v2.8h, #1
        subs            w8, w8, #2
        st1             {v0.8h, v1.8h}, [x0], #32
        add             v16.8h, v16.8h, v0.8h
        add             v17.8h, v17.8h, v1.8h
        b.gt            1b
        mov             v0.16b, v1.16b
        b               L(ipred_cfl_ac_420_w8_hpad)

L(ipred_cfl_ac_420_w8_wpad):
1:      // Copy and subsample input, padding 4
        ld1             {v0.8b}, [x1], x2
        ld1             {v1.8b}, [x10], x2
        ld1             {v0.d}[1], [x1], x2
        ld1             {v1.d}[1], [x10], x2
        uaddlp          v0.8h, v0.16b
        uaddlp          v1.8h, v1.16b
        add             v0.8h, v0.8h, v1.8h
        shl             v0.8h, v0.8h, #1
        dup             v1.4h, v0.h[3]
        dup             v3.4h, v0.h[7]
        trn2            v2.2d, v0.2d, v0.2d
        subs            w8, w8, #2
        st1             {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32
        add             v16.4h, v16.4h, v0.4h
        add             v17.4h, v17.4h, v1.4h
        add             v18.4h, v18.4h, v2.4h
        add             v19.4h, v19.4h, v3.4h
        b.gt            1b
        trn1            v0.2d, v2.2d, v3.2d
        trn1            v1.2d, v2.2d, v3.2d

L(ipred_cfl_ac_420_w8_hpad):
        cbz             w4, 3f
2:      // Vertical padding (h_pad > 0)
        subs            w4, w4, #4
        st1             {v0.8h, v1.8h}, [x0], #32
        add             v16.8h, v16.8h, v0.8h
        add             v17.8h, v17.8h, v1.8h
        st1             {v0.8h, v1.8h}, [x0], #32
        add             v18.8h, v18.8h, v0.8h
        add             v19.8h, v19.8h, v1.8h
        b.gt            2b
3:

L(ipred_cfl_ac_420_w8_calc_subtract_dc):
        // Aggregate the sums
        add             v0.8h, v16.8h, v17.8h
        add             v2.8h, v18.8h, v19.8h
        uaddlp          v0.4s, v0.8h
        uaddlp          v2.4s, v2.8h
        add             v0.4s, v0.4s, v2.4s
        addv            s0, v0.4s // sum
        sub             x0, x0, w6, uxtw #4
        urshl           v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >>= log2sz
        dup             v4.8h, v4.h[0]
L(ipred_cfl_ac_420_w8_subtract_dc):
6:      // Subtract dc from ac
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
        subs            w6, w6, #4
        sub             v0.8h, v0.8h, v4.8h
        sub             v1.8h, v1.8h, v4.8h
        sub             v2.8h, v2.8h, v4.8h
        sub             v3.8h, v3.8h, v4.8h
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        b.gt            6b
        ret

L(ipred_cfl_ac_420_w16):
        AARCH64_VALID_JUMP_TARGET
        adr             x7, L(ipred_cfl_ac_420_w16_tbl)
        ldrh            w3, [x7, w3, uxtw #1]
        sub             x7, x7, w3, uxtw
        br              x7

L(ipred_cfl_ac_420_w16_wpad0):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input, without padding
        ld1             {v0.16b, v1.16b}, [x1], x2
        ld1             {v2.16b, v3.16b}, [x10], x2
        uaddlp          v0.8h, v0.16b
        ld1             {v4.16b, v5.16b}, [x1], x2
        uaddlp          v1.8h, v1.16b
        ld1             {v6.16b, v7.16b}, [x10], x2
        uaddlp          v2.8h, v2.16b
        uaddlp          v3.8h, v3.16b
        uaddlp          v4.8h, v4.16b
        uaddlp          v5.8h, v5.16b
        uaddlp          v6.8h, v6.16b
        uaddlp          v7.8h, v7.16b
        add             v0.8h, v0.8h, v2.8h
        add             v1.8h, v1.8h, v3.8h
        add             v4.8h, v4.8h, v6.8h
        add             v5.8h, v5.8h, v7.8h
        shl             v0.8h, v0.8h, #1
        shl             v1.8h, v1.8h, #1
        shl             v2.8h, v4.8h, #1
        shl             v3.8h, v5.8h, #1
        subs            w8, w8, #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h, v16.8h, v0.8h
        add             v17.8h, v17.8h, v1.8h
        add             v18.8h, v18.8h, v2.8h
        add             v19.8h, v19.8h, v3.8h
        b.gt            1b
        mov             v0.16b, v2.16b
        mov             v1.16b, v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_420_w16_wpad1):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input, padding 4
        ldr             d1, [x1, #16]
        ld1             {v0.16b}, [x1], x2
        ldr             d3, [x10, #16]
        ld1             {v2.16b}, [x10], x2
        uaddlp          v1.4h, v1.8b
        ldr             d5, [x1, #16]
        uaddlp          v0.8h, v0.16b
        ld1             {v4.16b}, [x1], x2
        uaddlp          v3.4h, v3.8b
        ldr             d7, [x10, #16]
        uaddlp          v2.8h, v2.16b
        ld1             {v6.16b}, [x10], x2
        uaddlp          v5.4h, v5.8b
        uaddlp          v4.8h, v4.16b
        uaddlp          v7.4h, v7.8b
        uaddlp          v6.8h, v6.16b
        add             v1.4h, v1.4h, v3.4h
        add             v0.8h, v0.8h, v2.8h
        add             v5.4h, v5.4h, v7.4h
        add             v4.8h, v4.8h, v6.8h
        shl             v1.4h, v1.4h, #1
        shl             v0.8h, v0.8h, #1
        shl             v3.4h, v5.4h, #1
        shl             v2.8h, v4.8h, #1
        dup             v4.4h, v1.h[3]
        dup             v5.4h, v3.h[3]
        trn1            v1.2d, v1.2d, v4.2d
        trn1            v3.2d, v3.2d, v5.2d
        subs            w8, w8, #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h, v16.8h, v0.8h
        add             v17.8h, v17.8h, v1.8h
        add             v18.8h, v18.8h, v2.8h
        add             v19.8h, v19.8h, v3.8h
        b.gt            1b
        mov             v0.16b, v2.16b
        mov             v1.16b, v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_420_w16_wpad2):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input, padding 8
        ld1             {v0.16b}, [x1], x2
        ld1             {v2.16b}, [x10], x2
        ld1             {v4.16b}, [x1], x2
        uaddlp          v0.8h, v0.16b
        ld1             {v6.16b}, [x10], x2
        uaddlp          v2.8h, v2.16b
        uaddlp          v4.8h, v4.16b
        uaddlp          v6.8h, v6.16b
        add             v0.8h, v0.8h, v2.8h
        add             v4.8h, v4.8h, v6.8h
        shl             v0.8h, v0.8h, #1
        shl             v2.8h, v4.8h, #1
        dup             v1.8h, v0.h[7]
        dup             v3.8h, v2.h[7]
        subs            w8, w8, #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h, v16.8h, v0.8h
        add             v17.8h, v17.8h, v1.8h
        add             v18.8h, v18.8h, v2.8h
        add             v19.8h, v19.8h, v3.8h
        b.gt            1b
        mov             v0.16b, v2.16b
        mov             v1.16b, v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_420_w16_wpad3):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input, padding 12
        ld1             {v0.8b}, [x1], x2
        ld1             {v2.8b}, [x10], x2
        ld1             {v4.8b}, [x1], x2
        uaddlp          v0.4h, v0.8b
        ld1             {v6.8b}, [x10], x2
        uaddlp          v2.4h, v2.8b
        uaddlp          v4.4h, v4.8b
        uaddlp          v6.4h, v6.8b
        add             v0.4h, v0.4h, v2.4h
        add             v4.4h, v4.4h, v6.4h
        shl             v0.4h, v0.4h, #1
        shl             v2.4h, v4.4h, #1
        dup             v1.8h, v0.h[3]
        dup             v3.8h, v2.h[3]
        trn1            v0.2d, v0.2d, v1.2d
        trn1            v2.2d, v2.2d, v3.2d
        subs            w8, w8, #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h, v16.8h, v0.8h
        add             v17.8h, v17.8h, v1.8h
        add             v18.8h, v18.8h, v2.8h
        add             v19.8h, v19.8h, v3.8h
        b.gt            1b
        mov             v0.16b, v2.16b
        mov             v1.16b, v3.16b

L(ipred_cfl_ac_420_w16_hpad):
        cbz             w4, 3f
2:      // Vertical padding (h_pad > 0)
        subs            w4, w4, #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h, v16.8h, v0.8h
        add             v17.8h, v17.8h, v1.8h
        add             v18.8h, v18.8h, v2.8h
        add             v19.8h, v19.8h, v3.8h
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h, v16.8h, v0.8h
        add             v17.8h, v17.8h, v1.8h
        add             v18.8h, v18.8h, v2.8h
        add             v19.8h, v19.8h, v3.8h
        b.gt            2b
3:

        // Double the height and reuse the w8 summing/subtracting
        lsl             w6, w6, #1
        b               L(ipred_cfl_ac_420_w8_calc_subtract_dc)

L(ipred_cfl_ac_420_tbl):
        .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w16)
        .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w8)
        .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w4)
        .hword 0

L(ipred_cfl_ac_420_w16_tbl):
        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad0)
        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad1)
        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad2)
        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad3)
endfunc

// void cfl_ac_422_8bpc_neon(int16_t *const ac, const pixel *const ypx,
//                           const ptrdiff_t stride, const int w_pad,
//                           const int h_pad, const int cw, const int ch);
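//
// The 4:2:2 path only subsamples horizontally, so each output sample is
// a two-pixel sum scaled by << 2 (again 8x the luma average), and the
// hpad and DC-removal tails are shared with the 4:2:0 code above.
// Scalar sketch of one output row (illustrative only):
//
//     for (int x = 0; x < cw; x++)
//         ac[x] = (ypx[2 * x] + ypx[2 * x + 1]) << 2;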
function ipred_cfl_ac_422_8bpc_neon, export=1
        clz             w8, w5
        lsl             w4, w4, #2
        adr             x7, L(ipred_cfl_ac_422_tbl)
        sub             w8, w8, #27
        ldrh            w8, [x7, w8, uxtw #1]
        movi            v16.8h, #0
        movi            v17.8h, #0
        movi            v18.8h, #0
        movi            v19.8h, #0
        sub             x7, x7, w8, uxtw
        sub             w8, w6, w4  // height - h_pad
        rbit            w9, w5      // rbit(width)
        rbit            w10, w6     // rbit(height)
        clz             w9, w9      // ctz(width)
        clz             w10, w10    // ctz(height)
        add             w9, w9, w10 // log2sz
        add             x10, x1, x2
        dup             v31.4s, w9
        lsl             x2, x2, #1
        neg             v31.4s, v31.4s // -log2sz
        br              x7

L(ipred_cfl_ac_422_w4):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input
        ld1             {v0.8b}, [x1], x2
        ld1             {v0.d}[1], [x10], x2
        ld1             {v1.8b}, [x1], x2
        ld1             {v1.d}[1], [x10], x2
        uaddlp          v0.8h, v0.16b
        uaddlp          v1.8h, v1.16b
        shl             v0.8h, v0.8h, #2
        shl             v1.8h, v1.8h, #2
        subs            w8, w8, #4
        add             v16.8h, v16.8h, v0.8h
        add             v17.8h, v17.8h, v1.8h
        st1             {v0.8h, v1.8h}, [x0], #32
        b.gt            1b
        trn2            v0.2d, v1.2d, v1.2d
        trn2            v1.2d, v1.2d, v1.2d
        b               L(ipred_cfl_ac_420_w4_hpad)

L(ipred_cfl_ac_422_w8):
        AARCH64_VALID_JUMP_TARGET
        cbnz            w3, L(ipred_cfl_ac_422_w8_wpad)
1:      // Copy and subsample input, without padding
        ld1             {v0.16b}, [x1], x2
        ld1             {v1.16b}, [x10], x2
        ld1             {v2.16b}, [x1], x2
        uaddlp          v0.8h, v0.16b
        ld1             {v3.16b}, [x10], x2
        uaddlp          v1.8h, v1.16b
        uaddlp          v2.8h, v2.16b
        uaddlp          v3.8h, v3.16b
        shl             v0.8h, v0.8h, #2
        shl             v1.8h, v1.8h, #2
        shl             v2.8h, v2.8h, #2
        shl             v3.8h, v3.8h, #2
        subs            w8, w8, #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h, v16.8h, v0.8h
        add             v17.8h, v17.8h, v1.8h
        add             v18.8h, v18.8h, v2.8h
        add             v19.8h, v19.8h, v3.8h
        b.gt            1b
        mov             v0.16b, v3.16b
        mov             v1.16b, v3.16b
        b               L(ipred_cfl_ac_420_w8_hpad)

L(ipred_cfl_ac_422_w8_wpad):
1:      // Copy and subsample input, padding 4
        ld1             {v0.8b}, [x1], x2
        ld1             {v0.d}[1], [x10], x2
        ld1             {v2.8b}, [x1], x2
        ld1             {v2.d}[1], [x10], x2
        uaddlp          v0.8h, v0.16b
        uaddlp          v2.8h, v2.16b
        shl             v0.8h, v0.8h, #2
        shl             v2.8h, v2.8h, #2
        dup             v4.4h, v0.h[3]
        dup             v5.8h, v0.h[7]
        dup             v6.4h, v2.h[3]
        dup             v7.8h, v2.h[7]
        trn2            v1.2d, v0.2d, v5.2d
        trn1            v0.2d, v0.2d, v4.2d
        trn2            v3.2d, v2.2d, v7.2d
        trn1            v2.2d, v2.2d, v6.2d
        subs            w8, w8, #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h, v16.8h, v0.8h
        add             v17.8h, v17.8h, v1.8h
        add             v18.8h, v18.8h, v2.8h
        add             v19.8h, v19.8h, v3.8h
        b.gt            1b
        mov             v0.16b, v3.16b
        mov             v1.16b, v3.16b
        b               L(ipred_cfl_ac_420_w8_hpad)

L(ipred_cfl_ac_422_w16):
        AARCH64_VALID_JUMP_TARGET
        adr             x7, L(ipred_cfl_ac_422_w16_tbl)
        ldrh            w3, [x7, w3, uxtw #1]
        sub             x7, x7, w3, uxtw
        br              x7

L(ipred_cfl_ac_422_w16_wpad0):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input, without padding
        ld1             {v0.16b, v1.16b}, [x1], x2
        ld1             {v2.16b, v3.16b}, [x10], x2
        uaddlp          v0.8h, v0.16b
        uaddlp          v1.8h, v1.16b
        uaddlp          v2.8h, v2.16b
        uaddlp          v3.8h, v3.16b
        shl             v0.8h, v0.8h, #2
        shl             v1.8h, v1.8h, #2
        shl             v2.8h, v2.8h, #2
        shl             v3.8h, v3.8h, #2
        subs            w8, w8, #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h, v16.8h, v0.8h
        add             v17.8h, v17.8h, v1.8h
        add             v18.8h, v18.8h, v2.8h
        add             v19.8h, v19.8h, v3.8h
        b.gt            1b
        mov             v0.16b, v2.16b
        mov             v1.16b, v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_422_w16_wpad1):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input, padding 4
        ldr             d1, [x1, #16]
        ld1             {v0.16b}, [x1], x2
        ldr             d3, [x10, #16]
        ld1             {v2.16b}, [x10], x2
        uaddlp          v1.4h, v1.8b
        uaddlp          v0.8h, v0.16b
        uaddlp          v3.4h, v3.8b
        uaddlp          v2.8h, v2.16b
        shl             v1.4h, v1.4h, #2
        shl             v0.8h, v0.8h, #2
        shl             v3.4h, v3.4h, #2
        shl             v2.8h, v2.8h, #2
        dup             v4.4h, v1.h[3]
        dup             v5.4h, v3.h[3]
        trn1            v1.2d, v1.2d, v4.2d
        trn1            v3.2d, v3.2d, v5.2d
        subs            w8, w8, #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h, v16.8h, v0.8h
        add             v17.8h, v17.8h, v1.8h
        add             v18.8h, v18.8h, v2.8h
        add             v19.8h, v19.8h, v3.8h
        b.gt            1b
        mov             v0.16b, v2.16b
        mov             v1.16b, v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_422_w16_wpad2):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input, padding 8
        ld1             {v0.16b}, [x1], x2
        ld1             {v2.16b}, [x10], x2
        uaddlp          v0.8h, v0.16b
        uaddlp          v2.8h, v2.16b
        shl             v0.8h, v0.8h, #2
        shl             v2.8h, v2.8h, #2
        dup             v1.8h, v0.h[7]
        dup             v3.8h, v2.h[7]
        subs            w8, w8, #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h, v16.8h, v0.8h
        add             v17.8h, v17.8h, v1.8h
        add             v18.8h, v18.8h, v2.8h
        add             v19.8h, v19.8h, v3.8h
        b.gt            1b
        mov             v0.16b, v2.16b
        mov             v1.16b, v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_422_w16_wpad3):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input, padding 12
        ld1             {v0.8b}, [x1], x2
        ld1             {v2.8b}, [x10], x2
        uaddlp          v0.4h, v0.8b
        uaddlp          v2.4h, v2.8b
        shl             v0.4h, v0.4h, #2
        shl             v2.4h, v2.4h, #2
        dup             v1.8h, v0.h[3]
        dup             v3.8h, v2.h[3]
        trn1            v0.2d, v0.2d, v1.2d
        trn1            v2.2d, v2.2d, v3.2d
        subs            w8, w8, #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h, v16.8h, v0.8h
        add             v17.8h, v17.8h, v1.8h
        add             v18.8h, v18.8h, v2.8h
        add             v19.8h, v19.8h, v3.8h
        b.gt            1b
        mov             v0.16b, v2.16b
        mov             v1.16b, v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_422_tbl):
        .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w16)
        .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w8)
        .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w4)
        .hword 0

L(ipred_cfl_ac_422_w16_tbl):
        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad0)
        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad1)
        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad2)
        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad3)
endfunc

// void cfl_ac_444_8bpc_neon(int16_t *const ac, const pixel *const ypx,
//                           const ptrdiff_t stride, const int w_pad,
//                           const int h_pad, const int cw, const int ch);
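//
// The 4:4:4 path copies luma directly, scaled by << 3, so all three
// chroma layouts feed identically weighted samples into the shared DC
// removal. Scalar sketch of one output row (illustrative only):
//
//     for (int x = 0; x < cw; x++)
//         ac[x] = ypx[x] << 3;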
function ipred_cfl_ac_444_8bpc_neon, export=1
        clz             w8, w5
        lsl             w4, w4, #2
        adr             x7, L(ipred_cfl_ac_444_tbl)
        sub             w8, w8, #26
        ldrh            w8, [x7, w8, uxtw #1]
        movi            v16.8h, #0
        movi            v17.8h, #0
        movi            v18.8h, #0
        movi            v19.8h, #0
        sub             x7, x7, w8, uxtw
        sub             w8, w6, w4  // height - h_pad
        rbit            w9, w5      // rbit(width)
        rbit            w10, w6     // rbit(height)
        clz             w9, w9      // ctz(width)
        clz             w10, w10    // ctz(height)
        add             w9, w9, w10 // log2sz
        add             x10, x1, x2
        dup             v31.4s, w9
        lsl             x2, x2, #1
        neg             v31.4s, v31.4s // -log2sz
        br              x7

L(ipred_cfl_ac_444_w4):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and expand input
        ld1             {v0.s}[0], [x1], x2
        ld1             {v0.s}[1], [x10], x2
        ld1             {v1.s}[0], [x1], x2
        ld1             {v1.s}[1], [x10], x2
        ushll           v0.8h, v0.8b, #3
        ushll           v1.8h, v1.8b, #3
        subs            w8, w8, #4
        add             v16.8h, v16.8h, v0.8h
        add             v17.8h, v17.8h, v1.8h
        st1             {v0.8h, v1.8h}, [x0], #32
        b.gt            1b
        trn2            v0.2d, v1.2d, v1.2d
        trn2            v1.2d, v1.2d, v1.2d
        b               L(ipred_cfl_ac_420_w4_hpad)

L(ipred_cfl_ac_444_w8):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and expand input
        ld1             {v0.8b}, [x1], x2
        ld1             {v1.8b}, [x10], x2
        ld1             {v2.8b}, [x1], x2
        ushll           v0.8h, v0.8b, #3
        ld1             {v3.8b}, [x10], x2
        ushll           v1.8h, v1.8b, #3
        ushll           v2.8h, v2.8b, #3
        ushll           v3.8h, v3.8b, #3
        subs            w8, w8, #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h, v16.8h, v0.8h
        add             v17.8h, v17.8h, v1.8h
        add             v18.8h, v18.8h, v2.8h
        add             v19.8h, v19.8h, v3.8h
        b.gt            1b
        mov             v0.16b, v3.16b
        mov             v1.16b, v3.16b
        b               L(ipred_cfl_ac_420_w8_hpad)

L(ipred_cfl_ac_444_w16):
        AARCH64_VALID_JUMP_TARGET
        cbnz            w3, L(ipred_cfl_ac_444_w16_wpad)
1:      // Copy and expand input, without padding
        ld1             {v0.16b}, [x1], x2
        ld1             {v2.16b}, [x10], x2
        ld1             {v4.16b}, [x1], x2
        ushll2          v1.8h, v0.16b, #3
        ushll           v0.8h, v0.8b, #3
        ld1             {v6.16b}, [x10], x2
        ushll2          v3.8h, v2.16b, #3
        ushll           v2.8h, v2.8b, #3
        ushll2          v5.8h, v4.16b, #3
        ushll           v4.8h, v4.8b, #3
        ushll2          v7.8h, v6.16b, #3
        ushll           v6.8h, v6.8b, #3
        subs            w8, w8, #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h, v16.8h, v0.8h
        add             v17.8h, v17.8h, v1.8h
        add             v18.8h, v18.8h, v2.8h
        add             v19.8h, v19.8h, v3.8h
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
        add             v16.8h, v16.8h, v4.8h
        add             v17.8h, v17.8h, v5.8h
        add             v18.8h, v18.8h, v6.8h
        add             v19.8h, v19.8h, v7.8h
        b.gt            1b
        mov             v0.16b, v6.16b
        mov             v1.16b, v7.16b
        mov             v2.16b, v6.16b
        mov             v3.16b, v7.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_444_w16_wpad):
1:      // Copy and expand input, padding 8
        ld1             {v0.8b}, [x1], x2
        ld1             {v2.8b}, [x10], x2
        ld1             {v4.8b}, [x1], x2
        ld1             {v6.8b}, [x10], x2
        ushll           v0.8h, v0.8b, #3
        ushll           v2.8h, v2.8b, #3
        ushll           v4.8h, v4.8b, #3
        ushll           v6.8h, v6.8b, #3
        dup             v1.8h, v0.h[7]
        dup             v3.8h, v2.h[7]
        dup             v5.8h, v4.h[7]
        dup             v7.8h, v6.h[7]
        subs            w8, w8, #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h, v16.8h, v0.8h
        add             v17.8h, v17.8h, v1.8h
        add             v18.8h, v18.8h, v2.8h
        add             v19.8h, v19.8h, v3.8h
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
        add             v16.8h, v16.8h, v4.8h
        add             v17.8h, v17.8h, v5.8h
        add             v18.8h, v18.8h, v6.8h
        add             v19.8h, v19.8h, v7.8h
        b.gt            1b
        mov             v0.16b, v6.16b
        mov             v1.16b, v7.16b
        mov             v2.16b, v6.16b
        mov             v3.16b, v7.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_444_w32):
        AARCH64_VALID_JUMP_TARGET
        adr             x7, L(ipred_cfl_ac_444_w32_tbl)
        ldrh            w3, [x7, w3, uxtw] // (w3>>1) << 1
        sub             x7, x7, w3, uxtw
        br              x7

L(ipred_cfl_ac_444_w32_wpad0):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and expand input, without padding
        ld1             {v2.16b, v3.16b}, [x1], x2
        ld1             {v6.16b, v7.16b}, [x10], x2
        ushll           v0.8h, v2.8b, #3
        ushll2          v1.8h, v2.16b, #3
        ushll           v2.8h, v3.8b, #3
        ushll2          v3.8h, v3.16b, #3
        ushll           v4.8h, v6.8b, #3
        ushll2          v5.8h, v6.16b, #3
        ushll           v6.8h, v7.8b, #3
        ushll2          v7.8h, v7.16b, #3
        subs            w8, w8, #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h, v16.8h, v0.8h
        add             v17.8h, v17.8h, v1.8h
        add             v18.8h, v18.8h, v2.8h
        add             v19.8h, v19.8h, v3.8h
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
        add             v16.8h, v16.8h, v4.8h
        add             v17.8h, v17.8h, v5.8h
        add             v18.8h, v18.8h, v6.8h
        add             v19.8h, v19.8h, v7.8h
        b.gt            1b
        b               L(ipred_cfl_ac_444_w32_hpad)

L(ipred_cfl_ac_444_w32_wpad2):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and expand input, padding 8
        ldr             d2, [x1, #16]
        ld1             {v1.16b}, [x1], x2
        ldr             d6, [x10, #16]
        ld1             {v5.16b}, [x10], x2
        ushll           v2.8h, v2.8b, #3
        ushll           v0.8h, v1.8b, #3
        ushll2          v1.8h, v1.16b, #3
        ushll           v6.8h, v6.8b, #3
        ushll           v4.8h, v5.8b, #3
        ushll2          v5.8h, v5.16b, #3
        dup             v3.8h, v2.h[7]
        dup             v7.8h, v6.h[7]
        subs            w8, w8, #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h, v16.8h, v0.8h
        add             v17.8h, v17.8h, v1.8h
        add             v18.8h, v18.8h, v2.8h
        add             v19.8h, v19.8h, v3.8h
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
        add             v16.8h, v16.8h, v4.8h
        add             v17.8h, v17.8h, v5.8h
        add             v18.8h, v18.8h, v6.8h
        add             v19.8h, v19.8h, v7.8h
        b.gt            1b
        b               L(ipred_cfl_ac_444_w32_hpad)

L(ipred_cfl_ac_444_w32_wpad4):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and expand input, padding 16
        ld1             {v1.16b}, [x1], x2
        ld1             {v5.16b}, [x10], x2
        ushll           v0.8h, v1.8b, #3
        ushll2          v1.8h, v1.16b, #3
        ushll           v4.8h, v5.8b, #3
        ushll2          v5.8h, v5.16b, #3
        dup             v2.8h, v1.h[7]
        dup             v3.8h, v1.h[7]
        dup             v6.8h, v5.h[7]
        dup             v7.8h, v5.h[7]
        subs            w8, w8, #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h, v16.8h, v0.8h
        add             v17.8h, v17.8h, v1.8h
        add             v18.8h, v18.8h, v2.8h
        add             v19.8h, v19.8h, v3.8h
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
        add             v16.8h, v16.8h, v4.8h
        add             v17.8h, v17.8h, v5.8h
        add             v18.8h, v18.8h, v6.8h
        add             v19.8h, v19.8h, v7.8h
        b.gt            1b
        b               L(ipred_cfl_ac_444_w32_hpad)

L(ipred_cfl_ac_444_w32_wpad6):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and expand input, padding 24
        ld1             {v0.8b}, [x1], x2
        ld1             {v4.8b}, [x10], x2
        ushll           v0.8h, v0.8b, #3
        ushll           v4.8h, v4.8b, #3
        dup             v1.8h, v0.h[7]
        dup             v2.8h, v0.h[7]
        dup             v3.8h, v0.h[7]
        dup             v5.8h, v4.h[7]
        dup             v6.8h, v4.h[7]
        dup             v7.8h, v4.h[7]
        subs            w8, w8, #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h, v16.8h, v0.8h
        add             v17.8h, v17.8h, v1.8h
        add             v18.8h, v18.8h, v2.8h
        add             v19.8h, v19.8h, v3.8h
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
        add             v16.8h, v16.8h, v4.8h
        add             v17.8h, v17.8h, v5.8h
        add             v18.8h, v18.8h, v6.8h
        add             v19.8h, v19.8h, v7.8h
        b.gt            1b

L(ipred_cfl_ac_444_w32_hpad):
        cbz             w4, 3f
2:      // Vertical padding (h_pad > 0)
        subs            w4, w4, #2
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
        add             v16.8h, v16.8h, v4.8h
        add             v17.8h, v17.8h, v5.8h
        add             v18.8h, v18.8h, v6.8h
        add             v19.8h, v19.8h, v7.8h
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
        add             v16.8h, v16.8h, v4.8h
        add             v17.8h, v17.8h, v5.8h
        add             v18.8h, v18.8h, v6.8h
        add             v19.8h, v19.8h, v7.8h
        b.gt            2b
3:

        // Quadruple the height and reuse the w8 subtracting
        lsl             w6, w6, #2
        // Aggregate the sums, with wider intermediates earlier than in
        // ipred_cfl_ac_420_w8_calc_subtract_dc.
        uaddlp          v0.4s, v16.8h
        uaddlp          v1.4s, v17.8h
        uaddlp          v2.4s, v18.8h
        uaddlp          v3.4s, v19.8h
        add             v0.4s, v0.4s, v1.4s
        add             v2.4s, v2.4s, v3.4s
        add             v0.4s, v0.4s, v2.4s
        addv            s0, v0.4s // sum
        sub             x0, x0, w6, uxtw #4
        urshl           v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >>= log2sz
        dup             v4.8h, v4.h[0]
        b               L(ipred_cfl_ac_420_w8_subtract_dc)

L(ipred_cfl_ac_444_tbl):
        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w32)
        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w16)
        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w8)
        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w4)

L(ipred_cfl_ac_444_w32_tbl):
        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad0)
        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad2)
        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad4)
        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad6)
endfunc
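
// End to end, the cfl_ac_* outputs above feed the alpha-scaled splat
// loops of the ipred_cfl_* entry points, roughly (illustrative call
// shapes, matching the C prototypes in the comments above):
//
//     cfl_ac_420_8bpc_neon(ac, ypx, stride, w_pad, h_pad, cw, ch);
//     ipred_cfl_8bpc_neon(dst, stride, topleft, width, height, ac, alpha);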