1/* 2 * Copyright © 2018, VideoLAN and dav1d authors 3 * Copyright © 2018, Janne Grunau 4 * Copyright © 2018, Martin Storsjo 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright notice, this 11 * list of conditions and the following disclaimer. 12 * 13 * 2. Redistributions in binary form must reproduce the above copyright notice, 14 * this list of conditions and the following disclaimer in the documentation 15 * and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 21 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 22 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 23 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 24 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 26 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 */ 28 29#include "src/arm/asm.S" 30#include "util.S" 31 32.macro avg dst, t0, t1, t2, t3 33 ld1 {\t0\().8h,\t1\().8h}, [x2], 32 34 ld1 {\t2\().8h,\t3\().8h}, [x3], 32 35 add \t0\().8h, \t0\().8h, \t2\().8h 36 add \t1\().8h, \t1\().8h, \t3\().8h 37 sqrshrun \dst\().8b, \t0\().8h, #5 38 sqrshrun2 \dst\().16b, \t1\().8h, #5 39.endm 40 41.macro w_avg dst, t0, t1, t2, t3 42 ld1 {\t0\().8h,\t1\().8h}, [x2], 32 43 ld1 {\t2\().8h,\t3\().8h}, [x3], 32 44 sub \t0\().8h, \t2\().8h, \t0\().8h 45 sub \t1\().8h, \t3\().8h, \t1\().8h 46 sqdmulh \t0\().8h, \t0\().8h, v30.8h 47 sqdmulh \t1\().8h, \t1\().8h, v30.8h 48 add \t0\().8h, \t2\().8h, \t0\().8h 49 add \t1\().8h, \t3\().8h, \t1\().8h 50 sqrshrun \dst\().8b, \t0\().8h, #4 51 sqrshrun2 \dst\().16b, \t1\().8h, #4 52.endm 53 54.macro mask dst, t0, t1, t2, t3 55 ld1 {v30.16b}, [x6], 16 56 ld1 {\t0\().8h,\t1\().8h}, [x2], 32 57 mul v30.16b, v30.16b, v31.16b 58 ld1 {\t2\().8h,\t3\().8h}, [x3], 32 59 shll v28.8h, v30.8b, #8 60 shll2 v29.8h, v30.16b, #8 61 sub \t0\().8h, \t2\().8h, \t0\().8h 62 sub \t1\().8h, \t3\().8h, \t1\().8h 63 sqdmulh \t0\().8h, \t0\().8h, v28.8h 64 sqdmulh \t1\().8h, \t1\().8h, v29.8h 65 add \t0\().8h, \t2\().8h, \t0\().8h 66 add \t1\().8h, \t3\().8h, \t1\().8h 67 sqrshrun \dst\().8b, \t0\().8h, #4 68 sqrshrun2 \dst\().16b, \t1\().8h, #4 69.endm 70 71.macro bidir_fn type 72function \type\()_8bpc_neon, export=1 73 clz w4, w4 74.ifc \type, w_avg 75 dup v30.8h, w6 76 neg v30.8h, v30.8h 77 shl v30.8h, v30.8h, #11 78.endif 79.ifc \type, mask 80 movi v31.16b, #256-2 81.endif 82 adr x7, L(\type\()_tbl) 83 sub w4, w4, #24 84 ldrh w4, [x7, x4, lsl #1] 85 \type v4, v0, v1, v2, v3 86 sub x7, x7, w4, uxtw 87 br x7 8840: 89 AARCH64_VALID_JUMP_TARGET 90 add x7, x0, x1 91 lsl x1, x1, #1 924: 93 cmp w5, #4 94 st1 {v4.s}[0], [x0], x1 95 st1 {v4.s}[1], [x7], x1 96 st1 {v4.s}[2], [x0], x1 97 st1 {v4.s}[3], [x7], x1 98 b.eq 0f 99 \type v5, v0, v1, v2, v3 100 cmp w5, #8 101 st1 {v5.s}[0], [x0], x1 102 st1 {v5.s}[1], [x7], x1 103 st1 {v5.s}[2], [x0], x1 104 st1 {v5.s}[3], [x7], x1 105 b.eq 0f 106 \type v4, v0, v1, v2, v3 107 st1 {v4.s}[0], [x0], x1 108 st1 {v4.s}[1], [x7], x1 109 \type v5, v0, v1, v2, v3 110 st1 {v4.s}[2], [x0], x1 111 st1 {v4.s}[3], [x7], x1 112 st1 {v5.s}[0], [x0], x1 113 st1 {v5.s}[1], [x7], x1 114 st1 {v5.s}[2], [x0], x1 115 st1 {v5.s}[3], [x7], x1 116 ret 11780: 118 AARCH64_VALID_JUMP_TARGET 119 add x7, x0, x1 120 lsl x1, x1, #1 1218: 122 st1 {v4.d}[0], [x0], x1 123 \type v5, v0, v1, v2, v3 124 st1 {v4.d}[1], [x7], x1 125 st1 {v5.d}[0], [x0], x1 126 subs w5, w5, #4 127 st1 {v5.d}[1], [x7], x1 128 b.le 0f 129 \type v4, v0, v1, v2, v3 130 b 8b 13116: 132 AARCH64_VALID_JUMP_TARGET 133 \type v5, v0, v1, v2, v3 134 st1 {v4.16b}, [x0], x1 135 \type v6, v0, v1, v2, v3 136 st1 {v5.16b}, [x0], x1 137 \type v7, v0, v1, v2, v3 138 st1 {v6.16b}, [x0], x1 139 subs w5, w5, #4 140 st1 {v7.16b}, [x0], x1 141 b.le 0f 142 \type v4, v0, v1, v2, v3 143 b 16b 144320: 145 AARCH64_VALID_JUMP_TARGET 146 add x7, x0, x1 147 lsl x1, x1, #1 14832: 149 \type v5, v0, v1, v2, v3 150 \type v6, v0, v1, v2, v3 151 st1 {v4.16b,v5.16b}, [x0], x1 152 \type v7, v0, v1, v2, v3 153 subs w5, w5, #2 154 st1 {v6.16b,v7.16b}, [x7], x1 155 b.le 0f 156 \type v4, v0, v1, v2, v3 157 b 32b 158640: 159 AARCH64_VALID_JUMP_TARGET 160 add x7, x0, x1 161 lsl x1, x1, #1 16264: 163 \type v5, v0, v1, v2, v3 164 \type v6, v0, v1, v2, v3 165 \type v7, v0, v1, v2, v3 166 \type v16, v0, v1, v2, v3 167 \type v17, v0, v1, v2, v3 168 st1 {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1 169 \type v18, v0, v1, v2, v3 170 \type v19, v0, v1, v2, v3 171 subs w5, w5, #2 172 st1 {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1 173 b.le 0f 174 \type v4, v0, v1, v2, v3 175 b 64b 1761280: 177 AARCH64_VALID_JUMP_TARGET 178 add x7, x0, #64 179128: 180 \type v5, v0, v1, v2, v3 181 \type v6, v0, v1, v2, v3 182 \type v7, v0, v1, v2, v3 183 \type v16, v0, v1, v2, v3 184 \type v17, v0, v1, v2, v3 185 st1 {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1 186 \type v18, v0, v1, v2, v3 187 \type v19, v0, v1, v2, v3 188 subs w5, w5, #1 189 st1 {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1 190 b.le 0f 191 \type v4, v0, v1, v2, v3 192 b 128b 1930: 194 ret 195L(\type\()_tbl): 196 .hword L(\type\()_tbl) - 1280b 197 .hword L(\type\()_tbl) - 640b 198 .hword L(\type\()_tbl) - 320b 199 .hword L(\type\()_tbl) - 16b 200 .hword L(\type\()_tbl) - 80b 201 .hword L(\type\()_tbl) - 40b 202endfunc 203.endm 204 205bidir_fn avg 206bidir_fn w_avg 207bidir_fn mask 208 209 210.macro w_mask_fn type 211function w_mask_\type\()_8bpc_neon, export=1 212 clz w8, w4 213 adr x9, L(w_mask_\type\()_tbl) 214 sub w8, w8, #24 215 ldrh w8, [x9, x8, lsl #1] 216 sub x9, x9, w8, uxtw 217 mov w10, #6903 218 dup v0.8h, w10 219.if \type == 444 220 movi v1.16b, #64 221.elseif \type == 422 222 dup v2.8b, w7 223 movi v3.8b, #129 224 sub v3.8b, v3.8b, v2.8b 225.elseif \type == 420 226 dup v2.8h, w7 227 movi v3.8h, #1, lsl #8 228 sub v3.8h, v3.8h, v2.8h 229.endif 230 add x12, x0, x1 231 lsl x1, x1, #1 232 br x9 2334: 234 AARCH64_VALID_JUMP_TARGET 235 ld1 {v4.8h, v5.8h}, [x2], #32 // tmp1 (four rows at once) 236 ld1 {v6.8h, v7.8h}, [x3], #32 // tmp2 (four rows at once) 237 subs w5, w5, #4 238 sub v16.8h, v6.8h, v4.8h 239 sub v17.8h, v7.8h, v5.8h 240 sabd v18.8h, v4.8h, v6.8h 241 sabd v19.8h, v5.8h, v7.8h 242 uqsub v18.8h, v0.8h, v18.8h 243 uqsub v19.8h, v0.8h, v19.8h 244 ushr v18.8h, v18.8h, #8 245 ushr v19.8h, v19.8h, #8 246 shl v20.8h, v18.8h, #9 247 shl v21.8h, v19.8h, #9 248 sqdmulh v20.8h, v20.8h, v16.8h 249 sqdmulh v21.8h, v21.8h, v17.8h 250 add v20.8h, v20.8h, v4.8h 251 add v21.8h, v21.8h, v5.8h 252 sqrshrun v22.8b, v20.8h, #4 253 sqrshrun v23.8b, v21.8h, #4 254.if \type == 444 255 uzp1 v18.16b, v18.16b, v19.16b // Same as xtn, xtn2 256 sub v18.16b, v1.16b, v18.16b 257 st1 {v18.16b}, [x6], #16 258.elseif \type == 422 259 addp v18.8h, v18.8h, v19.8h 260 xtn v18.8b, v18.8h 261 uhsub v18.8b, v3.8b, v18.8b 262 st1 {v18.8b}, [x6], #8 263.elseif \type == 420 264 trn1 v24.2d, v18.2d, v19.2d 265 trn2 v25.2d, v18.2d, v19.2d 266 add v24.8h, v24.8h, v25.8h 267 addp v18.8h, v24.8h, v24.8h 268 sub v18.4h, v3.4h, v18.4h 269 rshrn v18.8b, v18.8h, #2 270 st1 {v18.s}[0], [x6], #4 271.endif 272 st1 {v22.s}[0], [x0], x1 273 st1 {v22.s}[1], [x12], x1 274 st1 {v23.s}[0], [x0], x1 275 st1 {v23.s}[1], [x12], x1 276 b.gt 4b 277 ret 2788: 279 AARCH64_VALID_JUMP_TARGET 280 ld1 {v4.8h, v5.8h}, [x2], #32 281 ld1 {v6.8h, v7.8h}, [x3], #32 282 subs w5, w5, #2 283 sub v16.8h, v6.8h, v4.8h 284 sub v17.8h, v7.8h, v5.8h 285 sabd v18.8h, v4.8h, v6.8h 286 sabd v19.8h, v5.8h, v7.8h 287 uqsub v18.8h, v0.8h, v18.8h 288 uqsub v19.8h, v0.8h, v19.8h 289 ushr v18.8h, v18.8h, #8 290 ushr v19.8h, v19.8h, #8 291 shl v20.8h, v18.8h, #9 292 shl v21.8h, v19.8h, #9 293 sqdmulh v20.8h, v20.8h, v16.8h 294 sqdmulh v21.8h, v21.8h, v17.8h 295 add v20.8h, v20.8h, v4.8h 296 add v21.8h, v21.8h, v5.8h 297 sqrshrun v22.8b, v20.8h, #4 298 sqrshrun v23.8b, v21.8h, #4 299.if \type == 444 300 uzp1 v18.16b, v18.16b, v19.16b // Same as xtn, xtn2 301 sub v18.16b, v1.16b, v18.16b 302 st1 {v18.16b}, [x6], #16 303.elseif \type == 422 304 addp v18.8h, v18.8h, v19.8h 305 xtn v18.8b, v18.8h 306 uhsub v18.8b, v3.8b, v18.8b 307 st1 {v18.8b}, [x6], #8 308.elseif \type == 420 309 add v18.8h, v18.8h, v19.8h 310 addp v18.8h, v18.8h, v18.8h 311 sub v18.4h, v3.4h, v18.4h 312 rshrn v18.8b, v18.8h, #2 313 st1 {v18.s}[0], [x6], #4 314.endif 315 st1 {v22.8b}, [x0], x1 316 st1 {v23.8b}, [x12], x1 317 b.gt 8b 318 ret 3191280: 320640: 321320: 322160: 323 AARCH64_VALID_JUMP_TARGET 324 mov w11, w4 325 sub x1, x1, w4, uxtw 326.if \type == 444 327 add x10, x6, w4, uxtw 328.elseif \type == 422 329 add x10, x6, x11, lsr #1 330.endif 331 add x9, x3, w4, uxtw #1 332 add x7, x2, w4, uxtw #1 333161: 334 mov w8, w4 33516: 336 ld1 {v4.8h, v5.8h}, [x2], #32 337 ld1 {v6.8h, v7.8h}, [x3], #32 338 ld1 {v16.8h, v17.8h}, [x7], #32 339 ld1 {v18.8h, v19.8h}, [x9], #32 340 subs w8, w8, #16 341 sub v6.8h, v6.8h, v4.8h 342 sub v7.8h, v7.8h, v5.8h 343 sub v18.8h, v18.8h, v16.8h 344 sub v19.8h, v19.8h, v17.8h 345 abs v20.8h, v6.8h 346 abs v21.8h, v7.8h 347 abs v22.8h, v18.8h 348 abs v23.8h, v19.8h 349 uqsub v20.8h, v0.8h, v20.8h 350 uqsub v21.8h, v0.8h, v21.8h 351 uqsub v22.8h, v0.8h, v22.8h 352 uqsub v23.8h, v0.8h, v23.8h 353 ushr v20.8h, v20.8h, #8 354 ushr v21.8h, v21.8h, #8 355 ushr v22.8h, v22.8h, #8 356 ushr v23.8h, v23.8h, #8 357 shl v24.8h, v20.8h, #9 358 shl v25.8h, v21.8h, #9 359 shl v26.8h, v22.8h, #9 360 shl v27.8h, v23.8h, #9 361 sqdmulh v24.8h, v24.8h, v6.8h 362 sqdmulh v25.8h, v25.8h, v7.8h 363 sqdmulh v26.8h, v26.8h, v18.8h 364 sqdmulh v27.8h, v27.8h, v19.8h 365 add v24.8h, v24.8h, v4.8h 366 add v25.8h, v25.8h, v5.8h 367 add v26.8h, v26.8h, v16.8h 368 add v27.8h, v27.8h, v17.8h 369 sqrshrun v24.8b, v24.8h, #4 370 sqrshrun v25.8b, v25.8h, #4 371 sqrshrun v26.8b, v26.8h, #4 372 sqrshrun v27.8b, v27.8h, #4 373.if \type == 444 374 uzp1 v20.16b, v20.16b, v21.16b // Same as xtn, xtn2 375 uzp1 v21.16b, v22.16b, v23.16b // Ditto 376 sub v20.16b, v1.16b, v20.16b 377 sub v21.16b, v1.16b, v21.16b 378 st1 {v20.16b}, [x6], #16 379 st1 {v21.16b}, [x10], #16 380.elseif \type == 422 381 addp v20.8h, v20.8h, v21.8h 382 addp v21.8h, v22.8h, v23.8h 383 xtn v20.8b, v20.8h 384 xtn v21.8b, v21.8h 385 uhsub v20.8b, v3.8b, v20.8b 386 uhsub v21.8b, v3.8b, v21.8b 387 st1 {v20.8b}, [x6], #8 388 st1 {v21.8b}, [x10], #8 389.elseif \type == 420 390 add v20.8h, v20.8h, v22.8h 391 add v21.8h, v21.8h, v23.8h 392 addp v20.8h, v20.8h, v21.8h 393 sub v20.8h, v3.8h, v20.8h 394 rshrn v20.8b, v20.8h, #2 395 st1 {v20.8b}, [x6], #8 396.endif 397 st1 {v24.8b, v25.8b}, [x0], #16 398 st1 {v26.8b, v27.8b}, [x12], #16 399 b.gt 16b 400 subs w5, w5, #2 401 add x2, x2, w4, uxtw #1 402 add x3, x3, w4, uxtw #1 403 add x7, x7, w4, uxtw #1 404 add x9, x9, w4, uxtw #1 405.if \type == 444 406 add x6, x6, w4, uxtw 407 add x10, x10, w4, uxtw 408.elseif \type == 422 409 add x6, x6, x11, lsr #1 410 add x10, x10, x11, lsr #1 411.endif 412 add x0, x0, x1 413 add x12, x12, x1 414 b.gt 161b 415 ret 416L(w_mask_\type\()_tbl): 417 .hword L(w_mask_\type\()_tbl) - 1280b 418 .hword L(w_mask_\type\()_tbl) - 640b 419 .hword L(w_mask_\type\()_tbl) - 320b 420 .hword L(w_mask_\type\()_tbl) - 160b 421 .hword L(w_mask_\type\()_tbl) - 8b 422 .hword L(w_mask_\type\()_tbl) - 4b 423endfunc 424.endm 425 426w_mask_fn 444 427w_mask_fn 422 428w_mask_fn 420 429 430 431function blend_8bpc_neon, export=1 432 adr x6, L(blend_tbl) 433 clz w3, w3 434 sub w3, w3, #26 435 ldrh w3, [x6, x3, lsl #1] 436 sub x6, x6, w3, uxtw 437 movi v4.16b, #64 438 add x8, x0, x1 439 lsl x1, x1, #1 440 br x6 4414: 442 AARCH64_VALID_JUMP_TARGET 443 ld1 {v2.8b}, [x5], #8 444 ld1 {v1.d}[0], [x2], #8 445 ld1 {v0.s}[0], [x0] 446 subs w4, w4, #2 447 ld1 {v0.s}[1], [x8] 448 sub v3.8b, v4.8b, v2.8b 449 umull v5.8h, v1.8b, v2.8b 450 umlal v5.8h, v0.8b, v3.8b 451 rshrn v6.8b, v5.8h, #6 452 st1 {v6.s}[0], [x0], x1 453 st1 {v6.s}[1], [x8], x1 454 b.gt 4b 455 ret 4568: 457 AARCH64_VALID_JUMP_TARGET 458 ld1 {v2.16b}, [x5], #16 459 ld1 {v1.16b}, [x2], #16 460 ld1 {v0.d}[0], [x0] 461 ld1 {v0.d}[1], [x8] 462 sub v3.16b, v4.16b, v2.16b 463 subs w4, w4, #2 464 umull v5.8h, v1.8b, v2.8b 465 umlal v5.8h, v0.8b, v3.8b 466 umull2 v6.8h, v1.16b, v2.16b 467 umlal2 v6.8h, v0.16b, v3.16b 468 rshrn v7.8b, v5.8h, #6 469 rshrn2 v7.16b, v6.8h, #6 470 st1 {v7.d}[0], [x0], x1 471 st1 {v7.d}[1], [x8], x1 472 b.gt 8b 473 ret 47416: 475 AARCH64_VALID_JUMP_TARGET 476 ld1 {v1.16b, v2.16b}, [x5], #32 477 ld1 {v5.16b, v6.16b}, [x2], #32 478 ld1 {v0.16b}, [x0] 479 subs w4, w4, #2 480 sub v7.16b, v4.16b, v1.16b 481 sub v20.16b, v4.16b, v2.16b 482 ld1 {v3.16b}, [x8] 483 umull v16.8h, v5.8b, v1.8b 484 umlal v16.8h, v0.8b, v7.8b 485 umull2 v17.8h, v5.16b, v1.16b 486 umlal2 v17.8h, v0.16b, v7.16b 487 umull v21.8h, v6.8b, v2.8b 488 umlal v21.8h, v3.8b, v20.8b 489 umull2 v22.8h, v6.16b, v2.16b 490 umlal2 v22.8h, v3.16b, v20.16b 491 rshrn v18.8b, v16.8h, #6 492 rshrn2 v18.16b, v17.8h, #6 493 rshrn v19.8b, v21.8h, #6 494 rshrn2 v19.16b, v22.8h, #6 495 st1 {v18.16b}, [x0], x1 496 st1 {v19.16b}, [x8], x1 497 b.gt 16b 498 ret 49932: 500 AARCH64_VALID_JUMP_TARGET 501 ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x5], #64 502 ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64 503 ld1 {v20.16b, v21.16b}, [x0] 504 subs w4, w4, #2 505 ld1 {v22.16b, v23.16b}, [x8] 506 sub v5.16b, v4.16b, v0.16b 507 sub v6.16b, v4.16b, v1.16b 508 sub v30.16b, v4.16b, v2.16b 509 sub v31.16b, v4.16b, v3.16b 510 umull v24.8h, v16.8b, v0.8b 511 umlal v24.8h, v20.8b, v5.8b 512 umull2 v26.8h, v16.16b, v0.16b 513 umlal2 v26.8h, v20.16b, v5.16b 514 umull v28.8h, v17.8b, v1.8b 515 umlal v28.8h, v21.8b, v6.8b 516 umull2 v7.8h, v17.16b, v1.16b 517 umlal2 v7.8h, v21.16b, v6.16b 518 umull v27.8h, v18.8b, v2.8b 519 umlal v27.8h, v22.8b, v30.8b 520 umull2 v1.8h, v18.16b, v2.16b 521 umlal2 v1.8h, v22.16b, v30.16b 522 umull v29.8h, v19.8b, v3.8b 523 umlal v29.8h, v23.8b, v31.8b 524 umull2 v21.8h, v19.16b, v3.16b 525 umlal2 v21.8h, v23.16b, v31.16b 526 rshrn v24.8b, v24.8h, #6 527 rshrn2 v24.16b, v26.8h, #6 528 rshrn v25.8b, v28.8h, #6 529 rshrn2 v25.16b, v7.8h, #6 530 rshrn v27.8b, v27.8h, #6 531 rshrn2 v27.16b, v1.8h, #6 532 rshrn v28.8b, v29.8h, #6 533 rshrn2 v28.16b, v21.8h, #6 534 st1 {v24.16b, v25.16b}, [x0], x1 535 st1 {v27.16b, v28.16b}, [x8], x1 536 b.gt 32b 537 ret 538L(blend_tbl): 539 .hword L(blend_tbl) - 32b 540 .hword L(blend_tbl) - 16b 541 .hword L(blend_tbl) - 8b 542 .hword L(blend_tbl) - 4b 543endfunc 544 545function blend_h_8bpc_neon, export=1 546 adr x6, L(blend_h_tbl) 547 movrel x5, X(obmc_masks) 548 add x5, x5, w4, uxtw 549 sub w4, w4, w4, lsr #2 550 clz w7, w3 551 movi v4.16b, #64 552 add x8, x0, x1 553 lsl x1, x1, #1 554 sub w7, w7, #24 555 ldrh w7, [x6, x7, lsl #1] 556 sub x6, x6, w7, uxtw 557 br x6 5582: 559 AARCH64_VALID_JUMP_TARGET 560 ld1 {v0.h}[0], [x5], #2 561 ld1 {v1.s}[0], [x2], #4 562 subs w4, w4, #2 563 ld1 {v2.h}[0], [x0] 564 zip1 v0.8b, v0.8b, v0.8b 565 sub v3.8b, v4.8b, v0.8b 566 ld1 {v2.h}[1], [x8] 567 umull v5.8h, v1.8b, v0.8b 568 umlal v5.8h, v2.8b, v3.8b 569 rshrn v5.8b, v5.8h, #6 570 st1 {v5.h}[0], [x0], x1 571 st1 {v5.h}[1], [x8], x1 572 b.gt 2b 573 ret 5744: 575 AARCH64_VALID_JUMP_TARGET 576 ld2r {v0.8b, v1.8b}, [x5], #2 577 ld1 {v2.8b}, [x2], #8 578 subs w4, w4, #2 579 ext v0.8b, v0.8b, v1.8b, #4 580 ld1 {v3.s}[0], [x0] 581 sub v5.8b, v4.8b, v0.8b 582 ld1 {v3.s}[1], [x8] 583 umull v6.8h, v2.8b, v0.8b 584 umlal v6.8h, v3.8b, v5.8b 585 rshrn v6.8b, v6.8h, #6 586 st1 {v6.s}[0], [x0], x1 587 st1 {v6.s}[1], [x8], x1 588 b.gt 4b 589 ret 5908: 591 AARCH64_VALID_JUMP_TARGET 592 ld2r {v0.16b, v1.16b}, [x5], #2 593 ld1 {v2.16b}, [x2], #16 594 ld1 {v3.d}[0], [x0] 595 ext v0.16b, v0.16b, v1.16b, #8 596 sub v5.16b, v4.16b, v0.16b 597 ld1 {v3.d}[1], [x8] 598 subs w4, w4, #2 599 umull v6.8h, v0.8b, v2.8b 600 umlal v6.8h, v3.8b, v5.8b 601 umull2 v7.8h, v0.16b, v2.16b 602 umlal2 v7.8h, v3.16b, v5.16b 603 rshrn v16.8b, v6.8h, #6 604 rshrn2 v16.16b, v7.8h, #6 605 st1 {v16.d}[0], [x0], x1 606 st1 {v16.d}[1], [x8], x1 607 b.gt 8b 608 ret 60916: 610 AARCH64_VALID_JUMP_TARGET 611 ld2r {v0.16b, v1.16b}, [x5], #2 612 ld1 {v2.16b, v3.16b}, [x2], #32 613 ld1 {v5.16b}, [x0] 614 sub v7.16b, v4.16b, v0.16b 615 sub v16.16b, v4.16b, v1.16b 616 ld1 {v6.16b}, [x8] 617 subs w4, w4, #2 618 umull v17.8h, v0.8b, v2.8b 619 umlal v17.8h, v5.8b, v7.8b 620 umull2 v18.8h, v0.16b, v2.16b 621 umlal2 v18.8h, v5.16b, v7.16b 622 umull v19.8h, v1.8b, v3.8b 623 umlal v19.8h, v6.8b, v16.8b 624 umull2 v20.8h, v1.16b, v3.16b 625 umlal2 v20.8h, v6.16b, v16.16b 626 rshrn v21.8b, v17.8h, #6 627 rshrn2 v21.16b, v18.8h, #6 628 rshrn v22.8b, v19.8h, #6 629 rshrn2 v22.16b, v20.8h, #6 630 st1 {v21.16b}, [x0], x1 631 st1 {v22.16b}, [x8], x1 632 b.gt 16b 633 ret 6341280: 635640: 636320: 637 AARCH64_VALID_JUMP_TARGET 638 sub x1, x1, w3, uxtw 639 add x7, x2, w3, uxtw 640321: 641 ld2r {v0.16b, v1.16b}, [x5], #2 642 mov w6, w3 643 sub v20.16b, v4.16b, v0.16b 644 sub v21.16b, v4.16b, v1.16b 64532: 646 ld1 {v16.16b, v17.16b}, [x2], #32 647 ld1 {v2.16b, v3.16b}, [x0] 648 subs w6, w6, #32 649 umull v23.8h, v0.8b, v16.8b 650 umlal v23.8h, v2.8b, v20.8b 651 ld1 {v18.16b, v19.16b}, [x7], #32 652 umull2 v27.8h, v0.16b, v16.16b 653 umlal2 v27.8h, v2.16b, v20.16b 654 ld1 {v6.16b, v7.16b}, [x8] 655 umull v24.8h, v0.8b, v17.8b 656 umlal v24.8h, v3.8b, v20.8b 657 umull2 v28.8h, v0.16b, v17.16b 658 umlal2 v28.8h, v3.16b, v20.16b 659 umull v25.8h, v1.8b, v18.8b 660 umlal v25.8h, v6.8b, v21.8b 661 umull2 v5.8h, v1.16b, v18.16b 662 umlal2 v5.8h, v6.16b, v21.16b 663 rshrn v29.8b, v23.8h, #6 664 rshrn2 v29.16b, v27.8h, #6 665 umull v26.8h, v1.8b, v19.8b 666 umlal v26.8h, v7.8b, v21.8b 667 umull2 v31.8h, v1.16b, v19.16b 668 umlal2 v31.8h, v7.16b, v21.16b 669 rshrn v30.8b, v24.8h, #6 670 rshrn2 v30.16b, v28.8h, #6 671 rshrn v23.8b, v25.8h, #6 672 rshrn2 v23.16b, v5.8h, #6 673 rshrn v24.8b, v26.8h, #6 674 st1 {v29.16b, v30.16b}, [x0], #32 675 rshrn2 v24.16b, v31.8h, #6 676 st1 {v23.16b, v24.16b}, [x8], #32 677 b.gt 32b 678 subs w4, w4, #2 679 add x0, x0, x1 680 add x8, x8, x1 681 add x2, x2, w3, uxtw 682 add x7, x7, w3, uxtw 683 b.gt 321b 684 ret 685L(blend_h_tbl): 686 .hword L(blend_h_tbl) - 1280b 687 .hword L(blend_h_tbl) - 640b 688 .hword L(blend_h_tbl) - 320b 689 .hword L(blend_h_tbl) - 16b 690 .hword L(blend_h_tbl) - 8b 691 .hword L(blend_h_tbl) - 4b 692 .hword L(blend_h_tbl) - 2b 693endfunc 694 695function blend_v_8bpc_neon, export=1 696 adr x6, L(blend_v_tbl) 697 movrel x5, X(obmc_masks) 698 add x5, x5, w3, uxtw 699 clz w3, w3 700 movi v4.16b, #64 701 add x8, x0, x1 702 lsl x1, x1, #1 703 sub w3, w3, #26 704 ldrh w3, [x6, x3, lsl #1] 705 sub x6, x6, w3, uxtw 706 br x6 70720: 708 AARCH64_VALID_JUMP_TARGET 709 ld1r {v0.8b}, [x5] 710 sub v1.8b, v4.8b, v0.8b 7112: 712 ld1 {v2.h}[0], [x2], #2 713 ld1 {v3.b}[0], [x0] 714 subs w4, w4, #2 715 ld1 {v2.b}[1], [x2] 716 ld1 {v3.b}[1], [x8] 717 umull v5.8h, v2.8b, v0.8b 718 umlal v5.8h, v3.8b, v1.8b 719 rshrn v5.8b, v5.8h, #6 720 add x2, x2, #2 721 st1 {v5.b}[0], [x0], x1 722 st1 {v5.b}[1], [x8], x1 723 b.gt 2b 724 ret 72540: 726 AARCH64_VALID_JUMP_TARGET 727 ld1r {v0.2s}, [x5] 728 sub x1, x1, #2 729 sub v1.8b, v4.8b, v0.8b 7304: 731 ld1 {v2.8b}, [x2], #8 732 ld1 {v3.s}[0], [x0] 733 ld1 {v3.s}[1], [x8] 734 subs w4, w4, #2 735 umull v5.8h, v2.8b, v0.8b 736 umlal v5.8h, v3.8b, v1.8b 737 rshrn v5.8b, v5.8h, #6 738 st1 {v5.h}[0], [x0], #2 739 st1 {v5.h}[2], [x8], #2 740 st1 {v5.b}[2], [x0], x1 741 st1 {v5.b}[6], [x8], x1 742 b.gt 4b 743 ret 74480: 745 AARCH64_VALID_JUMP_TARGET 746 ld1r {v0.2d}, [x5] 747 sub x1, x1, #4 748 sub v1.16b, v4.16b, v0.16b 7498: 750 ld1 {v2.16b}, [x2], #16 751 ld1 {v3.d}[0], [x0] 752 ld1 {v3.d}[1], [x8] 753 subs w4, w4, #2 754 umull v5.8h, v0.8b, v2.8b 755 umlal v5.8h, v3.8b, v1.8b 756 umull2 v6.8h, v0.16b, v2.16b 757 umlal2 v6.8h, v3.16b, v1.16b 758 rshrn v7.8b, v5.8h, #6 759 rshrn2 v7.16b, v6.8h, #6 760 st1 {v7.s}[0], [x0], #4 761 st1 {v7.s}[2], [x8], #4 762 st1 {v7.h}[2], [x0], x1 763 st1 {v7.h}[6], [x8], x1 764 b.gt 8b 765 ret 766160: 767 AARCH64_VALID_JUMP_TARGET 768 ld1 {v0.16b}, [x5] 769 sub x1, x1, #8 770 sub v2.16b, v4.16b, v0.16b 77116: 772 ld1 {v5.16b, v6.16b}, [x2], #32 773 ld1 {v7.16b}, [x0] 774 subs w4, w4, #2 775 ld1 {v16.16b}, [x8] 776 umull v17.8h, v5.8b, v0.8b 777 umlal v17.8h, v7.8b, v2.8b 778 umull2 v18.8h, v5.16b, v0.16b 779 umlal2 v18.8h, v7.16b, v2.16b 780 umull v20.8h, v6.8b, v0.8b 781 umlal v20.8h, v16.8b, v2.8b 782 umull2 v21.8h, v6.16b, v0.16b 783 umlal2 v21.8h, v16.16b, v2.16b 784 rshrn v19.8b, v17.8h, #6 785 rshrn2 v19.16b, v18.8h, #6 786 rshrn v22.8b, v20.8h, #6 787 rshrn2 v22.16b, v21.8h, #6 788 st1 {v19.8b}, [x0], #8 789 st1 {v22.8b}, [x8], #8 790 st1 {v19.s}[2], [x0], x1 791 st1 {v22.s}[2], [x8], x1 792 b.gt 16b 793 ret 794320: 795 AARCH64_VALID_JUMP_TARGET 796 ld1 {v0.16b, v1.16b}, [x5] 797 sub x1, x1, #16 798 sub v2.16b, v4.16b, v0.16b 799 sub v3.8b, v4.8b, v1.8b 80032: 801 ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64 802 ld1 {v5.16b, v6.16b}, [x0] 803 subs w4, w4, #2 804 ld1 {v20.16b, v21.16b}, [x8] 805 umull v22.8h, v16.8b, v0.8b 806 umlal v22.8h, v5.8b, v2.8b 807 umull2 v23.8h, v16.16b, v0.16b 808 umlal2 v23.8h, v5.16b, v2.16b 809 umull v28.8h, v17.8b, v1.8b 810 umlal v28.8h, v6.8b, v3.8b 811 umull v30.8h, v18.8b, v0.8b 812 umlal v30.8h, v20.8b, v2.8b 813 umull2 v31.8h, v18.16b, v0.16b 814 umlal2 v31.8h, v20.16b, v2.16b 815 umull v25.8h, v19.8b, v1.8b 816 umlal v25.8h, v21.8b, v3.8b 817 rshrn v24.8b, v22.8h, #6 818 rshrn2 v24.16b, v23.8h, #6 819 rshrn v28.8b, v28.8h, #6 820 rshrn v30.8b, v30.8h, #6 821 rshrn2 v30.16b, v31.8h, #6 822 rshrn v27.8b, v25.8h, #6 823 st1 {v24.16b}, [x0], #16 824 st1 {v30.16b}, [x8], #16 825 st1 {v28.8b}, [x0], x1 826 st1 {v27.8b}, [x8], x1 827 b.gt 32b 828 ret 829L(blend_v_tbl): 830 .hword L(blend_v_tbl) - 320b 831 .hword L(blend_v_tbl) - 160b 832 .hword L(blend_v_tbl) - 80b 833 .hword L(blend_v_tbl) - 40b 834 .hword L(blend_v_tbl) - 20b 835endfunc 836 837 838// This has got the same signature as the put_8tap functions, 839// and assumes that x8 is set to (clz(w)-24). 840function put_neon, export=1 841 adr x9, L(put_tbl) 842 ldrh w8, [x9, x8, lsl #1] 843 sub x9, x9, x8 844 br x9 845 84620: 847 AARCH64_VALID_JUMP_TARGET 8482: 849 ldrh w9, [x2] 850 ldrh w10, [x2, x3] 851 add x2, x2, x3, lsl #1 852 subs w5, w5, #2 853 strh w9, [x0] 854 strh w10, [x0, x1] 855 add x0, x0, x1, lsl #1 856 b.gt 2b 857 ret 85840: 859 AARCH64_VALID_JUMP_TARGET 8604: 861 ldr w9, [x2] 862 ldr w10, [x2, x3] 863 add x2, x2, x3, lsl #1 864 subs w5, w5, #2 865 str w9, [x0] 866 str w10, [x0, x1] 867 add x0, x0, x1, lsl #1 868 b.gt 4b 869 ret 87080: 871 AARCH64_VALID_JUMP_TARGET 8728: 873 ldr x9, [x2] 874 ldr x10, [x2, x3] 875 add x2, x2, x3, lsl #1 876 subs w5, w5, #2 877 str x9, [x0] 878 str x10, [x0, x1] 879 add x0, x0, x1, lsl #1 880 b.gt 8b 881 ret 882160: 883 AARCH64_VALID_JUMP_TARGET 88416: 885 ldr q0, [x2] 886 ldr q1, [x2, x3] 887 add x2, x2, x3, lsl #1 888 subs w5, w5, #2 889 str q0, [x0] 890 str q1, [x0, x1] 891 add x0, x0, x1, lsl #1 892 b.gt 16b 893 ret 894320: 895 AARCH64_VALID_JUMP_TARGET 89632: 897 ldp q0, q1, [x2] 898 add x2, x2, x3 899 stp q0, q1, [x0] 900 add x0, x0, x1 901 ldp q2, q3, [x2] 902 add x2, x2, x3 903 stp q2, q3, [x0] 904 subs w5, w5, #2 905 add x0, x0, x1 906 b.gt 32b 907 ret 908640: 909 AARCH64_VALID_JUMP_TARGET 91064: 911 ldp q0, q1, [x2] 912 stp q0, q1, [x0] 913 ldp q2, q3, [x2, #32] 914 add x2, x2, x3 915 stp q2, q3, [x0, #32] 916 subs w5, w5, #1 917 add x0, x0, x1 918 b.gt 64b 919 ret 9201280: 921 AARCH64_VALID_JUMP_TARGET 922128: 923 ldp q0, q1, [x2] 924 stp q0, q1, [x0] 925 ldp q2, q3, [x2, #32] 926 stp q2, q3, [x0, #32] 927 ldp q4, q5, [x2, #64] 928 stp q4, q5, [x0, #64] 929 ldp q6, q7, [x2, #96] 930 add x2, x2, x3 931 stp q6, q7, [x0, #96] 932 subs w5, w5, #1 933 add x0, x0, x1 934 b.gt 128b 935 ret 936 937L(put_tbl): 938 .hword L(put_tbl) - 1280b 939 .hword L(put_tbl) - 640b 940 .hword L(put_tbl) - 320b 941 .hword L(put_tbl) - 160b 942 .hword L(put_tbl) - 80b 943 .hword L(put_tbl) - 40b 944 .hword L(put_tbl) - 20b 945endfunc 946 947 948// This has got the same signature as the prep_8tap functions, 949// and assumes that x8 is set to (clz(w)-24), and x7 to w*2. 950function prep_neon, export=1 951 adr x9, L(prep_tbl) 952 ldrh w8, [x9, x8, lsl #1] 953 movi v24.16b, #16 954 sub x9, x9, x8 955 br x9 956 95740: 958 AARCH64_VALID_JUMP_TARGET 9594: 960 ld1 {v0.s}[0], [x1], x2 961 ld1 {v0.s}[1], [x1], x2 962 ld1 {v1.s}[0], [x1], x2 963 ld1 {v1.s}[1], [x1], x2 964 ushll v0.8h, v0.8b, #4 965 ushll v1.8h, v1.8b, #4 966 subs w4, w4, #4 967 stp q0, q1, [x0], #32 968 b.gt 4b 969 ret 97080: 971 AARCH64_VALID_JUMP_TARGET 9728: 973 ldr d0, [x1] 974 ldr d1, [x1, x2] 975 add x1, x1, x2, lsl #1 976 ldr d2, [x1] 977 ldr d3, [x1, x2] 978 add x1, x1, x2, lsl #1 979 ushll v0.8h, v0.8b, #4 980 ushll v1.8h, v1.8b, #4 981 umull v2.8h, v2.8b, v24.8b 982 umull v3.8h, v3.8b, v24.8b 983 subs w4, w4, #4 984 stp q0, q1, [x0] 985 stp q2, q3, [x0, #32] 986 add x0, x0, #64 987 b.gt 8b 988 ret 989160: 990 AARCH64_VALID_JUMP_TARGET 99116: 992 ldr q1, [x1] 993 ldr q3, [x1, x2] 994 add x1, x1, x2, lsl #1 995 ushll v0.8h, v1.8b, #4 996 ushll2 v1.8h, v1.16b, #4 997 ldr q5, [x1] 998 ldr q7, [x1, x2] 999 add x1, x1, x2, lsl #1 1000 umull v2.8h, v3.8b, v24.8b 1001 umull2 v3.8h, v3.16b, v24.16b 1002 ushll v4.8h, v5.8b, #4 1003 ushll2 v5.8h, v5.16b, #4 1004 umull v6.8h, v7.8b, v24.8b 1005 umull2 v7.8h, v7.16b, v24.16b 1006 subs w4, w4, #4 1007 stp q0, q1, [x0] 1008 stp q2, q3, [x0, #32] 1009 stp q4, q5, [x0, #64] 1010 stp q6, q7, [x0, #96] 1011 add x0, x0, #128 1012 b.gt 16b 1013 ret 1014320: 1015 AARCH64_VALID_JUMP_TARGET 101632: 1017 ldp q4, q5, [x1] 1018 add x1, x1, x2 1019 ldp q6, q7, [x1] 1020 add x1, x1, x2 1021 ushll v0.8h, v4.8b, #4 1022 ushll2 v1.8h, v4.16b, #4 1023 umull v2.8h, v5.8b, v24.8b 1024 umull2 v3.8h, v5.16b, v24.16b 1025 ushll v4.8h, v6.8b, #4 1026 ushll2 v5.8h, v6.16b, #4 1027 umull v6.8h, v7.8b, v24.8b 1028 umull2 v7.8h, v7.16b, v24.16b 1029 subs w4, w4, #2 1030 stp q0, q1, [x0] 1031 stp q2, q3, [x0, #32] 1032 stp q4, q5, [x0, #64] 1033 stp q6, q7, [x0, #96] 1034 add x0, x0, #128 1035 b.gt 32b 1036 ret 1037640: 1038 AARCH64_VALID_JUMP_TARGET 103964: 1040 ldp q4, q5, [x1] 1041 ldp q6, q7, [x1, #32] 1042 add x1, x1, x2 1043 ushll v0.8h, v4.8b, #4 1044 ushll2 v1.8h, v4.16b, #4 1045 umull v2.8h, v5.8b, v24.8b 1046 umull2 v3.8h, v5.16b, v24.16b 1047 ushll v4.8h, v6.8b, #4 1048 ushll2 v5.8h, v6.16b, #4 1049 umull v6.8h, v7.8b, v24.8b 1050 umull2 v7.8h, v7.16b, v24.16b 1051 subs w4, w4, #1 1052 stp q0, q1, [x0] 1053 stp q2, q3, [x0, #32] 1054 stp q4, q5, [x0, #64] 1055 stp q6, q7, [x0, #96] 1056 add x0, x0, #128 1057 b.gt 64b 1058 ret 10591280: 1060 AARCH64_VALID_JUMP_TARGET 1061128: 1062 ldp q28, q29, [x1] 1063 ldp q30, q31, [x1, #32] 1064 ushll v16.8h, v28.8b, #4 1065 ushll2 v17.8h, v28.16b, #4 1066 umull v18.8h, v29.8b, v24.8b 1067 umull2 v19.8h, v29.16b, v24.16b 1068 ushll v20.8h, v30.8b, #4 1069 ushll2 v21.8h, v30.16b, #4 1070 umull v22.8h, v31.8b, v24.8b 1071 umull2 v23.8h, v31.16b, v24.16b 1072 ldp q28, q29, [x1, #64] 1073 ldp q30, q31, [x1, #96] 1074 add x1, x1, x2 1075 stp q16, q17, [x0] 1076 stp q18, q19, [x0, #32] 1077 stp q20, q21, [x0, #64] 1078 stp q22, q23, [x0, #96] 1079 ushll v16.8h, v28.8b, #4 1080 ushll2 v17.8h, v28.16b, #4 1081 umull v18.8h, v29.8b, v24.8b 1082 umull2 v19.8h, v29.16b, v24.16b 1083 ushll v20.8h, v30.8b, #4 1084 ushll2 v21.8h, v30.16b, #4 1085 umull v22.8h, v31.8b, v24.8b 1086 umull2 v23.8h, v31.16b, v24.16b 1087 subs w4, w4, #1 1088 stp q16, q17, [x0, #128] 1089 stp q18, q19, [x0, #160] 1090 stp q20, q21, [x0, #192] 1091 stp q22, q23, [x0, #224] 1092 add x0, x0, #256 1093 b.gt 128b 1094 ret 1095 1096L(prep_tbl): 1097 .hword L(prep_tbl) - 1280b 1098 .hword L(prep_tbl) - 640b 1099 .hword L(prep_tbl) - 320b 1100 .hword L(prep_tbl) - 160b 1101 .hword L(prep_tbl) - 80b 1102 .hword L(prep_tbl) - 40b 1103endfunc 1104 1105 1106.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6 1107 ld1 {\d0\wd}[0], [\s0], \strd 1108 ld1 {\d1\wd}[0], [\s1], \strd 1109.ifnb \d2 1110 ld1 {\d2\wd}[0], [\s0], \strd 1111 ld1 {\d3\wd}[0], [\s1], \strd 1112.endif 1113.ifnb \d4 1114 ld1 {\d4\wd}[0], [\s0], \strd 1115.endif 1116.ifnb \d5 1117 ld1 {\d5\wd}[0], [\s1], \strd 1118.endif 1119.ifnb \d6 1120 ld1 {\d6\wd}[0], [\s0], \strd 1121.endif 1122.endm 1123.macro load_reg s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6 1124 ld1 {\d0\wd}, [\s0], \strd 1125 ld1 {\d1\wd}, [\s1], \strd 1126.ifnb \d2 1127 ld1 {\d2\wd}, [\s0], \strd 1128 ld1 {\d3\wd}, [\s1], \strd 1129.endif 1130.ifnb \d4 1131 ld1 {\d4\wd}, [\s0], \strd 1132.endif 1133.ifnb \d5 1134 ld1 {\d5\wd}, [\s1], \strd 1135.endif 1136.ifnb \d6 1137 ld1 {\d6\wd}, [\s0], \strd 1138.endif 1139.endm 1140.macro load_h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 1141 load_slice \s0, \s1, \strd, .h, \d0, \d1, \d2, \d3, \d4, \d5, \d6 1142.endm 1143.macro load_s s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 1144 load_slice \s0, \s1, \strd, .s, \d0, \d1, \d2, \d3, \d4, \d5, \d6 1145.endm 1146.macro load_8b s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 1147 load_reg \s0, \s1, \strd, .8b, \d0, \d1, \d2, \d3, \d4, \d5, \d6 1148.endm 1149.macro load_16b s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 1150 load_reg \s0, \s1, \strd, .16b, \d0, \d1, \d2, \d3, \d4, \d5, \d6 1151.endm 1152.macro interleave_1 wd, r0, r1, r2, r3, r4 1153 trn1 \r0\wd, \r0\wd, \r1\wd 1154 trn1 \r1\wd, \r1\wd, \r2\wd 1155.ifnb \r3 1156 trn1 \r2\wd, \r2\wd, \r3\wd 1157 trn1 \r3\wd, \r3\wd, \r4\wd 1158.endif 1159.endm 1160.macro interleave_1_h r0, r1, r2, r3, r4 1161 interleave_1 .4h, \r0, \r1, \r2, \r3, \r4 1162.endm 1163.macro interleave_1_s r0, r1, r2, r3, r4 1164 interleave_1 .2s, \r0, \r1, \r2, \r3, \r4 1165.endm 1166.macro interleave_2 wd, r0, r1, r2, r3, r4, r5 1167 trn1 \r0\wd, \r0\wd, \r2\wd 1168 trn1 \r1\wd, \r1\wd, \r3\wd 1169 trn1 \r2\wd, \r2\wd, \r4\wd 1170 trn1 \r3\wd, \r3\wd, \r5\wd 1171.endm 1172.macro interleave_2_s r0, r1, r2, r3, r4, r5 1173 interleave_2 .2s, \r0, \r1, \r2, \r3, \r4, \r5 1174.endm 1175.macro uxtl_b r0, r1, r2, r3, r4, r5, r6 1176 uxtl \r0\().8h, \r0\().8b 1177 uxtl \r1\().8h, \r1\().8b 1178.ifnb \r2 1179 uxtl \r2\().8h, \r2\().8b 1180 uxtl \r3\().8h, \r3\().8b 1181.endif 1182.ifnb \r4 1183 uxtl \r4\().8h, \r4\().8b 1184.endif 1185.ifnb \r5 1186 uxtl \r5\().8h, \r5\().8b 1187.endif 1188.ifnb \r6 1189 uxtl \r6\().8h, \r6\().8b 1190.endif 1191.endm 1192.macro mul_mla_4tap d, s0, s1, s2, s3, wd 1193 mul \d\wd, \s0\wd, v0.h[0] 1194 mla \d\wd, \s1\wd, v0.h[1] 1195 mla \d\wd, \s2\wd, v0.h[2] 1196 mla \d\wd, \s3\wd, v0.h[3] 1197.endm 1198// Interleaving the mul/mla chains actually hurts performance 1199// significantly on Cortex A53, thus keeping mul/mla tightly 1200// chained like this. 1201.macro mul_mla_6tap_0_4h d0, s0, s1, s2, s3, s4, s5, s6, s7 1202 mul \d0\().4h, \s1\().4h, v0.h[1] 1203 mla \d0\().4h, \s2\().4h, v0.h[2] 1204 mla \d0\().4h, \s3\().4h, v0.h[3] 1205 mla \d0\().4h, \s4\().4h, v0.h[4] 1206 mla \d0\().4h, \s5\().4h, v0.h[5] 1207 mla \d0\().4h, \s6\().4h, v0.h[6] 1208.endm 1209.macro mul_mla_6tap_0 d0, s0, s1, s2, s3, s4, s5, s6, s7 1210 mul \d0\().8h, \s1\().8h, v0.h[1] 1211 mla \d0\().8h, \s2\().8h, v0.h[2] 1212 mla \d0\().8h, \s3\().8h, v0.h[3] 1213 mla \d0\().8h, \s4\().8h, v0.h[4] 1214 mla \d0\().8h, \s5\().8h, v0.h[5] 1215 mla \d0\().8h, \s6\().8h, v0.h[6] 1216.endm 1217.macro mul_mla_6tap_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8 1218 mul \d0\().8h, \s1\().8h, v0.h[1] 1219 mla \d0\().8h, \s2\().8h, v0.h[2] 1220 mla \d0\().8h, \s3\().8h, v0.h[3] 1221 mla \d0\().8h, \s4\().8h, v0.h[4] 1222 mla \d0\().8h, \s5\().8h, v0.h[5] 1223 mla \d0\().8h, \s6\().8h, v0.h[6] 1224 mul \d1\().8h, \s2\().8h, v0.h[1] 1225 mla \d1\().8h, \s3\().8h, v0.h[2] 1226 mla \d1\().8h, \s4\().8h, v0.h[3] 1227 mla \d1\().8h, \s5\().8h, v0.h[4] 1228 mla \d1\().8h, \s6\().8h, v0.h[5] 1229 mla \d1\().8h, \s7\().8h, v0.h[6] 1230.endm 1231.macro mul_mla_6tap_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9 1232 mul \d0\().8h, \s1\().8h, v0.h[1] 1233 mla \d0\().8h, \s2\().8h, v0.h[2] 1234 mla \d0\().8h, \s3\().8h, v0.h[3] 1235 mla \d0\().8h, \s4\().8h, v0.h[4] 1236 mla \d0\().8h, \s5\().8h, v0.h[5] 1237 mla \d0\().8h, \s6\().8h, v0.h[6] 1238 mul \d1\().8h, \s3\().8h, v0.h[1] 1239 mla \d1\().8h, \s4\().8h, v0.h[2] 1240 mla \d1\().8h, \s5\().8h, v0.h[3] 1241 mla \d1\().8h, \s6\().8h, v0.h[4] 1242 mla \d1\().8h, \s7\().8h, v0.h[5] 1243 mla \d1\().8h, \s8\().8h, v0.h[6] 1244.endm 1245.macro mul_mla_8tap_0_4h d0, s0, s1, s2, s3, s4, s5, s6, s7 1246 mul \d0\().4h, \s0\().4h, v0.h[0] 1247 mla \d0\().4h, \s1\().4h, v0.h[1] 1248 mla \d0\().4h, \s2\().4h, v0.h[2] 1249 mla \d0\().4h, \s3\().4h, v0.h[3] 1250 mla \d0\().4h, \s4\().4h, v0.h[4] 1251 mla \d0\().4h, \s5\().4h, v0.h[5] 1252 mla \d0\().4h, \s6\().4h, v0.h[6] 1253 mla \d0\().4h, \s7\().4h, v0.h[7] 1254.endm 1255.macro mul_mla_8tap_0 d0, s0, s1, s2, s3, s4, s5, s6, s7 1256 mul \d0\().8h, \s0\().8h, v0.h[0] 1257 mla \d0\().8h, \s1\().8h, v0.h[1] 1258 mla \d0\().8h, \s2\().8h, v0.h[2] 1259 mla \d0\().8h, \s3\().8h, v0.h[3] 1260 mla \d0\().8h, \s4\().8h, v0.h[4] 1261 mla \d0\().8h, \s5\().8h, v0.h[5] 1262 mla \d0\().8h, \s6\().8h, v0.h[6] 1263 mla \d0\().8h, \s7\().8h, v0.h[7] 1264.endm 1265.macro mul_mla_8tap_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8 1266 mul \d0\().8h, \s0\().8h, v0.h[0] 1267 mla \d0\().8h, \s1\().8h, v0.h[1] 1268 mla \d0\().8h, \s2\().8h, v0.h[2] 1269 mla \d0\().8h, \s3\().8h, v0.h[3] 1270 mla \d0\().8h, \s4\().8h, v0.h[4] 1271 mla \d0\().8h, \s5\().8h, v0.h[5] 1272 mla \d0\().8h, \s6\().8h, v0.h[6] 1273 mla \d0\().8h, \s7\().8h, v0.h[7] 1274 mul \d1\().8h, \s1\().8h, v0.h[0] 1275 mla \d1\().8h, \s2\().8h, v0.h[1] 1276 mla \d1\().8h, \s3\().8h, v0.h[2] 1277 mla \d1\().8h, \s4\().8h, v0.h[3] 1278 mla \d1\().8h, \s5\().8h, v0.h[4] 1279 mla \d1\().8h, \s6\().8h, v0.h[5] 1280 mla \d1\().8h, \s7\().8h, v0.h[6] 1281 mla \d1\().8h, \s8\().8h, v0.h[7] 1282.endm 1283.macro mul_mla_8tap_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9 1284 mul \d0\().8h, \s0\().8h, v0.h[0] 1285 mla \d0\().8h, \s1\().8h, v0.h[1] 1286 mla \d0\().8h, \s2\().8h, v0.h[2] 1287 mla \d0\().8h, \s3\().8h, v0.h[3] 1288 mla \d0\().8h, \s4\().8h, v0.h[4] 1289 mla \d0\().8h, \s5\().8h, v0.h[5] 1290 mla \d0\().8h, \s6\().8h, v0.h[6] 1291 mla \d0\().8h, \s7\().8h, v0.h[7] 1292 mul \d1\().8h, \s2\().8h, v0.h[0] 1293 mla \d1\().8h, \s3\().8h, v0.h[1] 1294 mla \d1\().8h, \s4\().8h, v0.h[2] 1295 mla \d1\().8h, \s5\().8h, v0.h[3] 1296 mla \d1\().8h, \s6\().8h, v0.h[4] 1297 mla \d1\().8h, \s7\().8h, v0.h[5] 1298 mla \d1\().8h, \s8\().8h, v0.h[6] 1299 mla \d1\().8h, \s9\().8h, v0.h[7] 1300.endm 1301.macro sqrshrun_b shift, r0, r1, r2, r3 1302 sqrshrun \r0\().8b, \r0\().8h, #\shift 1303.ifnb \r1 1304 sqrshrun \r1\().8b, \r1\().8h, #\shift 1305.endif 1306.ifnb \r2 1307 sqrshrun \r2\().8b, \r2\().8h, #\shift 1308 sqrshrun \r3\().8b, \r3\().8h, #\shift 1309.endif 1310.endm 1311.macro srshr_h shift, r0, r1, r2, r3 1312 srshr \r0\().8h, \r0\().8h, #\shift 1313.ifnb \r1 1314 srshr \r1\().8h, \r1\().8h, #\shift 1315.endif 1316.ifnb \r2 1317 srshr \r2\().8h, \r2\().8h, #\shift 1318 srshr \r3\().8h, \r3\().8h, #\shift 1319.endif 1320.endm 1321.macro st_h strd, reg, lanes 1322 st1 {\reg\().h}[0], [x0], \strd 1323 st1 {\reg\().h}[1], [x8], \strd 1324.if \lanes > 2 1325 st1 {\reg\().h}[2], [x0], \strd 1326 st1 {\reg\().h}[3], [x8], \strd 1327.endif 1328.endm 1329.macro st_s strd, r0, r1 1330 st1 {\r0\().s}[0], [x0], \strd 1331 st1 {\r0\().s}[1], [x8], \strd 1332.ifnb \r1 1333 st1 {\r1\().s}[0], [x0], \strd 1334 st1 {\r1\().s}[1], [x8], \strd 1335.endif 1336.endm 1337.macro st_d strd, r0, r1 1338 st1 {\r0\().d}[0], [x0], \strd 1339 st1 {\r0\().d}[1], [x8], \strd 1340.ifnb \r1 1341 st1 {\r1\().d}[0], [x0], \strd 1342 st1 {\r1\().d}[1], [x8], \strd 1343.endif 1344.endm 1345.macro shift_store_4 type, strd, r0, r1 1346.ifc \type, put 1347 sqrshrun_b 6, \r0, \r1 1348 st_s \strd, \r0, \r1 1349.else 1350 srshr_h 2, \r0, \r1 1351 st_d \strd, \r0, \r1 1352.endif 1353.endm 1354.macro st_reg strd, wd, r0, r1, r2, r3, r4, r5, r6, r7 1355 st1 {\r0\wd}, [x0], \strd 1356 st1 {\r1\wd}, [x8], \strd 1357.ifnb \r2 1358 st1 {\r2\wd}, [x0], \strd 1359 st1 {\r3\wd}, [x8], \strd 1360.endif 1361.ifnb \r4 1362 st1 {\r4\wd}, [x0], \strd 1363 st1 {\r5\wd}, [x8], \strd 1364 st1 {\r6\wd}, [x0], \strd 1365 st1 {\r7\wd}, [x8], \strd 1366.endif 1367.endm 1368.macro st_8b strd, r0, r1, r2, r3, r4, r5, r6, r7 1369 st_reg \strd, .8b, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7 1370.endm 1371.macro st_16b strd, r0, r1, r2, r3, r4, r5, r6, r7 1372 st_reg \strd, .16b, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7 1373.endm 1374.macro shift_store_8 type, strd, r0, r1, r2, r3 1375.ifc \type, put 1376 sqrshrun_b 6, \r0, \r1, \r2, \r3 1377 st_8b \strd, \r0, \r1, \r2, \r3 1378.else 1379 srshr_h 2, \r0, \r1, \r2, \r3 1380 st_16b \strd, \r0, \r1, \r2, \r3 1381.endif 1382.endm 1383.macro shift_store_16 type, strd, r0, r1, r2, r3 1384.ifc \type, put 1385 sqrshrun \r0\().8b, \r0\().8h, #6 1386 sqrshrun2 \r0\().16b, \r1\().8h, #6 1387 sqrshrun \r2\().8b, \r2\().8h, #6 1388 sqrshrun2 \r2\().16b, \r3\().8h, #6 1389 st_16b \strd, \r0, \r2 1390.else 1391 srshr_h 2, \r0, \r1, \r2, \r3 1392 st1 {\r0\().8h, \r1\().8h}, [x0], \strd 1393 st1 {\r2\().8h, \r3\().8h}, [x8], \strd 1394.endif 1395.endm 1396 1397.macro make_8tap_fn op, type, type_h, type_v, taps 1398function \op\()_8tap_\type\()_8bpc_neon, export=1 1399 mov x8, \type_h 1400 mov x9, \type_v 1401 b \op\()_\taps\()_neon 1402endfunc 1403.endm 1404 1405// No spaces in these expressions, due to gas-preprocessor. 1406#define REGULAR ((0*15<<7)|3*15) 1407#define SMOOTH ((1*15<<7)|4*15) 1408#define SHARP ((2*15<<7)|3*15) 1409 1410.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2, sr2, shift_hv, taps 1411function \type\()_\taps\()_neon 1412 mov w10, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0) 1413 mul \mx, \mx, w10 1414 mul \my, \my, w10 1415 add \mx, \mx, w8 // mx, 8tap_h, 4tap_h 1416 add \my, \my, w9 // my, 8tap_v, 4tap_v 1417.ifc \type, prep 1418 uxtw \d_strd, \w 1419 lsl \d_strd, \d_strd, #1 1420.endif 1421 1422 clz w8, \w 1423 tst \mx, #(0x7f << 14) 1424 sub w8, w8, #24 1425 movrel x10, X(mc_subpel_filters), -8 1426 b.ne L(\type\()_\taps\()_h) 1427 tst \my, #(0x7f << 14) 1428 b.ne L(\type\()_\taps\()_v) 1429 b \type\()_neon 1430 1431L(\type\()_\taps\()_h): 1432 cmp \w, #4 1433 ubfx w9, \mx, #7, #7 1434 and \mx, \mx, #0x7f 1435 b.le 4f 1436 mov \mx, w9 14374: 1438 tst \my, #(0x7f << 14) 1439 add \xmx, x10, \mx, uxtw #3 1440 b.ne L(\type\()_\taps\()_hv) 1441 1442 adr x9, L(\type\()_\taps\()_h_tbl) 1443 ldrh w8, [x9, x8, lsl #1] 1444 sub x9, x9, w8, uxtw 1445 br x9 1446 144720: // 2xN h 1448 AARCH64_VALID_JUMP_TARGET 1449.ifc \type, put 1450 add \xmx, \xmx, #2 1451 ld1 {v0.s}[0], [\xmx] 1452 sub \src, \src, #1 1453 add \ds2, \dst, \d_strd 1454 add \sr2, \src, \s_strd 1455 lsl \d_strd, \d_strd, #1 1456 lsl \s_strd, \s_strd, #1 1457 sxtl v0.8h, v0.8b 14582: 1459 ld1 {v4.8b}, [\src], \s_strd 1460 ld1 {v6.8b}, [\sr2], \s_strd 1461 uxtl v4.8h, v4.8b 1462 uxtl v6.8h, v6.8b 1463 ext v5.16b, v4.16b, v4.16b, #2 1464 ext v7.16b, v6.16b, v6.16b, #2 1465 subs \h, \h, #2 1466 trn1 v3.2s, v4.2s, v6.2s 1467 trn2 v6.2s, v4.2s, v6.2s 1468 trn1 v4.2s, v5.2s, v7.2s 1469 trn2 v7.2s, v5.2s, v7.2s 1470 mul v3.4h, v3.4h, v0.h[0] 1471 mla v3.4h, v4.4h, v0.h[1] 1472 mla v3.4h, v6.4h, v0.h[2] 1473 mla v3.4h, v7.4h, v0.h[3] 1474 srshr v3.4h, v3.4h, #2 1475 sqrshrun v3.8b, v3.8h, #4 1476 st1 {v3.h}[0], [\dst], \d_strd 1477 st1 {v3.h}[1], [\ds2], \d_strd 1478 b.gt 2b 1479 ret 1480.endif 1481 148240: // 4xN h 1483 AARCH64_VALID_JUMP_TARGET 1484 add \xmx, \xmx, #2 1485 ld1 {v0.s}[0], [\xmx] 1486 sub \src, \src, #1 1487 add \ds2, \dst, \d_strd 1488 add \sr2, \src, \s_strd 1489 lsl \d_strd, \d_strd, #1 1490 lsl \s_strd, \s_strd, #1 1491 sxtl v0.8h, v0.8b 14924: 1493 ld1 {v16.8b}, [\src], \s_strd 1494 ld1 {v20.8b}, [\sr2], \s_strd 1495 uxtl v16.8h, v16.8b 1496 uxtl v20.8h, v20.8b 1497 ext v17.16b, v16.16b, v16.16b, #2 1498 ext v18.16b, v16.16b, v16.16b, #4 1499 ext v19.16b, v16.16b, v16.16b, #6 1500 ext v21.16b, v20.16b, v20.16b, #2 1501 ext v22.16b, v20.16b, v20.16b, #4 1502 ext v23.16b, v20.16b, v20.16b, #6 1503 subs \h, \h, #2 1504 mul v16.4h, v16.4h, v0.h[0] 1505 mla v16.4h, v17.4h, v0.h[1] 1506 mla v16.4h, v18.4h, v0.h[2] 1507 mla v16.4h, v19.4h, v0.h[3] 1508 mul v20.4h, v20.4h, v0.h[0] 1509 mla v20.4h, v21.4h, v0.h[1] 1510 mla v20.4h, v22.4h, v0.h[2] 1511 mla v20.4h, v23.4h, v0.h[3] 1512 srshr v16.4h, v16.4h, #2 1513 srshr v20.4h, v20.4h, #2 1514.ifc \type, put 1515 sqrshrun v16.8b, v16.8h, #4 1516 sqrshrun v20.8b, v20.8h, #4 1517 st1 {v16.s}[0], [\dst], \d_strd 1518 st1 {v20.s}[0], [\ds2], \d_strd 1519.else 1520 st1 {v16.4h}, [\dst], \d_strd 1521 st1 {v20.4h}, [\ds2], \d_strd 1522.endif 1523 b.gt 4b 1524 ret 1525 152680: // 8xN h 1527 AARCH64_VALID_JUMP_TARGET 1528 ld1 {v0.8b}, [\xmx] 1529 sub \src, \src, #3 1530 add \ds2, \dst, \d_strd 1531 add \sr2, \src, \s_strd 1532 lsl \d_strd, \d_strd, #1 1533 lsl \s_strd, \s_strd, #1 1534 sxtl v0.8h, v0.8b 15358: 1536 ld1 {v16.8b, v17.8b}, [\src], \s_strd 1537 ld1 {v20.8b, v21.8b}, [\sr2], \s_strd 1538 uxtl v16.8h, v16.8b 1539 uxtl v17.8h, v17.8b 1540 uxtl v20.8h, v20.8b 1541 uxtl v21.8h, v21.8b 1542 1543.ifc \taps, 6tap 1544 ext v19.16b, v16.16b, v17.16b, #2 1545 ext v23.16b, v20.16b, v21.16b, #2 1546 mul v18.8h, v19.8h, v0.h[1] 1547 mul v22.8h, v23.8h, v0.h[1] 1548.irpc i, 23456 1549 ext v19.16b, v16.16b, v17.16b, #(2*\i) 1550 ext v23.16b, v20.16b, v21.16b, #(2*\i) 1551 mla v18.8h, v19.8h, v0.h[\i] 1552 mla v22.8h, v23.8h, v0.h[\i] 1553.endr 1554.else // 8tap 1555 mul v18.8h, v16.8h, v0.h[0] 1556 mul v22.8h, v20.8h, v0.h[0] 1557.irpc i, 1234567 1558 ext v19.16b, v16.16b, v17.16b, #(2*\i) 1559 ext v23.16b, v20.16b, v21.16b, #(2*\i) 1560 mla v18.8h, v19.8h, v0.h[\i] 1561 mla v22.8h, v23.8h, v0.h[\i] 1562.endr 1563.endif 1564 subs \h, \h, #2 1565 srshr v18.8h, v18.8h, #2 1566 srshr v22.8h, v22.8h, #2 1567.ifc \type, put 1568 sqrshrun v18.8b, v18.8h, #4 1569 sqrshrun v22.8b, v22.8h, #4 1570 st1 {v18.8b}, [\dst], \d_strd 1571 st1 {v22.8b}, [\ds2], \d_strd 1572.else 1573 st1 {v18.8h}, [\dst], \d_strd 1574 st1 {v22.8h}, [\ds2], \d_strd 1575.endif 1576 b.gt 8b 1577 ret 1578160: 1579320: 1580640: 15811280: // 16xN, 32xN, ... h 1582 AARCH64_VALID_JUMP_TARGET 1583 ld1 {v0.8b}, [\xmx] 1584 sub \src, \src, #3 1585 add \ds2, \dst, \d_strd 1586 add \sr2, \src, \s_strd 1587 lsl \s_strd, \s_strd, #1 1588 sxtl v0.8h, v0.8b 1589 1590 sub \s_strd, \s_strd, \w, uxtw 1591 sub \s_strd, \s_strd, #8 1592.ifc \type, put 1593 lsl \d_strd, \d_strd, #1 1594 sub \d_strd, \d_strd, \w, uxtw 1595.endif 1596161: 1597 ld1 {v16.8b, v17.8b, v18.8b}, [\src], #24 1598 ld1 {v20.8b, v21.8b, v22.8b}, [\sr2], #24 1599 mov \mx, \w 1600 uxtl v16.8h, v16.8b 1601 uxtl v17.8h, v17.8b 1602 uxtl v18.8h, v18.8b 1603 uxtl v20.8h, v20.8b 1604 uxtl v21.8h, v21.8b 1605 uxtl v22.8h, v22.8b 1606 160716: 1608.ifc \taps, 6tap 1609 ext v28.16b, v16.16b, v17.16b, #2 1610 ext v29.16b, v17.16b, v18.16b, #2 1611 ext v30.16b, v20.16b, v21.16b, #2 1612 ext v31.16b, v21.16b, v22.16b, #2 1613 mul v24.8h, v28.8h, v0.h[1] 1614 mul v25.8h, v29.8h, v0.h[1] 1615 mul v26.8h, v30.8h, v0.h[1] 1616 mul v27.8h, v31.8h, v0.h[1] 1617.irpc i, 23456 1618 ext v28.16b, v16.16b, v17.16b, #(2*\i) 1619 ext v29.16b, v17.16b, v18.16b, #(2*\i) 1620 ext v30.16b, v20.16b, v21.16b, #(2*\i) 1621 ext v31.16b, v21.16b, v22.16b, #(2*\i) 1622 mla v24.8h, v28.8h, v0.h[\i] 1623 mla v25.8h, v29.8h, v0.h[\i] 1624 mla v26.8h, v30.8h, v0.h[\i] 1625 mla v27.8h, v31.8h, v0.h[\i] 1626.endr 1627.else // 8tap 1628 mul v24.8h, v16.8h, v0.h[0] 1629 mul v25.8h, v17.8h, v0.h[0] 1630 mul v26.8h, v20.8h, v0.h[0] 1631 mul v27.8h, v21.8h, v0.h[0] 1632.irpc i, 1234567 1633 ext v28.16b, v16.16b, v17.16b, #(2*\i) 1634 ext v29.16b, v17.16b, v18.16b, #(2*\i) 1635 ext v30.16b, v20.16b, v21.16b, #(2*\i) 1636 ext v31.16b, v21.16b, v22.16b, #(2*\i) 1637 mla v24.8h, v28.8h, v0.h[\i] 1638 mla v25.8h, v29.8h, v0.h[\i] 1639 mla v26.8h, v30.8h, v0.h[\i] 1640 mla v27.8h, v31.8h, v0.h[\i] 1641.endr 1642.endif 1643 srshr v24.8h, v24.8h, #2 1644 srshr v25.8h, v25.8h, #2 1645 srshr v26.8h, v26.8h, #2 1646 srshr v27.8h, v27.8h, #2 1647 subs \mx, \mx, #16 1648.ifc \type, put 1649 sqrshrun v24.8b, v24.8h, #4 1650 sqrshrun2 v24.16b, v25.8h, #4 1651 sqrshrun v26.8b, v26.8h, #4 1652 sqrshrun2 v26.16b, v27.8h, #4 1653 st1 {v24.16b}, [\dst], #16 1654 st1 {v26.16b}, [\ds2], #16 1655.else 1656 st1 {v24.8h, v25.8h}, [\dst], #32 1657 st1 {v26.8h, v27.8h}, [\ds2], #32 1658.endif 1659 b.le 9f 1660 1661 mov v16.16b, v18.16b 1662 mov v20.16b, v22.16b 1663 ld1 {v17.8b, v18.8b}, [\src], #16 1664 ld1 {v21.8b, v22.8b}, [\sr2], #16 1665 uxtl v17.8h, v17.8b 1666 uxtl v18.8h, v18.8b 1667 uxtl v21.8h, v21.8b 1668 uxtl v22.8h, v22.8b 1669 b 16b 1670 16719: 1672 add \dst, \dst, \d_strd 1673 add \ds2, \ds2, \d_strd 1674 add \src, \src, \s_strd 1675 add \sr2, \sr2, \s_strd 1676 1677 subs \h, \h, #2 1678 b.gt 161b 1679 ret 1680 1681L(\type\()_\taps\()_h_tbl): 1682 .hword L(\type\()_\taps\()_h_tbl) - 1280b 1683 .hword L(\type\()_\taps\()_h_tbl) - 640b 1684 .hword L(\type\()_\taps\()_h_tbl) - 320b 1685 .hword L(\type\()_\taps\()_h_tbl) - 160b 1686 .hword L(\type\()_\taps\()_h_tbl) - 80b 1687 .hword L(\type\()_\taps\()_h_tbl) - 40b 1688 .hword L(\type\()_\taps\()_h_tbl) - 20b 1689 .hword 0 1690 1691 1692L(\type\()_\taps\()_v): 1693 cmp \h, #4 1694 ubfx w9, \my, #7, #7 1695 and \my, \my, #0x7f 1696 b.le 4f 1697 mov \my, w9 16984: 1699 add \xmy, x10, \my, uxtw #3 1700 1701 adr x9, L(\type\()_\taps\()_v_tbl) 1702 ldrh w8, [x9, x8, lsl #1] 1703 sub x9, x9, w8, uxtw 1704 br x9 1705 170620: // 2xN v 1707 AARCH64_VALID_JUMP_TARGET 1708.ifc \type, put 1709 b.gt 28f 1710 1711 cmp \h, #2 1712 add \xmy, \xmy, #2 1713 ld1 {v0.s}[0], [\xmy] 1714 sub \src, \src, \s_strd 1715 add \ds2, \dst, \d_strd 1716 add \sr2, \src, \s_strd 1717 lsl \s_strd, \s_strd, #1 1718 lsl \d_strd, \d_strd, #1 1719 sxtl v0.8h, v0.8b 1720 1721 // 2x2 v 1722 load_h \src, \sr2, \s_strd, v1, v2, v3, v4, v5 1723 interleave_1_h v1, v2, v3, v4, v5 1724 b.gt 24f 1725 uxtl_b v1, v2, v3, v4 1726 mul_mla_4tap v6, v1, v2, v3, v4, .4h 1727 sqrshrun_b 6, v6 1728 st_h \d_strd, v6, 2 1729 ret 1730 173124: // 2x4 v 1732 load_h \sr2, \src, \s_strd, v6, v7 1733 interleave_1_h v5, v6, v7 1734 interleave_2_s v1, v2, v3, v4, v5, v6 1735 uxtl_b v1, v2, v3, v4 1736 mul_mla_4tap v6, v1, v2, v3, v4, .8h 1737 sqrshrun_b 6, v6 1738 st_h \d_strd, v6, 4 1739 ret 1740 174128: // 2x6, 2x8, 2x12, 2x16 v 1742 ld1 {v0.8b}, [\xmy] 1743 sub \sr2, \src, \s_strd, lsl #1 1744 add \ds2, \dst, \d_strd 1745 sub \src, \sr2, \s_strd 1746 lsl \d_strd, \d_strd, #1 1747 lsl \s_strd, \s_strd, #1 1748 sxtl v0.8h, v0.8b 1749 1750 load_h \src, \sr2, \s_strd, v1, v2, v3, v4, v5, v6, v7 1751 interleave_1_h v1, v2, v3, v4, v5 1752 interleave_1_h v5, v6, v7 1753 interleave_2_s v1, v2, v3, v4, v5, v6 1754 uxtl_b v1, v2, v3, v4 1755216: 1756 subs \h, \h, #4 1757 load_h \sr2, \src, \s_strd, v16, v17, v18, v19 1758 interleave_1_h v7, v16, v17, v18, v19 1759 interleave_2_s v5, v6, v7, v16, v17, v18 1760 uxtl_b v5, v6, v7, v16 1761 mul_mla_\taps\()_0 v30, v1, v2, v3, v4, v5, v6, v7, v16 1762 sqrshrun_b 6, v30 1763 st_h \d_strd, v30, 4 1764 b.le 0f 1765 cmp \h, #2 1766 mov v1.16b, v5.16b 1767 mov v2.16b, v6.16b 1768 mov v3.16b, v7.16b 1769 mov v4.16b, v16.16b 1770 mov v5.16b, v17.16b 1771 mov v6.16b, v18.16b 1772 mov v7.16b, v19.16b 1773 b.eq 26f 1774 b 216b 177526: 1776 load_h \sr2, \src, \s_strd, v16, v17 1777 interleave_1_h v7, v16, v17 1778 uxtl_b v5, v6, v7, v16 1779 mul_mla_\taps\()_0_4h v30, v1, v2, v3, v4, v5, v6, v7, v16 1780 sqrshrun_b 6, v30 1781 st_h \d_strd, v30, 2 17820: 1783 ret 1784.endif 1785 178640: 1787 AARCH64_VALID_JUMP_TARGET 1788 b.gt 480f 1789 1790 // 4x2, 4x4 v 1791 cmp \h, #2 1792 add \xmy, \xmy, #2 1793 ld1 {v0.s}[0], [\xmy] 1794 sub \src, \src, \s_strd 1795 add \ds2, \dst, \d_strd 1796 add \sr2, \src, \s_strd 1797 lsl \s_strd, \s_strd, #1 1798 lsl \d_strd, \d_strd, #1 1799 sxtl v0.8h, v0.8b 1800 1801 load_s \src, \sr2, \s_strd, v1, v2, v3, v4, v5 1802 interleave_1_s v1, v2, v3, v4, v5 1803 uxtl_b v1, v2, v3, v4 1804 mul_mla_4tap v6, v1, v2, v3, v4, .8h 1805 shift_store_4 \type, \d_strd, v6 1806 b.le 0f 1807 load_s \sr2, \src, \s_strd, v6, v7 1808 interleave_1_s v5, v6, v7 1809 uxtl_b v5, v6 1810 mul_mla_4tap v7, v3, v4, v5, v6, .8h 1811 shift_store_4 \type, \d_strd, v7 18120: 1813 ret 1814 1815480: // 4x6, 4x8, 4x12, 4x16 v 1816 ld1 {v0.8b}, [\xmy] 1817 sub \sr2, \src, \s_strd, lsl #1 1818 add \ds2, \dst, \d_strd 1819 sub \src, \sr2, \s_strd 1820 lsl \s_strd, \s_strd, #1 1821 lsl \d_strd, \d_strd, #1 1822 sxtl v0.8h, v0.8b 1823 1824 load_s \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22 1825 interleave_1_s v16, v17, v18 1826 interleave_1_s v18, v19, v20, v21, v22 1827 uxtl_b v16, v17 1828 uxtl_b v18, v19, v20, v21 1829 183048: 1831 subs \h, \h, #4 1832 load_s \sr2, \src, \s_strd, v23, v24, v25, v26 1833 interleave_1_s v22, v23, v24, v25, v26 1834 uxtl_b v22, v23, v24, v25 1835 mul_mla_\taps\()_2 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25 1836 shift_store_4 \type, \d_strd, v1, v2 1837 b.le 0f 1838 load_s \sr2, \src, \s_strd, v27, v16 1839 subs \h, \h, #2 1840 interleave_1_s v26, v27, v16 1841 uxtl_b v26, v27 1842 mul_mla_\taps\()_0 v1, v20, v21, v22, v23, v24, v25, v26, v27 1843 shift_store_4 \type, \d_strd, v1 1844 b.le 0f 1845 load_s \sr2, \src, \s_strd, v17, v18 1846 subs \h, \h, #2 1847 interleave_1_s v16, v17, v18 1848 uxtl_b v16, v17 1849 mul_mla_\taps\()_0 v2, v22, v23, v24, v25, v26, v27, v16, v17 1850 shift_store_4 \type, \d_strd, v2 1851 b.le 0f 1852 subs \h, \h, #4 1853 load_s \sr2, \src, \s_strd, v19, v20, v21, v22 1854 interleave_1_s v18, v19, v20, v21, v22 1855 uxtl_b v18, v19, v20, v21 1856 mul_mla_\taps\()_2 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20, v21 1857 shift_store_4 \type, \d_strd, v1, v2 1858 b.gt 48b 18590: 1860 ret 1861 186280: 1863 AARCH64_VALID_JUMP_TARGET 1864 b.gt 880f 1865 1866 // 8x2, 8x4 v 1867 cmp \h, #2 1868 add \xmy, \xmy, #2 1869 ld1 {v0.s}[0], [\xmy] 1870 sub \src, \src, \s_strd 1871 add \ds2, \dst, \d_strd 1872 add \sr2, \src, \s_strd 1873 lsl \s_strd, \s_strd, #1 1874 lsl \d_strd, \d_strd, #1 1875 sxtl v0.8h, v0.8b 1876 1877 load_8b \src, \sr2, \s_strd, v1, v2, v3, v4, v5 1878 uxtl_b v1, v2, v3, v4, v5 1879 mul_mla_4tap v6, v1, v2, v3, v4, .8h 1880 mul_mla_4tap v7, v2, v3, v4, v5, .8h 1881 shift_store_8 \type, \d_strd, v6, v7 1882 b.le 0f 1883 load_8b \sr2, \src, \s_strd, v6, v7 1884 uxtl_b v6, v7 1885 mul_mla_4tap v1, v3, v4, v5, v6, .8h 1886 mul_mla_4tap v2, v4, v5, v6, v7, .8h 1887 shift_store_8 \type, \d_strd, v1, v2 18880: 1889 ret 1890 1891880: // 8x6, 8x8, 8x16, 8x32 v 18921680: // 16x8, 16x16, ... 1893320: // 32x8, 32x16, ... 1894640: 18951280: 1896 AARCH64_VALID_JUMP_TARGET 1897 ld1 {v0.8b}, [\xmy] 1898 sub \src, \src, \s_strd 1899 sub \src, \src, \s_strd, lsl #1 1900 sxtl v0.8h, v0.8b 1901 mov \my, \h 1902168: 1903 add \ds2, \dst, \d_strd 1904 add \sr2, \src, \s_strd 1905 lsl \s_strd, \s_strd, #1 1906 lsl \d_strd, \d_strd, #1 1907 1908 load_8b \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22 1909 uxtl_b v16, v17, v18, v19, v20, v21, v22 1910 191188: 1912 subs \h, \h, #2 1913 load_8b \sr2, \src, \s_strd, v23, v24 1914 uxtl_b v23, v24 1915 mul_mla_\taps\()_1 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24 1916 shift_store_8 \type, \d_strd, v1, v2 1917 b.le 9f 1918 subs \h, \h, #2 1919 load_8b \sr2, \src, \s_strd, v25, v26 1920 uxtl_b v25, v26 1921 mul_mla_\taps\()_1 v3, v4, v18, v19, v20, v21, v22, v23, v24, v25, v26 1922 shift_store_8 \type, \d_strd, v3, v4 1923 b.le 9f 1924 subs \h, \h, #2 1925 load_8b \sr2, \src, \s_strd, v27, v16 1926 uxtl_b v27, v16 1927 mul_mla_\taps\()_1 v1, v2, v20, v21, v22, v23, v24, v25, v26, v27, v16 1928 shift_store_8 \type, \d_strd, v1, v2 1929 b.le 9f 1930 subs \h, \h, #2 1931 load_8b \sr2, \src, \s_strd, v17, v18 1932 uxtl_b v17, v18 1933 mul_mla_\taps\()_1 v3, v4, v22, v23, v24, v25, v26, v27, v16, v17, v18 1934 shift_store_8 \type, \d_strd, v3, v4 1935 b.le 9f 1936 subs \h, \h, #4 1937 load_8b \sr2, \src, \s_strd, v19, v20, v21, v22 1938 uxtl_b v19, v20, v21, v22 1939 mul_mla_\taps\()_1 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20 1940 mul_mla_\taps\()_1 v3, v4, v26, v27, v16, v17, v18, v19, v20, v21, v22 1941 shift_store_8 \type, \d_strd, v1, v2, v3, v4 1942 b.gt 88b 19439: 1944 subs \w, \w, #8 1945 b.le 0f 1946 asr \s_strd, \s_strd, #1 1947 asr \d_strd, \d_strd, #1 1948 msub \src, \s_strd, \xmy, \src 1949 msub \dst, \d_strd, \xmy, \dst 1950 sub \src, \src, \s_strd, lsl #3 1951 mov \h, \my 1952 add \src, \src, #8 1953.ifc \type, put 1954 add \dst, \dst, #8 1955.else 1956 add \dst, \dst, #16 1957.endif 1958 b 168b 19590: 1960 ret 1961 1962160: 1963 AARCH64_VALID_JUMP_TARGET 1964 b.gt 1680b 1965 1966 // 16x2, 16x4 v 1967 add \xmy, \xmy, #2 1968 ld1 {v0.s}[0], [\xmy] 1969 sub \src, \src, \s_strd 1970 add \ds2, \dst, \d_strd 1971 add \sr2, \src, \s_strd 1972 lsl \s_strd, \s_strd, #1 1973 lsl \d_strd, \d_strd, #1 1974 sxtl v0.8h, v0.8b 1975 1976 cmp \h, #2 1977 load_16b \src, \sr2, \s_strd, v1, v2, v3, v4, v5 1978 uxtl v16.8h, v1.8b 1979 uxtl v17.8h, v2.8b 1980 uxtl v18.8h, v3.8b 1981 uxtl v19.8h, v4.8b 1982 uxtl v20.8h, v5.8b 1983 uxtl2 v23.8h, v1.16b 1984 uxtl2 v24.8h, v2.16b 1985 uxtl2 v25.8h, v3.16b 1986 uxtl2 v26.8h, v4.16b 1987 uxtl2 v27.8h, v5.16b 1988 mul_mla_4tap v1, v16, v17, v18, v19, .8h 1989 mul_mla_4tap v16, v17, v18, v19, v20, .8h 1990 mul_mla_4tap v2, v23, v24, v25, v26, .8h 1991 mul_mla_4tap v17, v24, v25, v26, v27, .8h 1992 shift_store_16 \type, \d_strd, v1, v2, v16, v17 1993 b.le 0f 1994 load_16b \sr2, \src, \s_strd, v6, v7 1995 uxtl v21.8h, v6.8b 1996 uxtl v22.8h, v7.8b 1997 uxtl2 v28.8h, v6.16b 1998 uxtl2 v29.8h, v7.16b 1999 mul_mla_4tap v1, v18, v19, v20, v21, .8h 2000 mul_mla_4tap v3, v19, v20, v21, v22, .8h 2001 mul_mla_4tap v2, v25, v26, v27, v28, .8h 2002 mul_mla_4tap v4, v26, v27, v28, v29, .8h 2003 shift_store_16 \type, \d_strd, v1, v2, v3, v4 20040: 2005 ret 2006 2007L(\type\()_\taps\()_v_tbl): 2008 .hword L(\type\()_\taps\()_v_tbl) - 1280b 2009 .hword L(\type\()_\taps\()_v_tbl) - 640b 2010 .hword L(\type\()_\taps\()_v_tbl) - 320b 2011 .hword L(\type\()_\taps\()_v_tbl) - 160b 2012 .hword L(\type\()_\taps\()_v_tbl) - 80b 2013 .hword L(\type\()_\taps\()_v_tbl) - 40b 2014 .hword L(\type\()_\taps\()_v_tbl) - 20b 2015 .hword 0 2016 2017L(\type\()_\taps\()_hv): 2018 cmp \h, #4 2019 ubfx w9, \my, #7, #7 2020 and \my, \my, #0x7f 2021 b.le 4f 2022 mov \my, w9 20234: 2024 add \xmy, x10, \my, uxtw #3 2025 2026 adr x9, L(\type\()_\taps\()_hv_tbl) 2027 ldrh w8, [x9, x8, lsl #1] 2028 sub x9, x9, w8, uxtw 2029 br x9 2030 203120: 2032 AARCH64_VALID_JUMP_TARGET 2033.ifc \type, put 2034 add \xmx, \xmx, #2 2035 ld1 {v0.s}[0], [\xmx] 2036 b.gt 280f 2037 add \xmy, \xmy, #2 2038 ld1 {v1.s}[0], [\xmy] 2039 2040 // 2x2, 2x4 hv 2041 sub \sr2, \src, #1 2042 sub \src, \sr2, \s_strd 2043 add \ds2, \dst, \d_strd 2044 lsl \s_strd, \s_strd, #1 2045 lsl \d_strd, \d_strd, #1 2046 sxtl v0.8h, v0.8b 2047 sxtl v1.8h, v1.8b 2048 mov x15, x30 2049 2050 ld1 {v28.8b}, [\src], \s_strd 2051 uxtl v28.8h, v28.8b 2052 ext v29.16b, v28.16b, v28.16b, #2 2053 mul v28.4h, v28.4h, v0.4h 2054 mul v29.4h, v29.4h, v0.4h 2055 addp v28.4h, v28.4h, v29.4h 2056 addp v16.4h, v28.4h, v28.4h 2057 srshr v16.4h, v16.4h, #2 2058 bl L(\type\()_\taps\()_filter_2) 2059 2060 trn1 v16.2s, v16.2s, v28.2s 2061 mov v17.8b, v28.8b 2062 20632: 2064 bl L(\type\()_\taps\()_filter_2) 2065 2066 ext v18.8b, v17.8b, v28.8b, #4 2067 smull v2.4s, v16.4h, v1.h[0] 2068 smlal v2.4s, v17.4h, v1.h[1] 2069 smlal v2.4s, v18.4h, v1.h[2] 2070 smlal v2.4s, v28.4h, v1.h[3] 2071 2072 sqrshrn v2.4h, v2.4s, #\shift_hv 2073 sqxtun v2.8b, v2.8h 2074 subs \h, \h, #2 2075 st1 {v2.h}[0], [\dst], \d_strd 2076 st1 {v2.h}[1], [\ds2], \d_strd 2077 b.le 0f 2078 mov v16.8b, v18.8b 2079 mov v17.8b, v28.8b 2080 b 2b 2081 2082280: // 2x8, 2x16, 2x32 hv 2083 ld1 {v1.8b}, [\xmy] 2084 sub \src, \src, #1 2085 sub \sr2, \src, \s_strd, lsl #1 2086 sub \src, \sr2, \s_strd 2087 add \ds2, \dst, \d_strd 2088 lsl \s_strd, \s_strd, #1 2089 lsl \d_strd, \d_strd, #1 2090 sxtl v0.8h, v0.8b 2091 sxtl v1.8h, v1.8b 2092 mov x15, x30 2093 2094 ld1 {v28.8b}, [\src], \s_strd 2095 uxtl v28.8h, v28.8b 2096 ext v29.16b, v28.16b, v28.16b, #2 2097 mul v28.4h, v28.4h, v0.4h 2098 mul v29.4h, v29.4h, v0.4h 2099 addp v28.4h, v28.4h, v29.4h 2100 addp v16.4h, v28.4h, v28.4h 2101 srshr v16.4h, v16.4h, #2 2102 2103 bl L(\type\()_\taps\()_filter_2) 2104 trn1 v16.2s, v16.2s, v28.2s 2105 mov v17.8b, v28.8b 2106 bl L(\type\()_\taps\()_filter_2) 2107 ext v18.8b, v17.8b, v28.8b, #4 2108 mov v19.8b, v28.8b 2109 bl L(\type\()_\taps\()_filter_2) 2110 ext v20.8b, v19.8b, v28.8b, #4 2111 mov v21.8b, v28.8b 2112 211328: 2114 bl L(\type\()_\taps\()_filter_2) 2115 ext v22.8b, v21.8b, v28.8b, #4 2116.ifc \taps, 6tap 2117 smull v2.4s, v17.4h, v1.h[1] 2118 smlal v2.4s, v18.4h, v1.h[2] 2119 smlal v2.4s, v19.4h, v1.h[3] 2120 smlal v2.4s, v20.4h, v1.h[4] 2121 smlal v2.4s, v21.4h, v1.h[5] 2122 smlal v2.4s, v22.4h, v1.h[6] 2123.else // 8tap 2124 smull v2.4s, v16.4h, v1.h[0] 2125 smlal v2.4s, v17.4h, v1.h[1] 2126 smlal v2.4s, v18.4h, v1.h[2] 2127 smlal v2.4s, v19.4h, v1.h[3] 2128 smlal v2.4s, v20.4h, v1.h[4] 2129 smlal v2.4s, v21.4h, v1.h[5] 2130 smlal v2.4s, v22.4h, v1.h[6] 2131 smlal v2.4s, v28.4h, v1.h[7] 2132.endif 2133 2134 sqrshrn v2.4h, v2.4s, #\shift_hv 2135 sqxtun v2.8b, v2.8h 2136 subs \h, \h, #2 2137 st1 {v2.h}[0], [\dst], \d_strd 2138 st1 {v2.h}[1], [\ds2], \d_strd 2139 b.le 0f 2140 mov v16.8b, v18.8b 2141 mov v17.8b, v19.8b 2142 mov v18.8b, v20.8b 2143 mov v19.8b, v21.8b 2144 mov v20.8b, v22.8b 2145 mov v21.8b, v28.8b 2146 b 28b 2147 21480: 2149 ret x15 2150 2151L(\type\()_\taps\()_filter_2): 2152 ld1 {v28.8b}, [\sr2], \s_strd 2153 ld1 {v30.8b}, [\src], \s_strd 2154 uxtl v28.8h, v28.8b 2155 uxtl v30.8h, v30.8b 2156 ext v29.16b, v28.16b, v28.16b, #2 2157 ext v31.16b, v30.16b, v30.16b, #2 2158 trn1 v27.2s, v28.2s, v30.2s 2159 trn2 v30.2s, v28.2s, v30.2s 2160 trn1 v28.2s, v29.2s, v31.2s 2161 trn2 v31.2s, v29.2s, v31.2s 2162 mul v27.4h, v27.4h, v0.h[0] 2163 mla v27.4h, v28.4h, v0.h[1] 2164 mla v27.4h, v30.4h, v0.h[2] 2165 mla v27.4h, v31.4h, v0.h[3] 2166 srshr v28.4h, v27.4h, #2 2167 ret 2168.endif 2169 217040: 2171 AARCH64_VALID_JUMP_TARGET 2172 add \xmx, \xmx, #2 2173 ld1 {v0.s}[0], [\xmx] 2174 b.gt 480f 2175 add \xmy, \xmy, #2 2176 ld1 {v1.s}[0], [\xmy] 2177 sub \sr2, \src, #1 2178 sub \src, \sr2, \s_strd 2179 add \ds2, \dst, \d_strd 2180 lsl \s_strd, \s_strd, #1 2181 lsl \d_strd, \d_strd, #1 2182 sxtl v0.8h, v0.8b 2183 sxtl v1.8h, v1.8b 2184 mov x15, x30 2185 2186 // 4x2, 4x4 hv 2187 ld1 {v26.8b}, [\src], \s_strd 2188 uxtl v26.8h, v26.8b 2189 ext v28.16b, v26.16b, v26.16b, #2 2190 ext v29.16b, v26.16b, v26.16b, #4 2191 ext v30.16b, v26.16b, v26.16b, #6 2192 mul v31.4h, v26.4h, v0.h[0] 2193 mla v31.4h, v28.4h, v0.h[1] 2194 mla v31.4h, v29.4h, v0.h[2] 2195 mla v31.4h, v30.4h, v0.h[3] 2196 srshr v16.4h, v31.4h, #2 2197 2198 bl L(\type\()_\taps\()_filter_4) 2199 mov v17.8b, v28.8b 2200 mov v18.8b, v29.8b 2201 22024: 2203 bl L(\type\()_\taps\()_filter_4) 2204 // Interleaving the mul/mla chains actually hurts performance 2205 // significantly on Cortex A53, thus keeping mul/mla tightly 2206 // chained like this. 2207 smull v2.4s, v16.4h, v1.h[0] 2208 smlal v2.4s, v17.4h, v1.h[1] 2209 smlal v2.4s, v18.4h, v1.h[2] 2210 smlal v2.4s, v28.4h, v1.h[3] 2211 smull v3.4s, v17.4h, v1.h[0] 2212 smlal v3.4s, v18.4h, v1.h[1] 2213 smlal v3.4s, v28.4h, v1.h[2] 2214 smlal v3.4s, v29.4h, v1.h[3] 2215 sqrshrn v2.4h, v2.4s, #\shift_hv 2216 sqrshrn v3.4h, v3.4s, #\shift_hv 2217 subs \h, \h, #2 2218.ifc \type, put 2219 sqxtun v2.8b, v2.8h 2220 sqxtun v3.8b, v3.8h 2221 st1 {v2.s}[0], [\dst], \d_strd 2222 st1 {v3.s}[0], [\ds2], \d_strd 2223.else 2224 st1 {v2.4h}, [\dst], \d_strd 2225 st1 {v3.4h}, [\ds2], \d_strd 2226.endif 2227 b.le 0f 2228 mov v16.8b, v18.8b 2229 mov v17.8b, v28.8b 2230 mov v18.8b, v29.8b 2231 b 4b 2232 2233480: // 4x8, 4x16, 4x32 hv 2234 ld1 {v1.8b}, [\xmy] 2235 sub \src, \src, #1 2236.ifc \taps, 6tap 2237 sub \sr2, \src, \s_strd 2238 sub \src, \src, \s_strd, lsl #1 2239.else 2240 sub \sr2, \src, \s_strd, lsl #1 2241 sub \src, \sr2, \s_strd 2242.endif 2243 add \ds2, \dst, \d_strd 2244 lsl \s_strd, \s_strd, #1 2245 lsl \d_strd, \d_strd, #1 2246 sxtl v0.8h, v0.8b 2247 sxtl v1.8h, v1.8b 2248 mov x15, x30 2249 2250 ld1 {v26.8b}, [\src], \s_strd 2251 uxtl v26.8h, v26.8b 2252 ext v28.16b, v26.16b, v26.16b, #2 2253 ext v29.16b, v26.16b, v26.16b, #4 2254 ext v30.16b, v26.16b, v26.16b, #6 2255 mul v31.4h, v26.4h, v0.h[0] 2256 mla v31.4h, v28.4h, v0.h[1] 2257 mla v31.4h, v29.4h, v0.h[2] 2258 mla v31.4h, v30.4h, v0.h[3] 2259.ifc \taps, 6tap 2260 srshr v18.4h, v31.4h, #2 2261.else 2262 srshr v16.4h, v31.4h, #2 2263 2264 bl L(\type\()_\taps\()_filter_4) 2265 mov v17.8b, v28.8b 2266 mov v18.8b, v29.8b 2267.endif 2268 bl L(\type\()_\taps\()_filter_4) 2269 mov v19.8b, v28.8b 2270 mov v20.8b, v29.8b 2271 bl L(\type\()_\taps\()_filter_4) 2272 mov v21.8b, v28.8b 2273 mov v22.8b, v29.8b 2274 227548: 2276 bl L(\type\()_\taps\()_filter_4) 2277.ifc \taps, 6tap 2278 smull v2.4s, v18.4h, v1.h[1] 2279 smlal v2.4s, v19.4h, v1.h[2] 2280 smlal v2.4s, v20.4h, v1.h[3] 2281 smlal v2.4s, v21.4h, v1.h[4] 2282 smlal v2.4s, v22.4h, v1.h[5] 2283 smlal v2.4s, v28.4h, v1.h[6] 2284 smull v3.4s, v19.4h, v1.h[1] 2285 smlal v3.4s, v20.4h, v1.h[2] 2286 smlal v3.4s, v21.4h, v1.h[3] 2287 smlal v3.4s, v22.4h, v1.h[4] 2288 smlal v3.4s, v28.4h, v1.h[5] 2289 smlal v3.4s, v29.4h, v1.h[6] 2290.else // 8tap 2291 smull v2.4s, v16.4h, v1.h[0] 2292 smlal v2.4s, v17.4h, v1.h[1] 2293 smlal v2.4s, v18.4h, v1.h[2] 2294 smlal v2.4s, v19.4h, v1.h[3] 2295 smlal v2.4s, v20.4h, v1.h[4] 2296 smlal v2.4s, v21.4h, v1.h[5] 2297 smlal v2.4s, v22.4h, v1.h[6] 2298 smlal v2.4s, v28.4h, v1.h[7] 2299 smull v3.4s, v17.4h, v1.h[0] 2300 smlal v3.4s, v18.4h, v1.h[1] 2301 smlal v3.4s, v19.4h, v1.h[2] 2302 smlal v3.4s, v20.4h, v1.h[3] 2303 smlal v3.4s, v21.4h, v1.h[4] 2304 smlal v3.4s, v22.4h, v1.h[5] 2305 smlal v3.4s, v28.4h, v1.h[6] 2306 smlal v3.4s, v29.4h, v1.h[7] 2307.endif 2308 sqrshrn v2.4h, v2.4s, #\shift_hv 2309 sqrshrn v3.4h, v3.4s, #\shift_hv 2310 subs \h, \h, #2 2311.ifc \type, put 2312 sqxtun v2.8b, v2.8h 2313 sqxtun v3.8b, v3.8h 2314 st1 {v2.s}[0], [\dst], \d_strd 2315 st1 {v3.s}[0], [\ds2], \d_strd 2316.else 2317 st1 {v2.4h}, [\dst], \d_strd 2318 st1 {v3.4h}, [\ds2], \d_strd 2319.endif 2320 b.le 0f 2321.ifc \taps, 8tap 2322 mov v16.8b, v18.8b 2323 mov v17.8b, v19.8b 2324.endif 2325 mov v18.8b, v20.8b 2326 mov v19.8b, v21.8b 2327 mov v20.8b, v22.8b 2328 mov v21.8b, v28.8b 2329 mov v22.8b, v29.8b 2330 b 48b 23310: 2332 ret x15 2333 2334L(\type\()_\taps\()_filter_4): 2335 ld1 {v26.8b}, [\sr2], \s_strd 2336 ld1 {v27.8b}, [\src], \s_strd 2337 uxtl v26.8h, v26.8b 2338 uxtl v27.8h, v27.8b 2339 ext v28.16b, v26.16b, v26.16b, #2 2340 ext v29.16b, v26.16b, v26.16b, #4 2341 ext v30.16b, v26.16b, v26.16b, #6 2342 mul v31.4h, v26.4h, v0.h[0] 2343 mla v31.4h, v28.4h, v0.h[1] 2344 mla v31.4h, v29.4h, v0.h[2] 2345 mla v31.4h, v30.4h, v0.h[3] 2346 ext v28.16b, v27.16b, v27.16b, #2 2347 ext v29.16b, v27.16b, v27.16b, #4 2348 ext v30.16b, v27.16b, v27.16b, #6 2349 mul v27.4h, v27.4h, v0.h[0] 2350 mla v27.4h, v28.4h, v0.h[1] 2351 mla v27.4h, v29.4h, v0.h[2] 2352 mla v27.4h, v30.4h, v0.h[3] 2353 srshr v28.4h, v31.4h, #2 2354 srshr v29.4h, v27.4h, #2 2355 ret 2356 235780: 2358160: 2359320: 2360 AARCH64_VALID_JUMP_TARGET 2361 b.gt 880f 2362 add \xmy, \xmy, #2 2363 ld1 {v0.8b}, [\xmx] 2364 ld1 {v1.s}[0], [\xmy] 2365 sub \src, \src, #3 2366 sub \src, \src, \s_strd 2367 sxtl v0.8h, v0.8b 2368 sxtl v1.8h, v1.8b 2369 mov x15, x30 2370 mov \my, \h 2371 2372164: // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv 2373 add \ds2, \dst, \d_strd 2374 add \sr2, \src, \s_strd 2375 lsl \d_strd, \d_strd, #1 2376 lsl \s_strd, \s_strd, #1 2377 2378 bl L(\type\()_\taps\()_filter_8_first) 2379 bl L(\type\()_\taps\()_filter_8) 2380 mov v17.16b, v24.16b 2381 mov v18.16b, v25.16b 2382 23838: 2384 smull v2.4s, v16.4h, v1.h[0] 2385 smull2 v3.4s, v16.8h, v1.h[0] 2386 bl L(\type\()_\taps\()_filter_8) 2387 smull v4.4s, v17.4h, v1.h[0] 2388 smull2 v5.4s, v17.8h, v1.h[0] 2389 smlal v2.4s, v17.4h, v1.h[1] 2390 smlal2 v3.4s, v17.8h, v1.h[1] 2391 smlal v4.4s, v18.4h, v1.h[1] 2392 smlal2 v5.4s, v18.8h, v1.h[1] 2393 smlal v2.4s, v18.4h, v1.h[2] 2394 smlal2 v3.4s, v18.8h, v1.h[2] 2395 smlal v4.4s, v24.4h, v1.h[2] 2396 smlal2 v5.4s, v24.8h, v1.h[2] 2397 smlal v2.4s, v24.4h, v1.h[3] 2398 smlal2 v3.4s, v24.8h, v1.h[3] 2399 smlal v4.4s, v25.4h, v1.h[3] 2400 smlal2 v5.4s, v25.8h, v1.h[3] 2401 sqrshrn v2.4h, v2.4s, #\shift_hv 2402 sqrshrn2 v2.8h, v3.4s, #\shift_hv 2403 sqrshrn v4.4h, v4.4s, #\shift_hv 2404 sqrshrn2 v4.8h, v5.4s, #\shift_hv 2405 subs \h, \h, #2 2406.ifc \type, put 2407 sqxtun v2.8b, v2.8h 2408 sqxtun v4.8b, v4.8h 2409 st1 {v2.8b}, [\dst], \d_strd 2410 st1 {v4.8b}, [\ds2], \d_strd 2411.else 2412 st1 {v2.8h}, [\dst], \d_strd 2413 st1 {v4.8h}, [\ds2], \d_strd 2414.endif 2415 b.le 9f 2416 mov v16.16b, v18.16b 2417 mov v17.16b, v24.16b 2418 mov v18.16b, v25.16b 2419 b 8b 24209: 2421 subs \w, \w, #8 2422 b.le 0f 2423 asr \s_strd, \s_strd, #1 2424 asr \d_strd, \d_strd, #1 2425 msub \src, \s_strd, \xmy, \src 2426 msub \dst, \d_strd, \xmy, \dst 2427 sub \src, \src, \s_strd, lsl #2 2428 mov \h, \my 2429 add \src, \src, #8 2430.ifc \type, put 2431 add \dst, \dst, #8 2432.else 2433 add \dst, \dst, #16 2434.endif 2435 b 164b 2436 2437880: // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv 2438640: 24391280: 2440 AARCH64_VALID_JUMP_TARGET 2441 ld1 {v0.8b}, [\xmx] 2442 ld1 {v1.8b}, [\xmy] 2443 sub \src, \src, #3 2444.ifc \taps, 8tap 2445 sub \src, \src, \s_strd 2446.endif 2447 sub \src, \src, \s_strd, lsl #1 2448 sxtl v0.8h, v0.8b 2449 sxtl v1.8h, v1.8b 2450 mov x15, x30 2451 mov \my, \h 2452 2453168: 2454 add \ds2, \dst, \d_strd 2455 add \sr2, \src, \s_strd 2456 lsl \d_strd, \d_strd, #1 2457 lsl \s_strd, \s_strd, #1 2458 2459 bl L(\type\()_\taps\()_filter_8_first) 2460.ifc \taps, 6tap 2461 mov v18.16b, v16.16b 2462.else 2463 bl L(\type\()_\taps\()_filter_8) 2464 mov v17.16b, v24.16b 2465 mov v18.16b, v25.16b 2466.endif 2467 bl L(\type\()_\taps\()_filter_8) 2468 mov v19.16b, v24.16b 2469 mov v20.16b, v25.16b 2470 bl L(\type\()_\taps\()_filter_8) 2471 mov v21.16b, v24.16b 2472 mov v22.16b, v25.16b 2473 247488: 2475.ifc \taps, 6tap 2476 smull v2.4s, v18.4h, v1.h[1] 2477 smull2 v3.4s, v18.8h, v1.h[1] 2478 bl L(\type\()_\taps\()_filter_8) 2479 smull v4.4s, v19.4h, v1.h[1] 2480 smull2 v5.4s, v19.8h, v1.h[1] 2481 smlal v2.4s, v19.4h, v1.h[2] 2482 smlal2 v3.4s, v19.8h, v1.h[2] 2483 smlal v4.4s, v20.4h, v1.h[2] 2484 smlal2 v5.4s, v20.8h, v1.h[2] 2485 smlal v2.4s, v20.4h, v1.h[3] 2486 smlal2 v3.4s, v20.8h, v1.h[3] 2487 smlal v4.4s, v21.4h, v1.h[3] 2488 smlal2 v5.4s, v21.8h, v1.h[3] 2489 smlal v2.4s, v21.4h, v1.h[4] 2490 smlal2 v3.4s, v21.8h, v1.h[4] 2491 smlal v4.4s, v22.4h, v1.h[4] 2492 smlal2 v5.4s, v22.8h, v1.h[4] 2493 smlal v2.4s, v22.4h, v1.h[5] 2494 smlal2 v3.4s, v22.8h, v1.h[5] 2495 smlal v4.4s, v24.4h, v1.h[5] 2496 smlal2 v5.4s, v24.8h, v1.h[5] 2497 smlal v2.4s, v24.4h, v1.h[6] 2498 smlal2 v3.4s, v24.8h, v1.h[6] 2499 smlal v4.4s, v25.4h, v1.h[6] 2500 smlal2 v5.4s, v25.8h, v1.h[6] 2501.else // 8tap 2502 smull v2.4s, v16.4h, v1.h[0] 2503 smull2 v3.4s, v16.8h, v1.h[0] 2504 bl L(\type\()_\taps\()_filter_8) 2505 smull v4.4s, v17.4h, v1.h[0] 2506 smull2 v5.4s, v17.8h, v1.h[0] 2507 smlal v2.4s, v17.4h, v1.h[1] 2508 smlal2 v3.4s, v17.8h, v1.h[1] 2509 smlal v4.4s, v18.4h, v1.h[1] 2510 smlal2 v5.4s, v18.8h, v1.h[1] 2511 smlal v2.4s, v18.4h, v1.h[2] 2512 smlal2 v3.4s, v18.8h, v1.h[2] 2513 smlal v4.4s, v19.4h, v1.h[2] 2514 smlal2 v5.4s, v19.8h, v1.h[2] 2515 smlal v2.4s, v19.4h, v1.h[3] 2516 smlal2 v3.4s, v19.8h, v1.h[3] 2517 smlal v4.4s, v20.4h, v1.h[3] 2518 smlal2 v5.4s, v20.8h, v1.h[3] 2519 smlal v2.4s, v20.4h, v1.h[4] 2520 smlal2 v3.4s, v20.8h, v1.h[4] 2521 smlal v4.4s, v21.4h, v1.h[4] 2522 smlal2 v5.4s, v21.8h, v1.h[4] 2523 smlal v2.4s, v21.4h, v1.h[5] 2524 smlal2 v3.4s, v21.8h, v1.h[5] 2525 smlal v4.4s, v22.4h, v1.h[5] 2526 smlal2 v5.4s, v22.8h, v1.h[5] 2527 smlal v2.4s, v22.4h, v1.h[6] 2528 smlal2 v3.4s, v22.8h, v1.h[6] 2529 smlal v4.4s, v24.4h, v1.h[6] 2530 smlal2 v5.4s, v24.8h, v1.h[6] 2531 smlal v2.4s, v24.4h, v1.h[7] 2532 smlal2 v3.4s, v24.8h, v1.h[7] 2533 smlal v4.4s, v25.4h, v1.h[7] 2534 smlal2 v5.4s, v25.8h, v1.h[7] 2535.endif 2536 sqrshrn v2.4h, v2.4s, #\shift_hv 2537 sqrshrn2 v2.8h, v3.4s, #\shift_hv 2538 sqrshrn v4.4h, v4.4s, #\shift_hv 2539 sqrshrn2 v4.8h, v5.4s, #\shift_hv 2540 subs \h, \h, #2 2541.ifc \type, put 2542 sqxtun v2.8b, v2.8h 2543 sqxtun v4.8b, v4.8h 2544 st1 {v2.8b}, [\dst], \d_strd 2545 st1 {v4.8b}, [\ds2], \d_strd 2546.else 2547 st1 {v2.8h}, [\dst], \d_strd 2548 st1 {v4.8h}, [\ds2], \d_strd 2549.endif 2550 b.le 9f 2551.ifc \taps, 8tap 2552 mov v16.16b, v18.16b 2553 mov v17.16b, v19.16b 2554.endif 2555 mov v18.16b, v20.16b 2556 mov v19.16b, v21.16b 2557 mov v20.16b, v22.16b 2558 mov v21.16b, v24.16b 2559 mov v22.16b, v25.16b 2560 b 88b 25619: 2562 subs \w, \w, #8 2563 b.le 0f 2564 asr \s_strd, \s_strd, #1 2565 asr \d_strd, \d_strd, #1 2566 msub \src, \s_strd, \xmy, \src 2567 msub \dst, \d_strd, \xmy, \dst 2568 sub \src, \src, \s_strd, lsl #3 2569 mov \h, \my 2570 add \src, \src, #8 2571.ifc \type, put 2572 add \dst, \dst, #8 2573.else 2574 add \dst, \dst, #16 2575.endif 2576.ifc \taps, 6tap 2577 add \src, \src, \s_strd, lsl #1 2578.endif 2579 b 168b 25800: 2581 ret x15 2582 2583L(\type\()_\taps\()_filter_8_first): 2584 ld1 {v28.8b, v29.8b}, [\src], \s_strd 2585 uxtl v28.8h, v28.8b 2586 uxtl v29.8h, v29.8b 2587.ifc \taps, 6tap 2588 ext v24.16b, v28.16b, v29.16b, #(2*1) 2589 ext v25.16b, v28.16b, v29.16b, #(2*2) 2590 ext v26.16b, v28.16b, v29.16b, #(2*3) 2591 ext v27.16b, v28.16b, v29.16b, #(2*4) 2592 mul v16.8h, v24.8h, v0.h[1] 2593 mla v16.8h, v25.8h, v0.h[2] 2594 mla v16.8h, v26.8h, v0.h[3] 2595 mla v16.8h, v27.8h, v0.h[4] 2596 ext v24.16b, v28.16b, v29.16b, #(2*5) 2597 ext v25.16b, v28.16b, v29.16b, #(2*6) 2598 ext v26.16b, v28.16b, v29.16b, #(2*7) 2599 mla v16.8h, v24.8h, v0.h[5] 2600 mla v16.8h, v25.8h, v0.h[6] 2601.else // 8tap 2602 mul v16.8h, v28.8h, v0.h[0] 2603 ext v24.16b, v28.16b, v29.16b, #(2*1) 2604 ext v25.16b, v28.16b, v29.16b, #(2*2) 2605 ext v26.16b, v28.16b, v29.16b, #(2*3) 2606 ext v27.16b, v28.16b, v29.16b, #(2*4) 2607 mla v16.8h, v24.8h, v0.h[1] 2608 mla v16.8h, v25.8h, v0.h[2] 2609 mla v16.8h, v26.8h, v0.h[3] 2610 mla v16.8h, v27.8h, v0.h[4] 2611 ext v24.16b, v28.16b, v29.16b, #(2*5) 2612 ext v25.16b, v28.16b, v29.16b, #(2*6) 2613 ext v26.16b, v28.16b, v29.16b, #(2*7) 2614 mla v16.8h, v24.8h, v0.h[5] 2615 mla v16.8h, v25.8h, v0.h[6] 2616 mla v16.8h, v26.8h, v0.h[7] 2617.endif 2618 srshr v16.8h, v16.8h, #2 2619 ret 2620 2621L(\type\()_\taps\()_filter_8): 2622 ld1 {v28.8b, v29.8b}, [\sr2], \s_strd 2623 ld1 {v30.8b, v31.8b}, [\src], \s_strd 2624 uxtl v28.8h, v28.8b 2625 uxtl v29.8h, v29.8b 2626 uxtl v30.8h, v30.8b 2627 uxtl v31.8h, v31.8b 2628.ifc \taps, 6tap 2629 ext v26.16b, v28.16b, v29.16b, #2 2630 ext v27.16b, v30.16b, v31.16b, #2 2631 mul v24.8h, v26.8h, v0.h[1] 2632 mul v25.8h, v27.8h, v0.h[1] 2633.irpc i, 23456 2634 ext v26.16b, v28.16b, v29.16b, #(2*\i) 2635 ext v27.16b, v30.16b, v31.16b, #(2*\i) 2636 mla v24.8h, v26.8h, v0.h[\i] 2637 mla v25.8h, v27.8h, v0.h[\i] 2638.endr 2639.else // 8tap 2640 mul v24.8h, v28.8h, v0.h[0] 2641 mul v25.8h, v30.8h, v0.h[0] 2642.irpc i, 1234567 2643 ext v26.16b, v28.16b, v29.16b, #(2*\i) 2644 ext v27.16b, v30.16b, v31.16b, #(2*\i) 2645 mla v24.8h, v26.8h, v0.h[\i] 2646 mla v25.8h, v27.8h, v0.h[\i] 2647.endr 2648.endif 2649 srshr v24.8h, v24.8h, #2 2650 srshr v25.8h, v25.8h, #2 2651 ret 2652 2653L(\type\()_\taps\()_hv_tbl): 2654 .hword L(\type\()_\taps\()_hv_tbl) - 1280b 2655 .hword L(\type\()_\taps\()_hv_tbl) - 640b 2656 .hword L(\type\()_\taps\()_hv_tbl) - 320b 2657 .hword L(\type\()_\taps\()_hv_tbl) - 160b 2658 .hword L(\type\()_\taps\()_hv_tbl) - 80b 2659 .hword L(\type\()_\taps\()_hv_tbl) - 40b 2660 .hword L(\type\()_\taps\()_hv_tbl) - 20b 2661 .hword 0 2662endfunc 2663.endm 2664 2665 2666.macro filter_bilin_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2, sr2, shift_hv 2667function \type\()_bilin_8bpc_neon, export=1 2668 dup v1.16b, \mx 2669 dup v3.16b, \my 2670 mov w9, #16 2671 sub w8, w9, \mx 2672 sub w9, w9, \my 2673 dup v0.16b, w8 2674 dup v2.16b, w9 2675.ifc \type, prep 2676 uxtw \d_strd, \w 2677 lsl \d_strd, \d_strd, #1 2678.endif 2679 2680 clz w8, \w 2681 sub w8, w8, #24 2682 cbnz \mx, L(\type\()_bilin_h) 2683 cbnz \my, L(\type\()_bilin_v) 2684 b \type\()_neon 2685 2686L(\type\()_bilin_h): 2687 cbnz \my, L(\type\()_bilin_hv) 2688 2689 adr x9, L(\type\()_bilin_h_tbl) 2690 ldrh w8, [x9, x8, lsl #1] 2691 sub x9, x9, w8, uxtw 2692 br x9 2693 269420: // 2xN h 2695 AARCH64_VALID_JUMP_TARGET 2696.ifc \type, put 2697 add \ds2, \dst, \d_strd 2698 add \sr2, \src, \s_strd 2699 lsl \d_strd, \d_strd, #1 2700 lsl \s_strd, \s_strd, #1 27012: 2702 ld1 {v4.s}[0], [\src], \s_strd 2703 ld1 {v6.s}[0], [\sr2], \s_strd 2704 ext v5.8b, v4.8b, v4.8b, #1 2705 ext v7.8b, v6.8b, v6.8b, #1 2706 trn1 v4.4h, v4.4h, v6.4h 2707 trn1 v5.4h, v5.4h, v7.4h 2708 subs \h, \h, #2 2709 umull v4.8h, v4.8b, v0.8b 2710 umlal v4.8h, v5.8b, v1.8b 2711 uqrshrn v4.8b, v4.8h, #4 2712 st1 {v4.h}[0], [\dst], \d_strd 2713 st1 {v4.h}[1], [\ds2], \d_strd 2714 b.gt 2b 2715 ret 2716.endif 2717 271840: // 4xN h 2719 AARCH64_VALID_JUMP_TARGET 2720 add \ds2, \dst, \d_strd 2721 add \sr2, \src, \s_strd 2722 lsl \d_strd, \d_strd, #1 2723 lsl \s_strd, \s_strd, #1 27244: 2725 ld1 {v4.8b}, [\src], \s_strd 2726 ld1 {v6.8b}, [\sr2], \s_strd 2727 ext v5.8b, v4.8b, v4.8b, #1 2728 ext v7.8b, v6.8b, v6.8b, #1 2729 trn1 v4.2s, v4.2s, v6.2s 2730 trn1 v5.2s, v5.2s, v7.2s 2731 subs \h, \h, #2 2732 umull v4.8h, v4.8b, v0.8b 2733 umlal v4.8h, v5.8b, v1.8b 2734.ifc \type, put 2735 uqrshrn v4.8b, v4.8h, #4 2736 st1 {v4.s}[0], [\dst], \d_strd 2737 st1 {v4.s}[1], [\ds2], \d_strd 2738.else 2739 st1 {v4.d}[0], [\dst], \d_strd 2740 st1 {v4.d}[1], [\ds2], \d_strd 2741.endif 2742 b.gt 4b 2743 ret 2744 274580: // 8xN h 2746 AARCH64_VALID_JUMP_TARGET 2747 add \ds2, \dst, \d_strd 2748 add \sr2, \src, \s_strd 2749 lsl \d_strd, \d_strd, #1 2750 lsl \s_strd, \s_strd, #1 27518: 2752 ld1 {v4.16b}, [\src], \s_strd 2753 ld1 {v6.16b}, [\sr2], \s_strd 2754 ext v5.16b, v4.16b, v4.16b, #1 2755 ext v7.16b, v6.16b, v6.16b, #1 2756 subs \h, \h, #2 2757 umull v4.8h, v4.8b, v0.8b 2758 umull v6.8h, v6.8b, v0.8b 2759 umlal v4.8h, v5.8b, v1.8b 2760 umlal v6.8h, v7.8b, v1.8b 2761.ifc \type, put 2762 uqrshrn v4.8b, v4.8h, #4 2763 uqrshrn v6.8b, v6.8h, #4 2764 st1 {v4.8b}, [\dst], \d_strd 2765 st1 {v6.8b}, [\ds2], \d_strd 2766.else 2767 st1 {v4.8h}, [\dst], \d_strd 2768 st1 {v6.8h}, [\ds2], \d_strd 2769.endif 2770 b.gt 8b 2771 ret 2772160: 2773320: 2774640: 27751280: // 16xN, 32xN, ... h 2776 AARCH64_VALID_JUMP_TARGET 2777 add \ds2, \dst, \d_strd 2778 add \sr2, \src, \s_strd 2779 lsl \s_strd, \s_strd, #1 2780 2781 sub \s_strd, \s_strd, \w, uxtw 2782 sub \s_strd, \s_strd, #8 2783.ifc \type, put 2784 lsl \d_strd, \d_strd, #1 2785 sub \d_strd, \d_strd, \w, uxtw 2786.endif 2787161: 2788 ld1 {v16.d}[1], [\src], #8 2789 ld1 {v20.d}[1], [\sr2], #8 2790 mov \mx, \w 2791 279216: 2793 ld1 {v18.16b}, [\src], #16 2794 ld1 {v22.16b}, [\sr2], #16 2795 ext v17.16b, v16.16b, v18.16b, #8 2796 ext v19.16b, v16.16b, v18.16b, #9 2797 ext v21.16b, v20.16b, v22.16b, #8 2798 ext v23.16b, v20.16b, v22.16b, #9 2799 umull v16.8h, v17.8b, v0.8b 2800 umull2 v17.8h, v17.16b, v0.16b 2801 umull v20.8h, v21.8b, v0.8b 2802 umull2 v21.8h, v21.16b, v0.16b 2803 umlal v16.8h, v19.8b, v1.8b 2804 umlal2 v17.8h, v19.16b, v1.16b 2805 umlal v20.8h, v23.8b, v1.8b 2806 umlal2 v21.8h, v23.16b, v1.16b 2807 subs \mx, \mx, #16 2808.ifc \type, put 2809 uqrshrn v16.8b, v16.8h, #4 2810 uqrshrn2 v16.16b, v17.8h, #4 2811 uqrshrn v20.8b, v20.8h, #4 2812 uqrshrn2 v20.16b, v21.8h, #4 2813 st1 {v16.16b}, [\dst], #16 2814 st1 {v20.16b}, [\ds2], #16 2815.else 2816 st1 {v16.8h, v17.8h}, [\dst], #32 2817 st1 {v20.8h, v21.8h}, [\ds2], #32 2818.endif 2819 b.le 9f 2820 2821 mov v16.16b, v18.16b 2822 mov v20.16b, v22.16b 2823 b 16b 2824 28259: 2826 add \dst, \dst, \d_strd 2827 add \ds2, \ds2, \d_strd 2828 add \src, \src, \s_strd 2829 add \sr2, \sr2, \s_strd 2830 2831 subs \h, \h, #2 2832 b.gt 161b 2833 ret 2834 2835L(\type\()_bilin_h_tbl): 2836 .hword L(\type\()_bilin_h_tbl) - 1280b 2837 .hword L(\type\()_bilin_h_tbl) - 640b 2838 .hword L(\type\()_bilin_h_tbl) - 320b 2839 .hword L(\type\()_bilin_h_tbl) - 160b 2840 .hword L(\type\()_bilin_h_tbl) - 80b 2841 .hword L(\type\()_bilin_h_tbl) - 40b 2842 .hword L(\type\()_bilin_h_tbl) - 20b 2843 .hword 0 2844 2845 2846L(\type\()_bilin_v): 2847 cmp \h, #4 2848 adr x9, L(\type\()_bilin_v_tbl) 2849 ldrh w8, [x9, x8, lsl #1] 2850 sub x9, x9, w8, uxtw 2851 br x9 2852 285320: // 2xN v 2854 AARCH64_VALID_JUMP_TARGET 2855.ifc \type, put 2856 cmp \h, #2 2857 add \ds2, \dst, \d_strd 2858 add \sr2, \src, \s_strd 2859 lsl \s_strd, \s_strd, #1 2860 lsl \d_strd, \d_strd, #1 2861 2862 // 2x2 v 2863 ld1 {v16.h}[0], [\src], \s_strd 2864 b.gt 24f 286522: 2866 ld1 {v17.h}[0], [\sr2], \s_strd 2867 ld1 {v18.h}[0], [\src], \s_strd 2868 trn1 v16.4h, v16.4h, v17.4h 2869 trn1 v17.4h, v17.4h, v18.4h 2870 umull v4.8h, v16.8b, v2.8b 2871 umlal v4.8h, v17.8b, v3.8b 2872 uqrshrn v4.8b, v4.8h, #4 2873 st1 {v4.h}[0], [\dst] 2874 st1 {v4.h}[1], [\ds2] 2875 ret 287624: // 2x4, 2x6, 2x8, ... v 2877 ld1 {v17.h}[0], [\sr2], \s_strd 2878 ld1 {v18.h}[0], [\src], \s_strd 2879 ld1 {v19.h}[0], [\sr2], \s_strd 2880 ld1 {v20.h}[0], [\src], \s_strd 2881 sub \h, \h, #4 2882 trn1 v16.4h, v16.4h, v17.4h 2883 trn1 v17.4h, v17.4h, v18.4h 2884 trn1 v18.4h, v18.4h, v19.4h 2885 trn1 v19.4h, v19.4h, v20.4h 2886 trn1 v16.2s, v16.2s, v18.2s 2887 trn1 v17.2s, v17.2s, v19.2s 2888 umull v4.8h, v16.8b, v2.8b 2889 umlal v4.8h, v17.8b, v3.8b 2890 cmp \h, #2 2891 uqrshrn v4.8b, v4.8h, #4 2892 st1 {v4.h}[0], [\dst], \d_strd 2893 st1 {v4.h}[1], [\ds2], \d_strd 2894 st1 {v4.h}[2], [\dst], \d_strd 2895 st1 {v4.h}[3], [\ds2], \d_strd 2896 b.lt 0f 2897 mov v16.8b, v20.8b 2898 b.eq 22b 2899 b 24b 29000: 2901 ret 2902.endif 2903 290440: // 4xN v 2905 AARCH64_VALID_JUMP_TARGET 2906 add \ds2, \dst, \d_strd 2907 add \sr2, \src, \s_strd 2908 lsl \s_strd, \s_strd, #1 2909 lsl \d_strd, \d_strd, #1 2910 ld1 {v16.s}[0], [\src], \s_strd 29114: 2912 ld1 {v17.s}[0], [\sr2], \s_strd 2913 ld1 {v18.s}[0], [\src], \s_strd 2914 trn1 v16.2s, v16.2s, v17.2s 2915 trn1 v17.2s, v17.2s, v18.2s 2916 umull v4.8h, v16.8b, v2.8b 2917 umlal v4.8h, v17.8b, v3.8b 2918 subs \h, \h, #2 2919.ifc \type, put 2920 uqrshrn v4.8b, v4.8h, #4 2921 st1 {v4.s}[0], [\dst], \d_strd 2922 st1 {v4.s}[1], [\ds2], \d_strd 2923.else 2924 st1 {v4.d}[0], [\dst], \d_strd 2925 st1 {v4.d}[1], [\ds2], \d_strd 2926.endif 2927 b.le 0f 2928 mov v16.8b, v18.8b 2929 b 4b 29300: 2931 ret 2932 293380: // 8xN v 2934 AARCH64_VALID_JUMP_TARGET 2935 add \ds2, \dst, \d_strd 2936 add \sr2, \src, \s_strd 2937 lsl \s_strd, \s_strd, #1 2938 lsl \d_strd, \d_strd, #1 2939 ld1 {v16.8b}, [\src], \s_strd 29408: 2941 ld1 {v17.8b}, [\sr2], \s_strd 2942 ld1 {v18.8b}, [\src], \s_strd 2943 umull v4.8h, v16.8b, v2.8b 2944 umull v5.8h, v17.8b, v2.8b 2945 umlal v4.8h, v17.8b, v3.8b 2946 umlal v5.8h, v18.8b, v3.8b 2947 subs \h, \h, #2 2948.ifc \type, put 2949 uqrshrn v4.8b, v4.8h, #4 2950 uqrshrn v5.8b, v5.8h, #4 2951 st1 {v4.8b}, [\dst], \d_strd 2952 st1 {v5.8b}, [\ds2], \d_strd 2953.else 2954 st1 {v4.8h}, [\dst], \d_strd 2955 st1 {v5.8h}, [\ds2], \d_strd 2956.endif 2957 b.le 0f 2958 mov v16.8b, v18.8b 2959 b 8b 29600: 2961 ret 2962 2963160: // 16xN, 32xN, ... 2964320: 2965640: 29661280: 2967 AARCH64_VALID_JUMP_TARGET 2968 mov \my, \h 29691: 2970 add \ds2, \dst, \d_strd 2971 add \sr2, \src, \s_strd 2972 lsl \s_strd, \s_strd, #1 2973 lsl \d_strd, \d_strd, #1 2974 2975 ld1 {v16.16b}, [\src], \s_strd 29762: 2977 ld1 {v17.16b}, [\sr2], \s_strd 2978 ld1 {v18.16b}, [\src], \s_strd 2979 umull v4.8h, v16.8b, v2.8b 2980 umull2 v5.8h, v16.16b, v2.16b 2981 umull v6.8h, v17.8b, v2.8b 2982 umull2 v7.8h, v17.16b, v2.16b 2983 umlal v4.8h, v17.8b, v3.8b 2984 umlal2 v5.8h, v17.16b, v3.16b 2985 umlal v6.8h, v18.8b, v3.8b 2986 umlal2 v7.8h, v18.16b, v3.16b 2987 subs \h, \h, #2 2988.ifc \type, put 2989 uqrshrn v4.8b, v4.8h, #4 2990 uqrshrn2 v4.16b, v5.8h, #4 2991 uqrshrn v6.8b, v6.8h, #4 2992 uqrshrn2 v6.16b, v7.8h, #4 2993 st1 {v4.16b}, [\dst], \d_strd 2994 st1 {v6.16b}, [\ds2], \d_strd 2995.else 2996 st1 {v4.8h, v5.8h}, [\dst], \d_strd 2997 st1 {v6.8h, v7.8h}, [\ds2], \d_strd 2998.endif 2999 b.le 9f 3000 mov v16.16b, v18.16b 3001 b 2b 30029: 3003 subs \w, \w, #16 3004 b.le 0f 3005 asr \s_strd, \s_strd, #1 3006 asr \d_strd, \d_strd, #1 3007 msub \src, \s_strd, \xmy, \src 3008 msub \dst, \d_strd, \xmy, \dst 3009 sub \src, \src, \s_strd, lsl #1 3010 mov \h, \my 3011 add \src, \src, #16 3012.ifc \type, put 3013 add \dst, \dst, #16 3014.else 3015 add \dst, \dst, #32 3016.endif 3017 b 1b 30180: 3019 ret 3020 3021L(\type\()_bilin_v_tbl): 3022 .hword L(\type\()_bilin_v_tbl) - 1280b 3023 .hword L(\type\()_bilin_v_tbl) - 640b 3024 .hword L(\type\()_bilin_v_tbl) - 320b 3025 .hword L(\type\()_bilin_v_tbl) - 160b 3026 .hword L(\type\()_bilin_v_tbl) - 80b 3027 .hword L(\type\()_bilin_v_tbl) - 40b 3028 .hword L(\type\()_bilin_v_tbl) - 20b 3029 .hword 0 3030 3031L(\type\()_bilin_hv): 3032 uxtl v2.8h, v2.8b 3033 uxtl v3.8h, v3.8b 3034 adr x9, L(\type\()_bilin_hv_tbl) 3035 ldrh w8, [x9, x8, lsl #1] 3036 sub x9, x9, w8, uxtw 3037 br x9 3038 303920: // 2xN hv 3040 AARCH64_VALID_JUMP_TARGET 3041.ifc \type, put 3042 add \sr2, \src, \s_strd 3043 add \ds2, \dst, \d_strd 3044 lsl \s_strd, \s_strd, #1 3045 lsl \d_strd, \d_strd, #1 3046 3047 ld1 {v28.s}[0], [\src], \s_strd 3048 ext v29.8b, v28.8b, v28.8b, #1 3049 umull v16.8h, v28.8b, v0.8b 3050 umlal v16.8h, v29.8b, v1.8b 3051 30522: 3053 ld1 {v28.s}[0], [\sr2], \s_strd 3054 ld1 {v30.s}[0], [\src], \s_strd 3055 ext v29.8b, v28.8b, v28.8b, #1 3056 ext v31.8b, v30.8b, v30.8b, #1 3057 trn1 v28.4h, v28.4h, v30.4h 3058 trn1 v29.4h, v29.4h, v31.4h 3059 umull v17.8h, v28.8b, v0.8b 3060 umlal v17.8h, v29.8b, v1.8b 3061 3062 trn1 v16.2s, v16.2s, v17.2s 3063 3064 mul v4.4h, v16.4h, v2.4h 3065 mla v4.4h, v17.4h, v3.4h 3066 uqrshrn v4.8b, v4.8h, #8 3067 subs \h, \h, #2 3068 st1 {v4.h}[0], [\dst], \d_strd 3069 st1 {v4.h}[1], [\ds2], \d_strd 3070 b.le 0f 3071 trn2 v16.2s, v17.2s, v17.2s 3072 b 2b 30730: 3074 ret 3075.endif 3076 307740: // 4xN hv 3078 AARCH64_VALID_JUMP_TARGET 3079 add \sr2, \src, \s_strd 3080 add \ds2, \dst, \d_strd 3081 lsl \s_strd, \s_strd, #1 3082 lsl \d_strd, \d_strd, #1 3083 3084 ld1 {v28.8b}, [\src], \s_strd 3085 ext v29.8b, v28.8b, v28.8b, #1 3086 umull v16.8h, v28.8b, v0.8b 3087 umlal v16.8h, v29.8b, v1.8b 3088 30894: 3090 ld1 {v28.8b}, [\sr2], \s_strd 3091 ld1 {v30.8b}, [\src], \s_strd 3092 ext v29.8b, v28.8b, v28.8b, #1 3093 ext v31.8b, v30.8b, v30.8b, #1 3094 trn1 v28.2s, v28.2s, v30.2s 3095 trn1 v29.2s, v29.2s, v31.2s 3096 umull v17.8h, v28.8b, v0.8b 3097 umlal v17.8h, v29.8b, v1.8b 3098 3099 trn1 v16.2d, v16.2d, v17.2d 3100 3101 mul v4.8h, v16.8h, v2.8h 3102 mla v4.8h, v17.8h, v3.8h 3103 subs \h, \h, #2 3104.ifc \type, put 3105 uqrshrn v4.8b, v4.8h, #8 3106 st1 {v4.s}[0], [\dst], \d_strd 3107 st1 {v4.s}[1], [\ds2], \d_strd 3108.else 3109 urshr v4.8h, v4.8h, #4 3110 st1 {v4.d}[0], [\dst], \d_strd 3111 st1 {v4.d}[1], [\ds2], \d_strd 3112.endif 3113 b.le 0f 3114 trn2 v16.2d, v17.2d, v17.2d 3115 b 4b 31160: 3117 ret 3118 311980: // 8xN, 16xN, ... hv 3120160: 3121320: 3122640: 31231280: 3124 AARCH64_VALID_JUMP_TARGET 3125 mov \my, \h 3126 31271: 3128 add \sr2, \src, \s_strd 3129 add \ds2, \dst, \d_strd 3130 lsl \s_strd, \s_strd, #1 3131 lsl \d_strd, \d_strd, #1 3132 3133 ld1 {v28.16b}, [\src], \s_strd 3134 ext v29.16b, v28.16b, v28.16b, #1 3135 umull v16.8h, v28.8b, v0.8b 3136 umlal v16.8h, v29.8b, v1.8b 3137 31382: 3139 ld1 {v28.16b}, [\sr2], \s_strd 3140 ld1 {v30.16b}, [\src], \s_strd 3141 ext v29.16b, v28.16b, v28.16b, #1 3142 ext v31.16b, v30.16b, v30.16b, #1 3143 umull v17.8h, v28.8b, v0.8b 3144 umlal v17.8h, v29.8b, v1.8b 3145 umull v18.8h, v30.8b, v0.8b 3146 umlal v18.8h, v31.8b, v1.8b 3147 3148 mul v4.8h, v16.8h, v2.8h 3149 mla v4.8h, v17.8h, v3.8h 3150 mul v5.8h, v17.8h, v2.8h 3151 mla v5.8h, v18.8h, v3.8h 3152 subs \h, \h, #2 3153.ifc \type, put 3154 uqrshrn v4.8b, v4.8h, #8 3155 uqrshrn v5.8b, v5.8h, #8 3156 st1 {v4.8b}, [\dst], \d_strd 3157 st1 {v5.8b}, [\ds2], \d_strd 3158.else 3159 urshr v4.8h, v4.8h, #4 3160 urshr v5.8h, v5.8h, #4 3161 st1 {v4.8h}, [\dst], \d_strd 3162 st1 {v5.8h}, [\ds2], \d_strd 3163.endif 3164 b.le 9f 3165 mov v16.16b, v18.16b 3166 b 2b 31679: 3168 subs \w, \w, #8 3169 b.le 0f 3170 asr \s_strd, \s_strd, #1 3171 asr \d_strd, \d_strd, #1 3172 msub \src, \s_strd, \xmy, \src 3173 msub \dst, \d_strd, \xmy, \dst 3174 sub \src, \src, \s_strd, lsl #1 3175 mov \h, \my 3176 add \src, \src, #8 3177.ifc \type, put 3178 add \dst, \dst, #8 3179.else 3180 add \dst, \dst, #16 3181.endif 3182 b 1b 31830: 3184 ret 3185 3186L(\type\()_bilin_hv_tbl): 3187 .hword L(\type\()_bilin_hv_tbl) - 1280b 3188 .hword L(\type\()_bilin_hv_tbl) - 640b 3189 .hword L(\type\()_bilin_hv_tbl) - 320b 3190 .hword L(\type\()_bilin_hv_tbl) - 160b 3191 .hword L(\type\()_bilin_hv_tbl) - 80b 3192 .hword L(\type\()_bilin_hv_tbl) - 40b 3193 .hword L(\type\()_bilin_hv_tbl) - 20b 3194 .hword 0 3195endfunc 3196.endm 3197 3198make_8tap_fn put, regular_sharp, REGULAR, SHARP, 8tap 3199make_8tap_fn put, smooth_sharp, SMOOTH, SHARP, 8tap 3200make_8tap_fn put, sharp, SHARP, SHARP, 8tap 3201make_8tap_fn put, sharp_regular, SHARP, REGULAR, 8tap 3202make_8tap_fn put, sharp_smooth, SHARP, SMOOTH, 8tap 3203filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10, 8tap 3204 3205make_8tap_fn put, regular, REGULAR, REGULAR, 6tap 3206make_8tap_fn put, regular_smooth, REGULAR, SMOOTH, 6tap 3207make_8tap_fn put, smooth, SMOOTH, SMOOTH, 6tap 3208make_8tap_fn put, smooth_regular, SMOOTH, REGULAR, 6tap 3209filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10, 6tap 3210filter_bilin_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10 3211 3212make_8tap_fn prep, regular_sharp, REGULAR, SHARP, 8tap 3213make_8tap_fn prep, smooth_sharp, SMOOTH, SHARP, 8tap 3214make_8tap_fn prep, sharp, SHARP, SHARP, 8tap 3215make_8tap_fn prep, sharp_regular, SHARP, REGULAR, 8tap 3216make_8tap_fn prep, sharp_smooth, SHARP, SMOOTH, 8tap 3217filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6, 8tap 3218 3219make_8tap_fn prep, regular, REGULAR, REGULAR, 6tap 3220make_8tap_fn prep, regular_smooth, REGULAR, SMOOTH, 6tap 3221make_8tap_fn prep, smooth, SMOOTH, SMOOTH, 6tap 3222make_8tap_fn prep, smooth_regular, SMOOTH, REGULAR, 6tap 3223filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6, 6tap 3224filter_bilin_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6 3225 3226 3227.macro load_filter_row dst, src, inc 3228 asr w13, \src, #10 3229 add \src, \src, \inc 3230 ldr \dst, [x11, w13, sxtw #3] 3231.endm 3232 3233function warp_filter_horz_neon 3234 add w12, w5, #512 3235 3236 ld1 {v16.8b, v17.8b}, [x2], x3 3237 3238 load_filter_row d0, w12, w7 3239 load_filter_row d1, w12, w7 3240 load_filter_row d2, w12, w7 3241 load_filter_row d3, w12, w7 3242 load_filter_row d4, w12, w7 3243 load_filter_row d5, w12, w7 3244 load_filter_row d6, w12, w7 3245 // subtract by 128 to allow using smull 3246 eor v16.8b, v16.8b, v22.8b 3247 eor v17.8b, v17.8b, v22.8b 3248 load_filter_row d7, w12, w7 3249 3250 ext v18.8b, v16.8b, v17.8b, #1 3251 ext v19.8b, v16.8b, v17.8b, #2 3252 smull v0.8h, v0.8b, v16.8b 3253 smull v1.8h, v1.8b, v18.8b 3254 ext v18.8b, v16.8b, v17.8b, #3 3255 ext v20.8b, v16.8b, v17.8b, #4 3256 smull v2.8h, v2.8b, v19.8b 3257 smull v3.8h, v3.8b, v18.8b 3258 ext v18.8b, v16.8b, v17.8b, #5 3259 ext v19.8b, v16.8b, v17.8b, #6 3260 smull v4.8h, v4.8b, v20.8b 3261 smull v5.8h, v5.8b, v18.8b 3262 ext v18.8b, v16.8b, v17.8b, #7 3263 smull v6.8h, v6.8b, v19.8b 3264 smull v7.8h, v7.8b, v18.8b 3265 3266 addp v0.8h, v0.8h, v1.8h 3267 addp v2.8h, v2.8h, v3.8h 3268 addp v4.8h, v4.8h, v5.8h 3269 addp v6.8h, v6.8h, v7.8h 3270 3271 addp v0.8h, v0.8h, v2.8h 3272 addp v4.8h, v4.8h, v6.8h 3273 3274 addp v0.8h, v0.8h, v4.8h 3275 3276 add w5, w5, w8 3277 3278 ret 3279endfunc 3280 3281// void dav1d_warp_affine_8x8_8bpc_neon( 3282// pixel *dst, const ptrdiff_t dst_stride, 3283// const pixel *src, const ptrdiff_t src_stride, 3284// const int16_t *const abcd, int mx, int my) 3285.macro warp t, shift 3286function warp_affine_8x8\t\()_8bpc_neon, export=1 3287 ldr x4, [x4] 3288 sbfx x7, x4, #0, #16 3289 sbfx x8, x4, #16, #16 3290 sbfx x9, x4, #32, #16 3291 sbfx x4, x4, #48, #16 3292 mov w10, #8 3293 sub x2, x2, x3, lsl #1 3294 sub x2, x2, x3 3295 sub x2, x2, #3 3296 movrel x11, X(mc_warp_filter), 64*8 3297 mov x15, x30 3298.ifnb \t 3299 lsl x1, x1, #1 3300.endif 3301 3302 movi v22.8b, #128 3303.ifb \t 3304 movi v23.8h, #128 3305.else 3306 movi v23.8h, #8, lsl #8 3307.endif 3308 3309 bl warp_filter_horz_neon 3310 srshr v24.8h, v0.8h, #3 3311 bl warp_filter_horz_neon 3312 srshr v25.8h, v0.8h, #3 3313 bl warp_filter_horz_neon 3314 srshr v26.8h, v0.8h, #3 3315 bl warp_filter_horz_neon 3316 srshr v27.8h, v0.8h, #3 3317 bl warp_filter_horz_neon 3318 srshr v28.8h, v0.8h, #3 3319 bl warp_filter_horz_neon 3320 srshr v29.8h, v0.8h, #3 3321 bl warp_filter_horz_neon 3322 srshr v30.8h, v0.8h, #3 3323 33241: 3325 add w14, w6, #512 3326 bl warp_filter_horz_neon 3327 srshr v31.8h, v0.8h, #3 3328 3329 load_filter_row d0, w14, w9 3330 load_filter_row d1, w14, w9 3331 load_filter_row d2, w14, w9 3332 load_filter_row d3, w14, w9 3333 load_filter_row d4, w14, w9 3334 load_filter_row d5, w14, w9 3335 load_filter_row d6, w14, w9 3336 load_filter_row d7, w14, w9 3337 transpose_8x8b_xtl v0, v1, v2, v3, v4, v5, v6, v7, sxtl 3338 3339 // This ordering of smull/smlal/smull2/smlal2 is highly 3340 // beneficial for Cortex A53 here. 3341 smull v16.4s, v24.4h, v0.4h 3342 smlal v16.4s, v25.4h, v1.4h 3343 smlal v16.4s, v26.4h, v2.4h 3344 smlal v16.4s, v27.4h, v3.4h 3345 smlal v16.4s, v28.4h, v4.4h 3346 smlal v16.4s, v29.4h, v5.4h 3347 smlal v16.4s, v30.4h, v6.4h 3348 smlal v16.4s, v31.4h, v7.4h 3349 smull2 v17.4s, v24.8h, v0.8h 3350 smlal2 v17.4s, v25.8h, v1.8h 3351 smlal2 v17.4s, v26.8h, v2.8h 3352 smlal2 v17.4s, v27.8h, v3.8h 3353 smlal2 v17.4s, v28.8h, v4.8h 3354 smlal2 v17.4s, v29.8h, v5.8h 3355 smlal2 v17.4s, v30.8h, v6.8h 3356 smlal2 v17.4s, v31.8h, v7.8h 3357 3358 mov v24.16b, v25.16b 3359 mov v25.16b, v26.16b 3360 sqrshrn v16.4h, v16.4s, #\shift 3361 mov v26.16b, v27.16b 3362 sqrshrn2 v16.8h, v17.4s, #\shift 3363 mov v27.16b, v28.16b 3364 mov v28.16b, v29.16b 3365 add v16.8h, v16.8h, v23.8h 3366.ifb \t 3367 sqxtun v16.8b, v16.8h 3368.endif 3369 mov v29.16b, v30.16b 3370 mov v30.16b, v31.16b 3371 subs w10, w10, #1 3372.ifnb \t 3373 st1 {v16.8h}, [x0], x1 3374.else 3375 st1 {v16.8b}, [x0], x1 3376.endif 3377 3378 add w6, w6, w4 3379 b.gt 1b 3380 3381 ret x15 3382endfunc 3383.endm 3384 3385warp , 11 3386warp t, 7 3387 3388// void dav1d_emu_edge_8bpc_neon( 3389// const intptr_t bw, const intptr_t bh, 3390// const intptr_t iw, const intptr_t ih, 3391// const intptr_t x, const intptr_t y, 3392// pixel *dst, const ptrdiff_t dst_stride, 3393// const pixel *ref, const ptrdiff_t ref_stride) 3394function emu_edge_8bpc_neon, export=1 3395 ldp x8, x9, [sp] 3396 3397 // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) 3398 // ref += iclip(x, 0, iw - 1) 3399 sub x12, x3, #1 // ih - 1 3400 cmp x5, x3 3401 sub x13, x2, #1 // iw - 1 3402 csel x12, x12, x5, ge // min(y, ih - 1) 3403 cmp x4, x2 3404 bic x12, x12, x12, asr #63 // max(min(y, ih - 1), 0) 3405 csel x13, x13, x4, ge // min(x, iw - 1) 3406 bic x13, x13, x13, asr #63 // max(min(x, iw - 1), 0) 3407 madd x8, x12, x9, x8 // ref += iclip() * stride 3408 add x8, x8, x13 // ref += iclip() 3409 3410 // bottom_ext = iclip(y + bh - ih, 0, bh - 1) 3411 // top_ext = iclip(-y, 0, bh - 1) 3412 add x10, x5, x1 // y + bh 3413 neg x5, x5 // -y 3414 sub x10, x10, x3 // y + bh - ih 3415 sub x12, x1, #1 // bh - 1 3416 cmp x10, x1 3417 bic x5, x5, x5, asr #63 // max(-y, 0) 3418 csel x10, x10, x12, lt // min(y + bh - ih, bh-1) 3419 cmp x5, x1 3420 bic x10, x10, x10, asr #63 // max(min(y + bh - ih, bh-1), 0) 3421 csel x5, x5, x12, lt // min(max(-y, 0), bh-1) 3422 3423 // right_ext = iclip(x + bw - iw, 0, bw - 1) 3424 // left_ext = iclip(-x, 0, bw - 1) 3425 add x11, x4, x0 // x + bw 3426 neg x4, x4 // -x 3427 sub x11, x11, x2 // x + bw - iw 3428 sub x13, x0, #1 // bw - 1 3429 cmp x11, x0 3430 bic x4, x4, x4, asr #63 // max(-x, 0) 3431 csel x11, x11, x13, lt // min(x + bw - iw, bw-1) 3432 cmp x4, x0 3433 bic x11, x11, x11, asr #63 // max(min(x + bw - iw, bw-1), 0) 3434 csel x4, x4, x13, lt // min(max(-x, 0), bw - 1) 3435 3436 // center_h = bh - top_ext - bottom_ext 3437 // dst += top_ext * PXSTRIDE(dst_stride) 3438 // center_w = bw - left_ext - right_ext 3439 sub x1, x1, x5 // bh - top_ext 3440 madd x6, x5, x7, x6 3441 sub x2, x0, x4 // bw - left_ext 3442 sub x1, x1, x10 // center_h = bh - top_ext - bottom_ext 3443 sub x2, x2, x11 // center_w = bw - left_ext - right_ext 3444 3445 mov x14, x6 // backup of dst 3446 3447.macro v_loop need_left, need_right 34480: 3449.if \need_left 3450 ld1r {v0.16b}, [x8] 3451 mov x12, x6 // out = dst 3452 mov x3, x4 34531: 3454 subs x3, x3, #16 3455 st1 {v0.16b}, [x12], #16 3456 b.gt 1b 3457.endif 3458 mov x13, x8 3459 add x12, x6, x4 // out = dst + left_ext 3460 mov x3, x2 34611: 3462 ld1 {v0.16b, v1.16b}, [x13], #32 3463 subs x3, x3, #32 3464 st1 {v0.16b, v1.16b}, [x12], #32 3465 b.gt 1b 3466.if \need_right 3467 add x3, x8, x2 // in + center_w 3468 sub x3, x3, #1 // in + center_w - 1 3469 add x12, x6, x4 // dst + left_ext 3470 ld1r {v0.16b}, [x3] 3471 add x12, x12, x2 // out = dst + left_ext + center_w 3472 mov x3, x11 34731: 3474 subs x3, x3, #16 3475 st1 {v0.16b}, [x12], #16 3476 b.gt 1b 3477.endif 3478 3479 subs x1, x1, #1 // center_h-- 3480 add x6, x6, x7 3481 add x8, x8, x9 3482 b.gt 0b 3483.endm 3484 3485 cbz x4, 2f 3486 // need_left 3487 cbz x11, 3f 3488 // need_left + need_right 3489 v_loop 1, 1 3490 b 5f 3491 34922: 3493 // !need_left 3494 cbz x11, 4f 3495 // !need_left + need_right 3496 v_loop 0, 1 3497 b 5f 3498 34993: 3500 // need_left + !need_right 3501 v_loop 1, 0 3502 b 5f 3503 35044: 3505 // !need_left + !need_right 3506 v_loop 0, 0 3507 35085: 3509 3510 cbz x10, 3f 3511 // need_bottom 3512 sub x8, x6, x7 // ref = dst - stride 3513 mov x4, x0 35141: 3515 ld1 {v0.16b, v1.16b}, [x8], #32 3516 mov x3, x10 35172: 3518 subs x3, x3, #1 3519 st1 {v0.16b, v1.16b}, [x6], x7 3520 b.gt 2b 3521 msub x6, x7, x10, x6 // dst -= bottom_ext * stride 3522 subs x4, x4, #32 // bw -= 32 3523 add x6, x6, #32 // dst += 32 3524 b.gt 1b 3525 35263: 3527 cbz x5, 3f 3528 // need_top 3529 msub x6, x7, x5, x14 // dst = stored_dst - top_ext * stride 35301: 3531 ld1 {v0.16b, v1.16b}, [x14], #32 3532 mov x3, x5 35332: 3534 subs x3, x3, #1 3535 st1 {v0.16b, v1.16b}, [x6], x7 3536 b.gt 2b 3537 msub x6, x7, x5, x6 // dst -= top_ext * stride 3538 subs x0, x0, #32 // bw -= 32 3539 add x6, x6, #32 // dst += 32 3540 b.gt 1b 3541 35423: 3543 ret 3544endfunc 3545