/*
 * Copyright © 2024, VideoLAN and dav1d authors
 * Copyright © 2024, Janne Grunau
 * Copyright © 2024, Martin Storsjo
 * Copyright © 2024, Arm Limited
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"


#if HAVE_DOTPROD
ENABLE_DOTPROD

// No spaces in these expressions, due to gas-preprocessor. The packed values
// are offset by -1 so that the -1 needed when indexing `mc_subpel_filters`
// (subpel positions are 1-based) is already folded into the constant.
#define REGULAR1 (((0*15-1)<<7)|(3*15-1))
#define SMOOTH1  (((1*15-1)<<7)|(4*15-1))
#define SHARP1   (((2*15-1)<<7)|(3*15-1))

#define FUNC_ALIGN 2
#define JUMP_ALIGN 2
#define LOOP_ALIGN 2


// Lookup table used to help conversion of shifted 32-bit values to 8-bit.
        .align 4
L(hv_tbl_neon_dotprod):
        .byte 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30

// Shuffle indices to permute horizontal samples in preparation for input to
// the SDOT instructions. The 8-tap horizontal convolution uses sample indices
// in the interval [-3, 4] relative to the current sample position. We load
// samples from index -4 to keep the loads word aligned, so the shuffle bytes
// are translated by 1 to compensate (e.g. the first four indices 1, 2, 3, 4
// gather samples -3..0 for output pixel 0).
        .align 4
L(h_tbl_neon_dotprod):
        .byte  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6,  4,  5,  6,  7
        .byte  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10,  8,  9, 10, 11
        .byte  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 12, 13, 14, 15

// Vertical convolutions also use the SDOT instructions, with a 128-bit
// register holding a transposed 4x4 matrix of values. Subsequent iterations of
// the vertical convolution can reuse the 3x4 sub-matrix from the previous loop
// iteration. These shuffle indices shift and merge this 4x4 matrix with the
// values of a new line.
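// For example, with v6 loaded from the first row below,
// `tbl vA.16b, {vA.16b, vB.16b}, v6.16b` drops the oldest sample of each
// 4-byte column of vA (bytes 0, 4, 8, 12), shifts the remaining three samples
// down, and appends one byte of vB (indices 16, 20, 24, 28 select bytes 0, 4,
// 8 and 12 of the second table register). The remaining four rows do the same
// but take the new fourth sample from a freshly loaded source row (its bytes
// 0-3, 4-7, 8-11 and 12-15, respectively), as used in the vertical loops below.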
        .align 4
L(v_tbl_neon_dotprod):
        .byte  1,  2,  3, 16,  5,  6,  7, 20,  9, 10, 11, 24, 13, 14, 15, 28
        .byte  1,  2,  3, 16,  5,  6,  7, 17,  9, 10, 11, 18, 13, 14, 15, 19
        .byte  1,  2,  3, 20,  5,  6,  7, 21,  9, 10, 11, 22, 13, 14, 15, 23
        .byte  1,  2,  3, 24,  5,  6,  7, 25,  9, 10, 11, 26, 13, 14, 15, 27
        .byte  1,  2,  3, 28,  5,  6,  7, 29,  9, 10, 11, 30, 13, 14, 15, 31


.macro make_8tap_fn op, type, type_h, type_v, isa, jump=1
function \op\()_8tap_\type\()_8bpc_\isa, export=1, align=FUNC_ALIGN
        mov             x9, \type_h
        mov             x10, \type_v
    .if \jump
        b               \op\()_8tap_\isa
    .endif
endfunc
.endm

.macro filter_8tap_fn type, dot, isa, dst, d_strd, src, s_strd, w, h, mx, my, xmx, xmy, ldst, lsrc, wd_strd
make_8tap_fn \type, sharp,          SHARP1,   SHARP1,   \isa
make_8tap_fn \type, sharp_smooth,   SHARP1,   SMOOTH1,  \isa
make_8tap_fn \type, sharp_regular,  SHARP1,   REGULAR1, \isa
make_8tap_fn \type, smooth_sharp,   SMOOTH1,  SHARP1,   \isa
make_8tap_fn \type, smooth,         SMOOTH1,  SMOOTH1,  \isa
make_8tap_fn \type, smooth_regular, SMOOTH1,  REGULAR1, \isa
make_8tap_fn \type, regular_sharp,  REGULAR1, SHARP1,   \isa
make_8tap_fn \type, regular_smooth, REGULAR1, SMOOTH1,  \isa
make_8tap_fn \type, regular,        REGULAR1, REGULAR1, \isa, jump=0

function \type\()_8tap_\isa, align=FUNC_ALIGN
        clz             w8, \w
        mov             w11, #0x4081            // (1 << 14) | (1 << 7) | (1 << 0)
        sub             w8, w8, #24             // for jump tables
        movrel          x12, X(mc_subpel_filters)
        cbnz            \mx, L(\type\()_8tap_h_hv_\isa)
        cbnz            \my, L(\type\()_8tap_v_\isa)
.ifc \type, prep
        add             \wd_strd, \w, \w        // prep_neon needs w * 2 as stride
.endif
        b               X(\type\()_neon)

        .align JUMP_ALIGN
L(\type\()_8tap_v_\isa):
        madd            \my, \my, w11, w10
        ldr             q6, L(v_tbl_neon_dotprod)
        sub             \src, \src, \s_strd
.ifc \isa, neon_dotprod
    .ifc \type, prep
        mov             w8, #0x2002             // FILTER_WEIGHT * 128 + rounding
        dup             v4.4s, w8
    .else
        movi            v4.4s, #32, lsl #8      // FILTER_WEIGHT * 128, bias for SDOT
    .endif
.endif
        ubfx            w11, \my, #7, #7
        and             \my, \my, #0x7F
        ldr             q28, L(v_tbl_neon_dotprod) + 16
        cmp             \h, #4
        csel            \my, \my, w11, le
        sub             \src, \src, \s_strd, lsl #1     // src - s_strd * 3
        add             \xmy, x12, \xmy, lsl #3         // subpel V filter address
        ldr             q29, L(v_tbl_neon_dotprod) + 32
.ifc \isa, neon_dotprod
        movi            v5.16b, #128
.endif
        ldr             d7, [\xmy]
        cmp             \w, #8
        b.eq            80f
        b.lt            40f

        // .align JUMP_ALIGN // fallthrough
160:    // V - 16xN+
        ldr             q30, L(v_tbl_neon_dotprod) + 48
        ldr             q31, L(v_tbl_neon_dotprod) + 64
.ifc \type, prep
        add             \wd_strd, \w, \w
.endif
        .align LOOP_ALIGN
161:
        mov             \lsrc, \src
        mov             \ldst, \dst
        sub             w8, \h, #1

        ldr             q16, [\lsrc]
        ldr             q17, [\lsrc, \s_strd]
        add             \lsrc, \lsrc, \s_strd, lsl #1
        ldr             q18, [\lsrc]
        ldr             q19, [\lsrc, \s_strd]
        add             \lsrc, \lsrc, \s_strd, lsl #1

        zip1            v0.16b, v16.16b, v17.16b
        zip2            v1.16b, v16.16b, v17.16b
        zip1            v2.16b, v18.16b, v19.16b
        zip2            v3.16b, v18.16b, v19.16b

        ldr             q20, [\lsrc]
        ldr             q21, [\lsrc, \s_strd]
        add             \lsrc, \lsrc, \s_strd, lsl #1
        ldr             q22, [\lsrc]
        ldr             q23, [\lsrc, \s_strd]
        add             \lsrc, \lsrc, \s_strd, lsl #1

        zip1            v18.16b, v20.16b, v21.16b
        zip2            v21.16b, v20.16b, v21.16b
        zip1            v24.16b, v22.16b, v23.16b
        zip2            v27.16b, v22.16b, v23.16b

        zip1            v16.8h, v0.8h, v2.8h
        zip2            v19.8h, v0.8h, v2.8h
        zip1            v22.8h, v1.8h, v3.8h
        zip2            v25.8h, v1.8h, v3.8h

        zip1            v17.8h, v18.8h, v24.8h
        zip2            v20.8h, v18.8h, v24.8h
        zip1            v23.8h, v21.8h, v27.8h
        zip2            v26.8h, v21.8h, v27.8h
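        // v16/v19/v22/v25 now hold rows 0-3 and v17/v20/v23/v26 rows 4-7,
        // each as a transposed 4x4 block covering columns 0-3, 4-7, 8-11 and
        // 12-15, ready to be consumed four taps at a time by the dot products.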
.ifc \isa, neon_dotprod
        sub             v16.16b, v16.16b, v5.16b
        sub             v19.16b, v19.16b, v5.16b
        sub             v22.16b, v22.16b, v5.16b
        sub             v25.16b, v25.16b, v5.16b

        sub             v17.16b, v17.16b, v5.16b
        sub             v20.16b, v20.16b, v5.16b
        sub             v23.16b, v23.16b, v5.16b
        sub             v26.16b, v26.16b, v5.16b
.endif
        .align LOOP_ALIGN
16:
.ifc \isa, neon_i8mm
        ld1             {v18.16b}, [\lsrc], \s_strd
        movi            v0.4s, #0
        movi            v1.4s, #0
        movi            v2.4s, #0
        movi            v3.4s, #0
        mov             v21.16b, v18.16b
        mov             v24.16b, v18.16b
        mov             v27.16b, v18.16b
.else   // neon_dotprod
        ld1             {v27.16b}, [\lsrc], \s_strd
        mov             v0.16b, v4.16b
        mov             v1.16b, v4.16b
        mov             v2.16b, v4.16b
        mov             v3.16b, v4.16b
        sub             v18.16b, v27.16b, v5.16b
        sub             v21.16b, v27.16b, v5.16b
        sub             v24.16b, v27.16b, v5.16b
        sub             v27.16b, v27.16b, v5.16b
.endif
        \dot            v0.4s, v16.16b, v7.4b[0]
        \dot            v1.4s, v19.16b, v7.4b[0]
        \dot            v2.4s, v22.16b, v7.4b[0]
        \dot            v3.4s, v25.16b, v7.4b[0]

        tbl             v16.16b, {v16.16b, v17.16b}, v6.16b
        tbl             v19.16b, {v19.16b, v20.16b}, v6.16b
        tbl             v22.16b, {v22.16b, v23.16b}, v6.16b
        tbl             v25.16b, {v25.16b, v26.16b}, v6.16b

        \dot            v0.4s, v17.16b, v7.4b[1]
        \dot            v1.4s, v20.16b, v7.4b[1]
        \dot            v2.4s, v23.16b, v7.4b[1]
        \dot            v3.4s, v26.16b, v7.4b[1]

        tbl             v17.16b, {v17.16b, v18.16b}, v28.16b
        tbl             v20.16b, {v20.16b, v21.16b}, v29.16b
        tbl             v23.16b, {v23.16b, v24.16b}, v30.16b
        tbl             v26.16b, {v26.16b, v27.16b}, v31.16b

        subs            w8, w8, #1
        uzp1            v0.8h, v0.8h, v1.8h
        uzp1            v2.8h, v2.8h, v3.8h
.ifc \type, prep
    .ifc \isa, neon_i8mm
        srshr           v0.8h, v0.8h, #2
        srshr           v1.8h, v2.8h, #2
    .else
        sshr            v0.8h, v0.8h, #2
        sshr            v1.8h, v2.8h, #2
    .endif
        st1             {v0.8h, v1.8h}, [\ldst], \d_strd
.else   // put
        sqrshrun        v0.8b, v0.8h, #6
        sqrshrun2       v0.16b, v2.8h, #6
        st1             {v0.16b}, [\ldst], \d_strd
.endif
        b.gt            16b

.ifc \isa, neon_i8mm
        movi            v0.4s, #0
        movi            v1.4s, #0
        movi            v2.4s, #0
        movi            v3.4s, #0
.else   // neon_dotprod
        mov             v0.16b, v4.16b
        mov             v1.16b, v4.16b
        mov             v2.16b, v4.16b
        mov             v3.16b, v4.16b
.endif
        \dot            v0.4s, v16.16b, v7.4b[0]
        \dot            v1.4s, v19.16b, v7.4b[0]
        \dot            v2.4s, v22.16b, v7.4b[0]
        \dot            v3.4s, v25.16b, v7.4b[0]

        \dot            v0.4s, v17.16b, v7.4b[1]
        \dot            v1.4s, v20.16b, v7.4b[1]
        \dot            v2.4s, v23.16b, v7.4b[1]
        \dot            v3.4s, v26.16b, v7.4b[1]

        subs            \w, \w, #16
        uzp1            v0.8h, v0.8h, v1.8h
        uzp1            v2.8h, v2.8h, v3.8h
.ifc \type, prep
    .ifc \isa, neon_i8mm
        srshr           v0.8h, v0.8h, #2
        srshr           v1.8h, v2.8h, #2
    .else
        sshr            v0.8h, v0.8h, #2
        sshr            v1.8h, v2.8h, #2
    .endif
        stp             q0, q1, [\ldst]
        add             \dst, \dst, #32
.else   // put
        sqrshrun        v0.8b, v0.8h, #6
        sqrshrun2       v0.16b, v2.8h, #6
        str             q0, [\ldst]
        add             \dst, \dst, #16
.endif
        add             \src, \src, #16
        b.gt            161b
        ret

        .align JUMP_ALIGN
80:     // V - 8xN
        ldr             d16, [\src]
        ldr             d17, [\src, \s_strd]
        add             \src, \src, \s_strd, lsl #1
        ldr             d18, [\src]
        ldr             d19, [\src, \s_strd]
        add             \src, \src, \s_strd, lsl #1

        ldr             d20, [\src]
        ldr             d21, [\src, \s_strd]
        add             \src, \src, \s_strd, lsl #1
        ldr             d22, [\src]
        ldr             d23, [\src, \s_strd]
        add             \src, \src, \s_strd, lsl #1
        subs            \h, \h, #2              // for prep: sub is enough

        zip1            v0.16b, v16.16b, v17.16b
        zip1            v2.16b, v18.16b, v19.16b
        zip1            v18.16b, v20.16b, v21.16b
        zip1            v24.16b, v22.16b, v23.16b

        zip1            v16.8h, v0.8h, v2.8h
        zip2            v19.8h, v0.8h, v2.8h
        zip1            v17.8h, v18.8h, v24.8h
        zip2            v20.8h, v18.8h, v24.8h
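        // As in the 16xN case: v16/v19 hold rows 0-3 and v17/v20 rows 4-7 as
        // transposed 4x4 blocks (columns 0-3 and 4-7); the loop below then
        // produces two output rows per iteration.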
.ifc \isa, neon_dotprod
        sub             v16.16b, v16.16b, v5.16b
        sub             v19.16b, v19.16b, v5.16b
        sub             v17.16b, v17.16b, v5.16b
        sub             v20.16b, v20.16b, v5.16b
.endif
.ifc \type, put
        b.eq            82f
.endif
        .align LOOP_ALIGN
8:
.ifc \isa, neon_i8mm
        ldr             d18, [\src]
        movi            v0.4s, #0
        movi            v1.4s, #0
        ldr             d24, [\src, \s_strd]
        add             \src, \src, \s_strd, lsl #1
        movi            v2.4s, #0
        movi            v3.4s, #0
        mov             v21.8b, v18.8b
        mov             v27.8b, v24.8b
.else   // neon_dotprod
        ldr             d21, [\src]
        ldr             d27, [\src, \s_strd]
        add             \src, \src, \s_strd, lsl #1
        mov             v0.16b, v4.16b
        mov             v1.16b, v4.16b
        mov             v2.16b, v4.16b
        mov             v3.16b, v4.16b
        sub             v18.16b, v21.16b, v5.16b
        sub             v21.16b, v21.16b, v5.16b
        sub             v24.16b, v27.16b, v5.16b
        sub             v27.16b, v27.16b, v5.16b
.endif
        tbl             v22.16b, {v16.16b, v17.16b}, v6.16b
        tbl             v25.16b, {v19.16b, v20.16b}, v6.16b
        tbl             v23.16b, {v17.16b, v18.16b}, v28.16b
        tbl             v26.16b, {v20.16b, v21.16b}, v29.16b

        \dot            v0.4s, v16.16b, v7.4b[0]
        \dot            v0.4s, v17.16b, v7.4b[1]
        \dot            v1.4s, v19.16b, v7.4b[0]
        \dot            v1.4s, v20.16b, v7.4b[1]

        tbl             v16.16b, {v22.16b, v23.16b}, v6.16b
        tbl             v19.16b, {v25.16b, v26.16b}, v6.16b
        tbl             v17.16b, {v23.16b, v24.16b}, v28.16b
        tbl             v20.16b, {v26.16b, v27.16b}, v29.16b

        \dot            v2.4s, v22.16b, v7.4b[0]
        \dot            v2.4s, v23.16b, v7.4b[1]
        \dot            v3.4s, v25.16b, v7.4b[0]
        \dot            v3.4s, v26.16b, v7.4b[1]

        subs            \h, \h, #2
        uzp1            v0.8h, v0.8h, v1.8h
        uzp1            v2.8h, v2.8h, v3.8h
.ifc \type, prep
    .ifc \isa, neon_i8mm
        srshr           v0.8h, v0.8h, #2
        srshr           v1.8h, v2.8h, #2
    .else
        sshr            v0.8h, v0.8h, #2
        sshr            v1.8h, v2.8h, #2
    .endif
        stp             q0, q1, [\dst], #32
.else   // put
        sqrshrun        v0.8b, v0.8h, #6
        sqrshrun        v1.8b, v2.8h, #6
        str             d0, [\dst]
        str             d1, [\dst, \d_strd]
        add             \dst, \dst, \d_strd, lsl #1
.endif
        b.gt            8b

.ifc \type, put
        .align JUMP_ALIGN
82:
.endif
.ifc \isa, neon_i8mm
        ldr             d18, [\src]
        movi            v0.4s, #0
        movi            v1.4s, #0
        movi            v2.4s, #0
        movi            v3.4s, #0
        mov             v21.8b, v18.8b
.else   // neon_dotprod
        ldr             d21, [\src]
        mov             v0.16b, v4.16b
        mov             v1.16b, v4.16b
        mov             v2.16b, v4.16b
        mov             v3.16b, v4.16b
        sub             v18.16b, v21.16b, v5.16b
        sub             v21.16b, v21.16b, v5.16b
.endif
        tbl             v22.16b, {v16.16b, v17.16b}, v6.16b
        tbl             v25.16b, {v19.16b, v20.16b}, v6.16b
        tbl             v23.16b, {v17.16b, v18.16b}, v28.16b
        tbl             v26.16b, {v20.16b, v21.16b}, v29.16b

        \dot            v0.4s, v16.16b, v7.4b[0]
        \dot            v0.4s, v17.16b, v7.4b[1]
        \dot            v1.4s, v19.16b, v7.4b[0]
        \dot            v1.4s, v20.16b, v7.4b[1]

        \dot            v2.4s, v22.16b, v7.4b[0]
        \dot            v2.4s, v23.16b, v7.4b[1]
        \dot            v3.4s, v25.16b, v7.4b[0]
        \dot            v3.4s, v26.16b, v7.4b[1]

        uzp1            v0.8h, v0.8h, v1.8h
        uzp1            v2.8h, v2.8h, v3.8h
.ifc \type, prep
    .ifc \isa, neon_i8mm
        srshr           v0.8h, v0.8h, #2
        srshr           v1.8h, v2.8h, #2
    .else
        sshr            v0.8h, v0.8h, #2
        sshr            v1.8h, v2.8h, #2
    .endif
        stp             q0, q1, [\dst]
.else   // put
        sqrshrun        v0.8b, v0.8h, #6
        sqrshrun        v1.8b, v2.8h, #6
        str             d0, [\dst]
        str             d1, [\dst, \d_strd]
.endif
        ret
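        // V - narrow columns: for 4 pixel wide blocks (and 2 wide for put)
        // the whole 8-row history fits in two registers, v16 holding rows
        // 0-3 and v17 rows 4-7 as transposed blocks, so the paths below only
        // have to shift and refill those two.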
        .align JUMP_ALIGN
40:     // V - 4xN or 2xN (put only)
.ifc \type, put
        cmp             \w, #2
        b.eq            20f
.endif
        ldr             s16, [\src]
        ldr             s17, [\src, \s_strd]
        add             \src, \src, \s_strd, lsl #1
        ldr             s18, [\src]
        ldr             s19, [\src, \s_strd]
        add             \src, \src, \s_strd, lsl #1

        ldr             s20, [\src]
        ldr             s21, [\src, \s_strd]
        add             \src, \src, \s_strd, lsl #1
        ldr             s22, [\src]
        ldr             s23, [\src, \s_strd]
        add             \src, \src, \s_strd, lsl #1
        subs            \h, \h, #2              // for prep: sub is enough

        zip1            v0.8b, v16.8b, v17.8b
        zip1            v2.8b, v18.8b, v19.8b
        zip1            v18.8b, v20.8b, v21.8b
        zip1            v24.8b, v22.8b, v23.8b

        zip1            v16.8h, v0.8h, v2.8h
        zip1            v17.8h, v18.8h, v24.8h
.ifc \isa, neon_dotprod
        sub             v16.16b, v16.16b, v5.16b
        sub             v17.16b, v17.16b, v5.16b
.endif
.ifc \type, put
        b.eq            42f
.endif
        .align LOOP_ALIGN
4:
        ldr             s18, [\src]
        ldr             s21, [\src, \s_strd]
        add             \src, \src, \s_strd, lsl #1
.ifc \isa, neon_i8mm
        movi            v0.4s, #0
        movi            v1.4s, #0
.else   // neon_dotprod
        mov             v0.16b, v4.16b
        mov             v1.16b, v4.16b
        sub             v18.16b, v18.16b, v5.16b
        sub             v21.16b, v21.16b, v5.16b
.endif
        tbl             v19.16b, {v16.16b, v17.16b}, v6.16b
        tbl             v20.16b, {v17.16b, v18.16b}, v28.16b

        \dot            v0.4s, v16.16b, v7.4b[0]
        \dot            v0.4s, v17.16b, v7.4b[1]

        tbl             v16.16b, {v19.16b, v20.16b}, v6.16b
        tbl             v17.16b, {v20.16b, v21.16b}, v28.16b

        \dot            v1.4s, v19.16b, v7.4b[0]
        \dot            v1.4s, v20.16b, v7.4b[1]
.ifc \type, prep
        subs            \h, \h, #2
    .ifc \isa, neon_i8mm
        rshrn           v0.4h, v0.4s, #2
        rshrn2          v0.8h, v1.4s, #2
    .else
        shrn            v0.4h, v0.4s, #2
        shrn2           v0.8h, v1.4s, #2
    .endif
        str             q0, [\dst], #16
.else
        uzp1            v0.8h, v0.8h, v1.8h
        sqrshrun        v0.8b, v0.8h, #6
        subs            \h, \h, #2
        fmov            x8, d0
        lsr             x9, x8, #32
        str             w8, [\dst]
        str             w9, [\dst, \d_strd]
        add             \dst, \dst, \d_strd, lsl #1
.endif
        b.gt            4b

.ifc \type, put
        .align JUMP_ALIGN
42:
.endif
        ldr             s18, [\src]
.ifc \isa, neon_i8mm
        movi            v0.4s, #0
        movi            v1.4s, #0
.else   // neon_dotprod
        mov             v0.16b, v4.16b
        mov             v1.16b, v4.16b
        sub             v18.16b, v18.16b, v5.16b
.endif
        tbl             v19.16b, {v16.16b, v17.16b}, v6.16b
        tbl             v20.16b, {v17.16b, v18.16b}, v28.16b

        \dot            v0.4s, v16.16b, v7.4b[0]
        \dot            v0.4s, v17.16b, v7.4b[1]

        \dot            v1.4s, v19.16b, v7.4b[0]
        \dot            v1.4s, v20.16b, v7.4b[1]
.ifc \type, prep
    .ifc \isa, neon_i8mm
        rshrn           v0.4h, v0.4s, #2
        rshrn2          v0.8h, v1.4s, #2
    .else
        shrn            v0.4h, v0.4s, #2
        shrn2           v0.8h, v1.4s, #2
    .endif
        str             q0, [\dst]
.else
        uzp1            v0.8h, v0.8h, v1.8h
        sqrshrun        v0.8b, v0.8h, #6
        fmov            x8, d0
        lsr             x9, x8, #32
        str             w8, [\dst]
        str             w9, [\dst, \d_strd]
.endif
        ret

.ifc \type, put
        .align JUMP_ALIGN
20:     // V - 2xN
        ldr             h16, [\src]
        ldr             h17, [\src, \s_strd]
        add             \src, \src, \s_strd, lsl #1
        ldr             h18, [\src]
        ldr             h19, [\src, \s_strd]
        add             \src, \src, \s_strd, lsl #1

        ldr             h20, [\src]
        ldr             h21, [\src, \s_strd]
        add             \src, \src, \s_strd, lsl #1
        ldr             h22, [\src]
        ldr             h23, [\src, \s_strd]
        add             \src, \src, \s_strd, lsl #1
        subs            \h, \h, #2

        zip1            v0.8b, v16.8b, v17.8b
        zip1            v2.8b, v18.8b, v19.8b
        zip1            v18.8b, v20.8b, v21.8b
        zip1            v24.8b, v22.8b, v23.8b

        zip1            v16.4h, v0.4h, v2.4h
        zip1            v17.4h, v18.4h, v24.4h
    .ifc \isa, neon_dotprod
        sub             v16.8b, v16.8b, v5.8b
        sub             v17.8b, v17.8b, v5.8b
    .endif
        b.eq            22f

        .align LOOP_ALIGN
2:
        ldr             h18, [\src]
        ldr             h21, [\src, \s_strd]
        add             \src, \src, \s_strd, lsl #1
    .ifc \isa, neon_i8mm
        movi            v0.4s, #0
        movi            v1.4s, #0
    .else   // neon_dotprod
        mov             v0.16b, v4.16b
        mov             v1.16b, v4.16b
        sub             v18.8b, v18.8b, v5.8b
        sub             v21.8b, v21.8b, v5.8b
    .endif
        tbl             v19.16b, {v16.16b, v17.16b}, v6.16b
        tbl             v20.16b, {v17.16b, v18.16b}, v28.16b

        \dot            v0.4s, v16.16b, v7.4b[0]
        \dot            v0.4s, v17.16b, v7.4b[1]

        tbl             v16.16b, {v19.16b, v20.16b}, v6.16b
        tbl             v17.16b, {v20.16b, v21.16b}, v28.16b

        \dot            v1.4s, v19.16b, v7.4b[0]
        \dot            v1.4s, v20.16b, v7.4b[1]

        uzp1            v0.8h, v0.8h, v1.8h
        sqrshrun        v0.8b, v0.8h, #6

        subs            \h, \h, #2
        fmov            x8, d0
        lsr             x9, x8, #32
        strh            w8, [\dst]
        strh            w9, [\dst, \d_strd]
        add             \dst, \dst, \d_strd, lsl #1
        b.gt            2b

        .align JUMP_ALIGN
22:
        ldr             h18, [\src]
    .ifc \isa, neon_i8mm
        movi            v0.4s, #0
        movi            v1.4s, #0
    .else   // neon_dotprod
        mov             v0.16b, v4.16b
        mov             v1.16b, v4.16b
        sub             v18.8b, v18.8b, v5.8b
    .endif
        tbl             v19.16b, {v16.16b, v17.16b}, v6.16b
        tbl             v20.16b, {v17.16b, v18.16b}, v28.16b

        \dot            v0.4s, v16.16b, v7.4b[0]
        \dot            v0.4s, v17.16b, v7.4b[1]

        \dot            v1.4s, v19.16b, v7.4b[0]
        \dot            v1.4s, v20.16b, v7.4b[1]

        uzp1            v0.8h, v0.8h, v1.8h
        sqrshrun        v0.8b, v0.8h, #6

        fmov            x8, d0
        lsr             x9, x8, #32
        strh            w8, [\dst]
        strh            w9, [\dst, \d_strd]
        ret
.endif
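        // Filter index selection: REGULAR1/SMOOTH1/SHARP1 pack two 7-bit
        // filter-table indices into one constant. Multiplying mx (or my) by
        // 0x4081 replicates it into bits 0, 7 and 14, so a single MADD adds
        // the subpel offset to both packed indices at once; UBFX/AND then
        // extract the two candidates and CSEL keeps the low field for small
        // (<= 4) block dimensions and the high field otherwise. The chosen
        // index is scaled by 8 (filters are 8 bytes each) when addressing
        // mc_subpel_filters, both here and in the V-only path above.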
        .align JUMP_ALIGN
L(\type\()_8tap_h_hv_\isa):
        madd            \mx, \mx, w11, w9
        madd            w14, \my, w11, w10      // for HV
        ldr             q28, L(h_tbl_neon_dotprod)
.ifc \isa, neon_dotprod
        mov             w13, #0x2002            // FILTER_WEIGHT * 128 + rounding
        dup             v27.4s, w13             // put H overrides this
.endif
        sub             \src, \src, #4          // src - 4
        ubfx            w9, \mx, #7, #7
        and             \mx, \mx, #0x7F
        ubfx            w11, w14, #7, #7        // for HV
        and             w14, w14, #0x7F         // for HV
        cmp             \w, #4
        csel            \mx, \mx, w9, le
        add             \xmx, x12, \xmx, lsl #3 // subpel H filter address
.ifc \isa, neon_dotprod
        movi            v24.16b, #128
.endif
        cbz             \my, L(\type\()_8tap_h_\isa)

        // HV cases
        cmp             \h, #4
        csel            w14, w14, w11, le
        sub             \src, \src, \s_strd, lsl #1     // src - s_strd * 2 - 4
        add             \xmy, x12, x14, lsl #3          // subpel V filter address
        mov             x15, x30
        ldr             d7, [\xmy]
.ifc \type, put
        ldr             q25, L(hv_tbl_neon_dotprod)
.endif
        sxtl            v7.8h, v7.8b
        cmp             w10, SHARP1
        b.ne            L(\type\()_6tap_hv_\isa)        // vertical != SHARP1

        // HV 8-tap cases
        sub             \src, \src, \s_strd             // src - s_strd * 3 - 4
        cmp             \w, #4
        b.eq            40f
.ifc \type, put
        b.lt            20f
.endif

        // .align JUMP_ALIGN // fallthrough
80:     // HV8 - 8xN+
        ldr             q29, L(h_tbl_neon_dotprod) + 16
        ldr             q30, L(h_tbl_neon_dotprod) + 32
        ldr             d26, [\xmx]
.ifc \type, prep
        add             \wd_strd, \w, \w
.endif
        .align LOOP_ALIGN
81:
        mov             \lsrc, \src
        mov             \ldst, \dst
        mov             w8, \h
.ifc \isa, neon_i8mm
        bl              L(\type\()_hv_filter8_\isa)
        srshr           v16.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        srshr           v17.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        srshr           v18.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        srshr           v19.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        srshr           v20.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        srshr           v21.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        srshr           v22.8h, v22.8h, #2
.else
        bl              L(\type\()_hv_filter8_\isa)
        sshr            v16.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        sshr            v17.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        sshr            v18.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        sshr            v19.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        sshr            v20.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        sshr            v21.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        sshr            v22.8h, v22.8h, #2
.endif
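        // Main HV loop: v16-v22 hold seven rows of horizontally filtered,
        // >> 2 shifted 16-bit samples. Each iteration filters one new source
        // row horizontally with the dot products (into v5/v6), appends it as
        // the eighth tap, and evaluates the vertical 8-tap filter as
        // widening 16-bit multiply-accumulates into v0/v1.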
        .align LOOP_ALIGN
8:
        ldr             q23, [\lsrc]
        add             \lsrc, \lsrc, \s_strd

        smull           v0.4s, v16.4h, v7.h[0]
        smull2          v1.4s, v16.8h, v7.h[0]
        mov             v16.16b, v17.16b
.ifc \isa, neon_i8mm
        movi            v5.4s, #0
        movi            v6.4s, #0
        tbl             v2.16b, {v23.16b}, v28.16b
        tbl             v3.16b, {v23.16b}, v29.16b
.else   // neon_dotprod
        sub             v23.16b, v23.16b, v24.16b
        mov             v5.16b, v27.16b
        mov             v6.16b, v27.16b
.endif
        smlal           v0.4s, v17.4h, v7.h[1]
        smlal2          v1.4s, v17.8h, v7.h[1]
.ifc \isa, neon_i8mm
        tbl             v4.16b, {v23.16b}, v30.16b
        mov             v17.16b, v18.16b
.else   // neon_dotprod
        mov             v17.16b, v18.16b
        tbl             v2.16b, {v23.16b}, v28.16b
        tbl             v3.16b, {v23.16b}, v29.16b
        tbl             v4.16b, {v23.16b}, v30.16b
.endif
        smlal           v0.4s, v18.4h, v7.h[2]
        smlal2          v1.4s, v18.8h, v7.h[2]
        mov             v18.16b, v19.16b

        \dot            v5.4s, v2.16b, v26.4b[0]
        \dot            v6.4s, v3.16b, v26.4b[0]

        smlal           v0.4s, v19.4h, v7.h[3]
        smlal2          v1.4s, v19.8h, v7.h[3]
        mov             v19.16b, v20.16b

        \dot            v5.4s, v3.16b, v26.4b[1]
        \dot            v6.4s, v4.16b, v26.4b[1]

        smlal           v0.4s, v20.4h, v7.h[4]
        smlal2          v1.4s, v20.8h, v7.h[4]
        mov             v20.16b, v21.16b

        smlal           v0.4s, v21.4h, v7.h[5]
        smlal2          v1.4s, v21.8h, v7.h[5]
.ifc \type, prep
        uzp1            v23.8h, v5.8h, v6.8h
.endif
        mov             v21.16b, v22.16b
        smlal           v0.4s, v22.4h, v7.h[6]
        smlal2          v1.4s, v22.8h, v7.h[6]
.ifc \isa, neon_i8mm
        subs            w8, w8, #1
.endif
.ifc \type, prep
    .ifc \isa, neon_i8mm
        srshr           v22.8h, v23.8h, #2
    .else
        sshr            v22.8h, v23.8h, #2
    .endif
        smlal           v0.4s, v22.4h, v7.h[7]
        smlal2          v1.4s, v22.8h, v7.h[7]
        rshrn           v0.4h, v0.4s, #6
        rshrn2          v0.8h, v1.4s, #6
.else   // put
    .ifc \isa, neon_i8mm
        rshrn           v22.4h, v5.4s, #2
        rshrn2          v22.8h, v6.4s, #2
    .else
        shrn            v22.4h, v5.4s, #2
        shrn2           v22.8h, v6.4s, #2
    .endif
        smlal           v0.4s, v22.4h, v7.h[7]
        smlal2          v1.4s, v22.8h, v7.h[7]
        tbl             v0.16b, {v0.16b, v1.16b}, v25.16b
        sqrshrun        v0.8b, v0.8h, #2
.endif
.ifc \isa, neon_dotprod
        subs            w8, w8, #1
.endif
.ifc \type, prep
        st1             {v0.8h}, [\ldst], \d_strd
        b.gt            8b
        add             \dst, \dst, #16
.else
        st1             {v0.8b}, [\ldst], \d_strd
        b.gt            8b
        add             \dst, \dst, #8
.endif
        add             \src, \src, #8
        subs            \w, \w, #8
        b.gt            81b
        ret             x15
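        // Narrow HV8 blocks filter 4 pixels per row with a single dot
        // product (hv_filter4, using the middle four H taps loaded from
        // [\xmx, #2]) while keeping the full 8-tap vertical filter.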
        .align JUMP_ALIGN
40:     // HV8 - 4xN
        ldr             s26, [\xmx, #2]
        add             \src, \src, #2

        bl              L(\type\()_hv_filter4_\isa)
        shrn            v16.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v17.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v18.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v19.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v20.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v21.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v22.4h, v22.4s, #2

        .align LOOP_ALIGN
4:
        ld1             {v4.8b}, [\src], \s_strd

        smull           v0.4s, v16.4h, v7.h[0]
        smlal           v0.4s, v17.4h, v7.h[1]
        mov             v16.16b, v17.16b
        mov             v17.16b, v18.16b
.ifc \isa, neon_dotprod
        sub             v4.16b, v4.16b, v24.16b
.endif
        smlal           v0.4s, v18.4h, v7.h[2]
        smlal           v0.4s, v19.4h, v7.h[3]
        tbl             v2.16b, {v4.16b}, v28.16b
.ifc \isa, neon_i8mm
        movi            v5.4s, #0
.else
        mov             v5.16b, v27.16b
.endif
        mov             v18.16b, v19.16b
        mov             v19.16b, v20.16b

        smlal           v0.4s, v20.4h, v7.h[4]
        smlal           v0.4s, v21.4h, v7.h[5]

        \dot            v5.4s, v2.16b, v26.4b[0]
        mov             v20.16b, v21.16b
        mov             v21.16b, v22.16b
        smlal           v0.4s, v22.4h, v7.h[6]
.ifc \isa, neon_i8mm
        rshrn           v22.4h, v5.4s, #2
.else
        shrn            v22.4h, v5.4s, #2
.endif
        smlal           v0.4s, v22.4h, v7.h[7]
.ifc \type, prep
        rshrn           v0.4h, v0.4s, #6
        str             d0, [\dst], #8
        subs            \h, \h, #1
.else
        subs            \h, \h, #1
        tbl             v0.8b, {v0.16b}, v25.8b
        sqrshrun        v0.8b, v0.8h, #2
        str             s0, [\dst]
        add             \dst, \dst, \d_strd
.endif
        b.gt            4b
        ret             x15

.ifc \type, put
        .align JUMP_ALIGN
20:     // HV8 - 2xN
        ldr             s26, [\xmx, #2]
        add             \src, \src, #2

        bl              L(\type\()_hv_filter4_\isa)
        shrn            v16.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v17.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v18.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v19.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v20.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v21.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v22.4h, v22.4s, #2

        .align LOOP_ALIGN
2:
        ld1             {v4.8b}, [\src], \s_strd

        smull           v0.4s, v16.4h, v7.h[0]
        smlal           v0.4s, v17.4h, v7.h[1]
        mov             v16.16b, v17.16b
        mov             v17.16b, v18.16b
    .ifc \isa, neon_dotprod
        sub             v4.16b, v4.16b, v24.16b
    .endif
        smlal           v0.4s, v18.4h, v7.h[2]
        smlal           v0.4s, v19.4h, v7.h[3]
        tbl             v2.16b, {v4.16b}, v28.16b
    .ifc \isa, neon_i8mm
        movi            v5.4s, #0
    .else
        mov             v5.16b, v27.16b
    .endif
        mov             v18.16b, v19.16b
        mov             v19.16b, v20.16b

        smlal           v0.4s, v20.4h, v7.h[4]
        smlal           v0.4s, v21.4h, v7.h[5]

        \dot            v5.4s, v2.16b, v26.4b[0]
        mov             v20.16b, v21.16b
        mov             v21.16b, v22.16b

        smlal           v0.4s, v22.4h, v7.h[6]
    .ifc \isa, neon_i8mm
        rshrn           v22.4h, v5.4s, #2
    .else
        shrn            v22.4h, v5.4s, #2
    .endif
        smlal           v0.4s, v22.4h, v7.h[7]
        subs            \h, \h, #1

        tbl             v0.8b, {v0.16b}, v25.8b
        sqrshrun        v0.8b, v0.8h, #2

        str             h0, [\dst]
        add             \dst, \dst, \d_strd
        b.gt            2b
        ret             x15
.endif
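        // 6-tap HV variant: for non-SHARP vertical filters the first and
        // last taps are zero, so only v7.h[1]-v7.h[6] are applied and a
        // five-row history (v16-v20) of filtered samples is enough.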
        .align JUMP_ALIGN
L(\type\()_6tap_hv_\isa):
        cmp             \w, #4
        b.eq            40f
.ifc \type, put
        b.lt            20f
.endif

        // .align JUMP_ALIGN // fallthrough
80:     // HV6 - 8xN+
        ldr             q29, L(h_tbl_neon_dotprod) + 16
        ldr             q30, L(h_tbl_neon_dotprod) + 32
        ldr             d26, [\xmx]
.ifc \type, prep
        add             \wd_strd, \w, \w
.endif

        .align LOOP_ALIGN
81:
        mov             \lsrc, \src
        mov             \ldst, \dst
        mov             w8, \h
.ifc \isa, neon_i8mm
        bl              L(\type\()_hv_filter8_\isa)
        srshr           v16.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        srshr           v17.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        srshr           v18.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        srshr           v19.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        srshr           v20.8h, v22.8h, #2
.else
        bl              L(\type\()_hv_filter8_\isa)
        sshr            v16.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        sshr            v17.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        sshr            v18.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        sshr            v19.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        sshr            v20.8h, v22.8h, #2
.endif
        .align LOOP_ALIGN
8:
        ldr             q23, [\xmy]
        add             \xmy, \xmy, \s_strd

        smull           v0.4s, v16.4h, v7.h[1]
        smull2          v1.4s, v16.8h, v7.h[1]
.ifc \isa, neon_dotprod
        sub             v23.16b, v23.16b, v24.16b
.endif
        mov             v16.16b, v17.16b
.ifc \isa, neon_i8mm
        movi            v5.4s, #0
        movi            v6.4s, #0
.else
        mov             v5.16b, v27.16b
        mov             v6.16b, v27.16b
.endif
        tbl             v2.16b, {v23.16b}, v28.16b
        tbl             v3.16b, {v23.16b}, v29.16b

        smlal           v0.4s, v17.4h, v7.h[2]
        smlal2          v1.4s, v17.8h, v7.h[2]
        tbl             v4.16b, {v23.16b}, v30.16b
        mov             v17.16b, v18.16b

        \dot            v5.4s, v2.16b, v26.4b[0]
        \dot            v6.4s, v3.16b, v26.4b[0]

        smlal           v0.4s, v18.4h, v7.h[3]
        smlal2          v1.4s, v18.8h, v7.h[3]
        mov             v18.16b, v19.16b

        \dot            v5.4s, v3.16b, v26.4b[1]
        \dot            v6.4s, v4.16b, v26.4b[1]

        smlal           v0.4s, v19.4h, v7.h[4]
        smlal2          v1.4s, v19.8h, v7.h[4]
        mov             v19.16b, v20.16b
        uzp1            v23.8h, v5.8h, v6.8h

        smlal           v0.4s, v20.4h, v7.h[5]
        smlal2          v1.4s, v20.8h, v7.h[5]
.ifc \isa, neon_i8mm
        srshr           v20.8h, v23.8h, #2
.else
        sshr            v20.8h, v23.8h, #2
.endif
        subs            w8, w8, #1
        smlal           v0.4s, v20.4h, v7.h[6]
        smlal2          v1.4s, v20.8h, v7.h[6]
.ifc \type, prep
        rshrn           v0.4h, v0.4s, #6
        rshrn2          v0.8h, v1.4s, #6
        st1             {v0.8h}, [\ldst], \d_strd
        b.gt            8b
        add             \dst, \dst, #16
.else
        tbl             v0.16b, {v0.16b, v1.16b}, v25.16b
        sqrshrun        v0.8b, v0.8h, #2
        st1             {v0.8b}, [\ldst], \d_strd
        b.gt            8b
        add             \dst, \dst, #8
.endif
        add             \src, \src, #8
        subs            \w, \w, #8
        b.gt            81b
        ret             x15

        .align FUNC_ALIGN
L(\type\()_hv_filter8_\isa):
        ld1             {v4.16b}, [\lsrc], \s_strd
.ifc \isa, neon_i8mm
        movi            v22.4s, #0
        movi            v23.4s, #0
.else   // neon_dotprod
        sub             v4.16b, v4.16b, v24.16b
        mov             v22.16b, v27.16b
        mov             v23.16b, v27.16b
.endif
        tbl             v2.16b, {v4.16b}, v28.16b
        tbl             v3.16b, {v4.16b}, v29.16b
        tbl             v4.16b, {v4.16b}, v30.16b
        \dot            v22.4s, v2.16b, v26.4b[0]
        \dot            v23.4s, v3.16b, v26.4b[0]
        \dot            v22.4s, v3.16b, v26.4b[1]
        \dot            v23.4s, v4.16b, v26.4b[1]
        uzp1            v22.8h, v22.8h, v23.8h
        ret

        .align FUNC_ALIGN
L(\type\()_hv_filter4_\isa):
        ld1             {v4.8b}, [\src], \s_strd
.ifc \isa, neon_i8mm
        movi            v22.4s, #2
.else
        mov             v22.16b, v27.16b
        sub             v4.16b, v4.16b, v24.16b
.endif
        tbl             v2.16b, {v4.16b}, v28.16b
        \dot            v22.4s, v2.16b, v26.4b[0]
        ret

        .align JUMP_ALIGN
40:     // HV6 - 4xN
        ldr             s26, [\xmx, #2]
        add             \src, \src, #2

        bl              L(\type\()_hv_filter4_\isa)
        shrn            v16.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v17.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v18.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v19.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v20.4h, v22.4s, #2

        .align LOOP_ALIGN
4:
        ld1             {v4.8b}, [\src], \s_strd

        smull           v0.4s, v16.4h, v7.h[1]
        smlal           v0.4s, v17.4h, v7.h[2]
.ifc \isa, neon_dotprod
        sub             v4.16b, v4.16b, v24.16b
.endif
        mov             v16.16b, v17.16b
        mov             v17.16b, v18.16b

        smlal           v0.4s, v18.4h, v7.h[3]
        smlal           v0.4s, v19.4h, v7.h[4]
        tbl             v2.16b, {v4.16b}, v28.16b
.ifc \isa, neon_i8mm
        movi            v5.4s, #0
.else
        mov             v5.16b, v27.16b
.endif
        mov             v18.16b, v19.16b
        mov             v19.16b, v20.16b
        \dot            v5.4s, v2.16b, v26.4b[0]

        smlal           v0.4s, v20.4h, v7.h[5]
.ifc \isa, neon_i8mm
        rshrn           v20.4h, v5.4s, #2
.else
        shrn            v20.4h, v5.4s, #2
.endif
        subs            \h, \h, #1
        smlal           v0.4s, v20.4h, v7.h[6]
.ifc \type, prep
        rshrn           v0.4h, v0.4s, #6
        str             d0, [\dst], #8
.else
        tbl             v0.8b, {v0.16b}, v25.8b
        sqrshrun        v0.8b, v0.8h, #2
        str             s0, [\dst]
        add             \dst, \dst, \d_strd
.endif
        b.gt            4b
        ret             x15

.ifc \type, put
        .align JUMP_ALIGN
20:     // HV6 - 2xN
        ldr             s26, [\xmx, #2]
        add             \src, \src, #2

        bl              L(\type\()_hv_filter4_\isa)
        shrn            v16.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v17.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v18.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v19.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v20.4h, v22.4s, #2

        .align LOOP_ALIGN
2:
        ld1             {v4.8b}, [\src], \s_strd

        smull           v0.4s, v16.4h, v7.h[1]
        smlal           v0.4s, v17.4h, v7.h[2]
    .ifc \isa, neon_dotprod
        sub             v4.16b, v4.16b, v24.16b
    .endif
        mov             v16.16b, v17.16b
        mov             v17.16b, v18.16b

        smlal           v0.4s, v18.4h, v7.h[3]
        smlal           v0.4s, v19.4h, v7.h[4]
        tbl             v2.16b, {v4.16b}, v28.16b
    .ifc \isa, neon_i8mm
        movi            v5.4s, #0
    .else
        mov             v5.16b, v27.16b
    .endif

        mov             v18.16b, v19.16b
        mov             v19.16b, v20.16b
        \dot            v5.4s, v2.16b, v26.4b[0]

        smlal           v0.4s, v20.4h, v7.h[5]
    .ifc \isa, neon_i8mm
        rshrn           v20.4h, v5.4s, #2
    .else
        shrn            v20.4h, v5.4s, #2
    .endif

        subs            \h, \h, #1
        smlal           v0.4s, v20.4h, v7.h[6]

        tbl             v0.8b, {v0.16b}, v25.8b
        sqrshrun        v0.8b, v0.8h, #2

        str             h0, [\dst]
        add             \dst, \dst, \d_strd
        b.gt            2b
        ret             x15
.endif
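        // Horizontal-only entry: w8 was set to clz(w) - 24 at the function
        // entry, mapping w = 128, 64, 32, 16, 8, 4, 2 to 0-6. The table at
        // the end of the macro stores "table - target" halfwords, so the
        // loaded entry is subtracted from the table address to form the
        // per-width branch target.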
        .align JUMP_ALIGN
L(\type\()_8tap_h_\isa):
        adr             x9, L(\type\()_8tap_h_\isa\()_tbl)
        ldrh            w8, [x9, x8, lsl #1]
.ifc \type, put
    .ifc \isa, neon_i8mm
        movi            v27.4s, #34             // special rounding
    .else
        mov             w10, #0x2022            // 64 * 128 + 34, bias and rounding for SDOT
        dup             v27.4s, w10
    .endif
.endif
        sub             x9, x9, x8
        br              x9

.ifc \type, put
        .align JUMP_ALIGN
20:     // H - 2xN
        AARCH64_VALID_JUMP_TARGET
        add             \src, \src, #2
        ldr             s26, [\xmx, #2]

        .align LOOP_ALIGN
2:
        ldr             d0, [\src]
        ldr             d1, [\src, \s_strd]
        add             \src, \src, \s_strd, lsl #1
    .ifc \isa, neon_dotprod
        sub             v0.8b, v0.8b, v24.8b
        sub             v1.8b, v1.8b, v24.8b
    .endif
        mov             v4.16b, v27.16b
        mov             v5.16b, v27.16b

        tbl             v2.16b, {v0.16b}, v28.16b
        tbl             v3.16b, {v1.16b}, v28.16b

        \dot            v4.4s, v2.16b, v26.4b[0]
        \dot            v5.4s, v3.16b, v26.4b[0]

        uzp1            v4.8h, v4.8h, v5.8h
        sqshrun         v4.8b, v4.8h, #6

        subs            \h, \h, #2
        fmov            x8, d4
        lsr             x9, x8, #32
        strh            w8, [\dst]
        strh            w9, [\dst, \d_strd]
        add             \dst, \dst, \d_strd, lsl #1
        b.gt            2b
        ret
.endif

        .align JUMP_ALIGN
40:     // H - 4xN
        AARCH64_VALID_JUMP_TARGET
        add             \src, \src, #2
        ldr             s26, [\xmx, #2]

        .align LOOP_ALIGN
4:
        ldr             d0, [\src]
        ldr             d1, [\src, \s_strd]
        add             \src, \src, \s_strd, lsl #1
.ifc \type\()_\isa, prep_neon_i8mm
        movi            v4.4s, #0
        movi            v5.4s, #0
.else
    .ifc \isa, neon_dotprod
        sub             v0.8b, v0.8b, v24.8b
        sub             v1.8b, v1.8b, v24.8b
    .endif
        mov             v4.16b, v27.16b
        mov             v5.16b, v27.16b
.endif
        tbl             v2.16b, {v0.16b}, v28.16b
        tbl             v3.16b, {v1.16b}, v28.16b

        \dot            v4.4s, v2.16b, v26.4b[0]
        \dot            v5.4s, v3.16b, v26.4b[0]
.ifc \type, prep
        subs            \h, \h, #2
    .ifc \isa, neon_i8mm
        uzp1            v4.8h, v4.8h, v5.8h
        srshr           v4.8h, v4.8h, #2
    .else
        shrn            v4.4h, v4.4s, #2
        shrn2           v4.8h, v5.4s, #2
    .endif
        str             q4, [\dst], #16
.else   // put
        uzp1            v4.8h, v4.8h, v5.8h
        sqshrun         v4.8b, v4.8h, #6
        subs            \h, \h, #2
        fmov            x8, d4
        lsr             x9, x8, #32
        str             w8, [\dst]
        str             w9, [\dst, \d_strd]
        add             \dst, \dst, \d_strd, lsl #1
.endif
        b.gt            4b
        ret

        .align JUMP_ALIGN
80:     // H - 8xN
        AARCH64_VALID_JUMP_TARGET
        ldr             q29, L(h_tbl_neon_dotprod) + 16
        ldr             q30, L(h_tbl_neon_dotprod) + 32
        ldr             d26, [\xmx]

        .align LOOP_ALIGN
8:
        ldr             q0, [\src]
        ldr             q16, [\src, \s_strd]
        add             \src, \src, \s_strd, lsl #1
.ifc \type\()_\isa, prep_neon_i8mm
        movi            v4.4s, #0
        movi            v5.4s, #0
        movi            v20.4s, #0
        movi            v21.4s, #0
.else
    .ifc \isa, neon_dotprod
        sub             v0.16b, v0.16b, v24.16b
        sub             v16.16b, v16.16b, v24.16b
    .endif
        mov             v4.16b, v27.16b
        mov             v5.16b, v27.16b
        mov             v20.16b, v27.16b
        mov             v21.16b, v27.16b
.endif
        tbl             v1.16b, {v0.16b}, v28.16b
        tbl             v2.16b, {v0.16b}, v29.16b
        tbl             v3.16b, {v0.16b}, v30.16b
        tbl             v17.16b, {v16.16b}, v28.16b
        tbl             v18.16b, {v16.16b}, v29.16b
        tbl             v19.16b, {v16.16b}, v30.16b

        \dot            v4.4s, v1.16b, v26.4b[0]
        \dot            v5.4s, v2.16b, v26.4b[0]
        \dot            v20.4s, v17.16b, v26.4b[0]
        \dot            v21.4s, v18.16b, v26.4b[0]
        \dot            v4.4s, v2.16b, v26.4b[1]
        \dot            v5.4s, v3.16b, v26.4b[1]
        \dot            v20.4s, v18.16b, v26.4b[1]
        \dot            v21.4s, v19.16b, v26.4b[1]

        uzp1            v4.8h, v4.8h, v5.8h
        uzp1            v20.8h, v20.8h, v21.8h
.ifc \type, prep
    .ifc \isa, neon_i8mm
        srshr           v4.8h, v4.8h, #2
        srshr           v20.8h, v20.8h, #2
    .else
        sshr            v4.8h, v4.8h, #2
        sshr            v20.8h, v20.8h, #2
    .endif
        subs            \h, \h, #2
        stp             q4, q20, [\dst], #32
.else   // put
        sqshrun         v4.8b, v4.8h, #6
        sqshrun         v20.8b, v20.8h, #6
        subs            \h, \h, #2
        str             d4, [\dst]
        str             d20, [\dst, \d_strd]
        add             \dst, \dst, \d_strd, lsl #1
.endif
        b.gt            8b
        ret

        .align JUMP_ALIGN
160:    // H - 16xN
        AARCH64_VALID_JUMP_TARGET
        ldr             q29, L(h_tbl_neon_dotprod) + 16
        ldr             q30, L(h_tbl_neon_dotprod) + 32
        ldr             d26, [\xmx]

        .align LOOP_ALIGN
16:
        ldr             q16, [\src]
        ldr             q17, [\src, #12]        // avoid 2 register TBL for small cores
        add             \src, \src, \s_strd
.ifc \type\()_\isa, prep_neon_i8mm
        movi            v6.4s, #0
        movi            v7.4s, #0
        movi            v22.4s, #0
        movi            v23.4s, #0
.else
    .ifc \isa, neon_dotprod
        sub             v16.16b, v16.16b, v24.16b
        sub             v17.16b, v17.16b, v24.16b
    .endif
        mov             v6.16b, v27.16b
        mov             v7.16b, v27.16b
        mov             v22.16b, v27.16b
        mov             v23.16b, v27.16b
.endif
        tbl             v0.16b, {v16.16b}, v28.16b
        tbl             v1.16b, {v16.16b}, v29.16b
        tbl             v2.16b, {v16.16b}, v30.16b
        tbl             v3.16b, {v17.16b}, v28.16b
        tbl             v4.16b, {v17.16b}, v29.16b

        \dot            v6.4s, v0.16b, v26.4b[0]
        \dot            v7.4s, v1.16b, v26.4b[0]
        \dot            v22.4s, v2.16b, v26.4b[0]
        \dot            v23.4s, v3.16b, v26.4b[0]
        \dot            v6.4s, v1.16b, v26.4b[1]
        \dot            v7.4s, v2.16b, v26.4b[1]
        \dot            v22.4s, v3.16b, v26.4b[1]
        \dot            v23.4s, v4.16b, v26.4b[1]

        uzp1            v6.8h, v6.8h, v7.8h
        uzp1            v22.8h, v22.8h, v23.8h
.ifc \type, prep
    .ifc \isa, neon_i8mm
        srshr           v6.8h, v6.8h, #2
        srshr           v22.8h, v22.8h, #2
    .else
        sshr            v6.8h, v6.8h, #2
        sshr            v22.8h, v22.8h, #2
    .endif
        subs            \h, \h, #1
        stp             q6, q22, [\dst], #32
.else   // put
        sqshrun         v6.8b, v6.8h, #6
        sqshrun2        v6.16b, v22.8h, #6
        subs            \h, \h, #1
        st1             {v6.16b}, [\dst], \d_strd
.endif
        b.gt            16b
        ret
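        // Wider horizontal blocks walk each row in 16-pixel steps; both
        // strides are reduced by w up front so that the stride add at the
        // end of a row already points at the next row. As above, the second,
        // overlapping load at [\src, #12] keeps every TBL to a single source
        // register.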
        .align JUMP_ALIGN
320:    // H - 32xN+
640:
1280:
        AARCH64_VALID_JUMP_TARGET
        ldr             q29, L(h_tbl_neon_dotprod) + 16
        ldr             q30, L(h_tbl_neon_dotprod) + 32
        ldr             d26, [\xmx]
.ifc \type, put
        sub             \d_strd, \d_strd, \w, uxtw
.endif
        sub             \s_strd, \s_strd, \w, uxtw
        mov             w8, \w

        .align LOOP_ALIGN
32:
        ldr             q16, [\src]
        ldr             q17, [\src, #12]        // avoid 2 register TBL for small cores
        add             \src, \src, #16
.ifc \type\()_\isa, prep_neon_i8mm
        movi            v6.4s, #0
        movi            v7.4s, #0
        movi            v22.4s, #0
        movi            v23.4s, #0
.else
    .ifc \isa, neon_dotprod
        sub             v16.16b, v16.16b, v24.16b
        sub             v17.16b, v17.16b, v24.16b
    .endif
        mov             v6.16b, v27.16b
        mov             v7.16b, v27.16b
        mov             v22.16b, v27.16b
        mov             v23.16b, v27.16b
.endif
        tbl             v0.16b, {v16.16b}, v28.16b
        tbl             v1.16b, {v16.16b}, v29.16b
        tbl             v2.16b, {v16.16b}, v30.16b
        tbl             v3.16b, {v17.16b}, v28.16b
        tbl             v4.16b, {v17.16b}, v29.16b

        \dot            v6.4s, v0.16b, v26.4b[0]
        \dot            v7.4s, v1.16b, v26.4b[0]
        \dot            v22.4s, v2.16b, v26.4b[0]
        \dot            v23.4s, v3.16b, v26.4b[0]
        \dot            v6.4s, v1.16b, v26.4b[1]
        \dot            v7.4s, v2.16b, v26.4b[1]
        \dot            v22.4s, v3.16b, v26.4b[1]
        \dot            v23.4s, v4.16b, v26.4b[1]

        uzp1            v6.8h, v6.8h, v7.8h
        uzp1            v22.8h, v22.8h, v23.8h
.ifc \type, prep
    .ifc \isa, neon_i8mm
        srshr           v6.8h, v6.8h, #2
        srshr           v22.8h, v22.8h, #2
    .else
        sshr            v6.8h, v6.8h, #2
        sshr            v22.8h, v22.8h, #2
    .endif
        subs            w8, w8, #16
        stp             q6, q22, [\dst], #32
.else   // put
        sqshrun         v6.8b, v6.8h, #6
        sqshrun2        v6.16b, v22.8h, #6
        subs            w8, w8, #16
        str             q6, [\dst], #16
.endif
        b.gt            32b

        add             \src, \src, \s_strd
.ifc \type, put
        add             \dst, \dst, \d_strd
.endif
        mov             w8, \w
        subs            \h, \h, #1
        b.gt            32b
        ret

L(\type\()_8tap_h_\isa\()_tbl):
        .hword (L(\type\()_8tap_h_\isa\()_tbl) - 1280b)
        .hword (L(\type\()_8tap_h_\isa\()_tbl) - 640b)
        .hword (L(\type\()_8tap_h_\isa\()_tbl) - 320b)
        .hword (L(\type\()_8tap_h_\isa\()_tbl) - 160b)
        .hword (L(\type\()_8tap_h_\isa\()_tbl) - 80b)
        .hword (L(\type\()_8tap_h_\isa\()_tbl) - 40b)
.ifc \type, put
        .hword (L(\type\()_8tap_h_\isa\()_tbl) - 20b)
        .hword 0
.endif
endfunc
.endm

// dst(x0), d_strd(x7), src(x1), s_strd(x2), w(w3), h(w4), mx(w5), my(w6)
// xmx(x5), xmy(x6), ldst(x5), lsrc(x6), wd_strd(w7)
filter_8tap_fn prep, sdot, neon_dotprod, x0, x7, x1, x2, w3, w4, w5, w6, x5, x6, x5, x6, w7

// dst(x0), d_strd(x1), src(x2), s_strd(x3), w(w4), h(w5), mx(w6), my(w7)
// xmx(x6), xmy(x7), ldst(x6), lsrc(x7), wd_strd(w1)
filter_8tap_fn put, sdot, neon_dotprod, x0, x1, x2, x3, w4, w5, w6, w7, x6, x7, x6, x7, w1

#if HAVE_I8MM
ENABLE_I8MM

// dst(x0), d_strd(x7), src(x1), s_strd(x2), w(w3), h(w4), mx(w5), my(w6)
// xmx(x5), xmy(x6), ldst(x5), lsrc(x6), wd_strd(w7)
filter_8tap_fn prep, usdot, neon_i8mm, x0, x7, x1, x2, w3, w4, w5, w6, x5, x6, x5, x6, w7

// dst(x0), d_strd(x1), src(x2), s_strd(x3), w(w4), h(w5), mx(w6), my(w7)
// xmx(x6), xmy(x7), ldst(x6), lsrc(x7), wd_strd(w1)
filter_8tap_fn put, usdot, neon_i8mm, x0, x1, x2, x3, w4, w5, w6, w7, x6, x7, x6, x7, w1

DISABLE_I8MM
#endif  // HAVE_I8MM

DISABLE_DOTPROD
#endif  // HAVE_DOTPROD