1/* 2 * Copyright © 2018, VideoLAN and dav1d authors 3 * Copyright © 2018, Janne Grunau 4 * Copyright © 2020, Martin Storsjo 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright notice, this 11 * list of conditions and the following disclaimer. 12 * 13 * 2. Redistributions in binary form must reproduce the above copyright notice, 14 * this list of conditions and the following disclaimer in the documentation 15 * and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 21 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 22 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 23 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 24 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 26 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 */ 28 29#include "src/arm/asm.S" 30#include "util.S" 31 32#define PREP_BIAS 8192 33 34.macro avg d0, d1, t0, t1, t2, t3 35 ld1 {\t0\().8h,\t1\().8h}, [x2], 32 36 ld1 {\t2\().8h,\t3\().8h}, [x3], 32 37 sqadd \t0\().8h, \t0\().8h, \t2\().8h 38 sqadd \t1\().8h, \t1\().8h, \t3\().8h 39 smax \t0\().8h, \t0\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits 40 smax \t1\().8h, \t1\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits 41 sqsub \t0\().8h, \t0\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits 42 sqsub \t1\().8h, \t1\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits 43 sshl \d0\().8h, \t0\().8h, v29.8h // -(intermediate_bits+1) 44 sshl \d1\().8h, \t1\().8h, v29.8h // -(intermediate_bits+1) 45.endm 46 47.macro w_avg d0, d1, t0, t1, t2, t3 48 ld1 {\t0\().8h,\t1\().8h}, [x2], 32 49 ld1 {\t2\().8h,\t3\().8h}, [x3], 32 50 // This difference requires a 17 bit range, and all bits are 51 // significant for the following multiplication. 
52 ssubl \d0\().4s, \t2\().4h, \t0\().4h 53 ssubl2 \t0\().4s, \t2\().8h, \t0\().8h 54 ssubl \d1\().4s, \t3\().4h, \t1\().4h 55 ssubl2 \t1\().4s, \t3\().8h, \t1\().8h 56 mul \d0\().4s, \d0\().4s, v27.4s 57 mul \t0\().4s, \t0\().4s, v27.4s 58 mul \d1\().4s, \d1\().4s, v27.4s 59 mul \t1\().4s, \t1\().4s, v27.4s 60 sshr \d0\().4s, \d0\().4s, #4 61 sshr \t0\().4s, \t0\().4s, #4 62 sshr \d1\().4s, \d1\().4s, #4 63 sshr \t1\().4s, \t1\().4s, #4 64 saddw \d0\().4s, \d0\().4s, \t2\().4h 65 saddw2 \t0\().4s, \t0\().4s, \t2\().8h 66 saddw \d1\().4s, \d1\().4s, \t3\().4h 67 saddw2 \t1\().4s, \t1\().4s, \t3\().8h 68 uzp1 \d0\().8h, \d0\().8h, \t0\().8h // Same as xtn, xtn2 69 uzp1 \d1\().8h, \d1\().8h, \t1\().8h // Ditto 70 srshl \d0\().8h, \d0\().8h, v29.8h // -intermediate_bits 71 srshl \d1\().8h, \d1\().8h, v29.8h // -intermediate_bits 72 add \d0\().8h, \d0\().8h, v28.8h // PREP_BIAS >> intermediate_bits 73 add \d1\().8h, \d1\().8h, v28.8h // PREP_BIAS >> intermediate_bits 74 smin \d0\().8h, \d0\().8h, v31.8h // bitdepth_max 75 smin \d1\().8h, \d1\().8h, v31.8h // bitdepth_max 76 smax \d0\().8h, \d0\().8h, v30.8h // 0 77 smax \d1\().8h, \d1\().8h, v30.8h // 0 78.endm 79 80.macro mask d0, d1, t0, t1, t2, t3 81 ld1 {v27.16b}, [x6], 16 82 ld1 {\t0\().8h,\t1\().8h}, [x2], 32 83 neg v27.16b, v27.16b 84 ld1 {\t2\().8h,\t3\().8h}, [x3], 32 85 sxtl v26.8h, v27.8b 86 sxtl2 v27.8h, v27.16b 87 sxtl v24.4s, v26.4h 88 sxtl2 v25.4s, v26.8h 89 sxtl v26.4s, v27.4h 90 sxtl2 v27.4s, v27.8h 91 ssubl \d0\().4s, \t2\().4h, \t0\().4h 92 ssubl2 \t0\().4s, \t2\().8h, \t0\().8h 93 ssubl \d1\().4s, \t3\().4h, \t1\().4h 94 ssubl2 \t1\().4s, \t3\().8h, \t1\().8h 95 mul \d0\().4s, \d0\().4s, v24.4s 96 mul \t0\().4s, \t0\().4s, v25.4s 97 mul \d1\().4s, \d1\().4s, v26.4s 98 mul \t1\().4s, \t1\().4s, v27.4s 99 sshr \d0\().4s, \d0\().4s, #6 100 sshr \t0\().4s, \t0\().4s, #6 101 sshr \d1\().4s, \d1\().4s, #6 102 sshr \t1\().4s, \t1\().4s, #6 103 saddw \d0\().4s, \d0\().4s, \t2\().4h 104 saddw2 \t0\().4s, \t0\().4s, \t2\().8h 105 saddw \d1\().4s, \d1\().4s, \t3\().4h 106 saddw2 \t1\().4s, \t1\().4s, \t3\().8h 107 uzp1 \d0\().8h, \d0\().8h, \t0\().8h // Same as xtn, xtn2 108 uzp1 \d1\().8h, \d1\().8h, \t1\().8h // Ditto 109 srshl \d0\().8h, \d0\().8h, v29.8h // -intermediate_bits 110 srshl \d1\().8h, \d1\().8h, v29.8h // -intermediate_bits 111 add \d0\().8h, \d0\().8h, v28.8h // PREP_BIAS >> intermediate_bits 112 add \d1\().8h, \d1\().8h, v28.8h // PREP_BIAS >> intermediate_bits 113 smin \d0\().8h, \d0\().8h, v31.8h // bitdepth_max 114 smin \d1\().8h, \d1\().8h, v31.8h // bitdepth_max 115 smax \d0\().8h, \d0\().8h, v30.8h // 0 116 smax \d1\().8h, \d1\().8h, v30.8h // 0 117.endm 118 119.macro bidir_fn type, bdmax 120function \type\()_16bpc_neon, export=1 121 clz w4, w4 122.ifnc \type, avg 123 dup v31.8h, \bdmax // bitdepth_max 124 movi v30.8h, #0 125.endif 126 clz w7, \bdmax 127 sub w7, w7, #18 // intermediate_bits = clz(bitdepth_max) - 18 128.ifc \type, avg 129 mov w9, #1 130 mov w8, #-2*PREP_BIAS 131 lsl w9, w9, w7 // 1 << intermediate_bits 132 add w7, w7, #1 133 sub w8, w8, w9 // -2*PREP_BIAS - 1 << intermediate_bits 134 neg w7, w7 // -(intermediate_bits+1) 135 dup v28.8h, w8 // -2*PREP_BIAS - 1 << intermediate_bits 136 dup v29.8h, w7 // -(intermediate_bits+1) 137.else 138 mov w8, #PREP_BIAS 139 lsr w8, w8, w7 // PREP_BIAS >> intermediate_bits 140 neg w7, w7 // -intermediate_bits 141 dup v28.8h, w8 // PREP_BIAS >> intermediate_bits 142 dup v29.8h, w7 // -intermediate_bits 143.endif 144.ifc \type, w_avg 145 dup v27.4s, w6 146 neg 
v27.4s, v27.4s 147.endif 148 adr x7, L(\type\()_tbl) 149 sub w4, w4, #24 150 \type v4, v5, v0, v1, v2, v3 151 ldrh w4, [x7, x4, lsl #1] 152 sub x7, x7, w4, uxtw 153 br x7 15440: 155 AARCH64_VALID_JUMP_TARGET 156 add x7, x0, x1 157 lsl x1, x1, #1 1584: 159 subs w5, w5, #4 160 st1 {v4.d}[0], [x0], x1 161 st1 {v4.d}[1], [x7], x1 162 st1 {v5.d}[0], [x0], x1 163 st1 {v5.d}[1], [x7], x1 164 b.le 0f 165 \type v4, v5, v0, v1, v2, v3 166 b 4b 16780: 168 AARCH64_VALID_JUMP_TARGET 169 add x7, x0, x1 170 lsl x1, x1, #1 1718: 172 st1 {v4.8h}, [x0], x1 173 subs w5, w5, #2 174 st1 {v5.8h}, [x7], x1 175 b.le 0f 176 \type v4, v5, v0, v1, v2, v3 177 b 8b 17816: 179 AARCH64_VALID_JUMP_TARGET 180 \type v6, v7, v0, v1, v2, v3 181 st1 {v4.8h, v5.8h}, [x0], x1 182 subs w5, w5, #2 183 st1 {v6.8h, v7.8h}, [x0], x1 184 b.le 0f 185 \type v4, v5, v0, v1, v2, v3 186 b 16b 18732: 188 AARCH64_VALID_JUMP_TARGET 189 \type v6, v7, v0, v1, v2, v3 190 subs w5, w5, #1 191 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1 192 b.le 0f 193 \type v4, v5, v0, v1, v2, v3 194 b 32b 195640: 196 AARCH64_VALID_JUMP_TARGET 197 add x7, x0, #64 19864: 199 \type v6, v7, v0, v1, v2, v3 200 \type v16, v17, v0, v1, v2, v3 201 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1 202 \type v18, v19, v0, v1, v2, v3 203 subs w5, w5, #1 204 st1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x1 205 b.le 0f 206 \type v4, v5, v0, v1, v2, v3 207 b 64b 2081280: 209 AARCH64_VALID_JUMP_TARGET 210 add x7, x0, #64 211 mov x8, #128 212 sub x1, x1, #128 213128: 214 \type v6, v7, v0, v1, v2, v3 215 \type v16, v17, v0, v1, v2, v3 216 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x8 217 \type v18, v19, v0, v1, v2, v3 218 st1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x8 219 \type v4, v5, v0, v1, v2, v3 220 \type v6, v7, v0, v1, v2, v3 221 \type v16, v17, v0, v1, v2, v3 222 subs w5, w5, #1 223 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1 224 \type v18, v19, v0, v1, v2, v3 225 st1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x1 226 b.le 0f 227 \type v4, v5, v0, v1, v2, v3 228 b 128b 2290: 230 ret 231L(\type\()_tbl): 232 .hword L(\type\()_tbl) - 1280b 233 .hword L(\type\()_tbl) - 640b 234 .hword L(\type\()_tbl) - 32b 235 .hword L(\type\()_tbl) - 16b 236 .hword L(\type\()_tbl) - 80b 237 .hword L(\type\()_tbl) - 40b 238endfunc 239.endm 240 241bidir_fn avg, w6 242bidir_fn w_avg, w7 243bidir_fn mask, w7 244 245 246.macro w_mask_fn type 247function w_mask_\type\()_16bpc_neon, export=1 248 ldr w8, [sp] 249 clz w9, w4 250 adr x10, L(w_mask_\type\()_tbl) 251 dup v31.8h, w8 // bitdepth_max 252 sub w9, w9, #24 253 clz w8, w8 // clz(bitdepth_max) 254 ldrh w9, [x10, x9, lsl #1] 255 sub x10, x10, w9, uxtw 256 sub w8, w8, #12 // sh = intermediate_bits + 6 = clz(bitdepth_max) - 12 257 mov w9, #PREP_BIAS*64 258 neg w8, w8 // -sh 259 mov w11, #27615 // (64 + 1 - 38)<<mask_sh - 1 - mask_rnd 260 dup v30.4s, w9 // PREP_BIAS*64 261 dup v29.4s, w8 // -sh 262 dup v0.8h, w11 263.if \type == 444 264 movi v1.16b, #64 265.elseif \type == 422 266 dup v2.8b, w7 267 movi v3.8b, #129 268 sub v3.8b, v3.8b, v2.8b 269.elseif \type == 420 270 dup v2.8h, w7 271 movi v3.8h, #1, lsl #8 272 sub v3.8h, v3.8h, v2.8h 273.endif 274 add x12, x0, x1 275 lsl x1, x1, #1 276 br x10 2774: 278 AARCH64_VALID_JUMP_TARGET 279 ld1 {v4.8h, v5.8h}, [x2], #32 // tmp1 (four rows at once) 280 ld1 {v6.8h, v7.8h}, [x3], #32 // tmp2 (four rows at once) 281 subs w5, w5, #4 282 sabd v20.8h, v4.8h, v6.8h // abs(tmp1 - tmp2) 283 sabd v21.8h, v5.8h, v7.8h 284 ssubl v16.4s, v6.4h, v4.4h // tmp2 - tmp1 (requires 17 bit) 285 ssubl2 v17.4s, v6.8h, v4.8h 286 ssubl v18.4s, v7.4h, 
v5.4h 287 ssubl2 v19.4s, v7.8h, v5.8h 288 uqsub v20.8h, v0.8h, v20.8h // 27615 - abs() 289 uqsub v21.8h, v0.8h, v21.8h 290 sshll2 v7.4s, v5.8h, #6 // tmp1 << 6 291 sshll v6.4s, v5.4h, #6 292 sshll2 v5.4s, v4.8h, #6 293 sshll v4.4s, v4.4h, #6 294 ushr v20.8h, v20.8h, #10 // 64-m = (27615 - abs()) >> mask_sh 295 ushr v21.8h, v21.8h, #10 296 add v4.4s, v4.4s, v30.4s // += PREP_BIAS*64 297 add v5.4s, v5.4s, v30.4s 298 add v6.4s, v6.4s, v30.4s 299 add v7.4s, v7.4s, v30.4s 300 uxtl v22.4s, v20.4h 301 uxtl2 v23.4s, v20.8h 302 uxtl v24.4s, v21.4h 303 uxtl2 v25.4s, v21.8h 304 mla v4.4s, v16.4s, v22.4s // (tmp2-tmp1)*(64-m) 305 mla v5.4s, v17.4s, v23.4s 306 mla v6.4s, v18.4s, v24.4s 307 mla v7.4s, v19.4s, v25.4s 308 srshl v4.4s, v4.4s, v29.4s // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh 309 srshl v5.4s, v5.4s, v29.4s 310 srshl v6.4s, v6.4s, v29.4s 311 srshl v7.4s, v7.4s, v29.4s 312 sqxtun v4.4h, v4.4s // iclip_pixel 313 sqxtun2 v4.8h, v5.4s 314 sqxtun v5.4h, v6.4s 315 sqxtun2 v5.8h, v7.4s 316 umin v4.8h, v4.8h, v31.8h // iclip_pixel 317 umin v5.8h, v5.8h, v31.8h 318.if \type == 444 319 uzp1 v20.16b, v20.16b, v21.16b // 64 - m 320 sub v20.16b, v1.16b, v20.16b // m 321 st1 {v20.16b}, [x6], #16 322.elseif \type == 422 323 addp v20.8h, v20.8h, v21.8h // (64 - m) + (64 - n) (column wise addition) 324 xtn v20.8b, v20.8h 325 uhsub v20.8b, v3.8b, v20.8b // ((129 - sign) - ((64 - m) + (64 - n)) >> 1 326 st1 {v20.8b}, [x6], #8 327.elseif \type == 420 328 trn1 v24.2d, v20.2d, v21.2d 329 trn2 v25.2d, v20.2d, v21.2d 330 add v24.8h, v24.8h, v25.8h // (64 - my1) + (64 - my2) (row wise addition) 331 addp v20.8h, v24.8h, v24.8h // (128 - m) + (128 - n) (column wise addition) 332 sub v20.4h, v3.4h, v20.4h // (256 - sign) - ((128 - m) + (128 - n)) 333 rshrn v20.8b, v20.8h, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2 334 st1 {v20.s}[0], [x6], #4 335.endif 336 st1 {v4.d}[0], [x0], x1 337 st1 {v4.d}[1], [x12], x1 338 st1 {v5.d}[0], [x0], x1 339 st1 {v5.d}[1], [x12], x1 340 b.gt 4b 341 ret 3428: 343 AARCH64_VALID_JUMP_TARGET 344 ld1 {v4.8h, v5.8h}, [x2], #32 // tmp1 345 ld1 {v6.8h, v7.8h}, [x3], #32 // tmp2 346 subs w5, w5, #2 347 sabd v20.8h, v4.8h, v6.8h // abs(tmp1 - tmp2) 348 sabd v21.8h, v5.8h, v7.8h 349 ssubl v16.4s, v6.4h, v4.4h // tmp2 - tmp1 (requires 17 bit) 350 ssubl2 v17.4s, v6.8h, v4.8h 351 ssubl v18.4s, v7.4h, v5.4h 352 ssubl2 v19.4s, v7.8h, v5.8h 353 uqsub v20.8h, v0.8h, v20.8h // 27615 - abs() 354 uqsub v21.8h, v0.8h, v21.8h 355 sshll2 v7.4s, v5.8h, #6 // tmp1 << 6 356 sshll v6.4s, v5.4h, #6 357 sshll2 v5.4s, v4.8h, #6 358 sshll v4.4s, v4.4h, #6 359 ushr v20.8h, v20.8h, #10 // 64-m = (27615 - abs()) >> mask_sh 360 ushr v21.8h, v21.8h, #10 361 add v4.4s, v4.4s, v30.4s // += PREP_BIAS*64 362 add v5.4s, v5.4s, v30.4s 363 add v6.4s, v6.4s, v30.4s 364 add v7.4s, v7.4s, v30.4s 365 uxtl v22.4s, v20.4h 366 uxtl2 v23.4s, v20.8h 367 uxtl v24.4s, v21.4h 368 uxtl2 v25.4s, v21.8h 369 mla v4.4s, v16.4s, v22.4s // (tmp2-tmp1)*(64-m) 370 mla v5.4s, v17.4s, v23.4s 371 mla v6.4s, v18.4s, v24.4s 372 mla v7.4s, v19.4s, v25.4s 373 srshl v4.4s, v4.4s, v29.4s // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh 374 srshl v5.4s, v5.4s, v29.4s 375 srshl v6.4s, v6.4s, v29.4s 376 srshl v7.4s, v7.4s, v29.4s 377 sqxtun v4.4h, v4.4s // iclip_pixel 378 sqxtun2 v4.8h, v5.4s 379 sqxtun v5.4h, v6.4s 380 sqxtun2 v5.8h, v7.4s 381 umin v4.8h, v4.8h, v31.8h // iclip_pixel 382 umin v5.8h, v5.8h, v31.8h 383.if \type == 444 384 uzp1 v20.16b, v20.16b, v21.16b // 64 - m 385 sub v20.16b, 
v1.16b, v20.16b // m 386 st1 {v20.16b}, [x6], #16 387.elseif \type == 422 388 addp v20.8h, v20.8h, v21.8h // (64 - m) + (64 - n) (column wise addition) 389 xtn v20.8b, v20.8h 390 uhsub v20.8b, v3.8b, v20.8b // ((129 - sign) - ((64 - m) + (64 - n)) >> 1 391 st1 {v20.8b}, [x6], #8 392.elseif \type == 420 393 add v20.8h, v20.8h, v21.8h // (64 - my1) + (64 - my2) (row wise addition) 394 addp v20.8h, v20.8h, v20.8h // (128 - m) + (128 - n) (column wise addition) 395 sub v20.4h, v3.4h, v20.4h // (256 - sign) - ((128 - m) + (128 - n)) 396 rshrn v20.8b, v20.8h, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2 397 st1 {v20.s}[0], [x6], #4 398.endif 399 st1 {v4.8h}, [x0], x1 400 st1 {v5.8h}, [x12], x1 401 b.gt 8b 402 ret 4031280: 404640: 405320: 406160: 407 AARCH64_VALID_JUMP_TARGET 408 mov w11, w4 409 sub x1, x1, w4, uxtw #1 410.if \type == 444 411 add x10, x6, w4, uxtw 412.elseif \type == 422 413 add x10, x6, x11, lsr #1 414.endif 415 add x9, x3, w4, uxtw #1 416 add x7, x2, w4, uxtw #1 417161: 418 mov w8, w4 41916: 420 ld1 {v4.8h, v5.8h}, [x2], #32 // tmp1 421 ld1 {v16.8h, v17.8h}, [x3], #32 // tmp2 422 ld1 {v6.8h, v7.8h}, [x7], #32 423 ld1 {v18.8h, v19.8h}, [x9], #32 424 subs w8, w8, #16 425 sabd v20.8h, v4.8h, v16.8h // abs(tmp1 - tmp2) 426 sabd v21.8h, v5.8h, v17.8h 427 ssubl v22.4s, v16.4h, v4.4h // tmp2 - tmp1 (requires 17 bit) 428 ssubl2 v23.4s, v16.8h, v4.8h 429 ssubl v24.4s, v17.4h, v5.4h 430 ssubl2 v25.4s, v17.8h, v5.8h 431 uqsub v20.8h, v0.8h, v20.8h // 27615 - abs() 432 uqsub v21.8h, v0.8h, v21.8h 433 sshll2 v27.4s, v5.8h, #6 // tmp1 << 6 434 sshll v26.4s, v5.4h, #6 435 sshll2 v5.4s, v4.8h, #6 436 sshll v4.4s, v4.4h, #6 437 ushr v20.8h, v20.8h, #10 // 64-m = (27615 - abs()) >> mask_sh 438 ushr v21.8h, v21.8h, #10 439 add v4.4s, v4.4s, v30.4s // += PREP_BIAS*64 440 add v5.4s, v5.4s, v30.4s 441 add v26.4s, v26.4s, v30.4s 442 add v27.4s, v27.4s, v30.4s 443 uxtl v16.4s, v20.4h 444 uxtl2 v17.4s, v20.8h 445 uxtl v28.4s, v21.4h 446 mla v4.4s, v22.4s, v16.4s // (tmp2-tmp1)*(64-m) 447 uxtl2 v16.4s, v21.8h 448 mla v5.4s, v23.4s, v17.4s 449 mla v26.4s, v24.4s, v28.4s 450 mla v27.4s, v25.4s, v16.4s 451 srshl v4.4s, v4.4s, v29.4s // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh 452 srshl v5.4s, v5.4s, v29.4s 453 srshl v26.4s, v26.4s, v29.4s 454 srshl v27.4s, v27.4s, v29.4s 455 sqxtun v4.4h, v4.4s // iclip_pixel 456 sqxtun2 v4.8h, v5.4s 457 sqxtun v5.4h, v26.4s 458 sqxtun2 v5.8h, v27.4s 459 460 // Start of other half 461 sabd v22.8h, v6.8h, v18.8h // abs(tmp1 - tmp2) 462 sabd v23.8h, v7.8h, v19.8h 463 464 umin v4.8h, v4.8h, v31.8h // iclip_pixel 465 umin v5.8h, v5.8h, v31.8h 466 467 ssubl v16.4s, v18.4h, v6.4h // tmp2 - tmp1 (requires 17 bit) 468 ssubl2 v17.4s, v18.8h, v6.8h 469 ssubl v18.4s, v19.4h, v7.4h 470 ssubl2 v19.4s, v19.8h, v7.8h 471 uqsub v22.8h, v0.8h, v22.8h // 27615 - abs() 472 uqsub v23.8h, v0.8h, v23.8h 473 sshll v24.4s, v6.4h, #6 // tmp1 << 6 474 sshll2 v25.4s, v6.8h, #6 475 sshll v26.4s, v7.4h, #6 476 sshll2 v27.4s, v7.8h, #6 477 ushr v22.8h, v22.8h, #10 // 64-m = (27615 - abs()) >> mask_sh 478 ushr v23.8h, v23.8h, #10 479 add v24.4s, v24.4s, v30.4s // += PREP_BIAS*64 480 add v25.4s, v25.4s, v30.4s 481 add v26.4s, v26.4s, v30.4s 482 add v27.4s, v27.4s, v30.4s 483 uxtl v6.4s, v22.4h 484 uxtl2 v7.4s, v22.8h 485 uxtl v28.4s, v23.4h 486 mla v24.4s, v16.4s, v6.4s // (tmp2-tmp1)*(64-m) 487 uxtl2 v6.4s, v23.8h 488 mla v25.4s, v17.4s, v7.4s 489 mla v26.4s, v18.4s, v28.4s 490 mla v27.4s, v19.4s, v6.4s 491 srshl v24.4s, v24.4s, v29.4s // (tmp1<<6 + 
(tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh 492 srshl v25.4s, v25.4s, v29.4s 493 srshl v26.4s, v26.4s, v29.4s 494 srshl v27.4s, v27.4s, v29.4s 495 sqxtun v6.4h, v24.4s // iclip_pixel 496 sqxtun2 v6.8h, v25.4s 497 sqxtun v7.4h, v26.4s 498 sqxtun2 v7.8h, v27.4s 499 umin v6.8h, v6.8h, v31.8h // iclip_pixel 500 umin v7.8h, v7.8h, v31.8h 501.if \type == 444 502 uzp1 v20.16b, v20.16b, v21.16b // 64 - m 503 uzp1 v21.16b, v22.16b, v23.16b 504 sub v20.16b, v1.16b, v20.16b // m 505 sub v21.16b, v1.16b, v21.16b 506 st1 {v20.16b}, [x6], #16 507 st1 {v21.16b}, [x10], #16 508.elseif \type == 422 509 addp v20.8h, v20.8h, v21.8h // (64 - m) + (64 - n) (column wise addition) 510 addp v21.8h, v22.8h, v23.8h 511 xtn v20.8b, v20.8h 512 xtn v21.8b, v21.8h 513 uhsub v20.8b, v3.8b, v20.8b // ((129 - sign) - ((64 - m) + (64 - n)) >> 1 514 uhsub v21.8b, v3.8b, v21.8b 515 st1 {v20.8b}, [x6], #8 516 st1 {v21.8b}, [x10], #8 517.elseif \type == 420 518 add v20.8h, v20.8h, v22.8h // (64 - my1) + (64 - my2) (row wise addition) 519 add v21.8h, v21.8h, v23.8h 520 addp v20.8h, v20.8h, v21.8h // (128 - m) + (128 - n) (column wise addition) 521 sub v20.8h, v3.8h, v20.8h // (256 - sign) - ((128 - m) + (128 - n)) 522 rshrn v20.8b, v20.8h, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2 523 st1 {v20.8b}, [x6], #8 524.endif 525 st1 {v4.8h, v5.8h}, [x0], #32 526 st1 {v6.8h, v7.8h}, [x12], #32 527 b.gt 16b 528 subs w5, w5, #2 529 add x2, x2, w4, uxtw #1 530 add x3, x3, w4, uxtw #1 531 add x7, x7, w4, uxtw #1 532 add x9, x9, w4, uxtw #1 533.if \type == 444 534 add x6, x6, w4, uxtw 535 add x10, x10, w4, uxtw 536.elseif \type == 422 537 add x6, x6, x11, lsr #1 538 add x10, x10, x11, lsr #1 539.endif 540 add x0, x0, x1 541 add x12, x12, x1 542 b.gt 161b 543 ret 544L(w_mask_\type\()_tbl): 545 .hword L(w_mask_\type\()_tbl) - 1280b 546 .hword L(w_mask_\type\()_tbl) - 640b 547 .hword L(w_mask_\type\()_tbl) - 320b 548 .hword L(w_mask_\type\()_tbl) - 160b 549 .hword L(w_mask_\type\()_tbl) - 8b 550 .hword L(w_mask_\type\()_tbl) - 4b 551endfunc 552.endm 553 554w_mask_fn 444 555w_mask_fn 422 556w_mask_fn 420 557 558 559function blend_16bpc_neon, export=1 560 adr x6, L(blend_tbl) 561 clz w3, w3 562 sub w3, w3, #26 563 ldrh w3, [x6, x3, lsl #1] 564 sub x6, x6, w3, uxtw 565 add x8, x0, x1 566 br x6 56740: 568 AARCH64_VALID_JUMP_TARGET 569 lsl x1, x1, #1 5704: 571 ld1 {v2.8b}, [x5], #8 572 ld1 {v1.8h}, [x2], #16 573 ld1 {v0.d}[0], [x0] 574 neg v2.8b, v2.8b // -m 575 subs w4, w4, #2 576 ld1 {v0.d}[1], [x8] 577 sxtl v2.8h, v2.8b 578 shl v2.8h, v2.8h, #9 // -m << 9 579 sub v1.8h, v0.8h, v1.8h // a - b 580 sqrdmulh v1.8h, v1.8h, v2.8h // ((a-b)*-m + 32) >> 6 581 add v0.8h, v0.8h, v1.8h 582 st1 {v0.d}[0], [x0], x1 583 st1 {v0.d}[1], [x8], x1 584 b.gt 4b 585 ret 58680: 587 AARCH64_VALID_JUMP_TARGET 588 lsl x1, x1, #1 5898: 590 ld1 {v4.16b}, [x5], #16 591 ld1 {v2.8h, v3.8h}, [x2], #32 592 neg v5.16b, v4.16b // -m 593 ld1 {v0.8h}, [x0] 594 ld1 {v1.8h}, [x8] 595 sxtl v4.8h, v5.8b 596 sxtl2 v5.8h, v5.16b 597 shl v4.8h, v4.8h, #9 // -m << 9 598 shl v5.8h, v5.8h, #9 599 sub v2.8h, v0.8h, v2.8h // a - b 600 sub v3.8h, v1.8h, v3.8h 601 subs w4, w4, #2 602 sqrdmulh v2.8h, v2.8h, v4.8h // ((a-b)*-m + 32) >> 6 603 sqrdmulh v3.8h, v3.8h, v5.8h 604 add v0.8h, v0.8h, v2.8h 605 add v1.8h, v1.8h, v3.8h 606 st1 {v0.8h}, [x0], x1 607 st1 {v1.8h}, [x8], x1 608 b.gt 8b 609 ret 610160: 611 AARCH64_VALID_JUMP_TARGET 612 lsl x1, x1, #1 61316: 614 ld1 {v16.16b, v17.16b}, [x5], #32 615 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64 616 subs w4, w4, #2 617 neg 
v18.16b, v16.16b // -m 618 neg v19.16b, v17.16b 619 ld1 {v0.8h, v1.8h}, [x0] 620 sxtl v16.8h, v18.8b 621 sxtl2 v17.8h, v18.16b 622 sxtl v18.8h, v19.8b 623 sxtl2 v19.8h, v19.16b 624 ld1 {v2.8h, v3.8h}, [x8] 625 shl v16.8h, v16.8h, #9 // -m << 9 626 shl v17.8h, v17.8h, #9 627 shl v18.8h, v18.8h, #9 628 shl v19.8h, v19.8h, #9 629 sub v4.8h, v0.8h, v4.8h // a - b 630 sub v5.8h, v1.8h, v5.8h 631 sub v6.8h, v2.8h, v6.8h 632 sub v7.8h, v3.8h, v7.8h 633 sqrdmulh v4.8h, v4.8h, v16.8h // ((a-b)*-m + 32) >> 6 634 sqrdmulh v5.8h, v5.8h, v17.8h 635 sqrdmulh v6.8h, v6.8h, v18.8h 636 sqrdmulh v7.8h, v7.8h, v19.8h 637 add v0.8h, v0.8h, v4.8h 638 add v1.8h, v1.8h, v5.8h 639 add v2.8h, v2.8h, v6.8h 640 add v3.8h, v3.8h, v7.8h 641 st1 {v0.8h, v1.8h}, [x0], x1 642 st1 {v2.8h, v3.8h}, [x8], x1 643 b.gt 16b 644 ret 64532: 646 AARCH64_VALID_JUMP_TARGET 647 ld1 {v16.16b, v17.16b}, [x5], #32 648 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64 649 subs w4, w4, #1 650 neg v18.16b, v16.16b // -m 651 neg v19.16b, v17.16b 652 sxtl v16.8h, v18.8b 653 sxtl2 v17.8h, v18.16b 654 sxtl v18.8h, v19.8b 655 sxtl2 v19.8h, v19.16b 656 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0] 657 shl v16.8h, v16.8h, #9 // -m << 9 658 shl v17.8h, v17.8h, #9 659 shl v18.8h, v18.8h, #9 660 shl v19.8h, v19.8h, #9 661 sub v4.8h, v0.8h, v4.8h // a - b 662 sub v5.8h, v1.8h, v5.8h 663 sub v6.8h, v2.8h, v6.8h 664 sub v7.8h, v3.8h, v7.8h 665 sqrdmulh v4.8h, v4.8h, v16.8h // ((a-b)*-m + 32) >> 6 666 sqrdmulh v5.8h, v5.8h, v17.8h 667 sqrdmulh v6.8h, v6.8h, v18.8h 668 sqrdmulh v7.8h, v7.8h, v19.8h 669 add v0.8h, v0.8h, v4.8h 670 add v1.8h, v1.8h, v5.8h 671 add v2.8h, v2.8h, v6.8h 672 add v3.8h, v3.8h, v7.8h 673 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 674 b.gt 32b 675 ret 676L(blend_tbl): 677 .hword L(blend_tbl) - 32b 678 .hword L(blend_tbl) - 160b 679 .hword L(blend_tbl) - 80b 680 .hword L(blend_tbl) - 40b 681endfunc 682 683function blend_h_16bpc_neon, export=1 684 adr x6, L(blend_h_tbl) 685 movrel x5, X(obmc_masks) 686 add x5, x5, w4, uxtw 687 sub w4, w4, w4, lsr #2 688 clz w7, w3 689 add x8, x0, x1 690 lsl x1, x1, #1 691 sub w7, w7, #24 692 ldrh w7, [x6, x7, lsl #1] 693 sub x6, x6, w7, uxtw 694 br x6 6952: 696 AARCH64_VALID_JUMP_TARGET 697 ld2r {v2.8b, v3.8b}, [x5], #2 698 ld1 {v1.4h}, [x2], #8 699 ext v2.8b, v2.8b, v3.8b, #6 700 subs w4, w4, #2 701 neg v2.8b, v2.8b // -m 702 ld1 {v0.s}[0], [x0] 703 ld1 {v0.s}[1], [x8] 704 sxtl v2.8h, v2.8b 705 shl v2.4h, v2.4h, #9 // -m << 9 706 sub v1.4h, v0.4h, v1.4h // a - b 707 sqrdmulh v1.4h, v1.4h, v2.4h // ((a-b)*-m + 32) >> 6 708 add v0.4h, v0.4h, v1.4h 709 st1 {v0.s}[0], [x0], x1 710 st1 {v0.s}[1], [x8], x1 711 b.gt 2b 712 ret 7134: 714 AARCH64_VALID_JUMP_TARGET 715 ld2r {v2.8b, v3.8b}, [x5], #2 716 ld1 {v1.8h}, [x2], #16 717 ext v2.8b, v2.8b, v3.8b, #4 718 subs w4, w4, #2 719 neg v2.8b, v2.8b // -m 720 ld1 {v0.d}[0], [x0] 721 ld1 {v0.d}[1], [x8] 722 sxtl v2.8h, v2.8b 723 shl v2.8h, v2.8h, #9 // -m << 9 724 sub v1.8h, v0.8h, v1.8h // a - b 725 sqrdmulh v1.8h, v1.8h, v2.8h // ((a-b)*-m + 32) >> 6 726 add v0.8h, v0.8h, v1.8h 727 st1 {v0.d}[0], [x0], x1 728 st1 {v0.d}[1], [x8], x1 729 b.gt 4b 730 ret 7318: 732 AARCH64_VALID_JUMP_TARGET 733 ld2r {v4.8b, v5.8b}, [x5], #2 734 ld1 {v2.8h, v3.8h}, [x2], #32 735 neg v4.8b, v4.8b // -m 736 neg v5.8b, v5.8b 737 ld1 {v0.8h}, [x0] 738 subs w4, w4, #2 739 sxtl v4.8h, v4.8b 740 sxtl v5.8h, v5.8b 741 ld1 {v1.8h}, [x8] 742 shl v4.8h, v4.8h, #9 // -m << 9 743 shl v5.8h, v5.8h, #9 744 sub v2.8h, v0.8h, v2.8h // a - b 745 sub v3.8h, v1.8h, v3.8h 746 sqrdmulh v2.8h, v2.8h, v4.8h // 
((a-b)*-m + 32) >> 6 747 sqrdmulh v3.8h, v3.8h, v5.8h 748 add v0.8h, v0.8h, v2.8h 749 add v1.8h, v1.8h, v3.8h 750 st1 {v0.8h}, [x0], x1 751 st1 {v1.8h}, [x8], x1 752 b.gt 8b 753 ret 75416: 755 AARCH64_VALID_JUMP_TARGET 756 ld2r {v16.8b, v17.8b}, [x5], #2 757 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64 758 neg v16.8b, v16.8b // -m 759 neg v17.8b, v17.8b 760 ld1 {v0.8h, v1.8h}, [x0] 761 ld1 {v2.8h, v3.8h}, [x8] 762 subs w4, w4, #2 763 sxtl v16.8h, v16.8b 764 sxtl v17.8h, v17.8b 765 shl v16.8h, v16.8h, #9 // -m << 9 766 shl v17.8h, v17.8h, #9 767 sub v4.8h, v0.8h, v4.8h // a - b 768 sub v5.8h, v1.8h, v5.8h 769 sub v6.8h, v2.8h, v6.8h 770 sub v7.8h, v3.8h, v7.8h 771 sqrdmulh v4.8h, v4.8h, v16.8h // ((a-b)*-m + 32) >> 6 772 sqrdmulh v5.8h, v5.8h, v16.8h 773 sqrdmulh v6.8h, v6.8h, v17.8h 774 sqrdmulh v7.8h, v7.8h, v17.8h 775 add v0.8h, v0.8h, v4.8h 776 add v1.8h, v1.8h, v5.8h 777 add v2.8h, v2.8h, v6.8h 778 add v3.8h, v3.8h, v7.8h 779 st1 {v0.8h, v1.8h}, [x0], x1 780 st1 {v2.8h, v3.8h}, [x8], x1 781 b.gt 16b 782 ret 7831280: 784640: 785320: 786 AARCH64_VALID_JUMP_TARGET 787 sub x1, x1, w3, uxtw #1 788 add x7, x2, w3, uxtw #1 789321: 790 ld2r {v24.8b, v25.8b}, [x5], #2 791 mov w6, w3 792 neg v24.8b, v24.8b // -m 793 neg v25.8b, v25.8b 794 sxtl v24.8h, v24.8b 795 sxtl v25.8h, v25.8b 796 shl v24.8h, v24.8h, #9 // -m << 9 797 shl v25.8h, v25.8h, #9 79832: 799 ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x2], #64 800 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0] 801 subs w6, w6, #32 802 sub v16.8h, v0.8h, v16.8h // a - b 803 sub v17.8h, v1.8h, v17.8h 804 sub v18.8h, v2.8h, v18.8h 805 sub v19.8h, v3.8h, v19.8h 806 ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64 807 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x8] 808 sqrdmulh v16.8h, v16.8h, v24.8h // ((a-b)*-m + 32) >> 6 809 sqrdmulh v17.8h, v17.8h, v24.8h 810 sqrdmulh v18.8h, v18.8h, v24.8h 811 sqrdmulh v19.8h, v19.8h, v24.8h 812 sub v20.8h, v4.8h, v20.8h // a - b 813 sub v21.8h, v5.8h, v21.8h 814 sub v22.8h, v6.8h, v22.8h 815 sub v23.8h, v7.8h, v23.8h 816 add v0.8h, v0.8h, v16.8h 817 add v1.8h, v1.8h, v17.8h 818 add v2.8h, v2.8h, v18.8h 819 add v3.8h, v3.8h, v19.8h 820 sqrdmulh v20.8h, v20.8h, v25.8h // ((a-b)*-m + 32) >> 6 821 sqrdmulh v21.8h, v21.8h, v25.8h 822 sqrdmulh v22.8h, v22.8h, v25.8h 823 sqrdmulh v23.8h, v23.8h, v25.8h 824 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 825 add v4.8h, v4.8h, v20.8h 826 add v5.8h, v5.8h, v21.8h 827 add v6.8h, v6.8h, v22.8h 828 add v7.8h, v7.8h, v23.8h 829 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x8], #64 830 b.gt 32b 831 subs w4, w4, #2 832 add x0, x0, x1 833 add x8, x8, x1 834 add x2, x2, w3, uxtw #1 835 add x7, x7, w3, uxtw #1 836 b.gt 321b 837 ret 838L(blend_h_tbl): 839 .hword L(blend_h_tbl) - 1280b 840 .hword L(blend_h_tbl) - 640b 841 .hword L(blend_h_tbl) - 320b 842 .hword L(blend_h_tbl) - 16b 843 .hword L(blend_h_tbl) - 8b 844 .hword L(blend_h_tbl) - 4b 845 .hword L(blend_h_tbl) - 2b 846endfunc 847 848function blend_v_16bpc_neon, export=1 849 adr x6, L(blend_v_tbl) 850 movrel x5, X(obmc_masks) 851 add x5, x5, w3, uxtw 852 clz w3, w3 853 add x8, x0, x1 854 lsl x1, x1, #1 855 sub w3, w3, #26 856 ldrh w3, [x6, x3, lsl #1] 857 sub x6, x6, w3, uxtw 858 br x6 85920: 860 AARCH64_VALID_JUMP_TARGET 861 ld1r {v2.8b}, [x5] 862 neg v2.8b, v2.8b // -m 863 sxtl v2.8h, v2.8b 864 shl v2.4h, v2.4h, #9 // -m << 9 8652: 866 ld1 {v1.s}[0], [x2], #4 867 ld1 {v0.h}[0], [x0] 868 subs w4, w4, #2 869 ld1 {v1.h}[1], [x2] 870 ld1 {v0.h}[1], [x8] 871 add x2, x2, #4 872 sub v1.4h, v0.4h, v1.4h // a - b 873 sqrdmulh v1.4h, v1.4h, v2.4h // ((a-b)*-m + 32) >> 6 
874 add v0.4h, v0.4h, v1.4h 875 st1 {v0.h}[0], [x0], x1 876 st1 {v0.h}[1], [x8], x1 877 b.gt 2b 878 ret 87940: 880 AARCH64_VALID_JUMP_TARGET 881 ld1r {v2.2s}, [x5] 882 sub x1, x1, #4 883 neg v2.8b, v2.8b // -m 884 sxtl v2.8h, v2.8b 885 shl v2.8h, v2.8h, #9 // -m << 9 8864: 887 ld1 {v1.8h}, [x2], #16 888 ld1 {v0.d}[0], [x0] 889 ld1 {v0.d}[1], [x8] 890 subs w4, w4, #2 891 sub v1.8h, v0.8h, v1.8h // a - b 892 sqrdmulh v1.8h, v1.8h, v2.8h // ((a-b)*-m + 32) >> 6 893 add v0.8h, v0.8h, v1.8h 894 st1 {v0.s}[0], [x0], #4 895 st1 {v0.s}[2], [x8], #4 896 st1 {v0.h}[2], [x0], x1 897 st1 {v0.h}[6], [x8], x1 898 b.gt 4b 899 ret 90080: 901 AARCH64_VALID_JUMP_TARGET 902 ld1 {v4.8b}, [x5] 903 sub x1, x1, #8 904 neg v4.8b, v4.8b // -m 905 sxtl v4.8h, v4.8b 906 shl v4.8h, v4.8h, #9 // -m << 9 9078: 908 ld1 {v2.8h, v3.8h}, [x2], #32 909 ld1 {v0.8h}, [x0] 910 ld1 {v1.8h}, [x8] 911 subs w4, w4, #2 912 sub v2.8h, v0.8h, v2.8h // a - b 913 sub v3.8h, v1.8h, v3.8h 914 sqrdmulh v2.8h, v2.8h, v4.8h // ((a-b)*-m + 32) >> 6 915 sqrdmulh v3.8h, v3.8h, v4.8h 916 add v0.8h, v0.8h, v2.8h 917 add v1.8h, v1.8h, v3.8h 918 st1 {v0.d}[0], [x0], #8 919 st1 {v1.d}[0], [x8], #8 920 st1 {v0.s}[2], [x0], x1 921 st1 {v1.s}[2], [x8], x1 922 b.gt 8b 923 ret 924160: 925 AARCH64_VALID_JUMP_TARGET 926 ld1 {v16.16b}, [x5] 927 sub x1, x1, #16 928 neg v17.16b, v16.16b // -m 929 sxtl v16.8h, v17.8b 930 sxtl2 v17.8h, v17.16b 931 shl v16.8h, v16.8h, #9 // -m << 9 932 shl v17.4h, v17.4h, #9 93316: 934 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64 935 ld1 {v0.8h, v1.8h}, [x0] 936 subs w4, w4, #2 937 ld1 {v2.8h, v3.8h}, [x8] 938 sub v4.8h, v0.8h, v4.8h // a - b 939 sub v5.4h, v1.4h, v5.4h 940 sub v6.8h, v2.8h, v6.8h 941 sub v7.4h, v3.4h, v7.4h 942 sqrdmulh v4.8h, v4.8h, v16.8h // ((a-b)*-m + 32) >> 6 943 sqrdmulh v5.4h, v5.4h, v17.4h 944 sqrdmulh v6.8h, v6.8h, v16.8h 945 sqrdmulh v7.4h, v7.4h, v17.4h 946 add v0.8h, v0.8h, v4.8h 947 add v1.4h, v1.4h, v5.4h 948 add v2.8h, v2.8h, v6.8h 949 add v3.4h, v3.4h, v7.4h 950 st1 {v0.8h}, [x0], #16 951 st1 {v2.8h}, [x8], #16 952 st1 {v1.4h}, [x0], x1 953 st1 {v3.4h}, [x8], x1 954 b.gt 16b 955 ret 956320: 957 AARCH64_VALID_JUMP_TARGET 958 ld1 {v24.16b, v25.16b}, [x5] 959 neg v26.16b, v24.16b // -m 960 neg v27.8b, v25.8b 961 sxtl v24.8h, v26.8b 962 sxtl2 v25.8h, v26.16b 963 sxtl v26.8h, v27.8b 964 shl v24.8h, v24.8h, #9 // -m << 9 965 shl v25.8h, v25.8h, #9 966 shl v26.8h, v26.8h, #9 96732: 968 ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x2], #64 969 ld1 {v0.8h, v1.8h, v2.8h}, [x0] 970 ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], #64 971 ld1 {v4.8h, v5.8h, v6.8h}, [x8] 972 subs w4, w4, #2 973 sub v16.8h, v0.8h, v16.8h // a - b 974 sub v17.8h, v1.8h, v17.8h 975 sub v18.8h, v2.8h, v18.8h 976 sub v20.8h, v4.8h, v20.8h 977 sub v21.8h, v5.8h, v21.8h 978 sub v22.8h, v6.8h, v22.8h 979 sqrdmulh v16.8h, v16.8h, v24.8h // ((a-b)*-m + 32) >> 6 980 sqrdmulh v17.8h, v17.8h, v25.8h 981 sqrdmulh v18.8h, v18.8h, v26.8h 982 sqrdmulh v20.8h, v20.8h, v24.8h 983 sqrdmulh v21.8h, v21.8h, v25.8h 984 sqrdmulh v22.8h, v22.8h, v26.8h 985 add v0.8h, v0.8h, v16.8h 986 add v1.8h, v1.8h, v17.8h 987 add v2.8h, v2.8h, v18.8h 988 add v4.8h, v4.8h, v20.8h 989 add v5.8h, v5.8h, v21.8h 990 add v6.8h, v6.8h, v22.8h 991 st1 {v0.8h, v1.8h, v2.8h}, [x0], x1 992 st1 {v4.8h, v5.8h, v6.8h}, [x8], x1 993 b.gt 32b 994 ret 995L(blend_v_tbl): 996 .hword L(blend_v_tbl) - 320b 997 .hword L(blend_v_tbl) - 160b 998 .hword L(blend_v_tbl) - 80b 999 .hword L(blend_v_tbl) - 40b 1000 .hword L(blend_v_tbl) - 20b 1001endfunc 1002 1003 1004// This has got the same signature 
as the put_8tap functions, 1005// and assumes that x9 is set to (clz(w)-24). 1006function put_neon 1007 adr x10, L(put_tbl) 1008 ldrh w9, [x10, x9, lsl #1] 1009 sub x10, x10, w9, uxtw 1010 br x10 1011 10122: 1013 AARCH64_VALID_JUMP_TARGET 1014 ld1 {v0.s}[0], [x2], x3 1015 ld1 {v1.s}[0], [x2], x3 1016 subs w5, w5, #2 1017 st1 {v0.s}[0], [x0], x1 1018 st1 {v1.s}[0], [x0], x1 1019 b.gt 2b 1020 ret 10214: 1022 AARCH64_VALID_JUMP_TARGET 1023 ld1 {v0.4h}, [x2], x3 1024 ld1 {v1.4h}, [x2], x3 1025 subs w5, w5, #2 1026 st1 {v0.4h}, [x0], x1 1027 st1 {v1.4h}, [x0], x1 1028 b.gt 4b 1029 ret 103080: 1031 AARCH64_VALID_JUMP_TARGET 1032 add x8, x0, x1 1033 lsl x1, x1, #1 1034 add x9, x2, x3 1035 lsl x3, x3, #1 10368: 1037 ld1 {v0.8h}, [x2], x3 1038 ld1 {v1.8h}, [x9], x3 1039 subs w5, w5, #2 1040 st1 {v0.8h}, [x0], x1 1041 st1 {v1.8h}, [x8], x1 1042 b.gt 8b 1043 ret 104416: 1045 AARCH64_VALID_JUMP_TARGET 1046 ldp x6, x7, [x2] 1047 ldp x8, x9, [x2, #16] 1048 stp x6, x7, [x0] 1049 subs w5, w5, #1 1050 stp x8, x9, [x0, #16] 1051 add x2, x2, x3 1052 add x0, x0, x1 1053 b.gt 16b 1054 ret 105532: 1056 AARCH64_VALID_JUMP_TARGET 1057 ldp x6, x7, [x2] 1058 ldp x8, x9, [x2, #16] 1059 stp x6, x7, [x0] 1060 ldp x10, x11, [x2, #32] 1061 stp x8, x9, [x0, #16] 1062 subs w5, w5, #1 1063 ldp x12, x13, [x2, #48] 1064 stp x10, x11, [x0, #32] 1065 stp x12, x13, [x0, #48] 1066 add x2, x2, x3 1067 add x0, x0, x1 1068 b.gt 32b 1069 ret 107064: 1071 AARCH64_VALID_JUMP_TARGET 1072 ldp q0, q1, [x2] 1073 ldp q2, q3, [x2, #32] 1074 stp q0, q1, [x0] 1075 ldp q4, q5, [x2, #64] 1076 stp q2, q3, [x0, #32] 1077 ldp q6, q7, [x2, #96] 1078 subs w5, w5, #1 1079 stp q4, q5, [x0, #64] 1080 stp q6, q7, [x0, #96] 1081 add x2, x2, x3 1082 add x0, x0, x1 1083 b.gt 64b 1084 ret 1085128: 1086 AARCH64_VALID_JUMP_TARGET 1087 ldp q0, q1, [x2] 1088 ldp q2, q3, [x2, #32] 1089 stp q0, q1, [x0] 1090 ldp q4, q5, [x2, #64] 1091 stp q2, q3, [x0, #32] 1092 ldp q6, q7, [x2, #96] 1093 subs w5, w5, #1 1094 stp q4, q5, [x0, #64] 1095 ldp q16, q17, [x2, #128] 1096 stp q6, q7, [x0, #96] 1097 ldp q18, q19, [x2, #160] 1098 stp q16, q17, [x0, #128] 1099 ldp q20, q21, [x2, #192] 1100 stp q18, q19, [x0, #160] 1101 ldp q22, q23, [x2, #224] 1102 stp q20, q21, [x0, #192] 1103 stp q22, q23, [x0, #224] 1104 add x2, x2, x3 1105 add x0, x0, x1 1106 b.gt 128b 1107 ret 1108 1109L(put_tbl): 1110 .hword L(put_tbl) - 128b 1111 .hword L(put_tbl) - 64b 1112 .hword L(put_tbl) - 32b 1113 .hword L(put_tbl) - 16b 1114 .hword L(put_tbl) - 80b 1115 .hword L(put_tbl) - 4b 1116 .hword L(put_tbl) - 2b 1117endfunc 1118 1119 1120// This has got the same signature as the prep_8tap functions, 1121// and assumes that x9 is set to (clz(w)-24), w7 to intermediate_bits and 1122// x8 to w*2. 
1123function prep_neon 1124 adr x10, L(prep_tbl) 1125 ldrh w9, [x10, x9, lsl #1] 1126 dup v31.8h, w7 // intermediate_bits 1127 movi v30.8h, #(PREP_BIAS >> 8), lsl #8 1128 sub x10, x10, w9, uxtw 1129 br x10 1130 113140: 1132 AARCH64_VALID_JUMP_TARGET 1133 add x9, x1, x2 1134 lsl x2, x2, #1 11354: 1136 ld1 {v0.d}[0], [x1], x2 1137 ld1 {v0.d}[1], [x9], x2 1138 subs w4, w4, #2 1139 sshl v0.8h, v0.8h, v31.8h 1140 sub v0.8h, v0.8h, v30.8h 1141 st1 {v0.8h}, [x0], #16 1142 b.gt 4b 1143 ret 114480: 1145 AARCH64_VALID_JUMP_TARGET 1146 add x9, x1, x2 1147 lsl x2, x2, #1 11488: 1149 ld1 {v0.8h}, [x1], x2 1150 ld1 {v1.8h}, [x9], x2 1151 subs w4, w4, #2 1152 sshl v0.8h, v0.8h, v31.8h 1153 sshl v1.8h, v1.8h, v31.8h 1154 sub v0.8h, v0.8h, v30.8h 1155 sub v1.8h, v1.8h, v30.8h 1156 st1 {v0.8h, v1.8h}, [x0], #32 1157 b.gt 8b 1158 ret 115916: 1160 AARCH64_VALID_JUMP_TARGET 1161 ldp q0, q1, [x1] 1162 add x1, x1, x2 1163 sshl v0.8h, v0.8h, v31.8h 1164 ldp q2, q3, [x1] 1165 add x1, x1, x2 1166 subs w4, w4, #2 1167 sshl v1.8h, v1.8h, v31.8h 1168 sshl v2.8h, v2.8h, v31.8h 1169 sshl v3.8h, v3.8h, v31.8h 1170 sub v0.8h, v0.8h, v30.8h 1171 sub v1.8h, v1.8h, v30.8h 1172 sub v2.8h, v2.8h, v30.8h 1173 sub v3.8h, v3.8h, v30.8h 1174 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 1175 b.gt 16b 1176 ret 117732: 1178 AARCH64_VALID_JUMP_TARGET 1179 ldp q0, q1, [x1] 1180 sshl v0.8h, v0.8h, v31.8h 1181 ldp q2, q3, [x1, #32] 1182 add x1, x1, x2 1183 sshl v1.8h, v1.8h, v31.8h 1184 sshl v2.8h, v2.8h, v31.8h 1185 sshl v3.8h, v3.8h, v31.8h 1186 subs w4, w4, #1 1187 sub v0.8h, v0.8h, v30.8h 1188 sub v1.8h, v1.8h, v30.8h 1189 sub v2.8h, v2.8h, v30.8h 1190 sub v3.8h, v3.8h, v30.8h 1191 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 1192 b.gt 32b 1193 ret 119464: 1195 AARCH64_VALID_JUMP_TARGET 1196 ldp q0, q1, [x1] 1197 subs w4, w4, #1 1198 sshl v0.8h, v0.8h, v31.8h 1199 ldp q2, q3, [x1, #32] 1200 sshl v1.8h, v1.8h, v31.8h 1201 ldp q4, q5, [x1, #64] 1202 sshl v2.8h, v2.8h, v31.8h 1203 sshl v3.8h, v3.8h, v31.8h 1204 ldp q6, q7, [x1, #96] 1205 add x1, x1, x2 1206 sshl v4.8h, v4.8h, v31.8h 1207 sshl v5.8h, v5.8h, v31.8h 1208 sshl v6.8h, v6.8h, v31.8h 1209 sshl v7.8h, v7.8h, v31.8h 1210 sub v0.8h, v0.8h, v30.8h 1211 sub v1.8h, v1.8h, v30.8h 1212 sub v2.8h, v2.8h, v30.8h 1213 sub v3.8h, v3.8h, v30.8h 1214 stp q0, q1, [x0] 1215 sub v4.8h, v4.8h, v30.8h 1216 sub v5.8h, v5.8h, v30.8h 1217 stp q2, q3, [x0, #32] 1218 sub v6.8h, v6.8h, v30.8h 1219 sub v7.8h, v7.8h, v30.8h 1220 stp q4, q5, [x0, #64] 1221 stp q6, q7, [x0, #96] 1222 add x0, x0, x8 1223 b.gt 64b 1224 ret 1225128: 1226 AARCH64_VALID_JUMP_TARGET 1227 ldp q0, q1, [x1] 1228 subs w4, w4, #1 1229 sshl v0.8h, v0.8h, v31.8h 1230 ldp q2, q3, [x1, #32] 1231 sshl v1.8h, v1.8h, v31.8h 1232 ldp q4, q5, [x1, #64] 1233 sshl v2.8h, v2.8h, v31.8h 1234 sshl v3.8h, v3.8h, v31.8h 1235 ldp q6, q7, [x1, #96] 1236 sshl v4.8h, v4.8h, v31.8h 1237 sshl v5.8h, v5.8h, v31.8h 1238 ldp q16, q17, [x1, #128] 1239 sshl v6.8h, v6.8h, v31.8h 1240 sshl v7.8h, v7.8h, v31.8h 1241 ldp q18, q19, [x1, #160] 1242 sshl v16.8h, v16.8h, v31.8h 1243 sshl v17.8h, v17.8h, v31.8h 1244 ldp q20, q21, [x1, #192] 1245 sshl v18.8h, v18.8h, v31.8h 1246 sshl v19.8h, v19.8h, v31.8h 1247 ldp q22, q23, [x1, #224] 1248 add x1, x1, x2 1249 sshl v20.8h, v20.8h, v31.8h 1250 sshl v21.8h, v21.8h, v31.8h 1251 sshl v22.8h, v22.8h, v31.8h 1252 sshl v23.8h, v23.8h, v31.8h 1253 sub v0.8h, v0.8h, v30.8h 1254 sub v1.8h, v1.8h, v30.8h 1255 sub v2.8h, v2.8h, v30.8h 1256 sub v3.8h, v3.8h, v30.8h 1257 stp q0, q1, [x0] 1258 sub v4.8h, v4.8h, v30.8h 1259 sub v5.8h, 
v5.8h, v30.8h 1260 stp q2, q3, [x0, #32] 1261 sub v6.8h, v6.8h, v30.8h 1262 sub v7.8h, v7.8h, v30.8h 1263 stp q4, q5, [x0, #64] 1264 sub v16.8h, v16.8h, v30.8h 1265 sub v17.8h, v17.8h, v30.8h 1266 stp q6, q7, [x0, #96] 1267 sub v18.8h, v18.8h, v30.8h 1268 sub v19.8h, v19.8h, v30.8h 1269 stp q16, q17, [x0, #128] 1270 sub v20.8h, v20.8h, v30.8h 1271 sub v21.8h, v21.8h, v30.8h 1272 stp q18, q19, [x0, #160] 1273 sub v22.8h, v22.8h, v30.8h 1274 sub v23.8h, v23.8h, v30.8h 1275 stp q20, q21, [x0, #192] 1276 stp q22, q23, [x0, #224] 1277 add x0, x0, x8 1278 b.gt 128b 1279 ret 1280 1281L(prep_tbl): 1282 .hword L(prep_tbl) - 128b 1283 .hword L(prep_tbl) - 64b 1284 .hword L(prep_tbl) - 32b 1285 .hword L(prep_tbl) - 16b 1286 .hword L(prep_tbl) - 80b 1287 .hword L(prep_tbl) - 40b 1288endfunc 1289 1290 1291.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6 1292 ld1 {\d0\wd}[0], [\s0], \strd 1293 ld1 {\d1\wd}[0], [\s1], \strd 1294.ifnb \d2 1295 ld1 {\d2\wd}[0], [\s0], \strd 1296 ld1 {\d3\wd}[0], [\s1], \strd 1297.endif 1298.ifnb \d4 1299 ld1 {\d4\wd}[0], [\s0], \strd 1300.endif 1301.ifnb \d5 1302 ld1 {\d5\wd}[0], [\s1], \strd 1303.endif 1304.ifnb \d6 1305 ld1 {\d6\wd}[0], [\s0], \strd 1306.endif 1307.endm 1308.macro load_reg s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6 1309 ld1 {\d0\wd}, [\s0], \strd 1310 ld1 {\d1\wd}, [\s1], \strd 1311.ifnb \d2 1312 ld1 {\d2\wd}, [\s0], \strd 1313 ld1 {\d3\wd}, [\s1], \strd 1314.endif 1315.ifnb \d4 1316 ld1 {\d4\wd}, [\s0], \strd 1317.endif 1318.ifnb \d5 1319 ld1 {\d5\wd}, [\s1], \strd 1320.endif 1321.ifnb \d6 1322 ld1 {\d6\wd}, [\s0], \strd 1323.endif 1324.endm 1325.macro load_regpair s0, s1, strd, wd, d0, d1, d2, d3, d4, d5 1326 ld1 {\d0\wd, \d1\wd}, [\s0], \strd 1327.ifnb \d2 1328 ld1 {\d2\wd, \d3\wd}, [\s1], \strd 1329.endif 1330.ifnb \d4 1331 ld1 {\d4\wd, \d5\wd}, [\s0], \strd 1332.endif 1333.endm 1334.macro load_s s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 1335 load_slice \s0, \s1, \strd, .s, \d0, \d1, \d2, \d3, \d4, \d5, \d6 1336.endm 1337.macro load_4h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 1338 load_reg \s0, \s1, \strd, .4h, \d0, \d1, \d2, \d3, \d4, \d5, \d6 1339.endm 1340.macro load_8h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 1341 load_reg \s0, \s1, \strd, .8h, \d0, \d1, \d2, \d3, \d4, \d5, \d6 1342.endm 1343.macro load_16h s0, s1, strd, d0, d1, d2, d3, d4, d5 1344 load_regpair \s0, \s1, \strd, .8h, \d0, \d1, \d2, \d3, \d4, \d5 1345.endm 1346.macro interleave_1 wd, r0, r1, r2, r3, r4 1347 trn1 \r0\wd, \r0\wd, \r1\wd 1348 trn1 \r1\wd, \r1\wd, \r2\wd 1349.ifnb \r3 1350 trn1 \r2\wd, \r2\wd, \r3\wd 1351 trn1 \r3\wd, \r3\wd, \r4\wd 1352.endif 1353.endm 1354.macro interleave_1_s r0, r1, r2, r3, r4 1355 interleave_1 .2s, \r0, \r1, \r2, \r3, \r4 1356.endm 1357.macro umin_h c, wd, r0, r1, r2, r3 1358 umin \r0\wd, \r0\wd, \c\wd 1359.ifnb \r1 1360 umin \r1\wd, \r1\wd, \c\wd 1361.endif 1362.ifnb \r2 1363 umin \r2\wd, \r2\wd, \c\wd 1364 umin \r3\wd, \r3\wd, \c\wd 1365.endif 1366.endm 1367.macro sub_h c, wd, r0, r1, r2, r3 1368 sub \r0\wd, \r0\wd, \c\wd 1369.ifnb \r1 1370 sub \r1\wd, \r1\wd, \c\wd 1371.endif 1372.ifnb \r2 1373 sub \r2\wd, \r2\wd, \c\wd 1374 sub \r3\wd, \r3\wd, \c\wd 1375.endif 1376.endm 1377.macro smull_smlal_4tap d, s0, s1, s2, s3 1378 smull \d\().4s, \s0\().4h, v0.h[0] 1379 smlal \d\().4s, \s1\().4h, v0.h[1] 1380 smlal \d\().4s, \s2\().4h, v0.h[2] 1381 smlal \d\().4s, \s3\().4h, v0.h[3] 1382.endm 1383.macro smull2_smlal2_4tap d, s0, s1, s2, s3 1384 smull2 \d\().4s, \s0\().8h, v0.h[0] 1385 smlal2 \d\().4s, \s1\().8h, v0.h[1] 1386 smlal2 
\d\().4s, \s2\().8h, v0.h[2] 1387 smlal2 \d\().4s, \s3\().8h, v0.h[3] 1388.endm 1389.macro smull_smlal_6tap d, s0, s1, s2, s3, s4, s5, s6, s7 1390 smull \d\().4s, \s1\().4h, v0.h[1] 1391 smlal \d\().4s, \s2\().4h, v0.h[2] 1392 smlal \d\().4s, \s3\().4h, v0.h[3] 1393 smlal \d\().4s, \s4\().4h, v0.h[4] 1394 smlal \d\().4s, \s5\().4h, v0.h[5] 1395 smlal \d\().4s, \s6\().4h, v0.h[6] 1396.endm 1397.macro smull2_smlal2_6tap d, s0, s1, s2, s3, s4, s5, s6, s7 1398 smull2 \d\().4s, \s1\().8h, v0.h[1] 1399 smlal2 \d\().4s, \s2\().8h, v0.h[2] 1400 smlal2 \d\().4s, \s3\().8h, v0.h[3] 1401 smlal2 \d\().4s, \s4\().8h, v0.h[4] 1402 smlal2 \d\().4s, \s5\().8h, v0.h[5] 1403 smlal2 \d\().4s, \s6\().8h, v0.h[6] 1404.endm 1405.macro smull_smlal_8tap d, s0, s1, s2, s3, s4, s5, s6, s7 1406 smull \d\().4s, \s0\().4h, v0.h[0] 1407 smlal \d\().4s, \s1\().4h, v0.h[1] 1408 smlal \d\().4s, \s2\().4h, v0.h[2] 1409 smlal \d\().4s, \s3\().4h, v0.h[3] 1410 smlal \d\().4s, \s4\().4h, v0.h[4] 1411 smlal \d\().4s, \s5\().4h, v0.h[5] 1412 smlal \d\().4s, \s6\().4h, v0.h[6] 1413 smlal \d\().4s, \s7\().4h, v0.h[7] 1414.endm 1415.macro smull2_smlal2_8tap d, s0, s1, s2, s3, s4, s5, s6, s7 1416 smull2 \d\().4s, \s0\().8h, v0.h[0] 1417 smlal2 \d\().4s, \s1\().8h, v0.h[1] 1418 smlal2 \d\().4s, \s2\().8h, v0.h[2] 1419 smlal2 \d\().4s, \s3\().8h, v0.h[3] 1420 smlal2 \d\().4s, \s4\().8h, v0.h[4] 1421 smlal2 \d\().4s, \s5\().8h, v0.h[5] 1422 smlal2 \d\().4s, \s6\().8h, v0.h[6] 1423 smlal2 \d\().4s, \s7\().8h, v0.h[7] 1424.endm 1425.macro sqrshrun_h shift, r0, r1, r2, r3 1426 sqrshrun \r0\().4h, \r0\().4s, #\shift 1427.ifnb \r1 1428 sqrshrun2 \r0\().8h, \r1\().4s, #\shift 1429.endif 1430.ifnb \r2 1431 sqrshrun \r2\().4h, \r2\().4s, #\shift 1432 sqrshrun2 \r2\().8h, \r3\().4s, #\shift 1433.endif 1434.endm 1435.macro xtn_h r0, r1, r2, r3 1436 uzp1 \r0\().8h, \r0\().8h, \r1\().8h // Same as xtn, xtn2 1437.ifnb \r2 1438 uzp1 \r2\().8h, \r2\().8h, \r3\().8h // Ditto 1439.endif 1440.endm 1441.macro srshl_s shift, r0, r1, r2, r3 1442 srshl \r0\().4s, \r0\().4s, \shift\().4s 1443 srshl \r1\().4s, \r1\().4s, \shift\().4s 1444.ifnb \r2 1445 srshl \r2\().4s, \r2\().4s, \shift\().4s 1446 srshl \r3\().4s, \r3\().4s, \shift\().4s 1447.endif 1448.endm 1449.macro st_s strd, reg, lanes 1450 st1 {\reg\().s}[0], [x0], \strd 1451 st1 {\reg\().s}[1], [x9], \strd 1452.if \lanes > 2 1453 st1 {\reg\().s}[2], [x0], \strd 1454 st1 {\reg\().s}[3], [x9], \strd 1455.endif 1456.endm 1457.macro st_d strd, r0, r1 1458 st1 {\r0\().d}[0], [x0], \strd 1459 st1 {\r0\().d}[1], [x9], \strd 1460.ifnb \r1 1461 st1 {\r1\().d}[0], [x0], \strd 1462 st1 {\r1\().d}[1], [x9], \strd 1463.endif 1464.endm 1465.macro shift_store_4 type, strd, r0, r1, r2, r3 1466.ifc \type, put 1467 sqrshrun_h 6, \r0, \r1, \r2, \r3 1468 umin_h v31, .8h, \r0, \r2 1469.else 1470 srshl_s v30, \r0, \r1, \r2, \r3 // -(6-intermediate_bits) 1471 xtn_h \r0, \r1, \r2, \r3 1472 sub_h v29, .8h, \r0, \r2 // PREP_BIAS 1473.endif 1474 st_d \strd, \r0, \r2 1475.endm 1476.macro st_reg strd, wd, r0, r1, r2, r3, r4, r5, r6, r7 1477 st1 {\r0\wd}, [x0], \strd 1478 st1 {\r1\wd}, [x9], \strd 1479.ifnb \r2 1480 st1 {\r2\wd}, [x0], \strd 1481 st1 {\r3\wd}, [x9], \strd 1482.endif 1483.ifnb \r4 1484 st1 {\r4\wd}, [x0], \strd 1485 st1 {\r5\wd}, [x9], \strd 1486 st1 {\r6\wd}, [x0], \strd 1487 st1 {\r7\wd}, [x9], \strd 1488.endif 1489.endm 1490.macro st_8h strd, r0, r1, r2, r3, r4, r5, r6, r7 1491 st_reg \strd, .8h, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7 1492.endm 1493.macro shift_store_8 type, strd, r0, r1, r2, r3 1494.ifc \type, 
put 1495 sqrshrun_h 6, \r0, \r1, \r2, \r3 1496 umin_h v31, .8h, \r0, \r2 1497.else 1498 srshl_s v30, \r0, \r1, \r2, \r3 // -(6-intermediate_bits) 1499 xtn_h \r0, \r1, \r2, \r3 1500 sub_h v29, .8h, \r0, \r2 // PREP_BIAS 1501.endif 1502 st_8h \strd, \r0, \r2 1503.endm 1504.macro shift_store_16 type, strd, dst, r0, r1, r2, r3 1505.ifc \type, put 1506 sqrshrun_h 6, \r0, \r1, \r2, \r3 1507 umin \r0\().8h, \r0\().8h, v31.8h 1508 umin \r1\().8h, \r2\().8h, v31.8h 1509.else 1510 srshl_s v30, \r0, \r1, \r2, \r3 // -(6-intermediate_bits) 1511 xtn_h \r0, \r1, \r2, \r3 1512 sub \r0\().8h, \r0\().8h, v29.8h 1513 sub \r1\().8h, \r2\().8h, v29.8h 1514.endif 1515 st1 {\r0\().8h, \r1\().8h}, [\dst], \strd 1516.endm 1517 1518.macro make_8tap_fn op, type, type_h, type_v, taps 1519function \op\()_8tap_\type\()_16bpc_neon, export=1 1520 mov w9, \type_h 1521 mov w10, \type_v 1522 b \op\()_\taps\()_neon 1523endfunc 1524.endm 1525 1526// No spaces in these expressions, due to gas-preprocessor. 1527#define REGULAR ((0*15<<7)|3*15) 1528#define SMOOTH ((1*15<<7)|4*15) 1529#define SHARP ((2*15<<7)|3*15) 1530 1531.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2, taps 1532function \type\()_\taps\()_neon 1533.ifc \bdmax, w8 1534 ldr w8, [sp] 1535.endif 1536 mov w11, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0) 1537 mul \mx, \mx, w11 1538 mul \my, \my, w11 1539 add \mx, \mx, w9 // mx, 8tap_h, 4tap_h 1540 add \my, \my, w10 // my, 8tap_v, 4tap_v 1541.ifc \type, prep 1542 uxtw \d_strd, \w 1543 lsl \d_strd, \d_strd, #1 1544.endif 1545 1546 dup v31.8h, \bdmax // bitdepth_max 1547 clz \bdmax, \bdmax 1548 clz w9, \w 1549 sub \bdmax, \bdmax, #18 // intermediate_bits = clz(bitdepth_max) - 18 1550 mov w12, #6 1551 tst \mx, #(0x7f << 14) 1552 sub w9, w9, #24 1553 add w13, w12, \bdmax // 6 + intermediate_bits 1554 sub w12, w12, \bdmax // 6 - intermediate_bits 1555 movrel x11, X(mc_subpel_filters), -8 1556 b.ne L(\type\()_\taps\()_h) 1557 tst \my, #(0x7f << 14) 1558 b.ne L(\type\()_\taps\()_v) 1559 b \type\()_neon 1560 1561L(\type\()_\taps\()_h): 1562 cmp \w, #4 1563 ubfx w10, \mx, #7, #7 1564 and \mx, \mx, #0x7f 1565 b.le 4f 1566 mov \mx, w10 15674: 1568 tst \my, #(0x7f << 14) 1569 add \xmx, x11, \mx, uxtw #3 1570 b.ne L(\type\()_\taps\()_hv) 1571 1572 adr x10, L(\type\()_\taps\()_h_tbl) 1573 dup v30.4s, w12 // 6 - intermediate_bits 1574 ldrh w9, [x10, x9, lsl #1] 1575 neg v30.4s, v30.4s // -(6-intermediate_bits) 1576.ifc \type, put 1577 dup v29.8h, \bdmax // intermediate_bits 1578.else 1579 movi v28.8h, #(PREP_BIAS >> 8), lsl #8 1580.endif 1581 sub x10, x10, w9, uxtw 1582.ifc \type, put 1583 neg v29.8h, v29.8h // -intermediate_bits 1584.endif 1585 br x10 1586 158720: // 2xN h 1588 AARCH64_VALID_JUMP_TARGET 1589.ifc \type, put 1590 add \xmx, \xmx, #2 1591 ld1 {v0.s}[0], [\xmx] 1592 sub \src, \src, #2 1593 add \ds2, \dst, \d_strd 1594 add \sr2, \src, \s_strd 1595 lsl \d_strd, \d_strd, #1 1596 lsl \s_strd, \s_strd, #1 1597 sxtl v0.8h, v0.8b 15982: 1599 ld1 {v4.8h}, [\src], \s_strd 1600 ld1 {v6.8h}, [\sr2], \s_strd 1601 ext v5.16b, v4.16b, v4.16b, #2 1602 ext v7.16b, v6.16b, v6.16b, #2 1603 subs \h, \h, #2 1604 trn1 v3.2s, v4.2s, v6.2s 1605 trn2 v6.2s, v4.2s, v6.2s 1606 trn1 v4.2s, v5.2s, v7.2s 1607 trn2 v7.2s, v5.2s, v7.2s 1608 smull v3.4s, v3.4h, v0.h[0] 1609 smlal v3.4s, v4.4h, v0.h[1] 1610 smlal v3.4s, v6.4h, v0.h[2] 1611 smlal v3.4s, v7.4h, v0.h[3] 1612 srshl v3.4s, v3.4s, v30.4s // -(6-intermediate_bits) 1613 sqxtun v3.4h, v3.4s 1614 srshl v3.4h, v3.4h, v29.4h // -intermediate_bits 1615 umin 
v3.4h, v3.4h, v31.4h 1616 st1 {v3.s}[0], [\dst], \d_strd 1617 st1 {v3.s}[1], [\ds2], \d_strd 1618 b.gt 2b 1619 ret 1620.endif 1621 162240: // 4xN h 1623 AARCH64_VALID_JUMP_TARGET 1624 add \xmx, \xmx, #2 1625 ld1 {v0.s}[0], [\xmx] 1626 sub \src, \src, #2 1627 add \ds2, \dst, \d_strd 1628 add \sr2, \src, \s_strd 1629 lsl \d_strd, \d_strd, #1 1630 lsl \s_strd, \s_strd, #1 1631 sxtl v0.8h, v0.8b 16324: 1633 ld1 {v16.8h}, [\src], \s_strd 1634 ld1 {v20.8h}, [\sr2], \s_strd 1635 ext v17.16b, v16.16b, v16.16b, #2 1636 ext v18.16b, v16.16b, v16.16b, #4 1637 ext v19.16b, v16.16b, v16.16b, #6 1638 ext v21.16b, v20.16b, v20.16b, #2 1639 ext v22.16b, v20.16b, v20.16b, #4 1640 ext v23.16b, v20.16b, v20.16b, #6 1641 subs \h, \h, #2 1642 smull v16.4s, v16.4h, v0.h[0] 1643 smlal v16.4s, v17.4h, v0.h[1] 1644 smlal v16.4s, v18.4h, v0.h[2] 1645 smlal v16.4s, v19.4h, v0.h[3] 1646 smull v20.4s, v20.4h, v0.h[0] 1647 smlal v20.4s, v21.4h, v0.h[1] 1648 smlal v20.4s, v22.4h, v0.h[2] 1649 smlal v20.4s, v23.4h, v0.h[3] 1650 srshl v16.4s, v16.4s, v30.4s // -(6-intermediate_bits) 1651 srshl v20.4s, v20.4s, v30.4s // -(6-intermediate_bits) 1652.ifc \type, put 1653 sqxtun v16.4h, v16.4s 1654 sqxtun2 v16.8h, v20.4s 1655 srshl v16.8h, v16.8h, v29.8h // -intermediate_bits 1656 umin v16.8h, v16.8h, v31.8h 1657.else 1658 uzp1 v16.8h, v16.8h, v20.8h // Same as xtn, xtn2 1659 sub v16.8h, v16.8h, v28.8h // PREP_BIAS 1660.endif 1661 st1 {v16.d}[0], [\dst], \d_strd 1662 st1 {v16.d}[1], [\ds2], \d_strd 1663 b.gt 4b 1664 ret 1665 166680: 1667160: 1668320: 1669640: 16701280: // 8xN, 16xN, 32xN, ... h 1671 AARCH64_VALID_JUMP_TARGET 1672 ld1 {v0.8b}, [\xmx] 1673 sub \src, \src, #6 1674 add \ds2, \dst, \d_strd 1675 add \sr2, \src, \s_strd 1676 lsl \s_strd, \s_strd, #1 1677 sxtl v0.8h, v0.8b 1678 1679 sub \s_strd, \s_strd, \w, uxtw #1 1680 sub \s_strd, \s_strd, #16 1681.ifc \type, put 1682 lsl \d_strd, \d_strd, #1 1683 sub \d_strd, \d_strd, \w, uxtw #1 1684.endif 168581: 1686 ld1 {v16.8h, v17.8h}, [\src], #32 1687 ld1 {v20.8h, v21.8h}, [\sr2], #32 1688 mov \mx, \w 1689 16908: 1691.ifc \taps, 6tap 1692 ext v24.16b, v16.16b, v17.16b, #2 1693 ext v25.16b, v20.16b, v21.16b, #2 1694 smull v18.4s, v24.4h, v0.h[1] 1695 smull2 v19.4s, v24.8h, v0.h[1] 1696 smull v22.4s, v25.4h, v0.h[1] 1697 smull2 v23.4s, v25.8h, v0.h[1] 1698.irpc i, 23456 1699 ext v24.16b, v16.16b, v17.16b, #(2*\i) 1700 ext v25.16b, v20.16b, v21.16b, #(2*\i) 1701 smlal v18.4s, v24.4h, v0.h[\i] 1702 smlal2 v19.4s, v24.8h, v0.h[\i] 1703 smlal v22.4s, v25.4h, v0.h[\i] 1704 smlal2 v23.4s, v25.8h, v0.h[\i] 1705.endr 1706.else // 8tap 1707 smull v18.4s, v16.4h, v0.h[0] 1708 smull2 v19.4s, v16.8h, v0.h[0] 1709 smull v22.4s, v20.4h, v0.h[0] 1710 smull2 v23.4s, v20.8h, v0.h[0] 1711.irpc i, 1234567 1712 ext v24.16b, v16.16b, v17.16b, #(2*\i) 1713 ext v25.16b, v20.16b, v21.16b, #(2*\i) 1714 smlal v18.4s, v24.4h, v0.h[\i] 1715 smlal2 v19.4s, v24.8h, v0.h[\i] 1716 smlal v22.4s, v25.4h, v0.h[\i] 1717 smlal2 v23.4s, v25.8h, v0.h[\i] 1718.endr 1719.endif 1720 subs \mx, \mx, #8 1721 srshl v18.4s, v18.4s, v30.4s // -(6-intermediate_bits) 1722 srshl v19.4s, v19.4s, v30.4s // -(6-intermediate_bits) 1723 srshl v22.4s, v22.4s, v30.4s // -(6-intermediate_bits) 1724 srshl v23.4s, v23.4s, v30.4s // -(6-intermediate_bits) 1725.ifc \type, put 1726 sqxtun v18.4h, v18.4s 1727 sqxtun2 v18.8h, v19.4s 1728 sqxtun v22.4h, v22.4s 1729 sqxtun2 v22.8h, v23.4s 1730 srshl v18.8h, v18.8h, v29.8h // -intermediate_bits 1731 srshl v22.8h, v22.8h, v29.8h // -intermediate_bits 1732 umin v18.8h, v18.8h, v31.8h 1733 umin 
v22.8h, v22.8h, v31.8h 1734.else 1735 uzp1 v18.8h, v18.8h, v19.8h // Same as xtn, xtn2 1736 uzp1 v22.8h, v22.8h, v23.8h // Ditto 1737 sub v18.8h, v18.8h, v28.8h // PREP_BIAS 1738 sub v22.8h, v22.8h, v28.8h // PREP_BIAS 1739.endif 1740 st1 {v18.8h}, [\dst], #16 1741 st1 {v22.8h}, [\ds2], #16 1742 b.le 9f 1743 1744 mov v16.16b, v17.16b 1745 mov v20.16b, v21.16b 1746 ld1 {v17.8h}, [\src], #16 1747 ld1 {v21.8h}, [\sr2], #16 1748 b 8b 1749 17509: 1751 add \dst, \dst, \d_strd 1752 add \ds2, \ds2, \d_strd 1753 add \src, \src, \s_strd 1754 add \sr2, \sr2, \s_strd 1755 1756 subs \h, \h, #2 1757 b.gt 81b 1758 ret 1759 1760L(\type\()_\taps\()_h_tbl): 1761 .hword L(\type\()_\taps\()_h_tbl) - 1280b 1762 .hword L(\type\()_\taps\()_h_tbl) - 640b 1763 .hword L(\type\()_\taps\()_h_tbl) - 320b 1764 .hword L(\type\()_\taps\()_h_tbl) - 160b 1765 .hword L(\type\()_\taps\()_h_tbl) - 80b 1766 .hword L(\type\()_\taps\()_h_tbl) - 40b 1767 .hword L(\type\()_\taps\()_h_tbl) - 20b 1768 .hword 0 1769 1770 1771L(\type\()_\taps\()_v): 1772 cmp \h, #4 1773 ubfx w10, \my, #7, #7 1774 and \my, \my, #0x7f 1775 b.le 4f 1776 mov \my, w10 17774: 1778 add \xmy, x11, \my, uxtw #3 1779 1780.ifc \type, prep 1781 dup v30.4s, w12 // 6 - intermediate_bits 1782 movi v29.8h, #(PREP_BIAS >> 8), lsl #8 1783.endif 1784 adr x10, L(\type\()_\taps\()_v_tbl) 1785 ldrh w9, [x10, x9, lsl #1] 1786.ifc \type, prep 1787 neg v30.4s, v30.4s // -(6-intermediate_bits) 1788.endif 1789 sub x10, x10, w9, uxtw 1790 br x10 1791 179220: // 2xN v 1793 AARCH64_VALID_JUMP_TARGET 1794.ifc \type, put 1795 b.gt 28f 1796 1797 cmp \h, #2 1798 add \xmy, \xmy, #2 1799 ld1 {v0.s}[0], [\xmy] 1800 sub \src, \src, \s_strd 1801 add \ds2, \dst, \d_strd 1802 add \sr2, \src, \s_strd 1803 lsl \s_strd, \s_strd, #1 1804 lsl \d_strd, \d_strd, #1 1805 sxtl v0.8h, v0.8b 1806 1807 // 2x2 v 1808 load_s \src, \sr2, \s_strd, v1, v2, v3, v4, v5 1809 interleave_1_s v1, v2, v3, v4, v5 1810 b.gt 24f 1811 smull_smlal_4tap v6, v1, v2, v3, v4 1812 sqrshrun_h 6, v6 1813 umin_h v31, .8h, v6 1814 st_s \d_strd, v6, 2 1815 ret 1816 181724: // 2x4 v 1818 load_s \sr2, \src, \s_strd, v6, v7 1819 interleave_1_s v5, v6, v7 1820 smull_smlal_4tap v16, v1, v2, v3, v4 1821 smull_smlal_4tap v17, v3, v4, v5, v6 1822 sqrshrun_h 6, v16, v17 1823 umin_h v31, .8h, v16 1824 st_s \d_strd, v16, 4 1825 ret 1826 182728: // 2x6, 2x8, 2x12, 2x16 v 1828 ld1 {v0.8b}, [\xmy] 1829 sub \sr2, \src, \s_strd, lsl #1 1830 add \ds2, \dst, \d_strd 1831 sub \src, \sr2, \s_strd 1832 lsl \d_strd, \d_strd, #1 1833 lsl \s_strd, \s_strd, #1 1834 sxtl v0.8h, v0.8b 1835 1836 load_s \src, \sr2, \s_strd, v1, v2, v3, v4, v5, v6, v7 1837 interleave_1_s v1, v2, v3, v4, v5 1838 interleave_1_s v5, v6, v7 1839216: 1840 subs \h, \h, #4 1841 load_s \sr2, \src, \s_strd, v16, v17, v18, v19 1842 interleave_1_s v7, v16, v17, v18, v19 1843 smull_smlal_\taps v24, v1, v2, v3, v4, v5, v6, v7, v16 1844 smull_smlal_\taps v25, v3, v4, v5, v6, v7, v16, v17, v18 1845 sqrshrun_h 6, v24, v25 1846 umin_h v31, .8h, v24 1847 st_s \d_strd, v24, 4 1848 b.le 0f 1849 cmp \h, #2 1850 mov v1.16b, v5.16b 1851 mov v2.16b, v6.16b 1852 mov v3.16b, v7.16b 1853 mov v4.16b, v16.16b 1854 mov v5.16b, v17.16b 1855 mov v6.16b, v18.16b 1856 mov v7.16b, v19.16b 1857 b.eq 26f 1858 b 216b 185926: 1860 load_s \sr2, \src, \s_strd, v16, v17 1861 interleave_1_s v7, v16, v17 1862 smull_smlal_\taps v24, v1, v2, v3, v4, v5, v6, v7, v16 1863 sqrshrun_h 6, v24 1864 umin_h v31, .4h, v24 1865 st_s \d_strd, v24, 2 18660: 1867 ret 1868.endif 1869 187040: 1871 AARCH64_VALID_JUMP_TARGET 1872 b.gt 
480f 1873 1874 // 4x2, 4x4 v 1875 cmp \h, #2 1876 add \xmy, \xmy, #2 1877 ld1 {v0.s}[0], [\xmy] 1878 sub \src, \src, \s_strd 1879 add \ds2, \dst, \d_strd 1880 add \sr2, \src, \s_strd 1881 lsl \s_strd, \s_strd, #1 1882 lsl \d_strd, \d_strd, #1 1883 sxtl v0.8h, v0.8b 1884 1885 load_4h \src, \sr2, \s_strd, v1, v2, v3, v4, v5 1886 smull_smlal_4tap v6, v1, v2, v3, v4 1887 smull_smlal_4tap v7, v2, v3, v4, v5 1888 shift_store_4 \type, \d_strd, v6, v7 1889 b.le 0f 1890 load_4h \sr2, \src, \s_strd, v6, v7 1891 smull_smlal_4tap v1, v3, v4, v5, v6 1892 smull_smlal_4tap v2, v4, v5, v6, v7 1893 shift_store_4 \type, \d_strd, v1, v2 18940: 1895 ret 1896 1897480: // 4x6, 4x8, 4x12, 4x16 v 1898 ld1 {v0.8b}, [\xmy] 1899 sub \sr2, \src, \s_strd, lsl #1 1900 add \ds2, \dst, \d_strd 1901 sub \src, \sr2, \s_strd 1902 lsl \s_strd, \s_strd, #1 1903 lsl \d_strd, \d_strd, #1 1904 sxtl v0.8h, v0.8b 1905 1906 load_4h \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22 1907 190848: 1909 subs \h, \h, #4 1910 load_4h \sr2, \src, \s_strd, v23, v24, v25, v26 1911 smull_smlal_\taps v1, v16, v17, v18, v19, v20, v21, v22, v23 1912 smull_smlal_\taps v2, v17, v18, v19, v20, v21, v22, v23, v24 1913 smull_smlal_\taps v3, v18, v19, v20, v21, v22, v23, v24, v25 1914 smull_smlal_\taps v4, v19, v20, v21, v22, v23, v24, v25, v26 1915 shift_store_4 \type, \d_strd, v1, v2, v3, v4 1916 b.le 0f 1917 cmp \h, #2 1918 mov v16.8b, v20.8b 1919 mov v17.8b, v21.8b 1920 mov v18.8b, v22.8b 1921 mov v19.8b, v23.8b 1922 mov v20.8b, v24.8b 1923 mov v21.8b, v25.8b 1924 mov v22.8b, v26.8b 1925 b.eq 46f 1926 b 48b 192746: 1928 load_4h \sr2, \src, \s_strd, v23, v24 1929 smull_smlal_\taps v1, v16, v17, v18, v19, v20, v21, v22, v23 1930 smull_smlal_\taps v2, v17, v18, v19, v20, v21, v22, v23, v24 1931 shift_store_4 \type, \d_strd, v1, v2 19320: 1933 ret 1934 193580: 1936 AARCH64_VALID_JUMP_TARGET 1937 b.gt 880f 1938 1939 // 8x2, 8x4 v 1940 cmp \h, #2 1941 add \xmy, \xmy, #2 1942 ld1 {v0.s}[0], [\xmy] 1943 sub \src, \src, \s_strd 1944 add \ds2, \dst, \d_strd 1945 add \sr2, \src, \s_strd 1946 lsl \s_strd, \s_strd, #1 1947 lsl \d_strd, \d_strd, #1 1948 sxtl v0.8h, v0.8b 1949 1950 load_8h \src, \sr2, \s_strd, v1, v2, v3, v4, v5 1951 smull_smlal_4tap v16, v1, v2, v3, v4 1952 smull2_smlal2_4tap v17, v1, v2, v3, v4 1953 smull_smlal_4tap v18, v2, v3, v4, v5 1954 smull2_smlal2_4tap v19, v2, v3, v4, v5 1955 shift_store_8 \type, \d_strd, v16, v17, v18, v19 1956 b.le 0f 1957 load_8h \sr2, \src, \s_strd, v6, v7 1958 smull_smlal_4tap v16, v3, v4, v5, v6 1959 smull2_smlal2_4tap v17, v3, v4, v5, v6 1960 smull_smlal_4tap v18, v4, v5, v6, v7 1961 smull2_smlal2_4tap v19, v4, v5, v6, v7 1962 shift_store_8 \type, \d_strd, v16, v17, v18, v19 19630: 1964 ret 1965 1966880: // 8x6, 8x8, 8x16, 8x32 v 19671680: // 16x8, 16x16, ... 1968320: // 32x8, 32x16, ... 
1969640: 19701280: 1971 AARCH64_VALID_JUMP_TARGET 1972 ld1 {v0.8b}, [\xmy] 1973 sub \src, \src, \s_strd 1974 sub \src, \src, \s_strd, lsl #1 1975 sxtl v0.8h, v0.8b 1976 mov \my, \h 1977168: 1978 add \ds2, \dst, \d_strd 1979 add \sr2, \src, \s_strd 1980 lsl \s_strd, \s_strd, #1 1981 lsl \d_strd, \d_strd, #1 1982 1983 load_8h \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22 1984 198588: 1986 subs \h, \h, #2 1987 load_8h \sr2, \src, \s_strd, v23, v24 1988 smull_smlal_\taps v1, v16, v17, v18, v19, v20, v21, v22, v23 1989 smull2_smlal2_\taps v2, v16, v17, v18, v19, v20, v21, v22, v23 1990 smull_smlal_\taps v3, v17, v18, v19, v20, v21, v22, v23, v24 1991 smull2_smlal2_\taps v4, v17, v18, v19, v20, v21, v22, v23, v24 1992 shift_store_8 \type, \d_strd, v1, v2, v3, v4 1993 b.le 9f 1994 subs \h, \h, #2 1995 load_8h \sr2, \src, \s_strd, v25, v26 1996 smull_smlal_\taps v1, v18, v19, v20, v21, v22, v23, v24, v25 1997 smull2_smlal2_\taps v2, v18, v19, v20, v21, v22, v23, v24, v25 1998 smull_smlal_\taps v3, v19, v20, v21, v22, v23, v24, v25, v26 1999 smull2_smlal2_\taps v4, v19, v20, v21, v22, v23, v24, v25, v26 2000 shift_store_8 \type, \d_strd, v1, v2, v3, v4 2001 b.le 9f 2002 mov v16.16b, v20.16b 2003 mov v17.16b, v21.16b 2004 mov v18.16b, v22.16b 2005 mov v19.16b, v23.16b 2006 mov v20.16b, v24.16b 2007 mov v21.16b, v25.16b 2008 mov v22.16b, v26.16b 2009 b 88b 20109: 2011 subs \w, \w, #8 2012 b.le 0f 2013 asr \s_strd, \s_strd, #1 2014 asr \d_strd, \d_strd, #1 2015 msub \src, \s_strd, \xmy, \src 2016 msub \dst, \d_strd, \xmy, \dst 2017 sub \src, \src, \s_strd, lsl #3 2018 mov \h, \my 2019 add \src, \src, #16 2020 add \dst, \dst, #16 2021 b 168b 20220: 2023 ret 2024 2025160: 2026 AARCH64_VALID_JUMP_TARGET 2027 b.gt 1680b 2028 2029 // 16x2, 16x4 v 2030 add \xmy, \xmy, #2 2031 ld1 {v0.s}[0], [\xmy] 2032 sub \src, \src, \s_strd 2033 sxtl v0.8h, v0.8b 2034 2035 load_16h \src, \src, \s_strd, v16, v17, v18, v19, v20, v21 203616: 2037 load_16h \src, \src, \s_strd, v22, v23 2038 subs \h, \h, #1 2039 smull_smlal_4tap v1, v16, v18, v20, v22 2040 smull2_smlal2_4tap v2, v16, v18, v20, v22 2041 smull_smlal_4tap v3, v17, v19, v21, v23 2042 smull2_smlal2_4tap v4, v17, v19, v21, v23 2043 shift_store_16 \type, \d_strd, x0, v1, v2, v3, v4 2044 b.le 0f 2045 mov v16.16b, v18.16b 2046 mov v17.16b, v19.16b 2047 mov v18.16b, v20.16b 2048 mov v19.16b, v21.16b 2049 mov v20.16b, v22.16b 2050 mov v21.16b, v23.16b 2051 b 16b 20520: 2053 ret 2054 2055L(\type\()_\taps\()_v_tbl): 2056 .hword L(\type\()_\taps\()_v_tbl) - 1280b 2057 .hword L(\type\()_\taps\()_v_tbl) - 640b 2058 .hword L(\type\()_\taps\()_v_tbl) - 320b 2059 .hword L(\type\()_\taps\()_v_tbl) - 160b 2060 .hword L(\type\()_\taps\()_v_tbl) - 80b 2061 .hword L(\type\()_\taps\()_v_tbl) - 40b 2062 .hword L(\type\()_\taps\()_v_tbl) - 20b 2063 .hword 0 2064 2065L(\type\()_\taps\()_hv): 2066 cmp \h, #4 2067 ubfx w10, \my, #7, #7 2068 and \my, \my, #0x7f 2069 b.le 4f 2070 mov \my, w10 20714: 2072 add \xmy, x11, \my, uxtw #3 2073 2074 adr x10, L(\type\()_\taps\()_hv_tbl) 2075 dup v30.4s, w12 // 6 - intermediate_bits 2076 ldrh w9, [x10, x9, lsl #1] 2077 neg v30.4s, v30.4s // -(6-intermediate_bits) 2078.ifc \type, put 2079 dup v29.4s, w13 // 6 + intermediate_bits 2080.else 2081 movi v29.8h, #(PREP_BIAS >> 8), lsl #8 2082.endif 2083 sub x10, x10, w9, uxtw 2084.ifc \type, put 2085 neg v29.4s, v29.4s // -(6+intermediate_bits) 2086.endif 2087 br x10 2088 208920: 2090 AARCH64_VALID_JUMP_TARGET 2091.ifc \type, put 2092 add \xmx, \xmx, #2 2093 ld1 {v0.s}[0], [\xmx] 2094 b.gt 280f 2095 
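        // h <= 4: fall through to the 2x2/2x4 case, which uses only the four
        // middle taps of the vertical filter; the +2 offset below skips the
        // two outer coefficients so that only 4 of the 8 bytes are loaded.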
add \xmy, \xmy, #2 2096 ld1 {v1.s}[0], [\xmy] 2097 2098 // 2x2, 2x4 hv 2099 sub \sr2, \src, #2 2100 sub \src, \sr2, \s_strd 2101 add \ds2, \dst, \d_strd 2102 lsl \s_strd, \s_strd, #1 2103 lsl \d_strd, \d_strd, #1 2104 sxtl v0.8h, v0.8b 2105 sxtl v1.8h, v1.8b 2106 mov x15, x30 2107 2108 ld1 {v27.8h}, [\src], \s_strd 2109 ext v28.16b, v27.16b, v27.16b, #2 2110 smull v27.4s, v27.4h, v0.4h 2111 smull v28.4s, v28.4h, v0.4h 2112 addp v27.4s, v27.4s, v28.4s 2113 addp v16.4s, v27.4s, v27.4s 2114 srshl v16.2s, v16.2s, v30.2s // -(6-intermediate_bits) 2115 bl L(\type\()_\taps\()_filter_2) 2116 // The intermediates from the horizontal pass fit in 16 bit without 2117 // any bias; we could just as well keep them as .4s, but narrowing 2118 // them to .4h gives a significant speedup on out of order cores 2119 // (at the cost of a smaller slowdown on in-order cores such as A53). 2120 xtn v16.4h, v16.4s 2121 2122 trn1 v16.2s, v16.2s, v24.2s 2123 mov v17.8b, v24.8b 2124 21252: 2126 bl L(\type\()_\taps\()_filter_2) 2127 2128 ext v18.8b, v17.8b, v24.8b, #4 2129 smull v2.4s, v16.4h, v1.h[0] 2130 smlal v2.4s, v17.4h, v1.h[1] 2131 smlal v2.4s, v18.4h, v1.h[2] 2132 smlal v2.4s, v24.4h, v1.h[3] 2133 2134 srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits) 2135 sqxtun v2.4h, v2.4s 2136 umin v2.4h, v2.4h, v31.4h 2137 subs \h, \h, #2 2138 st1 {v2.s}[0], [\dst], \d_strd 2139 st1 {v2.s}[1], [\ds2], \d_strd 2140 b.le 0f 2141 mov v16.8b, v18.8b 2142 mov v17.8b, v24.8b 2143 b 2b 2144 2145280: // 2x8, 2x16, 2x32 hv 2146 ld1 {v1.8b}, [\xmy] 2147 sub \src, \src, #2 2148 sub \sr2, \src, \s_strd, lsl #1 2149 sub \src, \sr2, \s_strd 2150 add \ds2, \dst, \d_strd 2151 lsl \s_strd, \s_strd, #1 2152 lsl \d_strd, \d_strd, #1 2153 sxtl v0.8h, v0.8b 2154 sxtl v1.8h, v1.8b 2155 mov x15, x30 2156 2157 ld1 {v27.8h}, [\src], \s_strd 2158 ext v28.16b, v27.16b, v27.16b, #2 2159 smull v27.4s, v27.4h, v0.4h 2160 smull v28.4s, v28.4h, v0.4h 2161 addp v27.4s, v27.4s, v28.4s 2162 addp v16.4s, v27.4s, v27.4s 2163 srshl v16.2s, v16.2s, v30.2s // -(6-intermediate_bits) 2164 // The intermediates from the horizontal pass fit in 16 bit without 2165 // any bias; we could just as well keep them as .4s, but narrowing 2166 // them to .4h gives a significant speedup on out of order cores 2167 // (at the cost of a smaller slowdown on in-order cores such as A53). 
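        // For reference: intermediate_bits = clz(bitdepth_max) - 18, i.e. 4
        // for 10 bpc (bitdepth_max = 1023, clz = 22) and 2 for 12 bpc
        // (bitdepth_max = 4095, clz = 20), so the horizontal stage rounds by
        // 6 - intermediate_bits = 2 or 4 bits and the final vertical stage
        // (for put) by 6 + intermediate_bits = 10 or 8 bits.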
2168 2169 bl L(\type\()_\taps\()_filter_2) 2170 xtn v16.4h, v16.4s 2171 trn1 v16.2s, v16.2s, v24.2s 2172 mov v17.8b, v24.8b 2173 bl L(\type\()_\taps\()_filter_2) 2174 ext v18.8b, v17.8b, v24.8b, #4 2175 mov v19.8b, v24.8b 2176 bl L(\type\()_\taps\()_filter_2) 2177 ext v20.8b, v19.8b, v24.8b, #4 2178 mov v21.8b, v24.8b 2179 218028: 2181 bl L(\type\()_\taps\()_filter_2) 2182 ext v22.8b, v21.8b, v24.8b, #4 2183.ifc \taps, 6tap 2184 smull v3.4s, v17.4h, v1.h[1] 2185 smlal v3.4s, v18.4h, v1.h[2] 2186 smlal v3.4s, v19.4h, v1.h[3] 2187 smlal v3.4s, v20.4h, v1.h[4] 2188 smlal v3.4s, v21.4h, v1.h[5] 2189 smlal v3.4s, v22.4h, v1.h[6] 2190.else // 8tap 2191 smull v3.4s, v16.4h, v1.h[0] 2192 smlal v3.4s, v17.4h, v1.h[1] 2193 smlal v3.4s, v18.4h, v1.h[2] 2194 smlal v3.4s, v19.4h, v1.h[3] 2195 smlal v3.4s, v20.4h, v1.h[4] 2196 smlal v3.4s, v21.4h, v1.h[5] 2197 smlal v3.4s, v22.4h, v1.h[6] 2198 smlal v3.4s, v24.4h, v1.h[7] 2199.endif 2200 2201 srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits) 2202 sqxtun v3.4h, v3.4s 2203 umin v3.4h, v3.4h, v31.4h 2204 subs \h, \h, #2 2205 st1 {v3.s}[0], [\dst], \d_strd 2206 st1 {v3.s}[1], [\ds2], \d_strd 2207 b.le 0f 2208 mov v16.8b, v18.8b 2209 mov v17.8b, v19.8b 2210 mov v18.8b, v20.8b 2211 mov v19.8b, v21.8b 2212 mov v20.8b, v22.8b 2213 mov v21.8b, v24.8b 2214 b 28b 2215 22160: 2217 ret x15 2218 2219L(\type\()_\taps\()_filter_2): 2220 ld1 {v25.8h}, [\sr2], \s_strd 2221 ld1 {v27.8h}, [\src], \s_strd 2222 ext v26.16b, v25.16b, v25.16b, #2 2223 ext v28.16b, v27.16b, v27.16b, #2 2224 trn1 v24.2s, v25.2s, v27.2s 2225 trn2 v27.2s, v25.2s, v27.2s 2226 trn1 v25.2s, v26.2s, v28.2s 2227 trn2 v28.2s, v26.2s, v28.2s 2228 smull v24.4s, v24.4h, v0.h[0] 2229 smlal v24.4s, v25.4h, v0.h[1] 2230 smlal v24.4s, v27.4h, v0.h[2] 2231 smlal v24.4s, v28.4h, v0.h[3] 2232 srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits) 2233 xtn v24.4h, v24.4s 2234 ret 2235.endif 2236 223740: 2238 AARCH64_VALID_JUMP_TARGET 2239 add \xmx, \xmx, #2 2240 ld1 {v0.s}[0], [\xmx] 2241 b.gt 480f 2242 add \xmy, \xmy, #2 2243 ld1 {v1.s}[0], [\xmy] 2244 sub \sr2, \src, #2 2245 sub \src, \sr2, \s_strd 2246 add \ds2, \dst, \d_strd 2247 lsl \s_strd, \s_strd, #1 2248 lsl \d_strd, \d_strd, #1 2249 sxtl v0.8h, v0.8b 2250 sxtl v1.8h, v1.8b 2251 mov x15, x30 2252 2253 // 4x2, 4x4 hv 2254 ld1 {v25.8h}, [\src], \s_strd 2255 ext v26.16b, v25.16b, v25.16b, #2 2256 ext v27.16b, v25.16b, v25.16b, #4 2257 ext v28.16b, v25.16b, v25.16b, #6 2258 smull v25.4s, v25.4h, v0.h[0] 2259 smlal v25.4s, v26.4h, v0.h[1] 2260 smlal v25.4s, v27.4h, v0.h[2] 2261 smlal v25.4s, v28.4h, v0.h[3] 2262 srshl v16.4s, v25.4s, v30.4s // -(6-intermediate_bits) 2263 // The intermediates from the horizontal pass fit in 16 bit without 2264 // any bias; we could just as well keep them as .4s, but narrowing 2265 // them to .4h gives a significant speedup on out of order cores 2266 // (at the cost of a smaller slowdown on in-order cores such as A53). 
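        // Each call to the filter_4 helper below consumes one row from \src
        // and one from \sr2 and returns the two horizontally filtered rows,
        // already narrowed to .4h, in v24 and v25.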
2267 xtn v16.4h, v16.4s 2268 2269 bl L(\type\()_\taps\()_filter_4) 2270 mov v17.8b, v24.8b 2271 mov v18.8b, v25.8b 2272 22734: 2274 bl L(\type\()_\taps\()_filter_4) 2275 smull v2.4s, v16.4h, v1.h[0] 2276 smlal v2.4s, v17.4h, v1.h[1] 2277 smlal v2.4s, v18.4h, v1.h[2] 2278 smlal v2.4s, v24.4h, v1.h[3] 2279 smull v3.4s, v17.4h, v1.h[0] 2280 smlal v3.4s, v18.4h, v1.h[1] 2281 smlal v3.4s, v24.4h, v1.h[2] 2282 smlal v3.4s, v25.4h, v1.h[3] 2283.ifc \type, put 2284 srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits) 2285 srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits) 2286 sqxtun v2.4h, v2.4s 2287 sqxtun2 v2.8h, v3.4s 2288 umin v2.8h, v2.8h, v31.8h 2289.else 2290 rshrn v2.4h, v2.4s, #6 2291 rshrn2 v2.8h, v3.4s, #6 2292 sub v2.8h, v2.8h, v29.8h // PREP_BIAS 2293.endif 2294 subs \h, \h, #2 2295 2296 st1 {v2.d}[0], [\dst], \d_strd 2297 st1 {v2.d}[1], [\ds2], \d_strd 2298 b.le 0f 2299 mov v16.8b, v18.8b 2300 mov v17.8b, v24.8b 2301 mov v18.8b, v25.8b 2302 b 4b 2303 2304480: // 4x8, 4x16, 4x32 hv 2305 ld1 {v1.8b}, [\xmy] 2306 sub \src, \src, #2 2307.ifc \taps, 6tap 2308 sub \sr2, \src, \s_strd 2309 sub \src, \src, \s_strd, lsl #1 2310.else 2311 sub \sr2, \src, \s_strd, lsl #1 2312 sub \src, \sr2, \s_strd 2313.endif 2314 add \ds2, \dst, \d_strd 2315 lsl \s_strd, \s_strd, #1 2316 lsl \d_strd, \d_strd, #1 2317 sxtl v0.8h, v0.8b 2318 sxtl v1.8h, v1.8b 2319 mov x15, x30 2320 2321 ld1 {v25.8h}, [\src], \s_strd 2322 ext v26.16b, v25.16b, v25.16b, #2 2323 ext v27.16b, v25.16b, v25.16b, #4 2324 ext v28.16b, v25.16b, v25.16b, #6 2325 smull v25.4s, v25.4h, v0.h[0] 2326 smlal v25.4s, v26.4h, v0.h[1] 2327 smlal v25.4s, v27.4h, v0.h[2] 2328 smlal v25.4s, v28.4h, v0.h[3] 2329 srshl v16.4s, v25.4s, v30.4s // -(6-intermediate_bits) 2330 // The intermediates from the horizontal pass fit in 16 bit without 2331 // any bias; we could just as well keep them as .4s, but narrowing 2332 // them to .4h gives a significant speedup on out of order cores 2333 // (at the cost of a smaller slowdown on in-order cores such as A53). 
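        // With the 6-tap vertical filter only six rows need to be kept live,
        // so the first horizontal result is placed directly in v18 and the
        // vertical accumulation starts at coefficient v1.h[1]; the 8-tap
        // variant additionally fills v16/v17 and uses all eight coefficients.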
2334.ifc \taps, 6tap 2335 xtn v18.4h, v16.4s 2336.else 2337 xtn v16.4h, v16.4s 2338 2339 bl L(\type\()_\taps\()_filter_4) 2340 mov v17.8b, v24.8b 2341 mov v18.8b, v25.8b 2342.endif 2343 bl L(\type\()_\taps\()_filter_4) 2344 mov v19.8b, v24.8b 2345 mov v20.8b, v25.8b 2346 bl L(\type\()_\taps\()_filter_4) 2347 mov v21.8b, v24.8b 2348 mov v22.8b, v25.8b 2349 235048: 2351 bl L(\type\()_\taps\()_filter_4) 2352.ifc \taps, 6tap 2353 smull v3.4s, v18.4h, v1.h[1] 2354 smlal v3.4s, v19.4h, v1.h[2] 2355 smlal v3.4s, v20.4h, v1.h[3] 2356 smlal v3.4s, v21.4h, v1.h[4] 2357 smlal v3.4s, v22.4h, v1.h[5] 2358 smlal v3.4s, v24.4h, v1.h[6] 2359 smull v4.4s, v19.4h, v1.h[1] 2360 smlal v4.4s, v20.4h, v1.h[2] 2361 smlal v4.4s, v21.4h, v1.h[3] 2362 smlal v4.4s, v22.4h, v1.h[4] 2363 smlal v4.4s, v24.4h, v1.h[5] 2364 smlal v4.4s, v25.4h, v1.h[6] 2365.else // 8tap 2366 smull v3.4s, v16.4h, v1.h[0] 2367 smlal v3.4s, v17.4h, v1.h[1] 2368 smlal v3.4s, v18.4h, v1.h[2] 2369 smlal v3.4s, v19.4h, v1.h[3] 2370 smlal v3.4s, v20.4h, v1.h[4] 2371 smlal v3.4s, v21.4h, v1.h[5] 2372 smlal v3.4s, v22.4h, v1.h[6] 2373 smlal v3.4s, v24.4h, v1.h[7] 2374 smull v4.4s, v17.4h, v1.h[0] 2375 smlal v4.4s, v18.4h, v1.h[1] 2376 smlal v4.4s, v19.4h, v1.h[2] 2377 smlal v4.4s, v20.4h, v1.h[3] 2378 smlal v4.4s, v21.4h, v1.h[4] 2379 smlal v4.4s, v22.4h, v1.h[5] 2380 smlal v4.4s, v24.4h, v1.h[6] 2381 smlal v4.4s, v25.4h, v1.h[7] 2382.endif 2383.ifc \type, put 2384 srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits) 2385 srshl v4.4s, v4.4s, v29.4s // -(6+intermediate_bits) 2386 sqxtun v3.4h, v3.4s 2387 sqxtun2 v3.8h, v4.4s 2388 umin v3.8h, v3.8h, v31.8h 2389.else 2390 rshrn v3.4h, v3.4s, #6 2391 rshrn2 v3.8h, v4.4s, #6 2392 sub v3.8h, v3.8h, v29.8h // PREP_BIAS 2393.endif 2394 subs \h, \h, #2 2395 st1 {v3.d}[0], [\dst], \d_strd 2396 st1 {v3.d}[1], [\ds2], \d_strd 2397 b.le 0f 2398.ifc \taps, 8tap 2399 mov v16.8b, v18.8b 2400 mov v17.8b, v19.8b 2401.endif 2402 mov v18.8b, v20.8b 2403 mov v19.8b, v21.8b 2404 mov v20.8b, v22.8b 2405 mov v21.8b, v24.8b 2406 mov v22.8b, v25.8b 2407 b 48b 24080: 2409 ret x15 2410 2411L(\type\()_\taps\()_filter_4): 2412 ld1 {v24.8h}, [\sr2], \s_strd 2413 ld1 {v25.8h}, [\src], \s_strd 2414 ext v26.16b, v24.16b, v24.16b, #2 2415 ext v27.16b, v24.16b, v24.16b, #4 2416 ext v28.16b, v24.16b, v24.16b, #6 2417 smull v24.4s, v24.4h, v0.h[0] 2418 smlal v24.4s, v26.4h, v0.h[1] 2419 smlal v24.4s, v27.4h, v0.h[2] 2420 smlal v24.4s, v28.4h, v0.h[3] 2421 ext v26.16b, v25.16b, v25.16b, #2 2422 ext v27.16b, v25.16b, v25.16b, #4 2423 ext v28.16b, v25.16b, v25.16b, #6 2424 smull v25.4s, v25.4h, v0.h[0] 2425 smlal v25.4s, v26.4h, v0.h[1] 2426 smlal v25.4s, v27.4h, v0.h[2] 2427 smlal v25.4s, v28.4h, v0.h[3] 2428 srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits) 2429 srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits) 2430 xtn v24.4h, v24.4s 2431 xtn v25.4h, v25.4s 2432 ret 2433 243480: 2435160: 2436320: 2437 AARCH64_VALID_JUMP_TARGET 2438 b.gt 880f 2439 add \xmy, \xmy, #2 2440 ld1 {v0.8b}, [\xmx] 2441 ld1 {v1.s}[0], [\xmy] 2442 sub \src, \src, #6 2443 sub \src, \src, \s_strd 2444 sxtl v0.8h, v0.8b 2445 sxtl v1.8h, v1.8b 2446 mov x15, x30 2447 mov \my, \h 2448 2449164: // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv 2450 add \ds2, \dst, \d_strd 2451 add \sr2, \src, \s_strd 2452 lsl \d_strd, \d_strd, #1 2453 lsl \s_strd, \s_strd, #1 2454 2455 ld1 {v27.8h, v28.8h}, [\src], \s_strd 2456 smull v24.4s, v27.4h, v0.h[0] 2457 smull2 v25.4s, v27.8h, v0.h[0] 2458.irpc i, 1234567 2459 ext v26.16b, v27.16b, v28.16b, #(2*\i) 2460 smlal v24.4s, v26.4h, 
v0.h[\i] 2461 smlal2 v25.4s, v26.8h, v0.h[\i] 2462.endr 2463 srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits) 2464 srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits) 2465 // The intermediates from the horizontal pass fit in 16 bit without 2466 // any bias; we could just as well keep them as .4s, but narrowing 2467 // them to .4h gives a significant speedup on out of order cores 2468 // (at the cost of a smaller slowdown on in-order cores such as A53), 2469 // and conserves register space (no need to clobber v8-v15). 2470 uzp1 v16.8h, v24.8h, v25.8h // Same as xtn, xtn2 2471 2472 bl L(\type\()_\taps\()_filter_8) 2473 mov v17.16b, v23.16b 2474 mov v18.16b, v24.16b 2475 24768: 2477 smull v2.4s, v16.4h, v1.h[0] 2478 smull2 v3.4s, v16.8h, v1.h[0] 2479 bl L(\type\()_\taps\()_filter_8) 2480 smull v4.4s, v17.4h, v1.h[0] 2481 smull2 v5.4s, v17.8h, v1.h[0] 2482 smlal v2.4s, v17.4h, v1.h[1] 2483 smlal2 v3.4s, v17.8h, v1.h[1] 2484 smlal v4.4s, v18.4h, v1.h[1] 2485 smlal2 v5.4s, v18.8h, v1.h[1] 2486 smlal v2.4s, v18.4h, v1.h[2] 2487 smlal2 v3.4s, v18.8h, v1.h[2] 2488 smlal v4.4s, v23.4h, v1.h[2] 2489 smlal2 v5.4s, v23.8h, v1.h[2] 2490 smlal v2.4s, v23.4h, v1.h[3] 2491 smlal2 v3.4s, v23.8h, v1.h[3] 2492 smlal v4.4s, v24.4h, v1.h[3] 2493 smlal2 v5.4s, v24.8h, v1.h[3] 2494.ifc \type, put 2495 srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits) 2496 srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits) 2497 srshl v4.4s, v4.4s, v29.4s // -(6+intermediate_bits) 2498 srshl v5.4s, v5.4s, v29.4s // -(6+intermediate_bits) 2499 sqxtun v2.4h, v2.4s 2500 sqxtun2 v2.8h, v3.4s 2501 sqxtun v3.4h, v4.4s 2502 sqxtun2 v3.8h, v5.4s 2503 umin v2.8h, v2.8h, v31.8h 2504 umin v3.8h, v3.8h, v31.8h 2505.else 2506 rshrn v2.4h, v2.4s, #6 2507 rshrn2 v2.8h, v3.4s, #6 2508 rshrn v3.4h, v4.4s, #6 2509 rshrn2 v3.8h, v5.4s, #6 2510 sub v2.8h, v2.8h, v29.8h // PREP_BIAS 2511 sub v3.8h, v3.8h, v29.8h // PREP_BIAS 2512.endif 2513 subs \h, \h, #2 2514 st1 {v2.8h}, [\dst], \d_strd 2515 st1 {v3.8h}, [\ds2], \d_strd 2516 b.le 9f 2517 mov v16.16b, v18.16b 2518 mov v17.16b, v23.16b 2519 mov v18.16b, v24.16b 2520 b 8b 25219: 2522 subs \w, \w, #8 2523 b.le 0f 2524 asr \s_strd, \s_strd, #1 2525 asr \d_strd, \d_strd, #1 2526 msub \src, \s_strd, \xmy, \src 2527 msub \dst, \d_strd, \xmy, \dst 2528 sub \src, \src, \s_strd, lsl #2 2529 mov \h, \my 2530 add \src, \src, #16 2531 add \dst, \dst, #16 2532 b 164b 2533 2534880: // 8x8, 8x16, ..., 16x8, ..., 32x8, ... 
hv 2535640: 25361280: 2537 AARCH64_VALID_JUMP_TARGET 2538 ld1 {v0.8b}, [\xmx] 2539 ld1 {v1.8b}, [\xmy] 2540 sub \src, \src, #6 2541.ifc \taps, 8tap 2542 sub \src, \src, \s_strd 2543.endif 2544 sub \src, \src, \s_strd, lsl #1 2545 sxtl v0.8h, v0.8b 2546 sxtl v1.8h, v1.8b 2547 mov x15, x30 2548 mov \my, \h 2549 2550168: 2551 add \ds2, \dst, \d_strd 2552 add \sr2, \src, \s_strd 2553 lsl \d_strd, \d_strd, #1 2554 lsl \s_strd, \s_strd, #1 2555 2556 ld1 {v27.8h, v28.8h}, [\src], \s_strd 2557.ifc \taps, 6tap 2558 ext v26.16b, v27.16b, v28.16b, #2 2559 smull v24.4s, v26.4h, v0.h[1] 2560 smull2 v25.4s, v26.8h, v0.h[1] 2561.irpc i, 23456 2562 ext v26.16b, v27.16b, v28.16b, #(2*\i) 2563 smlal v24.4s, v26.4h, v0.h[\i] 2564 smlal2 v25.4s, v26.8h, v0.h[\i] 2565.endr 2566.else // 8tap 2567 smull v24.4s, v27.4h, v0.h[0] 2568 smull2 v25.4s, v27.8h, v0.h[0] 2569.irpc i, 1234567 2570 ext v26.16b, v27.16b, v28.16b, #(2*\i) 2571 smlal v24.4s, v26.4h, v0.h[\i] 2572 smlal2 v25.4s, v26.8h, v0.h[\i] 2573.endr 2574.endif 2575 srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits) 2576 srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits) 2577 // The intermediates from the horizontal pass fit in 16 bit without 2578 // any bias; we could just as well keep them as .4s, but narrowing 2579 // them to .4h gives a significant speedup on out of order cores 2580 // (at the cost of a smaller slowdown on in-order cores such as A53), 2581 // and conserves register space (no need to clobber v8-v15). 2582.ifc \taps, 6tap 2583 uzp1 v18.8h, v24.8h, v25.8h // Same as xtn, xtn2 2584.else 2585 uzp1 v16.8h, v24.8h, v25.8h // Same as xtn, xtn2 2586 2587 bl L(\type\()_\taps\()_filter_8) 2588 mov v17.16b, v23.16b 2589 mov v18.16b, v24.16b 2590.endif 2591 bl L(\type\()_\taps\()_filter_8) 2592 mov v19.16b, v23.16b 2593 mov v20.16b, v24.16b 2594 bl L(\type\()_\taps\()_filter_8) 2595 mov v21.16b, v23.16b 2596 mov v22.16b, v24.16b 2597 259888: 2599.ifc \taps, 6tap 2600 smull v2.4s, v18.4h, v1.h[1] 2601 smull2 v3.4s, v18.8h, v1.h[1] 2602 bl L(\type\()_\taps\()_filter_8) 2603 smull v4.4s, v19.4h, v1.h[1] 2604 smull2 v5.4s, v19.8h, v1.h[1] 2605 smlal v2.4s, v19.4h, v1.h[2] 2606 smlal2 v3.4s, v19.8h, v1.h[2] 2607 smlal v4.4s, v20.4h, v1.h[2] 2608 smlal2 v5.4s, v20.8h, v1.h[2] 2609 smlal v2.4s, v20.4h, v1.h[3] 2610 smlal2 v3.4s, v20.8h, v1.h[3] 2611 smlal v4.4s, v21.4h, v1.h[3] 2612 smlal2 v5.4s, v21.8h, v1.h[3] 2613 smlal v2.4s, v21.4h, v1.h[4] 2614 smlal2 v3.4s, v21.8h, v1.h[4] 2615 smlal v4.4s, v22.4h, v1.h[4] 2616 smlal2 v5.4s, v22.8h, v1.h[4] 2617 smlal v2.4s, v22.4h, v1.h[5] 2618 smlal2 v3.4s, v22.8h, v1.h[5] 2619 smlal v4.4s, v23.4h, v1.h[5] 2620 smlal2 v5.4s, v23.8h, v1.h[5] 2621 smlal v2.4s, v23.4h, v1.h[6] 2622 smlal2 v3.4s, v23.8h, v1.h[6] 2623 smlal v4.4s, v24.4h, v1.h[6] 2624 smlal2 v5.4s, v24.8h, v1.h[6] 2625.else // 8tap 2626 smull v2.4s, v16.4h, v1.h[0] 2627 smull2 v3.4s, v16.8h, v1.h[0] 2628 bl L(\type\()_\taps\()_filter_8) 2629 smull v4.4s, v17.4h, v1.h[0] 2630 smull2 v5.4s, v17.8h, v1.h[0] 2631 smlal v2.4s, v17.4h, v1.h[1] 2632 smlal2 v3.4s, v17.8h, v1.h[1] 2633 smlal v4.4s, v18.4h, v1.h[1] 2634 smlal2 v5.4s, v18.8h, v1.h[1] 2635 smlal v2.4s, v18.4h, v1.h[2] 2636 smlal2 v3.4s, v18.8h, v1.h[2] 2637 smlal v4.4s, v19.4h, v1.h[2] 2638 smlal2 v5.4s, v19.8h, v1.h[2] 2639 smlal v2.4s, v19.4h, v1.h[3] 2640 smlal2 v3.4s, v19.8h, v1.h[3] 2641 smlal v4.4s, v20.4h, v1.h[3] 2642 smlal2 v5.4s, v20.8h, v1.h[3] 2643 smlal v2.4s, v20.4h, v1.h[4] 2644 smlal2 v3.4s, v20.8h, v1.h[4] 2645 smlal v4.4s, v21.4h, v1.h[4] 2646 smlal2 v5.4s, v21.8h, 
v1.h[4] 2647 smlal v2.4s, v21.4h, v1.h[5] 2648 smlal2 v3.4s, v21.8h, v1.h[5] 2649 smlal v4.4s, v22.4h, v1.h[5] 2650 smlal2 v5.4s, v22.8h, v1.h[5] 2651 smlal v2.4s, v22.4h, v1.h[6] 2652 smlal2 v3.4s, v22.8h, v1.h[6] 2653 smlal v4.4s, v23.4h, v1.h[6] 2654 smlal2 v5.4s, v23.8h, v1.h[6] 2655 smlal v2.4s, v23.4h, v1.h[7] 2656 smlal2 v3.4s, v23.8h, v1.h[7] 2657 smlal v4.4s, v24.4h, v1.h[7] 2658 smlal2 v5.4s, v24.8h, v1.h[7] 2659.endif 2660.ifc \type, put 2661 srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits) 2662 srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits) 2663 srshl v4.4s, v4.4s, v29.4s // -(6+intermediate_bits) 2664 srshl v5.4s, v5.4s, v29.4s // -(6+intermediate_bits) 2665 sqxtun v2.4h, v2.4s 2666 sqxtun2 v2.8h, v3.4s 2667 sqxtun v3.4h, v4.4s 2668 sqxtun2 v3.8h, v5.4s 2669 umin v2.8h, v2.8h, v31.8h 2670 umin v3.8h, v3.8h, v31.8h 2671.else 2672 rshrn v2.4h, v2.4s, #6 2673 rshrn2 v2.8h, v3.4s, #6 2674 rshrn v3.4h, v4.4s, #6 2675 rshrn2 v3.8h, v5.4s, #6 2676 sub v2.8h, v2.8h, v29.8h // PREP_BIAS 2677 sub v3.8h, v3.8h, v29.8h // PREP_BIAS 2678.endif 2679 subs \h, \h, #2 2680 st1 {v2.8h}, [\dst], \d_strd 2681 st1 {v3.8h}, [\ds2], \d_strd 2682 b.le 9f 2683.ifc \taps, 8tap 2684 mov v16.16b, v18.16b 2685 mov v17.16b, v19.16b 2686.endif 2687 mov v18.16b, v20.16b 2688 mov v19.16b, v21.16b 2689 mov v20.16b, v22.16b 2690 mov v21.16b, v23.16b 2691 mov v22.16b, v24.16b 2692 b 88b 26939: 2694 subs \w, \w, #8 2695 b.le 0f 2696 asr \s_strd, \s_strd, #1 2697 asr \d_strd, \d_strd, #1 2698 msub \src, \s_strd, \xmy, \src 2699 msub \dst, \d_strd, \xmy, \dst 2700 sub \src, \src, \s_strd, lsl #3 2701 mov \h, \my 2702 add \src, \src, #16 2703 add \dst, \dst, #16 2704.ifc \taps, 6tap 2705 add \src, \src, \s_strd, lsl #1 2706.endif 2707 b 168b 27080: 2709 ret x15 2710 2711L(\type\()_\taps\()_filter_8): 2712 ld1 {v4.8h, v5.8h}, [\sr2], \s_strd 2713 ld1 {v6.8h, v7.8h}, [\src], \s_strd 2714.ifc \taps, 6tap 2715 ext v23.16b, v4.16b, v5.16b, #2 2716 ext v24.16b, v6.16b, v7.16b, #2 2717 smull v25.4s, v23.4h, v0.h[1] 2718 smull2 v26.4s, v23.8h, v0.h[1] 2719 smull v27.4s, v24.4h, v0.h[1] 2720 smull2 v28.4s, v24.8h, v0.h[1] 2721.irpc i, 23456 2722 ext v23.16b, v4.16b, v5.16b, #(2*\i) 2723 ext v24.16b, v6.16b, v7.16b, #(2*\i) 2724 smlal v25.4s, v23.4h, v0.h[\i] 2725 smlal2 v26.4s, v23.8h, v0.h[\i] 2726 smlal v27.4s, v24.4h, v0.h[\i] 2727 smlal2 v28.4s, v24.8h, v0.h[\i] 2728.endr 2729.else // 8tap 2730 smull v25.4s, v4.4h, v0.h[0] 2731 smull2 v26.4s, v4.8h, v0.h[0] 2732 smull v27.4s, v6.4h, v0.h[0] 2733 smull2 v28.4s, v6.8h, v0.h[0] 2734.irpc i, 1234567 2735 ext v23.16b, v4.16b, v5.16b, #(2*\i) 2736 ext v24.16b, v6.16b, v7.16b, #(2*\i) 2737 smlal v25.4s, v23.4h, v0.h[\i] 2738 smlal2 v26.4s, v23.8h, v0.h[\i] 2739 smlal v27.4s, v24.4h, v0.h[\i] 2740 smlal2 v28.4s, v24.8h, v0.h[\i] 2741.endr 2742.endif 2743 srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits) 2744 srshl v26.4s, v26.4s, v30.4s // -(6-intermediate_bits) 2745 srshl v27.4s, v27.4s, v30.4s // -(6-intermediate_bits) 2746 srshl v28.4s, v28.4s, v30.4s // -(6-intermediate_bits) 2747 uzp1 v23.8h, v25.8h, v26.8h // Same as xtn, xtn2 2748 uzp1 v24.8h, v27.8h, v28.8h // Ditto 2749 ret 2750 2751L(\type\()_\taps\()_hv_tbl): 2752 .hword L(\type\()_\taps\()_hv_tbl) - 1280b 2753 .hword L(\type\()_\taps\()_hv_tbl) - 640b 2754 .hword L(\type\()_\taps\()_hv_tbl) - 320b 2755 .hword L(\type\()_\taps\()_hv_tbl) - 160b 2756 .hword L(\type\()_\taps\()_hv_tbl) - 80b 2757 .hword L(\type\()_\taps\()_hv_tbl) - 40b 2758 .hword L(\type\()_\taps\()_hv_tbl) - 20b 2759 .hword 0 
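// Each .hword above stores the distance from the table base back to the
// corresponding width-specific entry point; the dispatch code loads the
// entry selected by clz(w) - 24 (set up at function entry) and subtracts it
// from the table address before branching, so index 0 is the 128-wide case
// and index 6 the 2-wide case, with a final zero for the unused slot.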
endfunc
.endm


.macro filter_bilin_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2
function \type\()_bilin_16bpc_neon, export=1
.ifc \bdmax, w8
        ldr w8, [sp]
.endif
        dup v1.8h, \mx
        dup v3.8h, \my
        mov w10, #16
        sub w9, w10, \mx
        sub w10, w10, \my
        dup v0.8h, w9
        dup v2.8h, w10
.ifc \type, prep
        uxtw \d_strd, \w
        lsl \d_strd, \d_strd, #1
.endif

        clz \bdmax, \bdmax       // bitdepth_max
        clz w9, \w
        sub \bdmax, \bdmax, #18  // intermediate_bits = clz(bitdepth_max) - 18
        mov w11, #4
        sub w9, w9, #24
        sub w11, w11, \bdmax     // 4 - intermediate_bits
        add w12, \bdmax, #4      // 4 + intermediate_bits
        cbnz \mx, L(\type\()_bilin_h)
        cbnz \my, L(\type\()_bilin_v)
        b \type\()_neon

L(\type\()_bilin_h):
        cbnz \my, L(\type\()_bilin_hv)

        adr x10, L(\type\()_bilin_h_tbl)
        dup v31.8h, w11          // 4 - intermediate_bits
        ldrh w9, [x10, x9, lsl #1]
        neg v31.8h, v31.8h       // -(4-intermediate_bits)
.ifc \type, put
        dup v30.8h, \bdmax       // intermediate_bits
.else
        movi v29.8h, #(PREP_BIAS >> 8), lsl #8
.endif
        sub x10, x10, w9, uxtw
.ifc \type, put
        neg v30.8h, v30.8h       // -intermediate_bits
.endif
        br x10

20: // 2xN h
        AARCH64_VALID_JUMP_TARGET
.ifc \type, put
        add \ds2, \dst, \d_strd
        add \sr2, \src, \s_strd
        lsl \d_strd, \d_strd, #1
        lsl \s_strd, \s_strd, #1
2:
        ld1 {v4.4h}, [\src], \s_strd
        ld1 {v6.4h}, [\sr2], \s_strd
        ext v5.8b, v4.8b, v4.8b, #2
        ext v7.8b, v6.8b, v6.8b, #2
        trn1 v4.2s, v4.2s, v6.2s
        trn1 v5.2s, v5.2s, v7.2s
        subs \h, \h, #2
        mul v4.4h, v4.4h, v0.4h
        mla v4.4h, v5.4h, v1.4h
        urshl v4.4h, v4.4h, v31.4h
        urshl v4.4h, v4.4h, v30.4h
        st1 {v4.s}[0], [\dst], \d_strd
        st1 {v4.s}[1], [\ds2], \d_strd
        b.gt 2b
        ret
.endif

40: // 4xN h
        AARCH64_VALID_JUMP_TARGET
        add \ds2, \dst, \d_strd
        add \sr2, \src, \s_strd
        lsl \d_strd, \d_strd, #1
        lsl \s_strd, \s_strd, #1
4:
        ld1 {v4.8h}, [\src], \s_strd
        ld1 {v6.8h}, [\sr2], \s_strd
        ext v5.16b, v4.16b, v4.16b, #2
        ext v7.16b, v6.16b, v6.16b, #2
        trn1 v4.2d, v4.2d, v6.2d
        trn1 v5.2d, v5.2d, v7.2d
        subs \h, \h, #2
        mul v4.8h, v4.8h, v0.8h
        mla v4.8h, v5.8h, v1.8h
        urshl v4.8h, v4.8h, v31.8h
.ifc \type, put
        urshl v4.8h, v4.8h, v30.8h
.else
        sub v4.8h, v4.8h, v29.8h
.endif
        st1 {v4.d}[0], [\dst], \d_strd
        st1 {v4.d}[1], [\ds2], \d_strd
        b.gt 4b
        ret

80: // 8xN h
        AARCH64_VALID_JUMP_TARGET
        add \ds2, \dst, \d_strd
        add \sr2, \src, \s_strd
        lsl \d_strd, \d_strd, #1
        lsl \s_strd, \s_strd, #1
8:
        ldr h5, [\src, #16]
        ldr h7, [\sr2, #16]
        ld1 {v4.8h}, [\src], \s_strd
        ld1 {v6.8h}, [\sr2], \s_strd
        ext v5.16b, v4.16b, v5.16b, #2
        ext v7.16b, v6.16b, v7.16b, #2
        subs \h, \h, #2
        mul v4.8h, v4.8h, v0.8h
        mla v4.8h, v5.8h, v1.8h
        mul v6.8h, v6.8h, v0.8h
        mla v6.8h, v7.8h, v1.8h
        urshl v4.8h, v4.8h, v31.8h
        urshl v6.8h, v6.8h, v31.8h
.ifc \type, put
        urshl v4.8h, v4.8h, v30.8h
        urshl v6.8h, v6.8h, v30.8h
.else
        sub v4.8h, v4.8h, v29.8h
        sub v6.8h, v6.8h, v29.8h
.endif
        st1 {v4.8h}, [\dst], \d_strd
        st1 {v6.8h}, [\ds2], \d_strd
        b.gt 8b
        ret
160:
320:
640:
1280: // 16xN, 32xN, ...
h 2897 AARCH64_VALID_JUMP_TARGET 2898 add \ds2, \dst, \d_strd 2899 add \sr2, \src, \s_strd 2900 lsl \s_strd, \s_strd, #1 2901 2902 sub \s_strd, \s_strd, \w, uxtw #1 2903 sub \s_strd, \s_strd, #16 2904.ifc \type, put 2905 lsl \d_strd, \d_strd, #1 2906 sub \d_strd, \d_strd, \w, uxtw #1 2907.endif 2908161: 2909 ld1 {v16.8h}, [\src], #16 2910 ld1 {v21.8h}, [\sr2], #16 2911 mov \mx, \w 2912 291316: 2914 ld1 {v17.8h, v18.8h}, [\src], #32 2915 ld1 {v22.8h, v23.8h}, [\sr2], #32 2916 ext v19.16b, v16.16b, v17.16b, #2 2917 ext v20.16b, v17.16b, v18.16b, #2 2918 ext v24.16b, v21.16b, v22.16b, #2 2919 ext v25.16b, v22.16b, v23.16b, #2 2920 mul v16.8h, v16.8h, v0.8h 2921 mla v16.8h, v19.8h, v1.8h 2922 mul v17.8h, v17.8h, v0.8h 2923 mla v17.8h, v20.8h, v1.8h 2924 mul v21.8h, v21.8h, v0.8h 2925 mla v21.8h, v24.8h, v1.8h 2926 mul v22.8h, v22.8h, v0.8h 2927 mla v22.8h, v25.8h, v1.8h 2928 urshl v16.8h, v16.8h, v31.8h 2929 urshl v17.8h, v17.8h, v31.8h 2930 urshl v21.8h, v21.8h, v31.8h 2931 urshl v22.8h, v22.8h, v31.8h 2932 subs \mx, \mx, #16 2933.ifc \type, put 2934 urshl v16.8h, v16.8h, v30.8h 2935 urshl v17.8h, v17.8h, v30.8h 2936 urshl v21.8h, v21.8h, v30.8h 2937 urshl v22.8h, v22.8h, v30.8h 2938.else 2939 sub v16.8h, v16.8h, v29.8h 2940 sub v17.8h, v17.8h, v29.8h 2941 sub v21.8h, v21.8h, v29.8h 2942 sub v22.8h, v22.8h, v29.8h 2943.endif 2944 st1 {v16.8h, v17.8h}, [\dst], #32 2945 st1 {v21.8h, v22.8h}, [\ds2], #32 2946 b.le 9f 2947 2948 mov v16.16b, v18.16b 2949 mov v21.16b, v23.16b 2950 b 16b 2951 29529: 2953 add \dst, \dst, \d_strd 2954 add \ds2, \ds2, \d_strd 2955 add \src, \src, \s_strd 2956 add \sr2, \sr2, \s_strd 2957 2958 subs \h, \h, #2 2959 b.gt 161b 2960 ret 2961 2962L(\type\()_bilin_h_tbl): 2963 .hword L(\type\()_bilin_h_tbl) - 1280b 2964 .hword L(\type\()_bilin_h_tbl) - 640b 2965 .hword L(\type\()_bilin_h_tbl) - 320b 2966 .hword L(\type\()_bilin_h_tbl) - 160b 2967 .hword L(\type\()_bilin_h_tbl) - 80b 2968 .hword L(\type\()_bilin_h_tbl) - 40b 2969 .hword L(\type\()_bilin_h_tbl) - 20b 2970 .hword 0 2971 2972 2973L(\type\()_bilin_v): 2974 cmp \h, #4 2975 adr x10, L(\type\()_bilin_v_tbl) 2976.ifc \type, prep 2977 dup v31.8h, w11 // 4 - intermediate_bits 2978.endif 2979 ldrh w9, [x10, x9, lsl #1] 2980.ifc \type, prep 2981 movi v29.8h, #(PREP_BIAS >> 8), lsl #8 2982 neg v31.8h, v31.8h // -(4-intermediate_bits) 2983.endif 2984 sub x10, x10, w9, uxtw 2985 br x10 2986 298720: // 2xN v 2988 AARCH64_VALID_JUMP_TARGET 2989.ifc \type, put 2990 cmp \h, #2 2991 add \ds2, \dst, \d_strd 2992 add \sr2, \src, \s_strd 2993 lsl \s_strd, \s_strd, #1 2994 lsl \d_strd, \d_strd, #1 2995 2996 // 2x2 v 2997 ld1 {v16.s}[0], [\src], \s_strd 2998 b.gt 24f 299922: 3000 ld1 {v17.s}[0], [\sr2], \s_strd 3001 ld1 {v18.s}[0], [\src], \s_strd 3002 trn1 v16.2s, v16.2s, v17.2s 3003 trn1 v17.2s, v17.2s, v18.2s 3004 mul v4.4h, v16.4h, v2.4h 3005 mla v4.4h, v17.4h, v3.4h 3006 urshr v4.8h, v4.8h, #4 3007 st1 {v4.s}[0], [\dst] 3008 st1 {v4.s}[1], [\ds2] 3009 ret 301024: // 2x4, 2x6, 2x8, ... 
v 3011 ld1 {v17.s}[0], [\sr2], \s_strd 3012 ld1 {v18.s}[0], [\src], \s_strd 3013 ld1 {v19.s}[0], [\sr2], \s_strd 3014 ld1 {v20.s}[0], [\src], \s_strd 3015 sub \h, \h, #4 3016 trn1 v16.2s, v16.2s, v17.2s 3017 trn1 v17.2s, v17.2s, v18.2s 3018 trn1 v18.2s, v18.2s, v19.2s 3019 trn1 v19.2s, v19.2s, v20.2s 3020 trn1 v16.2d, v16.2d, v18.2d 3021 trn1 v17.2d, v17.2d, v19.2d 3022 mul v4.8h, v16.8h, v2.8h 3023 mla v4.8h, v17.8h, v3.8h 3024 cmp \h, #2 3025 urshr v4.8h, v4.8h, #4 3026 st1 {v4.s}[0], [\dst], \d_strd 3027 st1 {v4.s}[1], [\ds2], \d_strd 3028 st1 {v4.s}[2], [\dst], \d_strd 3029 st1 {v4.s}[3], [\ds2], \d_strd 3030 b.lt 0f 3031 mov v16.8b, v20.8b 3032 b.eq 22b 3033 b 24b 30340: 3035 ret 3036.endif 3037 303840: // 4xN v 3039 AARCH64_VALID_JUMP_TARGET 3040 add \ds2, \dst, \d_strd 3041 add \sr2, \src, \s_strd 3042 lsl \s_strd, \s_strd, #1 3043 lsl \d_strd, \d_strd, #1 3044 ld1 {v16.4h}, [\src], \s_strd 30454: 3046 ld1 {v17.4h}, [\sr2], \s_strd 3047 ld1 {v18.4h}, [\src], \s_strd 3048 trn1 v16.2d, v16.2d, v17.2d 3049 trn1 v17.2d, v17.2d, v18.2d 3050 mul v4.8h, v16.8h, v2.8h 3051 mla v4.8h, v17.8h, v3.8h 3052 subs \h, \h, #2 3053.ifc \type, put 3054 urshr v4.8h, v4.8h, #4 3055.else 3056 urshl v4.8h, v4.8h, v31.8h 3057 sub v4.8h, v4.8h, v29.8h 3058.endif 3059 st1 {v4.d}[0], [\dst], \d_strd 3060 st1 {v4.d}[1], [\ds2], \d_strd 3061 b.le 0f 3062 mov v16.8b, v18.8b 3063 b 4b 30640: 3065 ret 3066 306780: // 8xN v 3068 AARCH64_VALID_JUMP_TARGET 3069 add \ds2, \dst, \d_strd 3070 add \sr2, \src, \s_strd 3071 lsl \s_strd, \s_strd, #1 3072 lsl \d_strd, \d_strd, #1 3073 ld1 {v16.8h}, [\src], \s_strd 30748: 3075 ld1 {v17.8h}, [\sr2], \s_strd 3076 ld1 {v18.8h}, [\src], \s_strd 3077 mul v4.8h, v16.8h, v2.8h 3078 mla v4.8h, v17.8h, v3.8h 3079 mul v5.8h, v17.8h, v2.8h 3080 mla v5.8h, v18.8h, v3.8h 3081 subs \h, \h, #2 3082.ifc \type, put 3083 urshr v4.8h, v4.8h, #4 3084 urshr v5.8h, v5.8h, #4 3085.else 3086 urshl v4.8h, v4.8h, v31.8h 3087 urshl v5.8h, v5.8h, v31.8h 3088 sub v4.8h, v4.8h, v29.8h 3089 sub v5.8h, v5.8h, v29.8h 3090.endif 3091 st1 {v4.8h}, [\dst], \d_strd 3092 st1 {v5.8h}, [\ds2], \d_strd 3093 b.le 0f 3094 mov v16.16b, v18.16b 3095 b 8b 30960: 3097 ret 3098 3099160: // 16xN, 32xN, ... 
3100320: 3101640: 31021280: 3103 AARCH64_VALID_JUMP_TARGET 3104 mov \my, \h 31051: 3106 add \ds2, \dst, \d_strd 3107 add \sr2, \src, \s_strd 3108 lsl \s_strd, \s_strd, #1 3109 lsl \d_strd, \d_strd, #1 3110 3111 ld1 {v16.8h, v17.8h}, [\src], \s_strd 31122: 3113 ld1 {v18.8h, v19.8h}, [\sr2], \s_strd 3114 ld1 {v20.8h, v21.8h}, [\src], \s_strd 3115 mul v4.8h, v16.8h, v2.8h 3116 mla v4.8h, v18.8h, v3.8h 3117 mul v5.8h, v17.8h, v2.8h 3118 mla v5.8h, v19.8h, v3.8h 3119 mul v6.8h, v18.8h, v2.8h 3120 mla v6.8h, v20.8h, v3.8h 3121 mul v7.8h, v19.8h, v2.8h 3122 mla v7.8h, v21.8h, v3.8h 3123 subs \h, \h, #2 3124.ifc \type, put 3125 urshr v4.8h, v4.8h, #4 3126 urshr v5.8h, v5.8h, #4 3127 urshr v6.8h, v6.8h, #4 3128 urshr v7.8h, v7.8h, #4 3129.else 3130 urshl v4.8h, v4.8h, v31.8h 3131 urshl v5.8h, v5.8h, v31.8h 3132 urshl v6.8h, v6.8h, v31.8h 3133 urshl v7.8h, v7.8h, v31.8h 3134 sub v4.8h, v4.8h, v29.8h 3135 sub v5.8h, v5.8h, v29.8h 3136 sub v6.8h, v6.8h, v29.8h 3137 sub v7.8h, v7.8h, v29.8h 3138.endif 3139 st1 {v4.8h, v5.8h}, [\dst], \d_strd 3140 st1 {v6.8h, v7.8h}, [\ds2], \d_strd 3141 b.le 9f 3142 mov v16.16b, v20.16b 3143 mov v17.16b, v21.16b 3144 b 2b 31459: 3146 subs \w, \w, #16 3147 b.le 0f 3148 asr \s_strd, \s_strd, #1 3149 asr \d_strd, \d_strd, #1 3150 msub \src, \s_strd, \xmy, \src 3151 msub \dst, \d_strd, \xmy, \dst 3152 sub \src, \src, \s_strd, lsl #1 3153 mov \h, \my 3154 add \src, \src, #32 3155 add \dst, \dst, #32 3156 b 1b 31570: 3158 ret 3159 3160L(\type\()_bilin_v_tbl): 3161 .hword L(\type\()_bilin_v_tbl) - 1280b 3162 .hword L(\type\()_bilin_v_tbl) - 640b 3163 .hword L(\type\()_bilin_v_tbl) - 320b 3164 .hword L(\type\()_bilin_v_tbl) - 160b 3165 .hword L(\type\()_bilin_v_tbl) - 80b 3166 .hword L(\type\()_bilin_v_tbl) - 40b 3167 .hword L(\type\()_bilin_v_tbl) - 20b 3168 .hword 0 3169 3170L(\type\()_bilin_hv): 3171 adr x10, L(\type\()_bilin_hv_tbl) 3172 dup v31.8h, w11 // 4 - intermediate_bits 3173 ldrh w9, [x10, x9, lsl #1] 3174 neg v31.8h, v31.8h // -(4-intermediate_bits) 3175.ifc \type, put 3176 dup v30.4s, w12 // 4 + intermediate_bits 3177.else 3178 movi v29.8h, #(PREP_BIAS >> 8), lsl #8 3179.endif 3180 sub x10, x10, w9, uxtw 3181.ifc \type, put 3182 neg v30.4s, v30.4s // -(4+intermediate_bits) 3183.endif 3184 br x10 3185 318620: // 2xN hv 3187 AARCH64_VALID_JUMP_TARGET 3188.ifc \type, put 3189 add \sr2, \src, \s_strd 3190 add \ds2, \dst, \d_strd 3191 lsl \s_strd, \s_strd, #1 3192 lsl \d_strd, \d_strd, #1 3193 3194 ld1 {v20.4h}, [\src], \s_strd 3195 ext v21.8b, v20.8b, v20.8b, #2 3196 mul v16.4h, v20.4h, v0.4h 3197 mla v16.4h, v21.4h, v1.4h 3198 urshl v16.4h, v16.4h, v31.4h 3199 32002: 3201 ld1 {v22.4h}, [\sr2], \s_strd 3202 ld1 {v24.4h}, [\src], \s_strd 3203 ext v23.8b, v22.8b, v22.8b, #2 3204 ext v25.8b, v24.8b, v24.8b, #2 3205 trn1 v22.2s, v22.2s, v24.2s 3206 trn1 v23.2s, v23.2s, v25.2s 3207 mul v17.4h, v22.4h, v0.4h 3208 mla v17.4h, v23.4h, v1.4h 3209 urshl v17.4h, v17.4h, v31.4h 3210 3211 trn1 v16.2s, v16.2s, v17.2s 3212 3213 umull v4.4s, v16.4h, v2.4h 3214 umlal v4.4s, v17.4h, v3.4h 3215 urshl v4.4s, v4.4s, v30.4s 3216 xtn v4.4h, v4.4s 3217 subs \h, \h, #2 3218 st1 {v4.s}[0], [\dst], \d_strd 3219 st1 {v4.s}[1], [\ds2], \d_strd 3220 b.le 0f 3221 trn2 v16.2s, v17.2s, v17.2s 3222 b 2b 32230: 3224 ret 3225.endif 3226 322740: // 4xN hv 3228 AARCH64_VALID_JUMP_TARGET 3229 add \sr2, \src, \s_strd 3230 add \ds2, \dst, \d_strd 3231 lsl \s_strd, \s_strd, #1 3232 lsl \d_strd, \d_strd, #1 3233 3234 ld1 {v20.8h}, [\src], \s_strd 3235 ext v21.16b, v20.16b, v20.16b, #2 3236 mul v16.4h, v20.4h, 
v0.4h 3237 mla v16.4h, v21.4h, v1.4h 3238 urshl v16.4h, v16.4h, v31.4h 3239 32404: 3241 ld1 {v22.8h}, [\sr2], \s_strd 3242 ld1 {v24.8h}, [\src], \s_strd 3243 ext v23.16b, v22.16b, v22.16b, #2 3244 ext v25.16b, v24.16b, v24.16b, #2 3245 trn1 v22.2d, v22.2d, v24.2d 3246 trn1 v23.2d, v23.2d, v25.2d 3247 mul v17.8h, v22.8h, v0.8h 3248 mla v17.8h, v23.8h, v1.8h 3249 urshl v17.8h, v17.8h, v31.8h 3250 3251 trn1 v16.2d, v16.2d, v17.2d 3252 3253 umull v4.4s, v16.4h, v2.4h 3254 umlal v4.4s, v17.4h, v3.4h 3255 umull2 v5.4s, v16.8h, v2.8h 3256 umlal2 v5.4s, v17.8h, v3.8h 3257.ifc \type, put 3258 urshl v4.4s, v4.4s, v30.4s 3259 urshl v5.4s, v5.4s, v30.4s 3260 uzp1 v4.8h, v4.8h, v5.8h // Same as xtn, xtn2 3261.else 3262 rshrn v4.4h, v4.4s, #4 3263 rshrn2 v4.8h, v5.4s, #4 3264 sub v4.8h, v4.8h, v29.8h 3265.endif 3266 subs \h, \h, #2 3267 st1 {v4.d}[0], [\dst], \d_strd 3268 st1 {v4.d}[1], [\ds2], \d_strd 3269 b.le 0f 3270 trn2 v16.2d, v17.2d, v17.2d 3271 b 4b 32720: 3273 ret 3274 327580: // 8xN, 16xN, ... hv 3276160: 3277320: 3278640: 32791280: 3280 AARCH64_VALID_JUMP_TARGET 3281 mov \my, \h 3282 32831: 3284 add \sr2, \src, \s_strd 3285 add \ds2, \dst, \d_strd 3286 lsl \s_strd, \s_strd, #1 3287 lsl \d_strd, \d_strd, #1 3288 3289 ldr h21, [\src, #16] 3290 ld1 {v20.8h}, [\src], \s_strd 3291 ext v21.16b, v20.16b, v21.16b, #2 3292 mul v16.8h, v20.8h, v0.8h 3293 mla v16.8h, v21.8h, v1.8h 3294 urshl v16.8h, v16.8h, v31.8h 3295 32962: 3297 ldr h23, [\sr2, #16] 3298 ld1 {v22.8h}, [\sr2], \s_strd 3299 ldr h25, [\src, #16] 3300 ld1 {v24.8h}, [\src], \s_strd 3301 ext v23.16b, v22.16b, v23.16b, #2 3302 ext v25.16b, v24.16b, v25.16b, #2 3303 mul v17.8h, v22.8h, v0.8h 3304 mla v17.8h, v23.8h, v1.8h 3305 mul v18.8h, v24.8h, v0.8h 3306 mla v18.8h, v25.8h, v1.8h 3307 urshl v17.8h, v17.8h, v31.8h 3308 urshl v18.8h, v18.8h, v31.8h 3309 3310 umull v4.4s, v16.4h, v2.4h 3311 umlal v4.4s, v17.4h, v3.4h 3312 umull2 v5.4s, v16.8h, v2.8h 3313 umlal2 v5.4s, v17.8h, v3.8h 3314 umull v6.4s, v17.4h, v2.4h 3315 umlal v6.4s, v18.4h, v3.4h 3316 umull2 v7.4s, v17.8h, v2.8h 3317 umlal2 v7.4s, v18.8h, v3.8h 3318.ifc \type, put 3319 urshl v4.4s, v4.4s, v30.4s 3320 urshl v5.4s, v5.4s, v30.4s 3321 urshl v6.4s, v6.4s, v30.4s 3322 urshl v7.4s, v7.4s, v30.4s 3323 uzp1 v4.8h, v4.8h, v5.8h // Same as xtn, xtn2 3324 uzp1 v5.8h, v6.8h, v7.8h // Ditto 3325.else 3326 rshrn v4.4h, v4.4s, #4 3327 rshrn2 v4.8h, v5.4s, #4 3328 rshrn v5.4h, v6.4s, #4 3329 rshrn2 v5.8h, v7.4s, #4 3330 sub v4.8h, v4.8h, v29.8h 3331 sub v5.8h, v5.8h, v29.8h 3332.endif 3333 subs \h, \h, #2 3334 st1 {v4.8h}, [\dst], \d_strd 3335 st1 {v5.8h}, [\ds2], \d_strd 3336 b.le 9f 3337 mov v16.16b, v18.16b 3338 b 2b 33399: 3340 subs \w, \w, #8 3341 b.le 0f 3342 asr \s_strd, \s_strd, #1 3343 asr \d_strd, \d_strd, #1 3344 msub \src, \s_strd, \xmy, \src 3345 msub \dst, \d_strd, \xmy, \dst 3346 sub \src, \src, \s_strd, lsl #1 3347 mov \h, \my 3348 add \src, \src, #16 3349 add \dst, \dst, #16 3350 b 1b 33510: 3352 ret 3353 3354L(\type\()_bilin_hv_tbl): 3355 .hword L(\type\()_bilin_hv_tbl) - 1280b 3356 .hword L(\type\()_bilin_hv_tbl) - 640b 3357 .hword L(\type\()_bilin_hv_tbl) - 320b 3358 .hword L(\type\()_bilin_hv_tbl) - 160b 3359 .hword L(\type\()_bilin_hv_tbl) - 80b 3360 .hword L(\type\()_bilin_hv_tbl) - 40b 3361 .hword L(\type\()_bilin_hv_tbl) - 20b 3362 .hword 0 3363endfunc 3364.endm 3365 3366make_8tap_fn put, regular_sharp, REGULAR, SHARP, 8tap 3367make_8tap_fn put, smooth_sharp, SMOOTH, SHARP, 8tap 3368make_8tap_fn put, sharp, SHARP, SHARP, 8tap 3369make_8tap_fn put, sharp_regular, SHARP, 
REGULAR, 8tap
make_8tap_fn put, sharp_smooth, SHARP, SMOOTH, 8tap
filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10, 8tap

make_8tap_fn put, regular, REGULAR, REGULAR, 6tap
make_8tap_fn put, regular_smooth, REGULAR, SMOOTH, 6tap
make_8tap_fn put, smooth, SMOOTH, SMOOTH, 6tap
make_8tap_fn put, smooth_regular, SMOOTH, REGULAR, 6tap
filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10, 6tap
filter_bilin_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10

make_8tap_fn prep, regular_sharp, REGULAR, SHARP, 8tap
make_8tap_fn prep, smooth_sharp, SMOOTH, SHARP, 8tap
make_8tap_fn prep, sharp, SHARP, SHARP, 8tap
make_8tap_fn prep, sharp_regular, SHARP, REGULAR, 8tap
make_8tap_fn prep, sharp_smooth, SHARP, SMOOTH, 8tap
filter_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10, 8tap

make_8tap_fn prep, regular, REGULAR, REGULAR, 6tap
make_8tap_fn prep, regular_smooth, REGULAR, SMOOTH, 6tap
make_8tap_fn prep, smooth, SMOOTH, SMOOTH, 6tap
make_8tap_fn prep, smooth_regular, SMOOTH, REGULAR, 6tap
filter_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10, 6tap
filter_bilin_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10


.macro load_filter_row dst, src, inc
        asr w13, \src, #10
        add \src, \src, \inc
        ldr \dst, [x11, w13, sxtw #3]
.endm

function warp_filter_horz_neon
        add w12, w5, #512

        ld1 {v16.8h, v17.8h}, [x2], x3

        load_filter_row d0, w12, w7
        load_filter_row d1, w12, w7
        load_filter_row d2, w12, w7
        sxtl v0.8h, v0.8b
        load_filter_row d3, w12, w7
        sxtl v1.8h, v1.8b
        load_filter_row d4, w12, w7
        sxtl v2.8h, v2.8b
        load_filter_row d5, w12, w7
        sxtl v3.8h, v3.8b
        load_filter_row d6, w12, w7
        sxtl v4.8h, v4.8b
        load_filter_row d7, w12, w7
        sxtl v5.8h, v5.8b
        ext v18.16b, v16.16b, v17.16b, #2*1
        smull v8.4s, v16.4h, v0.4h
        smull2 v9.4s, v16.8h, v0.8h
        sxtl v6.8h, v6.8b
        ext v19.16b, v16.16b, v17.16b, #2*2
        smull v10.4s, v18.4h, v1.4h
        smull2 v11.4s, v18.8h, v1.8h
        sxtl v7.8h, v7.8b
        ext v20.16b, v16.16b, v17.16b, #2*3
        smull v0.4s, v19.4h, v2.4h
        smull2 v1.4s, v19.8h, v2.8h
        ext v21.16b, v16.16b, v17.16b, #2*4
        addp v8.4s, v8.4s, v9.4s
        smull v2.4s, v20.4h, v3.4h
        smull2 v3.4s, v20.8h, v3.8h
        ext v22.16b, v16.16b, v17.16b, #2*5
        addp v9.4s, v10.4s, v11.4s
        smull v10.4s, v21.4h, v4.4h
        smull2 v11.4s, v21.8h, v4.8h
        ext v23.16b, v16.16b, v17.16b, #2*6
        addp v0.4s, v0.4s, v1.4s
        smull v18.4s, v22.4h, v5.4h
        smull2 v19.4s, v22.8h, v5.8h
        ext v16.16b, v16.16b, v17.16b, #2*7
        addp v1.4s, v2.4s, v3.4s
        addp v2.4s, v10.4s, v11.4s
        smull v20.4s, v23.4h, v6.4h
        smull2 v21.4s, v23.8h, v6.8h
        addp v3.4s, v18.4s, v19.4s
        smull v22.4s, v16.4h, v7.4h
        smull2 v23.4s, v16.8h, v7.8h
        addp v4.4s, v20.4s, v21.4s
        addp v5.4s, v22.4s, v23.4s

        addp v8.4s, v8.4s, v9.4s
        addp v0.4s, v0.4s, v1.4s
        addp v2.4s, v2.4s, v3.4s
        addp v4.4s, v4.4s, v5.4s

        addp v16.4s, v8.4s, v0.4s
        addp v17.4s, v2.4s, v4.4s

        add w5, w5, w8

        srshl v16.4s, v16.4s, v14.4s // -(7 - intermediate_bits)
        srshl v17.4s, v17.4s, v14.4s // -(7 - intermediate_bits)

        ret
endfunc

// void dav1d_warp_affine_8x8_16bpc_neon(
//         pixel *dst, const ptrdiff_t dst_stride,
//         const pixel *src, const ptrdiff_t src_stride,
//         const int16_t *const abcd, int mx, int my,
//         const int bitdepth_max)
.macro warp t
function warp_affine_8x8\t\()_16bpc_neon, export=1
        stp d8, d9, [sp, #-0x40]!
        stp d10, d11, [sp, #0x10]
        stp d12, d13, [sp, #0x20]
        stp d14, d15, [sp, #0x30]

.ifb \t
        dup v15.8h, w7           // bitdepth_max
.else
        movi v15.8h, #(PREP_BIAS >> 8), lsl #8
.endif
        clz w7, w7
        // intermediate_bits = clz(bitdepth_max) - 18
.ifb \t
        sub w8, w7, #11          // 7 + intermediate_bits = clz(bitdepth_max) - 18 + 7
.endif
        sub w7, w7, #25          // -(7 - intermediate_bits)
.ifb \t
        neg w8, w8               // -(7 + intermediate_bits)
.endif
        dup v14.4s, w7           // -(7 - intermediate_bits)
.ifb \t
        dup v13.4s, w8           // -(7 + intermediate_bits)
.endif

        ldr x4, [x4]
        sbfx x7, x4, #0, #16
        sbfx x8, x4, #16, #16
        sbfx x9, x4, #32, #16
        sbfx x4, x4, #48, #16
        mov w10, #8
        sub x2, x2, x3, lsl #1
        sub x2, x2, x3
        sub x2, x2, #6
        movrel x11, X(mc_warp_filter), 64*8
        mov x15, x30
.ifnb \t
        lsl x1, x1, #1
.endif

        bl warp_filter_horz_neon
        uzp1 v24.8h, v16.8h, v17.8h // Same as xtn, xtn2
        bl warp_filter_horz_neon
        uzp1 v25.8h, v16.8h, v17.8h // Ditto
        bl warp_filter_horz_neon
        uzp1 v26.8h, v16.8h, v17.8h // Ditto
        bl warp_filter_horz_neon
        uzp1 v27.8h, v16.8h, v17.8h // Ditto
        bl warp_filter_horz_neon
        uzp1 v28.8h, v16.8h, v17.8h // Ditto
        bl warp_filter_horz_neon
        uzp1 v29.8h, v16.8h, v17.8h // Ditto
        bl warp_filter_horz_neon
        uzp1 v30.8h, v16.8h, v17.8h // Ditto

1:
        add w14, w6, #512
        bl warp_filter_horz_neon
        uzp1 v31.8h, v16.8h, v17.8h // Same as xtn, xtn2

        load_filter_row d0, w14, w9
        load_filter_row d1, w14, w9
        load_filter_row d2, w14, w9
        load_filter_row d3, w14, w9
        load_filter_row d4, w14, w9
        load_filter_row d5, w14, w9
        load_filter_row d6, w14, w9
        load_filter_row d7, w14, w9
        transpose_8x8b_xtl v0, v1, v2, v3, v4, v5, v6, v7, sxtl

        // This ordering of smull/smlal/smull2/smlal2 is highly
        // beneficial for Cortex A53 here.
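        // After the transpose above, v0-v7 hold the per-column vertical filter
        // taps (v<k>.h[x] = tap k for output column x), so the accumulation
        // below computes, per column x (illustrative scalar form):
        //   out[x] = sum over k of mid_row[k][x] * filter_y[x][k]
        // followed by the \t-selected rounding shift and clip (put) or the
        // narrowing shift and PREP_BIAS subtraction (prep).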
        smull v16.4s, v24.4h, v0.4h
        smlal v16.4s, v25.4h, v1.4h
        smlal v16.4s, v26.4h, v2.4h
        smlal v16.4s, v27.4h, v3.4h
        smlal v16.4s, v28.4h, v4.4h
        smlal v16.4s, v29.4h, v5.4h
        smlal v16.4s, v30.4h, v6.4h
        smlal v16.4s, v31.4h, v7.4h
        smull2 v17.4s, v24.8h, v0.8h
        smlal2 v17.4s, v25.8h, v1.8h
        smlal2 v17.4s, v26.8h, v2.8h
        smlal2 v17.4s, v27.8h, v3.8h
        smlal2 v17.4s, v28.8h, v4.8h
        smlal2 v17.4s, v29.8h, v5.8h
        smlal2 v17.4s, v30.8h, v6.8h
        smlal2 v17.4s, v31.8h, v7.8h

        mov v24.16b, v25.16b
        mov v25.16b, v26.16b
.ifb \t
        srshl v16.4s, v16.4s, v13.4s // -(7 + intermediate_bits)
        srshl v17.4s, v17.4s, v13.4s // -(7 + intermediate_bits)
.else
        rshrn v16.4h, v16.4s, #7
        rshrn2 v16.8h, v17.4s, #7
.endif
        mov v26.16b, v27.16b
.ifb \t
        sqxtun v16.4h, v16.4s
        sqxtun2 v16.8h, v17.4s
.else
        sub v16.8h, v16.8h, v15.8h // PREP_BIAS
.endif
        mov v27.16b, v28.16b
        mov v28.16b, v29.16b
.ifb \t
        umin v16.8h, v16.8h, v15.8h // bitdepth_max
.endif
        mov v29.16b, v30.16b
        mov v30.16b, v31.16b
        subs w10, w10, #1
        st1 {v16.8h}, [x0], x1

        add w6, w6, w4
        b.gt 1b

        ldp d14, d15, [sp, #0x30]
        ldp d12, d13, [sp, #0x20]
        ldp d10, d11, [sp, #0x10]
        ldp d8, d9, [sp], 0x40

        ret x15
endfunc
.endm

warp
warp t

// void dav1d_emu_edge_16bpc_neon(
//         const intptr_t bw, const intptr_t bh,
//         const intptr_t iw, const intptr_t ih,
//         const intptr_t x, const intptr_t y,
//         pixel *dst, const ptrdiff_t dst_stride,
//         const pixel *ref, const ptrdiff_t ref_stride)
function emu_edge_16bpc_neon, export=1
        ldp x8, x9, [sp]

        // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
        // ref += iclip(x, 0, iw - 1)
        sub x12, x3, #1            // ih - 1
        cmp x5, x3
        sub x13, x2, #1            // iw - 1
        csel x12, x12, x5, ge      // min(y, ih - 1)
        cmp x4, x2
        bic x12, x12, x12, asr #63 // max(min(y, ih - 1), 0)
        csel x13, x13, x4, ge      // min(x, iw - 1)
        bic x13, x13, x13, asr #63 // max(min(x, iw - 1), 0)
        madd x8, x12, x9, x8       // ref += iclip() * stride
        add x8, x8, x13, lsl #1    // ref += iclip()

        // bottom_ext = iclip(y + bh - ih, 0, bh - 1)
        // top_ext = iclip(-y, 0, bh - 1)
        add x10, x5, x1            // y + bh
        neg x5, x5                 // -y
        sub x10, x10, x3           // y + bh - ih
        sub x12, x1, #1            // bh - 1
        cmp x10, x1
        bic x5, x5, x5, asr #63    // max(-y, 0)
        csel x10, x10, x12, lt     // min(y + bh - ih, bh-1)
        cmp x5, x1
        bic x10, x10, x10, asr #63 // max(min(y + bh - ih, bh-1), 0)
        csel x5, x5, x12, lt       // min(max(-y, 0), bh-1)

        // right_ext = iclip(x + bw - iw, 0, bw - 1)
        // left_ext = iclip(-x, 0, bw - 1)
        add x11, x4, x0            // x + bw
        neg x4, x4                 // -x
        sub x11, x11, x2           // x + bw - iw
        sub x13, x0, #1            // bw - 1
        cmp x11, x0
        bic x4, x4, x4, asr #63    // max(-x, 0)
        csel x11, x11, x13, lt     // min(x + bw - iw, bw-1)
        cmp x4, x0
        bic x11, x11, x11, asr #63 // max(min(x + bw - iw, bw-1), 0)
        csel x4, x4, x13, lt       // min(max(-x, 0), bw - 1)

        // center_h = bh - top_ext - bottom_ext
        // dst += top_ext * PXSTRIDE(dst_stride)
        // center_w = bw - left_ext - right_ext
        sub x1, x1, x5             // bh - top_ext
        madd x6, x5, x7, x6
        sub x2, x0, x4             // bw - left_ext
        sub x1, x1, x10            // center_h = bh - top_ext - bottom_ext
        sub x2, x2, x11            // center_w = bw - left_ext - right_ext
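        // In C terms the setup above amounts to (illustrative sketch only,
        // mirroring the comments; iclip(v, lo, hi) clamps v to [lo, hi]):
        //   ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) + iclip(x, 0, iw - 1);
        //   top_ext    = iclip(-y, 0, bh - 1);
        //   bottom_ext = iclip(y + bh - ih, 0, bh - 1);
        //   left_ext   = iclip(-x, 0, bw - 1);
        //   right_ext  = iclip(x + bw - iw, 0, bw - 1);
        //   center_h   = bh - top_ext - bottom_ext;
        //   center_w   = bw - left_ext - right_ext;
        //   dst       += top_ext * PXSTRIDE(dst_stride);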

        mov x14, x6                // backup of dst

.macro v_loop need_left, need_right
0:
.if \need_left
        ld1r {v0.8h}, [x8]
        mov x12, x6                // out = dst
        mov x3, x4
        mov v1.16b, v0.16b
1:
        subs x3, x3, #16
        st1 {v0.8h, v1.8h}, [x12], #32
        b.gt 1b
.endif
        mov x13, x8
        add x12, x6, x4, lsl #1    // out = dst + left_ext
        mov x3, x2
1:
        ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x13], #64
        subs x3, x3, #32
        st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x12], #64
        b.gt 1b
.if \need_right
        add x3, x8, x2, lsl #1     // in + center_w
        sub x3, x3, #2             // in + center_w - 1
        add x12, x6, x4, lsl #1    // dst + left_ext
        ld1r {v0.8h}, [x3]
        add x12, x12, x2, lsl #1   // out = dst + left_ext + center_w
        mov x3, x11
        mov v1.16b, v0.16b
1:
        subs x3, x3, #16
        st1 {v0.8h, v1.8h}, [x12], #32
        b.gt 1b
.endif

        subs x1, x1, #1            // center_h--
        add x6, x6, x7
        add x8, x8, x9
        b.gt 0b
.endm

        cbz x4, 2f
        // need_left
        cbz x11, 3f
        // need_left + need_right
        v_loop 1, 1
        b 5f

2:
        // !need_left
        cbz x11, 4f
        // !need_left + need_right
        v_loop 0, 1
        b 5f

3:
        // need_left + !need_right
        v_loop 1, 0
        b 5f

4:
        // !need_left + !need_right
        v_loop 0, 0

5:

        cbz x10, 3f
        // need_bottom
        sub x8, x6, x7             // ref = dst - stride
        mov x4, x0
1:
        ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x8], #64
        mov x3, x10
2:
        subs x3, x3, #1
        st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7
        b.gt 2b
        msub x6, x7, x10, x6       // dst -= bottom_ext * stride
        subs x4, x4, #32           // bw -= 32
        add x6, x6, #64            // dst += 32
        b.gt 1b

3:
        cbz x5, 3f
        // need_top
        msub x6, x7, x5, x14       // dst = stored_dst - top_ext * stride
1:
        ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x14], #64
        mov x3, x5
2:
        subs x3, x3, #1
        st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7
        b.gt 2b
        msub x6, x7, x5, x6        // dst -= top_ext * stride
        subs x0, x0, #32           // bw -= 32
        add x6, x6, #64            // dst += 32
        b.gt 1b

3:
        ret
endfunc
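// For reference, the v_loop macro above pads each of the center_h rows as in
// this illustrative scalar sketch (not part of the build):
//   for (int y = 0; y < center_h; y++) {
//       for (int x = 0; x < left_ext;  x++) dst[x] = ref[0];
//       memcpy(dst + left_ext, ref, center_w * sizeof(pixel));
//       for (int x = 0; x < right_ext; x++)
//           dst[left_ext + center_w + x] = ref[center_w - 1];
//       dst += PXSTRIDE(dst_stride);
//       ref += PXSTRIDE(ref_stride);
//   }
// after which the bottom_ext and top_ext rows are filled by replicating the
// last and the first written row, respectively.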