/*
 * Copyright (c) 2016 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

// All public functions in this file have the following signature:
// typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
//                             const uint8_t *ref, ptrdiff_t ref_stride,
//                             int h, int mx, int my);

function ff_vp9_avg64_neon, export=1
        mov             x5, x0
1:
        ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x2], x3
        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        ld1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x3
        urhadd          v0.16b, v0.16b, v4.16b
        urhadd          v1.16b, v1.16b, v5.16b
        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1
        urhadd          v2.16b, v2.16b, v6.16b
        urhadd          v3.16b, v3.16b, v7.16b
        subs            w4, w4, #2
        urhadd          v16.16b, v16.16b, v20.16b
        urhadd          v17.16b, v17.16b, v21.16b
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x5], x1
        urhadd          v18.16b, v18.16b, v22.16b
        urhadd          v19.16b, v19.16b, v23.16b
        st1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x5], x1
        b.ne            1b
        ret
endfunc

function ff_vp9_avg32_neon, export=1
1:
        ld1             {v2.16b, v3.16b}, [x2], x3
        ld1             {v0.16b, v1.16b}, [x0]
        urhadd          v0.16b, v0.16b, v2.16b
        urhadd          v1.16b, v1.16b, v3.16b
        subs            w4, w4, #1
        st1             {v0.16b, v1.16b}, [x0], x1
        b.ne            1b
        ret
endfunc

function ff_vp9_copy16_neon, export=1
        add             x5, x0, x1
        lsl             x1, x1, #1
        add             x6, x2, x3
        lsl             x3, x3, #1
1:
        ld1             {v0.16b}, [x2], x3
        ld1             {v1.16b}, [x6], x3
        ld1             {v2.16b}, [x2], x3
        ld1             {v3.16b}, [x6], x3
        subs            w4, w4, #4
        st1             {v0.16b}, [x0], x1
        st1             {v1.16b}, [x5], x1
        st1             {v2.16b}, [x0], x1
        st1             {v3.16b}, [x5], x1
        b.ne            1b
        ret
endfunc

function ff_vp9_avg16_neon, export=1
        mov             x5, x0
1:
        ld1             {v2.16b}, [x2], x3
        ld1             {v0.16b}, [x0], x1
        ld1             {v3.16b}, [x2], x3
        urhadd          v0.16b, v0.16b, v2.16b
        ld1             {v1.16b}, [x0], x1
        urhadd          v1.16b, v1.16b, v3.16b
        subs            w4, w4, #2
        st1             {v0.16b}, [x5], x1
        st1             {v1.16b}, [x5], x1
        b.ne            1b
        ret
endfunc

function ff_vp9_copy8_neon, export=1
1:
        ld1             {v0.8b}, [x2], x3
        ld1             {v1.8b}, [x2], x3
        subs            w4, w4, #2
        st1             {v0.8b}, [x0], x1
        st1             {v1.8b}, [x0], x1
        b.ne            1b
        ret
endfunc

function ff_vp9_avg8_neon, export=1
        mov             x5, x0
1:
        ld1             {v2.8b}, [x2], x3
        ld1             {v0.8b}, [x0], x1
        ld1             {v3.8b}, [x2], x3
        urhadd          v0.8b, v0.8b, v2.8b
        ld1             {v1.8b}, [x0], x1
        urhadd          v1.8b, v1.8b, v3.8b
        subs            w4, w4, #2
        st1             {v0.8b}, [x5], x1
        st1             {v1.8b}, [x5], x1
        b.ne            1b
        ret
endfunc
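
// The 4 pixel wide copy/avg functions below work on single 32 bit lanes.
// The avg variant packs two rows into each half of a D register, so one
// urhadd (unsigned rounding halving add, (a + b + 1) >> 1, the same
// averaging used by the avg functions above) handles two rows at once.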
function ff_vp9_copy4_neon, export=1
1:
        ld1             {v0.s}[0], [x2], x3
        ld1             {v1.s}[0], [x2], x3
        st1             {v0.s}[0], [x0], x1
        ld1             {v2.s}[0], [x2], x3
        st1             {v1.s}[0], [x0], x1
        ld1             {v3.s}[0], [x2], x3
        subs            w4, w4, #4
        st1             {v2.s}[0], [x0], x1
        st1             {v3.s}[0], [x0], x1
        b.ne            1b
        ret
endfunc

function ff_vp9_avg4_neon, export=1
        mov             x5, x0
1:
        ld1             {v2.s}[0], [x2], x3
        ld1             {v0.s}[0], [x0], x1
        ld1             {v2.s}[1], [x2], x3
        ld1             {v0.s}[1], [x0], x1
        ld1             {v3.s}[0], [x2], x3
        ld1             {v1.s}[0], [x0], x1
        ld1             {v3.s}[1], [x2], x3
        ld1             {v1.s}[1], [x0], x1
        subs            w4, w4, #4
        urhadd          v0.8b, v0.8b, v2.8b
        urhadd          v1.8b, v1.8b, v3.8b
        st1             {v0.s}[0], [x5], x1
        st1             {v0.s}[1], [x5], x1
        st1             {v1.s}[0], [x5], x1
        st1             {v1.s}[1], [x5], x1
        b.ne            1b
        ret
endfunc


// Extract a vector from src1-src2 and src4-src5 (src1-src3 and src4-src6
// for size >= 16), and multiply-accumulate into dst1 and dst3 (or
// dst1-dst2 and dst3-dst4 for size >= 16)
.macro extmla dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, offset, size
        ext             v20.16b, \src1\().16b, \src2\().16b, #(2*\offset)
        ext             v22.16b, \src4\().16b, \src5\().16b, #(2*\offset)
.if \size >= 16
        mla             \dst1\().8h, v20.8h, v0.h[\offset]
        ext             v21.16b, \src2\().16b, \src3\().16b, #(2*\offset)
        mla             \dst3\().8h, v22.8h, v0.h[\offset]
        ext             v23.16b, \src5\().16b, \src6\().16b, #(2*\offset)
        mla             \dst2\().8h, v21.8h, v0.h[\offset]
        mla             \dst4\().8h, v23.8h, v0.h[\offset]
.elseif \size == 8
        mla             \dst1\().8h, v20.8h, v0.h[\offset]
        mla             \dst3\().8h, v22.8h, v0.h[\offset]
.else
        mla             \dst1\().4h, v20.4h, v0.h[\offset]
        mla             \dst3\().4h, v22.4h, v0.h[\offset]
.endif
.endm
// The same as above, but instead of accumulating straight into the
// destination, use a temp register and accumulate with saturation.
.macro extmulqadd dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, offset, size
        ext             v20.16b, \src1\().16b, \src2\().16b, #(2*\offset)
        ext             v22.16b, \src4\().16b, \src5\().16b, #(2*\offset)
.if \size >= 16
        mul             v20.8h, v20.8h, v0.h[\offset]
        ext             v21.16b, \src2\().16b, \src3\().16b, #(2*\offset)
        mul             v22.8h, v22.8h, v0.h[\offset]
        ext             v23.16b, \src5\().16b, \src6\().16b, #(2*\offset)
        mul             v21.8h, v21.8h, v0.h[\offset]
        mul             v23.8h, v23.8h, v0.h[\offset]
.elseif \size == 8
        mul             v20.8h, v20.8h, v0.h[\offset]
        mul             v22.8h, v22.8h, v0.h[\offset]
.else
        mul             v20.4h, v20.4h, v0.h[\offset]
        mul             v22.4h, v22.4h, v0.h[\offset]
.endif
.if \size == 4
        sqadd           \dst1\().4h, \dst1\().4h, v20.4h
        sqadd           \dst3\().4h, \dst3\().4h, v22.4h
.else
        sqadd           \dst1\().8h, \dst1\().8h, v20.8h
        sqadd           \dst3\().8h, \dst3\().8h, v22.8h
.if \size >= 16
        sqadd           \dst2\().8h, \dst2\().8h, v21.8h
        sqadd           \dst4\().8h, \dst4\().8h, v23.8h
.endif
.endif
.endm


// Instantiate a horizontal filter function for the given size.
// This can work on 4, 8 or 16 pixels in parallel; for larger
// widths it will do 16 pixels at a time and loop horizontally.
// The actual width is passed in x5, the height in w4 and the
// filter coefficients in x9. idx2 is the index of the largest
// filter coefficient (3 or 4) and idx1 is the other one of them.
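// On entry, x0/x1 and x2/x3 still hold dst/dst_stride and src/src_stride
// from the public signature; x6 and x7 address a second destination and
// source row, so each pass of the vertical loop produces two output rows.
// For size >= 16, x9 (which arrives holding the filter pointer) is reused
// as the remaining-width counter once the coefficients are in v0.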
.macro do_8tap_h type, size, idx1, idx2
function \type\()_8tap_\size\()h_\idx1\idx2
        sub             x2, x2, #3
        add             x6, x0, x1
        add             x7, x2, x3
        add             x1, x1, x1
        add             x3, x3, x3
        // Only size >= 16 loops horizontally and needs
        // reduced dst stride
.if \size >= 16
        sub             x1, x1, x5
.endif
        // size >= 16 loads 24 bytes and increments x2,
        // for size 4/8 a single 16 byte load without
        // postincrement is enough
.if \size >= 16
        sub             x3, x3, x5
        sub             x3, x3, #8
.endif
        // Load the filter vector
        ld1             {v0.8h}, [x9]
1:
.if \size >= 16
        mov             x9, x5
.endif
        // Load src
.if \size >= 16
        ld1             {v4.8b, v5.8b, v6.8b}, [x2], #24
        ld1             {v16.8b, v17.8b, v18.8b}, [x7], #24
.else
        ld1             {v4.8b, v5.8b}, [x2]
        ld1             {v16.8b, v17.8b}, [x7]
.endif
        uxtl            v4.8h, v4.8b
        uxtl            v5.8h, v5.8b
        uxtl            v16.8h, v16.8b
        uxtl            v17.8h, v17.8b
.if \size >= 16
        uxtl            v6.8h, v6.8b
        uxtl            v18.8h, v18.8b
.endif
2:

        // Accumulate, adding idx2 last with a separate
        // saturating add. The positive filter coefficients
        // for all indices except idx2 must add up to less
        // than 127 for this not to overflow.
        mul             v1.8h, v4.8h, v0.h[0]
        mul             v24.8h, v16.8h, v0.h[0]
.if \size >= 16
        mul             v2.8h, v5.8h, v0.h[0]
        mul             v25.8h, v17.8h, v0.h[0]
.endif
        extmla          v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, 1, \size
        extmla          v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, 2, \size
        extmla          v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, \idx1, \size
        extmla          v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, 5, \size
        extmla          v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, 6, \size
        extmla          v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, 7, \size
        extmulqadd      v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, \idx2, \size

        // Round, shift and saturate
        sqrshrun        v1.8b, v1.8h, #7
        sqrshrun        v24.8b, v24.8h, #7
.if \size >= 16
        sqrshrun2       v1.16b, v2.8h, #7
        sqrshrun2       v24.16b, v25.8h, #7
.endif
        // Average
.ifc \type,avg
.if \size >= 16
        ld1             {v2.16b}, [x0]
        ld1             {v3.16b}, [x6]
        urhadd          v1.16b, v1.16b, v2.16b
        urhadd          v24.16b, v24.16b, v3.16b
.elseif \size == 8
        ld1             {v2.8b}, [x0]
        ld1             {v3.8b}, [x6]
        urhadd          v1.8b, v1.8b, v2.8b
        urhadd          v24.8b, v24.8b, v3.8b
.else
        ld1             {v2.s}[0], [x0]
        ld1             {v3.s}[0], [x6]
        urhadd          v1.8b, v1.8b, v2.8b
        urhadd          v24.8b, v24.8b, v3.8b
.endif
.endif
        // Store and loop horizontally (for size >= 16)
.if \size >= 16
        subs            x9, x9, #16
        st1             {v1.16b}, [x0], #16
        st1             {v24.16b}, [x6], #16
        b.eq            3f
        mov             v4.16b, v6.16b
        mov             v16.16b, v18.16b
        ld1             {v6.16b}, [x2], #16
        ld1             {v18.16b}, [x7], #16
        uxtl            v5.8h, v6.8b
        uxtl2           v6.8h, v6.16b
        uxtl            v17.8h, v18.8b
        uxtl2           v18.8h, v18.16b
        b               2b
.elseif \size == 8
        st1             {v1.8b}, [x0]
        st1             {v24.8b}, [x6]
.else // \size == 4
        st1             {v1.s}[0], [x0]
        st1             {v24.s}[0], [x6]
.endif
3:
        // Loop vertically
        add             x0, x0, x1
        add             x6, x6, x1
        add             x2, x2, x3
        add             x7, x7, x3
        subs            w4, w4, #2
        b.ne            1b
        ret
endfunc
.endm

.macro do_8tap_h_size size
do_8tap_h put, \size, 3, 4
do_8tap_h avg, \size, 3, 4
do_8tap_h put, \size, 4, 3
do_8tap_h avg, \size, 4, 3
.endm

do_8tap_h_size 4
do_8tap_h_size 8
do_8tap_h_size 16

.macro do_8tap_h_func type, filter, offset, size
function ff_vp9_\type\()_\filter\()\size\()_h_neon, export=1
        movrel          x6, X(ff_vp9_subpel_filters), 256*\offset
        cmp             w5, #8
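        // w5 is mx; per filter type, ff_vp9_subpel_filters holds 16 rows of
        // 8 int16 coefficients (256 bytes), so the row for this subpel
        // position is at x6 + 16*mx. The b.ge below picks the variant that
        // treats index 4 as the largest tap (_34) when mx >= 8, and the
        // _43 variant otherwise.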
        add             x9, x6, w5, uxtw #4
        mov             x5, #\size
.if \size >= 16
        b.ge            \type\()_8tap_16h_34
        b               \type\()_8tap_16h_43
.else
        b.ge            \type\()_8tap_\size\()h_34
        b               \type\()_8tap_\size\()h_43
.endif
endfunc
.endm

.macro do_8tap_h_filters size
do_8tap_h_func put, regular, 1, \size
do_8tap_h_func avg, regular, 1, \size
do_8tap_h_func put, sharp, 2, \size
do_8tap_h_func avg, sharp, 2, \size
do_8tap_h_func put, smooth, 0, \size
do_8tap_h_func avg, smooth, 0, \size
.endm

do_8tap_h_filters 64
do_8tap_h_filters 32
do_8tap_h_filters 16
do_8tap_h_filters 8
do_8tap_h_filters 4


// Vertical filters

// Round, shift and saturate and store reg1-reg2 over 4 lines
.macro do_store4 reg1, reg2, tmp1, tmp2, type
        sqrshrun        \reg1\().8b, \reg1\().8h, #7
        sqrshrun        \reg2\().8b, \reg2\().8h, #7
.ifc \type,avg
        ld1             {\tmp1\().s}[0], [x7], x1
        ld1             {\tmp2\().s}[0], [x7], x1
        ld1             {\tmp1\().s}[1], [x7], x1
        ld1             {\tmp2\().s}[1], [x7], x1
        urhadd          \reg1\().8b, \reg1\().8b, \tmp1\().8b
        urhadd          \reg2\().8b, \reg2\().8b, \tmp2\().8b
.endif
        st1             {\reg1\().s}[0], [x0], x1
        st1             {\reg2\().s}[0], [x0], x1
        st1             {\reg1\().s}[1], [x0], x1
        st1             {\reg2\().s}[1], [x0], x1
.endm

// Round, shift and saturate and store reg1-4
.macro do_store reg1, reg2, reg3, reg4, tmp1, tmp2, tmp3, tmp4, type
        sqrshrun        \reg1\().8b, \reg1\().8h, #7
        sqrshrun        \reg2\().8b, \reg2\().8h, #7
        sqrshrun        \reg3\().8b, \reg3\().8h, #7
        sqrshrun        \reg4\().8b, \reg4\().8h, #7
.ifc \type,avg
        ld1             {\tmp1\().8b}, [x7], x1
        ld1             {\tmp2\().8b}, [x7], x1
        ld1             {\tmp3\().8b}, [x7], x1
        ld1             {\tmp4\().8b}, [x7], x1
        urhadd          \reg1\().8b, \reg1\().8b, \tmp1\().8b
        urhadd          \reg2\().8b, \reg2\().8b, \tmp2\().8b
        urhadd          \reg3\().8b, \reg3\().8b, \tmp3\().8b
        urhadd          \reg4\().8b, \reg4\().8b, \tmp4\().8b
.endif
        st1             {\reg1\().8b}, [x0], x1
        st1             {\reg2\().8b}, [x0], x1
        st1             {\reg3\().8b}, [x0], x1
        st1             {\reg4\().8b}, [x0], x1
.endm

// Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst2
// (src1-src8 into dst1, src2-src9 into dst2), adding idx2 separately
// at the end with saturation. Indices 0 and 7 always have negative or zero
// coefficients, so they can be accumulated into tmp1-tmp2 together with the
// largest coefficient.
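// In C terms, combined with the do_store* macros above, each output pixel
// is roughly:
//     sum = 0;
//     for (i = 0; i < 8; i++)
//         sum += filter[i] * src[i];
//     dst = av_clip_uint8((sum + 64) >> 7);   // sqrshrun #7
// with the taps at indices 0 and 7 and the largest tap gathered in
// tmp1/tmp2 and folded in with a saturating sqadd, so the 16 bit
// intermediates cannot wrap before the final narrowing.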
.macro convolve dst1, dst2, src1, src2, src3, src4, src5, src6, src7, src8, src9, idx1, idx2, tmp1, tmp2
        mul             \dst1\().8h, \src2\().8h, v0.h[1]
        mul             \dst2\().8h, \src3\().8h, v0.h[1]
        mul             \tmp1\().8h, \src1\().8h, v0.h[0]
        mul             \tmp2\().8h, \src2\().8h, v0.h[0]
        mla             \dst1\().8h, \src3\().8h, v0.h[2]
        mla             \dst2\().8h, \src4\().8h, v0.h[2]
.if \idx1 == 3
        mla             \dst1\().8h, \src4\().8h, v0.h[3]
        mla             \dst2\().8h, \src5\().8h, v0.h[3]
.else
        mla             \dst1\().8h, \src5\().8h, v0.h[4]
        mla             \dst2\().8h, \src6\().8h, v0.h[4]
.endif
        mla             \dst1\().8h, \src6\().8h, v0.h[5]
        mla             \dst2\().8h, \src7\().8h, v0.h[5]
        mla             \tmp1\().8h, \src8\().8h, v0.h[7]
        mla             \tmp2\().8h, \src9\().8h, v0.h[7]
        mla             \dst1\().8h, \src7\().8h, v0.h[6]
        mla             \dst2\().8h, \src8\().8h, v0.h[6]
.if \idx2 == 3
        mla             \tmp1\().8h, \src4\().8h, v0.h[3]
        mla             \tmp2\().8h, \src5\().8h, v0.h[3]
.else
        mla             \tmp1\().8h, \src5\().8h, v0.h[4]
        mla             \tmp2\().8h, \src6\().8h, v0.h[4]
.endif
        sqadd           \dst1\().8h, \dst1\().8h, \tmp1\().8h
        sqadd           \dst2\().8h, \dst2\().8h, \tmp2\().8h
.endm

// Load pixels and extend them to 16 bit
.macro loadl dst1, dst2, dst3, dst4
        ld1             {v1.8b}, [x2], x3
        ld1             {v2.8b}, [x2], x3
        ld1             {v3.8b}, [x2], x3
.ifnb \dst4
        ld1             {v4.8b}, [x2], x3
.endif
        uxtl            \dst1\().8h, v1.8b
        uxtl            \dst2\().8h, v2.8b
        uxtl            \dst3\().8h, v3.8b
.ifnb \dst4
        uxtl            \dst4\().8h, v4.8b
.endif
.endm

// Instantiate a vertical filter function for filtering 8 pixels at a time.
// The height is passed in x4, the width in x5 and the filter coefficients
// in x6. idx2 is the index of the largest filter coefficient (3 or 4)
// and idx1 is the other one of them.
.macro do_8tap_8v type, idx1, idx2
function \type\()_8tap_8v_\idx1\idx2
        sub             x2, x2, x3, lsl #1
        sub             x2, x2, x3
        ld1             {v0.8h}, [x6]
1:
.ifc \type,avg
        mov             x7, x0
.endif
        mov             x6, x4

        loadl           v17, v18, v19

        loadl           v20, v21, v22, v23
2:
        loadl           v24, v25, v26, v27
        convolve        v1, v2, v17, v18, v19, v20, v21, v22, v23, v24, v25, \idx1, \idx2, v5, v6
        convolve        v3, v4, v19, v20, v21, v22, v23, v24, v25, v26, v27, \idx1, \idx2, v5, v6
        do_store        v1, v2, v3, v4, v5, v6, v7, v28, \type

        subs            x6, x6, #4
        b.eq            8f

        loadl           v16, v17, v18, v19
        convolve        v1, v2, v21, v22, v23, v24, v25, v26, v27, v16, v17, \idx1, \idx2, v5, v6
        convolve        v3, v4, v23, v24, v25, v26, v27, v16, v17, v18, v19, \idx1, \idx2, v5, v6
        do_store        v1, v2, v3, v4, v5, v6, v7, v28, \type

        subs            x6, x6, #4
        b.eq            8f

        loadl           v20, v21, v22, v23
        convolve        v1, v2, v25, v26, v27, v16, v17, v18, v19, v20, v21, \idx1, \idx2, v5, v6
        convolve        v3, v4, v27, v16, v17, v18, v19, v20, v21, v22, v23, \idx1, \idx2, v5, v6
        do_store        v1, v2, v3, v4, v5, v6, v7, v28, \type

        subs            x6, x6, #4
        b.ne            2b

8:
        subs            x5, x5, #8
        b.eq            9f
        // x0 -= h * dst_stride
        msub            x0, x1, x4, x0
        // x2 -= h * src_stride
        msub            x2, x3, x4, x2
        // x2 -= 8 * src_stride
        sub             x2, x2, x3, lsl #3
        // x2 += 1 * src_stride
        add             x2, x2, x3
        add             x2, x2, #8
        add             x0, x0, #8
        b               1b
9:
        ret
endfunc
.endm

do_8tap_8v put, 3, 4
do_8tap_8v put, 4, 3
do_8tap_8v avg, 3, 4
do_8tap_8v avg, 4, 3
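
// In the 8 pixel wide vertical filter above, the source rows rotate through
// v16-v27: each pass of the inner loop loads four new rows with loadl and
// produces four output rows with two convolve invocations, so the rows of
// vertical context shared between passes stay in registers instead of being
// reloaded. For widths above 8, the pointer arithmetic at label 8: rewinds
// dst/src and steps 8 pixels to the right for the next column.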

// Instantiate a vertical filter function for filtering a 4 pixels wide
// slice. The first half of the registers contain one row, while the second
// half of a register contains the second-next row (also stored in the first
// half of the register two steps ahead). The convolution does two outputs
// at a time; v17-v24 go into one output and v18-v25 into the other.
// The first half of the first output is the first output row, the first
// half of the other output is the second output row. The second halves of
// the registers are rows 3 and 4.
// This is only designed to work for 4 or 8 output lines.
.macro do_8tap_4v type, idx1, idx2
function \type\()_8tap_4v_\idx1\idx2
        sub             x2, x2, x3, lsl #1
        sub             x2, x2, x3
        ld1             {v0.8h}, [x6]
.ifc \type,avg
        mov             x7, x0
.endif

        ld1             {v1.s}[0], [x2], x3
        ld1             {v2.s}[0], [x2], x3
        ld1             {v3.s}[0], [x2], x3
        ld1             {v4.s}[0], [x2], x3
        ld1             {v5.s}[0], [x2], x3
        ld1             {v6.s}[0], [x2], x3
        trn1            v1.2s, v1.2s, v3.2s
        ld1             {v7.s}[0], [x2], x3
        trn1            v2.2s, v2.2s, v4.2s
        ld1             {v26.s}[0], [x2], x3
        uxtl            v17.8h, v1.8b
        trn1            v3.2s, v3.2s, v5.2s
        ld1             {v27.s}[0], [x2], x3
        uxtl            v18.8h, v2.8b
        trn1            v4.2s, v4.2s, v6.2s
        ld1             {v28.s}[0], [x2], x3
        uxtl            v19.8h, v3.8b
        trn1            v5.2s, v5.2s, v7.2s
        ld1             {v29.s}[0], [x2], x3
        uxtl            v20.8h, v4.8b
        trn1            v6.2s, v6.2s, v26.2s
        uxtl            v21.8h, v5.8b
        trn1            v7.2s, v7.2s, v27.2s
        uxtl            v22.8h, v6.8b
        trn1            v26.2s, v26.2s, v28.2s
        uxtl            v23.8h, v7.8b
        trn1            v27.2s, v27.2s, v29.2s
        uxtl            v24.8h, v26.8b
        uxtl            v25.8h, v27.8b

        convolve        v1, v2, v17, v18, v19, v20, v21, v22, v23, v24, v25, \idx1, \idx2, v3, v4
        do_store4       v1, v2, v5, v6, \type

        subs            x4, x4, #4
        b.eq            9f

        ld1             {v1.s}[0], [x2], x3
        ld1             {v2.s}[0], [x2], x3
        trn1            v28.2s, v28.2s, v1.2s
        trn1            v29.2s, v29.2s, v2.2s
        ld1             {v1.s}[1], [x2], x3
        uxtl            v26.8h, v28.8b
        ld1             {v2.s}[1], [x2], x3
        uxtl            v27.8h, v29.8b
        uxtl            v28.8h, v1.8b
        uxtl            v29.8h, v2.8b

        convolve        v1, v2, v21, v22, v23, v24, v25, v26, v27, v28, v29, \idx1, \idx2, v3, v4
        do_store4       v1, v2, v5, v6, \type

9:
        ret
endfunc
.endm

do_8tap_4v put, 3, 4
do_8tap_4v put, 4, 3
do_8tap_4v avg, 3, 4
do_8tap_4v avg, 4, 3


.macro do_8tap_v_func type, filter, offset, size
function ff_vp9_\type\()_\filter\()\size\()_v_neon, export=1
        uxtw            x4, w4
        movrel          x5, X(ff_vp9_subpel_filters), 256*\offset
        cmp             w6, #8
        add             x6, x5, w6, uxtw #4
        mov             x5, #\size
.if \size >= 8
        b.ge            \type\()_8tap_8v_34
        b               \type\()_8tap_8v_43
.else
        b.ge            \type\()_8tap_4v_34
        b               \type\()_8tap_4v_43
.endif
endfunc
.endm

.macro do_8tap_v_filters size
do_8tap_v_func put, regular, 1, \size
do_8tap_v_func avg, regular, 1, \size
do_8tap_v_func put, sharp, 2, \size
do_8tap_v_func avg, sharp, 2, \size
do_8tap_v_func put, smooth, 0, \size
do_8tap_v_func avg, smooth, 0, \size
.endm

do_8tap_v_filters 64
do_8tap_v_filters 32
do_8tap_v_filters 16
do_8tap_v_filters 8
do_8tap_v_filters 4
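
// The do_8tap_{h,v}_filters instantiations above generate the exported entry
// points ff_vp9_{put,avg}_{regular,sharp,smooth}{4,8,16,32,64}_{h,v}_neon;
// each computes a pointer to the coefficient row for the requested subpel
// position and tail-calls the shared put/avg 8-tap core for its block size.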