/*
 * Copyright (c) 2016 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

@ All public functions in this file have the following signature:
@ typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
@                             const uint8_t *ref, ptrdiff_t ref_stride,
@                             int h, int mx, int my);

function ff_vp9_copy64_neon, export=1
        ldr             r12, [sp]
        sub             r1,  r1,  #32
        sub             r3,  r3,  #32
1:
        vld1.8          {q0,  q1},  [r2]!
        vst1.8          {q0,  q1},  [r0, :128]!
        vld1.8          {q2,  q3},  [r2], r3
        subs            r12, r12, #1
        vst1.8          {q2,  q3},  [r0, :128], r1
        bne             1b
        bx              lr
endfunc

function ff_vp9_avg64_neon, export=1
        push            {lr}
        ldr             r12, [sp, #4]
        sub             r1,  r1,  #32
        sub             r3,  r3,  #32
        mov             lr,  r0
1:
        vld1.8          {q8,  q9},  [r2]!
        vld1.8          {q0,  q1},  [r0, :128]!
        vld1.8          {q10, q11}, [r2], r3
        vrhadd.u8       q0,  q0,  q8
        vld1.8          {q2,  q3},  [r0, :128], r1
        vrhadd.u8       q1,  q1,  q9
        vrhadd.u8       q2,  q2,  q10
        vst1.8          {q0,  q1},  [lr, :128]!
        vrhadd.u8       q3,  q3,  q11
        vst1.8          {q2,  q3},  [lr, :128], r1
        subs            r12, r12, #1
        bne             1b
        pop             {pc}
endfunc

function ff_vp9_copy32_neon, export=1
        ldr             r12, [sp]
1:
        vld1.8          {q0,  q1},  [r2], r3
        subs            r12, r12, #1
        vst1.8          {q0,  q1},  [r0, :128], r1
        bne             1b
        bx              lr
endfunc

function ff_vp9_avg32_neon, export=1
        ldr             r12, [sp]
1:
        vld1.8          {q2,  q3},  [r2], r3
        vld1.8          {q0,  q1},  [r0, :128]
        vrhadd.u8       q0,  q0,  q2
        vrhadd.u8       q1,  q1,  q3
        subs            r12, r12, #1
        vst1.8          {q0,  q1},  [r0, :128], r1
        bne             1b
        bx              lr
endfunc

function ff_vp9_copy16_neon, export=1
        push            {r4,lr}
        ldr             r12, [sp, #8]
        add             r4,  r0,  r1
        add             lr,  r2,  r3
        add             r1,  r1,  r1
        add             r3,  r3,  r3
1:
        vld1.8          {q0},  [r2], r3
        vld1.8          {q1},  [lr], r3
        subs            r12, r12, #2
        vst1.8          {q0},  [r0, :128], r1
        vst1.8          {q1},  [r4, :128], r1
        bne             1b
        pop             {r4,pc}
endfunc

function ff_vp9_avg16_neon, export=1
        push            {lr}
        ldr             r12, [sp, #4]
        mov             lr,  r0
1:
        vld1.8          {q2},  [r2], r3
        vld1.8          {q0},  [r0, :128], r1
        vld1.8          {q3},  [r2], r3
        vrhadd.u8       q0,  q0,  q2
        vld1.8          {q1},  [r0, :128], r1
        vrhadd.u8       q1,  q1,  q3
        subs            r12, r12, #2
        vst1.8          {q0},  [lr, :128], r1
        vst1.8          {q1},  [lr, :128], r1
        bne             1b
        pop             {pc}
endfunc

function ff_vp9_copy8_neon, export=1
        ldr             r12, [sp]
1:
        vld1.8          {d0},  [r2], r3
        vld1.8          {d1},  [r2], r3
        subs            r12, r12, #2
        vst1.8          {d0},  [r0, :64], r1
        vst1.8          {d1},  [r0, :64], r1
        bne             1b
        bx              lr
endfunc

function ff_vp9_avg8_neon, export=1
        ldr             r12, [sp]
1:
        vld1.8          {d2},  [r2], r3
        vld1.8          {d0},  [r0, :64], r1
        vld1.8          {d3},  [r2], r3
        vrhadd.u8       d0,  d0,  d2
        vld1.8          {d1},  [r0, :64]
        sub             r0,  r0,  r1
        vrhadd.u8       d1,  d1,  d3
        subs            r12, r12, #2
        vst1.8          {d0},  [r0, :64], r1
        vst1.8          {d1},  [r0, :64], r1
        bne             1b
        bx              lr
endfunc

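@ For reference, a rough scalar sketch of what the copy/avg helpers in this
@ file compute (illustrative only; copy_c/avg_c are made-up names, not part
@ of FFmpeg's C code). vrhadd.u8 corresponds to the rounded average below.
@
@   static void copy_c(uint8_t *dst, ptrdiff_t dst_stride,
@                      const uint8_t *ref, ptrdiff_t ref_stride, int w, int h)
@   {
@       for (int y = 0; y < h; y++, dst += dst_stride, ref += ref_stride)
@           for (int x = 0; x < w; x++)
@               dst[x] = ref[x];
@   }
@
@   static void avg_c(uint8_t *dst, ptrdiff_t dst_stride,
@                     const uint8_t *ref, ptrdiff_t ref_stride, int w, int h)
@   {
@       for (int y = 0; y < h; y++, dst += dst_stride, ref += ref_stride)
@           for (int x = 0; x < w; x++)
@               dst[x] = (dst[x] + ref[x] + 1) >> 1;
@   }
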
function ff_vp9_copy4_neon, export=1
        ldr             r12, [sp]
1:
        vld1.32         {d0[]},   [r2], r3
        vld1.32         {d1[]},   [r2], r3
        vst1.32         {d0[0]},  [r0, :32], r1
        vld1.32         {d2[]},   [r2], r3
        vst1.32         {d1[0]},  [r0, :32], r1
        vld1.32         {d3[]},   [r2], r3
        subs            r12, r12, #4
        vst1.32         {d2[0]},  [r0, :32], r1
        vst1.32         {d3[0]},  [r0, :32], r1
        bne             1b
        bx              lr
endfunc

function ff_vp9_avg4_neon, export=1
        push            {lr}
        ldr             r12, [sp, #4]
        mov             lr,  r0
1:
        vld1.32         {d4[]},   [r2], r3
        vld1.32         {d0[]},   [r0, :32], r1
        vld1.32         {d5[]},   [r2], r3
        vrhadd.u8       d0,  d0,  d4
        vld1.32         {d1[]},   [r0, :32], r1
        vld1.32         {d6[]},   [r2], r3
        vrhadd.u8       d1,  d1,  d5
        vld1.32         {d2[]},   [r0, :32], r1
        vld1.32         {d7[]},   [r2], r3
        vrhadd.u8       d2,  d2,  d6
        vld1.32         {d3[]},   [r0, :32], r1
        subs            r12, r12, #4
        vst1.32         {d0[0]},  [lr, :32], r1
        vrhadd.u8       d3,  d3,  d7
        vst1.32         {d1[0]},  [lr, :32], r1
        vst1.32         {d2[0]},  [lr, :32], r1
        vst1.32         {d3[0]},  [lr, :32], r1
        bne             1b
        pop             {pc}
endfunc

@ Helper macros for vmul/vmla with a constant from either d0 or d1 depending on index
.macro vmul_lane dst, src, idx
.if \idx < 4
        vmul.s16        \dst, \src, d0[\idx]
.else
        vmul.s16        \dst, \src, d1[\idx - 4]
.endif
.endm
.macro vmla_lane dst, src, idx
.if \idx < 4
        vmla.s16        \dst, \src, d0[\idx]
.else
        vmla.s16        \dst, \src, d1[\idx - 4]
.endif
.endm

@ Extract a vector from src1-src2 and src4-src5 (src1-src3 and src4-src6
@ for size >= 16), and multiply-accumulate into dst1 and dst3 (or
@ dst1-dst2 and dst3-dst4 for size >= 16)
.macro extmla dst1, dst2, dst3, dst4, dst1d, dst3d, src1, src2, src3, src4, src5, src6, offset, size
        vext.8          q14, \src1, \src2, #(2*\offset)
        vext.8          q15, \src4, \src5, #(2*\offset)
.if \size >= 16
        vmla_lane       \dst1,  q14, \offset
        vext.8          q5,  \src2, \src3, #(2*\offset)
        vmla_lane       \dst3,  q15, \offset
        vext.8          q6,  \src5, \src6, #(2*\offset)
        vmla_lane       \dst2,  q5,  \offset
        vmla_lane       \dst4,  q6,  \offset
.elseif \size == 8
        vmla_lane       \dst1,  q14, \offset
        vmla_lane       \dst3,  q15, \offset
.else
        vmla_lane       \dst1d, d28, \offset
        vmla_lane       \dst3d, d30, \offset
.endif
.endm
@ The same as above, but instead of accumulating straight into the
@ destination, use a temp register and accumulate with saturation.
.macro extmulqadd dst1, dst2, dst3, dst4, dst1d, dst3d, src1, src2, src3, src4, src5, src6, offset, size
        vext.8          q14, \src1, \src2, #(2*\offset)
        vext.8          q15, \src4, \src5, #(2*\offset)
.if \size >= 16
        vmul_lane       q14, q14, \offset
        vext.8          q5,  \src2, \src3, #(2*\offset)
        vmul_lane       q15, q15, \offset
        vext.8          q6,  \src5, \src6, #(2*\offset)
        vmul_lane       q5,  q5,  \offset
        vmul_lane       q6,  q6,  \offset
.elseif \size == 8
        vmul_lane       q14, q14, \offset
        vmul_lane       q15, q15, \offset
.else
        vmul_lane       d28, d28, \offset
        vmul_lane       d30, d30, \offset
.endif
.if \size == 4
        vqadd.s16       \dst1d, \dst1d, d28
        vqadd.s16       \dst3d, \dst3d, d30
.else
        vqadd.s16       \dst1,  \dst1,  q14
        vqadd.s16       \dst3,  \dst3,  q15
.if \size >= 16
        vqadd.s16       \dst2,  \dst2,  q5
        vqadd.s16       \dst4,  \dst4,  q6
.endif
.endif
.endm

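@ Scalar sketch of what one extmla/extmulqadd invocation contributes, assuming
@ the q registers hold a run of consecutive 16-bit-widened source pixels
@ (the C names below are illustrative only, not FFmpeg API): extmla
@ accumulates with plain wrapping 16-bit arithmetic, while extmulqadd adds
@ its product with saturation.
@
@   static int16_t sat16(int v) { return v > 32767 ? 32767 : v < -32768 ? -32768 : v; }
@
@   static void extmla_c(int16_t *acc, const int16_t *src,
@                        const int16_t *coef, int offset, int n)
@   {
@       for (int x = 0; x < n; x++)
@           acc[x] += coef[offset] * src[x + offset];
@   }
@
@   static void extmulqadd_c(int16_t *acc, const int16_t *src,
@                            const int16_t *coef, int offset, int n)
@   {
@       for (int x = 0; x < n; x++)
@           acc[x] = sat16(acc[x] + coef[offset] * src[x + offset]);
@   }
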
@ Instantiate a horizontal filter function for the given size.
@ This can work on 4, 8 or 16 pixels in parallel; for larger
@ widths it will do 16 pixels at a time and loop horizontally.
@ The actual width is passed in r5, the height in r4 and
@ the filter coefficients in r12. idx2 is the index of the largest
@ filter coefficient (3 or 4) and idx1 is the other one of them.
.macro do_8tap_h type, size, idx1, idx2
function \type\()_8tap_\size\()h_\idx1\idx2
        sub             r2,  r2,  #3
        add             r6,  r0,  r1
        add             r7,  r2,  r3
        add             r1,  r1,  r1
        add             r3,  r3,  r3
        @ Only size >= 16 loops horizontally and needs
        @ a reduced dst stride
.if \size >= 16
        sub             r1,  r1,  r5
.endif
        @ size >= 16 loads two qwords and increments r2;
        @ for size 4/8 one qword is enough and no
        @ postincrement is done
.if \size >= 16
        sub             r3,  r3,  r5
        sub             r3,  r3,  #8
.endif
        @ Load the filter vector
        vld1.16         {q0},  [r12,:128]
1:
.if \size >= 16
        mov             r12, r5
.endif
        @ Load src
.if \size >= 16
        vld1.8          {d18, d19, d20}, [r2]!
        vld1.8          {d24, d25, d26}, [r7]!
.else
        vld1.8          {q9},  [r2]
        vld1.8          {q12}, [r7]
.endif
        vmovl.u8        q8,  d18
        vmovl.u8        q9,  d19
        vmovl.u8        q11, d24
        vmovl.u8        q12, d25
.if \size >= 16
        vmovl.u8        q10, d20
        vmovl.u8        q13, d26
.endif
2:

        @ Accumulate, adding idx2 last with a separate
        @ saturating add. The positive filter coefficients
        @ for all indices except idx2 must add up to less
        @ than 127 for this not to overflow.
        vmul.s16        q1,  q8,  d0[0]
        vmul.s16        q3,  q11, d0[0]
.if \size >= 16
        vmul.s16        q2,  q9,  d0[0]
        vmul.s16        q4,  q12, d0[0]
.endif
        extmla          q1,  q2,  q3,  q4,  d2,  d6,  q8,  q9,  q10, q11, q12, q13, 1,     \size
        extmla          q1,  q2,  q3,  q4,  d2,  d6,  q8,  q9,  q10, q11, q12, q13, 2,     \size
        extmla          q1,  q2,  q3,  q4,  d2,  d6,  q8,  q9,  q10, q11, q12, q13, \idx1, \size
        extmla          q1,  q2,  q3,  q4,  d2,  d6,  q8,  q9,  q10, q11, q12, q13, 5,     \size
        extmla          q1,  q2,  q3,  q4,  d2,  d6,  q8,  q9,  q10, q11, q12, q13, 6,     \size
        extmla          q1,  q2,  q3,  q4,  d2,  d6,  q8,  q9,  q10, q11, q12, q13, 7,     \size
        extmulqadd      q1,  q2,  q3,  q4,  d2,  d6,  q8,  q9,  q10, q11, q12, q13, \idx2, \size

        @ Round, shift and saturate
        vqrshrun.s16    d2,  q1,  #7
        vqrshrun.s16    d6,  q3,  #7
.if \size >= 16
        vqrshrun.s16    d3,  q2,  #7
        vqrshrun.s16    d7,  q4,  #7
.endif
        @ Average
.ifc \type,avg
.if \size >= 16
        vld1.8          {q14}, [r0,:128]
        vld1.8          {q15}, [r6,:128]
        vrhadd.u8       q1,  q1,  q14
        vrhadd.u8       q3,  q3,  q15
.elseif \size == 8
        vld1.8          {d28}, [r0,:64]
        vld1.8          {d30}, [r6,:64]
        vrhadd.u8       d2,  d2,  d28
        vrhadd.u8       d6,  d6,  d30
.else
        @ We only need d28[0], but [] is faster on some cores
        vld1.32         {d28[]}, [r0,:32]
        vld1.32         {d30[]}, [r6,:32]
        vrhadd.u8       d2,  d2,  d28
        vrhadd.u8       d6,  d6,  d30
.endif
.endif
        @ Store and loop horizontally (for size >= 16)
.if \size >= 16
        subs            r12, r12, #16
        vst1.8          {q1},  [r0,:128]!
        vst1.8          {q3},  [r6,:128]!
        beq             3f
        vmov            q8,  q10
        vmov            q11, q13
        vld1.8          {q10}, [r2]!
        vld1.8          {q13}, [r7]!
        vmovl.u8        q9,  d20
        vmovl.u8        q10, d21
        vmovl.u8        q12, d26
        vmovl.u8        q13, d27
        b               2b
.elseif \size == 8
        vst1.8          {d2},  [r0,:64]
        vst1.8          {d6},  [r6,:64]
.else @ \size == 4
        vst1.32         {d2[0]}, [r0,:32]
        vst1.32         {d6[0]}, [r6,:32]
.endif
3:
        @ Loop vertically
        add             r0,  r0,  r1
        add             r6,  r6,  r1
        add             r2,  r2,  r3
        add             r7,  r7,  r3
        subs            r4,  r4,  #2
        bne             1b
.if \size >= 16
        vpop            {q4-q6}
.endif
        pop             {r4-r7}
        bx              lr
endfunc
.endm

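@ For one output pixel, the horizontal filter above is equivalent to the
@ following scalar sketch (illustrative only; sat16() as in the sketch
@ further up). All taps except idx2 are summed with plain 16-bit arithmetic,
@ which is why their positive coefficients must add up to less than 127,
@ and the idx2 product is added with saturation before the rounding shift.
@
@   static uint8_t filter_h_pixel(const uint8_t *src, const int16_t *coef, int idx2)
@   {
@       int16_t sum = 0;
@       int     out;
@       for (int i = 0; i < 8; i++)
@           if (i != idx2)
@               sum += coef[i] * src[i - 3];            /* vmul/vmla via extmla   */
@       sum = sat16(sum + coef[idx2] * src[idx2 - 3]);  /* extmulqadd (vqadd.s16) */
@       out = (sum + 64) >> 7;                          /* vqrshrun.s16 #7        */
@       return out < 0 ? 0 : out > 255 ? 255 : out;
@   }
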
.macro do_8tap_h_size size
do_8tap_h put, \size, 3, 4
do_8tap_h avg, \size, 3, 4
do_8tap_h put, \size, 4, 3
do_8tap_h avg, \size, 4, 3
.endm

do_8tap_h_size 4
do_8tap_h_size 8
do_8tap_h_size 16

.macro do_8tap_h_func type, filter, offset, size
function ff_vp9_\type\()_\filter\()\size\()_h_neon, export=1
        push            {r4-r7}
.if \size >= 16
        vpush           {q4-q6}
        ldr             r4,  [sp, #64]
        ldr             r5,  [sp, #68]
.else
        ldr             r4,  [sp, #16]
        ldr             r5,  [sp, #20]
.endif
        movrelx         r12, X(ff_vp9_subpel_filters), r6
        add             r12, r12, 256*\offset
        cmp             r5,  #8
        add             r12, r12, r5, lsl #4
        mov             r5,  #\size
.if \size >= 16
        bge             \type\()_8tap_16h_34
        b               \type\()_8tap_16h_43
.else
        bge             \type\()_8tap_\size\()h_34
        b               \type\()_8tap_\size\()h_43
.endif
endfunc
.endm

.macro do_8tap_h_filters size
do_8tap_h_func put, regular, 1, \size
do_8tap_h_func avg, regular, 1, \size
do_8tap_h_func put, sharp,   2, \size
do_8tap_h_func avg, sharp,   2, \size
do_8tap_h_func put, smooth,  0, \size
do_8tap_h_func avg, smooth,  0, \size
.endm

do_8tap_h_filters 64
do_8tap_h_filters 32
do_8tap_h_filters 16
do_8tap_h_filters 8
do_8tap_h_filters 4

.ltorg

@ Vertical filters

@ Round, shift and saturate and store qreg1-2 over 4 lines
.macro do_store4 qreg1, dreg1, qreg2, dreg2, tmp1, tmp2, type
        vqrshrun.s16    \dreg1, \qreg1, #7
        vqrshrun.s16    \dreg2, \qreg2, #7
.ifc \type,avg
        vld1.32         {\tmp1[]},  [r0,:32], r1
        vld1.32         {\tmp2[]},  [r0,:32], r1
        vld1.32         {\tmp1[1]}, [r0,:32], r1
        vld1.32         {\tmp2[1]}, [r0,:32], r1
        vrhadd.u8       \dreg1, \dreg1, \tmp1
        vrhadd.u8       \dreg2, \dreg2, \tmp2
        sub             r0,  r0,  r1, lsl #2
.endif
        vst1.32         {\dreg1[0]}, [r0,:32], r1
        vst1.32         {\dreg2[0]}, [r0,:32], r1
        vst1.32         {\dreg1[1]}, [r0,:32], r1
        vst1.32         {\dreg2[1]}, [r0,:32], r1
.endm

@ Round, shift and saturate and store qreg1-4
.macro do_store qreg1, dreg1, qreg2, dreg2, qreg3, dreg3, qreg4, dreg4, tmp1, tmp2, tmp3, tmp4, type
        vqrshrun.s16    \dreg1, \qreg1, #7
        vqrshrun.s16    \dreg2, \qreg2, #7
        vqrshrun.s16    \dreg3, \qreg3, #7
        vqrshrun.s16    \dreg4, \qreg4, #7
.ifc \type,avg
        vld1.8          {\tmp1},  [r0,:64], r1
        vld1.8          {\tmp2},  [r0,:64], r1
        vld1.8          {\tmp3},  [r0,:64], r1
        vld1.8          {\tmp4},  [r0,:64], r1
        vrhadd.u8       \dreg1, \dreg1, \tmp1
        vrhadd.u8       \dreg2, \dreg2, \tmp2
        vrhadd.u8       \dreg3, \dreg3, \tmp3
        vrhadd.u8       \dreg4, \dreg4, \tmp4
        sub             r0,  r0,  r1, lsl #2
.endif
        vst1.8          {\dreg1}, [r0,:64], r1
        vst1.8          {\dreg2}, [r0,:64], r1
        vst1.8          {\dreg3}, [r0,:64], r1
        vst1.8          {\dreg4}, [r0,:64], r1
.endm

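@ The ff_vp9_*_h_neon wrappers above (and the *_v_neon ones further down)
@ only compute the coefficient pointer and dispatch. Assuming the table
@ layout int16_t ff_vp9_subpel_filters[3][16][8], which is what the
@ 256*offset and mx/my << 4 byte offsets correspond to, the address
@ calculation is roughly:
@
@   const int16_t *coef = &ff_vp9_subpel_filters[offset][mx][0];
@
@ with a subpel position >= 8 dispatching to the ..._34 variants (largest
@ tap at index 4) and a position < 8 to the ..._43 variants (largest tap
@ at index 3).
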
@ Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst2
@ (src1-src8 into dst1, src2-src9 into dst2), adding idx2 separately
@ at the end with saturation. Indices 0 and 7 always have negative or zero
@ coefficients, so they can be accumulated into tmp1-tmp2 together with the
@ largest coefficient.
.macro convolve dst1, dst2, src1, src2, src3, src4, src5, src6, src7, src8, src9, idx1, idx2, tmp1, tmp2
        vmul.s16        \dst1, \src2, d0[1]
        vmul.s16        \dst2, \src3, d0[1]
        vmul.s16        \tmp1, \src1, d0[0]
        vmul.s16        \tmp2, \src2, d0[0]
        vmla.s16        \dst1, \src3, d0[2]
        vmla.s16        \dst2, \src4, d0[2]
.if \idx1 == 3
        vmla.s16        \dst1, \src4, d0[3]
        vmla.s16        \dst2, \src5, d0[3]
.else
        vmla.s16        \dst1, \src5, d1[0]
        vmla.s16        \dst2, \src6, d1[0]
.endif
        vmla.s16        \dst1, \src6, d1[1]
        vmla.s16        \dst2, \src7, d1[1]
        vmla.s16        \tmp1, \src8, d1[3]
        vmla.s16        \tmp2, \src9, d1[3]
        vmla.s16        \dst1, \src7, d1[2]
        vmla.s16        \dst2, \src8, d1[2]
.if \idx2 == 3
        vmla.s16        \tmp1, \src4, d0[3]
        vmla.s16        \tmp2, \src5, d0[3]
.else
        vmla.s16        \tmp1, \src5, d1[0]
        vmla.s16        \tmp2, \src6, d1[0]
.endif
        vqadd.s16       \dst1, \dst1, \tmp1
        vqadd.s16       \dst2, \dst2, \tmp2
.endm

@ Load pixels and extend them to 16 bit
.macro loadl dst1, dst2, dst3, dst4
        vld1.8          {d2}, [r2], r3
        vld1.8          {d3}, [r2], r3
        vld1.8          {d4}, [r2], r3
.ifnb \dst4
        vld1.8          {d5}, [r2], r3
.endif
        vmovl.u8        \dst1, d2
        vmovl.u8        \dst2, d3
        vmovl.u8        \dst3, d4
.ifnb \dst4
        vmovl.u8        \dst4, d5
.endif
.endm

@ Instantiate a vertical filter function for filtering 8 pixels at a time.
@ The height is passed in r4, the width in r5 and the filter coefficients
@ in r12. idx2 is the index of the largest filter coefficient (3 or 4)
@ and idx1 is the other one of them.
.macro do_8tap_8v type, idx1, idx2
function \type\()_8tap_8v_\idx1\idx2
        sub             r2,  r2,  r3, lsl #1
        sub             r2,  r2,  r3
        vld1.16         {q0},  [r12, :128]
1:
        mov             r12, r4

        loadl           q5,  q6,  q7
        loadl           q8,  q9,  q10, q11
2:
        loadl           q12, q13, q14, q15
        convolve        q1,  q2,  q5,  q6,  q7,  q8,  q9,  q10, q11, q12, q13, \idx1, \idx2, q4,  q5
        convolve        q3,  q4,  q7,  q8,  q9,  q10, q11, q12, q13, q14, q15, \idx1, \idx2, q5,  q6
        do_store        q1,  d2,  q2,  d4,  q3,  d6,  q4,  d8,  d3,  d5,  d7,  d9,  \type

        subs            r12, r12, #4
        beq             8f

        loadl           q4,  q5,  q6,  q7
        convolve        q1,  q2,  q9,  q10, q11, q12, q13, q14, q15, q4,  q5,  \idx1, \idx2, q8,  q9
        convolve        q3,  q8,  q11, q12, q13, q14, q15, q4,  q5,  q6,  q7,  \idx1, \idx2, q9,  q10
        do_store        q1,  d2,  q2,  d4,  q3,  d6,  q8,  d16, d3,  d5,  d7,  d17, \type

        subs            r12, r12, #4
        beq             8f

        loadl           q8,  q9,  q10, q11
        convolve        q1,  q2,  q13, q14, q15, q4,  q5,  q6,  q7,  q8,  q9,  \idx1, \idx2, q12, q13
        convolve        q3,  q12, q15, q4,  q5,  q6,  q7,  q8,  q9,  q10, q11, \idx1, \idx2, q13, q14
        do_store        q1,  d2,  q2,  d4,  q3,  d6,  q12, d24, d3,  d5,  d7,  d25, \type

        subs            r12, r12, #4
        bne             2b

8:
        subs            r5,  r5,  #8
        beq             9f
        @ r0 -= h * dst_stride
        mls             r0,  r1,  r4,  r0
        @ r2 -= h * src_stride
        mls             r2,  r3,  r4,  r2
        @ r2 -= 8 * src_stride
        sub             r2,  r2,  r3, lsl #3
        @ r2 += 1 * src_stride
        add             r2,  r2,  r3
        add             r2,  r2,  #8
        add             r0,  r0,  #8
        b               1b
9:
        vpop            {q4-q7}
        pop             {r4-r5}
        bx              lr
endfunc
.endm

do_8tap_8v put, 3, 4
do_8tap_8v put, 4, 3
do_8tap_8v avg, 3, 4
do_8tap_8v avg, 4, 3

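@ Scalar sketch of one output pixel of the vertical filter (illustrative
@ only, with sat16() as in the sketch further up): taps 0 and 7 (never
@ positive) and the largest tap idx2 are gathered into a separate partial
@ sum, mirroring the tmp1/tmp2 accumulators in the convolve macro, and the
@ two halves are combined with a saturating add.
@
@   static uint8_t filter_v_pixel(const uint8_t *src, ptrdiff_t stride,
@                                 const int16_t *coef, int idx2)
@   {
@       int16_t main = 0, side = 0;
@       int     out;
@       for (int i = 0; i < 8; i++) {
@           int16_t prod = coef[i] * src[(i - 3) * stride];
@           if (i == 0 || i == 7 || i == idx2)
@               side += prod;               /* accumulated into tmp1/tmp2 */
@           else
@               main += prod;               /* accumulated into dst1/dst2 */
@       }
@       out = sat16(main + side);           /* vqadd.s16                  */
@       out = (out + 64) >> 7;              /* vqrshrun.s16 #7 (do_store) */
@       return out < 0 ? 0 : out > 255 ? 255 : out;
@   }
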
@ Instantiate a vertical filter function for filtering a 4 pixels wide
@ slice. The first half of each register contains one row, while the second
@ half contains the second-next row (which is also stored in the first
@ half of the register two steps ahead). The convolution does two outputs
@ at a time; the output of q5-q12 into one, and q6-q13 into the other.
@ The first half of the first output is the first output row, and the first
@ half of the other output is the second output row. The second halves of
@ the registers are rows 3 and 4.
@ This is only designed to work for 4 or 8 output lines.
.macro do_8tap_4v type, idx1, idx2
function \type\()_8tap_4v_\idx1\idx2
        sub             r2,  r2,  r3, lsl #1
        sub             r2,  r2,  r3
        vld1.16         {q0},  [r12, :128]

        vld1.32         {d2[]},   [r2], r3
        vld1.32         {d3[]},   [r2], r3
        vld1.32         {d4[]},   [r2], r3
        vld1.32         {d5[]},   [r2], r3
        vld1.32         {d6[]},   [r2], r3
        vld1.32         {d7[]},   [r2], r3
        vext.8          d2,  d2,  d4,  #4
        vld1.32         {d8[]},   [r2], r3
        vext.8          d3,  d3,  d5,  #4
        vld1.32         {d9[]},   [r2], r3
        vmovl.u8        q5,  d2
        vext.8          d4,  d4,  d6,  #4
        vld1.32         {d28[]},  [r2], r3
        vmovl.u8        q6,  d3
        vext.8          d5,  d5,  d7,  #4
        vld1.32         {d29[]},  [r2], r3
        vmovl.u8        q7,  d4
        vext.8          d6,  d6,  d8,  #4
        vld1.32         {d30[]},  [r2], r3
        vmovl.u8        q8,  d5
        vext.8          d7,  d7,  d9,  #4
        vmovl.u8        q9,  d6
        vext.8          d8,  d8,  d28, #4
        vmovl.u8        q10, d7
        vext.8          d9,  d9,  d29, #4
        vmovl.u8        q11, d8
        vext.8          d28, d28, d30, #4
        vmovl.u8        q12, d9
        vmovl.u8        q13, d28

        convolve        q1,  q2,  q5,  q6,  q7,  q8,  q9,  q10, q11, q12, q13, \idx1, \idx2, q4, q3
        do_store4       q1,  d2,  q2,  d4,  d3,  d5,  \type
        subs            r4,  r4,  #4
        beq             9f

        vld1.32         {d2[]},   [r2], r3
        vld1.32         {d3[]},   [r2], r3
        vext.8          d29, d29, d2,  #4
        vext.8          d30, d30, d3,  #4
        vld1.32         {d2[1]},  [r2], r3
        vmovl.u8        q14, d29
        vld1.32         {d3[1]},  [r2], r3
        vmovl.u8        q15, d30
        vmovl.u8        q5,  d2
        vmovl.u8        q6,  d3

        convolve        q1,  q2,  q9,  q10, q11, q12, q13, q14, q15, q5,  q6,  \idx1, \idx2, q4, q3
        do_store4       q1,  d2,  q2,  d4,  d3,  d5,  \type

9:
        vpop            {q4-q7}
        pop             {r4-r5}
        bx              lr
endfunc
.endm

do_8tap_4v put, 3, 4
do_8tap_4v put, 4, 3
do_8tap_4v avg, 3, 4
do_8tap_4v avg, 4, 3

.macro do_8tap_v_func type, filter, offset, size
function ff_vp9_\type\()_\filter\()\size\()_v_neon, export=1
        push            {r4-r5}
        vpush           {q4-q7}
        ldr             r4,  [sp, #72]
        movrelx         r12, X(ff_vp9_subpel_filters), r5
        ldr             r5,  [sp, #80]
        add             r12, r12, 256*\offset
        add             r12, r12, r5, lsl #4
        cmp             r5,  #8
        mov             r5,  #\size
.if \size >= 8
        bge             \type\()_8tap_8v_34
        b               \type\()_8tap_8v_43
.else
        bge             \type\()_8tap_4v_34
        b               \type\()_8tap_4v_43
.endif
endfunc
.endm

.macro do_8tap_v_filters size
do_8tap_v_func put, regular, 1, \size
do_8tap_v_func avg, regular, 1, \size
do_8tap_v_func put, sharp,   2, \size
do_8tap_v_func avg, sharp,   2, \size
do_8tap_v_func put, smooth,  0, \size
do_8tap_v_func avg, smooth,  0, \size
.endm

do_8tap_v_filters 64
do_8tap_v_filters 32
do_8tap_v_filters 16
do_8tap_v_filters 8
do_8tap_v_filters 4

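@ Layout note for the 4-pixel-wide vertical filter above (illustrative): with
@ source rows numbered from the first loaded row, the widened registers are
@ packed as
@
@   q5  = { row0 | row2 }    q6  = { row1 | row3 }    q7  = { row2 | row4 }
@   q8  = { row3 | row5 }    ...                      q13 = { row8 | row10 }
@
@ so a single convolve call filters four output rows at once: rows 0 and 1
@ land in the low halves of its two outputs, rows 2 and 3 in the high halves.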