/*
 * Copyright (c) 2017 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

// All public functions in this file have the following signature:
// typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
//                             const uint8_t *ref, ptrdiff_t ref_stride,
//                             int h, int mx, int my);

function ff_vp9_avg64_16_neon, export=1
        mov             x5,  x0
        sub             x1,  x1,  #64
        sub             x3,  x3,  #64
1:
        ld1             {v4.8h,  v5.8h,  v6.8h,  v7.8h},  [x2], #64
        ld1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x0], #64
        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], x3
        urhadd          v0.8h,  v0.8h,  v4.8h
        urhadd          v1.8h,  v1.8h,  v5.8h
        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x1
        urhadd          v2.8h,  v2.8h,  v6.8h
        urhadd          v3.8h,  v3.8h,  v7.8h
        subs            w4,  w4,  #1
        urhadd          v16.8h, v16.8h, v20.8h
        urhadd          v17.8h, v17.8h, v21.8h
        st1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x5], #64
        urhadd          v18.8h, v18.8h, v22.8h
        urhadd          v19.8h, v19.8h, v23.8h
        st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x1
        b.ne            1b
        ret
endfunc

function ff_vp9_avg32_16_neon, export=1
        mov             x5,  x0
1:
        ld1             {v4.8h,  v5.8h,  v6.8h,  v7.8h},  [x2], x3
        ld1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x0], x1
        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], x3
        urhadd          v0.8h,  v0.8h,  v4.8h
        urhadd          v1.8h,  v1.8h,  v5.8h
        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x1
        urhadd          v2.8h,  v2.8h,  v6.8h
        urhadd          v3.8h,  v3.8h,  v7.8h
        subs            w4,  w4,  #2
        urhadd          v16.8h, v16.8h, v20.8h
        urhadd          v17.8h, v17.8h, v21.8h
        st1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x5], x1
        urhadd          v18.8h, v18.8h, v22.8h
        urhadd          v19.8h, v19.8h, v23.8h
        st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x1
        b.ne            1b
        ret
endfunc

function ff_vp9_avg16_16_neon, export=1
1:
        ld1             {v2.8h, v3.8h},  [x2], x3
        ld1             {v0.8h, v1.8h},  [x0]
        urhadd          v0.8h,  v0.8h,  v2.8h
        urhadd          v1.8h,  v1.8h,  v3.8h
        subs            w4,  w4,  #1
        st1             {v0.8h, v1.8h},  [x0], x1
        b.ne            1b
        ret
endfunc

function ff_vp9_avg8_16_neon, export=1
        mov             x5,  x0
1:
        ld1             {v2.8h},  [x2], x3
        ld1             {v0.8h},  [x0], x1
        ld1             {v3.8h},  [x2], x3
        urhadd          v0.8h,  v0.8h,  v2.8h
        ld1             {v1.8h},  [x0], x1
        urhadd          v1.8h,  v1.8h,  v3.8h
        subs            w4,  w4,  #2
        st1             {v0.8h},  [x5], x1
        st1             {v1.8h},  [x5], x1
        b.ne            1b
        ret
endfunc

function ff_vp9_avg4_16_neon, export=1
        mov             x5,  x0
1:
        ld1             {v2.4h},  [x2], x3
        ld1             {v0.4h},  [x0], x1
        ld1             {v3.4h},  [x2], x3
        urhadd          v0.4h,  v0.4h,  v2.4h
        ld1             {v1.4h},  [x0], x1
        urhadd          v1.4h,  v1.4h,  v3.4h
        subs            w4,  w4,  #2
        st1             {v0.4h},  [x5], x1
        st1             {v1.4h},  [x5], x1
        b.ne            1b
        ret
endfunc
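
// For reference only (a sketch, not part of the build; names are
// illustrative): the avg functions above compute a rounding average between
// the existing destination and the reference block. For a w x h block of
// 16-bit pixels this is roughly:
//
//     uint16_t *d = (uint16_t *)dst;
//     const uint16_t *r = (const uint16_t *)ref;
//     for (int y = 0; y < h; y++) {
//         for (int x = 0; x < w; x++)
//             d[x] = (d[x] + r[x] + 1) >> 1;            // urhadd
//         d += dst_stride / 2;
//         r += ref_stride / 2;
//     }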

// Extract a vector from src1-src2 and src4-src5 (src1-src3 and src4-src6
// for size >= 16), and multiply-accumulate into dst1 and dst5 (or
// dst1-dst2 and dst5-dst6 for size >= 8 and dst1-dst4 and dst5-dst8
// for size >= 16)
.macro extmlal dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, src1, src2, src3, src4, src5, src6, offset, size
        ext             v20.16b, \src1\().16b, \src2\().16b, #(2*\offset)
        ext             v22.16b, \src4\().16b, \src5\().16b, #(2*\offset)
        smlal           \dst1\().4s, v20.4h, v0.h[\offset]
        smlal           \dst5\().4s, v22.4h, v0.h[\offset]
.if \size >= 16
        ext             v21.16b, \src2\().16b, \src3\().16b, #(2*\offset)
        ext             v23.16b, \src5\().16b, \src6\().16b, #(2*\offset)
.endif
.if \size >= 8
        smlal2          \dst2\().4s, v20.8h, v0.h[\offset]
        smlal2          \dst6\().4s, v22.8h, v0.h[\offset]
.endif
.if \size >= 16
        smlal           \dst3\().4s, v21.4h, v0.h[\offset]
        smlal           \dst7\().4s, v23.4h, v0.h[\offset]
        smlal2          \dst4\().4s, v21.8h, v0.h[\offset]
        smlal2          \dst8\().4s, v23.8h, v0.h[\offset]
.endif
.endm


// Instantiate a horizontal filter function for the given size.
// This can work on 4, 8 or 16 pixels in parallel; for larger
// widths it will do 16 pixels at a time and loop horizontally.
// The actual width (in bytes) is passed in x5, the height in w4 and
// the filter coefficients in x9.
.macro do_8tap_h type, size
function \type\()_8tap_\size\()h
        sub             x2,  x2,  #6
        add             x6,  x0,  x1
        add             x7,  x2,  x3
        add             x1,  x1,  x1
        add             x3,  x3,  x3
        // Only size >= 16 loops horizontally and needs
        // reduced dst stride
.if \size >= 16
        sub             x1,  x1,  x5
.endif
        // size >= 16 loads two qwords and increments x2,
        // for size 4/8 it's enough with one qword and no
        // postincrement
.if \size >= 16
        sub             x3,  x3,  x5
        sub             x3,  x3,  #16
.endif
        // Load the filter vector
        ld1             {v0.8h},  [x9]
1:
.if \size >= 16
        mov             x9,  x5
.endif
        // Load src
.if \size >= 16
        ld1             {v5.8h,  v6.8h,  v7.8h},  [x2], #48
        ld1             {v16.8h, v17.8h, v18.8h}, [x7], #48
.else
        ld1             {v5.8h,  v6.8h},  [x2]
        ld1             {v16.8h, v17.8h}, [x7]
.endif
2:

        smull           v1.4s,  v5.4h,  v0.h[0]
        smull           v24.4s, v16.4h, v0.h[0]
.if \size >= 8
        smull2          v2.4s,  v5.8h,  v0.h[0]
        smull2          v25.4s, v16.8h, v0.h[0]
.endif
.if \size >= 16
        smull           v3.4s,  v6.4h,  v0.h[0]
        smull           v26.4s, v17.4h, v0.h[0]
        smull2          v4.4s,  v6.8h,  v0.h[0]
        smull2          v27.4s, v17.8h, v0.h[0]
.endif
        extmlal         v1,  v2,  v3,  v4,  v24, v25, v26, v27, v5,  v6,  v7,  v16, v17, v18, 1, \size
        extmlal         v1,  v2,  v3,  v4,  v24, v25, v26, v27, v5,  v6,  v7,  v16, v17, v18, 2, \size
        extmlal         v1,  v2,  v3,  v4,  v24, v25, v26, v27, v5,  v6,  v7,  v16, v17, v18, 3, \size
        extmlal         v1,  v2,  v3,  v4,  v24, v25, v26, v27, v5,  v6,  v7,  v16, v17, v18, 4, \size
        extmlal         v1,  v2,  v3,  v4,  v24, v25, v26, v27, v5,  v6,  v7,  v16, v17, v18, 5, \size
        extmlal         v1,  v2,  v3,  v4,  v24, v25, v26, v27, v5,  v6,  v7,  v16, v17, v18, 6, \size
        extmlal         v1,  v2,  v3,  v4,  v24, v25, v26, v27, v5,  v6,  v7,  v16, v17, v18, 7, \size

        // Round, shift and saturate
        // The sqrshrun takes care of clamping negative values to zero, but
        // we need to do the umin against the max pixel value manually.
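        // (The VP9 8-tap filter coefficients sum to 128, hence the rounding
        // right shift by 7 below.)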
        sqrshrun        v1.4h,  v1.4s,  #7
        sqrshrun        v24.4h, v24.4s, #7
.if \size >= 8
        sqrshrun2       v1.8h,  v2.4s,  #7
        sqrshrun2       v24.8h, v25.4s, #7
        umin            v1.8h,  v1.8h,  v31.8h
        umin            v24.8h, v24.8h, v31.8h
.if \size >= 16
        sqrshrun        v2.4h,  v3.4s,  #7
        sqrshrun        v25.4h, v26.4s, #7
        sqrshrun2       v2.8h,  v4.4s,  #7
        sqrshrun2       v25.8h, v27.4s, #7
        umin            v2.8h,  v2.8h,  v31.8h
        umin            v25.8h, v25.8h, v31.8h
.endif
.else
        umin            v1.4h,  v1.4h,  v31.4h
        umin            v24.4h, v24.4h, v31.4h
.endif
        // Average
.ifc \type,avg
.if \size >= 16
        ld1             {v3.8h,  v4.8h},  [x0]
        ld1             {v29.8h, v30.8h}, [x6]
        urhadd          v1.8h,  v1.8h,  v3.8h
        urhadd          v2.8h,  v2.8h,  v4.8h
        urhadd          v24.8h, v24.8h, v29.8h
        urhadd          v25.8h, v25.8h, v30.8h
.elseif \size >= 8
        ld1             {v3.8h},  [x0]
        ld1             {v4.8h},  [x6]
        urhadd          v1.8h,  v1.8h,  v3.8h
        urhadd          v24.8h, v24.8h, v4.8h
.else
        ld1             {v3.4h},  [x0]
        ld1             {v4.4h},  [x6]
        urhadd          v1.4h,  v1.4h,  v3.4h
        urhadd          v24.4h, v24.4h, v4.4h
.endif
.endif
        // Store and loop horizontally (for size >= 16)
.if \size >= 16
        subs            x9,  x9,  #32
        st1             {v1.8h,  v2.8h},  [x0], #32
        st1             {v24.8h, v25.8h}, [x6], #32
        b.eq            3f
        mov             v5.16b,  v7.16b
        mov             v16.16b, v18.16b
        ld1             {v6.8h,  v7.8h},  [x2], #32
        ld1             {v17.8h, v18.8h}, [x7], #32
        b               2b
.elseif \size == 8
        st1             {v1.8h},  [x0]
        st1             {v24.8h}, [x6]
.else // \size == 4
        st1             {v1.4h},  [x0]
        st1             {v24.4h}, [x6]
.endif
3:
        // Loop vertically
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        add             x2,  x2,  x3
        add             x7,  x7,  x3
        subs            w4,  w4,  #2
        b.ne            1b
        ret
endfunc
.endm

.macro do_8tap_h_size size
do_8tap_h put, \size
do_8tap_h avg, \size
.endm

do_8tap_h_size 4
do_8tap_h_size 8
do_8tap_h_size 16

.macro do_8tap_h_func type, filter, offset, size, bpp
function ff_vp9_\type\()_\filter\()\size\()_h_\bpp\()_neon, export=1
        // Build the bpp-dependent max pixel value, (1 << bpp) - 1, in v31
        mvni            v31.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8
        movrel          x6,  X(ff_vp9_subpel_filters), 256*\offset
        cmp             w5,  #8
        add             x9,  x6,  w5, uxtw #4
        mov             x5,  #2*\size
.if \size >= 16
        b               \type\()_8tap_16h
.else
        b               \type\()_8tap_\size\()h
.endif
endfunc
.endm

.macro do_8tap_h_filters size, bpp
do_8tap_h_func put, regular, 1, \size, \bpp
do_8tap_h_func avg, regular, 1, \size, \bpp
do_8tap_h_func put, sharp,   2, \size, \bpp
do_8tap_h_func avg, sharp,   2, \size, \bpp
do_8tap_h_func put, smooth,  0, \size, \bpp
do_8tap_h_func avg, smooth,  0, \size, \bpp
.endm

.macro do_8tap_h_filters_bpp bpp
do_8tap_h_filters 64, \bpp
do_8tap_h_filters 32, \bpp
do_8tap_h_filters 16, \bpp
do_8tap_h_filters 8,  \bpp
do_8tap_h_filters 4,  \bpp
.endm

do_8tap_h_filters_bpp 10
do_8tap_h_filters_bpp 12
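
// For reference only (a sketch, not part of the build; variable names are
// illustrative): per output pixel, the horizontal filters above compute
// roughly
//
//     int sum = 0;
//     for (int k = 0; k < 8; k++)
//         sum += filter[k] * src[x + k - 3];
//     dst[x] = av_clip((sum + 64) >> 7, 0, (1 << bpp) - 1);
//
// where the upper clip value is the one built into v31 above; the avg
// variants additionally average the result with the existing dst pixels,
// as in the avg functions at the top of the file.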


// Vertical filters

// Round, shift and saturate and store reg1-reg4
.macro do_store4 reg1, reg2, reg3, reg4, tmp1, tmp2, tmp3, tmp4, minreg, type
        sqrshrun        \reg1\().4h,  \reg1\().4s, #7
        sqrshrun        \reg2\().4h,  \reg2\().4s, #7
        sqrshrun        \reg3\().4h,  \reg3\().4s, #7
        sqrshrun        \reg4\().4h,  \reg4\().4s, #7
.ifc \type,avg
        ld1             {\tmp1\().4h},  [x7], x1
        ld1             {\tmp2\().4h},  [x7], x1
        ld1             {\tmp3\().4h},  [x7], x1
        ld1             {\tmp4\().4h},  [x7], x1
.endif
        umin            \reg1\().4h,  \reg1\().4h,  \minreg\().4h
        umin            \reg2\().4h,  \reg2\().4h,  \minreg\().4h
        umin            \reg3\().4h,  \reg3\().4h,  \minreg\().4h
        umin            \reg4\().4h,  \reg4\().4h,  \minreg\().4h
.ifc \type,avg
        urhadd          \reg1\().4h,  \reg1\().4h,  \tmp1\().4h
        urhadd          \reg2\().4h,  \reg2\().4h,  \tmp2\().4h
        urhadd          \reg3\().4h,  \reg3\().4h,  \tmp3\().4h
        urhadd          \reg4\().4h,  \reg4\().4h,  \tmp4\().4h
.endif
        st1             {\reg1\().4h},  [x0], x1
        st1             {\reg2\().4h},  [x0], x1
        st1             {\reg3\().4h},  [x0], x1
        st1             {\reg4\().4h},  [x0], x1
.endm

// Round, shift and saturate and store reg1-8, where
// reg1-2, reg3-4 etc pairwise correspond to 4 rows.
.macro do_store8 reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, minreg, type
        sqrshrun        \reg1\().4h,  \reg1\().4s, #7
        sqrshrun2       \reg1\().8h,  \reg2\().4s, #7
        sqrshrun        \reg2\().4h,  \reg3\().4s, #7
        sqrshrun2       \reg2\().8h,  \reg4\().4s, #7
        sqrshrun        \reg3\().4h,  \reg5\().4s, #7
        sqrshrun2       \reg3\().8h,  \reg6\().4s, #7
        sqrshrun        \reg4\().4h,  \reg7\().4s, #7
        sqrshrun2       \reg4\().8h,  \reg8\().4s, #7
.ifc \type,avg
        ld1             {\reg5\().8h},  [x7], x1
        ld1             {\reg6\().8h},  [x7], x1
        ld1             {\reg7\().8h},  [x7], x1
        ld1             {\reg8\().8h},  [x7], x1
.endif
        umin            \reg1\().8h,  \reg1\().8h,  \minreg\().8h
        umin            \reg2\().8h,  \reg2\().8h,  \minreg\().8h
        umin            \reg3\().8h,  \reg3\().8h,  \minreg\().8h
        umin            \reg4\().8h,  \reg4\().8h,  \minreg\().8h
.ifc \type,avg
        urhadd          \reg1\().8h,  \reg1\().8h,  \reg5\().8h
        urhadd          \reg2\().8h,  \reg2\().8h,  \reg6\().8h
        urhadd          \reg3\().8h,  \reg3\().8h,  \reg7\().8h
        urhadd          \reg4\().8h,  \reg4\().8h,  \reg8\().8h
.endif
        st1             {\reg1\().8h},  [x0], x1
        st1             {\reg2\().8h},  [x0], x1
        st1             {\reg3\().8h},  [x0], x1
        st1             {\reg4\().8h},  [x0], x1
.endm

// Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst2
// (src1-src8 into dst1, src2-src9 into dst2).
.macro convolve4 dst1, dst2, src1, src2, src3, src4, src5, src6, src7, src8, src9, tmp1, tmp2
        smull           \dst1\().4s, \src1\().4h, v0.h[0]
        smull           \dst2\().4s, \src2\().4h, v0.h[0]
        smull           \tmp1\().4s, \src2\().4h, v0.h[1]
        smull           \tmp2\().4s, \src3\().4h, v0.h[1]
        smlal           \dst1\().4s, \src3\().4h, v0.h[2]
        smlal           \dst2\().4s, \src4\().4h, v0.h[2]
        smlal           \tmp1\().4s, \src4\().4h, v0.h[3]
        smlal           \tmp2\().4s, \src5\().4h, v0.h[3]
        smlal           \dst1\().4s, \src5\().4h, v0.h[4]
        smlal           \dst2\().4s, \src6\().4h, v0.h[4]
        smlal           \tmp1\().4s, \src6\().4h, v0.h[5]
        smlal           \tmp2\().4s, \src7\().4h, v0.h[5]
        smlal           \dst1\().4s, \src7\().4h, v0.h[6]
        smlal           \dst2\().4s, \src8\().4h, v0.h[6]
        smlal           \tmp1\().4s, \src8\().4h, v0.h[7]
        smlal           \tmp2\().4s, \src9\().4h, v0.h[7]
        add             \dst1\().4s, \dst1\().4s, \tmp1\().4s
        add             \dst2\().4s, \dst2\().4s, \tmp2\().4s
.endm
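
// Note: convolve4 above accumulates the even taps into dst1/dst2 and the odd
// taps into tmp1/tmp2, summing the two halves at the end; this splits each
// row's multiply-accumulate sequence into two shorter dependency chains.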

// Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst4
// (src1-src8 into dst1-dst2, src2-src9 into dst3-dst4).
.macro convolve8 dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, src7, src8, src9
        smull           \dst1\().4s, \src1\().4h, v0.h[0]
        smull2          \dst2\().4s, \src1\().8h, v0.h[0]
        smull           \dst3\().4s, \src2\().4h, v0.h[0]
        smull2          \dst4\().4s, \src2\().8h, v0.h[0]
        smlal           \dst1\().4s, \src2\().4h, v0.h[1]
        smlal2          \dst2\().4s, \src2\().8h, v0.h[1]
        smlal           \dst3\().4s, \src3\().4h, v0.h[1]
        smlal2          \dst4\().4s, \src3\().8h, v0.h[1]
        smlal           \dst1\().4s, \src3\().4h, v0.h[2]
        smlal2          \dst2\().4s, \src3\().8h, v0.h[2]
        smlal           \dst3\().4s, \src4\().4h, v0.h[2]
        smlal2          \dst4\().4s, \src4\().8h, v0.h[2]
        smlal           \dst1\().4s, \src4\().4h, v0.h[3]
        smlal2          \dst2\().4s, \src4\().8h, v0.h[3]
        smlal           \dst3\().4s, \src5\().4h, v0.h[3]
        smlal2          \dst4\().4s, \src5\().8h, v0.h[3]
        smlal           \dst1\().4s, \src5\().4h, v0.h[4]
        smlal2          \dst2\().4s, \src5\().8h, v0.h[4]
        smlal           \dst3\().4s, \src6\().4h, v0.h[4]
        smlal2          \dst4\().4s, \src6\().8h, v0.h[4]
        smlal           \dst1\().4s, \src6\().4h, v0.h[5]
        smlal2          \dst2\().4s, \src6\().8h, v0.h[5]
        smlal           \dst3\().4s, \src7\().4h, v0.h[5]
        smlal2          \dst4\().4s, \src7\().8h, v0.h[5]
        smlal           \dst1\().4s, \src7\().4h, v0.h[6]
        smlal2          \dst2\().4s, \src7\().8h, v0.h[6]
        smlal           \dst3\().4s, \src8\().4h, v0.h[6]
        smlal2          \dst4\().4s, \src8\().8h, v0.h[6]
        smlal           \dst1\().4s, \src8\().4h, v0.h[7]
        smlal2          \dst2\().4s, \src8\().8h, v0.h[7]
        smlal           \dst3\().4s, \src9\().4h, v0.h[7]
        smlal2          \dst4\().4s, \src9\().8h, v0.h[7]
.endm

// Instantiate a vertical filter function for filtering 8 pixels at a time.
// The height is passed in x4, the width in x5 and the filter coefficients
// in x6.
.macro do_8tap_8v type
function \type\()_8tap_8v
        sub             x2,  x2,  x3, lsl #1
        sub             x2,  x2,  x3
        ld1             {v0.8h},  [x6]
1:
.ifc \type,avg
        mov             x7,  x0
.endif
        mov             x6,  x4

        ld1             {v17.8h}, [x2], x3
        ld1             {v18.8h}, [x2], x3
        ld1             {v19.8h}, [x2], x3
        ld1             {v20.8h}, [x2], x3
        ld1             {v21.8h}, [x2], x3
        ld1             {v22.8h}, [x2], x3
        ld1             {v23.8h}, [x2], x3
2:
        ld1             {v24.8h}, [x2], x3
        ld1             {v25.8h}, [x2], x3
        ld1             {v26.8h}, [x2], x3
        ld1             {v27.8h}, [x2], x3

        convolve8       v2,  v3,  v4,  v5,  v17, v18, v19, v20, v21, v22, v23, v24, v25
        convolve8       v6,  v7,  v30, v31, v19, v20, v21, v22, v23, v24, v25, v26, v27
        do_store8       v2,  v3,  v4,  v5,  v6,  v7,  v30, v31, v1,  \type

        subs            x6,  x6,  #4
        b.eq            8f

        ld1             {v16.8h}, [x2], x3
        ld1             {v17.8h}, [x2], x3
        ld1             {v18.8h}, [x2], x3
        ld1             {v19.8h}, [x2], x3
        convolve8       v2,  v3,  v4,  v5,  v21, v22, v23, v24, v25, v26, v27, v16, v17
        convolve8       v6,  v7,  v20, v21, v23, v24, v25, v26, v27, v16, v17, v18, v19
        do_store8       v2,  v3,  v4,  v5,  v6,  v7,  v20, v21, v1,  \type

        subs            x6,  x6,  #4
        b.eq            8f

        ld1             {v20.8h}, [x2], x3
        ld1             {v21.8h}, [x2], x3
        ld1             {v22.8h}, [x2], x3
        ld1             {v23.8h}, [x2], x3
        convolve8       v2,  v3,  v4,  v5,  v25, v26, v27, v16, v17, v18, v19, v20, v21
        convolve8       v6,  v7,  v24, v25, v27, v16, v17, v18, v19, v20, v21, v22, v23
        do_store8       v2,  v3,  v4,  v5,  v6,  v7,  v24, v25, v1,  \type

        subs            x6,  x6,  #4
        b.ne            2b

8:
        subs            x5,  x5,  #8
        b.eq            9f
        // x0 -= h * dst_stride
        msub            x0,  x1,  x4,  x0
        // x2 -= h * src_stride
        msub            x2,  x3,  x4,  x2
        // x2 -= 8 * src_stride
        sub             x2,  x2,  x3, lsl #3
        // x2 += 1 * src_stride
        add             x2,  x2,  x3
        add             x2,  x2,  #16
        add             x0,  x0,  #16
        b               1b
9:
        ret
endfunc
.endm

do_8tap_8v put
do_8tap_8v avg


// Instantiate a vertical filter function for filtering a 4 pixel wide
// slice. This is only designed to work for 4 or 8 output lines.
.macro do_8tap_4v type
function \type\()_8tap_4v
        sub             x2,  x2,  x3, lsl #1
        sub             x2,  x2,  x3
        ld1             {v0.8h},  [x6]
.ifc \type,avg
        mov             x7,  x0
.endif

        ld1             {v16.4h}, [x2], x3
        ld1             {v17.4h}, [x2], x3
        ld1             {v18.4h}, [x2], x3
        ld1             {v19.4h}, [x2], x3
        ld1             {v20.4h}, [x2], x3
        ld1             {v21.4h}, [x2], x3
        ld1             {v22.4h}, [x2], x3
        ld1             {v23.4h}, [x2], x3
        ld1             {v24.4h}, [x2], x3
        ld1             {v25.4h}, [x2], x3
        ld1             {v26.4h}, [x2], x3

        convolve4       v2,  v3,  v16, v17, v18, v19, v20, v21, v22, v23, v24, v30, v31
        convolve4       v4,  v5,  v18, v19, v20, v21, v22, v23, v24, v25, v26, v30, v31
        do_store4       v2,  v3,  v4,  v5,  v28, v29, v30, v31, v1,  \type

        subs            x4,  x4,  #4
        b.eq            9f

        ld1             {v27.4h}, [x2], x3
        ld1             {v28.4h}, [x2], x3
        ld1             {v29.4h}, [x2], x3
        ld1             {v30.4h}, [x2], x3

        convolve4       v2,  v3,  v20, v21, v22, v23, v24, v25, v26, v27, v28, v16, v17
        convolve4       v4,  v5,  v22, v23, v24, v25, v26, v27, v28, v29, v30, v16, v17
        do_store4       v2,  v3,  v4,  v5,  v16, v17, v18, v19, v1,  \type

9:
        ret
endfunc
.endm

do_8tap_4v put
do_8tap_4v avg


.macro do_8tap_v_func type, filter, offset, size, bpp
function ff_vp9_\type\()_\filter\()\size\()_v_\bpp\()_neon, export=1
        uxtw            x4,  w4
        // Build the bpp-dependent max pixel value, (1 << bpp) - 1, in v1
        mvni            v1.8h,  #((0xff << (\bpp - 8)) & 0xff), lsl #8
        movrel          x5,  X(ff_vp9_subpel_filters), 256*\offset
        add             x6,  x5,  w6, uxtw #4
        mov             x5,  #\size
.if \size >= 8
        b               \type\()_8tap_8v
.else
        b               \type\()_8tap_4v
.endif
endfunc
.endm

.macro do_8tap_v_filters size, bpp
do_8tap_v_func put, regular, 1, \size, \bpp
do_8tap_v_func avg, regular, 1, \size, \bpp
do_8tap_v_func put, sharp,   2, \size, \bpp
do_8tap_v_func avg, sharp,   2, \size, \bpp
do_8tap_v_func put, smooth,  0, \size, \bpp
do_8tap_v_func avg, smooth,  0, \size, \bpp
.endm

.macro do_8tap_v_filters_bpp bpp
do_8tap_v_filters 64, \bpp
do_8tap_v_filters 32, \bpp
do_8tap_v_filters 16, \bpp
do_8tap_v_filters 8,  \bpp
do_8tap_v_filters 4,  \bpp
.endm

do_8tap_v_filters_bpp 10
do_8tap_v_filters_bpp 12