/*
 * Copyright (c) 2016 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
#include "neon.S"


// The main loop filter macro is templated and can produce filters for
// vectors of 8 or 16 bytes. The register mapping throughout the filter
// is close to identical to the arm version (please try to maintain this,
// if either is changed!). When the arm version uses e.g. d20 for the
// input variable p3, the aarch64 version uses v20.8b or v20.16b, depending
// on vector length.
//
// The number of elements in the vector is passed in via the macro parameter
// \sz, which is either .8b or .16b. For simple instructions that don't
// lengthen or narrow things, this can easily be templated like this:
//      uabd            v4\sz,  v20\sz, v21\sz
//
// For instructions that lengthen or narrow content, the arm version would
// have used q registers. For these instructions, we have macros that expand
// into either a single e.g. uaddl instruction, or into a uaddl + uaddl2
// pair, depending on the \sz parameter. Wherever the arm version would have
// used a q register, these macros instead take two v registers, i.e. q3
// is mapped to v6+v7. For the case with 8 byte input vectors, such a
// lengthening operation is only stored in v6.8h (what was in q3 in the arm
// case), while the 16 byte input vectors will use v6.8h + v7.8h.
// Such a macro invocation would look like this:
//      uaddl_sz        v8.8h,  v9.8h,  v17, v18, \sz
//
// That is, in the 8 byte input vector case, the second register in these
// register pairs will be unused.
// Unfortunately, this makes the code quite hard to read. For readability,
// see the arm version instead.
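//
// As a concrete illustration of the templating (see the _sz helper macros
// below), the invocation above,
//      uaddl_sz        v8.8h,  v9.8h,  v17, v18, \sz
// expands to
//      uaddl           v8.8h,  v17.8b,  v18.8b
// when \sz is .8b, and to the pair
//      uaddl           v8.8h,  v17.8b,  v18.8b
//      uaddl2          v9.8h,  v17.16b, v18.16b
// when \sz is .16b.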


.macro add_sz dst1, dst2, in1, in2, in3, in4, sz
        add             \dst1,  \in1,  \in3
.ifc \sz, .16b
        add             \dst2,  \in2,  \in4
.endif
.endm

.macro sub_sz dst1, dst2, in1, in2, in3, in4, sz
        sub             \dst1,  \in1,  \in3
.ifc \sz, .16b
        sub             \dst2,  \in2,  \in4
.endif
.endm

.macro uaddw_sz dst1, dst2, in1, in2, in3, sz
        uaddw           \dst1,  \in1,  \in3\().8b
.ifc \sz, .16b
        uaddw2          \dst2,  \in2,  \in3\().16b
.endif
.endm

.macro usubw_sz dst1, dst2, in1, in2, in3, sz
        usubw           \dst1,  \in1,  \in3\().8b
.ifc \sz, .16b
        usubw2          \dst2,  \in2,  \in3\().16b
.endif
.endm

.macro usubl_sz dst1, dst2, in1, in2, sz
        usubl           \dst1,  \in1\().8b,  \in2\().8b
.ifc \sz, .16b
        usubl2          \dst2,  \in1\().16b, \in2\().16b
.endif
.endm

.macro sqxtn_sz dst, in1, in2, sz
        sqxtn           \dst\().8b,  \in1
.ifc \sz, .16b
        sqxtn2          \dst\().16b, \in2
.endif
.endm

.macro sqxtun_sz dst, in1, in2, sz
        sqxtun          \dst\().8b,  \in1
.ifc \sz, .16b
        sqxtun2         \dst\().16b, \in2
.endif
.endm

.macro mul_sz dst1, dst2, in1, in2, in3, in4, sz
        mul             \dst1,  \in1,  \in3
.ifc \sz, .16b
        mul             \dst2,  \in2,  \in4
.endif
.endm

.macro saddw_sz dst1, dst2, in1, in2, in3, sz
        saddw           \dst1,  \in1,  \in3\().8b
.ifc \sz, .16b
        saddw2          \dst2,  \in2,  \in3\().16b
.endif
.endm

.macro ssubw_sz dst1, dst2, in1, in2, in3, sz
        ssubw           \dst1,  \in1,  \in3\().8b
.ifc \sz, .16b
        ssubw2          \dst2,  \in2,  \in3\().16b
.endif
.endm

.macro uxtl_sz dst1, dst2, in, sz
        uxtl            \dst1,  \in\().8b
.ifc \sz, .16b
        uxtl2           \dst2,  \in\().16b
.endif
.endm

.macro uaddl_sz dst1, dst2, in1, in2, sz
        uaddl           \dst1,  \in1\().8b,  \in2\().8b
.ifc \sz, .16b
        uaddl2          \dst2,  \in1\().16b, \in2\().16b
.endif
.endm

.macro rshrn_sz dst, in1, in2, shift, sz
        rshrn           \dst\().8b,  \in1, \shift
.ifc \sz, .16b
        rshrn2          \dst\().16b, \in2, \shift
.endif
.endm

.macro ushll_sz dst1, dst2, in, shift, sz
        ushll           \dst1,  \in\().8b,  \shift
.ifc \sz, .16b
        ushll2          \dst2,  \in\().16b, \shift
.endif
.endm

// The input to and output from this macro is in the registers v16-v31,
// and v0-v7 are used as scratch registers.
// p7 = v16 .. p3 = v20, p0 = v23, q0 = v24, q3 = v27, q7 = v31
// Depending on the width of the loop filter, we either use v16-v19
// and v28-v31 as temp registers, or v8-v15.
// When comparing to the arm version, tmpq1 == tmp1 + tmp2,
// tmpq2 == tmp3 + tmp4, etc.
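// For reference, the filter-mask (fm) computation at the start of the macro
// corresponds roughly to this per-pixel scalar check, with E and I being the
// thresholds loaded into v0 and v2:
//      fm = max(abs(p3 - p2), abs(p2 - p1), abs(p1 - p0),
//               abs(q0 - q1), abs(q1 - q2), abs(q2 - q3)) <= I &&
//           abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) <= E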
.macro loop_filter wd, sz, mix, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8
.if \mix == 0
        dup             v0\sz,  w2                      // E
        dup             v2\sz,  w3                      // I
        dup             v3\sz,  w4                      // H
.else
        dup             v0.8h,  w2                      // E
        dup             v2.8h,  w3                      // I
        dup             v3.8h,  w4                      // H
        rev16           v1.16b, v0.16b                  // E
        rev16           v4.16b, v2.16b                  // I
        rev16           v5.16b, v3.16b                  // H
        uzp1            v0.16b, v0.16b, v1.16b
        uzp1            v2.16b, v2.16b, v4.16b
        uzp1            v3.16b, v3.16b, v5.16b
.endif

        uabd            v4\sz,  v20\sz, v21\sz          // abs(p3 - p2)
        uabd            v5\sz,  v21\sz, v22\sz          // abs(p2 - p1)
        uabd            v6\sz,  v22\sz, v23\sz          // abs(p1 - p0)
        uabd            v7\sz,  v24\sz, v25\sz          // abs(q0 - q1)
        uabd            \tmp1\sz, v25\sz, v26\sz        // abs(q1 - q2)
        uabd            \tmp2\sz, v26\sz, v27\sz        // abs(q2 - q3)
        umax            v4\sz,  v4\sz,  v5\sz
        umax            v5\sz,  v6\sz,  v7\sz
        umax            \tmp1\sz, \tmp1\sz, \tmp2\sz
        uabd            v6\sz,  v23\sz, v24\sz          // abs(p0 - q0)
        umax            v4\sz,  v4\sz,  v5\sz
        uqadd           v6\sz,  v6\sz,  v6\sz           // abs(p0 - q0) * 2
        uabd            v5\sz,  v22\sz, v25\sz          // abs(p1 - q1)
        umax            v4\sz,  v4\sz,  \tmp1\sz        // max(abs(p3 - p2), ..., abs(q2 - q3))
        ushr            v5\sz,  v5\sz,  #1
        cmhs            v4\sz,  v2\sz,  v4\sz           // max(abs()) <= I
        uqadd           v6\sz,  v6\sz,  v5\sz           // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
        cmhs            v5\sz,  v0\sz,  v6\sz
        and             v4\sz,  v4\sz,  v5\sz           // fm

        // If no pixels need filtering, just exit as soon as possible
        mov             x5,  v4.d[0]
.ifc \sz, .16b
        mov             x6,  v4.d[1]
        adds            x5,  x5,  x6
        b.eq            9f
.else
        cbz             x5,  9f
.endif

.if \wd >= 8
        movi            v0\sz,  #1

        uabd            v6\sz,  v20\sz, v23\sz          // abs(p3 - p0)
        uabd            v2\sz,  v21\sz, v23\sz          // abs(p2 - p0)
        uabd            v1\sz,  v22\sz, v23\sz          // abs(p1 - p0)
        uabd            \tmp1\sz, v25\sz, v24\sz        // abs(q1 - q0)
        uabd            \tmp2\sz, v26\sz, v24\sz        // abs(q2 - q0)
        uabd            \tmp3\sz, v27\sz, v24\sz        // abs(q3 - q0)
        umax            v6\sz,  v6\sz,  v2\sz
        umax            v1\sz,  v1\sz,  \tmp1\sz
        umax            \tmp2\sz, \tmp2\sz, \tmp3\sz
.if \wd == 16
        uabd            v7\sz,  v16\sz, v23\sz          // abs(p7 - p0)
        umax            v6\sz,  v6\sz,  v1\sz
        uabd            v2\sz,  v17\sz, v23\sz          // abs(p6 - p0)
        umax            v6\sz,  v6\sz,  \tmp2\sz
        uabd            v1\sz,  v18\sz, v23\sz          // abs(p5 - p0)
        cmhs            v6\sz,  v0\sz,  v6\sz           // flat8in
        uabd            v8\sz,  v19\sz, v23\sz          // abs(p4 - p0)
        and             v6\sz,  v6\sz,  v4\sz           // flat8in && fm
        uabd            v9\sz,  v28\sz, v24\sz          // abs(q4 - q0)
        bic             v4\sz,  v4\sz,  v6\sz           // fm && !flat8in
        uabd            v10\sz, v29\sz, v24\sz          // abs(q5 - q0)
        uabd            v11\sz, v30\sz, v24\sz          // abs(q6 - q0)
        uabd            v12\sz, v31\sz, v24\sz          // abs(q7 - q0)

        umax            v7\sz,  v7\sz,  v2\sz
        umax            v1\sz,  v1\sz,  v8\sz
        umax            v9\sz,  v9\sz,  v10\sz
        umax            v11\sz, v11\sz, v12\sz
        // The rest of the calculation of flat8out is interleaved below
.else
        // The rest of the calculation of flat8in is interleaved below
.endif
.endif

        // Calculate the normal inner loop filter for 2 or 4 pixels
        uabd            v5\sz,  v22\sz, v23\sz          // abs(p1 - p0)
.if \wd == 16
        umax            v7\sz,  v7\sz,  v1\sz
        umax            v9\sz,  v9\sz,  v11\sz
.elseif \wd == 8
        umax            v6\sz,  v6\sz,  v1\sz
.endif
        uabd            v1\sz,  v25\sz, v24\sz          // abs(q1 - q0)
.if \wd == 16
        umax            v7\sz,  v7\sz,  v9\sz
.elseif \wd == 8
        umax            v6\sz,  v6\sz,  \tmp2\sz
.endif
        usubl_sz        \tmp1\().8h, \tmp2\().8h, v22, v25, \sz // p1 - q1
        umax            v5\sz,  v5\sz,  v1\sz           // max(abs(p1 - p0), abs(q1 - q0))
.if \mix != 0
        mov             v1.d[0], x11
.endif
        usubl_sz        \tmp3\().8h, \tmp4\().8h, v24, v23, \sz // q0 - p0
        movi            \tmp5\().8h, #3
.if \wd == 8
        cmhs            v6\sz,  v0\sz,  v6\sz           // flat8in
.endif
.if \mix != 0
        sxtl            v1.8h,  v1.8b
.endif
        cmhs            v5\sz,  v3\sz,  v5\sz           // !hev
.if \wd == 8
        // If a 4/8 or 8/4 mix is used, clear the relevant half of v6
.if \mix != 0
        and             v6\sz,  v6\sz,  v1.16b
.endif
        and             v6\sz,  v6\sz,  v4\sz           // flat8in && fm
.endif
        sqxtn_sz        \tmp1,  \tmp1\().8h, \tmp2\().8h, \sz // av_clip_int8(p1 - q1)
.if \wd == 16
        cmhs            v7\sz,  v0\sz,  v7\sz           // flat8out
.elseif \wd == 8
        bic             v4\sz,  v4\sz,  v6\sz           // fm && !flat8in
.endif
        and             v5\sz,  v5\sz,  v4\sz           // !hev && fm && !flat8in
.if \wd == 16
        and             v7\sz,  v7\sz,  v6\sz           // flat8out && flat8in && fm
.endif

        mul_sz          \tmp3\().8h, \tmp4\().8h, \tmp3\().8h, \tmp4\().8h, \tmp5\().8h, \tmp5\().8h, \sz // 3 * (q0 - p0)
        bic             \tmp1\sz, \tmp1\sz, v5\sz       // if (!hev) av_clip_int8 = 0
        movi            v2\sz,  #4
        saddw_sz        \tmp3\().8h, \tmp4\().8h, \tmp3\().8h, \tmp4\().8h, \tmp1, \sz // 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)]
        movi            v3\sz,  #3
        sqxtn_sz        \tmp1,  \tmp3\().8h, \tmp4\().8h, \sz // f
.if \wd == 16
        bic             v6\sz,  v6\sz,  v7\sz           // fm && flat8in && !flat8out
.endif

        sqadd           \tmp3\sz, \tmp1\sz, v2\sz       // FFMIN(f + 4, 127)
        sqadd           \tmp4\sz, \tmp1\sz, v3\sz       // FFMIN(f + 3, 127)
        uxtl_sz         v0.8h,  v1.8h,  v23, \sz        // p0
        sshr            \tmp3\sz, \tmp3\sz, #3          // f1
        sshr            \tmp4\sz, \tmp4\sz, #3          // f2

        uxtl_sz         v2.8h,  v3.8h,  v24, \sz        // q0
        saddw_sz        v0.8h,  v1.8h,  v0.8h,  v1.8h,  \tmp4, \sz // p0 + f2
        ssubw_sz        v2.8h,  v3.8h,  v2.8h,  v3.8h,  \tmp3, \sz // q0 - f1
        sqxtun_sz       v0,  v0.8h,  v1.8h,  \sz        // out p0
        sqxtun_sz       v1,  v2.8h,  v3.8h,  \sz        // out q0
        srshr           \tmp3\sz, \tmp3\sz, #1          // f = (f1 + 1) >> 1
        bit             v23\sz, v0\sz,  v4\sz           // if (fm && !flat8in)
        bit             v24\sz, v1\sz,  v4\sz

        uxtl_sz         v0.8h,  v1.8h,  v22, \sz        // p1
        uxtl_sz         v2.8h,  v3.8h,  v25, \sz        // q1
.if \wd >= 8
        mov             x5,  v6.d[0]
.ifc \sz, .16b
        mov             x6,  v6.d[1]
.endif
.endif
        saddw_sz        v0.8h,  v1.8h,  v0.8h,  v1.8h,  \tmp3, \sz // p1 + f
        ssubw_sz        v2.8h,  v3.8h,  v2.8h,  v3.8h,  \tmp3, \sz // q1 - f
        sqxtun_sz       v0,  v0.8h,  v1.8h,  \sz        // out p1
        sqxtun_sz       v2,  v2.8h,  v3.8h,  \sz        // out q1
.if \wd >= 8
.ifc \sz, .16b
        adds            x5,  x5,  x6
.endif
.endif
        bit             v22\sz, v0\sz,  v5\sz           // if (!hev && fm && !flat8in)
        bit             v25\sz, v2\sz,  v5\sz

        // If no pixels need flat8in, jump to flat8out
        // (or to a writeout of the inner 4 pixels, for wd=8)
.if \wd >= 8
.ifc \sz, .16b
        b.eq            6f
.else
        cbz             x5,  6f
.endif

        // flat8in
        uaddl_sz        \tmp1\().8h, \tmp2\().8h, v20, v21, \sz
        uaddl_sz        \tmp3\().8h, \tmp4\().8h, v22, v25, \sz
        uaddl_sz        \tmp5\().8h, \tmp6\().8h, v20, v22, \sz
        uaddl_sz        \tmp7\().8h, \tmp8\().8h, v23, v26, \sz
        add_sz          v0.8h,  v1.8h,  \tmp1\().8h, \tmp2\().8h, \tmp1\().8h, \tmp2\().8h, \sz
        uaddw_sz        v0.8h,  v1.8h,  v0.8h,  v1.8h,  v23, \sz
        uaddw_sz        v0.8h,  v1.8h,  v0.8h,  v1.8h,  v24, \sz
        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  \tmp5\().8h, \tmp6\().8h, \sz
        sub_sz          \tmp3\().8h, \tmp4\().8h, \tmp3\().8h, \tmp4\().8h, \tmp1\().8h, \tmp2\().8h, \sz
        sub_sz          \tmp7\().8h, \tmp8\().8h, \tmp7\().8h, \tmp8\().8h, \tmp5\().8h, \tmp6\().8h, \sz
        rshrn_sz        v2,  v0.8h,  v1.8h,  #3,  \sz   // out p2

        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  \tmp3\().8h, \tmp4\().8h, \sz
        uaddl_sz        \tmp1\().8h, \tmp2\().8h, v20, v23, \sz
        uaddl_sz        \tmp3\().8h, \tmp4\().8h, v24, v27, \sz
        rshrn_sz        v3,  v0.8h,  v1.8h,  #3,  \sz   // out p1

        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  \tmp7\().8h, \tmp8\().8h, \sz
        sub_sz          \tmp3\().8h, \tmp4\().8h, \tmp3\().8h, \tmp4\().8h, \tmp1\().8h, \tmp2\().8h, \sz
        uaddl_sz        \tmp5\().8h, \tmp6\().8h, v21, v24, \sz
        uaddl_sz        \tmp7\().8h, \tmp8\().8h, v25, v27, \sz
        rshrn_sz        v4,  v0.8h,  v1.8h,  #3,  \sz   // out p0

        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  \tmp3\().8h, \tmp4\().8h, \sz
        sub_sz          \tmp7\().8h, \tmp8\().8h, \tmp7\().8h, \tmp8\().8h, \tmp5\().8h, \tmp6\().8h, \sz
        uaddl_sz        \tmp1\().8h, \tmp2\().8h, v22, v25, \sz
        uaddl_sz        \tmp3\().8h, \tmp4\().8h, v26, v27, \sz
        rshrn_sz        v5,  v0.8h,  v1.8h,  #3,  \sz   // out q0

        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  \tmp7\().8h, \tmp8\().8h, \sz
        sub_sz          \tmp3\().8h, \tmp4\().8h, \tmp3\().8h, \tmp4\().8h, \tmp1\().8h, \tmp2\().8h, \sz
        rshrn_sz        \tmp5,  v0.8h,  v1.8h,  #3,  \sz // out q1

        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  \tmp3\().8h, \tmp4\().8h, \sz
        // The output here is written back into the input registers. This doesn't
        // matter for the flat8out part below, since we only update those pixels
        // which won't be touched below.
        bit             v21\sz, v2\sz,  v6\sz
        bit             v22\sz, v3\sz,  v6\sz
        bit             v23\sz, v4\sz,  v6\sz
        rshrn_sz        \tmp6,  v0.8h,  v1.8h,  #3,  \sz // out q2
        bit             v24\sz, v5\sz,  v6\sz
        bit             v25\sz, \tmp5\sz, v6\sz
        bit             v26\sz, \tmp6\sz, v6\sz
.endif
.if \wd == 16
6:
        orr             v2\sz,  v6\sz,  v7\sz
        mov             x5,  v2.d[0]
.ifc \sz, .16b
        mov             x6,  v2.d[1]
        adds            x5,  x5,  x6
        b.ne            1f
.else
        cbnz            x5,  1f
.endif
        // If no pixels needed flat8in nor flat8out, jump to a
        // writeout of the inner 4 pixels
        br              x14
1:

        mov             x5,  v7.d[0]
.ifc \sz, .16b
        mov             x6,  v7.d[1]
        adds            x5,  x5,  x6
        b.ne            1f
.else
        cbnz            x5,  1f
.endif
        // If no pixels need flat8out, jump to a writeout of the inner 6 pixels
        br              x15

1:
        // flat8out
        // This writes all outputs into v2-v17 (skipping v7 and v16).
        // If this part is skipped, the output is read from v21-v26 (which is the input
        // to this section).
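        // For reference, every flat8out output computed below has the form
        //      out = (sum + 8) >> 4
        // where the first sum is 7*p7 + 2*p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0
        // (giving out p6), and each following sum is derived from the previous
        // one by the interleaved uaddl_sz/sub_sz pairs, which add the incoming
        // pixels and subtract the outgoing ones.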
        ushll_sz        v0.8h,  v1.8h,  v16, #3, \sz    // 8 * v16
        usubw_sz        v0.8h,  v1.8h,  v0.8h,  v1.8h,  v16, \sz // 7 * v16
        uaddw_sz        v0.8h,  v1.8h,  v0.8h,  v1.8h,  v17, \sz
        uaddl_sz        v8.8h,  v9.8h,  v17, v18, \sz
        uaddl_sz        v10.8h, v11.8h, v19, v20, \sz
        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v8.8h,  v9.8h,  \sz
        uaddl_sz        v8.8h,  v9.8h,  v16, v17, \sz
        uaddl_sz        v12.8h, v13.8h, v21, v22, \sz
        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v10.8h, v11.8h, \sz
        uaddl_sz        v10.8h, v11.8h, v18, v25, \sz
        uaddl_sz        v14.8h, v15.8h, v23, v24, \sz
        sub_sz          v10.8h, v11.8h, v10.8h, v11.8h, v8.8h,  v9.8h,  \sz
        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v12.8h, v13.8h, \sz
        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v14.8h, v15.8h, \sz
        uaddl_sz        v12.8h, v13.8h, v16, v18, \sz
        uaddl_sz        v14.8h, v15.8h, v19, v26, \sz
        rshrn_sz        v2,  v0.8h,  v1.8h,  #4,  \sz

        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v10.8h, v11.8h, \sz
        uaddl_sz        v8.8h,  v9.8h,  v16, v19, \sz
        uaddl_sz        v10.8h, v11.8h, v20, v27, \sz
        sub_sz          v14.8h, v15.8h, v14.8h, v15.8h, v12.8h, v13.8h, \sz
        bif             v2\sz,  v17\sz, v7\sz
        rshrn_sz        v3,  v0.8h,  v1.8h,  #4,  \sz

        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v14.8h, v15.8h, \sz
        uaddl_sz        v12.8h, v13.8h, v16, v20, \sz
        uaddl_sz        v14.8h, v15.8h, v21, v28, \sz
        sub_sz          v10.8h, v11.8h, v10.8h, v11.8h, v8.8h,  v9.8h,  \sz
        bif             v3\sz,  v18\sz, v7\sz
        rshrn_sz        v4,  v0.8h,  v1.8h,  #4,  \sz

        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v10.8h, v11.8h, \sz
        uaddl_sz        v8.8h,  v9.8h,  v16, v21, \sz
        uaddl_sz        v10.8h, v11.8h, v22, v29, \sz
        sub_sz          v14.8h, v15.8h, v14.8h, v15.8h, v12.8h, v13.8h, \sz
        bif             v4\sz,  v19\sz, v7\sz
        rshrn_sz        v5,  v0.8h,  v1.8h,  #4,  \sz

        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v14.8h, v15.8h, \sz
        uaddl_sz        v12.8h, v13.8h, v16, v22, \sz
        uaddl_sz        v14.8h, v15.8h, v23, v30, \sz
        sub_sz          v10.8h, v11.8h, v10.8h, v11.8h, v8.8h,  v9.8h,  \sz
        bif             v5\sz,  v20\sz, v7\sz
        rshrn_sz        v6,  v0.8h,  v1.8h,  #4,  \sz

        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v10.8h, v11.8h, \sz
        uaddl_sz        v10.8h, v11.8h, v16, v23, \sz
        sub_sz          v14.8h, v15.8h, v14.8h, v15.8h, v12.8h, v13.8h, \sz
        uaddl_sz        v12.8h, v13.8h, v24, v31, \sz
        bif             v6\sz,  v21\sz, v7\sz
        rshrn_sz        v8,  v0.8h,  v1.8h,  #4,  \sz

        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v14.8h, v15.8h, \sz
        sub_sz          v10.8h, v11.8h, v12.8h, v13.8h, v10.8h, v11.8h, \sz
        uaddl_sz        v12.8h, v13.8h, v17, v24, \sz
        uaddl_sz        v14.8h, v15.8h, v25, v31, \sz
        bif             v8\sz,  v22\sz, v7\sz
        rshrn_sz        v9,  v0.8h,  v1.8h,  #4,  \sz

        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v10.8h, v11.8h, \sz
        sub_sz          v14.8h, v15.8h, v14.8h, v15.8h, v12.8h, v13.8h, \sz
        uaddl_sz        v12.8h, v13.8h, v26, v31, \sz
        bif             v9\sz,  v23\sz, v7\sz
        rshrn_sz        v10,  v0.8h,  v1.8h,  #4,  \sz

        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v14.8h, v15.8h, \sz
        uaddl_sz        v14.8h, v15.8h, v18, v25, \sz
        uaddl_sz        v18.8h, v19.8h, v19, v26, \sz
        sub_sz          v12.8h, v13.8h, v12.8h, v13.8h, v14.8h, v15.8h, \sz
        uaddl_sz        v14.8h, v15.8h, v27, v31, \sz
        bif             v10\sz, v24\sz, v7\sz
        rshrn_sz        v11,  v0.8h,  v1.8h,  #4,  \sz

        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v12.8h, v13.8h, \sz
        uaddl_sz        v12.8h, v13.8h, v20, v27, \sz
        sub_sz          v14.8h, v15.8h, v14.8h, v15.8h, v18.8h, v19.8h, \sz
        uaddl_sz        v18.8h, v19.8h, v28, v31, \sz
        bif             v11\sz, v25\sz, v7\sz
        sub_sz          v18.8h, v19.8h, v18.8h, v19.8h, v12.8h, v13.8h, \sz
        rshrn_sz        v12,  v0.8h,  v1.8h,  #4,  \sz

        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v14.8h, v15.8h, \sz
        uaddl_sz        v14.8h, v15.8h, v21, v28, \sz
        uaddl_sz        v20.8h, v21.8h, v29, v31, \sz
        bif             v12\sz, v26\sz, v7\sz
        rshrn_sz        v13,  v0.8h,  v1.8h,  #4,  \sz

        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v18.8h, v19.8h, \sz
        sub_sz          v20.8h, v21.8h, v20.8h, v21.8h, v14.8h, v15.8h, \sz
        uaddl_sz        v18.8h, v19.8h, v22, v29, \sz
        uaddl_sz        v22.8h, v23.8h, v30, v31, \sz
        bif             v13\sz, v27\sz, v7\sz
        rshrn_sz        v14,  v0.8h,  v1.8h,  #4,  \sz

        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v20.8h, v21.8h, \sz
        sub_sz          v22.8h, v23.8h, v22.8h, v23.8h, v18.8h, v19.8h, \sz
        bif             v14\sz, v28\sz, v7\sz
        rshrn_sz        v15,  v0.8h,  v1.8h,  #4,  \sz

        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v22.8h, v23.8h, \sz
        bif             v15\sz, v29\sz, v7\sz
        rshrn_sz        v17,  v0.8h,  v1.8h,  #4,  \sz
        bif             v17\sz, v30\sz, v7\sz
.endif
.endm

// For wd <= 8, we use v16-v19 and v28-v31 for temp registers,
// while for wd=16 we need those for inputs/outputs and use v8-v15
// for temp registers there instead.
function vp9_loop_filter_4
        loop_filter     4,  .8b,  0,  v16, v17, v18, v19, v28, v29, v30, v31
        ret
9:
        br              x10
endfunc

function vp9_loop_filter_4_16b_mix_44
        loop_filter     4,  .16b, 44, v16, v17, v18, v19, v28, v29, v30, v31
        ret
9:
        br              x10
endfunc

function vp9_loop_filter_8
        loop_filter     8,  .8b,  0,  v16, v17, v18, v19, v28, v29, v30, v31
        ret
6:
        br              x13
9:
        br              x10
endfunc

function vp9_loop_filter_8_16b_mix
        loop_filter     8,  .16b, 88, v16, v17, v18, v19, v28, v29, v30, v31
        ret
6:
        br              x13
9:
        br              x10
endfunc

function vp9_loop_filter_16
        loop_filter     16, .8b,  0,  v8,  v9,  v10, v11, v12, v13, v14, v15
        ret
9:
        ldp             d8,  d9,  [sp], 0x10
        ldp             d10, d11, [sp], 0x10
        ldp             d12, d13, [sp], 0x10
        ldp             d14, d15, [sp], 0x10
        br              x10
endfunc

function vp9_loop_filter_16_16b
        loop_filter     16, .16b, 0,  v8,  v9,  v10, v11, v12, v13, v14, v15
        ret
9:
        ldp             d8,  d9,  [sp], 0x10
        ldp             d10, d11, [sp], 0x10
        ldp             d12, d13, [sp], 0x10
        ldp             d14, d15, [sp], 0x10
        br              x10
endfunc

.macro loop_filter_4
        bl              vp9_loop_filter_4
.endm

.macro loop_filter_4_16b_mix mix
        bl              vp9_loop_filter_4_16b_mix_\mix
.endm

.macro loop_filter_8
        // calculate alternative 'return' targets
        adr             x13, 6f
        bl              vp9_loop_filter_8
.endm

.macro loop_filter_8_16b_mix mix
        // calculate alternative 'return' targets
        adr             x13, 6f
.if \mix == 48
        mov             x11, #0xffffffff00000000
.elseif \mix == 84
        mov             x11, #0x00000000ffffffff
.else
        mov             x11, #0xffffffffffffffff
.endif
        bl              vp9_loop_filter_8_16b_mix
.endm

.macro loop_filter_16
        // calculate alternative 'return' targets
        adr             x14, 7f
        adr             x15, 8f
        bl              vp9_loop_filter_16
.endm

.macro loop_filter_16_16b
        // calculate alternative 'return' targets
        adr             x14, 7f
        adr             x15, 8f
        bl              vp9_loop_filter_16_16b
.endm


// The public functions in this file have the following signature:
// void loop_filter(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr);

function ff_vp9_loop_filter_v_4_8_neon, export=1
        mov             x10, x30
        sub             x9,  x0,  x1, lsl #2
        ld1             {v20.8b}, [x9], x1 // p3
        ld1             {v24.8b}, [x0], x1 // q0
        ld1             {v21.8b}, [x9], x1 // p2
        ld1             {v25.8b}, [x0], x1 // q1
        ld1             {v22.8b}, [x9], x1 // p1
        ld1             {v26.8b}, [x0], x1 // q2
        ld1             {v23.8b}, [x9], x1 // p0
        ld1             {v27.8b}, [x0], x1 // q3
        sub             x0,  x0,  x1, lsl #2
        sub             x9,  x9,  x1, lsl #1

        loop_filter_4

        st1             {v22.8b}, [x9], x1
        st1             {v24.8b}, [x0], x1
        st1             {v23.8b}, [x9], x1
        st1             {v25.8b}, [x0], x1

        br              x10
endfunc

function ff_vp9_loop_filter_v_44_16_neon, export=1
        mov             x10, x30
        sub             x9,  x0,  x1, lsl #2
        ld1             {v20.16b}, [x9], x1 // p3
        ld1             {v24.16b}, [x0], x1 // q0
        ld1             {v21.16b}, [x9], x1 // p2
        ld1             {v25.16b}, [x0], x1 // q1
        ld1             {v22.16b}, [x9], x1 // p1
        ld1             {v26.16b}, [x0], x1 // q2
        ld1             {v23.16b}, [x9], x1 // p0
        ld1             {v27.16b}, [x0], x1 // q3
        sub             x0,  x0,  x1, lsl #2
        sub             x9,  x9,  x1, lsl #1

        loop_filter_4_16b_mix 44

        st1             {v22.16b}, [x9], x1
        st1             {v24.16b}, [x0], x1
        st1             {v23.16b}, [x9], x1
        st1             {v25.16b}, [x0], x1

        br              x10
endfunc

function ff_vp9_loop_filter_h_4_8_neon, export=1
        mov             x10, x30
        sub             x9,  x0,  #4
        add             x0,  x9,  x1, lsl #2
        ld1             {v20.8b}, [x9], x1
        ld1             {v24.8b}, [x0], x1
        ld1             {v21.8b}, [x9], x1
        ld1             {v25.8b}, [x0], x1
        ld1             {v22.8b}, [x9], x1
        ld1             {v26.8b}, [x0], x1
        ld1             {v23.8b}, [x9], x1
        ld1             {v27.8b}, [x0], x1

        sub             x9,  x9,  x1, lsl #2
        sub             x0,  x0,  x1, lsl #2
        // Move x0/x9 forward by 2 pixels; we don't need to rewrite the
        // outermost 2 pixels since they aren't changed.
        add             x9,  x9,  #2
        add             x0,  x0,  #2

        transpose_8x8B  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        loop_filter_4

        // We will only write the mid 4 pixels back; after the loop filter,
        // these are in v22, v23, v24, v25, ordered as rows (8x4 pixels).
        // We need to transpose them to columns, done with a 4x8 transpose
        // (which in practice is two 4x4 transposes of the two 4x4 halves
        // of the 8x4 pixels, into 4x8 pixels).
        transpose_4x8B  v22, v23, v24, v25, v26, v27, v28, v29
        st1             {v22.s}[0], [x9], x1
        st1             {v22.s}[1], [x0], x1
        st1             {v23.s}[0], [x9], x1
        st1             {v23.s}[1], [x0], x1
        st1             {v24.s}[0], [x9], x1
        st1             {v24.s}[1], [x0], x1
        st1             {v25.s}[0], [x9], x1
        st1             {v25.s}[1], [x0], x1

        br              x10
endfunc

function ff_vp9_loop_filter_h_44_16_neon, export=1
        mov             x10, x30
        sub             x9,  x0,  #4
        add             x0,  x9,  x1, lsl #3
        ld1             {v20.8b},   [x9], x1
        ld1             {v20.d}[1], [x0], x1
        ld1             {v21.8b},   [x9], x1
        ld1             {v21.d}[1], [x0], x1
        ld1             {v22.8b},   [x9], x1
        ld1             {v22.d}[1], [x0], x1
        ld1             {v23.8b},   [x9], x1
        ld1             {v23.d}[1], [x0], x1
        ld1             {v24.8b},   [x9], x1
        ld1             {v24.d}[1], [x0], x1
        ld1             {v25.8b},   [x9], x1
        ld1             {v25.d}[1], [x0], x1
        ld1             {v26.8b},   [x9], x1
        ld1             {v26.d}[1], [x0], x1
        ld1             {v27.8b},   [x9], x1
        ld1             {v27.d}[1], [x0], x1

        sub             x9,  x9,  x1, lsl #3
        sub             x0,  x0,  x1, lsl #3
        add             x9,  x9,  #2
        add             x0,  x0,  #2

        transpose_8x16B v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        loop_filter_4_16b_mix 44

        transpose_4x16B v22, v23, v24, v25, v26, v27, v28, v29

        st1             {v22.s}[0], [x9], x1
        st1             {v22.s}[2], [x0], x1
        st1             {v23.s}[0], [x9], x1
        st1             {v23.s}[2], [x0], x1
        st1             {v24.s}[0], [x9], x1
        st1             {v24.s}[2], [x0], x1
        st1             {v25.s}[0], [x9], x1
        st1             {v25.s}[2], [x0], x1
        st1             {v22.s}[1], [x9], x1
        st1             {v22.s}[3], [x0], x1
        st1             {v23.s}[1], [x9], x1
        st1             {v23.s}[3], [x0], x1
        st1             {v24.s}[1], [x9], x1
        st1             {v24.s}[3], [x0], x1
        st1             {v25.s}[1], [x9], x1
        st1             {v25.s}[3], [x0], x1

        br              x10
endfunc

function ff_vp9_loop_filter_v_8_8_neon, export=1
        mov             x10, x30
        sub             x9,  x0,  x1, lsl #2
        ld1             {v20.8b}, [x9], x1 // p3
        ld1             {v24.8b}, [x0], x1 // q0
        ld1             {v21.8b}, [x9], x1 // p2
        ld1             {v25.8b}, [x0], x1 // q1
        ld1             {v22.8b}, [x9], x1 // p1
        ld1             {v26.8b}, [x0], x1 // q2
        ld1             {v23.8b}, [x9], x1 // p0
        ld1             {v27.8b}, [x0], x1 // q3
        sub             x9,  x9,  x1, lsl #2
        sub             x0,  x0,  x1, lsl #2
        add             x9,  x9,  x1

        loop_filter_8

        st1             {v21.8b}, [x9], x1
        st1             {v24.8b}, [x0], x1
        st1             {v22.8b}, [x9], x1
        st1             {v25.8b}, [x0], x1
        st1             {v23.8b}, [x9], x1
        st1             {v26.8b}, [x0], x1

        br              x10
6:
        sub             x9,  x0,  x1, lsl #1
        st1             {v22.8b}, [x9], x1
        st1             {v24.8b}, [x0], x1
        st1             {v23.8b}, [x9], x1
        st1             {v25.8b}, [x0], x1
        br              x10
endfunc

.macro mix_v_16 mix
function ff_vp9_loop_filter_v_\mix\()_16_neon, export=1
        mov             x10, x30
        sub             x9,  x0,  x1, lsl #2
        ld1             {v20.16b}, [x9], x1 // p3
        ld1             {v24.16b}, [x0], x1 // q0
        ld1             {v21.16b}, [x9], x1 // p2
        ld1             {v25.16b}, [x0], x1 // q1
        ld1             {v22.16b}, [x9], x1 // p1
        ld1             {v26.16b}, [x0], x1 // q2
        ld1             {v23.16b}, [x9], x1 // p0
        ld1             {v27.16b}, [x0], x1 // q3
        sub             x9,  x9,  x1, lsl #2
        sub             x0,  x0,  x1, lsl #2
        add             x9,  x9,  x1

        loop_filter_8_16b_mix \mix

        st1             {v21.16b}, [x9], x1
        st1             {v24.16b}, [x0], x1
        st1             {v22.16b}, [x9], x1
        st1             {v25.16b}, [x0], x1
        st1             {v23.16b}, [x9], x1
        st1             {v26.16b}, [x0], x1

        br              x10
6:
        sub             x9,  x0,  x1, lsl #1
        st1             {v22.16b}, [x9], x1
        st1             {v24.16b}, [x0], x1
        st1             {v23.16b}, [x9], x1
        st1             {v25.16b}, [x0], x1
        br              x10
endfunc
.endm

mix_v_16 48
mix_v_16 84
mix_v_16 88

function ff_vp9_loop_filter_h_8_8_neon, export=1
        mov             x10, x30
        sub             x9,  x0,  #4
        add             x0,  x9,  x1, lsl #2
        ld1             {v20.8b}, [x9], x1
        ld1             {v24.8b}, [x0], x1
        ld1             {v21.8b}, [x9], x1
        ld1             {v25.8b}, [x0], x1
        ld1             {v22.8b}, [x9], x1
        ld1             {v26.8b}, [x0], x1
        ld1             {v23.8b}, [x9], x1
        ld1             {v27.8b}, [x0], x1

        sub             x9,  x9,  x1, lsl #2
        sub             x0,  x0,  x1, lsl #2

        transpose_8x8B  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        loop_filter_8

        // Even though only 6 pixels per row have been changed, we write the
        // full 8 pixel registers.
        transpose_8x8B  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        st1             {v20.8b}, [x9], x1
        st1             {v24.8b}, [x0], x1
        st1             {v21.8b}, [x9], x1
        st1             {v25.8b}, [x0], x1
        st1             {v22.8b}, [x9], x1
        st1             {v26.8b}, [x0], x1
        st1             {v23.8b}, [x9], x1
        st1             {v27.8b}, [x0], x1

        br              x10
6:
        // If we didn't need to do the flat8in part, we use the same writeback
        // as in loop_filter_h_4_8.
        add             x9,  x9,  #2
        add             x0,  x0,  #2
        transpose_4x8B  v22, v23, v24, v25, v26, v27, v28, v29
        st1             {v22.s}[0], [x9], x1
        st1             {v22.s}[1], [x0], x1
        st1             {v23.s}[0], [x9], x1
        st1             {v23.s}[1], [x0], x1
        st1             {v24.s}[0], [x9], x1
        st1             {v24.s}[1], [x0], x1
        st1             {v25.s}[0], [x9], x1
        st1             {v25.s}[1], [x0], x1
        br              x10
endfunc

.macro mix_h_16 mix
function ff_vp9_loop_filter_h_\mix\()_16_neon, export=1
        mov             x10, x30
        sub             x9,  x0,  #4
        add             x0,  x9,  x1, lsl #3
        ld1             {v20.8b},   [x9], x1
        ld1             {v20.d}[1], [x0], x1
        ld1             {v21.8b},   [x9], x1
        ld1             {v21.d}[1], [x0], x1
        ld1             {v22.8b},   [x9], x1
        ld1             {v22.d}[1], [x0], x1
        ld1             {v23.8b},   [x9], x1
        ld1             {v23.d}[1], [x0], x1
        ld1             {v24.8b},   [x9], x1
        ld1             {v24.d}[1], [x0], x1
        ld1             {v25.8b},   [x9], x1
        ld1             {v25.d}[1], [x0], x1
        ld1             {v26.8b},   [x9], x1
        ld1             {v26.d}[1], [x0], x1
        ld1             {v27.8b},   [x9], x1
        ld1             {v27.d}[1], [x0], x1

        sub             x9,  x9,  x1, lsl #3
        sub             x0,  x0,  x1, lsl #3

        transpose_8x16B v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        loop_filter_8_16b_mix \mix

        transpose_8x16B v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        st1             {v20.8b},   [x9], x1
        st1             {v20.d}[1], [x0], x1
        st1             {v21.8b},   [x9], x1
        st1             {v21.d}[1], [x0], x1
        st1             {v22.8b},   [x9], x1
        st1             {v22.d}[1], [x0], x1
        st1             {v23.8b},   [x9], x1
        st1             {v23.d}[1], [x0], x1
        st1             {v24.8b},   [x9], x1
        st1             {v24.d}[1], [x0], x1
        st1             {v25.8b},   [x9], x1
        st1             {v25.d}[1], [x0], x1
        st1             {v26.8b},   [x9], x1
        st1             {v26.d}[1], [x0], x1
        st1             {v27.8b},   [x9], x1
        st1             {v27.d}[1], [x0], x1

        br              x10
6:
        add             x9,  x9,  #2
        add             x0,  x0,  #2
        transpose_4x16B v22, v23, v24, v25, v26, v27, v28, v29
        st1             {v22.s}[0], [x9], x1
        st1             {v22.s}[2], [x0], x1
        st1             {v23.s}[0], [x9], x1
        st1             {v23.s}[2], [x0], x1
        st1             {v24.s}[0], [x9], x1
        st1             {v24.s}[2], [x0], x1
        st1             {v25.s}[0], [x9], x1
        st1             {v25.s}[2], [x0], x1
        st1             {v22.s}[1], [x9], x1
        st1             {v22.s}[3], [x0], x1
        st1             {v23.s}[1], [x9], x1
        st1             {v23.s}[3], [x0], x1
        st1             {v24.s}[1], [x9], x1
        st1             {v24.s}[3], [x0], x1
        st1             {v25.s}[1], [x9], x1
        st1             {v25.s}[3], [x0], x1
        br              x10
endfunc
.endm

mix_h_16 48
mix_h_16 84
mix_h_16 88

function ff_vp9_loop_filter_v_16_8_neon, export=1
        mov             x10, x30
        stp             d14, d15, [sp, #-0x10]!
        stp             d12, d13, [sp, #-0x10]!
        stp             d10, d11, [sp, #-0x10]!
        stp             d8,  d9,  [sp, #-0x10]!
        sub             x9,  x0,  x1, lsl #3
        ld1             {v16.8b}, [x9], x1 // p7
        ld1             {v24.8b}, [x0], x1 // q0
        ld1             {v17.8b}, [x9], x1 // p6
        ld1             {v25.8b}, [x0], x1 // q1
        ld1             {v18.8b}, [x9], x1 // p5
        ld1             {v26.8b}, [x0], x1 // q2
        ld1             {v19.8b}, [x9], x1 // p4
        ld1             {v27.8b}, [x0], x1 // q3
        ld1             {v20.8b}, [x9], x1 // p3
        ld1             {v28.8b}, [x0], x1 // q4
        ld1             {v21.8b}, [x9], x1 // p2
        ld1             {v29.8b}, [x0], x1 // q5
        ld1             {v22.8b}, [x9], x1 // p1
        ld1             {v30.8b}, [x0], x1 // q6
        ld1             {v23.8b}, [x9], x1 // p0
        ld1             {v31.8b}, [x0], x1 // q7
        sub             x9,  x9,  x1, lsl #3
        sub             x0,  x0,  x1, lsl #3
        add             x9,  x9,  x1

        loop_filter_16

        // If we did the flat8out part, we get the output in
        // v2-v17 (skipping v7 and v16). x9 points to x0 - 7 * stride,
        // store v2-v9 there, and v10-v17 into x0.
        st1             {v2.8b},  [x9], x1
        st1             {v10.8b}, [x0], x1
        st1             {v3.8b},  [x9], x1
        st1             {v11.8b}, [x0], x1
        st1             {v4.8b},  [x9], x1
        st1             {v12.8b}, [x0], x1
        st1             {v5.8b},  [x9], x1
        st1             {v13.8b}, [x0], x1
        st1             {v6.8b},  [x9], x1
        st1             {v14.8b}, [x0], x1
        st1             {v8.8b},  [x9], x1
        st1             {v15.8b}, [x0], x1
        st1             {v9.8b},  [x9], x1
        st1             {v17.8b}, [x0], x1
9:
        ldp             d8,  d9,  [sp], 0x10
        ldp             d10, d11, [sp], 0x10
        ldp             d12, d13, [sp], 0x10
        ldp             d14, d15, [sp], 0x10
        br              x10
8:
        add             x9,  x9,  x1, lsl #2
        // If we didn't do the flat8out part, the output is left in the
        // input registers.
        st1             {v21.8b}, [x9], x1
        st1             {v24.8b}, [x0], x1
        st1             {v22.8b}, [x9], x1
        st1             {v25.8b}, [x0], x1
        st1             {v23.8b}, [x9], x1
        st1             {v26.8b}, [x0], x1
        b               9b
7:
        sub             x9,  x0,  x1, lsl #1
        st1             {v22.8b}, [x9], x1
        st1             {v24.8b}, [x0], x1
        st1             {v23.8b}, [x9], x1
        st1             {v25.8b}, [x0], x1
        b               9b
endfunc

function ff_vp9_loop_filter_v_16_16_neon, export=1
        mov             x10, x30
        stp             d14, d15, [sp, #-0x10]!
        stp             d12, d13, [sp, #-0x10]!
        stp             d10, d11, [sp, #-0x10]!
        stp             d8,  d9,  [sp, #-0x10]!
        sub             x9,  x0,  x1, lsl #3
        ld1             {v16.16b}, [x9], x1 // p7
        ld1             {v24.16b}, [x0], x1 // q0
        ld1             {v17.16b}, [x9], x1 // p6
        ld1             {v25.16b}, [x0], x1 // q1
        ld1             {v18.16b}, [x9], x1 // p5
        ld1             {v26.16b}, [x0], x1 // q2
        ld1             {v19.16b}, [x9], x1 // p4
        ld1             {v27.16b}, [x0], x1 // q3
        ld1             {v20.16b}, [x9], x1 // p3
        ld1             {v28.16b}, [x0], x1 // q4
        ld1             {v21.16b}, [x9], x1 // p2
        ld1             {v29.16b}, [x0], x1 // q5
        ld1             {v22.16b}, [x9], x1 // p1
        ld1             {v30.16b}, [x0], x1 // q6
        ld1             {v23.16b}, [x9], x1 // p0
        ld1             {v31.16b}, [x0], x1 // q7
        sub             x9,  x9,  x1, lsl #3
        sub             x0,  x0,  x1, lsl #3
        add             x9,  x9,  x1

        loop_filter_16_16b

        st1             {v2.16b},  [x9], x1
        st1             {v10.16b}, [x0], x1
        st1             {v3.16b},  [x9], x1
        st1             {v11.16b}, [x0], x1
        st1             {v4.16b},  [x9], x1
        st1             {v12.16b}, [x0], x1
        st1             {v5.16b},  [x9], x1
        st1             {v13.16b}, [x0], x1
        st1             {v6.16b},  [x9], x1
        st1             {v14.16b}, [x0], x1
        st1             {v8.16b},  [x9], x1
        st1             {v15.16b}, [x0], x1
        st1             {v9.16b},  [x9], x1
        st1             {v17.16b}, [x0], x1
9:
        ldp             d8,  d9,  [sp], 0x10
        ldp             d10, d11, [sp], 0x10
        ldp             d12, d13, [sp], 0x10
        ldp             d14, d15, [sp], 0x10
        br              x10
8:
        add             x9,  x9,  x1, lsl #2
        st1             {v21.16b}, [x9], x1
        st1             {v24.16b}, [x0], x1
        st1             {v22.16b}, [x9], x1
        st1             {v25.16b}, [x0], x1
        st1             {v23.16b}, [x9], x1
        st1             {v26.16b}, [x0], x1
        b               9b
7:
        sub             x9,  x0,  x1, lsl #1
        st1             {v22.16b}, [x9], x1
        st1             {v24.16b}, [x0], x1
        st1             {v23.16b}, [x9], x1
        st1             {v25.16b}, [x0], x1
        b               9b
endfunc

function ff_vp9_loop_filter_h_16_8_neon, export=1
        mov             x10, x30
        stp             d14, d15, [sp, #-0x10]!
        stp             d12, d13, [sp, #-0x10]!
        stp             d10, d11, [sp, #-0x10]!
        stp             d8,  d9,  [sp, #-0x10]!
        sub             x9,  x0,  #8
        ld1             {v16.8b}, [x9], x1
        ld1             {v24.8b}, [x0], x1
        ld1             {v17.8b}, [x9], x1
        ld1             {v25.8b}, [x0], x1
        ld1             {v18.8b}, [x9], x1
        ld1             {v26.8b}, [x0], x1
        ld1             {v19.8b}, [x9], x1
        ld1             {v27.8b}, [x0], x1
        ld1             {v20.8b}, [x9], x1
        ld1             {v28.8b}, [x0], x1
        ld1             {v21.8b}, [x9], x1
        ld1             {v29.8b}, [x0], x1
        ld1             {v22.8b}, [x9], x1
        ld1             {v30.8b}, [x0], x1
        ld1             {v23.8b}, [x9], x1
        ld1             {v31.8b}, [x0], x1
        sub             x0,  x0,  x1, lsl #3
        sub             x9,  x9,  x1, lsl #3

        // The 16x8 pixels read above are in two 8x8 blocks; the left
        // half in v16-v23, and the right half in v24-v31. Do two 8x8 transposes
        // of this, to get one column per register.
        transpose_8x8B  v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
        transpose_8x8B  v24, v25, v26, v27, v28, v29, v30, v31, v0, v1

        loop_filter_16

        transpose_8x8B  v16, v2,  v3,  v4,  v5,  v6,  v8,  v9,  v0, v1
        transpose_8x8B  v10, v11, v12, v13, v14, v15, v17, v31, v0, v1

        st1             {v16.8b}, [x9], x1
        st1             {v10.8b}, [x0], x1
        st1             {v2.8b},  [x9], x1
        st1             {v11.8b}, [x0], x1
        st1             {v3.8b},  [x9], x1
        st1             {v12.8b}, [x0], x1
        st1             {v4.8b},  [x9], x1
        st1             {v13.8b}, [x0], x1
        st1             {v5.8b},  [x9], x1
        st1             {v14.8b}, [x0], x1
        st1             {v6.8b},  [x9], x1
        st1             {v15.8b}, [x0], x1
        st1             {v8.8b},  [x9], x1
        st1             {v17.8b}, [x0], x1
        st1             {v9.8b},  [x9], x1
        st1             {v31.8b}, [x0], x1
9:
        ldp             d8,  d9,  [sp], 0x10
        ldp             d10, d11, [sp], 0x10
        ldp             d12, d13, [sp], 0x10
        ldp             d14, d15, [sp], 0x10
        br              x10
8:
        // The same writeback as in loop_filter_h_8_8
        sub             x9,  x0,  #4
        add             x0,  x9,  x1, lsl #2
        transpose_8x8B  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        st1             {v20.8b}, [x9], x1
        st1             {v24.8b}, [x0], x1
        st1             {v21.8b}, [x9], x1
        st1             {v25.8b}, [x0], x1
        st1             {v22.8b}, [x9], x1
        st1             {v26.8b}, [x0], x1
        st1             {v23.8b}, [x9], x1
        st1             {v27.8b}, [x0], x1
        b               9b
7:
        // The same writeback as in loop_filter_h_4_8
        sub             x9,  x0,  #2
        add             x0,  x9,  x1, lsl #2
        transpose_4x8B  v22, v23, v24, v25, v26, v27, v28, v29
        st1             {v22.s}[0], [x9], x1
        st1             {v22.s}[1], [x0], x1
        st1             {v23.s}[0], [x9], x1
        st1             {v23.s}[1], [x0], x1
        st1             {v24.s}[0], [x9], x1
        st1             {v24.s}[1], [x0], x1
        st1             {v25.s}[0], [x9], x1
        st1             {v25.s}[1], [x0], x1
        b               9b
endfunc

function ff_vp9_loop_filter_h_16_16_neon, export=1
        mov             x10, x30
        stp             d14, d15, [sp, #-0x10]!
        stp             d12, d13, [sp, #-0x10]!
        stp             d10, d11, [sp, #-0x10]!
        stp             d8,  d9,  [sp, #-0x10]!
        sub             x9,  x0,  #8
        ld1             {v16.8b}, [x9], x1
        ld1             {v24.8b}, [x0], x1
        ld1             {v17.8b}, [x9], x1
        ld1             {v25.8b}, [x0], x1
        ld1             {v18.8b}, [x9], x1
        ld1             {v26.8b}, [x0], x1
        ld1             {v19.8b}, [x9], x1
        ld1             {v27.8b}, [x0], x1
        ld1             {v20.8b}, [x9], x1
        ld1             {v28.8b}, [x0], x1
        ld1             {v21.8b}, [x9], x1
        ld1             {v29.8b}, [x0], x1
        ld1             {v22.8b}, [x9], x1
        ld1             {v30.8b}, [x0], x1
        ld1             {v23.8b}, [x9], x1
        ld1             {v31.8b}, [x0], x1
        ld1             {v16.d}[1], [x9], x1
        ld1             {v24.d}[1], [x0], x1
        ld1             {v17.d}[1], [x9], x1
        ld1             {v25.d}[1], [x0], x1
        ld1             {v18.d}[1], [x9], x1
        ld1             {v26.d}[1], [x0], x1
        ld1             {v19.d}[1], [x9], x1
        ld1             {v27.d}[1], [x0], x1
        ld1             {v20.d}[1], [x9], x1
        ld1             {v28.d}[1], [x0], x1
        ld1             {v21.d}[1], [x9], x1
        ld1             {v29.d}[1], [x0], x1
        ld1             {v22.d}[1], [x9], x1
        ld1             {v30.d}[1], [x0], x1
        ld1             {v23.d}[1], [x9], x1
        ld1             {v31.d}[1], [x0], x1
        sub             x0,  x0,  x1, lsl #4
        sub             x9,  x9,  x1, lsl #4

        transpose_8x16B v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
        transpose_8x16B v24, v25, v26, v27, v28, v29, v30, v31, v0, v1

        loop_filter_16_16b

        transpose_8x16B v16, v2,  v3,  v4,  v5,  v6,  v8,  v9,  v0, v1
        transpose_8x16B v10, v11, v12, v13, v14, v15, v17, v31, v0, v1

        st1             {v16.8b},   [x9], x1
        st1             {v10.8b},   [x0], x1
        st1             {v2.8b},    [x9], x1
        st1             {v11.8b},   [x0], x1
        st1             {v3.8b},    [x9], x1
        st1             {v12.8b},   [x0], x1
        st1             {v4.8b},    [x9], x1
        st1             {v13.8b},   [x0], x1
        st1             {v5.8b},    [x9], x1
        st1             {v14.8b},   [x0], x1
        st1             {v6.8b},    [x9], x1
        st1             {v15.8b},   [x0], x1
        st1             {v8.8b},    [x9], x1
        st1             {v17.8b},   [x0], x1
        st1             {v9.8b},    [x9], x1
        st1             {v31.8b},   [x0], x1
        st1             {v16.d}[1], [x9], x1
        st1             {v10.d}[1], [x0], x1
        st1             {v2.d}[1],  [x9], x1
        st1             {v11.d}[1], [x0], x1
        st1             {v3.d}[1],  [x9], x1
        st1             {v12.d}[1], [x0], x1
        st1             {v4.d}[1],  [x9], x1
        st1             {v13.d}[1], [x0], x1
        st1             {v5.d}[1],  [x9], x1
        st1             {v14.d}[1], [x0], x1
        st1             {v6.d}[1],  [x9], x1
        st1             {v15.d}[1], [x0], x1
        st1             {v8.d}[1],  [x9], x1
        st1             {v17.d}[1], [x0], x1
        st1             {v9.d}[1],  [x9], x1
        st1             {v31.d}[1], [x0], x1
9:
        ldp             d8,  d9,  [sp], 0x10
        ldp             d10, d11, [sp], 0x10
        ldp             d12, d13, [sp], 0x10
        ldp             d14, d15, [sp], 0x10
        br              x10
8:
        sub             x9,  x0,  #4
        add             x0,  x9,  x1, lsl #3
        transpose_8x16B v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        st1             {v20.8b},   [x9], x1
        st1             {v20.d}[1], [x0], x1
        st1             {v21.8b},   [x9], x1
        st1             {v21.d}[1], [x0], x1
        st1             {v22.8b},   [x9], x1
        st1             {v22.d}[1], [x0], x1
        st1             {v23.8b},   [x9], x1
        st1             {v23.d}[1], [x0], x1
        st1             {v24.8b},   [x9], x1
        st1             {v24.d}[1], [x0], x1
        st1             {v25.8b},   [x9], x1
        st1             {v25.d}[1], [x0], x1
        st1             {v26.8b},   [x9], x1
        st1             {v26.d}[1], [x0], x1
        st1             {v27.8b},   [x9], x1
        st1             {v27.d}[1], [x0], x1
        b               9b
7:
        sub             x9,  x0,  #2
        add             x0,  x9,  x1, lsl #3
        transpose_4x16B v22, v23, v24, v25, v26, v27, v28, v29
        st1             {v22.s}[0], [x9], x1
        st1             {v22.s}[2], [x0], x1
        st1             {v23.s}[0], [x9], x1
        st1             {v23.s}[2], [x0], x1
        st1             {v24.s}[0], [x9], x1
        st1             {v24.s}[2], [x0], x1
        st1             {v25.s}[0], [x9], x1
        st1             {v25.s}[2], [x0], x1
        st1             {v22.s}[1], [x9], x1
        st1             {v22.s}[3], [x0], x1
        st1             {v23.s}[1], [x9], x1
        st1             {v23.s}[3], [x0], x1
        st1             {v24.s}[1], [x9], x1
        st1             {v24.s}[3], [x0], x1
        st1             {v25.s}[1], [x9], x1
        st1             {v25.s}[3], [x0], x1
        b               9b
endfunc