/*
 * Copyright (c) 2016 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"
#include "neon.S"

@ Do an 8x8 transpose, using q registers for the subtransposes that don't
@ need to address the individual d registers.
@ r0,r1 == rq0, r2,r3 == rq1, etc
.macro transpose_q_8x8 rq0, rq1, rq2, rq3, r0, r1, r2, r3, r4, r5, r6, r7
        vtrn.32         \rq0, \rq2
        vtrn.32         \rq1, \rq3
        vtrn.16         \rq0, \rq1
        vtrn.16         \rq2, \rq3
        vtrn.8          \r0, \r1
        vtrn.8          \r2, \r3
        vtrn.8          \r4, \r5
        vtrn.8          \r6, \r7
.endm

@ Do a 4x4 transpose, using q registers for the subtransposes that don't
@ need to address the individual d registers.
@ r0,r1 == rq0, r2,r3 == rq1
.macro transpose_q_4x4 rq0, rq1, r0, r1, r2, r3
        vtrn.16         \rq0, \rq1
        vtrn.8          \r0, \r1
        vtrn.8          \r2, \r3
.endm

@ The input to and output from this macro is in the registers q8-q15,
@ and q0-q7 are used as scratch registers.
@ p3 = q8, p0 = q11, q0 = q12, q3 = q15
.macro loop_filter_q
        vdup.u8         d0, r2           @ E
        lsr             r2, r2, #8
        vdup.u8         d2, r3           @ I
        lsr             r3, r3, #8
        vdup.u8         d1, r2           @ E
        vdup.u8         d3, r3           @ I

        vabd.u8         q2, q8, q9       @ abs(p3 - p2)
        vabd.u8         q3, q9, q10      @ abs(p2 - p1)
        vabd.u8         q4, q10, q11     @ abs(p1 - p0)
        vabd.u8         q5, q12, q13     @ abs(q0 - q1)
        vabd.u8         q6, q13, q14     @ abs(q1 - q2)
        vabd.u8         q7, q14, q15     @ abs(q2 - q3)
        vmax.u8         q2, q2, q3
        vmax.u8         q3, q4, q5
        vmax.u8         q4, q6, q7
        vabd.u8         q5, q11, q12     @ abs(p0 - q0)
        vmax.u8         q2, q2, q3
        vqadd.u8        q5, q5, q5       @ abs(p0 - q0) * 2
        vabd.u8         q7, q10, q13     @ abs(p1 - q1)
        vmax.u8         q2, q2, q4       @ max(abs(p3 - p2), ..., abs(q2 - q3))
        vshr.u8         q7, q7, #1
        vcle.u8         q2, q2, q1       @ max(abs()) <= I
        vqadd.u8        q5, q5, q7       @ abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
        vcle.u8         q5, q5, q0
        vand            q2, q2, q5       @ fm

        vshrn.u16       d10, q2, #4
        vmov            r2, r3, d10
        orrs            r2, r2, r3
        @ If no pixels need filtering, just exit as soon as possible
        beq             9f

        @ Calculate the normal inner loop filter for 2 or 4 pixels
        ldr             r3, [sp, #64]
        vabd.u8         q3, q10, q11     @ abs(p1 - p0)
        vabd.u8         q4, q13, q12     @ abs(q1 - q0)

        vsubl.u8        q5, d20, d26     @ p1 - q1
        vsubl.u8        q6, d21, d27     @ p1 - q1
        vmax.u8         q3, q3, q4       @ max(abs(p1 - p0), abs(q1 - q0))
        vqmovn.s16      d10, q5          @ av_clip_int8p(p1 - q1)
        vqmovn.s16      d11, q6          @ av_clip_int8p(p1 - q1)
        vdup.u8         d8, r3           @ H
        lsr             r3, r3, #8
        vdup.u8         d9, r3           @ H
        vsubl.u8        q6, d24, d22     @ q0 - p0
        vsubl.u8        q7, d25, d23     @ q0 - p0
        vcle.u8         q3, q3, q4       @ !hev
        vmov.s16        q0, #3
        vand            q3, q3, q2       @ !hev && fm && !flat8in

        vmul.s16        q6, q6, q0       @ 3 * (q0 - p0)
        vmul.s16        q7, q7, q0       @ 3 * (q0 - p0)
        vbic            q5, q5, q3       @ if (!hev) av_clip_int8 = 0
        vaddw.s8        q6, q6, d10      @ 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)]
        vaddw.s8        q7, q7, d11      @ 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)]
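        @ Apply the inner filter adjustments: f1 = (f + 4) >> 3 is subtracted
        @ from q0 and f2 = (f + 3) >> 3 is added to p0, while on the pixels
        @ where hev isn't set, (f1 + 1) >> 1 is added to p1 and subtracted
        @ from q1.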
        vmov.s8         q5, #4
        vqmovn.s16      d12, q6
        vqmovn.s16      d13, q7          @ av_clip_int8(3 * (q0 - p0) [+ av_clip_int8(p1 - q1)], BIT_DEPTH - 1) = f
        vmov.s8         q0, #3

        vqadd.s8        q5, q6, q5       @ FFMIN(f + 4, 127)
        vqadd.s8        q0, q6, q0       @ FFMIN(f + 3, 127)
        vmovl.u8        q6, d22          @ p0
        vmovl.u8        q7, d23          @ p0
        vshr.s8         q5, q5, #3       @ f1
        vshr.s8         q0, q0, #3       @ f2

        vaddw.s8        q6, q6, d0       @ p0 + f2
        vaddw.s8        q7, q7, d1       @ p0 + f2
        vqmovun.s16     d0, q6           @ out p0
        vmovl.u8        q6, d24          @ q0
        vqmovun.s16     d1, q7           @ out p0
        vmovl.u8        q7, d25          @ q0
        vsubw.s8        q6, q6, d10      @ q0 - f1
        vsubw.s8        q7, q7, d11      @ q0 - f1
        vqmovun.s16     d12, q6          @ out q0
        vqmovun.s16     d13, q7          @ out q0
        vrshr.s8        q5, q5, #1       @ f = (f1 + 1) >> 1
        vbit            q11, q0, q2      @ if (fm && !flat8in)
        vbit            q12, q6, q2

        vmovl.u8        q0, d20          @ p1
        vmovl.u8        q2, d21          @ p1
        vmovl.u8        q6, d26          @ q1
        vmovl.u8        q7, d27          @ q1
        vaddw.s8        q0, q0, d10      @ p1 + f
        vaddw.s8        q2, q2, d11      @ p1 + f
        vsubw.s8        q6, q6, d10      @ q1 - f
        vsubw.s8        q7, q7, d11      @ q1 - f
        vqmovun.s16     d0, q0           @ out p1
        vqmovun.s16     d1, q2           @ out p1
        vqmovun.s16     d12, q6          @ out q1
        vqmovun.s16     d13, q7          @ out q1
        vbit            q10, q0, q3      @ if (!hev && fm && !flat8in)
        vbit            q13, q6, q3
.endm

@ The input to and output from this macro is in the registers d16-d31,
@ and d0-d7 are used as scratch registers.
@ p7 = d16 .. p3 = d20, p0 = d23, q0 = d24, q3 = d27, q7 = d31
@ Depending on the width of the loop filter, we either use d16-d19
@ and d28-d31 as temp registers, or d8-d15.
@ tmp1,tmp2 = tmpq1, tmp3,tmp4 = tmpq2, tmp5,tmp6 = tmpq3, tmp7,tmp8 = tmpq4
.macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmpq1, tmpq2, tmpq3, tmpq4
        vdup.u8         d0, r2           @ E
        vdup.u8         d2, r3           @ I
        ldr             r3, [sp]

        vabd.u8         d4, d20, d21     @ abs(p3 - p2)
        vabd.u8         d5, d21, d22     @ abs(p2 - p1)
        vabd.u8         d6, d22, d23     @ abs(p1 - p0)
        vabd.u8         d7, d24, d25     @ abs(q0 - q1)
        vabd.u8         \tmp1, d25, d26  @ abs(q1 - q2)
        vabd.u8         \tmp2, d26, d27  @ abs(q2 - q3)
        vmax.u8         d4, d4, d5
        vmax.u8         d5, d6, d7
        vmax.u8         \tmp1, \tmp1, \tmp2
        vabd.u8         d6, d23, d24     @ abs(p0 - q0)
        vmax.u8         d4, d4, d5
        vqadd.u8        d6, d6, d6       @ abs(p0 - q0) * 2
        vabd.u8         d5, d22, d25     @ abs(p1 - q1)
        vmax.u8         d4, d4, \tmp1    @ max(abs(p3 - p2), ..., abs(q2 - q3))
        vshr.u8         d5, d5, #1
        vcle.u8         d4, d4, d2       @ max(abs()) <= I
        vqadd.u8        d6, d6, d5       @ abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
        vcle.u8         d5, d6, d0
        vand            d4, d4, d5       @ fm

        vdup.u8         d3, r3           @ H
        vmov            r2, r3, d4
        orrs            r2, r2, r3
        @ If no pixels need filtering, just exit as soon as possible
        beq             9f

.if \wd >= 8
        vmov.u8         d0, #1

        vabd.u8         d6, d20, d23     @ abs(p3 - p0)
        vabd.u8         d2, d21, d23     @ abs(p2 - p0)
        vabd.u8         d1, d22, d23     @ abs(p1 - p0)
        vabd.u8         \tmp1, d25, d24  @ abs(q1 - q0)
        vabd.u8         \tmp2, d26, d24  @ abs(q2 - q0)
        vabd.u8         \tmp3, d27, d24  @ abs(q3 - q0)
        vmax.u8         d6, d6, d2
        vmax.u8         d1, d1, \tmp1
        vmax.u8         \tmp2, \tmp2, \tmp3
.if \wd == 16
        vabd.u8         d7, d16, d23     @ abs(p7 - p0)
        vmax.u8         d6, d6, d1
        vabd.u8         d2, d17, d23     @ abs(p6 - p0)
        vmax.u8         d6, d6, \tmp2
        vabd.u8         d1, d18, d23     @ abs(p5 - p0)
        vcle.u8         d6, d6, d0       @ flat8in
        vabd.u8         d8, d19, d23     @ abs(p4 - p0)
        vand            d6, d6, d4       @ flat8in && fm
        vabd.u8         d9, d28, d24     @ abs(q4 - q0)
        vbic            d4, d4, d6       @ fm && !flat8in
        vabd.u8         d10, d29, d24    @ abs(q5 - q0)
        vabd.u8         d11, d30, d24    @ abs(q6 - q0)
        vabd.u8         d12, d31, d24    @ abs(q7 - q0)

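        @ Fold the abs(p7..p4 - p0) and abs(q4..q7 - q0) terms pairwise; the
        @ remaining maxima are folded in further below, forming the flat8out
        @ test (all of them <= 1).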
        vmax.u8         d7, d7, d2
        vmax.u8         d1, d1, d8
        vmax.u8         d9, d9, d10
        vmax.u8         d11, d11, d12
        @ The rest of the calculation of flat8out is interleaved below
.else
        @ The rest of the calculation of flat8in is interleaved below
.endif
.endif

        @ Calculate the normal inner loop filter for 2 or 4 pixels
        vabd.u8         d5, d22, d23     @ abs(p1 - p0)
.if \wd == 16
        vmax.u8         d7, d7, d1
        vmax.u8         d9, d9, d11
.elseif \wd == 8
        vmax.u8         d6, d6, d1
.endif
        vabd.u8         d1, d25, d24     @ abs(q1 - q0)
.if \wd == 16
        vmax.u8         d7, d7, d9
.elseif \wd == 8
        vmax.u8         d6, d6, \tmp2
.endif
        vsubl.u8        \tmpq1, d22, d25 @ p1 - q1
        vmax.u8         d5, d5, d1       @ max(abs(p1 - p0), abs(q1 - q0))
        vsubl.u8        \tmpq2, d24, d23 @ q0 - p0
        vmov.s16        \tmpq3, #3
.if \wd == 8
        vcle.u8         d6, d6, d0       @ flat8in
.endif
        vcle.u8         d5, d5, d3       @ !hev
.if \wd == 8
        vand            d6, d6, d4       @ flat8in && fm
.endif
        vqmovn.s16      \tmp1, \tmpq1    @ av_clip_int8(p1 - q1)
.if \wd == 16
        vcle.u8         d7, d7, d0       @ flat8out
.elseif \wd == 8
        vbic            d4, d4, d6       @ fm && !flat8in
.endif
        vand            d5, d5, d4       @ !hev && fm && !flat8in
.if \wd == 16
        vand            d7, d7, d6       @ flat8out && flat8in && fm
.endif

        vmul.s16        \tmpq2, \tmpq2, \tmpq3 @ 3 * (q0 - p0)
        vbic            \tmp1, \tmp1, d5 @ if (!hev) av_clip_int8 = 0
        vmov.s8         d2, #4
        vaddw.s8        \tmpq2, \tmpq2, \tmp1 @ 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)]
        vmov.s8         d3, #3
        vqmovn.s16      \tmp1, \tmpq2    @ f
.if \wd == 16
        vbic            d6, d6, d7       @ fm && flat8in && !flat8out
.endif

        vqadd.s8        \tmp3, \tmp1, d2 @ FFMIN(f + 4, 127)
        vqadd.s8        \tmp4, \tmp1, d3 @ FFMIN(f + 3, 127)
        vmovl.u8        q0, d23          @ p0
        vshr.s8         \tmp3, \tmp3, #3 @ f1
        vshr.s8         \tmp4, \tmp4, #3 @ f2

        vmovl.u8        q1, d24          @ q0
        vaddw.s8        q0, q0, \tmp4    @ p0 + f2
        vsubw.s8        q1, q1, \tmp3    @ q0 - f1
        vqmovun.s16     d0, q0           @ out p0
        vqmovun.s16     d1, q1           @ out q0
        vrshr.s8        \tmp3, \tmp3, #1 @ f = (f1 + 1) >> 1
        vbit            d23, d0, d4      @ if (fm && !flat8in)
        vbit            d24, d1, d4

        vmovl.u8        q0, d22          @ p1
        vmovl.u8        q1, d25          @ q1
.if \wd >= 8
        vmov            r2, r3, d6
.endif
        vaddw.s8        q0, q0, \tmp3    @ p1 + f
        vsubw.s8        q1, q1, \tmp3    @ q1 - f
.if \wd >= 8
        orrs            r2, r2, r3
.endif
        vqmovun.s16     d0, q0           @ out p1
        vqmovun.s16     d2, q1           @ out q1
        vbit            d22, d0, d5      @ if (!hev && fm && !flat8in)
        vbit            d25, d2, d5

.if \wd >= 8
        @ If no pixels need flat8in, jump to flat8out
        @ (or to a writeout of the inner 4 pixels, for wd=8)
        beq             6f

        @ flat8in
        vaddl.u8        \tmpq1, d20, d21
        vaddl.u8        \tmpq2, d22, d25
        vaddl.u8        \tmpq3, d20, d22
        vaddl.u8        \tmpq4, d23, d26
        vadd.u16        q0, \tmpq1, \tmpq1
        vaddw.u8        q0, q0, d23
        vaddw.u8        q0, q0, d24
        vadd.u16        q0, q0, \tmpq3
        vsub.s16        \tmpq2, \tmpq2, \tmpq1
        vsub.s16        \tmpq4, \tmpq4, \tmpq3
        vrshrn.u16      d2, q0, #3       @ out p2

        vadd.u16        q0, q0, \tmpq2
        vaddl.u8        \tmpq1, d20, d23
        vaddl.u8        \tmpq2, d24, d27
        vrshrn.u16      d3, q0, #3       @ out p1

        vadd.u16        q0, q0, \tmpq4
        vsub.s16        \tmpq2, \tmpq2, \tmpq1
        vaddl.u8        \tmpq3, d21, d24
        vaddl.u8        \tmpq4, d25, d27
        vrshrn.u16      d4, q0, #3       @ out p0

        vadd.u16        q0, q0, \tmpq2
        vsub.s16        \tmpq4, \tmpq4, \tmpq3
        vaddl.u8        \tmpq1, d22, d25
        vaddl.u8        \tmpq2, d26, d27
        vrshrn.u16      d5, q0, #3       @ out q0

        vadd.u16        q0, q0, \tmpq4
        vsub.s16        \tmpq2, \tmpq2, \tmpq1
        vrshrn.u16      \tmp5, q0, #3    @ out q1

        vadd.u16        q0, q0, \tmpq2
        @ The output here is written back into the input registers. This doesn't
        @ matter for the flat8out part below, since we only update those pixels
        @ which won't be touched below.
        vbit            d21, d2, d6
        vbit            d22, d3, d6
        vbit            d23, d4, d6
        vrshrn.u16      \tmp6, q0, #3    @ out q2
        vbit            d24, d5, d6
        vbit            d25, \tmp5, d6
        vbit            d26, \tmp6, d6
.endif
.if \wd == 16
6:
        vorr            d2, d6, d7
        vmov            r2, r3, d2
        orrs            r2, r2, r3
        @ If no pixels need flat8in or flat8out, jump to a
        @ writeout of the inner 4 pixels
        beq             7f
        vmov            r2, r3, d7
        orrs            r2, r2, r3
        @ If no pixels need flat8out, jump to a writeout of the inner 6 pixels
        beq             8f

        @ flat8out
        @ This writes all outputs into d2-d17 (skipping d7 and d16).
        @ If this part is skipped, the output is read from d21-d26 (which is the input
        @ to this section).
        vshll.u8        q0, d16, #3      @ 8 * d16
        vsubw.u8        q0, q0, d16      @ 7 * d16
        vaddw.u8        q0, q0, d17
        vaddl.u8        q4, d17, d18
        vaddl.u8        q5, d19, d20
        vadd.s16        q0, q0, q4
        vaddl.u8        q4, d16, d17
        vaddl.u8        q6, d21, d22
        vadd.s16        q0, q0, q5
        vaddl.u8        q5, d18, d25
        vaddl.u8        q7, d23, d24
        vsub.s16        q5, q5, q4
        vadd.s16        q0, q0, q6
        vadd.s16        q0, q0, q7
        vaddl.u8        q6, d16, d18
        vaddl.u8        q7, d19, d26
        vrshrn.u16      d2, q0, #4

        vadd.s16        q0, q0, q5
        vaddl.u8        q4, d16, d19
        vaddl.u8        q5, d20, d27
        vsub.s16        q7, q7, q6
        vbif            d2, d17, d7
        vrshrn.u16      d3, q0, #4

        vadd.s16        q0, q0, q7
        vaddl.u8        q6, d16, d20
        vaddl.u8        q7, d21, d28
        vsub.s16        q5, q5, q4
        vbif            d3, d18, d7
        vrshrn.u16      d4, q0, #4

        vadd.s16        q0, q0, q5
        vaddl.u8        q4, d16, d21
        vaddl.u8        q5, d22, d29
        vsub.s16        q7, q7, q6
        vbif            d4, d19, d7
        vrshrn.u16      d5, q0, #4

        vadd.s16        q0, q0, q7
        vaddl.u8        q6, d16, d22
        vaddl.u8        q7, d23, d30
        vsub.s16        q5, q5, q4
        vbif            d5, d20, d7
        vrshrn.u16      d6, q0, #4

        vadd.s16        q0, q0, q5
        vaddl.u8        q5, d16, d23
        vsub.s16        q7, q7, q6
        vaddl.u8        q6, d24, d31
        vbif            d6, d21, d7
        vrshrn.u16      d8, q0, #4

        vadd.s16        q0, q0, q7
        vsub.s16        q5, q6, q5
        vaddl.u8        q6, d17, d24
        vaddl.u8        q7, d25, d31
        vbif            d8, d22, d7
        vrshrn.u16      d9, q0, #4

        vadd.s16        q0, q0, q5
        vsub.s16        q7, q7, q6
        vaddl.u8        q6, d26, d31
        vbif            d9, d23, d7
        vrshrn.u16      d10, q0, #4

        vadd.s16        q0, q0, q7
        vaddl.u8        q7, d18, d25
        vaddl.u8        q9, d19, d26
        vsub.s16        q6, q6, q7
        vaddl.u8        q7, d27, d31
        vbif            d10, d24, d7
        vrshrn.u16      d11, q0, #4

        vadd.s16        q0, q0, q6
        vaddl.u8        q6, d20, d27
        vsub.s16        q7, q7, q9
        vaddl.u8        q9, d28, d31
        vbif            d11, d25, d7
        vsub.s16        q9, q9, q6
        vrshrn.u16      d12, q0, #4

        vadd.s16        q0, q0, q7
        vaddl.u8        q7, d21, d28
        vaddl.u8        q10, d29, d31
        vbif            d12, d26, d7
        vrshrn.u16      d13, q0, #4

        vadd.s16        q0, q0, q9
        vsub.s16        q10, q10, q7
        vaddl.u8        q9, d22, d29
        vaddl.u8        q11, d30, d31
        vbif            d13, d27, d7
        vrshrn.u16      d14, q0, #4

        vadd.s16        q0, q0, q10
        vsub.s16        q11, q11, q9
        vbif            d14, d28, d7
        vrshrn.u16      d15, q0, #4

        vadd.s16        q0, q0, q11
        vbif            d15, d29, d7
        vrshrn.u16      d17, q0, #4
        vbif            d17, d30, d7
.endif
.endm

@ For wd <= 8, we use d16-d19 and d28-d31 for temp registers,
@ while we need those for inputs/outputs in wd=16 and use d8-d15
@ for temp registers there instead.
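@ Note that the loop_filter macro branches to local labels that the calling
@ function must provide: 9f when no pixels need filtering at all, 6f (for
@ wd == 8) when the flat8in part wasn't needed, and, for wd == 16, 7f/8f for
@ a writeout of only the inner 4 or 6 pixels respectively.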
.macro loop_filter_4
        loop_filter     4,  d16, d17, d18, d19, d28, d29, d30, d31, q8, q9, q14, q15
.endm

.macro loop_filter_8
        loop_filter     8,  d16, d17, d18, d19, d28, d29, d30, d31, q8, q9, q14, q15
.endm

.macro loop_filter_16
        loop_filter     16, d8, d9, d10, d11, d12, d13, d14, d15, q4, q5, q6, q7
.endm


@ The public functions in this file have the following signature:
@ void loop_filter(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr);

function ff_vp9_loop_filter_v_4_8_neon, export=1
        sub             r12, r0, r1, lsl #2
        vld1.8          {d20}, [r12,:64], r1 @ p3
        vld1.8          {d24}, [r0, :64], r1 @ q0
        vld1.8          {d21}, [r12,:64], r1 @ p2
        vld1.8          {d25}, [r0, :64], r1 @ q1
        vld1.8          {d22}, [r12,:64], r1 @ p1
        vld1.8          {d26}, [r0, :64], r1 @ q2
        vld1.8          {d23}, [r12,:64], r1 @ p0
        vld1.8          {d27}, [r0, :64], r1 @ q3
        sub             r0, r0, r1, lsl #2
        sub             r12, r12, r1, lsl #1

        loop_filter_4

        vst1.8          {d22}, [r12,:64], r1
        vst1.8          {d24}, [r0, :64], r1
        vst1.8          {d23}, [r12,:64], r1
        vst1.8          {d25}, [r0, :64], r1
9:
        bx              lr
endfunc

function ff_vp9_loop_filter_h_4_8_neon, export=1
        sub             r12, r0, #4
        add             r0, r12, r1, lsl #2
        vld1.8          {d20}, [r12], r1
        vld1.8          {d24}, [r0], r1
        vld1.8          {d21}, [r12], r1
        vld1.8          {d25}, [r0], r1
        vld1.8          {d22}, [r12], r1
        vld1.8          {d26}, [r0], r1
        vld1.8          {d23}, [r12], r1
        vld1.8          {d27}, [r0], r1

        sub             r12, r12, r1, lsl #2
        sub             r0, r0, r1, lsl #2
        @ Move r0/r12 forward by 2 pixels; we don't need to rewrite the
        @ outermost 2 pixels since they aren't changed.
        add             r12, r12, #2
        add             r0, r0, #2

        @ Transpose the 8x8 pixels, taking advantage of q registers, to get
        @ one register per column.
        transpose_q_8x8 q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27

        loop_filter_4

        @ We will only write the mid 4 pixels back; after the loop filter,
        @ these are in d22, d23, d24, d25 (q11, q12), ordered as rows
        @ (8x4 pixels). We need to transpose them to columns, done with a
        @ 4x4 transpose (which in practice is two 4x4 transposes of the two
        @ 4x4 halves of the 8x4 pixels; into 4x8 pixels).
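        @ After that transpose, the low 32 bits of each register hold one row
        @ of the top 4x4 block and the high 32 bits the corresponding row of
        @ the bottom block, so the stores below interleave r12 (rows 0-3)
        @ and r0 (rows 4-7).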
        transpose_q_4x4 q11, q12, d22, d23, d24, d25

        vst1.32         {d22[0]}, [r12], r1
        vst1.32         {d22[1]}, [r0], r1
        vst1.32         {d23[0]}, [r12], r1
        vst1.32         {d23[1]}, [r0], r1
        vst1.32         {d24[0]}, [r12], r1
        vst1.32         {d24[1]}, [r0], r1
        vst1.32         {d25[0]}, [r12], r1
        vst1.32         {d25[1]}, [r0], r1
9:
        bx              lr
endfunc

function ff_vp9_loop_filter_v_44_16_neon, export=1
        vpush           {q4-q7}
        sub             r12, r0, r1, lsl #2
        vld1.8          {q8},  [r12,:128], r1 @ p3
        vld1.8          {q12}, [r0, :128], r1 @ q0
        vld1.8          {q9},  [r12,:128], r1 @ p2
        vld1.8          {q13}, [r0, :128], r1 @ q1
        vld1.8          {q10}, [r12,:128], r1 @ p1
        vld1.8          {q14}, [r0, :128], r1 @ q2
        vld1.8          {q11}, [r12,:128], r1 @ p0
        vld1.8          {q15}, [r0, :128], r1 @ q3
        sub             r0, r0, r1, lsl #2
        sub             r12, r12, r1, lsl #1

        loop_filter_q

        vst1.8          {q10}, [r12,:128], r1
        vst1.8          {q12}, [r0, :128], r1
        vst1.8          {q11}, [r12,:128], r1
        vst1.8          {q13}, [r0, :128], r1
9:
        vpop            {q4-q7}
        bx              lr
endfunc

function ff_vp9_loop_filter_h_44_16_neon, export=1
        vpush           {q4-q7}
        sub             r12, r0, #4
        add             r0, r12, r1, lsl #2
        vld1.8          {d16}, [r12], r1
        vld1.8          {d24}, [r0], r1
        vld1.8          {d18}, [r12], r1
        vld1.8          {d26}, [r0], r1
        vld1.8          {d20}, [r12], r1
        vld1.8          {d28}, [r0], r1
        vld1.8          {d22}, [r12], r1
        vld1.8          {d30}, [r0], r1
        mov             r12, r0
        add             r0, r0, r1, lsl #2
        vld1.8          {d17}, [r12], r1
        vld1.8          {d25}, [r0], r1
        vld1.8          {d19}, [r12], r1
        vld1.8          {d27}, [r0], r1
        vld1.8          {d21}, [r12], r1
        vld1.8          {d29}, [r0], r1
        vld1.8          {d23}, [r12], r1
        vld1.8          {d31}, [r0], r1

        @ Transpose the 16x8 pixels, as two 8x8 parts
        transpose_8x8   q8, q9, q10, q11, q12, q13, q14, q15

        loop_filter_q

        sub             r12, r0, r1, lsl #4
        add             r0, r12, r1, lsl #3
        @ Move r0/r12 forward by 2 pixels; we don't need to rewrite the
        @ outermost 2 pixels since they aren't changed.
        add             r12, r12, #2
        add             r0, r0, #2

        @ We will only write the mid 4 pixels back; after the loop filter,
        @ these are in q10, q11, q12, q13, ordered as rows (16x4 pixels).
        @ We need to transpose them to columns, done with a 4x4 transpose
        @ (which in practice is four 4x4 transposes of the 4x4 blocks of
        @ the 16x4 pixels; into 4x16 pixels).
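        @ After that transpose, each of d20-d27 holds one output row (4
        @ pixels) per 32-bit half; the stores below interleave r12 (the
        @ first 8 rows) and r0 (the last 8 rows).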
        transpose_4x4   q10, q11, q12, q13

        vst1.32         {d20[0]}, [r12], r1
        vst1.32         {d21[0]}, [r0], r1
        vst1.32         {d22[0]}, [r12], r1
        vst1.32         {d23[0]}, [r0], r1
        vst1.32         {d24[0]}, [r12], r1
        vst1.32         {d25[0]}, [r0], r1
        vst1.32         {d26[0]}, [r12], r1
        vst1.32         {d27[0]}, [r0], r1
        vst1.32         {d20[1]}, [r12], r1
        vst1.32         {d21[1]}, [r0], r1
        vst1.32         {d22[1]}, [r12], r1
        vst1.32         {d23[1]}, [r0], r1
        vst1.32         {d24[1]}, [r12], r1
        vst1.32         {d25[1]}, [r0], r1
        vst1.32         {d26[1]}, [r12], r1
        vst1.32         {d27[1]}, [r0], r1
9:
        vpop            {q4-q7}
        bx              lr
endfunc

function ff_vp9_loop_filter_v_8_8_neon, export=1
        sub             r12, r0, r1, lsl #2
        vld1.8          {d20}, [r12,:64], r1 @ p3
        vld1.8          {d24}, [r0, :64], r1 @ q0
        vld1.8          {d21}, [r12,:64], r1 @ p2
        vld1.8          {d25}, [r0, :64], r1 @ q1
        vld1.8          {d22}, [r12,:64], r1 @ p1
        vld1.8          {d26}, [r0, :64], r1 @ q2
        vld1.8          {d23}, [r12,:64], r1 @ p0
        vld1.8          {d27}, [r0, :64], r1 @ q3
        sub             r12, r12, r1, lsl #2
        sub             r0, r0, r1, lsl #2
        add             r12, r12, r1

        loop_filter_8

        vst1.8          {d21}, [r12,:64], r1
        vst1.8          {d24}, [r0, :64], r1
        vst1.8          {d22}, [r12,:64], r1
        vst1.8          {d25}, [r0, :64], r1
        vst1.8          {d23}, [r12,:64], r1
        vst1.8          {d26}, [r0, :64], r1
9:
        bx              lr
6:
        sub             r12, r0, r1, lsl #1
        vst1.8          {d22}, [r12,:64], r1
        vst1.8          {d24}, [r0, :64], r1
        vst1.8          {d23}, [r12,:64], r1
        vst1.8          {d25}, [r0, :64], r1
        bx              lr
endfunc

function ff_vp9_loop_filter_h_8_8_neon, export=1
        sub             r12, r0, #4
        add             r0, r12, r1, lsl #2
        vld1.8          {d20}, [r12], r1
        vld1.8          {d24}, [r0], r1
        vld1.8          {d21}, [r12], r1
        vld1.8          {d25}, [r0], r1
        vld1.8          {d22}, [r12], r1
        vld1.8          {d26}, [r0], r1
        vld1.8          {d23}, [r12], r1
        vld1.8          {d27}, [r0], r1

        sub             r12, r12, r1, lsl #2
        sub             r0, r0, r1, lsl #2

        transpose_q_8x8 q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27

        loop_filter_8

        @ Even though only 6 pixels per row have been changed, we write the
        @ full 8 pixel registers.
        transpose_q_8x8 q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27

        vst1.8          {d20}, [r12], r1
        vst1.8          {d24}, [r0], r1
        vst1.8          {d21}, [r12], r1
        vst1.8          {d25}, [r0], r1
        vst1.8          {d22}, [r12], r1
        vst1.8          {d26}, [r0], r1
        vst1.8          {d23}, [r12], r1
        vst1.8          {d27}, [r0], r1
9:
        bx              lr
6:
        @ If we didn't need to do the flat8in part, we use the same writeback
        @ as in loop_filter_h_4_8.
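        @ Only p1/p0/q0/q1 can have been changed in that case, so only the
        @ middle 4 columns need to be stored back.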
        add             r12, r12, #2
        add             r0, r0, #2
        transpose_q_4x4 q11, q12, d22, d23, d24, d25
        vst1.32         {d22[0]}, [r12], r1
        vst1.32         {d22[1]}, [r0], r1
        vst1.32         {d23[0]}, [r12], r1
        vst1.32         {d23[1]}, [r0], r1
        vst1.32         {d24[0]}, [r12], r1
        vst1.32         {d24[1]}, [r0], r1
        vst1.32         {d25[0]}, [r12], r1
        vst1.32         {d25[1]}, [r0], r1
        bx              lr
endfunc

function vp9_loop_filter_v_16_neon
        sub             r12, r0, r1, lsl #3
        @ Read p7-p0 using r12 and q0-q7 using r0
        vld1.8          {d16}, [r12,:64], r1 @ p7
        vld1.8          {d24}, [r0, :64], r1 @ q0
        vld1.8          {d17}, [r12,:64], r1 @ p6
        vld1.8          {d25}, [r0, :64], r1 @ q1
        vld1.8          {d18}, [r12,:64], r1 @ p5
        vld1.8          {d26}, [r0, :64], r1 @ q2
        vld1.8          {d19}, [r12,:64], r1 @ p4
        vld1.8          {d27}, [r0, :64], r1 @ q3
        vld1.8          {d20}, [r12,:64], r1 @ p3
        vld1.8          {d28}, [r0, :64], r1 @ q4
        vld1.8          {d21}, [r12,:64], r1 @ p2
        vld1.8          {d29}, [r0, :64], r1 @ q5
        vld1.8          {d22}, [r12,:64], r1 @ p1
        vld1.8          {d30}, [r0, :64], r1 @ q6
        vld1.8          {d23}, [r12,:64], r1 @ p0
        vld1.8          {d31}, [r0, :64], r1 @ q7
        sub             r12, r12, r1, lsl #3
        sub             r0, r0, r1, lsl #3
        add             r12, r12, r1

        loop_filter_16

        @ If we did the flat8out part, we get the output in
        @ d2-d17 (skipping d7 and d16). r12 points to r0 - 7 * stride,
        @ store d2-d9 there, and d10-d17 into r0.
        vst1.8          {d2},  [r12,:64], r1
        vst1.8          {d10}, [r0, :64], r1
        vst1.8          {d3},  [r12,:64], r1
        vst1.8          {d11}, [r0, :64], r1
        vst1.8          {d4},  [r12,:64], r1
        vst1.8          {d12}, [r0, :64], r1
        vst1.8          {d5},  [r12,:64], r1
        vst1.8          {d13}, [r0, :64], r1
        vst1.8          {d6},  [r12,:64], r1
        vst1.8          {d14}, [r0, :64], r1
        vst1.8          {d8},  [r12,:64], r1
        vst1.8          {d15}, [r0, :64], r1
        vst1.8          {d9},  [r12,:64], r1
        vst1.8          {d17}, [r0, :64], r1
        sub             r0, r0, r1, lsl #3
        add             r0, r0, r1

9:
        bx              lr

8:
        add             r12, r12, r1, lsl #2
        @ If we didn't do the flat8out part, the output is left in the
        @ input registers.
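        @ Only p2-q2 (d21-d26) can have been changed in that case, so only
        @ those 6 rows need to be stored back.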
        vst1.8          {d21}, [r12,:64], r1
        vst1.8          {d24}, [r0, :64], r1
        vst1.8          {d22}, [r12,:64], r1
        vst1.8          {d25}, [r0, :64], r1
        vst1.8          {d23}, [r12,:64], r1
        vst1.8          {d26}, [r0, :64], r1
        sub             r0, r0, r1, lsl #1
        sub             r0, r0, r1
        bx              lr
7:
        sub             r12, r0, r1, lsl #1
        vst1.8          {d22}, [r12,:64], r1
        vst1.8          {d24}, [r0, :64], r1
        vst1.8          {d23}, [r12,:64], r1
        vst1.8          {d25}, [r0, :64], r1
        sub             r0, r0, r1, lsl #1
        bx              lr
endfunc

function ff_vp9_loop_filter_v_16_8_neon, export=1
        ldr             r12, [sp]
        push            {lr}
        vpush           {q4-q7}
        push            {r12}
        bl              vp9_loop_filter_v_16_neon
        add             sp, sp, #4
        vpop            {q4-q7}
        pop             {pc}
endfunc

function ff_vp9_loop_filter_v_16_16_neon, export=1
        ldr             r12, [sp]
        // The filter clobbers r2 and r3, but we need to keep them for the second round
        push            {r2, r3, lr}
        vpush           {q4-q7}
        push            {r12}
        bl              vp9_loop_filter_v_16_neon
        add             r0, #8
        ldr             r2, [sp, #68]
        ldr             r3, [sp, #72]
        bl              vp9_loop_filter_v_16_neon
        add             sp, sp, #4
        vpop            {q4-q7}
        pop             {r2, r3, pc}
endfunc

function vp9_loop_filter_h_16_neon
        sub             r12, r0, #8
        vld1.8          {d16}, [r12,:64], r1
        vld1.8          {d24}, [r0, :64], r1
        vld1.8          {d17}, [r12,:64], r1
        vld1.8          {d25}, [r0, :64], r1
        vld1.8          {d18}, [r12,:64], r1
        vld1.8          {d26}, [r0, :64], r1
        vld1.8          {d19}, [r12,:64], r1
        vld1.8          {d27}, [r0, :64], r1
        vld1.8          {d20}, [r12,:64], r1
        vld1.8          {d28}, [r0, :64], r1
        vld1.8          {d21}, [r12,:64], r1
        vld1.8          {d29}, [r0, :64], r1
        vld1.8          {d22}, [r12,:64], r1
        vld1.8          {d30}, [r0, :64], r1
        vld1.8          {d23}, [r12,:64], r1
        vld1.8          {d31}, [r0, :64], r1
        sub             r0, r0, r1, lsl #3
        sub             r12, r12, r1, lsl #3

        @ The 16x8 pixels read above are in two 8x8 blocks; the left
        @ half in d16-d23, and the right half in d24-d31. Do two 8x8 transposes
        @ of this, to get one column per register. This could be done with two
        @ transpose_8x8 as below, but this takes advantage of the q registers.
        transpose16_4x4 q8, q9, q10, q11, q12, q13, q14, q15
        vtrn.8          d16, d17
        vtrn.8          d18, d19
        vtrn.8          d20, d21
        vtrn.8          d22, d23
        vtrn.8          d24, d25
        vtrn.8          d26, d27
        vtrn.8          d28, d29
        vtrn.8          d30, d31

        loop_filter_16

        @ Transpose back; this is the same transpose as above, but
        @ we can't take advantage of q registers for the transpose, since
        @ not all d registers in the transpose are consecutive.
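        @ The left 8x8 half to transpose is p7 (d16, unchanged) plus the
        @ out p6-p0 registers; the right half is out q0-q6 plus q7 (d31,
        @ unchanged).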
        transpose_8x8   d16, d2, d3, d4, d5, d6, d8, d9
        transpose_8x8   d10, d11, d12, d13, d14, d15, d17, d31

        vst1.8          {d16}, [r12,:64], r1
        vst1.8          {d10}, [r0, :64], r1

        vst1.8          {d2},  [r12,:64], r1
        vst1.8          {d11}, [r0, :64], r1

        vst1.8          {d3},  [r12,:64], r1
        vst1.8          {d12}, [r0, :64], r1

        vst1.8          {d4},  [r12,:64], r1
        vst1.8          {d13}, [r0, :64], r1

        vst1.8          {d5},  [r12,:64], r1
        vst1.8          {d14}, [r0, :64], r1

        vst1.8          {d6},  [r12,:64], r1
        vst1.8          {d15}, [r0, :64], r1

        vst1.8          {d8},  [r12,:64], r1
        vst1.8          {d17}, [r0, :64], r1

        vst1.8          {d9},  [r12,:64], r1
        vst1.8          {d31}, [r0, :64], r1
        sub             r0, r0, r1, lsl #3
9:
        bx              lr
8:
        @ The same writeback as in loop_filter_h_8_8
        sub             r12, r0, #4
        add             r0, r12, r1, lsl #2
        transpose_q_8x8 q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27

        vst1.8          {d20}, [r12], r1
        vst1.8          {d24}, [r0], r1
        vst1.8          {d21}, [r12], r1
        vst1.8          {d25}, [r0], r1
        vst1.8          {d22}, [r12], r1
        vst1.8          {d26}, [r0], r1
        vst1.8          {d23}, [r12], r1
        vst1.8          {d27}, [r0], r1
        sub             r0, r0, r1, lsl #3
        add             r0, r0, #4
        bx              lr
7:
        @ The same writeback as in loop_filter_h_4_8
        sub             r12, r0, #2
        add             r0, r12, r1, lsl #2
        transpose_q_4x4 q11, q12, d22, d23, d24, d25
        vst1.32         {d22[0]}, [r12], r1
        vst1.32         {d22[1]}, [r0], r1
        vst1.32         {d23[0]}, [r12], r1
        vst1.32         {d23[1]}, [r0], r1
        vst1.32         {d24[0]}, [r12], r1
        vst1.32         {d24[1]}, [r0], r1
        vst1.32         {d25[0]}, [r12], r1
        vst1.32         {d25[1]}, [r0], r1
        sub             r0, r0, r1, lsl #3
        add             r0, r0, #2
        bx              lr
endfunc

function ff_vp9_loop_filter_h_16_8_neon, export=1
        ldr             r12, [sp]
        push            {lr}
        vpush           {q4-q7}
        push            {r12}
        bl              vp9_loop_filter_h_16_neon
        add             sp, sp, #4
        vpop            {q4-q7}
        pop             {pc}
endfunc

function ff_vp9_loop_filter_h_16_16_neon, export=1
        ldr             r12, [sp]
        // The filter clobbers r2 and r3, but we need to keep them for the second round
        push            {r2, r3, lr}
        vpush           {q4-q7}
        push            {r12}
        bl              vp9_loop_filter_h_16_neon
        add             r0, r0, r1, lsl #3
        ldr             r2, [sp, #68]
        ldr             r3, [sp, #72]
        bl              vp9_loop_filter_h_16_neon
        add             sp, sp, #4
        vpop            {q4-q7}
        pop             {r2, r3, pc}
endfunc