/*
 * ARM NEON optimised IDCT functions for HEVC decoding
 * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
 * Copyright (c) 2017 Alexandra Hájková
 *
 * Ported from arm/hevcdsp_idct_neon.S by
 * Copyright (c) 2020 Reimar Döffinger
 * Copyright (c) 2020 Josh Dekker
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

const trans, align=4
        .short 64, 83, 64, 36
        .short 89, 75, 50, 18
        .short 90, 87, 80, 70
        .short 57, 43, 25, 9
        .short 90, 90, 88, 85
        .short 82, 78, 73, 67
        .short 61, 54, 46, 38
        .short 31, 22, 13, 4
endconst

.macro clip10 in1, in2, c1, c2
        smax            \in1, \in1, \c1
        smax            \in2, \in2, \c1
        smin            \in1, \in1, \c2
        smin            \in2, \in2, \c2
.endm

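// ff_hevc_add_residual_NxN_<depth>_neon: add the int16_t residual at x1
// to the destination block at x0 (byte stride in x2) and clip the result
// to the output range (unsigned 8-bit via sqxtun, 10-bit via clip10).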
function ff_hevc_add_residual_4x4_8_neon, export=1
        ld1             {v0.8h-v1.8h}, [x1]
        ld1             {v2.s}[0], [x0], x2
        ld1             {v2.s}[1], [x0], x2
        ld1             {v2.s}[2], [x0], x2
        ld1             {v2.s}[3], [x0], x2
        sub             x0, x0, x2, lsl #2
        uxtl            v6.8h, v2.8b
        uxtl2           v7.8h, v2.16b
        sqadd           v0.8h, v0.8h, v6.8h
        sqadd           v1.8h, v1.8h, v7.8h
        sqxtun          v0.8b, v0.8h
        sqxtun2         v0.16b, v1.8h
        st1             {v0.s}[0], [x0], x2
        st1             {v0.s}[1], [x0], x2
        st1             {v0.s}[2], [x0], x2
        st1             {v0.s}[3], [x0], x2
        ret
endfunc

function ff_hevc_add_residual_4x4_10_neon, export=1
        mov             x12, x0
        ld1             {v0.8h-v1.8h}, [x1]
        ld1             {v2.d}[0], [x12], x2
        ld1             {v2.d}[1], [x12], x2
        ld1             {v3.d}[0], [x12], x2
        sqadd           v0.8h, v0.8h, v2.8h
        ld1             {v3.d}[1], [x12], x2
        movi            v4.8h, #0
        sqadd           v1.8h, v1.8h, v3.8h
        mvni            v5.8h, #0xFC, lsl #8 // movi #0x3FF
        clip10          v0.8h, v1.8h, v4.8h, v5.8h
        st1             {v0.d}[0], [x0], x2
        st1             {v0.d}[1], [x0], x2
        st1             {v1.d}[0], [x0], x2
        st1             {v1.d}[1], [x0], x2
        ret
endfunc

function ff_hevc_add_residual_8x8_8_neon, export=1
        add             x12, x0, x2
        add             x2, x2, x2
        mov             x3, #8
1:      subs            x3, x3, #2
        ld1             {v2.d}[0], [x0]
        ld1             {v2.d}[1], [x12]
        uxtl            v3.8h, v2.8b
        ld1             {v0.8h-v1.8h}, [x1], #32
        uxtl2           v2.8h, v2.16b
        sqadd           v0.8h, v0.8h, v3.8h
        sqadd           v1.8h, v1.8h, v2.8h
        sqxtun          v0.8b, v0.8h
        sqxtun2         v0.16b, v1.8h
        st1             {v0.d}[0], [x0], x2
        st1             {v0.d}[1], [x12], x2
        bne             1b
        ret
endfunc

function ff_hevc_add_residual_8x8_10_neon, export=1
        add             x12, x0, x2
        add             x2, x2, x2
        mov             x3, #8
        movi            v4.8h, #0
        mvni            v5.8h, #0xFC, lsl #8 // movi #0x3FF
1:      subs            x3, x3, #2
        ld1             {v0.8h-v1.8h}, [x1], #32
        ld1             {v2.8h}, [x0]
        sqadd           v0.8h, v0.8h, v2.8h
        ld1             {v3.8h}, [x12]
        sqadd           v1.8h, v1.8h, v3.8h
        clip10          v0.8h, v1.8h, v4.8h, v5.8h
        st1             {v0.8h}, [x0], x2
        st1             {v1.8h}, [x12], x2
        bne             1b
        ret
endfunc

function ff_hevc_add_residual_16x16_8_neon, export=1
        mov             x3, #16
        add             x12, x0, x2
        add             x2, x2, x2
1:      subs            x3, x3, #2
        ld1             {v16.16b}, [x0]
        ld1             {v0.8h-v3.8h}, [x1], #64
        ld1             {v19.16b}, [x12]
        uxtl            v17.8h, v16.8b
        uxtl2           v18.8h, v16.16b
        uxtl            v20.8h, v19.8b
        uxtl2           v21.8h, v19.16b
        sqadd           v0.8h, v0.8h, v17.8h
        sqadd           v1.8h, v1.8h, v18.8h
        sqadd           v2.8h, v2.8h, v20.8h
        sqadd           v3.8h, v3.8h, v21.8h
        sqxtun          v0.8b, v0.8h
        sqxtun2         v0.16b, v1.8h
        sqxtun          v1.8b, v2.8h
        sqxtun2         v1.16b, v3.8h
        st1             {v0.16b}, [x0], x2
        st1             {v1.16b}, [x12], x2
        bne             1b
        ret
endfunc

function ff_hevc_add_residual_16x16_10_neon, export=1
        mov             x3, #16
        movi            v20.8h, #0
        mvni            v21.8h, #0xFC, lsl #8 // movi #0x3FF
        add             x12, x0, x2
        add             x2, x2, x2
1:      subs            x3, x3, #2
        ld1             {v16.8h-v17.8h}, [x0]
        ld1             {v0.8h-v3.8h}, [x1], #64
        sqadd           v0.8h, v0.8h, v16.8h
        ld1             {v18.8h-v19.8h}, [x12]
        sqadd           v1.8h, v1.8h, v17.8h
        sqadd           v2.8h, v2.8h, v18.8h
        sqadd           v3.8h, v3.8h, v19.8h
        clip10          v0.8h, v1.8h, v20.8h, v21.8h
        clip10          v2.8h, v3.8h, v20.8h, v21.8h
        st1             {v0.8h-v1.8h}, [x0], x2
        st1             {v2.8h-v3.8h}, [x12], x2
        bne             1b
        ret
endfunc

function ff_hevc_add_residual_32x32_8_neon, export=1
        add             x12, x0, x2
        add             x2, x2, x2
        mov             x3, #32
1:      subs            x3, x3, #2
        ld1             {v20.16b, v21.16b}, [x0]
        uxtl            v16.8h, v20.8b
        uxtl2           v17.8h, v20.16b
        ld1             {v22.16b, v23.16b}, [x12]
        uxtl            v18.8h, v21.8b
        uxtl2           v19.8h, v21.16b
        uxtl            v20.8h, v22.8b
        ld1             {v0.8h-v3.8h}, [x1], #64
        ld1             {v4.8h-v7.8h}, [x1], #64
        uxtl2           v21.8h, v22.16b
        uxtl            v22.8h, v23.8b
        uxtl2           v23.8h, v23.16b
        sqadd           v0.8h, v0.8h, v16.8h
        sqadd           v1.8h, v1.8h, v17.8h
        sqadd           v2.8h, v2.8h, v18.8h
        sqadd           v3.8h, v3.8h, v19.8h
        sqadd           v4.8h, v4.8h, v20.8h
        sqadd           v5.8h, v5.8h, v21.8h
        sqadd           v6.8h, v6.8h, v22.8h
        sqadd           v7.8h, v7.8h, v23.8h
        sqxtun          v0.8b, v0.8h
        sqxtun2         v0.16b, v1.8h
        sqxtun          v1.8b, v2.8h
        sqxtun2         v1.16b, v3.8h
        sqxtun          v2.8b, v4.8h
        sqxtun2         v2.16b, v5.8h
        st1             {v0.16b, v1.16b}, [x0], x2
        sqxtun          v3.8b, v6.8h
        sqxtun2         v3.16b, v7.8h
        st1             {v2.16b, v3.16b}, [x12], x2
        bne             1b
        ret
endfunc

function ff_hevc_add_residual_32x32_10_neon, export=1
        mov             x3, #32
        movi            v20.8h, #0
        mvni            v21.8h, #0xFC, lsl #8 // movi #0x3FF
1:      subs            x3, x3, #1
        ld1             {v0.8h-v3.8h}, [x1], #64
        ld1             {v16.8h-v19.8h}, [x0]
        sqadd           v0.8h, v0.8h, v16.8h
        sqadd           v1.8h, v1.8h, v17.8h
        sqadd           v2.8h, v2.8h, v18.8h
        sqadd           v3.8h, v3.8h, v19.8h
        clip10          v0.8h, v1.8h, v20.8h, v21.8h
        clip10          v2.8h, v3.8h, v20.8h, v21.8h
        st1             {v0.8h-v3.8h}, [x0], x2
        bne             1b
        ret
endfunc

.macro sum_sub out, in, c, op, p
  .ifc \op, +
        smlal\p         \out, \in, \c
  .else
        smlsl\p         \out, \in, \c
  .endif
.endm

.macro fixsqrshrn d, dt, n, m
  .ifc \dt, .8h
        sqrshrn2        \d\dt, \n\().4s, \m
  .else
        sqrshrn         \n\().4h, \n\().4s, \m
        mov             \d\().d[0], \n\().d[0]
  .endif
.endm

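// 4-point partial butterfly, with v0 holding the first row of the trans
// table (64, 83, 64, 36): e0/e1 come from in0/in2 scaled by 64, o0/o1
// from in1/in3 scaled by 83 and 36, and the outputs are e0+o0, e1+o1,
// e1-o1, e0-o0 as 32-bit intermediates.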
// uses and clobbers v28-v31 as temp registers
.macro tr_4x4_8 in0, in1, in2, in3, out0, out1, out2, out3, p1, p2
        sshll\p1        v28.4s, \in0, #6
        mov             v29.16b, v28.16b
        smull\p1        v30.4s, \in1, v0.h[1]
        smull\p1        v31.4s, \in1, v0.h[3]
        smlal\p2        v28.4s, \in2, v0.h[0] //e0
        smlsl\p2        v29.4s, \in2, v0.h[0] //e1
        smlal\p2        v30.4s, \in3, v0.h[3] //o0
        smlsl\p2        v31.4s, \in3, v0.h[1] //o1

        add             \out0, v28.4s, v30.4s
        add             \out1, v29.4s, v31.4s
        sub             \out2, v29.4s, v31.4s
        sub             \out3, v28.4s, v30.4s
.endm

.macro transpose8_4x4 r0, r1, r2, r3
        trn1            v2.8h, \r0\().8h, \r1\().8h
        trn2            v3.8h, \r0\().8h, \r1\().8h
        trn1            v4.8h, \r2\().8h, \r3\().8h
        trn2            v5.8h, \r2\().8h, \r3\().8h
        trn1            \r0\().4s, v2.4s, v4.4s
        trn2            \r2\().4s, v2.4s, v4.4s
        trn1            \r1\().4s, v3.4s, v5.4s
        trn2            \r3\().4s, v3.4s, v5.4s
.endm

.macro transpose_8x8 r0, r1, r2, r3, r4, r5, r6, r7
        transpose8_4x4  \r0, \r1, \r2, \r3
        transpose8_4x4  \r4, \r5, \r6, \r7
.endm

.macro tr_8x4 shift, in0,in0t, in1,in1t, in2,in2t, in3,in3t, in4,in4t, in5,in5t, in6,in6t, in7,in7t, p1, p2
        tr_4x4_8        \in0\in0t, \in2\in2t, \in4\in4t, \in6\in6t, v24.4s, v25.4s, v26.4s, v27.4s, \p1, \p2

        smull\p1        v30.4s, \in1\in1t, v0.h[6]
        smull\p1        v28.4s, \in1\in1t, v0.h[4]
        smull\p1        v29.4s, \in1\in1t, v0.h[5]
        sum_sub         v30.4s, \in3\in3t, v0.h[4], -, \p1
        sum_sub         v28.4s, \in3\in3t, v0.h[5], +, \p1
        sum_sub         v29.4s, \in3\in3t, v0.h[7], -, \p1

        sum_sub         v30.4s, \in5\in5t, v0.h[7], +, \p2
        sum_sub         v28.4s, \in5\in5t, v0.h[6], +, \p2
        sum_sub         v29.4s, \in5\in5t, v0.h[4], -, \p2

        sum_sub         v30.4s, \in7\in7t, v0.h[5], +, \p2
        sum_sub         v28.4s, \in7\in7t, v0.h[7], +, \p2
        sum_sub         v29.4s, \in7\in7t, v0.h[6], -, \p2

        add             v31.4s, v26.4s, v30.4s
        sub             v26.4s, v26.4s, v30.4s
        fixsqrshrn      \in2,\in2t, v31, \shift


        smull\p1        v31.4s, \in1\in1t, v0.h[7]
        sum_sub         v31.4s, \in3\in3t, v0.h[6], -, \p1
        sum_sub         v31.4s, \in5\in5t, v0.h[5], +, \p2
        sum_sub         v31.4s, \in7\in7t, v0.h[4], -, \p2
        fixsqrshrn      \in5,\in5t, v26, \shift


        add             v26.4s, v24.4s, v28.4s
        sub             v24.4s, v24.4s, v28.4s
        add             v28.4s, v25.4s, v29.4s
        sub             v25.4s, v25.4s, v29.4s
        add             v30.4s, v27.4s, v31.4s
        sub             v27.4s, v27.4s, v31.4s

        fixsqrshrn      \in0,\in0t, v26, \shift
        fixsqrshrn      \in7,\in7t, v24, \shift
        fixsqrshrn      \in1,\in1t, v28, \shift
        fixsqrshrn      \in6,\in6t, v25, \shift
        fixsqrshrn      \in3,\in3t, v30, \shift
        fixsqrshrn      \in4,\in4t, v27, \shift
.endm

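// Full 8x8 IDCT on the coefficient block at x0: one tr_8x4 pass with
// shift 7, a transpose, a second pass with shift 20 - bitdepth, and a
// final transpose before the result is written back in place.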
.macro idct_8x8 bitdepth
function ff_hevc_idct_8x8_\bitdepth\()_neon, export=1
//x0 - coeffs
        mov             x1, x0
        ld1             {v16.8h-v19.8h}, [x1], #64
        ld1             {v20.8h-v23.8h}, [x1]

        movrel          x1, trans
        ld1             {v0.8h}, [x1]

        tr_8x4          7, v16,.4h, v17,.4h, v18,.4h, v19,.4h, v20,.4h, v21,.4h, v22,.4h, v23,.4h
        tr_8x4          7, v16,.8h, v17,.8h, v18,.8h, v19,.8h, v20,.8h, v21,.8h, v22,.8h, v23,.8h, 2, 2

        transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23

        tr_8x4          20 - \bitdepth, v16,.4h, v17,.4h, v18,.4h, v19,.4h, v16,.8h, v17,.8h, v18,.8h, v19,.8h, , 2
        tr_8x4          20 - \bitdepth, v20,.4h, v21,.4h, v22,.4h, v23,.4h, v20,.8h, v21,.8h, v22,.8h, v23,.8h, , 2

        transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23

        mov             x1, x0
        st1             {v16.8h-v19.8h}, [x1], #64
        st1             {v20.8h-v23.8h}, [x1]

        ret
endfunc
.endm

.macro butterfly e, o, tmp_p, tmp_m
        add             \tmp_p, \e, \o
        sub             \tmp_m, \e, \o
.endm

.macro tr16_8x4 in0, in1, in2, in3, offset
        tr_4x4_8        \in0\().4h, \in1\().4h, \in2\().4h, \in3\().4h, v24.4s, v25.4s, v26.4s, v27.4s

        smull2          v28.4s, \in0\().8h, v0.h[4]
        smull2          v29.4s, \in0\().8h, v0.h[5]
        smull2          v30.4s, \in0\().8h, v0.h[6]
        smull2          v31.4s, \in0\().8h, v0.h[7]
        sum_sub         v28.4s, \in1\().8h, v0.h[5], +, 2
        sum_sub         v29.4s, \in1\().8h, v0.h[7], -, 2
        sum_sub         v30.4s, \in1\().8h, v0.h[4], -, 2
        sum_sub         v31.4s, \in1\().8h, v0.h[6], -, 2

        sum_sub         v28.4s, \in2\().8h, v0.h[6], +, 2
        sum_sub         v29.4s, \in2\().8h, v0.h[4], -, 2
        sum_sub         v30.4s, \in2\().8h, v0.h[7], +, 2
        sum_sub         v31.4s, \in2\().8h, v0.h[5], +, 2

        sum_sub         v28.4s, \in3\().8h, v0.h[7], +, 2
        sum_sub         v29.4s, \in3\().8h, v0.h[6], -, 2
        sum_sub         v30.4s, \in3\().8h, v0.h[5], +, 2
        sum_sub         v31.4s, \in3\().8h, v0.h[4], -, 2

        butterfly       v24.4s, v28.4s, v16.4s, v23.4s
        butterfly       v25.4s, v29.4s, v17.4s, v22.4s
        butterfly       v26.4s, v30.4s, v18.4s, v21.4s
        butterfly       v27.4s, v31.4s, v19.4s, v20.4s
        add             x4, sp, #\offset
        st1             {v16.4s-v19.4s}, [x4], #64
        st1             {v20.4s-v23.4s}, [x4]
.endm

.macro load16 in0, in1, in2, in3
        ld1             {\in0}[0], [x1], x2
        ld1             {\in0}[1], [x3], x2
        ld1             {\in1}[0], [x1], x2
        ld1             {\in1}[1], [x3], x2
        ld1             {\in2}[0], [x1], x2
        ld1             {\in2}[1], [x3], x2
        ld1             {\in3}[0], [x1], x2
        ld1             {\in3}[1], [x3], x2
.endm

.macro add_member in, t0, t1, t2, t3, t4, t5, t6, t7, op0, op1, op2, op3, op4, op5, op6, op7, p
        sum_sub         v21.4s, \in, \t0, \op0, \p
        sum_sub         v22.4s, \in, \t1, \op1, \p
        sum_sub         v23.4s, \in, \t2, \op2, \p
        sum_sub         v24.4s, \in, \t3, \op3, \p
        sum_sub         v25.4s, \in, \t4, \op4, \p
        sum_sub         v26.4s, \in, \t5, \op5, \p
        sum_sub         v27.4s, \in, \t6, \op6, \p
        sum_sub         v28.4s, \in, \t7, \op7, \p
.endm

.macro butterfly16 in0, in1, in2, in3, in4, in5, in6, in7
        add             v20.4s, \in0, \in1
        sub             \in0, \in0, \in1
        add             \in1, \in2, \in3
        sub             \in2, \in2, \in3
        add             \in3, \in4, \in5
        sub             \in4, \in4, \in5
        add             \in5, \in6, \in7
        sub             \in6, \in6, \in7
.endm

.macro store16 in0, in1, in2, in3, rx
        st1             {\in0}[0], [x1], x2
        st1             {\in0}[1], [x3], \rx
        st1             {\in1}[0], [x1], x2
        st1             {\in1}[1], [x3], \rx
        st1             {\in2}[0], [x1], x2
        st1             {\in2}[1], [x3], \rx
        st1             {\in3}[0], [x1], x2
        st1             {\in3}[1], [x3], \rx
.endm

.macro scale out0, out1, out2, out3, in0, in1, in2, in3, in4, in5, in6, in7, shift
        sqrshrn         \out0\().4h, \in0, \shift
        sqrshrn2        \out0\().8h, \in1, \shift
        sqrshrn         \out1\().4h, \in2, \shift
        sqrshrn2        \out1\().8h, \in3, \shift
        sqrshrn         \out2\().4h, \in4, \shift
        sqrshrn2        \out2\().8h, \in5, \shift
        sqrshrn         \out3\().4h, \in6, \shift
        sqrshrn2        \out3\().8h, \in7, \shift
.endm

.macro transpose16_4x4_2 r0, r1, r2, r3
        // lower halves
        trn1            v2.4h, \r0\().4h, \r1\().4h
        trn2            v3.4h, \r0\().4h, \r1\().4h
        trn1            v4.4h, \r2\().4h, \r3\().4h
        trn2            v5.4h, \r2\().4h, \r3\().4h
        trn1            v6.2s, v2.2s, v4.2s
        trn2            v7.2s, v2.2s, v4.2s
        trn1            v2.2s, v3.2s, v5.2s
        trn2            v4.2s, v3.2s, v5.2s
        mov             \r0\().d[0], v6.d[0]
        mov             \r2\().d[0], v7.d[0]
        mov             \r1\().d[0], v2.d[0]
        mov             \r3\().d[0], v4.d[0]

        // upper halves in reverse order
        trn1            v2.8h, \r3\().8h, \r2\().8h
        trn2            v3.8h, \r3\().8h, \r2\().8h
        trn1            v4.8h, \r1\().8h, \r0\().8h
        trn2            v5.8h, \r1\().8h, \r0\().8h
        trn1            v6.4s, v2.4s, v4.4s
        trn2            v7.4s, v2.4s, v4.4s
        trn1            v2.4s, v3.4s, v5.4s
        trn2            v4.4s, v3.4s, v5.4s
        mov             \r3\().d[1], v6.d[1]
        mov             \r1\().d[1], v7.d[1]
        mov             \r2\().d[1], v2.d[1]
        mov             \r0\().d[1], v4.d[1]
.endm

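// One 16x4 slice of the 16-point transform: the even input lines go
// through tr16_8x4, which spills its 32-bit intermediates to the stack
// at sp + offset; the odd lines are accumulated against the 16-point odd
// coefficients loaded from trans + 16, then both halves are butterflied,
// rescaled by shift and stored transposed to the buffer at x6.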
.macro tr_16x4 name, shift, offset, step
function func_tr_16x4_\name
        mov             x1, x5
        add             x3, x5, #(\step * 64)
        mov             x2, #(\step * 128)
        load16          v16.d, v17.d, v18.d, v19.d
        movrel          x1, trans
        ld1             {v0.8h}, [x1]

        tr16_8x4        v16, v17, v18, v19, \offset

        add             x1, x5, #(\step * 32)
        add             x3, x5, #(\step * 3 * 32)
        mov             x2, #(\step * 128)
        load16          v20.d, v17.d, v18.d, v19.d
        movrel          x1, trans, 16
        ld1             {v1.8h}, [x1]
        smull           v21.4s, v20.4h, v1.h[0]
        smull           v22.4s, v20.4h, v1.h[1]
        smull           v23.4s, v20.4h, v1.h[2]
        smull           v24.4s, v20.4h, v1.h[3]
        smull           v25.4s, v20.4h, v1.h[4]
        smull           v26.4s, v20.4h, v1.h[5]
        smull           v27.4s, v20.4h, v1.h[6]
        smull           v28.4s, v20.4h, v1.h[7]

        add_member      v20.8h, v1.h[1], v1.h[4], v1.h[7], v1.h[5], v1.h[2], v1.h[0], v1.h[3], v1.h[6], +, +, +, -, -, -, -, -, 2
        add_member      v17.4h, v1.h[2], v1.h[7], v1.h[3], v1.h[1], v1.h[6], v1.h[4], v1.h[0], v1.h[5], +, +, -, -, -, +, +, +
        add_member      v17.8h, v1.h[3], v1.h[5], v1.h[1], v1.h[7], v1.h[0], v1.h[6], v1.h[2], v1.h[4], +, -, -, +, +, +, -, -, 2
        add_member      v18.4h, v1.h[4], v1.h[2], v1.h[6], v1.h[0], v1.h[7], v1.h[1], v1.h[5], v1.h[3], +, -, -, +, -, -, +, +
        add_member      v18.8h, v1.h[5], v1.h[0], v1.h[4], v1.h[6], v1.h[1], v1.h[3], v1.h[7], v1.h[2], +, -, +, +, -, +, +, -, 2
        add_member      v19.4h, v1.h[6], v1.h[3], v1.h[0], v1.h[2], v1.h[5], v1.h[7], v1.h[4], v1.h[1], +, -, +, -, +, +, -, +
        add_member      v19.8h, v1.h[7], v1.h[6], v1.h[5], v1.h[4], v1.h[3], v1.h[2], v1.h[1], v1.h[0], +, -, +, -, +, -, +, -, 2

        add             x4, sp, #\offset
        ld1             {v16.4s-v19.4s}, [x4], #64

        butterfly16     v16.4s, v21.4s, v17.4s, v22.4s, v18.4s, v23.4s, v19.4s, v24.4s
        scale           v29, v30, v31, v24, v20.4s, v16.4s, v21.4s, v17.4s, v22.4s, v18.4s, v23.4s, v19.4s, \shift
        transpose16_4x4_2 v29, v30, v31, v24
        mov             x1, x6
        add             x3, x6, #(24 + 3 * 32)
        mov             x2, #32
        mov             x4, #-32
        store16         v29.d, v30.d, v31.d, v24.d, x4

        add             x4, sp, #(\offset + 64)
        ld1             {v16.4s-v19.4s}, [x4]
        butterfly16     v16.4s, v25.4s, v17.4s, v26.4s, v18.4s, v27.4s, v19.4s, v28.4s
        scale           v29, v30, v31, v20, v20.4s, v16.4s, v25.4s, v17.4s, v26.4s, v18.4s, v27.4s, v19.4s, \shift
        transpose16_4x4_2 v29, v30, v31, v20

        add             x1, x6, #8
        add             x3, x6, #(16 + 3 * 32)
        mov             x2, #32
        mov             x4, #-32
        store16         v29.d, v30.d, v31.d, v20.d, x4

        ret
endfunc
.endm

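// Full 16x16 IDCT: each pass calls func_tr_16x4 four times, once per
// group of four columns; the first pass writes into a temporary buffer
// on the stack, the second pass writes the result back over the
// coefficients at x0.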
.macro idct_16x16 bitdepth
function ff_hevc_idct_16x16_\bitdepth\()_neon, export=1
//x0 - coeffs
        mov             x15, x30

        // allocate a temp buffer
        sub             sp, sp, #640

.irp i, 0, 1, 2, 3
        add             x5, x0, #(8 * \i)
        add             x6, sp, #(8 * \i * 16)
        bl              func_tr_16x4_firstpass
.endr

.irp i, 0, 1, 2, 3
        add             x5, sp, #(8 * \i)
        add             x6, x0, #(8 * \i * 16)
        bl              func_tr_16x4_secondpass_\bitdepth
.endr

        add             sp, sp, #640

        mov             x30, x15
        ret
endfunc
.endm

idct_8x8 8
idct_8x8 10

tr_16x4 firstpass, 7, 512, 1
tr_16x4 secondpass_8, 20 - 8, 512, 1
tr_16x4 secondpass_10, 20 - 10, 512, 1

idct_16x16 8
idct_16x16 10

// void ff_hevc_idct_NxN_dc_DEPTH_neon(int16_t *coeffs)
.macro idct_dc size, bitdepth
function ff_hevc_idct_\size\()x\size\()_dc_\bitdepth\()_neon, export=1
        ld1r            {v4.8h}, [x0]
        srshr           v4.8h, v4.8h, #1
        srshr           v0.8h, v4.8h, #(14 - \bitdepth)
        srshr           v1.8h, v4.8h, #(14 - \bitdepth)
.if \size > 4
        srshr           v2.8h, v4.8h, #(14 - \bitdepth)
        srshr           v3.8h, v4.8h, #(14 - \bitdepth)
.if \size > 16 /* dc 32x32 */
        mov             x2, #4
1:
        subs            x2, x2, #1
.endif
        add             x12, x0, #64
        mov             x13, #128
.if \size > 8 /* dc 16x16 */
        st1             {v0.8h-v3.8h}, [x0], x13
        st1             {v0.8h-v3.8h}, [x12], x13
        st1             {v0.8h-v3.8h}, [x0], x13
        st1             {v0.8h-v3.8h}, [x12], x13
        st1             {v0.8h-v3.8h}, [x0], x13
        st1             {v0.8h-v3.8h}, [x12], x13
.endif /* dc 8x8 */
        st1             {v0.8h-v3.8h}, [x0], x13
        st1             {v0.8h-v3.8h}, [x12], x13
.if \size > 16 /* dc 32x32 */
        bne             1b
.endif
.else /* dc 4x4 */
        st1             {v0.8h-v1.8h}, [x0]
.endif
        ret
endfunc
.endm

idct_dc 4, 8
idct_dc 4, 10

idct_dc 8, 8
idct_dc 8, 10

idct_dc 16, 8
idct_dc 16, 10

idct_dc 32, 8
idct_dc 32, 10