/*
 * ARM NEON optimised IDCT functions for HEVC decoding
 * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
 * Copyright (c) 2017 Alexandra Hájková
 *
 * Ported from arm/hevcdsp_idct_neon.S by
 * Copyright (c) 2020 Reimar Döffinger
 * Copyright (c) 2020 Josh Dekker
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

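// HEVC inverse transform coefficients: the 4 point factors (64, 83, 64, 36),
// then the odd factors of the 8, 16 and 32 point transforms.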
const trans, align=4
        .short 64, 83, 64, 36
        .short 89, 75, 50, 18
        .short 90, 87, 80, 70
        .short 57, 43, 25, 9
        .short 90, 90, 88, 85
        .short 82, 78, 73, 67
        .short 61, 54, 46, 38
        .short 31, 22, 13, 4
endconst

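// clamp \in1 and \in2 to the range [\c1, \c2]; callers pass 0 and 0x3FF
// to clip 10 bit samples.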
.macro clip10 in1, in2, c1, c2
        smax        \in1, \in1, \c1
        smax        \in2, \in2, \c1
        smin        \in1, \in1, \c2
        smin        \in2, \in2, \c2
.endm

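// ff_hevc_add_residual_NxN_DEPTH_neon: add the int16_t residual block at x1
// to the NxN pixel block at x0 (line stride x2, in bytes), saturating to the
// pixel range: unsigned narrowing for 8 bit, clip10 for 10 bit.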
function ff_hevc_add_residual_4x4_8_neon, export=1
        ld1             {v0.8h-v1.8h}, [x1]
        ld1             {v2.s}[0], [x0], x2
        ld1             {v2.s}[1], [x0], x2
        ld1             {v2.s}[2], [x0], x2
        ld1             {v2.s}[3], [x0], x2
        sub              x0,  x0,  x2, lsl #2
        uxtl             v6.8h,  v2.8b
        uxtl2            v7.8h,  v2.16b
        sqadd            v0.8h,  v0.8h, v6.8h
        sqadd            v1.8h,  v1.8h, v7.8h
        sqxtun           v0.8b,  v0.8h
        sqxtun2          v0.16b, v1.8h
        st1             {v0.s}[0], [x0], x2
        st1             {v0.s}[1], [x0], x2
        st1             {v0.s}[2], [x0], x2
        st1             {v0.s}[3], [x0], x2
        ret
endfunc

function ff_hevc_add_residual_4x4_10_neon, export=1
        mov             x12,  x0
        ld1             {v0.8h-v1.8h}, [x1]
        ld1             {v2.d}[0], [x12], x2
        ld1             {v2.d}[1], [x12], x2
        ld1             {v3.d}[0], [x12], x2
        sqadd            v0.8h, v0.8h, v2.8h
        ld1             {v3.d}[1], [x12], x2
        movi             v4.8h, #0
        sqadd            v1.8h, v1.8h, v3.8h
        mvni             v5.8h, #0xFC, lsl #8 // movi #0x3FF
        clip10           v0.8h, v1.8h, v4.8h, v5.8h
        st1             {v0.d}[0],  [x0], x2
        st1             {v0.d}[1],  [x0], x2
        st1             {v1.d}[0],  [x0], x2
        st1             {v1.d}[1],  [x0], x2
        ret
endfunc

function ff_hevc_add_residual_8x8_8_neon, export=1
        add             x12,  x0, x2
        add              x2,  x2, x2
        mov              x3,  #8
1:      subs             x3,  x3, #2
        ld1             {v2.d}[0],     [x0]
        ld1             {v2.d}[1],    [x12]
        uxtl             v3.8h,  v2.8b
        ld1             {v0.8h-v1.8h}, [x1], #32
        uxtl2            v2.8h,  v2.16b
        sqadd            v0.8h,  v0.8h,   v3.8h
        sqadd            v1.8h,  v1.8h,   v2.8h
        sqxtun           v0.8b,  v0.8h
        sqxtun2          v0.16b, v1.8h
        st1             {v0.d}[0],     [x0], x2
        st1             {v0.d}[1],    [x12], x2
        bne              1b
        ret
endfunc

function ff_hevc_add_residual_8x8_10_neon, export=1
        add             x12,  x0, x2
        add              x2,  x2, x2
        mov              x3,  #8
        movi             v4.8h, #0
        mvni             v5.8h, #0xFC, lsl #8 // movi #0x3FF
1:      subs             x3,  x3, #2
        ld1             {v0.8h-v1.8h}, [x1], #32
        ld1             {v2.8h},       [x0]
        sqadd            v0.8h, v0.8h, v2.8h
        ld1             {v3.8h},      [x12]
        sqadd            v1.8h, v1.8h, v3.8h
        clip10           v0.8h, v1.8h, v4.8h, v5.8h
        st1             {v0.8h},       [x0], x2
        st1             {v1.8h},      [x12], x2
        bne              1b
        ret
endfunc

function ff_hevc_add_residual_16x16_8_neon, export=1
        mov              x3,  #16
        add             x12, x0, x2
        add              x2,  x2, x2
1:      subs             x3,  x3, #2
        ld1             {v16.16b},     [x0]
        ld1             {v0.8h-v3.8h}, [x1], #64
        ld1             {v19.16b},    [x12]
        uxtl            v17.8h, v16.8b
        uxtl2           v18.8h, v16.16b
        uxtl            v20.8h, v19.8b
        uxtl2           v21.8h, v19.16b
        sqadd            v0.8h,  v0.8h, v17.8h
        sqadd            v1.8h,  v1.8h, v18.8h
        sqadd            v2.8h,  v2.8h, v20.8h
        sqadd            v3.8h,  v3.8h, v21.8h
        sqxtun           v0.8b,  v0.8h
        sqxtun2         v0.16b,  v1.8h
        sqxtun           v1.8b,  v2.8h
        sqxtun2         v1.16b,  v3.8h
        st1             {v0.16b},     [x0], x2
        st1             {v1.16b},    [x12], x2
        bne              1b
        ret
endfunc

function ff_hevc_add_residual_16x16_10_neon, export=1
        mov              x3,  #16
        movi            v20.8h, #0
        mvni            v21.8h, #0xFC, lsl #8 // movi #0x3FF
        add             x12,  x0, x2
        add              x2,  x2, x2
1:      subs             x3,  x3, #2
        ld1             {v16.8h-v17.8h}, [x0]
        ld1             {v0.8h-v3.8h},  [x1], #64
        sqadd            v0.8h, v0.8h, v16.8h
        ld1             {v18.8h-v19.8h}, [x12]
        sqadd            v1.8h, v1.8h, v17.8h
        sqadd            v2.8h, v2.8h, v18.8h
        sqadd            v3.8h, v3.8h, v19.8h
        clip10           v0.8h, v1.8h, v20.8h, v21.8h
        clip10           v2.8h, v3.8h, v20.8h, v21.8h
        st1             {v0.8h-v1.8h},   [x0], x2
        st1             {v2.8h-v3.8h},  [x12], x2
        bne              1b
        ret
endfunc

function ff_hevc_add_residual_32x32_8_neon, export=1
        add             x12,  x0, x2
        add              x2,  x2, x2
        mov              x3,  #32
1:      subs             x3,  x3, #2
        ld1             {v20.16b, v21.16b}, [x0]
        uxtl            v16.8h,  v20.8b
        uxtl2           v17.8h,  v20.16b
        ld1             {v22.16b, v23.16b}, [x12]
        uxtl            v18.8h,  v21.8b
        uxtl2           v19.8h,  v21.16b
        uxtl            v20.8h,  v22.8b
        ld1             {v0.8h-v3.8h}, [x1], #64
        ld1             {v4.8h-v7.8h}, [x1], #64
        uxtl2           v21.8h,  v22.16b
        uxtl            v22.8h,  v23.8b
        uxtl2           v23.8h,  v23.16b
        sqadd            v0.8h,  v0.8h,  v16.8h
        sqadd            v1.8h,  v1.8h,  v17.8h
        sqadd            v2.8h,  v2.8h,  v18.8h
        sqadd            v3.8h,  v3.8h,  v19.8h
        sqadd            v4.8h,  v4.8h,  v20.8h
        sqadd            v5.8h,  v5.8h,  v21.8h
        sqadd            v6.8h,  v6.8h,  v22.8h
        sqadd            v7.8h,  v7.8h,  v23.8h
        sqxtun           v0.8b,  v0.8h
        sqxtun2         v0.16b,  v1.8h
        sqxtun           v1.8b,  v2.8h
        sqxtun2         v1.16b,  v3.8h
        sqxtun           v2.8b,  v4.8h
        sqxtun2         v2.16b,  v5.8h
        st1             {v0.16b, v1.16b},  [x0], x2
        sqxtun           v3.8b,  v6.8h
        sqxtun2         v3.16b,  v7.8h
        st1             {v2.16b, v3.16b}, [x12], x2
        bne              1b
        ret
endfunc

function ff_hevc_add_residual_32x32_10_neon, export=1
        mov              x3,  #32
        movi            v20.8h, #0
        mvni            v21.8h, #0xFC, lsl #8 // movi #0x3FF
1:      subs             x3,  x3, #1
        ld1             {v0.8h-v3.8h},   [x1], #64
        ld1             {v16.8h-v19.8h}, [x0]
        sqadd            v0.8h, v0.8h, v16.8h
        sqadd            v1.8h, v1.8h, v17.8h
        sqadd            v2.8h, v2.8h, v18.8h
        sqadd            v3.8h, v3.8h, v19.8h
        clip10           v0.8h, v1.8h, v20.8h, v21.8h
        clip10           v2.8h, v3.8h, v20.8h, v21.8h
        st1             {v0.8h-v3.8h},   [x0], x2
        bne              1b
        ret
endfunc

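// multiply-accumulate: \out += \in * \c when \op is +, \out -= \in * \c
// otherwise; \p can be 2 to operate on the high halves (smlal2/smlsl2).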
.macro sum_sub out, in, c, op, p
  .ifc \op, +
        smlal\p         \out, \in, \c
  .else
        smlsl\p         \out, \in, \c
  .endif
.endm

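// saturating rounding narrow of \n.4s by \m into \d: the high half when
// \dt is .8h, otherwise the low half (the other half is left untouched).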
.macro fixsqrshrn d, dt, n, m
  .ifc \dt, .8h
        sqrshrn2        \d\dt, \n\().4s, \m
  .else
        sqrshrn         \n\().4h, \n\().4s, \m
        mov             \d\().d[0], \n\().d[0]
  .endif
.endm

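// 4 point inverse transform: even part from \in0/\in2, odd part from
// \in1/\in3, 32 bit results in \out0-\out3.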
// uses and clobbers v28-v31 as temp registers
.macro tr_4x4_8 in0, in1, in2, in3, out0, out1, out2, out3, p1, p2
         sshll\p1       v28.4s, \in0, #6
         mov            v29.16b, v28.16b
         smull\p1       v30.4s, \in1, v0.h[1]
         smull\p1       v31.4s, \in1, v0.h[3]
         smlal\p2       v28.4s, \in2, v0.h[0] //e0
         smlsl\p2       v29.4s, \in2, v0.h[0] //e1
         smlal\p2       v30.4s, \in3, v0.h[3] //o0
         smlsl\p2       v31.4s, \in3, v0.h[1] //o1

         add            \out0, v28.4s, v30.4s
         add            \out1, v29.4s, v31.4s
         sub            \out2, v29.4s, v31.4s
         sub            \out3, v28.4s, v30.4s
.endm

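// transpose the two 4x4 blocks of 16 bit elements held in the low and high
// halves of \r0-\r3; clobbers v2-v5.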
.macro transpose8_4x4 r0, r1, r2, r3
        trn1            v2.8h, \r0\().8h, \r1\().8h
        trn2            v3.8h, \r0\().8h, \r1\().8h
        trn1            v4.8h, \r2\().8h, \r3\().8h
        trn2            v5.8h, \r2\().8h, \r3\().8h
        trn1            \r0\().4s, v2.4s, v4.4s
        trn2            \r2\().4s, v2.4s, v4.4s
        trn1            \r1\().4s, v3.4s, v5.4s
        trn2            \r3\().4s, v3.4s, v5.4s
.endm

.macro transpose_8x8 r0, r1, r2, r3, r4, r5, r6, r7
        transpose8_4x4  \r0, \r1, \r2, \r3
        transpose8_4x4  \r4, \r5, \r6, \r7
.endm

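// 8 point inverse transform of four columns held in \in0-\in7 (even part via
// tr_4x4_8), each result narrowed with rounding by \shift.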
.macro tr_8x4 shift, in0,in0t, in1,in1t, in2,in2t, in3,in3t, in4,in4t, in5,in5t, in6,in6t, in7,in7t, p1, p2
        tr_4x4_8        \in0\in0t, \in2\in2t, \in4\in4t, \in6\in6t, v24.4s, v25.4s, v26.4s, v27.4s, \p1, \p2

        smull\p1        v30.4s, \in1\in1t, v0.h[6]
        smull\p1        v28.4s, \in1\in1t, v0.h[4]
        smull\p1        v29.4s, \in1\in1t, v0.h[5]
        sum_sub         v30.4s, \in3\in3t, v0.h[4], -, \p1
        sum_sub         v28.4s, \in3\in3t, v0.h[5], +, \p1
        sum_sub         v29.4s, \in3\in3t, v0.h[7], -, \p1

        sum_sub         v30.4s, \in5\in5t, v0.h[7], +, \p2
        sum_sub         v28.4s, \in5\in5t, v0.h[6], +, \p2
        sum_sub         v29.4s, \in5\in5t, v0.h[4], -, \p2

        sum_sub         v30.4s, \in7\in7t, v0.h[5], +, \p2
        sum_sub         v28.4s, \in7\in7t, v0.h[7], +, \p2
        sum_sub         v29.4s, \in7\in7t, v0.h[6], -, \p2

        add             v31.4s, v26.4s, v30.4s
        sub             v26.4s, v26.4s, v30.4s
        fixsqrshrn      \in2,\in2t, v31, \shift


        smull\p1        v31.4s, \in1\in1t, v0.h[7]
        sum_sub         v31.4s, \in3\in3t, v0.h[6], -, \p1
        sum_sub         v31.4s, \in5\in5t, v0.h[5], +, \p2
        sum_sub         v31.4s, \in7\in7t, v0.h[4], -, \p2
        fixsqrshrn      \in5,\in5t, v26, \shift


        add             v26.4s, v24.4s, v28.4s
        sub             v24.4s, v24.4s, v28.4s
        add             v28.4s, v25.4s, v29.4s
        sub             v25.4s, v25.4s, v29.4s
        add             v30.4s, v27.4s, v31.4s
        sub             v27.4s, v27.4s, v31.4s

        fixsqrshrn      \in0,\in0t, v26, \shift
        fixsqrshrn      \in7,\in7t, v24, \shift
        fixsqrshrn      \in1,\in1t, v28, \shift
        fixsqrshrn      \in6,\in6t, v25, \shift
        fixsqrshrn      \in3,\in3t, v30, \shift
        fixsqrshrn      \in4,\in4t, v27, \shift
.endm

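// 8x8 IDCT, in place on the coefficient buffer in x0: first pass with
// shift 7, transpose, second pass with shift 20 - bitdepth, transpose back.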
.macro idct_8x8 bitdepth
function ff_hevc_idct_8x8_\bitdepth\()_neon, export=1
//x0 - coeffs
        mov              x1,  x0
        ld1             {v16.8h-v19.8h}, [x1], #64
        ld1             {v20.8h-v23.8h}, [x1]

        movrel           x1, trans
        ld1             {v0.8h}, [x1]

        tr_8x4          7, v16,.4h, v17,.4h, v18,.4h, v19,.4h, v20,.4h, v21,.4h, v22,.4h, v23,.4h
        tr_8x4          7, v16,.8h, v17,.8h, v18,.8h, v19,.8h, v20,.8h, v21,.8h, v22,.8h, v23,.8h, 2, 2

        transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23

        tr_8x4          20 - \bitdepth, v16,.4h, v17,.4h, v18,.4h, v19,.4h, v16,.8h, v17,.8h, v18,.8h, v19,.8h, , 2
        tr_8x4          20 - \bitdepth, v20,.4h, v21,.4h, v22,.4h, v23,.4h, v20,.8h, v21,.8h, v22,.8h, v23,.8h, , 2

        transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23

        mov              x1,  x0
        st1             {v16.8h-v19.8h}, [x1], #64
        st1             {v20.8h-v23.8h}, [x1]

        ret
endfunc
.endm

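// \tmp_p = \e + \o, \tmp_m = \e - \o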
.macro butterfly e, o, tmp_p, tmp_m
        add        \tmp_p, \e, \o
        sub        \tmp_m, \e, \o
.endm

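// even part of the 16 point transform: 8 point transform of \in0-\in3,
// the butterflied 32 bit results are spilled to the stack at sp + \offset.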
.macro tr16_8x4 in0, in1, in2, in3, offset
        tr_4x4_8        \in0\().4h, \in1\().4h, \in2\().4h, \in3\().4h, v24.4s, v25.4s, v26.4s, v27.4s

        smull2          v28.4s, \in0\().8h, v0.h[4]
        smull2          v29.4s, \in0\().8h, v0.h[5]
        smull2          v30.4s, \in0\().8h, v0.h[6]
        smull2          v31.4s, \in0\().8h, v0.h[7]
        sum_sub         v28.4s, \in1\().8h, v0.h[5], +, 2
        sum_sub         v29.4s, \in1\().8h, v0.h[7], -, 2
        sum_sub         v30.4s, \in1\().8h, v0.h[4], -, 2
        sum_sub         v31.4s, \in1\().8h, v0.h[6], -, 2

        sum_sub         v28.4s, \in2\().8h, v0.h[6], +, 2
        sum_sub         v29.4s, \in2\().8h, v0.h[4], -, 2
        sum_sub         v30.4s, \in2\().8h, v0.h[7], +, 2
        sum_sub         v31.4s, \in2\().8h, v0.h[5], +, 2

        sum_sub         v28.4s, \in3\().8h, v0.h[7], +, 2
        sum_sub         v29.4s, \in3\().8h, v0.h[6], -, 2
        sum_sub         v30.4s, \in3\().8h, v0.h[5], +, 2
        sum_sub         v31.4s, \in3\().8h, v0.h[4], -, 2

        butterfly       v24.4s, v28.4s, v16.4s, v23.4s
        butterfly       v25.4s, v29.4s, v17.4s, v22.4s
        butterfly       v26.4s, v30.4s, v18.4s, v21.4s
        butterfly       v27.4s, v31.4s, v19.4s, v20.4s
        add              x4,  sp,  #\offset
        st1             {v16.4s-v19.4s}, [x4], #64
        st1             {v20.4s-v23.4s}, [x4]
.endm

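// load eight rows of four 16 bit coefficients, alternating between x1 and x3
// (both advanced by x2), into the d lanes of \in0-\in3.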
.macro load16 in0, in1, in2, in3
        ld1             {\in0}[0], [x1], x2
        ld1             {\in0}[1], [x3], x2
        ld1             {\in1}[0], [x1], x2
        ld1             {\in1}[1], [x3], x2
        ld1             {\in2}[0], [x1], x2
        ld1             {\in2}[1], [x3], x2
        ld1             {\in3}[0], [x1], x2
        ld1             {\in3}[1], [x3], x2
.endm

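// multiply-accumulate (or subtract, per \op0-\op7) \in by the coefficients
// \t0-\t7 into the accumulators v21-v28.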
.macro add_member in, t0, t1, t2, t3, t4, t5, t6, t7, op0, op1, op2, op3, op4, op5, op6, op7, p
        sum_sub v21.4s, \in, \t0, \op0, \p
        sum_sub v22.4s, \in, \t1, \op1, \p
        sum_sub v23.4s, \in, \t2, \op2, \p
        sum_sub v24.4s, \in, \t3, \op3, \p
        sum_sub v25.4s, \in, \t4, \op4, \p
        sum_sub v26.4s, \in, \t5, \op5, \p
        sum_sub v27.4s, \in, \t6, \op6, \p
        sum_sub v28.4s, \in, \t7, \op7, \p
.endm

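// recombine even and odd parts of the 16 point transform: the first sum goes
// to v20, the remaining sums and differences overwrite \in0-\in6 in place.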
.macro butterfly16 in0, in1, in2, in3, in4, in5, in6, in7
        add             v20.4s, \in0, \in1
        sub             \in0, \in0, \in1
        add             \in1, \in2, \in3
        sub             \in2, \in2, \in3
        add             \in3, \in4, \in5
        sub             \in4, \in4, \in5
        add             \in5, \in6, \in7
        sub             \in6, \in6, \in7
.endm

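// store the two d lanes of \in0-\in3 to x1 (stride x2) and x3 (stride \rx).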
.macro store16 in0, in1, in2, in3, rx
        st1             {\in0}[0], [x1], x2
        st1             {\in0}[1], [x3], \rx
        st1             {\in1}[0], [x1], x2
        st1             {\in1}[1], [x3], \rx
        st1             {\in2}[0], [x1], x2
        st1             {\in2}[1], [x3], \rx
        st1             {\in3}[0], [x1], x2
        st1             {\in3}[1], [x3], \rx
.endm

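// narrow the eight 32 bit vectors \in0-\in7 to the four 16 bit vectors
// \out0-\out3 with a saturating rounding shift right by \shift.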
.macro scale out0, out1, out2, out3, in0, in1, in2, in3, in4, in5, in6, in7, shift
        sqrshrn         \out0\().4h, \in0, \shift
        sqrshrn2        \out0\().8h, \in1, \shift
        sqrshrn         \out1\().4h, \in2, \shift
        sqrshrn2        \out1\().8h, \in3, \shift
        sqrshrn         \out2\().4h, \in4, \shift
        sqrshrn2        \out2\().8h, \in5, \shift
        sqrshrn         \out3\().4h, \in6, \shift
        sqrshrn2        \out3\().8h, \in7, \shift
.endm

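// transpose two 4x4 16 bit blocks kept in the low and high halves of
// \r0-\r3 (the high-half block with its rows in reverse order); clobbers v2-v7.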
.macro transpose16_4x4_2 r0, r1, r2, r3
        // lower halves
        trn1            v2.4h, \r0\().4h, \r1\().4h
        trn2            v3.4h, \r0\().4h, \r1\().4h
        trn1            v4.4h, \r2\().4h, \r3\().4h
        trn2            v5.4h, \r2\().4h, \r3\().4h
        trn1            v6.2s, v2.2s, v4.2s
        trn2            v7.2s, v2.2s, v4.2s
        trn1            v2.2s, v3.2s, v5.2s
        trn2            v4.2s, v3.2s, v5.2s
        mov             \r0\().d[0], v6.d[0]
        mov             \r2\().d[0], v7.d[0]
        mov             \r1\().d[0], v2.d[0]
        mov             \r3\().d[0], v4.d[0]

        // upper halves in reverse order
        trn1            v2.8h, \r3\().8h, \r2\().8h
        trn2            v3.8h, \r3\().8h, \r2\().8h
        trn1            v4.8h, \r1\().8h, \r0\().8h
        trn2            v5.8h, \r1\().8h, \r0\().8h
        trn1            v6.4s, v2.4s, v4.4s
        trn2            v7.4s, v2.4s, v4.4s
        trn1            v2.4s, v3.4s, v5.4s
        trn2            v4.4s, v3.4s, v5.4s
        mov             \r3\().d[1], v6.d[1]
        mov             \r1\().d[1], v7.d[1]
        mov             \r2\().d[1], v2.d[1]
        mov             \r0\().d[1], v4.d[1]
.endm

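// one 16x4 slice of the 16 point transform: x5 points to the input, x6 to the
// output; the even part is spilled to the stack at sp + \offset and the
// results are narrowed by \shift.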
.macro tr_16x4 name, shift, offset, step
function func_tr_16x4_\name
        mov              x1,  x5
        add              x3,  x5, #(\step * 64)
        mov              x2,  #(\step * 128)
        load16          v16.d, v17.d, v18.d, v19.d
        movrel           x1,  trans
        ld1             {v0.8h}, [x1]

        tr16_8x4        v16, v17, v18, v19, \offset

        add              x1,  x5, #(\step * 32)
        add              x3,  x5, #(\step * 3 *32)
        mov              x2,  #(\step * 128)
        load16          v20.d, v17.d, v18.d, v19.d
        movrel           x1, trans, 16
        ld1             {v1.8h}, [x1]
        smull           v21.4s, v20.4h, v1.h[0]
        smull           v22.4s, v20.4h, v1.h[1]
        smull           v23.4s, v20.4h, v1.h[2]
        smull           v24.4s, v20.4h, v1.h[3]
        smull           v25.4s, v20.4h, v1.h[4]
        smull           v26.4s, v20.4h, v1.h[5]
        smull           v27.4s, v20.4h, v1.h[6]
        smull           v28.4s, v20.4h, v1.h[7]

        add_member      v20.8h, v1.h[1], v1.h[4], v1.h[7], v1.h[5], v1.h[2], v1.h[0], v1.h[3], v1.h[6], +, +, +, -, -, -, -, -, 2
        add_member      v17.4h, v1.h[2], v1.h[7], v1.h[3], v1.h[1], v1.h[6], v1.h[4], v1.h[0], v1.h[5], +, +, -, -, -, +, +, +
        add_member      v17.8h, v1.h[3], v1.h[5], v1.h[1], v1.h[7], v1.h[0], v1.h[6], v1.h[2], v1.h[4], +, -, -, +, +, +, -, -, 2
        add_member      v18.4h, v1.h[4], v1.h[2], v1.h[6], v1.h[0], v1.h[7], v1.h[1], v1.h[5], v1.h[3], +, -, -, +, -, -, +, +
        add_member      v18.8h, v1.h[5], v1.h[0], v1.h[4], v1.h[6], v1.h[1], v1.h[3], v1.h[7], v1.h[2], +, -, +, +, -, +, +, -, 2
        add_member      v19.4h, v1.h[6], v1.h[3], v1.h[0], v1.h[2], v1.h[5], v1.h[7], v1.h[4], v1.h[1], +, -, +, -, +, +, -, +
        add_member      v19.8h, v1.h[7], v1.h[6], v1.h[5], v1.h[4], v1.h[3], v1.h[2], v1.h[1], v1.h[0], +, -, +, -, +, -, +, -, 2

        add              x4, sp, #\offset
        ld1             {v16.4s-v19.4s}, [x4], #64

        butterfly16     v16.4s, v21.4s, v17.4s, v22.4s, v18.4s, v23.4s, v19.4s, v24.4s
        scale           v29, v30, v31, v24, v20.4s, v16.4s, v21.4s, v17.4s, v22.4s, v18.4s, v23.4s, v19.4s, \shift
        transpose16_4x4_2 v29, v30, v31, v24
        mov              x1,  x6
        add              x3,  x6, #(24 +3*32)
        mov              x2, #32
        mov              x4, #-32
        store16         v29.d, v30.d, v31.d, v24.d, x4

        add             x4, sp, #(\offset + 64)
        ld1             {v16.4s-v19.4s}, [x4]
        butterfly16     v16.4s, v25.4s, v17.4s, v26.4s, v18.4s, v27.4s, v19.4s, v28.4s
        scale           v29, v30, v31, v20, v20.4s, v16.4s, v25.4s, v17.4s, v26.4s, v18.4s, v27.4s, v19.4s, \shift
        transpose16_4x4_2 v29, v30, v31, v20

        add              x1,  x6, #8
        add              x3,  x6, #(16 + 3 * 32)
        mov              x2, #32
        mov              x4, #-32
        store16         v29.d, v30.d, v31.d, v20.d, x4

        ret
endfunc
.endm

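// 16x16 IDCT: four 16x4 first-pass slices into a temporary buffer on the
// stack, then four second-pass slices back into the coefficient buffer in x0.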
.macro idct_16x16 bitdepth
function ff_hevc_idct_16x16_\bitdepth\()_neon, export=1
//x0 - coeffs
        mov             x15, x30

        // allocate a temp buffer
        sub              sp,  sp,  #640

.irp i, 0, 1, 2, 3
        add              x5,  x0, #(8 * \i)
        add              x6,  sp, #(8 * \i * 16)
        bl              func_tr_16x4_firstpass
.endr

.irp i, 0, 1, 2, 3
        add              x5,  sp, #(8 * \i)
        add              x6,  x0, #(8 * \i * 16)
        bl              func_tr_16x4_secondpass_\bitdepth
.endr

        add              sp,  sp,  #640

        mov             x30, x15
        ret
endfunc
.endm

idct_8x8 8
idct_8x8 10

tr_16x4 firstpass, 7, 512, 1
tr_16x4 secondpass_8, 20 - 8, 512, 1
tr_16x4 secondpass_10, 20 - 10, 512, 1

idct_16x16 8
idct_16x16 10

// void ff_hevc_idct_NxN_dc_DEPTH_neon(int16_t *coeffs)
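// fill the NxN block with the IDCT of a lone DC coefficient: replicate
// coeffs[0] and apply the two rounding shifts (#1, then #(14 - bitdepth)).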
.macro idct_dc size, bitdepth
function ff_hevc_idct_\size\()x\size\()_dc_\bitdepth\()_neon, export=1
        ld1r         {v4.8h}, [x0]
        srshr         v4.8h,  v4.8h,  #1
        srshr         v0.8h,  v4.8h,  #(14 - \bitdepth)
        srshr         v1.8h,  v4.8h,  #(14 - \bitdepth)
.if \size > 4
        srshr         v2.8h,  v4.8h,  #(14 - \bitdepth)
        srshr         v3.8h,  v4.8h,  #(14 - \bitdepth)
.if \size > 16 /* dc 32x32 */
        mov              x2,  #4
1:
        subs             x2,  x2, #1
.endif
        add             x12,  x0, #64
        mov             x13,  #128
.if \size > 8 /* dc 16x16 */
        st1            {v0.8h-v3.8h},  [x0], x13
        st1            {v0.8h-v3.8h}, [x12], x13
        st1            {v0.8h-v3.8h},  [x0], x13
        st1            {v0.8h-v3.8h}, [x12], x13
        st1            {v0.8h-v3.8h},  [x0], x13
        st1            {v0.8h-v3.8h}, [x12], x13
.endif /* dc 8x8 */
        st1            {v0.8h-v3.8h},  [x0], x13
        st1            {v0.8h-v3.8h}, [x12], x13
.if \size > 16 /* dc 32x32 */
        bne             1b
.endif
.else /* dc 4x4 */
        st1            {v0.8h-v1.8h},  [x0]
.endif
        ret
endfunc
.endm

idct_dc 4, 8
idct_dc 4, 10

idct_dc 8, 8
idct_dc 8, 10

idct_dc 16, 8
idct_dc 16, 10

idct_dc 32, 8
idct_dc 32, 10