/*
 * Copyright (c) 2016 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
#include "neon.S"

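// The tables below hold signed Q14 fixed point trigonometric constants;
// for the idct they are round(16384 * cos(k*pi/32)), e.g.
// 11585 = round(16384 / sqrt(2)) and 15137 = round(16384 * cos(pi/8)).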
const itxfm4_coeffs, align=4
        .short  11585, 0, 6270, 15137
iadst4_coeffs:
        .short  5283, 15212, 9929, 13377
endconst

const iadst8_coeffs, align=4
        .short  16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
idct_coeffs:
        .short  11585, 0, 6270, 15137, 3196, 16069, 13623, 9102
        .short  1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756
        .short  804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
        .short  3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
endconst

const iadst16_coeffs, align=4
        .short  16364, 804, 15893, 3981, 11003, 12140, 8423, 14053
        .short  14811, 7005, 13160, 9760, 5520, 15426, 2404, 16207
endconst

// out1 = ((in1 + in2) * v0[0] + (1 << 13)) >> 14
// out2 = ((in1 - in2) * v0[0] + (1 << 13)) >> 14
// in/out are .8h registers; this can do with 4 temp registers, but is
// more efficient if 6 temp registers are available.
.macro dmbutterfly0 out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, neg=0
.if \neg > 0
        neg             \tmp4\().4h, v0.4h
.endif
        add             \tmp1\().8h, \in1\().8h,  \in2\().8h
        sub             \tmp2\().8h, \in1\().8h,  \in2\().8h
.if \neg > 0
        smull           \tmp3\().4s, \tmp1\().4h, \tmp4\().h[0]
        smull2          \tmp4\().4s, \tmp1\().8h, \tmp4\().h[0]
.else
        smull           \tmp3\().4s, \tmp1\().4h, v0.h[0]
        smull2          \tmp4\().4s, \tmp1\().8h, v0.h[0]
.endif
.ifb \tmp5
        rshrn           \out1\().4h, \tmp3\().4s, #14
        rshrn2          \out1\().8h, \tmp4\().4s, #14
        smull           \tmp3\().4s, \tmp2\().4h, v0.h[0]
        smull2          \tmp4\().4s, \tmp2\().8h, v0.h[0]
        rshrn           \out2\().4h, \tmp3\().4s, #14
        rshrn2          \out2\().8h, \tmp4\().4s, #14
.else
        smull           \tmp5\().4s, \tmp2\().4h, v0.h[0]
        smull2          \tmp6\().4s, \tmp2\().8h, v0.h[0]
        rshrn           \out1\().4h, \tmp3\().4s, #14
        rshrn2          \out1\().8h, \tmp4\().4s, #14
        rshrn           \out2\().4h, \tmp5\().4s, #14
        rshrn2          \out2\().8h, \tmp6\().4s, #14
.endif
.endm

// Same as dmbutterfly0 above, but treating the input in in2 as zero,
// writing the same output into both out1 and out2.
.macro dmbutterfly0_h out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6
        smull           \tmp1\().4s,  \in1\().4h,  v0.h[0]
        smull2          \tmp2\().4s,  \in1\().8h,  v0.h[0]
        rshrn           \out1\().4h,  \tmp1\().4s, #14
        rshrn2          \out1\().8h,  \tmp2\().4s, #14
        rshrn           \out2\().4h,  \tmp1\().4s, #14
        rshrn2          \out2\().8h,  \tmp2\().4s, #14
.endm

// out1,out2 = in1 * coef1 - in2 * coef2
// out3,out4 = in1 * coef2 + in2 * coef1
// out are 4 x .4s registers, in are 2 x .8h registers
.macro dmbutterfly_l out1, out2, out3, out4, in1, in2, coef1, coef2
        smull           \out1\().4s, \in1\().4h, \coef1
        smull2          \out2\().4s, \in1\().8h, \coef1
        smull           \out3\().4s, \in1\().4h, \coef2
        smull2          \out4\().4s, \in1\().8h, \coef2
        smlsl           \out1\().4s, \in2\().4h, \coef2
        smlsl2          \out2\().4s, \in2\().8h, \coef2
        smlal           \out3\().4s, \in2\().4h, \coef1
        smlal2          \out4\().4s, \in2\().8h, \coef1
.endm

// inout1 = (inout1 * coef1 - inout2 * coef2 + (1 << 13)) >> 14
// inout2 = (inout1 * coef2 + inout2 * coef1 + (1 << 13)) >> 14
// inout are 2 x .8h registers
.macro dmbutterfly inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4, neg=0
        dmbutterfly_l   \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, \coef1, \coef2
.if \neg > 0
        neg             \tmp3\().4s, \tmp3\().4s
        neg             \tmp4\().4s, \tmp4\().4s
.endif
        rshrn           \inout1\().4h, \tmp1\().4s,  #14
        rshrn2          \inout1\().8h, \tmp2\().4s,  #14
        rshrn           \inout2\().4h, \tmp3\().4s,  #14
        rshrn2          \inout2\().8h, \tmp4\().4s,  #14
.endm

// Same as dmbutterfly above, but treating the input in inout2 as zero
.macro dmbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
        smull           \tmp1\().4s, \inout1\().4h, \coef1
        smull2          \tmp2\().4s, \inout1\().8h, \coef1
        smull           \tmp3\().4s, \inout1\().4h, \coef2
        smull2          \tmp4\().4s, \inout1\().8h, \coef2
        rshrn           \inout1\().4h, \tmp1\().4s, #14
        rshrn2          \inout1\().8h, \tmp2\().4s, #14
        rshrn           \inout2\().4h, \tmp3\().4s, #14
        rshrn2          \inout2\().8h, \tmp4\().4s, #14
.endm

// Same as dmbutterfly above, but treating the input in inout1 as zero
.macro dmbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
        smull           \tmp1\().4s, \inout2\().4h, \coef2
        smull2          \tmp2\().4s, \inout2\().8h, \coef2
        smull           \tmp3\().4s, \inout2\().4h, \coef1
        smull2          \tmp4\().4s, \inout2\().8h, \coef1
        neg             \tmp1\().4s, \tmp1\().4s
        neg             \tmp2\().4s, \tmp2\().4s
        rshrn           \inout2\().4h, \tmp3\().4s, #14
        rshrn2          \inout2\().8h, \tmp4\().4s, #14
        rshrn           \inout1\().4h, \tmp1\().4s, #14
        rshrn2          \inout1\().8h, \tmp2\().4s, #14
.endm

.macro dsmull_h out1, out2, in, coef
        smull           \out1\().4s, \in\().4h, \coef
        smull2          \out2\().4s, \in\().8h, \coef
.endm

.macro drshrn_h out, in1, in2, shift
        rshrn           \out\().4h, \in1\().4s, \shift
        rshrn2          \out\().8h, \in2\().4s, \shift
.endm


// out1 = in1 + in2
// out2 = in1 - in2
.macro butterfly_8h out1, out2, in1, in2
        add             \out1\().8h, \in1\().8h, \in2\().8h
        sub             \out2\().8h, \in1\().8h, \in2\().8h
.endm

// out1 = in1 - in2
// out2 = in1 + in2
.macro butterfly_8h_r out1, out2, in1, in2
        sub             \out1\().8h, \in1\().8h, \in2\().8h
        add             \out2\().8h, \in1\().8h, \in2\().8h
.endm

// out1 = (in1,in2 + in3,in4 + (1 << 13)) >> 14
// out2 = (in1,in2 - in3,in4 + (1 << 13)) >> 14
// out are 2 x .8h registers, in are 4 x .4s registers
.macro dbutterfly_n out1, out2, in1, in2, in3, in4, tmp1, tmp2, tmp3, tmp4
        add             \tmp1\().4s, \in1\().4s, \in3\().4s
        add             \tmp2\().4s, \in2\().4s, \in4\().4s
        sub             \tmp3\().4s, \in1\().4s, \in3\().4s
        sub             \tmp4\().4s, \in2\().4s, \in4\().4s
        rshrn           \out1\().4h, \tmp1\().4s,  #14
        rshrn2          \out1\().8h, \tmp2\().4s,  #14
        rshrn           \out2\().4h, \tmp3\().4s,  #14
        rshrn2          \out2\().8h, \tmp4\().4s,  #14
.endm

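// In-place inverse Walsh-Hadamard transform of one 4-element line each
// in c0-c3 (.4h registers), used for VP9 lossless mode; clobbers v16-v17.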
.macro iwht4 c0, c1, c2, c3
        add             \c0\().4h, \c0\().4h, \c1\().4h
        sub             v17.4h,    \c2\().4h, \c3\().4h
        sub             v16.4h,    \c0\().4h, v17.4h
        sshr            v16.4h,    v16.4h,    #1
        sub             \c2\().4h, v16.4h,    \c1\().4h
        sub             \c1\().4h, v16.4h,    \c3\().4h
        add             \c3\().4h, v17.4h,    \c2\().4h
        sub             \c0\().4h, \c0\().4h, \c1\().4h
.endm

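// In-place 4-point 1D idct of c0-c3 (.4h registers); expects the
// coefficients from itxfm4_coeffs in v0.4h and clobbers v16-v20 and v22.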
.macro idct4 c0, c1, c2, c3
        smull           v22.4s,    \c1\().4h, v0.h[3]
        smull           v20.4s,    \c1\().4h, v0.h[2]
        add             v16.4h,    \c0\().4h, \c2\().4h
        sub             v17.4h,    \c0\().4h, \c2\().4h
        smlal           v22.4s,    \c3\().4h, v0.h[2]
        smull           v18.4s,    v16.4h,    v0.h[0]
        smull           v19.4s,    v17.4h,    v0.h[0]
        smlsl           v20.4s,    \c3\().4h, v0.h[3]
        rshrn           v22.4h,    v22.4s,    #14
        rshrn           v18.4h,    v18.4s,    #14
        rshrn           v19.4h,    v19.4s,    #14
        rshrn           v20.4h,    v20.4s,    #14
        add             \c0\().4h, v18.4h,    v22.4h
        sub             \c3\().4h, v18.4h,    v22.4h
        add             \c1\().4h, v19.4h,    v20.4h
        sub             \c2\().4h, v19.4h,    v20.4h
.endm

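// In-place 4-point 1D iadst of c0-c3 (.4h registers); expects the
// coefficients from iadst4_coeffs in v0.h[4]-v0.h[7] and clobbers v16-v21.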
.macro iadst4 c0, c1, c2, c3
        smull           v16.4s,    \c0\().4h, v0.h[4]
        smlal           v16.4s,    \c2\().4h, v0.h[5]
        smlal           v16.4s,    \c3\().4h, v0.h[6]
        smull           v17.4s,    \c0\().4h, v0.h[6]
        smlsl           v17.4s,    \c2\().4h, v0.h[4]
        sub             \c0\().4h, \c0\().4h, \c2\().4h
        smlsl           v17.4s,    \c3\().4h, v0.h[5]
        add             \c0\().4h, \c0\().4h, \c3\().4h
        smull           v19.4s,    \c1\().4h, v0.h[7]
        smull           v18.4s,    \c0\().4h, v0.h[7]
        add             v20.4s,    v16.4s,    v19.4s
        add             v21.4s,    v17.4s,    v19.4s
        rshrn           \c0\().4h, v20.4s,    #14
        add             v16.4s,    v16.4s,    v17.4s
        rshrn           \c1\().4h, v21.4s,    #14
        sub             v16.4s,    v16.4s,    v19.4s
        rshrn           \c2\().4h, v18.4s,    #14
        rshrn           \c3\().4h, v16.4s,    #14
.endm

// The public functions in this file have the following signature:
// void itxfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
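// Roughly, in C terms (a reference sketch only; itxfm_2d stands in for
// the two 1D transform passes below and is not a real function):
//   int16_t out[n][n] = itxfm_2d(block);
//   dst[y*stride + x] = av_clip_uint8(dst[y*stride + x] +
//                                     ((out[y][x] + (1 << (shift - 1))) >> shift));
// with shift being 4, 5 or 6 for the 4x4, 8x8 and 16x16/32x32 sizes,
// and the consumed coefficients in block cleared to zero afterwards.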

.macro itxfm_func4x4 txfm1, txfm2
function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_neon, export=1
.ifc \txfm1,\txfm2
.ifc \txfm1,idct
        movrel          x4,  itxfm4_coeffs
        ld1             {v0.4h}, [x4]
.endif
.ifc \txfm1,iadst
        movrel          x4,  iadst4_coeffs
        ld1             {v0.d}[1], [x4]
.endif
.else
        movrel          x4,  itxfm4_coeffs
        ld1             {v0.8h}, [x4]
.endif

        movi            v31.8h, #0
.ifc \txfm1\()_\txfm2,idct_idct
        cmp             w3,  #1
        b.ne            1f
        // DC-only for idct/idct
        ld1             {v2.h}[0], [x2]
        smull           v2.4s,  v2.4h, v0.h[0]
        rshrn           v2.4h,  v2.4s, #14
        smull           v2.4s,  v2.4h, v0.h[0]
        rshrn           v2.4h,  v2.4s, #14
        st1             {v31.h}[0], [x2]
        dup             v4.4h,  v2.h[0]
        mov             v5.16b, v4.16b
        mov             v6.16b, v4.16b
        mov             v7.16b, v4.16b
        b               2f
.endif

1:
        ld1             {v4.4h,v5.4h,v6.4h,v7.4h},  [x2]
        st1             {v31.8h}, [x2], #16

.ifc \txfm1,iwht
        sshr            v4.4h,  v4.4h,  #2
        sshr            v5.4h,  v5.4h,  #2
        sshr            v6.4h,  v6.4h,  #2
        sshr            v7.4h,  v7.4h,  #2
.endif

        \txfm1\()4      v4,  v5,  v6,  v7

        st1             {v31.8h}, [x2], #16
        // Transpose 4x4 with 16 bit elements
        transpose_4x4H  v4,  v5,  v6,  v7,  v16, v17, v18, v19

        \txfm2\()4      v4,  v5,  v6,  v7
2:
        ld1             {v0.s}[0],   [x0], x1
        ld1             {v1.s}[0],   [x0], x1
.ifnc \txfm1,iwht
        srshr           v4.4h,  v4.4h,  #4
        srshr           v5.4h,  v5.4h,  #4
        srshr           v6.4h,  v6.4h,  #4
        srshr           v7.4h,  v7.4h,  #4
.endif
        uaddw           v4.8h,  v4.8h,  v0.8b
        uaddw           v5.8h,  v5.8h,  v1.8b
        ld1             {v2.s}[0],   [x0], x1
        ld1             {v3.s}[0],   [x0], x1
        sqxtun          v0.8b,  v4.8h
        sqxtun          v1.8b,  v5.8h
        sub             x0,  x0,  x1, lsl #2

        uaddw           v6.8h,  v6.8h,  v2.8b
        uaddw           v7.8h,  v7.8h,  v3.8b
        st1             {v0.s}[0],  [x0], x1
        sqxtun          v2.8b,  v6.8h
        sqxtun          v3.8b,  v7.8h

        st1             {v1.s}[0],  [x0], x1
        st1             {v2.s}[0],  [x0], x1
        st1             {v3.s}[0],  [x0], x1

        ret
endfunc
.endm

itxfm_func4x4 idct,  idct
itxfm_func4x4 iadst, idct
itxfm_func4x4 idct,  iadst
itxfm_func4x4 iadst, iadst
itxfm_func4x4 iwht,  iwht


.macro idct8
        dmbutterfly0    v16, v20, v16, v20, v2, v3, v4, v5, v6, v7 // v16 = t0a, v20 = t1a
        dmbutterfly     v18, v22, v0.h[2], v0.h[3], v2, v3, v4, v5 // v18 = t2a, v22 = t3a
        dmbutterfly     v17, v23, v0.h[4], v0.h[5], v2, v3, v4, v5 // v17 = t4a, v23 = t7a
        dmbutterfly     v21, v19, v0.h[6], v0.h[7], v2, v3, v4, v5 // v21 = t5a, v19 = t6a

        butterfly_8h    v24, v25, v16, v22 // v24 = t0, v25 = t3
        butterfly_8h    v28, v29, v17, v21 // v28 = t4, v29 = t5a
        butterfly_8h    v30, v31, v23, v19 // v30 = t7, v31 = t6a
        butterfly_8h    v26, v27, v20, v18 // v26 = t1, v27 = t2

        dmbutterfly0    v31, v29, v31, v29, v2, v3, v4, v5, v6, v7 // v31 = t6, v29 = t5

        butterfly_8h    v16, v23, v24, v30 // v16 = out[0], v23 = out[7]
        butterfly_8h    v17, v22, v26, v31 // v17 = out[1], v22 = out[6]
        butterfly_8h    v18, v21, v27, v29 // v18 = out[2], v21 = out[5]
        butterfly_8h    v19, v20, v25, v28 // v19 = out[3], v20 = out[4]
.endm

.macro iadst8
        dmbutterfly_l   v24, v25, v26, v27, v23, v16, v1.h[1], v1.h[0]   // v24,v25 = t1a, v26,v27 = t0a
        dmbutterfly_l   v28, v29, v30, v31, v21, v18, v1.h[3], v1.h[2]   // v28,v29 = t3a, v30,v31 = t2a
        dmbutterfly_l   v2,  v3,  v4,  v5,  v19, v20, v1.h[5], v1.h[4]   // v2,v3   = t5a, v4,v5   = t4a
        dmbutterfly_l   v16, v18, v21, v23, v17, v22, v1.h[7], v1.h[6]   // v16,v18 = t7a, v21,v23 = t6a

        dbutterfly_n    v4,  v5,  v26, v27, v4,  v5,  v6,  v7, v26, v27  // v4  = t0, v5  = t4
        dbutterfly_n    v2,  v3,  v24, v25, v2,  v3,  v6,  v7, v26, v27  // v2  = t1, v3  = t5
        dbutterfly_n    v24, v25, v30, v31, v21, v23, v6,  v7, v26, v27  // v24 = t2, v25 = t6
        dbutterfly_n    v30, v31, v28, v29, v16, v18, v6,  v7, v26, v27  // v30 = t3, v31 = t7

        butterfly_8h    v16, v6,  v4, v24 // v16 = out[0],  v6 = t2
        butterfly_8h    v23, v7,  v2, v30 // v23 = -out[7], v7 = t3
        neg             v23.8h,   v23.8h  // v23 = out[7]

        dmbutterfly0    v19, v20, v6, v7, v24, v26, v27, v28, v29, v30   // v19 = -out[3], v20 = out[4]
        neg             v19.8h,   v19.8h  // v19 = out[3]

        dmbutterfly_l   v26, v27, v28, v29, v5,  v3,  v0.h[2], v0.h[3]   // v26,v27 = t5a, v28,v29 = t4a
        dmbutterfly_l   v2,  v3,  v4,  v5,  v31, v25, v0.h[3], v0.h[2]   // v2,v3   = t6a, v4,v5   = t7a

        dbutterfly_n    v17, v30, v28, v29, v2,  v3,  v6,  v7,  v24, v25 // v17 = -out[1], v30 = t6
        dbutterfly_n    v22, v31, v26, v27, v4,  v5,  v6,  v7,  v24, v25 // v22 = out[6],  v31 = t7
        neg             v17.8h,   v17.8h  // v17 = out[1]

        dmbutterfly0    v18, v21, v30, v31, v2,  v3,  v4,  v5,  v6,  v7  // v18 = out[2], v21 = -out[5]
        neg             v21.8h,   v21.8h  // v21 = out[5]
.endm


.macro itxfm_func8x8 txfm1, txfm2
function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_neon, export=1
        // The iadst also uses a few coefficients from
        // idct, so those always need to be loaded.
.ifc \txfm1\()_\txfm2,idct_idct
        movrel          x4,  idct_coeffs
.else
        movrel          x4,  iadst8_coeffs
        ld1             {v1.8h}, [x4], #16
.endif
        ld1             {v0.8h}, [x4]

        movi            v2.8h, #0
        movi            v3.8h, #0
        movi            v4.8h, #0
        movi            v5.8h, #0

.ifc \txfm1\()_\txfm2,idct_idct
        cmp             w3,  #1
        b.ne            1f
        // DC-only for idct/idct
        ld1             {v2.h}[0],  [x2]
        smull           v2.4s,  v2.4h, v0.h[0]
        rshrn           v2.4h,  v2.4s, #14
        smull           v2.4s,  v2.4h, v0.h[0]
        rshrn           v2.4h,  v2.4s, #14
        st1             {v3.h}[0],  [x2]
        dup             v16.8h,  v2.h[0]
        mov             v17.16b, v16.16b
        mov             v18.16b, v16.16b
        mov             v19.16b, v16.16b
        mov             v20.16b, v16.16b
        mov             v21.16b, v16.16b
        mov             v22.16b, v16.16b
        mov             v23.16b, v16.16b
        b               2f
.endif
1:
        ld1             {v16.8h,v17.8h,v18.8h,v19.8h},  [x2], #64
        ld1             {v20.8h,v21.8h,v22.8h,v23.8h},  [x2], #64
        sub             x2,  x2,  #128
        st1             {v2.8h,v3.8h,v4.8h,v5.8h},      [x2], #64
        st1             {v2.8h,v3.8h,v4.8h,v5.8h},      [x2], #64

        \txfm1\()8

        // Transpose 8x8 with 16 bit elements
        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v24, v25

        \txfm2\()8
2:
        mov             x3,  x0
        // Add into the destination
        ld1             {v0.8b},  [x0], x1
        srshr           v16.8h, v16.8h, #5
        ld1             {v1.8b},  [x0], x1
        srshr           v17.8h, v17.8h, #5
        ld1             {v2.8b},  [x0], x1
        srshr           v18.8h, v18.8h, #5
        uaddw           v16.8h, v16.8h, v0.8b
        ld1             {v3.8b},  [x0], x1
        srshr           v19.8h, v19.8h, #5
        uaddw           v17.8h, v17.8h, v1.8b
        ld1             {v4.8b},  [x0], x1
        srshr           v20.8h, v20.8h, #5
        uaddw           v18.8h, v18.8h, v2.8b
        sqxtun          v0.8b,  v16.8h
        ld1             {v5.8b},  [x0], x1
        srshr           v21.8h, v21.8h, #5
        uaddw           v19.8h, v19.8h, v3.8b
        sqxtun          v1.8b,  v17.8h
        ld1             {v6.8b},  [x0], x1
        srshr           v22.8h, v22.8h, #5
        uaddw           v20.8h, v20.8h, v4.8b
        sqxtun          v2.8b,  v18.8h
        ld1             {v7.8b},  [x0], x1
        srshr           v23.8h, v23.8h, #5
        uaddw           v21.8h, v21.8h, v5.8b
        sqxtun          v3.8b,  v19.8h

        st1             {v0.8b},  [x3], x1
        uaddw           v22.8h, v22.8h, v6.8b
        st1             {v1.8b},  [x3], x1
        sqxtun          v4.8b,  v20.8h
        st1             {v2.8b},  [x3], x1
        uaddw           v23.8h, v23.8h, v7.8b
        st1             {v3.8b},  [x3], x1
        sqxtun          v5.8b,  v21.8h
        st1             {v4.8b},  [x3], x1
        sqxtun          v6.8b,  v22.8h
        st1             {v5.8b},  [x3], x1
        sqxtun          v7.8b,  v23.8h

        st1             {v6.8b},  [x3], x1
        st1             {v7.8b},  [x3], x1

        ret
endfunc
.endm

itxfm_func8x8 idct,  idct
itxfm_func8x8 iadst, idct
itxfm_func8x8 idct,  iadst
itxfm_func8x8 iadst, iadst


function idct16x16_dc_add_neon
        movrel          x4,  idct_coeffs
        ld1             {v0.4h}, [x4]

        movi            v1.4h,  #0

        ld1             {v2.h}[0], [x2]
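        // dc = (((in * 11585 + 8192) >> 14) * 11585 + 8192) >> 14,
        // i.e. the input DC scaled by cos(pi/4) in each of the two passes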
        smull           v2.4s,  v2.4h,  v0.h[0]
        rshrn           v2.4h,  v2.4s,  #14
        smull           v2.4s,  v2.4h,  v0.h[0]
        rshrn           v2.4h,  v2.4s,  #14
        dup             v2.8h,  v2.h[0]
        st1             {v1.h}[0], [x2]

        srshr           v2.8h,  v2.8h,  #6

        mov             x3,  x0
        mov             x4,  #16
1:
        // Loop to add the constant from v2 into all 16x16 outputs
        subs            x4,  x4,  #2
        ld1             {v3.16b},  [x0], x1
        ld1             {v4.16b},  [x0], x1
        uaddw           v16.8h, v2.8h,  v3.8b
        uaddw2          v17.8h, v2.8h,  v3.16b
        uaddw           v18.8h, v2.8h,  v4.8b
        uaddw2          v19.8h, v2.8h,  v4.16b
        sqxtun          v3.8b,  v16.8h
        sqxtun2         v3.16b, v17.8h
        sqxtun          v4.8b,  v18.8h
        sqxtun2         v4.16b, v19.8h
        st1             {v3.16b},  [x3], x1
        st1             {v4.16b},  [x3], x1
        b.ne            1b

        ret
endfunc

.macro idct16_end
        butterfly_8h    v18, v7,  v4,  v7                // v18 = t0a,  v7  = t7a
        butterfly_8h    v19, v22, v5,  v22               // v19 = t1a,  v22 = t6
        butterfly_8h    v4,  v26, v20, v26               // v4  = t2a,  v26 = t5
        butterfly_8h    v5,  v6,  v28, v6                // v5  = t3a,  v6  = t4
        butterfly_8h    v20, v28, v16, v24               // v20 = t8a,  v28 = t11a
        butterfly_8h    v24, v21, v23, v21               // v24 = t9,   v21 = t10
        butterfly_8h    v23, v27, v25, v27               // v23 = t14,  v27 = t13
        butterfly_8h    v25, v29, v29, v17               // v25 = t15a, v29 = t12a

        dmbutterfly0    v2,  v3,  v27, v21, v2,  v3,  v16, v17, v30, v31 // v2  = t13a, v3  = t10a
        dmbutterfly0    v28, v27, v29, v28, v21, v29, v16, v17, v30, v31 // v28 = t12,  v27 = t11

        butterfly_8h    v16, v31, v18, v25               // v16 = out[0], v31 = out[15]
        butterfly_8h    v17, v30, v19, v23               // v17 = out[1], v30 = out[14]
        butterfly_8h_r  v25, v22, v22, v24               // v25 = out[9], v22 = out[6]
        butterfly_8h    v23, v24, v7,  v20               // v23 = out[7], v24 = out[8]
        butterfly_8h    v18, v29, v4,  v2                // v18 = out[2], v29 = out[13]
        butterfly_8h    v19, v28, v5,  v28               // v19 = out[3], v28 = out[12]
        butterfly_8h    v20, v27, v6,  v27               // v20 = out[4], v27 = out[11]
        butterfly_8h    v21, v26, v26, v3                // v21 = out[5], v26 = out[10]
        ret
.endm

function idct16
        dmbutterfly0    v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a,  v24 = t1a
        dmbutterfly     v20, v28, v0.h[2], v0.h[3], v2, v3, v4, v5 // v20 = t2a,  v28 = t3a
        dmbutterfly     v18, v30, v0.h[4], v0.h[5], v2, v3, v4, v5 // v18 = t4a,  v30 = t7a
        dmbutterfly     v26, v22, v0.h[6], v0.h[7], v2, v3, v4, v5 // v26 = t5a,  v22 = t6a
        dmbutterfly     v17, v31, v1.h[0], v1.h[1], v2, v3, v4, v5 // v17 = t8a,  v31 = t15a
        dmbutterfly     v25, v23, v1.h[2], v1.h[3], v2, v3, v4, v5 // v25 = t9a,  v23 = t14a
        dmbutterfly     v21, v27, v1.h[4], v1.h[5], v2, v3, v4, v5 // v21 = t10a, v27 = t13a
        dmbutterfly     v29, v19, v1.h[6], v1.h[7], v2, v3, v4, v5 // v29 = t11a, v19 = t12a

        butterfly_8h    v4,  v28, v16, v28               // v4  = t0,   v28 = t3
        butterfly_8h    v5,  v20, v24, v20               // v5  = t1,   v20 = t2
        butterfly_8h    v6,  v26, v18, v26               // v6  = t4,   v26 = t5
        butterfly_8h    v7,  v22, v30, v22               // v7  = t7,   v22 = t6
        butterfly_8h    v16, v25, v17, v25               // v16 = t8,   v25 = t9
        butterfly_8h    v24, v21, v29, v21               // v24 = t11,  v21 = t10
        butterfly_8h    v17, v27, v19, v27               // v17 = t12,  v27 = t13
        butterfly_8h    v29, v23, v31, v23               // v29 = t15,  v23 = t14

        dmbutterfly0    v22, v26, v22, v26, v2, v3, v18, v19, v30, v31        // v22 = t6a,  v26 = t5a
        dmbutterfly     v23, v25, v0.h[2], v0.h[3], v18, v19, v30, v31        // v23 = t9a,  v25 = t14a
        dmbutterfly     v27, v21, v0.h[2], v0.h[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
        idct16_end
endfunc

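// Same as idct16 above, but assuming that only the first 8 inputs
// (v16-v23) are nonzero; the _h1/_h2 butterfly variants exploit the
// known-zero half.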
function idct16_half
        dmbutterfly0_h  v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a,  v24 = t1a
        dmbutterfly_h1  v20, v28, v0.h[2], v0.h[3], v2, v3, v4, v5 // v20 = t2a,  v28 = t3a
        dmbutterfly_h1  v18, v30, v0.h[4], v0.h[5], v2, v3, v4, v5 // v18 = t4a,  v30 = t7a
        dmbutterfly_h2  v26, v22, v0.h[6], v0.h[7], v2, v3, v4, v5 // v26 = t5a,  v22 = t6a
        dmbutterfly_h1  v17, v31, v1.h[0], v1.h[1], v2, v3, v4, v5 // v17 = t8a,  v31 = t15a
        dmbutterfly_h2  v25, v23, v1.h[2], v1.h[3], v2, v3, v4, v5 // v25 = t9a,  v23 = t14a
        dmbutterfly_h1  v21, v27, v1.h[4], v1.h[5], v2, v3, v4, v5 // v21 = t10a, v27 = t13a
        dmbutterfly_h2  v29, v19, v1.h[6], v1.h[7], v2, v3, v4, v5 // v29 = t11a, v19 = t12a

        butterfly_8h    v4,  v28, v16, v28               // v4  = t0,   v28 = t3
        butterfly_8h    v5,  v20, v24, v20               // v5  = t1,   v20 = t2
        butterfly_8h    v6,  v26, v18, v26               // v6  = t4,   v26 = t5
        butterfly_8h    v7,  v22, v30, v22               // v7  = t7,   v22 = t6
        butterfly_8h    v16, v25, v17, v25               // v16 = t8,   v25 = t9
        butterfly_8h    v24, v21, v29, v21               // v24 = t11,  v21 = t10
        butterfly_8h    v17, v27, v19, v27               // v17 = t12,  v27 = t13
        butterfly_8h    v29, v23, v31, v23               // v29 = t15,  v23 = t14

        dmbutterfly0    v22, v26, v22, v26, v2, v3, v18, v19, v30, v31        // v22 = t6a,  v26 = t5a
        dmbutterfly     v23, v25, v0.h[2], v0.h[3], v18, v19, v30, v31        // v23 = t9a,  v25 = t14a
        dmbutterfly     v27, v21, v0.h[2], v0.h[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
        idct16_end
endfunc

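// Same as idct16 above, but assuming that only the first 4 inputs
// (v16-v19) are nonzero, reducing the first butterfly stages to
// plain multiplications.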
function idct16_quarter
        dsmull_h        v24, v25, v19, v1.h[7]
        dsmull_h        v4,  v5,  v17, v1.h[0]
        dsmull_h        v7,  v6,  v18, v0.h[5]
        dsmull_h        v30, v31, v18, v0.h[4]
        neg             v24.4s,  v24.4s
        neg             v25.4s,  v25.4s
        dsmull_h        v29, v28, v17, v1.h[1]
        dsmull_h        v26, v27, v19, v1.h[6]
        dsmull_h        v22, v23, v16, v0.h[0]
        drshrn_h        v24, v24, v25, #14
        drshrn_h        v16, v4,  v5,  #14
        drshrn_h        v7,  v7,  v6,  #14
        drshrn_h        v6,  v30, v31, #14
        drshrn_h        v29, v29, v28, #14
        drshrn_h        v17, v26, v27, #14
        drshrn_h        v28, v22, v23, #14

        dmbutterfly_l   v20, v21, v22, v23, v17, v24, v0.h[2], v0.h[3]
        dmbutterfly_l   v18, v19, v30, v31, v29, v16, v0.h[2], v0.h[3]
        neg             v22.4s,  v22.4s
        neg             v23.4s,  v23.4s
        drshrn_h        v27, v20, v21, #14
        drshrn_h        v21, v22, v23, #14
        drshrn_h        v23, v18, v19, #14
        drshrn_h        v25, v30, v31, #14
        mov             v4.16b,  v28.16b
        mov             v5.16b,  v28.16b
        dmbutterfly0    v22, v26, v7,  v6,  v18, v19, v30, v31
        mov             v20.16b, v28.16b
        idct16_end
endfunc

function iadst16
        ld1             {v0.8h,v1.8h}, [x11]

        dmbutterfly_l   v6,  v7,  v4,  v5,  v31, v16, v0.h[1], v0.h[0]   // v6,v7   = t1,   v4,v5   = t0
        dmbutterfly_l   v10, v11, v8,  v9,  v23, v24, v0.h[5], v0.h[4]   // v10,v11 = t9,   v8,v9   = t8
        dbutterfly_n    v31, v24, v6,  v7,  v10, v11, v12, v13, v10, v11 // v31     = t1a,  v24     = t9a
        dmbutterfly_l   v14, v15, v12, v13, v29, v18, v0.h[3], v0.h[2]   // v14,v15 = t3,   v12,v13 = t2
        dbutterfly_n    v16, v23, v4,  v5,  v8,  v9,  v6,  v7,  v8,  v9  // v16     = t0a,  v23     = t8a

        dmbutterfly_l   v6,  v7,  v4,  v5,  v21, v26, v0.h[7], v0.h[6]   // v6,v7   = t11,  v4,v5   = t10
        dbutterfly_n    v29, v26, v14, v15, v6,  v7,  v8,  v9,  v6,  v7  // v29     = t3a,  v26     = t11a
        dmbutterfly_l   v10, v11, v8,  v9,  v27, v20, v1.h[1], v1.h[0]   // v10,v11 = t5,   v8,v9   = t4
        dbutterfly_n    v18, v21, v12, v13, v4,  v5,  v6,  v7,  v4,  v5  // v18     = t2a,  v21     = t10a

        dmbutterfly_l   v14, v15, v12, v13, v19, v28, v1.h[5], v1.h[4]   // v14,v15 = t13,  v12,v13 = t12
        dbutterfly_n    v20, v28, v10, v11, v14, v15, v4,  v5,  v14, v15 // v20     = t5a,  v28     = t13a
        dmbutterfly_l   v6,  v7,  v4,  v5,  v25, v22, v1.h[3], v1.h[2]   // v6,v7   = t7,   v4,v5   = t6
        dbutterfly_n    v27, v19, v8,  v9,  v12, v13, v10, v11, v12, v13 // v27     = t4a,  v19     = t12a

        dmbutterfly_l   v10, v11, v8,  v9,  v17, v30, v1.h[7], v1.h[6]   // v10,v11 = t15,  v8,v9   = t14
        ld1             {v0.8h}, [x10]
        dbutterfly_n    v22, v30, v6,  v7,  v10, v11, v12, v13, v10, v11 // v22     = t7a,  v30     = t15a
        dmbutterfly_l   v14, v15, v12, v13, v23, v24, v0.h[4], v0.h[5]   // v14,v15 = t9,   v12,v13 = t8
        dbutterfly_n    v25, v17, v4,  v5,  v8,  v9,  v6,  v7,  v8,  v9  // v25     = t6a,  v17     = t14a

        dmbutterfly_l   v4,  v5,  v6,  v7,  v28, v19, v0.h[5], v0.h[4]   // v4,v5   = t12,  v6,v7   = t13
        dbutterfly_n    v23, v19, v12, v13, v4,  v5,  v8,  v9,  v4,  v5  // v23     = t8a,  v19     = t12a
        dmbutterfly_l   v10, v11, v8,  v9,  v21, v26, v0.h[6], v0.h[7]   // v10,v11 = t11,  v8,v9   = t10
        butterfly_8h_r  v4,  v27, v16, v27               // v4  = t4,   v27 = t0
        dbutterfly_n    v24, v28, v14, v15, v6,  v7,  v12, v13, v6,  v7  // v24     = t9a,  v28     = t13a

        dmbutterfly_l   v12, v13, v14, v15, v30, v17, v0.h[7], v0.h[6]   // v12,v13 = t14,  v14,v15 = t15
        butterfly_8h_r  v5,  v20, v31, v20               // v5  = t5,   v20 = t1
        dbutterfly_n    v21, v17, v8,  v9,  v12, v13, v6,  v7,  v12, v13 // v21     = t10a, v17     = t14a
        dbutterfly_n    v26, v30, v10, v11, v14, v15, v8,  v9,  v14, v15 // v26     = t11a, v30     = t15a

        butterfly_8h_r  v6,  v25, v18, v25               // v6  = t6,   v25 = t2
        butterfly_8h_r  v7,  v22, v29, v22               // v7  = t7,   v22 = t3

        dmbutterfly_l   v10, v11, v8,  v9,  v19, v28, v0.h[2], v0.h[3]   // v10,v11 = t13,  v8,v9   = t12
        dmbutterfly_l   v12, v13, v14, v15, v30, v17, v0.h[3], v0.h[2]   // v12,v13 = t14,  v14,v15 = t15

        dbutterfly_n    v18, v30, v8,  v9,  v12, v13, v16, v17, v12, v13 // v18   = out[2], v30     = t14a
        dbutterfly_n    v29, v17, v10, v11, v14, v15, v12, v13, v14, v15 // v29 = -out[13], v17     = t15a
        neg             v29.8h, v29.8h                   // v29 = out[13]

        dmbutterfly_l   v10, v11, v8,  v9,  v4,  v5,  v0.h[2], v0.h[3]   // v10,v11 = t5a,  v8,v9   = t4a
        dmbutterfly_l   v12, v13, v14, v15, v7,  v6,  v0.h[3], v0.h[2]   // v12,v13 = t6a,  v14,v15 = t7a

        butterfly_8h    v2,  v6,  v27, v25               // v2 = out[0],  v6 = t2a
        butterfly_8h    v3,  v7,  v23, v21               // v3 = -out[1], v7 = t10

        dbutterfly_n    v19, v31, v8,  v9,  v12, v13, v4,  v5,  v8,  v9  // v19 = -out[3],  v31 = t6
        neg             v19.8h, v19.8h                   // v19 = out[3]
        dbutterfly_n    v28, v16, v10, v11, v14, v15, v4,  v5,  v10, v11 // v28 = out[12],  v16 = t7

        butterfly_8h    v5,  v8,  v20, v22               // v5 = -out[15], v8 = t3a
        butterfly_8h    v4,  v9,  v24, v26               // v4 = out[14],  v9 = t11

        dmbutterfly0    v23, v24, v6,  v8,  v10, v11, v12, v13, v14, v15, 1 // v23 = out[7], v24 = out[8]
        dmbutterfly0    v21, v26, v30, v17, v10, v11, v12, v13, v14, v15, 1 // v21 = out[5], v26 = out[10]
        dmbutterfly0    v20, v27, v16, v31, v10, v11, v12, v13, v14, v15    // v20 = out[4], v27 = out[11]
        dmbutterfly0    v22, v25, v9,  v7,  v10, v11, v12, v13, v14, v15    // v22 = out[6], v25 = out[9]

        neg             v31.8h,  v5.8h                    // v31 = out[15]
        neg             v17.8h,  v3.8h                    // v17 = out[1]

        mov             v16.16b, v2.16b
        mov             v30.16b, v4.16b
        ret
endfunc

// Helper macros; we can't use these expressions directly within
// e.g. .irp due to the extra concatenation \(). Therefore wrap
// them in macros to allow using .irp below.
.macro load i, src, inc
        ld1             {v\i\().8h},  [\src], \inc
.endm
.macro store i, dst, inc
        st1             {v\i\().8h},  [\dst], \inc
.endm
.macro movi_v i, size, imm
        movi            v\i\()\size,  \imm
.endm
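// Load one .8h register, then clear the loaded memory by storing
// the zero register v2 (set up by the caller) back to it.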
.macro load_clear i, src, inc
        ld1             {v\i\().8h}, [\src]
        st1             {v2.8h},  [\src], \inc
.endm

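// Round (srshr by 6), add and store eight rows of coefficients
// (coef0-coef7, .8h registers) into the destination; x0 and x3 point
// at interleaved rows, each stepping by x1 (2 * dst stride, set up
// by the caller). tmp1/tmp2 are .8b scratch registers.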
.macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7, tmp1, tmp2
        srshr           \coef0, \coef0, #6
        ld1             {v2.8b},  [x0], x1
        srshr           \coef1, \coef1, #6
        ld1             {v3.8b},  [x3], x1
        srshr           \coef2, \coef2, #6
        ld1             {v4.8b},  [x0], x1
        srshr           \coef3, \coef3, #6
        uaddw           \coef0, \coef0, v2.8b
        ld1             {v5.8b},  [x3], x1
        uaddw           \coef1, \coef1, v3.8b
        srshr           \coef4, \coef4, #6
        ld1             {v6.8b},  [x0], x1
        srshr           \coef5, \coef5, #6
        ld1             {v7.8b},  [x3], x1
        sqxtun          v2.8b,  \coef0
        srshr           \coef6, \coef6, #6
        sqxtun          v3.8b,  \coef1
        srshr           \coef7, \coef7, #6
        uaddw           \coef2, \coef2, v4.8b
        ld1             {\tmp1},  [x0], x1
        uaddw           \coef3, \coef3, v5.8b
        ld1             {\tmp2},  [x3], x1
        sqxtun          v4.8b,  \coef2
        sub             x0,  x0,  x1, lsl #2
        sub             x3,  x3,  x1, lsl #2
        sqxtun          v5.8b,  \coef3
        uaddw           \coef4, \coef4, v6.8b
        st1             {v2.8b},  [x0], x1
        uaddw           \coef5, \coef5, v7.8b
        st1             {v3.8b},  [x3], x1
        sqxtun          v6.8b,  \coef4
        st1             {v4.8b},  [x0], x1
        sqxtun          v7.8b,  \coef5
        st1             {v5.8b},  [x3], x1
        uaddw           \coef6, \coef6, \tmp1
        st1             {v6.8b},  [x0], x1
        uaddw           \coef7, \coef7, \tmp2
        st1             {v7.8b},  [x3], x1
        sqxtun          \tmp1,  \coef6
        sqxtun          \tmp2,  \coef7
        st1             {\tmp1},  [x0], x1
        st1             {\tmp2},  [x3], x1
.endm

// Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
// transpose into a horizontal 16x8 slice and store.
// x0 = dst (temp buffer)
// x1 = slice offset
// x2 = src
// x9 = input stride
.macro itxfm16_1d_funcs txfm
function \txfm\()16_1d_8x16_pass1_neon
        mov             x14, x30

        movi            v2.8h, #0
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        load_clear      \i,  x2,  x9
.endr

        bl              \txfm\()16

        // Do two 8x8 transposes. Originally, v16-v31 contain the
        // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
        // transposed 8x8 blocks.
        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3

        // Store the transposed 8x8 blocks horizontally.
        cmp             x1,  #8
        b.eq            1f
.irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
        store           \i,  x0,  #16
.endr
        br              x14
1:
        // Special case: For the last input column (x1 == 8),
        // which would be stored as the last row in the temp buffer,
        // don't store the first 8x8 block, but keep it in registers
        // for the first slice of the second pass (where it is the
        // last 8x8 block).
.irp i, 24, 25, 26, 27, 28, 29, 30, 31
        add             x0,  x0,  #16
        store           \i,  x0,  #16
.endr
        mov             v24.16b, v16.16b
        mov             v25.16b, v17.16b
        mov             v26.16b, v18.16b
        mov             v27.16b, v19.16b
        mov             v28.16b, v20.16b
        mov             v29.16b, v21.16b
        mov             v30.16b, v22.16b
        mov             v31.16b, v23.16b
        br              x14
endfunc

// Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
// load the destination pixels (from a similar 8x16 slice), add and store back.
// x0 = dst
// x1 = dst stride
// x2 = src (temp buffer)
// x3 = slice offset
// x9 = temp buffer stride
function \txfm\()16_1d_8x16_pass2_neon
        mov             x14, x30
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        load            \i,  x2,  x9
.endr
        cbz             x3,  1f
.irp i, 24, 25, 26, 27, 28, 29, 30, 31
        load            \i,  x2,  x9
.endr
1:

        add             x3,  x0,  x1
        lsl             x1,  x1,  #1
        bl              \txfm\()16

        load_add_store  v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
        load_add_store  v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b

        br              x14
endfunc
.endm

itxfm16_1d_funcs idct
itxfm16_1d_funcs iadst

.macro itxfm_func16x16 txfm1, txfm2
function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
.ifc \txfm1\()_\txfm2,idct_idct
        cmp             w3,  #1
        b.eq            idct16x16_dc_add_neon
.endif
        mov             x15, x30
        // iadst16 requires clobbering v8-v15, but idct16 doesn't need to.
.ifnc \txfm1\()_\txfm2,idct_idct
        stp             d14, d15, [sp, #-0x10]!
        stp             d12, d13, [sp, #-0x10]!
        stp             d10, d11, [sp, #-0x10]!
        stp             d8,  d9,  [sp, #-0x10]!
.endif

        sub             sp,  sp,  #512

        mov             x4,  x0
        mov             x5,  x1
        mov             x6,  x2

        movrel          x10, idct_coeffs
.ifnc \txfm1\()_\txfm2,idct_idct
        movrel          x11, iadst16_coeffs
.endif
.ifc \txfm1,idct
        ld1             {v0.8h,v1.8h}, [x10]
.endif
        mov             x9,  #32

.ifc \txfm1\()_\txfm2,idct_idct
        cmp             w3,  #10
        b.le            idct16x16_quarter_add_neon
        cmp             w3,  #38
        b.le            idct16x16_half_add_neon
.endif

.irp i, 0, 8
        add             x0,  sp,  #(\i*32)
.ifc \txfm1\()_\txfm2,idct_idct
.if \i == 8
        cmp             w3,  #38
        b.le            1f
.endif
.endif
        mov             x1,  #\i
        add             x2,  x6,  #(\i*2)
        bl              \txfm1\()16_1d_8x16_pass1_neon
.endr
.ifc \txfm1\()_\txfm2,iadst_idct
        ld1             {v0.8h,v1.8h}, [x10]
.endif

.ifc \txfm1\()_\txfm2,idct_idct
        b               3f
1:
        // Set v24-v31 to zero, for the in-register passthrough of
        // coefficients to pass 2. Since we only do two slices, this can
        // only ever happen for the second slice. So we only need to store
        // zeros to the temp buffer for the second half of the buffer.
        // Move x0 to the second half, and use x9 == 32 as increment.
        add             x0,  x0,  #16
.irp i, 24, 25, 26, 27, 28, 29, 30, 31
        movi_v          \i,  .16b, #0
        st1             {v24.8h},  [x0], x9
.endr
3:
.endif

.irp i, 0, 8
        add             x0,  x4,  #(\i)
        mov             x1,  x5
        add             x2,  sp,  #(\i*2)
        mov             x3,  #\i
        bl              \txfm2\()16_1d_8x16_pass2_neon
.endr

        add             sp,  sp,  #512
.ifnc \txfm1\()_\txfm2,idct_idct
        ldp             d8,  d9,  [sp], 0x10
        ldp             d10, d11, [sp], 0x10
        ldp             d12, d13, [sp], 0x10
        ldp             d14, d15, [sp], 0x10
.endif
        br              x15
endfunc
.endm

itxfm_func16x16 idct,  idct
itxfm_func16x16 iadst, idct
itxfm_func16x16 idct,  iadst
itxfm_func16x16 iadst, iadst

function idct16_1d_8x16_pass1_quarter_neon
        mov             x14, x30
        movi            v2.8h, #0
.irp i, 16, 17, 18, 19
        load_clear      \i,  x2,  x9
.endr

        bl              idct16_quarter

        // Do two 8x8 transposes. Originally, v16-v31 contain the
        // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
        // transposed 8x8 blocks.
        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3

        // Store the transposed 8x8 blocks horizontally.
        // The first 8x8 block is kept in registers for the second pass,
        // store the rest in the temp buffer.
        // Since only a 4x4 part of the input was nonzero, this means that
        // only 4 rows are nonzero after transposing, and the second pass
        // only reads the topmost 4 rows. Therefore only store the topmost
        // 4 rows.
        add             x0,  x0,  #16
.irp i, 24, 25, 26, 27
        store           \i,  x0,  x9
.endr
        br              x14
endfunc

function idct16_1d_8x16_pass2_quarter_neon
        mov             x14, x30
        cbz             x3,  1f
.irp i, 16, 17, 18, 19
        load            \i,  x2,  x9
.endr
1:

        add             x3,  x0,  x1
        lsl             x1,  x1,  #1
        bl              idct16_quarter

        load_add_store  v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
        load_add_store  v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b

        br              x14
endfunc

function idct16_1d_8x16_pass1_half_neon
        mov             x14, x30
        movi            v2.8h, #0
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        load_clear      \i,  x2,  x9
.endr

        bl              idct16_half

        // Do two 8x8 transposes. Originally, v16-v31 contain the
        // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
        // transposed 8x8 blocks.
        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3

        // Store the transposed 8x8 blocks horizontally.
        // The first 8x8 block is kept in registers for the second pass,
        // store the rest in the temp buffer.
        add             x0,  x0,  #16
.irp i, 24, 25, 26, 27, 28, 29, 30, 31
        store           \i,  x0,  x9
.endr
        br              x14
endfunc

function idct16_1d_8x16_pass2_half_neon
        mov             x14, x30
        cbz             x3,  1f
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        load            \i,  x2,  x9
.endr
1:

        add             x3,  x0,  x1
        lsl             x1,  x1,  #1
        bl              idct16_half

        load_add_store  v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
        load_add_store  v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b

        br              x14
endfunc

.macro idct16_partial size
function idct16x16_\size\()_add_neon
        add             x0,  sp,  #(0*32)
        add             x2,  x6,  #(0*2)
        bl              idct16_1d_8x16_pass1_\size\()_neon
.irp i, 0, 8
        add             x0,  x4,  #(\i)
        mov             x1,  x5
        add             x2,  sp,  #(\i*2)
        mov             x3,  #\i
        bl              idct16_1d_8x16_pass2_\size\()_neon
.endr

        add             sp,  sp,  #512
        br              x15
endfunc
.endm

idct16_partial quarter
idct16_partial half

function idct32x32_dc_add_neon
        movrel          x4,  idct_coeffs
        ld1             {v0.4h}, [x4]

        movi            v1.4h,  #0

        ld1             {v2.h}[0], [x2]
        smull           v2.4s,  v2.4h,  v0.h[0]
        rshrn           v2.4h,  v2.4s,  #14
        smull           v2.4s,  v2.4h,  v0.h[0]
        rshrn           v2.4h,  v2.4s,  #14
        dup             v2.8h,  v2.h[0]
        st1             {v1.h}[0], [x2]

        srshr           v0.8h,  v2.8h,  #6

        mov             x3,  x0
        mov             x4,  #32
1:
        // Loop to add the constant v0 into all 32x32 outputs
        subs            x4,  x4,  #2
        ld1             {v1.16b,v2.16b},  [x0], x1
        uaddw           v16.8h, v0.8h,  v1.8b
        uaddw2          v17.8h, v0.8h,  v1.16b
        ld1             {v3.16b,v4.16b},  [x0], x1
        uaddw           v18.8h, v0.8h,  v2.8b
        uaddw2          v19.8h, v0.8h,  v2.16b
        uaddw           v20.8h, v0.8h,  v3.8b
        uaddw2          v21.8h, v0.8h,  v3.16b
        uaddw           v22.8h, v0.8h,  v4.8b
        uaddw2          v23.8h, v0.8h,  v4.16b
        sqxtun          v1.8b,  v16.8h
        sqxtun2         v1.16b, v17.8h
        sqxtun          v2.8b,  v18.8h
        sqxtun2         v2.16b, v19.8h
        sqxtun          v3.8b,  v20.8h
        sqxtun2         v3.16b, v21.8h
        st1             {v1.16b,v2.16b},  [x3], x1
        sqxtun          v4.8b,  v22.8h
        sqxtun2         v4.16b, v23.8h
        st1             {v3.16b,v4.16b},  [x3], x1
        b.ne            1b

        ret
endfunc

.macro idct32_end
        butterfly_8h    v16, v5,  v4,  v5  // v16 = t16a, v5  = t19a
        butterfly_8h    v17, v20, v23, v20 // v17 = t17,  v20 = t18
        butterfly_8h    v18, v6,  v7,  v6  // v18 = t23a, v6  = t20a
        butterfly_8h    v19, v21, v22, v21 // v19 = t22,  v21 = t21
        butterfly_8h    v4,  v28, v28, v30 // v4  = t24a, v28 = t27a
        butterfly_8h    v23, v26, v25, v26 // v23 = t25,  v26 = t26
        butterfly_8h    v7,  v3,  v29, v31 // v7  = t31a, v3  = t28a
        butterfly_8h    v22, v27, v24, v27 // v22 = t30,  v27 = t29

        dmbutterfly     v27, v20, v0.h[2], v0.h[3], v24, v25, v30, v31        // v27 = t18a, v20 = t29a
        dmbutterfly     v3,  v5,  v0.h[2], v0.h[3], v24, v25, v30, v31        // v3  = t19,  v5  = t28
        dmbutterfly     v28, v6,  v0.h[2], v0.h[3], v24, v25, v30, v31, neg=1 // v28 = t27,  v6  = t20
        dmbutterfly     v26, v21, v0.h[2], v0.h[3], v24, v25, v30, v31, neg=1 // v26 = t26a, v21 = t21a

        butterfly_8h    v31, v24, v7,  v4  // v31 = t31,  v24 = t24
        butterfly_8h    v30, v25, v22, v23 // v30 = t30a, v25 = t25a
        butterfly_8h_r  v23, v16, v16, v18 // v23 = t23,  v16 = t16
        butterfly_8h_r  v22, v17, v17, v19 // v22 = t22a, v17 = t17a
        butterfly_8h    v18, v21, v27, v21 // v18 = t18,  v21 = t21
        butterfly_8h_r  v27, v28, v5,  v28 // v27 = t27a, v28 = t28a
        butterfly_8h    v29, v26, v20, v26 // v29 = t29,  v26 = t26
        butterfly_8h    v19, v20, v3,  v6  // v19 = t19a, v20 = t20

        dmbutterfly0    v27, v20, v27, v20, v2, v3, v4, v5, v6, v7 // v27 = t27,  v20 = t20
        dmbutterfly0    v26, v21, v26, v21, v2, v3, v4, v5, v6, v7 // v26 = t26a, v21 = t21a
        dmbutterfly0    v25, v22, v25, v22, v2, v3, v4, v5, v6, v7 // v25 = t25,  v22 = t22
        dmbutterfly0    v24, v23, v24, v23, v2, v3, v4, v5, v6, v7 // v24 = t24a, v23 = t23a
        ret
.endm

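// 16-point IDCT of the odd inputs of a 32-point IDCT, operating
// in-place on v16-v31. Expects the regular idct coefficients in v0
// and the odd-input coefficients in v8-v9, set up by the caller.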
function idct32_odd
        dmbutterfly     v16, v31, v8.h[0], v8.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
        dmbutterfly     v24, v23, v8.h[2], v8.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
        dmbutterfly     v20, v27, v8.h[4], v8.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
        dmbutterfly     v28, v19, v8.h[6], v8.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
        dmbutterfly     v18, v29, v9.h[0], v9.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
        dmbutterfly     v26, v21, v9.h[2], v9.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
        dmbutterfly     v22, v25, v9.h[4], v9.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
        dmbutterfly     v30, v17, v9.h[6], v9.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a

        butterfly_8h    v4,  v24, v16, v24 // v4  = t16, v24 = t17
        butterfly_8h    v5,  v20, v28, v20 // v5  = t19, v20 = t18
        butterfly_8h    v6,  v26, v18, v26 // v6  = t20, v26 = t21
        butterfly_8h    v7,  v22, v30, v22 // v7  = t23, v22 = t22
        butterfly_8h    v28, v25, v17, v25 // v28 = t24, v25 = t25
        butterfly_8h    v30, v21, v29, v21 // v30 = t27, v21 = t26
        butterfly_8h    v29, v23, v31, v23 // v29 = t31, v23 = t30
        butterfly_8h    v31, v27, v19, v27 // v31 = t28, v27 = t29

        dmbutterfly     v23, v24, v0.h[4], v0.h[5], v16, v17, v18, v19        // v23 = t17a, v24 = t30a
        dmbutterfly     v27, v20, v0.h[4], v0.h[5], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
        dmbutterfly     v21, v26, v0.h[6], v0.h[7], v16, v17, v18, v19        // v21 = t21a, v26 = t26a
        dmbutterfly     v25, v22, v0.h[6], v0.h[7], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
        idct32_end
endfunc

function idct32_odd_half
        dmbutterfly_h1  v16, v31, v8.h[0], v8.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
        dmbutterfly_h2  v24, v23, v8.h[2], v8.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
        dmbutterfly_h1  v20, v27, v8.h[4], v8.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
        dmbutterfly_h2  v28, v19, v8.h[6], v8.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
        dmbutterfly_h1  v18, v29, v9.h[0], v9.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
        dmbutterfly_h2  v26, v21, v9.h[2], v9.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
        dmbutterfly_h1  v22, v25, v9.h[4], v9.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
        dmbutterfly_h2  v30, v17, v9.h[6], v9.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a

        butterfly_8h    v4,  v24, v16, v24 // v4  = t16, v24 = t17
        butterfly_8h    v5,  v20, v28, v20 // v5  = t19, v20 = t18
        butterfly_8h    v6,  v26, v18, v26 // v6  = t20, v26 = t21
        butterfly_8h    v7,  v22, v30, v22 // v7  = t23, v22 = t22
        butterfly_8h    v28, v25, v17, v25 // v28 = t24, v25 = t25
        butterfly_8h    v30, v21, v29, v21 // v30 = t27, v21 = t26
        butterfly_8h    v29, v23, v31, v23 // v29 = t31, v23 = t30
        butterfly_8h    v31, v27, v19, v27 // v31 = t28, v27 = t29

        dmbutterfly     v23, v24, v0.h[4], v0.h[5], v16, v17, v18, v19        // v23 = t17a, v24 = t30a
        dmbutterfly     v27, v20, v0.h[4], v0.h[5], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
        dmbutterfly     v21, v26, v0.h[6], v0.h[7], v16, v17, v18, v19        // v21 = t21a, v26 = t26a
        dmbutterfly     v25, v22, v0.h[6], v0.h[7], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
        idct32_end
endfunc

function idct32_odd_quarter
        dsmull_h        v4,  v5,  v16, v8.h[0]
        dsmull_h        v28, v29, v19, v8.h[7]
        dsmull_h        v30, v31, v16, v8.h[1]
        dsmull_h        v22, v23, v17, v9.h[6]
        dsmull_h        v7,  v6,  v17, v9.h[7]
        dsmull_h        v26, v27, v19, v8.h[6]
        dsmull_h        v20, v21, v18, v9.h[0]
        dsmull_h        v24, v25, v18, v9.h[1]

        neg             v28.4s, v28.4s
        neg             v29.4s, v29.4s
        neg             v7.4s,  v7.4s
        neg             v6.4s,  v6.4s

        drshrn_h        v4,  v4,  v5,  #14
        drshrn_h        v5,  v28, v29, #14
        drshrn_h        v29, v30, v31, #14
        drshrn_h        v28, v22, v23, #14
        drshrn_h        v7,  v7,  v6,  #14
        drshrn_h        v31, v26, v27, #14
        drshrn_h        v6,  v20, v21, #14
        drshrn_h        v30, v24, v25, #14

        dmbutterfly_l   v16, v17, v18, v19, v29, v4,  v0.h[4], v0.h[5]
        dmbutterfly_l   v27, v26, v20, v21, v31, v5,  v0.h[4], v0.h[5]
        drshrn_h        v23, v16, v17, #14
        drshrn_h        v24, v18, v19, #14
        neg             v20.4s, v20.4s
        neg             v21.4s, v21.4s
        drshrn_h        v27, v27, v26, #14
        drshrn_h        v20, v20, v21, #14
        dmbutterfly_l   v16, v17, v18, v19, v30, v6,  v0.h[6], v0.h[7]
        drshrn_h        v21, v16, v17, #14
        drshrn_h        v26, v18, v19, #14
        dmbutterfly_l   v16, v17, v18, v19, v28, v7,  v0.h[6], v0.h[7]
        drshrn_h        v25, v16, v17, #14
        neg             v18.4s, v18.4s
        neg             v19.4s, v19.4s
        drshrn_h        v22, v18, v19, #14

        idct32_end
endfunc

.macro idct32_funcs suffix
// Do a 32-point IDCT of an 8x32 slice out of a 32x32 matrix.
// The 32-point IDCT can be decomposed into two 16-point IDCTs:
// a normal IDCT16 of every other input component (the even ones, with
// each output written twice), followed by a separate 16-point IDCT of
// the odd inputs, added to/subtracted from the outputs of the first idct16.
1227// x0 = dst (temp buffer)
1228// x1 = unused
1229// x2 = src
1230// x9 = double input stride
function idct32_1d_8x32_pass1\suffix\()_neon
        mov             x14, x30
        movi            v2.8h,  #0

        // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        load_clear      \i, x2, x9
.endr
.endif
.ifc \suffix,_quarter
.irp i, 16, 17, 18, 19
        load_clear      \i, x2, x9
.endr
.endif
.ifc \suffix,_half
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        load_clear      \i, x2, x9
.endr
.endif

        bl              idct16\suffix

        // Do two 8x8 transposes. Originally, v16-v31 contain the
        // 16 rows. Afterwards, v16-v23 and v24-v31 contain the
        // two transposed 8x8 blocks.
        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3

        // Store the registers a, b horizontally, followed by the
        // same registers b, a mirrored.
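        // E.g. with a = [a0 .. a7] and b = [b0 .. b7], one invocation
        // writes one 64-byte temp buffer row:
        //   a0 .. a7, b0 .. b7, b7 .. b0, a7 .. a0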
.macro store_rev a, b
        // There's no rev128 instruction, but we reverse each 64-bit
        // half, and then flip them using an ext with an 8-byte offset.
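        // In element order, for an .8h register:
        //   rev64:   [h0 h1 h2 h3 h4 h5 h6 h7] -> [h3 h2 h1 h0 h7 h6 h5 h4]
        //   ext #8:                            -> [h7 h6 h5 h4 h3 h2 h1 h0]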
        rev64           v3.8h, \b
        st1             {\a},  [x0], #16
        rev64           v2.8h, \a
        ext             v3.16b, v3.16b, v3.16b, #8
        st1             {\b},  [x0], #16
        ext             v2.16b, v2.16b, v2.16b, #8
        st1             {v3.8h},  [x0], #16
        st1             {v2.8h},  [x0], #16
.endm
        store_rev       v16.8h, v24.8h
        store_rev       v17.8h, v25.8h
        store_rev       v18.8h, v26.8h
        store_rev       v19.8h, v27.8h
        store_rev       v20.8h, v28.8h
        store_rev       v21.8h, v29.8h
        store_rev       v22.8h, v30.8h
        store_rev       v23.8h, v31.8h
        sub             x0,  x0,  #512
.purgem store_rev

        // Move x2 back to the start of the input, and move
        // to the first odd row
.ifb \suffix
        sub             x2,  x2,  x9, lsl #4
.endif
.ifc \suffix,_quarter
        sub             x2,  x2,  x9, lsl #2
.endif
.ifc \suffix,_half
        sub             x2,  x2,  x9, lsl #3
.endif
        add             x2,  x2,  #64

        movi            v2.8h,  #0
        // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        load_clear      \i, x2, x9
.endr
.endif
.ifc \suffix,_quarter
.irp i, 16, 17, 18, 19
        load_clear      \i, x2, x9
.endr
.endif
.ifc \suffix,_half
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        load_clear      \i, x2, x9
.endr
.endif

        bl              idct32_odd\suffix

        transpose_8x8H  v31, v30, v29, v28, v27, v26, v25, v24, v2, v3
        transpose_8x8H  v23, v22, v21, v20, v19, v18, v17, v16, v2, v3

        // Store the registers a, b horizontally, adding them into the
        // output first, and then the mirrored registers b, a,
        // subtracted from the output.
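        // In C-like pseudocode, each invocation updates one 64-byte
        // temp buffer row, as four chunks t0..t3 of 8 coefficients:
        //   t0 += a;  t1 += b;  t2 -= rev(b);  t3 -= rev(a);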
.macro store_rev a, b
        ld1             {v4.8h},  [x0]
        rev64           v3.8h, \b
        add             v4.8h, v4.8h, \a
        rev64           v2.8h, \a
        st1             {v4.8h},  [x0], #16
        ext             v3.16b, v3.16b, v3.16b, #8
        ld1             {v5.8h},  [x0]
        ext             v2.16b, v2.16b, v2.16b, #8
        add             v5.8h, v5.8h, \b
        st1             {v5.8h},  [x0], #16
        ld1             {v6.8h},  [x0]
        sub             v6.8h, v6.8h, v3.8h
        st1             {v6.8h},  [x0], #16
        ld1             {v7.8h},  [x0]
        sub             v7.8h, v7.8h, v2.8h
        st1             {v7.8h},  [x0], #16
.endm

        store_rev       v31.8h, v23.8h
        store_rev       v30.8h, v22.8h
        store_rev       v29.8h, v21.8h
        store_rev       v28.8h, v20.8h
        store_rev       v27.8h, v19.8h
        store_rev       v26.8h, v18.8h
        store_rev       v25.8h, v17.8h
        store_rev       v24.8h, v16.8h
.purgem store_rev
        br              x14
endfunc

// This is mostly the same as idct32_1d_8x32_pass1, but without the
// transposes; it uses the source as a temp buffer between the two idct
// passes, and adds into the destination.
// x0 = dst
// x1 = dst stride
// x2 = src (temp buffer)
// x7 = negative double temp buffer stride
// x9 = double temp buffer stride
function idct32_1d_8x32_pass2\suffix\()_neon
        mov             x14, x30
        // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        load            \i, x2, x9
.endr
        sub             x2,  x2,  x9, lsl #4
.endif
.ifc \suffix,_quarter
.irp i, 16, 17, 18, 19
        load            \i, x2, x9
.endr
        sub             x2,  x2,  x9, lsl #2
.endif
.ifc \suffix,_half
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        load            \i, x2, x9
.endr
        sub             x2,  x2,  x9, lsl #3
.endif

        bl              idct16\suffix

.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        store           \i, x2, x9
.endr

        sub             x2,  x2,  x9, lsl #4
        add             x2,  x2,  #64

        // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        load            \i, x2, x9
.endr
        sub             x2,  x2,  x9, lsl #4
.endif
.ifc \suffix,_quarter
.irp i, 16, 17, 18, 19
        load            \i, x2, x9
.endr
        sub             x2,  x2,  x9, lsl #2
.endif
.ifc \suffix,_half
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        load            \i, x2, x9
.endr
        sub             x2,  x2,  x9, lsl #3
.endif
        sub             x2,  x2,  #64

        bl              idct32_odd\suffix

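        // load_acc_store combines four rows of idct16 output (reloaded
        // from the temp buffer) with four rows of idct32_odd output,
        // then rounds and accumulates into the destination pixels.
        // Per pixel, in C-like pseudocode (clip_u8 as in av_clip_uint8):
        //   dst[x] = clip_u8(dst[x] + ((even[x] +/- odd[x] + 32) >> 6))
        // srshr performs the rounding shift, uaddw adds the widened u8
        // destination row, and sqxtun narrows back to u8 with unsigned
        // saturation.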
.macro load_acc_store a, b, c, d, neg=0
.if \neg == 0
        ld1             {v4.8h},  [x2], x9
        ld1             {v5.8h},  [x2], x9
        add             v4.8h, v4.8h, \a
        ld1             {v6.8h},  [x2], x9
        add             v5.8h, v5.8h, \b
        ld1             {v7.8h},  [x2], x9
        add             v6.8h, v6.8h, \c
        add             v7.8h, v7.8h, \d
.else
        ld1             {v4.8h},  [x2], x7
        ld1             {v5.8h},  [x2], x7
        sub             v4.8h, v4.8h, \a
        ld1             {v6.8h},  [x2], x7
        sub             v5.8h, v5.8h, \b
        ld1             {v7.8h},  [x2], x7
        sub             v6.8h, v6.8h, \c
        sub             v7.8h, v7.8h, \d
.endif
        ld1             {v10.8b}, [x0], x1
        ld1             {v11.8b}, [x0], x1
        srshr           v4.8h, v4.8h, #6
        ld1             {v2.8b}, [x0], x1
        srshr           v5.8h, v5.8h, #6
        uaddw           v4.8h, v4.8h, v10.8b
        ld1             {v3.8b}, [x0], x1
        srshr           v6.8h, v6.8h, #6
        uaddw           v5.8h, v5.8h, v11.8b
        srshr           v7.8h, v7.8h, #6
        sub             x0,  x0,  x1, lsl #2
        uaddw           v6.8h, v6.8h, v2.8b
        sqxtun          v4.8b, v4.8h
        uaddw           v7.8h, v7.8h, v3.8b
        sqxtun          v5.8b, v5.8h
        st1             {v4.8b}, [x0], x1
        sqxtun          v6.8b, v6.8h
        st1             {v5.8b}, [x0], x1
        sqxtun          v7.8b, v7.8h
        st1             {v6.8b}, [x0], x1
        st1             {v7.8b}, [x0], x1
.endm
        load_acc_store  v31.8h, v30.8h, v29.8h, v28.8h
        load_acc_store  v27.8h, v26.8h, v25.8h, v24.8h
        load_acc_store  v23.8h, v22.8h, v21.8h, v20.8h
        load_acc_store  v19.8h, v18.8h, v17.8h, v16.8h
        sub             x2,  x2,  x9
        load_acc_store  v16.8h, v17.8h, v18.8h, v19.8h, 1
        load_acc_store  v20.8h, v21.8h, v22.8h, v23.8h, 1
        load_acc_store  v24.8h, v25.8h, v26.8h, v27.8h, 1
        load_acc_store  v28.8h, v29.8h, v30.8h, v31.8h, 1
.purgem load_acc_store
        br              x14
endfunc
.endm

idct32_funcs
idct32_funcs _quarter
idct32_funcs _half

const min_eob_idct_idct_32, align=4
        .short  0, 34, 135, 336
endconst
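
// eob thresholds for each 8-column slice of pass 1; if the eob is at or
// below a slice's threshold, that slice and all the following ones only
// contain zero input coefficients, so pass 1 can be skipped for them and
// the corresponding temp buffer rows zero-filled instead.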

function ff_vp9_idct_idct_32x32_add_neon, export=1
        cmp             w3,  #1
        b.eq            idct32x32_dc_add_neon
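        // eob == 1 means that only the DC coefficient is nonzero;
        // that case is handled by a dedicated, much simpler path.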

        movrel          x10, idct_coeffs

        mov             x15, x30

        stp             d10, d11, [sp, #-0x10]!
        stp             d8,  d9,  [sp, #-0x10]!
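        // d8-d11 are callee-saved per AAPCS64; v8/v9 will hold the
        // second half of the idct coefficients below, and v10/v11 are
        // used as scratch in load_acc_store.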

        sub             sp,  sp,  #2048
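        // Temp buffer for the intermediate coefficients:
        // 32x32 samples at 2 bytes each = 2048 bytes.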

        mov             x4,  x0
        mov             x5,  x1
        mov             x6,  x2

        // Double stride of the input, since we only read every other line
        mov             x9,  #128
        neg             x7,  x9

        ld1             {v0.8h,v1.8h}, [x10], #32
        ld1             {v8.8h,v9.8h}, [x10]

        cmp             w3,  #34
        b.le            idct32x32_quarter_add_neon
        cmp             w3,  #135
        b.le            idct32x32_half_add_neon
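        // Given the scan order, eob <= 34 means that only the top left
        // 8x8 corner of the input can contain nonzero coefficients, and
        // eob <= 135 means only the top left 16x16 corner; use the
        // reduced-size transforms for those cases.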

        movrel          x12, min_eob_idct_idct_32, 2
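        // Point x12 at min_eob[1]; the first entry is never compared
        // against, since the first slice is always processed.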

.irp i, 0, 8, 16, 24
        add             x0,  sp,  #(\i*64)
.if \i > 0
        ldrh            w1,  [x12], #2
        cmp             w3,  w1
        mov             x1,  #(32 - \i)/4
        b.le            1f
.endif
        add             x2,  x6,  #(\i*2)
        bl              idct32_1d_8x32_pass1_neon
.endr
        b               3f

1:
        // Write zeros to the temp buffer for pass 2
        movi            v16.8h,  #0
        movi            v17.8h,  #0
        movi            v18.8h,  #0
        movi            v19.8h,  #0
2:
        subs            x1,  x1,  #1
.rept 4
        st1             {v16.8h,v17.8h,v18.8h,v19.8h},  [x0], #64
.endr
        b.ne            2b
3:
.irp i, 0, 8, 16, 24
        add             x0,  x4,  #(\i)
        mov             x1,  x5
        add             x2,  sp,  #(\i*2)
        bl              idct32_1d_8x32_pass2_neon
.endr

        add             sp,  sp,  #2048

        ldp             d8,  d9,  [sp], 0x10
        ldp             d10, d11, [sp], 0x10

        br              x15
endfunc

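// Partial (quarter/half) variants: pass 1 only needs to be run for the
// slices that can contain nonzero input (one 8x32 slice for quarter,
// two for half); pass 2 then only reads back the rows those passes wrote.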
.macro idct32_partial size
function idct32x32_\size\()_add_neon
        add             x0,  sp,  #(0*64)
        add             x2,  x6,  #(0*2)
        bl              idct32_1d_8x32_pass1_\size\()_neon
.ifc \size,half
        add             x0,  sp,  #(8*64)
        add             x2,  x6,  #(8*2)
        bl              idct32_1d_8x32_pass1_\size\()_neon
.endif
.irp i, 0, 8, 16, 24
        add             x0,  x4,  #(\i)
        mov             x1,  x5
        add             x2,  sp,  #(\i*2)
        bl              idct32_1d_8x32_pass2_\size\()_neon
.endr

        add             sp,  sp,  #2048

        ldp             d8,  d9,  [sp], 0x10
        ldp             d10, d11, [sp], 0x10

        br              x15
endfunc
.endm

idct32_partial quarter
idct32_partial half