/*
 * Copyright (c) 2016 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"
#include "neon.S"

@ Do an 8x8 transpose, using q registers for the subtransposes that don't
@ need to address the individual d registers.
@ r0,r1 == rq0, r2,r3 == rq1, etc
.macro transpose_q_8x8 rq0, rq1, rq2, rq3, r0, r1, r2, r3, r4, r5, r6, r7
        vtrn.32         \rq0, \rq2
        vtrn.32         \rq1, \rq3
        vtrn.16         \rq0, \rq1
        vtrn.16         \rq2, \rq3
        vtrn.8          \r0,  \r1
        vtrn.8          \r2,  \r3
        vtrn.8          \r4,  \r5
        vtrn.8          \r6,  \r7
.endm
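
@ (For reference only: the net effect of the macro above is a plain 8x8
@ byte transpose, i.e. in scalar C terms
@     for (i = 0; i < 8; i++)
@         for (j = 0; j < 8; j++)
@             out[j][i] = in[i][j];
@ built up from vtrn passes at 32-, 16- and 8-bit granularity so that no
@ memory round trip is needed.)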

@ Do a 4x4 transpose, using q registers for the subtransposes that don't
@ need to address the individual d registers.
@ r0,r1 == rq0, r2,r3 == rq1
.macro transpose_q_4x4 rq0, rq1, r0, r1, r2, r3
        vtrn.16         \rq0, \rq1
        vtrn.8          \r0,  \r1
        vtrn.8          \r2,  \r3
.endm

@ The input to and output from this macro is in the registers q8-q15,
@ and q0-q7 are used as scratch registers.
@ p3 = q8, p0 = q11, q0 = q12, q3 = q15
.macro loop_filter_q
        vdup.u8         d0,  r2          @ E
        lsr             r2,  r2,  #8
        vdup.u8         d2,  r3          @ I
        lsr             r3,  r3,  #8
        vdup.u8         d1,  r2          @ E
        vdup.u8         d3,  r3          @ I

        vabd.u8         q2,  q8,  q9     @ abs(p3 - p2)
        vabd.u8         q3,  q9,  q10    @ abs(p2 - p1)
        vabd.u8         q4,  q10, q11    @ abs(p1 - p0)
        vabd.u8         q5,  q12, q13    @ abs(q0 - q1)
        vabd.u8         q6,  q13, q14    @ abs(q1 - q2)
        vabd.u8         q7,  q14, q15    @ abs(q2 - q3)
        vmax.u8         q2,  q2,  q3
        vmax.u8         q3,  q4,  q5
        vmax.u8         q4,  q6,  q7
        vabd.u8         q5,  q11, q12    @ abs(p0 - q0)
        vmax.u8         q2,  q2,  q3
        vqadd.u8        q5,  q5,  q5     @ abs(p0 - q0) * 2
        vabd.u8         q7,  q10, q13    @ abs(p1 - q1)
        vmax.u8         q2,  q2,  q4     @ max(abs(p3 - p2), ..., abs(q2 - q3))
        vshr.u8         q7,  q7,  #1
        vcle.u8         q2,  q2,  q1     @ max(abs()) <= I
        vqadd.u8        q5,  q5,  q7     @ abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
        vcle.u8         q5,  q5,  q0
        vand            q2,  q2,  q5     @ fm

        vshrn.u16       d10, q2,  #4
        vmov            r2,  r3,  d10
        orrs            r2,  r2,  r3
        @ If no pixels need filtering, just exit as soon as possible
        beq             9f

        @ Calculate the normal inner loop filter for 2 or 4 pixels
        ldr             r3,  [sp, #64]
        vabd.u8         q3,  q10, q11    @ abs(p1 - p0)
        vabd.u8         q4,  q13, q12    @ abs(q1 - q0)

        vsubl.u8        q5,  d20, d26    @ p1 - q1
        vsubl.u8        q6,  d21, d27    @ p1 - q1
        vmax.u8         q3,  q3,  q4     @ max(abs(p1 - p0), abs(q1 - q0))
        vqmovn.s16      d10, q5          @ av_clip_int8p(p1 - q1)
        vqmovn.s16      d11, q6          @ av_clip_int8p(p1 - q1)
        vdup.u8         d8,  r3          @ H
        lsr             r3,  r3,  #8
        vdup.u8         d9,  r3          @ H
        vsubl.u8        q6,  d24, d22    @ q0 - p0
        vsubl.u8        q7,  d25, d23    @ q0 - p0
        vcle.u8         q3,  q3,  q4     @ !hev
        vmov.s16        q0,  #3
        vand            q3,  q3,  q2     @ !hev && fm && !flat8in

        vmul.s16        q6,  q6,  q0     @ 3 * (q0 - p0)
        vmul.s16        q7,  q7,  q0     @ 3 * (q0 - p0)
        vbic            q5,  q5,  q3     @ if (!hev) av_clip_int8 = 0
        vaddw.s8        q6,  q6,  d10    @ 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)]
        vaddw.s8        q7,  q7,  d11    @ 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)]
        vmov.s8         q5,  #4
        vqmovn.s16      d12, q6
        vqmovn.s16      d13, q7          @ av_clip_int8(3 * (q0 - p0) [+ av_clip_int8(p1 - q1)], BIT_DEPTH - 1) = f
        vmov.s8         q0,  #3

        vqadd.s8        q5,  q6,  q5     @ FFMIN(f + 4, 127)
        vqadd.s8        q0,  q6,  q0     @ FFMIN(f + 3, 127)
        vmovl.u8        q6,  d22         @ p0
        vmovl.u8        q7,  d23         @ p0
        vshr.s8         q5,  q5,  #3     @ f1
        vshr.s8         q0,  q0,  #3     @ f2

        vaddw.s8        q6,  q6,  d0     @ p0 + f2
        vaddw.s8        q7,  q7,  d1     @ p0 + f2
        vqmovun.s16     d0,  q6          @ out p0
        vmovl.u8        q6,  d24         @ q0
        vqmovun.s16     d1,  q7          @ out p0
        vmovl.u8        q7,  d25         @ q0
        vsubw.s8        q6,  q6,  d10    @ q0 - f1
        vsubw.s8        q7,  q7,  d11    @ q0 - f1
        vqmovun.s16     d12, q6          @ out q0
        vqmovun.s16     d13, q7          @ out q0
        vrshr.s8        q5,  q5,  #1     @ f = (f1 + 1) >> 1
        vbit            q11, q0,  q2     @ if (fm && !flat8in)
        vbit            q12, q6,  q2

        vmovl.u8        q0,  d20         @ p1
        vmovl.u8        q2,  d21         @ p1
        vmovl.u8        q6,  d26         @ q1
        vmovl.u8        q7,  d27         @ q1
        vaddw.s8        q0,  q0,  d10    @ p1 + f
        vaddw.s8        q2,  q2,  d11    @ p1 + f
        vsubw.s8        q6,  q6,  d10    @ q1 - f
        vsubw.s8        q7,  q7,  d11    @ q1 - f
        vqmovun.s16     d0,  q0          @ out p1
        vqmovun.s16     d1,  q2          @ out p1
        vqmovun.s16     d12, q6          @ out q1
        vqmovun.s16     d13, q7          @ out q1
        vbit            q10, q0,  q3     @ if (!hev && fm && !flat8in)
        vbit            q13, q6,  q3
.endm
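
@ For reference, the per-pixel filter applied by loop_filter_q above
@ corresponds roughly to the following scalar logic (illustrative sketch
@ only; clip_int8/clip_uint8 denote saturation to signed/unsigned 8 bit):
@     fm  = max(|p3-p2|, ..., |q2-q3|) <= I &&
@           |p0-q0| * 2 + (|p1-q1| >> 1) <= E;
@     hev = max(|p1-p0|, |q1-q0|) > H;
@     if (fm) {
@         f  = clip_int8(3 * (q0 - p0) + (hev ? clip_int8(p1 - q1) : 0));
@         f1 = clip_int8(f + 4) >> 3;
@         f2 = clip_int8(f + 3) >> 3;
@         p0 = clip_uint8(p0 + f2);
@         q0 = clip_uint8(q0 - f1);
@         if (!hev) {
@             f  = (f1 + 1) >> 1;
@             p1 = clip_uint8(p1 + f);
@             q1 = clip_uint8(q1 - f);
@         }
@     }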

@ The input to and output from this macro is in the registers d16-d31,
@ and d0-d7 are used as scratch registers.
@ p7 = d16 .. p3 = d20, p0 = d23, q0 = d24, q3 = d27, q7 = d31
@ Depending on the width of the loop filter, we either use d16-d19
@ and d28-d31 as temp registers, or d8-d15.
@ tmp1,tmp2 = tmpq1, tmp3,tmp4 = tmpq2, tmp5,tmp6 = tmpq3, tmp7,tmp8 = tmpq4
.macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmpq1, tmpq2, tmpq3, tmpq4
        vdup.u8         d0,  r2 @ E
        vdup.u8         d2,  r3 @ I
        ldr             r3,  [sp]

        vabd.u8         d4,  d20, d21    @ abs(p3 - p2)
        vabd.u8         d5,  d21, d22    @ abs(p2 - p1)
        vabd.u8         d6,  d22, d23    @ abs(p1 - p0)
        vabd.u8         d7,  d24, d25    @ abs(q0 - q1)
        vabd.u8         \tmp1,  d25, d26 @ abs(q1 - q2)
        vabd.u8         \tmp2,  d26, d27 @ abs(q2 - q3)
        vmax.u8         d4,  d4,  d5
        vmax.u8         d5,  d6,  d7
        vmax.u8         \tmp1,  \tmp1,  \tmp2
        vabd.u8         d6,  d23, d24    @ abs(p0 - q0)
        vmax.u8         d4,  d4,  d5
        vqadd.u8        d6,  d6,  d6     @ abs(p0 - q0) * 2
        vabd.u8         d5,  d22, d25    @ abs(p1 - q1)
        vmax.u8         d4,  d4,  \tmp1  @ max(abs(p3 - p2), ..., abs(q2 - q3))
        vshr.u8         d5,  d5,  #1
        vcle.u8         d4,  d4,  d2     @ max(abs()) <= I
        vqadd.u8        d6,  d6,  d5     @ abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
        vcle.u8         d5,  d6,  d0
        vand            d4,  d4,  d5     @ fm

        vdup.u8         d3,  r3          @ H
        vmov            r2,  r3,  d4
        orrs            r2,  r2,  r3
        @ If no pixels need filtering, just exit as soon as possible
        beq             9f

.if \wd >= 8
        vmov.u8         d0,  #1

        vabd.u8         d6,  d20, d23    @ abs(p3 - p0)
        vabd.u8         d2,  d21, d23    @ abs(p2 - p0)
        vabd.u8         d1,  d22, d23    @ abs(p1 - p0)
        vabd.u8         \tmp1,  d25, d24 @ abs(q1 - q0)
        vabd.u8         \tmp2,  d26, d24 @ abs(q2 - q0)
        vabd.u8         \tmp3,  d27, d24 @ abs(q3 - q0)
        vmax.u8         d6,  d6,  d2
        vmax.u8         d1,  d1,  \tmp1
        vmax.u8         \tmp2,  \tmp2,  \tmp3
.if \wd == 16
        vabd.u8         d7,  d16, d23    @ abs(p7 - p0)
        vmax.u8         d6,  d6,  d1
        vabd.u8         d2,  d17, d23    @ abs(p6 - p0)
        vmax.u8         d6,  d6,  \tmp2
        vabd.u8         d1,  d18, d23    @ abs(p5 - p0)
        vcle.u8         d6,  d6,  d0     @ flat8in
        vabd.u8         d8,  d19, d23    @ abs(p4 - p0)
        vand            d6,  d6,  d4     @ flat8in && fm
        vabd.u8         d9,  d28, d24    @ abs(q4 - q0)
        vbic            d4,  d4,  d6     @ fm && !flat8in
        vabd.u8         d10, d29, d24    @ abs(q5 - q0)
        vabd.u8         d11, d30, d24    @ abs(q6 - q0)
        vabd.u8         d12, d31, d24    @ abs(q7 - q0)

        vmax.u8         d7,  d7,  d2
        vmax.u8         d1,  d1,  d8
        vmax.u8         d9,  d9,  d10
        vmax.u8         d11, d11, d12
        @ The rest of the calculation of flat8out is interleaved below
.else
        @ The rest of the calculation of flat8in is interleaved below
.endif
.endif
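
        @ At this point, for wd >= 8, d6 carries the (partially computed)
        @ flat8in mask: set where |p3-p0|, |p2-p0|, |p1-p0|, |q1-q0|,
        @ |q2-q0| and |q3-q0| are all <= 1. For wd == 16, d7 similarly
        @ accumulates flat8out, which checks p7-p4 and q4-q7 against the
        @ same threshold of 1.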

        @ Calculate the normal inner loop filter for 2 or 4 pixels
        vabd.u8         d5,  d22, d23           @ abs(p1 - p0)
.if \wd == 16
        vmax.u8         d7,  d7,  d1
        vmax.u8         d9,  d9,  d11
.elseif \wd == 8
        vmax.u8         d6,  d6,  d1
.endif
        vabd.u8         d1,  d25, d24           @ abs(q1 - q0)
.if \wd == 16
        vmax.u8         d7,  d7,  d9
.elseif \wd == 8
        vmax.u8         d6,  d6,  \tmp2
.endif
        vsubl.u8        \tmpq1,  d22, d25       @ p1 - q1
        vmax.u8         d5,  d5,  d1            @ max(abs(p1 - p0), abs(q1 - q0))
        vsubl.u8        \tmpq2,  d24, d23       @ q0 - p0
        vmov.s16        \tmpq3,  #3
.if \wd == 8
        vcle.u8         d6,  d6,  d0            @ flat8in
.endif
        vcle.u8         d5,  d5,  d3            @ !hev
.if \wd == 8
        vand            d6,  d6,  d4            @ flat8in && fm
.endif
        vqmovn.s16      \tmp1,   \tmpq1         @ av_clip_int8(p1 - q1)
.if \wd == 16
        vcle.u8         d7,  d7,  d0            @ flat8out
.elseif \wd == 8
        vbic            d4,  d4,  d6            @ fm && !flat8in
.endif
        vand            d5,  d5,  d4            @ !hev && fm && !flat8in
.if \wd == 16
        vand            d7,  d7,  d6            @ flat8out && flat8in && fm
.endif

        vmul.s16        \tmpq2,  \tmpq2, \tmpq3 @ 3 * (q0 - p0)
        vbic            \tmp1,   \tmp1,   d5    @ if (!hev) av_clip_int8 = 0
        vmov.s8         d2,  #4
        vaddw.s8        \tmpq2,  \tmpq2,  \tmp1 @ 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)]
        vmov.s8         d3,  #3
        vqmovn.s16      \tmp1,   \tmpq2         @ f
.if \wd == 16
        vbic            d6,  d6,  d7            @ fm && flat8in && !flat8out
.endif

        vqadd.s8        \tmp3, \tmp1,  d2       @ FFMIN(f + 4, 127)
        vqadd.s8        \tmp4, \tmp1,  d3       @ FFMIN(f + 3, 127)
        vmovl.u8        q0,  d23                @ p0
        vshr.s8         \tmp3, \tmp3,  #3       @ f1
        vshr.s8         \tmp4, \tmp4,  #3       @ f2

        vmovl.u8        q1,  d24                @ q0
        vaddw.s8        q0,  q0,  \tmp4         @ p0 + f2
        vsubw.s8        q1,  q1,  \tmp3         @ q0 - f1
        vqmovun.s16     d0,  q0                 @ out p0
        vqmovun.s16     d1,  q1                 @ out q0
        vrshr.s8        \tmp3, \tmp3, #1        @ f = (f1 + 1) >> 1
        vbit            d23, d0,  d4            @ if (fm && !flat8in)
        vbit            d24, d1,  d4

        vmovl.u8        q0,  d22                @ p1
        vmovl.u8        q1,  d25                @ q1
.if \wd >= 8
        vmov            r2,  r3,  d6
.endif
        vaddw.s8        q0,  q0,  \tmp3         @ p1 + f
        vsubw.s8        q1,  q1,  \tmp3         @ q1 - f
.if \wd >= 8
        orrs            r2,  r2,  r3
.endif
        vqmovun.s16     d0,  q0                 @ out p1
        vqmovun.s16     d2,  q1                 @ out q1
        vbit            d22, d0,  d5            @ if (!hev && fm && !flat8in)
        vbit            d25, d2,  d5

.if \wd >= 8
        @ If no pixels need flat8in, jump to flat8out
        @ (or to a writeout of the inner 4 pixels, for wd=8)
        beq             6f

        @ flat8in
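        @ Each flat8in output below is a rounded average with weights
        @ summing to 8, e.g. out p2 = (3*p3 + 2*p2 + p1 + p0 + q0 + 4) >> 3.
        @ The q0 register keeps a running sum of the current window; each
        @ step adds a precomputed (incoming - outgoing) correction instead
        @ of recomputing the whole sum.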
        vaddl.u8        \tmpq1, d20, d21
        vaddl.u8        \tmpq2, d22, d25
        vaddl.u8        \tmpq3, d20, d22
        vaddl.u8        \tmpq4, d23, d26
        vadd.u16        q0,  \tmpq1, \tmpq1
        vaddw.u8        q0,  q0,  d23
        vaddw.u8        q0,  q0,  d24
        vadd.u16        q0,  q0,  \tmpq3
        vsub.s16        \tmpq2, \tmpq2, \tmpq1
        vsub.s16        \tmpq4, \tmpq4, \tmpq3
        vrshrn.u16      d2,  q0,  #3            @ out p2

        vadd.u16        q0,  q0,  \tmpq2
        vaddl.u8        \tmpq1, d20, d23
        vaddl.u8        \tmpq2, d24, d27
        vrshrn.u16      d3,  q0,  #3            @ out p1

        vadd.u16        q0,  q0,  \tmpq4
        vsub.s16        \tmpq2, \tmpq2, \tmpq1
        vaddl.u8        \tmpq3, d21, d24
        vaddl.u8        \tmpq4, d25, d27
        vrshrn.u16      d4,  q0,  #3            @ out p0

        vadd.u16        q0,  q0,  \tmpq2
        vsub.s16        \tmpq4, \tmpq4, \tmpq3
        vaddl.u8        \tmpq1, d22, d25
        vaddl.u8        \tmpq2, d26, d27
        vrshrn.u16      d5,  q0,  #3            @ out q0

        vadd.u16        q0,  q0,  \tmpq4
        vsub.s16        \tmpq2, \tmpq2, \tmpq1
        vrshrn.u16      \tmp5,  q0,  #3         @ out q1

        vadd.u16        q0,  q0,  \tmpq2
        @ The output here is written back into the input registers. This doesn't
        @ matter for the flat8out part below, since we only update those pixels
        @ which won't be touched below.
        vbit            d21, d2,  d6
        vbit            d22, d3,  d6
        vbit            d23, d4,  d6
        vrshrn.u16      \tmp6,  q0,  #3         @ out q2
        vbit            d24, d5,  d6
        vbit            d25, \tmp5,  d6
        vbit            d26, \tmp6,  d6
.endif
.if \wd == 16
6:
        vorr            d2,  d6,  d7
        vmov            r2,  r3,  d2
        orrs            r2,  r2,  r3
        @ If no pixels need flat8in or flat8out, jump to a
        @ writeout of the inner 4 pixels
        beq             7f
        vmov            r2,  r3,  d7
        orrs            r2,  r2,  r3
        @ If no pixels need flat8out, jump to a writeout of the inner 6 pixels
        beq             8f

        @ flat8out
        @ This writes all outputs into d2-d17 (skipping d7 and d16).
        @ If this part is skipped, the output is read from d21-d26 (which is the input
        @ to this section).
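        @ Each flat8out output is likewise a rounded average with weights
        @ summing to 16, e.g. out p6 = (7*p7 + 2*p6 + p5 + p4 + p3 + p2 +
        @ p1 + p0 + q0 + 8) >> 4, again kept as a running sum in the q0
        @ register with per-output (incoming - outgoing) corrections.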
        vshll.u8        q0,  d16, #3  @ 8 * d16
        vsubw.u8        q0,  q0,  d16 @ 7 * d16
        vaddw.u8        q0,  q0,  d17
        vaddl.u8        q4,  d17, d18
        vaddl.u8        q5,  d19, d20
        vadd.s16        q0,  q0,  q4
        vaddl.u8        q4,  d16, d17
        vaddl.u8        q6,  d21, d22
        vadd.s16        q0,  q0,  q5
        vaddl.u8        q5,  d18, d25
        vaddl.u8        q7,  d23, d24
        vsub.s16        q5,  q5,  q4
        vadd.s16        q0,  q0,  q6
        vadd.s16        q0,  q0,  q7
        vaddl.u8        q6,  d16, d18
        vaddl.u8        q7,  d19, d26
        vrshrn.u16      d2,  q0,  #4

        vadd.s16        q0,  q0,  q5
        vaddl.u8        q4,  d16, d19
        vaddl.u8        q5,  d20, d27
        vsub.s16        q7,  q7,  q6
        vbif            d2,  d17, d7
        vrshrn.u16      d3,  q0,  #4

        vadd.s16        q0,  q0,  q7
        vaddl.u8        q6,  d16, d20
        vaddl.u8        q7,  d21, d28
        vsub.s16        q5,  q5,  q4
        vbif            d3,  d18, d7
        vrshrn.u16      d4,  q0,  #4

        vadd.s16        q0,  q0,  q5
        vaddl.u8        q4,  d16, d21
        vaddl.u8        q5,  d22, d29
        vsub.s16        q7,  q7,  q6
        vbif            d4,  d19, d7
        vrshrn.u16      d5,  q0,  #4

        vadd.s16        q0,  q0,  q7
        vaddl.u8        q6,  d16, d22
        vaddl.u8        q7,  d23, d30
        vsub.s16        q5,  q5,  q4
        vbif            d5,  d20, d7
        vrshrn.u16      d6,  q0,  #4

        vadd.s16        q0,  q0,  q5
        vaddl.u8        q5,  d16, d23
        vsub.s16        q7,  q7,  q6
        vaddl.u8        q6,  d24, d31
        vbif            d6,  d21, d7
        vrshrn.u16      d8,  q0,  #4

        vadd.s16        q0,  q0,  q7
        vsub.s16        q5,  q6,  q5
        vaddl.u8        q6,  d17, d24
        vaddl.u8        q7,  d25, d31
        vbif            d8,  d22, d7
        vrshrn.u16      d9,  q0,  #4

        vadd.s16        q0,  q0,  q5
        vsub.s16        q7,  q7,  q6
        vaddl.u8        q6,  d26, d31
        vbif            d9,  d23, d7
        vrshrn.u16      d10, q0,  #4

        vadd.s16        q0,  q0,  q7
        vaddl.u8        q7,  d18, d25
        vaddl.u8        q9,  d19, d26
        vsub.s16        q6,  q6,  q7
        vaddl.u8        q7,  d27, d31
        vbif            d10, d24, d7
        vrshrn.u16      d11, q0,  #4

        vadd.s16        q0,  q0,  q6
        vaddl.u8        q6,  d20, d27
        vsub.s16        q7,  q7,  q9
        vaddl.u8        q9,  d28, d31
        vbif            d11, d25, d7
        vsub.s16        q9,  q9,  q6
        vrshrn.u16      d12, q0,  #4

        vadd.s16        q0,  q0,  q7
        vaddl.u8        q7,  d21, d28
        vaddl.u8        q10, d29, d31
        vbif            d12, d26, d7
        vrshrn.u16      d13, q0,  #4

        vadd.s16        q0,  q0,  q9
        vsub.s16        q10, q10, q7
        vaddl.u8        q9,  d22, d29
        vaddl.u8        q11, d30, d31
        vbif            d13, d27, d7
        vrshrn.u16      d14, q0,  #4

        vadd.s16        q0,  q0,  q10
        vsub.s16        q11, q11, q9
        vbif            d14, d28, d7
        vrshrn.u16      d15, q0,  #4

        vadd.s16        q0,  q0,  q11
        vbif            d15, d29, d7
        vrshrn.u16      d17, q0,  #4
        vbif            d17, d30, d7
.endif
.endm

@ For wd <= 8, we use d16-d19 and d28-d31 as temp registers; for wd=16
@ those are needed for inputs/outputs, so we use d8-d15 as temp registers
@ there instead.
.macro loop_filter_4
        loop_filter     4,  d16, d17, d18, d19, d28, d29, d30, d31, q8,  q9,  q14, q15
.endm

.macro loop_filter_8
        loop_filter     8,  d16, d17, d18, d19, d28, d29, d30, d31, q8,  q9,  q14, q15
.endm

.macro loop_filter_16
        loop_filter     16, d8,  d9,  d10, d11, d12, d13, d14, d15, q4,  q5,  q6,  q7
.endm


@ The public functions in this file have the following signature:
@ void loop_filter(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr);
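@ mb_lim (E), lim (I) and hev_thr (H) arrive in r2, r3 and on the stack
@ respectively. For ff_vp9_loop_filter_{v,h}_44_16 below, which filter two
@ 8-pixel blocks in one call, the two blocks' E/I/H values are packed into
@ the two low bytes of each argument and unpacked with lsr #8 in
@ loop_filter_q.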

function ff_vp9_loop_filter_v_4_8_neon, export=1
        sub             r12, r0,  r1, lsl #2
        vld1.8          {d20}, [r12,:64], r1 @ p3
        vld1.8          {d24}, [r0, :64], r1 @ q0
        vld1.8          {d21}, [r12,:64], r1 @ p2
        vld1.8          {d25}, [r0, :64], r1 @ q1
        vld1.8          {d22}, [r12,:64], r1 @ p1
        vld1.8          {d26}, [r0, :64], r1 @ q2
        vld1.8          {d23}, [r12,:64], r1 @ p0
        vld1.8          {d27}, [r0, :64], r1 @ q3
        sub             r0,  r0,  r1, lsl #2
        sub             r12, r12, r1, lsl #1

        loop_filter_4

        vst1.8          {d22}, [r12,:64], r1
        vst1.8          {d24}, [r0, :64], r1
        vst1.8          {d23}, [r12,:64], r1
        vst1.8          {d25}, [r0, :64], r1
9:
        bx              lr
endfunc

function ff_vp9_loop_filter_h_4_8_neon, export=1
        sub             r12, r0,  #4
        add             r0,  r12, r1, lsl #2
        vld1.8          {d20}, [r12], r1
        vld1.8          {d24}, [r0],  r1
        vld1.8          {d21}, [r12], r1
        vld1.8          {d25}, [r0],  r1
        vld1.8          {d22}, [r12], r1
        vld1.8          {d26}, [r0],  r1
        vld1.8          {d23}, [r12], r1
        vld1.8          {d27}, [r0],  r1

        sub             r12, r12, r1, lsl #2
        sub             r0,  r0,  r1, lsl #2
        @ Move r0/r12 forward by 2 pixels; we don't need to rewrite the
        @ outermost 2 pixels since they aren't changed.
        add             r12, r12, #2
        add             r0,  r0,  #2

        @ Transpose the 8x8 pixels, taking advantage of q registers, to get
        @ one register per column.
        transpose_q_8x8 q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27

        loop_filter_4

        @ We will only write back the middle 4 pixels; after the loop filter,
        @ these are in d22, d23, d24, d25 (q11, q12), ordered as rows
        @ (8x4 pixels). We need to transpose them to columns, done with a
        @ 4x4 transpose (which in practice is two 4x4 transposes of the two
        @ 4x4 halves of the 8x4 pixels; into 4x8 pixels).
        transpose_q_4x4 q11, q12, d22, d23, d24, d25

        vst1.32         {d22[0]}, [r12], r1
        vst1.32         {d22[1]}, [r0],  r1
        vst1.32         {d23[0]}, [r12], r1
        vst1.32         {d23[1]}, [r0],  r1
        vst1.32         {d24[0]}, [r12], r1
        vst1.32         {d24[1]}, [r0],  r1
        vst1.32         {d25[0]}, [r12], r1
        vst1.32         {d25[1]}, [r0],  r1
9:
        bx              lr
endfunc

function ff_vp9_loop_filter_v_44_16_neon, export=1
        vpush           {q4-q7}
        sub             r12, r0,  r1, lsl #2
        vld1.8          {q8},  [r12,:128], r1 @ p3
        vld1.8          {q12}, [r0, :128], r1 @ q0
        vld1.8          {q9},  [r12,:128], r1 @ p2
        vld1.8          {q13}, [r0, :128], r1 @ q1
        vld1.8          {q10}, [r12,:128], r1 @ p1
        vld1.8          {q14}, [r0, :128], r1 @ q2
        vld1.8          {q11}, [r12,:128], r1 @ p0
        vld1.8          {q15}, [r0, :128], r1 @ q3
        sub             r0,  r0,  r1, lsl #2
        sub             r12, r12, r1, lsl #1

        loop_filter_q

        vst1.8          {q10}, [r12,:128], r1
        vst1.8          {q12}, [r0, :128], r1
        vst1.8          {q11}, [r12,:128], r1
        vst1.8          {q13}, [r0, :128], r1
9:
        vpop            {q4-q7}
        bx              lr
endfunc

function ff_vp9_loop_filter_h_44_16_neon, export=1
        vpush           {q4-q7}
        sub             r12, r0,  #4
        add             r0,  r12, r1, lsl #2
        vld1.8          {d16}, [r12], r1
        vld1.8          {d24}, [r0],  r1
        vld1.8          {d18}, [r12], r1
        vld1.8          {d26}, [r0],  r1
        vld1.8          {d20}, [r12], r1
        vld1.8          {d28}, [r0],  r1
        vld1.8          {d22}, [r12], r1
        vld1.8          {d30}, [r0],  r1
        mov             r12, r0
        add             r0,  r0,  r1, lsl #2
        vld1.8          {d17}, [r12], r1
        vld1.8          {d25}, [r0],  r1
        vld1.8          {d19}, [r12], r1
        vld1.8          {d27}, [r0],  r1
        vld1.8          {d21}, [r12], r1
        vld1.8          {d29}, [r0],  r1
        vld1.8          {d23}, [r12], r1
        vld1.8          {d31}, [r0],  r1

        @ Transpose the 16x8 pixels, as two 8x8 parts
        transpose_8x8   q8,  q9,  q10, q11, q12, q13, q14, q15

        loop_filter_q

        sub             r12, r0,  r1, lsl #4
        add             r0,  r12, r1, lsl #3
        @ Move r0/r12 forward by 2 pixels; we don't need to rewrite the
        @ outermost 2 pixels since they aren't changed.
        add             r12, r12, #2
        add             r0,  r0,  #2

        @ We will only write back the middle 4 pixels; after the loop filter,
        @ these are in q10, q11, q12, q13, ordered as rows (16x4 pixels).
        @ We need to transpose them to columns, done with a 4x4 transpose
        @ (which in practice is four 4x4 transposes of the 4x4 blocks of
        @ the 16x4 pixels; into 4x16 pixels).
        transpose_4x4   q10, q11, q12, q13

        vst1.32         {d20[0]}, [r12], r1
        vst1.32         {d21[0]}, [r0],  r1
        vst1.32         {d22[0]}, [r12], r1
        vst1.32         {d23[0]}, [r0],  r1
        vst1.32         {d24[0]}, [r12], r1
        vst1.32         {d25[0]}, [r0],  r1
        vst1.32         {d26[0]}, [r12], r1
        vst1.32         {d27[0]}, [r0],  r1
        vst1.32         {d20[1]}, [r12], r1
        vst1.32         {d21[1]}, [r0],  r1
        vst1.32         {d22[1]}, [r12], r1
        vst1.32         {d23[1]}, [r0],  r1
        vst1.32         {d24[1]}, [r12], r1
        vst1.32         {d25[1]}, [r0],  r1
        vst1.32         {d26[1]}, [r12], r1
        vst1.32         {d27[1]}, [r0],  r1
9:
        vpop            {q4-q7}
        bx              lr
endfunc

function ff_vp9_loop_filter_v_8_8_neon, export=1
        sub             r12, r0,  r1, lsl #2
        vld1.8          {d20}, [r12,:64], r1 @ p3
        vld1.8          {d24}, [r0, :64], r1 @ q0
        vld1.8          {d21}, [r12,:64], r1 @ p2
        vld1.8          {d25}, [r0, :64], r1 @ q1
        vld1.8          {d22}, [r12,:64], r1 @ p1
        vld1.8          {d26}, [r0, :64], r1 @ q2
        vld1.8          {d23}, [r12,:64], r1 @ p0
        vld1.8          {d27}, [r0, :64], r1 @ q3
        sub             r12, r12, r1, lsl #2
        sub             r0,  r0,  r1, lsl #2
        add             r12, r12, r1

        loop_filter_8

        vst1.8          {d21}, [r12,:64], r1
        vst1.8          {d24}, [r0, :64], r1
        vst1.8          {d22}, [r12,:64], r1
        vst1.8          {d25}, [r0, :64], r1
        vst1.8          {d23}, [r12,:64], r1
        vst1.8          {d26}, [r0, :64], r1
9:
        bx              lr
6:
        sub             r12, r0,  r1, lsl #1
        vst1.8          {d22}, [r12,:64], r1
        vst1.8          {d24}, [r0, :64], r1
        vst1.8          {d23}, [r12,:64], r1
        vst1.8          {d25}, [r0, :64], r1
        bx              lr
endfunc

function ff_vp9_loop_filter_h_8_8_neon, export=1
        sub             r12, r0,  #4
        add             r0,  r12, r1, lsl #2
        vld1.8          {d20}, [r12], r1
        vld1.8          {d24}, [r0],  r1
        vld1.8          {d21}, [r12], r1
        vld1.8          {d25}, [r0],  r1
        vld1.8          {d22}, [r12], r1
        vld1.8          {d26}, [r0],  r1
        vld1.8          {d23}, [r12], r1
        vld1.8          {d27}, [r0],  r1

        sub             r12, r12, r1, lsl #2
        sub             r0,  r0,  r1, lsl #2

        transpose_q_8x8 q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27

        loop_filter_8

        @ Even though only 6 pixels per row have been changed, we write the
        @ full 8 pixel registers.
        transpose_q_8x8 q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27

        vst1.8          {d20}, [r12], r1
        vst1.8          {d24}, [r0],  r1
        vst1.8          {d21}, [r12], r1
        vst1.8          {d25}, [r0],  r1
        vst1.8          {d22}, [r12], r1
        vst1.8          {d26}, [r0],  r1
        vst1.8          {d23}, [r12], r1
        vst1.8          {d27}, [r0],  r1
9:
        bx              lr
6:
        @ If we didn't need to do the flat8in part, we use the same writeback
        @ as in loop_filter_h_4_8.
        add             r12, r12, #2
        add             r0,  r0,  #2
        transpose_q_4x4 q11, q12, d22, d23, d24, d25
        vst1.32         {d22[0]}, [r12], r1
        vst1.32         {d22[1]}, [r0],  r1
        vst1.32         {d23[0]}, [r12], r1
        vst1.32         {d23[1]}, [r0],  r1
        vst1.32         {d24[0]}, [r12], r1
        vst1.32         {d24[1]}, [r0],  r1
        vst1.32         {d25[0]}, [r12], r1
        vst1.32         {d25[1]}, [r0],  r1
        bx              lr
endfunc

function vp9_loop_filter_v_16_neon
        sub             r12, r0,  r1, lsl #3
        @ Read p7-p0 using r12 and q0-q7 using r0
        vld1.8          {d16}, [r12,:64], r1 @ p7
        vld1.8          {d24}, [r0, :64], r1 @ q0
        vld1.8          {d17}, [r12,:64], r1 @ p6
        vld1.8          {d25}, [r0, :64], r1 @ q1
        vld1.8          {d18}, [r12,:64], r1 @ p5
        vld1.8          {d26}, [r0, :64], r1 @ q2
        vld1.8          {d19}, [r12,:64], r1 @ p4
        vld1.8          {d27}, [r0, :64], r1 @ q3
        vld1.8          {d20}, [r12,:64], r1 @ p3
        vld1.8          {d28}, [r0, :64], r1 @ q4
        vld1.8          {d21}, [r12,:64], r1 @ p2
        vld1.8          {d29}, [r0, :64], r1 @ q5
        vld1.8          {d22}, [r12,:64], r1 @ p1
        vld1.8          {d30}, [r0, :64], r1 @ q6
        vld1.8          {d23}, [r12,:64], r1 @ p0
        vld1.8          {d31}, [r0, :64], r1 @ q7
        sub             r12, r12, r1, lsl #3
        sub             r0,  r0,  r1, lsl #3
        add             r12, r12, r1

        loop_filter_16

        @ If we did the flat8out part, we get the output in
        @ d2-d17 (skipping d7 and d16). r12 points to r0 - 7 * stride,
        @ store d2-d9 (skipping d7) there, and d10-d17 (skipping d16) into r0.
        vst1.8          {d2},  [r12,:64], r1
        vst1.8          {d10}, [r0, :64], r1
        vst1.8          {d3},  [r12,:64], r1
        vst1.8          {d11}, [r0, :64], r1
        vst1.8          {d4},  [r12,:64], r1
        vst1.8          {d12}, [r0, :64], r1
        vst1.8          {d5},  [r12,:64], r1
        vst1.8          {d13}, [r0, :64], r1
        vst1.8          {d6},  [r12,:64], r1
        vst1.8          {d14}, [r0, :64], r1
        vst1.8          {d8},  [r12,:64], r1
        vst1.8          {d15}, [r0, :64], r1
        vst1.8          {d9},  [r12,:64], r1
        vst1.8          {d17}, [r0, :64], r1
        sub             r0,  r0,  r1, lsl #3
        add             r0,  r0,  r1

9:
        bx              lr

8:
        add             r12, r12, r1, lsl #2
        @ If we didn't do the flat8out part, the output is left in the
        @ input registers.
        vst1.8          {d21}, [r12,:64], r1
        vst1.8          {d24}, [r0, :64], r1
        vst1.8          {d22}, [r12,:64], r1
        vst1.8          {d25}, [r0, :64], r1
        vst1.8          {d23}, [r12,:64], r1
        vst1.8          {d26}, [r0, :64], r1
        sub             r0,  r0,  r1, lsl #1
        sub             r0,  r0,  r1
        bx              lr
7:
        sub             r12, r0,  r1, lsl #1
        vst1.8          {d22}, [r12,:64], r1
        vst1.8          {d24}, [r0, :64], r1
        vst1.8          {d23}, [r12,:64], r1
        vst1.8          {d25}, [r0, :64], r1
        sub             r0,  r0,  r1, lsl #1
        bx              lr
endfunc

function ff_vp9_loop_filter_v_16_8_neon, export=1
        ldr             r12, [sp]
        push            {lr}
        vpush           {q4-q7}
        push            {r12}
        bl              vp9_loop_filter_v_16_neon
        add             sp,  sp,  #4
        vpop            {q4-q7}
        pop             {pc}
endfunc

function ff_vp9_loop_filter_v_16_16_neon, export=1
        ldr             r12, [sp]
        // The filter clobbers r2 and r3, but we need to keep them for the second round
        push            {r2, r3, lr}
        vpush           {q4-q7}
        push            {r12}
        bl              vp9_loop_filter_v_16_neon
        add             r0,  #8
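        @ The original r2/r3 were pushed before the 64 bytes of vpush and
        @ the 4-byte push of r12, so they are now found at [sp, #68] and
        @ [sp, #72]; reload them for the second 8-pixel half.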
        ldr             r2,  [sp, #68]
        ldr             r3,  [sp, #72]
        bl              vp9_loop_filter_v_16_neon
        add             sp,  sp,  #4
        vpop            {q4-q7}
        pop             {r2, r3, pc}
endfunc

function vp9_loop_filter_h_16_neon
        sub             r12, r0,  #8
        vld1.8          {d16}, [r12,:64], r1
        vld1.8          {d24}, [r0, :64], r1
        vld1.8          {d17}, [r12,:64], r1
        vld1.8          {d25}, [r0, :64], r1
        vld1.8          {d18}, [r12,:64], r1
        vld1.8          {d26}, [r0, :64], r1
        vld1.8          {d19}, [r12,:64], r1
        vld1.8          {d27}, [r0, :64], r1
        vld1.8          {d20}, [r12,:64], r1
        vld1.8          {d28}, [r0, :64], r1
        vld1.8          {d21}, [r12,:64], r1
        vld1.8          {d29}, [r0, :64], r1
        vld1.8          {d22}, [r12,:64], r1
        vld1.8          {d30}, [r0, :64], r1
        vld1.8          {d23}, [r12,:64], r1
        vld1.8          {d31}, [r0, :64], r1
        sub             r0,  r0,  r1, lsl #3
        sub             r12, r12, r1, lsl #3

        @ The 16x8 pixels read above are in two 8x8 blocks; the left
        @ half in d16-d23, and the right half in d24-d31. Do two 8x8 transposes
        @ of this, to get one column per register. This could be done with two
        @ transpose_8x8 as below, but this takes advantage of the q registers.
        transpose16_4x4 q8,  q9,  q10, q11, q12, q13, q14, q15
        vtrn.8          d16, d17
        vtrn.8          d18, d19
        vtrn.8          d20, d21
        vtrn.8          d22, d23
        vtrn.8          d24, d25
        vtrn.8          d26, d27
        vtrn.8          d28, d29
        vtrn.8          d30, d31

        loop_filter_16

        @ Transpose back; this is the same transpose as above, but
        @ we can't take advantage of q registers for the transpose, since
        @ not all d registers in the transpose are consecutive.
        transpose_8x8   d16, d2,  d3,  d4,  d5,  d6,  d8,  d9
        transpose_8x8   d10, d11, d12, d13, d14, d15, d17, d31

        vst1.8          {d16}, [r12,:64], r1
        vst1.8          {d10}, [r0, :64], r1

        vst1.8          {d2},  [r12,:64], r1
        vst1.8          {d11}, [r0, :64], r1

        vst1.8          {d3},  [r12,:64], r1
        vst1.8          {d12}, [r0, :64], r1

        vst1.8          {d4},  [r12,:64], r1
        vst1.8          {d13}, [r0, :64], r1

        vst1.8          {d5},  [r12,:64], r1
        vst1.8          {d14}, [r0, :64], r1

        vst1.8          {d6},  [r12,:64], r1
        vst1.8          {d15}, [r0, :64], r1

        vst1.8          {d8},  [r12,:64], r1
        vst1.8          {d17}, [r0, :64], r1

        vst1.8          {d9},  [r12,:64], r1
        vst1.8          {d31}, [r0, :64], r1
        sub             r0,  r0,  r1, lsl #3
9:
        bx              lr
8:
        @ The same writeback as in loop_filter_h_8_8
        sub             r12, r0,  #4
        add             r0,  r12, r1, lsl #2
        transpose_q_8x8 q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27

        vst1.8          {d20}, [r12], r1
        vst1.8          {d24}, [r0],  r1
        vst1.8          {d21}, [r12], r1
        vst1.8          {d25}, [r0],  r1
        vst1.8          {d22}, [r12], r1
        vst1.8          {d26}, [r0],  r1
        vst1.8          {d23}, [r12], r1
        vst1.8          {d27}, [r0],  r1
        sub             r0,  r0,  r1, lsl #3
        add             r0,  r0,  #4
        bx              lr
7:
        @ The same writeback as in loop_filter_h_4_8
        sub             r12, r0,  #2
        add             r0,  r12, r1, lsl #2
        transpose_q_4x4 q11, q12, d22, d23, d24, d25
        vst1.32         {d22[0]}, [r12], r1
        vst1.32         {d22[1]}, [r0],  r1
        vst1.32         {d23[0]}, [r12], r1
        vst1.32         {d23[1]}, [r0],  r1
        vst1.32         {d24[0]}, [r12], r1
        vst1.32         {d24[1]}, [r0],  r1
        vst1.32         {d25[0]}, [r12], r1
        vst1.32         {d25[1]}, [r0],  r1
        sub             r0,  r0,  r1, lsl #3
        add             r0,  r0,  #2
        bx              lr
endfunc

function ff_vp9_loop_filter_h_16_8_neon, export=1
        ldr             r12, [sp]
        push            {lr}
        vpush           {q4-q7}
        push            {r12}
        bl              vp9_loop_filter_h_16_neon
        add             sp,  sp,  #4
        vpop            {q4-q7}
        pop             {pc}
endfunc

function ff_vp9_loop_filter_h_16_16_neon, export=1
        ldr             r12, [sp]
        // The filter clobbers r2 and r3, but we need to keep them for the second round
        push            {r2, r3, lr}
        vpush           {q4-q7}
        push            {r12}
        bl              vp9_loop_filter_h_16_neon
        add             r0,  r0,  r1, lsl #3
        ldr             r2,  [sp, #68]
        ldr             r3,  [sp, #72]
        bl              vp9_loop_filter_h_16_neon
        add             sp,  sp,  #4
        vpop            {q4-q7}
        pop             {r2, r3, pc}
endfunc