• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1/*
2 * VP8 NEON optimisations
3 *
4 * Copyright (c) 2010 Rob Clark <rob@ti.com>
5 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
6 *
7 * This file is part of FFmpeg.
8 *
9 * FFmpeg is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * FFmpeg is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with FFmpeg; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23
24#include "libavutil/arm/asm.S"
25#include "neon.S"
26
27function ff_vp8_luma_dc_wht_neon, export=1
28        vld1.16         {q0-q1},  [r1,:128]
29        vmov.i16        q15, #0
30
31        vadd.i16        d4,  d0,  d3
32        vadd.i16        d6,  d1,  d2
33        vst1.16         {q15},    [r1,:128]!
34        vsub.i16        d7,  d1,  d2
35        vsub.i16        d5,  d0,  d3
36        vst1.16         {q15},    [r1,:128]
37        vadd.i16        q0,  q2,  q3
38        vsub.i16        q1,  q2,  q3
39
40        vmov.i16        q8, #3
41
42        vtrn.32         d0,  d2
43        vtrn.32         d1,  d3
44        vtrn.16         d0,  d1
45        vtrn.16         d2,  d3
46
47        vadd.i16        d0,  d0,  d16
48
49        vadd.i16        d4,  d0,  d3
50        vadd.i16        d6,  d1,  d2
51        vsub.i16        d7,  d1,  d2
52        vsub.i16        d5,  d0,  d3
53        vadd.i16        q0,  q2,  q3
54        vsub.i16        q1,  q2,  q3
55
56        vshr.s16        q0,  q0,  #3
57        vshr.s16        q1,  q1,  #3
58
59        mov             r3,  #32
60        vst1.16         {d0[0]},  [r0,:16], r3
61        vst1.16         {d1[0]},  [r0,:16], r3
62        vst1.16         {d2[0]},  [r0,:16], r3
63        vst1.16         {d3[0]},  [r0,:16], r3
64        vst1.16         {d0[1]},  [r0,:16], r3
65        vst1.16         {d1[1]},  [r0,:16], r3
66        vst1.16         {d2[1]},  [r0,:16], r3
67        vst1.16         {d3[1]},  [r0,:16], r3
68        vst1.16         {d0[2]},  [r0,:16], r3
69        vst1.16         {d1[2]},  [r0,:16], r3
70        vst1.16         {d2[2]},  [r0,:16], r3
71        vst1.16         {d3[2]},  [r0,:16], r3
72        vst1.16         {d0[3]},  [r0,:16], r3
73        vst1.16         {d1[3]},  [r0,:16], r3
74        vst1.16         {d2[3]},  [r0,:16], r3
75        vst1.16         {d3[3]},  [r0,:16], r3
76
77        bx              lr
78endfunc
79
80function ff_vp8_idct_add_neon, export=1
81        vld1.16         {q0-q1},  [r1,:128]
82        movw            r3,  #20091
83        movt            r3,  #35468/2
84        vdup.32         d4,  r3
85
86        vmull.s16       q12, d1,  d4[0]
87        vmull.s16       q13, d3,  d4[0]
88        vqdmulh.s16     d20, d1,  d4[1]
89        vqdmulh.s16     d23, d3,  d4[1]
90        vshrn.s32       d21, q12, #16
91        vshrn.s32       d22, q13, #16
92        vadd.s16        d21, d21, d1
93        vadd.s16        d22, d22, d3
94
95        vadd.s16        d16, d0,  d2
96        vsub.s16        d17, d0,  d2
97        vadd.s16        d18, d21, d23
98        vsub.s16        d19, d20, d22
99        vadd.s16        q0,  q8,  q9
100        vsub.s16        q1,  q8,  q9
101
102        vtrn.32         d0,  d3
103        vtrn.32         d1,  d2
104        vtrn.16         d0,  d1
105        vtrn.16         d3,  d2
106
107        vmov.i16        q15, #0
108        vmull.s16       q12, d1,  d4[0]
109        vst1.16         {q15},    [r1,:128]!
110        vmull.s16       q13, d2,  d4[0]
111        vst1.16         {q15},    [r1,:128]
112        vqdmulh.s16     d21, d1,  d4[1]
113        vqdmulh.s16     d23, d2,  d4[1]
114        vshrn.s32       d20, q12, #16
115        vshrn.s32       d22, q13, #16
116        vadd.i16        d20, d20, d1
117        vadd.i16        d22, d22, d2
118
119        vadd.i16        d16, d0,  d3
120        vsub.i16        d17, d0,  d3
121        vadd.i16        d18, d20, d23
122        vld1.32         {d20[]},  [r0,:32], r2
123        vsub.i16        d19, d21, d22
124        vld1.32         {d22[]},  [r0,:32], r2
125        vadd.s16        q0,  q8,  q9
126        vld1.32         {d23[]},  [r0,:32], r2
127        vsub.s16        q1,  q8,  q9
128        vld1.32         {d21[]},  [r0,:32], r2
129        vrshr.s16       q0,  q0,  #3
130        vtrn.32         q10, q11
131        vrshr.s16       q1,  q1,  #3
132
133        sub             r0,  r0,  r2,  lsl #2
134
135        vtrn.32         d0,  d3
136        vtrn.32         d1,  d2
137        vtrn.16         d0,  d1
138        vtrn.16         d3,  d2
139
140        vaddw.u8        q0,  q0,  d20
141        vaddw.u8        q1,  q1,  d21
142        vqmovun.s16     d0,  q0
143        vqmovun.s16     d1,  q1
144
145        vst1.32         {d0[0]},  [r0,:32], r2
146        vst1.32         {d0[1]},  [r0,:32], r2
147        vst1.32         {d1[1]},  [r0,:32], r2
148        vst1.32         {d1[0]},  [r0,:32], r2
149
150        bx              lr
151endfunc
152
153function ff_vp8_idct_dc_add_neon, export=1
154        mov             r3,  #0
155        ldrsh           r12, [r1]
156        strh            r3,  [r1]
157        vdup.16         q1,  r12
158        vrshr.s16       q1,  q1,  #3
159        vld1.32         {d0[]},   [r0,:32], r2
160        vld1.32         {d1[]},   [r0,:32], r2
161        vld1.32         {d0[1]},  [r0,:32], r2
162        vld1.32         {d1[1]},  [r0,:32], r2
163        vaddw.u8        q2,  q1,  d0
164        vaddw.u8        q3,  q1,  d1
165        sub             r0,  r0,  r2, lsl #2
166        vqmovun.s16     d0,  q2
167        vqmovun.s16     d1,  q3
168        vst1.32         {d0[0]},  [r0,:32], r2
169        vst1.32         {d1[0]},  [r0,:32], r2
170        vst1.32         {d0[1]},  [r0,:32], r2
171        vst1.32         {d1[1]},  [r0,:32], r2
172        bx              lr
173endfunc
174
175function ff_vp8_idct_dc_add4uv_neon, export=1
176        vmov.i16        d0,  #0
177        mov             r3,  #32
178        vld1.16         {d16[]},  [r1,:16]
179        vst1.16         {d0[0]},  [r1,:16], r3
180        vld1.16         {d17[]},  [r1,:16]
181        vst1.16         {d0[0]},  [r1,:16], r3
182        vld1.16         {d18[]},  [r1,:16]
183        vst1.16         {d0[0]},  [r1,:16], r3
184        vld1.16         {d19[]},  [r1,:16]
185        vst1.16         {d0[0]},  [r1,:16], r3
186        mov             r3,  r0
187        vrshr.s16       q8,  q8,  #3            @ dc >>= 3
188        vld1.8          {d0},     [r0,:64], r2
189        vrshr.s16       q9,  q9,  #3
190        vld1.8          {d1},     [r0,:64], r2
191        vaddw.u8        q10, q8,  d0
192        vld1.8          {d2},     [r0,:64], r2
193        vaddw.u8        q0,  q8,  d1
194        vld1.8          {d3},     [r0,:64], r2
195        vaddw.u8        q11, q8,  d2
196        vld1.8          {d4},     [r0,:64], r2
197        vaddw.u8        q1,  q8,  d3
198        vld1.8          {d5},     [r0,:64], r2
199        vaddw.u8        q12, q9,  d4
200        vld1.8          {d6},     [r0,:64], r2
201        vaddw.u8        q2,  q9,  d5
202        vld1.8          {d7},     [r0,:64], r2
203        vaddw.u8        q13, q9,  d6
204        vqmovun.s16     d20, q10
205        vaddw.u8        q3,  q9,  d7
206        vqmovun.s16     d21, q0
207        vqmovun.s16     d22, q11
208        vst1.8          {d20},    [r3,:64], r2
209        vqmovun.s16     d23, q1
210        vst1.8          {d21},    [r3,:64], r2
211        vqmovun.s16     d24, q12
212        vst1.8          {d22},    [r3,:64], r2
213        vqmovun.s16     d25, q2
214        vst1.8          {d23},    [r3,:64], r2
215        vqmovun.s16     d26, q13
216        vst1.8          {d24},    [r3,:64], r2
217        vqmovun.s16     d27, q3
218        vst1.8          {d25},    [r3,:64], r2
219        vst1.8          {d26},    [r3,:64], r2
220        vst1.8          {d27},    [r3,:64], r2
221
222        bx              lr
223endfunc
224
225function ff_vp8_idct_dc_add4y_neon, export=1
226        vmov.i16        d0,  #0
227        mov             r3,  #32
228        vld1.16         {d16[]},  [r1,:16]
229        vst1.16         {d0[0]},  [r1,:16], r3
230        vld1.16         {d17[]},  [r1,:16]
231        vst1.16         {d0[0]},  [r1,:16], r3
232        vld1.16         {d18[]},  [r1,:16]
233        vst1.16         {d0[0]},  [r1,:16], r3
234        vld1.16         {d19[]},  [r1,:16]
235        vst1.16         {d0[0]},  [r1,:16], r3
236        vrshr.s16       q8,  q8,  #3            @ dc >>= 3
237        vld1.8          {q0},     [r0,:128], r2
238        vrshr.s16       q9,  q9,  #3
239        vld1.8          {q1},     [r0,:128], r2
240        vaddw.u8        q10, q8,  d0
241        vld1.8          {q2},     [r0,:128], r2
242        vaddw.u8        q0,  q9,  d1
243        vld1.8          {q3},     [r0,:128], r2
244        vaddw.u8        q11, q8,  d2
245        vaddw.u8        q1,  q9,  d3
246        vaddw.u8        q12, q8,  d4
247        vaddw.u8        q2,  q9,  d5
248        vaddw.u8        q13, q8,  d6
249        vaddw.u8        q3,  q9,  d7
250        sub             r0,  r0,  r2,  lsl #2
251        vqmovun.s16     d20, q10
252        vqmovun.s16     d21, q0
253        vqmovun.s16     d22, q11
254        vqmovun.s16     d23, q1
255        vqmovun.s16     d24, q12
256        vst1.8          {q10},    [r0,:128], r2
257        vqmovun.s16     d25, q2
258        vst1.8          {q11},    [r0,:128], r2
259        vqmovun.s16     d26, q13
260        vst1.8          {q12},    [r0,:128], r2
261        vqmovun.s16     d27, q3
262        vst1.8          {q13},    [r0,:128], r2
263
264        bx              lr
265endfunc
266
267@ Register layout:
268@   P3..Q3 -> q0..q7
269@   flim_E -> q14
270@   flim_I -> q15
271@   hev_thresh -> r12
272@
273.macro  vp8_loop_filter, inner=0, simple=0
274    .if \simple
275        vabd.u8         q9,  q3,  q4            @ abs(P0-Q0)
276        vabd.u8         q15, q2,  q5            @ abs(P1-Q1)
277        vqadd.u8        q9,  q9,  q9            @ abs(P0-Q0) * 2
278        vshr.u8         q10, q15, #1            @ abs(P1-Q1) / 2
279        vqadd.u8        q11, q9,  q10           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
280        vmov.i8         q13, #0x80
281        vcle.u8         q8,  q11, q14           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim
282    .else
283        @ calculate hev and normal_limit:
284        vabd.u8         q12, q2,  q3            @ abs(P1-P0)
285        vabd.u8         q13, q5,  q4            @ abs(Q1-Q0)
286        vabd.u8         q10, q0,  q1            @ abs(P3-P2)
287        vabd.u8         q11, q1,  q2            @ abs(P2-P1)
288        vcle.u8         q8,  q12, q15           @ abs(P1-P0) <= flim_I
289        vcle.u8         q9,  q13, q15           @ abs(Q1-Q0) <= flim_I
290        vcle.u8         q10, q10, q15           @ abs(P3-P2) <= flim_I
291        vcle.u8         q11, q11, q15           @ abs(P2-P1) <= flim_I
292        vand            q8,  q8,  q9
293        vabd.u8         q9,  q7,  q6            @ abs(Q3-Q2)
294        vand            q8,  q8,  q11
295        vabd.u8         q11, q6,  q5            @ abs(Q2-Q1)
296        vand            q8,  q8,  q10
297        vcle.u8         q10, q9,  q15           @ abs(Q3-Q2) <= flim_I
298        vcle.u8         q11, q11, q15           @ abs(Q2-Q1) <= flim_I
299        vabd.u8         q9,  q3,  q4            @ abs(P0-Q0)
300        vabd.u8         q15, q2,  q5            @ abs(P1-Q1)
301        vand            q8,  q8,  q10
302        vqadd.u8        q9,  q9,  q9            @ abs(P0-Q0) * 2
303        vand            q8,  q8,  q11
304        vshr.u8         q10, q15, #1            @ abs(P1-Q1) / 2
305        vdup.8          q15, r12                @ hev_thresh
306        vqadd.u8        q11, q9,  q10           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
307        vcgt.u8         q12, q12, q15           @ abs(P1-P0) > hev_thresh
308        vcle.u8         q11, q11, q14           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim_E
309        vcgt.u8         q14, q13, q15           @ abs(Q1-Q0) > hev_thresh
310        vand            q8,  q8,  q11
311        vmov.i8         q13, #0x80
312        vorr            q9,  q12, q14
313    .endif
314
315        @ at this point:
316        @   q8: normal_limit
317        @   q9: hev
318
319        @ convert to signed value:
320        veor            q3,  q3,  q13           @ PS0 = P0 ^ 0x80
321        veor            q4,  q4,  q13           @ QS0 = Q0 ^ 0x80
322
323        vmov.i16        q12, #3
324        vsubl.s8        q10, d8,  d6            @ QS0 - PS0
325        vsubl.s8        q11, d9,  d7            @   (widened to 16 bits)
326        veor            q2,  q2,  q13           @ PS1 = P1 ^ 0x80
327        veor            q5,  q5,  q13           @ QS1 = Q1 ^ 0x80
328        vmul.i16        q10, q10, q12           @ w = 3 * (QS0 - PS0)
329        vmul.i16        q11, q11, q12
330
331        vqsub.s8        q12, q2,  q5            @ clamp(PS1-QS1)
332        vmov.i8         q14, #4
333        vmov.i8         q15, #3
334    .if \inner
335        vand            q12, q12, q9            @ if(hev) w += clamp(PS1-QS1)
336    .endif
337        vaddw.s8        q10, q10, d24           @ w += clamp(PS1-QS1)
338        vaddw.s8        q11, q11, d25
339        vqmovn.s16      d20, q10                @ narrow result back into q10
340        vqmovn.s16      d21, q11
341    .if !\inner && !\simple
342        veor            q1,  q1,  q13           @ PS2 = P2 ^ 0x80
343        veor            q6,  q6,  q13           @ QS2 = Q2 ^ 0x80
344    .endif
345        vand            q10, q10, q8            @ w &= normal_limit
346
347        @ registers used at this point..
348        @   q0 -> P3  (don't corrupt)
349        @   q1-q6 -> PS2-QS2
350        @   q7 -> Q3  (don't corrupt)
351        @   q9 -> hev
352        @   q10 -> w
353        @   q13 -> #0x80
354        @   q14 -> #4
355        @   q15 -> #3
356        @   q8, q11, q12 -> unused
357
358        @ filter_common:   is4tap==1
359        @   c1 = clamp(w + 4) >> 3;
360        @   c2 = clamp(w + 3) >> 3;
361        @   Q0 = s2u(QS0 - c1);
362        @   P0 = s2u(PS0 + c2);
363
364    .if \simple
365        vqadd.s8        q11, q10, q14           @ c1 = clamp((w&hev)+4)
366        vqadd.s8        q12, q10, q15           @ c2 = clamp((w&hev)+3)
367        vshr.s8         q11, q11, #3            @ c1 >>= 3
368        vshr.s8         q12, q12, #3            @ c2 >>= 3
369        vqsub.s8        q4,  q4,  q11           @ QS0 = clamp(QS0-c1)
370        vqadd.s8        q3,  q3,  q12           @ PS0 = clamp(PS0+c2)
371        veor            q4,  q4,  q13           @ Q0 = QS0 ^ 0x80
372        veor            q3,  q3,  q13           @ P0 = PS0 ^ 0x80
373        veor            q5,  q5,  q13           @ Q1 = QS1 ^ 0x80
374        veor            q2,  q2,  q13           @ P1 = PS1 ^ 0x80
375    .elseif \inner
376        @ the !is4tap case of filter_common, only used for inner blocks
377        @   c3 = ((c1&~hev) + 1) >> 1;
378        @   Q1 = s2u(QS1 - c3);
379        @   P1 = s2u(PS1 + c3);
380        vqadd.s8        q11, q10, q14           @ c1 = clamp((w&hev)+4)
381        vqadd.s8        q12, q10, q15           @ c2 = clamp((w&hev)+3)
382        vshr.s8         q11, q11, #3            @ c1 >>= 3
383        vshr.s8         q12, q12, #3            @ c2 >>= 3
384        vqsub.s8        q4,  q4,  q11           @ QS0 = clamp(QS0-c1)
385        vqadd.s8        q3,  q3,  q12           @ PS0 = clamp(PS0+c2)
386        vbic            q11, q11, q9            @ c1 & ~hev
387        veor            q4,  q4,  q13           @ Q0 = QS0 ^ 0x80
388        vrshr.s8        q11, q11, #1            @ c3 >>= 1
389        veor            q3,  q3,  q13           @ P0 = PS0 ^ 0x80
390        vqsub.s8        q5,  q5,  q11           @ QS1 = clamp(QS1-c3)
391        vqadd.s8        q2,  q2,  q11           @ PS1 = clamp(PS1+c3)
392        veor            q5,  q5,  q13           @ Q1 = QS1 ^ 0x80
393        veor            q2,  q2,  q13           @ P1 = PS1 ^ 0x80
394    .else
395        vand            q12, q10, q9            @ w & hev
396        vqadd.s8        q11, q12, q14           @ c1 = clamp((w&hev)+4)
397        vqadd.s8        q12, q12, q15           @ c2 = clamp((w&hev)+3)
398        vshr.s8         q11, q11, #3            @ c1 >>= 3
399        vshr.s8         q12, q12, #3            @ c2 >>= 3
400        vbic            q10, q10, q9            @ w &= ~hev
401        vqsub.s8        q4,  q4,  q11           @ QS0 = clamp(QS0-c1)
402        vqadd.s8        q3,  q3,  q12           @ PS0 = clamp(PS0+c2)
403
404        @ filter_mbedge:
405        @   a = clamp((27*w + 63) >> 7);
406        @   Q0 = s2u(QS0 - a);
407        @   P0 = s2u(PS0 + a);
408        @   a = clamp((18*w + 63) >> 7);
409        @   Q1 = s2u(QS1 - a);
410        @   P1 = s2u(PS1 + a);
411        @   a = clamp((9*w + 63) >> 7);
412        @   Q2 = s2u(QS2 - a);
413        @   P2 = s2u(PS2 + a);
414        vmov.i16        q9,  #63
415        vshll.s8        q14, d20, #3
416        vshll.s8        q15, d21, #3
417        vaddw.s8        q14, q14, d20
418        vaddw.s8        q15, q15, d21
419        vadd.s16        q8,  q9,  q14
420        vadd.s16        q9,  q9,  q15           @  9*w + 63
421        vadd.s16        q11, q8,  q14
422        vadd.s16        q12, q9,  q15           @ 18*w + 63
423        vadd.s16        q14, q11, q14
424        vadd.s16        q15, q12, q15           @ 27*w + 63
425        vqshrn.s16      d16, q8,  #7
426        vqshrn.s16      d17, q9,  #7            @ clamp(( 9*w + 63)>>7)
427        vqshrn.s16      d22, q11, #7
428        vqshrn.s16      d23, q12, #7            @ clamp((18*w + 63)>>7)
429        vqshrn.s16      d28, q14, #7
430        vqshrn.s16      d29, q15, #7            @ clamp((27*w + 63)>>7)
431        vqadd.s8        q1,  q1,  q8            @ PS2 = clamp(PS2+a)
432        vqsub.s8        q6,  q6,  q8            @ QS2 = clamp(QS2-a)
433        vqadd.s8        q2,  q2,  q11           @ PS1 = clamp(PS1+a)
434        vqsub.s8        q5,  q5,  q11           @ QS1 = clamp(QS1-a)
435        vqadd.s8        q3,  q3,  q14           @ PS0 = clamp(PS0+a)
436        vqsub.s8        q4,  q4,  q14           @ QS0 = clamp(QS0-a)
437        veor            q3,  q3,  q13           @ P0 = PS0 ^ 0x80
438        veor            q4,  q4,  q13           @ Q0 = QS0 ^ 0x80
439        veor            q2,  q2,  q13           @ P1 = PS1 ^ 0x80
440        veor            q5,  q5,  q13           @ Q1 = QS1 ^ 0x80
441        veor            q1,  q1,  q13           @ P2 = PS2 ^ 0x80
442        veor            q6,  q6,  q13           @ Q2 = QS2 ^ 0x80
443    .endif
444.endm
445
446.macro  vp8_v_loop_filter16 name, inner=0, simple=0
447function ff_vp8_v_loop_filter16\name\()_neon, export=1
448        vpush           {q4-q7}
449        sub             r0,  r0,  r1,  lsl #1+!\simple
450
451        @ Load pixels:
452    .if !\simple
453        ldr             r12, [sp, #64]          @ hev_thresh
454        vld1.8          {q0},     [r0,:128], r1 @ P3
455        vld1.8          {q1},     [r0,:128], r1 @ P2
456    .endif
457        vld1.8          {q2},     [r0,:128], r1 @ P1
458        vld1.8          {q3},     [r0,:128], r1 @ P0
459        vld1.8          {q4},     [r0,:128], r1 @ Q0
460        vld1.8          {q5},     [r0,:128], r1 @ Q1
461    .if !\simple
462        vld1.8          {q6},     [r0,:128], r1 @ Q2
463        vld1.8          {q7},     [r0,:128]     @ Q3
464        vdup.8          q15, r3                 @ flim_I
465    .endif
466        vdup.8          q14, r2                 @ flim_E
467
468        vp8_loop_filter inner=\inner, simple=\simple
469
470        @ back up to P2:  dst -= stride * 6
471        sub             r0,  r0,  r1,  lsl #2
472    .if !\simple
473        sub             r0,  r0,  r1,  lsl #1
474
475        @ Store pixels:
476        vst1.8          {q1},     [r0,:128], r1 @ P2
477    .endif
478        vst1.8          {q2},     [r0,:128], r1 @ P1
479        vst1.8          {q3},     [r0,:128], r1 @ P0
480        vst1.8          {q4},     [r0,:128], r1 @ Q0
481        vst1.8          {q5},     [r0,:128], r1 @ Q1
482    .if !\simple
483        vst1.8          {q6},     [r0,:128]     @ Q2
484    .endif
485
486        vpop            {q4-q7}
487        bx              lr
488endfunc
489.endm
490
491vp8_v_loop_filter16
492vp8_v_loop_filter16 _inner,  inner=1
493vp8_v_loop_filter16 _simple, simple=1
494
495.macro  vp8_v_loop_filter8uv name, inner=0
496function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
497        vpush           {q4-q7}
498        sub             r0,  r0,  r2,  lsl #2
499        sub             r1,  r1,  r2,  lsl #2
500        ldr             r12, [sp, #64]          @ flim_I
501
502        @ Load pixels:
503        vld1.8          {d0},     [r0,:64], r2  @ P3
504        vld1.8          {d1},     [r1,:64], r2  @ P3
505        vld1.8          {d2},     [r0,:64], r2  @ P2
506        vld1.8          {d3},     [r1,:64], r2  @ P2
507        vld1.8          {d4},     [r0,:64], r2  @ P1
508        vld1.8          {d5},     [r1,:64], r2  @ P1
509        vld1.8          {d6},     [r0,:64], r2  @ P0
510        vld1.8          {d7},     [r1,:64], r2  @ P0
511        vld1.8          {d8},     [r0,:64], r2  @ Q0
512        vld1.8          {d9},     [r1,:64], r2  @ Q0
513        vld1.8          {d10},    [r0,:64], r2  @ Q1
514        vld1.8          {d11},    [r1,:64], r2  @ Q1
515        vld1.8          {d12},    [r0,:64], r2  @ Q2
516        vld1.8          {d13},    [r1,:64], r2  @ Q2
517        vld1.8          {d14},    [r0,:64]      @ Q3
518        vld1.8          {d15},    [r1,:64]      @ Q3
519
520        vdup.8          q14, r3                 @ flim_E
521        vdup.8          q15, r12                @ flim_I
522        ldr             r12, [sp, #68]          @ hev_thresh
523
524        vp8_loop_filter inner=\inner
525
526        @ back up to P2:  u,v -= stride * 6
527        sub             r0,  r0,  r2,  lsl #2
528        sub             r1,  r1,  r2,  lsl #2
529        sub             r0,  r0,  r2,  lsl #1
530        sub             r1,  r1,  r2,  lsl #1
531
532        @ Store pixels:
533        vst1.8          {d2},     [r0,:64], r2  @ P2
534        vst1.8          {d3},     [r1,:64], r2  @ P2
535        vst1.8          {d4},     [r0,:64], r2  @ P1
536        vst1.8          {d5},     [r1,:64], r2  @ P1
537        vst1.8          {d6},     [r0,:64], r2  @ P0
538        vst1.8          {d7},     [r1,:64], r2  @ P0
539        vst1.8          {d8},     [r0,:64], r2  @ Q0
540        vst1.8          {d9},     [r1,:64], r2  @ Q0
541        vst1.8          {d10},    [r0,:64], r2  @ Q1
542        vst1.8          {d11},    [r1,:64], r2  @ Q1
543        vst1.8          {d12},    [r0,:64]      @ Q2
544        vst1.8          {d13},    [r1,:64]      @ Q2
545
546        vpop            {q4-q7}
547        bx              lr
548endfunc
549.endm
550
551vp8_v_loop_filter8uv
552vp8_v_loop_filter8uv _inner, inner=1
553
554.macro  vp8_h_loop_filter16 name, inner=0, simple=0
555function ff_vp8_h_loop_filter16\name\()_neon, export=1
556        vpush           {q4-q7}
557        sub             r0,  r0,  #4
558    .if !\simple
559        ldr             r12, [sp, #64]          @ hev_thresh
560    .endif
561
562        @ Load pixels:
563        vld1.8          {d0},     [r0], r1      @ load first 8-line src data
564        vld1.8          {d2},     [r0], r1
565        vld1.8          {d4},     [r0], r1
566        vld1.8          {d6},     [r0], r1
567        vld1.8          {d8},     [r0], r1
568        vld1.8          {d10},    [r0], r1
569        vld1.8          {d12},    [r0], r1
570        vld1.8          {d14},    [r0], r1
571        vld1.8          {d1},     [r0], r1      @ load second 8-line src data
572        vld1.8          {d3},     [r0], r1
573        vld1.8          {d5},     [r0], r1
574        vld1.8          {d7},     [r0], r1
575        vld1.8          {d9},     [r0], r1
576        vld1.8          {d11},    [r0], r1
577        vld1.8          {d13},    [r0], r1
578        vld1.8          {d15},    [r0], r1
579
580        transpose_8x8   q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7
581
582        vdup.8          q14, r2                 @ flim_E
583    .if !\simple
584        vdup.8          q15, r3                 @ flim_I
585    .endif
586
587        vp8_loop_filter inner=\inner, simple=\simple
588
589        sub             r0,  r0,  r1, lsl #4    @ backup 16 rows
590
591        transpose_8x8   q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7
592
593        @ Store pixels:
594        vst1.8          {d0},     [r0],     r1
595        vst1.8          {d2},     [r0],     r1
596        vst1.8          {d4},     [r0],     r1
597        vst1.8          {d6},     [r0],     r1
598        vst1.8          {d8},     [r0],     r1
599        vst1.8          {d10},    [r0],     r1
600        vst1.8          {d12},    [r0],     r1
601        vst1.8          {d14},    [r0],     r1
602        vst1.8          {d1},     [r0],     r1
603        vst1.8          {d3},     [r0],     r1
604        vst1.8          {d5},     [r0],     r1
605        vst1.8          {d7},     [r0],     r1
606        vst1.8          {d9},     [r0],     r1
607        vst1.8          {d11},    [r0],     r1
608        vst1.8          {d13},    [r0],     r1
609        vst1.8          {d15},    [r0]
610
611        vpop            {q4-q7}
612        bx              lr
613endfunc
614.endm
615
616vp8_h_loop_filter16
617vp8_h_loop_filter16 _inner,  inner=1
618vp8_h_loop_filter16 _simple, simple=1
619
620.macro  vp8_h_loop_filter8uv name, inner=0
621function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
622        vpush           {q4-q7}
623        sub             r0,  r0,  #4
624        sub             r1,  r1,  #4
625        ldr             r12, [sp, #64]          @ flim_I
626
627        @ Load pixels:
628        vld1.8          {d0},     [r0], r2      @ load u
629        vld1.8          {d1},     [r1], r2      @ load v
630        vld1.8          {d2},     [r0], r2
631        vld1.8          {d3},     [r1], r2
632        vld1.8          {d4},     [r0], r2
633        vld1.8          {d5},     [r1], r2
634        vld1.8          {d6},     [r0], r2
635        vld1.8          {d7},     [r1], r2
636        vld1.8          {d8},     [r0], r2
637        vld1.8          {d9},     [r1], r2
638        vld1.8          {d10},    [r0], r2
639        vld1.8          {d11},    [r1], r2
640        vld1.8          {d12},    [r0], r2
641        vld1.8          {d13},    [r1], r2
642        vld1.8          {d14},    [r0], r2
643        vld1.8          {d15},    [r1], r2
644
645        transpose_8x8   q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7
646
647        vdup.8          q14, r3                 @ flim_E
648        vdup.8          q15, r12                @ flim_I
649        ldr             r12, [sp, #68]          @ hev_thresh
650
651        vp8_loop_filter inner=\inner
652
653        sub             r0,  r0,  r2, lsl #3    @ backup u 8 rows
654        sub             r1,  r1,  r2, lsl #3    @ backup v 8 rows
655
656        transpose_8x8   q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7
657
658        @ Store pixels:
659        vst1.8          {d0},     [r0], r2
660        vst1.8          {d1},     [r1], r2
661        vst1.8          {d2},     [r0], r2
662        vst1.8          {d3},     [r1], r2
663        vst1.8          {d4},     [r0], r2
664        vst1.8          {d5},     [r1], r2
665        vst1.8          {d6},     [r0], r2
666        vst1.8          {d7},     [r1], r2
667        vst1.8          {d8},     [r0], r2
668        vst1.8          {d9},     [r1], r2
669        vst1.8          {d10},    [r0], r2
670        vst1.8          {d11},    [r1], r2
671        vst1.8          {d12},    [r0], r2
672        vst1.8          {d13},    [r1], r2
673        vst1.8          {d14},    [r0]
674        vst1.8          {d15},    [r1]
675
676        vpop            {q4-q7}
677        bx              lr
678endfunc
679.endm
680
681vp8_h_loop_filter8uv
682vp8_h_loop_filter8uv _inner, inner=1
683
684function ff_put_vp8_pixels16_neon, export=1
685        ldr             r12, [sp, #0]           @ h
6861:
687        subs            r12, r12, #4
688        vld1.8          {q0},     [r2], r3
689        vld1.8          {q1},     [r2], r3
690        vld1.8          {q2},     [r2], r3
691        vld1.8          {q3},     [r2], r3
692        vst1.8          {q0},     [r0,:128], r1
693        vst1.8          {q1},     [r0,:128], r1
694        vst1.8          {q2},     [r0,:128], r1
695        vst1.8          {q3},     [r0,:128], r1
696        bgt             1b
697        bx              lr
698endfunc
699
700function ff_put_vp8_pixels8_neon, export=1
701        ldr             r12, [sp, #0]           @ h
7021:
703        subs            r12, r12, #4
704        vld1.8          {d0},     [r2], r3
705        vld1.8          {d1},     [r2], r3
706        vld1.8          {d2},     [r2], r3
707        vld1.8          {d3},     [r2], r3
708        vst1.8          {d0},     [r0,:64], r1
709        vst1.8          {d1},     [r0,:64], r1
710        vst1.8          {d2},     [r0,:64], r1
711        vst1.8          {d3},     [r0,:64], r1
712        bgt             1b
713        bx              lr
714endfunc
715
716/* 4/6-tap 8th-pel MC */
717
718.macro  vp8_epel8_h6    d,   a,   b
719        vext.8          d27, \a,  \b,  #1
720        vmovl.u8        q8,  \a
721        vext.8          d28, \a,  \b,  #2
722        vmovl.u8        q9,  d27
723        vext.8          d29, \a,  \b,  #3
724        vmovl.u8        q10, d28
725        vext.8          d30, \a,  \b,  #4
726        vmovl.u8        q11, d29
727        vext.8          d31, \a,  \b,  #5
728        vmovl.u8        q12, d30
729        vmul.u16        q10, q10, d0[2]
730        vmovl.u8        q13, d31
731        vmul.u16        q11, q11, d0[3]
732        vmls.u16        q10, q9,  d0[1]
733        vmls.u16        q11, q12, d1[0]
734        vmla.u16        q10, q8,  d0[0]
735        vmla.u16        q11, q13, d1[1]
736        vqadd.s16       q11, q10, q11
737        vqrshrun.s16    \d,  q11, #7
738.endm
739
740.macro  vp8_epel16_h6   d0,  d1,  s0,  s1,  s2,  q0,  q1
741        vext.8          q14, \q0, \q1, #3
742        vext.8          q15, \q0, \q1, #4
743        vmovl.u8        q11, d28
744        vmovl.u8        q14, d29
745        vext.8          q3,  \q0, \q1, #2
746        vmovl.u8        q12, d30
747        vmovl.u8        q15, d31
748        vext.8          q8,  \q0, \q1, #1
749        vmovl.u8        q10, d6
750        vmovl.u8        q3,  d7
751        vext.8          q2,  \q0, \q1, #5
752        vmovl.u8        q13, d4
753        vmovl.u8        q2,  d5
754        vmovl.u8        q9,  d16
755        vmovl.u8        q8,  d17
756        vmul.u16        q11, q11, d0[3]
757        vmul.u16        q10, q10, d0[2]
758        vmul.u16        q3,  q3,  d0[2]
759        vmul.u16        q14, q14, d0[3]
760        vmls.u16        q11, q12, d1[0]
761        vmovl.u8        q12, \s0
762        vmovl.u8        q1,  \s1
763        vmls.u16        q10, q9,  d0[1]
764        vmls.u16        q3,  q8,  d0[1]
765        vmls.u16        q14, q15, d1[0]
766        vmla.u16        q10, q12, d0[0]
767        vmla.u16        q11, q13, d1[1]
768        vmla.u16        q3,  q1,  d0[0]
769        vmla.u16        q14, q2,  d1[1]
770        vqadd.s16       q11, q10, q11
771        vqadd.s16       q14, q3,  q14
772        vqrshrun.s16    \d0, q11, #7
773        vqrshrun.s16    \d1, q14, #7
774.endm
775
776.macro  vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6
777        vmovl.u8        q10, \s0
778        vmovl.u8        q11, \s3
779        vmovl.u8        q14, \s6
780        vmovl.u8        q9,  \s1
781        vmovl.u8        q12, \s4
782        vmovl.u8        q8,  \s2
783        vmovl.u8        q13, \s5
784        vmul.u16        q10, q10, d0[0]
785        vmul.u16        q15, q11, d0[3]
786        vmul.u16        q11, q11, d0[2]
787        vmul.u16        q14, q14, d1[1]
788        vmls.u16        q10, q9,  d0[1]
789        vmls.u16        q15, q12, d1[0]
790        vmls.u16        q11, q8,  d0[1]
791        vmls.u16        q14, q13, d1[0]
792        vmla.u16        q10, q8,  d0[2]
793        vmla.u16        q15, q13, d1[1]
794        vmla.u16        q11, q9,  d0[0]
795        vmla.u16        q14, q12, d0[3]
796        vqadd.s16       q15, q10, q15
797        vqadd.s16       q14, q11, q14
798        vqrshrun.s16    \d0, q15, #7
799        vqrshrun.s16    \d1, q14, #7
800.endm
801
802.macro  vp8_epel8_h4    d,   a,   b
803        vext.8          d28, \a,  \b,  #1
804        vmovl.u8        q9,  \a
805        vext.8          d29, \a,  \b,  #2
806        vmovl.u8        q10, d28
807        vext.8          d30, \a,  \b,  #3
808        vmovl.u8        q11, d29
809        vmovl.u8        q12, d30
810        vmul.u16        q10, q10, d0[2]
811        vmul.u16        q11, q11, d0[3]
812        vmls.u16        q10, q9,  d0[1]
813        vmls.u16        q11, q12, d1[0]
814        vqadd.s16       q11, q10, q11
815        vqrshrun.s16    \d,  q11, #7
816.endm
817
818.macro  vp8_epel8_v4_y2 d0,  d1,  s0,  s1,  s2,  s3,  s4
819        vmovl.u8        q9,  \s0
820        vmovl.u8        q10, \s1
821        vmovl.u8        q11, \s2
822        vmovl.u8        q12, \s3
823        vmovl.u8        q13, \s4
824        vmul.u16        q8,  q10, d0[2]
825        vmul.u16        q14, q11, d0[3]
826        vmul.u16        q11, q11, d0[2]
827        vmul.u16        q15, q12, d0[3]
828        vmls.u16        q8,  q9,  d0[1]
829        vmls.u16        q14, q12, d1[0]
830        vmls.u16        q11, q10, d0[1]
831        vmls.u16        q15, q13, d1[0]
832        vqadd.s16       q8,  q8,  q14
833        vqadd.s16       q11, q11, q15
834        vqrshrun.s16    \d0, q8,  #7
835        vqrshrun.s16    \d1, q11, #7
836.endm
837
838function ff_put_vp8_epel16_v6_neon, export=1
839        sub             r2,  r2,  r3,  lsl #1
840        push            {r4,lr}
841        vpush           {d8-d15}
842
843        ldr             r4,  [sp, #80]          @ my
844        movrel          lr,  subpel_filters-16
845        ldr             r12, [sp, #72]          @ h
846        add             r4,  lr,  r4, lsl #4
847        vld1.16         {q0},     [r4,:128]
8481:
849        vld1.8          {d2-d3},  [r2], r3
850        vld1.8          {d4-d5},  [r2], r3
851        vld1.8          {d6-d7},  [r2], r3
852        vld1.8          {d8-d9},  [r2], r3
853        vld1.8          {d10-d11},[r2], r3
854        vld1.8          {d12-d13},[r2], r3
855        vld1.8          {d14-d15},[r2]
856        sub             r2,  r2,  r3,  lsl #2
857
858        vp8_epel8_v6_y2 d2,  d4,  d2,  d4,  d6,  d8,  d10, d12, d14
859        vp8_epel8_v6_y2 d3,  d5,  d3,  d5,  d7,  d9,  d11, d13, d15
860
861        vst1.8          {d2-d3},  [r0,:128], r1
862        vst1.8          {d4-d5},  [r0,:128], r1
863        subs            r12, r12, #2
864        bne             1b
865
866        vpop            {d8-d15}
867        pop             {r4,pc}
868endfunc
869
870function ff_put_vp8_epel16_h6_neon, export=1
871        sub             r2,  r2,  #2
872        push            {r4,lr}
873
874        ldr             r4,  [sp, #12]          @ mx
875        movrel          lr,  subpel_filters-16
876        ldr             r12, [sp, #8]           @ h
877        add             r4,  lr,  r4, lsl #4
878        vld1.16         {q0},     [r4,:128]
8791:
880        vld1.8          {d2-d4},  [r2], r3
881
882        vp8_epel16_h6   d2,  d3,  d2,  d3,  d4,  q1,  q2
883
884        vst1.8          {d2-d3}, [r0,:128], r1
885        subs            r12, r12, #1
886        bne             1b
887
888        pop             {r4,pc}
889endfunc
890
891function ff_put_vp8_epel16_h6v6_neon, export=1
892        sub             r2,  r2,  r3,  lsl #1
893        sub             r2,  r2,  #2
894        push            {r4,lr}
895        vpush           {d8-d15}
896
897        @ first pass (horizontal):
898        ldr             r4,  [sp, #64+8+4]          @ mx
899        movrel          lr,  subpel_filters-16
900        ldr             r12, [sp, #64+8+0]          @ h
901        add             r4,  lr,  r4, lsl #4
902        sub             sp,  sp,  #336+16
903        vld1.16         {q0},     [r4,:128]
904        add             lr,  sp,  #15
905        add             r12, r12, #5
906        bic             lr,  lr,  #15
9071:
908        vld1.8          {d2,d3,d4}, [r2], r3
909
910        vp8_epel16_h6   d2,  d3,  d2,  d3,  d4,  q1,  q2
911
912        vst1.8          {d2-d3}, [lr,:128]!
913        subs            r12, r12, #1
914        bne             1b
915
916        @ second pass (vertical):
917        ldr             r4,  [sp, #336+16+64+8+8]   @ my
918        movrel          lr,  subpel_filters-16
919        ldr             r12, [sp, #336+16+64+8+0]   @ h
920        add             r4,  lr,  r4, lsl #4
921        add             lr,  sp,  #15
922        vld1.16         {q0},     [r4,:128]
923        bic             lr,  lr,  #15
9242:
925        vld1.8          {d2-d5},  [lr,:128]!
926        vld1.8          {d6-d9},  [lr,:128]!
927        vld1.8          {d10-d13},[lr,:128]!
928        vld1.8          {d14-d15},[lr,:128]
929        sub             lr,  lr,  #64
930
931        vp8_epel8_v6_y2 d2,  d4,  d2,  d4,  d6,  d8,  d10, d12, d14
932        vp8_epel8_v6_y2 d3,  d5,  d3,  d5,  d7,  d9,  d11, d13, d15
933
934        vst1.8          {d2-d3}, [r0,:128], r1
935        vst1.8          {d4-d5}, [r0,:128], r1
936        subs            r12, r12, #2
937        bne             2b
938
939        add             sp,  sp,  #336+16
940        vpop            {d8-d15}
941        pop             {r4,pc}
942endfunc
943
944function ff_put_vp8_epel8_v6_neon, export=1
945        sub             r2,  r2,  r3,  lsl #1
946        push            {r4,lr}
947
948        ldr             r4,  [sp, #16]          @ my
949        movrel          lr,  subpel_filters-16
950        ldr             r12, [sp, #8]           @ h
951        add             r4,  lr,  r4, lsl #4
952        vld1.16         {q0},     [r4,:128]
9531:
954        vld1.8          {d2},  [r2], r3
955        vld1.8          {d3},  [r2], r3
956        vld1.8          {d4},  [r2], r3
957        vld1.8          {d5},  [r2], r3
958        vld1.8          {d6},  [r2], r3
959        vld1.8          {d7},  [r2], r3
960        vld1.8          {d28}, [r2]
961
962        sub             r2,  r2,  r3,  lsl #2
963
964        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d28
965
966        vst1.8          {d2}, [r0,:64], r1
967        vst1.8          {d3}, [r0,:64], r1
968        subs            r12, r12, #2
969        bne             1b
970
971        pop             {r4,pc}
972endfunc
973
974function ff_put_vp8_epel8_h6_neon, export=1
975        sub             r2,  r2,  #2
976        push            {r4,lr}
977
978        ldr             r4,  [sp, #12]          @ mx
979        movrel          lr,  subpel_filters-16
980        ldr             r12, [sp, #8]           @ h
981        add             r4,  lr,  r4, lsl #4
982        vld1.16         {q0},     [r4,:128]
9831:
984        vld1.8          {d2,d3}, [r2], r3
985
986        vp8_epel8_h6    d2,  d2,  d3
987
988        vst1.8          {d2}, [r0,:64], r1
989        subs            r12, r12, #1
990        bne             1b
991
992        pop             {r4,pc}
993endfunc
994
995function ff_put_vp8_epel8_h6v6_neon, export=1
996        sub             r2,  r2,  r3,  lsl #1
997        sub             r2,  r2,  #2
998        push            {r4,lr}
999
1000        @ first pass (horizontal):
1001        ldr             r4,  [sp, #12]          @ mx
1002        movrel          lr,  subpel_filters-16
1003        ldr             r12, [sp, #8]           @ h
1004        add             r4,  lr,  r4, lsl #4
1005        sub             sp,  sp,  #168+16
1006        vld1.16         {q0},     [r4,:128]
1007        add             lr,  sp,  #15
1008        add             r12, r12, #5
1009        bic             lr,  lr,  #15
10101:
1011        vld1.8          {d2,d3}, [r2], r3
1012
1013        vp8_epel8_h6    d2,  d2,  d3
1014
1015        vst1.8          {d2}, [lr,:64]!
1016        subs            r12, r12, #1
1017        bne             1b
1018
1019        @ second pass (vertical):
1020        ldr             r4,  [sp, #168+16+16]   @ my
1021        movrel          lr,  subpel_filters-16
1022        ldr             r12, [sp, #168+16+8]    @ h
1023        add             r4,  lr,  r4, lsl #4
1024        add             lr,  sp,  #15
1025        vld1.16         {q0},     [r4,:128]
1026        bic             lr,  lr,  #15
10272:
1028        vld1.8          {d2-d5},  [lr,:128]!
1029        vld1.8          {d6-d7},  [lr,:128]!
1030        vld1.8          {d30},    [lr,:64]
1031        sub             lr,  lr,  #32
1032
1033        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d30
1034
1035        vst1.8          {d2}, [r0,:64], r1
1036        vst1.8          {d3}, [r0,:64], r1
1037        subs            r12, r12, #2
1038        bne             2b
1039
1040        add             sp,  sp,  #168+16
1041        pop             {r4,pc}
1042endfunc
1043
1044function ff_put_vp8_epel8_v4_neon, export=1
1045        sub             r2,  r2,  r3
1046        push            {r4,lr}
1047
1048        ldr             r4,  [sp, #16]          @ my
1049        movrel          lr,  subpel_filters-16
1050        ldr             r12, [sp, #8]           @ h
1051        add             r4,  lr,  r4, lsl #4
1052        vld1.16         {q0},     [r4,:128]
10531:
1054        vld1.8          {d2},     [r2], r3
1055        vld1.8          {d3},     [r2], r3
1056        vld1.8          {d4},     [r2], r3
1057        vld1.8          {d5},     [r2], r3
1058        vld1.8          {d6},     [r2]
1059        sub             r2,  r2,  r3,  lsl #1
1060
1061        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6
1062
1063        vst1.8          {d2}, [r0,:64], r1
1064        vst1.8          {d3}, [r0,:64], r1
1065        subs            r12, r12, #2
1066        bne             1b
1067
1068        pop             {r4,pc}
1069endfunc
1070
1071function ff_put_vp8_epel8_h4_neon, export=1
1072        sub             r2,  r2,  #1
1073        push            {r4,lr}
1074
1075        ldr             r4,  [sp, #12]          @ mx
1076        movrel          lr,  subpel_filters-16
1077        ldr             r12, [sp, #8]           @ h
1078        add             r4,  lr,  r4, lsl #4
1079        vld1.16         {q0},     [r4,:128]
10801:
1081        vld1.8          {d2,d3}, [r2], r3
1082
1083        vp8_epel8_h4    d2,  d2,  d3
1084
1085        vst1.8          {d2}, [r0,:64], r1
1086        subs            r12, r12, #1
1087        bne             1b
1088
1089        pop             {r4,pc}
1090endfunc
1091
1092function ff_put_vp8_epel8_h4v4_neon, export=1
1093        sub             r2,  r2,  r3
1094        sub             r2,  r2,  #1
1095        push            {r4,lr}
1096
1097        @ first pass (horizontal):
1098        ldr             r4,  [sp, #12]          @ mx
1099        movrel          lr,  subpel_filters-16
1100        ldr             r12, [sp, #8]           @ h
1101        add             r4,  lr,  r4, lsl #4
1102        sub             sp,  sp,  #168+16
1103        vld1.16         {q0},     [r4,:128]
1104        add             lr,  sp,  #15
1105        add             r12, r12, #3
1106        bic             lr,  lr,  #15
11071:
1108        vld1.8          {d2,d3}, [r2], r3
1109
1110        vp8_epel8_h4    d2,  d2,  d3
1111
1112        vst1.8          {d2}, [lr,:64]!
1113        subs            r12, r12, #1
1114        bne             1b
1115
1116        @ second pass (vertical):
1117        ldr             r4,  [sp, #168+16+16]   @ my
1118        movrel          lr,  subpel_filters-16
1119        ldr             r12, [sp, #168+16+8]    @ h
1120        add             r4,  lr,  r4, lsl #4
1121        add             lr,  sp,  #15
1122        vld1.16         {q0},     [r4,:128]
1123        bic             lr,  lr,  #15
11242:
1125        vld1.8          {d2-d5},  [lr,:128]!
1126        vld1.8          {d6},     [lr,:64]
1127        sub             lr,  lr,  #16
1128
1129        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6
1130
1131        vst1.8          {d2},     [r0,:64], r1
1132        vst1.8          {d3},     [r0,:64], r1
1133        subs            r12, r12, #2
1134        bne             2b
1135
1136        add             sp,  sp,  #168+16
1137        pop             {r4,pc}
1138endfunc
1139
1140function ff_put_vp8_epel8_h6v4_neon, export=1
1141        sub             r2,  r2,  r3
1142        sub             r2,  r2,  #2
1143        push            {r4,lr}
1144
1145        @ first pass (horizontal):
1146        ldr             r4,  [sp, #12]          @ mx
1147        movrel          lr,  subpel_filters-16
1148        ldr             r12, [sp, #8]           @ h
1149        add             r4,  lr,  r4, lsl #4
1150        sub             sp,  sp,  #168+16
1151        vld1.16         {q0},     [r4,:128]
1152        add             lr,  sp,  #15
1153        add             r12, r12, #3
1154        bic             lr,  lr,  #15
11551:
1156        vld1.8          {d2,d3}, [r2], r3
1157
1158        vp8_epel8_h6    d2,  d2,  d3
1159
1160        vst1.8          {d2}, [lr,:64]!
1161        subs            r12, r12, #1
1162        bne             1b
1163
1164        @ second pass (vertical):
1165        ldr             r4,  [sp, #168+16+16]   @ my
1166        movrel          lr,  subpel_filters-16
1167        ldr             r12, [sp, #168+16+8]    @ h
1168        add             r4,  lr,  r4, lsl #4
1169        add             lr,  sp,  #15
1170        vld1.16         {q0},     [r4,:128]
1171        bic             lr,  lr,  #15
11722:
1173        vld1.8          {d2-d5},  [lr,:128]!
1174        vld1.8          {d6},     [lr,:64]
1175        sub             lr,  lr,  #16
1176
1177        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6
1178
1179        vst1.8          {d2},     [r0,:64], r1
1180        vst1.8          {d3},     [r0,:64], r1
1181        subs            r12, r12, #2
1182        bne             2b
1183
1184        add             sp,  sp,  #168+16
1185        pop             {r4,pc}
1186endfunc
1187
1188function ff_put_vp8_epel8_h4v6_neon, export=1
1189        sub             r2,  r2,  r3,  lsl #1
1190        sub             r2,  r2,  #1
1191        push            {r4,lr}
1192
1193        @ first pass (horizontal):
1194        ldr             r4,  [sp, #12]          @ mx
1195        movrel          lr,  subpel_filters-16
1196        ldr             r12, [sp, #8]           @ h
1197        add             r4,  lr,  r4, lsl #4
1198        sub             sp,  sp,  #168+16
1199        vld1.16         {q0},     [r4,:128]
1200        add             lr,  sp,  #15
1201        add             r12, r12, #5
1202        bic             lr,  lr,  #15
12031:
1204        vld1.8          {d2,d3}, [r2], r3
1205
1206        vp8_epel8_h4    d2,  d2,  d3
1207
1208        vst1.8          {d2}, [lr,:64]!
1209        subs            r12, r12, #1
1210        bne             1b
1211
1212        @ second pass (vertical):
1213        ldr             r4,  [sp, #168+16+16]   @ my
1214        movrel          lr,  subpel_filters-16
1215        ldr             r12, [sp, #168+16+8]    @ h
1216        add             r4,  lr,  r4, lsl #4
1217        add             lr,  sp,  #15
1218        vld1.16         {q0},     [r4,:128]
1219        bic             lr,  lr,  #15
12202:
1221        vld1.8          {d2-d5},  [lr,:128]!
1222        vld1.8          {d6-d7},  [lr,:128]!
1223        vld1.8          {d30},    [lr,:64]
1224        sub             lr,  lr,  #32
1225
1226        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d30
1227
1228        vst1.8          {d2}, [r0,:64], r1
1229        vst1.8          {d3}, [r0,:64], r1
1230        subs            r12, r12, #2
1231        bne             2b
1232
1233        add             sp,  sp,  #168+16
1234        pop             {r4,pc}
1235endfunc
1236
1237.ltorg
1238
1239function ff_put_vp8_epel4_v6_neon, export=1
1240        sub             r2,  r2,  r3,  lsl #1
1241        push            {r4,lr}
1242
1243        ldr             r4,  [sp, #16]          @ my
1244        movrel          lr,  subpel_filters-16
1245        ldr             r12, [sp, #8]           @ h
1246        add             r4,  lr,  r4, lsl #4
1247        vld1.16         {q0},     [r4,:128]
12481:
1249        vld1.32         {d2[]},   [r2], r3
1250        vld1.32         {d3[]},   [r2], r3
1251        vld1.32         {d4[]},   [r2], r3
1252        vld1.32         {d5[]},   [r2], r3
1253        vld1.32         {d6[]},   [r2], r3
1254        vld1.32         {d7[]},   [r2], r3
1255        vld1.32         {d28[]},  [r2]
1256        sub             r2,  r2,  r3,  lsl #2
1257        vld1.32         {d2[1]},  [r2], r3
1258        vld1.32         {d3[1]},  [r2], r3
1259        vld1.32         {d4[1]},  [r2], r3
1260        vld1.32         {d5[1]},  [r2], r3
1261        vld1.32         {d6[1]},  [r2], r3
1262        vld1.32         {d7[1]},  [r2], r3
1263        vld1.32         {d28[1]}, [r2]
1264        sub             r2,  r2,  r3,  lsl #2
1265
1266        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d28
1267
1268        vst1.32         {d2[0]},  [r0,:32], r1
1269        vst1.32         {d3[0]},  [r0,:32], r1
1270        vst1.32         {d2[1]},  [r0,:32], r1
1271        vst1.32         {d3[1]},  [r0,:32], r1
1272        subs            r12, r12, #4
1273        bne             1b
1274
1275        pop             {r4,pc}
1276endfunc
1277
1278function ff_put_vp8_epel4_h6_neon, export=1
1279        sub             r2,  r2,  #2
1280        push            {r4,lr}
1281
1282        ldr             r4,  [sp, #12]          @ mx
1283        movrel          lr,  subpel_filters-16
1284        ldr             r12, [sp, #8]           @ h
1285        add             r4,  lr,  r4, lsl #4
1286        vld1.16         {q0},     [r4,:128]
12871:
1288        vld1.8          {q1},     [r2], r3
1289        vp8_epel8_h6    d2,  d2,  d3
1290        vst1.32         {d2[0]},  [r0,:32], r1
1291        subs            r12, r12, #1
1292        bne             1b
1293
1294        pop             {r4,pc}
1295endfunc
1296
1297function ff_put_vp8_epel4_h6v6_neon, export=1
1298        sub             r2,  r2,  r3,  lsl #1
1299        sub             r2,  r2,  #2
1300        push            {r4,lr}
1301
1302        ldr             r4,  [sp, #12]          @ mx
1303        movrel          lr,  subpel_filters-16
1304        ldr             r12, [sp, #8]           @ h
1305        add             r4,  lr,  r4, lsl #4
1306        sub             sp,  sp,  #52+16
1307        vld1.16         {q0},     [r4,:128]
1308        add             lr,  sp,  #15
1309        add             r12, r12, #5
1310        bic             lr,  lr,  #15
13111:
1312        vld1.8          {q1},     [r2], r3
1313        vp8_epel8_h6    d2,  d2,  d3
1314        vst1.32         {d2[0]},  [lr,:32]!
1315        subs            r12, r12, #1
1316        bne             1b
1317
1318        ldr             r4,  [sp, #52+16+16]    @ my
1319        movrel          lr,  subpel_filters-16
1320        ldr             r12, [sp, #52+16+8]     @ h
1321        add             r4,  lr,  r4, lsl #4
1322        add             lr,  sp,  #15
1323        vld1.16         {q0},     [r4,:128]
1324        bic             lr,  lr,  #15
13252:
1326        vld1.8          {d2-d3},  [lr,:128]!
1327        vld1.8          {d6},     [lr,:64]!
1328        vld1.32         {d28[]},  [lr,:32]
1329        sub             lr,  lr,  #16
1330        vld1.8          {d4-d5},  [lr]!
1331        vld1.8          {d7},     [lr,:64]!
1332        vld1.32         {d28[1]}, [lr,:32]
1333        sub             lr,  lr,  #16
1334        vtrn.32         q1,  q2
1335        vtrn.32         d6,  d7
1336        vp8_epel8_v6_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6,  d7,  d28
1337        vst1.32         {d2[0]},  [r0,:32], r1
1338        vst1.32         {d3[0]},  [r0,:32], r1
1339        vst1.32         {d2[1]},  [r0,:32], r1
1340        vst1.32         {d3[1]},  [r0,:32], r1
1341        subs            r12, r12, #4
1342        bne             2b
1343
1344        add             sp,  sp,  #52+16
1345        pop             {r4,pc}
1346endfunc
1347
1348function ff_put_vp8_epel4_h4v6_neon, export=1
1349        sub             r2,  r2,  r3,  lsl #1
1350        sub             r2,  r2,  #1
1351        push            {r4,lr}
1352
1353        ldr             r4,  [sp, #12]          @ mx
1354        movrel          lr,  subpel_filters-16
1355        ldr             r12, [sp, #8]           @ h
1356        add             r4,  lr,  r4, lsl #4
1357        sub             sp,  sp,  #52+16
1358        vld1.16         {q0},     [r4,:128]
1359        add             lr,  sp,  #15
1360        add             r12, r12, #5
1361        bic             lr,  lr,  #15
13621:
1363        vld1.8          {d2},     [r2], r3
1364        vp8_epel8_h4    d2,  d2,  d2
1365        vst1.32         {d2[0]},  [lr,:32]!
1366        subs            r12, r12, #1
1367        bne             1b
1368
1369        ldr             r4,  [sp, #52+16+16]    @ my
1370        movrel          lr,  subpel_filters-16
1371        ldr             r12, [sp, #52+16+8]     @ h
1372        add             r4,  lr,  r4, lsl #4
1373        add             lr,  sp,  #15
1374        vld1.16         {q0},     [r4,:128]
1375        bic             lr,  lr,  #15
13762:
1377        vld1.8          {d2-d3},  [lr,:128]!
1378        vld1.8          {d6},     [lr,:64]!
1379        vld1.32         {d28[]},  [lr,:32]
1380        sub             lr,  lr,  #16
1381        vld1.8          {d4-d5},  [lr]!
1382        vld1.8          {d7},     [lr,:64]!
1383        vld1.32         {d28[1]}, [lr,:32]
1384        sub             lr,  lr,  #16
1385        vtrn.32         q1,  q2
1386        vtrn.32         d6,  d7
1387        vp8_epel8_v6_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6,  d7,  d28
1388        vst1.32         {d2[0]},  [r0,:32], r1
1389        vst1.32         {d3[0]},  [r0,:32], r1
1390        vst1.32         {d2[1]},  [r0,:32], r1
1391        vst1.32         {d3[1]},  [r0,:32], r1
1392        subs            r12, r12, #4
1393        bne             2b
1394
1395        add             sp,  sp,  #52+16
1396        pop             {r4,pc}
1397endfunc
1398
1399function ff_put_vp8_epel4_h6v4_neon, export=1
1400        sub             r2,  r2,  r3
1401        sub             r2,  r2,  #2
1402        push            {r4,lr}
1403
1404        ldr             r4,  [sp, #12]          @ mx
1405        movrel          lr,  subpel_filters-16
1406        ldr             r12, [sp, #8]           @ h
1407        add             r4,  lr,  r4, lsl #4
1408        sub             sp,  sp,  #44+16
1409        vld1.16         {q0},     [r4,:128]
1410        add             lr,  sp,  #15
1411        add             r12, r12, #3
1412        bic             lr,  lr,  #15
14131:
1414        vld1.8          {q1},     [r2], r3
1415        vp8_epel8_h6    d2,  d2,  d3
1416        vst1.32         {d2[0]},  [lr,:32]!
1417        subs            r12, r12, #1
1418        bne             1b
1419
1420        ldr             r4,  [sp, #44+16+16]    @ my
1421        movrel          lr,  subpel_filters-16
1422        ldr             r12, [sp, #44+16+8]     @ h
1423        add             r4,  lr,  r4, lsl #4
1424        add             lr,  sp,  #15
1425        vld1.16         {q0},     [r4,:128]
1426        bic             lr,  lr,  #15
14272:
1428        vld1.8          {d2-d3},  [lr,:128]!
1429        vld1.32         {d6[]},   [lr,:32]
1430        sub             lr,  lr,  #8
1431        vld1.8          {d4-d5},  [lr]!
1432        vld1.32         {d6[1]},  [lr,:32]
1433        sub             lr,  lr,  #8
1434        vtrn.32         q1,  q2
1435        vp8_epel8_v4_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6
1436        vst1.32         {d2[0]},  [r0,:32], r1
1437        vst1.32         {d3[0]},  [r0,:32], r1
1438        vst1.32         {d2[1]},  [r0,:32], r1
1439        vst1.32         {d3[1]},  [r0,:32], r1
1440        subs            r12, r12, #4
1441        bne             2b
1442
1443        add             sp,  sp,  #44+16
1444        pop             {r4,pc}
1445endfunc
1446
1447function ff_put_vp8_epel4_h4_neon, export=1
1448        sub             r2,  r2,  #1
1449        push            {r4,lr}
1450
1451        ldr             r4,  [sp, #12]          @ mx
1452        movrel          lr,  subpel_filters-16
1453        ldr             r12, [sp, #8]           @ h
1454        add             r4,  lr,  r4, lsl #4
1455        vld1.16         {q0},     [r4,:128]
14561:
1457        vld1.8          {d2},     [r2], r3
1458        vp8_epel8_h4    d2,  d2,  d2
1459        vst1.32         {d2[0]},  [r0,:32], r1
1460        subs            r12, r12, #1
1461        bne             1b
1462
1463        pop             {r4,pc}
1464endfunc
1465
1466function ff_put_vp8_epel4_v4_neon, export=1
1467        sub             r2,  r2,  r3
1468        push            {r4,lr}
1469
1470        ldr             r4,  [sp, #16]          @ my
1471        movrel          lr,  subpel_filters-16
1472        ldr             r12, [sp, #8]           @ h
1473        add             r4,  lr,  r4, lsl #4
1474        vld1.16         {q0},     [r4,:128]
14751:
1476        vld1.32         {d2[]},   [r2], r3
1477        vld1.32         {d3[]},   [r2], r3
1478        vld1.32         {d4[]},   [r2], r3
1479        vld1.32         {d5[]},   [r2], r3
1480        vld1.32         {d6[]},   [r2]
1481        sub             r2,  r2,  r3,  lsl #1
1482        vld1.32         {d2[1]},  [r2], r3
1483        vld1.32         {d3[1]},  [r2], r3
1484        vld1.32         {d4[1]},  [r2], r3
1485        vld1.32         {d5[1]},  [r2], r3
1486        vld1.32         {d6[1]},  [r2]
1487        sub             r2,  r2,  r3,  lsl #1
1488
1489        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6
1490
1491        vst1.32         {d2[0]},  [r0,:32], r1
1492        vst1.32         {d3[0]},  [r0,:32], r1
1493        vst1.32         {d2[1]},  [r0,:32], r1
1494        vst1.32         {d3[1]},  [r0,:32], r1
1495        subs            r12, r12, #4
1496        bne             1b
1497
1498        pop             {r4,pc}
1499endfunc
1500
1501function ff_put_vp8_epel4_h4v4_neon, export=1
1502        sub             r2,  r2,  r3
1503        sub             r2,  r2,  #1
1504        push            {r4,lr}
1505
1506        ldr             r4,  [sp, #12]          @ mx
1507        movrel          lr,  subpel_filters-16
1508        ldr             r12, [sp, #8]           @ h
1509        add             r4,  lr,  r4, lsl #4
1510        sub             sp,  sp,  #44+16
1511        vld1.16         {q0},     [r4,:128]
1512        add             lr,  sp,  #15
1513        add             r12, r12, #3
1514        bic             lr,  lr,  #15
15151:
1516        vld1.8          {d2},     [r2], r3
1517        vp8_epel8_h4    d2,  d2,  d3
1518        vst1.32         {d2[0]},  [lr,:32]!
1519        subs            r12, r12, #1
1520        bne             1b
1521
1522        ldr             r4,  [sp, #44+16+16]    @ my
1523        movrel          lr,  subpel_filters-16
1524        ldr             r12, [sp, #44+16+8]     @ h
1525        add             r4,  lr,  r4, lsl #4
1526        add             lr,  sp,  #15
1527        vld1.16         {q0},     [r4,:128]
1528        bic             lr,  lr,  #15
15292:
1530        vld1.8          {d2-d3},  [lr,:128]!
1531        vld1.32         {d6[]},   [lr,:32]
1532        sub             lr,  lr,  #8
1533        vld1.8          {d4-d5},  [lr]!
1534        vld1.32         {d6[1]},  [lr,:32]
1535        sub             lr,  lr,  #8
1536        vtrn.32         q1,  q2
1537        vp8_epel8_v4_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6
1538        vst1.32         {d2[0]},  [r0,:32], r1
1539        vst1.32         {d3[0]},  [r0,:32], r1
1540        vst1.32         {d2[1]},  [r0,:32], r1
1541        vst1.32         {d3[1]},  [r0,:32], r1
1542        subs            r12, r12, #4
1543        bne             2b
1544
1545        add             sp,  sp,  #44+16
1546        pop             {r4,pc}
1547endfunc
1548
1549@ note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit
1550@ arithmetic can be used to apply filters
1551const   subpel_filters, align=4
1552        .short     0,   6, 123,  12,   1,   0,   0,   0
1553        .short     2,  11, 108,  36,   8,   1,   0,   0
1554        .short     0,   9,  93,  50,   6,   0,   0,   0
1555        .short     3,  16,  77,  77,  16,   3,   0,   0
1556        .short     0,   6,  50,  93,   9,   0,   0,   0
1557        .short     1,   8,  36, 108,  11,   2,   0,   0
1558        .short     0,   1,  12, 123,   6,   0,   0,   0
1559endconst
1560
1561/* Bilinear MC */
1562
1563function ff_put_vp8_bilin16_h_neon, export=1
1564        ldr             r12, [sp, #4]           @ mx
1565        vdup.8          d0,  r12
1566        rsb             r12, r12, #8
1567        vdup.8          d1,  r12
1568        ldr             r12, [sp]               @ h
15691:
1570        subs            r12, r12, #2
1571        vld1.8          {d2-d4},  [r2], r3
1572        vext.8          q2,  q1,  q2,  #1
1573        vmull.u8        q8,  d2,  d1
1574        vmlal.u8        q8,  d4,  d0
1575        vld1.8          {d18-d20},[r2], r3
1576        vmull.u8        q3,  d3,  d1
1577        vmlal.u8        q3,  d5,  d0
1578        vext.8          q10, q9,  q10, #1
1579        vmull.u8        q11, d18, d1
1580        vmlal.u8        q11, d20, d0
1581        vmull.u8        q12, d19, d1
1582        vmlal.u8        q12, d21, d0
1583        vrshrn.u16      d4,  q8,  #3
1584        vrshrn.u16      d5,  q3,  #3
1585        vrshrn.u16      d6,  q11, #3
1586        vrshrn.u16      d7,  q12, #3
1587        vst1.8          {q2},     [r0,:128], r1
1588        vst1.8          {q3},     [r0,:128], r1
1589        bgt             1b
1590
1591        bx              lr
1592endfunc
1593
1594function ff_put_vp8_bilin16_v_neon, export=1
1595        ldr             r12, [sp, #8]           @ my
1596        vdup.8          d0,  r12
1597        rsb             r12, r12, #8
1598        vdup.8          d1,  r12
1599        ldr             r12, [sp]               @ h
1600        vld1.8          {q1},     [r2], r3
16011:
1602        subs            r12, r12, #2
1603        vld1.8          {q2},     [r2], r3
1604        vmull.u8        q3,  d2,  d1
1605        vmlal.u8        q3,  d4,  d0
1606        vmull.u8        q8,  d3,  d1
1607        vmlal.u8        q8,  d5,  d0
1608        vld1.8          {q1},     [r2], r3
1609        vmull.u8        q9,  d4,  d1
1610        vmlal.u8        q9,  d2,  d0
1611        vmull.u8        q10, d5,  d1
1612        vmlal.u8        q10, d3,  d0
1613        vrshrn.u16      d4,  q3,  #3
1614        vrshrn.u16      d5,  q8,  #3
1615        vrshrn.u16      d6,  q9,  #3
1616        vrshrn.u16      d7,  q10, #3
1617        vst1.8          {q2},     [r0,:128], r1
1618        vst1.8          {q3},     [r0,:128], r1
1619        bgt             1b
1620
1621        bx              lr
1622endfunc
1623
1624function ff_put_vp8_bilin16_hv_neon, export=1
1625        ldr             r12, [sp, #4]           @ mx
1626        vdup.8          d0,  r12
1627        rsb             r12, r12, #8
1628        vdup.8          d1,  r12
1629        ldr             r12, [sp, #8]           @ my
1630        vdup.8          d2,  r12
1631        rsb             r12, r12, #8
1632        vdup.8          d3,  r12
1633        ldr             r12, [sp]               @ h
1634
1635        vld1.8          {d4-d6},  [r2], r3
1636        vext.8          q3,  q2,  q3,  #1
1637        vmull.u8        q8,  d4,  d1
1638        vmlal.u8        q8,  d6,  d0
1639        vmull.u8        q9,  d5,  d1
1640        vmlal.u8        q9,  d7,  d0
1641        vrshrn.u16      d4,  q8,  #3
1642        vrshrn.u16      d5,  q9,  #3
16431:
1644        subs            r12, r12, #2
1645        vld1.8          {d18-d20},[r2], r3
1646        vext.8          q10, q9,  q10, #1
1647        vmull.u8        q11, d18, d1
1648        vmlal.u8        q11, d20, d0
1649        vld1.8          {d26-d28},[r2], r3
1650        vmull.u8        q12, d19, d1
1651        vmlal.u8        q12, d21, d0
1652        vext.8          q14, q13, q14, #1
1653        vmull.u8        q8,  d26, d1
1654        vmlal.u8        q8,  d28, d0
1655        vmull.u8        q9,  d27, d1
1656        vmlal.u8        q9,  d29, d0
1657        vrshrn.u16      d6,  q11, #3
1658        vrshrn.u16      d7,  q12, #3
1659        vmull.u8        q12, d4,  d3
1660        vmlal.u8        q12, d6,  d2
1661        vmull.u8        q15, d5,  d3
1662        vmlal.u8        q15, d7,  d2
1663        vrshrn.u16      d4,  q8,  #3
1664        vrshrn.u16      d5,  q9,  #3
1665        vmull.u8        q10, d6,  d3
1666        vmlal.u8        q10, d4,  d2
1667        vmull.u8        q11, d7,  d3
1668        vmlal.u8        q11, d5,  d2
1669        vrshrn.u16      d24, q12, #3
1670        vrshrn.u16      d25, q15, #3
1671        vst1.8          {q12},    [r0,:128], r1
1672        vrshrn.u16      d20, q10, #3
1673        vrshrn.u16      d21, q11, #3
1674        vst1.8          {q10},    [r0,:128], r1
1675        bgt             1b
1676
1677        bx              lr
1678endfunc
1679
1680function ff_put_vp8_bilin8_h_neon, export=1
1681        ldr             r12, [sp, #4]           @ mx
1682        vdup.8          d0,  r12
1683        rsb             r12, r12, #8
1684        vdup.8          d1,  r12
1685        ldr             r12, [sp]               @ h
16861:
1687        subs            r12, r12, #2
1688        vld1.8          {q1},     [r2], r3
1689        vext.8          d3,  d2,  d3,  #1
1690        vmull.u8        q2,  d2,  d1
1691        vmlal.u8        q2,  d3,  d0
1692        vld1.8          {q3},     [r2], r3
1693        vext.8          d7,  d6,  d7,  #1
1694        vmull.u8        q8,  d6,  d1
1695        vmlal.u8        q8,  d7,  d0
1696        vrshrn.u16      d4,  q2,  #3
1697        vrshrn.u16      d16, q8,  #3
1698        vst1.8          {d4},     [r0,:64], r1
1699        vst1.8          {d16},    [r0,:64], r1
1700        bgt             1b
1701
1702        bx              lr
1703endfunc
1704
1705function ff_put_vp8_bilin8_v_neon, export=1
1706        ldr             r12, [sp, #8]           @ my
1707        vdup.8          d0,  r12
1708        rsb             r12, r12,  #8
1709        vdup.8          d1,  r12
1710        ldr             r12, [sp]               @ h
1711        vld1.8          {d2},     [r2], r3
17121:
1713        subs            r12, r12, #2
1714        vld1.8          {d3},     [r2], r3
1715        vmull.u8        q2,  d2,  d1
1716        vmlal.u8        q2,  d3,  d0
1717        vld1.8          {d2},     [r2], r3
1718        vmull.u8        q3,  d3,  d1
1719        vmlal.u8        q3,  d2,  d0
1720        vrshrn.u16      d4,  q2,  #3
1721        vrshrn.u16      d6,  q3,  #3
1722        vst1.8          {d4},     [r0,:64], r1
1723        vst1.8          {d6},     [r0,:64], r1
1724        bgt             1b
1725
1726        bx              lr
1727endfunc
1728
1729function ff_put_vp8_bilin8_hv_neon, export=1
1730        ldr             r12, [sp, #4]           @ mx
1731        vdup.8          d0,  r12
1732        rsb             r12, r12, #8
1733        vdup.8          d1,  r12
1734        ldr             r12, [sp, #8]           @ my
1735        vdup.8          d2,  r12
1736        rsb             r12, r12, #8
1737        vdup.8          d3,  r12
1738        ldr             r12, [sp]               @ h
1739
1740        vld1.8          {q2},     [r2], r3
1741        vext.8          d5,  d4,  d5,  #1
1742        vmull.u8        q9,  d4,  d1
1743        vmlal.u8        q9,  d5,  d0
1744        vrshrn.u16      d22, q9,  #3
17451:
1746        subs            r12, r12, #2
1747        vld1.8          {q3},     [r2], r3
1748        vext.8          d7,  d6,  d7,  #1
1749        vmull.u8        q8,  d6,  d1
1750        vmlal.u8        q8,  d7,  d0
1751        vld1.8          {q2},     [r2], r3
1752        vext.8          d5,  d4,  d5,  #1
1753        vmull.u8        q9,  d4,  d1
1754        vmlal.u8        q9,  d5,  d0
1755        vrshrn.u16      d16, q8,  #3
1756        vmull.u8        q10, d22, d3
1757        vmlal.u8        q10, d16, d2
1758        vrshrn.u16      d22, q9,  #3
1759        vmull.u8        q12, d16, d3
1760        vmlal.u8        q12, d22, d2
1761        vrshrn.u16      d20, q10, #3
1762        vst1.8          {d20},    [r0,:64], r1
1763        vrshrn.u16      d23, q12, #3
1764        vst1.8          {d23},    [r0,:64], r1
1765        bgt             1b
1766
1767        bx              lr
1768endfunc
1769
1770function ff_put_vp8_bilin4_h_neon, export=1
1771        ldr             r12, [sp, #4]           @ mx
1772        vdup.8          d0,  r12
1773        rsb             r12, r12, #8
1774        vdup.8          d1,  r12
1775        ldr             r12, [sp]               @ h
17761:
1777        subs            r12, r12, #2
1778        vld1.8          {d2},     [r2], r3
1779        vext.8          d3,  d2,  d3,  #1
1780        vld1.8          {d6},     [r2], r3
1781        vext.8          d7,  d6,  d7,  #1
1782        vtrn.32         q1,  q3
1783        vmull.u8        q2,  d2,  d1
1784        vmlal.u8        q2,  d3,  d0
1785        vrshrn.u16      d4,  q2,  #3
1786        vst1.32         {d4[0]},  [r0,:32], r1
1787        vst1.32         {d4[1]}, [r0,:32], r1
1788        bgt             1b
1789
1790        bx              lr
1791endfunc
1792
1793function ff_put_vp8_bilin4_v_neon, export=1
1794        ldr             r12, [sp, #8]           @ my
1795        vdup.8          d0,  r12
1796        rsb             r12, r12, #8
1797        vdup.8          d1,  r12
1798        ldr             r12, [sp]               @ h
1799        vld1.32         {d2[]},   [r2], r3
18001:
1801        vld1.32         {d3[]},   [r2]
1802        vld1.32         {d2[1]},  [r2], r3
1803        vld1.32         {d3[1]},  [r2], r3
1804        vmull.u8        q2,  d2,  d1
1805        vmlal.u8        q2,  d3,  d0
1806        vtrn.32         d3,  d2
1807        vrshrn.u16      d4,  q2,  #3
1808        vst1.32         {d4[0]},  [r0,:32], r1
1809        vst1.32         {d4[1]},  [r0,:32], r1
1810        subs            r12, r12, #2
1811        bgt             1b
1812
1813        bx              lr
1814endfunc
1815
1816function ff_put_vp8_bilin4_hv_neon, export=1
1817        ldr             r12, [sp, #4]           @ mx
1818        vdup.8          d0,  r12
1819        rsb             r12, r12, #8
1820        vdup.8          d1,  r12
1821        ldr             r12, [sp, #8]           @ my
1822        vdup.8          d2,  r12
1823        rsb             r12, r12, #8
1824        vdup.8          d3,  r12
1825        ldr             r12, [sp]               @ h
1826
1827        vld1.8          {d4},     [r2], r3
1828        vext.8          d5,  d4,  d4,  #1
1829        vmull.u8        q9,  d4,  d1
1830        vmlal.u8        q9,  d5,  d0
1831        vrshrn.u16      d22, q9,  #3
18321:
1833        subs            r12, r12, #2
1834        vld1.8          {d6},     [r2], r3
1835        vext.8          d7,  d6,  d6,  #1
1836        vld1.8          {d4},     [r2], r3
1837        vext.8          d5,  d4,  d4,  #1
1838        vtrn.32         q3,  q2
1839        vmull.u8        q8,  d6,  d1
1840        vmlal.u8        q8,  d7,  d0
1841        vrshrn.u16      d16, q8,  #3
1842        vmull.u8        q10, d16, d2
1843        vtrn.32         d22, d16
1844        vmlal.u8        q10, d22, d3
1845        vrev64.32       d22, d16
1846        vrshrn.u16      d20, q10, #3
1847        vst1.32         {d20[0]}, [r0,:32], r1
1848        vst1.32         {d20[1]}, [r0,:32], r1
1849        bgt             1b
1850
1851        bx              lr
1852endfunc
1853