1/*
2 * VC1 NEON optimisations
3 *
4 * Copyright (c) 2010 Rob Clark <rob@ti.com>
5 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
6 *
7 * This file is part of FFmpeg.
8 *
9 * FFmpeg is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * FFmpeg is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with FFmpeg; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23
24#include "libavutil/arm/asm.S"
25#include "neon.S"
26
27#include "config.h"
28
29@ Transpose rows into columns of a matrix of 16-bit elements. For a 4x4 matrix,
30@ pass double-word registers; for 8x4, pass quad-word registers.
31.macro transpose16 r0, r1, r2, r3
32        @ At this point:
33        @   row[0]  r0
34        @   row[1]  r1
35        @   row[2]  r2
36        @   row[3]  r3
37
38        vtrn.16         \r0,  \r1         @ first and second row
39        vtrn.16         \r2,  \r3         @ third and fourth row
40        vtrn.32         \r0,  \r2         @ first and third row
41        vtrn.32         \r1,  \r3         @ second and fourth row
42
43        @ At this point, if registers are quad-word:
44        @   column[0]   d0
45        @   column[1]   d2
46        @   column[2]   d4
47        @   column[3]   d6
48        @   column[4]   d1
49        @   column[5]   d3
50        @   column[6]   d5
51        @   column[7]   d7
52
53        @ At this point, if registers are double-word:
54        @   column[0]   d0
55        @   column[1]   d1
56        @   column[2]   d2
57        @   column[3]   d3
58.endm
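
@ Worked example for the double-word (4x4) case, lanes shown lowest first
@ (the element names mRC are illustrative only, R = row, C = column):
@
@   before the macro                 after vtrn.16 + vtrn.32
@   r0 = { m00 m01 m02 m03 }         r0 = { m00 m10 m20 m30 }
@   r1 = { m10 m11 m12 m13 }         r1 = { m01 m11 m21 m31 }
@   r2 = { m20 m21 m22 m23 }         r2 = { m02 m12 m22 m32 }
@   r3 = { m30 m31 m32 m33 }         r3 = { m03 m13 m23 m33 }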
59
60@ ff_vc1_inv_trans_{4,8}x{4,8}_neon and overflow: The input values in the file
61@ are supposed to lie in a specific range so as to allow 16-bit math without
62@ causing overflows, but some inputs are just large enough to barely overflow
63@ in vadd instructions like:
64@
65@   vadd.i16  q0, q8, q10
66@   vshr.s16  q0, q0, #\rshift
67@
68@ To prevent these borderline cases from overflowing, we just need one more
69@ bit of precision, which is accomplished by replacing the sequence above with:
70@
71@   vhadd.s16 q0, q8, q10
72@   vshr.s16  q0, q0, #(\rshift -1)
73@
74@ This works because vhadd adds, then shifts right once, all within a single
75@ instruction, so the full sum is halved before it can overflow 16 bits.
76@
77@ Even with this workaround, there were still some files that caused overflows
78@ in ff_vc1_inv_trans_8x8_neon. See the comments in ff_vc1_inv_trans_8x8_neon
79@ for the additional workaround.
80
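@ A rough scalar model of the replacement above (the variable names a, b and
@ the 16-bit intermediate are illustrative, not taken from the C reference):
@
@   int16_t sum = a + b;                      /* may already wrap here */
@   dst = sum >> rshift;
@
@ becomes
@
@   dst = ((a + b) >> 1) >> (rshift - 1);     /* vhadd halves without storing
@                                                a 16-bit intermediate       */
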
81@ Takes 4 columns of 8 values each and operates on them. Modeled after the
82@ first for loop in vc1_inv_trans_4x8_c (see the scalar sketch after this macro).
83@ Input columns: q0 q1 q2 q3
84@ Output columns: q0 q1 q2 q3
85@ Trashes: r12 q8 q9 q10 q11 q12 q13
86.macro vc1_inv_trans_4x8_helper add rshift
87        @ Compute temp1, temp2 and setup scalar #17, #22, #10
88        vadd.i16        q12,   q0,  q2              @ temp1 = src[0] + src[2]
89        movw            r12,   #17
90        vsub.i16        q13,   q0,  q2              @ temp2 = src[0] - src[2]
91        movt            r12,   #22
92        vmov.32         d0[0], r12
93        movw            r12,   #10
94        vmov.16         d1[0], r12
95
96        vmov.i16        q8,  #\add                  @ t1 will accumulate here
97        vmov.i16        q9,  #\add                  @ t2 will accumulate here
98
99        vmul.i16        q10, q1,  d0[1]             @ t3 = 22 * (src[1])
100        vmul.i16        q11, q3,  d0[1]             @ t4 = 22 * (src[3])
101
102        vmla.i16        q8,  q12, d0[0]             @ t1 = 17 * (temp1) + 4
103        vmla.i16        q9,  q13, d0[0]             @ t2 = 17 * (temp2) + 4
104
105        vmla.i16        q10, q3,  d1[0]             @ t3 += 10 * src[3]
106        vmls.i16        q11, q1,  d1[0]             @ t4 -= 10 * src[1]
107
108        vhadd.s16       q0,  q8,  q10               @ dst[0] = (t1 + t3) >> 1
109        vhsub.s16       q3,  q8,  q10               @ dst[3] = (t1 - t3) >> 1
110        vhsub.s16       q1,  q9,  q11               @ dst[1] = (t2 - t4) >> 1
111        vhadd.s16       q2,  q9,  q11               @ dst[2] = (t2 + t4) >> 1
112
113        @ Halving add/sub above already did one shift
114        vshr.s16        q0,  q0,  #(\rshift - 1)    @ dst[0] >>= (rshift - 1)
115        vshr.s16        q3,  q3,  #(\rshift - 1)    @ dst[3] >>= (rshift - 1)
116        vshr.s16        q1,  q1,  #(\rshift - 1)    @ dst[1] >>= (rshift - 1)
117        vshr.s16        q2,  q2,  #(\rshift - 1)    @ dst[2] >>= (rshift - 1)
118.endm
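
@ Scalar sketch of the 4-point transform performed by the helper above,
@ reconstructed from its comments (the actual vc1_inv_trans_4x8_c may differ
@ in indexing and striding):
@
@   temp1 = src[0] + src[2];          temp2 = src[0] - src[2];
@   t1 = 17 * temp1 + add;            t2 = 17 * temp2 + add;
@   t3 = 22 * src[1] + 10 * src[3];   t4 = 22 * src[3] - 10 * src[1];
@   dst[0] = (t1 + t3) >> rshift;     dst[1] = (t2 - t4) >> rshift;
@   dst[2] = (t2 + t4) >> rshift;     dst[3] = (t1 - t3) >> rshift;
@
@ The NEON version folds the first of the rshift bits into vhadd/vhsub as
@ described in the overflow note near the top of this file.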
119
120@ Takes 8 columns of 4 values each and operates on them. Modeled after the
121@ second for loop in vc1_inv_trans_4x8_c.
122@ Input columns: d0 d2 d4 d6 d1 d3 d5 d7
123@ Output columns: d16 d17 d18 d19 d21 d20 d23 d22
124@ Trashes all NEON registers (and r12) except for: q4 q5 q6 q7
125.macro vc1_inv_trans_8x4_helper add add1beforeshift rshift
126        @ At this point:
127        @   src[0]      d0 overwritten later
128        @   src[8]      d2
129        @   src[16]     d4 overwritten later
130        @   src[24]     d6
131        @   src[32]     d1 overwritten later
132        @   src[40]     d3
133        @   src[48]     d5 overwritten later
134        @   src[56]     d7
135
136        movw            r12,   #12
137        vmov.i16        q14,   #\add            @ t1|t2 will accumulate here
138        movt            r12,   #6
139
140        vadd.i16        d20,   d0,  d1          @ temp1 = src[0] + src[32]
141        vsub.i16        d21,   d0,  d1          @ temp2 = src[0] - src[32]
142        vmov.i32        d0[0], r12              @ 16-bit: d0[0] = #12, d0[1] = #6
143
144        vshl.i16        q15,   q2,  #4          @ t3|t4 = 16 * (src[16]|src[48])
145        vswp            d4,    d5               @ q2 = src[48]|src[16]
146        vmla.i16        q14,   q10, d0[0]       @ t1|t2 = 12 * (temp1|temp2) + 64
147        movw            r12,   #15
148        movt            r12,   #9
149        vmov.i32        d0[1], r12              @ 16-bit: d0[2] = #15, d0[3] = #9
150        vneg.s16        d31,   d31              @ t4 = -t4
151        vmla.i16        q15,   q2,  d0[1]       @ t3|t4 += 6 * (src[48]|src[16])
152
153        @ At this point:
154        @   d0[2]   #15
155        @   d0[3]   #9
156        @   q1      src[8]|src[40]
157        @   q3      src[24]|src[56]
158        @   q14     old t1|t2
159        @   q15     old t3|t4
160
161        vshl.i16        q8,  q1,  #4            @ t1|t2 = 16 * (src[8]|src[40])
162        vswp            d2,  d3                 @ q1 = src[40]|src[8]
163        vshl.i16        q12, q3,  #4            @ temp3a|temp4a = 16 * src[24]|src[56]
164        vswp            d6,  d7                 @ q3 = src[56]|src[24]
165        vshl.i16        q13, q1,  #2            @ temp3b|temp4b = 4 * (src[40]|src[8])
166        vshl.i16        q2,  q3,  #2            @ temp1|temp2 = 4 * (src[56]|src[24])
167        vswp            d3,  d6                 @ q1 = src[40]|src[56], q3 = src[8]|src[24]
168        vsub.i16        q9,  q13, q12           @ t3|t4 = - (temp3a|temp4a) + (temp3b|temp4b)
169        vadd.i16        q8,  q8,  q2            @ t1|t2 += temp1|temp2
170        vmul.i16        q12, q3,  d0[3]         @ temp3|temp4 = 9 * src[8]|src[24]
171        vmla.i16        q8,  q1,  d0[3]         @ t1|t2 += 9 * (src[40]|src[56])
172        vswp            d6,  d7                 @ q3 = src[24]|src[8]
173        vswp            d2,  d3                 @ q1 = src[56]|src[40]
174
175        vsub.i16        q11, q14, q15           @ t8|t7 = old t1|t2 - old t3|t4
176        vadd.i16        q10, q14, q15           @ t5|t6 = old t1|t2 + old t3|t4
177  .if \add1beforeshift
178        vmov.i16        q15, #1
179  .endif
180
181        vadd.i16        d18, d18, d24           @ t3 += temp3
182        vsub.i16        d19, d19, d25           @ t4 -= temp4
183
184        vswp            d22, d23                @ q11 = t7|t8
185
186        vneg.s16        d17, d17                @ t2 = -t2
187        vmla.i16        q9,  q1,  d0[2]         @ t3|t4 += 15 * src[56]|src[40]
188        vmla.i16        q8,  q3,  d0[2]         @ t1|t2 += 15 * src[24]|src[8]
189
190        @ At this point:
191        @   t1  d16
192        @   t2  d17
193        @   t3  d18
194        @   t4  d19
195        @   t5  d20
196        @   t6  d21
197        @   t7  d22
198        @   t8  d23
199        @   #1  q15
200
201  .if \add1beforeshift
202        vadd.i16        q3,  q15, q10           @ line[7,6] = t5|t6 + 1
203        vadd.i16        q2,  q15, q11           @ line[5,4] = t7|t8 + 1
204  .endif
205
206        @ Sometimes this overflows, so to get one additional bit of precision, use
207        @ a single instruction that both adds and shifts right (halving).
208        vhadd.s16       q1,  q9,  q11           @ line[2,3] = (t3|t4 + t7|t8) >> 1
209        vhadd.s16       q0,  q8,  q10           @ line[0,1] = (t1|t2 + t5|t6) >> 1
210  .if \add1beforeshift
211        vhsub.s16       q2,  q2,  q9            @ line[5,4] = (t7|t8 - t3|t4 + 1) >> 1
212        vhsub.s16       q3,  q3,  q8            @ line[7,6] = (t5|t6 - t1|t2 + 1) >> 1
213  .else
214        vhsub.s16       q2,  q11, q9            @ line[5,4] = (t7|t8 - t3|t4) >> 1
215        vhsub.s16       q3,  q10, q8            @ line[7,6] = (t5|t6 - t1|t2) >> 1
216  .endif
217
218        vshr.s16        q9,  q1,  #(\rshift - 1)    @ one shift is already done by vhadd/vhsub above
219        vshr.s16        q8,  q0,  #(\rshift - 1)
220        vshr.s16        q10, q2,  #(\rshift - 1)
221        vshr.s16        q11, q3,  #(\rshift - 1)
222
223        @ At this point:
224        @   dst[0]   d16
225        @   dst[1]   d17
226        @   dst[2]   d18
227        @   dst[3]   d19
228        @   dst[4]   d21
229        @   dst[5]   d20
230        @   dst[6]   d23
231        @   dst[7]   d22
232.endm
233
234@ This is modeled after the first and second for loop in vc1_inv_trans_8x8_c.
235@ Input columns:  q8, q9, q10, q11, q12, q13, q14, q15
236@ Output columns: q8, q9, q10, q11, q12, q13, q14, q15
237@ Trashes all NEON registers (and r12) except for: q4 q5 q6 q7
238.macro vc1_inv_trans_8x8_helper add add1beforeshift rshift
239        @ This actually computes half of t1, t2, t3, t4, as explained below
240        @ near `tNhalf`.
241        vmov.i16        q0,    #(6 / 2)         @ q0 = #6/2
242        vshl.i16        q1,    q10, #3          @ t3 = 16/2 * src[16]
243        vshl.i16        q3,    q14, #3          @ temp4 = 16/2 * src[48]
244        vmul.i16        q2,    q10, q0          @ t4 = 6/2 * src[16]
245        vmla.i16        q1,    q14, q0          @ t3 += 6/2 * src[48]
246        @ unused: q0, q10, q14
247        vmov.i16        q0,    #(12 / 2)        @ q0 = #12/2
248        vadd.i16        q10,   q8,  q12         @ temp1 = src[0] + src[32]
249        vsub.i16        q14,   q8,  q12         @ temp2 = src[0] - src[32]
250        @ unused: q8, q12
251        vmov.i16        q8,    #(\add / 2)      @ t1 will accumulate here
252        vmov.i16        q12,   #(\add / 2)      @ t2 will accumulate here
253        movw            r12,   #15
254        vsub.i16        q2,    q2,  q3          @ t4 = 6/2 * src[16] - 16/2 * src[48]
255        movt            r12,   #9
256        @ unused: q3
257        vmla.i16        q8,    q10, q0          @ t1 = 12/2 * temp1 + add
258        vmla.i16        q12,   q14, q0          @ t2 = 12/2 * temp2 + add
259        vmov.i32        d0[0], r12
260        @ unused: q3, q10, q14
261
262        @ At this point:
263        @   q0          d0=#15|#9
264        @   q1  old t3
265        @   q2  old t4
266        @   q3
267        @   q8  old t1
268        @   q9          src[8]
269        @   q10
270        @   q11         src[24]
271        @   q12 old t2
272        @   q13         src[40]
273        @   q14
274        @   q15         src[56]
275
276        @ unused: q3, q10, q14
277        movw            r12,   #16
278        vshl.i16        q3,    q9,  #4          @ t1 = 16 * src[8]
279        movt            r12,   #4
280        vshl.i16        q10,   q9,  #2          @ t4 = 4 * src[8]
281        vmov.i32        d1[0], r12
282        vmul.i16        q14,   q9,  d0[0]       @ t2 = 15 * src[8]
283        vmul.i16        q9,    q9,  d0[1]       @ t3 = 9 * src[8]
284        @ unused: none
285        vmla.i16        q3,    q11, d0[0]       @ t1 += 15 * src[24]
286        vmls.i16        q10,   q11, d0[1]       @ t4 -= 9 * src[24]
287        vmls.i16        q14,   q11, d1[1]       @ t2 -= 4 * src[24]
288        vmls.i16        q9,    q11, d1[0]       @ t3 -= 16 * src[24]
289        @ unused: q11
290        vmla.i16        q3,    q13, d0[1]       @ t1 += 9 * src[40]
291        vmla.i16        q10,   q13, d0[0]       @ t4 += 15 * src[40]
292        vmls.i16        q14,   q13, d1[0]       @ t2 -= 16 * src[40]
293        vmla.i16        q9,    q13, d1[1]       @ t3 += 4 * src[40]
294        @ unused: q11, q13
295
296        @ Compute t5, t6, t7, t8 from old t1, t2, t3, t4. Actually, it computes
297        @ half of t5, t6, t7, t8 since t1, t2, t3, t4 are halved.
298        vadd.i16        q11,   q8,  q1          @ t5 = t1 + t3
299        vsub.i16        q1,    q8,  q1          @ t8 = t1 - t3
300        vadd.i16        q13,   q12, q2          @ t6 = t2 + t4
301        vsub.i16        q2,    q12, q2          @ t7 = t2 - t4
302        @ unused: q8, q12
303
304  .if \add1beforeshift
305        vmov.i16        q12,   #1
306  .endif
307
308        @ unused: q8
309        vmla.i16        q3,    q15, d1[1]       @ t1 += 4 * src[56]
310        vmls.i16        q14,   q15, d0[1]       @ t2 -= 9 * src[56]
311        vmla.i16        q9,    q15, d0[0]       @ t3 += 15 * src[56]
312        vmls.i16        q10,   q15, d1[0]       @ t4 -= 16 * src[56]
313        @ unused: q0, q8, q15
314
315        @ At this point:
316        @   t1      q3
317        @   t2      q14
318        @   t3      q9
319        @   t4      q10
320        @   t5half  q11
321        @   t6half  q13
322        @   t7half  q2
323        @   t8half  q1
324        @   #1      q12
325        @
326        @ tNhalf is half of the value of tN (as described in vc1_inv_trans_8x8_c).
327        @ This is done because some files have input that makes tN + tM overflow.
328        @ To avoid that, we compute tNhalf, then tNhalf + tM (which does not
329        @ overflow), and finally use vhadd to compute (tNhalf + (tNhalf + tM)) >> 1.
330        @ That equals (tN + tM) >> 1, and it cannot overflow because the add and
331        @ the halving happen inside one instruction.
332
333        @ For each pair of tN and tM, do:
334        @   lineA = t5half + t1
335        @   if add1beforeshift:  t1 -= 1
336        @   lineA = (t5half + lineA) >> 1
337        @   lineB = t5half - t1
338        @   lineB = (t5half + lineB) >> 1
339        @   lineA >>= rshift - 1
340        @   lineB >>= rshift - 1
341
342        vadd.i16        q8,  q11, q3                @ q8 = t5half + t1
343  .if \add1beforeshift
344        vsub.i16        q3,  q3,  q12               @ q3 = t1 - 1
345  .endif
346
347        vadd.i16        q0,  q13, q14               @ q0  = t6half + t2
348  .if \add1beforeshift
349        vsub.i16        q14, q14, q12               @ q14 = t2 - 1
350  .endif
351
352        vadd.i16        q15, q2,  q9                @ q15 = t7half + t3
353  .if \add1beforeshift
354        vsub.i16        q9,  q9,  q12               @ q9  = t3 - 1
355  .endif
356        @ unused: none
357
358        vhadd.s16       q8,  q11, q8                @ q8  = (t5half + t5half + t1) >> 1
359        vsub.i16        q3,  q11, q3                @ q3  = t5half - t1 + 1
360
361        vhadd.s16       q0,  q13, q0                @ q0  = (t6half + t6half + t2) >> 1
362        vsub.i16        q14, q13, q14               @ q14 = t6half - t2 + 1
363
364        vhadd.s16       q15, q2,  q15               @ q15 = (t7half + t7half + t3) >> 1
365        vsub.i16        q9,  q2,  q9                @ q9  = t7half - t3 + 1
366
367        vhadd.s16       q3,  q11, q3                @ q3  = (t5half + t5half - t1 + 1) >> 1
368        @ unused: q11
369
370        vadd.i16        q11, q1,  q10               @ q11 = t8half + t4
371  .if \add1beforeshift
372        vsub.i16        q10, q10, q12               @ q10 = t4 - 1
373  .endif
374        @ unused: q12
375
376        vhadd.s16       q14, q13, q14               @ q14 = (t6half + t6half - t2 + 1) >> 1
377        @ unused: q12, q13
378        vhadd.s16       q13, q2,  q9                @ q13 = (t7half + t7half - t3 + 1) >> 1
379        @ unused: q12, q2, q9
380
381        vsub.i16        q10, q1,  q10               @ q10 = t8half - t4 + 1
382        vhadd.s16       q11, q1,  q11               @ q11 = (t8half + t8half + t4) >> 1
383
384        vshr.s16        q8,  q8,  #(\rshift - 1)    @ q8  = line[0]
385        vhadd.s16       q12, q1,  q10               @ q12 = (t8half + t8half - t4 + 1) >> 1
386        vshr.s16        q9,  q0,  #(\rshift - 1)    @ q9  = line[1]
387        vshr.s16        q10, q15, #(\rshift - 1)    @ q10 = line[2]
388        vshr.s16        q11, q11, #(\rshift - 1)    @ q11 = line[3]
389        vshr.s16        q12, q12, #(\rshift - 1)    @ q12 = line[4]
390        vshr.s16        q13, q13, #(\rshift - 1)    @ q13 = line[5]
391        vshr.s16        q14, q14, #(\rshift - 1)    @ q14 = line[6]
392        vshr.s16        q15, q3,  #(\rshift - 1)    @ q15 = line[7]
393.endm
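
@ For reference, the full-precision scalar math being vectorized above,
@ reconstructed from the comments (vc1_inv_trans_8x8_c itself may differ in
@ naming; u1..u4 stand for the odd-part accumulators that the comments also
@ call t1..t4):
@
@   t1 = 12 * (src[0] + src[32]) + add;   t2 = 12 * (src[0] - src[32]) + add;
@   t3 = 16 * src[16] + 6 * src[48];      t4 =  6 * src[16] - 16 * src[48];
@   t5 = t1 + t3;   t6 = t2 + t4;   t7 = t2 - t4;   t8 = t1 - t3;
@
@   u1 = 16 * src[8] + 15 * src[24] +  9 * src[40] +  4 * src[56];
@   u2 = 15 * src[8] -  4 * src[24] - 16 * src[40] -  9 * src[56];
@   u3 =  9 * src[8] - 16 * src[24] +  4 * src[40] + 15 * src[56];
@   u4 =  4 * src[8] -  9 * src[24] + 15 * src[40] - 16 * src[56];
@
@   line[0] = (t5 + u1) >> rshift;   line[7] = (t5 - u1 + c) >> rshift;
@   line[1] = (t6 + u2) >> rshift;   line[6] = (t6 - u2 + c) >> rshift;
@   line[2] = (t7 + u3) >> rshift;   line[5] = (t7 - u3 + c) >> rshift;
@   line[3] = (t8 + u4) >> rshift;   line[4] = (t8 - u4 + c) >> rshift;
@
@ where c is 1 when add1beforeshift is set and 0 otherwise. The macro keeps
@ t1..t8 halved and uses vhadd so that none of the 16-bit sums can overflow.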
394
395@ (int16_t *block [r0])
396function ff_vc1_inv_trans_8x8_neon, export=1
397        vld1.64         {q8-q9},   [r0,:128]!
398        vld1.64         {q10-q11}, [r0,:128]!
399        vld1.64         {q12-q13}, [r0,:128]!
400        vld1.64         {q14-q15}, [r0,:128]
401        sub             r0, r0, #(16 * 2 * 3)   @ restore r0
402
403        @ At this point:
404        @   src[0]  q8
405        @   src[8]  q9
406        @   src[16] q10
407        @   src[24] q11
408        @   src[32] q12
409        @   src[40] q13
410        @   src[48] q14
411        @   src[56] q15
412
413        vc1_inv_trans_8x8_helper add=4, add1beforeshift=0, rshift=3
414
415        @ Transpose result matrix of 8x8
416        swap4           d17, d19, d21, d23, d24, d26, d28, d30
417        transpose16_4x4 q8,  q9,  q10, q11, q12, q13, q14, q15
418
419        vc1_inv_trans_8x8_helper add=64, add1beforeshift=1, rshift=7
420
421        vst1.64         {q8-q9},   [r0,:128]!
422        vst1.64         {q10-q11}, [r0,:128]!
423        vst1.64         {q12-q13}, [r0,:128]!
424        vst1.64         {q14-q15}, [r0,:128]
425
426        bx              lr
427endfunc
428
429@ (uint8_t *dest [r0], ptrdiff_t stride [r1], int16_t *block [r2])
430function ff_vc1_inv_trans_8x4_neon, export=1
431        vld1.64         {q0-q1}, [r2,:128]!     @ load 8 * 4 * 2 = 64 bytes / 16 bytes per quad = 4 quad registers
432        vld1.64         {q2-q3}, [r2,:128]
433
434        transpose16     q0, q1, q2, q3          @ transpose rows to columns
435
436        @ At this point:
437        @   src[0]   d0
438        @   src[1]   d2
439        @   src[2]   d4
440        @   src[3]   d6
441        @   src[4]   d1
442        @   src[5]   d3
443        @   src[6]   d5
444        @   src[7]   d7
445
446        vc1_inv_trans_8x4_helper    add=4, add1beforeshift=0, rshift=3
447
448        @ Move output to more standardized registers
449        vmov        d0, d16
450        vmov        d2, d17
451        vmov        d4, d18
452        vmov        d6, d19
453        vmov        d1, d21
454        vmov        d3, d20
455        vmov        d5, d23
456        vmov        d7, d22
457
458        @ At this point:
459        @   dst[0]   d0
460        @   dst[1]   d2
461        @   dst[2]   d4
462        @   dst[3]   d6
463        @   dst[4]   d1
464        @   dst[5]   d3
465        @   dst[6]   d5
466        @   dst[7]   d7
467
468        transpose16     q0, q1, q2, q3   @ turn columns into rows
469
470        @ At this point:
471        @   row[0] q0
472        @   row[1] q1
473        @   row[2] q2
474        @   row[3] q3
475
476        vc1_inv_trans_4x8_helper    add=64, rshift=7
477
478        @ At this point:
479        @   line[0].l   d0
480        @   line[0].h   d1
481        @   line[1].l   d2
482        @   line[1].h   d3
483        @   line[2].l   d4
484        @   line[2].h   d5
485        @   line[3].l   d6
486        @   line[3].h   d7
487
488        @ unused registers: q12, q13, q14, q15
489
490        vld1.64         {d28}, [r0,:64], r1     @ read dest
491        vld1.64         {d29}, [r0,:64], r1
492        vld1.64         {d30}, [r0,:64], r1
493        vld1.64         {d31}, [r0,:64], r1
494        sub             r0,  r0,  r1, lsl #2    @ restore original r0 value
495
496        vaddw.u8        q0,  q0,  d28           @ line[0] += dest[0]
497        vaddw.u8        q1,  q1,  d29           @ line[1] += dest[1]
498        vaddw.u8        q2,  q2,  d30           @ line[2] += dest[2]
499        vaddw.u8        q3,  q3,  d31           @ line[3] += dest[3]
500
501        vqmovun.s16     d0,  q0                 @ line[0]
502        vqmovun.s16     d1,  q1                 @ line[1]
503        vqmovun.s16     d2,  q2                 @ line[2]
504        vqmovun.s16     d3,  q3                 @ line[3]
505
506        vst1.64         {d0},  [r0,:64], r1     @ write dest
507        vst1.64         {d1},  [r0,:64], r1
508        vst1.64         {d2},  [r0,:64], r1
509        vst1.64         {d3},  [r0,:64]
510
511        bx              lr
512endfunc
513
514@ (uint8_t *dest [r0], ptrdiff_t stride [r1], int16_t *block [r2])
515function ff_vc1_inv_trans_4x8_neon, export=1
516        mov             r12, #(8 * 2)  @ 8 elements per line, each element 2 bytes
517        vld4.16         {d0[],  d2[],  d4[],  d6[]},  [r2,:64], r12     @ read each column into a q register
518        vld4.16         {d0[1], d2[1], d4[1], d6[1]}, [r2,:64], r12
519        vld4.16         {d0[2], d2[2], d4[2], d6[2]}, [r2,:64], r12
520        vld4.16         {d0[3], d2[3], d4[3], d6[3]}, [r2,:64], r12
521        vld4.16         {d1[],  d3[],  d5[],  d7[]},  [r2,:64], r12
522        vld4.16         {d1[1], d3[1], d5[1], d7[1]}, [r2,:64], r12
523        vld4.16         {d1[2], d3[2], d5[2], d7[2]}, [r2,:64], r12
524        vld4.16         {d1[3], d3[3], d5[3], d7[3]}, [r2,:64]
525
526        vc1_inv_trans_4x8_helper    add=4, rshift=3
527
528        @ At this point:
529        @   dst[0] = q0
530        @   dst[1] = q1
531        @   dst[2] = q2
532        @   dst[3] = q3
533
534        transpose16     q0, q1, q2, q3  @ Transpose rows (registers) into columns
535
536        vc1_inv_trans_8x4_helper    add=64, add1beforeshift=1, rshift=7
537
538        vld1.32         {d28[]},  [r0,:32], r1  @ read dest
539        vld1.32         {d28[1]}, [r0,:32], r1
540        vld1.32         {d29[]},  [r0,:32], r1
541        vld1.32         {d29[1]}, [r0,:32], r1
542
543        vld1.32         {d30[]},  [r0,:32], r1
544        vld1.32         {d30[0]}, [r0,:32], r1
545        vld1.32         {d31[]},  [r0,:32], r1
546        vld1.32         {d31[0]}, [r0,:32], r1
547        sub             r0,  r0,  r1, lsl #3    @ restore original r0 value
548
549        vaddw.u8        q8,  q8,  d28           @ line[0,1] += dest[0,1]
550        vaddw.u8        q9,  q9,  d29           @ line[2,3] += dest[2,3]
551        vaddw.u8        q10, q10, d30           @ line[5,4] += dest[5,4]
552        vaddw.u8        q11, q11, d31           @ line[7,6] += dest[7,6]
553
554        vqmovun.s16     d16, q8                 @ clip(line[0,1])
555        vqmovun.s16     d18, q9                 @ clip(line[2,3])
556        vqmovun.s16     d20, q10                @ clip(line[5,4])
557        vqmovun.s16     d22, q11                @ clip(line[7,6])
558
559        vst1.32         {d16[0]}, [r0,:32], r1  @ write dest
560        vst1.32         {d16[1]}, [r0,:32], r1
561        vst1.32         {d18[0]}, [r0,:32], r1
562        vst1.32         {d18[1]}, [r0,:32], r1
563
564        vst1.32         {d20[1]}, [r0,:32], r1
565        vst1.32         {d20[0]}, [r0,:32], r1
566        vst1.32         {d22[1]}, [r0,:32], r1
567        vst1.32         {d22[0]}, [r0,:32]
568
569        bx              lr
570endfunc
571
572@ Setup constants in registers which are used by vc1_inv_trans_4x4_helper
573.macro vc1_inv_trans_4x4_helper_setup
574        vmov.i16        q13, #17
575        vmov.i16        q14, #22
576        vmov.i16        d30, #10                @ only need double-word, not quad-word
577.endm
578
579@ This is modeled after the first for loop in vc1_inv_trans_4x4_c.
580.macro vc1_inv_trans_4x4_helper add rshift
581        vmov.i16        q2,  #\add              @ t1|t2 will accumulate here
582
583        vadd.i16        d16, d0,  d1            @ temp1 = src[0] + src[2]
584        vsub.i16        d17, d0,  d1            @ temp2 = src[0] - src[2]
585        vmul.i16        q3,  q14, q1            @ t3|t4 = 22 * (src[1]|src[3])
586        vmla.i16        q2,  q13, q8            @ t1|t2 = 17 * (temp1|temp2) + add
587        vmla.i16        d6,  d30, d3            @ t3 += 10 * src[3]
588        vmls.i16        d7,  d30, d2            @ t4 -= 10 * src[1]
589
590        vadd.i16        q0,  q2,  q3            @ dst[0,2] = (t1|t2 + t3|t4)
591        vsub.i16        q1,  q2,  q3            @ dst[3,1] = (t1|t2 - t3|t4)
592        vshr.s16        q0,  q0,  #\rshift      @ dst[0,2] >>= rshift
593        vshr.s16        q1,  q1,  #\rshift      @ dst[3,1] >>= rshift
594.endm
595
596@ (uint8_t *dest [r0], ptrdiff_t stride [r1], int16_t *block [r2])
597function ff_vc1_inv_trans_4x4_neon, export=1
598        mov             r12, #(8 * 2)  @ 8 elements per line, each element 2 bytes
599        vld4.16         {d0[],  d1[],  d2[],  d3[]},  [r2,:64], r12     @ read each column into a register
600        vld4.16         {d0[1], d1[1], d2[1], d3[1]}, [r2,:64], r12
601        vld4.16         {d0[2], d1[2], d2[2], d3[2]}, [r2,:64], r12
602        vld4.16         {d0[3], d1[3], d2[3], d3[3]}, [r2,:64]
603
604        vswp            d1,  d2         @ so that we can later access column 1 and column 3 as a single q1 register
605
606        vc1_inv_trans_4x4_helper_setup
607
608        @ At this point:
609        @   src[0] = d0
610        @   src[1] = d2
611        @   src[2] = d1
612        @   src[3] = d3
613
614        vc1_inv_trans_4x4_helper add=4, rshift=3     @ compute t1, t2, t3, t4 and combine them into dst[0-3]
615
616        @ At this point:
617        @   dst[0] = d0
618        @   dst[1] = d3
619        @   dst[2] = d1
620        @   dst[3] = d2
621
622        transpose16     d0, d3, d1, d2  @ Transpose rows (registers) into columns
623
624        @ At this point:
625        @   src[0]  = d0
626        @   src[8]  = d3
627        @   src[16] = d1
628        @   src[24] = d2
629
630        vswp            d2,  d3         @ so that we can later access column 1 and column 3 in order as a single q1 register
631
632        @ At this point:
633        @   src[0]  = d0
634        @   src[8]  = d2
635        @   src[16] = d1
636        @   src[24] = d3
637
638        vc1_inv_trans_4x4_helper add=64, rshift=7             @ compute t1, t2, t3, t4 and combine them into dst[0-3]
639
640        @ At this point:
641        @   line[0] = d0
642        @   line[1] = d3
643        @   line[2] = d1
644        @   line[3] = d2
645
646        vld1.32         {d18[]},  [r0,:32], r1  @ read dest
647        vld1.32         {d19[]},  [r0,:32], r1
648        vld1.32         {d18[1]}, [r0,:32], r1
649        vld1.32         {d19[0]}, [r0,:32], r1
650        sub             r0,  r0,  r1, lsl #2    @ restore original r0 value
651
652        vaddw.u8        q0,  q0,  d18           @ line[0,2] += dest[0,2]
653        vaddw.u8        q1,  q1,  d19           @ line[3,1] += dest[3,1]
654
655        vqmovun.s16     d0,  q0                 @ clip(line[0,2])
656        vqmovun.s16     d1,  q1                 @ clip(line[3,1])
657
658        vst1.32         {d0[0]},  [r0,:32], r1  @ write dest
659        vst1.32         {d1[1]},  [r0,:32], r1
660        vst1.32         {d0[1]},  [r0,:32], r1
661        vst1.32         {d1[0]},  [r0,:32]
662
663        bx              lr
664endfunc
665
666@ The absolute values of the multiplication constants from vc1_mspel_filter and vc1_mspel_{ver,hor}_filter_16bits.
667@ The signs are embedded in the code below that carries out the multiplication (mspel_filter{,.16}).
668#define MSPEL_MODE_1_MUL_CONSTANTS  4, 53, 18, 3
669#define MSPEL_MODE_2_MUL_CONSTANTS  1, 9,  9,  1
670#define MSPEL_MODE_3_MUL_CONSTANTS  3, 18, 53, 4
671
672@ These constants come from reading the source code of vc1_mspel_mc: the first value is what
673@ is added to `rnd` to produce the variable `r`, and the second is the value of the variable `shift`.
674#define MSPEL_MODES_11_ADDSHIFT_CONSTANTS   15, 5
675#define MSPEL_MODES_12_ADDSHIFT_CONSTANTS   3,  3
676#define MSPEL_MODES_13_ADDSHIFT_CONSTANTS   15, 5
677#define MSPEL_MODES_21_ADDSHIFT_CONSTANTS   MSPEL_MODES_12_ADDSHIFT_CONSTANTS
678#define MSPEL_MODES_22_ADDSHIFT_CONSTANTS   0,  1
679#define MSPEL_MODES_23_ADDSHIFT_CONSTANTS   3,  3
680#define MSPEL_MODES_31_ADDSHIFT_CONSTANTS   MSPEL_MODES_13_ADDSHIFT_CONSTANTS
681#define MSPEL_MODES_32_ADDSHIFT_CONSTANTS   MSPEL_MODES_23_ADDSHIFT_CONSTANTS
682#define MSPEL_MODES_33_ADDSHIFT_CONSTANTS   15, 5
683
684@ The addition and shift constants from vc1_mspel_filter.
685#define MSPEL_MODE_1_ADDSHIFT_CONSTANTS     32, 6
686#define MSPEL_MODE_2_ADDSHIFT_CONSTANTS     8,  4
687#define MSPEL_MODE_3_ADDSHIFT_CONSTANTS     32, 6
688
689@ Setup constants in registers for a subsequent use of mspel_filter{,.16}.
690.macro mspel_constants typesize reg_a reg_b reg_c reg_d filter_a filter_b filter_c filter_d reg_add filter_add_register
691  @ Typesize should be i8 or i16.
692
693  @ Only set a register if the value is not 1 and not a duplicate of an earlier constant
694  .if \filter_a != 1
695        vmov.\typesize  \reg_a,  #\filter_a          @ reg_a = filter_a
696  .endif
697        vmov.\typesize  \reg_b,  #\filter_b          @ reg_b = filter_b
698  .if \filter_b != \filter_c
699        vmov.\typesize  \reg_c,  #\filter_c          @ reg_c = filter_c
700  .endif
701  .if \filter_d != 1
702        vmov.\typesize  \reg_d,  #\filter_d          @ reg_d = filter_d
703  .endif
704  @ vdup the add value at double the width of typesize (the accumulator elements are widened)
705  .ifc \typesize,i8
706        vdup.16         \reg_add,  \filter_add_register     @ reg_add = filter_add_register
707  .else
708        vdup.32         \reg_add,  \filter_add_register     @ reg_add = filter_add_register
709  .endif
710.endm
711
712@ After mspel_constants has been used, do the filtering.
713.macro mspel_filter acc dest src0 src1 src2 src3 filter_a filter_b filter_c filter_d reg_a reg_b reg_c reg_d reg_add filter_shift narrow=1
714  .if \filter_a != 1
715        @ If filter_a != 1, then we need a move and subtract instruction
716        vmov            \acc,  \reg_add                     @ acc = reg_add
717        vmlsl.u8        \acc,  \reg_a,  \src0               @ acc -= filter_a * src[-stride]
718  .else
719        @ If filter_a is 1, then just subtract without an extra move
720        vsubw.u8        \acc,  \reg_add,  \src0             @ acc = reg_add - src[-stride]      @ since filter_a == 1
721  .endif
722        vmlal.u8        \acc,  \reg_b,  \src1               @ acc += filter_b * src[0]
723  .if \filter_b != \filter_c
724        vmlal.u8        \acc,  \reg_c,  \src2               @ acc += filter_c * src[stride]
725  .else
726        @ If filter_b is the same as filter_c, use the same reg_b register
727        vmlal.u8        \acc,  \reg_b,  \src2               @ acc += filter_c * src[stride]     @ where filter_c == filter_b
728  .endif
729  .if \filter_d != 1
730        @ If filter_d != 1, then do a multiply accumulate
731        vmlsl.u8        \acc,  \reg_d,  \src3               @ acc -= filter_d * src[stride * 2]
732  .else
733        @ If filter_d is 1, then just do a subtract
734        vsubw.u8        \acc,  \acc,    \src3               @ acc -= src[stride * 2]            @ since filter_d == 1
735  .endif
736  .if \narrow
737        vqshrun.s16     \dest, \acc,    #\filter_shift      @ dest = clip_uint8(acc >> filter_shift)
738  .else
739        vshr.s16        \dest, \acc,    #\filter_shift      @ dest = acc >> filter_shift
740  .endif
741.endm
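
@ A rough scalar model of one mspel_filter output pixel, reconstructed from
@ the comments above (clip_uint8 denotes the usual 0..255 clamp):
@
@   acc  = add_value;                    /* broadcast in reg_add       */
@   acc -= filter_a * src[-stride];
@   acc += filter_b * src[0];
@   acc += filter_c * src[stride];
@   acc -= filter_d * src[stride * 2];
@   dst  = narrow ? clip_uint8(acc >> filter_shift)
@                 : acc >> filter_shift;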
742
743@ This is similar to mspel_filter, but the input is 16-bit instead of 8-bit and narrow=0 is not supported.
744.macro mspel_filter.16 acc0 acc1 acc0_0 acc0_1 dest src0 src1 src2 src3 src4 src5 src6 src7 filter_a filter_b filter_c filter_d reg_a reg_b reg_c reg_d reg_add filter_shift
745  .if \filter_a != 1
746        vmov            \acc0,  \reg_add
747        vmov            \acc1,  \reg_add
748        vmlsl.s16       \acc0,  \reg_a,  \src0
749        vmlsl.s16       \acc1,  \reg_a,  \src1
750  .else
751        vsubw.s16       \acc0,  \reg_add,  \src0
752        vsubw.s16       \acc1,  \reg_add,  \src1
753  .endif
754        vmlal.s16       \acc0,  \reg_b,  \src2
755        vmlal.s16       \acc1,  \reg_b,  \src3
756  .if \filter_b != \filter_c
757        vmlal.s16       \acc0,  \reg_c,  \src4
758        vmlal.s16       \acc1,  \reg_c,  \src5
759  .else
760        vmlal.s16       \acc0,  \reg_b,  \src4
761        vmlal.s16       \acc1,  \reg_b,  \src5
762  .endif
763  .if \filter_d != 1
764        vmlsl.s16       \acc0,  \reg_d,  \src6
765        vmlsl.s16       \acc1,  \reg_d,  \src7
766  .else
767        vsubw.s16       \acc0,  \acc0,   \src6
768        vsubw.s16       \acc1,  \acc1,   \src7
769  .endif
770        @ Use acc0_0 and acc0_1 as temp space
771        vqshrun.s32     \acc0_0, \acc0,  #\filter_shift     @ Shift and narrow with saturation from s32 to u16
772        vqshrun.s32     \acc0_1, \acc1,  #\filter_shift
773        vqmovn.u16      \dest,  \acc0                       @ Narrow with saturation from u16 to u8
774.endm
775
776@ Register usage for put_vc1_mspel_mc functions. Registers marked 'hv' are only used in put_vc1_mspel_mc_hv.
777@
778@   r0        adjusted dst
779@   r1        adjusted src
780@   r2        stride
781@   r3        adjusted rnd
782@   r4 [hv]   tmp
783@   r11 [hv]  sp saved
784@   r12       loop counter
785@   d0        src[-stride]
786@   d1        src[0]
787@   d2        src[stride]
788@   d3        src[stride * 2]
789@   q0 [hv]   src[-stride]
790@   q1 [hv]   src[0]
791@   q2 [hv]   src[stride]
792@   q3 [hv]   src[stride * 2]
793@   d21       often result from mspel_filter
794@   q11       accumulator 0
795@   q12 [hv]  accumulator 1
796@   q13       accumulator initial value
797@   d28       filter_a
798@   d29       filter_b
799@   d30       filter_c
800@   d31       filter_d
801
802@ (uint8_t *dst [r0], const uint8_t *src [r1], ptrdiff_t stride [r2], int rnd [r3])
803.macro put_vc1_mspel_mc_hv hmode vmode filter_h_a filter_h_b filter_h_c filter_h_d filter_v_a filter_v_b filter_v_c filter_v_d filter_add filter_shift
804function ff_put_vc1_mspel_mc\hmode\()\vmode\()_neon, export=1
805        push            {r4, r11, lr}
806        mov             r11, sp                 @ r11 = stack pointer before realignment
807A       bic             sp,  sp,  #15           @ sp = round down to multiple of 16 bytes
808T       bic             r4,  r11, #15
809T       mov             sp,  r4
810        sub             sp,  sp,  #(8*2*16)     @ make space for 8 rows * 2 bytes per element * 16 elements per row (to fit 11 actual elements per row)
811        mov             r4,  sp                 @ r4 = int16_t tmp[8 * 16]
812
813        sub             r1,  r1,  #1            @ src -= 1
814  .if \filter_add != 0
815        add             r3,  r3,  #\filter_add  @ r3 = filter_add + rnd
816  .endif
817        mov             r12, #8                 @ loop counter
818        sub             r1,  r1,  r2            @ r1 = &src[-stride]      @ slide back
819
820        @ Do vertical filtering from src into tmp
821        mspel_constants i8, d28, d29, d30, d31, \filter_v_a, \filter_v_b, \filter_v_c, \filter_v_d, q13, r3
822
823        vld1.64         {d0,d1}, [r1], r2
824        vld1.64         {d2,d3}, [r1], r2
825        vld1.64         {d4,d5}, [r1], r2
826
8271:
828        subs            r12,  r12,  #4
829
830        vld1.64         {d6,d7}, [r1], r2
831        mspel_filter    q11, q11, d0, d2, d4, d6, \filter_v_a, \filter_v_b, \filter_v_c, \filter_v_d, d28, d29, d30, d31, q13, \filter_shift, narrow=0
832        mspel_filter    q12, q12, d1, d3, d5, d7, \filter_v_a, \filter_v_b, \filter_v_c, \filter_v_d, d28, d29, d30, d31, q13, \filter_shift, narrow=0
833        vst1.64         {q11,q12}, [r4,:128]!   @ store and increment
834
835        vld1.64         {d0,d1}, [r1], r2
836        mspel_filter    q11, q11, d2, d4, d6, d0, \filter_v_a, \filter_v_b, \filter_v_c, \filter_v_d, d28, d29, d30, d31, q13, \filter_shift, narrow=0
837        mspel_filter    q12, q12, d3, d5, d7, d1, \filter_v_a, \filter_v_b, \filter_v_c, \filter_v_d, d28, d29, d30, d31, q13, \filter_shift, narrow=0
838        vst1.64         {q11,q12}, [r4,:128]!   @ store and increment
839
840        vld1.64         {d2,d3}, [r1], r2
841        mspel_filter    q11, q11, d4, d6, d0, d2, \filter_v_a, \filter_v_b, \filter_v_c, \filter_v_d, d28, d29, d30, d31, q13, \filter_shift, narrow=0
842        mspel_filter    q12, q12, d5, d7, d1, d3, \filter_v_a, \filter_v_b, \filter_v_c, \filter_v_d, d28, d29, d30, d31, q13, \filter_shift, narrow=0
843        vst1.64         {q11,q12}, [r4,:128]!   @ store and increment
844
845        vld1.64         {d4,d5}, [r1], r2
846        mspel_filter    q11, q11, d6, d0, d2, d4, \filter_v_a, \filter_v_b, \filter_v_c, \filter_v_d, d28, d29, d30, d31, q13, \filter_shift, narrow=0
847        mspel_filter    q12, q12, d7, d1, d3, d5, \filter_v_a, \filter_v_b, \filter_v_c, \filter_v_d, d28, d29, d30, d31, q13, \filter_shift, narrow=0
848        vst1.64         {q11,q12}, [r4,:128]!   @ store and increment
849
850        bne             1b
851
852        rsb             r3,   r3,  #(64 + \filter_add)      @ r3 = (64 + filter_add) - r3
853        mov             r12,  #8                @ loop counter
854        mov             r4,   sp                @ r4 = tmp
855
856        @ Do horizontal filtering from tmp to dst
857        mspel_constants i16, d28, d29, d30, d31, \filter_h_a, \filter_h_b, \filter_h_c, \filter_h_d, q13, r3
858
8592:
860        subs            r12,  r12,  #1
861
862        vld1.64         {q0,q1}, [r4,:128]!     @ read one line of tmp
863        vext.16         q2,   q0,   q1,  #2
864        vext.16         q3,   q0,   q1,  #3
865        vext.16         q1,   q0,   q1,  #1     @ do last because it writes to q1 which is read by the other vext instructions
866
867        mspel_filter.16 q11, q12, d22, d23, d21, d0, d1, d2, d3, d4, d5, d6, d7, \filter_h_a, \filter_h_b, \filter_h_c, \filter_h_d, d28, d29, d30, d31, q13, 7
868
869        vst1.64         {d21}, [r0,:64], r2     @ store and increment dst
870
871        bne             2b
872
873        mov             sp,  r11
874        pop             {r4, r11, pc}
875endfunc
876.endm
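
@ Roughly, each generated ff_put_vc1_mspel_mc<h><v>_neon above does two passes
@ (scalar model; the tmp layout and rounding values are as set up in the macro):
@
@   int16_t tmp[8 * 16];                    /* 11 useful columns per row   */
@   /* pass 1: vertical 4-tap filter, u8 -> s16, starting at src - 1 - stride */
@   tmp[y][x] = filter_v(src column taps, rnd + filter_add) >> filter_shift;
@   /* pass 2: horizontal 4-tap filter on tmp, s32 accumulation, then clip  */
@   dst[y][x] = clip_uint8(filter_h(tmp row taps, second-pass value in r3) >> 7);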
877
878@ Use C preprocessor and assembler macros to expand to functions for horizontal and vertical filtering.
879#define PUT_VC1_MSPEL_MC_HV(hmode, vmode)   \
880    put_vc1_mspel_mc_hv hmode, vmode, \
881        MSPEL_MODE_ ## hmode ## _MUL_CONSTANTS, \
882        MSPEL_MODE_ ## vmode ## _MUL_CONSTANTS, \
883        MSPEL_MODES_ ## hmode ## vmode ## _ADDSHIFT_CONSTANTS
884
885PUT_VC1_MSPEL_MC_HV(1, 1)
886PUT_VC1_MSPEL_MC_HV(1, 2)
887PUT_VC1_MSPEL_MC_HV(1, 3)
888PUT_VC1_MSPEL_MC_HV(2, 1)
889PUT_VC1_MSPEL_MC_HV(2, 2)
890PUT_VC1_MSPEL_MC_HV(2, 3)
891PUT_VC1_MSPEL_MC_HV(3, 1)
892PUT_VC1_MSPEL_MC_HV(3, 2)
893PUT_VC1_MSPEL_MC_HV(3, 3)
894
895#undef PUT_VC1_MSPEL_MC_HV
896
897.macro  put_vc1_mspel_mc_h_only hmode filter_a filter_b filter_c filter_d filter_add filter_shift
898function ff_put_vc1_mspel_mc\hmode\()0_neon, export=1
899        rsb             r3,   r3,   #\filter_add        @ r3 = filter_add - r = filter_add - rnd
900        mov             r12,  #8                        @ loop counter
901        sub             r1,   r1,   #1                  @ slide back, using immediate
902
903        mspel_constants i8, d28, d29, d30, d31, \filter_a, \filter_b, \filter_c, \filter_d, q13, r3
904
9051:
906        subs            r12,  r12,  #1
907
908        vld1.64         {d0,d1}, [r1], r2               @ read 16 bytes even though we only need 11, also src += stride
909        vext.8          d2,   d0,   d1,  #2
910        vext.8          d3,   d0,   d1,  #3
911        vext.8          d1,   d0,   d1,  #1             @ do last because it writes to d1 which is read by the other vext instructions
912
913        mspel_filter    q11, d21, d0, d1, d2, d3, \filter_a, \filter_b, \filter_c, \filter_d, d28, d29, d30, d31, q13, \filter_shift
914
915        vst1.64         {d21}, [r0,:64], r2             @ store and increment dst
916
917        bne             1b
918
919        bx              lr
920endfunc
921.endm
922
923@ Use C preprocessor and assembler macros to expand to functions for horizontal only filtering.
924#define PUT_VC1_MSPEL_MC_H_ONLY(hmode) \
925        put_vc1_mspel_mc_h_only hmode, MSPEL_MODE_ ## hmode ## _MUL_CONSTANTS, MSPEL_MODE_ ## hmode ## _ADDSHIFT_CONSTANTS
926
927PUT_VC1_MSPEL_MC_H_ONLY(1)
928PUT_VC1_MSPEL_MC_H_ONLY(2)
929PUT_VC1_MSPEL_MC_H_ONLY(3)
930
931#undef PUT_VC1_MSPEL_MC_H_ONLY
932
933@ (uint8_t *dst [r0], const uint8_t *src [r1], ptrdiff_t stride [r2], int rnd [r3])
934.macro put_vc1_mspel_mc_v_only vmode filter_a filter_b filter_c filter_d filter_add filter_shift
935function ff_put_vc1_mspel_mc0\vmode\()_neon, export=1
936        add             r3,   r3,   #\filter_add - 1    @ r3 = filter_add - r = filter_add - (1 - rnd) = filter_add - 1 + rnd
937        mov             r12,  #8                        @ loop counter
938        sub             r1,   r1,   r2                  @ r1 = &src[-stride]      @ slide back
939
940        mspel_constants i8, d28, d29, d30, d31, \filter_a, \filter_b, \filter_c, \filter_d, q13, r3
941
942        vld1.64         {d0},  [r1], r2                 @ d0 = src[-stride]
943        vld1.64         {d1},  [r1], r2                 @ d1 = src[0]
944        vld1.64         {d2},  [r1], r2                 @ d2 = src[stride]
945
9461:
947        subs            r12,  r12,  #4
948
949        vld1.64         {d3},  [r1], r2                 @ d3 = src[stride * 2]
950        mspel_filter    q11, d21, d0, d1, d2, d3, \filter_a, \filter_b, \filter_c, \filter_d, d28, d29, d30, d31, q13, \filter_shift
951        vst1.64         {d21}, [r0,:64], r2             @ store and increment dst
952
953        vld1.64         {d0},  [r1], r2                 @ d0 = next line
954        mspel_filter    q11, d21, d1, d2, d3, d0, \filter_a, \filter_b, \filter_c, \filter_d, d28, d29, d30, d31, q13, \filter_shift
955        vst1.64         {d21}, [r0,:64], r2             @ store and increment dst
956
957        vld1.64         {d1},  [r1], r2                 @ d1 = next line
958        mspel_filter    q11, d21, d2, d3, d0, d1, \filter_a, \filter_b, \filter_c, \filter_d, d28, d29, d30, d31, q13, \filter_shift
959        vst1.64         {d21}, [r0,:64], r2             @ store and increment dst
960
961        vld1.64         {d2},  [r1], r2                 @ d2 = next line
962        mspel_filter    q11, d21, d3, d0, d1, d2, \filter_a, \filter_b, \filter_c, \filter_d, d28, d29, d30, d31, q13, \filter_shift
963        vst1.64         {d21}, [r0,:64], r2             @ store and increment dst
964
965        bne             1b
966
967        bx              lr
968endfunc
969.endm
970
971@ Use C preprocessor and assembler macros to expand to functions for vertical only filtering.
972#define PUT_VC1_MSPEL_MC_V_ONLY(vmode) \
973        put_vc1_mspel_mc_v_only vmode, MSPEL_MODE_ ## vmode ## _MUL_CONSTANTS, MSPEL_MODE_ ## vmode ## _ADDSHIFT_CONSTANTS
974
975PUT_VC1_MSPEL_MC_V_ONLY(1)
976PUT_VC1_MSPEL_MC_V_ONLY(2)
977PUT_VC1_MSPEL_MC_V_ONLY(3)
978
979#undef PUT_VC1_MSPEL_MC_V_ONLY
980
981function ff_put_pixels8x8_neon, export=1
982        vld1.64         {d0}, [r1], r2
983        vld1.64         {d1}, [r1], r2
984        vld1.64         {d2}, [r1], r2
985        vld1.64         {d3}, [r1], r2
986        vld1.64         {d4}, [r1], r2
987        vld1.64         {d5}, [r1], r2
988        vld1.64         {d6}, [r1], r2
989        vld1.64         {d7}, [r1]
990        vst1.64         {d0}, [r0,:64], r2
991        vst1.64         {d1}, [r0,:64], r2
992        vst1.64         {d2}, [r0,:64], r2
993        vst1.64         {d3}, [r0,:64], r2
994        vst1.64         {d4}, [r0,:64], r2
995        vst1.64         {d5}, [r0,:64], r2
996        vst1.64         {d6}, [r0,:64], r2
997        vst1.64         {d7}, [r0,:64]
998        bx              lr
999endfunc
1000
1001function ff_vc1_inv_trans_8x8_dc_neon, export=1
1002        ldrsh           r2, [r2]              @ int dc = block[0];
1003
1004        vld1.64         {d0},  [r0,:64], r1
1005        vld1.64         {d1},  [r0,:64], r1
1006        vld1.64         {d4},  [r0,:64], r1
1007        vld1.64         {d5},  [r0,:64], r1
1008
1009        add             r2, r2, r2, lsl #1    @ dc = (3 * dc +  1) >> 1;
1010        vld1.64         {d6},  [r0,:64], r1
1011        add             r2, r2, #1
1012        vld1.64         {d7},  [r0,:64], r1
1013        vld1.64         {d16}, [r0,:64], r1
1014        vld1.64         {d17}, [r0,:64], r1
1015        asr             r2, r2, #1
1016
1017        sub             r0,  r0,  r1, lsl #3  @ restore r0 to original value
1018
1019        add             r2, r2, r2, lsl #1    @ dc = (3 * dc + 16) >> 5;
1020        add             r2, r2, #16
1021        asr             r2, r2, #5
1022
1023        vdup.16         q1,  r2               @ dc
1024
1025        vaddw.u8        q9,   q1,  d0
1026        vaddw.u8        q10,  q1,  d1
1027        vaddw.u8        q11,  q1,  d4
1028        vaddw.u8        q12,  q1,  d5
1029        vqmovun.s16     d0,  q9
1030        vqmovun.s16     d1,  q10
1031        vqmovun.s16     d4,  q11
1032        vst1.64         {d0},  [r0,:64], r1
1033        vqmovun.s16     d5,  q12
1034        vst1.64         {d1},  [r0,:64], r1
1035        vaddw.u8        q13,  q1,  d6
1036        vst1.64         {d4},  [r0,:64], r1
1037        vaddw.u8        q14,  q1,  d7
1038        vst1.64         {d5},  [r0,:64], r1
1039        vaddw.u8        q15,  q1,  d16
1040        vaddw.u8        q1,   q1,  d17        @ this destroys q1
1041        vqmovun.s16     d6,  q13
1042        vqmovun.s16     d7,  q14
1043        vqmovun.s16     d16, q15
1044        vqmovun.s16     d17, q1
1045        vst1.64         {d6},  [r0,:64], r1
1046        vst1.64         {d7},  [r0,:64], r1
1047        vst1.64         {d16}, [r0,:64], r1
1048        vst1.64         {d17}, [r0,:64]
1049        bx              lr
1050endfunc
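
@ Scalar model of the DC-only path above (the 8x4, 4x8 and 4x4 variants below
@ follow the same pattern with different scale steps):
@
@   int dc = block[0];
@   dc = (3 * dc +  1) >> 1;                /* first-pass scaling            */
@   dc = (3 * dc + 16) >> 5;                /* second-pass scaling + rounding */
@   for every pixel of the block:
@       dest[x] = clip_uint8(dest[x] + dc);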
1051
1052function ff_vc1_inv_trans_8x4_dc_neon, export=1
1053        ldrsh           r2, [r2]              @ int dc = block[0];
1054
1055        vld1.64         {d0},  [r0,:64], r1
1056        vld1.64         {d1},  [r0,:64], r1
1057        vld1.64         {d4},  [r0,:64], r1
1058        vld1.64         {d5},  [r0,:64], r1
1059
1060        add             r2, r2, r2, lsl #1    @ dc = ( 3 * dc +  1) >> 1;
1061
1062        sub             r0,  r0,  r1, lsl #2  @ restore r0 to original value
1063
1064        add             r2, r2, #1
1065        asr             r2, r2, #1
1066
1067        add             r2, r2, r2, lsl #4    @ dc = (17 * dc + 64) >> 7;
1068        add             r2, r2, #64
1069        asr             r2, r2, #7
1070
1071        vdup.16         q1,  r2               @ dc
1072
1073        vaddw.u8        q3,  q1,  d0
1074        vaddw.u8        q8,  q1,  d1
1075        vaddw.u8        q9,  q1,  d4
1076        vaddw.u8        q10, q1,  d5
1077        vqmovun.s16     d0,  q3
1078        vqmovun.s16     d1,  q8
1079        vqmovun.s16     d4,  q9
1080        vst1.64         {d0},  [r0,:64], r1
1081        vqmovun.s16     d5,  q10
1082        vst1.64         {d1},  [r0,:64], r1
1083        vst1.64         {d4},  [r0,:64], r1
1084        vst1.64         {d5},  [r0,:64]
1085        bx              lr
1086endfunc
1087
1088function ff_vc1_inv_trans_4x8_dc_neon, export=1
1089        ldrsh           r2, [r2]              @ int dc = block[0];
1090
1091        vld1.32         {d0[]},   [r0,:32], r1
1092        vld1.32         {d1[]},   [r0,:32], r1
1093        vld1.32         {d0[1]},  [r0,:32], r1
1094        vld1.32         {d1[1]},  [r0,:32], r1
1095
1096        add             r2, r2, r2, lsl #4    @ dc = (17 * dc +  4) >> 3;
1097        vld1.32         {d4[]},   [r0,:32], r1
1098        add             r2, r2, #4
1099        vld1.32         {d5[]},   [r0,:32], r1
1100        vld1.32         {d4[1]},  [r0,:32], r1
1101        asr             r2, r2, #3
1102        vld1.32         {d5[1]},  [r0,:32], r1
1103
1104        add             r2, r2, r2, lsl #1    @ dc = (12 * dc + 64) >> 7;
1105
1106        sub             r0,  r0,  r1, lsl #3  @ restore r0 to original value
1107
1108        lsl             r2, r2, #2
1109        add             r2, r2, #64
1110        asr             r2, r2, #7
1111
1112        vdup.16         q1,  r2               @ dc
1113
1114        vaddw.u8        q3,  q1,  d0
1115        vaddw.u8        q8,  q1,  d1
1116        vaddw.u8        q9,  q1,  d4
1117        vaddw.u8        q10, q1,  d5
1118        vqmovun.s16     d0,  q3
1119        vst1.32         {d0[0]},  [r0,:32], r1
1120        vqmovun.s16     d1,  q8
1121        vst1.32         {d1[0]},  [r0,:32], r1
1122        vqmovun.s16     d4,  q9
1123        vst1.32         {d0[1]},  [r0,:32], r1
1124        vqmovun.s16     d5,  q10
1125        vst1.32         {d1[1]},  [r0,:32], r1
1126        vst1.32         {d4[0]},  [r0,:32], r1
1127        vst1.32         {d5[0]},  [r0,:32], r1
1128        vst1.32         {d4[1]},  [r0,:32], r1
1129        vst1.32         {d5[1]},  [r0,:32]
1130        bx              lr
1131endfunc
1132
1133function ff_vc1_inv_trans_4x4_dc_neon, export=1
1134        ldrsh           r2, [r2]              @ int dc = block[0];
1135
1136        vld1.32         {d0[]},   [r0,:32], r1
1137        vld1.32         {d1[]},   [r0,:32], r1
1138        vld1.32         {d0[1]},  [r0,:32], r1
1139        vld1.32         {d1[1]},  [r0,:32], r1
1140
1141        add             r2, r2, r2, lsl #4    @ dc = (17 * dc +  4) >> 3;
1142
1143        sub             r0,  r0,  r1, lsl #2  @ restore r0 to original value
1144
1145        add             r2, r2, #4
1146        asr             r2, r2, #3
1147
1148        add             r2, r2, r2, lsl #4    @ dc = (17 * dc + 64) >> 7;
1149        add             r2, r2, #64
1150        asr             r2, r2, #7
1151
1152        vdup.16         q1,  r2               @ dc
1153
1154        vaddw.u8        q2,  q1,  d0
1155        vaddw.u8        q3,  q1,  d1
1156        vqmovun.s16     d0,  q2
1157        vst1.32         {d0[0]},  [r0,:32], r1
1158        vqmovun.s16     d1,  q3
1159        vst1.32         {d1[0]},  [r0,:32], r1
1160        vst1.32         {d0[1]},  [r0,:32], r1
1161        vst1.32         {d1[1]},  [r0,:32]
1162        bx              lr
1163endfunc
1164