/*
 * Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net>
 *
 * This file is part of FFmpeg
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "asm.S"

/**
 * ARM VFP implementation of element-wise multiplication:
 *     dst[i] = src0[i] * src1[i]   for 0 <= i < len
 * Assume that len is a positive number and is a multiple of 8.
 *
 * In:    r0 = dst, r1 = src0, r2 = src1, r3 = len
 * Uses:  r12 (FPSCR copy), s0-s31 (s16-s31 are saved and restored), flags
 *
 * The FPSCR vector length is set to 4 for the body of the function, so
 * each vmul.f32 below multiplies four consecutive registers. On return
 * the length field is cleared again; the other FPSCR bits are written
 * back as they were read.
 * NOTE(review): the orr assumes the length field is zero on entry.
 *
 * The loop handles up to 16 floats per pass using two register sets of
 * 8 floats each: s0-s7 * s8-s15 and s16-s23 * s24-s31. At the top of a
 * pass s0-s15 already hold one block of inputs. After
 * "subs r3, r3, #16":
 *   GE - at least 8 more floats follow the block held in s0-s15; they
 *        are loaded into s16-s31, multiplied and stored.
 *   GT - yet another block follows; it is preloaded into s0-s15 for
 *        the next pass.
 * The it/itttt/ittt instructions are IT blocks covering the conditional
 * instructions that directly follow them.
 */
@ void ff_vector_fmul_vfp(float *dst, const float *src0, const float *src1, int len)
function ff_vector_fmul_vfp, export=1
        vpush           {d8-d15}
        fmrx            r12, fpscr
        orr             r12, r12, #(3 << 16) /* set vector size to 4 */
        fmxr            fpscr, r12

        @ Preload the first block of 8 floats from both inputs and start
        @ multiplying its first half (s8-s11 = s0-s3 * s8-s11).
        vldmia          r1!, {s0-s3}
        vldmia          r2!, {s8-s11}
        vldmia          r1!, {s4-s7}
        vldmia          r2!, {s12-s15}
        vmul.f32        s8,  s0,  s8
1:
        subs            r3,  r3,  #16
        vmul.f32        s12, s4,  s12
        @ GE: fetch the second block of this pass into s16-s31.
        itttt           ge
        vldmiage        r1!, {s16-s19}
        vldmiage        r2!, {s24-s27}
        vldmiage        r1!, {s20-s23}
        vldmiage        r2!, {s28-s31}
        it              ge
        vmulge.f32      s24, s16, s24
        @ The first block is complete: write its 8 products.
        vstmia          r0!, {s8-s11}
        vstmia          r0!, {s12-s15}
        it              ge
        vmulge.f32      s28, s20, s28
        @ GT: preload the first block of the next pass into s0-s15.
        itttt           gt
        vldmiagt        r1!, {s0-s3}
        vldmiagt        r2!, {s8-s11}
        vldmiagt        r1!, {s4-s7}
        vldmiagt        r2!, {s12-s15}
        @ The multiply below is conditional on GE, not GT: when r3 is
        @ exactly 0 it runs on registers that were not reloaded and its
        @ result is never stored.
        ittt            ge
        vmulge.f32      s8,  s0,  s8
        vstmiage        r0!, {s24-s27}
        vstmiage        r0!, {s28-s31}
        bgt             1b

        bic             r12, r12, #(7 << 16) /* set vector size back to 1 */
        fmxr            fpscr, r12
        vpop            {d8-d15}
        bx              lr
endfunc

/**
 * ARM VFP implementation of 'vector_fmul_window_c' function
 * Assume that len is a positive non-zero number
 *
 * For 0 <= k < len, with i = k and j = 2*len - 1 - k, this computes
 *     dst[i] = src0[k] * win[j] - src1[len - 1 - k] * win[i]
 *     dst[j] = src0[k] * win[i] + src1[len - 1 - k] * win[j]
 *
 * Register roles:
 *   DST0 (a1)     ascending write pointer, starts at dst
 *   SRC0 (a2)     ascending read pointer, starts at src0
 *   SRC1 (a3)     descending read pointer, starts at src1 + len
 *   WIN0 (a4)     ascending read pointer, starts at win
 *   LEN  (v1)     fifth argument, fetched from the stack
 *   DST1 (v2)     descending write pointer, starts at dst + 2*len
 *   WIN1 (v3)     descending read pointer, starts at win + 2*len
 *   OLDFPSCR (ip) FPSCR value on entry, restored before returning
 *   lr            scratch for the FPSCR constants (saved by the push,
 *                 popped straight into pc on return)
 *
 * Data read or written through a descending pointer is moved one
 * register at a time (vldmdb/vstmdb with a single register), so that
 * ascending register numbers correspond to descending addresses.
 *
 * The (len & 7) leftover elements are handled first, in scalar mode, in
 * steps of 1, 2 and 4; the rest is done 8 elements per pass with
 * short vectors of length 4.
 */
@ void ff_vector_fmul_window_vfp(float *dst, const float *src0,
@                                const float *src1, const float *win, int len)
function ff_vector_fmul_window_vfp, export=1
DST0    .req    a1
SRC0    .req    a2
SRC1    .req    a3
WIN0    .req    a4
LEN     .req    v1
DST1    .req    v2
WIN1    .req    v3
OLDFPSCR .req   ip

        push    {v1-v3,lr}
        ldr     LEN, [sp, #4*4+0]           @ fifth argument, above the 4 words just pushed
        vpush   {s16-s31}
        fmrx    OLDFPSCR, FPSCR
        add     DST1, DST0, LEN, lsl #3     @ dst + 2*len floats
        add     SRC1, SRC1, LEN, lsl #2     @ src1 + len floats
        add     WIN1, WIN0, LEN, lsl #3     @ win + 2*len floats

        tst     LEN, #7
        beq     4f                          @ common case: len is a multiple of 8

        ldr     lr, =0x03000000             @ RunFast mode, scalar mode
        fmxr    FPSCR, lr

        @ One element if bit 0 of len is set.
        tst     LEN, #1
        beq     1f
        vldmdb  WIN1!, {s0}
        vldmia  SRC0!, {s8}
        vldmia  WIN0!, {s16}
        vmul.f  s24, s0, s8
        vldmdb  SRC1!, {s20}
        vmul.f  s8, s16, s8
        vmls.f  s24, s16, s20
        vmla.f  s8, s0, s20
        vstmia  DST0!, {s24}
        vstmdb  DST1!, {s8}
1:
        @ Two elements if bit 1 of len is set.
        tst     LEN, #2
        beq     2f
        vldmdb  WIN1!, {s0}
        vldmdb  WIN1!, {s1}
        vldmia  SRC0!, {s8-s9}
        vldmia  WIN0!, {s16-s17}
        vmul.f  s24, s0, s8
        vmul.f  s25, s1, s9
        vldmdb  SRC1!, {s20}
        vldmdb  SRC1!, {s21}
        vmul.f  s8, s16, s8
        vmul.f  s9, s17, s9
        vmls.f  s24, s16, s20
        vmls.f  s25, s17, s21
        vmla.f  s8, s0, s20
        vmla.f  s9, s1, s21
        vstmia  DST0!, {s24-s25}
        vstmdb  DST1!, {s8}
        vstmdb  DST1!, {s9}
2:
        @ Four elements if bit 2 of len is set.
        tst     LEN, #4
        beq     3f
        vldmdb  WIN1!, {s0}
        vldmdb  WIN1!, {s1}
        vldmdb  WIN1!, {s2}
        vldmdb  WIN1!, {s3}
        vldmia  SRC0!, {s8-s11}
        vldmia  WIN0!, {s16-s19}
        vmul.f  s24, s0, s8
        vmul.f  s25, s1, s9
        vmul.f  s26, s2, s10
        vmul.f  s27, s3, s11
        vldmdb  SRC1!, {s20}
        vldmdb  SRC1!, {s21}
        vldmdb  SRC1!, {s22}
        vldmdb  SRC1!, {s23}
        vmul.f  s8, s16, s8
        vmul.f  s9, s17, s9
        vmul.f  s10, s18, s10
        vmul.f  s11, s19, s11
        vmls.f  s24, s16, s20
        vmls.f  s25, s17, s21
        vmls.f  s26, s18, s22
        vmls.f  s27, s19, s23
        vmla.f  s8, s0, s20
        vmla.f  s9, s1, s21
        vmla.f  s10, s2, s22
        vmla.f  s11, s3, s23
        vstmia  DST0!, {s24-s27}
        vstmdb  DST1!, {s8}
        vstmdb  DST1!, {s9}
        vstmdb  DST1!, {s10}
        vstmdb  DST1!, {s11}
3:
        bics    LEN, LEN, #7                @ drop the leftover count; done if nothing remains
        beq     7f
4:
        ldr     lr, =0x03030000             @ RunFast mode, short vectors of length 4, stride 1
        fmxr    FPSCR, lr

        @ Main part: 8 elements per pass, as two groups of 4 computed with
        @ short-vector instructions. Indentation marks the group an
        @ instruction works on; each extra level is the next group in time.
        @   even groups: s0-s3 = win[j], s8-s11 = src0 then dst[j],
        @                s16-s19 = win[i], s20-s23 = src1, s24-s27 = dst[i]
        @   odd groups:  s4-s7 = win[j], s12-s15 = src0 then dst[j],
        @                s20-s23 = win[i], s16-s19 = src1, s28-s31 = dst[i]
        vldmdb  WIN1!, {s0}
        vldmdb  WIN1!, {s1}
        vldmdb  WIN1!, {s2}
        vldmdb  WIN1!, {s3}
        vldmia  SRC0!, {s8-s11}
        vldmia  WIN0!, {s16-s19}
        vmul.f  s24, s0, s8                     @ vector * vector
        vldmdb  SRC1!, {s20}
        vldmdb  SRC1!, {s21}
        vldmdb  SRC1!, {s22}
        vldmdb  SRC1!, {s23}
        vmul.f  s8, s16, s8                     @ vector * vector
        vmls.f  s24, s16, s20                   @ vector * vector
            vldmdb  WIN1!, {s4}
            vldmdb  WIN1!, {s5}
            vldmdb  WIN1!, {s6}
            vldmdb  WIN1!, {s7}
            vldmia  SRC0!, {s12-s13}
        vmla.f  s8, s0, s20                     @ vector * vector
            vldmia  SRC0!, {s14-s15}
        subs    LEN, LEN, #8
        beq     6f                          @ only 8 elements: go straight to the drain code
5:          vldmia  WIN0!, {s20-s23}
            vmul.f  s28, s4, s12                @ vector * vector
        vstmia  DST0!, {s24-s25}
            vldmdb  SRC1!, {s16}
            vldmdb  SRC1!, {s17}
            vldmdb  SRC1!, {s18}
            vldmdb  SRC1!, {s19}
            vmul.f  s12, s20, s12               @ vector * vector
        vstmia  DST0!, {s26-s27}
        vstmdb  DST1!, {s8}
        vstmdb  DST1!, {s9}
        vstmdb  DST1!, {s10}
        vstmdb  DST1!, {s11}
            vmls.f  s28, s20, s16               @ vector * vector
                vldmdb  WIN1!, {s0}
                vldmdb  WIN1!, {s1}
                vldmdb  WIN1!, {s2}
                vldmdb  WIN1!, {s3}
                vldmia  SRC0!, {s8-s9}
            vmla.f  s12, s4, s16                @ vector * vector
                vldmia  SRC0!, {s10-s11}
        @ The flags set here are consumed by the bne at the bottom of the
        @ loop; no instruction in between writes the condition flags.
        subs    LEN, LEN, #8
                vldmia  WIN0!, {s16-s19}
                vmul.f  s24, s0, s8             @ vector * vector
            vstmia  DST0!, {s28-s29}
                vldmdb  SRC1!, {s20}
                vldmdb  SRC1!, {s21}
                vldmdb  SRC1!, {s22}
                vldmdb  SRC1!, {s23}
                vmul.f  s8, s16, s8             @ vector * vector
            vstmia  DST0!, {s30-s31}
            vstmdb  DST1!, {s12}
            vstmdb  DST1!, {s13}
            vstmdb  DST1!, {s14}
            vstmdb  DST1!, {s15}
                vmls.f  s24, s16, s20           @ vector * vector
                    vldmdb  WIN1!, {s4}
                    vldmdb  WIN1!, {s5}
                    vldmdb  WIN1!, {s6}
                    vldmdb  WIN1!, {s7}
                    vldmia  SRC0!, {s12-s13}
                vmla.f  s8, s0, s20             @ vector * vector
                    vldmia  SRC0!, {s14-s15}
        bne     5b
        @ Drain: finish the last odd group and store the last two groups.
6:                  vldmia  WIN0!, {s20-s23}
                    vmul.f  s28, s4, s12        @ vector * vector
                vstmia  DST0!, {s24-s25}
                    vldmdb  SRC1!, {s16}
                    vldmdb  SRC1!, {s17}
                    vldmdb  SRC1!, {s18}
                    vldmdb  SRC1!, {s19}
                    vmul.f  s12, s20, s12       @ vector * vector
                vstmia  DST0!, {s26-s27}
                vstmdb  DST1!, {s8}
                vstmdb  DST1!, {s9}
                vstmdb  DST1!, {s10}
                vstmdb  DST1!, {s11}
                    vmls.f  s28, s20, s16       @ vector * vector
                    vmla.f  s12, s4, s16        @ vector * vector
                    vstmia  DST0!, {s28-s31}
                    vstmdb  DST1!, {s12}
                    vstmdb  DST1!, {s13}
                    vstmdb  DST1!, {s14}
                    vstmdb  DST1!, {s15}
7:
        fmxr    FPSCR, OLDFPSCR
        vpop    {s16-s31}
        pop     {v1-v3,pc}

        @ Release the aliases so that later code may define the same names.
        .unreq  DST0
        .unreq  SRC0
        .unreq  SRC1
        .unreq  WIN0
        .unreq  LEN
        .unreq  OLDFPSCR
        .unreq  DST1
        .unreq  WIN1
endfunc

/**
 * ARM VFP optimized implementation of 'vector_fmul_reverse_c' function.
 * Assume that len is a positive number and is a multiple of 8
 *
 *     dst[i] = src0[i] * src1[len - 1 - i]   for 0 <= i < len
 *
 * In:    r0 = dst, r1 = src0, r2 = src1, r3 = len
 * Uses:  s0-s31 (s16-s31 are saved and restored), flags
 *
 * r2 is first moved to src1 + len and then read downwards in blocks of
 * four floats. FPSCR is not modified here; every product is written as
 * its own vmul.f32, with the src1 register number counting down while
 * the src0/dst register number counts up, which performs the reversal.
 *
 * The loop handles up to 16 floats per pass using two register sets of
 * 8 floats each: s0-s7 (src1) * s8-s15 (src0) and s16-s23 (src1) *
 * s24-s31 (src0). At the top of a pass s0-s15 already hold one block.
 * After "subs r3, r3, #16":
 *   GE - at least 8 more floats follow the block held in s0-s15; they
 *        are loaded into s16-s31, multiplied and stored.
 *   GT - yet another block follows; it is preloaded into s0-s15 for
 *        the next pass.
 * The it/itt/ittt/itttt instructions are IT blocks covering the
 * conditional instructions that directly follow them.
 */
@ void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
@                                 const float *src1, int len)
function ff_vector_fmul_reverse_vfp, export=1
        vpush           {d8-d15}
        add             r2,  r2,  r3, lsl #2    @ r2 = src1 + len
        @ Preload the first block; s3 now holds src1[len - 1].
        vldmdb          r2!, {s0-s3}
        vldmia          r1!, {s8-s11}
        vldmdb          r2!, {s4-s7}
        vldmia          r1!, {s12-s15}
        vmul.f32        s8,  s3,  s8
        vmul.f32        s9,  s2,  s9
        vmul.f32        s10, s1,  s10
        vmul.f32        s11, s0,  s11
1:
        subs            r3,  r3,  #16
        it              ge
        vldmdbge        r2!, {s16-s19}
        vmul.f32        s12, s7,  s12
        it              ge
        vldmiage        r1!, {s24-s27}
        vmul.f32        s13, s6,  s13
        it              ge
        vldmdbge        r2!, {s20-s23}
        vmul.f32        s14, s5,  s14
        it              ge
        vldmiage        r1!, {s28-s31}
        vmul.f32        s15, s4,  s15
        it              ge
        vmulge.f32      s24, s19, s24
        it              gt
        vldmdbgt        r2!, {s0-s3}
        it              ge
        vmulge.f32      s25, s18, s25
        @ Products s8-s13 are stored here, before s8-s11 are reloaded
        @ below; s14-s15 follow in a second store.
        vstmia          r0!, {s8-s13}
        it              ge
        vmulge.f32      s26, s17, s26
        it              gt
        vldmiagt        r1!, {s8-s11}
        itt             ge
        vmulge.f32      s27, s16, s27
        vmulge.f32      s28, s23, s28
        it              gt
        vldmdbgt        r2!, {s4-s7}
        it              ge
        vmulge.f32      s29, s22, s29
        vstmia          r0!, {s14-s15}
        ittt            ge
        vmulge.f32      s30, s21, s30
        vmulge.f32      s31, s20, s31
        vmulge.f32      s8,  s3,  s8
        it              gt
        vldmiagt        r1!, {s12-s15}
        @ The s8-s11 multiplies are conditional on GE, not GT: when r3 is
        @ exactly 0 they run on registers that were not reloaded and their
        @ results are never stored.
        itttt           ge
        vmulge.f32      s9,  s2,  s9
        vmulge.f32      s10, s1,  s10
        vstmiage        r0!, {s24-s27}
        vmulge.f32      s11, s0,  s11
        it              ge
        vstmiage        r0!, {s28-s31}
        bgt             1b

        vpop            {d8-d15}
        bx              lr
endfunc

/**
 * ARM VFP implementation of 'butterflies_float_c' function
 * Assume that len is a positive non-zero number
 *
 * For 0 <= i < len, in place:
 *     sum  = v1[i] + v2[i]
 *     diff = v1[i] - v2[i]
 *     v1[i] = sum
 *     v2[i] = diff
 *
 * Register roles:
 *   BASE1 (a1)    pointer into v1, advanced by the loads
 *   BASE2 (a2)    pointer into v2, advanced by the loads
 *   LEN   (a3)    element count
 *   OLDFPSCR (a4) FPSCR value on entry, restored before returning
 *   ip            scratch for the FPSCR constants
 *
 * Loads post-increment BASE1/BASE2; results are written back with vstr
 * at a negative offset from the already advanced pointers, so each
 * result lands on the element it was computed from.
 *
 * The (len & 7) leftover elements are handled first, in scalar mode, in
 * steps of 1, 2 and 4; the rest is done 8 elements per pass with
 * short vectors of length 4.
 */
@ void ff_butterflies_float_vfp(float *restrict v1, float *restrict v2, int len)
function ff_butterflies_float_vfp, export=1
BASE1   .req    a1
BASE2   .req    a2
LEN     .req    a3
OLDFPSCR .req   a4

        vpush   {s16-s31}
        fmrx    OLDFPSCR, FPSCR

        tst     LEN, #7
        beq     4f                          @ common case: len is a multiple of 8

        ldr     ip, =0x03000000             @ RunFast mode, scalar mode
        fmxr    FPSCR, ip

        @ One element if bit 0 of len is set.
        tst     LEN, #1
        beq     1f
        vldmia  BASE1!, {s0}
        vldmia  BASE2!, {s8}
        vadd.f  s16, s0, s8
        vsub.f  s24, s0, s8
        vstr    s16, [BASE1, #0-4*1]
        vstr    s24, [BASE2, #0-4*1]
1:
        @ Two elements if bit 1 of len is set.
        tst     LEN, #2
        beq     2f
        vldmia  BASE1!, {s0-s1}
        vldmia  BASE2!, {s8-s9}
        vadd.f  s16, s0, s8
        vadd.f  s17, s1, s9
        vsub.f  s24, s0, s8
        vsub.f  s25, s1, s9
        vstr    d8, [BASE1, #0-8*1]    @ s16,s17
        vstr    d12, [BASE2, #0-8*1]   @ s24,s25
2:
        @ Four elements if bit 2 of len is set.
        tst     LEN, #4
        beq     3f
        vldmia  BASE1!, {s0-s1}
        vldmia  BASE2!, {s8-s9}
        vldmia  BASE1!, {s2-s3}
        vldmia  BASE2!, {s10-s11}
        vadd.f  s16, s0, s8
        vadd.f  s17, s1, s9
        vsub.f  s24, s0, s8
        vsub.f  s25, s1, s9
        vadd.f  s18, s2, s10
        vadd.f  s19, s3, s11
        vsub.f  s26, s2, s10
        vsub.f  s27, s3, s11
        vstr    d8, [BASE1, #0-16*1]    @ s16,s17
        vstr    d12, [BASE2, #0-16*1]   @ s24,s25
        vstr    d9, [BASE1, #8-16*1]    @ s18,s19
        vstr    d13, [BASE2, #8-16*1]   @ s26,s27
3:
        bics    LEN, LEN, #7                @ drop the leftover count; done if nothing remains
        beq     7f
4:
        ldr     ip, =0x03030000             @ RunFast mode, short vectors of length 4, stride 1
        fmxr    FPSCR, ip

        @ Main part: 8 elements per pass, as two groups of 4 computed with
        @ short-vector instructions. Indentation marks the group an
        @ instruction works on; each extra level is the next group in time.
        @   even groups: s0-s3 = v1, s8-s11 = v2,
        @                s16-s19 = sum, s24-s27 = difference
        @   odd groups:  s4-s7 = v1, s12-s15 = v2,
        @                s20-s23 = sum, s28-s31 = difference
        vldmia  BASE1!, {s0-s1}
        vldmia  BASE2!, {s8-s9}
        vldmia  BASE1!, {s2-s3}
        vldmia  BASE2!, {s10-s11}
        vadd.f  s16, s0, s8
            vldmia  BASE1!, {s4-s5}
            vldmia  BASE2!, {s12-s13}
            vldmia  BASE1!, {s6-s7}
            vldmia  BASE2!, {s14-s15}
        vsub.f  s24, s0, s8
            vadd.f  s20, s4, s12
        subs    LEN, LEN, #8
        beq     6f                          @ only 8 elements: go straight to the drain code
        @ Inside the loop every store happens when the pointers are three
        @ groups (3 * 16 bytes) past the group being written, hence the
        @ -16*3 offsets.
5:              vldmia  BASE1!, {s0-s3}
                vldmia  BASE2!, {s8-s11}
            vsub.f  s28, s4, s12
        vstr    d8, [BASE1, #0-16*3]    @ s16,s17
        vstr    d9, [BASE1, #8-16*3]    @ s18,s19
        vstr    d12, [BASE2, #0-16*3]   @ s24,s25
        vstr    d13, [BASE2, #8-16*3]   @ s26,s27
                vadd.f  s16, s0, s8
                    vldmia  BASE1!, {s4-s7}
                    vldmia  BASE2!, {s12-s15}
                vsub.f  s24, s0, s8
            vstr    d10, [BASE1, #0-16*3]   @ s20,s21
            vstr    d11, [BASE1, #8-16*3]   @ s22,s23
            vstr    d14, [BASE2, #0-16*3]   @ s28,s29
            vstr    d15, [BASE2, #8-16*3]   @ s30,s31
                    vadd.f  s20, s4, s12
        subs    LEN, LEN, #8
        bne     5b
        @ Drain: the pointers are two groups past the last even group and
        @ one group past the last odd group.
6:                   vsub.f  s28, s4, s12
                vstr    d8, [BASE1, #0-16*2]    @ s16,s17
                vstr    d9, [BASE1, #8-16*2]    @ s18,s19
                vstr    d12, [BASE2, #0-16*2]   @ s24,s25
                vstr    d13, [BASE2, #8-16*2]   @ s26,s27
                    vstr    d10, [BASE1, #0-16*1]   @ s20,s21
                    vstr    d11, [BASE1, #8-16*1]   @ s22,s23
                    vstr    d14, [BASE2, #0-16*1]   @ s28,s29
                    vstr    d15, [BASE2, #8-16*1]   @ s30,s31
7:
        fmxr    FPSCR, OLDFPSCR
        vpop    {s16-s31}
        bx      lr

        @ Release the aliases so that later code may define the same names.
        .unreq  BASE1
        .unreq  BASE2
        .unreq  LEN
        .unreq  OLDFPSCR
endfunc
