;******************************************************************************
;* x86 optimized Format Conversion Utils
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2012 Justin Ruggles <justin.ruggles@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"
%include "util.asm"

SECTION_RODATA 32

pf_s32_inv_scale: times 8 dd 0x30000000
pf_s32_scale:     times 8 dd 0x4f000000
pf_s32_clip:      times 8 dd 0x4effffff
pf_s16_inv_scale: times 4 dd 0x38000000
pf_s16_scale:     times 4 dd 0x47000000
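; The scale factors above are IEEE-754 single-precision bit patterns:
;   0x30000000 = 2^-31, scales s32 samples into [-1.0,1.0)
;   0x4f000000 = 2^31, scales [-1.0,1.0) floats to the s32 range
;   0x4effffff = 2147483520.0, the largest float below 2^31, used to
;                clamp positive overflow before cvtps2dq
;   0x38000000 = 2^-15, scales s16 samples into [-1.0,1.0)
;   0x47000000 = 2^15, scales [-1.0,1.0) floats to the s16 range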
pb_shuf_unpack_even:      db -1, -1,  0,  1, -1, -1,  2,  3, -1, -1,  8,  9, -1, -1, 10, 11
pb_shuf_unpack_odd:       db -1, -1,  4,  5, -1, -1,  6,  7, -1, -1, 12, 13, -1, -1, 14, 15
pb_interleave_words: SHUFFLE_MASK_W  0,  4,  1,  5,  2,  6,  3,  7
pb_deinterleave_words: SHUFFLE_MASK_W  0,  2,  4,  6,  1,  3,  5,  7
pw_zero_even:     times 4 dw 0x0000, 0xffff

SECTION .text

;------------------------------------------------------------------------------
; void ff_conv_s16_to_s32(int32_t *dst, const int16_t *src, int len);
;------------------------------------------------------------------------------
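; Scalar equivalent, as a rough C sketch:
;     for (int i = 0; i < len; i++)
;         dst[i] = src[i] * (1 << 16);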

INIT_XMM sse2
cglobal conv_s16_to_s32, 3,3,3, dst, src, len
    lea      lenq, [2*lend]
    lea      dstq, [dstq+2*lenq]
    add      srcq, lenq
    neg      lenq
.loop:
    mova       m2, [srcq+lenq]
    pxor       m0, m0
    pxor       m1, m1
    punpcklwd  m0, m2
    punpckhwd  m1, m2
    mova  [dstq+2*lenq       ], m0
    mova  [dstq+2*lenq+mmsize], m1
    add      lenq, mmsize
    jl .loop
    REP_RET

;------------------------------------------------------------------------------
; void ff_conv_s16_to_flt(float *dst, const int16_t *src, int len);
;------------------------------------------------------------------------------
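; Rough scalar C sketch, scaling s16 into [-1.0,1.0):
;     for (int i = 0; i < len; i++)
;         dst[i] = src[i] * (1.0f / 32768.0f);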

%macro CONV_S16_TO_FLT 0
cglobal conv_s16_to_flt, 3,3,3, dst, src, len
    lea      lenq, [2*lend]
    add      srcq, lenq
    lea      dstq, [dstq + 2*lenq]
    neg      lenq
    mova       m2, [pf_s16_inv_scale]
    ALIGN 16
.loop:
    mova       m0, [srcq+lenq]
    S16_TO_S32_SX 0, 1
    cvtdq2ps   m0, m0
    cvtdq2ps   m1, m1
    mulps      m0, m2
    mulps      m1, m2
    mova  [dstq+2*lenq       ], m0
    mova  [dstq+2*lenq+mmsize], m1
    add      lenq, mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16_TO_FLT
INIT_XMM sse4
CONV_S16_TO_FLT

;------------------------------------------------------------------------------
; void ff_conv_s32_to_s16(int16_t *dst, const int32_t *src, int len);
;------------------------------------------------------------------------------
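; Rough scalar C sketch, keeping the top 16 bits of each sample:
;     for (int i = 0; i < len; i++)
;         dst[i] = src[i] >> 16;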

%macro CONV_S32_TO_S16 0
cglobal conv_s32_to_s16, 3,3,4, dst, src, len
    lea     lenq, [2*lend]
    lea     srcq, [srcq+2*lenq]
    add     dstq, lenq
    neg     lenq
.loop:
    mova      m0, [srcq+2*lenq         ]
    mova      m1, [srcq+2*lenq+  mmsize]
    mova      m2, [srcq+2*lenq+2*mmsize]
    mova      m3, [srcq+2*lenq+3*mmsize]
    psrad     m0, 16
    psrad     m1, 16
    psrad     m2, 16
    psrad     m3, 16
    packssdw  m0, m1
    packssdw  m2, m3
    mova  [dstq+lenq       ], m0
    mova  [dstq+lenq+mmsize], m2
    add     lenq, mmsize*2
    jl .loop
%if mmsize == 8
    emms
    RET
%else
    REP_RET
%endif
%endmacro

INIT_MMX mmx
CONV_S32_TO_S16
INIT_XMM sse2
CONV_S32_TO_S16

;------------------------------------------------------------------------------
; void ff_conv_s32_to_flt(float *dst, const int32_t *src, int len);
;------------------------------------------------------------------------------
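; Rough scalar C sketch, scaling s32 into [-1.0,1.0):
;     for (int i = 0; i < len; i++)
;         dst[i] = src[i] * (1.0f / 2147483648.0f);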

%macro CONV_S32_TO_FLT 0
cglobal conv_s32_to_flt, 3,3,3, dst, src, len
    lea     lenq, [4*lend]
    add     srcq, lenq
    add     dstq, lenq
    neg     lenq
    mova      m0, [pf_s32_inv_scale]
    ALIGN 16
.loop:
    cvtdq2ps  m1, [srcq+lenq       ]
    cvtdq2ps  m2, [srcq+lenq+mmsize]
    mulps     m1, m1, m0
    mulps     m2, m2, m0
    mova  [dstq+lenq       ], m1
    mova  [dstq+lenq+mmsize], m2
    add     lenq, mmsize*2
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S32_TO_FLT
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
CONV_S32_TO_FLT
%endif

;------------------------------------------------------------------------------
; void ff_conv_flt_to_s16(int16_t *dst, const float *src, int len);
;------------------------------------------------------------------------------
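; Rough scalar C sketch; in the vector code the s16 saturation comes
; from packssdw:
;     for (int i = 0; i < len; i++)
;         dst[i] = av_clip_int16(lrintf(src[i] * 32768.0f));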

INIT_XMM sse2
cglobal conv_flt_to_s16, 3,3,5, dst, src, len
    lea     lenq, [2*lend]
    lea     srcq, [srcq+2*lenq]
    add     dstq, lenq
    neg     lenq
    mova      m4, [pf_s16_scale]
.loop:
    mova      m0, [srcq+2*lenq         ]
    mova      m1, [srcq+2*lenq+1*mmsize]
    mova      m2, [srcq+2*lenq+2*mmsize]
    mova      m3, [srcq+2*lenq+3*mmsize]
    mulps     m0, m4
    mulps     m1, m4
    mulps     m2, m4
    mulps     m3, m4
    cvtps2dq  m0, m0
    cvtps2dq  m1, m1
    cvtps2dq  m2, m2
    cvtps2dq  m3, m3
    packssdw  m0, m1
    packssdw  m2, m3
    mova  [dstq+lenq       ], m0
    mova  [dstq+lenq+mmsize], m2
    add     lenq, mmsize*2
    jl .loop
    REP_RET

;------------------------------------------------------------------------------
; void ff_conv_flt_to_s32(int32_t *dst, const float *src, int len);
;------------------------------------------------------------------------------
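; Rough scalar C sketch; in the vector code the clamp against
; pf_s32_clip before cvtps2dq stands in for the clip:
;     for (int i = 0; i < len; i++)
;         dst[i] = av_clipl_int32(llrintf(src[i] * 2147483648.0f));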

%macro CONV_FLT_TO_S32 0
cglobal conv_flt_to_s32, 3,3,6, dst, src, len
    lea     lenq, [lend*4]
    add     srcq, lenq
    add     dstq, lenq
    neg     lenq
    mova      m4, [pf_s32_scale]
    mova      m5, [pf_s32_clip]
.loop:
    mulps     m0, m4, [srcq+lenq         ]
    mulps     m1, m4, [srcq+lenq+1*mmsize]
    mulps     m2, m4, [srcq+lenq+2*mmsize]
    mulps     m3, m4, [srcq+lenq+3*mmsize]
    minps     m0, m0, m5
    minps     m1, m1, m5
    minps     m2, m2, m5
    minps     m3, m3, m5
    cvtps2dq  m0, m0
    cvtps2dq  m1, m1
    cvtps2dq  m2, m2
    cvtps2dq  m3, m3
    mova  [dstq+lenq         ], m0
    mova  [dstq+lenq+1*mmsize], m1
    mova  [dstq+lenq+2*mmsize], m2
    mova  [dstq+lenq+3*mmsize], m3
    add     lenq, mmsize*4
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_FLT_TO_S32
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
CONV_FLT_TO_S32
%endif

;------------------------------------------------------------------------------
; void ff_conv_s16p_to_s16_2ch(int16_t *dst, int16_t *const *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
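; Planar to interleaved, as a rough scalar C sketch (2 channels here):
;     for (int i = 0; i < len; i++)
;         for (int ch = 0; ch < 2; ch++)
;             dst[2 * i + ch] = src[ch][i];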

%macro CONV_S16P_TO_S16_2CH 0
cglobal conv_s16p_to_s16_2ch, 3,4,5, dst, src0, len, src1
    mov       src1q, [src0q+gprsize]
    mov       src0q, [src0q        ]
    lea        lenq, [2*lend]
    add       src0q, lenq
    add       src1q, lenq
    lea        dstq, [dstq+2*lenq]
    neg        lenq
.loop:
    mova         m0, [src0q+lenq       ]
    mova         m1, [src1q+lenq       ]
    mova         m2, [src0q+lenq+mmsize]
    mova         m3, [src1q+lenq+mmsize]
    SBUTTERFLY2  wd, 0, 1, 4
    SBUTTERFLY2  wd, 2, 3, 4
    mova  [dstq+2*lenq+0*mmsize], m0
    mova  [dstq+2*lenq+1*mmsize], m1
    mova  [dstq+2*lenq+2*mmsize], m2
    mova  [dstq+2*lenq+3*mmsize], m3
    add        lenq, 2*mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16P_TO_S16_2CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16P_TO_S16_2CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_s16p_to_s16_6ch(int16_t *dst, int16_t *const *src, int len,
;                              int channels);
;------------------------------------------------------------------------------

;------------------------------------------------------------------------------
; NOTE: In the 6-channel functions, len could be used as an index on x86-64
;       instead of just a counter, which would avoid incrementing the
;       pointers, but the extra complexity and amount of code are not worth
;       the small gain. On x86-32 there are not enough registers to use len
;       as an index without keeping two of the pointers on the stack and
;       loading them in each iteration.
;------------------------------------------------------------------------------
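; The loops below keep only one live source pointer: the other channel
; pointers are pre-converted to byte offsets, roughly (C sketch):
;     ptrdiff_t off1 = (char *)src[1] - (char *)src[0];
; so that [src0q+src1q] addresses channel 1 while only src0q advances.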

%macro CONV_S16P_TO_S16_6CH 0
%if ARCH_X86_64
cglobal conv_s16p_to_s16_6ch, 3,8,7, dst, src0, len, src1, src2, src3, src4, src5
%else
cglobal conv_s16p_to_s16_6ch, 2,7,7, dst, src0, src1, src2, src3, src4, src5
%define lend dword r2m
%endif
    mov      src1q, [src0q+1*gprsize]
    mov      src2q, [src0q+2*gprsize]
    mov      src3q, [src0q+3*gprsize]
    mov      src4q, [src0q+4*gprsize]
    mov      src5q, [src0q+5*gprsize]
    mov      src0q, [src0q]
    sub      src1q, src0q
    sub      src2q, src0q
    sub      src3q, src0q
    sub      src4q, src0q
    sub      src5q, src0q
.loop:
%if cpuflag(sse2slow)
    movq        m0, [src0q      ]   ; m0 =  0,  6, 12, 18,  x,  x,  x,  x
    movq        m1, [src0q+src1q]   ; m1 =  1,  7, 13, 19,  x,  x,  x,  x
    movq        m2, [src0q+src2q]   ; m2 =  2,  8, 14, 20,  x,  x,  x,  x
    movq        m3, [src0q+src3q]   ; m3 =  3,  9, 15, 21,  x,  x,  x,  x
    movq        m4, [src0q+src4q]   ; m4 =  4, 10, 16, 22,  x,  x,  x,  x
    movq        m5, [src0q+src5q]   ; m5 =  5, 11, 17, 23,  x,  x,  x,  x
                                    ; unpack words:
    punpcklwd   m0, m1              ; m0 =  0,  1,  6,  7, 12, 13, 18, 19
    punpcklwd   m2, m3              ; m2 =  2,  3,  8,  9, 14, 15, 20, 21
    punpcklwd   m4, m5              ; m4 =  4,  5, 10, 11, 16, 17, 22, 23
                                    ; blend dwords
    shufps      m1, m0, m2, q2020   ; m1 =  0,  1, 12, 13,  2,  3, 14, 15
    shufps      m0, m4, q2031       ; m0 =  6,  7, 18, 19,  4,  5, 16, 17
    shufps      m2, m4, q3131       ; m2 =  8,  9, 20, 21, 10, 11, 22, 23
                                    ; shuffle dwords
    pshufd      m0, m0, q1302       ; m0 =  4,  5,  6,  7, 16, 17, 18, 19
    pshufd      m1, m1, q3120       ; m1 =  0,  1,  2,  3, 12, 13, 14, 15
    pshufd      m2, m2, q3120       ; m2 =  8,  9, 10, 11, 20, 21, 22, 23
    movq   [dstq+0*mmsize/2], m1
    movq   [dstq+1*mmsize/2], m0
    movq   [dstq+2*mmsize/2], m2
    movhps [dstq+3*mmsize/2], m1
    movhps [dstq+4*mmsize/2], m0
    movhps [dstq+5*mmsize/2], m2
    add      src0q, mmsize/2
    add       dstq, mmsize*3
    sub       lend, mmsize/4
%else
    mova        m0, [src0q      ]   ; m0 =  0,  6, 12, 18, 24, 30, 36, 42
    mova        m1, [src0q+src1q]   ; m1 =  1,  7, 13, 19, 25, 31, 37, 43
    mova        m2, [src0q+src2q]   ; m2 =  2,  8, 14, 20, 26, 32, 38, 44
    mova        m3, [src0q+src3q]   ; m3 =  3,  9, 15, 21, 27, 33, 39, 45
    mova        m4, [src0q+src4q]   ; m4 =  4, 10, 16, 22, 28, 34, 40, 46
    mova        m5, [src0q+src5q]   ; m5 =  5, 11, 17, 23, 29, 35, 41, 47
                                    ; unpack words:
    SBUTTERFLY2 wd, 0, 1, 6         ; m0 =  0,  1,  6,  7, 12, 13, 18, 19
                                    ; m1 = 24, 25, 30, 31, 36, 37, 42, 43
    SBUTTERFLY2 wd, 2, 3, 6         ; m2 =  2,  3,  8,  9, 14, 15, 20, 21
                                    ; m3 = 26, 27, 32, 33, 38, 39, 44, 45
    SBUTTERFLY2 wd, 4, 5, 6         ; m4 =  4,  5, 10, 11, 16, 17, 22, 23
                                    ; m5 = 28, 29, 34, 35, 40, 41, 46, 47
                                    ; blend dwords
    shufps      m6, m0, m2, q2020   ; m6 =  0,  1, 12, 13,  2,  3, 14, 15
    shufps      m0, m4, q2031       ; m0 =  6,  7, 18, 19,  4,  5, 16, 17
    shufps      m2, m4, q3131       ; m2 =  8,  9, 20, 21, 10, 11, 22, 23
    SWAP 4,6                        ; m4 =  0,  1, 12, 13,  2,  3, 14, 15
    shufps      m6, m1, m3, q2020   ; m6 = 24, 25, 36, 37, 26, 27, 38, 39
    shufps      m1, m5, q2031       ; m1 = 30, 31, 42, 43, 28, 29, 40, 41
    shufps      m3, m5, q3131       ; m3 = 32, 33, 44, 45, 34, 35, 46, 47
    SWAP 5,6                        ; m5 = 24, 25, 36, 37, 26, 27, 38, 39
                                    ; shuffle dwords
    pshufd      m0, m0, q1302       ; m0 =  4,  5,  6,  7, 16, 17, 18, 19
    pshufd      m2, m2, q3120       ; m2 =  8,  9, 10, 11, 20, 21, 22, 23
    pshufd      m4, m4, q3120       ; m4 =  0,  1,  2,  3, 12, 13, 14, 15
    pshufd      m1, m1, q1302       ; m1 = 28, 29, 30, 31, 40, 41, 42, 43
    pshufd      m3, m3, q3120       ; m3 = 32, 33, 34, 35, 44, 45, 46, 47
    pshufd      m5, m5, q3120       ; m5 = 24, 25, 26, 27, 36, 37, 38, 39
                                    ; shuffle qwords
    punpcklqdq  m6, m4, m0          ; m6 =  0,  1,  2,  3,  4,  5,  6,  7
    punpckhqdq  m0, m2              ; m0 = 16, 17, 18, 19, 20, 21, 22, 23
    shufps      m2, m4, q3210       ; m2 =  8,  9, 10, 11, 12, 13, 14, 15
    SWAP 4,6                        ; m4 =  0,  1,  2,  3,  4,  5,  6,  7
    punpcklqdq  m6, m5, m1          ; m6 = 24, 25, 26, 27, 28, 29, 30, 31
    punpckhqdq  m1, m3              ; m1 = 40, 41, 42, 43, 44, 45, 46, 47
    shufps      m3, m5, q3210       ; m3 = 32, 33, 34, 35, 36, 37, 38, 39
    SWAP 5,6                        ; m5 = 24, 25, 26, 27, 28, 29, 30, 31
    mova   [dstq+0*mmsize], m4
    mova   [dstq+1*mmsize], m2
    mova   [dstq+2*mmsize], m0
    mova   [dstq+3*mmsize], m5
    mova   [dstq+4*mmsize], m3
    mova   [dstq+5*mmsize], m1
    add      src0q, mmsize
    add       dstq, mmsize*6
    sub       lend, mmsize/2
%endif
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16P_TO_S16_6CH
INIT_XMM sse2slow
CONV_S16P_TO_S16_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16P_TO_S16_6CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_s16p_to_flt_2ch(float *dst, int16_t *const *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
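; Rough scalar C sketch, interleaving two s16 planes as floats:
;     for (int i = 0; i < len; i++)
;         for (int ch = 0; ch < 2; ch++)
;             dst[2 * i + ch] = src[ch][i] * (1.0f / 32768.0f);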

%macro CONV_S16P_TO_FLT_2CH 0
cglobal conv_s16p_to_flt_2ch, 3,4,6, dst, src0, len, src1
    lea       lenq, [2*lend]
    mov      src1q, [src0q+gprsize]
    mov      src0q, [src0q        ]
    lea       dstq, [dstq+4*lenq]
    add      src0q, lenq
    add      src1q, lenq
    neg       lenq
    mova        m5, [pf_s32_inv_scale]
.loop:
    mova        m2, [src0q+lenq]    ; m2 =  0,  2,  4,  6,  8, 10, 12, 14
    mova        m4, [src1q+lenq]    ; m4 =  1,  3,  5,  7,  9, 11, 13, 15
    SBUTTERFLY2 wd, 2, 4, 3         ; m2 =  0,  1,  2,  3,  4,  5,  6,  7
                                    ; m4 =  8,  9, 10, 11, 12, 13, 14, 15
    pxor        m3, m3
    punpcklwd   m0, m3, m2          ; m0 =      0,      1,      2,      3
    punpckhwd   m1, m3, m2          ; m1 =      4,      5,      6,      7
    punpcklwd   m2, m3, m4          ; m2 =      8,      9,     10,     11
    punpckhwd   m3, m4              ; m3 =     12,     13,     14,     15
    cvtdq2ps    m0, m0
    cvtdq2ps    m1, m1
    cvtdq2ps    m2, m2
    cvtdq2ps    m3, m3
    mulps       m0, m5
    mulps       m1, m5
    mulps       m2, m5
    mulps       m3, m5
    mova  [dstq+4*lenq         ], m0
    mova  [dstq+4*lenq+  mmsize], m1
    mova  [dstq+4*lenq+2*mmsize], m2
    mova  [dstq+4*lenq+3*mmsize], m3
    add       lenq, mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16P_TO_FLT_2CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16P_TO_FLT_2CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_s16p_to_flt_6ch(float *dst, int16_t *const *src, int len,
;                              int channels);
;------------------------------------------------------------------------------

%macro CONV_S16P_TO_FLT_6CH 0
%if ARCH_X86_64
cglobal conv_s16p_to_flt_6ch, 3,8,8, dst, src, len, src1, src2, src3, src4, src5
%else
cglobal conv_s16p_to_flt_6ch, 2,7,8, dst, src, src1, src2, src3, src4, src5
%define lend dword r2m
%endif
    mov     src1q, [srcq+1*gprsize]
    mov     src2q, [srcq+2*gprsize]
    mov     src3q, [srcq+3*gprsize]
    mov     src4q, [srcq+4*gprsize]
    mov     src5q, [srcq+5*gprsize]
    mov      srcq, [srcq]
    sub     src1q, srcq
    sub     src2q, srcq
    sub     src3q, srcq
    sub     src4q, srcq
    sub     src5q, srcq
    mova       m7, [pf_s32_inv_scale]
%if cpuflag(ssse3)
    %define unpack_even m6
    mova       m6, [pb_shuf_unpack_even]
%if ARCH_X86_64
    %define unpack_odd m8
    mova       m8, [pb_shuf_unpack_odd]
%else
    %define unpack_odd [pb_shuf_unpack_odd]
%endif
%endif
.loop:
    movq       m0, [srcq      ]  ; m0 =  0,  6, 12, 18,  x,  x,  x,  x
    movq       m1, [srcq+src1q]  ; m1 =  1,  7, 13, 19,  x,  x,  x,  x
    movq       m2, [srcq+src2q]  ; m2 =  2,  8, 14, 20,  x,  x,  x,  x
    movq       m3, [srcq+src3q]  ; m3 =  3,  9, 15, 21,  x,  x,  x,  x
    movq       m4, [srcq+src4q]  ; m4 =  4, 10, 16, 22,  x,  x,  x,  x
    movq       m5, [srcq+src5q]  ; m5 =  5, 11, 17, 23,  x,  x,  x,  x
                                 ; unpack words:
    punpcklwd  m0, m1            ; m0 =  0,  1,  6,  7, 12, 13, 18, 19
    punpcklwd  m2, m3            ; m2 =  2,  3,  8,  9, 14, 15, 20, 21
    punpcklwd  m4, m5            ; m4 =  4,  5, 10, 11, 16, 17, 22, 23
                                 ; blend dwords
    shufps     m1, m4, m0, q3120 ; m1 =  4,  5, 16, 17,  6,  7, 18, 19
    shufps         m0, m2, q2020 ; m0 =  0,  1, 12, 13,  2,  3, 14, 15
    shufps         m2, m4, q3131 ; m2 =  8,  9, 20, 21, 10, 11, 22, 23
%if cpuflag(ssse3)
    pshufb     m3, m0, unpack_odd   ; m3 =  12,     13,     14,     15
    pshufb         m0, unpack_even  ; m0 =   0,      1,      2,      3
    pshufb     m4, m1, unpack_odd   ; m4 =  16,     17,     18,     19
    pshufb         m1, unpack_even  ; m1 =   4,      5,      6,      7
    pshufb     m5, m2, unpack_odd   ; m5 =  20,     21,     22,     23
    pshufb         m2, unpack_even  ; m2 =   8,      9,     10,     11
%else
                                 ; shuffle dwords
    pshufd     m0, m0, q3120     ; m0 =  0,  1,  2,  3, 12, 13, 14, 15
    pshufd     m1, m1, q3120     ; m1 =  4,  5,  6,  7, 16, 17, 18, 19
    pshufd     m2, m2, q3120     ; m2 =  8,  9, 10, 11, 20, 21, 22, 23
    pxor       m6, m6            ; convert s16 in m0-m2 to s32 in m0-m5
    punpcklwd  m3, m6, m0        ; m3 =      0,      1,      2,      3
    punpckhwd  m4, m6, m0        ; m4 =     12,     13,     14,     15
    punpcklwd  m0, m6, m1        ; m0 =      4,      5,      6,      7
    punpckhwd  m5, m6, m1        ; m5 =     16,     17,     18,     19
    punpcklwd  m1, m6, m2        ; m1 =      8,      9,     10,     11
    punpckhwd      m6, m2        ; m6 =     20,     21,     22,     23
    SWAP 6,2,1,0,3,4,5           ; swap registers 3,0,1,4,5,6 to 0,1,2,3,4,5
%endif
    cvtdq2ps   m0, m0            ; convert s32 to float
    cvtdq2ps   m1, m1
    cvtdq2ps   m2, m2
    cvtdq2ps   m3, m3
    cvtdq2ps   m4, m4
    cvtdq2ps   m5, m5
    mulps      m0, m7            ; scale float from s32 range to [-1.0,1.0]
    mulps      m1, m7
    mulps      m2, m7
    mulps      m3, m7
    mulps      m4, m7
    mulps      m5, m7
    mova  [dstq         ], m0
    mova  [dstq+  mmsize], m1
    mova  [dstq+2*mmsize], m2
    mova  [dstq+3*mmsize], m3
    mova  [dstq+4*mmsize], m4
    mova  [dstq+5*mmsize], m5
    add      srcq, mmsize/2
    add      dstq, mmsize*6
    sub      lend, mmsize/4
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16P_TO_FLT_6CH
INIT_XMM ssse3
CONV_S16P_TO_FLT_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16P_TO_FLT_6CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_fltp_to_s16_2ch(int16_t *dst, float *const *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
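; Rough scalar C sketch (saturation again comes from packssdw):
;     for (int i = 0; i < len; i++)
;         for (int ch = 0; ch < 2; ch++)
;             dst[2 * i + ch] = av_clip_int16(lrintf(src[ch][i] * 32768.0f));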

%macro CONV_FLTP_TO_S16_2CH 0
cglobal conv_fltp_to_s16_2ch, 3,4,3, dst, src0, len, src1
    lea      lenq, [4*lend]
    mov     src1q, [src0q+gprsize]
    mov     src0q, [src0q        ]
    add      dstq, lenq
    add     src0q, lenq
    add     src1q, lenq
    neg      lenq
    mova       m2, [pf_s16_scale]
%if cpuflag(ssse3)
    mova       m3, [pb_interleave_words]
%endif
.loop:
    mulps      m0, m2, [src0q+lenq] ; m0 =    0,    2,    4,    6
    mulps      m1, m2, [src1q+lenq] ; m1 =    1,    3,    5,    7
    cvtps2dq   m0, m0
    cvtps2dq   m1, m1
%if cpuflag(ssse3)
    packssdw   m0, m1               ; m0 = 0, 2, 4, 6, 1, 3, 5, 7
    pshufb     m0, m3               ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
%else
    packssdw   m0, m0               ; m0 = 0, 2, 4, 6, x, x, x, x
    packssdw   m1, m1               ; m1 = 1, 3, 5, 7, x, x, x, x
    punpcklwd  m0, m1               ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
%endif
    mova  [dstq+lenq], m0
    add      lenq, mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_FLTP_TO_S16_2CH
INIT_XMM ssse3
CONV_FLTP_TO_S16_2CH

;------------------------------------------------------------------------------
; void ff_conv_fltp_to_s16_6ch(int16_t *dst, float *const *src, int len,
;                              int channels);
;------------------------------------------------------------------------------

%macro CONV_FLTP_TO_S16_6CH 0
%if ARCH_X86_64
cglobal conv_fltp_to_s16_6ch, 3,8,7, dst, src, len, src1, src2, src3, src4, src5
%else
cglobal conv_fltp_to_s16_6ch, 2,7,7, dst, src, src1, src2, src3, src4, src5
%define lend dword r2m
%endif
    mov        src1q, [srcq+1*gprsize]
    mov        src2q, [srcq+2*gprsize]
    mov        src3q, [srcq+3*gprsize]
    mov        src4q, [srcq+4*gprsize]
    mov        src5q, [srcq+5*gprsize]
    mov         srcq, [srcq]
    sub        src1q, srcq
    sub        src2q, srcq
    sub        src3q, srcq
    sub        src4q, srcq
    sub        src5q, srcq
    movaps      xmm6, [pf_s16_scale]
.loop:
%if cpuflag(sse2)
    mulps         m0, m6, [srcq      ]
    mulps         m1, m6, [srcq+src1q]
    mulps         m2, m6, [srcq+src2q]
    mulps         m3, m6, [srcq+src3q]
    mulps         m4, m6, [srcq+src4q]
    mulps         m5, m6, [srcq+src5q]
    cvtps2dq      m0, m0
    cvtps2dq      m1, m1
    cvtps2dq      m2, m2
    cvtps2dq      m3, m3
    cvtps2dq      m4, m4
    cvtps2dq      m5, m5
    packssdw      m0, m3            ; m0 =  0,  6, 12, 18,  3,  9, 15, 21
    packssdw      m1, m4            ; m1 =  1,  7, 13, 19,  4, 10, 16, 22
    packssdw      m2, m5            ; m2 =  2,  8, 14, 20,  5, 11, 17, 23
                                    ; unpack words:
    movhlps       m3, m0            ; m3 =  3,  9, 15, 21,  x,  x,  x,  x
    punpcklwd     m0, m1            ; m0 =  0,  1,  6,  7, 12, 13, 18, 19
    punpckhwd     m1, m2            ; m1 =  4,  5, 10, 11, 16, 17, 22, 23
    punpcklwd     m2, m3            ; m2 =  2,  3,  8,  9, 14, 15, 20, 21
                                    ; blend dwords:
    shufps        m3, m0, m2, q2020 ; m3 =  0,  1, 12, 13,  2,  3, 14, 15
    shufps        m0, m1, q2031     ; m0 =  6,  7, 18, 19,  4,  5, 16, 17
    shufps        m2, m1, q3131     ; m2 =  8,  9, 20, 21, 10, 11, 22, 23
                                    ; shuffle dwords:
    shufps        m1, m2, m3, q3120 ; m1 =  8,  9, 10, 11, 12, 13, 14, 15
    shufps        m3, m0,     q0220 ; m3 =  0,  1,  2,  3,  4,  5,  6,  7
    shufps        m0, m2,     q3113 ; m0 = 16, 17, 18, 19, 20, 21, 22, 23
    mova  [dstq+0*mmsize], m3
    mova  [dstq+1*mmsize], m1
    mova  [dstq+2*mmsize], m0
%else ; sse
    movlps      xmm0, [srcq      ]
    movlps      xmm1, [srcq+src1q]
    movlps      xmm2, [srcq+src2q]
    movlps      xmm3, [srcq+src3q]
    movlps      xmm4, [srcq+src4q]
    movlps      xmm5, [srcq+src5q]
    mulps       xmm0, xmm6
    mulps       xmm1, xmm6
    mulps       xmm2, xmm6
    mulps       xmm3, xmm6
    mulps       xmm4, xmm6
    mulps       xmm5, xmm6
    cvtps2pi     mm0, xmm0
    cvtps2pi     mm1, xmm1
    cvtps2pi     mm2, xmm2
    cvtps2pi     mm3, xmm3
    cvtps2pi     mm4, xmm4
    cvtps2pi     mm5, xmm5
    packssdw     mm0, mm3           ; m0 =  0,  6,  3,  9
    packssdw     mm1, mm4           ; m1 =  1,  7,  4, 10
    packssdw     mm2, mm5           ; m2 =  2,  8,  5, 11
                                    ; unpack words
    pshufw       mm3, mm0, q1032    ; m3 =  3,  9,  0,  6
    punpcklwd    mm0, mm1           ; m0 =  0,  1,  6,  7
    punpckhwd    mm1, mm2           ; m1 =  4,  5, 10, 11
    punpcklwd    mm2, mm3           ; m2 =  2,  3,  8,  9
                                    ; unpack dwords
    pshufw       mm3, mm0, q1032    ; m3 =  6,  7,  0,  1
    punpckldq    mm0, mm2           ; m0 =  0,  1,  2,  3 (final)
    punpckhdq    mm2, mm1           ; m2 =  8,  9, 10, 11 (final)
    punpckldq    mm1, mm3           ; m1 =  4,  5,  6,  7 (final)
    mova  [dstq+0*mmsize], mm0
    mova  [dstq+1*mmsize], mm1
    mova  [dstq+2*mmsize], mm2
%endif
    add       srcq, mmsize
    add       dstq, mmsize*3
    sub       lend, mmsize/4
    jg .loop
%if mmsize == 8
    emms
    RET
%else
    REP_RET
%endif
%endmacro

INIT_MMX sse
CONV_FLTP_TO_S16_6CH
INIT_XMM sse2
CONV_FLTP_TO_S16_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_FLTP_TO_S16_6CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_fltp_to_flt_2ch(float *dst, float *const *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
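; Plain float interleave, as a rough scalar C sketch:
;     for (int i = 0; i < len; i++)
;         for (int ch = 0; ch < 2; ch++)
;             dst[2 * i + ch] = src[ch][i];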

%macro CONV_FLTP_TO_FLT_2CH 0
cglobal conv_fltp_to_flt_2ch, 3,4,5, dst, src0, len, src1
    mov  src1q, [src0q+gprsize]
    mov  src0q, [src0q]
    lea   lenq, [4*lend]
    add  src0q, lenq
    add  src1q, lenq
    lea   dstq, [dstq+2*lenq]
    neg   lenq
.loop:
    mova    m0, [src0q+lenq       ]
    mova    m1, [src1q+lenq       ]
    mova    m2, [src0q+lenq+mmsize]
    mova    m3, [src1q+lenq+mmsize]
    SBUTTERFLYPS 0, 1, 4
    SBUTTERFLYPS 2, 3, 4
    mova  [dstq+2*lenq+0*mmsize], m0
    mova  [dstq+2*lenq+1*mmsize], m1
    mova  [dstq+2*lenq+2*mmsize], m2
    mova  [dstq+2*lenq+3*mmsize], m3
    add   lenq, 2*mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse
CONV_FLTP_TO_FLT_2CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_FLTP_TO_FLT_2CH
%endif

;-----------------------------------------------------------------------------
; void ff_conv_fltp_to_flt_6ch(float *dst, float *const *src, int len,
;                              int channels);
;-----------------------------------------------------------------------------

%macro CONV_FLTP_TO_FLT_6CH 0
cglobal conv_fltp_to_flt_6ch, 2,8,7, dst, src, src1, src2, src3, src4, src5, len
%if ARCH_X86_64
    mov     lend, r2d
%else
    %define lend dword r2m
%endif
    mov    src1q, [srcq+1*gprsize]
    mov    src2q, [srcq+2*gprsize]
    mov    src3q, [srcq+3*gprsize]
    mov    src4q, [srcq+4*gprsize]
    mov    src5q, [srcq+5*gprsize]
    mov     srcq, [srcq]
    sub    src1q, srcq
    sub    src2q, srcq
    sub    src3q, srcq
    sub    src4q, srcq
    sub    src5q, srcq
.loop:
    mova      m0, [srcq      ]
    mova      m1, [srcq+src1q]
    mova      m2, [srcq+src2q]
    mova      m3, [srcq+src3q]
    mova      m4, [srcq+src4q]
    mova      m5, [srcq+src5q]
%if cpuflag(sse4)
    SBUTTERFLYPS 0, 1, 6
    SBUTTERFLYPS 2, 3, 6
    SBUTTERFLYPS 4, 5, 6

    blendps   m6, m4, m0, 1100b
    movlhps   m0, m2
    movhlps   m4, m2
    blendps   m2, m5, m1, 1100b
    movlhps   m1, m3
    movhlps   m5, m3

    movaps [dstq   ], m0
    movaps [dstq+16], m6
    movaps [dstq+32], m4
    movaps [dstq+48], m1
    movaps [dstq+64], m2
    movaps [dstq+80], m5
%else ; mmx
    SBUTTERFLY dq, 0, 1, 6
    SBUTTERFLY dq, 2, 3, 6
    SBUTTERFLY dq, 4, 5, 6

    movq   [dstq   ], m0
    movq   [dstq+ 8], m2
    movq   [dstq+16], m4
    movq   [dstq+24], m1
    movq   [dstq+32], m3
    movq   [dstq+40], m5
%endif
    add      srcq, mmsize
    add      dstq, mmsize*6
    sub      lend, mmsize/4
    jg .loop
%if mmsize == 8
    emms
    RET
%else
    REP_RET
%endif
%endmacro

INIT_MMX mmx
CONV_FLTP_TO_FLT_6CH
INIT_XMM sse4
CONV_FLTP_TO_FLT_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_FLTP_TO_FLT_6CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_s16_to_s16p_2ch(int16_t *const *dst, int16_t *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
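; Interleaved to planar, as a rough scalar C sketch:
;     for (int i = 0; i < len; i++)
;         for (int ch = 0; ch < 2; ch++)
;             dst[ch][i] = src[2 * i + ch];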

%macro CONV_S16_TO_S16P_2CH 0
cglobal conv_s16_to_s16p_2ch, 3,4,4, dst0, src, len, dst1
    lea       lenq, [2*lend]
    mov      dst1q, [dst0q+gprsize]
    mov      dst0q, [dst0q        ]
    lea       srcq, [srcq+2*lenq]
    add      dst0q, lenq
    add      dst1q, lenq
    neg       lenq
%if cpuflag(ssse3)
    mova        m3, [pb_deinterleave_words]
%endif
.loop:
    mova        m0, [srcq+2*lenq       ]  ; m0 =  0,  1,  2,  3,  4,  5,  6,  7
    mova        m1, [srcq+2*lenq+mmsize]  ; m1 =  8,  9, 10, 11, 12, 13, 14, 15
%if cpuflag(ssse3)
    pshufb      m0, m3                    ; m0 =  0,  2,  4,  6,  1,  3,  5,  7
    pshufb      m1, m3                    ; m1 =  8, 10, 12, 14,  9, 11, 13, 15
    SBUTTERFLY2 qdq, 0, 1, 2              ; m0 =  0,  2,  4,  6,  8, 10, 12, 14
                                          ; m1 =  1,  3,  5,  7,  9, 11, 13, 15
%else ; sse2
    pshuflw     m0, m0, q3120             ; m0 =  0,  2,  1,  3,  4,  5,  6,  7
    pshufhw     m0, m0, q3120             ; m0 =  0,  2,  1,  3,  4,  6,  5,  7
    pshuflw     m1, m1, q3120             ; m1 =  8, 10,  9, 11, 12, 13, 14, 15
    pshufhw     m1, m1, q3120             ; m1 =  8, 10,  9, 11, 12, 14, 13, 15
    DEINT2_PS    0, 1, 2                  ; m0 =  0,  2,  4,  6,  8, 10, 12, 14
                                          ; m1 =  1,  3,  5,  7,  9, 11, 13, 15
%endif
    mova  [dst0q+lenq], m0
    mova  [dst1q+lenq], m1
    add       lenq, mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16_TO_S16P_2CH
INIT_XMM ssse3
CONV_S16_TO_S16P_2CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16_TO_S16P_2CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_s16_to_s16p_6ch(int16_t *const *dst, int16_t *src, int len,
;                              int channels);
;------------------------------------------------------------------------------

%macro CONV_S16_TO_S16P_6CH 0
%if ARCH_X86_64
cglobal conv_s16_to_s16p_6ch, 3,8,5, dst, src, len, dst1, dst2, dst3, dst4, dst5
%else
cglobal conv_s16_to_s16p_6ch, 2,7,5, dst, src, dst1, dst2, dst3, dst4, dst5
%define lend dword r2m
%endif
    mov     dst1q, [dstq+  gprsize]
    mov     dst2q, [dstq+2*gprsize]
    mov     dst3q, [dstq+3*gprsize]
    mov     dst4q, [dstq+4*gprsize]
    mov     dst5q, [dstq+5*gprsize]
    mov      dstq, [dstq          ]
    sub     dst1q, dstq
    sub     dst2q, dstq
    sub     dst3q, dstq
    sub     dst4q, dstq
    sub     dst5q, dstq
.loop:
    mova       m0, [srcq+0*mmsize]      ; m0 =  0,  1,  2,  3,  4,  5,  6,  7
    mova       m3, [srcq+1*mmsize]      ; m3 =  8,  9, 10, 11, 12, 13, 14, 15
    mova       m2, [srcq+2*mmsize]      ; m2 = 16, 17, 18, 19, 20, 21, 22, 23
    PALIGNR    m1, m3, m0, 12, m4       ; m1 =  6,  7,  8,  9, 10, 11,  x,  x
    shufps     m3, m2, q1032            ; m3 = 12, 13, 14, 15, 16, 17, 18, 19
    psrldq     m2, 4                    ; m2 = 18, 19, 20, 21, 22, 23,  x,  x
    SBUTTERFLY2 wd, 0, 1, 4             ; m0 =  0,  6,  1,  7,  2,  8,  3,  9
                                        ; m1 =  4, 10,  5, 11,  x,  x,  x,  x
    SBUTTERFLY2 wd, 3, 2, 4             ; m3 = 12, 18, 13, 19, 14, 20, 15, 21
                                        ; m2 = 16, 22, 17, 23,  x,  x,  x,  x
    SBUTTERFLY2 dq, 0, 3, 4             ; m0 =  0,  6, 12, 18,  1,  7, 13, 19
                                        ; m3 =  2,  8, 14, 20,  3,  9, 15, 21
    punpckldq  m1, m2                   ; m1 =  4, 10, 16, 22,  5, 11, 17, 23
    movq    [dstq      ], m0
    movhps  [dstq+dst1q], m0
    movq    [dstq+dst2q], m3
    movhps  [dstq+dst3q], m3
    movq    [dstq+dst4q], m1
    movhps  [dstq+dst5q], m1
    add      srcq, mmsize*3
    add      dstq, mmsize/2
    sub      lend, mmsize/4
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16_TO_S16P_6CH
INIT_XMM ssse3
CONV_S16_TO_S16P_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16_TO_S16P_6CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_s16_to_fltp_2ch(float *const *dst, int16_t *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
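; Rough scalar C sketch, splitting channels and scaling s16 to float:
;     for (int i = 0; i < len; i++)
;         for (int ch = 0; ch < 2; ch++)
;             dst[ch][i] = src[2 * i + ch] * (1.0f / 32768.0f);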

%macro CONV_S16_TO_FLTP_2CH 0
cglobal conv_s16_to_fltp_2ch, 3,4,5, dst0, src, len, dst1
    lea       lenq, [4*lend]
    mov      dst1q, [dst0q+gprsize]
    mov      dst0q, [dst0q        ]
    add       srcq, lenq
    add      dst0q, lenq
    add      dst1q, lenq
    neg       lenq
    mova        m3, [pf_s32_inv_scale]
    mova        m4, [pw_zero_even]
.loop:
    mova        m1, [srcq+lenq]
    pslld       m0, m1, 16
    pand        m1, m4
    cvtdq2ps    m0, m0
    cvtdq2ps    m1, m1
    mulps       m0, m0, m3
    mulps       m1, m1, m3
    mova  [dst0q+lenq], m0
    mova  [dst1q+lenq], m1
    add       lenq, mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16_TO_FLTP_2CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16_TO_FLTP_2CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_s16_to_fltp_6ch(float *const *dst, int16_t *src, int len,
;                              int channels);
;------------------------------------------------------------------------------

%macro CONV_S16_TO_FLTP_6CH 0
%if ARCH_X86_64
cglobal conv_s16_to_fltp_6ch, 3,8,7, dst, src, len, dst1, dst2, dst3, dst4, dst5
%else
cglobal conv_s16_to_fltp_6ch, 2,7,7, dst, src, dst1, dst2, dst3, dst4, dst5
%define lend dword r2m
%endif
    mov     dst1q, [dstq+  gprsize]
    mov     dst2q, [dstq+2*gprsize]
    mov     dst3q, [dstq+3*gprsize]
    mov     dst4q, [dstq+4*gprsize]
    mov     dst5q, [dstq+5*gprsize]
    mov      dstq, [dstq          ]
    sub     dst1q, dstq
    sub     dst2q, dstq
    sub     dst3q, dstq
    sub     dst4q, dstq
    sub     dst5q, dstq
    mova       m6, [pf_s16_inv_scale]
.loop:
    mova       m0, [srcq+0*mmsize]  ; m0 =  0,  1,  2,  3,  4,  5,  6,  7
    mova       m3, [srcq+1*mmsize]  ; m3 =  8,  9, 10, 11, 12, 13, 14, 15
    mova       m2, [srcq+2*mmsize]  ; m2 = 16, 17, 18, 19, 20, 21, 22, 23
    PALIGNR    m1, m3, m0, 12, m4   ; m1 =  6,  7,  8,  9, 10, 11,  x,  x
    shufps     m3, m2, q1032        ; m3 = 12, 13, 14, 15, 16, 17, 18, 19
    psrldq     m2, 4                ; m2 = 18, 19, 20, 21, 22, 23,  x,  x
    SBUTTERFLY2 wd, 0, 1, 4         ; m0 =  0,  6,  1,  7,  2,  8,  3,  9
                                    ; m1 =  4, 10,  5, 11,  x,  x,  x,  x
    SBUTTERFLY2 wd, 3, 2, 4         ; m3 = 12, 18, 13, 19, 14, 20, 15, 21
                                    ; m2 = 16, 22, 17, 23,  x,  x,  x,  x
    SBUTTERFLY2 dq, 0, 3, 4         ; m0 =  0,  6, 12, 18,  1,  7, 13, 19
                                    ; m3 =  2,  8, 14, 20,  3,  9, 15, 21
    punpckldq  m1, m2               ; m1 =  4, 10, 16, 22,  5, 11, 17, 23
    S16_TO_S32_SX 0, 2              ; m0 =      0,      6,     12,     18
                                    ; m2 =      1,      7,     13,     19
    S16_TO_S32_SX 3, 4              ; m3 =      2,      8,     14,     20
                                    ; m4 =      3,      9,     15,     21
    S16_TO_S32_SX 1, 5              ; m1 =      4,     10,     16,     22
                                    ; m5 =      5,     11,     17,     23
    SWAP 1,2,3,4
    cvtdq2ps   m0, m0
    cvtdq2ps   m1, m1
    cvtdq2ps   m2, m2
    cvtdq2ps   m3, m3
    cvtdq2ps   m4, m4
    cvtdq2ps   m5, m5
    mulps      m0, m6
    mulps      m1, m6
    mulps      m2, m6
    mulps      m3, m6
    mulps      m4, m6
    mulps      m5, m6
    mova  [dstq      ], m0
    mova  [dstq+dst1q], m1
    mova  [dstq+dst2q], m2
    mova  [dstq+dst3q], m3
    mova  [dstq+dst4q], m4
    mova  [dstq+dst5q], m5
    add      srcq, mmsize*3
    add      dstq, mmsize
    sub      lend, mmsize/4
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16_TO_FLTP_6CH
INIT_XMM ssse3
CONV_S16_TO_FLTP_6CH
INIT_XMM sse4
CONV_S16_TO_FLTP_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16_TO_FLTP_6CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_flt_to_s16p_2ch(int16_t *const *dst, float *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
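; Rough scalar C sketch (packssdw provides the s16 saturation):
;     for (int i = 0; i < len; i++)
;         for (int ch = 0; ch < 2; ch++)
;             dst[ch][i] = av_clip_int16(lrintf(src[2 * i + ch] * 32768.0f));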

%macro CONV_FLT_TO_S16P_2CH 0
cglobal conv_flt_to_s16p_2ch, 3,4,6, dst0, src, len, dst1
    lea       lenq, [2*lend]
    mov      dst1q, [dst0q+gprsize]
    mov      dst0q, [dst0q        ]
    lea       srcq, [srcq+4*lenq]
    add      dst0q, lenq
    add      dst1q, lenq
    neg       lenq
    mova        m5, [pf_s16_scale]
.loop:
    mova       m0, [srcq+4*lenq         ]
    mova       m1, [srcq+4*lenq+  mmsize]
    mova       m2, [srcq+4*lenq+2*mmsize]
    mova       m3, [srcq+4*lenq+3*mmsize]
    DEINT2_PS   0, 1, 4
    DEINT2_PS   2, 3, 4
    mulps      m0, m0, m5
    mulps      m1, m1, m5
    mulps      m2, m2, m5
    mulps      m3, m3, m5
    cvtps2dq   m0, m0
    cvtps2dq   m1, m1
    cvtps2dq   m2, m2
    cvtps2dq   m3, m3
    packssdw   m0, m2
    packssdw   m1, m3
    mova  [dst0q+lenq], m0
    mova  [dst1q+lenq], m1
    add      lenq, mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_FLT_TO_S16P_2CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_FLT_TO_S16P_2CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_flt_to_s16p_6ch(int16_t *const *dst, float *src, int len,
;                              int channels);
;------------------------------------------------------------------------------

%macro CONV_FLT_TO_S16P_6CH 0
%if ARCH_X86_64
cglobal conv_flt_to_s16p_6ch, 3,8,7, dst, src, len, dst1, dst2, dst3, dst4, dst5
%else
cglobal conv_flt_to_s16p_6ch, 2,7,7, dst, src, dst1, dst2, dst3, dst4, dst5
%define lend dword r2m
%endif
    mov     dst1q, [dstq+  gprsize]
    mov     dst2q, [dstq+2*gprsize]
    mov     dst3q, [dstq+3*gprsize]
    mov     dst4q, [dstq+4*gprsize]
    mov     dst5q, [dstq+5*gprsize]
    mov      dstq, [dstq          ]
    sub     dst1q, dstq
    sub     dst2q, dstq
    sub     dst3q, dstq
    sub     dst4q, dstq
    sub     dst5q, dstq
    mova       m6, [pf_s16_scale]
.loop:
    mulps      m0, m6, [srcq+0*mmsize]
    mulps      m3, m6, [srcq+1*mmsize]
    mulps      m1, m6, [srcq+2*mmsize]
    mulps      m4, m6, [srcq+3*mmsize]
    mulps      m2, m6, [srcq+4*mmsize]
    mulps      m5, m6, [srcq+5*mmsize]
    cvtps2dq   m0, m0
    cvtps2dq   m1, m1
    cvtps2dq   m2, m2
    cvtps2dq   m3, m3
    cvtps2dq   m4, m4
    cvtps2dq   m5, m5
    packssdw   m0, m3               ; m0 =  0,  1,  2,  3,  4,  5,  6,  7
    packssdw   m1, m4               ; m1 =  8,  9, 10, 11, 12, 13, 14, 15
    packssdw   m2, m5               ; m2 = 16, 17, 18, 19, 20, 21, 22, 23
    PALIGNR    m3, m1, m0, 12, m4   ; m3 =  6,  7,  8,  9, 10, 11,  x,  x
    shufps     m1, m2, q1032        ; m1 = 12, 13, 14, 15, 16, 17, 18, 19
    psrldq     m2, 4                ; m2 = 18, 19, 20, 21, 22, 23,  x,  x
    SBUTTERFLY2 wd, 0, 3, 4         ; m0 =  0,  6,  1,  7,  2,  8,  3,  9
                                    ; m3 =  4, 10,  5, 11,  x,  x,  x,  x
    SBUTTERFLY2 wd, 1, 2, 4         ; m1 = 12, 18, 13, 19, 14, 20, 15, 21
                                    ; m2 = 16, 22, 17, 23,  x,  x,  x,  x
    SBUTTERFLY2 dq, 0, 1, 4         ; m0 =  0,  6, 12, 18,  1,  7, 13, 19
                                    ; m1 =  2,  8, 14, 20,  3,  9, 15, 21
    punpckldq  m3, m2               ; m3 =  4, 10, 16, 22,  5, 11, 17, 23
    movq    [dstq      ], m0
    movhps  [dstq+dst1q], m0
    movq    [dstq+dst2q], m1
    movhps  [dstq+dst3q], m1
    movq    [dstq+dst4q], m3
    movhps  [dstq+dst5q], m3
    add      srcq, mmsize*6
    add      dstq, mmsize/2
    sub      lend, mmsize/4
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_FLT_TO_S16P_6CH
INIT_XMM ssse3
CONV_FLT_TO_S16P_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_FLT_TO_S16P_6CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_flt_to_fltp_2ch(float *const *dst, float *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
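; Plain float deinterleave, as a rough scalar C sketch:
;     for (int i = 0; i < len; i++)
;         for (int ch = 0; ch < 2; ch++)
;             dst[ch][i] = src[2 * i + ch];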

%macro CONV_FLT_TO_FLTP_2CH 0
cglobal conv_flt_to_fltp_2ch, 3,4,3, dst0, src, len, dst1
    lea    lenq, [4*lend]
    mov   dst1q, [dst0q+gprsize]
    mov   dst0q, [dst0q        ]
    lea    srcq, [srcq+2*lenq]
    add   dst0q, lenq
    add   dst1q, lenq
    neg    lenq
.loop:
    mova     m0, [srcq+2*lenq       ]
    mova     m1, [srcq+2*lenq+mmsize]
    DEINT2_PS 0, 1, 2
    mova  [dst0q+lenq], m0
    mova  [dst1q+lenq], m1
    add    lenq, mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse
CONV_FLT_TO_FLTP_2CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_FLT_TO_FLTP_2CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_flt_to_fltp_6ch(float *const *dst, float *src, int len,
;                              int channels);
;------------------------------------------------------------------------------

%macro CONV_FLT_TO_FLTP_6CH 0
%if ARCH_X86_64
cglobal conv_flt_to_fltp_6ch, 3,8,7, dst, src, len, dst1, dst2, dst3, dst4, dst5
%else
cglobal conv_flt_to_fltp_6ch, 2,7,7, dst, src, dst1, dst2, dst3, dst4, dst5
%define lend dword r2m
%endif
    mov     dst1q, [dstq+  gprsize]
    mov     dst2q, [dstq+2*gprsize]
    mov     dst3q, [dstq+3*gprsize]
    mov     dst4q, [dstq+4*gprsize]
    mov     dst5q, [dstq+5*gprsize]
    mov      dstq, [dstq          ]
    sub     dst1q, dstq
    sub     dst2q, dstq
    sub     dst3q, dstq
    sub     dst4q, dstq
    sub     dst5q, dstq
.loop:
    mova       m0, [srcq+0*mmsize]  ; m0 =  0,  1,  2,  3
    mova       m1, [srcq+1*mmsize]  ; m1 =  4,  5,  6,  7
    mova       m2, [srcq+2*mmsize]  ; m2 =  8,  9, 10, 11
    mova       m3, [srcq+3*mmsize]  ; m3 = 12, 13, 14, 15
    mova       m4, [srcq+4*mmsize]  ; m4 = 16, 17, 18, 19
    mova       m5, [srcq+5*mmsize]  ; m5 = 20, 21, 22, 23

    SBUTTERFLY2 dq, 0, 3, 6         ; m0 =  0, 12,  1, 13
                                    ; m3 =  2, 14,  3, 15
    SBUTTERFLY2 dq, 1, 4, 6         ; m1 =  4, 16,  5, 17
                                    ; m4 =  6, 18,  7, 19
    SBUTTERFLY2 dq, 2, 5, 6         ; m2 =  8, 20,  9, 21
                                    ; m5 = 10, 22, 11, 23
    SBUTTERFLY2 dq, 0, 4, 6         ; m0 =  0,  6, 12, 18
                                    ; m4 =  1,  7, 13, 19
    SBUTTERFLY2 dq, 3, 2, 6         ; m3 =  2,  8, 14, 20
                                    ; m2 =  3,  9, 15, 21
    SBUTTERFLY2 dq, 1, 5, 6         ; m1 =  4, 10, 16, 22
                                    ; m5 =  5, 11, 17, 23
    mova [dstq      ], m0
    mova [dstq+dst1q], m4
    mova [dstq+dst2q], m3
    mova [dstq+dst3q], m2
    mova [dstq+dst4q], m1
    mova [dstq+dst5q], m5
    add      srcq, mmsize*6
    add      dstq, mmsize
    sub      lend, mmsize/4
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_FLT_TO_FLTP_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_FLT_TO_FLTP_6CH
%endif