; NOTE(review): the following was web code-viewer navigation residue, not source
; ("Home / Line# / Scopes# / Navigate / Raw / Download"); commented out so the
; file remains assemblable.
;******************************************************************************
;* Copyright (c) 2012 Michael Niedermayer
;* Copyright (c) 2014 James Almer <jamrial <at> gmail.com>
;* Copyright (c) 2014 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

23%include "libavutil/x86/x86util.asm"
24
25%if ARCH_X86_64
26%define pointer resq
27%else
28%define pointer resd
29%endif
30
31struc ResampleContext
32    .av_class:              pointer 1
33    .filter_bank:           pointer 1
34    .filter_length:         resd 1
35    .filter_alloc:          resd 1
36    .ideal_dst_incr:        resd 1
37    .dst_incr:              resd 1
38    .dst_incr_div:          resd 1
39    .dst_incr_mod:          resd 1
40    .index:                 resd 1
41    .frac:                  resd 1
42    .src_incr:              resd 1
43    .compensation_distance: resd 1
44    .phase_count:           resd 1
45
46    ; there's a few more here but we only care about the first few
47endstruc
48
49SECTION_RODATA
50
51pf_1:      dd 1.0
52pdbl_1:    dq 1.0
53pd_0x4000: dd 0x4000
54
55SECTION .text
56
57; FIXME remove unneeded variables (index_incr, phase_mask)
58%macro RESAMPLE_FNS 3-5 ; format [float or int16], bps, log2_bps, float op suffix [s or d], 1.0 constant
59; int resample_common_$format(ResampleContext *ctx, $format *dst,
60;                             const $format *src, int size, int update_ctx)
61%if ARCH_X86_64 ; unix64 and win64
62cglobal resample_common_%1, 0, 15, 2, ctx, dst, src, phase_count, index, frac, \
63                                      dst_incr_mod, size, min_filter_count_x4, \
64                                      min_filter_len_x4, dst_incr_div, src_incr, \
65                                      phase_mask, dst_end, filter_bank
66
67    ; use red-zone for variable storage
68%define ctx_stackq            [rsp-0x8]
69%define src_stackq            [rsp-0x10]
70%if WIN64
71%define update_context_stackd r4m
72%else ; unix64
73%define update_context_stackd [rsp-0x14]
74%endif
75
76    ; load as many variables in registers as possible; for the rest, store
77    ; on stack so that we have 'ctx' available as one extra register
78    mov                        sized, r3d
79%if UNIX64
80    mov        update_context_stackd, r4d
81%endif
82    mov                       indexd, [ctxq+ResampleContext.index]
83    mov                        fracd, [ctxq+ResampleContext.frac]
84    mov                dst_incr_modd, [ctxq+ResampleContext.dst_incr_mod]
85    mov                 filter_bankq, [ctxq+ResampleContext.filter_bank]
86    mov                    src_incrd, [ctxq+ResampleContext.src_incr]
87    mov                   ctx_stackq, ctxq
88    mov           min_filter_len_x4d, [ctxq+ResampleContext.filter_length]
89    mov                dst_incr_divd, [ctxq+ResampleContext.dst_incr_div]
90    shl           min_filter_len_x4d, %3
91    lea                     dst_endq, [dstq+sizeq*%2]
92
93%if UNIX64
94    mov                          ecx, [ctxq+ResampleContext.phase_count]
95    mov                          edi, [ctxq+ResampleContext.filter_alloc]
96
97    DEFINE_ARGS filter_alloc, dst, src, phase_count, index, frac, dst_incr_mod, \
98                filter, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \
99                src_incr, phase_mask, dst_end, filter_bank
100%elif WIN64
101    mov                          R9d, [ctxq+ResampleContext.filter_alloc]
102    mov                          ecx, [ctxq+ResampleContext.phase_count]
103
104    DEFINE_ARGS phase_count, dst, src, filter_alloc, index, frac, dst_incr_mod, \
105                filter, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \
106                src_incr, phase_mask, dst_end, filter_bank
107%endif
108
109    neg           min_filter_len_x4q
110    sub                 filter_bankq, min_filter_len_x4q
111    sub                         srcq, min_filter_len_x4q
112    mov                   src_stackq, srcq
113%else ; x86-32
114cglobal resample_common_%1, 1, 7, 2, ctx, phase_count, dst, frac, \
115                                     index, min_filter_length_x4, filter_bank
116
117    ; push temp variables to stack
118%define ctx_stackq            r0mp
119%define src_stackq            r2mp
120%define update_context_stackd r4m
121
122    mov                         dstq, r1mp
123    mov                           r3, r3mp
124    lea                           r3, [dstq+r3*%2]
125    PUSH                              dword [ctxq+ResampleContext.dst_incr_div]
126    PUSH                              dword [ctxq+ResampleContext.dst_incr_mod]
127    PUSH                              dword [ctxq+ResampleContext.filter_alloc]
128    PUSH                              r3
129    PUSH                              dword [ctxq+ResampleContext.phase_count]  ; unneeded replacement for phase_mask
130    PUSH                              dword [ctxq+ResampleContext.src_incr]
131    mov        min_filter_length_x4d, [ctxq+ResampleContext.filter_length]
132    mov                       indexd, [ctxq+ResampleContext.index]
133    shl        min_filter_length_x4d, %3
134    mov                        fracd, [ctxq+ResampleContext.frac]
135    neg        min_filter_length_x4q
136    mov                 filter_bankq, [ctxq+ResampleContext.filter_bank]
137    sub                         r2mp, min_filter_length_x4q
138    sub                 filter_bankq, min_filter_length_x4q
139    PUSH                              min_filter_length_x4q
140    PUSH                              filter_bankq
141    mov                 phase_countd, [ctxq+ResampleContext.phase_count]
142
143    DEFINE_ARGS src, phase_count, dst, frac, index, min_filter_count_x4, filter
144
145%define filter_bankq          dword [rsp+0x0]
146%define min_filter_length_x4q dword [rsp+0x4]
147%define src_incrd             dword [rsp+0x8]
148%define phase_maskd           dword [rsp+0xc]
149%define dst_endq              dword [rsp+0x10]
150%define filter_allocd         dword [rsp+0x14]
151%define dst_incr_modd         dword [rsp+0x18]
152%define dst_incr_divd         dword [rsp+0x1c]
153
154    mov                         srcq, r2mp
155%endif
156
157.loop:
158    mov                      filterd, filter_allocd
159    imul                     filterd, indexd
160%if ARCH_X86_64
161    mov         min_filter_count_x4q, min_filter_len_x4q
162    lea                      filterq, [filter_bankq+filterq*%2]
163%else ; x86-32
164    mov         min_filter_count_x4q, filter_bankq
165    lea                      filterq, [min_filter_count_x4q+filterq*%2]
166    mov         min_filter_count_x4q, min_filter_length_x4q
167%endif
168%ifidn %1, int16
169    movd                          m0, [pd_0x4000]
170%else ; float/double
171    xorps                         m0, m0, m0
172%endif
173
174    align 16
175.inner_loop:
176    movu                          m1, [srcq+min_filter_count_x4q*1]
177%ifidn %1, int16
178%if cpuflag(xop)
179    vpmadcswd                     m0, m1, [filterq+min_filter_count_x4q*1], m0
180%else
181    pmaddwd                       m1, [filterq+min_filter_count_x4q*1]
182    paddd                         m0, m1
183%endif
184%else ; float/double
185%if cpuflag(fma4) || cpuflag(fma3)
186    fmaddp%4                      m0, m1, [filterq+min_filter_count_x4q*1], m0
187%else
188    mulp%4                        m1, m1, [filterq+min_filter_count_x4q*1]
189    addp%4                        m0, m0, m1
190%endif ; cpuflag
191%endif
192    add         min_filter_count_x4q, mmsize
193    js .inner_loop
194
195%ifidn %1, int16
196    HADDD                         m0, m1
197    psrad                         m0, 15
198    add                        fracd, dst_incr_modd
199    packssdw                      m0, m0
200    add                       indexd, dst_incr_divd
201    movd                      [dstq], m0
202%else ; float/double
203    ; horizontal sum & store
204%if mmsize == 32
205    vextractf128                 xm1, m0, 0x1
206    addp%4                       xm0, xm1
207%endif
208    movhlps                      xm1, xm0
209%ifidn %1, float
210    addps                        xm0, xm1
211    shufps                       xm1, xm0, xm0, q0001
212%endif
213    add                        fracd, dst_incr_modd
214    addp%4                       xm0, xm1
215    add                       indexd, dst_incr_divd
216    movs%4                    [dstq], xm0
217%endif
218    cmp                        fracd, src_incrd
219    jl .skip
220    sub                        fracd, src_incrd
221    inc                       indexd
222
223%if UNIX64
224    DEFINE_ARGS filter_alloc, dst, src, phase_count, index, frac, dst_incr_mod, \
225                index_incr, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \
226                src_incr, phase_mask, dst_end, filter_bank
227%elif WIN64
228    DEFINE_ARGS phase_count, dst, src, filter_alloc, index, frac, dst_incr_mod, \
229                index_incr, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \
230                src_incr, phase_mask, dst_end, filter_bank
231%else ; x86-32
232    DEFINE_ARGS src, phase_count, dst, frac, index, index_incr
233%endif
234
235.skip:
236    add                         dstq, %2
237    cmp                       indexd, phase_countd
238    jb .index_skip
239.index_while:
240    sub                       indexd, phase_countd
241    lea                         srcq, [srcq+%2]
242    cmp                       indexd, phase_countd
243    jnb .index_while
244.index_skip:
245    cmp                         dstq, dst_endq
246    jne .loop
247
248%if ARCH_X86_64
249    DEFINE_ARGS ctx, dst, src, phase_count, index, frac
250%else ; x86-32
251    DEFINE_ARGS src, ctx, update_context, frac, index
252%endif
253
254    cmp  dword update_context_stackd, 0
255    jz .skip_store
256    ; strictly speaking, the function should always return the consumed
257    ; number of bytes; however, we only use the value if update_context
258    ; is true, so let's just leave it uninitialized otherwise
259    mov                         ctxq, ctx_stackq
260    movifnidn                    rax, srcq
261    mov [ctxq+ResampleContext.frac ], fracd
262    sub                          rax, src_stackq
263    mov [ctxq+ResampleContext.index], indexd
264    shr                          rax, %3
265
266.skip_store:
267%if ARCH_X86_32
268    ADD                          rsp, 0x20
269%endif
270    RET
271
272; int resample_linear_$format(ResampleContext *ctx, float *dst,
273;                             const float *src, int size, int update_ctx)
274%if ARCH_X86_64 ; unix64 and win64
275%if UNIX64
276cglobal resample_linear_%1, 0, 15, 5, ctx, dst, phase_mask, phase_count, index, frac, \
277                                      size, dst_incr_mod, min_filter_count_x4, \
278                                      min_filter_len_x4, dst_incr_div, src_incr, \
279                                      src, dst_end, filter_bank
280
281    mov                         srcq, r2mp
282%else ; win64
283cglobal resample_linear_%1, 0, 15, 5, ctx, phase_mask, src, phase_count, index, frac, \
284                                      size, dst_incr_mod, min_filter_count_x4, \
285                                      min_filter_len_x4, dst_incr_div, src_incr, \
286                                      dst, dst_end, filter_bank
287
288    mov                         dstq, r1mp
289%endif
290
291    ; use red-zone for variable storage
292%define ctx_stackq            [rsp-0x8]
293%define src_stackq            [rsp-0x10]
294%define phase_mask_stackd     [rsp-0x14]
295%if WIN64
296%define update_context_stackd r4m
297%else ; unix64
298%define update_context_stackd [rsp-0x18]
299%endif
300
301    ; load as many variables in registers as possible; for the rest, store
302    ; on stack so that we have 'ctx' available as one extra register
303    mov                        sized, r3d
304%if UNIX64
305    mov        update_context_stackd, r4d
306%endif
307    mov                       indexd, [ctxq+ResampleContext.index]
308    mov                        fracd, [ctxq+ResampleContext.frac]
309    mov                dst_incr_modd, [ctxq+ResampleContext.dst_incr_mod]
310    mov                 filter_bankq, [ctxq+ResampleContext.filter_bank]
311    mov                    src_incrd, [ctxq+ResampleContext.src_incr]
312    mov                   ctx_stackq, ctxq
313    mov           min_filter_len_x4d, [ctxq+ResampleContext.filter_length]
314%ifidn %1, int16
315    movd                          m4, [pd_0x4000]
316%else ; float/double
317    cvtsi2s%4                    xm0, src_incrd
318    movs%4                       xm4, [%5]
319    divs%4                       xm4, xm0
320%endif
321    mov                dst_incr_divd, [ctxq+ResampleContext.dst_incr_div]
322    shl           min_filter_len_x4d, %3
323    lea                     dst_endq, [dstq+sizeq*%2]
324
325%if UNIX64
326    mov                          ecx, [ctxq+ResampleContext.phase_count]
327    mov                          edi, [ctxq+ResampleContext.filter_alloc]
328
329    DEFINE_ARGS filter_alloc, dst, filter2, phase_count, index, frac, filter1, \
330                dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
331                dst_incr_div, src_incr, src, dst_end, filter_bank
332%elif WIN64
333    mov                          R9d, [ctxq+ResampleContext.filter_alloc]
334    mov                          ecx, [ctxq+ResampleContext.phase_count]
335
336    DEFINE_ARGS phase_count, filter2, src, filter_alloc, index, frac, filter1, \
337                dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
338                dst_incr_div, src_incr, dst, dst_end, filter_bank
339%endif
340
341    neg           min_filter_len_x4q
342    sub                 filter_bankq, min_filter_len_x4q
343    sub                         srcq, min_filter_len_x4q
344    mov                   src_stackq, srcq
345%else ; x86-32
346cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \
347                                     frac, index, dst, filter_bank
348
349    ; push temp variables to stack
350%define ctx_stackq            r0mp
351%define src_stackq            r2mp
352%define update_context_stackd r4m
353
354    mov                         dstq, r1mp
355    mov                           r3, r3mp
356    lea                           r3, [dstq+r3*%2]
357    PUSH                              dword [ctxq+ResampleContext.dst_incr_div]
358    PUSH                              r3
359    mov                           r3, dword [ctxq+ResampleContext.filter_alloc]
360    PUSH                              dword [ctxq+ResampleContext.dst_incr_mod]
361    PUSH                              r3
362    shl                           r3, %3
363    PUSH                              r3
364    mov                           r3, dword [ctxq+ResampleContext.src_incr]
365    PUSH                              dword [ctxq+ResampleContext.phase_count]  ; unneeded replacement of phase_mask
366    PUSH                              r3d
367%ifidn %1, int16
368    movd                          m4, [pd_0x4000]
369%else ; float/double
370    cvtsi2s%4                    xm0, r3d
371    movs%4                       xm4, [%5]
372    divs%4                       xm4, xm0
373%endif
374    mov        min_filter_length_x4d, [ctxq+ResampleContext.filter_length]
375    mov                       indexd, [ctxq+ResampleContext.index]
376    shl        min_filter_length_x4d, %3
377    mov                        fracd, [ctxq+ResampleContext.frac]
378    neg        min_filter_length_x4q
379    mov                 filter_bankq, [ctxq+ResampleContext.filter_bank]
380    sub                         r2mp, min_filter_length_x4q
381    sub                 filter_bankq, min_filter_length_x4q
382    PUSH                              min_filter_length_x4q
383    PUSH                              filter_bankq
384    PUSH                              dword [ctxq+ResampleContext.phase_count]
385
386    DEFINE_ARGS filter1, min_filter_count_x4, filter2, frac, index, dst, src
387
388%define phase_count_stackd    dword [rsp+0x0]
389%define filter_bankq          dword [rsp+0x4]
390%define min_filter_length_x4q dword [rsp+0x8]
391%define src_incrd             dword [rsp+0xc]
392%define phase_mask_stackd     dword [rsp+0x10]
393%define filter_alloc_x4q      dword [rsp+0x14]
394%define filter_allocd         dword [rsp+0x18]
395%define dst_incr_modd         dword [rsp+0x1c]
396%define dst_endq              dword [rsp+0x20]
397%define dst_incr_divd         dword [rsp+0x24]
398
399    mov                         srcq, r2mp
400%endif
401
402.loop:
403    mov                     filter1d, filter_allocd
404    imul                    filter1d, indexd
405%if ARCH_X86_64
406    mov         min_filter_count_x4q, min_filter_len_x4q
407    lea                     filter1q, [filter_bankq+filter1q*%2]
408    lea                     filter2q, [filter1q+filter_allocq*%2]
409%else ; x86-32
410    mov         min_filter_count_x4q, filter_bankq
411    lea                     filter1q, [min_filter_count_x4q+filter1q*%2]
412    mov         min_filter_count_x4q, min_filter_length_x4q
413    mov                     filter2q, filter1q
414    add                     filter2q, filter_alloc_x4q
415%endif
416%ifidn %1, int16
417    mova                          m0, m4
418    mova                          m2, m4
419%else ; float/double
420    xorps                         m0, m0, m0
421    xorps                         m2, m2, m2
422%endif
423
424    align 16
425.inner_loop:
426    movu                          m1, [srcq+min_filter_count_x4q*1]
427%ifidn %1, int16
428%if cpuflag(xop)
429    vpmadcswd                     m2, m1, [filter2q+min_filter_count_x4q*1], m2
430    vpmadcswd                     m0, m1, [filter1q+min_filter_count_x4q*1], m0
431%else
432    pmaddwd                       m3, m1, [filter2q+min_filter_count_x4q*1]
433    pmaddwd                       m1, [filter1q+min_filter_count_x4q*1]
434    paddd                         m2, m3
435    paddd                         m0, m1
436%endif ; cpuflag
437%else ; float/double
438%if cpuflag(fma4) || cpuflag(fma3)
439    fmaddp%4                      m2, m1, [filter2q+min_filter_count_x4q*1], m2
440    fmaddp%4                      m0, m1, [filter1q+min_filter_count_x4q*1], m0
441%else
442    mulp%4                        m3, m1, [filter2q+min_filter_count_x4q*1]
443    mulp%4                        m1, m1, [filter1q+min_filter_count_x4q*1]
444    addp%4                        m2, m2, m3
445    addp%4                        m0, m0, m1
446%endif ; cpuflag
447%endif
448    add         min_filter_count_x4q, mmsize
449    js .inner_loop
450
451%ifidn %1, int16
452%if mmsize == 16
453%if cpuflag(xop)
454    vphadddq                      m2, m2
455    vphadddq                      m0, m0
456%endif
457    pshufd                        m3, m2, q0032
458    pshufd                        m1, m0, q0032
459    paddd                         m2, m3
460    paddd                         m0, m1
461%endif
462%if notcpuflag(xop)
463    PSHUFLW                       m3, m2, q0032
464    PSHUFLW                       m1, m0, q0032
465    paddd                         m2, m3
466    paddd                         m0, m1
467%endif
468    psubd                         m2, m0
469    ; This is probably a really bad idea on atom and other machines with a
470    ; long transfer latency between GPRs and XMMs (atom). However, it does
471    ; make the clip a lot simpler...
472    movd                         eax, m2
473    add                       indexd, dst_incr_divd
474    imul                              fracd
475    idiv                              src_incrd
476    movd                          m1, eax
477    add                        fracd, dst_incr_modd
478    paddd                         m0, m1
479    psrad                         m0, 15
480    packssdw                      m0, m0
481    movd                      [dstq], m0
482
483    ; note that for imul/idiv, I need to move filter to edx/eax for each:
484    ; - 32bit: eax=r0[filter1], edx=r2[filter2]
485    ; - win64: eax=r6[filter1], edx=r1[todo]
486    ; - unix64: eax=r6[filter1], edx=r2[todo]
487%else ; float/double
488    ; val += (v2 - val) * (FELEML) frac / c->src_incr;
489%if mmsize == 32
490    vextractf128                 xm1, m0, 0x1
491    vextractf128                 xm3, m2, 0x1
492    addp%4                       xm0, xm1
493    addp%4                       xm2, xm3
494%endif
495    cvtsi2s%4                    xm1, fracd
496    subp%4                       xm2, xm0
497    mulp%4                       xm1, xm4
498    shufp%4                      xm1, xm1, q0000
499%if cpuflag(fma4) || cpuflag(fma3)
500    fmaddp%4                     xm0, xm2, xm1, xm0
501%else
502    mulp%4                       xm2, xm1
503    addp%4                       xm0, xm2
504%endif ; cpuflag
505
506    ; horizontal sum & store
507    movhlps                      xm1, xm0
508%ifidn %1, float
509    addps                        xm0, xm1
510    shufps                       xm1, xm0, xm0, q0001
511%endif
512    add                        fracd, dst_incr_modd
513    addp%4                       xm0, xm1
514    add                       indexd, dst_incr_divd
515    movs%4                    [dstq], xm0
516%endif
517    cmp                        fracd, src_incrd
518    jl .skip
519    sub                        fracd, src_incrd
520    inc                       indexd
521
522%if UNIX64
523    DEFINE_ARGS filter_alloc, dst, filter2, phase_count, index, frac, index_incr, \
524                dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
525                dst_incr_div, src_incr, src, dst_end, filter_bank
526%elif WIN64
527    DEFINE_ARGS phase_count, filter2, src, filter_alloc, index, frac, index_incr, \
528                dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
529                dst_incr_div, src_incr, dst, dst_end, filter_bank
530%else ; x86-32
531    DEFINE_ARGS filter1, phase_count, index_incr, frac, index, dst, src
532%endif
533
534.skip:
535%if ARCH_X86_32
536    mov                 phase_countd, phase_count_stackd
537%endif
538    add                         dstq, %2
539    cmp                       indexd, phase_countd
540    jb .index_skip
541.index_while:
542    sub                       indexd, phase_countd
543    lea                         srcq, [srcq+%2]
544    cmp                       indexd, phase_countd
545    jnb .index_while
546.index_skip:
547    cmp                         dstq, dst_endq
548    jne .loop
549
550%if UNIX64
551    DEFINE_ARGS ctx, dst, filter2, phase_count, index, frac, index_incr, \
552                dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
553                dst_incr_div, src_incr, src, dst_end, filter_bank
554%elif WIN64
555    DEFINE_ARGS ctx, filter2, src, phase_count, index, frac, index_incr, \
556                dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
557                dst_incr_div, src_incr, dst, dst_end, filter_bank
558%else ; x86-32
559    DEFINE_ARGS filter1, ctx, update_context, frac, index, dst, src
560%endif
561
562    cmp  dword update_context_stackd, 0
563    jz .skip_store
564    ; strictly speaking, the function should always return the consumed
565    ; number of bytes; however, we only use the value if update_context
566    ; is true, so let's just leave it uninitialized otherwise
567    mov                         ctxq, ctx_stackq
568    movifnidn                    rax, srcq
569    mov [ctxq+ResampleContext.frac ], fracd
570    sub                          rax, src_stackq
571    mov [ctxq+ResampleContext.index], indexd
572    shr                          rax, %3
573
574.skip_store:
575%if ARCH_X86_32
576    ADD                          rsp, 0x28
577%endif
578    RET
579%endmacro
580
581INIT_XMM sse
582RESAMPLE_FNS float, 4, 2, s, pf_1
583
584%if HAVE_AVX_EXTERNAL
585INIT_YMM avx
586RESAMPLE_FNS float, 4, 2, s, pf_1
587%endif
588%if HAVE_FMA3_EXTERNAL
589INIT_YMM fma3
590RESAMPLE_FNS float, 4, 2, s, pf_1
591%endif
592%if HAVE_FMA4_EXTERNAL
593INIT_XMM fma4
594RESAMPLE_FNS float, 4, 2, s, pf_1
595%endif
596
597%if ARCH_X86_32
598INIT_MMX mmxext
599RESAMPLE_FNS int16, 2, 1
600%endif
601
602INIT_XMM sse2
603RESAMPLE_FNS int16, 2, 1
604%if HAVE_XOP_EXTERNAL
605INIT_XMM xop
606RESAMPLE_FNS int16, 2, 1
607%endif
608
609INIT_XMM sse2
610RESAMPLE_FNS double, 8, 3, d, pdbl_1
611
612%if HAVE_AVX_EXTERNAL
613INIT_YMM avx
614RESAMPLE_FNS double, 8, 3, d, pdbl_1
615%endif
616%if HAVE_FMA3_EXTERNAL
617INIT_YMM fma3
618RESAMPLE_FNS double, 8, 3, d, pdbl_1
619%endif
620