;******************************************************************************
;* x86-optimized horizontal line scaling functions
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

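; max_19bit_*: clip limit (2^19 - 1) for the 19-bit output path, as an integer
;              for SSE4's pminsd and as a float, presumably for the float-based
;              min used by the pre-SSE4 PMINSD emulation
; minshort:    0x8000 bias that turns unsigned 16-bit input into signed words
;              so pmaddwd can be used (see the loops below)
; unicoeff:    0x8000 << 14 == 0x20000000, the bias correction added back after
;              the horizontal add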
max_19bit_int: times 4 dd 0x7ffff
max_19bit_flt: times 4 dd 524287.0
minshort:      times 8 dw 0x8000
unicoeff:      times 4 dd 0x20000000

SECTION .text

;-----------------------------------------------------------------------------
; horizontal line scaling
;
; void hscale<source_width>to<intermediate_nbits>_<filterSize>_<opt>
;                               (SwsContext *c, int{16,32}_t *dst,
;                                int dstW, const uint{8,16}_t *src,
;                                const int16_t *filter,
;                                const int32_t *filterPos, int filterSize);
;
; Scale one horizontal line. Input is either 8-bit or 16-bit wide
; ($source_width can be 8, 9, 10, 12, 14 or 16; the width determines whether
; we have to downscale before multiplying). The filter is 14 bits. Output is
; either 15 bits (in int16_t) or 19 bits (in int32_t), as given by
; $intermediate_nbits. Each output pixel is generated from $filterSize input
; pixels; the position of the first one is given in filterPos[nOutputPixel].
;-----------------------------------------------------------------------------
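; As a point of reference, each variant computes roughly the following
; (illustrative C sketch only; in the asm the 15-bit outputs are saturated via
; packssdw, the 19-bit outputs are clipped to 2^19-1 via the PMINSD macro, and
; shift == 14 + $source_width - $intermediate_nbits, matching the final psrad):
;
;     for (int i = 0; i < dstW; i++) {
;         int val = 0;
;         for (int j = 0; j < filterSize; j++)
;             val += (int)src[filterPos[i] + j] * filter[i * filterSize + j];
;         dst[i] = FFMIN(val >> shift, (1 << intermediate_nbits) - 1);
;     }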

; SCALE_FUNC source_width, intermediate_nbits, filtersize, filtersuffix, n_args, n_xmm
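; e.g. "SCALE_FUNC 8, 15, 4, 4, 6, 7" under "INIT_XMM sse2" ends up defining
; ff_hscale8to15_4_sse2 (the name prefix and cpu suffix are added by cglobal)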
%macro SCALE_FUNC 6
%ifnidn %3, X
cglobal hscale%1to%2_%4, %5, 7, %6, pos0, dst, w, src, filter, fltpos, pos1
%else
cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsize
%endif
%if ARCH_X86_64
    movsxd        wq, wd
%define mov32 movsxd
%else ; x86-32
%define mov32 mov
%endif ; x86-64
%if %2 == 19
%if cpuflag(sse4)
    mova          m2, [max_19bit_int]
%else ; ssse3/sse2
    mova          m2, [max_19bit_flt]
%endif ; sse2/ssse3/sse4
%endif ; %2 == 19
%if %1 == 16
    mova          m6, [minshort]
    mova          m7, [unicoeff]
%elif %1 == 8
    pxor          m3, m3
%endif ; %1 == 8/16

%if %1 == 8
%define movlh movd
%define movbh movh
%define srcmul 1
%else ; %1 == 9-16
%define movlh movq
%define movbh movu
%define srcmul 2
%endif ; %1 == 8/9-16

%ifnidn %3, X

    ; setup loop
%if %3 == 8
    shl           wq, 1                         ; this allows *16 (i.e. now *8) in lea instructions for the 8-tap filter
%define wshr 1
%else ; %3 == 4
%define wshr 0
%endif ; %3 == 8
    lea      filterq, [filterq+wq*8]
%if %2 == 15
    lea         dstq, [dstq+wq*(2>>wshr)]
%else ; %2 == 19
    lea         dstq, [dstq+wq*(4>>wshr)]
%endif ; %2 == 15/19
    lea      fltposq, [fltposq+wq*(4>>wshr)]
    neg           wq
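    ; note that w has been negated and dst/filter/fltpos advanced to their ends
    ; above, so wq serves both as a (negative) element index and as the loop
    ; counter for the "jl .loop" at the bottom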

.loop:
%if %3 == 4 ; filterSize == 4 scaling
    ; load 2x4 or 4x4 source pixels into m0/m1
    mov32      pos0q, dword [fltposq+wq*4+ 0]   ; filterPos[0]
    mov32      pos1q, dword [fltposq+wq*4+ 4]   ; filterPos[1]
    movlh         m0, [srcq+pos0q*srcmul]       ; src[filterPos[0] + {0,1,2,3}]
%if mmsize == 8
    movlh         m1, [srcq+pos1q*srcmul]       ; src[filterPos[1] + {0,1,2,3}]
%else ; mmsize == 16
%if %1 > 8
    movhps        m0, [srcq+pos1q*srcmul]       ; src[filterPos[1] + {0,1,2,3}]
%else ; %1 == 8
    movd          m4, [srcq+pos1q*srcmul]       ; src[filterPos[1] + {0,1,2,3}]
%endif
    mov32      pos0q, dword [fltposq+wq*4+ 8]   ; filterPos[2]
    mov32      pos1q, dword [fltposq+wq*4+12]   ; filterPos[3]
    movlh         m1, [srcq+pos0q*srcmul]       ; src[filterPos[2] + {0,1,2,3}]
%if %1 > 8
    movhps        m1, [srcq+pos1q*srcmul]       ; src[filterPos[3] + {0,1,2,3}]
%else ; %1 == 8
    movd          m5, [srcq+pos1q*srcmul]       ; src[filterPos[3] + {0,1,2,3}]
    punpckldq     m0, m4
    punpckldq     m1, m5
%endif ; %1 == 8
%endif ; mmsize == 8/16
%if %1 == 8
    punpcklbw     m0, m3                        ; byte -> word
    punpcklbw     m1, m3                        ; byte -> word
%endif ; %1 == 8

    ; multiply with filter coefficients
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
             ; add back 0x8000 * sum(coeffs) after the horizontal add
    psubw         m0, m6
    psubw         m1, m6
%endif ; %1 == 16
    pmaddwd       m0, [filterq+wq*8+mmsize*0]   ; *= filter[{0,1,..,6,7}]
    pmaddwd       m1, [filterq+wq*8+mmsize*1]   ; *= filter[{8,9,..,14,15}]

    ; add up horizontally (4 srcpix * 4 coefficients -> 1 dstpix)
%if notcpuflag(ssse3) ; sse2
    mova          m4, m0
    shufps        m0, m1, 10001000b
    shufps        m4, m1, 11011101b
    paddd         m0, m4
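    ; m0 now holds the four per-output-pixel sums {dstpx0, dstpx1, dstpx2,
    ; dstpx3}, the same layout as the phaddd result below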
%else ; ssse3/sse4
    phaddd        m0, m1                        ; filter[{ 0, 1, 2, 3}]*src[filterPos[0]+{0,1,2,3}],
                                                ; filter[{ 4, 5, 6, 7}]*src[filterPos[1]+{0,1,2,3}],
                                                ; filter[{ 8, 9,10,11}]*src[filterPos[2]+{0,1,2,3}],
                                                ; filter[{12,13,14,15}]*src[filterPos[3]+{0,1,2,3}]
%endif ; sse2/ssse3/sse4
%else ; %3 == 8, i.e. filterSize == 8 scaling
    ; load 2x8 or 4x8 source pixels into m0, m1, m4 and m5
    mov32      pos0q, dword [fltposq+wq*2+0]    ; filterPos[0]
    mov32      pos1q, dword [fltposq+wq*2+4]    ; filterPos[1]
    movbh         m0, [srcq+ pos0q   *srcmul]   ; src[filterPos[0] + {0,1,2,3,4,5,6,7}]
%if mmsize == 8
    movbh         m1, [srcq+(pos0q+4)*srcmul]   ; src[filterPos[0] + {4,5,6,7}]
    movbh         m4, [srcq+ pos1q   *srcmul]   ; src[filterPos[1] + {0,1,2,3}]
    movbh         m5, [srcq+(pos1q+4)*srcmul]   ; src[filterPos[1] + {4,5,6,7}]
%else ; mmsize == 16
    movbh         m1, [srcq+ pos1q   *srcmul]   ; src[filterPos[1] + {0,1,2,3,4,5,6,7}]
    mov32      pos0q, dword [fltposq+wq*2+8]    ; filterPos[2]
    mov32      pos1q, dword [fltposq+wq*2+12]   ; filterPos[3]
    movbh         m4, [srcq+ pos0q   *srcmul]   ; src[filterPos[2] + {0,1,2,3,4,5,6,7}]
    movbh         m5, [srcq+ pos1q   *srcmul]   ; src[filterPos[3] + {0,1,2,3,4,5,6,7}]
%endif ; mmsize == 8/16
%if %1 == 8
    punpcklbw     m0, m3                        ; byte -> word
    punpcklbw     m1, m3                        ; byte -> word
    punpcklbw     m4, m3                        ; byte -> word
    punpcklbw     m5, m3                        ; byte -> word
%endif ; %1 == 8

    ; multiply
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
             ; add back 0x8000 * sum(coeffs) after the horizontal add
    psubw         m0, m6
    psubw         m1, m6
    psubw         m4, m6
    psubw         m5, m6
%endif ; %1 == 16
    pmaddwd       m0, [filterq+wq*8+mmsize*0]   ; *= filter[{0,1,..,6,7}]
    pmaddwd       m1, [filterq+wq*8+mmsize*1]   ; *= filter[{8,9,..,14,15}]
    pmaddwd       m4, [filterq+wq*8+mmsize*2]   ; *= filter[{16,17,..,22,23}]
    pmaddwd       m5, [filterq+wq*8+mmsize*3]   ; *= filter[{24,25,..,30,31}]

    ; add up horizontally (8 srcpix * 8 coefficients -> 1 dstpix)
%if notcpuflag(ssse3) ; sse2
%if %1 == 8
%define mex m6
%else
%define mex m3
%endif
    ; emulate horizontal add as transpose + vertical add
    mova         mex, m0
    punpckldq     m0, m1
    punpckhdq    mex, m1
    paddd         m0, mex
    mova          m1, m4
    punpckldq     m4, m5
    punpckhdq     m1, m5
    paddd         m4, m1
    mova          m1, m0
    punpcklqdq    m0, m4
    punpckhqdq    m1, m4
    paddd         m0, m1
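    ; m0 = {dstpx0, dstpx1, dstpx2, dstpx3} sums, matching the layout produced
    ; by the phaddd path below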
%else ; ssse3/sse4
    ; FIXME if we rearrange the filter in pairs of 4, we can
    ; load pixels likewise and use 2 x paddd + phaddd instead
    ; of 3 x phaddd here, faster on older cpus
    phaddd        m0, m1
    phaddd        m4, m5
    phaddd        m0, m4                        ; filter[{ 0, 1,..., 6, 7}]*src[filterPos[0]+{0,1,...,6,7}],
                                                ; filter[{ 8, 9,...,14,15}]*src[filterPos[1]+{0,1,...,6,7}],
                                                ; filter[{16,17,...,22,23}]*src[filterPos[2]+{0,1,...,6,7}],
                                                ; filter[{24,25,...,30,31}]*src[filterPos[3]+{0,1,...,6,7}]
%endif ; sse2/ssse3/sse4
%endif ; %3 == 4/8

%else ; %3 == X, i.e. any filterSize scaling

%ifidn %4, X4
%define dlt 4
%else ; %4 == X || %4 == X8
%define dlt 0
%endif ; %4 ==/!= X4
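; the X4 variants are meant for filter sizes that are a multiple of 4 but not
; of 8: with dlt == 4 the inner loop below only covers filterSize & ~4 source
; samples, and the remaining 4 samples of each output pixel are folded in
; separately after the loop (see the "%ifidn %4, X4" block); for X8
; (filterSize a multiple of 8) the inner loop covers everything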
%if ARCH_X86_64
%define srcq    r8
%define pos1q   r7
%define srcendq r9
    movsxd  fltsizeq, fltsized                  ; filterSize
    lea      srcendq, [srcmemq+(fltsizeq-dlt)*srcmul] ; &src[filterSize&~4]
%else ; x86-32
%define srcq    srcmemq
%define pos1q   dstq
%define srcendq r6m
    lea        pos0q, [srcmemq+(fltsizeq-dlt)*srcmul] ; &src[filterSize&~4]
    mov      srcendq, pos0q
%endif ; x86-32/64
    lea      fltposq, [fltposq+wq*4]
%if %2 == 15
    lea         dstq, [dstq+wq*2]
%else ; %2 == 19
    lea         dstq, [dstq+wq*4]
%endif ; %2 == 15/19
    movifnidn  dstmp, dstq
    neg           wq

.loop:
    mov32      pos0q, dword [fltposq+wq*4+0]    ; filterPos[0]
    mov32      pos1q, dword [fltposq+wq*4+4]    ; filterPos[1]
    ; FIXME maybe do 4px/iteration on x86-64 (x86-32 wouldn't have enough regs)?
    pxor          m4, m4
    pxor          m5, m5
    mov         srcq, srcmemmp

.innerloop:
    ; load 2x8 (sse) source pixels into m0/m1 -> m4/m5
    movbh         m0, [srcq+ pos0q     *srcmul] ; src[filterPos[0] + {0,1,2,3(,4,5,6,7)}]
    movbh         m1, [srcq+(pos1q+dlt)*srcmul] ; src[filterPos[1] + {0,1,2,3(,4,5,6,7)}]
%if %1 == 8
    punpcklbw     m0, m3
    punpcklbw     m1, m3
%endif ; %1 == 8

    ; multiply
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
             ; add back 0x8000 * sum(coeffs) after the horizontal add
    psubw         m0, m6
    psubw         m1, m6
%endif ; %1 == 16
    pmaddwd       m0, [filterq]                 ; filter[{0,1,2,3(,4,5,6,7)}]
    pmaddwd       m1, [filterq+(fltsizeq+dlt)*2]; filter[filtersize+{0,1,2,3(,4,5,6,7)}]
    paddd         m4, m0
    paddd         m5, m1
    add      filterq, mmsize
    add         srcq, srcmul*mmsize/2
    cmp         srcq, srcendq                   ; while (src += 8) < &src[filterSize]
    jl .innerloop

%ifidn %4, X4
    mov32      pos1q, dword [fltposq+wq*4+4]    ; filterPos[1]
    movlh         m0, [srcq+ pos0q     *srcmul] ; split last 4 srcpx of dstpx[0]
    sub        pos1q, fltsizeq                  ; and first 4 srcpx of dstpx[1]
%if %1 > 8
    movhps        m0, [srcq+(pos1q+dlt)*srcmul]
%else ; %1 == 8
    movd          m1, [srcq+(pos1q+dlt)*srcmul]
    punpckldq     m0, m1
%endif ; %1 == 8
%if %1 == 8
    punpcklbw     m0, m3
%endif ; %1 == 8
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
             ; add back 0x8000 * sum(coeffs) after the horizontal add
    psubw         m0, m6
%endif ; %1 == 16
    pmaddwd       m0, [filterq]
%endif ; %4 == X4

    lea      filterq, [filterq+(fltsizeq+dlt)*2]

%if notcpuflag(ssse3) ; sse2
    mova          m1, m4
    punpcklqdq    m4, m5
    punpckhqdq    m1, m5
    paddd         m4, m1
%else ; ssse3/sse4
    phaddd        m4, m5
%endif ; sse2/ssse3/sse4
%ifidn %4, X4
    paddd         m4, m0
%endif ; %4 == X4
%if notcpuflag(ssse3) ; sse2
    pshufd        m4, m4, 11011000b
    movhlps       m0, m4
    paddd         m0, m4
%else ; ssse3/sse4
    phaddd        m4, m4
    SWAP           0, 4
%endif ; sse2/ssse3/sse4
%endif ; %3 ==/!= X

%if %1 == 16 ; add 0x8000 * sum(coeffs), i.e. back from signed -> unsigned
    paddd         m0, m7
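    ; this works since sum((src-0x8000)*coeff) + 0x8000*sum(coeff) == sum(src*coeff);
    ; assuming the 14-bit filter coefficients sum to 1 << 14, the correction
    ; term is 0x8000 << 14 == 0x20000000, i.e. the unicoeff constant preloaded
    ; into m7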
%endif ; %1 == 16

    ; clip, store
    psrad         m0, 14 + %1 - %2
%ifidn %3, X
    movifnidn   dstq, dstmp
%endif ; %3 == X
%if %2 == 15
    packssdw      m0, m0
%ifnidn %3, X
    movh [dstq+wq*(2>>wshr)], m0
%else ; %3 == X
    movd [dstq+wq*2], m0
%endif ; %3 ==/!= X
%else ; %2 == 19
    PMINSD        m0, m2, m4
%ifnidn %3, X
    mova [dstq+wq*(4>>wshr)], m0
%else ; %3 == X
    movq [dstq+wq*4], m0
%endif ; %3 ==/!= X
%endif ; %2 == 15/19
%ifnidn %3, X
    add           wq, (mmsize<<wshr)/4          ; both 8tap and 4tap really only do 4 pixels
                                                ; per iteration; see the "shl wq, 1" above for why we do this
%else ; %3 == X
    add           wq, 2
%endif ; %3 ==/!= X
    jl .loop
    REP_RET
%endmacro

; SCALE_FUNCS source_width, intermediate_nbits, n_xmm
%macro SCALE_FUNCS 3
SCALE_FUNC %1, %2, 4, 4,  6, %3
SCALE_FUNC %1, %2, 8, 8,  6, %3
SCALE_FUNC %1, %2, X, X4, 7, %3
SCALE_FUNC %1, %2, X, X8, 7, %3
%endmacro

; SCALE_FUNCS2 8_xmm_args, 9to14_xmm_args, 16_xmm_args
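; the 15-bit output versions are only built up to SSSE3: the only SSE4
; instruction used in this file is pminsd, needed just for the 19-bit clip, so
; an SSE4 build would gain nothing for 15-bit output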
%macro SCALE_FUNCS2 3
%if notcpuflag(sse4)
SCALE_FUNCS  8, 15, %1
SCALE_FUNCS  9, 15, %2
SCALE_FUNCS 10, 15, %2
SCALE_FUNCS 12, 15, %2
SCALE_FUNCS 14, 15, %2
SCALE_FUNCS 16, 15, %3
%endif ; !sse4
SCALE_FUNCS  8, 19, %1
SCALE_FUNCS  9, 19, %2
SCALE_FUNCS 10, 19, %2
SCALE_FUNCS 12, 19, %2
SCALE_FUNCS 14, 19, %2
SCALE_FUNCS 16, 19, %3
%endmacro

INIT_XMM sse2
SCALE_FUNCS2 7, 6, 8
INIT_XMM ssse3
SCALE_FUNCS2 6, 6, 8
INIT_XMM sse4
SCALE_FUNCS2 6, 6, 8