• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; XVID MPEG-4 VIDEO CODEC
2;
3; Conversion from gcc syntax to x264asm syntax with modifications
4; by Christophe Gisquet <christophe.gisquet@gmail.com>
5;
6; ===========     SSE2 inverse discrete cosine transform     ===========
7;
8; Copyright(C) 2003 Pascal Massimino <skal@planet-d.net>
9;
10; Conversion to gcc syntax with modifications
11; by Alexander Strange <astrange@ithinksw.com>
12;
13; Originally from dct/x86_asm/fdct_sse2_skal.asm in Xvid.
14;
15; Vertical pass is an implementation of the scheme:
16;  Loeffler C., Ligtenberg A., and Moschytz G.S.:
17;  Practical Fast 1D DCT Algorithm with Eleven Multiplications,
18;  Proc. ICASSP 1989, 988-991.
19;
20; Horizontal pass is a double 4x4 vector/matrix multiplication,
21; (see also Intel's Application Note 922:
22;  http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
23;  Copyright (C) 1999 Intel Corporation)
24;
25; More details at http://skal.planet-d.net/coding/dct.html
26;
27; =======     MMX and XMM forward discrete cosine transform     =======
28;
29; Copyright(C) 2001 Peter Ross <pross@xvid.org>
30;
31; Originally provided by Intel at AP-922
32; http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
33; (See more app notes at http://developer.intel.com/vtune/cbts/strmsimd/appnotes.htm)
34; but in a limited edition.
35; New macro implements a column part for precise iDCT
36; The routine precision now satisfies IEEE standard 1180-1990.
37;
38; Copyright(C) 2000-2001 Peter Gubanov <peter@elecard.net.ru>
39; Rounding trick Copyright(C) 2000 Michel Lespinasse <walken@zoy.org>
40;
41; http://www.elecard.com/peter/idct.html
42; http://www.linuxvideo.org/mpeg2dec/
43;
44; These examples contain code fragments for first stage iDCT 8x8
45; (for rows) and first stage DCT 8x8 (for columns)
46;
47; conversion to gcc syntax by Michael Niedermayer
48;
49; ======================================================================
50;
51; This file is part of FFmpeg.
52;
53; FFmpeg is free software; you can redistribute it and/or
54; modify it under the terms of the GNU Lesser General Public
55; License as published by the Free Software Foundation; either
56; version 2.1 of the License, or (at your option) any later version.
57;
58; FFmpeg is distributed in the hope that it will be useful,
59; but WITHOUT ANY WARRANTY; without even the implied warranty of
60; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
61; Lesser General Public License for more details.
62;
63; You should have received a copy of the GNU Lesser General Public License
64; along with FFmpeg; if not, write to the Free Software Foundation,
65; Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
66
67%include "libavutil/x86/x86util.asm"
68
; Read-only constants for the column pass. Each label below is eight copies
; of one 16-bit word (16 bytes); the column-pass macros load them whole with
; movdqa and use them as pmulhw multipliers.
; tan3 (43790) is above 32767, so as a signed word pmulhw sees 43790 - 65536;
; the column pass compensates by adding the multiplicand back after pmulhw.
; NOTE(review): the values match round(tan(k*pi/16) * 65536) for k = 1..3 and
; round(32768 / sqrt(2)) for sqrt2; this is inferred arithmetically -- confirm.
69SECTION_RODATA
70; Similar to tg_1_16 in MMX code
71tan1:   times 8 dw 13036
72tan2:   times 8 dw 27146
73tan3:   times 8 dw 43790
74sqrt2:  times 8 dw 23170
75
76; SSE2 tables
; Each iTabN is four rows of eight 16-bit words (64 bytes). iMTX_MULT feeds
; the rows at byte offsets 0, 16, 32 and 48 to pmaddwd as multipliers.
; IDCT_SSE2 pairs the tables with block rows as follows:
;   iTab1: rows 0 and 4    iTab2: rows 1 and 7
;   iTab3: rows 2 and 6    iTab4: rows 3 and 5
77iTab1:  dw 0x4000, 0x539f, 0xc000, 0xac61, 0x4000, 0xdd5d, 0x4000, 0xdd5d
78        dw 0x4000, 0x22a3, 0x4000, 0x22a3, 0xc000, 0x539f, 0x4000, 0xac61
79        dw 0x3249, 0x11a8, 0x4b42, 0xee58, 0x11a8, 0x4b42, 0x11a8, 0xcdb7
80        dw 0x58c5, 0x4b42, 0xa73b, 0xcdb7, 0x3249, 0xa73b, 0x4b42, 0xa73b
81iTab2:  dw 0x58c5, 0x73fc, 0xa73b, 0x8c04, 0x58c5, 0xcff5, 0x58c5, 0xcff5
82        dw 0x58c5, 0x300b, 0x58c5, 0x300b, 0xa73b, 0x73fc, 0x58c5, 0x8c04
83        dw 0x45bf, 0x187e, 0x6862, 0xe782, 0x187e, 0x6862, 0x187e, 0xba41
84        dw 0x7b21, 0x6862, 0x84df, 0xba41, 0x45bf, 0x84df, 0x6862, 0x84df
85iTab3:  dw 0x539f, 0x6d41, 0xac61, 0x92bf, 0x539f, 0xd2bf, 0x539f, 0xd2bf
86        dw 0x539f, 0x2d41, 0x539f, 0x2d41, 0xac61, 0x6d41, 0x539f, 0x92bf
87        dw 0x41b3, 0x1712, 0x6254, 0xe8ee, 0x1712, 0x6254, 0x1712, 0xbe4d
88        dw 0x73fc, 0x6254, 0x8c04, 0xbe4d, 0x41b3, 0x8c04, 0x6254, 0x8c04
89iTab4:  dw 0x4b42, 0x6254, 0xb4be, 0x9dac, 0x4b42, 0xd746, 0x4b42, 0xd746
90        dw 0x4b42, 0x28ba, 0x4b42, 0x28ba, 0xb4be, 0x6254, 0x4b42, 0x9dac
91        dw 0x3b21, 0x14c3, 0x587e, 0xeb3d, 0x14c3, 0x587e, 0x14c3, 0xc4df
92        dw 0x6862, 0x587e, 0x979e, 0xc4df, 0x3b21, 0x979e, 0x587e, 0x979e
93
94; Similar to rounder_0 in MMX code
95; 4 first similar, then: 4*8->6*16  5*8->4*16  6/7*8->5*16
; Layout: consecutive 16-byte entries, each one dword value repeated four
; times. iMTX_MULT adds the entry at [walkenIdctRounders + byte offset] to
; its first accumulator (xmm0) when a fifth macro argument is given.
; IDCT_SSE2 uses offsets 0*16..3*16 for rows 0..3, 4*16 for row 5 and 5*16
; for rows 6 and 7; row 4 is transformed without a rounder. The final two
; zero dwords are not addressed by any offset used in the code visible here.
96walkenIdctRounders: times 4 dd 65536
97                    times 4 dd  3597
98                    times 4 dd  2260
99                    times 4 dd  1203
100                    times 4 dd   120
101                    times 4 dd   512
102                    times 2 dd     0
103
; Eight bytes of 127, loaded into mm0 by IDCT_SSE2 and added with unsigned
; saturation (paddusb) in TEST_ONE_ROW and TEST_TWO_ROWS: any nonzero byte
; ends up with its top bit set, a zero byte does not.
104pb_127: times 8 db 127
105
106SECTION .text
107
108; Temporary storage before the column pass
; The odd rows are kept in these fixed XMM registers on both x86-32 and
; x86-64 between the row pass and the column pass. The column-pass macros
; refer to them by register name (xmm4..xmm7) rather than by ROWn.
109%define ROW1 xmm6
110%define ROW3 xmm4
111%define ROW5 xmm5
112%define ROW7 xmm7
113
; CLEAR_ODD reg
; Zero the given XMM register. The TEST_* macros invoke it so that a row
; register holds zeros unless the row pass later overwrites it.
114%macro CLEAR_ODD 1
115    pxor      %1, %1
116%endmacro
; PUT_ODD reg
; Copy the row-pass result from xmm2 into reg with its four high words in
; reversed order (pshufhw immediate 0x1B); the four low words are copied as is.
117%macro PUT_ODD 1
118    pshufhw   %1, xmm2, 0x1B
119%endmacro
120
; MOV32 src, dst
; x86-32 only: emit movdqa dst, src. Note that the macro arguments are given
; source first, the opposite of the instruction operand order.
; Expands to nothing on x86-64, where ROWn and REGn name the same register.
121%macro MOV32 2
122%if ARCH_X86_32
123    movdqa    %2, %1
124%endif
125%endmacro
126
; CLEAR_EVEN reg
; x86-64: zero the XMM register assigned to an even row.
; x86-32: expands to nothing; there ROW0/2/4/6 are memory operands inside the
; block rather than registers.
127%macro CLEAR_EVEN 1
128%if ARCH_X86_64
129    CLEAR_ODD %1
130%endif
131%endmacro
132
; PUT_EVEN dst
; Save the row-pass result (xmm2) for an even row, high four words reversed.
; x86-64: dst is a register, so this is the same as PUT_ODD.
; x86-32: dst is a memory operand, so xmm2 is shuffled in place and then
;         stored with movdqa (xmm2 itself is modified on this path).
133%macro PUT_EVEN 1
134%if ARCH_X86_64
135    PUT_ODD   %1
136%else
137    pshufhw xmm2, xmm2, 0x1B
138    movdqa    %1, xmm2
139%endif
140%endmacro
141
; Per-architecture operand assignment for the even rows and column-pass
; temporaries.
; x86-64: each even row has its own register (ROWn and REGn are identical),
;         and XMMS (scratch), TAN3 and TAN1 each get a register of their own.
; x86-32: ROWn are memory operands inside the block; REG0 and REG2 share xmm4,
;         REG4 and REG6 share xmm6, TAN3 is xmm0, and XMMS and TAN1 share xmm2
;         (which is why the column pass spills TAN1 around its use of XMMS).
142%if ARCH_X86_64
143%define ROW0  xmm8
144%define REG0  ROW0
145%define ROW2  xmm9
146%define REG2  ROW2
147%define ROW4  xmm10
148%define REG4  ROW4
149%define ROW6  xmm11
150%define REG6  ROW6
151%define XMMS  xmm12
152%define SREG2 REG2
153%define TAN3  xmm13
154%define TAN1  xmm14
155%else
156%define ROW0  [BLOCK + 0*16]
157%define REG0  xmm4
158%define ROW2  [BLOCK + 2*16]
159%define REG2  xmm4
160%define ROW4  [BLOCK + 4*16]
161%define REG4  xmm6
162%define ROW6  [BLOCK + 6*16]
163%define REG6  xmm6
164%define XMMS  xmm2
165%define SREG2 xmm7
166%define TAN3  xmm0
167%define TAN1  xmm2
168%endif
169
; JZ reg, label
; Jump to the local label .label when reg is zero. Modifies the flags.
170%macro JZ  2
171    test      %1, %1
172    jz       .%2
173%endmacro
174
; JNZ reg, label
; Jump to the local label .label when reg is nonzero. Modifies the flags.
175%macro JNZ  2
176    test      %1, %1
177    jnz      .%2
178%endmacro
179
; TEST_ONE_ROW src, reg, clear, arg
; Invoke the macro named by clear on arg, then set reg to a byte mask that is
; nonzero if and only if any of the 16 bytes at [src] is nonzero.
; Expects mm0 to hold 127 in every byte (IDCT_SSE2 loads pb_127 into it).
; Clobbers mm1.
180%macro TEST_ONE_ROW 4 ; src, reg, clear, arg
181    %3        %4
182    movq     mm1, [%1]            ; low 8 bytes of the row
183    por      mm1, [%1 + 8]        ; OR in the high 8 bytes
184    paddusb  mm1, mm0             ; top bit set in every nonzero byte
185    pmovmskb  %2, mm1             ; gather the top bits into reg
186%endmacro
187
; TEST_TWO_ROWS row1, row2, reg1, reg2, clear1, arg1, clear2, arg2
; Same test as TEST_ONE_ROW, done for two rows at once: invoke clear1 on arg1
; and clear2 on arg2, then set reg1 / reg2 to a mask that is nonzero if and
; only if any of the 16 bytes at [row1] / [row2] is nonzero.
; Expects mm0 to hold 127 in every byte. Clobbers mm1 and mm2.
188;row1, row2, reg1, reg2, clear1, arg1, clear2, arg2
189%macro  TEST_TWO_ROWS  8
190    %5         %6
191    %7         %8
192    movq      mm1, [%1 + 0]
193    por       mm1, [%1 + 8]
194    movq      mm2, [%2 + 0]
195    por       mm2, [%2 + 8]
196    paddusb   mm1, mm0
197    paddusb   mm2, mm0
198    pmovmskb   %3, mm1
199    pmovmskb   %4, mm2
200%endmacro
201
202; IDCT pass on rows.
; iMTX_MULT src, table, put, arg [, rounder]
;   src      address expression of one 16-byte row of 16-bit coefficients
;            (loaded with movdqa, so it must be 16-byte aligned)
;   table    64-byte multiplier table (iTab1..iTab4)
;   put,arg  macro invoked as: put arg, once the result is in xmm2
;   rounder  optional byte offset into walkenIdctRounders; when omitted no
;            rounding constant is added
; Result: eight 16-bit words in xmm2 (packed with signed saturation), the four
; sums in the low words and the four differences in the high words.
; Clobbers xmm0..xmm3.
203%macro iMTX_MULT   4-5 ; src, table, put, arg, rounder
204    movdqa       xmm3, [%1]
205    movdqa       xmm0, xmm3
206    pshufd       xmm1, xmm3, 0x11 ; 4602
207    punpcklqdq   xmm0, xmm0       ; 0246
208    pmaddwd      xmm0, [%2]
209    pmaddwd      xmm1, [%2+16]
210    pshufd       xmm2, xmm3, 0xBB ; 5713
211    punpckhqdq   xmm3, xmm3       ; 1357
212    pmaddwd      xmm2, [%2+32]
213    pmaddwd      xmm3, [%2+48]
214    paddd        xmm0, xmm1       ; first accumulator (from the low half)
215    paddd        xmm2, xmm3       ; second accumulator (from the high half)
216%if %0 == 5
217    paddd        xmm0, [walkenIdctRounders+%5]
218%endif
219    movdqa       xmm3, xmm2
220    paddd        xmm2, xmm0       ; sums
221    psubd        xmm0, xmm3       ; differences
222    psrad        xmm2, 11
223    psrad        xmm0, 11
224    packssdw     xmm2, xmm0       ; low words = sums, high words = differences
225    %3           %4
226%endmacro
227
; iLLM_HEAD
; Load the tan3 and tan1 constant vectors into TAN3 and TAN1, which
; iLLM_PASS and iLLM_PASS_SPARSE expect to be set on entry.
228%macro iLLM_HEAD 0
229    movdqa   TAN3, [tan3]
230    movdqa   TAN1, [tan1]
231%endmacro
232
; FIRST_HALF dct, type
; Finish output rows 1, 2, 5 and 6 of the column pass.
; On entry TAN3 = row 1, xmm3 = row 2, REG0 = row 5, xmm5 = row 6; each is
; shifted right arithmetically by 6 here.
;   type 0 (normal): store the four rows into the block at dct.
;   type 1 (put):    pack to unsigned bytes and write rows 1 and 2 to the
;                    destination; rows 6 (low half) and 5 (high half) stay
;                    packed in xmm5 and are written by SECOND_HALF.
;   type 2 (add):    x86-64 leaves the rows in registers for SECOND_HALF;
;                    x86-32 stores them into the block for SECOND_HALF.
; For types 1 and 2 this also defines DEST, loads dest and stride from the
; argument slots on x86-32, and sets r3q to 3 * stride.
233%macro FIRST_HALF 2  ; %1=dct  %2=type(normal,put,add)
234    psraw    xmm5, 6
235    psraw    REG0, 6
236    psraw    TAN3, 6
237    psraw    xmm3, 6
238    ; dct coeffs must still be written for AC prediction
239%if %2 == 0
240    movdqa   [%1+1*16], TAN3
241    movdqa   [%1+2*16], xmm3
242    movdqa   [%1+5*16], REG0
243    movdqa   [%1+6*16], xmm5
244%else
245    ; Must now load args as gprs are no longer used for masks
246    ; DEST is set to where address of dest was loaded
247    %if ARCH_X86_32
248        %if %2 == 2 ; Not enough xmms, store
249    movdqa   [%1+1*16], TAN3
250    movdqa   [%1+2*16], xmm3
251    movdqa   [%1+5*16], REG0
252    movdqa   [%1+6*16], xmm5
253        %endif
254    %xdefine DEST r2q ; BLOCK is r0, stride r1
255    movifnidn DEST, destm
256    movifnidn strideq, stridem
257    %else
258    %xdefine DEST r0q
259    %endif
260    lea      r3q, [3*strideq]
261    %if %2 == 1
262    packuswb TAN3, xmm3             ; low half = row 1, high half = row 2
263    packuswb xmm5, REG0             ; low half = row 6, high half = row 5
264    movq     [DEST + strideq], TAN3
265    movhps   [DEST + 2*strideq], TAN3
266    ; REG0 and TAN3 are now available (and likely used in second half)
267    %endif
268%endif
269%endmacro
270
; SECOND_HALF dct, type, r0, r7, r3, r4
; Finish output rows 0, 7, 3 and 4 of the column pass (passed, in that order,
; as the third to sixth macro arguments) and complete whatever FIRST_HALF
; left pending. Each of the four registers is shifted right by 6 first.
;   type 0 (normal): store rows 0, 3, 4, 7 into the block at dct.
;   type 1 (put):    pack to unsigned bytes and write rows 0, 3, 4, 7, then
;                    rows 6 and 5 from xmm5 (packed by FIRST_HALF). DEST is
;                    advanced by 4 * stride in between.
;   type 2 (add):    for each of the 8 rows, widen 8 destination bytes to
;                    words, add the row with signed saturation (paddsw), pack
;                    back with unsigned saturation and store. Rows 1, 2, 5, 6
;                    come from the block on x86-32 and from TAN3, xmm3, REG0,
;                    xmm5 on x86-64.
; Relies on DEST and r3q (3 * stride) as set up by FIRST_HALF.
271%macro SECOND_HALF 6 ; %1=dct  %2=type(normal,put,add) 3-6: xmms
272    psraw    %3, 6
273    psraw    %4, 6
274    psraw    %5, 6
275    psraw    %6, 6
276    ; dct coeffs must still be written for AC prediction
277%if %2 == 0
278    movdqa   [%1+0*16], %3
279    movdqa   [%1+3*16], %5
280    movdqa   [%1+4*16], %6
281    movdqa   [%1+7*16], %4
282%elif %2 == 1
283    packuswb %3, %5
284    packuswb %6, %4
285    ; address of dest may have been loaded
286    movq     [DEST], %3
287    movhps   [DEST + r3q], %3
288    lea      DEST, [DEST + 4*strideq]
289    movq     [DEST], %6
290    movhps   [DEST + r3q], %6
291    ; and now write remainder of first half
292    movq     [DEST + 2*strideq], xmm5
293    movhps   [DEST + strideq], xmm5
294%elif %2 == 2
295    pxor        xmm0, xmm0
296    %if ARCH_X86_32
297    ; free: m3 REG0=m4 m5
298    ; input: m1, m7, m2, m6
    ; destination rows 0..3; the row 0 register is reused as scratch once
    ; row 0 has been added
299    movq        xmm3, [DEST+0*strideq]
300    movq        xmm4, [DEST+1*strideq]
301    punpcklbw   xmm3, xmm0
302    punpcklbw   xmm4, xmm0
303    paddsw      xmm3, %3
304    paddsw      xmm4, [%1 + 1*16]
305    movq          %3, [DEST+2*strideq]
306    movq        xmm5, [DEST+      r3q]
307    punpcklbw     %3, xmm0
308    punpcklbw   xmm5, xmm0
309    paddsw        %3, [%1 + 2*16]
310    paddsw      xmm5, %5
311    packuswb    xmm3, xmm4
312    packuswb      %3, xmm5
313    movq    [DEST+0*strideq], xmm3
314    movhps  [DEST+1*strideq], xmm3
315    movq    [DEST+2*strideq], %3
316    movhps  [DEST+      r3q], %3
    ; destination rows 4..7
317    lea         DEST, [DEST+4*strideq]
318    movq        xmm3, [DEST+0*strideq]
319    movq        xmm4, [DEST+1*strideq]
320    movq          %3, [DEST+2*strideq]
321    movq        xmm5, [DEST+      r3q]
322    punpcklbw   xmm3, xmm0
323    punpcklbw   xmm4, xmm0
324    punpcklbw     %3, xmm0
325    punpcklbw   xmm5, xmm0
326    paddsw      xmm3, %6
327    paddsw      xmm4, [%1 + 5*16]
328    paddsw        %3, [%1 + 6*16]
329    paddsw      xmm5, %4
330    packuswb    xmm3, xmm4
331    packuswb      %3, xmm5
332    movq    [DEST+0*strideq], xmm3
333    movhps  [DEST+1*strideq], xmm3
334    movq    [DEST+2*strideq], %3
335    movhps  [DEST+      r3q], %3
336    %else
337    ; l1:TAN3=m13  l2:m3  l5:REG0=m8 l6=m5
338    ; input: m1, m7/SREG2=m9, TAN1=m14, REG4=m10
    ; destination rows 0..3; xmm2, xmm4, xmm12 and xmm11 are scratch here
339    movq        xmm2, [DEST+0*strideq]
340    movq        xmm4, [DEST+1*strideq]
341    movq       xmm12, [DEST+2*strideq]
342    movq       xmm11, [DEST+      r3q]
343    punpcklbw   xmm2, xmm0
344    punpcklbw   xmm4, xmm0
345    punpcklbw  xmm12, xmm0
346    punpcklbw  xmm11, xmm0
347    paddsw      xmm2, %3
348    paddsw      xmm4, TAN3
349    paddsw     xmm12, xmm3
350    paddsw     xmm11, %5
351    packuswb    xmm2, xmm4
352    packuswb   xmm12, xmm11
353    movq    [DEST+0*strideq], xmm2
354    movhps  [DEST+1*strideq], xmm2
355    movq    [DEST+2*strideq], xmm12
356    movhps  [DEST+      r3q], xmm12
    ; destination rows 4..7
357    lea         DEST, [DEST+4*strideq]
358    movq        xmm2, [DEST+0*strideq]
359    movq        xmm4, [DEST+1*strideq]
360    movq       xmm12, [DEST+2*strideq]
361    movq       xmm11, [DEST+      r3q]
362    punpcklbw   xmm2, xmm0
363    punpcklbw   xmm4, xmm0
364    punpcklbw  xmm12, xmm0
365    punpcklbw  xmm11, xmm0
366    paddsw      xmm2, %6
367    paddsw      xmm4, REG0
368    paddsw     xmm12, xmm5
369    paddsw     xmm11, %4
370    packuswb    xmm2, xmm4
371    packuswb   xmm12, xmm11
372    movq    [DEST+0*strideq], xmm2
373    movhps  [DEST+1*strideq], xmm2
374    movq    [DEST+2*strideq], xmm12
375    movhps  [DEST+      r3q], xmm12
376    %endif
377%endif
378%endmacro
379
380
381; IDCT pass on columns.
; iLLM_PASS dct, type
; Inputs: odd rows in xmm6 (row 1), xmm4 (row 3), xmm5 (row 5), xmm7 (row 7);
; even rows in ROW0/2/4/6 (registers on x86-64, block memory on x86-32);
; TAN3 and TAN1 preloaded by iLLM_HEAD.
; Output goes through FIRST_HALF (rows 1, 2, 5, 6) and SECOND_HALF
; (rows 0, 3, 4, 7); type is 0=normal, 1=put, 2=add.
; On x86-32, [BLOCK] (row 0 of the block) is used as a spill slot for TAN1
; after row 0 has been loaded into a register.
382%macro iLLM_PASS  2  ; %1=dct  %2=type(normal,put,add)
    ; odd part: rows 1, 3, 5, 7
383    movdqa   xmm1, TAN3
384    movdqa   xmm3, TAN1
385    pmulhw   TAN3, xmm4
386    pmulhw   xmm1, xmm5
387    paddsw   TAN3, xmm4             ; add row 3 back (tan3 is stored minus 1.0)
388    paddsw   xmm1, xmm5             ; add row 5 back, same reason
389    psubsw   TAN3, xmm5
390    paddsw   xmm1, xmm4
391    pmulhw   xmm3, xmm7
392    pmulhw   TAN1, xmm6
393    paddsw   xmm3, xmm6
394    psubsw   TAN1, xmm7
395    movdqa   xmm7, xmm3
396    movdqa   xmm6, TAN1
397    psubsw   xmm3, xmm1
398    psubsw   TAN1, TAN3
399    paddsw   xmm1, xmm7
400    paddsw   TAN3, xmm6
401    movdqa   xmm6, xmm3
402    psubsw   xmm3, TAN3
403    paddsw   TAN3, xmm6
    ; multiply both terms by the sqrt2 constant, then double them
404    movdqa   xmm4, [sqrt2]
405    pmulhw   xmm3, xmm4
406    pmulhw   TAN3, xmm4
407    paddsw   TAN3, TAN3
408    paddsw   xmm3, xmm3
    ; even part: rows 0, 2, 4, 6 (the MOV32 lines load them on x86-32 only)
409    movdqa   xmm7, [tan2]
410    MOV32    ROW2, REG2
411    MOV32    ROW6, REG6
412    movdqa   xmm5, xmm7
413    pmulhw   xmm7, REG6
414    pmulhw   xmm5, REG2
415    paddsw   xmm7, REG2
416    psubsw   xmm5, REG6
417    MOV32    ROW0, REG0
418    MOV32    ROW4, REG4
    ; x86-32: spill TAN1 to [BLOCK], since XMMS shares its register (xmm2)
419    MOV32    TAN1, [BLOCK]
420    movdqa   XMMS, REG0
421    psubsw   REG0, REG4
422    paddsw   REG4, XMMS
423    movdqa   XMMS, REG4
424    psubsw   REG4, xmm7
425    paddsw   xmm7, XMMS
426    movdqa   XMMS, REG0
427    psubsw   REG0, xmm5
428    paddsw   xmm5, XMMS
429    movdqa   XMMS, xmm5
430    psubsw   xmm5, TAN3
431    paddsw   TAN3, XMMS
432    movdqa   XMMS, REG0
433    psubsw   REG0, xmm3
434    paddsw   xmm3, XMMS
    ; x86-32: reload TAN1 from its spill slot
435    MOV32    [BLOCK], TAN1
436
437    FIRST_HALF %1, %2
438
    ; final sums and differences for rows 0, 7, 3 and 4
439    movdqa   xmm0, xmm7
440    movdqa   xmm4, REG4
441    psubsw   xmm7, xmm1
442    psubsw   REG4, TAN1
443    paddsw   xmm1, xmm0
444    paddsw   TAN1, xmm4
445
446    SECOND_HALF %1, %2, xmm1, xmm7, TAN1, REG4
447%endmacro
448
449; IDCT pass on columns, assuming rows 4-7 are zero
; iLLM_PASS_SPARSE dct, type
; Inputs: xmm6 (row 1), xmm4 (row 3), ROW0 and ROW2 (registers on x86-64,
; block memory on x86-32); TAN3 and TAN1 preloaded by iLLM_HEAD.
; Rows 4..7 are never read. Output goes through FIRST_HALF and SECOND_HALF
; exactly as in iLLM_PASS; type is 0=normal, 1=put, 2=add.
; On x86-32, [BLOCK] is used as a spill slot for TAN1 after row 0 has been
; loaded into a register.
450%macro iLLM_PASS_SPARSE   2 ; %1=dct   %2=type(normal,put,add)
    ; odd part: only rows 1 and 3 contribute
451    pmulhw   TAN3, xmm4
452    paddsw   TAN3, xmm4             ; add row 3 back (tan3 is stored minus 1.0)
453    movdqa   xmm3, xmm6
454    pmulhw   TAN1, xmm6
455    movdqa   xmm1, xmm4
456    psubsw   xmm3, xmm1
457    paddsw   xmm1, xmm6
458    movdqa   xmm6, TAN1
459    psubsw   TAN1, TAN3
460    paddsw   TAN3, xmm6
461    movdqa   xmm6, xmm3
462    psubsw   xmm3, TAN3
463    paddsw   TAN3, xmm6
    ; multiply both terms by the sqrt2 constant, then double them
464    movdqa   xmm4, [sqrt2]
465    pmulhw   xmm3, xmm4
466    pmulhw   TAN3, xmm4
467    paddsw   TAN3, TAN3
468    paddsw   xmm3, xmm3
    ; even part: only rows 0 and 2 contribute (MOV32 loads them on x86-32)
469    movdqa   xmm5, [tan2]
470    MOV32    ROW2, SREG2
471    pmulhw   xmm5, SREG2
472    MOV32    ROW0, REG0
473    movdqa   xmm6, REG0
474    psubsw   xmm6, SREG2            ; row 0 - row 2
475    paddsw  SREG2, REG0             ; row 0 + row 2
    ; x86-32: spill TAN1 to [BLOCK], since XMMS shares its register (xmm2)
476    MOV32    TAN1, [BLOCK]
477    movdqa   XMMS, REG0
478    psubsw   REG0, xmm5
479    paddsw   xmm5, XMMS
480    movdqa   XMMS, xmm5
481    psubsw   xmm5, TAN3
482    paddsw   TAN3, XMMS
483    movdqa   XMMS, REG0
484    psubsw   REG0, xmm3
485    paddsw   xmm3, XMMS
    ; x86-32: reload TAN1 from its spill slot
486    MOV32    [BLOCK], TAN1
487
488    FIRST_HALF %1, %2
489
    ; final sums and differences for rows 0, 7, 3 and 4
490    movdqa   xmm0, SREG2
491    movdqa   xmm4, xmm6
492    psubsw  SREG2, xmm1
493    psubsw   xmm6, TAN1
494    paddsw   xmm1, xmm0
495    paddsw   TAN1, xmm4
496
497    SECOND_HALF %1, %2, xmm1, SREG2, TAN1, xmm6
498%endmacro
499
; IDCT_SSE2 type
; Emit one 8x8 inverse DCT function. type selects the output stage:
;   0 = normal: xvid_idct(block), result written back into the block
;   1 = put:    xvid_idct_put(dest, stride, block), result written to dest
;   2 = add:    xvid_idct_add(dest, stride, block), result added to dest
; Structure: a row pass (iMTX_MULT) over each row that is not all zero,
; followed by one column pass. Rows 0..2 are always transformed; rows 3..7
; are first tested with TEST_ONE_ROW / TEST_TWO_ROWS and skipped when zero.
; If rows 4..7 are all zero the cheaper iLLM_PASS_SPARSE is used.
; GPR0..GPR3 hold the row masks. For put/add on x86-64 they are r3..r6 so
; that r0..r2 (dest, stride, block in the cglobal line) stay intact; in all
; other cases they are r1..r4.
; NOTE(review): mm0..mm2 are written here and no emms is visible before RET;
; presumably the caller is expected to reset the MMX state -- confirm.
500%macro IDCT_SSE2 1 ; 0=normal  1=put  2=add
501%if %1 == 0 || ARCH_X86_32
502    %define GPR0  r1d
503    %define GPR1  r2d
504    %define GPR2  r3d
505    %define GPR3  r4d
506    %define NUM_GPRS 5
507%else
508    %define GPR0  r3d
509    %define GPR1  r4d
510    %define GPR2  r5d
511    %define GPR3  r6d
512    %define NUM_GPRS 7
513%endif
514%if %1 == 0
515cglobal xvid_idct, 1, NUM_GPRS, 8+7*ARCH_X86_64, block
516%xdefine BLOCK blockq
517%else
518    %if %1 == 1
519cglobal xvid_idct_put, 0, NUM_GPRS, 8+7*ARCH_X86_64, dest, stride, block
520    %else
521cglobal xvid_idct_add, 0, NUM_GPRS, 8+7*ARCH_X86_64, dest, stride, block
522    %endif
523    %if ARCH_X86_64
524    %xdefine BLOCK blockq
525    %else
    ; x86-32: only the block pointer is loaded now; dest and stride are
    ; loaded later by FIRST_HALF, once the mask registers are free
526    mov    r0q, blockm
527    %xdefine BLOCK r0q
528    %endif
529%endif
530    movq           mm0, [pb_127]    ; constant for the zero-row tests
531    iMTX_MULT      BLOCK + 0*16, iTab1, PUT_EVEN, ROW0, 0*16
532    iMTX_MULT      BLOCK + 1*16, iTab2, PUT_ODD, ROW1,  1*16
533    iMTX_MULT      BLOCK + 2*16, iTab3, PUT_EVEN, ROW2, 2*16
534
535    TEST_TWO_ROWS  BLOCK + 3*16, BLOCK + 4*16, GPR0, GPR1, CLEAR_ODD, ROW3, CLEAR_EVEN, ROW4 ; GPR0 = row 3 mask, GPR1 = row 4 mask
536    JZ   GPR0, col1
537    iMTX_MULT      BLOCK + 3*16, iTab4, PUT_ODD, ROW3,  3*16
538.col1:
539    TEST_TWO_ROWS  BLOCK + 5*16, BLOCK + 6*16, GPR0, GPR2, CLEAR_ODD, ROW5, CLEAR_EVEN, ROW6 ; GPR0 = row 5 mask, GPR2 = row 6 mask
540    TEST_ONE_ROW   BLOCK + 7*16, GPR3, CLEAR_ODD, ROW7 ; GPR3 = row 7 mask
541
542    iLLM_HEAD
    ; enter the row-pass chain at the first nonzero row among 4..7;
    ; if there is none, take the sparse column pass
543    JNZ  GPR1, 2
544    JNZ  GPR0, 3
545    JNZ  GPR2, 4
546    JNZ  GPR3, 5
547    iLLM_PASS_SPARSE BLOCK, %1
548    jmp .6
549.2:
    ; row 4 is transformed without a rounder
550    iMTX_MULT     BLOCK + 4*16, iTab1, PUT_EVEN, ROW4
551.3:
    ; reached by fall-through from .2 as well, so row 5 is then transformed
    ; whether or not its mask was zero
552    iMTX_MULT     BLOCK + 5*16, iTab4, PUT_ODD, ROW5,  4*16
553    JZ   GPR2, col2
554.4:
555    iMTX_MULT     BLOCK + 6*16, iTab3, PUT_EVEN, ROW6, 5*16
556.col2:
557    JZ   GPR3, col3
558.5:
559    iMTX_MULT     BLOCK + 7*16, iTab2, PUT_ODD, ROW7,  5*16
560.col3:
561%if ARCH_X86_32
    ; TAN3 and TAN1 are xmm0 and xmm2 here, which iMTX_MULT overwrites
562    iLLM_HEAD
563%endif
564    iLLM_PASS     BLOCK, %1
565.6:
566    RET
567%endmacro
568
; Instantiate the three SSE2 entry points declared by IDCT_SSE2:
; xvid_idct (0), xvid_idct_put (1) and xvid_idct_add (2).
569INIT_XMM sse2
570IDCT_SSE2 0
571IDCT_SSE2 1
572IDCT_SSE2 2
573