• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1;
2; jfdctflt.asm - floating-point FDCT (3DNow!)
3;
4; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5; Copyright (C) 2016, D. R. Commander.
6;
7; Based on the x86 SIMD extension for IJG JPEG library
8; Copyright (C) 1999-2006, MIYASAKA Masaru.
9; For conditions of distribution and use, see copyright notice in jsimdext.inc
10;
11; This file should be assembled with NASM (Netwide Assembler); it
12; can *not* be assembled with Microsoft's MASM or any compatible
13; assembler (including Borland's Turbo Assembler).
14; NASM is available from http://nasm.sourceforge.net/ or
15; http://sourceforge.net/project/showfiles.php?group_id=6208
16;
17; This file contains a floating-point implementation of the forward DCT
18; (Discrete Cosine Transform). The following code is based directly on
19; the IJG's original jfdctflt.c; see jfdctflt.c for more details.
20;
21; [TAB8]
22
23%include "jsimdext.inc"
24%include "jdct.inc"
25
26; --------------------------------------------------------------------------
27    SECTION     SEG_CONST
28
29    alignz      32
30    GLOBAL_DATA(jconst_fdct_float_3dnow)
31
32EXTN(jconst_fdct_float_3dnow):
33
; Multipliers used by the floating-point forward DCT.  Each label names two
; consecutive single-precision copies of the same value, so that a single
; mmword memory operand supplies the factor to both packed lanes.
34PD_0_382    dd  0.382683432365089771728460, 0.382683432365089771728460
35PD_0_707    dd  0.707106781186547524400844, 0.707106781186547524400844
36PD_0_541    dd  0.541196100146196984399723, 0.541196100146196984399723
37PD_1_306    dd  1.306562964876376527856643, 1.306562964876376527856643
38
39    alignz      32
40
41; --------------------------------------------------------------------------
42    SECTION     SEG_TEXT
43    BITS        32
44;
45; Perform the forward DCT on one block of samples.
46;
47; GLOBAL(void)
48; jsimd_fdct_float_3dnow(FAST_FLOAT *data)
49;
; The block at `data` is transformed in place in two passes: pass 1 applies
; the 1-D transform along rows, pass 2 along columns.  Every loop iteration
; handles two rows (or two columns) at once, one in each half of an MMX
; register, so each pass loops DCTSIZE/2 times.
;
; Input:     the single argument is read from the stack at [eax + 8], where
;            eax is esp immediately after the initial `push ebp`.
; Output:    nothing is returned; the results overwrite the input block.
; Preserved: ebp and esp; ebx is saved/restored through pushpic/poppic.
; Clobbers:  eax, ecx, edx, mm0-mm7 and the flags.  femms is executed before
;            returning.
; Stack:     WK_NUM mmword scratch slots, wk(0) and wk(1), are reserved
;            directly below an ebp that has been aligned to SIZEOF_MMWORD.
;
; NOTE(review): MMBLOCK, GOTOFF, get_GOT, pushpic/poppic, alignx and the
; SIZEOF_*/DCTSIZE/POINTER/MMWORD symbols come from the included files and
; are not visible here; the layout statements below follow the register
; lane annotations already present in this file -- confirm against the
; include files if in doubt.
;
; Frame helpers:
;   data(b)       address of the argument relative to base register b
;   original_ebp  the slot at [ebp] that holds the pre-alignment stack
;                 pointer (not referenced elsewhere in this function)
;   wk(i)         address of scratch mmword i, counted upward toward ebp
50
51%define data(b)       (b) + 8           ; FAST_FLOAT *data
52
53%define original_ebp  ebp + 0
54%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_MMWORD  ; mmword wk[WK_NUM]
55%define WK_NUM        2
56
57    align       32
58    GLOBAL_FUNCTION(jsimd_fdct_float_3dnow)
59
60EXTN(jsimd_fdct_float_3dnow):
61    push        ebp                         ; save caller's ebp
62    mov         eax, esp                    ; eax = esp after the push: [eax] = caller's ebp
63    sub         esp, byte 4                 ; make room for the saved stack pointer
64    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
65    mov         [esp], eax                  ; remember the pre-alignment esp
66    mov         ebp, esp                    ; ebp = aligned ebp
67    lea         esp, [wk(0)]                ; reserve wk[WK_NUM] below ebp
68    pushpic     ebx
69;   push        ecx                     ; need not be preserved
70;   push        edx                     ; need not be preserved
71;   push        esi                     ; unused
72;   push        edi                     ; unused
73
74    get_GOT     ebx                     ; get GOT address
75
76    ; ---- Pass 1: process rows.
    ;
    ; Each iteration transforms two adjacent rows.  The loads fetch 2x2
    ; tiles and the punpck pairs transpose them, so that every register
    ; holds the same sample position from both rows (see the lane
    ; annotations below).
77
78    mov         edx, POINTER [data(eax)]  ; edx = data (FAST_FLOAT *)
79    mov         ecx, DCTSIZE/2            ; loop counter: two rows per iteration
80    alignx      16, 7
81.rowloop:
82
83    movq        mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
84    movq        mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
85    movq        mm2, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]
86    movq        mm3, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]
87
88    ; mm0=(00 01), mm1=(10 11), mm2=(06 07), mm3=(16 17)
89
90    movq        mm4, mm0                ; transpose coefficients
91    punpckldq   mm0, mm1                ; mm0=(00 10)=data0
92    punpckhdq   mm4, mm1                ; mm4=(01 11)=data1
93    movq        mm5, mm2                ; transpose coefficients
94    punpckldq   mm2, mm3                ; mm2=(06 16)=data6
95    punpckhdq   mm5, mm3                ; mm5=(07 17)=data7
96
97    movq        mm6, mm4                ; mm6=data1 (copy for the sum)
98    movq        mm7, mm0                ; mm7=data0 (copy for the sum)
99    pfsub       mm4, mm2                ; mm4=data1-data6=tmp6
100    pfsub       mm0, mm5                ; mm0=data0-data7=tmp7
101    pfadd       mm6, mm2                ; mm6=data1+data6=tmp1
102    pfadd       mm7, mm5                ; mm7=data0+data7=tmp0
103
104    movq        mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
105    movq        mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
106    movq        mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
107    movq        mm5, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]
108
109    ; mm1=(02 03), mm3=(12 13), mm2=(04 05), mm5=(14 15)
110
    ; tmp6/tmp7 are only needed in the odd part; spill them so that mm4 and
    ; mm0 can be reused for the remaining transposes.
111    movq        MMWORD [wk(0)], mm4     ; wk(0)=tmp6
112    movq        MMWORD [wk(1)], mm0     ; wk(1)=tmp7
113
114    movq        mm4, mm1                ; transpose coefficients
115    punpckldq   mm1, mm3                ; mm1=(02 12)=data2
116    punpckhdq   mm4, mm3                ; mm4=(03 13)=data3
117    movq        mm0, mm2                ; transpose coefficients
118    punpckldq   mm2, mm5                ; mm2=(04 14)=data4
119    punpckhdq   mm0, mm5                ; mm0=(05 15)=data5
120
121    movq        mm3, mm4                ; mm3=data3 (copy for the difference)
122    movq        mm5, mm1                ; mm5=data2 (copy for the difference)
123    pfadd       mm4, mm2                ; mm4=data3+data4=tmp3
124    pfadd       mm1, mm0                ; mm1=data2+data5=tmp2
125    pfsub       mm3, mm2                ; mm3=data3-data4=tmp4
126    pfsub       mm5, mm0                ; mm5=data2-data5=tmp5
127
128    ; -- Even part
129
130    movq        mm2, mm7                ; mm2=tmp0
131    movq        mm0, mm6                ; mm0=tmp1
132    pfsub       mm7, mm4                ; mm7=tmp0-tmp3=tmp13
133    pfsub       mm6, mm1                ; mm6=tmp1-tmp2=tmp12
134    pfadd       mm2, mm4                ; mm2=tmp0+tmp3=tmp10
135    pfadd       mm0, mm1                ; mm0=tmp1+tmp2=tmp11
136
137    pfadd       mm6, mm7                ; mm6=tmp12+tmp13
138    pfmul       mm6, [GOTOFF(ebx,PD_0_707)]  ; mm6=(tmp12+tmp13)*0.707106781=z1
139
140    movq        mm4, mm2                ; mm4=tmp10
141    movq        mm1, mm7                ; mm1=tmp13
142    pfsub       mm2, mm0                ; mm2=tmp10-tmp11=data4
143    pfsub       mm7, mm6                ; mm7=tmp13-z1=data6
144    pfadd       mm4, mm0                ; mm4=tmp10+tmp11=data0
145    pfadd       mm1, mm6                ; mm1=tmp13+z1=data2
146
    ; The results are stored without transposing back: even outputs go to
    ; the MMBLOCK(0,n) slots here, odd outputs to the MMBLOCK(1,n) slots
    ; further down.  The lane annotations in pass 2 reflect this layout.
147    movq        MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)], mm2  ; data4
148    movq        MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)], mm7  ; data6
149    movq        MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4  ; data0
150    movq        MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], mm1  ; data2
151
152    ; -- Odd part
    ;
    ; On entry mm3=tmp4 and mm5=tmp5.  The names tmp10..tmp12 are reused
    ; below for new odd-part values.
153
154    movq        mm0, MMWORD [wk(0)]     ; mm0=tmp6
155    movq        mm6, MMWORD [wk(1)]     ; mm6=tmp7
156
157    pfadd       mm3, mm5                ; mm3=tmp4+tmp5=tmp10
158    pfadd       mm5, mm0                ; mm5=tmp5+tmp6=tmp11
159    pfadd       mm0, mm6                ; mm0=tmp6+tmp7=tmp12, mm6=tmp7
160
161    pfmul       mm5, [GOTOFF(ebx,PD_0_707)]  ; mm5=tmp11*0.707106781=z3
162
163    movq        mm2, mm3                     ; mm2=tmp10
164    pfsub       mm3, mm0                     ; mm3=tmp10-tmp12
165    pfmul       mm3, [GOTOFF(ebx,PD_0_382)]  ; mm3=(tmp10-tmp12)*0.382683433=z5
166    pfmul       mm2, [GOTOFF(ebx,PD_0_541)]  ; mm2=MULTIPLY(tmp10,FIX_0_54119610)
167    pfmul       mm0, [GOTOFF(ebx,PD_1_306)]  ; mm0=MULTIPLY(tmp12,FIX_1_30656296)
168    pfadd       mm2, mm3                     ; mm2=z2
169    pfadd       mm0, mm3                     ; mm0=z4
170
171    movq        mm7, mm6                ; mm7=tmp7
172    pfsub       mm6, mm5                ; mm6=tmp7-z3=z13
173    pfadd       mm7, mm5                ; mm7=tmp7+z3=z11
174
175    movq        mm4, mm6                ; mm4=z13
176    movq        mm1, mm7                ; mm1=z11
177    pfsub       mm6, mm2                ; mm6=z13-z2=data3
178    pfsub       mm7, mm0                ; mm7=z11-z4=data7
179    pfadd       mm4, mm2                ; mm4=z13+z2=data5
180    pfadd       mm1, mm0                ; mm1=z11+z4=data1
181
182    movq        MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], mm6  ; data3
183    movq        MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)], mm7  ; data7
184    movq        MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)], mm4  ; data5
185    movq        MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1  ; data1
186
187    add         edx, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT  ; advance to the next pair of rows
188    dec         ecx
189    jnz         near .rowloop           ; repeat until all row pairs are done
190
191    ; ---- Pass 2: process columns.
    ;
    ; Each iteration transforms two adjacent columns.  The loads take the
    ; same mmword position from each of the eight MMBLOCK rows; the
    ; transposes undo the tile layout left by pass 1, so that every register
    ; holds one row's samples for both columns.
192
193    mov         edx, POINTER [data(eax)]  ; edx = data again (FAST_FLOAT *)
194    mov         ecx, DCTSIZE/2            ; loop counter: two columns per iteration
195    alignx      16, 7
196.columnloop:
197
198    movq        mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
199    movq        mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
200    movq        mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
201    movq        mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]
202
203    ; mm0=(00 10), mm1=(01 11), mm2=(60 70), mm3=(61 71)
204
205    movq        mm4, mm0                ; transpose coefficients
206    punpckldq   mm0, mm1                ; mm0=(00 01)=data0
207    punpckhdq   mm4, mm1                ; mm4=(10 11)=data1
208    movq        mm5, mm2                ; transpose coefficients
209    punpckldq   mm2, mm3                ; mm2=(60 61)=data6
210    punpckhdq   mm5, mm3                ; mm5=(70 71)=data7
211
212    movq        mm6, mm4                ; mm6=data1 (copy for the sum)
213    movq        mm7, mm0                ; mm7=data0 (copy for the sum)
214    pfsub       mm4, mm2                ; mm4=data1-data6=tmp6
215    pfsub       mm0, mm5                ; mm0=data0-data7=tmp7
216    pfadd       mm6, mm2                ; mm6=data1+data6=tmp1
217    pfadd       mm7, mm5                ; mm7=data0+data7=tmp0
218
219    movq        mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
220    movq        mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
221    movq        mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
222    movq        mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]
223
224    ; mm1=(20 30), mm3=(21 31), mm2=(40 50), mm5=(41 51)
225
    ; Spill tmp6/tmp7 until the odd part, freeing mm4 and mm0.
226    movq        MMWORD [wk(0)], mm4     ; wk(0)=tmp6
227    movq        MMWORD [wk(1)], mm0     ; wk(1)=tmp7
228
229    movq        mm4, mm1                ; transpose coefficients
230    punpckldq   mm1, mm3                ; mm1=(20 21)=data2
231    punpckhdq   mm4, mm3                ; mm4=(30 31)=data3
232    movq        mm0, mm2                ; transpose coefficients
233    punpckldq   mm2, mm5                ; mm2=(40 41)=data4
234    punpckhdq   mm0, mm5                ; mm0=(50 51)=data5
235
236    movq        mm3, mm4                ; mm3=data3 (copy for the difference)
237    movq        mm5, mm1                ; mm5=data2 (copy for the difference)
238    pfadd       mm4, mm2                ; mm4=data3+data4=tmp3
239    pfadd       mm1, mm0                ; mm1=data2+data5=tmp2
240    pfsub       mm3, mm2                ; mm3=data3-data4=tmp4
241    pfsub       mm5, mm0                ; mm5=data2-data5=tmp5
242
243    ; -- Even part
244
245    movq        mm2, mm7                ; mm2=tmp0
246    movq        mm0, mm6                ; mm0=tmp1
247    pfsub       mm7, mm4                ; mm7=tmp0-tmp3=tmp13
248    pfsub       mm6, mm1                ; mm6=tmp1-tmp2=tmp12
249    pfadd       mm2, mm4                ; mm2=tmp0+tmp3=tmp10
250    pfadd       mm0, mm1                ; mm0=tmp1+tmp2=tmp11
251
252    pfadd       mm6, mm7                ; mm6=tmp12+tmp13
253    pfmul       mm6, [GOTOFF(ebx,PD_0_707)]  ; mm6=(tmp12+tmp13)*0.707106781=z1
254
255    movq        mm4, mm2                ; mm4=tmp10
256    movq        mm1, mm7                ; mm1=tmp13
257    pfsub       mm2, mm0                ; mm2=tmp10-tmp11=data4
258    pfsub       mm7, mm6                ; mm7=tmp13-z1=data6
259    pfadd       mm4, mm0                ; mm4=tmp10+tmp11=data0
260    pfadd       mm1, mm6                ; mm1=tmp13+z1=data2
261
    ; Output k of this pass is written to the MMBLOCK(k,0) slot.
262    movq        MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], mm2  ; data4
263    movq        MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], mm7  ; data6
264    movq        MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4  ; data0
265    movq        MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], mm1  ; data2
266
267    ; -- Odd part
    ;
    ; On entry mm3=tmp4 and mm5=tmp5.  The names tmp10..tmp12 are reused
    ; below for new odd-part values.
268
269    movq        mm0, MMWORD [wk(0)]     ; mm0=tmp6
270    movq        mm6, MMWORD [wk(1)]     ; mm6=tmp7
271
272    pfadd       mm3, mm5                ; mm3=tmp4+tmp5=tmp10
273    pfadd       mm5, mm0                ; mm5=tmp5+tmp6=tmp11
274    pfadd       mm0, mm6                ; mm0=tmp6+tmp7=tmp12, mm6=tmp7
275
276    pfmul       mm5, [GOTOFF(ebx,PD_0_707)]  ; mm5=tmp11*0.707106781=z3
277
278    movq        mm2, mm3                     ; mm2=tmp10
279    pfsub       mm3, mm0                     ; mm3=tmp10-tmp12
280    pfmul       mm3, [GOTOFF(ebx,PD_0_382)]  ; mm3=(tmp10-tmp12)*0.382683433=z5
281    pfmul       mm2, [GOTOFF(ebx,PD_0_541)]  ; mm2=MULTIPLY(tmp10,FIX_0_54119610)
282    pfmul       mm0, [GOTOFF(ebx,PD_1_306)]  ; mm0=MULTIPLY(tmp12,FIX_1_30656296)
283    pfadd       mm2, mm3                     ; mm2=z2
284    pfadd       mm0, mm3                     ; mm0=z4
285
286    movq        mm7, mm6                ; mm7=tmp7
287    pfsub       mm6, mm5                ; mm6=tmp7-z3=z13
288    pfadd       mm7, mm5                ; mm7=tmp7+z3=z11
289
290    movq        mm4, mm6                ; mm4=z13
291    movq        mm1, mm7                ; mm1=z11
292    pfsub       mm6, mm2                ; mm6=z13-z2=data3
293    pfsub       mm7, mm0                ; mm7=z11-z4=data7
294    pfadd       mm4, mm2                ; mm4=z13+z2=data5
295    pfadd       mm1, mm0                ; mm1=z11+z4=data1
296
297    movq        MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], mm6  ; data3
298    movq        MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], mm7  ; data7
299    movq        MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], mm4  ; data5
300    movq        MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1  ; data1
301
302    add         edx, byte 2*SIZEOF_FAST_FLOAT  ; advance to the next pair of columns
303    dec         ecx
304    jnz         near .columnloop        ; repeat until all column pairs are done
305
306    femms                               ; empty MMX/3DNow! state
307
308;   pop         edi                     ; unused
309;   pop         esi                     ; unused
310;   pop         edx                     ; need not be preserved
311;   pop         ecx                     ; need not be preserved
312    poppic      ebx
313    mov         esp, ebp                ; esp <- aligned ebp (drops wk[])
314    pop         esp                     ; esp <- pre-alignment esp saved in the prologue
315    pop         ebp                     ; restore caller's ebp
316    ret
317
318; For some reason, the OS X linker does not honor the request to align the
319; segment unless we do this.
320    align       32
321