• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1;
2; jfdctflt.asm - floating-point FDCT (SSE)
3;
4; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5;
6; Based on
7; x86 SIMD extension for IJG JPEG library
8; Copyright (C) 1999-2006, MIYASAKA Masaru.
9; For conditions of distribution and use, see copyright notice in jsimdext.inc
10;
11; This file should be assembled with NASM (Netwide Assembler); it
12; can *not* be assembled with Microsoft's MASM or any compatible
13; assembler (including Borland's Turbo Assembler).
14; NASM is available from http://nasm.sourceforge.net/ or
15; http://sourceforge.net/project/showfiles.php?group_id=6208
16;
17; This file contains a floating-point implementation of the forward DCT
18; (Discrete Cosine Transform). The following code is based directly on
19; the IJG's original jfdctflt.c; see jfdctflt.c for more details.
20;
21; [TAB8]
22
23%include "jsimdext.inc"
24%include "jdct.inc"
25
26; --------------------------------------------------------------------------
27
; unpcklps2 dst, src
; Combine the low halves of two registers.  dst keeps its own lanes 0-1 and
; receives lanes 0-1 of src in its lanes 2-3.  The shufps immediate 0x44
; (binary 01 00 01 00) picks elements 0,1 from dst and then 0,1 from src.
; Used below as the second phase of the 4x4 transposes.
28%macro  unpcklps2 2     ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
29        shufps  %1,%2,0x44
30%endmacro
31
; unpckhps2 dst, src
; Combine the high halves of two registers.  dst lanes 2-3 move down to
; lanes 0-1 and lanes 2-3 of src fill dst lanes 2-3.  The shufps immediate
; 0xEE (binary 11 10 11 10) picks elements 2,3 from dst and then 2,3 from src.
; Used below as the second phase of the 4x4 transposes.
32%macro  unpckhps2 2     ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
33        shufps  %1,%2,0xEE
34%endmacro
35
36; --------------------------------------------------------------------------
; Constant table for jsimd_fdct_float_sse.  Each entry is one single-precision
; value repeated four times, i.e. one full xmmword, so it can be used directly
; as the memory operand of mulps.  Those operands must be 16-byte aligned;
; alignz comes from jsimdext.inc and is presumably what guarantees that here
; (confirm in the include file).
37        SECTION SEG_CONST
38
39        alignz  16
40        global  EXTN(jconst_fdct_float_sse)
41
42EXTN(jconst_fdct_float_sse):
43
44PD_0_382        times 4 dd  0.382683432365089771728460 ; cos(3*pi/8)
45PD_0_707        times 4 dd  0.707106781186547524400844 ; cos(pi/4) = 1/sqrt(2)
46PD_0_541        times 4 dd  0.541196100146196984399723 ; cos(pi/8) - cos(3*pi/8)
47PD_1_306        times 4 dd  1.306562964876376527856643 ; cos(pi/8) + cos(3*pi/8)
48
49        alignz  16
50
51; --------------------------------------------------------------------------
52        SECTION SEG_TEXT
53        BITS    32
54;
55; Perform the forward DCT on one block of samples, in place.
56;
57; GLOBAL(void)
58; jsimd_fdct_float_sse (FAST_FLOAT * data)
59;
; In:     data is fetched from the stack at [original ebp + 8], i.e. the
;         first stack-passed argument of a 32-bit call.  Every load and
;         store of the block uses movaps, so data must be 16-byte aligned.
;         Rows 0-7 of the block are accessed (DCTSIZE is presumably 8; it
;         is defined in jdct.inc -- confirm there).
; Out:    the block is overwritten with the transform result; no register
;         carries a return value.
; Stack:  a 16-byte-aligned private frame with WK_NUM xmmword scratch slots.
; Clobb:  (as visible in this file) eax, ecx, edx, xmm0-xmm7, flags.
;         ebx is bracketed by pushpic/poppic and loaded by get_GOT; those
;         macros live in jsimdext.inc -- presumably a save/restore of the
;         PIC register; confirm there.
;
; Structure: two passes of DCTSIZE/4 iterations.  Each iteration evaluates
; the 1-D transform for four rows (pass 1) or four columns (pass 2) at the
; same time, one per SIMD lane.  In the register comments "(rc ...)" lists
; the row digit r and column digit c of each lane.
;
60
61%define data(b)         (b)+8           ; FAST_FLOAT * data
62
63%define original_ebp    ebp+0           ; slot where the prologue stores eax
64%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
65%define WK_NUM          2
66
67        align   16
68        global  EXTN(jsimd_fdct_float_sse)
69
70EXTN(jsimd_fdct_float_sse):
71        push    ebp
72        mov     eax,esp                         ; eax = original ebp (esp right after the push)
73        sub     esp, byte 4                     ; room for the saved eax
74        and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
75        mov     [esp],eax                       ; reloaded by "pop esp" in the epilogue
76        mov     ebp,esp                         ; ebp = aligned ebp
77        lea     esp, [wk(0)]                    ; reserve wk[0..WK_NUM-1] below ebp
78        pushpic ebx
79;       push    ecx             ; need not be preserved
80;       push    edx             ; need not be preserved
81;       push    esi             ; unused
82;       push    edi             ; unused
83
84        get_GOT ebx             ; get GOT address
85
86        ; ---- Pass 1: process rows.
        ;
        ; Each iteration handles a band of four rows.  The 4x4 sub-blocks
        ; are transposed in registers so that dataN holds column N of the
        ; four rows, and the butterfly then works on all four rows at once.
        ; Results are stored without transposing back: dataN goes to row
        ; (N mod 4) of the band, left half for N<4 and right half for N>=4.
        ; Every 4x4 sub-block is therefore left transposed for pass 2.
        ; wk(0)/wk(1) first spill two half-transposed registers and are
        ; then reused to hold tmp6/tmp7 until the odd part.
87
88        mov     edx, POINTER [data(eax)]        ; (FAST_FLOAT *)
89        mov     ecx, DCTSIZE/4                  ; ecx = number of 4-row bands
90        alignx  16,7
91.rowloop:
92
93        movaps  xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
94        movaps  xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
95        movaps  xmm2, XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)]
96        movaps  xmm3, XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)]
97
98        ; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
99        ; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
100
101        movaps   xmm4,xmm0              ; transpose coefficients(phase 1)
102        unpcklps xmm0,xmm1              ; xmm0=(20 30 21 31)
103        unpckhps xmm4,xmm1              ; xmm4=(22 32 23 33)
104        movaps   xmm5,xmm2              ; transpose coefficients(phase 1)
105        unpcklps xmm2,xmm3              ; xmm2=(24 34 25 35)
106        unpckhps xmm5,xmm3              ; xmm5=(26 36 27 37)
107
108        movaps  xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
109        movaps  xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
110        movaps  xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
111        movaps  xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
112
113        ; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
114        ; xmm7=(10 11 12 13), xmm3=(14 15 16 17)
115
116        movaps  XMMWORD [wk(0)], xmm4   ; wk(0)=(22 32 23 33)
117        movaps  XMMWORD [wk(1)], xmm2   ; wk(1)=(24 34 25 35)
118
119        movaps   xmm4,xmm6              ; transpose coefficients(phase 1)
120        unpcklps xmm6,xmm7              ; xmm6=(00 10 01 11)
121        unpckhps xmm4,xmm7              ; xmm4=(02 12 03 13)
122        movaps   xmm2,xmm1              ; transpose coefficients(phase 1)
123        unpcklps xmm1,xmm3              ; xmm1=(04 14 05 15)
124        unpckhps xmm2,xmm3              ; xmm2=(06 16 07 17)
125
126        movaps    xmm7,xmm6             ; transpose coefficients(phase 2)
127        unpcklps2 xmm6,xmm0             ; xmm6=(00 10 20 30)=data0
128        unpckhps2 xmm7,xmm0             ; xmm7=(01 11 21 31)=data1
129        movaps    xmm3,xmm2             ; transpose coefficients(phase 2)
130        unpcklps2 xmm2,xmm5             ; xmm2=(06 16 26 36)=data6
131        unpckhps2 xmm3,xmm5             ; xmm3=(07 17 27 37)=data7
132
133        movaps  xmm0,xmm7
134        movaps  xmm5,xmm6
135        subps   xmm7,xmm2               ; xmm7=data1-data6=tmp6
136        subps   xmm6,xmm3               ; xmm6=data0-data7=tmp7
137        addps   xmm0,xmm2               ; xmm0=data1+data6=tmp1
138        addps   xmm5,xmm3               ; xmm5=data0+data7=tmp0
139
140        movaps  xmm2, XMMWORD [wk(0)]   ; xmm2=(22 32 23 33)
141        movaps  xmm3, XMMWORD [wk(1)]   ; xmm3=(24 34 25 35)
142        movaps  XMMWORD [wk(0)], xmm7   ; wk(0)=tmp6
143        movaps  XMMWORD [wk(1)], xmm6   ; wk(1)=tmp7
144
145        movaps    xmm7,xmm4             ; transpose coefficients(phase 2)
146        unpcklps2 xmm4,xmm2             ; xmm4=(02 12 22 32)=data2
147        unpckhps2 xmm7,xmm2             ; xmm7=(03 13 23 33)=data3
148        movaps    xmm6,xmm1             ; transpose coefficients(phase 2)
149        unpcklps2 xmm1,xmm3             ; xmm1=(04 14 24 34)=data4
150        unpckhps2 xmm6,xmm3             ; xmm6=(05 15 25 35)=data5
151
152        movaps  xmm2,xmm7
153        movaps  xmm3,xmm4
154        addps   xmm7,xmm1               ; xmm7=data3+data4=tmp3
155        addps   xmm4,xmm6               ; xmm4=data2+data5=tmp2
156        subps   xmm2,xmm1               ; xmm2=data3-data4=tmp4
157        subps   xmm3,xmm6               ; xmm3=data2-data5=tmp5
158
159        ; -- Even part
        ; Inputs: xmm5=tmp0, xmm0=tmp1, xmm4=tmp2, xmm7=tmp3.
        ; xmm2=tmp4 and xmm3=tmp5 stay untouched for the odd part.
160
161        movaps  xmm1,xmm5
162        movaps  xmm6,xmm0
163        subps   xmm5,xmm7               ; xmm5=tmp13
164        subps   xmm0,xmm4               ; xmm0=tmp12
165        addps   xmm1,xmm7               ; xmm1=tmp10
166        addps   xmm6,xmm4               ; xmm6=tmp11
167
168        addps   xmm0,xmm5
169        mulps   xmm0,[GOTOFF(ebx,PD_0_707)] ; xmm0=z1
170
171        movaps  xmm7,xmm1
172        movaps  xmm4,xmm5
173        subps   xmm1,xmm6               ; xmm1=data4
174        subps   xmm5,xmm0               ; xmm5=data6
175        addps   xmm7,xmm6               ; xmm7=data0
176        addps   xmm4,xmm0               ; xmm4=data2
177
178        movaps  XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], xmm1
179        movaps  XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)], xmm5
180        movaps  XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7
181        movaps  XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4
182
183        ; -- Odd part
        ; Inputs: xmm2=tmp4, xmm3=tmp5, plus tmp6/tmp7 reloaded from wk.
        ; The names tmp10..tmp12 are reused here with new meanings.
184
185        movaps  xmm6, XMMWORD [wk(0)]   ; xmm6=tmp6
186        movaps  xmm0, XMMWORD [wk(1)]   ; xmm0=tmp7
187
188        addps   xmm2,xmm3               ; xmm2=tmp10
189        addps   xmm3,xmm6               ; xmm3=tmp11
190        addps   xmm6,xmm0               ; xmm6=tmp12, xmm0=tmp7
191
192        mulps   xmm3,[GOTOFF(ebx,PD_0_707)] ; xmm3=z3
193
194        movaps  xmm1,xmm2               ; xmm1=tmp10
195        subps   xmm2,xmm6
196        mulps   xmm2,[GOTOFF(ebx,PD_0_382)] ; xmm2=z5
197        mulps   xmm1,[GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
198        mulps   xmm6,[GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
199        addps   xmm1,xmm2               ; xmm1=z2
200        addps   xmm6,xmm2               ; xmm6=z4
201
202        movaps  xmm5,xmm0
203        subps   xmm0,xmm3               ; xmm0=z13
204        addps   xmm5,xmm3               ; xmm5=z11
205
206        movaps  xmm7,xmm0
207        movaps  xmm4,xmm5
208        subps   xmm0,xmm1               ; xmm0=data3
209        subps   xmm5,xmm6               ; xmm5=data7
210        addps   xmm7,xmm1               ; xmm7=data5
211        addps   xmm4,xmm6               ; xmm4=data1
212
213        movaps  XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
214        movaps  XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)], xmm5
215        movaps  XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], xmm7
216        movaps  XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4
217
218        add     edx, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; advance to the next four rows
219        dec     ecx
220        jnz     near .rowloop           ; body is longer than a short jump can span
221
222        ; ---- Pass 2: process columns.
        ;
        ; Each iteration handles four columns across rows 0-7.  The loads
        ; see the transposed 4x4 sub-blocks left by pass 1; transposing
        ; them again yields dataN = row N of the four columns, one column
        ; per lane.  The butterfly is the same as in pass 1, and dataN is
        ; stored straight to row N, so the block ends in normal layout.
        ; eax must still hold the value set in the prologue; no instruction
        ; visible in this file writes it in between.
223
224        mov     edx, POINTER [data(eax)]        ; (FAST_FLOAT *)
225        mov     ecx, DCTSIZE/4                  ; ecx = number of 4-column groups
226        alignx  16,7
227.columnloop:
228
229        movaps  xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
230        movaps  xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
231        movaps  xmm2, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
232        movaps  xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]
233
234        ; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
235        ; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
236
237        movaps   xmm4,xmm0              ; transpose coefficients(phase 1)
238        unpcklps xmm0,xmm1              ; xmm0=(02 03 12 13)
239        unpckhps xmm4,xmm1              ; xmm4=(22 23 32 33)
240        movaps   xmm5,xmm2              ; transpose coefficients(phase 1)
241        unpcklps xmm2,xmm3              ; xmm2=(42 43 52 53)
242        unpckhps xmm5,xmm3              ; xmm5=(62 63 72 73)
243
244        movaps  xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
245        movaps  xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
246        movaps  xmm1, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
247        movaps  xmm3, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]
248
249        ; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
250        ; xmm7=(01 11 21 31), xmm3=(41 51 61 71)
251
252        movaps  XMMWORD [wk(0)], xmm4   ; wk(0)=(22 23 32 33)
253        movaps  XMMWORD [wk(1)], xmm2   ; wk(1)=(42 43 52 53)
254
255        movaps   xmm4,xmm6              ; transpose coefficients(phase 1)
256        unpcklps xmm6,xmm7              ; xmm6=(00 01 10 11)
257        unpckhps xmm4,xmm7              ; xmm4=(20 21 30 31)
258        movaps   xmm2,xmm1              ; transpose coefficients(phase 1)
259        unpcklps xmm1,xmm3              ; xmm1=(40 41 50 51)
260        unpckhps xmm2,xmm3              ; xmm2=(60 61 70 71)
261
262        movaps    xmm7,xmm6             ; transpose coefficients(phase 2)
263        unpcklps2 xmm6,xmm0             ; xmm6=(00 01 02 03)=data0
264        unpckhps2 xmm7,xmm0             ; xmm7=(10 11 12 13)=data1
265        movaps    xmm3,xmm2             ; transpose coefficients(phase 2)
266        unpcklps2 xmm2,xmm5             ; xmm2=(60 61 62 63)=data6
267        unpckhps2 xmm3,xmm5             ; xmm3=(70 71 72 73)=data7
268
269        movaps  xmm0,xmm7
270        movaps  xmm5,xmm6
271        subps   xmm7,xmm2               ; xmm7=data1-data6=tmp6
272        subps   xmm6,xmm3               ; xmm6=data0-data7=tmp7
273        addps   xmm0,xmm2               ; xmm0=data1+data6=tmp1
274        addps   xmm5,xmm3               ; xmm5=data0+data7=tmp0
275
276        movaps  xmm2, XMMWORD [wk(0)]   ; xmm2=(22 23 32 33)
277        movaps  xmm3, XMMWORD [wk(1)]   ; xmm3=(42 43 52 53)
278        movaps  XMMWORD [wk(0)], xmm7   ; wk(0)=tmp6
279        movaps  XMMWORD [wk(1)], xmm6   ; wk(1)=tmp7
280
281        movaps    xmm7,xmm4             ; transpose coefficients(phase 2)
282        unpcklps2 xmm4,xmm2             ; xmm4=(20 21 22 23)=data2
283        unpckhps2 xmm7,xmm2             ; xmm7=(30 31 32 33)=data3
284        movaps    xmm6,xmm1             ; transpose coefficients(phase 2)
285        unpcklps2 xmm1,xmm3             ; xmm1=(40 41 42 43)=data4
286        unpckhps2 xmm6,xmm3             ; xmm6=(50 51 52 53)=data5
287
288        movaps  xmm2,xmm7
289        movaps  xmm3,xmm4
290        addps   xmm7,xmm1               ; xmm7=data3+data4=tmp3
291        addps   xmm4,xmm6               ; xmm4=data2+data5=tmp2
292        subps   xmm2,xmm1               ; xmm2=data3-data4=tmp4
293        subps   xmm3,xmm6               ; xmm3=data2-data5=tmp5
294
295        ; -- Even part
        ; Inputs: xmm5=tmp0, xmm0=tmp1, xmm4=tmp2, xmm7=tmp3.
        ; xmm2=tmp4 and xmm3=tmp5 stay untouched for the odd part.
296
297        movaps  xmm1,xmm5
298        movaps  xmm6,xmm0
299        subps   xmm5,xmm7               ; xmm5=tmp13
300        subps   xmm0,xmm4               ; xmm0=tmp12
301        addps   xmm1,xmm7               ; xmm1=tmp10
302        addps   xmm6,xmm4               ; xmm6=tmp11
303
304        addps   xmm0,xmm5
305        mulps   xmm0,[GOTOFF(ebx,PD_0_707)] ; xmm0=z1
306
307        movaps  xmm7,xmm1
308        movaps  xmm4,xmm5
309        subps   xmm1,xmm6               ; xmm1=data4
310        subps   xmm5,xmm0               ; xmm5=data6
311        addps   xmm7,xmm6               ; xmm7=data0
312        addps   xmm4,xmm0               ; xmm4=data2
313
314        movaps  XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], xmm1
315        movaps  XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], xmm5
316        movaps  XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7
317        movaps  XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4
318
319        ; -- Odd part
        ; Inputs: xmm2=tmp4, xmm3=tmp5, plus tmp6/tmp7 reloaded from wk.
        ; The names tmp10..tmp12 are reused here with new meanings.
320
321        movaps  xmm6, XMMWORD [wk(0)]   ; xmm6=tmp6
322        movaps  xmm0, XMMWORD [wk(1)]   ; xmm0=tmp7
323
324        addps   xmm2,xmm3               ; xmm2=tmp10
325        addps   xmm3,xmm6               ; xmm3=tmp11
326        addps   xmm6,xmm0               ; xmm6=tmp12, xmm0=tmp7
327
328        mulps   xmm3,[GOTOFF(ebx,PD_0_707)] ; xmm3=z3
329
330        movaps  xmm1,xmm2               ; xmm1=tmp10
331        subps   xmm2,xmm6
332        mulps   xmm2,[GOTOFF(ebx,PD_0_382)] ; xmm2=z5
333        mulps   xmm1,[GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
334        mulps   xmm6,[GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
335        addps   xmm1,xmm2               ; xmm1=z2
336        addps   xmm6,xmm2               ; xmm6=z4
337
338        movaps  xmm5,xmm0
339        subps   xmm0,xmm3               ; xmm0=z13
340        addps   xmm5,xmm3               ; xmm5=z11
341
342        movaps  xmm7,xmm0
343        movaps  xmm4,xmm5
344        subps   xmm0,xmm1               ; xmm0=data3
345        subps   xmm5,xmm6               ; xmm5=data7
346        addps   xmm7,xmm1               ; xmm7=data5
347        addps   xmm4,xmm6               ; xmm4=data1
348
349        movaps  XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
350        movaps  XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], xmm5
351        movaps  XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], xmm7
352        movaps  XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4
353
354        add     edx, byte 4*SIZEOF_FAST_FLOAT ; advance to the next four columns
355        dec     ecx
356        jnz     near .columnloop        ; body is longer than a short jump can span
357
358;       pop     edi             ; unused
359;       pop     esi             ; unused
360;       pop     edx             ; need not be preserved
361;       pop     ecx             ; need not be preserved
362        poppic  ebx
363        mov     esp,ebp         ; esp <- aligned ebp
364        pop     esp             ; esp <- original ebp
365        pop     ebp             ; restore the caller's ebp
366        ret
367
368; For some reason, the OS X linker does not honor the request to align the
369; segment unless we do this.
370        align   16
371