;
; jidctflt.asm - floating-point IDCT (SSE & SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains a floating-point implementation of the inverse DCT
; (Discrete Cosine Transform). The following code is based directly on
; the IJG's original jidctflt.c; see the jidctflt.c for more details.

; Project-wide SIMD definitions; the helper macros and SIZEOF_* / DCTSIZE*
; symbols used throughout this file are not defined here, so they are
; presumably supplied by these two includes.
%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------

; unpcklps2 dst, src
;   Combine the low 64 bits (two floats) of both operands:
;   dst=(0 1 2 3), src=(4 5 6 7)  =>  dst=(0 1 4 5)
;   shufps immediate 0x44 = 01 00 01 00b: lanes 0,1 from dst, lanes 0,1 from src.
%macro unpcklps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
    shufps      %1, %2, 0x44
%endmacro

; unpckhps2 dst, src
;   Combine the high 64 bits (two floats) of both operands:
;   dst=(0 1 2 3), src=(4 5 6 7)  =>  dst=(2 3 6 7)
;   shufps immediate 0xEE = 11 10 11 10b: lanes 2,3 from dst, lanes 2,3 from src.
%macro unpckhps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
    shufps      %1, %2, 0xEE
%endmacro

; --------------------------------------------------------------------------
    SECTION     SEG_CONST

    alignz      32
    GLOBAL_DATA(jconst_idct_float_sse2)

; Constant table for the float IDCT.  Every float constant is replicated
; four times so it can be used directly as a packed-single memory operand.
; With c_k = cos(k*pi/16):
;   PD_1_414        =  2*c4          (= sqrt(2))
;   PD_1_847        =  2*c2
;   PD_1_082        =  2*(c2 - c6)
;   PD_M2_613       = -2*(c2 + c6)
;   PD_RNDINT_MAGIC =  1.5 * 2^26.  At this magnitude one float ULP is 8, so
;                      adding it to a value x leaves round(x/8) in the low
;                      bits of the float's bit pattern (float->int trick).
;   PB_CENTERJSAMP  =  CENTERJSAMPLE in each of 16 bytes; added after the
;                      signed-saturating pack in pass 2.
EXTN(jconst_idct_float_sse2):

PD_1_414        times 4  dd  1.414213562373095048801689
PD_1_847        times 4  dd  1.847759065022573512256366
PD_1_082        times 4  dd  1.082392200292393968799446
PD_M2_613       times 4  dd -2.613125929752753055713286
PD_RNDINT_MAGIC times 4  dd  100663296.0  ; (float)(0x00C00000 << 3)
PB_CENTERJSAMP  times 16 db  CENTERJSAMPLE

    alignz      32

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32
;
; Perform dequantization and inverse DCT on one block of coefficients.
;
; GLOBAL(void)
; jsimd_idct_float_sse2(void *dct_table, JCOEFPTR coef_block,
;                       JSAMPARRAY output_buf, JDIMENSION output_col)
;

; Stack-argument accessors.  (b) is the frame base, i.e. the value of esp
; right after "push ebp": [b+0] = saved ebp, [b+4] = return address, so the
; first argument lives at b+8.
%define dct_table(b)   (b) + 8          ; void *dct_table
%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
%define output_col(b)  (b) + 20         ; JDIMENSION output_col

; Locals, addressed relative to the realigned ebp:
;   [ebp+0]  holds the unaligned frame base saved by the prologue
;   wk(i)    are WK_NUM XMM-sized scratch slots directly below ebp
;   workspace is the DCTSIZE2-element float buffer directly below wk(0)
; (wk() references WK_NUM before its %define; this works because NASM
; expands single-line macros at the point of use.)
%define original_ebp   ebp + 0
%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
                                        ; xmmword wk[WK_NUM]
%define WK_NUM         2
%define workspace      wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
                                        ; FAST_FLOAT workspace[DCTSIZE2]

; --------------------------------------------------------------------------
; jsimd_idct_float_sse2 -- dequantize + float inverse DCT of one coefficient
; block, two passes over the data, four columns/rows per loop iteration.
;
; Arguments are read from the stack (see the dct_table()/coef_block()/
; output_buf()/output_col() accessors).
;
; Frame built by the prologue:
;   [ebp+0]      unaligned frame base (esp right after "push ebp")
;   wk(0),wk(1)  two XMM-sized spill slots below the realigned ebp
;   workspace    DCTSIZE2 FAST_FLOATs below wk(0); written by pass 1,
;                read by pass 2
;
; Register roles:
;   ebx  GOT address from get_GOT (temporarily reused as a row pointer in
;        the pass-2 store sequence, bracketed by pushpic/poppic)
;   ecx  loop counter (DCTSIZE/4 iterations per pass)
;   pass 1: edx = quantptr, esi = inptr, edi = wsptr
;   pass 2: esi = wsptr, edi = output row-pointer array, eax = output_col
;
; Preserved: ebx, esi, edi, ebp.  Clobbered: eax, ecx, edx, xmm0-xmm7, flags.
; --------------------------------------------------------------------------
    align       32
    GLOBAL_FUNCTION(jsimd_idct_float_sse2)

EXTN(jsimd_idct_float_sse2):
    push        ebp
    mov         eax, esp                     ; eax = original ebp
    sub         esp, byte 4                  ; room for the saved frame base
    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [esp], eax                   ; [original_ebp] = frame base
    mov         ebp, esp                     ; ebp = aligned ebp
    lea         esp, [workspace]             ; reserve wk[] + workspace
    push        ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    get_GOT     ebx                     ; get GOT address

    ; ---- Pass 1: process columns from input, store into work array.

    ; eax still holds the frame base from the prologue, so the reload below
    ; stays commented out (this relies on get_GOT leaving eax untouched).
;   mov         eax, [original_ebp]
    mov         edx, POINTER [dct_table(eax)]    ; quantptr
    mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
    lea         edi, [workspace]                 ; FAST_FLOAT *wsptr
    mov         ecx, DCTSIZE/4                   ; ctr
    alignx      16, 7
.columnloop:
%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
    ; Shortcut: if every AC coefficient in these four columns is zero, the
    ; IDCT of each column is just its dequantized DC term.
    ; Cheap early-out first: look at one dword from each of rows 1 and 2.
    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
    jnz         near .columnDCT

    ; Full test: OR together rows 1..7 of the four columns.
    movq        xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
    movq        xmm2, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
    movq        xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
    movq        xmm4, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
    movq        xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
    movq        xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
    movq        xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
    por         xmm1, xmm2
    por         xmm3, xmm4
    por         xmm5, xmm6
    por         xmm1, xmm3
    por         xmm5, xmm7
    por         xmm1, xmm5
    ; Signed saturation maps any non-zero word to a non-zero byte, so the
    ; four OR-ed words collapse into one dword that can be tested in eax.
    packsswb    xmm1, xmm1
    movd        eax, xmm1
    test        eax, eax
    jnz         short .columnDCT

    ; -- AC terms all zero

    movq        xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]

    ; punpcklwd + arithmetic shift = sign-extend each word to a dword
    punpcklwd   xmm0, xmm0                  ; xmm0=(00 00 01 01 02 02 03 03)
    psrad       xmm0, (DWORD_BIT-WORD_BIT)  ; xmm0=in0=(00 01 02 03)
    cvtdq2ps    xmm0, xmm0                  ; xmm0=in0=(00 01 02 03)

    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

    movaps      xmm1, xmm0
    movaps      xmm2, xmm0
    movaps      xmm3, xmm0

    ; Broadcast each column's DC value across a whole register ...
    shufps      xmm0, xmm0, 0x00        ; xmm0=(00 00 00 00)
    shufps      xmm1, xmm1, 0x55        ; xmm1=(01 01 01 01)
    shufps      xmm2, xmm2, 0xAA        ; xmm2=(02 02 02 02)
    shufps      xmm3, xmm3, 0xFF        ; xmm3=(03 03 03 03)

    ; ... and store it to both halves of the matching workspace line, the
    ; same (transposed) locations the full path below writes.
    movaps      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
    movaps      XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
    movaps      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
    movaps      XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
    movaps      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
    movaps      XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
    movaps      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
    movaps      XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
    jmp         near .nextcolumn
    alignx      16, 7
%endif
.columnDCT:

    ; -- Even part

    ; Load rows 0,2,4,6 (four columns each), sign-extend word->dword,
    ; convert to float and dequantize with the matching multiplier rows.
    movq        xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
    movq        xmm1, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
    movq        xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
    movq        xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]

    punpcklwd   xmm0, xmm0                  ; xmm0=(00 00 01 01 02 02 03 03)
    punpcklwd   xmm1, xmm1                  ; xmm1=(20 20 21 21 22 22 23 23)
    psrad       xmm0, (DWORD_BIT-WORD_BIT)  ; xmm0=in0=(00 01 02 03)
    psrad       xmm1, (DWORD_BIT-WORD_BIT)  ; xmm1=in2=(20 21 22 23)
    cvtdq2ps    xmm0, xmm0                  ; xmm0=in0=(00 01 02 03)
    cvtdq2ps    xmm1, xmm1                  ; xmm1=in2=(20 21 22 23)

    punpcklwd   xmm2, xmm2                  ; xmm2=(40 40 41 41 42 42 43 43)
    punpcklwd   xmm3, xmm3                  ; xmm3=(60 60 61 61 62 62 63 63)
    psrad       xmm2, (DWORD_BIT-WORD_BIT)  ; xmm2=in4=(40 41 42 43)
    psrad       xmm3, (DWORD_BIT-WORD_BIT)  ; xmm3=in6=(60 61 62 63)
    cvtdq2ps    xmm2, xmm2                  ; xmm2=in4=(40 41 42 43)
    cvtdq2ps    xmm3, xmm3                  ; xmm3=in6=(60 61 62 63)

    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    mulps       xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    mulps       xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    mulps       xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

    movaps      xmm4, xmm0
    movaps      xmm5, xmm1
    subps       xmm0, xmm2              ; xmm0=tmp11
    subps       xmm1, xmm3
    addps       xmm4, xmm2              ; xmm4=tmp10
    addps       xmm5, xmm3              ; xmm5=tmp13

    mulps       xmm1, [GOTOFF(ebx,PD_1_414)]
    subps       xmm1, xmm5              ; xmm1=tmp12

    movaps      xmm6, xmm4
    movaps      xmm7, xmm0
    subps       xmm4, xmm5              ; xmm4=tmp3
    subps       xmm0, xmm1              ; xmm0=tmp2
    addps       xmm6, xmm5              ; xmm6=tmp0
    addps       xmm7, xmm1              ; xmm7=tmp1

    ; Spill tmp2/tmp3: all eight XMM registers are needed for the odd part.
    movaps      XMMWORD [wk(1)], xmm4   ; tmp3
    movaps      XMMWORD [wk(0)], xmm0   ; tmp2

    ; -- Odd part

    movq        xmm2, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
    movq        xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
    movq        xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
    movq        xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]

    punpcklwd   xmm2, xmm2                  ; xmm2=(10 10 11 11 12 12 13 13)
    punpcklwd   xmm3, xmm3                  ; xmm3=(30 30 31 31 32 32 33 33)
    psrad       xmm2, (DWORD_BIT-WORD_BIT)  ; xmm2=in1=(10 11 12 13)
    psrad       xmm3, (DWORD_BIT-WORD_BIT)  ; xmm3=in3=(30 31 32 33)
    cvtdq2ps    xmm2, xmm2                  ; xmm2=in1=(10 11 12 13)
    cvtdq2ps    xmm3, xmm3                  ; xmm3=in3=(30 31 32 33)

    punpcklwd   xmm5, xmm5                  ; xmm5=(50 50 51 51 52 52 53 53)
    punpcklwd   xmm1, xmm1                  ; xmm1=(70 70 71 71 72 72 73 73)
    psrad       xmm5, (DWORD_BIT-WORD_BIT)  ; xmm5=in5=(50 51 52 53)
    psrad       xmm1, (DWORD_BIT-WORD_BIT)  ; xmm1=in7=(70 71 72 73)
    cvtdq2ps    xmm5, xmm5                  ; xmm5=in5=(50 51 52 53)
    cvtdq2ps    xmm1, xmm1                  ; xmm1=in7=(70 71 72 73)

    mulps       xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    mulps       xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    mulps       xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    mulps       xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

    movaps      xmm4, xmm2
    movaps      xmm0, xmm5
    addps       xmm2, xmm1              ; xmm2=z11
    addps       xmm5, xmm3              ; xmm5=z13
    subps       xmm4, xmm1              ; xmm4=z12
    subps       xmm0, xmm3              ; xmm0=z10

    movaps      xmm1, xmm2
    subps       xmm2, xmm5
    addps       xmm1, xmm5              ; xmm1=tmp7

    mulps       xmm2, [GOTOFF(ebx,PD_1_414)]  ; xmm2=tmp11

    movaps      xmm3, xmm0
    addps       xmm0, xmm4
    mulps       xmm0, [GOTOFF(ebx,PD_1_847)]   ; xmm0=z5
    mulps       xmm3, [GOTOFF(ebx,PD_M2_613)]  ; xmm3=(z10 * -2.613125930)
    mulps       xmm4, [GOTOFF(ebx,PD_1_082)]   ; xmm4=(z12 * 1.082392200)
    addps       xmm3, xmm0                     ; xmm3=tmp12
    subps       xmm4, xmm0                     ; xmm4=tmp10

    ; -- Final output stage

    subps       xmm3, xmm1              ; xmm3=tmp6
    movaps      xmm5, xmm6
    movaps      xmm0, xmm7
    addps       xmm6, xmm1              ; xmm6=data0=(00 01 02 03)
    addps       xmm7, xmm3              ; xmm7=data1=(10 11 12 13)
    subps       xmm5, xmm1              ; xmm5=data7=(70 71 72 73)
    subps       xmm0, xmm3              ; xmm0=data6=(60 61 62 63)
    subps       xmm2, xmm3              ; xmm2=tmp5

    ; The results are transposed on the fly (4x4 float transposes in two
    ; phases) so that pass 2 can reuse the same vertical data flow.
    movaps      xmm1, xmm6              ; transpose coefficients(phase 1)
    unpcklps    xmm6, xmm7              ; xmm6=(00 10 01 11)
    unpckhps    xmm1, xmm7              ; xmm1=(02 12 03 13)
    movaps      xmm3, xmm0              ; transpose coefficients(phase 1)
    unpcklps    xmm0, xmm5              ; xmm0=(60 70 61 71)
    unpckhps    xmm3, xmm5              ; xmm3=(62 72 63 73)

    ; Swap: reload the spilled tmp2/tmp3, then reuse the same slots for the
    ; half-transposed rows 6/7.
    movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
    movaps      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp3

    movaps      XMMWORD [wk(0)], xmm0   ; wk(0)=(60 70 61 71)
    movaps      XMMWORD [wk(1)], xmm3   ; wk(1)=(62 72 63 73)

    addps       xmm4, xmm2              ; xmm4=tmp4
    movaps      xmm0, xmm7
    movaps      xmm3, xmm5
    addps       xmm7, xmm2              ; xmm7=data2=(20 21 22 23)
    addps       xmm5, xmm4              ; xmm5=data4=(40 41 42 43)
    subps       xmm0, xmm2              ; xmm0=data5=(50 51 52 53)
    subps       xmm3, xmm4              ; xmm3=data3=(30 31 32 33)

    movaps      xmm2, xmm7              ; transpose coefficients(phase 1)
    unpcklps    xmm7, xmm3              ; xmm7=(20 30 21 31)
    unpckhps    xmm2, xmm3              ; xmm2=(22 32 23 33)
    movaps      xmm4, xmm5              ; transpose coefficients(phase 1)
    unpcklps    xmm5, xmm0              ; xmm5=(40 50 41 51)
    unpckhps    xmm4, xmm0              ; xmm4=(42 52 43 53)

    movaps      xmm3, xmm6              ; transpose coefficients(phase 2)
    unpcklps2   xmm6, xmm7              ; xmm6=(00 10 20 30)
    unpckhps2   xmm3, xmm7              ; xmm3=(01 11 21 31)
    movaps      xmm0, xmm1              ; transpose coefficients(phase 2)
    unpcklps2   xmm1, xmm2              ; xmm1=(02 12 22 32)
    unpckhps2   xmm0, xmm2              ; xmm0=(03 13 23 33)

    movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=(60 70 61 71)
    movaps      xmm2, XMMWORD [wk(1)]   ; xmm2=(62 72 63 73)

    movaps      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
    movaps      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
    movaps      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
    movaps      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0

    movaps      xmm6, xmm5              ; transpose coefficients(phase 2)
    unpcklps2   xmm5, xmm7              ; xmm5=(40 50 60 70)
    unpckhps2   xmm6, xmm7              ; xmm6=(41 51 61 71)
    movaps      xmm3, xmm4              ; transpose coefficients(phase 2)
    unpcklps2   xmm4, xmm2              ; xmm4=(42 52 62 72)
    unpckhps2   xmm3, xmm2              ; xmm3=(43 53 63 73)

    movaps      XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
    movaps      XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
    movaps      XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
    movaps      XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3

.nextcolumn:
    ; Step to the next group of four columns (input, multipliers) and the
    ; next four workspace lines.
    add         esi, byte 4*SIZEOF_JCOEF               ; coef_block
    add         edx, byte 4*SIZEOF_FLOAT_MULT_TYPE     ; quantptr
    add         edi,      4*DCTSIZE*SIZEOF_FAST_FLOAT  ; wsptr
    dec         ecx                                    ; ctr
    jnz         near .columnloop

    ; -- Prefetch the next coefficient block

    ; esi has advanced by 8 coefficients in total, so esi + (DCTSIZE2-8)
    ; coefficients is the first byte past the current block.
    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]

    ; ---- Pass 2: process rows from work array, store into output array.

    ; eax was clobbered by the zero-column test, so reload the frame base.
    mov         eax, [original_ebp]
    lea         esi, [workspace]                   ; FAST_FLOAT *wsptr
    mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
    mov         eax, JDIMENSION [output_col(eax)]
    mov         ecx, DCTSIZE/4                     ; ctr
    alignx      16, 7
.rowloop:

    ; -- Even part

    movaps      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
    movaps      xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
    movaps      xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
    movaps      xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]

    movaps      xmm4, xmm0
    movaps      xmm5, xmm1
    subps       xmm0, xmm2              ; xmm0=tmp11
    subps       xmm1, xmm3
    addps       xmm4, xmm2              ; xmm4=tmp10
    addps       xmm5, xmm3              ; xmm5=tmp13

    mulps       xmm1, [GOTOFF(ebx,PD_1_414)]
    subps       xmm1, xmm5              ; xmm1=tmp12

    movaps      xmm6, xmm4
    movaps      xmm7, xmm0
    subps       xmm4, xmm5              ; xmm4=tmp3
    subps       xmm0, xmm1              ; xmm0=tmp2
    addps       xmm6, xmm5              ; xmm6=tmp0
    addps       xmm7, xmm1              ; xmm7=tmp1

    movaps      XMMWORD [wk(1)], xmm4   ; tmp3
    movaps      XMMWORD [wk(0)], xmm0   ; tmp2

    ; -- Odd part

    movaps      xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
    movaps      xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
    movaps      xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
    movaps      xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]

    movaps      xmm4, xmm2
    movaps      xmm0, xmm5
    addps       xmm2, xmm1              ; xmm2=z11
    addps       xmm5, xmm3              ; xmm5=z13
    subps       xmm4, xmm1              ; xmm4=z12
    subps       xmm0, xmm3              ; xmm0=z10

    movaps      xmm1, xmm2
    subps       xmm2, xmm5
    addps       xmm1, xmm5              ; xmm1=tmp7

    mulps       xmm2, [GOTOFF(ebx,PD_1_414)]  ; xmm2=tmp11

    movaps      xmm3, xmm0
    addps       xmm0, xmm4
    mulps       xmm0, [GOTOFF(ebx,PD_1_847)]   ; xmm0=z5
    mulps       xmm3, [GOTOFF(ebx,PD_M2_613)]  ; xmm3=(z10 * -2.613125930)
    mulps       xmm4, [GOTOFF(ebx,PD_1_082)]   ; xmm4=(z12 * 1.082392200)
    addps       xmm3, xmm0                     ; xmm3=tmp12
    subps       xmm4, xmm0                     ; xmm4=tmp10

    ; -- Final output stage

    subps       xmm3, xmm1              ; xmm3=tmp6
    movaps      xmm5, xmm6
    movaps      xmm0, xmm7
    addps       xmm6, xmm1              ; xmm6=data0=(00 10 20 30)
    addps       xmm7, xmm3              ; xmm7=data1=(01 11 21 31)
    subps       xmm5, xmm1              ; xmm5=data7=(07 17 27 37)
    subps       xmm0, xmm3              ; xmm0=data6=(06 16 26 36)
    subps       xmm2, xmm3              ; xmm2=tmp5

    ; Float -> integer without cvtps2dq: adding PD_RNDINT_MAGIC leaves
    ; round(x/8) in the low word of each dword.  The even-numbered results
    ; keep that low word (pand), the odd-numbered ones are shifted into the
    ; high word (pslld), and the two are merged into interleaved words.
    movaps      xmm1, [GOTOFF(ebx,PD_RNDINT_MAGIC)]  ; xmm1=[PD_RNDINT_MAGIC]
    pcmpeqd     xmm3, xmm3
    psrld       xmm3, WORD_BIT          ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}

    addps       xmm6, xmm1              ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
    addps       xmm7, xmm1              ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
    addps       xmm0, xmm1              ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
    addps       xmm5, xmm1              ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)

    pand        xmm6, xmm3              ; xmm6=(00 -- 10 -- 20 -- 30 --)
    pslld       xmm7, WORD_BIT          ; xmm7=(-- 01 -- 11 -- 21 -- 31)
    pand        xmm0, xmm3              ; xmm0=(06 -- 16 -- 26 -- 36 --)
    pslld       xmm5, WORD_BIT          ; xmm5=(-- 07 -- 17 -- 27 -- 37)
    por         xmm6, xmm7              ; xmm6=(00 01 10 11 20 21 30 31)
    por         xmm0, xmm5              ; xmm0=(06 07 16 17 26 27 36 37)

    movaps      xmm1, XMMWORD [wk(0)]   ; xmm1=tmp2
    movaps      xmm3, XMMWORD [wk(1)]   ; xmm3=tmp3

    addps       xmm4, xmm2              ; xmm4=tmp4
    movaps      xmm7, xmm1
    movaps      xmm5, xmm3
    addps       xmm1, xmm2              ; xmm1=data2=(02 12 22 32)
    addps       xmm3, xmm4              ; xmm3=data4=(04 14 24 34)
    subps       xmm7, xmm2              ; xmm7=data5=(05 15 25 35)
    subps       xmm5, xmm4              ; xmm5=data3=(03 13 23 33)

    ; Magic constant and word mask are rebuilt here because their previous
    ; registers (xmm1/xmm3) were reused for tmp2/tmp3 above.
    movaps      xmm2, [GOTOFF(ebx,PD_RNDINT_MAGIC)]  ; xmm2=[PD_RNDINT_MAGIC]
    pcmpeqd     xmm4, xmm4
    psrld       xmm4, WORD_BIT          ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}

    addps       xmm3, xmm2              ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
    addps       xmm7, xmm2              ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
    addps       xmm1, xmm2              ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
    addps       xmm5, xmm2              ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)

    pand        xmm3, xmm4              ; xmm3=(04 -- 14 -- 24 -- 34 --)
    pslld       xmm7, WORD_BIT          ; xmm7=(-- 05 -- 15 -- 25 -- 35)
    pand        xmm1, xmm4              ; xmm1=(02 -- 12 -- 22 -- 32 --)
    pslld       xmm5, WORD_BIT          ; xmm5=(-- 03 -- 13 -- 23 -- 33)
    por         xmm3, xmm7              ; xmm3=(04 05 14 15 24 25 34 35)
    por         xmm1, xmm5              ; xmm1=(02 03 12 13 22 23 32 33)

    movdqa      xmm2, [GOTOFF(ebx,PB_CENTERJSAMP)]  ; xmm2=[PB_CENTERJSAMP]

    ; Range-limit by signed saturation to bytes, then add CENTERJSAMPLE
    ; (byte-wise, wrapping) to re-center the samples.
    packsswb    xmm6, xmm3        ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
    packsswb    xmm1, xmm0        ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
    paddb       xmm6, xmm2
    paddb       xmm1, xmm2

    movdqa      xmm4, xmm6        ; transpose coefficients(phase 2)
    punpcklwd   xmm6, xmm1        ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
    punpckhwd   xmm4, xmm1        ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)

    movdqa      xmm7, xmm6        ; transpose coefficients(phase 3)
    punpckldq   xmm6, xmm4        ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
    punpckhdq   xmm7, xmm4        ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)

    ; Swap the 64-bit halves so rows 1 and 3 can be written with movq too.
    pshufd      xmm5, xmm6, 0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
    pshufd      xmm3, xmm7, 0x4E  ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)

    ; ebx doubles as a second row pointer for the stores below, so the GOT
    ; address is parked around them.
    pushpic     ebx                     ; save GOT address

    ; Write 8 samples to each of four output rows at column offset eax.
    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
    mov         ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
    movq        XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7
    mov         edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
    mov         ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
    movq        XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3

    poppic      ebx                     ; restore GOT address

    add         esi, byte 4*SIZEOF_FAST_FLOAT  ; wsptr
    add         edi, byte 4*SIZEOF_JSAMPROW
    dec         ecx                            ; ctr
    jnz         near .rowloop

    ; Epilogue: undo the prologue in reverse.  "pop esp" reloads the
    ; unaligned frame base stored at [ebp], which points at the saved ebp.
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    pop         ebx
    mov         esp, ebp                ; esp <- aligned ebp
    pop         esp                     ; esp <- original ebp
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32
