;
; jidctflt.asm - floating-point IDCT (64-bit SSE & SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains a floating-point implementation of the inverse DCT
; (Discrete Cosine Transform). The following code is based directly on
; the IJG's original jidctflt.c; see the jidctflt.c for more details.

; Project-local NASM include files.  The code in this file uses names that it
; does not define itself (for example EXTN, GLOBAL_DATA, GLOBAL_FUNCTION,
; collect_args / uncollect_args, alignz, XMMBLOCK / MMBLOCK / DWBLOCK,
; SIZEOF_XMMWORD, DCTSIZE, DCTSIZE2, CENTERJSAMPLE); they are presumably
; provided by these two includes -- confirm against the include files.
%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------

; unpcklps2 dst, src
;   Combine the low halves of two 4-float vectors:
;       dst = (dst[0] dst[1] src[0] src[1])
;   shufps selector 0x44 = 01_00_01_00b: result lanes 0 and 1 take elements
;   0 and 1 of the destination, lanes 2 and 3 take elements 0 and 1 of the
;   source.  Used below as the second phase of the 4x4 float transpose.
%macro unpcklps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
    shufps      %1, %2, 0x44
%endmacro

; unpckhps2 dst, src
;   Combine the high halves of two 4-float vectors:
;       dst = (dst[2] dst[3] src[2] src[3])
;   shufps selector 0xEE = 11_10_11_10b: result lanes 0 and 1 take elements
;   2 and 3 of the destination, lanes 2 and 3 take elements 2 and 3 of the
;   source.  Counterpart of unpcklps2 in the 4x4 float transpose.
%macro unpckhps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
    shufps      %1, %2, 0xEE
%endmacro

; --------------------------------------------------------------------------
35    SECTION     SEG_CONST
36
37    alignz      32
38    GLOBAL_DATA(jconst_idct_float_sse2)
39
40EXTN(jconst_idct_float_sse2):
41
42PD_1_414        times 4  dd  1.414213562373095048801689
43PD_1_847        times 4  dd  1.847759065022573512256366
44PD_1_082        times 4  dd  1.082392200292393968799446
45PD_M2_613       times 4  dd -2.613125929752753055713286
46PD_RNDINT_MAGIC times 4  dd  100663296.0  ; (float)(0x00C00000 << 3)
47PB_CENTERJSAMP  times 16 db  CENTERJSAMPLE
48
49    alignz      32
50
51; --------------------------------------------------------------------------
52    SECTION     SEG_TEXT
53    BITS        64
54;
55; Perform dequantization and inverse DCT on one block of coefficients.
56;
57; GLOBAL(void)
58; jsimd_idct_float_sse2(void *dct_table, JCOEFPTR coef_block,
59;                       JSAMPARRAY output_buf, JDIMENSION output_col)
60;
61
62; r10 = void *dct_table
63; r11 = JCOEFPTR coef_block
64; r12 = JSAMPARRAY output_buf
65; r13d = JDIMENSION output_col
66
67%define original_rbp  rbp + 0
68%define wk(i)         rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
69                                        ; xmmword wk[WK_NUM]
70%define WK_NUM        2
71%define workspace     wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
72                                        ; FAST_FLOAT workspace[DCTSIZE2]
73
74    align       32
75    GLOBAL_FUNCTION(jsimd_idct_float_sse2)
76
77EXTN(jsimd_idct_float_sse2):
78    push        rbp
79    mov         rax, rsp                     ; rax = original rbp
80    sub         rsp, byte 4
81    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
82    mov         [rsp], rax
83    mov         rbp, rsp                     ; rbp = aligned rbp
84    lea         rsp, [workspace]
85    collect_args 4
86    push        rbx
87
88    ; ---- Pass 1: process columns from input, store into work array.
89
90    mov         rdx, r10                ; quantptr
91    mov         rsi, r11                ; inptr
92    lea         rdi, [workspace]        ; FAST_FLOAT *wsptr
93    mov         rcx, DCTSIZE/4          ; ctr
94.columnloop:
95%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
96    mov         eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
97    or          eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
98    jnz         near .columnDCT
99
100    movq        xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
101    movq        xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
102    movq        xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
103    movq        xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
104    movq        xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
105    movq        xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
106    movq        xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
107    por         xmm1, xmm2
108    por         xmm3, xmm4
109    por         xmm5, xmm6
110    por         xmm1, xmm3
111    por         xmm5, xmm7
112    por         xmm1, xmm5
113    packsswb    xmm1, xmm1
114    movd        eax, xmm1
115    test        rax, rax
116    jnz         short .columnDCT
117
118    ; -- AC terms all zero
119
120    movq        xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
121
122    punpcklwd   xmm0, xmm0                  ; xmm0=(00 00 01 01 02 02 03 03)
123    psrad       xmm0, (DWORD_BIT-WORD_BIT)  ; xmm0=in0=(00 01 02 03)
124    cvtdq2ps    xmm0, xmm0                  ; xmm0=in0=(00 01 02 03)
125
126    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
127
128    movaps      xmm1, xmm0
129    movaps      xmm2, xmm0
130    movaps      xmm3, xmm0
131
132    shufps      xmm0, xmm0, 0x00        ; xmm0=(00 00 00 00)
133    shufps      xmm1, xmm1, 0x55        ; xmm1=(01 01 01 01)
134    shufps      xmm2, xmm2, 0xAA        ; xmm2=(02 02 02 02)
135    shufps      xmm3, xmm3, 0xFF        ; xmm3=(03 03 03 03)
136
137    movaps      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
138    movaps      XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
139    movaps      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
140    movaps      XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
141    movaps      XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
142    movaps      XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2
143    movaps      XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
144    movaps      XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
145    jmp         near .nextcolumn
146%endif
147.columnDCT:
148
149    ; -- Even part
150
151    movq        xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
152    movq        xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
153    movq        xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
154    movq        xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
155
156    punpcklwd   xmm0, xmm0                  ; xmm0=(00 00 01 01 02 02 03 03)
157    punpcklwd   xmm1, xmm1                  ; xmm1=(20 20 21 21 22 22 23 23)
158    psrad       xmm0, (DWORD_BIT-WORD_BIT)  ; xmm0=in0=(00 01 02 03)
159    psrad       xmm1, (DWORD_BIT-WORD_BIT)  ; xmm1=in2=(20 21 22 23)
160    cvtdq2ps    xmm0, xmm0                  ; xmm0=in0=(00 01 02 03)
161    cvtdq2ps    xmm1, xmm1                  ; xmm1=in2=(20 21 22 23)
162
163    punpcklwd   xmm2, xmm2                  ; xmm2=(40 40 41 41 42 42 43 43)
164    punpcklwd   xmm3, xmm3                  ; xmm3=(60 60 61 61 62 62 63 63)
165    psrad       xmm2, (DWORD_BIT-WORD_BIT)  ; xmm2=in4=(40 41 42 43)
166    psrad       xmm3, (DWORD_BIT-WORD_BIT)  ; xmm3=in6=(60 61 62 63)
167    cvtdq2ps    xmm2, xmm2                  ; xmm2=in4=(40 41 42 43)
168    cvtdq2ps    xmm3, xmm3                  ; xmm3=in6=(60 61 62 63)
169
170    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
171    mulps       xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
172    mulps       xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
173    mulps       xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
174
175    movaps      xmm4, xmm0
176    movaps      xmm5, xmm1
177    subps       xmm0, xmm2              ; xmm0=tmp11
178    subps       xmm1, xmm3
179    addps       xmm4, xmm2              ; xmm4=tmp10
180    addps       xmm5, xmm3              ; xmm5=tmp13
181
182    mulps       xmm1, [rel PD_1_414]
183    subps       xmm1, xmm5              ; xmm1=tmp12
184
185    movaps      xmm6, xmm4
186    movaps      xmm7, xmm0
187    subps       xmm4, xmm5              ; xmm4=tmp3
188    subps       xmm0, xmm1              ; xmm0=tmp2
189    addps       xmm6, xmm5              ; xmm6=tmp0
190    addps       xmm7, xmm1              ; xmm7=tmp1
191
192    movaps      XMMWORD [wk(1)], xmm4   ; tmp3
193    movaps      XMMWORD [wk(0)], xmm0   ; tmp2
194
195    ; -- Odd part
196
197    movq        xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
198    movq        xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
199    movq        xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
200    movq        xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
201
202    punpcklwd   xmm2, xmm2                  ; xmm2=(10 10 11 11 12 12 13 13)
203    punpcklwd   xmm3, xmm3                  ; xmm3=(30 30 31 31 32 32 33 33)
204    psrad       xmm2, (DWORD_BIT-WORD_BIT)  ; xmm2=in1=(10 11 12 13)
205    psrad       xmm3, (DWORD_BIT-WORD_BIT)  ; xmm3=in3=(30 31 32 33)
206    cvtdq2ps    xmm2, xmm2                  ; xmm2=in1=(10 11 12 13)
207    cvtdq2ps    xmm3, xmm3                  ; xmm3=in3=(30 31 32 33)
208
209    punpcklwd   xmm5, xmm5                  ; xmm5=(50 50 51 51 52 52 53 53)
210    punpcklwd   xmm1, xmm1                  ; xmm1=(70 70 71 71 72 72 73 73)
211    psrad       xmm5, (DWORD_BIT-WORD_BIT)  ; xmm5=in5=(50 51 52 53)
212    psrad       xmm1, (DWORD_BIT-WORD_BIT)  ; xmm1=in7=(70 71 72 73)
213    cvtdq2ps    xmm5, xmm5                  ; xmm5=in5=(50 51 52 53)
214    cvtdq2ps    xmm1, xmm1                  ; xmm1=in7=(70 71 72 73)
215
216    mulps       xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
217    mulps       xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
218    mulps       xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
219    mulps       xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
220
221    movaps      xmm4, xmm2
222    movaps      xmm0, xmm5
223    addps       xmm2, xmm1              ; xmm2=z11
224    addps       xmm5, xmm3              ; xmm5=z13
225    subps       xmm4, xmm1              ; xmm4=z12
226    subps       xmm0, xmm3              ; xmm0=z10
227
228    movaps      xmm1, xmm2
229    subps       xmm2, xmm5
230    addps       xmm1, xmm5              ; xmm1=tmp7
231
232    mulps       xmm2, [rel PD_1_414]    ; xmm2=tmp11
233
234    movaps      xmm3, xmm0
235    addps       xmm0, xmm4
236    mulps       xmm0, [rel PD_1_847]    ; xmm0=z5
237    mulps       xmm3, [rel PD_M2_613]   ; xmm3=(z10 * -2.613125930)
238    mulps       xmm4, [rel PD_1_082]    ; xmm4=(z12 * 1.082392200)
239    addps       xmm3, xmm0              ; xmm3=tmp12
240    subps       xmm4, xmm0              ; xmm4=tmp10
241
242    ; -- Final output stage
243
244    subps       xmm3, xmm1              ; xmm3=tmp6
245    movaps      xmm5, xmm6
246    movaps      xmm0, xmm7
247    addps       xmm6, xmm1              ; xmm6=data0=(00 01 02 03)
248    addps       xmm7, xmm3              ; xmm7=data1=(10 11 12 13)
249    subps       xmm5, xmm1              ; xmm5=data7=(70 71 72 73)
250    subps       xmm0, xmm3              ; xmm0=data6=(60 61 62 63)
251    subps       xmm2, xmm3              ; xmm2=tmp5
252
253    movaps      xmm1, xmm6              ; transpose coefficients(phase 1)
254    unpcklps    xmm6, xmm7              ; xmm6=(00 10 01 11)
255    unpckhps    xmm1, xmm7              ; xmm1=(02 12 03 13)
256    movaps      xmm3, xmm0              ; transpose coefficients(phase 1)
257    unpcklps    xmm0, xmm5              ; xmm0=(60 70 61 71)
258    unpckhps    xmm3, xmm5              ; xmm3=(62 72 63 73)
259
260    movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
261    movaps      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp3
262
263    movaps      XMMWORD [wk(0)], xmm0   ; wk(0)=(60 70 61 71)
264    movaps      XMMWORD [wk(1)], xmm3   ; wk(1)=(62 72 63 73)
265
266    addps       xmm4, xmm2              ; xmm4=tmp4
267    movaps      xmm0, xmm7
268    movaps      xmm3, xmm5
269    addps       xmm7, xmm2              ; xmm7=data2=(20 21 22 23)
270    addps       xmm5, xmm4              ; xmm5=data4=(40 41 42 43)
271    subps       xmm0, xmm2              ; xmm0=data5=(50 51 52 53)
272    subps       xmm3, xmm4              ; xmm3=data3=(30 31 32 33)
273
274    movaps      xmm2, xmm7              ; transpose coefficients(phase 1)
275    unpcklps    xmm7, xmm3              ; xmm7=(20 30 21 31)
276    unpckhps    xmm2, xmm3              ; xmm2=(22 32 23 33)
277    movaps      xmm4, xmm5              ; transpose coefficients(phase 1)
278    unpcklps    xmm5, xmm0              ; xmm5=(40 50 41 51)
279    unpckhps    xmm4, xmm0              ; xmm4=(42 52 43 53)
280
281    movaps      xmm3, xmm6              ; transpose coefficients(phase 2)
282    unpcklps2   xmm6, xmm7              ; xmm6=(00 10 20 30)
283    unpckhps2   xmm3, xmm7              ; xmm3=(01 11 21 31)
284    movaps      xmm0, xmm1              ; transpose coefficients(phase 2)
285    unpcklps2   xmm1, xmm2              ; xmm1=(02 12 22 32)
286    unpckhps2   xmm0, xmm2              ; xmm0=(03 13 23 33)
287
288    movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=(60 70 61 71)
289    movaps      xmm2, XMMWORD [wk(1)]   ; xmm2=(62 72 63 73)
290
291    movaps      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6
292    movaps      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
293    movaps      XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
294    movaps      XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
295
296    movaps      xmm6, xmm5              ; transpose coefficients(phase 2)
297    unpcklps2   xmm5, xmm7              ; xmm5=(40 50 60 70)
298    unpckhps2   xmm6, xmm7              ; xmm6=(41 51 61 71)
299    movaps      xmm3, xmm4              ; transpose coefficients(phase 2)
300    unpcklps2   xmm4, xmm2              ; xmm4=(42 52 62 72)
301    unpckhps2   xmm3, xmm2              ; xmm3=(43 53 63 73)
302
303    movaps      XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5
304    movaps      XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6
305    movaps      XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4
306    movaps      XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
307
308.nextcolumn:
309    add         rsi, byte 4*SIZEOF_JCOEF               ; coef_block
310    add         rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE     ; quantptr
311    add         rdi,      4*DCTSIZE*SIZEOF_FAST_FLOAT  ; wsptr
312    dec         rcx                                    ; ctr
313    jnz         near .columnloop
314
315    ; -- Prefetch the next coefficient block
316
317    prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
318    prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
319    prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
320    prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
321
322    ; ---- Pass 2: process rows from work array, store into output array.
323
324    mov         rax, [original_rbp]
325    lea         rsi, [workspace]        ; FAST_FLOAT *wsptr
326    mov         rdi, r12                ; (JSAMPROW *)
327    mov         eax, r13d
328    mov         rcx, DCTSIZE/4          ; ctr
329.rowloop:
330
331    ; -- Even part
332
333    movaps      xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
334    movaps      xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)]
335    movaps      xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)]
336    movaps      xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)]
337
338    movaps      xmm4, xmm0
339    movaps      xmm5, xmm1
340    subps       xmm0, xmm2              ; xmm0=tmp11
341    subps       xmm1, xmm3
342    addps       xmm4, xmm2              ; xmm4=tmp10
343    addps       xmm5, xmm3              ; xmm5=tmp13
344
345    mulps       xmm1, [rel PD_1_414]
346    subps       xmm1, xmm5              ; xmm1=tmp12
347
348    movaps      xmm6, xmm4
349    movaps      xmm7, xmm0
350    subps       xmm4, xmm5              ; xmm4=tmp3
351    subps       xmm0, xmm1              ; xmm0=tmp2
352    addps       xmm6, xmm5              ; xmm6=tmp0
353    addps       xmm7, xmm1              ; xmm7=tmp1
354
355    movaps      XMMWORD [wk(1)], xmm4   ; tmp3
356    movaps      XMMWORD [wk(0)], xmm0   ; tmp2
357
358    ; -- Odd part
359
360    movaps      xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
361    movaps      xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)]
362    movaps      xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)]
363    movaps      xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)]
364
365    movaps      xmm4, xmm2
366    movaps      xmm0, xmm5
367    addps       xmm2, xmm1              ; xmm2=z11
368    addps       xmm5, xmm3              ; xmm5=z13
369    subps       xmm4, xmm1              ; xmm4=z12
370    subps       xmm0, xmm3              ; xmm0=z10
371
372    movaps      xmm1, xmm2
373    subps       xmm2, xmm5
374    addps       xmm1, xmm5              ; xmm1=tmp7
375
376    mulps       xmm2, [rel PD_1_414]    ; xmm2=tmp11
377
378    movaps      xmm3, xmm0
379    addps       xmm0, xmm4
380    mulps       xmm0, [rel PD_1_847]    ; xmm0=z5
381    mulps       xmm3, [rel PD_M2_613]   ; xmm3=(z10 * -2.613125930)
382    mulps       xmm4, [rel PD_1_082]    ; xmm4=(z12 * 1.082392200)
383    addps       xmm3, xmm0              ; xmm3=tmp12
384    subps       xmm4, xmm0              ; xmm4=tmp10
385
386    ; -- Final output stage
387
388    subps       xmm3, xmm1              ; xmm3=tmp6
389    movaps      xmm5, xmm6
390    movaps      xmm0, xmm7
391    addps       xmm6, xmm1              ; xmm6=data0=(00 10 20 30)
392    addps       xmm7, xmm3              ; xmm7=data1=(01 11 21 31)
393    subps       xmm5, xmm1              ; xmm5=data7=(07 17 27 37)
394    subps       xmm0, xmm3              ; xmm0=data6=(06 16 26 36)
395    subps       xmm2, xmm3              ; xmm2=tmp5
396
397    movaps      xmm1, [rel PD_RNDINT_MAGIC]  ; xmm1=[rel PD_RNDINT_MAGIC]
398    pcmpeqd     xmm3, xmm3
399    psrld       xmm3, WORD_BIT          ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
400
401    addps       xmm6, xmm1              ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
402    addps       xmm7, xmm1              ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
403    addps       xmm0, xmm1              ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
404    addps       xmm5, xmm1              ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
405
406    pand        xmm6, xmm3              ; xmm6=(00 -- 10 -- 20 -- 30 --)
407    pslld       xmm7, WORD_BIT          ; xmm7=(-- 01 -- 11 -- 21 -- 31)
408    pand        xmm0, xmm3              ; xmm0=(06 -- 16 -- 26 -- 36 --)
409    pslld       xmm5, WORD_BIT          ; xmm5=(-- 07 -- 17 -- 27 -- 37)
410    por         xmm6, xmm7              ; xmm6=(00 01 10 11 20 21 30 31)
411    por         xmm0, xmm5              ; xmm0=(06 07 16 17 26 27 36 37)
412
413    movaps      xmm1,  XMMWORD [wk(0)]  ; xmm1=tmp2
414    movaps      xmm3,  XMMWORD [wk(1)]  ; xmm3=tmp3
415
416    addps       xmm4, xmm2              ; xmm4=tmp4
417    movaps      xmm7, xmm1
418    movaps      xmm5, xmm3
419    addps       xmm1, xmm2              ; xmm1=data2=(02 12 22 32)
420    addps       xmm3, xmm4              ; xmm3=data4=(04 14 24 34)
421    subps       xmm7, xmm2              ; xmm7=data5=(05 15 25 35)
422    subps       xmm5, xmm4              ; xmm5=data3=(03 13 23 33)
423
424    movaps      xmm2, [rel PD_RNDINT_MAGIC]  ; xmm2=[rel PD_RNDINT_MAGIC]
425    pcmpeqd     xmm4, xmm4
426    psrld       xmm4, WORD_BIT          ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
427
428    addps       xmm3, xmm2              ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
429    addps       xmm7, xmm2              ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
430    addps       xmm1, xmm2              ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
431    addps       xmm5, xmm2              ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
432
433    pand        xmm3, xmm4              ; xmm3=(04 -- 14 -- 24 -- 34 --)
434    pslld       xmm7, WORD_BIT          ; xmm7=(-- 05 -- 15 -- 25 -- 35)
435    pand        xmm1, xmm4              ; xmm1=(02 -- 12 -- 22 -- 32 --)
436    pslld       xmm5, WORD_BIT          ; xmm5=(-- 03 -- 13 -- 23 -- 33)
437    por         xmm3, xmm7              ; xmm3=(04 05 14 15 24 25 34 35)
438    por         xmm1, xmm5              ; xmm1=(02 03 12 13 22 23 32 33)
439
440    movdqa      xmm2, [rel PB_CENTERJSAMP]  ; xmm2=[rel PB_CENTERJSAMP]
441
442    packsswb    xmm6, xmm3        ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
443    packsswb    xmm1, xmm0        ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
444    paddb       xmm6, xmm2
445    paddb       xmm1, xmm2
446
447    movdqa      xmm4, xmm6        ; transpose coefficients(phase 2)
448    punpcklwd   xmm6, xmm1        ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
449    punpckhwd   xmm4, xmm1        ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
450
451    movdqa      xmm7, xmm6        ; transpose coefficients(phase 3)
452    punpckldq   xmm6, xmm4        ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
453    punpckhdq   xmm7, xmm4        ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
454
455    pshufd      xmm5, xmm6, 0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
456    pshufd      xmm3, xmm7, 0x4E  ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
457
458    mov         rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
459    mov         rbx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
460    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
461    movq        XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7
462    mov         rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
463    mov         rbx, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
464    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
465    movq        XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3
466
467    add         rsi, byte 4*SIZEOF_FAST_FLOAT  ; wsptr
468    add         rdi, byte 4*SIZEOF_JSAMPROW
469    dec         rcx                            ; ctr
470    jnz         near .rowloop
471
472    pop         rbx
473    uncollect_args 4
474    mov         rsp, rbp                ; rsp <- aligned rbp
475    pop         rsp                     ; rsp <- original rbp
476    pop         rbp
477    ret
478
479; For some reason, the OS X linker does not honor the request to align the
480; segment unless we do this.
481    align       32
482