;
; jccolext.asm - colorspace conversion (64-bit SSE2)
;
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; Copyright (C) 2009, D. R. Commander.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler); it can *not*
; be assembled with Microsoft's MASM or any compatible assembler (including
; Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jcolsamp.inc"

; --------------------------------------------------------------------------
;
; Convert some rows of samples to the output colorspace.
;
; GLOBAL(void)
; jsimd_rgb_ycc_convert_sse2 (JDIMENSION img_width,
;                             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
;                             JDIMENSION output_row, int num_rows);
;
; Register contents after collect_args (the macro lives in an include file
; that is not visible here; the mapping below is the one this routine
; relies on):
;
; r10 = JDIMENSION img_width
; r11 = JSAMPARRAY input_buf
; r12 = JSAMPIMAGE output_buf
; r13 = JDIMENSION output_row
; r14 = int num_rows
;
; Behaviour visible in this file:
;   * Returns immediately when img_width == 0 or num_rows <= 0.
;   * Each input row holds RGB_PIXELSIZE (3 or 4) interleaved bytes per
;     pixel; each pass of the column loop consumes SIZEOF_XMMWORD pixels and
;     stores one xmmword to each of the three output rows.
;   * The stores use movdqa and always write a whole xmmword, also for the
;     final partial group of a row.  Output rows therefore must be
;     xmmword-aligned and padded to a multiple of SIZEOF_XMMWORD samples.
;   * rbx and rbp are saved and restored here.  Preservation of any other
;     callee-saved register is left to collect_args/uncollect_args
;     (NOTE(review): confirm against their definition).
;
; Stack frame: rbp is an xmmword-aligned frame base.  [rbp] holds the
; caller-side rsp, and wk(0)..wk(WK_NUM-1) are aligned xmmword scratch slots
; directly below rbp.
;

%define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM          8

        align   16

        global  EXTN(jsimd_rgb_ycc_convert_sse2)

EXTN(jsimd_rgb_ycc_convert_sse2):
        push    rbp
        mov     rax,rsp                         ; rax = rsp right after saving rbp
        sub     rsp, byte 4
        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
        mov     [rsp],rax                       ; remember the unaligned rsp
        mov     rbp,rsp                         ; rbp = aligned frame base
        lea     rsp, [wk(0)]                    ; reserve wk[WK_NUM] below rbp
        collect_args
        push    rbx

        mov     ecx, r10d                       ; rcx = img_width (zero-extended)
        test    rcx,rcx
        jz      near .return

        push    rcx                             ; rcx is reused for output_row

        mov     rsi, r12                        ; rsi = output_buf
        mov     ecx, r13d                       ; rcx = output_row
        mov     rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
        mov     rbx, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
        mov     rdx, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
        lea     rdi, [rdi+rcx*SIZEOF_JSAMPROW]  ; &output_buf[0][output_row]
        lea     rbx, [rbx+rcx*SIZEOF_JSAMPROW]  ; &output_buf[1][output_row]
        lea     rdx, [rdx+rcx*SIZEOF_JSAMPROW]  ; &output_buf[2][output_row]

        pop     rcx                             ; rcx = img_width again

        mov     rsi, r11                        ; rsi = input_buf
        mov     eax, r14d                       ; rax = num_rows (upper half zeroed)
        test    eax,eax                         ; signed test on the 32-bit int:
        jle     near .return                    ; testing rax would miss negatives
.rowloop:
        push    rdx
        push    rbx
        push    rdi
        push    rsi
        push    rcx                     ; col

        mov     rsi, JSAMPROW [rsi]     ; inptr
        mov     rdi, JSAMPROW [rdi]     ; outptr0
        mov     rbx, JSAMPROW [rbx]     ; outptr1
        mov     rdx, JSAMPROW [rdx]     ; outptr2

        cmp     rcx, byte SIZEOF_XMMWORD
        jae     near .columnloop

        ; Fewer than SIZEOF_XMMWORD pixels remain in this row.  The leftover
        ; bytes are gathered piecewise, the pieces being selected by the
        ; bits of the remaining count, and rcx is then forced to
        ; SIZEOF_XMMWORD so that exactly one more conversion pass runs.
        ; NOTE(review): lanes above the gathered bytes are not cleared (rax /
        ; xmmA keep whatever they held), so the padding samples written past
        ; img_width are not deterministic.

%if RGB_PIXELSIZE == 3 ; ---------------

.column_ld1:
        push    rax
        push    rdx
        lea     rcx,[rcx+rcx*2]         ; imul ecx,RGB_PIXELSIZE
        test    cl, SIZEOF_BYTE
        jz      short .column_ld2
        sub     rcx, byte SIZEOF_BYTE
        movzx   rax, BYTE [rsi+rcx]
.column_ld2:
        test    cl, SIZEOF_WORD
        jz      short .column_ld4
        sub     rcx, byte SIZEOF_WORD
        movzx   rdx, WORD [rsi+rcx]
        shl     rax, WORD_BIT
        or      rax,rdx
.column_ld4:
        movd    xmmA,eax
        pop     rdx
        pop     rax
        test    cl, SIZEOF_DWORD
        jz      short .column_ld8
        sub     rcx, byte SIZEOF_DWORD
        movd    xmmF, XMM_DWORD [rsi+rcx]
        pslldq  xmmA, SIZEOF_DWORD
        por     xmmA,xmmF
.column_ld8:
        test    cl, SIZEOF_MMWORD
        jz      short .column_ld16
        sub     rcx, byte SIZEOF_MMWORD
        movq    xmmB, XMM_MMWORD [rsi+rcx]
        pslldq  xmmA, SIZEOF_MMWORD
        por     xmmA,xmmB
.column_ld16:
        test    cl, SIZEOF_XMMWORD
        jz      short .column_ld32
        movdqa  xmmF,xmmA
        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        mov     rcx, SIZEOF_XMMWORD
        jmp     short .rgb_ycc_cnv
.column_ld32:
        test    cl, 2*SIZEOF_XMMWORD
        mov     rcx, SIZEOF_XMMWORD     ; mov leaves the flags from test intact
        jz      short .rgb_ycc_cnv
        movdqa  xmmB,xmmA
        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqu  xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
        jmp     short .rgb_ycc_cnv

.columnloop:
        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqu  xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
        movdqu  xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]

.rgb_ycc_cnv:
        ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
        ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
        ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)

        movdqa    xmmG,xmmA
        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
        psrldq    xmmG,8        ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)

        punpckhbw xmmA,xmmF     ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
        pslldq    xmmF,8        ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)

        punpcklbw xmmG,xmmB     ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
        punpckhbw xmmF,xmmB     ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)

        movdqa    xmmD,xmmA
        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
        psrldq    xmmD,8        ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)

        punpckhbw xmmA,xmmG     ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
        pslldq    xmmG,8        ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)

        punpcklbw xmmD,xmmF     ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
        punpckhbw xmmG,xmmF     ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)

        movdqa    xmmE,xmmA
        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
        psrldq    xmmE,8        ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)

        punpckhbw xmmA,xmmD     ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
        pslldq    xmmD,8        ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)

        punpcklbw xmmE,xmmG     ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
        punpckhbw xmmD,xmmG     ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)

        pxor      xmmH,xmmH

        movdqa    xmmC,xmmA
        punpcklbw xmmA,xmmH     ; xmmA=(00 02 04 06 08 0A 0C 0E)
        punpckhbw xmmC,xmmH     ; xmmC=(10 12 14 16 18 1A 1C 1E)

        movdqa    xmmB,xmmE
        punpcklbw xmmE,xmmH     ; xmmE=(20 22 24 26 28 2A 2C 2E)
        punpckhbw xmmB,xmmH     ; xmmB=(01 03 05 07 09 0B 0D 0F)

        movdqa    xmmF,xmmD
        punpcklbw xmmD,xmmH     ; xmmD=(11 13 15 17 19 1B 1D 1F)
        punpckhbw xmmF,xmmH     ; xmmF=(21 23 25 27 29 2B 2D 2F)

%else ; RGB_PIXELSIZE == 4 ; -----------

.column_ld1:
        test    cl, SIZEOF_XMMWORD/16
        jz      short .column_ld2
        sub     rcx, byte SIZEOF_XMMWORD/16
        movd    xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld2:
        test    cl, SIZEOF_XMMWORD/8
        jz      short .column_ld4
        sub     rcx, byte SIZEOF_XMMWORD/8
        movq    xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
        pslldq  xmmA, SIZEOF_MMWORD
        por     xmmA,xmmE
.column_ld4:
        test    cl, SIZEOF_XMMWORD/4
        jz      short .column_ld8
        sub     rcx, byte SIZEOF_XMMWORD/4
        movdqa  xmmE,xmmA
        movdqu  xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld8:
        test    cl, SIZEOF_XMMWORD/2
        mov     rcx, SIZEOF_XMMWORD     ; mov leaves the flags from test intact
        jz      short .rgb_ycc_cnv
        movdqa  xmmF,xmmA
        movdqa  xmmH,xmmE
        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqu  xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
        jmp     short .rgb_ycc_cnv

.columnloop:
        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqu  xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
        movdqu  xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
        movdqu  xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]

.rgb_ycc_cnv:
        ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
        ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
        ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
        ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)

        movdqa    xmmD,xmmA
        punpcklbw xmmA,xmmE     ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
        punpckhbw xmmD,xmmE     ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)

        movdqa    xmmC,xmmF
        punpcklbw xmmF,xmmH     ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
        punpckhbw xmmC,xmmH     ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)

        movdqa    xmmB,xmmA
        punpcklwd xmmA,xmmF     ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
        punpckhwd xmmB,xmmF     ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)

        movdqa    xmmG,xmmD
        punpcklwd xmmD,xmmC     ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
        punpckhwd xmmG,xmmC     ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)

        movdqa    xmmE,xmmA
        punpcklbw xmmA,xmmD     ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
        punpckhbw xmmE,xmmD     ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)

        movdqa    xmmH,xmmB
        punpcklbw xmmB,xmmG     ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
        punpckhbw xmmH,xmmG     ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)

        pxor      xmmF,xmmF

        movdqa    xmmC,xmmA
        punpcklbw xmmA,xmmF     ; xmmA=(00 02 04 06 08 0A 0C 0E)
        punpckhbw xmmC,xmmF     ; xmmC=(10 12 14 16 18 1A 1C 1E)

        movdqa    xmmD,xmmB
        punpcklbw xmmB,xmmF     ; xmmB=(01 03 05 07 09 0B 0D 0F)
        punpckhbw xmmD,xmmF     ; xmmD=(11 13 15 17 19 1B 1D 1F)

        movdqa    xmmG,xmmE
        punpcklbw xmmE,xmmF     ; xmmE=(20 22 24 26 28 2A 2C 2E)
        punpckhbw xmmG,xmmF     ; xmmG=(30 32 34 36 38 3A 3C 3E)

        punpcklbw xmmF,xmmH
        punpckhbw xmmH,xmmH
        psrlw     xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
        psrlw     xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)

%endif ; RGB_PIXELSIZE ; ---------------

        ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
        ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO

        ; (Original)
        ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
        ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
        ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
        ;
        ; (This implementation)
        ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
        ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
        ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE

        movdqa    XMMWORD [wk(0)], xmm0 ; wk(0)=RE
        movdqa    XMMWORD [wk(1)], xmm1 ; wk(1)=RO
        movdqa    XMMWORD [wk(2)], xmm4 ; wk(2)=BE
        movdqa    XMMWORD [wk(3)], xmm5 ; wk(3)=BO

        movdqa    xmm6,xmm1
        punpcklwd xmm1,xmm3
        punpckhwd xmm6,xmm3
        movdqa    xmm7,xmm1
        movdqa    xmm4,xmm6
        pmaddwd   xmm1,[rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
        pmaddwd   xmm6,[rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
        pmaddwd   xmm7,[rel PW_MF016_MF033] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
        pmaddwd   xmm4,[rel PW_MF016_MF033] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)

        movdqa    XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
        movdqa    XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)

        pxor      xmm1,xmm1
        pxor      xmm6,xmm6
        punpcklwd xmm1,xmm5             ; xmm1=BOL
        punpckhwd xmm6,xmm5             ; xmm6=BOH
        psrld     xmm1,1                ; xmm1=BOL*FIX(0.500)
        psrld     xmm6,1                ; xmm6=BOH*FIX(0.500)

        movdqa    xmm5,[rel PD_ONEHALFM1_CJ] ; xmm5=[PD_ONEHALFM1_CJ]

        paddd     xmm7,xmm1
        paddd     xmm4,xmm6
        paddd     xmm7,xmm5
        paddd     xmm4,xmm5
        psrld     xmm7,SCALEBITS        ; xmm7=CbOL
        psrld     xmm4,SCALEBITS        ; xmm4=CbOH
        packssdw  xmm7,xmm4             ; xmm7=CbO

        movdqa    xmm1, XMMWORD [wk(2)] ; xmm1=BE

        movdqa    xmm6,xmm0
        punpcklwd xmm0,xmm2
        punpckhwd xmm6,xmm2
        movdqa    xmm5,xmm0
        movdqa    xmm4,xmm6
        pmaddwd   xmm0,[rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
        pmaddwd   xmm6,[rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
        pmaddwd   xmm5,[rel PW_MF016_MF033] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
        pmaddwd   xmm4,[rel PW_MF016_MF033] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)

        movdqa    XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
        movdqa    XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)

        pxor      xmm0,xmm0
        pxor      xmm6,xmm6
        punpcklwd xmm0,xmm1             ; xmm0=BEL
        punpckhwd xmm6,xmm1             ; xmm6=BEH
        psrld     xmm0,1                ; xmm0=BEL*FIX(0.500)
        psrld     xmm6,1                ; xmm6=BEH*FIX(0.500)

        movdqa    xmm1,[rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]

        paddd     xmm5,xmm0
        paddd     xmm4,xmm6
        paddd     xmm5,xmm1
        paddd     xmm4,xmm1
        psrld     xmm5,SCALEBITS        ; xmm5=CbEL
        psrld     xmm4,SCALEBITS        ; xmm4=CbEH
        packssdw  xmm5,xmm4             ; xmm5=CbE

        psllw     xmm7,BYTE_BIT
        por       xmm5,xmm7             ; xmm5=Cb
        movdqa    XMMWORD [rbx], xmm5   ; Save Cb

        movdqa    xmm0, XMMWORD [wk(3)] ; xmm0=BO
        movdqa    xmm6, XMMWORD [wk(2)] ; xmm6=BE
        movdqa    xmm1, XMMWORD [wk(1)] ; xmm1=RO

        movdqa    xmm4,xmm0
        punpcklwd xmm0,xmm3
        punpckhwd xmm4,xmm3
        movdqa    xmm7,xmm0
        movdqa    xmm5,xmm4
        pmaddwd   xmm0,[rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
        pmaddwd   xmm4,[rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
        pmaddwd   xmm7,[rel PW_MF008_MF041] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
        pmaddwd   xmm5,[rel PW_MF008_MF041] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)

        movdqa    xmm3,[rel PD_ONEHALF] ; xmm3=[PD_ONEHALF]

        paddd     xmm0, XMMWORD [wk(4)]
        paddd     xmm4, XMMWORD [wk(5)]
        paddd     xmm0,xmm3
        paddd     xmm4,xmm3
        psrld     xmm0,SCALEBITS        ; xmm0=YOL
        psrld     xmm4,SCALEBITS        ; xmm4=YOH
        packssdw  xmm0,xmm4             ; xmm0=YO

        pxor      xmm3,xmm3
        pxor      xmm4,xmm4
        punpcklwd xmm3,xmm1             ; xmm3=ROL
        punpckhwd xmm4,xmm1             ; xmm4=ROH
        psrld     xmm3,1                ; xmm3=ROL*FIX(0.500)
        psrld     xmm4,1                ; xmm4=ROH*FIX(0.500)

        movdqa    xmm1,[rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]

        paddd     xmm7,xmm3
        paddd     xmm5,xmm4
        paddd     xmm7,xmm1
        paddd     xmm5,xmm1
        psrld     xmm7,SCALEBITS        ; xmm7=CrOL
        psrld     xmm5,SCALEBITS        ; xmm5=CrOH
        packssdw  xmm7,xmm5             ; xmm7=CrO

        movdqa    xmm3, XMMWORD [wk(0)] ; xmm3=RE

        movdqa    xmm4,xmm6
        punpcklwd xmm6,xmm2
        punpckhwd xmm4,xmm2
        movdqa    xmm1,xmm6
        movdqa    xmm5,xmm4
        pmaddwd   xmm6,[rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
        pmaddwd   xmm4,[rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
        pmaddwd   xmm1,[rel PW_MF008_MF041] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
        pmaddwd   xmm5,[rel PW_MF008_MF041] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)

        movdqa    xmm2,[rel PD_ONEHALF] ; xmm2=[PD_ONEHALF]

        paddd     xmm6, XMMWORD [wk(6)]
        paddd     xmm4, XMMWORD [wk(7)]
        paddd     xmm6,xmm2
        paddd     xmm4,xmm2
        psrld     xmm6,SCALEBITS        ; xmm6=YEL
        psrld     xmm4,SCALEBITS        ; xmm4=YEH
        packssdw  xmm6,xmm4             ; xmm6=YE

        psllw     xmm0,BYTE_BIT
        por       xmm6,xmm0             ; xmm6=Y
        movdqa    XMMWORD [rdi], xmm6   ; Save Y

        pxor      xmm2,xmm2
        pxor      xmm4,xmm4
        punpcklwd xmm2,xmm3             ; xmm2=REL
        punpckhwd xmm4,xmm3             ; xmm4=REH
        psrld     xmm2,1                ; xmm2=REL*FIX(0.500)
        psrld     xmm4,1                ; xmm4=REH*FIX(0.500)

        movdqa    xmm0,[rel PD_ONEHALFM1_CJ] ; xmm0=[PD_ONEHALFM1_CJ]

        paddd     xmm1,xmm2
        paddd     xmm5,xmm4
        paddd     xmm1,xmm0
        paddd     xmm5,xmm0
        psrld     xmm1,SCALEBITS        ; xmm1=CrEL
        psrld     xmm5,SCALEBITS        ; xmm5=CrEH
        packssdw  xmm1,xmm5             ; xmm1=CrE

        psllw     xmm7,BYTE_BIT
        por       xmm1,xmm7             ; xmm1=Cr
        movdqa    XMMWORD [rdx], xmm1   ; Save Cr

        sub     rcx, byte SIZEOF_XMMWORD
        add     rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
        add     rdi, byte SIZEOF_XMMWORD                ; outptr0
        add     rbx, byte SIZEOF_XMMWORD                ; outptr1
        add     rdx, byte SIZEOF_XMMWORD                ; outptr2
        cmp     rcx, byte SIZEOF_XMMWORD
        jae     near .columnloop
        test    rcx,rcx
        jnz     near .column_ld1

        pop     rcx                     ; col
        pop     rsi
        pop     rdi
        pop     rbx
        pop     rdx

        add     rsi, byte SIZEOF_JSAMPROW       ; input_buf
        add     rdi, byte SIZEOF_JSAMPROW
        add     rbx, byte SIZEOF_JSAMPROW
        add     rdx, byte SIZEOF_JSAMPROW
        dec     rax                             ; num_rows
        jg      near .rowloop

.return:
        pop     rbx
        uncollect_args
        mov     rsp,rbp         ; rsp <- aligned rbp
        pop     rsp             ; rsp <- rsp value saved in the prologue
        pop     rbp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16
