; [NOTE(review): removed HTML code-viewer navigation chrome ("Home", "Line#",
;  "Scopes#", "Navigate", "Raw", "Download") captured along with the source
;  during extraction -- it is not part of the assembly file.]
;
; jcclrss2-64.asm - colorspace conversion (64-bit SSE2)
;
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; Copyright (C) 2009, D. R. Commander.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jcolsamp.inc"

; --------------------------------------------------------------------------
;
; Convert some rows of samples to the output colorspace.
;
; GLOBAL(void)
; jsimd_rgb_ycc_convert_sse2 (JDIMENSION img_width,
;                             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
;                             JDIMENSION output_row, int num_rows);
;

; Argument registers as unpacked by collect_args (see jsimdext.inc):
; r10 = JDIMENSION img_width
; r11 = JSAMPARRAY input_buf
; r12 = JSAMPIMAGE output_buf
; r13 = JDIMENSION output_row
; r14 = int num_rows

; wk(i) addresses the i-th xmmword spill slot immediately below the
; xmmword-aligned frame pointer (rbp), so movdqa may be used on it.
%define wk(i)		rbp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
%define WK_NUM		8
;-----------------------------------------------------------------------
; GLOBAL(void)
; jsimd_rgb_ycc_convert_sse2 (JDIMENSION img_width,
;                             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
;                             JDIMENSION output_row, int num_rows)
;
; Converts num_rows rows of packed RGB samples (RGB_PIXELSIZE = 3 or 4,
; selected at assembly time) into separate Y, Cb and Cr sample rows,
; 16 pixels per iteration.  Arguments arrive in r10-r14 via collect_args.
; rbp is realigned to SIZEOF_XMMWORD so the wk() spill slots and the
; output stores can use aligned moves (movdqa).
;-----------------------------------------------------------------------

	align	16

	global	EXTN(jsimd_rgb_ycc_convert_sse2) PRIVATE

EXTN(jsimd_rgb_ycc_convert_sse2):
	push	rbp
	mov	rax,rsp				; rax = original rbp
	sub	rsp, byte 4
	and	rsp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
	mov	[rsp],rax			; save original rsp below the aligned frame
	mov	rbp,rsp				; rbp = aligned rbp
	lea	rsp, [wk(0)]			; reserve WK_NUM xmmword spill slots
	collect_args
	push	rbx

	mov	rcx, r10			; rcx = img_width (columns remaining)
	test	rcx,rcx
	jz	near .return			; nothing to do for zero-width image

	push	rcx

	mov rsi, r12
	mov rcx, r13
	mov	rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
	mov	rbx, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
	mov	rdx, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
	lea	rdi, [rdi+rcx*SIZEOF_JSAMPROW]	; rdi = Y rows, starting at output_row
	lea	rbx, [rbx+rcx*SIZEOF_JSAMPROW]	; rbx = Cb rows
	lea	rdx, [rdx+rcx*SIZEOF_JSAMPROW]	; rdx = Cr rows

	pop	rcx

	mov rsi, r11				; rsi = input_buf
	mov	eax, r14d			; rax = num_rows (zero-extended;
	test	rax,rax				;   assumes num_rows fits in 32 bits)
	jle	near .return
.rowloop:
	push	rdx
	push	rbx
	push	rdi
	push	rsi
	push	rcx			; col

	mov	rsi, JSAMPROW [rsi]	; inptr
	mov	rdi, JSAMPROW [rdi]	; outptr0
	mov	rbx, JSAMPROW [rbx]	; outptr1
	mov	rdx, JSAMPROW [rdx]	; outptr2

	cmp	rcx, byte SIZEOF_XMMWORD
	jae	near .columnloop

%if RGB_PIXELSIZE == 3 ; ---------------

; Tail handling for a partial block (< 16 pixels): gather the remaining
; col*3 bytes with progressively larger loads so we never read past the
; end of the row buffer.
.column_ld1:
	push	rax
	push	rdx
	lea	rcx,[rcx+rcx*2]		; imul ecx,RGB_PIXELSIZE
	test	cl, SIZEOF_BYTE
	jz	short .column_ld2
	sub	rcx, byte SIZEOF_BYTE
	movzx	rax, BYTE [rsi+rcx]
.column_ld2:
	test	cl, SIZEOF_WORD
	jz	short .column_ld4
	sub	rcx, byte SIZEOF_WORD
	movzx	rdx, WORD [rsi+rcx]
	shl	rax, WORD_BIT
	or	rax,rdx
.column_ld4:
	movd	xmmA,eax
	pop	rdx
	pop	rax
	test	cl, SIZEOF_DWORD
	jz	short .column_ld8
	sub	rcx, byte SIZEOF_DWORD
	movd	xmmF, XMM_DWORD [rsi+rcx]
	pslldq	xmmA, SIZEOF_DWORD
	por	xmmA,xmmF
.column_ld8:
	test	cl, SIZEOF_MMWORD
	jz	short .column_ld16
	sub	rcx, byte SIZEOF_MMWORD
	movq	xmmB, XMM_MMWORD [rsi+rcx]
	pslldq	xmmA, SIZEOF_MMWORD
	por	xmmA,xmmB
.column_ld16:
	test	cl, SIZEOF_XMMWORD
	jz	short .column_ld32
	movdqa	xmmF,xmmA
	movdqu	xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
	mov	rcx, SIZEOF_XMMWORD
	jmp	short .rgb_ycc_cnv
.column_ld32:
	test	cl, 2*SIZEOF_XMMWORD
	mov	rcx, SIZEOF_XMMWORD
	jz	short .rgb_ycc_cnv
	movdqa	xmmB,xmmA
	movdqu	xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
	movdqu	xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
	jmp	short .rgb_ycc_cnv

.columnloop:
	movdqu	xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
	movdqu	xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
	movdqu	xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]

.rgb_ycc_cnv:
	; De-interleave 16 packed RGB pixels (48 bytes) into separate
	; even-pixel and odd-pixel R, G, B word vectors.
	; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
	; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
	; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)

	movdqa    xmmG,xmmA
	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
	psrldq    xmmG,8	; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)

	punpckhbw xmmA,xmmF	; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
	pslldq    xmmF,8	; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)

	punpcklbw xmmG,xmmB	; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
	punpckhbw xmmF,xmmB	; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)

	movdqa    xmmD,xmmA
	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
	psrldq    xmmD,8	; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)

	punpckhbw xmmA,xmmG	; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
	pslldq    xmmG,8	; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)

	punpcklbw xmmD,xmmF	; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
	punpckhbw xmmG,xmmF	; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)

	movdqa    xmmE,xmmA
	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
	psrldq    xmmE,8	; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)

	punpckhbw xmmA,xmmD	; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
	pslldq    xmmD,8	; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)

	punpcklbw xmmE,xmmG	; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
	punpckhbw xmmD,xmmG	; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)

	pxor      xmmH,xmmH

	movdqa    xmmC,xmmA
	punpcklbw xmmA,xmmH	; xmmA=(00 02 04 06 08 0A 0C 0E)
	punpckhbw xmmC,xmmH	; xmmC=(10 12 14 16 18 1A 1C 1E)

	movdqa    xmmB,xmmE
	punpcklbw xmmE,xmmH	; xmmE=(20 22 24 26 28 2A 2C 2E)
	punpckhbw xmmB,xmmH	; xmmB=(01 03 05 07 09 0B 0D 0F)

	movdqa    xmmF,xmmD
	punpcklbw xmmD,xmmH	; xmmD=(11 13 15 17 19 1B 1D 1F)
	punpckhbw xmmF,xmmH	; xmmF=(21 23 25 27 29 2B 2D 2F)

%else ; RGB_PIXELSIZE == 4 ; -----------

; Tail handling for a partial block (< 16 pixels), 4 bytes per pixel.
.column_ld1:
	test	cl, SIZEOF_XMMWORD/16
	jz	short .column_ld2
	sub	rcx, byte SIZEOF_XMMWORD/16
	movd	xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld2:
	test	cl, SIZEOF_XMMWORD/8
	jz	short .column_ld4
	sub	rcx, byte SIZEOF_XMMWORD/8
	movq	xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
	pslldq	xmmA, SIZEOF_MMWORD
	por	xmmA,xmmE
.column_ld4:
	test	cl, SIZEOF_XMMWORD/4
	jz	short .column_ld8
	sub	rcx, byte SIZEOF_XMMWORD/4
	movdqa	xmmE,xmmA
	movdqu	xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld8:
	test	cl, SIZEOF_XMMWORD/2
	mov	rcx, SIZEOF_XMMWORD
	jz	short .rgb_ycc_cnv
	movdqa	xmmF,xmmA
	movdqa	xmmH,xmmE
	movdqu	xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
	movdqu	xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
	jmp	short .rgb_ycc_cnv

.columnloop:
	movdqu	xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
	movdqu	xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
	movdqu	xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
	movdqu	xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]

.rgb_ycc_cnv:
	; De-interleave 16 packed RGBX pixels (64 bytes) into separate
	; even-pixel and odd-pixel R, G, B word vectors.
	; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
	; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
	; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
	; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)

	movdqa    xmmD,xmmA
	punpcklbw xmmA,xmmE	; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
	punpckhbw xmmD,xmmE	; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)

	movdqa    xmmC,xmmF
	punpcklbw xmmF,xmmH	; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
	punpckhbw xmmC,xmmH	; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)

	movdqa    xmmB,xmmA
	punpcklwd xmmA,xmmF	; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
	punpckhwd xmmB,xmmF	; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)

	movdqa    xmmG,xmmD
	punpcklwd xmmD,xmmC	; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
	punpckhwd xmmG,xmmC	; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)

	movdqa    xmmE,xmmA
	punpcklbw xmmA,xmmD	; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
	punpckhbw xmmE,xmmD	; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)

	movdqa    xmmH,xmmB
	punpcklbw xmmB,xmmG	; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
	punpckhbw xmmH,xmmG	; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)

	pxor      xmmF,xmmF

	movdqa    xmmC,xmmA
	punpcklbw xmmA,xmmF	; xmmA=(00 02 04 06 08 0A 0C 0E)
	punpckhbw xmmC,xmmF	; xmmC=(10 12 14 16 18 1A 1C 1E)

	movdqa    xmmD,xmmB
	punpcklbw xmmB,xmmF	; xmmB=(01 03 05 07 09 0B 0D 0F)
	punpckhbw xmmD,xmmF	; xmmD=(11 13 15 17 19 1B 1D 1F)

	movdqa    xmmG,xmmE
	punpcklbw xmmE,xmmF	; xmmE=(20 22 24 26 28 2A 2C 2E)
	punpckhbw xmmG,xmmF	; xmmG=(30 32 34 36 38 3A 3C 3E)

	punpcklbw xmmF,xmmH
	punpckhbw xmmH,xmmH
	psrlw     xmmF,BYTE_BIT	; xmmF=(21 23 25 27 29 2B 2D 2F)
	psrlw     xmmH,BYTE_BIT	; xmmH=(31 33 35 37 39 3B 3D 3F)

%endif ; RGB_PIXELSIZE ; ---------------

	; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
	; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO

	; (Original)
	; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
	; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
	; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
	;
	; (This implementation)
	; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
	; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
	; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
	;
	; (The 0.500 coefficients are realized as a right shift by 1 on the
	; word values widened to dwords; pmaddwd supplies the paired
	; multiply-accumulate for the remaining coefficients.)

	movdqa    XMMWORD [wk(0)], xmm0	; wk(0)=RE
	movdqa    XMMWORD [wk(1)], xmm1	; wk(1)=RO
	movdqa    XMMWORD [wk(2)], xmm4	; wk(2)=BE
	movdqa    XMMWORD [wk(3)], xmm5	; wk(3)=BO

	movdqa    xmm6,xmm1
	punpcklwd xmm1,xmm3
	punpckhwd xmm6,xmm3
	movdqa    xmm7,xmm1
	movdqa    xmm4,xmm6
	pmaddwd   xmm1,[rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
	pmaddwd   xmm6,[rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
	pmaddwd   xmm7,[rel PW_MF016_MF033] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
	pmaddwd   xmm4,[rel PW_MF016_MF033] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)

	movdqa    XMMWORD [wk(4)], xmm1	; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
	movdqa    XMMWORD [wk(5)], xmm6	; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)

	pxor      xmm1,xmm1
	pxor      xmm6,xmm6
	punpcklwd xmm1,xmm5		; xmm1=BOL
	punpckhwd xmm6,xmm5		; xmm6=BOH
	psrld     xmm1,1		; xmm1=BOL*FIX(0.500)
	psrld     xmm6,1		; xmm6=BOH*FIX(0.500)

	movdqa    xmm5,[rel PD_ONEHALFM1_CJ] ; xmm5=[PD_ONEHALFM1_CJ]

	paddd     xmm7,xmm1
	paddd     xmm4,xmm6
	paddd     xmm7,xmm5
	paddd     xmm4,xmm5
	psrld     xmm7,SCALEBITS	; xmm7=CbOL
	psrld     xmm4,SCALEBITS	; xmm4=CbOH
	packssdw  xmm7,xmm4		; xmm7=CbO

	movdqa    xmm1, XMMWORD [wk(2)]	; xmm1=BE

	movdqa    xmm6,xmm0
	punpcklwd xmm0,xmm2
	punpckhwd xmm6,xmm2
	movdqa    xmm5,xmm0
	movdqa    xmm4,xmm6
	pmaddwd   xmm0,[rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
	pmaddwd   xmm6,[rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
	pmaddwd   xmm5,[rel PW_MF016_MF033] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
	pmaddwd   xmm4,[rel PW_MF016_MF033] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)

	movdqa    XMMWORD [wk(6)], xmm0	; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
	movdqa    XMMWORD [wk(7)], xmm6	; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)

	pxor      xmm0,xmm0
	pxor      xmm6,xmm6
	punpcklwd xmm0,xmm1		; xmm0=BEL
	punpckhwd xmm6,xmm1		; xmm6=BEH
	psrld     xmm0,1		; xmm0=BEL*FIX(0.500)
	psrld     xmm6,1		; xmm6=BEH*FIX(0.500)

	movdqa    xmm1,[rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]

	paddd     xmm5,xmm0
	paddd     xmm4,xmm6
	paddd     xmm5,xmm1
	paddd     xmm4,xmm1
	psrld     xmm5,SCALEBITS	; xmm5=CbEL
	psrld     xmm4,SCALEBITS	; xmm4=CbEH
	packssdw  xmm5,xmm4		; xmm5=CbE

	psllw     xmm7,BYTE_BIT		; re-interleave odd bytes above even bytes
	por       xmm5,xmm7		; xmm5=Cb
	movdqa    XMMWORD [rbx], xmm5	; Save Cb

	movdqa    xmm0, XMMWORD [wk(3)]	; xmm0=BO
	movdqa    xmm6, XMMWORD [wk(2)]	; xmm6=BE
	movdqa    xmm1, XMMWORD [wk(1)]	; xmm1=RO

	movdqa    xmm4,xmm0
	punpcklwd xmm0,xmm3
	punpckhwd xmm4,xmm3
	movdqa    xmm7,xmm0
	movdqa    xmm5,xmm4
	pmaddwd   xmm0,[rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
	pmaddwd   xmm4,[rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
	pmaddwd   xmm7,[rel PW_MF008_MF041] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
	pmaddwd   xmm5,[rel PW_MF008_MF041] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)

	movdqa    xmm3,[rel PD_ONEHALF]	; xmm3=[PD_ONEHALF]

	paddd     xmm0, XMMWORD [wk(4)]
	paddd     xmm4, XMMWORD [wk(5)]
	paddd     xmm0,xmm3
	paddd     xmm4,xmm3
	psrld     xmm0,SCALEBITS	; xmm0=YOL
	psrld     xmm4,SCALEBITS	; xmm4=YOH
	packssdw  xmm0,xmm4		; xmm0=YO

	pxor      xmm3,xmm3
	pxor      xmm4,xmm4
	punpcklwd xmm3,xmm1		; xmm3=ROL
	punpckhwd xmm4,xmm1		; xmm4=ROH
	psrld     xmm3,1		; xmm3=ROL*FIX(0.500)
	psrld     xmm4,1		; xmm4=ROH*FIX(0.500)

	movdqa    xmm1,[rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]

	paddd     xmm7,xmm3
	paddd     xmm5,xmm4
	paddd     xmm7,xmm1
	paddd     xmm5,xmm1
	psrld     xmm7,SCALEBITS	; xmm7=CrOL
	psrld     xmm5,SCALEBITS	; xmm5=CrOH
	packssdw  xmm7,xmm5		; xmm7=CrO

	movdqa    xmm3, XMMWORD [wk(0)]	; xmm3=RE

	movdqa    xmm4,xmm6
	punpcklwd xmm6,xmm2
	punpckhwd xmm4,xmm2
	movdqa    xmm1,xmm6
	movdqa    xmm5,xmm4
	pmaddwd   xmm6,[rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
	pmaddwd   xmm4,[rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
	pmaddwd   xmm1,[rel PW_MF008_MF041] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
	pmaddwd   xmm5,[rel PW_MF008_MF041] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)

	movdqa    xmm2,[rel PD_ONEHALF]	; xmm2=[PD_ONEHALF]

	paddd     xmm6, XMMWORD [wk(6)]
	paddd     xmm4, XMMWORD [wk(7)]
	paddd     xmm6,xmm2
	paddd     xmm4,xmm2
	psrld     xmm6,SCALEBITS	; xmm6=YEL
	psrld     xmm4,SCALEBITS	; xmm4=YEH
	packssdw  xmm6,xmm4		; xmm6=YE

	psllw     xmm0,BYTE_BIT		; re-interleave odd bytes above even bytes
	por       xmm6,xmm0		; xmm6=Y
	movdqa    XMMWORD [rdi], xmm6	; Save Y

	pxor      xmm2,xmm2
	pxor      xmm4,xmm4
	punpcklwd xmm2,xmm3		; xmm2=REL
	punpckhwd xmm4,xmm3		; xmm4=REH
	psrld     xmm2,1		; xmm2=REL*FIX(0.500)
	psrld     xmm4,1		; xmm4=REH*FIX(0.500)

	movdqa    xmm0,[rel PD_ONEHALFM1_CJ] ; xmm0=[PD_ONEHALFM1_CJ]

	paddd     xmm1,xmm2
	paddd     xmm5,xmm4
	paddd     xmm1,xmm0
	paddd     xmm5,xmm0
	psrld     xmm1,SCALEBITS	; xmm1=CrEL
	psrld     xmm5,SCALEBITS	; xmm5=CrEH
	packssdw  xmm1,xmm5		; xmm1=CrE

	psllw     xmm7,BYTE_BIT		; re-interleave odd bytes above even bytes
	por       xmm1,xmm7		; xmm1=Cr
	movdqa    XMMWORD [rdx], xmm1	; Save Cr

	sub	rcx, byte SIZEOF_XMMWORD
	add	rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; inptr
	add	rdi, byte SIZEOF_XMMWORD		; outptr0
	add	rbx, byte SIZEOF_XMMWORD		; outptr1
	add	rdx, byte SIZEOF_XMMWORD		; outptr2
	cmp	rcx, byte SIZEOF_XMMWORD
	jae	near .columnloop
	test	rcx,rcx
	jnz	near .column_ld1	; handle the partial final block

	pop	rcx			; col
	pop	rsi
	pop	rdi
	pop	rbx
	pop	rdx

	add	rsi, byte SIZEOF_JSAMPROW	; input_buf
	add	rdi, byte SIZEOF_JSAMPROW
	add	rbx, byte SIZEOF_JSAMPROW
	add	rdx, byte SIZEOF_JSAMPROW
	dec	rax				; num_rows
	jg	near .rowloop

.return:
	pop	rbx
	uncollect_args
	mov	rsp,rbp		; rsp <- aligned rbp
	pop	rsp		; rsp <- original rbp
	pop	rbp
	ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
	align	16