• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1;
2; jdmrgss2-64.asm - merged upsampling/color conversion (64-bit SSE2)
3;
4; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
5; Copyright 2009, 2012 D. R. Commander
6;
7; Based on
8; x86 SIMD extension for IJG JPEG library
9; Copyright (C) 1999-2006, MIYASAKA Masaru.
10; For conditions of distribution and use, see copyright notice in jsimdext.inc
11;
12; This file should be assembled with NASM (Netwide Assembler),
13; can *not* be assembled with Microsoft's MASM or any compatible
14; assembler (including Borland's Turbo Assembler).
15; NASM is available from http://nasm.sourceforge.net/ or
16; http://sourceforge.net/project/showfiles.php?group_id=6208
17;
18; [TAB8]
19
20%include "jcolsamp.inc"
21
22; --------------------------------------------------------------------------
23;
24; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
25;
26; GLOBAL(void)
27; jsimd_h2v1_merged_upsample_sse2 (JDIMENSION output_width,
28;                                  JSAMPIMAGE input_buf,
29;                                  JDIMENSION in_row_group_ctr,
30;                                  JSAMPARRAY output_buf);
31;
32
33; r10 = JDIMENSION output_width
34; r11 = JSAMPIMAGE input_buf
35; r12 = JDIMENSION in_row_group_ctr
36; r13 = JSAMPARRAY output_buf
37
; Scratch area: WK_NUM xmmword slots located immediately below the aligned
; frame pointer.  wk(i) yields the address expression of slot i.
%define WK_NUM          3
%define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
40
; --------------------------------------------------------------------------
; jsimd_h2v1_merged_upsample_sse2
;
; After collect_args:
;   r10 = JDIMENSION output_width       (only the low 32 bits are meaningful)
;   r11 = JSAMPIMAGE input_buf
;   r12 = JDIMENSION in_row_group_ctr   (only the low 32 bits are meaningful)
;   r13 = JSAMPARRAY output_buf
;
; Working registers inside the loops:
;   rsi = inptr0 (Y row)     rbx = inptr1 (Cb row)    rdx = inptr2 (Cr row)
;   rdi = outptr             rcx = columns remaining  al  = Yctr (2 -> 0)
;   wk(0) = (B-Y)H, wk(1) = (R-Y)H, wk(2) = (G-Y)H
;
; Each pass of .columnloop consumes 16 Cb and 16 Cr samples and, in up to
; two passes of the Y loop, 32 Y samples.  The low chroma halves stay in
; registers for the first Y pass; the high halves are spilled to wk() and
; reloaded at .Yloop_2nd.
;
; JDIMENSION is a 32-bit unsigned type in libjpeg, and the calling
; convention leaves the upper half of a register that carries a 32-bit
; argument undefined.  Both JDIMENSION arguments are therefore copied with
; 32-bit moves, which zero-extend into the full 64-bit register.
; --------------------------------------------------------------------------

    align       16
    global      EXTN(jsimd_h2v1_merged_upsample_sse2) PRIVATE

EXTN(jsimd_h2v1_merged_upsample_sse2):
    push        rbp
    mov         rax, rsp                ; rax = rsp after the push (address of saved rbp)
    sub         rsp, byte 4
    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [rsp], rax              ; remember the unaligned rsp for the epilogue
    mov         rbp, rsp                ; rbp = aligned rbp
    lea         rsp, [wk(0)]            ; reserve wk[WK_NUM] below rbp
    collect_args
    push        rbx

    mov         ecx, r10d               ; col (zero-extended output_width)
    test        rcx, rcx
    jz          near .return

    push        rcx                     ; keep col while rcx is the row index

    mov         rdi, r11
    mov         ecx, r12d               ; zero-extended in_row_group_ctr
    mov         rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
    mov         rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
    mov         rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
    mov         rdi, r13
    mov         rsi, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW]  ; inptr0
    mov         rbx, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW]  ; inptr1
    mov         rdx, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW]  ; inptr2
    mov         rdi, JSAMPROW [rdi]                      ; outptr

    pop         rcx                     ; col

.columnloop:

    movdqa      xmm6, XMMWORD [rbx]     ; xmm6=Cb(0123456789ABCDEF)
    movdqa      xmm7, XMMWORD [rdx]     ; xmm7=Cr(0123456789ABCDEF)

    pxor        xmm1, xmm1              ; xmm1=(all 0's)
    pcmpeqw     xmm3, xmm3
    psllw       xmm3, 7                 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}

    movdqa      xmm4, xmm6
    punpckhbw   xmm6, xmm1              ; xmm6=Cb(89ABCDEF)=CbH
    punpcklbw   xmm4, xmm1              ; xmm4=Cb(01234567)=CbL
    movdqa      xmm0, xmm7
    punpckhbw   xmm7, xmm1              ; xmm7=Cr(89ABCDEF)=CrH
    punpcklbw   xmm0, xmm1              ; xmm0=Cr(01234567)=CrL

    ; 0xFF80 is -128 as a signed word: re-centre the chroma samples.
    paddw       xmm6, xmm3
    paddw       xmm4, xmm3
    paddw       xmm7, xmm3
    paddw       xmm0, xmm3

    ; (Original)
    ; R = Y                + 1.40200 * Cr
    ; G = Y - 0.34414 * Cb - 0.71414 * Cr
    ; B = Y + 1.77200 * Cb
    ;
    ; (This implementation)
    ; R = Y                + 0.40200 * Cr + Cr
    ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
    ; B = Y - 0.22800 * Cb + Cb + Cb

    movdqa      xmm5, xmm6              ; xmm5=CbH
    movdqa      xmm2, xmm4              ; xmm2=CbL
    paddw       xmm6, xmm6              ; xmm6=2*CbH
    paddw       xmm4, xmm4              ; xmm4=2*CbL
    movdqa      xmm1, xmm7              ; xmm1=CrH
    movdqa      xmm3, xmm0              ; xmm3=CrL
    paddw       xmm7, xmm7              ; xmm7=2*CrH
    paddw       xmm0, xmm0              ; xmm0=2*CrL

    pmulhw      xmm6, [rel PW_MF0228]   ; xmm6=(2*CbH * -FIX(0.22800))
    pmulhw      xmm4, [rel PW_MF0228]   ; xmm4=(2*CbL * -FIX(0.22800))
    pmulhw      xmm7, [rel PW_F0402]    ; xmm7=(2*CrH * FIX(0.40200))
    pmulhw      xmm0, [rel PW_F0402]    ; xmm0=(2*CrL * FIX(0.40200))

    ; Add one and shift right by one: rounded halving of the doubled product.
    paddw       xmm6, [rel PW_ONE]
    paddw       xmm4, [rel PW_ONE]
    psraw       xmm6, 1                 ; xmm6=(CbH * -FIX(0.22800))
    psraw       xmm4, 1                 ; xmm4=(CbL * -FIX(0.22800))
    paddw       xmm7, [rel PW_ONE]
    paddw       xmm0, [rel PW_ONE]
    psraw       xmm7, 1                 ; xmm7=(CrH * FIX(0.40200))
    psraw       xmm0, 1                 ; xmm0=(CrL * FIX(0.40200))

    paddw       xmm6, xmm5
    paddw       xmm4, xmm2
    paddw       xmm6, xmm5              ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
    paddw       xmm4, xmm2              ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
    paddw       xmm7, xmm1              ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
    paddw       xmm0, xmm3              ; xmm0=(CrL * FIX(1.40200))=(R-Y)L

    movdqa      XMMWORD [wk(0)], xmm6   ; wk(0)=(B-Y)H
    movdqa      XMMWORD [wk(1)], xmm7   ; wk(1)=(R-Y)H

    ; Interleave Cb with Cr so each pmaddwd forms Cb*c0 + Cr*c1 per dword.
    movdqa      xmm6, xmm5
    movdqa      xmm7, xmm2
    punpcklwd   xmm5, xmm1
    punpckhwd   xmm6, xmm1
    pmaddwd     xmm5, [rel PW_MF0344_F0285]
    pmaddwd     xmm6, [rel PW_MF0344_F0285]
    punpcklwd   xmm2, xmm3
    punpckhwd   xmm7, xmm3
    pmaddwd     xmm2, [rel PW_MF0344_F0285]
    pmaddwd     xmm7, [rel PW_MF0344_F0285]

    paddd       xmm5, [rel PD_ONEHALF]
    paddd       xmm6, [rel PD_ONEHALF]
    psrad       xmm5, SCALEBITS
    psrad       xmm6, SCALEBITS
    paddd       xmm2, [rel PD_ONEHALF]
    paddd       xmm7, [rel PD_ONEHALF]
    psrad       xmm2, SCALEBITS
    psrad       xmm7, SCALEBITS

    packssdw    xmm5, xmm6              ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
    packssdw    xmm2, xmm7              ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
    psubw       xmm5, xmm1              ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
    psubw       xmm2, xmm3              ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L

    movdqa      XMMWORD [wk(2)], xmm5   ; wk(2)=(G-Y)H

    ; First Y pass uses the low halves still held in xmm0/xmm2/xmm4.
    mov         al, 2                   ; Yctr
    jmp         short .Yloop_1st

.Yloop_2nd:
    movdqa      xmm0, XMMWORD [wk(1)]   ; xmm0=(R-Y)H
    movdqa      xmm2, XMMWORD [wk(2)]   ; xmm2=(G-Y)H
    movdqa      xmm4, XMMWORD [wk(0)]   ; xmm4=(B-Y)H

.Yloop_1st:
    movdqa      xmm7, XMMWORD [rsi]     ; xmm7=Y(0123456789ABCDEF)

    pcmpeqw     xmm6, xmm6
    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}
    pand        xmm6, xmm7              ; xmm6=Y(02468ACE)=YE
    psrlw       xmm7, BYTE_BIT          ; xmm7=Y(13579BDF)=YO

    movdqa      xmm1, xmm0              ; xmm1=xmm0=(R-Y)(L/H)
    movdqa      xmm3, xmm2              ; xmm3=xmm2=(G-Y)(L/H)
    movdqa      xmm5, xmm4              ; xmm5=xmm4=(B-Y)(L/H)

    paddw       xmm0, xmm6              ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
    paddw       xmm1, xmm7              ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
    packuswb    xmm0, xmm0              ; xmm0=R(02468ACE********)
    packuswb    xmm1, xmm1              ; xmm1=R(13579BDF********)

    paddw       xmm2, xmm6              ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
    paddw       xmm3, xmm7              ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
    packuswb    xmm2, xmm2              ; xmm2=G(02468ACE********)
    packuswb    xmm3, xmm3              ; xmm3=G(13579BDF********)

    paddw       xmm4, xmm6              ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
    paddw       xmm5, xmm7              ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
    packuswb    xmm4, xmm4              ; xmm4=B(02468ACE********)
    packuswb    xmm5, xmm5              ; xmm5=B(13579BDF********)

%if RGB_PIXELSIZE == 3  ; ---------------

    ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
    ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
    ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
    ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)

    punpcklbw   xmmA, xmmC              ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
    punpcklbw   xmmE, xmmB              ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
    punpcklbw   xmmD, xmmF              ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)

    movdqa      xmmG, xmmA
    movdqa      xmmH, xmmA
    punpcklwd   xmmA, xmmE              ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
    punpckhwd   xmmG, xmmE              ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)

    psrldq      xmmH, 2                 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
    psrldq      xmmE, 2                 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)

    movdqa      xmmC, xmmD
    movdqa      xmmB, xmmD
    punpcklwd   xmmD, xmmH              ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
    punpckhwd   xmmC, xmmH              ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)

    psrldq      xmmB, 2                 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)

    movdqa      xmmF, xmmE
    punpcklwd   xmmE, xmmB              ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
    punpckhwd   xmmF, xmmB              ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)

    pshufd      xmmH, xmmA, 0x4E        ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
    movdqa      xmmB, xmmE
    punpckldq   xmmA, xmmD              ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
    punpckldq   xmmE, xmmH              ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
    punpckhdq   xmmD, xmmB              ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)

    pshufd      xmmH, xmmG, 0x4E        ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
    movdqa      xmmB, xmmF
    punpckldq   xmmG, xmmC              ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
    punpckldq   xmmF, xmmH              ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
    punpckhdq   xmmC, xmmB              ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)

    punpcklqdq  xmmA, xmmE              ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
    punpcklqdq  xmmD, xmmG              ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
    punpcklqdq  xmmF, xmmC              ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)

    ; Fewer than 16 pixels left: take the partial-store tail.
    cmp         rcx, byte SIZEOF_XMMWORD
    jb          short .column_st32

    test        rdi, SIZEOF_XMMWORD-1
    jnz         short .out1
    ; --(aligned)-------------------
    movntdq     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    movntdq     XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
    movntdq     XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
    jmp         short .out0
.out1:  ; --(unaligned)-----------------
    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
    movdqu      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
.out0:
    add         rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
    sub         rcx, byte SIZEOF_XMMWORD
    jz          near .endcolumn

    add         rsi, byte SIZEOF_XMMWORD  ; inptr0
    dec         al                      ; Yctr
    jnz         near .Yloop_2nd

    add         rbx, byte SIZEOF_XMMWORD  ; inptr1
    add         rdx, byte SIZEOF_XMMWORD  ; inptr2
    jmp         near .columnloop

.column_st32:
    ; From here on rcx counts output BYTES (pixels * 3), not pixels.
    lea         rcx, [rcx+rcx*2]        ; imul ecx, RGB_PIXELSIZE
    cmp         rcx, byte 2*SIZEOF_XMMWORD
    jb          short .column_st16
    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
    add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr
    movdqa      xmmA, xmmF
    sub         rcx, byte 2*SIZEOF_XMMWORD
    jmp         short .column_st15
.column_st16:
    cmp         rcx, byte SIZEOF_XMMWORD
    jb          short .column_st15
    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    add         rdi, byte SIZEOF_XMMWORD  ; outptr
    movdqa      xmmA, xmmD
    sub         rcx, byte SIZEOF_XMMWORD
.column_st15:
    ; Store the lower 8 bytes of xmmA to the output when it has enough
    ; space.
    cmp         rcx, byte SIZEOF_MMWORD
    jb          short .column_st7
    movq        XMM_MMWORD [rdi], xmmA
    add         rdi, byte SIZEOF_MMWORD
    sub         rcx, byte SIZEOF_MMWORD
    psrldq      xmmA, SIZEOF_MMWORD
.column_st7:
    ; Store the lower 4 bytes of xmmA to the output when it has enough
    ; space.
    cmp         rcx, byte SIZEOF_DWORD
    jb          short .column_st3
    movd        XMM_DWORD [rdi], xmmA
    add         rdi, byte SIZEOF_DWORD
    sub         rcx, byte SIZEOF_DWORD
    psrldq      xmmA, SIZEOF_DWORD
.column_st3:
    ; Store the lower 2 bytes of rax to the output when it has enough
    ; space.
    movd        eax, xmmA
    cmp         rcx, byte SIZEOF_WORD
    jb          short .column_st1
    mov         WORD [rdi], ax
    add         rdi, byte SIZEOF_WORD
    sub         rcx, byte SIZEOF_WORD
    shr         rax, 16
.column_st1:
    ; Store the lower 1 byte of rax to the output when it has enough
    ; space.
    test        rcx, rcx
    jz          short .endcolumn
    mov         BYTE [rdi], al

%else  ; RGB_PIXELSIZE == 4 ; -----------

%ifdef RGBX_FILLER_0XFF
    pcmpeqb     xmm6, xmm6              ; xmm6=XE=X(02468ACE********)
    pcmpeqb     xmm7, xmm7              ; xmm7=XO=X(13579BDF********)
%else
    pxor        xmm6, xmm6              ; xmm6=XE=X(02468ACE********)
    pxor        xmm7, xmm7              ; xmm7=XO=X(13579BDF********)
%endif
    ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
    ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
    ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
    ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)

    punpcklbw   xmmA, xmmC              ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
    punpcklbw   xmmE, xmmG              ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
    punpcklbw   xmmB, xmmD              ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
    punpcklbw   xmmF, xmmH              ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)

    movdqa      xmmC, xmmA
    punpcklwd   xmmA, xmmE              ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
    punpckhwd   xmmC, xmmE              ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
    movdqa      xmmG, xmmB
    punpcklwd   xmmB, xmmF              ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
    punpckhwd   xmmG, xmmF              ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)

    movdqa      xmmD, xmmA
    punpckldq   xmmA, xmmB              ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
    punpckhdq   xmmD, xmmB              ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
    movdqa      xmmH, xmmC
    punpckldq   xmmC, xmmG              ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
    punpckhdq   xmmH, xmmG              ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)

    ; Fewer than 16 pixels left: take the partial-store tail.
    cmp         rcx, byte SIZEOF_XMMWORD
    jb          short .column_st32

    test        rdi, SIZEOF_XMMWORD-1
    jnz         short .out1
    ; --(aligned)-------------------
    movntdq     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    movntdq     XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
    movntdq     XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
    movntdq     XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
    jmp         short .out0
.out1:  ; --(unaligned)-----------------
    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
    movdqu      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
    movdqu      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
.out0:
    add         rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
    sub         rcx, byte SIZEOF_XMMWORD
    jz          near .endcolumn

    add         rsi, byte SIZEOF_XMMWORD  ; inptr0
    dec         al                      ; Yctr
    jnz         near .Yloop_2nd

    add         rbx, byte SIZEOF_XMMWORD  ; inptr1
    add         rdx, byte SIZEOF_XMMWORD  ; inptr2
    jmp         near .columnloop

.column_st32:
    ; In this branch rcx keeps counting pixels (4 bytes each).
    cmp         rcx, byte SIZEOF_XMMWORD/2
    jb          short .column_st16
    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
    add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr
    movdqa      xmmA, xmmC
    movdqa      xmmD, xmmH
    sub         rcx, byte SIZEOF_XMMWORD/2
.column_st16:
    cmp         rcx, byte SIZEOF_XMMWORD/4
    jb          short .column_st15
    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    add         rdi, byte SIZEOF_XMMWORD  ; outptr
    movdqa      xmmA, xmmD
    sub         rcx, byte SIZEOF_XMMWORD/4
.column_st15:
    ; Store two pixels (8 bytes) of xmmA to the output when it has enough
    ; space.
    cmp         rcx, byte SIZEOF_XMMWORD/8
    jb          short .column_st7
    movq        XMM_MMWORD [rdi], xmmA
    add         rdi, byte SIZEOF_XMMWORD/8*4
    sub         rcx, byte SIZEOF_XMMWORD/8
    psrldq      xmmA, SIZEOF_XMMWORD/8*4
.column_st7:
    ; Store one pixel (4 bytes) of xmmA to the output when it has enough
    ; space.
    test        rcx, rcx
    jz          short .endcolumn
    movd        XMM_DWORD [rdi], xmmA

%endif  ; RGB_PIXELSIZE ; ---------------

.endcolumn:
    sfence                              ; flush the write buffer

.return:
    pop         rbx
    uncollect_args
    mov         rsp, rbp                ; rsp <- aligned rbp
    pop         rsp                     ; rsp <- value saved at [rbp] in the prologue
    pop         rbp
    ret
431
432; --------------------------------------------------------------------------
433;
434; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
435;
436; GLOBAL(void)
437; jsimd_h2v2_merged_upsample_sse2 (JDIMENSION output_width,
438;                                  JSAMPIMAGE input_buf,
439;                                  JDIMENSION in_row_group_ctr,
440;                                  JSAMPARRAY output_buf);
441;
442
443; r10 = JDIMENSION output_width
444; r11 = JSAMPIMAGE input_buf
445; r12 = JDIMENSION in_row_group_ctr
446; r13 = JSAMPARRAY output_buf
447
; --------------------------------------------------------------------------
; jsimd_h2v2_merged_upsample_sse2
;
; After collect_args:
;   r10 = JDIMENSION output_width       (only the low 32 bits are meaningful)
;   r11 = JSAMPIMAGE input_buf
;   r12 = JDIMENSION in_row_group_ctr   (only the low 32 bits are meaningful)
;   r13 = JSAMPARRAY output_buf
;
; Implemented as two calls to jsimd_h2v1_merged_upsample_sse2.  A temporary
; three-entry input_buf is built on the stack:
;   [0] = input_buf[0] + in_row_group_ctr*SIZEOF_JSAMPROW
;   [1] = input_buf[1]
;   [2] = input_buf[2]
; The callee indexes each entry by in_row_group_ctr again, so it reads luma
; row 2*ctr and chroma row ctr.  For the second call entry [0] and
; output_buf are both advanced by one row pointer, which selects luma row
; 2*ctr+1 and output_buf[1] with the same chroma row.
;
; JDIMENSION is a 32-bit unsigned type in libjpeg, and the calling
; convention leaves the upper half of a register that carries a 32-bit
; argument undefined, so both JDIMENSION arguments are zero-extended with
; 32-bit moves before being used as a 64-bit index.
;
; NOTE(review): on WIN64 no 32-byte home (shadow) area is reserved before
; the calls; this relies on the callee never storing into it -- confirm if
; the callee's prologue macros ever change.
; --------------------------------------------------------------------------

    align       16
    global      EXTN(jsimd_h2v2_merged_upsample_sse2) PRIVATE

EXTN(jsimd_h2v2_merged_upsample_sse2):
    push        rbp
    mov         rax, rsp
    mov         rbp, rsp
    collect_args
    push        rbx

    mov         eax, r10d               ; zero-extended output_width

    mov         rdi, r11
    mov         ecx, r12d               ; zero-extended in_row_group_ctr
    mov         rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
    mov         rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
    mov         rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
    mov         rdi, r13
    lea         rsi, [rsi+rcx*SIZEOF_JSAMPROW]

    ; ---- first output row -------------------------------------------------

    ; Build the temporary input_buf; rbx then points at its entry [0].
    push        rdx                     ; inptr2
    push        rbx                     ; inptr1
    push        rsi                     ; inptr00
    mov         rbx, rsp

    ; Preserve values that are needed again after the call.
    push        rdi
    push        rcx
    push        rax

    ; Marshal (output_width, temp input_buf, in_row_group_ctr, output_buf).
%ifdef WIN64
    mov         r8, rcx
    mov         r9, rdi
    mov         rcx, rax
    mov         rdx, rbx
%else
    mov         rdx, rcx
    mov         rcx, rdi
    mov         rdi, rax
    mov         rsi, rbx
%endif

    call        EXTN(jsimd_h2v1_merged_upsample_sse2)

    pop         rax
    pop         rcx
    pop         rdi
    pop         rsi
    pop         rbx
    pop         rdx

    ; ---- second output row ------------------------------------------------

    add         rdi, byte SIZEOF_JSAMPROW  ; outptr1
    add         rsi, byte SIZEOF_JSAMPROW  ; inptr01

    push        rdx                     ; inptr2
    push        rbx                     ; inptr1
    push        rsi                     ; inptr00
    mov         rbx, rsp

    push        rdi
    push        rcx
    push        rax

%ifdef WIN64
    mov         r8, rcx
    mov         r9, rdi
    mov         rcx, rax
    mov         rdx, rbx
%else
    mov         rdx, rcx
    mov         rcx, rdi
    mov         rdi, rax
    mov         rsi, rbx
%endif

    call        EXTN(jsimd_h2v1_merged_upsample_sse2)

    ; Unwind the six slots pushed above; the values are no longer needed.
    pop         rax
    pop         rcx
    pop         rdi
    pop         rsi
    pop         rbx
    pop         rdx

    pop         rbx
    uncollect_args
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       16
539