• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1;
2; jdsammmx.asm - upsampling (MMX)
3;
4; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5;
6; Based on
7; x86 SIMD extension for IJG JPEG library
8; Copyright (C) 1999-2006, MIYASAKA Masaru.
9; For conditions of distribution and use, see copyright notice in jsimdext.inc
10;
11; This file should be assembled with NASM (Netwide Assembler),
12; can *not* be assembled with Microsoft's MASM or any compatible
13; assembler (including Borland's Turbo Assembler).
14; NASM is available from http://nasm.sourceforge.net/ or
15; http://sourceforge.net/project/showfiles.php?group_id=6208
16;
17; [TAB8]
18
19%include "jsimdext.inc"
20
21; --------------------------------------------------------------------------
22	SECTION	SEG_CONST
23
24	alignz	16
25	global	EXTN(jconst_fancy_upsample_mmx) PRIVATE
26
27EXTN(jconst_fancy_upsample_mmx):
28
29PW_ONE		times 4 dw  1
30PW_TWO		times 4 dw  2
31PW_THREE	times 4 dw  3
32PW_SEVEN	times 4 dw  7
33PW_EIGHT	times 4 dw  8
34
35	alignz	16
36
37; --------------------------------------------------------------------------
38	SECTION	SEG_TEXT
39	BITS	32
40;
41; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
42;
43; The upsampling algorithm is linear interpolation between pixel centers,
44; also known as a "triangle filter".  This is a good compromise between
45; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
46; of the way between input pixel centers.
47;
48; GLOBAL(void)
49; jsimd_h2v1_fancy_upsample_mmx (int max_v_samp_factor,
50;                                JDIMENSION downsampled_width,
51;                                JSAMPARRAY input_data,
52;                                JSAMPARRAY * output_data_ptr);
53;
54
; Implementation notes:
;   With c = current input sample and p / n = its left / right neighbour,
;   each input sample produces two output samples:
;       even = (3*c + p + 1) >> 2        odd = (3*c + n + 2) >> 2
;   The first sample of a row serves as its own left neighbour.  At the
;   right edge the neighbour is taken from the top byte of the last MMWORD
;   (or from the dummy sample inserted below when the width is not a
;   multiple of SIZEOF_MMWORD).
;
; Register roles:
;   eax = colctr            ecx = rowctr
;   esi = input_data cursor, then inptr inside a row
;   edi = output_data cursor, then outptr inside a row
;   ebx = GOT base used by GOTOFF()
;   mm0 = all zero (for byte->word unpacking)
;   mm7 = left neighbour carried from the previous column block
;   mm6 = right neighbour for the current column block
;
; Side effect: when downsampled_width is not a multiple of SIZEOF_MMWORD,
;   one sample is written just past the end of each INPUT row.
;
; The arguments are addressed relative to ebp with the offsets below.
55%define max_v_samp(b)		(b)+8			; int max_v_samp_factor
56%define downsamp_width(b)	(b)+12	; JDIMENSION downsampled_width
57%define input_data(b)		(b)+16		; JSAMPARRAY input_data
58%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
59
60	align	16
61	global	EXTN(jsimd_h2v1_fancy_upsample_mmx) PRIVATE
62
63EXTN(jsimd_h2v1_fancy_upsample_mmx):
64	push	ebp
65	mov	ebp,esp
66	pushpic	ebx
67;	push	ecx		; need not be preserved
68;	push	edx		; need not be preserved
69	push	esi
70	push	edi
71
72	get_GOT	ebx		; get GOT address
73
; Nothing to do when either the width or the row count is zero.
74	mov	eax, JDIMENSION [downsamp_width(ebp)]  ; colctr
75	test	eax,eax
76	jz	near .return
77
78	mov	ecx, INT [max_v_samp(ebp)]	; rowctr
79	test	ecx,ecx
80	jz	near .return
81
82	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
83	mov	edi, POINTER [output_data_ptr(ebp)]
84	mov	edi, JSAMPARRAY [edi]			; output_data
85	alignx	16,7
86.rowloop:
; eax, edi and esi are reused inside the row, so save the per-row values.
87	push	eax			; colctr
88	push	edi
89	push	esi
90
91	mov	esi, JSAMPROW [esi]	; inptr
92	mov	edi, JSAMPROW [edi]	; outptr
93
; If the width is not a multiple of SIZEOF_MMWORD, duplicate the last sample
; one position past the end of the input row so that it becomes the right
; neighbour of the last real sample.
94	test	eax, SIZEOF_MMWORD-1
95	jz	short .skip
96	mov	dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
97	mov	JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl	; insert a dummy sample
98.skip:
99	pxor	mm0,mm0			; mm0=(all 0's)
; mm7 = first sample of the row isolated in the lowest byte; it is OR-ed in
; as the "-1" neighbour of sample 0 in the first column block.
100	pcmpeqb	mm7,mm7
101	psrlq	mm7,(SIZEOF_MMWORD-1)*BYTE_BIT
102	pand	mm7, MMWORD [esi+0*SIZEOF_MMWORD]
103
; Round colctr up to a whole number of MMWORDs.  More than one block left
; goes to .columnloop, otherwise fall through to the last-block setup.
104	add	eax, byte SIZEOF_MMWORD-1
105	and	eax, byte -SIZEOF_MMWORD
106	cmp	eax, byte SIZEOF_MMWORD
107	ja	short .columnloop
108	alignx	16,7
109
110.columnloop_last:
; Last block: keep only the top byte of the current MMWORD in place, so the
; right neighbour of the final byte is that byte itself.
111	pcmpeqb	mm6,mm6
112	psllq	mm6,(SIZEOF_MMWORD-1)*BYTE_BIT
113	pand	mm6, MMWORD [esi+0*SIZEOF_MMWORD]
114	jmp	short .upsample
115	alignx	16,7
116
117.columnloop:
; Not the last block: the right neighbour of the final byte is the first
; byte of the next MMWORD, moved into the top byte position.
118	movq	mm6, MMWORD [esi+1*SIZEOF_MMWORD]
119	psllq	mm6,(SIZEOF_MMWORD-1)*BYTE_BIT
120
121.upsample:
122	movq	mm1, MMWORD [esi+0*SIZEOF_MMWORD]
123	movq	mm2,mm1
124	movq	mm3,mm1			; mm1=( 0 1 2 3 4 5 6 7)
125	psllq	mm2,BYTE_BIT		; mm2=( - 0 1 2 3 4 5 6)
126	psrlq	mm3,BYTE_BIT		; mm3=( 1 2 3 4 5 6 7 -)
127
128	por	mm2,mm7			; mm2=(-1 0 1 2 3 4 5 6)
129	por	mm3,mm6			; mm3=( 1 2 3 4 5 6 7 8)
130
; Carry the last byte of this block into mm7 for the next iteration.
131	movq	mm7,mm1
132	psrlq	mm7,(SIZEOF_MMWORD-1)*BYTE_BIT	; mm7=( 7 - - - - - - -)
133
; Widen current / left / right samples from bytes to words.
134	movq      mm4,mm1
135	punpcklbw mm1,mm0		; mm1=( 0 1 2 3)
136	punpckhbw mm4,mm0		; mm4=( 4 5 6 7)
137	movq      mm5,mm2
138	punpcklbw mm2,mm0		; mm2=(-1 0 1 2)
139	punpckhbw mm5,mm0		; mm5=( 3 4 5 6)
140	movq      mm6,mm3
141	punpcklbw mm3,mm0		; mm3=( 1 2 3 4)
142	punpckhbw mm6,mm0		; mm6=( 5 6 7 8)
143
; mm1/mm4 = 3*current; left neighbours get +1, right neighbours get +2.
144	pmullw	mm1,[GOTOFF(ebx,PW_THREE)]
145	pmullw	mm4,[GOTOFF(ebx,PW_THREE)]
146	paddw	mm2,[GOTOFF(ebx,PW_ONE)]
147	paddw	mm5,[GOTOFF(ebx,PW_ONE)]
148	paddw	mm3,[GOTOFF(ebx,PW_TWO)]
149	paddw	mm6,[GOTOFF(ebx,PW_TWO)]
150
151	paddw	mm2,mm1
152	paddw	mm5,mm4
153	psrlw	mm2,2			; mm2=OutLE=( 0  2  4  6)
154	psrlw	mm5,2			; mm5=OutHE=( 8 10 12 14)
155	paddw	mm3,mm1
156	paddw	mm6,mm4
157	psrlw	mm3,2			; mm3=OutLO=( 1  3  5  7)
158	psrlw	mm6,2			; mm6=OutHO=( 9 11 13 15)
159
; Interleave: odd results go to the high byte of each word, even to the low.
160	psllw	mm3,BYTE_BIT
161	psllw	mm6,BYTE_BIT
162	por	mm2,mm3			; mm2=OutL=( 0  1  2  3  4  5  6  7)
163	por	mm5,mm6			; mm5=OutH=( 8  9 10 11 12 13 14 15)
164
165	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm2
166	movq	MMWORD [edi+1*SIZEOF_MMWORD], mm5
167
; One input MMWORD consumed, two output MMWORDs produced.  Loop while more
; than one block remains; handle exactly one remaining block as the last.
168	sub	eax, byte SIZEOF_MMWORD
169	add	esi, byte 1*SIZEOF_MMWORD	; inptr
170	add	edi, byte 2*SIZEOF_MMWORD	; outptr
171	cmp	eax, byte SIZEOF_MMWORD
172	ja	near .columnloop
173	test	eax,eax
174	jnz	near .columnloop_last
175
; Restore the per-row cursors and advance both to the next row pointer.
176	pop	esi
177	pop	edi
178	pop	eax
179
180	add	esi, byte SIZEOF_JSAMPROW	; input_data
181	add	edi, byte SIZEOF_JSAMPROW	; output_data
182	dec	ecx				; rowctr
183	jg	near .rowloop
184
185	emms		; empty MMX state
186
187.return:
188	pop	edi
189	pop	esi
190;	pop	edx		; need not be preserved
191;	pop	ecx		; need not be preserved
192	poppic	ebx
193	pop	ebp
194	ret
195
196; --------------------------------------------------------------------------
197;
198; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
199; Again a triangle filter; see comments for h2v1 case, above.
200;
201; GLOBAL(void)
202; jsimd_h2v2_fancy_upsample_mmx (int max_v_samp_factor,
203;                                JDIMENSION downsampled_width,
204;                                JSAMPARRAY input_data,
205;                                JSAMPARRAY * output_data_ptr);
206;
207
; Implementation notes for jsimd_h2v2_fancy_upsample_mmx:
;   Each input row produces two output rows.  The work is done in two steps.
;   1) Vertical pass, 16-bit intermediates per column:
;        Int0 = 3*row[0] + row[-1]      (upper output row)
;        Int1 = 3*row[0] + row[+1]      (lower output row)
;      These words are parked temporarily in the output rows themselves.
;   2) Horizontal pass over the intermediates, with P / N = left / right
;      intermediate neighbour:
;        even = (3*Int + P + 8) >> 4     odd = (3*Int + N + 7) >> 4
;   The rows at input_data[-1] and input_data[+1] are read, so the caller
;   must supply valid row pointers on both sides of each processed row.
;   When the width is not a multiple of SIZEOF_MMWORD, one sample is written
;   just past the end of each of the three INPUT rows.
;
; Register roles inside a row:
;   eax = colctr          ecx = inptr1(above)   ebx = inptr0
;   esi = inptr1(below)   edx = outptr0         edi = outptr1
;   ebx is also needed as GOT base: it is saved with pushpic, loaded from
;   gotptr while the constants are accessed, and restored with poppic.
;   (pushpic/movpic/poppic come from jsimdext.inc; presumably they expand to
;   nothing in non-PIC builds -- confirm there.)
;
; Scratch slots on the aligned stack frame:
;   wk(0), wk(1) = left-neighbour word for the upper / lower row
;   wk(2), wk(3) = right-neighbour word for the upper / lower row
;
; The arguments are addressed relative to the ORIGINAL frame pointer.
208%define max_v_samp(b)		(b)+8			; int max_v_samp_factor
209%define downsamp_width(b)	(b)+12	; JDIMENSION downsampled_width
210%define input_data(b)		(b)+16		; JSAMPARRAY input_data
211%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
212
213%define original_ebp	ebp+0
214%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
215%define WK_NUM		4
216%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
217
218	align	16
219	global	EXTN(jsimd_h2v2_fancy_upsample_mmx) PRIVATE
220
221EXTN(jsimd_h2v2_fancy_upsample_mmx):
; Build an MMWORD-aligned frame: the unaligned frame pointer is stored at
; [ebp] of the aligned frame, and WK_NUM mmwords are reserved below it.
222	push	ebp
223	mov	eax,esp				; eax = original ebp
224	sub	esp, byte 4
225	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
226	mov	[esp],eax
227	mov	ebp,esp				; ebp = aligned ebp
228	lea	esp, [wk(0)]
229	pushpic	eax		; make a room for GOT address
230	push	ebx
231;	push	ecx		; need not be preserved
232;	push	edx		; need not be preserved
233	push	esi
234	push	edi
235
236	get_GOT	ebx			; get GOT address
237	movpic	POINTER [gotptr], ebx	; save GOT address
238
; Nothing to do when either the width or the row count is zero.
239	mov	edx,eax				; edx = original ebp
240	mov	eax, JDIMENSION [downsamp_width(edx)]  ; colctr
241	test	eax,eax
242	jz	near .return
243
244	mov	ecx, INT [max_v_samp(edx)]	; rowctr
245	test	ecx,ecx
246	jz	near .return
247
248	mov	esi, JSAMPARRAY [input_data(edx)]	; input_data
249	mov	edi, POINTER [output_data_ptr(edx)]
250	mov	edi, JSAMPARRAY [edi]			; output_data
251	alignx	16,7
252.rowloop:
; All of eax/ecx/edi/esi are reused inside the row; save the per-row values.
253	push	eax					; colctr
254	push	ecx
255	push	edi
256	push	esi
257
258	mov	ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]	; inptr1(above)
259	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; inptr0
260	mov	esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; inptr1(below)
261	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]	; outptr0
262	mov	edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]	; outptr1
263
; If the width is not a multiple of SIZEOF_MMWORD, duplicate the last sample
; of all three input rows one position past the end.  dl is the scratch
; byte, so outptr0 (edx) is saved around the copies.
264	test	eax, SIZEOF_MMWORD-1
265	jz	short .skip
266	push	edx
267	mov	dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
268	mov	JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
269	mov	dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
270	mov	JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
271	mov	dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
272	mov	JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl	; insert a dummy sample
273	pop	edx
274.skip:
275	; -- process the first column block
276
277	movq	mm0, MMWORD [ebx+0*SIZEOF_MMWORD]	; mm0=row[ 0][0]
278	movq	mm1, MMWORD [ecx+0*SIZEOF_MMWORD]	; mm1=row[-1][0]
279	movq	mm2, MMWORD [esi+0*SIZEOF_MMWORD]	; mm2=row[+1][0]
280
; ebx (inptr0) is saved here and restored by the poppic at the end of this
; first-block section; in between it holds the GOT base.
281	pushpic	ebx
282	movpic	ebx, POINTER [gotptr]	; load GOT address
283
284	pxor      mm3,mm3		; mm3=(all 0's)
285	movq      mm4,mm0
286	punpcklbw mm0,mm3		; mm0=row[ 0][0]( 0 1 2 3)
287	punpckhbw mm4,mm3		; mm4=row[ 0][0]( 4 5 6 7)
288	movq      mm5,mm1
289	punpcklbw mm1,mm3		; mm1=row[-1][0]( 0 1 2 3)
290	punpckhbw mm5,mm3		; mm5=row[-1][0]( 4 5 6 7)
291	movq      mm6,mm2
292	punpcklbw mm2,mm3		; mm2=row[+1][0]( 0 1 2 3)
293	punpckhbw mm6,mm3		; mm6=row[+1][0]( 4 5 6 7)
294
295	pmullw	mm0,[GOTOFF(ebx,PW_THREE)]
296	pmullw	mm4,[GOTOFF(ebx,PW_THREE)]
297
; mm7 = mask that keeps only the lowest word of an MMWORD.
298	pcmpeqb	mm7,mm7
299	psrlq	mm7,(SIZEOF_MMWORD-2)*BYTE_BIT
300
301	paddw	mm1,mm0			; mm1=Int0L=( 0 1 2 3)
302	paddw	mm5,mm4			; mm5=Int0H=( 4 5 6 7)
303	paddw	mm2,mm0			; mm2=Int1L=( 0 1 2 3)
304	paddw	mm6,mm4			; mm6=Int1H=( 4 5 6 7)
305
306	movq	MMWORD [edx+0*SIZEOF_MMWORD], mm1	; temporarily save
307	movq	MMWORD [edx+1*SIZEOF_MMWORD], mm5	; the intermediate data
308	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm2
309	movq	MMWORD [edi+1*SIZEOF_MMWORD], mm6
310
; The first intermediate word of each row is its own left neighbour.
311	pand	mm1,mm7			; mm1=( 0 - - -)
312	pand	mm2,mm7			; mm2=( 0 - - -)
313
314	movq	MMWORD [wk(0)], mm1
315	movq	MMWORD [wk(1)], mm2
316
317	poppic	ebx
318
; Round colctr up to a whole number of MMWORDs.  More than one block left
; goes to .columnloop, otherwise fall through to the last-block setup.
319	add	eax, byte SIZEOF_MMWORD-1
320	and	eax, byte -SIZEOF_MMWORD
321	cmp	eax, byte SIZEOF_MMWORD
322	ja	short .columnloop
323	alignx	16,7
324
325.columnloop_last:
326	; -- process the last column block
327
; ebx (inptr0) is saved here; the matching poppic is at the end of .upsample.
328	pushpic	ebx
329	movpic	ebx, POINTER [gotptr]	; load GOT address
330
; Right neighbour at the right edge: the top intermediate word of the
; current block, kept in the top word position.
331	pcmpeqb	mm1,mm1
332	psllq	mm1,(SIZEOF_MMWORD-2)*BYTE_BIT
333	movq	mm2,mm1
334
335	pand	mm1, MMWORD [edx+1*SIZEOF_MMWORD]	; mm1=( - - - 7)
336	pand	mm2, MMWORD [edi+1*SIZEOF_MMWORD]	; mm2=( - - - 7)
337
338	movq	MMWORD [wk(2)], mm1
339	movq	MMWORD [wk(3)], mm2
340
341	jmp	short .upsample
342	alignx	16,7
343
344.columnloop:
345	; -- process the next column block
346
; Vertical pass for the NEXT block; its first intermediate word becomes the
; right neighbour of the current block.
347	movq	mm0, MMWORD [ebx+1*SIZEOF_MMWORD]	; mm0=row[ 0][1]
348	movq	mm1, MMWORD [ecx+1*SIZEOF_MMWORD]	; mm1=row[-1][1]
349	movq	mm2, MMWORD [esi+1*SIZEOF_MMWORD]	; mm2=row[+1][1]
350
; ebx (inptr0) is saved here; the matching poppic is at the end of .upsample.
351	pushpic	ebx
352	movpic	ebx, POINTER [gotptr]	; load GOT address
353
354	pxor      mm3,mm3		; mm3=(all 0's)
355	movq      mm4,mm0
356	punpcklbw mm0,mm3		; mm0=row[ 0][1]( 0 1 2 3)
357	punpckhbw mm4,mm3		; mm4=row[ 0][1]( 4 5 6 7)
358	movq      mm5,mm1
359	punpcklbw mm1,mm3		; mm1=row[-1][1]( 0 1 2 3)
360	punpckhbw mm5,mm3		; mm5=row[-1][1]( 4 5 6 7)
361	movq      mm6,mm2
362	punpcklbw mm2,mm3		; mm2=row[+1][1]( 0 1 2 3)
363	punpckhbw mm6,mm3		; mm6=row[+1][1]( 4 5 6 7)
364
365	pmullw	mm0,[GOTOFF(ebx,PW_THREE)]
366	pmullw	mm4,[GOTOFF(ebx,PW_THREE)]
367
368	paddw	mm1,mm0			; mm1=Int0L=( 0 1 2 3)
369	paddw	mm5,mm4			; mm5=Int0H=( 4 5 6 7)
370	paddw	mm2,mm0			; mm2=Int1L=( 0 1 2 3)
371	paddw	mm6,mm4			; mm6=Int1H=( 4 5 6 7)
372
373	movq	MMWORD [edx+2*SIZEOF_MMWORD], mm1	; temporarily save
374	movq	MMWORD [edx+3*SIZEOF_MMWORD], mm5	; the intermediate data
375	movq	MMWORD [edi+2*SIZEOF_MMWORD], mm2
376	movq	MMWORD [edi+3*SIZEOF_MMWORD], mm6
377
378	psllq	mm1,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm1=( - - - 0)
379	psllq	mm2,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm2=( - - - 0)
380
381	movq	MMWORD [wk(2)], mm1
382	movq	MMWORD [wk(3)], mm2
383
384.upsample:
385	; -- process the upper row
386
; Horizontal pass over the intermediates parked in outptr0.
387	movq	mm7, MMWORD [edx+0*SIZEOF_MMWORD]	; mm7=Int0L=( 0 1 2 3)
388	movq	mm3, MMWORD [edx+1*SIZEOF_MMWORD]	; mm3=Int0H=( 4 5 6 7)
389
390	movq	mm0,mm7
391	movq	mm4,mm3
392	psrlq	mm0,2*BYTE_BIT			; mm0=( 1 2 3 -)
393	psllq	mm4,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm4=( - - - 4)
394	movq	mm5,mm7
395	movq	mm6,mm3
396	psrlq	mm5,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm5=( 3 - - -)
397	psllq	mm6,2*BYTE_BIT			; mm6=( - 4 5 6)
398
399	por	mm0,mm4				; mm0=( 1 2 3 4)
400	por	mm5,mm6				; mm5=( 3 4 5 6)
401
402	movq	mm1,mm7
403	movq	mm2,mm3
404	psllq	mm1,2*BYTE_BIT			; mm1=( - 0 1 2)
405	psrlq	mm2,2*BYTE_BIT			; mm2=( 5 6 7 -)
406	movq	mm4,mm3
407	psrlq	mm4,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm4=( 7 - - -)
408
409	por	mm1, MMWORD [wk(0)]		; mm1=(-1 0 1 2)
410	por	mm2, MMWORD [wk(2)]		; mm2=( 5 6 7 8)
411
; Carry the last intermediate word into wk(0) as the next block's left
; neighbour.
412	movq	MMWORD [wk(0)], mm4
413
; 3*Int, left neighbours + 8 (even outputs), right neighbours + 7 (odd).
414	pmullw	mm7,[GOTOFF(ebx,PW_THREE)]
415	pmullw	mm3,[GOTOFF(ebx,PW_THREE)]
416	paddw	mm1,[GOTOFF(ebx,PW_EIGHT)]
417	paddw	mm5,[GOTOFF(ebx,PW_EIGHT)]
418	paddw	mm0,[GOTOFF(ebx,PW_SEVEN)]
419	paddw	mm2,[GOTOFF(ebx,PW_SEVEN)]
420
421	paddw	mm1,mm7
422	paddw	mm5,mm3
423	psrlw	mm1,4			; mm1=Out0LE=( 0  2  4  6)
424	psrlw	mm5,4			; mm5=Out0HE=( 8 10 12 14)
425	paddw	mm0,mm7
426	paddw	mm2,mm3
427	psrlw	mm0,4			; mm0=Out0LO=( 1  3  5  7)
428	psrlw	mm2,4			; mm2=Out0HO=( 9 11 13 15)
429
; Interleave: odd results go to the high byte of each word, even to the low.
430	psllw	mm0,BYTE_BIT
431	psllw	mm2,BYTE_BIT
432	por	mm1,mm0			; mm1=Out0L=( 0  1  2  3  4  5  6  7)
433	por	mm5,mm2			; mm5=Out0H=( 8  9 10 11 12 13 14 15)
434
; The final samples overwrite the parked intermediates of this block.
435	movq	MMWORD [edx+0*SIZEOF_MMWORD], mm1
436	movq	MMWORD [edx+1*SIZEOF_MMWORD], mm5
437
438	; -- process the lower row
439
; Same horizontal pass for the intermediates parked in outptr1, using
; wk(1) / wk(3) as left / right neighbours.
440	movq	mm6, MMWORD [edi+0*SIZEOF_MMWORD]	; mm6=Int1L=( 0 1 2 3)
441	movq	mm4, MMWORD [edi+1*SIZEOF_MMWORD]	; mm4=Int1H=( 4 5 6 7)
442
443	movq	mm7,mm6
444	movq	mm3,mm4
445	psrlq	mm7,2*BYTE_BIT			; mm7=( 1 2 3 -)
446	psllq	mm3,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm3=( - - - 4)
447	movq	mm0,mm6
448	movq	mm2,mm4
449	psrlq	mm0,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm0=( 3 - - -)
450	psllq	mm2,2*BYTE_BIT			; mm2=( - 4 5 6)
451
452	por	mm7,mm3				; mm7=( 1 2 3 4)
453	por	mm0,mm2				; mm0=( 3 4 5 6)
454
455	movq	mm1,mm6
456	movq	mm5,mm4
457	psllq	mm1,2*BYTE_BIT			; mm1=( - 0 1 2)
458	psrlq	mm5,2*BYTE_BIT			; mm5=( 5 6 7 -)
459	movq	mm3,mm4
460	psrlq	mm3,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm3=( 7 - - -)
461
462	por	mm1, MMWORD [wk(1)]		; mm1=(-1 0 1 2)
463	por	mm5, MMWORD [wk(3)]		; mm5=( 5 6 7 8)
464
465	movq	MMWORD [wk(1)], mm3
466
467	pmullw	mm6,[GOTOFF(ebx,PW_THREE)]
468	pmullw	mm4,[GOTOFF(ebx,PW_THREE)]
469	paddw	mm1,[GOTOFF(ebx,PW_EIGHT)]
470	paddw	mm0,[GOTOFF(ebx,PW_EIGHT)]
471	paddw	mm7,[GOTOFF(ebx,PW_SEVEN)]
472	paddw	mm5,[GOTOFF(ebx,PW_SEVEN)]
473
474	paddw	mm1,mm6
475	paddw	mm0,mm4
476	psrlw	mm1,4			; mm1=Out1LE=( 0  2  4  6)
477	psrlw	mm0,4			; mm0=Out1HE=( 8 10 12 14)
478	paddw	mm7,mm6
479	paddw	mm5,mm4
480	psrlw	mm7,4			; mm7=Out1LO=( 1  3  5  7)
481	psrlw	mm5,4			; mm5=Out1HO=( 9 11 13 15)
482
483	psllw	mm7,BYTE_BIT
484	psllw	mm5,BYTE_BIT
485	por	mm1,mm7			; mm1=Out1L=( 0  1  2  3  4  5  6  7)
486	por	mm0,mm5			; mm0=Out1H=( 8  9 10 11 12 13 14 15)
487
488	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm1
489	movq	MMWORD [edi+1*SIZEOF_MMWORD], mm0
490
; Pairs with the pushpic in .columnloop / .columnloop_last: ebx is inptr0
; again from here on.
491	poppic	ebx
492
; One input MMWORD consumed per row, two output MMWORDs produced per row.
493	sub	eax, byte SIZEOF_MMWORD
494	add	ecx, byte 1*SIZEOF_MMWORD	; inptr1(above)
495	add	ebx, byte 1*SIZEOF_MMWORD	; inptr0
496	add	esi, byte 1*SIZEOF_MMWORD	; inptr1(below)
497	add	edx, byte 2*SIZEOF_MMWORD	; outptr0
498	add	edi, byte 2*SIZEOF_MMWORD	; outptr1
499	cmp	eax, byte SIZEOF_MMWORD
500	ja	near .columnloop
501	test	eax,eax
502	jnz	near .columnloop_last
503
504	pop	esi
505	pop	edi
506	pop	ecx
507	pop	eax
508
; One input row was consumed and two output rows were produced, so rowctr
; drops by two.
509	add	esi, byte 1*SIZEOF_JSAMPROW	; input_data
510	add	edi, byte 2*SIZEOF_JSAMPROW	; output_data
511	sub	ecx, byte 2			; rowctr
512	jg	near .rowloop
513
514	emms		; empty MMX state
515
516.return:
; Restore the callee-saved registers, then unwind the aligned frame: the
; value stored at [ebp] is the unaligned stack pointer saved on entry.
517	pop	edi
518	pop	esi
519;	pop	edx		; need not be preserved
520;	pop	ecx		; need not be preserved
521	pop	ebx
522	mov	esp,ebp		; esp <- aligned ebp
523	pop	esp		; esp <- original ebp
524	pop	ebp
525	ret
526
527; --------------------------------------------------------------------------
;
; Plain (non-fancy) upsampling for 2:1 horizontal and 1:1 vertical.
; This is a box filter: every input sample is written out twice in a row.
;
; GLOBAL(void)
; jsimd_h2v1_upsample_mmx (int max_v_samp_factor,
;                          JDIMENSION output_width,
;                          JSAMPARRAY input_data,
;                          JSAMPARRAY * output_data_ptr);
;
; Register roles:
;   edx = output_width rounded up to a multiple of 2*SIZEOF_MMWORD
;   ecx = rows still to process
;   eax = output bytes still to produce in the current row
;   esi = input_data cursor, then inptr inside a row
;   edi = output_data cursor, then outptr inside a row
;
; Whole MMWORDs are always read and written, so each row is accessed up to
; the rounded-up width.
;

%define max_v_samp(b)		(b)+8		; int max_v_samp_factor
%define output_width(b)		(b)+12		; JDIMENSION output_width
%define input_data(b)		(b)+16		; JSAMPARRAY input_data
%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr

	align	16
	global	EXTN(jsimd_h2v1_upsample_mmx) PRIVATE

EXTN(jsimd_h2v1_upsample_mmx):
	push	ebp
	mov	ebp, esp
	push	esi			; ebx is unused; ecx/edx need no saving
	push	edi

	; edx = padded output width; leave when it is zero
	mov	edx, JDIMENSION [output_width(ebp)]
	add	edx, byte (2*SIZEOF_MMWORD)-1
	and	edx, byte -(2*SIZEOF_MMWORD)
	jz	near .done

	; ecx = row count; leave when it is zero
	mov	ecx, INT [max_v_samp(ebp)]
	test	ecx, ecx
	jz	near .done

	mov	edi, POINTER [output_data_ptr(ebp)]
	mov	esi, JSAMPARRAY [input_data(ebp)]	; esi = input_data
	mov	edi, JSAMPARRAY [edi]			; edi = output_data
	alignx	16,7
.per_row:
	push	edi			; keep the row-pointer cursors
	push	esi

	mov	eax, edx		; eax = bytes to produce in this row
	mov	edi, JSAMPROW [edi]	; edi = outptr
	mov	esi, JSAMPROW [esi]	; esi = inptr
	alignx	16,7
.per_block:
	; first input MMWORD -> two output MMWORDs (each byte doubled)
	movq		mm0, MMWORD [esi+0*SIZEOF_MMWORD]
	movq		mm1, mm0
	punpcklbw	mm0, mm0
	punpckhbw	mm1, mm1
	movq		MMWORD [edi+0*SIZEOF_MMWORD], mm0
	movq		MMWORD [edi+1*SIZEOF_MMWORD], mm1

	sub	eax, byte 2*SIZEOF_MMWORD
	jz	short .row_done

	; second input MMWORD -> next two output MMWORDs
	movq		mm2, MMWORD [esi+1*SIZEOF_MMWORD]
	movq		mm3, mm2
	punpcklbw	mm2, mm2
	punpckhbw	mm3, mm3
	movq		MMWORD [edi+2*SIZEOF_MMWORD], mm2
	movq		MMWORD [edi+3*SIZEOF_MMWORD], mm3

	; Advance, then test the remaining count.  On the final pass the
	; advanced pointers are discarded by the pops below.
	add	esi, byte 2*SIZEOF_MMWORD	; inptr
	add	edi, byte 4*SIZEOF_MMWORD	; outptr
	sub	eax, byte 2*SIZEOF_MMWORD
	jnz	short .per_block

.row_done:
	pop	esi
	pop	edi

	add	esi, byte SIZEOF_JSAMPROW	; next input row pointer
	add	edi, byte SIZEOF_JSAMPROW	; next output row pointer
	dec	ecx
	jg	short .per_row

	emms				; leave the MMX state clean

.done:
	pop	edi
	pop	esi
	pop	ebp
	ret
627
628; --------------------------------------------------------------------------
;
; Plain (non-fancy) upsampling for 2:1 horizontal and 2:1 vertical.
; This is a box filter: every input sample is written out twice in a row,
; and the resulting row is stored into two consecutive output rows.
;
; GLOBAL(void)
; jsimd_h2v2_upsample_mmx (int max_v_samp_factor,
;                          JDIMENSION output_width,
;                          JSAMPARRAY input_data,
;                          JSAMPARRAY * output_data_ptr);
;
; Register roles:
;   edx = output_width rounded up to a multiple of 2*SIZEOF_MMWORD
;   ecx = output rows still to produce (drops by two per input row)
;   eax = output bytes still to produce in the current row pair
;   esi = input_data cursor, then inptr inside a row
;   edi = output_data cursor, then outptr1 inside a row
;   ebx = outptr0 inside a row
;
; Whole MMWORDs are always read and written, so each row is accessed up to
; the rounded-up width.
;

%define max_v_samp(b)		(b)+8		; int max_v_samp_factor
%define output_width(b)		(b)+12		; JDIMENSION output_width
%define input_data(b)		(b)+16		; JSAMPARRAY input_data
%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr

	align	16
	global	EXTN(jsimd_h2v2_upsample_mmx) PRIVATE

EXTN(jsimd_h2v2_upsample_mmx):
	push	ebp
	mov	ebp, esp
	push	ebx			; ecx/edx need no saving
	push	esi
	push	edi

	; edx = padded output width; leave when it is zero
	mov	edx, JDIMENSION [output_width(ebp)]
	add	edx, byte (2*SIZEOF_MMWORD)-1
	and	edx, byte -(2*SIZEOF_MMWORD)
	jz	near .done

	; ecx = row count; leave when it is zero
	mov	ecx, INT [max_v_samp(ebp)]
	test	ecx, ecx
	jz	near .done

	mov	edi, POINTER [output_data_ptr(ebp)]
	mov	esi, JSAMPARRAY [input_data(ebp)]	; esi = input_data
	mov	edi, JSAMPARRAY [edi]			; edi = output_data
	alignx	16,7
.per_row:
	push	edi			; keep the row-pointer cursors
	push	esi

	mov	eax, edx				; eax = bytes to produce
	mov	ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]	; ebx = outptr0
	mov	edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]	; edi = outptr1
	mov	esi, JSAMPROW [esi]			; esi = inptr
	alignx	16,7
.per_block:
	; first input MMWORD -> two output MMWORDs, written to both rows
	movq		mm0, MMWORD [esi+0*SIZEOF_MMWORD]
	movq		mm1, mm0
	punpcklbw	mm0, mm0
	punpckhbw	mm1, mm1
	movq		MMWORD [ebx+0*SIZEOF_MMWORD], mm0
	movq		MMWORD [ebx+1*SIZEOF_MMWORD], mm1
	movq		MMWORD [edi+0*SIZEOF_MMWORD], mm0
	movq		MMWORD [edi+1*SIZEOF_MMWORD], mm1

	sub	eax, byte 2*SIZEOF_MMWORD
	jz	short .row_done

	; second input MMWORD -> next two output MMWORDs, both rows
	movq		mm2, MMWORD [esi+1*SIZEOF_MMWORD]
	movq		mm3, mm2
	punpcklbw	mm2, mm2
	punpckhbw	mm3, mm3
	movq		MMWORD [ebx+2*SIZEOF_MMWORD], mm2
	movq		MMWORD [ebx+3*SIZEOF_MMWORD], mm3
	movq		MMWORD [edi+2*SIZEOF_MMWORD], mm2
	movq		MMWORD [edi+3*SIZEOF_MMWORD], mm3

	; Advance, then test the remaining count.  On the final pass the
	; advanced pointers are discarded (esi/edi by the pops below, ebx by
	; the reload at the top of the next row).
	add	esi, byte 2*SIZEOF_MMWORD	; inptr
	add	ebx, byte 4*SIZEOF_MMWORD	; outptr0
	add	edi, byte 4*SIZEOF_MMWORD	; outptr1
	sub	eax, byte 2*SIZEOF_MMWORD
	jnz	short .per_block

.row_done:
	pop	esi
	pop	edi

	add	esi, byte 1*SIZEOF_JSAMPROW	; one input row consumed
	add	edi, byte 2*SIZEOF_JSAMPROW	; two output rows produced
	sub	ecx, byte 2
	jg	short .per_row

	emms				; leave the MMX state clean

.done:
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret
734
735; For some reason, the OS X linker does not honor the request to align the
736; segment unless we do this.
737	align	16
738