• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1;
2; jdmrgmmx.asm - merged upsampling/color conversion (MMX)
3;
4; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5;
6; Based on
7; x86 SIMD extension for IJG JPEG library
8; Copyright (C) 1999-2006, MIYASAKA Masaru.
9; For conditions of distribution and use, see copyright notice in jsimdext.inc
10;
11; This file should be assembled with NASM (Netwide Assembler),
12; can *not* be assembled with Microsoft's MASM or any compatible
13; assembler (including Borland's Turbo Assembler).
14; NASM is available from http://nasm.sourceforge.net/ or
15; http://sourceforge.net/project/showfiles.php?group_id=6208
16;
17; [TAB8]
18
19%include "jcolsamp.inc"
20
; --------------------------------------------------------------------------
;
; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
;
; GLOBAL(void)
; jsimd_h2v1_merged_upsample_mmx (JDIMENSION output_width,
;                                 JSAMPIMAGE input_buf,
;                                 JDIMENSION in_row_group_ctr,
;                                 JSAMPARRAY output_buf);
;
; Calling convention: 32-bit, all arguments on the stack (offsets below are
; relative to the frame base, i.e. the address of the saved ebp).
;
; In:
;   output_width      number of output pixels to produce; 0 returns at once
;   input_buf         three plane pointers; plane k's row is
;                     input_buf[k][in_row_group_ctr]
;   in_row_group_ctr  row index applied to all three planes
;   output_buf        output_buf[0] is the destination row
;
; Register roles inside the loops:
;   esi = inptr0 (Y row)        ebx = inptr1 (Cb row)     edx = inptr2 (Cr row)
;   edi = outptr                ecx = columns still to be written
;   eax = GOT address while computing chroma terms, then al = Yctr (2 -> 0)
;
; Scratch (8-byte aligned, below the aligned ebp):
;   wk(0) = (B-Y)H    wk(1) = (R-Y)H    wk(2) = (G-Y)H
;   gotptr = pointer-sized slot immediately below wk(0)
;
; Each .columnloop pass consumes one MMWORD of Cb and of Cr (8 chroma
; samples) and up to two MMWORDs of Y (16 pixels).  Every chroma term is
; added to both the even and the odd Y sample of its pair, which is the
; 2:1 horizontal replication.
;
; Preserves ebx, esi, edi, ebp.  Clobbers eax, ecx, edx, mm0-mm7, flags.
; Executes emms before returning whenever MMX registers were touched.
;
; NOTE(review): input rows are always read in whole MMWORDs, even when fewer
; than 8 columns remain; presumably the caller pads its row buffers -
; confirm.  Output stores, by contrast, never exceed output_width pixels.
; NOTE(review): mmA..mmH, EXTN, PRIVATE, pushpic/get_GOT/movpic/GOTOFF,
; alignx, the PW_*/PD_* constants and SCALEBITS are not defined in this file;
; presumably they come from jcolsamp.inc and what it includes.  Per the lane
; diagrams below, mmA/mmB carry output component 0 (even/odd pixels),
; mmC/mmD component 1, mmE/mmF component 2 and mmG/mmH component 3.
;

%define output_width(b)	(b)+8			; JDIMENSION output_width
%define input_buf(b)		(b)+12		; JSAMPIMAGE input_buf
%define in_row_group_ctr(b)	(b)+16		; JDIMENSION in_row_group_ctr
%define output_buf(b)		(b)+20		; JSAMPARRAY output_buf

%define original_ebp	ebp+0
%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
%define WK_NUM		3
%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr

	align	16
	global	EXTN(jsimd_h2v1_merged_upsample_mmx) PRIVATE

EXTN(jsimd_h2v1_merged_upsample_mmx):
	; --- Prologue: build an 8-byte-aligned frame for the wk[] scratch. ---
	; eax keeps the unaligned frame base so the stack arguments stay
	; addressable; the same value is stored at [ebp] for the epilogue.
	push	ebp
	mov	eax,esp				; eax = original ebp
	sub	esp, byte 4
	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
	mov	[esp],eax
	mov	ebp,esp				; ebp = aligned ebp
	lea	esp, [wk(0)]
	pushpic	eax		; make a room for GOT address
	push	ebx
;	push	ecx		; need not be preserved
;	push	edx		; need not be preserved
	push	esi
	push	edi

	get_GOT	ebx			; get GOT address
	movpic	POINTER [gotptr], ebx	; save GOT address

	mov	ecx, JDIMENSION [output_width(eax)]	; col
	test	ecx,ecx
	jz	near .return		; nothing to do; no MMX state touched yet

	push	ecx			; ecx is reused as the row index below

	; --- Resolve the three input row pointers and the output row. ---
	mov	edi, JSAMPIMAGE [input_buf(eax)]
	mov	ecx, JDIMENSION [in_row_group_ctr(eax)]
	mov	esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
	mov	ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
	mov	edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
	mov	edi, JSAMPARRAY [output_buf(eax)]
	mov	esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW]		; inptr0
	mov	ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW]		; inptr1
	mov	edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW]		; inptr2
	mov	edi, JSAMPROW [edi]				; outptr

	pop	ecx			; col

	alignx	16,7
.columnloop:
	; eax was clobbered (arguments are no longer needed); reload the GOT
	; address because al doubles as Yctr further down.
	movpic	eax, POINTER [gotptr]	; load GOT address (eax)

	movq      mm6, MMWORD [ebx]	; mm6=Cb(01234567)
	movq      mm7, MMWORD [edx]	; mm7=Cr(01234567)

	pxor      mm1,mm1		; mm1=(all 0's)
	pcmpeqw   mm3,mm3
	psllw     mm3,7			; mm3={0xFF80 0xFF80 0xFF80 0xFF80}

	; Zero-extend chroma bytes to words, split into low/high halves.
	movq      mm4,mm6
	punpckhbw mm6,mm1		; mm6=Cb(4567)=CbH
	punpcklbw mm4,mm1		; mm4=Cb(0123)=CbL
	movq      mm0,mm7
	punpckhbw mm7,mm1		; mm7=Cr(4567)=CrH
	punpcklbw mm0,mm1		; mm0=Cr(0123)=CrL

	; Adding 0xFF80 (-128 as a signed word) centers chroma around zero.
	paddw     mm6,mm3
	paddw     mm4,mm3
	paddw     mm7,mm3
	paddw     mm0,mm3

	; (Original)
	; R = Y                + 1.40200 * Cr
	; G = Y - 0.34414 * Cb - 0.71414 * Cr
	; B = Y + 1.77200 * Cb
	;
	; (This implementation)
	; R = Y                + 0.40200 * Cr + Cr
	; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
	; B = Y - 0.22800 * Cb + Cb + Cb

	movq	mm5,mm6			; mm5=CbH
	movq	mm2,mm4			; mm2=CbL
	paddw	mm6,mm6			; mm6=2*CbH
	paddw	mm4,mm4			; mm4=2*CbL
	movq	mm1,mm7			; mm1=CrH
	movq	mm3,mm0			; mm3=CrL
	paddw	mm7,mm7			; mm7=2*CrH
	paddw	mm0,mm0			; mm0=2*CrL

	; The operands were doubled above so that the +PW_ONE / >>1 pair
	; below rounds the high-word product instead of truncating it.
	pmulhw	mm6,[GOTOFF(eax,PW_MF0228)]	; mm6=(2*CbH * -FIX(0.22800))
	pmulhw	mm4,[GOTOFF(eax,PW_MF0228)]	; mm4=(2*CbL * -FIX(0.22800))
	pmulhw	mm7,[GOTOFF(eax,PW_F0402)]	; mm7=(2*CrH * FIX(0.40200))
	pmulhw	mm0,[GOTOFF(eax,PW_F0402)]	; mm0=(2*CrL * FIX(0.40200))

	paddw	mm6,[GOTOFF(eax,PW_ONE)]
	paddw	mm4,[GOTOFF(eax,PW_ONE)]
	psraw	mm6,1			; mm6=(CbH * -FIX(0.22800))
	psraw	mm4,1			; mm4=(CbL * -FIX(0.22800))
	paddw	mm7,[GOTOFF(eax,PW_ONE)]
	paddw	mm0,[GOTOFF(eax,PW_ONE)]
	psraw	mm7,1			; mm7=(CrH * FIX(0.40200))
	psraw	mm0,1			; mm0=(CrL * FIX(0.40200))

	paddw	mm6,mm5
	paddw	mm4,mm2
	paddw	mm6,mm5			; mm6=(CbH * FIX(1.77200))=(B-Y)H
	paddw	mm4,mm2			; mm4=(CbL * FIX(1.77200))=(B-Y)L
	paddw	mm7,mm1			; mm7=(CrH * FIX(1.40200))=(R-Y)H
	paddw	mm0,mm3			; mm0=(CrL * FIX(1.40200))=(R-Y)L

	; High-half terms are parked for the second Y pass.
	movq	MMWORD [wk(0)], mm6	; wk(0)=(B-Y)H
	movq	MMWORD [wk(1)], mm7	; wk(1)=(R-Y)H

	; Green: interleave Cb/Cr words so that a single pmaddwd yields
	; Cb*coef0 + Cr*coef1 per dword lane.
	movq      mm6,mm5
	movq      mm7,mm2
	punpcklwd mm5,mm1
	punpckhwd mm6,mm1
	pmaddwd   mm5,[GOTOFF(eax,PW_MF0344_F0285)]
	pmaddwd   mm6,[GOTOFF(eax,PW_MF0344_F0285)]
	punpcklwd mm2,mm3
	punpckhwd mm7,mm3
	pmaddwd   mm2,[GOTOFF(eax,PW_MF0344_F0285)]
	pmaddwd   mm7,[GOTOFF(eax,PW_MF0344_F0285)]

	paddd     mm5,[GOTOFF(eax,PD_ONEHALF)]
	paddd     mm6,[GOTOFF(eax,PD_ONEHALF)]
	psrad     mm5,SCALEBITS
	psrad     mm6,SCALEBITS
	paddd     mm2,[GOTOFF(eax,PD_ONEHALF)]
	paddd     mm7,[GOTOFF(eax,PD_ONEHALF)]
	psrad     mm2,SCALEBITS
	psrad     mm7,SCALEBITS

	packssdw  mm5,mm6	; mm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
	packssdw  mm2,mm7	; mm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
	psubw     mm5,mm1	; mm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
	psubw     mm2,mm3	; mm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L

	movq	MMWORD [wk(2)], mm5	; wk(2)=(G-Y)H

	; First Y pass uses the low-half terms still live in mm0/mm2/mm4.
	; Writing al ends the use of eax as the GOT address.
	mov	al,2			; Yctr
	jmp	short .Yloop_1st
	alignx	16,7

.Yloop_2nd:
	; Second Y pass: reload the high-half terms saved above.
	movq	mm0, MMWORD [wk(1)]	; mm0=(R-Y)H
	movq	mm2, MMWORD [wk(2)]	; mm2=(G-Y)H
	movq	mm4, MMWORD [wk(0)]	; mm4=(B-Y)H
	alignx	16,7

.Yloop_1st:
	movq	mm7, MMWORD [esi]	; mm7=Y(01234567)

	; Split Y into even and odd samples, each zero-extended to a word.
	pcmpeqw	mm6,mm6
	psrlw	mm6,BYTE_BIT		; mm6={0xFF 0x00 0xFF 0x00 ..}
	pand	mm6,mm7			; mm6=Y(0246)=YE
	psrlw	mm7,BYTE_BIT		; mm7=Y(1357)=YO

	movq	mm1,mm0			; mm1=mm0=(R-Y)(L/H)
	movq	mm3,mm2			; mm3=mm2=(G-Y)(L/H)
	movq	mm5,mm4			; mm5=mm4=(B-Y)(L/H)

	; packuswb saturates each signed word to an unsigned byte (0..255).
	paddw     mm0,mm6		; mm0=((R-Y)+YE)=RE=(R0 R2 R4 R6)
	paddw     mm1,mm7		; mm1=((R-Y)+YO)=RO=(R1 R3 R5 R7)
	packuswb  mm0,mm0		; mm0=(R0 R2 R4 R6 ** ** ** **)
	packuswb  mm1,mm1		; mm1=(R1 R3 R5 R7 ** ** ** **)

	paddw     mm2,mm6		; mm2=((G-Y)+YE)=GE=(G0 G2 G4 G6)
	paddw     mm3,mm7		; mm3=((G-Y)+YO)=GO=(G1 G3 G5 G7)
	packuswb  mm2,mm2		; mm2=(G0 G2 G4 G6 ** ** ** **)
	packuswb  mm3,mm3		; mm3=(G1 G3 G5 G7 ** ** ** **)

	paddw     mm4,mm6		; mm4=((B-Y)+YE)=BE=(B0 B2 B4 B6)
	paddw     mm5,mm7		; mm5=((B-Y)+YO)=BO=(B1 B3 B5 B7)
	packuswb  mm4,mm4		; mm4=(B0 B2 B4 B6 ** ** ** **)
	packuswb  mm5,mm5		; mm5=(B1 B3 B5 B7 ** ** ** **)

%if RGB_PIXELSIZE == 3 ; ---------------

	; Lane notation "cp": c = component index within the output pixel,
	; p = pixel number within this group of 8.
	; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
	; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
	; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
	; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)

	punpcklbw mmA,mmC		; mmA=(00 10 02 12 04 14 06 16)
	punpcklbw mmE,mmB		; mmE=(20 01 22 03 24 05 26 07)
	punpcklbw mmD,mmF		; mmD=(11 21 13 23 15 25 17 27)

	movq      mmG,mmA
	movq      mmH,mmA
	punpcklwd mmA,mmE		; mmA=(00 10 20 01 02 12 22 03)
	punpckhwd mmG,mmE		; mmG=(04 14 24 05 06 16 26 07)

	psrlq     mmH,2*BYTE_BIT	; mmH=(02 12 04 14 06 16 -- --)
	psrlq     mmE,2*BYTE_BIT	; mmE=(22 03 24 05 26 07 -- --)

	movq      mmC,mmD
	movq      mmB,mmD
	punpcklwd mmD,mmH		; mmD=(11 21 02 12 13 23 04 14)
	punpckhwd mmC,mmH		; mmC=(15 25 06 16 17 27 -- --)

	psrlq     mmB,2*BYTE_BIT	; mmB=(13 23 15 25 17 27 -- --)

	movq      mmF,mmE
	punpcklwd mmE,mmB		; mmE=(22 03 13 23 24 05 15 25)
	punpckhwd mmF,mmB		; mmF=(26 07 17 27 -- -- -- --)

	punpckldq mmA,mmD		; mmA=(00 10 20 01 11 21 02 12)
	punpckldq mmE,mmG		; mmE=(22 03 13 23 04 14 24 05)
	punpckldq mmC,mmF		; mmC=(15 25 06 16 26 07 17 27)

	; Fewer than 8 columns left: take the partial-store path.
	cmp	ecx, byte SIZEOF_MMWORD
	jb	short .column_st16

	; Full group: 8 pixels * 3 bytes = 3 MMWORDs.
	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmE
	movq	MMWORD [edi+2*SIZEOF_MMWORD], mmC

	sub	ecx, byte SIZEOF_MMWORD
	jz	near .endcolumn

	add	edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD	; outptr
	add	esi, byte SIZEOF_MMWORD			; inptr0
	dec	al			; Yctr
	jnz	near .Yloop_2nd

	; Both Y halves done: advance chroma by one MMWORD.
	add	ebx, byte SIZEOF_MMWORD			; inptr1
	add	edx, byte SIZEOF_MMWORD			; inptr2
	jmp	near .columnloop
	alignx	16,7

	; --- Partial tail: ecx becomes a byte count (< 24) and is stored in
	; descending power-of-two pieces: 16, 8, 4, 2, 1. ---
.column_st16:
	lea	ecx, [ecx+ecx*2]	; imul ecx, RGB_PIXELSIZE
	cmp	ecx, byte 2*SIZEOF_MMWORD
	jb	short .column_st8
	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmE
	movq	mmA,mmC
	sub	ecx, byte 2*SIZEOF_MMWORD
	add	edi, byte 2*SIZEOF_MMWORD
	jmp	short .column_st4	; at most 7 bytes can remain here
.column_st8:
	cmp	ecx, byte SIZEOF_MMWORD
	jb	short .column_st4
	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
	movq	mmA,mmE
	sub	ecx, byte SIZEOF_MMWORD
	add	edi, byte SIZEOF_MMWORD
.column_st4:
	movd	eax,mmA			; eax = next 4 pending bytes
	cmp	ecx, byte SIZEOF_DWORD
	jb	short .column_st2
	mov	DWORD [edi+0*SIZEOF_DWORD], eax
	psrlq	mmA,DWORD_BIT
	movd	eax,mmA
	sub	ecx, byte SIZEOF_DWORD
	add	edi, byte SIZEOF_DWORD
.column_st2:
	cmp	ecx, byte SIZEOF_WORD
	jb	short .column_st1
	mov	WORD [edi+0*SIZEOF_WORD], ax
	shr	eax,WORD_BIT
	sub	ecx, byte SIZEOF_WORD
	add	edi, byte SIZEOF_WORD
.column_st1:
	cmp	ecx, byte SIZEOF_BYTE
	jb	short .endcolumn
	mov	BYTE [edi+0*SIZEOF_BYTE], al

%else ; RGB_PIXELSIZE == 4 ; -----------

	; mm6/mm7 are free again (YE/YO already consumed); they supply the
	; fourth (filler) byte of every pixel.
%ifdef RGBX_FILLER_0XFF
	pcmpeqb   mm6,mm6		; mm6=(X0 X2 X4 X6 ** ** ** **)
	pcmpeqb   mm7,mm7		; mm7=(X1 X3 X5 X7 ** ** ** **)
%else
	pxor      mm6,mm6		; mm6=(X0 X2 X4 X6 ** ** ** **)
	pxor      mm7,mm7		; mm7=(X1 X3 X5 X7 ** ** ** **)
%endif
	; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
	; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
	; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
	; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)

	punpcklbw mmA,mmC		; mmA=(00 10 02 12 04 14 06 16)
	punpcklbw mmE,mmG		; mmE=(20 30 22 32 24 34 26 36)
	punpcklbw mmB,mmD		; mmB=(01 11 03 13 05 15 07 17)
	punpcklbw mmF,mmH		; mmF=(21 31 23 33 25 35 27 37)

	movq      mmC,mmA
	punpcklwd mmA,mmE		; mmA=(00 10 20 30 02 12 22 32)
	punpckhwd mmC,mmE		; mmC=(04 14 24 34 06 16 26 36)
	movq      mmG,mmB
	punpcklwd mmB,mmF		; mmB=(01 11 21 31 03 13 23 33)
	punpckhwd mmG,mmF		; mmG=(05 15 25 35 07 17 27 37)

	movq      mmD,mmA
	punpckldq mmA,mmB		; mmA=(00 10 20 30 01 11 21 31)
	punpckhdq mmD,mmB		; mmD=(02 12 22 32 03 13 23 33)
	movq      mmH,mmC
	punpckldq mmC,mmG		; mmC=(04 14 24 34 05 15 25 35)
	punpckhdq mmH,mmG		; mmH=(06 16 26 36 07 17 27 37)

	; Fewer than 8 columns left: take the partial-store path.
	cmp	ecx, byte SIZEOF_MMWORD
	jb	short .column_st16

	; Full group: 8 pixels * 4 bytes = 4 MMWORDs.
	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmD
	movq	MMWORD [edi+2*SIZEOF_MMWORD], mmC
	movq	MMWORD [edi+3*SIZEOF_MMWORD], mmH

	sub	ecx, byte SIZEOF_MMWORD
	jz	short .endcolumn

	add	edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD	; outptr
	add	esi, byte SIZEOF_MMWORD			; inptr0
	dec	al			; Yctr
	jnz	near .Yloop_2nd

	; Both Y halves done: advance chroma by one MMWORD.
	add	ebx, byte SIZEOF_MMWORD			; inptr1
	add	edx, byte SIZEOF_MMWORD			; inptr2
	jmp	near .columnloop
	alignx	16,7

	; --- Partial tail: ecx stays a pixel count (< 8) and is stored as
	; 4 pixels, then 2 pixels, then 1 pixel. ---
.column_st16:
	cmp	ecx, byte SIZEOF_MMWORD/2
	jb	short .column_st8
	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmD
	movq	mmA,mmC
	movq	mmD,mmH
	sub	ecx, byte SIZEOF_MMWORD/2
	add	edi, byte 2*SIZEOF_MMWORD
.column_st8:
	cmp	ecx, byte SIZEOF_MMWORD/4
	jb	short .column_st4
	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
	movq	mmA,mmD
	sub	ecx, byte SIZEOF_MMWORD/4
	add	edi, byte 1*SIZEOF_MMWORD
.column_st4:
	cmp	ecx, byte SIZEOF_MMWORD/8
	jb	short .endcolumn
	movd	DWORD [edi+0*SIZEOF_DWORD], mmA

%endif ; RGB_PIXELSIZE ; ---------------

.endcolumn:
	emms		; empty MMX state

.return:
	; --- Epilogue: undo the aligned frame.  [ebp] holds the unaligned
	; frame base stored by the prologue, so "pop esp" restores it. ---
	pop	edi
	pop	esi
;	pop	edx		; need not be preserved
;	pop	ecx		; need not be preserved
	pop	ebx
	mov	esp,ebp		; esp <- aligned ebp
	pop	esp		; esp <- original ebp
	pop	ebp
	ret
394
; --------------------------------------------------------------------------
;
; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
;
; GLOBAL(void)
; jsimd_h2v2_merged_upsample_mmx (JDIMENSION output_width,
;                                 JSAMPIMAGE input_buf,
;                                 JDIMENSION in_row_group_ctr,
;                                 JSAMPARRAY output_buf);
;
; Calling convention: 32-bit, all arguments on the stack (offsets below are
; relative to ebp after the standard push ebp / mov ebp,esp prologue).
;
; Implemented as two calls to jsimd_h2v1_merged_upsample_mmx, both using the
; same chroma planes and the same in_row_group_ctr.  A synthetic three-entry
; JSAMPIMAGE is built on the stack whose plane-0 entry is
; input_buf[0] + in_row_group_ctr (in row-pointer units).  The callee indexes
; every plane by in_row_group_ctr, so plane 0 resolves to
; input_buf[0][2*in_row_group_ctr] while planes 1 and 2 resolve to
; input_buf[k][in_row_group_ctr].  For the second call the plane-0 entry and
; the output_buf argument are each advanced by one row pointer.
;
; Stack layout while calling (ebx = address of the synthetic JSAMPIMAGE):
;   [ebx+2*SIZEOF_POINTER]  inptr2   = input_buf[2]
;   [ebx+1*SIZEOF_POINTER]  inptr1   = input_buf[1]
;   [ebx+0*SIZEOF_POINTER]  inptr00 / inptr01
;   [ebx-1*SIZEOF_POINTER]  arg: output_buf   (outptr0 / outptr1)
;   [ebx-2*SIZEOF_POINTER]  arg: in_row_group_ctr
;   [ebx-3*SIZEOF_POINTER]  arg: input_buf    (= ebx)
;   [ebx-4*SIZEOF_POINTER]  arg: output_width
;
; The argument frame is pushed once and patched in place between the calls.
; This depends on the callee preserving ebx, esi and edi and on it never
; writing to its stack arguments; the h2v1 routine in this file satisfies
; both.
;
; Preserves ebx, esi, edi, ebp.  eax, ecx, edx and the MMX registers are left
; as the callee leaves them.
;

%define output_width(b)	(b)+8			; JDIMENSION output_width
%define input_buf(b)		(b)+12		; JSAMPIMAGE input_buf
%define in_row_group_ctr(b)	(b)+16		; JDIMENSION in_row_group_ctr
%define output_buf(b)		(b)+20		; JSAMPARRAY output_buf

	align	16
	global	EXTN(jsimd_h2v2_merged_upsample_mmx) PRIVATE

EXTN(jsimd_h2v2_merged_upsample_mmx):
	push	ebp
	mov	ebp,esp
	push	ebx
;	push	ecx		; need not be preserved
;	push	edx		; need not be preserved
	push	esi
	push	edi

	mov	eax, JDIMENSION [output_width(ebp)]

	mov	edi, JSAMPIMAGE [input_buf(ebp)]
	mov	ecx, JDIMENSION [in_row_group_ctr(ebp)]
	mov	esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
	mov	ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
	mov	edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
	mov	edi, JSAMPARRAY [output_buf(ebp)]
	; Pre-offset plane 0 by in_row_group_ctr entries; the callee adds the
	; same index again, giving row 2*in_row_group_ctr.
	lea	esi, [esi+ecx*SIZEOF_JSAMPROW]

	; Synthetic JSAMPIMAGE { inptr00, inptr1, inptr2 } on the stack.
	push	edx			; inptr2
	push	ebx			; inptr1
	push	esi			; inptr00
	mov	ebx,esp			; ebx = &synthetic JSAMPIMAGE[0]

	; Arguments for the h2v1 routine, pushed right to left.
	push	edi			; output_buf (outptr0)
	push	ecx			; in_row_group_ctr
	push	ebx			; input_buf
	push	eax			; output_width

	call	near EXTN(jsimd_h2v1_merged_upsample_mmx)

	; Patch the frame for the second row pair member: next Y row pointer
	; slot and next output row pointer slot.
	add	esi, byte SIZEOF_JSAMPROW	; inptr01
	add	edi, byte SIZEOF_JSAMPROW	; outptr1
	mov	POINTER [ebx+0*SIZEOF_POINTER], esi
	mov	POINTER [ebx-1*SIZEOF_POINTER], edi

	call	near EXTN(jsimd_h2v1_merged_upsample_mmx)

	; Drop 4 arguments + 3 synthetic JSAMPIMAGE entries.
	add	esp, byte 7*SIZEOF_DWORD

	pop	edi
	pop	esi
;	pop	edx		; need not be preserved
;	pop	ecx		; need not be preserved
	pop	ebx
	pop	ebp
	ret
461
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
	align	16
465