;
; jcqntsse.asm - sample data conversion and quantization (SSE & MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler); it
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
	SECTION	SEG_TEXT
	BITS	32
;
; Load data into workspace, applying unsigned->signed conversion
;
; GLOBAL(void)
; jsimd_convsamp_float_sse (JSAMPARRAY sample_data, JDIMENSION start_col,
;                           FAST_FLOAT * workspace);
;
; Arguments are read from the stack relative to ebp (see the %defines below).
;
; Each pass of .convloop takes two consecutive row pointers from sample_data,
; loads 8 samples from each row starting at column start_col, subtracts the
; 0x80 byte bias held in mm7, sign-extends every byte to a dword, converts
; the dwords to single-precision floats and writes the resulting 16 floats
; to workspace.  The loop runs DCTSIZE/2 times.
;
; Register roles inside the loop:
;   esi     = cursor into the row-pointer array (advanced by 2 rows per pass)
;   eax     = start_col (constant)
;   edi     = cursor into workspace
;   ecx     = remaining pass count
;   ebx,edx = the two row pointers of the current pass
;   mm7     = PB_CENTERJSAMPLE (0x80 in every byte)
;
; Preserved: ebp, ebx, esi, edi.
; Modified:  eax, ecx, edx, mm0-mm7, xmm0-xmm7, flags.
; workspace is written with movaps, which requires 16-byte-aligned
; addresses.  The MMX state is cleared with emms before returning.
;

%define sample_data	ebp+8		; JSAMPARRAY sample_data
%define start_col	ebp+12		; JDIMENSION start_col
%define workspace	ebp+16		; FAST_FLOAT * workspace

	align	16
	global	EXTN(jsimd_convsamp_float_sse) PRIVATE

EXTN(jsimd_convsamp_float_sse):
	push	ebp
	mov	ebp,esp
	push	ebx
;	push	ecx		; need not be preserved
;	push	edx		; need not be preserved
	push	esi
	push	edi

	; Build the bias constant without a memory load:
	; 0xFFFF -> 0xFF80 (word = -128) -> saturating pack gives byte 0x80.
	pcmpeqw  mm7,mm7
	psllw    mm7,7
	packsswb mm7,mm7		; mm7 = PB_CENTERJSAMPLE (0x808080..)

	mov	esi, JSAMPARRAY [sample_data]	; (JSAMPROW *)
	mov	eax, JDIMENSION [start_col]
	mov	edi, POINTER [workspace]	; (FAST_FLOAT *)
	mov	ecx, DCTSIZE/2
	alignx	16,7
.convloop:
	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; (JSAMPLE *)
	mov	edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; (JSAMPLE *)

	movq	mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
	movq	mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]

	psubb	mm0,mm7				; mm0=(01234567)
	psubb	mm1,mm7				; mm1=(89ABCDEF)

	; The unpacks below deliberately interleave with whatever the
	; destination register already holds: each sample ends up in the
	; most significant byte of its word/dword and the '*' positions are
	; don't-care bits that the later psrad shifts out.
	punpcklbw mm2,mm0			; mm2=(*0*1*2*3)
	punpckhbw mm0,mm0			; mm0=(*4*5*6*7)
	punpcklbw mm3,mm1			; mm3=(*8*9*A*B)
	punpckhbw mm1,mm1			; mm1=(*C*D*E*F)

	punpcklwd mm4,mm2			; mm4=(***0***1)
	punpckhwd mm2,mm2			; mm2=(***2***3)
	punpcklwd mm5,mm0			; mm5=(***4***5)
	punpckhwd mm0,mm0			; mm0=(***6***7)

	; Arithmetic right shift by (DWORD_BIT-BYTE_BIT) (presumably 24;
	; the constants come from the include files) brings the sample
	; byte down to the low byte with sign extension.  cvtpi2ps then
	; converts the two dwords into the low two floats of the xmm
	; register.
	psrad     mm4,(DWORD_BIT-BYTE_BIT)	; mm4=(01)
	psrad     mm2,(DWORD_BIT-BYTE_BIT)	; mm2=(23)
	cvtpi2ps  xmm0,mm4			; xmm0=(01**)
	cvtpi2ps  xmm1,mm2			; xmm1=(23**)
	psrad     mm5,(DWORD_BIT-BYTE_BIT)	; mm5=(45)
	psrad     mm0,(DWORD_BIT-BYTE_BIT)	; mm0=(67)
	cvtpi2ps  xmm2,mm5			; xmm2=(45**)
	cvtpi2ps  xmm3,mm0			; xmm3=(67**)

	; Second row; mm4 is free again at this point and is reused.
	punpcklwd mm6,mm3			; mm6=(***8***9)
	punpckhwd mm3,mm3			; mm3=(***A***B)
	punpcklwd mm4,mm1			; mm4=(***C***D)
	punpckhwd mm1,mm1			; mm1=(***E***F)

	psrad     mm6,(DWORD_BIT-BYTE_BIT)	; mm6=(89)
	psrad     mm3,(DWORD_BIT-BYTE_BIT)	; mm3=(AB)
	cvtpi2ps  xmm4,mm6			; xmm4=(89**)
	cvtpi2ps  xmm5,mm3			; xmm5=(AB**)
	psrad     mm4,(DWORD_BIT-BYTE_BIT)	; mm4=(CD)
	psrad     mm1,(DWORD_BIT-BYTE_BIT)	; mm1=(EF)
	cvtpi2ps  xmm6,mm4			; xmm6=(CD**)
	cvtpi2ps  xmm7,mm1			; xmm7=(EF**)

	; Merge the float pairs: low half of the source becomes the high
	; half of the destination.
	movlhps   xmm0,xmm1			; xmm0=(0123)
	movlhps   xmm2,xmm3			; xmm2=(4567)
	movlhps   xmm4,xmm5			; xmm4=(89AB)
	movlhps   xmm6,xmm7			; xmm6=(CDEF)

	movaps	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
	movaps	XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm2
	movaps	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm4
	movaps	XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6

	add	esi, byte 2*SIZEOF_JSAMPROW		; next pair of rows
	add	edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT	; next two workspace rows
	dec	ecx
	jnz	near .convloop

	emms		; empty MMX state

	pop	edi
	pop	esi
;	pop	edx		; need not be preserved
;	pop	ecx		; need not be preserved
	pop	ebx
	pop	ebp
	ret


; --------------------------------------------------------------------------
;
; Quantize/descale the coefficients, and store into coef_block
;
; GLOBAL(void)
; jsimd_quantize_float_sse (JCOEFPTR coef_block, FAST_FLOAT * divisors,
;                           FAST_FLOAT * workspace);
;
; Arguments are read from the stack relative to ebp (see the %defines below).
;
; Each pass of .quantloop handles 16 coefficients: the workspace floats are
; multiplied element-wise by the matching entries of divisors (note: this
; is a multiply, so the table presumably holds reciprocal quantization
; factors -- confirm against the code that builds it), converted to 32-bit
; integers with cvtps2pi (which rounds according to the current MXCSR
; rounding mode), narrowed to 16 bits with signed saturation (packssdw) and
; stored to coef_block.  The loop runs DCTSIZE2/16 times.
;
; Register roles inside the loop:
;   esi = cursor into workspace
;   edx = cursor into divisors
;   edi = cursor into coef_block
;   eax = remaining pass count
;
; Preserved: ebp, esi, edi.  ebx and ecx are not touched.
; Modified:  eax, edx, mm0-mm7, xmm0-xmm7, flags.
; workspace and divisors are read with movaps/mulps memory operands, which
; require 16-byte-aligned addresses.  The MMX state is cleared with emms
; before returning.
;

%define coef_block	ebp+8		; JCOEFPTR coef_block
%define divisors	ebp+12		; FAST_FLOAT * divisors
%define workspace	ebp+16		; FAST_FLOAT * workspace

	align	16
	global	EXTN(jsimd_quantize_float_sse) PRIVATE

EXTN(jsimd_quantize_float_sse):
	push	ebp
	mov	ebp,esp
;	push	ebx		; unused
;	push	ecx		; unused
;	push	edx		; need not be preserved
	push	esi
	push	edi

	mov	esi, POINTER [workspace]
	mov	edx, POINTER [divisors]
	mov	edi, JCOEFPTR [coef_block]
	mov	eax, DCTSIZE2/16
	alignx	16,7
.quantloop:
	; Load 16 floats and scale them by the divisor table.
	movaps	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
	movaps	xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
	mulps	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
	mulps	xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
	movaps	xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
	movaps	xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
	mulps	xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
	mulps	xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]

	; cvtps2pi converts only the low two floats of its source, so the
	; high pair of each register is first copied down into a spare xmm.
	movhlps  xmm4,xmm0
	movhlps  xmm5,xmm1

	cvtps2pi mm0,xmm0
	cvtps2pi mm1,xmm1
	cvtps2pi mm4,xmm4
	cvtps2pi mm5,xmm5

	movhlps  xmm6,xmm2
	movhlps  xmm7,xmm3

	cvtps2pi mm2,xmm2
	cvtps2pi mm3,xmm3
	cvtps2pi mm6,xmm6
	cvtps2pi mm7,xmm7

	; Narrow each group of four dwords to four saturated signed words;
	; the low pair comes from the destination, the high pair from the
	; source, which restores the original element order.
	packssdw mm0,mm4
	packssdw mm1,mm5
	packssdw mm2,mm6
	packssdw mm3,mm7

	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm2
	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3

	; Advance all three cursors by the 16 elements just processed.
	add	esi, byte 16*SIZEOF_FAST_FLOAT
	add	edx, byte 16*SIZEOF_FAST_FLOAT
	add	edi, byte 16*SIZEOF_JCOEF
	dec	eax
	jnz	short .quantloop

	emms		; empty MMX state

	pop	edi
	pop	esi
;	pop	edx		; need not be preserved
;	pop	ecx		; unused
;	pop	ebx		; unused
	pop	ebp
	ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
	align	16
