• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1;
2; jcqnts2f.asm - sample data conversion and quantization (SSE & SSE2)
3;
4; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5;
6; Based on
7; x86 SIMD extension for IJG JPEG library
8; Copyright (C) 1999-2006, MIYASAKA Masaru.
9; For conditions of distribution and use, see copyright notice in jsimdext.inc
10;
11; This file should be assembled with NASM (Netwide Assembler); it
12; can *not* be assembled with Microsoft's MASM or any compatible
13; assembler (including Borland's Turbo Assembler).
14; NASM is available from http://nasm.sourceforge.net/ or
15; http://sourceforge.net/project/showfiles.php?group_id=6208
16;
17; [TAB8]
18
19%include "jsimdext.inc"
20%include "jdct.inc"
21
22; --------------------------------------------------------------------------
23	SECTION	SEG_TEXT
24	BITS	32
;
; Convert a block of unsigned samples to signed floats in the workspace
;
; GLOBAL(void)
; jsimd_convsamp_float_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
;                            FAST_FLOAT * workspace);
;
; Each pass of the loop takes two row pointers from sample_data, reads
; eight bytes from each row starting at start_col, subtracts 0x80 from
; every byte, sign-extends the bytes to dwords, converts them to floats
; and writes sixteen floats to the workspace.  DCTSIZE/2 passes are made.
;
; Saved/restored: ebp, ebx, esi, edi.
; Overwritten:    eax, ecx, edx, xmm0-xmm3, xmm7, flags.
; The workspace is written with movaps, which faults on an address that
; is not 16-byte aligned.
;

; Stack arguments, addressed through the frame pointer set up below.
%define sample_data	ebp+8		; JSAMPARRAY sample_data
%define start_col	ebp+12		; JDIMENSION start_col
%define workspace	ebp+16		; FAST_FLOAT * workspace

	align	16
	global	EXTN(jsimd_convsamp_float_sse2) PRIVATE

EXTN(jsimd_convsamp_float_sse2):
	push	ebp
	mov	ebp, esp
	push	ebx
	push	esi
	push	edi
	; ecx and edx are used as scratch below and are deliberately not saved.

	; Pick up the three arguments.
	mov	eax, JDIMENSION [start_col]	; eax = first column to read
	mov	esi, JSAMPARRAY [sample_data]	; esi -> next pair of row pointers
	mov	edi, POINTER [workspace]	; edi -> next output floats

	; Build the per-byte bias in registers only:
	; 0xFFFF words -> 0xFF80 words -> saturating pack gives 0x80 bytes.
	pcmpeqw	xmm7, xmm7
	psllw	xmm7, 7
	packsswb xmm7, xmm7			; xmm7 = PB_CENTERJSAMPLE (0x80 in every byte)

	mov	ecx, DCTSIZE/2			; ecx = row pairs still to convert
	alignx	16,7
.rowpair:
	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; ebx -> first row of the pair
	mov	edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; edx -> second row of the pair
	add	esi, byte 2*SIZEOF_JSAMPROW		; esi is not read again in this pass

	; Both rows are fetched before anything is written back.
	movq	xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]	; xmm0=(01234567)
	movq	xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]	; xmm1=(89ABCDEF)

	; ---- first row: samples 0-3 end up in xmm2, samples 4-7 in xmm0 ----
	psubb	xmm0, xmm7			; drop the 0x80 bias; bytes are now signed
	punpcklbw xmm0, xmm0			; every byte doubled into a word
	punpcklwd xmm2, xmm0			; sample sits in the top byte of each dword;
						; the low word is whatever xmm2 held before
	punpckhwd xmm0, xmm0			; same for samples 4-7
	psrad	xmm2, (DWORD_BIT-BYTE_BIT)	; arithmetic shift: sign-extend the top byte,
	psrad	xmm0, (DWORD_BIT-BYTE_BIT)	; discarding everything below it
	cvtdq2ps xmm2, xmm2			; xmm2=(0123) as floats
	cvtdq2ps xmm0, xmm0			; xmm0=(4567) as floats

	; ---- second row: samples 8-B end up in xmm3, samples C-F in xmm1 ----
	psubb	xmm1, xmm7
	punpcklbw xmm1, xmm1
	punpcklwd xmm3, xmm1
	punpckhwd xmm1, xmm1
	psrad	xmm3, (DWORD_BIT-BYTE_BIT)
	psrad	xmm1, (DWORD_BIT-BYTE_BIT)
	cvtdq2ps xmm3, xmm3			; xmm3=(89AB) as floats
	cvtdq2ps xmm1, xmm1			; xmm1=(CDEF) as floats

	; Write both converted rows.
	movaps	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm2
	movaps	XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
	movaps	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
	movaps	XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1

	add	edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT	; advance output by two rows
	dec	ecx
	jnz	short .rowpair

	; Restore in the reverse order of the pushes above.
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret
102
103
104; --------------------------------------------------------------------------
;
; Scale the workspace by the divisors table and store 16-bit coefficients
;
; GLOBAL(void)
; jsimd_quantize_float_sse2 (JCOEFPTR coef_block, FAST_FLOAT * divisors,
;                            FAST_FLOAT * workspace);
;
; Every workspace float is MULTIPLIED by the float at the same position
; in divisors (so the table presumably holds reciprocals -- confirm
; against the code that fills it).  The products are converted to 32-bit
; integers with cvtps2dq, which rounds according to the current MXCSR
; rounding mode, then narrowed to 16 bits with signed saturation and
; written to coef_block.  Sixteen values are handled per pass and
; DCTSIZE2/16 passes are made.
;
; Saved/restored: ebp, esi, edi.
; Overwritten:    eax, edx, xmm0-xmm3, flags.
; All three buffers are accessed with movaps/movdqa (directly or as a
; mulps memory operand), which fault on addresses that are not 16-byte
; aligned.
;

; Stack arguments, addressed through the frame pointer set up below.
%define coef_block	ebp+8		; JCOEFPTR coef_block
%define divisors	ebp+12		; FAST_FLOAT * divisors
%define workspace	ebp+16		; FAST_FLOAT * workspace

	align	16
	global	EXTN(jsimd_quantize_float_sse2) PRIVATE

EXTN(jsimd_quantize_float_sse2):
	push	ebp
	mov	ebp, esp
	push	esi
	push	edi
	; ebx and ecx are never touched; edx is scratch and is not saved.

	mov	edi, JCOEFPTR [coef_block]	; edi -> output coefficients
	mov	edx, POINTER [divisors]		; edx -> multiplier table
	mov	esi, POINTER [workspace]	; esi -> input floats
	mov	eax, DCTSIZE2/16		; eax = groups of 16 values left
	alignx	16,7
.next16:
	; Load four floats, scale them and round to dwords, one register at
	; a time.  The workspace value is always the register operand and the
	; table entry the memory operand of mulps.
	movaps	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
	mulps	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
	cvtps2dq xmm0, xmm0

	movaps	xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
	mulps	xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
	cvtps2dq xmm1, xmm1

	movaps	xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
	mulps	xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
	cvtps2dq xmm2, xmm2

	movaps	xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
	mulps	xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
	cvtps2dq xmm3, xmm3

	; Narrow 4+4 dwords to 8 words per register, saturating to the
	; signed 16-bit range.
	packssdw xmm0, xmm1
	packssdw xmm2, xmm3

	movdqa	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_JCOEF)], xmm0
	movdqa	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_JCOEF)], xmm2

	; Step all three pointers past the 16 values just processed.
	add	edx, byte 16*SIZEOF_FAST_FLOAT
	add	esi, byte 16*SIZEOF_FAST_FLOAT
	add	edi, byte 16*SIZEOF_JCOEF
	dec	eax
	jnz	short .next16

	; Restore in the reverse order of the pushes above.
	pop	edi
	pop	esi
	pop	ebp
	ret
168
169; For some reason, the OS X linker does not honor the request to align the
170; segment unless we do this.
171	align	16
172