;
; jquant.asm - sample data conversion and quantization (SSE & MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler); it
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
; Everything below is 32-bit code; both routines read their arguments from
; the stack through ebp.
        SECTION SEG_TEXT
        BITS    32
;
; Load data into workspace, applying unsigned->signed conversion
;
; GLOBAL(void)
; jsimd_convsamp_float_sse (JSAMPARRAY sample_data, JDIMENSION start_col,
;                           FAST_FLOAT *workspace);
;
; In:     stack arguments, addressed through ebp (see the %defines below).
; Out:    nothing in registers; converted samples are written to workspace.
; Saved:  ebp, ebx, esi, edi are pushed on entry and popped before return.
; Clobb:  eax, ecx, edx, mm0-mm7, xmm0-xmm7, flags.
;
; Each pass of .convloop handles two sample rows (8 samples per row) and the
; loop runs DCTSIZE/2 times.  Every sample has the 0x80 byte pattern
; subtracted from it, is sign-extended to a dword, converted to single
; precision and stored.  The stores use movaps, so the addresses produced by
; XMMBLOCK from the workspace pointer must be 16-byte aligned.
; emms is executed before returning because MMX registers are used.
;
; Register roles inside the loop:
;   esi = current position in the row-pointer array (sample_data)
;   eax = start_col (column offset applied to every row)
;   edi = current output position in workspace
;   ecx = remaining row pairs
;   ebx / edx = pointers to the two rows being converted
;   mm7 = 0x80 in every byte
;

%define sample_data     ebp+8           ; JSAMPARRAY sample_data
%define start_col       ebp+12          ; JDIMENSION start_col
%define workspace       ebp+16          ; FAST_FLOAT *workspace

        align   16
        global  EXTN(jsimd_convsamp_float_sse)

EXTN(jsimd_convsamp_float_sse):
        push    ebp
        mov     ebp,esp
        push    ebx
;       push    ecx             ; need not be preserved
;       push    edx             ; need not be preserved
        push    esi
        push    edi

        ; Build the bias constant without touching memory:
        ; all-ones words -> 0xFF80 words -> signed-saturating pack -> 0x80 bytes.
        pcmpeqw  mm7,mm7
        psllw    mm7,7
        packsswb mm7,mm7                ; mm7 = PB_CENTERJSAMPLE (0x808080..)

        mov     esi, JSAMPARRAY [sample_data]   ; (JSAMPROW *)
        mov     eax, JDIMENSION [start_col]
        mov     edi, POINTER [workspace]        ; (FAST_FLOAT *)
        mov     ecx, DCTSIZE/2                  ; two rows per iteration
        alignx  16,7
.convloop:
        mov     ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
        mov     edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; (JSAMPLE *)

        movq    mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
        movq    mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]

        ; Subtracting 0x80 per byte (wrapping) turns each unsigned sample
        ; into its centered signed-byte representation.
        psubb   mm0,mm7                         ; mm0=(01234567)
        psubb   mm1,mm7                         ; mm1=(89ABCDEF)

        ; The unpacks below place each sample in the HIGH byte of a word and
        ; then the HIGH word of a dword.  The low part (marked '*') is
        ; whatever the destination register held before; it is discarded by
        ; the arithmetic right shifts that follow, which also sign-extend.
        punpcklbw mm2,mm0                       ; mm2=(*0*1*2*3)
        punpckhbw mm0,mm0                       ; mm0=(*4*5*6*7)
        punpcklbw mm3,mm1                       ; mm3=(*8*9*A*B)
        punpckhbw mm1,mm1                       ; mm1=(*C*D*E*F)

        punpcklwd mm4,mm2                       ; mm4=(***0***1)
        punpckhwd mm2,mm2                       ; mm2=(***2***3)
        punpcklwd mm5,mm0                       ; mm5=(***4***5)
        punpckhwd mm0,mm0                       ; mm0=(***6***7)

        ; cvtpi2ps writes only the low two floats of the xmm destination;
        ; the upper half ('**') is filled in later by movlhps.
        psrad     mm4,(DWORD_BIT-BYTE_BIT)      ; mm4=(01)
        psrad     mm2,(DWORD_BIT-BYTE_BIT)      ; mm2=(23)
        cvtpi2ps  xmm0,mm4                      ; xmm0=(01**)
        cvtpi2ps  xmm1,mm2                      ; xmm1=(23**)
        psrad     mm5,(DWORD_BIT-BYTE_BIT)      ; mm5=(45)
        psrad     mm0,(DWORD_BIT-BYTE_BIT)      ; mm0=(67)
        cvtpi2ps  xmm2,mm5                      ; xmm2=(45**)
        cvtpi2ps  xmm3,mm0                      ; xmm3=(67**)

        ; Second row.  mm4 is free again here (already consumed above).
        punpcklwd mm6,mm3                       ; mm6=(***8***9)
        punpckhwd mm3,mm3                       ; mm3=(***A***B)
        punpcklwd mm4,mm1                       ; mm4=(***C***D)
        punpckhwd mm1,mm1                       ; mm1=(***E***F)

        psrad     mm6,(DWORD_BIT-BYTE_BIT)      ; mm6=(89)
        psrad     mm3,(DWORD_BIT-BYTE_BIT)      ; mm3=(AB)
        cvtpi2ps  xmm4,mm6                      ; xmm4=(89**)
        cvtpi2ps  xmm5,mm3                      ; xmm5=(AB**)
        psrad     mm4,(DWORD_BIT-BYTE_BIT)      ; mm4=(CD)
        psrad     mm1,(DWORD_BIT-BYTE_BIT)      ; mm1=(EF)
        cvtpi2ps  xmm6,mm4                      ; xmm6=(CD**)
        cvtpi2ps  xmm7,mm1                      ; xmm7=(EF**)

        ; Join the two-float halves into full four-float vectors.
        movlhps   xmm0,xmm1                     ; xmm0=(0123)
        movlhps   xmm2,xmm3                     ; xmm2=(4567)
        movlhps   xmm4,xmm5                     ; xmm4=(89AB)
        movlhps   xmm6,xmm7                     ; xmm6=(CDEF)

        movaps  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
        movaps  XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm2
        movaps  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm4
        movaps  XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6

        add     esi, byte 2*SIZEOF_JSAMPROW             ; next pair of row pointers
        add     edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT   ; next two output rows
        dec     ecx
        jnz     near .convloop

        emms            ; empty MMX state

        pop     edi
        pop     esi
;       pop     edx             ; need not be preserved
;       pop     ecx             ; need not be preserved
        pop     ebx
        pop     ebp
        ret


; --------------------------------------------------------------------------
;
; Quantize/descale the coefficients, and store into coef_block
;
; GLOBAL(void)
; jsimd_quantize_float_sse (JCOEFPTR coef_block, FAST_FLOAT *divisors,
;                           FAST_FLOAT *workspace);
;
; In:     stack arguments, addressed through ebp (see the %defines below).
; Out:    nothing in registers; results are written to coef_block.
; Saved:  ebp, esi, edi are pushed on entry and popped before return.
; Clobb:  eax, edx, mm0-mm7, xmm0-xmm7, flags.  ebx and ecx are not used.
;
; Each pass of .quantloop handles 16 coefficients and the loop runs
; DCTSIZE2/16 times.  Every workspace value is MULTIPLIED by the matching
; entry of the divisors table, converted to a 32-bit integer with cvtps2pi
; (which rounds according to the current MXCSR rounding mode) and then
; packed to 16 bits with signed saturation by packssdw.
; NOTE(review): since the code multiplies, the divisors table presumably
; holds reciprocals -- confirm against the code that fills it.
;
; workspace and divisors are accessed with movaps / mulps memory operands,
; so the addresses produced by XMMBLOCK from them must be 16-byte aligned.
; emms is executed before returning because MMX registers are used.
;
; Register roles inside the loop:
;   esi = current input position in workspace
;   edx = current position in divisors
;   edi = current output position in coef_block
;   eax = remaining groups of 16 coefficients
;

%define coef_block      ebp+8           ; JCOEFPTR coef_block
%define divisors        ebp+12          ; FAST_FLOAT *divisors
%define workspace       ebp+16          ; FAST_FLOAT *workspace

        align   16
        global  EXTN(jsimd_quantize_float_sse)

EXTN(jsimd_quantize_float_sse):
        push    ebp
        mov     ebp,esp
;       push    ebx             ; unused
;       push    ecx             ; unused
;       push    edx             ; need not be preserved
        push    esi
        push    edi

        mov     esi, POINTER [workspace]
        mov     edx, POINTER [divisors]
        mov     edi, JCOEFPTR [coef_block]
        mov     eax, DCTSIZE2/16                ; 16 coefficients per iteration
        alignx  16,7
.quantloop:
        ; Load four vectors of four floats and scale each by its table entry.
        movaps  xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
        movaps  xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
        mulps   xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
        mulps   xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
        movaps  xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
        movaps  xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
        mulps   xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
        mulps   xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]

        ; cvtps2pi converts only the low two floats of its source, so the
        ; upper two floats of each vector are first copied down into the low
        ; half of a scratch register (its upper half is not used afterwards).
        movhlps  xmm4,xmm0
        movhlps  xmm5,xmm1

        cvtps2pi mm0,xmm0
        cvtps2pi mm1,xmm1
        cvtps2pi mm4,xmm4
        cvtps2pi mm5,xmm5

        movhlps  xmm6,xmm2
        movhlps  xmm7,xmm3

        cvtps2pi mm2,xmm2
        cvtps2pi mm3,xmm3
        cvtps2pi mm6,xmm6
        cvtps2pi mm7,xmm7

        ; Recombine low/high dword pairs into four 16-bit values per
        ; register, saturating to the signed 16-bit range.
        packssdw mm0,mm4
        packssdw mm1,mm5
        packssdw mm2,mm6
        packssdw mm3,mm7

        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
        movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm2
        movq    MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3

        add     esi, byte 16*SIZEOF_FAST_FLOAT
        add     edx, byte 16*SIZEOF_FAST_FLOAT
        add     edi, byte 16*SIZEOF_JCOEF
        dec     eax
        jnz     short .quantloop

        emms            ; empty MMX state

        pop     edi
        pop     esi
;       pop     edx             ; need not be preserved
;       pop     ecx             ; unused
;       pop     ebx             ; unused
        pop     ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16