;
; jquant.asm - sample data conversion and quantization (3DNow! & MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    32
;
; Load data into workspace, applying unsigned->signed conversion
;
; GLOBAL(void)
; jsimd_convsamp_float_3dnow (JSAMPARRAY sample_data, JDIMENSION start_col,
;                             FAST_FLOAT *workspace);
;
; Arguments are read from the stack relative to ebp (32-bit code).
;
; Register usage inside the loop:
;   esi = current position in the row-pointer array (sample_data)
;   eax = start_col (index applied to every row pointer)
;   edi = current output position in workspace
;   ecx = loop counter, DCTSIZE/2 iterations
;   ebx, edx = the two row pointers handled by the current iteration
;   mm7 = per-byte centering constant subtracted from every sample
;
; Each iteration loads one MMWORD of samples from each of two consecutive
; rows, subtracts the centering constant, sign-extends every byte to a
; dword, converts it to float with pi2fd and stores eight MMWORDs.
;
; Preserved: ebx, esi, edi, ebp.  Clobbered: eax, ecx, edx, mm0-mm7, flags.
; femms is executed before returning.
;

%define sample_data     ebp+8           ; JSAMPARRAY sample_data
%define start_col       ebp+12          ; JDIMENSION start_col
%define workspace       ebp+16          ; FAST_FLOAT *workspace

        align   16
        global  EXTN(jsimd_convsamp_float_3dnow)

EXTN(jsimd_convsamp_float_3dnow):
        push    ebp
        mov     ebp,esp
        push    ebx
;       push    ecx             ; need not be preserved
;       push    edx             ; need not be preserved
        push    esi
        push    edi

        ; Build the byte constant without a memory load:
        ; 0xFFFF -> (<<7) 0xFF80 -> (signed saturating pack) 0x80 per byte.
        pcmpeqw  mm7,mm7
        psllw    mm7,7
        packsswb mm7,mm7                ; mm7 = PB_CENTERJSAMPLE (0x808080..)

        mov     esi, JSAMPARRAY [sample_data]   ; (JSAMPROW *)
        mov     eax, JDIMENSION [start_col]
        mov     edi, POINTER [workspace]        ; (FAST_FLOAT *)
        mov     ecx, DCTSIZE/2
        alignx  16,7
.convloop:
        mov     ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
        mov     edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; (JSAMPLE *)

        movq    mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
        movq    mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]

        psubb   mm0,mm7                         ; mm0=(01234567)
        psubb   mm1,mm7                         ; mm1=(89ABCDEF)

        ; The unpacks below place each sample in the HIGH byte of a wider
        ; element ('*' marks don't-care bytes taken from the destination);
        ; the arithmetic right shifts that follow discard the don't-care
        ; bytes and sign-extend the sample in one step.
        punpcklbw mm2,mm0                       ; mm2=(*0*1*2*3)
        punpckhbw mm0,mm0                       ; mm0=(*4*5*6*7)
        punpcklbw mm3,mm1                       ; mm3=(*8*9*A*B)
        punpckhbw mm1,mm1                       ; mm1=(*C*D*E*F)

        punpcklwd mm4,mm2                       ; mm4=(***0***1)
        punpckhwd mm2,mm2                       ; mm2=(***2***3)
        punpcklwd mm5,mm0                       ; mm5=(***4***5)
        punpckhwd mm0,mm0                       ; mm0=(***6***7)

        psrad   mm4,(DWORD_BIT-BYTE_BIT)        ; mm4=(01)
        psrad   mm2,(DWORD_BIT-BYTE_BIT)        ; mm2=(23)
        pi2fd   mm4,mm4
        pi2fd   mm2,mm2
        psrad   mm5,(DWORD_BIT-BYTE_BIT)        ; mm5=(45)
        psrad   mm0,(DWORD_BIT-BYTE_BIT)        ; mm0=(67)
        pi2fd   mm5,mm5
        pi2fd   mm0,mm0

        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm4
        movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm2
        movq    MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5
        movq    MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0

        punpcklwd mm6,mm3                       ; mm6=(***8***9)
        punpckhwd mm3,mm3                       ; mm3=(***A***B)
        punpcklwd mm4,mm1                       ; mm4=(***C***D)
        punpckhwd mm1,mm1                       ; mm1=(***E***F)

        psrad   mm6,(DWORD_BIT-BYTE_BIT)        ; mm6=(89)
        psrad   mm3,(DWORD_BIT-BYTE_BIT)        ; mm3=(AB)
        pi2fd   mm6,mm6
        pi2fd   mm3,mm3
        psrad   mm4,(DWORD_BIT-BYTE_BIT)        ; mm4=(CD)
        psrad   mm1,(DWORD_BIT-BYTE_BIT)        ; mm1=(EF)
        pi2fd   mm4,mm4
        pi2fd   mm1,mm1

        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm6
        movq    MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm3
        movq    MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm4
        movq    MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1

        add     esi, byte 2*SIZEOF_JSAMPROW     ; advance two input rows
        add     edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT   ; and two output rows
        dec     ecx
        jnz     near .convloop

        femms           ; empty MMX/3DNow! state

        pop     edi
        pop     esi
;       pop     edx             ; need not be preserved
;       pop     ecx             ; need not be preserved
        pop     ebx
        pop     ebp
        ret


; --------------------------------------------------------------------------
;
; Quantize/descale the coefficients, and store into coef_block
;
; GLOBAL(void)
; jsimd_quantize_float_3dnow (JCOEFPTR coef_block, FAST_FLOAT *divisors,
;                             FAST_FLOAT *workspace);
;
; Arguments are read from the stack relative to ebp (32-bit code).
;
; Register usage inside the loop:
;   esi = current position in workspace (input floats)
;   edx = current position in divisors (multiplied element-wise with input)
;   edi = current position in coef_block (output)
;   eax = loop counter, DCTSIZE2/16 iterations
;   mm7 = rounding constant 12582912.0F in both lanes
;
; Each iteration multiplies 16 workspace values by the matching divisors
; entries, adds the rounding constant, and then keeps only the low 16-bit
; word of each 32-bit result (gathered with the punpck*wd sequence) as the
; output coefficient.  Adding the constant is what leaves the rounded
; integer in those low bits.
;
; Preserved: esi, edi, ebp.  Clobbered: eax, edx, mm0-mm7, flags.
; femms is executed before returning.
;

%define coef_block      ebp+8           ; JCOEFPTR coef_block
%define divisors        ebp+12          ; FAST_FLOAT *divisors
%define workspace       ebp+16          ; FAST_FLOAT *workspace

        align   16
        global  EXTN(jsimd_quantize_float_3dnow)

EXTN(jsimd_quantize_float_3dnow):
        push    ebp
        mov     ebp,esp
;       push    ebx             ; unused
;       push    ecx             ; unused
;       push    edx             ; need not be preserved
        push    esi
        push    edi

        mov       eax, 0x4B400000       ; (float)0x00C00000 (rndint_magic)
        movd      mm7,eax
        punpckldq mm7,mm7               ; mm7={12582912.0F 12582912.0F}

        mov     esi, POINTER [workspace]
        mov     edx, POINTER [divisors]
        mov     edi, JCOEFPTR [coef_block]
        mov     eax, DCTSIZE2/16
        alignx  16,7
.quantloop:
        movq    mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
        movq    mm1, MMWORD [MMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
        pfmul   mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
        pfmul   mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
        movq    mm2, MMWORD [MMBLOCK(0,2,esi,SIZEOF_FAST_FLOAT)]
        movq    mm3, MMWORD [MMBLOCK(0,3,esi,SIZEOF_FAST_FLOAT)]
        pfmul   mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
        pfmul   mm3, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]

        ; In the layouts below each entry is one 16-bit word; '**' is the
        ; high word of a dword, which is discarded by the unpack sequence.
        pfadd   mm0,mm7                 ; mm0=(00 ** 01 **)
        pfadd   mm1,mm7                 ; mm1=(02 ** 03 **)
        pfadd   mm2,mm7                 ; mm2=(04 ** 05 **)
        pfadd   mm3,mm7                 ; mm3=(06 ** 07 **)

        movq      mm4,mm0
        punpcklwd mm0,mm1               ; mm0=(00 02 ** **)
        punpckhwd mm4,mm1               ; mm4=(01 03 ** **)
        movq      mm5,mm2
        punpcklwd mm2,mm3               ; mm2=(04 06 ** **)
        punpckhwd mm5,mm3               ; mm5=(05 07 ** **)

        punpcklwd mm0,mm4               ; mm0=(00 01 02 03)
        punpcklwd mm2,mm5               ; mm2=(04 05 06 07)

        movq    mm6, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
        movq    mm1, MMWORD [MMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
        pfmul   mm6, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
        pfmul   mm1, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
        movq    mm3, MMWORD [MMBLOCK(1,2,esi,SIZEOF_FAST_FLOAT)]
        movq    mm4, MMWORD [MMBLOCK(1,3,esi,SIZEOF_FAST_FLOAT)]
        pfmul   mm3, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]
        pfmul   mm4, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]

        pfadd   mm6,mm7                 ; mm6=(10 ** 11 **)
        pfadd   mm1,mm7                 ; mm1=(12 ** 13 **)
        pfadd   mm3,mm7                 ; mm3=(14 ** 15 **)
        pfadd   mm4,mm7                 ; mm4=(16 ** 17 **)

        movq      mm5,mm6
        punpcklwd mm6,mm1               ; mm6=(10 12 ** **)
        punpckhwd mm5,mm1               ; mm5=(11 13 ** **)
        movq      mm1,mm3
        punpcklwd mm3,mm4               ; mm3=(14 16 ** **)
        punpckhwd mm1,mm4               ; mm1=(15 17 ** **)

        punpcklwd mm6,mm5               ; mm6=(10 11 12 13)
        punpcklwd mm3,mm1               ; mm3=(14 15 16 17)

        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
        movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm2
        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm6
        movq    MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3

        ; Advance all three pointers by the 16 elements just processed.
        add     esi, byte 16*SIZEOF_FAST_FLOAT
        add     edx, byte 16*SIZEOF_FAST_FLOAT
        add     edi, byte 16*SIZEOF_JCOEF
        dec     eax
        jnz     near .quantloop

        femms           ; empty MMX/3DNow! state

        pop     edi
        pop     esi
;       pop     edx             ; need not be preserved
;       pop     ecx             ; unused
;       pop     ebx             ; unused
        pop     ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16
