;
; jquant.asm - sample data conversion and quantization (SSE & MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler); it
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208

; The macros and constants used in this file (EXTN, GLOBAL_FUNCTION, alignx,
; XMMBLOCK, MMBLOCK, SEG_TEXT, DCTSIZE, SIZEOF_*, ...) are not defined here;
; they are presumably supplied by the two include files below.
%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
; Everything that follows is emitted into SEG_TEXT and assembled as 32-bit
; code.
    SECTION     SEG_TEXT
    BITS        32
;
; Load data into workspace, applying unsigned->signed conversion
;
; GLOBAL(void)
; jsimd_convsamp_float_sse(JSAMPARRAY sample_data, JDIMENSION start_col,
;                          FAST_FLOAT *workspace);
;
; Processes DCTSIZE/2 pairs of sample rows.  From each row, the 8 bytes
; starting at column start_col are loaded, 0x80 is subtracted from every
; byte (bytewise, wrapping), and each result is sign-extended to 32 bits,
; converted to single-precision float and stored to the workspace.
;
; In:     [ebp+8]  = sample_data
;         [ebp+12] = start_col
;         [ebp+16] = workspace
;         (all three arguments are read from the stack; 32-bit code)
; Out:    no register result; the workspace is filled
; Saved:  ebp, ebx, esi, edi
; Clobb:  eax, ecx, edx, mm0-mm7, xmm0-xmm7, flags
; Notes:  - The workspace is written with movaps, so every store address
;           must be 16-byte aligned.
;         - emms is executed before returning, so the MMX state is empty on
;           exit.
;
; Register roles inside the loop:
;   esi     = cursor into the array of row pointers (advances 2 per pass)
;   eax     = start_col
;   edi     = output cursor in the workspace
;   ecx     = row pairs still to process
;   ebx/edx = the two sample rows of the current pair
;   mm7     = 0x80 replicated in every byte
;
; In the lane diagrams, a hex digit names a sample and '*' is a don't-care
; byte.  The unpack instructions deliberately interleave with whatever the
; destination register already holds: the sample always lands in the most
; significant byte of its word/dword, and the arithmetic right shift by
; (DWORD_BIT-BYTE_BIT) bits then sign-extends the sample and discards the
; don't-care bytes.  That is why mm2..mm6 need no initialization.
;

%define sample_data  ebp + 8            ; JSAMPARRAY sample_data
%define start_col    ebp + 12           ; JDIMENSION start_col
%define workspace    ebp + 16           ; FAST_FLOAT *workspace

    align       32
    GLOBAL_FUNCTION(jsimd_convsamp_float_sse)

EXTN(jsimd_convsamp_float_sse):
    push        ebp
    mov         ebp, esp
    push        ebx
    ; ecx and edx are used below but are not saved (need not be preserved).
    push        esi
    push        edi

    ; Build the bias constant without touching memory:
    ;   all ones -> each word 0xFF80 -> saturating pack -> each byte 0x80
    pcmpeqw     mm7, mm7
    psllw       mm7, 7
    packsswb    mm7, mm7                ; mm7 = PB_CENTERJSAMPLE (0x808080..)

    mov         esi, JSAMPARRAY [sample_data]  ; (JSAMPROW *)
    mov         eax, JDIMENSION [start_col]
    mov         edi, POINTER [workspace]       ; (FAST_FLOAT *)
    mov         ecx, DCTSIZE/2                 ; two rows per iteration
    alignx      16, 7
.convloop:
    ; Fetch the two row pointers of this pair.
    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)

    ; Load 8 samples from each row, starting at start_col.
    movq        mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
    movq        mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]

    ; Remove the 0x80 bias: unsigned sample -> signed byte.
    psubb       mm0, mm7                ; mm0=(01234567)
    psubb       mm1, mm7                ; mm1=(89ABCDEF)

    ; Bytes -> words, sample in the high byte of each word.
    punpcklbw   mm2, mm0                ; mm2=(*0*1*2*3)
    punpckhbw   mm0, mm0                ; mm0=(*4*5*6*7)
    punpcklbw   mm3, mm1                ; mm3=(*8*9*A*B)
    punpckhbw   mm1, mm1                ; mm1=(*C*D*E*F)

    ; First row: words -> dwords, sample in the top byte of each dword.
    punpcklwd   mm4, mm2                ; mm4=(***0***1)
    punpckhwd   mm2, mm2                ; mm2=(***2***3)
    punpcklwd   mm5, mm0                ; mm5=(***4***5)
    punpckhwd   mm0, mm0                ; mm0=(***6***7)

    ; Sign-extend and convert two samples at a time to float.  cvtpi2ps
    ; writes only the low two lanes of its destination.
    psrad       mm4, (DWORD_BIT-BYTE_BIT)  ; mm4=(01)
    psrad       mm2, (DWORD_BIT-BYTE_BIT)  ; mm2=(23)
    cvtpi2ps    xmm0, mm4                  ; xmm0=(01**)
    cvtpi2ps    xmm1, mm2                  ; xmm1=(23**)
    psrad       mm5, (DWORD_BIT-BYTE_BIT)  ; mm5=(45)
    psrad       mm0, (DWORD_BIT-BYTE_BIT)  ; mm0=(67)
    cvtpi2ps    xmm2, mm5                  ; xmm2=(45**)
    cvtpi2ps    xmm3, mm0                  ; xmm3=(67**)

    ; Second row: same widening.  mm4 is free again here because its
    ; previous value was already consumed by the conversion into xmm0.
    punpcklwd   mm6, mm3                ; mm6=(***8***9)
    punpckhwd   mm3, mm3                ; mm3=(***A***B)
    punpcklwd   mm4, mm1                ; mm4=(***C***D)
    punpckhwd   mm1, mm1                ; mm1=(***E***F)

    psrad       mm6, (DWORD_BIT-BYTE_BIT)  ; mm6=(89)
    psrad       mm3, (DWORD_BIT-BYTE_BIT)  ; mm3=(AB)
    cvtpi2ps    xmm4, mm6                  ; xmm4=(89**)
    cvtpi2ps    xmm5, mm3                  ; xmm5=(AB**)
    psrad       mm4, (DWORD_BIT-BYTE_BIT)  ; mm4=(CD)
    psrad       mm1, (DWORD_BIT-BYTE_BIT)  ; mm1=(EF)
    cvtpi2ps    xmm6, mm4                  ; xmm6=(CD**)
    cvtpi2ps    xmm7, mm1                  ; xmm7=(EF**)

    ; Merge the float pairs into full 4-lane vectors.
    movlhps     xmm0, xmm1              ; xmm0=(0123)
    movlhps     xmm2, xmm3              ; xmm2=(4567)
    movlhps     xmm4, xmm5              ; xmm4=(89AB)
    movlhps     xmm6, xmm7              ; xmm6=(CDEF)

    ; Store both converted rows (aligned stores).
    movaps      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
    movaps      XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm2
    movaps      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm4
    movaps      XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6

    ; Advance past the two rows just handled.
    add         esi, byte 2*SIZEOF_JSAMPROW
    add         edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
    dec         ecx
    jnz         near .convloop

    emms                                ; empty MMX state

    pop         edi
    pop         esi
    pop         ebx
    pop         ebp
    ret

; --------------------------------------------------------------------------
;
; Quantize/descale the coefficients, and store into coef_block
;
; GLOBAL(void)
; jsimd_quantize_float_sse(JCOEFPTR coef_block, FAST_FLOAT *divisors,
;                          FAST_FLOAT *workspace);
;
; Runs DCTSIZE2/16 iterations, handling 16 coefficients per iteration:
; each workspace float is multiplied by the matching entry of the divisors
; table, converted to a 32-bit integer, narrowed to 16 bits with signed
; saturation, and stored to coef_block.
;
; In:     [ebp+8]  = coef_block
;         [ebp+12] = divisors
;         [ebp+16] = workspace
;         (all three arguments are read from the stack; 32-bit code)
; Out:    no register result; coef_block is filled
; Saved:  ebp, esi, edi
; Clobb:  eax, edx, mm0-mm7, xmm0-xmm7, flags (ebx and ecx are not used)
; Notes:  - workspace is read with movaps and divisors is a mulps memory
;           operand, so both must be 16-byte aligned.
;         - cvtps2pi rounds according to the current MXCSR rounding mode.
;         - The table entries are multiplied in, not divided by, so the
;           table presumably holds reciprocals -- confirm against whoever
;           builds it.
;         - emms is executed before returning, so the MMX state is empty on
;           exit.
;
; Register roles inside the loop:
;   esi = input cursor in workspace
;   edx = cursor in divisors
;   edi = output cursor in coef_block
;   eax = iterations still to run
;

%define coef_block  ebp + 8             ; JCOEFPTR coef_block
%define divisors    ebp + 12            ; FAST_FLOAT *divisors
%define workspace   ebp + 16            ; FAST_FLOAT *workspace

    align       32
    GLOBAL_FUNCTION(jsimd_quantize_float_sse)

EXTN(jsimd_quantize_float_sse):
    push        ebp
    mov         ebp, esp
    ; ebx/ecx are unused and edx need not be preserved, so none is saved.
    push        esi
    push        edi

    mov         esi, POINTER [workspace]
    mov         edx, POINTER [divisors]
    mov         edi, JCOEFPTR [coef_block]
    mov         eax, DCTSIZE2/16        ; 16 coefficients per iteration
    alignx      16, 7
.quantloop:
    ; Load 16 floats and scale each by its table entry.
    movaps      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
    movaps      xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
    mulps       xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
    movaps      xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
    movaps      xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
    mulps       xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
    mulps       xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]

    ; cvtps2pi converts only the low two lanes, so copy each high pair
    ; down into a scratch register first.
    movhlps     xmm4, xmm0
    movhlps     xmm5, xmm1

    cvtps2pi    mm0, xmm0               ; low pair of vector 0
    cvtps2pi    mm1, xmm1               ; low pair of vector 1
    cvtps2pi    mm4, xmm4               ; high pair of vector 0
    cvtps2pi    mm5, xmm5               ; high pair of vector 1

    movhlps     xmm6, xmm2
    movhlps     xmm7, xmm3

    cvtps2pi    mm2, xmm2               ; low pair of vector 2
    cvtps2pi    mm3, xmm3               ; low pair of vector 3
    cvtps2pi    mm6, xmm6               ; high pair of vector 2
    cvtps2pi    mm7, xmm7               ; high pair of vector 3

    ; Rejoin low and high pairs as four 16-bit values (signed saturation).
    packssdw    mm0, mm4
    packssdw    mm1, mm5
    packssdw    mm2, mm6
    packssdw    mm3, mm7

    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm2
    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3

    ; Advance all three cursors by the 16 elements just processed.
    add         esi, byte 16*SIZEOF_FAST_FLOAT
    add         edx, byte 16*SIZEOF_FAST_FLOAT
    add         edi, byte 16*SIZEOF_JCOEF
    dec         eax
    jnz         short .quantloop

    emms                                ; empty MMX state

    pop         edi
    pop         esi
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.  (Pads the end of the section to a 32-byte
; boundary.)
    align       32