• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1;
2; jquant.asm - sample data conversion and quantization (SSE & MMX)
3;
4; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5; Copyright (C) 2016, D. R. Commander.
6;
7; Based on the x86 SIMD extension for IJG JPEG library
8; Copyright (C) 1999-2006, MIYASAKA Masaru.
9; For conditions of distribution and use, see copyright notice in jsimdext.inc
10;
11; This file should be assembled with NASM (Netwide Assembler); it
12; can *not* be assembled with Microsoft's MASM or any compatible
13; assembler (including Borland's Turbo Assembler).
14; NASM is available from http://nasm.sourceforge.net/ or
15; http://sourceforge.net/project/showfiles.php?group_id=6208
16;
17; [TAB8]
18
19%include "jsimdext.inc"
20%include "jdct.inc"
21
22; --------------------------------------------------------------------------
; Everything below is emitted into the section named by SEG_TEXT and is
; assembled as 32-bit code.  SEG_TEXT is not defined in this file; presumably
; it comes from jsimdext.inc -- verify there.
23    SECTION     SEG_TEXT
24    BITS        32
25;
26; Load data into workspace, applying unsigned->signed conversion
27;
28; GLOBAL(void)
29; jsimd_convsamp_float_sse(JSAMPARRAY sample_data, JDIMENSION start_col,
30;                          FAST_FLOAT *workspace);
31;
; Each loop iteration reads 8 one-byte samples, starting at column start_col,
; from each of two consecutive rows of sample_data.  It subtracts 0x80 from
; every byte, sign-extends the results to 32-bit integers, converts them to
; single-precision floats and stores the 16 floats to workspace.  The loop
; runs DCTSIZE/2 times.
;
; Arguments are read from the stack relative to ebp (see the %defines below).
; ebx, esi, edi and ebp are saved and restored; eax, ecx, edx, mm0-mm7 and
; xmm0-xmm7 are overwritten.  The stores use movaps, so workspace must be
; 16-byte aligned.  The MMX state is cleared with emms before returning.
;
32
33%define sample_data  ebp + 8            ; JSAMPARRAY sample_data
34%define start_col    ebp + 12           ; JDIMENSION start_col
35%define workspace    ebp + 16           ; FAST_FLOAT *workspace
36
37    align       32
38    GLOBAL_FUNCTION(jsimd_convsamp_float_sse)
39
40EXTN(jsimd_convsamp_float_sse):
; Set up an ebp frame so the argument %defines above resolve, then save the
; registers this routine uses that are restored before returning.
41    push        ebp
42    mov         ebp, esp
43    push        ebx
44;   push        ecx                     ; need not be preserved
45;   push        edx                     ; need not be preserved
46    push        esi
47    push        edi
48
; Build the 0x80 byte constant without a memory load: pcmpeqw sets every word
; to 0xFFFF, the shift left by 7 gives 0xFF80 (-128), and the signed-saturating
; pack narrows that to the byte 0x80 in all eight lanes.
49    pcmpeqw     mm7, mm7
50    psllw       mm7, 7
51    packsswb    mm7, mm7                ; mm7 = PB_CENTERJSAMPLE (0x808080..)
52
; Loop registers: esi = current position in the row-pointer array,
; eax = column offset (constant), edi = output pointer,
; ecx = iterations remaining (two rows are handled per iteration).
53    mov         esi, JSAMPARRAY [sample_data]  ; (JSAMPROW *)
54    mov         eax, JDIMENSION [start_col]
55    mov         edi, POINTER [workspace]       ; (FAST_FLOAT *)
56    mov         ecx, DCTSIZE/2
; alignx is a macro defined outside this file; presumably it aligns the loop
; entry to 16 bytes -- verify in the include files.
57    alignx      16, 7
58.convloop:
; Fetch the pointers to the next two sample rows.
59    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
60    mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
61
; Load 8 samples from each row at the requested column.
62    movq        mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
63    movq        mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]
64
; Subtracting 0x80 with byte wraparound re-centres each unsigned sample so
; that it can be interpreted as a signed byte.
65    psubb       mm0, mm7                ; mm0=(01234567)
66    psubb       mm1, mm7                ; mm1=(89ABCDEF)
67
; The unpack instructions below place each sample in the upper byte of a wider
; element.  The other bytes ('*' in the register diagrams) are don't-care
; values: either whatever the destination register held before, or a copy of
; the sample.  The arithmetic right shifts further down discard them.
68    punpcklbw   mm2, mm0                ; mm2=(*0*1*2*3)
69    punpckhbw   mm0, mm0                ; mm0=(*4*5*6*7)
70    punpcklbw   mm3, mm1                ; mm3=(*8*9*A*B)
71    punpckhbw   mm1, mm1                ; mm1=(*C*D*E*F)
72
73    punpcklwd   mm4, mm2                ; mm4=(***0***1)
74    punpckhwd   mm2, mm2                ; mm2=(***2***3)
75    punpcklwd   mm5, mm0                ; mm5=(***4***5)
76    punpckhwd   mm0, mm0                ; mm0=(***6***7)
77
; psrad moves the sample from the top byte of each dword to the bottom with
; sign extension (assuming DWORD_BIT and BYTE_BIT are 32 and 8; they are
; defined outside this file).  cvtpi2ps then converts the two signed dwords to
; two floats in the low half of the xmm register and leaves the upper half
; ('**') unchanged.
78    psrad       mm4, (DWORD_BIT-BYTE_BIT)  ; mm4=(01)
79    psrad       mm2, (DWORD_BIT-BYTE_BIT)  ; mm2=(23)
80    cvtpi2ps    xmm0, mm4                  ; xmm0=(01**)
81    cvtpi2ps    xmm1, mm2                  ; xmm1=(23**)
82    psrad       mm5, (DWORD_BIT-BYTE_BIT)  ; mm5=(45)
83    psrad       mm0, (DWORD_BIT-BYTE_BIT)  ; mm0=(67)
84    cvtpi2ps    xmm2, mm5                  ; xmm2=(45**)
85    cvtpi2ps    xmm3, mm0                  ; xmm3=(67**)
86
; Same widening for the second row.  mm4 is reused here; its previous contents
; were already consumed by the cvtpi2ps into xmm0 above.
87    punpcklwd   mm6, mm3                ; mm6=(***8***9)
88    punpckhwd   mm3, mm3                ; mm3=(***A***B)
89    punpcklwd   mm4, mm1                ; mm4=(***C***D)
90    punpckhwd   mm1, mm1                ; mm1=(***E***F)
91
92    psrad       mm6, (DWORD_BIT-BYTE_BIT)  ; mm6=(89)
93    psrad       mm3, (DWORD_BIT-BYTE_BIT)  ; mm3=(AB)
94    cvtpi2ps    xmm4, mm6                  ; xmm4=(89**)
95    cvtpi2ps    xmm5, mm3                  ; xmm5=(AB**)
96    psrad       mm4, (DWORD_BIT-BYTE_BIT)  ; mm4=(CD)
97    psrad       mm1, (DWORD_BIT-BYTE_BIT)  ; mm1=(EF)
98    cvtpi2ps    xmm6, mm4                  ; xmm6=(CD**)
99    cvtpi2ps    xmm7, mm1                  ; xmm7=(EF**)
100
; movlhps copies the low two floats of the source into the high half of the
; destination, joining two pairs into four consecutive samples.
101    movlhps     xmm0, xmm1              ; xmm0=(0123)
102    movlhps     xmm2, xmm3              ; xmm2=(4567)
103    movlhps     xmm4, xmm5              ; xmm4=(89AB)
104    movlhps     xmm6, xmm7              ; xmm6=(CDEF)
105
; Store the 16 floats.  XMMBLOCK is a macro defined outside this file;
; presumably its first two arguments select the row and the 4-float group
; within the row -- verify in the include files.  movaps faults on an address
; that is not 16-byte aligned.
106    movaps      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
107    movaps      XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm2
108    movaps      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm4
109    movaps      XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
110
; Advance past the two row pointers just used and past 2*DCTSIZE output
; floats, then repeat until the iteration counter reaches zero.
111    add         esi, byte 2*SIZEOF_JSAMPROW
112    add         edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
113    dec         ecx
114    jnz         near .convloop
115
116    emms                                ; empty MMX state
117
; Restore the saved registers in the reverse order of the pushes.
118    pop         edi
119    pop         esi
120;   pop         edx                     ; need not be preserved
121;   pop         ecx                     ; need not be preserved
122    pop         ebx
123    pop         ebp
124    ret
125
126; --------------------------------------------------------------------------
127;
128; Quantize/descale the coefficients, and store into coef_block
129;
130; GLOBAL(void)
131; jsimd_quantize_float_sse(JCOEFPTR coef_block, FAST_FLOAT *divisors,
132;                          FAST_FLOAT *workspace);
133;
; Each loop iteration multiplies 16 floats from workspace by the corresponding
; 16 floats from divisors, converts the products to 32-bit integers with
; cvtps2pi (which rounds according to the current MXCSR rounding mode),
; narrows them to 16 bits with signed saturation and stores the 16 results to
; coef_block.  The loop runs DCTSIZE2/16 times.
;
; Note that the code multiplies by the divisors table rather than dividing;
; presumably the caller fills it with reciprocals -- verify against the code
; that builds the table.
;
; Arguments are read from the stack relative to ebp (see the %defines below).
; esi, edi and ebp are saved and restored; eax, edx, mm0-mm7 and xmm0-xmm7 are
; overwritten.  The movaps loads and the mulps memory operands require
; workspace and divisors to be 16-byte aligned.  The MMX state is cleared with
; emms before returning.
;
134
135%define coef_block  ebp + 8             ; JCOEFPTR coef_block
136%define divisors    ebp + 12            ; FAST_FLOAT *divisors
137%define workspace   ebp + 16            ; FAST_FLOAT *workspace
138
139    align       32
140    GLOBAL_FUNCTION(jsimd_quantize_float_sse)
141
142EXTN(jsimd_quantize_float_sse):
; Set up an ebp frame so the argument %defines above resolve, then save the
; two registers that are restored before returning.
143    push        ebp
144    mov         ebp, esp
145;   push        ebx                     ; unused
146;   push        ecx                     ; unused
147;   push        edx                     ; need not be preserved
148    push        esi
149    push        edi
150
; Loop registers: esi = input floats, edx = multipliers, edi = output
; coefficients, eax = iterations remaining (16 elements per iteration).
151    mov         esi, POINTER [workspace]
152    mov         edx, POINTER [divisors]
153    mov         edi, JCOEFPTR [coef_block]
154    mov         eax, DCTSIZE2/16
; alignx is a macro defined outside this file; presumably it aligns the loop
; entry to 16 bytes -- verify in the include files.
155    alignx      16, 7
156.quantloop:
; Load four groups of four floats and scale each by the matching group of the
; divisors table.  XMMBLOCK is a macro defined outside this file; presumably
; its first two arguments select the row and the 4-float group within the
; row -- verify in the include files.
157    movaps      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
158    movaps      xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
159    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
160    mulps       xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
161    movaps      xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
162    movaps      xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
163    mulps       xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
164    mulps       xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
165
; cvtps2pi converts only the low two floats of its source, so movhlps first
; copies the upper two products of each group into the low half of a scratch
; register (xmm4/xmm5 here, xmm6/xmm7 below).
166    movhlps     xmm4, xmm0
167    movhlps     xmm5, xmm1
168
; Convert each pair of floats to a pair of signed 32-bit integers.
169    cvtps2pi    mm0, xmm0
170    cvtps2pi    mm1, xmm1
171    cvtps2pi    mm4, xmm4
172    cvtps2pi    mm5, xmm5
173
174    movhlps     xmm6, xmm2
175    movhlps     xmm7, xmm3
176
177    cvtps2pi    mm2, xmm2
178    cvtps2pi    mm3, xmm3
179    cvtps2pi    mm6, xmm6
180    cvtps2pi    mm7, xmm7
181
; packssdw narrows the dwords to words with signed saturation.  The
; destination's two values land in the low words and the source's two values
; in the high words, which restores the original element order within each
; group of four.
182    packssdw    mm0, mm4
183    packssdw    mm1, mm5
184    packssdw    mm2, mm6
185    packssdw    mm3, mm7
186
; Store four results per register.  MMBLOCK is a macro defined outside this
; file; presumably it mirrors XMMBLOCK with 4-element groups -- verify in the
; include files.
187    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
188    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
189    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm2
190    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3
191
; Advance all three pointers by the 16 elements just processed and repeat
; until the iteration counter reaches zero.
192    add         esi, byte 16*SIZEOF_FAST_FLOAT
193    add         edx, byte 16*SIZEOF_FAST_FLOAT
194    add         edi, byte 16*SIZEOF_JCOEF
195    dec         eax
196    jnz         short .quantloop
197
198    emms                                ; empty MMX state
199
; Restore the saved registers in the reverse order of the pushes.
200    pop         edi
201    pop         esi
202;   pop         edx                     ; need not be preserved
203;   pop         ecx                     ; unused
204;   pop         ebx                     ; unused
205    pop         ebp
206    ret
207
208; For some reason, the OS X linker does not honor the request to align the
209; segment unless we do this.
; The directive below pads the end of the current section up to the next
; 32-byte boundary.
210    align       32
211