• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
;
; jquanti.asm - sample data conversion and quantization (SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler); it
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
16
17%include "jsimdext.inc"
18%include "jdct.inc"
19
20; --------------------------------------------------------------------------
21    SECTION     SEG_TEXT
22    BITS        32
;
; Copy one block of samples into the workspace as signed 16-bit words
; (unsigned->signed conversion).
;
; GLOBAL(void)
; jsimd_convsamp_sse2(JSAMPARRAY sample_data, JDIMENSION start_col,
;                     DCTELEM *workspace);
;
; Every pass of the loop handles four rows.  From each row, 8 bytes are
; fetched at column start_col, widened to words by interleaving them with
; zero bytes, and biased by 0xFF80 (-128 in 16-bit two's complement).  The
; four results are written to XMMBLOCK rows 0..3 at the current workspace
; position.  DCTSIZE/4 passes are made.
;
; Register roles:
;   esi      = current position in the row-pointer array
;   edi      = current position in the workspace
;   eax      = start_col
;   ecx      = passes remaining
;   ebx, edx = row pointers of the pair being fetched
;   xmm6     = zero,  xmm7 = 0xFF80 in every word
;
; ebx, esi and edi are saved and restored; eax, ecx and edx are left
; modified.  The workspace is written with movdqa, which requires a
; 16-byte aligned address.
;

%define sample_data  ebp + 8            ; JSAMPARRAY sample_data
%define start_col    ebp + 12           ; JDIMENSION start_col
%define workspace    ebp + 16           ; DCTELEM *workspace

    align       32
    GLOBAL_FUNCTION(jsimd_convsamp_sse2)

EXTN(jsimd_convsamp_sse2):
    push        ebp
    mov         ebp, esp
    push        ebx                     ; ecx/edx are not saved (scratch)
    push        esi
    push        edi

    ; Pick up the three stack arguments.
    mov         edi, POINTER [workspace]       ; edi -> output (DCTELEM *)
    mov         eax, JDIMENSION [start_col]    ; eax = column offset
    mov         esi, JSAMPARRAY [sample_data]  ; esi -> row pointers

    ; Loop-invariant vectors.
    pcmpeqw     xmm7, xmm7              ; every word = 0xFFFF
    psllw       xmm7, 7                 ; every word = 0xFF80
    pxor        xmm6, xmm6              ; every byte = 0

    mov         ecx, DCTSIZE/4          ; four rows are handled per pass
    alignx      16, 7
.four_rows:
    ; Fetch 8 bytes from each of the next four rows (all reads happen
    ; before any write of this pass).
    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]      ; row 0
    mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]      ; row 1
    movq        xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]  ; xmm0=row 0 bytes
    movq        xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]  ; xmm1=row 1 bytes
    mov         ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]      ; row 2
    mov         edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]      ; row 3
    movq        xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]  ; xmm2=row 2 bytes
    movq        xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]  ; xmm3=row 3 bytes

    ; Zero-extend bytes to words.
    punpcklbw   xmm0, xmm6
    punpcklbw   xmm1, xmm6
    punpcklbw   xmm2, xmm6
    punpcklbw   xmm3, xmm6

    ; Apply the bias: word + 0xFF80 == word - 128 (mod 2^16).
    paddw       xmm0, xmm7
    paddw       xmm1, xmm7
    paddw       xmm2, xmm7
    paddw       xmm3, xmm7

    ; Write the four converted rows.
    movdqa      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
    movdqa      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
    movdqa      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
    movdqa      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3

    ; Step both cursors past the four rows just handled.
    add         edi, byte 4*DCTSIZE*SIZEOF_DCTELEM
    add         esi, byte 4*SIZEOF_JSAMPROW
    dec         ecx
    jnz         short .four_rows

    pop         edi
    pop         esi
    pop         ebx
    pop         ebp
    ret
95
96; --------------------------------------------------------------------------
;
; Quantize/descale the coefficients, and store into coef_block
;
; The method follows an algorithm described in
;   "How to optimize for the Pentium family of microprocessors"
;   (http://www.agner.org/assem/).
;
; GLOBAL(void)
; jsimd_quantize_sse2(JCOEFPTR coef_block, DCTELEM *divisors,
;                     DCTELEM *workspace);
;
; Per 16-bit coefficient c read from workspace, the loop computes
;   s      = c >> (WORD_BIT-1)            (arithmetic shift: sign mask)
;   a      = (c ^ s) - s                  (magnitude)
;   q      = hi16(hi16((a + correction) * reciprocal) * scale)
;   result = (q ^ s) - s                  (sign put back)
; where hi16() is the upper half of the unsigned 16x16-bit product
; (pmulhuw).  correction, reciprocal and scale are read from the divisors
; table through the CORRECTION/RECIPROCAL/SCALE macros below, which select
; XMMBLOCK rows offset by DCTSIZE*1, DCTSIZE*0 and DCTSIZE*2 respectively.
;
; Each pass handles four XMMWORDs (32 coefficients); DCTSIZE2/32 passes
; are made.
;
; Register roles:
;   esi = workspace cursor    edx = divisors cursor    edi = output cursor
;   eax = passes remaining
;   xmm0-xmm3 = values being quantized,  xmm4-xmm7 = their sign masks
;
; esi and edi are saved and restored; eax and edx are left modified; ebx
; and ecx are not touched.  All three buffers are accessed with aligned
; 16-byte operations (movdqa / SSE memory operands).
;

%define RECIPROCAL(m, n, b) \
  XMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
%define CORRECTION(m, n, b) \
  XMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
%define SCALE(m, n, b) \
  XMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)

%define coef_block  ebp + 8             ; JCOEFPTR coef_block
%define divisors    ebp + 12            ; DCTELEM *divisors
%define workspace   ebp + 16            ; DCTELEM *workspace

    align       32
    GLOBAL_FUNCTION(jsimd_quantize_sse2)

EXTN(jsimd_quantize_sse2):
    push        ebp
    mov         ebp, esp
    push        esi                     ; only esi/edi need saving here
    push        edi

    ; Pick up the three stack arguments.
    mov         edi, JCOEFPTR [coef_block]  ; edi -> quantized output
    mov         edx, POINTER [divisors]     ; edx -> divisor tables
    mov         esi, POINTER [workspace]    ; esi -> DCT output to quantize
    mov         eax, DCTSIZE2/32            ; 32 coefficients per pass
    alignx      16, 7
.next_batch:
    ; Read four rows of coefficients.
    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
    movdqa      xmm1, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)]
    movdqa      xmm2, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
    movdqa      xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_DCTELEM)]

    ; Derive a sign mask for every word by replicating its top bit.
    movdqa      xmm4, xmm0
    movdqa      xmm5, xmm1
    movdqa      xmm6, xmm2
    movdqa      xmm7, xmm3
    psraw       xmm4, (WORD_BIT-1)
    psraw       xmm5, (WORD_BIT-1)
    psraw       xmm6, (WORD_BIT-1)
    psraw       xmm7, (WORD_BIT-1)

    ; Magnitude: (x ^ mask) - mask negates exactly the negative words.
    pxor        xmm0, xmm4
    psubw       xmm0, xmm4              ; xmm0 = |row 0|
    pxor        xmm1, xmm5
    psubw       xmm1, xmm5              ; xmm1 = |row 1|
    pxor        xmm2, xmm6
    psubw       xmm2, xmm6              ; xmm2 = |row 2|
    pxor        xmm3, xmm7
    psubw       xmm3, xmm7              ; xmm3 = |row 3|

    ; Add the correction term (correction + roundfactor).
    paddw       xmm0, XMMWORD [CORRECTION(0,0,edx)]
    paddw       xmm1, XMMWORD [CORRECTION(1,0,edx)]
    paddw       xmm2, XMMWORD [CORRECTION(2,0,edx)]
    paddw       xmm3, XMMWORD [CORRECTION(3,0,edx)]

    ; Multiply by the reciprocal, keeping the high word of each product.
    pmulhuw     xmm0, XMMWORD [RECIPROCAL(0,0,edx)]
    pmulhuw     xmm1, XMMWORD [RECIPROCAL(1,0,edx)]
    pmulhuw     xmm2, XMMWORD [RECIPROCAL(2,0,edx)]
    pmulhuw     xmm3, XMMWORD [RECIPROCAL(3,0,edx)]

    ; Multiply by the scale, again keeping the high word.
    pmulhuw     xmm0, XMMWORD [SCALE(0,0,edx)]
    pmulhuw     xmm1, XMMWORD [SCALE(1,0,edx)]
    pmulhuw     xmm2, XMMWORD [SCALE(2,0,edx)]
    pmulhuw     xmm3, XMMWORD [SCALE(3,0,edx)]

    ; Put the original signs back with the same xor/subtract trick.
    pxor        xmm0, xmm4
    psubw       xmm0, xmm4
    pxor        xmm1, xmm5
    psubw       xmm1, xmm5
    pxor        xmm2, xmm6
    psubw       xmm2, xmm6
    pxor        xmm3, xmm7
    psubw       xmm3, xmm7

    ; Write the four quantized rows.
    movdqa      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
    movdqa      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
    movdqa      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
    movdqa      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3

    ; Step all three cursors past the 32 elements just handled.
    add         edi, byte 32*SIZEOF_JCOEF
    add         edx, byte 32*SIZEOF_DCTELEM
    add         esi, byte 32*SIZEOF_DCTELEM
    dec         eax
    jnz         near .next_batch

    pop         edi
    pop         esi
    pop         ebp
    ret
198
199; For some reason, the OS X linker does not honor the request to align the
200; segment unless we do this.
201    align       32
202