;
; jcsample.asm - downsampling (SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

; The macros and type/size names used below (EXTN, JDIMENSION, JSAMPROW,
; SIZEOF_XMMWORD, alignx, SEG_TEXT, ...) are not defined in this file;
; presumably they all come from this include -- verify against jsimdext.inc.
%include "jsimdext.inc"

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    32                      ; everything below is 32-bit code
;
; Downsample pixel values of a single component.
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
;
; Calling details (as visible in this routine):
;   - All six arguments are read from the stack through ebp (offsets below).
;   - ebp, esi and edi are saved and restored; ebx is never touched.
;   - eax, ecx, edx, xmm0-xmm3, xmm6, xmm7 and the flags are clobbered.
;   - The direction flag is cleared (cld) only when edge expansion runs.
;
; Processing steps:
;   1. output_cols = width_blocks * 8.  If that is zero, return immediately.
;   2. expand_right_edge: when output_cols*2 > image_width, each of the first
;      max_v_samp_factor input rows is padded in place, from column
;      image_width up to column output_cols*2, with a copy of the sample at
;      column image_width-1.
;   3. For each of v_samp_factor rows, every horizontal pair of input samples
;      (a, b) produces one output sample (a + b + bias) >> 1, where bias
;      alternates 0, 1, 0, 1, ... over successive output columns.
;
; Memory requirements implied by the instructions used:
;   - Rows are read and written with movdqa, so every input and output row
;     pointer must be 16-byte aligned.
;   - Output is always stored 16 bytes at a time.  When only 8 output
;     columns remain, 16 input bytes are still loaded and 16 output bytes
;     are still stored (the upper 8 are zero), so the output row must have
;     room for output_cols rounded up to a multiple of 16.
;

%define img_width(b)    (b)+8           ; JDIMENSION image_width
%define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
%define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
%define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
%define input_data(b)   (b)+24          ; JSAMPARRAY input_data
%define output_data(b)  (b)+28          ; JSAMPARRAY output_data

        align   16
        global  EXTN(jsimd_h2v1_downsample_sse2)

EXTN(jsimd_h2v1_downsample_sse2):
        push    ebp
        mov     ebp,esp
;       push    ebx             ; unused
;       push    ecx             ; need not be preserved
;       push    edx             ; need not be preserved
        push    esi
        push    edi

        mov     ecx, JDIMENSION [width_blks(ebp)]
        shl     ecx,3                   ; ecx = output_cols (width_blocks * DCTSIZE)
        jz      near .return            ; no output columns: nothing to do

        mov     edx, JDIMENSION [img_width(ebp)]        ; edx = image_width

        ; -- expand_right_edge
        ; Pad each input row on the right so that output_cols*2 samples are
        ; available to the downsampling loop.

        push    ecx                             ; keep output_cols for later
        shl     ecx,1                           ; output_cols * 2
        sub     ecx,edx                         ; ecx = number of pad samples
        jle     short .expand_end               ; row is already wide enough

        mov     eax, INT [max_v_samp(ebp)]      ; eax = rows left to pad
        test    eax,eax
        jle     short .expand_end

        cld                                     ; stosb must move upward
        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
        alignx  16,7
.expandloop:
        push    eax                     ; al is about to hold the fill value
        push    ecx                     ; rep stosb consumes ecx

        mov     edi, JSAMPROW [esi]
        add     edi,edx                 ; edi -> first sample past image_width
        mov     al, JSAMPLE [edi-1]     ; al = last real sample of this row

        rep stosb                       ; replicate it ecx times

        pop     ecx
        pop     eax

        add     esi, byte SIZEOF_JSAMPROW       ; next row pointer
        dec     eax
        jg      short .expandloop

.expand_end:
        pop     ecx                             ; output_cols

        ; -- h2v1_downsample

        mov     eax, JDIMENSION [v_samp(ebp)]   ; rowctr
        test    eax,eax
        jle     near .return

        mov     edx, 0x00010000         ; bias pattern
        movd    xmm7,edx
        pcmpeqw xmm6,xmm6               ; xmm6 = all ones
        pshufd  xmm7,xmm7,0x00          ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
        psrlw   xmm6,BYTE_BIT           ; xmm6={0xFF 0x00 0xFF 0x00 ..}
                                        ; (mask keeping the low byte of each word)

        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
        mov     edi, JSAMPARRAY [output_data(ebp)]      ; output_data
        alignx  16,7
.rowloop:
        ; Invariant: ecx = output_cols, eax = rows remaining (> 0),
        ; esi / edi = current positions in the input / output row arrays.
        push    ecx
        push    edi
        push    esi

        mov     esi, JSAMPROW [esi]             ; inptr
        mov     edi, JSAMPROW [edi]             ; outptr

        cmp     ecx, byte SIZEOF_XMMWORD
        jae     short .columnloop
        alignx  16,7

.columnloop_r8:
        ; Fewer than 16 output columns remain: load a single input vector,
        ; use zeros for the second one, and pretend 16 columns remain so
        ; that the subtraction below leaves ecx at zero.
        movdqa  xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
        pxor    xmm1,xmm1
        mov     ecx, SIZEOF_XMMWORD
        jmp     short .downsample
        alignx  16,7

.columnloop:
        ; Full step: 32 input samples -> 16 output samples.
        movdqa  xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
        movdqa  xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD]

.downsample:
        movdqa  xmm2,xmm0
        movdqa  xmm3,xmm1

        pand    xmm0,xmm6               ; even-indexed samples as words
        psrlw   xmm2,BYTE_BIT           ; odd-indexed samples as words
        pand    xmm1,xmm6
        psrlw   xmm3,BYTE_BIT

        paddw   xmm0,xmm2               ; a + b for each pair
        paddw   xmm1,xmm3
        paddw   xmm0,xmm7               ; + alternating bias
        paddw   xmm1,xmm7
        psrlw   xmm0,1                  ; / 2
        psrlw   xmm1,1

        packuswb xmm0,xmm1              ; 2 x 8 words -> 16 bytes

        movdqa  XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0

        sub     ecx, byte SIZEOF_XMMWORD        ; outcol
        add     esi, byte 2*SIZEOF_XMMWORD      ; inptr
        add     edi, byte 1*SIZEOF_XMMWORD      ; outptr
        cmp     ecx, byte SIZEOF_XMMWORD
        jae     short .columnloop
        test    ecx,ecx
        jnz     short .columnloop_r8            ; partial tail still pending

        pop     esi
        pop     edi
        pop     ecx

        add     esi, byte SIZEOF_JSAMPROW       ; input_data
        add     edi, byte SIZEOF_JSAMPROW       ; output_data
        dec     eax                             ; rowctr
        jg      near .rowloop

.return:
        pop     edi
        pop     esi
;       pop     edx             ; need not be preserved
;       pop     ecx             ; need not be preserved
;       pop     ebx             ; unused
        pop     ebp
        ret

; --------------------------------------------------------------------------
;
; Downsample pixel values of a single component.
; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
;
; Calling details (as visible in this routine):
;   - All six arguments are read from the stack through ebp (offsets below).
;   - ebp, esi and edi are saved and restored; ebx is never touched.
;   - eax, ecx, edx, xmm0-xmm7 and the flags are clobbered.
;   - The direction flag is cleared (cld) only when edge expansion runs.
;
; Processing steps:
;   1. output_cols = width_blocks * 8.  If that is zero, return immediately.
;   2. expand_right_edge: when output_cols*2 > image_width, each of the first
;      max_v_samp_factor input rows is padded in place, from column
;      image_width up to column output_cols*2, with a copy of the sample at
;      column image_width-1.
;   3. For each of v_samp_factor output rows, two consecutive input rows are
;      consumed.  Every 2x2 group of input samples (a0, b0 from the first
;      row, a1, b1 from the second) produces one output sample
;      (a0 + b0 + a1 + b1 + bias) >> 2, where bias alternates
;      1, 2, 1, 2, ... over successive output columns.
;
; Memory requirements implied by the instructions used:
;   - Rows are read and written with movdqa, so every input and output row
;     pointer must be 16-byte aligned.
;   - Output is always stored 16 bytes at a time.  When only 8 output
;     columns remain, 16 bytes are still loaded from each input row and 16
;     output bytes are still stored (the upper 8 are zero), so the output
;     row must have room for output_cols rounded up to a multiple of 16.
;

%define img_width(b)    (b)+8           ; JDIMENSION image_width
%define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
%define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
%define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
%define input_data(b)   (b)+24          ; JSAMPARRAY input_data
%define output_data(b)  (b)+28          ; JSAMPARRAY output_data

        align   16
        global  EXTN(jsimd_h2v2_downsample_sse2)

EXTN(jsimd_h2v2_downsample_sse2):
        push    ebp
        mov     ebp,esp
;       push    ebx             ; unused
;       push    ecx             ; need not be preserved
;       push    edx             ; need not be preserved
        push    esi
        push    edi

        mov     ecx, JDIMENSION [width_blks(ebp)]
        shl     ecx,3                   ; ecx = output_cols (width_blocks * DCTSIZE)
        jz      near .return            ; no output columns: nothing to do

        mov     edx, JDIMENSION [img_width(ebp)]        ; edx = image_width

        ; -- expand_right_edge
        ; Pad each input row on the right so that output_cols*2 samples are
        ; available to the downsampling loop.

        push    ecx                             ; keep output_cols for later
        shl     ecx,1                           ; output_cols * 2
        sub     ecx,edx                         ; ecx = number of pad samples
        jle     short .expand_end               ; row is already wide enough

        mov     eax, INT [max_v_samp(ebp)]      ; eax = rows left to pad
        test    eax,eax
        jle     short .expand_end

        cld                                     ; stosb must move upward
        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
        alignx  16,7
.expandloop:
        push    eax                     ; al is about to hold the fill value
        push    ecx                     ; rep stosb consumes ecx

        mov     edi, JSAMPROW [esi]
        add     edi,edx                 ; edi -> first sample past image_width
        mov     al, JSAMPLE [edi-1]     ; al = last real sample of this row

        rep stosb                       ; replicate it ecx times

        pop     ecx
        pop     eax

        add     esi, byte SIZEOF_JSAMPROW       ; next row pointer
        dec     eax
        jg      short .expandloop

.expand_end:
        pop     ecx                             ; output_cols

        ; -- h2v2_downsample

        mov     eax, JDIMENSION [v_samp(ebp)]   ; rowctr
        test    eax,eax
        jle     near .return

        mov     edx, 0x00020001         ; bias pattern
        movd    xmm7,edx
        pcmpeqw xmm6,xmm6               ; xmm6 = all ones
        pshufd  xmm7,xmm7,0x00          ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
        psrlw   xmm6,BYTE_BIT           ; xmm6={0xFF 0x00 0xFF 0x00 ..}
                                        ; (mask keeping the low byte of each word)

        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
        mov     edi, JSAMPARRAY [output_data(ebp)]      ; output_data
        alignx  16,7
.rowloop:
        ; Invariant: ecx = output_cols, eax = output rows remaining (> 0),
        ; esi / edi = current positions in the input / output row arrays.
        push    ecx
        push    edi
        push    esi

        mov     edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; inptr0
        mov     esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; inptr1
        mov     edi, JSAMPROW [edi]                     ; outptr

        cmp     ecx, byte SIZEOF_XMMWORD
        jae     short .columnloop
        alignx  16,7

.columnloop_r8:
        ; Fewer than 16 output columns remain: load one vector from each
        ; input row, use zeros for the second pair, and pretend 16 columns
        ; remain so that the subtraction below leaves ecx at zero.
        movdqa  xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
        movdqa  xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
        pxor    xmm2,xmm2
        pxor    xmm3,xmm3
        mov     ecx, SIZEOF_XMMWORD
        jmp     short .downsample
        alignx  16,7

.columnloop:
        ; Full step: 2 rows x 32 input samples -> 16 output samples.
        movdqa  xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
        movdqa  xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
        movdqa  xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD]
        movdqa  xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD]

.downsample:
        ; First 16 input columns: horizontal pair sums for each row.
        movdqa  xmm4,xmm0
        movdqa  xmm5,xmm1
        pand    xmm0,xmm6               ; even-indexed samples as words
        psrlw   xmm4,BYTE_BIT           ; odd-indexed samples as words
        pand    xmm1,xmm6
        psrlw   xmm5,BYTE_BIT
        paddw   xmm0,xmm4               ; row 0: a0 + b0
        paddw   xmm1,xmm5               ; row 1: a1 + b1

        ; Second 16 input columns: same computation.
        movdqa  xmm4,xmm2
        movdqa  xmm5,xmm3
        pand    xmm2,xmm6
        psrlw   xmm4,BYTE_BIT
        pand    xmm3,xmm6
        psrlw   xmm5,BYTE_BIT
        paddw   xmm2,xmm4
        paddw   xmm3,xmm5

        paddw   xmm0,xmm1               ; sum of each 2x2 group
        paddw   xmm2,xmm3
        paddw   xmm0,xmm7               ; + alternating bias
        paddw   xmm2,xmm7
        psrlw   xmm0,2                  ; / 4
        psrlw   xmm2,2

        packuswb xmm0,xmm2              ; 2 x 8 words -> 16 bytes

        movdqa  XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0

        sub     ecx, byte SIZEOF_XMMWORD        ; outcol
        add     edx, byte 2*SIZEOF_XMMWORD      ; inptr0
        add     esi, byte 2*SIZEOF_XMMWORD      ; inptr1
        add     edi, byte 1*SIZEOF_XMMWORD      ; outptr
        cmp     ecx, byte SIZEOF_XMMWORD
        jae     near .columnloop
        test    ecx,ecx
        jnz     near .columnloop_r8             ; partial tail still pending

        pop     esi
        pop     edi
        pop     ecx

        add     esi, byte 2*SIZEOF_JSAMPROW     ; input_data
        add     edi, byte 1*SIZEOF_JSAMPROW     ; output_data
        dec     eax                             ; rowctr
        jg      near .rowloop

.return:
        pop     edi
        pop     esi
;       pop     edx             ; need not be preserved
;       pop     ecx             ; need not be preserved
;       pop     ebx             ; unused
        pop     ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16