• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1;
2; jcsample.asm - downsampling (64-bit SSE2)
3;
4; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5; Copyright (C) 2009, D. R. Commander.
6;
7; Based on the x86 SIMD extension for IJG JPEG library
8; Copyright (C) 1999-2006, MIYASAKA Masaru.
9; For conditions of distribution and use, see copyright notice in jsimdext.inc
10;
11; This file should be assembled with NASM (Netwide Assembler),
12; can *not* be assembled with Microsoft's MASM or any compatible
13; assembler (including Borland's Turbo Assembler).
14; NASM is available from http://nasm.sourceforge.net/ or
15; http://sourceforge.net/project/showfiles.php?group_id=6208
16;
17; [TAB8]
18
19%include "jsimdext.inc"
20
21; --------------------------------------------------------------------------
22        SECTION SEG_TEXT
23        BITS    64
;
; Downsample pixel values of a single component.
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
;
; The routine works in two phases:
;   1. expand_right_edge: for each of the first max_v_samp_factor input rows,
;      the last real sample (column image_width-1) is replicated to the right
;      until the row holds output_cols*2 samples, where
;      output_cols = width_blocks * 8.
;   2. h2v1_downsample: for each of v_samp_factor rows, every horizontal pair
;      of input samples is summed, an alternating bias of 0/1 is added, and
;      the result is halved to give one output sample.
;
; Registers written by the body: rax, rcx, rdx, rsi, rdi, xmm0-xmm3, xmm6,
; xmm7 and the flags.  collect_args / uncollect_args come from jsimdext.inc
; (not visible in this file); presumably they load r10-r15 from the incoming
; arguments and save/restore whatever the target ABI requires -- confirm there.
;
; All row accesses in the downsample phase use movdqa, which requires
; 16-byte-aligned addresses; that guarantee has to come from the caller.
;

; r10 = JDIMENSION image_width
; r11 = int max_v_samp_factor
; r12 = JDIMENSION v_samp_factor
; r13 = JDIMENSION width_blocks
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data

        align   16
        global  EXTN(jsimd_h2v1_downsample_sse2)

EXTN(jsimd_h2v1_downsample_sse2):
        push    rbp
        mov     rax,rsp                 ; entry rsp copy, set up before collect_args
                                        ; (presumably consumed by the macro -- confirm)
        mov     rbp,rsp
        collect_args

        mov     ecx, r13d               ; width_blocks (32-bit write zero-extends)
        shl     rcx,3                   ; imul rcx,DCTSIZE (rcx = output_cols)
        jz      near .return            ; nothing to do for zero output columns

        mov     edx, r10d               ; rdx = image_width (zero-extended)

        ; -- expand_right_edge

        push    rcx                             ; keep output_cols for phase 2
        shl     rcx,1                           ; output_cols * 2
        sub     rcx,rdx                         ; rcx = number of pad samples per row
        jle     short .expand_end               ; row already wide enough

        ; max_v_samp_factor is a 32-bit int.  Only the low half of r11 is
        ; read, as is done for every other 32-bit argument of this routine:
        ; the upper half of a register that carries a 32-bit argument is not
        ; guaranteed to be zero or sign-extended by the caller.
        mov     eax, r11d                       ; rax = numrows (zero-extended)
        test    eax,eax
        jle     short .expand_end               ; zero or negative row count

        cld                                     ; rep stosb must fill upwards
        mov     rsi, r14        ; input_data
.expandloop:
        push    rax                             ; al is reused as the fill value
        push    rcx                             ; rep stosb consumes rcx

        mov     rdi, JSAMPROW [rsi]             ; rdi = start of this row
        add     rdi,rdx                         ; rdi = first column past the image
        mov     al, JSAMPLE [rdi-1]             ; al = last real sample of the row

        rep stosb                               ; replicate it rcx times

        pop     rcx
        pop     rax

        add     rsi, byte SIZEOF_JSAMPROW       ; next row pointer
        dec     rax
        jg      short .expandloop

.expand_end:
        pop     rcx                             ; output_cols

        ; -- h2v1_downsample

        mov     eax, r12d        ; rowctr
        test    eax,eax
        jle     near .return

        mov     edx, 0x00010000         ; bias pattern (only edx is consumed)
        movd    xmm7,edx
        pcmpeqw xmm6,xmm6               ; xmm6 = all ones
        pshufd  xmm7,xmm7,0x00          ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
        psrlw   xmm6,BYTE_BIT           ; xmm6={0xFF 0x00 0xFF 0x00 ..}

        mov     rsi, r14        ; input_data
        mov     rdi, r15        ; output_data
.rowloop:
        push    rcx                             ; output_cols, reloaded per row
        push    rdi                             ; output_data cursor
        push    rsi                             ; input_data cursor

        mov     rsi, JSAMPROW [rsi]             ; inptr
        mov     rdi, JSAMPROW [rdi]             ; outptr

        cmp     rcx, byte SIZEOF_XMMWORD
        jae     short .columnloop

        ; Fewer than SIZEOF_XMMWORD output columns remain: load a single
        ; input XMMWORD, use zeros for the second half, and force rcx so that
        ; the subtraction below leaves exactly zero.  The store is still a
        ; full XMMWORD wide, so the output row must have room for it
        ; (assumed to be guaranteed by the caller -- confirm).
.columnloop_r8:
        movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        pxor    xmm1,xmm1
        mov     rcx, SIZEOF_XMMWORD
        jmp     short .downsample

.columnloop:
        movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqa  xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]

.downsample:
        movdqa  xmm2,xmm0
        movdqa  xmm3,xmm1

        pand    xmm0,xmm6               ; even-numbered samples as words
        psrlw   xmm2,BYTE_BIT           ; odd-numbered samples as words
        pand    xmm1,xmm6
        psrlw   xmm3,BYTE_BIT

        paddw   xmm0,xmm2               ; pairwise sums
        paddw   xmm1,xmm3
        paddw   xmm0,xmm7               ; + alternating 0/1 bias
        paddw   xmm1,xmm7
        psrlw   xmm0,1                  ; / 2
        psrlw   xmm1,1

        packuswb xmm0,xmm1              ; 16 words -> 16 output samples

        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0

        sub     rcx, byte SIZEOF_XMMWORD        ; outcol
        add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr
        add     rdi, byte 1*SIZEOF_XMMWORD      ; outptr
        cmp     rcx, byte SIZEOF_XMMWORD
        jae     short .columnloop
        test    rcx,rcx
        jnz     short .columnloop_r8            ; partial tail left

        pop     rsi
        pop     rdi
        pop     rcx

        add     rsi, byte SIZEOF_JSAMPROW       ; input_data
        add     rdi, byte SIZEOF_JSAMPROW       ; output_data
        dec     rax                             ; rowctr
        jg      near .rowloop

.return:
        uncollect_args
        pop     rbp
        ret
166
167; --------------------------------------------------------------------------
;
; Downsample pixel values of a single component.
; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
;
; The routine works in two phases:
;   1. expand_right_edge: for each of the first max_v_samp_factor input rows,
;      the last real sample (column image_width-1) is replicated to the right
;      until the row holds output_cols*2 samples, where
;      output_cols = width_blocks * 8.
;   2. h2v2_downsample: for each of v_samp_factor output rows, two consecutive
;      input rows are read; every 2x2 group of samples is summed, an
;      alternating bias of 1/2 is added, and the result is divided by 4.
;
; Registers written by the body: rax, rcx, rdx, rsi, rdi, xmm0-xmm7 and the
; flags.  collect_args / uncollect_args come from jsimdext.inc (not visible in
; this file); presumably they load r10-r15 from the incoming arguments and
; save/restore whatever the target ABI requires -- confirm there.
;
; All row accesses in the downsample phase use movdqa, which requires
; 16-byte-aligned addresses; that guarantee has to come from the caller.
;

; r10 = JDIMENSION image_width
; r11 = int max_v_samp_factor
; r12 = JDIMENSION v_samp_factor
; r13 = JDIMENSION width_blocks
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data

        align   16
        global  EXTN(jsimd_h2v2_downsample_sse2)

EXTN(jsimd_h2v2_downsample_sse2):
        push    rbp
        mov     rax,rsp                 ; entry rsp copy, set up before collect_args
                                        ; (presumably consumed by the macro -- confirm)
        mov     rbp,rsp
        collect_args

        mov     ecx, r13d               ; width_blocks (32-bit write zero-extends)
        shl     rcx,3                   ; imul rcx,DCTSIZE (rcx = output_cols)
        jz      near .return            ; nothing to do for zero output columns

        mov     edx, r10d               ; rdx = image_width (zero-extended)

        ; -- expand_right_edge

        push    rcx                             ; keep output_cols for phase 2
        shl     rcx,1                           ; output_cols * 2
        sub     rcx,rdx                         ; rcx = number of pad samples per row
        jle     short .expand_end               ; row already wide enough

        ; max_v_samp_factor is a 32-bit int.  Only the low half of r11 is
        ; read, as is done for every other 32-bit argument of this routine:
        ; the upper half of a register that carries a 32-bit argument is not
        ; guaranteed to be zero or sign-extended by the caller.
        mov     eax, r11d                       ; rax = numrows (zero-extended)
        test    eax,eax
        jle     short .expand_end               ; zero or negative row count

        cld                                     ; rep stosb must fill upwards
        mov     rsi, r14        ; input_data
.expandloop:
        push    rax                             ; al is reused as the fill value
        push    rcx                             ; rep stosb consumes rcx

        mov     rdi, JSAMPROW [rsi]             ; rdi = start of this row
        add     rdi,rdx                         ; rdi = first column past the image
        mov     al, JSAMPLE [rdi-1]             ; al = last real sample of the row

        rep stosb                               ; replicate it rcx times

        pop     rcx
        pop     rax

        add     rsi, byte SIZEOF_JSAMPROW       ; next row pointer
        dec     rax
        jg      short .expandloop

.expand_end:
        pop     rcx                             ; output_cols

        ; -- h2v2_downsample

        ; The 32-bit test matches the h2v1 routine; after the zero-extending
        ; load a 64-bit test could never observe a negative value.
        mov     eax, r12d        ; rowctr
        test    eax,eax
        jle     near .return

        mov     edx, 0x00020001         ; bias pattern (only edx is consumed)
        movd    xmm7,edx
        pcmpeqw xmm6,xmm6               ; xmm6 = all ones
        pshufd  xmm7,xmm7,0x00          ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
        psrlw   xmm6,BYTE_BIT           ; xmm6={0xFF 0x00 0xFF 0x00 ..}

        mov     rsi, r14        ; input_data
        mov     rdi, r15        ; output_data
.rowloop:
        push    rcx                             ; output_cols, reloaded per row
        push    rdi                             ; output_data cursor
        push    rsi                             ; input_data cursor

        mov     rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]   ; inptr0
        mov     rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]   ; inptr1
        mov     rdi, JSAMPROW [rdi]                     ; outptr

        cmp     rcx, byte SIZEOF_XMMWORD
        jae     short .columnloop

        ; Fewer than SIZEOF_XMMWORD output columns remain: load one XMMWORD
        ; from each input row, use zeros for the second halves, and force rcx
        ; so that the subtraction below leaves exactly zero.  The store is
        ; still a full XMMWORD wide, so the output row must have room for it
        ; (assumed to be guaranteed by the caller -- confirm).
.columnloop_r8:
        movdqa  xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
        movdqa  xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        pxor    xmm2,xmm2
        pxor    xmm3,xmm3
        mov     rcx, SIZEOF_XMMWORD
        jmp     short .downsample

.columnloop:
        movdqa  xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
        movdqa  xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqa  xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]
        movdqa  xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]

.downsample:
        ; First 16 input columns of both rows: horizontal pair sums.
        movdqa  xmm4,xmm0
        movdqa  xmm5,xmm1
        pand    xmm0,xmm6               ; even-numbered samples as words
        psrlw   xmm4,BYTE_BIT           ; odd-numbered samples as words
        pand    xmm1,xmm6
        psrlw   xmm5,BYTE_BIT
        paddw   xmm0,xmm4
        paddw   xmm1,xmm5

        ; Next 16 input columns of both rows: horizontal pair sums.
        movdqa  xmm4,xmm2
        movdqa  xmm5,xmm3
        pand    xmm2,xmm6
        psrlw   xmm4,BYTE_BIT
        pand    xmm3,xmm6
        psrlw   xmm5,BYTE_BIT
        paddw   xmm2,xmm4
        paddw   xmm3,xmm5

        paddw   xmm0,xmm1               ; row0 + row1 -> 2x2 sums
        paddw   xmm2,xmm3
        paddw   xmm0,xmm7               ; + alternating 1/2 bias
        paddw   xmm2,xmm7
        psrlw   xmm0,2                  ; / 4
        psrlw   xmm2,2

        packuswb xmm0,xmm2              ; 16 words -> 16 output samples

        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0

        sub     rcx, byte SIZEOF_XMMWORD        ; outcol
        add     rdx, byte 2*SIZEOF_XMMWORD      ; inptr0
        add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr1
        add     rdi, byte 1*SIZEOF_XMMWORD      ; outptr
        cmp     rcx, byte SIZEOF_XMMWORD
        jae     near .columnloop
        test    rcx,rcx
        jnz     near .columnloop_r8             ; partial tail left

        pop     rsi
        pop     rdi
        pop     rcx

        add     rsi, byte 2*SIZEOF_JSAMPROW     ; input_data
        add     rdi, byte 1*SIZEOF_JSAMPROW     ; output_data
        dec     rax                             ; rowctr
        jg      near .rowloop

.return:
        uncollect_args
        pop     rbp
        ret
326
327; For some reason, the OS X linker does not honor the request to align the
328; segment unless we do this.
329        align   16
330