;
; jcsample.asm - downsampling (SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler); it
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jsimdext.inc"

; --------------------------------------------------------------------------
; Everything below is assembled as 32-bit code into the section named by
; SEG_TEXT (not defined in this file; presumably supplied by jsimdext.inc).
    SECTION     SEG_TEXT
    BITS        32
;
; Downsample pixel values of a single component.
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor,
;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                            JSAMPARRAY output_data);
;
; All six arguments are read from the stack, addressed relative to ebp
; (offsets defined just below).
;
; The routine works in two phases:
;   1. expand_right_edge: for each of the max_v_samp_factor input rows, the
;      bytes from image_width up to output_cols * 2 are filled with a copy
;      of the row's last sample (output_cols = width_in_blocks * 8).
;   2. h2v1_downsample: for each of the v_samp_factor rows, every pair of
;      horizontally adjacent input bytes (a, b) becomes one output byte
;      (a + b + bias) >> 1, where bias alternates 0, 1, 0, 1, ... across
;      successive output samples.
;
; Register usage:
;   preserved   : ebp, esi, edi (pushed on entry, popped at .return)
;   overwritten : eax, ecx, edx, xmm0-xmm3, xmm6, xmm7, flags
;   The direction flag is cleared (cld) when the edge expansion runs.
;
; Memory requirements visible in the code:
;   - Rows are accessed with movdqa, so every input and output row pointer
;     must be 16-byte aligned.
;   - When fewer than SIZEOF_XMMWORD output columns remain, a full XMMWORD
;     is still stored (the surplus bytes are zero), so output rows must have
;     room for that store.
;

%define img_width(b)    (b) + 8         ; JDIMENSION image_width
%define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
%define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
%define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
%define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
%define output_data(b)  (b) + 28        ; JSAMPARRAY output_data

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_downsample_sse2)

EXTN(jsimd_h2v1_downsample_sse2):
    push        ebp
    mov         ebp, esp
;   push        ebx                     ; unused
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    mov         ecx, JDIMENSION [width_blks(ebp)]
    shl         ecx, 3                  ; imul ecx,DCTSIZE (ecx = output_cols)
    jz          near .return            ; no output columns: nothing to do

    mov         edx, JDIMENSION [img_width(ebp)]

    ; -- expand_right_edge
    ; Pad every input row out to output_cols * 2 bytes by repeating its
    ; last sample, so the column loop below can read whole vectors.

    push        ecx                     ; keep output_cols for the second phase
    shl         ecx, 1                  ; output_cols * 2
    sub         ecx, edx                ; ecx = padding bytes needed per row
    jle         short .expand_end       ; row is already wide enough

    mov         eax, INT [max_v_samp(ebp)]  ; eax = rows left to pad
    test        eax, eax
    jle         short .expand_end

    cld                                 ; rep stosb must walk upwards
    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
    alignx      16, 7
.expandloop:
    push        eax                     ; al is about to hold the fill byte
    push        ecx                     ; rep stosb counts ecx down to zero

    mov         edi, JSAMPROW [esi]
    add         edi, edx                ; edi = first byte past the image data
    mov         al, JSAMPLE [edi-1]     ; al = last real sample of this row

    rep stosb                           ; replicate it across the padding

    pop         ecx
    pop         eax

    add         esi, byte SIZEOF_JSAMPROW   ; next row pointer
    dec         eax
    jg          short .expandloop

.expand_end:
    pop         ecx                     ; output_cols

    ; -- h2v1_downsample

    mov         eax, JDIMENSION [v_samp(ebp)]  ; rowctr
    test        eax, eax
    jle         near .return

    mov         edx, 0x00010000         ; bias pattern
    movd        xmm7, edx
    pcmpeqw     xmm6, xmm6              ; all ones
    pshufd      xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}

    mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
    mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
    alignx      16, 7
.rowloop:
    ; ecx, edi and esi are reused inside the column loop, so the per-row
    ; values (output_cols and the two row-pointer cursors) are saved here.
    push        ecx
    push        edi
    push        esi

    mov         esi, JSAMPROW [esi]     ; inptr
    mov         edi, JSAMPROW [edi]     ; outptr

    cmp         ecx, byte SIZEOF_XMMWORD
    jae         short .columnloop
    alignx      16, 7

.columnloop_r8:
    ; Tail: fewer than SIZEOF_XMMWORD output columns remain.  Load one input
    ; vector, use zeros for the second, and set ecx so that the subtraction
    ; after the store leaves exactly zero.
    movdqa      xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
    pxor        xmm1, xmm1
    mov         ecx, SIZEOF_XMMWORD
    jmp         short .downsample
    alignx      16, 7

.columnloop:
    ; Full step: two input vectors produce one output vector.
    movdqa      xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
    movdqa      xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD]

.downsample:
    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm1

    pand        xmm0, xmm6              ; even-indexed bytes as words
    psrlw       xmm2, BYTE_BIT          ; odd-indexed bytes as words
    pand        xmm1, xmm6
    psrlw       xmm3, BYTE_BIT

    paddw       xmm0, xmm2              ; sum of each horizontal pair
    paddw       xmm1, xmm3
    paddw       xmm0, xmm7              ; add alternating 0/1 bias
    paddw       xmm1, xmm7
    psrlw       xmm0, 1                 ; divide by 2
    psrlw       xmm1, 1

    packuswb    xmm0, xmm1              ; narrow both word vectors to bytes

    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0

    sub         ecx, byte SIZEOF_XMMWORD    ; outcol
    add         esi, byte 2*SIZEOF_XMMWORD  ; inptr
    add         edi, byte 1*SIZEOF_XMMWORD  ; outptr
    cmp         ecx, byte SIZEOF_XMMWORD
    jae         short .columnloop
    test        ecx, ecx
    jnz         short .columnloop_r8    ; partial vector still to do

    pop         esi
    pop         edi
    pop         ecx

    add         esi, byte SIZEOF_JSAMPROW  ; input_data
    add         edi, byte SIZEOF_JSAMPROW  ; output_data
    dec         eax                        ; rowctr
    jg          near .rowloop

.return:
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
;   pop         ebx                     ; unused
    pop         ebp
    ret

; --------------------------------------------------------------------------
;
; Downsample pixel values of a single component.
; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor,
;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                            JSAMPARRAY output_data);
;
; All six arguments are read from the stack, addressed relative to ebp
; (offsets defined just below).
;
; The routine works in two phases:
;   1. expand_right_edge: for each of the max_v_samp_factor input rows, the
;      bytes from image_width up to output_cols * 2 are filled with a copy
;      of the row's last sample (output_cols = width_in_blocks * 8).
;   2. h2v2_downsample: each output row is built from two consecutive input
;      rows.  Every 2x2 group of input bytes becomes one output byte
;      (sum of the four + bias) >> 2, where bias alternates 1, 2, 1, 2, ...
;      across successive output samples.  The loop runs v_samp_factor times
;      and advances the input row array by two entries per iteration.
;
; Register usage:
;   preserved   : ebp, esi, edi (pushed on entry, popped at .return)
;   overwritten : eax, ecx, edx, xmm0-xmm7, flags
;   The direction flag is cleared (cld) when the edge expansion runs.
;
; Memory requirements visible in the code:
;   - Rows are accessed with movdqa, so every input and output row pointer
;     must be 16-byte aligned.
;   - When fewer than SIZEOF_XMMWORD output columns remain, a full XMMWORD
;     is still stored (the surplus bytes are zero), so output rows must have
;     room for that store.
;

%define img_width(b)    (b) + 8         ; JDIMENSION image_width
%define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
%define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
%define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
%define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
%define output_data(b)  (b) + 28        ; JSAMPARRAY output_data

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_downsample_sse2)

EXTN(jsimd_h2v2_downsample_sse2):
    push        ebp
    mov         ebp, esp
;   push        ebx                     ; unused
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    mov         ecx, JDIMENSION [width_blks(ebp)]
    shl         ecx, 3                  ; imul ecx,DCTSIZE (ecx = output_cols)
    jz          near .return            ; no output columns: nothing to do

    mov         edx, JDIMENSION [img_width(ebp)]

    ; -- expand_right_edge
    ; Pad every input row out to output_cols * 2 bytes by repeating its
    ; last sample, so the column loop below can read whole vectors.

    push        ecx                     ; keep output_cols for the second phase
    shl         ecx, 1                  ; output_cols * 2
    sub         ecx, edx                ; ecx = padding bytes needed per row
    jle         short .expand_end       ; row is already wide enough

    mov         eax, INT [max_v_samp(ebp)]  ; eax = rows left to pad
    test        eax, eax
    jle         short .expand_end

    cld                                 ; rep stosb must walk upwards
    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
    alignx      16, 7
.expandloop:
    push        eax                     ; al is about to hold the fill byte
    push        ecx                     ; rep stosb counts ecx down to zero

    mov         edi, JSAMPROW [esi]
    add         edi, edx                ; edi = first byte past the image data
    mov         al, JSAMPLE [edi-1]     ; al = last real sample of this row

    rep stosb                           ; replicate it across the padding

    pop         ecx
    pop         eax

    add         esi, byte SIZEOF_JSAMPROW   ; next row pointer
    dec         eax
    jg          short .expandloop

.expand_end:
    pop         ecx                     ; output_cols

    ; -- h2v2_downsample

    mov         eax, JDIMENSION [v_samp(ebp)]  ; rowctr
    test        eax, eax
    jle         near .return

    mov         edx, 0x00020001         ; bias pattern
    movd        xmm7, edx
    pcmpeqw     xmm6, xmm6              ; all ones
    pshufd      xmm7, xmm7, 0x00        ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}

    mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
    mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
    alignx      16, 7
.rowloop:
    ; ecx, edi and esi are reused inside the column loop, so the per-row
    ; values (output_cols and the two row-pointer cursors) are saved here.
    push        ecx
    push        edi
    push        esi

    mov         edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; inptr0
    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; inptr1
    mov         edi, JSAMPROW [edi]                    ; outptr

    cmp         ecx, byte SIZEOF_XMMWORD
    jae         short .columnloop
    alignx      16, 7

.columnloop_r8:
    ; Tail: fewer than SIZEOF_XMMWORD output columns remain.  Load one vector
    ; from each input row, use zeros for the second pair, and set ecx so that
    ; the subtraction after the store leaves exactly zero.
    movdqa      xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
    movdqa      xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
    pxor        xmm2, xmm2
    pxor        xmm3, xmm3
    mov         ecx, SIZEOF_XMMWORD
    jmp         short .downsample
    alignx      16, 7

.columnloop:
    ; Full step: two vectors from each input row produce one output vector.
    movdqa      xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
    movdqa      xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
    movdqa      xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD]
    movdqa      xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD]

.downsample:
    ; First vector of each row: horizontal pair sums as 16-bit words
    ; (even-indexed bytes via the mask, odd-indexed bytes via the shift).
    movdqa      xmm4, xmm0
    movdqa      xmm5, xmm1
    pand        xmm0, xmm6
    psrlw       xmm4, BYTE_BIT
    pand        xmm1, xmm6
    psrlw       xmm5, BYTE_BIT
    paddw       xmm0, xmm4
    paddw       xmm1, xmm5

    ; Second vector of each row: same computation (xmm4/xmm5 are reused).
    movdqa      xmm4, xmm2
    movdqa      xmm5, xmm3
    pand        xmm2, xmm6
    psrlw       xmm4, BYTE_BIT
    pand        xmm3, xmm6
    psrlw       xmm5, BYTE_BIT
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5

    paddw       xmm0, xmm1              ; add the two rows: 2x2 sums
    paddw       xmm2, xmm3
    paddw       xmm0, xmm7              ; add alternating 1/2 bias
    paddw       xmm2, xmm7
    psrlw       xmm0, 2                 ; divide by 4
    psrlw       xmm2, 2

    packuswb    xmm0, xmm2              ; narrow both word vectors to bytes

    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0

    sub         ecx, byte SIZEOF_XMMWORD    ; outcol
    add         edx, byte 2*SIZEOF_XMMWORD  ; inptr0
    add         esi, byte 2*SIZEOF_XMMWORD  ; inptr1
    add         edi, byte 1*SIZEOF_XMMWORD  ; outptr
    cmp         ecx, byte SIZEOF_XMMWORD
    jae         near .columnloop
    test        ecx, ecx
    jnz         near .columnloop_r8     ; partial vector still to do

    pop         esi
    pop         edi
    pop         ecx

    add         esi, byte 2*SIZEOF_JSAMPROW  ; input_data
    add         edi, byte 1*SIZEOF_JSAMPROW  ; output_data
    dec         eax                          ; rowctr
    jg          near .rowloop

.return:
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
;   pop         ebx                     ; unused
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32
