;
; jcsample.asm - downsampling (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
16
17%include "jsimdext.inc"
18
; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32
;
; jsimd_h2v1_downsample_mmx
;
; Downsample the pixel values of a single component: 2:1 horizontally,
; 1:1 vertically, without smoothing.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_mmx(JDIMENSION image_width, int max_v_samp_factor,
;                           JDIMENSION v_samp_factor,
;                           JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                           JSAMPARRAY output_data);
;
; All arguments are read from the stack through an ebp frame (offsets are
; defined just below).
;
; Operation:
;   1. output_cols = width_in_blocks * 8 (i.e. * DCTSIZE).  Return at once
;      if that is zero.
;   2. If output_cols * 2 is greater than image_width, the first
;      max_v_samp_factor input rows are padded IN PLACE: the sample at
;      column image_width - 1 is replicated into columns
;      image_width .. output_cols * 2 - 1.
;   3. For each of v_samp_factor rows:
;          out[j] = (in[2*j] + in[2*j + 1] + bias) >> 1
;      with bias = 0 for even j and 1 for odd j.
;
; The column loop consumes 2 * SIZEOF_MMWORD input bytes and produces
; SIZEOF_MMWORD output bytes per pass and stops only when the column count
; reaches exactly zero, so it relies on output_cols being a whole multiple
; of SIZEOF_MMWORD (output_cols is a multiple of 8 by construction;
; SIZEOF_MMWORD comes from jsimdext.inc and is presumably 8).
;
; Registers: ebp, esi and edi are saved and restored.  eax, ecx, edx, the
; flags and mm0-mm3, mm6, mm7 are overwritten; ebx is never touched.
; emms is executed on every path that has used the MMX registers.
;

%define img_width(b)    (b) + 8         ; arg 1: JDIMENSION image_width
%define max_v_samp(b)   (b) + 12        ; arg 2: int max_v_samp_factor
%define v_samp(b)       (b) + 16        ; arg 3: JDIMENSION v_samp_factor
%define width_blks(b)   (b) + 20        ; arg 4: JDIMENSION width_in_blocks
%define input_data(b)   (b) + 24        ; arg 5: JSAMPARRAY input_data
%define output_data(b)  (b) + 28        ; arg 6: JSAMPARRAY output_data

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_downsample_mmx)

EXTN(jsimd_h2v1_downsample_mmx):
    push        ebp
    mov         ebp, esp                ; frame pointer for argument access
    push        esi                     ; the only two registers this
    push        edi                     ; routine saves besides ebp

    mov         ecx, JDIMENSION [width_blks(ebp)]
    shl         ecx, 3                  ; ecx = output_cols (width_in_blocks*8)
    jz          near .done              ; zero-width request: nothing to do

    ; ---- Pass 1: replicate the last real sample out to the padded width

    mov         edx, JDIMENSION [img_width(ebp)]    ; edx = image_width
    push        ecx                     ; park output_cols for pass 2
    add         ecx, ecx                ; ecx = output_cols * 2
    sub         ecx, edx                ; ecx = pad bytes needed per row
    jle         short .pad_done         ; rows are already wide enough

    mov         eax, INT [max_v_samp(ebp)]          ; eax = rows left to pad
    test        eax, eax
    jle         short .pad_done

    mov         esi, JSAMPARRAY [input_data(ebp)]   ; esi -> row pointer table
    cld                                 ; rep stosb must walk upward
    alignx      16, 7
.pad_row:
    push        ecx                     ; rep stosb counts ecx down to zero
    push        eax                     ; al is borrowed for the fill value

    mov         edi, JSAMPROW [esi]
    add         edi, edx                ; edi -> first column past the image
    mov         al, JSAMPLE [edi-1]     ; al = rightmost real sample

    rep stosb                           ; write ecx copies of al at [edi]

    pop         eax                     ; row counter back
    pop         ecx                     ; pad length back

    add         esi, byte SIZEOF_JSAMPROW           ; next entry in the table
    dec         eax
    jg          short .pad_row

.pad_done:
    pop         ecx                     ; ecx = output_cols again

    ; ---- Pass 2: average each horizontal pair of samples

    mov         eax, JDIMENSION [v_samp(ebp)]       ; eax = rows left (rowctr)
    test        eax, eax
    jle         near .done

    pcmpeqw     mm6, mm6                ; mm6 = all ones
    psrlw       mm6, BYTE_BIT           ; mm6 = {0xFF 0x00 0xFF 0x00 ..}
    mov         edx, 0x00010000         ; bias words: low = 0, high = 1
    movd        mm7, edx
    punpckldq   mm7, mm7                ; mm7 = {0, 1, 0, 1}

    mov         edi, JSAMPARRAY [output_data(ebp)]  ; edi -> output row table
    mov         esi, JSAMPARRAY [input_data(ebp)]   ; esi -> input row table
    alignx      16, 7
.next_row:
    push        ecx                     ; column count is consumed per row
    push        esi                     ; table cursors are reused below as
    push        edi                     ; the per-row sample pointers

    mov         edi, JSAMPROW [edi]     ; outptr
    mov         esi, JSAMPROW [esi]     ; inptr
    alignx      16, 7
.next_group:

    ; Two independent MMWORDs are processed in an interleaved schedule:
    ; mm0/mm2 carry the first, mm1/mm3 the second.
    movq        mm0, MMWORD [esi+0*SIZEOF_MMWORD]
    movq        mm1, MMWORD [esi+1*SIZEOF_MMWORD]
    movq        mm2, mm0
    movq        mm3, mm1

    pand        mm0, mm6                ; even-column samples, one per word
    psrlw       mm2, BYTE_BIT           ; odd-column samples, one per word
    pand        mm1, mm6
    psrlw       mm3, BYTE_BIT

    paddw       mm0, mm2                ; pair sums
    paddw       mm1, mm3
    paddw       mm0, mm7                ; + alternating 0/1 bias
    paddw       mm1, mm7
    psrlw       mm0, 1                  ; / 2
    psrlw       mm1, 1

    packuswb    mm0, mm1                ; words -> bytes, mm0 results first

    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm0

    add         edi, byte 1*SIZEOF_MMWORD  ; outptr
    add         esi, byte 2*SIZEOF_MMWORD  ; inptr
    sub         ecx, byte SIZEOF_MMWORD    ; outcol; must be last, sets ZF
    jnz         short .next_group

    pop         edi                     ; output row table cursor
    pop         esi                     ; input row table cursor
    pop         ecx                     ; output_cols

    add         esi, byte SIZEOF_JSAMPROW  ; input_data: one row per output row
    add         edi, byte SIZEOF_JSAMPROW  ; output_data
    dec         eax                        ; rowctr
    jg          short .next_row

    emms                                ; empty MMX state

.done:
    pop         edi
    pop         esi
    pop         ebp
    ret
163
; --------------------------------------------------------------------------
;
; jsimd_h2v2_downsample_mmx
;
; Downsample the pixel values of a single component: 2:1 horizontally and
; 2:1 vertically, without smoothing.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_mmx(JDIMENSION image_width, int max_v_samp_factor,
;                           JDIMENSION v_samp_factor,
;                           JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                           JSAMPARRAY output_data);
;
; All arguments are read from the stack through an ebp frame (offsets are
; defined just below).
;
; Operation:
;   1. output_cols = width_in_blocks * 8 (i.e. * DCTSIZE).  Return at once
;      if that is zero.
;   2. If output_cols * 2 is greater than image_width, the first
;      max_v_samp_factor input rows are padded IN PLACE: the sample at
;      column image_width - 1 is replicated into columns
;      image_width .. output_cols * 2 - 1.
;   3. For each of v_samp_factor output rows, two consecutive input rows
;      r0 and r1 are combined:
;          out[j] = (r0[2*j] + r0[2*j + 1] + r1[2*j] + r1[2*j + 1] + bias) >> 2
;      with bias = 1 for even j and 2 for odd j.
;
; The column loop consumes 2 * SIZEOF_MMWORD bytes from each input row and
; produces SIZEOF_MMWORD output bytes per pass and stops only when the
; column count reaches exactly zero, so it relies on output_cols being a
; whole multiple of SIZEOF_MMWORD (output_cols is a multiple of 8 by
; construction; SIZEOF_MMWORD comes from jsimdext.inc and is presumably 8).
;
; Registers: ebp, esi and edi are saved and restored.  eax, ecx, edx, the
; flags and mm0-mm7 are overwritten; ebx is never touched.
; emms is executed on every path that has used the MMX registers.
;

%define img_width(b)    (b) + 8         ; arg 1: JDIMENSION image_width
%define max_v_samp(b)   (b) + 12        ; arg 2: int max_v_samp_factor
%define v_samp(b)       (b) + 16        ; arg 3: JDIMENSION v_samp_factor
%define width_blks(b)   (b) + 20        ; arg 4: JDIMENSION width_in_blocks
%define input_data(b)   (b) + 24        ; arg 5: JSAMPARRAY input_data
%define output_data(b)  (b) + 28        ; arg 6: JSAMPARRAY output_data

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_downsample_mmx)

EXTN(jsimd_h2v2_downsample_mmx):
    push        ebp
    mov         ebp, esp                ; frame pointer for argument access
    push        esi                     ; the only two registers this
    push        edi                     ; routine saves besides ebp

    mov         ecx, JDIMENSION [width_blks(ebp)]
    shl         ecx, 3                  ; ecx = output_cols (width_in_blocks*8)
    jz          near .done              ; zero-width request: nothing to do

    ; ---- Pass 1: replicate the last real sample out to the padded width

    mov         edx, JDIMENSION [img_width(ebp)]    ; edx = image_width
    push        ecx                     ; park output_cols for pass 2
    add         ecx, ecx                ; ecx = output_cols * 2
    sub         ecx, edx                ; ecx = pad bytes needed per row
    jle         short .pad_done         ; rows are already wide enough

    mov         eax, INT [max_v_samp(ebp)]          ; eax = rows left to pad
    test        eax, eax
    jle         short .pad_done

    mov         esi, JSAMPARRAY [input_data(ebp)]   ; esi -> row pointer table
    cld                                 ; rep stosb must walk upward
    alignx      16, 7
.pad_row:
    push        ecx                     ; rep stosb counts ecx down to zero
    push        eax                     ; al is borrowed for the fill value

    mov         edi, JSAMPROW [esi]
    add         edi, edx                ; edi -> first column past the image
    mov         al, JSAMPLE [edi-1]     ; al = rightmost real sample

    rep stosb                           ; write ecx copies of al at [edi]

    pop         eax                     ; row counter back
    pop         ecx                     ; pad length back

    add         esi, byte SIZEOF_JSAMPROW           ; next entry in the table
    dec         eax
    jg          short .pad_row

.pad_done:
    pop         ecx                     ; ecx = output_cols again

    ; ---- Pass 2: average each 2x2 square of samples

    mov         eax, JDIMENSION [v_samp(ebp)]       ; eax = rows left (rowctr)
    test        eax, eax
    jle         near .done

    pcmpeqw     mm6, mm6                ; mm6 = all ones
    psrlw       mm6, BYTE_BIT           ; mm6 = {0xFF 0x00 0xFF 0x00 ..}
    mov         edx, 0x00020001         ; bias words: low = 1, high = 2
    movd        mm7, edx
    punpckldq   mm7, mm7                ; mm7 = {1, 2, 1, 2}

    mov         edi, JSAMPARRAY [output_data(ebp)]  ; edi -> output row table
    mov         esi, JSAMPARRAY [input_data(ebp)]   ; esi -> input row table
    alignx      16, 7
.next_row:
    push        ecx                     ; column count is consumed per row
    push        esi                     ; table cursors are reused below as
    push        edi                     ; the per-row sample pointers

    mov         edi, JSAMPROW [edi]                    ; outptr
    mov         edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; inptr0 (upper row)
    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; inptr1 (lower row);
                                                       ; loaded last because
                                                       ; it overwrites esi
    alignx      16, 7
.next_group:

    ; mm0/mm1: first MMWORD of the upper/lower row
    ; mm2/mm3: second MMWORD of the upper/lower row
    movq        mm0, MMWORD [edx+0*SIZEOF_MMWORD]
    movq        mm1, MMWORD [esi+0*SIZEOF_MMWORD]
    movq        mm2, MMWORD [edx+1*SIZEOF_MMWORD]
    movq        mm3, MMWORD [esi+1*SIZEOF_MMWORD]

    ; Horizontal pair sums for the first MMWORD of both rows
    movq        mm4, mm0
    movq        mm5, mm1
    pand        mm0, mm6                ; even-column samples, one per word
    psrlw       mm4, BYTE_BIT           ; odd-column samples, one per word
    pand        mm1, mm6
    psrlw       mm5, BYTE_BIT
    paddw       mm0, mm4
    paddw       mm1, mm5

    ; Horizontal pair sums for the second MMWORD of both rows
    movq        mm4, mm2
    movq        mm5, mm3
    pand        mm2, mm6
    psrlw       mm4, BYTE_BIT
    pand        mm3, mm6
    psrlw       mm5, BYTE_BIT
    paddw       mm2, mm4
    paddw       mm3, mm5

    ; Fold the two rows together, add the bias and divide by four
    paddw       mm0, mm1
    paddw       mm2, mm3
    paddw       mm0, mm7                ; + alternating 1/2 bias
    paddw       mm2, mm7
    psrlw       mm0, 2
    psrlw       mm2, 2

    packuswb    mm0, mm2                ; words -> bytes, mm0 results first

    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm0

    add         edi, byte 1*SIZEOF_MMWORD  ; outptr
    add         edx, byte 2*SIZEOF_MMWORD  ; inptr0
    add         esi, byte 2*SIZEOF_MMWORD  ; inptr1
    sub         ecx, byte SIZEOF_MMWORD    ; outcol; must be last, sets ZF
    jnz         near .next_group

    pop         edi                     ; output row table cursor
    pop         esi                     ; input row table cursor
    pop         ecx                     ; output_cols

    add         esi, byte 2*SIZEOF_JSAMPROW  ; input_data: two rows consumed
    add         edi, byte 1*SIZEOF_JSAMPROW  ; output_data: one row produced
    dec         eax                          ; rowctr
    jg          near .next_row

    emms                                ; empty MMX state

.done:
    pop         edi
    pop         esi
    pop         ebp
    ret
321
; Pad the end of this file's code out to a 32-byte boundary.  Without this
; explicit padding the OS X linker does not honor the request to align the
; segment.
    align       32
325