• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1;
2; jcsample.asm - downsampling (AVX2)
3;
4; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5; Copyright (C) 2015, Intel Corporation.
6; Copyright (C) 2016, D. R. Commander.
7;
8; Based on the x86 SIMD extension for IJG JPEG library
9; Copyright (C) 1999-2006, MIYASAKA Masaru.
10; For conditions of distribution and use, see copyright notice in jsimdext.inc
11;
12; This file should be assembled with NASM (Netwide Assembler),
13; can *not* be assembled with Microsoft's MASM or any compatible
14; assembler (including Borland's Turbo Assembler).
15; NASM is available from http://nasm.sourceforge.net/ or
16; http://sourceforge.net/project/showfiles.php?group_id=6208
17;
18; [TAB8]
19
20%include "jsimdext.inc"
21
22; --------------------------------------------------------------------------
23    SECTION     SEG_TEXT                ; code section; SEG_TEXT is not defined in this file (presumably jsimdext.inc)
24    BITS        32                      ; everything below is assembled as 32-bit code
25;
26; Downsample pixel values of a single component.
27; This version handles the common case of 2:1 horizontal and 1:1 vertical,
28; without smoothing.
29;
30; GLOBAL(void)
31; jsimd_h2v1_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
32;                            JDIMENSION v_samp_factor,
33;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
34;                            JSAMPARRAY output_data);
35;
36
37%define img_width(b)    (b) + 8         ; JDIMENSION image_width
38%define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
39%define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
40%define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
41%define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
42%define output_data(b)  (b) + 28        ; JSAMPARRAY output_data
43
; The accessors above take the frame pointer as (b).  After the prologue
; (push ebp / mov ebp, esp), [ebp] holds the saved ebp and [ebp+4] the return
; address, so the stack arguments start at ebp+8.
;
; The routine runs in two phases:
;   1. expand_right_edge: for max_v_samp_factor input rows, replicate the
;      last real sample (at column image_width-1) to the right until the row
;      is output_cols*2 bytes wide, where output_cols = width_in_blocks * 8.
;   2. h2v1_downsample: for v_samp_factor rows, average each horizontal pair
;      of input bytes into one output byte: (even + odd + bias) >> 1, with
;      the bias alternating 0, 1 across successive output samples.
;
; Registers: eax = row counter, ecx = output columns (remaining, inside the
; column loop), edx = image_width then scratch, esi/edi = input/output row
; array pointers (row pointers inside the column loop), ymm6 = low-byte mask,
; ymm7 = bias words, ymm0-ymm3 = working data.
; esi, edi and ebp are saved and restored; eax, ecx, edx, the flags and
; ymm0-ymm3/ymm6/ymm7 are modified.  DF is cleared if phase 1 runs.
;
44    align       32
45    GLOBAL_FUNCTION(jsimd_h2v1_downsample_avx2)
46
47EXTN(jsimd_h2v1_downsample_avx2):
48    push        ebp
49    mov         ebp, esp
50;   push        ebx                     ; unused
51;   push        ecx                     ; need not be preserved
52;   push        edx                     ; need not be preserved
53    push        esi
54    push        edi
55
56    mov         ecx, JDIMENSION [width_blks(ebp)]
57    shl         ecx, 3                  ; imul ecx,DCTSIZE (ecx = output_cols)
58    jz          near .return            ; ZF from the shift: nothing to do for 0 columns
59
60    mov         edx, JDIMENSION [img_width(ebp)]
61
62    ; -- expand_right_edge
63
64    push        ecx                     ; keep output_cols for phase 2
65    shl         ecx, 1                  ; output_cols * 2
66    sub         ecx, edx                ; ecx = padding bytes needed per input row
67    jle         short .expand_end       ; row already wide enough (signed test)
68
69    mov         eax, INT [max_v_samp(ebp)]  ; eax = number of rows to pad
70    test        eax, eax
71    jle         short .expand_end
72
73    cld                                 ; rep stosb below must store forward
74    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
75    alignx      16, 7                   ; alignx is defined outside this file; presumably aligns the loop entry
76.expandloop:
77    push        eax                     ; al (low byte of the row counter) is overwritten below
78    push        ecx                     ; rep stosb counts ecx down to zero
79
80    mov         edi, JSAMPROW [esi]     ; edi = start of this input row
81    add         edi, edx                ; edi = first byte past the real samples
82    mov         al, JSAMPLE [edi-1]     ; al = last real sample of the row
83
84    rep stosb                           ; store al to [edi], ecx times
85
86    pop         ecx
87    pop         eax
88
89    add         esi, byte SIZEOF_JSAMPROW   ; next row pointer
90    dec         eax
91    jg          short .expandloop
92
93.expand_end:
94    pop         ecx                     ; output_cols
95
96    ; -- h2v1_downsample
97
98    mov         eax, JDIMENSION [v_samp(ebp)]  ; rowctr
99    test        eax, eax
100    jle         near .return           ; signed test on the row count
101
102    mov         edx, 0x00010000         ; bias pattern
103    vmovd       xmm7, edx
104    vpshufd     xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
105    vperm2i128  ymm7, ymm7, ymm7, 0     ; ymm7={xmm7, xmm7}
106    vpcmpeqw    ymm6, ymm6, ymm6        ; all ones
107    vpsrlw      ymm6, ymm6, BYTE_BIT    ; ymm6={0xFF 0x00 0xFF 0x00 ..}
108
109    mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
110    mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
111    alignx      16, 7
112.rowloop:
113    push        ecx                     ; the column loop consumes ecx
114    push        edi                     ; edi/esi are reused as row pointers below
115    push        esi
116
117    mov         esi, JSAMPROW [esi]     ; inptr
118    mov         edi, JSAMPROW [edi]     ; outptr
119
120    cmp         ecx, byte SIZEOF_YMMWORD
121    jae         short .columnloop       ; at least one full vector of output columns
122    alignx      16, 7
123
; Tail handling: fewer than SIZEOF_YMMWORD output columns remain.  Only
; 2 * ecx input bytes are loaded; the rest of ymm0/ymm1 is zero (a VEX xmm
; load clears the upper lane).  ecx is then forced to SIZEOF_YMMWORD so the
; subtraction after the store leaves 0 and the column loop ends.
124.columnloop_r24:
125    ; ecx can possibly be 8, 16, 24
126    cmp         ecx, 24
127    jne         .columnloop_r16
128    vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
129    vmovdqu     xmm1, XMMWORD [esi+1*SIZEOF_YMMWORD]
130    mov         ecx, SIZEOF_YMMWORD
131    jmp         short .downsample
132
133.columnloop_r16:
134    cmp         ecx, 16
135    jne         .columnloop_r8
136    vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
137    vpxor       ymm1, ymm1, ymm1        ; no second input vector
138    mov         ecx, SIZEOF_YMMWORD
139    jmp         short .downsample
140
141.columnloop_r8:
142    vmovdqu     xmm0, XMMWORD[esi+0*SIZEOF_YMMWORD]
143    vpxor       ymm1, ymm1, ymm1        ; no second input vector
144    mov         ecx, SIZEOF_YMMWORD
145    jmp         short .downsample
146    alignx      16, 7
147
148.columnloop:
149    vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
150    vmovdqu     ymm1, YMMWORD [esi+1*SIZEOF_YMMWORD]
151
152.downsample:
153    vpsrlw      ymm2, ymm0, BYTE_BIT    ; high byte of each word (odd-indexed samples)
154    vpand       ymm0, ymm0, ymm6        ; low byte of each word (even-indexed samples)
155    vpsrlw      ymm3, ymm1, BYTE_BIT
156    vpand       ymm1, ymm1, ymm6
157
158    vpaddw      ymm0, ymm0, ymm2        ; even + odd
159    vpaddw      ymm1, ymm1, ymm3
160    vpaddw      ymm0, ymm0, ymm7        ; + alternating bias
161    vpaddw      ymm1, ymm1, ymm7
162    vpsrlw      ymm0, ymm0, 1           ; / 2
163    vpsrlw      ymm1, ymm1, 1
164
165    vpackuswb   ymm0, ymm0, ymm1        ; words -> bytes, packed per 128-bit lane
166    vpermq      ymm0, ymm0, 0xd8        ; qword order 0,2,1,3: undo the per-lane interleave
167
    ; NOTE(review): this is always a full YMMWORD store, so in the tail cases
    ; bytes past output_cols are written.  Presumably the output rows are
    ; allocated with enough padding -- confirm against the caller.
168    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0
169
170    sub         ecx, byte SIZEOF_YMMWORD    ; outcol
171    add         esi, byte 2*SIZEOF_YMMWORD  ; inptr
172    add         edi, byte 1*SIZEOF_YMMWORD  ; outptr
173    cmp         ecx, byte SIZEOF_YMMWORD
174    jae         short .columnloop
175    test        ecx, ecx
176    jnz         near .columnloop_r24    ; 8, 16 or 24 columns left
177
178    pop         esi
179    pop         edi
180    pop         ecx
181
182    add         esi, byte SIZEOF_JSAMPROW  ; input_data
183    add         edi, byte SIZEOF_JSAMPROW  ; output_data
184    dec         eax                        ; rowctr
185    jg          near .rowloop
186
187.return:
188    vzeroupper                          ; clear the upper halves of the ymm registers before returning
189    pop         edi
190    pop         esi
191;   pop         edx                     ; need not be preserved
192;   pop         ecx                     ; need not be preserved
193;   pop         ebx                     ; unused
194    pop         ebp
195    ret
196
197; --------------------------------------------------------------------------
198;
199; Downsample pixel values of a single component.
200; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
201; without smoothing.
202;
203; GLOBAL(void)
204; jsimd_h2v2_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
205;                            JDIMENSION v_samp_factor,
206;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
207;                            JSAMPARRAY output_data);
208;
209
210%define img_width(b)    (b) + 8         ; JDIMENSION image_width
211%define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
212%define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
213%define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
214%define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
215%define output_data(b)  (b) + 28        ; JSAMPARRAY output_data
216
; The accessors above take the frame pointer as (b).  After the prologue
; (push ebp / mov ebp, esp), [ebp] holds the saved ebp and [ebp+4] the return
; address, so the stack arguments start at ebp+8.
;
; The routine runs in two phases:
;   1. expand_right_edge: for max_v_samp_factor input rows, replicate the
;      last real sample (at column image_width-1) to the right until the row
;      is output_cols*2 bytes wide, where output_cols = width_in_blocks * 8.
;   2. h2v2_downsample: for v_samp_factor output rows, read two consecutive
;      input rows and average each 2x2 group of input bytes into one output
;      byte: (sum of four + bias) >> 2, with the bias alternating 1, 2
;      across successive output samples.
;
; Registers: eax = row counter, ecx = output columns (remaining, inside the
; column loop), edx = image_width, then bias scratch, then inptr0,
; esi = input row array pointer / inptr1, edi = output row array pointer /
; outptr, ymm6 = low-byte mask, ymm7 = bias words, ymm0-ymm5 = working data.
; esi, edi and ebp are saved and restored; eax, ecx, edx, the flags and
; ymm0-ymm7 are modified.  DF is cleared if phase 1 runs.
;
217    align       32
218    GLOBAL_FUNCTION(jsimd_h2v2_downsample_avx2)
219
220EXTN(jsimd_h2v2_downsample_avx2):
221    push        ebp
222    mov         ebp, esp
223;   push        ebx                     ; unused
224;   push        ecx                     ; need not be preserved
225;   push        edx                     ; need not be preserved
226    push        esi
227    push        edi
228
229    mov         ecx, JDIMENSION [width_blks(ebp)]
230    shl         ecx, 3                  ; imul ecx,DCTSIZE (ecx = output_cols)
231    jz          near .return            ; ZF from the shift: nothing to do for 0 columns
232
233    mov         edx, JDIMENSION [img_width(ebp)]
234
235    ; -- expand_right_edge
236
237    push        ecx                     ; keep output_cols for phase 2
238    shl         ecx, 1                  ; output_cols * 2
239    sub         ecx, edx                ; ecx = padding bytes needed per input row
240    jle         short .expand_end       ; row already wide enough (signed test)
241
242    mov         eax, INT [max_v_samp(ebp)]  ; eax = number of rows to pad
243    test        eax, eax
244    jle         short .expand_end
245
246    cld                                 ; rep stosb below must store forward
247    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
248    alignx      16, 7                   ; alignx is defined outside this file; presumably aligns the loop entry
249.expandloop:
250    push        eax                     ; al (low byte of the row counter) is overwritten below
251    push        ecx                     ; rep stosb counts ecx down to zero
252
253    mov         edi, JSAMPROW [esi]     ; edi = start of this input row
254    add         edi, edx                ; edi = first byte past the real samples
255    mov         al, JSAMPLE [edi-1]     ; al = last real sample of the row
256
257    rep stosb                           ; store al to [edi], ecx times
258
259    pop         ecx
260    pop         eax
261
262    add         esi, byte SIZEOF_JSAMPROW   ; next row pointer
263    dec         eax
264    jg          short .expandloop
265
266.expand_end:
267    pop         ecx                     ; output_cols
268
269    ; -- h2v2_downsample
270
271    mov         eax, JDIMENSION [v_samp(ebp)]  ; rowctr
272    test        eax, eax
273    jle         near .return            ; signed test on the row count
274
275    mov         edx, 0x00020001         ; bias pattern
276    vmovd       xmm7, edx
277    vpcmpeqw    ymm6, ymm6, ymm6        ; all ones
278    vpshufd     xmm7, xmm7, 0x00        ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
279    vperm2i128  ymm7, ymm7, ymm7, 0     ; ymm7={xmm7, xmm7}
280    vpsrlw      ymm6, ymm6, BYTE_BIT    ; ymm6={0xFF 0x00 0xFF 0x00 ..}
281
282    mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
283    mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
284    alignx      16, 7
285.rowloop:
286    push        ecx                     ; the column loop consumes ecx
287    push        edi                     ; edi/esi are reused as row pointers below
288    push        esi
289
290    mov         edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; inptr0
291    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; inptr1
292    mov         edi, JSAMPROW [edi]                    ; outptr
293
294    cmp         ecx, byte SIZEOF_YMMWORD
295    jae         short .columnloop       ; at least one full vector of output columns
296    alignx      16, 7
297
; Tail handling: fewer than SIZEOF_YMMWORD output columns remain (the checks
; below distinguish 24, 16 and otherwise).  Only 2 * ecx bytes are loaded
; from each input row; the rest of ymm0-ymm3 is zero (a VEX xmm load clears
; the upper lane).  ecx is then forced to SIZEOF_YMMWORD so the subtraction
; after the store leaves 0 and the column loop ends.
298.columnloop_r24:
299    cmp         ecx, 24
300    jne         .columnloop_r16
301    vmovdqu     ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD]
302    vmovdqu     ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]
303    vmovdqu     xmm2, XMMWORD [edx+1*SIZEOF_YMMWORD]
304    vmovdqu     xmm3, XMMWORD [esi+1*SIZEOF_YMMWORD]
305    mov         ecx, SIZEOF_YMMWORD
306    jmp         short .downsample
307
308.columnloop_r16:
309    cmp         ecx, 16
310    jne         .columnloop_r8
311    vmovdqu     ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD]
312    vmovdqu     ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]
313    vpxor       ymm2, ymm2, ymm2        ; no second input vector for either row
314    vpxor       ymm3, ymm3, ymm3
315    mov         ecx, SIZEOF_YMMWORD
316    jmp         short .downsample
317
318.columnloop_r8:
319    vmovdqu     xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
320    vmovdqu     xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
321    vpxor       ymm2, ymm2, ymm2        ; no second input vector for either row
322    vpxor       ymm3, ymm3, ymm3
323    mov         ecx, SIZEOF_YMMWORD
324    jmp         short .downsample
325    alignx      16, 7
326
327.columnloop:
328    vmovdqu     ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD]
329    vmovdqu     ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]
330    vmovdqu     ymm2, YMMWORD [edx+1*SIZEOF_YMMWORD]
331    vmovdqu     ymm3, YMMWORD [esi+1*SIZEOF_YMMWORD]
332
333.downsample:
334    vpand       ymm4, ymm0, ymm6        ; row 0: low byte of each word (even-indexed samples)
335    vpsrlw      ymm0, ymm0, BYTE_BIT    ; row 0: high byte of each word (odd-indexed samples)
336    vpand       ymm5, ymm1, ymm6        ; row 1: even-indexed samples
337    vpsrlw      ymm1, ymm1, BYTE_BIT    ; row 1: odd-indexed samples
338    vpaddw      ymm0, ymm0, ymm4        ; row 0 horizontal pair sums
339    vpaddw      ymm1, ymm1, ymm5        ; row 1 horizontal pair sums
340
341    vpand       ymm4, ymm2, ymm6        ; same for the second input vector of each row
342    vpsrlw      ymm2, ymm2, BYTE_BIT
343    vpand       ymm5, ymm3, ymm6
344    vpsrlw      ymm3, ymm3, BYTE_BIT
345    vpaddw      ymm2, ymm2, ymm4
346    vpaddw      ymm3, ymm3, ymm5
347
348    vpaddw      ymm0, ymm0, ymm1        ; row 0 + row 1: sum of each 2x2 group
349    vpaddw      ymm2, ymm2, ymm3
350    vpaddw      ymm0, ymm0, ymm7        ; + alternating bias
351    vpaddw      ymm2, ymm2, ymm7
352    vpsrlw      ymm0, ymm0, 2           ; / 4
353    vpsrlw      ymm2, ymm2, 2
354
355    vpackuswb   ymm0, ymm0, ymm2        ; words -> bytes, packed per 128-bit lane
356    vpermq      ymm0, ymm0, 0xd8        ; qword order 0,2,1,3: undo the per-lane interleave
357
    ; NOTE(review): this is always a full YMMWORD store, so in the tail cases
    ; bytes past output_cols are written.  Presumably the output rows are
    ; allocated with enough padding -- confirm against the caller.
358    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0
359
360    sub         ecx, byte SIZEOF_YMMWORD    ; outcol
361    add         edx, byte 2*SIZEOF_YMMWORD  ; inptr0
362    add         esi, byte 2*SIZEOF_YMMWORD  ; inptr1
363    add         edi, byte 1*SIZEOF_YMMWORD  ; outptr
364    cmp         ecx, byte SIZEOF_YMMWORD
365    jae         near .columnloop
366    test        ecx, ecx
367    jnz         near .columnloop_r24    ; a partial vector of columns is left
368
369    pop         esi
370    pop         edi
371    pop         ecx
372
373    add         esi, byte 2*SIZEOF_JSAMPROW  ; input_data
374    add         edi, byte 1*SIZEOF_JSAMPROW  ; output_data
375    dec         eax                          ; rowctr
376    jg          near .rowloop
377
378.return:
379    vzeroupper                          ; clear the upper halves of the ymm registers before returning
380    pop         edi
381    pop         esi
382;   pop         edx                     ; need not be preserved
383;   pop         ecx                     ; need not be preserved
384;   pop         ebx                     ; unused
385    pop         ebp
386    ret
387
388; For some reason, the OS X linker does not honor the request to align the
389; segment unless we do this.
390    align       32                      ; pad the current position up to a 32-byte boundary
391