• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1;
2; jcsample.asm - downsampling (AVX2)
3;
4; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5; Copyright (C) 2015, Intel Corporation.
6; Copyright (C) 2016, D. R. Commander.
7;
8; Based on the x86 SIMD extension for IJG JPEG library
9; Copyright (C) 1999-2006, MIYASAKA Masaru.
10; For conditions of distribution and use, see copyright notice in jsimdext.inc
11;
; This file should be assembled with NASM (Netwide Assembler); it can *not*
; be assembled with Microsoft's MASM or any compatible assembler (including
; Borland's Turbo Assembler).
15; NASM is available from http://nasm.sourceforge.net/ or
16; http://sourceforge.net/project/showfiles.php?group_id=6208
17
18%include "jsimdext.inc"
19
20; --------------------------------------------------------------------------
21    SECTION     SEG_TEXT
22    BITS        32
;
; jsimd_h2v1_downsample_avx2
;
; Downsample the pixel values of one component by 2:1 horizontally and 1:1
; vertically, without smoothing.
;
; C prototype:
;   GLOBAL(void)
;   jsimd_h2v1_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
;                              JDIMENSION v_samp_factor,
;                              JDIMENSION width_in_blocks,
;                              JSAMPARRAY input_data, JSAMPARRAY output_data);
;
; The routine works in two phases:
;
;   1. Right-edge padding.  With output_cols = width_in_blocks * 8, each of
;      the first max_v_samp_factor input rows is extended from image_width
;      to 2 * output_cols samples by repeating its last real sample.  The
;      phase is skipped when no padding is needed or max_v_samp_factor <= 0.
;
;   2. Averaging.  For each of v_samp_factor rows, every horizontal pair of
;      input samples (a, b) yields one output sample (a + b + bias) >> 1,
;      where bias alternates 0, 1, 0, 1, ... across the output columns.
;
; Every store writes a full YMMWORD, so when output_cols is not a multiple
; of 32 the last store of a row runs past output_cols (the extra bytes come
; from zeroed registers).
; NOTE(review): presumably the caller's row buffers are padded to allow
; this -- confirm against the allocator.
;
; Register roles in phase 2:
;   eax  = rows left to process         ecx = output columns left in the row
;   esi  = input row pointer / inptr    edi = output row pointer / outptr
;   ymm6 = 0x00FF in every word         ymm7 = bias words {0, 1, 0, 1, ...}
;
; esi and edi are saved and restored; ebx is never touched; eax, ecx, edx and
; ymm0-ymm3, ymm6, ymm7 are overwritten.
;

; Stack arguments, addressed relative to the frame pointer passed as (b).
%define img_width(b)    (b) + 8         ; JDIMENSION image_width
%define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
%define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
%define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
%define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
%define output_data(b)  (b) + 28        ; JSAMPARRAY output_data

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_downsample_avx2)

EXTN(jsimd_h2v1_downsample_avx2):
    push        ebp
    mov         ebp, esp
    push        esi                     ; esi/edi are the only saved registers;
    push        edi                     ; ecx/edx need not be preserved

    mov         ecx, JDIMENSION [width_blks(ebp)]
    shl         ecx, 3                  ; ecx = output_cols (blocks * DCTSIZE)
    jz          near .done              ; zero output columns: nothing to do

    mov         edx, JDIMENSION [img_width(ebp)]

    ; ---- Phase 1: replicate the last real sample out to 2 * output_cols

    push        ecx                     ; keep output_cols for phase 2
    add         ecx, ecx                ; ecx = output_cols * 2
    sub         ecx, edx                ; ecx = number of samples to append
    jle         short .pad_done         ; rows are already wide enough

    mov         eax, INT [max_v_samp(ebp)]  ; eax = rows to pad
    test        eax, eax
    jle         short .pad_done

    mov         esi, JSAMPARRAY [input_data(ebp)]  ; esi -> row pointer array
    cld                                 ; rep stosb must run forward
    alignx      16, 7
.pad_row:
    push        ecx                     ; rep stosb consumes the count
    push        eax                     ; al is reused as the fill value

    mov         edi, JSAMPROW [esi]     ; edi = start of the current row
    add         edi, edx                ; edi = first column past the image
    mov         al, JSAMPLE [edi-1]     ; al = last real sample of the row

    rep stosb                           ; append ecx copies of al

    pop         eax                     ; eax = row counter again
    pop         ecx                     ; ecx = pad count again

    add         esi, byte SIZEOF_JSAMPROW  ; next row pointer
    dec         eax
    jg          short .pad_row

.pad_done:
    pop         ecx                     ; ecx = output_cols

    ; ---- Phase 2: average horizontal sample pairs

    mov         eax, JDIMENSION [v_samp(ebp)]  ; eax = row counter
    test        eax, eax
    jle         near .done

    ; ymm6: low-byte mask, 0x00FF in each of the 16 words
    vpcmpeqw    ymm6, ymm6, ymm6        ; all ones
    vpsrlw      ymm6, ymm6, BYTE_BIT    ; ymm6={0xFF 0x00 0xFF 0x00 ..}

    ; ymm7: rounding bias, alternating words 0 and 1
    mov         edx, 0x00010000         ; low word 0, high word 1
    vmovd       xmm7, edx
    vpshufd     xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
    vperm2i128  ymm7, ymm7, ymm7, 0     ; copy the low lane into both lanes

    mov         edi, JSAMPARRAY [output_data(ebp)]  ; edi -> output row ptrs
    mov         esi, JSAMPARRAY [input_data(ebp)]   ; esi -> input row ptrs
    alignx      16, 7
.next_row:
    push        ecx                     ; output_cols, reloaded for each row
    push        edi                     ; position in output_data
    push        esi                     ; position in input_data

    mov         edi, JSAMPROW [edi]     ; edi = outptr
    mov         esi, JSAMPROW [esi]     ; esi = inptr

    cmp         ecx, byte SIZEOF_YMMWORD
    jae         short .full_chunk       ; at least 32 output columns left
    alignx      16, 7

.tail:
    ; Fewer than 32 output columns remain.  ecx is a nonzero multiple of 8
    ; here, so it is exactly 8, 16 or 24.  Load only the input that exists
    ; (2 * ecx bytes) and leave the rest of ymm0/ymm1 zero.
    cmp         ecx, 16
    je          short .tail_16
    jb          short .tail_8

    ; 24 output columns <- 48 input bytes
    vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
    vmovdqu     xmm1, XMMWORD [esi+1*SIZEOF_YMMWORD]  ; upper lane zeroed
    mov         ecx, SIZEOF_YMMWORD     ; makes this the final pass
    jmp         short .average

.tail_16:
    ; 16 output columns <- 32 input bytes
    vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
    vpxor       ymm1, ymm1, ymm1
    mov         ecx, SIZEOF_YMMWORD     ; makes this the final pass
    jmp         short .average

.tail_8:
    ; 8 output columns <- 16 input bytes
    vmovdqu     xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]  ; upper lane zeroed
    vpxor       ymm1, ymm1, ymm1
    mov         ecx, SIZEOF_YMMWORD     ; makes this the final pass
    jmp         short .average
    alignx      16, 7

.full_chunk:
    ; 32 output columns <- 64 input bytes
    vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
    vmovdqu     ymm1, YMMWORD [esi+1*SIZEOF_YMMWORD]

.average:
    ; First 32 input bytes: word i becomes (even + odd + bias) >> 1
    vpand       ymm2, ymm0, ymm6        ; even-indexed samples
    vpsrlw      ymm0, ymm0, BYTE_BIT    ; odd-indexed samples
    vpaddw      ymm0, ymm0, ymm2
    vpaddw      ymm0, ymm0, ymm7
    vpsrlw      ymm0, ymm0, 1

    ; Second 32 input bytes, same computation
    vpand       ymm3, ymm1, ymm6        ; even-indexed samples
    vpsrlw      ymm1, ymm1, BYTE_BIT    ; odd-indexed samples
    vpaddw      ymm1, ymm1, ymm3
    vpaddw      ymm1, ymm1, ymm7
    vpsrlw      ymm1, ymm1, 1

    ; vpackuswb packs each 128-bit lane separately, which interleaves the
    ; two halves; swapping the middle quadwords restores column order.
    vpackuswb   ymm0, ymm0, ymm1
    vpermq      ymm0, ymm0, 0xd8

    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0

    add         esi, byte 2*SIZEOF_YMMWORD  ; inptr  += 64
    add         edi, byte 1*SIZEOF_YMMWORD  ; outptr += 32
    sub         ecx, byte SIZEOF_YMMWORD    ; 32 fewer columns to produce
    cmp         ecx, byte SIZEOF_YMMWORD
    jae         short .full_chunk
    test        ecx, ecx
    jnz         near .tail              ; 8, 16 or 24 columns left over

    pop         esi                     ; back to the row pointer arrays
    pop         edi
    pop         ecx                     ; ecx = output_cols

    add         edi, byte SIZEOF_JSAMPROW  ; next output row pointer
    add         esi, byte SIZEOF_JSAMPROW  ; next input row pointer
    dec         eax                        ; one row done
    jg          near .next_row

.done:
    vzeroupper                          ; clear upper YMM state before return
    pop         edi
    pop         esi
    pop         ebp
    ret
194
195; --------------------------------------------------------------------------
;
; jsimd_h2v2_downsample_avx2
;
; Downsample the pixel values of one component by 2:1 horizontally and 2:1
; vertically, without smoothing.
;
; C prototype:
;   GLOBAL(void)
;   jsimd_h2v2_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
;                              JDIMENSION v_samp_factor,
;                              JDIMENSION width_in_blocks,
;                              JSAMPARRAY input_data, JSAMPARRAY output_data);
;
; The routine works in two phases:
;
;   1. Right-edge padding.  With output_cols = width_in_blocks * 8, each of
;      the first max_v_samp_factor input rows is extended from image_width
;      to 2 * output_cols samples by repeating its last real sample.  The
;      phase is skipped when no padding is needed or max_v_samp_factor <= 0.
;
;   2. Averaging.  Each of the v_samp_factor output rows is built from two
;      consecutive input rows.  Every 2x2 group of input samples (a, b on
;      the first row, c, d on the second) yields one output sample
;      (a + b + c + d + bias) >> 2, where bias alternates 1, 2, 1, 2, ...
;      across the output columns.
;
; Every store writes a full YMMWORD, so when output_cols is not a multiple
; of 32 the last store of a row runs past output_cols.
; NOTE(review): presumably the caller's row buffers are padded to allow
; this -- confirm against the allocator.
;
; Register roles in phase 2:
;   eax  = output rows left             ecx = output columns left in the row
;   edx  = inptr0 (first input row)     esi = input row pointers / inptr1
;   edi  = output row pointers / outptr
;   ymm6 = 0x00FF in every word         ymm7 = bias words {1, 2, 1, 2, ...}
;
; esi and edi are saved and restored; ebx is never touched; eax, ecx, edx and
; ymm0-ymm7 are overwritten.
;

; Stack arguments, addressed relative to the frame pointer passed as (b).
%define img_width(b)    (b) + 8         ; JDIMENSION image_width
%define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
%define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
%define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
%define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
%define output_data(b)  (b) + 28        ; JSAMPARRAY output_data

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_downsample_avx2)

EXTN(jsimd_h2v2_downsample_avx2):
    push        ebp
    mov         ebp, esp
    push        esi                     ; esi/edi are the only saved registers;
    push        edi                     ; ecx/edx need not be preserved

    mov         ecx, JDIMENSION [width_blks(ebp)]
    shl         ecx, 3                  ; ecx = output_cols (blocks * DCTSIZE)
    jz          near .done              ; zero output columns: nothing to do

    mov         edx, JDIMENSION [img_width(ebp)]

    ; ---- Phase 1: replicate the last real sample out to 2 * output_cols

    push        ecx                     ; keep output_cols for phase 2
    add         ecx, ecx                ; ecx = output_cols * 2
    sub         ecx, edx                ; ecx = number of samples to append
    jle         short .pad_done         ; rows are already wide enough

    mov         eax, INT [max_v_samp(ebp)]  ; eax = rows to pad
    test        eax, eax
    jle         short .pad_done

    mov         esi, JSAMPARRAY [input_data(ebp)]  ; esi -> row pointer array
    cld                                 ; rep stosb must run forward
    alignx      16, 7
.pad_row:
    push        ecx                     ; rep stosb consumes the count
    push        eax                     ; al is reused as the fill value

    mov         edi, JSAMPROW [esi]     ; edi = start of the current row
    add         edi, edx                ; edi = first column past the image
    mov         al, JSAMPLE [edi-1]     ; al = last real sample of the row

    rep stosb                           ; append ecx copies of al

    pop         eax                     ; eax = row counter again
    pop         ecx                     ; ecx = pad count again

    add         esi, byte SIZEOF_JSAMPROW  ; next row pointer
    dec         eax
    jg          short .pad_row

.pad_done:
    pop         ecx                     ; ecx = output_cols

    ; ---- Phase 2: average 2x2 sample groups

    mov         eax, JDIMENSION [v_samp(ebp)]  ; eax = output row counter
    test        eax, eax
    jle         near .done

    ; ymm7: rounding bias, alternating words 1 and 2
    mov         edx, 0x00020001         ; low word 1, high word 2
    vmovd       xmm7, edx
    vpshufd     xmm7, xmm7, 0x00        ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
    vperm2i128  ymm7, ymm7, ymm7, 0     ; copy the low lane into both lanes

    ; ymm6: low-byte mask, 0x00FF in each of the 16 words
    vpcmpeqw    ymm6, ymm6, ymm6        ; all ones
    vpsrlw      ymm6, ymm6, BYTE_BIT    ; ymm6={0xFF 0x00 0xFF 0x00 ..}

    mov         edi, JSAMPARRAY [output_data(ebp)]  ; edi -> output row ptrs
    mov         esi, JSAMPARRAY [input_data(ebp)]   ; esi -> input row ptrs
    alignx      16, 7
.next_row:
    push        ecx                     ; output_cols, reloaded for each row
    push        edi                     ; position in output_data
    push        esi                     ; position in input_data

    mov         edi, JSAMPROW [edi]                    ; edi = outptr
    mov         edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; edx = inptr0
    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; esi = inptr1

    cmp         ecx, byte SIZEOF_YMMWORD
    jae         short .full_chunk       ; at least 32 output columns left
    alignx      16, 7

.tail:
    ; Fewer than 32 output columns remain.  ecx is a nonzero multiple of 8
    ; here, so it is exactly 8, 16 or 24.  Load only the input that exists
    ; (2 * ecx bytes per row) and leave the rest of ymm0-ymm3 zero.
    cmp         ecx, 16
    je          short .tail_16
    jb          short .tail_8

    ; 24 output columns <- 48 input bytes from each row
    vmovdqu     ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD]
    vmovdqu     ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]
    vmovdqu     xmm2, XMMWORD [edx+1*SIZEOF_YMMWORD]  ; upper lane zeroed
    vmovdqu     xmm3, XMMWORD [esi+1*SIZEOF_YMMWORD]  ; upper lane zeroed
    mov         ecx, SIZEOF_YMMWORD     ; makes this the final pass
    jmp         short .average

.tail_16:
    ; 16 output columns <- 32 input bytes from each row
    vmovdqu     ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD]
    vmovdqu     ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]
    vpxor       ymm2, ymm2, ymm2
    vpxor       ymm3, ymm3, ymm3
    mov         ecx, SIZEOF_YMMWORD     ; makes this the final pass
    jmp         short .average

.tail_8:
    ; 8 output columns <- 16 input bytes from each row
    vmovdqu     xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]  ; upper lane zeroed
    vmovdqu     xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]  ; upper lane zeroed
    vpxor       ymm2, ymm2, ymm2
    vpxor       ymm3, ymm3, ymm3
    mov         ecx, SIZEOF_YMMWORD     ; makes this the final pass
    jmp         short .average
    alignx      16, 7

.full_chunk:
    ; 32 output columns <- 64 input bytes from each row
    vmovdqu     ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD]
    vmovdqu     ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]
    vmovdqu     ymm2, YMMWORD [edx+1*SIZEOF_YMMWORD]
    vmovdqu     ymm3, YMMWORD [esi+1*SIZEOF_YMMWORD]

.average:
    ; First 32 bytes of both rows: word i becomes the sum of one 2x2 group
    vpsrlw      ymm4, ymm0, BYTE_BIT    ; row 0, odd-indexed samples
    vpand       ymm0, ymm0, ymm6        ; row 0, even-indexed samples
    vpaddw      ymm0, ymm0, ymm4
    vpsrlw      ymm5, ymm1, BYTE_BIT    ; row 1, odd-indexed samples
    vpand       ymm1, ymm1, ymm6        ; row 1, even-indexed samples
    vpaddw      ymm1, ymm1, ymm5
    vpaddw      ymm0, ymm0, ymm1        ; row 0 + row 1

    ; Second 32 bytes of both rows, same computation
    vpsrlw      ymm4, ymm2, BYTE_BIT    ; row 0, odd-indexed samples
    vpand       ymm2, ymm2, ymm6        ; row 0, even-indexed samples
    vpaddw      ymm2, ymm2, ymm4
    vpsrlw      ymm5, ymm3, BYTE_BIT    ; row 1, odd-indexed samples
    vpand       ymm3, ymm3, ymm6        ; row 1, even-indexed samples
    vpaddw      ymm3, ymm3, ymm5
    vpaddw      ymm2, ymm2, ymm3        ; row 0 + row 1

    ; (sum + bias) >> 2
    vpaddw      ymm0, ymm0, ymm7
    vpsrlw      ymm0, ymm0, 2
    vpaddw      ymm2, ymm2, ymm7
    vpsrlw      ymm2, ymm2, 2

    ; vpackuswb packs each 128-bit lane separately, which interleaves the
    ; two halves; swapping the middle quadwords restores column order.
    vpackuswb   ymm0, ymm0, ymm2
    vpermq      ymm0, ymm0, 0xd8

    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0

    add         edx, byte 2*SIZEOF_YMMWORD  ; inptr0 += 64
    add         esi, byte 2*SIZEOF_YMMWORD  ; inptr1 += 64
    add         edi, byte 1*SIZEOF_YMMWORD  ; outptr += 32
    sub         ecx, byte SIZEOF_YMMWORD    ; 32 fewer columns to produce
    cmp         ecx, byte SIZEOF_YMMWORD
    jae         near .full_chunk
    test        ecx, ecx
    jnz         near .tail              ; 8, 16 or 24 columns left over

    pop         esi                     ; back to the row pointer arrays
    pop         edi
    pop         ecx                     ; ecx = output_cols

    add         edi, byte 1*SIZEOF_JSAMPROW  ; one output row produced
    add         esi, byte 2*SIZEOF_JSAMPROW  ; two input rows consumed
    dec         eax                          ; one output row done
    jg          near .next_row

.done:
    vzeroupper                          ; clear upper YMM state before return
    pop         edi
    pop         esi
    pop         ebp
    ret
385
386; For some reason, the OS X linker does not honor the request to align the
387; segment unless we do this.
388    align       32
389