• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1;
2; jcsample.asm - downsampling (64-bit AVX2)
3;
4; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5; Copyright (C) 2009, 2016, D. R. Commander.
6; Copyright (C) 2015, Intel Corporation.
7;
8; Based on the x86 SIMD extension for IJG JPEG library
9; Copyright (C) 1999-2006, MIYASAKA Masaru.
10; For conditions of distribution and use, see copyright notice in jsimdext.inc
11;
12; This file should be assembled with NASM (Netwide Assembler),
13; can *not* be assembled with Microsoft's MASM or any compatible
14; assembler (including Borland's Turbo Assembler).
15; NASM is available from http://nasm.sourceforge.net/ or
16; http://sourceforge.net/project/showfiles.php?group_id=6208
17
18%include "jsimdext.inc"
19
20; --------------------------------------------------------------------------
21    SECTION     SEG_TEXT
22    BITS        64
;
; Downsample pixel values of a single component.
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor,
;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                            JSAMPARRAY output_data);
;
; Operation:
;   1. output_cols = width_in_blocks * 8.  If output_cols * 2 > image_width,
;      the first max_v_samp_factor rows of input_data are padded on the right
;      (in place) by replicating each row's last real sample.
;   2. For each of v_samp_factor rows:
;        out[i] = (in[2*i] + in[2*i+1] + bias) >> 1
;      where bias alternates 0, 1, 0, 1, ... across output columns.
;
; Notes:
;   - Only the low 32 bits of the int/JDIMENSION arguments are read; the
;     upper halves of those registers are not assumed to be zero.
;   - Every store writes a full YMMWORD, including the 8/16/24-column tail,
;     so up to 24 bytes beyond output_cols are written in each output row.
;     NOTE(review): presumably the row allocator pads rows accordingly --
;     confirm against the caller.
;

; r10d = JDIMENSION image_width
; r11d = int max_v_samp_factor
; r12d = JDIMENSION v_samp_factor
; r13d = JDIMENSION width_in_blocks
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_downsample_avx2)

EXTN(jsimd_h2v1_downsample_avx2):
    push        rbp
    ; NOTE(review): rax is overwritten below before it is ever read in this
    ; file; presumably collect_args consumes it -- confirm in jsimdext.inc.
    mov         rax, rsp
    mov         rbp, rsp
    collect_args 6

    mov         ecx, r13d
    shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
    jz          near .return            ; nothing to do for zero-width output

    mov         edx, r10d               ; rdx = image_width (zero-extended)

    ; -- expand_right_edge

    push        rcx                     ; save output_cols
    shl         rcx, 1                  ; output_cols * 2
    sub         rcx, rdx                ; rcx = number of padding samples
    jle         short .expand_end       ; input already wide enough

    ; max_v_samp_factor is a 32-bit int: use r11d so that undefined upper
    ; register bits can neither skip the padding nor inflate the row count.
    mov         eax, r11d               ; rax = rows to pad
    test        eax, eax
    jle         short .expand_end

    cld
    mov         rsi, r14                ; input_data
.expandloop:
    push        rax                     ; al is reused as the fill byte
    push        rcx                     ; rep stosb consumes rcx

    mov         rdi, JSAMPROW [rsi]
    add         rdi, rdx                ; rdi = first column past the image
    mov         al, JSAMPLE [rdi-1]     ; al = last real sample of this row

    rep stosb                           ; replicate it rcx times

    pop         rcx
    pop         rax

    add         rsi, byte SIZEOF_JSAMPROW
    dec         rax
    jg          short .expandloop

.expand_end:
    pop         rcx                     ; output_cols

    ; -- h2v1_downsample

    mov         eax, r12d               ; rowctr
    test        eax, eax
    jle         near .return

    mov         edx, 0x00010000         ; bias pattern
    vmovd       xmm7, edx
    vpshufd     xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
    vperm2i128  ymm7, ymm7, ymm7, 0     ; ymm7={xmm7, xmm7}
    vpcmpeqw    ymm6, ymm6, ymm6
    vpsrlw      ymm6, ymm6, BYTE_BIT    ; ymm6={0xFF 0x00 0xFF 0x00 ..}

    mov         rsi, r14                ; input_data
    mov         rdi, r15                ; output_data
.rowloop:
    push        rcx                     ; output_cols, restored per row
    push        rdi                     ; output_data cursor
    push        rsi                     ; input_data cursor

    mov         rsi, JSAMPROW [rsi]     ; inptr
    mov         rdi, JSAMPROW [rdi]     ; outptr

    cmp         rcx, byte SIZEOF_YMMWORD
    jae         short .columnloop

    ; Tail handling: fewer than SIZEOF_YMMWORD output columns remain.  Load
    ; only the input bytes that exist (2 per output column), zero the rest,
    ; then run one ordinary iteration with rcx forced to SIZEOF_YMMWORD so
    ; that the loop terminates afterwards.
.columnloop_r24:
    ; rcx can possibly be 8, 16, 24
    cmp         rcx, 24
    jne         .columnloop_r16
    vmovdqu     ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    ; VEX-encoded 128-bit load: upper half of ymm1 is cleared
    vmovdqu     xmm1, XMMWORD [rsi+1*SIZEOF_YMMWORD]
    mov         rcx, SIZEOF_YMMWORD
    jmp         short .downsample

.columnloop_r16:
    cmp         rcx, 16
    jne         .columnloop_r8
    vmovdqu     ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vpxor       ymm1, ymm1, ymm1
    mov         rcx, SIZEOF_YMMWORD
    jmp         short .downsample

.columnloop_r8:
    ; VEX-encoded 128-bit load: upper half of ymm0 is cleared
    vmovdqu     xmm0, XMMWORD [rsi+0*SIZEOF_YMMWORD]
    vpxor       ymm1, ymm1, ymm1
    mov         rcx, SIZEOF_YMMWORD
    jmp         short .downsample

.columnloop:
    vmovdqu     ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     ymm1, YMMWORD [rsi+1*SIZEOF_YMMWORD]

.downsample:
    vpsrlw      ymm2, ymm0, BYTE_BIT    ; odd-indexed samples as words
    vpand       ymm0, ymm0, ymm6        ; even-indexed samples as words
    vpsrlw      ymm3, ymm1, BYTE_BIT
    vpand       ymm1, ymm1, ymm6

    vpaddw      ymm0, ymm0, ymm2        ; pair sums
    vpaddw      ymm1, ymm1, ymm3
    vpaddw      ymm0, ymm0, ymm7        ; + bias
    vpaddw      ymm1, ymm1, ymm7
    vpsrlw      ymm0, ymm0, 1           ; / 2
    vpsrlw      ymm1, ymm1, 1

    vpackuswb   ymm0, ymm0, ymm1        ; packs within each 128-bit lane
    vpermq      ymm0, ymm0, 0xd8        ; reorder qwords 0,2,1,3 -> linear

    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0

    sub         rcx, byte SIZEOF_YMMWORD    ; outcol
    add         rsi, byte 2*SIZEOF_YMMWORD  ; inptr
    add         rdi, byte 1*SIZEOF_YMMWORD  ; outptr
    cmp         rcx, byte SIZEOF_YMMWORD
    jae         short .columnloop
    test        rcx, rcx
    jnz         near .columnloop_r24

    pop         rsi
    pop         rdi
    pop         rcx

    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
    dec         rax                        ; rowctr
    jg          near .rowloop

.return:
    vzeroupper
    uncollect_args 6
    pop         rbp
    ret
183
184; --------------------------------------------------------------------------
;
; Downsample pixel values of a single component.
; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor,
;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                            JSAMPARRAY output_data);
;
; Operation:
;   1. output_cols = width_in_blocks * 8.  If output_cols * 2 > image_width,
;      the first max_v_samp_factor rows of input_data are padded on the right
;      (in place) by replicating each row's last real sample.
;   2. For each of v_samp_factor output rows, two consecutive input rows
;      (in0, in1) are consumed:
;        out[i] = (in0[2*i] + in0[2*i+1] + in1[2*i] + in1[2*i+1] + bias) >> 2
;      where bias alternates 1, 2, 1, 2, ... across output columns.
;
; Notes:
;   - Only the low 32 bits of the int/JDIMENSION arguments are read; the
;     upper halves of those registers are not assumed to be zero.
;   - Every store writes a full YMMWORD, including the 8/16/24-column tail,
;     so up to 24 bytes beyond output_cols are written in each output row.
;     NOTE(review): presumably the row allocator pads rows accordingly --
;     confirm against the caller.
;

; r10d = JDIMENSION image_width
; r11d = int max_v_samp_factor
; r12d = JDIMENSION v_samp_factor
; r13d = JDIMENSION width_in_blocks
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_downsample_avx2)

EXTN(jsimd_h2v2_downsample_avx2):
    push        rbp
    ; NOTE(review): rax is overwritten below before it is ever read in this
    ; file; presumably collect_args consumes it -- confirm in jsimdext.inc.
    mov         rax, rsp
    mov         rbp, rsp
    collect_args 6

    mov         ecx, r13d
    shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
    jz          near .return            ; nothing to do for zero-width output

    mov         edx, r10d               ; rdx = image_width (zero-extended)

    ; -- expand_right_edge

    push        rcx                     ; save output_cols
    shl         rcx, 1                  ; output_cols * 2
    sub         rcx, rdx                ; rcx = number of padding samples
    jle         short .expand_end       ; input already wide enough

    ; max_v_samp_factor is a 32-bit int: use r11d so that undefined upper
    ; register bits can neither skip the padding nor inflate the row count.
    mov         eax, r11d               ; rax = rows to pad
    test        eax, eax
    jle         short .expand_end

    cld
    mov         rsi, r14                ; input_data
.expandloop:
    push        rax                     ; al is reused as the fill byte
    push        rcx                     ; rep stosb consumes rcx

    mov         rdi, JSAMPROW [rsi]
    add         rdi, rdx                ; rdi = first column past the image
    mov         al, JSAMPLE [rdi-1]     ; al = last real sample of this row

    rep stosb                           ; replicate it rcx times

    pop         rcx
    pop         rax

    add         rsi, byte SIZEOF_JSAMPROW
    dec         rax
    jg          short .expandloop

.expand_end:
    pop         rcx                     ; output_cols

    ; -- h2v2_downsample

    mov         eax, r12d               ; rowctr
    test        eax, eax                ; 32-bit test, as in the h2v1 routine
    jle         near .return

    mov         edx, 0x00020001         ; bias pattern
    vmovd       xmm7, edx
    vpcmpeqw    ymm6, ymm6, ymm6
    vpshufd     xmm7, xmm7, 0x00        ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
    vperm2i128  ymm7, ymm7, ymm7, 0     ; ymm7={xmm7, xmm7}
    vpsrlw      ymm6, ymm6, BYTE_BIT    ; ymm6={0xFF 0x00 0xFF 0x00 ..}

    mov         rsi, r14                ; input_data
    mov         rdi, r15                ; output_data
.rowloop:
    push        rcx                     ; output_cols, restored per row
    push        rdi                     ; output_data cursor
    push        rsi                     ; input_data cursor

    mov         rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
    mov         rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1
    mov         rdi, JSAMPROW [rdi]                    ; outptr

    cmp         rcx, byte SIZEOF_YMMWORD
    jae         short .columnloop

    ; Tail handling: fewer than SIZEOF_YMMWORD output columns remain
    ; (rcx is 8, 16 or 24).  Load only the input bytes that exist (2 per
    ; output column and row), zero the rest, then run one ordinary iteration
    ; with rcx forced to SIZEOF_YMMWORD so that the loop terminates.
.columnloop_r24:
    cmp         rcx, 24
    jne         .columnloop_r16
    vmovdqu     ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD]
    vmovdqu     ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    ; VEX-encoded 128-bit loads: upper halves of ymm2/ymm3 are cleared
    vmovdqu     xmm2, XMMWORD [rdx+1*SIZEOF_YMMWORD]
    vmovdqu     xmm3, XMMWORD [rsi+1*SIZEOF_YMMWORD]
    mov         rcx, SIZEOF_YMMWORD
    jmp         short .downsample

.columnloop_r16:
    cmp         rcx, 16
    jne         .columnloop_r8
    vmovdqu     ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD]
    vmovdqu     ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vpxor       ymm2, ymm2, ymm2
    vpxor       ymm3, ymm3, ymm3
    mov         rcx, SIZEOF_YMMWORD
    jmp         short .downsample

.columnloop_r8:
    ; VEX-encoded 128-bit loads: upper halves of ymm0/ymm1 are cleared
    vmovdqu     xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
    vmovdqu     xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    vpxor       ymm2, ymm2, ymm2
    vpxor       ymm3, ymm3, ymm3
    mov         rcx, SIZEOF_YMMWORD
    jmp         short .downsample

.columnloop:
    vmovdqu     ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD]
    vmovdqu     ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     ymm2, YMMWORD [rdx+1*SIZEOF_YMMWORD]
    vmovdqu     ymm3, YMMWORD [rsi+1*SIZEOF_YMMWORD]

.downsample:
    ; Horizontal pair sums for the first 32 input columns of both rows
    vpand       ymm4, ymm0, ymm6        ; even-indexed samples as words
    vpsrlw      ymm0, ymm0, BYTE_BIT    ; odd-indexed samples as words
    vpand       ymm5, ymm1, ymm6
    vpsrlw      ymm1, ymm1, BYTE_BIT
    vpaddw      ymm0, ymm0, ymm4
    vpaddw      ymm1, ymm1, ymm5

    ; Horizontal pair sums for the next 32 input columns of both rows
    vpand       ymm4, ymm2, ymm6
    vpsrlw      ymm2, ymm2, BYTE_BIT
    vpand       ymm5, ymm3, ymm6
    vpsrlw      ymm3, ymm3, BYTE_BIT
    vpaddw      ymm2, ymm2, ymm4
    vpaddw      ymm3, ymm3, ymm5

    vpaddw      ymm0, ymm0, ymm1        ; row0 + row1 -> 2x2 sums
    vpaddw      ymm2, ymm2, ymm3
    vpaddw      ymm0, ymm0, ymm7        ; + bias
    vpaddw      ymm2, ymm2, ymm7
    vpsrlw      ymm0, ymm0, 2           ; / 4
    vpsrlw      ymm2, ymm2, 2

    vpackuswb   ymm0, ymm0, ymm2        ; packs within each 128-bit lane
    vpermq      ymm0, ymm0, 0xd8        ; reorder qwords 0,2,1,3 -> linear

    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0

    sub         rcx, byte SIZEOF_YMMWORD    ; outcol
    add         rdx, byte 2*SIZEOF_YMMWORD  ; inptr0
    add         rsi, byte 2*SIZEOF_YMMWORD  ; inptr1
    add         rdi, byte 1*SIZEOF_YMMWORD  ; outptr
    cmp         rcx, byte SIZEOF_YMMWORD
    jae         near .columnloop
    test        rcx, rcx
    jnz         near .columnloop_r24

    pop         rsi
    pop         rdi
    pop         rcx

    add         rsi, byte 2*SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte 1*SIZEOF_JSAMPROW  ; output_data
    dec         rax                          ; rowctr
    jg          near .rowloop

.return:
    vzeroupper
    uncollect_args 6
    pop         rbp
    ret
363
364; For some reason, the OS X linker does not honor the request to align the
365; segment unless we do this.
366    align       32
367