;
; jdsample.asm - upsampling (64-bit AVX2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander.
; Copyright (C) 2015, Intel Corporation.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jsimdext.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_CONST

    alignz      32
    GLOBAL_DATA(jconst_fancy_upsample_avx2)

EXTN(jconst_fancy_upsample_avx2):

; 16-lane packed-word constants used by the fancy (triangle-filter)
; upsamplers below: multipliers and rounding terms for the
; (3*nearer + farther + round) / 2^k interpolation.
PW_ONE   times 16 dw 1
PW_TWO   times 16 dw 2
PW_THREE times 16 dw 3
PW_SEVEN times 16 dw 7
PW_EIGHT times 16 dw 8

    alignz      32
; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        64
;
; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
;
; The upsampling algorithm is linear interpolation between pixel centers,
; also known as a "triangle filter".  This is a good compromise between
; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
; of the way between input pixel centers.
;
; GLOBAL(void)
; jsimd_h2v1_fancy_upsample_avx2(int max_v_samp_factor,
;                                JDIMENSION downsampled_width,
;                                JSAMPARRAY input_data,
;                                JSAMPARRAY *output_data_ptr);
;
; Each input row of downsampled_width samples is expanded to one output row
; of 2*downsampled_width samples:
;   out[2i]   = (3*in[i] + in[i-1] + 1) >> 2
;   out[2i+1] = (3*in[i] + in[i+1] + 2) >> 2
; with the edge samples replicated for i-1/i+1 beyond the row.

; Register usage (argument registers as set up by collect_args):
; r10 = int max_v_samp_factor
; r11d = JDIMENSION downsampled_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_avx2)

EXTN(jsimd_h2v1_fancy_upsample_avx2):
    push        rbp
    mov         rax, rsp
    mov         rbp, rsp
    push_xmm    3                       ; preserve callee-saved xmm regs (see jsimdext.inc)
    collect_args 4

    mov         eax, r11d               ; colctr
    test        rax, rax
    jz          near .return            ; zero-width image: nothing to do

    mov         rcx, r10                ; rowctr
    test        rcx, rcx
    jz          near .return

    mov         rsi, r12                ; input_data
    mov         rdi, r13
    mov         rdi, JSAMPARRAY [rdi]   ; output_data

    ; Byte-lane edge masks: ymm10 keeps only the lowest byte of a YMMWORD
    ; (left-edge neighbor), ymm9 keeps only the highest byte (right edge).
    vpxor       ymm0, ymm0, ymm0                 ; ymm0=(all 0's)
    vpcmpeqb    xmm9, xmm9, xmm9
    vpsrldq     xmm10, xmm9, (SIZEOF_XMMWORD-1)  ; (ff -- -- -- ... -- --) LSB is ff

    vpslldq     xmm9, xmm9, (SIZEOF_XMMWORD-1)
    vperm2i128  ymm9, ymm9, ymm9, 1              ; (---- ---- ... ---- ---- ff) MSB is ff

.rowloop:
    push        rax                     ; colctr
    push        rdi
    push        rsi

    mov         rsi, JSAMPROW [rsi]     ; inptr
    mov         rdi, JSAMPROW [rdi]     ; outptr

    test        rax, SIZEOF_YMMWORD-1
    jz          short .skip
    ; width not a multiple of 32: replicate the last real sample so the
    ; final (partial) vector iteration reads a defined right neighbor
    mov         dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
.skip:
    ; ymm7 = replicated left-edge sample, used as the "column -1" neighbor
    vpand       ymm7, ymm10, YMMWORD [rsi+0*SIZEOF_YMMWORD]

    add         rax, byte SIZEOF_YMMWORD-1
    and         rax, byte -SIZEOF_YMMWORD   ; round colctr up to whole YMMWORDs
    cmp         rax, byte SIZEOF_YMMWORD
    ja          short .columnloop

.columnloop_last:
    ; -- process the last column block: the "column 32" neighbor is the
    ;    last sample of the current block (right-edge replication)
    vpand       ymm6, ymm9, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    jmp         short .upsample

.columnloop:
    ; -- interior block: the "column 32" neighbor is the first sample of
    ;    the next block, shifted into the top byte position
    vmovdqu     ymm6, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    vperm2i128  ymm6, ymm0, ymm6, 0x20
    vpslldq     ymm6, ymm6, 15

.upsample:
    vmovdqu     ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]  ; ymm1=( 0  1  2 ... 29 30 31)

    vperm2i128  ymm2, ymm0, ymm1, 0x20
    vpalignr    ymm2, ymm1, ymm2, 15            ; ymm2=(--  0  1 ... 28 29 30)
    vperm2i128  ymm4, ymm0, ymm1, 0x03
    vpalignr    ymm3, ymm4, ymm1, 1             ; ymm3=( 1  2  3 ... 30 31 --)

    vpor        ymm2, ymm2, ymm7                ; ymm2=(-1  0  1 ... 28 29 30)
    vpor        ymm3, ymm3, ymm6                ; ymm3=( 1  2  3 ... 30 31 32)

    ; carry the last sample of this block forward as the next iteration's
    ; "column -1" neighbor
    vpsrldq     ymm7, ymm4, (SIZEOF_XMMWORD-1)  ; ymm7=(31 -- -- ... -- -- --)

    ; widen bytes to words, permuting so word lanes are in linear order
    vpunpckhbw  ymm4, ymm1, ymm0                ; ymm4=( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
    vpunpcklbw  ymm5, ymm1, ymm0                ; ymm5=( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
    vperm2i128  ymm1, ymm5, ymm4, 0x20          ; ymm1=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vperm2i128  ymm4, ymm5, ymm4, 0x31          ; ymm4=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vpunpckhbw  ymm5, ymm2, ymm0                ; ymm5=( 7  8  9 10 11 12 13 14 23 24 25 26 27 28 29 30)
    vpunpcklbw  ymm6, ymm2, ymm0                ; ymm6=(-1  0  1  2  3  4  5  6 15 16 17 18 19 20 21 22)
    vperm2i128  ymm2, ymm6, ymm5, 0x20          ; ymm2=(-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
    vperm2i128  ymm5, ymm6, ymm5, 0x31          ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)

    ; NOTE(review): the original lane comments on the next two lines were
    ; swapped; vpunpckhbw yields the high byte halves ( 9..16 25..32).
    vpunpckhbw  ymm6, ymm3, ymm0                ; ymm6=( 9 10 11 12 13 14 15 16 25 26 27 28 29 30 31 32)
    vpunpcklbw  ymm8, ymm3, ymm0                ; ymm8=( 1  2  3  4  5  6  7  8 17 18 19 20 21 22 23 24)
    vperm2i128  ymm3, ymm8, ymm6, 0x20          ; ymm3=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16)
    vperm2i128  ymm6, ymm8, ymm6, 0x31          ; ymm6=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)

    ; even outputs: (3*s(i) + s(i-1) + 1) >> 2
    ; odd outputs:  (3*s(i) + s(i+1) + 2) >> 2
    vpmullw     ymm1, ymm1, [rel PW_THREE]
    vpmullw     ymm4, ymm4, [rel PW_THREE]
    vpaddw      ymm2, ymm2, [rel PW_ONE]
    vpaddw      ymm5, ymm5, [rel PW_ONE]
    vpaddw      ymm3, ymm3, [rel PW_TWO]
    vpaddw      ymm6, ymm6, [rel PW_TWO]

    vpaddw      ymm2, ymm2, ymm1
    vpaddw      ymm5, ymm5, ymm4
    vpsrlw      ymm2, ymm2, 2                   ; ymm2=OutLE=( 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30)
    vpsrlw      ymm5, ymm5, 2                   ; ymm5=OutHE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
    vpaddw      ymm3, ymm3, ymm1
    vpaddw      ymm6, ymm6, ymm4
    vpsrlw      ymm3, ymm3, 2                   ; ymm3=OutLO=( 1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31)
    vpsrlw      ymm6, ymm6, 2                   ; ymm6=OutHO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)

    ; merge even results (low bytes) with odd results (high bytes)
    vpsllw      ymm3, ymm3, BYTE_BIT
    vpsllw      ymm6, ymm6, BYTE_BIT
    vpor        ymm2, ymm2, ymm3                ; ymm2=OutL=( 0  1  2 ... 29 30 31)
    vpor        ymm5, ymm5, ymm6                ; ymm5=OutH=(32 33 34 ... 61 62 63)

    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm2
    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm5

    sub         rax, byte SIZEOF_YMMWORD
    add         rsi, byte 1*SIZEOF_YMMWORD  ; inptr
    add         rdi, byte 2*SIZEOF_YMMWORD  ; outptr
    cmp         rax, byte SIZEOF_YMMWORD
    ja          near .columnloop
    test        eax, eax
    jnz         near .columnloop_last

    pop         rsi
    pop         rdi
    pop         rax

    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
    dec         rcx                        ; rowctr
    jg          near .rowloop

.return:
    vzeroupper                             ; avoid AVX->SSE transition penalties in caller
    uncollect_args 4
    pop_xmm     3
    pop         rbp
    ret
194
195; --------------------------------------------------------------------------
196;
197; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
198; Again a triangle filter; see comments for h2v1 case, above.
199;
200; GLOBAL(void)
201; jsimd_h2v2_fancy_upsample_avx2(int max_v_samp_factor,
202;                                JDIMENSION downsampled_width,
203;                                JSAMPARRAY input_data,
204;                                JSAMPARRAY *output_data_ptr);
205;
206
207; r10 = int max_v_samp_factor
208; r11d = JDIMENSION downsampled_width
209; r12 = JSAMPARRAY input_data
210; r13 = JSAMPARRAY *output_data_ptr
211
212%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD  ; ymmword wk[WK_NUM]
213%define WK_NUM  4
214
215    align       32
216    GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_avx2)
217
218EXTN(jsimd_h2v2_fancy_upsample_avx2):
219    push        rbp
220    mov         rax, rsp                     ; rax = original rbp
221    sub         rsp, byte 4
222    and         rsp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
223    mov         [rsp], rax
224    mov         rbp, rsp                     ; rbp = aligned rbp
225    lea         rsp, [wk(0)]
226    push_xmm    3
227    collect_args 4
228    push        rbx
229
230    mov         eax, r11d               ; colctr
231    test        rax, rax
232    jz          near .return
233
234    mov         rcx, r10                ; rowctr
235    test        rcx, rcx
236    jz          near .return
237
238    mov         rsi, r12                ; input_data
239    mov         rdi, r13
240    mov         rdi, JSAMPARRAY [rdi]   ; output_data
241.rowloop:
242    push        rax                     ; colctr
243    push        rcx
244    push        rdi
245    push        rsi
246
247    mov         rcx, JSAMPROW [rsi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
248    mov         rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
249    mov         rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
250    mov         rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]  ; outptr0
251    mov         rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]  ; outptr1
252
253    vpxor       ymm8, ymm8, ymm8                 ; ymm8=(all 0's)
254    vpcmpeqb    xmm9, xmm9, xmm9
255    vpsrldq     xmm10, xmm9, (SIZEOF_XMMWORD-2)  ; (ffff ---- ---- ... ---- ----) LSB is ffff
256    vpslldq     xmm9, xmm9, (SIZEOF_XMMWORD-2)
257    vperm2i128  ymm9, ymm9, ymm9, 1              ; (---- ---- ... ---- ---- ffff) MSB is ffff
258
259    test        rax, SIZEOF_YMMWORD-1
260    jz          short .skip
261    push        rdx
262    mov         dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
263    mov         JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
264    mov         dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
265    mov         JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
266    mov         dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
267    mov         JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
268    pop         rdx
269.skip:
270    ; -- process the first column block
271
272    vmovdqu     ymm0, YMMWORD [rbx+0*SIZEOF_YMMWORD]  ; ymm0=row[ 0][0]
273    vmovdqu     ymm1, YMMWORD [rcx+0*SIZEOF_YMMWORD]  ; ymm1=row[-1][0]
274    vmovdqu     ymm2, YMMWORD [rsi+0*SIZEOF_YMMWORD]  ; ymm2=row[+1][0]
275
276    vpunpckhbw  ymm4, ymm0, ymm8        ; ymm4=row[ 0]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
277    vpunpcklbw  ymm5, ymm0, ymm8        ; ymm5=row[ 0]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
278    vperm2i128  ymm0, ymm5, ymm4, 0x20  ; ymm0=row[ 0]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
279    vperm2i128  ymm4, ymm5, ymm4, 0x31  ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
280
281    vpunpckhbw  ymm5, ymm1, ymm8        ; ymm5=row[-1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
282    vpunpcklbw  ymm6, ymm1, ymm8        ; ymm6=row[-1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
283    vperm2i128  ymm1, ymm6, ymm5, 0x20  ; ymm1=row[-1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
284    vperm2i128  ymm5, ymm6, ymm5, 0x31  ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
285
286    vpunpckhbw  ymm6, ymm2, ymm8        ; ymm6=row[+1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
287    vpunpcklbw  ymm3, ymm2, ymm8        ; ymm3=row[+1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
288    vperm2i128  ymm2, ymm3, ymm6, 0x20  ; ymm2=row[+1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
289    vperm2i128  ymm6, ymm3, ymm6, 0x31  ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
290
291    vpmullw     ymm0, ymm0, [rel PW_THREE]
292    vpmullw     ymm4, ymm4, [rel PW_THREE]
293
294    vpaddw      ymm1, ymm1, ymm0        ; ymm1=Int0L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
295    vpaddw      ymm5, ymm5, ymm4        ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
296    vpaddw      ymm2, ymm2, ymm0        ; ymm2=Int1L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
297    vpaddw      ymm6, ymm6, ymm4        ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
298
299    vmovdqu     YMMWORD [rdx+0*SIZEOF_YMMWORD], ymm1  ; temporarily save
300    vmovdqu     YMMWORD [rdx+1*SIZEOF_YMMWORD], ymm5  ; the intermediate data
301    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm2
302    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm6
303
304    vpand       ymm1, ymm1, ymm10       ; ymm1=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
305    vpand       ymm2, ymm2, ymm10       ; ymm2=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
306
307    vmovdqa     YMMWORD [wk(0)], ymm1
308    vmovdqa     YMMWORD [wk(1)], ymm2
309
310    add         rax, byte SIZEOF_YMMWORD-1
311    and         rax, byte -SIZEOF_YMMWORD
312    cmp         rax, byte SIZEOF_YMMWORD
313    ja          short .columnloop
314
315.columnloop_last:
316    ; -- process the last column block
317
318    vpand       ymm1, ymm9, YMMWORD [rdx+1*SIZEOF_YMMWORD]
319    vpand       ymm2, ymm9, YMMWORD [rdi+1*SIZEOF_YMMWORD]
320
321    vmovdqa     YMMWORD [wk(2)], ymm1   ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)
322    vmovdqa     YMMWORD [wk(3)], ymm2   ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)
323
324    jmp         near .upsample
325
326.columnloop:
327    ; -- process the next column block
328
329    vmovdqu     ymm0, YMMWORD [rbx+1*SIZEOF_YMMWORD]  ; ymm0=row[ 0][1]
330    vmovdqu     ymm1, YMMWORD [rcx+1*SIZEOF_YMMWORD]  ; ymm1=row[-1][1]
331    vmovdqu     ymm2, YMMWORD [rsi+1*SIZEOF_YMMWORD]  ; ymm2=row[+1][1]
332
333    vpunpckhbw  ymm4, ymm0, ymm8        ; ymm4=row[ 0]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
334    vpunpcklbw  ymm5, ymm0, ymm8        ; ymm5=row[ 0]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
335    vperm2i128  ymm0, ymm5, ymm4, 0x20  ; ymm0=row[ 0]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
336    vperm2i128  ymm4, ymm5, ymm4, 0x31  ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
337
338    vpunpckhbw  ymm5, ymm1, ymm8        ; ymm5=row[-1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
339    vpunpcklbw  ymm6, ymm1, ymm8        ; ymm6=row[-1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
340    vperm2i128  ymm1, ymm6, ymm5, 0x20  ; ymm1=row[-1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
341    vperm2i128  ymm5, ymm6, ymm5, 0x31  ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
342
343    vpunpckhbw  ymm6, ymm2, ymm8        ; ymm6=row[+1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
344    vpunpcklbw  ymm7, ymm2, ymm8        ; ymm7=row[+1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
345    vperm2i128  ymm2, ymm7, ymm6, 0x20  ; ymm2=row[+1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
346    vperm2i128  ymm6, ymm7, ymm6, 0x31  ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
347
348    vpmullw     ymm0, ymm0, [rel PW_THREE]
349    vpmullw     ymm4, ymm4, [rel PW_THREE]
350
351    vpaddw      ymm1, ymm1, ymm0        ; ymm1=Int0L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
352    vpaddw      ymm5, ymm5, ymm4        ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
353    vpaddw      ymm2, ymm2, ymm0        ; ymm2=Int1L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
354    vpaddw      ymm6, ymm6, ymm4        ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
355
356    vmovdqu     YMMWORD [rdx+2*SIZEOF_YMMWORD], ymm1  ; temporarily save
357    vmovdqu     YMMWORD [rdx+3*SIZEOF_YMMWORD], ymm5  ; the intermediate data
358    vmovdqu     YMMWORD [rdi+2*SIZEOF_YMMWORD], ymm2
359    vmovdqu     YMMWORD [rdi+3*SIZEOF_YMMWORD], ymm6
360
361    vperm2i128  ymm1, ymm8, ymm1, 0x20
362    vpslldq     ymm1, ymm1, 14          ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- --  0)
363    vperm2i128  ymm2, ymm8, ymm2, 0x20
364    vpslldq     ymm2, ymm2, 14          ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- --  0)
365
366    vmovdqa     YMMWORD [wk(2)], ymm1
367    vmovdqa     YMMWORD [wk(3)], ymm2
368
369.upsample:
370    ; -- process the upper row
371
372    vmovdqu     ymm7, YMMWORD [rdx+0*SIZEOF_YMMWORD]  ; ymm7=Int0L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
373    vmovdqu     ymm3, YMMWORD [rdx+1*SIZEOF_YMMWORD]  ; ymm3=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
374
375    vperm2i128  ymm0, ymm8, ymm7, 0x03
376    vpalignr    ymm0, ymm0, ymm7, 2     ; ymm0=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 --)
377    vperm2i128  ymm4, ymm8, ymm3, 0x20
378    vpslldq     ymm4, ymm4, 14          ; ymm4=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)
379
380    vperm2i128  ymm5, ymm8, ymm7, 0x03
381    vpsrldq     ymm5, ymm5, 14          ; ymm5=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
382    vperm2i128  ymm6, ymm8, ymm3, 0x20
383    vpalignr    ymm6, ymm3, ymm6, 14    ; ymm6=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
384
385    vpor        ymm0, ymm0, ymm4        ; ymm0=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16)
386    vpor        ymm5, ymm5, ymm6        ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
387
388    vperm2i128  ymm2, ymm8, ymm3, 0x03
389    vpalignr    ymm2, ymm2, ymm3, 2     ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
390    vperm2i128  ymm4, ymm8, ymm3, 0x03
391    vpsrldq     ymm4, ymm4, 14          ; ymm4=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
392    vperm2i128  ymm1, ymm8, ymm7, 0x20
393    vpalignr    ymm1, ymm7, ymm1, 14    ; ymm1=(--  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
394
395    vpor        ymm1, ymm1, YMMWORD [wk(0)]  ; ymm1=(-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
396    vpor        ymm2, ymm2, YMMWORD [wk(2)]  ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
397
398    vmovdqa     YMMWORD [wk(0)], ymm4
399
400    vpmullw     ymm7, ymm7, [rel PW_THREE]
401    vpmullw     ymm3, ymm3, [rel PW_THREE]
402    vpaddw      ymm1, ymm1, [rel PW_EIGHT]
403    vpaddw      ymm5, ymm5, [rel PW_EIGHT]
404    vpaddw      ymm0, ymm0, [rel PW_SEVEN]
405    vpaddw      ymm2, [rel PW_SEVEN]
406
407    vpaddw      ymm1, ymm1, ymm7
408    vpaddw      ymm5, ymm5, ymm3
409    vpsrlw      ymm1, ymm1, 4           ; ymm1=Out0LE=( 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30)
410    vpsrlw      ymm5, ymm5, 4           ; ymm5=Out0HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
411    vpaddw      ymm0, ymm0, ymm7
412    vpaddw      ymm2, ymm2, ymm3
413    vpsrlw      ymm0, ymm0, 4           ; ymm0=Out0LO=( 1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31)
414    vpsrlw      ymm2, ymm2, 4           ; ymm2=Out0HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
415
416    vpsllw      ymm0, ymm0, BYTE_BIT
417    vpsllw      ymm2, ymm2, BYTE_BIT
418    vpor        ymm1, ymm1, ymm0        ; ymm1=Out0L=( 0  1  2 ... 29 30 31)
419    vpor        ymm5, ymm5, ymm2        ; ymm5=Out0H=(32 33 34 ... 61 62 63)
420
421    vmovdqu     YMMWORD [rdx+0*SIZEOF_YMMWORD], ymm1
422    vmovdqu     YMMWORD [rdx+1*SIZEOF_YMMWORD], ymm5
423
424    ; -- process the lower row
425
426    vmovdqu     ymm6, YMMWORD [rdi+0*SIZEOF_YMMWORD]  ; ymm6=Int1L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
427    vmovdqu     ymm4, YMMWORD [rdi+1*SIZEOF_YMMWORD]  ; ymm4=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
428
429    vperm2i128  ymm7, ymm8, ymm6, 0x03
430    vpalignr    ymm7, ymm7, ymm6, 2     ; ymm7=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 --)
431    vperm2i128  ymm3, ymm8, ymm4, 0x20
432    vpslldq     ymm3, ymm3, 14          ; ymm3=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)
433
434    vperm2i128  ymm0, ymm8, ymm6, 0x03
435    vpsrldq     ymm0, ymm0, 14          ; ymm0=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
436    vperm2i128  ymm2, ymm8, ymm4, 0x20
437    vpalignr    ymm2, ymm4, ymm2, 14    ; ymm2=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
438
439    vpor        ymm7, ymm7, ymm3        ; ymm7=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16)
440    vpor        ymm0, ymm0, ymm2        ; ymm0=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
441
442    vperm2i128  ymm5, ymm8, ymm4, 0x03
443    vpalignr    ymm5, ymm5, ymm4, 2     ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
444    vperm2i128  ymm3, ymm8, ymm4, 0x03
445    vpsrldq     ymm3, ymm3, 14          ; ymm3=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
446    vperm2i128  ymm1, ymm8, ymm6, 0x20
447    vpalignr    ymm1, ymm6, ymm1, 14    ; ymm1=(--  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
448
449    vpor        ymm1, ymm1, YMMWORD [wk(1)]  ; ymm1=(-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
450    vpor        ymm5, ymm5, YMMWORD [wk(3)]  ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
451
452    vmovdqa     YMMWORD [wk(1)], ymm3
453
454    vpmullw     ymm6, ymm6, [rel PW_THREE]
455    vpmullw     ymm4, ymm4, [rel PW_THREE]
456    vpaddw      ymm1, ymm1, [rel PW_EIGHT]
457    vpaddw      ymm0, ymm0, [rel PW_EIGHT]
458    vpaddw      ymm7, ymm7, [rel PW_SEVEN]
459    vpaddw      ymm5, ymm5, [rel PW_SEVEN]
460
461    vpaddw      ymm1, ymm1, ymm6
462    vpaddw      ymm0, ymm0, ymm4
463    vpsrlw      ymm1, ymm1, 4           ; ymm1=Out1LE=( 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30)
464    vpsrlw      ymm0, ymm0, 4           ; ymm0=Out1HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
465    vpaddw      ymm7, ymm7, ymm6
466    vpaddw      ymm5, ymm5, ymm4
467    vpsrlw      ymm7, ymm7, 4           ; ymm7=Out1LO=( 1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31)
468    vpsrlw      ymm5, ymm5, 4           ; ymm5=Out1HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
469
470    vpsllw      ymm7, ymm7, BYTE_BIT
471    vpsllw      ymm5, ymm5, BYTE_BIT
472    vpor        ymm1, ymm1, ymm7        ; ymm1=Out1L=( 0  1  2 ... 29 30 31)
473    vpor        ymm0, ymm0, ymm5        ; ymm0=Out1H=(32 33 34 ... 61 62 63)
474
475    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm1
476    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm0
477
478    sub         rax, byte SIZEOF_YMMWORD
479    add         rcx, byte 1*SIZEOF_YMMWORD  ; inptr1(above)
480    add         rbx, byte 1*SIZEOF_YMMWORD  ; inptr0
481    add         rsi, byte 1*SIZEOF_YMMWORD  ; inptr1(below)
482    add         rdx, byte 2*SIZEOF_YMMWORD  ; outptr0
483    add         rdi, byte 2*SIZEOF_YMMWORD  ; outptr1
484    cmp         rax, byte SIZEOF_YMMWORD
485    ja          near .columnloop
486    test        rax, rax
487    jnz         near .columnloop_last
488
489    pop         rsi
490    pop         rdi
491    pop         rcx
492    pop         rax
493
494    add         rsi, byte 1*SIZEOF_JSAMPROW  ; input_data
495    add         rdi, byte 2*SIZEOF_JSAMPROW  ; output_data
496    sub         rcx, byte 2                  ; rowctr
497    jg          near .rowloop
498
499.return:
500    pop         rbx
501    vzeroupper
502    uncollect_args 4
503    pop_xmm     3
504    mov         rsp, rbp                ; rsp <- aligned rbp
505    pop         rsp                     ; rsp <- original rbp
506    pop         rbp
507    ret

; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
; It's still a box filter.
;
; GLOBAL(void)
; jsimd_h2v1_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
;
; Each input sample is simply duplicated horizontally:
;   out[2i] = out[2i+1] = in[i]

; Register usage (argument registers as set up by collect_args):
; r10 = int max_v_samp_factor
; r11d = JDIMENSION output_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_upsample_avx2)

EXTN(jsimd_h2v1_upsample_avx2):
    push        rbp
    mov         rax, rsp
    mov         rbp, rsp
    collect_args 4

    mov         edx, r11d
    add         rdx, byte (SIZEOF_YMMWORD-1)
    and         rdx, -SIZEOF_YMMWORD    ; colctr = output_width rounded up to whole YMMWORDs
    jz          near .return            ; zero-width image: nothing to do

    mov         rcx, r10                ; rowctr
    test        rcx, rcx
    jz          short .return

    mov         rsi, r12                ; input_data
    mov         rdi, r13
    mov         rdi, JSAMPARRAY [rdi]   ; output_data
.rowloop:
    push        rdi
    push        rsi

    mov         rsi, JSAMPROW [rsi]     ; inptr
    mov         rdi, JSAMPROW [rdi]     ; outptr
    mov         rax, rdx                ; colctr
.columnloop:

    cmp         rax, byte SIZEOF_YMMWORD
    ja          near .above_16

    ; 32 or fewer output columns remain: duplicate 16 input samples
    vmovdqu     xmm0, XMMWORD [rsi+0*SIZEOF_YMMWORD]
    vpunpckhbw  xmm1, xmm0, xmm0
    vpunpcklbw  xmm0, xmm0, xmm0

    vmovdqu     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
    vmovdqu     XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1

    jmp         short .nextrow

.above_16:
    ; full block: duplicate 32 input samples into 64 output samples
    vmovdqu     ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]

    vpermq      ymm0, ymm0, 0xd8        ; pre-swap middle quadwords so the in-lane unpacks yield linear order
    vpunpckhbw  ymm1, ymm0, ymm0
    vpunpcklbw  ymm0, ymm0, ymm0

    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0
    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm1

    sub         rax, byte 2*SIZEOF_YMMWORD
    jz          short .nextrow

    add         rsi, byte SIZEOF_YMMWORD    ; inptr
    add         rdi, byte 2*SIZEOF_YMMWORD  ; outptr
    jmp         short .columnloop

.nextrow:
    pop         rsi
    pop         rdi

    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
    dec         rcx                        ; rowctr
    jg          short .rowloop

.return:
    vzeroupper                             ; avoid AVX->SSE transition penalties in caller
    uncollect_args 4
    pop         rbp
    ret

; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
; It's still a box filter.
;
; GLOBAL(void)
; jsimd_h2v2_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
;
; Each input sample is duplicated horizontally and each input row is
; written to two consecutive output rows.

; Register usage (argument registers as set up by collect_args):
; r10 = int max_v_samp_factor
; r11d = JDIMENSION output_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_upsample_avx2)

EXTN(jsimd_h2v2_upsample_avx2):
    push        rbp
    mov         rax, rsp
    mov         rbp, rsp
    collect_args 4
    push        rbx

    mov         edx, r11d
    add         rdx, byte (SIZEOF_YMMWORD-1)
    and         rdx, -SIZEOF_YMMWORD    ; colctr = output_width rounded up to whole YMMWORDs
    jz          near .return            ; zero-width image: nothing to do

    mov         rcx, r10                ; rowctr
    test        rcx, rcx
    jz          near .return

    mov         rsi, r12                ; input_data
    mov         rdi, r13
    mov         rdi, JSAMPARRAY [rdi]   ; output_data
.rowloop:
    push        rdi
    push        rsi

    mov         rsi, JSAMPROW [rsi]                    ; inptr
    mov         rbx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]  ; outptr0
    mov         rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]  ; outptr1
    mov         rax, rdx                               ; colctr
.columnloop:

    cmp         rax, byte SIZEOF_YMMWORD
    ja          short .above_16

    ; 32 or fewer output columns remain: duplicate 16 input samples,
    ; writing the same data to both output rows
    vmovdqu     xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    vpunpckhbw  xmm1, xmm0, xmm0
    vpunpcklbw  xmm0, xmm0, xmm0

    vmovdqu     XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0
    vmovdqu     XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
    vmovdqu     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
    vmovdqu     XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1

    jmp         near .nextrow

.above_16:
    ; full block: duplicate 32 input samples into 64 output samples
    vmovdqu     ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]

    vpermq      ymm0, ymm0, 0xd8        ; pre-swap middle quadwords so the in-lane unpacks yield linear order
    vpunpckhbw  ymm1, ymm0, ymm0
    vpunpcklbw  ymm0, ymm0, ymm0

    vmovdqu     YMMWORD [rbx+0*SIZEOF_YMMWORD], ymm0
    vmovdqu     YMMWORD [rbx+1*SIZEOF_YMMWORD], ymm1
    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0
    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm1

    sub         rax, byte 2*SIZEOF_YMMWORD
    jz          short .nextrow

    add         rsi, byte SIZEOF_YMMWORD  ; inptr
    add         rbx, 2*SIZEOF_YMMWORD     ; outptr0
    add         rdi, 2*SIZEOF_YMMWORD     ; outptr1
    jmp         short .columnloop

.nextrow:
    pop         rsi
    pop         rdi

    add         rsi, byte 1*SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte 2*SIZEOF_JSAMPROW  ; output_data
    sub         rcx, byte 2                  ; rowctr (two output rows per input row)
    jg          near .rowloop

.return:
    pop         rbx
    vzeroupper                               ; avoid AVX->SSE transition penalties in caller
    uncollect_args 4
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32