• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1;
2; jdsample.asm - upsampling (AVX2)
3;
4; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5; Copyright (C) 2015, Intel Corporation.
6; Copyright (C) 2016, D. R. Commander.
7;
8; Based on the x86 SIMD extension for IJG JPEG library
9; Copyright (C) 1999-2006, MIYASAKA Masaru.
10; For conditions of distribution and use, see copyright notice in jsimdext.inc
11;
12; This file should be assembled with NASM (Netwide Assembler),
13; can *not* be assembled with Microsoft's MASM or any compatible
14; assembler (including Borland's Turbo Assembler).
15; NASM is available from http://nasm.sourceforge.net/ or
16; http://sourceforge.net/project/showfiles.php?group_id=6208
17;
18; [TAB8]
19
20%include "jsimdext.inc"
21
22; --------------------------------------------------------------------------
23    SECTION     SEG_CONST
24
; Word constants used by the two "fancy" (triangle filter) upsamplers in this
; file.  Each entry is sixteen 16-bit copies of one value, and the code uses
; each of them as a full-width memory operand of a YMM instruction through
; GOTOFF(ebx, ...).
25    alignz      32
26    GLOBAL_DATA(jconst_fancy_upsample_avx2)
27
28EXTN(jconst_fancy_upsample_avx2):
29
30PW_ONE   times 16 dw 1               ; h2v1: bias added for even output samples
31PW_TWO   times 16 dw 2               ; h2v1: bias added for odd output samples
32PW_THREE times 16 dw 3               ; weight of the nearer sample (vpmullw)
33PW_SEVEN times 16 dw 7               ; h2v2: bias added for odd output samples
34PW_EIGHT times 16 dw 8               ; h2v2: bias added for even output samples
35
36    alignz      32
37
38; --------------------------------------------------------------------------
39    SECTION     SEG_TEXT
40    BITS        32
41;
42; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
43;
44; The upsampling algorithm is linear interpolation between pixel centers,
45; also known as a "triangle filter".  This is a good compromise between
46; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
47; of the way between input pixel centers.
48;
49; GLOBAL(void)
50; jsimd_h2v1_fancy_upsample_avx2(int max_v_samp_factor,
51;                                JDIMENSION downsampled_width,
52;                                JSAMPARRAY input_data,
53;                                JSAMPARRAY *output_data_ptr);
54;
; For every input sample s[i] the code below emits two output samples:
;   out[2i]   = (3 * s[i] + s[i-1] + 1) >> 2
;   out[2i+1] = (3 * s[i] + s[i+1] + 2) >> 2
; The left neighbor of the first sample is the first sample itself, and the
; right neighbor of the last sample is a copy of the last sample.
;
; The arguments are read from the stack relative to ebp (see the %defines
; below).  The function returns immediately when downsampled_width or
; max_v_samp_factor is zero.
;
; Register roles inside the row loop:
;   eax  = colctr (rounded up to a multiple of SIZEOF_YMMWORD per row)
;   ecx  = rowctr
;   esi  = inptr,  edi = outptr
;   ebx  = GOT address from get_GOT, used by GOTOFF() to reach the PW_* table
;   ymm0 = all zeros (re-zeroed after it is used as a temporary)
;   ymm7 = left-neighbor sample carried from one column block to the next
;   ymm6 = right-neighbor sample for the current column block
; ebx, esi, edi and ebp are saved and restored; eax, ecx, edx and the YMM
; registers are overwritten.
;
; Every column iteration loads one YMMWORD of input and stores two YMMWORDs
; of output regardless of the exact width, and a dummy sample may be written
; into the input row at index downsampled_width, so the row buffers must have
; room for that (their allocation is not visible in this file).
55
56%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
57%define downsamp_width(b)   (b) + 12    ; JDIMENSION downsampled_width
58%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
59%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
60
61    align       32
62    GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_avx2)
63
64EXTN(jsimd_h2v1_fancy_upsample_avx2):
65    push        ebp
66    mov         ebp, esp
67    pushpic     ebx
68;   push        ecx                     ; need not be preserved
69;   push        edx                     ; need not be preserved
70    push        esi
71    push        edi
72
73    get_GOT     ebx                     ; get GOT address
74
75    mov         eax, JDIMENSION [downsamp_width(ebp)]  ; colctr
76    test        eax, eax
77    jz          near .return            ; zero width: nothing to do
78
79    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
80    test        ecx, ecx
81    jz          near .return            ; zero rows: nothing to do
82
83    mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
84    mov         edi, POINTER [output_data_ptr(ebp)]
85    mov         edi, JSAMPARRAY [edi]                ; output_data
86    alignx      16, 7
87.rowloop:
; Save the per-row state that the column loop modifies (restored at the end
; of the row in the reverse order).
88    push        eax                     ; colctr
89    push        edi
90    push        esi
91
92    mov         esi, JSAMPROW [esi]     ; inptr
93    mov         edi, JSAMPROW [edi]     ; outptr
94
; If the width is not a multiple of SIZEOF_YMMWORD, duplicate the last sample
; one position past the end of the row so that it serves as the right
; neighbor of the last real sample.
95    test        eax, SIZEOF_YMMWORD-1
96    jz          short .skip
97    mov         dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
98    mov         JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
99.skip:
; ymm7 = sample 0 isolated in byte 0; it stands in for the missing left
; neighbor of the first sample of the row.
100    vpxor       ymm0, ymm0, ymm0                ; ymm0=(all 0's)
101    vpcmpeqb    xmm7, xmm7, xmm7
102    vpsrldq     xmm7, xmm7, (SIZEOF_XMMWORD-1)  ; (ff -- -- -- ... -- --) LSB is ff
103    vpand       ymm7, ymm7, YMMWORD [esi+0*SIZEOF_YMMWORD]
104
; Round colctr up to a whole number of YMMWORD blocks.  When more than one
; block remains, the right neighbor comes from the next block (.columnloop);
; otherwise fall through to the last-block case.
105    add         eax, byte SIZEOF_YMMWORD-1
106    and         eax, byte -SIZEOF_YMMWORD
107    cmp         eax, byte SIZEOF_YMMWORD
108    ja          short .columnloop
109    alignx      16, 7
110
111.columnloop_last:
; Last block of the row: ymm6 = the top byte of this block, kept in place,
; which is used as the right neighbor of that same sample.
112    vpcmpeqb    xmm6, xmm6, xmm6
113    vpslldq     xmm6, xmm6, (SIZEOF_XMMWORD-1)
114    vperm2i128  ymm6, ymm6, ymm6, 1             ; (---- ---- ... ---- ---- ff) MSB is ff
115    vpand       ymm6, ymm6, YMMWORD [esi+0*SIZEOF_YMMWORD]
116    jmp         short .upsample
117    alignx      16, 7
118
119.columnloop:
; Not the last block: ymm6 = first sample of the next block moved to the top
; byte, i.e. the right neighbor of sample 31 of the current block.
120    vmovdqu     ymm6, YMMWORD [esi+1*SIZEOF_YMMWORD]
121    vperm2i128  ymm6, ymm0, ymm6, 0x20          ; ymm6=(zeros, low half of next block)
122    vpslldq     ymm6, ymm6, 15                  ; per-lane shift: byte 0 of the upper lane -> byte 31
123
124.upsample:
125    vmovdqu     ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]  ; ymm1=( 0  1  2 ... 29 30 31)
126
; Build the "previous sample" (ymm2) and "next sample" (ymm3) vectors.  The
; vperm2i128 steps supply the bytes that must cross the 128-bit lane boundary,
; because vpalignr shifts each lane independently.
127    vperm2i128  ymm2, ymm0, ymm1, 0x20
128    vpalignr    ymm2, ymm1, ymm2, 15            ; ymm2=(--  0  1 ... 28 29 30)
129    vperm2i128  ymm4, ymm0, ymm1, 0x03
130    vpalignr    ymm3, ymm4, ymm1, 1             ; ymm3=( 1  2  3 ... 30 31 --)
131
132    vpor        ymm2, ymm2, ymm7                ; ymm2=(-1  0  1 ... 28 29 30)
133    vpor        ymm3, ymm3, ymm6                ; ymm3=( 1  2  3 ... 30 31 32)
134
; Carry sample 31 forward as the left neighbor for the next column block.
135    vpsrldq     ymm7, ymm4, (SIZEOF_XMMWORD-1)  ; ymm7=(31 -- -- ... -- -- --)
136
; Zero-extend bytes to words.  The unpacks work per 128-bit lane, so each is
; followed by a vperm2i128 pair that puts the words back in sample order.
137    vpunpckhbw  ymm4, ymm1, ymm0                ; ymm4=( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
138    vpunpcklbw  ymm5, ymm1, ymm0                ; ymm5=( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
139    vperm2i128  ymm1, ymm5, ymm4, 0x20          ; ymm1=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
140    vperm2i128  ymm4, ymm5, ymm4, 0x31          ; ymm4=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
141
142    vpunpckhbw  ymm5, ymm2, ymm0                ; ymm5=( 7  8  9 10 11 12 13 14 23 24 25 26 27 28 29 30)
143    vpunpcklbw  ymm6, ymm2, ymm0                ; ymm6=(-1  0  1  2  3  4  5  6 15 16 17 18 19 20 21 22)
144    vperm2i128  ymm2, ymm6, ymm5, 0x20          ; ymm2=(-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
145    vperm2i128  ymm5, ymm6, ymm5, 0x31          ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
146
; ymm0 is used as a temporary here (it is zeroed again right after).
147    vpunpckhbw  ymm6, ymm3, ymm0                ; ymm6=( 9 10 11 12 13 14 15 16 25 26 27 28 29 30 31 32)
148    vpunpcklbw  ymm0, ymm3, ymm0                ; ymm0=( 1  2  3  4  5  6  7  8 17 18 19 20 21 22 23 24)
149    vperm2i128  ymm3, ymm0, ymm6, 0x20          ; ymm3=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16)
150    vperm2i128  ymm6, ymm0, ymm6, 0x31          ; ymm6=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
151
152    vpxor       ymm0, ymm0, ymm0                ; ymm0=(all 0's)
153
; cur*3 in ymm1/ymm4; prev+1 in ymm2/ymm5; next+2 in ymm3/ymm6.
154    vpmullw     ymm1, ymm1, [GOTOFF(ebx,PW_THREE)]
155    vpmullw     ymm4, ymm4, [GOTOFF(ebx,PW_THREE)]
156    vpaddw      ymm2, ymm2, [GOTOFF(ebx,PW_ONE)]
157    vpaddw      ymm5, ymm5, [GOTOFF(ebx,PW_ONE)]
158    vpaddw      ymm3, ymm3, [GOTOFF(ebx,PW_TWO)]
159    vpaddw      ymm6, ymm6, [GOTOFF(ebx,PW_TWO)]
160
161    vpaddw      ymm2, ymm2, ymm1
162    vpaddw      ymm5, ymm5, ymm4
163    vpsrlw      ymm2, ymm2, 2                   ; ymm2=OutLE=( 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30)
164    vpsrlw      ymm5, ymm5, 2                   ; ymm5=OutHE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
165    vpaddw      ymm3, ymm3, ymm1
166    vpaddw      ymm6, ymm6, ymm4
167    vpsrlw      ymm3, ymm3, 2                   ; ymm3=OutLO=( 1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31)
168    vpsrlw      ymm6, ymm6, 2                   ; ymm6=OutHO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
169
; Interleave: move each odd result into the high byte of its word and merge
; it with the even result sitting in the low byte.
170    vpsllw      ymm3, ymm3, BYTE_BIT
171    vpsllw      ymm6, ymm6, BYTE_BIT
172    vpor        ymm2, ymm2, ymm3                ; ymm2=OutL=( 0  1  2 ... 29 30 31)
173    vpor        ymm5, ymm5, ymm6                ; ymm5=OutH=(32 33 34 ... 61 62 63)
174
175    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm2
176    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymm5
177
; Advance one input block / two output blocks and decide whether the next
; block is an interior block, the last block, or the row is finished.
178    sub         eax, byte SIZEOF_YMMWORD
179    add         esi, byte 1*SIZEOF_YMMWORD  ; inptr
180    add         edi, byte 2*SIZEOF_YMMWORD  ; outptr
181    cmp         eax, byte SIZEOF_YMMWORD
182    ja          near .columnloop
183    test        eax, eax
184    jnz         near .columnloop_last
185
186    pop         esi
187    pop         edi
188    pop         eax
189
190    add         esi, byte SIZEOF_JSAMPROW  ; input_data
191    add         edi, byte SIZEOF_JSAMPROW  ; output_data
192    dec         ecx                        ; rowctr
193    jg          near .rowloop
194
195.return:
196    vzeroupper
197    pop         edi
198    pop         esi
199;   pop         edx                     ; need not be preserved
200;   pop         ecx                     ; need not be preserved
201    poppic      ebx
202    pop         ebp
203    ret
204
205; --------------------------------------------------------------------------
206;
207; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
208; Again a triangle filter; see comments for h2v1 case, above.
209;
210; GLOBAL(void)
211; jsimd_h2v2_fancy_upsample_avx2(int max_v_samp_factor,
212;                                JDIMENSION downsampled_width,
213;                                JSAMPARRAY input_data,
214;                                JSAMPARRAY *output_data_ptr);
215;
; Each input row produces two output rows.  The code works in two passes per
; column block:
;   vertical:    Int0[i] = 3 * row[0][i] + row[-1][i]     (upper output row)
;                Int1[i] = 3 * row[0][i] + row[+1][i]     (lower output row)
;                stored as 16-bit words in the output rows themselves
;   horizontal:  out[2i]   = (3 * Int[i] + Int[i-1] + 8) >> 4
;                out[2i+1] = (3 * Int[i] + Int[i+1] + 7) >> 4
;                written back over the intermediate words
;
; Stack frame: ebp is aligned down to SIZEOF_YMMWORD; [ebp] holds the stack
; pointer value taken right after "push ebp" (called "original ebp" in the
; comments), through which the arguments are reached.  wk(0)..wk(3) are
; YMMWORD scratch slots just below ebp and gotptr sits just below wk(0):
;   wk(0) / wk(1) = left-neighbor word  (Int[-1]) for the upper / lower row
;   wk(2) / wk(3) = right-neighbor word (Int[32]) for the upper / lower row
;
; Register roles inside the column loop:
;   eax = colctr            edx = outptr0           edi = outptr1
;   ecx = inptr1(above)     esi = inptr1(below)
;   ebx = inptr0 outside the pushpic/poppic brackets, and the GOT address
;         (reloaded from gotptr) inside them
; rowctr lives on the stack while a row is being processed.
; ebx, esi, edi and ebp are saved and restored; eax, ecx, edx and the YMM
; registers are overwritten.  The function returns immediately when
; downsampled_width or max_v_samp_factor is zero.
216
217%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
218%define downsamp_width(b)   (b) + 12    ; JDIMENSION downsampled_width
219%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
220%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
221
222%define original_ebp  ebp + 0
223%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
224                                        ; ymmword wk[WK_NUM]
225%define WK_NUM        4
226%define gotptr        wk(0) - SIZEOF_POINTER  ; void *gotptr
227
228    align       32
229    GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_avx2)
230
231EXTN(jsimd_h2v2_fancy_upsample_avx2):
232    push        ebp
233    mov         eax, esp                     ; eax = original ebp
234    sub         esp, byte 4
235    and         esp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
236    mov         [esp], eax
237    mov         ebp, esp                     ; ebp = aligned ebp
238    lea         esp, [wk(0)]
239    pushpic     eax                     ; make a room for GOT address
240    push        ebx
241;   push        ecx                     ; need not be preserved
242;   push        edx                     ; need not be preserved
243    push        esi
244    push        edi
245
246    get_GOT     ebx                     ; get GOT address
247    movpic      POINTER [gotptr], ebx   ; save GOT address
248
; The arguments are addressed through edx because ebp now points at the
; aligned frame rather than at the caller's frame.
249    mov         edx, eax                ; edx = original ebp
250    mov         eax, JDIMENSION [downsamp_width(edx)]  ; colctr
251    test        eax, eax
252    jz          near .return            ; zero width: nothing to do
253
254    mov         ecx, INT [max_v_samp(edx)]  ; rowctr
255    test        ecx, ecx
256    jz          near .return            ; zero rows: nothing to do
257
258    mov         esi, JSAMPARRAY [input_data(edx)]    ; input_data
259    mov         edi, POINTER [output_data_ptr(edx)]
260    mov         edi, JSAMPARRAY [edi]                ; output_data
261    alignx      16, 7
262.rowloop:
; Save the per-row state; ecx (rowctr) is reused as a row pointer below.
263    push        eax                     ; colctr
264    push        ecx
265    push        edi
266    push        esi
267
268    mov         ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
269    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; inptr0
270    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
271    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; outptr0
272    mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; outptr1
273
; If the width is not a multiple of SIZEOF_YMMWORD, duplicate the last sample
; of all three input rows one position past the end.  edx is preserved around
; this because dl is the byte temporary.
274    test        eax, SIZEOF_YMMWORD-1
275    jz          short .skip
276    push        edx
277    mov         dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
278    mov         JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
279    mov         dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
280    mov         JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
281    mov         dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
282    mov         JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
283    pop         edx
284.skip:
285    ; -- process the first column block
286
287    vmovdqu     ymm0, YMMWORD [ebx+0*SIZEOF_YMMWORD]  ; ymm0=row[ 0][0]
288    vmovdqu     ymm1, YMMWORD [ecx+0*SIZEOF_YMMWORD]  ; ymm1=row[-1][0]
289    vmovdqu     ymm2, YMMWORD [esi+0*SIZEOF_YMMWORD]  ; ymm2=row[+1][0]
290
; ebx switches from inptr0 to the GOT address until the matching poppic.
291    pushpic     ebx
292    movpic      ebx, POINTER [gotptr]   ; load GOT address
293
294    vpxor       ymm3, ymm3, ymm3        ; ymm3=(all 0's)
295
; Zero-extend bytes to words; the unpacks work per 128-bit lane, so each pair
; is followed by vperm2i128 to restore sample order.
296    vpunpckhbw  ymm4, ymm0, ymm3        ; ymm4=row[ 0]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
297    vpunpcklbw  ymm5, ymm0, ymm3        ; ymm5=row[ 0]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
298    vperm2i128  ymm0, ymm5, ymm4, 0x20  ; ymm0=row[ 0]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
299    vperm2i128  ymm4, ymm5, ymm4, 0x31  ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
300
301    vpunpckhbw  ymm5, ymm1, ymm3        ; ymm5=row[-1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
302    vpunpcklbw  ymm6, ymm1, ymm3        ; ymm6=row[-1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
303    vperm2i128  ymm1, ymm6, ymm5, 0x20  ; ymm1=row[-1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
304    vperm2i128  ymm5, ymm6, ymm5, 0x31  ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
305
306    vpunpckhbw  ymm6, ymm2, ymm3        ; ymm6=row[+1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
307    vpunpcklbw  ymm3, ymm2, ymm3        ; ymm3=row[+1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
308    vperm2i128  ymm2, ymm3, ymm6, 0x20  ; ymm2=row[+1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
309    vperm2i128  ymm6, ymm3, ymm6, 0x31  ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
310
311    vpmullw     ymm0, ymm0, [GOTOFF(ebx,PW_THREE)]
312    vpmullw     ymm4, ymm4, [GOTOFF(ebx,PW_THREE)]
313
314    vpcmpeqb    xmm7, xmm7, xmm7
315    vpsrldq     xmm7, xmm7, (SIZEOF_XMMWORD-2)  ; (ffff ---- ---- ... ---- ----) LSB is ffff
316
317    vpaddw      ymm1, ymm1, ymm0        ; ymm1=Int0L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
318    vpaddw      ymm5, ymm5, ymm4        ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
319    vpaddw      ymm2, ymm2, ymm0        ; ymm2=Int1L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
320    vpaddw      ymm6, ymm6, ymm4        ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
321
322    vmovdqu     YMMWORD [edx+0*SIZEOF_YMMWORD], ymm1  ; temporarily save
323    vmovdqu     YMMWORD [edx+1*SIZEOF_YMMWORD], ymm5  ; the intermediate data
324    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm2
325    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymm6
326
; The first intermediate word of each row doubles as its own left neighbor.
327    vpand       ymm1, ymm1, ymm7        ; ymm1=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
328    vpand       ymm2, ymm2, ymm7        ; ymm2=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
329
330    vmovdqa     YMMWORD [wk(0)], ymm1
331    vmovdqa     YMMWORD [wk(1)], ymm2
332
333    poppic      ebx
334
; Round colctr up to a whole number of YMMWORD blocks and pick the interior
; or the last-block path for the first block.
335    add         eax, byte SIZEOF_YMMWORD-1
336    and         eax, byte -SIZEOF_YMMWORD
337    cmp         eax, byte SIZEOF_YMMWORD
338    ja          short .columnloop
339    alignx      16, 7
340
341.columnloop_last:
342    ; -- process the last column block
343
344    pushpic     ebx
345    movpic      ebx, POINTER [gotptr]   ; load GOT address
346
347    vpcmpeqb    xmm1, xmm1, xmm1
348    vpslldq     xmm1, xmm1, (SIZEOF_XMMWORD-2)
349    vperm2i128  ymm1, ymm1, ymm1, 1             ; (---- ---- ... ---- ---- ffff) MSB is ffff
350
; The last intermediate word of each row doubles as its own right neighbor.
351    vpand       ymm2, ymm1, YMMWORD [edi+1*SIZEOF_YMMWORD]
352    vpand       ymm1, ymm1, YMMWORD [edx+1*SIZEOF_YMMWORD]
353
354    vmovdqa     YMMWORD [wk(2)], ymm1          ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)
355    vmovdqa     YMMWORD [wk(3)], ymm2          ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)
356
357    jmp         near .upsample
358    alignx      16, 7
359
360.columnloop:
361    ; -- process the next column block
362
; Run the vertical pass for the NEXT block now: its first intermediate word
; is the right neighbor needed by the horizontal pass of the current block.
363    vmovdqu     ymm0, YMMWORD [ebx+1*SIZEOF_YMMWORD]  ; ymm0=row[ 0][1]
364    vmovdqu     ymm1, YMMWORD [ecx+1*SIZEOF_YMMWORD]  ; ymm1=row[-1][1]
365    vmovdqu     ymm2, YMMWORD [esi+1*SIZEOF_YMMWORD]  ; ymm2=row[+1][1]
366
367    pushpic     ebx
368    movpic      ebx, POINTER [gotptr]   ; load GOT address
369
370    vpxor       ymm3, ymm3, ymm3        ; ymm3=(all 0's)
371
372    vpunpckhbw  ymm4, ymm0, ymm3        ; ymm4=row[ 0]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
373    vpunpcklbw  ymm5, ymm0, ymm3        ; ymm5=row[ 0]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
374    vperm2i128  ymm0, ymm5, ymm4, 0x20  ; ymm0=row[ 0]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
375    vperm2i128  ymm4, ymm5, ymm4, 0x31  ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
376
377    vpunpckhbw  ymm5, ymm1, ymm3        ; ymm5=row[-1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
378    vpunpcklbw  ymm6, ymm1, ymm3        ; ymm6=row[-1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
379    vperm2i128  ymm1, ymm6, ymm5, 0x20  ; ymm1=row[-1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
380    vperm2i128  ymm5, ymm6, ymm5, 0x31  ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
381
; ymm7 is the temporary here so that ymm3 stays zero for the shifts below.
382    vpunpckhbw  ymm6, ymm2, ymm3        ; ymm6=row[+1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
383    vpunpcklbw  ymm7, ymm2, ymm3        ; ymm7=row[+1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
384    vperm2i128  ymm2, ymm7, ymm6, 0x20  ; ymm2=row[+1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
385    vperm2i128  ymm6, ymm7, ymm6, 0x31  ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
386
387    vpmullw     ymm0, ymm0, [GOTOFF(ebx,PW_THREE)]
388    vpmullw     ymm4, ymm4, [GOTOFF(ebx,PW_THREE)]
389
390    vpaddw      ymm1, ymm1, ymm0        ; ymm1=Int0L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
391    vpaddw      ymm5, ymm5, ymm4        ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
392    vpaddw      ymm2, ymm2, ymm0        ; ymm2=Int1L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
393    vpaddw      ymm6, ymm6, ymm4        ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
394
395    vmovdqu     YMMWORD [edx+2*SIZEOF_YMMWORD], ymm1  ; temporarily save
396    vmovdqu     YMMWORD [edx+3*SIZEOF_YMMWORD], ymm5  ; the intermediate data
397    vmovdqu     YMMWORD [edi+2*SIZEOF_YMMWORD], ymm2
398    vmovdqu     YMMWORD [edi+3*SIZEOF_YMMWORD], ymm6
399
; Move word 0 of the next block into the top word position (right neighbor).
400    vperm2i128  ymm1, ymm3, ymm1, 0x20
401    vpslldq     ymm1, ymm1, 14          ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- --  0)
402    vperm2i128  ymm2, ymm3, ymm2, 0x20
403    vpslldq     ymm2, ymm2, 14          ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- --  0)
404
405    vmovdqa     YMMWORD [wk(2)], ymm1
406    vmovdqa     YMMWORD [wk(3)], ymm2
407
408.upsample:
409    ; -- process the upper row
410
; Entered with ebx = GOT address (the pushpic from either path above is still
; pending and is undone by the poppic after the lower row).
411    vmovdqu     ymm7, YMMWORD [edx+0*SIZEOF_YMMWORD]  ; ymm7=Int0L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
412    vmovdqu     ymm3, YMMWORD [edx+1*SIZEOF_YMMWORD]  ; ymm3=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
413
414    vpxor       ymm1, ymm1, ymm1        ; ymm1=(all 0's)
415
; Build the previous-word and next-word vectors for both halves; vperm2i128
; supplies the words that have to cross the 128-bit lane boundary.
416    vperm2i128  ymm0, ymm1, ymm7, 0x03
417    vpalignr    ymm0, ymm0, ymm7, 2     ; ymm0=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 --)
418    vperm2i128  ymm4, ymm1, ymm3, 0x20
419    vpslldq     ymm4, ymm4, 14          ; ymm4=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)
420
421    vperm2i128  ymm5, ymm1, ymm7, 0x03
422    vpsrldq     ymm5, ymm5, 14          ; ymm5=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
423    vperm2i128  ymm6, ymm1, ymm3, 0x20
424    vpalignr    ymm6, ymm3, ymm6, 14    ; ymm6=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
425
426    vpor        ymm0, ymm0, ymm4        ; ymm0=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16)
427    vpor        ymm5, ymm5, ymm6        ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
428
429    vperm2i128  ymm2, ymm1, ymm3, 0x03
430    vpalignr    ymm2, ymm2, ymm3, 2     ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
431    vperm2i128  ymm4, ymm1, ymm3, 0x03
432    vpsrldq     ymm4, ymm4, 14          ; ymm4=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
433    vperm2i128  ymm1, ymm1, ymm7, 0x20
434    vpalignr    ymm1, ymm7, ymm1, 14    ; ymm1=(--  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
435
436    vpor        ymm1, ymm1, YMMWORD [wk(0)]  ; ymm1=(-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
437    vpor        ymm2, ymm2, YMMWORD [wk(2)]  ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
438
; Word 31 becomes the left neighbor of the next column block (upper row).
439    vmovdqa     YMMWORD [wk(0)], ymm4
440
441    vpmullw     ymm7, ymm7, [GOTOFF(ebx,PW_THREE)]
442    vpmullw     ymm3, ymm3, [GOTOFF(ebx,PW_THREE)]
443    vpaddw      ymm1, ymm1, [GOTOFF(ebx,PW_EIGHT)]
444    vpaddw      ymm5, ymm5, [GOTOFF(ebx,PW_EIGHT)]
445    vpaddw      ymm0, ymm0, [GOTOFF(ebx,PW_SEVEN)]
446    vpaddw      ymm2, ymm2, [GOTOFF(ebx,PW_SEVEN)]
447
448    vpaddw      ymm1, ymm1, ymm7
449    vpaddw      ymm5, ymm5, ymm3
450    vpsrlw      ymm1, ymm1, 4           ; ymm1=Out0LE=( 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30)
451    vpsrlw      ymm5, ymm5, 4           ; ymm5=Out0HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
452    vpaddw      ymm0, ymm0, ymm7
453    vpaddw      ymm2, ymm2, ymm3
454    vpsrlw      ymm0, ymm0, 4           ; ymm0=Out0LO=( 1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31)
455    vpsrlw      ymm2, ymm2, 4           ; ymm2=Out0HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
456
; Interleave: odd results go to the high byte of each word, even to the low.
457    vpsllw      ymm0, ymm0, BYTE_BIT
458    vpsllw      ymm2, ymm2, BYTE_BIT
459    vpor        ymm1, ymm1, ymm0        ; ymm1=Out0L=( 0  1  2 ... 29 30 31)
460    vpor        ymm5, ymm5, ymm2        ; ymm5=Out0H=(32 33 34 ... 61 62 63)
461
462    vmovdqu     YMMWORD [edx+0*SIZEOF_YMMWORD], ymm1
463    vmovdqu     YMMWORD [edx+1*SIZEOF_YMMWORD], ymm5
464
465    ; -- process the lower row
466
467    vmovdqu     ymm6, YMMWORD [edi+0*SIZEOF_YMMWORD]  ; ymm6=Int1L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
468    vmovdqu     ymm4, YMMWORD [edi+1*SIZEOF_YMMWORD]  ; ymm4=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
469
470    vpxor       ymm1, ymm1, ymm1        ; ymm1=(all 0's)
471
472    vperm2i128  ymm7, ymm1, ymm6, 0x03
473    vpalignr    ymm7, ymm7, ymm6, 2     ; ymm7=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 --)
474    vperm2i128  ymm3, ymm1, ymm4, 0x20
475    vpslldq     ymm3, ymm3, 14          ; ymm3=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)
476
477    vperm2i128  ymm0, ymm1, ymm6, 0x03
478    vpsrldq     ymm0, ymm0, 14          ; ymm0=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
479    vperm2i128  ymm2, ymm1, ymm4, 0x20
480    vpalignr    ymm2, ymm4, ymm2, 14    ; ymm2=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
481
482    vpor        ymm7, ymm7, ymm3        ; ymm7=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16)
483    vpor        ymm0, ymm0, ymm2        ; ymm0=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
484
485    vperm2i128  ymm5, ymm1, ymm4, 0x03
486    vpalignr    ymm5, ymm5, ymm4, 2     ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
487    vperm2i128  ymm3, ymm1, ymm4, 0x03
488    vpsrldq     ymm3, ymm3, 14          ; ymm3=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
489    vperm2i128  ymm1, ymm1, ymm6, 0x20
490    vpalignr    ymm1, ymm6, ymm1, 14    ; ymm1=(--  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
491
492    vpor        ymm1, ymm1, YMMWORD [wk(1)]  ; ymm1=(-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
493    vpor        ymm5, ymm5, YMMWORD [wk(3)]  ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
494
; Word 31 becomes the left neighbor of the next column block (lower row).
495    vmovdqa     YMMWORD [wk(1)], ymm3
496
497    vpmullw     ymm6, ymm6, [GOTOFF(ebx,PW_THREE)]
498    vpmullw     ymm4, ymm4, [GOTOFF(ebx,PW_THREE)]
499    vpaddw      ymm1, ymm1, [GOTOFF(ebx,PW_EIGHT)]
500    vpaddw      ymm0, ymm0, [GOTOFF(ebx,PW_EIGHT)]
501    vpaddw      ymm7, ymm7, [GOTOFF(ebx,PW_SEVEN)]
502    vpaddw      ymm5, ymm5, [GOTOFF(ebx,PW_SEVEN)]
503
504    vpaddw      ymm1, ymm1, ymm6
505    vpaddw      ymm0, ymm0, ymm4
506    vpsrlw      ymm1, ymm1, 4           ; ymm1=Out1LE=( 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30)
507    vpsrlw      ymm0, ymm0, 4           ; ymm0=Out1HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
508    vpaddw      ymm7, ymm7, ymm6
509    vpaddw      ymm5, ymm5, ymm4
510    vpsrlw      ymm7, ymm7, 4           ; ymm7=Out1LO=( 1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31)
511    vpsrlw      ymm5, ymm5, 4           ; ymm5=Out1HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
512
513    vpsllw      ymm7, ymm7, BYTE_BIT
514    vpsllw      ymm5, ymm5, BYTE_BIT
515    vpor        ymm1, ymm1, ymm7        ; ymm1=Out1L=( 0  1  2 ... 29 30 31)
516    vpor        ymm0, ymm0, ymm5        ; ymm0=Out1H=(32 33 34 ... 61 62 63)
517
518    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm1
519    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymm0
520
; ebx goes back to being inptr0.
521    poppic      ebx
522
; Advance one input block / two output blocks and decide whether the next
; block is an interior block, the last block, or the row is finished.
523    sub         eax, byte SIZEOF_YMMWORD
524    add         ecx, byte 1*SIZEOF_YMMWORD  ; inptr1(above)
525    add         ebx, byte 1*SIZEOF_YMMWORD  ; inptr0
526    add         esi, byte 1*SIZEOF_YMMWORD  ; inptr1(below)
527    add         edx, byte 2*SIZEOF_YMMWORD  ; outptr0
528    add         edi, byte 2*SIZEOF_YMMWORD  ; outptr1
529    cmp         eax, byte SIZEOF_YMMWORD
530    ja          near .columnloop
531    test        eax, eax
532    jnz         near .columnloop_last
533
534    pop         esi
535    pop         edi
536    pop         ecx
537    pop         eax
538
; One input row consumed, two output rows produced.
539    add         esi, byte 1*SIZEOF_JSAMPROW  ; input_data
540    add         edi, byte 2*SIZEOF_JSAMPROW  ; output_data
541    sub         ecx, byte 2                  ; rowctr
542    jg          near .rowloop
543
544.return:
545    vzeroupper
546    pop         edi
547    pop         esi
548;   pop         edx                     ; need not be preserved
549;   pop         ecx                     ; need not be preserved
550    pop         ebx
551    mov         esp, ebp                ; esp <- aligned ebp
552    pop         esp                     ; esp <- original ebp
553    pop         ebp
554    ret
555
556; --------------------------------------------------------------------------
557;
558; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
559; It's still a box filter.
560;
561; GLOBAL(void)
562; jsimd_h2v1_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
563;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
564;
; Every input sample is written twice in a row to the output (byte
; duplication via vpunpck{l,h}bw of a register with itself).
;
; output_width is rounded up to a multiple of SIZEOF_YMMWORD, so whole blocks
; are always read and written; the function returns immediately when the
; rounded width or max_v_samp_factor is zero.
;
; Register roles:
;   edx = rounded output width      eax = colctr (output bytes left in row)
;   ecx = rowctr                    esi = inptr       edi = outptr
; esi, edi and ebp are saved and restored; eax, ecx, edx, ymm0 and ymm1 are
; overwritten.  ebx is not used.
565
566%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
567%define output_width(b)     (b) + 12    ; JDIMENSION output_width
568%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
569%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
570
571    align       32
572    GLOBAL_FUNCTION(jsimd_h2v1_upsample_avx2)
573
574EXTN(jsimd_h2v1_upsample_avx2):
575    push        ebp
576    mov         ebp, esp
577;   push        ebx                     ; unused
578;   push        ecx                     ; need not be preserved
579;   push        edx                     ; need not be preserved
580    push        esi
581    push        edi
582
583    mov         edx, JDIMENSION [output_width(ebp)]
584    add         edx, byte (SIZEOF_YMMWORD-1)
585    and         edx, -SIZEOF_YMMWORD    ; round up to whole YMMWORD blocks
586    jz          short .return           ; zero width: nothing to do
587
588    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
589    test        ecx, ecx
590    jz          short .return           ; zero rows: nothing to do
591
592    mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
593    mov         edi, POINTER [output_data_ptr(ebp)]
594    mov         edi, JSAMPARRAY [edi]                ; output_data
595    alignx      16, 7
596.rowloop:
597    push        edi
598    push        esi
599
600    mov         esi, JSAMPROW [esi]     ; inptr
601    mov         edi, JSAMPROW [edi]     ; outptr
602    mov         eax, edx                ; colctr
603    alignx      16, 7
604.columnloop:
605
; More than one YMMWORD of output left: take the full-width path.  (Despite
; the label name, the threshold compared against is SIZEOF_YMMWORD.)
606    cmp         eax, byte SIZEOF_YMMWORD
607    ja          short .above_16
608
; Exactly one YMMWORD of output left: expand one XMMWORD of input.
609    vmovdqu     xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
610    vpunpckhbw  xmm1, xmm0, xmm0
611    vpunpcklbw  xmm0, xmm0, xmm0
612
613    vmovdqu     XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
614    vmovdqu     XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
615
616    jmp         short .nextrow
617
618.above_16:
619    vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
620
; Reorder the qwords to (0,2,1,3) so that the per-lane unpacks below produce
; the output bytes in sample order.
621    vpermq      ymm0, ymm0, 0xd8
622    vpunpckhbw  ymm1, ymm0, ymm0
623    vpunpcklbw  ymm0, ymm0, ymm0
624
625    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0
626    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymm1
627
628    sub         eax, byte 2*SIZEOF_YMMWORD
629    jz          short .nextrow
630
631    add         esi, byte SIZEOF_YMMWORD    ; inptr
632    add         edi, byte 2*SIZEOF_YMMWORD  ; outptr
633    jmp         short .columnloop
634    alignx      16, 7
635
636.nextrow:
637    pop         esi
638    pop         edi
639
640    add         esi, byte SIZEOF_JSAMPROW  ; input_data
641    add         edi, byte SIZEOF_JSAMPROW  ; output_data
642    dec         ecx                        ; rowctr
643    jg          short .rowloop
644
645.return:
646    vzeroupper
647    pop         edi
648    pop         esi
649;   pop         edx                     ; need not be preserved
650;   pop         ecx                     ; need not be preserved
651;   pop         ebx                     ; unused
652    pop         ebp
653    ret
654
655; --------------------------------------------------------------------------
656;
657; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
658; It's still a box filter.
659;
660; GLOBAL(void)
661; jsimd_h2v2_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
662;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
663;
; Every input sample is written twice in a row (byte duplication via
; vpunpck{l,h}bw of a register with itself), and the expanded row is stored
; to two consecutive output rows.
;
; output_width is rounded up to a multiple of SIZEOF_YMMWORD, so whole blocks
; are always read and written; the function returns immediately when the
; rounded width or max_v_samp_factor is zero.
;
; Register roles:
;   edx = rounded output width      eax = colctr (output bytes left in row)
;   ecx = rowctr (decremented by 2 per input row)
;   esi = inptr     ebx = outptr0     edi = outptr1
; ebx, esi, edi and ebp are saved and restored; eax, ecx, edx, ymm0 and ymm1
; are overwritten.
664
665%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
666%define output_width(b)     (b) + 12    ; JDIMENSION output_width
667%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
668%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
669
670    align       32
671    GLOBAL_FUNCTION(jsimd_h2v2_upsample_avx2)
672
673EXTN(jsimd_h2v2_upsample_avx2):
674    push        ebp
675    mov         ebp, esp
676    push        ebx
677;   push        ecx                     ; need not be preserved
678;   push        edx                     ; need not be preserved
679    push        esi
680    push        edi
681
682    mov         edx, JDIMENSION [output_width(ebp)]
683    add         edx, byte (SIZEOF_YMMWORD-1)
684    and         edx, -SIZEOF_YMMWORD    ; round up to whole YMMWORD blocks
685    jz          near .return            ; zero width: nothing to do
686
687    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
688    test        ecx, ecx
689    jz          near .return            ; zero rows: nothing to do
690
691    mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
692    mov         edi, POINTER [output_data_ptr(ebp)]
693    mov         edi, JSAMPARRAY [edi]                ; output_data
694    alignx      16, 7
695.rowloop:
696    push        edi
697    push        esi
698
699    mov         esi, JSAMPROW [esi]                    ; inptr
700    mov         ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; outptr0
701    mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; outptr1
702    mov         eax, edx                               ; colctr
703    alignx      16, 7
704.columnloop:
705
; More than one YMMWORD of output left: take the full-width path.  (Despite
; the label name, the threshold compared against is SIZEOF_YMMWORD.)
706    cmp         eax, byte SIZEOF_YMMWORD
707    ja          short .above_16
708
; Exactly one YMMWORD of output left: expand one XMMWORD of input and store
; it to both output rows.
709    vmovdqu     xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
710    vpunpckhbw  xmm1, xmm0, xmm0
711    vpunpcklbw  xmm0, xmm0, xmm0
712
713    vmovdqu     XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0
714    vmovdqu     XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1
715    vmovdqu     XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
716    vmovdqu     XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
717
718    jmp         near .nextrow
719
720.above_16:
721    vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
722
; Reorder the qwords to (0,2,1,3) so that the per-lane unpacks below produce
; the output bytes in sample order.
723    vpermq      ymm0, ymm0, 0xd8
724    vpunpckhbw  ymm1, ymm0, ymm0
725    vpunpcklbw  ymm0, ymm0, ymm0
726
727    vmovdqu     YMMWORD [ebx+0*SIZEOF_YMMWORD], ymm0
728    vmovdqu     YMMWORD [ebx+1*SIZEOF_YMMWORD], ymm1
729    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0
730    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymm1
731
732    sub         eax, byte 2*SIZEOF_YMMWORD
733    jz          short .nextrow
734
735    add         esi, byte SIZEOF_YMMWORD  ; inptr
736    add         ebx, byte 2*SIZEOF_YMMWORD  ; outptr0
737    add         edi, byte 2*SIZEOF_YMMWORD  ; outptr1
738    jmp         short .columnloop
739    alignx      16, 7
740
741.nextrow:
742    pop         esi
743    pop         edi
744
; One input row consumed, two output rows produced.
745    add         esi, byte 1*SIZEOF_JSAMPROW  ; input_data
746    add         edi, byte 2*SIZEOF_JSAMPROW  ; output_data
747    sub         ecx, byte 2                  ; rowctr
748    jg          near .rowloop
749
750.return:
751    vzeroupper
752    pop         edi
753    pop         esi
754;   pop         edx                     ; need not be preserved
755;   pop         ecx                     ; need not be preserved
756    pop         ebx
757    pop         ebp
758    ret
759
760; For some reason, the OS X linker does not honor the request to align the
761; segment unless we do this.
; (This pads the end of the text section to a 32-byte boundary.)
762    align       32
763