;
; jdsample.asm - upsampling (AVX2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2015, Intel Corporation.
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208

%include "jsimdext.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_CONST

    alignz      32
    GLOBAL_DATA(jconst_fancy_upsample_avx2)

EXTN(jconst_fancy_upsample_avx2):

; Constant table for the "fancy" (triangle filter) upsamplers in this file.
; Each entry is 16 words (one YMMWORD) holding the same 16-bit value, so it
; can be used directly as a memory operand of a 256-bit word instruction.
;   PW_THREE      - multiplier applied to the nearer sample (vpmullw)
;   PW_ONE/TWO    - rounding terms added before the ">> 2" in the h2v1 case
;   PW_SEVEN/EIGHT- rounding terms added before the ">> 4" in the h2v2 case

PW_ONE   times 16 dw 1
PW_TWO   times 16 dw 2
PW_THREE times 16 dw 3
PW_SEVEN times 16 dw 7
PW_EIGHT times 16 dw 8

    alignz      32

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32
;
; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
;
; The upsampling algorithm is linear interpolation between pixel centers,
; also known as a "triangle filter".  This is a good compromise between
; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
; of the way between input pixel centers.
;
; GLOBAL(void)
; jsimd_h2v1_fancy_upsample_avx2(int max_v_samp_factor,
;                                JDIMENSION downsampled_width,
;                                JSAMPARRAY input_data,
;                                JSAMPARRAY *output_data_ptr);
;
; For every input sample in[i] two output samples are produced:
;   out[2*i]   = (3 * in[i] + in[i-1] + 1) >> 2
;   out[2*i+1] = (3 * in[i] + in[i+1] + 2) >> 2
; At the row edges the missing neighbour is replaced by the edge sample
; itself.
;
; The arguments are read from the stack relative to ebp (see the %defines
; below).  Nothing is done if downsampled_width or max_v_samp_factor is 0.
;
; Register usage:
;   eax = colctr  (downsampled_width; rounded up to a multiple of
;                  SIZEOF_YMMWORD while a row is being processed)
;   ecx = rowctr  (max_v_samp_factor, counts down by 1 per row)
;   esi = input_data row-pointer array / inptr inside a row
;   edi = output_data row-pointer array / outptr inside a row
;   ebx = GOT address (used through GOTOFF to reach the PW_* constants)
;   dl  = scratch
;   ymm0 = all zeros, ymm7 = "element -1" carried from the previous block
;
; Side effects worth knowing:
;   * If the width is not a multiple of SIZEOF_YMMWORD, one dummy sample
;     (a copy of the last one) is written just past the end of each input
;     row.
;   * Loads and stores are always whole YMMWORD blocks, so the row buffers
;     must have room for the rounded-up width (presumably guaranteed by the
;     caller -- this cannot be verified from this file).
;

%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
%define downsamp_width(b)   (b) + 12    ; JDIMENSION downsampled_width
%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_avx2)

EXTN(jsimd_h2v1_fancy_upsample_avx2):
    push        ebp
    mov         ebp, esp
    pushpic     ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    get_GOT     ebx                     ; get GOT address

    mov         eax, JDIMENSION [downsamp_width(ebp)]  ; colctr
    test        eax, eax
    jz          near .return

    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
    test        ecx, ecx
    jz          near .return

    mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
    mov         edi, POINTER [output_data_ptr(ebp)]
    mov         edi, JSAMPARRAY [edi]                ; output_data
    alignx      16, 7
.rowloop:
    ; Save the per-row state; eax/esi/edi are reused as colctr/inptr/outptr.
    push        eax                     ; colctr
    push        edi
    push        esi

    mov         esi, JSAMPROW [esi]     ; inptr
    mov         edi, JSAMPROW [edi]     ; outptr

    ; Width not a multiple of the block size: duplicate the last sample so
    ; that its right-hand neighbour is the sample itself.
    test        eax, SIZEOF_YMMWORD-1
    jz          short .skip
    mov         dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
.skip:
    vpxor       ymm0, ymm0, ymm0                ; ymm0=(all 0's)
    ; ymm7 = first sample of the row in byte 0, zero elsewhere; it stands in
    ; for "element -1" of the first block.
    vpcmpeqb    xmm7, xmm7, xmm7
    vpsrldq     xmm7, xmm7, (SIZEOF_XMMWORD-1)  ; (ff -- -- -- ... -- --) LSB is ff
    vpand       ymm7, ymm7, YMMWORD [esi+0*SIZEOF_YMMWORD]

    ; Round colctr up to a whole number of YMMWORD blocks.
    add         eax, byte SIZEOF_YMMWORD-1
    and         eax, byte -SIZEOF_YMMWORD
    cmp         eax, byte SIZEOF_YMMWORD
    ja          short .columnloop
    alignx      16, 7

.columnloop_last:
    ; Last block of the row: there is no next block, so "element 32" is
    ; this block's own byte 31, kept in place at byte 31.
    vpcmpeqb    xmm6, xmm6, xmm6
    vpslldq     xmm6, xmm6, (SIZEOF_XMMWORD-1)
    vperm2i128  ymm6, ymm6, ymm6, 1             ; (---- ---- ... ---- ---- ff) MSB is ff
    vpand       ymm6, ymm6, YMMWORD [esi+0*SIZEOF_YMMWORD]
    jmp         short .upsample
    alignx      16, 7

.columnloop:
    ; Not the last block: "element 32" is the first byte of the next block,
    ; moved to byte 31 (all other bytes zero).
    vmovdqu     ymm6, YMMWORD [esi+1*SIZEOF_YMMWORD]
    vperm2i128  ymm6, ymm0, ymm6, 0x20
    vpslldq     ymm6, ymm6, 15

.upsample:
    vmovdqu     ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]  ; ymm1=( 0  1  2 ... 29 30 31)

    ; Build the left- and right-shifted copies of the block across the two
    ; 128-bit lanes.
    vperm2i128  ymm2, ymm0, ymm1, 0x20
    vpalignr    ymm2, ymm1, ymm2, 15            ; ymm2=(--  0  1 ... 28 29 30)
    vperm2i128  ymm4, ymm0, ymm1, 0x03
    vpalignr    ymm3, ymm4, ymm1, 1             ; ymm3=( 1  2  3 ... 30 31 --)

    vpor        ymm2, ymm2, ymm7                ; ymm2=(-1  0  1 ... 28 29 30)
    vpor        ymm3, ymm3, ymm6                ; ymm3=( 1  2  3 ... 30 31 32)

    ; Carry this block's last sample over as "element -1" of the next block.
    vpsrldq     ymm7, ymm4, (SIZEOF_XMMWORD-1)  ; ymm7=(31 -- -- ... -- -- --)

    ; Widen bytes to words.  The unpacks work per 128-bit lane, so a
    ; vperm2i128 pair afterwards puts the words back in linear order.
    vpunpckhbw  ymm4, ymm1, ymm0                ; ymm4=( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
    vpunpcklbw  ymm5, ymm1, ymm0                ; ymm5=( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
    vperm2i128  ymm1, ymm5, ymm4, 0x20          ; ymm1=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vperm2i128  ymm4, ymm5, ymm4, 0x31          ; ymm4=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vpunpckhbw  ymm5, ymm2, ymm0                ; ymm5=( 7  8  9 10 11 12 13 14 23 24 25 26 27 28 29 30)
    vpunpcklbw  ymm6, ymm2, ymm0                ; ymm6=(-1  0  1  2  3  4  5  6 15 16 17 18 19 20 21 22)
    vperm2i128  ymm2, ymm6, ymm5, 0x20          ; ymm2=(-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
    vperm2i128  ymm5, ymm6, ymm5, 0x31          ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)

    vpunpckhbw  ymm6, ymm3, ymm0                ; ymm6=( 9 10 11 12 13 14 15 16 25 26 27 28 29 30 31 32)
    vpunpcklbw  ymm0, ymm3, ymm0                ; ymm0=( 1  2  3  4  5  6  7  8 17 18 19 20 21 22 23 24)
    vperm2i128  ymm3, ymm0, ymm6, 0x20          ; ymm3=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16)
    vperm2i128  ymm6, ymm0, ymm6, 0x31          ; ymm6=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)

    ; ymm0 was used as a temporary above; restore the zero register.
    vpxor       ymm0, ymm0, ymm0                ; ymm0=(all 0's)

    ; 3 * centre sample, plus the rounding term on each neighbour.
    vpmullw     ymm1, ymm1, [GOTOFF(ebx,PW_THREE)]
    vpmullw     ymm4, ymm4, [GOTOFF(ebx,PW_THREE)]
    vpaddw      ymm2, ymm2, [GOTOFF(ebx,PW_ONE)]
    vpaddw      ymm5, ymm5, [GOTOFF(ebx,PW_ONE)]
    vpaddw      ymm3, ymm3, [GOTOFF(ebx,PW_TWO)]
    vpaddw      ymm6, ymm6, [GOTOFF(ebx,PW_TWO)]

    vpaddw      ymm2, ymm2, ymm1
    vpaddw      ymm5, ymm5, ymm4
    vpsrlw      ymm2, ymm2, 2                   ; ymm2=OutLE=( 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30)
    vpsrlw      ymm5, ymm5, 2                   ; ymm5=OutHE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
    vpaddw      ymm3, ymm3, ymm1
    vpaddw      ymm6, ymm6, ymm4
    vpsrlw      ymm3, ymm3, 2                   ; ymm3=OutLO=( 1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31)
    vpsrlw      ymm6, ymm6, 2                   ; ymm6=OutHO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)

    ; Interleave: odd results go to the high byte of each word, even
    ; results stay in the low byte.
    vpsllw      ymm3, ymm3, BYTE_BIT
    vpsllw      ymm6, ymm6, BYTE_BIT
    vpor        ymm2, ymm2, ymm3                ; ymm2=OutL=( 0  1  2 ... 29 30 31)
    vpor        ymm5, ymm5, ymm6                ; ymm5=OutH=(32 33 34 ... 61 62 63)

    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm2
    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymm5

    ; One input block consumed, two output blocks produced.
    sub         eax, byte SIZEOF_YMMWORD
    add         esi, byte 1*SIZEOF_YMMWORD  ; inptr
    add         edi, byte 2*SIZEOF_YMMWORD  ; outptr
    cmp         eax, byte SIZEOF_YMMWORD
    ja          near .columnloop            ; more than one block left
    test        eax, eax
    jnz         near .columnloop_last       ; exactly one block left

    pop         esi
    pop         edi
    pop         eax

    add         esi, byte SIZEOF_JSAMPROW  ; input_data
    add         edi, byte SIZEOF_JSAMPROW  ; output_data
    dec         ecx                        ; rowctr
    jg          near .rowloop

.return:
    vzeroupper
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    poppic      ebx
    pop         ebp
    ret

; --------------------------------------------------------------------------
;
; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
; Again a triangle filter; see comments for h2v1 case, above.
;
; GLOBAL(void)
; jsimd_h2v2_fancy_upsample_avx2(int max_v_samp_factor,
;                                JDIMENSION downsampled_width,
;                                JSAMPARRAY input_data,
;                                JSAMPARRAY *output_data_ptr);
;
; Each input row produces two output rows.  The work is done in two passes
; per block of SIZEOF_YMMWORD input samples:
;   1. vertical:   Int0 = 3 * row[0] + row[-1]     (for the upper output row)
;                  Int1 = 3 * row[0] + row[+1]     (for the lower output row)
;      The 16-bit intermediates are parked in the output rows themselves.
;   2. horizontal: out[2*i]   = (3 * Int[i] + Int[i-1] + 8) >> 4
;                  out[2*i+1] = (3 * Int[i] + Int[i+1] + 7) >> 4
;      The result overwrites the intermediates in the output rows.
;
; Nothing is done if downsampled_width or max_v_samp_factor is 0.
;
; Register usage:
;   eax = colctr (rounded up to a multiple of SIZEOF_YMMWORD inside a row)
;   ecx = rowctr (counts down by 2 per input row) / inptr1(above) in a row
;   ebx = inptr0 in a row; temporarily replaced by the GOT address (saved
;         in gotptr) around code that uses GOTOFF
;   esi = input_data row-pointer array / inptr1(below) in a row
;   edx = original frame pointer during setup / outptr0 in a row
;   edi = output_data row-pointer array / outptr1 in a row
;
; Stack frame: ebp is re-based to a YMMWORD-aligned address so that the
; wk[] scratch slots can be accessed with aligned moves; the original frame
; pointer is kept at [ebp] and restored by "pop esp" in the epilogue.
;   wk(0), wk(1) = "element -1" (last Int word of the previous block) for the
;                  upper / lower row
;   wk(2), wk(3) = "element 32" (first Int word of the next block) for the
;                  upper / lower row
;
; Side effects worth knowing:
;   * If the width is not a multiple of SIZEOF_YMMWORD, one dummy sample is
;     written just past the end of each of the three input rows in use.
;   * Input row pointers at index -1 and +1 relative to the current row are
;     dereferenced, so the caller must supply valid neighbouring rows
;     (presumably context rows -- cannot be verified from this file).
;

%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
%define downsamp_width(b)   (b) + 12    ; JDIMENSION downsampled_width
%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr

%define original_ebp  ebp + 0
%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
                                        ; ymmword wk[WK_NUM]
%define WK_NUM        4
%define gotptr        wk(0) - SIZEOF_POINTER  ; void *gotptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_avx2)

EXTN(jsimd_h2v2_fancy_upsample_avx2):
    push        ebp
    mov         eax, esp                     ; eax = original ebp
    sub         esp, byte 4
    and         esp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
    mov         [esp], eax
    mov         ebp, esp                     ; ebp = aligned ebp
    lea         esp, [wk(0)]
    pushpic     eax                     ; make a room for GOT address
    push        ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    get_GOT     ebx                     ; get GOT address
    movpic      POINTER [gotptr], ebx   ; save GOT address

    ; ebp no longer addresses the arguments; use edx (original frame) instead.
    mov         edx, eax                ; edx = original ebp
    mov         eax, JDIMENSION [downsamp_width(edx)]  ; colctr
    test        eax, eax
    jz          near .return

    mov         ecx, INT [max_v_samp(edx)]  ; rowctr
    test        ecx, ecx
    jz          near .return

    mov         esi, JSAMPARRAY [input_data(edx)]    ; input_data
    mov         edi, POINTER [output_data_ptr(edx)]
    mov         edi, JSAMPARRAY [edi]                ; output_data
    alignx      16, 7
.rowloop:
    ; Save the per-row state; all four registers are reused inside the row.
    push        eax                     ; colctr
    push        ecx
    push        edi
    push        esi

    mov         ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; inptr0
    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; outptr0
    mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; outptr1

    ; Width not a multiple of the block size: duplicate the last sample of
    ; each input row.  edx holds outptr0, so it is saved around the use of dl.
    test        eax, SIZEOF_YMMWORD-1
    jz          short .skip
    push        edx
    mov         dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
    mov         dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
    mov         dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
    pop         edx
.skip:
    ; -- process the first column block

    vmovdqu     ymm0, YMMWORD [ebx+0*SIZEOF_YMMWORD]  ; ymm0=row[ 0][0]
    vmovdqu     ymm1, YMMWORD [ecx+0*SIZEOF_YMMWORD]  ; ymm1=row[-1][0]
    vmovdqu     ymm2, YMMWORD [esi+0*SIZEOF_YMMWORD]  ; ymm2=row[+1][0]

    ; ebx (inptr0) has been consumed above; borrow it for the GOT address.
    pushpic     ebx
    movpic      ebx, POINTER [gotptr]   ; load GOT address

    vpxor       ymm3, ymm3, ymm3        ; ymm3=(all 0's)

    ; Widen bytes to words; vperm2i128 restores linear order after the
    ; per-lane unpacks.
    vpunpckhbw  ymm4, ymm0, ymm3        ; ymm4=row[ 0]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
    vpunpcklbw  ymm5, ymm0, ymm3        ; ymm5=row[ 0]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
    vperm2i128  ymm0, ymm5, ymm4, 0x20  ; ymm0=row[ 0]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vperm2i128  ymm4, ymm5, ymm4, 0x31  ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vpunpckhbw  ymm5, ymm1, ymm3        ; ymm5=row[-1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
    vpunpcklbw  ymm6, ymm1, ymm3        ; ymm6=row[-1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
    vperm2i128  ymm1, ymm6, ymm5, 0x20  ; ymm1=row[-1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vperm2i128  ymm5, ymm6, ymm5, 0x31  ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vpunpckhbw  ymm6, ymm2, ymm3        ; ymm6=row[+1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
    vpunpcklbw  ymm3, ymm2, ymm3        ; ymm3=row[+1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
    vperm2i128  ymm2, ymm3, ymm6, 0x20  ; ymm2=row[+1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vperm2i128  ymm6, ymm3, ymm6, 0x31  ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vpmullw     ymm0, ymm0, [GOTOFF(ebx,PW_THREE)]
    vpmullw     ymm4, ymm4, [GOTOFF(ebx,PW_THREE)]

    vpcmpeqb    xmm7, xmm7, xmm7
    vpsrldq     xmm7, xmm7, (SIZEOF_XMMWORD-2)  ; (ffff ---- ---- ... ---- ----) LSB is ffff

    vpaddw      ymm1, ymm1, ymm0        ; ymm1=Int0L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vpaddw      ymm5, ymm5, ymm4        ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
    vpaddw      ymm2, ymm2, ymm0        ; ymm2=Int1L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vpaddw      ymm6, ymm6, ymm4        ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vmovdqu     YMMWORD [edx+0*SIZEOF_YMMWORD], ymm1  ; temporarily save
    vmovdqu     YMMWORD [edx+1*SIZEOF_YMMWORD], ymm5  ; the intermediate data
    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm2
    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymm6

    ; First block of the row: "element -1" is element 0 itself.
    vpand       ymm1, ymm1, ymm7        ; ymm1=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
    vpand       ymm2, ymm2, ymm7        ; ymm2=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)

    vmovdqa     YMMWORD [wk(0)], ymm1
    vmovdqa     YMMWORD [wk(1)], ymm2

    poppic      ebx

    ; Round colctr up to a whole number of YMMWORD blocks.
    add         eax, byte SIZEOF_YMMWORD-1
    and         eax, byte -SIZEOF_YMMWORD
    cmp         eax, byte SIZEOF_YMMWORD
    ja          short .columnloop
    alignx      16, 7

.columnloop_last:
    ; -- process the last column block

    pushpic     ebx
    movpic      ebx, POINTER [gotptr]   ; load GOT address

    vpcmpeqb    xmm1, xmm1, xmm1
    vpslldq     xmm1, xmm1, (SIZEOF_XMMWORD-2)
    vperm2i128  ymm1, ymm1, ymm1, 1             ; (---- ---- ... ---- ---- ffff) MSB is ffff

    ; No next block: "element 32" is this block's own element 31, taken
    ; from the intermediates already parked in the output rows.
    vpand       ymm2, ymm1, YMMWORD [edi+1*SIZEOF_YMMWORD]
    vpand       ymm1, ymm1, YMMWORD [edx+1*SIZEOF_YMMWORD]

    vmovdqa     YMMWORD [wk(2)], ymm1          ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)
    vmovdqa     YMMWORD [wk(3)], ymm2          ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)

    jmp         near .upsample
    alignx      16, 7

.columnloop:
    ; -- process the next column block
    ; The vertical pass runs one block ahead of the horizontal pass so that
    ; the first intermediate of the next block is available as "element 32".

    vmovdqu     ymm0, YMMWORD [ebx+1*SIZEOF_YMMWORD]  ; ymm0=row[ 0][1]
    vmovdqu     ymm1, YMMWORD [ecx+1*SIZEOF_YMMWORD]  ; ymm1=row[-1][1]
    vmovdqu     ymm2, YMMWORD [esi+1*SIZEOF_YMMWORD]  ; ymm2=row[+1][1]

    pushpic     ebx
    movpic      ebx, POINTER [gotptr]   ; load GOT address

    vpxor       ymm3, ymm3, ymm3        ; ymm3=(all 0's)

    vpunpckhbw  ymm4, ymm0, ymm3        ; ymm4=row[ 0]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
    vpunpcklbw  ymm5, ymm0, ymm3        ; ymm5=row[ 0]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
    vperm2i128  ymm0, ymm5, ymm4, 0x20  ; ymm0=row[ 0]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vperm2i128  ymm4, ymm5, ymm4, 0x31  ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vpunpckhbw  ymm5, ymm1, ymm3        ; ymm5=row[-1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
    vpunpcklbw  ymm6, ymm1, ymm3        ; ymm6=row[-1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
    vperm2i128  ymm1, ymm6, ymm5, 0x20  ; ymm1=row[-1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vperm2i128  ymm5, ymm6, ymm5, 0x31  ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vpunpckhbw  ymm6, ymm2, ymm3        ; ymm6=row[+1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
    vpunpcklbw  ymm7, ymm2, ymm3        ; ymm7=row[+1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
    vperm2i128  ymm2, ymm7, ymm6, 0x20  ; ymm2=row[+1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vperm2i128  ymm6, ymm7, ymm6, 0x31  ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vpmullw     ymm0, ymm0, [GOTOFF(ebx,PW_THREE)]
    vpmullw     ymm4, ymm4, [GOTOFF(ebx,PW_THREE)]

    vpaddw      ymm1, ymm1, ymm0        ; ymm1=Int0L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vpaddw      ymm5, ymm5, ymm4        ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
    vpaddw      ymm2, ymm2, ymm0        ; ymm2=Int1L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vpaddw      ymm6, ymm6, ymm4        ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vmovdqu     YMMWORD [edx+2*SIZEOF_YMMWORD], ymm1  ; temporarily save
    vmovdqu     YMMWORD [edx+3*SIZEOF_YMMWORD], ymm5  ; the intermediate data
    vmovdqu     YMMWORD [edi+2*SIZEOF_YMMWORD], ymm2
    vmovdqu     YMMWORD [edi+3*SIZEOF_YMMWORD], ymm6

    ; Move the next block's first intermediate word to the top word position
    ; so it can be OR-ed in as "element 32" of the current block.
    vperm2i128  ymm1, ymm3, ymm1, 0x20
    vpslldq     ymm1, ymm1, 14          ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- --  0)
    vperm2i128  ymm2, ymm3, ymm2, 0x20
    vpslldq     ymm2, ymm2, 14          ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- --  0)

    vmovdqa     YMMWORD [wk(2)], ymm1
    vmovdqa     YMMWORD [wk(3)], ymm2

.upsample:
    ; -- process the upper row

    vmovdqu     ymm7, YMMWORD [edx+0*SIZEOF_YMMWORD]  ; ymm7=Int0L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vmovdqu     ymm3, YMMWORD [edx+1*SIZEOF_YMMWORD]  ; ymm3=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vpxor       ymm1, ymm1, ymm1        ; ymm1=(all 0's)

    vperm2i128  ymm0, ymm1, ymm7, 0x03
    vpalignr    ymm0, ymm0, ymm7, 2     ; ymm0=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 --)
    vperm2i128  ymm4, ymm1, ymm3, 0x20
    vpslldq     ymm4, ymm4, 14          ; ymm4=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)

    vperm2i128  ymm5, ymm1, ymm7, 0x03
    vpsrldq     ymm5, ymm5, 14          ; ymm5=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
    vperm2i128  ymm6, ymm1, ymm3, 0x20
    vpalignr    ymm6, ymm3, ymm6, 14    ; ymm6=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)

    vpor        ymm0, ymm0, ymm4        ; ymm0=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16)
    vpor        ymm5, ymm5, ymm6        ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)

    vperm2i128  ymm2, ymm1, ymm3, 0x03
    vpalignr    ymm2, ymm2, ymm3, 2     ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
    vperm2i128  ymm4, ymm1, ymm3, 0x03
    vpsrldq     ymm4, ymm4, 14          ; ymm4=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
    vperm2i128  ymm1, ymm1, ymm7, 0x20
    vpalignr    ymm1, ymm7, ymm1, 14    ; ymm1=(--  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)

    vpor        ymm1, ymm1, YMMWORD [wk(0)]  ; ymm1=(-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
    vpor        ymm2, ymm2, YMMWORD [wk(2)]  ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)

    ; Carry this block's last intermediate over as "element -1" of the next
    ; block (upper row).
    vmovdqa     YMMWORD [wk(0)], ymm4

    vpmullw     ymm7, ymm7, [GOTOFF(ebx,PW_THREE)]
    vpmullw     ymm3, ymm3, [GOTOFF(ebx,PW_THREE)]
    vpaddw      ymm1, ymm1, [GOTOFF(ebx,PW_EIGHT)]
    vpaddw      ymm5, ymm5, [GOTOFF(ebx,PW_EIGHT)]
    vpaddw      ymm0, ymm0, [GOTOFF(ebx,PW_SEVEN)]
    vpaddw      ymm2, ymm2, [GOTOFF(ebx,PW_SEVEN)]

    vpaddw      ymm1, ymm1, ymm7
    vpaddw      ymm5, ymm5, ymm3
    vpsrlw      ymm1, ymm1, 4           ; ymm1=Out0LE=( 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30)
    vpsrlw      ymm5, ymm5, 4           ; ymm5=Out0HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
    vpaddw      ymm0, ymm0, ymm7
    vpaddw      ymm2, ymm2, ymm3
    vpsrlw      ymm0, ymm0, 4           ; ymm0=Out0LO=( 1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31)
    vpsrlw      ymm2, ymm2, 4           ; ymm2=Out0HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)

    ; Interleave even (low byte) and odd (high byte) results.
    vpsllw      ymm0, ymm0, BYTE_BIT
    vpsllw      ymm2, ymm2, BYTE_BIT
    vpor        ymm1, ymm1, ymm0        ; ymm1=Out0L=( 0  1  2 ... 29 30 31)
    vpor        ymm5, ymm5, ymm2        ; ymm5=Out0H=(32 33 34 ... 61 62 63)

    vmovdqu     YMMWORD [edx+0*SIZEOF_YMMWORD], ymm1
    vmovdqu     YMMWORD [edx+1*SIZEOF_YMMWORD], ymm5

    ; -- process the lower row

    vmovdqu     ymm6, YMMWORD [edi+0*SIZEOF_YMMWORD]  ; ymm6=Int1L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vmovdqu     ymm4, YMMWORD [edi+1*SIZEOF_YMMWORD]  ; ymm4=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vpxor       ymm1, ymm1, ymm1        ; ymm1=(all 0's)

    vperm2i128  ymm7, ymm1, ymm6, 0x03
    vpalignr    ymm7, ymm7, ymm6, 2     ; ymm7=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 --)
    vperm2i128  ymm3, ymm1, ymm4, 0x20
    vpslldq     ymm3, ymm3, 14          ; ymm3=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)

    vperm2i128  ymm0, ymm1, ymm6, 0x03
    vpsrldq     ymm0, ymm0, 14          ; ymm0=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
    vperm2i128  ymm2, ymm1, ymm4, 0x20
    vpalignr    ymm2, ymm4, ymm2, 14    ; ymm2=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)

    vpor        ymm7, ymm7, ymm3        ; ymm7=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16)
    vpor        ymm0, ymm0, ymm2        ; ymm0=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)

    vperm2i128  ymm5, ymm1, ymm4, 0x03
    vpalignr    ymm5, ymm5, ymm4, 2     ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
    vperm2i128  ymm3, ymm1, ymm4, 0x03
    vpsrldq     ymm3, ymm3, 14          ; ymm3=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
    vperm2i128  ymm1, ymm1, ymm6, 0x20
    vpalignr    ymm1, ymm6, ymm1, 14    ; ymm1=(--  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)

    vpor        ymm1, ymm1, YMMWORD [wk(1)]  ; ymm1=(-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
    vpor        ymm5, ymm5, YMMWORD [wk(3)]  ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)

    ; Carry this block's last intermediate over as "element -1" of the next
    ; block (lower row).
    vmovdqa     YMMWORD [wk(1)], ymm3

    vpmullw     ymm6, ymm6, [GOTOFF(ebx,PW_THREE)]
    vpmullw     ymm4, ymm4, [GOTOFF(ebx,PW_THREE)]
    vpaddw      ymm1, ymm1, [GOTOFF(ebx,PW_EIGHT)]
    vpaddw      ymm0, ymm0, [GOTOFF(ebx,PW_EIGHT)]
    vpaddw      ymm7, ymm7, [GOTOFF(ebx,PW_SEVEN)]
    vpaddw      ymm5, ymm5, [GOTOFF(ebx,PW_SEVEN)]

    vpaddw      ymm1, ymm1, ymm6
    vpaddw      ymm0, ymm0, ymm4
    vpsrlw      ymm1, ymm1, 4           ; ymm1=Out1LE=( 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30)
    vpsrlw      ymm0, ymm0, 4           ; ymm0=Out1HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
    vpaddw      ymm7, ymm7, ymm6
    vpaddw      ymm5, ymm5, ymm4
    vpsrlw      ymm7, ymm7, 4           ; ymm7=Out1LO=( 1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31)
    vpsrlw      ymm5, ymm5, 4           ; ymm5=Out1HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)

    vpsllw      ymm7, ymm7, BYTE_BIT
    vpsllw      ymm5, ymm5, BYTE_BIT
    vpor        ymm1, ymm1, ymm7        ; ymm1=Out1L=( 0  1  2 ... 29 30 31)
    vpor        ymm0, ymm0, ymm5        ; ymm0=Out1H=(32 33 34 ... 61 62 63)

    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm1
    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymm0

    ; Give ebx back to inptr0 before advancing the pointers.
    poppic      ebx

    ; One input block consumed per row, two output blocks produced per row.
    sub         eax, byte SIZEOF_YMMWORD
    add         ecx, byte 1*SIZEOF_YMMWORD  ; inptr1(above)
    add         ebx, byte 1*SIZEOF_YMMWORD  ; inptr0
    add         esi, byte 1*SIZEOF_YMMWORD  ; inptr1(below)
    add         edx, byte 2*SIZEOF_YMMWORD  ; outptr0
    add         edi, byte 2*SIZEOF_YMMWORD  ; outptr1
    cmp         eax, byte SIZEOF_YMMWORD
    ja          near .columnloop            ; more than one block left
    test        eax, eax
    jnz         near .columnloop_last       ; exactly one block left

    pop         esi
    pop         edi
    pop         ecx
    pop         eax

    ; One input row yields two output rows, hence the step of 2.
    add         esi, byte 1*SIZEOF_JSAMPROW  ; input_data
    add         edi, byte 2*SIZEOF_JSAMPROW  ; output_data
    sub         ecx, byte 2                  ; rowctr
    jg          near .rowloop

.return:
    vzeroupper
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    pop         ebx
    mov         esp, ebp                ; esp <- aligned ebp
    pop         esp                     ; esp <- original ebp
    pop         ebp
    ret

; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
; It's still a box filter.
;
; GLOBAL(void)
; jsimd_h2v1_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
;
; Every input sample is simply written twice in a row to the output.
; Nothing is done if output_width (after rounding) or max_v_samp_factor is 0.
;
; Register usage:
;   edx = output_width rounded up to a multiple of SIZEOF_YMMWORD
;   ecx = rowctr (max_v_samp_factor, counts down by 1 per row)
;   eax = colctr (output bytes still to be produced in the current row)
;   esi = input_data row-pointer array / inptr inside a row
;   edi = output_data row-pointer array / outptr inside a row
;
; Loads and stores are done in whole XMMWORD/YMMWORD blocks, so the row
; buffers must have room for the rounded-up width (presumably guaranteed by
; the caller -- this cannot be verified from this file).
;

%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
%define output_width(b)     (b) + 12    ; JDIMENSION output_width
%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_upsample_avx2)

EXTN(jsimd_h2v1_upsample_avx2):
    push        ebp
    mov         ebp, esp
;   push        ebx                     ; unused
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    ; Round the output width up to a whole number of YMMWORD blocks; the
    ; "and" also sets ZF for the zero-width check.
    mov         edx, JDIMENSION [output_width(ebp)]
    add         edx, byte (SIZEOF_YMMWORD-1)
    and         edx, -SIZEOF_YMMWORD
    jz          short .return

    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
    test        ecx, ecx
    jz          short .return

    mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
    mov         edi, POINTER [output_data_ptr(ebp)]
    mov         edi, JSAMPARRAY [edi]                ; output_data
    alignx      16, 7
.rowloop:
    push        edi
    push        esi

    mov         esi, JSAMPROW [esi]     ; inptr
    mov         edi, JSAMPROW [edi]     ; outptr
    mov         eax, edx                ; colctr
    alignx      16, 7
.columnloop:

    cmp         eax, byte SIZEOF_YMMWORD
    ja          near .above_16

    ; Exactly one YMMWORD of output left: expand one XMMWORD of input into
    ; two XMMWORDs of output, then the row is finished.
    vmovdqu     xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
    vpunpckhbw  xmm1, xmm0, xmm0
    vpunpcklbw  xmm0, xmm0, xmm0

    vmovdqu     XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
    vmovdqu     XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1

    jmp         short .nextrow

.above_16:
    ; More than one YMMWORD of output left: expand one YMMWORD of input
    ; into two YMMWORDs of output.
    vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]

    ; The unpacks below work per 128-bit lane; swapping the two middle
    ; qwords first makes their results come out in linear order.
    vpermq      ymm0, ymm0, 0xd8
    vpunpckhbw  ymm1, ymm0, ymm0
    vpunpcklbw  ymm0, ymm0, ymm0

    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0
    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymm1

    sub         eax, byte 2*SIZEOF_YMMWORD
    jz          short .nextrow

    add         esi, byte SIZEOF_YMMWORD    ; inptr
    add         edi, byte 2*SIZEOF_YMMWORD  ; outptr
    jmp         short .columnloop
    alignx      16, 7

.nextrow:
    pop         esi
    pop         edi

    add         esi, byte SIZEOF_JSAMPROW  ; input_data
    add         edi, byte SIZEOF_JSAMPROW  ; output_data
    dec         ecx                        ; rowctr
    jg          short .rowloop

.return:
    vzeroupper
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
;   pop         ebx                     ; unused
    pop         ebp
    ret

; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
; It's still a box filter.
;
; GLOBAL(void)
; jsimd_h2v2_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
;
; Every input sample is written twice in a row, and the resulting row is
; stored to two consecutive output rows.
; Nothing is done if output_width (after rounding) or max_v_samp_factor is 0.
;
; Register usage:
;   edx = output_width rounded up to a multiple of SIZEOF_YMMWORD
;   ecx = rowctr (max_v_samp_factor, counts down by 2 per input row)
;   eax = colctr (output bytes still to be produced in the current row)
;   esi = input_data row-pointer array / inptr inside a row
;   ebx = outptr0 inside a row
;   edi = output_data row-pointer array / outptr1 inside a row
;
; Loads and stores are done in whole XMMWORD/YMMWORD blocks, so the row
; buffers must have room for the rounded-up width (presumably guaranteed by
; the caller -- this cannot be verified from this file).
;

%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
%define output_width(b)     (b) + 12    ; JDIMENSION output_width
%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_upsample_avx2)

EXTN(jsimd_h2v2_upsample_avx2):
    push        ebp
    mov         ebp, esp
    push        ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    ; Round the output width up to a whole number of YMMWORD blocks; the
    ; "and" also sets ZF for the zero-width check.
    mov         edx, JDIMENSION [output_width(ebp)]
    add         edx, byte (SIZEOF_YMMWORD-1)
    and         edx, -SIZEOF_YMMWORD
    jz          near .return

    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
    test        ecx, ecx
    jz          near .return

    mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
    mov         edi, POINTER [output_data_ptr(ebp)]
    mov         edi, JSAMPARRAY [edi]                ; output_data
    alignx      16, 7
.rowloop:
    push        edi
    push        esi

    mov         esi, JSAMPROW [esi]                    ; inptr
    mov         ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; outptr0
    mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; outptr1
    mov         eax, edx                               ; colctr
    alignx      16, 7
.columnloop:

    cmp         eax, byte SIZEOF_YMMWORD
    ja          short .above_16

    ; Exactly one YMMWORD of output left: expand one XMMWORD of input into
    ; two XMMWORDs and store them to both output rows, then the row pair is
    ; finished.
    vmovdqu     xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
    vpunpckhbw  xmm1, xmm0, xmm0
    vpunpcklbw  xmm0, xmm0, xmm0

    vmovdqu     XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0
    vmovdqu     XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1
    vmovdqu     XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
    vmovdqu     XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1

    jmp         near .nextrow

.above_16:
    ; More than one YMMWORD of output left: expand one YMMWORD of input
    ; into two YMMWORDs and store them to both output rows.
    vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]

    ; The unpacks below work per 128-bit lane; swapping the two middle
    ; qwords first makes their results come out in linear order.
    vpermq      ymm0, ymm0, 0xd8
    vpunpckhbw  ymm1, ymm0, ymm0
    vpunpcklbw  ymm0, ymm0, ymm0

    vmovdqu     YMMWORD [ebx+0*SIZEOF_YMMWORD], ymm0
    vmovdqu     YMMWORD [ebx+1*SIZEOF_YMMWORD], ymm1
    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0
    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymm1

    sub         eax, byte 2*SIZEOF_YMMWORD
    jz          short .nextrow

    add         esi, byte SIZEOF_YMMWORD    ; inptr
    add         ebx, byte 2*SIZEOF_YMMWORD  ; outptr0
    add         edi, byte 2*SIZEOF_YMMWORD  ; outptr1
    jmp         short .columnloop
    alignx      16, 7

.nextrow:
    pop         esi
    pop         edi

    ; One input row yields two output rows, hence the step of 2.
    add         esi, byte 1*SIZEOF_JSAMPROW  ; input_data
    add         edi, byte 2*SIZEOF_JSAMPROW  ; output_data
    sub         ecx, byte 2                  ; rowctr
    jg          near .rowloop

.return:
    vzeroupper
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    pop         ebx
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32
