;
; jdsample.asm - upsampling (64-bit SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208

%include "jsimdext.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_CONST

    alignz      32
    GLOBAL_DATA(jconst_fancy_upsample_sse2)

EXTN(jconst_fancy_upsample_sse2):

; Packed-word constants (eight 16-bit lanes each) used by the two "fancy"
; (triangle filter) upsamplers in this file.  They are referenced
; RIP-relative ([rel PW_xxx]) as memory operands of pmullw/paddw.
PW_ONE   times 8 dw 1                   ; h2v1: rounding bias, even outputs (>>2)
PW_TWO   times 8 dw 2                   ; h2v1: rounding bias, odd outputs  (>>2)
PW_THREE times 8 dw 3                   ; weight of the nearer sample
PW_SEVEN times 8 dw 7                   ; h2v2: rounding bias, odd outputs  (>>4)
PW_EIGHT times 8 dw 8                   ; h2v2: rounding bias, even outputs (>>4)

    alignz      32

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        64
;
; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
;
; The upsampling algorithm is linear interpolation between pixel centers,
; also known as a "triangle filter".  This is a good compromise between
; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
; of the way between input pixel centers.
;
; For every input sample s[i] of a row, two output samples are produced:
;   out[2*i]   = (3 * s[i] + s[i-1] + 1) >> 2
;   out[2*i+1] = (3 * s[i] + s[i+1] + 2) >> 2
; At the left and right ends of the row the missing neighbor is replaced by
; the edge sample itself.
;
; GLOBAL(void)
; jsimd_h2v1_fancy_upsample_sse2(int max_v_samp_factor,
;                                JDIMENSION downsampled_width,
;                                JSAMPARRAY input_data,
;                                JSAMPARRAY *output_data_ptr);
;
; Requirements visible from the code below:
;   * Rows are read and written with movdqa, so every input and output row
;     pointer must be XMMWORD-aligned.
;   * Rows are processed in whole XMMWORD blocks (the column count is rounded
;     up to a multiple of SIZEOF_XMMWORD), so the row buffers must be padded
;     to that size; the output row receives two XMMWORDs per input XMMWORD.
;   * When downsampled_width is not a multiple of SIZEOF_XMMWORD, one dummy
;     sample (a copy of the last real sample) is WRITTEN just past the end of
;     each input row.
;   * Nothing is done if downsampled_width or max_v_samp_factor is zero.
;

; r10d = int max_v_samp_factor
; r11d = JDIMENSION downsampled_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_sse2)

EXTN(jsimd_h2v1_fancy_upsample_sse2):
    push        rbp
    mov         rax, rsp                ; NOTE(review): presumably the frame base
                                        ; expected by collect_args (macro is
                                        ; defined in jsimdext.inc) -- confirm
    mov         rbp, rsp
    collect_args 4

    mov         eax, r11d               ; colctr
    test        rax, rax
    jz          near .return

    ; max_v_samp_factor is a 32-bit int.  The upper half of its 64-bit
    ; register is not defined by the x86-64 calling conventions, so it is
    ; sign-extended explicitly instead of copying all 64 bits (the JDIMENSION
    ; argument above is likewise read through its 32-bit register).
    movsxd      rcx, r10d               ; rowctr
    test        rcx, rcx
    jz          near .return

    mov         rsi, r12                ; input_data
    mov         rdi, r13
    mov         rdi, JSAMPARRAY [rdi]   ; output_data
.rowloop:
    push        rax                     ; colctr
    push        rdi
    push        rsi

    mov         rsi, JSAMPROW [rsi]     ; inptr
    mov         rdi, JSAMPROW [rdi]     ; outptr

    ; If the width is not a whole number of XMMWORDs, duplicate the last
    ; sample once so that it can serve as its own right-hand neighbor.
    test        rax, SIZEOF_XMMWORD-1
    jz          short .skip
    mov         dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
.skip:
    pxor        xmm0, xmm0              ; xmm0=(all 0's)
    ; xmm7 carries the left-hand neighbor ("sample -1") of the current block
    ; in its lowest byte.  For the first block this is sample 0 itself.
    pcmpeqb     xmm7, xmm7
    psrldq      xmm7, (SIZEOF_XMMWORD-1)
    pand        xmm7, XMMWORD [rsi+0*SIZEOF_XMMWORD]

    ; Round the column counter up to a multiple of SIZEOF_XMMWORD.
    add         rax, byte SIZEOF_XMMWORD-1
    and         rax, byte -SIZEOF_XMMWORD
    cmp         rax, byte SIZEOF_XMMWORD
    ja          short .columnloop

.columnloop_last:
    ; Last block of the row: the right-hand neighbor ("sample 16") is the
    ; block's own highest byte, kept in the highest byte of xmm6.
    pcmpeqb     xmm6, xmm6
    pslldq      xmm6, (SIZEOF_XMMWORD-1)
    pand        xmm6, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    jmp         short .upsample

.columnloop:
    ; Not the last block: the right-hand neighbor is the first sample of the
    ; next block, moved into the highest byte of xmm6.
    movdqa      xmm6, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    pslldq      xmm6, (SIZEOF_XMMWORD-1)

.upsample:
    movdqa      xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqa      xmm2, xmm1
    movdqa      xmm3, xmm1                ; xmm1=( 0  1  2 ... 13 14 15)
    pslldq      xmm2, 1                   ; xmm2=(--  0  1 ... 12 13 14)
    psrldq      xmm3, 1                   ; xmm3=( 1  2  3 ... 14 15 --)

    por         xmm2, xmm7                ; xmm2=(-1  0  1 ... 12 13 14)
    por         xmm3, xmm6                ; xmm3=( 1  2  3 ... 14 15 16)

    ; Remember this block's last sample as the next block's left neighbor.
    movdqa      xmm7, xmm1
    psrldq      xmm7, (SIZEOF_XMMWORD-1)  ; xmm7=(15 -- -- ... -- -- --)

    ; Widen bytes to words (xmm0 is all zeros).
    movdqa      xmm4, xmm1
    punpcklbw   xmm1, xmm0                ; xmm1=( 0  1  2  3  4  5  6  7)
    punpckhbw   xmm4, xmm0                ; xmm4=( 8  9 10 11 12 13 14 15)
    movdqa      xmm5, xmm2
    punpcklbw   xmm2, xmm0                ; xmm2=(-1  0  1  2  3  4  5  6)
    punpckhbw   xmm5, xmm0                ; xmm5=( 7  8  9 10 11 12 13 14)
    movdqa      xmm6, xmm3
    punpcklbw   xmm3, xmm0                ; xmm3=( 1  2  3  4  5  6  7  8)
    punpckhbw   xmm6, xmm0                ; xmm6=( 9 10 11 12 13 14 15 16)

    pmullw      xmm1, [rel PW_THREE]
    pmullw      xmm4, [rel PW_THREE]
    paddw       xmm2, [rel PW_ONE]
    paddw       xmm5, [rel PW_ONE]
    paddw       xmm3, [rel PW_TWO]
    paddw       xmm6, [rel PW_TWO]

    paddw       xmm2, xmm1
    paddw       xmm5, xmm4
    psrlw       xmm2, 2                 ; xmm2=OutLE=( 0  2  4  6  8 10 12 14)
    psrlw       xmm5, 2                 ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
    paddw       xmm3, xmm1
    paddw       xmm6, xmm4
    psrlw       xmm3, 2                 ; xmm3=OutLO=( 1  3  5  7  9 11 13 15)
    psrlw       xmm6, 2                 ; xmm6=OutHO=(17 19 21 23 25 27 29 31)

    ; Interleave even/odd results: odd samples go to the high byte of each
    ; word, even samples stay in the low byte.
    psllw       xmm3, BYTE_BIT
    psllw       xmm6, BYTE_BIT
    por         xmm2, xmm3              ; xmm2=OutL=( 0  1  2 ... 13 14 15)
    por         xmm5, xmm6              ; xmm5=OutH=(16 17 18 ... 29 30 31)

    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
    movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm5

    sub         rax, byte SIZEOF_XMMWORD
    add         rsi, byte 1*SIZEOF_XMMWORD  ; inptr
    add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr
    cmp         rax, byte SIZEOF_XMMWORD
    ja          near .columnloop
    test        eax, eax
    jnz         near .columnloop_last

    pop         rsi
    pop         rdi
    pop         rax

    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
    dec         rcx                        ; rowctr
    jg          near .rowloop

.return:
    uncollect_args 4
    pop         rbp
    ret

; --------------------------------------------------------------------------
;
; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
; Again a triangle filter; see comments for h2v1 case, above.
;
; Each input row produces two output rows.  First a vertical pass forms
; 16-bit intermediates from the current row and its neighbors:
;   Int0[i] = 3 * row[0][i] + row[-1][i]      (upper output row)
;   Int1[i] = 3 * row[0][i] + row[+1][i]      (lower output row)
; then a horizontal pass produces the output samples:
;   out[2*i]   = (3 * Int[i] + Int[i-1] + 8) >> 4
;   out[2*i+1] = (3 * Int[i] + Int[i+1] + 7) >> 4
; At the left and right ends of a row the missing horizontal neighbor is
; replaced by the edge value itself.
;
; GLOBAL(void)
; jsimd_h2v2_fancy_upsample_sse2(int max_v_samp_factor,
;                                JDIMENSION downsampled_width,
;                                JSAMPARRAY input_data,
;                                JSAMPARRAY *output_data_ptr);
;
; Requirements visible from the code below:
;   * input_data[-1] and the row pointer following the last processed input
;     row are dereferenced, so the row above the first row and the row below
;     the last row must exist.
;   * Rows are accessed with movdqa, so all row pointers must be
;     XMMWORD-aligned, and rows are processed in whole XMMWORD blocks.
;   * The output rows double as scratch storage for the 16-bit intermediates
;     before the final samples overwrite them.
;   * When downsampled_width is not a multiple of SIZEOF_XMMWORD, one dummy
;     sample is WRITTEN just past the end of each of the three input rows.
;   * The row counter is decremented by 2 per input row (two output rows).
;   * Nothing is done if downsampled_width or max_v_samp_factor is zero.
;

; r10d = int max_v_samp_factor
; r11d = JDIMENSION downsampled_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr

; Stack work area, addressed relative to the aligned rbp:
;   wk(0) = left neighbor (last Int0 word of the previous block), upper row
;   wk(1) = left neighbor (last Int1 word of the previous block), lower row
;   wk(2) = right neighbor (first Int0 word of the next block), upper row
;   wk(3) = right neighbor (first Int1 word of the next block), lower row
%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
%define WK_NUM  4

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_sse2)

EXTN(jsimd_h2v2_fancy_upsample_sse2):
    push        rbp
    mov         rax, rsp                     ; rax = original rbp
    sub         rsp, byte 4
    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [rsp], rax
    mov         rbp, rsp                     ; rbp = aligned rbp
    lea         rsp, [wk(0)]                 ; reserve the wk[] area
    collect_args 4
    push        rbx

    mov         eax, r11d               ; colctr
    test        rax, rax
    jz          near .return

    ; max_v_samp_factor is a 32-bit int.  The upper half of its 64-bit
    ; register is not defined by the x86-64 calling conventions, so it is
    ; sign-extended explicitly instead of copying all 64 bits (the JDIMENSION
    ; argument above is likewise read through its 32-bit register).
    movsxd      rcx, r10d               ; rowctr
    test        rcx, rcx
    jz          near .return

    mov         rsi, r12                ; input_data
    mov         rdi, r13
    mov         rdi, JSAMPARRAY [rdi]   ; output_data
.rowloop:
    push        rax                     ; colctr
    push        rcx
    push        rdi
    push        rsi

    mov         rcx, JSAMPROW [rsi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
    mov         rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
    mov         rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
    mov         rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]  ; outptr0
    mov         rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]  ; outptr1

    ; If the width is not a whole number of XMMWORDs, duplicate the last
    ; sample of each of the three input rows once.  rdx (outptr0) is saved
    ; around this because dl is used as the byte scratch register.
    test        rax, SIZEOF_XMMWORD-1
    jz          short .skip
    push        rdx
    mov         dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
    mov         dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
    mov         dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
    pop         rdx
.skip:
    ; -- process the first column block

    movdqa      xmm0, XMMWORD [rbx+0*SIZEOF_XMMWORD]  ; xmm0=row[ 0][0]
    movdqa      xmm1, XMMWORD [rcx+0*SIZEOF_XMMWORD]  ; xmm1=row[-1][0]
    movdqa      xmm2, XMMWORD [rsi+0*SIZEOF_XMMWORD]  ; xmm2=row[+1][0]

    pxor        xmm3, xmm3              ; xmm3=(all 0's)
    movdqa      xmm4, xmm0
    punpcklbw   xmm0, xmm3              ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
    punpckhbw   xmm4, xmm3              ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
    movdqa      xmm5, xmm1
    punpcklbw   xmm1, xmm3              ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
    punpckhbw   xmm5, xmm3              ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
    movdqa      xmm6, xmm2
    punpcklbw   xmm2, xmm3              ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
    punpckhbw   xmm6, xmm3              ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)

    pmullw      xmm0, [rel PW_THREE]
    pmullw      xmm4, [rel PW_THREE]

    pcmpeqb     xmm7, xmm7
    psrldq      xmm7, (SIZEOF_XMMWORD-2)  ; mask selecting the lowest word

    paddw       xmm1, xmm0              ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
    paddw       xmm5, xmm4              ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
    paddw       xmm2, xmm0              ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
    paddw       xmm6, xmm4              ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)

    movdqa      XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1  ; temporarily save
    movdqa      XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5  ; the intermediate data
    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
    movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm6

    ; Left neighbor of the first block is its own first intermediate value.
    pand        xmm1, xmm7              ; xmm1=( 0 -- -- -- -- -- -- --)
    pand        xmm2, xmm7              ; xmm2=( 0 -- -- -- -- -- -- --)

    movdqa      XMMWORD [wk(0)], xmm1
    movdqa      XMMWORD [wk(1)], xmm2

    ; Round the column counter up to a multiple of SIZEOF_XMMWORD.
    add         rax, byte SIZEOF_XMMWORD-1
    and         rax, byte -SIZEOF_XMMWORD
    cmp         rax, byte SIZEOF_XMMWORD
    ja          short .columnloop

.columnloop_last:
    ; -- process the last column block
    ; Right neighbor is the block's own last intermediate value.

    pcmpeqb     xmm1, xmm1
    pslldq      xmm1, (SIZEOF_XMMWORD-2)  ; mask selecting the highest word
    movdqa      xmm2, xmm1

    pand        xmm1, XMMWORD [rdx+1*SIZEOF_XMMWORD]
    pand        xmm2, XMMWORD [rdi+1*SIZEOF_XMMWORD]

    movdqa      XMMWORD [wk(2)], xmm1   ; xmm1=(-- -- -- -- -- -- -- 15)
    movdqa      XMMWORD [wk(3)], xmm2   ; xmm2=(-- -- -- -- -- -- -- 15)

    jmp         near .upsample

.columnloop:
    ; -- process the next column block
    ; Compute the next block's intermediates one block ahead so that their
    ; first word can serve as the current block's right neighbor.

    movdqa      xmm0, XMMWORD [rbx+1*SIZEOF_XMMWORD]  ; xmm0=row[ 0][1]
    movdqa      xmm1, XMMWORD [rcx+1*SIZEOF_XMMWORD]  ; xmm1=row[-1][1]
    movdqa      xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]  ; xmm2=row[+1][1]

    pxor        xmm3, xmm3              ; xmm3=(all 0's)
    movdqa      xmm4, xmm0
    punpcklbw   xmm0, xmm3              ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
    punpckhbw   xmm4, xmm3              ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
    movdqa      xmm5, xmm1
    punpcklbw   xmm1, xmm3              ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
    punpckhbw   xmm5, xmm3              ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
    movdqa      xmm6, xmm2
    punpcklbw   xmm2, xmm3              ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
    punpckhbw   xmm6, xmm3              ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)

    pmullw      xmm0, [rel PW_THREE]
    pmullw      xmm4, [rel PW_THREE]

    paddw       xmm1, xmm0              ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
    paddw       xmm5, xmm4              ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
    paddw       xmm2, xmm0              ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
    paddw       xmm6, xmm4              ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)

    movdqa      XMMWORD [rdx+2*SIZEOF_XMMWORD], xmm1  ; temporarily save
    movdqa      XMMWORD [rdx+3*SIZEOF_XMMWORD], xmm5  ; the intermediate data
    movdqa      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
    movdqa      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm6

    pslldq      xmm1, (SIZEOF_XMMWORD-2)  ; xmm1=(-- -- -- -- -- -- --  0)
    pslldq      xmm2, (SIZEOF_XMMWORD-2)  ; xmm2=(-- -- -- -- -- -- --  0)

    movdqa      XMMWORD [wk(2)], xmm1
    movdqa      XMMWORD [wk(3)], xmm2

.upsample:
    ; -- process the upper row

    movdqa      xmm7, XMMWORD [rdx+0*SIZEOF_XMMWORD]
    movdqa      xmm3, XMMWORD [rdx+1*SIZEOF_XMMWORD]

    movdqa      xmm0, xmm7                ; xmm7=Int0L=( 0  1  2  3  4  5  6  7)
    movdqa      xmm4, xmm3                ; xmm3=Int0H=( 8  9 10 11 12 13 14 15)
    psrldq      xmm0, 2                   ; xmm0=( 1  2  3  4  5  6  7 --)
    pslldq      xmm4, (SIZEOF_XMMWORD-2)  ; xmm4=(-- -- -- -- -- -- --  8)
    movdqa      xmm5, xmm7
    movdqa      xmm6, xmm3
    psrldq      xmm5, (SIZEOF_XMMWORD-2)  ; xmm5=( 7 -- -- -- -- -- -- --)
    pslldq      xmm6, 2                   ; xmm6=(--  8  9 10 11 12 13 14)

    por         xmm0, xmm4                ; xmm0=( 1  2  3  4  5  6  7  8)
    por         xmm5, xmm6                ; xmm5=( 7  8  9 10 11 12 13 14)

    movdqa      xmm1, xmm7
    movdqa      xmm2, xmm3
    pslldq      xmm1, 2                   ; xmm1=(--  0  1  2  3  4  5  6)
    psrldq      xmm2, 2                   ; xmm2=( 9 10 11 12 13 14 15 --)
    movdqa      xmm4, xmm3
    psrldq      xmm4, (SIZEOF_XMMWORD-2)  ; xmm4=(15 -- -- -- -- -- -- --)

    por         xmm1, XMMWORD [wk(0)]     ; xmm1=(-1  0  1  2  3  4  5  6)
    por         xmm2, XMMWORD [wk(2)]     ; xmm2=( 9 10 11 12 13 14 15 16)

    movdqa      XMMWORD [wk(0)], xmm4     ; left neighbor for the next block

    pmullw      xmm7, [rel PW_THREE]
    pmullw      xmm3, [rel PW_THREE]
    paddw       xmm1, [rel PW_EIGHT]
    paddw       xmm5, [rel PW_EIGHT]
    paddw       xmm0, [rel PW_SEVEN]
    paddw       xmm2, [rel PW_SEVEN]

    paddw       xmm1, xmm7
    paddw       xmm5, xmm3
    psrlw       xmm1, 4                 ; xmm1=Out0LE=( 0  2  4  6  8 10 12 14)
    psrlw       xmm5, 4                 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
    paddw       xmm0, xmm7
    paddw       xmm2, xmm3
    psrlw       xmm0, 4                 ; xmm0=Out0LO=( 1  3  5  7  9 11 13 15)
    psrlw       xmm2, 4                 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)

    psllw       xmm0, BYTE_BIT
    psllw       xmm2, BYTE_BIT
    por         xmm1, xmm0              ; xmm1=Out0L=( 0  1  2 ... 13 14 15)
    por         xmm5, xmm2              ; xmm5=Out0H=(16 17 18 ... 29 30 31)

    movdqa      XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1
    movdqa      XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5

    ; -- process the lower row

    movdqa      xmm6, XMMWORD [rdi+0*SIZEOF_XMMWORD]
    movdqa      xmm4, XMMWORD [rdi+1*SIZEOF_XMMWORD]

    movdqa      xmm7, xmm6                ; xmm6=Int1L=( 0  1  2  3  4  5  6  7)
    movdqa      xmm3, xmm4                ; xmm4=Int1H=( 8  9 10 11 12 13 14 15)
    psrldq      xmm7, 2                   ; xmm7=( 1  2  3  4  5  6  7 --)
    pslldq      xmm3, (SIZEOF_XMMWORD-2)  ; xmm3=(-- -- -- -- -- -- --  8)
    movdqa      xmm0, xmm6
    movdqa      xmm2, xmm4
    psrldq      xmm0, (SIZEOF_XMMWORD-2)  ; xmm0=( 7 -- -- -- -- -- -- --)
    pslldq      xmm2, 2                   ; xmm2=(--  8  9 10 11 12 13 14)

    por         xmm7, xmm3                ; xmm7=( 1  2  3  4  5  6  7  8)
    por         xmm0, xmm2                ; xmm0=( 7  8  9 10 11 12 13 14)

    movdqa      xmm1, xmm6
    movdqa      xmm5, xmm4
    pslldq      xmm1, 2                   ; xmm1=(--  0  1  2  3  4  5  6)
    psrldq      xmm5, 2                   ; xmm5=( 9 10 11 12 13 14 15 --)
    movdqa      xmm3, xmm4
    psrldq      xmm3, (SIZEOF_XMMWORD-2)  ; xmm3=(15 -- -- -- -- -- -- --)

    por         xmm1, XMMWORD [wk(1)]     ; xmm1=(-1  0  1  2  3  4  5  6)
    por         xmm5, XMMWORD [wk(3)]     ; xmm5=( 9 10 11 12 13 14 15 16)

    movdqa      XMMWORD [wk(1)], xmm3     ; left neighbor for the next block

    pmullw      xmm6, [rel PW_THREE]
    pmullw      xmm4, [rel PW_THREE]
    paddw       xmm1, [rel PW_EIGHT]
    paddw       xmm0, [rel PW_EIGHT]
    paddw       xmm7, [rel PW_SEVEN]
    paddw       xmm5, [rel PW_SEVEN]

    paddw       xmm1, xmm6
    paddw       xmm0, xmm4
    psrlw       xmm1, 4                 ; xmm1=Out1LE=( 0  2  4  6  8 10 12 14)
    psrlw       xmm0, 4                 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
    paddw       xmm7, xmm6
    paddw       xmm5, xmm4
    psrlw       xmm7, 4                 ; xmm7=Out1LO=( 1  3  5  7  9 11 13 15)
    psrlw       xmm5, 4                 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)

    psllw       xmm7, BYTE_BIT
    psllw       xmm5, BYTE_BIT
    por         xmm1, xmm7              ; xmm1=Out1L=( 0  1  2 ... 13 14 15)
    por         xmm0, xmm5              ; xmm0=Out1H=(16 17 18 ... 29 30 31)

    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm1
    movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm0

    sub         rax, byte SIZEOF_XMMWORD
    add         rcx, byte 1*SIZEOF_XMMWORD  ; inptr1(above)
    add         rbx, byte 1*SIZEOF_XMMWORD  ; inptr0
    add         rsi, byte 1*SIZEOF_XMMWORD  ; inptr1(below)
    add         rdx, byte 2*SIZEOF_XMMWORD  ; outptr0
    add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr1
    cmp         rax, byte SIZEOF_XMMWORD
    ja          near .columnloop
    test        rax, rax
    jnz         near .columnloop_last

    pop         rsi
    pop         rdi
    pop         rcx
    pop         rax

    add         rsi, byte 1*SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte 2*SIZEOF_JSAMPROW  ; output_data
    sub         rcx, byte 2                  ; rowctr
    jg          near .rowloop

.return:
    pop         rbx
    uncollect_args 4
    mov         rsp, rbp                ; rsp <- aligned rbp
    pop         rsp                     ; rsp <- original rbp
    pop         rbp
    ret

; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
; It's still a box filter.
;
; Every input sample is simply written twice to the output row.
;
; GLOBAL(void)
; jsimd_h2v1_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
;
; Requirements visible from the code below:
;   * Rows are accessed with movdqa, so every input and output row pointer
;     must be XMMWORD-aligned.
;   * output_width is rounded up to a multiple of 2*SIZEOF_XMMWORD and whole
;     blocks are written, so the output rows must be padded to that size
;     (and the input rows to half of it).
;   * Nothing is done if output_width or max_v_samp_factor is zero.
;

; r10d = int max_v_samp_factor
; r11d = JDIMENSION output_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_upsample_sse2)

EXTN(jsimd_h2v1_upsample_sse2):
    push        rbp
    mov         rax, rsp                ; NOTE(review): presumably the frame base
                                        ; expected by collect_args (macro is
                                        ; defined in jsimdext.inc) -- confirm
    mov         rbp, rsp
    collect_args 4

    ; rdx = output_width rounded up to a multiple of 2*SIZEOF_XMMWORD
    mov         edx, r11d
    add         rdx, byte (2*SIZEOF_XMMWORD)-1
    and         rdx, byte -(2*SIZEOF_XMMWORD)
    jz          near .return

    ; max_v_samp_factor is a 32-bit int.  The upper half of its 64-bit
    ; register is not defined by the x86-64 calling conventions, so it is
    ; sign-extended explicitly instead of copying all 64 bits (the JDIMENSION
    ; argument above is likewise read through its 32-bit register).
    movsxd      rcx, r10d               ; rowctr
    test        rcx, rcx
    jz          short .return

    mov         rsi, r12                ; input_data
    mov         rdi, r13
    mov         rdi, JSAMPARRAY [rdi]   ; output_data
.rowloop:
    push        rdi
    push        rsi

    mov         rsi, JSAMPROW [rsi]     ; inptr
    mov         rdi, JSAMPROW [rdi]     ; outptr
    mov         rax, rdx                ; colctr
.columnloop:
    ; The loop body is unrolled twice; each half expands one input XMMWORD
    ; into two output XMMWORDs and may end the row.

    movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]

    movdqa      xmm1, xmm0
    punpcklbw   xmm0, xmm0              ; duplicate each of the low 8 samples
    punpckhbw   xmm1, xmm1              ; duplicate each of the high 8 samples

    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
    movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1

    sub         rax, byte 2*SIZEOF_XMMWORD
    jz          short .nextrow

    movdqa      xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]

    movdqa      xmm3, xmm2
    punpcklbw   xmm2, xmm2
    punpckhbw   xmm3, xmm3

    movdqa      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
    movdqa      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3

    sub         rax, byte 2*SIZEOF_XMMWORD
    jz          short .nextrow

    add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr
    add         rdi, byte 4*SIZEOF_XMMWORD  ; outptr
    jmp         short .columnloop

.nextrow:
    pop         rsi
    pop         rdi

    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
    dec         rcx                        ; rowctr
    jg          short .rowloop

.return:
    uncollect_args 4
    pop         rbp
    ret

; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
; It's still a box filter.
;
; Every input sample is written twice horizontally, and the resulting row is
; stored to two consecutive output rows.
;
; GLOBAL(void)
; jsimd_h2v2_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
;
; Requirements visible from the code below:
;   * Rows are accessed with movdqa, so every input and output row pointer
;     must be XMMWORD-aligned.
;   * output_width is rounded up to a multiple of 2*SIZEOF_XMMWORD and whole
;     blocks are written, so the output rows must be padded to that size
;     (and the input rows to half of it).
;   * The row counter is decremented by 2 per input row (two output rows).
;   * Nothing is done if output_width or max_v_samp_factor is zero.
;

; r10d = int max_v_samp_factor
; r11d = JDIMENSION output_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_upsample_sse2)

EXTN(jsimd_h2v2_upsample_sse2):
    push        rbp
    mov         rax, rsp                ; NOTE(review): presumably the frame base
                                        ; expected by collect_args (macro is
                                        ; defined in jsimdext.inc) -- confirm
    mov         rbp, rsp
    collect_args 4
    push        rbx                     ; rbx is used as outptr0 below

    ; rdx = output_width rounded up to a multiple of 2*SIZEOF_XMMWORD
    mov         edx, r11d
    add         rdx, byte (2*SIZEOF_XMMWORD)-1
    and         rdx, byte -(2*SIZEOF_XMMWORD)
    jz          near .return

    ; max_v_samp_factor is a 32-bit int.  The upper half of its 64-bit
    ; register is not defined by the x86-64 calling conventions, so it is
    ; sign-extended explicitly instead of copying all 64 bits (the JDIMENSION
    ; argument above is likewise read through its 32-bit register).
    movsxd      rcx, r10d               ; rowctr
    test        rcx, rcx
    jz          near .return

    mov         rsi, r12                ; input_data
    mov         rdi, r13
    mov         rdi, JSAMPARRAY [rdi]   ; output_data
.rowloop:
    push        rdi
    push        rsi

    mov         rsi, JSAMPROW [rsi]                    ; inptr
    mov         rbx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]  ; outptr0
    mov         rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]  ; outptr1
    mov         rax, rdx                               ; colctr
.columnloop:
    ; The loop body is unrolled twice; each half expands one input XMMWORD
    ; into two output XMMWORDs, stores them to both output rows, and may end
    ; the row.

    movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]

    movdqa      xmm1, xmm0
    punpcklbw   xmm0, xmm0              ; duplicate each of the low 8 samples
    punpckhbw   xmm1, xmm1              ; duplicate each of the high 8 samples

    movdqa      XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0
    movdqa      XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
    movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1

    sub         rax, byte 2*SIZEOF_XMMWORD
    jz          short .nextrow

    movdqa      xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]

    movdqa      xmm3, xmm2
    punpcklbw   xmm2, xmm2
    punpckhbw   xmm3, xmm3

    movdqa      XMMWORD [rbx+2*SIZEOF_XMMWORD], xmm2
    movdqa      XMMWORD [rbx+3*SIZEOF_XMMWORD], xmm3
    movdqa      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
    movdqa      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3

    sub         rax, byte 2*SIZEOF_XMMWORD
    jz          short .nextrow

    add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr
    add         rbx, byte 4*SIZEOF_XMMWORD  ; outptr0
    add         rdi, byte 4*SIZEOF_XMMWORD  ; outptr1
    jmp         short .columnloop

.nextrow:
    pop         rsi
    pop         rdi

    add         rsi, byte 1*SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte 2*SIZEOF_JSAMPROW  ; output_data
    sub         rcx, byte 2                  ; rowctr
    jg          near .rowloop

.return:
    pop         rbx
    uncollect_args 4
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32
