;
; jdsample.asm - upsampling (64-bit SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jsimdext.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_CONST

    alignz      32
    GLOBAL_DATA(jconst_fancy_upsample_sse2)

EXTN(jconst_fancy_upsample_sse2):

; Packed-word constants: each is eight identical 16-bit lanes (one XMM word).
; They are referenced RIP-relative ([rel PW_xxx]) by the fancy upsampling
; routines in this file.
PW_ONE   times 8 dw 1                   ; h2v1 fancy: rounding, even outputs
PW_TWO   times 8 dw 2                   ; h2v1 fancy: rounding, odd outputs
PW_THREE times 8 dw 3                   ; weight of the nearer sample (x3)
PW_SEVEN times 8 dw 7                   ; h2v2 fancy: rounding, odd outputs
PW_EIGHT times 8 dw 8                   ; h2v2 fancy: rounding, even outputs

    alignz      32

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        64
;
; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
;
; The upsampling algorithm is linear interpolation between pixel centers,
; also known as a "triangle filter".  This is a good compromise between
; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
; of the way between input pixel centers.
;
; For every input sample s[i] two output samples are produced:
;     out[2*i]   = (3 * s[i] + s[i-1] + 1) >> 2
;     out[2*i+1] = (3 * s[i] + s[i+1] + 2) >> 2
; The left neighbour of the first sample is the first sample itself, and the
; right neighbour of the last sample is the last sample itself.
;
; Each row is processed one XMM word of input at a time (producing two XMM
; words of output) with aligned loads and stores (movdqa).  When
; downsampled_width is not a multiple of SIZEOF_XMMWORD, a dummy sample (a
; copy of the last real sample) is written into the input row just past its
; end so that the last real sample sees itself as right neighbour.
; NOTE(review): this presumes the caller provides XMM-aligned row buffers
; that are padded to a whole number of XMM words -- confirm against the
; buffer allocator.
;
; Nothing is done if downsampled_width or max_v_samp_factor is zero.
;
; GLOBAL(void)
; jsimd_h2v1_fancy_upsample_sse2(int max_v_samp_factor,
;                                JDIMENSION downsampled_width,
;                                JSAMPARRAY input_data,
;                                JSAMPARRAY *output_data_ptr);
;

; r10d = int max_v_samp_factor
; r11d = JDIMENSION downsampled_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr
;
; Register roles:
;   rax  = colctr (width; rounded up to a multiple of SIZEOF_XMMWORD inside
;          the row loop, then counted down one XMM word per iteration)
;   rcx  = rowctr
;   rsi  = input_data cursor / inptr inside the row loop
;   rdi  = output_data cursor / outptr inside the row loop
;   dl   = scratch for the dummy sample
;   xmm0 = all zeros (for byte -> word unpacking)
;   xmm7 = last sample of the previous block, in byte 0   ("sample -1")
;   xmm6 = first sample of the next block, in byte 15     ("sample 16")

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_sse2)

EXTN(jsimd_h2v1_fancy_upsample_sse2):
    push        rbp
    mov         rax, rsp                ; presumably for collect_args (macro
                                        ; body is in jsimdext.inc)
    mov         rbp, rsp
    collect_args 4

    mov         eax, r11d               ; colctr
    test        rax, rax
    jz          near .return

    ; max_v_samp_factor is a 32-bit int: sign-extend it explicitly instead of
    ; relying on whatever the caller left in the upper half of the register.
    movsxd      rcx, r10d               ; rowctr
    test        rcx, rcx
    jz          near .return

    mov         rsi, r12                ; input_data
    mov         rdi, r13
    mov         rdi, JSAMPARRAY [rdi]   ; output_data
.rowloop:
    push        rax                     ; colctr
    push        rdi
    push        rsi

    mov         rsi, JSAMPROW [rsi]     ; inptr
    mov         rdi, JSAMPROW [rdi]     ; outptr

    test        rax, SIZEOF_XMMWORD-1
    jz          short .skip
    mov         dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
.skip:
    pxor        xmm0, xmm0              ; xmm0=(all 0's)
    pcmpeqb     xmm7, xmm7
    psrldq      xmm7, (SIZEOF_XMMWORD-1)  ; mask: only byte 0 set
    pand        xmm7, XMMWORD [rsi+0*SIZEOF_XMMWORD]  ; left edge: "-1" = sample 0

    add         rax, byte SIZEOF_XMMWORD-1
    and         rax, byte -SIZEOF_XMMWORD  ; round colctr up to whole XMM words
    cmp         rax, byte SIZEOF_XMMWORD
    ja          short .columnloop

.columnloop_last:
    ; Last block of the row: the right neighbour of sample 15 is sample 15.
    pcmpeqb     xmm6, xmm6
    pslldq      xmm6, (SIZEOF_XMMWORD-1)  ; mask: only byte 15 set
    pand        xmm6, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    jmp         short .upsample

.columnloop:
    ; Not the last block: the right neighbour of sample 15 is the first
    ; sample of the next block.
    movdqa      xmm6, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    pslldq      xmm6, (SIZEOF_XMMWORD-1)

.upsample:
    movdqa      xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqa      xmm2, xmm1
    movdqa      xmm3, xmm1                ; xmm1=( 0  1  2 ... 13 14 15)
    pslldq      xmm2, 1                   ; xmm2=(--  0  1 ... 12 13 14)
    psrldq      xmm3, 1                   ; xmm3=( 1  2  3 ... 14 15 --)

    por         xmm2, xmm7                ; xmm2=(-1  0  1 ... 12 13 14)
    por         xmm3, xmm6                ; xmm3=( 1  2  3 ... 14 15 16)

    ; Carry this block's last sample into the next iteration as its "-1".
    movdqa      xmm7, xmm1
    psrldq      xmm7, (SIZEOF_XMMWORD-1)  ; xmm7=(15 -- -- ... -- -- --)

    ; Widen bytes to words.
    movdqa      xmm4, xmm1
    punpcklbw   xmm1, xmm0                ; xmm1=( 0  1  2  3  4  5  6  7)
    punpckhbw   xmm4, xmm0                ; xmm4=( 8  9 10 11 12 13 14 15)
    movdqa      xmm5, xmm2
    punpcklbw   xmm2, xmm0                ; xmm2=(-1  0  1  2  3  4  5  6)
    punpckhbw   xmm5, xmm0                ; xmm5=( 7  8  9 10 11 12 13 14)
    movdqa      xmm6, xmm3
    punpcklbw   xmm3, xmm0                ; xmm3=( 1  2  3  4  5  6  7  8)
    punpckhbw   xmm6, xmm0                ; xmm6=( 9 10 11 12 13 14 15 16)

    pmullw      xmm1, [rel PW_THREE]      ; 3 * current
    pmullw      xmm4, [rel PW_THREE]
    paddw       xmm2, [rel PW_ONE]        ; left neighbour + rounding
    paddw       xmm5, [rel PW_ONE]
    paddw       xmm3, [rel PW_TWO]        ; right neighbour + rounding
    paddw       xmm6, [rel PW_TWO]

    paddw       xmm2, xmm1
    paddw       xmm5, xmm4
    psrlw       xmm2, 2                 ; xmm2=OutLE=( 0  2  4  6  8 10 12 14)
    psrlw       xmm5, 2                 ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
    paddw       xmm3, xmm1
    paddw       xmm6, xmm4
    psrlw       xmm3, 2                 ; xmm3=OutLO=( 1  3  5  7  9 11 13 15)
    psrlw       xmm6, 2                 ; xmm6=OutHO=(17 19 21 23 25 27 29 31)

    ; Interleave: even results stay in the low byte of each word, odd
    ; results are moved to the high byte.
    psllw       xmm3, BYTE_BIT
    psllw       xmm6, BYTE_BIT
    por         xmm2, xmm3              ; xmm2=OutL=( 0  1  2 ... 13 14 15)
    por         xmm5, xmm6              ; xmm5=OutH=(16 17 18 ... 29 30 31)

    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
    movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm5

    sub         rax, byte SIZEOF_XMMWORD
    add         rsi, byte 1*SIZEOF_XMMWORD  ; inptr
    add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr
    cmp         rax, byte SIZEOF_XMMWORD
    ja          near .columnloop            ; more than one block left
    test        eax, eax
    jnz         near .columnloop_last       ; exactly one block left

    pop         rsi
    pop         rdi
    pop         rax

    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
    dec         rcx                        ; rowctr
    jg          near .rowloop

.return:
    uncollect_args 4
    pop         rbp
    ret

; --------------------------------------------------------------------------
;
; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
; Again a triangle filter; see comments for h2v1 case, above.
;
; Every input row produces two output rows.  For each XMM word of input the
; work is done in two passes:
;
;   1. Vertical pass, 16-bit results, parked temporarily in the output rows
;      (one XMM word of input samples -> two XMM words of intermediates):
;          Int0[i] = 3 * row[0][i] + row[-1][i]      (upper output row)
;          Int1[i] = 3 * row[0][i] + row[+1][i]      (lower output row)
;
;   2. Horizontal pass over each intermediate row, overwriting it with the
;      final 8-bit samples:
;          out[2*i]   = (3 * Int[i] + Int[i-1] + 8) >> 4
;          out[2*i+1] = (3 * Int[i] + Int[i+1] + 7) >> 4
;
; At the left edge Int[-1] is Int[0]; at the right edge the value after the
; last one is the last one.  As in the h2v1 case, a dummy sample is appended
; to each of the three input rows when downsampled_width is not a multiple
; of SIZEOF_XMMWORD.
;
; The rows at input_data[-1] and input_data[+1] are dereferenced, so the
; caller must provide valid row pointers on both sides of every row
; processed.  rowctr counts OUTPUT rows (it is decremented by two per input
; row).  NOTE(review): aligned (movdqa) accesses are used throughout, which
; presumes XMM-aligned, padded row buffers -- confirm against the allocator.
;
; Nothing is done if downsampled_width or max_v_samp_factor is zero.
;
; GLOBAL(void)
; jsimd_h2v2_fancy_upsample_sse2(int max_v_samp_factor,
;                                JDIMENSION downsampled_width,
;                                JSAMPARRAY input_data,
;                                JSAMPARRAY *output_data_ptr);
;

; r10d = int max_v_samp_factor
; r11d = JDIMENSION downsampled_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr
;
; Register roles inside the row loop:
;   rax = colctr             rcx = inptr1(above)    rbx = inptr0
;   rsi = inptr1(below)      rdx = outptr0          rdi = outptr1
;
; Stack work area (XMM-aligned, addressed relative to the aligned rbp):
;   wk(0) = Int0 carry from the previous block ("-1"), word 0
;   wk(1) = Int1 carry from the previous block ("-1"), word 0
;   wk(2) = Int0 of the next block's first sample ("16"), word 7
;   wk(3) = Int1 of the next block's first sample ("16"), word 7

%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
%define WK_NUM  4

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_sse2)

EXTN(jsimd_h2v2_fancy_upsample_sse2):
    push        rbp
    mov         rax, rsp                     ; rax = rsp after the push
                                             ; (points at the saved rbp)
    sub         rsp, byte 4
    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [rsp], rax                   ; remember the unaligned rsp
    mov         rbp, rsp                     ; rbp = aligned rbp
    lea         rsp, [wk(0)]                 ; reserve wk[WK_NUM] below rbp
    collect_args 4
    push        rbx                          ; rbx is callee-saved

    mov         eax, r11d               ; colctr
    test        rax, rax
    jz          near .return

    ; max_v_samp_factor is a 32-bit int: sign-extend it explicitly instead of
    ; relying on whatever the caller left in the upper half of the register.
    movsxd      rcx, r10d               ; rowctr
    test        rcx, rcx
    jz          near .return

    mov         rsi, r12                ; input_data
    mov         rdi, r13
    mov         rdi, JSAMPARRAY [rdi]   ; output_data
.rowloop:
    push        rax                     ; colctr
    push        rcx
    push        rdi
    push        rsi

    mov         rcx, JSAMPROW [rsi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
    mov         rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
    mov         rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
    mov         rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]  ; outptr0
    mov         rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]  ; outptr1

    test        rax, SIZEOF_XMMWORD-1
    jz          short .skip
    push        rdx                     ; dl is needed as scratch
    mov         dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
    mov         dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
    mov         dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
    pop         rdx
.skip:
    ; -- process the first column block

    movdqa      xmm0, XMMWORD [rbx+0*SIZEOF_XMMWORD]  ; xmm0=row[ 0][0]
    movdqa      xmm1, XMMWORD [rcx+0*SIZEOF_XMMWORD]  ; xmm1=row[-1][0]
    movdqa      xmm2, XMMWORD [rsi+0*SIZEOF_XMMWORD]  ; xmm2=row[+1][0]

    pxor        xmm3, xmm3              ; xmm3=(all 0's)
    movdqa      xmm4, xmm0
    punpcklbw   xmm0, xmm3              ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
    punpckhbw   xmm4, xmm3              ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
    movdqa      xmm5, xmm1
    punpcklbw   xmm1, xmm3              ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
    punpckhbw   xmm5, xmm3              ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
    movdqa      xmm6, xmm2
    punpcklbw   xmm2, xmm3              ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
    punpckhbw   xmm6, xmm3              ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)

    pmullw      xmm0, [rel PW_THREE]
    pmullw      xmm4, [rel PW_THREE]

    pcmpeqb     xmm7, xmm7
    psrldq      xmm7, (SIZEOF_XMMWORD-2)  ; mask: only word 0 set

    paddw       xmm1, xmm0              ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
    paddw       xmm5, xmm4              ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
    paddw       xmm2, xmm0              ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
    paddw       xmm6, xmm4              ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)

    movdqa      XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1  ; temporarily save
    movdqa      XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5  ; the intermediate data
    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
    movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm6

    ; Left edge: the "-1" neighbour of sample 0 is sample 0 itself.
    pand        xmm1, xmm7              ; xmm1=( 0 -- -- -- -- -- -- --)
    pand        xmm2, xmm7              ; xmm2=( 0 -- -- -- -- -- -- --)

    movdqa      XMMWORD [wk(0)], xmm1
    movdqa      XMMWORD [wk(1)], xmm2

    add         rax, byte SIZEOF_XMMWORD-1
    and         rax, byte -SIZEOF_XMMWORD  ; round colctr up to whole XMM words
    cmp         rax, byte SIZEOF_XMMWORD
    ja          short .columnloop

.columnloop_last:
    ; -- process the last column block
    ; Right edge: the "16" neighbour of sample 15 is sample 15 itself.

    pcmpeqb     xmm1, xmm1
    pslldq      xmm1, (SIZEOF_XMMWORD-2)  ; mask: only word 7 set
    movdqa      xmm2, xmm1

    pand        xmm1, XMMWORD [rdx+1*SIZEOF_XMMWORD]
    pand        xmm2, XMMWORD [rdi+1*SIZEOF_XMMWORD]

    movdqa      XMMWORD [wk(2)], xmm1   ; xmm1=(-- -- -- -- -- -- -- 15)
    movdqa      XMMWORD [wk(3)], xmm2   ; xmm2=(-- -- -- -- -- -- -- 15)

    jmp         near .upsample

.columnloop:
    ; -- process the next column block
    ; Run the vertical pass one block ahead so that its first intermediate
    ; sample is available as the right neighbour of the current block.

    movdqa      xmm0, XMMWORD [rbx+1*SIZEOF_XMMWORD]  ; xmm0=row[ 0][1]
    movdqa      xmm1, XMMWORD [rcx+1*SIZEOF_XMMWORD]  ; xmm1=row[-1][1]
    movdqa      xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]  ; xmm2=row[+1][1]

    pxor        xmm3, xmm3              ; xmm3=(all 0's)
    movdqa      xmm4, xmm0
    punpcklbw   xmm0, xmm3              ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
    punpckhbw   xmm4, xmm3              ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
    movdqa      xmm5, xmm1
    punpcklbw   xmm1, xmm3              ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
    punpckhbw   xmm5, xmm3              ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
    movdqa      xmm6, xmm2
    punpcklbw   xmm2, xmm3              ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
    punpckhbw   xmm6, xmm3              ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)

    pmullw      xmm0, [rel PW_THREE]
    pmullw      xmm4, [rel PW_THREE]

    paddw       xmm1, xmm0              ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
    paddw       xmm5, xmm4              ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
    paddw       xmm2, xmm0              ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
    paddw       xmm6, xmm4              ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)

    movdqa      XMMWORD [rdx+2*SIZEOF_XMMWORD], xmm1  ; temporarily save
    movdqa      XMMWORD [rdx+3*SIZEOF_XMMWORD], xmm5  ; the intermediate data
    movdqa      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
    movdqa      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm6

    pslldq      xmm1, (SIZEOF_XMMWORD-2)  ; xmm1=(-- -- -- -- -- -- --  0)
    pslldq      xmm2, (SIZEOF_XMMWORD-2)  ; xmm2=(-- -- -- -- -- -- --  0)

    movdqa      XMMWORD [wk(2)], xmm1
    movdqa      XMMWORD [wk(3)], xmm2

.upsample:
    ; -- process the upper row

    movdqa      xmm7, XMMWORD [rdx+0*SIZEOF_XMMWORD]
    movdqa      xmm3, XMMWORD [rdx+1*SIZEOF_XMMWORD]

    movdqa      xmm0, xmm7                ; xmm7=Int0L=( 0  1  2  3  4  5  6  7)
    movdqa      xmm4, xmm3                ; xmm3=Int0H=( 8  9 10 11 12 13 14 15)
    psrldq      xmm0, 2                   ; xmm0=( 1  2  3  4  5  6  7 --)
    pslldq      xmm4, (SIZEOF_XMMWORD-2)  ; xmm4=(-- -- -- -- -- -- --  8)
    movdqa      xmm5, xmm7
    movdqa      xmm6, xmm3
    psrldq      xmm5, (SIZEOF_XMMWORD-2)  ; xmm5=( 7 -- -- -- -- -- -- --)
    pslldq      xmm6, 2                   ; xmm6=(--  8  9 10 11 12 13 14)

    por         xmm0, xmm4                ; xmm0=( 1  2  3  4  5  6  7  8)
    por         xmm5, xmm6                ; xmm5=( 7  8  9 10 11 12 13 14)

    movdqa      xmm1, xmm7
    movdqa      xmm2, xmm3
    pslldq      xmm1, 2                   ; xmm1=(--  0  1  2  3  4  5  6)
    psrldq      xmm2, 2                   ; xmm2=( 9 10 11 12 13 14 15 --)
    movdqa      xmm4, xmm3
    psrldq      xmm4, (SIZEOF_XMMWORD-2)  ; xmm4=(15 -- -- -- -- -- -- --)

    por         xmm1, XMMWORD [wk(0)]     ; xmm1=(-1  0  1  2  3  4  5  6)
    por         xmm2, XMMWORD [wk(2)]     ; xmm2=( 9 10 11 12 13 14 15 16)

    movdqa      XMMWORD [wk(0)], xmm4     ; carry "15" into the next block

    pmullw      xmm7, [rel PW_THREE]
    pmullw      xmm3, [rel PW_THREE]
    paddw       xmm1, [rel PW_EIGHT]
    paddw       xmm5, [rel PW_EIGHT]
    paddw       xmm0, [rel PW_SEVEN]
    paddw       xmm2, [rel PW_SEVEN]

    paddw       xmm1, xmm7
    paddw       xmm5, xmm3
    psrlw       xmm1, 4                 ; xmm1=Out0LE=( 0  2  4  6  8 10 12 14)
    psrlw       xmm5, 4                 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
    paddw       xmm0, xmm7
    paddw       xmm2, xmm3
    psrlw       xmm0, 4                 ; xmm0=Out0LO=( 1  3  5  7  9 11 13 15)
    psrlw       xmm2, 4                 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)

    psllw       xmm0, BYTE_BIT
    psllw       xmm2, BYTE_BIT
    por         xmm1, xmm0              ; xmm1=Out0L=( 0  1  2 ... 13 14 15)
    por         xmm5, xmm2              ; xmm5=Out0H=(16 17 18 ... 29 30 31)

    movdqa      XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1
    movdqa      XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5

    ; -- process the lower row

    movdqa      xmm6, XMMWORD [rdi+0*SIZEOF_XMMWORD]
    movdqa      xmm4, XMMWORD [rdi+1*SIZEOF_XMMWORD]

    movdqa      xmm7, xmm6                ; xmm6=Int1L=( 0  1  2  3  4  5  6  7)
    movdqa      xmm3, xmm4                ; xmm4=Int1H=( 8  9 10 11 12 13 14 15)
    psrldq      xmm7, 2                   ; xmm7=( 1  2  3  4  5  6  7 --)
    pslldq      xmm3, (SIZEOF_XMMWORD-2)  ; xmm3=(-- -- -- -- -- -- --  8)
    movdqa      xmm0, xmm6
    movdqa      xmm2, xmm4
    psrldq      xmm0, (SIZEOF_XMMWORD-2)  ; xmm0=( 7 -- -- -- -- -- -- --)
    pslldq      xmm2, 2                   ; xmm2=(--  8  9 10 11 12 13 14)

    por         xmm7, xmm3                ; xmm7=( 1  2  3  4  5  6  7  8)
    por         xmm0, xmm2                ; xmm0=( 7  8  9 10 11 12 13 14)

    movdqa      xmm1, xmm6
    movdqa      xmm5, xmm4
    pslldq      xmm1, 2                   ; xmm1=(--  0  1  2  3  4  5  6)
    psrldq      xmm5, 2                   ; xmm5=( 9 10 11 12 13 14 15 --)
    movdqa      xmm3, xmm4
    psrldq      xmm3, (SIZEOF_XMMWORD-2)  ; xmm3=(15 -- -- -- -- -- -- --)

    por         xmm1, XMMWORD [wk(1)]     ; xmm1=(-1  0  1  2  3  4  5  6)
    por         xmm5, XMMWORD [wk(3)]     ; xmm5=( 9 10 11 12 13 14 15 16)

    movdqa      XMMWORD [wk(1)], xmm3     ; carry "15" into the next block

    pmullw      xmm6, [rel PW_THREE]
    pmullw      xmm4, [rel PW_THREE]
    paddw       xmm1, [rel PW_EIGHT]
    paddw       xmm0, [rel PW_EIGHT]
    paddw       xmm7, [rel PW_SEVEN]
    paddw       xmm5, [rel PW_SEVEN]

    paddw       xmm1, xmm6
    paddw       xmm0, xmm4
    psrlw       xmm1, 4                 ; xmm1=Out1LE=( 0  2  4  6  8 10 12 14)
    psrlw       xmm0, 4                 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
    paddw       xmm7, xmm6
    paddw       xmm5, xmm4
    psrlw       xmm7, 4                 ; xmm7=Out1LO=( 1  3  5  7  9 11 13 15)
    psrlw       xmm5, 4                 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)

    psllw       xmm7, BYTE_BIT
    psllw       xmm5, BYTE_BIT
    por         xmm1, xmm7              ; xmm1=Out1L=( 0  1  2 ... 13 14 15)
    por         xmm0, xmm5              ; xmm0=Out1H=(16 17 18 ... 29 30 31)

    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm1
    movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm0

    sub         rax, byte SIZEOF_XMMWORD
    add         rcx, byte 1*SIZEOF_XMMWORD  ; inptr1(above)
    add         rbx, byte 1*SIZEOF_XMMWORD  ; inptr0
    add         rsi, byte 1*SIZEOF_XMMWORD  ; inptr1(below)
    add         rdx, byte 2*SIZEOF_XMMWORD  ; outptr0
    add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr1
    cmp         rax, byte SIZEOF_XMMWORD
    ja          near .columnloop            ; more than one block left
    test        rax, rax
    jnz         near .columnloop_last       ; exactly one block left

    pop         rsi
    pop         rdi
    pop         rcx
    pop         rax

    add         rsi, byte 1*SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte 2*SIZEOF_JSAMPROW  ; output_data
    sub         rcx, byte 2                  ; rowctr
    jg          near .rowloop

.return:
    pop         rbx
    uncollect_args 4
    mov         rsp, rbp                ; rsp <- aligned rbp
    pop         rsp                     ; rsp <- unaligned rsp saved in the
                                        ; prologue (points at the saved rbp)
    pop         rbp
    ret

; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
; It's still a box filter.
;
; Every input sample is simply written twice to the output row.  The column
; count is output_width rounded up to a multiple of 2*SIZEOF_XMMWORD, and the
; column loop is unrolled by two: each half turns one XMM word of input into
; two XMM words of output.  NOTE(review): aligned (movdqa) accesses are used
; and output is written up to the rounded-up width, which presumes
; XMM-aligned, padded row buffers -- confirm against the allocator.
;
; Nothing is done if output_width or max_v_samp_factor is zero.
;
; GLOBAL(void)
; jsimd_h2v1_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
;

; r10d = int max_v_samp_factor
; r11d = JDIMENSION output_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr
;
; Register roles:
;   rdx = rounded-up output width (constant for all rows)
;   rax = colctr (output samples left in the current row)
;   rcx = rowctr
;   rsi = input_data cursor / inptr inside the row loop
;   rdi = output_data cursor / outptr inside the row loop

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_upsample_sse2)

EXTN(jsimd_h2v1_upsample_sse2):
    push        rbp
    mov         rax, rsp                ; presumably for collect_args (macro
                                        ; body is in jsimdext.inc)
    mov         rbp, rsp
    collect_args 4

    mov         edx, r11d
    add         rdx, byte (2*SIZEOF_XMMWORD)-1
    and         rdx, byte -(2*SIZEOF_XMMWORD)  ; round up; ZF set if width == 0
    jz          near .return

    ; max_v_samp_factor is a 32-bit int: sign-extend it explicitly instead of
    ; relying on whatever the caller left in the upper half of the register.
    movsxd      rcx, r10d               ; rowctr
    test        rcx, rcx
    jz          short .return

    mov         rsi, r12                ; input_data
    mov         rdi, r13
    mov         rdi, JSAMPARRAY [rdi]   ; output_data
.rowloop:
    push        rdi
    push        rsi

    mov         rsi, JSAMPROW [rsi]     ; inptr
    mov         rdi, JSAMPROW [rdi]     ; outptr
    mov         rax, rdx                ; colctr
.columnloop:

    movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]

    movdqa      xmm1, xmm0
    punpcklbw   xmm0, xmm0              ; duplicate samples 0..7
    punpckhbw   xmm1, xmm1              ; duplicate samples 8..15

    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
    movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1

    sub         rax, byte 2*SIZEOF_XMMWORD
    jz          short .nextrow

    ; Second (unrolled) half of the iteration.
    movdqa      xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]

    movdqa      xmm3, xmm2
    punpcklbw   xmm2, xmm2
    punpckhbw   xmm3, xmm3

    movdqa      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
    movdqa      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3

    sub         rax, byte 2*SIZEOF_XMMWORD
    jz          short .nextrow

    add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr
    add         rdi, byte 4*SIZEOF_XMMWORD  ; outptr
    jmp         short .columnloop

.nextrow:
    pop         rsi
    pop         rdi

    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
    dec         rcx                        ; rowctr
    jg          short .rowloop

.return:
    uncollect_args 4
    pop         rbp
    ret

; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
; It's still a box filter.
;
; Every input sample is written twice horizontally, and the resulting row is
; stored to two consecutive output rows.  The column count is output_width
; rounded up to a multiple of 2*SIZEOF_XMMWORD, and the column loop is
; unrolled by two.  rowctr counts OUTPUT rows (it is decremented by two per
; input row).  NOTE(review): aligned (movdqa) accesses are used and output is
; written up to the rounded-up width, which presumes XMM-aligned, padded row
; buffers -- confirm against the allocator.
;
; Nothing is done if output_width or max_v_samp_factor is zero.
;
; GLOBAL(void)
; jsimd_h2v2_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
;

; r10d = int max_v_samp_factor
; r11d = JDIMENSION output_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr
;
; Register roles:
;   rdx = rounded-up output width (constant for all rows)
;   rax = colctr (output samples left in the current row pair)
;   rcx = rowctr
;   rsi = input_data cursor / inptr inside the row loop
;   rbx = outptr0, rdi = output_data cursor / outptr1 inside the row loop

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_upsample_sse2)

EXTN(jsimd_h2v2_upsample_sse2):
    push        rbp
    mov         rax, rsp                ; presumably for collect_args (macro
                                        ; body is in jsimdext.inc)
    mov         rbp, rsp
    collect_args 4
    push        rbx                     ; rbx is callee-saved

    mov         edx, r11d
    add         rdx, byte (2*SIZEOF_XMMWORD)-1
    and         rdx, byte -(2*SIZEOF_XMMWORD)  ; round up; ZF set if width == 0
    jz          near .return

    ; max_v_samp_factor is a 32-bit int: sign-extend it explicitly instead of
    ; relying on whatever the caller left in the upper half of the register.
    movsxd      rcx, r10d               ; rowctr
    test        rcx, rcx
    jz          near .return

    mov         rsi, r12                ; input_data
    mov         rdi, r13
    mov         rdi, JSAMPARRAY [rdi]   ; output_data
.rowloop:
    push        rdi
    push        rsi

    mov         rsi, JSAMPROW [rsi]                    ; inptr
    mov         rbx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]  ; outptr0
    mov         rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]  ; outptr1
    mov         rax, rdx                               ; colctr
.columnloop:

    movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]

    movdqa      xmm1, xmm0
    punpcklbw   xmm0, xmm0              ; duplicate samples 0..7
    punpckhbw   xmm1, xmm1              ; duplicate samples 8..15

    movdqa      XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0
    movdqa      XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
    movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1

    sub         rax, byte 2*SIZEOF_XMMWORD
    jz          short .nextrow

    ; Second (unrolled) half of the iteration.
    movdqa      xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]

    movdqa      xmm3, xmm2
    punpcklbw   xmm2, xmm2
    punpckhbw   xmm3, xmm3

    movdqa      XMMWORD [rbx+2*SIZEOF_XMMWORD], xmm2
    movdqa      XMMWORD [rbx+3*SIZEOF_XMMWORD], xmm3
    movdqa      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
    movdqa      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3

    sub         rax, byte 2*SIZEOF_XMMWORD
    jz          short .nextrow

    add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr
    add         rbx, byte 4*SIZEOF_XMMWORD  ; outptr0
    add         rdi, byte 4*SIZEOF_XMMWORD  ; outptr1
    jmp         short .columnloop

.nextrow:
    pop         rsi
    pop         rdi

    add         rsi, byte 1*SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte 2*SIZEOF_JSAMPROW  ; output_data
    sub         rcx, byte 2                  ; rowctr
    jg          near .rowloop

.return:
    pop         rbx
    uncollect_args 4
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32
