• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1;
2; jdsample.asm - upsampling (SSE2)
3;
4; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5;
6; Based on
7; x86 SIMD extension for IJG JPEG library
8; Copyright (C) 1999-2006, MIYASAKA Masaru.
9; For conditions of distribution and use, see copyright notice in jsimdext.inc
10;
11; This file should be assembled with NASM (Netwide Assembler),
12; can *not* be assembled with Microsoft's MASM or any compatible
13; assembler (including Borland's Turbo Assembler).
14; NASM is available from http://nasm.sourceforge.net/ or
15; http://sourceforge.net/project/showfiles.php?group_id=6208
16;
17; [TAB8]
18
19%include "jsimdext.inc"
20
21; --------------------------------------------------------------------------
22        SECTION SEG_CONST
; Constant table for the fancy (triangle-filter) upsamplers in this file.
; Every PW_* entry is "times 8 dw N": eight 16-bit words, i.e. one 16-byte
; vector whose word lanes all hold N.  The code reads them as memory
; operands of pmullw/paddw via GOTOFF(ebx,PW_*).
23
24        alignz  16              ; NOTE(review): alignz is defined in jsimdext.inc (not visible here) - presumably 16-byte alignment with zero fill
25        global  EXTN(jconst_fancy_upsample_sse2)
26
27EXTN(jconst_fancy_upsample_sse2):
28
29PW_ONE          times 8 dw  1   ; h2v1 fancy: added to the left-neighbour term before the >>2
30PW_TWO          times 8 dw  2   ; h2v1 fancy: added to the right-neighbour term before the >>2
31PW_THREE        times 8 dw  3   ; weight applied (pmullw) to the nearer sample / nearer row
32PW_SEVEN        times 8 dw  7   ; h2v2 fancy: added to the right-neighbour term before the >>4
33PW_EIGHT        times 8 dw  8   ; h2v2 fancy: added to the left-neighbour term before the >>4
34
35        alignz  16
36
37; --------------------------------------------------------------------------
38        SECTION SEG_TEXT
39        BITS    32              ; the routines below are 32-bit code
40;
41; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
42;
43; The upsampling algorithm is linear interpolation between pixel centers,
44; also known as a "triangle filter".  This is a good compromise between
45; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
46; of the way between input pixel centers.
47;
48; GLOBAL(void)
49; jsimd_h2v1_fancy_upsample_sse2 (int max_v_samp_factor,
50;                                 JDIMENSION downsampled_width,
51;                                 JSAMPARRAY input_data,
52;                                 JSAMPARRAY * output_data_ptr);
53;
54
; Stack-argument offsets relative to the frame pointer established by
; "push ebp / mov ebp,esp" (saved ebp at +0, return address at +4).
55%define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
56%define downsamp_width(b)       (b)+12          ; JDIMENSION downsampled_width
57%define input_data(b)           (b)+16          ; JSAMPARRAY input_data
58%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY * output_data_ptr
59
; Register roles:
;   eax = colctr  (input samples left in the row; rounded up to a multiple
;                  of SIZEOF_XMMWORD before the column loop)
;   ecx = rowctr  (rows left, starts at max_v_samp_factor)
;   esi = input_data row-pointer cursor  / inptr inside a row
;   edi = output_data row-pointer cursor / outptr inside a row
;   ebx = base register for GOTOFF() constant access (loaded by get_GOT)
;   dl  = scratch for the dummy-sample copy
;   xmm0 = all zeros (for byte->word unpacking)
;   xmm7 = carry-in: sample to the left of the current block, in byte 0
;   xmm6 = carry-in: sample to the right of the current block, in byte 15
;
; Per input sample s[i] the code produces two output samples:
;   out[2i]   = (3*s[i] + s[i-1] + 1) >> 2
;   out[2i+1] = (3*s[i] + s[i+1] + 2) >> 2
; At the row edges the missing neighbour is replaced by the edge sample.
;
; All row accesses use movdqa, so the row buffers must be 16-byte aligned,
; and whole XMMWORD blocks are read/written even for a partial last block.
60        align   16
61        global  EXTN(jsimd_h2v1_fancy_upsample_sse2)
62
63EXTN(jsimd_h2v1_fancy_upsample_sse2):
64        push    ebp
65        mov     ebp,esp
66        pushpic ebx             ; NOTE(review): pushpic/poppic come from jsimdext.inc - presumably push/pop only in PIC builds
67;       push    ecx             ; need not be preserved
68;       push    edx             ; need not be preserved
69        push    esi
70        push    edi
71
72        get_GOT ebx             ; get GOT address
73
        ; Nothing to do when either the width or the row count is zero.
74        mov     eax, JDIMENSION [downsamp_width(ebp)]  ; colctr
75        test    eax,eax
76        jz      near .return
77
78        mov     ecx, INT [max_v_samp(ebp)]      ; rowctr
79        test    ecx,ecx
80        jz      near .return
81
82        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
83        mov     edi, POINTER [output_data_ptr(ebp)]
84        mov     edi, JSAMPARRAY [edi]                   ; output_data
85        alignx  16,7
86.rowloop:
        ; Save the per-row state; it is restored after the column loop.
87        push    eax                     ; colctr
88        push    edi
89        push    esi
90
91        mov     esi, JSAMPROW [esi]     ; inptr
92        mov     edi, JSAMPROW [edi]     ; outptr
93
        ; If the width is not a multiple of SIZEOF_XMMWORD, duplicate the last
        ; real sample into the slot right after it, so that the last real
        ; sample sees itself as its right neighbour.  This writes one sample
        ; past the row's nominal width (the input row must have room for it).
94        test    eax, SIZEOF_XMMWORD-1
95        jz      short .skip
96        mov     dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
97        mov     JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
98.skip:
99        pxor    xmm0,xmm0               ; xmm0=(all 0's)
        ; Left edge: xmm7 = byte-0 mask & first block, i.e. sample 0 alone in
        ; byte 0; it acts as the "sample -1" neighbour of the first block.
100        pcmpeqb xmm7,xmm7
101        psrldq  xmm7,(SIZEOF_XMMWORD-1)
102        pand    xmm7, XMMWORD [esi+0*SIZEOF_XMMWORD]
103
        ; Round colctr up to a whole number of XMMWORD blocks; more than one
        ; block left means the next block supplies the right neighbour.
104        add     eax, byte SIZEOF_XMMWORD-1
105        and     eax, byte -SIZEOF_XMMWORD
106        cmp     eax, byte SIZEOF_XMMWORD
107        ja      short .columnloop
108        alignx  16,7
109
110.columnloop_last:
        ; Last block of the row: the right neighbour of byte 15 is byte 15
        ; itself (xmm6 = byte-15 mask & current block).
111        pcmpeqb xmm6,xmm6
112        pslldq  xmm6,(SIZEOF_XMMWORD-1)
113        pand    xmm6, XMMWORD [esi+0*SIZEOF_XMMWORD]
114        jmp     short .upsample
115        alignx  16,7
116
117.columnloop:
        ; Not the last block: take sample 0 of the next block and move it to
        ; byte 15, where it serves as the right neighbour "16".
118        movdqa  xmm6, XMMWORD [esi+1*SIZEOF_XMMWORD]
119        pslldq  xmm6,(SIZEOF_XMMWORD-1)
120
121.upsample:
        ; Build the left-neighbour (xmm2) and right-neighbour (xmm3) vectors
        ; by shifting the current block one byte and OR-ing in the carries.
122        movdqa  xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
123        movdqa  xmm2,xmm1
124        movdqa  xmm3,xmm1               ; xmm1=( 0  1  2 ... 13 14 15)
125        pslldq  xmm2,1                  ; xmm2=(--  0  1 ... 12 13 14)
126        psrldq  xmm3,1                  ; xmm3=( 1  2  3 ... 14 15 --)
127
128        por     xmm2,xmm7               ; xmm2=(-1  0  1 ... 12 13 14)
129        por     xmm3,xmm6               ; xmm3=( 1  2  3 ... 14 15 16)
130
        ; Carry sample 15 of this block into byte 0 of xmm7: it is the left
        ; neighbour of the next block.
131        movdqa  xmm7,xmm1
132        psrldq  xmm7,(SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... -- -- --)
133
        ; Zero-extend bytes to 16-bit words (low and high halves separately).
134        movdqa    xmm4,xmm1
135        punpcklbw xmm1,xmm0             ; xmm1=( 0  1  2  3  4  5  6  7)
136        punpckhbw xmm4,xmm0             ; xmm4=( 8  9 10 11 12 13 14 15)
137        movdqa    xmm5,xmm2
138        punpcklbw xmm2,xmm0             ; xmm2=(-1  0  1  2  3  4  5  6)
139        punpckhbw xmm5,xmm0             ; xmm5=( 7  8  9 10 11 12 13 14)
140        movdqa    xmm6,xmm3
141        punpcklbw xmm3,xmm0             ; xmm3=( 1  2  3  4  5  6  7  8)
142        punpckhbw xmm6,xmm0             ; xmm6=( 9 10 11 12 13 14 15 16)
143
        ; xmm1/xmm4 = 3*current; left neighbours get +1, right neighbours +2.
144        pmullw  xmm1,[GOTOFF(ebx,PW_THREE)]
145        pmullw  xmm4,[GOTOFF(ebx,PW_THREE)]
146        paddw   xmm2,[GOTOFF(ebx,PW_ONE)]
147        paddw   xmm5,[GOTOFF(ebx,PW_ONE)]
148        paddw   xmm3,[GOTOFF(ebx,PW_TWO)]
149        paddw   xmm6,[GOTOFF(ebx,PW_TWO)]
150
        ; Even outputs = (3*cur + left + 1) >> 2, odd = (3*cur + right + 2) >> 2.
151        paddw   xmm2,xmm1
152        paddw   xmm5,xmm4
153        psrlw   xmm2,2                  ; xmm2=OutLE=( 0  2  4  6  8 10 12 14)
154        psrlw   xmm5,2                  ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
155        paddw   xmm3,xmm1
156        paddw   xmm6,xmm4
157        psrlw   xmm3,2                  ; xmm3=OutLO=( 1  3  5  7  9 11 13 15)
158        psrlw   xmm6,2                  ; xmm6=OutHO=(17 19 21 23 25 27 29 31)
159
        ; Interleave: move each odd result into the high byte of its word and
        ; OR it with the even result in the low byte.
160        psllw   xmm3,BYTE_BIT
161        psllw   xmm6,BYTE_BIT
162        por     xmm2,xmm3               ; xmm2=OutL=( 0  1  2 ... 13 14 15)
163        por     xmm5,xmm6               ; xmm5=OutH=(16 17 18 ... 29 30 31)
164
165        movdqa  XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
166        movdqa  XMMWORD [edi+1*SIZEOF_XMMWORD], xmm5
167
        ; One input block consumed, two output blocks produced.
168        sub     eax, byte SIZEOF_XMMWORD
169        add     esi, byte 1*SIZEOF_XMMWORD      ; inptr
170        add     edi, byte 2*SIZEOF_XMMWORD      ; outptr
171        cmp     eax, byte SIZEOF_XMMWORD
172        ja      near .columnloop
173        test    eax,eax
174        jnz     near .columnloop_last
175
        ; Row finished: restore the row-pointer cursors and the original width.
176        pop     esi
177        pop     edi
178        pop     eax
179
180        add     esi, byte SIZEOF_JSAMPROW       ; input_data
181        add     edi, byte SIZEOF_JSAMPROW       ; output_data
182        dec     ecx                             ; rowctr
183        jg      near .rowloop
184
185.return:
186        pop     edi
187        pop     esi
188;       pop     edx             ; need not be preserved
189;       pop     ecx             ; need not be preserved
190        poppic  ebx
191        pop     ebp
192        ret
193
194; --------------------------------------------------------------------------
195;
196; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
197; Again a triangle filter; see comments for h2v1 case, above.
198;
199; GLOBAL(void)
200; jsimd_h2v2_fancy_upsample_sse2 (int max_v_samp_factor,
201;                                 JDIMENSION downsampled_width,
202;                                 JSAMPARRAY input_data,
203;                                 JSAMPARRAY * output_data_ptr);
204;
205
; Stack-argument offsets.  In this routine they are applied to edx, which
; holds the value of esp taken right after "push ebp" (saved ebp at +0,
; return address at +4), because ebp itself is re-pointed to an aligned frame.
206%define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
207%define downsamp_width(b)       (b)+12          ; JDIMENSION downsampled_width
208%define input_data(b)           (b)+16          ; JSAMPARRAY input_data
209%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY * output_data_ptr
210
; Local frame: ebp is 16-byte aligned; [ebp+0] holds the unaligned frame
; address, wk(0)..wk(WK_NUM-1) are XMMWORD slots directly below ebp, and
; gotptr is the pointer-sized slot directly below wk(0).
;   wk(0) / wk(1) = left-neighbour carry  for the upper / lower output row
;   wk(2) / wk(3) = right-neighbour carry for the upper / lower output row
211%define original_ebp    ebp+0
212%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
213%define WK_NUM          4
214%define gotptr          wk(0)-SIZEOF_POINTER    ; void * gotptr
215
; Register roles inside the row loop:
;   eax = colctr (input samples left; rounded up to a multiple of
;                 SIZEOF_XMMWORD after the first column block)
;   ecx = rowctr outside the column loop / inptr1(above) inside it
;   ebx = inptr0; temporarily replaced by the GOT base (pushpic/movpic)
;         wherever GOTOFF() constants are needed
;   esi = input_data cursor  / inptr1(below)
;   edi = output_data cursor / outptr1
;   edx = unaligned frame (argument access) / outptr0
;
; Each input row produces two output rows.  First the vertical blend is
; formed as 16-bit words:
;   Int0 = 3*row[0] + row[-1]      (upper output row)
;   Int1 = 3*row[0] + row[+1]      (lower output row)
; then, horizontally, for each intermediate I[i]:
;   out[2i]   = (3*I[i] + I[i-1] + 8) >> 4
;   out[2i+1] = (3*I[i] + I[i+1] + 7) >> 4
; The 16-bit intermediates are parked in the output rows themselves and are
; overwritten by the final bytes one block later.
;
; All row accesses use movdqa, so the row buffers must be 16-byte aligned.
216        align   16
217        global  EXTN(jsimd_h2v2_fancy_upsample_sse2)
218
219EXTN(jsimd_h2v2_fancy_upsample_sse2):
        ; Prologue: build a 16-byte aligned frame in ebp while remembering the
        ; unaligned frame address in eax and in [ebp+0].
220        push    ebp
221        mov     eax,esp                         ; eax = original ebp
222        sub     esp, byte 4
223        and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
224        mov     [esp],eax
225        mov     ebp,esp                         ; ebp = aligned ebp
226        lea     esp, [wk(0)]
227        pushpic eax             ; make a room for GOT address
228        push    ebx
229;       push    ecx             ; need not be preserved
230;       push    edx             ; need not be preserved
231        push    esi
232        push    edi
233
234        get_GOT ebx                     ; get GOT address
235        movpic  POINTER [gotptr], ebx   ; save GOT address
236
        ; Nothing to do when either the width or the row count is zero.
237        mov     edx,eax                         ; edx = original ebp
238        mov     eax, JDIMENSION [downsamp_width(edx)]  ; colctr
239        test    eax,eax
240        jz      near .return
241
242        mov     ecx, INT [max_v_samp(edx)]      ; rowctr
243        test    ecx,ecx
244        jz      near .return
245
246        mov     esi, JSAMPARRAY [input_data(edx)]       ; input_data
247        mov     edi, POINTER [output_data_ptr(edx)]
248        mov     edi, JSAMPARRAY [edi]                   ; output_data
249        alignx  16,7
250.rowloop:
        ; Save the per-row state; ecx, esi and edi are reused as data pointers.
251        push    eax                                     ; colctr
252        push    ecx
253        push    edi
254        push    esi
255
        ; Note: the row-pointer entries just before and just after the current
        ; input_data entry are read, so both must be valid.
256        mov     ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]   ; inptr1(above)
257        mov     ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; inptr0
258        mov     esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; inptr1(below)
259        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]   ; outptr0
260        mov     edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]   ; outptr1
261
        ; If the width is not a multiple of SIZEOF_XMMWORD, duplicate the last
        ; real sample of all three input rows into the slot right after it.
        ; edx is saved because dl is used as the scratch byte.
262        test    eax, SIZEOF_XMMWORD-1
263        jz      short .skip
264        push    edx
265        mov     dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
266        mov     JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
267        mov     dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
268        mov     JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
269        mov     dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
270        mov     JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
271        pop     edx
272.skip:
273        ; -- process the first column block
274
275        movdqa  xmm0, XMMWORD [ebx+0*SIZEOF_XMMWORD]    ; xmm0=row[ 0][0]
276        movdqa  xmm1, XMMWORD [ecx+0*SIZEOF_XMMWORD]    ; xmm1=row[-1][0]
277        movdqa  xmm2, XMMWORD [esi+0*SIZEOF_XMMWORD]    ; xmm2=row[+1][0]
278
        ; ebx (inptr0) is set aside while the GOT base is needed in ebx.
279        pushpic ebx
280        movpic  ebx, POINTER [gotptr]   ; load GOT address
281
282        pxor      xmm3,xmm3             ; xmm3=(all 0's)
283        movdqa    xmm4,xmm0
284        punpcklbw xmm0,xmm3             ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
285        punpckhbw xmm4,xmm3             ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
286        movdqa    xmm5,xmm1
287        punpcklbw xmm1,xmm3             ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
288        punpckhbw xmm5,xmm3             ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
289        movdqa    xmm6,xmm2
290        punpcklbw xmm2,xmm3             ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
291        punpckhbw xmm6,xmm3             ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
292
293        pmullw  xmm0,[GOTOFF(ebx,PW_THREE)]
294        pmullw  xmm4,[GOTOFF(ebx,PW_THREE)]
295
        ; xmm7 = mask selecting only the lowest 16-bit word.
296        pcmpeqb xmm7,xmm7
297        psrldq  xmm7,(SIZEOF_XMMWORD-2)
298
299        paddw   xmm1,xmm0               ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
300        paddw   xmm5,xmm4               ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
301        paddw   xmm2,xmm0               ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
302        paddw   xmm6,xmm4               ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
303
304        movdqa  XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1    ; temporarily save
305        movdqa  XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5    ; the intermediate data
306        movdqa  XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
307        movdqa  XMMWORD [edi+1*SIZEOF_XMMWORD], xmm6
308
        ; Left edge: intermediate 0 is its own left neighbour.
309        pand    xmm1,xmm7               ; xmm1=( 0 -- -- -- -- -- -- --)
310        pand    xmm2,xmm7               ; xmm2=( 0 -- -- -- -- -- -- --)
311
312        movdqa  XMMWORD [wk(0)], xmm1
313        movdqa  XMMWORD [wk(1)], xmm2
314
315        poppic  ebx
316
        ; Round colctr up to a whole number of XMMWORD blocks; more than one
        ; block left means the next block supplies the right neighbour.
317        add     eax, byte SIZEOF_XMMWORD-1
318        and     eax, byte -SIZEOF_XMMWORD
319        cmp     eax, byte SIZEOF_XMMWORD
320        ja      short .columnloop
321        alignx  16,7
322
323.columnloop_last:
324        ; -- process the last column block
325
326        pushpic ebx
327        movpic  ebx, POINTER [gotptr]   ; load GOT address
328
        ; Right edge: intermediate 15 (top word of the high half parked in the
        ; output rows) is its own right neighbour.
329        pcmpeqb xmm1,xmm1
330        pslldq  xmm1,(SIZEOF_XMMWORD-2)
331        movdqa  xmm2,xmm1
332
333        pand    xmm1, XMMWORD [edx+1*SIZEOF_XMMWORD]
334        pand    xmm2, XMMWORD [edi+1*SIZEOF_XMMWORD]
335
336        movdqa  XMMWORD [wk(2)], xmm1   ; xmm1=(-- -- -- -- -- -- -- 15)
337        movdqa  XMMWORD [wk(3)], xmm2   ; xmm2=(-- -- -- -- -- -- -- 15)
338
339        jmp     near .upsample
340        alignx  16,7
341
342.columnloop:
343        ; -- process the next column block
344
        ; Compute the vertical blend of the block after the current one and
        ; park it in the output rows two XMMWORDs ahead; its word 0 becomes
        ; the right neighbour of the current block.
345        movdqa  xmm0, XMMWORD [ebx+1*SIZEOF_XMMWORD]    ; xmm0=row[ 0][1]
346        movdqa  xmm1, XMMWORD [ecx+1*SIZEOF_XMMWORD]    ; xmm1=row[-1][1]
347        movdqa  xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]    ; xmm2=row[+1][1]
348
349        pushpic ebx
350        movpic  ebx, POINTER [gotptr]   ; load GOT address
351
352        pxor      xmm3,xmm3             ; xmm3=(all 0's)
353        movdqa    xmm4,xmm0
354        punpcklbw xmm0,xmm3             ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
355        punpckhbw xmm4,xmm3             ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
356        movdqa    xmm5,xmm1
357        punpcklbw xmm1,xmm3             ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
358        punpckhbw xmm5,xmm3             ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
359        movdqa    xmm6,xmm2
360        punpcklbw xmm2,xmm3             ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
361        punpckhbw xmm6,xmm3             ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
362
363        pmullw  xmm0,[GOTOFF(ebx,PW_THREE)]
364        pmullw  xmm4,[GOTOFF(ebx,PW_THREE)]
365
366        paddw   xmm1,xmm0               ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
367        paddw   xmm5,xmm4               ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
368        paddw   xmm2,xmm0               ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
369        paddw   xmm6,xmm4               ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
370
371        movdqa  XMMWORD [edx+2*SIZEOF_XMMWORD], xmm1    ; temporarily save
372        movdqa  XMMWORD [edx+3*SIZEOF_XMMWORD], xmm5    ; the intermediate data
373        movdqa  XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
374        movdqa  XMMWORD [edi+3*SIZEOF_XMMWORD], xmm6
375
376        pslldq  xmm1,(SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- --  0)
377        pslldq  xmm2,(SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- --  0)
378
379        movdqa  XMMWORD [wk(2)], xmm1
380        movdqa  XMMWORD [wk(3)], xmm2
381
382.upsample:
383        ; -- process the upper row
384
        ; Reload the parked intermediates of the current block and build the
        ; left/right neighbour vectors for both halves by word shifts.
385        movdqa  xmm7, XMMWORD [edx+0*SIZEOF_XMMWORD]
386        movdqa  xmm3, XMMWORD [edx+1*SIZEOF_XMMWORD]
387
388        movdqa  xmm0,xmm7               ; xmm7=Int0L=( 0  1  2  3  4  5  6  7)
389        movdqa  xmm4,xmm3               ; xmm3=Int0H=( 8  9 10 11 12 13 14 15)
390        psrldq  xmm0,2                  ; xmm0=( 1  2  3  4  5  6  7 --)
391        pslldq  xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- --  8)
392        movdqa  xmm5,xmm7
393        movdqa  xmm6,xmm3
394        psrldq  xmm5,(SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --)
395        pslldq  xmm6,2                  ; xmm6=(--  8  9 10 11 12 13 14)
396
397        por     xmm0,xmm4               ; xmm0=( 1  2  3  4  5  6  7  8)
398        por     xmm5,xmm6               ; xmm5=( 7  8  9 10 11 12 13 14)
399
400        movdqa  xmm1,xmm7
401        movdqa  xmm2,xmm3
402        pslldq  xmm1,2                  ; xmm1=(--  0  1  2  3  4  5  6)
403        psrldq  xmm2,2                  ; xmm2=( 9 10 11 12 13 14 15 --)
404        movdqa  xmm4,xmm3
405        psrldq  xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --)
406
407        por     xmm1, XMMWORD [wk(0)]   ; xmm1=(-1  0  1  2  3  4  5  6)
408        por     xmm2, XMMWORD [wk(2)]   ; xmm2=( 9 10 11 12 13 14 15 16)
409
        ; Intermediate 15 becomes the left neighbour of the next block.
410        movdqa  XMMWORD [wk(0)], xmm4
411
        ; 3*current; left neighbours get +8, right neighbours +7.
412        pmullw  xmm7,[GOTOFF(ebx,PW_THREE)]
413        pmullw  xmm3,[GOTOFF(ebx,PW_THREE)]
414        paddw   xmm1,[GOTOFF(ebx,PW_EIGHT)]
415        paddw   xmm5,[GOTOFF(ebx,PW_EIGHT)]
416        paddw   xmm0,[GOTOFF(ebx,PW_SEVEN)]
417        paddw   xmm2,[GOTOFF(ebx,PW_SEVEN)]
418
419        paddw   xmm1,xmm7
420        paddw   xmm5,xmm3
421        psrlw   xmm1,4                  ; xmm1=Out0LE=( 0  2  4  6  8 10 12 14)
422        psrlw   xmm5,4                  ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
423        paddw   xmm0,xmm7
424        paddw   xmm2,xmm3
425        psrlw   xmm0,4                  ; xmm0=Out0LO=( 1  3  5  7  9 11 13 15)
426        psrlw   xmm2,4                  ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
427
        ; Interleave even (low byte) and odd (high byte) results.
428        psllw   xmm0,BYTE_BIT
429        psllw   xmm2,BYTE_BIT
430        por     xmm1,xmm0               ; xmm1=Out0L=( 0  1  2 ... 13 14 15)
431        por     xmm5,xmm2               ; xmm5=Out0H=(16 17 18 ... 29 30 31)
432
        ; Final bytes replace the parked intermediates of this block.
433        movdqa  XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1
434        movdqa  XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5
435
436        ; -- process the lower row
437
        ; Same steps as for the upper row, using Int1 and wk(1)/wk(3).
438        movdqa  xmm6, XMMWORD [edi+0*SIZEOF_XMMWORD]
439        movdqa  xmm4, XMMWORD [edi+1*SIZEOF_XMMWORD]
440
441        movdqa  xmm7,xmm6               ; xmm6=Int1L=( 0  1  2  3  4  5  6  7)
442        movdqa  xmm3,xmm4               ; xmm4=Int1H=( 8  9 10 11 12 13 14 15)
443        psrldq  xmm7,2                  ; xmm7=( 1  2  3  4  5  6  7 --)
444        pslldq  xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- --  8)
445        movdqa  xmm0,xmm6
446        movdqa  xmm2,xmm4
447        psrldq  xmm0,(SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --)
448        pslldq  xmm2,2                  ; xmm2=(--  8  9 10 11 12 13 14)
449
450        por     xmm7,xmm3               ; xmm7=( 1  2  3  4  5  6  7  8)
451        por     xmm0,xmm2               ; xmm0=( 7  8  9 10 11 12 13 14)
452
453        movdqa  xmm1,xmm6
454        movdqa  xmm5,xmm4
455        pslldq  xmm1,2                  ; xmm1=(--  0  1  2  3  4  5  6)
456        psrldq  xmm5,2                  ; xmm5=( 9 10 11 12 13 14 15 --)
457        movdqa  xmm3,xmm4
458        psrldq  xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --)
459
460        por     xmm1, XMMWORD [wk(1)]   ; xmm1=(-1  0  1  2  3  4  5  6)
461        por     xmm5, XMMWORD [wk(3)]   ; xmm5=( 9 10 11 12 13 14 15 16)
462
463        movdqa  XMMWORD [wk(1)], xmm3
464
465        pmullw  xmm6,[GOTOFF(ebx,PW_THREE)]
466        pmullw  xmm4,[GOTOFF(ebx,PW_THREE)]
467        paddw   xmm1,[GOTOFF(ebx,PW_EIGHT)]
468        paddw   xmm0,[GOTOFF(ebx,PW_EIGHT)]
469        paddw   xmm7,[GOTOFF(ebx,PW_SEVEN)]
470        paddw   xmm5,[GOTOFF(ebx,PW_SEVEN)]
471
472        paddw   xmm1,xmm6
473        paddw   xmm0,xmm4
474        psrlw   xmm1,4                  ; xmm1=Out1LE=( 0  2  4  6  8 10 12 14)
475        psrlw   xmm0,4                  ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
476        paddw   xmm7,xmm6
477        paddw   xmm5,xmm4
478        psrlw   xmm7,4                  ; xmm7=Out1LO=( 1  3  5  7  9 11 13 15)
479        psrlw   xmm5,4                  ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
480
481        psllw   xmm7,BYTE_BIT
482        psllw   xmm5,BYTE_BIT
483        por     xmm1,xmm7               ; xmm1=Out1L=( 0  1  2 ... 13 14 15)
484        por     xmm0,xmm5               ; xmm0=Out1H=(16 17 18 ... 29 30 31)
485
486        movdqa  XMMWORD [edi+0*SIZEOF_XMMWORD], xmm1
487        movdqa  XMMWORD [edi+1*SIZEOF_XMMWORD], xmm0
488
        ; Constants no longer needed: bring inptr0 back into ebx.
489        poppic  ebx
490
        ; One input block consumed per row, two output blocks produced per row.
491        sub     eax, byte SIZEOF_XMMWORD
492        add     ecx, byte 1*SIZEOF_XMMWORD      ; inptr1(above)
493        add     ebx, byte 1*SIZEOF_XMMWORD      ; inptr0
494        add     esi, byte 1*SIZEOF_XMMWORD      ; inptr1(below)
495        add     edx, byte 2*SIZEOF_XMMWORD      ; outptr0
496        add     edi, byte 2*SIZEOF_XMMWORD      ; outptr1
497        cmp     eax, byte SIZEOF_XMMWORD
498        ja      near .columnloop
499        test    eax,eax
500        jnz     near .columnloop_last
501
        ; Row finished: restore the cursors, rowctr and the original width.
502        pop     esi
503        pop     edi
504        pop     ecx
505        pop     eax
506
        ; One input row yields two output rows, so rowctr drops by 2.
507        add     esi, byte 1*SIZEOF_JSAMPROW     ; input_data
508        add     edi, byte 2*SIZEOF_JSAMPROW     ; output_data
509        sub     ecx, byte 2                     ; rowctr
510        jg      near .rowloop
511
512.return:
        ; Epilogue: restore callee-saved registers, then unwind the aligned
        ; frame via the address stored at [ebp+0].  The gotptr slot and the
        ; wk area are discarded by "mov esp,ebp".
513        pop     edi
514        pop     esi
515;       pop     edx             ; need not be preserved
516;       pop     ecx             ; need not be preserved
517        pop     ebx
518        mov     esp,ebp         ; esp <- aligned ebp
519        pop     esp             ; esp <- original ebp
520        pop     ebp
521        ret
522
523; --------------------------------------------------------------------------
524;
525; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
526; It's still a box filter.
527;
528; GLOBAL(void)
529; jsimd_h2v1_upsample_sse2 (int max_v_samp_factor,
530;                           JDIMENSION output_width,
531;                           JSAMPARRAY input_data,
532;                           JSAMPARRAY * output_data_ptr);
533;
534
; Stack-argument offsets relative to the frame pointer established by
; "push ebp / mov ebp,esp" (saved ebp at +0, return address at +4).
535%define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
536%define output_width(b)         (b)+12          ; JDIMENSION output_width
537%define input_data(b)           (b)+16          ; JSAMPARRAY input_data
538%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY * output_data_ptr
539
; Register roles:
;   edx = output_width rounded up to a multiple of 2*SIZEOF_XMMWORD
;   eax = colctr (output samples left in the current row, copy of edx)
;   ecx = rowctr (rows left, starts at max_v_samp_factor)
;   esi = input_data row-pointer cursor  / inptr inside a row
;   edi = output_data row-pointer cursor / outptr inside a row
;
; Every input sample is written twice to the output row (pixel doubling).
; The column loop is unrolled by two: each step reads one XMMWORD of input
; and writes two XMMWORDs of output.  Because of the rounding of edx, whole
; blocks are read and written even past output_width, and movdqa requires
; the row buffers to be 16-byte aligned.
540        align   16
541        global  EXTN(jsimd_h2v1_upsample_sse2)
542
543EXTN(jsimd_h2v1_upsample_sse2):
544        push    ebp
545        mov     ebp,esp
546;       push    ebx             ; unused
547;       push    ecx             ; need not be preserved
548;       push    edx             ; need not be preserved
549        push    esi
550        push    edi
551
        ; Round the output width up to a whole pair of XMMWORDs; the "and"
        ; sets ZF, so a zero width returns immediately.
552        mov     edx, JDIMENSION [output_width(ebp)]
553        add     edx, byte (2*SIZEOF_XMMWORD)-1
554        and     edx, byte -(2*SIZEOF_XMMWORD)
555        jz      short .return
556
557        mov     ecx, INT [max_v_samp(ebp)]      ; rowctr
558        test    ecx,ecx
559        jz      short .return
560
561        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
562        mov     edi, POINTER [output_data_ptr(ebp)]
563        mov     edi, JSAMPARRAY [edi]                   ; output_data
564        alignx  16,7
565.rowloop:
        ; Save the row-pointer cursors; esi/edi become sample pointers.
566        push    edi
567        push    esi
568
569        mov     esi, JSAMPROW [esi]             ; inptr
570        mov     edi, JSAMPROW [edi]             ; outptr
571        mov     eax,edx                         ; colctr
572        alignx  16,7
573.columnloop:
574
        ; First unrolled step: 16 input samples -> 32 output samples.
575        movdqa  xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
576
        ; Unpacking a register with itself duplicates every byte.
577        movdqa    xmm1,xmm0
578        punpcklbw xmm0,xmm0
579        punpckhbw xmm1,xmm1
580
581        movdqa  XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
582        movdqa  XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
583
584        sub     eax, byte 2*SIZEOF_XMMWORD
585        jz      short .nextrow
586
        ; Second unrolled step: the next 16 input samples.
587        movdqa  xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]
588
589        movdqa    xmm3,xmm2
590        punpcklbw xmm2,xmm2
591        punpckhbw xmm3,xmm3
592
593        movdqa  XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
594        movdqa  XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
595
596        sub     eax, byte 2*SIZEOF_XMMWORD
597        jz      short .nextrow
598
        ; Both steps done: advance past two input and four output XMMWORDs.
599        add     esi, byte 2*SIZEOF_XMMWORD      ; inptr
600        add     edi, byte 4*SIZEOF_XMMWORD      ; outptr
601        jmp     short .columnloop
602        alignx  16,7
603
604.nextrow:
605        pop     esi
606        pop     edi
607
608        add     esi, byte SIZEOF_JSAMPROW       ; input_data
609        add     edi, byte SIZEOF_JSAMPROW       ; output_data
610        dec     ecx                             ; rowctr
611        jg      short .rowloop
612
613.return:
614        pop     edi
615        pop     esi
616;       pop     edx             ; need not be preserved
617;       pop     ecx             ; need not be preserved
618;       pop     ebx             ; unused
619        pop     ebp
620        ret
621
622; --------------------------------------------------------------------------
623;
624; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
625; It's still a box filter.
626;
627; GLOBAL(void)
628; jsimd_h2v2_upsample_sse2 (int max_v_samp_factor,
629;                           JDIMENSION output_width,
630;                           JSAMPARRAY input_data,
631;                           JSAMPARRAY * output_data_ptr);
632;
633
; Stack-argument offsets relative to the frame pointer established by
; "push ebp / mov ebp,esp" (saved ebp at +0, return address at +4).
634%define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
635%define output_width(b)         (b)+12          ; JDIMENSION output_width
636%define input_data(b)           (b)+16          ; JSAMPARRAY input_data
637%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY * output_data_ptr
638
; Register roles:
;   edx = output_width rounded up to a multiple of 2*SIZEOF_XMMWORD
;   eax = colctr (output samples left in the current row, copy of edx)
;   ecx = rowctr (output rows left, starts at max_v_samp_factor, -2 per pass)
;   esi = input_data row-pointer cursor  / inptr inside a row
;   edi = output_data row-pointer cursor / outptr1 inside a row
;   ebx = outptr0 (callee-saved, hence pushed/popped here)
;
; Every input sample is written twice horizontally, and the same doubled
; data is stored to two consecutive output rows.  The column loop is
; unrolled by two.  Because of the rounding of edx, whole blocks are read
; and written even past output_width, and movdqa requires the row buffers
; to be 16-byte aligned.
639        align   16
640        global  EXTN(jsimd_h2v2_upsample_sse2)
641
642EXTN(jsimd_h2v2_upsample_sse2):
643        push    ebp
644        mov     ebp,esp
645        push    ebx
646;       push    ecx             ; need not be preserved
647;       push    edx             ; need not be preserved
648        push    esi
649        push    edi
650
        ; Round the output width up to a whole pair of XMMWORDs; the "and"
        ; sets ZF, so a zero width returns immediately.
651        mov     edx, JDIMENSION [output_width(ebp)]
652        add     edx, byte (2*SIZEOF_XMMWORD)-1
653        and     edx, byte -(2*SIZEOF_XMMWORD)
654        jz      near .return
655
656        mov     ecx, INT [max_v_samp(ebp)]      ; rowctr
657        test    ecx,ecx
658        jz      near .return
659
660        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
661        mov     edi, POINTER [output_data_ptr(ebp)]
662        mov     edi, JSAMPARRAY [edi]                   ; output_data
663        alignx  16,7
664.rowloop:
        ; Save the row-pointer cursors; esi/edi become sample pointers.
665        push    edi
666        push    esi
667
668        mov     esi, JSAMPROW [esi]                     ; inptr
669        mov     ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]   ; outptr0
670        mov     edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]   ; outptr1
671        mov     eax,edx                                 ; colctr
672        alignx  16,7
673.columnloop:
674
        ; First unrolled step: 16 input samples -> 32 output samples per row.
675        movdqa  xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
676
        ; Unpacking a register with itself duplicates every byte.
677        movdqa    xmm1,xmm0
678        punpcklbw xmm0,xmm0
679        punpckhbw xmm1,xmm1
680
        ; The same doubled data goes to both output rows.
681        movdqa  XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0
682        movdqa  XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1
683        movdqa  XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
684        movdqa  XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
685
686        sub     eax, byte 2*SIZEOF_XMMWORD
687        jz      short .nextrow
688
        ; Second unrolled step: the next 16 input samples.
689        movdqa  xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]
690
691        movdqa    xmm3,xmm2
692        punpcklbw xmm2,xmm2
693        punpckhbw xmm3,xmm3
694
695        movdqa  XMMWORD [ebx+2*SIZEOF_XMMWORD], xmm2
696        movdqa  XMMWORD [ebx+3*SIZEOF_XMMWORD], xmm3
697        movdqa  XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
698        movdqa  XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
699
700        sub     eax, byte 2*SIZEOF_XMMWORD
701        jz      short .nextrow
702
        ; Both steps done: advance past two input and four output XMMWORDs.
703        add     esi, byte 2*SIZEOF_XMMWORD      ; inptr
704        add     ebx, byte 4*SIZEOF_XMMWORD      ; outptr0
705        add     edi, byte 4*SIZEOF_XMMWORD      ; outptr1
706        jmp     short .columnloop
707        alignx  16,7
708
709.nextrow:
710        pop     esi
711        pop     edi
712
        ; One input row yields two output rows, so rowctr drops by 2.
713        add     esi, byte 1*SIZEOF_JSAMPROW     ; input_data
714        add     edi, byte 2*SIZEOF_JSAMPROW     ; output_data
715        sub     ecx, byte 2                     ; rowctr
716        jg      short .rowloop
717
718.return:
719        pop     edi
720        pop     esi
721;       pop     edx             ; need not be preserved
722;       pop     ecx             ; need not be preserved
723        pop     ebx
724        pop     ebp
725        ret
726
727; For some reason, the OS X linker does not honor the request to align the
728; segment unless we do this.
729        align   16
730