• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1;
2; jdsample.asm - upsampling (MMX)
3;
4; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5; Copyright (C) 2016, D. R. Commander.
6;
7; Based on the x86 SIMD extension for IJG JPEG library
8; Copyright (C) 1999-2006, MIYASAKA Masaru.
9; For conditions of distribution and use, see copyright notice in jsimdext.inc
10;
11; This file should be assembled with NASM (Netwide Assembler),
12; can *not* be assembled with Microsoft's MASM or any compatible
13; assembler (including Borland's Turbo Assembler).
14; NASM is available from http://nasm.sourceforge.net/ or
15; http://sourceforge.net/project/showfiles.php?group_id=6208
16
17%include "jsimdext.inc"
18
19; --------------------------------------------------------------------------
20    SECTION     SEG_CONST
21
22    alignz      32
23    GLOBAL_DATA(jconst_fancy_upsample_mmx)
24
; Constant table for the two "fancy" (triangle filter) upsamplers in this file.
; Each entry is four identical 16-bit words (8 bytes), so it can be used
; directly as the memory operand of a packed-word MMX instruction.
25EXTN(jconst_fancy_upsample_mmx):
26
27PW_ONE   times 4 dw 1                   ; h2v1 fancy: rounding term, even outputs
28PW_TWO   times 4 dw 2                   ; h2v1 fancy: rounding term, odd outputs
29PW_THREE times 4 dw 3                   ; weight of the nearer sample (both routines)
30PW_SEVEN times 4 dw 7                   ; h2v2 fancy: rounding term, odd outputs
31PW_EIGHT times 4 dw 8                   ; h2v2 fancy: rounding term, even outputs
32
33    alignz      32
34
35; --------------------------------------------------------------------------
36    SECTION     SEG_TEXT
37    BITS        32
38;
39; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
40;
41; The upsampling algorithm is linear interpolation between pixel centers,
42; also known as a "triangle filter".  This is a good compromise between
43; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
44; of the way between input pixel centers.
45;
46; GLOBAL(void)
47; jsimd_h2v1_fancy_upsample_mmx(int max_v_samp_factor,
48;                               JDIMENSION downsampled_width,
49;                               JSAMPARRAY input_data,
50;                               JSAMPARRAY *output_data_ptr);
51;
52
; As implemented below, every input sample in[i] yields two output samples:
;   out[2*i]   = (3 * in[i] + in[i-1] + 1) >> 2
;   out[2*i+1] = (3 * in[i] + in[i+1] + 2) >> 2
; At either end of a row the missing neighbor is replaced by the edge sample
; itself.  Rows are processed one MMWORD of input (two MMWORDs of output) at
; a time, so the row buffers are read and written in whole MMWORDs; when the
; width is not a multiple of SIZEOF_MMWORD, one extra "dummy" sample is also
; written into the input row just past its last sample.
;
; Register usage:
;   eax = colctr (remaining input samples, rounded up to whole MMWORDs)
;   ecx = rowctr
;   esi = input_data  (inptr inside the column loop)
;   edi = output_data (outptr inside the column loop)
;   ebx = GOT address used by GOTOFF() (set up by get_GOT)
;   dl  = scratch
;   mm0 = all zeros (for byte -> word unpacking)
;   mm7 = left neighbor of the current block (sample "-1"), in the lowest byte
;   mm6 = right neighbor of the current block (sample "8"), in the highest byte
;
; Arguments are addressed relative to ebp after the "push ebp / mov ebp, esp"
; prologue, so the first argument is at offset +8.
53%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
54%define downsamp_width(b)   (b) + 12    ; JDIMENSION downsampled_width
55%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
56%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
57
58    align       32
59    GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_mmx)
60
61EXTN(jsimd_h2v1_fancy_upsample_mmx):
62    push        ebp
63    mov         ebp, esp
64    pushpic     ebx
65;   push        ecx                     ; need not be preserved
66;   push        edx                     ; need not be preserved
67    push        esi
68    push        edi
69
70    get_GOT     ebx                     ; get GOT address
71
    ; Return immediately if the width or the row count is zero.  No MMX
    ; instruction has been executed yet, so skipping emms is harmless.
72    mov         eax, JDIMENSION [downsamp_width(ebp)]  ; colctr
73    test        eax, eax
74    jz          near .return
75
76    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
77    test        ecx, ecx
78    jz          near .return
79
80    mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
81    mov         edi, POINTER [output_data_ptr(ebp)]
82    mov         edi, JSAMPARRAY [edi]                ; output_data
83    alignx      16, 7
84.rowloop:
    ; Save the per-row state; eax/edi/esi are modified by the column loop
    ; and restored before advancing to the next row.
85    push        eax                     ; colctr
86    push        edi
87    push        esi
88
89    mov         esi, JSAMPROW [esi]     ; inptr
90    mov         edi, JSAMPROW [edi]     ; outptr
91
    ; If the width is not a multiple of SIZEOF_MMWORD, copy the last sample
    ; to the position just past the end of the row so that the last real
    ; sample has a right-hand neighbor equal to itself.
92    test        eax, SIZEOF_MMWORD-1
93    jz          short .skip
94    mov         dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
95    mov         JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
96.skip:
97    pxor        mm0, mm0                ; mm0=(all 0's)
    ; mm7 = first sample of the row in the lowest byte, all other bytes zero.
    ; It acts as sample "-1" of the first block (left edge replication).
98    pcmpeqb     mm7, mm7
99    psrlq       mm7, (SIZEOF_MMWORD-1)*BYTE_BIT
100    pand        mm7,  MMWORD [esi+0*SIZEOF_MMWORD]
101
    ; Round colctr up to a whole number of MMWORDs.  If more than one block
    ; remains, take the general path; otherwise fall into the last-block path.
102    add         eax, byte SIZEOF_MMWORD-1
103    and         eax, byte -SIZEOF_MMWORD
104    cmp         eax, byte SIZEOF_MMWORD
105    ja          short .columnloop
106    alignx      16, 7
107
108.columnloop_last:
    ; Last block of the row: there is no following block, so the right
    ; neighbor (sample "8") is this block's own last sample, kept in place in
    ; the highest byte (right edge replication).
109    pcmpeqb     mm6, mm6
110    psllq       mm6, (SIZEOF_MMWORD-1)*BYTE_BIT
111    pand        mm6, MMWORD [esi+0*SIZEOF_MMWORD]
112    jmp         short .upsample
113    alignx      16, 7
114
115.columnloop:
    ; General block: the right neighbor (sample "8") is the first sample of
    ; the next block, shifted up into the highest byte.
116    movq        mm6, MMWORD [esi+1*SIZEOF_MMWORD]
117    psllq       mm6, (SIZEOF_MMWORD-1)*BYTE_BIT
118
119.upsample:
    ; Build the left-neighbor vector (mm2) and right-neighbor vector (mm3)
    ; by shifting the current block one byte each way and filling the vacated
    ; byte from mm7 / mm6.  In the comments the lowest byte is listed first.
120    movq        mm1, MMWORD [esi+0*SIZEOF_MMWORD]
121    movq        mm2, mm1
122    movq        mm3, mm1                ; mm1=( 0 1 2 3 4 5 6 7)
123    psllq       mm2, BYTE_BIT           ; mm2=( - 0 1 2 3 4 5 6)
124    psrlq       mm3, BYTE_BIT           ; mm3=( 1 2 3 4 5 6 7 -)
125
126    por         mm2, mm7                ; mm2=(-1 0 1 2 3 4 5 6)
127    por         mm3, mm6                ; mm3=( 1 2 3 4 5 6 7 8)
128
    ; Carry this block's last sample into mm7 (lowest byte) so that it
    ; becomes sample "-1" of the next block.
129    movq        mm7, mm1
130    psrlq       mm7, (SIZEOF_MMWORD-1)*BYTE_BIT  ; mm7=( 7 - - - - - - -)
131
    ; Widen all three vectors from bytes to 16-bit words (low/high halves).
132    movq        mm4, mm1
133    punpcklbw   mm1, mm0                ; mm1=( 0 1 2 3)
134    punpckhbw   mm4, mm0                ; mm4=( 4 5 6 7)
135    movq        mm5, mm2
136    punpcklbw   mm2, mm0                ; mm2=(-1 0 1 2)
137    punpckhbw   mm5, mm0                ; mm5=( 3 4 5 6)
138    movq        mm6, mm3
139    punpcklbw   mm3, mm0                ; mm3=( 1 2 3 4)
140    punpckhbw   mm6, mm0                ; mm6=( 5 6 7 8)
141
    ; mm1/mm4 = 3 * current; add the rounding terms to the neighbor vectors
    ; (1 for the even outputs, 2 for the odd outputs).
142    pmullw      mm1, [GOTOFF(ebx,PW_THREE)]
143    pmullw      mm4, [GOTOFF(ebx,PW_THREE)]
144    paddw       mm2, [GOTOFF(ebx,PW_ONE)]
145    paddw       mm5, [GOTOFF(ebx,PW_ONE)]
146    paddw       mm3, [GOTOFF(ebx,PW_TWO)]
147    paddw       mm6, [GOTOFF(ebx,PW_TWO)]
148
    ; even = (3*cur + left + 1) >> 2,  odd = (3*cur + right + 2) >> 2
149    paddw       mm2, mm1
150    paddw       mm5, mm4
151    psrlw       mm2, 2                  ; mm2=OutLE=( 0  2  4  6)
152    psrlw       mm5, 2                  ; mm5=OutHE=( 8 10 12 14)
153    paddw       mm3, mm1
154    paddw       mm6, mm4
155    psrlw       mm3, 2                  ; mm3=OutLO=( 1  3  5  7)
156    psrlw       mm6, 2                  ; mm6=OutHO=( 9 11 13 15)
157
    ; Interleave: move each odd result into the high byte of its word and
    ; merge it with the even result sitting in the low byte.
158    psllw       mm3, BYTE_BIT
159    psllw       mm6, BYTE_BIT
160    por         mm2, mm3                ; mm2=OutL=( 0  1  2  3  4  5  6  7)
161    por         mm5, mm6                ; mm5=OutH=( 8  9 10 11 12 13 14 15)
162
163    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm2
164    movq        MMWORD [edi+1*SIZEOF_MMWORD], mm5
165
    ; Advance one input MMWORD / two output MMWORDs.  More than one block
    ; left -> general path; exactly one left -> last-block path; none -> done.
166    sub         eax, byte SIZEOF_MMWORD
167    add         esi, byte 1*SIZEOF_MMWORD  ; inptr
168    add         edi, byte 2*SIZEOF_MMWORD  ; outptr
169    cmp         eax, byte SIZEOF_MMWORD
170    ja          near .columnloop
171    test        eax, eax
172    jnz         near .columnloop_last
173
    ; Restore the per-row state and step to the next row pointer in both
    ; the input and the output row arrays.
174    pop         esi
175    pop         edi
176    pop         eax
177
178    add         esi, byte SIZEOF_JSAMPROW  ; input_data
179    add         edi, byte SIZEOF_JSAMPROW  ; output_data
180    dec         ecx                        ; rowctr
181    jg          near .rowloop
182
183    emms                                ; empty MMX state
184
185.return:
    ; Restore the registers saved in the prologue, in reverse order.
186    pop         edi
187    pop         esi
188;   pop         edx                     ; need not be preserved
189;   pop         ecx                     ; need not be preserved
190    poppic      ebx
191    pop         ebp
192    ret
193
194; --------------------------------------------------------------------------
195;
196; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
197; Again a triangle filter; see comments for h2v1 case, above.
198;
199; GLOBAL(void)
200; jsimd_h2v2_fancy_upsample_mmx(int max_v_samp_factor,
201;                               JDIMENSION downsampled_width,
202;                               JSAMPARRAY input_data,
203;                               JSAMPARRAY *output_data_ptr);
204;
205
; Overview of the implementation below.  Each loop iteration over rows
; consumes one input row and produces two output rows, in two passes per
; block of SIZEOF_MMWORD input samples:
;   1. Vertical pass (16-bit intermediates, temporarily stored in the two
;      output rows):
;        Int0[i] = 3 * row[0][i] + row[-1][i]      (upper output row)
;        Int1[i] = 3 * row[0][i] + row[+1][i]      (lower output row)
;   2. Horizontal pass, applied to Int0 and Int1 alike:
;        out[2*i]   = (3 * Int[i] + Int[i-1] + 8) >> 4
;        out[2*i+1] = (3 * Int[i] + Int[i+1] + 7) >> 4
; At the row ends the missing horizontal neighbor is replaced by the edge
; value itself.  The rows above and below the current input row are read
; through input_data[-1] and input_data[+1], so those row pointers must be
; valid.  When the width is not a multiple of SIZEOF_MMWORD, one dummy sample
; is written just past the end of each of the three input rows.
;
; Register usage inside .rowloop:
;   eax = colctr (rounded up to whole MMWORDs)
;   ecx = inptr1(above), ebx = inptr0, esi = inptr1(below)
;   edx = outptr0, edi = outptr1
;   ebx is temporarily replaced by the GOT address between each
;   pushpic ebx / poppic ebx pair (NOTE(review): pushpic/poppic/movpic are
;   macros defined outside this file, presumably in jsimdext.inc).
;
; Work area (on the aligned stack frame):
;   wk(0), wk(1) = left neighbor word ("-1") for the upper / lower row
;   wk(2), wk(3) = right neighbor word ("8") for the upper / lower row
206%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
207%define downsamp_width(b)   (b) + 12    ; JDIMENSION downsampled_width
208%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
209%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
210
211%define original_ebp  ebp + 0
212%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_MMWORD  ; mmword wk[WK_NUM]
213%define WK_NUM        4
214%define gotptr        wk(0) - SIZEOF_POINTER  ; void *gotptr
215
216    align       32
217    GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_mmx)
218
219EXTN(jsimd_h2v2_fancy_upsample_mmx):
    ; Prologue: build a stack frame whose base (ebp) is aligned to
    ; SIZEOF_MMWORD.  "Original ebp" means the unaligned frame base, i.e. esp
    ; right after "push ebp"; the arguments are addressed relative to it.
    ; That value is stored at [aligned ebp] so the epilogue can recover it.
220    push        ebp
221    mov         eax, esp                    ; eax = original ebp
222    sub         esp, byte 4
223    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
224    mov         [esp], eax
225    mov         ebp, esp                    ; ebp = aligned ebp
226    lea         esp, [wk(0)]                ; reserve wk[WK_NUM] below ebp
227    pushpic     eax                     ; make room for GOT address
228    push        ebx
229;   push        ecx                     ; need not be preserved
230;   push        edx                     ; need not be preserved
231    push        esi
232    push        edi
233
234    get_GOT     ebx                     ; get GOT address
235    movpic      POINTER [gotptr], ebx   ; save GOT address
236
    ; Return immediately if the width or the row count is zero.  No MMX
    ; instruction has been executed yet, so skipping emms is harmless.
237    mov         edx, eax                ; edx = original ebp
238    mov         eax, JDIMENSION [downsamp_width(edx)]  ; colctr
239    test        eax, eax
240    jz          near .return
241
242    mov         ecx, INT [max_v_samp(edx)]  ; rowctr
243    test        ecx, ecx
244    jz          near .return
245
246    mov         esi, JSAMPARRAY [input_data(edx)]    ; input_data
247    mov         edi, POINTER [output_data_ptr(edx)]
248    mov         edi, JSAMPARRAY [edi]                ; output_data
249    alignx      16, 7
250.rowloop:
    ; Save the per-row state; all four registers are reused inside the
    ; column loop and restored before advancing to the next row.
251    push        eax                     ; colctr
252    push        ecx
253    push        edi
254    push        esi
255
256    mov         ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
257    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; inptr0
258    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
259    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; outptr0
260    mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; outptr1
261
    ; If the width is not a multiple of SIZEOF_MMWORD, replicate the last
    ; sample of each of the three input rows one position past the row end.
    ; edx (outptr0) is saved around this because dl is used as scratch.
262    test        eax, SIZEOF_MMWORD-1
263    jz          short .skip
264    push        edx
265    mov         dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
266    mov         JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
267    mov         dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
268    mov         JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
269    mov         dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
270    mov         JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
271    pop         edx
272.skip:
273    ; -- process the first column block
    ; Vertical pass for block 0; the results are parked in the output rows.
274
275    movq        mm0, MMWORD [ebx+0*SIZEOF_MMWORD]  ; mm0=row[ 0][0]
276    movq        mm1, MMWORD [ecx+0*SIZEOF_MMWORD]  ; mm1=row[-1][0]
277    movq        mm2, MMWORD [esi+0*SIZEOF_MMWORD]  ; mm2=row[+1][0]
278
279    pushpic     ebx
280    movpic      ebx, POINTER [gotptr]   ; load GOT address
281
282    pxor        mm3, mm3                ; mm3=(all 0's)
283    movq        mm4, mm0
284    punpcklbw   mm0, mm3                ; mm0=row[ 0][0]( 0 1 2 3)
285    punpckhbw   mm4, mm3                ; mm4=row[ 0][0]( 4 5 6 7)
286    movq        mm5, mm1
287    punpcklbw   mm1, mm3                ; mm1=row[-1][0]( 0 1 2 3)
288    punpckhbw   mm5, mm3                ; mm5=row[-1][0]( 4 5 6 7)
289    movq        mm6, mm2
290    punpcklbw   mm2, mm3                ; mm2=row[+1][0]( 0 1 2 3)
291    punpckhbw   mm6, mm3                ; mm6=row[+1][0]( 4 5 6 7)
292
293    pmullw      mm0, [GOTOFF(ebx,PW_THREE)]
294    pmullw      mm4, [GOTOFF(ebx,PW_THREE)]
295
    ; mm7 = mask selecting the lowest 16-bit word of an MMWORD.
296    pcmpeqb     mm7, mm7
297    psrlq       mm7, (SIZEOF_MMWORD-2)*BYTE_BIT
298
299    paddw       mm1, mm0                ; mm1=Int0L=( 0 1 2 3)
300    paddw       mm5, mm4                ; mm5=Int0H=( 4 5 6 7)
301    paddw       mm2, mm0                ; mm2=Int1L=( 0 1 2 3)
302    paddw       mm6, mm4                ; mm6=Int1H=( 4 5 6 7)
303
304    movq        MMWORD [edx+0*SIZEOF_MMWORD], mm1  ; temporarily save
305    movq        MMWORD [edx+1*SIZEOF_MMWORD], mm5  ; the intermediate data
306    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm2
307    movq        MMWORD [edi+1*SIZEOF_MMWORD], mm6
308
    ; Left edge replication: the first intermediate word of each row serves
    ; as its own left neighbor ("-1") for the first block.
309    pand        mm1, mm7                ; mm1=( 0 - - -)
310    pand        mm2, mm7                ; mm2=( 0 - - -)
311
312    movq        MMWORD [wk(0)], mm1
313    movq        MMWORD [wk(1)], mm2
314
315    poppic      ebx
316
    ; Round colctr up to a whole number of MMWORDs.  If more than one block
    ; remains, take the general path; otherwise fall into the last-block path.
317    add         eax, byte SIZEOF_MMWORD-1
318    and         eax, byte -SIZEOF_MMWORD
319    cmp         eax, byte SIZEOF_MMWORD
320    ja          short .columnloop
321    alignx      16, 7
322
323.columnloop_last:
324    ; -- process the last column block
    ; There is no following block, so the right neighbor ("8") of each row is
    ; the block's own last intermediate word, kept in place in the highest
    ; word (right edge replication).  The intermediates of this block are
    ; already stored at edx/edi + 0..1 MMWORDs.
325
326    pushpic     ebx
327    movpic      ebx, POINTER [gotptr]   ; load GOT address
328
329    pcmpeqb     mm1, mm1
330    psllq       mm1, (SIZEOF_MMWORD-2)*BYTE_BIT
331    movq        mm2, mm1
332
333    pand        mm1, MMWORD [edx+1*SIZEOF_MMWORD]  ; mm1=( - - - 7)
334    pand        mm2, MMWORD [edi+1*SIZEOF_MMWORD]  ; mm2=( - - - 7)
335
336    movq        MMWORD [wk(2)], mm1
337    movq        MMWORD [wk(3)], mm2
338
339    jmp         short .upsample
340    alignx      16, 7
341
342.columnloop:
343    ; -- process the next column block
    ; Vertical pass for the block FOLLOWING the current one.  Its
    ; intermediates are stored at edx/edi + 2..3 MMWORDs (where they will be
    ; picked up as the current block after the pointers advance), and its
    ; first word provides the right neighbor ("8") of the current block.
344
345    movq        mm0, MMWORD [ebx+1*SIZEOF_MMWORD]  ; mm0=row[ 0][1]
346    movq        mm1, MMWORD [ecx+1*SIZEOF_MMWORD]  ; mm1=row[-1][1]
347    movq        mm2, MMWORD [esi+1*SIZEOF_MMWORD]  ; mm2=row[+1][1]
348
349    pushpic     ebx
350    movpic      ebx, POINTER [gotptr]   ; load GOT address
351
352    pxor        mm3, mm3                ; mm3=(all 0's)
353    movq        mm4, mm0
354    punpcklbw   mm0, mm3                ; mm0=row[ 0][1]( 0 1 2 3)
355    punpckhbw   mm4, mm3                ; mm4=row[ 0][1]( 4 5 6 7)
356    movq        mm5, mm1
357    punpcklbw   mm1, mm3                ; mm1=row[-1][1]( 0 1 2 3)
358    punpckhbw   mm5, mm3                ; mm5=row[-1][1]( 4 5 6 7)
359    movq        mm6, mm2
360    punpcklbw   mm2, mm3                ; mm2=row[+1][1]( 0 1 2 3)
361    punpckhbw   mm6, mm3                ; mm6=row[+1][1]( 4 5 6 7)
362
363    pmullw      mm0, [GOTOFF(ebx,PW_THREE)]
364    pmullw      mm4, [GOTOFF(ebx,PW_THREE)]
365
366    paddw       mm1, mm0                ; mm1=Int0L=( 0 1 2 3)
367    paddw       mm5, mm4                ; mm5=Int0H=( 4 5 6 7)
368    paddw       mm2, mm0                ; mm2=Int1L=( 0 1 2 3)
369    paddw       mm6, mm4                ; mm6=Int1H=( 4 5 6 7)
370
371    movq        MMWORD [edx+2*SIZEOF_MMWORD], mm1  ; temporarily save
372    movq        MMWORD [edx+3*SIZEOF_MMWORD], mm5  ; the intermediate data
373    movq        MMWORD [edi+2*SIZEOF_MMWORD], mm2
374    movq        MMWORD [edi+3*SIZEOF_MMWORD], mm6
375
    ; Move the next block's first word into the highest word position; it is
    ; the right neighbor ("8") of the current block.
376    psllq       mm1, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm1=( - - - 0)
377    psllq       mm2, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm2=( - - - 0)
378
379    movq        MMWORD [wk(2)], mm1
380    movq        MMWORD [wk(3)], mm2
381
382.upsample:
383    ; -- process the upper row
    ; Horizontal pass over Int0 (current block).  In the comments the lowest
    ; word is listed first.  ebx holds the GOT address here (pushed by the
    ; path that jumped/fell into .upsample, popped at the end of the pass).
384
385    movq        mm7, MMWORD [edx+0*SIZEOF_MMWORD]  ; mm7=Int0L=( 0 1 2 3)
386    movq        mm3, MMWORD [edx+1*SIZEOF_MMWORD]  ; mm3=Int0H=( 4 5 6 7)
387
    ; Right neighbors of the low half and left neighbors of the high half
    ; both come from within the block.
388    movq        mm0, mm7
389    movq        mm4, mm3
390    psrlq       mm0, 2*BYTE_BIT                  ; mm0=( 1 2 3 -)
391    psllq       mm4, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm4=( - - - 4)
392    movq        mm5, mm7
393    movq        mm6, mm3
394    psrlq       mm5, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm5=( 3 - - -)
395    psllq       mm6, 2*BYTE_BIT                  ; mm6=( - 4 5 6)
396
397    por         mm0, mm4                         ; mm0=( 1 2 3 4)
398    por         mm5, mm6                         ; mm5=( 3 4 5 6)
399
    ; Left neighbors of the low half need word "-1" (wk(0)); right neighbors
    ; of the high half need word "8" (wk(2)).
400    movq        mm1, mm7
401    movq        mm2, mm3
402    psllq       mm1, 2*BYTE_BIT                  ; mm1=( - 0 1 2)
403    psrlq       mm2, 2*BYTE_BIT                  ; mm2=( 5 6 7 -)
404    movq        mm4, mm3
405    psrlq       mm4, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm4=( 7 - - -)
406
407    por         mm1, MMWORD [wk(0)]              ; mm1=(-1 0 1 2)
408    por         mm2, MMWORD [wk(2)]              ; mm2=( 5 6 7 8)
409
    ; This block's last word becomes word "-1" of the next block.
410    movq        MMWORD [wk(0)], mm4
411
    ; even = (3*cur + left + 8) >> 4,  odd = (3*cur + right + 7) >> 4
412    pmullw      mm7, [GOTOFF(ebx,PW_THREE)]
413    pmullw      mm3, [GOTOFF(ebx,PW_THREE)]
414    paddw       mm1, [GOTOFF(ebx,PW_EIGHT)]
415    paddw       mm5, [GOTOFF(ebx,PW_EIGHT)]
416    paddw       mm0, [GOTOFF(ebx,PW_SEVEN)]
417    paddw       mm2, [GOTOFF(ebx,PW_SEVEN)]
418
419    paddw       mm1, mm7
420    paddw       mm5, mm3
421    psrlw       mm1, 4                  ; mm1=Out0LE=( 0  2  4  6)
422    psrlw       mm5, 4                  ; mm5=Out0HE=( 8 10 12 14)
423    paddw       mm0, mm7
424    paddw       mm2, mm3
425    psrlw       mm0, 4                  ; mm0=Out0LO=( 1  3  5  7)
426    psrlw       mm2, 4                  ; mm2=Out0HO=( 9 11 13 15)
427
    ; Interleave: odd results go to the high byte of each word, even results
    ; stay in the low byte.
428    psllw       mm0, BYTE_BIT
429    psllw       mm2, BYTE_BIT
430    por         mm1, mm0                ; mm1=Out0L=( 0  1  2  3  4  5  6  7)
431    por         mm5, mm2                ; mm5=Out0H=( 8  9 10 11 12 13 14 15)
432
    ; Overwrite the parked intermediates with the final upper-row samples.
433    movq        MMWORD [edx+0*SIZEOF_MMWORD], mm1
434    movq        MMWORD [edx+1*SIZEOF_MMWORD], mm5
435
436    ; -- process the lower row
    ; Same horizontal pass over Int1, using wk(1)/wk(3) as the neighbors.
437
438    movq        mm6, MMWORD [edi+0*SIZEOF_MMWORD]  ; mm6=Int1L=( 0 1 2 3)
439    movq        mm4, MMWORD [edi+1*SIZEOF_MMWORD]  ; mm4=Int1H=( 4 5 6 7)
440
441    movq        mm7, mm6
442    movq        mm3, mm4
443    psrlq       mm7, 2*BYTE_BIT                  ; mm7=( 1 2 3 -)
444    psllq       mm3, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm3=( - - - 4)
445    movq        mm0, mm6
446    movq        mm2, mm4
447    psrlq       mm0, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm0=( 3 - - -)
448    psllq       mm2, 2*BYTE_BIT                  ; mm2=( - 4 5 6)
449
450    por         mm7, mm3                         ; mm7=( 1 2 3 4)
451    por         mm0, mm2                         ; mm0=( 3 4 5 6)
452
453    movq        mm1, mm6
454    movq        mm5, mm4
455    psllq       mm1, 2*BYTE_BIT                  ; mm1=( - 0 1 2)
456    psrlq       mm5, 2*BYTE_BIT                  ; mm5=( 5 6 7 -)
457    movq        mm3, mm4
458    psrlq       mm3, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm3=( 7 - - -)
459
460    por         mm1, MMWORD [wk(1)]              ; mm1=(-1 0 1 2)
461    por         mm5, MMWORD [wk(3)]              ; mm5=( 5 6 7 8)
462
    ; This block's last word becomes word "-1" of the next block.
463    movq        MMWORD [wk(1)], mm3
464
465    pmullw      mm6, [GOTOFF(ebx,PW_THREE)]
466    pmullw      mm4, [GOTOFF(ebx,PW_THREE)]
467    paddw       mm1, [GOTOFF(ebx,PW_EIGHT)]
468    paddw       mm0, [GOTOFF(ebx,PW_EIGHT)]
469    paddw       mm7, [GOTOFF(ebx,PW_SEVEN)]
470    paddw       mm5, [GOTOFF(ebx,PW_SEVEN)]
471
472    paddw       mm1, mm6
473    paddw       mm0, mm4
474    psrlw       mm1, 4                  ; mm1=Out1LE=( 0  2  4  6)
475    psrlw       mm0, 4                  ; mm0=Out1HE=( 8 10 12 14)
476    paddw       mm7, mm6
477    paddw       mm5, mm4
478    psrlw       mm7, 4                  ; mm7=Out1LO=( 1  3  5  7)
479    psrlw       mm5, 4                  ; mm5=Out1HO=( 9 11 13 15)
480
481    psllw       mm7, BYTE_BIT
482    psllw       mm5, BYTE_BIT
483    por         mm1, mm7                ; mm1=Out1L=( 0  1  2  3  4  5  6  7)
484    por         mm0, mm5                ; mm0=Out1H=( 8  9 10 11 12 13 14 15)
485
    ; Overwrite the parked intermediates with the final lower-row samples.
486    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm1
487    movq        MMWORD [edi+1*SIZEOF_MMWORD], mm0
488
489    poppic      ebx
490
    ; Advance one input MMWORD / two output MMWORDs.  More than one block
    ; left -> general path; exactly one left -> last-block path; none -> done.
491    sub         eax, byte SIZEOF_MMWORD
492    add         ecx, byte 1*SIZEOF_MMWORD  ; inptr1(above)
493    add         ebx, byte 1*SIZEOF_MMWORD  ; inptr0
494    add         esi, byte 1*SIZEOF_MMWORD  ; inptr1(below)
495    add         edx, byte 2*SIZEOF_MMWORD  ; outptr0
496    add         edi, byte 2*SIZEOF_MMWORD  ; outptr1
497    cmp         eax, byte SIZEOF_MMWORD
498    ja          near .columnloop
499    test        eax, eax
500    jnz         near .columnloop_last
501
    ; Restore the per-row state.  One input row was consumed and two output
    ; rows were produced, so rowctr is decremented by 2.
502    pop         esi
503    pop         edi
504    pop         ecx
505    pop         eax
506
507    add         esi, byte 1*SIZEOF_JSAMPROW  ; input_data
508    add         edi, byte 2*SIZEOF_JSAMPROW  ; output_data
509    sub         ecx, byte 2                  ; rowctr
510    jg          near .rowloop
511
512    emms                                ; empty MMX state
513
514.return:
    ; Epilogue: restore the saved registers, then unwind the aligned frame.
    ; Resetting esp from ebp discards the work area and the GOT pointer slot;
    ; "pop esp" reloads the original frame base stored at [aligned ebp].
515    pop         edi
516    pop         esi
517;   pop         edx                     ; need not be preserved
518;   pop         ecx                     ; need not be preserved
519    pop         ebx
520    mov         esp, ebp                ; esp <- aligned ebp
521    pop         esp                     ; esp <- original ebp
522    pop         ebp
523    ret
524
525; --------------------------------------------------------------------------
526;
527; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
528; It's still a box filter.
529;
530; GLOBAL(void)
531; jsimd_h2v1_upsample_mmx(int max_v_samp_factor, JDIMENSION output_width,
532;                         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
533;
534
; Box-filter 2:1 horizontal upsampling: every input sample is written twice
; in succession to the output row.  The column count is output_width rounded
; up to a multiple of 2*SIZEOF_MMWORD, and rows are read/written in whole
; MMWORDs, so the row buffers are accessed up to that rounded-up width.
;
; Register usage:
;   edx = rounded-up output width (constant for all rows)
;   eax = colctr (remaining output samples in the current row)
;   ecx = rowctr
;   esi = input_data  (inptr inside the column loop)
;   edi = output_data (outptr inside the column loop)
535%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
536%define output_width(b)     (b) + 12    ; JDIMENSION output_width
537%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
538%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
539
540    align       32
541    GLOBAL_FUNCTION(jsimd_h2v1_upsample_mmx)
542
543EXTN(jsimd_h2v1_upsample_mmx):
544    push        ebp
545    mov         ebp, esp
546;   push        ebx                     ; unused
547;   push        ecx                     ; need not be preserved
548;   push        edx                     ; need not be preserved
549    push        esi
550    push        edi
551
    ; edx = output_width rounded up to a multiple of 2*SIZEOF_MMWORD; the
    ; "and" sets ZF when the result is zero, i.e. when output_width is 0.
552    mov         edx, JDIMENSION [output_width(ebp)]
553    add         edx, byte (2*SIZEOF_MMWORD)-1
554    and         edx, byte -(2*SIZEOF_MMWORD)
555    jz          short .return
556
557    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
558    test        ecx, ecx
559    jz          short .return
560
561    mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
562    mov         edi, POINTER [output_data_ptr(ebp)]
563    mov         edi, JSAMPARRAY [edi]                ; output_data
564    alignx      16, 7
565.rowloop:
    ; Save the row-array pointers; esi/edi become sample pointers below.
566    push        edi
567    push        esi
568
569    mov         esi, JSAMPROW [esi]     ; inptr
570    mov         edi, JSAMPROW [edi]     ; outptr
571    mov         eax, edx                ; colctr
572    alignx      16, 7
573.columnloop:
    ; The loop body is unrolled twice: each half expands one input MMWORD
    ; into two output MMWORDs and can exit to .nextrow on its own.
574
575    movq        mm0, MMWORD [esi+0*SIZEOF_MMWORD]
576
    ; Unpacking a register with itself duplicates each byte:
    ; low half -> mm0, high half -> mm1.
577    movq        mm1, mm0
578    punpcklbw   mm0, mm0
579    punpckhbw   mm1, mm1
580
581    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm0
582    movq        MMWORD [edi+1*SIZEOF_MMWORD], mm1
583
584    sub         eax, byte 2*SIZEOF_MMWORD
585    jz          short .nextrow
586
    ; Second (unrolled) half: next input MMWORD.
587    movq        mm2, MMWORD [esi+1*SIZEOF_MMWORD]
588
589    movq        mm3, mm2
590    punpcklbw   mm2, mm2
591    punpckhbw   mm3, mm3
592
593    movq        MMWORD [edi+2*SIZEOF_MMWORD], mm2
594    movq        MMWORD [edi+3*SIZEOF_MMWORD], mm3
595
596    sub         eax, byte 2*SIZEOF_MMWORD
597    jz          short .nextrow
598
599    add         esi, byte 2*SIZEOF_MMWORD  ; inptr
600    add         edi, byte 4*SIZEOF_MMWORD  ; outptr
601    jmp         short .columnloop
602    alignx      16, 7
603
604.nextrow:
    ; Restore the row-array pointers and step both to the next row.
605    pop         esi
606    pop         edi
607
608    add         esi, byte SIZEOF_JSAMPROW  ; input_data
609    add         edi, byte SIZEOF_JSAMPROW  ; output_data
610    dec         ecx                        ; rowctr
611    jg          short .rowloop
612
613    emms                                ; empty MMX state
614
615.return:
    ; Restore the registers saved in the prologue, in reverse order.
616    pop         edi
617    pop         esi
618;   pop         edx                     ; need not be preserved
619;   pop         ecx                     ; need not be preserved
620;   pop         ebx                     ; unused
621    pop         ebp
622    ret
623
624; --------------------------------------------------------------------------
625;
626; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
627; It's still a box filter.
628;
629; GLOBAL(void)
630; jsimd_h2v2_upsample_mmx(int max_v_samp_factor, JDIMENSION output_width,
631;                         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
632;
633
; Box-filter 2:1 horizontal and 2:1 vertical upsampling: every input sample
; is written twice in succession, and the resulting row is stored to two
; consecutive output rows.  The column count is output_width rounded up to a
; multiple of 2*SIZEOF_MMWORD, and rows are read/written in whole MMWORDs, so
; the row buffers are accessed up to that rounded-up width.
;
; Register usage:
;   edx = rounded-up output width (constant for all rows)
;   eax = colctr (remaining output samples in the current row pair)
;   ecx = rowctr (decremented by 2 per input row)
;   esi = input_data  (inptr inside the column loop)
;   edi = output_data (outptr1 inside the column loop)
;   ebx = outptr0 (callee-saved, hence pushed in the prologue)
634%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
635%define output_width(b)     (b) + 12    ; JDIMENSION output_width
636%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
637%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
638
639    align       32
640    GLOBAL_FUNCTION(jsimd_h2v2_upsample_mmx)
641
642EXTN(jsimd_h2v2_upsample_mmx):
643    push        ebp
644    mov         ebp, esp
645    push        ebx
646;   push        ecx                     ; need not be preserved
647;   push        edx                     ; need not be preserved
648    push        esi
649    push        edi
650
    ; edx = output_width rounded up to a multiple of 2*SIZEOF_MMWORD; the
    ; "and" sets ZF when the result is zero, i.e. when output_width is 0.
651    mov         edx, JDIMENSION [output_width(ebp)]
652    add         edx, byte (2*SIZEOF_MMWORD)-1
653    and         edx, byte -(2*SIZEOF_MMWORD)
654    jz          near .return
655
656    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
657    test        ecx, ecx
658    jz          short .return
659
660    mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
661    mov         edi, POINTER [output_data_ptr(ebp)]
662    mov         edi, JSAMPARRAY [edi]                ; output_data
663    alignx      16, 7
664.rowloop:
    ; Save the row-array pointers; esi/edi become sample pointers below.
665    push        edi
666    push        esi
667
668    mov         esi, JSAMPROW [esi]                    ; inptr
669    mov         ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; outptr0
670    mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; outptr1
671    mov         eax, edx                               ; colctr
672    alignx      16, 7
673.columnloop:
    ; The loop body is unrolled twice: each half expands one input MMWORD
    ; into two output MMWORDs, stores them to both output rows, and can exit
    ; to .nextrow on its own.
674
675    movq        mm0, MMWORD [esi+0*SIZEOF_MMWORD]
676
    ; Unpacking a register with itself duplicates each byte:
    ; low half -> mm0, high half -> mm1.
677    movq        mm1, mm0
678    punpcklbw   mm0, mm0
679    punpckhbw   mm1, mm1
680
681    movq        MMWORD [ebx+0*SIZEOF_MMWORD], mm0
682    movq        MMWORD [ebx+1*SIZEOF_MMWORD], mm1
683    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm0
684    movq        MMWORD [edi+1*SIZEOF_MMWORD], mm1
685
686    sub         eax, byte 2*SIZEOF_MMWORD
687    jz          short .nextrow
688
    ; Second (unrolled) half: next input MMWORD.
689    movq        mm2, MMWORD [esi+1*SIZEOF_MMWORD]
690
691    movq        mm3, mm2
692    punpcklbw   mm2, mm2
693    punpckhbw   mm3, mm3
694
695    movq        MMWORD [ebx+2*SIZEOF_MMWORD], mm2
696    movq        MMWORD [ebx+3*SIZEOF_MMWORD], mm3
697    movq        MMWORD [edi+2*SIZEOF_MMWORD], mm2
698    movq        MMWORD [edi+3*SIZEOF_MMWORD], mm3
699
700    sub         eax, byte 2*SIZEOF_MMWORD
701    jz          short .nextrow
702
703    add         esi, byte 2*SIZEOF_MMWORD  ; inptr
704    add         ebx, byte 4*SIZEOF_MMWORD  ; outptr0
705    add         edi, byte 4*SIZEOF_MMWORD  ; outptr1
706    jmp         short .columnloop
707    alignx      16, 7
708
709.nextrow:
    ; Restore the row-array pointers.  One input row was consumed and two
    ; output rows were produced, so rowctr is decremented by 2.
710    pop         esi
711    pop         edi
712
713    add         esi, byte 1*SIZEOF_JSAMPROW  ; input_data
714    add         edi, byte 2*SIZEOF_JSAMPROW  ; output_data
715    sub         ecx, byte 2                  ; rowctr
716    jg          short .rowloop
717
718    emms                                ; empty MMX state
719
720.return:
    ; Restore the registers saved in the prologue, in reverse order.
721    pop         edi
722    pop         esi
723;   pop         edx                     ; need not be preserved
724;   pop         ecx                     ; need not be preserved
725    pop         ebx
726    pop         ebp
727    ret
728
729; For some reason, the OS X linker does not honor the request to align the
730; segment unless we do this.
731    align       32
732