;
; jcphuff-sse2.asm - prepare data for progressive Huffman encoding (SSE2)
;
; Copyright (C) 2016, 2018, Matthieu Darbois
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains an SSE2 implementation of data preparation for progressive
; Huffman encoding.  See jcphuff.c for more details.
;
; [TAB8]

%include "jsimdext.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32

; --------------------------------------------------------------------------
; Macros to load data for jsimd_encode_mcu_AC_first_prepare_sse2() and
; jsimd_encode_mcu_AC_refine_prepare_sse2()

; Gather 16 JCOEFs from BLOCK through the natural-order index table LUT
; into X0 (coefficients 0-7) and X1 (coefficients 8-15).  N0/N1 are
; cleared here; callers reuse them afterwards for sign handling.
; Clobbers: T0, T1.
%macro LOAD16 0
    pxor        N0, N0
    pxor        N1, N1

    mov         T0, INT [LUT +  0*SIZEOF_INT]
    mov         T1, INT [LUT +  8*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 0
    pinsrw      X1, word [BLOCK + T1 * 2], 0

    mov         T0, INT [LUT +  1*SIZEOF_INT]
    mov         T1, INT [LUT +  9*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 1
    pinsrw      X1, word [BLOCK + T1 * 2], 1

    mov         T0, INT [LUT +  2*SIZEOF_INT]
    mov         T1, INT [LUT + 10*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 2
    pinsrw      X1, word [BLOCK + T1 * 2], 2

    mov         T0, INT [LUT +  3*SIZEOF_INT]
    mov         T1, INT [LUT + 11*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 3
    pinsrw      X1, word [BLOCK + T1 * 2], 3

    mov         T0, INT [LUT +  4*SIZEOF_INT]
    mov         T1, INT [LUT + 12*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 4
    pinsrw      X1, word [BLOCK + T1 * 2], 4

    mov         T0, INT [LUT +  5*SIZEOF_INT]
    mov         T1, INT [LUT + 13*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 5
    pinsrw      X1, word [BLOCK + T1 * 2], 5

    mov         T0, INT [LUT +  6*SIZEOF_INT]
    mov         T1, INT [LUT + 14*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 6
    pinsrw      X1, word [BLOCK + T1 * 2], 6

    mov         T0, INT [LUT +  7*SIZEOF_INT]
    mov         T1, INT [LUT + 15*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 7
    pinsrw      X1, word [BLOCK + T1 * 2], 7
%endmacro

; Gather 8 + (LENEND - 1) JCOEFs (9..15 total) from BLOCK through LUT:
; coefficients 0-7 go into X0, coefficients 8..(8 + LENEND - 2) into X1.
; X1 is zeroed first because not all of its lanes are written; lanes past
; the end stay zero.  N0/N1 are cleared for the callers' sign handling.
; Clobbers: T0, T1.
%macro LOAD15 0
    pxor        N0, N0
    pxor        N1, N1
    pxor        X1, X1

    mov         T0, INT [LUT +  0*SIZEOF_INT]
    mov         T1, INT [LUT +  8*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 0
    pinsrw      X1, word [BLOCK + T1 * 2], 0

    mov         T0, INT [LUT +  1*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 1

    mov         T0, INT [LUT +  2*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 2

    mov         T0, INT [LUT +  3*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 3

    mov         T0, INT [LUT +  4*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 4

    mov         T0, INT [LUT +  5*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 5

    mov         T0, INT [LUT +  6*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 6

    mov         T0, INT [LUT +  7*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 7

    ; Conditionally load the remaining 1..6 coefficients into X1,
    ; bailing out as soon as LENEND is exhausted.
    cmp         LENEND, 2
    jl          %%.ELOAD15
    mov         T1, INT [LUT +  9*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 1

    cmp         LENEND, 3
    jl          %%.ELOAD15
    mov         T1, INT [LUT + 10*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 2

    cmp         LENEND, 4
    jl          %%.ELOAD15
    mov         T1, INT [LUT + 11*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 3

    cmp         LENEND, 5
    jl          %%.ELOAD15
    mov         T1, INT [LUT + 12*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 4

    cmp         LENEND, 6
    jl          %%.ELOAD15
    mov         T1, INT [LUT + 13*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 5

    cmp         LENEND, 7
    jl          %%.ELOAD15
    mov         T1, INT [LUT + 14*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 6
%%.ELOAD15:
%endmacro

; Gather exactly 8 JCOEFs from BLOCK through LUT into X0.  N0 is cleared
; for the callers' sign handling.  Clobbers: T0.
%macro LOAD8 0
    pxor        N0, N0

    mov         T0, INT [LUT +  0*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 0

    mov         T0, INT [LUT +  1*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 1

    mov         T0, INT [LUT +  2*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 2

    mov         T0, INT [LUT +  3*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 3

    mov         T0, INT [LUT +  4*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 4

    mov         T0, INT [LUT +  5*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 5

    mov         T0, INT [LUT +  6*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 6

    mov         T0, INT [LUT +  7*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 7
%endmacro

; Gather LENEND (1..7) JCOEFs from BLOCK through LUT into X0.  X0 is
; zeroed first because not all of its lanes are written; unwritten lanes
; stay zero.  N0 is cleared for the callers' sign handling.
; Clobbers: T1.
%macro LOAD7 0
    pxor        N0, N0
    pxor        X0, X0

    mov         T1, INT [LUT +  0*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 0

    cmp         LENEND, 2
    jl          %%.ELOAD7
    mov         T1, INT [LUT +  1*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 1

    cmp         LENEND, 3
    jl          %%.ELOAD7
    mov         T1, INT [LUT +  2*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 2

    cmp         LENEND, 4
    jl          %%.ELOAD7
    mov         T1, INT [LUT +  3*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 3

    cmp         LENEND, 5
    jl          %%.ELOAD7
    mov         T1, INT [LUT +  4*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 4

    cmp         LENEND, 6
    jl          %%.ELOAD7
    mov         T1, INT [LUT +  5*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 5

    cmp         LENEND, 7
    jl          %%.ELOAD7
    mov         T1, INT [LUT +  6*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 6
%%.ELOAD7:
%endmacro

; Build the 64-bit "nonzero coefficient" bitmap from the 64 words at
; VALUES and store it as two 32-bit ints at *ZEROBITS (bit set = value
; is nonzero).  Note the last row is compared directly into ZERO (xmm7),
; destroying it, so this macro must run after ZERO's last other use.
; Clobbers: xmm0-xmm7, eax, ecx, edx, esi, edi.
%macro REDUCE0 0
    movdqa      xmm0, XMMWORD [VALUES + ( 0*2)]
    movdqa      xmm1, XMMWORD [VALUES + ( 8*2)]
    movdqa      xmm2, XMMWORD [VALUES + (16*2)]
    movdqa      xmm3, XMMWORD [VALUES + (24*2)]
    movdqa      xmm4, XMMWORD [VALUES + (32*2)]
    movdqa      xmm5, XMMWORD [VALUES + (40*2)]
    movdqa      xmm6, XMMWORD [VALUES + (48*2)]

    ; Per-word equality with zero -> 0xFFFF for zero values.
    pcmpeqw     xmm0, ZERO
    pcmpeqw     xmm1, ZERO
    pcmpeqw     xmm2, ZERO
    pcmpeqw     xmm3, ZERO
    pcmpeqw     xmm4, ZERO
    pcmpeqw     xmm5, ZERO
    pcmpeqw     xmm6, ZERO
    pcmpeqw     xmm7, XMMWORD [VALUES + (56*2)]

    ; Narrow word masks to byte masks so pmovmskb yields one bit per word.
    packsswb    xmm0, xmm1
    packsswb    xmm2, xmm3
    packsswb    xmm4, xmm5
    packsswb    xmm6, xmm7

    pmovmskb    eax, xmm0
    pmovmskb    ecx, xmm2
    pmovmskb    edx, xmm4
    pmovmskb    esi, xmm6

    shl         ecx, 16
    shl         esi, 16

    or          eax, ecx
    or          edx, esi

    ; Invert: the stored bitmap marks NONZERO coefficients.
    not         eax
    not         edx

    mov         edi, ZEROBITS

    mov         INT [edi], eax
    mov         INT [edi+SIZEOF_INT], edx
%endmacro

;
; Prepare data for jsimd_encode_mcu_AC_first().
;
; GLOBAL(void)
; jsimd_encode_mcu_AC_first_prepare_sse2(const JCOEF *block,
;                                        const int *jpeg_natural_order_start,
;                                        int Sl, int Al, JCOEF *values,
;                                        size_t *zerobits)
;
; eax + 8 = const JCOEF *block
; eax + 12 = const int *jpeg_natural_order_start
; eax + 16 = int Sl
; eax + 20 = int Al
; eax + 24 = JCOEF *values
; eax + 28 = size_t *zerobits

; Register allocation for jsimd_encode_mcu_AC_first_prepare_sse2().
; K and LENEND alias eax; they are used in disjoint phases.
%define ZERO    xmm7
%define X0      xmm0
%define X1      xmm1
%define N0      xmm2
%define N1      xmm3
%define AL      xmm4
%define K       eax
%define LENEND  eax
%define LUT     ebx
%define T0      ecx
%define T1      edx
%define BLOCK   esi
%define VALUES  edi
%define LEN     ebp

; Stack slot (relative to esp after the prologue pushes) caching the
; zerobits output pointer.
%define ZEROBITS  INT [esp + 5 * 4]

282    align       32
283    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2)
284
285EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
286    push        ebp
287    mov         eax, esp                     ; eax = original ebp
288    sub         esp, byte 4
289    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
290    mov         [esp], eax
291    mov         ebp, esp                     ; ebp = aligned ebp
292    sub         esp, 4
293    push        ebx
294    push        ecx
295;   push        edx                     ; need not be preserved
296    push        esi
297    push        edi
298    push        ebp
299
300    mov         BLOCK, INT [eax + 8]
301    mov         LUT, INT [eax + 12]
302    mov         VALUES, INT [eax + 24]
303    movd        AL, INT [eax + 20]
304    mov         T0, INT [eax + 28]
305    mov         ZEROBITS, T0
306    mov         LEN, INT [eax + 16]
307    pxor        ZERO, ZERO
308    mov         K, LEN
309    and         K, -16
310    shr         K, 4
311    jz          .ELOOP16
312.BLOOP16:
313    LOAD16
314    pcmpgtw     N0, X0
315    pcmpgtw     N1, X1
316    paddw       X0, N0
317    paddw       X1, N1
318    pxor        X0, N0
319    pxor        X1, N1
320    psrlw       X0, AL
321    psrlw       X1, AL
322    pxor        N0, X0
323    pxor        N1, X1
324    movdqa      XMMWORD [VALUES + (0) * 2], X0
325    movdqa      XMMWORD [VALUES + (8) * 2], X1
326    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
327    movdqa      XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
328    add         VALUES, 16*2
329    add         LUT, 16*SIZEOF_INT
330    dec         K
331    jnz         .BLOOP16
332    test        LEN, 15
333    je          .PADDING
334.ELOOP16:
335    mov         LENEND, LEN
336    and         LENEND, 7
337
338    test        LEN, 8
339    jz          .TRY7
340    test        LEN, 7
341    jz          .TRY8
342
343    LOAD15
344    pcmpgtw     N0, X0
345    pcmpgtw     N1, X1
346    paddw       X0, N0
347    paddw       X1, N1
348    pxor        X0, N0
349    pxor        X1, N1
350    psrlw       X0, AL
351    psrlw       X1, AL
352    pxor        N0, X0
353    pxor        N1, X1
354    movdqa      XMMWORD [VALUES + (0) * 2], X0
355    movdqa      XMMWORD [VALUES + (8) * 2], X1
356    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
357    movdqa      XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
358    add         VALUES, 16*2
359    jmp         .PADDING
360.TRY8:
361    LOAD8
362    pcmpgtw     N0, X0
363    paddw       X0, N0
364    pxor        X0, N0
365    psrlw       X0, AL
366    pxor        N0, X0
367    movdqa      XMMWORD [VALUES + (0) * 2], X0
368    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
369    add         VALUES, 8*2
370    jmp         .PADDING
371.TRY7:
372    LOAD7
373    pcmpgtw     N0, X0
374    paddw       X0, N0
375    pxor        X0, N0
376    psrlw       X0, AL
377    pxor        N0, X0
378    movdqa      XMMWORD [VALUES + (0) * 2], X0
379    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
380    add         VALUES, 8*2
381.PADDING:
382    mov         K, LEN
383    add         K, 7
384    and         K, -8
385    shr         K, 3
386    sub         K, DCTSIZE2/8
387    jz          .EPADDING
388    align       16
389.ZEROLOOP:
390    movdqa      XMMWORD [VALUES + 0], ZERO
391    add         VALUES, 8*2
392    inc         K
393    jnz         .ZEROLOOP
394.EPADDING:
395    sub         VALUES, DCTSIZE2*2
396
397    REDUCE0
398
399    pop         ebp
400    pop         edi
401    pop         esi
402;   pop         edx                     ; need not be preserved
403    pop         ecx
404    pop         ebx
405    mov         esp, ebp                ; esp <- aligned ebp
406    pop         esp                     ; esp <- original ebp
407    pop         ebp
408    ret
409
%undef ZERO
%undef X0
%undef X1
%undef N0
%undef N1
%undef AL
%undef K
%undef LUT
%undef T0
%undef T1
%undef BLOCK
%undef VALUES
%undef LEN

;
; Prepare data for jsimd_encode_mcu_AC_refine().
;
; GLOBAL(int)
; jsimd_encode_mcu_AC_refine_prepare_sse2(const JCOEF *block,
;                                         const int *jpeg_natural_order_start,
;                                         int Sl, int Al, JCOEF *absvalues,
;                                         size_t *bits)
;
; eax + 8 = const JCOEF *block
; eax + 12 = const int *jpeg_natural_order_start
; eax + 16 = int Sl
; eax + 20 = int Al
; eax + 24 = JCOEF *values
; eax + 28 = size_t *bits

; Register allocation for jsimd_encode_mcu_AC_refine_prepare_sse2().
; K and LENEND alias eax; they are used in disjoint phases.
%define ZERO    xmm7
%define ONE     xmm5
%define X0      xmm0
%define X1      xmm1
%define N0      xmm2
%define N1      xmm3
%define AL      xmm4
%define K       eax
%define LENEND  eax
%define LUT     ebx
%define T0      ecx
%define T0w      cx
%define T1      edx
%define BLOCK   esi
%define VALUES  edi
%define KK      ebp

; Stack slots (relative to esp after the prologue pushes).
%define ZEROBITS  INT [esp + 5 * 4]
%define EOB       INT [esp + 5 * 4 + 4]
%define LEN       INT [esp + 5 * 4 + 8]

461    align       32
462    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2)
463
464EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
465    push        ebp
466    mov         eax, esp                     ; eax = original ebp
467    sub         esp, byte 4
468    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
469    mov         [esp], eax
470    mov         ebp, esp                     ; ebp = aligned ebp
471    sub         esp, 16
472    push        ebx
473    push        ecx
474;   push        edx                     ; need not be preserved
475    push        esi
476    push        edi
477    push        ebp
478
479    pcmpeqw     ONE, ONE
480    psrlw       ONE, 15
481    mov         BLOCK, INT [eax + 8]
482    mov         LUT, INT [eax + 12]
483    mov         VALUES, INT [eax + 24]
484    movd        AL, INT [eax + 20]
485    mov         T0, INT [eax + 28]
486    mov         K,  INT [eax + 16]
487    mov         INT [T0 + 2 * SIZEOF_INT], -1
488    mov         INT [T0 + 3 * SIZEOF_INT], -1
489    mov         ZEROBITS, T0
490    mov         LEN, K
491    pxor        ZERO, ZERO
492    and         K, -16
493    mov         EOB, 0
494    xor         KK, KK
495    shr         K, 4
496    jz          .ELOOPR16
497.BLOOPR16:
498    LOAD16
499    pcmpgtw     N0, X0
500    pcmpgtw     N1, X1
501    paddw       X0, N0
502    paddw       X1, N1
503    pxor        X0, N0
504    pxor        X1, N1
505    psrlw       X0, AL
506    psrlw       X1, AL
507    movdqa      XMMWORD [VALUES + (0) * 2], X0
508    movdqa      XMMWORD [VALUES + (8) * 2], X1
509    pcmpeqw     X0, ONE
510    pcmpeqw     X1, ONE
511    packsswb    N0, N1
512    packsswb    X0, X1
513    pmovmskb    T0, N0                  ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
514    mov         T1, ZEROBITS
515    not         T0
516    mov         word [T1 + 2 * SIZEOF_INT + KK], T0w
517    pmovmskb    T1, X0                  ; idx = _mm_movemask_epi8(x1);
518    bsr         T1, T1                  ;  idx = 16 - (__builtin_clz(idx)>>1);
519    jz          .CONTINUER16            ; if (idx) {
520    lea         T1, [T1+KK*8]
521    mov         EOB, T1                 ; EOB = k + idx;
522.CONTINUER16:
523    add         VALUES, 16*2
524    add         LUT, 16*SIZEOF_INT
525    add         KK, 2
526    dec         K
527    jnz         .BLOOPR16
528.ELOOPR16:
529    mov         LENEND, LEN
530
531    test        LENEND, 8
532    jz          .TRYR7
533    test        LENEND, 7
534    jz          .TRYR8
535
536    and         LENEND, 7
537    LOAD15
538    pcmpgtw     N0, X0
539    pcmpgtw     N1, X1
540    paddw       X0, N0
541    paddw       X1, N1
542    pxor        X0, N0
543    pxor        X1, N1
544    psrlw       X0, AL
545    psrlw       X1, AL
546    movdqa      XMMWORD [VALUES + (0) * 2], X0
547    movdqa      XMMWORD [VALUES + (8) * 2], X1
548    pcmpeqw     X0, ONE
549    pcmpeqw     X1, ONE
550    packsswb    N0, N1
551    packsswb    X0, X1
552    pmovmskb    T0, N0                  ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
553    mov         T1, ZEROBITS
554    not         T0
555    mov         word [T1 + 2 * SIZEOF_INT + KK], T0w
556    pmovmskb    T1, X0                  ; idx = _mm_movemask_epi8(x1);
557    bsr         T1, T1                  ;  idx = 16 - (__builtin_clz(idx)>>1);
558    jz          .CONTINUER15            ; if (idx) {
559    lea         T1, [T1+KK*8]
560    mov         EOB, T1                 ; EOB = k + idx;
561.CONTINUER15:
562    add         VALUES, 16*2
563    jmp         .PADDINGR
564.TRYR8:
565    LOAD8
566
567    pcmpgtw     N0, X0
568    paddw       X0, N0
569    pxor        X0, N0
570    psrlw       X0, AL
571    movdqa      XMMWORD [VALUES + (0) * 2], X0
572    pcmpeqw     X0, ONE
573    packsswb    N0, ZERO
574    packsswb    X0, ZERO
575    pmovmskb    T0, N0                  ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
576    mov         T1, ZEROBITS
577    not         T0
578    mov         word [T1 + 2 * SIZEOF_INT + KK], T0w
579    pmovmskb    T1, X0                  ; idx = _mm_movemask_epi8(x1);
580    bsr         T1, T1                  ;  idx = 16 - (__builtin_clz(idx)>>1);
581    jz          .CONTINUER8             ; if (idx) {
582    lea         T1, [T1+KK*8]
583    mov         EOB, T1                 ; EOB = k + idx;
584.CONTINUER8:
585    add         VALUES, 8*2
586    jmp         .PADDINGR
587.TRYR7:
588    and         LENEND, 7
589    LOAD7
590
591    pcmpgtw     N0, X0
592    paddw       X0, N0
593    pxor        X0, N0
594    psrlw       X0, AL
595    movdqa      XMMWORD [VALUES + (0) * 2], X0
596    pcmpeqw     X0, ONE
597    packsswb    N0, ZERO
598    packsswb    X0, ZERO
599    pmovmskb    T0, N0                  ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
600    mov         T1, ZEROBITS
601    not         T0
602    mov         word [T1 + 2 * SIZEOF_INT + KK], T0w
603    pmovmskb    T1, X0                  ; idx = _mm_movemask_epi8(x1);
604    bsr         T1, T1                  ;  idx = 16 - (__builtin_clz(idx)>>1);
605    jz          .CONTINUER7             ; if (idx) {
606    lea         T1, [T1+KK*8]
607    mov         EOB, T1                 ; EOB = k + idx;
608.CONTINUER7:
609    add         VALUES, 8*2
610.PADDINGR:
611    mov         K, LEN
612    add         K, 7
613    and         K, -8
614    shr         K, 3
615    sub         K, DCTSIZE2/8
616    jz          .EPADDINGR
617    align       16
618.ZEROLOOPR:
619    movdqa      XMMWORD [VALUES + 0], ZERO
620    add         VALUES, 8*2
621    inc         K
622    jnz         .ZEROLOOPR
623.EPADDINGR:
624    sub         VALUES, DCTSIZE2*2
625
626    REDUCE0
627
628    mov         eax, EOB
629
630    pop         ebp
631    pop         edi
632    pop         esi
633;   pop         edx                     ; need not be preserved
634    pop         ecx
635    pop         ebx
636    mov         esp, ebp                ; esp <- aligned ebp
637    pop         esp                     ; esp <- original ebp
638    pop         ebp
639    ret
640
%undef ZERO
%undef ONE
%undef X0
%undef X1
%undef N0
%undef N1
%undef AL
%undef K
%undef KK
%undef EOB
%undef SIGN
%undef LUT
%undef T0
%undef T1
%undef BLOCK
%undef VALUES
%undef LEN
%undef LENEND

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32