;
; jcphuff-sse2.asm - prepare data for progressive Huffman encoding (SSE2)
;
; Copyright (C) 2016, 2018, Matthieu Darbois
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains an SSE2 implementation of data preparation for progressive
; Huffman encoding.  See jcphuff.c for more details.
%include "jsimdext.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32

; --------------------------------------------------------------------------
; Macros to load data for jsimd_encode_mcu_AC_first_prepare_sse2() and
; jsimd_encode_mcu_AC_refine_prepare_sse2()
; LOAD16 -- gather 16 coefficients through the natural-order LUT.
; Loads BLOCK[LUT[0..7]] into the 8 word lanes of X0 and BLOCK[LUT[8..15]]
; into the 8 word lanes of X1, and clears N0/N1 (later used as sign masks).
; Reads:    LUT (array of int indices), BLOCK (array of 16-bit coefficients)
; Writes:   X0, X1, N0, N1
; Clobbers: T0, T1
%macro LOAD16 0
    pxor        N0, N0
    pxor        N1, N1

    mov         T0, INT [LUT +  0*SIZEOF_INT]
    mov         T1, INT [LUT +  8*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 0
    pinsrw      X1, word [BLOCK + T1 * 2], 0

    mov         T0, INT [LUT +  1*SIZEOF_INT]
    mov         T1, INT [LUT +  9*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 1
    pinsrw      X1, word [BLOCK + T1 * 2], 1

    mov         T0, INT [LUT +  2*SIZEOF_INT]
    mov         T1, INT [LUT + 10*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 2
    pinsrw      X1, word [BLOCK + T1 * 2], 2

    mov         T0, INT [LUT +  3*SIZEOF_INT]
    mov         T1, INT [LUT + 11*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 3
    pinsrw      X1, word [BLOCK + T1 * 2], 3

    mov         T0, INT [LUT +  4*SIZEOF_INT]
    mov         T1, INT [LUT + 12*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 4
    pinsrw      X1, word [BLOCK + T1 * 2], 4

    mov         T0, INT [LUT +  5*SIZEOF_INT]
    mov         T1, INT [LUT + 13*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 5
    pinsrw      X1, word [BLOCK + T1 * 2], 5

    mov         T0, INT [LUT +  6*SIZEOF_INT]
    mov         T1, INT [LUT + 14*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 6
    pinsrw      X1, word [BLOCK + T1 * 2], 6

    mov         T0, INT [LUT +  7*SIZEOF_INT]
    mov         T1, INT [LUT + 15*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 7
    pinsrw      X1, word [BLOCK + T1 * 2], 7
%endmacro
; LOAD15 -- gather 8 + LENEND coefficients through the natural-order LUT.
; Loads BLOCK[LUT[0..7]] unconditionally into the 8 word lanes of X0, then
; BLOCK[LUT[8]] into lane 0 of X1 and up to 6 more guarded by LENEND
; (the remaining length, 1-7); unused X1 lanes stay zero.  Clears N0/N1.
; Reads:    LUT, BLOCK, LENEND
; Writes:   X0, X1, N0, N1
; Clobbers: T0, T1
%macro LOAD15 0
    pxor        N0, N0
    pxor        N1, N1
    pxor        X1, X1

    mov         T0, INT [LUT +  0*SIZEOF_INT]
    mov         T1, INT [LUT +  8*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 0
    pinsrw      X1, word [BLOCK + T1 * 2], 0

    mov         T0, INT [LUT +  1*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 1

    mov         T0, INT [LUT +  2*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 2

    mov         T0, INT [LUT +  3*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 3

    mov         T0, INT [LUT +  4*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 4

    mov         T0, INT [LUT +  5*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 5

    mov         T0, INT [LUT +  6*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 6

    mov         T0, INT [LUT +  7*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 7

    cmp         LENEND, 2
    jl          %%.ELOAD15
    mov         T1, INT [LUT +  9*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 1

    cmp         LENEND, 3
    jl          %%.ELOAD15
    mov         T1, INT [LUT + 10*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 2

    cmp         LENEND, 4
    jl          %%.ELOAD15
    mov         T1, INT [LUT + 11*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 3

    cmp         LENEND, 5
    jl          %%.ELOAD15
    mov         T1, INT [LUT + 12*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 4

    cmp         LENEND, 6
    jl          %%.ELOAD15
    mov         T1, INT [LUT + 13*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 5

    cmp         LENEND, 7
    jl          %%.ELOAD15
    mov         T1, INT [LUT + 14*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 6
%%.ELOAD15:
%endmacro
; LOAD8 -- gather 8 coefficients through the natural-order LUT.
; Loads BLOCK[LUT[0..7]] into the 8 word lanes of X0 and clears N0.
; Reads:    LUT, BLOCK
; Writes:   X0, N0
; Clobbers: T0
%macro LOAD8 0
    pxor        N0, N0

    mov         T0, INT [LUT +  0*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 0

    mov         T0, INT [LUT +  1*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 1

    mov         T0, INT [LUT +  2*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 2

    mov         T0, INT [LUT +  3*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 3

    mov         T0, INT [LUT +  4*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 4

    mov         T0, INT [LUT +  5*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 5

    mov         T0, INT [LUT +  6*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 6

    mov         T0, INT [LUT +  7*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 7
%endmacro
; LOAD7 -- gather the final 1-7 coefficients through the natural-order LUT.
; Loads BLOCK[LUT[0]] into lane 0 of X0 unconditionally, then up to 6 more
; lanes guarded by LENEND (the remaining length); unused lanes stay zero.
; Clears N0.
; Reads:    LUT, BLOCK, LENEND
; Writes:   X0, N0
; Clobbers: T1
%macro LOAD7 0
    pxor        N0, N0
    pxor        X0, X0

    mov         T1, INT [LUT +  0*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 0

    cmp         LENEND, 2
    jl          %%.ELOAD7
    mov         T1, INT [LUT +  1*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 1

    cmp         LENEND, 3
    jl          %%.ELOAD7
    mov         T1, INT [LUT +  2*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 2

    cmp         LENEND, 4
    jl          %%.ELOAD7
    mov         T1, INT [LUT +  3*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 3

    cmp         LENEND, 5
    jl          %%.ELOAD7
    mov         T1, INT [LUT +  4*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 4

    cmp         LENEND, 6
    jl          %%.ELOAD7
    mov         T1, INT [LUT +  5*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 5

    cmp         LENEND, 7
    jl          %%.ELOAD7
    mov         T1, INT [LUT +  6*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 6
%%.ELOAD7:
%endmacro
; REDUCE0 -- build the 64-bit "nonzero coefficient" bitmap.
; Compares all 64 words at VALUES against ZERO, packs the comparison masks
; to bytes, extracts them with pmovmskb, and stores the inverted result
; (bit set = coefficient nonzero) as two 32-bit ints at ZEROBITS.
; Note: xmm7 (= ZERO) is consumed by the final pcmpeqw with a memory
; operand, so ZERO is no longer all-zero afterwards.
; Reads:    VALUES, ZERO (xmm7), ZEROBITS
; Clobbers: eax, ecx, edx, esi, edi, xmm0-xmm7, flags
%macro REDUCE0 0
    movdqa      xmm0, XMMWORD [VALUES + ( 0*2)]
    movdqa      xmm1, XMMWORD [VALUES + ( 8*2)]
    movdqa      xmm2, XMMWORD [VALUES + (16*2)]
    movdqa      xmm3, XMMWORD [VALUES + (24*2)]
    movdqa      xmm4, XMMWORD [VALUES + (32*2)]
    movdqa      xmm5, XMMWORD [VALUES + (40*2)]
    movdqa      xmm6, XMMWORD [VALUES + (48*2)]

    pcmpeqw     xmm0, ZERO
    pcmpeqw     xmm1, ZERO
    pcmpeqw     xmm2, ZERO
    pcmpeqw     xmm3, ZERO
    pcmpeqw     xmm4, ZERO
    pcmpeqw     xmm5, ZERO
    pcmpeqw     xmm6, ZERO
    pcmpeqw     xmm7, XMMWORD [VALUES + (56*2)]

    packsswb    xmm0, xmm1
    packsswb    xmm2, xmm3
    packsswb    xmm4, xmm5
    packsswb    xmm6, xmm7

    pmovmskb    eax, xmm0
    pmovmskb    ecx, xmm2
    pmovmskb    edx, xmm4
    pmovmskb    esi, xmm6

    shl         ecx, 16
    shl         esi, 16

    or          eax, ecx
    or          edx, esi

    not         eax
    not         edx

    mov         edi, ZEROBITS

    mov         INT [edi], eax
    mov         INT [edi+SIZEOF_INT], edx
%endmacro
;
; Prepare data for jsimd_encode_mcu_AC_first().
;
; GLOBAL(void)
; jsimd_encode_mcu_AC_first_prepare_sse2(const JCOEF *block,
;                                        const int *jpeg_natural_order_start,
;                                        int Sl, int Al, JCOEF *values,
;                                        size_t *zerobits)
;
; eax + 8 = const JCOEF *block
; eax + 12 = const int *jpeg_natural_order_start
; eax + 16 = int Sl
; eax + 20 = int Al
; eax + 24 = JCOEF *values
; eax + 28 = size_t *zerobits

%define ZERO    xmm7
%define X0      xmm0
%define X1      xmm1
%define N0      xmm2
%define N1      xmm3
%define AL      xmm4
%define K       eax
%define LENEND  eax
%define LUT     ebx
%define T0      ecx
%define T1      edx
%define BLOCK   esi
%define VALUES  edi
%define LEN     ebp

%define ZEROBITS  INT [esp + 5 * 4]

    align       32
    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2)

EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
    push        ebp
    mov         eax, esp                     ; eax = original ebp
    sub         esp, byte 4
    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [esp], eax
    mov         ebp, esp                     ; ebp = aligned ebp
    sub         esp, 4
    push        ebx
    push        ecx
;   push        edx                     ; need not be preserved
    push        esi
    push        edi
    push        ebp

    mov         BLOCK, INT [eax + 8]
    mov         LUT, INT [eax + 12]
    mov         VALUES, INT [eax + 24]
    movd        AL, INT [eax + 20]
    mov         T0, INT [eax + 28]
    mov         ZEROBITS, T0
    mov         LEN, INT [eax + 16]
    pxor        ZERO, ZERO
    mov         K, LEN
    and         K, -16
    shr         K, 4                    ; K = number of full 16-coef batches
    jz          .ELOOP16
.BLOOP16:
    LOAD16
    pcmpgtw     N0, X0                  ; N0/N1 = 0xFFFF where coef < 0
    pcmpgtw     N1, X1
    paddw       X0, N0                  ; abs value: (x + mask) ^ mask
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL                  ; X0/X1 = abs(coef) >> Al
    psrlw       X1, AL
    pxor        N0, X0                  ; N0/N1 = X0/X1 XOR sign mask
    pxor        N1, X1
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    movdqa      XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
    add         VALUES, 16*2
    add         LUT, 16*SIZEOF_INT
    dec         K
    jnz         .BLOOP16
    test        LEN, 15
    je          .PADDING
.ELOOP16:
    mov         LENEND, LEN
    and         LENEND, 7               ; LENEND = LEN mod 8

    test        LEN, 8
    jz          .TRY7                   ; < 8 coefficients remain
    test        LEN, 7
    jz          .TRY8                   ; exactly 8 remain

    ; 9-15 coefficients remain
    LOAD15
    pcmpgtw     N0, X0
    pcmpgtw     N1, X1
    paddw       X0, N0
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL
    psrlw       X1, AL
    pxor        N0, X0
    pxor        N1, X1
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    movdqa      XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
    add         VALUES, 16*2
    jmp         .PADDING
.TRY8:
    LOAD8
    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    pxor        N0, X0
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    add         VALUES, 8*2
    jmp         .PADDING
.TRY7:
    LOAD7
    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    pxor        N0, X0
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    add         VALUES, 8*2
.PADDING:
    ; Zero-fill VALUES up to DCTSIZE2 words (K counts 8-word groups written)
    mov         K, LEN
    add         K, 7
    and         K, -8
    shr         K, 3
    sub         K, DCTSIZE2/8
    jz          .EPADDING
    align       16
.ZEROLOOP:
    movdqa      XMMWORD [VALUES + 0], ZERO
    add         VALUES, 8*2
    inc         K
    jnz         .ZEROLOOP
.EPADDING:
    sub         VALUES, DCTSIZE2*2

    REDUCE0

    pop         ebp
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
    pop         ecx
    pop         ebx
    mov         esp, ebp                ; esp <- aligned ebp
    pop         esp                     ; esp <- original ebp
    pop         ebp
    ret

%undef ZERO
%undef X0
%undef X1
%undef N0
%undef N1
%undef AL
%undef K
%undef LUT
%undef T0
%undef T1
%undef BLOCK
%undef VALUES
%undef LEN
;
; Prepare data for jsimd_encode_mcu_AC_refine().
;
; GLOBAL(int)
; jsimd_encode_mcu_AC_refine_prepare_sse2(const JCOEF *block,
;                                         const int *jpeg_natural_order_start,
;                                         int Sl, int Al, JCOEF *absvalues,
;                                         size_t *bits)
;
; eax + 8 = const JCOEF *block
; eax + 12 = const int *jpeg_natural_order_start
; eax + 16 = int Sl
; eax + 20 = int Al
; eax + 24 = JCOEF *values
; eax + 28 = size_t *bits

%define ZERO    xmm7
%define ONE     xmm5
%define X0      xmm0
%define X1      xmm1
%define N0      xmm2
%define N1      xmm3
%define AL      xmm4
%define K       eax
%define LENEND  eax
%define LUT     ebx
%define T0      ecx
%define T0w      cx
%define T1      edx
%define BLOCK   esi
%define VALUES  edi
%define KK      ebp

%define ZEROBITS  INT [esp + 5 * 4]
%define EOB       INT [esp + 5 * 4 + 4]
%define LEN       INT [esp + 5 * 4 + 8]

    align       32
    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2)

EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
    push        ebp
    mov         eax, esp                     ; eax = original ebp
    sub         esp, byte 4
    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [esp], eax
    mov         ebp, esp                     ; ebp = aligned ebp
    sub         esp, 16
    push        ebx
    push        ecx
;   push        edx                     ; need not be preserved
    push        esi
    push        edi
    push        ebp

    pcmpeqw     ONE, ONE
    psrlw       ONE, 15                 ; ONE = 0x0001 in every word lane
    mov         BLOCK, INT [eax + 8]
    mov         LUT, INT [eax + 12]
    mov         VALUES, INT [eax + 24]
    movd        AL, INT [eax + 20]
    mov         T0, INT [eax + 28]
    mov         K,  INT [eax + 16]
    mov         INT [T0 + 2 * SIZEOF_INT], -1  ; initialize sign-bit words
    mov         INT [T0 + 3 * SIZEOF_INT], -1
    mov         ZEROBITS, T0
    mov         LEN, K
    pxor        ZERO, ZERO
    and         K, -16
    mov         EOB, 0
    xor         KK, KK                  ; KK = 2 bytes per 16-coef batch
    shr         K, 4                    ; K = number of full 16-coef batches
    jz          .ELOOPR16
.BLOOPR16:
    LOAD16
    pcmpgtw     N0, X0                  ; N0/N1 = 0xFFFF where coef < 0
    pcmpgtw     N1, X1
    paddw       X0, N0                  ; abs value: (x + mask) ^ mask
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL                  ; X0/X1 = abs(coef) >> Al
    psrlw       X1, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    pcmpeqw     X0, ONE                 ; mark newly-nonzero (== 1) coefs
    pcmpeqw     X1, ONE
    packsswb    N0, N1
    packsswb    X0, X1
    pmovmskb    T0, N0                  ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    mov         T1, ZEROBITS
    not         T0
    mov         word [T1 + 2 * SIZEOF_INT + KK], T0w
    pmovmskb    T1, X0                  ; idx = _mm_movemask_epi8(x1);
    bsr         T1, T1                  ;  idx = 16 - (__builtin_clz(idx)>>1);
    jz          .CONTINUER16            ; if (idx) {
    lea         T1, [T1+KK*8]
    mov         EOB, T1                 ; EOB = k + idx;
.CONTINUER16:
    add         VALUES, 16*2
    add         LUT, 16*SIZEOF_INT
    add         KK, 2
    dec         K
    jnz         .BLOOPR16
    test        LEN, 15
    je          .PADDINGR
.ELOOPR16:
    mov         LENEND, LEN

    test        LENEND, 8
    jz          .TRYR7                  ; < 8 coefficients remain
    test        LENEND, 7
    jz          .TRYR8                  ; exactly 8 remain

    ; 9-15 coefficients remain
    and         LENEND, 7
    LOAD15
    pcmpgtw     N0, X0
    pcmpgtw     N1, X1
    paddw       X0, N0
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL
    psrlw       X1, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    pcmpeqw     X0, ONE
    pcmpeqw     X1, ONE
    packsswb    N0, N1
    packsswb    X0, X1
    pmovmskb    T0, N0                  ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    mov         T1, ZEROBITS
    not         T0
    mov         word [T1 + 2 * SIZEOF_INT + KK], T0w
    pmovmskb    T1, X0                  ; idx = _mm_movemask_epi8(x1);
    bsr         T1, T1                  ;  idx = 16 - (__builtin_clz(idx)>>1);
    jz          .CONTINUER15            ; if (idx) {
    lea         T1, [T1+KK*8]
    mov         EOB, T1                 ; EOB = k + idx;
.CONTINUER15:
    add         VALUES, 16*2
    jmp         .PADDINGR
.TRYR8:
    LOAD8

    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    pcmpeqw     X0, ONE
    packsswb    N0, ZERO
    packsswb    X0, ZERO
    pmovmskb    T0, N0                  ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    mov         T1, ZEROBITS
    not         T0
    mov         word [T1 + 2 * SIZEOF_INT + KK], T0w
    pmovmskb    T1, X0                  ; idx = _mm_movemask_epi8(x1);
    bsr         T1, T1                  ;  idx = 16 - (__builtin_clz(idx)>>1);
    jz          .CONTINUER8             ; if (idx) {
    lea         T1, [T1+KK*8]
    mov         EOB, T1                 ; EOB = k + idx;
.CONTINUER8:
    add         VALUES, 8*2
    jmp         .PADDINGR
.TRYR7:
    and         LENEND, 7
    LOAD7

    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    pcmpeqw     X0, ONE
    packsswb    N0, ZERO
    packsswb    X0, ZERO
    pmovmskb    T0, N0                  ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    mov         T1, ZEROBITS
    not         T0
    mov         word [T1 + 2 * SIZEOF_INT + KK], T0w
    pmovmskb    T1, X0                  ; idx = _mm_movemask_epi8(x1);
    bsr         T1, T1                  ;  idx = 16 - (__builtin_clz(idx)>>1);
    jz          .CONTINUER7             ; if (idx) {
    lea         T1, [T1+KK*8]
    mov         EOB, T1                 ; EOB = k + idx;
.CONTINUER7:
    add         VALUES, 8*2
.PADDINGR:
    ; Zero-fill VALUES up to DCTSIZE2 words (K counts 8-word groups written)
    mov         K, LEN
    add         K, 7
    and         K, -8
    shr         K, 3
    sub         K, DCTSIZE2/8
    jz          .EPADDINGR
    align       16
.ZEROLOOPR:
    movdqa      XMMWORD [VALUES + 0], ZERO
    add         VALUES, 8*2
    inc         K
    jnz         .ZEROLOOPR
.EPADDINGR:
    sub         VALUES, DCTSIZE2*2

    REDUCE0

    mov         eax, EOB                ; return the EOB position

    pop         ebp
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
    pop         ecx
    pop         ebx
    mov         esp, ebp                ; esp <- aligned ebp
    pop         esp                     ; esp <- original ebp
    pop         ebp
    ret

%undef ZERO
%undef ONE
%undef X0
%undef X1
%undef N0
%undef N1
%undef AL
%undef K
%undef KK
%undef EOB
%undef SIGN
%undef LUT
%undef T0
%undef T1
%undef BLOCK
%undef VALUES
%undef LEN
%undef LENEND
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32