;
; jcphuff-sse2.asm - prepare data for progressive Huffman encoding
; (64-bit SSE2)
;
; Copyright (C) 2016, 2018, Matthieu Darbois
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) and
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains an SSE2 implementation of data preparation for progressive
; Huffman encoding.  See jcphuff.c for more details.
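;
; Two routines are provided here; their reference C implementations live in
; jcphuff.c.  jsimd_encode_mcu_AC_first_prepare_sse2() prepares the
; point-transformed coefficient magnitudes (and their complements for
; negative coefficients) for the first scan of a spectral band, and
; jsimd_encode_mcu_AC_refine_prepare_sse2() prepares the magnitudes, sign
; bits, and EOB position for the refinement scans.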

%include "jsimdext.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        64

; --------------------------------------------------------------------------
; Macros to load data for jsimd_encode_mcu_AC_first_prepare_sse2() and
; jsimd_encode_mcu_AC_refine_prepare_sse2()

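; Note: each LOADn macro below gathers the next group of coefficients from
; BLOCK into the word lanes of X0 (and X1 for the 16/15-element variants),
; indexing through the natural-order table at LUT with pinsrw.  LOAD16 and
; LOAD8 load full groups; LOAD15 and LOAD7 load the remaining 8 + LENEND or
; LENEND coefficients (LENEND = Sl mod 8) and leave the unused lanes zero.
; Conceptually (informal C sketch; the loop variable is illustrative):
;
;   for (i = 0; i < n; i++)
;     x[i] = block[jpeg_natural_order_start[i]];
;
; N0 (and N1) are cleared here so that the callers' pcmpgtw produces the sign
; masks of the freshly loaded values.
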
%macro LOAD16 0
    pxor        N0, N0
    pxor        N1, N1

    mov         T0d, INT [LUT +  0*SIZEOF_INT]
    mov         T1d, INT [LUT +  8*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 0
    pinsrw      X1, word [BLOCK + T1 * 2], 0

    mov         T0d, INT [LUT +  1*SIZEOF_INT]
    mov         T1d, INT [LUT +  9*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 1
    pinsrw      X1, word [BLOCK + T1 * 2], 1

    mov         T0d, INT [LUT +  2*SIZEOF_INT]
    mov         T1d, INT [LUT + 10*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 2
    pinsrw      X1, word [BLOCK + T1 * 2], 2

    mov         T0d, INT [LUT +  3*SIZEOF_INT]
    mov         T1d, INT [LUT + 11*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 3
    pinsrw      X1, word [BLOCK + T1 * 2], 3

    mov         T0d, INT [LUT +  4*SIZEOF_INT]
    mov         T1d, INT [LUT + 12*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 4
    pinsrw      X1, word [BLOCK + T1 * 2], 4

    mov         T0d, INT [LUT +  5*SIZEOF_INT]
    mov         T1d, INT [LUT + 13*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 5
    pinsrw      X1, word [BLOCK + T1 * 2], 5

    mov         T0d, INT [LUT +  6*SIZEOF_INT]
    mov         T1d, INT [LUT + 14*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 6
    pinsrw      X1, word [BLOCK + T1 * 2], 6

    mov         T0d, INT [LUT +  7*SIZEOF_INT]
    mov         T1d, INT [LUT + 15*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 7
    pinsrw      X1, word [BLOCK + T1 * 2], 7
%endmacro

%macro LOAD15 0
    pxor        N0, N0
    pxor        N1, N1
    pxor        X1, X1

    mov         T0d, INT [LUT +  0*SIZEOF_INT]
    mov         T1d, INT [LUT +  8*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 0
    pinsrw      X1, word [BLOCK + T1 * 2], 0

    mov         T0d, INT [LUT +  1*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 1

    mov         T0d, INT [LUT +  2*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 2

    mov         T0d, INT [LUT +  3*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 3

    mov         T0d, INT [LUT +  4*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 4

    mov         T0d, INT [LUT +  5*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 5

    mov         T0d, INT [LUT +  6*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 6

    mov         T0d, INT [LUT +  7*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 7

    cmp         LENEND, 2
    jl          %%.ELOAD15
    mov         T1d, INT [LUT +  9*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 1

    cmp         LENEND, 3
    jl          %%.ELOAD15
    mov         T1d, INT [LUT + 10*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 2

    cmp         LENEND, 4
    jl          %%.ELOAD15
    mov         T1d, INT [LUT + 11*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 3

    cmp         LENEND, 5
    jl          %%.ELOAD15
    mov         T1d, INT [LUT + 12*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 4

    cmp         LENEND, 6
    jl          %%.ELOAD15
    mov         T1d, INT [LUT + 13*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 5

    cmp         LENEND, 7
    jl          %%.ELOAD15
    mov         T1d, INT [LUT + 14*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 6
%%.ELOAD15:
%endmacro

%macro LOAD8 0
    pxor        N0, N0

    mov         T0d, INT [LUT +  0*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 0

    mov         T0d, INT [LUT +  1*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 1

    mov         T0d, INT [LUT +  2*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 2

    mov         T0d, INT [LUT +  3*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 3

    mov         T0d, INT [LUT +  4*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 4

    mov         T0d, INT [LUT +  5*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 5

    mov         T0d, INT [LUT +  6*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 6

    mov         T0d, INT [LUT +  7*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 7
%endmacro

%macro LOAD7 0
    pxor        N0, N0
    pxor        X0, X0

    mov         T1d, INT [LUT +  0*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 0

    cmp         LENEND, 2
    jl          %%.ELOAD7
    mov         T1d, INT [LUT +  1*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 1

    cmp         LENEND, 3
    jl          %%.ELOAD7
    mov         T1d, INT [LUT +  2*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 2

    cmp         LENEND, 4
    jl          %%.ELOAD7
    mov         T1d, INT [LUT +  3*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 3

    cmp         LENEND, 5
    jl          %%.ELOAD7
    mov         T1d, INT [LUT +  4*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 4

    cmp         LENEND, 6
    jl          %%.ELOAD7
    mov         T1d, INT [LUT +  5*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 5

    cmp         LENEND, 7
    jl          %%.ELOAD7
    mov         T1d, INT [LUT +  6*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 6
%%.ELOAD7:
%endmacro

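; The REDUCE0 macro computes the "zerobits" bitmap from the 64 prepared
; coefficient values: each word is compared against zero, the comparison masks
; are packed to bytes and gathered with pmovmskb, and the combined 64-bit mask
; is complemented so that bit k is set iff VALUES[k] is nonzero.  The result
; is stored through r15.
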
%macro REDUCE0 0
    movdqa      xmm0, XMMWORD [VALUES + ( 0*2)]
    movdqa      xmm1, XMMWORD [VALUES + ( 8*2)]
    movdqa      xmm2, XMMWORD [VALUES + (16*2)]
    movdqa      xmm3, XMMWORD [VALUES + (24*2)]
    movdqa      xmm4, XMMWORD [VALUES + (32*2)]
    movdqa      xmm5, XMMWORD [VALUES + (40*2)]
    movdqa      xmm6, XMMWORD [VALUES + (48*2)]
    movdqa      xmm7, XMMWORD [VALUES + (56*2)]

    pcmpeqw     xmm0, ZERO
    pcmpeqw     xmm1, ZERO
    pcmpeqw     xmm2, ZERO
    pcmpeqw     xmm3, ZERO
    pcmpeqw     xmm4, ZERO
    pcmpeqw     xmm5, ZERO
    pcmpeqw     xmm6, ZERO
    pcmpeqw     xmm7, ZERO

    packsswb    xmm0, xmm1
    packsswb    xmm2, xmm3
    packsswb    xmm4, xmm5
    packsswb    xmm6, xmm7

    pmovmskb    eax, xmm0
    pmovmskb    ecx, xmm2
    pmovmskb    edx, xmm4
    pmovmskb    esi, xmm6

    shl         rcx, 16
    shl         rdx, 32
    shl         rsi, 48

    or          rax, rcx
    or          rdx, rsi
    or          rax, rdx

    not         rax

    mov         MMWORD [r15], rax
%endmacro

;
; Prepare data for jsimd_encode_mcu_AC_first().
;
; GLOBAL(void)
; jsimd_encode_mcu_AC_first_prepare_sse2(const JCOEF *block,
;                                        const int *jpeg_natural_order_start,
;                                        int Sl, int Al, JCOEF *values,
;                                        size_t *zerobits)
;
; r10 = const JCOEF *block
; r11 = const int *jpeg_natural_order_start
; r12 = int Sl
; r13 = int Al
; r14 = JCOEF *values
; r15 = size_t *zerobits

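; A rough C equivalent of the computation below (informal sketch only; the
; reference implementation is encode_mcu_AC_first_prepare() in jcphuff.c;
; a 32-bit int is assumed):
;
;   size_t zb = 0;
;   for (k = 0; k < Sl; k++) {
;     int t = block[jpeg_natural_order_start[k]];
;     int sign = t >> 31;               /* -1 if t < 0, else 0 */
;     t = (t + sign) ^ sign;            /* t = abs(t) */
;     t >>= Al;                         /* point transform */
;     values[k] = t;
;     values[k + DCTSIZE2] = t ^ sign;  /* ~t for negative coefs, t otherwise */
;     if (t != 0) zb |= (size_t)1 << k;
;   }
;   /* values[Sl..DCTSIZE2-1] are zero-filled */
;   *zerobits = zb;
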
%define ZERO    xmm9
%define X0      xmm0
%define X1      xmm1
%define N0      xmm2
%define N1      xmm3
%define AL      xmm4
%define K       eax
%define LUT     r11
%define T0      rcx
%define T0d     ecx
%define T1      rdx
%define T1d     edx
%define BLOCK   r10
%define VALUES  r14
%define LEN     r12d
%define LENEND  r13d

    align       32
    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2)

EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
    push        rbp
    mov         rax, rsp                     ; rax = original rbp
    sub         rsp, byte 4
    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [rsp], rax
    mov         rbp, rsp                     ; rbp = aligned rbp
    lea         rsp, [rbp - 16]
    collect_args 6

    movdqa      XMMWORD [rbp - 16], ZERO

    movd        AL, r13d
    pxor        ZERO, ZERO
    mov         K, LEN
    mov         LENEND, LEN
    and         K, -16
    and         LENEND, 7
    shr         K, 4
    jz          .ELOOP16
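; Main loop: each iteration converts 16 coefficients to point-transformed
; magnitudes.  pcmpgtw against zero yields the sign masks, paddw/pxor turn the
; values into absolute values, psrlw applies the Al point transform, and the
; final pxor folds the sign masks back in so that N0/N1 hold ~|coef| for
; negative coefficients (and |coef| otherwise), as encode_mcu_AC_first()
; expects.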
.BLOOP16:
    LOAD16
    pcmpgtw     N0, X0
    pcmpgtw     N1, X1
    paddw       X0, N0
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL
    psrlw       X1, AL
    pxor        N0, X0
    pxor        N1, X1
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    movdqa      XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
    add         VALUES, 16*2
    add         LUT, 16*SIZEOF_INT
    dec         K
    jnz         .BLOOP16
    test        LEN, 15
    je          .PADDING
.ELOOP16:
    test        LEN, 8
    jz          .TRY7
    test        LEN, 7
    jz          .TRY8

    LOAD15
    pcmpgtw     N0, X0
    pcmpgtw     N1, X1
    paddw       X0, N0
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL
    psrlw       X1, AL
    pxor        N0, X0
    pxor        N1, X1
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    movdqa      XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
    add         VALUES, 16*2
    jmp         .PADDING
.TRY8:
    LOAD8
    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    pxor        N0, X0
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    add         VALUES, 8*2
    jmp         .PADDING
.TRY7:
    LOAD7
    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    pxor        N0, X0
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    add         VALUES, 8*2
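; Zero-fill the rest of the values array (up to DCTSIZE2 entries) so that
; REDUCE0 can scan a full 64-element block regardless of Sl.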
.PADDING:
    mov         K, LEN
    add         K, 7
    and         K, -8
    shr         K, 3
    sub         K, DCTSIZE2/8
    jz          .EPADDING
    align       16
.ZEROLOOP:
    movdqa      XMMWORD [VALUES + 0], ZERO
    add         VALUES, 8*2
    inc         K
    jnz         .ZEROLOOP
.EPADDING:
    sub         VALUES, DCTSIZE2*2

    REDUCE0

    movdqa      ZERO, XMMWORD [rbp - 16]
    uncollect_args 6
    mov         rsp, rbp                ; rsp <- aligned rbp
    pop         rsp                     ; rsp <- original rbp
    pop         rbp
    ret

%undef ZERO
%undef X0
%undef X1
%undef N0
%undef N1
%undef AL
%undef K
%undef LUT
%undef T0
%undef T0d
%undef T1
%undef T1d
%undef BLOCK
%undef VALUES
%undef LEN
%undef LENEND

;
; Prepare data for jsimd_encode_mcu_AC_refine().
;
; GLOBAL(int)
; jsimd_encode_mcu_AC_refine_prepare_sse2(const JCOEF *block,
;                                         const int *jpeg_natural_order_start,
;                                         int Sl, int Al, JCOEF *absvalues,
;                                         size_t *bits)
;
; r10 = const JCOEF *block
; r11 = const int *jpeg_natural_order_start
; r12 = int Sl
; r13 = int Al
; r14 = JCOEF *absvalues
; r15 = size_t *bits

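; A rough C equivalent of the computation below (informal sketch only; the
; reference implementation is encode_mcu_AC_refine_prepare() in jcphuff.c;
; a 32-bit int is assumed):
;
;   size_t zb = 0, signbits = 0;
;   int EOB = 0;
;   for (k = 0; k < Sl; k++) {
;     int t = block[jpeg_natural_order_start[k]];
;     int sign = t >> 31;             /* -1 if t < 0, else 0 */
;     t = (t + sign) ^ sign;          /* t = abs(t) */
;     t >>= Al;                       /* point transform */
;     absvalues[k] = t;
;     if (t != 0) zb |= (size_t)1 << k;
;     if (sign)   signbits |= (size_t)1 << k;
;     if (t == 1) EOB = k;            /* index of last newly-nonzero coef */
;   }
;   /* absvalues[Sl..DCTSIZE2-1] are zero-filled */
;   bits[0] = zb;                     /* bit k set iff absvalues[k] != 0 */
;   bits[1] = ~signbits;              /* bit k clear iff coefficient k < 0 */
;   return EOB;
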
%define ZERO    xmm9
%define ONE     xmm5
%define X0      xmm0
%define X1      xmm1
%define N0      xmm2
%define N1      xmm3
%define AL      xmm4
%define K       eax
%define KK      r9d
%define EOB     r8d
%define SIGN    rdi
%define LUT     r11
%define T0      rcx
%define T0d     ecx
%define T1      rdx
%define T1d     edx
%define BLOCK   r10
%define VALUES  r14
%define LEN     r12d
%define LENEND  r13d

    align       32
    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2)

EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
    push        rbp
    mov         rax, rsp                     ; rax = original rbp
    sub         rsp, byte 4
    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [rsp], rax
    mov         rbp, rsp                     ; rbp = aligned rbp
    lea         rsp, [rbp - 16]
    collect_args 6

    movdqa      XMMWORD [rbp - 16], ZERO

    xor         SIGN, SIGN
    xor         EOB, EOB
    xor         KK, KK
    movd        AL, r13d
    pxor        ZERO, ZERO
    pcmpeqw     ONE, ONE
    psrlw       ONE, 15
    mov         K, LEN
    mov         LENEND, LEN
    and         K, -16
    and         LENEND, 7
    shr         K, 4
    jz          .ELOOPR16
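; Main loop: each iteration stores 16 point-transformed magnitudes to VALUES,
; accumulates the corresponding sign bits into SIGN (16 bits per group, with
; the oldest group ending up in the low bits), and uses the comparison against
; ONE to track EOB, the position of the last coefficient whose magnitude after
; the point transform is 1.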
.BLOOPR16:
    LOAD16
    pcmpgtw     N0, X0
    pcmpgtw     N1, X1
    paddw       X0, N0
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL
    psrlw       X1, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    pcmpeqw     X0, ONE
    pcmpeqw     X1, ONE
    packsswb    N0, N1
    packsswb    X0, X1
    pmovmskb    T0d, N0                 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    pmovmskb    T1d, X0                 ; idx = _mm_movemask_epi8(x1);
    shr         SIGN, 16                ; make room for sizebits
    shl         T0, 48
    or          SIGN, T0
    bsr         T1d, T1d                ;  idx = 16 - (__builtin_clz(idx)>>1);
    jz          .CONTINUER16            ; if (idx) {
    mov         EOB, KK
    add         EOB, T1d                ; EOB = k + idx;
.CONTINUER16:
    add         VALUES, 16*2
    add         LUT, 16*SIZEOF_INT
    add         KK, 16
    dec         K
    jnz         .BLOOPR16
    test        LEN, 15
    je          .PADDINGR
.ELOOPR16:
    test        LEN, 8
    jz          .TRYR7
    test        LEN, 7
    jz          .TRYR8

    LOAD15
    pcmpgtw     N0, X0
    pcmpgtw     N1, X1
    paddw       X0, N0
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL
    psrlw       X1, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    pcmpeqw     X0, ONE
    pcmpeqw     X1, ONE
    packsswb    N0, N1
    packsswb    X0, X1
    pmovmskb    T0d, N0                 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    pmovmskb    T1d, X0                 ; idx = _mm_movemask_epi8(x1);
    shr         SIGN, 16                ; make room for sizebits
    shl         T0, 48
    or          SIGN, T0
    bsr         T1d, T1d                ;  idx = 16 - (__builtin_clz(idx)>>1);
    jz          .CONTINUER15            ; if (idx) {
    mov         EOB, KK
    add         EOB, T1d                ; EOB = k + idx;
.CONTINUER15:
    add         VALUES, 16*2
    jmp         .PADDINGR
.TRYR8:
    LOAD8

    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    pcmpeqw     X0, ONE
    packsswb    N0, ZERO
    packsswb    X0, ZERO
    pmovmskb    T0d, N0                 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    pmovmskb    T1d, X0                 ; idx = _mm_movemask_epi8(x1);
    shr         SIGN, 8                 ; make room for sizebits
    shl         T0, 56
    or          SIGN, T0
    bsr         T1d, T1d                ;  idx = 16 - (__builtin_clz(idx)>>1);
    jz          .CONTINUER8             ; if (idx) {
    mov         EOB, KK
    add         EOB, T1d                ; EOB = k + idx;
.CONTINUER8:
    add         VALUES, 8*2
    jmp         .PADDINGR
.TRYR7:
    LOAD7

    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    pcmpeqw     X0, ONE
    packsswb    N0, ZERO
    packsswb    X0, ZERO
    pmovmskb    T0d, N0                 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    pmovmskb    T1d, X0                 ; idx = _mm_movemask_epi8(x1);
    shr         SIGN, 8                 ; make room for sizebits
    shl         T0, 56
    or          SIGN, T0
    bsr         T1d, T1d                ; idx = 16 - (__builtin_clz(idx)>>1);
    jz          .CONTINUER7             ; if (idx) {
    mov         EOB, KK
    add         EOB, T1d                ; EOB = k + idx;
.CONTINUER7:
    add         VALUES, 8*2
.PADDINGR:
    mov         K, LEN
    add         K, 7
    and         K, -8
    shr         K, 3
    sub         K, DCTSIZE2/8
    jz          .EPADDINGR
    align       16
.ZEROLOOPR:
    movdqa      XMMWORD [VALUES + 0], ZERO
    shr         SIGN, 8
    add         VALUES, 8*2
    inc         K
    jnz         .ZEROLOOPR
.EPADDINGR:
    not         SIGN
    sub         VALUES, DCTSIZE2*2
    mov         MMWORD [r15+SIZEOF_MMWORD], SIGN

    REDUCE0

    mov         eax, EOB
    movdqa      ZERO, XMMWORD [rbp - 16]
    uncollect_args 6
    mov         rsp, rbp                ; rsp <- aligned rbp
    pop         rsp                     ; rsp <- original rbp
    pop         rbp
    ret

%undef ZERO
%undef ONE
%undef X0
%undef X1
%undef N0
%undef N1
%undef AL
%undef K
%undef KK
%undef EOB
%undef SIGN
%undef LUT
%undef T0
%undef T0d
%undef T1
%undef T1d
%undef BLOCK
%undef VALUES
%undef LEN
%undef LENEND

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32