;
; jcphuff-sse2.asm - prepare data for progressive Huffman encoding
; (64-bit SSE2)
;
; Copyright (C) 2016, 2018, Matthieu Darbois
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains an SSE2 implementation of data preparation for progressive
; Huffman encoding.  See jcphuff.c for more details.
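;
; In short: jsimd_encode_mcu_AC_first_prepare_sse2() fills the values[]
; buffer and writes a 64-bit non-zero-coefficient bitmap to *zerobits;
; jsimd_encode_mcu_AC_refine_prepare_sse2() fills the absvalues[] buffer,
; writes the non-zero-coefficient and sign bitmaps to bits[], and returns
; the EOB position.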

%include "jsimdext.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        64

; --------------------------------------------------------------------------
; Macros to load data for jsimd_encode_mcu_AC_first_prepare_sse2() and
; jsimd_encode_mcu_AC_refine_prepare_sse2()
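;
; Each LOAD* macro gathers one group of coefficients into X0/X1: every LUT
; entry holds a natural-order index, which is loaded into T0/T1 and used to
; pinsrw the corresponding 16-bit coefficient from BLOCK into the next word
; slot.  LOAD16 and LOAD8 load full groups; LOAD15 and LOAD7 compare LENEND
; before each tail load so that only the remaining coefficients are read.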

%macro LOAD16 0
    pxor        N0, N0
    pxor        N1, N1

    mov         T0d, INT [LUT +  0*SIZEOF_INT]
    mov         T1d, INT [LUT +  8*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 0
    pinsrw      X1, word [BLOCK + T1 * 2], 0

    mov         T0d, INT [LUT +  1*SIZEOF_INT]
    mov         T1d, INT [LUT +  9*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 1
    pinsrw      X1, word [BLOCK + T1 * 2], 1

    mov         T0d, INT [LUT +  2*SIZEOF_INT]
    mov         T1d, INT [LUT + 10*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 2
    pinsrw      X1, word [BLOCK + T1 * 2], 2

    mov         T0d, INT [LUT +  3*SIZEOF_INT]
    mov         T1d, INT [LUT + 11*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 3
    pinsrw      X1, word [BLOCK + T1 * 2], 3

    mov         T0d, INT [LUT +  4*SIZEOF_INT]
    mov         T1d, INT [LUT + 12*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 4
    pinsrw      X1, word [BLOCK + T1 * 2], 4

    mov         T0d, INT [LUT +  5*SIZEOF_INT]
    mov         T1d, INT [LUT + 13*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 5
    pinsrw      X1, word [BLOCK + T1 * 2], 5

    mov         T0d, INT [LUT +  6*SIZEOF_INT]
    mov         T1d, INT [LUT + 14*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 6
    pinsrw      X1, word [BLOCK + T1 * 2], 6

    mov         T0d, INT [LUT +  7*SIZEOF_INT]
    mov         T1d, INT [LUT + 15*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 7
    pinsrw      X1, word [BLOCK + T1 * 2], 7
%endmacro

%macro LOAD15 0
    pxor        N0, N0
    pxor        N1, N1
    pxor        X1, X1

    mov         T0d, INT [LUT +  0*SIZEOF_INT]
    mov         T1d, INT [LUT +  8*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 0
    pinsrw      X1, word [BLOCK + T1 * 2], 0

    mov         T0d, INT [LUT +  1*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 1

    mov         T0d, INT [LUT +  2*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 2

    mov         T0d, INT [LUT +  3*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 3

    mov         T0d, INT [LUT +  4*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 4

    mov         T0d, INT [LUT +  5*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 5

    mov         T0d, INT [LUT +  6*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 6

    mov         T0d, INT [LUT +  7*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 7

    cmp         LENEND, 2
    jl          %%.ELOAD15
    mov         T1d, INT [LUT +  9*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 1

    cmp         LENEND, 3
    jl          %%.ELOAD15
    mov         T1d, INT [LUT + 10*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 2

    cmp         LENEND, 4
    jl          %%.ELOAD15
    mov         T1d, INT [LUT + 11*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 3

    cmp         LENEND, 5
    jl          %%.ELOAD15
    mov         T1d, INT [LUT + 12*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 4

    cmp         LENEND, 6
    jl          %%.ELOAD15
    mov         T1d, INT [LUT + 13*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 5

    cmp         LENEND, 7
    jl          %%.ELOAD15
    mov         T1d, INT [LUT + 14*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 6
%%.ELOAD15:
%endmacro

%macro LOAD8 0
    pxor        N0, N0

    mov         T0d, INT [LUT +  0*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 0

    mov         T0d, INT [LUT +  1*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 1

    mov         T0d, INT [LUT +  2*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 2

    mov         T0d, INT [LUT +  3*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 3

    mov         T0d, INT [LUT +  4*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 4

    mov         T0d, INT [LUT +  5*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 5

    mov         T0d, INT [LUT +  6*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 6

    mov         T0d, INT [LUT +  7*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 7
%endmacro

%macro LOAD7 0
    pxor        N0, N0
    pxor        X0, X0

    mov         T1d, INT [LUT +  0*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 0

    cmp         LENEND, 2
    jl          %%.ELOAD7
    mov         T1d, INT [LUT +  1*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 1

    cmp         LENEND, 3
    jl          %%.ELOAD7
    mov         T1d, INT [LUT +  2*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 2

    cmp         LENEND, 4
    jl          %%.ELOAD7
    mov         T1d, INT [LUT +  3*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 3

    cmp         LENEND, 5
    jl          %%.ELOAD7
    mov         T1d, INT [LUT +  4*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 4

    cmp         LENEND, 6
    jl          %%.ELOAD7
    mov         T1d, INT [LUT +  5*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 5

    cmp         LENEND, 7
    jl          %%.ELOAD7
    mov         T1d, INT [LUT +  6*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 6
%%.ELOAD7:
%endmacro

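;
; REDUCE0 packs the 64 prepared coefficients into a 64-bit bitmap: each word
; is compared against zero, the compare masks are packed to bytes, and
; pmovmskb extracts one bit per coefficient.  The inverted result (bit set =
; non-zero coefficient) is stored to the output pointed to by r15.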
%macro REDUCE0 0
    movdqa      xmm0, XMMWORD [VALUES + ( 0*2)]
    movdqa      xmm1, XMMWORD [VALUES + ( 8*2)]
    movdqa      xmm2, XMMWORD [VALUES + (16*2)]
    movdqa      xmm3, XMMWORD [VALUES + (24*2)]
    movdqa      xmm4, XMMWORD [VALUES + (32*2)]
    movdqa      xmm5, XMMWORD [VALUES + (40*2)]
    movdqa      xmm6, XMMWORD [VALUES + (48*2)]
    movdqa      xmm7, XMMWORD [VALUES + (56*2)]

    pcmpeqw     xmm0, ZERO
    pcmpeqw     xmm1, ZERO
    pcmpeqw     xmm2, ZERO
    pcmpeqw     xmm3, ZERO
    pcmpeqw     xmm4, ZERO
    pcmpeqw     xmm5, ZERO
    pcmpeqw     xmm6, ZERO
    pcmpeqw     xmm7, ZERO

    packsswb    xmm0, xmm1
    packsswb    xmm2, xmm3
    packsswb    xmm4, xmm5
    packsswb    xmm6, xmm7

    pmovmskb    eax, xmm0
    pmovmskb    ecx, xmm2
    pmovmskb    edx, xmm4
    pmovmskb    esi, xmm6

    shl         rcx, 16
    shl         rdx, 32
    shl         rsi, 48

    or          rax, rcx
    or          rdx, rsi
    or          rax, rdx

    not         rax

    mov         MMWORD [r15], rax
%endmacro

;
; Prepare data for jsimd_encode_mcu_AC_first().
;
; GLOBAL(void)
; jsimd_encode_mcu_AC_first_prepare_sse2(const JCOEF *block,
;                                        const int *jpeg_natural_order_start,
;                                        int Sl, int Al, JCOEF *values,
;                                        size_t *zerobits)
;
; r10 = const JCOEF *block
; r11 = const int *jpeg_natural_order_start
; r12 = int Sl
; r13 = int Al
; r14 = JCOEF *values
; r15 = size_t *zerobits
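;
; Per coefficient, the code below roughly computes (a scalar sketch; the
; reference implementation is in jcphuff.c):
;
;   temp = block[jpeg_natural_order_start[k]];
;   neg = (temp < 0);
;   temp = abs(temp) >> Al;
;   values[k] = temp;
;   values[k + DCTSIZE2] = neg ? ~temp : temp;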

%define ZERO    xmm9
%define X0      xmm0
%define X1      xmm1
%define N0      xmm2
%define N1      xmm3
%define AL      xmm4
%define K       eax
%define LUT     r11
%define T0      rcx
%define T0d     ecx
%define T1      rdx
%define T1d     edx
%define BLOCK   r10
%define VALUES  r14
%define LEN     r12d
%define LENEND  r13d

    align       32
    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2)

EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
    push        rbp
    mov         rax, rsp                     ; rax = original rbp
    sub         rsp, byte 4
    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [rsp], rax
    mov         rbp, rsp                     ; rbp = aligned rbp
    lea         rsp, [rbp - 16]
    collect_args 6

    movdqa      XMMWORD [rbp - 16], ZERO

    movd        AL, r13d
    pxor        ZERO, ZERO
    mov         K, LEN
    mov         LENEND, LEN
    and         K, -16
    and         LENEND, 7
    shr         K, 4
    jz          .ELOOP16
.BLOOP16:
    LOAD16
    pcmpgtw     N0, X0                  ; N0 = (X0 < 0) ? -1 : 0
    pcmpgtw     N1, X1
    paddw       X0, N0                  ; X0 = abs(X0)  ((x + mask) ^ mask)
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL                  ; X0 = abs(coefficient) >> Al
    psrlw       X1, AL
    pxor        N0, X0                  ; N0 = negative ? ~X0 : X0
    pxor        N1, X1
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    movdqa      XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
    add         VALUES, 16*2
    add         LUT, 16*SIZEOF_INT
    dec         K
    jnz         .BLOOP16
    test        LEN, 15
    je          .PADDING
.ELOOP16:
    test        LEN, 8
    jz          .TRY7
    test        LEN, 7
    jz          .TRY8

    LOAD15
    pcmpgtw     N0, X0
    pcmpgtw     N1, X1
    paddw       X0, N0
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL
    psrlw       X1, AL
    pxor        N0, X0
    pxor        N1, X1
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    movdqa      XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
    add         VALUES, 16*2
    jmp         .PADDING
.TRY8:
    LOAD8
    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    pxor        N0, X0
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    add         VALUES, 8*2
    jmp         .PADDING
.TRY7:
    LOAD7
    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    pxor        N0, X0
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    add         VALUES, 8*2
.PADDING:
    mov         K, LEN
    add         K, 7
    and         K, -8
    shr         K, 3
    sub         K, DCTSIZE2/8
    jz          .EPADDING
    align       16
.ZEROLOOP:
    movdqa      XMMWORD [VALUES + 0], ZERO
    add         VALUES, 8*2
    inc         K
    jnz         .ZEROLOOP
.EPADDING:
    sub         VALUES, DCTSIZE2*2

    REDUCE0

    movdqa      ZERO, XMMWORD [rbp - 16]
    uncollect_args 6
    mov         rsp, rbp                ; rsp <- aligned rbp
    pop         rsp                     ; rsp <- original rbp
    pop         rbp
    ret

%undef ZERO
%undef X0
%undef X1
%undef N0
%undef N1
%undef AL
%undef K
%undef LUT
%undef T0
%undef T0d
%undef T1
%undef T1d
%undef BLOCK
%undef VALUES
%undef LEN
%undef LENEND

;
; Prepare data for jsimd_encode_mcu_AC_refine().
;
; GLOBAL(int)
; jsimd_encode_mcu_AC_refine_prepare_sse2(const JCOEF *block,
;                                         const int *jpeg_natural_order_start,
;                                         int Sl, int Al, JCOEF *absvalues,
;                                         size_t *bits)
;
; r10 = const JCOEF *block
; r11 = const int *jpeg_natural_order_start
; r12 = int Sl
; r13 = int Al
; r14 = JCOEF *values
; r15 = size_t *bits
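;
; Per coefficient, the code below roughly computes (a scalar sketch; the
; reference implementation is in jcphuff.c):
;
;   temp = abs(block[jpeg_natural_order_start[k]]) >> Al;
;   absvalues[k] = temp;
;   if (temp == 1) EOB = k;    /* tracked per group via pcmpeqw/pmovmskb/bsr */
;
; Sign bits are accumulated into SIGN one group at a time (shr SIGN / shl /
; or).  bits[0] receives the non-zero-coefficient bitmap (REDUCE0), bits[1]
; the accumulated sign bits (stored complemented), and EOB is returned in
; eax.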

%define ZERO    xmm9
%define ONE     xmm5
%define X0      xmm0
%define X1      xmm1
%define N0      xmm2
%define N1      xmm3
%define AL      xmm4
%define K       eax
%define KK      r9d
%define EOB     r8d
%define SIGN    rdi
%define LUT     r11
%define T0      rcx
%define T0d     ecx
%define T1      rdx
%define T1d     edx
%define BLOCK   r10
%define VALUES  r14
%define LEN     r12d
%define LENEND  r13d

    align       32
    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2)

EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
    push        rbp
    mov         rax, rsp                     ; rax = original rbp
    sub         rsp, byte 4
    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [rsp], rax
    mov         rbp, rsp                     ; rbp = aligned rbp
    lea         rsp, [rbp - 16]
    collect_args 6

    movdqa      XMMWORD [rbp - 16], ZERO

    xor         SIGN, SIGN
    xor         EOB, EOB
    xor         KK, KK
    movd        AL, r13d
    pxor        ZERO, ZERO
    pcmpeqw     ONE, ONE                ; ONE = all ones
    psrlw       ONE, 15                 ; ONE = 1 in each word
    mov         K, LEN
    mov         LENEND, LEN
    and         K, -16
    and         LENEND, 7
    shr         K, 4
    jz          .ELOOPR16
.BLOOPR16:
    LOAD16
    pcmpgtw     N0, X0
    pcmpgtw     N1, X1
    paddw       X0, N0
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL
    psrlw       X1, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    pcmpeqw     X0, ONE
    pcmpeqw     X1, ONE
    packsswb    N0, N1
    packsswb    X0, X1
    pmovmskb    T0d, N0                 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    pmovmskb    T1d, X0                 ; idx = _mm_movemask_epi8(x1);
    shr         SIGN, 16                ; make room for sizebits
    shl         T0, 48
    or          SIGN, T0
    bsr         T1d, T1d                ;  idx = 16 - (__builtin_clz(idx)>>1);
    jz          .CONTINUER16            ; if (idx) {
    mov         EOB, KK
    add         EOB, T1d                ; EOB = k + idx;
.CONTINUER16:
    add         VALUES, 16*2
    add         LUT, 16*SIZEOF_INT
    add         KK, 16
    dec         K
    jnz         .BLOOPR16
    test        LEN, 15
    je          .PADDINGR
.ELOOPR16:
    test        LEN, 8
    jz          .TRYR7
    test        LEN, 7
    jz          .TRYR8

    LOAD15
    pcmpgtw     N0, X0
    pcmpgtw     N1, X1
    paddw       X0, N0
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL
    psrlw       X1, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    pcmpeqw     X0, ONE
    pcmpeqw     X1, ONE
    packsswb    N0, N1
    packsswb    X0, X1
    pmovmskb    T0d, N0                 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    pmovmskb    T1d, X0                 ; idx = _mm_movemask_epi8(x1);
    shr         SIGN, 16                ; make room for sizebits
    shl         T0, 48
    or          SIGN, T0
    bsr         T1d, T1d                ;  idx = 16 - (__builtin_clz(idx)>>1);
    jz          .CONTINUER15            ; if (idx) {
    mov         EOB, KK
    add         EOB, T1d                ; EOB = k + idx;
.CONTINUER15:
    add         VALUES, 16*2
    jmp         .PADDINGR
.TRYR8:
    LOAD8

    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    pcmpeqw     X0, ONE
    packsswb    N0, ZERO
    packsswb    X0, ZERO
    pmovmskb    T0d, N0                 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    pmovmskb    T1d, X0                 ; idx = _mm_movemask_epi8(x1);
    shr         SIGN, 8                 ; make room for sizebits
    shl         T0, 56
    or          SIGN, T0
    bsr         T1d, T1d                ;  idx = 16 - (__builtin_clz(idx)>>1);
    jz          .CONTINUER8             ; if (idx) {
    mov         EOB, KK
    add         EOB, T1d                ; EOB = k + idx;
.CONTINUER8:
    add         VALUES, 8*2
    jmp         .PADDINGR
.TRYR7:
    LOAD7

    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    pcmpeqw     X0, ONE
    packsswb    N0, ZERO
    packsswb    X0, ZERO
    pmovmskb    T0d, N0                 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    pmovmskb    T1d, X0                 ; idx = _mm_movemask_epi8(x1);
    shr         SIGN, 8                 ; make room for sizebits
    shl         T0, 56
    or          SIGN, T0
    bsr         T1d, T1d                ; idx = 16 - (__builtin_clz(idx)>>1);
    jz          .CONTINUER7             ; if (idx) {
    mov         EOB, KK
    add         EOB, T1d                ; EOB = k + idx;
.CONTINUER7:
    add         VALUES, 8*2
.PADDINGR:
    mov         K, LEN
    add         K, 7
    and         K, -8
    shr         K, 3
    sub         K, DCTSIZE2/8
    jz          .EPADDINGR
    align       16
.ZEROLOOPR:
    movdqa      XMMWORD [VALUES + 0], ZERO
    shr         SIGN, 8
    add         VALUES, 8*2
    inc         K
    jnz         .ZEROLOOPR
.EPADDINGR:
    not         SIGN
    sub         VALUES, DCTSIZE2*2
    mov         MMWORD [r15+SIZEOF_MMWORD], SIGN

    REDUCE0

    mov         eax, EOB
    movdqa      ZERO, XMMWORD [rbp - 16]
    uncollect_args 6
    mov         rsp, rbp                ; rsp <- aligned rbp
    pop         rsp                     ; rsp <- original rbp
    pop         rbp
    ret

%undef ZERO
%undef ONE
%undef X0
%undef X1
%undef N0
%undef N1
%undef AL
%undef K
%undef KK
%undef EOB
%undef SIGN
%undef LUT
%undef T0
%undef T0d
%undef T1
%undef T1d
%undef BLOCK
%undef VALUES
%undef LEN
%undef LENEND

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32