• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* adler32.c -- compute the Adler-32 checksum of a data stream
2  * Copyright (C) 1995-2011 Mark Adler
3  * Authors:
4  *   Brian Bockelman <bockelman@gmail.com>
5  * For conditions of distribution and use, see copyright notice in zlib.h
6  */
7 
8 #include "../../zbuild.h"
9 #include "../../zutil.h"
10 
11 #include "../../adler32_p.h"
12 
13 #include <immintrin.h>
14 
15 #ifdef X86_AVX2_ADLER32
16 
/* Compute the Adler-32 checksum of buf[0..len-1], seeded with 'adler',
 * using AVX2: 32 input bytes are consumed per inner-loop iteration, with
 * the per-lane partial sums reduced modulo BASE only once per NMAX-byte
 * window (NMAX is the largest byte count that cannot overflow uint32_t
 * accumulators — see adler32_p.h).
 *
 *   adler - previous checksum value (1 for a fresh stream)
 *   buf   - input bytes, or NULL to request the initial checksum value
 *   len   - number of bytes at buf
 *
 * Returns the updated Adler-32: low 16 bits = s1 (byte sum), high 16 bits
 * = s2 (sum of running s1 values), both modulo BASE. */
Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_t len) {
    uint32_t sum2;

    /* split Adler-32 into component sums: s2 in the high half, s1 in the low */
    sum2 = (adler >> 16) & 0xffff;
    adler &= 0xffff;

    /* in case user likes doing a byte at a time, keep it fast */
    if (UNLIKELY(len == 1))
        return adler32_len_1(adler, buf, sum2);

    /* initial Adler-32 value (deferred check for len == 1 speed) */
    if (UNLIKELY(buf == NULL))
        return 1L;

    /* in case short lengths are provided, keep it somewhat fast */
    if (UNLIKELY(len < 16))
        return adler32_len_16(adler, buf, len, sum2);

    /* Per-lane 32-bit partial sums; seed only the last lane so the vector
     * total starts out equal to the scalar (adler, sum2) pair. */
    uint32_t ALIGNED_(32) s1[8], s2[8];

    memset(s1, 0, sizeof(s1)); s1[7] = adler; // TODO: would a masked load be faster?
    memset(s2, 0, sizeof(s2)); s2[7] = sum2;

    /* All-ones byte weights: maddubs with dot1v yields pairwise byte sums
     * (16 x int16), i.e. the raw contribution to s1. */
    char ALIGNED_(32) dot1[32] = \
        {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
    __m256i dot1v = _mm256_load_si256((__m256i*)dot1);
    /* Descending weights 32..1: byte i is multiplied by (32 - i), its
     * positional contribution to s2 within this 32-byte chunk. */
    char ALIGNED_(32) dot2[32] = \
        {32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
         16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
    __m256i dot2v = _mm256_load_si256((__m256i*)dot2);
    /* All-ones short weights: madd with dot3v widens/sums adjacent int16
     * pairs into 8 x int32. */
    short ALIGNED_(32) dot3[16] = \
        {1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1};
    __m256i dot3v = _mm256_load_si256((__m256i*)dot3);

    /* Shift count 5: each 32-byte chunk adds 32 copies of the previous s1
     * into s2, implemented as a left shift of the saved vs1 (vs1_0) by 5. */
    char ALIGNED_(32) shift[16] = {5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
    __m128i shiftv = _mm_load_si128((__m128i*)shift);

    /* Outer loop: one iteration per NMAX-byte window (the largest span whose
     * unreduced sums still fit in 32 bits), then reduce modulo BASE. */
    while (len >= 32) {
       __m256i vs1 = _mm256_load_si256((__m256i*)s1);
       __m256i vs2 = _mm256_load_si256((__m256i*)s2);
       __m256i vs1_0 = vs1;   /* s1 as it was BEFORE the current chunk */

       /* Bytes to process this window, rounded down to a multiple of 32. */
       int k = (len < NMAX ? (int)len : NMAX);
       k -= k % 32;
       len -= k;

       while (k >= 32) {
           /*
              Per 32-byte chunk (i = 0..31):
              vs1 += sum(c[i])
              vs2 += 32 * vs1_old + sum( (32-i) * c[i] )
           */
           __m256i vbuf = _mm256_loadu_si256((__m256i*)buf);
           buf += 32;
           k -= 32;

           __m256i v_short_sum1 = _mm256_maddubs_epi16(vbuf, dot1v); // multiply-add, resulting in 16 shorts.
           __m256i vsum1 = _mm256_madd_epi16(v_short_sum1, dot3v);   // sum pairs of shorts into 8 x int32;
           __m256i v_short_sum2 = _mm256_maddubs_epi16(vbuf, dot2v); // positional weights for s2
           vs1 = _mm256_add_epi32(vsum1, vs1);
           __m256i vsum2 = _mm256_madd_epi16(v_short_sum2, dot3v);
           vs1_0 = _mm256_sll_epi32(vs1_0, shiftv);                  // 32 * s1_old
           vsum2 = _mm256_add_epi32(vsum2, vs2);
           vs2   = _mm256_add_epi32(vsum2, vs1_0);
           vs1_0 = vs1;                                              // becomes s1_old for the next chunk
       }

       // At this point, we have partial sums stored in vs1 and vs2.  There are AVX512 instructions that
       // would allow us to sum these quickly (VP4DPWSSD).  For now, just unpack and move on.
       uint32_t ALIGNED_(32) s1_unpack[8];
       uint32_t ALIGNED_(32) s2_unpack[8];

       _mm256_store_si256((__m256i*)s1_unpack, vs1);
       _mm256_store_si256((__m256i*)s2_unpack, vs2);

       /* Reduce each lane mod BASE before summing so the 8-way sum cannot
        * overflow, then fold back into lane 7 for the next window. */
       adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE) +
               (s1_unpack[4] % BASE) + (s1_unpack[5] % BASE) + (s1_unpack[6] % BASE) + (s1_unpack[7] % BASE);
       adler %= BASE;
       s1[7] = adler;

       sum2 = (s2_unpack[0] % BASE) + (s2_unpack[1] % BASE) + (s2_unpack[2] % BASE) + (s2_unpack[3] % BASE) +
              (s2_unpack[4] % BASE) + (s2_unpack[5] % BASE) + (s2_unpack[6] % BASE) + (s2_unpack[7] % BASE);
       sum2 %= BASE;
       s2[7] = sum2;
    }

    /* Scalar tail for the final 0..31 bytes. */
    while (len) {
        len--;
        adler += *buf++;
        sum2 += adler;
    }
    adler %= BASE;
    sum2 %= BASE;

    /* return recombined sums */
    return adler | (sum2 << 16);
}
116 
117 #endif
118