• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Adler32 for POWER8 using VSX instructions.
2  * Copyright (C) 2020 IBM Corporation
3  * Author: Rogerio Alves <rcardoso@linux.ibm.com>
4  * For conditions of distribution and use, see copyright notice in zlib.h
5  *
6  * Calculate adler32 checksum for 16 bytes at once using POWER8+ VSX (vector)
7  * instructions.
8  *
 * If Adler-32 is computed one byte at a time, let s1_n denote the value of
 * s1 after iteration n. s1_0 is the initial value taken from the incoming
 * adler (1 at the start of a stream, unless the caller supplies a different
 * starting value). After the first byte s1_1 = s1_0 + c[1], after the second
 * s1_2 = s1_1 + c[2], and so on. Hence, for iteration N,
 * s1_N = s1_(N-1) + c[N] is the value of s1 after iteration N.
15  *
 * Since s2 adds the current s1 after every byte, for iteration N we get
 * s2_N = s2_0 + N*s1_0 + N*c[1] + (N-1)*c[2] + ... + c[N]
18  *
19  * In a more general way:
20  *
21  * s1_N = s1_0 + sum(i=1 to N)c[i]
 * s2_N = s2_0 + N*s1_0 + sum(i=1 to N)(N-i+1)*c[i]
23  *
 * Where s1_N, s2_N are the values for s1, s2 after N iterations. So if we
 * can process N bytes at a time we can do this at once.
26  *
 * Since VSX provides 128-bit (16-byte) vector registers, we can process
 * 16 bytes at a time. Using N = 16 we have:
29  *
30  * s1 = s1_16 = s1_(16-1) + c[16] = s1_0 + sum(i=1 to 16)c[i]
 * s2 = s2_16 = s2_0 + 16*s1_0 + sum(i=1 to 16)(16-i+1)*c[i]
32  *
33  * After the first iteration we calculate the adler32 checksum for 16 bytes.
34  *
35  * For more background about adler32 please check the RFC:
36  * https://www.ietf.org/rfc/rfc1950.txt
37  */
38 
39 #ifdef POWER8_VSX_ADLER32
40 
41 #include <altivec.h>
42 #include "zbuild.h"
43 #include "zutil.h"
44 #include "adler32_p.h"
45 
/* Horizontal sum of the four unsigned 32-bit lanes of `a`.
 *
 * The additions are done with vec_add, so they wrap modulo 2^32; nothing is
 * saturated. On return every lane of the result holds the same total.
 *
 * `b` is accepted only to keep existing call sites unchanged: its incoming
 * value is never read, and it is reused as scratch storage.
 *
 * Declared `static inline` so that a definition is always available in this
 * translation unit, even when the compiler decides not to inline the call.
 */
static inline vector unsigned int vec_sumsu(vector unsigned int a, vector unsigned int b) {
    /* Rotate by 8 bytes (two lanes) and add: lanes become a0+a2, a1+a3, ... */
    b = vec_sld(a, a, 8);
    b = vec_add(b, a);
    /* Rotate by 4 bytes (one lane) and add: every lane is a0+a1+a2+a3. */
    a = vec_sld(b, b, 4);
    a = vec_add(a, b);

    return a;
}
55 
/* Update an Adler-32 checksum with `len` bytes from `buf`, using POWER8 VSX
 * vector instructions for the bulk of the data.
 *
 *   adler  running checksum; s1 is taken from bits 0..15, s2 from bits 16..31
 *   buf    input bytes; when NULL (and len != 1) the function returns 1
 *   len    number of bytes available at buf
 *
 * Returns whatever the adler32_len_1 / adler32_len_64 / adler32_len_16
 * helper reached on the chosen path returns (presumably the combined
 * (s2 << 16) | s1 value - the helpers are defined outside this file).
 */
uint32_t adler32_power8(uint32_t adler, const unsigned char* buf, size_t len) {
    /* Unpack the two 16-bit halves of the running checksum. */
    uint32_t s2 = (adler >> 16) & 0xffff;
    uint32_t s1 = adler & 0xffff;

    /* Byte-at-a-time callers get the dedicated single-byte helper. */
    if (UNLIKELY(len == 1))
        return adler32_len_1(s1, buf, s2);

    /* No buffer supplied: answer with the constant 1. */
    if (UNLIKELY(buf == NULL))
        return 1;

    /* Inputs shorter than 64 bytes are delegated to adler32_len_64 (the
       original author notes that path is faster than the VSX code here).  */
    if (len < 64)
        return adler32_len_64(s1, buf, len, s2);

    /* From here on len >= 64 and the vector path is used.  */
    const vector unsigned int zero_v = { 0 };
    /* Per-byte weights 16..1, i.e. (16-i+1) for byte i of a 16-byte chunk.  */
    const vector unsigned char weights = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7,
         6, 5, 4, 3, 2, 1};
    const vector unsigned char shift_by_4 = vec_splat_u8(4);
    /* Keeps element 0 and clears the other three elements.  */
    const vector unsigned int lane0_only = {0xffffffff, 0x0, 0x0, 0x0};
    vector unsigned int acc1 = { 0 };       /* running s1, spread over lanes */
    vector unsigned int acc2 = { 0 };       /* running s2, spread over lanes */
    vector unsigned int acc1_hist = { 0 };  /* sum of acc1 before each chunk */
    vector unsigned int part1, part2;
    vector unsigned char chunk;
    int chunks_left;

    /* Seed element 0 of each accumulator with the incoming state.  */
    acc1[0] = s1;
    acc2[0] = s2;

    /* Whole blocks of NMAX bytes, reduced modulo BASE after every block.  */
    while (len >= NMAX) {
        chunks_left = NMAX / 16;
        len -= NMAX;
        do {
            chunk = vec_xl(0, (unsigned char *) buf);
            buf += 16;

            /* Plain byte sums, four bytes per 32-bit lane.  */
            part1 = vec_sum4s(chunk, zero_v);
            /* Weighted byte sums: buf[i]*(16-i+1), four bytes per lane.  */
            part2 = vec_msum(chunk, weights, zero_v);

            /* Remember acc1 as it was before this chunk is added.  */
            acc1_hist = vec_add(acc1_hist, acc1);

            acc1 = vec_add(part1, acc1);
            acc2 = vec_add(part2, acc2);
        } while (--chunks_left);

        /* Collapse the lanes once per block.  */
        acc1 = vec_sumsu(acc1, part1);
        /* Shift left by 4 bits, i.e. 16*acc1_hist.
           NOTE(review): vec_sll shifts the vector as a whole, so this relies
           on the top 4 bits of every lane being clear - confirm against the
           NMAX bound.  */
        acc1_hist = vec_sll(acc1_hist, shift_by_4);
        acc2 = vec_add(acc1_hist, acc2);
        acc2 = vec_sumsu(acc2, part2);

        /* Reduce element 0 of both sums modulo BASE.  */
        acc1[0] = acc1[0] % BASE;
        acc2[0] = acc2[0] % BASE;

        /* Drop everything but element 0 and restart the history.  */
        acc1 = vec_and(acc1, lane0_only);
        acc2 = vec_and(acc2, lane0_only);
        acc1_hist = zero_v;
    }

    /* Fewer than NMAX bytes remain: consume the 16-byte chunks that are left
       and collapse the lanes a single time.  */
    if (len >= 16) {
        do {
            chunk = vec_xl(0, (unsigned char *) buf);
            buf += 16;
            len -= 16;

            part1 = vec_sum4s(chunk, zero_v);
            part2 = vec_msum(chunk, weights, zero_v);

            acc1_hist = vec_add(acc1_hist, acc1);

            acc1 = vec_add(part1, acc1);
            acc2 = vec_add(part2, acc2);
        } while (len >= 16);

        acc1 = vec_sumsu(acc1, part1);
        acc1_hist = vec_sll(acc1_hist, shift_by_4); /* 16*acc1_hist.  */
        acc2 = vec_add(acc1_hist, acc2);
        acc2 = vec_sumsu(acc2, part2);
    }

    /* Bring element 0 of each accumulator back to scalars, modulo BASE.  */
    s1 = acc1[0] % BASE;
    s2 = acc2[0] % BASE;

    /* The final len < 16 bytes are handled by adler32_len_16.  */
    return adler32_len_16(s1, buf, len, s2);
}
153 
154 #endif /* POWER8_VSX_ADLER32 */
155