/* adler32_simd.c
 *
 * Copyright 2017 The Chromium Authors. All rights reserved.
 * Use of this source code is governed by a BSD-style license that can be
 * found in the Chromium source repository LICENSE file.
 *
 * Per http://en.wikipedia.org/wiki/Adler-32 the adler32 A value (aka s1) is
 * the sum of N input data bytes D1 ... DN,
 *
 *   A = A0 + D1 + D2 + ... + DN
 *
 * where A0 is the initial value.
 *
 * SSE2 _mm_sad_epu8() can be used for byte sums (see http://bit.ly/2wpUOeD,
 * for example) and accumulating the byte sums can use SSE shuffle-adds (see
 * the "Integer" section of http://bit.ly/2erPT8t for details). Arm NEON has
 * similar instructions.
 *
 * The adler32 B value (aka s2) sums the A values from each step:
 *
 *   B0 + (A0 + D1) + (A0 + D1 + D2) + ... + (A0 + D1 + D2 + ... + DN) or
 *
 *   B0 + N.A0 + N.D1 + (N-1).D2 + (N-2).D3 + ... + (N-(N-1)).DN
 *
 * B0 being the initial value. For 32 bytes (ideal for garden-variety SIMD):
 *
 *   B = B0 + 32.A0 + [D1 D2 D3 ... D32] x [32 31 30 ... 1].
 *
 * Adjacent blocks of 32 input bytes can be iterated with the expressions to
 * compute the adler32 s1 s2 of M >> 32 input bytes [1].
 *
 * As M grows, the s1 s2 sums grow. If left unchecked, they would eventually
 * overflow the precision of their integer representation (bad). However, s1
 * and s2 also need to be computed modulo the adler BASE value (reduced). If
 * at most NMAX bytes are processed before a reduce, s1 s2 _cannot_ overflow
 * a uint32_t type (the NMAX constraint) [2].
 *
 * [1] the iterative equations for s2 contain constant factors; these can be
 * hoisted from the n-blocks do loop of the SIMD code.
 *
 * [2] zlib adler32_z() uses this fact to implement NMAX-block-based updates
 * of the adler s1 s2 of uint32_t type (see adler32.c).
 */

#include "adler32_simd.h"

/* Definitions from adler32.c: largest prime smaller than 65536 */
#define BASE 65521U
/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */
#define NMAX 5552
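
/*
 * For reference, the formulas above reduce to the scalar recurrence
 * s1 += byte; s2 += s1, with both sums reduced modulo BASE at least every
 * NMAX bytes. The sketch below is illustrative only and is compiled out;
 * the function name is hypothetical and it is not part of the SIMD paths
 * that follow.
 */
#if 0  /* illustrative scalar sketch */
static uint32_t adler32_scalar_sketch(uint32_t adler,
                                      const unsigned char *buf,
                                      z_size_t len)
{
    uint32_t s1 = adler & 0xffff;
    uint32_t s2 = adler >> 16;

    while (len) {
        /* Process at most NMAX bytes before reducing modulo BASE, so that
         * neither s1 nor s2 can overflow a uint32_t (the NMAX constraint). */
        z_size_t n = len < NMAX ? len : NMAX;
        len -= n;

        while (n--) {
            s1 += *buf++;   /* A = A0 + D1 + D2 + ... + DN */
            s2 += s1;       /* B = B0 + N.A0 + N.D1 + ...  */
        }

        s1 %= BASE;
        s2 %= BASE;
    }

    return s1 | (s2 << 16);
}
#endif
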
#if defined(ADLER32_SIMD_SSSE3)
#ifndef __GNUC__
#define __attribute__(x)
#endif

#include <tmmintrin.h>

__attribute__((target("ssse3")))
uint32_t ZLIB_INTERNAL adler32_simd_(  /* SSSE3 */
    uint32_t adler,
    const unsigned char *buf,
    z_size_t len)
{
    /*
     * Split Adler-32 into component sums.
     */
    uint32_t s1 = adler & 0xffff;
    uint32_t s2 = adler >> 16;

    /*
     * Process the data in blocks.
     */
    const unsigned BLOCK_SIZE = 1 << 5;

    z_size_t blocks = len / BLOCK_SIZE;
    len -= blocks * BLOCK_SIZE;

    while (blocks)
    {
        unsigned n = NMAX / BLOCK_SIZE;  /* The NMAX constraint. */
        if (n > blocks)
            n = (unsigned) blocks;
        blocks -= n;

        const __m128i tap1 =
            _mm_setr_epi8(32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17);
        const __m128i tap2 =
            _mm_setr_epi8(16,15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
        const __m128i zero =
            _mm_setr_epi8( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
        const __m128i ones =
            _mm_set_epi16( 1, 1, 1, 1, 1, 1, 1, 1);

        /*
         * Process n blocks of data. At most NMAX data bytes can be
         * processed before s2 must be reduced modulo BASE.
         */
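        /*
         * v_ps is seeded with s1 * n and, inside the loop, accumulates the
         * byte sums of the preceding blocks, so after n blocks it holds the
         * sum of the s1 values in effect at the start of each block. The
         * common x32 weight (the 32.A0 term in the header comment) is
         * hoisted out of the do loop and applied once by the shift left by
         * 5 after it (see note [1]).
         */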
        __m128i v_ps = _mm_set_epi32(0, 0, 0, s1 * n);
        __m128i v_s2 = _mm_set_epi32(0, 0, 0, s2);
        __m128i v_s1 = _mm_set_epi32(0, 0, 0, 0);

        do {
            /*
             * Load 32 input bytes.
             */
            const __m128i bytes1 = _mm_loadu_si128((__m128i*)(buf));
            const __m128i bytes2 = _mm_loadu_si128((__m128i*)(buf + 16));

            /*
             * Add previous block byte sum to v_ps.
             */
            v_ps = _mm_add_epi32(v_ps, v_s1);

            /*
             * Horizontally add the bytes for s1; multiply-add the
             * bytes by [ 32, 31, 30, ... ] for s2.
             */
            v_s1 = _mm_add_epi32(v_s1, _mm_sad_epu8(bytes1, zero));
            const __m128i mad1 = _mm_maddubs_epi16(bytes1, tap1);
            v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(mad1, ones));

            v_s1 = _mm_add_epi32(v_s1, _mm_sad_epu8(bytes2, zero));
            const __m128i mad2 = _mm_maddubs_epi16(bytes2, tap2);
            v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(mad2, ones));

            buf += BLOCK_SIZE;

        } while (--n);

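        /*
         * Apply the hoisted x32 (BLOCK_SIZE) weight: v_ps holds the
         * per-block s1 terms, so shifting it left by 5 multiplies them by
         * 32 before folding them into s2.
         */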
        v_s2 = _mm_add_epi32(v_s2, _mm_slli_epi32(v_ps, 5));

        /*
         * Sum the 32-bit lanes of v_s1 and v_s2. The v_s1 total is added
         * to s1; the v_s2 total becomes the new s2 (v_s2 already carries
         * the incoming s2 from its initialization above).
         */

#define S23O1 _MM_SHUFFLE(2,3,0,1)  /* A B C D -> B A D C */
#define S1O32 _MM_SHUFFLE(1,0,3,2)  /* A B C D -> C D A B */

        v_s1 = _mm_add_epi32(v_s1, _mm_shuffle_epi32(v_s1, S23O1));
        v_s1 = _mm_add_epi32(v_s1, _mm_shuffle_epi32(v_s1, S1O32));

        s1 += _mm_cvtsi128_si32(v_s1);

        v_s2 = _mm_add_epi32(v_s2, _mm_shuffle_epi32(v_s2, S23O1));
        v_s2 = _mm_add_epi32(v_s2, _mm_shuffle_epi32(v_s2, S1O32));

        s2 = _mm_cvtsi128_si32(v_s2);

#undef S23O1
#undef S1O32

        /*
         * Reduce.
         */
        s1 %= BASE;
        s2 %= BASE;
    }

    /*
     * Handle leftover data.
     */
    if (len) {
        if (len >= 16) {
            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);

            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);

            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);

            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);

            len -= 16;
        }

        while (len--) {
            s2 += (s1 += *buf++);
        }

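        /*
         * At most BLOCK_SIZE - 1 leftover bytes were added above and s1
         * was already reduced modulo BASE (or came from an already-reduced
         * adler value), so s1 < 2 * BASE here and a single conditional
         * subtraction is enough to reduce it.
         */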
        if (s1 >= BASE)
            s1 -= BASE;
        s2 %= BASE;
    }

    /*
     * Return the recombined sums.
     */
    return s1 | (s2 << 16);
}

#elif defined(ADLER32_SIMD_NEON)

#include <arm_neon.h>

uint32_t ZLIB_INTERNAL adler32_simd_(  /* NEON */
    uint32_t adler,
    const unsigned char *buf,
    z_size_t len)
{
    /*
     * Split Adler-32 into component sums.
     */
    uint32_t s1 = adler & 0xffff;
    uint32_t s2 = adler >> 16;

    /*
     * Serially compute s1 & s2, until the data is 16-byte aligned.
     */
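    /*
     * Note: len is decremented without a bounds check below, so this path
     * assumes the caller supplies at least as many bytes as are needed to
     * reach 16-byte alignment.
     */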
    if ((uintptr_t)buf & 15) {
        while ((uintptr_t)buf & 15) {
            s2 += (s1 += *buf++);
            --len;
        }

        if (s1 >= BASE)
            s1 -= BASE;
        s2 %= BASE;
    }

    /*
     * Process the data in blocks.
     */
    const unsigned BLOCK_SIZE = 1 << 5;

    z_size_t blocks = len / BLOCK_SIZE;
    len -= blocks * BLOCK_SIZE;

    while (blocks)
    {
        unsigned n = NMAX / BLOCK_SIZE;  /* The NMAX constraint. */
        if (n > blocks)
            n = (unsigned) blocks;
        blocks -= n;

        /*
         * Process n blocks of data. At most NMAX data bytes can be
         * processed before s2 must be reduced modulo BASE.
         */
        uint32x4_t v_s2 = (uint32x4_t) { 0, 0, 0, s1 * n };
        uint32x4_t v_s1 = (uint32x4_t) { 0, 0, 0, 0 };

        uint16x8_t v_column_sum_1 = vdupq_n_u16(0);
        uint16x8_t v_column_sum_2 = vdupq_n_u16(0);
        uint16x8_t v_column_sum_3 = vdupq_n_u16(0);
        uint16x8_t v_column_sum_4 = vdupq_n_u16(0);
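
        /*
         * The NEON path keeps per-column byte sums in 16-bit lanes and
         * applies the [ 32, 31, 30, ... ] weights once, after the loop.
         * With at most NMAX / BLOCK_SIZE = 173 blocks per pass, a column
         * sum is bounded by 173 * 255 = 44115, which fits in 16 bits.
         */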

        do {
            /*
             * Load 32 input bytes.
             */
            const uint8x16_t bytes1 = vld1q_u8((uint8_t*)(buf));
            const uint8x16_t bytes2 = vld1q_u8((uint8_t*)(buf + 16));

            /*
             * Add previous block byte sum to v_s2.
             */
            v_s2 = vaddq_u32(v_s2, v_s1);

            /*
             * Horizontally add the bytes for s1.
             */
            v_s1 = vpadalq_u16(v_s1, vpadalq_u8(vpaddlq_u8(bytes1), bytes2));

            /*
             * Vertically add the bytes for s2.
             */
            v_column_sum_1 = vaddw_u8(v_column_sum_1, vget_low_u8 (bytes1));
            v_column_sum_2 = vaddw_u8(v_column_sum_2, vget_high_u8(bytes1));
            v_column_sum_3 = vaddw_u8(v_column_sum_3, vget_low_u8 (bytes2));
            v_column_sum_4 = vaddw_u8(v_column_sum_4, vget_high_u8(bytes2));

            buf += BLOCK_SIZE;

        } while (--n);

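        /*
         * v_s2 currently holds s1 * n plus the previous-block byte sums;
         * shift left by 5 to apply the x32 (BLOCK_SIZE) weight that was
         * hoisted out of the loop (see note [1] in the header comment).
         */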
        v_s2 = vshlq_n_u32(v_s2, 5);

        /*
         * Multiply-add bytes by [ 32, 31, 30, ... ] for s2.
         */
        v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_column_sum_1),
                         (uint16x4_t) { 32, 31, 30, 29 });
        v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_1),
                         (uint16x4_t) { 28, 27, 26, 25 });
        v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_column_sum_2),
                         (uint16x4_t) { 24, 23, 22, 21 });
        v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_2),
                         (uint16x4_t) { 20, 19, 18, 17 });
        v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_column_sum_3),
                         (uint16x4_t) { 16, 15, 14, 13 });
        v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_3),
                         (uint16x4_t) { 12, 11, 10,  9 });
        v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_column_sum_4),
                         (uint16x4_t) {  8,  7,  6,  5 });
        v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_4),
                         (uint16x4_t) {  4,  3,  2,  1 });

        /*
         * Sum the 32-bit lanes of v_s1 (v_s2) and accumulate in s1 (s2).
         */
        uint32x2_t sum1 = vpadd_u32(vget_low_u32(v_s1), vget_high_u32(v_s1));
        uint32x2_t sum2 = vpadd_u32(vget_low_u32(v_s2), vget_high_u32(v_s2));
        uint32x2_t s1s2 = vpadd_u32(sum1, sum2);

        s1 += vget_lane_u32(s1s2, 0);
        s2 += vget_lane_u32(s1s2, 1);

        /*
         * Reduce.
         */
        s1 %= BASE;
        s2 %= BASE;
    }

    /*
     * Handle leftover data.
     */
    if (len) {
        if (len >= 16) {
            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);

            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);

            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);

            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);

            len -= 16;
        }

        while (len--) {
            s2 += (s1 += *buf++);
        }

        if (s1 >= BASE)
            s1 -= BASE;
        s2 %= BASE;
    }

    /*
     * Return the recombined sums.
     */
    return s1 | (s2 << 16);
}

#endif /* ADLER32_SIMD_SSSE3 */
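
/*
 * Usage sketch (illustrative only, compiled out): adler32_simd_() takes a
 * running Adler-32 value, a buffer, and its length, and returns the updated
 * checksum with s1 in the low 16 bits and s2 in the high 16 bits. The
 * standard initial value is 1 (s1 = 1, s2 = 0). The caller below is
 * hypothetical and not part of this file.
 */
#if 0
#include <stdio.h>

static void print_checksum(const unsigned char *data, z_size_t size)
{
    uint32_t adler = 1;  /* s1 = 1, s2 = 0 */
    adler = adler32_simd_(adler, data, size);
    printf("adler32 = 0x%08lx\n", (unsigned long) adler);
}
#endif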