/* chunkcopy.h -- fast chunk copy and set operations
 * Copyright (C) 2017 ARM, Inc.
 * Copyright 2017 The Chromium Authors. All rights reserved.
 * Use of this source code is governed by a BSD-style license that can be
 * found in the Chromium source repository LICENSE file.
 */

#ifndef CHUNKCOPY_H
#define CHUNKCOPY_H

#include <stdint.h>
#include "zutil.h"

#define Z_STATIC_ASSERT(name, assert) typedef char name[(assert) ? 1 : -1]
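
/*
 * Z_STATIC_ASSERT emulates C11 _Static_assert for older compilers: if the
 * condition is false, the typedef declares a char array of negative size,
 * which is a compile-time error; if true, a harmless one-element array
 * type. For example, a (hypothetical) failing check such as
 *
 *   Z_STATIC_ASSERT(int_is_8_bytes, sizeof(int) == 8);
 *
 * expands to `typedef char int_is_8_bytes[-1];` and stops the build on a
 * typical 4-byte-int target.
 */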

#if __STDC_VERSION__ >= 199901L
#define Z_RESTRICT restrict
#else
#define Z_RESTRICT
#endif

#if defined(__clang__) || defined(__GNUC__) || defined(__llvm__)
#define Z_BUILTIN_MEMCPY __builtin_memcpy
#else
#define Z_BUILTIN_MEMCPY zmemcpy
#endif

#if defined(INFLATE_CHUNK_SIMD_NEON)
#include <arm_neon.h>
typedef uint8x16_t z_vec128i_t;
#elif defined(INFLATE_CHUNK_SIMD_SSE2)
#pragma GCC target ("sse2")
#include <emmintrin.h>
typedef __m128i z_vec128i_t;
#else
#error chunkcopy.h inflate chunk SIMD is not defined for your build target
#endif

/*
 * chunk copy type: the z_vec128i_t type size should be exactly 128 bits
 * and equal to CHUNKCOPY_CHUNK_SIZE.
 */
#define CHUNKCOPY_CHUNK_SIZE sizeof(z_vec128i_t)

Z_STATIC_ASSERT(vector_128_bits_wide,
                CHUNKCOPY_CHUNK_SIZE == sizeof(int8_t) * 16);

/*
 * Ask the compiler to perform a wide, unaligned load with a machine
 * instruction appropriate for the z_vec128i_t type.
 */
static inline z_vec128i_t loadchunk(
    const unsigned char FAR* s) {
  z_vec128i_t v;
  Z_BUILTIN_MEMCPY(&v, s, sizeof(v));
  return v;
}

/*
 * Ask the compiler to perform a wide, unaligned store with a machine
 * instruction appropriate for the z_vec128i_t type.
 */
static inline void storechunk(
    unsigned char FAR* d,
    const z_vec128i_t v) {
  Z_BUILTIN_MEMCPY(d, &v, sizeof(v));
}

/*
 * Perform a memcpy-like operation, assuming that length is non-zero and that
 * it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE bytes of output even if
 * the length is shorter than this.
 *
 * It also guarantees that it will properly unroll the data if the distance
 * between `out` and `from` is at least CHUNKCOPY_CHUNK_SIZE, which we rely on
 * in chunkcopy_relaxed().
 *
 * Aside from better memory bus utilisation, this means that short copies
 * (CHUNKCOPY_CHUNK_SIZE bytes or fewer) will fall straight through the loop
 * without iteration, which will hopefully make the branch prediction more
 * reliable.
 */
static inline unsigned char FAR* chunkcopy_core(
    unsigned char FAR* out,
    const unsigned char FAR* from,
    unsigned len) {
  const int bump = (--len % CHUNKCOPY_CHUNK_SIZE) + 1;
  storechunk(out, loadchunk(from));
  out += bump;
  from += bump;
  len /= CHUNKCOPY_CHUNK_SIZE;
  while (len-- > 0) {
    storechunk(out, loadchunk(from));
    out += CHUNKCOPY_CHUNK_SIZE;
    from += CHUNKCOPY_CHUNK_SIZE;
  }
  return out;
}
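
/*
 * Illustrative trace: with len == 20 and a 16-byte chunk, bump is
 * ((20 - 1) % 16) + 1 == 4. The first storechunk() writes bytes 0..15, the
 * pointers advance by 4, and a single loop iteration then writes bytes
 * 4..19, so every byte is covered and at most CHUNKCOPY_CHUNK_SIZE - 1
 * bytes past (out + len) are touched.
 */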

/*
 * Like chunkcopy_core(), but avoid writing beyond the end of legal output.
 *
 * Accepts an additional pointer to the end of safe output.  A generic safe
 * copy would use (out + len), but it's normally the case that the end of the
 * output buffer is beyond the end of the current copy, and this can still be
 * exploited.
 */
static inline unsigned char FAR* chunkcopy_core_safe(
    unsigned char FAR* out,
    const unsigned char FAR* from,
    unsigned len,
    unsigned char FAR* limit) {
  Assert(out + len <= limit, "chunk copy exceeds safety limit");
  if ((limit - out) < (ptrdiff_t)CHUNKCOPY_CHUNK_SIZE) {
    const unsigned char FAR* Z_RESTRICT rfrom = from;
    if (len & 8) {
      Z_BUILTIN_MEMCPY(out, rfrom, 8);
      out += 8;
      rfrom += 8;
    }
    if (len & 4) {
      Z_BUILTIN_MEMCPY(out, rfrom, 4);
      out += 4;
      rfrom += 4;
    }
    if (len & 2) {
      Z_BUILTIN_MEMCPY(out, rfrom, 2);
      out += 2;
      rfrom += 2;
    }
    if (len & 1) {
      *out++ = *rfrom++;
    }
    return out;
  }
  return chunkcopy_core(out, from, len);
}
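
/*
 * In the byte-accurate branch above, fewer than CHUNKCOPY_CHUNK_SIZE (16)
 * bytes remain before `limit`, and the Assert() guarantees len <= limit -
 * out, so len fits in four bits. Testing bits 8, 4, 2 and 1 therefore
 * copies exactly len bytes: e.g. len == 13 (binary 1101) issues an 8-byte,
 * a 4-byte and a 1-byte copy.
 */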

/*
 * Perform short copies until distance can be rewritten as being at least
 * CHUNKCOPY_CHUNK_SIZE.
 *
 * Assumes it's OK to overwrite at least the first 2*CHUNKCOPY_CHUNK_SIZE
 * bytes of output even if the copy is shorter than this.  This assumption
 * holds within zlib inflate_fast(), which starts every iteration with at
 * least 258 bytes of output space available (258 being the maximum length
 * output from a single token; see inffast.c).
 */
static inline unsigned char FAR* chunkunroll_relaxed(
    unsigned char FAR* out,
    unsigned FAR* dist,
    unsigned FAR* len) {
  const unsigned char FAR* from = out - *dist;
  while (*dist < *len && *dist < CHUNKCOPY_CHUNK_SIZE) {
    storechunk(out, loadchunk(from));
    out += *dist;
    *len -= *dist;
    *dist += *dist;
  }
  return out;
}
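
/*
 * Illustrative trace: with *dist == 3 and *len == 30, each iteration stores
 * a full 16-byte chunk but only advances `out` by *dist, doubling the
 * distance each time (3 -> 6 -> 12 -> 24). After three iterations *dist is
 * 24 >= CHUNKCOPY_CHUNK_SIZE, so the remaining bytes can be handled by
 * chunkcopy_core() without a load chunk overlapping its store chunk.
 */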

#if defined(INFLATE_CHUNK_SIMD_NEON)
/*
 * v_load64_dup(): load *src as an unaligned 64-bit int and duplicate it in
 * every 64-bit component of the 128-bit result (64-bit int splat).
 */
static inline z_vec128i_t v_load64_dup(const void* src) {
  return vcombine_u8(vld1_u8(src), vld1_u8(src));
}

/*
 * v_load32_dup(): load *src as an unaligned 32-bit int and duplicate it in
 * every 32-bit component of the 128-bit result (32-bit int splat).
 */
static inline z_vec128i_t v_load32_dup(const void* src) {
  int32_t i32;
  Z_BUILTIN_MEMCPY(&i32, src, sizeof(i32));
  return vreinterpretq_u8_s32(vdupq_n_s32(i32));
}

/*
 * v_load16_dup(): load *src as an unaligned 16-bit int and duplicate it in
 * every 16-bit component of the 128-bit result (16-bit int splat).
 */
static inline z_vec128i_t v_load16_dup(const void* src) {
  int16_t i16;
  Z_BUILTIN_MEMCPY(&i16, src, sizeof(i16));
  return vreinterpretq_u8_s16(vdupq_n_s16(i16));
}

/*
 * v_load8_dup(): load the 8-bit int *src and duplicate it in every 8-bit
 * component of the 128-bit result (8-bit int splat).
 */
static inline z_vec128i_t v_load8_dup(const void* src) {
  return vld1q_dup_u8((const uint8_t*)src);
}

/*
 * v_store_128(): store the 128-bit vec in a memory destination (that might
 * not be 16-byte aligned) void* out.
 */
static inline void v_store_128(void* out, const z_vec128i_t vec) {
  vst1q_u8(out, vec);
}

#elif defined(INFLATE_CHUNK_SIMD_SSE2)
/*
 * v_load64_dup(): load *src as an unaligned 64-bit int and duplicate it in
 * every 64-bit component of the 128-bit result (64-bit int splat).
 */
static inline z_vec128i_t v_load64_dup(const void* src) {
  int64_t i64;
  Z_BUILTIN_MEMCPY(&i64, src, sizeof(i64));
  return _mm_set1_epi64x(i64);
}

/*
 * v_load32_dup(): load *src as an unaligned 32-bit int and duplicate it in
 * every 32-bit component of the 128-bit result (32-bit int splat).
 */
static inline z_vec128i_t v_load32_dup(const void* src) {
  int32_t i32;
  Z_BUILTIN_MEMCPY(&i32, src, sizeof(i32));
  return _mm_set1_epi32(i32);
}

/*
 * v_load16_dup(): load *src as an unaligned 16-bit int and duplicate it in
 * every 16-bit component of the 128-bit result (16-bit int splat).
 */
static inline z_vec128i_t v_load16_dup(const void* src) {
  int16_t i16;
  Z_BUILTIN_MEMCPY(&i16, src, sizeof(i16));
  return _mm_set1_epi16(i16);
}

/*
 * v_load8_dup(): load the 8-bit int *src and duplicate it in every 8-bit
 * component of the 128-bit result (8-bit int splat).
 */
static inline z_vec128i_t v_load8_dup(const void* src) {
  return _mm_set1_epi8(*(const char*)src);
}

/*
 * v_store_128(): store the 128-bit vec in a memory destination (that might
 * not be 16-byte aligned) void* out.
 */
static inline void v_store_128(void* out, const z_vec128i_t vec) {
  _mm_storeu_si128((__m128i*)out, vec);
}
#endif
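
/*
 * Whichever branch is compiled, the two back-ends expose an identical
 * interface: v_load{8,16,32,64}_dup() to splat a pattern across a 128-bit
 * vector and v_store_128() for unaligned stores. chunkset_core() below is
 * written once against this interface and stays independent of the ISA.
 */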

/*
 * Perform an overlapping copy which behaves as a memset() operation, but
 * supporting periods other than one, and assume that length is non-zero and
 * that it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE*3 bytes of output
 * even if the length is shorter than this.
 */
static inline unsigned char FAR* chunkset_core(
    unsigned char FAR* out,
    unsigned period,
    unsigned len) {
  z_vec128i_t v;
  const int bump = ((len - 1) % sizeof(v)) + 1;

  switch (period) {
    case 1:
      v = v_load8_dup(out - 1);
      v_store_128(out, v);
      out += bump;
      len -= bump;
      while (len > 0) {
        v_store_128(out, v);
        out += sizeof(v);
        len -= sizeof(v);
      }
      return out;
    case 2:
      v = v_load16_dup(out - 2);
      v_store_128(out, v);
      out += bump;
      len -= bump;
      if (len > 0) {
        v = v_load16_dup(out - 2);
        do {
          v_store_128(out, v);
          out += sizeof(v);
          len -= sizeof(v);
        } while (len > 0);
      }
      return out;
    case 4:
      v = v_load32_dup(out - 4);
      v_store_128(out, v);
      out += bump;
      len -= bump;
      if (len > 0) {
        v = v_load32_dup(out - 4);
        do {
          v_store_128(out, v);
          out += sizeof(v);
          len -= sizeof(v);
        } while (len > 0);
      }
      return out;
    case 8:
      v = v_load64_dup(out - 8);
      v_store_128(out, v);
      out += bump;
      len -= bump;
      if (len > 0) {
        v = v_load64_dup(out - 8);
        do {
          v_store_128(out, v);
          out += sizeof(v);
          len -= sizeof(v);
        } while (len > 0);
      }
      return out;
  }
  out = chunkunroll_relaxed(out, &period, &len);
  return chunkcopy_core(out, out - period, len);
}
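
/*
 * Illustrative trace for the period-2 case: with len == 10 the bump is
 * ((10 - 1) % 16) + 1 == 10, so the first 16-byte store covers all ten
 * output bytes (plus six bytes of permitted over-write) and the loop never
 * runs. When the bump is not a multiple of the period, the pattern's phase
 * shifts relative to the new `out`, which is why the vector is reloaded
 * from (out - period) before entering the main loop.
 */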

/*
 * Perform a memcpy-like operation, but assume that length is non-zero and that
 * it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE bytes of output even if
 * the length is shorter than this.
 *
 * Unlike chunkcopy_core() above, no guarantee is made regarding the behaviour
 * of overlapping buffers, regardless of the distance between the pointers.
 * This is reflected in the `restrict`-qualified pointers, allowing the
 * compiler to re-order loads and stores.
 */
static inline unsigned char FAR* chunkcopy_relaxed(
    unsigned char FAR* Z_RESTRICT out,
    const unsigned char FAR* Z_RESTRICT from,
    unsigned len) {
  return chunkcopy_core(out, from, len);
}

/*
 * Like chunkcopy_relaxed(), but avoid writing beyond the end of legal output.
 *
 * Unlike chunkcopy_core_safe() above, no guarantee is made regarding the
 * behaviour of overlapping buffers, regardless of the distance between the
 * pointers.  This is reflected in the `restrict`-qualified pointers, allowing
 * the compiler to re-order loads and stores.
 *
 * Accepts an additional pointer to the end of safe output.  A generic safe
 * copy would use (out + len), but it's normally the case that the end of the
 * output buffer is beyond the end of the current copy, and this can still be
 * exploited.
 */
static inline unsigned char FAR* chunkcopy_safe(
    unsigned char FAR* out,
    const unsigned char FAR* Z_RESTRICT from,
    unsigned len,
    unsigned char FAR* limit) {
  Assert(out + len <= limit, "chunk copy exceeds safety limit");
  return chunkcopy_core_safe(out, from, len, limit);
}

/*
 * Perform a chunky copy within the same buffer, where the source and
 * destination may potentially overlap.
 *
 * Assumes that len > 0 on entry, and that it's safe to write at least
 * CHUNKCOPY_CHUNK_SIZE*3 bytes to the output.
 */
static inline unsigned char FAR* chunkcopy_lapped_relaxed(
    unsigned char FAR* out,
    unsigned dist,
    unsigned len) {
  if (dist < len && dist < CHUNKCOPY_CHUNK_SIZE) {
    return chunkset_core(out, dist, len);
  }
  return chunkcopy_core(out, out - dist, len);
}

/*
 * Behave like chunkcopy_lapped_relaxed(), but avoid writing beyond the end
 * of legal output.
 *
 * Accepts an additional pointer to the end of safe output.  A generic safe
 * copy would use (out + len), but it's normally the case that the end of the
 * output buffer is beyond the end of the current copy, and this can still be
 * exploited.
 */
static inline unsigned char FAR* chunkcopy_lapped_safe(
    unsigned char FAR* out,
    unsigned dist,
    unsigned len,
    unsigned char FAR* limit) {
  Assert(out + len <= limit, "chunk copy exceeds safety limit");
  if ((limit - out) < (ptrdiff_t)(3 * CHUNKCOPY_CHUNK_SIZE)) {
    /* TODO(cavalcantii): try harder to optimise this */
    while (len-- > 0) {
      *out = *(out - dist);
      out++;
    }
    return out;
  }
  return chunkcopy_lapped_relaxed(out, dist, len);
}
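
/*
 * A minimal usage sketch (hypothetical caller, not part of this header):
 * resolving a DEFLATE length/distance pair into the output window, with
 * `limit` marking the end of writable output:
 *
 *   out = chunkcopy_lapped_safe(out, dist, len, limit);
 *
 * A caller that can guarantee at least 3 * CHUNKCOPY_CHUNK_SIZE writable
 * bytes at `out`, as inflate_fast() can (see chunkunroll_relaxed() above),
 * may call chunkcopy_lapped_relaxed(out, dist, len) directly and skip the
 * limit checks.
 */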

/*
 * The chunk-copy code above deals with writing the decoded DEFLATE data to
 * the output with SIMD methods to increase decode speed. Reading the input
 * to the DEFLATE decoder with a wide, SIMD method can also increase decode
 * speed. This option is supported on little endian machines, and reads the
 * input data in 64-bit (8 byte) chunks.
 */

#ifdef INFLATE_CHUNK_READ_64LE
/*
 * Buffer the input in a uint64_t (8 bytes) in the wide input reading case.
 */
typedef uint64_t inflate_holder_t;

/*
 * Ask the compiler to perform a wide, unaligned load of a uint64_t using a
 * machine instruction appropriate for the uint64_t type.
 */
static inline inflate_holder_t read64le(const unsigned char FAR *in) {
    inflate_holder_t input;
    Z_BUILTIN_MEMCPY(&input, in, sizeof(input));
    return input;
}
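
/*
 * Illustrative benefit, shown with one common bit-buffer refill idiom
 * (hypothetical; not necessarily the exact form the decoder uses): with
 * `hold` holding `bits` valid bits, a single wide load tops the buffer up
 * to at least 56 bits instead of appending one byte at a time:
 *
 *   hold |= read64le(in) << bits;
 *   in += (63 - bits) >> 3;
 *   bits |= 56;
 *
 * On little endian targets the memcpy in read64le() compiles to a single
 * 64-bit load instruction.
 */
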
#else
/*
 * Otherwise, buffer the input bits using zlib's default input buffer type.
 */
typedef unsigned long inflate_holder_t;

#endif /* INFLATE_CHUNK_READ_64LE */

#undef Z_STATIC_ASSERT
#undef Z_RESTRICT
#undef Z_BUILTIN_MEMCPY

#endif /* CHUNKCOPY_H */