/* chunkset_sse41.c -- SSE4 inline functions to copy small data chunks.
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "zbuild.h"

/* This requires SSE2 support. While it's implicit with SSE4, we can minimize
 * code size by sharing the chunkcopy functions, which will certainly compile
 * to identical machine code */
#if defined(X86_SSE41) && defined(X86_SSE2)
#include <immintrin.h>
#include "chunk_permute_table.h"

typedef __m128i chunk_t;

#define CHUNK_SIZE 16

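/* Signal to chunkset_tpl.h which primitives this file provides; with these
 * defined, the template should use the specialized functions below instead of
 * its generic fallbacks. */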
#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8
#define HAVE_CHUNK_MAG
#define HAVE_CHUNKCOPY
#define HAVE_CHUNKUNROLL

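/* Permutation lookup for GET_CHUNK_MAG, indexed by dist - 3 (the trailing
 * comment on each row is the distance it serves). idx is the offset of a
 * pshufb mask inside permute_table and remval is 16 % dist, the bytes of the
 * pattern left over after the last whole repeat in a 16 byte chunk. The
 * "don't care" rows are distances 4 and 8, which presumably never get here
 * because chunkmemset_4 and chunkmemset_8 cover them. */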
static const lut_rem_pair perm_idx_lut[13] = {
    {0, 1},      /* 3 */
    {0, 0},      /* don't care */
    {1 * 32, 1}, /* 5 */
    {2 * 32, 4}, /* 6 */
    {3 * 32, 2}, /* 7 */
    {0 * 32, 0}, /* don't care */
    {4 * 32, 7}, /* 9 */
    {5 * 32, 6}, /* 10 */
    {6 * 32, 5}, /* 11 */
    {7 * 32, 4}, /* 12 */
    {8 * 32, 3}, /* 13 */
    {9 * 32, 2}, /* 14 */
    {10 * 32, 1},/* 15 */
};


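/* Broadcast a 2, 4 or 8 byte pattern from 'from' across the whole 16 byte
 * chunk: zmemcpy_* performs the (possibly unaligned) scalar load and
 * _mm_set1_epi* splats it into every lane. */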
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
    int16_t tmp;
    zmemcpy_2(&tmp, from);
    *chunk = _mm_set1_epi16(tmp);
}

static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
    int32_t tmp;
    zmemcpy_4(&tmp, from);
    *chunk = _mm_set1_epi32(tmp);
}

static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
    int64_t tmp;
    zmemcpy_8(&tmp, from);
    *chunk = _mm_set1_epi64x(tmp);
}

static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
    *chunk = _mm_loadu_si128((__m128i *)s);
}

static inline void storechunk(uint8_t *out, chunk_t *chunk) {
    _mm_storeu_si128((__m128i *)out, *chunk);
}

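/* Build a chunk that repeats the dist byte pattern at 'buf': 16 bytes are
 * loaded and rearranged with pshufb using a mask from permute_table, so the
 * result holds as many whole copies of the pattern as fit in 16 bytes. The
 * leftover count (16 % dist) is written to *chunk_rem; judging by
 * perm_idx_lut, only distances 3..15 without a dedicated chunkmemset_*
 * handler reach here. */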
static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
    lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
    __m128i perm_vec, ret_vec;
#ifdef Z_MEMORY_SANITIZER
    /* Important to note:
     * This is _not_ to subvert the memory sanitizer but to instead unpoison some
     * bytes we willingly and purposefully load uninitialized and then swizzle over
     * in a vector register anyway. If what we assume about which bytes are used is
     * wrong, the memory sanitizer will still usefully flag it */
    __msan_unpoison(buf + dist, 16 - dist);
#endif
    ret_vec = _mm_loadu_si128((__m128i*)buf);
    *chunk_rem = lut_rem.remval;

    perm_vec = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
    ret_vec = _mm_shuffle_epi8(ret_vec, perm_vec);

    return ret_vec;
}

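/* Names under which chunkset_tpl.h instantiates the dispatchable functions for
 * this build. With HAVE_CHUNKCOPY and HAVE_CHUNKUNROLL defined above, CHUNKCOPY
 * and CHUNKUNROLL simply forward to the SSE2 versions, which (per the note at
 * the top of the file) compile to identical machine code. */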
extern uint8_t* chunkcopy_sse2(uint8_t *out, uint8_t const *from, unsigned len);

#define CHUNKSIZE            chunksize_sse41
#define CHUNKMEMSET          chunkmemset_sse41
#define CHUNKMEMSET_SAFE     chunkmemset_safe_sse41
#define CHUNKCOPY(a, b, c)   chunkcopy_sse2(a, b, c)
#define CHUNKUNROLL(a, b, c) chunkunroll_sse2(a, b, c)

#include "chunkset_tpl.h"

#endif