/* chunkset_sse41.c -- SSE4 inline functions to copy small data chunks.
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "zbuild.h"

/* This requires SSE2 support. While it's implicit with SSE4, we can minimize
 * code size by sharing the chunkcopy functions, which will certainly compile
 * to identical machine code */
#if defined(X86_SSE41) && defined(X86_SSE2)
#include <immintrin.h>
#include "chunk_permute_table.h"

typedef __m128i chunk_t;

#define CHUNK_SIZE 16

#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8
#define HAVE_CHUNK_MAG
#define HAVE_CHUNKCOPY
#define HAVE_CHUNKUNROLL

/* Shuffle-mask lookup table for GET_CHUNK_MAG, indexed by (dist - 3) for
 * distances 3..15.
 * - idx:    byte offset of the 32-byte permute row inside permute_table
 *           (from chunk_permute_table.h).
 * - remval: leftover bytes after filling a 16-byte chunk with whole copies
 *           of dist bytes, i.e. 16 % dist (matches every populated row).
 * Distances 4 and 8 never reach this table — they are served by
 * chunkmemset_4/chunkmemset_8 broadcasts — hence the "don't care" rows. */
static const lut_rem_pair perm_idx_lut[13] = {
    {0, 1},      /* 3 */
    {0, 0},      /* don't care */
    {1 * 32, 1}, /* 5 */
    {2 * 32, 4}, /* 6 */
    {3 * 32, 2}, /* 7 */
    {0 * 32, 0}, /* don't care */
    {4 * 32, 7}, /* 9 */
    {5 * 32, 6}, /* 10 */
    {6 * 32, 5}, /* 11 */
    {7 * 32, 4}, /* 12 */
    {8 * 32, 3}, /* 13 */
    {9 * 32, 2}, /* 14 */
    {10 * 32, 1},/* 15 */
};


/* Broadcast the 2 bytes at "from" into every 16-bit lane of *chunk. */
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
    int16_t half;
    zmemcpy_2(&half, from);
    chunk[0] = _mm_set1_epi16(half);
}

/* Broadcast the 4 bytes at "from" into every 32-bit lane of *chunk. */
static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
    int32_t word;
    zmemcpy_4(&word, from);
    chunk[0] = _mm_set1_epi32(word);
}

/* Broadcast the 8 bytes at "from" into both 64-bit lanes of *chunk. */
static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
    int64_t qword;
    zmemcpy_8(&qword, from);
    chunk[0] = _mm_set1_epi64x(qword);
}

/* Unaligned 16-byte load from s into *chunk. */
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
    chunk[0] = _mm_loadu_si128((const __m128i *)s);
}

/* Unaligned 16-byte store of *chunk to out. */
static inline void storechunk(uint8_t *out, chunk_t *chunk) {
    _mm_storeu_si128((__m128i *)out, chunk[0]);
}

GET_CHUNK_MAG(uint8_t * buf,uint32_t * chunk_rem,uint32_t dist)68 static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
69 lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
70 __m128i perm_vec, ret_vec;
71 #ifdef Z_MEMORY_SANITIZER
72 /* Important to note:
73 * This is _not_ to subvert the memory sanitizer but to instead unpoison some
74 * bytes we willingly and purposefully load unitialized that we swizzle over
75 * in a vector register, anyway. If what we assume is wrong about what is used,
76 * the memory sanitizer will still usefully flag it */
77 __msan_unpoison(buf + dist, 16 - dist);
78 #endif
79 ret_vec = _mm_loadu_si128((__m128i*)buf);
80 *chunk_rem = lut_rem.remval;
81
82 perm_vec = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
83 ret_vec = _mm_shuffle_epi8(ret_vec, perm_vec);
84
85 return ret_vec;
86 }
87
extern uint8_t* chunkcopy_sse2(uint8_t *out, uint8_t const *from, unsigned len);

#define CHUNKSIZE chunksize_sse41
#define CHUNKMEMSET chunkmemset_sse41
#define CHUNKMEMSET_SAFE chunkmemset_safe_sse41
#define CHUNKCOPY(a, b, c) chunkcopy_sse2(a, b, c)
#define CHUNKUNROLL(a, b, c) chunkunroll_sse2(a, b, c)

#include "chunkset_tpl.h"

#endif
