/* functable.c -- Choose relevant optimized functions at runtime
 * Copyright (C) 2017 Hans Kristian Rosbach
 * For conditions of distribution and use, see copyright notice in zlib.h
 */
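
/* The functable is a table of function pointers, one entry per tunable routine.
 * Each entry initially points at a *_stub function (see the initializer at the
 * bottom of this file). On its first call, a stub runs CPU feature detection,
 * overwrites its own functable entry with the best implementation available on
 * this machine, and then forwards the call to it. Every later call through the
 * functable therefore goes directly to the selected routine with no further
 * dispatch cost.
 */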

#include "zbuild.h"
#include "zendian.h"
#include "deflate.h"
#include "deflate_p.h"

#include "functable.h"

#ifdef X86_FEATURES
# include "fallback_builtins.h"
#endif

/* insert_string */
extern void insert_string_c(deflate_state *const s, const uint32_t str, uint32_t count);
#ifdef X86_SSE42_CRC_HASH
extern void insert_string_sse4(deflate_state *const s, const uint32_t str, uint32_t count);
#elif defined(ARM_ACLE_CRC_HASH)
extern void insert_string_acle(deflate_state *const s, const uint32_t str, uint32_t count);
#endif

/* quick_insert_string */
extern Pos quick_insert_string_c(deflate_state *const s, const uint32_t str);
#ifdef X86_SSE42_CRC_HASH
extern Pos quick_insert_string_sse4(deflate_state *const s, const uint32_t str);
#elif defined(ARM_ACLE_CRC_HASH)
extern Pos quick_insert_string_acle(deflate_state *const s, const uint32_t str);
#endif

/* slide_hash */
#ifdef X86_SSE2
void slide_hash_sse2(deflate_state *s);
#elif defined(ARM_NEON_SLIDEHASH)
void slide_hash_neon(deflate_state *s);
#elif defined(POWER8_VSX_SLIDEHASH)
void slide_hash_power8(deflate_state *s);
#endif
#ifdef X86_AVX2
void slide_hash_avx2(deflate_state *s);
#endif

/* adler32 */
extern uint32_t adler32_c(uint32_t adler, const unsigned char *buf, size_t len);
#ifdef ARM_NEON_ADLER32
extern uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t len);
#endif
#ifdef X86_SSSE3_ADLER32
extern uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, size_t len);
#endif
#ifdef X86_AVX2_ADLER32
extern uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_t len);
#endif
#ifdef POWER8_VSX_ADLER32
extern uint32_t adler32_power8(uint32_t adler, const unsigned char *buf, size_t len);
#endif

/* memory chunking */
extern uint32_t chunksize_c(void);
extern uint8_t* chunkcopy_c(uint8_t *out, uint8_t const *from, unsigned len);
extern uint8_t* chunkcopy_safe_c(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
extern uint8_t* chunkunroll_c(uint8_t *out, unsigned *dist, unsigned *len);
extern uint8_t* chunkmemset_c(uint8_t *out, unsigned dist, unsigned len);
extern uint8_t* chunkmemset_safe_c(uint8_t *out, unsigned dist, unsigned len, unsigned left);
#ifdef X86_SSE2_CHUNKSET
extern uint32_t chunksize_sse2(void);
extern uint8_t* chunkcopy_sse2(uint8_t *out, uint8_t const *from, unsigned len);
extern uint8_t* chunkcopy_safe_sse2(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
extern uint8_t* chunkunroll_sse2(uint8_t *out, unsigned *dist, unsigned *len);
extern uint8_t* chunkmemset_sse2(uint8_t *out, unsigned dist, unsigned len);
extern uint8_t* chunkmemset_safe_sse2(uint8_t *out, unsigned dist, unsigned len, unsigned left);
#endif
#ifdef ARM_NEON_CHUNKSET
extern uint32_t chunksize_neon(void);
extern uint8_t* chunkcopy_neon(uint8_t *out, uint8_t const *from, unsigned len);
extern uint8_t* chunkcopy_safe_neon(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
extern uint8_t* chunkunroll_neon(uint8_t *out, unsigned *dist, unsigned *len);
extern uint8_t* chunkmemset_neon(uint8_t *out, unsigned dist, unsigned len);
extern uint8_t* chunkmemset_safe_neon(uint8_t *out, unsigned dist, unsigned len, unsigned left);
#endif

/* CRC32 */
Z_INTERNAL uint32_t crc32_generic(uint32_t, const unsigned char *, uint64_t);

#ifdef ARM_ACLE_CRC_HASH
extern uint32_t crc32_acle(uint32_t, const unsigned char *, uint64_t);
#endif

#if BYTE_ORDER == LITTLE_ENDIAN
extern uint32_t crc32_little(uint32_t, const unsigned char *, uint64_t);
#elif BYTE_ORDER == BIG_ENDIAN
extern uint32_t crc32_big(uint32_t, const unsigned char *, uint64_t);
#endif

/* compare258 */
extern uint32_t compare258_c(const unsigned char *src0, const unsigned char *src1);
#ifdef UNALIGNED_OK
extern uint32_t compare258_unaligned_16(const unsigned char *src0, const unsigned char *src1);
extern uint32_t compare258_unaligned_32(const unsigned char *src0, const unsigned char *src1);
#ifdef UNALIGNED64_OK
extern uint32_t compare258_unaligned_64(const unsigned char *src0, const unsigned char *src1);
#endif
#ifdef X86_SSE42_CMP_STR
extern uint32_t compare258_unaligned_sse4(const unsigned char *src0, const unsigned char *src1);
#endif
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
extern uint32_t compare258_unaligned_avx2(const unsigned char *src0, const unsigned char *src1);
#endif
#endif

/* longest_match */
extern uint32_t longest_match_c(deflate_state *const s, Pos cur_match);
#ifdef UNALIGNED_OK
extern uint32_t longest_match_unaligned_16(deflate_state *const s, Pos cur_match);
extern uint32_t longest_match_unaligned_32(deflate_state *const s, Pos cur_match);
#ifdef UNALIGNED64_OK
extern uint32_t longest_match_unaligned_64(deflate_state *const s, Pos cur_match);
#endif
#ifdef X86_SSE42_CMP_STR
extern uint32_t longest_match_unaligned_sse4(deflate_state *const s, Pos cur_match);
#endif
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
extern uint32_t longest_match_unaligned_avx2(deflate_state *const s, Pos cur_match);
#endif
#endif

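/* Forward declaration so the stubs below can update the table; the table itself
 * is defined and initialized with the stub pointers at the end of this file. */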
Z_INTERNAL Z_TLS struct functable_s functable;

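/* Run the platform-specific CPU feature detection once; the static flag makes
 * repeated calls from the individual stubs cheap. */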
Z_INTERNAL void cpu_check_features(void)
{
    static int features_checked = 0;
    if (features_checked)
        return;
#if defined(X86_FEATURES)
    x86_check_features();
#elif defined(ARM_FEATURES)
    arm_check_features();
#elif defined(POWER_FEATURES)
    power_check_features();
#endif
    features_checked = 1;
}

/* stub functions */
Z_INTERNAL void insert_string_stub(deflate_state *const s, const uint32_t str, uint32_t count) {
    // Initialize default
    functable.insert_string = &insert_string_c;
    cpu_check_features();

#ifdef X86_SSE42_CRC_HASH
    if (x86_cpu_has_sse42)
        functable.insert_string = &insert_string_sse4;
#elif defined(ARM_ACLE_CRC_HASH)
    if (arm_cpu_has_crc32)
        functable.insert_string = &insert_string_acle;
#endif

    functable.insert_string(s, str, count);
}

Z_INTERNAL Pos quick_insert_string_stub(deflate_state *const s, const uint32_t str) {
    functable.quick_insert_string = &quick_insert_string_c;

#ifdef X86_SSE42_CRC_HASH
    if (x86_cpu_has_sse42)
        functable.quick_insert_string = &quick_insert_string_sse4;
#elif defined(ARM_ACLE_CRC_HASH)
    if (arm_cpu_has_crc32)
        functable.quick_insert_string = &quick_insert_string_acle;
#endif

    return functable.quick_insert_string(s, str);
}

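/* On x86-64, or when X86_NOCHECK_SSE2 is defined, SSE2 is part of the baseline
 * instruction set, so its runtime check is compiled out and the SSE2 variant is
 * selected unconditionally. The AVX2 and POWER8 variants still override it when
 * the corresponding CPU feature is detected. */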
Z_INTERNAL void slide_hash_stub(deflate_state *s) {

    functable.slide_hash = &slide_hash_c;
    cpu_check_features();

#ifdef X86_SSE2
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
# endif
        functable.slide_hash = &slide_hash_sse2;
#elif defined(ARM_NEON_SLIDEHASH)
# ifndef ARM_NOCHECK_NEON
    if (arm_cpu_has_neon)
# endif
        functable.slide_hash = &slide_hash_neon;
#endif
#ifdef X86_AVX2
    if (x86_cpu_has_avx2)
        functable.slide_hash = &slide_hash_avx2;
#endif
#ifdef POWER8_VSX_SLIDEHASH
    if (power_cpu_has_arch_2_07)
        functable.slide_hash = &slide_hash_power8;
#endif

    functable.slide_hash(s);
}

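/* Later assignments override earlier ones, so on x86 an AVX2-capable CPU ends up
 * with adler32_avx2 rather than adler32_ssse3. */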
Z_INTERNAL uint32_t adler32_stub(uint32_t adler, const unsigned char *buf, size_t len) {
    // Initialize default
    functable.adler32 = &adler32_c;
    cpu_check_features();

#ifdef ARM_NEON_ADLER32
# ifndef ARM_NOCHECK_NEON
    if (arm_cpu_has_neon)
# endif
        functable.adler32 = &adler32_neon;
#endif
#ifdef X86_SSSE3_ADLER32
    if (x86_cpu_has_ssse3)
        functable.adler32 = &adler32_ssse3;
#endif
#ifdef X86_AVX2_ADLER32
    if (x86_cpu_has_avx2)
        functable.adler32 = &adler32_avx2;
#endif
#ifdef POWER8_VSX_ADLER32
    if (power_cpu_has_arch_2_07)
        functable.adler32 = &adler32_power8;
#endif

    return functable.adler32(adler, buf, len);
}

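/* The chunk* stubs below all follow the same pattern: default to the portable C
 * implementation, then prefer the SSE2 variant (unconditionally on x86-64, where
 * SSE2 is always available) or the NEON variant when the CPU supports it. */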
Z_INTERNAL uint32_t chunksize_stub(void) {
    // Initialize default
    functable.chunksize = &chunksize_c;

#ifdef X86_SSE2_CHUNKSET
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
# endif
        functable.chunksize = &chunksize_sse2;
#endif
#ifdef ARM_NEON_CHUNKSET
    if (arm_cpu_has_neon)
        functable.chunksize = &chunksize_neon;
#endif

    return functable.chunksize();
}

Z_INTERNAL uint8_t* chunkcopy_stub(uint8_t *out, uint8_t const *from, unsigned len) {
    // Initialize default
    functable.chunkcopy = &chunkcopy_c;

#ifdef X86_SSE2_CHUNKSET
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
# endif
        functable.chunkcopy = &chunkcopy_sse2;
#endif
#ifdef ARM_NEON_CHUNKSET
    if (arm_cpu_has_neon)
        functable.chunkcopy = &chunkcopy_neon;
#endif

    return functable.chunkcopy(out, from, len);
}

Z_INTERNAL uint8_t* chunkcopy_safe_stub(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe) {
    // Initialize default
    functable.chunkcopy_safe = &chunkcopy_safe_c;

#ifdef X86_SSE2_CHUNKSET
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
# endif
        functable.chunkcopy_safe = &chunkcopy_safe_sse2;
#endif
#ifdef ARM_NEON_CHUNKSET
    if (arm_cpu_has_neon)
        functable.chunkcopy_safe = &chunkcopy_safe_neon;
#endif

    return functable.chunkcopy_safe(out, from, len, safe);
}

Z_INTERNAL uint8_t* chunkunroll_stub(uint8_t *out, unsigned *dist, unsigned *len) {
    // Initialize default
    functable.chunkunroll = &chunkunroll_c;

#ifdef X86_SSE2_CHUNKSET
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
# endif
        functable.chunkunroll = &chunkunroll_sse2;
#endif
#ifdef ARM_NEON_CHUNKSET
    if (arm_cpu_has_neon)
        functable.chunkunroll = &chunkunroll_neon;
#endif

    return functable.chunkunroll(out, dist, len);
}

Z_INTERNAL uint8_t* chunkmemset_stub(uint8_t *out, unsigned dist, unsigned len) {
    // Initialize default
    functable.chunkmemset = &chunkmemset_c;

#ifdef X86_SSE2_CHUNKSET
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
# endif
        functable.chunkmemset = &chunkmemset_sse2;
#endif
#ifdef ARM_NEON_CHUNKSET
    if (arm_cpu_has_neon)
        functable.chunkmemset = &chunkmemset_neon;
#endif

    return functable.chunkmemset(out, dist, len);
}

Z_INTERNAL uint8_t* chunkmemset_safe_stub(uint8_t *out, unsigned dist, unsigned len, unsigned left) {
    // Initialize default
    functable.chunkmemset_safe = &chunkmemset_safe_c;

#ifdef X86_SSE2_CHUNKSET
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
# endif
        functable.chunkmemset_safe = &chunkmemset_safe_sse2;
#endif
#ifdef ARM_NEON_CHUNKSET
    if (arm_cpu_has_neon)
        functable.chunkmemset_safe = &chunkmemset_safe_neon;
#endif

    return functable.chunkmemset_safe(out, dist, len, left);
}

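/* crc32: when pointers and ptrdiff_t have the same width, pick the endian-specific
 * implementation (crc32_little or crc32_big), upgraded to the ACLE variant on ARM
 * CPUs with the CRC32 extension; otherwise fall back to the generic byte-wise
 * crc32_generic. */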
Z_INTERNAL uint32_t crc32_stub(uint32_t crc, const unsigned char *buf, uint64_t len) {

    Assert(sizeof(uint64_t) >= sizeof(size_t),
           "crc32_z takes size_t but internally we have a uint64_t len");
    /* return a function pointer for optimized arches here after a capability test */

    cpu_check_features();

    if (sizeof(void *) == sizeof(ptrdiff_t)) {
#if BYTE_ORDER == LITTLE_ENDIAN
        functable.crc32 = crc32_little;
# if defined(ARM_ACLE_CRC_HASH)
        if (arm_cpu_has_crc32)
            functable.crc32 = crc32_acle;
# endif
#elif BYTE_ORDER == BIG_ENDIAN
        functable.crc32 = crc32_big;
#else
# error No endian defined
#endif
    } else {
        functable.crc32 = crc32_generic;
    }

    return functable.crc32(crc, buf, len);
}

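/* compare258/longest_match: the unaligned-load variants are chosen at compile time
 * based on UNALIGNED_OK/UNALIGNED64_OK and the availability of the ctz builtins,
 * then upgraded at runtime to the SSE4.2 or AVX2 versions when the CPU supports
 * them. */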
Z_INTERNAL uint32_t compare258_stub(const unsigned char *src0, const unsigned char *src1) {

    functable.compare258 = &compare258_c;

#ifdef UNALIGNED_OK
# if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
    functable.compare258 = &compare258_unaligned_64;
# elif defined(HAVE_BUILTIN_CTZ)
    functable.compare258 = &compare258_unaligned_32;
# else
    functable.compare258 = &compare258_unaligned_16;
# endif
# ifdef X86_SSE42_CMP_STR
    if (x86_cpu_has_sse42)
        functable.compare258 = &compare258_unaligned_sse4;
# endif
# if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
    if (x86_cpu_has_avx2)
        functable.compare258 = &compare258_unaligned_avx2;
# endif
#endif

    return functable.compare258(src0, src1);
}

Z_INTERNAL uint32_t longest_match_stub(deflate_state *const s, Pos cur_match) {

    functable.longest_match = &longest_match_c;

#ifdef UNALIGNED_OK
# if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
    functable.longest_match = &longest_match_unaligned_64;
# elif defined(HAVE_BUILTIN_CTZ)
    functable.longest_match = &longest_match_unaligned_32;
# else
    functable.longest_match = &longest_match_unaligned_16;
# endif
# ifdef X86_SSE42_CMP_STR
    if (x86_cpu_has_sse42)
        functable.longest_match = &longest_match_unaligned_sse4;
# endif
# if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
    if (x86_cpu_has_avx2)
        functable.longest_match = &longest_match_unaligned_avx2;
# endif
#endif

    return functable.longest_match(s, cur_match);
}

/* functable init */
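/* Positional initializer: every entry starts out pointing at its stub, and the
 * order of the entries must match the member order of struct functable_s in
 * functable.h. */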
Z_INTERNAL Z_TLS struct functable_s functable = {
    insert_string_stub,
    quick_insert_string_stub,
    adler32_stub,
    crc32_stub,
    slide_hash_stub,
    compare258_stub,
    longest_match_stub,
    chunksize_stub,
    chunkcopy_stub,
    chunkcopy_safe_stub,
    chunkunroll_stub,
    chunkmemset_stub,
    chunkmemset_safe_stub
};