/* functable.c -- Choose relevant optimized functions at runtime
 * Copyright (C) 2017 Hans Kristian Rosbach
 * For conditions of distribution and use, see copyright notice in zlib.h
 *
 * Overview:
 *   `functable` is a table of function pointers. Every entry initially points
 *   at a `*_stub` function defined in this file (see the initializer at the
 *   bottom). The first call through an entry lands in its stub, which
 *     1. stores the portable C implementation in the entry,
 *     2. runs cpu_check_features() so the runtime feature flags are populated,
 *     3. overwrites the entry with an optimized variant when that variant was
 *        compiled in and the matching runtime flag is set, and
 *     4. forwards the current call through the updated entry.
 *   Later calls through the same entry bypass the stub.
 */

#include "zbuild.h"
#include "zendian.h"
#include "deflate.h"
#include "deflate_p.h"

#include "functable.h"

#ifdef X86_FEATURES
#  include "fallback_builtins.h"
#endif

/* insert_string */
extern void insert_string_c(deflate_state *const s, const uint32_t str, uint32_t count);
#ifdef X86_SSE42_CRC_HASH
extern void insert_string_sse4(deflate_state *const s, const uint32_t str, uint32_t count);
#elif defined(ARM_ACLE_CRC_HASH)
extern void insert_string_acle(deflate_state *const s, const uint32_t str, uint32_t count);
#endif

/* quick_insert_string */
extern Pos quick_insert_string_c(deflate_state *const s, const uint32_t str);
#ifdef X86_SSE42_CRC_HASH
extern Pos quick_insert_string_sse4(deflate_state *const s, const uint32_t str);
#elif defined(ARM_ACLE_CRC_HASH)
extern Pos quick_insert_string_acle(deflate_state *const s, const uint32_t str);
#endif

/* slide_hash */
#ifdef X86_SSE2
void slide_hash_sse2(deflate_state *s);
#elif defined(ARM_NEON_SLIDEHASH)
void slide_hash_neon(deflate_state *s);
#elif defined(POWER8_VSX_SLIDEHASH)
void slide_hash_power8(deflate_state *s);
#endif
#ifdef X86_AVX2
void slide_hash_avx2(deflate_state *s);
#endif

/* adler32 */
extern uint32_t adler32_c(uint32_t adler, const unsigned char *buf, size_t len);
#ifdef ARM_NEON_ADLER32
extern uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t len);
#endif
#ifdef X86_SSSE3_ADLER32
extern uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, size_t len);
#endif
#ifdef X86_AVX2_ADLER32
extern uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_t len);
#endif
#ifdef POWER8_VSX_ADLER32
extern uint32_t adler32_power8(uint32_t adler, const unsigned char *buf, size_t len);
#endif

/* memory chunking */
extern uint32_t chunksize_c(void);
extern uint8_t* chunkcopy_c(uint8_t *out, uint8_t const *from, unsigned len);
extern uint8_t* chunkcopy_safe_c(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
extern uint8_t* chunkunroll_c(uint8_t *out, unsigned *dist, unsigned *len);
extern uint8_t* chunkmemset_c(uint8_t *out, unsigned dist, unsigned len);
extern uint8_t* chunkmemset_safe_c(uint8_t *out, unsigned dist, unsigned len, unsigned left);
#ifdef X86_SSE2_CHUNKSET
extern uint32_t chunksize_sse2(void);
extern uint8_t* chunkcopy_sse2(uint8_t *out, uint8_t const *from, unsigned len);
extern uint8_t* chunkcopy_safe_sse2(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
extern uint8_t* chunkunroll_sse2(uint8_t *out, unsigned *dist, unsigned *len);
extern uint8_t* chunkmemset_sse2(uint8_t *out, unsigned dist, unsigned len);
extern uint8_t* chunkmemset_safe_sse2(uint8_t *out, unsigned dist, unsigned len, unsigned left);
#endif
#ifdef ARM_NEON_CHUNKSET
extern uint32_t chunksize_neon(void);
extern uint8_t* chunkcopy_neon(uint8_t *out, uint8_t const *from, unsigned len);
extern uint8_t* chunkcopy_safe_neon(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
extern uint8_t* chunkunroll_neon(uint8_t *out, unsigned *dist, unsigned *len);
extern uint8_t* chunkmemset_neon(uint8_t *out, unsigned dist, unsigned len);
extern uint8_t* chunkmemset_safe_neon(uint8_t *out, unsigned dist, unsigned len, unsigned left);
#endif

/* CRC32 */
Z_INTERNAL uint32_t crc32_generic(uint32_t, const unsigned char *, uint64_t);

#ifdef ARM_ACLE_CRC_HASH
extern uint32_t crc32_acle(uint32_t, const unsigned char *, uint64_t);
#endif

#if BYTE_ORDER == LITTLE_ENDIAN
extern uint32_t crc32_little(uint32_t, const unsigned char *, uint64_t);
#elif BYTE_ORDER == BIG_ENDIAN
extern uint32_t crc32_big(uint32_t, const unsigned char *, uint64_t);
#endif

/* compare258 */
extern uint32_t compare258_c(const unsigned char *src0, const unsigned char *src1);
#ifdef UNALIGNED_OK
extern uint32_t compare258_unaligned_16(const unsigned char *src0, const unsigned char *src1);
extern uint32_t compare258_unaligned_32(const unsigned char *src0, const unsigned char *src1);
#ifdef UNALIGNED64_OK
extern uint32_t compare258_unaligned_64(const unsigned char *src0, const unsigned char *src1);
#endif
#ifdef X86_SSE42_CMP_STR
extern uint32_t compare258_unaligned_sse4(const unsigned char *src0, const unsigned char *src1);
#endif
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
extern uint32_t compare258_unaligned_avx2(const unsigned char *src0, const unsigned char *src1);
#endif
#endif

/* longest_match */
extern uint32_t longest_match_c(deflate_state *const s, Pos cur_match);
#ifdef UNALIGNED_OK
extern uint32_t longest_match_unaligned_16(deflate_state *const s, Pos cur_match);
extern uint32_t longest_match_unaligned_32(deflate_state *const s, Pos cur_match);
#ifdef UNALIGNED64_OK
extern uint32_t longest_match_unaligned_64(deflate_state *const s, Pos cur_match);
#endif
#ifdef X86_SSE42_CMP_STR
extern uint32_t longest_match_unaligned_sse4(deflate_state *const s, Pos cur_match);
#endif
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
extern uint32_t longest_match_unaligned_avx2(deflate_state *const s, Pos cur_match);
#endif
#endif

/* Tentative definition so the stubs below can refer to the table; the
 * initialized definition is at the end of this file. */
Z_INTERNAL Z_TLS struct functable_s functable;

/* Run the architecture-specific CPU feature detection at most once.
 *
 * Dispatches to x86_check_features(), arm_check_features() or
 * power_check_features() depending on which *_FEATURES macro is defined, then
 * records that detection has been done so later calls return immediately.
 *
 * NOTE(review): features_checked is a plain function-local static with no
 * synchronization; concurrent first calls may both run detection. Confirm the
 * per-architecture check routines tolerate being run more than once.
 */
Z_INTERNAL void cpu_check_features(void) {
    static int features_checked = 0;
    if (features_checked)
        return;
#if defined(X86_FEATURES)
    x86_check_features();
#elif defined(ARM_FEATURES)
    arm_check_features();
#elif defined(POWER_FEATURES)
    power_check_features();
#endif
    features_checked = 1;
}

/* stub functions
 *
 * Every stub calls cpu_check_features() before it reads any runtime feature
 * flag. Any stub may be the first one reached, so none of them may assume that
 * another stub has already triggered detection.
 */

/* Resolve functable.insert_string, then forward (s, str, count) to it. */
Z_INTERNAL void insert_string_stub(deflate_state *const s, const uint32_t str, uint32_t count) {
    // Initialize default
    functable.insert_string = &insert_string_c;
    cpu_check_features();

#ifdef X86_SSE42_CRC_HASH
    if (x86_cpu_has_sse42)
        functable.insert_string = &insert_string_sse4;
#elif defined(ARM_ACLE_CRC_HASH)
    if (arm_cpu_has_crc32)
        functable.insert_string = &insert_string_acle;
#endif

    functable.insert_string(s, str, count);
}

/* Resolve functable.quick_insert_string, then forward (s, str) to it and
 * return its result. */
Z_INTERNAL Pos quick_insert_string_stub(deflate_state *const s, const uint32_t str) {
    // Initialize default
    functable.quick_insert_string = &quick_insert_string_c;
    cpu_check_features();

#ifdef X86_SSE42_CRC_HASH
    if (x86_cpu_has_sse42)
        functable.quick_insert_string = &quick_insert_string_sse4;
#elif defined(ARM_ACLE_CRC_HASH)
    if (arm_cpu_has_crc32)
        functable.quick_insert_string = &quick_insert_string_acle;
#endif

    return functable.quick_insert_string(s, str);
}

/* Resolve functable.slide_hash, then forward (s) to it.
 *
 * The SSE2 runtime test is compiled out on x86-64 and when X86_NOCHECK_SSE2 is
 * defined; the NEON runtime test is compiled out when ARM_NOCHECK_NEON is
 * defined. In those configurations the optimized variant is selected
 * unconditionally. Later checks (AVX2, POWER8) override earlier selections.
 */
Z_INTERNAL void slide_hash_stub(deflate_state *s) {
    // Initialize default
    functable.slide_hash = &slide_hash_c;
    cpu_check_features();

#ifdef X86_SSE2
#  if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
#  endif
        functable.slide_hash = &slide_hash_sse2;
#elif defined(ARM_NEON_SLIDEHASH)
#  ifndef ARM_NOCHECK_NEON
    if (arm_cpu_has_neon)
#  endif
        functable.slide_hash = &slide_hash_neon;
#endif
#ifdef X86_AVX2
    if (x86_cpu_has_avx2)
        functable.slide_hash = &slide_hash_avx2;
#endif
#ifdef POWER8_VSX_SLIDEHASH
    if (power_cpu_has_arch_2_07)
        functable.slide_hash = &slide_hash_power8;
#endif

    functable.slide_hash(s);
}

/* Resolve functable.adler32, then forward (adler, buf, len) to it and return
 * its result. Later checks override earlier selections (AVX2 over SSSE3). */
Z_INTERNAL uint32_t adler32_stub(uint32_t adler, const unsigned char *buf, size_t len) {
    // Initialize default
    functable.adler32 = &adler32_c;
    cpu_check_features();

#ifdef ARM_NEON_ADLER32
#  ifndef ARM_NOCHECK_NEON
    if (arm_cpu_has_neon)
#  endif
        functable.adler32 = &adler32_neon;
#endif
#ifdef X86_SSSE3_ADLER32
    if (x86_cpu_has_ssse3)
        functable.adler32 = &adler32_ssse3;
#endif
#ifdef X86_AVX2_ADLER32
    if (x86_cpu_has_avx2)
        functable.adler32 = &adler32_avx2;
#endif
#ifdef POWER8_VSX_ADLER32
    if (power_cpu_has_arch_2_07)
        functable.adler32 = &adler32_power8;
#endif

    return functable.adler32(adler, buf, len);
}

/* Resolve functable.chunksize, then call it and return its result. */
Z_INTERNAL uint32_t chunksize_stub(void) {
    // Initialize default
    functable.chunksize = &chunksize_c;
    cpu_check_features();

#ifdef X86_SSE2_CHUNKSET
#  if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
#  endif
        functable.chunksize = &chunksize_sse2;
#endif
#ifdef ARM_NEON_CHUNKSET
    if (arm_cpu_has_neon)
        functable.chunksize = &chunksize_neon;
#endif

    return functable.chunksize();
}

/* Resolve functable.chunkcopy, then forward (out, from, len) to it and return
 * its result. */
Z_INTERNAL uint8_t* chunkcopy_stub(uint8_t *out, uint8_t const *from, unsigned len) {
    // Initialize default
    functable.chunkcopy = &chunkcopy_c;
    cpu_check_features();

#ifdef X86_SSE2_CHUNKSET
#  if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
#  endif
        functable.chunkcopy = &chunkcopy_sse2;
#endif
#ifdef ARM_NEON_CHUNKSET
    if (arm_cpu_has_neon)
        functable.chunkcopy = &chunkcopy_neon;
#endif

    return functable.chunkcopy(out, from, len);
}

/* Resolve functable.chunkcopy_safe, then forward (out, from, len, safe) to it
 * and return its result. */
Z_INTERNAL uint8_t* chunkcopy_safe_stub(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe) {
    // Initialize default
    functable.chunkcopy_safe = &chunkcopy_safe_c;
    cpu_check_features();

#ifdef X86_SSE2_CHUNKSET
#  if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
#  endif
        functable.chunkcopy_safe = &chunkcopy_safe_sse2;
#endif
#ifdef ARM_NEON_CHUNKSET
    if (arm_cpu_has_neon)
        functable.chunkcopy_safe = &chunkcopy_safe_neon;
#endif

    return functable.chunkcopy_safe(out, from, len, safe);
}

/* Resolve functable.chunkunroll, then forward (out, dist, len) to it and
 * return its result. */
Z_INTERNAL uint8_t* chunkunroll_stub(uint8_t *out, unsigned *dist, unsigned *len) {
    // Initialize default
    functable.chunkunroll = &chunkunroll_c;
    cpu_check_features();

#ifdef X86_SSE2_CHUNKSET
#  if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
#  endif
        functable.chunkunroll = &chunkunroll_sse2;
#endif
#ifdef ARM_NEON_CHUNKSET
    if (arm_cpu_has_neon)
        functable.chunkunroll = &chunkunroll_neon;
#endif

    return functable.chunkunroll(out, dist, len);
}

/* Resolve functable.chunkmemset, then forward (out, dist, len) to it and
 * return its result. */
Z_INTERNAL uint8_t* chunkmemset_stub(uint8_t *out, unsigned dist, unsigned len) {
    // Initialize default
    functable.chunkmemset = &chunkmemset_c;
    cpu_check_features();

#ifdef X86_SSE2_CHUNKSET
#  if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
#  endif
        functable.chunkmemset = &chunkmemset_sse2;
#endif
#ifdef ARM_NEON_CHUNKSET
    if (arm_cpu_has_neon)
        functable.chunkmemset = &chunkmemset_neon;
#endif

    return functable.chunkmemset(out, dist, len);
}

/* Resolve functable.chunkmemset_safe, then forward (out, dist, len, left) to
 * it and return its result. */
Z_INTERNAL uint8_t* chunkmemset_safe_stub(uint8_t *out, unsigned dist, unsigned len, unsigned left) {
    // Initialize default
    functable.chunkmemset_safe = &chunkmemset_safe_c;
    cpu_check_features();

#ifdef X86_SSE2_CHUNKSET
#  if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
#  endif
        functable.chunkmemset_safe = &chunkmemset_safe_sse2;
#endif
#ifdef ARM_NEON_CHUNKSET
    if (arm_cpu_has_neon)
        functable.chunkmemset_safe = &chunkmemset_safe_neon;
#endif

    return functable.chunkmemset_safe(out, dist, len, left);
}

/* Resolve functable.crc32, then forward (crc, buf, len) to it and return its
 * result.
 *
 * When sizeof(void *) equals sizeof(ptrdiff_t) the endian-specific
 * implementation is chosen (optionally replaced by the ACLE variant on
 * little-endian builds when arm_cpu_has_crc32 is set); otherwise
 * crc32_generic is used.
 */
Z_INTERNAL uint32_t crc32_stub(uint32_t crc, const unsigned char *buf, uint64_t len) {
    Assert(sizeof(uint64_t) >= sizeof(size_t), "crc32_z takes size_t but internally we have a uint64_t len");
    /* return a function pointer for optimized arches here after a capability test */

    cpu_check_features();

    if (sizeof(void *) == sizeof(ptrdiff_t)) {
#if BYTE_ORDER == LITTLE_ENDIAN
        functable.crc32 = crc32_little;
#  if defined(ARM_ACLE_CRC_HASH)
        if (arm_cpu_has_crc32)
            functable.crc32 = crc32_acle;
#  endif
#elif BYTE_ORDER == BIG_ENDIAN
        functable.crc32 = crc32_big;
#else
#  error No endian defined
#endif
    } else {
        functable.crc32 = crc32_generic;
    }

    return functable.crc32(crc, buf, len);
}

/* Resolve functable.compare258, then forward (src0, src1) to it and return
 * its result.
 *
 * With UNALIGNED_OK the widest compiled-in unaligned variant replaces the C
 * fallback at compile time; the SSE4.2 and AVX2 variants are then selected on
 * top of that according to the runtime flags (AVX2 wins over SSE4.2).
 */
Z_INTERNAL uint32_t compare258_stub(const unsigned char *src0, const unsigned char *src1) {
    // Initialize default
    functable.compare258 = &compare258_c;
    cpu_check_features();

#ifdef UNALIGNED_OK
#  if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
    functable.compare258 = &compare258_unaligned_64;
#  elif defined(HAVE_BUILTIN_CTZ)
    functable.compare258 = &compare258_unaligned_32;
#  else
    functable.compare258 = &compare258_unaligned_16;
#  endif
#  ifdef X86_SSE42_CMP_STR
    if (x86_cpu_has_sse42)
        functable.compare258 = &compare258_unaligned_sse4;
#  endif
#  if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
    if (x86_cpu_has_avx2)
        functable.compare258 = &compare258_unaligned_avx2;
#  endif
#endif

    return functable.compare258(src0, src1);
}

/* Resolve functable.longest_match, then forward (s, cur_match) to it and
 * return its result. Selection order mirrors compare258_stub. */
Z_INTERNAL uint32_t longest_match_stub(deflate_state *const s, Pos cur_match) {
    // Initialize default
    functable.longest_match = &longest_match_c;
    cpu_check_features();

#ifdef UNALIGNED_OK
#  if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
    functable.longest_match = &longest_match_unaligned_64;
#  elif defined(HAVE_BUILTIN_CTZ)
    functable.longest_match = &longest_match_unaligned_32;
#  else
    functable.longest_match = &longest_match_unaligned_16;
#  endif
#  ifdef X86_SSE42_CMP_STR
    if (x86_cpu_has_sse42)
        functable.longest_match = &longest_match_unaligned_sse4;
#  endif
#  if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
    if (x86_cpu_has_avx2)
        functable.longest_match = &longest_match_unaligned_avx2;
#  endif
#endif

    return functable.longest_match(s, cur_match);
}

/* functable init
 *
 * Every entry starts out as its stub. The initializer is positional, so the
 * order here must match the member order of struct functable_s.
 */
Z_INTERNAL Z_TLS struct functable_s functable = {
    insert_string_stub,
    quick_insert_string_stub,
    adler32_stub,
    crc32_stub,
    slide_hash_stub,
    compare258_stub,
    longest_match_stub,
    chunksize_stub,
    chunkcopy_stub,
    chunkcopy_safe_stub,
    chunkunroll_stub,
    chunkmemset_stub,
    chunkmemset_safe_stub
};