/*
 * Copyright 2016 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#ifndef SkChecksum_opts_DEFINED
#define SkChecksum_opts_DEFINED

#include "include/core/SkTypes.h"
#include "include/private/SkChecksum.h"
#include "src/core/SkUtils.h" // sk_unaligned_load

#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42
    #include <immintrin.h>
#elif defined(SK_ARM_HAS_CRC32)
    #include <arm_acle.h>
#endif

namespace SK_OPTS_NS {

#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42 && (defined(__x86_64__) || defined(_M_X64))
    // This is not a CRC32.  It's Just A Hash that uses those instructions because they're fast.
    /*not static*/ inline uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t seed) {
        auto data = (const uint8_t*)vdata;

        // _mm_crc32_u64() operates on 64-bit registers, so we use uint64_t for a while.
        uint64_t hash = seed;
        if (bytes >= 24) {
            // We'll create 3 independent hashes, each using _mm_crc32_u64()
            // to hash 8 bytes per step.  Both 3 and independent are important:
            // we can execute 3 of these instructions in parallel on a single core.
            uint64_t a = hash,
                     b = hash,
                     c = hash;
            size_t steps = bytes/24;
            while (steps --> 0) {
                a = _mm_crc32_u64(a, sk_unaligned_load<uint64_t>(data+ 0));
                b = _mm_crc32_u64(b, sk_unaligned_load<uint64_t>(data+ 8));
                c = _mm_crc32_u64(c, sk_unaligned_load<uint64_t>(data+16));
                data += 24;
            }
            bytes %= 24;
            hash = _mm_crc32_u32(a, _mm_crc32_u32(b, c));
        }

        SkASSERT(bytes < 24);
        if (bytes >= 16) {
            hash = _mm_crc32_u64(hash, sk_unaligned_load<uint64_t>(data));
            bytes -= 8;
            data  += 8;
        }

        SkASSERT(bytes < 16);
        if (bytes & 8) {
            hash = _mm_crc32_u64(hash, sk_unaligned_load<uint64_t>(data));
            data += 8;
        }

        // The remaining _mm_crc32_u*() calls operate on a 32-bit register.
        // We don't lose anything here: only the bottom 32 bits were populated.
        auto hash32 = (uint32_t)hash;

        if (bytes & 4) {
            hash32 = _mm_crc32_u32(hash32, sk_unaligned_load<uint32_t>(data));
            data += 4;
        }
        if (bytes & 2) {
            hash32 = _mm_crc32_u16(hash32, sk_unaligned_load<uint16_t>(data));
            data += 2;
        }
        if (bytes & 1) {
            hash32 = _mm_crc32_u8(hash32, sk_unaligned_load<uint8_t>(data));
        }
        return hash32;
    }

#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42
    // 32-bit version of the above, using _mm_crc32_u32() but not _mm_crc32_u64().
    /*not static*/ inline uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t hash) {
        auto data = (const uint8_t*)vdata;

        if (bytes >= 12) {
            // We'll create 3 independent hashes, each using _mm_crc32_u32()
            // to hash 4 bytes per step.  Both 3 and independent are important:
            // we can execute 3 of these instructions in parallel on a single core.
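            // (Rough intuition: on typical x86 cores the crc32 instruction takes a few
            //  cycles to produce its result but a new one can start roughly every cycle,
            //  so three independent dependency chains hide most of that latency.)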
            uint32_t a = hash,
                     b = hash,
                     c = hash;
            size_t steps = bytes/12;
            while (steps --> 0) {
                a = _mm_crc32_u32(a, sk_unaligned_load<uint32_t>(data+0));
                b = _mm_crc32_u32(b, sk_unaligned_load<uint32_t>(data+4));
                c = _mm_crc32_u32(c, sk_unaligned_load<uint32_t>(data+8));
                data += 12;
            }
            bytes %= 12;
            hash = _mm_crc32_u32(a, _mm_crc32_u32(b, c));
        }

        SkASSERT(bytes < 12);
        if (bytes >= 8) {
            hash = _mm_crc32_u32(hash, sk_unaligned_load<uint32_t>(data));
            bytes -= 4;
            data  += 4;
        }

        SkASSERT(bytes < 8);
        if (bytes & 4) {
            hash = _mm_crc32_u32(hash, sk_unaligned_load<uint32_t>(data));
            data += 4;
        }
        if (bytes & 2) {
            hash = _mm_crc32_u16(hash, sk_unaligned_load<uint16_t>(data));
            data += 2;
        }
        if (bytes & 1) {
            hash = _mm_crc32_u8(hash, sk_unaligned_load<uint8_t>(data));
        }
        return hash;
    }

#elif defined(SK_ARM_HAS_CRC32)
    /*not static*/ inline uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t hash) {
        auto data = (const uint8_t*)vdata;
        if (bytes >= 24) {
            uint32_t a = hash,
                     b = hash,
                     c = hash;
            size_t steps = bytes/24;
            while (steps --> 0) {
                a = __crc32d(a, sk_unaligned_load<uint64_t>(data+ 0));
                b = __crc32d(b, sk_unaligned_load<uint64_t>(data+ 8));
                c = __crc32d(c, sk_unaligned_load<uint64_t>(data+16));
                data += 24;
            }
            bytes %= 24;
            hash = __crc32w(a, __crc32w(b, c));
        }

        SkASSERT(bytes < 24);
        if (bytes >= 16) {
            hash = __crc32d(hash, sk_unaligned_load<uint64_t>(data));
            bytes -= 8;
            data  += 8;
        }

        SkASSERT(bytes < 16);
        if (bytes & 8) {
            hash = __crc32d(hash, sk_unaligned_load<uint64_t>(data));
            data += 8;
        }
        if (bytes & 4) {
            hash = __crc32w(hash, sk_unaligned_load<uint32_t>(data));
            data += 4;
        }
        if (bytes & 2) {
            hash = __crc32h(hash, sk_unaligned_load<uint16_t>(data));
            data += 2;
        }
        if (bytes & 1) {
            hash = __crc32b(hash, sk_unaligned_load<uint8_t>(data));
        }
        return hash;
    }

#else
    // This is Murmur3.
    /*not static*/ inline uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t hash) {
        auto data = (const uint8_t*)vdata;

        size_t original_bytes = bytes;

        // Handle 4 bytes at a time while possible.
        while (bytes >= 4) {
            uint32_t k = sk_unaligned_load<uint32_t>(data);
            k *= 0xcc9e2d51;
            k = (k << 15) | (k >> 17);
            k *= 0x1b873593;

            hash ^= k;
            hash = (hash << 13) | (hash >> 19);
            hash *= 5;
            hash += 0xe6546b64;

            bytes -= 4;
            data  += 4;
        }

        // Handle last 0-3 bytes.
        uint32_t k = 0;
        switch (bytes & 3) {
            case 3: k ^= data[2] << 16;
            case 2: k ^= data[1] <<  8;
            case 1: k ^= data[0] <<  0;
                    k *= 0xcc9e2d51;
                    k = (k << 15) | (k >> 17);
                    k *= 0x1b873593;
                    hash ^= k;
        }

        hash ^= original_bytes;
        return SkChecksum::Mix(hash);
    }
#endif

}  // namespace SK_OPTS_NS

#endif//SkChecksum_opts_DEFINED
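
// A minimal usage sketch (illustrative only, not part of this header): hash_fn()
// hashes an arbitrary byte range with a caller-supplied seed.  The wrapper name
// and the choice of seed below are hypothetical; in Skia the selected hash_fn
// variant is normally reached through the runtime CPU-dispatch layer rather
// than by calling SK_OPTS_NS::hash_fn directly.
//
//     uint32_t hash_bytes(const void* data, size_t len) {
//         constexpr uint32_t kSeed = 0;                  // any fixed seed works
//         return SK_OPTS_NS::hash_fn(data, len, kSeed);
//     }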