/*
 * Copyright 2016 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#ifndef SkChecksum_opts_DEFINED
#define SkChecksum_opts_DEFINED

#include "SkChecksum.h"
#include "SkTypes.h"

#include <cstring>  // unaligned_load() below uses memcpy().
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42
    #include <immintrin.h>
#elif defined(SK_ARM_HAS_CRC32)
    #include <arm_acle.h>
#endif

namespace SK_OPTS_NS {

template <typename T, typename P>
static inline T unaligned_load(const P* p) {
    T v;
    memcpy(&v, p, sizeof(v));
    return v;
}
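
// Why memcpy: casting p to T* and dereferencing would risk undefined
// behavior (misalignment, strict aliasing), while a memcpy of a known small
// size is compiled down to a single unaligned load by mainstream compilers.
// Hypothetical use, reading a uint64_t from an arbitrary byte offset:
//     uint64_t v = unaligned_load<uint64_t>(data + 3);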

#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42 && (defined(__x86_64__) || defined(_M_X64))
    // This is not a CRC32.  It's Just A Hash that uses those instructions because they're fast.
    /*not static*/ inline uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t seed) {
        auto data = (const uint8_t*)vdata;

        // _mm_crc32_u64() operates on 64-bit registers, so we use uint64_t for a while.
        uint64_t hash = seed;
        if (bytes >= 24) {
            // We'll create 3 independent hashes, each using _mm_crc32_u64()
            // to hash 8 bytes per step.  Both 3 and independent are important:
            // we can execute 3 of these instructions in parallel on a single core.
            uint64_t a = hash,
                     b = hash,
                     c = hash;
            size_t steps = bytes/24;
            while (steps --> 0) {
                a = _mm_crc32_u64(a, unaligned_load<uint64_t>(data+ 0));
                b = _mm_crc32_u64(b, unaligned_load<uint64_t>(data+ 8));
                c = _mm_crc32_u64(c, unaligned_load<uint64_t>(data+16));
                data += 24;
            }
            bytes %= 24;
            hash = _mm_crc32_u32(a, _mm_crc32_u32(b, c));
        }

        SkASSERT(bytes < 24);
        if (bytes >= 16) {
            hash = _mm_crc32_u64(hash, unaligned_load<uint64_t>(data));
            bytes -= 8;
            data  += 8;
        }

        SkASSERT(bytes < 16);
        if (bytes & 8) {
            hash = _mm_crc32_u64(hash, unaligned_load<uint64_t>(data));
            data  += 8;
        }

        // The remainder of these _mm_crc32_u*() operate on a 32-bit register.
        // We don't lose anything here: only the bottom 32-bits were populated.
        auto hash32 = (uint32_t)hash;

        if (bytes & 4) {
            hash32 = _mm_crc32_u32(hash32, unaligned_load<uint32_t>(data));
            data += 4;
        }
        if (bytes & 2) {
            hash32 = _mm_crc32_u16(hash32, unaligned_load<uint16_t>(data));
            data += 2;
        }
        if (bytes & 1) {
            hash32 = _mm_crc32_u8(hash32, unaligned_load<uint8_t>(data));
        }
        return hash32;
    }
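
    // On typical modern x86 cores the crc32 instruction has multi-cycle
    // latency but single-cycle throughput (an assumption about common
    // hardware, not something this file states), so one serial chain would
    // stall where three independent chains keep the unit busy.
    // A hypothetical direct call, bypassing Skia's runtime CPU dispatch:
    //     uint32_t h = SK_OPTS_NS::hash_fn(msg, msgLen, /*seed=*/0);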

#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42
    // 32-bit version of above, using _mm_crc32_u32() but not _mm_crc32_u64().
    /*not static*/ inline uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t hash) {
        auto data = (const uint8_t*)vdata;

        if (bytes >= 12) {
            // We'll create 3 independent hashes, each using _mm_crc32_u32()
            // to hash 4 bytes per step.  Both 3 and independent are important:
            // we can execute 3 of these instructions in parallel on a single core.
            uint32_t a = hash,
                     b = hash,
                     c = hash;
            size_t steps = bytes/12;
            while (steps --> 0) {
                a = _mm_crc32_u32(a, unaligned_load<uint32_t>(data+0));
                b = _mm_crc32_u32(b, unaligned_load<uint32_t>(data+4));
                c = _mm_crc32_u32(c, unaligned_load<uint32_t>(data+8));
                data += 12;
            }
            bytes %= 12;
            hash = _mm_crc32_u32(a, _mm_crc32_u32(b, c));
        }

        SkASSERT(bytes < 12);
        if (bytes >= 8) {
            hash = _mm_crc32_u32(hash, unaligned_load<uint32_t>(data));
            bytes -= 4;
            data  += 4;
        }

        SkASSERT(bytes < 8);
        if (bytes & 4) {
            hash = _mm_crc32_u32(hash, unaligned_load<uint32_t>(data));
            data += 4;
        }
        if (bytes & 2) {
            hash = _mm_crc32_u16(hash, unaligned_load<uint16_t>(data));
            data += 2;
        }
        if (bytes & 1) {
            hash = _mm_crc32_u8(hash, unaligned_load<uint8_t>(data));
        }
        return hash;
    }
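
    // Same three-stream trick as the 64-bit path, scaled to 4-byte steps:
    // _mm_crc32_u64() needs a 64-bit register, so 32-bit builds stripe the
    // input into 12-byte chunks feeding three independent _mm_crc32_u32()
    // chains instead.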

#elif defined(SK_ARM_HAS_CRC32)
    /*not static*/ inline uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t hash) {
        auto data = (const uint8_t*)vdata;
        if (bytes >= 24) {
            uint32_t a = hash,
                     b = hash,
                     c = hash;
            size_t steps = bytes/24;
            while (steps --> 0) {
                a = __crc32d(a, unaligned_load<uint64_t>(data+ 0));
                b = __crc32d(b, unaligned_load<uint64_t>(data+ 8));
                c = __crc32d(c, unaligned_load<uint64_t>(data+16));
                data += 24;
            }
            bytes %= 24;
            hash = __crc32w(a, __crc32w(b, c));
        }

        SkASSERT(bytes < 24);
        if (bytes >= 16) {
            hash = __crc32d(hash, unaligned_load<uint64_t>(data));
            bytes -= 8;
            data  += 8;
        }

        SkASSERT(bytes < 16);
        if (bytes & 8) {
            hash = __crc32d(hash, unaligned_load<uint64_t>(data));
            data += 8;
        }
        if (bytes & 4) {
            hash = __crc32w(hash, unaligned_load<uint32_t>(data));
            data += 4;
        }
        if (bytes & 2) {
            hash = __crc32h(hash, unaligned_load<uint16_t>(data));
            data += 2;
        }
        if (bytes & 1) {
            hash = __crc32b(hash, unaligned_load<uint8_t>(data));
        }
        return hash;
    }
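
    // __crc32d/__crc32w/__crc32h/__crc32b are the <arm_acle.h> intrinsics
    // for the (optional) ARMv8 CRC32 extension; SK_ARM_HAS_CRC32 should only
    // be defined when the build targets CPUs that have it. __crc32d folds
    // 8 bytes into the 32-bit accumulator per instruction, so the 24-byte
    // stripes and three independent streams mirror the x86-64 path above.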

#else
    // This is Murmur3.
    /*not static*/ inline uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t hash) {
        auto data = (const uint8_t*)vdata;

        size_t original_bytes = bytes;

        // Handle 4 bytes at a time while possible.
        while (bytes >= 4) {
            uint32_t k = unaligned_load<uint32_t>(data);
            k *= 0xcc9e2d51;
            k = (k << 15) | (k >> 17);
            k *= 0x1b873593;

            hash ^= k;
            hash = (hash << 13) | (hash >> 19);
            hash *= 5;
            hash += 0xe6546b64;

            bytes -= 4;
            data  += 4;
        }

        // Handle last 0-3 bytes; the cases intentionally fall through.
        uint32_t k = 0;
        switch (bytes & 3) {
            case 3: k ^= data[2] << 16;  // fall through
            case 2: k ^= data[1] <<  8;  // fall through
            case 1: k ^= data[0] <<  0;
                    k *= 0xcc9e2d51;
                    k = (k << 15) | (k >> 17);
                    k *= 0x1b873593;
                    hash ^= k;
        }

        hash ^= original_bytes;
        return SkChecksum::Mix(hash);
    }
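
    // Assuming SkChecksum::Mix() is Murmur3's fmix32 finalizer (the usual
    // xor-shift/multiply avalanche), this branch computes MurmurHash3_x86_32
    // with the incoming `hash` acting as the seed, e.g. the hypothetical
    //     uint32_t h = hash_fn("abc", 3, 0);
    // should agree with the reference MurmurHash3_x86_32("abc", 3, seed=0).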
#endif

}  // namespace SK_OPTS_NS
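
// A sketch of how Skia is expected to consume this header (not spelled out
// here): it is compiled once per instruction set, each time in a translation
// unit built with the matching compiler flags and with SK_OPTS_NS set to a
// distinct namespace (e.g. sse42, crc32, portable). At startup, runtime CPU
// detection points a function pointer such as SkOpts::hash_fn at the best
// available instantiation.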

#endif//SkChecksum_opts_DEFINED