1 /*
2 * Copyright 2016 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8 #ifndef SkChecksum_opts_DEFINED
9 #define SkChecksum_opts_DEFINED
10
11 #include "SkChecksum.h"
12 #include "SkTypes.h"
13
14 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42
15 #include <immintrin.h>
16 #elif defined(SK_ARM_HAS_CRC32)
17 #include <arm_acle.h>
18 #endif
19
20 namespace SK_OPTS_NS {
21
22 template <typename T>
unaligned_load(const uint8_t * src)23 static inline T unaligned_load(const uint8_t* src) {
24 T val;
25 memcpy(&val, src, sizeof(val));
26 return val;
27 }
28
29 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42 && (defined(__x86_64__) || defined(_M_X64))
30 // This is not a CRC32. It's Just A Hash that uses those instructions because they're fast.
hash_fn(const void * vdata,size_t bytes,uint32_t seed)31 /*not static*/ inline uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t seed) {
32 auto data = (const uint8_t*)vdata;
33
34 // _mm_crc32_u64() operates on 64-bit registers, so we use uint64_t for a while.
35 uint64_t hash = seed;
36 if (bytes >= 24) {
37 // We'll create 3 independent hashes, each using _mm_crc32_u64()
38 // to hash 8 bytes per step. Both 3 and independent are important:
39 // we can execute 3 of these instructions in parallel on a single core.
40 uint64_t a = hash,
41 b = hash,
42 c = hash;
43 size_t steps = bytes/24;
44 while (steps --> 0) {
45 a = _mm_crc32_u64(a, unaligned_load<uint64_t>(data+ 0));
46 b = _mm_crc32_u64(b, unaligned_load<uint64_t>(data+ 8));
47 c = _mm_crc32_u64(c, unaligned_load<uint64_t>(data+16));
48 data += 24;
49 }
50 bytes %= 24;
51 hash = a^b^c;
52 }
53
54 SkASSERT(bytes < 24);
55 if (bytes >= 16) {
56 hash = _mm_crc32_u64(hash, unaligned_load<uint64_t>(data));
57 bytes -= 8;
58 data += 8;
59 }
60
61 SkASSERT(bytes < 16);
62 if (bytes & 8) {
63 hash = _mm_crc32_u64(hash, unaligned_load<uint64_t>(data));
64 data += 8;
65 }
66
67 // The remainder of these _mm_crc32_u*() operate on a 32-bit register.
68 // We don't lose anything here: only the bottom 32-bits were populated.
69 auto hash32 = (uint32_t)hash;
70
71 if (bytes & 4) {
72 hash32 = _mm_crc32_u32(hash32, unaligned_load<uint32_t>(data));
73 data += 4;
74 }
75 if (bytes & 2) {
76 hash32 = _mm_crc32_u16(hash32, unaligned_load<uint16_t>(data));
77 data += 2;
78 }
79 if (bytes & 1) {
80 hash32 = _mm_crc32_u8(hash32, unaligned_load<uint8_t>(data));
81 }
82 return hash32;
83 }
84
85 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42
86 // 32-bit version of above, using _mm_crc32_u32() but not _mm_crc32_u64().
hash_fn(const void * vdata,size_t bytes,uint32_t hash)87 /*not static*/ inline uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t hash) {
88 auto data = (const uint8_t*)vdata;
89
90 if (bytes >= 12) {
91 // We'll create 3 independent hashes, each using _mm_crc32_u32()
92 // to hash 4 bytes per step. Both 3 and independent are important:
93 // we can execute 3 of these instructions in parallel on a single core.
94 uint32_t a = hash,
95 b = hash,
96 c = hash;
97 size_t steps = bytes/12;
98 while (steps --> 0) {
99 a = _mm_crc32_u32(a, unaligned_load<uint32_t>(data+0));
100 b = _mm_crc32_u32(b, unaligned_load<uint32_t>(data+4));
101 c = _mm_crc32_u32(c, unaligned_load<uint32_t>(data+8));
102 data += 12;
103 }
104 bytes %= 12;
105 hash = a^b^c;
106 }
107
108 SkASSERT(bytes < 12);
109 if (bytes >= 8) {
110 hash = _mm_crc32_u32(hash, unaligned_load<uint32_t>(data));
111 bytes -= 4;
112 data += 4;
113 }
114
115 SkASSERT(bytes < 8);
116 if (bytes & 4) {
117 hash = _mm_crc32_u32(hash, unaligned_load<uint32_t>(data));
118 data += 4;
119 }
120 if (bytes & 2) {
121 hash = _mm_crc32_u16(hash, unaligned_load<uint16_t>(data));
122 data += 2;
123 }
124 if (bytes & 1) {
125 hash = _mm_crc32_u8(hash, unaligned_load<uint8_t>(data));
126 }
127 return hash;
128 }
129
130 #elif defined(SK_ARM_HAS_CRC32)
hash_fn(const void * vdata,size_t bytes,uint32_t hash)131 /*not static*/ inline uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t hash) {
132 auto data = (const uint8_t*)vdata;
133 if (bytes >= 24) {
134 uint32_t a = hash,
135 b = hash,
136 c = hash;
137 size_t steps = bytes/24;
138 while (steps --> 0) {
139 a = __crc32d(a, unaligned_load<uint64_t>(data+ 0));
140 b = __crc32d(b, unaligned_load<uint64_t>(data+ 8));
141 c = __crc32d(c, unaligned_load<uint64_t>(data+16));
142 data += 24;
143 }
144 bytes %= 24;
145 hash = a^b^c;
146 }
147
148 SkASSERT(bytes < 24);
149 if (bytes >= 16) {
150 hash = __crc32d(hash, unaligned_load<uint64_t>(data));
151 bytes -= 8;
152 data += 8;
153 }
154
155 SkASSERT(bytes < 16);
156 if (bytes & 8) {
157 hash = __crc32d(hash, unaligned_load<uint64_t>(data));
158 data += 8;
159 }
160 if (bytes & 4) {
161 hash = __crc32w(hash, unaligned_load<uint32_t>(data));
162 data += 4;
163 }
164 if (bytes & 2) {
165 hash = __crc32h(hash, unaligned_load<uint16_t>(data));
166 data += 2;
167 }
168 if (bytes & 1) {
169 hash = __crc32b(hash, unaligned_load<uint8_t>(data));
170 }
171 return hash;
172 }
173
174 #else
175 // This is Murmur3.
hash_fn(const void * vdata,size_t bytes,uint32_t hash)176 /*not static*/ inline uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t hash) {
177 auto data = (const uint8_t*)vdata;
178
179 size_t original_bytes = bytes;
180
181 // Handle 4 bytes at a time while possible.
182 while (bytes >= 4) {
183 uint32_t k = unaligned_load<uint32_t>(data);
184 k *= 0xcc9e2d51;
185 k = (k << 15) | (k >> 17);
186 k *= 0x1b873593;
187
188 hash ^= k;
189 hash = (hash << 13) | (hash >> 19);
190 hash *= 5;
191 hash += 0xe6546b64;
192
193 bytes -= 4;
194 data += 4;
195 }
196
197 // Handle last 0-3 bytes.
198 uint32_t k = 0;
199 switch (bytes & 3) {
200 case 3: k ^= data[2] << 16;
201 case 2: k ^= data[1] << 8;
202 case 1: k ^= data[0] << 0;
203 k *= 0xcc9e2d51;
204 k = (k << 15) | (k >> 17);
205 k *= 0x1b873593;
206 hash ^= k;
207 }
208
209 hash ^= original_bytes;
210 return SkChecksum::Mix(hash);
211 }
212 #endif
213
214 } // namespace SK_OPTS_NS
215
216 #endif//SkChecksum_opts_DEFINED
217