1 #include <stdbool.h>
2 #include <stddef.h>
3 #include <stdint.h>
4
5 #include "blake3_impl.h"
6
7 #if defined(IS_X86)
8 #if defined(_MSC_VER)
9 #include <intrin.h>
10 #elif defined(__GNUC__)
11 #include <immintrin.h>
12 #else
13 #undef IS_X86 /* Unimplemented! */
14 #endif
15 #endif
16
/* Evaluates `x` and discards the result. Used below to keep the compiler from
 * warning about a local that ends up unreferenced when every branch that
 * reads it has been compiled out by the BLAKE3_NO_* switches. */
#define MAYBE_UNUSED(x) (void)((x))
18
19 #if defined(IS_X86)
/*
 * Executes XGETBV with register index 0 and returns the 64-bit result
 * (EDX in the high half, EAX in the low half).
 *
 * The caller in this file interprets the returned bits as the set of register
 * states enabled by the OS (SSE/AVX/AVX-512 state bits).
 * NOTE(review): the instruction is only valid when CPUID reports OSXSAVE; the
 * one caller visible here checks that bit first.
 */
static uint64_t xgetbv(void) {
#if defined(_MSC_VER)
  return _xgetbv(0);
#else
  uint32_t eax = 0, edx = 0;
  /* ECX selects the register index (0); the result comes back in EDX:EAX. */
  __asm__ __volatile__("xgetbv\n" : "=a"(eax), "=d"(edx) : "c"(0));
  return ((uint64_t)edx << 32) | eax;
#endif
}
29
/*
 * Executes CPUID for leaf `id`.
 *
 * out: receives the four result registers, in the order
 *      out[0]=EAX, out[1]=EBX, out[2]=ECX, out[3]=EDX.
 * id:  value loaded into EAX before the instruction.
 *
 * The GCC-style paths do not load ECX, so this helper is for leaves that do
 * not take a sub-leaf; use cpuidex() when a sub-leaf must be selected.
 */
static void cpuid(uint32_t out[4], uint32_t id) {
#if defined(_MSC_VER)
  __cpuid((int *)out, id);
#elif defined(__i386__) || defined(_M_IX86)
  /* EBX is not named as an asm output here: it is copied into a scratch
   * register first and swapped back after CPUID, so EBX holds its original
   * value on exit and the scratch register holds CPUID's EBX result.
   * (Presumably because EBX can be reserved on 32-bit targets, e.g. for
   * PIC -- confirm against the toolchains in use.) */
  __asm__ __volatile__("movl %%ebx, %1\n"
                       "cpuid\n"
                       "xchgl %1, %%ebx\n"
                       : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
                       : "a"(id));
#else
  __asm__ __volatile__("cpuid\n"
                       : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3])
                       : "a"(id));
#endif
}
45
/*
 * Executes CPUID for leaf `id`, sub-leaf `sid`.
 *
 * out: receives the four result registers, in the order
 *      out[0]=EAX, out[1]=EBX, out[2]=ECX, out[3]=EDX.
 * id:  value loaded into EAX before the instruction.
 * sid: value loaded into ECX before the instruction.
 */
static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) {
#if defined(_MSC_VER)
  __cpuidex((int *)out, id, sid);
#elif defined(__i386__) || defined(_M_IX86)
  /* Same EBX handling as in cpuid(): copy EBX to a scratch register, run
   * CPUID, then swap so EBX is restored and the scratch register carries
   * CPUID's EBX result. */
  __asm__ __volatile__("movl %%ebx, %1\n"
                       "cpuid\n"
                       "xchgl %1, %%ebx\n"
                       : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
                       : "a"(id), "c"(sid));
#else
  __asm__ __volatile__("cpuid\n"
                       : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3])
                       : "a"(id), "c"(sid));
#endif
}
61
62 #endif
63
/*
 * Bit flags describing the instruction-set extensions found at run time.
 * Each member occupies a single bit so that several can be OR-ed together
 * into one value.
 */
enum cpu_feature {
  SSE2 = 0x01,     /* bit 0 */
  SSSE3 = 0x02,    /* bit 1 */
  SSE41 = 0x04,    /* bit 2 */
  AVX = 0x08,      /* bit 3 */
  AVX2 = 0x10,     /* bit 4 */
  AVX512F = 0x20,  /* bit 5 */
  AVX512VL = 0x40, /* bit 6 */
  /* ... */
  UNDEFINED = 0x40000000 /* bit 30: "not detected yet" marker */
};
75
76 #if !defined(BLAKE3_TESTING)
77 static /* Allow the variable to be controlled manually for testing */
78 #endif
79 enum cpu_feature g_cpu_features = UNDEFINED;
80
81 #if !defined(BLAKE3_TESTING)
82 static
83 #endif
84 enum cpu_feature
get_cpu_features(void)85 get_cpu_features(void) {
86
87 if (g_cpu_features != UNDEFINED) {
88 return g_cpu_features;
89 } else {
90 #if defined(IS_X86)
91 uint32_t regs[4] = {0};
92 uint32_t *eax = ®s[0], *ebx = ®s[1], *ecx = ®s[2], *edx = ®s[3];
93 (void)edx;
94 enum cpu_feature features = 0;
95 cpuid(regs, 0);
96 const int max_id = *eax;
97 cpuid(regs, 1);
98 #if defined(__amd64__) || defined(_M_X64)
99 features |= SSE2;
100 #else
101 if (*edx & (1UL << 26))
102 features |= SSE2;
103 #endif
104 if (*ecx & (1UL << 0))
105 features |= SSSE3;
106 if (*ecx & (1UL << 19))
107 features |= SSE41;
108
109 if (*ecx & (1UL << 27)) { // OSXSAVE
110 const uint64_t mask = xgetbv();
111 if ((mask & 6) == 6) { // SSE and AVX states
112 if (*ecx & (1UL << 28))
113 features |= AVX;
114 if (max_id >= 7) {
115 cpuidex(regs, 7, 0);
116 if (*ebx & (1UL << 5))
117 features |= AVX2;
118 if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm
119 if (*ebx & (1UL << 31))
120 features |= AVX512VL;
121 if (*ebx & (1UL << 16))
122 features |= AVX512F;
123 }
124 }
125 }
126 }
127 g_cpu_features = features;
128 return features;
129 #else
130 /* How to detect NEON? */
131 return 0;
132 #endif
133 }
134 }
135
/*
 * Runtime dispatcher for the "compress in place" primitive.
 *
 * On x86 the detected feature set is consulted and the call is forwarded, with
 * all arguments unchanged, to the first available implementation in this
 * order: AVX-512 (selected by the AVX512VL flag), SSE4.1, SSE2. Each of those
 * can be compiled out with the matching BLAKE3_NO_* macro. If none applies,
 * or the build is not x86, the portable implementation is used.
 */
void blake3_compress_in_place(uint32_t cv[8],
                              const uint8_t block[BLAKE3_BLOCK_LEN],
                              uint8_t block_len, uint64_t counter,
                              uint8_t flags) {
#if defined(IS_X86)
  const enum cpu_feature cpu = get_cpu_features();
  MAYBE_UNUSED(cpu); /* every user below may be compiled out */
#ifndef BLAKE3_NO_AVX512
  if ((cpu & AVX512VL) != 0) {
    blake3_compress_in_place_avx512(cv, block, block_len, counter, flags);
    return;
  }
#endif /* BLAKE3_NO_AVX512 */
#ifndef BLAKE3_NO_SSE41
  if ((cpu & SSE41) != 0) {
    blake3_compress_in_place_sse41(cv, block, block_len, counter, flags);
    return;
  }
#endif /* BLAKE3_NO_SSE41 */
#ifndef BLAKE3_NO_SSE2
  if ((cpu & SSE2) != 0) {
    blake3_compress_in_place_sse2(cv, block, block_len, counter, flags);
    return;
  }
#endif /* BLAKE3_NO_SSE2 */
#endif /* IS_X86 */
  /* Fallback when no SIMD path was taken. */
  blake3_compress_in_place_portable(cv, block, block_len, counter, flags);
}
164
/*
 * Runtime dispatcher for the XOF compression primitive, which writes its
 * result through `out` rather than into `cv`.
 *
 * On x86 the detected feature set is consulted and the call is forwarded, with
 * all arguments unchanged, to the first available implementation in this
 * order: AVX-512 (selected by the AVX512VL flag), SSE4.1, SSE2. Each of those
 * can be compiled out with the matching BLAKE3_NO_* macro. If none applies,
 * or the build is not x86, the portable implementation is used.
 */
void blake3_compress_xof(const uint32_t cv[8],
                         const uint8_t block[BLAKE3_BLOCK_LEN],
                         uint8_t block_len, uint64_t counter, uint8_t flags,
                         uint8_t out[64]) {
#if defined(IS_X86)
  const enum cpu_feature cpu = get_cpu_features();
  MAYBE_UNUSED(cpu); /* every user below may be compiled out */
#ifndef BLAKE3_NO_AVX512
  if ((cpu & AVX512VL) != 0) {
    blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
    return;
  }
#endif /* BLAKE3_NO_AVX512 */
#ifndef BLAKE3_NO_SSE41
  if ((cpu & SSE41) != 0) {
    blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out);
    return;
  }
#endif /* BLAKE3_NO_SSE41 */
#ifndef BLAKE3_NO_SSE2
  if ((cpu & SSE2) != 0) {
    blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out);
    return;
  }
#endif /* BLAKE3_NO_SSE2 */
#endif /* IS_X86 */
  /* Fallback when no SIMD path was taken. */
  blake3_compress_xof_portable(cv, block, block_len, counter, flags, out);
}
193
/*
 * Runtime dispatcher for the multi-input hashing primitive.
 *
 * On x86 the call is forwarded, with all arguments unchanged, to the first
 * available implementation in this order: AVX-512 (requires both AVX512F and
 * AVX512VL), AVX2, SSE4.1, SSE2. Each can be compiled out with the matching
 * BLAKE3_NO_* macro. If no x86 path is taken, the NEON implementation is used
 * when the build sets BLAKE3_USE_NEON to 1, and the portable implementation
 * otherwise.
 */
void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
                      size_t blocks, const uint32_t key[8], uint64_t counter,
                      bool increment_counter, uint8_t flags,
                      uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
#if defined(IS_X86)
  const enum cpu_feature cpu = get_cpu_features();
  MAYBE_UNUSED(cpu); /* every user below may be compiled out */
#ifndef BLAKE3_NO_AVX512
  /* Both AVX-512 flags must be present for this path. */
  if ((cpu & AVX512F) != 0 && (cpu & AVX512VL) != 0) {
    blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
                            increment_counter, flags, flags_start, flags_end,
                            out);
    return;
  }
#endif /* BLAKE3_NO_AVX512 */
#ifndef BLAKE3_NO_AVX2
  if ((cpu & AVX2) != 0) {
    blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter,
                          increment_counter, flags, flags_start, flags_end,
                          out);
    return;
  }
#endif /* BLAKE3_NO_AVX2 */
#ifndef BLAKE3_NO_SSE41
  if ((cpu & SSE41) != 0) {
    blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter,
                           increment_counter, flags, flags_start, flags_end,
                           out);
    return;
  }
#endif /* BLAKE3_NO_SSE41 */
#ifndef BLAKE3_NO_SSE2
  if ((cpu & SSE2) != 0) {
    blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter,
                          increment_counter, flags, flags_start, flags_end,
                          out);
    return;
  }
#endif /* BLAKE3_NO_SSE2 */
#endif /* IS_X86 */

#if BLAKE3_USE_NEON == 1
  blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter,
                        increment_counter, flags, flags_start, flags_end, out);
#else
  blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter,
                            increment_counter, flags, flags_start, flags_end,
                            out);
#endif
}
245
// Reports the SIMD degree chosen at run time for the current platform.
//
// The x86 checks mirror the selection order used by blake3_hash_many():
// 16 when both AVX512F and AVX512VL are available, 8 for AVX2, 4 for SSE4.1
// or SSE2 (each subject to its BLAKE3_NO_* switch). When no x86 path matches,
// the result is 4 if the build sets BLAKE3_USE_NEON to 1, and 1 otherwise.
size_t blake3_simd_degree(void) {
#if defined(IS_X86)
  const enum cpu_feature cpu = get_cpu_features();
  MAYBE_UNUSED(cpu); /* every user below may be compiled out */
#ifndef BLAKE3_NO_AVX512
  if ((cpu & AVX512F) != 0 && (cpu & AVX512VL) != 0) {
    return 16;
  }
#endif /* BLAKE3_NO_AVX512 */
#ifndef BLAKE3_NO_AVX2
  if ((cpu & AVX2) != 0) {
    return 8;
  }
#endif /* BLAKE3_NO_AVX2 */
#ifndef BLAKE3_NO_SSE41
  if ((cpu & SSE41) != 0) {
    return 4;
  }
#endif /* BLAKE3_NO_SSE41 */
#ifndef BLAKE3_NO_SSE2
  if ((cpu & SSE2) != 0) {
    return 4;
  }
#endif /* BLAKE3_NO_SSE2 */
#endif /* IS_X86 */

#if BLAKE3_USE_NEON == 1
  return 4;
#else
  return 1;
#endif
}
277