1 #include <benchmark/benchmark.h>
2
3 #include <fp16.h>
4 #ifndef EMSCRIPTEN
5 #include <fp16/psimd.h>
6 #endif
7
8 #include <vector>
9 #include <random>
10 #include <chrono>
11 #include <functional>
12 #include <algorithm>
13
14 #if (defined(__i386__) || defined(__x86_64__)) && defined(__F16C__)
15 #include <immintrin.h>
16 #endif
17
18 #if defined(__ARM_NEON__) || defined(__aarch64__)
19 #include <arm_neon.h>
20 #endif
21
22 #ifdef FP16_COMPARATIVE_BENCHMARKS
23 #include <third-party/THHalf.h>
24 #include <third-party/npy-halffloat.h>
25 #include <third-party/eigen-half.h>
26 #include <third-party/float16-compressor.h>
27 #include <third-party/half.hpp>
28 #endif
29
30
fp16_ieee_to_fp32_bits(benchmark::State & state)31 static void fp16_ieee_to_fp32_bits(benchmark::State& state) {
32 const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
33 auto rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::mt19937(seed));
34
35 std::vector<uint16_t> fp16(state.range(0));
36 std::vector<uint32_t> fp32(state.range(0));
37 std::generate(fp16.begin(), fp16.end(),
38 [&rng]{ return fp16_ieee_from_fp32_value(rng()); });
39
40 while (state.KeepRunning()) {
41 uint16_t* input = fp16.data();
42 benchmark::DoNotOptimize(input);
43
44 uint32_t* output = fp32.data();
45 const size_t n = state.range(0);
46 for (size_t i = 0; i < n; i++) {
47 output[i] = fp16_ieee_to_fp32_bits(input[i]);
48 }
49
50 benchmark::DoNotOptimize(output);
51 }
52 state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0)));
53 }
54 BENCHMARK(fp16_ieee_to_fp32_bits)->RangeMultiplier(2)->Range(1<<10, 64<<20);
55
fp16_ieee_to_fp32_value(benchmark::State & state)56 static void fp16_ieee_to_fp32_value(benchmark::State& state) {
57 const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
58 auto rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::mt19937(seed));
59
60 std::vector<uint16_t> fp16(state.range(0));
61 std::vector<float> fp32(state.range(0));
62 std::generate(fp16.begin(), fp16.end(),
63 [&rng]{ return fp16_ieee_from_fp32_value(rng()); });
64
65 while (state.KeepRunning()) {
66 uint16_t* input = fp16.data();
67 benchmark::DoNotOptimize(input);
68
69 float* output = fp32.data();
70 const size_t n = state.range(0);
71 for (size_t i = 0; i < n; i++) {
72 output[i] = fp16_ieee_to_fp32_value(input[i]);
73 }
74
75 benchmark::DoNotOptimize(output);
76 }
77 state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0)));
78 }
79 BENCHMARK(fp16_ieee_to_fp32_value)->RangeMultiplier(2)->Range(1<<10, 64<<20);
80
81 #ifndef EMSCRIPTEN
fp16_ieee_to_fp32_psimd(benchmark::State & state)82 static void fp16_ieee_to_fp32_psimd(benchmark::State& state) {
83 const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
84 auto rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::mt19937(seed));
85
86 std::vector<uint16_t> fp16(state.range(0));
87 std::vector<float> fp32(state.range(0));
88 std::generate(fp16.begin(), fp16.end(),
89 [&rng]{ return fp16_ieee_from_fp32_value(rng()); });
90
91 while (state.KeepRunning()) {
92 uint16_t* input = fp16.data();
93 benchmark::DoNotOptimize(input);
94
95 float* output = fp32.data();
96 const size_t n = state.range(0);
97 for (size_t i = 0; i < n - 4; i += 4) {
98 psimd_store_f32(&output[i],
99 fp16_ieee_to_fp32_psimd(
100 psimd_load_u16(&input[i])));
101 }
102 const psimd_u16 last_vector = { input[n - 4], input[n - 3], input[n - 2], input[n - 1] };
103 psimd_store_f32(&output[n - 4],
104 fp16_ieee_to_fp32_psimd(last_vector));
105
106 benchmark::DoNotOptimize(output);
107 }
108 state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0)));
109 }
110 BENCHMARK(fp16_ieee_to_fp32_psimd)->RangeMultiplier(2)->Range(1<<10, 64<<20);
111
fp16_ieee_to_fp32x2_psimd(benchmark::State & state)112 static void fp16_ieee_to_fp32x2_psimd(benchmark::State& state) {
113 const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
114 auto rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::mt19937(seed));
115
116 std::vector<uint16_t> fp16(state.range(0));
117 std::vector<float> fp32(state.range(0));
118 std::generate(fp16.begin(), fp16.end(),
119 [&rng]{ return fp16_ieee_from_fp32_value(rng()); });
120
121 while (state.KeepRunning()) {
122 uint16_t* input = fp16.data();
123 benchmark::DoNotOptimize(input);
124
125 float* output = fp32.data();
126 const size_t n = state.range(0);
127 for (size_t i = 0; i < n; i += 8) {
128 const psimd_f32x2 data =
129 fp16_ieee_to_fp32x2_psimd(
130 psimd_load_u16(&input[i]));
131 psimd_store_f32(&output[i], data.lo);
132 psimd_store_f32(&output[i + 4], data.hi);
133 }
134
135 benchmark::DoNotOptimize(output);
136 }
137 state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0)));
138 }
139 BENCHMARK(fp16_ieee_to_fp32x2_psimd)->RangeMultiplier(2)->Range(1<<10, 64<<20);
140 #endif
141
142 #if (defined(__i386__) || defined(__x86_64__)) && defined(__F16C__)
hardware_mm_cvtph_ps(benchmark::State & state)143 static void hardware_mm_cvtph_ps(benchmark::State& state) {
144 const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
145 auto rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::mt19937(seed));
146
147 std::vector<uint16_t> fp16(state.range(0));
148 std::vector<float> fp32(state.range(0));
149 std::generate(fp16.begin(), fp16.end(),
150 [&rng]{ return fp16_ieee_from_fp32_value(rng()); });
151
152 while (state.KeepRunning()) {
153 uint16_t* input = fp16.data();
154 benchmark::DoNotOptimize(input);
155
156 float* output = fp32.data();
157 const size_t n = state.range(0);
158 for (size_t i = 0; i < n; i += 4) {
159 _mm_storeu_ps(&output[i],
160 _mm_cvtph_ps(
161 _mm_loadl_epi64(static_cast<const __m128i*>(static_cast<const void*>(&input[i])))));
162 }
163
164 benchmark::DoNotOptimize(output);
165 }
166 state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0)));
167 }
168 BENCHMARK(hardware_mm_cvtph_ps)->RangeMultiplier(2)->Range(1<<10, 64<<20);
169
hardware_mm256_cvtph_ps(benchmark::State & state)170 static void hardware_mm256_cvtph_ps(benchmark::State& state) {
171 const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
172 auto rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::mt19937(seed));
173
174 std::vector<uint16_t> fp16(state.range(0));
175 std::vector<float> fp32(state.range(0));
176 std::generate(fp16.begin(), fp16.end(),
177 [&rng]{ return fp16_ieee_from_fp32_value(rng()); });
178
179 while (state.KeepRunning()) {
180 uint16_t* input = fp16.data();
181 benchmark::DoNotOptimize(input);
182
183 float* output = fp32.data();
184 const size_t n = state.range(0);
185 for (size_t i = 0; i < n; i += 8) {
186 _mm256_storeu_ps(&output[i],
187 _mm256_cvtph_ps(
188 _mm_loadu_si128(static_cast<const __m128i*>(static_cast<const void*>(&input[i])))));
189 }
190
191 benchmark::DoNotOptimize(output);
192 }
193 state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0)));
194 }
195 BENCHMARK(hardware_mm256_cvtph_ps)->RangeMultiplier(2)->Range(1<<10, 64<<20);
196 #endif
197
198 #if defined(__ARM_NEON_FP) && (__ARM_NEON_FP & 0x2) || defined(__aarch64__)
hardware_vcvt_f32_f16(benchmark::State & state)199 static void hardware_vcvt_f32_f16(benchmark::State& state) {
200 const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
201 auto rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::mt19937(seed));
202
203 std::vector<uint16_t> fp16(state.range(0));
204 std::vector<float> fp32(state.range(0));
205 std::generate(fp16.begin(), fp16.end(),
206 [&rng]{ return fp16_ieee_from_fp32_value(rng()); });
207
208 while (state.KeepRunning()) {
209 uint16_t* input = fp16.data();
210 benchmark::DoNotOptimize(input);
211
212 float* output = fp32.data();
213 const size_t n = state.range(0);
214 #if defined(__aarch64__)
215 const unsigned int fpcr = __builtin_aarch64_get_fpcr();
216 /* Disable flush-to-zero (bit 24) and Alternative FP16 format (bit 26) */
217 __builtin_aarch64_set_fpcr(fpcr & 0xF6FFFFFFu);
218 #else
219 unsigned int fpscr;
220 __asm__ __volatile__ ("VMRS %[fpscr], fpscr" : [fpscr] "=r" (fpscr));
221 /* Disable flush-to-zero (bit 24) and Alternative FP16 format (bit 26) */
222 __asm__ __volatile__ ("VMSR fpscr, %[fpscr]" :
223 : [fpscr] "r" (fpscr & 0xF6FFFFFFu));
224 #endif
225 for (size_t i = 0; i < n; i += 4) {
226 vst1q_f32(&output[i],
227 vcvt_f32_f16(
228 (float16x4_t) vld1_u16(&input[i])));
229 }
230 #if defined(__aarch64__)
231 __builtin_aarch64_set_fpcr(fpcr);
232 #else
233 __asm__ __volatile__ ("VMSR fpscr, %[fpscr]" :: [fpscr] "r" (fpscr));
234 #endif
235
236 benchmark::DoNotOptimize(output);
237 }
238 state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0)));
239 }
240 BENCHMARK(hardware_vcvt_f32_f16)->RangeMultiplier(2)->Range(1<<10, 64<<20);
241 #endif
242
243 #ifdef FP16_COMPARATIVE_BENCHMARKS
TH_halfbits2float(benchmark::State & state)244 static void TH_halfbits2float(benchmark::State& state) {
245 const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
246 auto rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::mt19937(seed));
247
248 std::vector<uint16_t> fp16(state.range(0));
249 std::vector<float> fp32(state.range(0));
250 std::generate(fp16.begin(), fp16.end(),
251 [&rng]{ return fp16_ieee_from_fp32_value(rng()); });
252
253 while (state.KeepRunning()) {
254 uint16_t* input = fp16.data();
255 benchmark::DoNotOptimize(input);
256
257 float* output = fp32.data();
258 const size_t n = state.range(0);
259 for (size_t i = 0; i < n; i++) {
260 TH_halfbits2float(&input[i], &output[i]);
261 }
262
263 benchmark::DoNotOptimize(output);
264 }
265 state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0)));
266 }
267 BENCHMARK(TH_halfbits2float)->RangeMultiplier(2)->Range(1<<10, 64<<20);
268
npy_halfbits_to_floatbits(benchmark::State & state)269 static void npy_halfbits_to_floatbits(benchmark::State& state) {
270 const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
271 auto rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::mt19937(seed));
272
273 std::vector<uint16_t> fp16(state.range(0));
274 std::vector<uint32_t> fp32(state.range(0));
275 std::generate(fp16.begin(), fp16.end(),
276 [&rng]{ return fp16_ieee_from_fp32_value(rng()); });
277
278 while (state.KeepRunning()) {
279 uint16_t* input = fp16.data();
280 benchmark::DoNotOptimize(input);
281
282 uint32_t* output = fp32.data();
283 const size_t n = state.range(0);
284 for (size_t i = 0; i < n; i++) {
285 output[i] = npy_halfbits_to_floatbits(input[i]);
286 }
287
288 benchmark::DoNotOptimize(output);
289 }
290 state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0)));
291 }
292 BENCHMARK(npy_halfbits_to_floatbits)->RangeMultiplier(2)->Range(1<<10, 64<<20);
293
Eigen_half_to_float(benchmark::State & state)294 static void Eigen_half_to_float(benchmark::State& state) {
295 const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
296 auto rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::mt19937(seed));
297
298 std::vector<uint16_t> fp16(state.range(0));
299 std::vector<float> fp32(state.range(0));
300 std::generate(fp16.begin(), fp16.end(),
301 [&rng]{ return fp16_ieee_from_fp32_value(rng()); });
302
303 while (state.KeepRunning()) {
304 uint16_t* input = fp16.data();
305 benchmark::DoNotOptimize(input);
306
307 float* output = fp32.data();
308 const size_t n = state.range(0);
309 for (size_t i = 0; i < n; i++) {
310 output[i] =
311 Eigen::half_impl::half_to_float(
312 Eigen::half_impl::raw_uint16_to_half(input[i]));
313 }
314
315 benchmark::DoNotOptimize(output);
316 }
317 state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0)));
318 }
319 BENCHMARK(Eigen_half_to_float)->RangeMultiplier(2)->Range(1<<10, 64<<20);
320
Float16Compressor_decompress(benchmark::State & state)321 static void Float16Compressor_decompress(benchmark::State& state) {
322 const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
323 auto rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::mt19937(seed));
324
325 std::vector<uint16_t> fp16(state.range(0));
326 std::vector<float> fp32(state.range(0));
327 std::generate(fp16.begin(), fp16.end(),
328 [&rng]{ return fp16_ieee_from_fp32_value(rng()); });
329
330 while (state.KeepRunning()) {
331 uint16_t* input = fp16.data();
332 benchmark::DoNotOptimize(input);
333
334 float* output = fp32.data();
335 const size_t n = state.range(0);
336 for (size_t i = 0; i < n; i++) {
337 output[i] = Float16Compressor::decompress(input[i]);
338 }
339
340 benchmark::DoNotOptimize(output);
341 }
342 state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0)));
343 }
344 BENCHMARK(Float16Compressor_decompress)->RangeMultiplier(2)->Range(1<<10, 64<<20);
345
half_float_detail_half2float_table(benchmark::State & state)346 static void half_float_detail_half2float_table(benchmark::State& state) {
347 const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
348 auto rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::mt19937(seed));
349
350 std::vector<uint16_t> fp16(state.range(0));
351 std::vector<float> fp32(state.range(0));
352 std::generate(fp16.begin(), fp16.end(),
353 [&rng]{ return fp16_ieee_from_fp32_value(rng()); });
354
355 while (state.KeepRunning()) {
356 uint16_t* input = fp16.data();
357 benchmark::DoNotOptimize(input);
358
359 float* output = fp32.data();
360 const size_t n = state.range(0);
361 for (size_t i = 0; i < n; i++) {
362 output[i] = half_float::detail::half2float_impl(input[i],
363 half_float::detail::true_type());
364 }
365
366 benchmark::DoNotOptimize(output);
367 }
368 state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0)));
369 }
370 BENCHMARK(half_float_detail_half2float_table)->RangeMultiplier(2)->Range(1<<10, 64<<20);
371
half_float_detail_half2float_branch(benchmark::State & state)372 static void half_float_detail_half2float_branch(benchmark::State& state) {
373 const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
374 auto rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::mt19937(seed));
375
376 std::vector<uint16_t> fp16(state.range(0));
377 std::vector<float> fp32(state.range(0));
378 std::generate(fp16.begin(), fp16.end(),
379 [&rng]{ return fp16_ieee_from_fp32_value(rng()); });
380
381 while (state.KeepRunning()) {
382 uint16_t* input = fp16.data();
383 benchmark::DoNotOptimize(input);
384
385 float* output = fp32.data();
386 const size_t n = state.range(0);
387 for (size_t i = 0; i < n; i++) {
388 output[i] = half_float::detail::half2float_impl(input[i],
389 half_float::detail::false_type());
390 }
391
392 benchmark::DoNotOptimize(output);
393 }
394 state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0)));
395 }
396 BENCHMARK(half_float_detail_half2float_branch)->RangeMultiplier(2)->Range(1<<10, 64<<20);
397 #endif
398
/*
 * No main() is written by hand in the visible part of this file; the program
 * entry point is presumably generated by this macro from the benchmark
 * library header (confirm against <benchmark/benchmark.h>).
 */
BENCHMARK_MAIN();
400