• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 #include <benchmark/benchmark.h>
2 
3 #include <fp16.h>
4 #ifndef EMSCRIPTEN
5 	#include <fp16/psimd.h>
6 #endif
7 
8 #include <vector>
9 #include <random>
10 #include <chrono>
11 #include <functional>
12 #include <algorithm>
13 
14 #if (defined(__i386__) || defined(__x86_64__)) && defined(__F16C__)
15 	#include <immintrin.h>
16 #endif
17 
18 #if defined(__ARM_NEON__) || defined(__aarch64__)
19 	#include <arm_neon.h>
20 #endif
21 
22 #ifdef FP16_COMPARATIVE_BENCHMARKS
23 	#include <third-party/THHalf.h>
24 	#include <third-party/npy-halffloat.h>
25 	#include <third-party/eigen-half.h>
26 	#include <third-party/float16-compressor.h>
27 	#include <third-party/half.hpp>
28 #endif
29 
30 
fp16_ieee_to_fp32_bits(benchmark::State & state)31 static void fp16_ieee_to_fp32_bits(benchmark::State& state) {
32 	const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
33 	auto rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::mt19937(seed));
34 
35 	std::vector<uint16_t> fp16(state.range(0));
36 	std::vector<uint32_t> fp32(state.range(0));
37 	std::generate(fp16.begin(), fp16.end(),
38 		[&rng]{ return fp16_ieee_from_fp32_value(rng()); });
39 
40 	while (state.KeepRunning()) {
41 		uint16_t* input = fp16.data();
42 		benchmark::DoNotOptimize(input);
43 
44 		uint32_t* output = fp32.data();
45 		const size_t n = state.range(0);
46 		for (size_t i = 0; i < n; i++) {
47 			output[i] = fp16_ieee_to_fp32_bits(input[i]);
48 		}
49 
50 		benchmark::DoNotOptimize(output);
51 	}
52 	state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0)));
53 }
54 BENCHMARK(fp16_ieee_to_fp32_bits)->RangeMultiplier(2)->Range(1<<10, 64<<20);
55 
fp16_ieee_to_fp32_value(benchmark::State & state)56 static void fp16_ieee_to_fp32_value(benchmark::State& state) {
57 	const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
58 	auto rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::mt19937(seed));
59 
60 	std::vector<uint16_t> fp16(state.range(0));
61 	std::vector<float> fp32(state.range(0));
62 	std::generate(fp16.begin(), fp16.end(),
63 		[&rng]{ return fp16_ieee_from_fp32_value(rng()); });
64 
65 	while (state.KeepRunning()) {
66 		uint16_t* input = fp16.data();
67 		benchmark::DoNotOptimize(input);
68 
69 		float* output = fp32.data();
70 		const size_t n = state.range(0);
71 		for (size_t i = 0; i < n; i++) {
72 			output[i] = fp16_ieee_to_fp32_value(input[i]);
73 		}
74 
75 		benchmark::DoNotOptimize(output);
76 	}
77 	state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0)));
78 }
79 BENCHMARK(fp16_ieee_to_fp32_value)->RangeMultiplier(2)->Range(1<<10, 64<<20);
80 
81 #ifndef EMSCRIPTEN
fp16_ieee_to_fp32_psimd(benchmark::State & state)82 	static void fp16_ieee_to_fp32_psimd(benchmark::State& state) {
83 		const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
84 		auto rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::mt19937(seed));
85 
86 		std::vector<uint16_t> fp16(state.range(0));
87 		std::vector<float> fp32(state.range(0));
88 		std::generate(fp16.begin(), fp16.end(),
89 			[&rng]{ return fp16_ieee_from_fp32_value(rng()); });
90 
91 		while (state.KeepRunning()) {
92 			uint16_t* input = fp16.data();
93 			benchmark::DoNotOptimize(input);
94 
95 			float* output = fp32.data();
96 			const size_t n = state.range(0);
97 			for (size_t i = 0; i < n - 4; i += 4) {
98 				psimd_store_f32(&output[i],
99 					fp16_ieee_to_fp32_psimd(
100 						psimd_load_u16(&input[i])));
101 			}
102 			const psimd_u16 last_vector = { input[n - 4], input[n - 3], input[n - 2], input[n - 1] };
103 			psimd_store_f32(&output[n - 4],
104 				fp16_ieee_to_fp32_psimd(last_vector));
105 
106 			benchmark::DoNotOptimize(output);
107 		}
108 		state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0)));
109 	}
110 	BENCHMARK(fp16_ieee_to_fp32_psimd)->RangeMultiplier(2)->Range(1<<10, 64<<20);
111 
fp16_ieee_to_fp32x2_psimd(benchmark::State & state)112 	static void fp16_ieee_to_fp32x2_psimd(benchmark::State& state) {
113 		const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
114 		auto rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::mt19937(seed));
115 
116 		std::vector<uint16_t> fp16(state.range(0));
117 		std::vector<float> fp32(state.range(0));
118 		std::generate(fp16.begin(), fp16.end(),
119 			[&rng]{ return fp16_ieee_from_fp32_value(rng()); });
120 
121 		while (state.KeepRunning()) {
122 			uint16_t* input = fp16.data();
123 			benchmark::DoNotOptimize(input);
124 
125 			float* output = fp32.data();
126 			const size_t n = state.range(0);
127 			for (size_t i = 0; i < n; i += 8) {
128 				const psimd_f32x2 data =
129 					fp16_ieee_to_fp32x2_psimd(
130 						psimd_load_u16(&input[i]));
131 				psimd_store_f32(&output[i], data.lo);
132 				psimd_store_f32(&output[i + 4], data.hi);
133 			}
134 
135 			benchmark::DoNotOptimize(output);
136 		}
137 		state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0)));
138 	}
139 	BENCHMARK(fp16_ieee_to_fp32x2_psimd)->RangeMultiplier(2)->Range(1<<10, 64<<20);
140 #endif
141 
142 #if (defined(__i386__) || defined(__x86_64__)) && defined(__F16C__)
hardware_mm_cvtph_ps(benchmark::State & state)143 	static void hardware_mm_cvtph_ps(benchmark::State& state) {
144 		const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
145 		auto rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::mt19937(seed));
146 
147 		std::vector<uint16_t> fp16(state.range(0));
148 		std::vector<float> fp32(state.range(0));
149 		std::generate(fp16.begin(), fp16.end(),
150 			[&rng]{ return fp16_ieee_from_fp32_value(rng()); });
151 
152 		while (state.KeepRunning()) {
153 			uint16_t* input = fp16.data();
154 			benchmark::DoNotOptimize(input);
155 
156 			float* output = fp32.data();
157 			const size_t n = state.range(0);
158 			for (size_t i = 0; i < n; i += 4) {
159 				_mm_storeu_ps(&output[i],
160 					_mm_cvtph_ps(
161 						_mm_loadl_epi64(static_cast<const __m128i*>(static_cast<const void*>(&input[i])))));
162 			}
163 
164 			benchmark::DoNotOptimize(output);
165 		}
166 		state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0)));
167 	}
168 	BENCHMARK(hardware_mm_cvtph_ps)->RangeMultiplier(2)->Range(1<<10, 64<<20);
169 
hardware_mm256_cvtph_ps(benchmark::State & state)170 	static void hardware_mm256_cvtph_ps(benchmark::State& state) {
171 		const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
172 		auto rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::mt19937(seed));
173 
174 		std::vector<uint16_t> fp16(state.range(0));
175 		std::vector<float> fp32(state.range(0));
176 		std::generate(fp16.begin(), fp16.end(),
177 			[&rng]{ return fp16_ieee_from_fp32_value(rng()); });
178 
179 		while (state.KeepRunning()) {
180 			uint16_t* input = fp16.data();
181 			benchmark::DoNotOptimize(input);
182 
183 			float* output = fp32.data();
184 			const size_t n = state.range(0);
185 			for (size_t i = 0; i < n; i += 8) {
186 				_mm256_storeu_ps(&output[i],
187 					_mm256_cvtph_ps(
188 						_mm_loadu_si128(static_cast<const __m128i*>(static_cast<const void*>(&input[i])))));
189 			}
190 
191 			benchmark::DoNotOptimize(output);
192 		}
193 		state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0)));
194 	}
195 	BENCHMARK(hardware_mm256_cvtph_ps)->RangeMultiplier(2)->Range(1<<10, 64<<20);
196 #endif
197 
198 #if defined(__ARM_NEON_FP) && (__ARM_NEON_FP & 0x2) || defined(__aarch64__)
hardware_vcvt_f32_f16(benchmark::State & state)199 	static void hardware_vcvt_f32_f16(benchmark::State& state) {
200 		const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
201 		auto rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::mt19937(seed));
202 
203 		std::vector<uint16_t> fp16(state.range(0));
204 		std::vector<float> fp32(state.range(0));
205 		std::generate(fp16.begin(), fp16.end(),
206 			[&rng]{ return fp16_ieee_from_fp32_value(rng()); });
207 
208 		while (state.KeepRunning()) {
209 			uint16_t* input = fp16.data();
210 			benchmark::DoNotOptimize(input);
211 
212 			float* output = fp32.data();
213 			const size_t n = state.range(0);
214 			#if defined(__aarch64__)
215 				const unsigned int fpcr = __builtin_aarch64_get_fpcr();
216 				/* Disable flush-to-zero (bit 24) and Alternative FP16 format (bit 26) */
217 				__builtin_aarch64_set_fpcr(fpcr & 0xF6FFFFFFu);
218 			#else
219 				unsigned int fpscr;
220 				__asm__ __volatile__ ("VMRS %[fpscr], fpscr" : [fpscr] "=r" (fpscr));
221 				/* Disable flush-to-zero (bit 24) and Alternative FP16 format (bit 26) */
222 				__asm__ __volatile__ ("VMSR fpscr, %[fpscr]" :
223 					: [fpscr] "r" (fpscr & 0xF6FFFFFFu));
224 			#endif
225 			for (size_t i = 0; i < n; i += 4) {
226 				vst1q_f32(&output[i],
227 					vcvt_f32_f16(
228 						(float16x4_t) vld1_u16(&input[i])));
229 			}
230 			#if defined(__aarch64__)
231 				__builtin_aarch64_set_fpcr(fpcr);
232 			#else
233 				__asm__ __volatile__ ("VMSR fpscr, %[fpscr]" :: [fpscr] "r" (fpscr));
234 			#endif
235 
236 			benchmark::DoNotOptimize(output);
237 		}
238 		state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0)));
239 	}
240 	BENCHMARK(hardware_vcvt_f32_f16)->RangeMultiplier(2)->Range(1<<10, 64<<20);
241 #endif
242 
243 #ifdef FP16_COMPARATIVE_BENCHMARKS
TH_halfbits2float(benchmark::State & state)244 	static void TH_halfbits2float(benchmark::State& state) {
245 		const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
246 		auto rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::mt19937(seed));
247 
248 		std::vector<uint16_t> fp16(state.range(0));
249 		std::vector<float> fp32(state.range(0));
250 		std::generate(fp16.begin(), fp16.end(),
251 			[&rng]{ return fp16_ieee_from_fp32_value(rng()); });
252 
253 		while (state.KeepRunning()) {
254 			uint16_t* input = fp16.data();
255 			benchmark::DoNotOptimize(input);
256 
257 			float* output = fp32.data();
258 			const size_t n = state.range(0);
259 			for (size_t i = 0; i < n; i++) {
260 				TH_halfbits2float(&input[i], &output[i]);
261 			}
262 
263 			benchmark::DoNotOptimize(output);
264 		}
265 		state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0)));
266 	}
267 	BENCHMARK(TH_halfbits2float)->RangeMultiplier(2)->Range(1<<10, 64<<20);
268 
npy_halfbits_to_floatbits(benchmark::State & state)269 	static void npy_halfbits_to_floatbits(benchmark::State& state) {
270 		const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
271 		auto rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::mt19937(seed));
272 
273 		std::vector<uint16_t> fp16(state.range(0));
274 		std::vector<uint32_t> fp32(state.range(0));
275 		std::generate(fp16.begin(), fp16.end(),
276 			[&rng]{ return fp16_ieee_from_fp32_value(rng()); });
277 
278 		while (state.KeepRunning()) {
279 			uint16_t* input = fp16.data();
280 			benchmark::DoNotOptimize(input);
281 
282 			uint32_t* output = fp32.data();
283 			const size_t n = state.range(0);
284 			for (size_t i = 0; i < n; i++) {
285 				output[i] = npy_halfbits_to_floatbits(input[i]);
286 			}
287 
288 			benchmark::DoNotOptimize(output);
289 		}
290 		state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0)));
291 	}
292 	BENCHMARK(npy_halfbits_to_floatbits)->RangeMultiplier(2)->Range(1<<10, 64<<20);
293 
Eigen_half_to_float(benchmark::State & state)294 	static void Eigen_half_to_float(benchmark::State& state) {
295 		const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
296 		auto rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::mt19937(seed));
297 
298 		std::vector<uint16_t> fp16(state.range(0));
299 		std::vector<float> fp32(state.range(0));
300 		std::generate(fp16.begin(), fp16.end(),
301 			[&rng]{ return fp16_ieee_from_fp32_value(rng()); });
302 
303 		while (state.KeepRunning()) {
304 			uint16_t* input = fp16.data();
305 			benchmark::DoNotOptimize(input);
306 
307 			float* output = fp32.data();
308 			const size_t n = state.range(0);
309 			for (size_t i = 0; i < n; i++) {
310 				output[i] =
311 					Eigen::half_impl::half_to_float(
312 						Eigen::half_impl::raw_uint16_to_half(input[i]));
313 			}
314 
315 			benchmark::DoNotOptimize(output);
316 		}
317 		state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0)));
318 	}
319 	BENCHMARK(Eigen_half_to_float)->RangeMultiplier(2)->Range(1<<10, 64<<20);
320 
Float16Compressor_decompress(benchmark::State & state)321 	static void Float16Compressor_decompress(benchmark::State& state) {
322 		const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
323 		auto rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::mt19937(seed));
324 
325 		std::vector<uint16_t> fp16(state.range(0));
326 		std::vector<float> fp32(state.range(0));
327 		std::generate(fp16.begin(), fp16.end(),
328 			[&rng]{ return fp16_ieee_from_fp32_value(rng()); });
329 
330 		while (state.KeepRunning()) {
331 			uint16_t* input = fp16.data();
332 			benchmark::DoNotOptimize(input);
333 
334 			float* output = fp32.data();
335 			const size_t n = state.range(0);
336 			for (size_t i = 0; i < n; i++) {
337 				output[i] = Float16Compressor::decompress(input[i]);
338 			}
339 
340 			benchmark::DoNotOptimize(output);
341 		}
342 		state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0)));
343 	}
344 	BENCHMARK(Float16Compressor_decompress)->RangeMultiplier(2)->Range(1<<10, 64<<20);
345 
half_float_detail_half2float_table(benchmark::State & state)346 	static void half_float_detail_half2float_table(benchmark::State& state) {
347 		const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
348 		auto rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::mt19937(seed));
349 
350 		std::vector<uint16_t> fp16(state.range(0));
351 		std::vector<float> fp32(state.range(0));
352 		std::generate(fp16.begin(), fp16.end(),
353 			[&rng]{ return fp16_ieee_from_fp32_value(rng()); });
354 
355 		while (state.KeepRunning()) {
356 			uint16_t* input = fp16.data();
357 			benchmark::DoNotOptimize(input);
358 
359 			float* output = fp32.data();
360 			const size_t n = state.range(0);
361 			for (size_t i = 0; i < n; i++) {
362 				output[i] = half_float::detail::half2float_impl(input[i],
363 					half_float::detail::true_type());
364 			}
365 
366 			benchmark::DoNotOptimize(output);
367 		}
368 		state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0)));
369 	}
370 	BENCHMARK(half_float_detail_half2float_table)->RangeMultiplier(2)->Range(1<<10, 64<<20);
371 
half_float_detail_half2float_branch(benchmark::State & state)372 	static void half_float_detail_half2float_branch(benchmark::State& state) {
373 		const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
374 		auto rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::mt19937(seed));
375 
376 		std::vector<uint16_t> fp16(state.range(0));
377 		std::vector<float> fp32(state.range(0));
378 		std::generate(fp16.begin(), fp16.end(),
379 			[&rng]{ return fp16_ieee_from_fp32_value(rng()); });
380 
381 		while (state.KeepRunning()) {
382 			uint16_t* input = fp16.data();
383 			benchmark::DoNotOptimize(input);
384 
385 			float* output = fp32.data();
386 			const size_t n = state.range(0);
387 			for (size_t i = 0; i < n; i++) {
388 				output[i] = half_float::detail::half2float_impl(input[i],
389 					half_float::detail::false_type());
390 			}
391 
392 			benchmark::DoNotOptimize(output);
393 		}
394 		state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0)));
395 	}
396 	BENCHMARK(half_float_detail_half2float_branch)->RangeMultiplier(2)->Range(1<<10, 64<<20);
397 #endif
398 
399 BENCHMARK_MAIN();
400