1 // Copyright 2022 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5
6 #include <algorithm>
7 #include <cmath>
8 #include <functional>
9 #include <random>
10 #include <vector>
11
12 #include <benchmark/benchmark.h>
13 #include <fp16/fp16.h>
14 #include "bench/utils.h"
15
16 #include <xnnpack.h>
17 #include <xnnpack/aligned-allocator.h>
18 #include <xnnpack/common.h>
19 #include <xnnpack/microfnptr.h>
20 #include <xnnpack/microparams-init.h>
21 #include <xnnpack/vunary.h>
22
23
f16_velu(benchmark::State & state,xnn_f16_velu_ukernel_function elu,xnn_init_f16_elu_params_fn init_params,benchmark::utils::IsaCheckFunction isa_check=nullptr)24 static void f16_velu(
25 benchmark::State& state,
26 xnn_f16_velu_ukernel_function elu,
27 xnn_init_f16_elu_params_fn init_params,
28 benchmark::utils::IsaCheckFunction isa_check = nullptr)
29 {
30 if (isa_check && !isa_check(state)) {
31 return;
32 }
33
34 const size_t num_elements = state.range(0);
35
36 std::random_device random_device;
37 auto rng = std::mt19937(random_device());
38 auto f32rng = std::bind(std::uniform_real_distribution<float>(-9.0f, 9.0f), std::ref(rng));
39 auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
40
41 std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> x(num_elements);
42 std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> y(num_elements);
43 std::generate(x.begin(), x.end(), std::ref(f16rng));
44 std::fill(y.begin(), y.end(), UINT16_C(0x7E00) /* NaN */);
45
46 union xnn_f16_elu_params params;
47 init_params(¶ms,
48 UINT16_C(0x3C00) /* prescale = 1.0h */,
49 UINT16_C(0x3C00) /* alpha = 1.0h */,
50 UINT16_C(0x3C00) /* beta = 1.0h */);
51 for (auto _ : state) {
52 elu(num_elements * sizeof(uint16_t), x.data(), y.data(), ¶ms);
53 }
54
55 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
56 if (cpu_frequency != 0) {
57 state.counters["cpufreq"] = cpu_frequency;
58 }
59
60 const size_t elements_per_iteration = num_elements;
61 state.counters["elements"] =
62 benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
63
64 const size_t bytes_per_iteration = 2 * num_elements * sizeof(uint16_t);
65 state.counters["bytes"] =
66 benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
67 }
68
69
// Registers the neonfp16arith_rr1_p3 ELU kernels (x8 and x16 variants).
// Compiled only when XNN_ENABLE_ARM_FP16 is set on an ARM or ARM64 target;
// each benchmark is additionally gated by CheckNEONFP16ARITH.
#if XNN_ENABLE_ARM_FP16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
  BENCHMARK_CAPTURE(f16_velu, neonfp16arith_rr1_p3_x8,
                    xnn_f16_velu_ukernel__neonfp16arith_rr1_p3_x8,
                    xnn_init_f16_elu_neonfp16arith_rr1_p3_params,
                    benchmark::utils::CheckNEONFP16ARITH)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f16_velu, neonfp16arith_rr1_p3_x16,
                    xnn_f16_velu_ukernel__neonfp16arith_rr1_p3_x16,
                    xnn_init_f16_elu_neonfp16arith_rr1_p3_params,
                    benchmark::utils::CheckNEONFP16ARITH)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
    ->UseRealTime();
#endif  // XNN_ENABLE_ARM_FP16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
84
85
// Registers the avx2_rr1_p3 ELU kernels (x8 and x16 variants).
// Compiled only for x86 / x86-64 targets; each benchmark is additionally
// gated by CheckAVX2.
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  BENCHMARK_CAPTURE(f16_velu, avx2_rr1_p3_x8,
                    xnn_f16_velu_ukernel__avx2_rr1_p3_x8,
                    xnn_init_f16_elu_avx2_rr1_p3_params,
                    benchmark::utils::CheckAVX2)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f16_velu, avx2_rr1_p3_x16,
                    xnn_f16_velu_ukernel__avx2_rr1_p3_x16,
                    xnn_init_f16_elu_avx2_rr1_p3_params,
                    benchmark::utils::CheckAVX2)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
    ->UseRealTime();
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
100
101
102 #ifndef XNNPACK_BENCHMARK_NO_MAIN
103 BENCHMARK_MAIN();
104 #endif
105