1 // Copyright 2019 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5
6 #include <algorithm>
7 #include <cmath>
8 #include <functional>
9 #include <random>
10 #include <vector>
11
12 #include <benchmark/benchmark.h>
13 #include "bench/utils.h"
14
15 #include <xnnpack.h>
16 #include <xnnpack/aligned-allocator.h>
17 #include <xnnpack/common.h>
18 #include <xnnpack/microfnptr.h>
19 #include <xnnpack/raddextexp.h>
20
21
f32_raddextexp(benchmark::State & state,xnn_f32_raddextexp_ukernel_function raddextexp,benchmark::utils::IsaCheckFunction isa_check=nullptr)22 static void f32_raddextexp(
23 benchmark::State& state,
24 xnn_f32_raddextexp_ukernel_function raddextexp,
25 benchmark::utils::IsaCheckFunction isa_check = nullptr)
26 {
27 if (isa_check && !isa_check(state)) {
28 return;
29 }
30
31 const size_t elements = state.range(0);
32 const size_t cache_line_size_max = 128;
33 const size_t packed_elements = benchmark::utils::RoundUp(elements, cache_line_size_max / sizeof(float));
34
35 std::random_device random_device;
36 auto rng = std::mt19937(random_device());
37 auto f32rng = std::bind(std::uniform_real_distribution<float>(-1000.0f, 1000.0f), std::ref(rng));
38
39 const size_t num_buffers = 1 +
40 benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), packed_elements * sizeof(float));
41 std::vector<float, AlignedAllocator<float, 64>> x(elements);
42
43 std::generate(x.begin(), x.end(), std::ref(f32rng));
44
45 benchmark::utils::DisableDenormals();
46
47 size_t buffer_index = 0;
48 for (auto _ : state) {
49 if (++buffer_index == num_buffers) {
50 buffer_index = 0;
51 }
52
53 float y_sum[2] = { nanf(""), nanf("") };
54 raddextexp(elements * sizeof(float), x.data(), y_sum);
55 }
56
57 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
58 if (cpu_frequency != 0) {
59 state.counters["cpufreq"] = cpu_frequency;
60 }
61
62 const size_t elements_per_iteration = elements;
63 state.counters["elements"] =
64 benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
65
66 const size_t bytes_per_iteration = 2 * elements * sizeof(float);
67 state.counters["bytes"] =
68 benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
69 }
70
CharacteristicArguments(benchmark::internal::Benchmark * b)71 static void CharacteristicArguments(benchmark::internal::Benchmark* b) {
72 b->ArgName("N");
73 for (int32_t n = 10000; n <= 100000000; n *= 10) {
74 b->Arg(n);
75 }
76 }
77
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Registers f32_raddextexp for the micro-kernel
// xnn_f32_raddextexp_ukernel__<variant>, using <variant> as the benchmark case
// name and benchmark::utils::<isa_check> as the ISA check. Every registration
// applies CharacteristicArguments and is measured in real time.
#define XNN_BENCH_F32_RADDEXTEXP(variant, isa_check)      \
  BENCHMARK_CAPTURE(f32_raddextexp, variant,              \
                    xnn_f32_raddextexp_ukernel__##variant, \
                    benchmark::utils::isa_check)          \
    ->Apply(CharacteristicArguments)->UseRealTime()

// Kernels registered with the AVX512F check.
XNN_BENCH_F32_RADDEXTEXP(avx512f_p5_scalef_x128, CheckAVX512F);
XNN_BENCH_F32_RADDEXTEXP(avx512f_p5_scalef_x128_acc2, CheckAVX512F);
XNN_BENCH_F32_RADDEXTEXP(avx512f_p5_scalef_x128_acc4, CheckAVX512F);

XNN_BENCH_F32_RADDEXTEXP(avx512f_p5_scalef_x144, CheckAVX512F);
XNN_BENCH_F32_RADDEXTEXP(avx512f_p5_scalef_x144_acc3, CheckAVX512F);

XNN_BENCH_F32_RADDEXTEXP(avx512f_p5_scalef_x160, CheckAVX512F);
XNN_BENCH_F32_RADDEXTEXP(avx512f_p5_scalef_x160_acc2, CheckAVX512F);
XNN_BENCH_F32_RADDEXTEXP(avx512f_p5_scalef_x160_acc5, CheckAVX512F);

XNN_BENCH_F32_RADDEXTEXP(avx512f_p5_scalef_x192, CheckAVX512F);
XNN_BENCH_F32_RADDEXTEXP(avx512f_p5_scalef_x192_acc2, CheckAVX512F);
XNN_BENCH_F32_RADDEXTEXP(avx512f_p5_scalef_x192_acc3, CheckAVX512F);
XNN_BENCH_F32_RADDEXTEXP(avx512f_p5_scalef_x192_acc6, CheckAVX512F);

// Kernels registered with the AVX2 check.
XNN_BENCH_F32_RADDEXTEXP(avx2_p5_x64, CheckAVX2);
XNN_BENCH_F32_RADDEXTEXP(avx2_p5_x64_acc2, CheckAVX2);
XNN_BENCH_F32_RADDEXTEXP(avx2_p5_x64_acc4, CheckAVX2);

XNN_BENCH_F32_RADDEXTEXP(avx2_p5_x72, CheckAVX2);
XNN_BENCH_F32_RADDEXTEXP(avx2_p5_x72_acc3, CheckAVX2);

XNN_BENCH_F32_RADDEXTEXP(avx2_p5_x80, CheckAVX2);
XNN_BENCH_F32_RADDEXTEXP(avx2_p5_x80_acc2, CheckAVX2);
XNN_BENCH_F32_RADDEXTEXP(avx2_p5_x80_acc5, CheckAVX2);

XNN_BENCH_F32_RADDEXTEXP(avx2_p5_x96, CheckAVX2);
XNN_BENCH_F32_RADDEXTEXP(avx2_p5_x96_acc2, CheckAVX2);
XNN_BENCH_F32_RADDEXTEXP(avx2_p5_x96_acc3, CheckAVX2);
XNN_BENCH_F32_RADDEXTEXP(avx2_p5_x96_acc6, CheckAVX2);

#undef XNN_BENCH_F32_RADDEXTEXP
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
159
160 #ifndef XNNPACK_BENCHMARK_NO_MAIN
161 BENCHMARK_MAIN();
162 #endif
163