• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2019 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5 
6 #include <algorithm>
7 #include <cmath>
8 #include <functional>
9 #include <random>
10 #include <vector>
11 
12 #include <benchmark/benchmark.h>
13 #include "bench/utils.h"
14 
15 #include <xnnpack.h>
16 #include <xnnpack/aligned-allocator.h>
17 #include <xnnpack/common.h>
18 #include <xnnpack/microfnptr.h>
19 #include <xnnpack/raddextexp.h>
20 
21 
f32_raddextexp(benchmark::State & state,xnn_f32_raddextexp_ukernel_function raddextexp,benchmark::utils::IsaCheckFunction isa_check=nullptr)22 static void f32_raddextexp(
23   benchmark::State& state,
24   xnn_f32_raddextexp_ukernel_function raddextexp,
25   benchmark::utils::IsaCheckFunction isa_check = nullptr)
26 {
27   if (isa_check && !isa_check(state)) {
28     return;
29   }
30 
31   const size_t elements = state.range(0);
32   const size_t cache_line_size_max = 128;
33   const size_t packed_elements = benchmark::utils::RoundUp(elements, cache_line_size_max / sizeof(float));
34 
35   std::random_device random_device;
36   auto rng = std::mt19937(random_device());
37   auto f32rng = std::bind(std::uniform_real_distribution<float>(-1000.0f, 1000.0f), std::ref(rng));
38 
39   const size_t num_buffers = 1 +
40     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), packed_elements * sizeof(float));
41   std::vector<float, AlignedAllocator<float, 64>> x(elements);
42 
43   std::generate(x.begin(), x.end(), std::ref(f32rng));
44 
45   benchmark::utils::DisableDenormals();
46 
47   size_t buffer_index = 0;
48   for (auto _ : state) {
49     if (++buffer_index == num_buffers) {
50       buffer_index = 0;
51     }
52 
53     float y_sum[2] = { nanf(""), nanf("") };
54     raddextexp(elements * sizeof(float), x.data(), y_sum);
55   }
56 
57   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
58   if (cpu_frequency != 0) {
59     state.counters["cpufreq"] = cpu_frequency;
60   }
61 
62   const size_t elements_per_iteration = elements;
63   state.counters["elements"] =
64     benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
65 
66   const size_t bytes_per_iteration = 2 * elements * sizeof(float);
67   state.counters["bytes"] =
68     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
69 }
70 
CharacteristicArguments(benchmark::internal::Benchmark * b)71 static void CharacteristicArguments(benchmark::internal::Benchmark* b) {
72   b->ArgName("N");
73   for (int32_t n = 10000; n <= 100000000; n *= 10) {
74     b->Arg(n);
75   }
76 }
77 
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // Registers one f32_raddextexp benchmark case named `variant`, bound to the
  // micro-kernel xnn_f32_raddextexp_ukernel__<variant> and guarded by the ISA
  // check `check`. Every case uses CharacteristicArguments and real time.
  #define XNN_BENCHMARK_F32_RADDEXTEXP(variant, check)                        \
    BENCHMARK_CAPTURE(f32_raddextexp, variant,                                \
                      xnn_f32_raddextexp_ukernel__##variant, check)           \
      ->Apply(CharacteristicArguments)->UseRealTime()

  // AVX512F micro-kernels, grouped by the x<N> suffix of the kernel name.
  XNN_BENCHMARK_F32_RADDEXTEXP(avx512f_p5_scalef_x128, benchmark::utils::CheckAVX512F);
  XNN_BENCHMARK_F32_RADDEXTEXP(avx512f_p5_scalef_x128_acc2, benchmark::utils::CheckAVX512F);
  XNN_BENCHMARK_F32_RADDEXTEXP(avx512f_p5_scalef_x128_acc4, benchmark::utils::CheckAVX512F);

  XNN_BENCHMARK_F32_RADDEXTEXP(avx512f_p5_scalef_x144, benchmark::utils::CheckAVX512F);
  XNN_BENCHMARK_F32_RADDEXTEXP(avx512f_p5_scalef_x144_acc3, benchmark::utils::CheckAVX512F);

  XNN_BENCHMARK_F32_RADDEXTEXP(avx512f_p5_scalef_x160, benchmark::utils::CheckAVX512F);
  XNN_BENCHMARK_F32_RADDEXTEXP(avx512f_p5_scalef_x160_acc2, benchmark::utils::CheckAVX512F);
  XNN_BENCHMARK_F32_RADDEXTEXP(avx512f_p5_scalef_x160_acc5, benchmark::utils::CheckAVX512F);

  XNN_BENCHMARK_F32_RADDEXTEXP(avx512f_p5_scalef_x192, benchmark::utils::CheckAVX512F);
  XNN_BENCHMARK_F32_RADDEXTEXP(avx512f_p5_scalef_x192_acc2, benchmark::utils::CheckAVX512F);
  XNN_BENCHMARK_F32_RADDEXTEXP(avx512f_p5_scalef_x192_acc3, benchmark::utils::CheckAVX512F);
  XNN_BENCHMARK_F32_RADDEXTEXP(avx512f_p5_scalef_x192_acc6, benchmark::utils::CheckAVX512F);

  // AVX2 micro-kernels, grouped the same way.
  XNN_BENCHMARK_F32_RADDEXTEXP(avx2_p5_x64, benchmark::utils::CheckAVX2);
  XNN_BENCHMARK_F32_RADDEXTEXP(avx2_p5_x64_acc2, benchmark::utils::CheckAVX2);
  XNN_BENCHMARK_F32_RADDEXTEXP(avx2_p5_x64_acc4, benchmark::utils::CheckAVX2);

  XNN_BENCHMARK_F32_RADDEXTEXP(avx2_p5_x72, benchmark::utils::CheckAVX2);
  XNN_BENCHMARK_F32_RADDEXTEXP(avx2_p5_x72_acc3, benchmark::utils::CheckAVX2);

  XNN_BENCHMARK_F32_RADDEXTEXP(avx2_p5_x80, benchmark::utils::CheckAVX2);
  XNN_BENCHMARK_F32_RADDEXTEXP(avx2_p5_x80_acc2, benchmark::utils::CheckAVX2);
  XNN_BENCHMARK_F32_RADDEXTEXP(avx2_p5_x80_acc5, benchmark::utils::CheckAVX2);

  XNN_BENCHMARK_F32_RADDEXTEXP(avx2_p5_x96, benchmark::utils::CheckAVX2);
  XNN_BENCHMARK_F32_RADDEXTEXP(avx2_p5_x96_acc2, benchmark::utils::CheckAVX2);
  XNN_BENCHMARK_F32_RADDEXTEXP(avx2_p5_x96_acc3, benchmark::utils::CheckAVX2);
  XNN_BENCHMARK_F32_RADDEXTEXP(avx2_p5_x96_acc6, benchmark::utils::CheckAVX2);

  #undef XNN_BENCHMARK_F32_RADDEXTEXP
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
159 
160 #ifndef XNNPACK_BENCHMARK_NO_MAIN
161 BENCHMARK_MAIN();
162 #endif
163