• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 #include <algorithm>
2 #include <cfloat>
3 #include <chrono>
4 #include <cmath>
5 #include <functional>
6 #include <random>
7 #include <vector>
8 
9 #include "bench/utils.h"
10 #include <xnnpack/common.h>
11 #include <xnnpack/params.h>
12 #include <xnnpack/raddexpminusmax.h>
13 #include <xnnpack/raddextexp.h>
14 #include <xnnpack/raddstoreexpminusmax.h>
15 #include <xnnpack/rmax.h>
16 #include <xnnpack/vscale.h>
17 #include <xnnpack/vscaleexpminusmax.h>
18 #include <xnnpack/vscaleextexp.h>
19 
20 #include <benchmark/benchmark.h>
21 
22 
ThreePassSoftMaxWithRecomputing(benchmark::State & state,xnn_f32_rmax_ukernel_function rmax,xnn_f32_raddexpminusmax_ukernel_function raddexpminusmax,xnn_f32_vscaleexpminusmax_ukernel_function vscaleexpminusmax,benchmark::utils::IsaCheckFunction isa_check=nullptr)23 static void ThreePassSoftMaxWithRecomputing(
24   benchmark::State& state,
25   xnn_f32_rmax_ukernel_function rmax,
26   xnn_f32_raddexpminusmax_ukernel_function raddexpminusmax,
27   xnn_f32_vscaleexpminusmax_ukernel_function vscaleexpminusmax,
28   benchmark::utils::IsaCheckFunction isa_check = nullptr)
29 {
30   if (isa_check && !isa_check(state)) {
31     return;
32   }
33 
34   const size_t n = state.range(0);
35   const size_t cache_line_size_max = 128;
36   const size_t packed_n = benchmark::utils::RoundUp(n, cache_line_size_max / sizeof(float));
37 
38   std::random_device random_device;
39   auto rng = std::mt19937(random_device());
40   auto f32rng = std::bind(std::uniform_real_distribution<float>(-1000.0f, 1000.0f), rng);
41 
42   const size_t num_buffers = 1 +
43     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), packed_n * sizeof(float));
44   std::vector<float> x(n);
45   std::vector<float> y(packed_n * num_buffers);
46 
47   std::generate(x.begin(), x.end(), std::ref(f32rng));
48 
49   benchmark::utils::DisableDenormals();
50 
51   size_t buffer_index = 0;
52   for (auto _ : state) {
53     benchmark::utils::PrefetchToL1(x.data(), x.size() * sizeof(float));
54     if (++buffer_index == num_buffers) {
55       buffer_index = 0;
56     }
57 
58     const auto start = std::chrono::high_resolution_clock::now();
59     float x_max = nanf("");
60     rmax(n * sizeof(float), x.data(), &x_max);
61     float y_sum = nanf("");
62     raddexpminusmax(n * sizeof(float), x.data(), &y_sum, x_max);
63     vscaleexpminusmax(n * sizeof(float), x.data(), y.data() + packed_n * buffer_index, x_max, 1.0f / y_sum);
64     const auto end = std::chrono::high_resolution_clock::now();
65 
66     const auto elapsed_seconds =
67       std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
68     state.SetIterationTime(elapsed_seconds.count());
69   }
70 
71   state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
72   state.counters["elements"] =
73     benchmark::Counter(uint64_t(state.iterations()) * n, benchmark::Counter::kIsRate);
74   state.counters["bytes"] =
75     benchmark::Counter(uint64_t(state.iterations()) * 2 * sizeof(float) * n, benchmark::Counter::kIsRate);
76 }
77 
ThreePassSoftMaxWithReloading(benchmark::State & state,xnn_f32_rmax_ukernel_function rmax,xnn_f32_raddstoreexpminusmax_ukernel_function raddstoreexpminusmax,xnn_f32_vscale_ukernel_function vscale,benchmark::utils::IsaCheckFunction isa_check=nullptr)78 static void ThreePassSoftMaxWithReloading(
79   benchmark::State& state,
80   xnn_f32_rmax_ukernel_function rmax,
81   xnn_f32_raddstoreexpminusmax_ukernel_function raddstoreexpminusmax,
82   xnn_f32_vscale_ukernel_function vscale,
83   benchmark::utils::IsaCheckFunction isa_check = nullptr)
84 {
85   if (isa_check && !isa_check(state)) {
86     return;
87   }
88 
89   const size_t n = state.range(0);
90   const size_t cache_line_size_max = 128;
91   const size_t packed_n = benchmark::utils::RoundUp(n, cache_line_size_max / sizeof(float));
92 
93   std::random_device random_device;
94   auto rng = std::mt19937(random_device());
95   auto f32rng = std::bind(std::uniform_real_distribution<float>(-1000.0f, 1000.0f), rng);
96 
97   const size_t num_buffers = 1 +
98     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), packed_n * sizeof(float));
99   std::vector<float> x(n);
100   std::vector<float> y(packed_n * num_buffers);
101 
102   std::generate(x.begin(), x.end(), std::ref(f32rng));
103 
104   benchmark::utils::DisableDenormals();
105 
106   size_t buffer_index = 0;
107   for (auto _ : state) {
108     benchmark::utils::PrefetchToL1(x.data(), x.size() * sizeof(float));
109     if (++buffer_index == num_buffers) {
110       buffer_index = 0;
111     }
112 
113     const auto start = std::chrono::high_resolution_clock::now();
114     float x_max = nanf("");
115     rmax(n * sizeof(float), x.data(), &x_max);
116     float y_sum = nanf("");
117     raddstoreexpminusmax(n * sizeof(float), x.data(), y.data() + packed_n * buffer_index, &y_sum, x_max);
118     vscale(n * sizeof(float), y.data() + packed_n * buffer_index, y.data() + packed_n * buffer_index, 1.0f / y_sum);
119     const auto end = std::chrono::high_resolution_clock::now();
120 
121     const auto elapsed_seconds =
122       std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
123     state.SetIterationTime(elapsed_seconds.count());
124   }
125 
126   state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
127   state.counters["elements"] =
128     benchmark::Counter(uint64_t(state.iterations()) * n, benchmark::Counter::kIsRate);
129   state.counters["bytes"] =
130     benchmark::Counter(uint64_t(state.iterations()) * 2 * sizeof(float) * n, benchmark::Counter::kIsRate);
131 }
132 
TwoPassSoftMax(benchmark::State & state,xnn_f32_raddextexp_ukernel_function raddextexp,xnn_f32_vscaleextexp_ukernel_function vscaleextexp,benchmark::utils::IsaCheckFunction isa_check=nullptr)133 static void TwoPassSoftMax(
134   benchmark::State& state,
135   xnn_f32_raddextexp_ukernel_function raddextexp,
136   xnn_f32_vscaleextexp_ukernel_function vscaleextexp,
137   benchmark::utils::IsaCheckFunction isa_check = nullptr)
138 {
139   if (isa_check && !isa_check(state)) {
140     return;
141   }
142 
143   const size_t n = state.range(0);
144   const size_t cache_line_size_max = 128;
145   const size_t packed_n = benchmark::utils::RoundUp(n, cache_line_size_max / sizeof(float));
146 
147   std::random_device random_device;
148   auto rng = std::mt19937(random_device());
149   auto f32rng = std::bind(std::uniform_real_distribution<float>(-1000.0f, 1000.0f), rng);
150 
151   const size_t num_buffers = 1 +
152     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), packed_n * sizeof(float));
153   std::vector<float> x(n);
154   std::vector<float> y(packed_n * num_buffers);
155 
156   std::generate(x.begin(), x.end(), std::ref(f32rng));
157 
158   benchmark::utils::DisableDenormals();
159 
160   size_t buffer_index = 0;
161   for (auto _ : state) {
162     benchmark::utils::PrefetchToL1(x.data(), x.size() * sizeof(float));
163     if (++buffer_index == num_buffers) {
164       buffer_index = 0;
165     }
166 
167     const auto start = std::chrono::high_resolution_clock::now();
168     float scale[2];
169     raddextexp(n * sizeof(float), x.data(), scale);
170     vscaleextexp(n * sizeof(float), x.data(), y.data() + packed_n * buffer_index, 1.0f / scale[0], -scale[1]);
171     const auto end = std::chrono::high_resolution_clock::now();
172 
173     const auto elapsed_seconds =
174       std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
175     state.SetIterationTime(elapsed_seconds.count());
176   }
177 
178   state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
179   state.counters["elements"] =
180     benchmark::Counter(uint64_t(state.iterations()) * n, benchmark::Counter::kIsRate);
181   state.counters["bytes"] =
182     benchmark::Counter(uint64_t(state.iterations()) * 2 * sizeof(float) * n, benchmark::Counter::kIsRate);
183 }
184 
CharacteristicArguments(benchmark::internal::Benchmark * b)185 static void CharacteristicArguments(benchmark::internal::Benchmark* b) {
186   for (int32_t n = 1000; n <= 100000000; n *= 10) {
187     b->Arg(n);
188     b->Arg(3 * n);
189   }
190 }
191 
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // Every registration below sweeps CharacteristicArguments and reports manual
  // time, matching the SetIterationTime() calls in the benchmark bodies.

  // Kernel variants auto-tuned for a mix (label: avx2_blend).
  BENCHMARK_CAPTURE(TwoPassSoftMax, avx2_blend,
                    xnn_f32_raddextexp_ukernel__avx2_p5_x96,
                    xnn_f32_vscaleextexp_ukernel__avx2_p5_x40,
                    benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)
    ->UseManualTime();
  BENCHMARK_CAPTURE(ThreePassSoftMaxWithRecomputing, avx2_blend,
                    xnn_f32_rmax_ukernel__avx,
                    xnn_f32_raddexpminusmax_ukernel__avx2_p5_x96,
                    xnn_f32_vscaleexpminusmax_ukernel__avx2_p5_x24,
                    benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)
    ->UseManualTime();
  BENCHMARK_CAPTURE(ThreePassSoftMaxWithReloading, avx2_blend,
                    xnn_f32_rmax_ukernel__avx,
                    xnn_f32_raddstoreexpminusmax_ukernel__avx2_p5_x64_acc2,
                    xnn_f32_vscale_ukernel__avx_unroll32,
                    benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)
    ->UseManualTime();

  // Kernel variants auto-tuned for Broadwell.
  BENCHMARK_CAPTURE(TwoPassSoftMax, avx2_broadwell,
                    xnn_f32_raddextexp_ukernel__avx2_p5_x96,
                    xnn_f32_vscaleextexp_ukernel__avx2_p5_x32,
                    benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)
    ->UseManualTime();
  BENCHMARK_CAPTURE(ThreePassSoftMaxWithRecomputing, avx2_broadwell,
                    xnn_f32_rmax_ukernel__avx,
                    xnn_f32_raddexpminusmax_ukernel__avx2_p5_x96,
                    xnn_f32_vscaleexpminusmax_ukernel__avx2_p5_x24,
                    benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)
    ->UseManualTime();
  BENCHMARK_CAPTURE(ThreePassSoftMaxWithReloading, avx2_broadwell,
                    xnn_f32_rmax_ukernel__avx,
                    xnn_f32_raddstoreexpminusmax_ukernel__avx2_p5_x64,
                    xnn_f32_vscale_ukernel__avx_unroll32,
                    benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)
    ->UseManualTime();

  // Kernel variants auto-tuned for Zen 2.
  BENCHMARK_CAPTURE(TwoPassSoftMax, avx2_zen2,
                    xnn_f32_raddextexp_ukernel__avx2_p5_x72,
                    xnn_f32_vscaleextexp_ukernel__avx2_p5_x40,
                    benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)
    ->UseManualTime();
  BENCHMARK_CAPTURE(ThreePassSoftMaxWithRecomputing, avx2_zen2,
                    xnn_f32_rmax_ukernel__avx,
                    xnn_f32_raddexpminusmax_ukernel__avx2_p5_x80,
                    xnn_f32_vscaleexpminusmax_ukernel__avx2_p5_x16,
                    benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)
    ->UseManualTime();
  BENCHMARK_CAPTURE(ThreePassSoftMaxWithReloading, avx2_zen2,
                    xnn_f32_rmax_ukernel__avx,
                    xnn_f32_raddstoreexpminusmax_ukernel__avx2_p5_x64,
                    xnn_f32_vscale_ukernel__avx_unroll32,
                    benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)
    ->UseManualTime();

  // Kernel variants auto-tuned for Skylake.
  BENCHMARK_CAPTURE(TwoPassSoftMax, avx2_skylake,
                    xnn_f32_raddextexp_ukernel__avx2_p5_x64,
                    xnn_f32_vscaleextexp_ukernel__avx2_p5_x40,
                    benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)
    ->UseManualTime();
  BENCHMARK_CAPTURE(ThreePassSoftMaxWithRecomputing, avx2_skylake,
                    xnn_f32_rmax_ukernel__avx,
                    xnn_f32_raddexpminusmax_ukernel__avx2_p5_x64_acc2,
                    xnn_f32_vscaleexpminusmax_ukernel__avx2_p5_x24,
                    benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)
    ->UseManualTime();
  BENCHMARK_CAPTURE(ThreePassSoftMaxWithReloading, avx2_skylake,
                    xnn_f32_rmax_ukernel__avx,
                    xnn_f32_raddstoreexpminusmax_ukernel__avx2_p5_x80_acc2,
                    xnn_f32_vscale_ukernel__avx_unroll32,
                    benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)
    ->UseManualTime();

  // AVX512F kernel variants (label: avx512f_skylake), gated on CheckAVX512F.
  BENCHMARK_CAPTURE(TwoPassSoftMax, avx512f_skylake,
                    xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144_acc3,
                    xnn_f32_vscaleextexp_ukernel__avx512f_p5_scalef_x16,
                    benchmark::utils::CheckAVX512F)
    ->Apply(CharacteristicArguments)
    ->UseManualTime();
  BENCHMARK_CAPTURE(ThreePassSoftMaxWithRecomputing, avx512f_skylake,
                    xnn_f32_rmax_ukernel__avx512f,
                    xnn_f32_raddexpminusmax_ukernel__avx512f_p5_scalef_x128_acc4,
                    xnn_f32_vscaleexpminusmax_ukernel__avx512f_p5_scalef_x16,
                    benchmark::utils::CheckAVX512F)
    ->Apply(CharacteristicArguments)
    ->UseManualTime();
  BENCHMARK_CAPTURE(ThreePassSoftMaxWithReloading, avx512f_skylake,
                    xnn_f32_rmax_ukernel__avx512f,
                    xnn_f32_raddstoreexpminusmax_ukernel__avx512f_p5_scalef_x128_acc2,
                    xnn_f32_vscale_ukernel__avx512f_unroll64,
                    benchmark::utils::CheckAVX512F)
    ->Apply(CharacteristicArguments)
    ->UseManualTime();
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
272 
273 #ifndef XNNPACK_BENCHMARK_NO_MAIN
274 BENCHMARK_MAIN();
275 #endif
276