• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 #include <algorithm>
2 #include <cfloat>
3 #include <chrono>
4 #include <cmath>
5 #include <functional>
6 #include <random>
7 #include <vector>
8 
9 #include "bench/utils.h"
10 #include <xnnpack/common.h>
11 #include <xnnpack/params.h>
12 #include <xnnpack/params-init.h>
13 #include <xnnpack/raddexpminusmax.h>
14 #include <xnnpack/raddextexp.h>
15 #include <xnnpack/raddstoreexpminusmax.h>
16 #include <xnnpack/rmax.h>
17 #include <xnnpack/vbinary.h>
18 #include <xnnpack/vscaleexpminusmax.h>
19 #include <xnnpack/vscaleextexp.h>
20 
21 #include <benchmark/benchmark.h>
22 #ifdef BENCHMARK_INTEL_DNNL
23 #include <dnnl.h>
24 #endif  // BENCHMARK_INTEL_DNNL
25 
26 
27 #ifdef BENCHMARK_INTEL_DNNL
DNNLSoftArgMax(benchmark::State & state)28 static void DNNLSoftArgMax(
29   benchmark::State& state)
30 {
31   const size_t elements = state.range(0);
32   const size_t cache_line_size_max = 128;
33   const size_t packed_elements = benchmark::utils::RoundUp(elements, cache_line_size_max / sizeof(float));
34 
35   std::random_device random_device;
36   auto rng = std::mt19937(random_device());
37   auto f32rng = std::bind(std::uniform_real_distribution<float>(-1000.0f, 1000.0f), std::ref(rng));
38 
39   const size_t num_buffers = 1 +
40     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), packed_elements * sizeof(float));
41   std::vector<float> x(elements);
42   std::vector<float> y(packed_elements * num_buffers);
43 
44   std::generate(x.begin(), x.end(), std::ref(f32rng));
45 
46   dnnl_engine_t engine;
47   if (dnnl_engine_create(&engine, dnnl_cpu, 0) != dnnl_success) {
48     state.SkipWithError("failed to create CPU engine");
49     return;
50   }
51 
52   dnnl_dim_t input_output_shape[1] = { static_cast<int>(elements) };
53 
54   dnnl_memory_desc_t memory_descriptor = { 0 };
55   if (dnnl_memory_desc_init_by_tag(
56     &memory_descriptor, 1, input_output_shape, dnnl_f32, dnnl_x) != dnnl_success)
57   {
58     state.SkipWithError("failed to create input memory descriptor");
59     return;
60   }
61 
62   dnnl_memory_t input_memory = nullptr;
63   if (dnnl_memory_create(
64     &input_memory, &memory_descriptor, engine, x.data()) != dnnl_success)
65   {
66     state.SkipWithError("failed to create input memory");
67     return;
68   }
69 
70   dnnl_memory_t output_memory = nullptr;
71   if (dnnl_memory_create(
72     &output_memory, &memory_descriptor, engine, y.data()) != dnnl_success)
73   {
74     state.SkipWithError("failed to create output memory");
75     return;
76   }
77 
78   dnnl_softmax_desc_t softmax_forward_descriptor = {};
79   if (dnnl_softmax_forward_desc_init(
80     &softmax_forward_descriptor, dnnl_forward_inference,
81     &memory_descriptor, 0) != dnnl_success)
82   {
83     state.SkipWithError("failed to create SoftMax forward descriptor");
84     return;
85   }
86 
87   dnnl_primitive_desc_t softmax_primitive_descriptor = nullptr;
88   if (dnnl_primitive_desc_create(
89     &softmax_primitive_descriptor, &softmax_forward_descriptor,
90     nullptr /* primitive attributes */, engine, nullptr /* hint */) != dnnl_success)
91   {
92     state.SkipWithError("failed to create SoftMax primitive descriptor");
93     return;
94   }
95 
96   dnnl_primitive_t softmax_primitive = nullptr;
97   if (dnnl_primitive_create(
98     &softmax_primitive, softmax_primitive_descriptor) != dnnl_success)
99   {
100     state.SkipWithError("failed to create SoftMax primitive");
101     return;
102   }
103 
104   dnnl_exec_arg_t softmax_args[2] = {
105     {DNNL_ARG_SRC, input_memory},
106     {DNNL_ARG_DST, output_memory},
107   };
108 
109   dnnl_stream_t stream = nullptr;
110   if (dnnl_stream_create(&stream, engine, dnnl_stream_default_flags) != dnnl_success) {
111     state.SkipWithError("failed to create stream");
112     return;
113   }
114 
115   size_t buffer_index = 0;
116   for (auto _ : state) {
117     benchmark::utils::PrefetchToL1(x.data(), x.size() * sizeof(float));
118     if (++buffer_index == num_buffers) {
119       buffer_index = 0;
120     }
121 
122     const auto start = std::chrono::high_resolution_clock::now();
123     if (dnnl_primitive_execute(
124       softmax_primitive, stream, 2, softmax_args) != dnnl_success)
125     {
126       state.SkipWithError("failed to execute SoftMax");
127       return;
128     }
129     const auto end = std::chrono::high_resolution_clock::now();
130 
131     const auto elapsed_seconds =
132       std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
133     state.SetIterationTime(elapsed_seconds.count());
134   }
135 
136   if (dnnl_stream_destroy(stream) != dnnl_success) {
137     state.SkipWithError("failed to destroy stream");
138     return;
139   }
140 
141   if (dnnl_primitive_desc_destroy(softmax_primitive_descriptor) != dnnl_success) {
142     state.SkipWithError("failed to destroy SoftMax primitive descriptor");
143     return;
144   }
145 
146   if (dnnl_primitive_destroy(softmax_primitive) != dnnl_success) {
147     state.SkipWithError("failed to destroy SoftMax primitive");
148     return;
149   }
150 
151   if (dnnl_memory_destroy(input_memory) != dnnl_success) {
152     state.SkipWithError("failed to destroy input memory");
153     return;
154   }
155 
156   if (dnnl_memory_destroy(output_memory) != dnnl_success) {
157     state.SkipWithError("failed to destroy output memory");
158     return;
159   }
160 
161   if (dnnl_engine_destroy(engine) != dnnl_success) {
162     state.SkipWithError("failed to destroy engine");
163     return;
164   }
165 
166   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
167   if (cpu_frequency != 0) {
168     state.counters["cpufreq"] = cpu_frequency;
169   }
170 
171   const size_t elements_per_iteration = elements;
172   state.counters["elements"] =
173     benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
174 
175   const size_t bytes_per_iteration = 2 * elements * sizeof(float);
176   state.counters["bytes"] =
177     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
178 }
179 #endif  // BENCHMARK_INTEL_DNNL
180 
ThreePassSoftMaxWithRecomputing(benchmark::State & state,xnn_f32_rmax_ukernel_function rmax,xnn_f32_raddexpminusmax_ukernel_function raddexpminusmax,xnn_f32_vscaleexpminusmax_ukernel_function vscaleexpminusmax,benchmark::utils::IsaCheckFunction isa_check=nullptr)181 static void ThreePassSoftMaxWithRecomputing(
182   benchmark::State& state,
183   xnn_f32_rmax_ukernel_function rmax,
184   xnn_f32_raddexpminusmax_ukernel_function raddexpminusmax,
185   xnn_f32_vscaleexpminusmax_ukernel_function vscaleexpminusmax,
186   benchmark::utils::IsaCheckFunction isa_check = nullptr)
187 {
188   if (isa_check && !isa_check(state)) {
189     return;
190   }
191 
192   const size_t elements = state.range(0);
193   const size_t cache_line_size_max = 128;
194   const size_t packed_elements = benchmark::utils::RoundUp(elements, cache_line_size_max / sizeof(float));
195 
196   std::random_device random_device;
197   auto rng = std::mt19937(random_device());
198   auto f32rng = std::bind(std::uniform_real_distribution<float>(-1000.0f, 1000.0f), std::ref(rng));
199 
200   const size_t num_buffers = 1 +
201     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), packed_elements * sizeof(float));
202   std::vector<float> x(elements);
203   std::vector<float> y(packed_elements * num_buffers);
204 
205   std::generate(x.begin(), x.end(), std::ref(f32rng));
206 
207   benchmark::utils::DisableDenormals();
208 
209   size_t buffer_index = 0;
210   for (auto _ : state) {
211     benchmark::utils::PrefetchToL1(x.data(), x.size() * sizeof(float));
212     if (++buffer_index == num_buffers) {
213       buffer_index = 0;
214     }
215 
216     const auto start = std::chrono::high_resolution_clock::now();
217     float x_max = nanf("");
218     rmax(elements * sizeof(float), x.data(), &x_max);
219     float y_sum = nanf("");
220     raddexpminusmax(elements * sizeof(float), x.data(), &y_sum, x_max);
221     vscaleexpminusmax(elements * sizeof(float), x.data(), y.data() + packed_elements * buffer_index, x_max, 1.0f / y_sum);
222     const auto end = std::chrono::high_resolution_clock::now();
223 
224     const auto elapsed_seconds =
225       std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
226     state.SetIterationTime(elapsed_seconds.count());
227   }
228 
229   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
230   if (cpu_frequency != 0) {
231     state.counters["cpufreq"] = cpu_frequency;
232   }
233 
234   const size_t elements_per_iteration = elements;
235   state.counters["elements"] =
236     benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
237 
238   const size_t bytes_per_iteration = 2 * elements * sizeof(float);
239   state.counters["bytes"] =
240     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
241 }
242 
ThreePassSoftMaxWithReloading(benchmark::State & state,xnn_f32_rmax_ukernel_function rmax,xnn_f32_raddstoreexpminusmax_ukernel_function raddstoreexpminusmax,xnn_init_f32_expminus_params_fn init_expminus_params,xnn_f32_vbinary_minmax_ukernel_function vmulc,xnn_init_f32_minmax_params_fn init_minmax_params,benchmark::utils::IsaCheckFunction isa_check=nullptr)243 static void ThreePassSoftMaxWithReloading(
244   benchmark::State& state,
245   xnn_f32_rmax_ukernel_function rmax,
246   xnn_f32_raddstoreexpminusmax_ukernel_function raddstoreexpminusmax,
247   xnn_init_f32_expminus_params_fn init_expminus_params,
248   xnn_f32_vbinary_minmax_ukernel_function vmulc,
249   xnn_init_f32_minmax_params_fn init_minmax_params,
250   benchmark::utils::IsaCheckFunction isa_check = nullptr)
251 {
252   if (isa_check && !isa_check(state)) {
253     return;
254   }
255 
256   const size_t elements = state.range(0);
257   const size_t cache_line_size_max = 128;
258   const size_t packed_elements = benchmark::utils::RoundUp(elements, cache_line_size_max / sizeof(float));
259 
260   std::random_device random_device;
261   auto rng = std::mt19937(random_device());
262   auto f32rng = std::bind(std::uniform_real_distribution<float>(-1000.0f, 1000.0f), std::ref(rng));
263 
264   const size_t num_buffers = 1 +
265     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), packed_elements * sizeof(float));
266   std::vector<float> x(elements);
267   std::vector<float> y(packed_elements * num_buffers);
268 
269   std::generate(x.begin(), x.end(), std::ref(f32rng));
270 
271   benchmark::utils::DisableDenormals();
272 
273   xnn_f32_expminus_params expminus_params;
274   xnn_f32_minmax_params minmax_params;
275   init_expminus_params(&expminus_params);
276   init_minmax_params(&minmax_params, -INFINITY, INFINITY);
277 
278   size_t buffer_index = 0;
279   for (auto _ : state) {
280     benchmark::utils::PrefetchToL1(x.data(), x.size() * sizeof(float));
281     if (++buffer_index == num_buffers) {
282       buffer_index = 0;
283     }
284 
285     const auto start = std::chrono::high_resolution_clock::now();
286     float x_max = nanf("");
287     rmax(elements * sizeof(float), x.data(), &x_max);
288     float y_sum = nanf("");
289     raddstoreexpminusmax(elements * sizeof(float), x.data(), &x_max, y.data() + packed_elements * buffer_index, &y_sum, &expminus_params);
290     const float inv_y_sum = 1.0f / y_sum;
291     vmulc(elements * sizeof(float), y.data() + packed_elements * buffer_index, &inv_y_sum, y.data() + packed_elements * buffer_index, &minmax_params);
292     const auto end = std::chrono::high_resolution_clock::now();
293 
294     const auto elapsed_seconds =
295       std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
296     state.SetIterationTime(elapsed_seconds.count());
297   }
298 
299   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
300   if (cpu_frequency != 0) {
301     state.counters["cpufreq"] = cpu_frequency;
302   }
303 
304   const size_t elements_per_iteration = elements;
305   state.counters["elements"] =
306     benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
307 
308   const size_t bytes_per_iteration = 2 * elements * sizeof(float);
309   state.counters["bytes"] =
310     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
311 }
312 
TwoPassSoftMax(benchmark::State & state,xnn_f32_raddextexp_ukernel_function raddextexp,xnn_f32_vscaleextexp_ukernel_function vscaleextexp,benchmark::utils::IsaCheckFunction isa_check=nullptr)313 static void TwoPassSoftMax(
314   benchmark::State& state,
315   xnn_f32_raddextexp_ukernel_function raddextexp,
316   xnn_f32_vscaleextexp_ukernel_function vscaleextexp,
317   benchmark::utils::IsaCheckFunction isa_check = nullptr)
318 {
319   if (isa_check && !isa_check(state)) {
320     return;
321   }
322 
323   const size_t elements = state.range(0);
324   const size_t cache_line_size_max = 128;
325   const size_t packed_elements = benchmark::utils::RoundUp(elements, cache_line_size_max / sizeof(float));
326 
327   std::random_device random_device;
328   auto rng = std::mt19937(random_device());
329   auto f32rng = std::bind(std::uniform_real_distribution<float>(-1000.0f, 1000.0f), std::ref(rng));
330 
331   const size_t num_buffers = 1 +
332     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), packed_elements * sizeof(float));
333   std::vector<float> x(elements);
334   std::vector<float> y(packed_elements * num_buffers);
335 
336   std::generate(x.begin(), x.end(), std::ref(f32rng));
337 
338   benchmark::utils::DisableDenormals();
339 
340   size_t buffer_index = 0;
341   for (auto _ : state) {
342     benchmark::utils::PrefetchToL1(x.data(), x.size() * sizeof(float));
343     if (++buffer_index == num_buffers) {
344       buffer_index = 0;
345     }
346 
347     const auto start = std::chrono::high_resolution_clock::now();
348     float scale[2];
349     raddextexp(elements * sizeof(float), x.data(), scale);
350     vscaleextexp(elements * sizeof(float), x.data(), y.data() + packed_elements * buffer_index, 1.0f / scale[0], -scale[1]);
351     const auto end = std::chrono::high_resolution_clock::now();
352 
353     const auto elapsed_seconds =
354       std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
355     state.SetIterationTime(elapsed_seconds.count());
356   }
357 
358   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
359   if (cpu_frequency != 0) {
360     state.counters["cpufreq"] = cpu_frequency;
361   }
362 
363   const size_t elements_per_iteration = elements;
364   state.counters["elements"] =
365     benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
366 
367   const size_t bytes_per_iteration = 2 * elements * sizeof(float);
368   state.counters["bytes"] =
369     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
370 }
371 
CharacteristicArguments(benchmark::internal::Benchmark * b)372 static void CharacteristicArguments(benchmark::internal::Benchmark* b) {
373   for (int32_t n = 1000; n <= 100000000; n *= 10) {
374     b->Arg(n);
375     b->Arg(3 * n);
376   }
377 }
378 
379 #ifdef BENCHMARK_INTEL_DNNL
380   BENCHMARK(DNNLSoftArgMax)->Apply(CharacteristicArguments)->UseManualTime();
381 #endif
382 
383 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
384   BENCHMARK_CAPTURE(TwoPassSoftMax, avx2_p5,
385     xnn_f32_raddextexp_ukernel__avx2_p5_x96,
386     xnn_f32_vscaleextexp_ukernel__avx2_p5_x40,
387     benchmark::utils::CheckAVX2)->Apply(CharacteristicArguments)->UseManualTime();
388   BENCHMARK_CAPTURE(ThreePassSoftMaxWithRecomputing, avx2_p5,
389     xnn_f32_rmax_ukernel__avx,
390     xnn_f32_raddexpminusmax_ukernel__avx2_p5_x96,
391     xnn_f32_vscaleexpminusmax_ukernel__avx2_p5_x24,
392     benchmark::utils::CheckAVX2)->Apply(CharacteristicArguments)->UseManualTime();
393   BENCHMARK_CAPTURE(ThreePassSoftMaxWithReloading, avx2_p5,
394     xnn_f32_rmax_ukernel__avx,
395     xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_x64_acc2,
396     xnn_init_f32_expminus_avx2_rr1_p5_params,
397     xnn_f32_vmulc_minmax_ukernel__avx_x16,
398     xnn_init_f32_minmax_avx_params,
399     benchmark::utils::CheckAVX2)->Apply(CharacteristicArguments)->UseManualTime();
400 
401   BENCHMARK_CAPTURE(TwoPassSoftMax, avx512f_p5_scalef,
402     xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144_acc3,
403     xnn_f32_vscaleextexp_ukernel__avx512f_p5_scalef_x16,
404     benchmark::utils::CheckAVX512F)->Apply(CharacteristicArguments)->UseManualTime();
405   BENCHMARK_CAPTURE(ThreePassSoftMaxWithRecomputing, avx512f_p5_scalef,
406     xnn_f32_rmax_ukernel__avx512f,
407     xnn_f32_raddexpminusmax_ukernel__avx512f_p5_scalef_x128_acc4,
408     xnn_f32_vscaleexpminusmax_ukernel__avx512f_p5_scalef_x16,
409     benchmark::utils::CheckAVX512F)->Apply(CharacteristicArguments)->UseManualTime();
410   BENCHMARK_CAPTURE(ThreePassSoftMaxWithReloading, avx512f_p5_scalef,
411     xnn_f32_rmax_ukernel__avx512f,
412     xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_x128_acc2,
413     xnn_init_f32_expminus_avx512_rr1_p5_params,
414     xnn_f32_vmulc_minmax_ukernel__avx512f_x32,
415     xnn_init_f32_minmax_scalar_params,
416     benchmark::utils::CheckAVX512F)->Apply(CharacteristicArguments)->UseManualTime();
417 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
418 
419 #ifndef XNNPACK_BENCHMARK_NO_MAIN
420 BENCHMARK_MAIN();
421 #endif
422