1 #include <algorithm>
2 #include <cfloat>
3 #include <chrono>
4 #include <cmath>
5 #include <functional>
6 #include <random>
7 #include <vector>
8
9 #include "bench/utils.h"
10 #include <xnnpack/common.h>
11 #include <xnnpack/params.h>
12 #include <xnnpack/params-init.h>
13 #include <xnnpack/raddexpminusmax.h>
14 #include <xnnpack/raddextexp.h>
15 #include <xnnpack/raddstoreexpminusmax.h>
16 #include <xnnpack/rmax.h>
17 #include <xnnpack/vbinary.h>
18 #include <xnnpack/vscaleexpminusmax.h>
19 #include <xnnpack/vscaleextexp.h>
20
21 #include <benchmark/benchmark.h>
22 #ifdef BENCHMARK_INTEL_DNNL
23 #include <dnnl.h>
24 #endif // BENCHMARK_INTEL_DNNL
25
26
27 #ifdef BENCHMARK_INTEL_DNNL
DNNLSoftArgMax(benchmark::State & state)28 static void DNNLSoftArgMax(
29 benchmark::State& state)
30 {
31 const size_t elements = state.range(0);
32 const size_t cache_line_size_max = 128;
33 const size_t packed_elements = benchmark::utils::RoundUp(elements, cache_line_size_max / sizeof(float));
34
35 std::random_device random_device;
36 auto rng = std::mt19937(random_device());
37 auto f32rng = std::bind(std::uniform_real_distribution<float>(-1000.0f, 1000.0f), std::ref(rng));
38
39 const size_t num_buffers = 1 +
40 benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), packed_elements * sizeof(float));
41 std::vector<float> x(elements);
42 std::vector<float> y(packed_elements * num_buffers);
43
44 std::generate(x.begin(), x.end(), std::ref(f32rng));
45
46 dnnl_engine_t engine;
47 if (dnnl_engine_create(&engine, dnnl_cpu, 0) != dnnl_success) {
48 state.SkipWithError("failed to create CPU engine");
49 return;
50 }
51
52 dnnl_dim_t input_output_shape[1] = { static_cast<int>(elements) };
53
54 dnnl_memory_desc_t memory_descriptor = { 0 };
55 if (dnnl_memory_desc_init_by_tag(
56 &memory_descriptor, 1, input_output_shape, dnnl_f32, dnnl_x) != dnnl_success)
57 {
58 state.SkipWithError("failed to create input memory descriptor");
59 return;
60 }
61
62 dnnl_memory_t input_memory = nullptr;
63 if (dnnl_memory_create(
64 &input_memory, &memory_descriptor, engine, x.data()) != dnnl_success)
65 {
66 state.SkipWithError("failed to create input memory");
67 return;
68 }
69
70 dnnl_memory_t output_memory = nullptr;
71 if (dnnl_memory_create(
72 &output_memory, &memory_descriptor, engine, y.data()) != dnnl_success)
73 {
74 state.SkipWithError("failed to create output memory");
75 return;
76 }
77
78 dnnl_softmax_desc_t softmax_forward_descriptor = {};
79 if (dnnl_softmax_forward_desc_init(
80 &softmax_forward_descriptor, dnnl_forward_inference,
81 &memory_descriptor, 0) != dnnl_success)
82 {
83 state.SkipWithError("failed to create SoftMax forward descriptor");
84 return;
85 }
86
87 dnnl_primitive_desc_t softmax_primitive_descriptor = nullptr;
88 if (dnnl_primitive_desc_create(
89 &softmax_primitive_descriptor, &softmax_forward_descriptor,
90 nullptr /* primitive attributes */, engine, nullptr /* hint */) != dnnl_success)
91 {
92 state.SkipWithError("failed to create SoftMax primitive descriptor");
93 return;
94 }
95
96 dnnl_primitive_t softmax_primitive = nullptr;
97 if (dnnl_primitive_create(
98 &softmax_primitive, softmax_primitive_descriptor) != dnnl_success)
99 {
100 state.SkipWithError("failed to create SoftMax primitive");
101 return;
102 }
103
104 dnnl_exec_arg_t softmax_args[2] = {
105 {DNNL_ARG_SRC, input_memory},
106 {DNNL_ARG_DST, output_memory},
107 };
108
109 dnnl_stream_t stream = nullptr;
110 if (dnnl_stream_create(&stream, engine, dnnl_stream_default_flags) != dnnl_success) {
111 state.SkipWithError("failed to create stream");
112 return;
113 }
114
115 size_t buffer_index = 0;
116 for (auto _ : state) {
117 benchmark::utils::PrefetchToL1(x.data(), x.size() * sizeof(float));
118 if (++buffer_index == num_buffers) {
119 buffer_index = 0;
120 }
121
122 const auto start = std::chrono::high_resolution_clock::now();
123 if (dnnl_primitive_execute(
124 softmax_primitive, stream, 2, softmax_args) != dnnl_success)
125 {
126 state.SkipWithError("failed to execute SoftMax");
127 return;
128 }
129 const auto end = std::chrono::high_resolution_clock::now();
130
131 const auto elapsed_seconds =
132 std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
133 state.SetIterationTime(elapsed_seconds.count());
134 }
135
136 if (dnnl_stream_destroy(stream) != dnnl_success) {
137 state.SkipWithError("failed to destroy stream");
138 return;
139 }
140
141 if (dnnl_primitive_desc_destroy(softmax_primitive_descriptor) != dnnl_success) {
142 state.SkipWithError("failed to destroy SoftMax primitive descriptor");
143 return;
144 }
145
146 if (dnnl_primitive_destroy(softmax_primitive) != dnnl_success) {
147 state.SkipWithError("failed to destroy SoftMax primitive");
148 return;
149 }
150
151 if (dnnl_memory_destroy(input_memory) != dnnl_success) {
152 state.SkipWithError("failed to destroy input memory");
153 return;
154 }
155
156 if (dnnl_memory_destroy(output_memory) != dnnl_success) {
157 state.SkipWithError("failed to destroy output memory");
158 return;
159 }
160
161 if (dnnl_engine_destroy(engine) != dnnl_success) {
162 state.SkipWithError("failed to destroy engine");
163 return;
164 }
165
166 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
167 if (cpu_frequency != 0) {
168 state.counters["cpufreq"] = cpu_frequency;
169 }
170
171 const size_t elements_per_iteration = elements;
172 state.counters["elements"] =
173 benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
174
175 const size_t bytes_per_iteration = 2 * elements * sizeof(float);
176 state.counters["bytes"] =
177 benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
178 }
179 #endif // BENCHMARK_INTEL_DNNL
180
ThreePassSoftMaxWithRecomputing(benchmark::State & state,xnn_f32_rmax_ukernel_function rmax,xnn_f32_raddexpminusmax_ukernel_function raddexpminusmax,xnn_f32_vscaleexpminusmax_ukernel_function vscaleexpminusmax,benchmark::utils::IsaCheckFunction isa_check=nullptr)181 static void ThreePassSoftMaxWithRecomputing(
182 benchmark::State& state,
183 xnn_f32_rmax_ukernel_function rmax,
184 xnn_f32_raddexpminusmax_ukernel_function raddexpminusmax,
185 xnn_f32_vscaleexpminusmax_ukernel_function vscaleexpminusmax,
186 benchmark::utils::IsaCheckFunction isa_check = nullptr)
187 {
188 if (isa_check && !isa_check(state)) {
189 return;
190 }
191
192 const size_t elements = state.range(0);
193 const size_t cache_line_size_max = 128;
194 const size_t packed_elements = benchmark::utils::RoundUp(elements, cache_line_size_max / sizeof(float));
195
196 std::random_device random_device;
197 auto rng = std::mt19937(random_device());
198 auto f32rng = std::bind(std::uniform_real_distribution<float>(-1000.0f, 1000.0f), std::ref(rng));
199
200 const size_t num_buffers = 1 +
201 benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), packed_elements * sizeof(float));
202 std::vector<float> x(elements);
203 std::vector<float> y(packed_elements * num_buffers);
204
205 std::generate(x.begin(), x.end(), std::ref(f32rng));
206
207 benchmark::utils::DisableDenormals();
208
209 size_t buffer_index = 0;
210 for (auto _ : state) {
211 benchmark::utils::PrefetchToL1(x.data(), x.size() * sizeof(float));
212 if (++buffer_index == num_buffers) {
213 buffer_index = 0;
214 }
215
216 const auto start = std::chrono::high_resolution_clock::now();
217 float x_max = nanf("");
218 rmax(elements * sizeof(float), x.data(), &x_max);
219 float y_sum = nanf("");
220 raddexpminusmax(elements * sizeof(float), x.data(), &y_sum, x_max);
221 vscaleexpminusmax(elements * sizeof(float), x.data(), y.data() + packed_elements * buffer_index, x_max, 1.0f / y_sum);
222 const auto end = std::chrono::high_resolution_clock::now();
223
224 const auto elapsed_seconds =
225 std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
226 state.SetIterationTime(elapsed_seconds.count());
227 }
228
229 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
230 if (cpu_frequency != 0) {
231 state.counters["cpufreq"] = cpu_frequency;
232 }
233
234 const size_t elements_per_iteration = elements;
235 state.counters["elements"] =
236 benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
237
238 const size_t bytes_per_iteration = 2 * elements * sizeof(float);
239 state.counters["bytes"] =
240 benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
241 }
242
ThreePassSoftMaxWithReloading(benchmark::State & state,xnn_f32_rmax_ukernel_function rmax,xnn_f32_raddstoreexpminusmax_ukernel_function raddstoreexpminusmax,xnn_init_f32_expminus_params_fn init_expminus_params,xnn_f32_vbinary_minmax_ukernel_function vmulc,xnn_init_f32_minmax_params_fn init_minmax_params,benchmark::utils::IsaCheckFunction isa_check=nullptr)243 static void ThreePassSoftMaxWithReloading(
244 benchmark::State& state,
245 xnn_f32_rmax_ukernel_function rmax,
246 xnn_f32_raddstoreexpminusmax_ukernel_function raddstoreexpminusmax,
247 xnn_init_f32_expminus_params_fn init_expminus_params,
248 xnn_f32_vbinary_minmax_ukernel_function vmulc,
249 xnn_init_f32_minmax_params_fn init_minmax_params,
250 benchmark::utils::IsaCheckFunction isa_check = nullptr)
251 {
252 if (isa_check && !isa_check(state)) {
253 return;
254 }
255
256 const size_t elements = state.range(0);
257 const size_t cache_line_size_max = 128;
258 const size_t packed_elements = benchmark::utils::RoundUp(elements, cache_line_size_max / sizeof(float));
259
260 std::random_device random_device;
261 auto rng = std::mt19937(random_device());
262 auto f32rng = std::bind(std::uniform_real_distribution<float>(-1000.0f, 1000.0f), std::ref(rng));
263
264 const size_t num_buffers = 1 +
265 benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), packed_elements * sizeof(float));
266 std::vector<float> x(elements);
267 std::vector<float> y(packed_elements * num_buffers);
268
269 std::generate(x.begin(), x.end(), std::ref(f32rng));
270
271 benchmark::utils::DisableDenormals();
272
273 xnn_f32_expminus_params expminus_params;
274 xnn_f32_minmax_params minmax_params;
275 init_expminus_params(&expminus_params);
276 init_minmax_params(&minmax_params, -INFINITY, INFINITY);
277
278 size_t buffer_index = 0;
279 for (auto _ : state) {
280 benchmark::utils::PrefetchToL1(x.data(), x.size() * sizeof(float));
281 if (++buffer_index == num_buffers) {
282 buffer_index = 0;
283 }
284
285 const auto start = std::chrono::high_resolution_clock::now();
286 float x_max = nanf("");
287 rmax(elements * sizeof(float), x.data(), &x_max);
288 float y_sum = nanf("");
289 raddstoreexpminusmax(elements * sizeof(float), x.data(), &x_max, y.data() + packed_elements * buffer_index, &y_sum, &expminus_params);
290 const float inv_y_sum = 1.0f / y_sum;
291 vmulc(elements * sizeof(float), y.data() + packed_elements * buffer_index, &inv_y_sum, y.data() + packed_elements * buffer_index, &minmax_params);
292 const auto end = std::chrono::high_resolution_clock::now();
293
294 const auto elapsed_seconds =
295 std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
296 state.SetIterationTime(elapsed_seconds.count());
297 }
298
299 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
300 if (cpu_frequency != 0) {
301 state.counters["cpufreq"] = cpu_frequency;
302 }
303
304 const size_t elements_per_iteration = elements;
305 state.counters["elements"] =
306 benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
307
308 const size_t bytes_per_iteration = 2 * elements * sizeof(float);
309 state.counters["bytes"] =
310 benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
311 }
312
TwoPassSoftMax(benchmark::State & state,xnn_f32_raddextexp_ukernel_function raddextexp,xnn_f32_vscaleextexp_ukernel_function vscaleextexp,benchmark::utils::IsaCheckFunction isa_check=nullptr)313 static void TwoPassSoftMax(
314 benchmark::State& state,
315 xnn_f32_raddextexp_ukernel_function raddextexp,
316 xnn_f32_vscaleextexp_ukernel_function vscaleextexp,
317 benchmark::utils::IsaCheckFunction isa_check = nullptr)
318 {
319 if (isa_check && !isa_check(state)) {
320 return;
321 }
322
323 const size_t elements = state.range(0);
324 const size_t cache_line_size_max = 128;
325 const size_t packed_elements = benchmark::utils::RoundUp(elements, cache_line_size_max / sizeof(float));
326
327 std::random_device random_device;
328 auto rng = std::mt19937(random_device());
329 auto f32rng = std::bind(std::uniform_real_distribution<float>(-1000.0f, 1000.0f), std::ref(rng));
330
331 const size_t num_buffers = 1 +
332 benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), packed_elements * sizeof(float));
333 std::vector<float> x(elements);
334 std::vector<float> y(packed_elements * num_buffers);
335
336 std::generate(x.begin(), x.end(), std::ref(f32rng));
337
338 benchmark::utils::DisableDenormals();
339
340 size_t buffer_index = 0;
341 for (auto _ : state) {
342 benchmark::utils::PrefetchToL1(x.data(), x.size() * sizeof(float));
343 if (++buffer_index == num_buffers) {
344 buffer_index = 0;
345 }
346
347 const auto start = std::chrono::high_resolution_clock::now();
348 float scale[2];
349 raddextexp(elements * sizeof(float), x.data(), scale);
350 vscaleextexp(elements * sizeof(float), x.data(), y.data() + packed_elements * buffer_index, 1.0f / scale[0], -scale[1]);
351 const auto end = std::chrono::high_resolution_clock::now();
352
353 const auto elapsed_seconds =
354 std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
355 state.SetIterationTime(elapsed_seconds.count());
356 }
357
358 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
359 if (cpu_frequency != 0) {
360 state.counters["cpufreq"] = cpu_frequency;
361 }
362
363 const size_t elements_per_iteration = elements;
364 state.counters["elements"] =
365 benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
366
367 const size_t bytes_per_iteration = 2 * elements * sizeof(float);
368 state.counters["bytes"] =
369 benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
370 }
371
CharacteristicArguments(benchmark::internal::Benchmark * b)372 static void CharacteristicArguments(benchmark::internal::Benchmark* b) {
373 for (int32_t n = 1000; n <= 100000000; n *= 10) {
374 b->Arg(n);
375 b->Arg(3 * n);
376 }
377 }
378
// Benchmark registrations. Every benchmark takes its sizes from
// CharacteristicArguments and uses manual timing, because the benchmark
// bodies report their own measurements through state.SetIterationTime().

// DNNL baseline; only built when BENCHMARK_INTEL_DNNL is defined.
#ifdef BENCHMARK_INTEL_DNNL
BENCHMARK(DNNLSoftArgMax)->Apply(CharacteristicArguments)->UseManualTime();
#endif

#if XNN_ARCH_X86 || XNN_ARCH_X86_64
// AVX2 variants of the three pipelines. The last argument of each capture is
// passed as isa_check; the benchmark body returns early when it reports false.
BENCHMARK_CAPTURE(TwoPassSoftMax, avx2_p5,
xnn_f32_raddextexp_ukernel__avx2_p5_x96,
xnn_f32_vscaleextexp_ukernel__avx2_p5_x40,
benchmark::utils::CheckAVX2)->Apply(CharacteristicArguments)->UseManualTime();
BENCHMARK_CAPTURE(ThreePassSoftMaxWithRecomputing, avx2_p5,
xnn_f32_rmax_ukernel__avx,
xnn_f32_raddexpminusmax_ukernel__avx2_p5_x96,
xnn_f32_vscaleexpminusmax_ukernel__avx2_p5_x24,
benchmark::utils::CheckAVX2)->Apply(CharacteristicArguments)->UseManualTime();
BENCHMARK_CAPTURE(ThreePassSoftMaxWithReloading, avx2_p5,
xnn_f32_rmax_ukernel__avx,
xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_x64_acc2,
xnn_init_f32_expminus_avx2_rr1_p5_params,
xnn_f32_vmulc_minmax_ukernel__avx_x16,
xnn_init_f32_minmax_avx_params,
benchmark::utils::CheckAVX2)->Apply(CharacteristicArguments)->UseManualTime();

// AVX512F variants of the same three pipelines, with CheckAVX512F as isa_check.
BENCHMARK_CAPTURE(TwoPassSoftMax, avx512f_p5_scalef,
xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144_acc3,
xnn_f32_vscaleextexp_ukernel__avx512f_p5_scalef_x16,
benchmark::utils::CheckAVX512F)->Apply(CharacteristicArguments)->UseManualTime();
BENCHMARK_CAPTURE(ThreePassSoftMaxWithRecomputing, avx512f_p5_scalef,
xnn_f32_rmax_ukernel__avx512f,
xnn_f32_raddexpminusmax_ukernel__avx512f_p5_scalef_x128_acc4,
xnn_f32_vscaleexpminusmax_ukernel__avx512f_p5_scalef_x16,
benchmark::utils::CheckAVX512F)->Apply(CharacteristicArguments)->UseManualTime();
// NOTE(review): this capture pairs the AVX512F vmulc kernel with
// xnn_init_f32_minmax_scalar_params, whereas the AVX2 capture uses an
// AVX-specific initializer - presumably intentional; confirm against the
// kernel's expected parameter layout.
BENCHMARK_CAPTURE(ThreePassSoftMaxWithReloading, avx512f_p5_scalef,
xnn_f32_rmax_ukernel__avx512f,
xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_x128_acc2,
xnn_init_f32_expminus_avx512_rr1_p5_params,
xnn_f32_vmulc_minmax_ukernel__avx512f_x32,
xnn_init_f32_minmax_scalar_params,
benchmark::utils::CheckAVX512F)->Apply(CharacteristicArguments)->UseManualTime();
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64

// Defining XNNPACK_BENCHMARK_NO_MAIN omits the BENCHMARK_MAIN() expansion.
#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif
422