// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <algorithm>
#include <cfloat>
#include <chrono>
#include <cmath>
#include <functional>
#include <limits>
#include <mutex>
#include <random>
#include <vector>

#include <benchmark/benchmark.h>
#ifdef BENCHMARK_RUY
#include "ruy/ruy.h"
#endif // BENCHMARK_RUY
#include "bench/gemm.h"
#include "bench/utils.h"
#include <xnnpack/AlignedAllocator.h>
#include <xnnpack/allocator.h>
#include <xnnpack/common.h>
#include <xnnpack/gemm.h>
#include <xnnpack/pack.h>
#include <xnnpack/packx.h>
#include <xnnpack/params-init.h>
#include <xnnpack/params.h>
#include <xnnpack/ppmm.h>


static void GEMMBenchmark(benchmark::State& state,
  xnn_f32_gemm_minmax_ukernel_function gemm,
  size_t mr, size_t nr, size_t kr, size_t sr,
  xnn_init_f32_minmax_params_fn init_params,
  benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
  if (isa_check && !isa_check(state)) {
    return;
  }

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);
  const size_t kc_stride = benchmark::utils::RoundUp(kc, kr * sr);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));

  std::vector<float> a(mc * kc);
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(nc * kc);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(nc);
  std::generate(b.begin(), b.end(), std::ref(f32rng));

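  // One packed-weight buffer holds nc_stride x kc_stride weights plus nc_stride bias values.
  // num_buffers provides enough rotating copies of W and C that their combined footprint
  // exceeds the last-level cache, so every timed iteration touches cold data.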
  const size_t w_elements = nc_stride * kc_stride + nc_stride;
  const size_t c_elements = mc * nc;
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(float) * (w_elements + c_elements));

  std::vector<float, AlignedAllocator<float, 64>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), 0.0f);
  xnn_pack_f32_gemm_goi_w(1 /* groups */, nc, kc, nr, kr, sr, k.data(), b.data(), w.data(), 0, nullptr);
  std::vector<float> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), std::nanf(""));

  xnn_f32_minmax_params params;
  init_params(&params,
    -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if it fits; otherwise L2, L3, etc.)
    // - W is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

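    // Tile over the rows of A/C: each microkernel call handles at most mr rows.
    // The W offset nc_stride * (kc_stride + 1) equals w_elements, i.e. one full
    // packed-weight buffer per buffer_index.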
    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = std::min(mc - m, mr);
      gemm(
        mb, nc, kc * sizeof(float),
        a.data() + m * kc, kc * sizeof(float),
        w.data() + buffer_index * nc_stride * (kc_stride + 1),
        c.data() + (buffer_index * mc + m) * nc, nc * sizeof(float), nr * sizeof(float),
        &params);
    }
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

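  // Count 2 FLOPs (one multiply, one add) for each of the mc * nc * kc multiply-accumulates.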
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}

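// PPMM benchmarks measure pre-packed matrix multiplication: a packx microkernel first
// transposes panels of A into the layout the PPMM kernel consumes, and the multiply then
// reads A from that dense panel. This single-pass (1P) variant packs each mr-row panel
// immediately before its multiply; the two-pass (2P) variant below packs all of A up front.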
static void PPMM1PBenchmark(benchmark::State& state,
  xnn_f32_ppmm_minmax_ukernel_function ppmm,
  xnn_x32_packx_ukernel_function packx,
  size_t mr, size_t nr,
  xnn_init_f32_minmax_params_fn init_params,
  benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
  if (isa_check && !isa_check(state)) {
    return;
  }

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));

  std::vector<float> a(mc * kc);
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(nc * kc);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(nc);
  std::generate(b.begin(), b.end(), std::ref(f32rng));

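  // Panel buffer for packed A: packx() transposes up to mr rows at a time into the layout
  // the PPMM microkernel expects. uint32_t serves as a raw 32-bit container for the float
  // data (the x32 pack kernels operate on 32-bit words regardless of type).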
  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> t(mr * kc);

  const size_t w_elements = nc_stride * kc + nc_stride;
  const size_t c_elements = mc * nc;
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(float) * (w_elements + c_elements));

  std::vector<float, AlignedAllocator<float, 64>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), 0.0f);
  xnn_pack_f32_gemm_goi_w(1 /* groups */, nc, kc, nr, 1 /* kr */, 1 /* sr */, k.data(), b.data(), w.data(), 0, nullptr);
  std::vector<float> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), std::nanf(""));

  xnn_f32_minmax_params params;
  init_params(&params,
    -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if it fits; otherwise L2, L3, etc.)
    // - W is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = std::min(mc - m, mr);
      packx(mb, kc, reinterpret_cast<const uint32_t*>(a.data() + m * kc), kc, t.data());
      ppmm(
        mb, nc, kc * sizeof(float),
        reinterpret_cast<const float*>(t.data()),
        w.data() + nc_stride * buffer_index * (kc + 1),
        c.data() + (mc * buffer_index + m) * nc, nc * sizeof(float), nr * sizeof(float),
        &params);
    }
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}

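// Two-pass variant: the first loop packs all of A into t (one mr-row panel per m offset),
// then the second loop runs the PPMM microkernel over the pre-packed panels.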
static void PPMM2PBenchmark(benchmark::State& state,
  xnn_f32_ppmm_minmax_ukernel_function ppmm,
  xnn_x32_packx_ukernel_function packx,
  size_t mr, size_t nr,
  xnn_init_f32_minmax_params_fn init_params,
  benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
  if (isa_check && !isa_check(state)) {
    return;
  }

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  const size_t mc_stride = benchmark::utils::RoundUp(mc, mr);
  const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));

  std::vector<float> a(mc * kc);
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(nc * kc);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(nc);
  std::generate(b.begin(), b.end(), std::ref(f32rng));

  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> t(mc_stride * kc);

  const size_t w_elements = nc_stride * kc + nc_stride;
  const size_t c_elements = mc * nc;
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(float) * (w_elements + c_elements));

  std::vector<float, AlignedAllocator<float, 64>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), 0.0f);
  xnn_pack_f32_gemm_goi_w(1 /* groups */, nc, kc, nr, 1 /* kr */, 1 /* sr */, k.data(), b.data(), w.data(), 0, nullptr);
  std::vector<float> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), std::nanf(""));

  xnn_f32_minmax_params params;
  init_params(&params,
    -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if it fits; otherwise L2, L3, etc.)
    // - W is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = std::min(mc - m, mr);
      packx(mb, kc, reinterpret_cast<const uint32_t*>(a.data() + m * kc), kc, t.data() + m * kc);
    }
    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = std::min(mc - m, mr);
      ppmm(
        mb, nc, kc * sizeof(float),
        reinterpret_cast<const float*>(t.data() + m * kc),
        w.data() + nc_stride * buffer_index * (kc + 1),
        c.data() + (mc * buffer_index + m) * nc, nc * sizeof(float), nr * sizeof(float),
        &params);
    }
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}

#ifdef BENCHMARK_RUY
static void RuyBenchmark(benchmark::State& state, uint32_t threads)
{
  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

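  // Per rotated buffer, K (nc*kc), C (nc*mc), and bias (nc) floats are touched:
  // nc * (mc + kc + 1) elements in total.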
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(float) * (nc * (mc + kc + 1)));

  std::vector<float> a(mc * kc);
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(num_buffers * nc * kc);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(num_buffers * nc);
  std::generate(b.begin(), b.end(), std::ref(f32rng));
  std::vector<float> c(num_buffers * nc * mc);
  std::fill(c.begin(), c.end(), std::nanf(""));

  // Note: context must be static to avoid the cost of re-creating it for each benchmark.
  static ruy::Context context;
  context.set_max_num_threads(threads);

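  // Ruy multiplies LHS = K (nc x kc, row-major) by RHS = A viewed as a kc x mc
  // column-major matrix (i.e. A transposed), yielding a column-major nc x mc result.
  // In memory that matches the row-major mc x nc output C = A * K^T computed by the
  // XNNPACK benchmarks above, so the same FLOP count applies.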
  ruy::Matrix<float> ruy_a;
  ruy::MakeSimpleLayout(nc, kc, ruy::Order::kRowMajor, ruy_a.mutable_layout());
  ruy::Matrix<float> ruy_b;
  ruy::MakeSimpleLayout(kc, mc, ruy::Order::kColMajor, ruy_b.mutable_layout());
  ruy_b.set_data(a.data());
  ruy::Matrix<float> ruy_c;
  ruy::MakeSimpleLayout(nc, mc, ruy::Order::kColMajor, ruy_c.mutable_layout());

  ruy::MulParams<float, float> mul_params;

  // ruy::Context uses deferred initialization, which affects perceived GEMM performance. Initialization happens during
  // the first GEMM calls, and per Benoit Jacob it takes up to ~250 milliseconds for performance to stabilize.
  // Thus, on the first benchmark, we compute GEMM for 500 milliseconds (to be safe) without recording performance, and
  // keep the ruy::Context object initialized (by being static) between subsequent benchmarks.
  static std::once_flag warmup;
  std::call_once(warmup, [&](){
    auto start = std::chrono::steady_clock::now();
    do {
      ruy_a.set_data(k.data());
      ruy_c.set_data(c.data());
      mul_params.set_bias(b.data());

      ruy::Mul(ruy_a, ruy_b, mul_params, &context, &ruy_c);
    } while (std::chrono::duration<double>(std::chrono::steady_clock::now() - start).count() < 0.5);
  });

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if it fits; otherwise L2, L3, etc.)
    // - K is not in cache (for any cache level)
    // - B is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    ruy_a.set_data(k.data() + buffer_index * nc * kc);
    ruy_c.set_data(c.data() + buffer_index * mc * nc);
    mul_params.set_bias(b.data() + buffer_index * nc);

    ruy::Mul(ruy_a, ruy_b, mul_params, &context, &ruy_c);
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}

static void ruy_st(benchmark::State& state, const char* net)
{
  RuyBenchmark(state, 1);
}
#endif // BENCHMARK_RUY

#if XNN_PLATFORM_JIT
static void GEMMBenchmark(benchmark::State& state,
  xnn_jit_gemm_code_generator_function generator,
  size_t mr, size_t nr, size_t kr, size_t sr,
  xnn_init_f32_minmax_params_fn init_params,
  benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
  if (isa_check && !isa_check(state)) {
    return;
  }

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);
  const size_t kc_stride = benchmark::utils::RoundUp(kc, kr);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));

  std::vector<float> a(mc * kc);
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(nc * kc);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(nc);
  std::generate(b.begin(), b.end(), std::ref(f32rng));

  const size_t w_elements = nc_stride * kc_stride + nc_stride;
  const size_t c_elements = mc * nc;
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(float) * (w_elements + c_elements));

  std::vector<float, AlignedAllocator<float, 64>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), 0.0f);
  xnn_pack_f32_gemm_goi_w(1 /* groups */, nc, kc, nr, kr, sr, k.data(), b.data(), w.data(), 0, nullptr);
  std::vector<float> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), std::nanf(""));

  xnn_f32_minmax_params params;
  init_params(&params,
    -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());

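  // JIT path: emit the microkernel into an executable code buffer at runtime, then call
  // it through the same ukernel function-pointer type as the prebuilt kernels.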
  xnn_code_buffer code_buffer;
  xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE);
  jit_gemm_params jit_params = {
    .f32_minmax = {
      .min = -std::numeric_limits<float>::infinity(),
      .max = +std::numeric_limits<float>::infinity()
    }
  };
  generator(&code_buffer, nc, kc * sizeof(float), &jit_params);
  xnn_f32_gemm_minmax_ukernel_function gemm = reinterpret_cast<xnn_f32_gemm_minmax_ukernel_function>(code_buffer.code);

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if it fits; otherwise L2, L3, etc.)
    // - W is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = std::min(mc - m, mr);
      gemm(
        mb, nc, kc * sizeof(float),
        a.data() + m * kc, kc * sizeof(float),
        w.data() + buffer_index * nc_stride * (kc_stride + 1),
        c.data() + (buffer_index * mc + m) * nc, nc * sizeof(float), nr * sizeof(float),
        &params);
    }
  }

  xnn_release_code_memory(&code_buffer);

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}
#endif // XNN_PLATFORM_JIT

#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
static void f32_gemm_1x8__aarch64_neonfma_ld64(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_ld64, 1, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_1x12__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x12__aarch64_neonfma_cortex_a53, 1, 12, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_1x8__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53, 1, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_1x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75, 1, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_1x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75, 1, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_4x12__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x12__aarch64_neonfma_cortex_a53, 4, 12, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_4x8__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_4x8__aarch64_neonfma_cortex_a55(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_4x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_4x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_4x8__aarch64_neonfma_ld64(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_ld64, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_4x8__aarch64_neonfma_ld128(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_ld128, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_5x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_cortex_a75, 5, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_5x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75, 5, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_6x8__aarch64_neonfma_ld64(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld64, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_6x8__aarch64_neonfma_ld128(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld128, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_6x8__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_6x8__aarch64_neonfma_cortex_a55(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_6x8__aarch64_neonfma_cortex_a73(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a73, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_6x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_6x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_1x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__neonfma_lane_ld64, 1, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_4x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld64, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_4x8__neonfma_lane_ld128(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_5x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__neonfma_lane_ld64, 5, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_6x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld64, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_6x8__neonfma_lane_ld128(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld128, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}

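// BENCHMARK_GEMM (defined in bench/gemm.h) registers each wrapper as a Google Benchmark
// parameterized over a set of representative M/N/K problem shapes; the `net` argument
// names the network the shapes are drawn from.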
BENCHMARK_GEMM(f32_gemm_1x8__aarch64_neonfma_ld64)
BENCHMARK_GEMM(f32_gemm_1x12__aarch64_neonfma_cortex_a53)
BENCHMARK_GEMM(f32_gemm_1x8__aarch64_neonfma_cortex_a53)
BENCHMARK_GEMM(f32_gemm_1x8__aarch64_neonfma_cortex_a75)
BENCHMARK_GEMM(f32_gemm_1x8__aarch64_neonfma_prfm_cortex_a75)
BENCHMARK_GEMM(f32_gemm_4x12__aarch64_neonfma_cortex_a53)
BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_cortex_a53)
BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_cortex_a55)
BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_cortex_a75)
BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_prfm_cortex_a75)
BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_ld128)
BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_ld64)
BENCHMARK_GEMM(f32_gemm_5x8__aarch64_neonfma_cortex_a75)
BENCHMARK_GEMM(f32_gemm_5x8__aarch64_neonfma_prfm_cortex_a75)
BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_cortex_a53)
BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_cortex_a55)
BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_cortex_a73)
BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_cortex_a75)
BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_prfm_cortex_a75)
BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_ld64)
BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_ld128)
BENCHMARK_GEMM(f32_gemm_1x8__neonfma_lane_ld64)
BENCHMARK_GEMM(f32_gemm_4x8__neonfma_lane_ld64)
BENCHMARK_GEMM(f32_gemm_4x8__neonfma_lane_ld128)
BENCHMARK_GEMM(f32_gemm_5x8__neonfma_lane_ld64)
BENCHMARK_GEMM(f32_gemm_6x8__neonfma_lane_ld64)
BENCHMARK_GEMM(f32_gemm_6x8__neonfma_lane_ld128)
#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY

#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
static void f32_gemm_4x4__aarch32_vfp_ld64(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x4__aarch32_vfp_ld64, 4, 4, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckVFP);
}
static void f32_gemm_4x8__aarch32_neon_ld64(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_ld64, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_gemm_4x8__aarch32_neon_cortex_a7(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a7, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_gemm_4x8__aarch32_neon_cortex_a53(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_gemm_4x8__aarch32_neon_cortex_a55(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_gemm_4x8__aarch32_neon_cortex_a75(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_gemm_4x8__aarch32_neon_prfm_cortex_a75(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a75, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}

BENCHMARK_GEMM(f32_gemm_4x4__aarch32_vfp_ld64)
BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_ld64)
BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_cortex_a7)
BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_cortex_a53)
BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_cortex_a55)
BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_cortex_a75)
BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_prfm_cortex_a75)
#endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY

#if XNN_ARCH_ARM && XNN_PLATFORM_JIT
static void jit_f32_gemm_4x8__aarch32_neon_ld64(benchmark::State& state, const char* net)
{
  GEMMBenchmark(state, xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_ld64, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void jit_f32_gemm_4x8__aarch32_neon_cortex_a7(benchmark::State& state, const char* net)
{
  GEMMBenchmark(state, xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a7, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void jit_f32_gemm_4x8__aarch32_neon_cortex_a53(benchmark::State& state, const char* net)
{
  GEMMBenchmark(state, xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void jit_f32_gemm_4x8__aarch32_neon_cortex_a55(benchmark::State& state, const char* net)
{
  GEMMBenchmark(state, xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void jit_f32_gemm_4x8__aarch32_neon_cortex_a75(benchmark::State& state, const char* net)
{
  GEMMBenchmark(state, xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void jit_f32_gemm_4x8__aarch32_neon_prfm_cortex_a75(benchmark::State& state, const char* net)
{
  GEMMBenchmark(state, xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_prfm_cortex_a75, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}

BENCHMARK_GEMM(jit_f32_gemm_4x8__aarch32_neon_ld64)
BENCHMARK_GEMM(jit_f32_gemm_4x8__aarch32_neon_cortex_a7)
BENCHMARK_GEMM(jit_f32_gemm_4x8__aarch32_neon_cortex_a53)
BENCHMARK_GEMM(jit_f32_gemm_4x8__aarch32_neon_cortex_a55)
BENCHMARK_GEMM(jit_f32_gemm_4x8__aarch32_neon_cortex_a75)
BENCHMARK_GEMM(jit_f32_gemm_4x8__aarch32_neon_prfm_cortex_a75)
#endif // XNN_ARCH_ARM && XNN_PLATFORM_JIT

#if XNN_ARCH_ARM64 && XNN_PLATFORM_JIT
static void jit_f32_gemm_1x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net)
{
  GEMMBenchmark(state, xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75, 1, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void jit_f32_gemm_1x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net)
{
  GEMMBenchmark(state, xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75, 1, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void jit_f32_gemm_6x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net)
{
  GEMMBenchmark(state, xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void jit_f32_gemm_6x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net)
{
  GEMMBenchmark(state, xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
BENCHMARK_GEMM(jit_f32_gemm_1x8__aarch64_neonfma_cortex_a75)
BENCHMARK_GEMM(jit_f32_gemm_1x8__aarch64_neonfma_prfm_cortex_a75)
BENCHMARK_GEMM(jit_f32_gemm_6x8__aarch64_neonfma_cortex_a75)
BENCHMARK_GEMM(jit_f32_gemm_6x8__aarch64_neonfma_prfm_cortex_a75)
#endif // XNN_ARCH_ARM64 && XNN_PLATFORM_JIT

#if XNN_ARCH_ARM || XNN_ARCH_ARM64
static void f32_gemm_1x8__neon_lane_ld64(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64, 1, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_gemm_4x8__neon_lane_ld64(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld64, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_gemm_4x8__neon_lane_ld128(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_gemm_5x8__neon_lane_ld64(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__neon_lane_ld64, 5, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_gemm_6x8__neon_lane_ld64(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld64, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_gemm_6x8__neon_lane_ld128(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_gemm_1x8__neonfma_dup_ld64(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__neonfma_dup_ld64, 1, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
}
static void f32_gemm_4x8__neonfma_dup_ld64(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__neonfma_dup_ld64, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
}
static void f32_gemm_4x8__neonfma_dup_ld128(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__neonfma_dup_ld128, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
}
static void f32_gemm_6x8__neonfma_dup_ld64(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__neonfma_dup_ld64, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
}
static void f32_gemm_6x8__neonfma_dup_ld128(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__neonfma_dup_ld128, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
}
static void f32_gemm_1x8s4__neon(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8s4__neon, 1, 8, 1, 4,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_gemm_1x8s4__neonfma(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma, 1, 8, 1, 4,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
}
static void f32_gemm_4x8s4__neon(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8s4__neon, 4, 8, 1, 4,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_gemm_4x8s4__neonfma(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8s4__neonfma, 4, 8, 1, 4,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
}
static void f32_gemm_6x8s4__neon(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8s4__neon, 6, 8, 1, 4,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_gemm_6x8s4__neonfma(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma, 6, 8, 1, 4,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
}
static void f32_gemm_8x8s4__neon(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_8x8s4__neon, 8, 8, 1, 4,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_gemm_8x8s4__neonfma(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma, 8, 8, 1, 4,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
}
static void f32_ppmm_4x8_unipass__neonfma(benchmark::State& state, const char* net) {
  PPMM1PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x8__neonfma, xnn_x32_packx_ukernel_4x__neon_st4, 4, 8,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
}
static void f32_ppmm_4x8_twopass__neonfma(benchmark::State& state, const char* net) {
  PPMM2PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x8__neonfma, xnn_x32_packx_ukernel_4x__neon_st4, 4, 8,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
}

BENCHMARK_GEMM(f32_gemm_1x8__neon_lane_ld64)
BENCHMARK_GEMM(f32_gemm_4x8__neon_lane_ld64)
BENCHMARK_GEMM(f32_gemm_4x8__neon_lane_ld128)
BENCHMARK_GEMM(f32_gemm_5x8__neon_lane_ld64)
BENCHMARK_GEMM(f32_gemm_6x8__neon_lane_ld64)
BENCHMARK_GEMM(f32_gemm_6x8__neon_lane_ld128)

BENCHMARK_GEMM(f32_gemm_1x8__neonfma_dup_ld64)
BENCHMARK_GEMM(f32_gemm_4x8__neonfma_dup_ld128)
BENCHMARK_GEMM(f32_gemm_4x8__neonfma_dup_ld64)
BENCHMARK_GEMM(f32_gemm_6x8__neonfma_dup_ld64)
BENCHMARK_GEMM(f32_gemm_6x8__neonfma_dup_ld128)

BENCHMARK_GEMM(f32_gemm_1x8s4__neon)
BENCHMARK_GEMM(f32_gemm_4x8s4__neon)
BENCHMARK_GEMM(f32_gemm_6x8s4__neon)
BENCHMARK_GEMM(f32_gemm_8x8s4__neon)

BENCHMARK_GEMM(f32_gemm_1x8s4__neonfma)
BENCHMARK_GEMM(f32_gemm_4x8s4__neonfma)
BENCHMARK_GEMM(f32_gemm_6x8s4__neonfma)
BENCHMARK_GEMM(f32_gemm_8x8s4__neonfma)

BENCHMARK_GEMM(f32_ppmm_4x8_unipass__neonfma)
BENCHMARK_GEMM(f32_ppmm_4x8_twopass__neonfma)
#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64


#if XNN_ARCH_X86 || XNN_ARCH_X86_64
static void f32_gemm_1x16__avx512f_broadcast(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast, 1, 16, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckAVX512F);
}
static void f32_gemm_4x16__avx512f_broadcast(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x16__avx512f_broadcast, 4, 16, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckAVX512F);
}
static void f32_gemm_5x16__avx512f_broadcast(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x16__avx512f_broadcast, 5, 16, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckAVX512F);
}
static void f32_gemm_6x16__avx512f_broadcast(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x16__avx512f_broadcast, 6, 16, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckAVX512F);
}
static void f32_gemm_7x16__avx512f_broadcast(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_7x16__avx512f_broadcast, 7, 16, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckAVX512F);
}
static void f32_gemm_8x16__avx512f_broadcast(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_8x16__avx512f_broadcast, 8, 16, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckAVX512F);
}

static void f32_gemm_1x8__fma3_broadcast(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__fma3_broadcast, 1, 8, 1, 1,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
}
static void f32_gemm_4x8__fma3_broadcast(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__fma3_broadcast, 4, 8, 1, 1,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
}
static void f32_gemm_5x8__fma3_broadcast(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__fma3_broadcast, 5, 8, 1, 1,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
}
static void f32_gemm_6x8__fma3_broadcast(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__fma3_broadcast, 6, 8, 1, 1,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
}
static void f32_gemm_7x8__fma3_broadcast(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_7x8__fma3_broadcast, 7, 8, 1, 1,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
}
static void f32_gemm_8x8__fma3_broadcast(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_8x8__fma3_broadcast, 8, 8, 1, 1,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
}
static void f32_gemm_1x16__fma3_broadcast(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast, 1, 16, 1, 1,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
}
static void f32_gemm_3x16__fma3_broadcast(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x16__fma3_broadcast, 3, 16, 1, 1,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
}
static void f32_gemm_4x16__fma3_broadcast(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x16__fma3_broadcast, 4, 16, 1, 1,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
}
static void f32_gemm_5x16__fma3_broadcast(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x16__fma3_broadcast, 5, 16, 1, 1,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
}

static void f32_gemm_1x16s4__fma3_broadcast(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast, 1, 16, 1, 4,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
}
static void f32_gemm_3x16s4__fma3_broadcast(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast, 3, 16, 1, 4,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
}
static void f32_gemm_4x16s4__fma3_broadcast(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast, 4, 16, 1, 4,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
}
static void f32_gemm_5x16s4__fma3_broadcast(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast, 5, 16, 1, 4,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
}

static void f32_gemm_1x8__avx_broadcast(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__avx_broadcast, 1, 8, 1, 1,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
}
static void f32_gemm_4x8__avx_broadcast(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__avx_broadcast, 4, 8, 1, 1,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
}
static void f32_gemm_5x8__avx_broadcast(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__avx_broadcast, 5, 8, 1, 1,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
}
static void f32_gemm_6x8__avx_broadcast(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__avx_broadcast, 6, 8, 1, 1,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
}
static void f32_gemm_7x8__avx_broadcast(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_7x8__avx_broadcast, 7, 8, 1, 1,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
}
static void f32_gemm_1x16__avx_broadcast(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x16__avx_broadcast, 1, 16, 1, 1,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
}
static void f32_gemm_3x16__avx_broadcast(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x16__avx_broadcast, 3, 16, 1, 1,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
}
f32_gemm_4x16__avx_broadcast(benchmark::State & state,const char * net)919 static void f32_gemm_4x16__avx_broadcast(benchmark::State& state, const char* net) {
920 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x16__avx_broadcast, 4, 16, 1, 1,
921 xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
922 }
f32_gemm_5x16__avx_broadcast(benchmark::State & state,const char * net)923 static void f32_gemm_5x16__avx_broadcast(benchmark::State& state, const char* net) {
924 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x16__avx_broadcast, 5, 16, 1, 1,
925 xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
926 }
927
f32_gemm_1x8__sse2_dup(benchmark::State & state,const char * net)928 static void f32_gemm_1x8__sse2_dup(benchmark::State& state, const char* net) {
929 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__sse2_dup, 1, 8, 1, 1,
930 xnn_init_f32_minmax_sse_params);
931 }
f32_gemm_3x8__sse2_dup(benchmark::State & state,const char * net)932 static void f32_gemm_3x8__sse2_dup(benchmark::State& state, const char* net) {
933 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8__sse2_dup, 3, 8, 1, 1,
934 xnn_init_f32_minmax_sse_params);
935 }
f32_gemm_4x8__sse2_dup(benchmark::State & state,const char * net)936 static void f32_gemm_4x8__sse2_dup(benchmark::State& state, const char* net) {
937 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__sse2_dup, 4, 8, 1, 1,
938 xnn_init_f32_minmax_sse_params);
939 }
f32_gemm_5x8__sse2_dup(benchmark::State & state,const char * net)940 static void f32_gemm_5x8__sse2_dup(benchmark::State& state, const char* net) {
941 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__sse2_dup, 5, 8, 1, 1,
942 xnn_init_f32_minmax_sse_params);
943 }
944
f32_gemm_1x8__sse_load1(benchmark::State & state,const char * net)945 static void f32_gemm_1x8__sse_load1(benchmark::State& state, const char* net) {
946 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__sse_load1, 1, 8, 1, 1,
947 xnn_init_f32_minmax_sse_params);
948 }
f32_gemm_3x8__sse_load1(benchmark::State & state,const char * net)949 static void f32_gemm_3x8__sse_load1(benchmark::State& state, const char* net) {
950 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8__sse_load1, 3, 8, 1, 1,
951 xnn_init_f32_minmax_sse_params);
952 }
f32_gemm_4x8__sse_load1(benchmark::State & state,const char * net)953 static void f32_gemm_4x8__sse_load1(benchmark::State& state, const char* net) {
954 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__sse_load1, 4, 8, 1, 1,
955 xnn_init_f32_minmax_sse_params);
956 }
f32_gemm_5x8__sse_load1(benchmark::State & state,const char * net)957 static void f32_gemm_5x8__sse_load1(benchmark::State& state, const char* net) {
958 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__sse_load1, 5, 8, 1, 1,
959 xnn_init_f32_minmax_sse_params);
960 }
961
f32_gemm_1x8__sse_dup(benchmark::State & state,const char * net)962 static void f32_gemm_1x8__sse_dup(benchmark::State& state, const char* net) {
963 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__sse_dup, 1, 8, 1, 1,
964 xnn_init_f32_minmax_sse_params);
965 }
f32_gemm_3x8__sse_dup(benchmark::State & state,const char * net)966 static void f32_gemm_3x8__sse_dup(benchmark::State& state, const char* net) {
967 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8__sse_dup, 3, 8, 1, 1,
968 xnn_init_f32_minmax_sse_params);
969 }
f32_gemm_4x8__sse_dup(benchmark::State & state,const char * net)970 static void f32_gemm_4x8__sse_dup(benchmark::State& state, const char* net) {
971 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__sse_dup, 4, 8, 1, 1,
972 xnn_init_f32_minmax_sse_params);
973 }
f32_gemm_5x8__sse_dup(benchmark::State & state,const char * net)974 static void f32_gemm_5x8__sse_dup(benchmark::State& state, const char* net) {
975 GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__sse_dup, 5, 8, 1, 1,
976 xnn_init_f32_minmax_sse_params);
977 }
978
static void f32_gemm_1x8s4__sse(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8s4__sse, 1, 8, 1, 4,
    xnn_init_f32_minmax_sse_params);
}
static void f32_gemm_3x8s4__sse(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8s4__sse, 3, 8, 1, 4,
    xnn_init_f32_minmax_sse_params);
}
static void f32_gemm_4x8s4__sse(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8s4__sse, 4, 8, 1, 4,
    xnn_init_f32_minmax_sse_params);
}
static void f32_gemm_5x8s4__sse(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8s4__sse, 5, 8, 1, 4,
    xnn_init_f32_minmax_sse_params);
}

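// PPMM kernels consume a pre-packed A panel produced by the matching
// xnn_x32_packx ukernel. PPMM1PBenchmark times a unipass schedule that packs
// each row block of A right before multiplying it; PPMM2PBenchmark times a
// two-pass schedule that packs all of A first and then runs every tile,
// exposing the cache-locality trade-off between the two.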
static void f32_ppmm_4x8_unipass__sse(benchmark::State& state, const char* net) {
  PPMM1PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x8__sse, xnn_x32_packx_ukernel_4x__sse, 4, 8,
    xnn_init_f32_minmax_sse_params);
}
static void f32_ppmm_4x8_twopass__sse(benchmark::State& state, const char* net) {
  PPMM2PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x8__sse, xnn_x32_packx_ukernel_4x__sse, 4, 8,
    xnn_init_f32_minmax_sse_params);
}

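// BENCHMARK_GEMM (from bench/gemm.h) registers each wrapper above as a Google
// Benchmark over the standard GEMM shapes. Illustrative sketch only (not the
// actual macro): per wrapper it would expand to something like
//
//   BENCHMARK_CAPTURE(gemm, example, "example net")
//       ->Args({/*mc=*/256, /*nc=*/256, /*kc=*/256})->UseRealTime();
//
// where the Args tuple supplies the mc/nc/kc values that GEMMBenchmark reads
// back via state.range().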
BENCHMARK_GEMM(f32_gemm_1x16__avx512f_broadcast)
BENCHMARK_GEMM(f32_gemm_4x16__avx512f_broadcast)
BENCHMARK_GEMM(f32_gemm_5x16__avx512f_broadcast)
BENCHMARK_GEMM(f32_gemm_6x16__avx512f_broadcast)
BENCHMARK_GEMM(f32_gemm_7x16__avx512f_broadcast)
BENCHMARK_GEMM(f32_gemm_8x16__avx512f_broadcast)

BENCHMARK_GEMM(f32_gemm_1x8__fma3_broadcast)
BENCHMARK_GEMM(f32_gemm_4x8__fma3_broadcast)
BENCHMARK_GEMM(f32_gemm_5x8__fma3_broadcast)
BENCHMARK_GEMM(f32_gemm_6x8__fma3_broadcast)
BENCHMARK_GEMM(f32_gemm_7x8__fma3_broadcast)
BENCHMARK_GEMM(f32_gemm_8x8__fma3_broadcast)
BENCHMARK_GEMM(f32_gemm_1x16__fma3_broadcast)
BENCHMARK_GEMM(f32_gemm_3x16__fma3_broadcast)
BENCHMARK_GEMM(f32_gemm_4x16__fma3_broadcast)
BENCHMARK_GEMM(f32_gemm_5x16__fma3_broadcast)

BENCHMARK_GEMM(f32_gemm_1x16s4__fma3_broadcast)
BENCHMARK_GEMM(f32_gemm_3x16s4__fma3_broadcast)
BENCHMARK_GEMM(f32_gemm_4x16s4__fma3_broadcast)
BENCHMARK_GEMM(f32_gemm_5x16s4__fma3_broadcast)

BENCHMARK_GEMM(f32_gemm_1x8__avx_broadcast)
BENCHMARK_GEMM(f32_gemm_4x8__avx_broadcast)
BENCHMARK_GEMM(f32_gemm_5x8__avx_broadcast)
BENCHMARK_GEMM(f32_gemm_6x8__avx_broadcast)
BENCHMARK_GEMM(f32_gemm_7x8__avx_broadcast)
BENCHMARK_GEMM(f32_gemm_1x16__avx_broadcast)
BENCHMARK_GEMM(f32_gemm_3x16__avx_broadcast)
BENCHMARK_GEMM(f32_gemm_4x16__avx_broadcast)
BENCHMARK_GEMM(f32_gemm_5x16__avx_broadcast)

BENCHMARK_GEMM(f32_gemm_1x8__sse2_dup)
BENCHMARK_GEMM(f32_gemm_3x8__sse2_dup)
BENCHMARK_GEMM(f32_gemm_4x8__sse2_dup)
BENCHMARK_GEMM(f32_gemm_5x8__sse2_dup)

BENCHMARK_GEMM(f32_gemm_1x8__sse_load1)
BENCHMARK_GEMM(f32_gemm_3x8__sse_load1)
BENCHMARK_GEMM(f32_gemm_4x8__sse_load1)
BENCHMARK_GEMM(f32_gemm_5x8__sse_load1)

BENCHMARK_GEMM(f32_gemm_1x8__sse_dup)
BENCHMARK_GEMM(f32_gemm_3x8__sse_dup)
BENCHMARK_GEMM(f32_gemm_4x8__sse_dup)
BENCHMARK_GEMM(f32_gemm_5x8__sse_dup)

BENCHMARK_GEMM(f32_gemm_1x8s4__sse)
BENCHMARK_GEMM(f32_gemm_3x8s4__sse)
BENCHMARK_GEMM(f32_gemm_4x8s4__sse)
BENCHMARK_GEMM(f32_gemm_5x8s4__sse)

BENCHMARK_GEMM(f32_ppmm_4x8_unipass__sse)
BENCHMARK_GEMM(f32_ppmm_4x8_twopass__sse)
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64


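// In the WAsm SIMD kernels the "arm"/"x86" infix does not select a CPU (both
// compile to identical WebAssembly opcodes); it selects min/max sequences
// tuned for how each family of engines lowers them. "loadsplat" kernels
// broadcast each A element directly from memory (v128.load32_splat), while
// "splat" kernels load a full vector of A and splat its lanes one at a time.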
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
static void f32_gemm_3x8__wasmsimd_arm_loadsplat(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_arm_loadsplat, 3, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_4x8__wasmsimd_arm_loadsplat(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_arm_loadsplat, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_5x8__wasmsimd_arm_loadsplat(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_loadsplat, 5, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_6x8__wasmsimd_arm_loadsplat(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_arm_loadsplat, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_3x8__wasmsimd_x86_loadsplat(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat, 3, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_4x8__wasmsimd_x86_loadsplat(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_5x8__wasmsimd_x86_loadsplat(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_x86_loadsplat, 5, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_6x8__wasmsimd_x86_loadsplat(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_loadsplat, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_3x8__wasmsimd_arm_splat(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_arm_splat, 3, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_4x8__wasmsimd_arm_splat(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_arm_splat, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_5x8__wasmsimd_arm_splat(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_splat, 5, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_6x8__wasmsimd_arm_splat(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_arm_splat, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_3x8__wasmsimd_x86_splat(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_splat, 3, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_4x8__wasmsimd_x86_splat(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_splat, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_5x8__wasmsimd_x86_splat(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_x86_splat, 5, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_6x8__wasmsimd_x86_splat(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_splat, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_3x8s4__wasmsimd_arm(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_arm, 3, 8, 1, 4,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_4x8s4__wasmsimd_arm(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_arm, 4, 8, 1, 4,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_5x8s4__wasmsimd_arm(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_arm, 5, 8, 1, 4,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_6x8s4__wasmsimd_arm(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm, 6, 8, 1, 4,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_3x8s4__wasmsimd_x86(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86, 3, 8, 1, 4,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_4x8s4__wasmsimd_x86(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_x86, 4, 8, 1, 4,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_5x8s4__wasmsimd_x86(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_x86, 5, 8, 1, 4,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_6x8s4__wasmsimd_x86(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86, 6, 8, 1, 4,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_ppmm_4x8_unipass__wasmsimd_arm_splat(benchmark::State& state, const char* net) {
  PPMM1PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x8__wasmsimd_arm_splat, xnn_x32_packx_ukernel_4x__wasmsimd, 4, 8,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_ppmm_4x8_unipass__wasmsimd_x86_splat(benchmark::State& state, const char* net) {
  PPMM1PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x8__wasmsimd_x86_splat, xnn_x32_packx_ukernel_4x__wasmsimd, 4, 8,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_ppmm_4x8_twopass__wasmsimd_arm_splat(benchmark::State& state, const char* net) {
  PPMM2PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x8__wasmsimd_arm_splat, xnn_x32_packx_ukernel_4x__wasmsimd, 4, 8,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_ppmm_4x8_twopass__wasmsimd_x86_splat(benchmark::State& state, const char* net) {
  PPMM2PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x8__wasmsimd_x86_splat, xnn_x32_packx_ukernel_4x__wasmsimd, 4, 8,
    xnn_init_f32_minmax_scalar_params);
}

BENCHMARK_GEMM(f32_gemm_3x8__wasmsimd_arm_loadsplat)
BENCHMARK_GEMM(f32_gemm_4x8__wasmsimd_arm_loadsplat)
BENCHMARK_GEMM(f32_gemm_5x8__wasmsimd_arm_loadsplat)
BENCHMARK_GEMM(f32_gemm_6x8__wasmsimd_arm_loadsplat)
BENCHMARK_GEMM(f32_gemm_3x8__wasmsimd_x86_loadsplat)
BENCHMARK_GEMM(f32_gemm_4x8__wasmsimd_x86_loadsplat)
BENCHMARK_GEMM(f32_gemm_5x8__wasmsimd_x86_loadsplat)
BENCHMARK_GEMM(f32_gemm_6x8__wasmsimd_x86_loadsplat)
BENCHMARK_GEMM(f32_gemm_3x8__wasmsimd_arm_splat)
BENCHMARK_GEMM(f32_gemm_4x8__wasmsimd_arm_splat)
BENCHMARK_GEMM(f32_gemm_5x8__wasmsimd_arm_splat)
BENCHMARK_GEMM(f32_gemm_6x8__wasmsimd_arm_splat)
BENCHMARK_GEMM(f32_gemm_3x8__wasmsimd_x86_splat)
BENCHMARK_GEMM(f32_gemm_4x8__wasmsimd_x86_splat)
BENCHMARK_GEMM(f32_gemm_5x8__wasmsimd_x86_splat)
BENCHMARK_GEMM(f32_gemm_6x8__wasmsimd_x86_splat)
BENCHMARK_GEMM(f32_gemm_3x8s4__wasmsimd_arm)
BENCHMARK_GEMM(f32_gemm_4x8s4__wasmsimd_arm)
BENCHMARK_GEMM(f32_gemm_5x8s4__wasmsimd_arm)
BENCHMARK_GEMM(f32_gemm_6x8s4__wasmsimd_arm)
BENCHMARK_GEMM(f32_gemm_3x8s4__wasmsimd_x86)
BENCHMARK_GEMM(f32_gemm_4x8s4__wasmsimd_x86)
BENCHMARK_GEMM(f32_gemm_5x8s4__wasmsimd_x86)
BENCHMARK_GEMM(f32_gemm_6x8s4__wasmsimd_x86)

BENCHMARK_GEMM(f32_ppmm_4x8_unipass__wasmsimd_arm_splat)
BENCHMARK_GEMM(f32_ppmm_4x8_unipass__wasmsimd_x86_splat)

BENCHMARK_GEMM(f32_ppmm_4x8_twopass__wasmsimd_arm_splat)
BENCHMARK_GEMM(f32_ppmm_4x8_twopass__wasmsimd_x86_splat)
#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD


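// Scalar kernels are the portable baseline: plain C with no SIMD, so they
// build on every target and use the scalar min/max params.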
static void f32_gemm_1x4__scalar(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x4__scalar, 1, 4, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_2x4__scalar(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_2x4__scalar, 2, 4, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_4x4__scalar(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x4__scalar, 4, 4, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}

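// The unusual tile shapes below (4x2, 3x3) presumably probe register-pressure
// trade-offs that only show up in scalar code.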
static void f32_ppmm_2x4_unipass__scalar(benchmark::State& state, const char* net) {
  PPMM1PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_2x4__scalar, xnn_x32_packx_ukernel_2x__scalar, 2, 4,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_ppmm_4x2_unipass__scalar(benchmark::State& state, const char* net) {
  PPMM1PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x2__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 2,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_ppmm_4x4_unipass__scalar(benchmark::State& state, const char* net) {
  PPMM1PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x4__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 4,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_ppmm_3x3_unipass__scalar(benchmark::State& state, const char* net) {
  PPMM1PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_3x3__scalar, xnn_x32_packx_ukernel_3x__scalar, 3, 3,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_ppmm_2x4_twopass__scalar(benchmark::State& state, const char* net) {
  PPMM2PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_2x4__scalar, xnn_x32_packx_ukernel_2x__scalar, 2, 4,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_ppmm_4x2_twopass__scalar(benchmark::State& state, const char* net) {
  PPMM2PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x2__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 2,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_ppmm_4x4_twopass__scalar(benchmark::State& state, const char* net) {
  PPMM2PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x4__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 4,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_ppmm_3x3_twopass__scalar(benchmark::State& state, const char* net) {
  PPMM2PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_3x3__scalar, xnn_x32_packx_ukernel_3x__scalar, 3, 3,
    xnn_init_f32_minmax_scalar_params);
}

BENCHMARK_GEMM(f32_gemm_1x4__scalar)
BENCHMARK_GEMM(f32_gemm_2x4__scalar)
BENCHMARK_GEMM(f32_gemm_4x4__scalar)

BENCHMARK_GEMM(f32_ppmm_2x4_unipass__scalar)
BENCHMARK_GEMM(f32_ppmm_4x2_unipass__scalar)
BENCHMARK_GEMM(f32_ppmm_4x4_unipass__scalar)
BENCHMARK_GEMM(f32_ppmm_3x3_unipass__scalar)

BENCHMARK_GEMM(f32_ppmm_2x4_twopass__scalar)
BENCHMARK_GEMM(f32_ppmm_4x2_twopass__scalar)
BENCHMARK_GEMM(f32_ppmm_4x4_twopass__scalar)
BENCHMARK_GEMM(f32_ppmm_3x3_twopass__scalar)


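// ruy_st (defined earlier in this file) benchmarks single-threaded Ruy as an
// external reference point; it is only registered when building with
// BENCHMARK_RUY defined.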
#ifdef BENCHMARK_RUY
BENCHMARK_GEMM(ruy_st)
#endif // BENCHMARK_RUY

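// Defining XNNPACK_BENCHMARK_NO_MAIN lets this translation unit be linked
// into a combined benchmark binary that provides its own main().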
#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif