• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) Facebook, Inc. and its affiliates.
2 // All rights reserved.
3 //
4 // Copyright 2019 Google LLC
5 //
6 // This source code is licensed under the BSD-style license found in the
7 // LICENSE file in the root directory of this source tree.
8 
9 #include <algorithm>
10 #include <cfloat>
11 #include <chrono>
12 #include <cmath>
13 #include <functional>
14 #include <mutex>
15 #include <random>
16 #include <vector>
17 
18 #include <benchmark/benchmark.h>
19 #ifdef BENCHMARK_RUY
20 #include "ruy/ruy.h"
21 #endif  // BENCHMARK_RUY
22 #include "bench/gemm.h"
23 #include "bench/utils.h"
24 #include <xnnpack/AlignedAllocator.h>
25 #include <xnnpack/allocator.h>
26 #include <xnnpack/common.h>
27 #include <xnnpack/gemm.h>
28 #include <xnnpack/pack.h>
29 #include <xnnpack/packx.h>
30 #include <xnnpack/params-init.h>
31 #include <xnnpack/params.h>
32 #include <xnnpack/ppmm.h>
33 
34 
GEMMBenchmark(benchmark::State & state,xnn_f32_gemm_minmax_ukernel_function gemm,size_t mr,size_t nr,size_t kr,size_t sr,xnn_init_f32_minmax_params_fn init_params,benchmark::utils::IsaCheckFunction isa_check=nullptr)35 static void GEMMBenchmark(benchmark::State& state,
36   xnn_f32_gemm_minmax_ukernel_function gemm,
37   size_t mr, size_t nr, size_t kr, size_t sr,
38   xnn_init_f32_minmax_params_fn init_params,
39   benchmark::utils::IsaCheckFunction isa_check = nullptr)
40 {
41   if (isa_check && !isa_check(state)) {
42     return;
43   }
44 
45   const size_t mc = state.range(0);
46   const size_t nc = state.range(1);
47   const size_t kc = state.range(2);
48 
49   const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);
50   const size_t kc_stride = benchmark::utils::RoundUp(kc, kr * sr);
51 
52   std::random_device random_device;
53   auto rng = std::mt19937(random_device());
54   auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));
55 
56   std::vector<float> a(mc * kc);
57   std::generate(a.begin(), a.end(), std::ref(f32rng));
58   std::vector<float> k(nc * kc);
59   std::generate(k.begin(), k.end(), std::ref(f32rng));
60   std::vector<float> b(nc);
61   std::generate(b.begin(), b.end(), std::ref(f32rng));
62 
63   const size_t w_elements = nc_stride * kc_stride + nc_stride;
64   const size_t c_elements = mc * nc;
65   const size_t num_buffers = 1 +
66     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
67       sizeof(float) * (w_elements + c_elements));
68 
69   std::vector<float, AlignedAllocator<float, 64>> w(w_elements * num_buffers);
70   std::fill(w.begin(), w.end(), 0.0f);
71   xnn_pack_f32_gemm_goi_w(1 /* groups */, nc, kc, nr, kr, sr, k.data(), b.data(), w.data(), 0, nullptr);
72   std::vector<float> c(c_elements * num_buffers);
73   std::fill(c.begin(), c.end(), std::nanf(""));
74 
75   xnn_f32_minmax_params params;
76   init_params(&params,
77     -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());
78 
79   size_t buffer_index = 0;
80   for (auto _ : state) {
81     // Use circular buffers (exceeding cache size) and prefetch to control cache state:
82     // - A is always in L1 cache (if fits, otherwise L2, L3, etc)
83     // - W is not in cache (for any cache level)
84     // - C is not in cache (for any cache level)
85     state.PauseTiming();
86     benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float));
87     buffer_index = (buffer_index + 1) % num_buffers;
88     state.ResumeTiming();
89 
90     for (uint32_t m = 0; m < mc; m += mr) {
91       const uint32_t mb = min(mc - m, mr);
92       gemm(
93         mb, nc, kc * sizeof(float),
94         a.data() + m * kc, kc * sizeof(float),
95         w.data() + buffer_index * nc_stride * (kc_stride + 1),
96         c.data() + (buffer_index * mc + m) * nc, nc * sizeof(float), nr * sizeof(float),
97         &params);
98     }
99   }
100 
101   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
102   if (cpu_frequency != 0) {
103     state.counters["cpufreq"] = cpu_frequency;
104   }
105 
106   state.counters["FLOPS"] = benchmark::Counter(
107     uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
108 }
109 
PPMM1PBenchmark(benchmark::State & state,xnn_f32_ppmm_minmax_ukernel_function ppmm,xnn_x32_packx_ukernel_function packx,size_t mr,size_t nr,xnn_init_f32_minmax_params_fn init_params,benchmark::utils::IsaCheckFunction isa_check=nullptr)110 static void PPMM1PBenchmark(benchmark::State& state,
111   xnn_f32_ppmm_minmax_ukernel_function ppmm,
112   xnn_x32_packx_ukernel_function packx,
113   size_t mr, size_t nr,
114   xnn_init_f32_minmax_params_fn init_params,
115   benchmark::utils::IsaCheckFunction isa_check = nullptr)
116 {
117   if (isa_check && !isa_check(state)) {
118     return;
119   }
120 
121   const size_t mc = state.range(0);
122   const size_t nc = state.range(1);
123   const size_t kc = state.range(2);
124 
125   const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);
126 
127   std::random_device random_device;
128   auto rng = std::mt19937(random_device());
129   auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));
130 
131   std::vector<float> a(mc * kc);
132   std::generate(a.begin(), a.end(), std::ref(f32rng));
133   std::vector<float> k(nc * kc);
134   std::generate(k.begin(), k.end(), std::ref(f32rng));
135   std::vector<float> b(nc);
136   std::generate(b.begin(), b.end(), std::ref(f32rng));
137 
138   std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> t(mr * kc);
139 
140   const size_t w_elements = nc_stride * kc + nc_stride;
141   const size_t c_elements = mc * nc;
142   const size_t num_buffers = 1 +
143     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
144       sizeof(float) * (w_elements + c_elements));
145 
146   std::vector<float, AlignedAllocator<float, 64>> w(w_elements * num_buffers);
147   std::fill(w.begin(), w.end(), 0.0f);
148   xnn_pack_f32_gemm_goi_w(1 /* groups */, nc, kc, nr, 1 /* kr */, 1 /* sr */, k.data(), b.data(), w.data(), 0, nullptr);
149   std::vector<float> c(c_elements * num_buffers);
150   std::fill(c.begin(), c.end(), std::nanf(""));
151 
152   xnn_f32_minmax_params params;
153   init_params(&params,
154     -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());
155 
156   size_t buffer_index = 0;
157   for (auto _ : state) {
158     // Use circular buffers (exceeding cache size) and prefetch to control cache state:
159     // - A is always in L1 cache (if fits, otherwise L2, L3, etc)
160     // - W is not in cache (for any cache level)
161     // - C is not in cache (for any cache level)
162     state.PauseTiming();
163     benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float));
164     buffer_index = (buffer_index + 1) % num_buffers;
165     state.ResumeTiming();
166 
167     for (uint32_t m = 0; m < mc; m += mr) {
168       const uint32_t mb = min(mc - m, mr);
169       packx(mb, kc, reinterpret_cast<const uint32_t*>(a.data() + m * kc), kc, t.data());
170       ppmm(
171         mb, nc, kc * sizeof(float),
172         reinterpret_cast<const float*>(t.data()),
173         w.data() + nc_stride * buffer_index * (kc + 1),
174         c.data() + (mc * buffer_index + m) * nc, nc * sizeof(float), nr * sizeof(float),
175         &params);
176     }
177   }
178 
179   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
180   if (cpu_frequency != 0) {
181     state.counters["cpufreq"] = cpu_frequency;
182   }
183 
184   state.counters["FLOPS"] = benchmark::Counter(
185     uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
186 }
187 
PPMM2PBenchmark(benchmark::State & state,xnn_f32_ppmm_minmax_ukernel_function ppmm,xnn_x32_packx_ukernel_function packx,size_t mr,size_t nr,xnn_init_f32_minmax_params_fn init_params,benchmark::utils::IsaCheckFunction isa_check=nullptr)188 static void PPMM2PBenchmark(benchmark::State& state,
189   xnn_f32_ppmm_minmax_ukernel_function ppmm,
190   xnn_x32_packx_ukernel_function packx,
191   size_t mr, size_t nr,
192   xnn_init_f32_minmax_params_fn init_params,
193   benchmark::utils::IsaCheckFunction isa_check = nullptr)
194 {
195   if (isa_check && !isa_check(state)) {
196     return;
197   }
198 
199   const size_t mc = state.range(0);
200   const size_t nc = state.range(1);
201   const size_t kc = state.range(2);
202 
203   const size_t mc_stride = benchmark::utils::RoundUp(mc, mr);
204   const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);
205 
206   std::random_device random_device;
207   auto rng = std::mt19937(random_device());
208   auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));
209 
210   std::vector<float> a(mc * kc);
211   std::generate(a.begin(), a.end(), std::ref(f32rng));
212   std::vector<float> k(nc * kc);
213   std::generate(k.begin(), k.end(), std::ref(f32rng));
214   std::vector<float> b(nc);
215   std::generate(b.begin(), b.end(), std::ref(f32rng));
216 
217   std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> t(mc_stride * kc);
218 
219   const size_t w_elements = nc_stride * kc + nc_stride;
220   const size_t c_elements = mc * nc;
221   const size_t num_buffers = 1 +
222     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
223       sizeof(float) * (w_elements + c_elements));
224 
225   std::vector<float, AlignedAllocator<float, 64>> w(w_elements * num_buffers);
226   std::fill(w.begin(), w.end(), 0.0f);
227   xnn_pack_f32_gemm_goi_w(1 /* groups */, nc, kc, nr, 1 /* kr */, 1 /* sr */, k.data(), b.data(), w.data(), 0, nullptr);
228   std::vector<float> c(c_elements * num_buffers);
229   std::fill(c.begin(), c.end(), std::nanf(""));
230 
231   xnn_f32_minmax_params params;
232   init_params(&params,
233     -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());
234 
235   size_t buffer_index = 0;
236   for (auto _ : state) {
237     // Use circular buffers (exceeding cache size) and prefetch to control cache state:
238     // - A is always in L1 cache (if fits, otherwise L2, L3, etc)
239     // - W is not in cache (for any cache level)
240     // - C is not in cache (for any cache level)
241     state.PauseTiming();
242     benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float));
243     buffer_index = (buffer_index + 1) % num_buffers;
244     state.ResumeTiming();
245 
246     for (uint32_t m = 0; m < mc; m += mr) {
247       const uint32_t mb = min(mc - m, mr);
248       packx(mb, kc, reinterpret_cast<const uint32_t*>(a.data() + m * kc), kc, t.data() + m * kc);
249     }
250     for (uint32_t m = 0; m < mc; m += mr) {
251       const uint32_t mb = min(mc - m, mr);
252       ppmm(
253         mb, nc, kc * sizeof(float),
254         reinterpret_cast<const float*>(t.data() + m * kc),
255         w.data() + nc_stride * buffer_index * (kc + 1),
256         c.data() + (mc * buffer_index + m) * nc, nc * sizeof(float), nr * sizeof(float),
257         &params);
258     }
259   }
260 
261   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
262   if (cpu_frequency != 0) {
263     state.counters["cpufreq"] = cpu_frequency;
264   }
265 
266   state.counters["FLOPS"] = benchmark::Counter(
267     uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
268 }
269 
270 #ifdef BENCHMARK_RUY
RuyBenchmark(benchmark::State & state,uint32_t threads)271 static void RuyBenchmark(benchmark::State& state, uint32_t threads)
272 {
273   std::random_device random_device;
274   auto rng = std::mt19937(random_device());
275   auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));
276 
277   const size_t mc = state.range(0);
278   const size_t nc = state.range(1);
279   const size_t kc = state.range(2);
280 
281   const size_t num_buffers = 1 +
282     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
283       sizeof(float) * (nc * (mc + kc + 1)));
284 
285   std::vector<float> a(mc * kc);
286   std::generate(a.begin(), a.end(), std::ref(f32rng));
287   std::vector<float> k(num_buffers * nc * kc);
288   std::generate(k.begin(), k.end(), std::ref(f32rng));
289   std::vector<float> b(num_buffers * nc);
290   std::generate(b.begin(), b.end(), std::ref(f32rng));
291   std::vector<float> c(num_buffers * nc * mc);
292   std::fill(c.begin(), c.end(), std::nanf(""));
293 
294   // Note: context must be static to avoid the cost of re-creating it for each benchmark.
295   static ruy::Context context;
296   context.set_max_num_threads(threads);
297 
298   ruy::Matrix<float> ruy_a;
299   ruy::MakeSimpleLayout(nc, kc, ruy::Order::kRowMajor, ruy_a.mutable_layout());
300   ruy::Matrix<float> ruy_b;
301   ruy::MakeSimpleLayout(kc, mc, ruy::Order::kColMajor, ruy_b.mutable_layout());
302   ruy_b.set_data(a.data());
303   ruy::Matrix<float> ruy_c;
304   ruy::MakeSimpleLayout(nc, mc, ruy::Order::kColMajor, ruy_c.mutable_layout());
305 
306   ruy::MulParams<float, float> mul_params;
307 
308   // ruy::Context uses deferred initialization, which affects percieved GEMM performance. Initialization happens during
309   // the first GEMM calls, and per Benoit Jacob it takes up to ~250 milliseconds for performance to stabilize.
310   // Thus, on the first benchmark, we compute GEMM for 500 milliseconds (to be safe) without recording performance, and
311   // keep the ruy::Context object initialized (by being static) between subsequent benchmarks.
312   static std::once_flag warmup;
313   std::call_once(warmup, [&](){
314     auto start = std::chrono::steady_clock::now();
315     do {
316       ruy_a.set_data(k.data());
317       ruy_c.set_data(c.data());
318       mul_params.set_bias(b.data());
319 
320       ruy::Mul(ruy_a, ruy_b, mul_params, &context, &ruy_c);
321     } while (std::chrono::duration<double>(std::chrono::steady_clock::now() - start).count() < 0.5);
322   });
323 
324   size_t buffer_index = 0;
325   for (auto _ : state) {
326     // Use circular buffers (exceeding cache size) and prefetch to control cache state:
327     // - A is always in L1 cache (if fits, otherwise L2, L3, etc)
328     // - K is not in cache (for any cache level)
329     // - B is not in cache (for any cache level)
330     // - C is not in cache (for any cache level)
331     state.PauseTiming();
332     benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float));
333     buffer_index = (buffer_index + 1) % num_buffers;
334     state.ResumeTiming();
335 
336     ruy_a.set_data(k.data() + buffer_index * nc * kc);
337     ruy_c.set_data(c.data() + buffer_index * mc * nc);
338     mul_params.set_bias(b.data() + buffer_index * nc);
339 
340     ruy::Mul(ruy_a, ruy_b, mul_params, &context, &ruy_c);
341   }
342 
343   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
344   if (cpu_frequency != 0) {
345     state.counters["cpufreq"] = cpu_frequency;
346   }
347 
348   state.counters["FLOPS"] = benchmark::Counter(
349     uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
350 }
351 
ruy_st(benchmark::State & state,const char * net)352 static void ruy_st(benchmark::State& state, const char* net)
353 {
354   RuyBenchmark(state, 1);
355 }
356 #endif  // BENCHMARK_RUY
357 
358 #if XNN_PLATFORM_JIT
GEMMBenchmark(benchmark::State & state,xnn_jit_gemm_code_generator_function generator,size_t mr,size_t nr,size_t kr,size_t sr,xnn_init_f32_minmax_params_fn init_params,benchmark::utils::IsaCheckFunction isa_check=nullptr)359 static void GEMMBenchmark(benchmark::State& state,
360   xnn_jit_gemm_code_generator_function generator,
361   size_t mr, size_t nr, size_t kr, size_t sr,
362   xnn_init_f32_minmax_params_fn init_params,
363   benchmark::utils::IsaCheckFunction isa_check = nullptr)
364 {
365   if (isa_check && !isa_check(state)) {
366     return;
367   }
368 
369   const size_t mc = state.range(0);
370   const size_t nc = state.range(1);
371   const size_t kc = state.range(2);
372 
373   const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);
374   const size_t kc_stride = benchmark::utils::RoundUp(kc, kr);
375 
376   std::random_device random_device;
377   auto rng = std::mt19937(random_device());
378   auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));
379 
380   std::vector<float> a(mc * kc);
381   std::generate(a.begin(), a.end(), std::ref(f32rng));
382   std::vector<float> k(nc * kc);
383   std::generate(k.begin(), k.end(), std::ref(f32rng));
384   std::vector<float> b(nc);
385   std::generate(b.begin(), b.end(), std::ref(f32rng));
386 
387   const size_t w_elements = nc_stride * kc_stride + nc_stride;
388   const size_t c_elements = mc * nc;
389   const size_t num_buffers = 1 +
390     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
391       sizeof(float) * (w_elements + c_elements));
392 
393   std::vector<float, AlignedAllocator<float, 64>> w(w_elements * num_buffers);
394   std::fill(w.begin(), w.end(), 0.0f);
395   xnn_pack_f32_gemm_goi_w(1 /* groups */, nc, kc, nr, kr, sr, k.data(), b.data(), w.data(), 0, nullptr);
396   std::vector<float> c(c_elements * num_buffers);
397   std::fill(c.begin(), c.end(), std::nanf(""));
398 
399   xnn_f32_minmax_params params;
400   init_params(&params,
401     -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());
402 
403   xnn_code_buffer code_buffer;
404   xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE);
405   jit_gemm_params jit_params = {
406     .f32_minmax = {
407       .min = -std::numeric_limits<float>::infinity(),
408       .max = +std::numeric_limits<float>::infinity()
409     }
410   };
411   generator(&code_buffer, nc, kc * sizeof(float), &jit_params);
412   xnn_f32_gemm_minmax_ukernel_function gemm = reinterpret_cast<xnn_f32_gemm_minmax_ukernel_function>(code_buffer.code);
413 
414   size_t buffer_index = 0;
415   for (auto _ : state) {
416     // Use circular buffers (exceeding cache size) and prefetch to control cache state:
417     // - A is always in L1 cache (if fits, otherwise L2, L3, etc)
418     // - W is not in cache (for any cache level)
419     // - C is not in cache (for any cache level)
420     state.PauseTiming();
421     benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float));
422     buffer_index = (buffer_index + 1) % num_buffers;
423     state.ResumeTiming();
424 
425     for (uint32_t m = 0; m < mc; m += mr) {
426       const uint32_t mb = min(mc - m, mr);
427       gemm(
428         mb, nc, kc * sizeof(float),
429         a.data() + m * kc, kc * sizeof(float),
430         w.data() + buffer_index * nc_stride * (kc_stride + 1),
431         c.data() + (buffer_index * mc + m) * nc, nc * sizeof(float), nr * sizeof(float),
432         &params);
433     }
434   }
435 
436   xnn_release_code_memory(&code_buffer);
437 
438   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
439   if (cpu_frequency != 0) {
440     state.counters["cpufreq"] = cpu_frequency;
441   }
442 
443   state.counters["FLOPS"] = benchmark::Counter(
444     uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
445 }
446 #endif  // XNN_PLATFORM_JIT
447 
448 #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
f32_gemm_1x8__aarch64_neonfma_ld64(benchmark::State & state,const char * net)449   static void f32_gemm_1x8__aarch64_neonfma_ld64(benchmark::State& state, const char* net) {
450     GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_ld64, 1, 8, 1, 1,
451       xnn_init_f32_minmax_scalar_params);
452   }
f32_gemm_1x12__aarch64_neonfma_cortex_a53(benchmark::State & state,const char * net)453   static void f32_gemm_1x12__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
454     GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x12__aarch64_neonfma_cortex_a53, 1, 12, 1, 1,
455       xnn_init_f32_minmax_scalar_params);
456   }
f32_gemm_1x8__aarch64_neonfma_cortex_a53(benchmark::State & state,const char * net)457   static void f32_gemm_1x8__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
458     GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53, 1, 8, 1, 1,
459       xnn_init_f32_minmax_scalar_params);
460   }
f32_gemm_1x8__aarch64_neonfma_cortex_a75(benchmark::State & state,const char * net)461   static void f32_gemm_1x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
462     GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75, 1, 8, 1, 1,
463       xnn_init_f32_minmax_scalar_params);
464   }
f32_gemm_1x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State & state,const char * net)465   static void f32_gemm_1x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) {
466     GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75, 1, 8, 1, 1,
467       xnn_init_f32_minmax_scalar_params);
468   }
f32_gemm_4x12__aarch64_neonfma_cortex_a53(benchmark::State & state,const char * net)469   static void f32_gemm_4x12__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
470     GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x12__aarch64_neonfma_cortex_a53, 4, 12, 1, 1,
471       xnn_init_f32_minmax_scalar_params);
472   }
f32_gemm_4x8__aarch64_neonfma_cortex_a53(benchmark::State & state,const char * net)473   static void f32_gemm_4x8__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
474     GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53, 4, 8, 1, 1,
475       xnn_init_f32_minmax_scalar_params);
476   }
f32_gemm_4x8__aarch64_neonfma_cortex_a55(benchmark::State & state,const char * net)477   static void f32_gemm_4x8__aarch64_neonfma_cortex_a55(benchmark::State& state, const char* net) {
478     GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55, 4, 8, 1, 1,
479       xnn_init_f32_minmax_scalar_params);
480   }
f32_gemm_4x8__aarch64_neonfma_cortex_a75(benchmark::State & state,const char * net)481   static void f32_gemm_4x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
482     GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75, 4, 8, 1, 1,
483       xnn_init_f32_minmax_scalar_params);
484   }
f32_gemm_4x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State & state,const char * net)485   static void f32_gemm_4x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) {
486     GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75, 4, 8, 1, 1,
487       xnn_init_f32_minmax_scalar_params);
488   }
f32_gemm_4x8__aarch64_neonfma_ld64(benchmark::State & state,const char * net)489   static void f32_gemm_4x8__aarch64_neonfma_ld64(benchmark::State& state, const char* net) {
490     GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_ld64, 4, 8, 1, 1,
491       xnn_init_f32_minmax_scalar_params);
492   }
f32_gemm_4x8__aarch64_neonfma_ld128(benchmark::State & state,const char * net)493   static void f32_gemm_4x8__aarch64_neonfma_ld128(benchmark::State& state, const char* net) {
494     GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_ld128, 4, 8, 1, 1,
495       xnn_init_f32_minmax_scalar_params);
496   }
f32_gemm_5x8__aarch64_neonfma_cortex_a75(benchmark::State & state,const char * net)497   static void f32_gemm_5x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
498     GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_cortex_a75, 5, 8, 1, 1,
499       xnn_init_f32_minmax_scalar_params);
500   }
f32_gemm_5x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State & state,const char * net)501   static void f32_gemm_5x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) {
502     GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75, 5, 8, 1, 1,
503       xnn_init_f32_minmax_scalar_params);
504   }
f32_gemm_6x8__aarch64_neonfma_ld64(benchmark::State & state,const char * net)505   static void f32_gemm_6x8__aarch64_neonfma_ld64(benchmark::State& state, const char* net) {
506     GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld64, 6, 8, 1, 1,
507       xnn_init_f32_minmax_scalar_params);
508   }
f32_gemm_6x8__aarch64_neonfma_ld128(benchmark::State & state,const char * net)509   static void f32_gemm_6x8__aarch64_neonfma_ld128(benchmark::State& state, const char* net) {
510     GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld128, 6, 8, 1, 1,
511       xnn_init_f32_minmax_scalar_params);
512   }
f32_gemm_6x8__aarch64_neonfma_cortex_a53(benchmark::State & state,const char * net)513   static void f32_gemm_6x8__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
514     GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53, 6, 8, 1, 1,
515       xnn_init_f32_minmax_scalar_params);
516   }
f32_gemm_6x8__aarch64_neonfma_cortex_a55(benchmark::State & state,const char * net)517   static void f32_gemm_6x8__aarch64_neonfma_cortex_a55(benchmark::State& state, const char* net) {
518     GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55, 6, 8, 1, 1,
519       xnn_init_f32_minmax_scalar_params);
520   }
f32_gemm_6x8__aarch64_neonfma_cortex_a73(benchmark::State & state,const char * net)521   static void f32_gemm_6x8__aarch64_neonfma_cortex_a73(benchmark::State& state, const char* net) {
522     GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a73, 6, 8, 1, 1,
523       xnn_init_f32_minmax_scalar_params);
524   }
f32_gemm_6x8__aarch64_neonfma_cortex_a75(benchmark::State & state,const char * net)525   static void f32_gemm_6x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
526     GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75, 6, 8, 1, 1,
527       xnn_init_f32_minmax_scalar_params);
528   }
f32_gemm_6x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State & state,const char * net)529   static void f32_gemm_6x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) {
530     GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, 6, 8, 1, 1,
531       xnn_init_f32_minmax_scalar_params);
532   }
f32_gemm_1x8__neonfma_lane_ld64(benchmark::State & state,const char * net)533   static void f32_gemm_1x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
534     GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__neonfma_lane_ld64, 1, 8, 1, 1,
535       xnn_init_f32_minmax_scalar_params);
536   }
f32_gemm_4x8__neonfma_lane_ld64(benchmark::State & state,const char * net)537   static void f32_gemm_4x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
538     GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld64, 4, 8, 1, 1,
539       xnn_init_f32_minmax_scalar_params);
540   }
f32_gemm_4x8__neonfma_lane_ld128(benchmark::State & state,const char * net)541   static void f32_gemm_4x8__neonfma_lane_ld128(benchmark::State& state, const char* net) {
542     GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128, 4, 8, 1, 1,
543       xnn_init_f32_minmax_scalar_params);
544   }
f32_gemm_5x8__neonfma_lane_ld64(benchmark::State & state,const char * net)545   static void f32_gemm_5x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
546     GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__neonfma_lane_ld64, 5, 8, 1, 1,
547       xnn_init_f32_minmax_scalar_params);
548   }
f32_gemm_6x8__neonfma_lane_ld64(benchmark::State & state,const char * net)549   static void f32_gemm_6x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
550     GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld64, 6, 8, 1, 1,
551       xnn_init_f32_minmax_scalar_params);
552   }
f32_gemm_6x8__neonfma_lane_ld128(benchmark::State & state,const char * net)553   static void f32_gemm_6x8__neonfma_lane_ld128(benchmark::State& state, const char* net) {
554     GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld128, 6, 8, 1, 1,
555       xnn_init_f32_minmax_scalar_params);
556   }
557 
558   BENCHMARK_GEMM(f32_gemm_1x8__aarch64_neonfma_ld64)
BENCHMARK_GEMM(f32_gemm_1x12__aarch64_neonfma_cortex_a53)559   BENCHMARK_GEMM(f32_gemm_1x12__aarch64_neonfma_cortex_a53)
560   BENCHMARK_GEMM(f32_gemm_1x8__aarch64_neonfma_cortex_a53)
561   BENCHMARK_GEMM(f32_gemm_1x8__aarch64_neonfma_cortex_a75)
562   BENCHMARK_GEMM(f32_gemm_1x8__aarch64_neonfma_prfm_cortex_a75)
563   BENCHMARK_GEMM(f32_gemm_4x12__aarch64_neonfma_cortex_a53)
564   BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_cortex_a53)
565   BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_cortex_a55)
566   BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_cortex_a75)
567   BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_prfm_cortex_a75)
568   BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_ld128)
569   BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_ld64)
570   BENCHMARK_GEMM(f32_gemm_5x8__aarch64_neonfma_cortex_a75)
571   BENCHMARK_GEMM(f32_gemm_5x8__aarch64_neonfma_prfm_cortex_a75)
572   BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_cortex_a53)
573   BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_cortex_a55)
574   BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_cortex_a73)
575   BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_cortex_a75)
576   BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_prfm_cortex_a75)
577   BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_ld64)
578   BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_ld128)
579   BENCHMARK_GEMM(f32_gemm_1x8__neonfma_lane_ld64)
580   BENCHMARK_GEMM(f32_gemm_4x8__neonfma_lane_ld64)
581   BENCHMARK_GEMM(f32_gemm_4x8__neonfma_lane_ld128)
582   BENCHMARK_GEMM(f32_gemm_5x8__neonfma_lane_ld64)
583   BENCHMARK_GEMM(f32_gemm_6x8__neonfma_lane_ld64)
584   BENCHMARK_GEMM(f32_gemm_6x8__neonfma_lane_ld128)
585 #endif  // XNN_ARCH_ARM64
586 
#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
  // Benchmark wrappers for AArch32 assembly f32 GEMM microkernels. Each
  // wrapper forwards its microkernel and tile parameters (mr, nr, kr, sr) to
  // GEMMBenchmark, together with the min/max params initializer and an ISA
  // availability check (VFP or NEON) that skips the benchmark on CPUs
  // lacking the extension.
  static void f32_gemm_4x4__aarch32_vfp_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x4__aarch32_vfp_ld64, 4, 4, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckVFP);
  }
  static void f32_gemm_4x8__aarch32_neon_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_ld64, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_4x8__aarch32_neon_cortex_a7(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a7, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_4x8__aarch32_neon_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_4x8__aarch32_neon_cortex_a55(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_4x8__aarch32_neon_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_4x8__aarch32_neon_prfm_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a75, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }

  // Register the wrappers above with the benchmark harness.
  BENCHMARK_GEMM(f32_gemm_4x4__aarch32_vfp_ld64)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_ld64)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_cortex_a7)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_cortex_a53)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_cortex_a55)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_cortex_a75)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_prfm_cortex_a75)
#endif  // XNN_ARCH_ARM
625 
#if XNN_ARCH_ARM && XNN_PLATFORM_JIT
  // Benchmark wrappers for JIT-compiled AArch32 NEON f32 GEMM kernels: each
  // passes an xnn_generate_* code generator (rather than a prebuilt ukernel)
  // to GEMMBenchmark with the same (mr, nr, kr, sr) tile parameters as the
  // corresponding assembly wrappers above.
  static void jit_f32_gemm_4x8__aarch32_neon_ld64(benchmark::State& state, const char* net)
  {
    GEMMBenchmark(state, xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_ld64, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void jit_f32_gemm_4x8__aarch32_neon_cortex_a7(benchmark::State& state, const char* net)
  {
    GEMMBenchmark(state, xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a7, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void jit_f32_gemm_4x8__aarch32_neon_cortex_a53(benchmark::State& state, const char* net)
  {
    GEMMBenchmark(state, xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void jit_f32_gemm_4x8__aarch32_neon_cortex_a55(benchmark::State& state, const char* net)
  {
    GEMMBenchmark(state, xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void jit_f32_gemm_4x8__aarch32_neon_cortex_a75(benchmark::State& state, const char* net)
  {
    GEMMBenchmark(state, xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void jit_f32_gemm_4x8__aarch32_neon_prfm_cortex_a75(benchmark::State& state, const char* net)
  {
    GEMMBenchmark(state, xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_prfm_cortex_a75, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }

  // Register the JIT wrappers above with the benchmark harness.
  BENCHMARK_GEMM(jit_f32_gemm_4x8__aarch32_neon_ld64)
  BENCHMARK_GEMM(jit_f32_gemm_4x8__aarch32_neon_cortex_a7)
  BENCHMARK_GEMM(jit_f32_gemm_4x8__aarch32_neon_cortex_a53)
  BENCHMARK_GEMM(jit_f32_gemm_4x8__aarch32_neon_cortex_a55)
  BENCHMARK_GEMM(jit_f32_gemm_4x8__aarch32_neon_cortex_a75)
  BENCHMARK_GEMM(jit_f32_gemm_4x8__aarch32_neon_prfm_cortex_a75)
#endif  // XNN_ARCH_ARM && XNN_PLATFORM_JIT
665 
#if XNN_ARCH_ARM64 && XNN_PLATFORM_JIT
  // Benchmark wrappers for JIT-compiled AArch64 NEONFMA f32 GEMM kernels.
  // NOTE(review): these neonfma generators gate on CheckNEON, while the
  // non-JIT neonfma wrappers elsewhere in this file use CheckNEONFMA —
  // confirm whether CheckNEONFMA was intended here.
  static void jit_f32_gemm_1x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net)
  {
    GEMMBenchmark(state, xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75, 1, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void jit_f32_gemm_1x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net)
  {
    GEMMBenchmark(state, xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75, 1, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void jit_f32_gemm_6x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net)
  {
    GEMMBenchmark(state, xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void jit_f32_gemm_6x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net)
  {
    GEMMBenchmark(state, xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  // Register the JIT wrappers above with the benchmark harness.
  BENCHMARK_GEMM(jit_f32_gemm_1x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_GEMM(jit_f32_gemm_1x8__aarch64_neonfma_prfm_cortex_a75)
  BENCHMARK_GEMM(jit_f32_gemm_6x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_GEMM(jit_f32_gemm_6x8__aarch64_neonfma_prfm_cortex_a75)
#endif  // XNN_ARCH_ARM64 && XNN_PLATFORM_JIT
692 
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // Benchmark wrappers for portable NEON/NEONFMA f32 GEMM intrinsics
  // microkernels (lane, dup, and s4 shuffle variants) plus NEONFMA PPMM.
  // Each wrapper forwards the tile parameters (mr, nr, kr, sr) and the
  // matching ISA check (CheckNEON or CheckNEONFMA) to the harness.
  static void f32_gemm_1x8__neon_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64, 1, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_4x8__neon_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld64, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_4x8__neon_lane_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_5x8__neon_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__neon_lane_ld64, 5, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_6x8__neon_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld64, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_6x8__neon_lane_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_1x8__neonfma_dup_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__neonfma_dup_ld64, 1, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
  }
  static void f32_gemm_4x8__neonfma_dup_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__neonfma_dup_ld64, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
  }
  static void f32_gemm_4x8__neonfma_dup_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__neonfma_dup_ld128, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
  }
  static void f32_gemm_6x8__neonfma_dup_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__neonfma_dup_ld64, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
  }
  static void f32_gemm_6x8__neonfma_dup_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__neonfma_dup_ld128, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
  }
  // s4 variants pass sr=4 (shuffle factor) instead of 1.
  static void f32_gemm_1x8s4__neon(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8s4__neon, 1, 8, 1, 4,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_1x8s4__neonfma(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma, 1, 8, 1, 4,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
  }
  static void f32_gemm_4x8s4__neon(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8s4__neon, 4, 8, 1, 4,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_4x8s4__neonfma(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8s4__neonfma, 4, 8, 1, 4,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
  }
  static void f32_gemm_6x8s4__neon(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8s4__neon, 6, 8, 1, 4,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_6x8s4__neonfma(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma, 6, 8, 1, 4,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
  }
  static void f32_gemm_8x8s4__neon(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_8x8s4__neon, 8, 8, 1, 4,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_8x8s4__neonfma(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma, 8, 8, 1, 4,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
  }
  // PPMM wrappers also supply an x32 packing ukernel to the one-pass (1P)
  // and two-pass (2P) PPMM harnesses.
  static void f32_ppmm_4x8_unipass__neonfma(benchmark::State& state, const char* net) {
    PPMM1PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x8__neonfma, xnn_x32_packx_ukernel_4x__neon_st4, 4, 8,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
  }
  static void f32_ppmm_4x8_twopass__neonfma(benchmark::State& state, const char* net) {
    PPMM2PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x8__neonfma, xnn_x32_packx_ukernel_4x__neon_st4, 4, 8,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
  }

  // Register the wrappers above with the benchmark harness.
  BENCHMARK_GEMM(f32_gemm_1x8__neon_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_4x8__neon_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_4x8__neon_lane_ld128)
  BENCHMARK_GEMM(f32_gemm_5x8__neon_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_6x8__neon_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_6x8__neon_lane_ld128)

  BENCHMARK_GEMM(f32_gemm_1x8__neonfma_dup_ld64)
  BENCHMARK_GEMM(f32_gemm_4x8__neonfma_dup_ld128)
  BENCHMARK_GEMM(f32_gemm_4x8__neonfma_dup_ld64)
  BENCHMARK_GEMM(f32_gemm_6x8__neonfma_dup_ld64)
  BENCHMARK_GEMM(f32_gemm_6x8__neonfma_dup_ld128)

  BENCHMARK_GEMM(f32_gemm_1x8s4__neon)
  BENCHMARK_GEMM(f32_gemm_4x8s4__neon)
  BENCHMARK_GEMM(f32_gemm_6x8s4__neon)
  BENCHMARK_GEMM(f32_gemm_8x8s4__neon)

  BENCHMARK_GEMM(f32_gemm_1x8s4__neonfma)
  BENCHMARK_GEMM(f32_gemm_4x8s4__neonfma)
  BENCHMARK_GEMM(f32_gemm_6x8s4__neonfma)
  BENCHMARK_GEMM(f32_gemm_8x8s4__neonfma)

  BENCHMARK_GEMM(f32_ppmm_4x8_unipass__neonfma)
  BENCHMARK_GEMM(f32_ppmm_4x8_twopass__neonfma)
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
805 
806 
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // Benchmark wrappers for x86 f32 GEMM microkernels. AVX512F variants use
  // scalar min/max params; AVX/FMA3 variants use the AVX params initializer.
  static void f32_gemm_1x16__avx512f_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast, 1, 16, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckAVX512F);
  }
  static void f32_gemm_4x16__avx512f_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x16__avx512f_broadcast, 4, 16, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckAVX512F);
  }
  static void f32_gemm_5x16__avx512f_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x16__avx512f_broadcast, 5, 16, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckAVX512F);
  }
  static void f32_gemm_6x16__avx512f_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x16__avx512f_broadcast, 6, 16, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckAVX512F);
  }
  static void f32_gemm_7x16__avx512f_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_7x16__avx512f_broadcast, 7, 16, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckAVX512F);
  }
  static void f32_gemm_8x16__avx512f_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_8x16__avx512f_broadcast, 8, 16, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckAVX512F);
  }

  static void f32_gemm_1x8__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__fma3_broadcast, 1, 8, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_4x8__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__fma3_broadcast, 4, 8, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_5x8__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__fma3_broadcast, 5, 8, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_6x8__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__fma3_broadcast, 6, 8, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_7x8__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_7x8__fma3_broadcast, 7, 8, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_8x8__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_8x8__fma3_broadcast, 8, 8, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_1x16__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast, 1, 16, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
  }
f32_gemm_3x16__fma3_broadcast(benchmark::State & state,const char * net)861   static void f32_gemm_3x16__fma3_broadcast(benchmark::State& state, const char* net) {
862     GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x16__fma3_broadcast, 4, 16, 1, 1,
863       xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
864   }
  static void f32_gemm_4x16__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x16__fma3_broadcast, 4, 16, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_5x16__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x16__fma3_broadcast, 5, 16, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
  }

  // FMA3 s4 shuffle variants: sr=4 instead of 1.
  static void f32_gemm_1x16s4__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast, 1, 16, 1, 4,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
  }
f32_gemm_3x16s4__fma3_broadcast(benchmark::State & state,const char * net)878   static void f32_gemm_3x16s4__fma3_broadcast(benchmark::State& state, const char* net) {
879     GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast, 4, 16, 1, 4,
880       xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
881   }
  static void f32_gemm_4x16s4__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast, 4, 16, 1, 4,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_5x16s4__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast, 5, 16, 1, 4,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
  }
890 
  // AVX broadcast variants (no FMA): gated on CheckAVX.
  static void f32_gemm_1x8__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__avx_broadcast, 1, 8, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
  }
  static void f32_gemm_4x8__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__avx_broadcast, 4, 8, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
  }
  static void f32_gemm_5x8__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__avx_broadcast, 5, 8, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
  }
  static void f32_gemm_6x8__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__avx_broadcast, 6, 8, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
  }
  static void f32_gemm_7x8__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_7x8__avx_broadcast, 7, 8, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
  }
  static void f32_gemm_1x16__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x16__avx_broadcast, 1, 16, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
  }
f32_gemm_3x16__avx_broadcast(benchmark::State & state,const char * net)915   static void f32_gemm_3x16__avx_broadcast(benchmark::State& state, const char* net) {
916     GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x16__avx_broadcast, 4, 16, 1, 1,
917       xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
918   }
  static void f32_gemm_4x16__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x16__avx_broadcast, 4, 16, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
  }
  static void f32_gemm_5x16__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x16__avx_broadcast, 5, 16, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
  }

  // SSE/SSE2 variants: baseline x86 ISA, so no isa_check is passed (the
  // GEMMBenchmark isa_check parameter defaults to nullptr).
  static void f32_gemm_1x8__sse2_dup(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__sse2_dup, 1, 8, 1, 1,
      xnn_init_f32_minmax_sse_params);
  }
  static void f32_gemm_3x8__sse2_dup(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8__sse2_dup, 3, 8, 1, 1,
      xnn_init_f32_minmax_sse_params);
  }
  static void f32_gemm_4x8__sse2_dup(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__sse2_dup, 4, 8, 1, 1,
      xnn_init_f32_minmax_sse_params);
  }
  static void f32_gemm_5x8__sse2_dup(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__sse2_dup, 5, 8, 1, 1,
      xnn_init_f32_minmax_sse_params);
  }

  static void f32_gemm_1x8__sse_load1(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__sse_load1, 1, 8, 1, 1,
      xnn_init_f32_minmax_sse_params);
  }
  static void f32_gemm_3x8__sse_load1(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8__sse_load1, 3, 8, 1, 1,
      xnn_init_f32_minmax_sse_params);
  }
  static void f32_gemm_4x8__sse_load1(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__sse_load1, 4, 8, 1, 1,
      xnn_init_f32_minmax_sse_params);
  }
  static void f32_gemm_5x8__sse_load1(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__sse_load1, 5, 8, 1, 1,
      xnn_init_f32_minmax_sse_params);
  }

  static void f32_gemm_1x8__sse_dup(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__sse_dup, 1, 8, 1, 1,
      xnn_init_f32_minmax_sse_params);
  }
  static void f32_gemm_3x8__sse_dup(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8__sse_dup, 3, 8, 1, 1,
      xnn_init_f32_minmax_sse_params);
  }
  static void f32_gemm_4x8__sse_dup(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__sse_dup, 4, 8, 1, 1,
      xnn_init_f32_minmax_sse_params);
  }
  static void f32_gemm_5x8__sse_dup(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__sse_dup, 5, 8, 1, 1,
      xnn_init_f32_minmax_sse_params);
  }

  // SSE s4 shuffle variants: sr=4 instead of 1.
  static void f32_gemm_1x8s4__sse(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8s4__sse, 1, 8, 1, 4,
      xnn_init_f32_minmax_sse_params);
  }
  static void f32_gemm_3x8s4__sse(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8s4__sse, 3, 8, 1, 4,
      xnn_init_f32_minmax_sse_params);
  }
  static void f32_gemm_4x8s4__sse(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8s4__sse, 4, 8, 1, 4,
      xnn_init_f32_minmax_sse_params);
  }
  static void f32_gemm_5x8s4__sse(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8s4__sse, 5, 8, 1, 4,
      xnn_init_f32_minmax_sse_params);
  }

  // SSE PPMM wrappers: one-pass (1P) and two-pass (2P) harnesses with the
  // SSE x32 packing ukernel.
  static void f32_ppmm_4x8_unipass__sse(benchmark::State& state, const char* net) {
    PPMM1PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x8__sse, xnn_x32_packx_ukernel_4x__sse, 4, 8,
      xnn_init_f32_minmax_sse_params);
  }
  static void f32_ppmm_4x8_twopass__sse(benchmark::State& state, const char* net) {
    PPMM2PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x8__sse, xnn_x32_packx_ukernel_4x__sse, 4, 8,
      xnn_init_f32_minmax_sse_params);
  }

  // Register the x86 wrappers above with the benchmark harness.
  BENCHMARK_GEMM(f32_gemm_1x16__avx512f_broadcast)
  BENCHMARK_GEMM(f32_gemm_4x16__avx512f_broadcast)
  BENCHMARK_GEMM(f32_gemm_5x16__avx512f_broadcast)
  BENCHMARK_GEMM(f32_gemm_6x16__avx512f_broadcast)
  BENCHMARK_GEMM(f32_gemm_7x16__avx512f_broadcast)
  BENCHMARK_GEMM(f32_gemm_8x16__avx512f_broadcast)

  BENCHMARK_GEMM(f32_gemm_1x8__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_4x8__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_5x8__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_6x8__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_7x8__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_8x8__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_1x16__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_3x16__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_4x16__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_5x16__fma3_broadcast)

  BENCHMARK_GEMM(f32_gemm_1x16s4__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_3x16s4__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_4x16s4__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_5x16s4__fma3_broadcast)

  BENCHMARK_GEMM(f32_gemm_1x8__avx_broadcast)
  BENCHMARK_GEMM(f32_gemm_4x8__avx_broadcast)
  BENCHMARK_GEMM(f32_gemm_5x8__avx_broadcast)
  BENCHMARK_GEMM(f32_gemm_6x8__avx_broadcast)
  BENCHMARK_GEMM(f32_gemm_7x8__avx_broadcast)
  BENCHMARK_GEMM(f32_gemm_1x16__avx_broadcast)
  BENCHMARK_GEMM(f32_gemm_3x16__avx_broadcast)
  BENCHMARK_GEMM(f32_gemm_4x16__avx_broadcast)
  BENCHMARK_GEMM(f32_gemm_5x16__avx_broadcast)

  BENCHMARK_GEMM(f32_gemm_1x8__sse2_dup)
  BENCHMARK_GEMM(f32_gemm_3x8__sse2_dup)
  BENCHMARK_GEMM(f32_gemm_4x8__sse2_dup)
  BENCHMARK_GEMM(f32_gemm_5x8__sse2_dup)

  BENCHMARK_GEMM(f32_gemm_1x8__sse_load1)
  BENCHMARK_GEMM(f32_gemm_3x8__sse_load1)
  BENCHMARK_GEMM(f32_gemm_4x8__sse_load1)
  BENCHMARK_GEMM(f32_gemm_5x8__sse_load1)

  BENCHMARK_GEMM(f32_gemm_1x8__sse_dup)
  BENCHMARK_GEMM(f32_gemm_3x8__sse_dup)
  BENCHMARK_GEMM(f32_gemm_4x8__sse_dup)
  BENCHMARK_GEMM(f32_gemm_5x8__sse_dup)

  BENCHMARK_GEMM(f32_gemm_1x8s4__sse)
  BENCHMARK_GEMM(f32_gemm_3x8s4__sse)
  BENCHMARK_GEMM(f32_gemm_4x8s4__sse)
  BENCHMARK_GEMM(f32_gemm_5x8s4__sse)

  BENCHMARK_GEMM(f32_ppmm_4x8_unipass__sse)
  BENCHMARK_GEMM(f32_ppmm_4x8_twopass__sse)
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
1061 
1062 
1063 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  // Benchmark wrappers for WAsm SIMD f32 GEMM loadsplat microkernels.
  // The *_arm_* and *_x86_* suffixes name two min/max-handling variants of
  // the kernels; no isa_check is passed (SIMD availability is implied by
  // the enclosing XNN_ARCH_WASMSIMD/XNN_ARCH_WASMRELAXEDSIMD guard).
  static void f32_gemm_3x8__wasmsimd_arm_loadsplat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_arm_loadsplat, 3, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_gemm_4x8__wasmsimd_arm_loadsplat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_arm_loadsplat, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_gemm_5x8__wasmsimd_arm_loadsplat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_loadsplat, 5, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_gemm_6x8__wasmsimd_arm_loadsplat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_arm_loadsplat, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_gemm_3x8__wasmsimd_x86_loadsplat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat, 3, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_gemm_4x8__wasmsimd_x86_loadsplat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_gemm_5x8__wasmsimd_x86_loadsplat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_x86_loadsplat, 5, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_gemm_6x8__wasmsimd_x86_loadsplat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_loadsplat, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
f32_gemm_3x8__wasmsimd_arm_splat(benchmark::State & state,const char * net)1096   static void f32_gemm_3x8__wasmsimd_arm_splat(benchmark::State& state, const char* net) {
1097     GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_arm_splat, 3, 8, 1, 1,
1098       xnn_init_f32_minmax_scalar_params);
1099   }
f32_gemm_4x8__wasmsimd_arm_splat(benchmark::State & state,const char * net)1100   static void f32_gemm_4x8__wasmsimd_arm_splat(benchmark::State& state, const char* net) {
1101     GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_arm_splat, 4, 8, 1, 1,
1102       xnn_init_f32_minmax_scalar_params);
1103   }
f32_gemm_5x8__wasmsimd_arm_splat(benchmark::State & state,const char * net)1104   static void f32_gemm_5x8__wasmsimd_arm_splat(benchmark::State& state, const char* net) {
1105     GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_splat, 5, 8, 1, 1,
1106       xnn_init_f32_minmax_scalar_params);
1107   }
f32_gemm_6x8__wasmsimd_arm_splat(benchmark::State & state,const char * net)1108   static void f32_gemm_6x8__wasmsimd_arm_splat(benchmark::State& state, const char* net) {
1109     GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_arm_splat, 6, 8, 1, 1,
1110       xnn_init_f32_minmax_scalar_params);
1111   }
f32_gemm_3x8__wasmsimd_x86_splat(benchmark::State & state,const char * net)1112   static void f32_gemm_3x8__wasmsimd_x86_splat(benchmark::State& state, const char* net) {
1113     GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_splat, 3, 8, 1, 1,
1114       xnn_init_f32_minmax_scalar_params);
1115   }
f32_gemm_4x8__wasmsimd_x86_splat(benchmark::State & state,const char * net)1116   static void f32_gemm_4x8__wasmsimd_x86_splat(benchmark::State& state, const char* net) {
1117     GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_splat, 4, 8, 1, 1,
1118       xnn_init_f32_minmax_scalar_params);
1119   }
f32_gemm_5x8__wasmsimd_x86_splat(benchmark::State & state,const char * net)1120   static void f32_gemm_5x8__wasmsimd_x86_splat(benchmark::State& state, const char* net) {
1121     GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_x86_splat, 5, 8, 1, 1,
1122       xnn_init_f32_minmax_scalar_params);
1123   }
f32_gemm_6x8__wasmsimd_x86_splat(benchmark::State & state,const char * net)1124   static void f32_gemm_6x8__wasmsimd_x86_splat(benchmark::State& state, const char* net) {
1125     GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_splat, 6, 8, 1, 1,
1126       xnn_init_f32_minmax_scalar_params);
1127   }
f32_gemm_3x8s4__wasmsimd_arm(benchmark::State & state,const char * net)1128   static void f32_gemm_3x8s4__wasmsimd_arm(benchmark::State& state, const char* net) {
1129     GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_arm, 3, 8, 1, 4,
1130       xnn_init_f32_minmax_scalar_params);
1131   }
f32_gemm_4x8s4__wasmsimd_arm(benchmark::State & state,const char * net)1132   static void f32_gemm_4x8s4__wasmsimd_arm(benchmark::State& state, const char* net) {
1133     GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_arm, 4, 8, 1, 4,
1134       xnn_init_f32_minmax_scalar_params);
1135   }
f32_gemm_5x8s4__wasmsimd_arm(benchmark::State & state,const char * net)1136   static void f32_gemm_5x8s4__wasmsimd_arm(benchmark::State& state, const char* net) {
1137     GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_arm, 5, 8, 1, 4,
1138       xnn_init_f32_minmax_scalar_params);
1139   }
f32_gemm_6x8s4__wasmsimd_arm(benchmark::State & state,const char * net)1140   static void f32_gemm_6x8s4__wasmsimd_arm(benchmark::State& state, const char* net) {
1141     GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm, 6, 8, 1, 4,
1142       xnn_init_f32_minmax_scalar_params);
1143   }
f32_gemm_3x8s4__wasmsimd_x86(benchmark::State & state,const char * net)1144   static void f32_gemm_3x8s4__wasmsimd_x86(benchmark::State& state, const char* net) {
1145     GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86, 3, 8, 1, 4,
1146       xnn_init_f32_minmax_scalar_params);
1147   }
f32_gemm_4x8s4__wasmsimd_x86(benchmark::State & state,const char * net)1148   static void f32_gemm_4x8s4__wasmsimd_x86(benchmark::State& state, const char* net) {
1149     GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_x86, 4, 8, 1, 4,
1150       xnn_init_f32_minmax_scalar_params);
1151   }
f32_gemm_5x8s4__wasmsimd_x86(benchmark::State & state,const char * net)1152   static void f32_gemm_5x8s4__wasmsimd_x86(benchmark::State& state, const char* net) {
1153     GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_x86, 5, 8, 1, 4,
1154       xnn_init_f32_minmax_scalar_params);
1155   }
f32_gemm_6x8s4__wasmsimd_x86(benchmark::State & state,const char * net)1156   static void f32_gemm_6x8s4__wasmsimd_x86(benchmark::State& state, const char* net) {
1157     GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86, 6, 8, 1, 4,
1158       xnn_init_f32_minmax_scalar_params);
1159   }
1160 
f32_ppmm_4x8_unipass__wasmsimd_arm_splat(benchmark::State & state,const char * net)1161   static void f32_ppmm_4x8_unipass__wasmsimd_arm_splat(benchmark::State& state, const char* net) {
1162     PPMM1PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x8__wasmsimd_arm_splat, xnn_x32_packx_ukernel_4x__wasmsimd, 4, 8,
1163       xnn_init_f32_minmax_scalar_params);
1164   }
f32_ppmm_4x8_unipass__wasmsimd_x86_splat(benchmark::State & state,const char * net)1165   static void f32_ppmm_4x8_unipass__wasmsimd_x86_splat(benchmark::State& state, const char* net) {
1166     PPMM1PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x8__wasmsimd_x86_splat, xnn_x32_packx_ukernel_4x__wasmsimd, 4, 8,
1167       xnn_init_f32_minmax_scalar_params);
1168   }
1169 
f32_ppmm_4x8_twopass__wasmsimd_arm_splat(benchmark::State & state,const char * net)1170   static void f32_ppmm_4x8_twopass__wasmsimd_arm_splat(benchmark::State& state, const char* net) {
1171     PPMM2PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x8__wasmsimd_arm_splat, xnn_x32_packx_ukernel_4x__wasmsimd, 4, 8,
1172       xnn_init_f32_minmax_scalar_params);
1173   }
f32_ppmm_4x8_twopass__wasmsimd_x86_splat(benchmark::State & state,const char * net)1174   static void f32_ppmm_4x8_twopass__wasmsimd_x86_splat(benchmark::State& state, const char* net) {
1175     PPMM2PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x8__wasmsimd_x86_splat, xnn_x32_packx_ukernel_4x__wasmsimd, 4, 8,
1176       xnn_init_f32_minmax_scalar_params);
1177   }
1178 
  // Register the WAsm SIMD GEMM benchmark stubs defined above.
  BENCHMARK_GEMM(f32_gemm_3x8__wasmsimd_arm_loadsplat)
  BENCHMARK_GEMM(f32_gemm_4x8__wasmsimd_arm_loadsplat)
  BENCHMARK_GEMM(f32_gemm_5x8__wasmsimd_arm_loadsplat)
  BENCHMARK_GEMM(f32_gemm_6x8__wasmsimd_arm_loadsplat)
  BENCHMARK_GEMM(f32_gemm_3x8__wasmsimd_x86_loadsplat)
  BENCHMARK_GEMM(f32_gemm_4x8__wasmsimd_x86_loadsplat)
  BENCHMARK_GEMM(f32_gemm_5x8__wasmsimd_x86_loadsplat)
  BENCHMARK_GEMM(f32_gemm_6x8__wasmsimd_x86_loadsplat)
  BENCHMARK_GEMM(f32_gemm_3x8__wasmsimd_arm_splat)
  BENCHMARK_GEMM(f32_gemm_4x8__wasmsimd_arm_splat)
  BENCHMARK_GEMM(f32_gemm_5x8__wasmsimd_arm_splat)
  BENCHMARK_GEMM(f32_gemm_6x8__wasmsimd_arm_splat)
  BENCHMARK_GEMM(f32_gemm_3x8__wasmsimd_x86_splat)
  BENCHMARK_GEMM(f32_gemm_4x8__wasmsimd_x86_splat)
  BENCHMARK_GEMM(f32_gemm_5x8__wasmsimd_x86_splat)
  BENCHMARK_GEMM(f32_gemm_6x8__wasmsimd_x86_splat)
  BENCHMARK_GEMM(f32_gemm_3x8s4__wasmsimd_arm)
  BENCHMARK_GEMM(f32_gemm_4x8s4__wasmsimd_arm)
  BENCHMARK_GEMM(f32_gemm_5x8s4__wasmsimd_arm)
  BENCHMARK_GEMM(f32_gemm_6x8s4__wasmsimd_arm)
  BENCHMARK_GEMM(f32_gemm_3x8s4__wasmsimd_x86)
  BENCHMARK_GEMM(f32_gemm_4x8s4__wasmsimd_x86)
  BENCHMARK_GEMM(f32_gemm_5x8s4__wasmsimd_x86)
  BENCHMARK_GEMM(f32_gemm_6x8s4__wasmsimd_x86)

  // Register the WAsm SIMD PPMM benchmark stubs (single-pass packing).
  BENCHMARK_GEMM(f32_ppmm_4x8_unipass__wasmsimd_arm_splat)
  BENCHMARK_GEMM(f32_ppmm_4x8_unipass__wasmsimd_x86_splat)

  // Register the WAsm SIMD PPMM benchmark stubs (two-pass packing).
  BENCHMARK_GEMM(f32_ppmm_4x8_twopass__wasmsimd_arm_splat)
  BENCHMARK_GEMM(f32_ppmm_4x8_twopass__wasmsimd_x86_splat)
1209 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
1210 
1211 
1212 static void f32_gemm_1x4__scalar(benchmark::State& state, const char* net) {
1213   GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x4__scalar, 1, 4, 1, 1,
1214     xnn_init_f32_minmax_scalar_params);
1215 }
f32_gemm_2x4__scalar(benchmark::State & state,const char * net)1216 static void f32_gemm_2x4__scalar(benchmark::State& state, const char* net) {
1217   GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_2x4__scalar, 2, 4, 1, 1,
1218     xnn_init_f32_minmax_scalar_params);
1219 }
f32_gemm_4x4__scalar(benchmark::State & state,const char * net)1220 static void f32_gemm_4x4__scalar(benchmark::State& state, const char* net) {
1221   GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x4__scalar, 4, 4, 1, 1,
1222     xnn_init_f32_minmax_scalar_params);
1223 }
1224 
f32_ppmm_2x4_unipass__scalar(benchmark::State & state,const char * net)1225 static void f32_ppmm_2x4_unipass__scalar(benchmark::State& state, const char* net) {
1226   PPMM1PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_2x4__scalar, xnn_x32_packx_ukernel_2x__scalar, 2, 4,
1227     xnn_init_f32_minmax_scalar_params);
1228 }
f32_ppmm_4x2_unipass__scalar(benchmark::State & state,const char * net)1229 static void f32_ppmm_4x2_unipass__scalar(benchmark::State& state, const char* net) {
1230   PPMM1PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x2__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 2,
1231     xnn_init_f32_minmax_scalar_params);
1232 }
f32_ppmm_4x4_unipass__scalar(benchmark::State & state,const char * net)1233 static void f32_ppmm_4x4_unipass__scalar(benchmark::State& state, const char* net) {
1234   PPMM1PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x4__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 4,
1235     xnn_init_f32_minmax_scalar_params);
1236 }
f32_ppmm_3x3_unipass__scalar(benchmark::State & state,const char * net)1237 static void f32_ppmm_3x3_unipass__scalar(benchmark::State& state, const char* net) {
1238   PPMM1PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_3x3__scalar, xnn_x32_packx_ukernel_3x__scalar, 3, 3,
1239     xnn_init_f32_minmax_scalar_params);
1240 }
1241 
f32_ppmm_2x4_twopass__scalar(benchmark::State & state,const char * net)1242 static void f32_ppmm_2x4_twopass__scalar(benchmark::State& state, const char* net) {
1243   PPMM2PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_2x4__scalar, xnn_x32_packx_ukernel_2x__scalar, 2, 4,
1244     xnn_init_f32_minmax_scalar_params);
1245 }
f32_ppmm_4x2_twopass__scalar(benchmark::State & state,const char * net)1246 static void f32_ppmm_4x2_twopass__scalar(benchmark::State& state, const char* net) {
1247   PPMM2PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x2__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 2,
1248     xnn_init_f32_minmax_scalar_params);
1249 }
f32_ppmm_4x4_twopass__scalar(benchmark::State & state,const char * net)1250 static void f32_ppmm_4x4_twopass__scalar(benchmark::State& state, const char* net) {
1251   PPMM2PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x4__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 4,
1252     xnn_init_f32_minmax_scalar_params);
1253 }
f32_ppmm_3x3_twopass__scalar(benchmark::State & state,const char * net)1254 static void f32_ppmm_3x3_twopass__scalar(benchmark::State& state, const char* net) {
1255   PPMM2PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_3x3__scalar, xnn_x32_packx_ukernel_3x__scalar, 3, 3,
1256     xnn_init_f32_minmax_scalar_params);
1257 }
1258 
// Register the portable scalar GEMM benchmark stubs.
BENCHMARK_GEMM(f32_gemm_1x4__scalar)
BENCHMARK_GEMM(f32_gemm_2x4__scalar)
BENCHMARK_GEMM(f32_gemm_4x4__scalar)

// Register the scalar PPMM benchmark stubs (single-pass packing).
BENCHMARK_GEMM(f32_ppmm_2x4_unipass__scalar)
BENCHMARK_GEMM(f32_ppmm_4x2_unipass__scalar)
BENCHMARK_GEMM(f32_ppmm_4x4_unipass__scalar)
BENCHMARK_GEMM(f32_ppmm_3x3_unipass__scalar)

// Register the scalar PPMM benchmark stubs (two-pass packing).
BENCHMARK_GEMM(f32_ppmm_2x4_twopass__scalar)
BENCHMARK_GEMM(f32_ppmm_4x2_twopass__scalar)
BENCHMARK_GEMM(f32_ppmm_4x4_twopass__scalar)
BENCHMARK_GEMM(f32_ppmm_3x3_twopass__scalar)
1272 
1273 
// Baseline comparison against Ruy, only when built with BENCHMARK_RUY.
// NOTE(review): ruy_st is defined outside this excerpt — presumably a
// single-threaded Ruy GEMM run; confirm against its definition.
#ifdef BENCHMARK_RUY
BENCHMARK_GEMM(ruy_st)
#endif  // BENCHMARK_RUY

// Emit main() unless this translation unit is linked into a combined benchmark
// binary that supplies its own entry point.
#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif
1281