• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2019 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5 
6 #include <algorithm>
7 #include <cmath>
8 #include <functional>
9 #include <random>
10 #include <vector>
11 
12 #include <xnnpack.h>
13 
14 #include <benchmark/benchmark.h>
15 
16 #include "bench/end2end.h"
17 #include "bench/utils.h"
18 #include "models/models.h"
19 #include <xnnpack/gemm.h>
20 #include <xnnpack/igemm.h>
21 #include <xnnpack/params.h>
22 #include <xnnpack/params-init.h>
23 
24 
GEMMEnd2EndBenchmark(benchmark::State & state,models::ExecutionPlanFactory model_factory,xnn_f32_gemm_minmax_ukernel_function gemm,xnn_f32_igemm_minmax_ukernel_function igemm,xnn_f32_gemm_minmax_ukernel_function gemm1,xnn_f32_igemm_minmax_ukernel_function igemm1,xnn_init_f32_minmax_params_fn init_params,uint8_t mr,uint8_t nr,uint8_t log2_kr=0,uint8_t log2_sr=0,benchmark::utils::IsaCheckFunction isa_check=nullptr)25 static void GEMMEnd2EndBenchmark(
26   benchmark::State& state,
27   models::ExecutionPlanFactory model_factory,
28   xnn_f32_gemm_minmax_ukernel_function gemm,
29   xnn_f32_igemm_minmax_ukernel_function igemm,
30   xnn_f32_gemm_minmax_ukernel_function gemm1,
31   xnn_f32_igemm_minmax_ukernel_function igemm1,
32   xnn_init_f32_minmax_params_fn init_params,
33   uint8_t mr, uint8_t nr, uint8_t log2_kr = 0, uint8_t log2_sr = 0,
34   benchmark::utils::IsaCheckFunction isa_check = nullptr)
35 {
36   if (isa_check && !isa_check(state)) {
37     return;
38   }
39   if (xnn_initialize(nullptr /* allocator */) != xnn_status_success) {
40     state.SkipWithError("failed to initialize XNNPACK");
41     return;
42   }
43 
44   // Override microkernels chosen in xnn_initialize
45   // Note: do not directly assign to xnn_params.f32.gemm because it breaks older gcc.
46   xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_function(gemm));
47   xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_function(igemm));
48   xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_function(gemm1));
49   xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_function(igemm1));
50   xnn_params.f32.gemm.init.f32 = init_params;
51   xnn_params.f32.gemm.mr = mr;
52   xnn_params.f32.gemm.nr = nr;
53   xnn_params.f32.gemm.log2_kr = log2_kr;
54   xnn_params.f32.gemm.log2_sr = log2_sr;
55 
56   auto execution_plan = model_factory(nullptr);
57   if (execution_plan.empty()) {
58     state.SkipWithError("failed to create a model");
59     return;
60   }
61 
62   for (auto _ : state) {
63     for (const std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)>& op : execution_plan) {
64       xnn_status status = xnn_run_operator(op.get(), nullptr);
65       if (status != xnn_status_success) {
66         state.SkipWithError("failed to run a model");
67         return;
68       }
69     }
70   }
71 
72   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
73   if (cpu_frequency != 0) {
74     state.counters["cpufreq"] = cpu_frequency;
75   }
76 }
77 
78 #if XNN_PLATFORM_JIT
GEMMEnd2EndBenchmark(benchmark::State & state,models::ExecutionPlanFactory model_factory,xnn_jit_gemm_code_generator_function gemm_generator,xnn_f32_gemm_minmax_ukernel_function gemm,xnn_f32_igemm_minmax_ukernel_function igemm,xnn_f32_gemm_minmax_ukernel_function gemm1,xnn_f32_igemm_minmax_ukernel_function igemm1,xnn_init_f32_minmax_params_fn init_params,uint8_t mr,uint8_t nr,uint8_t log2_kr=0,uint8_t log2_sr=0,benchmark::utils::IsaCheckFunction isa_check=nullptr)79 static void GEMMEnd2EndBenchmark(
80   benchmark::State& state,
81   models::ExecutionPlanFactory model_factory,
82   xnn_jit_gemm_code_generator_function gemm_generator,
83   xnn_f32_gemm_minmax_ukernel_function gemm,
84   xnn_f32_igemm_minmax_ukernel_function igemm,
85   xnn_f32_gemm_minmax_ukernel_function gemm1,
86   xnn_f32_igemm_minmax_ukernel_function igemm1,
87   xnn_init_f32_minmax_params_fn init_params,
88   uint8_t mr, uint8_t nr, uint8_t log2_kr = 0, uint8_t log2_sr = 0,
89   benchmark::utils::IsaCheckFunction isa_check = nullptr)
90 {
91   if (isa_check && !isa_check(state)) {
92     return;
93   }
94   if (xnn_initialize(nullptr /* allocator */) != xnn_status_success) {
95     state.SkipWithError("failed to initialize XNNPACK");
96     return;
97   }
98 
99   // Override microkernels chosen in xnn_initialize
100   // Note: do not directly assign to xnn_params.f32.gemm because it breaks older gcc.
101   xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_function(gemm));
102   xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_function(igemm));
103   xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_function(gemm1));
104   xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_function(igemm1));
105   xnn_params.f32.gemm.init.f32 = init_params;
106   xnn_params.f32.gemm.mr = mr;
107   xnn_params.f32.gemm.nr = nr;
108   xnn_params.f32.gemm.log2_kr = log2_kr;
109   xnn_params.f32.gemm.log2_sr = log2_sr;
110 
111   xnn_params.f32.gemm.generator.gemm = xnn_init_hmp_gemm_codegen(gemm_generator);
112 
113   auto execution_plan = model_factory(nullptr);
114   if (execution_plan.empty()) {
115     state.SkipWithError("failed to create a model");
116     return;
117   }
118 
119   for (auto _ : state) {
120     for (const std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)>& op : execution_plan) {
121       xnn_status status = xnn_run_operator(op.get(), nullptr);
122       if (status != xnn_status_success) {
123         state.SkipWithError("failed to run a model");
124         return;
125       }
126     }
127   }
128 
129   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
130   if (cpu_frequency != 0) {
131     state.counters["cpufreq"] = cpu_frequency;
132   }
133 }
134 #endif  // XNN_PLATFORM_JIT
135 
136 #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
f32_gemm_4x12__aarch64_neonfma_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)137   static void f32_gemm_4x12__aarch64_neonfma_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
138     GEMMEnd2EndBenchmark(state, model,
139       xnn_f32_gemm_minmax_ukernel_4x12__aarch64_neonfma_cortex_a53,
140       xnn_f32_igemm_minmax_ukernel_4x12__aarch64_neonfma_cortex_a53,
141       xnn_f32_gemm_minmax_ukernel_1x12__aarch64_neonfma_cortex_a53,
142       xnn_f32_igemm_minmax_ukernel_1x12__aarch64_neonfma_cortex_a53,
143       xnn_init_f32_minmax_scalar_params,
144       4 /* mr */, 12 /* nr */);
145   }
f32_gemm_4x8__aarch64_neonfma_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)146   static void f32_gemm_4x8__aarch64_neonfma_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
147     GEMMEnd2EndBenchmark(state, model,
148       xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53,
149       xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53,
150       xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
151       xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
152       xnn_init_f32_minmax_scalar_params,
153       4 /* mr */, 8 /* nr */);
154   }
f32_gemm_4x8__aarch64_neonfma_cortex_a55(benchmark::State & state,models::ExecutionPlanFactory model)155   static void f32_gemm_4x8__aarch64_neonfma_cortex_a55(benchmark::State& state, models::ExecutionPlanFactory model) {
156     GEMMEnd2EndBenchmark(state, model,
157       xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55,
158       xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55,
159       xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
160       xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
161       xnn_init_f32_minmax_scalar_params,
162       4 /* mr */, 8 /* nr */);
163   }
f32_gemm_4x8__aarch64_neonfma_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)164   static void f32_gemm_4x8__aarch64_neonfma_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
165     GEMMEnd2EndBenchmark(state, model,
166       xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75,
167       xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75,
168       xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75,
169       xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75,
170       xnn_init_f32_minmax_scalar_params,
171       4 /* mr */, 8 /* nr */);
172   }
f32_gemm_4x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)173   static void f32_gemm_4x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
174     GEMMEnd2EndBenchmark(state, model,
175       xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75,
176       xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75,
177       xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
178       xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
179       xnn_init_f32_minmax_scalar_params,
180       4 /* mr */, 8 /* nr */);
181   }
f32_gemm_4x8__aarch64_neonfma_ld64(benchmark::State & state,models::ExecutionPlanFactory model)182   static void f32_gemm_4x8__aarch64_neonfma_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
183     GEMMEnd2EndBenchmark(state, model,
184       xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_ld64,
185       xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_ld64,
186       xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_ld64,
187       xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64,
188       xnn_init_f32_minmax_scalar_params,
189       4 /* mr */, 8 /* nr */);
190   }
f32_gemm_4x8__aarch64_neonfma_ld128(benchmark::State & state,models::ExecutionPlanFactory model)191   static void f32_gemm_4x8__aarch64_neonfma_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
192     GEMMEnd2EndBenchmark(state, model,
193       xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_ld128,
194       xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_ld128,
195       xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_ld64,
196       xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64,
197       xnn_init_f32_minmax_scalar_params,
198       4 /* mr */, 8 /* nr */);
199   }
f32_gemm_5x8__aarch64_neonfma_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)200   static void f32_gemm_5x8__aarch64_neonfma_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
201     GEMMEnd2EndBenchmark(state, model,
202       xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_cortex_a75,
203       xnn_f32_igemm_minmax_ukernel_5x8__aarch64_neonfma_cortex_a75,
204       xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75,
205       xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75,
206       xnn_init_f32_minmax_scalar_params,
207       5 /* mr */, 8 /* nr */);
208   }
f32_gemm_5x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)209   static void f32_gemm_5x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
210     GEMMEnd2EndBenchmark(state, model,
211       xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75,
212       xnn_f32_igemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75,
213       xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
214       xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
215       xnn_init_f32_minmax_scalar_params,
216       5 /* mr */, 8 /* nr */);
217   }
f32_gemm_6x8__aarch64_neonfma_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)218   static void f32_gemm_6x8__aarch64_neonfma_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
219     GEMMEnd2EndBenchmark(state, model,
220       xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53,
221       xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53,
222       xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
223       xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
224       xnn_init_f32_minmax_scalar_params,
225       6 /* mr */, 8 /* nr */);
226   }
f32_gemm_6x8__aarch64_neonfma_cortex_a55(benchmark::State & state,models::ExecutionPlanFactory model)227   static void f32_gemm_6x8__aarch64_neonfma_cortex_a55(benchmark::State& state, models::ExecutionPlanFactory model) {
228     GEMMEnd2EndBenchmark(state, model,
229       xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55,
230       xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55,
231       xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
232       xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
233       xnn_init_f32_minmax_scalar_params,
234       6 /* mr */, 8 /* nr */);
235   }
f32_gemm_6x8__aarch64_neonfma_cortex_a73(benchmark::State & state,models::ExecutionPlanFactory model)236   static void f32_gemm_6x8__aarch64_neonfma_cortex_a73(benchmark::State& state, models::ExecutionPlanFactory model) {
237     GEMMEnd2EndBenchmark(state, model,
238       xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a73,
239       xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a73,
240       xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
241       xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
242       xnn_init_f32_minmax_scalar_params,
243       6 /* mr */, 8 /* nr */);
244   }
f32_gemm_6x8__aarch64_neonfma_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)245   static void f32_gemm_6x8__aarch64_neonfma_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
246     GEMMEnd2EndBenchmark(state, model,
247       xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75,
248       xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75,
249       xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75,
250       xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75,
251       xnn_init_f32_minmax_scalar_params,
252       6 /* mr */, 8 /* nr */);
253   }
f32_gemm_6x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)254   static void f32_gemm_6x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
255     GEMMEnd2EndBenchmark(state, model,
256       xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75,
257       xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75,
258       xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
259       xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
260       xnn_init_f32_minmax_scalar_params,
261       6 /* mr */, 8 /* nr */);
262   }
f32_gemm_6x8__aarch64_neonfma_ld64(benchmark::State & state,models::ExecutionPlanFactory model)263   static void f32_gemm_6x8__aarch64_neonfma_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
264     GEMMEnd2EndBenchmark(state, model,
265       xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld64,
266       xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_ld64,
267       xnn_f32_gemm_minmax_ukernel_1x8__neonfma_lane_ld64,
268       xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64,
269       xnn_init_f32_minmax_scalar_params,
270       6 /* mr */, 8 /* nr */);
271   }
f32_gemm_6x8__aarch64_neonfma_ld128(benchmark::State & state,models::ExecutionPlanFactory model)272   static void f32_gemm_6x8__aarch64_neonfma_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
273     GEMMEnd2EndBenchmark(state, model,
274       xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld128,
275       xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_ld128,
276       xnn_f32_gemm_minmax_ukernel_1x8__neonfma_lane_ld64,
277       xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64,
278       xnn_init_f32_minmax_scalar_params,
279       6 /* mr */, 8 /* nr */);
280   }
f32_gemm_4x8__neonfma_lane_ld64(benchmark::State & state,models::ExecutionPlanFactory model)281   static void f32_gemm_4x8__neonfma_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
282     GEMMEnd2EndBenchmark(state, model,
283       xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld64,
284       xnn_f32_igemm_minmax_ukernel_4x8__neonfma_lane_ld64,
285       xnn_f32_gemm_minmax_ukernel_1x8__neonfma_lane_ld64,
286       xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64,
287       xnn_init_f32_minmax_scalar_params,
288       4 /* mr */, 8 /* nr */);
289   }
f32_gemm_4x8__neonfma_lane_ld128(benchmark::State & state,models::ExecutionPlanFactory model)290   static void f32_gemm_4x8__neonfma_lane_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
291     GEMMEnd2EndBenchmark(state, model,
292       xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128,
293       xnn_f32_igemm_minmax_ukernel_4x8__neonfma_lane_ld128,
294       xnn_f32_gemm_minmax_ukernel_1x8__neonfma_lane_ld64,
295       xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64,
296       xnn_init_f32_minmax_scalar_params,
297       4 /* mr */, 8 /* nr */);
298   }
f32_gemm_6x8__neonfma_lane_ld64(benchmark::State & state,models::ExecutionPlanFactory model)299   static void f32_gemm_6x8__neonfma_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
300     GEMMEnd2EndBenchmark(state, model,
301       xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld64,
302       xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld64,
303       xnn_f32_gemm_minmax_ukernel_1x8__neonfma_lane_ld64,
304       xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64,
305       xnn_init_f32_minmax_scalar_params,
306       6 /* mr */, 8 /* nr */);
307   }
f32_gemm_6x8__neonfma_lane_ld128(benchmark::State & state,models::ExecutionPlanFactory model)308   static void f32_gemm_6x8__neonfma_lane_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
309     GEMMEnd2EndBenchmark(state, model,
310       xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld128,
311       xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld128,
312       xnn_f32_gemm_minmax_ukernel_1x8__neonfma_lane_ld64,
313       xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64,
314       xnn_init_f32_minmax_scalar_params,
315       6 /* mr */, 8 /* nr */);
316   }
317 
318   BENCHMARK_FP32_END2END(f32_gemm_4x8__aarch64_neonfma_ld64)
319   BENCHMARK_FP32_END2END(f32_gemm_4x8__aarch64_neonfma_ld128);
320   BENCHMARK_FP32_END2END(f32_gemm_6x8__aarch64_neonfma_ld64);
321   BENCHMARK_FP32_END2END(f32_gemm_6x8__aarch64_neonfma_ld128);
322   BENCHMARK_FP32_END2END(f32_gemm_4x8__aarch64_neonfma_cortex_a53)
323   BENCHMARK_FP32_END2END(f32_gemm_4x8__aarch64_neonfma_cortex_a55)
324   BENCHMARK_FP32_END2END(f32_gemm_4x8__aarch64_neonfma_cortex_a75)
325   BENCHMARK_FP32_END2END(f32_gemm_4x8__aarch64_neonfma_prfm_cortex_a75)
326   BENCHMARK_FP32_END2END(f32_gemm_5x8__aarch64_neonfma_cortex_a75);
327   BENCHMARK_FP32_END2END(f32_gemm_5x8__aarch64_neonfma_prfm_cortex_a75);
328   BENCHMARK_FP32_END2END(f32_gemm_6x8__aarch64_neonfma_cortex_a53);
329   BENCHMARK_FP32_END2END(f32_gemm_6x8__aarch64_neonfma_cortex_a55);
330   BENCHMARK_FP32_END2END(f32_gemm_6x8__aarch64_neonfma_cortex_a73);
331   BENCHMARK_FP32_END2END(f32_gemm_6x8__aarch64_neonfma_cortex_a75);
332   BENCHMARK_FP32_END2END(f32_gemm_6x8__aarch64_neonfma_prfm_cortex_a75);
333   BENCHMARK_FP32_END2END(f32_gemm_4x12__aarch64_neonfma_cortex_a53);
334 
335   BENCHMARK_FP32_END2END(f32_gemm_4x8__neonfma_lane_ld64);
336   BENCHMARK_FP32_END2END(f32_gemm_4x8__neonfma_lane_ld128);
337 
338   BENCHMARK_FP32_END2END(f32_gemm_6x8__neonfma_lane_ld64);
339   BENCHMARK_FP32_END2END(f32_gemm_6x8__neonfma_lane_ld128);
340 #endif  // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
341 
342 #if XNN_ARCH_ARM64 && XNN_PLATFORM_JIT
jit_f32_gemm_6x8__aarch64_neonfma_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)343   static void jit_f32_gemm_6x8__aarch64_neonfma_cortex_a75(
344       benchmark::State &state, models::ExecutionPlanFactory model) {
345     GEMMEnd2EndBenchmark(
346         state, model,
347         xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75,
348         xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75,
349         xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75,
350         xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75,
351         xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75,
352         xnn_init_f32_minmax_scalar_params, 6 /* mr */, 8 /* nr */);
353   }
jit_f32_gemm_6x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)354   static void jit_f32_gemm_6x8__aarch64_neonfma_prfm_cortex_a75(
355       benchmark::State &state, models::ExecutionPlanFactory model) {
356     GEMMEnd2EndBenchmark(
357         state, model,
358         xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75,
359         xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75,
360         xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75,
361         xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
362         xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
363         xnn_init_f32_minmax_scalar_params, 6 /* mr */, 8 /* nr */);
364   }
365 
366   BENCHMARK_FP32_END2END(jit_f32_gemm_6x8__aarch64_neonfma_cortex_a75);
367   BENCHMARK_FP32_END2END(jit_f32_gemm_6x8__aarch64_neonfma_prfm_cortex_a75);
368 #endif  // XNN_ARCH_ARM64 && XNN_PLATFORM_JIT
369 
370 #if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
f32_gemm_4x8__aarch32_neon_ld64(benchmark::State & state,models::ExecutionPlanFactory model)371   static void f32_gemm_4x8__aarch32_neon_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
372     GEMMEnd2EndBenchmark(state, model,
373       xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_ld64,
374       xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_ld64,
375       xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64,
376       xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64,
377       xnn_init_f32_minmax_scalar_params,
378       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
379       benchmark::utils::CheckNEON);
380   }
f32_gemm_4x8__aarch32_neon_cortex_a7(benchmark::State & state,models::ExecutionPlanFactory model)381   static void f32_gemm_4x8__aarch32_neon_cortex_a7(benchmark::State& state, models::ExecutionPlanFactory model) {
382     GEMMEnd2EndBenchmark(state, model,
383       xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a7,
384       xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a7,
385       xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64,
386       xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64,
387       xnn_init_f32_minmax_scalar_params,
388       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
389       benchmark::utils::CheckNEON);
390   }
f32_gemm_4x8__aarch32_neon_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)391   static void f32_gemm_4x8__aarch32_neon_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
392     GEMMEnd2EndBenchmark(state, model,
393       xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53,
394       xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53,
395       xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64,
396       xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64,
397       xnn_init_f32_minmax_scalar_params,
398       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
399       benchmark::utils::CheckNEON);
400   }
f32_gemm_4x8__aarch32_neon_cortex_a55(benchmark::State & state,models::ExecutionPlanFactory model)401   static void f32_gemm_4x8__aarch32_neon_cortex_a55(benchmark::State& state, models::ExecutionPlanFactory model) {
402     GEMMEnd2EndBenchmark(state, model,
403       xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55,
404       xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55,
405       xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64,
406       xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64,
407       xnn_init_f32_minmax_scalar_params,
408       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
409       benchmark::utils::CheckNEON);
410   }
f32_gemm_4x8__aarch32_neon_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)411   static void f32_gemm_4x8__aarch32_neon_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
412     GEMMEnd2EndBenchmark(state, model,
413       xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75,
414       xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75,
415       xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64,
416       xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64,
417       xnn_init_f32_minmax_scalar_params,
418       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
419       benchmark::utils::CheckNEON);
420   }
f32_gemm_4x8__aarch32_neon_prfm_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)421   static void f32_gemm_4x8__aarch32_neon_prfm_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
422     GEMMEnd2EndBenchmark(state, model,
423       xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a75,
424       xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a75,
425       xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64,
426       xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64,
427       xnn_init_f32_minmax_scalar_params,
428       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
429       benchmark::utils::CheckNEON);
430   }
431 
432   BENCHMARK_FP32_END2END(f32_gemm_4x8__aarch32_neon_ld64);
433   BENCHMARK_FP32_END2END(f32_gemm_4x8__aarch32_neon_cortex_a7);
434   BENCHMARK_FP32_END2END(f32_gemm_4x8__aarch32_neon_cortex_a53);
435   BENCHMARK_FP32_END2END(f32_gemm_4x8__aarch32_neon_cortex_a55);
436   BENCHMARK_FP32_END2END(f32_gemm_4x8__aarch32_neon_cortex_a75);
437   BENCHMARK_FP32_END2END(f32_gemm_4x8__aarch32_neon_prfm_cortex_a75);
438 #endif  // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
439 
440 
441 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
f32_gemm_4x8__neon_lane_ld64(benchmark::State & state,models::ExecutionPlanFactory model)442   static void f32_gemm_4x8__neon_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
443     GEMMEnd2EndBenchmark(state, model,
444       xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld64,
445       xnn_f32_igemm_minmax_ukernel_4x8__neon_lane_ld64,
446       xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64,
447       xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64,
448       xnn_init_f32_minmax_scalar_params,
449       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
450       benchmark::utils::CheckNEON);
451   }
452 
f32_gemm_4x8__neon_lane_ld128(benchmark::State & state,models::ExecutionPlanFactory model)453   static void f32_gemm_4x8__neon_lane_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
454     GEMMEnd2EndBenchmark(state, model,
455       xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128,
456       xnn_f32_igemm_minmax_ukernel_4x8__neon_lane_ld128,
457       xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64,
458       xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64,
459       xnn_init_f32_minmax_scalar_params,
460       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
461       benchmark::utils::CheckNEON);
462   }
463 
f32_gemm_6x8__neon_lane_ld64(benchmark::State & state,models::ExecutionPlanFactory model)464   static void f32_gemm_6x8__neon_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
465     GEMMEnd2EndBenchmark(state, model,
466       xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld64,
467       xnn_f32_igemm_minmax_ukernel_6x8__neon_lane_ld64,
468       xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64,
469       xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64,
470       xnn_init_f32_minmax_scalar_params,
471       6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
472       benchmark::utils::CheckNEON);
473   }
474 
f32_gemm_6x8__neon_lane_ld128(benchmark::State & state,models::ExecutionPlanFactory model)475   static void f32_gemm_6x8__neon_lane_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
476     GEMMEnd2EndBenchmark(state, model,
477       xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128,
478       xnn_f32_igemm_minmax_ukernel_6x8__neon_lane_ld128,
479       xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64,
480       xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64,
481       xnn_init_f32_minmax_scalar_params,
482       6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
483       benchmark::utils::CheckNEON);
484   }
485 
f32_gemm_4x8__neon_dup_ld64(benchmark::State & state,models::ExecutionPlanFactory model)486   static void f32_gemm_4x8__neon_dup_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
487     GEMMEnd2EndBenchmark(state, model,
488       xnn_f32_gemm_minmax_ukernel_4x8__neon_dup_ld64,
489       xnn_f32_igemm_minmax_ukernel_4x8__neon_dup_ld64,
490       xnn_f32_gemm_minmax_ukernel_1x8__neon_dup_ld64,
491       xnn_f32_igemm_minmax_ukernel_1x8__neon_dup_ld64,
492       xnn_init_f32_minmax_scalar_params,
493       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
494       benchmark::utils::CheckNEON);
495   }
496 
f32_gemm_4x8__neon_dup_ld128(benchmark::State & state,models::ExecutionPlanFactory model)497   static void f32_gemm_4x8__neon_dup_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
498     GEMMEnd2EndBenchmark(state, model,
499       xnn_f32_gemm_minmax_ukernel_4x8__neon_dup_ld128,
500       xnn_f32_igemm_minmax_ukernel_4x8__neon_dup_ld128,
501       xnn_f32_gemm_minmax_ukernel_1x8__neon_dup_ld64,
502       xnn_f32_igemm_minmax_ukernel_1x8__neon_dup_ld64,
503       xnn_init_f32_minmax_scalar_params,
504       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
505       benchmark::utils::CheckNEON);
506   }
507 
f32_gemm_6x8__neon_dup_ld64(benchmark::State & state,models::ExecutionPlanFactory model)508   static void f32_gemm_6x8__neon_dup_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
509     GEMMEnd2EndBenchmark(state, model,
510       xnn_f32_gemm_minmax_ukernel_6x8__neon_dup_ld64,
511       xnn_f32_igemm_minmax_ukernel_6x8__neon_dup_ld64,
512       xnn_f32_gemm_minmax_ukernel_1x8__neon_dup_ld64,
513       xnn_f32_igemm_minmax_ukernel_1x8__neon_dup_ld64,
514       xnn_init_f32_minmax_scalar_params,
515       6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
516       benchmark::utils::CheckNEON);
517   }
518 
f32_gemm_6x8__neon_dup_ld128(benchmark::State & state,models::ExecutionPlanFactory model)519   static void f32_gemm_6x8__neon_dup_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
520     GEMMEnd2EndBenchmark(state, model,
521       xnn_f32_gemm_minmax_ukernel_6x8__neon_dup_ld128,
522       xnn_f32_igemm_minmax_ukernel_6x8__neon_dup_ld128,
523       xnn_f32_gemm_minmax_ukernel_1x8__neon_dup_ld64,
524       xnn_f32_igemm_minmax_ukernel_1x8__neon_dup_ld64,
525       xnn_init_f32_minmax_scalar_params,
526       6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
527       benchmark::utils::CheckNEON);
528   }
529 
f32_gemm_4x8__neonfma_dup_ld64(benchmark::State & state,models::ExecutionPlanFactory model)530   static void f32_gemm_4x8__neonfma_dup_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
531     GEMMEnd2EndBenchmark(state, model,
532       xnn_f32_gemm_minmax_ukernel_4x8__neonfma_dup_ld64,
533       xnn_f32_igemm_minmax_ukernel_4x8__neonfma_dup_ld64,
534       xnn_f32_gemm_minmax_ukernel_1x8__neonfma_dup_ld64,
535       xnn_f32_igemm_minmax_ukernel_1x8__neonfma_dup_ld64,
536       xnn_init_f32_minmax_scalar_params,
537       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
538       benchmark::utils::CheckNEONFMA);
539   }
540 
f32_gemm_4x8__neonfma_dup_ld128(benchmark::State & state,models::ExecutionPlanFactory model)541   static void f32_gemm_4x8__neonfma_dup_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
542     GEMMEnd2EndBenchmark(state, model,
543       xnn_f32_gemm_minmax_ukernel_4x8__neonfma_dup_ld128,
544       xnn_f32_igemm_minmax_ukernel_4x8__neonfma_dup_ld128,
545       xnn_f32_gemm_minmax_ukernel_1x8__neonfma_dup_ld64,
546       xnn_f32_igemm_minmax_ukernel_1x8__neonfma_dup_ld64,
547       xnn_init_f32_minmax_scalar_params,
548       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
549       benchmark::utils::CheckNEONFMA);
550   }
551 
f32_gemm_6x8__neonfma_dup_ld64(benchmark::State & state,models::ExecutionPlanFactory model)552   static void f32_gemm_6x8__neonfma_dup_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
553     GEMMEnd2EndBenchmark(state, model,
554       xnn_f32_gemm_minmax_ukernel_6x8__neonfma_dup_ld64,
555       xnn_f32_igemm_minmax_ukernel_6x8__neonfma_dup_ld64,
556       xnn_f32_gemm_minmax_ukernel_1x8__neonfma_dup_ld64,
557       xnn_f32_igemm_minmax_ukernel_1x8__neonfma_dup_ld64,
558       xnn_init_f32_minmax_scalar_params,
559       6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
560       benchmark::utils::CheckNEONFMA);
561   }
562 
f32_gemm_6x8__neonfma_dup_ld128(benchmark::State & state,models::ExecutionPlanFactory model)563   static void f32_gemm_6x8__neonfma_dup_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
564     GEMMEnd2EndBenchmark(state, model,
565       xnn_f32_gemm_minmax_ukernel_6x8__neonfma_dup_ld128,
566       xnn_f32_igemm_minmax_ukernel_6x8__neonfma_dup_ld128,
567       xnn_f32_gemm_minmax_ukernel_1x8__neonfma_dup_ld64,
568       xnn_f32_igemm_minmax_ukernel_1x8__neonfma_dup_ld64,
569       xnn_init_f32_minmax_scalar_params,
570       6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
571       benchmark::utils::CheckNEONFMA);
572   }
573 
f32_gemm_4x8s4__neon(benchmark::State & state,models::ExecutionPlanFactory model)574   static void f32_gemm_4x8s4__neon(benchmark::State& state, models::ExecutionPlanFactory model) {
575     GEMMEnd2EndBenchmark(state, model,
576       xnn_f32_gemm_minmax_ukernel_4x8s4__neon,
577       xnn_f32_igemm_minmax_ukernel_4x8s4__neon,
578       xnn_f32_gemm_minmax_ukernel_1x8s4__neon,
579       xnn_f32_igemm_minmax_ukernel_1x8s4__neon,
580       xnn_init_f32_minmax_scalar_params,
581       4 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */,
582       benchmark::utils::CheckNEON);
583   }
584 
f32_gemm_4x8s4__neonfma(benchmark::State & state,models::ExecutionPlanFactory model)585   static void f32_gemm_4x8s4__neonfma(benchmark::State& state, models::ExecutionPlanFactory model) {
586     GEMMEnd2EndBenchmark(state, model,
587       xnn_f32_gemm_minmax_ukernel_4x8s4__neonfma,
588       xnn_f32_igemm_minmax_ukernel_4x8s4__neonfma,
589       xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma,
590       xnn_f32_igemm_minmax_ukernel_1x8s4__neonfma,
591       xnn_init_f32_minmax_scalar_params,
592       4 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */,
593       benchmark::utils::CheckNEONFMA);
594   }
595 
f32_gemm_6x8s4__neon(benchmark::State & state,models::ExecutionPlanFactory model)596   static void f32_gemm_6x8s4__neon(benchmark::State& state, models::ExecutionPlanFactory model) {
597     GEMMEnd2EndBenchmark(state, model,
598       xnn_f32_gemm_minmax_ukernel_6x8s4__neon,
599       xnn_f32_igemm_minmax_ukernel_6x8s4__neon,
600       xnn_f32_gemm_minmax_ukernel_1x8s4__neon,
601       xnn_f32_igemm_minmax_ukernel_1x8s4__neon,
602       xnn_init_f32_minmax_scalar_params,
603       6 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */,
604       benchmark::utils::CheckNEON);
605   }
606 
f32_gemm_6x8s4__neonfma(benchmark::State & state,models::ExecutionPlanFactory model)607   static void f32_gemm_6x8s4__neonfma(benchmark::State& state, models::ExecutionPlanFactory model) {
608     GEMMEnd2EndBenchmark(state, model,
609       xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma,
610       xnn_f32_igemm_minmax_ukernel_6x8s4__neonfma,
611       xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma,
612       xnn_f32_igemm_minmax_ukernel_1x8s4__neonfma,
613       xnn_init_f32_minmax_scalar_params,
614       6 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */,
615       benchmark::utils::CheckNEONFMA);
616   }
617 
f32_gemm_8x8s4__neon(benchmark::State & state,models::ExecutionPlanFactory model)618   static void f32_gemm_8x8s4__neon(benchmark::State& state, models::ExecutionPlanFactory model) {
619     GEMMEnd2EndBenchmark(state, model,
620       xnn_f32_gemm_minmax_ukernel_8x8s4__neon,
621       xnn_f32_igemm_minmax_ukernel_8x8s4__neon,
622       xnn_f32_gemm_minmax_ukernel_1x8s4__neon,
623       xnn_f32_igemm_minmax_ukernel_1x8s4__neon,
624       xnn_init_f32_minmax_scalar_params,
625       8 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */,
626       benchmark::utils::CheckNEON);
627   }
628 
f32_gemm_8x8s4__neonfma(benchmark::State & state,models::ExecutionPlanFactory model)629   static void f32_gemm_8x8s4__neonfma(benchmark::State& state, models::ExecutionPlanFactory model) {
630     GEMMEnd2EndBenchmark(state, model,
631       xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma,
632       xnn_f32_igemm_minmax_ukernel_8x8s4__neonfma,
633       xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma,
634       xnn_f32_igemm_minmax_ukernel_1x8s4__neonfma,
635       xnn_init_f32_minmax_scalar_params,
636       8 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */,
637       benchmark::utils::CheckNEONFMA);
638   }
639 
// Registrations for the ARM/ARM64 f32 GEMM wrappers of this section, grouped
// by kernel family: neon_lane, neon_dup, neonfma_dup, s4 neon, s4 neonfma.
// NOTE(review): BENCHMARK_FP32_END2END is defined outside this chunk
// (presumably in bench/end2end.h) - confirm which models it expands over.
640   BENCHMARK_FP32_END2END(f32_gemm_4x8__neon_lane_ld64);
641   BENCHMARK_FP32_END2END(f32_gemm_4x8__neon_lane_ld128);
642   BENCHMARK_FP32_END2END(f32_gemm_6x8__neon_lane_ld64);
643   BENCHMARK_FP32_END2END(f32_gemm_6x8__neon_lane_ld128);
644 
645   BENCHMARK_FP32_END2END(f32_gemm_4x8__neon_dup_ld64);
646   BENCHMARK_FP32_END2END(f32_gemm_4x8__neon_dup_ld128);
647   BENCHMARK_FP32_END2END(f32_gemm_6x8__neon_dup_ld64);
648   BENCHMARK_FP32_END2END(f32_gemm_6x8__neon_dup_ld128);
649 
650   BENCHMARK_FP32_END2END(f32_gemm_4x8__neonfma_dup_ld64);
651   BENCHMARK_FP32_END2END(f32_gemm_4x8__neonfma_dup_ld128);
652   BENCHMARK_FP32_END2END(f32_gemm_6x8__neonfma_dup_ld64);
653   BENCHMARK_FP32_END2END(f32_gemm_6x8__neonfma_dup_ld128);
654 
655   BENCHMARK_FP32_END2END(f32_gemm_4x8s4__neon);
656   BENCHMARK_FP32_END2END(f32_gemm_6x8s4__neon);
657   BENCHMARK_FP32_END2END(f32_gemm_8x8s4__neon);
658 
659   BENCHMARK_FP32_END2END(f32_gemm_4x8s4__neonfma);
660   BENCHMARK_FP32_END2END(f32_gemm_6x8s4__neonfma);
661   BENCHMARK_FP32_END2END(f32_gemm_8x8s4__neonfma);
662 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
663 
664 
665 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
f32_gemm_4x16__avx512f_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)666   static void f32_gemm_4x16__avx512f_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
667     GEMMEnd2EndBenchmark(state, model,
668       xnn_f32_gemm_minmax_ukernel_4x16__avx512f_broadcast,
669       xnn_f32_igemm_minmax_ukernel_4x16__avx512f_broadcast,
670       xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast,
671       xnn_f32_igemm_minmax_ukernel_1x16__avx512f_broadcast,
672       xnn_init_f32_minmax_scalar_params,
673       4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
674       benchmark::utils::CheckAVX512F);
675   }
f32_gemm_5x16__avx512f_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)676   static void f32_gemm_5x16__avx512f_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
677     GEMMEnd2EndBenchmark(state, model,
678       xnn_f32_gemm_minmax_ukernel_5x16__avx512f_broadcast,
679       xnn_f32_igemm_minmax_ukernel_5x16__avx512f_broadcast,
680       xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast,
681       xnn_f32_igemm_minmax_ukernel_1x16__avx512f_broadcast,
682       xnn_init_f32_minmax_scalar_params,
683       5 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
684       benchmark::utils::CheckAVX512F);
685   }
f32_gemm_6x16__avx512f_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)686   static void f32_gemm_6x16__avx512f_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
687     GEMMEnd2EndBenchmark(state, model,
688       xnn_f32_gemm_minmax_ukernel_6x16__avx512f_broadcast,
689       xnn_f32_igemm_minmax_ukernel_6x16__avx512f_broadcast,
690       xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast,
691       xnn_f32_igemm_minmax_ukernel_1x16__avx512f_broadcast,
692       xnn_init_f32_minmax_scalar_params,
693       6 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
694       benchmark::utils::CheckAVX512F);
695   }
f32_gemm_7x16__avx512f_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)696   static void f32_gemm_7x16__avx512f_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
697     GEMMEnd2EndBenchmark(state, model,
698       xnn_f32_gemm_minmax_ukernel_7x16__avx512f_broadcast,
699       xnn_f32_igemm_minmax_ukernel_7x16__avx512f_broadcast,
700       xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast,
701       xnn_f32_igemm_minmax_ukernel_1x16__avx512f_broadcast,
702       xnn_init_f32_minmax_scalar_params,
703       7 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
704       benchmark::utils::CheckAVX512F);
705   }
f32_gemm_8x16__avx512f_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)706   static void f32_gemm_8x16__avx512f_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
707     GEMMEnd2EndBenchmark(state, model,
708       xnn_f32_gemm_minmax_ukernel_8x16__avx512f_broadcast,
709       xnn_f32_igemm_minmax_ukernel_8x16__avx512f_broadcast,
710       xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast,
711       xnn_f32_igemm_minmax_ukernel_1x16__avx512f_broadcast,
712       xnn_init_f32_minmax_scalar_params,
713       8 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
714       benchmark::utils::CheckAVX512F);
715   }
716 
f32_gemm_4x8__fma3_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)717   static void f32_gemm_4x8__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
718     GEMMEnd2EndBenchmark(state, model,
719       xnn_f32_gemm_minmax_ukernel_4x8__fma3_broadcast,
720       xnn_f32_igemm_minmax_ukernel_4x8__fma3_broadcast,
721       xnn_f32_gemm_minmax_ukernel_1x8__fma3_broadcast,
722       xnn_f32_igemm_minmax_ukernel_1x8__fma3_broadcast,
723       xnn_init_f32_minmax_avx_params,
724       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
725       benchmark::utils::CheckFMA3);
726   }
f32_gemm_5x8__fma3_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)727   static void f32_gemm_5x8__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
728     GEMMEnd2EndBenchmark(state, model,
729       xnn_f32_gemm_minmax_ukernel_5x8__fma3_broadcast,
730       xnn_f32_igemm_minmax_ukernel_5x8__fma3_broadcast,
731       xnn_f32_gemm_minmax_ukernel_1x8__fma3_broadcast,
732       xnn_f32_igemm_minmax_ukernel_1x8__fma3_broadcast,
733       xnn_init_f32_minmax_avx_params,
734       5 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
735       benchmark::utils::CheckFMA3);
736   }
f32_gemm_6x8__fma3_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)737   static void f32_gemm_6x8__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
738     GEMMEnd2EndBenchmark(state, model,
739       xnn_f32_gemm_minmax_ukernel_6x8__fma3_broadcast,
740       xnn_f32_igemm_minmax_ukernel_6x8__fma3_broadcast,
741       xnn_f32_gemm_minmax_ukernel_1x8__fma3_broadcast,
742       xnn_f32_igemm_minmax_ukernel_1x8__fma3_broadcast,
743       xnn_init_f32_minmax_avx_params,
744       6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
745       benchmark::utils::CheckFMA3);
746   }
f32_gemm_7x8__fma3_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)747   static void f32_gemm_7x8__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
748     GEMMEnd2EndBenchmark(state, model,
749       xnn_f32_gemm_minmax_ukernel_7x8__fma3_broadcast,
750       xnn_f32_igemm_minmax_ukernel_7x8__fma3_broadcast,
751       xnn_f32_gemm_minmax_ukernel_1x8__fma3_broadcast,
752       xnn_f32_igemm_minmax_ukernel_1x8__fma3_broadcast,
753       xnn_init_f32_minmax_avx_params,
754       7 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
755       benchmark::utils::CheckFMA3);
756   }
f32_gemm_8x8__fma3_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)757   static void f32_gemm_8x8__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
758     GEMMEnd2EndBenchmark(state, model,
759       xnn_f32_gemm_minmax_ukernel_8x8__fma3_broadcast,
760       xnn_f32_igemm_minmax_ukernel_8x8__fma3_broadcast,
761       xnn_f32_gemm_minmax_ukernel_1x8__fma3_broadcast,
762       xnn_f32_igemm_minmax_ukernel_1x8__fma3_broadcast,
763       xnn_init_f32_minmax_avx_params,
764       8 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
765       benchmark::utils::CheckFMA3);
766   }
f32_gemm_3x16__fma3_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)767   static void f32_gemm_3x16__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
768     GEMMEnd2EndBenchmark(state, model,
769       xnn_f32_gemm_minmax_ukernel_3x16__fma3_broadcast,
770       xnn_f32_igemm_minmax_ukernel_3x16__fma3_broadcast,
771       xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast,
772       xnn_f32_igemm_minmax_ukernel_1x16__fma3_broadcast,
773       xnn_init_f32_minmax_avx_params,
774       3 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
775       benchmark::utils::CheckFMA3);
776   }
f32_gemm_4x16__fma3_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)777   static void f32_gemm_4x16__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
778     GEMMEnd2EndBenchmark(state, model,
779       xnn_f32_gemm_minmax_ukernel_4x16__fma3_broadcast,
780       xnn_f32_igemm_minmax_ukernel_4x16__fma3_broadcast,
781       xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast,
782       xnn_f32_igemm_minmax_ukernel_1x16__fma3_broadcast,
783       xnn_init_f32_minmax_avx_params,
784       4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
785       benchmark::utils::CheckFMA3);
786   }
f32_gemm_5x16__fma3_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)787   static void f32_gemm_5x16__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
788     GEMMEnd2EndBenchmark(state, model,
789       xnn_f32_gemm_minmax_ukernel_5x16__fma3_broadcast,
790       xnn_f32_igemm_minmax_ukernel_5x16__fma3_broadcast,
791       xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast,
792       xnn_f32_igemm_minmax_ukernel_1x16__fma3_broadcast,
793       xnn_init_f32_minmax_avx_params,
794       5 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
795       benchmark::utils::CheckFMA3);
796   }
f32_gemm_3x16s4__fma3_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)797   static void f32_gemm_3x16s4__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
798     GEMMEnd2EndBenchmark(state, model,
799       xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast,
800       xnn_f32_igemm_minmax_ukernel_3x16s4__fma3_broadcast,
801       xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast,
802       xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast,
803       xnn_init_f32_minmax_avx_params,
804       3 /* mr */, 16 /* nr */, 0 /* log2_kr */, 2 /* log2_sr */,
805       benchmark::utils::CheckFMA3);
806   }
f32_gemm_4x16s4__fma3_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)807   static void f32_gemm_4x16s4__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
808     GEMMEnd2EndBenchmark(state, model,
809       xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast,
810       xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast,
811       xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast,
812       xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast,
813       xnn_init_f32_minmax_avx_params,
814       4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 2 /* log2_sr */,
815       benchmark::utils::CheckFMA3);
816   }
f32_gemm_5x16s4__fma3_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)817   static void f32_gemm_5x16s4__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
818     GEMMEnd2EndBenchmark(state, model,
819       xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast,
820       xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast,
821       xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast,
822       xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast,
823       xnn_init_f32_minmax_avx_params,
824       5 /* mr */, 16 /* nr */, 0 /* log2_kr */, 2 /* log2_sr */,
825       benchmark::utils::CheckFMA3);
826   }
827 
f32_gemm_4x8__avx_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)828   static void f32_gemm_4x8__avx_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
829     GEMMEnd2EndBenchmark(state, model,
830       xnn_f32_gemm_minmax_ukernel_4x8__avx_broadcast,
831       xnn_f32_igemm_minmax_ukernel_4x8__avx_broadcast,
832       xnn_f32_gemm_minmax_ukernel_1x8__avx_broadcast,
833       xnn_f32_igemm_minmax_ukernel_1x8__avx_broadcast,
834       xnn_init_f32_minmax_avx_params,
835       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
836       benchmark::utils::CheckAVX);
837   }
f32_gemm_5x8__avx_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)838   static void f32_gemm_5x8__avx_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
839     GEMMEnd2EndBenchmark(state, model,
840       xnn_f32_gemm_minmax_ukernel_5x8__avx_broadcast,
841       xnn_f32_igemm_minmax_ukernel_5x8__avx_broadcast,
842       xnn_f32_gemm_minmax_ukernel_1x8__avx_broadcast,
843       xnn_f32_igemm_minmax_ukernel_1x8__avx_broadcast,
844       xnn_init_f32_minmax_avx_params,
845       5 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
846       benchmark::utils::CheckAVX);
847   }
f32_gemm_6x8__avx_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)848   static void f32_gemm_6x8__avx_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
849     GEMMEnd2EndBenchmark(state, model,
850       xnn_f32_gemm_minmax_ukernel_6x8__avx_broadcast,
851       xnn_f32_igemm_minmax_ukernel_6x8__avx_broadcast,
852       xnn_f32_gemm_minmax_ukernel_1x8__avx_broadcast,
853       xnn_f32_igemm_minmax_ukernel_1x8__avx_broadcast,
854       xnn_init_f32_minmax_avx_params,
855       6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
856       benchmark::utils::CheckAVX);
857   }
f32_gemm_7x8__avx_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)858   static void f32_gemm_7x8__avx_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
859     GEMMEnd2EndBenchmark(state, model,
860       xnn_f32_gemm_minmax_ukernel_7x8__avx_broadcast,
861       xnn_f32_igemm_minmax_ukernel_7x8__avx_broadcast,
862       xnn_f32_gemm_minmax_ukernel_1x8__avx_broadcast,
863       xnn_f32_igemm_minmax_ukernel_1x8__avx_broadcast,
864       xnn_init_f32_minmax_avx_params,
865       7 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
866       benchmark::utils::CheckAVX);
867   }
f32_gemm_3x16__avx_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)868   static void f32_gemm_3x16__avx_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
869     GEMMEnd2EndBenchmark(state, model,
870       xnn_f32_gemm_minmax_ukernel_3x16__avx_broadcast,
871       xnn_f32_igemm_minmax_ukernel_3x16__avx_broadcast,
872       xnn_f32_gemm_minmax_ukernel_1x16__avx_broadcast,
873       xnn_f32_igemm_minmax_ukernel_1x16__avx_broadcast,
874       xnn_init_f32_minmax_avx_params,
875       3 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
876       benchmark::utils::CheckAVX);
877   }
f32_gemm_4x16__avx_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)878   static void f32_gemm_4x16__avx_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
879     GEMMEnd2EndBenchmark(state, model,
880       xnn_f32_gemm_minmax_ukernel_4x16__avx_broadcast,
881       xnn_f32_igemm_minmax_ukernel_4x16__avx_broadcast,
882       xnn_f32_gemm_minmax_ukernel_1x16__avx_broadcast,
883       xnn_f32_igemm_minmax_ukernel_1x16__avx_broadcast,
884       xnn_init_f32_minmax_avx_params,
885       4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
886       benchmark::utils::CheckAVX);
887   }
f32_gemm_5x16__avx_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)888   static void f32_gemm_5x16__avx_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
889     GEMMEnd2EndBenchmark(state, model,
890       xnn_f32_gemm_minmax_ukernel_5x16__avx_broadcast,
891       xnn_f32_igemm_minmax_ukernel_5x16__avx_broadcast,
892       xnn_f32_gemm_minmax_ukernel_1x16__avx_broadcast,
893       xnn_f32_igemm_minmax_ukernel_1x16__avx_broadcast,
894       xnn_init_f32_minmax_avx_params,
895       5 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
896       benchmark::utils::CheckAVX);
897   }
898 
f32_gemm_3x8__sse2_dup(benchmark::State & state,models::ExecutionPlanFactory model)899   static void f32_gemm_3x8__sse2_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
900     GEMMEnd2EndBenchmark(state, model,
901       xnn_f32_gemm_minmax_ukernel_3x8__sse2_dup,
902       xnn_f32_igemm_minmax_ukernel_3x8__sse2_dup,
903       xnn_f32_gemm_minmax_ukernel_1x8__sse2_dup,
904       xnn_f32_igemm_minmax_ukernel_1x8__sse2_dup,
905       xnn_init_f32_minmax_sse_params,
906       3 /* mr */, 8 /* nr */);
907   }
f32_gemm_4x8__sse2_dup(benchmark::State & state,models::ExecutionPlanFactory model)908   static void f32_gemm_4x8__sse2_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
909     GEMMEnd2EndBenchmark(state, model,
910       xnn_f32_gemm_minmax_ukernel_4x8__sse2_dup,
911       xnn_f32_igemm_minmax_ukernel_4x8__sse2_dup,
912       xnn_f32_gemm_minmax_ukernel_1x8__sse2_dup,
913       xnn_f32_igemm_minmax_ukernel_1x8__sse2_dup,
914       xnn_init_f32_minmax_sse_params,
915       4 /* mr */, 8 /* nr */);
916   }
f32_gemm_5x8__sse2_dup(benchmark::State & state,models::ExecutionPlanFactory model)917   static void f32_gemm_5x8__sse2_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
918     GEMMEnd2EndBenchmark(state, model,
919       xnn_f32_gemm_minmax_ukernel_5x8__sse2_dup,
920       xnn_f32_igemm_minmax_ukernel_5x8__sse2_dup,
921       xnn_f32_gemm_minmax_ukernel_1x8__sse2_dup,
922       xnn_f32_igemm_minmax_ukernel_1x8__sse2_dup,
923       xnn_init_f32_minmax_sse_params,
924       5 /* mr */, 8 /* nr */);
925   }
926 
f32_gemm_3x8__sse_load1(benchmark::State & state,models::ExecutionPlanFactory model)927   static void f32_gemm_3x8__sse_load1(benchmark::State& state, models::ExecutionPlanFactory model) {
928     GEMMEnd2EndBenchmark(state, model,
929       xnn_f32_gemm_minmax_ukernel_3x8__sse_load1,
930       xnn_f32_igemm_minmax_ukernel_3x8__sse_load1,
931       xnn_f32_gemm_minmax_ukernel_1x8__sse_load1,
932       xnn_f32_igemm_minmax_ukernel_1x8__sse_load1,
933       xnn_init_f32_minmax_sse_params,
934       3 /* mr */, 8 /* nr */);
935   }
f32_gemm_4x8__sse_load1(benchmark::State & state,models::ExecutionPlanFactory model)936   static void f32_gemm_4x8__sse_load1(benchmark::State& state, models::ExecutionPlanFactory model) {
937     GEMMEnd2EndBenchmark(state, model,
938       xnn_f32_gemm_minmax_ukernel_4x8__sse_load1,
939       xnn_f32_igemm_minmax_ukernel_4x8__sse_load1,
940       xnn_f32_gemm_minmax_ukernel_1x8__sse_load1,
941       xnn_f32_igemm_minmax_ukernel_1x8__sse_load1,
942       xnn_init_f32_minmax_sse_params,
943       4 /* mr */, 8 /* nr */);
944   }
f32_gemm_5x8__sse_load1(benchmark::State & state,models::ExecutionPlanFactory model)945   static void f32_gemm_5x8__sse_load1(benchmark::State& state, models::ExecutionPlanFactory model) {
946     GEMMEnd2EndBenchmark(state, model,
947       xnn_f32_gemm_minmax_ukernel_5x8__sse_load1,
948       xnn_f32_igemm_minmax_ukernel_5x8__sse_load1,
949       xnn_f32_gemm_minmax_ukernel_1x8__sse_load1,
950       xnn_f32_igemm_minmax_ukernel_1x8__sse_load1,
951       xnn_init_f32_minmax_sse_params,
952       5 /* mr */, 8 /* nr */);
953   }
f32_gemm_3x8__sse_dup(benchmark::State & state,models::ExecutionPlanFactory model)954   static void f32_gemm_3x8__sse_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
955     GEMMEnd2EndBenchmark(state, model,
956       xnn_f32_gemm_minmax_ukernel_3x8__sse_dup,
957       xnn_f32_igemm_minmax_ukernel_3x8__sse_dup,
958       xnn_f32_gemm_minmax_ukernel_1x8__sse_dup,
959       xnn_f32_igemm_minmax_ukernel_1x8__sse_dup,
960       xnn_init_f32_minmax_sse_params,
961       3 /* mr */, 8 /* nr */);
962   }
f32_gemm_4x8__sse_dup(benchmark::State & state,models::ExecutionPlanFactory model)963   static void f32_gemm_4x8__sse_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
964     GEMMEnd2EndBenchmark(state, model,
965       xnn_f32_gemm_minmax_ukernel_4x8__sse_dup,
966       xnn_f32_igemm_minmax_ukernel_4x8__sse_dup,
967       xnn_f32_gemm_minmax_ukernel_1x8__sse_dup,
968       xnn_f32_igemm_minmax_ukernel_1x8__sse_dup,
969       xnn_init_f32_minmax_sse_params,
970       4 /* mr */, 8 /* nr */);
971   }
f32_gemm_5x8__sse_dup(benchmark::State & state,models::ExecutionPlanFactory model)972   static void f32_gemm_5x8__sse_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
973     GEMMEnd2EndBenchmark(state, model,
974       xnn_f32_gemm_minmax_ukernel_5x8__sse_dup,
975       xnn_f32_igemm_minmax_ukernel_5x8__sse_dup,
976       xnn_f32_gemm_minmax_ukernel_1x8__sse_dup,
977       xnn_f32_igemm_minmax_ukernel_1x8__sse_dup,
978       xnn_init_f32_minmax_sse_params,
979       5 /* mr */, 8 /* nr */);
980   }
f32_gemm_3x8s4__sse(benchmark::State & state,models::ExecutionPlanFactory model)981   static void f32_gemm_3x8s4__sse(benchmark::State& state, models::ExecutionPlanFactory model) {
982     GEMMEnd2EndBenchmark(state, model,
983       xnn_f32_gemm_minmax_ukernel_3x8s4__sse,
984       xnn_f32_igemm_minmax_ukernel_3x8s4__sse,
985       xnn_f32_gemm_minmax_ukernel_1x8s4__sse,
986       xnn_f32_igemm_minmax_ukernel_1x8s4__sse,
987       xnn_init_f32_minmax_sse_params,
988       3 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
989   }
f32_gemm_4x8s4__sse(benchmark::State & state,models::ExecutionPlanFactory model)990   static void f32_gemm_4x8s4__sse(benchmark::State& state, models::ExecutionPlanFactory model) {
991     GEMMEnd2EndBenchmark(state, model,
992       xnn_f32_gemm_minmax_ukernel_4x8s4__sse,
993       xnn_f32_igemm_minmax_ukernel_4x8s4__sse,
994       xnn_f32_gemm_minmax_ukernel_1x8s4__sse,
995       xnn_f32_igemm_minmax_ukernel_1x8s4__sse,
996       xnn_init_f32_minmax_sse_params,
997       4 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
998   }
f32_gemm_5x8s4__sse(benchmark::State & state,models::ExecutionPlanFactory model)999   static void f32_gemm_5x8s4__sse(benchmark::State& state, models::ExecutionPlanFactory model) {
1000     GEMMEnd2EndBenchmark(state, model,
1001       xnn_f32_gemm_minmax_ukernel_5x8s4__sse,
1002       xnn_f32_igemm_minmax_ukernel_5x8s4__sse,
1003       xnn_f32_gemm_minmax_ukernel_1x8s4__sse,
1004       xnn_f32_igemm_minmax_ukernel_1x8s4__sse,
1005       xnn_init_f32_minmax_sse_params,
1006       5 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
1007   }
1008 
// Registrations for the x86/x86-64 f32 GEMM wrappers of this section, grouped
// by kernel family: avx512f, fma3, fma3 s4, avx, sse2_dup, sse_load1, sse_dup,
// sse s4.
// NOTE(review): BENCHMARK_FP32_END2END is defined outside this chunk
// (presumably in bench/end2end.h) - confirm which models it expands over.
1009   BENCHMARK_FP32_END2END(f32_gemm_4x16__avx512f_broadcast);
1010   BENCHMARK_FP32_END2END(f32_gemm_5x16__avx512f_broadcast);
1011   BENCHMARK_FP32_END2END(f32_gemm_6x16__avx512f_broadcast);
1012   BENCHMARK_FP32_END2END(f32_gemm_7x16__avx512f_broadcast);
1013   BENCHMARK_FP32_END2END(f32_gemm_8x16__avx512f_broadcast);
1014 
1015   BENCHMARK_FP32_END2END(f32_gemm_4x8__fma3_broadcast);
1016   BENCHMARK_FP32_END2END(f32_gemm_5x8__fma3_broadcast);
1017   BENCHMARK_FP32_END2END(f32_gemm_6x8__fma3_broadcast);
1018   BENCHMARK_FP32_END2END(f32_gemm_7x8__fma3_broadcast);
1019   BENCHMARK_FP32_END2END(f32_gemm_8x8__fma3_broadcast);
1020   BENCHMARK_FP32_END2END(f32_gemm_3x16__fma3_broadcast);
1021   BENCHMARK_FP32_END2END(f32_gemm_4x16__fma3_broadcast);
1022   BENCHMARK_FP32_END2END(f32_gemm_5x16__fma3_broadcast);
1023 
1024   BENCHMARK_FP32_END2END(f32_gemm_3x16s4__fma3_broadcast);
1025   BENCHMARK_FP32_END2END(f32_gemm_4x16s4__fma3_broadcast);
1026   BENCHMARK_FP32_END2END(f32_gemm_5x16s4__fma3_broadcast);
1027 
1028   BENCHMARK_FP32_END2END(f32_gemm_4x8__avx_broadcast);
1029   BENCHMARK_FP32_END2END(f32_gemm_5x8__avx_broadcast);
1030   BENCHMARK_FP32_END2END(f32_gemm_6x8__avx_broadcast);
1031   BENCHMARK_FP32_END2END(f32_gemm_7x8__avx_broadcast);
1032   BENCHMARK_FP32_END2END(f32_gemm_3x16__avx_broadcast);
1033   BENCHMARK_FP32_END2END(f32_gemm_4x16__avx_broadcast);
1034   BENCHMARK_FP32_END2END(f32_gemm_5x16__avx_broadcast);
1035 
1036   BENCHMARK_FP32_END2END(f32_gemm_3x8__sse2_dup);
1037   BENCHMARK_FP32_END2END(f32_gemm_4x8__sse2_dup);
1038   BENCHMARK_FP32_END2END(f32_gemm_5x8__sse2_dup);
1039 
1040   BENCHMARK_FP32_END2END(f32_gemm_3x8__sse_load1);
1041   BENCHMARK_FP32_END2END(f32_gemm_4x8__sse_load1);
1042   BENCHMARK_FP32_END2END(f32_gemm_5x8__sse_load1);
1043 
1044   BENCHMARK_FP32_END2END(f32_gemm_3x8__sse_dup);
1045   BENCHMARK_FP32_END2END(f32_gemm_4x8__sse_dup);
1046   BENCHMARK_FP32_END2END(f32_gemm_5x8__sse_dup);
1047 
1048   BENCHMARK_FP32_END2END(f32_gemm_3x8s4__sse);
1049   BENCHMARK_FP32_END2END(f32_gemm_4x8s4__sse);
1050   BENCHMARK_FP32_END2END(f32_gemm_5x8s4__sse);
1051 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
1052 
1053 
1054 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
f32_gemm_3x8__wasmsimd_arm_loadsplat(benchmark::State & state,models::ExecutionPlanFactory model)1055   static void f32_gemm_3x8__wasmsimd_arm_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) {
1056     GEMMEnd2EndBenchmark(state, model,
1057       xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_arm_loadsplat,
1058       xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_arm_loadsplat,
1059       xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_loadsplat,
1060       xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_arm_loadsplat,
1061       xnn_init_f32_minmax_scalar_params,
1062       3 /* mr */, 8 /* nr */);
1063   }
f32_gemm_4x8__wasmsimd_arm_loadsplat(benchmark::State & state,models::ExecutionPlanFactory model)1064   static void f32_gemm_4x8__wasmsimd_arm_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) {
1065     GEMMEnd2EndBenchmark(state, model,
1066       xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_arm_loadsplat,
1067       xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_arm_loadsplat,
1068       xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_loadsplat,
1069       xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_arm_loadsplat,
1070       xnn_init_f32_minmax_scalar_params,
1071       4 /* mr */, 8 /* nr */);
1072   }
f32_gemm_5x8__wasmsimd_arm_loadsplat(benchmark::State & state,models::ExecutionPlanFactory model)1073   static void f32_gemm_5x8__wasmsimd_arm_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) {
1074     GEMMEnd2EndBenchmark(state, model,
1075       xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_loadsplat,
1076       xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_arm_loadsplat,
1077       xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_loadsplat,
1078       xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_arm_loadsplat,
1079       xnn_init_f32_minmax_scalar_params,
1080       5 /* mr */, 8 /* nr */);
1081   }
f32_gemm_6x8__wasmsimd_arm_loadsplat(benchmark::State & state,models::ExecutionPlanFactory model)1082   static void f32_gemm_6x8__wasmsimd_arm_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) {
1083     GEMMEnd2EndBenchmark(state, model,
1084       xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_arm_loadsplat,
1085       xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_arm_loadsplat,
1086       xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_loadsplat,
1087       xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_arm_loadsplat,
1088       xnn_init_f32_minmax_scalar_params,
1089       6 /* mr */, 8 /* nr */);
1090   }
f32_gemm_3x8__wasmsimd_x86_loadsplat(benchmark::State & state,models::ExecutionPlanFactory model)1091   static void f32_gemm_3x8__wasmsimd_x86_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) {
1092     GEMMEnd2EndBenchmark(state, model,
1093       xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat,
1094       xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat,
1095       xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat,
1096       xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat,
1097       xnn_init_f32_minmax_scalar_params,
1098       3 /* mr */, 8 /* nr */);
1099   }
f32_gemm_4x8__wasmsimd_x86_loadsplat(benchmark::State & state,models::ExecutionPlanFactory model)1100   static void f32_gemm_4x8__wasmsimd_x86_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) {
1101     GEMMEnd2EndBenchmark(state, model,
1102       xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat,
1103       xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat,
1104       xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat,
1105       xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat,
1106       xnn_init_f32_minmax_scalar_params,
1107       4 /* mr */, 8 /* nr */);
1108   }
f32_gemm_5x8__wasmsimd_x86_loadsplat(benchmark::State & state,models::ExecutionPlanFactory model)1109   static void f32_gemm_5x8__wasmsimd_x86_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) {
1110     GEMMEnd2EndBenchmark(state, model,
1111       xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_x86_loadsplat,
1112       xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_x86_loadsplat,
1113       xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat,
1114       xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat,
1115       xnn_init_f32_minmax_scalar_params,
1116       5 /* mr */, 8 /* nr */);
1117   }
f32_gemm_6x8__wasmsimd_x86_loadsplat(benchmark::State & state,models::ExecutionPlanFactory model)1118   static void f32_gemm_6x8__wasmsimd_x86_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) {
1119     GEMMEnd2EndBenchmark(state, model,
1120       xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_loadsplat,
1121       xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_x86_loadsplat,
1122       xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat,
1123       xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat,
1124       xnn_init_f32_minmax_scalar_params,
1125       6 /* mr */, 8 /* nr */);
1126   }
f32_gemm_3x8__wasmsimd_arm_splat(benchmark::State & state,models::ExecutionPlanFactory model)1127   static void f32_gemm_3x8__wasmsimd_arm_splat(benchmark::State& state, models::ExecutionPlanFactory model) {
1128     GEMMEnd2EndBenchmark(state, model,
1129       xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_arm_splat,
1130       xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_arm_splat,
1131       xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat,
1132       xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_arm_splat,
1133       xnn_init_f32_minmax_scalar_params,
1134       3 /* mr */, 8 /* nr */);
1135   }
f32_gemm_4x8__wasmsimd_arm_splat(benchmark::State & state,models::ExecutionPlanFactory model)1136   static void f32_gemm_4x8__wasmsimd_arm_splat(benchmark::State& state, models::ExecutionPlanFactory model) {
1137     GEMMEnd2EndBenchmark(state, model,
1138       xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_arm_splat,
1139       xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_arm_splat,
1140       xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat,
1141       xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_arm_splat,
1142       xnn_init_f32_minmax_scalar_params,
1143       4 /* mr */, 8 /* nr */);
1144   }
f32_gemm_5x8__wasmsimd_arm_splat(benchmark::State & state,models::ExecutionPlanFactory model)1145   static void f32_gemm_5x8__wasmsimd_arm_splat(benchmark::State& state, models::ExecutionPlanFactory model) {
1146     GEMMEnd2EndBenchmark(state, model,
1147       xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_splat,
1148       xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_arm_splat,
1149       xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat,
1150       xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_arm_splat,
1151       xnn_init_f32_minmax_scalar_params,
1152       5 /* mr */, 8 /* nr */);
1153   }
f32_gemm_6x8__wasmsimd_arm_splat(benchmark::State & state,models::ExecutionPlanFactory model)1154   static void f32_gemm_6x8__wasmsimd_arm_splat(benchmark::State& state, models::ExecutionPlanFactory model) {
1155     GEMMEnd2EndBenchmark(state, model,
1156       xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_arm_splat,
1157       xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_arm_splat,
1158       xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat,
1159       xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_arm_splat,
1160       xnn_init_f32_minmax_scalar_params,
1161       6 /* mr */, 8 /* nr */);
1162   }
f32_gemm_3x8__wasmsimd_x86_splat(benchmark::State & state,models::ExecutionPlanFactory model)1163   static void f32_gemm_3x8__wasmsimd_x86_splat(benchmark::State& state, models::ExecutionPlanFactory model) {
1164     GEMMEnd2EndBenchmark(state, model,
1165       xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_splat,
1166       xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_x86_splat,
1167       xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat,
1168       xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_x86_splat,
1169       xnn_init_f32_minmax_scalar_params,
1170       3 /* mr */, 8 /* nr */);
1171   }
f32_gemm_4x8__wasmsimd_x86_splat(benchmark::State & state,models::ExecutionPlanFactory model)1172   static void f32_gemm_4x8__wasmsimd_x86_splat(benchmark::State& state, models::ExecutionPlanFactory model) {
1173     GEMMEnd2EndBenchmark(state, model,
1174       xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_splat,
1175       xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_x86_splat,
1176       xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat,
1177       xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_x86_splat,
1178       xnn_init_f32_minmax_scalar_params,
1179       4 /* mr */, 8 /* nr */);
1180   }
f32_gemm_5x8__wasmsimd_x86_splat(benchmark::State & state,models::ExecutionPlanFactory model)1181   static void f32_gemm_5x8__wasmsimd_x86_splat(benchmark::State& state, models::ExecutionPlanFactory model) {
1182     GEMMEnd2EndBenchmark(state, model,
1183       xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_x86_splat,
1184       xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_x86_splat,
1185       xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat,
1186       xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_x86_splat,
1187       xnn_init_f32_minmax_scalar_params,
1188       5 /* mr */, 8 /* nr */);
1189   }
f32_gemm_6x8__wasmsimd_x86_splat(benchmark::State & state,models::ExecutionPlanFactory model)1190   static void f32_gemm_6x8__wasmsimd_x86_splat(benchmark::State& state, models::ExecutionPlanFactory model) {
1191     GEMMEnd2EndBenchmark(state, model,
1192       xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_splat,
1193       xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_x86_splat,
1194       xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat,
1195       xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_x86_splat,
1196       xnn_init_f32_minmax_scalar_params,
1197       6 /* mr */, 8 /* nr */);
1198   }
f32_gemm_3x8s4__wasmsimd_arm(benchmark::State & state,models::ExecutionPlanFactory model)1199   static void f32_gemm_3x8s4__wasmsimd_arm(benchmark::State& state, models::ExecutionPlanFactory model) {
1200     GEMMEnd2EndBenchmark(state, model,
1201       xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_arm,
1202       xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_arm,
1203       xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_arm,
1204       xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_arm,
1205       xnn_init_f32_minmax_scalar_params,
1206       3 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
1207   }
f32_gemm_4x8s4__wasmsimd_arm(benchmark::State & state,models::ExecutionPlanFactory model)1208   static void f32_gemm_4x8s4__wasmsimd_arm(benchmark::State& state, models::ExecutionPlanFactory model) {
1209     GEMMEnd2EndBenchmark(state, model,
1210       xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_arm,
1211       xnn_f32_igemm_minmax_ukernel_4x8s4__wasmsimd_arm,
1212       xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_arm,
1213       xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_arm,
1214       xnn_init_f32_minmax_scalar_params,
1215       4 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
1216   }
f32_gemm_5x8s4__wasmsimd_arm(benchmark::State & state,models::ExecutionPlanFactory model)1217   static void f32_gemm_5x8s4__wasmsimd_arm(benchmark::State& state, models::ExecutionPlanFactory model) {
1218     GEMMEnd2EndBenchmark(state, model,
1219       xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_arm,
1220       xnn_f32_igemm_minmax_ukernel_5x8s4__wasmsimd_arm,
1221       xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_arm,
1222       xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_arm,
1223       xnn_init_f32_minmax_scalar_params,
1224       5 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
1225   }
f32_gemm_6x8s4__wasmsimd_arm(benchmark::State & state,models::ExecutionPlanFactory model)1226   static void f32_gemm_6x8s4__wasmsimd_arm(benchmark::State& state, models::ExecutionPlanFactory model) {
1227     GEMMEnd2EndBenchmark(state, model,
1228       xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm,
1229       xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_arm,
1230       xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_arm,
1231       xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_arm,
1232       xnn_init_f32_minmax_scalar_params,
1233       6 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
1234   }
f32_gemm_3x8s4__wasmsimd_x86(benchmark::State & state,models::ExecutionPlanFactory model)1235   static void f32_gemm_3x8s4__wasmsimd_x86(benchmark::State& state, models::ExecutionPlanFactory model) {
1236     GEMMEnd2EndBenchmark(state, model,
1237       xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86,
1238       xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_x86,
1239       xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_x86,
1240       xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_x86,
1241       xnn_init_f32_minmax_scalar_params,
1242       3 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
1243   }
f32_gemm_4x8s4__wasmsimd_x86(benchmark::State & state,models::ExecutionPlanFactory model)1244   static void f32_gemm_4x8s4__wasmsimd_x86(benchmark::State& state, models::ExecutionPlanFactory model) {
1245     GEMMEnd2EndBenchmark(state, model,
1246       xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_x86,
1247       xnn_f32_igemm_minmax_ukernel_4x8s4__wasmsimd_x86,
1248       xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_x86,
1249       xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_x86,
1250       xnn_init_f32_minmax_scalar_params,
1251       4 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
1252   }
f32_gemm_5x8s4__wasmsimd_x86(benchmark::State & state,models::ExecutionPlanFactory model)1253   static void f32_gemm_5x8s4__wasmsimd_x86(benchmark::State& state, models::ExecutionPlanFactory model) {
1254     GEMMEnd2EndBenchmark(state, model,
1255       xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_x86,
1256       xnn_f32_igemm_minmax_ukernel_5x8s4__wasmsimd_x86,
1257       xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_x86,
1258       xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_x86,
1259       xnn_init_f32_minmax_scalar_params,
1260       5 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
1261   }
f32_gemm_6x8s4__wasmsimd_x86(benchmark::State & state,models::ExecutionPlanFactory model)1262   static void f32_gemm_6x8s4__wasmsimd_x86(benchmark::State& state, models::ExecutionPlanFactory model) {
1263     GEMMEnd2EndBenchmark(state, model,
1264       xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86,
1265       xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_x86,
1266       xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_x86,
1267       xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_x86,
1268       xnn_init_f32_minmax_scalar_params,
1269       6 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
1270   }
1271 
// `wasmsimd_arm_loadsplat` group: BENCHMARK_FP32_END2END applied to the 3x8
// through 6x8 wrappers defined above. NOTE(review): the macro is defined
// outside this section; it presumably registers each wrapper with the
// benchmark framework -- confirm in bench/end2end.h.
1272   BENCHMARK_FP32_END2END(f32_gemm_3x8__wasmsimd_arm_loadsplat);
1273   BENCHMARK_FP32_END2END(f32_gemm_4x8__wasmsimd_arm_loadsplat);
1274   BENCHMARK_FP32_END2END(f32_gemm_5x8__wasmsimd_arm_loadsplat);
1275   BENCHMARK_FP32_END2END(f32_gemm_6x8__wasmsimd_arm_loadsplat);
1276 
// `wasmsimd_x86_loadsplat` group: BENCHMARK_FP32_END2END applied to the 3x8
// through 6x8 wrappers defined above.
1277   BENCHMARK_FP32_END2END(f32_gemm_3x8__wasmsimd_x86_loadsplat);
1278   BENCHMARK_FP32_END2END(f32_gemm_4x8__wasmsimd_x86_loadsplat);
1279   BENCHMARK_FP32_END2END(f32_gemm_5x8__wasmsimd_x86_loadsplat);
1280   BENCHMARK_FP32_END2END(f32_gemm_6x8__wasmsimd_x86_loadsplat);
1281 
// `wasmsimd_arm_splat` group: BENCHMARK_FP32_END2END applied to the 3x8
// through 6x8 wrappers defined above.
1282   BENCHMARK_FP32_END2END(f32_gemm_3x8__wasmsimd_arm_splat);
1283   BENCHMARK_FP32_END2END(f32_gemm_4x8__wasmsimd_arm_splat);
1284   BENCHMARK_FP32_END2END(f32_gemm_5x8__wasmsimd_arm_splat);
1285   BENCHMARK_FP32_END2END(f32_gemm_6x8__wasmsimd_arm_splat);
1286 
// `wasmsimd_x86_splat` group: BENCHMARK_FP32_END2END applied to the 3x8
// through 6x8 wrappers defined above.
1287   BENCHMARK_FP32_END2END(f32_gemm_3x8__wasmsimd_x86_splat);
1288   BENCHMARK_FP32_END2END(f32_gemm_4x8__wasmsimd_x86_splat);
1289   BENCHMARK_FP32_END2END(f32_gemm_5x8__wasmsimd_x86_splat);
1290   BENCHMARK_FP32_END2END(f32_gemm_6x8__wasmsimd_x86_splat);
1291 
// `8s4__wasmsimd_arm` group: BENCHMARK_FP32_END2END applied to the 3x8s4
// through 6x8s4 wrappers defined above.
1292   BENCHMARK_FP32_END2END(f32_gemm_3x8s4__wasmsimd_arm);
1293   BENCHMARK_FP32_END2END(f32_gemm_4x8s4__wasmsimd_arm);
1294   BENCHMARK_FP32_END2END(f32_gemm_5x8s4__wasmsimd_arm);
1295   BENCHMARK_FP32_END2END(f32_gemm_6x8s4__wasmsimd_arm);
1296 
// `8s4__wasmsimd_x86` group: BENCHMARK_FP32_END2END applied to the 3x8s4
// through 6x8s4 wrappers defined above.
1297   BENCHMARK_FP32_END2END(f32_gemm_3x8s4__wasmsimd_x86);
1298   BENCHMARK_FP32_END2END(f32_gemm_4x8s4__wasmsimd_x86);
1299   BENCHMARK_FP32_END2END(f32_gemm_5x8s4__wasmsimd_x86);
1300   BENCHMARK_FP32_END2END(f32_gemm_6x8s4__wasmsimd_x86);
1301 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
1302 
1303 
1304 #if XNN_ARCH_WASM
f32_gemm_2x4__wasm(benchmark::State & state,models::ExecutionPlanFactory model)1305   static void f32_gemm_2x4__wasm(benchmark::State& state, models::ExecutionPlanFactory model) {
1306     GEMMEnd2EndBenchmark(state, model,
1307       xnn_f32_gemm_minmax_ukernel_2x4__wasm,
1308       xnn_f32_igemm_minmax_ukernel_2x4__wasm,
1309       xnn_f32_gemm_minmax_ukernel_1x4__wasm,
1310       xnn_f32_igemm_minmax_ukernel_1x4__wasm,
1311       xnn_init_f32_minmax_scalar_params,
1312       2 /* mr */, 4 /* nr */);
1313   }
1314 
f32_gemm_4x4__wasm(benchmark::State & state,models::ExecutionPlanFactory model)1315   static void f32_gemm_4x4__wasm(benchmark::State& state, models::ExecutionPlanFactory model) {
1316     GEMMEnd2EndBenchmark(state, model,
1317       xnn_f32_gemm_minmax_ukernel_4x4__wasm,
1318       xnn_f32_igemm_minmax_ukernel_4x4__wasm,
1319       xnn_f32_gemm_minmax_ukernel_1x4__wasm,
1320       xnn_f32_igemm_minmax_ukernel_1x4__wasm,
1321       xnn_init_f32_minmax_scalar_params,
1322       4 /* mr */, 4 /* nr */);
1323   }
1324 
// `wasm` group: BENCHMARK_FP32_END2END applied to the 2x4 and 4x4 wrappers
// defined above. NOTE(review): the macro is defined outside this section; it
// presumably registers each wrapper with the benchmark framework -- confirm in
// bench/end2end.h.
1325   BENCHMARK_FP32_END2END(f32_gemm_2x4__wasm);
1326   BENCHMARK_FP32_END2END(f32_gemm_4x4__wasm);
1327 #endif  // XNN_ARCH_WASM
1328 
1329 
f32_gemm_2x4__scalar(benchmark::State & state,models::ExecutionPlanFactory model)1330 static void f32_gemm_2x4__scalar(benchmark::State& state, models::ExecutionPlanFactory model) {
1331   GEMMEnd2EndBenchmark(state, model,
1332     xnn_f32_gemm_minmax_ukernel_2x4__scalar,
1333     xnn_f32_igemm_minmax_ukernel_2x4__scalar,
1334     xnn_f32_gemm_minmax_ukernel_1x4__scalar,
1335     xnn_f32_igemm_minmax_ukernel_1x4__scalar,
1336     xnn_init_f32_minmax_scalar_params,
1337     2 /* mr */, 4 /* nr */);
1338 }
1339 
f32_gemm_4x4__scalar(benchmark::State & state,models::ExecutionPlanFactory model)1340 static void f32_gemm_4x4__scalar(benchmark::State& state, models::ExecutionPlanFactory model) {
1341   GEMMEnd2EndBenchmark(state, model,
1342     xnn_f32_gemm_minmax_ukernel_4x4__scalar,
1343     xnn_f32_igemm_minmax_ukernel_4x4__scalar,
1344     xnn_f32_gemm_minmax_ukernel_1x4__scalar,
1345     xnn_f32_igemm_minmax_ukernel_1x4__scalar,
1346     xnn_init_f32_minmax_scalar_params,
1347     4 /* mr */, 4 /* nr */);
1348 }
1349 
// `scalar` group: BENCHMARK_FP32_END2END applied to the 2x4 and 4x4 wrappers
// defined above. NOTE(review): the macro is defined outside this section; it
// presumably registers each wrapper with the benchmark framework -- confirm in
// bench/end2end.h.
1350 BENCHMARK_FP32_END2END(f32_gemm_2x4__scalar);
1351 BENCHMARK_FP32_END2END(f32_gemm_4x4__scalar);
1352 
1353 
// Expand Google Benchmark's BENCHMARK_MAIN() (which supplies the program's
// main) unless the build defines XNNPACK_BENCHMARK_NO_MAIN.
// NOTE(review): presumably the opt-out exists so several benchmark files can
// be linked into one binary with a single main -- confirm in the build files.
1354 #ifndef XNNPACK_BENCHMARK_NO_MAIN
1355 BENCHMARK_MAIN();
1356 #endif
1357