1 // Copyright 2019 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5
6 #include <algorithm>
7 #include <cmath>
8 #include <functional>
9 #include <random>
10 #include <vector>
11
12 #include <xnnpack.h>
13
14 #include <benchmark/benchmark.h>
15
16 #include "bench/end2end.h"
17 #include "bench/utils.h"
18 #include "models/models.h"
19 #include <xnnpack/gemm.h>
20 #include <xnnpack/igemm.h>
21 #include <xnnpack/params.h>
22 #include <xnnpack/params-init.h>
23
24
GEMMEnd2EndBenchmark(benchmark::State & state,models::ExecutionPlanFactory model_factory,xnn_f32_gemm_minmax_ukernel_function gemm,xnn_f32_igemm_minmax_ukernel_function igemm,xnn_f32_gemm_minmax_ukernel_function gemm1,xnn_f32_igemm_minmax_ukernel_function igemm1,xnn_init_f32_minmax_params_fn init_params,uint8_t mr,uint8_t nr,uint8_t log2_kr=0,uint8_t log2_sr=0,benchmark::utils::IsaCheckFunction isa_check=nullptr)25 static void GEMMEnd2EndBenchmark(
26 benchmark::State& state,
27 models::ExecutionPlanFactory model_factory,
28 xnn_f32_gemm_minmax_ukernel_function gemm,
29 xnn_f32_igemm_minmax_ukernel_function igemm,
30 xnn_f32_gemm_minmax_ukernel_function gemm1,
31 xnn_f32_igemm_minmax_ukernel_function igemm1,
32 xnn_init_f32_minmax_params_fn init_params,
33 uint8_t mr, uint8_t nr, uint8_t log2_kr = 0, uint8_t log2_sr = 0,
34 benchmark::utils::IsaCheckFunction isa_check = nullptr)
35 {
36 if (isa_check && !isa_check(state)) {
37 return;
38 }
39 if (xnn_initialize(nullptr /* allocator */) != xnn_status_success) {
40 state.SkipWithError("failed to initialize XNNPACK");
41 return;
42 }
43
44 // Override microkernels chosen in xnn_initialize
45 // Note: do not directly assign to xnn_params.f32.gemm because it breaks older gcc.
46 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_function(gemm));
47 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_function(igemm));
48 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_function(gemm1));
49 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_function(igemm1));
50 xnn_params.f32.gemm.init.f32 = init_params;
51 xnn_params.f32.gemm.mr = mr;
52 xnn_params.f32.gemm.nr = nr;
53 xnn_params.f32.gemm.log2_kr = log2_kr;
54 xnn_params.f32.gemm.log2_sr = log2_sr;
55
56 auto execution_plan = model_factory(nullptr);
57 if (execution_plan.empty()) {
58 state.SkipWithError("failed to create a model");
59 return;
60 }
61
62 for (auto _ : state) {
63 for (const std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)>& op : execution_plan) {
64 xnn_status status = xnn_run_operator(op.get(), nullptr);
65 if (status != xnn_status_success) {
66 state.SkipWithError("failed to run a model");
67 return;
68 }
69 }
70 }
71
72 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
73 if (cpu_frequency != 0) {
74 state.counters["cpufreq"] = cpu_frequency;
75 }
76 }
77
78 #if XNN_PLATFORM_JIT
GEMMEnd2EndBenchmark(benchmark::State & state,models::ExecutionPlanFactory model_factory,xnn_jit_gemm_code_generator_function gemm_generator,xnn_f32_gemm_minmax_ukernel_function gemm,xnn_f32_igemm_minmax_ukernel_function igemm,xnn_f32_gemm_minmax_ukernel_function gemm1,xnn_f32_igemm_minmax_ukernel_function igemm1,xnn_init_f32_minmax_params_fn init_params,uint8_t mr,uint8_t nr,uint8_t log2_kr=0,uint8_t log2_sr=0,benchmark::utils::IsaCheckFunction isa_check=nullptr)79 static void GEMMEnd2EndBenchmark(
80 benchmark::State& state,
81 models::ExecutionPlanFactory model_factory,
82 xnn_jit_gemm_code_generator_function gemm_generator,
83 xnn_f32_gemm_minmax_ukernel_function gemm,
84 xnn_f32_igemm_minmax_ukernel_function igemm,
85 xnn_f32_gemm_minmax_ukernel_function gemm1,
86 xnn_f32_igemm_minmax_ukernel_function igemm1,
87 xnn_init_f32_minmax_params_fn init_params,
88 uint8_t mr, uint8_t nr, uint8_t log2_kr = 0, uint8_t log2_sr = 0,
89 benchmark::utils::IsaCheckFunction isa_check = nullptr)
90 {
91 if (isa_check && !isa_check(state)) {
92 return;
93 }
94 if (xnn_initialize(nullptr /* allocator */) != xnn_status_success) {
95 state.SkipWithError("failed to initialize XNNPACK");
96 return;
97 }
98
99 // Override microkernels chosen in xnn_initialize
100 // Note: do not directly assign to xnn_params.f32.gemm because it breaks older gcc.
101 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_function(gemm));
102 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_function(igemm));
103 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_function(gemm1));
104 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_function(igemm1));
105 xnn_params.f32.gemm.init.f32 = init_params;
106 xnn_params.f32.gemm.mr = mr;
107 xnn_params.f32.gemm.nr = nr;
108 xnn_params.f32.gemm.log2_kr = log2_kr;
109 xnn_params.f32.gemm.log2_sr = log2_sr;
110
111 xnn_params.f32.gemm.generator.gemm = xnn_init_hmp_gemm_codegen(gemm_generator);
112
113 auto execution_plan = model_factory(nullptr);
114 if (execution_plan.empty()) {
115 state.SkipWithError("failed to create a model");
116 return;
117 }
118
119 for (auto _ : state) {
120 for (const std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)>& op : execution_plan) {
121 xnn_status status = xnn_run_operator(op.get(), nullptr);
122 if (status != xnn_status_success) {
123 state.SkipWithError("failed to run a model");
124 return;
125 }
126 }
127 }
128
129 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
130 if (cpu_frequency != 0) {
131 state.counters["cpufreq"] = cpu_frequency;
132 }
133 }
134 #endif // XNN_PLATFORM_JIT
135
136 #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
f32_gemm_4x12__aarch64_neonfma_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)137 static void f32_gemm_4x12__aarch64_neonfma_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
138 GEMMEnd2EndBenchmark(state, model,
139 xnn_f32_gemm_minmax_ukernel_4x12__aarch64_neonfma_cortex_a53,
140 xnn_f32_igemm_minmax_ukernel_4x12__aarch64_neonfma_cortex_a53,
141 xnn_f32_gemm_minmax_ukernel_1x12__aarch64_neonfma_cortex_a53,
142 xnn_f32_igemm_minmax_ukernel_1x12__aarch64_neonfma_cortex_a53,
143 xnn_init_f32_minmax_scalar_params,
144 4 /* mr */, 12 /* nr */);
145 }
f32_gemm_4x8__aarch64_neonfma_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)146 static void f32_gemm_4x8__aarch64_neonfma_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
147 GEMMEnd2EndBenchmark(state, model,
148 xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53,
149 xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53,
150 xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
151 xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
152 xnn_init_f32_minmax_scalar_params,
153 4 /* mr */, 8 /* nr */);
154 }
f32_gemm_4x8__aarch64_neonfma_cortex_a55(benchmark::State & state,models::ExecutionPlanFactory model)155 static void f32_gemm_4x8__aarch64_neonfma_cortex_a55(benchmark::State& state, models::ExecutionPlanFactory model) {
156 GEMMEnd2EndBenchmark(state, model,
157 xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55,
158 xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55,
159 xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
160 xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
161 xnn_init_f32_minmax_scalar_params,
162 4 /* mr */, 8 /* nr */);
163 }
f32_gemm_4x8__aarch64_neonfma_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)164 static void f32_gemm_4x8__aarch64_neonfma_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
165 GEMMEnd2EndBenchmark(state, model,
166 xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75,
167 xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75,
168 xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75,
169 xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75,
170 xnn_init_f32_minmax_scalar_params,
171 4 /* mr */, 8 /* nr */);
172 }
f32_gemm_4x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)173 static void f32_gemm_4x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
174 GEMMEnd2EndBenchmark(state, model,
175 xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75,
176 xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75,
177 xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
178 xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
179 xnn_init_f32_minmax_scalar_params,
180 4 /* mr */, 8 /* nr */);
181 }
f32_gemm_4x8__aarch64_neonfma_ld64(benchmark::State & state,models::ExecutionPlanFactory model)182 static void f32_gemm_4x8__aarch64_neonfma_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
183 GEMMEnd2EndBenchmark(state, model,
184 xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_ld64,
185 xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_ld64,
186 xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_ld64,
187 xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64,
188 xnn_init_f32_minmax_scalar_params,
189 4 /* mr */, 8 /* nr */);
190 }
f32_gemm_4x8__aarch64_neonfma_ld128(benchmark::State & state,models::ExecutionPlanFactory model)191 static void f32_gemm_4x8__aarch64_neonfma_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
192 GEMMEnd2EndBenchmark(state, model,
193 xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_ld128,
194 xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_ld128,
195 xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_ld64,
196 xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64,
197 xnn_init_f32_minmax_scalar_params,
198 4 /* mr */, 8 /* nr */);
199 }
f32_gemm_5x8__aarch64_neonfma_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)200 static void f32_gemm_5x8__aarch64_neonfma_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
201 GEMMEnd2EndBenchmark(state, model,
202 xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_cortex_a75,
203 xnn_f32_igemm_minmax_ukernel_5x8__aarch64_neonfma_cortex_a75,
204 xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75,
205 xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75,
206 xnn_init_f32_minmax_scalar_params,
207 5 /* mr */, 8 /* nr */);
208 }
f32_gemm_5x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)209 static void f32_gemm_5x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
210 GEMMEnd2EndBenchmark(state, model,
211 xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75,
212 xnn_f32_igemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75,
213 xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
214 xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
215 xnn_init_f32_minmax_scalar_params,
216 5 /* mr */, 8 /* nr */);
217 }
f32_gemm_6x8__aarch64_neonfma_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)218 static void f32_gemm_6x8__aarch64_neonfma_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
219 GEMMEnd2EndBenchmark(state, model,
220 xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53,
221 xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53,
222 xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
223 xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
224 xnn_init_f32_minmax_scalar_params,
225 6 /* mr */, 8 /* nr */);
226 }
f32_gemm_6x8__aarch64_neonfma_cortex_a55(benchmark::State & state,models::ExecutionPlanFactory model)227 static void f32_gemm_6x8__aarch64_neonfma_cortex_a55(benchmark::State& state, models::ExecutionPlanFactory model) {
228 GEMMEnd2EndBenchmark(state, model,
229 xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55,
230 xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55,
231 xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
232 xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
233 xnn_init_f32_minmax_scalar_params,
234 6 /* mr */, 8 /* nr */);
235 }
f32_gemm_6x8__aarch64_neonfma_cortex_a73(benchmark::State & state,models::ExecutionPlanFactory model)236 static void f32_gemm_6x8__aarch64_neonfma_cortex_a73(benchmark::State& state, models::ExecutionPlanFactory model) {
237 GEMMEnd2EndBenchmark(state, model,
238 xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a73,
239 xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a73,
240 xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
241 xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
242 xnn_init_f32_minmax_scalar_params,
243 6 /* mr */, 8 /* nr */);
244 }
f32_gemm_6x8__aarch64_neonfma_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)245 static void f32_gemm_6x8__aarch64_neonfma_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
246 GEMMEnd2EndBenchmark(state, model,
247 xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75,
248 xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75,
249 xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75,
250 xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75,
251 xnn_init_f32_minmax_scalar_params,
252 6 /* mr */, 8 /* nr */);
253 }
f32_gemm_6x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)254 static void f32_gemm_6x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
255 GEMMEnd2EndBenchmark(state, model,
256 xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75,
257 xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75,
258 xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
259 xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
260 xnn_init_f32_minmax_scalar_params,
261 6 /* mr */, 8 /* nr */);
262 }
f32_gemm_6x8__aarch64_neonfma_ld64(benchmark::State & state,models::ExecutionPlanFactory model)263 static void f32_gemm_6x8__aarch64_neonfma_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
264 GEMMEnd2EndBenchmark(state, model,
265 xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld64,
266 xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_ld64,
267 xnn_f32_gemm_minmax_ukernel_1x8__neonfma_lane_ld64,
268 xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64,
269 xnn_init_f32_minmax_scalar_params,
270 6 /* mr */, 8 /* nr */);
271 }
f32_gemm_6x8__aarch64_neonfma_ld128(benchmark::State & state,models::ExecutionPlanFactory model)272 static void f32_gemm_6x8__aarch64_neonfma_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
273 GEMMEnd2EndBenchmark(state, model,
274 xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld128,
275 xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_ld128,
276 xnn_f32_gemm_minmax_ukernel_1x8__neonfma_lane_ld64,
277 xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64,
278 xnn_init_f32_minmax_scalar_params,
279 6 /* mr */, 8 /* nr */);
280 }
f32_gemm_4x8__neonfma_lane_ld64(benchmark::State & state,models::ExecutionPlanFactory model)281 static void f32_gemm_4x8__neonfma_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
282 GEMMEnd2EndBenchmark(state, model,
283 xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld64,
284 xnn_f32_igemm_minmax_ukernel_4x8__neonfma_lane_ld64,
285 xnn_f32_gemm_minmax_ukernel_1x8__neonfma_lane_ld64,
286 xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64,
287 xnn_init_f32_minmax_scalar_params,
288 4 /* mr */, 8 /* nr */);
289 }
f32_gemm_4x8__neonfma_lane_ld128(benchmark::State & state,models::ExecutionPlanFactory model)290 static void f32_gemm_4x8__neonfma_lane_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
291 GEMMEnd2EndBenchmark(state, model,
292 xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128,
293 xnn_f32_igemm_minmax_ukernel_4x8__neonfma_lane_ld128,
294 xnn_f32_gemm_minmax_ukernel_1x8__neonfma_lane_ld64,
295 xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64,
296 xnn_init_f32_minmax_scalar_params,
297 4 /* mr */, 8 /* nr */);
298 }
f32_gemm_6x8__neonfma_lane_ld64(benchmark::State & state,models::ExecutionPlanFactory model)299 static void f32_gemm_6x8__neonfma_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
300 GEMMEnd2EndBenchmark(state, model,
301 xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld64,
302 xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld64,
303 xnn_f32_gemm_minmax_ukernel_1x8__neonfma_lane_ld64,
304 xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64,
305 xnn_init_f32_minmax_scalar_params,
306 6 /* mr */, 8 /* nr */);
307 }
f32_gemm_6x8__neonfma_lane_ld128(benchmark::State & state,models::ExecutionPlanFactory model)308 static void f32_gemm_6x8__neonfma_lane_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
309 GEMMEnd2EndBenchmark(state, model,
310 xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld128,
311 xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld128,
312 xnn_f32_gemm_minmax_ukernel_1x8__neonfma_lane_ld64,
313 xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64,
314 xnn_init_f32_minmax_scalar_params,
315 6 /* mr */, 8 /* nr */);
316 }
317
318 BENCHMARK_FP32_END2END(f32_gemm_4x8__aarch64_neonfma_ld64)
319 BENCHMARK_FP32_END2END(f32_gemm_4x8__aarch64_neonfma_ld128);
320 BENCHMARK_FP32_END2END(f32_gemm_6x8__aarch64_neonfma_ld64);
321 BENCHMARK_FP32_END2END(f32_gemm_6x8__aarch64_neonfma_ld128);
322 BENCHMARK_FP32_END2END(f32_gemm_4x8__aarch64_neonfma_cortex_a53)
323 BENCHMARK_FP32_END2END(f32_gemm_4x8__aarch64_neonfma_cortex_a55)
324 BENCHMARK_FP32_END2END(f32_gemm_4x8__aarch64_neonfma_cortex_a75)
325 BENCHMARK_FP32_END2END(f32_gemm_4x8__aarch64_neonfma_prfm_cortex_a75)
326 BENCHMARK_FP32_END2END(f32_gemm_5x8__aarch64_neonfma_cortex_a75);
327 BENCHMARK_FP32_END2END(f32_gemm_5x8__aarch64_neonfma_prfm_cortex_a75);
328 BENCHMARK_FP32_END2END(f32_gemm_6x8__aarch64_neonfma_cortex_a53);
329 BENCHMARK_FP32_END2END(f32_gemm_6x8__aarch64_neonfma_cortex_a55);
330 BENCHMARK_FP32_END2END(f32_gemm_6x8__aarch64_neonfma_cortex_a73);
331 BENCHMARK_FP32_END2END(f32_gemm_6x8__aarch64_neonfma_cortex_a75);
332 BENCHMARK_FP32_END2END(f32_gemm_6x8__aarch64_neonfma_prfm_cortex_a75);
333 BENCHMARK_FP32_END2END(f32_gemm_4x12__aarch64_neonfma_cortex_a53);
334
335 BENCHMARK_FP32_END2END(f32_gemm_4x8__neonfma_lane_ld64);
336 BENCHMARK_FP32_END2END(f32_gemm_4x8__neonfma_lane_ld128);
337
338 BENCHMARK_FP32_END2END(f32_gemm_6x8__neonfma_lane_ld64);
339 BENCHMARK_FP32_END2END(f32_gemm_6x8__neonfma_lane_ld128);
340 #endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
341
342 #if XNN_ARCH_ARM64 && XNN_PLATFORM_JIT
jit_f32_gemm_6x8__aarch64_neonfma_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)343 static void jit_f32_gemm_6x8__aarch64_neonfma_cortex_a75(
344 benchmark::State &state, models::ExecutionPlanFactory model) {
345 GEMMEnd2EndBenchmark(
346 state, model,
347 xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75,
348 xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75,
349 xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75,
350 xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75,
351 xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75,
352 xnn_init_f32_minmax_scalar_params, 6 /* mr */, 8 /* nr */);
353 }
jit_f32_gemm_6x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)354 static void jit_f32_gemm_6x8__aarch64_neonfma_prfm_cortex_a75(
355 benchmark::State &state, models::ExecutionPlanFactory model) {
356 GEMMEnd2EndBenchmark(
357 state, model,
358 xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75,
359 xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75,
360 xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75,
361 xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
362 xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
363 xnn_init_f32_minmax_scalar_params, 6 /* mr */, 8 /* nr */);
364 }
365
366 BENCHMARK_FP32_END2END(jit_f32_gemm_6x8__aarch64_neonfma_cortex_a75);
367 BENCHMARK_FP32_END2END(jit_f32_gemm_6x8__aarch64_neonfma_prfm_cortex_a75);
368 #endif // XNN_ARCH_ARM64 && XNN_PLATFORM_JIT
369
370 #if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
f32_gemm_4x8__aarch32_neon_ld64(benchmark::State & state,models::ExecutionPlanFactory model)371 static void f32_gemm_4x8__aarch32_neon_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
372 GEMMEnd2EndBenchmark(state, model,
373 xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_ld64,
374 xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_ld64,
375 xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64,
376 xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64,
377 xnn_init_f32_minmax_scalar_params,
378 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
379 benchmark::utils::CheckNEON);
380 }
f32_gemm_4x8__aarch32_neon_cortex_a7(benchmark::State & state,models::ExecutionPlanFactory model)381 static void f32_gemm_4x8__aarch32_neon_cortex_a7(benchmark::State& state, models::ExecutionPlanFactory model) {
382 GEMMEnd2EndBenchmark(state, model,
383 xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a7,
384 xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a7,
385 xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64,
386 xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64,
387 xnn_init_f32_minmax_scalar_params,
388 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
389 benchmark::utils::CheckNEON);
390 }
f32_gemm_4x8__aarch32_neon_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)391 static void f32_gemm_4x8__aarch32_neon_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
392 GEMMEnd2EndBenchmark(state, model,
393 xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53,
394 xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53,
395 xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64,
396 xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64,
397 xnn_init_f32_minmax_scalar_params,
398 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
399 benchmark::utils::CheckNEON);
400 }
f32_gemm_4x8__aarch32_neon_cortex_a55(benchmark::State & state,models::ExecutionPlanFactory model)401 static void f32_gemm_4x8__aarch32_neon_cortex_a55(benchmark::State& state, models::ExecutionPlanFactory model) {
402 GEMMEnd2EndBenchmark(state, model,
403 xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55,
404 xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55,
405 xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64,
406 xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64,
407 xnn_init_f32_minmax_scalar_params,
408 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
409 benchmark::utils::CheckNEON);
410 }
f32_gemm_4x8__aarch32_neon_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)411 static void f32_gemm_4x8__aarch32_neon_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
412 GEMMEnd2EndBenchmark(state, model,
413 xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75,
414 xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75,
415 xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64,
416 xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64,
417 xnn_init_f32_minmax_scalar_params,
418 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
419 benchmark::utils::CheckNEON);
420 }
f32_gemm_4x8__aarch32_neon_prfm_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)421 static void f32_gemm_4x8__aarch32_neon_prfm_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
422 GEMMEnd2EndBenchmark(state, model,
423 xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a75,
424 xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a75,
425 xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64,
426 xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64,
427 xnn_init_f32_minmax_scalar_params,
428 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
429 benchmark::utils::CheckNEON);
430 }
431
432 BENCHMARK_FP32_END2END(f32_gemm_4x8__aarch32_neon_ld64);
433 BENCHMARK_FP32_END2END(f32_gemm_4x8__aarch32_neon_cortex_a7);
434 BENCHMARK_FP32_END2END(f32_gemm_4x8__aarch32_neon_cortex_a53);
435 BENCHMARK_FP32_END2END(f32_gemm_4x8__aarch32_neon_cortex_a55);
436 BENCHMARK_FP32_END2END(f32_gemm_4x8__aarch32_neon_cortex_a75);
437 BENCHMARK_FP32_END2END(f32_gemm_4x8__aarch32_neon_prfm_cortex_a75);
438 #endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
439
440
441 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
f32_gemm_4x8__neon_lane_ld64(benchmark::State & state,models::ExecutionPlanFactory model)442 static void f32_gemm_4x8__neon_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
443 GEMMEnd2EndBenchmark(state, model,
444 xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld64,
445 xnn_f32_igemm_minmax_ukernel_4x8__neon_lane_ld64,
446 xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64,
447 xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64,
448 xnn_init_f32_minmax_scalar_params,
449 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
450 benchmark::utils::CheckNEON);
451 }
452
f32_gemm_4x8__neon_lane_ld128(benchmark::State & state,models::ExecutionPlanFactory model)453 static void f32_gemm_4x8__neon_lane_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
454 GEMMEnd2EndBenchmark(state, model,
455 xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128,
456 xnn_f32_igemm_minmax_ukernel_4x8__neon_lane_ld128,
457 xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64,
458 xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64,
459 xnn_init_f32_minmax_scalar_params,
460 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
461 benchmark::utils::CheckNEON);
462 }
463
f32_gemm_6x8__neon_lane_ld64(benchmark::State & state,models::ExecutionPlanFactory model)464 static void f32_gemm_6x8__neon_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
465 GEMMEnd2EndBenchmark(state, model,
466 xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld64,
467 xnn_f32_igemm_minmax_ukernel_6x8__neon_lane_ld64,
468 xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64,
469 xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64,
470 xnn_init_f32_minmax_scalar_params,
471 6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
472 benchmark::utils::CheckNEON);
473 }
474
f32_gemm_6x8__neon_lane_ld128(benchmark::State & state,models::ExecutionPlanFactory model)475 static void f32_gemm_6x8__neon_lane_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
476 GEMMEnd2EndBenchmark(state, model,
477 xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128,
478 xnn_f32_igemm_minmax_ukernel_6x8__neon_lane_ld128,
479 xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64,
480 xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64,
481 xnn_init_f32_minmax_scalar_params,
482 6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
483 benchmark::utils::CheckNEON);
484 }
485
f32_gemm_4x8__neon_dup_ld64(benchmark::State & state,models::ExecutionPlanFactory model)486 static void f32_gemm_4x8__neon_dup_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
487 GEMMEnd2EndBenchmark(state, model,
488 xnn_f32_gemm_minmax_ukernel_4x8__neon_dup_ld64,
489 xnn_f32_igemm_minmax_ukernel_4x8__neon_dup_ld64,
490 xnn_f32_gemm_minmax_ukernel_1x8__neon_dup_ld64,
491 xnn_f32_igemm_minmax_ukernel_1x8__neon_dup_ld64,
492 xnn_init_f32_minmax_scalar_params,
493 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
494 benchmark::utils::CheckNEON);
495 }
496
f32_gemm_4x8__neon_dup_ld128(benchmark::State & state,models::ExecutionPlanFactory model)497 static void f32_gemm_4x8__neon_dup_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
498 GEMMEnd2EndBenchmark(state, model,
499 xnn_f32_gemm_minmax_ukernel_4x8__neon_dup_ld128,
500 xnn_f32_igemm_minmax_ukernel_4x8__neon_dup_ld128,
501 xnn_f32_gemm_minmax_ukernel_1x8__neon_dup_ld64,
502 xnn_f32_igemm_minmax_ukernel_1x8__neon_dup_ld64,
503 xnn_init_f32_minmax_scalar_params,
504 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
505 benchmark::utils::CheckNEON);
506 }
507
f32_gemm_6x8__neon_dup_ld64(benchmark::State & state,models::ExecutionPlanFactory model)508 static void f32_gemm_6x8__neon_dup_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
509 GEMMEnd2EndBenchmark(state, model,
510 xnn_f32_gemm_minmax_ukernel_6x8__neon_dup_ld64,
511 xnn_f32_igemm_minmax_ukernel_6x8__neon_dup_ld64,
512 xnn_f32_gemm_minmax_ukernel_1x8__neon_dup_ld64,
513 xnn_f32_igemm_minmax_ukernel_1x8__neon_dup_ld64,
514 xnn_init_f32_minmax_scalar_params,
515 6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
516 benchmark::utils::CheckNEON);
517 }
518
f32_gemm_6x8__neon_dup_ld128(benchmark::State & state,models::ExecutionPlanFactory model)519 static void f32_gemm_6x8__neon_dup_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
520 GEMMEnd2EndBenchmark(state, model,
521 xnn_f32_gemm_minmax_ukernel_6x8__neon_dup_ld128,
522 xnn_f32_igemm_minmax_ukernel_6x8__neon_dup_ld128,
523 xnn_f32_gemm_minmax_ukernel_1x8__neon_dup_ld64,
524 xnn_f32_igemm_minmax_ukernel_1x8__neon_dup_ld64,
525 xnn_init_f32_minmax_scalar_params,
526 6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
527 benchmark::utils::CheckNEON);
528 }
529
f32_gemm_4x8__neonfma_dup_ld64(benchmark::State & state,models::ExecutionPlanFactory model)530 static void f32_gemm_4x8__neonfma_dup_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
531 GEMMEnd2EndBenchmark(state, model,
532 xnn_f32_gemm_minmax_ukernel_4x8__neonfma_dup_ld64,
533 xnn_f32_igemm_minmax_ukernel_4x8__neonfma_dup_ld64,
534 xnn_f32_gemm_minmax_ukernel_1x8__neonfma_dup_ld64,
535 xnn_f32_igemm_minmax_ukernel_1x8__neonfma_dup_ld64,
536 xnn_init_f32_minmax_scalar_params,
537 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
538 benchmark::utils::CheckNEONFMA);
539 }
540
f32_gemm_4x8__neonfma_dup_ld128(benchmark::State & state,models::ExecutionPlanFactory model)541 static void f32_gemm_4x8__neonfma_dup_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
542 GEMMEnd2EndBenchmark(state, model,
543 xnn_f32_gemm_minmax_ukernel_4x8__neonfma_dup_ld128,
544 xnn_f32_igemm_minmax_ukernel_4x8__neonfma_dup_ld128,
545 xnn_f32_gemm_minmax_ukernel_1x8__neonfma_dup_ld64,
546 xnn_f32_igemm_minmax_ukernel_1x8__neonfma_dup_ld64,
547 xnn_init_f32_minmax_scalar_params,
548 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
549 benchmark::utils::CheckNEONFMA);
550 }
551
f32_gemm_6x8__neonfma_dup_ld64(benchmark::State & state,models::ExecutionPlanFactory model)552 static void f32_gemm_6x8__neonfma_dup_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
553 GEMMEnd2EndBenchmark(state, model,
554 xnn_f32_gemm_minmax_ukernel_6x8__neonfma_dup_ld64,
555 xnn_f32_igemm_minmax_ukernel_6x8__neonfma_dup_ld64,
556 xnn_f32_gemm_minmax_ukernel_1x8__neonfma_dup_ld64,
557 xnn_f32_igemm_minmax_ukernel_1x8__neonfma_dup_ld64,
558 xnn_init_f32_minmax_scalar_params,
559 6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
560 benchmark::utils::CheckNEONFMA);
561 }
562
f32_gemm_6x8__neonfma_dup_ld128(benchmark::State & state,models::ExecutionPlanFactory model)563 static void f32_gemm_6x8__neonfma_dup_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
564 GEMMEnd2EndBenchmark(state, model,
565 xnn_f32_gemm_minmax_ukernel_6x8__neonfma_dup_ld128,
566 xnn_f32_igemm_minmax_ukernel_6x8__neonfma_dup_ld128,
567 xnn_f32_gemm_minmax_ukernel_1x8__neonfma_dup_ld64,
568 xnn_f32_igemm_minmax_ukernel_1x8__neonfma_dup_ld64,
569 xnn_init_f32_minmax_scalar_params,
570 6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
571 benchmark::utils::CheckNEONFMA);
572 }
573
f32_gemm_4x8s4__neon(benchmark::State & state,models::ExecutionPlanFactory model)574 static void f32_gemm_4x8s4__neon(benchmark::State& state, models::ExecutionPlanFactory model) {
575 GEMMEnd2EndBenchmark(state, model,
576 xnn_f32_gemm_minmax_ukernel_4x8s4__neon,
577 xnn_f32_igemm_minmax_ukernel_4x8s4__neon,
578 xnn_f32_gemm_minmax_ukernel_1x8s4__neon,
579 xnn_f32_igemm_minmax_ukernel_1x8s4__neon,
580 xnn_init_f32_minmax_scalar_params,
581 4 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */,
582 benchmark::utils::CheckNEON);
583 }
584
f32_gemm_4x8s4__neonfma(benchmark::State & state,models::ExecutionPlanFactory model)585 static void f32_gemm_4x8s4__neonfma(benchmark::State& state, models::ExecutionPlanFactory model) {
586 GEMMEnd2EndBenchmark(state, model,
587 xnn_f32_gemm_minmax_ukernel_4x8s4__neonfma,
588 xnn_f32_igemm_minmax_ukernel_4x8s4__neonfma,
589 xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma,
590 xnn_f32_igemm_minmax_ukernel_1x8s4__neonfma,
591 xnn_init_f32_minmax_scalar_params,
592 4 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */,
593 benchmark::utils::CheckNEONFMA);
594 }
595
f32_gemm_6x8s4__neon(benchmark::State & state,models::ExecutionPlanFactory model)596 static void f32_gemm_6x8s4__neon(benchmark::State& state, models::ExecutionPlanFactory model) {
597 GEMMEnd2EndBenchmark(state, model,
598 xnn_f32_gemm_minmax_ukernel_6x8s4__neon,
599 xnn_f32_igemm_minmax_ukernel_6x8s4__neon,
600 xnn_f32_gemm_minmax_ukernel_1x8s4__neon,
601 xnn_f32_igemm_minmax_ukernel_1x8s4__neon,
602 xnn_init_f32_minmax_scalar_params,
603 6 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */,
604 benchmark::utils::CheckNEON);
605 }
606
f32_gemm_6x8s4__neonfma(benchmark::State & state,models::ExecutionPlanFactory model)607 static void f32_gemm_6x8s4__neonfma(benchmark::State& state, models::ExecutionPlanFactory model) {
608 GEMMEnd2EndBenchmark(state, model,
609 xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma,
610 xnn_f32_igemm_minmax_ukernel_6x8s4__neonfma,
611 xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma,
612 xnn_f32_igemm_minmax_ukernel_1x8s4__neonfma,
613 xnn_init_f32_minmax_scalar_params,
614 6 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */,
615 benchmark::utils::CheckNEONFMA);
616 }
617
f32_gemm_8x8s4__neon(benchmark::State & state,models::ExecutionPlanFactory model)618 static void f32_gemm_8x8s4__neon(benchmark::State& state, models::ExecutionPlanFactory model) {
619 GEMMEnd2EndBenchmark(state, model,
620 xnn_f32_gemm_minmax_ukernel_8x8s4__neon,
621 xnn_f32_igemm_minmax_ukernel_8x8s4__neon,
622 xnn_f32_gemm_minmax_ukernel_1x8s4__neon,
623 xnn_f32_igemm_minmax_ukernel_1x8s4__neon,
624 xnn_init_f32_minmax_scalar_params,
625 8 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */,
626 benchmark::utils::CheckNEON);
627 }
628
f32_gemm_8x8s4__neonfma(benchmark::State & state,models::ExecutionPlanFactory model)629 static void f32_gemm_8x8s4__neonfma(benchmark::State& state, models::ExecutionPlanFactory model) {
630 GEMMEnd2EndBenchmark(state, model,
631 xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma,
632 xnn_f32_igemm_minmax_ukernel_8x8s4__neonfma,
633 xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma,
634 xnn_f32_igemm_minmax_ukernel_1x8s4__neonfma,
635 xnn_init_f32_minmax_scalar_params,
636 8 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */,
637 benchmark::utils::CheckNEONFMA);
638 }
639
// Benchmark registrations for the XNN_ARCH_ARM || XNN_ARCH_ARM64 section.
// Each line hands one f32_gemm_* wrapper function to BENCHMARK_FP32_END2END.
// NOTE(review): the macro is defined outside this file chunk (presumably in
// bench/end2end.h) — confirm what set of models it instantiates.

// neon_lane variants.
640 BENCHMARK_FP32_END2END(f32_gemm_4x8__neon_lane_ld64);
641 BENCHMARK_FP32_END2END(f32_gemm_4x8__neon_lane_ld128);
642 BENCHMARK_FP32_END2END(f32_gemm_6x8__neon_lane_ld64);
643 BENCHMARK_FP32_END2END(f32_gemm_6x8__neon_lane_ld128);
644
// neon_dup variants.
645 BENCHMARK_FP32_END2END(f32_gemm_4x8__neon_dup_ld64);
646 BENCHMARK_FP32_END2END(f32_gemm_4x8__neon_dup_ld128);
647 BENCHMARK_FP32_END2END(f32_gemm_6x8__neon_dup_ld64);
648 BENCHMARK_FP32_END2END(f32_gemm_6x8__neon_dup_ld128);
649
// neonfma_dup variants.
650 BENCHMARK_FP32_END2END(f32_gemm_4x8__neonfma_dup_ld64);
651 BENCHMARK_FP32_END2END(f32_gemm_4x8__neonfma_dup_ld128);
652 BENCHMARK_FP32_END2END(f32_gemm_6x8__neonfma_dup_ld64);
653 BENCHMARK_FP32_END2END(f32_gemm_6x8__neonfma_dup_ld128);
654
// s4 neon variants.
655 BENCHMARK_FP32_END2END(f32_gemm_4x8s4__neon);
656 BENCHMARK_FP32_END2END(f32_gemm_6x8s4__neon);
657 BENCHMARK_FP32_END2END(f32_gemm_8x8s4__neon);
658
// s4 neonfma variants.
659 BENCHMARK_FP32_END2END(f32_gemm_4x8s4__neonfma);
660 BENCHMARK_FP32_END2END(f32_gemm_6x8s4__neonfma);
661 BENCHMARK_FP32_END2END(f32_gemm_8x8s4__neonfma);
662 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
663
664
665 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
f32_gemm_4x16__avx512f_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)666 static void f32_gemm_4x16__avx512f_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
667 GEMMEnd2EndBenchmark(state, model,
668 xnn_f32_gemm_minmax_ukernel_4x16__avx512f_broadcast,
669 xnn_f32_igemm_minmax_ukernel_4x16__avx512f_broadcast,
670 xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast,
671 xnn_f32_igemm_minmax_ukernel_1x16__avx512f_broadcast,
672 xnn_init_f32_minmax_scalar_params,
673 4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
674 benchmark::utils::CheckAVX512F);
675 }
f32_gemm_5x16__avx512f_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)676 static void f32_gemm_5x16__avx512f_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
677 GEMMEnd2EndBenchmark(state, model,
678 xnn_f32_gemm_minmax_ukernel_5x16__avx512f_broadcast,
679 xnn_f32_igemm_minmax_ukernel_5x16__avx512f_broadcast,
680 xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast,
681 xnn_f32_igemm_minmax_ukernel_1x16__avx512f_broadcast,
682 xnn_init_f32_minmax_scalar_params,
683 5 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
684 benchmark::utils::CheckAVX512F);
685 }
f32_gemm_6x16__avx512f_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)686 static void f32_gemm_6x16__avx512f_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
687 GEMMEnd2EndBenchmark(state, model,
688 xnn_f32_gemm_minmax_ukernel_6x16__avx512f_broadcast,
689 xnn_f32_igemm_minmax_ukernel_6x16__avx512f_broadcast,
690 xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast,
691 xnn_f32_igemm_minmax_ukernel_1x16__avx512f_broadcast,
692 xnn_init_f32_minmax_scalar_params,
693 6 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
694 benchmark::utils::CheckAVX512F);
695 }
f32_gemm_7x16__avx512f_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)696 static void f32_gemm_7x16__avx512f_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
697 GEMMEnd2EndBenchmark(state, model,
698 xnn_f32_gemm_minmax_ukernel_7x16__avx512f_broadcast,
699 xnn_f32_igemm_minmax_ukernel_7x16__avx512f_broadcast,
700 xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast,
701 xnn_f32_igemm_minmax_ukernel_1x16__avx512f_broadcast,
702 xnn_init_f32_minmax_scalar_params,
703 7 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
704 benchmark::utils::CheckAVX512F);
705 }
f32_gemm_8x16__avx512f_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)706 static void f32_gemm_8x16__avx512f_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
707 GEMMEnd2EndBenchmark(state, model,
708 xnn_f32_gemm_minmax_ukernel_8x16__avx512f_broadcast,
709 xnn_f32_igemm_minmax_ukernel_8x16__avx512f_broadcast,
710 xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast,
711 xnn_f32_igemm_minmax_ukernel_1x16__avx512f_broadcast,
712 xnn_init_f32_minmax_scalar_params,
713 8 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
714 benchmark::utils::CheckAVX512F);
715 }
716
f32_gemm_4x8__fma3_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)717 static void f32_gemm_4x8__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
718 GEMMEnd2EndBenchmark(state, model,
719 xnn_f32_gemm_minmax_ukernel_4x8__fma3_broadcast,
720 xnn_f32_igemm_minmax_ukernel_4x8__fma3_broadcast,
721 xnn_f32_gemm_minmax_ukernel_1x8__fma3_broadcast,
722 xnn_f32_igemm_minmax_ukernel_1x8__fma3_broadcast,
723 xnn_init_f32_minmax_avx_params,
724 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
725 benchmark::utils::CheckFMA3);
726 }
f32_gemm_5x8__fma3_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)727 static void f32_gemm_5x8__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
728 GEMMEnd2EndBenchmark(state, model,
729 xnn_f32_gemm_minmax_ukernel_5x8__fma3_broadcast,
730 xnn_f32_igemm_minmax_ukernel_5x8__fma3_broadcast,
731 xnn_f32_gemm_minmax_ukernel_1x8__fma3_broadcast,
732 xnn_f32_igemm_minmax_ukernel_1x8__fma3_broadcast,
733 xnn_init_f32_minmax_avx_params,
734 5 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
735 benchmark::utils::CheckFMA3);
736 }
f32_gemm_6x8__fma3_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)737 static void f32_gemm_6x8__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
738 GEMMEnd2EndBenchmark(state, model,
739 xnn_f32_gemm_minmax_ukernel_6x8__fma3_broadcast,
740 xnn_f32_igemm_minmax_ukernel_6x8__fma3_broadcast,
741 xnn_f32_gemm_minmax_ukernel_1x8__fma3_broadcast,
742 xnn_f32_igemm_minmax_ukernel_1x8__fma3_broadcast,
743 xnn_init_f32_minmax_avx_params,
744 6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
745 benchmark::utils::CheckFMA3);
746 }
f32_gemm_7x8__fma3_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)747 static void f32_gemm_7x8__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
748 GEMMEnd2EndBenchmark(state, model,
749 xnn_f32_gemm_minmax_ukernel_7x8__fma3_broadcast,
750 xnn_f32_igemm_minmax_ukernel_7x8__fma3_broadcast,
751 xnn_f32_gemm_minmax_ukernel_1x8__fma3_broadcast,
752 xnn_f32_igemm_minmax_ukernel_1x8__fma3_broadcast,
753 xnn_init_f32_minmax_avx_params,
754 7 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
755 benchmark::utils::CheckFMA3);
756 }
f32_gemm_8x8__fma3_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)757 static void f32_gemm_8x8__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
758 GEMMEnd2EndBenchmark(state, model,
759 xnn_f32_gemm_minmax_ukernel_8x8__fma3_broadcast,
760 xnn_f32_igemm_minmax_ukernel_8x8__fma3_broadcast,
761 xnn_f32_gemm_minmax_ukernel_1x8__fma3_broadcast,
762 xnn_f32_igemm_minmax_ukernel_1x8__fma3_broadcast,
763 xnn_init_f32_minmax_avx_params,
764 8 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
765 benchmark::utils::CheckFMA3);
766 }
f32_gemm_3x16__fma3_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)767 static void f32_gemm_3x16__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
768 GEMMEnd2EndBenchmark(state, model,
769 xnn_f32_gemm_minmax_ukernel_3x16__fma3_broadcast,
770 xnn_f32_igemm_minmax_ukernel_3x16__fma3_broadcast,
771 xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast,
772 xnn_f32_igemm_minmax_ukernel_1x16__fma3_broadcast,
773 xnn_init_f32_minmax_avx_params,
774 3 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
775 benchmark::utils::CheckFMA3);
776 }
f32_gemm_4x16__fma3_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)777 static void f32_gemm_4x16__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
778 GEMMEnd2EndBenchmark(state, model,
779 xnn_f32_gemm_minmax_ukernel_4x16__fma3_broadcast,
780 xnn_f32_igemm_minmax_ukernel_4x16__fma3_broadcast,
781 xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast,
782 xnn_f32_igemm_minmax_ukernel_1x16__fma3_broadcast,
783 xnn_init_f32_minmax_avx_params,
784 4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
785 benchmark::utils::CheckFMA3);
786 }
f32_gemm_5x16__fma3_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)787 static void f32_gemm_5x16__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
788 GEMMEnd2EndBenchmark(state, model,
789 xnn_f32_gemm_minmax_ukernel_5x16__fma3_broadcast,
790 xnn_f32_igemm_minmax_ukernel_5x16__fma3_broadcast,
791 xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast,
792 xnn_f32_igemm_minmax_ukernel_1x16__fma3_broadcast,
793 xnn_init_f32_minmax_avx_params,
794 5 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
795 benchmark::utils::CheckFMA3);
796 }
f32_gemm_3x16s4__fma3_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)797 static void f32_gemm_3x16s4__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
798 GEMMEnd2EndBenchmark(state, model,
799 xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast,
800 xnn_f32_igemm_minmax_ukernel_3x16s4__fma3_broadcast,
801 xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast,
802 xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast,
803 xnn_init_f32_minmax_avx_params,
804 3 /* mr */, 16 /* nr */, 0 /* log2_kr */, 2 /* log2_sr */,
805 benchmark::utils::CheckFMA3);
806 }
f32_gemm_4x16s4__fma3_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)807 static void f32_gemm_4x16s4__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
808 GEMMEnd2EndBenchmark(state, model,
809 xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast,
810 xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast,
811 xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast,
812 xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast,
813 xnn_init_f32_minmax_avx_params,
814 4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 2 /* log2_sr */,
815 benchmark::utils::CheckFMA3);
816 }
f32_gemm_5x16s4__fma3_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)817 static void f32_gemm_5x16s4__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
818 GEMMEnd2EndBenchmark(state, model,
819 xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast,
820 xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast,
821 xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast,
822 xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast,
823 xnn_init_f32_minmax_avx_params,
824 5 /* mr */, 16 /* nr */, 0 /* log2_kr */, 2 /* log2_sr */,
825 benchmark::utils::CheckFMA3);
826 }
827
f32_gemm_4x8__avx_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)828 static void f32_gemm_4x8__avx_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
829 GEMMEnd2EndBenchmark(state, model,
830 xnn_f32_gemm_minmax_ukernel_4x8__avx_broadcast,
831 xnn_f32_igemm_minmax_ukernel_4x8__avx_broadcast,
832 xnn_f32_gemm_minmax_ukernel_1x8__avx_broadcast,
833 xnn_f32_igemm_minmax_ukernel_1x8__avx_broadcast,
834 xnn_init_f32_minmax_avx_params,
835 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
836 benchmark::utils::CheckAVX);
837 }
f32_gemm_5x8__avx_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)838 static void f32_gemm_5x8__avx_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
839 GEMMEnd2EndBenchmark(state, model,
840 xnn_f32_gemm_minmax_ukernel_5x8__avx_broadcast,
841 xnn_f32_igemm_minmax_ukernel_5x8__avx_broadcast,
842 xnn_f32_gemm_minmax_ukernel_1x8__avx_broadcast,
843 xnn_f32_igemm_minmax_ukernel_1x8__avx_broadcast,
844 xnn_init_f32_minmax_avx_params,
845 5 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
846 benchmark::utils::CheckAVX);
847 }
f32_gemm_6x8__avx_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)848 static void f32_gemm_6x8__avx_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
849 GEMMEnd2EndBenchmark(state, model,
850 xnn_f32_gemm_minmax_ukernel_6x8__avx_broadcast,
851 xnn_f32_igemm_minmax_ukernel_6x8__avx_broadcast,
852 xnn_f32_gemm_minmax_ukernel_1x8__avx_broadcast,
853 xnn_f32_igemm_minmax_ukernel_1x8__avx_broadcast,
854 xnn_init_f32_minmax_avx_params,
855 6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
856 benchmark::utils::CheckAVX);
857 }
f32_gemm_7x8__avx_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)858 static void f32_gemm_7x8__avx_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
859 GEMMEnd2EndBenchmark(state, model,
860 xnn_f32_gemm_minmax_ukernel_7x8__avx_broadcast,
861 xnn_f32_igemm_minmax_ukernel_7x8__avx_broadcast,
862 xnn_f32_gemm_minmax_ukernel_1x8__avx_broadcast,
863 xnn_f32_igemm_minmax_ukernel_1x8__avx_broadcast,
864 xnn_init_f32_minmax_avx_params,
865 7 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
866 benchmark::utils::CheckAVX);
867 }
f32_gemm_3x16__avx_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)868 static void f32_gemm_3x16__avx_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
869 GEMMEnd2EndBenchmark(state, model,
870 xnn_f32_gemm_minmax_ukernel_3x16__avx_broadcast,
871 xnn_f32_igemm_minmax_ukernel_3x16__avx_broadcast,
872 xnn_f32_gemm_minmax_ukernel_1x16__avx_broadcast,
873 xnn_f32_igemm_minmax_ukernel_1x16__avx_broadcast,
874 xnn_init_f32_minmax_avx_params,
875 3 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
876 benchmark::utils::CheckAVX);
877 }
f32_gemm_4x16__avx_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)878 static void f32_gemm_4x16__avx_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
879 GEMMEnd2EndBenchmark(state, model,
880 xnn_f32_gemm_minmax_ukernel_4x16__avx_broadcast,
881 xnn_f32_igemm_minmax_ukernel_4x16__avx_broadcast,
882 xnn_f32_gemm_minmax_ukernel_1x16__avx_broadcast,
883 xnn_f32_igemm_minmax_ukernel_1x16__avx_broadcast,
884 xnn_init_f32_minmax_avx_params,
885 4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
886 benchmark::utils::CheckAVX);
887 }
f32_gemm_5x16__avx_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)888 static void f32_gemm_5x16__avx_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
889 GEMMEnd2EndBenchmark(state, model,
890 xnn_f32_gemm_minmax_ukernel_5x16__avx_broadcast,
891 xnn_f32_igemm_minmax_ukernel_5x16__avx_broadcast,
892 xnn_f32_gemm_minmax_ukernel_1x16__avx_broadcast,
893 xnn_f32_igemm_minmax_ukernel_1x16__avx_broadcast,
894 xnn_init_f32_minmax_avx_params,
895 5 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
896 benchmark::utils::CheckAVX);
897 }
898
f32_gemm_3x8__sse2_dup(benchmark::State & state,models::ExecutionPlanFactory model)899 static void f32_gemm_3x8__sse2_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
900 GEMMEnd2EndBenchmark(state, model,
901 xnn_f32_gemm_minmax_ukernel_3x8__sse2_dup,
902 xnn_f32_igemm_minmax_ukernel_3x8__sse2_dup,
903 xnn_f32_gemm_minmax_ukernel_1x8__sse2_dup,
904 xnn_f32_igemm_minmax_ukernel_1x8__sse2_dup,
905 xnn_init_f32_minmax_sse_params,
906 3 /* mr */, 8 /* nr */);
907 }
f32_gemm_4x8__sse2_dup(benchmark::State & state,models::ExecutionPlanFactory model)908 static void f32_gemm_4x8__sse2_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
909 GEMMEnd2EndBenchmark(state, model,
910 xnn_f32_gemm_minmax_ukernel_4x8__sse2_dup,
911 xnn_f32_igemm_minmax_ukernel_4x8__sse2_dup,
912 xnn_f32_gemm_minmax_ukernel_1x8__sse2_dup,
913 xnn_f32_igemm_minmax_ukernel_1x8__sse2_dup,
914 xnn_init_f32_minmax_sse_params,
915 4 /* mr */, 8 /* nr */);
916 }
f32_gemm_5x8__sse2_dup(benchmark::State & state,models::ExecutionPlanFactory model)917 static void f32_gemm_5x8__sse2_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
918 GEMMEnd2EndBenchmark(state, model,
919 xnn_f32_gemm_minmax_ukernel_5x8__sse2_dup,
920 xnn_f32_igemm_minmax_ukernel_5x8__sse2_dup,
921 xnn_f32_gemm_minmax_ukernel_1x8__sse2_dup,
922 xnn_f32_igemm_minmax_ukernel_1x8__sse2_dup,
923 xnn_init_f32_minmax_sse_params,
924 5 /* mr */, 8 /* nr */);
925 }
926
f32_gemm_3x8__sse_load1(benchmark::State & state,models::ExecutionPlanFactory model)927 static void f32_gemm_3x8__sse_load1(benchmark::State& state, models::ExecutionPlanFactory model) {
928 GEMMEnd2EndBenchmark(state, model,
929 xnn_f32_gemm_minmax_ukernel_3x8__sse_load1,
930 xnn_f32_igemm_minmax_ukernel_3x8__sse_load1,
931 xnn_f32_gemm_minmax_ukernel_1x8__sse_load1,
932 xnn_f32_igemm_minmax_ukernel_1x8__sse_load1,
933 xnn_init_f32_minmax_sse_params,
934 3 /* mr */, 8 /* nr */);
935 }
f32_gemm_4x8__sse_load1(benchmark::State & state,models::ExecutionPlanFactory model)936 static void f32_gemm_4x8__sse_load1(benchmark::State& state, models::ExecutionPlanFactory model) {
937 GEMMEnd2EndBenchmark(state, model,
938 xnn_f32_gemm_minmax_ukernel_4x8__sse_load1,
939 xnn_f32_igemm_minmax_ukernel_4x8__sse_load1,
940 xnn_f32_gemm_minmax_ukernel_1x8__sse_load1,
941 xnn_f32_igemm_minmax_ukernel_1x8__sse_load1,
942 xnn_init_f32_minmax_sse_params,
943 4 /* mr */, 8 /* nr */);
944 }
f32_gemm_5x8__sse_load1(benchmark::State & state,models::ExecutionPlanFactory model)945 static void f32_gemm_5x8__sse_load1(benchmark::State& state, models::ExecutionPlanFactory model) {
946 GEMMEnd2EndBenchmark(state, model,
947 xnn_f32_gemm_minmax_ukernel_5x8__sse_load1,
948 xnn_f32_igemm_minmax_ukernel_5x8__sse_load1,
949 xnn_f32_gemm_minmax_ukernel_1x8__sse_load1,
950 xnn_f32_igemm_minmax_ukernel_1x8__sse_load1,
951 xnn_init_f32_minmax_sse_params,
952 5 /* mr */, 8 /* nr */);
953 }
f32_gemm_3x8__sse_dup(benchmark::State & state,models::ExecutionPlanFactory model)954 static void f32_gemm_3x8__sse_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
955 GEMMEnd2EndBenchmark(state, model,
956 xnn_f32_gemm_minmax_ukernel_3x8__sse_dup,
957 xnn_f32_igemm_minmax_ukernel_3x8__sse_dup,
958 xnn_f32_gemm_minmax_ukernel_1x8__sse_dup,
959 xnn_f32_igemm_minmax_ukernel_1x8__sse_dup,
960 xnn_init_f32_minmax_sse_params,
961 3 /* mr */, 8 /* nr */);
962 }
f32_gemm_4x8__sse_dup(benchmark::State & state,models::ExecutionPlanFactory model)963 static void f32_gemm_4x8__sse_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
964 GEMMEnd2EndBenchmark(state, model,
965 xnn_f32_gemm_minmax_ukernel_4x8__sse_dup,
966 xnn_f32_igemm_minmax_ukernel_4x8__sse_dup,
967 xnn_f32_gemm_minmax_ukernel_1x8__sse_dup,
968 xnn_f32_igemm_minmax_ukernel_1x8__sse_dup,
969 xnn_init_f32_minmax_sse_params,
970 4 /* mr */, 8 /* nr */);
971 }
f32_gemm_5x8__sse_dup(benchmark::State & state,models::ExecutionPlanFactory model)972 static void f32_gemm_5x8__sse_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
973 GEMMEnd2EndBenchmark(state, model,
974 xnn_f32_gemm_minmax_ukernel_5x8__sse_dup,
975 xnn_f32_igemm_minmax_ukernel_5x8__sse_dup,
976 xnn_f32_gemm_minmax_ukernel_1x8__sse_dup,
977 xnn_f32_igemm_minmax_ukernel_1x8__sse_dup,
978 xnn_init_f32_minmax_sse_params,
979 5 /* mr */, 8 /* nr */);
980 }
f32_gemm_3x8s4__sse(benchmark::State & state,models::ExecutionPlanFactory model)981 static void f32_gemm_3x8s4__sse(benchmark::State& state, models::ExecutionPlanFactory model) {
982 GEMMEnd2EndBenchmark(state, model,
983 xnn_f32_gemm_minmax_ukernel_3x8s4__sse,
984 xnn_f32_igemm_minmax_ukernel_3x8s4__sse,
985 xnn_f32_gemm_minmax_ukernel_1x8s4__sse,
986 xnn_f32_igemm_minmax_ukernel_1x8s4__sse,
987 xnn_init_f32_minmax_sse_params,
988 3 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
989 }
f32_gemm_4x8s4__sse(benchmark::State & state,models::ExecutionPlanFactory model)990 static void f32_gemm_4x8s4__sse(benchmark::State& state, models::ExecutionPlanFactory model) {
991 GEMMEnd2EndBenchmark(state, model,
992 xnn_f32_gemm_minmax_ukernel_4x8s4__sse,
993 xnn_f32_igemm_minmax_ukernel_4x8s4__sse,
994 xnn_f32_gemm_minmax_ukernel_1x8s4__sse,
995 xnn_f32_igemm_minmax_ukernel_1x8s4__sse,
996 xnn_init_f32_minmax_sse_params,
997 4 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
998 }
f32_gemm_5x8s4__sse(benchmark::State & state,models::ExecutionPlanFactory model)999 static void f32_gemm_5x8s4__sse(benchmark::State& state, models::ExecutionPlanFactory model) {
1000 GEMMEnd2EndBenchmark(state, model,
1001 xnn_f32_gemm_minmax_ukernel_5x8s4__sse,
1002 xnn_f32_igemm_minmax_ukernel_5x8s4__sse,
1003 xnn_f32_gemm_minmax_ukernel_1x8s4__sse,
1004 xnn_f32_igemm_minmax_ukernel_1x8s4__sse,
1005 xnn_init_f32_minmax_sse_params,
1006 5 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
1007 }
1008
// Benchmark registrations for the XNN_ARCH_X86 || XNN_ARCH_X86_64 section.
// Each line hands one f32_gemm_* wrapper function to BENCHMARK_FP32_END2END.
// NOTE(review): the macro is defined outside this file chunk (presumably in
// bench/end2end.h) — confirm what set of models it instantiates.

// avx512f_broadcast variants.
1009 BENCHMARK_FP32_END2END(f32_gemm_4x16__avx512f_broadcast);
1010 BENCHMARK_FP32_END2END(f32_gemm_5x16__avx512f_broadcast);
1011 BENCHMARK_FP32_END2END(f32_gemm_6x16__avx512f_broadcast);
1012 BENCHMARK_FP32_END2END(f32_gemm_7x16__avx512f_broadcast);
1013 BENCHMARK_FP32_END2END(f32_gemm_8x16__avx512f_broadcast);
1014
// fma3_broadcast variants (nr = 8, then nr = 16).
1015 BENCHMARK_FP32_END2END(f32_gemm_4x8__fma3_broadcast);
1016 BENCHMARK_FP32_END2END(f32_gemm_5x8__fma3_broadcast);
1017 BENCHMARK_FP32_END2END(f32_gemm_6x8__fma3_broadcast);
1018 BENCHMARK_FP32_END2END(f32_gemm_7x8__fma3_broadcast);
1019 BENCHMARK_FP32_END2END(f32_gemm_8x8__fma3_broadcast);
1020 BENCHMARK_FP32_END2END(f32_gemm_3x16__fma3_broadcast);
1021 BENCHMARK_FP32_END2END(f32_gemm_4x16__fma3_broadcast);
1022 BENCHMARK_FP32_END2END(f32_gemm_5x16__fma3_broadcast);
1023
// s4 fma3_broadcast variants.
1024 BENCHMARK_FP32_END2END(f32_gemm_3x16s4__fma3_broadcast);
1025 BENCHMARK_FP32_END2END(f32_gemm_4x16s4__fma3_broadcast);
1026 BENCHMARK_FP32_END2END(f32_gemm_5x16s4__fma3_broadcast);
1027
// avx_broadcast variants (nr = 8, then nr = 16).
1028 BENCHMARK_FP32_END2END(f32_gemm_4x8__avx_broadcast);
1029 BENCHMARK_FP32_END2END(f32_gemm_5x8__avx_broadcast);
1030 BENCHMARK_FP32_END2END(f32_gemm_6x8__avx_broadcast);
1031 BENCHMARK_FP32_END2END(f32_gemm_7x8__avx_broadcast);
1032 BENCHMARK_FP32_END2END(f32_gemm_3x16__avx_broadcast);
1033 BENCHMARK_FP32_END2END(f32_gemm_4x16__avx_broadcast);
1034 BENCHMARK_FP32_END2END(f32_gemm_5x16__avx_broadcast);
1035
// sse2_dup variants.
1036 BENCHMARK_FP32_END2END(f32_gemm_3x8__sse2_dup);
1037 BENCHMARK_FP32_END2END(f32_gemm_4x8__sse2_dup);
1038 BENCHMARK_FP32_END2END(f32_gemm_5x8__sse2_dup);
1039
// sse_load1 variants.
1040 BENCHMARK_FP32_END2END(f32_gemm_3x8__sse_load1);
1041 BENCHMARK_FP32_END2END(f32_gemm_4x8__sse_load1);
1042 BENCHMARK_FP32_END2END(f32_gemm_5x8__sse_load1);
1043
// sse_dup variants.
1044 BENCHMARK_FP32_END2END(f32_gemm_3x8__sse_dup);
1045 BENCHMARK_FP32_END2END(f32_gemm_4x8__sse_dup);
1046 BENCHMARK_FP32_END2END(f32_gemm_5x8__sse_dup);
1047
// s4 sse variants.
1048 BENCHMARK_FP32_END2END(f32_gemm_3x8s4__sse);
1049 BENCHMARK_FP32_END2END(f32_gemm_4x8s4__sse);
1050 BENCHMARK_FP32_END2END(f32_gemm_5x8s4__sse);
1051 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
1052
1053
1054 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
f32_gemm_3x8__wasmsimd_arm_loadsplat(benchmark::State & state,models::ExecutionPlanFactory model)1055 static void f32_gemm_3x8__wasmsimd_arm_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) {
1056 GEMMEnd2EndBenchmark(state, model,
1057 xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_arm_loadsplat,
1058 xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_arm_loadsplat,
1059 xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_loadsplat,
1060 xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_arm_loadsplat,
1061 xnn_init_f32_minmax_scalar_params,
1062 3 /* mr */, 8 /* nr */);
1063 }
f32_gemm_4x8__wasmsimd_arm_loadsplat(benchmark::State & state,models::ExecutionPlanFactory model)1064 static void f32_gemm_4x8__wasmsimd_arm_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) {
1065 GEMMEnd2EndBenchmark(state, model,
1066 xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_arm_loadsplat,
1067 xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_arm_loadsplat,
1068 xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_loadsplat,
1069 xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_arm_loadsplat,
1070 xnn_init_f32_minmax_scalar_params,
1071 4 /* mr */, 8 /* nr */);
1072 }
f32_gemm_5x8__wasmsimd_arm_loadsplat(benchmark::State & state,models::ExecutionPlanFactory model)1073 static void f32_gemm_5x8__wasmsimd_arm_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) {
1074 GEMMEnd2EndBenchmark(state, model,
1075 xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_loadsplat,
1076 xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_arm_loadsplat,
1077 xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_loadsplat,
1078 xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_arm_loadsplat,
1079 xnn_init_f32_minmax_scalar_params,
1080 5 /* mr */, 8 /* nr */);
1081 }
f32_gemm_6x8__wasmsimd_arm_loadsplat(benchmark::State & state,models::ExecutionPlanFactory model)1082 static void f32_gemm_6x8__wasmsimd_arm_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) {
1083 GEMMEnd2EndBenchmark(state, model,
1084 xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_arm_loadsplat,
1085 xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_arm_loadsplat,
1086 xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_loadsplat,
1087 xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_arm_loadsplat,
1088 xnn_init_f32_minmax_scalar_params,
1089 6 /* mr */, 8 /* nr */);
1090 }
f32_gemm_3x8__wasmsimd_x86_loadsplat(benchmark::State & state,models::ExecutionPlanFactory model)1091 static void f32_gemm_3x8__wasmsimd_x86_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) {
1092 GEMMEnd2EndBenchmark(state, model,
1093 xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat,
1094 xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat,
1095 xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat,
1096 xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat,
1097 xnn_init_f32_minmax_scalar_params,
1098 3 /* mr */, 8 /* nr */);
1099 }
f32_gemm_4x8__wasmsimd_x86_loadsplat(benchmark::State & state,models::ExecutionPlanFactory model)1100 static void f32_gemm_4x8__wasmsimd_x86_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) {
1101 GEMMEnd2EndBenchmark(state, model,
1102 xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat,
1103 xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat,
1104 xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat,
1105 xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat,
1106 xnn_init_f32_minmax_scalar_params,
1107 4 /* mr */, 8 /* nr */);
1108 }
f32_gemm_5x8__wasmsimd_x86_loadsplat(benchmark::State & state,models::ExecutionPlanFactory model)1109 static void f32_gemm_5x8__wasmsimd_x86_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) {
1110 GEMMEnd2EndBenchmark(state, model,
1111 xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_x86_loadsplat,
1112 xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_x86_loadsplat,
1113 xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat,
1114 xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat,
1115 xnn_init_f32_minmax_scalar_params,
1116 5 /* mr */, 8 /* nr */);
1117 }
f32_gemm_6x8__wasmsimd_x86_loadsplat(benchmark::State & state,models::ExecutionPlanFactory model)1118 static void f32_gemm_6x8__wasmsimd_x86_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) {
1119 GEMMEnd2EndBenchmark(state, model,
1120 xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_loadsplat,
1121 xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_x86_loadsplat,
1122 xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat,
1123 xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat,
1124 xnn_init_f32_minmax_scalar_params,
1125 6 /* mr */, 8 /* nr */);
1126 }
f32_gemm_3x8__wasmsimd_arm_splat(benchmark::State & state,models::ExecutionPlanFactory model)1127 static void f32_gemm_3x8__wasmsimd_arm_splat(benchmark::State& state, models::ExecutionPlanFactory model) {
1128 GEMMEnd2EndBenchmark(state, model,
1129 xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_arm_splat,
1130 xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_arm_splat,
1131 xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat,
1132 xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_arm_splat,
1133 xnn_init_f32_minmax_scalar_params,
1134 3 /* mr */, 8 /* nr */);
1135 }
f32_gemm_4x8__wasmsimd_arm_splat(benchmark::State & state,models::ExecutionPlanFactory model)1136 static void f32_gemm_4x8__wasmsimd_arm_splat(benchmark::State& state, models::ExecutionPlanFactory model) {
1137 GEMMEnd2EndBenchmark(state, model,
1138 xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_arm_splat,
1139 xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_arm_splat,
1140 xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat,
1141 xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_arm_splat,
1142 xnn_init_f32_minmax_scalar_params,
1143 4 /* mr */, 8 /* nr */);
1144 }
f32_gemm_5x8__wasmsimd_arm_splat(benchmark::State & state,models::ExecutionPlanFactory model)1145 static void f32_gemm_5x8__wasmsimd_arm_splat(benchmark::State& state, models::ExecutionPlanFactory model) {
1146 GEMMEnd2EndBenchmark(state, model,
1147 xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_splat,
1148 xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_arm_splat,
1149 xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat,
1150 xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_arm_splat,
1151 xnn_init_f32_minmax_scalar_params,
1152 5 /* mr */, 8 /* nr */);
1153 }
f32_gemm_6x8__wasmsimd_arm_splat(benchmark::State & state,models::ExecutionPlanFactory model)1154 static void f32_gemm_6x8__wasmsimd_arm_splat(benchmark::State& state, models::ExecutionPlanFactory model) {
1155 GEMMEnd2EndBenchmark(state, model,
1156 xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_arm_splat,
1157 xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_arm_splat,
1158 xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat,
1159 xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_arm_splat,
1160 xnn_init_f32_minmax_scalar_params,
1161 6 /* mr */, 8 /* nr */);
1162 }
f32_gemm_3x8__wasmsimd_x86_splat(benchmark::State & state,models::ExecutionPlanFactory model)1163 static void f32_gemm_3x8__wasmsimd_x86_splat(benchmark::State& state, models::ExecutionPlanFactory model) {
1164 GEMMEnd2EndBenchmark(state, model,
1165 xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_splat,
1166 xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_x86_splat,
1167 xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat,
1168 xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_x86_splat,
1169 xnn_init_f32_minmax_scalar_params,
1170 3 /* mr */, 8 /* nr */);
1171 }
f32_gemm_4x8__wasmsimd_x86_splat(benchmark::State & state,models::ExecutionPlanFactory model)1172 static void f32_gemm_4x8__wasmsimd_x86_splat(benchmark::State& state, models::ExecutionPlanFactory model) {
1173 GEMMEnd2EndBenchmark(state, model,
1174 xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_splat,
1175 xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_x86_splat,
1176 xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat,
1177 xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_x86_splat,
1178 xnn_init_f32_minmax_scalar_params,
1179 4 /* mr */, 8 /* nr */);
1180 }
f32_gemm_5x8__wasmsimd_x86_splat(benchmark::State & state,models::ExecutionPlanFactory model)1181 static void f32_gemm_5x8__wasmsimd_x86_splat(benchmark::State& state, models::ExecutionPlanFactory model) {
1182 GEMMEnd2EndBenchmark(state, model,
1183 xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_x86_splat,
1184 xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_x86_splat,
1185 xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat,
1186 xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_x86_splat,
1187 xnn_init_f32_minmax_scalar_params,
1188 5 /* mr */, 8 /* nr */);
1189 }
f32_gemm_6x8__wasmsimd_x86_splat(benchmark::State & state,models::ExecutionPlanFactory model)1190 static void f32_gemm_6x8__wasmsimd_x86_splat(benchmark::State& state, models::ExecutionPlanFactory model) {
1191 GEMMEnd2EndBenchmark(state, model,
1192 xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_splat,
1193 xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_x86_splat,
1194 xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat,
1195 xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_x86_splat,
1196 xnn_init_f32_minmax_scalar_params,
1197 6 /* mr */, 8 /* nr */);
1198 }
f32_gemm_3x8s4__wasmsimd_arm(benchmark::State & state,models::ExecutionPlanFactory model)1199 static void f32_gemm_3x8s4__wasmsimd_arm(benchmark::State& state, models::ExecutionPlanFactory model) {
1200 GEMMEnd2EndBenchmark(state, model,
1201 xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_arm,
1202 xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_arm,
1203 xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_arm,
1204 xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_arm,
1205 xnn_init_f32_minmax_scalar_params,
1206 3 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
1207 }
f32_gemm_4x8s4__wasmsimd_arm(benchmark::State & state,models::ExecutionPlanFactory model)1208 static void f32_gemm_4x8s4__wasmsimd_arm(benchmark::State& state, models::ExecutionPlanFactory model) {
1209 GEMMEnd2EndBenchmark(state, model,
1210 xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_arm,
1211 xnn_f32_igemm_minmax_ukernel_4x8s4__wasmsimd_arm,
1212 xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_arm,
1213 xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_arm,
1214 xnn_init_f32_minmax_scalar_params,
1215 4 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
1216 }
f32_gemm_5x8s4__wasmsimd_arm(benchmark::State & state,models::ExecutionPlanFactory model)1217 static void f32_gemm_5x8s4__wasmsimd_arm(benchmark::State& state, models::ExecutionPlanFactory model) {
1218 GEMMEnd2EndBenchmark(state, model,
1219 xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_arm,
1220 xnn_f32_igemm_minmax_ukernel_5x8s4__wasmsimd_arm,
1221 xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_arm,
1222 xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_arm,
1223 xnn_init_f32_minmax_scalar_params,
1224 5 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
1225 }
f32_gemm_6x8s4__wasmsimd_arm(benchmark::State & state,models::ExecutionPlanFactory model)1226 static void f32_gemm_6x8s4__wasmsimd_arm(benchmark::State& state, models::ExecutionPlanFactory model) {
1227 GEMMEnd2EndBenchmark(state, model,
1228 xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm,
1229 xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_arm,
1230 xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_arm,
1231 xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_arm,
1232 xnn_init_f32_minmax_scalar_params,
1233 6 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
1234 }
f32_gemm_3x8s4__wasmsimd_x86(benchmark::State & state,models::ExecutionPlanFactory model)1235 static void f32_gemm_3x8s4__wasmsimd_x86(benchmark::State& state, models::ExecutionPlanFactory model) {
1236 GEMMEnd2EndBenchmark(state, model,
1237 xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86,
1238 xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_x86,
1239 xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_x86,
1240 xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_x86,
1241 xnn_init_f32_minmax_scalar_params,
1242 3 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
1243 }
f32_gemm_4x8s4__wasmsimd_x86(benchmark::State & state,models::ExecutionPlanFactory model)1244 static void f32_gemm_4x8s4__wasmsimd_x86(benchmark::State& state, models::ExecutionPlanFactory model) {
1245 GEMMEnd2EndBenchmark(state, model,
1246 xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_x86,
1247 xnn_f32_igemm_minmax_ukernel_4x8s4__wasmsimd_x86,
1248 xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_x86,
1249 xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_x86,
1250 xnn_init_f32_minmax_scalar_params,
1251 4 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
1252 }
f32_gemm_5x8s4__wasmsimd_x86(benchmark::State & state,models::ExecutionPlanFactory model)1253 static void f32_gemm_5x8s4__wasmsimd_x86(benchmark::State& state, models::ExecutionPlanFactory model) {
1254 GEMMEnd2EndBenchmark(state, model,
1255 xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_x86,
1256 xnn_f32_igemm_minmax_ukernel_5x8s4__wasmsimd_x86,
1257 xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_x86,
1258 xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_x86,
1259 xnn_init_f32_minmax_scalar_params,
1260 5 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
1261 }
f32_gemm_6x8s4__wasmsimd_x86(benchmark::State & state,models::ExecutionPlanFactory model)1262 static void f32_gemm_6x8s4__wasmsimd_x86(benchmark::State& state, models::ExecutionPlanFactory model) {
1263 GEMMEnd2EndBenchmark(state, model,
1264 xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86,
1265 xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_x86,
1266 xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_x86,
1267 xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_x86,
1268 xnn_init_f32_minmax_scalar_params,
1269 6 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
1270 }
1271
  // Hand each WAsm SIMD wrapper to BENCHMARK_FP32_END2END, one group per kernel
  // variant. NOTE(review): the macro is not defined in this file; presumably it
  // registers the wrapper for each FP32 end-to-end model - confirm in
  // bench/end2end.h.

  // arm_loadsplat variants.
  BENCHMARK_FP32_END2END(f32_gemm_3x8__wasmsimd_arm_loadsplat);
  BENCHMARK_FP32_END2END(f32_gemm_4x8__wasmsimd_arm_loadsplat);
  BENCHMARK_FP32_END2END(f32_gemm_5x8__wasmsimd_arm_loadsplat);
  BENCHMARK_FP32_END2END(f32_gemm_6x8__wasmsimd_arm_loadsplat);

  // x86_loadsplat variants.
  BENCHMARK_FP32_END2END(f32_gemm_3x8__wasmsimd_x86_loadsplat);
  BENCHMARK_FP32_END2END(f32_gemm_4x8__wasmsimd_x86_loadsplat);
  BENCHMARK_FP32_END2END(f32_gemm_5x8__wasmsimd_x86_loadsplat);
  BENCHMARK_FP32_END2END(f32_gemm_6x8__wasmsimd_x86_loadsplat);

  // arm_splat variants.
  BENCHMARK_FP32_END2END(f32_gemm_3x8__wasmsimd_arm_splat);
  BENCHMARK_FP32_END2END(f32_gemm_4x8__wasmsimd_arm_splat);
  BENCHMARK_FP32_END2END(f32_gemm_5x8__wasmsimd_arm_splat);
  BENCHMARK_FP32_END2END(f32_gemm_6x8__wasmsimd_arm_splat);

  // x86_splat variants.
  BENCHMARK_FP32_END2END(f32_gemm_3x8__wasmsimd_x86_splat);
  BENCHMARK_FP32_END2END(f32_gemm_4x8__wasmsimd_x86_splat);
  BENCHMARK_FP32_END2END(f32_gemm_5x8__wasmsimd_x86_splat);
  BENCHMARK_FP32_END2END(f32_gemm_6x8__wasmsimd_x86_splat);

  // 8s4 arm variants.
  BENCHMARK_FP32_END2END(f32_gemm_3x8s4__wasmsimd_arm);
  BENCHMARK_FP32_END2END(f32_gemm_4x8s4__wasmsimd_arm);
  BENCHMARK_FP32_END2END(f32_gemm_5x8s4__wasmsimd_arm);
  BENCHMARK_FP32_END2END(f32_gemm_6x8s4__wasmsimd_arm);

  // 8s4 x86 variants.
  BENCHMARK_FP32_END2END(f32_gemm_3x8s4__wasmsimd_x86);
  BENCHMARK_FP32_END2END(f32_gemm_4x8s4__wasmsimd_x86);
  BENCHMARK_FP32_END2END(f32_gemm_5x8s4__wasmsimd_x86);
  BENCHMARK_FP32_END2END(f32_gemm_6x8s4__wasmsimd_x86);
1301 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
1302
1303
1304 #if XNN_ARCH_WASM
f32_gemm_2x4__wasm(benchmark::State & state,models::ExecutionPlanFactory model)1305 static void f32_gemm_2x4__wasm(benchmark::State& state, models::ExecutionPlanFactory model) {
1306 GEMMEnd2EndBenchmark(state, model,
1307 xnn_f32_gemm_minmax_ukernel_2x4__wasm,
1308 xnn_f32_igemm_minmax_ukernel_2x4__wasm,
1309 xnn_f32_gemm_minmax_ukernel_1x4__wasm,
1310 xnn_f32_igemm_minmax_ukernel_1x4__wasm,
1311 xnn_init_f32_minmax_scalar_params,
1312 2 /* mr */, 4 /* nr */);
1313 }
1314
f32_gemm_4x4__wasm(benchmark::State & state,models::ExecutionPlanFactory model)1315 static void f32_gemm_4x4__wasm(benchmark::State& state, models::ExecutionPlanFactory model) {
1316 GEMMEnd2EndBenchmark(state, model,
1317 xnn_f32_gemm_minmax_ukernel_4x4__wasm,
1318 xnn_f32_igemm_minmax_ukernel_4x4__wasm,
1319 xnn_f32_gemm_minmax_ukernel_1x4__wasm,
1320 xnn_f32_igemm_minmax_ukernel_1x4__wasm,
1321 xnn_init_f32_minmax_scalar_params,
1322 4 /* mr */, 4 /* nr */);
1323 }
1324
  // Hand the `wasm` wrappers to BENCHMARK_FP32_END2END. NOTE(review): the macro
  // is not defined in this file; presumably it registers each wrapper for the
  // FP32 end-to-end models - confirm in bench/end2end.h.
  BENCHMARK_FP32_END2END(f32_gemm_2x4__wasm);
  BENCHMARK_FP32_END2END(f32_gemm_4x4__wasm);
1327 #endif // XNN_ARCH_WASM
1328
1329
f32_gemm_2x4__scalar(benchmark::State & state,models::ExecutionPlanFactory model)1330 static void f32_gemm_2x4__scalar(benchmark::State& state, models::ExecutionPlanFactory model) {
1331 GEMMEnd2EndBenchmark(state, model,
1332 xnn_f32_gemm_minmax_ukernel_2x4__scalar,
1333 xnn_f32_igemm_minmax_ukernel_2x4__scalar,
1334 xnn_f32_gemm_minmax_ukernel_1x4__scalar,
1335 xnn_f32_igemm_minmax_ukernel_1x4__scalar,
1336 xnn_init_f32_minmax_scalar_params,
1337 2 /* mr */, 4 /* nr */);
1338 }
1339
f32_gemm_4x4__scalar(benchmark::State & state,models::ExecutionPlanFactory model)1340 static void f32_gemm_4x4__scalar(benchmark::State& state, models::ExecutionPlanFactory model) {
1341 GEMMEnd2EndBenchmark(state, model,
1342 xnn_f32_gemm_minmax_ukernel_4x4__scalar,
1343 xnn_f32_igemm_minmax_ukernel_4x4__scalar,
1344 xnn_f32_gemm_minmax_ukernel_1x4__scalar,
1345 xnn_f32_igemm_minmax_ukernel_1x4__scalar,
1346 xnn_init_f32_minmax_scalar_params,
1347 4 /* mr */, 4 /* nr */);
1348 }
1349
// Hand the `scalar` wrappers to BENCHMARK_FP32_END2END; unlike the groups
// above, these sit outside any architecture #if. NOTE(review): the macro is
// not defined in this file; presumably it registers each wrapper for the FP32
// end-to-end models - confirm in bench/end2end.h.
BENCHMARK_FP32_END2END(f32_gemm_2x4__scalar);
BENCHMARK_FP32_END2END(f32_gemm_4x4__scalar);
1352
1353
// Expand Google Benchmark's BENCHMARK_MAIN() (which supplies the program's
// main) unless the build defines XNNPACK_BENCHMARK_NO_MAIN.
// NOTE(review): presumably that flag is set when another translation unit
// provides main - confirm against the build files.
#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif
1357