// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <algorithm>
#include <cfloat>
#include <cmath>
#include <functional>
#include <limits>
#include <random>
#include <vector>

#include <benchmark/benchmark.h>
#include "bench/conv.h"
#include "bench/utils.h"
#include <xnnpack/AlignedAllocator.h>
#include <xnnpack/common.h>
#include <xnnpack/igemm.h>
#include <xnnpack/indirection.h>
#include <xnnpack/operator.h>
#include <xnnpack/pack.h>
#include <xnnpack/params-init.h>
#include <xnnpack/params.h>

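// Benchmarks one f32 IGEMM (indirect GEMM) minmax microkernel on the convolution
// shape passed via the benchmark arguments: input HxW, kernel HxW, padding,
// subsampling (stride), dilation, and per-group input/output channel counts.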
static void IGEMMBenchmark(benchmark::State& state,
    xnn_f32_igemm_minmax_ukernel_function f32_igemm,
    uint32_t mr, uint32_t nr, uint32_t kr, uint32_t sr,
    xnn_init_f32_minmax_params_fn init_params,
    benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
  if (isa_check && !isa_check(state)) {
    return;
  }

  const size_t input_height = state.range(0);
  const size_t input_width = state.range(1);
  const size_t kernel_height = state.range(2);
  const size_t kernel_width = state.range(3);
  const size_t kernel_size = kernel_height * kernel_width;
  const size_t padding_height = state.range(4);
  const size_t padding_width = state.range(5);
  const size_t subsampling = state.range(6);
  const size_t dilation = state.range(7);
  const size_t group_input_channels = state.range(8);
  const size_t group_output_channels = state.range(9);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), std::ref(rng));

  const size_t output_pixel_stride = group_output_channels;
  const size_t input_pixel_stride = group_input_channels;
  const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
  const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
  const size_t padding_left = padding_width / 2;
  const size_t padding_top = padding_height / 2;
  const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
  const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
  const size_t output_size = output_height * output_width;

  const size_t mc_stride = benchmark::utils::RoundUp<size_t>(output_size, mr);
  const size_t nc_stride = benchmark::utils::RoundUp<size_t>(group_output_channels, nr);
  const size_t kc_stride = benchmark::utils::RoundUp<size_t>(group_input_channels, kr * sr);

  std::vector<float> a(input_height * input_width * input_pixel_stride);
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(group_output_channels * kernel_height * kernel_width * group_input_channels);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(group_output_channels);
  std::generate(b.begin(), b.end(), std::ref(f32rng));

  std::vector<float> z(group_input_channels);

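  // Allocate enough independent copies of the packed weights, indirection
  // pointers, and outputs to exceed the last-level cache, so that rotating
  // through them keeps each timed iteration working on cold data.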
  const size_t w_elements = kernel_size * kc_stride * nc_stride + nc_stride;
  const size_t i_elements = mc_stride * kernel_size;
  const size_t c_elements = output_height * output_width * output_pixel_stride;
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(float) * (w_elements + c_elements) + sizeof(void*) * i_elements);

  std::vector<float, AlignedAllocator<float, 64>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), 0.0f);
  xnn_pack_f32_conv_goki_w(
    1 /* groups */, group_output_channels, kernel_size, group_input_channels,
    nr, kr, sr, k.data(), b.data(), w.data(), 0 /* extra bytes */, nullptr);
  for (size_t n = 1; n < num_buffers; n++) {
    std::copy(w.cbegin(), w.cbegin() + w_elements, w.begin() + n * w_elements);
  }

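  // Fill the indirection buffer through a stub convolution operator descriptor:
  // xnn_indirection_init_conv2d() points each kernel tap at the corresponding
  // input pixel, and out-of-range taps at the zero buffer.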
  std::vector<const float*> i(i_elements * num_buffers);
  xnn_operator convolution_op = { };
  convolution_op.indirection_buffer = reinterpret_cast<const void**>(i.data());
  convolution_op.input = a.data();
  convolution_op.input_pixel_stride = input_pixel_stride;
  convolution_op.zero_buffer = z.data();
  convolution_op.groups = 1;
  convolution_op.group_input_channels = group_input_channels;
  convolution_op.batch_size = 1;
  convolution_op.input_height = input_height;
  convolution_op.input_width = input_width;
  convolution_op.output_height = output_height;
  convolution_op.output_width = output_width;
  convolution_op.kernel_height = kernel_height;
  convolution_op.kernel_width = kernel_width;
  convolution_op.stride_height = subsampling;
  convolution_op.stride_width = subsampling;
  convolution_op.dilation_height = dilation;
  convolution_op.dilation_width = dilation;
  convolution_op.padding_top = padding_top;
  convolution_op.padding_left = padding_left;
  xnn_indirection_init_conv2d(&convolution_op, mr, 2 /* log2(sizeof(float)) */);
  for (size_t n = 1; n < num_buffers; n++) {
    std::copy(i.cbegin(), i.cbegin() + i_elements, i.begin() + n * i_elements);
  }

  std::vector<float> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), std::nanf(""));

  xnn_f32_minmax_params params;
  init_params(&params,
    -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());

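  // Each timed iteration rotates to a cold set of buffers and tiles the output
  // pixels in blocks of up to mr rows, one microkernel call per block.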
  size_t buffer_index = 0;
  for (auto _ : state) {
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    for (uint32_t m = 0; m < output_size; m += mr) {
      const uint32_t mb = std::min<uint32_t>(output_size - m, mr);
      f32_igemm(
        mb, group_output_channels, group_input_channels * sizeof(float), kernel_size * mr * sizeof(void*),
        i.data() + buffer_index * i_elements + m,
        w.data() + buffer_index * w_elements,
        c.data() + buffer_index * c_elements + m * group_output_channels, group_output_channels * sizeof(float), nr * sizeof(float),
        0, z.data(), &params);
    }
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 *
      output_height * output_width *
      group_input_channels * group_output_channels *
      kernel_height * kernel_width,
    benchmark::Counter::kIsRate);
}

#if XNN_PLATFORM_JIT
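// Same benchmark, but the microkernel is JIT-generated: the code generator is
// invoked once to emit an f32 IGEMM minmax kernel into a code buffer, and the
// resulting function pointer is then timed exactly like the prebuilt kernels.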
static void IGEMMBenchmark(benchmark::State& state,
    xnn_jit_igemm_code_generator_function generator,
    size_t mr, size_t nr, size_t kr, size_t sr,
    xnn_init_f32_minmax_params_fn init_params,
    benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
  if (isa_check && !isa_check(state)) {
    return;
  }

  const size_t input_height = state.range(0);
  const size_t input_width = state.range(1);
  const size_t kernel_height = state.range(2);
  const size_t kernel_width = state.range(3);
  const size_t kernel_size = kernel_height * kernel_width;
  const size_t padding_height = state.range(4);
  const size_t padding_width = state.range(5);
  const size_t subsampling = state.range(6);
  const size_t dilation = state.range(7);
  const size_t group_input_channels = state.range(8);
  const size_t group_output_channels = state.range(9);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), std::ref(rng));

  const size_t output_pixel_stride = group_output_channels;
  const size_t input_pixel_stride = group_input_channels;
  const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
  const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
  const size_t padding_left = padding_width / 2;
  const size_t padding_top = padding_height / 2;
  const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
  const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
  const size_t output_size = output_height * output_width;

  const size_t mc_stride = benchmark::utils::RoundUp<size_t>(output_size, mr);
  const size_t nc_stride = benchmark::utils::RoundUp<size_t>(group_output_channels, nr);
  const size_t kc_stride = benchmark::utils::RoundUp<size_t>(group_input_channels, kr * sr);

  std::vector<float> a(input_height * input_width * input_pixel_stride);
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(group_output_channels * kernel_height * kernel_width * group_input_channels);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(group_output_channels);
  std::generate(b.begin(), b.end(), std::ref(f32rng));

  std::vector<float> z(group_input_channels);

  const size_t w_elements = kernel_size * kc_stride * nc_stride + nc_stride;
  const size_t i_elements = mc_stride * kernel_size;
  const size_t c_elements = output_height * output_width * output_pixel_stride;
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(float) * (w_elements + c_elements) + sizeof(void*) * i_elements);

  std::vector<float, AlignedAllocator<float, 64>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), 0.0f);
  xnn_pack_f32_conv_goki_w(
    1 /* groups */, group_output_channels, kernel_size, group_input_channels,
    nr, kr, sr, k.data(), b.data(), w.data(), 0 /* extra bytes */, nullptr);
  for (size_t n = 1; n < num_buffers; n++) {
    std::copy(w.cbegin(), w.cbegin() + w_elements, w.begin() + n * w_elements);
  }

  std::vector<const float*> i(i_elements * num_buffers);
  xnn_operator convolution_op = { };
  convolution_op.indirection_buffer = reinterpret_cast<const void**>(i.data());
  convolution_op.input = a.data();
  convolution_op.input_pixel_stride = input_pixel_stride;
  convolution_op.zero_buffer = z.data();
  convolution_op.groups = 1;
  convolution_op.group_input_channels = group_input_channels;
  convolution_op.batch_size = 1;
  convolution_op.input_height = input_height;
  convolution_op.input_width = input_width;
  convolution_op.output_height = output_height;
  convolution_op.output_width = output_width;
  convolution_op.kernel_height = kernel_height;
  convolution_op.kernel_width = kernel_width;
  convolution_op.stride_height = subsampling;
  convolution_op.stride_width = subsampling;
  convolution_op.dilation_height = dilation;
  convolution_op.dilation_width = dilation;
  convolution_op.padding_top = padding_top;
  convolution_op.padding_left = padding_left;
  xnn_indirection_init_conv2d(&convolution_op, mr, 2 /* log2(sizeof(float)) */);
  for (size_t n = 1; n < num_buffers; n++) {
    std::copy(i.cbegin(), i.cbegin() + i_elements, i.begin() + n * i_elements);
  }

  std::vector<float> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), std::nanf(""));

  xnn_f32_minmax_params params;
  init_params(&params,
    -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());

  jit_gemm_params jit_params = {
    .f32_minmax = {
      .min = -std::numeric_limits<float>::infinity(),
      .max = +std::numeric_limits<float>::infinity()
    }
  };

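  // Emit the microkernel into an executable code buffer and cast the emitted
  // code to the same ukernel function type used by the prebuilt benchmarks.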
  xnn_code_buffer code_buffer;
  xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE);
  generator(&code_buffer, group_output_channels, group_input_channels * sizeof(float), kernel_size * mr * sizeof(void*), &jit_params);
  auto f32_igemm = reinterpret_cast<xnn_f32_igemm_minmax_ukernel_function>(code_buffer.code);

  size_t buffer_index = 0;
  for (auto _ : state) {
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    for (uint32_t m = 0; m < output_size; m += mr) {
      const uint32_t mb = std::min<uint32_t>(output_size - m, mr);
      f32_igemm(
        mb, group_output_channels, group_input_channels * sizeof(float), kernel_size * mr * sizeof(void*),
        i.data() + buffer_index * i_elements + m,
        w.data() + buffer_index * w_elements,
        c.data() + buffer_index * c_elements + m * group_output_channels, group_output_channels * sizeof(float), nr * sizeof(float),
        0, z.data(), &params);
    }
  }
  xnn_release_code_memory(&code_buffer);

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 *
      output_height * output_width *
      group_input_channels * group_output_channels *
      kernel_height * kernel_width,
    benchmark::Counter::kIsRate);
}
#endif  // XNN_PLATFORM_JIT

#if XNN_ARCH_ARM64 && XNN_PLATFORM_JIT
static void jit_f32_igemm_1x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_generate_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75, 1, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void jit_f32_igemm_1x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_generate_f32_igemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75, 1, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void jit_f32_igemm_6x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_generate_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void jit_f32_igemm_6x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_generate_f32_igemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}

BENCHMARK_CONV(jit_f32_igemm_1x8__aarch64_neonfma_cortex_a75)
BENCHMARK_CONV(jit_f32_igemm_1x8__aarch64_neonfma_prfm_cortex_a75)
BENCHMARK_CONV(jit_f32_igemm_6x8__aarch64_neonfma_cortex_a75)
BENCHMARK_CONV(jit_f32_igemm_6x8__aarch64_neonfma_prfm_cortex_a75)
#endif  // XNN_ARCH_ARM64 && XNN_PLATFORM_JIT

#if XNN_ARCH_ARM && XNN_PLATFORM_JIT
static void jit_f32_igemm_4x8__aarch32_neon_ld64(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_generate_f32_igemm_ukernel_4x8__aarch32_neon_ld64, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void jit_f32_igemm_4x8__aarch32_neon_cortex_a7(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_generate_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a7, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void jit_f32_igemm_4x8__aarch32_neon_cortex_a53(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_generate_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a53, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void jit_f32_igemm_4x8__aarch32_neon_cortex_a55(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_generate_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void jit_f32_igemm_4x8__aarch32_neon_prfm_cortex_a75(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_generate_f32_igemm_ukernel_4x8__aarch32_neon_prfm_cortex_a75, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void jit_f32_igemm_4x8__aarch32_neon_cortex_a75(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_generate_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}

BENCHMARK_CONV(jit_f32_igemm_4x8__aarch32_neon_ld64)
BENCHMARK_CONV(jit_f32_igemm_4x8__aarch32_neon_cortex_a7)
BENCHMARK_CONV(jit_f32_igemm_4x8__aarch32_neon_cortex_a53)
BENCHMARK_CONV(jit_f32_igemm_4x8__aarch32_neon_cortex_a55)
BENCHMARK_CONV(jit_f32_igemm_4x8__aarch32_neon_prfm_cortex_a75)
BENCHMARK_CONV(jit_f32_igemm_4x8__aarch32_neon_cortex_a75)
#endif  // XNN_ARCH_ARM && XNN_PLATFORM_JIT

#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
static void f32_igemm_4x8__aarch32_neon_ld64(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_ld64, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_4x8__aarch32_neon_cortex_a7(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a7, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_4x8__aarch32_neon_cortex_a53(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_4x8__aarch32_neon_cortex_a55(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_4x8__aarch32_neon_prfm_cortex_a75(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a75, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_4x8__aarch32_neon_cortex_a75(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}

BENCHMARK_CONV(f32_igemm_4x8__aarch32_neon_ld64)
BENCHMARK_CONV(f32_igemm_4x8__aarch32_neon_cortex_a7)
BENCHMARK_CONV(f32_igemm_4x8__aarch32_neon_cortex_a53)
BENCHMARK_CONV(f32_igemm_4x8__aarch32_neon_cortex_a55)
BENCHMARK_CONV(f32_igemm_4x8__aarch32_neon_prfm_cortex_a75)
BENCHMARK_CONV(f32_igemm_4x8__aarch32_neon_cortex_a75)
#endif  // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY


#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
static void f32_igemm_1x12__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_1x12__aarch64_neonfma_cortex_a53, 1, 12, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_1x8__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53, 1, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_1x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75, 1, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_1x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75, 1, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_4x8__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_4x8__aarch64_neonfma_cortex_a55(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_4x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_4x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_4x8__aarch64_neonfma_ld64(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_ld64, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_4x8__aarch64_neonfma_ld128(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_ld128, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_5x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_5x8__aarch64_neonfma_cortex_a75, 5, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_5x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75, 5, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_4x12__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x12__aarch64_neonfma_cortex_a53, 4, 12, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_6x8__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_6x8__aarch64_neonfma_cortex_a55(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_6x8__aarch64_neonfma_cortex_a73(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a73, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_6x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_6x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_6x8__aarch64_neonfma_ld64(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_ld64, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_6x8__aarch64_neonfma_ld128(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_ld128, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_1x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64, 1, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_4x2__neonfma_lane_ld64(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x2__neonfma_lane_ld64, 4, 2, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_4x4__neonfma_lane_ld64(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x4__neonfma_lane_ld64, 4, 4, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_4x8__neonfma_lane_ld128(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__neonfma_lane_ld128, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_4x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__neonfma_lane_ld64, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_6x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld64, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_6x8__neonfma_lane_ld128(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld128, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}

BENCHMARK_CONV(f32_igemm_1x12__aarch64_neonfma_cortex_a53)
BENCHMARK_CONV(f32_igemm_1x8__aarch64_neonfma_cortex_a53)
BENCHMARK_CONV(f32_igemm_1x8__aarch64_neonfma_cortex_a75)
BENCHMARK_CONV(f32_igemm_1x8__aarch64_neonfma_prfm_cortex_a75)
BENCHMARK_CONV(f32_igemm_4x12__aarch64_neonfma_cortex_a53)
BENCHMARK_CONV(f32_igemm_4x8__aarch64_neonfma_cortex_a53)
BENCHMARK_CONV(f32_igemm_4x8__aarch64_neonfma_cortex_a55)
BENCHMARK_CONV(f32_igemm_4x8__aarch64_neonfma_cortex_a75)
BENCHMARK_CONV(f32_igemm_4x8__aarch64_neonfma_prfm_cortex_a75)
BENCHMARK_CONV(f32_igemm_4x8__aarch64_neonfma_ld64)
BENCHMARK_CONV(f32_igemm_4x8__aarch64_neonfma_ld128)
BENCHMARK_CONV(f32_igemm_5x8__aarch64_neonfma_cortex_a75)
BENCHMARK_CONV(f32_igemm_5x8__aarch64_neonfma_prfm_cortex_a75)
BENCHMARK_CONV(f32_igemm_6x8__aarch64_neonfma_cortex_a53)
BENCHMARK_CONV(f32_igemm_6x8__aarch64_neonfma_cortex_a55)
BENCHMARK_CONV(f32_igemm_6x8__aarch64_neonfma_cortex_a73)
BENCHMARK_CONV(f32_igemm_6x8__aarch64_neonfma_cortex_a75)
BENCHMARK_CONV(f32_igemm_6x8__aarch64_neonfma_prfm_cortex_a75)
BENCHMARK_CONV(f32_igemm_6x8__aarch64_neonfma_ld64)
BENCHMARK_CONV(f32_igemm_6x8__aarch64_neonfma_ld128)
BENCHMARK_CONV(f32_igemm_1x8__neonfma_lane_ld64)
BENCHMARK_CONV(f32_igemm_4x2__neonfma_lane_ld64)
BENCHMARK_CONV(f32_igemm_4x4__neonfma_lane_ld64)
BENCHMARK_CONV(f32_igemm_4x8__neonfma_lane_ld128)
BENCHMARK_CONV(f32_igemm_4x8__neonfma_lane_ld64)
BENCHMARK_CONV(f32_igemm_6x8__neonfma_lane_ld64)
BENCHMARK_CONV(f32_igemm_6x8__neonfma_lane_ld128)
#endif  // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY


#if XNN_ARCH_ARM || XNN_ARCH_ARM64
static void f32_igemm_1x8__neon_lane_ld64(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64, 1, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_igemm_4x2__neon_lane_ld64(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x2__neon_lane_ld64, 4, 2, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_igemm_4x4__neon_lane_ld64(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x4__neon_lane_ld64, 4, 4, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_igemm_4x8__neon_lane_ld64(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__neon_lane_ld64, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_igemm_4x8__neon_lane_ld128(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__neon_lane_ld128, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_igemm_6x8__neon_lane_ld64(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__neon_lane_ld64, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_igemm_6x8__neon_lane_ld128(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__neon_lane_ld128, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_igemm_1x8__neon_dup_ld64(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_1x8__neon_dup_ld64, 1, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_igemm_4x8__neon_dup_ld128(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__neon_dup_ld128, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_igemm_4x8__neon_dup_ld64(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__neon_dup_ld64, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_igemm_6x8__neon_dup_ld64(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__neon_dup_ld64, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_igemm_6x8__neon_dup_ld128(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__neon_dup_ld128, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_igemm_1x8__neonfma_dup_ld64(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_1x8__neonfma_dup_ld64, 1, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
}
static void f32_igemm_4x8__neonfma_dup_ld128(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__neonfma_dup_ld128, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
}
static void f32_igemm_4x8__neonfma_dup_ld64(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__neonfma_dup_ld64, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
}
static void f32_igemm_6x8__neonfma_dup_ld64(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__neonfma_dup_ld64, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
}
static void f32_igemm_6x8__neonfma_dup_ld128(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__neonfma_dup_ld128, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
}
static void f32_igemm_1x8s4__neon(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_1x8s4__neon, 1, 8, 1, 4,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_igemm_4x8s4__neon(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8s4__neon, 4, 8, 1, 4,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_igemm_6x8s4__neon(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8s4__neon, 6, 8, 1, 4,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_igemm_8x8s4__neon(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_8x8s4__neon, 8, 8, 1, 4,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_igemm_1x8s4__neonfma(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_1x8s4__neonfma, 1, 8, 1, 4,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
}
static void f32_igemm_4x8s4__neonfma(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8s4__neonfma, 4, 8, 1, 4,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
}
static void f32_igemm_6x8s4__neonfma(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8s4__neonfma, 6, 8, 1, 4,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
}
static void f32_igemm_8x8s4__neonfma(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_8x8s4__neonfma, 8, 8, 1, 4,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
}

BENCHMARK_CONV(f32_igemm_1x8__neon_lane_ld64)
BENCHMARK_CONV(f32_igemm_4x2__neon_lane_ld64)
BENCHMARK_CONV(f32_igemm_4x4__neon_lane_ld64)
BENCHMARK_CONV(f32_igemm_4x8__neon_lane_ld128)
BENCHMARK_CONV(f32_igemm_4x8__neon_lane_ld64)
BENCHMARK_CONV(f32_igemm_6x8__neon_lane_ld64)
BENCHMARK_CONV(f32_igemm_6x8__neon_lane_ld128)
BENCHMARK_CONV(f32_igemm_1x8__neon_dup_ld64)
BENCHMARK_CONV(f32_igemm_4x8__neon_dup_ld128)
BENCHMARK_CONV(f32_igemm_4x8__neon_dup_ld64)
BENCHMARK_CONV(f32_igemm_6x8__neon_dup_ld64)
BENCHMARK_CONV(f32_igemm_6x8__neon_dup_ld128)
BENCHMARK_CONV(f32_igemm_1x8__neonfma_dup_ld64)
BENCHMARK_CONV(f32_igemm_4x8__neonfma_dup_ld128)
BENCHMARK_CONV(f32_igemm_4x8__neonfma_dup_ld64)
BENCHMARK_CONV(f32_igemm_6x8__neonfma_dup_ld64)
BENCHMARK_CONV(f32_igemm_6x8__neonfma_dup_ld128)
BENCHMARK_CONV(f32_igemm_1x8s4__neon)
BENCHMARK_CONV(f32_igemm_4x8s4__neon)
BENCHMARK_CONV(f32_igemm_6x8s4__neon)
BENCHMARK_CONV(f32_igemm_8x8s4__neon)
BENCHMARK_CONV(f32_igemm_1x8s4__neonfma)
BENCHMARK_CONV(f32_igemm_4x8s4__neonfma)
BENCHMARK_CONV(f32_igemm_6x8s4__neonfma)
BENCHMARK_CONV(f32_igemm_8x8s4__neonfma)
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64


#if XNN_ARCH_X86 || XNN_ARCH_X86_64
static void f32_igemm_1x8__sse_load1(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_1x8__sse_load1, 1, 8, 1, 1,
    xnn_init_f32_minmax_sse_params);
}
static void f32_igemm_3x8__sse_load1(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_3x8__sse_load1, 3, 8, 1, 1,
    xnn_init_f32_minmax_sse_params);
}
static void f32_igemm_4x8__sse_load1(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__sse_load1, 4, 8, 1, 1,
    xnn_init_f32_minmax_sse_params);
}
static void f32_igemm_5x8__sse_load1(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_5x8__sse_load1, 5, 8, 1, 1,
    xnn_init_f32_minmax_sse_params);
}

static void f32_igemm_1x8__sse_dup(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_1x8__sse_dup, 1, 8, 1, 1,
    xnn_init_f32_minmax_sse_params);
}
static void f32_igemm_3x8__sse_dup(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_3x8__sse_dup, 3, 8, 1, 1,
    xnn_init_f32_minmax_sse_params);
}
static void f32_igemm_4x8__sse_dup(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__sse_dup, 4, 8, 1, 1,
    xnn_init_f32_minmax_sse_params);
}
static void f32_igemm_5x8__sse_dup(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_5x8__sse_dup, 5, 8, 1, 1,
    xnn_init_f32_minmax_sse_params);
}

static void f32_igemm_1x8s4__sse(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_1x8s4__sse, 1, 8, 1, 4,
    xnn_init_f32_minmax_sse_params);
}
static void f32_igemm_3x8s4__sse(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_3x8s4__sse, 3, 8, 1, 4,
    xnn_init_f32_minmax_sse_params);
}
static void f32_igemm_4x8s4__sse(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8s4__sse, 4, 8, 1, 4,
    xnn_init_f32_minmax_sse_params);
}
static void f32_igemm_5x8s4__sse(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_5x8s4__sse, 5, 8, 1, 4,
    xnn_init_f32_minmax_sse_params);
}

static void f32_igemm_1x8__sse2_dup(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_1x8__sse2_dup, 1, 8, 1, 1,
    xnn_init_f32_minmax_sse_params);
}
static void f32_igemm_3x8__sse2_dup(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_3x8__sse2_dup, 3, 8, 1, 1,
    xnn_init_f32_minmax_sse_params);
}
static void f32_igemm_4x8__sse2_dup(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__sse2_dup, 4, 8, 1, 1,
    xnn_init_f32_minmax_sse_params);
}
static void f32_igemm_5x8__sse2_dup(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_5x8__sse2_dup, 5, 8, 1, 1,
    xnn_init_f32_minmax_sse_params);
}

static void f32_igemm_1x8__avx_broadcast(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_1x8__avx_broadcast, 1, 8, 1, 1,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
}
static void f32_igemm_4x8__avx_broadcast(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__avx_broadcast, 4, 8, 1, 1,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
}
static void f32_igemm_5x8__avx_broadcast(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_5x8__avx_broadcast, 5, 8, 1, 1,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
}
static void f32_igemm_6x8__avx_broadcast(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__avx_broadcast, 6, 8, 1, 1,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
}
static void f32_igemm_7x8__avx_broadcast(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_7x8__avx_broadcast, 7, 8, 1, 1,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
}

static void f32_igemm_1x8__fma3_broadcast(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_1x8__fma3_broadcast, 1, 8, 1, 1,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
}
static void f32_igemm_4x8__fma3_broadcast(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__fma3_broadcast, 4, 8, 1, 1,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
}
static void f32_igemm_5x8__fma3_broadcast(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_5x8__fma3_broadcast, 5, 8, 1, 1,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
}
static void f32_igemm_6x8__fma3_broadcast(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__fma3_broadcast, 6, 8, 1, 1,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
}
static void f32_igemm_7x8__fma3_broadcast(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_7x8__fma3_broadcast, 7, 8, 1, 1,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
}
static void f32_igemm_8x8__fma3_broadcast(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_8x8__fma3_broadcast, 8, 8, 1, 1,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
}

static void f32_igemm_1x16__avx512f_broadcast(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_1x16__avx512f_broadcast, 1, 16, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckFMA3);
}
static void f32_igemm_4x16__avx512f_broadcast(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x16__avx512f_broadcast, 4, 16, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckFMA3);
}
static void f32_igemm_5x16__avx512f_broadcast(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_5x16__avx512f_broadcast, 5, 16, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckFMA3);
}
static void f32_igemm_6x16__avx512f_broadcast(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x16__avx512f_broadcast, 6, 16, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckFMA3);
}
static void f32_igemm_7x16__avx512f_broadcast(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_7x16__avx512f_broadcast, 7, 16, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckFMA3);
}
static void f32_igemm_8x16__avx512f_broadcast(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_8x16__avx512f_broadcast, 8, 16, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckFMA3);
}

BENCHMARK_CONV(f32_igemm_1x8__sse_load1)
BENCHMARK_CONV(f32_igemm_3x8__sse_load1)
BENCHMARK_CONV(f32_igemm_4x8__sse_load1)
BENCHMARK_CONV(f32_igemm_5x8__sse_load1)

BENCHMARK_CONV(f32_igemm_1x8__sse_dup)
BENCHMARK_CONV(f32_igemm_3x8__sse_dup)
BENCHMARK_CONV(f32_igemm_4x8__sse_dup)
BENCHMARK_CONV(f32_igemm_5x8__sse_dup)

BENCHMARK_CONV(f32_igemm_1x8s4__sse)
BENCHMARK_CONV(f32_igemm_3x8s4__sse)
BENCHMARK_CONV(f32_igemm_4x8s4__sse)
BENCHMARK_CONV(f32_igemm_5x8s4__sse)

BENCHMARK_CONV(f32_igemm_1x8__sse2_dup)
BENCHMARK_CONV(f32_igemm_3x8__sse2_dup)
BENCHMARK_CONV(f32_igemm_4x8__sse2_dup)
BENCHMARK_CONV(f32_igemm_5x8__sse2_dup)

BENCHMARK_CONV(f32_igemm_1x8__avx_broadcast)
BENCHMARK_CONV(f32_igemm_4x8__avx_broadcast)
BENCHMARK_CONV(f32_igemm_5x8__avx_broadcast)
BENCHMARK_CONV(f32_igemm_6x8__avx_broadcast)
BENCHMARK_CONV(f32_igemm_7x8__avx_broadcast)

BENCHMARK_CONV(f32_igemm_1x8__fma3_broadcast)
BENCHMARK_CONV(f32_igemm_4x8__fma3_broadcast)
BENCHMARK_CONV(f32_igemm_5x8__fma3_broadcast)
BENCHMARK_CONV(f32_igemm_6x8__fma3_broadcast)
BENCHMARK_CONV(f32_igemm_7x8__fma3_broadcast)
BENCHMARK_CONV(f32_igemm_8x8__fma3_broadcast)

BENCHMARK_CONV(f32_igemm_1x16__avx512f_broadcast)
BENCHMARK_CONV(f32_igemm_4x16__avx512f_broadcast)
BENCHMARK_CONV(f32_igemm_5x16__avx512f_broadcast)
BENCHMARK_CONV(f32_igemm_6x16__avx512f_broadcast)
BENCHMARK_CONV(f32_igemm_7x16__avx512f_broadcast)
BENCHMARK_CONV(f32_igemm_8x16__avx512f_broadcast)
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64


#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
static void f32_igemm_3x8__wasmsimd_arm_loadsplat(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_arm_loadsplat, 3, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_igemm_4x8__wasmsimd_arm_loadsplat(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_arm_loadsplat, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_igemm_5x8__wasmsimd_arm_loadsplat(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_arm_loadsplat, 5, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_igemm_6x8__wasmsimd_arm_loadsplat(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_arm_loadsplat, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_igemm_3x8__wasmsimd_x86_loadsplat(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat, 3, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_igemm_4x8__wasmsimd_x86_loadsplat(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_igemm_5x8__wasmsimd_x86_loadsplat(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_x86_loadsplat, 5, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_igemm_6x8__wasmsimd_x86_loadsplat(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_x86_loadsplat, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_igemm_3x8__wasmsimd_arm_splat(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_arm_splat, 3, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_igemm_4x8__wasmsimd_arm_splat(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_arm_splat, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_igemm_5x8__wasmsimd_arm_splat(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_arm_splat, 5, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_igemm_6x8__wasmsimd_arm_splat(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_arm_splat, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_igemm_3x8__wasmsimd_x86_splat(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_x86_splat, 3, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_igemm_4x8__wasmsimd_x86_splat(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_x86_splat, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_igemm_5x8__wasmsimd_x86_splat(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_x86_splat, 5, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_igemm_6x8__wasmsimd_x86_splat(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_x86_splat, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_igemm_3x8s4__wasmsimd_arm(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_arm, 3, 8, 1, 4,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_igemm_4x8s4__wasmsimd_arm(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8s4__wasmsimd_arm, 4, 8, 1, 4,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_igemm_5x8s4__wasmsimd_arm(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_5x8s4__wasmsimd_arm, 5, 8, 1, 4,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_igemm_6x8s4__wasmsimd_arm(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_arm, 6, 8, 1, 4,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_igemm_3x8s4__wasmsimd_x86(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_x86, 3, 8, 1, 4,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_igemm_4x8s4__wasmsimd_x86(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8s4__wasmsimd_x86, 4, 8, 1, 4,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_igemm_5x8s4__wasmsimd_x86(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_5x8s4__wasmsimd_x86, 5, 8, 1, 4,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_igemm_6x8s4__wasmsimd_x86(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_x86, 6, 8, 1, 4,
    xnn_init_f32_minmax_scalar_params);
}

BENCHMARK_CONV(f32_igemm_3x8__wasmsimd_arm_loadsplat)
BENCHMARK_CONV(f32_igemm_4x8__wasmsimd_arm_loadsplat)
BENCHMARK_CONV(f32_igemm_5x8__wasmsimd_arm_loadsplat)
BENCHMARK_CONV(f32_igemm_6x8__wasmsimd_arm_loadsplat)
BENCHMARK_CONV(f32_igemm_3x8__wasmsimd_x86_loadsplat)
BENCHMARK_CONV(f32_igemm_4x8__wasmsimd_x86_loadsplat)
BENCHMARK_CONV(f32_igemm_5x8__wasmsimd_x86_loadsplat)
BENCHMARK_CONV(f32_igemm_6x8__wasmsimd_x86_loadsplat)
BENCHMARK_CONV(f32_igemm_3x8__wasmsimd_arm_splat)
BENCHMARK_CONV(f32_igemm_4x8__wasmsimd_arm_splat)
BENCHMARK_CONV(f32_igemm_5x8__wasmsimd_arm_splat)
BENCHMARK_CONV(f32_igemm_6x8__wasmsimd_arm_splat)
BENCHMARK_CONV(f32_igemm_3x8__wasmsimd_x86_splat)
BENCHMARK_CONV(f32_igemm_4x8__wasmsimd_x86_splat)
BENCHMARK_CONV(f32_igemm_5x8__wasmsimd_x86_splat)
BENCHMARK_CONV(f32_igemm_6x8__wasmsimd_x86_splat)
BENCHMARK_CONV(f32_igemm_3x8s4__wasmsimd_arm)
BENCHMARK_CONV(f32_igemm_4x8s4__wasmsimd_arm)
BENCHMARK_CONV(f32_igemm_5x8s4__wasmsimd_arm)
BENCHMARK_CONV(f32_igemm_6x8s4__wasmsimd_arm)
BENCHMARK_CONV(f32_igemm_3x8s4__wasmsimd_x86)
BENCHMARK_CONV(f32_igemm_4x8s4__wasmsimd_x86)
BENCHMARK_CONV(f32_igemm_5x8s4__wasmsimd_x86)
BENCHMARK_CONV(f32_igemm_6x8s4__wasmsimd_x86)
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD


static void f32_igemm_1x4__scalar(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_1x4__scalar, 1, 4, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_igemm_2x4__scalar(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_2x4__scalar, 2, 4, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_igemm_4x4__scalar(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x4__scalar, 4, 4, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}

BENCHMARK_CONV(f32_igemm_1x4__scalar)
BENCHMARK_CONV(f32_igemm_2x4__scalar)
BENCHMARK_CONV(f32_igemm_4x4__scalar)

#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif