// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <algorithm>
#include <cfloat>
#include <cmath>
#include <functional>
#include <limits>
#include <random>
#include <vector>

#include <benchmark/benchmark.h>
#include "bench/conv.h"
#include "bench/utils.h"
#include <xnnpack/AlignedAllocator.h>
#include <xnnpack/common.h>
#include <xnnpack/igemm.h>
#include <xnnpack/indirection.h>
#include <xnnpack/operator.h>
#include <xnnpack/pack.h>
#include <xnnpack/params-init.h>
#include <xnnpack/params.h>


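// Benchmarks a single f32 IGEMM (indirect GEMM) minmax microkernel on one
// convolution shape, reporting achieved FLOPS and the current CPU frequency.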
static void IGEMMBenchmark(benchmark::State& state,
  xnn_f32_igemm_minmax_ukernel_function f32_igemm,
  uint32_t mr, uint32_t nr, uint32_t kr, uint32_t sr,
  xnn_init_f32_minmax_params_fn init_params,
  benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
  if (isa_check && !isa_check(state)) {
    return;
  }

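  // The convolution shape is supplied through the benchmark arguments
  // registered by BENCHMARK_CONV (see bench/conv.h): input size, kernel size,
  // padding, stride, dilation, and per-group channel counts.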
  const size_t input_height = state.range(0);
  const size_t input_width = state.range(1);
  const size_t kernel_height = state.range(2);
  const size_t kernel_width = state.range(3);
  const size_t kernel_size = kernel_height * kernel_width;
  const size_t padding_height = state.range(4);
  const size_t padding_width = state.range(5);
  const size_t subsampling = state.range(6);
  const size_t dilation = state.range(7);
  const size_t group_input_channels = state.range(8);
  const size_t group_output_channels = state.range(9);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), std::ref(rng));

  const size_t output_pixel_stride = group_output_channels;
  const size_t input_pixel_stride = group_input_channels;
  const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
  const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
  const size_t padding_left = padding_width / 2;
  const size_t padding_top = padding_height / 2;
  const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
  const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
  const size_t output_size = output_height * output_width;

  const size_t mc_stride = benchmark::utils::RoundUp<size_t>(output_size, mr);
  const size_t nc_stride = benchmark::utils::RoundUp<size_t>(group_output_channels, nr);
  const size_t kc_stride = benchmark::utils::RoundUp<size_t>(group_input_channels, kr * sr);

  std::vector<float> a(input_height * input_width * input_pixel_stride);
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(group_output_channels * kernel_height * kernel_width * group_input_channels);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(group_output_channels);
  std::generate(b.begin(), b.end(), std::ref(f32rng));

  std::vector<float> z(group_input_channels);

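  // Replicate the packed weights, indirection pointers, and output buffers
  // enough times to exceed the last-level cache, so successive iterations do
  // not simply re-use warm data.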
  const size_t w_elements = kernel_size * kc_stride * nc_stride + nc_stride;
  const size_t i_elements = mc_stride * kernel_size;
  const size_t c_elements = output_height * output_width * output_pixel_stride;
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(float) * (w_elements + c_elements) + sizeof(void*) * i_elements);

  std::vector<float, AlignedAllocator<float, 64>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), 0.0f);
  xnn_pack_f32_conv_goki_w(
    1 /* groups */, group_output_channels, kernel_size, group_input_channels,
    nr, kr, sr, k.data(), b.data(), w.data(), 0 /* extra bytes */, nullptr);
  for (size_t n = 1; n < num_buffers; n++) {
    std::copy(w.cbegin(), w.cbegin() + w_elements, w.begin() + n * w_elements);
  }

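  // IGEMM reads the input through an indirection buffer: an array of pointers
  // to input rows, or to the zero buffer for padded taps. A stub xnn_operator
  // carries the parameters that xnn_indirection_init_conv2d needs to fill it.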
  std::vector<const float*> i(i_elements * num_buffers);
  xnn_operator convolution_op = { };
  convolution_op.indirection_buffer   = reinterpret_cast<const void**>(i.data());
  convolution_op.input                = a.data();
  convolution_op.input_pixel_stride   = input_pixel_stride;
  convolution_op.zero_buffer          = z.data();
  convolution_op.groups               = 1;
  convolution_op.group_input_channels = group_input_channels;
  convolution_op.batch_size           = 1;
  convolution_op.input_height         = input_height;
  convolution_op.input_width          = input_width;
  convolution_op.output_height        = output_height;
  convolution_op.output_width         = output_width;
  convolution_op.kernel_height        = kernel_height;
  convolution_op.kernel_width         = kernel_width;
  convolution_op.stride_height        = subsampling;
  convolution_op.stride_width         = subsampling;
  convolution_op.dilation_height      = dilation;
  convolution_op.dilation_width       = dilation;
  convolution_op.padding_top          = padding_top;
  convolution_op.padding_left         = padding_left;
  xnn_indirection_init_conv2d(&convolution_op, mr, 2 /* log2(sizeof(float)) */);
  for (size_t n = 1; n < num_buffers; n++) {
    std::copy(i.cbegin(), i.cbegin() + i_elements, i.begin() + n * i_elements);
  }

  std::vector<float> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), std::nanf(""));

  xnn_f32_minmax_params params;
  init_params(&params,
    -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());

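  // Rotate through the replicated buffers and warm the input in L1 while the
  // timer is paused, so only the microkernel calls themselves are measured.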
  size_t buffer_index = 0;
  for (auto _ : state) {
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    for (uint32_t m = 0; m < output_size; m += mr) {
      const uint32_t mb = min(output_size - m, mr);
      f32_igemm(
        mb, group_output_channels, group_input_channels * sizeof(float), kernel_size * mr * sizeof(void*),
        i.data() + buffer_index * i_elements + m,
        w.data() + buffer_index * w_elements,
        c.data() + buffer_index * c_elements + m * group_output_channels, group_output_channels * sizeof(float), nr * sizeof(float),
        0, z.data(), &params);
    }
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

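  // Each output element takes kernel_height * kernel_width *
  // group_input_channels multiply-adds, i.e. 2 FLOPs per MAC.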
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 *
      output_height * output_width *
      group_input_channels * group_output_channels *
      kernel_height * kernel_width,
    benchmark::Counter::kIsRate);
}

#if XNN_PLATFORM_JIT
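// Variant of IGEMMBenchmark for JIT code generators: the IGEMM microkernel is
// generated at runtime and then benchmarked the same way as a prebuilt kernel.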
static void IGEMMBenchmark(benchmark::State& state,
  xnn_jit_igemm_code_generator_function generator,
  size_t mr, size_t nr, size_t kr, size_t sr,
  xnn_init_f32_minmax_params_fn init_params,
  benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
  if (isa_check && !isa_check(state)) {
    return;
  }

  const size_t input_height = state.range(0);
  const size_t input_width = state.range(1);
  const size_t kernel_height = state.range(2);
  const size_t kernel_width = state.range(3);
  const size_t kernel_size = kernel_height * kernel_width;
  const size_t padding_height = state.range(4);
  const size_t padding_width = state.range(5);
  const size_t subsampling = state.range(6);
  const size_t dilation = state.range(7);
  const size_t group_input_channels = state.range(8);
  const size_t group_output_channels = state.range(9);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), std::ref(rng));

  const size_t output_pixel_stride = group_output_channels;
  const size_t input_pixel_stride = group_input_channels;
  const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
  const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
  const size_t padding_left = padding_width / 2;
  const size_t padding_top = padding_height / 2;
  const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
  const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
  const size_t output_size = output_height * output_width;

  const size_t mc_stride = benchmark::utils::RoundUp<size_t>(output_size, mr);
  const size_t nc_stride = benchmark::utils::RoundUp<size_t>(group_output_channels, nr);
  const size_t kc_stride = benchmark::utils::RoundUp<size_t>(group_input_channels, kr * sr);

  std::vector<float> a(input_height * input_width * input_pixel_stride);
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(group_output_channels * kernel_height * kernel_width * group_input_channels);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(group_output_channels);
  std::generate(b.begin(), b.end(), std::ref(f32rng));

  std::vector<float> z(group_input_channels);

  const size_t w_elements = kernel_size * kc_stride * nc_stride + nc_stride;
  const size_t i_elements = mc_stride * kernel_size;
  const size_t c_elements = output_height * output_width * output_pixel_stride;
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(float) * (w_elements + c_elements) + sizeof(void*) * i_elements);

  std::vector<float, AlignedAllocator<float, 64>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), 0.0f);
  xnn_pack_f32_conv_goki_w(
    1 /* groups */, group_output_channels, kernel_size, group_input_channels,
    nr, kr, sr, k.data(), b.data(), w.data(), 0 /* extra bytes */, nullptr);
  for (size_t n = 1; n < num_buffers; n++) {
    std::copy(w.cbegin(), w.cbegin() + w_elements, w.begin() + n * w_elements);
  }

  std::vector<const float*> i(i_elements * num_buffers);
  xnn_operator convolution_op = { };
  convolution_op.indirection_buffer   = reinterpret_cast<const void**>(i.data());
  convolution_op.input                = a.data();
  convolution_op.input_pixel_stride   = input_pixel_stride;
  convolution_op.zero_buffer          = z.data();
  convolution_op.groups               = 1;
  convolution_op.group_input_channels = group_input_channels;
  convolution_op.batch_size           = 1;
  convolution_op.input_height         = input_height;
  convolution_op.input_width          = input_width;
  convolution_op.output_height        = output_height;
  convolution_op.output_width         = output_width;
  convolution_op.kernel_height        = kernel_height;
  convolution_op.kernel_width         = kernel_width;
  convolution_op.stride_height        = subsampling;
  convolution_op.stride_width         = subsampling;
  convolution_op.dilation_height      = dilation;
  convolution_op.dilation_width       = dilation;
  convolution_op.padding_top          = padding_top;
  convolution_op.padding_left         = padding_left;
  xnn_indirection_init_conv2d(&convolution_op, mr, 2 /* log2(sizeof(float)) */);
  for (size_t n = 1; n < num_buffers; n++) {
    std::copy(i.cbegin(), i.cbegin() + i_elements, i.begin() + n * i_elements);
  }

  std::vector<float> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), std::nanf(""));

  xnn_f32_minmax_params params;
  init_params(&params,
    -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());

  jit_gemm_params jit_params = {
    .f32_minmax = {
      .min = -std::numeric_limits<float>::infinity(),
      .max = +std::numeric_limits<float>::infinity()
    }
  };

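  // Emit the microkernel into freshly allocated executable memory and cast the
  // entry point to the regular ukernel function type.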
  xnn_code_buffer code_buffer;
  xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE);
  generator(&code_buffer, group_output_channels, group_input_channels * sizeof(float), kernel_size * mr * sizeof(void *), &jit_params);
  auto f32_igemm = reinterpret_cast<xnn_f32_igemm_minmax_ukernel_function>(code_buffer.code);

  size_t buffer_index = 0;
  for (auto _ : state) {
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    for (uint32_t m = 0; m < output_size; m += mr) {
      const uint32_t mb = min(output_size - m, mr);
      f32_igemm(
        mb, group_output_channels, group_input_channels * sizeof(float), kernel_size * mr * sizeof(void*),
        i.data() + buffer_index * i_elements + m,
        w.data() + buffer_index * w_elements,
        c.data() + buffer_index * c_elements + m * group_output_channels, group_output_channels * sizeof(float), nr * sizeof(float),
        0, z.data(), &params);
    }
  }
  xnn_release_code_memory(&code_buffer);

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 *
      output_height * output_width *
      group_input_channels * group_output_channels *
      kernel_height * kernel_width,
    benchmark::Counter::kIsRate);

}
#endif  // XNN_PLATFORM_JIT

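// Each architecture-specific block below wraps one microkernel in a benchmark
// entry point and registers it with BENCHMARK_CONV over the convolution shapes
// defined in bench/conv.h.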
#if XNN_ARCH_ARM64 && XNN_PLATFORM_JIT
  static void jit_f32_igemm_1x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_generate_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75, 1, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void jit_f32_igemm_1x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_generate_f32_igemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75, 1, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void jit_f32_igemm_6x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_generate_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void jit_f32_igemm_6x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_generate_f32_igemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }

  BENCHMARK_CONV(jit_f32_igemm_1x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_CONV(jit_f32_igemm_1x8__aarch64_neonfma_prfm_cortex_a75)
  BENCHMARK_CONV(jit_f32_igemm_6x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_CONV(jit_f32_igemm_6x8__aarch64_neonfma_prfm_cortex_a75)
#endif  // XNN_ARCH_ARM64 && XNN_PLATFORM_JIT

#if XNN_ARCH_ARM && XNN_PLATFORM_JIT
  static void jit_f32_igemm_4x8__aarch32_neon_ld64(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_generate_f32_igemm_ukernel_4x8__aarch32_neon_ld64, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void jit_f32_igemm_4x8__aarch32_neon_cortex_a7(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_generate_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a7, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void jit_f32_igemm_4x8__aarch32_neon_cortex_a53(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_generate_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a53, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void jit_f32_igemm_4x8__aarch32_neon_cortex_a55(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_generate_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void jit_f32_igemm_4x8__aarch32_neon_prfm_cortex_a75(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_generate_f32_igemm_ukernel_4x8__aarch32_neon_prfm_cortex_a75, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void jit_f32_igemm_4x8__aarch32_neon_cortex_a75(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_generate_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }

  BENCHMARK_CONV(jit_f32_igemm_4x8__aarch32_neon_ld64)
  BENCHMARK_CONV(jit_f32_igemm_4x8__aarch32_neon_cortex_a7)
  BENCHMARK_CONV(jit_f32_igemm_4x8__aarch32_neon_cortex_a53)
  BENCHMARK_CONV(jit_f32_igemm_4x8__aarch32_neon_cortex_a55)
  BENCHMARK_CONV(jit_f32_igemm_4x8__aarch32_neon_prfm_cortex_a75)
  BENCHMARK_CONV(jit_f32_igemm_4x8__aarch32_neon_cortex_a75)
#endif  // XNN_ARCH_ARM && XNN_PLATFORM_JIT

#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
  static void f32_igemm_4x8__aarch32_neon_ld64(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_ld64, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_4x8__aarch32_neon_cortex_a7(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a7, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_4x8__aarch32_neon_cortex_a53(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_4x8__aarch32_neon_cortex_a55(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_4x8__aarch32_neon_prfm_cortex_a75(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a75, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_4x8__aarch32_neon_cortex_a75(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }

  BENCHMARK_CONV(f32_igemm_4x8__aarch32_neon_ld64)
  BENCHMARK_CONV(f32_igemm_4x8__aarch32_neon_cortex_a7)
  BENCHMARK_CONV(f32_igemm_4x8__aarch32_neon_cortex_a53)
  BENCHMARK_CONV(f32_igemm_4x8__aarch32_neon_cortex_a55)
  BENCHMARK_CONV(f32_igemm_4x8__aarch32_neon_prfm_cortex_a75)
  BENCHMARK_CONV(f32_igemm_4x8__aarch32_neon_cortex_a75)
#endif  // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY


#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
  static void f32_igemm_1x12__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_1x12__aarch64_neonfma_cortex_a53, 1, 12, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_1x8__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53, 1, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_1x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75, 1, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_1x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75, 1, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_4x8__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_4x8__aarch64_neonfma_cortex_a55(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_4x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_4x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_4x8__aarch64_neonfma_ld64(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_ld64, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_4x8__aarch64_neonfma_ld128(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_ld128, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_5x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_5x8__aarch64_neonfma_cortex_a75, 5, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_5x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75, 5, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_4x12__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x12__aarch64_neonfma_cortex_a53, 4, 12, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_6x8__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_6x8__aarch64_neonfma_cortex_a55(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_6x8__aarch64_neonfma_cortex_a73(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a73, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_6x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_6x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_6x8__aarch64_neonfma_ld64(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_ld64, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_6x8__aarch64_neonfma_ld128(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_ld128, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_1x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64, 1, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_4x2__neonfma_lane_ld64(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x2__neonfma_lane_ld64, 4, 2, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_4x4__neonfma_lane_ld64(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x4__neonfma_lane_ld64, 4, 4, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_4x8__neonfma_lane_ld128(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__neonfma_lane_ld128, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_4x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__neonfma_lane_ld64, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_6x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld64, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_6x8__neonfma_lane_ld128(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld128, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }

  BENCHMARK_CONV(f32_igemm_1x12__aarch64_neonfma_cortex_a53)
  BENCHMARK_CONV(f32_igemm_1x8__aarch64_neonfma_cortex_a53)
  BENCHMARK_CONV(f32_igemm_1x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_CONV(f32_igemm_1x8__aarch64_neonfma_prfm_cortex_a75)
  BENCHMARK_CONV(f32_igemm_4x12__aarch64_neonfma_cortex_a53)
  BENCHMARK_CONV(f32_igemm_4x8__aarch64_neonfma_cortex_a53)
  BENCHMARK_CONV(f32_igemm_4x8__aarch64_neonfma_cortex_a55)
  BENCHMARK_CONV(f32_igemm_4x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_CONV(f32_igemm_4x8__aarch64_neonfma_prfm_cortex_a75)
  BENCHMARK_CONV(f32_igemm_4x8__aarch64_neonfma_ld64)
  BENCHMARK_CONV(f32_igemm_4x8__aarch64_neonfma_ld128)
  BENCHMARK_CONV(f32_igemm_5x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_CONV(f32_igemm_5x8__aarch64_neonfma_prfm_cortex_a75)
  BENCHMARK_CONV(f32_igemm_6x8__aarch64_neonfma_cortex_a53)
  BENCHMARK_CONV(f32_igemm_6x8__aarch64_neonfma_cortex_a55)
  BENCHMARK_CONV(f32_igemm_6x8__aarch64_neonfma_cortex_a73)
  BENCHMARK_CONV(f32_igemm_6x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_CONV(f32_igemm_6x8__aarch64_neonfma_prfm_cortex_a75)
  BENCHMARK_CONV(f32_igemm_6x8__aarch64_neonfma_ld64)
  BENCHMARK_CONV(f32_igemm_6x8__aarch64_neonfma_ld128)
  BENCHMARK_CONV(f32_igemm_1x8__neonfma_lane_ld64)
  BENCHMARK_CONV(f32_igemm_4x2__neonfma_lane_ld64)
  BENCHMARK_CONV(f32_igemm_4x4__neonfma_lane_ld64)
  BENCHMARK_CONV(f32_igemm_4x8__neonfma_lane_ld128)
  BENCHMARK_CONV(f32_igemm_4x8__neonfma_lane_ld64)
  BENCHMARK_CONV(f32_igemm_6x8__neonfma_lane_ld64)
  BENCHMARK_CONV(f32_igemm_6x8__neonfma_lane_ld128)
#endif  // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY


#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  static void f32_igemm_1x8__neon_lane_ld64(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64, 1, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_igemm_4x2__neon_lane_ld64(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x2__neon_lane_ld64, 4, 2, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_igemm_4x4__neon_lane_ld64(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x4__neon_lane_ld64, 4, 4, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_igemm_4x8__neon_lane_ld64(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__neon_lane_ld64, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_igemm_4x8__neon_lane_ld128(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__neon_lane_ld128, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_igemm_6x8__neon_lane_ld64(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__neon_lane_ld64, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_igemm_6x8__neon_lane_ld128(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__neon_lane_ld128, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_igemm_1x8__neon_dup_ld64(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_1x8__neon_dup_ld64, 1, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_igemm_4x8__neon_dup_ld128(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__neon_dup_ld128, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_igemm_4x8__neon_dup_ld64(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__neon_dup_ld64, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_igemm_6x8__neon_dup_ld64(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__neon_dup_ld64, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_igemm_6x8__neon_dup_ld128(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__neon_dup_ld128, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_igemm_1x8__neonfma_dup_ld64(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_1x8__neonfma_dup_ld64, 1, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
  }
  static void f32_igemm_4x8__neonfma_dup_ld128(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__neonfma_dup_ld128, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
  }
  static void f32_igemm_4x8__neonfma_dup_ld64(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__neonfma_dup_ld64, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
  }
  static void f32_igemm_6x8__neonfma_dup_ld64(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__neonfma_dup_ld64, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
  }
  static void f32_igemm_6x8__neonfma_dup_ld128(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__neonfma_dup_ld128, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
  }
  static void f32_igemm_1x8s4__neon(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_1x8s4__neon, 1, 8, 1, 4,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_igemm_4x8s4__neon(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8s4__neon, 4, 8, 1, 4,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_igemm_6x8s4__neon(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8s4__neon, 6, 8, 1, 4,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_igemm_8x8s4__neon(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_8x8s4__neon, 8, 8, 1, 4,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_igemm_1x8s4__neonfma(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_1x8s4__neonfma, 1, 8, 1, 4,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
  }
  static void f32_igemm_4x8s4__neonfma(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8s4__neonfma, 4, 8, 1, 4,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
  }
  static void f32_igemm_6x8s4__neonfma(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8s4__neonfma, 6, 8, 1, 4,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
  }
  static void f32_igemm_8x8s4__neonfma(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_8x8s4__neonfma, 8, 8, 1, 4,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
  }

  BENCHMARK_CONV(f32_igemm_1x8__neon_lane_ld64)
  BENCHMARK_CONV(f32_igemm_4x2__neon_lane_ld64)
  BENCHMARK_CONV(f32_igemm_4x4__neon_lane_ld64)
  BENCHMARK_CONV(f32_igemm_4x8__neon_lane_ld128)
  BENCHMARK_CONV(f32_igemm_4x8__neon_lane_ld64)
  BENCHMARK_CONV(f32_igemm_6x8__neon_lane_ld64)
  BENCHMARK_CONV(f32_igemm_6x8__neon_lane_ld128)
  BENCHMARK_CONV(f32_igemm_1x8__neon_dup_ld64)
  BENCHMARK_CONV(f32_igemm_4x8__neon_dup_ld128)
  BENCHMARK_CONV(f32_igemm_4x8__neon_dup_ld64)
  BENCHMARK_CONV(f32_igemm_6x8__neon_dup_ld64)
  BENCHMARK_CONV(f32_igemm_6x8__neon_dup_ld128)
  BENCHMARK_CONV(f32_igemm_1x8__neonfma_dup_ld64)
  BENCHMARK_CONV(f32_igemm_4x8__neonfma_dup_ld128)
  BENCHMARK_CONV(f32_igemm_4x8__neonfma_dup_ld64)
  BENCHMARK_CONV(f32_igemm_6x8__neonfma_dup_ld64)
  BENCHMARK_CONV(f32_igemm_6x8__neonfma_dup_ld128)
  BENCHMARK_CONV(f32_igemm_1x8s4__neon)
  BENCHMARK_CONV(f32_igemm_4x8s4__neon)
  BENCHMARK_CONV(f32_igemm_6x8s4__neon)
  BENCHMARK_CONV(f32_igemm_8x8s4__neon)
  BENCHMARK_CONV(f32_igemm_1x8s4__neonfma)
  BENCHMARK_CONV(f32_igemm_4x8s4__neonfma)
  BENCHMARK_CONV(f32_igemm_6x8s4__neonfma)
  BENCHMARK_CONV(f32_igemm_8x8s4__neonfma)
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64


#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  static void f32_igemm_1x8__sse_load1(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_1x8__sse_load1, 1, 8, 1, 1,
      xnn_init_f32_minmax_sse_params);
  }
  static void f32_igemm_3x8__sse_load1(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_3x8__sse_load1, 3, 8, 1, 1,
      xnn_init_f32_minmax_sse_params);
  }
  static void f32_igemm_4x8__sse_load1(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__sse_load1, 4, 8, 1, 1,
      xnn_init_f32_minmax_sse_params);
  }
  static void f32_igemm_5x8__sse_load1(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_5x8__sse_load1, 5, 8, 1, 1,
      xnn_init_f32_minmax_sse_params);
  }

  static void f32_igemm_1x8__sse_dup(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_1x8__sse_dup, 1, 8, 1, 1,
      xnn_init_f32_minmax_sse_params);
  }
  static void f32_igemm_3x8__sse_dup(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_3x8__sse_dup, 3, 8, 1, 1,
      xnn_init_f32_minmax_sse_params);
  }
  static void f32_igemm_4x8__sse_dup(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__sse_dup, 4, 8, 1, 1,
      xnn_init_f32_minmax_sse_params);
  }
  static void f32_igemm_5x8__sse_dup(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_5x8__sse_dup, 5, 8, 1, 1,
      xnn_init_f32_minmax_sse_params);
  }

  static void f32_igemm_1x8s4__sse(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_1x8s4__sse, 1, 8, 1, 4,
      xnn_init_f32_minmax_sse_params);
  }
  static void f32_igemm_3x8s4__sse(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_3x8s4__sse, 3, 8, 1, 4,
      xnn_init_f32_minmax_sse_params);
  }
  static void f32_igemm_4x8s4__sse(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8s4__sse, 4, 8, 1, 4,
      xnn_init_f32_minmax_sse_params);
  }
  static void f32_igemm_5x8s4__sse(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_5x8s4__sse, 5, 8, 1, 4,
      xnn_init_f32_minmax_sse_params);
  }

  static void f32_igemm_1x8__sse2_dup(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_1x8__sse2_dup, 1, 8, 1, 1,
      xnn_init_f32_minmax_sse_params);
  }
  static void f32_igemm_3x8__sse2_dup(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_3x8__sse2_dup, 3, 8, 1, 1,
      xnn_init_f32_minmax_sse_params);
  }
  static void f32_igemm_4x8__sse2_dup(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__sse2_dup, 4, 8, 1, 1,
      xnn_init_f32_minmax_sse_params);
  }
  static void f32_igemm_5x8__sse2_dup(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_5x8__sse2_dup, 5, 8, 1, 1,
      xnn_init_f32_minmax_sse_params);
  }

  static void f32_igemm_1x8__avx_broadcast(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_1x8__avx_broadcast, 1, 8, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
  }
  static void f32_igemm_4x8__avx_broadcast(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__avx_broadcast, 4, 8, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
  }
  static void f32_igemm_5x8__avx_broadcast(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_5x8__avx_broadcast, 5, 8, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
  }
  static void f32_igemm_6x8__avx_broadcast(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__avx_broadcast, 6, 8, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
  }
  static void f32_igemm_7x8__avx_broadcast(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_7x8__avx_broadcast, 7, 8, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
  }

  static void f32_igemm_1x8__fma3_broadcast(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_1x8__fma3_broadcast, 1, 8, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
  }
  static void f32_igemm_4x8__fma3_broadcast(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__fma3_broadcast, 4, 8, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
  }
  static void f32_igemm_5x8__fma3_broadcast(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_5x8__fma3_broadcast, 5, 8, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
  }
  static void f32_igemm_6x8__fma3_broadcast(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__fma3_broadcast, 6, 8, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
  }
  static void f32_igemm_7x8__fma3_broadcast(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_7x8__fma3_broadcast, 7, 8, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
  }
  static void f32_igemm_8x8__fma3_broadcast(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_8x8__fma3_broadcast, 8, 8, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
  }

  static void f32_igemm_1x16__avx512f_broadcast(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_1x16__avx512f_broadcast, 1, 16, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckAVX512F);
  }
  static void f32_igemm_4x16__avx512f_broadcast(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x16__avx512f_broadcast, 4, 16, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckAVX512F);
  }
  static void f32_igemm_5x16__avx512f_broadcast(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_5x16__avx512f_broadcast, 5, 16, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckAVX512F);
  }
  static void f32_igemm_6x16__avx512f_broadcast(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x16__avx512f_broadcast, 6, 16, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckAVX512F);
  }
  static void f32_igemm_7x16__avx512f_broadcast(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_7x16__avx512f_broadcast, 7, 16, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckAVX512F);
  }
  static void f32_igemm_8x16__avx512f_broadcast(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_8x16__avx512f_broadcast, 8, 16, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckAVX512F);
  }
803 
804   BENCHMARK_CONV(f32_igemm_1x8__sse_load1)
BENCHMARK_CONV(f32_igemm_3x8__sse_load1)805   BENCHMARK_CONV(f32_igemm_3x8__sse_load1)
806   BENCHMARK_CONV(f32_igemm_4x8__sse_load1)
807   BENCHMARK_CONV(f32_igemm_5x8__sse_load1)
808 
809   BENCHMARK_CONV(f32_igemm_1x8__sse_dup)
810   BENCHMARK_CONV(f32_igemm_3x8__sse_dup)
811   BENCHMARK_CONV(f32_igemm_4x8__sse_dup)
812   BENCHMARK_CONV(f32_igemm_5x8__sse_dup)
813 
814   BENCHMARK_CONV(f32_igemm_1x8s4__sse)
815   BENCHMARK_CONV(f32_igemm_3x8s4__sse)
816   BENCHMARK_CONV(f32_igemm_4x8s4__sse)
817   BENCHMARK_CONV(f32_igemm_5x8s4__sse)
818 
819   BENCHMARK_CONV(f32_igemm_1x8__sse2_dup)
820   BENCHMARK_CONV(f32_igemm_3x8__sse2_dup)
821   BENCHMARK_CONV(f32_igemm_4x8__sse2_dup)
822   BENCHMARK_CONV(f32_igemm_5x8__sse2_dup)
823 
824   BENCHMARK_CONV(f32_igemm_1x8__avx_broadcast)
825   BENCHMARK_CONV(f32_igemm_4x8__avx_broadcast)
826   BENCHMARK_CONV(f32_igemm_5x8__avx_broadcast)
827   BENCHMARK_CONV(f32_igemm_6x8__avx_broadcast)
828   BENCHMARK_CONV(f32_igemm_7x8__avx_broadcast)
829 
830   BENCHMARK_CONV(f32_igemm_1x8__fma3_broadcast)
831   BENCHMARK_CONV(f32_igemm_4x8__fma3_broadcast)
832   BENCHMARK_CONV(f32_igemm_5x8__fma3_broadcast)
833   BENCHMARK_CONV(f32_igemm_6x8__fma3_broadcast)
834   BENCHMARK_CONV(f32_igemm_7x8__fma3_broadcast)
835   BENCHMARK_CONV(f32_igemm_8x8__fma3_broadcast)
836 
837   BENCHMARK_CONV(f32_igemm_1x16__avx512f_broadcast)
838   BENCHMARK_CONV(f32_igemm_4x16__avx512f_broadcast)
839   BENCHMARK_CONV(f32_igemm_5x16__avx512f_broadcast)
840   BENCHMARK_CONV(f32_igemm_6x16__avx512f_broadcast)
841   BENCHMARK_CONV(f32_igemm_7x16__avx512f_broadcast)
842   BENCHMARK_CONV(f32_igemm_8x16__avx512f_broadcast)
843 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
844 
845 
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  static void f32_igemm_3x8__wasmsimd_arm_loadsplat(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_arm_loadsplat, 3, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }

  static void f32_igemm_4x8__wasmsimd_arm_loadsplat(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_arm_loadsplat, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }

  static void f32_igemm_5x8__wasmsimd_arm_loadsplat(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_arm_loadsplat, 5, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }

  static void f32_igemm_6x8__wasmsimd_arm_loadsplat(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_arm_loadsplat, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }

  static void f32_igemm_3x8__wasmsimd_x86_loadsplat(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat, 3, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }

  static void f32_igemm_4x8__wasmsimd_x86_loadsplat(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }

  static void f32_igemm_5x8__wasmsimd_x86_loadsplat(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_x86_loadsplat, 5, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }

  static void f32_igemm_6x8__wasmsimd_x86_loadsplat(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_x86_loadsplat, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }

  static void f32_igemm_3x8__wasmsimd_arm_splat(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_arm_splat, 3, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }

  static void f32_igemm_4x8__wasmsimd_arm_splat(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_arm_splat, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }

  static void f32_igemm_5x8__wasmsimd_arm_splat(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_arm_splat, 5, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }

  static void f32_igemm_6x8__wasmsimd_arm_splat(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_arm_splat, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }

  static void f32_igemm_3x8__wasmsimd_x86_splat(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_x86_splat, 3, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }

  static void f32_igemm_4x8__wasmsimd_x86_splat(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_x86_splat, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }

  static void f32_igemm_5x8__wasmsimd_x86_splat(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_x86_splat, 5, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }

  static void f32_igemm_6x8__wasmsimd_x86_splat(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_x86_splat, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }

  static void f32_igemm_3x8s4__wasmsimd_arm(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_arm, 3, 8, 1, 4,
      xnn_init_f32_minmax_scalar_params);
  }

  static void f32_igemm_4x8s4__wasmsimd_arm(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8s4__wasmsimd_arm, 4, 8, 1, 4,
      xnn_init_f32_minmax_scalar_params);
  }

  static void f32_igemm_5x8s4__wasmsimd_arm(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_5x8s4__wasmsimd_arm, 5, 8, 1, 4,
      xnn_init_f32_minmax_scalar_params);
  }

  static void f32_igemm_6x8s4__wasmsimd_arm(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_arm, 6, 8, 1, 4,
      xnn_init_f32_minmax_scalar_params);
  }

  static void f32_igemm_3x8s4__wasmsimd_x86(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_x86, 3, 8, 1, 4,
      xnn_init_f32_minmax_scalar_params);
  }

  static void f32_igemm_4x8s4__wasmsimd_x86(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8s4__wasmsimd_x86, 4, 8, 1, 4,
      xnn_init_f32_minmax_scalar_params);
  }

  static void f32_igemm_5x8s4__wasmsimd_x86(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_5x8s4__wasmsimd_x86, 5, 8, 1, 4,
      xnn_init_f32_minmax_scalar_params);
  }

  static void f32_igemm_6x8s4__wasmsimd_x86(benchmark::State& state, const char* net) {
    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_x86, 6, 8, 1, 4,
      xnn_init_f32_minmax_scalar_params);
  }

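  // Bind each WAsm SIMD variant above to the shared convolution-shape list.
  // BENCHMARK_CONV (from bench/conv.h) is expected to expand every function
  // into one benchmark registration per convolution shape set defined there.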
  BENCHMARK_CONV(f32_igemm_3x8__wasmsimd_arm_loadsplat)
  BENCHMARK_CONV(f32_igemm_4x8__wasmsimd_arm_loadsplat)
  BENCHMARK_CONV(f32_igemm_5x8__wasmsimd_arm_loadsplat)
  BENCHMARK_CONV(f32_igemm_6x8__wasmsimd_arm_loadsplat)
  BENCHMARK_CONV(f32_igemm_3x8__wasmsimd_x86_loadsplat)
  BENCHMARK_CONV(f32_igemm_4x8__wasmsimd_x86_loadsplat)
  BENCHMARK_CONV(f32_igemm_5x8__wasmsimd_x86_loadsplat)
  BENCHMARK_CONV(f32_igemm_6x8__wasmsimd_x86_loadsplat)
  BENCHMARK_CONV(f32_igemm_3x8__wasmsimd_arm_splat)
  BENCHMARK_CONV(f32_igemm_4x8__wasmsimd_arm_splat)
  BENCHMARK_CONV(f32_igemm_5x8__wasmsimd_arm_splat)
  BENCHMARK_CONV(f32_igemm_6x8__wasmsimd_arm_splat)
  BENCHMARK_CONV(f32_igemm_3x8__wasmsimd_x86_splat)
  BENCHMARK_CONV(f32_igemm_4x8__wasmsimd_x86_splat)
  BENCHMARK_CONV(f32_igemm_5x8__wasmsimd_x86_splat)
  BENCHMARK_CONV(f32_igemm_6x8__wasmsimd_x86_splat)
  BENCHMARK_CONV(f32_igemm_3x8s4__wasmsimd_arm)
  BENCHMARK_CONV(f32_igemm_4x8s4__wasmsimd_arm)
  BENCHMARK_CONV(f32_igemm_5x8s4__wasmsimd_arm)
  BENCHMARK_CONV(f32_igemm_6x8s4__wasmsimd_arm)
  BENCHMARK_CONV(f32_igemm_3x8s4__wasmsimd_x86)
  BENCHMARK_CONV(f32_igemm_4x8s4__wasmsimd_x86)
  BENCHMARK_CONV(f32_igemm_5x8s4__wasmsimd_x86)
  BENCHMARK_CONV(f32_igemm_6x8s4__wasmsimd_x86)
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD


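// Portable scalar IGEMM variants, built unconditionally (no architecture
// guard), which makes them a convenient baseline against the SIMD kernels above.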
static void f32_igemm_1x4__scalar(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_1x4__scalar, 1, 4, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_igemm_2x4__scalar(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_2x4__scalar, 2, 4, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_igemm_4x4__scalar(benchmark::State& state, const char* net) {
  IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x4__scalar, 4, 4, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}

BENCHMARK_CONV(f32_igemm_1x4__scalar)
BENCHMARK_CONV(f32_igemm_2x4__scalar)
BENCHMARK_CONV(f32_igemm_4x4__scalar)

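// Usage sketch (the binary name below is an assumption, not defined in this
// file): once the benchmark binary is built, the standard Google Benchmark
// flags apply, e.g. to run only the scalar IGEMM benchmarks and emit JSON:
//
//   ./f32-igemm-bench --benchmark_filter='f32_igemm_.*__scalar' --benchmark_format=json
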
#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif