• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2022 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5 
6 #include <algorithm>
7 #include <cmath>
8 #include <functional>
9 #include <random>
10 #include <vector>
11 
12 #include <benchmark/benchmark.h>
13 #include <fp16/fp16.h>
14 #include "bench/utils.h"
15 
16 #include <xnnpack.h>
17 #include <xnnpack/aligned-allocator.h>
18 #include <xnnpack/common.h>
19 #include <xnnpack/microfnptr.h>
20 #include <xnnpack/microparams-init.h>
21 #include <xnnpack/raddstoreexpminusmax.h>
22 #include <xnnpack/rmax.h>
23 
24 
f16_raddstoreexpminusmax(benchmark::State & state,xnn_f16_rmax_ukernel_function rmax,xnn_f16_raddstoreexpminusmax_ukernel_function raddstoreexpminusmax,xnn_init_f16_expminus_params_fn init_params,benchmark::utils::IsaCheckFunction isa_check=nullptr)25 static void f16_raddstoreexpminusmax(
26   benchmark::State& state,
27   xnn_f16_rmax_ukernel_function rmax,
28   xnn_f16_raddstoreexpminusmax_ukernel_function raddstoreexpminusmax,
29   xnn_init_f16_expminus_params_fn init_params,
30   benchmark::utils::IsaCheckFunction isa_check = nullptr)
31 {
32   if (isa_check && !isa_check(state)) {
33     return;
34   }
35 
36   const size_t elements = state.range(0);
37   const size_t cache_line_size_max = 128;
38   const size_t packed_elements = benchmark::utils::RoundUp(elements, cache_line_size_max / sizeof(uint16_t));
39 
40   std::random_device random_device;
41   auto rng = std::mt19937(random_device());
42   auto f32rng = std::bind(std::uniform_real_distribution<float>(-100.0f, 100.0f), std::ref(rng));
43   auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
44 
45   const size_t num_buffers = 1 +
46     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), packed_elements * sizeof(uint16_t));
47   std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> x(elements);
48   std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> y(packed_elements * num_buffers);
49 
50   std::generate(x.begin(), x.end(), std::ref(f16rng));
51 
52   benchmark::utils::DisableDenormals();
53 
54   xnn_f16_expminus_params params;
55   init_params(&params);
56 
57   size_t buffer_index = 0;
58   for (auto _ : state) {
59     state.PauseTiming();
60     uint16_t x_max = UINT16_C(0x7E00) /* NaN */;
61     rmax(elements * sizeof(uint16_t), x.data(), &x_max);
62     if (++buffer_index == num_buffers) {
63       buffer_index = 0;
64     }
65     state.ResumeTiming();
66 
67     uint16_t y_sum = UINT16_C(0x7E00) /* NaN */;
68     raddstoreexpminusmax(elements * sizeof(uint16_t), x.data(), &x_max, y.data() + buffer_index * packed_elements, &y_sum, &params);
69   }
70 
71   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
72   if (cpu_frequency != 0) {
73     state.counters["cpufreq"] = cpu_frequency;
74   }
75 
76   const size_t elements_per_iteration = elements;
77   state.counters["elements"] =
78     benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
79 
80   const size_t bytes_per_iteration = 2 * elements * sizeof(uint16_t);
81   state.counters["bytes"] =
82     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
83 }
84 
#if XNN_ENABLE_ARM_FP16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
  // Registers the benchmark "f16_raddstoreexpminusmax/neonfp16arith_rr2_p2_<variant>"
  // for the NEON FP16 arithmetic kernel of the same variant. The macro is local to
  // this section and is undefined again once all variants are registered.
  #define XNN_BENCHMARK_F16_RADDSTOREEXPMINUSMAX_NEONFP16ARITH_RR2_P2(variant) \
    BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_##variant, \
                      xnn_f16_rmax_ukernel__neonfp16arith, \
                      xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_##variant, \
                      xnn_init_f16_expminus_neonfp16arith_rr2_p2_params, \
                      benchmark::utils::CheckNEONFP16ARITH) \
      ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>) \
      ->UseRealTime()

  XNN_BENCHMARK_F16_RADDSTOREEXPMINUSMAX_NEONFP16ARITH_RR2_P2(x32);
  XNN_BENCHMARK_F16_RADDSTOREEXPMINUSMAX_NEONFP16ARITH_RR2_P2(x32_acc2);
  XNN_BENCHMARK_F16_RADDSTOREEXPMINUSMAX_NEONFP16ARITH_RR2_P2(x32_acc4);
  XNN_BENCHMARK_F16_RADDSTOREEXPMINUSMAX_NEONFP16ARITH_RR2_P2(x40);
  XNN_BENCHMARK_F16_RADDSTOREEXPMINUSMAX_NEONFP16ARITH_RR2_P2(x40_acc2);
  XNN_BENCHMARK_F16_RADDSTOREEXPMINUSMAX_NEONFP16ARITH_RR2_P2(x40_acc5);
  XNN_BENCHMARK_F16_RADDSTOREEXPMINUSMAX_NEONFP16ARITH_RR2_P2(x48);
  XNN_BENCHMARK_F16_RADDSTOREEXPMINUSMAX_NEONFP16ARITH_RR2_P2(x48_acc2);
  XNN_BENCHMARK_F16_RADDSTOREEXPMINUSMAX_NEONFP16ARITH_RR2_P2(x48_acc3);
  XNN_BENCHMARK_F16_RADDSTOREEXPMINUSMAX_NEONFP16ARITH_RR2_P2(x64);
  XNN_BENCHMARK_F16_RADDSTOREEXPMINUSMAX_NEONFP16ARITH_RR2_P2(x64_acc2);
  XNN_BENCHMARK_F16_RADDSTOREEXPMINUSMAX_NEONFP16ARITH_RR2_P2(x64_acc4);
  XNN_BENCHMARK_F16_RADDSTOREEXPMINUSMAX_NEONFP16ARITH_RR2_P2(x72);
  XNN_BENCHMARK_F16_RADDSTOREEXPMINUSMAX_NEONFP16ARITH_RR2_P2(x72_acc3);
  XNN_BENCHMARK_F16_RADDSTOREEXPMINUSMAX_NEONFP16ARITH_RR2_P2(x80);
  XNN_BENCHMARK_F16_RADDSTOREEXPMINUSMAX_NEONFP16ARITH_RR2_P2(x80_acc2);
  XNN_BENCHMARK_F16_RADDSTOREEXPMINUSMAX_NEONFP16ARITH_RR2_P2(x80_acc5);
  XNN_BENCHMARK_F16_RADDSTOREEXPMINUSMAX_NEONFP16ARITH_RR2_P2(x96);
  XNN_BENCHMARK_F16_RADDSTOREEXPMINUSMAX_NEONFP16ARITH_RR2_P2(x96_acc2);
  XNN_BENCHMARK_F16_RADDSTOREEXPMINUSMAX_NEONFP16ARITH_RR2_P2(x96_acc3);
  XNN_BENCHMARK_F16_RADDSTOREEXPMINUSMAX_NEONFP16ARITH_RR2_P2(x96_acc6);

  #undef XNN_BENCHMARK_F16_RADDSTOREEXPMINUSMAX_NEONFP16ARITH_RR2_P2
#endif  // XNN_ENABLE_ARM_FP16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
234 
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // Registers the benchmark "f16_raddstoreexpminusmax/avx2_rr1_p2_<variant>" for
  // the AVX2 kernel of the same variant, paired with the F16C rmax kernel. The
  // macro is local to this section and is undefined again once all variants are
  // registered.
  #define XNN_BENCHMARK_F16_RADDSTOREEXPMINUSMAX_AVX2_RR1_P2(variant) \
    BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_##variant, \
                      xnn_f16_rmax_ukernel__f16c, \
                      xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_##variant, \
                      xnn_init_f16_expminus_avx2_rr1_p2_params, \
                      benchmark::utils::CheckAVX2) \
      ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>) \
      ->UseRealTime()

  XNN_BENCHMARK_F16_RADDSTOREEXPMINUSMAX_AVX2_RR1_P2(x32);
  XNN_BENCHMARK_F16_RADDSTOREEXPMINUSMAX_AVX2_RR1_P2(x32_acc2);
  XNN_BENCHMARK_F16_RADDSTOREEXPMINUSMAX_AVX2_RR1_P2(x32_acc4);
  XNN_BENCHMARK_F16_RADDSTOREEXPMINUSMAX_AVX2_RR1_P2(x40);
  XNN_BENCHMARK_F16_RADDSTOREEXPMINUSMAX_AVX2_RR1_P2(x40_acc2);
  XNN_BENCHMARK_F16_RADDSTOREEXPMINUSMAX_AVX2_RR1_P2(x40_acc5);
  XNN_BENCHMARK_F16_RADDSTOREEXPMINUSMAX_AVX2_RR1_P2(x48);
  XNN_BENCHMARK_F16_RADDSTOREEXPMINUSMAX_AVX2_RR1_P2(x48_acc2);
  XNN_BENCHMARK_F16_RADDSTOREEXPMINUSMAX_AVX2_RR1_P2(x48_acc3);
  XNN_BENCHMARK_F16_RADDSTOREEXPMINUSMAX_AVX2_RR1_P2(x64);
  XNN_BENCHMARK_F16_RADDSTOREEXPMINUSMAX_AVX2_RR1_P2(x64_acc2);
  XNN_BENCHMARK_F16_RADDSTOREEXPMINUSMAX_AVX2_RR1_P2(x64_acc4);
  XNN_BENCHMARK_F16_RADDSTOREEXPMINUSMAX_AVX2_RR1_P2(x72);
  XNN_BENCHMARK_F16_RADDSTOREEXPMINUSMAX_AVX2_RR1_P2(x72_acc3);
  XNN_BENCHMARK_F16_RADDSTOREEXPMINUSMAX_AVX2_RR1_P2(x80);
  XNN_BENCHMARK_F16_RADDSTOREEXPMINUSMAX_AVX2_RR1_P2(x80_acc2);
  XNN_BENCHMARK_F16_RADDSTOREEXPMINUSMAX_AVX2_RR1_P2(x80_acc5);
  XNN_BENCHMARK_F16_RADDSTOREEXPMINUSMAX_AVX2_RR1_P2(x96);
  XNN_BENCHMARK_F16_RADDSTOREEXPMINUSMAX_AVX2_RR1_P2(x96_acc2);
  XNN_BENCHMARK_F16_RADDSTOREEXPMINUSMAX_AVX2_RR1_P2(x96_acc3);
  XNN_BENCHMARK_F16_RADDSTOREEXPMINUSMAX_AVX2_RR1_P2(x96_acc6);

  #undef XNN_BENCHMARK_F16_RADDSTOREEXPMINUSMAX_AVX2_RR1_P2
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
384 
385 #ifndef XNNPACK_BENCHMARK_NO_MAIN
386 BENCHMARK_MAIN();
387 #endif
388