// Copyright 2022 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <random>
#include <vector>

#include <benchmark/benchmark.h>
#include <fp16/fp16.h>
#include "bench/utils.h"

#include <xnnpack.h>
#include <xnnpack/aligned-allocator.h>
#include <xnnpack/common.h>
#include <xnnpack/microfnptr.h>
#include <xnnpack/microparams-init.h>
#include <xnnpack/raddstoreexpminusmax.h>
#include <xnnpack/rmax.h>


f16_raddstoreexpminusmax(benchmark::State & state,xnn_f16_rmax_ukernel_function rmax,xnn_f16_raddstoreexpminusmax_ukernel_function raddstoreexpminusmax,xnn_init_f16_expminus_params_fn init_params,benchmark::utils::IsaCheckFunction isa_check=nullptr)25 static void f16_raddstoreexpminusmax(
26 benchmark::State& state,
27 xnn_f16_rmax_ukernel_function rmax,
28 xnn_f16_raddstoreexpminusmax_ukernel_function raddstoreexpminusmax,
29 xnn_init_f16_expminus_params_fn init_params,
30 benchmark::utils::IsaCheckFunction isa_check = nullptr)
31 {
32 if (isa_check && !isa_check(state)) {
33 return;
34 }
35
36 const size_t elements = state.range(0);
37 const size_t cache_line_size_max = 128;
38 const size_t packed_elements = benchmark::utils::RoundUp(elements, cache_line_size_max / sizeof(uint16_t));
39
40 std::random_device random_device;
41 auto rng = std::mt19937(random_device());
42 auto f32rng = std::bind(std::uniform_real_distribution<float>(-100.0f, 100.0f), std::ref(rng));
43 auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
44
45 const size_t num_buffers = 1 +
46 benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), packed_elements * sizeof(uint16_t));
47 std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> x(elements);
48 std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> y(packed_elements * num_buffers);
49
50 std::generate(x.begin(), x.end(), std::ref(f16rng));
51
52 benchmark::utils::DisableDenormals();
53
54 xnn_f16_expminus_params params;
55 init_params(¶ms);
56
57 size_t buffer_index = 0;
58 for (auto _ : state) {
59 state.PauseTiming();
60 uint16_t x_max = UINT16_C(0x7E00) /* NaN */;
61 rmax(elements * sizeof(uint16_t), x.data(), &x_max);
62 if (++buffer_index == num_buffers) {
63 buffer_index = 0;
64 }
65 state.ResumeTiming();
66
67 uint16_t y_sum = UINT16_C(0x7E00) /* NaN */;
68 raddstoreexpminusmax(elements * sizeof(uint16_t), x.data(), &x_max, y.data() + buffer_index * packed_elements, &y_sum, ¶ms);
69 }
70
71 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
72 if (cpu_frequency != 0) {
73 state.counters["cpufreq"] = cpu_frequency;
74 }
75
76 const size_t elements_per_iteration = elements;
77 state.counters["elements"] =
78 benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
79
80 const size_t bytes_per_iteration = 2 * elements * sizeof(uint16_t);
81 state.counters["bytes"] =
82 benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
83 }
84
// Registrations for the ARM NEON FP16-arithmetic kernel variants. Compiled only
// when XNN_ENABLE_ARM_FP16 is set on an ARM/ARM64 build. Every registration
// passes the same rmax kernel, params initializer and ISA check
// (CheckNEONFP16ARITH); only the kernel under test differs. The x<N> / acc<M>
// suffixes are part of the kernel names (presumably elements per loop step and
// number of accumulators -- confirm against the kernel sources).
#if XNN_ENABLE_ARM_FP16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
  BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x32,
                    xnn_f16_rmax_ukernel__neonfp16arith,
                    xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x32,
                    xnn_init_f16_expminus_neonfp16arith_rr2_p2_params,
                    benchmark::utils::CheckNEONFP16ARITH)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x32_acc2,
                    xnn_f16_rmax_ukernel__neonfp16arith,
                    xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x32_acc2,
                    xnn_init_f16_expminus_neonfp16arith_rr2_p2_params,
                    benchmark::utils::CheckNEONFP16ARITH)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x32_acc4,
                    xnn_f16_rmax_ukernel__neonfp16arith,
                    xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x32_acc4,
                    xnn_init_f16_expminus_neonfp16arith_rr2_p2_params,
                    benchmark::utils::CheckNEONFP16ARITH)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x40,
                    xnn_f16_rmax_ukernel__neonfp16arith,
                    xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x40,
                    xnn_init_f16_expminus_neonfp16arith_rr2_p2_params,
                    benchmark::utils::CheckNEONFP16ARITH)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x40_acc2,
                    xnn_f16_rmax_ukernel__neonfp16arith,
                    xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x40_acc2,
                    xnn_init_f16_expminus_neonfp16arith_rr2_p2_params,
                    benchmark::utils::CheckNEONFP16ARITH)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x40_acc5,
                    xnn_f16_rmax_ukernel__neonfp16arith,
                    xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x40_acc5,
                    xnn_init_f16_expminus_neonfp16arith_rr2_p2_params,
                    benchmark::utils::CheckNEONFP16ARITH)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x48,
                    xnn_f16_rmax_ukernel__neonfp16arith,
                    xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x48,
                    xnn_init_f16_expminus_neonfp16arith_rr2_p2_params,
                    benchmark::utils::CheckNEONFP16ARITH)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x48_acc2,
                    xnn_f16_rmax_ukernel__neonfp16arith,
                    xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x48_acc2,
                    xnn_init_f16_expminus_neonfp16arith_rr2_p2_params,
                    benchmark::utils::CheckNEONFP16ARITH)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x48_acc3,
                    xnn_f16_rmax_ukernel__neonfp16arith,
                    xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x48_acc3,
                    xnn_init_f16_expminus_neonfp16arith_rr2_p2_params,
                    benchmark::utils::CheckNEONFP16ARITH)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x64,
                    xnn_f16_rmax_ukernel__neonfp16arith,
                    xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x64,
                    xnn_init_f16_expminus_neonfp16arith_rr2_p2_params,
                    benchmark::utils::CheckNEONFP16ARITH)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x64_acc2,
                    xnn_f16_rmax_ukernel__neonfp16arith,
                    xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x64_acc2,
                    xnn_init_f16_expminus_neonfp16arith_rr2_p2_params,
                    benchmark::utils::CheckNEONFP16ARITH)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x64_acc4,
                    xnn_f16_rmax_ukernel__neonfp16arith,
                    xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x64_acc4,
                    xnn_init_f16_expminus_neonfp16arith_rr2_p2_params,
                    benchmark::utils::CheckNEONFP16ARITH)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x72,
                    xnn_f16_rmax_ukernel__neonfp16arith,
                    xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x72,
                    xnn_init_f16_expminus_neonfp16arith_rr2_p2_params,
                    benchmark::utils::CheckNEONFP16ARITH)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x72_acc3,
                    xnn_f16_rmax_ukernel__neonfp16arith,
                    xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x72_acc3,
                    xnn_init_f16_expminus_neonfp16arith_rr2_p2_params,
                    benchmark::utils::CheckNEONFP16ARITH)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x80,
                    xnn_f16_rmax_ukernel__neonfp16arith,
                    xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x80,
                    xnn_init_f16_expminus_neonfp16arith_rr2_p2_params,
                    benchmark::utils::CheckNEONFP16ARITH)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x80_acc2,
                    xnn_f16_rmax_ukernel__neonfp16arith,
                    xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x80_acc2,
                    xnn_init_f16_expminus_neonfp16arith_rr2_p2_params,
                    benchmark::utils::CheckNEONFP16ARITH)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x80_acc5,
                    xnn_f16_rmax_ukernel__neonfp16arith,
                    xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x80_acc5,
                    xnn_init_f16_expminus_neonfp16arith_rr2_p2_params,
                    benchmark::utils::CheckNEONFP16ARITH)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x96,
                    xnn_f16_rmax_ukernel__neonfp16arith,
                    xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x96,
                    xnn_init_f16_expminus_neonfp16arith_rr2_p2_params,
                    benchmark::utils::CheckNEONFP16ARITH)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x96_acc2,
                    xnn_f16_rmax_ukernel__neonfp16arith,
                    xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x96_acc2,
                    xnn_init_f16_expminus_neonfp16arith_rr2_p2_params,
                    benchmark::utils::CheckNEONFP16ARITH)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x96_acc3,
                    xnn_f16_rmax_ukernel__neonfp16arith,
                    xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x96_acc3,
                    xnn_init_f16_expminus_neonfp16arith_rr2_p2_params,
                    benchmark::utils::CheckNEONFP16ARITH)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x96_acc6,
                    xnn_f16_rmax_ukernel__neonfp16arith,
                    xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x96_acc6,
                    xnn_init_f16_expminus_neonfp16arith_rr2_p2_params,
                    benchmark::utils::CheckNEONFP16ARITH)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
    ->UseRealTime();
#endif  // XNN_ENABLE_ARM_FP16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64)

// Registrations for the x86 AVX2 kernel variants. Compiled only on x86/x86-64
// builds. Every registration passes the same rmax kernel
// (xnn_f16_rmax_ukernel__f16c), params initializer and ISA check (CheckAVX2);
// only the kernel under test differs. The x<N> / acc<M> suffixes are part of
// the kernel names (presumably elements per loop step and number of
// accumulators -- confirm against the kernel sources).
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x32,
                    xnn_f16_rmax_ukernel__f16c,
                    xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x32,
                    xnn_init_f16_expminus_avx2_rr1_p2_params,
                    benchmark::utils::CheckAVX2)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x32_acc2,
                    xnn_f16_rmax_ukernel__f16c,
                    xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x32_acc2,
                    xnn_init_f16_expminus_avx2_rr1_p2_params,
                    benchmark::utils::CheckAVX2)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x32_acc4,
                    xnn_f16_rmax_ukernel__f16c,
                    xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x32_acc4,
                    xnn_init_f16_expminus_avx2_rr1_p2_params,
                    benchmark::utils::CheckAVX2)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x40,
                    xnn_f16_rmax_ukernel__f16c,
                    xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x40,
                    xnn_init_f16_expminus_avx2_rr1_p2_params,
                    benchmark::utils::CheckAVX2)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x40_acc2,
                    xnn_f16_rmax_ukernel__f16c,
                    xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x40_acc2,
                    xnn_init_f16_expminus_avx2_rr1_p2_params,
                    benchmark::utils::CheckAVX2)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x40_acc5,
                    xnn_f16_rmax_ukernel__f16c,
                    xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x40_acc5,
                    xnn_init_f16_expminus_avx2_rr1_p2_params,
                    benchmark::utils::CheckAVX2)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x48,
                    xnn_f16_rmax_ukernel__f16c,
                    xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x48,
                    xnn_init_f16_expminus_avx2_rr1_p2_params,
                    benchmark::utils::CheckAVX2)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x48_acc2,
                    xnn_f16_rmax_ukernel__f16c,
                    xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x48_acc2,
                    xnn_init_f16_expminus_avx2_rr1_p2_params,
                    benchmark::utils::CheckAVX2)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x48_acc3,
                    xnn_f16_rmax_ukernel__f16c,
                    xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x48_acc3,
                    xnn_init_f16_expminus_avx2_rr1_p2_params,
                    benchmark::utils::CheckAVX2)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x64,
                    xnn_f16_rmax_ukernel__f16c,
                    xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x64,
                    xnn_init_f16_expminus_avx2_rr1_p2_params,
                    benchmark::utils::CheckAVX2)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x64_acc2,
                    xnn_f16_rmax_ukernel__f16c,
                    xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x64_acc2,
                    xnn_init_f16_expminus_avx2_rr1_p2_params,
                    benchmark::utils::CheckAVX2)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x64_acc4,
                    xnn_f16_rmax_ukernel__f16c,
                    xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x64_acc4,
                    xnn_init_f16_expminus_avx2_rr1_p2_params,
                    benchmark::utils::CheckAVX2)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x72,
                    xnn_f16_rmax_ukernel__f16c,
                    xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x72,
                    xnn_init_f16_expminus_avx2_rr1_p2_params,
                    benchmark::utils::CheckAVX2)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x72_acc3,
                    xnn_f16_rmax_ukernel__f16c,
                    xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x72_acc3,
                    xnn_init_f16_expminus_avx2_rr1_p2_params,
                    benchmark::utils::CheckAVX2)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x80,
                    xnn_f16_rmax_ukernel__f16c,
                    xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x80,
                    xnn_init_f16_expminus_avx2_rr1_p2_params,
                    benchmark::utils::CheckAVX2)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x80_acc2,
                    xnn_f16_rmax_ukernel__f16c,
                    xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x80_acc2,
                    xnn_init_f16_expminus_avx2_rr1_p2_params,
                    benchmark::utils::CheckAVX2)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x80_acc5,
                    xnn_f16_rmax_ukernel__f16c,
                    xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x80_acc5,
                    xnn_init_f16_expminus_avx2_rr1_p2_params,
                    benchmark::utils::CheckAVX2)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x96,
                    xnn_f16_rmax_ukernel__f16c,
                    xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x96,
                    xnn_init_f16_expminus_avx2_rr1_p2_params,
                    benchmark::utils::CheckAVX2)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x96_acc2,
                    xnn_f16_rmax_ukernel__f16c,
                    xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x96_acc2,
                    xnn_init_f16_expminus_avx2_rr1_p2_params,
                    benchmark::utils::CheckAVX2)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x96_acc3,
                    xnn_f16_rmax_ukernel__f16c,
                    xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x96_acc3,
                    xnn_init_f16_expminus_avx2_rr1_p2_params,
                    benchmark::utils::CheckAVX2)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x96_acc6,
                    xnn_f16_rmax_ukernel__f16c,
                    xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x96_acc6,
                    xnn_init_f16_expminus_avx2_rr1_p2_params,
                    benchmark::utils::CheckAVX2)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
    ->UseRealTime();
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64

385 #ifndef XNNPACK_BENCHMARK_NO_MAIN
386 BENCHMARK_MAIN();
387 #endif
388