// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
5
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <random>
#include <vector>

#include <benchmark/benchmark.h>
#include "bench/utils.h"
#include <xnnpack/AlignedAllocator.h>
#include <xnnpack/common.h>
#include <xnnpack/params.h>
#include <xnnpack/vunary.h>
18
19
f32_sigmoid(benchmark::State & state,xnn_f32_vunary_ukernel_function sigmoid)20 static void f32_sigmoid(
21 benchmark::State& state,
22 xnn_f32_vunary_ukernel_function sigmoid)
23 {
24 const size_t elements = state.range(0);
25
26 std::random_device random_device;
27 auto rng = std::mt19937(random_device());
28 auto f32rng = std::bind(std::uniform_real_distribution<float>(-10.0f, 10.0f), rng);
29
30 std::vector<float, AlignedAllocator<float, 64>> x(elements);
31 std::vector<float, AlignedAllocator<float, 64>> y(elements);
32 std::generate(x.begin(), x.end(), std::ref(f32rng));
33 std::fill(y.begin(), y.end(), std::nanf(""));
34
35 for (auto _ : state) {
36 sigmoid(elements * sizeof(float), x.data(), y.data(), nullptr /* params */);
37 }
38
39 state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
40
41 const size_t elements_per_iteration = elements;
42 state.counters["elements"] =
43 benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
44
45 const size_t bytes_per_iteration = 2 * elements * sizeof(float);
46 state.counters["bytes"] =
47 benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
48 }
49
#if XNN_ARCH_ARM64
  // Kernels registered only for ARM64 builds. Every registration runs
  // f32_sigmoid for 1000, 10000, 100000 and 1000000 elements and reports
  // wall-clock (real) time.

  // neonfma_rr1_p5_div family.
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_p5_div_x4, xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_div_x4)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_p5_div_x8, xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_div_x8)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_p5_div_x12, xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_div_x12)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_p5_div_x16, xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_div_x16)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_p5_div_x20, xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_div_x20)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_p5_div_x24, xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_div_x24)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();

  // neonfma_rr1_lut64_p2_div family.
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut64_p2_div_x4, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_div_x4)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut64_p2_div_x8, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_div_x8)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut64_p2_div_x12, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_div_x12)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut64_p2_div_x16, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_div_x16)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut64_p2_div_x20, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_div_x20)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut64_p2_div_x24, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_div_x24)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();

  // neonfma_rr1_lut2048_p1_div family.
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut2048_p1_div_x4, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_div_x4)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut2048_p1_div_x8, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_div_x8)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut2048_p1_div_x12, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_div_x12)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut2048_p1_div_x16, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_div_x16)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut2048_p1_div_x20, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_div_x20)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut2048_p1_div_x24, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_div_x24)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
#endif  // XNN_ARCH_ARM64
126
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // Kernels registered for both 32-bit ARM and ARM64 builds. Every
  // registration runs f32_sigmoid for 1000, 10000, 100000 and 1000000
  // elements and reports wall-clock (real) time.

  // neon_frac_p9_p10_nr1recps (single variant).
  BENCHMARK_CAPTURE(f32_sigmoid, neon_frac_p9_p10_nr1recps_x16, xnn_f32_sigmoid_ukernel__neon_frac_p9_p10_nr1recps_x16)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();

  // neonfma_rr1_p5_nr2fma family.
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_p5_nr2fma_x4, xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_nr2fma_x4)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_p5_nr2fma_x8, xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_nr2fma_x8)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_p5_nr2fma_x12, xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_nr2fma_x12)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_p5_nr2fma_x16, xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_nr2fma_x16)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_p5_nr2fma_x20, xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_nr2fma_x20)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_p5_nr2fma_x24, xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_nr2fma_x24)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();

  // neonfma_rr1_p5_nr1recps1fma family.
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_p5_nr1recps1fma_x4, xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_nr1recps1fma_x4)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_p5_nr1recps1fma_x8, xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_nr1recps1fma_x8)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_p5_nr1recps1fma_x12, xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_nr1recps1fma_x12)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_p5_nr1recps1fma_x16, xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_nr1recps1fma_x16)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_p5_nr1recps1fma_x20, xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_nr1recps1fma_x20)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_p5_nr1recps1fma_x24, xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_nr1recps1fma_x24)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();

  // neonfma_rr1_p5_nr2recps family.
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_p5_nr2recps_x4, xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_nr2recps_x4)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_p5_nr2recps_x8, xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_nr2recps_x8)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_p5_nr2recps_x12, xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_nr2recps_x12)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_p5_nr2recps_x16, xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_nr2recps_x16)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_p5_nr2recps_x20, xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_nr2recps_x20)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_p5_nr2recps_x24, xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_nr2recps_x24)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();

  // neon_rr2_p5_nr2recps family.
  BENCHMARK_CAPTURE(f32_sigmoid, neon_rr2_p5_nr2recps_x4, xnn_f32_sigmoid_ukernel__neon_rr2_p5_nr2recps_x4)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neon_rr2_p5_nr2recps_x8, xnn_f32_sigmoid_ukernel__neon_rr2_p5_nr2recps_x8)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neon_rr2_p5_nr2recps_x12, xnn_f32_sigmoid_ukernel__neon_rr2_p5_nr2recps_x12)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neon_rr2_p5_nr2recps_x16, xnn_f32_sigmoid_ukernel__neon_rr2_p5_nr2recps_x16)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neon_rr2_p5_nr2recps_x20, xnn_f32_sigmoid_ukernel__neon_rr2_p5_nr2recps_x20)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neon_rr2_p5_nr2recps_x24, xnn_f32_sigmoid_ukernel__neon_rr2_p5_nr2recps_x24)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();

  // neonfma_rr1_lut64_p2_nr2fma family.
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut64_p2_nr2fma_x4, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_nr2fma_x4)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut64_p2_nr2fma_x8, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_nr2fma_x8)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut64_p2_nr2fma_x12, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_nr2fma_x12)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut64_p2_nr2fma_x16, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_nr2fma_x16)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut64_p2_nr2fma_x20, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_nr2fma_x20)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut64_p2_nr2fma_x24, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_nr2fma_x24)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();

  // neonfma_rr1_lut64_p2_nr1recps1fma family.
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut64_p2_nr1recps1fma_x4, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_nr1recps1fma_x4)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut64_p2_nr1recps1fma_x8, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_nr1recps1fma_x8)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut64_p2_nr1recps1fma_x12, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_nr1recps1fma_x12)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut64_p2_nr1recps1fma_x16, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_nr1recps1fma_x16)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut64_p2_nr1recps1fma_x20, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_nr1recps1fma_x20)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut64_p2_nr1recps1fma_x24, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_nr1recps1fma_x24)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();

  // neonfma_rr1_lut64_p2_nr2recps family.
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut64_p2_nr2recps_x4, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_nr2recps_x4)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut64_p2_nr2recps_x8, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_nr2recps_x8)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut64_p2_nr2recps_x12, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_nr2recps_x12)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut64_p2_nr2recps_x16, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_nr2recps_x16)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut64_p2_nr2recps_x20, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_nr2recps_x20)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut64_p2_nr2recps_x24, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_nr2recps_x24)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();

  // neon_rr2_lut64_p2_nr2recps family.
  BENCHMARK_CAPTURE(f32_sigmoid, neon_rr2_lut64_p2_nr2recps_x4, xnn_f32_sigmoid_ukernel__neon_rr2_lut64_p2_nr2recps_x4)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neon_rr2_lut64_p2_nr2recps_x8, xnn_f32_sigmoid_ukernel__neon_rr2_lut64_p2_nr2recps_x8)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neon_rr2_lut64_p2_nr2recps_x12, xnn_f32_sigmoid_ukernel__neon_rr2_lut64_p2_nr2recps_x12)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neon_rr2_lut64_p2_nr2recps_x16, xnn_f32_sigmoid_ukernel__neon_rr2_lut64_p2_nr2recps_x16)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neon_rr2_lut64_p2_nr2recps_x20, xnn_f32_sigmoid_ukernel__neon_rr2_lut64_p2_nr2recps_x20)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neon_rr2_lut64_p2_nr2recps_x24, xnn_f32_sigmoid_ukernel__neon_rr2_lut64_p2_nr2recps_x24)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();

  // neonfma_rr1_lut2048_p1_nr2fma family.
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut2048_p1_nr2fma_x4, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_nr2fma_x4)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut2048_p1_nr2fma_x8, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_nr2fma_x8)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut2048_p1_nr2fma_x12, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_nr2fma_x12)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut2048_p1_nr2fma_x16, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_nr2fma_x16)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut2048_p1_nr2fma_x20, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_nr2fma_x20)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut2048_p1_nr2fma_x24, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_nr2fma_x24)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();

  // neonfma_rr1_lut2048_p1_nr1recps1fma family.
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut2048_p1_nr1recps1fma_x4, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_nr1recps1fma_x4)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut2048_p1_nr1recps1fma_x8, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_nr1recps1fma_x8)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut2048_p1_nr1recps1fma_x12, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_nr1recps1fma_x12)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut2048_p1_nr1recps1fma_x16, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_nr1recps1fma_x16)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut2048_p1_nr1recps1fma_x20, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_nr1recps1fma_x20)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut2048_p1_nr1recps1fma_x24, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_nr1recps1fma_x24)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();

  // neonfma_rr1_lut2048_p1_nr2recps family.
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut2048_p1_nr2recps_x4, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_nr2recps_x4)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut2048_p1_nr2recps_x8, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_nr2recps_x8)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut2048_p1_nr2recps_x12, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_nr2recps_x12)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut2048_p1_nr2recps_x16, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_nr2recps_x16)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut2048_p1_nr2recps_x20, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_nr2recps_x20)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut2048_p1_nr2recps_x24, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_nr2recps_x24)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();

  // neon_rr2_lut2048_p1_nr2recps family.
  BENCHMARK_CAPTURE(f32_sigmoid, neon_rr2_lut2048_p1_nr2recps_x4, xnn_f32_sigmoid_ukernel__neon_rr2_lut2048_p1_nr2recps_x4)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neon_rr2_lut2048_p1_nr2recps_x8, xnn_f32_sigmoid_ukernel__neon_rr2_lut2048_p1_nr2recps_x8)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neon_rr2_lut2048_p1_nr2recps_x12, xnn_f32_sigmoid_ukernel__neon_rr2_lut2048_p1_nr2recps_x12)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neon_rr2_lut2048_p1_nr2recps_x16, xnn_f32_sigmoid_ukernel__neon_rr2_lut2048_p1_nr2recps_x16)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neon_rr2_lut2048_p1_nr2recps_x20, xnn_f32_sigmoid_ukernel__neon_rr2_lut2048_p1_nr2recps_x20)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, neon_rr2_lut2048_p1_nr2recps_x24, xnn_f32_sigmoid_ukernel__neon_rr2_lut2048_p1_nr2recps_x24)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
433
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // Kernels registered for x86 and x86-64 builds. Every registration runs
  // f32_sigmoid for 1000, 10000, 100000 and 1000000 elements and reports
  // wall-clock (real) time.
  //
  // NOTE(review): the AVX2 benchmark labels below omit the `rr1` token that
  // appears in the corresponding kernel names (e.g. label `avx2_p5_div_x8` vs.
  // kernel `..._avx2_rr1_p5_div_x8`), unlike the NEON registrations in this
  // file. The labels are kept as-is so existing benchmark filters keep
  // matching; consider aligning them.
  // NOTE(review): nothing visible in this file checks at run time that the CPU
  // supports AVX2 / SSE4.1 before these kernels are invoked — confirm whether
  // that is handled elsewhere.

  // AVX2, p5_div family.
  BENCHMARK_CAPTURE(f32_sigmoid, avx2_p5_div_x8, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_div_x8)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, avx2_p5_div_x16, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_div_x16)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, avx2_p5_div_x24, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_div_x24)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, avx2_p5_div_x32, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_div_x32)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, avx2_p5_div_x40, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_div_x40)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, avx2_p5_div_x48, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_div_x48)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, avx2_p5_div_x56, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_div_x56)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, avx2_p5_div_x64, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_div_x64)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, avx2_p5_div_x72, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_div_x72)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, avx2_p5_div_x80, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_div_x80)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();

  // AVX2, p5_nr1fma family.
  BENCHMARK_CAPTURE(f32_sigmoid, avx2_p5_nr1fma_x8, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_nr1fma_x8)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, avx2_p5_nr1fma_x16, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_nr1fma_x16)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, avx2_p5_nr1fma_x24, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_nr1fma_x24)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, avx2_p5_nr1fma_x32, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_nr1fma_x32)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, avx2_p5_nr1fma_x40, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_nr1fma_x40)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, avx2_p5_nr1fma_x48, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_nr1fma_x48)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, avx2_p5_nr1fma_x56, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_nr1fma_x56)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, avx2_p5_nr1fma_x64, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_nr1fma_x64)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, avx2_p5_nr1fma_x72, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_nr1fma_x72)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, avx2_p5_nr1fma_x80, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_nr1fma_x80)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();

  // AVX2, p5_nr2fma family.
  BENCHMARK_CAPTURE(f32_sigmoid, avx2_p5_nr2fma_x8, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_nr2fma_x8)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, avx2_p5_nr2fma_x16, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_nr2fma_x16)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, avx2_p5_nr2fma_x24, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_nr2fma_x24)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, avx2_p5_nr2fma_x32, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_nr2fma_x32)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, avx2_p5_nr2fma_x40, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_nr2fma_x40)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, avx2_p5_nr2fma_x48, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_nr2fma_x48)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, avx2_p5_nr2fma_x56, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_nr2fma_x56)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, avx2_p5_nr2fma_x64, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_nr2fma_x64)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, avx2_p5_nr2fma_x72, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_nr2fma_x72)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, avx2_p5_nr2fma_x80, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_nr2fma_x80)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();

  // SSE2 and SSE4.1, p5_div kernels.
  BENCHMARK_CAPTURE(f32_sigmoid, sse2_p5_div_x8, xnn_f32_sigmoid_ukernel__sse2_p5_div_x8)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, sse2_p5_div_x16, xnn_f32_sigmoid_ukernel__sse2_p5_div_x16)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, sse41_p5_div_x8, xnn_f32_sigmoid_ukernel__sse41_p5_div_x8)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, sse41_p5_div_x16, xnn_f32_sigmoid_ukernel__sse41_p5_div_x16)
    ->RangeMultiplier(10)->Range(1000, 1000000)->UseRealTime();
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
575
576 #if !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
577 BENCHMARK_CAPTURE(f32_sigmoid, psimd_p5_div_x4, xnn_f32_sigmoid_ukernel__psimd_p5_div_x4)
578 ->RangeMultiplier(10)
579 ->Range(1000, 1000000)
580 ->UseRealTime();
581 BENCHMARK_CAPTURE(f32_sigmoid, psimd_p5_div_x8, xnn_f32_sigmoid_ukernel__psimd_p5_div_x8)
582 ->RangeMultiplier(10)
583 ->Range(1000, 1000000)
584 ->UseRealTime();
585 BENCHMARK_CAPTURE(f32_sigmoid, psimd_p5_div_x12, xnn_f32_sigmoid_ukernel__psimd_p5_div_x12)
586 ->RangeMultiplier(10)
587 ->Range(1000, 1000000)
588 ->UseRealTime();
589 BENCHMARK_CAPTURE(f32_sigmoid, psimd_p5_div_x16, xnn_f32_sigmoid_ukernel__psimd_p5_div_x16)
590 ->RangeMultiplier(10)
591 ->Range(1000, 1000000)
592 ->UseRealTime();
593 BENCHMARK_CAPTURE(f32_sigmoid, psimd_p5_div_x20, xnn_f32_sigmoid_ukernel__psimd_p5_div_x20)
594 ->RangeMultiplier(10)
595 ->Range(1000, 1000000)
596 ->UseRealTime();
597 BENCHMARK_CAPTURE(f32_sigmoid, psimd_p5_div_x24, xnn_f32_sigmoid_ukernel__psimd_p5_div_x24)
598 ->RangeMultiplier(10)
599 ->Range(1000, 1000000)
600 ->UseRealTime();
601 #endif // !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
602
603 BENCHMARK_CAPTURE(f32_sigmoid, scalar_lut2048_p1_div_x1, xnn_f32_sigmoid_ukernel__scalar_lut2048_p1_div_x1)
604 ->RangeMultiplier(10)
605 ->Range(1000, 1000000)
606 ->UseRealTime();
607 BENCHMARK_CAPTURE(f32_sigmoid, scalar_lut2048_p1_div_x2, xnn_f32_sigmoid_ukernel__scalar_lut2048_p1_div_x2)
608 ->RangeMultiplier(10)
609 ->Range(1000, 1000000)
610 ->UseRealTime();
611 BENCHMARK_CAPTURE(f32_sigmoid, scalar_lut2048_p1_div_x4, xnn_f32_sigmoid_ukernel__scalar_lut2048_p1_div_x4)
612 ->RangeMultiplier(10)
613 ->Range(1000, 1000000)
614 ->UseRealTime();
615
616 BENCHMARK_CAPTURE(f32_sigmoid, scalar_lut64_p2_div_x1, xnn_f32_sigmoid_ukernel__scalar_lut64_p2_div_x1)
617 ->RangeMultiplier(10)
618 ->Range(1000, 1000000)
619 ->UseRealTime();
620 BENCHMARK_CAPTURE(f32_sigmoid, scalar_lut64_p2_div_x2, xnn_f32_sigmoid_ukernel__scalar_lut64_p2_div_x2)
621 ->RangeMultiplier(10)
622 ->Range(1000, 1000000)
623 ->UseRealTime();
624 BENCHMARK_CAPTURE(f32_sigmoid, scalar_lut64_p2_div_x4, xnn_f32_sigmoid_ukernel__scalar_lut64_p2_div_x4)
625 ->RangeMultiplier(10)
626 ->Range(1000, 1000000)
627 ->UseRealTime();
628
629 BENCHMARK_CAPTURE(f32_sigmoid, scalar_p5_div_x1, xnn_f32_sigmoid_ukernel__scalar_p5_div_x1)
630 ->RangeMultiplier(10)
631 ->Range(1000, 1000000)
632 ->UseRealTime();
633 BENCHMARK_CAPTURE(f32_sigmoid, scalar_p5_div_x2, xnn_f32_sigmoid_ukernel__scalar_p5_div_x2)
634 ->RangeMultiplier(10)
635 ->Range(1000, 1000000)
636 ->UseRealTime();
637 BENCHMARK_CAPTURE(f32_sigmoid, scalar_p5_div_x4, xnn_f32_sigmoid_ukernel__scalar_p5_div_x4)
638 ->RangeMultiplier(10)
639 ->Range(1000, 1000000)
640 ->UseRealTime();
641
642 #ifndef XNNPACK_BENCHMARK_NO_MAIN
643 BENCHMARK_MAIN();
644 #endif
645