• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2019 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5 
6 #include <algorithm>
7 #include <cmath>
8 #include <functional>
9 #include <random>
10 #include <vector>
11 
12 #include <benchmark/benchmark.h>
13 #include "bench/utils.h"
14 #include <xnnpack/AlignedAllocator.h>
15 #include <xnnpack/common.h>
16 #include <xnnpack/params.h>
17 #include <xnnpack/vunary.h>
18 
19 
f32_sigmoid(benchmark::State & state,xnn_f32_vunary_ukernel_function sigmoid)20 static void f32_sigmoid(
21   benchmark::State& state,
22   xnn_f32_vunary_ukernel_function sigmoid)
23 {
24   const size_t elements = state.range(0);
25 
26   std::random_device random_device;
27   auto rng = std::mt19937(random_device());
28   auto f32rng = std::bind(std::uniform_real_distribution<float>(-10.0f, 10.0f), rng);
29 
30   std::vector<float, AlignedAllocator<float, 64>> x(elements);
31   std::vector<float, AlignedAllocator<float, 64>> y(elements);
32   std::generate(x.begin(), x.end(), std::ref(f32rng));
33   std::fill(y.begin(), y.end(), std::nanf(""));
34 
35   for (auto _ : state) {
36     sigmoid(elements * sizeof(float), x.data(), y.data(), nullptr /* params */);
37   }
38 
39   state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
40 
41   const size_t elements_per_iteration = elements;
42   state.counters["elements"] =
43     benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
44 
45   const size_t bytes_per_iteration = 2 * elements * sizeof(float);
46   state.counters["bytes"] =
47     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
48 }
49 
#if XNN_ARCH_ARM64
  // NEON FMA kernels whose names end in "_div" (these registrations are
  // ARM64-only). Each one is registered under a label equal to its kernel
  // suffix, swept over 1000..1000000 elements with a range multiplier of 10,
  // and measured using real time.
  #define F32_SIGMOID_BENCHMARK(label, ukernel) \
    BENCHMARK_CAPTURE(f32_sigmoid, label, ukernel) \
      ->RangeMultiplier(10) \
      ->Range(1000, 1000000) \
      ->UseRealTime()

  // Degree-5 polynomial variants.
  F32_SIGMOID_BENCHMARK(neonfma_rr1_p5_div_x4, xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_div_x4);
  F32_SIGMOID_BENCHMARK(neonfma_rr1_p5_div_x8, xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_div_x8);
  F32_SIGMOID_BENCHMARK(neonfma_rr1_p5_div_x12, xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_div_x12);
  F32_SIGMOID_BENCHMARK(neonfma_rr1_p5_div_x16, xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_div_x16);
  F32_SIGMOID_BENCHMARK(neonfma_rr1_p5_div_x20, xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_div_x20);
  F32_SIGMOID_BENCHMARK(neonfma_rr1_p5_div_x24, xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_div_x24);

  // 64-entry lookup table variants.
  F32_SIGMOID_BENCHMARK(neonfma_rr1_lut64_p2_div_x4, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_div_x4);
  F32_SIGMOID_BENCHMARK(neonfma_rr1_lut64_p2_div_x8, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_div_x8);
  F32_SIGMOID_BENCHMARK(neonfma_rr1_lut64_p2_div_x12, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_div_x12);
  F32_SIGMOID_BENCHMARK(neonfma_rr1_lut64_p2_div_x16, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_div_x16);
  F32_SIGMOID_BENCHMARK(neonfma_rr1_lut64_p2_div_x20, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_div_x20);
  F32_SIGMOID_BENCHMARK(neonfma_rr1_lut64_p2_div_x24, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_div_x24);

  // 2048-entry lookup table variants.
  F32_SIGMOID_BENCHMARK(neonfma_rr1_lut2048_p1_div_x4, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_div_x4);
  F32_SIGMOID_BENCHMARK(neonfma_rr1_lut2048_p1_div_x8, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_div_x8);
  F32_SIGMOID_BENCHMARK(neonfma_rr1_lut2048_p1_div_x12, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_div_x12);
  F32_SIGMOID_BENCHMARK(neonfma_rr1_lut2048_p1_div_x16, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_div_x16);
  F32_SIGMOID_BENCHMARK(neonfma_rr1_lut2048_p1_div_x20, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_div_x20);
  F32_SIGMOID_BENCHMARK(neonfma_rr1_lut2048_p1_div_x24, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_div_x24);

  #undef F32_SIGMOID_BENCHMARK
#endif  // XNN_ARCH_ARM64
126 
127 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
128   BENCHMARK_CAPTURE(f32_sigmoid, neon_frac_p9_p10_nr1recps_x16, xnn_f32_sigmoid_ukernel__neon_frac_p9_p10_nr1recps_x16)
129     ->RangeMultiplier(10)
130     ->Range(1000, 1000000)
131     ->UseRealTime();
132 
133   BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_p5_nr2fma_x4, xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_nr2fma_x4)
134     ->RangeMultiplier(10)
135     ->Range(1000, 1000000)
136     ->UseRealTime();
137   BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_p5_nr2fma_x8, xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_nr2fma_x8)
138     ->RangeMultiplier(10)
139     ->Range(1000, 1000000)
140     ->UseRealTime();
141   BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_p5_nr2fma_x12, xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_nr2fma_x12)
142     ->RangeMultiplier(10)
143     ->Range(1000, 1000000)
144     ->UseRealTime();
145   BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_p5_nr2fma_x16, xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_nr2fma_x16)
146     ->RangeMultiplier(10)
147     ->Range(1000, 1000000)
148     ->UseRealTime();
149   BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_p5_nr2fma_x20, xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_nr2fma_x20)
150     ->RangeMultiplier(10)
151     ->Range(1000, 1000000)
152     ->UseRealTime();
153   BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_p5_nr2fma_x24, xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_nr2fma_x24)
154     ->RangeMultiplier(10)
155     ->Range(1000, 1000000)
156     ->UseRealTime();
157 
158   BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_p5_nr1recps1fma_x4, xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_nr1recps1fma_x4)
159     ->RangeMultiplier(10)
160     ->Range(1000, 1000000)
161     ->UseRealTime();
162   BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_p5_nr1recps1fma_x8, xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_nr1recps1fma_x8)
163     ->RangeMultiplier(10)
164     ->Range(1000, 1000000)
165     ->UseRealTime();
166   BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_p5_nr1recps1fma_x12, xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_nr1recps1fma_x12)
167     ->RangeMultiplier(10)
168     ->Range(1000, 1000000)
169     ->UseRealTime();
170   BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_p5_nr1recps1fma_x16, xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_nr1recps1fma_x16)
171     ->RangeMultiplier(10)
172     ->Range(1000, 1000000)
173     ->UseRealTime();
174   BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_p5_nr1recps1fma_x20, xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_nr1recps1fma_x20)
175     ->RangeMultiplier(10)
176     ->Range(1000, 1000000)
177     ->UseRealTime();
178   BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_p5_nr1recps1fma_x24, xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_nr1recps1fma_x24)
179     ->RangeMultiplier(10)
180     ->Range(1000, 1000000)
181     ->UseRealTime();
182 
183   BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_p5_nr2recps_x4, xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_nr2recps_x4)
184     ->RangeMultiplier(10)
185     ->Range(1000, 1000000)
186     ->UseRealTime();
187   BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_p5_nr2recps_x8, xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_nr2recps_x8)
188     ->RangeMultiplier(10)
189     ->Range(1000, 1000000)
190     ->UseRealTime();
191   BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_p5_nr2recps_x12, xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_nr2recps_x12)
192     ->RangeMultiplier(10)
193     ->Range(1000, 1000000)
194     ->UseRealTime();
195   BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_p5_nr2recps_x16, xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_nr2recps_x16)
196     ->RangeMultiplier(10)
197     ->Range(1000, 1000000)
198     ->UseRealTime();
199   BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_p5_nr2recps_x20, xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_nr2recps_x20)
200     ->RangeMultiplier(10)
201     ->Range(1000, 1000000)
202     ->UseRealTime();
203   BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_p5_nr2recps_x24, xnn_f32_sigmoid_ukernel__neonfma_rr1_p5_nr2recps_x24)
204     ->RangeMultiplier(10)
205     ->Range(1000, 1000000)
206     ->UseRealTime();
207 
208   BENCHMARK_CAPTURE(f32_sigmoid, neon_rr2_p5_nr2recps_x4, xnn_f32_sigmoid_ukernel__neon_rr2_p5_nr2recps_x4)
209     ->RangeMultiplier(10)
210     ->Range(1000, 1000000)
211     ->UseRealTime();
212   BENCHMARK_CAPTURE(f32_sigmoid, neon_rr2_p5_nr2recps_x8, xnn_f32_sigmoid_ukernel__neon_rr2_p5_nr2recps_x8)
213     ->RangeMultiplier(10)
214     ->Range(1000, 1000000)
215     ->UseRealTime();
216   BENCHMARK_CAPTURE(f32_sigmoid, neon_rr2_p5_nr2recps_x12, xnn_f32_sigmoid_ukernel__neon_rr2_p5_nr2recps_x12)
217     ->RangeMultiplier(10)
218     ->Range(1000, 1000000)
219     ->UseRealTime();
220   BENCHMARK_CAPTURE(f32_sigmoid, neon_rr2_p5_nr2recps_x16, xnn_f32_sigmoid_ukernel__neon_rr2_p5_nr2recps_x16)
221     ->RangeMultiplier(10)
222     ->Range(1000, 1000000)
223     ->UseRealTime();
224   BENCHMARK_CAPTURE(f32_sigmoid, neon_rr2_p5_nr2recps_x20, xnn_f32_sigmoid_ukernel__neon_rr2_p5_nr2recps_x20)
225     ->RangeMultiplier(10)
226     ->Range(1000, 1000000)
227     ->UseRealTime();
228   BENCHMARK_CAPTURE(f32_sigmoid, neon_rr2_p5_nr2recps_x24, xnn_f32_sigmoid_ukernel__neon_rr2_p5_nr2recps_x24)
229     ->RangeMultiplier(10)
230     ->Range(1000, 1000000)
231     ->UseRealTime();
232 
233   BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut64_p2_nr2fma_x4, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_nr2fma_x4)
234     ->RangeMultiplier(10)
235     ->Range(1000, 1000000)
236     ->UseRealTime();
237   BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut64_p2_nr2fma_x8, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_nr2fma_x8)
238     ->RangeMultiplier(10)
239     ->Range(1000, 1000000)
240     ->UseRealTime();
241   BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut64_p2_nr2fma_x12, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_nr2fma_x12)
242     ->RangeMultiplier(10)
243     ->Range(1000, 1000000)
244     ->UseRealTime();
245   BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut64_p2_nr2fma_x16, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_nr2fma_x16)
246     ->RangeMultiplier(10)
247     ->Range(1000, 1000000)
248     ->UseRealTime();
249   BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut64_p2_nr2fma_x20, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_nr2fma_x20)
250     ->RangeMultiplier(10)
251     ->Range(1000, 1000000)
252     ->UseRealTime();
253   BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut64_p2_nr2fma_x24, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_nr2fma_x24)
254     ->RangeMultiplier(10)
255     ->Range(1000, 1000000)
256     ->UseRealTime();
257 
258   BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut64_p2_nr1recps1fma_x4, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_nr1recps1fma_x4)
259     ->RangeMultiplier(10)
260     ->Range(1000, 1000000)
261     ->UseRealTime();
262   BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut64_p2_nr1recps1fma_x8, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_nr1recps1fma_x8)
263     ->RangeMultiplier(10)
264     ->Range(1000, 1000000)
265     ->UseRealTime();
266   BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut64_p2_nr1recps1fma_x12, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_nr1recps1fma_x12)
267     ->RangeMultiplier(10)
268     ->Range(1000, 1000000)
269     ->UseRealTime();
270   BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut64_p2_nr1recps1fma_x16, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_nr1recps1fma_x16)
271     ->RangeMultiplier(10)
272     ->Range(1000, 1000000)
273     ->UseRealTime();
274   BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut64_p2_nr1recps1fma_x20, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_nr1recps1fma_x20)
275     ->RangeMultiplier(10)
276     ->Range(1000, 1000000)
277     ->UseRealTime();
278   BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut64_p2_nr1recps1fma_x24, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_nr1recps1fma_x24)
279     ->RangeMultiplier(10)
280     ->Range(1000, 1000000)
281     ->UseRealTime();
282 
283   BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut64_p2_nr2recps_x4, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_nr2recps_x4)
284     ->RangeMultiplier(10)
285     ->Range(1000, 1000000)
286     ->UseRealTime();
287   BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut64_p2_nr2recps_x8, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_nr2recps_x8)
288     ->RangeMultiplier(10)
289     ->Range(1000, 1000000)
290     ->UseRealTime();
291   BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut64_p2_nr2recps_x12, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_nr2recps_x12)
292     ->RangeMultiplier(10)
293     ->Range(1000, 1000000)
294     ->UseRealTime();
295   BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut64_p2_nr2recps_x16, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_nr2recps_x16)
296     ->RangeMultiplier(10)
297     ->Range(1000, 1000000)
298     ->UseRealTime();
299   BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut64_p2_nr2recps_x20, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_nr2recps_x20)
300     ->RangeMultiplier(10)
301     ->Range(1000, 1000000)
302     ->UseRealTime();
303   BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut64_p2_nr2recps_x24, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_nr2recps_x24)
304     ->RangeMultiplier(10)
305     ->Range(1000, 1000000)
306     ->UseRealTime();
307 
308   BENCHMARK_CAPTURE(f32_sigmoid, neon_rr2_lut64_p2_nr2recps_x4, xnn_f32_sigmoid_ukernel__neon_rr2_lut64_p2_nr2recps_x4)
309     ->RangeMultiplier(10)
310     ->Range(1000, 1000000)
311     ->UseRealTime();
312   BENCHMARK_CAPTURE(f32_sigmoid, neon_rr2_lut64_p2_nr2recps_x8, xnn_f32_sigmoid_ukernel__neon_rr2_lut64_p2_nr2recps_x8)
313     ->RangeMultiplier(10)
314     ->Range(1000, 1000000)
315     ->UseRealTime();
316   BENCHMARK_CAPTURE(f32_sigmoid, neon_rr2_lut64_p2_nr2recps_x12, xnn_f32_sigmoid_ukernel__neon_rr2_lut64_p2_nr2recps_x12)
317     ->RangeMultiplier(10)
318     ->Range(1000, 1000000)
319     ->UseRealTime();
320   BENCHMARK_CAPTURE(f32_sigmoid, neon_rr2_lut64_p2_nr2recps_x16, xnn_f32_sigmoid_ukernel__neon_rr2_lut64_p2_nr2recps_x16)
321     ->RangeMultiplier(10)
322     ->Range(1000, 1000000)
323     ->UseRealTime();
324   BENCHMARK_CAPTURE(f32_sigmoid, neon_rr2_lut64_p2_nr2recps_x20, xnn_f32_sigmoid_ukernel__neon_rr2_lut64_p2_nr2recps_x20)
325     ->RangeMultiplier(10)
326     ->Range(1000, 1000000)
327     ->UseRealTime();
328   BENCHMARK_CAPTURE(f32_sigmoid, neon_rr2_lut64_p2_nr2recps_x24, xnn_f32_sigmoid_ukernel__neon_rr2_lut64_p2_nr2recps_x24)
329     ->RangeMultiplier(10)
330     ->Range(1000, 1000000)
331     ->UseRealTime();
332 
333   BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut2048_p1_nr2fma_x4, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_nr2fma_x4)
334     ->RangeMultiplier(10)
335     ->Range(1000, 1000000)
336     ->UseRealTime();
337   BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut2048_p1_nr2fma_x8, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_nr2fma_x8)
338     ->RangeMultiplier(10)
339     ->Range(1000, 1000000)
340     ->UseRealTime();
341   BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut2048_p1_nr2fma_x12, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_nr2fma_x12)
342     ->RangeMultiplier(10)
343     ->Range(1000, 1000000)
344     ->UseRealTime();
345   BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut2048_p1_nr2fma_x16, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_nr2fma_x16)
346     ->RangeMultiplier(10)
347     ->Range(1000, 1000000)
348     ->UseRealTime();
349   BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut2048_p1_nr2fma_x20, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_nr2fma_x20)
350     ->RangeMultiplier(10)
351     ->Range(1000, 1000000)
352     ->UseRealTime();
353   BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut2048_p1_nr2fma_x24, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_nr2fma_x24)
354     ->RangeMultiplier(10)
355     ->Range(1000, 1000000)
356     ->UseRealTime();
357 
358   BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut2048_p1_nr1recps1fma_x4, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_nr1recps1fma_x4)
359     ->RangeMultiplier(10)
360     ->Range(1000, 1000000)
361     ->UseRealTime();
362   BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut2048_p1_nr1recps1fma_x8, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_nr1recps1fma_x8)
363     ->RangeMultiplier(10)
364     ->Range(1000, 1000000)
365     ->UseRealTime();
366   BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut2048_p1_nr1recps1fma_x12, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_nr1recps1fma_x12)
367     ->RangeMultiplier(10)
368     ->Range(1000, 1000000)
369     ->UseRealTime();
370   BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut2048_p1_nr1recps1fma_x16, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_nr1recps1fma_x16)
371     ->RangeMultiplier(10)
372     ->Range(1000, 1000000)
373     ->UseRealTime();
374   BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut2048_p1_nr1recps1fma_x20, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_nr1recps1fma_x20)
375     ->RangeMultiplier(10)
376     ->Range(1000, 1000000)
377     ->UseRealTime();
378   BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut2048_p1_nr1recps1fma_x24, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_nr1recps1fma_x24)
379     ->RangeMultiplier(10)
380     ->Range(1000, 1000000)
381     ->UseRealTime();
382 
383   BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut2048_p1_nr2recps_x4, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_nr2recps_x4)
384     ->RangeMultiplier(10)
385     ->Range(1000, 1000000)
386     ->UseRealTime();
387   BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut2048_p1_nr2recps_x8, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_nr2recps_x8)
388     ->RangeMultiplier(10)
389     ->Range(1000, 1000000)
390     ->UseRealTime();
391   BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut2048_p1_nr2recps_x12, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_nr2recps_x12)
392     ->RangeMultiplier(10)
393     ->Range(1000, 1000000)
394     ->UseRealTime();
395   BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut2048_p1_nr2recps_x16, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_nr2recps_x16)
396     ->RangeMultiplier(10)
397     ->Range(1000, 1000000)
398     ->UseRealTime();
399   BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut2048_p1_nr2recps_x20, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_nr2recps_x20)
400     ->RangeMultiplier(10)
401     ->Range(1000, 1000000)
402     ->UseRealTime();
403   BENCHMARK_CAPTURE(f32_sigmoid, neonfma_rr1_lut2048_p1_nr2recps_x24, xnn_f32_sigmoid_ukernel__neonfma_rr1_lut2048_p1_nr2recps_x24)
404     ->RangeMultiplier(10)
405     ->Range(1000, 1000000)
406     ->UseRealTime();
407 
408   BENCHMARK_CAPTURE(f32_sigmoid, neon_rr2_lut2048_p1_nr2recps_x4, xnn_f32_sigmoid_ukernel__neon_rr2_lut2048_p1_nr2recps_x4)
409     ->RangeMultiplier(10)
410     ->Range(1000, 1000000)
411     ->UseRealTime();
412   BENCHMARK_CAPTURE(f32_sigmoid, neon_rr2_lut2048_p1_nr2recps_x8, xnn_f32_sigmoid_ukernel__neon_rr2_lut2048_p1_nr2recps_x8)
413     ->RangeMultiplier(10)
414     ->Range(1000, 1000000)
415     ->UseRealTime();
416   BENCHMARK_CAPTURE(f32_sigmoid, neon_rr2_lut2048_p1_nr2recps_x12, xnn_f32_sigmoid_ukernel__neon_rr2_lut2048_p1_nr2recps_x12)
417     ->RangeMultiplier(10)
418     ->Range(1000, 1000000)
419     ->UseRealTime();
420   BENCHMARK_CAPTURE(f32_sigmoid, neon_rr2_lut2048_p1_nr2recps_x16, xnn_f32_sigmoid_ukernel__neon_rr2_lut2048_p1_nr2recps_x16)
421     ->RangeMultiplier(10)
422     ->Range(1000, 1000000)
423     ->UseRealTime();
424   BENCHMARK_CAPTURE(f32_sigmoid, neon_rr2_lut2048_p1_nr2recps_x20, xnn_f32_sigmoid_ukernel__neon_rr2_lut2048_p1_nr2recps_x20)
425     ->RangeMultiplier(10)
426     ->Range(1000, 1000000)
427     ->UseRealTime();
428   BENCHMARK_CAPTURE(f32_sigmoid, neon_rr2_lut2048_p1_nr2recps_x24, xnn_f32_sigmoid_ukernel__neon_rr2_lut2048_p1_nr2recps_x24)
429     ->RangeMultiplier(10)
430     ->Range(1000, 1000000)
431     ->UseRealTime();
432 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
433 
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // x86 kernels (AVX2, SSE2, SSE4.1). Every registration sweeps 1000..1000000
  // elements with a range multiplier of 10 and is measured using real time.
  //
  // Benchmark labels repeat the kernel suffix verbatim, as every other
  // registration in this file does. The AVX2 labels therefore carry the
  // "rr1" component of the kernel name, so a reported result can be mapped
  // back to the exact kernel symbol.
  BENCHMARK_CAPTURE(f32_sigmoid, avx2_rr1_p5_div_x8, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_div_x8)
    ->RangeMultiplier(10)
    ->Range(1000, 1000000)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, avx2_rr1_p5_div_x16, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_div_x16)
    ->RangeMultiplier(10)
    ->Range(1000, 1000000)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, avx2_rr1_p5_div_x24, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_div_x24)
    ->RangeMultiplier(10)
    ->Range(1000, 1000000)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, avx2_rr1_p5_div_x32, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_div_x32)
    ->RangeMultiplier(10)
    ->Range(1000, 1000000)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, avx2_rr1_p5_div_x40, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_div_x40)
    ->RangeMultiplier(10)
    ->Range(1000, 1000000)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, avx2_rr1_p5_div_x48, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_div_x48)
    ->RangeMultiplier(10)
    ->Range(1000, 1000000)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, avx2_rr1_p5_div_x56, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_div_x56)
    ->RangeMultiplier(10)
    ->Range(1000, 1000000)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, avx2_rr1_p5_div_x64, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_div_x64)
    ->RangeMultiplier(10)
    ->Range(1000, 1000000)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, avx2_rr1_p5_div_x72, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_div_x72)
    ->RangeMultiplier(10)
    ->Range(1000, 1000000)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, avx2_rr1_p5_div_x80, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_div_x80)
    ->RangeMultiplier(10)
    ->Range(1000, 1000000)
    ->UseRealTime();

  BENCHMARK_CAPTURE(f32_sigmoid, avx2_rr1_p5_nr1fma_x8, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_nr1fma_x8)
    ->RangeMultiplier(10)
    ->Range(1000, 1000000)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, avx2_rr1_p5_nr1fma_x16, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_nr1fma_x16)
    ->RangeMultiplier(10)
    ->Range(1000, 1000000)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, avx2_rr1_p5_nr1fma_x24, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_nr1fma_x24)
    ->RangeMultiplier(10)
    ->Range(1000, 1000000)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, avx2_rr1_p5_nr1fma_x32, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_nr1fma_x32)
    ->RangeMultiplier(10)
    ->Range(1000, 1000000)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, avx2_rr1_p5_nr1fma_x40, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_nr1fma_x40)
    ->RangeMultiplier(10)
    ->Range(1000, 1000000)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, avx2_rr1_p5_nr1fma_x48, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_nr1fma_x48)
    ->RangeMultiplier(10)
    ->Range(1000, 1000000)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, avx2_rr1_p5_nr1fma_x56, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_nr1fma_x56)
    ->RangeMultiplier(10)
    ->Range(1000, 1000000)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, avx2_rr1_p5_nr1fma_x64, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_nr1fma_x64)
    ->RangeMultiplier(10)
    ->Range(1000, 1000000)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, avx2_rr1_p5_nr1fma_x72, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_nr1fma_x72)
    ->RangeMultiplier(10)
    ->Range(1000, 1000000)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, avx2_rr1_p5_nr1fma_x80, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_nr1fma_x80)
    ->RangeMultiplier(10)
    ->Range(1000, 1000000)
    ->UseRealTime();

  BENCHMARK_CAPTURE(f32_sigmoid, avx2_rr1_p5_nr2fma_x8, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_nr2fma_x8)
    ->RangeMultiplier(10)
    ->Range(1000, 1000000)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, avx2_rr1_p5_nr2fma_x16, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_nr2fma_x16)
    ->RangeMultiplier(10)
    ->Range(1000, 1000000)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, avx2_rr1_p5_nr2fma_x24, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_nr2fma_x24)
    ->RangeMultiplier(10)
    ->Range(1000, 1000000)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, avx2_rr1_p5_nr2fma_x32, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_nr2fma_x32)
    ->RangeMultiplier(10)
    ->Range(1000, 1000000)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, avx2_rr1_p5_nr2fma_x40, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_nr2fma_x40)
    ->RangeMultiplier(10)
    ->Range(1000, 1000000)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, avx2_rr1_p5_nr2fma_x48, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_nr2fma_x48)
    ->RangeMultiplier(10)
    ->Range(1000, 1000000)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, avx2_rr1_p5_nr2fma_x56, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_nr2fma_x56)
    ->RangeMultiplier(10)
    ->Range(1000, 1000000)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, avx2_rr1_p5_nr2fma_x64, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_nr2fma_x64)
    ->RangeMultiplier(10)
    ->Range(1000, 1000000)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, avx2_rr1_p5_nr2fma_x72, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_nr2fma_x72)
    ->RangeMultiplier(10)
    ->Range(1000, 1000000)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, avx2_rr1_p5_nr2fma_x80, xnn_f32_sigmoid_ukernel__avx2_rr1_p5_nr2fma_x80)
    ->RangeMultiplier(10)
    ->Range(1000, 1000000)
    ->UseRealTime();

  BENCHMARK_CAPTURE(f32_sigmoid, sse2_p5_div_x8, xnn_f32_sigmoid_ukernel__sse2_p5_div_x8)
    ->RangeMultiplier(10)
    ->Range(1000, 1000000)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, sse2_p5_div_x16, xnn_f32_sigmoid_ukernel__sse2_p5_div_x16)
    ->RangeMultiplier(10)
    ->Range(1000, 1000000)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, sse41_p5_div_x8, xnn_f32_sigmoid_ukernel__sse41_p5_div_x8)
    ->RangeMultiplier(10)
    ->Range(1000, 1000000)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_sigmoid, sse41_p5_div_x16, xnn_f32_sigmoid_ukernel__sse41_p5_div_x16)
    ->RangeMultiplier(10)
    ->Range(1000, 1000000)
    ->UseRealTime();
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
575 
576 #if !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
577   BENCHMARK_CAPTURE(f32_sigmoid, psimd_p5_div_x4, xnn_f32_sigmoid_ukernel__psimd_p5_div_x4)
578     ->RangeMultiplier(10)
579     ->Range(1000, 1000000)
580     ->UseRealTime();
581   BENCHMARK_CAPTURE(f32_sigmoid, psimd_p5_div_x8, xnn_f32_sigmoid_ukernel__psimd_p5_div_x8)
582     ->RangeMultiplier(10)
583     ->Range(1000, 1000000)
584     ->UseRealTime();
585   BENCHMARK_CAPTURE(f32_sigmoid, psimd_p5_div_x12, xnn_f32_sigmoid_ukernel__psimd_p5_div_x12)
586     ->RangeMultiplier(10)
587     ->Range(1000, 1000000)
588     ->UseRealTime();
589   BENCHMARK_CAPTURE(f32_sigmoid, psimd_p5_div_x16, xnn_f32_sigmoid_ukernel__psimd_p5_div_x16)
590     ->RangeMultiplier(10)
591     ->Range(1000, 1000000)
592     ->UseRealTime();
593   BENCHMARK_CAPTURE(f32_sigmoid, psimd_p5_div_x20, xnn_f32_sigmoid_ukernel__psimd_p5_div_x20)
594     ->RangeMultiplier(10)
595     ->Range(1000, 1000000)
596     ->UseRealTime();
597   BENCHMARK_CAPTURE(f32_sigmoid, psimd_p5_div_x24, xnn_f32_sigmoid_ukernel__psimd_p5_div_x24)
598     ->RangeMultiplier(10)
599     ->Range(1000, 1000000)
600     ->UseRealTime();
601 #endif  // !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
602 
603 BENCHMARK_CAPTURE(f32_sigmoid, scalar_lut2048_p1_div_x1, xnn_f32_sigmoid_ukernel__scalar_lut2048_p1_div_x1)
604   ->RangeMultiplier(10)
605   ->Range(1000, 1000000)
606   ->UseRealTime();
607 BENCHMARK_CAPTURE(f32_sigmoid, scalar_lut2048_p1_div_x2, xnn_f32_sigmoid_ukernel__scalar_lut2048_p1_div_x2)
608   ->RangeMultiplier(10)
609   ->Range(1000, 1000000)
610   ->UseRealTime();
611 BENCHMARK_CAPTURE(f32_sigmoid, scalar_lut2048_p1_div_x4, xnn_f32_sigmoid_ukernel__scalar_lut2048_p1_div_x4)
612   ->RangeMultiplier(10)
613   ->Range(1000, 1000000)
614   ->UseRealTime();
615 
616 BENCHMARK_CAPTURE(f32_sigmoid, scalar_lut64_p2_div_x1, xnn_f32_sigmoid_ukernel__scalar_lut64_p2_div_x1)
617   ->RangeMultiplier(10)
618   ->Range(1000, 1000000)
619   ->UseRealTime();
620 BENCHMARK_CAPTURE(f32_sigmoid, scalar_lut64_p2_div_x2, xnn_f32_sigmoid_ukernel__scalar_lut64_p2_div_x2)
621   ->RangeMultiplier(10)
622   ->Range(1000, 1000000)
623   ->UseRealTime();
624 BENCHMARK_CAPTURE(f32_sigmoid, scalar_lut64_p2_div_x4, xnn_f32_sigmoid_ukernel__scalar_lut64_p2_div_x4)
625   ->RangeMultiplier(10)
626   ->Range(1000, 1000000)
627   ->UseRealTime();
628 
629 BENCHMARK_CAPTURE(f32_sigmoid, scalar_p5_div_x1, xnn_f32_sigmoid_ukernel__scalar_p5_div_x1)
630   ->RangeMultiplier(10)
631   ->Range(1000, 1000000)
632   ->UseRealTime();
633 BENCHMARK_CAPTURE(f32_sigmoid, scalar_p5_div_x2, xnn_f32_sigmoid_ukernel__scalar_p5_div_x2)
634   ->RangeMultiplier(10)
635   ->Range(1000, 1000000)
636   ->UseRealTime();
637 BENCHMARK_CAPTURE(f32_sigmoid, scalar_p5_div_x4, xnn_f32_sigmoid_ukernel__scalar_p5_div_x4)
638   ->RangeMultiplier(10)
639   ->Range(1000, 1000000)
640   ->UseRealTime();
641 
642 #ifndef XNNPACK_BENCHMARK_NO_MAIN
643 BENCHMARK_MAIN();
644 #endif
645