• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2021 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5 
6 #include <algorithm>
7 #include <cmath>
8 #include <functional>
9 #include <random>
10 #include <vector>
11 
12 #include <benchmark/benchmark.h>
13 #include "bench/utils.h"
14 
15 #include <xnnpack/AlignedAllocator.h>
16 #include <xnnpack/common.h>
17 #include <xnnpack/params.h>
18 #include <xnnpack/params-init.h>
19 #include <xnnpack/vaddsub.h>
20 
21 
qs8_vaddc(benchmark::State & state,xnn_qs8_vaddsub_minmax_ukernel_function vaddc,xnn_init_qs8_addsub_minmax_params_fn init_params,benchmark::utils::IsaCheckFunction isa_check=nullptr)22 static void qs8_vaddc(
23   benchmark::State& state,
24   xnn_qs8_vaddsub_minmax_ukernel_function vaddc,
25   xnn_init_qs8_addsub_minmax_params_fn init_params,
26   benchmark::utils::IsaCheckFunction isa_check = nullptr)
27 {
28   if (isa_check && !isa_check(state)) {
29     return;
30   }
31 
32   const size_t num_elements = state.range(0);
33 
34   std::random_device random_device;
35   auto rng = std::mt19937(random_device());
36   auto i8rng = std::bind(
37     std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()),
38     std::ref(rng));
39 
40   std::vector<int8_t, AlignedAllocator<int8_t, 64>> a(num_elements);
41   std::vector<int8_t, AlignedAllocator<int8_t, 64>> sum(num_elements);
42   std::generate(a.begin(), a.end(), std::ref(i8rng));
43   const int8_t b = i8rng();
44 
45   union xnn_qs8_addsub_minmax_params params;
46   init_params(&params,
47     1 /* a zero point */, 1 /* b zero point */, 1 /* output zero point */,
48     0.5f /* a-output scale */, 0.75f /* b-output scale */,
49     std::numeric_limits<int8_t>::min() + 1, std::numeric_limits<int8_t>::max() - 1);
50   for (auto _ : state) {
51     vaddc(num_elements * sizeof(int8_t), a.data(), &b, sum.data(), &params);
52   }
53 
54   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
55   if (cpu_frequency != 0) {
56     state.counters["cpufreq"] = cpu_frequency;
57   }
58 
59   const size_t num_elements_per_iteration = num_elements;
60   state.counters["num_elements"] =
61     benchmark::Counter(uint64_t(state.iterations()) * num_elements_per_iteration, benchmark::Counter::kIsRate);
62 
63   const size_t bytes_per_iteration = 2 * num_elements * sizeof(int8_t);
64   state.counters["bytes"] =
65     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
66 }
67 
// NEON micro-kernels: registered only when building for ARM or ARM64. Every
// entry passes benchmark::utils::CheckNEON as the ISA check of qs8_vaddc.
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // neon_ld64 variants.
  BENCHMARK_CAPTURE(
    qs8_vaddc, neon_ld64_x8,
    xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x8,
    xnn_init_qs8_add_minmax_neon_params,
    benchmark::utils::CheckNEON)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
      ->UseRealTime();
  BENCHMARK_CAPTURE(
    qs8_vaddc, neon_ld64_x16,
    xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x16,
    xnn_init_qs8_add_minmax_neon_params,
    benchmark::utils::CheckNEON)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
      ->UseRealTime();
  BENCHMARK_CAPTURE(
    qs8_vaddc, neon_ld64_x24,
    xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x24,
    xnn_init_qs8_add_minmax_neon_params,
    benchmark::utils::CheckNEON)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
      ->UseRealTime();
  BENCHMARK_CAPTURE(
    qs8_vaddc, neon_ld64_x32,
    xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x32,
    xnn_init_qs8_add_minmax_neon_params,
    benchmark::utils::CheckNEON)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
      ->UseRealTime();

  // neon_ld128 variants.
  BENCHMARK_CAPTURE(
    qs8_vaddc, neon_ld128_x16,
    xnn_qs8_vaddc_minmax_ukernel__neon_ld128_x16,
    xnn_init_qs8_add_minmax_neon_params,
    benchmark::utils::CheckNEON)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
      ->UseRealTime();
  BENCHMARK_CAPTURE(
    qs8_vaddc, neon_ld128_x32,
    xnn_qs8_vaddc_minmax_ukernel__neon_ld128_x32,
    xnn_init_qs8_add_minmax_neon_params,
    benchmark::utils::CheckNEON)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
      ->UseRealTime();
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
107 
// x86 / x86-64 micro-kernels. Each group pairs its kernels with the matching
// params initializer and, except for the SSE2 group, an ISA check.
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // avx512skx_mul32_ld128 variants (checked with CheckAVX512SKX).
  BENCHMARK_CAPTURE(
    qs8_vaddc, avx512skx_mul32_ld128_x16,
    xnn_qs8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_x16,
    xnn_init_qs8_add_minmax_avx512_params,
    benchmark::utils::CheckAVX512SKX)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
      ->UseRealTime();
  BENCHMARK_CAPTURE(
    qs8_vaddc, avx512skx_mul32_ld128_x32,
    xnn_qs8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_x32,
    xnn_init_qs8_add_minmax_avx512_params,
    benchmark::utils::CheckAVX512SKX)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
      ->UseRealTime();

  // avx2_mul32_ld64 variants (checked with CheckAVX2).
  BENCHMARK_CAPTURE(
    qs8_vaddc, avx2_mul32_ld64_x8,
    xnn_qs8_vaddc_minmax_ukernel__avx2_mul32_ld64_x8,
    xnn_init_qs8_add_minmax_avx2_params,
    benchmark::utils::CheckAVX2)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
      ->UseRealTime();
  BENCHMARK_CAPTURE(
    qs8_vaddc, avx2_mul32_ld64_x16,
    xnn_qs8_vaddc_minmax_ukernel__avx2_mul32_ld64_x16,
    xnn_init_qs8_add_minmax_avx2_params,
    benchmark::utils::CheckAVX2)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
      ->UseRealTime();
  BENCHMARK_CAPTURE(
    qs8_vaddc, avx2_mul32_ld64_x24,
    xnn_qs8_vaddc_minmax_ukernel__avx2_mul32_ld64_x24,
    xnn_init_qs8_add_minmax_avx2_params,
    benchmark::utils::CheckAVX2)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
      ->UseRealTime();
  BENCHMARK_CAPTURE(
    qs8_vaddc, avx2_mul32_ld64_x32,
    xnn_qs8_vaddc_minmax_ukernel__avx2_mul32_ld64_x32,
    xnn_init_qs8_add_minmax_avx2_params,
    benchmark::utils::CheckAVX2)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
      ->UseRealTime();

  // xop_mul32_ld32 variants (checked with CheckXOP).
  BENCHMARK_CAPTURE(
    qs8_vaddc, xop_mul32_ld32_x8,
    xnn_qs8_vaddc_minmax_ukernel__xop_mul32_ld32_x8,
    xnn_init_qs8_add_minmax_sse4_mul32_params,
    benchmark::utils::CheckXOP)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
      ->UseRealTime();
  BENCHMARK_CAPTURE(
    qs8_vaddc, xop_mul32_ld32_x16,
    xnn_qs8_vaddc_minmax_ukernel__xop_mul32_ld32_x16,
    xnn_init_qs8_add_minmax_sse4_mul32_params,
    benchmark::utils::CheckXOP)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
      ->UseRealTime();
  BENCHMARK_CAPTURE(
    qs8_vaddc, xop_mul32_ld32_x24,
    xnn_qs8_vaddc_minmax_ukernel__xop_mul32_ld32_x24,
    xnn_init_qs8_add_minmax_sse4_mul32_params,
    benchmark::utils::CheckXOP)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
      ->UseRealTime();
  BENCHMARK_CAPTURE(
    qs8_vaddc, xop_mul32_ld32_x32,
    xnn_qs8_vaddc_minmax_ukernel__xop_mul32_ld32_x32,
    xnn_init_qs8_add_minmax_sse4_mul32_params,
    benchmark::utils::CheckXOP)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
      ->UseRealTime();

  // avx_mul16_ld64 variants (checked with CheckAVX).
  BENCHMARK_CAPTURE(
    qs8_vaddc, avx_mul16_ld64_x8,
    xnn_qs8_vaddc_minmax_ukernel__avx_mul16_ld64_x8,
    xnn_init_qs8_add_minmax_sse4_mul16_params,
    benchmark::utils::CheckAVX)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
      ->UseRealTime();
  BENCHMARK_CAPTURE(
    qs8_vaddc, avx_mul16_ld64_x16,
    xnn_qs8_vaddc_minmax_ukernel__avx_mul16_ld64_x16,
    xnn_init_qs8_add_minmax_sse4_mul16_params,
    benchmark::utils::CheckAVX)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
      ->UseRealTime();
  BENCHMARK_CAPTURE(
    qs8_vaddc, avx_mul16_ld64_x24,
    xnn_qs8_vaddc_minmax_ukernel__avx_mul16_ld64_x24,
    xnn_init_qs8_add_minmax_sse4_mul16_params,
    benchmark::utils::CheckAVX)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
      ->UseRealTime();
  BENCHMARK_CAPTURE(
    qs8_vaddc, avx_mul16_ld64_x32,
    xnn_qs8_vaddc_minmax_ukernel__avx_mul16_ld64_x32,
    xnn_init_qs8_add_minmax_sse4_mul16_params,
    benchmark::utils::CheckAVX)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
      ->UseRealTime();

  // avx_mul32_ld32 variants (checked with CheckAVX).
  BENCHMARK_CAPTURE(
    qs8_vaddc, avx_mul32_ld32_x8,
    xnn_qs8_vaddc_minmax_ukernel__avx_mul32_ld32_x8,
    xnn_init_qs8_add_minmax_sse4_mul32_params,
    benchmark::utils::CheckAVX)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
      ->UseRealTime();
  BENCHMARK_CAPTURE(
    qs8_vaddc, avx_mul32_ld32_x16,
    xnn_qs8_vaddc_minmax_ukernel__avx_mul32_ld32_x16,
    xnn_init_qs8_add_minmax_sse4_mul32_params,
    benchmark::utils::CheckAVX)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
      ->UseRealTime();
  BENCHMARK_CAPTURE(
    qs8_vaddc, avx_mul32_ld32_x24,
    xnn_qs8_vaddc_minmax_ukernel__avx_mul32_ld32_x24,
    xnn_init_qs8_add_minmax_sse4_mul32_params,
    benchmark::utils::CheckAVX)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
      ->UseRealTime();
  BENCHMARK_CAPTURE(
    qs8_vaddc, avx_mul32_ld32_x32,
    xnn_qs8_vaddc_minmax_ukernel__avx_mul32_ld32_x32,
    xnn_init_qs8_add_minmax_sse4_mul32_params,
    benchmark::utils::CheckAVX)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
      ->UseRealTime();

  // sse41_mul16_ld64 variants (checked with CheckSSE41).
  BENCHMARK_CAPTURE(
    qs8_vaddc, sse41_mul16_ld64_x8,
    xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8,
    xnn_init_qs8_add_minmax_sse4_mul16_params,
    benchmark::utils::CheckSSE41)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
      ->UseRealTime();
  BENCHMARK_CAPTURE(
    qs8_vaddc, sse41_mul16_ld64_x16,
    xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x16,
    xnn_init_qs8_add_minmax_sse4_mul16_params,
    benchmark::utils::CheckSSE41)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
      ->UseRealTime();
  BENCHMARK_CAPTURE(
    qs8_vaddc, sse41_mul16_ld64_x24,
    xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x24,
    xnn_init_qs8_add_minmax_sse4_mul16_params,
    benchmark::utils::CheckSSE41)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
      ->UseRealTime();
  BENCHMARK_CAPTURE(
    qs8_vaddc, sse41_mul16_ld64_x32,
    xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x32,
    xnn_init_qs8_add_minmax_sse4_mul16_params,
    benchmark::utils::CheckSSE41)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
      ->UseRealTime();

  // sse41_mul32_ld32 variants (checked with CheckSSE41).
  BENCHMARK_CAPTURE(
    qs8_vaddc, sse41_mul32_ld32_x8,
    xnn_qs8_vaddc_minmax_ukernel__sse41_mul32_ld32_x8,
    xnn_init_qs8_add_minmax_sse4_mul32_params,
    benchmark::utils::CheckSSE41)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
      ->UseRealTime();
  BENCHMARK_CAPTURE(
    qs8_vaddc, sse41_mul32_ld32_x16,
    xnn_qs8_vaddc_minmax_ukernel__sse41_mul32_ld32_x16,
    xnn_init_qs8_add_minmax_sse4_mul32_params,
    benchmark::utils::CheckSSE41)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
      ->UseRealTime();
  BENCHMARK_CAPTURE(
    qs8_vaddc, sse41_mul32_ld32_x24,
    xnn_qs8_vaddc_minmax_ukernel__sse41_mul32_ld32_x24,
    xnn_init_qs8_add_minmax_sse4_mul32_params,
    benchmark::utils::CheckSSE41)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
      ->UseRealTime();
  BENCHMARK_CAPTURE(
    qs8_vaddc, sse41_mul32_ld32_x32,
    xnn_qs8_vaddc_minmax_ukernel__sse41_mul32_ld32_x32,
    xnn_init_qs8_add_minmax_sse4_mul32_params,
    benchmark::utils::CheckSSE41)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
      ->UseRealTime();

  // sse2_mul16_ld64 variants: no ISA check is passed, so qs8_vaddc runs them
  // unconditionally.
  BENCHMARK_CAPTURE(
    qs8_vaddc, sse2_mul16_ld64_x8,
    xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8,
    xnn_init_qs8_add_minmax_sse2_params)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
      ->UseRealTime();
  BENCHMARK_CAPTURE(
    qs8_vaddc, sse2_mul16_ld64_x16,
    xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x16,
    xnn_init_qs8_add_minmax_sse2_params)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
      ->UseRealTime();
  BENCHMARK_CAPTURE(
    qs8_vaddc, sse2_mul16_ld64_x24,
    xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x24,
    xnn_init_qs8_add_minmax_sse2_params)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
      ->UseRealTime();
  BENCHMARK_CAPTURE(
    qs8_vaddc, sse2_mul16_ld64_x32,
    xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x32,
    xnn_init_qs8_add_minmax_sse2_params)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
      ->UseRealTime();
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
293 
// WebAssembly SIMD micro-kernels: registered for WAsm SIMD and WAsm Relaxed
// SIMD builds. No ISA check is passed, so qs8_vaddc runs them unconditionally.
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  BENCHMARK_CAPTURE(
    qs8_vaddc, wasmsimd_x8,
    xnn_qs8_vaddc_minmax_ukernel__wasmsimd_x8,
    xnn_init_qs8_add_minmax_wasmsimd_params)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
      ->UseRealTime();
  BENCHMARK_CAPTURE(
    qs8_vaddc, wasmsimd_x16,
    xnn_qs8_vaddc_minmax_ukernel__wasmsimd_x16,
    xnn_init_qs8_add_minmax_wasmsimd_params)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
      ->UseRealTime();
  BENCHMARK_CAPTURE(
    qs8_vaddc, wasmsimd_x24,
    xnn_qs8_vaddc_minmax_ukernel__wasmsimd_x24,
    xnn_init_qs8_add_minmax_wasmsimd_params)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
      ->UseRealTime();
  BENCHMARK_CAPTURE(
    qs8_vaddc, wasmsimd_x32,
    xnn_qs8_vaddc_minmax_ukernel__wasmsimd_x32,
    xnn_init_qs8_add_minmax_wasmsimd_params)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
      ->UseRealTime();
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
316 
317 BENCHMARK_CAPTURE(qs8_vaddc, scalar_x1,
318                   xnn_qs8_vaddc_minmax_ukernel__scalar_x1,
319                   xnn_init_qs8_add_minmax_scalar_params)
320   ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
321   ->UseRealTime();
322 BENCHMARK_CAPTURE(qs8_vaddc, scalar_x2,
323                   xnn_qs8_vaddc_minmax_ukernel__scalar_x2,
324                   xnn_init_qs8_add_minmax_scalar_params)
325   ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
326   ->UseRealTime();
327 BENCHMARK_CAPTURE(qs8_vaddc, scalar_x4,
328                   xnn_qs8_vaddc_minmax_ukernel__scalar_x4,
329                   xnn_init_qs8_add_minmax_scalar_params)
330   ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
331   ->UseRealTime();
332 
333 #ifndef XNNPACK_BENCHMARK_NO_MAIN
334 BENCHMARK_MAIN();
335 #endif
336