1 // Copyright 2021 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5
6 #include <algorithm>
7 #include <cmath>
8 #include <functional>
9 #include <random>
10 #include <vector>
11
12 #include <benchmark/benchmark.h>
13 #include "bench/utils.h"
14
15 #include <xnnpack/AlignedAllocator.h>
16 #include <xnnpack/common.h>
17 #include <xnnpack/params.h>
18 #include <xnnpack/params-init.h>
19 #include <xnnpack/vaddsub.h>
20
21
qs8_vaddc(benchmark::State & state,xnn_qs8_vaddsub_minmax_ukernel_function vaddc,xnn_init_qs8_addsub_minmax_params_fn init_params,benchmark::utils::IsaCheckFunction isa_check=nullptr)22 static void qs8_vaddc(
23 benchmark::State& state,
24 xnn_qs8_vaddsub_minmax_ukernel_function vaddc,
25 xnn_init_qs8_addsub_minmax_params_fn init_params,
26 benchmark::utils::IsaCheckFunction isa_check = nullptr)
27 {
28 if (isa_check && !isa_check(state)) {
29 return;
30 }
31
32 const size_t num_elements = state.range(0);
33
34 std::random_device random_device;
35 auto rng = std::mt19937(random_device());
36 auto i8rng = std::bind(
37 std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()),
38 std::ref(rng));
39
40 std::vector<int8_t, AlignedAllocator<int8_t, 64>> a(num_elements);
41 std::vector<int8_t, AlignedAllocator<int8_t, 64>> sum(num_elements);
42 std::generate(a.begin(), a.end(), std::ref(i8rng));
43 const int8_t b = i8rng();
44
45 union xnn_qs8_addsub_minmax_params params;
46 init_params(¶ms,
47 1 /* a zero point */, 1 /* b zero point */, 1 /* output zero point */,
48 0.5f /* a-output scale */, 0.75f /* b-output scale */,
49 std::numeric_limits<int8_t>::min() + 1, std::numeric_limits<int8_t>::max() - 1);
50 for (auto _ : state) {
51 vaddc(num_elements * sizeof(int8_t), a.data(), &b, sum.data(), ¶ms);
52 }
53
54 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
55 if (cpu_frequency != 0) {
56 state.counters["cpufreq"] = cpu_frequency;
57 }
58
59 const size_t num_elements_per_iteration = num_elements;
60 state.counters["num_elements"] =
61 benchmark::Counter(uint64_t(state.iterations()) * num_elements_per_iteration, benchmark::Counter::kIsRate);
62
63 const size_t bytes_per_iteration = 2 * num_elements * sizeof(int8_t);
64 state.counters["bytes"] =
65 benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
66 }
67
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // NEON kernels, compiled in only for ARM / ARM64 builds. Each registration passes
  // benchmark::utils::CheckNEON as the isa_check argument of qs8_vaddc.

  // neon_ld64 variants.
  BENCHMARK_CAPTURE(qs8_vaddc, neon_ld64_x8,
                    xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x8,
                    xnn_init_qs8_add_minmax_neon_params, benchmark::utils::CheckNEON)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
  BENCHMARK_CAPTURE(qs8_vaddc, neon_ld64_x16,
                    xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x16,
                    xnn_init_qs8_add_minmax_neon_params, benchmark::utils::CheckNEON)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
  BENCHMARK_CAPTURE(qs8_vaddc, neon_ld64_x24,
                    xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x24,
                    xnn_init_qs8_add_minmax_neon_params, benchmark::utils::CheckNEON)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
  BENCHMARK_CAPTURE(qs8_vaddc, neon_ld64_x32,
                    xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x32,
                    xnn_init_qs8_add_minmax_neon_params, benchmark::utils::CheckNEON)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();

  // neon_ld128 variants.
  BENCHMARK_CAPTURE(qs8_vaddc, neon_ld128_x16,
                    xnn_qs8_vaddc_minmax_ukernel__neon_ld128_x16,
                    xnn_init_qs8_add_minmax_neon_params, benchmark::utils::CheckNEON)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
  BENCHMARK_CAPTURE(qs8_vaddc, neon_ld128_x32,
                    xnn_qs8_vaddc_minmax_ukernel__neon_ld128_x32,
                    xnn_init_qs8_add_minmax_neon_params, benchmark::utils::CheckNEON)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
107
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // x86 / x86-64 kernels. Unless noted otherwise, each registration passes an ISA check
  // function as the isa_check argument of qs8_vaddc.

  // avx512skx_mul32_ld128 variants (CheckAVX512SKX).
  BENCHMARK_CAPTURE(qs8_vaddc, avx512skx_mul32_ld128_x16,
                    xnn_qs8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_x16,
                    xnn_init_qs8_add_minmax_avx512_params, benchmark::utils::CheckAVX512SKX)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
  BENCHMARK_CAPTURE(qs8_vaddc, avx512skx_mul32_ld128_x32,
                    xnn_qs8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_x32,
                    xnn_init_qs8_add_minmax_avx512_params, benchmark::utils::CheckAVX512SKX)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();

  // avx2_mul32_ld64 variants (CheckAVX2).
  BENCHMARK_CAPTURE(qs8_vaddc, avx2_mul32_ld64_x8,
                    xnn_qs8_vaddc_minmax_ukernel__avx2_mul32_ld64_x8,
                    xnn_init_qs8_add_minmax_avx2_params, benchmark::utils::CheckAVX2)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
  BENCHMARK_CAPTURE(qs8_vaddc, avx2_mul32_ld64_x16,
                    xnn_qs8_vaddc_minmax_ukernel__avx2_mul32_ld64_x16,
                    xnn_init_qs8_add_minmax_avx2_params, benchmark::utils::CheckAVX2)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
  BENCHMARK_CAPTURE(qs8_vaddc, avx2_mul32_ld64_x24,
                    xnn_qs8_vaddc_minmax_ukernel__avx2_mul32_ld64_x24,
                    xnn_init_qs8_add_minmax_avx2_params, benchmark::utils::CheckAVX2)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
  BENCHMARK_CAPTURE(qs8_vaddc, avx2_mul32_ld64_x32,
                    xnn_qs8_vaddc_minmax_ukernel__avx2_mul32_ld64_x32,
                    xnn_init_qs8_add_minmax_avx2_params, benchmark::utils::CheckAVX2)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();

  // xop_mul32_ld32 variants (CheckXOP); these use the sse4_mul32 params initializer.
  BENCHMARK_CAPTURE(qs8_vaddc, xop_mul32_ld32_x8,
                    xnn_qs8_vaddc_minmax_ukernel__xop_mul32_ld32_x8,
                    xnn_init_qs8_add_minmax_sse4_mul32_params, benchmark::utils::CheckXOP)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
  BENCHMARK_CAPTURE(qs8_vaddc, xop_mul32_ld32_x16,
                    xnn_qs8_vaddc_minmax_ukernel__xop_mul32_ld32_x16,
                    xnn_init_qs8_add_minmax_sse4_mul32_params, benchmark::utils::CheckXOP)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
  BENCHMARK_CAPTURE(qs8_vaddc, xop_mul32_ld32_x24,
                    xnn_qs8_vaddc_minmax_ukernel__xop_mul32_ld32_x24,
                    xnn_init_qs8_add_minmax_sse4_mul32_params, benchmark::utils::CheckXOP)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
  BENCHMARK_CAPTURE(qs8_vaddc, xop_mul32_ld32_x32,
                    xnn_qs8_vaddc_minmax_ukernel__xop_mul32_ld32_x32,
                    xnn_init_qs8_add_minmax_sse4_mul32_params, benchmark::utils::CheckXOP)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();

  // avx_mul16_ld64 variants (CheckAVX); these use the sse4_mul16 params initializer.
  BENCHMARK_CAPTURE(qs8_vaddc, avx_mul16_ld64_x8,
                    xnn_qs8_vaddc_minmax_ukernel__avx_mul16_ld64_x8,
                    xnn_init_qs8_add_minmax_sse4_mul16_params, benchmark::utils::CheckAVX)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
  BENCHMARK_CAPTURE(qs8_vaddc, avx_mul16_ld64_x16,
                    xnn_qs8_vaddc_minmax_ukernel__avx_mul16_ld64_x16,
                    xnn_init_qs8_add_minmax_sse4_mul16_params, benchmark::utils::CheckAVX)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
  BENCHMARK_CAPTURE(qs8_vaddc, avx_mul16_ld64_x24,
                    xnn_qs8_vaddc_minmax_ukernel__avx_mul16_ld64_x24,
                    xnn_init_qs8_add_minmax_sse4_mul16_params, benchmark::utils::CheckAVX)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
  BENCHMARK_CAPTURE(qs8_vaddc, avx_mul16_ld64_x32,
                    xnn_qs8_vaddc_minmax_ukernel__avx_mul16_ld64_x32,
                    xnn_init_qs8_add_minmax_sse4_mul16_params, benchmark::utils::CheckAVX)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();

  // avx_mul32_ld32 variants (CheckAVX); these use the sse4_mul32 params initializer.
  BENCHMARK_CAPTURE(qs8_vaddc, avx_mul32_ld32_x8,
                    xnn_qs8_vaddc_minmax_ukernel__avx_mul32_ld32_x8,
                    xnn_init_qs8_add_minmax_sse4_mul32_params, benchmark::utils::CheckAVX)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
  BENCHMARK_CAPTURE(qs8_vaddc, avx_mul32_ld32_x16,
                    xnn_qs8_vaddc_minmax_ukernel__avx_mul32_ld32_x16,
                    xnn_init_qs8_add_minmax_sse4_mul32_params, benchmark::utils::CheckAVX)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
  BENCHMARK_CAPTURE(qs8_vaddc, avx_mul32_ld32_x24,
                    xnn_qs8_vaddc_minmax_ukernel__avx_mul32_ld32_x24,
                    xnn_init_qs8_add_minmax_sse4_mul32_params, benchmark::utils::CheckAVX)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
  BENCHMARK_CAPTURE(qs8_vaddc, avx_mul32_ld32_x32,
                    xnn_qs8_vaddc_minmax_ukernel__avx_mul32_ld32_x32,
                    xnn_init_qs8_add_minmax_sse4_mul32_params, benchmark::utils::CheckAVX)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();

  // sse41_mul16_ld64 variants (CheckSSE41).
  BENCHMARK_CAPTURE(qs8_vaddc, sse41_mul16_ld64_x8,
                    xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8,
                    xnn_init_qs8_add_minmax_sse4_mul16_params, benchmark::utils::CheckSSE41)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
  BENCHMARK_CAPTURE(qs8_vaddc, sse41_mul16_ld64_x16,
                    xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x16,
                    xnn_init_qs8_add_minmax_sse4_mul16_params, benchmark::utils::CheckSSE41)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
  BENCHMARK_CAPTURE(qs8_vaddc, sse41_mul16_ld64_x24,
                    xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x24,
                    xnn_init_qs8_add_minmax_sse4_mul16_params, benchmark::utils::CheckSSE41)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
  BENCHMARK_CAPTURE(qs8_vaddc, sse41_mul16_ld64_x32,
                    xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x32,
                    xnn_init_qs8_add_minmax_sse4_mul16_params, benchmark::utils::CheckSSE41)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();

  // sse41_mul32_ld32 variants (CheckSSE41).
  BENCHMARK_CAPTURE(qs8_vaddc, sse41_mul32_ld32_x8,
                    xnn_qs8_vaddc_minmax_ukernel__sse41_mul32_ld32_x8,
                    xnn_init_qs8_add_minmax_sse4_mul32_params, benchmark::utils::CheckSSE41)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
  BENCHMARK_CAPTURE(qs8_vaddc, sse41_mul32_ld32_x16,
                    xnn_qs8_vaddc_minmax_ukernel__sse41_mul32_ld32_x16,
                    xnn_init_qs8_add_minmax_sse4_mul32_params, benchmark::utils::CheckSSE41)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
  BENCHMARK_CAPTURE(qs8_vaddc, sse41_mul32_ld32_x24,
                    xnn_qs8_vaddc_minmax_ukernel__sse41_mul32_ld32_x24,
                    xnn_init_qs8_add_minmax_sse4_mul32_params, benchmark::utils::CheckSSE41)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
  BENCHMARK_CAPTURE(qs8_vaddc, sse41_mul32_ld32_x32,
                    xnn_qs8_vaddc_minmax_ukernel__sse41_mul32_ld32_x32,
                    xnn_init_qs8_add_minmax_sse4_mul32_params, benchmark::utils::CheckSSE41)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();

  // sse2_mul16_ld64 variants. No ISA check function is passed, so qs8_vaddc falls back to its
  // default isa_check of nullptr and runs unconditionally.
  BENCHMARK_CAPTURE(qs8_vaddc, sse2_mul16_ld64_x8,
                    xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8,
                    xnn_init_qs8_add_minmax_sse2_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
  BENCHMARK_CAPTURE(qs8_vaddc, sse2_mul16_ld64_x16,
                    xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x16,
                    xnn_init_qs8_add_minmax_sse2_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
  BENCHMARK_CAPTURE(qs8_vaddc, sse2_mul16_ld64_x24,
                    xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x24,
                    xnn_init_qs8_add_minmax_sse2_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
  BENCHMARK_CAPTURE(qs8_vaddc, sse2_mul16_ld64_x32,
                    xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x32,
                    xnn_init_qs8_add_minmax_sse2_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
293
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  // wasmsimd kernels, compiled in for WAsm SIMD and WAsm Relaxed SIMD builds. No ISA check
  // function is passed, so qs8_vaddc uses its default isa_check of nullptr.
  BENCHMARK_CAPTURE(qs8_vaddc, wasmsimd_x8,
                    xnn_qs8_vaddc_minmax_ukernel__wasmsimd_x8,
                    xnn_init_qs8_add_minmax_wasmsimd_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
  BENCHMARK_CAPTURE(qs8_vaddc, wasmsimd_x16,
                    xnn_qs8_vaddc_minmax_ukernel__wasmsimd_x16,
                    xnn_init_qs8_add_minmax_wasmsimd_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
  BENCHMARK_CAPTURE(qs8_vaddc, wasmsimd_x24,
                    xnn_qs8_vaddc_minmax_ukernel__wasmsimd_x24,
                    xnn_init_qs8_add_minmax_wasmsimd_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
  BENCHMARK_CAPTURE(qs8_vaddc, wasmsimd_x32,
                    xnn_qs8_vaddc_minmax_ukernel__wasmsimd_x32,
                    xnn_init_qs8_add_minmax_wasmsimd_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)->UseRealTime();
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
316
317 BENCHMARK_CAPTURE(qs8_vaddc, scalar_x1,
318 xnn_qs8_vaddc_minmax_ukernel__scalar_x1,
319 xnn_init_qs8_add_minmax_scalar_params)
320 ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
321 ->UseRealTime();
322 BENCHMARK_CAPTURE(qs8_vaddc, scalar_x2,
323 xnn_qs8_vaddc_minmax_ukernel__scalar_x2,
324 xnn_init_qs8_add_minmax_scalar_params)
325 ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
326 ->UseRealTime();
327 BENCHMARK_CAPTURE(qs8_vaddc, scalar_x4,
328 xnn_qs8_vaddc_minmax_ukernel__scalar_x4,
329 xnn_init_qs8_add_minmax_scalar_params)
330 ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
331 ->UseRealTime();
332
333 #ifndef XNNPACK_BENCHMARK_NO_MAIN
334 BENCHMARK_MAIN();
335 #endif
336