• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2021 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5 
#include <algorithm>
#include <array>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <limits>
#include <random>
#include <vector>

#include <benchmark/benchmark.h>
#include "bench/utils.h"

#include <xnnpack/AlignedAllocator.h>
#include <xnnpack/common.h>
#include <xnnpack/params.h>
#include <xnnpack/lut.h>
20 
21 
x8_lut(benchmark::State & state,xnn_x8_lut_ukernel_function lut,benchmark::utils::IsaCheckFunction isa_check=nullptr)22 static void x8_lut(
23   benchmark::State& state,
24   xnn_x8_lut_ukernel_function lut,
25   benchmark::utils::IsaCheckFunction isa_check = nullptr)
26 {
27   if (isa_check && !isa_check(state)) {
28     return;
29   }
30 
31   const size_t num_elements = state.range(0);
32   std::vector<uint8_t, AlignedAllocator<uint8_t, 64>> input(num_elements);
33   std::vector<uint8_t, AlignedAllocator<uint8_t, 64>> output(num_elements);
34   std::vector<uint8_t, AlignedAllocator<uint8_t, 64>> table(256);
35 
36   std::random_device random_device;
37   auto rng = std::mt19937(random_device());
38   auto u8rng = std::bind(
39     std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), std::ref(rng));
40   std::generate(input.begin(), input.end(), std::ref(u8rng));
41   std::generate(table.begin(), table.end(), std::ref(u8rng));
42   std::fill(output.begin(), output.end(), UINT8_C(0xAA));
43 
44   for (auto _ : state) {
45     lut(num_elements * sizeof(uint8_t), input.data(), output.data(), table.data());
46   }
47 
48   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
49   if (cpu_frequency != 0) {
50     state.counters["cpufreq"] = cpu_frequency;
51   }
52 
53   const size_t elements_per_iteration = num_elements;
54   state.counters["elements"] =
55     benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
56 
57   const size_t bytes_per_iteration = 2 * num_elements * sizeof(uint8_t);
58   state.counters["bytes"] =
59     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
60 }
61 
// Registrations compiled only when XNN_ARCH_ARM64 is set. No ISA check is
// passed, so x8_lut uses its default (nullptr) and never returns early.
#if XNN_ARCH_ARM64
BENCHMARK_CAPTURE(x8_lut, neon_tbx128x4_x16, xnn_x8_lut_ukernel__neon_tbx128x4_x16)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
    ->UseRealTime();
BENCHMARK_CAPTURE(x8_lut, neon_tbx128x4_x32, xnn_x8_lut_ukernel__neon_tbx128x4_x32)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
    ->UseRealTime();
BENCHMARK_CAPTURE(x8_lut, neon_tbx128x4_x48, xnn_x8_lut_ukernel__neon_tbx128x4_x48)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
    ->UseRealTime();
BENCHMARK_CAPTURE(x8_lut, neon_tbx128x4_x64, xnn_x8_lut_ukernel__neon_tbx128x4_x64)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
    ->UseRealTime();
#endif  // XNN_ARCH_ARM64
80 
// Registrations compiled only for XNN_ARCH_X86 / XNN_ARCH_X86_64. Every entry
// passes an ISA check as the third captured argument; x8_lut returns without
// running the kernel when that check fails.
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Kernels gated by CheckAVX512SKX.
BENCHMARK_CAPTURE(
    x8_lut, avx512skx_vpshufb_x64,
    xnn_x8_lut_ukernel__avx512skx_vpshufb_x64, benchmark::utils::CheckAVX512SKX)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
    ->UseRealTime();
BENCHMARK_CAPTURE(
    x8_lut, avx512skx_vpshufb_x128,
    xnn_x8_lut_ukernel__avx512skx_vpshufb_x128, benchmark::utils::CheckAVX512SKX)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
    ->UseRealTime();
BENCHMARK_CAPTURE(
    x8_lut, avx512skx_vpshufb_x192,
    xnn_x8_lut_ukernel__avx512skx_vpshufb_x192, benchmark::utils::CheckAVX512SKX)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
    ->UseRealTime();
BENCHMARK_CAPTURE(
    x8_lut, avx512skx_vpshufb_x256,
    xnn_x8_lut_ukernel__avx512skx_vpshufb_x256, benchmark::utils::CheckAVX512SKX)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
    ->UseRealTime();

// Kernels gated by CheckAVX2.
BENCHMARK_CAPTURE(
    x8_lut, avx2_x32,
    xnn_x8_lut_ukernel__avx2_x32, benchmark::utils::CheckAVX2)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
    ->UseRealTime();
BENCHMARK_CAPTURE(
    x8_lut, avx2_x64,
    xnn_x8_lut_ukernel__avx2_x64, benchmark::utils::CheckAVX2)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
    ->UseRealTime();
BENCHMARK_CAPTURE(
    x8_lut, avx2_x96,
    xnn_x8_lut_ukernel__avx2_x96, benchmark::utils::CheckAVX2)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
    ->UseRealTime();
BENCHMARK_CAPTURE(
    x8_lut, avx2_x128,
    xnn_x8_lut_ukernel__avx2_x128, benchmark::utils::CheckAVX2)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
    ->UseRealTime();

// Kernels gated by CheckAVX.
BENCHMARK_CAPTURE(
    x8_lut, avx_x16,
    xnn_x8_lut_ukernel__avx_x16, benchmark::utils::CheckAVX)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
    ->UseRealTime();
BENCHMARK_CAPTURE(
    x8_lut, avx_x32,
    xnn_x8_lut_ukernel__avx_x32, benchmark::utils::CheckAVX)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
    ->UseRealTime();
BENCHMARK_CAPTURE(
    x8_lut, avx_x48,
    xnn_x8_lut_ukernel__avx_x48, benchmark::utils::CheckAVX)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
    ->UseRealTime();
BENCHMARK_CAPTURE(
    x8_lut, avx_x64,
    xnn_x8_lut_ukernel__avx_x64, benchmark::utils::CheckAVX)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
    ->UseRealTime();

// Kernels gated by CheckSSSE3.
BENCHMARK_CAPTURE(
    x8_lut, ssse3_x16,
    xnn_x8_lut_ukernel__ssse3_x16, benchmark::utils::CheckSSSE3)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
    ->UseRealTime();
BENCHMARK_CAPTURE(
    x8_lut, ssse3_x32,
    xnn_x8_lut_ukernel__ssse3_x32, benchmark::utils::CheckSSSE3)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
    ->UseRealTime();
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
156 
// Registrations compiled when either XNN_ARCH_WASMSIMD or
// XNN_ARCH_WASMRELAXEDSIMD is set. No ISA check is passed, so x8_lut uses its
// default (nullptr) and never returns early.
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
BENCHMARK_CAPTURE(x8_lut, wasmsimd_x16, xnn_x8_lut_ukernel__wasmsimd_x16)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
    ->UseRealTime();
BENCHMARK_CAPTURE(x8_lut, wasmsimd_x32, xnn_x8_lut_ukernel__wasmsimd_x32)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
    ->UseRealTime();
BENCHMARK_CAPTURE(x8_lut, wasmsimd_x48, xnn_x8_lut_ukernel__wasmsimd_x48)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
    ->UseRealTime();
BENCHMARK_CAPTURE(x8_lut, wasmsimd_x64, xnn_x8_lut_ukernel__wasmsimd_x64)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
    ->UseRealTime();
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
175 
176 BENCHMARK_CAPTURE(x8_lut, scalar_x1,
177                   xnn_x8_lut_ukernel__scalar_x1)
178   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
179   ->UseRealTime();
180 BENCHMARK_CAPTURE(x8_lut, scalar_x2,
181                   xnn_x8_lut_ukernel__scalar_x2)
182   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
183   ->UseRealTime();
184 BENCHMARK_CAPTURE(x8_lut, scalar_x4,
185                   xnn_x8_lut_ukernel__scalar_x4)
186   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
187   ->UseRealTime();
188 BENCHMARK_CAPTURE(x8_lut, scalar_x8,
189                   xnn_x8_lut_ukernel__scalar_x8)
190   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
191   ->UseRealTime();
192 BENCHMARK_CAPTURE(x8_lut, scalar_x16,
193                   xnn_x8_lut_ukernel__scalar_x16)
194   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
195   ->UseRealTime();
196 
197 #ifndef XNNPACK_BENCHMARK_NO_MAIN
198 BENCHMARK_MAIN();
199 #endif
200