1 // Copyright (c) Facebook, Inc. and its affiliates.
2 // All rights reserved.
3 //
4 // Copyright 2020 Google LLC
5 //
6 // This source code is licensed under the BSD-style license found in the
7 // LICENSE file in the root directory of this source tree.
8
9 #include <algorithm>
10 #include <cfloat>
11 #include <cmath>
12 #include <functional>
13 #include <random>
14 #include <vector>
15
16 #include <cpuinfo.h>
17
18 #include <benchmark/benchmark.h>
19 #include "bench/utils.h"
20 #include <xnnpack/AlignedAllocator.h>
21 #include <xnnpack/common.h>
22 #include <xnnpack/requantization-stubs.h>
23
24
25 class Requantization : public benchmark::Fixture {
26 public:
Requantization()27 inline Requantization()
28 {
29 cpuinfo_initialize();
30 const size_t l1d_size = cpuinfo_get_l1d_cache(0)->size;
31 const size_t l1d_reserve = 1024;
32 n_ = (l1d_size - l1d_reserve) / (sizeof(int32_t) + sizeof(int8_t));
33 n_ = n_ / 16 * 16;
34 }
35
SetUp(benchmark::State & state)36 virtual void SetUp(benchmark::State& state) override
37 {
38 std::random_device random_device;
39 auto rng = std::mt19937(random_device());
40 auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(), std::ref(rng));
41
42 input_.resize(n());
43 std::generate(input_.begin(), input_.end(), std::ref(i32rng));
44 output_.resize(n());
45 std::fill(output_.begin(), output_.end(), 0xA5);
46
47 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
48 if (cpu_frequency != 0) {
49 state.counters["cpufreq"] = cpu_frequency;
50 }
51 }
TearDown(benchmark::State & state)52 virtual void TearDown(benchmark::State& state) override
53 {
54 state.SetItemsProcessed(uint64_t(state.iterations()) * n());
55 state.SetBytesProcessed(uint64_t(state.iterations()) * n() * (sizeof(int32_t) + sizeof(int8_t)));
56 input_.clear();
57 output_.clear();
58 }
59
input() const60 inline const int32_t* input() const
61 {
62 return input_.data();
63 }
64
output()65 inline int8_t* output()
66 {
67 return output_.data();
68 }
69
n() const70 inline size_t n() const
71 {
72 return n_;
73 }
74
75 protected:
76 std::vector<int32_t, AlignedAllocator<int32_t, 64>> input_;
77 std::vector<int8_t> output_;
78 size_t n_;
79 };
80
81
82 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Times xnn_qs8_requantize_fp32__neon over the fixture's n() input elements,
// writing the results to the fixture's output buffer.
BENCHMARK_F(Requantization, fp32__neon)(benchmark::State& state) {
  constexpr float kScale = 0x1.0p-12f;
  constexpr int8_t kZeroPoint = -1;
  constexpr int8_t kQMin = -127;
  constexpr int8_t kQMax = 126;

  for (auto _ : state) {
    xnn_qs8_requantize_fp32__neon(
      n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
89
// Times xnn_qs8_requantize_gemmlowp__neon over the fixture's n() input
// elements, writing the results to the fixture's output buffer.
BENCHMARK_F(Requantization, gemmlowp__neon)(benchmark::State& state) {
  constexpr float kScale = 0x1.0p-12f;
  constexpr int8_t kZeroPoint = -1;
  constexpr int8_t kQMin = -127;
  constexpr int8_t kQMax = 126;

  for (auto _ : state) {
    xnn_qs8_requantize_gemmlowp__neon(
      n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
96
// Times xnn_qs8_requantize_rndna__neon over the fixture's n() input elements,
// writing the results to the fixture's output buffer.
BENCHMARK_F(Requantization, rndna__neon)(benchmark::State& state) {
  constexpr float kScale = 0x1.0p-12f;
  constexpr int8_t kZeroPoint = -1;
  constexpr int8_t kQMin = -127;
  constexpr int8_t kQMax = 126;

  for (auto _ : state) {
    xnn_qs8_requantize_rndna__neon(
      n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
103
// Times xnn_qs8_requantize_rndnu__neon_mull over the fixture's n() input
// elements, writing the results to the fixture's output buffer.
BENCHMARK_F(Requantization, rndnu__neon_mull)(benchmark::State& state) {
  constexpr float kScale = 0x1.0p-12f;
  constexpr int8_t kZeroPoint = -1;
  constexpr int8_t kQMin = -127;
  constexpr int8_t kQMax = 126;

  for (auto _ : state) {
    xnn_qs8_requantize_rndnu__neon_mull(
      n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
110
// Times xnn_qs8_requantize_rndnu__neon_qdmulh over the fixture's n() input
// elements, writing the results to the fixture's output buffer.
BENCHMARK_F(Requantization, rndnu__neon_qdmulh)(benchmark::State& state) {
  constexpr float kScale = 0x1.0p-12f;
  constexpr int8_t kZeroPoint = -1;
  constexpr int8_t kQMin = -127;
  constexpr int8_t kQMax = 126;

  for (auto _ : state) {
    xnn_qs8_requantize_rndnu__neon_qdmulh(
      n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
117 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
118
119
120 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Times xnn_qs8_requantize_fp32__sse2 over the fixture's n() input elements,
// writing the results to the fixture's output buffer.
BENCHMARK_F(Requantization, fp32__sse2)(benchmark::State& state) {
  constexpr float kScale = 0x1.0p-12f;
  constexpr int8_t kZeroPoint = -1;
  constexpr int8_t kQMin = -127;
  constexpr int8_t kQMax = 126;

  for (auto _ : state) {
    xnn_qs8_requantize_fp32__sse2(
      n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
127
// Times xnn_qs8_requantize_fp32__sse4 over the fixture's n() input elements,
// writing the results to the fixture's output buffer.
BENCHMARK_F(Requantization, fp32__sse4)(benchmark::State& state) {
  constexpr float kScale = 0x1.0p-12f;
  constexpr int8_t kZeroPoint = -1;
  constexpr int8_t kQMin = -127;
  constexpr int8_t kQMax = 126;

  for (auto _ : state) {
    xnn_qs8_requantize_fp32__sse4(
      n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
134
// Times xnn_qs8_requantize_gemmlowp__sse2 over the fixture's n() input
// elements, writing the results to the fixture's output buffer.
BENCHMARK_F(Requantization, gemmlowp__sse2)(benchmark::State& state) {
  constexpr float kScale = 0x1.0p-12f;
  constexpr int8_t kZeroPoint = -1;
  constexpr int8_t kQMin = -127;
  constexpr int8_t kQMax = 126;

  for (auto _ : state) {
    xnn_qs8_requantize_gemmlowp__sse2(
      n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
141
// Times xnn_qs8_requantize_gemmlowp__ssse3 over the fixture's n() input
// elements, writing the results to the fixture's output buffer.
BENCHMARK_F(Requantization, gemmlowp__ssse3)(benchmark::State& state) {
  constexpr float kScale = 0x1.0p-12f;
  constexpr int8_t kZeroPoint = -1;
  constexpr int8_t kQMin = -127;
  constexpr int8_t kQMax = 126;

  for (auto _ : state) {
    xnn_qs8_requantize_gemmlowp__ssse3(
      n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
148
// Times xnn_qs8_requantize_gemmlowp__sse4 over the fixture's n() input
// elements, writing the results to the fixture's output buffer.
BENCHMARK_F(Requantization, gemmlowp__sse4)(benchmark::State& state) {
  constexpr float kScale = 0x1.0p-12f;
  constexpr int8_t kZeroPoint = -1;
  constexpr int8_t kQMin = -127;
  constexpr int8_t kQMax = 126;

  for (auto _ : state) {
    xnn_qs8_requantize_gemmlowp__sse4(
      n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
155
// Times xnn_qs8_requantize_rndna__sse2 over the fixture's n() input elements,
// writing the results to the fixture's output buffer.
BENCHMARK_F(Requantization, rndna__sse2)(benchmark::State& state) {
  constexpr float kScale = 0x1.0p-12f;
  constexpr int8_t kZeroPoint = -1;
  constexpr int8_t kQMin = -127;
  constexpr int8_t kQMax = 126;

  for (auto _ : state) {
    xnn_qs8_requantize_rndna__sse2(
      n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
162
// Times xnn_qs8_requantize_rndna__ssse3 over the fixture's n() input elements,
// writing the results to the fixture's output buffer.
BENCHMARK_F(Requantization, rndna__ssse3)(benchmark::State& state) {
  constexpr float kScale = 0x1.0p-12f;
  constexpr int8_t kZeroPoint = -1;
  constexpr int8_t kQMin = -127;
  constexpr int8_t kQMax = 126;

  for (auto _ : state) {
    xnn_qs8_requantize_rndna__ssse3(
      n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
169
// Times xnn_qs8_requantize_rndna__sse4 over the fixture's n() input elements,
// writing the results to the fixture's output buffer.
BENCHMARK_F(Requantization, rndna__sse4)(benchmark::State& state) {
  constexpr float kScale = 0x1.0p-12f;
  constexpr int8_t kZeroPoint = -1;
  constexpr int8_t kQMin = -127;
  constexpr int8_t kQMax = 126;

  for (auto _ : state) {
    xnn_qs8_requantize_rndna__sse4(
      n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
176
// Times xnn_qs8_requantize_rndnu__sse4_sra over the fixture's n() input
// elements, writing the results to the fixture's output buffer.
BENCHMARK_F(Requantization, rndnu__sse4_sra)(benchmark::State& state) {
  constexpr float kScale = 0x1.0p-12f;
  constexpr int8_t kZeroPoint = -1;
  constexpr int8_t kQMin = -127;
  constexpr int8_t kQMax = 126;

  for (auto _ : state) {
    xnn_qs8_requantize_rndnu__sse4_sra(
      n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
183
// Times xnn_qs8_requantize_rndnu__sse4_srl over the fixture's n() input
// elements, writing the results to the fixture's output buffer.
BENCHMARK_F(Requantization, rndnu__sse4_srl)(benchmark::State& state) {
  constexpr float kScale = 0x1.0p-12f;
  constexpr int8_t kZeroPoint = -1;
  constexpr int8_t kQMin = -127;
  constexpr int8_t kQMax = 126;

  for (auto _ : state) {
    xnn_qs8_requantize_rndnu__sse4_srl(
      n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
190 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
191
192
193 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Times xnn_qs8_requantize_fp32__wasmsimd over the fixture's n() input
// elements, writing the results to the fixture's output buffer.
BENCHMARK_F(Requantization, fp32__wasmsimd)(benchmark::State& state) {
  constexpr float kScale = 0x1.0p-12f;
  constexpr int8_t kZeroPoint = -1;
  constexpr int8_t kQMin = -127;
  constexpr int8_t kQMax = 126;

  for (auto _ : state) {
    xnn_qs8_requantize_fp32__wasmsimd(
      n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
200
// Times xnn_qs8_requantize_gemmlowp__wasmsimd over the fixture's n() input
// elements, writing the results to the fixture's output buffer.
BENCHMARK_F(Requantization, gemmlowp__wasmsimd)(benchmark::State& state) {
  constexpr float kScale = 0x1.0p-12f;
  constexpr int8_t kZeroPoint = -1;
  constexpr int8_t kQMin = -127;
  constexpr int8_t kQMax = 126;

  for (auto _ : state) {
    xnn_qs8_requantize_gemmlowp__wasmsimd(
      n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
207 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
208
209
// Times xnn_qs8_requantize_fp32__scalar_lrintf over the fixture's n() input
// elements, writing the results to the fixture's output buffer.
BENCHMARK_F(Requantization, fp32__scalar_lrintf)(benchmark::State& state) {
  constexpr float kScale = 0x1.0p-12f;
  constexpr int8_t kZeroPoint = -1;
  constexpr int8_t kQMin = -127;
  constexpr int8_t kQMax = 126;

  for (auto _ : state) {
    xnn_qs8_requantize_fp32__scalar_lrintf(
      n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
216
// Times xnn_qs8_requantize_fp32__scalar_fmagic over the fixture's n() input
// elements, writing the results to the fixture's output buffer.
BENCHMARK_F(Requantization, fp32__scalar_fmagic)(benchmark::State& state) {
  constexpr float kScale = 0x1.0p-12f;
  constexpr int8_t kZeroPoint = -1;
  constexpr int8_t kQMin = -127;
  constexpr int8_t kQMax = 126;

  for (auto _ : state) {
    xnn_qs8_requantize_fp32__scalar_fmagic(
      n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
223
// Times xnn_qs8_requantize_gemmlowp__scalar over the fixture's n() input
// elements, writing the results to the fixture's output buffer.
BENCHMARK_F(Requantization, gemmlowp__scalar)(benchmark::State& state) {
  constexpr float kScale = 0x1.0p-12f;
  constexpr int8_t kZeroPoint = -1;
  constexpr int8_t kQMin = -127;
  constexpr int8_t kQMax = 126;

  for (auto _ : state) {
    xnn_qs8_requantize_gemmlowp__scalar(
      n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
230
// Times xnn_qs8_requantize_rndna__scalar_signed64 over the fixture's n() input
// elements, writing the results to the fixture's output buffer.
BENCHMARK_F(Requantization, rndna__scalar_signed64)(benchmark::State& state) {
  constexpr float kScale = 0x1.0p-12f;
  constexpr int8_t kZeroPoint = -1;
  constexpr int8_t kQMin = -127;
  constexpr int8_t kQMax = 126;

  for (auto _ : state) {
    xnn_qs8_requantize_rndna__scalar_signed64(
      n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
237
// Times xnn_qs8_requantize_rndna__scalar_unsigned32 over the fixture's n()
// input elements, writing the results to the fixture's output buffer.
BENCHMARK_F(Requantization, rndna__scalar_unsigned32)(benchmark::State& state) {
  constexpr float kScale = 0x1.0p-12f;
  constexpr int8_t kZeroPoint = -1;
  constexpr int8_t kQMin = -127;
  constexpr int8_t kQMax = 126;

  for (auto _ : state) {
    xnn_qs8_requantize_rndna__scalar_unsigned32(
      n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
244
// Times xnn_qs8_requantize_rndna__scalar_unsigned64 over the fixture's n()
// input elements, writing the results to the fixture's output buffer.
BENCHMARK_F(Requantization, rndna__scalar_unsigned64)(benchmark::State& state) {
  constexpr float kScale = 0x1.0p-12f;
  constexpr int8_t kZeroPoint = -1;
  constexpr int8_t kQMin = -127;
  constexpr int8_t kQMax = 126;

  for (auto _ : state) {
    xnn_qs8_requantize_rndna__scalar_unsigned64(
      n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
251
// Times xnn_qs8_requantize_rndnu__scalar over the fixture's n() input
// elements, writing the results to the fixture's output buffer.
BENCHMARK_F(Requantization, rndnu__scalar)(benchmark::State& state) {
  constexpr float kScale = 0x1.0p-12f;
  constexpr int8_t kZeroPoint = -1;
  constexpr int8_t kQMin = -127;
  constexpr int8_t kQMax = 126;

  for (auto _ : state) {
    xnn_qs8_requantize_rndnu__scalar(
      n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
258
259
260 #ifndef XNNPACK_BENCHMARK_NO_MAIN
261 BENCHMARK_MAIN();
262 #endif
263