// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <algorithm>
#include <cfloat>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <random>
#include <vector>

#include <cpuinfo.h>

#include <benchmark/benchmark.h>
#include "bench/utils.h"
#include <xnnpack/AlignedAllocator.h>
#include <xnnpack/common.h>
#include <xnnpack/requantization-stubs.h>

// Divides x by q and rounds the result up to the next whole number.
// q must be non-zero: the division and modulo below are undefined otherwise.
inline uint32_t divideRoundUp(uint32_t x, uint32_t q)
{
  const uint32_t quotient = x / q;
  const bool has_remainder = (x % q) != 0;
  // A non-zero remainder means the exact quotient lies above the truncated one.
  return has_remainder ? quotient + 1 : quotient;
}

roundUp(uint32_t x,uint32_t q)29 inline uint32_t roundUp(uint32_t x, uint32_t q)
30 {
31   return q * divideRoundUp(x, q);
32 }
33 
// Returns the smaller of the two unsigned values; returns b when they are equal.
inline uint32_t min(uint32_t a, uint32_t b)
{
  if (a < b) {
    return a;
  }
  return b;
}

39 class Requantization : public benchmark::Fixture {
40  public:
Requantization()41   inline Requantization()
42   {
43     cpuinfo_initialize();
44     const size_t l1d_size = cpuinfo_get_l1d_cache(0)->size;
45     const size_t l1d_reserve = 1024;
46     n_ = (l1d_size - l1d_reserve) / (sizeof(int32_t) + sizeof(uint8_t));
47     n_ = n_ / 16 * 16;
48   }
49 
SetUp(const benchmark::State &)50   virtual void SetUp(const benchmark::State&) override
51   {
52     std::random_device random_device;
53     auto rng = std::mt19937(random_device());
54     auto s32rng = std::bind(std::uniform_int_distribution<int32_t>(), rng);
55 
56     input_.resize(n());
57     std::generate(input_.begin(), input_.end(), std::ref(s32rng));
58     output_.resize(n());
59     std::fill(output_.begin(), output_.end(), 0xA5);
60   }
61 
TearDown(benchmark::State & state)62   virtual void TearDown(benchmark::State& state) override
63   {
64     state.SetItemsProcessed(uint64_t(state.iterations()) * n());
65     state.SetBytesProcessed(uint64_t(state.iterations()) * n() * (sizeof(int32_t) + sizeof(uint8_t)));
66     input_.clear();
67     output_.clear();
68   }
69 
input() const70   inline const int32_t* input() const
71   {
72     return input_.data();
73   }
74 
output()75   inline uint8_t* output()
76   {
77     return output_.data();
78   }
79 
n() const80   inline size_t n() const
81   {
82     return n_;
83   }
84 
85  protected:
86   std::vector<int32_t, AlignedAllocator<int32_t, 32>> input_;
87   std::vector<uint8_t> output_;
88   size_t n_;
89 };
90 
// Times repeated calls to xnn_requantize_precise__scalar_unsigned32 over the
// fixture's n() input elements, writing into the fixture's output buffer.
BENCHMARK_F(Requantization, precise__scalar_unsigned32)(benchmark::State& state)
{
  const float scale = 0x1.0p-12f;
  const uint8_t zero_point = 128;
  const uint8_t qmin = 1;
  const uint8_t qmax = 254;

  for (auto _ : state) {
    xnn_requantize_precise__scalar_unsigned32(
        n(), input(), scale, zero_point, qmin, qmax, output());
  }
}

// Times repeated calls to xnn_requantize_precise__scalar_unsigned64 over the
// fixture's n() input elements, writing into the fixture's output buffer.
BENCHMARK_F(Requantization, precise__scalar_unsigned64)(benchmark::State& state)
{
  const float scale = 0x1.0p-12f;
  const uint8_t zero_point = 128;
  const uint8_t qmin = 1;
  const uint8_t qmax = 254;

  for (auto _ : state) {
    xnn_requantize_precise__scalar_unsigned64(
        n(), input(), scale, zero_point, qmin, qmax, output());
  }
}

// Times repeated calls to xnn_requantize_precise__scalar_signed64 over the
// fixture's n() input elements, writing into the fixture's output buffer.
BENCHMARK_F(Requantization, precise__scalar_signed64)(benchmark::State& state)
{
  const float scale = 0x1.0p-12f;
  const uint8_t zero_point = 128;
  const uint8_t qmin = 1;
  const uint8_t qmax = 254;

  for (auto _ : state) {
    xnn_requantize_precise__scalar_signed64(
        n(), input(), scale, zero_point, qmin, qmax, output());
  }
}

// Times repeated calls to xnn_requantize_fp32__scalar_lrintf over the
// fixture's n() input elements, writing into the fixture's output buffer.
BENCHMARK_F(Requantization, fp32__scalar_lrintf)(benchmark::State& state)
{
  const float scale = 0x1.0p-12f;
  const uint8_t zero_point = 128;
  const uint8_t qmin = 1;
  const uint8_t qmax = 254;

  for (auto _ : state) {
    xnn_requantize_fp32__scalar_lrintf(
        n(), input(), scale, zero_point, qmin, qmax, output());
  }
}

// Times repeated calls to xnn_requantize_fp32__scalar_magic over the
// fixture's n() input elements, writing into the fixture's output buffer.
BENCHMARK_F(Requantization, fp32__scalar_magic)(benchmark::State& state)
{
  const float scale = 0x1.0p-12f;
  const uint8_t zero_point = 128;
  const uint8_t qmin = 1;
  const uint8_t qmax = 254;

  for (auto _ : state) {
    xnn_requantize_fp32__scalar_magic(
        n(), input(), scale, zero_point, qmin, qmax, output());
  }
}

// Times repeated calls to xnn_requantize_gemmlowp__scalar over the fixture's
// n() input elements, writing into the fixture's output buffer.
BENCHMARK_F(Requantization, gemmlowp__scalar)(benchmark::State& state)
{
  const float scale = 0x1.0p-12f;
  const uint8_t zero_point = 128;
  const uint8_t qmin = 1;
  const uint8_t qmax = 254;

  for (auto _ : state) {
    xnn_requantize_gemmlowp__scalar(
        n(), input(), scale, zero_point, qmin, qmax, output());
  }
}

// Times repeated calls to xnn_requantize_precise__psimd over the fixture's
// n() input elements, writing into the fixture's output buffer.
BENCHMARK_F(Requantization, precise__psimd)(benchmark::State& state)
{
  const float scale = 0x1.0p-12f;
  const uint8_t zero_point = 128;
  const uint8_t qmin = 1;
  const uint8_t qmax = 254;

  for (auto _ : state) {
    xnn_requantize_precise__psimd(
        n(), input(), scale, zero_point, qmin, qmax, output());
  }
}

// Times repeated calls to xnn_requantize_fp32__psimd over the fixture's n()
// input elements, writing into the fixture's output buffer.
BENCHMARK_F(Requantization, fp32__psimd)(benchmark::State& state)
{
  const float scale = 0x1.0p-12f;
  const uint8_t zero_point = 128;
  const uint8_t qmin = 1;
  const uint8_t qmax = 254;

  for (auto _ : state) {
    xnn_requantize_fp32__psimd(
        n(), input(), scale, zero_point, qmin, qmax, output());
  }
}

#if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Returns true when cpuinfo initializes and reports NEON support. Otherwise
// marks the benchmark as skipped with an error and returns false, so the
// caller can return before invoking a NEON kernel on a CPU that lacks it.
static bool RequantizationRequireNEON(benchmark::State& state)
{
  if (cpuinfo_initialize() && cpuinfo_has_arm_neon()) {
    return true;
  }
  state.SkipWithError("no NEON extension");
  return false;
}

// Times xnn_requantize_precise__neon over the fixture's n() elements;
// skipped with an error when NEON is unavailable.
BENCHMARK_F(Requantization, precise__neon)(benchmark::State& state)
{
  if (!RequantizationRequireNEON(state)) {
    return;
  }
  for (auto _ : state) {
    xnn_requantize_precise__neon(
        n(), input(), 0x1.0p-12f /* scale */, 128 /* zero point */, 1 /* qmin */, 254 /* qmax */, output());
  }
}

// Times xnn_requantize_fp32__neon over the fixture's n() elements;
// skipped with an error when NEON is unavailable.
BENCHMARK_F(Requantization, fp32__neon)(benchmark::State& state)
{
  if (!RequantizationRequireNEON(state)) {
    return;
  }
  for (auto _ : state) {
    xnn_requantize_fp32__neon(
        n(), input(), 0x1.0p-12f /* scale */, 128 /* zero point */, 1 /* qmin */, 254 /* qmax */, output());
  }
}

// Times xnn_requantize_q31__neon over the fixture's n() elements;
// skipped with an error when NEON is unavailable.
BENCHMARK_F(Requantization, q31__neon)(benchmark::State& state)
{
  if (!RequantizationRequireNEON(state)) {
    return;
  }
  for (auto _ : state) {
    xnn_requantize_q31__neon(
        n(), input(), 0x1.0p-12f /* scale */, 128 /* zero point */, 1 /* qmin */, 254 /* qmax */, output());
  }
}

// Times xnn_requantize_gemmlowp__neon over the fixture's n() elements;
// skipped with an error when NEON is unavailable.
BENCHMARK_F(Requantization, gemmlowp__neon)(benchmark::State& state)
{
  if (!RequantizationRequireNEON(state)) {
    return;
  }
  for (auto _ : state) {
    xnn_requantize_gemmlowp__neon(
        n(), input(), 0x1.0p-12f /* scale */, 128 /* zero point */, 1 /* qmin */, 254 /* qmax */, output());
  }
}
#endif

#if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Returns true when cpuinfo initializes and reports SSSE3 support. Otherwise
// marks the benchmark as skipped with an error and returns false, so the
// caller can return before invoking an SSSE3 kernel on a CPU that lacks it.
static bool RequantizationRequireSSSE3(benchmark::State& state)
{
  if (cpuinfo_initialize() && cpuinfo_has_x86_ssse3()) {
    return true;
  }
  state.SkipWithError("no SSSE3 extension");
  return false;
}

// Same contract as RequantizationRequireSSSE3, but for SSE4.1.
static bool RequantizationRequireSSE41(benchmark::State& state)
{
  if (cpuinfo_initialize() && cpuinfo_has_x86_sse4_1()) {
    return true;
  }
  state.SkipWithError("no SSE4.1 extension");
  return false;
}

// NOTE(review): the SSE2 benchmarks below run without a runtime check, as in
// the original; this assumes SSE2 is available on every x86 target the
// benchmark is built for -- confirm for 32-bit builds.

// Times xnn_requantize_precise__sse2 over the fixture's n() elements.
BENCHMARK_F(Requantization, precise__sse2)(benchmark::State& state)
{
  for (auto _ : state) {
    xnn_requantize_precise__sse2(
        n(), input(), 0x1.0p-12f /* scale */, 128 /* zero point */, 1 /* qmin */, 254 /* qmax */, output());
  }
}

// Times xnn_requantize_precise__ssse3 over the fixture's n() elements;
// skipped with an error when SSSE3 is unavailable.
BENCHMARK_F(Requantization, precise__ssse3)(benchmark::State& state)
{
  if (!RequantizationRequireSSSE3(state)) {
    return;
  }
  for (auto _ : state) {
    xnn_requantize_precise__ssse3(
        n(), input(), 0x1.0p-12f /* scale */, 128 /* zero point */, 1 /* qmin */, 254 /* qmax */, output());
  }
}

// Times xnn_requantize_precise__sse4 over the fixture's n() elements;
// skipped with an error when SSE4.1 is unavailable.
BENCHMARK_F(Requantization, precise__sse4)(benchmark::State& state)
{
  if (!RequantizationRequireSSE41(state)) {
    return;
  }
  for (auto _ : state) {
    xnn_requantize_precise__sse4(
        n(), input(), 0x1.0p-12f /* scale */, 128 /* zero point */, 1 /* qmin */, 254 /* qmax */, output());
  }
}

// Times xnn_requantize_fp32__sse2 over the fixture's n() elements.
BENCHMARK_F(Requantization, fp32__sse2)(benchmark::State& state)
{
  for (auto _ : state) {
    xnn_requantize_fp32__sse2(
        n(), input(), 0x1.0p-12f /* scale */, 128 /* zero point */, 1 /* qmin */, 254 /* qmax */, output());
  }
}

// Times xnn_requantize_q31__sse2 over the fixture's n() elements.
BENCHMARK_F(Requantization, q31__sse2)(benchmark::State& state)
{
  for (auto _ : state) {
    xnn_requantize_q31__sse2(
        n(), input(), 0x1.0p-12f /* scale */, 128 /* zero point */, 1 /* qmin */, 254 /* qmax */, output());
  }
}

// Times xnn_requantize_q31__ssse3 over the fixture's n() elements;
// skipped with an error when SSSE3 is unavailable.
BENCHMARK_F(Requantization, q31__ssse3)(benchmark::State& state)
{
  if (!RequantizationRequireSSSE3(state)) {
    return;
  }
  for (auto _ : state) {
    xnn_requantize_q31__ssse3(
        n(), input(), 0x1.0p-12f /* scale */, 128 /* zero point */, 1 /* qmin */, 254 /* qmax */, output());
  }
}

// Times xnn_requantize_q31__sse4 over the fixture's n() elements;
// skipped with an error when SSE4.1 is unavailable.
BENCHMARK_F(Requantization, q31__sse4)(benchmark::State& state)
{
  if (!RequantizationRequireSSE41(state)) {
    return;
  }
  for (auto _ : state) {
    xnn_requantize_q31__sse4(
        n(), input(), 0x1.0p-12f /* scale */, 128 /* zero point */, 1 /* qmin */, 254 /* qmax */, output());
  }
}

// Times xnn_requantize_gemmlowp__sse2 over the fixture's n() elements.
BENCHMARK_F(Requantization, gemmlowp__sse2)(benchmark::State& state)
{
  for (auto _ : state) {
    xnn_requantize_gemmlowp__sse2(
        n(), input(), 0x1.0p-12f /* scale */, 128 /* zero point */, 1 /* qmin */, 254 /* qmax */, output());
  }
}

// Times xnn_requantize_gemmlowp__ssse3 over the fixture's n() elements;
// skipped with an error when SSSE3 is unavailable.
BENCHMARK_F(Requantization, gemmlowp__ssse3)(benchmark::State& state)
{
  if (!RequantizationRequireSSSE3(state)) {
    return;
  }
  for (auto _ : state) {
    xnn_requantize_gemmlowp__ssse3(
        n(), input(), 0x1.0p-12f /* scale */, 128 /* zero point */, 1 /* qmin */, 254 /* qmax */, output());
  }
}

// Times xnn_requantize_gemmlowp__sse4 over the fixture's n() elements;
// skipped with an error when SSE4.1 is unavailable.
BENCHMARK_F(Requantization, gemmlowp__sse4)(benchmark::State& state)
{
  if (!RequantizationRequireSSE41(state)) {
    return;
  }
  for (auto _ : state) {
    xnn_requantize_gemmlowp__sse4(
        n(), input(), 0x1.0p-12f /* scale */, 128 /* zero point */, 1 /* qmin */, 254 /* qmax */, output());
  }
}
#endif

// Expand BENCHMARK_MAIN() unless the build defines XNNPACK_BENCHMARK_NO_MAIN
// (presumably because the entry point is supplied elsewhere -- confirm
// against the build files).
#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif