• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) Facebook, Inc. and its affiliates.
2 // All rights reserved.
3 //
4 // Copyright 2020 Google LLC
5 //
6 // This source code is licensed under the BSD-style license found in the
7 // LICENSE file in the root directory of this source tree.
8 
9 #include <algorithm>
10 #include <cfloat>
11 #include <cmath>
12 #include <functional>
13 #include <random>
14 #include <vector>
15 
16 #include <cpuinfo.h>
17 
18 #include <benchmark/benchmark.h>
19 #include "bench/utils.h"
20 #include <xnnpack/AlignedAllocator.h>
21 #include <xnnpack/common.h>
22 #include <xnnpack/requantization-stubs.h>
23 
24 
25 class Requantization : public benchmark::Fixture {
26  public:
Requantization()27   inline Requantization()
28   {
29     cpuinfo_initialize();
30     const size_t l1d_size = cpuinfo_get_l1d_cache(0)->size;
31     const size_t l1d_reserve = 1024;
32     n_ = (l1d_size - l1d_reserve) / (sizeof(int32_t) + sizeof(int8_t));
33     n_ = n_ / 16 * 16;
34   }
35 
SetUp(benchmark::State & state)36   virtual void SetUp(benchmark::State& state) override
37   {
38     std::random_device random_device;
39     auto rng = std::mt19937(random_device());
40     auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(), std::ref(rng));
41 
42     input_.resize(n());
43     std::generate(input_.begin(), input_.end(), std::ref(i32rng));
44     output_.resize(n());
45     std::fill(output_.begin(), output_.end(), 0xA5);
46 
47     const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
48     if (cpu_frequency != 0) {
49       state.counters["cpufreq"] = cpu_frequency;
50     }
51   }
TearDown(benchmark::State & state)52   virtual void TearDown(benchmark::State& state) override
53   {
54     state.SetItemsProcessed(uint64_t(state.iterations()) * n());
55     state.SetBytesProcessed(uint64_t(state.iterations()) * n() * (sizeof(int32_t) + sizeof(int8_t)));
56     input_.clear();
57     output_.clear();
58   }
59 
input() const60   inline const int32_t* input() const
61   {
62     return input_.data();
63   }
64 
output()65   inline int8_t* output()
66   {
67     return output_.data();
68   }
69 
n() const70   inline size_t n() const
71   {
72     return n_;
73   }
74 
75  protected:
76   std::vector<int32_t, AlignedAllocator<int32_t, 64>> input_;
77   std::vector<int8_t> output_;
78   size_t n_;
79 };
80 
81 
82 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Benchmarks xnn_qs8_requantize_fp32__neon on the fixture's n() elements.
// The kernel name indicates a NEON implementation, and this section is also
// compiled for 32-bit ARM, so the run is skipped with an error when cpuinfo
// reports no NEON support instead of executing the kernel.
BENCHMARK_F(Requantization, fp32__neon)(benchmark::State& state) {
  if (!cpuinfo_has_arm_neon()) {
    state.SkipWithError("no NEON extension");
    return;
  }
  for (auto _ : state) {
    xnn_qs8_requantize_fp32__neon(
        n(), input(), 0x1.0p-12f /* scale */, -1 /* zero point */, -127 /* qmin */, 126 /* qmax */, output());
  }
}
89 
// Benchmarks xnn_qs8_requantize_gemmlowp__neon on the fixture's n() elements.
// The kernel name indicates a NEON implementation, and this section is also
// compiled for 32-bit ARM, so the run is skipped with an error when cpuinfo
// reports no NEON support instead of executing the kernel.
BENCHMARK_F(Requantization, gemmlowp__neon)(benchmark::State& state) {
  if (!cpuinfo_has_arm_neon()) {
    state.SkipWithError("no NEON extension");
    return;
  }
  for (auto _ : state) {
    xnn_qs8_requantize_gemmlowp__neon(
        n(), input(), 0x1.0p-12f /* scale */, -1 /* zero point */, -127 /* qmin */, 126 /* qmax */, output());
  }
}
96 
// Benchmarks xnn_qs8_requantize_rndna__neon on the fixture's n() elements.
// The kernel name indicates a NEON implementation, and this section is also
// compiled for 32-bit ARM, so the run is skipped with an error when cpuinfo
// reports no NEON support instead of executing the kernel.
BENCHMARK_F(Requantization, rndna__neon)(benchmark::State& state) {
  if (!cpuinfo_has_arm_neon()) {
    state.SkipWithError("no NEON extension");
    return;
  }
  for (auto _ : state) {
    xnn_qs8_requantize_rndna__neon(
        n(), input(), 0x1.0p-12f /* scale */, -1 /* zero point */, -127 /* qmin */, 126 /* qmax */, output());
  }
}
103 
// Benchmarks xnn_qs8_requantize_rndnu__neon_mull on the fixture's n() elements.
// The kernel name indicates a NEON implementation, and this section is also
// compiled for 32-bit ARM, so the run is skipped with an error when cpuinfo
// reports no NEON support instead of executing the kernel.
BENCHMARK_F(Requantization, rndnu__neon_mull)(benchmark::State& state) {
  if (!cpuinfo_has_arm_neon()) {
    state.SkipWithError("no NEON extension");
    return;
  }
  for (auto _ : state) {
    xnn_qs8_requantize_rndnu__neon_mull(
        n(), input(), 0x1.0p-12f /* scale */, -1 /* zero point */, -127 /* qmin */, 126 /* qmax */, output());
  }
}
110 
// Benchmarks xnn_qs8_requantize_rndnu__neon_qdmulh on the fixture's n() elements.
// The kernel name indicates a NEON implementation, and this section is also
// compiled for 32-bit ARM, so the run is skipped with an error when cpuinfo
// reports no NEON support instead of executing the kernel.
BENCHMARK_F(Requantization, rndnu__neon_qdmulh)(benchmark::State& state) {
  if (!cpuinfo_has_arm_neon()) {
    state.SkipWithError("no NEON extension");
    return;
  }
  for (auto _ : state) {
    xnn_qs8_requantize_rndnu__neon_qdmulh(
        n(), input(), 0x1.0p-12f /* scale */, -1 /* zero point */, -127 /* qmin */, 126 /* qmax */, output());
  }
}
117 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
118 
119 
120 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Times xnn_qs8_requantize_fp32__sse2 over the fixture's n() elements.
BENCHMARK_F(Requantization, fp32__sse2)(benchmark::State& state) {
  // Requantization parameters; the same values are passed on every iteration.
  constexpr float kScale = 0x1.0p-12f;
  constexpr int kZeroPoint = -1;
  constexpr int kQMin = -127;
  constexpr int kQMax = 126;

  for (auto _ : state) {
    xnn_qs8_requantize_fp32__sse2(n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
127 
// Benchmarks xnn_qs8_requantize_fp32__sse4 on the fixture's n() elements.
// The kernel name indicates an SSE4.1 implementation, so the run is skipped
// with an error when cpuinfo reports no SSE4.1 support instead of executing
// the kernel.
BENCHMARK_F(Requantization, fp32__sse4)(benchmark::State& state) {
  if (!cpuinfo_has_x86_sse4_1()) {
    state.SkipWithError("no SSE4.1 extension");
    return;
  }
  for (auto _ : state) {
    xnn_qs8_requantize_fp32__sse4(
        n(), input(), 0x1.0p-12f /* scale */, -1 /* zero point */, -127 /* qmin */, 126 /* qmax */, output());
  }
}
134 
// Times xnn_qs8_requantize_gemmlowp__sse2 over the fixture's n() elements.
BENCHMARK_F(Requantization, gemmlowp__sse2)(benchmark::State& state) {
  // Requantization parameters; the same values are passed on every iteration.
  constexpr float kScale = 0x1.0p-12f;
  constexpr int kZeroPoint = -1;
  constexpr int kQMin = -127;
  constexpr int kQMax = 126;

  for (auto _ : state) {
    xnn_qs8_requantize_gemmlowp__sse2(n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
141 
// Benchmarks xnn_qs8_requantize_gemmlowp__ssse3 on the fixture's n() elements.
// The kernel name indicates an SSSE3 implementation, so the run is skipped
// with an error when cpuinfo reports no SSSE3 support instead of executing
// the kernel.
BENCHMARK_F(Requantization, gemmlowp__ssse3)(benchmark::State& state) {
  if (!cpuinfo_has_x86_ssse3()) {
    state.SkipWithError("no SSSE3 extension");
    return;
  }
  for (auto _ : state) {
    xnn_qs8_requantize_gemmlowp__ssse3(
        n(), input(), 0x1.0p-12f /* scale */, -1 /* zero point */, -127 /* qmin */, 126 /* qmax */, output());
  }
}
148 
// Benchmarks xnn_qs8_requantize_gemmlowp__sse4 on the fixture's n() elements.
// The kernel name indicates an SSE4.1 implementation, so the run is skipped
// with an error when cpuinfo reports no SSE4.1 support instead of executing
// the kernel.
BENCHMARK_F(Requantization, gemmlowp__sse4)(benchmark::State& state) {
  if (!cpuinfo_has_x86_sse4_1()) {
    state.SkipWithError("no SSE4.1 extension");
    return;
  }
  for (auto _ : state) {
    xnn_qs8_requantize_gemmlowp__sse4(
        n(), input(), 0x1.0p-12f /* scale */, -1 /* zero point */, -127 /* qmin */, 126 /* qmax */, output());
  }
}
155 
// Times xnn_qs8_requantize_rndna__sse2 over the fixture's n() elements.
BENCHMARK_F(Requantization, rndna__sse2)(benchmark::State& state) {
  // Requantization parameters; the same values are passed on every iteration.
  constexpr float kScale = 0x1.0p-12f;
  constexpr int kZeroPoint = -1;
  constexpr int kQMin = -127;
  constexpr int kQMax = 126;

  for (auto _ : state) {
    xnn_qs8_requantize_rndna__sse2(n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
162 
// Benchmarks xnn_qs8_requantize_rndna__ssse3 on the fixture's n() elements.
// The kernel name indicates an SSSE3 implementation, so the run is skipped
// with an error when cpuinfo reports no SSSE3 support instead of executing
// the kernel.
BENCHMARK_F(Requantization, rndna__ssse3)(benchmark::State& state) {
  if (!cpuinfo_has_x86_ssse3()) {
    state.SkipWithError("no SSSE3 extension");
    return;
  }
  for (auto _ : state) {
    xnn_qs8_requantize_rndna__ssse3(
        n(), input(), 0x1.0p-12f /* scale */, -1 /* zero point */, -127 /* qmin */, 126 /* qmax */, output());
  }
}
169 
// Benchmarks xnn_qs8_requantize_rndna__sse4 on the fixture's n() elements.
// The kernel name indicates an SSE4.1 implementation, so the run is skipped
// with an error when cpuinfo reports no SSE4.1 support instead of executing
// the kernel.
BENCHMARK_F(Requantization, rndna__sse4)(benchmark::State& state) {
  if (!cpuinfo_has_x86_sse4_1()) {
    state.SkipWithError("no SSE4.1 extension");
    return;
  }
  for (auto _ : state) {
    xnn_qs8_requantize_rndna__sse4(
        n(), input(), 0x1.0p-12f /* scale */, -1 /* zero point */, -127 /* qmin */, 126 /* qmax */, output());
  }
}
176 
// Benchmarks xnn_qs8_requantize_rndnu__sse4_sra on the fixture's n() elements.
// The kernel name indicates an SSE4.1 implementation, so the run is skipped
// with an error when cpuinfo reports no SSE4.1 support instead of executing
// the kernel.
BENCHMARK_F(Requantization, rndnu__sse4_sra)(benchmark::State& state) {
  if (!cpuinfo_has_x86_sse4_1()) {
    state.SkipWithError("no SSE4.1 extension");
    return;
  }
  for (auto _ : state) {
    xnn_qs8_requantize_rndnu__sse4_sra(
        n(), input(), 0x1.0p-12f /* scale */, -1 /* zero point */, -127 /* qmin */, 126 /* qmax */, output());
  }
}
183 
// Benchmarks xnn_qs8_requantize_rndnu__sse4_srl on the fixture's n() elements.
// The kernel name indicates an SSE4.1 implementation, so the run is skipped
// with an error when cpuinfo reports no SSE4.1 support instead of executing
// the kernel.
BENCHMARK_F(Requantization, rndnu__sse4_srl)(benchmark::State& state) {
  if (!cpuinfo_has_x86_sse4_1()) {
    state.SkipWithError("no SSE4.1 extension");
    return;
  }
  for (auto _ : state) {
    xnn_qs8_requantize_rndnu__sse4_srl(
        n(), input(), 0x1.0p-12f /* scale */, -1 /* zero point */, -127 /* qmin */, 126 /* qmax */, output());
  }
}
190 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
191 
192 
193 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Times xnn_qs8_requantize_fp32__wasmsimd over the fixture's n() elements.
BENCHMARK_F(Requantization, fp32__wasmsimd)(benchmark::State& state) {
  // Requantization parameters; the same values are passed on every iteration.
  constexpr float kScale = 0x1.0p-12f;
  constexpr int kZeroPoint = -1;
  constexpr int kQMin = -127;
  constexpr int kQMax = 126;

  for (auto _ : state) {
    xnn_qs8_requantize_fp32__wasmsimd(n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
200 
// Times xnn_qs8_requantize_gemmlowp__wasmsimd over the fixture's n() elements.
BENCHMARK_F(Requantization, gemmlowp__wasmsimd)(benchmark::State& state) {
  // Requantization parameters; the same values are passed on every iteration.
  constexpr float kScale = 0x1.0p-12f;
  constexpr int kZeroPoint = -1;
  constexpr int kQMin = -127;
  constexpr int kQMax = 126;

  for (auto _ : state) {
    xnn_qs8_requantize_gemmlowp__wasmsimd(n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
207 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
208 
209 
// Times xnn_qs8_requantize_fp32__scalar_lrintf over the fixture's n() elements.
BENCHMARK_F(Requantization, fp32__scalar_lrintf)(benchmark::State& state) {
  // Requantization parameters; the same values are passed on every iteration.
  constexpr float kScale = 0x1.0p-12f;
  constexpr int kZeroPoint = -1;
  constexpr int kQMin = -127;
  constexpr int kQMax = 126;

  for (auto _ : state) {
    xnn_qs8_requantize_fp32__scalar_lrintf(n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
216 
// Times xnn_qs8_requantize_fp32__scalar_fmagic over the fixture's n() elements.
BENCHMARK_F(Requantization, fp32__scalar_fmagic)(benchmark::State& state) {
  // Requantization parameters; the same values are passed on every iteration.
  constexpr float kScale = 0x1.0p-12f;
  constexpr int kZeroPoint = -1;
  constexpr int kQMin = -127;
  constexpr int kQMax = 126;

  for (auto _ : state) {
    xnn_qs8_requantize_fp32__scalar_fmagic(n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
223 
// Times xnn_qs8_requantize_gemmlowp__scalar over the fixture's n() elements.
BENCHMARK_F(Requantization, gemmlowp__scalar)(benchmark::State& state) {
  // Requantization parameters; the same values are passed on every iteration.
  constexpr float kScale = 0x1.0p-12f;
  constexpr int kZeroPoint = -1;
  constexpr int kQMin = -127;
  constexpr int kQMax = 126;

  for (auto _ : state) {
    xnn_qs8_requantize_gemmlowp__scalar(n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
230 
// Times xnn_qs8_requantize_rndna__scalar_signed64 over the fixture's n() elements.
BENCHMARK_F(Requantization, rndna__scalar_signed64)(benchmark::State& state) {
  // Requantization parameters; the same values are passed on every iteration.
  constexpr float kScale = 0x1.0p-12f;
  constexpr int kZeroPoint = -1;
  constexpr int kQMin = -127;
  constexpr int kQMax = 126;

  for (auto _ : state) {
    xnn_qs8_requantize_rndna__scalar_signed64(n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
237 
// Times xnn_qs8_requantize_rndna__scalar_unsigned32 over the fixture's n() elements.
BENCHMARK_F(Requantization, rndna__scalar_unsigned32)(benchmark::State& state) {
  // Requantization parameters; the same values are passed on every iteration.
  constexpr float kScale = 0x1.0p-12f;
  constexpr int kZeroPoint = -1;
  constexpr int kQMin = -127;
  constexpr int kQMax = 126;

  for (auto _ : state) {
    xnn_qs8_requantize_rndna__scalar_unsigned32(n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
244 
// Times xnn_qs8_requantize_rndna__scalar_unsigned64 over the fixture's n() elements.
BENCHMARK_F(Requantization, rndna__scalar_unsigned64)(benchmark::State& state) {
  // Requantization parameters; the same values are passed on every iteration.
  constexpr float kScale = 0x1.0p-12f;
  constexpr int kZeroPoint = -1;
  constexpr int kQMin = -127;
  constexpr int kQMax = 126;

  for (auto _ : state) {
    xnn_qs8_requantize_rndna__scalar_unsigned64(n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
251 
// Times xnn_qs8_requantize_rndnu__scalar over the fixture's n() elements.
BENCHMARK_F(Requantization, rndnu__scalar)(benchmark::State& state) {
  // Requantization parameters; the same values are passed on every iteration.
  constexpr float kScale = 0x1.0p-12f;
  constexpr int kZeroPoint = -1;
  constexpr int kQMin = -127;
  constexpr int kQMax = 126;

  for (auto _ : state) {
    xnn_qs8_requantize_rndnu__scalar(n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
258 
259 
260 #ifndef XNNPACK_BENCHMARK_NO_MAIN
261 BENCHMARK_MAIN();
262 #endif
263