1 // Copyright 2019, VIXL authors
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are met:
6 //
7 // * Redistributions of source code must retain the above copyright notice,
8 // this list of conditions and the following disclaimer.
9 // * Redistributions in binary form must reproduce the above copyright notice,
10 // this list of conditions and the following disclaimer in the documentation
11 // and/or other materials provided with the distribution.
12 // * Neither the name of ARM Limited nor the names of its contributors may be
13 // used to endorse or promote products derived from this software without
14 // specific prior written permission.
15 //
16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS CONTRIBUTORS "AS IS" AND
17 // ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 // WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
20 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
27 #include <vector>
28
29 #include "globals-vixl.h"
30 #include "aarch64/macro-assembler-aarch64.h"
31
32 #include "bench-utils.h"
33
34 using namespace vixl;
35 using namespace vixl::aarch64;
36
37 #define __ masm_->
38
39 const Register BenchCodeGenerator::scratch = x28;
40
PickR(unsigned size_in_bits)41 Register BenchCodeGenerator::PickR(unsigned size_in_bits) {
42 // Only select caller-saved registers [x0, x15].
43 return Register(static_cast<unsigned>(GetRandomBits(4)), size_in_bits);
44 }
45
PickV(unsigned size_in_bits)46 VRegister BenchCodeGenerator::PickV(unsigned size_in_bits) {
47 // Only select caller-saved registers [v0, v7] or [v16, v31].
48 // The resulting distribution is not uniform.
49 unsigned code = static_cast<unsigned>(GetRandomBits(5));
50 if (code < 16) code &= 0x7; // [v8, v15] -> [v0, v7]
51 return VRegister(code, size_in_bits);
52 }
53
GetRandomBits(int bits)54 uint64_t BenchCodeGenerator::GetRandomBits(int bits) {
55 VIXL_ASSERT((bits >= 0) && (bits <= 64));
56 uint64_t result = 0;
57
58 while (bits >= 32) {
59 // For big chunks, call jrand48 directly.
60 result = (result << 32) | jrand48(rand_state_); // [-2^31, 2^31]
61 bits -= 32;
62 }
63 if (bits == 0) return result;
64
65 // We often only want a few bits at a time, so use stored entropy to avoid
66 // frequent calls to jrand48.
67
68 if (bits > rnd_bits_) {
69 // We want more bits than we have.
70 result = (result << rnd_bits_) | rnd_;
71 bits -= rnd_bits_;
72
73 rnd_ = static_cast<uint32_t>(jrand48(rand_state_)); // [-2^31, 2^31]
74 rnd_bits_ = 32;
75 }
76
77 VIXL_ASSERT(bits <= rnd_bits_);
78 result = (result << bits) | (rnd_ % (UINT32_C(1) << bits));
79 rnd_ >>= bits;
80 rnd_bits_ -= bits;
81 return result;
82 }
83
PickRSize()84 unsigned BenchCodeGenerator::PickRSize() {
85 return PickBool() ? kWRegSize : kXRegSize;
86 }
87
PickFPSize()88 unsigned BenchCodeGenerator::PickFPSize() {
89 uint64_t entropy = GetRandomBits(4);
90 // Doubles and floats are common in most languages, so use half-precision
91 // types only rarely.
92 if (entropy == 0) return kHRegSize;
93 return ((entropy & 1) == 0) ? kSRegSize : kDRegSize;
94 }
95
Generate(size_t min_size_in_bytes)96 void BenchCodeGenerator::Generate(size_t min_size_in_bytes) {
97 Label start;
98 __ Bind(&start);
99
100 call_depth_++;
101 GeneratePrologue();
102
103 while (masm_->GetSizeOfCodeGeneratedSince(&start) < min_size_in_bytes) {
104 GenerateArbitrarySequence();
105 }
106
107 GenerateEpilogue();
108 call_depth_--;
109
110 // Make sure that any labels (created by GenerateBranchSequence) are bound
111 // before we exit.
112 if (call_depth_ == 0) BindAllPendingLabels();
113 }
114
GeneratePrologue()115 void BenchCodeGenerator::GeneratePrologue() {
116 // Construct a normal frame.
117 VIXL_ASSERT(masm_->StackPointer().Is(sp));
118 __ Push(lr, x29); // x29 is the frame pointer (fp).
119 __ Mov(x29, sp);
120 VIXL_ASSERT(call_depth_ > 0);
121 if (call_depth_ == 1) {
122 __ Push(scratch, xzr);
123 // Claim space to use for load and stores.
124 // - We need at least 4 * kQRegSize bytes for Ld4/St4.
125 // - The architecture requires that we allocate a multiple of 16 bytes.
126 // - There is no hard upper limit, but the Simulator has a limited stack
127 // space.
128 __ Claim((4 * kQRegSize) + (16 * GetRandomBits(3)));
129 __ Mov(scratch, sp);
130 }
131 }
132
GenerateEpilogue()133 void BenchCodeGenerator::GenerateEpilogue() {
134 VIXL_ASSERT(call_depth_ > 0);
135 if (call_depth_ == 1) {
136 __ Sub(sp, x29, 2 * kXRegSizeInBytes); // Drop the scratch space.
137 __ Pop(xzr, scratch);
138 }
139 __ Pop(x29, lr);
140 __ Ret();
141 }
142
GenerateArbitrarySequence()143 void BenchCodeGenerator::GenerateArbitrarySequence() {
144 // Bind pending labels, and remove them from the list.
145 // Recently-linked labels are much more likely to be bound than old ones. This
146 // should produce a mix of long- (veneered) and short-range branches.
147 uint32_t bind_mask = static_cast<uint32_t>(
148 GetRandomBits(8) | (GetRandomBits(7) << 1) | (GetRandomBits(6) << 2));
149 BindPendingLabels(bind_mask);
150
151 // If we are at the top call level (call_depth_ == 1), generate nested calls
152 // 1/4 of the time, and halve the chance for each call level below that.
153 VIXL_ASSERT(call_depth_ > 0);
154 if (GetRandomBits(call_depth_ + 1) == 0) {
155 GenerateCallReturnSequence();
156 return;
157 }
158
159 // These weightings should be roughly representative of real functions.
160 switch (GetRandomBits(4)) {
161 case 0x0:
162 case 0x1:
163 GenerateTrivialSequence();
164 return;
165 case 0x2:
166 case 0x3:
167 case 0x4:
168 case 0x5:
169 GenerateOperandSequence();
170 return;
171 case 0x6:
172 case 0x7:
173 case 0x8:
174 GenerateMemOperandSequence();
175 return;
176 case 0xb:
177 case 0x9:
178 case 0xa:
179 GenerateImmediateSequence();
180 return;
181 case 0xc:
182 case 0xd:
183 GenerateBranchSequence();
184 return;
185 case 0xe:
186 GenerateFPSequence();
187 return;
188 case 0xf:
189 GenerateNEONSequence();
190 return;
191 }
192 }
193
GenerateTrivialSequence()194 void BenchCodeGenerator::GenerateTrivialSequence() {
195 unsigned size = PickRSize();
196 __ Asr(PickR(size), PickR(size), 4);
197 __ Bfi(PickR(size), PickR(size), 5, 14);
198 __ Bfc(PickR(size), 5, 14);
199 __ Cinc(PickR(size), PickR(size), ge);
200 __ Cinv(PickR(size), PickR(size), ne);
201 __ Cls(PickR(size), PickR(size));
202 __ Cneg(PickR(size), PickR(size), lt);
203 __ Mrs(PickX(), NZCV);
204 __ Nop();
205 __ Mul(PickR(size), PickR(size), PickR(size));
206 __ Rbit(PickR(size), PickR(size));
207 __ Rev(PickR(size), PickR(size));
208 __ Sdiv(PickR(size), PickR(size), PickR(size));
209 if (!labels_.empty()) {
210 __ Adr(PickX(), labels_.begin()->target);
211 }
212 }
213
GenerateOperandSequence()214 void BenchCodeGenerator::GenerateOperandSequence() {
215 unsigned size = PickRSize();
216 // The cast to Operand is normally implicit for simple registers, but we
217 // explicitly specify it in every case here to ensure that the benchmark does
218 // what we expect.
219 __ And(PickR(size), PickR(size), Operand(PickR(size)));
220 __ Bics(PickR(size), PickR(size), Operand(PickR(size)));
221 __ Orr(PickR(size), PickR(size), Operand(PickR(size)));
222 __ Eor(PickR(size), PickR(size), Operand(PickR(size)));
223 __ Tst(PickR(size), Operand(PickR(size)));
224 __ Eon(PickR(size), PickR(size), Operand(PickR(size)));
225 __ Cmp(PickR(size), Operand(PickR(size)));
226 __ Negs(PickR(size), Operand(PickR(size)));
227 __ Mvn(PickR(size), Operand(PickR(size)));
228 __ Ccmp(PickR(size), Operand(PickR(size)), NoFlag, eq);
229 __ Ccmn(PickR(size), Operand(PickR(size)), NoFlag, eq);
230 __ Csel(PickR(size), Operand(PickR(size)), Operand(PickR(size)), lt);
231 {
232 // Ensure that `claim` doesn't alias any PickR().
233 UseScratchRegisterScope temps(masm_);
234 Register claim = temps.AcquireX();
235 // We should only claim a 16-byte-aligned amount, since we're using the
236 // system stack pointer.
237 __ Mov(claim, GetRandomBits(4) * 16);
238 __ Claim(Operand(claim));
239 // Also claim a bit more, so we can store at sp+claim.
240 __ Claim(Operand(32));
241 __ Poke(PickR(size), Operand(claim));
242 __ Peek(PickR(size), Operand(8));
243 __ Poke(PickR(size), Operand(16));
244 __ Peek(PickR(size), Operand(claim.W(), UXTW));
245 __ Drop(Operand(32));
246 __ Drop(Operand(claim));
247 }
248 }
249
GenerateMemOperandSequence()250 void BenchCodeGenerator::GenerateMemOperandSequence() {
251 unsigned size = PickRSize();
252 RegList store_list = GetRandomBits(16); // Restrict to [x0, x15].
253 __ StoreCPURegList(CPURegList(CPURegister::kRegister, size, store_list),
254 MemOperand(scratch));
255 RegList load_list = GetRandomBits(16); // Restrict to [x0, x15].
256 __ LoadCPURegList(CPURegList(CPURegister::kRegister, size, load_list),
257 MemOperand(scratch));
258 __ Str(PickX(), MemOperand(scratch));
259 __ Strb(PickW(), MemOperand(scratch, 42));
260 __ Strh(PickW(), MemOperand(scratch, 42, PostIndex));
261 __ Ldrsw(PickX(), MemOperand(scratch, -42, PreIndex));
262 __ Ldr(PickR(size), MemOperand(scratch, 19)); // Translated to ldur.
263 __ Push(PickX(), PickX());
264 // Ensure unique registers (in [x0, x15]) for Pop.
265 __ Pop(Register(static_cast<int>(GetRandomBits(2)) + 0, kWRegSize),
266 Register(static_cast<int>(GetRandomBits(2)) + 4, kWRegSize),
267 Register(static_cast<int>(GetRandomBits(2)) + 8, kWRegSize),
268 Register(static_cast<int>(GetRandomBits(2)) + 12, kWRegSize));
269 }
270
GenerateImmediateSequence()271 void BenchCodeGenerator::GenerateImmediateSequence() {
272 unsigned size = PickRSize();
273 __ And(PickR(size), PickR(size), GetRandomBits(size));
274 __ Sub(PickR(size), PickR(size), GetRandomBits(size));
275 __ Mov(PickR(size), GetRandomBits(size));
276 __ Movk(PickX(), GetRandomBits(16), static_cast<int>(GetRandomBits(2)) * 16);
277 }
278
BindPendingLabels(uint64_t bind_mask)279 void BenchCodeGenerator::BindPendingLabels(uint64_t bind_mask) {
280 if (bind_mask == 0) return;
281 // The labels we bind here jump back to just after each branch that refers
282 // to them. This allows a simple, linear execution path, whilst still
283 // benchmarking long-range labels.
284 //
285 // Ensure that code falling through into this sequence does not jump
286 // back to an earlier point in the execution path.
287 Label done;
288 __ B(&done);
289
290 std::list<LabelPair>::iterator it = labels_.begin();
291 while ((it != labels_.end()) && (bind_mask != 0)) {
292 if ((bind_mask & 1) != 0) {
293 // Bind the label and jump back to its source.
294 __ Bind(it->target);
295 __ B(it->cont);
296 delete it->target;
297 delete it->cont;
298 it = labels_.erase(it);
299 } else {
300 ++it; // Don't bind this one.
301 }
302 bind_mask >>= 1;
303 }
304 __ Bind(&done);
305 }
306
BindAllPendingLabels()307 void BenchCodeGenerator::BindAllPendingLabels() {
308 while (!labels_.empty()) {
309 // BindPendingLables generates a branch over each block of bound labels.
310 // This will be repeated for each call here, but the effect is minimal and
311 // (empirically) we rarely accumulate more than 64 pending labels anyway.
312 BindPendingLabels(UINT64_MAX);
313 }
314 }
315
GenerateBranchSequence()316 void BenchCodeGenerator::GenerateBranchSequence() {
317 {
318 LabelPair pair = {new Label(), new Label()};
319 __ B(lt, pair.target);
320 __ Bind(pair.cont);
321 labels_.push_front(pair);
322 }
323
324 {
325 LabelPair pair = {new Label(), new Label()};
326 __ Tbz(PickX(),
327 static_cast<int>(GetRandomBits(kXRegSizeLog2)),
328 pair.target);
329 __ Bind(pair.cont);
330 labels_.push_front(pair);
331 }
332
333 {
334 LabelPair pair = {new Label(), new Label()};
335 __ Cbz(PickX(), pair.target);
336 __ Bind(pair.cont);
337 labels_.push_front(pair);
338 }
339 }
340
GenerateCallReturnSequence()341 void BenchCodeGenerator::GenerateCallReturnSequence() {
342 Label fn, done;
343
344 if (PickBool()) {
345 __ Bl(&fn);
346 } else {
347 Register reg = PickX();
348 __ Adr(reg, &fn);
349 __ Blr(reg);
350 }
351 __ B(&done);
352
353 __ Bind(&fn);
354 // Recurse with a randomised (but fairly small) minimum size.
355 Generate(GetRandomBits(8));
356
357 __ Bind(&done);
358 }
359
GenerateFPSequence()360 void BenchCodeGenerator::GenerateFPSequence() {
361 unsigned size = PickFPSize();
362 unsigned other_size = PickBool() ? size * 2 : size / 2;
363 if (other_size < kHRegSize) other_size = kDRegSize;
364 if (other_size > kDRegSize) other_size = kHRegSize;
365
366 __ Fadd(PickV(size), PickV(size), PickV(size));
367 __ Fmul(PickV(size), PickV(size), PickV(size));
368 __ Fcvt(PickV(other_size), PickV(size));
369 __ Fjcvtzs(PickW(), PickD());
370 __ Fccmp(PickV(size), PickV(size), NCVFlag, pl);
371 __ Fdiv(PickV(size), PickV(size), PickV(size));
372 __ Fmov(PickV(size), 1.25 * GetRandomBits(2));
373 __ Fmsub(PickV(size), PickV(size), PickV(size), PickV(size));
374 __ Frintn(PickV(size), PickV(size));
375 }
376
GenerateNEONSequence()377 void BenchCodeGenerator::GenerateNEONSequence() {
378 __ And(PickV().V16B(), PickV().V16B(), PickV().V16B());
379 __ Sqrshl(PickV().V8H(), PickV().V8H(), PickV().V8H());
380 __ Umull(PickV().V2D(), PickV().V2S(), PickV().V2S());
381 __ Sqdmlal2(PickV().V4S(), PickV().V8H(), PickV().V8H());
382
383 // For structured loads and stores, we have to specify sequential (wrapped)
384 // registers, so start with [v16, v31] and allow them to wrap in to the
385 // [v0, v7] range.
386 VRegister vt(16 + static_cast<unsigned>(GetRandomBits(4)), kQRegSize);
387 VRegister vt2((vt.GetCode() + 1) % kNumberOfVRegisters, kQRegSize);
388 VRegister vt3((vt.GetCode() + 2) % kNumberOfVRegisters, kQRegSize);
389 VRegister vt4((vt.GetCode() + 3) % kNumberOfVRegisters, kQRegSize);
390 VIXL_ASSERT(!kCalleeSavedV.IncludesAliasOf(vt));
391 VIXL_ASSERT(!kCalleeSavedV.IncludesAliasOf(vt2));
392 VIXL_ASSERT(!kCalleeSavedV.IncludesAliasOf(vt3));
393 VIXL_ASSERT(!kCalleeSavedV.IncludesAliasOf(vt4));
394 __ Ld3(vt.V4S(), vt2.V4S(), vt3.V4S(), MemOperand(scratch));
395 __ St4(vt.V16B(), vt2.V16B(), vt3.V16B(), vt4.V16B(), MemOperand(scratch));
396
397 __ Fmaxv(PickV().H(), PickV().V8H());
398 __ Fminp(PickV().V4S(), PickV().V4S(), PickV().V4S());
399 }
400