• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2019, VIXL authors
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are met:
6 //
7 //   * Redistributions of source code must retain the above copyright notice,
8 //     this list of conditions and the following disclaimer.
9 //   * Redistributions in binary form must reproduce the above copyright notice,
10 //     this list of conditions and the following disclaimer in the documentation
11 //     and/or other materials provided with the distribution.
12 //   * Neither the name of ARM Limited nor the names of its contributors may be
13 //     used to endorse or promote products derived from this software without
14 //     specific prior written permission.
15 //
16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS CONTRIBUTORS "AS IS" AND
17 // ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 // WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
20 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 
27 #include "bench-utils.h"
28 
29 #include <vector>
30 
31 #include "globals-vixl.h"
32 
33 #include "aarch64/macro-assembler-aarch64.h"
34 
35 using namespace vixl;
36 using namespace vixl::aarch64;
37 
38 #define __ masm_->
39 
40 const Register BenchCodeGenerator::scratch = x28;
41 
PickR(unsigned size_in_bits)42 Register BenchCodeGenerator::PickR(unsigned size_in_bits) {
43   // Only select caller-saved registers [x0, x15].
44   return Register(static_cast<unsigned>(GetRandomBits(4)), size_in_bits);
45 }
46 
PickV(unsigned size_in_bits)47 VRegister BenchCodeGenerator::PickV(unsigned size_in_bits) {
48   // Only select caller-saved registers [v0, v7] or [v16, v31].
49   // The resulting distribution is not uniform.
50   unsigned code = static_cast<unsigned>(GetRandomBits(5));
51   if (code < 16) code &= 0x7;  // [v8, v15] -> [v0, v7]
52   return VRegister(code, size_in_bits);
53 }
54 
GetRandomBits(int bits)55 uint64_t BenchCodeGenerator::GetRandomBits(int bits) {
56   VIXL_ASSERT((bits >= 0) && (bits <= 64));
57   uint64_t result = 0;
58 
59   while (bits >= 32) {
60     // For big chunks, call jrand48 directly.
61     result = (result << 32) | jrand48(rand_state_);  // [-2^31, 2^31]
62     bits -= 32;
63   }
64   if (bits == 0) return result;
65 
66   // We often only want a few bits at a time, so use stored entropy to avoid
67   // frequent calls to jrand48.
68 
69   if (bits > rnd_bits_) {
70     // We want more bits than we have.
71     result = (result << rnd_bits_) | rnd_;
72     bits -= rnd_bits_;
73 
74     rnd_ = static_cast<uint32_t>(jrand48(rand_state_));  // [-2^31, 2^31]
75     rnd_bits_ = 32;
76   }
77 
78   VIXL_ASSERT(bits <= rnd_bits_);
79   result = (result << bits) | (rnd_ % (UINT32_C(1) << bits));
80   rnd_ >>= bits;
81   rnd_bits_ -= bits;
82   return result;
83 }
84 
PickRSize()85 unsigned BenchCodeGenerator::PickRSize() {
86   return PickBool() ? kWRegSize : kXRegSize;
87 }
88 
PickFPSize()89 unsigned BenchCodeGenerator::PickFPSize() {
90   uint64_t entropy = GetRandomBits(4);
91   // Doubles and floats are common in most languages, so use half-precision
92   // types only rarely.
93   if (entropy == 0) return kHRegSize;
94   return ((entropy & 1) == 0) ? kSRegSize : kDRegSize;
95 }
96 
Generate(size_t min_size_in_bytes)97 void BenchCodeGenerator::Generate(size_t min_size_in_bytes) {
98   Label start;
99   __ Bind(&start);
100 
101   call_depth_++;
102   GeneratePrologue();
103 
104   while (masm_->GetSizeOfCodeGeneratedSince(&start) < min_size_in_bytes) {
105     GenerateArbitrarySequence();
106   }
107 
108   GenerateEpilogue();
109   call_depth_--;
110 
111   // Make sure that any labels (created by GenerateBranchSequence) are bound
112   // before we exit.
113   if (call_depth_ == 0) BindAllPendingLabels();
114 }
115 
GeneratePrologue()116 void BenchCodeGenerator::GeneratePrologue() {
117   // Construct a normal frame.
118   VIXL_ASSERT(masm_->StackPointer().Is(sp));
119   __ Push(lr, x29);  // x29 is the frame pointer (fp).
120   __ Mov(x29, sp);
121   VIXL_ASSERT(call_depth_ > 0);
122   if (call_depth_ == 1) {
123     __ Push(scratch, xzr);
124     // Claim space to use for load and stores.
125     // - We need at least 4 * kQRegSize bytes for Ld4/St4.
126     // - The architecture requires that we allocate a multiple of 16 bytes.
127     // - There is no hard upper limit, but the Simulator has a limited stack
128     //   space.
129     __ Claim((4 * kQRegSize) + (16 * GetRandomBits(3)));
130     __ Mov(scratch, sp);
131   }
132 }
133 
GenerateEpilogue()134 void BenchCodeGenerator::GenerateEpilogue() {
135   VIXL_ASSERT(call_depth_ > 0);
136   if (call_depth_ == 1) {
137     __ Sub(sp, x29, 2 * kXRegSizeInBytes);  // Drop the scratch space.
138     __ Pop(xzr, scratch);
139   }
140   __ Pop(x29, lr);
141   __ Ret();
142 }
143 
GenerateArbitrarySequence()144 void BenchCodeGenerator::GenerateArbitrarySequence() {
145   // Bind pending labels, and remove them from the list.
146   // Recently-linked labels are much more likely to be bound than old ones. This
147   // should produce a mix of long- (veneered) and short-range branches.
148   uint32_t bind_mask = static_cast<uint32_t>(
149       GetRandomBits(8) | (GetRandomBits(7) << 1) | (GetRandomBits(6) << 2));
150   BindPendingLabels(bind_mask);
151 
152   // If we are at the top call level (call_depth_ == 1), generate nested calls
153   // 1/4 of the time, and halve the chance for each call level below that.
154   VIXL_ASSERT(call_depth_ > 0);
155   if (GetRandomBits(call_depth_ + 1) == 0) {
156     GenerateCallReturnSequence();
157     return;
158   }
159 
160   // These weightings should be roughly representative of real functions.
161   switch (GetRandomBits(4)) {
162     case 0x0:
163     case 0x1:
164       GenerateTrivialSequence();
165       return;
166     case 0x2:
167     case 0x3:
168     case 0x4:
169     case 0x5:
170       GenerateOperandSequence();
171       return;
172     case 0x6:
173     case 0x7:
174     case 0x8:
175       GenerateMemOperandSequence();
176       return;
177     case 0xb:
178     case 0x9:
179     case 0xa:
180       GenerateImmediateSequence();
181       return;
182     case 0xc:
183     case 0xd:
184       GenerateBranchSequence();
185       return;
186     case 0xe:
187       GenerateFPSequence();
188       return;
189     case 0xf:
190       GenerateNEONSequence();
191       return;
192   }
193 }
194 
GenerateTrivialSequence()195 void BenchCodeGenerator::GenerateTrivialSequence() {
196   unsigned size = PickRSize();
197   __ Asr(PickR(size), PickR(size), 4);
198   __ Bfi(PickR(size), PickR(size), 5, 14);
199   __ Bfc(PickR(size), 5, 14);
200   __ Cinc(PickR(size), PickR(size), ge);
201   __ Cinv(PickR(size), PickR(size), ne);
202   __ Cls(PickR(size), PickR(size));
203   __ Cneg(PickR(size), PickR(size), lt);
204   __ Mrs(PickX(), NZCV);
205   __ Nop();
206   __ Mul(PickR(size), PickR(size), PickR(size));
207   __ Rbit(PickR(size), PickR(size));
208   __ Rev(PickR(size), PickR(size));
209   __ Sdiv(PickR(size), PickR(size), PickR(size));
210   if (!labels_.empty()) {
211     __ Adr(PickX(), labels_.begin()->target);
212   }
213 }
214 
GenerateOperandSequence()215 void BenchCodeGenerator::GenerateOperandSequence() {
216   unsigned size = PickRSize();
217   // The cast to Operand is normally implicit for simple registers, but we
218   // explicitly specify it in every case here to ensure that the benchmark does
219   // what we expect.
220   __ And(PickR(size), PickR(size), Operand(PickR(size)));
221   __ Bics(PickR(size), PickR(size), Operand(PickR(size)));
222   __ Orr(PickR(size), PickR(size), Operand(PickR(size)));
223   __ Eor(PickR(size), PickR(size), Operand(PickR(size)));
224   __ Tst(PickR(size), Operand(PickR(size)));
225   __ Eon(PickR(size), PickR(size), Operand(PickR(size)));
226   __ Cmp(PickR(size), Operand(PickR(size)));
227   __ Negs(PickR(size), Operand(PickR(size)));
228   __ Mvn(PickR(size), Operand(PickR(size)));
229   __ Ccmp(PickR(size), Operand(PickR(size)), NoFlag, eq);
230   __ Ccmn(PickR(size), Operand(PickR(size)), NoFlag, eq);
231   __ Csel(PickR(size), Operand(PickR(size)), Operand(PickR(size)), lt);
232   {
233     // Ensure that `claim` doesn't alias any PickR().
234     UseScratchRegisterScope temps(masm_);
235     Register claim = temps.AcquireX();
236     // We should only claim a 16-byte-aligned amount, since we're using the
237     // system stack pointer.
238     __ Mov(claim, GetRandomBits(4) * 16);
239     __ Claim(Operand(claim));
240     // Also claim a bit more, so we can store at sp+claim.
241     __ Claim(Operand(32));
242     __ Poke(PickR(size), Operand(claim));
243     __ Peek(PickR(size), Operand(8));
244     __ Poke(PickR(size), Operand(16));
245     __ Peek(PickR(size), Operand(claim.W(), UXTW));
246     __ Drop(Operand(32));
247     __ Drop(Operand(claim));
248   }
249 }
250 
GenerateMemOperandSequence()251 void BenchCodeGenerator::GenerateMemOperandSequence() {
252   unsigned size = PickRSize();
253   RegList store_list = GetRandomBits(16);  // Restrict to [x0, x15].
254   __ StoreCPURegList(CPURegList(CPURegister::kRegister, size, store_list),
255                      MemOperand(scratch));
256   RegList load_list = GetRandomBits(16);  // Restrict to [x0, x15].
257   __ LoadCPURegList(CPURegList(CPURegister::kRegister, size, load_list),
258                     MemOperand(scratch));
259   __ Str(PickX(), MemOperand(scratch));
260   __ Strb(PickW(), MemOperand(scratch, 42));
261   __ Strh(PickW(), MemOperand(scratch, 42, PostIndex));
262   __ Ldrsw(PickX(), MemOperand(scratch, -42, PreIndex));
263   __ Ldr(PickR(size), MemOperand(scratch, 19));  // Translated to ldur.
264   __ Push(PickX(), PickX());
265   // Ensure unique registers (in [x0, x15]) for Pop.
266   __ Pop(Register(static_cast<int>(GetRandomBits(2)) + 0, kWRegSize),
267          Register(static_cast<int>(GetRandomBits(2)) + 4, kWRegSize),
268          Register(static_cast<int>(GetRandomBits(2)) + 8, kWRegSize),
269          Register(static_cast<int>(GetRandomBits(2)) + 12, kWRegSize));
270 }
271 
GenerateImmediateSequence()272 void BenchCodeGenerator::GenerateImmediateSequence() {
273   unsigned size = PickRSize();
274   __ And(PickR(size), PickR(size), GetRandomBits(size));
275   __ Sub(PickR(size), PickR(size), GetRandomBits(size));
276   __ Mov(PickR(size), GetRandomBits(size));
277   __ Movk(PickX(), GetRandomBits(16), static_cast<int>(GetRandomBits(2)) * 16);
278 }
279 
BindPendingLabels(uint64_t bind_mask)280 void BenchCodeGenerator::BindPendingLabels(uint64_t bind_mask) {
281   if (bind_mask == 0) return;
282   // The labels we bind here jump back to just after each branch that refers
283   // to them. This allows a simple, linear execution path, whilst still
284   // benchmarking long-range labels.
285   //
286   // Ensure that code falling through into this sequence does not jump
287   // back to an earlier point in the execution path.
288   Label done;
289   __ B(&done);
290 
291   std::list<LabelPair>::iterator it = labels_.begin();
292   while ((it != labels_.end()) && (bind_mask != 0)) {
293     if ((bind_mask & 1) != 0) {
294       // Bind the label and jump back to its source.
295       __ Bind(it->target);
296       __ B(it->cont);
297       delete it->target;
298       delete it->cont;
299       it = labels_.erase(it);
300     } else {
301       ++it;  // Don't bind this one.
302     }
303     bind_mask >>= 1;
304   }
305   __ Bind(&done);
306 }
307 
BindAllPendingLabels()308 void BenchCodeGenerator::BindAllPendingLabels() {
309   while (!labels_.empty()) {
310     // BindPendingLabels generates a branch over each block of bound labels.
311     // This will be repeated for each call here, but the effect is minimal and
312     // (empirically) we rarely accumulate more than 64 pending labels anyway.
313     BindPendingLabels(UINT64_MAX);
314   }
315 }
316 
GenerateBranchSequence()317 void BenchCodeGenerator::GenerateBranchSequence() {
318   {
319     LabelPair pair = {new Label(), new Label()};
320     __ B(lt, pair.target);
321     __ Bind(pair.cont);
322     labels_.push_front(pair);
323   }
324 
325   {
326     LabelPair pair = {new Label(), new Label()};
327     __ Tbz(PickX(),
328            static_cast<int>(GetRandomBits(kXRegSizeLog2)),
329            pair.target);
330     __ Bind(pair.cont);
331     labels_.push_front(pair);
332   }
333 
334   {
335     LabelPair pair = {new Label(), new Label()};
336     __ Cbz(PickX(), pair.target);
337     __ Bind(pair.cont);
338     labels_.push_front(pair);
339   }
340 }
341 
GenerateCallReturnSequence()342 void BenchCodeGenerator::GenerateCallReturnSequence() {
343   Label fn, done;
344 
345   if (PickBool()) {
346     __ Bl(&fn);
347   } else {
348     Register reg = PickX();
349     __ Adr(reg, &fn);
350     __ Blr(reg);
351   }
352   __ B(&done);
353 
354   __ Bind(&fn);
355   // Recurse with a randomised (but fairly small) minimum size.
356   Generate(GetRandomBits(8));
357 
358   __ Bind(&done);
359 }
360 
GenerateFPSequence()361 void BenchCodeGenerator::GenerateFPSequence() {
362   unsigned size = PickFPSize();
363   unsigned other_size = PickBool() ? size * 2 : size / 2;
364   if (other_size < kHRegSize) other_size = kDRegSize;
365   if (other_size > kDRegSize) other_size = kHRegSize;
366 
367   __ Fadd(PickV(size), PickV(size), PickV(size));
368   __ Fmul(PickV(size), PickV(size), PickV(size));
369   __ Fcvt(PickV(other_size), PickV(size));
370   __ Fjcvtzs(PickW(), PickD());
371   __ Fccmp(PickV(size), PickV(size), NCVFlag, pl);
372   __ Fdiv(PickV(size), PickV(size), PickV(size));
373   __ Fmov(PickV(size), 1.25 * GetRandomBits(2));
374   __ Fmsub(PickV(size), PickV(size), PickV(size), PickV(size));
375   __ Frintn(PickV(size), PickV(size));
376 }
377 
GenerateNEONSequence()378 void BenchCodeGenerator::GenerateNEONSequence() {
379   __ And(PickV().V16B(), PickV().V16B(), PickV().V16B());
380   __ Sqrshl(PickV().V8H(), PickV().V8H(), PickV().V8H());
381   __ Umull(PickV().V2D(), PickV().V2S(), PickV().V2S());
382   __ Sqdmlal2(PickV().V4S(), PickV().V8H(), PickV().V8H());
383 
384   // For structured loads and stores, we have to specify sequential (wrapped)
385   // registers, so start with [v16, v31] and allow them to wrap in to the
386   // [v0, v7] range.
387   VRegister vt(16 + static_cast<unsigned>(GetRandomBits(4)), kQRegSize);
388   VRegister vt2((vt.GetCode() + 1) % kNumberOfVRegisters, kQRegSize);
389   VRegister vt3((vt.GetCode() + 2) % kNumberOfVRegisters, kQRegSize);
390   VRegister vt4((vt.GetCode() + 3) % kNumberOfVRegisters, kQRegSize);
391   VIXL_ASSERT(!kCalleeSavedV.IncludesAliasOf(vt));
392   VIXL_ASSERT(!kCalleeSavedV.IncludesAliasOf(vt2));
393   VIXL_ASSERT(!kCalleeSavedV.IncludesAliasOf(vt3));
394   VIXL_ASSERT(!kCalleeSavedV.IncludesAliasOf(vt4));
395   __ Ld3(vt.V4S(), vt2.V4S(), vt3.V4S(), MemOperand(scratch));
396   __ St4(vt.V16B(), vt2.V16B(), vt3.V16B(), vt4.V16B(), MemOperand(scratch));
397 
398   __ Fmaxv(PickV().H(), PickV().V8H());
399   __ Fminp(PickV().V4S(), PickV().V4S(), PickV().V4S());
400 }
401