1 //===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains the AArch64 implementation of the TargetInstrInfo class.
10 //
11 //===----------------------------------------------------------------------===//
12
13 #include "AArch64InstrInfo.h"
14 #include "AArch64MachineFunctionInfo.h"
15 #include "AArch64Subtarget.h"
16 #include "MCTargetDesc/AArch64AddressingModes.h"
17 #include "Utils/AArch64BaseInfo.h"
18 #include "llvm/ADT/ArrayRef.h"
19 #include "llvm/ADT/STLExtras.h"
20 #include "llvm/ADT/SmallVector.h"
21 #include "llvm/CodeGen/MachineBasicBlock.h"
22 #include "llvm/CodeGen/MachineFrameInfo.h"
23 #include "llvm/CodeGen/MachineFunction.h"
24 #include "llvm/CodeGen/MachineInstr.h"
25 #include "llvm/CodeGen/MachineInstrBuilder.h"
26 #include "llvm/CodeGen/MachineMemOperand.h"
27 #include "llvm/CodeGen/MachineModuleInfo.h"
28 #include "llvm/CodeGen/MachineOperand.h"
29 #include "llvm/CodeGen/MachineRegisterInfo.h"
30 #include "llvm/CodeGen/StackMaps.h"
31 #include "llvm/CodeGen/TargetRegisterInfo.h"
32 #include "llvm/CodeGen/TargetSubtargetInfo.h"
33 #include "llvm/IR/DebugInfoMetadata.h"
34 #include "llvm/IR/DebugLoc.h"
35 #include "llvm/IR/GlobalValue.h"
36 #include "llvm/MC/MCAsmInfo.h"
37 #include "llvm/MC/MCInst.h"
38 #include "llvm/MC/MCInstrDesc.h"
39 #include "llvm/Support/Casting.h"
40 #include "llvm/Support/CodeGen.h"
41 #include "llvm/Support/CommandLine.h"
42 #include "llvm/Support/Compiler.h"
43 #include "llvm/Support/ErrorHandling.h"
44 #include "llvm/Support/MathExtras.h"
45 #include "llvm/Target/TargetMachine.h"
46 #include "llvm/Target/TargetOptions.h"
47 #include <cassert>
48 #include <cstdint>
49 #include <iterator>
50 #include <utility>
51
52 using namespace llvm;
53
54 #define GET_INSTRINFO_CTOR_DTOR
55 #include "AArch64GenInstrInfo.inc"
56
57 static cl::opt<unsigned> TBZDisplacementBits(
58 "aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
59 cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));
60
61 static cl::opt<unsigned> CBZDisplacementBits(
62 "aarch64-cbz-offset-bits", cl::Hidden, cl::init(19),
63 cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));
64
65 static cl::opt<unsigned>
66 BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19),
67 cl::desc("Restrict range of Bcc instructions (DEBUG)"));
68
69 AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
70 : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP,
71 AArch64::CATCHRET),
72 RI(STI.getTargetTriple()), Subtarget(STI) {}
73
74 /// GetInstSize - Return the number of bytes of code the specified
75 /// instruction may occupy. This returns the maximum number of bytes.
76 unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
77 const MachineBasicBlock &MBB = *MI.getParent();
78 const MachineFunction *MF = MBB.getParent();
79 const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
80
81 {
82 auto Op = MI.getOpcode();
83 if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR)
84 return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI);
85 }
86
87 // Meta-instructions emit no code.
88 if (MI.isMetaInstruction())
89 return 0;
90
91 // FIXME: We currently only handle pseudoinstructions that don't get expanded
92 // before the assembly printer.
93 unsigned NumBytes = 0;
94 const MCInstrDesc &Desc = MI.getDesc();
95 switch (Desc.getOpcode()) {
96 default:
97 // Anything not explicitly designated otherwise is a normal 4-byte insn.
98 NumBytes = 4;
99 break;
100 case TargetOpcode::STACKMAP:
101 // The upper bound for a stackmap intrinsic is the full length of its shadow
102 NumBytes = StackMapOpers(&MI).getNumPatchBytes();
103 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
104 break;
105 case TargetOpcode::PATCHPOINT:
106 // The size of the patchpoint intrinsic is the number of bytes requested
107 NumBytes = PatchPointOpers(&MI).getNumPatchBytes();
108 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
109 break;
110 case TargetOpcode::STATEPOINT:
111 NumBytes = StatepointOpers(&MI).getNumPatchBytes();
112 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
113 // No patch bytes means a normal call inst is emitted
114 if (NumBytes == 0)
115 NumBytes = 4;
116 break;
117 case AArch64::TLSDESC_CALLSEQ:
118 // This gets lowered to an instruction sequence which takes 16 bytes
119 NumBytes = 16;
120 break;
121 case AArch64::SpeculationBarrierISBDSBEndBB:
122 // This gets lowered to 2 4-byte instructions.
123 NumBytes = 8;
124 break;
125 case AArch64::SpeculationBarrierSBEndBB:
126 // This gets lowered to 1 4-byte instruction.
127 NumBytes = 4;
128 break;
129 case AArch64::JumpTableDest32:
130 case AArch64::JumpTableDest16:
131 case AArch64::JumpTableDest8:
132 NumBytes = 12;
133 break;
134 case AArch64::SPACE:
135 NumBytes = MI.getOperand(1).getImm();
136 break;
137 case TargetOpcode::BUNDLE:
138 NumBytes = getInstBundleLength(MI);
139 break;
140 }
141
142 return NumBytes;
143 }
144
145 unsigned AArch64InstrInfo::getInstBundleLength(const MachineInstr &MI) const {
146 unsigned Size = 0;
147 MachineBasicBlock::const_instr_iterator I = MI.getIterator();
148 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
149 while (++I != E && I->isInsideBundle()) {
150 assert(!I->isBundle() && "No nested bundle!");
151 Size += getInstSizeInBytes(*I);
152 }
153 return Size;
154 }
155
156 static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
157 SmallVectorImpl<MachineOperand> &Cond) {
158 // Block ends with fall-through condbranch.
159 switch (LastInst->getOpcode()) {
160 default:
161 llvm_unreachable("Unknown branch instruction?");
162 case AArch64::Bcc:
163 Target = LastInst->getOperand(1).getMBB();
164 Cond.push_back(LastInst->getOperand(0));
165 break;
166 case AArch64::CBZW:
167 case AArch64::CBZX:
168 case AArch64::CBNZW:
169 case AArch64::CBNZX:
170 Target = LastInst->getOperand(1).getMBB();
171 Cond.push_back(MachineOperand::CreateImm(-1));
172 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
173 Cond.push_back(LastInst->getOperand(0));
174 break;
175 case AArch64::TBZW:
176 case AArch64::TBZX:
177 case AArch64::TBNZW:
178 case AArch64::TBNZX:
179 Target = LastInst->getOperand(2).getMBB();
180 Cond.push_back(MachineOperand::CreateImm(-1));
181 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
182 Cond.push_back(LastInst->getOperand(0));
183 Cond.push_back(LastInst->getOperand(1));
184 }
185 }
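// Illustrative note (derived from the cases above, not a separate spec): the
// Cond vector uses a -1 sentinel to distinguish the folded compare-and-branch
// forms from a plain conditional branch:
//   Bcc %target          -> Cond = { <cc> }
//   CB(N)Z[W|X] %reg     -> Cond = { -1, <opcode>, %reg }
//   TB(N)Z[W|X] %reg, #b -> Cond = { -1, <opcode>, %reg, #b }
// instantiateCondBranch() and insertSelect() below consume the same layout.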
186
187 static unsigned getBranchDisplacementBits(unsigned Opc) {
188 switch (Opc) {
189 default:
190 llvm_unreachable("unexpected opcode!");
191 case AArch64::B:
192 return 64;
193 case AArch64::TBNZW:
194 case AArch64::TBZW:
195 case AArch64::TBNZX:
196 case AArch64::TBZX:
197 return TBZDisplacementBits;
198 case AArch64::CBNZW:
199 case AArch64::CBZW:
200 case AArch64::CBNZX:
201 case AArch64::CBZX:
202 return CBZDisplacementBits;
203 case AArch64::Bcc:
204 return BCCDisplacementBits;
205 }
206 }
207
208 bool AArch64InstrInfo::isBranchOffsetInRange(unsigned BranchOp,
209 int64_t BrOffset) const {
210 unsigned Bits = getBranchDisplacementBits(BranchOp);
211 assert(Bits >= 3 && "max branch displacement must be enough to jump "
212 "over conditional branch expansion");
213 return isIntN(Bits, BrOffset / 4);
214 }
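// Worked example (assuming the default option values above): TB(N)Z carries a
// 14-bit signed, word-scaled displacement, so isBranchOffsetInRange requires
// BrOffset/4 to fit in 14 signed bits, i.e. roughly +/-32 KiB of byte offset.
// CB(N)Z and Bcc use 19 bits, giving roughly +/-1 MiB.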
215
216 MachineBasicBlock *
217 AArch64InstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
218 switch (MI.getOpcode()) {
219 default:
220 llvm_unreachable("unexpected opcode!");
221 case AArch64::B:
222 return MI.getOperand(0).getMBB();
223 case AArch64::TBZW:
224 case AArch64::TBNZW:
225 case AArch64::TBZX:
226 case AArch64::TBNZX:
227 return MI.getOperand(2).getMBB();
228 case AArch64::CBZW:
229 case AArch64::CBNZW:
230 case AArch64::CBZX:
231 case AArch64::CBNZX:
232 case AArch64::Bcc:
233 return MI.getOperand(1).getMBB();
234 }
235 }
236
237 // Branch analysis.
238 bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
239 MachineBasicBlock *&TBB,
240 MachineBasicBlock *&FBB,
241 SmallVectorImpl<MachineOperand> &Cond,
242 bool AllowModify) const {
243 // If the block has no terminators, it just falls into the block after it.
244 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
245 if (I == MBB.end())
246 return false;
247
248 // Skip over SpeculationBarrierEndBB terminators
249 if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
250 I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
251 --I;
252 }
253
254 if (!isUnpredicatedTerminator(*I))
255 return false;
256
257 // Get the last instruction in the block.
258 MachineInstr *LastInst = &*I;
259
260 // If there is only one terminator instruction, process it.
261 unsigned LastOpc = LastInst->getOpcode();
262 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
263 if (isUncondBranchOpcode(LastOpc)) {
264 TBB = LastInst->getOperand(0).getMBB();
265 return false;
266 }
267 if (isCondBranchOpcode(LastOpc)) {
268 // Block ends with fall-through condbranch.
269 parseCondBranch(LastInst, TBB, Cond);
270 return false;
271 }
272 return true; // Can't handle indirect branch.
273 }
274
275 // Get the instruction before it if it is a terminator.
276 MachineInstr *SecondLastInst = &*I;
277 unsigned SecondLastOpc = SecondLastInst->getOpcode();
278
279 // If AllowModify is true and the block ends with two or more unconditional
280 // branches, delete all but the first unconditional branch.
281 if (AllowModify && isUncondBranchOpcode(LastOpc)) {
282 while (isUncondBranchOpcode(SecondLastOpc)) {
283 LastInst->eraseFromParent();
284 LastInst = SecondLastInst;
285 LastOpc = LastInst->getOpcode();
286 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
287 // Return now; the only terminator is an unconditional branch.
288 TBB = LastInst->getOperand(0).getMBB();
289 return false;
290 } else {
291 SecondLastInst = &*I;
292 SecondLastOpc = SecondLastInst->getOpcode();
293 }
294 }
295 }
296
297 // If we're allowed to modify and the block ends in an unconditional branch
298 // which could simply fallthrough, remove the branch. (Note: This case only
299 // matters when we can't understand the whole sequence, otherwise it's also
300 // handled by BranchFolding.cpp.)
301 if (AllowModify && isUncondBranchOpcode(LastOpc) &&
302 MBB.isLayoutSuccessor(getBranchDestBlock(*LastInst))) {
303 LastInst->eraseFromParent();
304 LastInst = SecondLastInst;
305 LastOpc = LastInst->getOpcode();
306 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
307 assert(!isUncondBranchOpcode(LastOpc) &&
308 "unreachable unconditional branches removed above");
309
310 if (isCondBranchOpcode(LastOpc)) {
311 // Block ends with fall-through condbranch.
312 parseCondBranch(LastInst, TBB, Cond);
313 return false;
314 }
315 return true; // Can't handle indirect branch.
316 } else {
317 SecondLastInst = &*I;
318 SecondLastOpc = SecondLastInst->getOpcode();
319 }
320 }
321
322 // If there are three terminators, we don't know what sort of block this is.
323 if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
324 return true;
325
326 // If the block ends with a B and a Bcc, handle it.
327 if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
328 parseCondBranch(SecondLastInst, TBB, Cond);
329 FBB = LastInst->getOperand(0).getMBB();
330 return false;
331 }
332
333 // If the block ends with two unconditional branches, handle it. The second
334 // one is not executed, so remove it.
335 if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
336 TBB = SecondLastInst->getOperand(0).getMBB();
337 I = LastInst;
338 if (AllowModify)
339 I->eraseFromParent();
340 return false;
341 }
342
343 // ...likewise if it ends with an indirect branch followed by an unconditional
344 // branch.
345 if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
346 I = LastInst;
347 if (AllowModify)
348 I->eraseFromParent();
349 return true;
350 }
351
352 // Otherwise, can't handle this.
353 return true;
354 }
355
356 bool AArch64InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
357 MachineBranchPredicate &MBP,
358 bool AllowModify) const {
359 // For the moment, handle only a block which ends with a cb(n)zx followed by
360 // a fallthrough. Why this? Because it is a common form.
361 // TODO: Should we handle b.cc?
362
363 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
364 if (I == MBB.end())
365 return true;
366
367 // Skip over SpeculationBarrierEndBB terminators
368 if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
369 I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
370 --I;
371 }
372
373 if (!isUnpredicatedTerminator(*I))
374 return true;
375
376 // Get the last instruction in the block.
377 MachineInstr *LastInst = &*I;
378 unsigned LastOpc = LastInst->getOpcode();
379 if (!isCondBranchOpcode(LastOpc))
380 return true;
381
382 switch (LastOpc) {
383 default:
384 return true;
385 case AArch64::CBZW:
386 case AArch64::CBZX:
387 case AArch64::CBNZW:
388 case AArch64::CBNZX:
389 break;
390 };
391
392 MBP.TrueDest = LastInst->getOperand(1).getMBB();
393 assert(MBP.TrueDest && "expected!");
394 MBP.FalseDest = MBB.getNextNode();
395
396 MBP.ConditionDef = nullptr;
397 MBP.SingleUseCondition = false;
398
399 MBP.LHS = LastInst->getOperand(0);
400 MBP.RHS = MachineOperand::CreateImm(0);
401 MBP.Predicate = LastOpc == AArch64::CBNZX ? MachineBranchPredicate::PRED_NE
402 : MachineBranchPredicate::PRED_EQ;
403 return false;
404 }
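// Example of the predicate produced above: for a block ending in
//   CBNZX %x0, %bb.2   ; falls through to the next block
// MBP is filled with LHS = %x0, RHS = #0, Predicate = PRED_NE,
// TrueDest = %bb.2 and FalseDest = the layout successor.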
405
406 bool AArch64InstrInfo::reverseBranchCondition(
407 SmallVectorImpl<MachineOperand> &Cond) const {
408 if (Cond[0].getImm() != -1) {
409 // Regular Bcc
410 AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
411 Cond[0].setImm(AArch64CC::getInvertedCondCode(CC));
412 } else {
413 // Folded compare-and-branch
414 switch (Cond[1].getImm()) {
415 default:
416 llvm_unreachable("Unknown conditional branch!");
417 case AArch64::CBZW:
418 Cond[1].setImm(AArch64::CBNZW);
419 break;
420 case AArch64::CBNZW:
421 Cond[1].setImm(AArch64::CBZW);
422 break;
423 case AArch64::CBZX:
424 Cond[1].setImm(AArch64::CBNZX);
425 break;
426 case AArch64::CBNZX:
427 Cond[1].setImm(AArch64::CBZX);
428 break;
429 case AArch64::TBZW:
430 Cond[1].setImm(AArch64::TBNZW);
431 break;
432 case AArch64::TBNZW:
433 Cond[1].setImm(AArch64::TBZW);
434 break;
435 case AArch64::TBZX:
436 Cond[1].setImm(AArch64::TBNZX);
437 break;
438 case AArch64::TBNZX:
439 Cond[1].setImm(AArch64::TBZX);
440 break;
441 }
442 }
443
444 return false;
445 }
446
447 unsigned AArch64InstrInfo::removeBranch(MachineBasicBlock &MBB,
448 int *BytesRemoved) const {
449 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
450 if (I == MBB.end())
451 return 0;
452
453 if (!isUncondBranchOpcode(I->getOpcode()) &&
454 !isCondBranchOpcode(I->getOpcode()))
455 return 0;
456
457 // Remove the branch.
458 I->eraseFromParent();
459
460 I = MBB.end();
461
462 if (I == MBB.begin()) {
463 if (BytesRemoved)
464 *BytesRemoved = 4;
465 return 1;
466 }
467 --I;
468 if (!isCondBranchOpcode(I->getOpcode())) {
469 if (BytesRemoved)
470 *BytesRemoved = 4;
471 return 1;
472 }
473
474 // Remove the branch.
475 I->eraseFromParent();
476 if (BytesRemoved)
477 *BytesRemoved = 8;
478
479 return 2;
480 }
481
482 void AArch64InstrInfo::instantiateCondBranch(
483 MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB,
484 ArrayRef<MachineOperand> Cond) const {
485 if (Cond[0].getImm() != -1) {
486 // Regular Bcc
487 BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB);
488 } else {
489 // Folded compare-and-branch
490 // Note that we use addOperand instead of addReg to keep the flags.
491 const MachineInstrBuilder MIB =
492 BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]);
493 if (Cond.size() > 3)
494 MIB.addImm(Cond[3].getImm());
495 MIB.addMBB(TBB);
496 }
497 }
498
499 unsigned AArch64InstrInfo::insertBranch(
500 MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
501 ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const {
502 // Shouldn't be a fall through.
503 assert(TBB && "insertBranch must not be told to insert a fallthrough");
504
505 if (!FBB) {
506 if (Cond.empty()) // Unconditional branch?
507 BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB);
508 else
509 instantiateCondBranch(MBB, DL, TBB, Cond);
510
511 if (BytesAdded)
512 *BytesAdded = 4;
513
514 return 1;
515 }
516
517 // Two-way conditional branch.
518 instantiateCondBranch(MBB, DL, TBB, Cond);
519 BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB);
520
521 if (BytesAdded)
522 *BytesAdded = 8;
523
524 return 2;
525 }
526
527 // Find the original register that VReg is copied from.
528 static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
529 while (Register::isVirtualRegister(VReg)) {
530 const MachineInstr *DefMI = MRI.getVRegDef(VReg);
531 if (!DefMI->isFullCopy())
532 return VReg;
533 VReg = DefMI->getOperand(1).getReg();
534 }
535 return VReg;
536 }
537
538 // Determine if VReg is defined by an instruction that can be folded into a
539 // csel instruction. If so, return the folded opcode, and the replacement
540 // register.
541 static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
542 unsigned *NewVReg = nullptr) {
543 VReg = removeCopies(MRI, VReg);
544 if (!Register::isVirtualRegister(VReg))
545 return 0;
546
547 bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
548 const MachineInstr *DefMI = MRI.getVRegDef(VReg);
549 unsigned Opc = 0;
550 unsigned SrcOpNum = 0;
551 switch (DefMI->getOpcode()) {
552 case AArch64::ADDSXri:
553 case AArch64::ADDSWri:
554 // if NZCV is used, do not fold.
555 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
556 return 0;
557 // fall-through to ADDXri and ADDWri.
558 LLVM_FALLTHROUGH;
559 case AArch64::ADDXri:
560 case AArch64::ADDWri:
561 // add x, 1 -> csinc.
562 if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 ||
563 DefMI->getOperand(3).getImm() != 0)
564 return 0;
565 SrcOpNum = 1;
566 Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
567 break;
568
569 case AArch64::ORNXrr:
570 case AArch64::ORNWrr: {
571 // not x -> csinv, represented as orn dst, xzr, src.
572 unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
573 if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
574 return 0;
575 SrcOpNum = 2;
576 Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr;
577 break;
578 }
579
580 case AArch64::SUBSXrr:
581 case AArch64::SUBSWrr:
582 // if NZCV is used, do not fold.
583 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
584 return 0;
585 // fall-through to SUBXrr and SUBWrr.
586 LLVM_FALLTHROUGH;
587 case AArch64::SUBXrr:
588 case AArch64::SUBWrr: {
589 // neg x -> csneg, represented as sub dst, xzr, src.
590 unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
591 if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
592 return 0;
593 SrcOpNum = 2;
594 Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr;
595 break;
596 }
597 default:
598 return 0;
599 }
600 assert(Opc && SrcOpNum && "Missing parameters");
601
602 if (NewVReg)
603 *NewVReg = DefMI->getOperand(SrcOpNum).getReg();
604 return Opc;
605 }
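// Summary of the foldable patterns recognized above, all of which let the
// select consume the defining operation:
//   add x, #1          -> CSINC  (conditional increment)
//   orn x, xzr/wzr, y  -> CSINV  (conditional invert, i.e. "not")
//   sub x, xzr/wzr, y  -> CSNEG  (conditional negate)
// The flag-setting ADDS/SUBS forms only qualify when NZCV is otherwise dead.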
606
607 bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
608 ArrayRef<MachineOperand> Cond,
609 Register DstReg, Register TrueReg,
610 Register FalseReg, int &CondCycles,
611 int &TrueCycles,
612 int &FalseCycles) const {
613 // Check register classes.
614 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
615 const TargetRegisterClass *RC =
616 RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
617 if (!RC)
618 return false;
619
620 // Also need to check the dest regclass, in case we're trying to optimize
621 // something like:
622 // %1(gpr) = PHI %2(fpr), bb1, %(fpr), bb2
623 if (!RI.getCommonSubClass(RC, MRI.getRegClass(DstReg)))
624 return false;
625
626 // Expanding cbz/tbz requires an extra cycle of latency on the condition.
627 unsigned ExtraCondLat = Cond.size() != 1;
628
629 // GPRs are handled by csel.
630 // FIXME: Fold in x+1, -x, and ~x when applicable.
631 if (AArch64::GPR64allRegClass.hasSubClassEq(RC) ||
632 AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
633 // Single-cycle csel, csinc, csinv, and csneg.
634 CondCycles = 1 + ExtraCondLat;
635 TrueCycles = FalseCycles = 1;
636 if (canFoldIntoCSel(MRI, TrueReg))
637 TrueCycles = 0;
638 else if (canFoldIntoCSel(MRI, FalseReg))
639 FalseCycles = 0;
640 return true;
641 }
642
643 // Scalar floating point is handled by fcsel.
644 // FIXME: Form fabs, fmin, and fmax when applicable.
645 if (AArch64::FPR64RegClass.hasSubClassEq(RC) ||
646 AArch64::FPR32RegClass.hasSubClassEq(RC)) {
647 CondCycles = 5 + ExtraCondLat;
648 TrueCycles = FalseCycles = 2;
649 return true;
650 }
651
652 // Can't do vectors.
653 return false;
654 }
655
656 void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
657 MachineBasicBlock::iterator I,
658 const DebugLoc &DL, Register DstReg,
659 ArrayRef<MachineOperand> Cond,
660 Register TrueReg, Register FalseReg) const {
661 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
662
663 // Parse the condition code, see parseCondBranch() above.
664 AArch64CC::CondCode CC;
665 switch (Cond.size()) {
666 default:
667 llvm_unreachable("Unknown condition opcode in Cond");
668 case 1: // b.cc
669 CC = AArch64CC::CondCode(Cond[0].getImm());
670 break;
671 case 3: { // cbz/cbnz
672 // We must insert a compare against 0.
673 bool Is64Bit;
674 switch (Cond[1].getImm()) {
675 default:
676 llvm_unreachable("Unknown branch opcode in Cond");
677 case AArch64::CBZW:
678 Is64Bit = false;
679 CC = AArch64CC::EQ;
680 break;
681 case AArch64::CBZX:
682 Is64Bit = true;
683 CC = AArch64CC::EQ;
684 break;
685 case AArch64::CBNZW:
686 Is64Bit = false;
687 CC = AArch64CC::NE;
688 break;
689 case AArch64::CBNZX:
690 Is64Bit = true;
691 CC = AArch64CC::NE;
692 break;
693 }
694 Register SrcReg = Cond[2].getReg();
695 if (Is64Bit) {
696 // cmp reg, #0 is actually subs xzr, reg, #0.
697 MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass);
698 BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR)
699 .addReg(SrcReg)
700 .addImm(0)
701 .addImm(0);
702 } else {
703 MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass);
704 BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR)
705 .addReg(SrcReg)
706 .addImm(0)
707 .addImm(0);
708 }
709 break;
710 }
711 case 4: { // tbz/tbnz
712 // We must insert a tst instruction.
713 switch (Cond[1].getImm()) {
714 default:
715 llvm_unreachable("Unknown branch opcode in Cond");
716 case AArch64::TBZW:
717 case AArch64::TBZX:
718 CC = AArch64CC::EQ;
719 break;
720 case AArch64::TBNZW:
721 case AArch64::TBNZX:
722 CC = AArch64CC::NE;
723 break;
724 }
725 // cmp reg, #foo is actually ands xzr, reg, #1<<foo.
726 if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
727 BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR)
728 .addReg(Cond[2].getReg())
729 .addImm(
730 AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 32));
731 else
732 BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR)
733 .addReg(Cond[2].getReg())
734 .addImm(
735 AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 64));
736 break;
737 }
738 }
739
740 unsigned Opc = 0;
741 const TargetRegisterClass *RC = nullptr;
742 bool TryFold = false;
743 if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) {
744 RC = &AArch64::GPR64RegClass;
745 Opc = AArch64::CSELXr;
746 TryFold = true;
747 } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) {
748 RC = &AArch64::GPR32RegClass;
749 Opc = AArch64::CSELWr;
750 TryFold = true;
751 } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) {
752 RC = &AArch64::FPR64RegClass;
753 Opc = AArch64::FCSELDrrr;
754 } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) {
755 RC = &AArch64::FPR32RegClass;
756 Opc = AArch64::FCSELSrrr;
757 }
758 assert(RC && "Unsupported regclass");
759
760 // Try folding simple instructions into the csel.
761 if (TryFold) {
762 unsigned NewVReg = 0;
763 unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewVReg);
764 if (FoldedOpc) {
765 // The folded opcodes csinc, csinv and csneg apply the operation to
766 // FalseReg, so we need to invert the condition.
767 CC = AArch64CC::getInvertedCondCode(CC);
768 TrueReg = FalseReg;
769 } else
770 FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewVReg);
771
772 // Fold the operation. Leave any dead instructions for DCE to clean up.
773 if (FoldedOpc) {
774 FalseReg = NewVReg;
775 Opc = FoldedOpc;
776 // This extends the live range of NewVReg.
777 MRI.clearKillFlags(NewVReg);
778 }
779 }
780
781 // Pull all virtual registers into the appropriate class.
782 MRI.constrainRegClass(TrueReg, RC);
783 MRI.constrainRegClass(FalseReg, RC);
784
785 // Insert the csel.
786 BuildMI(MBB, I, DL, get(Opc), DstReg)
787 .addReg(TrueReg)
788 .addReg(FalseReg)
789 .addImm(CC);
790 }
791
792 /// Returns true if a MOVi32imm or MOVi64imm can be expanded to an ORRxx.
793 static bool canBeExpandedToORR(const MachineInstr &MI, unsigned BitSize) {
794 uint64_t Imm = MI.getOperand(1).getImm();
795 uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize);
796 uint64_t Encoding;
797 return AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding);
798 }
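// Illustrative example for canBeExpandedToORR(): MOVi32imm of 0x0000ffff is a
// valid 32-bit logical immediate (a contiguous run of ones), so it can become
// "orr wN, wzr, #0xffff" and is as cheap as a move, whereas 0x12345678 is not
// encodable and keeps its multi-instruction expansion.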
799
800 // FIXME: this implementation should be micro-architecture dependent, so a
801 // micro-architecture target hook should be introduced here in future.
802 bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
803 if (!Subtarget.hasCustomCheapAsMoveHandling())
804 return MI.isAsCheapAsAMove();
805
806 const unsigned Opcode = MI.getOpcode();
807
808 // Firstly, check cases gated by features.
809
810 if (Subtarget.hasZeroCycleZeroingFP()) {
811 if (Opcode == AArch64::FMOVH0 ||
812 Opcode == AArch64::FMOVS0 ||
813 Opcode == AArch64::FMOVD0)
814 return true;
815 }
816
817 if (Subtarget.hasZeroCycleZeroingGP()) {
818 if (Opcode == TargetOpcode::COPY &&
819 (MI.getOperand(1).getReg() == AArch64::WZR ||
820 MI.getOperand(1).getReg() == AArch64::XZR))
821 return true;
822 }
823
824 // Secondly, check cases specific to sub-targets.
825
826 if (Subtarget.hasExynosCheapAsMoveHandling()) {
827 if (isExynosCheapAsMove(MI))
828 return true;
829
830 return MI.isAsCheapAsAMove();
831 }
832
833 // Finally, check generic cases.
834
835 switch (Opcode) {
836 default:
837 return false;
838
839 // add/sub on register without shift
840 case AArch64::ADDWri:
841 case AArch64::ADDXri:
842 case AArch64::SUBWri:
843 case AArch64::SUBXri:
844 return (MI.getOperand(3).getImm() == 0);
845
846 // logical ops on immediate
847 case AArch64::ANDWri:
848 case AArch64::ANDXri:
849 case AArch64::EORWri:
850 case AArch64::EORXri:
851 case AArch64::ORRWri:
852 case AArch64::ORRXri:
853 return true;
854
855 // logical ops on register without shift
856 case AArch64::ANDWrr:
857 case AArch64::ANDXrr:
858 case AArch64::BICWrr:
859 case AArch64::BICXrr:
860 case AArch64::EONWrr:
861 case AArch64::EONXrr:
862 case AArch64::EORWrr:
863 case AArch64::EORXrr:
864 case AArch64::ORNWrr:
865 case AArch64::ORNXrr:
866 case AArch64::ORRWrr:
867 case AArch64::ORRXrr:
868 return true;
869
870 // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
871 // ORRXri, it is as cheap as MOV
872 case AArch64::MOVi32imm:
873 return canBeExpandedToORR(MI, 32);
874 case AArch64::MOVi64imm:
875 return canBeExpandedToORR(MI, 64);
876 }
877
878 llvm_unreachable("Unknown opcode to check as cheap as a move!");
879 }
880
881 bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) {
882 switch (MI.getOpcode()) {
883 default:
884 return false;
885
886 case AArch64::ADDWrs:
887 case AArch64::ADDXrs:
888 case AArch64::ADDSWrs:
889 case AArch64::ADDSXrs: {
890 unsigned Imm = MI.getOperand(3).getImm();
891 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
892 if (ShiftVal == 0)
893 return true;
894 return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5;
895 }
896
897 case AArch64::ADDWrx:
898 case AArch64::ADDXrx:
899 case AArch64::ADDXrx64:
900 case AArch64::ADDSWrx:
901 case AArch64::ADDSXrx:
902 case AArch64::ADDSXrx64: {
903 unsigned Imm = MI.getOperand(3).getImm();
904 switch (AArch64_AM::getArithExtendType(Imm)) {
905 default:
906 return false;
907 case AArch64_AM::UXTB:
908 case AArch64_AM::UXTH:
909 case AArch64_AM::UXTW:
910 case AArch64_AM::UXTX:
911 return AArch64_AM::getArithShiftValue(Imm) <= 4;
912 }
913 }
914
915 case AArch64::SUBWrs:
916 case AArch64::SUBSWrs: {
917 unsigned Imm = MI.getOperand(3).getImm();
918 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
919 return ShiftVal == 0 ||
920 (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31);
921 }
922
923 case AArch64::SUBXrs:
924 case AArch64::SUBSXrs: {
925 unsigned Imm = MI.getOperand(3).getImm();
926 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
927 return ShiftVal == 0 ||
928 (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63);
929 }
930
931 case AArch64::SUBWrx:
932 case AArch64::SUBXrx:
933 case AArch64::SUBXrx64:
934 case AArch64::SUBSWrx:
935 case AArch64::SUBSXrx:
936 case AArch64::SUBSXrx64: {
937 unsigned Imm = MI.getOperand(3).getImm();
938 switch (AArch64_AM::getArithExtendType(Imm)) {
939 default:
940 return false;
941 case AArch64_AM::UXTB:
942 case AArch64_AM::UXTH:
943 case AArch64_AM::UXTW:
944 case AArch64_AM::UXTX:
945 return AArch64_AM::getArithShiftValue(Imm) == 0;
946 }
947 }
948
949 case AArch64::LDRBBroW:
950 case AArch64::LDRBBroX:
951 case AArch64::LDRBroW:
952 case AArch64::LDRBroX:
953 case AArch64::LDRDroW:
954 case AArch64::LDRDroX:
955 case AArch64::LDRHHroW:
956 case AArch64::LDRHHroX:
957 case AArch64::LDRHroW:
958 case AArch64::LDRHroX:
959 case AArch64::LDRQroW:
960 case AArch64::LDRQroX:
961 case AArch64::LDRSBWroW:
962 case AArch64::LDRSBWroX:
963 case AArch64::LDRSBXroW:
964 case AArch64::LDRSBXroX:
965 case AArch64::LDRSHWroW:
966 case AArch64::LDRSHWroX:
967 case AArch64::LDRSHXroW:
968 case AArch64::LDRSHXroX:
969 case AArch64::LDRSWroW:
970 case AArch64::LDRSWroX:
971 case AArch64::LDRSroW:
972 case AArch64::LDRSroX:
973 case AArch64::LDRWroW:
974 case AArch64::LDRWroX:
975 case AArch64::LDRXroW:
976 case AArch64::LDRXroX:
977 case AArch64::PRFMroW:
978 case AArch64::PRFMroX:
979 case AArch64::STRBBroW:
980 case AArch64::STRBBroX:
981 case AArch64::STRBroW:
982 case AArch64::STRBroX:
983 case AArch64::STRDroW:
984 case AArch64::STRDroX:
985 case AArch64::STRHHroW:
986 case AArch64::STRHHroX:
987 case AArch64::STRHroW:
988 case AArch64::STRHroX:
989 case AArch64::STRQroW:
990 case AArch64::STRQroX:
991 case AArch64::STRSroW:
992 case AArch64::STRSroX:
993 case AArch64::STRWroW:
994 case AArch64::STRWroX:
995 case AArch64::STRXroW:
996 case AArch64::STRXroX: {
997 unsigned IsSigned = MI.getOperand(3).getImm();
998 return !IsSigned;
999 }
1000 }
1001 }
1002
1003 bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) {
1004 unsigned Opc = MI.getOpcode();
1005 switch (Opc) {
1006 default:
1007 return false;
1008 case AArch64::SEH_StackAlloc:
1009 case AArch64::SEH_SaveFPLR:
1010 case AArch64::SEH_SaveFPLR_X:
1011 case AArch64::SEH_SaveReg:
1012 case AArch64::SEH_SaveReg_X:
1013 case AArch64::SEH_SaveRegP:
1014 case AArch64::SEH_SaveRegP_X:
1015 case AArch64::SEH_SaveFReg:
1016 case AArch64::SEH_SaveFReg_X:
1017 case AArch64::SEH_SaveFRegP:
1018 case AArch64::SEH_SaveFRegP_X:
1019 case AArch64::SEH_SetFP:
1020 case AArch64::SEH_AddFP:
1021 case AArch64::SEH_Nop:
1022 case AArch64::SEH_PrologEnd:
1023 case AArch64::SEH_EpilogStart:
1024 case AArch64::SEH_EpilogEnd:
1025 return true;
1026 }
1027 }
1028
1029 bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
1030 Register &SrcReg, Register &DstReg,
1031 unsigned &SubIdx) const {
1032 switch (MI.getOpcode()) {
1033 default:
1034 return false;
1035 case AArch64::SBFMXri: // aka sxtw
1036 case AArch64::UBFMXri: // aka uxtw
1037 // Check for the 32 -> 64 bit extension case; these instructions can do
1038 // much more.
1039 if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31)
1040 return false;
1041 // This is a signed or unsigned 32 -> 64 bit extension.
1042 SrcReg = MI.getOperand(1).getReg();
1043 DstReg = MI.getOperand(0).getReg();
1044 SubIdx = AArch64::sub_32;
1045 return true;
1046 }
1047 }
1048
1049 bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
1050 const MachineInstr &MIa, const MachineInstr &MIb) const {
1051 const TargetRegisterInfo *TRI = &getRegisterInfo();
1052 const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
1053 int64_t OffsetA = 0, OffsetB = 0;
1054 unsigned WidthA = 0, WidthB = 0;
1055 bool OffsetAIsScalable = false, OffsetBIsScalable = false;
1056
1057 assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
1058 assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");
1059
1060 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() ||
1061 MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
1062 return false;
1063
1064 // Retrieve the base, the offset from the base, and the width. Width
1065 // is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If
1066 // the bases are identical, and the offset of the lower memory access plus
1067 // its width does not extend past the offset of the higher memory access,
1068 // then the memory accesses are disjoint.
1069 // If OffsetAIsScalable and OffsetBIsScalable are both true, they
1070 // are assumed to have the same scale (vscale).
1071 if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, OffsetAIsScalable,
1072 WidthA, TRI) &&
1073 getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, OffsetBIsScalable,
1074 WidthB, TRI)) {
1075 if (BaseOpA->isIdenticalTo(*BaseOpB) &&
1076 OffsetAIsScalable == OffsetBIsScalable) {
1077 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
1078 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
1079 int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
1080 if (LowOffset + LowWidth <= HighOffset)
1081 return true;
1082 }
1083 }
1084 return false;
1085 }
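// Example of the disjointness check above: for "ldr x0, [x1, #8]" (offset 8,
// width 8) and "str x2, [x1, #16]" (offset 16, width 8) the bases match and
// 8 + 8 <= 16, so the accesses are reported as trivially disjoint; with the
// store at [x1, #12] they could overlap and the function returns false.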
1086
1087 bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
1088 const MachineBasicBlock *MBB,
1089 const MachineFunction &MF) const {
1090 if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF))
1091 return true;
1092 switch (MI.getOpcode()) {
1093 case AArch64::HINT:
1094 // CSDB hints are scheduling barriers.
1095 if (MI.getOperand(0).getImm() == 0x14)
1096 return true;
1097 break;
1098 case AArch64::DSB:
1099 case AArch64::ISB:
1100 // DSB and ISB also are scheduling barriers.
1101 return true;
1102 default:;
1103 }
1104 return isSEHInstruction(MI);
1105 }
1106
1107 /// analyzeCompare - For a comparison instruction, return the source registers
1108 /// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
1109 /// Return true if the comparison instruction can be analyzed.
1110 bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
1111 Register &SrcReg2, int &CmpMask,
1112 int &CmpValue) const {
1113 // The first operand can be a frame index where we'd normally expect a
1114 // register.
1115 assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands");
1116 if (!MI.getOperand(1).isReg())
1117 return false;
1118
1119 switch (MI.getOpcode()) {
1120 default:
1121 break;
1122 case AArch64::SUBSWrr:
1123 case AArch64::SUBSWrs:
1124 case AArch64::SUBSWrx:
1125 case AArch64::SUBSXrr:
1126 case AArch64::SUBSXrs:
1127 case AArch64::SUBSXrx:
1128 case AArch64::ADDSWrr:
1129 case AArch64::ADDSWrs:
1130 case AArch64::ADDSWrx:
1131 case AArch64::ADDSXrr:
1132 case AArch64::ADDSXrs:
1133 case AArch64::ADDSXrx:
1134 // Replace SUBSWrr with SUBWrr if NZCV is not used.
1135 SrcReg = MI.getOperand(1).getReg();
1136 SrcReg2 = MI.getOperand(2).getReg();
1137 CmpMask = ~0;
1138 CmpValue = 0;
1139 return true;
1140 case AArch64::SUBSWri:
1141 case AArch64::ADDSWri:
1142 case AArch64::SUBSXri:
1143 case AArch64::ADDSXri:
1144 SrcReg = MI.getOperand(1).getReg();
1145 SrcReg2 = 0;
1146 CmpMask = ~0;
1147 // FIXME: In order to convert CmpValue to 0 or 1
1148 CmpValue = MI.getOperand(2).getImm() != 0;
1149 return true;
1150 case AArch64::ANDSWri:
1151 case AArch64::ANDSXri:
1152 // ANDS does not use the same encoding scheme as the other xxxS
1153 // instructions.
1154 SrcReg = MI.getOperand(1).getReg();
1155 SrcReg2 = 0;
1156 CmpMask = ~0;
1157 // FIXME: The return value type of decodeLogicalImmediate is uint64_t,
1158 // while the type of CmpValue is int. When converting uint64_t to int,
1159 // the high 32 bits of uint64_t will be lost.
1160 // In fact it causes a bug in spec2006-483.xalancbmk
1161 // CmpValue is only used to compare with zero in OptimizeCompareInstr
1162 CmpValue = AArch64_AM::decodeLogicalImmediate(
1163 MI.getOperand(2).getImm(),
1164 MI.getOpcode() == AArch64::ANDSWri ? 32 : 64) != 0;
1165 return true;
1166 }
1167
1168 return false;
1169 }
1170
1171 static bool UpdateOperandRegClass(MachineInstr &Instr) {
1172 MachineBasicBlock *MBB = Instr.getParent();
1173 assert(MBB && "Can't get MachineBasicBlock here");
1174 MachineFunction *MF = MBB->getParent();
1175 assert(MF && "Can't get MachineFunction here");
1176 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
1177 const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
1178 MachineRegisterInfo *MRI = &MF->getRegInfo();
1179
1180 for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx;
1181 ++OpIdx) {
1182 MachineOperand &MO = Instr.getOperand(OpIdx);
1183 const TargetRegisterClass *OpRegCstraints =
1184 Instr.getRegClassConstraint(OpIdx, TII, TRI);
1185
1186 // If there's no constraint, there's nothing to do.
1187 if (!OpRegCstraints)
1188 continue;
1189 // If the operand is a frame index, there's nothing to do here.
1190 // A frame index operand will resolve correctly during PEI.
1191 if (MO.isFI())
1192 continue;
1193
1194 assert(MO.isReg() &&
1195 "Operand has register constraints without being a register!");
1196
1197 Register Reg = MO.getReg();
1198 if (Register::isPhysicalRegister(Reg)) {
1199 if (!OpRegCstraints->contains(Reg))
1200 return false;
1201 } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) &&
1202 !MRI->constrainRegClass(Reg, OpRegCstraints))
1203 return false;
1204 }
1205
1206 return true;
1207 }
1208
1209 /// Return the opcode that does not set flags when possible - otherwise
1210 /// return the original opcode. The caller is responsible for doing the actual
1211 /// substitution and legality checking.
1212 static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) {
1213 // Don't convert all compare instructions, because for some the zero register
1214 // encoding becomes the sp register.
1215 bool MIDefinesZeroReg = false;
1216 if (MI.definesRegister(AArch64::WZR) || MI.definesRegister(AArch64::XZR))
1217 MIDefinesZeroReg = true;
1218
1219 switch (MI.getOpcode()) {
1220 default:
1221 return MI.getOpcode();
1222 case AArch64::ADDSWrr:
1223 return AArch64::ADDWrr;
1224 case AArch64::ADDSWri:
1225 return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri;
1226 case AArch64::ADDSWrs:
1227 return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs;
1228 case AArch64::ADDSWrx:
1229 return AArch64::ADDWrx;
1230 case AArch64::ADDSXrr:
1231 return AArch64::ADDXrr;
1232 case AArch64::ADDSXri:
1233 return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri;
1234 case AArch64::ADDSXrs:
1235 return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs;
1236 case AArch64::ADDSXrx:
1237 return AArch64::ADDXrx;
1238 case AArch64::SUBSWrr:
1239 return AArch64::SUBWrr;
1240 case AArch64::SUBSWri:
1241 return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri;
1242 case AArch64::SUBSWrs:
1243 return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs;
1244 case AArch64::SUBSWrx:
1245 return AArch64::SUBWrx;
1246 case AArch64::SUBSXrr:
1247 return AArch64::SUBXrr;
1248 case AArch64::SUBSXri:
1249 return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri;
1250 case AArch64::SUBSXrs:
1251 return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs;
1252 case AArch64::SUBSXrx:
1253 return AArch64::SUBXrx;
1254 }
1255 }
1256
1257 enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 };
1258
1259 /// True when condition flags are accessed (either by writing or reading)
1260 /// on the instruction trace starting at From and ending at To.
1261 ///
1262 /// Note: If From and To are from different blocks it's assumed the condition
1263 /// flags are accessed on the path.
1264 static bool areCFlagsAccessedBetweenInstrs(
1265 MachineBasicBlock::iterator From, MachineBasicBlock::iterator To,
1266 const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) {
1267 // Early exit if To is at the beginning of the BB.
1268 if (To == To->getParent()->begin())
1269 return true;
1270
1271 // Check whether the instructions are in the same basic block
1272 // If not, assume the condition flags might get modified somewhere.
1273 if (To->getParent() != From->getParent())
1274 return true;
1275
1276 // From must be above To.
1277 assert(std::find_if(++To.getReverse(), To->getParent()->rend(),
1278 [From](MachineInstr &MI) {
1279 return MI.getIterator() == From;
1280 }) != To->getParent()->rend());
1281
1282 // We iterate backward starting at \p To until we hit \p From.
1283 for (const MachineInstr &Instr :
1284 instructionsWithoutDebug(++To.getReverse(), From.getReverse())) {
1285 if (((AccessToCheck & AK_Write) &&
1286 Instr.modifiesRegister(AArch64::NZCV, TRI)) ||
1287 ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI)))
1288 return true;
1289 }
1290 return false;
1291 }
1292
1293 /// Try to optimize a compare instruction. A compare instruction is an
1294 /// instruction which produces AArch64::NZCV. It is truly a compare
1295 /// instruction only when there are no uses of its destination
1296 /// register.
1297 ///
1298 /// The following steps are tried in order:
1299 /// 1. Convert CmpInstr into an unconditional version.
1300 /// 2. Remove CmpInstr if above there is an instruction producing a needed
1301 /// condition code or an instruction which can be converted into such an
1302 /// instruction.
1303 /// Only comparison with zero is supported.
1304 bool AArch64InstrInfo::optimizeCompareInstr(
1305 MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int CmpMask,
1306 int CmpValue, const MachineRegisterInfo *MRI) const {
1307 assert(CmpInstr.getParent());
1308 assert(MRI);
1309
1310 // Replace SUBSWrr with SUBWrr if NZCV is not used.
1311 int DeadNZCVIdx = CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, true);
1312 if (DeadNZCVIdx != -1) {
1313 if (CmpInstr.definesRegister(AArch64::WZR) ||
1314 CmpInstr.definesRegister(AArch64::XZR)) {
1315 CmpInstr.eraseFromParent();
1316 return true;
1317 }
1318 unsigned Opc = CmpInstr.getOpcode();
1319 unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr);
1320 if (NewOpc == Opc)
1321 return false;
1322 const MCInstrDesc &MCID = get(NewOpc);
1323 CmpInstr.setDesc(MCID);
1324 CmpInstr.RemoveOperand(DeadNZCVIdx);
1325 bool succeeded = UpdateOperandRegClass(CmpInstr);
1326 (void)succeeded;
1327 assert(succeeded && "Some operands reg class are incompatible!");
1328 return true;
1329 }
1330
1331 // Continue only if we have a "ri" where immediate is zero.
1332 // FIXME: CmpValue has already been converted to 0 or 1 in the
1333 // analyzeCompare function.
1334 assert((CmpValue == 0 || CmpValue == 1) && "CmpValue must be 0 or 1!");
1335 if (CmpValue != 0 || SrcReg2 != 0)
1336 return false;
1337
1338 // CmpInstr is a Compare instruction if destination register is not used.
1339 if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
1340 return false;
1341
1342 return substituteCmpToZero(CmpInstr, SrcReg, MRI);
1343 }
1344
1345 /// Get the opcode of the S (flag-setting) version of Instr.
1346 /// If Instr is already an S version, its opcode is returned.
1347 /// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have an S
1348 /// version or we are not interested in it.
1349 static unsigned sForm(MachineInstr &Instr) {
1350 switch (Instr.getOpcode()) {
1351 default:
1352 return AArch64::INSTRUCTION_LIST_END;
1353
1354 case AArch64::ADDSWrr:
1355 case AArch64::ADDSWri:
1356 case AArch64::ADDSXrr:
1357 case AArch64::ADDSXri:
1358 case AArch64::SUBSWrr:
1359 case AArch64::SUBSWri:
1360 case AArch64::SUBSXrr:
1361 case AArch64::SUBSXri:
1362 return Instr.getOpcode();
1363
1364 case AArch64::ADDWrr:
1365 return AArch64::ADDSWrr;
1366 case AArch64::ADDWri:
1367 return AArch64::ADDSWri;
1368 case AArch64::ADDXrr:
1369 return AArch64::ADDSXrr;
1370 case AArch64::ADDXri:
1371 return AArch64::ADDSXri;
1372 case AArch64::ADCWr:
1373 return AArch64::ADCSWr;
1374 case AArch64::ADCXr:
1375 return AArch64::ADCSXr;
1376 case AArch64::SUBWrr:
1377 return AArch64::SUBSWrr;
1378 case AArch64::SUBWri:
1379 return AArch64::SUBSWri;
1380 case AArch64::SUBXrr:
1381 return AArch64::SUBSXrr;
1382 case AArch64::SUBXri:
1383 return AArch64::SUBSXri;
1384 case AArch64::SBCWr:
1385 return AArch64::SBCSWr;
1386 case AArch64::SBCXr:
1387 return AArch64::SBCSXr;
1388 case AArch64::ANDWri:
1389 return AArch64::ANDSWri;
1390 case AArch64::ANDXri:
1391 return AArch64::ANDSXri;
1392 }
1393 }
1394
1395 /// Check if AArch64::NZCV should be alive in successors of MBB.
1396 static bool areCFlagsAliveInSuccessors(MachineBasicBlock *MBB) {
1397 for (auto *BB : MBB->successors())
1398 if (BB->isLiveIn(AArch64::NZCV))
1399 return true;
1400 return false;
1401 }
1402
1403 namespace {
1404
1405 struct UsedNZCV {
1406 bool N = false;
1407 bool Z = false;
1408 bool C = false;
1409 bool V = false;
1410
1411 UsedNZCV() = default;
1412
1413 UsedNZCV &operator|=(const UsedNZCV &UsedFlags) {
1414 this->N |= UsedFlags.N;
1415 this->Z |= UsedFlags.Z;
1416 this->C |= UsedFlags.C;
1417 this->V |= UsedFlags.V;
1418 return *this;
1419 }
1420 };
1421
1422 } // end anonymous namespace
1423
1424 /// Find a condition code used by the instruction.
1425 /// Returns AArch64CC::Invalid if either the instruction does not use condition
1426 /// codes or we don't optimize CmpInstr in the presence of such instructions.
1427 static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) {
1428 switch (Instr.getOpcode()) {
1429 default:
1430 return AArch64CC::Invalid;
1431
1432 case AArch64::Bcc: {
1433 int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
1434 assert(Idx >= 2);
1435 return static_cast<AArch64CC::CondCode>(Instr.getOperand(Idx - 2).getImm());
1436 }
1437
1438 case AArch64::CSINVWr:
1439 case AArch64::CSINVXr:
1440 case AArch64::CSINCWr:
1441 case AArch64::CSINCXr:
1442 case AArch64::CSELWr:
1443 case AArch64::CSELXr:
1444 case AArch64::CSNEGWr:
1445 case AArch64::CSNEGXr:
1446 case AArch64::FCSELSrrr:
1447 case AArch64::FCSELDrrr: {
1448 int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
1449 assert(Idx >= 1);
1450 return static_cast<AArch64CC::CondCode>(Instr.getOperand(Idx - 1).getImm());
1451 }
1452 }
1453 }
1454
1455 static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) {
1456 assert(CC != AArch64CC::Invalid);
1457 UsedNZCV UsedFlags;
1458 switch (CC) {
1459 default:
1460 break;
1461
1462 case AArch64CC::EQ: // Z set
1463 case AArch64CC::NE: // Z clear
1464 UsedFlags.Z = true;
1465 break;
1466
1467 case AArch64CC::HI: // Z clear and C set
1468 case AArch64CC::LS: // Z set or C clear
1469 UsedFlags.Z = true;
1470 LLVM_FALLTHROUGH;
1471 case AArch64CC::HS: // C set
1472 case AArch64CC::LO: // C clear
1473 UsedFlags.C = true;
1474 break;
1475
1476 case AArch64CC::MI: // N set
1477 case AArch64CC::PL: // N clear
1478 UsedFlags.N = true;
1479 break;
1480
1481 case AArch64CC::VS: // V set
1482 case AArch64CC::VC: // V clear
1483 UsedFlags.V = true;
1484 break;
1485
1486 case AArch64CC::GT: // Z clear, N and V the same
1487 case AArch64CC::LE: // Z set, N and V differ
1488 UsedFlags.Z = true;
1489 LLVM_FALLTHROUGH;
1490 case AArch64CC::GE: // N and V the same
1491 case AArch64CC::LT: // N and V differ
1492 UsedFlags.N = true;
1493 UsedFlags.V = true;
1494 break;
1495 }
1496 return UsedFlags;
1497 }
1498
1499 static bool isADDSRegImm(unsigned Opcode) {
1500 return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri;
1501 }
1502
1503 static bool isSUBSRegImm(unsigned Opcode) {
1504 return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri;
1505 }
1506
1507 /// Check if CmpInstr can be substituted by MI.
1508 ///
1509 /// CmpInstr can be substituted:
1510 /// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
1511 /// - and, MI and CmpInstr are from the same MachineBB
1512 /// - and, condition flags are not alive in successors of the CmpInstr parent
1513 /// - and, if MI opcode is the S form there must be no defs of flags between
1514 /// MI and CmpInstr
1515 /// or if MI opcode is not the S form there must be neither defs of flags
1516 /// nor uses of flags between MI and CmpInstr.
1517 /// - and C/V flags are not used after CmpInstr
1518 static bool canInstrSubstituteCmpInstr(MachineInstr *MI, MachineInstr *CmpInstr,
1519 const TargetRegisterInfo *TRI) {
1520 assert(MI);
1521 assert(sForm(*MI) != AArch64::INSTRUCTION_LIST_END);
1522 assert(CmpInstr);
1523
1524 const unsigned CmpOpcode = CmpInstr->getOpcode();
1525 if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode))
1526 return false;
1527
1528 if (MI->getParent() != CmpInstr->getParent())
1529 return false;
1530
1531 if (areCFlagsAliveInSuccessors(CmpInstr->getParent()))
1532 return false;
1533
1534 AccessKind AccessToCheck = AK_Write;
1535 if (sForm(*MI) != MI->getOpcode())
1536 AccessToCheck = AK_All;
1537 if (areCFlagsAccessedBetweenInstrs(MI, CmpInstr, TRI, AccessToCheck))
1538 return false;
1539
1540 UsedNZCV NZCVUsedAfterCmp;
1541 for (const MachineInstr &Instr :
1542 instructionsWithoutDebug(std::next(CmpInstr->getIterator()),
1543 CmpInstr->getParent()->instr_end())) {
1544 if (Instr.readsRegister(AArch64::NZCV, TRI)) {
1545 AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr);
1546 if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
1547 return false;
1548 NZCVUsedAfterCmp |= getUsedNZCV(CC);
1549 }
1550
1551 if (Instr.modifiesRegister(AArch64::NZCV, TRI))
1552 break;
1553 }
1554
1555 return !NZCVUsedAfterCmp.C && !NZCVUsedAfterCmp.V;
1556 }
1557
1558 /// Substitute an instruction comparing to zero with another instruction
1559 /// which produces needed condition flags.
1560 ///
1561 /// Return true on success.
1562 bool AArch64InstrInfo::substituteCmpToZero(
1563 MachineInstr &CmpInstr, unsigned SrcReg,
1564 const MachineRegisterInfo *MRI) const {
1565 assert(MRI);
1566 // Get the unique definition of SrcReg.
1567 MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
1568 if (!MI)
1569 return false;
1570
1571 const TargetRegisterInfo *TRI = &getRegisterInfo();
1572
1573 unsigned NewOpc = sForm(*MI);
1574 if (NewOpc == AArch64::INSTRUCTION_LIST_END)
1575 return false;
1576
1577 if (!canInstrSubstituteCmpInstr(MI, &CmpInstr, TRI))
1578 return false;
1579
1580 // Update the instruction to set NZCV.
1581 MI->setDesc(get(NewOpc));
1582 CmpInstr.eraseFromParent();
1583 bool succeeded = UpdateOperandRegClass(*MI);
1584 (void)succeeded;
1585 assert(succeeded && "Some operands reg class are incompatible!");
1586 MI->addRegisterDefined(AArch64::NZCV, TRI);
1587 return true;
1588 }
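// Sketch of the transformation performed by substituteCmpToZero(), on a
// typical pre-RA sequence (virtual register names are illustrative only):
//   %3:gpr32 = ADDWrr %1, %2
//   %4:gpr32 = SUBSWri %3, 0, 0   ; compare %3 against zero, %4 unused
//   Bcc 0 (eq), %bb.2             ; reads NZCV
// becomes
//   %3:gpr32 = ADDSWrr %1, %2     ; now sets NZCV itself
//   Bcc 0 (eq), %bb.2
// which is legal here because only the Z flag is consumed afterwards.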
1589
1590 bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
1591 if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD &&
1592 MI.getOpcode() != AArch64::CATCHRET)
1593 return false;
1594
1595 MachineBasicBlock &MBB = *MI.getParent();
1596 auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>();
1597 auto TRI = Subtarget.getRegisterInfo();
1598 DebugLoc DL = MI.getDebugLoc();
1599
1600 if (MI.getOpcode() == AArch64::CATCHRET) {
1601 // Skip to the first instruction before the epilog.
1602 const TargetInstrInfo *TII =
1603 MBB.getParent()->getSubtarget().getInstrInfo();
1604 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
1605 auto MBBI = MachineBasicBlock::iterator(MI);
1606 MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI);
1607 while (FirstEpilogSEH->getFlag(MachineInstr::FrameDestroy) &&
1608 FirstEpilogSEH != MBB.begin())
1609 FirstEpilogSEH = std::prev(FirstEpilogSEH);
1610 if (FirstEpilogSEH != MBB.begin())
1611 FirstEpilogSEH = std::next(FirstEpilogSEH);
1612 BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP))
1613 .addReg(AArch64::X0, RegState::Define)
1614 .addMBB(TargetMBB);
1615 BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri))
1616 .addReg(AArch64::X0, RegState::Define)
1617 .addReg(AArch64::X0)
1618 .addMBB(TargetMBB)
1619 .addImm(0);
1620 return true;
1621 }
1622
1623 Register Reg = MI.getOperand(0).getReg();
1624 const GlobalValue *GV =
1625 cast<GlobalValue>((*MI.memoperands_begin())->getValue());
1626 const TargetMachine &TM = MBB.getParent()->getTarget();
1627 unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
1628 const unsigned char MO_NC = AArch64II::MO_NC;
1629
1630 if ((OpFlags & AArch64II::MO_GOT) != 0) {
1631 BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg)
1632 .addGlobalAddress(GV, 0, OpFlags);
1633 if (Subtarget.isTargetILP32()) {
1634 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
1635 BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
1636 .addDef(Reg32, RegState::Dead)
1637 .addUse(Reg, RegState::Kill)
1638 .addImm(0)
1639 .addMemOperand(*MI.memoperands_begin())
1640 .addDef(Reg, RegState::Implicit);
1641 } else {
1642 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
1643 .addReg(Reg, RegState::Kill)
1644 .addImm(0)
1645 .addMemOperand(*MI.memoperands_begin());
1646 }
1647 } else if (TM.getCodeModel() == CodeModel::Large) {
1648 assert(!Subtarget.isTargetILP32() && "how can large exist in ILP32?");
1649 BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg)
1650 .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC)
1651 .addImm(0);
1652 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
1653 .addReg(Reg, RegState::Kill)
1654 .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC)
1655 .addImm(16);
1656 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
1657 .addReg(Reg, RegState::Kill)
1658 .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC)
1659 .addImm(32);
1660 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
1661 .addReg(Reg, RegState::Kill)
1662 .addGlobalAddress(GV, 0, AArch64II::MO_G3)
1663 .addImm(48);
1664 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
1665 .addReg(Reg, RegState::Kill)
1666 .addImm(0)
1667 .addMemOperand(*MI.memoperands_begin());
1668 } else if (TM.getCodeModel() == CodeModel::Tiny) {
1669 BuildMI(MBB, MI, DL, get(AArch64::ADR), Reg)
1670 .addGlobalAddress(GV, 0, OpFlags);
1671 } else {
1672 BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg)
1673 .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE);
1674 unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC;
1675 if (Subtarget.isTargetILP32()) {
1676 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
1677 BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
1678 .addDef(Reg32, RegState::Dead)
1679 .addUse(Reg, RegState::Kill)
1680 .addGlobalAddress(GV, 0, LoFlags)
1681 .addMemOperand(*MI.memoperands_begin())
1682 .addDef(Reg, RegState::Implicit);
1683 } else {
1684 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
1685 .addReg(Reg, RegState::Kill)
1686 .addGlobalAddress(GV, 0, LoFlags)
1687 .addMemOperand(*MI.memoperands_begin());
1688 }
1689 }
1690
1691 MBB.erase(MI);
1692
1693 return true;
1694 }
1695
1696 // Return true if this instruction simply sets its single destination register
1697 // to zero. This is equivalent to a register rename of the zero-register.
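// For example (an illustrative, non-exhaustive list): "movz w0, #0",
// "and w0, wzr, #imm", and a COPY from WZR all leave the destination equal
// to zero.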
1698 bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) {
1699 switch (MI.getOpcode()) {
1700 default:
1701 break;
1702 case AArch64::MOVZWi:
1703 case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
1704 if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) {
1705 assert(MI.getDesc().getNumOperands() == 3 &&
1706 MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
1707 return true;
1708 }
1709 break;
1710 case AArch64::ANDWri: // and Rd, WZR, #imm
1711 return MI.getOperand(1).getReg() == AArch64::WZR;
1712 case AArch64::ANDXri:
1713 return MI.getOperand(1).getReg() == AArch64::XZR;
1714 case TargetOpcode::COPY:
1715 return MI.getOperand(1).getReg() == AArch64::WZR;
1716 }
1717 return false;
1718 }
1719
1720 // Return true if this instruction simply renames a general register without
1721 // modifying bits.
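// For example (illustrative): "orr x0, xzr, x1" and "add x0, x1, #0" both
// copy x1 into x0 bit-for-bit.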
1722 bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) {
1723 switch (MI.getOpcode()) {
1724 default:
1725 break;
1726 case TargetOpcode::COPY: {
1727 // GPR32 copies will be lowered to ORRXrs
1728 Register DstReg = MI.getOperand(0).getReg();
1729 return (AArch64::GPR32RegClass.contains(DstReg) ||
1730 AArch64::GPR64RegClass.contains(DstReg));
1731 }
1732 case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
1733 if (MI.getOperand(1).getReg() == AArch64::XZR) {
1734 assert(MI.getDesc().getNumOperands() == 4 &&
1735 MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
1736 return true;
1737 }
1738 break;
1739 case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
1740 if (MI.getOperand(2).getImm() == 0) {
1741 assert(MI.getDesc().getNumOperands() == 4 &&
1742 MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
1743 return true;
1744 }
1745 break;
1746 }
1747 return false;
1748 }
1749
1750 // Return true if this instruction simply renames a general register without
1751 // modifying bits.
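// For example (illustrative): "orr v0.16b, v1.16b, v1.16b" copies the full
// contents of q1 into q0 without changing any bits.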
1752 bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) {
1753 switch (MI.getOpcode()) {
1754 default:
1755 break;
1756 case TargetOpcode::COPY: {
1757 // FPR64 copies will be lowered to ORR.16b
1758 Register DstReg = MI.getOperand(0).getReg();
1759 return (AArch64::FPR64RegClass.contains(DstReg) ||
1760 AArch64::FPR128RegClass.contains(DstReg));
1761 }
1762 case AArch64::ORRv16i8:
1763 if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {
1764 assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() &&
1765 "invalid ORRv16i8 operands");
1766 return true;
1767 }
1768 break;
1769 }
1770 return false;
1771 }
1772
1773 unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
1774 int &FrameIndex) const {
1775 switch (MI.getOpcode()) {
1776 default:
1777 break;
1778 case AArch64::LDRWui:
1779 case AArch64::LDRXui:
1780 case AArch64::LDRBui:
1781 case AArch64::LDRHui:
1782 case AArch64::LDRSui:
1783 case AArch64::LDRDui:
1784 case AArch64::LDRQui:
1785 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
1786 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
1787 FrameIndex = MI.getOperand(1).getIndex();
1788 return MI.getOperand(0).getReg();
1789 }
1790 break;
1791 }
1792
1793 return 0;
1794 }
1795
1796 unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
1797 int &FrameIndex) const {
1798 switch (MI.getOpcode()) {
1799 default:
1800 break;
1801 case AArch64::STRWui:
1802 case AArch64::STRXui:
1803 case AArch64::STRBui:
1804 case AArch64::STRHui:
1805 case AArch64::STRSui:
1806 case AArch64::STRDui:
1807 case AArch64::STRQui:
1808 case AArch64::LDR_PXI:
1809 case AArch64::STR_PXI:
1810 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
1811 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
1812 FrameIndex = MI.getOperand(1).getIndex();
1813 return MI.getOperand(0).getReg();
1814 }
1815 break;
1816 }
1817 return 0;
1818 }
1819
1820 /// Check all MachineMemOperands for a hint to suppress pairing.
1821 bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) {
1822 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
1823 return MMO->getFlags() & MOSuppressPair;
1824 });
1825 }
1826
1827 /// Set a flag on the first MachineMemOperand to suppress pairing.
1828 void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) {
1829 if (MI.memoperands_empty())
1830 return;
1831 (*MI.memoperands_begin())->setFlags(MOSuppressPair);
1832 }
1833
1834 /// Check all MachineMemOperands for a hint that the load/store is strided.
1835 bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) {
1836 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
1837 return MMO->getFlags() & MOStridedAccess;
1838 });
1839 }
1840
1841 bool AArch64InstrInfo::isUnscaledLdSt(unsigned Opc) {
1842 switch (Opc) {
1843 default:
1844 return false;
1845 case AArch64::STURSi:
1846 case AArch64::STURDi:
1847 case AArch64::STURQi:
1848 case AArch64::STURBBi:
1849 case AArch64::STURHHi:
1850 case AArch64::STURWi:
1851 case AArch64::STURXi:
1852 case AArch64::LDURSi:
1853 case AArch64::LDURDi:
1854 case AArch64::LDURQi:
1855 case AArch64::LDURWi:
1856 case AArch64::LDURXi:
1857 case AArch64::LDURSWi:
1858 case AArch64::LDURHHi:
1859 case AArch64::LDURBBi:
1860 case AArch64::LDURSBWi:
1861 case AArch64::LDURSHWi:
1862 return true;
1863 }
1864 }
1865
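// Map a scaled load/store opcode to its unscaled counterpart where one
// exists (e.g. LDRXui -> LDURXi), returning None otherwise. Note that the
// two forms interpret their immediate differently (scaled units vs. bytes).
// (Illustrative summary of the table below.)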
1866 Optional<unsigned> AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) {
1867 switch (Opc) {
1868 default: return {};
1869 case AArch64::PRFMui: return AArch64::PRFUMi;
1870 case AArch64::LDRXui: return AArch64::LDURXi;
1871 case AArch64::LDRWui: return AArch64::LDURWi;
1872 case AArch64::LDRBui: return AArch64::LDURBi;
1873 case AArch64::LDRHui: return AArch64::LDURHi;
1874 case AArch64::LDRSui: return AArch64::LDURSi;
1875 case AArch64::LDRDui: return AArch64::LDURDi;
1876 case AArch64::LDRQui: return AArch64::LDURQi;
1877 case AArch64::LDRBBui: return AArch64::LDURBBi;
1878 case AArch64::LDRHHui: return AArch64::LDURHHi;
1879 case AArch64::LDRSBXui: return AArch64::LDURSBXi;
1880 case AArch64::LDRSBWui: return AArch64::LDURSBWi;
1881 case AArch64::LDRSHXui: return AArch64::LDURSHXi;
1882 case AArch64::LDRSHWui: return AArch64::LDURSHWi;
1883 case AArch64::LDRSWui: return AArch64::LDURSWi;
1884 case AArch64::STRXui: return AArch64::STURXi;
1885 case AArch64::STRWui: return AArch64::STURWi;
1886 case AArch64::STRBui: return AArch64::STURBi;
1887 case AArch64::STRHui: return AArch64::STURHi;
1888 case AArch64::STRSui: return AArch64::STURSi;
1889 case AArch64::STRDui: return AArch64::STURDi;
1890 case AArch64::STRQui: return AArch64::STURQi;
1891 case AArch64::STRBBui: return AArch64::STURBBi;
1892 case AArch64::STRHHui: return AArch64::STURHHi;
1893 }
1894 }
1895
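// Return the operand index of the immediate offset. For example
// (illustrative): a plain "ldr x1, [x0, #8]" keeps its offset in operand 2,
// while a paired "ldp x1, x2, [x0, #16]" or an SVE ld1/st1 immediate form
// keeps it in operand 3.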
1896 unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) {
1897 switch (Opc) {
1898 default:
1899 return 2;
1900 case AArch64::LDPXi:
1901 case AArch64::LDPDi:
1902 case AArch64::STPXi:
1903 case AArch64::STPDi:
1904 case AArch64::LDNPXi:
1905 case AArch64::LDNPDi:
1906 case AArch64::STNPXi:
1907 case AArch64::STNPDi:
1908 case AArch64::LDPQi:
1909 case AArch64::STPQi:
1910 case AArch64::LDNPQi:
1911 case AArch64::STNPQi:
1912 case AArch64::LDPWi:
1913 case AArch64::LDPSi:
1914 case AArch64::STPWi:
1915 case AArch64::STPSi:
1916 case AArch64::LDNPWi:
1917 case AArch64::LDNPSi:
1918 case AArch64::STNPWi:
1919 case AArch64::STNPSi:
1920 case AArch64::LDG:
1921 case AArch64::STGPi:
1922 case AArch64::LD1B_IMM:
1923 case AArch64::LD1H_IMM:
1924 case AArch64::LD1W_IMM:
1925 case AArch64::LD1D_IMM:
1926 case AArch64::ST1B_IMM:
1927 case AArch64::ST1H_IMM:
1928 case AArch64::ST1W_IMM:
1929 case AArch64::ST1D_IMM:
1930 case AArch64::LD1B_H_IMM:
1931 case AArch64::LD1SB_H_IMM:
1932 case AArch64::LD1H_S_IMM:
1933 case AArch64::LD1SH_S_IMM:
1934 case AArch64::LD1W_D_IMM:
1935 case AArch64::LD1SW_D_IMM:
1936 case AArch64::ST1B_H_IMM:
1937 case AArch64::ST1H_S_IMM:
1938 case AArch64::ST1W_D_IMM:
1939 case AArch64::LD1B_S_IMM:
1940 case AArch64::LD1SB_S_IMM:
1941 case AArch64::LD1H_D_IMM:
1942 case AArch64::LD1SH_D_IMM:
1943 case AArch64::ST1B_S_IMM:
1944 case AArch64::ST1H_D_IMM:
1945 case AArch64::LD1B_D_IMM:
1946 case AArch64::LD1SB_D_IMM:
1947 case AArch64::ST1B_D_IMM:
1948 return 3;
1949 case AArch64::ADDG:
1950 case AArch64::STGOffset:
1951 case AArch64::LDR_PXI:
1952 case AArch64::STR_PXI:
1953 return 2;
1954 }
1955 }
1956
1957 bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) {
1958 switch (MI.getOpcode()) {
1959 default:
1960 return false;
1961 // Scaled instructions.
1962 case AArch64::STRSui:
1963 case AArch64::STRDui:
1964 case AArch64::STRQui:
1965 case AArch64::STRXui:
1966 case AArch64::STRWui:
1967 case AArch64::LDRSui:
1968 case AArch64::LDRDui:
1969 case AArch64::LDRQui:
1970 case AArch64::LDRXui:
1971 case AArch64::LDRWui:
1972 case AArch64::LDRSWui:
1973 // Unscaled instructions.
1974 case AArch64::STURSi:
1975 case AArch64::STURDi:
1976 case AArch64::STURQi:
1977 case AArch64::STURWi:
1978 case AArch64::STURXi:
1979 case AArch64::LDURSi:
1980 case AArch64::LDURDi:
1981 case AArch64::LDURQi:
1982 case AArch64::LDURWi:
1983 case AArch64::LDURXi:
1984 case AArch64::LDURSWi:
1985 return true;
1986 }
1987 }
1988
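// Map an opcode to its flag-setting (S-suffixed) form. For example
// (illustrative): ADDWri becomes ADDSWri, i.e. "add w0, w1, #1" turns into
// "adds w0, w1, #1", which also sets NZCV.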
1989 unsigned AArch64InstrInfo::convertToFlagSettingOpc(unsigned Opc,
1990 bool &Is64Bit) {
1991 switch (Opc) {
1992 default:
1993 llvm_unreachable("Opcode has no flag setting equivalent!");
1994 // 32-bit cases:
1995 case AArch64::ADDWri:
1996 Is64Bit = false;
1997 return AArch64::ADDSWri;
1998 case AArch64::ADDWrr:
1999 Is64Bit = false;
2000 return AArch64::ADDSWrr;
2001 case AArch64::ADDWrs:
2002 Is64Bit = false;
2003 return AArch64::ADDSWrs;
2004 case AArch64::ADDWrx:
2005 Is64Bit = false;
2006 return AArch64::ADDSWrx;
2007 case AArch64::ANDWri:
2008 Is64Bit = false;
2009 return AArch64::ANDSWri;
2010 case AArch64::ANDWrr:
2011 Is64Bit = false;
2012 return AArch64::ANDSWrr;
2013 case AArch64::ANDWrs:
2014 Is64Bit = false;
2015 return AArch64::ANDSWrs;
2016 case AArch64::BICWrr:
2017 Is64Bit = false;
2018 return AArch64::BICSWrr;
2019 case AArch64::BICWrs:
2020 Is64Bit = false;
2021 return AArch64::BICSWrs;
2022 case AArch64::SUBWri:
2023 Is64Bit = false;
2024 return AArch64::SUBSWri;
2025 case AArch64::SUBWrr:
2026 Is64Bit = false;
2027 return AArch64::SUBSWrr;
2028 case AArch64::SUBWrs:
2029 Is64Bit = false;
2030 return AArch64::SUBSWrs;
2031 case AArch64::SUBWrx:
2032 Is64Bit = false;
2033 return AArch64::SUBSWrx;
2034 // 64-bit cases:
2035 case AArch64::ADDXri:
2036 Is64Bit = true;
2037 return AArch64::ADDSXri;
2038 case AArch64::ADDXrr:
2039 Is64Bit = true;
2040 return AArch64::ADDSXrr;
2041 case AArch64::ADDXrs:
2042 Is64Bit = true;
2043 return AArch64::ADDSXrs;
2044 case AArch64::ADDXrx:
2045 Is64Bit = true;
2046 return AArch64::ADDSXrx;
2047 case AArch64::ANDXri:
2048 Is64Bit = true;
2049 return AArch64::ANDSXri;
2050 case AArch64::ANDXrr:
2051 Is64Bit = true;
2052 return AArch64::ANDSXrr;
2053 case AArch64::ANDXrs:
2054 Is64Bit = true;
2055 return AArch64::ANDSXrs;
2056 case AArch64::BICXrr:
2057 Is64Bit = true;
2058 return AArch64::BICSXrr;
2059 case AArch64::BICXrs:
2060 Is64Bit = true;
2061 return AArch64::BICSXrs;
2062 case AArch64::SUBXri:
2063 Is64Bit = true;
2064 return AArch64::SUBSXri;
2065 case AArch64::SUBXrr:
2066 Is64Bit = true;
2067 return AArch64::SUBSXrr;
2068 case AArch64::SUBXrs:
2069 Is64Bit = true;
2070 return AArch64::SUBSXrs;
2071 case AArch64::SUBXrx:
2072 Is64Bit = true;
2073 return AArch64::SUBSXrx;
2074 }
2075 }
2076
2077 // Is this a candidate for ld/st merging or pairing? For example, we don't
2078 // touch volatiles or load/stores that have a hint to avoid pair formation.
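// For example (illustrative), two adjacent "str x0, [sp, #16]" and
// "str x1, [sp, #24]" are typical candidates, whereas a volatile access, a
// store tagged with MOSuppressPair, or a frame-setup store under Windows CFI
// is rejected below.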
2079 bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const {
2080 // If this is a volatile load/store, don't mess with it.
2081 if (MI.hasOrderedMemoryRef())
2082 return false;
2083
2084 // Make sure this is a reg/fi+imm (as opposed to an address reloc).
2085 assert((MI.getOperand(1).isReg() || MI.getOperand(1).isFI()) &&
2086 "Expected a reg or frame index operand.");
2087 if (!MI.getOperand(2).isImm())
2088 return false;
2089
2090 // Can't merge/pair if the instruction modifies the base register.
2091 // e.g., ldr x0, [x0]
2092 // This case will never occur with an FI base.
2093 if (MI.getOperand(1).isReg()) {
2094 Register BaseReg = MI.getOperand(1).getReg();
2095 const TargetRegisterInfo *TRI = &getRegisterInfo();
2096 if (MI.modifiesRegister(BaseReg, TRI))
2097 return false;
2098 }
2099
2100 // Check if this load/store has a hint to avoid pair formation.
2101 // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
2102 if (isLdStPairSuppressed(MI))
2103 return false;
2104
2105 // Do not pair any callee-save store/reload instructions in the
2106 // prologue/epilogue if the CFI information encoded the operations as separate
2107 // instructions, as that would cause the size of the actual prologue to
2108 // differ from the prologue size recorded in the Windows CFI.
2109 const MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo();
2110 bool NeedsWinCFI = MAI->usesWindowsCFI() &&
2111 MI.getMF()->getFunction().needsUnwindTableEntry();
2112 if (NeedsWinCFI && (MI.getFlag(MachineInstr::FrameSetup) ||
2113 MI.getFlag(MachineInstr::FrameDestroy)))
2114 return false;
2115
2116 // On some CPUs quad load/store pairs are slower than two single load/stores.
2117 if (Subtarget.isPaired128Slow()) {
2118 switch (MI.getOpcode()) {
2119 default:
2120 break;
2121 case AArch64::LDURQi:
2122 case AArch64::STURQi:
2123 case AArch64::LDRQui:
2124 case AArch64::STRQui:
2125 return false;
2126 }
2127 }
2128
2129 return true;
2130 }
2131
2132 bool AArch64InstrInfo::getMemOperandsWithOffsetWidth(
2133 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
2134 int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
2135 const TargetRegisterInfo *TRI) const {
2136 if (!LdSt.mayLoadOrStore())
2137 return false;
2138
2139 const MachineOperand *BaseOp;
2140 if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, OffsetIsScalable,
2141 Width, TRI))
2142 return false;
2143 BaseOps.push_back(BaseOp);
2144 return true;
2145 }
2146
2147 Optional<ExtAddrMode>
2148 AArch64InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI,
2149 const TargetRegisterInfo *TRI) const {
2150 const MachineOperand *Base; // Filled with the base operand of MI.
2151 int64_t Offset; // Filled with the offset of MI.
2152 bool OffsetIsScalable;
2153 if (!getMemOperandWithOffset(MemI, Base, Offset, OffsetIsScalable, TRI))
2154 return None;
2155
2156 if (!Base->isReg())
2157 return None;
2158 ExtAddrMode AM;
2159 AM.BaseReg = Base->getReg();
2160 AM.Displacement = Offset;
2161 AM.ScaledReg = 0;
2162 return AM;
2163 }
2164
2165 bool AArch64InstrInfo::getMemOperandWithOffsetWidth(
2166 const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset,
2167 bool &OffsetIsScalable, unsigned &Width,
2168 const TargetRegisterInfo *TRI) const {
2169 assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
2170 // Handle only loads/stores with base register followed by immediate offset.
2171 if (LdSt.getNumExplicitOperands() == 3) {
2172 // Non-paired instruction (e.g., ldr x1, [x0, #8]).
2173 if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) ||
2174 !LdSt.getOperand(2).isImm())
2175 return false;
2176 } else if (LdSt.getNumExplicitOperands() == 4) {
2177 // Paired instruction (e.g., ldp x1, x2, [x0, #8]).
2178 if (!LdSt.getOperand(1).isReg() ||
2179 (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) ||
2180 !LdSt.getOperand(3).isImm())
2181 return false;
2182 } else
2183 return false;
2184
2185 // Get the scaling factor for the instruction and set the width for the
2186 // instruction.
2187 TypeSize Scale(0U, false);
2188 int64_t Dummy1, Dummy2;
2189
2190 // If this returns false, then it's an instruction we don't want to handle.
2191 if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2))
2192 return false;
2193
2194 // Compute the offset. Offset is calculated as the immediate operand
2195 // multiplied by the scaling factor. Unscaled instructions have scaling factor
2196 // set to 1.
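// For example (roughly): LDRXui has Scale == 8, so an LDRXui whose immediate
// operand is 2 describes the byte offset 2 * 8 == 16 (i.e. "ldr x1, [x0, #16]"),
// while the unscaled LDURXi keeps its immediate as a raw byte offset.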
2197 if (LdSt.getNumExplicitOperands() == 3) {
2198 BaseOp = &LdSt.getOperand(1);
2199 Offset = LdSt.getOperand(2).getImm() * Scale.getKnownMinSize();
2200 } else {
2201 assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
2202 BaseOp = &LdSt.getOperand(2);
2203 Offset = LdSt.getOperand(3).getImm() * Scale.getKnownMinSize();
2204 }
2205 OffsetIsScalable = Scale.isScalable();
2206
2207 if (!BaseOp->isReg() && !BaseOp->isFI())
2208 return false;
2209
2210 return true;
2211 }
2212
2213 MachineOperand &
2214 AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const {
2215 assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
2216 MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands() - 1);
2217 assert(OfsOp.isImm() && "Offset operand wasn't immediate.");
2218 return OfsOp;
2219 }
2220
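// For example (illustrative): for LDRXui the scale is 8 bytes, the access
// width is 8, and the encoded immediate must lie in [0, 4095], i.e. byte
// offsets 0..32760 in steps of 8; unscaled forms such as LDURXi instead
// allow byte offsets in [-256, 255].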
2221 bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
2222 unsigned &Width, int64_t &MinOffset,
2223 int64_t &MaxOffset) {
2224 const unsigned SVEMaxBytesPerVector = AArch64::SVEMaxBitsPerVector / 8;
2225 switch (Opcode) {
2226 // Not a memory operation, or not one we want to handle.
2227 default:
2228 Scale = TypeSize::Fixed(0);
2229 Width = 0;
2230 MinOffset = MaxOffset = 0;
2231 return false;
2232 case AArch64::STRWpost:
2233 case AArch64::LDRWpost:
2234 Width = 32;
2235 Scale = TypeSize::Fixed(4);
2236 MinOffset = -256;
2237 MaxOffset = 255;
2238 break;
2239 case AArch64::LDURQi:
2240 case AArch64::STURQi:
2241 Width = 16;
2242 Scale = TypeSize::Fixed(1);
2243 MinOffset = -256;
2244 MaxOffset = 255;
2245 break;
2246 case AArch64::PRFUMi:
2247 case AArch64::LDURXi:
2248 case AArch64::LDURDi:
2249 case AArch64::STURXi:
2250 case AArch64::STURDi:
2251 Width = 8;
2252 Scale = TypeSize::Fixed(1);
2253 MinOffset = -256;
2254 MaxOffset = 255;
2255 break;
2256 case AArch64::LDURWi:
2257 case AArch64::LDURSi:
2258 case AArch64::LDURSWi:
2259 case AArch64::STURWi:
2260 case AArch64::STURSi:
2261 Width = 4;
2262 Scale = TypeSize::Fixed(1);
2263 MinOffset = -256;
2264 MaxOffset = 255;
2265 break;
2266 case AArch64::LDURHi:
2267 case AArch64::LDURHHi:
2268 case AArch64::LDURSHXi:
2269 case AArch64::LDURSHWi:
2270 case AArch64::STURHi:
2271 case AArch64::STURHHi:
2272 Width = 2;
2273 Scale = TypeSize::Fixed(1);
2274 MinOffset = -256;
2275 MaxOffset = 255;
2276 break;
2277 case AArch64::LDURBi:
2278 case AArch64::LDURBBi:
2279 case AArch64::LDURSBXi:
2280 case AArch64::LDURSBWi:
2281 case AArch64::STURBi:
2282 case AArch64::STURBBi:
2283 Width = 1;
2284 Scale = TypeSize::Fixed(1);
2285 MinOffset = -256;
2286 MaxOffset = 255;
2287 break;
2288 case AArch64::LDPQi:
2289 case AArch64::LDNPQi:
2290 case AArch64::STPQi:
2291 case AArch64::STNPQi:
2292 Scale = TypeSize::Fixed(16);
2293 Width = 32;
2294 MinOffset = -64;
2295 MaxOffset = 63;
2296 break;
2297 case AArch64::LDRQui:
2298 case AArch64::STRQui:
2299 Scale = TypeSize::Fixed(16);
2300 Width = 16;
2301 MinOffset = 0;
2302 MaxOffset = 4095;
2303 break;
2304 case AArch64::LDPXi:
2305 case AArch64::LDPDi:
2306 case AArch64::LDNPXi:
2307 case AArch64::LDNPDi:
2308 case AArch64::STPXi:
2309 case AArch64::STPDi:
2310 case AArch64::STNPXi:
2311 case AArch64::STNPDi:
2312 Scale = TypeSize::Fixed(8);
2313 Width = 16;
2314 MinOffset = -64;
2315 MaxOffset = 63;
2316 break;
2317 case AArch64::PRFMui:
2318 case AArch64::LDRXui:
2319 case AArch64::LDRDui:
2320 case AArch64::STRXui:
2321 case AArch64::STRDui:
2322 Scale = TypeSize::Fixed(8);
2323 Width = 8;
2324 MinOffset = 0;
2325 MaxOffset = 4095;
2326 break;
2327 case AArch64::LDPWi:
2328 case AArch64::LDPSi:
2329 case AArch64::LDNPWi:
2330 case AArch64::LDNPSi:
2331 case AArch64::STPWi:
2332 case AArch64::STPSi:
2333 case AArch64::STNPWi:
2334 case AArch64::STNPSi:
2335 Scale = TypeSize::Fixed(4);
2336 Width = 8;
2337 MinOffset = -64;
2338 MaxOffset = 63;
2339 break;
2340 case AArch64::LDRWui:
2341 case AArch64::LDRSui:
2342 case AArch64::LDRSWui:
2343 case AArch64::STRWui:
2344 case AArch64::STRSui:
2345 Scale = TypeSize::Fixed(4);
2346 Width = 4;
2347 MinOffset = 0;
2348 MaxOffset = 4095;
2349 break;
2350 case AArch64::LDRHui:
2351 case AArch64::LDRHHui:
2352 case AArch64::LDRSHWui:
2353 case AArch64::LDRSHXui:
2354 case AArch64::STRHui:
2355 case AArch64::STRHHui:
2356 Scale = TypeSize::Fixed(2);
2357 Width = 2;
2358 MinOffset = 0;
2359 MaxOffset = 4095;
2360 break;
2361 case AArch64::LDRBui:
2362 case AArch64::LDRBBui:
2363 case AArch64::LDRSBWui:
2364 case AArch64::LDRSBXui:
2365 case AArch64::STRBui:
2366 case AArch64::STRBBui:
2367 Scale = TypeSize::Fixed(1);
2368 Width = 1;
2369 MinOffset = 0;
2370 MaxOffset = 4095;
2371 break;
2372 case AArch64::ADDG:
2373 Scale = TypeSize::Fixed(16);
2374 Width = 0;
2375 MinOffset = 0;
2376 MaxOffset = 63;
2377 break;
2378 case AArch64::TAGPstack:
2379 Scale = TypeSize::Fixed(16);
2380 Width = 0;
2381 // TAGP with a negative offset turns into SUBP, which has a maximum offset
2382 // of 63 (not 64!).
2383 MinOffset = -63;
2384 MaxOffset = 63;
2385 break;
2386 case AArch64::LDG:
2387 case AArch64::STGOffset:
2388 case AArch64::STZGOffset:
2389 Scale = TypeSize::Fixed(16);
2390 Width = 16;
2391 MinOffset = -256;
2392 MaxOffset = 255;
2393 break;
2394 case AArch64::STR_ZZZZXI:
2395 case AArch64::LDR_ZZZZXI:
2396 Scale = TypeSize::Scalable(16);
2397 Width = SVEMaxBytesPerVector * 4;
2398 MinOffset = -256;
2399 MaxOffset = 252;
2400 break;
2401 case AArch64::STR_ZZZXI:
2402 case AArch64::LDR_ZZZXI:
2403 Scale = TypeSize::Scalable(16);
2404 Width = SVEMaxBytesPerVector * 3;
2405 MinOffset = -256;
2406 MaxOffset = 253;
2407 break;
2408 case AArch64::STR_ZZXI:
2409 case AArch64::LDR_ZZXI:
2410 Scale = TypeSize::Scalable(16);
2411 Width = SVEMaxBytesPerVector * 2;
2412 MinOffset = -256;
2413 MaxOffset = 254;
2414 break;
2415 case AArch64::LDR_PXI:
2416 case AArch64::STR_PXI:
2417 Scale = TypeSize::Scalable(2);
2418 Width = SVEMaxBytesPerVector / 8;
2419 MinOffset = -256;
2420 MaxOffset = 255;
2421 break;
2422 case AArch64::LDR_ZXI:
2423 case AArch64::STR_ZXI:
2424 Scale = TypeSize::Scalable(16);
2425 Width = SVEMaxBytesPerVector;
2426 MinOffset = -256;
2427 MaxOffset = 255;
2428 break;
2429 case AArch64::LD1B_IMM:
2430 case AArch64::LD1H_IMM:
2431 case AArch64::LD1W_IMM:
2432 case AArch64::LD1D_IMM:
2433 case AArch64::ST1B_IMM:
2434 case AArch64::ST1H_IMM:
2435 case AArch64::ST1W_IMM:
2436 case AArch64::ST1D_IMM:
2437 // A full vector's worth of data
2438 // Width = mbytes * elements
2439 Scale = TypeSize::Scalable(16);
2440 Width = SVEMaxBytesPerVector;
2441 MinOffset = -8;
2442 MaxOffset = 7;
2443 break;
2444 case AArch64::LD1B_H_IMM:
2445 case AArch64::LD1SB_H_IMM:
2446 case AArch64::LD1H_S_IMM:
2447 case AArch64::LD1SH_S_IMM:
2448 case AArch64::LD1W_D_IMM:
2449 case AArch64::LD1SW_D_IMM:
2450 case AArch64::ST1B_H_IMM:
2451 case AArch64::ST1H_S_IMM:
2452 case AArch64::ST1W_D_IMM:
2453 // A half vector's worth of data
2454 // Width = mbytes * elements
2455 Scale = TypeSize::Scalable(8);
2456 Width = SVEMaxBytesPerVector / 2;
2457 MinOffset = -8;
2458 MaxOffset = 7;
2459 break;
2460 case AArch64::LD1B_S_IMM:
2461 case AArch64::LD1SB_S_IMM:
2462 case AArch64::LD1H_D_IMM:
2463 case AArch64::LD1SH_D_IMM:
2464 case AArch64::ST1B_S_IMM:
2465 case AArch64::ST1H_D_IMM:
2466 // A quarter vector's worth of data
2467 // Width = mbytes * elements
2468 Scale = TypeSize::Scalable(4);
2469 Width = SVEMaxBytesPerVector / 4;
2470 MinOffset = -8;
2471 MaxOffset = 7;
2472 break;
2473 case AArch64::LD1B_D_IMM:
2474 case AArch64::LD1SB_D_IMM:
2475 case AArch64::ST1B_D_IMM:
2476 // An eighth vector's worth of data
2477 // Width = mbytes * elements
2478 Scale = TypeSize::Scalable(2);
2479 Width = SVEMaxBytesPerVector / 8;
2480 MinOffset = -8;
2481 MaxOffset = 7;
2482 break;
2483 case AArch64::ST2GOffset:
2484 case AArch64::STZ2GOffset:
2485 Scale = TypeSize::Fixed(16);
2486 Width = 32;
2487 MinOffset = -256;
2488 MaxOffset = 255;
2489 break;
2490 case AArch64::STGPi:
2491 Scale = TypeSize::Fixed(16);
2492 Width = 16;
2493 MinOffset = -64;
2494 MaxOffset = 63;
2495 break;
2496 }
2497
2498 return true;
2499 }
2500
2501 // Scaling factor for unscaled load or store.
2502 int AArch64InstrInfo::getMemScale(unsigned Opc) {
2503 switch (Opc) {
2504 default:
2505 llvm_unreachable("Opcode has unknown scale!");
2506 case AArch64::LDRBBui:
2507 case AArch64::LDURBBi:
2508 case AArch64::LDRSBWui:
2509 case AArch64::LDURSBWi:
2510 case AArch64::STRBBui:
2511 case AArch64::STURBBi:
2512 return 1;
2513 case AArch64::LDRHHui:
2514 case AArch64::LDURHHi:
2515 case AArch64::LDRSHWui:
2516 case AArch64::LDURSHWi:
2517 case AArch64::STRHHui:
2518 case AArch64::STURHHi:
2519 return 2;
2520 case AArch64::LDRSui:
2521 case AArch64::LDURSi:
2522 case AArch64::LDRSWui:
2523 case AArch64::LDURSWi:
2524 case AArch64::LDRWui:
2525 case AArch64::LDURWi:
2526 case AArch64::STRSui:
2527 case AArch64::STURSi:
2528 case AArch64::STRWui:
2529 case AArch64::STURWi:
2530 case AArch64::LDPSi:
2531 case AArch64::LDPSWi:
2532 case AArch64::LDPWi:
2533 case AArch64::STPSi:
2534 case AArch64::STPWi:
2535 return 4;
2536 case AArch64::LDRDui:
2537 case AArch64::LDURDi:
2538 case AArch64::LDRXui:
2539 case AArch64::LDURXi:
2540 case AArch64::STRDui:
2541 case AArch64::STURDi:
2542 case AArch64::STRXui:
2543 case AArch64::STURXi:
2544 case AArch64::LDPDi:
2545 case AArch64::LDPXi:
2546 case AArch64::STPDi:
2547 case AArch64::STPXi:
2548 return 8;
2549 case AArch64::LDRQui:
2550 case AArch64::LDURQi:
2551 case AArch64::STRQui:
2552 case AArch64::STURQi:
2553 case AArch64::LDPQi:
2554 case AArch64::STPQi:
2555 case AArch64::STGOffset:
2556 case AArch64::STZGOffset:
2557 case AArch64::ST2GOffset:
2558 case AArch64::STZ2GOffset:
2559 case AArch64::STGPi:
2560 return 16;
2561 }
2562 }
2563
2564 // Scale the unscaled offsets. Returns false if the unscaled offset can't be
2565 // scaled.
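// For example (illustrative): an LDURXi byte offset of 16 scales to an
// element offset of 2 (16 / 8), while a byte offset of 12 is not a multiple
// of 8 and cannot be scaled, so the function returns false.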
2566 static bool scaleOffset(unsigned Opc, int64_t &Offset) {
2567 int Scale = AArch64InstrInfo::getMemScale(Opc);
2568
2569 // If the byte-offset isn't a multiple of the stride, we can't scale this
2570 // offset.
2571 if (Offset % Scale != 0)
2572 return false;
2573
2574 // Convert the byte-offset used by unscaled into an "element" offset used
2575 // by the scaled pair load/store instructions.
2576 Offset /= Scale;
2577 return true;
2578 }
2579
2580 static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
2581 if (FirstOpc == SecondOpc)
2582 return true;
2583 // We can also pair sign-ext and zero-ext instructions.
2584 switch (FirstOpc) {
2585 default:
2586 return false;
2587 case AArch64::LDRWui:
2588 case AArch64::LDURWi:
2589 return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
2590 case AArch64::LDRSWui:
2591 case AArch64::LDURSWi:
2592 return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
2593 }
2594 // These instructions can't be paired based on their opcodes.
2595 return false;
2596 }
2597
2598 static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
2599 int64_t Offset1, unsigned Opcode1, int FI2,
2600 int64_t Offset2, unsigned Opcode2) {
2601 // Accesses through fixed stack object frame indices may access a different
2602 // fixed stack slot. Check that the object offsets + offsets match.
2603 if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) {
2604 int64_t ObjectOffset1 = MFI.getObjectOffset(FI1);
2605 int64_t ObjectOffset2 = MFI.getObjectOffset(FI2);
2606 assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered.");
2607 // Convert to scaled object offsets.
2608 int Scale1 = AArch64InstrInfo::getMemScale(Opcode1);
2609 if (ObjectOffset1 % Scale1 != 0)
2610 return false;
2611 ObjectOffset1 /= Scale1;
2612 int Scale2 = AArch64InstrInfo::getMemScale(Opcode2);
2613 if (ObjectOffset2 % Scale2 != 0)
2614 return false;
2615 ObjectOffset2 /= Scale2;
2616 ObjectOffset1 += Offset1;
2617 ObjectOffset2 += Offset2;
2618 return ObjectOffset1 + 1 == ObjectOffset2;
2619 }
2620
2621 return FI1 == FI2;
2622 }
2623
2624 /// Detect opportunities for ldp/stp formation.
2625 ///
2626 /// Only called for LdSt for which getMemOperandWithOffset returns true.
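///
/// For example (a rough illustration): "ldr x1, [x0, #8]" followed by
/// "ldr x2, [x0, #16]" can later be rewritten as "ldp x1, x2, [x0, #8]", so
/// the two accesses are reported as clusterable here.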
2627 bool AArch64InstrInfo::shouldClusterMemOps(
2628 ArrayRef<const MachineOperand *> BaseOps1,
2629 ArrayRef<const MachineOperand *> BaseOps2, unsigned NumLoads,
2630 unsigned NumBytes) const {
2631 assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
2632 const MachineOperand &BaseOp1 = *BaseOps1.front();
2633 const MachineOperand &BaseOp2 = *BaseOps2.front();
2634 const MachineInstr &FirstLdSt = *BaseOp1.getParent();
2635 const MachineInstr &SecondLdSt = *BaseOp2.getParent();
2636 if (BaseOp1.getType() != BaseOp2.getType())
2637 return false;
2638
2639 assert((BaseOp1.isReg() || BaseOp1.isFI()) &&
2640 "Only base registers and frame indices are supported.");
2641
2642 // Check for both base regs and base FI.
2643 if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg())
2644 return false;
2645
2646 // Only cluster up to a single pair.
2647 if (NumLoads > 2)
2648 return false;
2649
2650 if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt))
2651 return false;
2652
2653 // Can we pair these instructions based on their opcodes?
2654 unsigned FirstOpc = FirstLdSt.getOpcode();
2655 unsigned SecondOpc = SecondLdSt.getOpcode();
2656 if (!canPairLdStOpc(FirstOpc, SecondOpc))
2657 return false;
2658
2659 // Can't merge volatiles or load/stores that have a hint to avoid pair
2660 // formation, for example.
2661 if (!isCandidateToMergeOrPair(FirstLdSt) ||
2662 !isCandidateToMergeOrPair(SecondLdSt))
2663 return false;
2664
2665 // isCandidateToMergeOrPair guarantees that operand 2 is an immediate.
2666 int64_t Offset1 = FirstLdSt.getOperand(2).getImm();
2667 if (isUnscaledLdSt(FirstOpc) && !scaleOffset(FirstOpc, Offset1))
2668 return false;
2669
2670 int64_t Offset2 = SecondLdSt.getOperand(2).getImm();
2671 if (isUnscaledLdSt(SecondOpc) && !scaleOffset(SecondOpc, Offset2))
2672 return false;
2673
2674 // Pairwise instructions have a 7-bit signed offset field.
2675 if (Offset1 > 63 || Offset1 < -64)
2676 return false;
2677
2678 // The caller should already have ordered First/SecondLdSt by offset.
2679 // Note: except for non-equal frame index bases
2680 if (BaseOp1.isFI()) {
2681 assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) &&
2682 "Caller should have ordered offsets.");
2683
2684 const MachineFrameInfo &MFI =
2685 FirstLdSt.getParent()->getParent()->getFrameInfo();
2686 return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc,
2687 BaseOp2.getIndex(), Offset2, SecondOpc);
2688 }
2689
2690 assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");
2691
2692 return Offset1 + 1 == Offset2;
2693 }
2694
2695 static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB,
2696 unsigned Reg, unsigned SubIdx,
2697 unsigned State,
2698 const TargetRegisterInfo *TRI) {
2699 if (!SubIdx)
2700 return MIB.addReg(Reg, State);
2701
2702 if (Register::isPhysicalRegister(Reg))
2703 return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
2704 return MIB.addReg(Reg, State, SubIdx);
2705 }
2706
2707 static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg,
2708 unsigned NumRegs) {
2709 // We really want the positive remainder mod 32 here; that happens to be
2710 // easily obtainable with a mask.
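// For example (illustrative encodings): copying a 2-register tuple from
// encodings {0, 1} into {1, 2} gives (1 - 0) & 0x1f == 1 < 2, so a forward
// copy would clobber its own source and the caller iterates backwards.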
2711 return ((DestReg - SrcReg) & 0x1f) < NumRegs;
2712 }
2713
2714 void AArch64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB,
2715 MachineBasicBlock::iterator I,
2716 const DebugLoc &DL, MCRegister DestReg,
2717 MCRegister SrcReg, bool KillSrc,
2718 unsigned Opcode,
2719 ArrayRef<unsigned> Indices) const {
2720 assert(Subtarget.hasNEON() && "Unexpected register copy without NEON");
2721 const TargetRegisterInfo *TRI = &getRegisterInfo();
2722 uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
2723 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
2724 unsigned NumRegs = Indices.size();
2725
2726 int SubReg = 0, End = NumRegs, Incr = 1;
2727 if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) {
2728 SubReg = NumRegs - 1;
2729 End = -1;
2730 Incr = -1;
2731 }
2732
2733 for (; SubReg != End; SubReg += Incr) {
2734 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
2735 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
2736 AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI);
2737 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
2738 }
2739 }
2740
2741 void AArch64InstrInfo::copyGPRRegTuple(MachineBasicBlock &MBB,
2742 MachineBasicBlock::iterator I,
2743 DebugLoc DL, unsigned DestReg,
2744 unsigned SrcReg, bool KillSrc,
2745 unsigned Opcode, unsigned ZeroReg,
2746 llvm::ArrayRef<unsigned> Indices) const {
2747 const TargetRegisterInfo *TRI = &getRegisterInfo();
2748 unsigned NumRegs = Indices.size();
2749
2750 #ifndef NDEBUG
2751 uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
2752 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
2753 assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 &&
2754 "GPR reg sequences should not be able to overlap");
2755 #endif
2756
2757 for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) {
2758 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
2759 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
2760 MIB.addReg(ZeroReg);
2761 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
2762 MIB.addImm(0);
2763 }
2764 }
2765
2766 void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
2767 MachineBasicBlock::iterator I,
2768 const DebugLoc &DL, MCRegister DestReg,
2769 MCRegister SrcReg, bool KillSrc) const {
2770 if (AArch64::GPR32spRegClass.contains(DestReg) &&
2771 (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) {
2772 const TargetRegisterInfo *TRI = &getRegisterInfo();
2773
2774 if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) {
2775 // If either operand is WSP, expand to ADD #0.
2776 if (Subtarget.hasZeroCycleRegMove()) {
2777 // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move.
2778 MCRegister DestRegX = TRI->getMatchingSuperReg(
2779 DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
2780 MCRegister SrcRegX = TRI->getMatchingSuperReg(
2781 SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
2782 // This instruction is reading and writing X registers. This may upset
2783 // the register scavenger and machine verifier, so we need to indicate
2784 // that we are reading an undefined value from SrcRegX, but a proper
2785 // value from SrcReg.
2786 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX)
2787 .addReg(SrcRegX, RegState::Undef)
2788 .addImm(0)
2789 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
2790 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
2791 } else {
2792 BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg)
2793 .addReg(SrcReg, getKillRegState(KillSrc))
2794 .addImm(0)
2795 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
2796 }
2797 } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) {
2798 BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg)
2799 .addImm(0)
2800 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
2801 } else {
2802 if (Subtarget.hasZeroCycleRegMove()) {
2803 // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move.
2804 MCRegister DestRegX = TRI->getMatchingSuperReg(
2805 DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
2806 MCRegister SrcRegX = TRI->getMatchingSuperReg(
2807 SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
2808 // This instruction is reading and writing X registers. This may upset
2809 // the register scavenger and machine verifier, so we need to indicate
2810 // that we are reading an undefined value from SrcRegX, but a proper
2811 // value from SrcReg.
2812 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX)
2813 .addReg(AArch64::XZR)
2814 .addReg(SrcRegX, RegState::Undef)
2815 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
2816 } else {
2817 // Otherwise, expand to ORR WZR.
2818 BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
2819 .addReg(AArch64::WZR)
2820 .addReg(SrcReg, getKillRegState(KillSrc));
2821 }
2822 }
2823 return;
2824 }
2825
2826 // Copy a Predicate register by ORRing with itself.
2827 if (AArch64::PPRRegClass.contains(DestReg) &&
2828 AArch64::PPRRegClass.contains(SrcReg)) {
2829 assert(Subtarget.hasSVE() && "Unexpected SVE register.");
2830 BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg)
2831 .addReg(SrcReg) // Pg
2832 .addReg(SrcReg)
2833 .addReg(SrcReg, getKillRegState(KillSrc));
2834 return;
2835 }
2836
2837 // Copy a Z register by ORRing with itself.
2838 if (AArch64::ZPRRegClass.contains(DestReg) &&
2839 AArch64::ZPRRegClass.contains(SrcReg)) {
2840 assert(Subtarget.hasSVE() && "Unexpected SVE register.");
2841 BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg)
2842 .addReg(SrcReg)
2843 .addReg(SrcReg, getKillRegState(KillSrc));
2844 return;
2845 }
2846
2847 // Copy a Z register pair by copying the individual sub-registers.
2848 if (AArch64::ZPR2RegClass.contains(DestReg) &&
2849 AArch64::ZPR2RegClass.contains(SrcReg)) {
2850 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1};
2851 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
2852 Indices);
2853 return;
2854 }
2855
2856 // Copy a Z register triple by copying the individual sub-registers.
2857 if (AArch64::ZPR3RegClass.contains(DestReg) &&
2858 AArch64::ZPR3RegClass.contains(SrcReg)) {
2859 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
2860 AArch64::zsub2};
2861 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
2862 Indices);
2863 return;
2864 }
2865
2866 // Copy a Z register quad by copying the individual sub-registers.
2867 if (AArch64::ZPR4RegClass.contains(DestReg) &&
2868 AArch64::ZPR4RegClass.contains(SrcReg)) {
2869 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
2870 AArch64::zsub2, AArch64::zsub3};
2871 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
2872 Indices);
2873 return;
2874 }
2875
2876 if (AArch64::GPR64spRegClass.contains(DestReg) &&
2877 (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) {
2878 if (DestReg == AArch64::SP || SrcReg == AArch64::SP) {
2879 // If either operand is SP, expand to ADD #0.
2880 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg)
2881 .addReg(SrcReg, getKillRegState(KillSrc))
2882 .addImm(0)
2883 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
2884 } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) {
2885 BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg)
2886 .addImm(0)
2887 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
2888 } else {
2889 // Otherwise, expand to ORR XZR.
2890 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
2891 .addReg(AArch64::XZR)
2892 .addReg(SrcReg, getKillRegState(KillSrc));
2893 }
2894 return;
2895 }
2896
2897 // Copy a DDDD register quad by copying the individual sub-registers.
2898 if (AArch64::DDDDRegClass.contains(DestReg) &&
2899 AArch64::DDDDRegClass.contains(SrcReg)) {
2900 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
2901 AArch64::dsub2, AArch64::dsub3};
2902 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
2903 Indices);
2904 return;
2905 }
2906
2907 // Copy a DDD register triple by copying the individual sub-registers.
2908 if (AArch64::DDDRegClass.contains(DestReg) &&
2909 AArch64::DDDRegClass.contains(SrcReg)) {
2910 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
2911 AArch64::dsub2};
2912 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
2913 Indices);
2914 return;
2915 }
2916
2917 // Copy a DD register pair by copying the individual sub-registers.
2918 if (AArch64::DDRegClass.contains(DestReg) &&
2919 AArch64::DDRegClass.contains(SrcReg)) {
2920 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1};
2921 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
2922 Indices);
2923 return;
2924 }
2925
2926 // Copy a QQQQ register quad by copying the individual sub-registers.
2927 if (AArch64::QQQQRegClass.contains(DestReg) &&
2928 AArch64::QQQQRegClass.contains(SrcReg)) {
2929 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
2930 AArch64::qsub2, AArch64::qsub3};
2931 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
2932 Indices);
2933 return;
2934 }
2935
2936 // Copy a QQQ register triple by copying the individual sub-registers.
2937 if (AArch64::QQQRegClass.contains(DestReg) &&
2938 AArch64::QQQRegClass.contains(SrcReg)) {
2939 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
2940 AArch64::qsub2};
2941 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
2942 Indices);
2943 return;
2944 }
2945
2946 // Copy a QQ register pair by copying the individual sub-registers.
2947 if (AArch64::QQRegClass.contains(DestReg) &&
2948 AArch64::QQRegClass.contains(SrcReg)) {
2949 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1};
2950 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
2951 Indices);
2952 return;
2953 }
2954
2955 if (AArch64::XSeqPairsClassRegClass.contains(DestReg) &&
2956 AArch64::XSeqPairsClassRegClass.contains(SrcReg)) {
2957 static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64};
2958 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs,
2959 AArch64::XZR, Indices);
2960 return;
2961 }
2962
2963 if (AArch64::WSeqPairsClassRegClass.contains(DestReg) &&
2964 AArch64::WSeqPairsClassRegClass.contains(SrcReg)) {
2965 static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32};
2966 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs,
2967 AArch64::WZR, Indices);
2968 return;
2969 }
2970
2971 if (AArch64::FPR128RegClass.contains(DestReg) &&
2972 AArch64::FPR128RegClass.contains(SrcReg)) {
2973 if (Subtarget.hasNEON()) {
2974 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
2975 .addReg(SrcReg)
2976 .addReg(SrcReg, getKillRegState(KillSrc));
2977 } else {
2978 BuildMI(MBB, I, DL, get(AArch64::STRQpre))
2979 .addReg(AArch64::SP, RegState::Define)
2980 .addReg(SrcReg, getKillRegState(KillSrc))
2981 .addReg(AArch64::SP)
2982 .addImm(-16);
2983 BuildMI(MBB, I, DL, get(AArch64::LDRQpre))
2984 .addReg(AArch64::SP, RegState::Define)
2985 .addReg(DestReg, RegState::Define)
2986 .addReg(AArch64::SP)
2987 .addImm(16);
2988 }
2989 return;
2990 }
2991
2992 if (AArch64::FPR64RegClass.contains(DestReg) &&
2993 AArch64::FPR64RegClass.contains(SrcReg)) {
2994 if (Subtarget.hasNEON()) {
2995 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::dsub,
2996 &AArch64::FPR128RegClass);
2997 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::dsub,
2998 &AArch64::FPR128RegClass);
2999 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
3000 .addReg(SrcReg)
3001 .addReg(SrcReg, getKillRegState(KillSrc));
3002 } else {
3003 BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg)
3004 .addReg(SrcReg, getKillRegState(KillSrc));
3005 }
3006 return;
3007 }
3008
3009 if (AArch64::FPR32RegClass.contains(DestReg) &&
3010 AArch64::FPR32RegClass.contains(SrcReg)) {
3011 if (Subtarget.hasNEON()) {
3012 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::ssub,
3013 &AArch64::FPR128RegClass);
3014 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::ssub,
3015 &AArch64::FPR128RegClass);
3016 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
3017 .addReg(SrcReg)
3018 .addReg(SrcReg, getKillRegState(KillSrc));
3019 } else {
3020 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
3021 .addReg(SrcReg, getKillRegState(KillSrc));
3022 }
3023 return;
3024 }
3025
3026 if (AArch64::FPR16RegClass.contains(DestReg) &&
3027 AArch64::FPR16RegClass.contains(SrcReg)) {
3028 if (Subtarget.hasNEON()) {
3029 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
3030 &AArch64::FPR128RegClass);
3031 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
3032 &AArch64::FPR128RegClass);
3033 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
3034 .addReg(SrcReg)
3035 .addReg(SrcReg, getKillRegState(KillSrc));
3036 } else {
3037 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
3038 &AArch64::FPR32RegClass);
3039 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
3040 &AArch64::FPR32RegClass);
3041 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
3042 .addReg(SrcReg, getKillRegState(KillSrc));
3043 }
3044 return;
3045 }
3046
3047 if (AArch64::FPR8RegClass.contains(DestReg) &&
3048 AArch64::FPR8RegClass.contains(SrcReg)) {
3049 if (Subtarget.hasNEON()) {
3050 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
3051 &AArch64::FPR128RegClass);
3052 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
3053 &AArch64::FPR128RegClass);
3054 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
3055 .addReg(SrcReg)
3056 .addReg(SrcReg, getKillRegState(KillSrc));
3057 } else {
3058 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
3059 &AArch64::FPR32RegClass);
3060 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
3061 &AArch64::FPR32RegClass);
3062 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
3063 .addReg(SrcReg, getKillRegState(KillSrc));
3064 }
3065 return;
3066 }
3067
3068 // Copies between GPR64 and FPR64.
3069 if (AArch64::FPR64RegClass.contains(DestReg) &&
3070 AArch64::GPR64RegClass.contains(SrcReg)) {
3071 BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
3072 .addReg(SrcReg, getKillRegState(KillSrc));
3073 return;
3074 }
3075 if (AArch64::GPR64RegClass.contains(DestReg) &&
3076 AArch64::FPR64RegClass.contains(SrcReg)) {
3077 BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg)
3078 .addReg(SrcReg, getKillRegState(KillSrc));
3079 return;
3080 }
3081 // Copies between GPR32 and FPR32.
3082 if (AArch64::FPR32RegClass.contains(DestReg) &&
3083 AArch64::GPR32RegClass.contains(SrcReg)) {
3084 BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
3085 .addReg(SrcReg, getKillRegState(KillSrc));
3086 return;
3087 }
3088 if (AArch64::GPR32RegClass.contains(DestReg) &&
3089 AArch64::FPR32RegClass.contains(SrcReg)) {
3090 BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg)
3091 .addReg(SrcReg, getKillRegState(KillSrc));
3092 return;
3093 }
3094
3095 if (DestReg == AArch64::NZCV) {
3096 assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy");
3097 BuildMI(MBB, I, DL, get(AArch64::MSR))
3098 .addImm(AArch64SysReg::NZCV)
3099 .addReg(SrcReg, getKillRegState(KillSrc))
3100 .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define);
3101 return;
3102 }
3103
3104 if (SrcReg == AArch64::NZCV) {
3105 assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy");
3106 BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg)
3107 .addImm(AArch64SysReg::NZCV)
3108 .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc));
3109 return;
3110 }
3111
3112 llvm_unreachable("unimplemented reg-to-reg copy");
3113 }
3114
3115 static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI,
3116 MachineBasicBlock &MBB,
3117 MachineBasicBlock::iterator InsertBefore,
3118 const MCInstrDesc &MCID,
3119 Register SrcReg, bool IsKill,
3120 unsigned SubIdx0, unsigned SubIdx1, int FI,
3121 MachineMemOperand *MMO) {
3122 Register SrcReg0 = SrcReg;
3123 Register SrcReg1 = SrcReg;
3124 if (Register::isPhysicalRegister(SrcReg)) {
3125 SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0);
3126 SubIdx0 = 0;
3127 SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1);
3128 SubIdx1 = 0;
3129 }
3130 BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
3131 .addReg(SrcReg0, getKillRegState(IsKill), SubIdx0)
3132 .addReg(SrcReg1, getKillRegState(IsKill), SubIdx1)
3133 .addFrameIndex(FI)
3134 .addImm(0)
3135 .addMemOperand(MMO);
3136 }
3137
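// A rough summary of the opcode selection below (illustrative): a GPR64
// spill uses STRXui, an FPR128 spill uses STRQui, SVE Z/P registers use
// STR_ZXI/STR_PXI on an SVE stack slot, and register tuples fall back to
// ST1 or STP forms.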
3138 void AArch64InstrInfo::storeRegToStackSlot(
3139 MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg,
3140 bool isKill, int FI, const TargetRegisterClass *RC,
3141 const TargetRegisterInfo *TRI) const {
3142 MachineFunction &MF = *MBB.getParent();
3143 MachineFrameInfo &MFI = MF.getFrameInfo();
3144
3145 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
3146 MachineMemOperand *MMO =
3147 MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
3148 MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
3149 unsigned Opc = 0;
3150 bool Offset = true;
3151 unsigned StackID = TargetStackID::Default;
3152 switch (TRI->getSpillSize(*RC)) {
3153 case 1:
3154 if (AArch64::FPR8RegClass.hasSubClassEq(RC))
3155 Opc = AArch64::STRBui;
3156 break;
3157 case 2:
3158 if (AArch64::FPR16RegClass.hasSubClassEq(RC))
3159 Opc = AArch64::STRHui;
3160 else if (AArch64::PPRRegClass.hasSubClassEq(RC)) {
3161 assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
3162 Opc = AArch64::STR_PXI;
3163 StackID = TargetStackID::SVEVector;
3164 }
3165 break;
3166 case 4:
3167 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
3168 Opc = AArch64::STRWui;
3169 if (Register::isVirtualRegister(SrcReg))
3170 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
3171 else
3172 assert(SrcReg != AArch64::WSP);
3173 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
3174 Opc = AArch64::STRSui;
3175 break;
3176 case 8:
3177 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
3178 Opc = AArch64::STRXui;
3179 if (Register::isVirtualRegister(SrcReg))
3180 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
3181 else
3182 assert(SrcReg != AArch64::SP);
3183 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
3184 Opc = AArch64::STRDui;
3185 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
3186 storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
3187 get(AArch64::STPWi), SrcReg, isKill,
3188 AArch64::sube32, AArch64::subo32, FI, MMO);
3189 return;
3190 }
3191 break;
3192 case 16:
3193 if (AArch64::FPR128RegClass.hasSubClassEq(RC))
3194 Opc = AArch64::STRQui;
3195 else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
3196 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
3197 Opc = AArch64::ST1Twov1d;
3198 Offset = false;
3199 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
3200 storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
3201 get(AArch64::STPXi), SrcReg, isKill,
3202 AArch64::sube64, AArch64::subo64, FI, MMO);
3203 return;
3204 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
3205 assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
3206 Opc = AArch64::STR_ZXI;
3207 StackID = TargetStackID::SVEVector;
3208 }
3209 break;
3210 case 24:
3211 if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
3212 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
3213 Opc = AArch64::ST1Threev1d;
3214 Offset = false;
3215 }
3216 break;
3217 case 32:
3218 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
3219 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
3220 Opc = AArch64::ST1Fourv1d;
3221 Offset = false;
3222 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
3223 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
3224 Opc = AArch64::ST1Twov2d;
3225 Offset = false;
3226 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
3227 assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
3228 Opc = AArch64::STR_ZZXI;
3229 StackID = TargetStackID::SVEVector;
3230 }
3231 break;
3232 case 48:
3233 if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
3234 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
3235 Opc = AArch64::ST1Threev2d;
3236 Offset = false;
3237 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
3238 assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
3239 Opc = AArch64::STR_ZZZXI;
3240 StackID = TargetStackID::SVEVector;
3241 }
3242 break;
3243 case 64:
3244 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
3245 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
3246 Opc = AArch64::ST1Fourv2d;
3247 Offset = false;
3248 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
3249 assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
3250 Opc = AArch64::STR_ZZZZXI;
3251 StackID = TargetStackID::SVEVector;
3252 }
3253 break;
3254 }
3255 assert(Opc && "Unknown register class");
3256 MFI.setStackID(FI, StackID);
3257
3258 const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
3259 .addReg(SrcReg, getKillRegState(isKill))
3260 .addFrameIndex(FI);
3261
3262 if (Offset)
3263 MI.addImm(0);
3264 MI.addMemOperand(MMO);
3265 }
3266
3267 static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI,
3268 MachineBasicBlock &MBB,
3269 MachineBasicBlock::iterator InsertBefore,
3270 const MCInstrDesc &MCID,
3271 Register DestReg, unsigned SubIdx0,
3272 unsigned SubIdx1, int FI,
3273 MachineMemOperand *MMO) {
3274 Register DestReg0 = DestReg;
3275 Register DestReg1 = DestReg;
3276 bool IsUndef = true;
3277 if (Register::isPhysicalRegister(DestReg)) {
3278 DestReg0 = TRI.getSubReg(DestReg, SubIdx0);
3279 SubIdx0 = 0;
3280 DestReg1 = TRI.getSubReg(DestReg, SubIdx1);
3281 SubIdx1 = 0;
3282 IsUndef = false;
3283 }
3284 BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
3285 .addReg(DestReg0, RegState::Define | getUndefRegState(IsUndef), SubIdx0)
3286 .addReg(DestReg1, RegState::Define | getUndefRegState(IsUndef), SubIdx1)
3287 .addFrameIndex(FI)
3288 .addImm(0)
3289 .addMemOperand(MMO);
3290 }
3291
3292 void AArch64InstrInfo::loadRegFromStackSlot(
3293 MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg,
3294 int FI, const TargetRegisterClass *RC,
3295 const TargetRegisterInfo *TRI) const {
3296 MachineFunction &MF = *MBB.getParent();
3297 MachineFrameInfo &MFI = MF.getFrameInfo();
3298 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
3299 MachineMemOperand *MMO =
3300 MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
3301 MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
3302
3303 unsigned Opc = 0;
3304 bool Offset = true;
3305 unsigned StackID = TargetStackID::Default;
3306 switch (TRI->getSpillSize(*RC)) {
3307 case 1:
3308 if (AArch64::FPR8RegClass.hasSubClassEq(RC))
3309 Opc = AArch64::LDRBui;
3310 break;
3311 case 2:
3312 if (AArch64::FPR16RegClass.hasSubClassEq(RC))
3313 Opc = AArch64::LDRHui;
3314 else if (AArch64::PPRRegClass.hasSubClassEq(RC)) {
3315 assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
3316 Opc = AArch64::LDR_PXI;
3317 StackID = TargetStackID::SVEVector;
3318 }
3319 break;
3320 case 4:
3321 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
3322 Opc = AArch64::LDRWui;
3323 if (Register::isVirtualRegister(DestReg))
3324 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass);
3325 else
3326 assert(DestReg != AArch64::WSP);
3327 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
3328 Opc = AArch64::LDRSui;
3329 break;
3330 case 8:
3331 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
3332 Opc = AArch64::LDRXui;
3333 if (Register::isVirtualRegister(DestReg))
3334 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass);
3335 else
3336 assert(DestReg != AArch64::SP);
3337 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
3338 Opc = AArch64::LDRDui;
3339 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
3340 loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
3341 get(AArch64::LDPWi), DestReg, AArch64::sube32,
3342 AArch64::subo32, FI, MMO);
3343 return;
3344 }
3345 break;
3346 case 16:
3347 if (AArch64::FPR128RegClass.hasSubClassEq(RC))
3348 Opc = AArch64::LDRQui;
3349 else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
3350 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
3351 Opc = AArch64::LD1Twov1d;
3352 Offset = false;
3353 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
3354 loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
3355 get(AArch64::LDPXi), DestReg, AArch64::sube64,
3356 AArch64::subo64, FI, MMO);
3357 return;
3358 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
3359 assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
3360 Opc = AArch64::LDR_ZXI;
3361 StackID = TargetStackID::SVEVector;
3362 }
3363 break;
3364 case 24:
3365 if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
3366 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
3367 Opc = AArch64::LD1Threev1d;
3368 Offset = false;
3369 }
3370 break;
3371 case 32:
3372 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
3373 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
3374 Opc = AArch64::LD1Fourv1d;
3375 Offset = false;
3376 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
3377 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
3378 Opc = AArch64::LD1Twov2d;
3379 Offset = false;
3380 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
3381 assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
3382 Opc = AArch64::LDR_ZZXI;
3383 StackID = TargetStackID::SVEVector;
3384 }
3385 break;
3386 case 48:
3387 if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
3388 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
3389 Opc = AArch64::LD1Threev2d;
3390 Offset = false;
3391 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
3392 assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
3393 Opc = AArch64::LDR_ZZZXI;
3394 StackID = TargetStackID::SVEVector;
3395 }
3396 break;
3397 case 64:
3398 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
3399 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
3400 Opc = AArch64::LD1Fourv2d;
3401 Offset = false;
3402 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
3403 assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
3404 Opc = AArch64::LDR_ZZZZXI;
3405 StackID = TargetStackID::SVEVector;
3406 }
3407 break;
3408 }
3409
3410 assert(Opc && "Unknown register class");
3411 MFI.setStackID(FI, StackID);
3412
3413 const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
3414 .addReg(DestReg, getDefRegState(true))
3415 .addFrameIndex(FI);
3416 if (Offset)
3417 MI.addImm(0);
3418 MI.addMemOperand(MMO);
3419 }
3420
3421 bool llvm::isNZCVTouchedInInstructionRange(const MachineInstr &DefMI,
3422 const MachineInstr &UseMI,
3423 const TargetRegisterInfo *TRI) {
3424 return any_of(instructionsWithoutDebug(std::next(DefMI.getIterator()),
3425 UseMI.getIterator()),
3426 [TRI](const MachineInstr &I) {
3427 return I.modifiesRegister(AArch64::NZCV, TRI) ||
3428 I.readsRegister(AArch64::NZCV, TRI);
3429 });
3430 }
3431
3432 void AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
3433 const StackOffset &Offset, int64_t &ByteSized, int64_t &VGSized) {
3434 // The smallest scalable elements supported by scaled SVE addressing
3435 // modes are predicates, which are 2 scalable bytes in size. So the scalable
3436 // byte offset must always be a multiple of 2.
3437 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
3438
3439 // VGSized offsets are divided by '2', because the VG register is the
3440 // number of 64-bit granules, as opposed to 128-bit vector chunks,
3441 // which is how the 'n' in e.g. MVT::nxv1i8 is modelled.
3442 // So, for a stack offset of 16 MVT::nxv1i8's, the size is n x 16 bytes.
3443 // VG = n * 2 and the dwarf offset must be VG * 8 bytes.
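 // A worked example (illustrative): a StackOffset of 16 fixed bytes plus 32
 // scalable bytes decomposes into ByteSized = 16 and VGSized = 32 / 2 = 16.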
3444 ByteSized = Offset.getFixed();
3445 VGSized = Offset.getScalable() / 2;
3446 }
3447
3448 /// Returns, via its reference parameters, the parts into which this frame
3449 /// offset can be decomposed for the purpose of describing a frame offset.
3450 /// For a non-scalable offset this is simply its byte size.
3451 void AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
3452 const StackOffset &Offset, int64_t &NumBytes, int64_t &NumPredicateVectors,
3453 int64_t &NumDataVectors) {
3454 // The smallest scalable elements supported by scaled SVE addressing
3455 // modes are predicates, which are 2 scalable bytes in size. So the scalable
3456 // byte offset must always be a multiple of 2.
3457 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
3458
3459 NumBytes = Offset.getFixed();
3460 NumDataVectors = 0;
3461 NumPredicateVectors = Offset.getScalable() / 2;
3462 // This method is used to get the offsets to adjust the frame offset.
3463 // If the function requires ADDPL to be used and needs more than two ADDPL
3464 // instructions, part of the offset is folded into NumDataVectors so that it
3465 // uses ADDVL for part of it, reducing the number of ADDPL instructions.
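 // A worked example (illustrative): an offset of 144 scalable bytes gives
 // NumPredicateVectors = 72; since 72 > 62 it is folded into
 // NumDataVectors = 72 / 8 = 9, leaving NumPredicateVectors = 0, so the
 // adjustment can be emitted with ADDVL instead of several ADDPLs.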
3466 if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 ||
3467 NumPredicateVectors > 62) {
3468 NumDataVectors = NumPredicateVectors / 8;
3469 NumPredicateVectors -= NumDataVectors * 8;
3470 }
3471 }
3472
3473 // Helper function to emit a frame offset adjustment from a given
3474 // pointer (SrcReg), stored into DestReg. This function is explicit
3475 // in that it requires the opcode.
3476 static void emitFrameOffsetAdj(MachineBasicBlock &MBB,
3477 MachineBasicBlock::iterator MBBI,
3478 const DebugLoc &DL, unsigned DestReg,
3479 unsigned SrcReg, int64_t Offset, unsigned Opc,
3480 const TargetInstrInfo *TII,
3481 MachineInstr::MIFlag Flag, bool NeedsWinCFI,
3482 bool *HasWinCFI) {
3483 int Sign = 1;
3484 unsigned MaxEncoding, ShiftSize;
3485 switch (Opc) {
3486 case AArch64::ADDXri:
3487 case AArch64::ADDSXri:
3488 case AArch64::SUBXri:
3489 case AArch64::SUBSXri:
3490 MaxEncoding = 0xfff;
3491 ShiftSize = 12;
3492 break;
3493 case AArch64::ADDVL_XXI:
3494 case AArch64::ADDPL_XXI:
3495 MaxEncoding = 31;
3496 ShiftSize = 0;
3497 if (Offset < 0) {
3498 MaxEncoding = 32;
3499 Sign = -1;
3500 Offset = -Offset;
3501 }
3502 break;
3503 default:
3504 llvm_unreachable("Unsupported opcode");
3505 }
3506
3507 // FIXME: If the offset won't fit in 24 bits, compute the offset into a
3508 // scratch register. If DestReg is a virtual register, use it as the
3509 // scratch register; otherwise, create a new virtual register (to be
3510 // replaced by the scavenger at the end of PEI). That case can be optimized
3511 // slightly if DestReg is SP which is always 16-byte aligned, so the scratch
3512 // register can be loaded with offset%8 and the add/sub can use an extending
3513 // instruction with LSL#3.
3514 // Currently the function handles any offsets but generates a poor sequence
3515 // of code.
3516 // assert(Offset < (1 << 24) && "unimplemented reg plus immediate");
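 // A worked example (illustrative) of the chunking loop below for ADDXri with
 // Offset = 0x12345: the first iteration emits 'add ..., #0x12, lsl #12'
 // (covering 0x12000) and the second emits 'add ..., #0x345', leaving zero.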
3517
3518 const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
3519 Register TmpReg = DestReg;
3520 if (TmpReg == AArch64::XZR)
3521 TmpReg = MBB.getParent()->getRegInfo().createVirtualRegister(
3522 &AArch64::GPR64RegClass);
3523 do {
3524 uint64_t ThisVal = std::min<uint64_t>(Offset, MaxEncodableValue);
3525 unsigned LocalShiftSize = 0;
3526 if (ThisVal > MaxEncoding) {
3527 ThisVal = ThisVal >> ShiftSize;
3528 LocalShiftSize = ShiftSize;
3529 }
3530 assert((ThisVal >> ShiftSize) <= MaxEncoding &&
3531 "Encoding cannot handle value that big");
3532
3533 Offset -= ThisVal << LocalShiftSize;
3534 if (Offset == 0)
3535 TmpReg = DestReg;
3536 auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), TmpReg)
3537 .addReg(SrcReg)
3538 .addImm(Sign * (int)ThisVal);
3539 if (ShiftSize)
3540 MBI = MBI.addImm(
3541 AArch64_AM::getShifterImm(AArch64_AM::LSL, LocalShiftSize));
3542 MBI = MBI.setMIFlag(Flag);
3543
3544 if (NeedsWinCFI) {
3545 assert(Sign == 1 && "SEH directives should always have a positive sign");
3546 int Imm = (int)(ThisVal << LocalShiftSize);
3547 if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) ||
3548 (SrcReg == AArch64::FP && DestReg == AArch64::SP)) {
3549 if (HasWinCFI)
3550 *HasWinCFI = true;
3551 if (Imm == 0)
3552 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).setMIFlag(Flag);
3553 else
3554 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP))
3555 .addImm(Imm)
3556 .setMIFlag(Flag);
3557 assert(Offset == 0 && "Expected remaining offset to be zero to "
3558 "emit a single SEH directive");
3559 } else if (DestReg == AArch64::SP) {
3560 if (HasWinCFI)
3561 *HasWinCFI = true;
3562 assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc");
3563 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
3564 .addImm(Imm)
3565 .setMIFlag(Flag);
3566 }
3567 if (HasWinCFI)
3568 *HasWinCFI = true;
3569 }
3570
3571 SrcReg = TmpReg;
3572 } while (Offset);
3573 }
3574
3575 void llvm::emitFrameOffset(MachineBasicBlock &MBB,
3576 MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
3577 unsigned DestReg, unsigned SrcReg,
3578 StackOffset Offset, const TargetInstrInfo *TII,
3579 MachineInstr::MIFlag Flag, bool SetNZCV,
3580 bool NeedsWinCFI, bool *HasWinCFI) {
3581 int64_t Bytes, NumPredicateVectors, NumDataVectors;
3582 AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
3583 Offset, Bytes, NumPredicateVectors, NumDataVectors);
3584
3585 // First emit non-scalable frame offsets, or a simple 'mov'.
3586 if (Bytes || (!Offset && SrcReg != DestReg)) {
3587 assert((DestReg != AArch64::SP || Bytes % 8 == 0) &&
3588 "SP increment/decrement not 8-byte aligned");
3589 unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri;
3590 if (Bytes < 0) {
3591 Bytes = -Bytes;
3592 Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri;
3593 }
3594 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Bytes, Opc, TII, Flag,
3595 NeedsWinCFI, HasWinCFI);
3596 SrcReg = DestReg;
3597 }
3598
3599 assert(!(SetNZCV && (NumPredicateVectors || NumDataVectors)) &&
3600 "SetNZCV not supported with SVE vectors");
3601 assert(!(NeedsWinCFI && (NumPredicateVectors || NumDataVectors)) &&
3602 "WinCFI not supported with SVE vectors");
3603
3604 if (NumDataVectors) {
3605 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors,
3606 AArch64::ADDVL_XXI, TII, Flag, NeedsWinCFI, nullptr);
3607 SrcReg = DestReg;
3608 }
3609
3610 if (NumPredicateVectors) {
3611 assert(DestReg != AArch64::SP && "Unaligned access to SP");
3612 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors,
3613 AArch64::ADDPL_XXI, TII, Flag, NeedsWinCFI, nullptr);
3614 }
3615 }
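
// A minimal usage sketch (illustrative; register and offset values assumed):
// frame lowering typically calls
//   emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
//                   StackOffset::getFixed(-16), TII, MachineInstr::FrameSetup);
// which materialises 'sub sp, sp, #16' via the SUBXri path above.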
3616
3617 MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
3618 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
3619 MachineBasicBlock::iterator InsertPt, int FrameIndex,
3620 LiveIntervals *LIS, VirtRegMap *VRM) const {
3621 // This is a bit of a hack. Consider this instruction:
3622 //
3623 // %0 = COPY %sp; GPR64all:%0
3624 //
3625 // We explicitly chose GPR64all for the virtual register so such a copy might
3626 // be eliminated by RegisterCoalescer. However, that may not be possible, and
3627 // %0 may even spill. We can't spill %sp, and since it is in the GPR64all
3628 // register class, TargetInstrInfo::foldMemoryOperand() is going to try.
3629 //
3630 // To prevent that, we are going to constrain the %0 register class here.
3631 //
3632 // <rdar://problem/11522048>
3633 //
3634 if (MI.isFullCopy()) {
3635 Register DstReg = MI.getOperand(0).getReg();
3636 Register SrcReg = MI.getOperand(1).getReg();
3637 if (SrcReg == AArch64::SP && Register::isVirtualRegister(DstReg)) {
3638 MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass);
3639 return nullptr;
3640 }
3641 if (DstReg == AArch64::SP && Register::isVirtualRegister(SrcReg)) {
3642 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
3643 return nullptr;
3644 }
3645 }
3646
3647 // Handle the case where a copy is being spilled or filled but the source
3648 // and destination register class don't match. For example:
3649 //
3650 // %0 = COPY %xzr; GPR64common:%0
3651 //
3652 // In this case we can still safely fold away the COPY and generate the
3653 // following spill code:
3654 //
3655 // STRXui %xzr, %stack.0
3656 //
3657 // This also eliminates spilled cross register class COPYs (e.g. between x and
3658 // d regs) of the same size. For example:
3659 //
3660 // %0 = COPY %1; GPR64:%0, FPR64:%1
3661 //
3662 // will be filled as
3663 //
3664 // LDRDui %0, fi<#0>
3665 //
3666 // instead of
3667 //
3668 // LDRXui %Temp, fi<#0>
3669 // %0 = FMOV %Temp
3670 //
3671 if (MI.isCopy() && Ops.size() == 1 &&
3672 // Make sure we're only folding the explicit COPY defs/uses.
3673 (Ops[0] == 0 || Ops[0] == 1)) {
3674 bool IsSpill = Ops[0] == 0;
3675 bool IsFill = !IsSpill;
3676 const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
3677 const MachineRegisterInfo &MRI = MF.getRegInfo();
3678 MachineBasicBlock &MBB = *MI.getParent();
3679 const MachineOperand &DstMO = MI.getOperand(0);
3680 const MachineOperand &SrcMO = MI.getOperand(1);
3681 Register DstReg = DstMO.getReg();
3682 Register SrcReg = SrcMO.getReg();
3683 // This is slightly expensive to compute for physical regs since
3684 // getMinimalPhysRegClass is slow.
3685 auto getRegClass = [&](unsigned Reg) {
3686 return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg)
3687 : TRI.getMinimalPhysRegClass(Reg);
3688 };
3689
3690 if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) {
3691 assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) ==
3692 TRI.getRegSizeInBits(*getRegClass(SrcReg)) &&
3693 "Mismatched register size in non subreg COPY");
3694 if (IsSpill)
3695 storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex,
3696 getRegClass(SrcReg), &TRI);
3697 else
3698 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex,
3699 getRegClass(DstReg), &TRI);
3700 return &*--InsertPt;
3701 }
3702
3703 // Handle cases like spilling def of:
3704 //
3705 // %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0
3706 //
3707 // where the physical register source can be widened and stored to the full
3708 // virtual reg destination stack slot, in this case producing:
3709 //
3710 // STRXui %xzr, %stack.0
3711 //
3712 if (IsSpill && DstMO.isUndef() && Register::isPhysicalRegister(SrcReg)) {
3713 assert(SrcMO.getSubReg() == 0 &&
3714 "Unexpected subreg on physical register");
3715 const TargetRegisterClass *SpillRC;
3716 unsigned SpillSubreg;
3717 switch (DstMO.getSubReg()) {
3718 default:
3719 SpillRC = nullptr;
3720 break;
3721 case AArch64::sub_32:
3722 case AArch64::ssub:
3723 if (AArch64::GPR32RegClass.contains(SrcReg)) {
3724 SpillRC = &AArch64::GPR64RegClass;
3725 SpillSubreg = AArch64::sub_32;
3726 } else if (AArch64::FPR32RegClass.contains(SrcReg)) {
3727 SpillRC = &AArch64::FPR64RegClass;
3728 SpillSubreg = AArch64::ssub;
3729 } else
3730 SpillRC = nullptr;
3731 break;
3732 case AArch64::dsub:
3733 if (AArch64::FPR64RegClass.contains(SrcReg)) {
3734 SpillRC = &AArch64::FPR128RegClass;
3735 SpillSubreg = AArch64::dsub;
3736 } else
3737 SpillRC = nullptr;
3738 break;
3739 }
3740
3741 if (SpillRC)
3742 if (unsigned WidenedSrcReg =
3743 TRI.getMatchingSuperReg(SrcReg, SpillSubreg, SpillRC)) {
3744 storeRegToStackSlot(MBB, InsertPt, WidenedSrcReg, SrcMO.isKill(),
3745 FrameIndex, SpillRC, &TRI);
3746 return &*--InsertPt;
3747 }
3748 }
3749
3750 // Handle cases like filling use of:
3751 //
3752 // %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1
3753 //
3754 // where we can load the full virtual reg source stack slot, into the subreg
3755 // destination, in this case producing:
3756 //
3757 // LDRWui %0:sub_32<def,read-undef>, %stack.0
3758 //
3759 if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) {
3760 const TargetRegisterClass *FillRC;
3761 switch (DstMO.getSubReg()) {
3762 default:
3763 FillRC = nullptr;
3764 break;
3765 case AArch64::sub_32:
3766 FillRC = &AArch64::GPR32RegClass;
3767 break;
3768 case AArch64::ssub:
3769 FillRC = &AArch64::FPR32RegClass;
3770 break;
3771 case AArch64::dsub:
3772 FillRC = &AArch64::FPR64RegClass;
3773 break;
3774 }
3775
3776 if (FillRC) {
3777 assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) ==
3778 TRI.getRegSizeInBits(*FillRC) &&
3779 "Mismatched regclass size on folded subreg COPY");
3780 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI);
3781 MachineInstr &LoadMI = *--InsertPt;
3782 MachineOperand &LoadDst = LoadMI.getOperand(0);
3783 assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load");
3784 LoadDst.setSubReg(DstMO.getSubReg());
3785 LoadDst.setIsUndef();
3786 return &LoadMI;
3787 }
3788 }
3789 }
3790
3791 // Cannot fold.
3792 return nullptr;
3793 }
3794
3795 int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI,
3796 StackOffset &SOffset,
3797 bool *OutUseUnscaledOp,
3798 unsigned *OutUnscaledOp,
3799 int64_t *EmittableOffset) {
3800 // Set output values in case of early exit.
3801 if (EmittableOffset)
3802 *EmittableOffset = 0;
3803 if (OutUseUnscaledOp)
3804 *OutUseUnscaledOp = false;
3805 if (OutUnscaledOp)
3806 *OutUnscaledOp = 0;
3807
3808 // Exit early for structured vector spills/fills as they can't take an
3809 // immediate offset.
3810 switch (MI.getOpcode()) {
3811 default:
3812 break;
3813 case AArch64::LD1Twov2d:
3814 case AArch64::LD1Threev2d:
3815 case AArch64::LD1Fourv2d:
3816 case AArch64::LD1Twov1d:
3817 case AArch64::LD1Threev1d:
3818 case AArch64::LD1Fourv1d:
3819 case AArch64::ST1Twov2d:
3820 case AArch64::ST1Threev2d:
3821 case AArch64::ST1Fourv2d:
3822 case AArch64::ST1Twov1d:
3823 case AArch64::ST1Threev1d:
3824 case AArch64::ST1Fourv1d:
3825 case AArch64::IRG:
3826 case AArch64::IRGstack:
3827 case AArch64::STGloop:
3828 case AArch64::STZGloop:
3829 return AArch64FrameOffsetCannotUpdate;
3830 }
3831
3832 // Get the min/max offset and the scale.
3833 TypeSize ScaleValue(0U, false);
3834 unsigned Width;
3835 int64_t MinOff, MaxOff;
3836 if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), ScaleValue, Width, MinOff,
3837 MaxOff))
3838 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
3839
3840 // Construct the complete offset.
3841 bool IsMulVL = ScaleValue.isScalable();
3842 unsigned Scale = ScaleValue.getKnownMinSize();
3843 int64_t Offset = IsMulVL ? SOffset.getScalable() : SOffset.getFixed();
3844
3845 const MachineOperand &ImmOpnd =
3846 MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode()));
3847 Offset += ImmOpnd.getImm() * Scale;
3848
3849 // If the offset doesn't match the scale, we rewrite the instruction to
3850 // use the unscaled instruction instead. Likewise, if we have a negative
3851 // offset and there is an unscaled op to use.
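 // For example (illustrative): an LDRXui whose total byte offset comes to 20
 // cannot use the scaled form (20 is not a multiple of its scale, 8), so it
 // is rewritten to the unscaled LDURXi with an emittable offset of 20 and no
 // remainder left in SOffset.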
3852 Optional<unsigned> UnscaledOp =
3853 AArch64InstrInfo::getUnscaledLdSt(MI.getOpcode());
3854 bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0);
3855 if (useUnscaledOp &&
3856 !AArch64InstrInfo::getMemOpInfo(*UnscaledOp, ScaleValue, Width, MinOff,
3857 MaxOff))
3858 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
3859
3860 Scale = ScaleValue.getKnownMinSize();
3861 assert(IsMulVL == ScaleValue.isScalable() &&
3862 "Unscaled opcode has different value for scalable");
3863
3864 int64_t Remainder = Offset % Scale;
3865 assert(!(Remainder && useUnscaledOp) &&
3866 "Cannot have remainder when using unscaled op");
3867
3868 assert(MinOff < MaxOff && "Unexpected Min/Max offsets");
3869 int64_t NewOffset = Offset / Scale;
3870 if (MinOff <= NewOffset && NewOffset <= MaxOff)
3871 Offset = Remainder;
3872 else {
3873 NewOffset = NewOffset < 0 ? MinOff : MaxOff;
3874 Offset = Offset - NewOffset * Scale + Remainder;
3875 }
3876
3877 if (EmittableOffset)
3878 *EmittableOffset = NewOffset;
3879 if (OutUseUnscaledOp)
3880 *OutUseUnscaledOp = useUnscaledOp;
3881 if (OutUnscaledOp && UnscaledOp)
3882 *OutUnscaledOp = *UnscaledOp;
3883
3884 if (IsMulVL)
3885 SOffset = StackOffset::get(SOffset.getFixed(), Offset);
3886 else
3887 SOffset = StackOffset::get(Offset, SOffset.getScalable());
3888 return AArch64FrameOffsetCanUpdate |
3889 (SOffset ? 0 : AArch64FrameOffsetIsLegal);
3890 }
3891
3892 bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
3893 unsigned FrameReg, StackOffset &Offset,
3894 const AArch64InstrInfo *TII) {
3895 unsigned Opcode = MI.getOpcode();
3896 unsigned ImmIdx = FrameRegIdx + 1;
3897
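 // For example (illustrative): an ADDXri that computes a frame address has
 // its frame index and immediate folded into a call to emitFrameOffset above,
 // the original instruction is erased, and the remaining Offset reported back
 // to the caller is zero.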
3898 if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) {
3899 Offset += StackOffset::getFixed(MI.getOperand(ImmIdx).getImm());
3900 emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(),
3901 MI.getOperand(0).getReg(), FrameReg, Offset, TII,
3902 MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri));
3903 MI.eraseFromParent();
3904 Offset = StackOffset();
3905 return true;
3906 }
3907
3908 int64_t NewOffset;
3909 unsigned UnscaledOp;
3910 bool UseUnscaledOp;
3911 int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp,
3912 &UnscaledOp, &NewOffset);
3913 if (Status & AArch64FrameOffsetCanUpdate) {
3914 if (Status & AArch64FrameOffsetIsLegal)
3915 // Replace the FrameIndex with FrameReg.
3916 MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
3917 if (UseUnscaledOp)
3918 MI.setDesc(TII->get(UnscaledOp));
3919
3920 MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset);
3921 return !Offset;
3922 }
3923
3924 return false;
3925 }
3926
3927 void AArch64InstrInfo::getNoop(MCInst &NopInst) const {
3928 NopInst.setOpcode(AArch64::HINT);
3929 NopInst.addOperand(MCOperand::createImm(0));
3930 }
3931
3932 // AArch64 supports MachineCombiner.
3933 bool AArch64InstrInfo::useMachineCombiner() const { return true; }
3934
3935 // True when Opc sets the NZCV flags.
3936 static bool isCombineInstrSettingFlag(unsigned Opc) {
3937 switch (Opc) {
3938 case AArch64::ADDSWrr:
3939 case AArch64::ADDSWri:
3940 case AArch64::ADDSXrr:
3941 case AArch64::ADDSXri:
3942 case AArch64::SUBSWrr:
3943 case AArch64::SUBSXrr:
3944 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
3945 case AArch64::SUBSWri:
3946 case AArch64::SUBSXri:
3947 return true;
3948 default:
3949 break;
3950 }
3951 return false;
3952 }
3953
3954 // 32b Opcodes that can be combined with a MUL
3955 static bool isCombineInstrCandidate32(unsigned Opc) {
3956 switch (Opc) {
3957 case AArch64::ADDWrr:
3958 case AArch64::ADDWri:
3959 case AArch64::SUBWrr:
3960 case AArch64::ADDSWrr:
3961 case AArch64::ADDSWri:
3962 case AArch64::SUBSWrr:
3963 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
3964 case AArch64::SUBWri:
3965 case AArch64::SUBSWri:
3966 return true;
3967 default:
3968 break;
3969 }
3970 return false;
3971 }
3972
3973 // 64b Opcodes that can be combined with a MUL
3974 static bool isCombineInstrCandidate64(unsigned Opc) {
3975 switch (Opc) {
3976 case AArch64::ADDXrr:
3977 case AArch64::ADDXri:
3978 case AArch64::SUBXrr:
3979 case AArch64::ADDSXrr:
3980 case AArch64::ADDSXri:
3981 case AArch64::SUBSXrr:
3982 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
3983 case AArch64::SUBXri:
3984 case AArch64::SUBSXri:
3985 case AArch64::ADDv8i8:
3986 case AArch64::ADDv16i8:
3987 case AArch64::ADDv4i16:
3988 case AArch64::ADDv8i16:
3989 case AArch64::ADDv2i32:
3990 case AArch64::ADDv4i32:
3991 case AArch64::SUBv8i8:
3992 case AArch64::SUBv16i8:
3993 case AArch64::SUBv4i16:
3994 case AArch64::SUBv8i16:
3995 case AArch64::SUBv2i32:
3996 case AArch64::SUBv4i32:
3997 return true;
3998 default:
3999 break;
4000 }
4001 return false;
4002 }
4003
4004 // FP Opcodes that can be combined with a FMUL.
4005 static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
4006 switch (Inst.getOpcode()) {
4007 default:
4008 break;
4009 case AArch64::FADDHrr:
4010 case AArch64::FADDSrr:
4011 case AArch64::FADDDrr:
4012 case AArch64::FADDv4f16:
4013 case AArch64::FADDv8f16:
4014 case AArch64::FADDv2f32:
4015 case AArch64::FADDv2f64:
4016 case AArch64::FADDv4f32:
4017 case AArch64::FSUBHrr:
4018 case AArch64::FSUBSrr:
4019 case AArch64::FSUBDrr:
4020 case AArch64::FSUBv4f16:
4021 case AArch64::FSUBv8f16:
4022 case AArch64::FSUBv2f32:
4023 case AArch64::FSUBv2f64:
4024 case AArch64::FSUBv4f32:
4025 TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options;
4026 // We can fuse FADD/FSUB with FMUL, if fusion is either allowed globally by
4027 // the target options or if FADD/FSUB has the contract fast-math flag.
4028 return Options.UnsafeFPMath ||
4029 Options.AllowFPOpFusion == FPOpFusion::Fast ||
4030 Inst.getFlag(MachineInstr::FmContract);
4032 }
4033 return false;
4034 }
4035
4036 // Opcodes that can be combined with a MUL
4037 static bool isCombineInstrCandidate(unsigned Opc) {
4038 return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc));
4039 }
4040
4041 //
4042 // Utility routine that checks if \param MO is defined by a
4043 // \param CombineOpc instruction in the basic block \param MBB.
4044 static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
4045 unsigned CombineOpc, unsigned ZeroReg = 0,
4046 bool CheckZeroReg = false) {
4047 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4048 MachineInstr *MI = nullptr;
4049
4050 if (MO.isReg() && Register::isVirtualRegister(MO.getReg()))
4051 MI = MRI.getUniqueVRegDef(MO.getReg());
4052 // And it needs to be in the trace (otherwise, it won't have a depth).
4053 if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc)
4054 return false;
4055 // The result must only be used by the instruction we combine with.
4056 if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
4057 return false;
4058
4059 if (CheckZeroReg) {
4060 assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
4061 MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
4062 MI->getOperand(3).isReg() && "MAdd/MSub must have at least 4 regs");
4063 // The third input reg must be zero.
4064 if (MI->getOperand(3).getReg() != ZeroReg)
4065 return false;
4066 }
4067
4068 return true;
4069 }
4070
4071 //
4072 // Is \param MO defined by an integer multiply and can be combined?
4073 static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
4074 unsigned MulOpc, unsigned ZeroReg) {
4075 return canCombine(MBB, MO, MulOpc, ZeroReg, true);
4076 }
4077
4078 //
4079 // Is \param MO defined by a floating-point multiply and can be combined?
4080 static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO,
4081 unsigned MulOpc) {
4082 return canCombine(MBB, MO, MulOpc);
4083 }
4084
4085 // TODO: There are many more machine instruction opcodes to match:
4086 // 1. Other data types (integer, vectors)
4087 // 2. Other math / logic operations (xor, or)
4088 // 3. Other forms of the same operation (intrinsics and other variants)
4089 bool AArch64InstrInfo::isAssociativeAndCommutative(
4090 const MachineInstr &Inst) const {
4091 switch (Inst.getOpcode()) {
4092 case AArch64::FADDDrr:
4093 case AArch64::FADDSrr:
4094 case AArch64::FADDv2f32:
4095 case AArch64::FADDv2f64:
4096 case AArch64::FADDv4f32:
4097 case AArch64::FMULDrr:
4098 case AArch64::FMULSrr:
4099 case AArch64::FMULX32:
4100 case AArch64::FMULX64:
4101 case AArch64::FMULXv2f32:
4102 case AArch64::FMULXv2f64:
4103 case AArch64::FMULXv4f32:
4104 case AArch64::FMULv2f32:
4105 case AArch64::FMULv2f64:
4106 case AArch64::FMULv4f32:
4107 return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath;
4108 default:
4109 return false;
4110 }
4111 }
4112
4113 /// Find instructions that can be turned into madd.
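/// For example (an illustrative MIR sketch with assumed virtual registers):
///   %2:gpr32 = MADDWrrr %0, %1, $wzr     ; a plain multiply
///   %4:gpr32 = ADDWrr killed %2, %3
/// matches MULADDW_OP1 and can later be rewritten as
///   %4:gpr32 = MADDWrrr %0, %1, %3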
4114 static bool getMaddPatterns(MachineInstr &Root,
4115 SmallVectorImpl<MachineCombinerPattern> &Patterns) {
4116 unsigned Opc = Root.getOpcode();
4117 MachineBasicBlock &MBB = *Root.getParent();
4118 bool Found = false;
4119
4120 if (!isCombineInstrCandidate(Opc))
4121 return false;
4122 if (isCombineInstrSettingFlag(Opc)) {
4123 int Cmp_NZCV = Root.findRegisterDefOperandIdx(AArch64::NZCV, true);
4124 // When NZCV is live bail out.
4125 if (Cmp_NZCV == -1)
4126 return false;
4127 unsigned NewOpc = convertToNonFlagSettingOpc(Root);
4128 // When opcode can't change bail out.
4129 // CHECKME: do we miss any cases for opcode conversion?
4130 if (NewOpc == Opc)
4131 return false;
4132 Opc = NewOpc;
4133 }
4134
4135 auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg,
4136 MachineCombinerPattern Pattern) {
4137 if (canCombineWithMUL(MBB, Root.getOperand(Operand), Opcode, ZeroReg)) {
4138 Patterns.push_back(Pattern);
4139 Found = true;
4140 }
4141 };
4142
4143 auto setVFound = [&](int Opcode, int Operand, MachineCombinerPattern Pattern) {
4144 if (canCombine(MBB, Root.getOperand(Operand), Opcode)) {
4145 Patterns.push_back(Pattern);
4146 Found = true;
4147 }
4148 };
4149
4150 typedef MachineCombinerPattern MCP;
4151
4152 switch (Opc) {
4153 default:
4154 break;
4155 case AArch64::ADDWrr:
4156 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
4157 "ADDWrr does not have register operands");
4158 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1);
4159 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2);
4160 break;
4161 case AArch64::ADDXrr:
4162 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1);
4163 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2);
4164 break;
4165 case AArch64::SUBWrr:
4166 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1);
4167 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2);
4168 break;
4169 case AArch64::SUBXrr:
4170 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBX_OP1);
4171 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2);
4172 break;
4173 case AArch64::ADDWri:
4174 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1);
4175 break;
4176 case AArch64::ADDXri:
4177 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1);
4178 break;
4179 case AArch64::SUBWri:
4180 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1);
4181 break;
4182 case AArch64::SUBXri:
4183 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1);
4184 break;
4185 case AArch64::ADDv8i8:
4186 setVFound(AArch64::MULv8i8, 1, MCP::MULADDv8i8_OP1);
4187 setVFound(AArch64::MULv8i8, 2, MCP::MULADDv8i8_OP2);
4188 break;
4189 case AArch64::ADDv16i8:
4190 setVFound(AArch64::MULv16i8, 1, MCP::MULADDv16i8_OP1);
4191 setVFound(AArch64::MULv16i8, 2, MCP::MULADDv16i8_OP2);
4192 break;
4193 case AArch64::ADDv4i16:
4194 setVFound(AArch64::MULv4i16, 1, MCP::MULADDv4i16_OP1);
4195 setVFound(AArch64::MULv4i16, 2, MCP::MULADDv4i16_OP2);
4196 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULADDv4i16_indexed_OP1);
4197 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULADDv4i16_indexed_OP2);
4198 break;
4199 case AArch64::ADDv8i16:
4200 setVFound(AArch64::MULv8i16, 1, MCP::MULADDv8i16_OP1);
4201 setVFound(AArch64::MULv8i16, 2, MCP::MULADDv8i16_OP2);
4202 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULADDv8i16_indexed_OP1);
4203 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULADDv8i16_indexed_OP2);
4204 break;
4205 case AArch64::ADDv2i32:
4206 setVFound(AArch64::MULv2i32, 1, MCP::MULADDv2i32_OP1);
4207 setVFound(AArch64::MULv2i32, 2, MCP::MULADDv2i32_OP2);
4208 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULADDv2i32_indexed_OP1);
4209 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULADDv2i32_indexed_OP2);
4210 break;
4211 case AArch64::ADDv4i32:
4212 setVFound(AArch64::MULv4i32, 1, MCP::MULADDv4i32_OP1);
4213 setVFound(AArch64::MULv4i32, 2, MCP::MULADDv4i32_OP2);
4214 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULADDv4i32_indexed_OP1);
4215 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULADDv4i32_indexed_OP2);
4216 break;
4217 case AArch64::SUBv8i8:
4218 setVFound(AArch64::MULv8i8, 1, MCP::MULSUBv8i8_OP1);
4219 setVFound(AArch64::MULv8i8, 2, MCP::MULSUBv8i8_OP2);
4220 break;
4221 case AArch64::SUBv16i8:
4222 setVFound(AArch64::MULv16i8, 1, MCP::MULSUBv16i8_OP1);
4223 setVFound(AArch64::MULv16i8, 2, MCP::MULSUBv16i8_OP2);
4224 break;
4225 case AArch64::SUBv4i16:
4226 setVFound(AArch64::MULv4i16, 1, MCP::MULSUBv4i16_OP1);
4227 setVFound(AArch64::MULv4i16, 2, MCP::MULSUBv4i16_OP2);
4228 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULSUBv4i16_indexed_OP1);
4229 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULSUBv4i16_indexed_OP2);
4230 break;
4231 case AArch64::SUBv8i16:
4232 setVFound(AArch64::MULv8i16, 1, MCP::MULSUBv8i16_OP1);
4233 setVFound(AArch64::MULv8i16, 2, MCP::MULSUBv8i16_OP2);
4234 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULSUBv8i16_indexed_OP1);
4235 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULSUBv8i16_indexed_OP2);
4236 break;
4237 case AArch64::SUBv2i32:
4238 setVFound(AArch64::MULv2i32, 1, MCP::MULSUBv2i32_OP1);
4239 setVFound(AArch64::MULv2i32, 2, MCP::MULSUBv2i32_OP2);
4240 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULSUBv2i32_indexed_OP1);
4241 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULSUBv2i32_indexed_OP2);
4242 break;
4243 case AArch64::SUBv4i32:
4244 setVFound(AArch64::MULv4i32, 1, MCP::MULSUBv4i32_OP1);
4245 setVFound(AArch64::MULv4i32, 2, MCP::MULSUBv4i32_OP2);
4246 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULSUBv4i32_indexed_OP1);
4247 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULSUBv4i32_indexed_OP2);
4248 break;
4249 }
4250 return Found;
4251 }
4252 /// Floating-Point Support
4253
4254 /// Find instructions that can be turned into madd.
4255 static bool getFMAPatterns(MachineInstr &Root,
4256 SmallVectorImpl<MachineCombinerPattern> &Patterns) {
4257
4258 if (!isCombineInstrCandidateFP(Root))
4259 return false;
4260
4261 MachineBasicBlock &MBB = *Root.getParent();
4262 bool Found = false;
4263
4264 auto Match = [&](int Opcode, int Operand,
4265 MachineCombinerPattern Pattern) -> bool {
4266 if (canCombineWithFMUL(MBB, Root.getOperand(Operand), Opcode)) {
4267 Patterns.push_back(Pattern);
4268 return true;
4269 }
4270 return false;
4271 };
4272
4273 typedef MachineCombinerPattern MCP;
4274
4275 switch (Root.getOpcode()) {
4276 default:
4277 assert(false && "Unsupported FP instruction in combiner\n");
4278 break;
4279 case AArch64::FADDHrr:
4280 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
4281 "FADDHrr does not have register operands");
4282
4283 Found = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1);
4284 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2);
4285 break;
4286 case AArch64::FADDSrr:
4287 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
4288 "FADDSrr does not have register operands");
4289
4290 Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) ||
4291 Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1);
4292
4293 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) ||
4294 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2);
4295 break;
4296 case AArch64::FADDDrr:
4297 Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) ||
4298 Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1);
4299
4300 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) ||
4301 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2);
4302 break;
4303 case AArch64::FADDv4f16:
4304 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) ||
4305 Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1);
4306
4307 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) ||
4308 Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2);
4309 break;
4310 case AArch64::FADDv8f16:
4311 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) ||
4312 Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1);
4313
4314 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) ||
4315 Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2);
4316 break;
4317 case AArch64::FADDv2f32:
4318 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) ||
4319 Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1);
4320
4321 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) ||
4322 Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2);
4323 break;
4324 case AArch64::FADDv2f64:
4325 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) ||
4326 Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1);
4327
4328 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) ||
4329 Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2);
4330 break;
4331 case AArch64::FADDv4f32:
4332 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) ||
4333 Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1);
4334
4335 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) ||
4336 Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2);
4337 break;
4338 case AArch64::FSUBHrr:
4339 Found = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1);
4340 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2);
4341 Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1);
4342 break;
4343 case AArch64::FSUBSrr:
4344 Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1);
4345
4346 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) ||
4347 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2);
4348
4349 Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1);
4350 break;
4351 case AArch64::FSUBDrr:
4352 Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1);
4353
4354 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) ||
4355 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2);
4356
4357 Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1);
4358 break;
4359 case AArch64::FSUBv4f16:
4360 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) ||
4361 Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2);
4362
4363 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) ||
4364 Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1);
4365 break;
4366 case AArch64::FSUBv8f16:
4367 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) ||
4368 Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2);
4369
4370 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) ||
4371 Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1);
4372 break;
4373 case AArch64::FSUBv2f32:
4374 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) ||
4375 Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2);
4376
4377 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) ||
4378 Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1);
4379 break;
4380 case AArch64::FSUBv2f64:
4381 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) ||
4382 Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2);
4383
4384 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) ||
4385 Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1);
4386 break;
4387 case AArch64::FSUBv4f32:
4388 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) ||
4389 Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2);
4390
4391 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) ||
4392 Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1);
4393 break;
4394 }
4395 return Found;
4396 }
4397
4398 /// Return true when a code sequence can improve throughput. It
4399 /// should be called only for instructions in loops.
4400 /// \param Pattern - combiner pattern
4401 bool AArch64InstrInfo::isThroughputPattern(
4402 MachineCombinerPattern Pattern) const {
4403 switch (Pattern) {
4404 default:
4405 break;
4406 case MachineCombinerPattern::FMULADDH_OP1:
4407 case MachineCombinerPattern::FMULADDH_OP2:
4408 case MachineCombinerPattern::FMULSUBH_OP1:
4409 case MachineCombinerPattern::FMULSUBH_OP2:
4410 case MachineCombinerPattern::FMULADDS_OP1:
4411 case MachineCombinerPattern::FMULADDS_OP2:
4412 case MachineCombinerPattern::FMULSUBS_OP1:
4413 case MachineCombinerPattern::FMULSUBS_OP2:
4414 case MachineCombinerPattern::FMULADDD_OP1:
4415 case MachineCombinerPattern::FMULADDD_OP2:
4416 case MachineCombinerPattern::FMULSUBD_OP1:
4417 case MachineCombinerPattern::FMULSUBD_OP2:
4418 case MachineCombinerPattern::FNMULSUBH_OP1:
4419 case MachineCombinerPattern::FNMULSUBS_OP1:
4420 case MachineCombinerPattern::FNMULSUBD_OP1:
4421 case MachineCombinerPattern::FMLAv4i16_indexed_OP1:
4422 case MachineCombinerPattern::FMLAv4i16_indexed_OP2:
4423 case MachineCombinerPattern::FMLAv8i16_indexed_OP1:
4424 case MachineCombinerPattern::FMLAv8i16_indexed_OP2:
4425 case MachineCombinerPattern::FMLAv1i32_indexed_OP1:
4426 case MachineCombinerPattern::FMLAv1i32_indexed_OP2:
4427 case MachineCombinerPattern::FMLAv1i64_indexed_OP1:
4428 case MachineCombinerPattern::FMLAv1i64_indexed_OP2:
4429 case MachineCombinerPattern::FMLAv4f16_OP2:
4430 case MachineCombinerPattern::FMLAv4f16_OP1:
4431 case MachineCombinerPattern::FMLAv8f16_OP1:
4432 case MachineCombinerPattern::FMLAv8f16_OP2:
4433 case MachineCombinerPattern::FMLAv2f32_OP2:
4434 case MachineCombinerPattern::FMLAv2f32_OP1:
4435 case MachineCombinerPattern::FMLAv2f64_OP1:
4436 case MachineCombinerPattern::FMLAv2f64_OP2:
4437 case MachineCombinerPattern::FMLAv2i32_indexed_OP1:
4438 case MachineCombinerPattern::FMLAv2i32_indexed_OP2:
4439 case MachineCombinerPattern::FMLAv2i64_indexed_OP1:
4440 case MachineCombinerPattern::FMLAv2i64_indexed_OP2:
4441 case MachineCombinerPattern::FMLAv4f32_OP1:
4442 case MachineCombinerPattern::FMLAv4f32_OP2:
4443 case MachineCombinerPattern::FMLAv4i32_indexed_OP1:
4444 case MachineCombinerPattern::FMLAv4i32_indexed_OP2:
4445 case MachineCombinerPattern::FMLSv4i16_indexed_OP1:
4446 case MachineCombinerPattern::FMLSv4i16_indexed_OP2:
4447 case MachineCombinerPattern::FMLSv8i16_indexed_OP1:
4448 case MachineCombinerPattern::FMLSv8i16_indexed_OP2:
4449 case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
4450 case MachineCombinerPattern::FMLSv1i64_indexed_OP2:
4451 case MachineCombinerPattern::FMLSv2i32_indexed_OP2:
4452 case MachineCombinerPattern::FMLSv2i64_indexed_OP2:
4453 case MachineCombinerPattern::FMLSv4f16_OP1:
4454 case MachineCombinerPattern::FMLSv4f16_OP2:
4455 case MachineCombinerPattern::FMLSv8f16_OP1:
4456 case MachineCombinerPattern::FMLSv8f16_OP2:
4457 case MachineCombinerPattern::FMLSv2f32_OP2:
4458 case MachineCombinerPattern::FMLSv2f64_OP2:
4459 case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
4460 case MachineCombinerPattern::FMLSv4f32_OP2:
4461 case MachineCombinerPattern::MULADDv8i8_OP1:
4462 case MachineCombinerPattern::MULADDv8i8_OP2:
4463 case MachineCombinerPattern::MULADDv16i8_OP1:
4464 case MachineCombinerPattern::MULADDv16i8_OP2:
4465 case MachineCombinerPattern::MULADDv4i16_OP1:
4466 case MachineCombinerPattern::MULADDv4i16_OP2:
4467 case MachineCombinerPattern::MULADDv8i16_OP1:
4468 case MachineCombinerPattern::MULADDv8i16_OP2:
4469 case MachineCombinerPattern::MULADDv2i32_OP1:
4470 case MachineCombinerPattern::MULADDv2i32_OP2:
4471 case MachineCombinerPattern::MULADDv4i32_OP1:
4472 case MachineCombinerPattern::MULADDv4i32_OP2:
4473 case MachineCombinerPattern::MULSUBv8i8_OP1:
4474 case MachineCombinerPattern::MULSUBv8i8_OP2:
4475 case MachineCombinerPattern::MULSUBv16i8_OP1:
4476 case MachineCombinerPattern::MULSUBv16i8_OP2:
4477 case MachineCombinerPattern::MULSUBv4i16_OP1:
4478 case MachineCombinerPattern::MULSUBv4i16_OP2:
4479 case MachineCombinerPattern::MULSUBv8i16_OP1:
4480 case MachineCombinerPattern::MULSUBv8i16_OP2:
4481 case MachineCombinerPattern::MULSUBv2i32_OP1:
4482 case MachineCombinerPattern::MULSUBv2i32_OP2:
4483 case MachineCombinerPattern::MULSUBv4i32_OP1:
4484 case MachineCombinerPattern::MULSUBv4i32_OP2:
4485 case MachineCombinerPattern::MULADDv4i16_indexed_OP1:
4486 case MachineCombinerPattern::MULADDv4i16_indexed_OP2:
4487 case MachineCombinerPattern::MULADDv8i16_indexed_OP1:
4488 case MachineCombinerPattern::MULADDv8i16_indexed_OP2:
4489 case MachineCombinerPattern::MULADDv2i32_indexed_OP1:
4490 case MachineCombinerPattern::MULADDv2i32_indexed_OP2:
4491 case MachineCombinerPattern::MULADDv4i32_indexed_OP1:
4492 case MachineCombinerPattern::MULADDv4i32_indexed_OP2:
4493 case MachineCombinerPattern::MULSUBv4i16_indexed_OP1:
4494 case MachineCombinerPattern::MULSUBv4i16_indexed_OP2:
4495 case MachineCombinerPattern::MULSUBv8i16_indexed_OP1:
4496 case MachineCombinerPattern::MULSUBv8i16_indexed_OP2:
4497 case MachineCombinerPattern::MULSUBv2i32_indexed_OP1:
4498 case MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
4499 case MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
4500 case MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
4501 return true;
4502 } // end switch (Pattern)
4503 return false;
4504 }
4505 /// Return true when there is potentially a faster code sequence for an
4506 /// instruction chain ending in \p Root. All potential patterns are listed in
4507 /// the \p Patterns vector. Patterns should be sorted in priority order since the
4508 /// pattern evaluator stops checking as soon as it finds a faster sequence.
4509
4510 bool AArch64InstrInfo::getMachineCombinerPatterns(
4511 MachineInstr &Root,
4512 SmallVectorImpl<MachineCombinerPattern> &Patterns) const {
4513 // Integer patterns
4514 if (getMaddPatterns(Root, Patterns))
4515 return true;
4516 // Floating point patterns
4517 if (getFMAPatterns(Root, Patterns))
4518 return true;
4519
4520 return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns);
4521 }
4522
4523 enum class FMAInstKind { Default, Indexed, Accumulator };
4524 /// genFusedMultiply - Generate fused multiply instructions.
4525 /// This function supports both integer and floating point instructions.
4526 /// A typical example:
4527 /// F|MUL I=A,B,0
4528 /// F|ADD R,I,C
4529 /// ==> F|MADD R,A,B,C
4530 /// \param MF Containing MachineFunction
4531 /// \param MRI Register information
4532 /// \param TII Target information
4533 /// \param Root is the F|ADD instruction
4534 /// \param [out] InsInstrs is a vector of machine instructions and will
4535 /// contain the generated madd instruction
4536 /// \param IdxMulOpd is index of operand in Root that is the result of
4537 /// the F|MUL. In the example above IdxMulOpd is 1.
4538 /// \param MaddOpc the opcode of the f|madd instruction
4539 /// \param RC Register class of operands
4540 /// \param kind the kind of fma instruction (addressing mode) to be generated
4541 /// \param ReplacedAddend is the result register from the instruction
4542 /// replacing the non-combined operand, if any.
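/// Operand order as emitted by this helper, per \p kind (illustrative):
///   Default:     MaddOpc Rd, MulOp1, MulOp2, Addend   (e.g. MADD, FMADD)
///   Accumulator: MaddOpc Rd, Addend, MulOp1, MulOp2   (e.g. MLA, FMLA)
///   Indexed:     MaddOpc Rd, Addend, MulOp1, MulOp2, lane index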
4543 static MachineInstr *
4544 genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
4545 const TargetInstrInfo *TII, MachineInstr &Root,
4546 SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
4547 unsigned MaddOpc, const TargetRegisterClass *RC,
4548 FMAInstKind kind = FMAInstKind::Default,
4549 const Register *ReplacedAddend = nullptr) {
4550 assert(IdxMulOpd == 1 || IdxMulOpd == 2);
4551
4552 unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
4553 MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
4554 Register ResultReg = Root.getOperand(0).getReg();
4555 Register SrcReg0 = MUL->getOperand(1).getReg();
4556 bool Src0IsKill = MUL->getOperand(1).isKill();
4557 Register SrcReg1 = MUL->getOperand(2).getReg();
4558 bool Src1IsKill = MUL->getOperand(2).isKill();
4559
4560 unsigned SrcReg2;
4561 bool Src2IsKill;
4562 if (ReplacedAddend) {
4563 // If we just generated a new addend, we must be its only user.
4564 SrcReg2 = *ReplacedAddend;
4565 Src2IsKill = true;
4566 } else {
4567 SrcReg2 = Root.getOperand(IdxOtherOpd).getReg();
4568 Src2IsKill = Root.getOperand(IdxOtherOpd).isKill();
4569 }
4570
4571 if (Register::isVirtualRegister(ResultReg))
4572 MRI.constrainRegClass(ResultReg, RC);
4573 if (Register::isVirtualRegister(SrcReg0))
4574 MRI.constrainRegClass(SrcReg0, RC);
4575 if (Register::isVirtualRegister(SrcReg1))
4576 MRI.constrainRegClass(SrcReg1, RC);
4577 if (Register::isVirtualRegister(SrcReg2))
4578 MRI.constrainRegClass(SrcReg2, RC);
4579
4580 MachineInstrBuilder MIB;
4581 if (kind == FMAInstKind::Default)
4582 MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
4583 .addReg(SrcReg0, getKillRegState(Src0IsKill))
4584 .addReg(SrcReg1, getKillRegState(Src1IsKill))
4585 .addReg(SrcReg2, getKillRegState(Src2IsKill));
4586 else if (kind == FMAInstKind::Indexed)
4587 MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
4588 .addReg(SrcReg2, getKillRegState(Src2IsKill))
4589 .addReg(SrcReg0, getKillRegState(Src0IsKill))
4590 .addReg(SrcReg1, getKillRegState(Src1IsKill))
4591 .addImm(MUL->getOperand(3).getImm());
4592 else if (kind == FMAInstKind::Accumulator)
4593 MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
4594 .addReg(SrcReg2, getKillRegState(Src2IsKill))
4595 .addReg(SrcReg0, getKillRegState(Src0IsKill))
4596 .addReg(SrcReg1, getKillRegState(Src1IsKill));
4597 else
4598 assert(false && "Invalid FMA instruction kind \n");
4599 // Insert the MADD (MADD, FMA, FMS, FMLA, FMLS).
4600 InsInstrs.push_back(MIB);
4601 return MUL;
4602 }
4603
4604 /// genFusedMultiplyAcc - Helper to generate fused multiply accumulate
4605 /// instructions.
4606 ///
4607 /// \see genFusedMultiply
4608 static MachineInstr *genFusedMultiplyAcc(
4609 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
4610 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
4611 unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
4612 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
4613 FMAInstKind::Accumulator);
4614 }
4615
4616 /// genNeg - Helper to generate an intermediate negation of the second operand
4617 /// of Root
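/// For example (illustrative), for an FMLS pattern the accumulator operand of
/// Root is first negated (e.g. with an FNEG of the appropriate width) into a
/// fresh virtual register, which the caller then feeds to genFusedMultiply as
/// the replaced addend.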
4618 static Register genNeg(MachineFunction &MF, MachineRegisterInfo &MRI,
4619 const TargetInstrInfo *TII, MachineInstr &Root,
4620 SmallVectorImpl<MachineInstr *> &InsInstrs,
4621 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
4622 unsigned MnegOpc, const TargetRegisterClass *RC) {
4623 Register NewVR = MRI.createVirtualRegister(RC);
4624 MachineInstrBuilder MIB =
4625 BuildMI(MF, Root.getDebugLoc(), TII->get(MnegOpc), NewVR)
4626 .add(Root.getOperand(2));
4627 InsInstrs.push_back(MIB);
4628
4629 assert(InstrIdxForVirtReg.empty());
4630 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
4631
4632 return NewVR;
4633 }
4634
4635 /// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate
4636 /// instructions with an additional negation of the accumulator
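///
/// For illustration only (register names are invented), a MULSUBv2i32_OP1
/// sequence such as
/// \code
///   %mul = MULv2i32 %a, %b
///   %dst = SUBv2i32 %mul, %c        // Root
/// \endcode
/// is rewritten, via genNeg plus genFusedMultiply, into roughly
/// \code
///   %neg = NEGv2i32 %c
///   %dst = MLAv2i32 %neg, %a, %b    // %neg + %a * %b == %a * %b - %c
/// \endcode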
4637 static MachineInstr *genFusedMultiplyAccNeg(
4638 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
4639 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
4640 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
4641 unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
4642 assert(IdxMulOpd == 1);
4643
4644 Register NewVR =
4645 genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
4646 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
4647 FMAInstKind::Accumulator, &NewVR);
4648 }
4649
4650 /// genFusedMultiplyIdx - Helper to generate indexed (by-element) fused
4651 /// multiply accumulate instructions.
4652 ///
4653 /// \see genFusedMultiply
4654 static MachineInstr *genFusedMultiplyIdx(
4655 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
4656 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
4657 unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
4658 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
4659 FMAInstKind::Indexed);
4660 }
4661
4662 /// genFusedMultiplyIdxNeg - Helper to generate indexed (by-element) fused
4663 /// multiply accumulate instructions with an additional negation of the accumulator
4664 static MachineInstr *genFusedMultiplyIdxNeg(
4665 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
4666 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
4667 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
4668 unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
4669 assert(IdxMulOpd == 1);
4670
4671 Register NewVR =
4672 genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
4673
4674 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
4675 FMAInstKind::Indexed, &NewVR);
4676 }
4677
4678 /// genMaddR - Generate madd instruction and combine mul and add using
4679 /// an extra virtual register
4680 /// Example - an ADD intermediate needs to be stored in a register:
4681 /// MUL I=A,B,0
4682 /// ADD R,I,Imm
4683 /// ==> ORR V, ZR, Imm
4684 /// ==> MADD R,A,B,V
4685 /// \param MF Containing MachineFunction
4686 /// \param MRI Register information
4687 /// \param TII Target information
4688 /// \param Root is the ADD instruction
4689 /// \param [out] InsInstrs is a vector of machine instructions and will
4690 /// contain the generated madd instruction
4691 /// \param IdxMulOpd is index of operand in Root that is the result of
4692 /// the MUL. In the example above IdxMulOpd is 1.
4693 /// \param MaddOpc the opcode of the madd instruction
4694 /// \param VR is a virtual register that holds the value of an ADD operand
4695 /// (V in the example above).
4696 /// \param RC Register class of operands
4697 static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI,
4698 const TargetInstrInfo *TII, MachineInstr &Root,
4699 SmallVectorImpl<MachineInstr *> &InsInstrs,
4700 unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR,
4701 const TargetRegisterClass *RC) {
4702 assert(IdxMulOpd == 1 || IdxMulOpd == 2);
4703
4704 MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
4705 Register ResultReg = Root.getOperand(0).getReg();
4706 Register SrcReg0 = MUL->getOperand(1).getReg();
4707 bool Src0IsKill = MUL->getOperand(1).isKill();
4708 Register SrcReg1 = MUL->getOperand(2).getReg();
4709 bool Src1IsKill = MUL->getOperand(2).isKill();
4710
4711 if (Register::isVirtualRegister(ResultReg))
4712 MRI.constrainRegClass(ResultReg, RC);
4713 if (Register::isVirtualRegister(SrcReg0))
4714 MRI.constrainRegClass(SrcReg0, RC);
4715 if (Register::isVirtualRegister(SrcReg1))
4716 MRI.constrainRegClass(SrcReg1, RC);
4717 if (Register::isVirtualRegister(VR))
4718 MRI.constrainRegClass(VR, RC);
4719
4720 MachineInstrBuilder MIB =
4721 BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
4722 .addReg(SrcReg0, getKillRegState(Src0IsKill))
4723 .addReg(SrcReg1, getKillRegState(Src1IsKill))
4724 .addReg(VR);
4725 // Insert the MADD
4726 InsInstrs.push_back(MIB);
4727 return MUL;
4728 }
4729
4730 /// When getMachineCombinerPatterns() finds potential patterns,
4731 /// this function generates the instructions that could replace the
4732 /// original code sequence
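///
/// A rough sketch of the contract, using the MULADD*_OP1 pattern as an
/// example:
/// \code
///   MUL I=A,B,0
///   ADD R,I,C          // Root
///   ==> MADD R,A,B,C
/// \endcode
/// InsInstrs receives the replacement MADD, while DelInstrs receives the MUL
/// and Root. Patterns that need an extra helper instruction (materialising an
/// immediate or negating an operand) also push that helper into InsInstrs and
/// record its new virtual register in InstrIdxForVirtReg.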
4733 void AArch64InstrInfo::genAlternativeCodeSequence(
4734 MachineInstr &Root, MachineCombinerPattern Pattern,
4735 SmallVectorImpl<MachineInstr *> &InsInstrs,
4736 SmallVectorImpl<MachineInstr *> &DelInstrs,
4737 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
4738 MachineBasicBlock &MBB = *Root.getParent();
4739 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4740 MachineFunction &MF = *MBB.getParent();
4741 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
4742
4743 MachineInstr *MUL;
4744 const TargetRegisterClass *RC;
4745 unsigned Opc;
4746 switch (Pattern) {
4747 default:
4748 // Reassociate instructions.
4749 TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
4750 DelInstrs, InstrIdxForVirtReg);
4751 return;
4752 case MachineCombinerPattern::MULADDW_OP1:
4753 case MachineCombinerPattern::MULADDX_OP1:
4754 // MUL I=A,B,0
4755 // ADD R,I,C
4756 // ==> MADD R,A,B,C
4757 // --- Create(MADD);
4758 if (Pattern == MachineCombinerPattern::MULADDW_OP1) {
4759 Opc = AArch64::MADDWrrr;
4760 RC = &AArch64::GPR32RegClass;
4761 } else {
4762 Opc = AArch64::MADDXrrr;
4763 RC = &AArch64::GPR64RegClass;
4764 }
4765 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4766 break;
4767 case MachineCombinerPattern::MULADDW_OP2:
4768 case MachineCombinerPattern::MULADDX_OP2:
4769 // MUL I=A,B,0
4770 // ADD R,C,I
4771 // ==> MADD R,A,B,C
4772 // --- Create(MADD);
4773 if (Pattern == MachineCombinerPattern::MULADDW_OP2) {
4774 Opc = AArch64::MADDWrrr;
4775 RC = &AArch64::GPR32RegClass;
4776 } else {
4777 Opc = AArch64::MADDXrrr;
4778 RC = &AArch64::GPR64RegClass;
4779 }
4780 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4781 break;
4782 case MachineCombinerPattern::MULADDWI_OP1:
4783 case MachineCombinerPattern::MULADDXI_OP1: {
4784 // MUL I=A,B,0
4785 // ADD R,I,Imm
4786 // ==> ORR V, ZR, Imm
4787 // ==> MADD R,A,B,V
4788 // --- Create(MADD);
4789 const TargetRegisterClass *OrrRC;
4790 unsigned BitSize, OrrOpc, ZeroReg;
4791 if (Pattern == MachineCombinerPattern::MULADDWI_OP1) {
4792 OrrOpc = AArch64::ORRWri;
4793 OrrRC = &AArch64::GPR32spRegClass;
4794 BitSize = 32;
4795 ZeroReg = AArch64::WZR;
4796 Opc = AArch64::MADDWrrr;
4797 RC = &AArch64::GPR32RegClass;
4798 } else {
4799 OrrOpc = AArch64::ORRXri;
4800 OrrRC = &AArch64::GPR64spRegClass;
4801 BitSize = 64;
4802 ZeroReg = AArch64::XZR;
4803 Opc = AArch64::MADDXrrr;
4804 RC = &AArch64::GPR64RegClass;
4805 }
4806 Register NewVR = MRI.createVirtualRegister(OrrRC);
4807 uint64_t Imm = Root.getOperand(2).getImm();
4808
4809 if (Root.getOperand(3).isImm()) {
4810 unsigned Val = Root.getOperand(3).getImm();
4811 Imm = Imm << Val;
4812 }
4813 uint64_t UImm = SignExtend64(Imm, BitSize);
4814 uint64_t Encoding;
4815 if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
4816 MachineInstrBuilder MIB1 =
4817 BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR)
4818 .addReg(ZeroReg)
4819 .addImm(Encoding);
4820 InsInstrs.push_back(MIB1);
4821 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
4822 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
4823 }
4824 break;
4825 }
4826 case MachineCombinerPattern::MULSUBW_OP1:
4827 case MachineCombinerPattern::MULSUBX_OP1: {
4828 // MUL I=A,B,0
4829 // SUB R,I, C
4830 // ==> SUB V, 0, C
4831 // ==> MADD R,A,B,V // = -C + A*B
4832 // --- Create(MADD);
4833 const TargetRegisterClass *SubRC;
4834 unsigned SubOpc, ZeroReg;
4835 if (Pattern == MachineCombinerPattern::MULSUBW_OP1) {
4836 SubOpc = AArch64::SUBWrr;
4837 SubRC = &AArch64::GPR32spRegClass;
4838 ZeroReg = AArch64::WZR;
4839 Opc = AArch64::MADDWrrr;
4840 RC = &AArch64::GPR32RegClass;
4841 } else {
4842 SubOpc = AArch64::SUBXrr;
4843 SubRC = &AArch64::GPR64spRegClass;
4844 ZeroReg = AArch64::XZR;
4845 Opc = AArch64::MADDXrrr;
4846 RC = &AArch64::GPR64RegClass;
4847 }
4848 Register NewVR = MRI.createVirtualRegister(SubRC);
4849 // SUB NewVR, 0, C
4850 MachineInstrBuilder MIB1 =
4851 BuildMI(MF, Root.getDebugLoc(), TII->get(SubOpc), NewVR)
4852 .addReg(ZeroReg)
4853 .add(Root.getOperand(2));
4854 InsInstrs.push_back(MIB1);
4855 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
4856 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
4857 break;
4858 }
4859 case MachineCombinerPattern::MULSUBW_OP2:
4860 case MachineCombinerPattern::MULSUBX_OP2:
4861 // MUL I=A,B,0
4862 // SUB R,C,I
4863 // ==> MSUB R,A,B,C (computes C - A*B)
4864 // --- Create(MSUB);
4865 if (Pattern == MachineCombinerPattern::MULSUBW_OP2) {
4866 Opc = AArch64::MSUBWrrr;
4867 RC = &AArch64::GPR32RegClass;
4868 } else {
4869 Opc = AArch64::MSUBXrrr;
4870 RC = &AArch64::GPR64RegClass;
4871 }
4872 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4873 break;
4874 case MachineCombinerPattern::MULSUBWI_OP1:
4875 case MachineCombinerPattern::MULSUBXI_OP1: {
4876 // MUL I=A,B,0
4877 // SUB R,I, Imm
4878 // ==> ORR V, ZR, -Imm
4879 // ==> MADD R,A,B,V // = -Imm + A*B
4880 // --- Create(MADD);
4881 const TargetRegisterClass *OrrRC;
4882 unsigned BitSize, OrrOpc, ZeroReg;
4883 if (Pattern == MachineCombinerPattern::MULSUBWI_OP1) {
4884 OrrOpc = AArch64::ORRWri;
4885 OrrRC = &AArch64::GPR32spRegClass;
4886 BitSize = 32;
4887 ZeroReg = AArch64::WZR;
4888 Opc = AArch64::MADDWrrr;
4889 RC = &AArch64::GPR32RegClass;
4890 } else {
4891 OrrOpc = AArch64::ORRXri;
4892 OrrRC = &AArch64::GPR64spRegClass;
4893 BitSize = 64;
4894 ZeroReg = AArch64::XZR;
4895 Opc = AArch64::MADDXrrr;
4896 RC = &AArch64::GPR64RegClass;
4897 }
4898 Register NewVR = MRI.createVirtualRegister(OrrRC);
4899 uint64_t Imm = Root.getOperand(2).getImm();
4900 if (Root.getOperand(3).isImm()) {
4901 unsigned Val = Root.getOperand(3).getImm();
4902 Imm = Imm << Val;
4903 }
4904 uint64_t UImm = SignExtend64(-Imm, BitSize);
4905 uint64_t Encoding;
4906 if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
4907 MachineInstrBuilder MIB1 =
4908 BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR)
4909 .addReg(ZeroReg)
4910 .addImm(Encoding);
4911 InsInstrs.push_back(MIB1);
4912 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
4913 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
4914 }
4915 break;
4916 }
4917
4918 case MachineCombinerPattern::MULADDv8i8_OP1:
4919 Opc = AArch64::MLAv8i8;
4920 RC = &AArch64::FPR64RegClass;
4921 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4922 break;
4923 case MachineCombinerPattern::MULADDv8i8_OP2:
4924 Opc = AArch64::MLAv8i8;
4925 RC = &AArch64::FPR64RegClass;
4926 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4927 break;
4928 case MachineCombinerPattern::MULADDv16i8_OP1:
4929 Opc = AArch64::MLAv16i8;
4930 RC = &AArch64::FPR128RegClass;
4931 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4932 break;
4933 case MachineCombinerPattern::MULADDv16i8_OP2:
4934 Opc = AArch64::MLAv16i8;
4935 RC = &AArch64::FPR128RegClass;
4936 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4937 break;
4938 case MachineCombinerPattern::MULADDv4i16_OP1:
4939 Opc = AArch64::MLAv4i16;
4940 RC = &AArch64::FPR64RegClass;
4941 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4942 break;
4943 case MachineCombinerPattern::MULADDv4i16_OP2:
4944 Opc = AArch64::MLAv4i16;
4945 RC = &AArch64::FPR64RegClass;
4946 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4947 break;
4948 case MachineCombinerPattern::MULADDv8i16_OP1:
4949 Opc = AArch64::MLAv8i16;
4950 RC = &AArch64::FPR128RegClass;
4951 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4952 break;
4953 case MachineCombinerPattern::MULADDv8i16_OP2:
4954 Opc = AArch64::MLAv8i16;
4955 RC = &AArch64::FPR128RegClass;
4956 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4957 break;
4958 case MachineCombinerPattern::MULADDv2i32_OP1:
4959 Opc = AArch64::MLAv2i32;
4960 RC = &AArch64::FPR64RegClass;
4961 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4962 break;
4963 case MachineCombinerPattern::MULADDv2i32_OP2:
4964 Opc = AArch64::MLAv2i32;
4965 RC = &AArch64::FPR64RegClass;
4966 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4967 break;
4968 case MachineCombinerPattern::MULADDv4i32_OP1:
4969 Opc = AArch64::MLAv4i32;
4970 RC = &AArch64::FPR128RegClass;
4971 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4972 break;
4973 case MachineCombinerPattern::MULADDv4i32_OP2:
4974 Opc = AArch64::MLAv4i32;
4975 RC = &AArch64::FPR128RegClass;
4976 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4977 break;
4978
4979 case MachineCombinerPattern::MULSUBv8i8_OP1:
4980 Opc = AArch64::MLAv8i8;
4981 RC = &AArch64::FPR64RegClass;
4982 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
4983 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i8,
4984 RC);
4985 break;
4986 case MachineCombinerPattern::MULSUBv8i8_OP2:
4987 Opc = AArch64::MLSv8i8;
4988 RC = &AArch64::FPR64RegClass;
4989 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4990 break;
4991 case MachineCombinerPattern::MULSUBv16i8_OP1:
4992 Opc = AArch64::MLAv16i8;
4993 RC = &AArch64::FPR128RegClass;
4994 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
4995 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv16i8,
4996 RC);
4997 break;
4998 case MachineCombinerPattern::MULSUBv16i8_OP2:
4999 Opc = AArch64::MLSv16i8;
5000 RC = &AArch64::FPR128RegClass;
5001 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
5002 break;
5003 case MachineCombinerPattern::MULSUBv4i16_OP1:
5004 Opc = AArch64::MLAv4i16;
5005 RC = &AArch64::FPR64RegClass;
5006 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
5007 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
5008 RC);
5009 break;
5010 case MachineCombinerPattern::MULSUBv4i16_OP2:
5011 Opc = AArch64::MLSv4i16;
5012 RC = &AArch64::FPR64RegClass;
5013 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
5014 break;
5015 case MachineCombinerPattern::MULSUBv8i16_OP1:
5016 Opc = AArch64::MLAv8i16;
5017 RC = &AArch64::FPR128RegClass;
5018 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
5019 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
5020 RC);
5021 break;
5022 case MachineCombinerPattern::MULSUBv8i16_OP2:
5023 Opc = AArch64::MLSv8i16;
5024 RC = &AArch64::FPR128RegClass;
5025 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
5026 break;
5027 case MachineCombinerPattern::MULSUBv2i32_OP1:
5028 Opc = AArch64::MLAv2i32;
5029 RC = &AArch64::FPR64RegClass;
5030 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
5031 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
5032 RC);
5033 break;
5034 case MachineCombinerPattern::MULSUBv2i32_OP2:
5035 Opc = AArch64::MLSv2i32;
5036 RC = &AArch64::FPR64RegClass;
5037 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
5038 break;
5039 case MachineCombinerPattern::MULSUBv4i32_OP1:
5040 Opc = AArch64::MLAv4i32;
5041 RC = &AArch64::FPR128RegClass;
5042 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
5043 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
5044 RC);
5045 break;
5046 case MachineCombinerPattern::MULSUBv4i32_OP2:
5047 Opc = AArch64::MLSv4i32;
5048 RC = &AArch64::FPR128RegClass;
5049 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
5050 break;
5051
5052 case MachineCombinerPattern::MULADDv4i16_indexed_OP1:
5053 Opc = AArch64::MLAv4i16_indexed;
5054 RC = &AArch64::FPR64RegClass;
5055 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
5056 break;
5057 case MachineCombinerPattern::MULADDv4i16_indexed_OP2:
5058 Opc = AArch64::MLAv4i16_indexed;
5059 RC = &AArch64::FPR64RegClass;
5060 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
5061 break;
5062 case MachineCombinerPattern::MULADDv8i16_indexed_OP1:
5063 Opc = AArch64::MLAv8i16_indexed;
5064 RC = &AArch64::FPR128RegClass;
5065 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
5066 break;
5067 case MachineCombinerPattern::MULADDv8i16_indexed_OP2:
5068 Opc = AArch64::MLAv8i16_indexed;
5069 RC = &AArch64::FPR128RegClass;
5070 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
5071 break;
5072 case MachineCombinerPattern::MULADDv2i32_indexed_OP1:
5073 Opc = AArch64::MLAv2i32_indexed;
5074 RC = &AArch64::FPR64RegClass;
5075 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
5076 break;
5077 case MachineCombinerPattern::MULADDv2i32_indexed_OP2:
5078 Opc = AArch64::MLAv2i32_indexed;
5079 RC = &AArch64::FPR64RegClass;
5080 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
5081 break;
5082 case MachineCombinerPattern::MULADDv4i32_indexed_OP1:
5083 Opc = AArch64::MLAv4i32_indexed;
5084 RC = &AArch64::FPR128RegClass;
5085 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
5086 break;
5087 case MachineCombinerPattern::MULADDv4i32_indexed_OP2:
5088 Opc = AArch64::MLAv4i32_indexed;
5089 RC = &AArch64::FPR128RegClass;
5090 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
5091 break;
5092
5093 case MachineCombinerPattern::MULSUBv4i16_indexed_OP1:
5094 Opc = AArch64::MLAv4i16_indexed;
5095 RC = &AArch64::FPR64RegClass;
5096 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
5097 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
5098 RC);
5099 break;
5100 case MachineCombinerPattern::MULSUBv4i16_indexed_OP2:
5101 Opc = AArch64::MLSv4i16_indexed;
5102 RC = &AArch64::FPR64RegClass;
5103 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
5104 break;
5105 case MachineCombinerPattern::MULSUBv8i16_indexed_OP1:
5106 Opc = AArch64::MLAv8i16_indexed;
5107 RC = &AArch64::FPR128RegClass;
5108 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
5109 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
5110 RC);
5111 break;
5112 case MachineCombinerPattern::MULSUBv8i16_indexed_OP2:
5113 Opc = AArch64::MLSv8i16_indexed;
5114 RC = &AArch64::FPR128RegClass;
5115 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
5116 break;
5117 case MachineCombinerPattern::MULSUBv2i32_indexed_OP1:
5118 Opc = AArch64::MLAv2i32_indexed;
5119 RC = &AArch64::FPR64RegClass;
5120 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
5121 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
5122 RC);
5123 break;
5124 case MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
5125 Opc = AArch64::MLSv2i32_indexed;
5126 RC = &AArch64::FPR64RegClass;
5127 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
5128 break;
5129 case MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
5130 Opc = AArch64::MLAv4i32_indexed;
5131 RC = &AArch64::FPR128RegClass;
5132 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
5133 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
5134 RC);
5135 break;
5136 case MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
5137 Opc = AArch64::MLSv4i32_indexed;
5138 RC = &AArch64::FPR128RegClass;
5139 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
5140 break;
5141
5142 // Floating Point Support
5143 case MachineCombinerPattern::FMULADDH_OP1:
5144 Opc = AArch64::FMADDHrrr;
5145 RC = &AArch64::FPR16RegClass;
5146 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
5147 break;
5148 case MachineCombinerPattern::FMULADDS_OP1:
5149 Opc = AArch64::FMADDSrrr;
5150 RC = &AArch64::FPR32RegClass;
5151 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
5152 break;
5153 case MachineCombinerPattern::FMULADDD_OP1:
5154 Opc = AArch64::FMADDDrrr;
5155 RC = &AArch64::FPR64RegClass;
5156 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
5157 break;
5158
5159 case MachineCombinerPattern::FMULADDH_OP2:
5160 Opc = AArch64::FMADDHrrr;
5161 RC = &AArch64::FPR16RegClass;
5162 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
5163 break;
5164 case MachineCombinerPattern::FMULADDS_OP2:
5165 Opc = AArch64::FMADDSrrr;
5166 RC = &AArch64::FPR32RegClass;
5167 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
5168 break;
5169 case MachineCombinerPattern::FMULADDD_OP2:
5170 Opc = AArch64::FMADDDrrr;
5171 RC = &AArch64::FPR64RegClass;
5172 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
5173 break;
5174
5175 case MachineCombinerPattern::FMLAv1i32_indexed_OP1:
5176 Opc = AArch64::FMLAv1i32_indexed;
5177 RC = &AArch64::FPR32RegClass;
5178 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5179 FMAInstKind::Indexed);
5180 break;
5181 case MachineCombinerPattern::FMLAv1i32_indexed_OP2:
5182 Opc = AArch64::FMLAv1i32_indexed;
5183 RC = &AArch64::FPR32RegClass;
5184 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5185 FMAInstKind::Indexed);
5186 break;
5187
5188 case MachineCombinerPattern::FMLAv1i64_indexed_OP1:
5189 Opc = AArch64::FMLAv1i64_indexed;
5190 RC = &AArch64::FPR64RegClass;
5191 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5192 FMAInstKind::Indexed);
5193 break;
5194 case MachineCombinerPattern::FMLAv1i64_indexed_OP2:
5195 Opc = AArch64::FMLAv1i64_indexed;
5196 RC = &AArch64::FPR64RegClass;
5197 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5198 FMAInstKind::Indexed);
5199 break;
5200
5201 case MachineCombinerPattern::FMLAv4i16_indexed_OP1:
5202 RC = &AArch64::FPR64RegClass;
5203 Opc = AArch64::FMLAv4i16_indexed;
5204 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5205 FMAInstKind::Indexed);
5206 break;
5207 case MachineCombinerPattern::FMLAv4f16_OP1:
5208 RC = &AArch64::FPR64RegClass;
5209 Opc = AArch64::FMLAv4f16;
5210 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5211 FMAInstKind::Accumulator);
5212 break;
5213 case MachineCombinerPattern::FMLAv4i16_indexed_OP2:
5214 RC = &AArch64::FPR64RegClass;
5215 Opc = AArch64::FMLAv4i16_indexed;
5216 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5217 FMAInstKind::Indexed);
5218 break;
5219 case MachineCombinerPattern::FMLAv4f16_OP2:
5220 RC = &AArch64::FPR64RegClass;
5221 Opc = AArch64::FMLAv4f16;
5222 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5223 FMAInstKind::Accumulator);
5224 break;
5225
5226 case MachineCombinerPattern::FMLAv2i32_indexed_OP1:
5227 case MachineCombinerPattern::FMLAv2f32_OP1:
5228 RC = &AArch64::FPR64RegClass;
5229 if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP1) {
5230 Opc = AArch64::FMLAv2i32_indexed;
5231 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5232 FMAInstKind::Indexed);
5233 } else {
5234 Opc = AArch64::FMLAv2f32;
5235 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5236 FMAInstKind::Accumulator);
5237 }
5238 break;
5239 case MachineCombinerPattern::FMLAv2i32_indexed_OP2:
5240 case MachineCombinerPattern::FMLAv2f32_OP2:
5241 RC = &AArch64::FPR64RegClass;
5242 if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP2) {
5243 Opc = AArch64::FMLAv2i32_indexed;
5244 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5245 FMAInstKind::Indexed);
5246 } else {
5247 Opc = AArch64::FMLAv2f32;
5248 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5249 FMAInstKind::Accumulator);
5250 }
5251 break;
5252
5253 case MachineCombinerPattern::FMLAv8i16_indexed_OP1:
5254 RC = &AArch64::FPR128RegClass;
5255 Opc = AArch64::FMLAv8i16_indexed;
5256 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5257 FMAInstKind::Indexed);
5258 break;
5259 case MachineCombinerPattern::FMLAv8f16_OP1:
5260 RC = &AArch64::FPR128RegClass;
5261 Opc = AArch64::FMLAv8f16;
5262 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5263 FMAInstKind::Accumulator);
5264 break;
5265 case MachineCombinerPattern::FMLAv8i16_indexed_OP2:
5266 RC = &AArch64::FPR128RegClass;
5267 Opc = AArch64::FMLAv8i16_indexed;
5268 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5269 FMAInstKind::Indexed);
5270 break;
5271 case MachineCombinerPattern::FMLAv8f16_OP2:
5272 RC = &AArch64::FPR128RegClass;
5273 Opc = AArch64::FMLAv8f16;
5274 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5275 FMAInstKind::Accumulator);
5276 break;
5277
5278 case MachineCombinerPattern::FMLAv2i64_indexed_OP1:
5279 case MachineCombinerPattern::FMLAv2f64_OP1:
5280 RC = &AArch64::FPR128RegClass;
5281 if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP1) {
5282 Opc = AArch64::FMLAv2i64_indexed;
5283 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5284 FMAInstKind::Indexed);
5285 } else {
5286 Opc = AArch64::FMLAv2f64;
5287 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5288 FMAInstKind::Accumulator);
5289 }
5290 break;
5291 case MachineCombinerPattern::FMLAv2i64_indexed_OP2:
5292 case MachineCombinerPattern::FMLAv2f64_OP2:
5293 RC = &AArch64::FPR128RegClass;
5294 if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP2) {
5295 Opc = AArch64::FMLAv2i64_indexed;
5296 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5297 FMAInstKind::Indexed);
5298 } else {
5299 Opc = AArch64::FMLAv2f64;
5300 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5301 FMAInstKind::Accumulator);
5302 }
5303 break;
5304
5305 case MachineCombinerPattern::FMLAv4i32_indexed_OP1:
5306 case MachineCombinerPattern::FMLAv4f32_OP1:
5307 RC = &AArch64::FPR128RegClass;
5308 if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP1) {
5309 Opc = AArch64::FMLAv4i32_indexed;
5310 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5311 FMAInstKind::Indexed);
5312 } else {
5313 Opc = AArch64::FMLAv4f32;
5314 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5315 FMAInstKind::Accumulator);
5316 }
5317 break;
5318
5319 case MachineCombinerPattern::FMLAv4i32_indexed_OP2:
5320 case MachineCombinerPattern::FMLAv4f32_OP2:
5321 RC = &AArch64::FPR128RegClass;
5322 if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP2) {
5323 Opc = AArch64::FMLAv4i32_indexed;
5324 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5325 FMAInstKind::Indexed);
5326 } else {
5327 Opc = AArch64::FMLAv4f32;
5328 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5329 FMAInstKind::Accumulator);
5330 }
5331 break;
5332
5333 case MachineCombinerPattern::FMULSUBH_OP1:
5334 Opc = AArch64::FNMSUBHrrr;
5335 RC = &AArch64::FPR16RegClass;
5336 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
5337 break;
5338 case MachineCombinerPattern::FMULSUBS_OP1:
5339 Opc = AArch64::FNMSUBSrrr;
5340 RC = &AArch64::FPR32RegClass;
5341 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
5342 break;
5343 case MachineCombinerPattern::FMULSUBD_OP1:
5344 Opc = AArch64::FNMSUBDrrr;
5345 RC = &AArch64::FPR64RegClass;
5346 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
5347 break;
5348
5349 case MachineCombinerPattern::FNMULSUBH_OP1:
5350 Opc = AArch64::FNMADDHrrr;
5351 RC = &AArch64::FPR16RegClass;
5352 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
5353 break;
5354 case MachineCombinerPattern::FNMULSUBS_OP1:
5355 Opc = AArch64::FNMADDSrrr;
5356 RC = &AArch64::FPR32RegClass;
5357 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
5358 break;
5359 case MachineCombinerPattern::FNMULSUBD_OP1:
5360 Opc = AArch64::FNMADDDrrr;
5361 RC = &AArch64::FPR64RegClass;
5362 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
5363 break;
5364
5365 case MachineCombinerPattern::FMULSUBH_OP2:
5366 Opc = AArch64::FMSUBHrrr;
5367 RC = &AArch64::FPR16RegClass;
5368 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
5369 break;
5370 case MachineCombinerPattern::FMULSUBS_OP2:
5371 Opc = AArch64::FMSUBSrrr;
5372 RC = &AArch64::FPR32RegClass;
5373 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
5374 break;
5375 case MachineCombinerPattern::FMULSUBD_OP2:
5376 Opc = AArch64::FMSUBDrrr;
5377 RC = &AArch64::FPR64RegClass;
5378 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
5379 break;
5380
5381 case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
5382 Opc = AArch64::FMLSv1i32_indexed;
5383 RC = &AArch64::FPR32RegClass;
5384 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5385 FMAInstKind::Indexed);
5386 break;
5387
5388 case MachineCombinerPattern::FMLSv1i64_indexed_OP2:
5389 Opc = AArch64::FMLSv1i64_indexed;
5390 RC = &AArch64::FPR64RegClass;
5391 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5392 FMAInstKind::Indexed);
5393 break;
5394
5395 case MachineCombinerPattern::FMLSv4f16_OP1:
5396 case MachineCombinerPattern::FMLSv4i16_indexed_OP1: {
5397 RC = &AArch64::FPR64RegClass;
5398 Register NewVR = MRI.createVirtualRegister(RC);
5399 MachineInstrBuilder MIB1 =
5400 BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv4f16), NewVR)
5401 .add(Root.getOperand(2));
5402 InsInstrs.push_back(MIB1);
5403 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
5404 if (Pattern == MachineCombinerPattern::FMLSv4f16_OP1) {
5405 Opc = AArch64::FMLAv4f16;
5406 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5407 FMAInstKind::Accumulator, &NewVR);
5408 } else {
5409 Opc = AArch64::FMLAv4i16_indexed;
5410 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5411 FMAInstKind::Indexed, &NewVR);
5412 }
5413 break;
5414 }
5415 case MachineCombinerPattern::FMLSv4f16_OP2:
5416 RC = &AArch64::FPR64RegClass;
5417 Opc = AArch64::FMLSv4f16;
5418 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5419 FMAInstKind::Accumulator);
5420 break;
5421 case MachineCombinerPattern::FMLSv4i16_indexed_OP2:
5422 RC = &AArch64::FPR64RegClass;
5423 Opc = AArch64::FMLSv4i16_indexed;
5424 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5425 FMAInstKind::Indexed);
5426 break;
5427
5428 case MachineCombinerPattern::FMLSv2f32_OP2:
5429 case MachineCombinerPattern::FMLSv2i32_indexed_OP2:
5430 RC = &AArch64::FPR64RegClass;
5431 if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP2) {
5432 Opc = AArch64::FMLSv2i32_indexed;
5433 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5434 FMAInstKind::Indexed);
5435 } else {
5436 Opc = AArch64::FMLSv2f32;
5437 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5438 FMAInstKind::Accumulator);
5439 }
5440 break;
5441
5442 case MachineCombinerPattern::FMLSv8f16_OP1:
5443 case MachineCombinerPattern::FMLSv8i16_indexed_OP1: {
5444 RC = &AArch64::FPR128RegClass;
5445 Register NewVR = MRI.createVirtualRegister(RC);
5446 MachineInstrBuilder MIB1 =
5447 BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv8f16), NewVR)
5448 .add(Root.getOperand(2));
5449 InsInstrs.push_back(MIB1);
5450 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
5451 if (Pattern == MachineCombinerPattern::FMLSv8f16_OP1) {
5452 Opc = AArch64::FMLAv8f16;
5453 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5454 FMAInstKind::Accumulator, &NewVR);
5455 } else {
5456 Opc = AArch64::FMLAv8i16_indexed;
5457 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5458 FMAInstKind::Indexed, &NewVR);
5459 }
5460 break;
5461 }
5462 case MachineCombinerPattern::FMLSv8f16_OP2:
5463 RC = &AArch64::FPR128RegClass;
5464 Opc = AArch64::FMLSv8f16;
5465 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5466 FMAInstKind::Accumulator);
5467 break;
5468 case MachineCombinerPattern::FMLSv8i16_indexed_OP2:
5469 RC = &AArch64::FPR128RegClass;
5470 Opc = AArch64::FMLSv8i16_indexed;
5471 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5472 FMAInstKind::Indexed);
5473 break;
5474
5475 case MachineCombinerPattern::FMLSv2f64_OP2:
5476 case MachineCombinerPattern::FMLSv2i64_indexed_OP2:
5477 RC = &AArch64::FPR128RegClass;
5478 if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP2) {
5479 Opc = AArch64::FMLSv2i64_indexed;
5480 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5481 FMAInstKind::Indexed);
5482 } else {
5483 Opc = AArch64::FMLSv2f64;
5484 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5485 FMAInstKind::Accumulator);
5486 }
5487 break;
5488
5489 case MachineCombinerPattern::FMLSv4f32_OP2:
5490 case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
5491 RC = &AArch64::FPR128RegClass;
5492 if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP2) {
5493 Opc = AArch64::FMLSv4i32_indexed;
5494 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5495 FMAInstKind::Indexed);
5496 } else {
5497 Opc = AArch64::FMLSv4f32;
5498 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5499 FMAInstKind::Accumulator);
5500 }
5501 break;
5502 case MachineCombinerPattern::FMLSv2f32_OP1:
5503 case MachineCombinerPattern::FMLSv2i32_indexed_OP1: {
5504 RC = &AArch64::FPR64RegClass;
5505 Register NewVR = MRI.createVirtualRegister(RC);
5506 MachineInstrBuilder MIB1 =
5507 BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f32), NewVR)
5508 .add(Root.getOperand(2));
5509 InsInstrs.push_back(MIB1);
5510 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
5511 if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP1) {
5512 Opc = AArch64::FMLAv2i32_indexed;
5513 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5514 FMAInstKind::Indexed, &NewVR);
5515 } else {
5516 Opc = AArch64::FMLAv2f32;
5517 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5518 FMAInstKind::Accumulator, &NewVR);
5519 }
5520 break;
5521 }
5522 case MachineCombinerPattern::FMLSv4f32_OP1:
5523 case MachineCombinerPattern::FMLSv4i32_indexed_OP1: {
5524 RC = &AArch64::FPR128RegClass;
5525 Register NewVR = MRI.createVirtualRegister(RC);
5526 MachineInstrBuilder MIB1 =
5527 BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv4f32), NewVR)
5528 .add(Root.getOperand(2));
5529 InsInstrs.push_back(MIB1);
5530 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
5531 if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP1) {
5532 Opc = AArch64::FMLAv4i32_indexed;
5533 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5534 FMAInstKind::Indexed, &NewVR);
5535 } else {
5536 Opc = AArch64::FMLAv4f32;
5537 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5538 FMAInstKind::Accumulator, &NewVR);
5539 }
5540 break;
5541 }
5542 case MachineCombinerPattern::FMLSv2f64_OP1:
5543 case MachineCombinerPattern::FMLSv2i64_indexed_OP1: {
5544 RC = &AArch64::FPR128RegClass;
5545 Register NewVR = MRI.createVirtualRegister(RC);
5546 MachineInstrBuilder MIB1 =
5547 BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f64), NewVR)
5548 .add(Root.getOperand(2));
5549 InsInstrs.push_back(MIB1);
5550 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
5551 if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP1) {
5552 Opc = AArch64::FMLAv2i64_indexed;
5553 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5554 FMAInstKind::Indexed, &NewVR);
5555 } else {
5556 Opc = AArch64::FMLAv2f64;
5557 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5558 FMAInstKind::Accumulator, &NewVR);
5559 }
5560 break;
5561 }
5562 } // end switch (Pattern)
5563 // Record MUL and ADD/SUB for deletion
5564 DelInstrs.push_back(MUL);
5565 DelInstrs.push_back(&Root);
5566 }
5567
5568 /// Replace csincr-branch sequence by simple conditional branch
5569 ///
5570 /// Examples:
5571 /// 1. \code
5572 /// csinc w9, wzr, wzr, <condition code>
5573 /// tbnz w9, #0, 0x44
5574 /// \endcode
5575 /// to
5576 /// \code
5577 /// b.<inverted condition code>
5578 /// \endcode
5579 ///
5580 /// 2. \code
5581 /// csinc w9, wzr, wzr, <condition code>
5582 /// tbz w9, #0, 0x44
5583 /// \endcode
5584 /// to
5585 /// \code
5586 /// b.<condition code>
5587 /// \endcode
5588 ///
5589 /// Replace compare and branch sequence by TBZ/TBNZ instruction when the
5590 /// compare's constant operand is a power of 2.
5591 ///
5592 /// Examples:
5593 /// \code
5594 /// and w8, w8, #0x400
5595 /// cbnz w8, L1
5596 /// \endcode
5597 /// to
5598 /// \code
5599 /// tbnz w8, #10, L1
5600 /// \endcode
5601 ///
5602 /// \param MI Conditional Branch
5603 /// \return True when the simple conditional branch is generated
5604 ///
5605 bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const {
5606 bool IsNegativeBranch = false;
5607 bool IsTestAndBranch = false;
5608 unsigned TargetBBInMI = 0;
5609 switch (MI.getOpcode()) {
5610 default:
5611 llvm_unreachable("Unknown branch instruction?");
5612 case AArch64::Bcc:
5613 return false;
5614 case AArch64::CBZW:
5615 case AArch64::CBZX:
5616 TargetBBInMI = 1;
5617 break;
5618 case AArch64::CBNZW:
5619 case AArch64::CBNZX:
5620 TargetBBInMI = 1;
5621 IsNegativeBranch = true;
5622 break;
5623 case AArch64::TBZW:
5624 case AArch64::TBZX:
5625 TargetBBInMI = 2;
5626 IsTestAndBranch = true;
5627 break;
5628 case AArch64::TBNZW:
5629 case AArch64::TBNZX:
5630 TargetBBInMI = 2;
5631 IsNegativeBranch = true;
5632 IsTestAndBranch = true;
5633 break;
5634 }
5635 // So we increment a zero register and test for bits other
5636 // than bit 0? Conservatively bail out in case the verifier
5637 // missed this case.
5638 if (IsTestAndBranch && MI.getOperand(1).getImm())
5639 return false;
5640
5641 // Find Definition.
5642   assert(MI.getParent() && "Incomplete machine instruction\n");
5643 MachineBasicBlock *MBB = MI.getParent();
5644 MachineFunction *MF = MBB->getParent();
5645 MachineRegisterInfo *MRI = &MF->getRegInfo();
5646 Register VReg = MI.getOperand(0).getReg();
5647 if (!Register::isVirtualRegister(VReg))
5648 return false;
5649
5650 MachineInstr *DefMI = MRI->getVRegDef(VReg);
5651
5652 // Look through COPY instructions to find definition.
5653 while (DefMI->isCopy()) {
5654 Register CopyVReg = DefMI->getOperand(1).getReg();
5655 if (!MRI->hasOneNonDBGUse(CopyVReg))
5656 return false;
5657 if (!MRI->hasOneDef(CopyVReg))
5658 return false;
5659 DefMI = MRI->getVRegDef(CopyVReg);
5660 }
5661
5662 switch (DefMI->getOpcode()) {
5663 default:
5664 return false;
5665 // Fold AND into a TBZ/TBNZ if constant operand is power of 2.
5666 case AArch64::ANDWri:
5667 case AArch64::ANDXri: {
5668 if (IsTestAndBranch)
5669 return false;
5670 if (DefMI->getParent() != MBB)
5671 return false;
5672 if (!MRI->hasOneNonDBGUse(VReg))
5673 return false;
5674
5675 bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri);
5676 uint64_t Mask = AArch64_AM::decodeLogicalImmediate(
5677 DefMI->getOperand(2).getImm(), Is32Bit ? 32 : 64);
5678 if (!isPowerOf2_64(Mask))
5679 return false;
5680
5681 MachineOperand &MO = DefMI->getOperand(1);
5682 Register NewReg = MO.getReg();
5683 if (!Register::isVirtualRegister(NewReg))
5684 return false;
5685
5686 assert(!MRI->def_empty(NewReg) && "Register must be defined.");
5687
5688 MachineBasicBlock &RefToMBB = *MBB;
5689 MachineBasicBlock *TBB = MI.getOperand(1).getMBB();
5690 DebugLoc DL = MI.getDebugLoc();
5691 unsigned Imm = Log2_64(Mask);
5692 unsigned Opc = (Imm < 32)
5693 ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW)
5694 : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX);
5695 MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc))
5696 .addReg(NewReg)
5697 .addImm(Imm)
5698 .addMBB(TBB);
5699     // Register lives on to the TBZ/TBNZ now.
5700 MO.setIsKill(false);
5701
5702     // For immediates smaller than 32, we need to use the 32-bit
5703     // variant (W) in all cases; the 64-bit variant cannot
5704     // encode them.
5705     // Therefore, if the input register is 64-bit, we need to take the
5706     // 32-bit sub-register.
5707 if (!Is32Bit && Imm < 32)
5708 NewMI->getOperand(0).setSubReg(AArch64::sub_32);
5709 MI.eraseFromParent();
5710 return true;
5711 }
5712 // Look for CSINC
5713 case AArch64::CSINCWr:
5714 case AArch64::CSINCXr: {
5715 if (!(DefMI->getOperand(1).getReg() == AArch64::WZR &&
5716 DefMI->getOperand(2).getReg() == AArch64::WZR) &&
5717 !(DefMI->getOperand(1).getReg() == AArch64::XZR &&
5718 DefMI->getOperand(2).getReg() == AArch64::XZR))
5719 return false;
5720
5721 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) != -1)
5722 return false;
5723
5724 AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm();
5725 // Convert only when the condition code is not modified between
5726 // the CSINC and the branch. The CC may be used by other
5727 // instructions in between.
5728 if (areCFlagsAccessedBetweenInstrs(DefMI, MI, &getRegisterInfo(), AK_Write))
5729 return false;
5730 MachineBasicBlock &RefToMBB = *MBB;
5731 MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB();
5732 DebugLoc DL = MI.getDebugLoc();
5733 if (IsNegativeBranch)
5734 CC = AArch64CC::getInvertedCondCode(CC);
5735 BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB);
5736 MI.eraseFromParent();
5737 return true;
5738 }
5739 }
5740 }
5741
5742 std::pair<unsigned, unsigned>
5743 AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
5744 const unsigned Mask = AArch64II::MO_FRAGMENT;
5745 return std::make_pair(TF & Mask, TF & ~Mask);
5746 }
5747
5748 ArrayRef<std::pair<unsigned, const char *>>
5749 AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
5750 using namespace AArch64II;
5751
5752 static const std::pair<unsigned, const char *> TargetFlags[] = {
5753 {MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"},
5754 {MO_G3, "aarch64-g3"}, {MO_G2, "aarch64-g2"},
5755 {MO_G1, "aarch64-g1"}, {MO_G0, "aarch64-g0"},
5756 {MO_HI12, "aarch64-hi12"}};
5757 return makeArrayRef(TargetFlags);
5758 }
5759
5760 ArrayRef<std::pair<unsigned, const char *>>
5761 AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
5762 using namespace AArch64II;
5763
5764 static const std::pair<unsigned, const char *> TargetFlags[] = {
5765 {MO_COFFSTUB, "aarch64-coffstub"},
5766 {MO_GOT, "aarch64-got"},
5767 {MO_NC, "aarch64-nc"},
5768 {MO_S, "aarch64-s"},
5769 {MO_TLS, "aarch64-tls"},
5770 {MO_DLLIMPORT, "aarch64-dllimport"},
5771 {MO_PREL, "aarch64-prel"},
5772 {MO_TAGGED, "aarch64-tagged"}};
5773 return makeArrayRef(TargetFlags);
5774 }
5775
5776 ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
5777 AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const {
5778 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
5779 {{MOSuppressPair, "aarch64-suppress-pair"},
5780 {MOStridedAccess, "aarch64-strided-access"}};
5781 return makeArrayRef(TargetFlags);
5782 }
5783
5784 /// Constants defining how certain sequences should be outlined.
5785 /// This encompasses how an outlined function should be called, and what kind of
5786 /// frame should be emitted for that outlined function.
5787 ///
5788 /// \p MachineOutlinerDefault implies that the function should be called with
5789 /// a save and restore of LR to the stack.
5790 ///
5791 /// That is,
5792 ///
5793 /// I1 Save LR OUTLINED_FUNCTION:
5794 /// I2 --> BL OUTLINED_FUNCTION I1
5795 /// I3 Restore LR I2
5796 /// I3
5797 /// RET
5798 ///
5799 /// * Call construction overhead: 3 (save + BL + restore)
5800 /// * Frame construction overhead: 1 (ret)
5801 /// * Requires stack fixups? Yes
5802 ///
5803 /// \p MachineOutlinerTailCall implies that the function is being created from
5804 /// a sequence of instructions ending in a return.
5805 ///
5806 /// That is,
5807 ///
5808 /// I1 OUTLINED_FUNCTION:
5809 /// I2 --> B OUTLINED_FUNCTION I1
5810 /// RET I2
5811 /// RET
5812 ///
5813 /// * Call construction overhead: 1 (B)
5814 /// * Frame construction overhead: 0 (Return included in sequence)
5815 /// * Requires stack fixups? No
5816 ///
5817 /// \p MachineOutlinerNoLRSave implies that the function should be called using
5818 /// a BL instruction, but doesn't require LR to be saved and restored. This
5819 /// happens when LR is known to be dead.
5820 ///
5821 /// That is,
5822 ///
5823 /// I1 OUTLINED_FUNCTION:
5824 /// I2 --> BL OUTLINED_FUNCTION I1
5825 /// I3 I2
5826 /// I3
5827 /// RET
5828 ///
5829 /// * Call construction overhead: 1 (BL)
5830 /// * Frame construction overhead: 1 (RET)
5831 /// * Requires stack fixups? No
5832 ///
5833 /// \p MachineOutlinerThunk implies that the function is being created from
5834 /// a sequence of instructions ending in a call. The outlined function is
5835 /// called with a BL instruction, and the outlined function tail-calls the
5836 /// original call destination.
5837 ///
5838 /// That is,
5839 ///
5840 /// I1 OUTLINED_FUNCTION:
5841 /// I2 --> BL OUTLINED_FUNCTION I1
5842 /// BL f I2
5843 /// B f
5844 /// * Call construction overhead: 1 (BL)
5845 /// * Frame construction overhead: 0
5846 /// * Requires stack fixups? No
5847 ///
5848 /// \p MachineOutlinerRegSave implies that the function should be called with a
5849 /// save and restore of LR to an available register. This allows us to avoid
5850 /// stack fixups. Note that this outlining variant is compatible with the
5851 /// NoLRSave case.
5852 ///
5853 /// That is,
5854 ///
5855 /// I1 Save LR OUTLINED_FUNCTION:
5856 /// I2 --> BL OUTLINED_FUNCTION I1
5857 /// I3 Restore LR I2
5858 /// I3
5859 /// RET
5860 ///
5861 /// * Call construction overhead: 3 (save + BL + restore)
5862 /// * Frame construction overhead: 1 (ret)
5863 /// * Requires stack fixups? No
5864 enum MachineOutlinerClass {
5865 MachineOutlinerDefault, /// Emit a save, restore, call, and return.
5866 MachineOutlinerTailCall, /// Only emit a branch.
5867 MachineOutlinerNoLRSave, /// Emit a call and return.
5868 MachineOutlinerThunk, /// Emit a call and tail-call.
5869 MachineOutlinerRegSave /// Same as default, but save to a register.
5870 };
5871
5872 enum MachineOutlinerMBBFlags {
5873 LRUnavailableSomewhere = 0x2,
5874 HasCalls = 0x4,
5875 UnsafeRegsDead = 0x8
5876 };
5877
5878 unsigned
5879 AArch64InstrInfo::findRegisterToSaveLRTo(const outliner::Candidate &C) const {
5880 assert(C.LRUWasSet && "LRU wasn't set?");
5881 MachineFunction *MF = C.getMF();
5882 const AArch64RegisterInfo *ARI = static_cast<const AArch64RegisterInfo *>(
5883 MF->getSubtarget().getRegisterInfo());
5884
5885 // Check if there is an available register across the sequence that we can
5886 // use.
5887 for (unsigned Reg : AArch64::GPR64RegClass) {
5888 if (!ARI->isReservedReg(*MF, Reg) &&
5889 Reg != AArch64::LR && // LR is not reserved, but don't use it.
5890 Reg != AArch64::X16 && // X16 is not guaranteed to be preserved.
5891 Reg != AArch64::X17 && // Ditto for X17.
5892 C.LRU.available(Reg) && C.UsedInSequence.available(Reg))
5893 return Reg;
5894 }
5895
5896 // No suitable register. Return 0.
5897 return 0u;
5898 }
5899
5900 static bool
5901 outliningCandidatesSigningScopeConsensus(const outliner::Candidate &a,
5902 const outliner::Candidate &b) {
5903 const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
5904 const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
5905
5906 return MFIa->shouldSignReturnAddress(false) == MFIb->shouldSignReturnAddress(false) &&
5907 MFIa->shouldSignReturnAddress(true) == MFIb->shouldSignReturnAddress(true);
5908 }
5909
5910 static bool
5911 outliningCandidatesSigningKeyConsensus(const outliner::Candidate &a,
5912 const outliner::Candidate &b) {
5913 const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
5914 const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
5915
5916 return MFIa->shouldSignWithBKey() == MFIb->shouldSignWithBKey();
5917 }
5918
5919 static bool outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a,
5920 const outliner::Candidate &b) {
5921 const AArch64Subtarget &SubtargetA =
5922 a.getMF()->getSubtarget<AArch64Subtarget>();
5923 const AArch64Subtarget &SubtargetB =
5924 b.getMF()->getSubtarget<AArch64Subtarget>();
5925 return SubtargetA.hasV8_3aOps() == SubtargetB.hasV8_3aOps();
5926 }
5927
5928 outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo(
5929 std::vector<outliner::Candidate> &RepeatedSequenceLocs) const {
5930 outliner::Candidate &FirstCand = RepeatedSequenceLocs[0];
5931 unsigned SequenceSize =
5932 std::accumulate(FirstCand.front(), std::next(FirstCand.back()), 0,
5933 [this](unsigned Sum, const MachineInstr &MI) {
5934 return Sum + getInstSizeInBytes(MI);
5935 });
5936 unsigned NumBytesToCreateFrame = 0;
5937
5938 // We only allow outlining for functions having exactly matching return
5939 // address signing attributes, i.e., all share the same value for the
5940 // attribute "sign-return-address" and all share the same type of key they
5941 // are signed with.
5942   // Additionally we require all functions to simultaneously either support
5943 // v8.3a features or not. Otherwise an outlined function could get signed
5944 // using dedicated v8.3 instructions and a call from a function that doesn't
5945 // support v8.3 instructions would therefore be invalid.
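  // For example, candidates taken from a function compiled with
  // "sign-return-address"="all" are never grouped with candidates from a
  // function compiled with "sign-return-address"="none", candidates signing
  // with the A key are never grouped with candidates signing with the B key,
  // and +v8.3a candidates are never grouped with non-v8.3a ones.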
5946 if (std::adjacent_find(
5947 RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
5948 [](const outliner::Candidate &a, const outliner::Candidate &b) {
5949 // Return true if a and b are non-equal w.r.t. return address
5950 // signing or support of v8.3a features
5951 if (outliningCandidatesSigningScopeConsensus(a, b) &&
5952 outliningCandidatesSigningKeyConsensus(a, b) &&
5953 outliningCandidatesV8_3OpsConsensus(a, b)) {
5954 return false;
5955 }
5956 return true;
5957 }) != RepeatedSequenceLocs.end()) {
5958 return outliner::OutlinedFunction();
5959 }
5960
5961 // Since at this point all candidates agree on their return address signing
5962 // picking just one is fine. If the candidate functions potentially sign their
5963 // return addresses, the outlined function should do the same. Note that in
5964 // the case of "sign-return-address"="non-leaf" this is an assumption: It is
5965   // not necessarily true that the outlined function will have to sign its return
5966 // address but this decision is made later, when the decision to outline
5967 // has already been made.
5968 // The same holds for the number of additional instructions we need: On
5969 // v8.3a RET can be replaced by RETAA/RETAB and no AUT instruction is
5970 // necessary. However, at this point we don't know if the outlined function
5971 // will have a RET instruction so we assume the worst.
5972 const TargetRegisterInfo &TRI = getRegisterInfo();
5973 if (FirstCand.getMF()
5974 ->getInfo<AArch64FunctionInfo>()
5975 ->shouldSignReturnAddress(true)) {
5976 // One PAC and one AUT instructions
5977 NumBytesToCreateFrame += 8;
5978
5979 // We have to check if sp modifying instructions would get outlined.
5980 // If so we only allow outlining if sp is unchanged overall, so matching
5981 // sub and add instructions are okay to outline, all other sp modifications
5982 // are not
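    // As an illustrative example, a candidate of the shape
    //   sub sp, sp, #16
    //   ...
    //   add sp, sp, #16
    // nets out to an unchanged sp and remains outlinable, whereas a candidate
    // containing an unmatched "sub sp, sp, #16" (or any other kind of write
    // to sp) is filtered out by the check below.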
5983 auto hasIllegalSPModification = [&TRI](outliner::Candidate &C) {
5984 int SPValue = 0;
5985 MachineBasicBlock::iterator MBBI = C.front();
5986 for (;;) {
5987 if (MBBI->modifiesRegister(AArch64::SP, &TRI)) {
5988 switch (MBBI->getOpcode()) {
5989 case AArch64::ADDXri:
5990 case AArch64::ADDWri:
5991 assert(MBBI->getNumOperands() == 4 && "Wrong number of operands");
5992 assert(MBBI->getOperand(2).isImm() &&
5993 "Expected operand to be immediate");
5994 assert(MBBI->getOperand(1).isReg() &&
5995 "Expected operand to be a register");
5996 // Check if the add just increments sp. If so, we search for
5997 // matching sub instructions that decrement sp. If not, the
5998 // modification is illegal
5999 if (MBBI->getOperand(1).getReg() == AArch64::SP)
6000 SPValue += MBBI->getOperand(2).getImm();
6001 else
6002 return true;
6003 break;
6004 case AArch64::SUBXri:
6005 case AArch64::SUBWri:
6006 assert(MBBI->getNumOperands() == 4 && "Wrong number of operands");
6007 assert(MBBI->getOperand(2).isImm() &&
6008 "Expected operand to be immediate");
6009 assert(MBBI->getOperand(1).isReg() &&
6010 "Expected operand to be a register");
6011 // Check if the sub just decrements sp. If so, we search for
6012 // matching add instructions that increment sp. If not, the
6013 // modification is illegal
6014 if (MBBI->getOperand(1).getReg() == AArch64::SP)
6015 SPValue -= MBBI->getOperand(2).getImm();
6016 else
6017 return true;
6018 break;
6019 default:
6020 return true;
6021 }
6022 }
6023 if (MBBI == C.back())
6024 break;
6025 ++MBBI;
6026 }
6027 if (SPValue)
6028 return true;
6029 return false;
6030 };
6031 // Remove candidates with illegal stack modifying instructions
6032 RepeatedSequenceLocs.erase(std::remove_if(RepeatedSequenceLocs.begin(),
6033 RepeatedSequenceLocs.end(),
6034 hasIllegalSPModification),
6035 RepeatedSequenceLocs.end());
6036
6037 // If the sequence doesn't have enough candidates left, then we're done.
6038 if (RepeatedSequenceLocs.size() < 2)
6039 return outliner::OutlinedFunction();
6040 }
6041
6042 // Properties about candidate MBBs that hold for all of them.
6043 unsigned FlagsSetInAll = 0xF;
6044
6045 // Compute liveness information for each candidate, and set FlagsSetInAll.
6046 std::for_each(RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
6047 [&FlagsSetInAll](outliner::Candidate &C) {
6048 FlagsSetInAll &= C.Flags;
6049 });
6050
6051 // According to the AArch64 Procedure Call Standard, the following are
6052 // undefined on entry/exit from a function call:
6053 //
6054 // * Registers x16, x17, (and thus w16, w17)
6055 // * Condition codes (and thus the NZCV register)
6056 //
6057   // Because of this, we can't outline any sequence of instructions where
6058   // one of these registers is live into/across it. Thus, we need to
6059   // delete those candidates.
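  // As an illustrative example, a candidate sandwiched between a compare and
  // the branch consuming its flags,
  //   cmp  w0, #3        // defines NZCV
  //   <candidate>
  //   b.eq label         // reads NZCV across the candidate
  // cannot be outlined, since the inserted call gives no guarantee that NZCV
  // (or x16/x17) survives it.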
6062 auto CantGuaranteeValueAcrossCall = [&TRI](outliner::Candidate &C) {
6063 // If the unsafe registers in this block are all dead, then we don't need
6064 // to compute liveness here.
6065 if (C.Flags & UnsafeRegsDead)
6066 return false;
6067 C.initLRU(TRI);
6068 LiveRegUnits LRU = C.LRU;
6069 return (!LRU.available(AArch64::W16) || !LRU.available(AArch64::W17) ||
6070 !LRU.available(AArch64::NZCV));
6071 };
6072
6073 // Are there any candidates where those registers are live?
6074 if (!(FlagsSetInAll & UnsafeRegsDead)) {
6075 // Erase every candidate that violates the restrictions above. (It could be
6076 // true that we have viable candidates, so it's not worth bailing out in
6077 // the case that, say, 1 out of 20 candidates violate the restrictions.)
6078 RepeatedSequenceLocs.erase(std::remove_if(RepeatedSequenceLocs.begin(),
6079 RepeatedSequenceLocs.end(),
6080 CantGuaranteeValueAcrossCall),
6081 RepeatedSequenceLocs.end());
6082
6083 // If the sequence doesn't have enough candidates left, then we're done.
6084 if (RepeatedSequenceLocs.size() < 2)
6085 return outliner::OutlinedFunction();
6086 }
6087
6088 // At this point, we have only "safe" candidates to outline. Figure out
6089 // frame + call instruction information.
6090
6091 unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back()->getOpcode();
6092
6093 // Helper lambda which sets call information for every candidate.
6094 auto SetCandidateCallInfo =
6095 [&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) {
6096 for (outliner::Candidate &C : RepeatedSequenceLocs)
6097 C.setCallInfo(CallID, NumBytesForCall);
6098 };
6099
6100 unsigned FrameID = MachineOutlinerDefault;
6101 NumBytesToCreateFrame += 4;
6102
6103 bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) {
6104 return C.getMF()->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement();
6105 });
6106
6107 // Check whether CFI instructions are present and, if so, count how many
6108 // appear in the candidate sequence. All candidates contain the same
6109 // instructions, so scanning the first one is sufficient.
6110 unsigned CFICount = 0;
6111 MachineBasicBlock::iterator MBBI = RepeatedSequenceLocs[0].front();
6112 for (unsigned Loc = RepeatedSequenceLocs[0].getStartIdx();
6113 Loc < RepeatedSequenceLocs[0].getEndIdx() + 1; Loc++) {
6114 if (MBBI->isCFIInstruction())
6115 CFICount++;
6116 MBBI++;
6117 }
6122
6123 // We compare the number of found CFI Instructions to the number of CFI
6124 // instructions in the parent function for each candidate. We must check this
6125 // since if we outline one of the CFI instructions in a function, we have to
6126 // outline them all for correctness. If we do not, the address offsets will be
6127 // incorrect between the two sections of the program.
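// Illustrative consequence: if a function's prologue emits, say,
// .cfi_def_cfa_offset 16 followed by .cfi_offset w30, -16 and only one of the
// two directives ended up in the outlined function, the remaining directive
// would be applied at the wrong instruction addresses during unwinding.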
6128 for (outliner::Candidate &C : RepeatedSequenceLocs) {
6129 const std::vector<MCCFIInstruction> &CFIInstructions =
6130 C.getMF()->getFrameInstructions();
6131
6132 if (CFICount > 0 && CFICount != CFIInstructions.size())
6133 return outliner::OutlinedFunction();
6134 }
6135
6136 // Returns true if an instruction is safe to fix up, false otherwise.
6137 auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) {
6138 if (MI.isCall())
6139 return true;
6140
6141 if (!MI.modifiesRegister(AArch64::SP, &TRI) &&
6142 !MI.readsRegister(AArch64::SP, &TRI))
6143 return true;
6144
6145 // Any modification of SP will break our code to save/restore LR.
6146 // FIXME: We could handle some instructions which add a constant
6147 // offset to SP, with a bit more work.
6148 if (MI.modifiesRegister(AArch64::SP, &TRI))
6149 return false;
6150
6151 // At this point, we have a stack instruction that we might need to
6152 // fix up. We'll handle it if it's a load or store.
6153 if (MI.mayLoadOrStore()) {
6154 const MachineOperand *Base; // Filled with the base operand of MI.
6155 int64_t Offset; // Filled with the offset of MI.
6156 bool OffsetIsScalable;
6157
6158 // Does it allow us to offset the base operand and is the base the
6159 // register SP?
6160 if (!getMemOperandWithOffset(MI, Base, Offset, OffsetIsScalable, &TRI) ||
6161 !Base->isReg() || Base->getReg() != AArch64::SP)
6162 return false;
6163
6164 // The fix-up code below assumes byte offsets.
6165 if (OffsetIsScalable)
6166 return false;
6167
6168 // Find the minimum/maximum offset for this instruction and check
6169 // if fixing it up would be in range.
6170 int64_t MinOffset,
6171 MaxOffset; // Unscaled offsets for the instruction.
6172 TypeSize Scale(0U, false); // The scale to multiply the offsets by.
6173 unsigned DummyWidth;
6174 getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset);
6175
6176 Offset += 16; // Update the offset to what it would be if we outlined.
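// Illustrative example: an "ldr x0, [sp, #8]" in the candidate becomes
// "ldr x0, [sp, #24]" once the outlined function has spilled LR with
// "str x30, [sp, #-16]!", so the adjusted offset must still be encodable.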
6177 if (Offset < MinOffset * (int64_t)Scale.getFixedSize() ||
6178 Offset > MaxOffset * (int64_t)Scale.getFixedSize())
6179 return false;
6180
6181 // It's in range, so we can outline it.
6182 return true;
6183 }
6184
6185 // FIXME: Add handling for instructions like "add x0, sp, #8".
6186
6187 // We can't fix it up, so don't outline it.
6188 return false;
6189 };
6190
6191 // True if it's possible to fix up each stack instruction in this sequence.
6192 // Important for frames/call variants that modify the stack.
6193 bool AllStackInstrsSafe = std::all_of(
6194 FirstCand.front(), std::next(FirstCand.back()), IsSafeToFixup);
6195
6196 // If the last instruction in any candidate is a terminator, then we should
6197 // tail call all of the candidates.
6198 if (RepeatedSequenceLocs[0].back()->isTerminator()) {
6199 FrameID = MachineOutlinerTailCall;
6200 NumBytesToCreateFrame = 0;
6201 SetCandidateCallInfo(MachineOutlinerTailCall, 4);
6202 }
6203
6204 else if (LastInstrOpcode == AArch64::BL ||
6205 ((LastInstrOpcode == AArch64::BLR ||
6206 LastInstrOpcode == AArch64::BLRNoIP) &&
6207 !HasBTI)) {
6208 // FIXME: Do we need to check if the code after this uses the value of LR?
6209 FrameID = MachineOutlinerThunk;
6210 NumBytesToCreateFrame = 0;
6211 SetCandidateCallInfo(MachineOutlinerThunk, 4);
6212 }
6213
6214 else {
6215 // We need to decide how to emit calls + frames. We can always emit the same
6216 // frame if we don't need to save to the stack. If we have to save to the
6217 // stack, then we need a different frame.
6218 unsigned NumBytesNoStackCalls = 0;
6219 std::vector<outliner::Candidate> CandidatesWithoutStackFixups;
6220
6221 // Check if we have to save LR.
6222 for (outliner::Candidate &C : RepeatedSequenceLocs) {
6223 C.initLRU(TRI);
6224
6225 // If we have a noreturn caller, then we're going to be conservative and
6226 // say that we have to save LR. If we don't have a ret at the end of the
6227 // block, then we can't reason about liveness accurately.
6228 //
6229 // FIXME: We can probably do better than always disabling this in
6230 // noreturn functions by fixing up the liveness info.
6231 bool IsNoReturn =
6232 C.getMF()->getFunction().hasFnAttribute(Attribute::NoReturn);
6233
6234 // Is LR available? If so, we don't need a save.
6235 if (C.LRU.available(AArch64::LR) && !IsNoReturn) {
6236 NumBytesNoStackCalls += 4;
6237 C.setCallInfo(MachineOutlinerNoLRSave, 4);
6238 CandidatesWithoutStackFixups.push_back(C);
6239 }
6240
6241 // Is an unused register available? If so, we won't modify the stack, so
6242 // we can outline with the same frame type as those that don't save LR.
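// (Illustrative accounting: the 12 bytes are a mov of LR into the scratch
// register, the bl itself, and the mov back; see insertOutlinedCall below.)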
6243 else if (findRegisterToSaveLRTo(C)) {
6244 NumBytesNoStackCalls += 12;
6245 C.setCallInfo(MachineOutlinerRegSave, 12);
6246 CandidatesWithoutStackFixups.push_back(C);
6247 }
6248
6249 // Is SP used in the sequence at all? If not, we don't have to modify
6250 // the stack, so we are guaranteed to get the same frame.
6251 else if (C.UsedInSequence.available(AArch64::SP)) {
6252 NumBytesNoStackCalls += 12;
6253 C.setCallInfo(MachineOutlinerDefault, 12);
6254 CandidatesWithoutStackFixups.push_back(C);
6255 }
6256
6257 // If we outline this, we need to modify the stack. Pretend we don't
6258 // outline this by saving all of its bytes.
6259 else {
6260 NumBytesNoStackCalls += SequenceSize;
6261 }
6262 }
6263
6264 // If there are no places where we have to save LR, then note that we
6265 // don't have to update the stack. Otherwise, give every candidate the
6266 // default call type, as long as it's safe to do so.
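// Illustrative check: with, say, 3 candidates the no-fixup variants win when
// NumBytesNoStackCalls <= 3 * 12 = 36, i.e. when each call site costs at most
// 12 bytes on average without touching the stack; candidates that would need
// a stack save were charged their full SequenceSize above, pushing the total
// past this bound when too many of them exist.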
6267 if (!AllStackInstrsSafe ||
6268 NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) {
6269 RepeatedSequenceLocs = CandidatesWithoutStackFixups;
6270 FrameID = MachineOutlinerNoLRSave;
6271 } else {
6272 SetCandidateCallInfo(MachineOutlinerDefault, 12);
6273
6274 // Bugzilla ID: 46767
6275 // TODO: Check if fixing up the stack more than once is safe so we can
6276 // outline these.
6277 //
6278 // An outline resulting in a caller that requires stack fixups at the
6279 // callsite to a callee that also requires stack fixups can happen when
6280 // there are no available registers at the candidate callsite for a
6281 // candidate that itself also has calls.
6282 //
6283 // In other words, if function_containing_sequence in the following pseudo
6284 // assembly requires that we save LR at the point of the call, but there
6285 // are no available registers, we save using SP and, as a result, the SP
6286 // offsets require stack fixups by multiples of 16.
6287 //
6288 // function_containing_sequence:
6289 // ...
6290 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
6291 // call OUTLINED_FUNCTION_N
6292 // restore LR from SP
6293 // ...
6294 //
6295 // OUTLINED_FUNCTION_N:
6296 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
6297 // ...
6298 // bl foo
6299 // restore LR from SP
6300 // ret
6301 //
6302 // Because the code to handle more than one stack fixup does not
6303 // currently have the proper checks for legality, these cases will assert
6304 // in the AArch64 MachineOutliner. This is because the code to do this
6305 // needs more hardening, testing, better checks that generated code is
6306 // legal, etc., and because it is only verified to handle a single pass of
6307 // stack fixup.
6308 //
6309 // The assert happens in AArch64InstrInfo::buildOutlinedFrame to catch
6310 // these cases until they are known to be handled. Bugzilla 46767 is
6311 // referenced in comments at the assert site.
6312 //
6313 // To avoid asserting (or generating non-legal code on noassert builds)
6314 // we remove all candidates which would need more than one stack fixup by
6315 // pruning the cases where the candidate has calls while also having no
6316 // available LR and having no available general purpose registers to copy
6317 // LR to (i.e. one extra stack save/restore).
6318 //
6319 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
6320 erase_if(RepeatedSequenceLocs, [this](outliner::Candidate &C) {
6321 return (std::any_of(
6322 C.front(), std::next(C.back()),
6323 [](const MachineInstr &MI) { return MI.isCall(); })) &&
6324 (!C.LRU.available(AArch64::LR) || !findRegisterToSaveLRTo(C));
6325 });
6326 }
6327 }
6328
6329 // If we dropped all of the candidates, bail out here.
6330 if (RepeatedSequenceLocs.size() < 2) {
6331 RepeatedSequenceLocs.clear();
6332 return outliner::OutlinedFunction();
6333 }
6334 }
6335
6336 // Does every candidate's MBB contain a call? If so, then we might have a call
6337 // in the range.
6338 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
6339 // Check if the range contains a call. These require a save + restore of the
6340 // link register.
6341 bool ModStackToSaveLR = false;
6342 if (std::any_of(FirstCand.front(), FirstCand.back(),
6343 [](const MachineInstr &MI) { return MI.isCall(); }))
6344 ModStackToSaveLR = true;
6345
6346 // Handle the last instruction separately. If this is a tail call, then the
6347 // last instruction is a call. We don't want to save + restore in this case.
6348 // However, it could be possible that the last instruction is a call without
6349 // it being valid to tail call this sequence. We should consider this as
6350 // well.
6351 else if (FrameID != MachineOutlinerThunk &&
6352 FrameID != MachineOutlinerTailCall && FirstCand.back()->isCall())
6353 ModStackToSaveLR = true;
6354
6355 if (ModStackToSaveLR) {
6356 // We can't fix up the stack. Bail out.
6357 if (!AllStackInstrsSafe) {
6358 RepeatedSequenceLocs.clear();
6359 return outliner::OutlinedFunction();
6360 }
6361
6362 // Save + restore LR.
6363 NumBytesToCreateFrame += 8;
6364 }
6365 }
6366
6367 // If we have CFI instructions, we can only outline if the outlined section
6368 // can be a tail call
6369 if (FrameID != MachineOutlinerTailCall && CFICount > 0)
6370 return outliner::OutlinedFunction();
6371
6372 return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize,
6373 NumBytesToCreateFrame, FrameID);
6374 }
6375
6376 bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(
6377 MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
6378 const Function &F = MF.getFunction();
6379
6380 // Can F be deduplicated by the linker? If it can, don't outline from it.
6381 if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
6382 return false;
6383
6384 // Don't outline from functions with section markings; the program could
6385 // expect that all the code is in the named section.
6386 // FIXME: Allow outlining from multiple functions with the same section
6387 // marking.
6388 if (F.hasSection())
6389 return false;
6390
6391 // Outlining from functions with redzones is unsafe since the outliner may
6392 // modify the stack. Check if hasRedZone is true or unknown; if yes, don't
6393 // outline from it.
6394 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
6395 if (!AFI || AFI->hasRedZone().getValueOr(true))
6396 return false;
6397
6398 // FIXME: Teach the outliner to generate/handle Windows unwind info.
6399 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI())
6400 return false;
6401
6402 // It's safe to outline from MF.
6403 return true;
6404 }
6405
6406 bool AArch64InstrInfo::isMBBSafeToOutlineFrom(MachineBasicBlock &MBB,
6407 unsigned &Flags) const {
6408 // Check if LR is available through all of the MBB. If it's not, then set
6409 // a flag.
6410 assert(MBB.getParent()->getRegInfo().tracksLiveness() &&
6411 "Suitable Machine Function for outlining must track liveness");
6412 LiveRegUnits LRU(getRegisterInfo());
6413
6414 std::for_each(MBB.rbegin(), MBB.rend(),
6415 [&LRU](MachineInstr &MI) { LRU.accumulate(MI); });
6416
6417 // Check if each of the unsafe registers are available...
6418 bool W16AvailableInBlock = LRU.available(AArch64::W16);
6419 bool W17AvailableInBlock = LRU.available(AArch64::W17);
6420 bool NZCVAvailableInBlock = LRU.available(AArch64::NZCV);
6421
6422 // If all of these are dead (and not live out), we know we don't have to check
6423 // them later.
6424 if (W16AvailableInBlock && W17AvailableInBlock && NZCVAvailableInBlock)
6425 Flags |= MachineOutlinerMBBFlags::UnsafeRegsDead;
6426
6427 // Now, add the live outs to the set.
6428 LRU.addLiveOuts(MBB);
6429
6430 // If any of these registers is available in the MBB, but also a live out of
6431 // the block, then we know outlining is unsafe.
6432 if (W16AvailableInBlock && !LRU.available(AArch64::W16))
6433 return false;
6434 if (W17AvailableInBlock && !LRU.available(AArch64::W17))
6435 return false;
6436 if (NZCVAvailableInBlock && !LRU.available(AArch64::NZCV))
6437 return false;
6438
6439 // Check if there's a call inside this MachineBasicBlock. If there is, then
6440 // set a flag.
6441 if (any_of(MBB, [](MachineInstr &MI) { return MI.isCall(); }))
6442 Flags |= MachineOutlinerMBBFlags::HasCalls;
6443
6444 MachineFunction *MF = MBB.getParent();
6445
6446 // In the event that we outline, we may have to save LR. If there is an
6447 // available register in the MBB, then we'll always save LR there. Check if
6448 // this is true.
6449 bool CanSaveLR = false;
6450 const AArch64RegisterInfo *ARI = static_cast<const AArch64RegisterInfo *>(
6451 MF->getSubtarget().getRegisterInfo());
6452
6453 // Check if there is an available register across the sequence that we can
6454 // use.
6455 for (unsigned Reg : AArch64::GPR64RegClass) {
6456 if (!ARI->isReservedReg(*MF, Reg) && Reg != AArch64::LR &&
6457 Reg != AArch64::X16 && Reg != AArch64::X17 && LRU.available(Reg)) {
6458 CanSaveLR = true;
6459 break;
6460 }
6461 }
6462
6463 // Check if we have a register we can save LR to, and if LR was used
6464 // somewhere. If both of those things are true, then we need to evaluate the
6465 // safety of outlining stack instructions later.
6466 if (!CanSaveLR && !LRU.available(AArch64::LR))
6467 Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere;
6468
6469 return true;
6470 }
6471
6472 outliner::InstrType
6473 AArch64InstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT,
6474 unsigned Flags) const {
6475 MachineInstr &MI = *MIT;
6476 MachineBasicBlock *MBB = MI.getParent();
6477 MachineFunction *MF = MBB->getParent();
6478 AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
6479
6480 // Don't outline anything used for return address signing. The outlined
6481 // function will get signed later if needed
6482 switch (MI.getOpcode()) {
6483 case AArch64::PACIASP:
6484 case AArch64::PACIBSP:
6485 case AArch64::AUTIASP:
6486 case AArch64::AUTIBSP:
6487 case AArch64::RETAA:
6488 case AArch64::RETAB:
6489 case AArch64::EMITBKEY:
6490 return outliner::InstrType::Illegal;
6491 }
6492
6493 // Don't outline LOHs.
6494 if (FuncInfo->getLOHRelated().count(&MI))
6495 return outliner::InstrType::Illegal;
6496
6497 // We can only outline these if we will tail call the outlined function, or
6498 // fix up the CFI offsets. Currently, CFI instructions are outlined only if
6499 // the outlined section is a tail call.
6500 //
6501 // FIXME: If the proper fixups for the offset are implemented, this should be
6502 // possible.
6503 if (MI.isCFIInstruction())
6504 return outliner::InstrType::Legal;
6505
6506 // Don't allow debug values to impact outlining type.
6507 if (MI.isDebugInstr() || MI.isIndirectDebugValue())
6508 return outliner::InstrType::Invisible;
6509
6510 // At this point, KILL instructions don't really tell us much so we can go
6511 // ahead and skip over them.
6512 if (MI.isKill())
6513 return outliner::InstrType::Invisible;
6514
6515 // Is this a terminator for a basic block?
6516 if (MI.isTerminator()) {
6517
6518 // Is this the end of a function?
6519 if (MI.getParent()->succ_empty())
6520 return outliner::InstrType::Legal;
6521
6522 // It's not, so don't outline it.
6523 return outliner::InstrType::Illegal;
6524 }
6525
6526 // Make sure none of the operands are un-outlinable.
6527 for (const MachineOperand &MOP : MI.operands()) {
6528 if (MOP.isCPI() || MOP.isJTI() || MOP.isCFIIndex() || MOP.isFI() ||
6529 MOP.isTargetIndex())
6530 return outliner::InstrType::Illegal;
6531
6532 // If it uses LR or W30 explicitly, then don't touch it.
6533 if (MOP.isReg() && !MOP.isImplicit() &&
6534 (MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30))
6535 return outliner::InstrType::Illegal;
6536 }
6537
6538 // Special cases for instructions that can always be outlined, but will
6539 // fail the later tests, e.g. ADRPs, which are PC-relative and use LR, but
6540 // can always be outlined because they don't require a *specific* value in LR.
6541 if (MI.getOpcode() == AArch64::ADRP)
6542 return outliner::InstrType::Legal;
6543
6544 // If MI is a call we might be able to outline it. We don't want to outline
6545 // any calls that rely on the position of items on the stack. When we outline
6546 // something containing a call, we have to emit a save and restore of LR in
6547 // the outlined function. Currently, this always happens by saving LR to the
6548 // stack. Thus, if we outline, say, half the parameters for a function call
6549 // plus the call, then we'll break the callee's expectations for the layout
6550 // of the stack.
6551 //
6552 // FIXME: Allow calls to functions which construct a stack frame, as long
6553 // as they don't access arguments on the stack.
6554 // FIXME: Figure out some way to analyze functions defined in other modules.
6555 // We should be able to compute the memory usage based on the IR calling
6556 // convention, even if we can't see the definition.
6557 if (MI.isCall()) {
6558 // Get the function associated with the call. Look at each operand and find
6559 // the one that represents the callee and get its name.
6560 const Function *Callee = nullptr;
6561 for (const MachineOperand &MOP : MI.operands()) {
6562 if (MOP.isGlobal()) {
6563 Callee = dyn_cast<Function>(MOP.getGlobal());
6564 break;
6565 }
6566 }
6567
6568 // Never outline calls to mcount. There isn't any rule that would require
6569 // this, but the Linux kernel's "ftrace" feature depends on it.
6570 if (Callee && Callee->getName() == "\01_mcount")
6571 return outliner::InstrType::Illegal;
6572
6573 // If we don't know anything about the callee, assume it depends on the
6574 // stack layout of the caller. In that case, it's only legal to outline
6575 // as a tail-call. Explicitly list the call instructions we know about so we
6576 // don't get unexpected results with call pseudo-instructions.
6577 auto UnknownCallOutlineType = outliner::InstrType::Illegal;
6578 if (MI.getOpcode() == AArch64::BLR ||
6579 MI.getOpcode() == AArch64::BLRNoIP || MI.getOpcode() == AArch64::BL)
6580 UnknownCallOutlineType = outliner::InstrType::LegalTerminator;
6581
6582 if (!Callee)
6583 return UnknownCallOutlineType;
6584
6585 // We have a function we have information about. Check if it's something we
6586 // can safely outline.
6587 MachineFunction *CalleeMF = MF->getMMI().getMachineFunction(*Callee);
6588
6589 // We don't know what's going on with the callee at all. Don't touch it.
6590 if (!CalleeMF)
6591 return UnknownCallOutlineType;
6592
6593 // Check if we know anything about the callee saves on the function. If we
6594 // don't, then don't touch it, since that implies that we haven't
6595 // computed anything about its stack frame yet.
6596 MachineFrameInfo &MFI = CalleeMF->getFrameInfo();
6597 if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 ||
6598 MFI.getNumObjects() > 0)
6599 return UnknownCallOutlineType;
6600
6601 // At this point, we can say that CalleeMF ought to not pass anything on the
6602 // stack. Therefore, we can outline it.
6603 return outliner::InstrType::Legal;
6604 }
6605
6606 // Don't outline positions.
6607 if (MI.isPosition())
6608 return outliner::InstrType::Illegal;
6609
6610 // Don't touch the link register or W30.
6611 if (MI.readsRegister(AArch64::W30, &getRegisterInfo()) ||
6612 MI.modifiesRegister(AArch64::W30, &getRegisterInfo()))
6613 return outliner::InstrType::Illegal;
6614
6615 // Don't outline BTI instructions, because that will prevent the outlining
6616 // site from being indirectly callable.
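// (HINT immediates #32, #34, #36 and #38 are the encodings of BTI, BTI c,
// BTI j and BTI jc respectively.)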
6617 if (MI.getOpcode() == AArch64::HINT) {
6618 int64_t Imm = MI.getOperand(0).getImm();
6619 if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38)
6620 return outliner::InstrType::Illegal;
6621 }
6622
6623 return outliner::InstrType::Legal;
6624 }
6625
6626 void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
6627 for (MachineInstr &MI : MBB) {
6628 const MachineOperand *Base;
6629 unsigned Width;
6630 int64_t Offset;
6631 bool OffsetIsScalable;
6632
6633 // Is this a load or store with an immediate offset with SP as the base?
6634 if (!MI.mayLoadOrStore() ||
6635 !getMemOperandWithOffsetWidth(MI, Base, Offset, OffsetIsScalable, Width,
6636 &RI) ||
6637 (Base->isReg() && Base->getReg() != AArch64::SP))
6638 continue;
6639
6640 // It is, so we have to fix it up.
6641 TypeSize Scale(0U, false);
6642 int64_t Dummy1, Dummy2;
6643
6644 MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI);
6645 assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!");
6646 getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2);
6647 assert(Scale != 0 && "Unexpected opcode!");
6648 assert(!OffsetIsScalable && "Expected offset to be a byte offset");
6649
6650 // We've pushed the return address to the stack, so add 16 to the offset.
6651 // This is safe, since we already checked if it would overflow when we
6652 // checked if this instruction was legal to outline.
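// Illustrative example: "ldr x0, [sp, #8]" has Offset = 8 and Scale = 8, so
// the rewritten immediate is (8 + 16) / 8 = 3, i.e. "ldr x0, [sp, #24]".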
6653 int64_t NewImm = (Offset + 16) / (int64_t)Scale.getFixedSize();
6654 StackOffsetOperand.setImm(NewImm);
6655 }
6656 }
6657
6658 static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB,
6659 bool ShouldSignReturnAddr,
6660 bool ShouldSignReturnAddrWithAKey) {
6661 if (ShouldSignReturnAddr) {
6662 MachineBasicBlock::iterator MBBPAC = MBB.begin();
6663 MachineBasicBlock::iterator MBBAUT = MBB.getFirstTerminator();
6664 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
6665 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
6666 DebugLoc DL;
6667
6668 if (MBBAUT != MBB.end())
6669 DL = MBBAUT->getDebugLoc();
6670
6671 // At the very beginning of the basic block we insert the following
6672 // depending on the key type
6673 //
6674 // a_key: b_key:
6675 // PACIASP EMITBKEY
6676 // CFI_INSTRUCTION PACIBSP
6677 // CFI_INSTRUCTION
6678 if (ShouldSignReturnAddrWithAKey) {
6679 BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::PACIASP))
6680 .setMIFlag(MachineInstr::FrameSetup);
6681 } else {
6682 BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::EMITBKEY))
6683 .setMIFlag(MachineInstr::FrameSetup);
6684 BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::PACIBSP))
6685 .setMIFlag(MachineInstr::FrameSetup);
6686 }
6687 unsigned CFIIndex =
6688 MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr));
6689 BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::CFI_INSTRUCTION))
6690 .addCFIIndex(CFIIndex)
6691 .setMIFlags(MachineInstr::FrameSetup);
6692
6693 // If v8.3a features are available we can replace a RET instruction by
6694 // RETAA or RETAB and omit the AUT instructions
6695 if (Subtarget.hasV8_3aOps() && MBBAUT != MBB.end() &&
6696 MBBAUT->getOpcode() == AArch64::RET) {
6697 BuildMI(MBB, MBBAUT, DL,
6698 TII->get(ShouldSignReturnAddrWithAKey ? AArch64::RETAA
6699 : AArch64::RETAB))
6700 .copyImplicitOps(*MBBAUT);
6701 MBB.erase(MBBAUT);
6702 } else {
6703 BuildMI(MBB, MBBAUT, DL,
6704 TII->get(ShouldSignReturnAddrWithAKey ? AArch64::AUTIASP
6705 : AArch64::AUTIBSP))
6706 .setMIFlag(MachineInstr::FrameDestroy);
6707 }
6708 }
6709 }
6710
6711 void AArch64InstrInfo::buildOutlinedFrame(
6712 MachineBasicBlock &MBB, MachineFunction &MF,
6713 const outliner::OutlinedFunction &OF) const {
6714
6715 AArch64FunctionInfo *FI = MF.getInfo<AArch64FunctionInfo>();
6716
6717 if (OF.FrameConstructionID == MachineOutlinerTailCall)
6718 FI->setOutliningStyle("Tail Call");
6719 else if (OF.FrameConstructionID == MachineOutlinerThunk) {
6720 // For thunk outlining, rewrite the last instruction from a call to a
6721 // tail-call.
6722 MachineInstr *Call = &*--MBB.instr_end();
6723 unsigned TailOpcode;
6724 if (Call->getOpcode() == AArch64::BL) {
6725 TailOpcode = AArch64::TCRETURNdi;
6726 } else {
6727 assert(Call->getOpcode() == AArch64::BLR ||
6728 Call->getOpcode() == AArch64::BLRNoIP);
6729 TailOpcode = AArch64::TCRETURNriALL;
6730 }
6731 MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode))
6732 .add(Call->getOperand(0))
6733 .addImm(0);
6734 MBB.insert(MBB.end(), TC);
6735 Call->eraseFromParent();
6736
6737 FI->setOutliningStyle("Thunk");
6738 }
6739
6740 bool IsLeafFunction = true;
6741
6742 // Is there a call in the outlined range?
6743 auto IsNonTailCall = [](const MachineInstr &MI) {
6744 return MI.isCall() && !MI.isReturn();
6745 };
6746
6747 if (std::any_of(MBB.instr_begin(), MBB.instr_end(), IsNonTailCall)) {
6748 // Fix up the instructions in the range, since we're going to modify the
6749 // stack.
6750
6751 // Bugzilla ID: 46767
6752 // TODO: Check if fixing up twice is safe so we can outline these.
6753 assert(OF.FrameConstructionID != MachineOutlinerDefault &&
6754 "Can only fix up stack references once");
6755 fixupPostOutline(MBB);
6756
6757 IsLeafFunction = false;
6758
6759 // LR has to be a live in so that we can save it.
6760 if (!MBB.isLiveIn(AArch64::LR))
6761 MBB.addLiveIn(AArch64::LR);
6762
6763 MachineBasicBlock::iterator It = MBB.begin();
6764 MachineBasicBlock::iterator Et = MBB.end();
6765
6766 if (OF.FrameConstructionID == MachineOutlinerTailCall ||
6767 OF.FrameConstructionID == MachineOutlinerThunk)
6768 Et = std::prev(MBB.end());
6769
6770 // Insert a save before the outlined region
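// i.e. "str x30, [sp, #-16]!", a pre-indexed store of LR.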
6771 MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
6772 .addReg(AArch64::SP, RegState::Define)
6773 .addReg(AArch64::LR)
6774 .addReg(AArch64::SP)
6775 .addImm(-16);
6776 It = MBB.insert(It, STRXpre);
6777
6778 const TargetSubtargetInfo &STI = MF.getSubtarget();
6779 const MCRegisterInfo *MRI = STI.getRegisterInfo();
6780 unsigned DwarfReg = MRI->getDwarfRegNum(AArch64::LR, true);
6781
6782 // Add a CFI saying the stack was moved 16 B down.
6783 int64_t StackPosEntry =
6784 MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 16));
6785 BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
6786 .addCFIIndex(StackPosEntry)
6787 .setMIFlags(MachineInstr::FrameSetup);
6788
6789 // Add a CFI saying that the LR that we want to find is now 16 B higher than
6790 // before.
6791 int64_t LRPosEntry =
6792 MF.addFrameInst(MCCFIInstruction::createOffset(nullptr, DwarfReg, -16));
6793 BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
6794 .addCFIIndex(LRPosEntry)
6795 .setMIFlags(MachineInstr::FrameSetup);
6796
6797 // Insert a restore before the terminator for the function.
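// i.e. "ldr x30, [sp], #16", a post-indexed reload of LR.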
6798 MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
6799 .addReg(AArch64::SP, RegState::Define)
6800 .addReg(AArch64::LR, RegState::Define)
6801 .addReg(AArch64::SP)
6802 .addImm(16);
6803 Et = MBB.insert(Et, LDRXpost);
6804 }
6805
6806 // If a bunch of candidates reach this point they must agree on their return
6807 // address signing. It is therefore enough to just consider the signing
6808 // behaviour of one of them
6809 const auto &MFI = *OF.Candidates.front().getMF()->getInfo<AArch64FunctionInfo>();
6810 bool ShouldSignReturnAddr = MFI.shouldSignReturnAddress(!IsLeafFunction);
6811
6812 // a_key is the default
6813 bool ShouldSignReturnAddrWithAKey = !MFI.shouldSignWithBKey();
6814
6815 // If this is a tail call outlined function, then there's already a return.
6816 if (OF.FrameConstructionID == MachineOutlinerTailCall ||
6817 OF.FrameConstructionID == MachineOutlinerThunk) {
6818 signOutlinedFunction(MF, MBB, ShouldSignReturnAddr,
6819 ShouldSignReturnAddrWithAKey);
6820 return;
6821 }
6822
6823 // It's not a tail call, so we have to insert the return ourselves.
6824
6825 // LR has to be a live in so that we can return to it.
6826 if (!MBB.isLiveIn(AArch64::LR))
6827 MBB.addLiveIn(AArch64::LR);
6828
6829 MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET))
6830 .addReg(AArch64::LR);
6831 MBB.insert(MBB.end(), ret);
6832
6833 signOutlinedFunction(MF, MBB, ShouldSignReturnAddr,
6834 ShouldSignReturnAddrWithAKey);
6835
6836 FI->setOutliningStyle("Function");
6837
6838 // Did we have to modify the stack by saving the link register?
6839 if (OF.FrameConstructionID != MachineOutlinerDefault)
6840 return;
6841
6842 // We modified the stack.
6843 // Walk over the basic block and fix up all the stack accesses.
6844 fixupPostOutline(MBB);
6845 }
6846
6847 MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
6848 Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,
6849 MachineFunction &MF, const outliner::Candidate &C) const {
6850
6851 // Are we tail calling?
6852 if (C.CallConstructionID == MachineOutlinerTailCall) {
6853 // If yes, then we can just branch to the label.
6854 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::TCRETURNdi))
6855 .addGlobalAddress(M.getNamedValue(MF.getName()))
6856 .addImm(0));
6857 return It;
6858 }
6859
6860 // Are we saving the link register?
6861 if (C.CallConstructionID == MachineOutlinerNoLRSave ||
6862 C.CallConstructionID == MachineOutlinerThunk) {
6863 // No, so just insert the call.
6864 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
6865 .addGlobalAddress(M.getNamedValue(MF.getName())));
6866 return It;
6867 }
6868
6869 // We want to return the spot where we inserted the call.
6870 MachineBasicBlock::iterator CallPt;
6871
6872 // Instructions for saving and restoring LR around the call instruction we're
6873 // going to insert.
6874 MachineInstr *Save;
6875 MachineInstr *Restore;
6876 // Can we save to a register?
6877 if (C.CallConstructionID == MachineOutlinerRegSave) {
6878 // FIXME: This logic should be sunk into a target-specific interface so that
6879 // we don't have to recompute the register.
6880 unsigned Reg = findRegisterToSaveLRTo(C);
6881 assert(Reg != 0 && "No callee-saved register available?");
6882
6883 // Save and restore LR from that register.
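// i.e. "mov <Reg>, x30" before the call and "mov x30, <Reg>" after it; ORRXrs
// with XZR as the first source is the canonical register-to-register move.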
6884 Save = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), Reg)
6885 .addReg(AArch64::XZR)
6886 .addReg(AArch64::LR)
6887 .addImm(0);
6888 Restore = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), AArch64::LR)
6889 .addReg(AArch64::XZR)
6890 .addReg(Reg)
6891 .addImm(0);
6892 } else {
6893 // We have the default case. Save and restore from SP.
6894 Save = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
6895 .addReg(AArch64::SP, RegState::Define)
6896 .addReg(AArch64::LR)
6897 .addReg(AArch64::SP)
6898 .addImm(-16);
6899 Restore = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
6900 .addReg(AArch64::SP, RegState::Define)
6901 .addReg(AArch64::LR, RegState::Define)
6902 .addReg(AArch64::SP)
6903 .addImm(16);
6904 }
6905
6906 It = MBB.insert(It, Save);
6907 It++;
6908
6909 // Insert the call.
6910 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
6911 .addGlobalAddress(M.getNamedValue(MF.getName())));
6912 CallPt = It;
6913 It++;
6914
6915 It = MBB.insert(It, Restore);
6916 return CallPt;
6917 }
6918
6919 bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault(
6920 MachineFunction &MF) const {
6921 return MF.getFunction().hasMinSize();
6922 }
6923
6924 Optional<DestSourcePair>
6925 AArch64InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
6926
6927 // AArch64::ORRWrs and AArch64::ORRXrs with WZR/XZR as the first source
6928 // register and a zero shift amount are used as an alias for the mov instruction.
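// Illustrative example: "orr w0, wzr, w1" is the alias form of "mov w0, w1",
// so the destination/source pair below is (w0, w1).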
6929 if (MI.getOpcode() == AArch64::ORRWrs &&
6930 MI.getOperand(1).getReg() == AArch64::WZR &&
6931 MI.getOperand(3).getImm() == 0x0) {
6932 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
6933 }
6934
6935 if (MI.getOpcode() == AArch64::ORRXrs &&
6936 MI.getOperand(1).getReg() == AArch64::XZR &&
6937 MI.getOperand(3).getImm() == 0x0) {
6938 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
6939 }
6940
6941 return None;
6942 }
6943
6944 Optional<RegImmPair> AArch64InstrInfo::isAddImmediate(const MachineInstr &MI,
6945 Register Reg) const {
6946 int Sign = 1;
6947 int64_t Offset = 0;
6948
6949 // TODO: Handle cases where Reg is a super- or sub-register of the
6950 // destination register.
6951 const MachineOperand &Op0 = MI.getOperand(0);
6952 if (!Op0.isReg() || Reg != Op0.getReg())
6953 return None;
6954
6955 switch (MI.getOpcode()) {
6956 default:
6957 return None;
6958 case AArch64::SUBWri:
6959 case AArch64::SUBXri:
6960 case AArch64::SUBSWri:
6961 case AArch64::SUBSXri:
6962 Sign *= -1;
6963 LLVM_FALLTHROUGH;
6964 case AArch64::ADDSWri:
6965 case AArch64::ADDSXri:
6966 case AArch64::ADDWri:
6967 case AArch64::ADDXri: {
6968 // TODO: Third operand can be global address (usually some string).
6969 if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg() ||
6970 !MI.getOperand(2).isImm())
6971 return None;
6972 int Shift = MI.getOperand(3).getImm();
6973 assert((Shift == 0 || Shift == 12) && "Shift can be either 0 or 12");
6974 Offset = Sign * (MI.getOperand(2).getImm() << Shift);
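// Illustrative example: "add x0, x1, #3, lsl #12" yields
// Offset = 3 << 12 = 12288 for Reg == x0.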
6975 }
6976 }
6977 return RegImmPair{MI.getOperand(1).getReg(), Offset};
6978 }
6979
6980 /// If the given ORR instruction is a copy, and \p DescribedReg overlaps with
6981 /// the destination register then, if possible, describe the value in terms of
6982 /// the source register.
6983 static Optional<ParamLoadedValue>
6984 describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg,
6985 const TargetInstrInfo *TII,
6986 const TargetRegisterInfo *TRI) {
6987 auto DestSrc = TII->isCopyInstr(MI);
6988 if (!DestSrc)
6989 return None;
6990
6991 Register DestReg = DestSrc->Destination->getReg();
6992 Register SrcReg = DestSrc->Source->getReg();
6993
6994 auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});
6995
6996 // If the described register is the destination, just return the source.
6997 if (DestReg == DescribedReg)
6998 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
6999
7000 // ORRWrs zero-extends to 64-bits, so we need to consider such cases.
7001 if (MI.getOpcode() == AArch64::ORRWrs &&
7002 TRI->isSuperRegister(DestReg, DescribedReg))
7003 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
7004
7005 // We may need to describe the lower part of a ORRXrs move.
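// Illustrative example: if DescribedReg is W0 and MI is "ORRXrs X0, XZR, X1"
// (i.e. "mov x0, x1"), the value of W0 is described by W1.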
7006 if (MI.getOpcode() == AArch64::ORRXrs &&
7007 TRI->isSubRegister(DestReg, DescribedReg)) {
7008 Register SrcSubReg = TRI->getSubReg(SrcReg, AArch64::sub_32);
7009 return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr);
7010 }
7011
7012 assert(!TRI->isSuperOrSubRegisterEq(DestReg, DescribedReg) &&
7013 "Unhandled ORR[XW]rs copy case");
7014
7015 return None;
7016 }
7017
7018 Optional<ParamLoadedValue>
7019 AArch64InstrInfo::describeLoadedValue(const MachineInstr &MI,
7020 Register Reg) const {
7021 const MachineFunction *MF = MI.getMF();
7022 const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
7023 switch (MI.getOpcode()) {
7024 case AArch64::MOVZWi:
7025 case AArch64::MOVZXi: {
7026 // MOVZWi may be used for producing zero-extended 32-bit immediates in
7027 // 64-bit parameters, so we need to consider super-registers.
7028 if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
7029 return None;
7030
7031 if (!MI.getOperand(1).isImm())
7032 return None;
7033 int64_t Immediate = MI.getOperand(1).getImm();
7034 int Shift = MI.getOperand(2).getImm();
7035 return ParamLoadedValue(MachineOperand::CreateImm(Immediate << Shift),
7036 nullptr);
7037 }
7038 case AArch64::ORRWrs:
7039 case AArch64::ORRXrs:
7040 return describeORRLoadedValue(MI, Reg, this, TRI);
7041 }
7042
7043 return TargetInstrInfo::describeLoadedValue(MI, Reg);
7044 }
7045
7046 uint64_t AArch64InstrInfo::getElementSizeForOpcode(unsigned Opc) const {
7047 return get(Opc).TSFlags & AArch64::ElementSizeMask;
7048 }
7049
7050 unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) {
7051 if (MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr())
7052 return AArch64::BLRNoIP;
7053 else
7054 return AArch64::BLR;
7055 }
7056
7057 #define GET_INSTRINFO_HELPERS
7058 #define GET_INSTRMAP_INFO
7059 #include "AArch64GenInstrInfo.inc"
7060