• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 //=== A15SDOptimizerPass.cpp - Optimize DPR and SPR register accesses on A15==//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // The Cortex-A15 processor employs a tracking scheme in its register renaming
11 // in order to process each instruction's micro-ops speculatively and
12 // out-of-order with appropriate forwarding. The ARM architecture allows VFP
13 // instructions to read and write 32-bit S-registers.  Each S-register
14 // corresponds to one half (upper or lower) of an overlaid 64-bit D-register.
15 //
16 // There are several instruction patterns which can be used to provide this
17 // capability which can provide higher performance than other, potentially more
18 // direct patterns, specifically around when one micro-op reads a D-register
19 // operand that has recently been written as one or more S-register results.
20 //
// This file defines a pre-regalloc pass which looks for SPR producers which
// are going to be used by DPR (or QPR) consumers and creates the more
// optimized access pattern.
24 //
25 //===----------------------------------------------------------------------===//
26 
27 #include "ARM.h"
28 #include "ARMBaseInstrInfo.h"
29 #include "ARMBaseRegisterInfo.h"
30 #include "ARMSubtarget.h"
31 #include "llvm/ADT/Statistic.h"
32 #include "llvm/CodeGen/MachineFunction.h"
33 #include "llvm/CodeGen/MachineFunctionPass.h"
34 #include "llvm/CodeGen/MachineInstr.h"
35 #include "llvm/CodeGen/MachineInstrBuilder.h"
36 #include "llvm/CodeGen/MachineRegisterInfo.h"
37 #include "llvm/CodeGen/TargetRegisterInfo.h"
38 #include "llvm/CodeGen/TargetSubtargetInfo.h"
39 #include "llvm/Support/Debug.h"
40 #include "llvm/Support/raw_ostream.h"
41 #include <map>
42 #include <set>
43 
44 using namespace llvm;
45 
46 #define DEBUG_TYPE "a15-sd-optimizer"
47 
48 namespace {
49   struct A15SDOptimizer : public MachineFunctionPass {
50     static char ID;
A15SDOptimizer__anon893fac390111::A15SDOptimizer51     A15SDOptimizer() : MachineFunctionPass(ID) {}
52 
53     bool runOnMachineFunction(MachineFunction &Fn) override;
54 
getPassName__anon893fac390111::A15SDOptimizer55     StringRef getPassName() const override { return "ARM A15 S->D optimizer"; }
56 
57   private:
58     const ARMBaseInstrInfo *TII;
59     const TargetRegisterInfo *TRI;
60     MachineRegisterInfo *MRI;
61 
62     bool runOnInstruction(MachineInstr *MI);
63 
64     //
65     // Instruction builder helpers
66     //
67     unsigned createDupLane(MachineBasicBlock &MBB,
68                            MachineBasicBlock::iterator InsertBefore,
69                            const DebugLoc &DL, unsigned Reg, unsigned Lane,
70                            bool QPR = false);
71 
72     unsigned createExtractSubreg(MachineBasicBlock &MBB,
73                                  MachineBasicBlock::iterator InsertBefore,
74                                  const DebugLoc &DL, unsigned DReg,
75                                  unsigned Lane, const TargetRegisterClass *TRC);
76 
77     unsigned createVExt(MachineBasicBlock &MBB,
78                         MachineBasicBlock::iterator InsertBefore,
79                         const DebugLoc &DL, unsigned Ssub0, unsigned Ssub1);
80 
81     unsigned createRegSequence(MachineBasicBlock &MBB,
82                                MachineBasicBlock::iterator InsertBefore,
83                                const DebugLoc &DL, unsigned Reg1,
84                                unsigned Reg2);
85 
86     unsigned createInsertSubreg(MachineBasicBlock &MBB,
87                                 MachineBasicBlock::iterator InsertBefore,
88                                 const DebugLoc &DL, unsigned DReg,
89                                 unsigned Lane, unsigned ToInsert);
90 
91     unsigned createImplicitDef(MachineBasicBlock &MBB,
92                                MachineBasicBlock::iterator InsertBefore,
93                                const DebugLoc &DL);
94 
95     //
96     // Various property checkers
97     //
98     bool usesRegClass(MachineOperand &MO, const TargetRegisterClass *TRC);
99     bool hasPartialWrite(MachineInstr *MI);
100     SmallVector<unsigned, 8> getReadDPRs(MachineInstr *MI);
101     unsigned getDPRLaneFromSPR(unsigned SReg);
102 
103     //
104     // Methods used for getting the definitions of partial registers
105     //
106 
107     MachineInstr *elideCopies(MachineInstr *MI);
108     void elideCopiesAndPHIs(MachineInstr *MI,
109                             SmallVectorImpl<MachineInstr*> &Outs);
110 
111     //
112     // Pattern optimization methods
113     //
114     unsigned optimizeAllLanesPattern(MachineInstr *MI, unsigned Reg);
115     unsigned optimizeSDPattern(MachineInstr *MI);
116     unsigned getPrefSPRLane(unsigned SReg);
117 
118     //
119     // Sanitizing method - used to make sure if don't leave dead code around.
120     //
121     void eraseInstrWithNoUses(MachineInstr *MI);
122 
123     //
124     // A map used to track the changes done by this pass.
125     //
126     std::map<MachineInstr*, unsigned> Replacements;
127     std::set<MachineInstr *> DeadInstr;
128   };
129   char A15SDOptimizer::ID = 0;
130 } // end anonymous namespace
131 
132 // Returns true if this is a use of a SPR register.
usesRegClass(MachineOperand & MO,const TargetRegisterClass * TRC)133 bool A15SDOptimizer::usesRegClass(MachineOperand &MO,
134                                   const TargetRegisterClass *TRC) {
135   if (!MO.isReg())
136     return false;
137   unsigned Reg = MO.getReg();
138 
139   if (TargetRegisterInfo::isVirtualRegister(Reg))
140     return MRI->getRegClass(Reg)->hasSuperClassEq(TRC);
141   else
142     return TRC->contains(Reg);
143 }
144 
getDPRLaneFromSPR(unsigned SReg)145 unsigned A15SDOptimizer::getDPRLaneFromSPR(unsigned SReg) {
146   unsigned DReg = TRI->getMatchingSuperReg(SReg, ARM::ssub_1,
147                                            &ARM::DPRRegClass);
148   if (DReg != ARM::NoRegister) return ARM::ssub_1;
149   return ARM::ssub_0;
150 }
151 
152 // Get the subreg type that is most likely to be coalesced
153 // for an SPR register that will be used in VDUP32d pseudo.
getPrefSPRLane(unsigned SReg)154 unsigned A15SDOptimizer::getPrefSPRLane(unsigned SReg) {
155   if (!TRI->isVirtualRegister(SReg))
156     return getDPRLaneFromSPR(SReg);
157 
158   MachineInstr *MI = MRI->getVRegDef(SReg);
159   if (!MI) return ARM::ssub_0;
160   MachineOperand *MO = MI->findRegisterDefOperand(SReg);
161 
162   assert(MO->isReg() && "Non-register operand found!");
163   if (!MO) return ARM::ssub_0;
164 
165   if (MI->isCopy() && usesRegClass(MI->getOperand(1),
166                                     &ARM::SPRRegClass)) {
167     SReg = MI->getOperand(1).getReg();
168   }
169 
170   if (TargetRegisterInfo::isVirtualRegister(SReg)) {
171     if (MO->getSubReg() == ARM::ssub_1) return ARM::ssub_1;
172     return ARM::ssub_0;
173   }
174   return getDPRLaneFromSPR(SReg);
175 }
176 
177 // MI is known to be dead. Figure out what instructions
178 // are also made dead by this and mark them for removal.
eraseInstrWithNoUses(MachineInstr * MI)179 void A15SDOptimizer::eraseInstrWithNoUses(MachineInstr *MI) {
180   SmallVector<MachineInstr *, 8> Front;
181   DeadInstr.insert(MI);
182 
183   LLVM_DEBUG(dbgs() << "Deleting base instruction " << *MI << "\n");
184   Front.push_back(MI);
185 
186   while (Front.size() != 0) {
187     MI = Front.back();
188     Front.pop_back();
189 
190     // MI is already known to be dead. We need to see
191     // if other instructions can also be removed.
192     for (MachineOperand &MO : MI->operands()) {
193       if ((!MO.isReg()) || (!MO.isUse()))
194         continue;
195       unsigned Reg = MO.getReg();
196       if (!TRI->isVirtualRegister(Reg))
197         continue;
198       MachineOperand *Op = MI->findRegisterDefOperand(Reg);
199 
200       if (!Op)
201         continue;
202 
203       MachineInstr *Def = Op->getParent();
204 
205       // We don't need to do anything if we have already marked
206       // this instruction as being dead.
207       if (DeadInstr.find(Def) != DeadInstr.end())
208         continue;
209 
210       // Check if all the uses of this instruction are marked as
211       // dead. If so, we can also mark this instruction as being
212       // dead.
213       bool IsDead = true;
214       for (MachineOperand &MODef : Def->operands()) {
215         if ((!MODef.isReg()) || (!MODef.isDef()))
216           continue;
217         unsigned DefReg = MODef.getReg();
218         if (!TRI->isVirtualRegister(DefReg)) {
219           IsDead = false;
220           break;
221         }
222         for (MachineInstr &Use : MRI->use_instructions(Reg)) {
223           // We don't care about self references.
224           if (&Use == Def)
225             continue;
226           if (DeadInstr.find(&Use) == DeadInstr.end()) {
227             IsDead = false;
228             break;
229           }
230         }
231       }
232 
233       if (!IsDead) continue;
234 
235       LLVM_DEBUG(dbgs() << "Deleting instruction " << *Def << "\n");
236       DeadInstr.insert(Def);
237     }
238   }
239 }
240 
241 // Creates the more optimized patterns and generally does all the code
242 // transformations in this pass.
optimizeSDPattern(MachineInstr * MI)243 unsigned A15SDOptimizer::optimizeSDPattern(MachineInstr *MI) {
244   if (MI->isCopy()) {
245     return optimizeAllLanesPattern(MI, MI->getOperand(1).getReg());
246   }
247 
248   if (MI->isInsertSubreg()) {
249     unsigned DPRReg = MI->getOperand(1).getReg();
250     unsigned SPRReg = MI->getOperand(2).getReg();
251 
252     if (TRI->isVirtualRegister(DPRReg) && TRI->isVirtualRegister(SPRReg)) {
253       MachineInstr *DPRMI = MRI->getVRegDef(MI->getOperand(1).getReg());
254       MachineInstr *SPRMI = MRI->getVRegDef(MI->getOperand(2).getReg());
255 
256       if (DPRMI && SPRMI) {
257         // See if the first operand of this insert_subreg is IMPLICIT_DEF
258         MachineInstr *ECDef = elideCopies(DPRMI);
259         if (ECDef && ECDef->isImplicitDef()) {
260           // Another corner case - if we're inserting something that is purely
261           // a subreg copy of a DPR, just use that DPR.
262 
263           MachineInstr *EC = elideCopies(SPRMI);
264           // Is it a subreg copy of ssub_0?
265           if (EC && EC->isCopy() &&
266               EC->getOperand(1).getSubReg() == ARM::ssub_0) {
267             LLVM_DEBUG(dbgs() << "Found a subreg copy: " << *SPRMI);
268 
269             // Find the thing we're subreg copying out of - is it of the same
270             // regclass as DPRMI? (i.e. a DPR or QPR).
271             unsigned FullReg = SPRMI->getOperand(1).getReg();
272             const TargetRegisterClass *TRC =
273               MRI->getRegClass(MI->getOperand(1).getReg());
274             if (TRC->hasSuperClassEq(MRI->getRegClass(FullReg))) {
275               LLVM_DEBUG(dbgs() << "Subreg copy is compatible - returning ");
276               LLVM_DEBUG(dbgs() << printReg(FullReg) << "\n");
277               eraseInstrWithNoUses(MI);
278               return FullReg;
279             }
280           }
281 
282           return optimizeAllLanesPattern(MI, MI->getOperand(2).getReg());
283         }
284       }
285     }
286     return optimizeAllLanesPattern(MI, MI->getOperand(0).getReg());
287   }
288 
289   if (MI->isRegSequence() && usesRegClass(MI->getOperand(1),
290                                           &ARM::SPRRegClass)) {
291     // See if all bar one of the operands are IMPLICIT_DEF and insert the
292     // optimizer pattern accordingly.
293     unsigned NumImplicit = 0, NumTotal = 0;
294     unsigned NonImplicitReg = ~0U;
295 
296     for (unsigned I = 1; I < MI->getNumExplicitOperands(); ++I) {
297       if (!MI->getOperand(I).isReg())
298         continue;
299       ++NumTotal;
300       unsigned OpReg = MI->getOperand(I).getReg();
301 
302       if (!TRI->isVirtualRegister(OpReg))
303         break;
304 
305       MachineInstr *Def = MRI->getVRegDef(OpReg);
306       if (!Def)
307         break;
308       if (Def->isImplicitDef())
309         ++NumImplicit;
310       else
311         NonImplicitReg = MI->getOperand(I).getReg();
312     }
313 
314     if (NumImplicit == NumTotal - 1)
315       return optimizeAllLanesPattern(MI, NonImplicitReg);
316     else
317       return optimizeAllLanesPattern(MI, MI->getOperand(0).getReg());
318   }
319 
320   llvm_unreachable("Unhandled update pattern!");
321 }
322 
323 // Return true if this MachineInstr inserts a scalar (SPR) value into
324 // a D or Q register.
hasPartialWrite(MachineInstr * MI)325 bool A15SDOptimizer::hasPartialWrite(MachineInstr *MI) {
326   // The only way we can do a partial register update is through a COPY,
327   // INSERT_SUBREG or REG_SEQUENCE.
328   if (MI->isCopy() && usesRegClass(MI->getOperand(1), &ARM::SPRRegClass))
329     return true;
330 
331   if (MI->isInsertSubreg() && usesRegClass(MI->getOperand(2),
332                                            &ARM::SPRRegClass))
333     return true;
334 
335   if (MI->isRegSequence() && usesRegClass(MI->getOperand(1), &ARM::SPRRegClass))
336     return true;
337 
338   return false;
339 }
340 
341 // Looks through full copies to get the instruction that defines the input
342 // operand for MI.
elideCopies(MachineInstr * MI)343 MachineInstr *A15SDOptimizer::elideCopies(MachineInstr *MI) {
344   if (!MI->isFullCopy())
345     return MI;
346   if (!TRI->isVirtualRegister(MI->getOperand(1).getReg()))
347     return nullptr;
348   MachineInstr *Def = MRI->getVRegDef(MI->getOperand(1).getReg());
349   if (!Def)
350     return nullptr;
351   return elideCopies(Def);
352 }
353 
354 // Look through full copies and PHIs to get the set of non-copy MachineInstrs
355 // that can produce MI.
elideCopiesAndPHIs(MachineInstr * MI,SmallVectorImpl<MachineInstr * > & Outs)356 void A15SDOptimizer::elideCopiesAndPHIs(MachineInstr *MI,
357                                         SmallVectorImpl<MachineInstr*> &Outs) {
358    // Looking through PHIs may create loops so we need to track what
359    // instructions we have visited before.
360    std::set<MachineInstr *> Reached;
361    SmallVector<MachineInstr *, 8> Front;
362    Front.push_back(MI);
363    while (Front.size() != 0) {
364      MI = Front.back();
365      Front.pop_back();
366 
367      // If we have already explored this MachineInstr, ignore it.
368      if (Reached.find(MI) != Reached.end())
369        continue;
370      Reached.insert(MI);
371      if (MI->isPHI()) {
372        for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) {
373          unsigned Reg = MI->getOperand(I).getReg();
374          if (!TRI->isVirtualRegister(Reg)) {
375            continue;
376          }
377          MachineInstr *NewMI = MRI->getVRegDef(Reg);
378          if (!NewMI)
379            continue;
380          Front.push_back(NewMI);
381        }
382      } else if (MI->isFullCopy()) {
383        if (!TRI->isVirtualRegister(MI->getOperand(1).getReg()))
384          continue;
385        MachineInstr *NewMI = MRI->getVRegDef(MI->getOperand(1).getReg());
386        if (!NewMI)
387          continue;
388        Front.push_back(NewMI);
389      } else {
390        LLVM_DEBUG(dbgs() << "Found partial copy" << *MI << "\n");
391        Outs.push_back(MI);
392      }
393    }
394 }
395 
396 // Return the DPR virtual registers that are read by this machine instruction
397 // (if any).
getReadDPRs(MachineInstr * MI)398 SmallVector<unsigned, 8> A15SDOptimizer::getReadDPRs(MachineInstr *MI) {
399   if (MI->isCopyLike() || MI->isInsertSubreg() || MI->isRegSequence() ||
400       MI->isKill())
401     return SmallVector<unsigned, 8>();
402 
403   SmallVector<unsigned, 8> Defs;
404   for (MachineOperand &MO : MI->operands()) {
405     if (!MO.isReg() || !MO.isUse())
406       continue;
407     if (!usesRegClass(MO, &ARM::DPRRegClass) &&
408         !usesRegClass(MO, &ARM::QPRRegClass) &&
409         !usesRegClass(MO, &ARM::DPairRegClass)) // Treat DPair as QPR
410       continue;
411 
412     Defs.push_back(MO.getReg());
413   }
414   return Defs;
415 }
416 
417 // Creates a DPR register from an SPR one by using a VDUP.
createDupLane(MachineBasicBlock & MBB,MachineBasicBlock::iterator InsertBefore,const DebugLoc & DL,unsigned Reg,unsigned Lane,bool QPR)418 unsigned A15SDOptimizer::createDupLane(MachineBasicBlock &MBB,
419                                        MachineBasicBlock::iterator InsertBefore,
420                                        const DebugLoc &DL, unsigned Reg,
421                                        unsigned Lane, bool QPR) {
422   unsigned Out = MRI->createVirtualRegister(QPR ? &ARM::QPRRegClass :
423                                                   &ARM::DPRRegClass);
424   BuildMI(MBB, InsertBefore, DL,
425           TII->get(QPR ? ARM::VDUPLN32q : ARM::VDUPLN32d), Out)
426       .addReg(Reg)
427       .addImm(Lane)
428       .add(predOps(ARMCC::AL));
429 
430   return Out;
431 }
432 
433 // Creates a SPR register from a DPR by copying the value in lane 0.
createExtractSubreg(MachineBasicBlock & MBB,MachineBasicBlock::iterator InsertBefore,const DebugLoc & DL,unsigned DReg,unsigned Lane,const TargetRegisterClass * TRC)434 unsigned A15SDOptimizer::createExtractSubreg(
435     MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
436     const DebugLoc &DL, unsigned DReg, unsigned Lane,
437     const TargetRegisterClass *TRC) {
438   unsigned Out = MRI->createVirtualRegister(TRC);
439   BuildMI(MBB,
440           InsertBefore,
441           DL,
442           TII->get(TargetOpcode::COPY), Out)
443     .addReg(DReg, 0, Lane);
444 
445   return Out;
446 }
447 
448 // Takes two SPR registers and creates a DPR by using a REG_SEQUENCE.
createRegSequence(MachineBasicBlock & MBB,MachineBasicBlock::iterator InsertBefore,const DebugLoc & DL,unsigned Reg1,unsigned Reg2)449 unsigned A15SDOptimizer::createRegSequence(
450     MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
451     const DebugLoc &DL, unsigned Reg1, unsigned Reg2) {
452   unsigned Out = MRI->createVirtualRegister(&ARM::QPRRegClass);
453   BuildMI(MBB,
454           InsertBefore,
455           DL,
456           TII->get(TargetOpcode::REG_SEQUENCE), Out)
457     .addReg(Reg1)
458     .addImm(ARM::dsub_0)
459     .addReg(Reg2)
460     .addImm(ARM::dsub_1);
461   return Out;
462 }
463 
464 // Takes two DPR registers that have previously been VDUPed (Ssub0 and Ssub1)
465 // and merges them into one DPR register.
createVExt(MachineBasicBlock & MBB,MachineBasicBlock::iterator InsertBefore,const DebugLoc & DL,unsigned Ssub0,unsigned Ssub1)466 unsigned A15SDOptimizer::createVExt(MachineBasicBlock &MBB,
467                                     MachineBasicBlock::iterator InsertBefore,
468                                     const DebugLoc &DL, unsigned Ssub0,
469                                     unsigned Ssub1) {
470   unsigned Out = MRI->createVirtualRegister(&ARM::DPRRegClass);
471   BuildMI(MBB, InsertBefore, DL, TII->get(ARM::VEXTd32), Out)
472       .addReg(Ssub0)
473       .addReg(Ssub1)
474       .addImm(1)
475       .add(predOps(ARMCC::AL));
476   return Out;
477 }
478 
createInsertSubreg(MachineBasicBlock & MBB,MachineBasicBlock::iterator InsertBefore,const DebugLoc & DL,unsigned DReg,unsigned Lane,unsigned ToInsert)479 unsigned A15SDOptimizer::createInsertSubreg(
480     MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
481     const DebugLoc &DL, unsigned DReg, unsigned Lane, unsigned ToInsert) {
482   unsigned Out = MRI->createVirtualRegister(&ARM::DPR_VFP2RegClass);
483   BuildMI(MBB,
484           InsertBefore,
485           DL,
486           TII->get(TargetOpcode::INSERT_SUBREG), Out)
487     .addReg(DReg)
488     .addReg(ToInsert)
489     .addImm(Lane);
490 
491   return Out;
492 }
493 
494 unsigned
createImplicitDef(MachineBasicBlock & MBB,MachineBasicBlock::iterator InsertBefore,const DebugLoc & DL)495 A15SDOptimizer::createImplicitDef(MachineBasicBlock &MBB,
496                                   MachineBasicBlock::iterator InsertBefore,
497                                   const DebugLoc &DL) {
498   unsigned Out = MRI->createVirtualRegister(&ARM::DPRRegClass);
499   BuildMI(MBB,
500           InsertBefore,
501           DL,
502           TII->get(TargetOpcode::IMPLICIT_DEF), Out);
503   return Out;
504 }
505 
506 // This function inserts instructions in order to optimize interactions between
507 // SPR registers and DPR/QPR registers. It does so by performing VDUPs on all
508 // lanes, and the using VEXT instructions to recompose the result.
509 unsigned
optimizeAllLanesPattern(MachineInstr * MI,unsigned Reg)510 A15SDOptimizer::optimizeAllLanesPattern(MachineInstr *MI, unsigned Reg) {
511   MachineBasicBlock::iterator InsertPt(MI);
512   DebugLoc DL = MI->getDebugLoc();
513   MachineBasicBlock &MBB = *MI->getParent();
514   InsertPt++;
515   unsigned Out;
516 
517   // DPair has the same length as QPR and also has two DPRs as subreg.
518   // Treat DPair as QPR.
519   if (MRI->getRegClass(Reg)->hasSuperClassEq(&ARM::QPRRegClass) ||
520       MRI->getRegClass(Reg)->hasSuperClassEq(&ARM::DPairRegClass)) {
521     unsigned DSub0 = createExtractSubreg(MBB, InsertPt, DL, Reg,
522                                          ARM::dsub_0, &ARM::DPRRegClass);
523     unsigned DSub1 = createExtractSubreg(MBB, InsertPt, DL, Reg,
524                                          ARM::dsub_1, &ARM::DPRRegClass);
525 
526     unsigned Out1 = createDupLane(MBB, InsertPt, DL, DSub0, 0);
527     unsigned Out2 = createDupLane(MBB, InsertPt, DL, DSub0, 1);
528     Out = createVExt(MBB, InsertPt, DL, Out1, Out2);
529 
530     unsigned Out3 = createDupLane(MBB, InsertPt, DL, DSub1, 0);
531     unsigned Out4 = createDupLane(MBB, InsertPt, DL, DSub1, 1);
532     Out2 = createVExt(MBB, InsertPt, DL, Out3, Out4);
533 
534     Out = createRegSequence(MBB, InsertPt, DL, Out, Out2);
535 
536   } else if (MRI->getRegClass(Reg)->hasSuperClassEq(&ARM::DPRRegClass)) {
537     unsigned Out1 = createDupLane(MBB, InsertPt, DL, Reg, 0);
538     unsigned Out2 = createDupLane(MBB, InsertPt, DL, Reg, 1);
539     Out = createVExt(MBB, InsertPt, DL, Out1, Out2);
540 
541   } else {
542     assert(MRI->getRegClass(Reg)->hasSuperClassEq(&ARM::SPRRegClass) &&
543            "Found unexpected regclass!");
544 
545     unsigned PrefLane = getPrefSPRLane(Reg);
546     unsigned Lane;
547     switch (PrefLane) {
548       case ARM::ssub_0: Lane = 0; break;
549       case ARM::ssub_1: Lane = 1; break;
550       default: llvm_unreachable("Unknown preferred lane!");
551     }
552 
553     // Treat DPair as QPR
554     bool UsesQPR = usesRegClass(MI->getOperand(0), &ARM::QPRRegClass) ||
555                    usesRegClass(MI->getOperand(0), &ARM::DPairRegClass);
556 
557     Out = createImplicitDef(MBB, InsertPt, DL);
558     Out = createInsertSubreg(MBB, InsertPt, DL, Out, PrefLane, Reg);
559     Out = createDupLane(MBB, InsertPt, DL, Out, Lane, UsesQPR);
560     eraseInstrWithNoUses(MI);
561   }
562   return Out;
563 }
564 
runOnInstruction(MachineInstr * MI)565 bool A15SDOptimizer::runOnInstruction(MachineInstr *MI) {
566   // We look for instructions that write S registers that are then read as
567   // D/Q registers. These can only be caused by COPY, INSERT_SUBREG and
568   // REG_SEQUENCE pseudos that insert an SPR value into a DPR register or
569   // merge two SPR values to form a DPR register.  In order avoid false
570   // positives we make sure that there is an SPR producer so we look past
571   // COPY and PHI nodes to find it.
572   //
573   // The best code pattern for when an SPR producer is going to be used by a
574   // DPR or QPR consumer depends on whether the other lanes of the
575   // corresponding DPR/QPR are currently defined.
576   //
577   // We can handle these efficiently, depending on the type of
578   // pseudo-instruction that is producing the pattern
579   //
580   //   * COPY:          * VDUP all lanes and merge the results together
581   //                      using VEXTs.
582   //
583   //   * INSERT_SUBREG: * If the SPR value was originally in another DPR/QPR
584   //                      lane, and the other lane(s) of the DPR/QPR register
585   //                      that we are inserting in are undefined, use the
586   //                      original DPR/QPR value.
587   //                    * Otherwise, fall back on the same stategy as COPY.
588   //
589   //   * REG_SEQUENCE:  * If all except one of the input operands are
590   //                      IMPLICIT_DEFs, insert the VDUP pattern for just the
591   //                      defined input operand
592   //                    * Otherwise, fall back on the same stategy as COPY.
593   //
594 
595   // First, get all the reads of D-registers done by this instruction.
596   SmallVector<unsigned, 8> Defs = getReadDPRs(MI);
597   bool Modified = false;
598 
599   for (SmallVectorImpl<unsigned>::iterator I = Defs.begin(), E = Defs.end();
600      I != E; ++I) {
601     // Follow the def-use chain for this DPR through COPYs, and also through
602     // PHIs (which are essentially multi-way COPYs). It is because of PHIs that
603     // we can end up with multiple defs of this DPR.
604 
605     SmallVector<MachineInstr *, 8> DefSrcs;
606     if (!TRI->isVirtualRegister(*I))
607       continue;
608     MachineInstr *Def = MRI->getVRegDef(*I);
609     if (!Def)
610       continue;
611 
612     elideCopiesAndPHIs(Def, DefSrcs);
613 
614     for (MachineInstr *MI : DefSrcs) {
615       // If we've already analyzed and replaced this operand, don't do
616       // anything.
617       if (Replacements.find(MI) != Replacements.end())
618         continue;
619 
620       // Now, work out if the instruction causes a SPR->DPR dependency.
621       if (!hasPartialWrite(MI))
622         continue;
623 
624       // Collect all the uses of this MI's DPR def for updating later.
625       SmallVector<MachineOperand*, 8> Uses;
626       unsigned DPRDefReg = MI->getOperand(0).getReg();
627       for (MachineRegisterInfo::use_iterator I = MRI->use_begin(DPRDefReg),
628              E = MRI->use_end(); I != E; ++I)
629         Uses.push_back(&*I);
630 
631       // We can optimize this.
632       unsigned NewReg = optimizeSDPattern(MI);
633 
634       if (NewReg != 0) {
635         Modified = true;
636         for (SmallVectorImpl<MachineOperand *>::const_iterator I = Uses.begin(),
637                E = Uses.end(); I != E; ++I) {
638           // Make sure to constrain the register class of the new register to
639           // match what we're replacing. Otherwise we can optimize a DPR_VFP2
640           // reference into a plain DPR, and that will end poorly. NewReg is
641           // always virtual here, so there will always be a matching subclass
642           // to find.
643           MRI->constrainRegClass(NewReg, MRI->getRegClass((*I)->getReg()));
644 
645           LLVM_DEBUG(dbgs() << "Replacing operand " << **I << " with "
646                             << printReg(NewReg) << "\n");
647           (*I)->substVirtReg(NewReg, 0, *TRI);
648         }
649       }
650       Replacements[MI] = NewReg;
651     }
652   }
653   return Modified;
654 }
655 
runOnMachineFunction(MachineFunction & Fn)656 bool A15SDOptimizer::runOnMachineFunction(MachineFunction &Fn) {
657   if (skipFunction(Fn.getFunction()))
658     return false;
659 
660   const ARMSubtarget &STI = Fn.getSubtarget<ARMSubtarget>();
661   // Since the A15SDOptimizer pass can insert VDUP instructions, it can only be
662   // enabled when NEON is available.
663   if (!(STI.useSplatVFPToNeon() && STI.hasNEON()))
664     return false;
665 
666   TII = STI.getInstrInfo();
667   TRI = STI.getRegisterInfo();
668   MRI = &Fn.getRegInfo();
669   bool Modified = false;
670 
671   LLVM_DEBUG(dbgs() << "Running on function " << Fn.getName() << "\n");
672 
673   DeadInstr.clear();
674   Replacements.clear();
675 
676   for (MachineBasicBlock &MBB : Fn) {
677     for (MachineInstr &MI : MBB) {
678       Modified |= runOnInstruction(&MI);
679     }
680   }
681 
682   for (MachineInstr *MI : DeadInstr) {
683     MI->eraseFromParent();
684   }
685 
686   return Modified;
687 }
688 
createA15SDOptimizerPass()689 FunctionPass *llvm::createA15SDOptimizerPass() {
690   return new A15SDOptimizer();
691 }
692