//===-- AArch64Subtarget.cpp - AArch64 Subtarget Information ----*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the AArch64 specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AArch64Subtarget.h"

#include "AArch64.h"
#include "AArch64InstrInfo.h"
#include "AArch64PBQPRegAlloc.h"
#include "AArch64TargetMachine.h"
#include "GISel/AArch64CallLowering.h"
#include "GISel/AArch64LegalizerInfo.h"
#include "GISel/AArch64RegisterBankInfo.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Support/TargetParser.h"

using namespace llvm;

#define DEBUG_TYPE "aarch64-subtarget"
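
// Pull in the TableGen-generated subtarget constructor and the CPU/feature
// target description tables from AArch64GenSubtargetInfo.inc.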
#define GET_SUBTARGETINFO_CTOR
#define GET_SUBTARGETINFO_TARGET_DESC
#include "AArch64GenSubtargetInfo.inc"

static cl::opt<bool>
EnableEarlyIfConvert("aarch64-early-ifcvt", cl::desc("Enable the early if "
                     "converter pass"), cl::init(true), cl::Hidden);

// If OS supports TBI, use this flag to enable it.
static cl::opt<bool>
UseAddressTopByteIgnored("aarch64-use-tbi", cl::desc("Assume that top byte of "
                         "an address is ignored"), cl::init(false), cl::Hidden);

static cl::opt<bool>
    UseNonLazyBind("aarch64-enable-nonlazybind",
                   cl::desc("Call nonlazybind functions via direct GOT load"),
                   cl::init(false), cl::Hidden);

static cl::opt<unsigned> SVEVectorBitsMax(
    "aarch64-sve-vector-bits-max",
    cl::desc("Assume SVE vector registers are at most this big, "
             "with zero meaning no maximum size is assumed."),
    cl::init(0), cl::Hidden);

static cl::opt<unsigned> SVEVectorBitsMin(
    "aarch64-sve-vector-bits-min",
    cl::desc("Assume SVE vector registers are at least this big, "
             "with zero meaning no minimum size is assumed."),
    cl::init(0), cl::Hidden);

AArch64Subtarget &
AArch64Subtarget::initializeSubtargetDependencies(StringRef FS,
                                                  StringRef CPUString) {
  // Determine default and user-specified characteristics

  if (CPUString.empty())
    CPUString = "generic";

  ParseSubtargetFeatures(CPUString, /*TuneCPU*/ CPUString, FS);
  initializeProperties();

  return *this;
}

void AArch64Subtarget::initializeProperties() {
  // Initialize CPU specific properties. We should add a tablegen feature for
  // this in the future so we can specify it together with the subtarget
  // features.
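  // Note that the Pref*LogAlignment values are log2 byte counts, e.g. a
  // PrefFunctionLogAlignment of 4 requests 16-byte function alignment.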
  switch (ARMProcFamily) {
  case Others:
    break;
  case Carmel:
    CacheLineSize = 64;
    break;
  case CortexA35:
    break;
  case CortexA53:
    PrefFunctionLogAlignment = 3;
    break;
  case CortexA55:
    break;
  case CortexA57:
    MaxInterleaveFactor = 4;
    PrefFunctionLogAlignment = 4;
    break;
  case CortexA65:
    PrefFunctionLogAlignment = 3;
    break;
  case CortexA72:
  case CortexA73:
  case CortexA75:
  case CortexA76:
  case CortexA77:
  case CortexA78:
  case CortexR82:
  case CortexX1:
    PrefFunctionLogAlignment = 4;
    break;
  case A64FX:
    CacheLineSize = 256;
    PrefFunctionLogAlignment = 5;
    PrefLoopLogAlignment = 5;
    break;
  case AppleA7:
  case AppleA10:
  case AppleA11:
  case AppleA12:
  case AppleA13:
    CacheLineSize = 64;
    PrefetchDistance = 280;
    MinPrefetchStride = 2048;
    MaxPrefetchIterationsAhead = 3;
    break;
  case ExynosM3:
    MaxInterleaveFactor = 4;
    MaxJumpTableSize = 20;
    PrefFunctionLogAlignment = 5;
    PrefLoopLogAlignment = 4;
    break;
  case Falkor:
    MaxInterleaveFactor = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    CacheLineSize = 128;
    PrefetchDistance = 820;
    MinPrefetchStride = 2048;
    MaxPrefetchIterationsAhead = 8;
    break;
  case Kryo:
    MaxInterleaveFactor = 4;
    VectorInsertExtractBaseCost = 2;
    CacheLineSize = 128;
    PrefetchDistance = 740;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 11;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case NeoverseE1:
    PrefFunctionLogAlignment = 3;
    break;
  case NeoverseN1:
  case NeoverseN2:
  case NeoverseV1:
    PrefFunctionLogAlignment = 4;
    break;
  case Saphira:
    MaxInterleaveFactor = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX2T99:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX:
  case ThunderXT88:
  case ThunderXT81:
  case ThunderXT83:
    CacheLineSize = 128;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case TSV110:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 2;
    break;
  case ThunderX3T110:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  }
}

AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,
                                   const std::string &FS,
                                   const TargetMachine &TM, bool LittleEndian)
    : AArch64GenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS),
      ReserveXRegister(AArch64::GPR64commonRegClass.getNumRegs()),
      CustomCallSavedXRegs(AArch64::GPR64commonRegClass.getNumRegs()),
      IsLittle(LittleEndian),
      TargetTriple(TT), FrameLowering(),
      InstrInfo(initializeSubtargetDependencies(FS, CPU)), TSInfo(),
      TLInfo(TM, *this) {
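  // Some OSes (e.g. Darwin and Windows) treat X18 as a reserved platform
  // register, so take it out of the allocatable set there by default.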
  if (AArch64::isX18ReservedByDefault(TT))
    ReserveXRegister.set(18);

  CallLoweringInfo.reset(new AArch64CallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AArch64LegalizerInfo(*this));

  auto *RBI = new AArch64RegisterBankInfo(*getRegisterInfo());

  // FIXME: At this point, we can't rely on Subtarget having RBI.
  // It's awkward to mix passing RBI and the Subtarget; should we pass
  // TII/TRI as well?
  InstSelector.reset(createAArch64InstructionSelector(
      *static_cast<const AArch64TargetMachine *>(&TM), *this, *RBI));

  RegBankInfo.reset(RBI);
}

const CallLowering *AArch64Subtarget::getCallLowering() const {
  return CallLoweringInfo.get();
}

const InlineAsmLowering *AArch64Subtarget::getInlineAsmLowering() const {
  return InlineAsmLoweringInfo.get();
}

InstructionSelector *AArch64Subtarget::getInstructionSelector() const {
  return InstSelector.get();
}

const LegalizerInfo *AArch64Subtarget::getLegalizerInfo() const {
  return Legalizer.get();
}

const RegisterBankInfo *AArch64Subtarget::getRegBankInfo() const {
  return RegBankInfo.get();
}

/// Find the target operand flags that describe how a global value should be
/// referenced for the current subtarget.
unsigned
AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
                                          const TargetMachine &TM) const {
  // MachO large model always goes via a GOT, simply to get a single 8-byte
  // absolute relocation on all global addresses.
  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO())
    return AArch64II::MO_GOT;

  if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) {
    if (GV->hasDLLImportStorageClass())
      return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORT;
    if (getTargetTriple().isOSWindows())
      return AArch64II::MO_GOT | AArch64II::MO_COFFSTUB;
    return AArch64II::MO_GOT;
  }

  // The small code model's direct accesses use ADRP, which cannot
  // necessarily produce the value 0 (if the code is above 4GB).
  // Same for the tiny code model, where we have a pc relative LDR.
  if ((useSmallAddressing() || TM.getCodeModel() == CodeModel::Tiny) &&
      GV->hasExternalWeakLinkage())
    return AArch64II::MO_GOT;

  // References to tagged globals are marked with MO_NC | MO_TAGGED to indicate
  // that their nominal addresses are tagged and outside of the code model. In
  // AArch64ExpandPseudo::expandMI we emit an additional instruction to set the
  // tag if necessary based on MO_TAGGED.
  if (AllowTaggedGlobals && !isa<FunctionType>(GV->getValueType()))
    return AArch64II::MO_NC | AArch64II::MO_TAGGED;

  return AArch64II::MO_NO_FLAG;
}

unsigned AArch64Subtarget::classifyGlobalFunctionReference(
    const GlobalValue *GV, const TargetMachine &TM) const {
  // MachO large model always goes via a GOT, because we don't have the
  // relocations available to do anything else.
  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO() &&
      !GV->hasInternalLinkage())
    return AArch64II::MO_GOT;

  // NonLazyBind goes via GOT unless we know it's available locally.
  auto *F = dyn_cast<Function>(GV);
  if (UseNonLazyBind && F && F->hasFnAttribute(Attribute::NonLazyBind) &&
      !TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
    return AArch64II::MO_GOT;

  // Use ClassifyGlobalReference for setting MO_DLLIMPORT/MO_COFFSTUB.
  if (getTargetTriple().isOSWindows())
    return ClassifyGlobalReference(GV, TM);

  return AArch64II::MO_NO_FLAG;
}

void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                           unsigned NumRegionInstrs) const {
  // LNT runs (at least on Cyclone) showed reasonably significant gains for
  // bi-directional scheduling, e.g. on 253.perlbmk.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;
  // Enabling or disabling the latency heuristic is a close call: it seems to
  // help nearly no benchmark on out-of-order architectures, while regressing
  // register pressure on a few.
  Policy.DisableLatencyHeuristic = DisableLatencySchedHeuristic;
}

bool AArch64Subtarget::enableEarlyIfConversion() const {
  return EnableEarlyIfConvert;
}

bool AArch64Subtarget::supportsAddressTopByteIgnored() const {
  if (!UseAddressTopByteIgnored)
    return false;

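  // Only assume the top-byte-ignore (TBI) feature is usable on iOS 8.0 and
  // later.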
  if (TargetTriple.isiOS()) {
    unsigned Major, Minor, Micro;
    TargetTriple.getiOSVersion(Major, Minor, Micro);
    return Major >= 8;
  }

  return false;
}

std::unique_ptr<PBQPRAConstraint>
AArch64Subtarget::getCustomPBQPConstraints() const {
  return balanceFPOps() ? std::make_unique<A57ChainingConstraint>() : nullptr;
}

void AArch64Subtarget::mirFileLoaded(MachineFunction &MF) const {
  // We usually compute max call frame size after ISel. Do the computation now
  // if the .mir file didn't specify it. Note that this will probably give you
  // bogus values after PEI has eliminated the call frame setup/destroy pseudo
  // instructions; specify it explicitly if you need the value to be correct.
  MachineFrameInfo &MFI = MF.getFrameInfo();
  if (!MFI.isMaxCallFrameSizeComputed())
    MFI.computeMaxCallFrameSize(MF);
}

unsigned AArch64Subtarget::getMaxSVEVectorSizeInBits() const {
  assert(HasSVE && "Tried to get SVE vector length without SVE support!");
  assert(SVEVectorBitsMax % 128 == 0 &&
         "SVE requires vector length in multiples of 128!");
  assert((SVEVectorBitsMax >= SVEVectorBitsMin || SVEVectorBitsMax == 0) &&
         "Minimum SVE vector size should not be larger than its maximum!");
  if (SVEVectorBitsMax == 0)
    return 0;
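  // Clamp to at least the requested minimum and round down to a multiple of
  // 128 bits, the SVE vector-length granule.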
  return (std::max(SVEVectorBitsMin, SVEVectorBitsMax) / 128) * 128;
}

unsigned AArch64Subtarget::getMinSVEVectorSizeInBits() const {
  assert(HasSVE && "Tried to get SVE vector length without SVE support!");
  assert(SVEVectorBitsMin % 128 == 0 &&
         "SVE requires vector length in multiples of 128!");
  assert((SVEVectorBitsMax >= SVEVectorBitsMin || SVEVectorBitsMax == 0) &&
         "Minimum SVE vector size should not be larger than its maximum!");
  if (SVEVectorBitsMax == 0)
    return (SVEVectorBitsMin / 128) * 128;
  return (std::min(SVEVectorBitsMin, SVEVectorBitsMax) / 128) * 128;
}

bool AArch64Subtarget::useSVEForFixedLengthVectors() const {
  // Prefer NEON unless larger SVE registers are available.
  return hasSVE() && getMinSVEVectorSizeInBits() >= 256;
}