//===------------------------- LSUnit.h --------------------------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// A Load/Store unit class that models load/store queues and that implements
/// a simple weak memory consistency model.
///
//===----------------------------------------------------------------------===//

#ifndef LLVM_MCA_LSUNIT_H
#define LLVM_MCA_LSUNIT_H

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/MCA/HardwareUnits/HardwareUnit.h"
#include "llvm/MCA/Instruction.h"

namespace llvm {
namespace mca {

class Scheduler;

/// A node of a memory dependency graph. A MemoryGroup describes a set of
/// instructions with the same memory dependencies.
///
/// By construction, instructions of a MemoryGroup don't depend on each other.
/// At dispatch stage, instructions are mapped by the LSUnit to MemoryGroups.
/// A MemoryGroup identifier is then stored as a "token" in field
/// Instruction::LSUTokenID of each dispatched instruction. That token is used
/// internally by the LSUnit to track memory dependencies.
class MemoryGroup {
  unsigned NumPredecessors;
  unsigned NumExecutingPredecessors;
  unsigned NumExecutedPredecessors;

  unsigned NumInstructions;
  unsigned NumExecuting;
  unsigned NumExecuted;
  SmallVector<MemoryGroup *, 4> Succ;

  CriticalDependency CriticalPredecessor;
  InstRef CriticalMemoryInstruction;

  MemoryGroup(const MemoryGroup &) = delete;
  MemoryGroup &operator=(const MemoryGroup &) = delete;

public:
  MemoryGroup()
      : NumPredecessors(0), NumExecutingPredecessors(0),
        NumExecutedPredecessors(0), NumInstructions(0), NumExecuting(0),
        NumExecuted(0), CriticalPredecessor(), CriticalMemoryInstruction() {}
  MemoryGroup(MemoryGroup &&) = default;

  ArrayRef<MemoryGroup *> getSuccessors() const { return Succ; }
  unsigned getNumSuccessors() const { return Succ.size(); }
  unsigned getNumPredecessors() const { return NumPredecessors; }
  unsigned getNumExecutingPredecessors() const {
    return NumExecutingPredecessors;
  }
  unsigned getNumExecutedPredecessors() const {
    return NumExecutedPredecessors;
  }
  unsigned getNumInstructions() const { return NumInstructions; }
  unsigned getNumExecuting() const { return NumExecuting; }
  unsigned getNumExecuted() const { return NumExecuted; }

  const InstRef &getCriticalMemoryInstruction() const {
    return CriticalMemoryInstruction;
  }
  const CriticalDependency &getCriticalPredecessor() const {
    return CriticalPredecessor;
  }

  void addSuccessor(MemoryGroup *Group) {
    Group->NumPredecessors++;
    assert(!isExecuted() && "Should have been removed!");
    if (isExecuting())
      Group->onGroupIssued(CriticalMemoryInstruction);
    Succ.emplace_back(Group);
  }

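  // A group transitions through the following states, reported by the
  // predicates below:
  //  - Waiting:   at least one predecessor group has not started executing.
  //  - Pending:   every predecessor group has at least started executing, and
  //               at least one of them is still executing.
  //  - Ready:     every predecessor group has fully executed.
  //  - Executing: at least one instruction is in flight, and every instruction
  //               of this group that has not executed yet is executing.
  //  - Executed:  every instruction of this group has finished executing.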
  bool isWaiting() const {
    return NumPredecessors >
           (NumExecutingPredecessors + NumExecutedPredecessors);
  }
  bool isPending() const {
    return NumExecutingPredecessors &&
           ((NumExecutedPredecessors + NumExecutingPredecessors) ==
            NumPredecessors);
  }
  bool isReady() const { return NumExecutedPredecessors == NumPredecessors; }
  bool isExecuting() const {
    return NumExecuting && (NumExecuting == (NumInstructions - NumExecuted));
  }
  bool isExecuted() const { return NumInstructions == NumExecuted; }

  void onGroupIssued(const InstRef &IR) {
    assert(!isReady() && "Unexpected group-start event!");
    NumExecutingPredecessors++;

    unsigned Cycles = IR.getInstruction()->getCyclesLeft();
    if (CriticalPredecessor.Cycles < Cycles) {
      CriticalPredecessor.IID = IR.getSourceIndex();
      CriticalPredecessor.Cycles = Cycles;
    }
  }

  void onGroupExecuted() {
    assert(!isReady() && "Inconsistent state found!");
    NumExecutingPredecessors--;
    NumExecutedPredecessors++;
  }

  void onInstructionIssued(const InstRef &IR) {
    assert(!isExecuting() && "Invalid internal state!");
    ++NumExecuting;

    // update the CriticalMemDep.
    const Instruction &IS = *IR.getInstruction();
    if ((bool)CriticalMemoryInstruction) {
      const Instruction &OtherIS = *CriticalMemoryInstruction.getInstruction();
      if (OtherIS.getCyclesLeft() < IS.getCyclesLeft())
        CriticalMemoryInstruction = IR;
    } else {
      CriticalMemoryInstruction = IR;
    }

    if (!isExecuting())
      return;

    // Notify successors that this group started execution.
    for (MemoryGroup *MG : Succ)
      MG->onGroupIssued(CriticalMemoryInstruction);
  }

  void onInstructionExecuted() {
    assert(isReady() && !isExecuted() && "Invalid internal state!");
    --NumExecuting;
    ++NumExecuted;

    if (!isExecuted())
      return;

    // Notify successors that this group has finished execution.
    for (MemoryGroup *MG : Succ)
      MG->onGroupExecuted();
  }

  void addInstruction() {
    assert(!getNumSuccessors() && "Cannot add instructions to this group!");
    ++NumInstructions;
  }

  void cycleEvent() {
    if (isWaiting() && CriticalPredecessor.Cycles)
      CriticalPredecessor.Cycles--;
  }
};
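
// A minimal illustrative sketch of how two MemoryGroups can be wired together
// when a load depends on an older store. The real construction logic lives in
// the dispatch logic of class LSUnit; `StoreIR` here stands for the InstRef of
// the dispatched store instruction:
//
//   MemoryGroup Stores, Loads;
//   Stores.addInstruction();              // The older group contains a store.
//   Stores.addSuccessor(&Loads);          // The load group depends on it.
//   Loads.addInstruction();               // The younger group contains a load.
//   assert(Loads.isWaiting());
//   Stores.onInstructionIssued(StoreIR);  // Now Loads.isPending().
//   Stores.onInstructionExecuted();       // Now Loads.isReady().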

/// Abstract base interface for LS (load/store) units in llvm-mca.
class LSUnitBase : public HardwareUnit {
  /// Load queue size.
  ///
  /// A value of zero for this field means that the load queue is unbounded.
  /// Processor models can declare the size of a load queue via tablegen (see
  /// the definition of tablegen class LoadQueue in
  /// llvm/Target/TargetSchedule.td).
  unsigned LQSize;

  /// Store queue size.
  ///
  /// A value of zero for this field means that the store queue is unbounded.
  /// Processor models can declare the size of a store queue via tablegen (see
  /// the definition of tablegen class StoreQueue in
  /// llvm/Target/TargetSchedule.td).
  unsigned SQSize;

  unsigned UsedLQEntries;
  unsigned UsedSQEntries;

  /// True if loads don't alias with stores.
  ///
  /// By default, the LS unit assumes that loads and stores don't alias with
  /// each other. If this field is set to false, then loads are always assumed
  /// to alias with stores.
  const bool NoAlias;

  /// Used to map group identifiers to MemoryGroups.
  DenseMap<unsigned, std::unique_ptr<MemoryGroup>> Groups;
  unsigned NextGroupID;

public:
  LSUnitBase(const MCSchedModel &SM, unsigned LoadQueueSize,
             unsigned StoreQueueSize, bool AssumeNoAlias);

  virtual ~LSUnitBase();

  /// Returns the total number of entries in the load queue.
  unsigned getLoadQueueSize() const { return LQSize; }

  /// Returns the total number of entries in the store queue.
  unsigned getStoreQueueSize() const { return SQSize; }

  unsigned getUsedLQEntries() const { return UsedLQEntries; }
  unsigned getUsedSQEntries() const { return UsedSQEntries; }
  void acquireLQSlot() { ++UsedLQEntries; }
  void acquireSQSlot() { ++UsedSQEntries; }
  void releaseLQSlot() { --UsedLQEntries; }
  void releaseSQSlot() { --UsedSQEntries; }

  bool assumeNoAlias() const { return NoAlias; }

  enum Status {
    LSU_AVAILABLE = 0,
    LSU_LQUEUE_FULL, // Load Queue unavailable
    LSU_SQUEUE_FULL  // Store Queue unavailable
  };

  /// This method checks the availability of the load/store buffers.
  ///
  /// Returns LSU_AVAILABLE if there are enough load/store queue entries to
  /// accommodate instruction IR. By default, LSU_AVAILABLE is returned if IR
  /// is not a memory operation.
  virtual Status isAvailable(const InstRef &IR) const = 0;

  /// Allocates LS resources for instruction IR.
  ///
  /// This method assumes that a previous call to `isAvailable(IR)` succeeded
  /// with a LSUnitBase::Status value of LSU_AVAILABLE.
  /// Returns the GroupID associated with this instruction. That value will be
  /// used to set the LSUTokenID field in class Instruction.
  virtual unsigned dispatch(const InstRef &IR) = 0;
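
  // A minimal sketch of the expected calling protocol, shown here only for
  // illustration (`LSU` stands for a concrete LSUnitBase implementation, and
  // `IR` for a dispatched memory instruction):
  //
  //   if (LSU.isAvailable(IR) == LSUnitBase::LSU_AVAILABLE) {
  //     unsigned TokenID = LSU.dispatch(IR);
  //     // TokenID is stored in field Instruction::LSUTokenID, and is later
  //     // used by queries like isReady(), isPending() and isWaiting().
  //   }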

  bool isSQEmpty() const { return !UsedSQEntries; }
  bool isLQEmpty() const { return !UsedLQEntries; }
  bool isSQFull() const { return SQSize && SQSize == UsedSQEntries; }
  bool isLQFull() const { return LQSize && LQSize == UsedLQEntries; }

  bool isValidGroupID(unsigned Index) const {
    return Index && (Groups.find(Index) != Groups.end());
  }

  /// Check if a previously dispatched instruction IR is now ready for
  /// execution.
  bool isReady(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return Group.isReady();
  }

  /// Check if instruction IR only depends on memory instructions that are
  /// currently executing.
  bool isPending(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return Group.isPending();
  }

  /// Check if instruction IR is still waiting on memory operations, and the
  /// wait time is still unknown.
  bool isWaiting(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return Group.isWaiting();
  }

  bool hasDependentUsers(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return !Group.isExecuted() && Group.getNumSuccessors();
  }

  const MemoryGroup &getGroup(unsigned Index) const {
    assert(isValidGroupID(Index) && "Group doesn't exist!");
    return *Groups.find(Index)->second;
  }

  MemoryGroup &getGroup(unsigned Index) {
    assert(isValidGroupID(Index) && "Group doesn't exist!");
    return *Groups.find(Index)->second;
  }

  unsigned createMemoryGroup() {
    Groups.insert(
        std::make_pair(NextGroupID, std::make_unique<MemoryGroup>()));
    return NextGroupID++;
  }

  virtual void onInstructionExecuted(const InstRef &IR);

  // Loads are tracked by the LDQ (load queue) from dispatch until completion.
  // Stores are tracked by the STQ (store queue) from dispatch until
  // commitment. By default we conservatively assume that the LDQ receives a
  // load at dispatch. Loads leave the LDQ at retirement stage.
  virtual void onInstructionRetired(const InstRef &IR);

  virtual void onInstructionIssued(const InstRef &IR) {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    Groups[GroupID]->onInstructionIssued(IR);
  }

  virtual void cycleEvent();

#ifndef NDEBUG
  void dump() const;
#endif
};

/// Default Load/Store Unit (LS Unit) for simulated processors.
///
/// Each load (or store) consumes one entry in the load (or store) queue.
///
/// Rules are:
/// 1) A younger load is allowed to pass an older load only if there are no
///    stores nor barriers in between the two loads.
/// 2) A younger store is not allowed to pass an older store.
/// 3) A younger store is not allowed to pass an older load.
/// 4) A younger load is allowed to pass an older store only if the load does
///    not alias with the store.
///
/// This class optimistically assumes that loads don't alias store operations.
/// Under this assumption, younger loads are always allowed to pass older
/// stores (this only affects rule 4).
/// Essentially, this class doesn't perform any sort of alias analysis to
/// identify aliasing loads and stores.
///
/// To enforce aliasing between loads and stores, flag `AssumeNoAlias` must be
/// set to `false` by the constructor of LSUnit.
///
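/// For example, given the sequence "load A; store B; load C": the younger
/// "load C" may pass the older "store B" when `AssumeNoAlias` is set (rule 4),
/// whereas "store B" is never allowed to pass the older "load A" (rule 3), and
/// "load C" cannot pass "load A" because of the intervening store (rule 1).
///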
/// Note that this class doesn't know about the existence of different memory
/// types for memory operations (example: write-through, write-combining,
/// etc.). Derived classes are responsible for implementing that extra
/// knowledge, and provide different sets of rules for loads and stores by
/// overriding method `isReady()`.
/// To emulate a write-combining memory type, rule 2 must be relaxed in a
/// derived class to enable the reordering of non-aliasing store operations.
///
/// No assumptions are made by this class on the size of the store buffer.
/// This class doesn't know how to identify cases where store-to-load
/// forwarding may occur.
///
/// LSUnit doesn't attempt to predict whether a load or store hits or misses
/// the L1 cache. To be more specific, LSUnit doesn't know anything about
/// cache hierarchy and memory types.
/// It only knows if an instruction "mayLoad" and/or "mayStore". For loads,
/// the scheduling model provides an "optimistic" load-to-use latency (which
/// usually matches the load-to-use latency for when there is a hit in the
/// L1D).
/// Derived classes may expand this knowledge.
///
/// Class MCInstrDesc in LLVM doesn't know about serializing operations, nor
/// memory-barrier like instructions.
/// LSUnit conservatively assumes that an instruction which `mayLoad` and has
/// `unmodeled side effects` behaves like a "soft" load-barrier. That means,
/// it serializes loads without forcing a flush of the load queue.
/// Similarly, instructions that both `mayStore` and have `unmodeled side
/// effects` are treated like store barriers. A full memory barrier is a
/// 'mayLoad' and 'mayStore' instruction with unmodeled side effects. This is
/// obviously inaccurate, but this is the best that we can do at the moment.
///
/// Each load/store barrier consumes one entry in the load/store queue. A
/// load/store barrier enforces ordering of loads/stores:
/// - A younger load cannot pass a load barrier.
/// - A younger store cannot pass a store barrier.
///
/// A younger load has to wait for the memory load barrier to execute. A
/// load/store barrier is "executed" when it becomes the oldest entry in the
/// load/store queue(s). That also means, all the older loads/stores have
/// already been executed.
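///
/// For example, if a load barrier is dispatched in between two loads, the
/// younger load cannot start executing until the barrier itself has executed,
/// and the barrier only executes once every older load has executed.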
class LSUnit : public LSUnitBase {
  // This class doesn't know about the latency of a load instruction. So, it
  // conservatively/pessimistically assumes that the latency of a load opcode
  // matches the instruction latency.
  //
  // FIXME: In the absence of cache misses (i.e. L1I/L1D/iTLB/dTLB hits/misses),
  // and load/store conflicts, the latency of a load is determined by the depth
  // of the load pipeline. So, we could use field `LoadLatency` in the
  // MCSchedModel to model that latency.
  // Field `LoadLatency` often matches the so-called 'load-to-use' latency from
  // L1D, and it usually already accounts for any extra latency due to data
  // forwarding.
  // When doing throughput analysis, `LoadLatency` is likely to be a better
  // predictor of load latency than instruction latency. This is particularly
  // true when simulating code with temporal/spatial locality of memory
  // accesses.
  // Using `LoadLatency` (instead of the instruction latency) is also expected
  // to improve the load queue allocation for long latency instructions with
  // folded memory operands (See PR39829).
  //
  // FIXME: On some processors, load/store operations are split into multiple
  // uOps. For example, X86 AMD Jaguar natively supports 128-bit data types,
  // but not 256-bit data types. So, a 256-bit load is effectively split into
  // two 128-bit loads, and each split load consumes one 'LoadQueue' entry.
  // For simplicity, this class optimistically assumes that a load instruction
  // only consumes one entry in the LoadQueue. Similarly, store instructions
  // only consume a single entry in the StoreQueue.
  // In future, we should reassess the quality of this design, and consider
  // alternative approaches that let instructions specify the number of
  // load/store queue entries which they consume at dispatch stage (See
  // PR39830).
  //
  // An instruction that both 'mayStore' and 'HasUnmodeledSideEffects' is
  // conservatively treated as a store barrier. It forces older stores to be
  // executed before newer stores are issued.
  //
  // An instruction that both 'mayLoad' and 'HasUnmodeledSideEffects' is
  // conservatively treated as a load barrier. It forces older loads to execute
  // before newer loads are issued.
  unsigned CurrentLoadGroupID;
  unsigned CurrentLoadBarrierGroupID;
  unsigned CurrentStoreGroupID;

public:
  LSUnit(const MCSchedModel &SM)
      : LSUnit(SM, /* LQSize */ 0, /* SQSize */ 0, /* NoAlias */ false) {}
  LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ)
      : LSUnit(SM, LQ, SQ, /* NoAlias */ false) {}
  LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ, bool AssumeNoAlias)
      : LSUnitBase(SM, LQ, SQ, AssumeNoAlias), CurrentLoadGroupID(0),
        CurrentLoadBarrierGroupID(0), CurrentStoreGroupID(0) {}

  /// Returns LSU_AVAILABLE if there are enough load/store queue entries to
  /// accommodate instruction IR.
  Status isAvailable(const InstRef &IR) const override;

  /// Allocates LS resources for instruction IR.
  ///
  /// This method assumes that a previous call to `isAvailable(IR)` succeeded
  /// returning LSU_AVAILABLE.
  ///
  /// By default, rules are:
  /// 1. A store may not pass a previous store.
  /// 2. A load may not pass a previous store unless flag 'NoAlias' is set.
  /// 3. A load may pass a previous load.
  /// 4. A store may not pass a previous load (regardless of flag 'NoAlias').
  /// 5. A load has to wait until an older load barrier is fully executed.
  /// 6. A store has to wait until an older store barrier is fully executed.
  unsigned dispatch(const InstRef &IR) override;

  void onInstructionExecuted(const InstRef &IR) override;
};

} // namespace mca
} // namespace llvm

#endif // LLVM_MCA_LSUNIT_H