1 /* 2 * Copyright (C) 2016 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef ART_COMPILER_OPTIMIZING_LOOP_OPTIMIZATION_H_ 18 #define ART_COMPILER_OPTIMIZING_LOOP_OPTIMIZATION_H_ 19 20 #include "base/macros.h" 21 #include "base/scoped_arena_allocator.h" 22 #include "base/scoped_arena_containers.h" 23 #include "induction_var_range.h" 24 #include "loop_analysis.h" 25 #include "nodes.h" 26 #include "optimization.h" 27 #include "superblock_cloner.h" 28 29 namespace art HIDDEN { 30 31 class CompilerOptions; 32 class ArchNoOptsLoopHelper; 33 34 /** 35 * Loop optimizations. Builds a loop hierarchy and applies optimizations to 36 * the detected nested loops, such as removal of dead induction and empty loops 37 * and inner loop vectorization. 38 */ 39 class HLoopOptimization : public HOptimization { 40 public: 41 HLoopOptimization(HGraph* graph, 42 const CodeGenerator& codegen, // Needs info about the target. 43 HInductionVarAnalysis* induction_analysis, 44 OptimizingCompilerStats* stats, 45 const char* name = kLoopOptimizationPassName); 46 47 bool Run() override; 48 49 static constexpr const char* kLoopOptimizationPassName = "loop_optimization"; 50 51 // The maximum number of total instructions (trip_count * instruction_count), 52 // where the optimization of removing SuspendChecks from the loop header could 53 // be performed. 54 static constexpr int64_t kMaxTotalInstRemoveSuspendCheck = 128; 55 56 private: 57 /** 58 * A single loop inside the loop hierarchy representation. 59 */ 60 struct LoopNode : public ArenaObject<kArenaAllocLoopOptimization> { LoopNodeLoopNode61 explicit LoopNode(HLoopInformation* lp_info) 62 : loop_info(lp_info), 63 outer(nullptr), 64 inner(nullptr), 65 previous(nullptr), 66 next(nullptr), 67 try_catch_kind(TryCatchKind::kUnknown) {} 68 69 enum class TryCatchKind { 70 kUnknown, 71 // Either if we have a try catch in the loop, or if the loop is inside of an outer try catch, 72 // we set `kHasTryCatch`. 73 kHasTryCatch, 74 kNoTryCatch 75 }; 76 77 HLoopInformation* loop_info; 78 LoopNode* outer; 79 LoopNode* inner; 80 LoopNode* previous; 81 LoopNode* next; 82 TryCatchKind try_catch_kind; 83 }; 84 85 /* 86 * Vectorization restrictions (bit mask). 87 */ 88 enum VectorRestrictions { 89 kNone = 0, // no restrictions 90 kNoMul = 1 << 0, // no multiplication 91 kNoDiv = 1 << 1, // no division 92 kNoShift = 1 << 2, // no shift 93 kNoShr = 1 << 3, // no arithmetic shift right 94 kNoHiBits = 1 << 4, // "wider" operations cannot bring in higher order bits 95 kNoSignedHAdd = 1 << 5, // no signed halving add 96 kNoUnsignedHAdd = 1 << 6, // no unsigned halving add 97 kNoUnroundedHAdd = 1 << 7, // no unrounded halving add 98 kNoAbs = 1 << 8, // no absolute value 99 kNoStringCharAt = 1 << 9, // no StringCharAt 100 kNoReduction = 1 << 10, // no reduction 101 kNoSAD = 1 << 11, // no sum of absolute differences (SAD) 102 kNoWideSAD = 1 << 12, // no sum of absolute differences (SAD) with operand widening 103 kNoDotProd = 1 << 13, // no dot product 104 }; 105 106 /* 107 * Vectorization mode during synthesis 108 * (sequential peeling/cleanup loop or vector loop). 109 */ 110 enum VectorMode { 111 kSequential, 112 kVector 113 }; 114 115 /* 116 * Representation of a unit-stride array reference. 117 */ 118 struct ArrayReference { 119 ArrayReference(HInstruction* b, HInstruction* o, DataType::Type t, bool l, bool c = false) baseArrayReference120 : base(b), offset(o), type(t), lhs(l), is_string_char_at(c) { } 121 bool operator<(const ArrayReference& other) const { 122 return 123 (base < other.base) || 124 (base == other.base && 125 (offset < other.offset || (offset == other.offset && 126 (type < other.type || 127 (type == other.type && 128 (lhs < other.lhs || 129 (lhs == other.lhs && 130 is_string_char_at < other.is_string_char_at))))))); 131 } 132 HInstruction* base; // base address 133 HInstruction* offset; // offset + i 134 DataType::Type type; // component type 135 bool lhs; // def/use 136 bool is_string_char_at; // compressed string read 137 }; 138 139 // 140 // Loop setup and traversal. 141 // 142 143 bool LocalRun(); 144 void AddLoop(HLoopInformation* loop_info); 145 void RemoveLoop(LoopNode* node); 146 147 // Traverses all loops inner to outer to perform simplifications and optimizations. 148 // Returns true if loops nested inside current loop (node) have changed. 149 bool TraverseLoopsInnerToOuter(LoopNode* node); 150 151 // Calculates `node`'s `try_catch_kind` and sets it to: 152 // 1) kHasTryCatch if it has try catches (or if it's inside of an outer try catch) 153 // 2) kNoTryCatch otherwise. 154 void CalculateAndSetTryCatchKind(LoopNode* node); 155 156 // 157 // Optimization. 158 // 159 160 void SimplifyInduction(LoopNode* node); 161 void SimplifyBlocks(LoopNode* node); 162 163 // Performs optimizations specific to inner loop with finite header logic (empty loop removal, 164 // unrolling, vectorization). Returns true if anything changed. 165 bool TryOptimizeInnerLoopFinite(LoopNode* node); 166 167 // Performs optimizations specific to inner loop. Returns true if anything changed. 168 bool OptimizeInnerLoop(LoopNode* node); 169 170 // Tries to apply loop unrolling for branch penalty reduction and better instruction scheduling 171 // opportunities. Returns whether transformation happened. 'generate_code' determines whether the 172 // optimization should be actually applied. 173 bool TryUnrollingForBranchPenaltyReduction(LoopAnalysisInfo* analysis_info, 174 bool generate_code = true); 175 176 // Tries to apply loop peeling for loop invariant exits elimination. Returns whether 177 // transformation happened. 'generate_code' determines whether the optimization should be 178 // actually applied. 179 bool TryPeelingForLoopInvariantExitsElimination(LoopAnalysisInfo* analysis_info, 180 bool generate_code = true); 181 182 // Tries to perform whole loop unrolling for a small loop with a small trip count to eliminate 183 // the loop check overhead and to have more opportunities for inter-iteration optimizations. 184 // Returns whether transformation happened. 'generate_code' determines whether the optimization 185 // should be actually applied. 186 bool TryFullUnrolling(LoopAnalysisInfo* analysis_info, bool generate_code = true); 187 188 // Tries to remove SuspendCheck for plain loops with a low trip count. The 189 // SuspendCheck in the codegen makes sure that the thread can be interrupted 190 // during execution for GC. Not being able to do so might decrease the 191 // responsiveness of GC when a very long loop or a long recursion is being 192 // executed. However, for plain loops with a small trip count, the removal of 193 // SuspendCheck should not affect the GC's responsiveness by a large margin. 194 // Consequently, since the thread won't be interrupted for plain loops, it is 195 // assumed that the performance might increase by removing SuspendCheck. 196 bool TryToRemoveSuspendCheckFromLoopHeader(LoopAnalysisInfo* analysis_info, 197 bool generate_code = true); 198 199 // Tries to apply scalar loop optimizations. 200 bool TryLoopScalarOpts(LoopNode* node); 201 202 // 203 // Vectorization analysis and synthesis. 204 // 205 206 bool ShouldVectorize(LoopNode* node, HBasicBlock* block, int64_t trip_count); 207 void Vectorize(LoopNode* node, HBasicBlock* block, HBasicBlock* exit, int64_t trip_count); 208 void GenerateNewLoop(LoopNode* node, 209 HBasicBlock* block, 210 HBasicBlock* new_preheader, 211 HInstruction* lo, 212 HInstruction* hi, 213 HInstruction* step, 214 uint32_t unroll); 215 bool VectorizeDef(LoopNode* node, HInstruction* instruction, bool generate_code); 216 bool VectorizeUse(LoopNode* node, 217 HInstruction* instruction, 218 bool generate_code, 219 DataType::Type type, 220 uint64_t restrictions); 221 uint32_t GetVectorSizeInBytes(); 222 bool TrySetVectorType(DataType::Type type, /*out*/ uint64_t* restrictions); 223 bool TrySetVectorLengthImpl(uint32_t length); 224 TrySetVectorLength(DataType::Type type,uint32_t length)225 bool TrySetVectorLength(DataType::Type type, uint32_t length) { 226 bool res = TrySetVectorLengthImpl(length); 227 // Currently the vectorizer supports only the mode when full SIMD registers are used. 228 DCHECK_IMPLIES(res, DataType::Size(type) * length == GetVectorSizeInBytes()); 229 return res; 230 } 231 232 void GenerateVecInv(HInstruction* org, DataType::Type type); 233 void GenerateVecSub(HInstruction* org, HInstruction* offset); 234 void GenerateVecMem(HInstruction* org, 235 HInstruction* opa, 236 HInstruction* opb, 237 HInstruction* offset, 238 DataType::Type type); 239 void GenerateVecReductionPhi(HPhi* phi); 240 void GenerateVecReductionPhiInputs(HPhi* phi, HInstruction* reduction); 241 HInstruction* ReduceAndExtractIfNeeded(HInstruction* instruction); 242 void GenerateVecOp(HInstruction* org, 243 HInstruction* opa, 244 HInstruction* opb, 245 DataType::Type type); 246 247 // Vectorization idioms. 248 bool VectorizeSaturationIdiom(LoopNode* node, 249 HInstruction* instruction, 250 bool generate_code, 251 DataType::Type type, 252 uint64_t restrictions); 253 bool VectorizeHalvingAddIdiom(LoopNode* node, 254 HInstruction* instruction, 255 bool generate_code, 256 DataType::Type type, 257 uint64_t restrictions); 258 bool VectorizeSADIdiom(LoopNode* node, 259 HInstruction* instruction, 260 bool generate_code, 261 DataType::Type type, 262 uint64_t restrictions); 263 bool VectorizeDotProdIdiom(LoopNode* node, 264 HInstruction* instruction, 265 bool generate_code, 266 DataType::Type type, 267 uint64_t restrictions); 268 269 // Vectorization heuristics. 270 Alignment ComputeAlignment(HInstruction* offset, 271 DataType::Type type, 272 bool is_string_char_at, 273 uint32_t peeling = 0); 274 void SetAlignmentStrategy(const ScopedArenaVector<uint32_t>& peeling_votes, 275 const ArrayReference* peeling_candidate); 276 uint32_t MaxNumberPeeled(); 277 bool IsVectorizationProfitable(int64_t trip_count); 278 279 // 280 // Helpers. 281 // 282 283 bool TrySetPhiInduction(HPhi* phi, bool restrict_uses); 284 bool TrySetPhiReduction(HPhi* phi); 285 286 // Detects loop header with a single induction (returned in main_phi), possibly 287 // other phis for reductions, but no other side effects. Returns true on success. 288 bool TrySetSimpleLoopHeader(HBasicBlock* block, /*out*/ HPhi** main_phi); 289 290 bool IsEmptyBody(HBasicBlock* block); 291 bool IsOnlyUsedAfterLoop(HLoopInformation* loop_info, 292 HInstruction* instruction, 293 bool collect_loop_uses, 294 /*out*/ uint32_t* use_count); 295 bool IsUsedOutsideLoop(HLoopInformation* loop_info, 296 HInstruction* instruction); 297 bool TryReplaceWithLastValue(HLoopInformation* loop_info, 298 HInstruction* instruction, 299 HBasicBlock* block); 300 bool TryAssignLastValue(HLoopInformation* loop_info, 301 HInstruction* instruction, 302 HBasicBlock* block, 303 bool collect_loop_uses); 304 void RemoveDeadInstructions(const HInstructionList& list); 305 bool CanRemoveCycle(); // Whether the current 'iset_' is removable. 306 IsInPredicatedVectorizationMode()307 bool IsInPredicatedVectorizationMode() const { return predicated_vectorization_mode_; } 308 309 // Compiler options (to query ISA features). 310 const CompilerOptions* compiler_options_; 311 312 // Cached target SIMD vector register size in bytes. 313 const size_t simd_register_size_; 314 315 // Range information based on prior induction variable analysis. 316 InductionVarRange induction_range_; 317 318 // Phase-local heap memory allocator for the loop optimizer. Storage obtained 319 // through this allocator is immediately released when the loop optimizer is done. 320 ScopedArenaAllocator* loop_allocator_; 321 322 // Global heap memory allocator. Used to build HIR. 323 ArenaAllocator* global_allocator_; 324 325 // Entries into the loop hierarchy representation. The hierarchy resides 326 // in phase-local heap memory. 327 LoopNode* top_loop_; 328 LoopNode* last_loop_; 329 330 // Temporary bookkeeping of a set of instructions. 331 // Contents reside in phase-local heap memory. 332 ScopedArenaSet<HInstruction*>* iset_; 333 334 // Temporary bookkeeping of reduction instructions. Mapping is two-fold: 335 // (1) reductions in the loop-body are mapped back to their phi definition, 336 // (2) phi definitions are mapped to their initial value (updated during 337 // code generation to feed the proper values into the new chain). 338 // Contents reside in phase-local heap memory. 339 ScopedArenaSafeMap<HInstruction*, HInstruction*>* reductions_; 340 341 // Flag that tracks if any simplifications have occurred. 342 bool simplified_; 343 344 // Whether to use predicated loop vectorization (e.g. for arm64 SVE target). 345 bool predicated_vectorization_mode_; 346 347 // Number of "lanes" for selected packed type. 348 uint32_t vector_length_; 349 350 // Set of array references in the vector loop. 351 // Contents reside in phase-local heap memory. 352 ScopedArenaSet<ArrayReference>* vector_refs_; 353 354 // Static or dynamic loop peeling for alignment. 355 uint32_t vector_static_peeling_factor_; 356 const ArrayReference* vector_dynamic_peeling_candidate_; 357 358 // Dynamic data dependence test of the form a != b. 359 HInstruction* vector_runtime_test_a_; 360 HInstruction* vector_runtime_test_b_; 361 362 // Mapping used during vectorization synthesis for both the scalar peeling/cleanup 363 // loop (mode is kSequential) and the actual vector loop (mode is kVector). The data 364 // structure maps original instructions into the new instructions. 365 // Contents reside in phase-local heap memory. 366 ScopedArenaSafeMap<HInstruction*, HInstruction*>* vector_map_; 367 368 // Permanent mapping used during vectorization synthesis. 369 // Contents reside in phase-local heap memory. 370 ScopedArenaSafeMap<HInstruction*, HInstruction*>* vector_permanent_map_; 371 372 // Temporary vectorization bookkeeping. 373 VectorMode vector_mode_; // synthesis mode 374 HBasicBlock* vector_preheader_; // preheader of the new loop 375 HBasicBlock* vector_header_; // header of the new loop 376 HBasicBlock* vector_body_; // body of the new loop 377 HInstruction* vector_index_; // normalized index of the new loop 378 379 // Helper for target-specific behaviour for loop optimizations. 380 ArchNoOptsLoopHelper* arch_loop_helper_; 381 382 friend class LoopOptimizationTest; 383 384 DISALLOW_COPY_AND_ASSIGN(HLoopOptimization); 385 }; 386 387 } // namespace art 388 389 #endif // ART_COMPILER_OPTIMIZING_LOOP_OPTIMIZATION_H_ 390