1 /* 2 * Copyright (C) 2016 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef ART_COMPILER_OPTIMIZING_LOOP_OPTIMIZATION_H_ 18 #define ART_COMPILER_OPTIMIZING_LOOP_OPTIMIZATION_H_ 19 20 #include "base/scoped_arena_allocator.h" 21 #include "base/scoped_arena_containers.h" 22 #include "induction_var_range.h" 23 #include "loop_analysis.h" 24 #include "nodes.h" 25 #include "optimization.h" 26 #include "superblock_cloner.h" 27 28 namespace art { 29 30 class CompilerOptions; 31 class ArchNoOptsLoopHelper; 32 33 /** 34 * Loop optimizations. Builds a loop hierarchy and applies optimizations to 35 * the detected nested loops, such as removal of dead induction and empty loops 36 * and inner loop vectorization. 37 */ 38 class HLoopOptimization : public HOptimization { 39 public: 40 HLoopOptimization(HGraph* graph, 41 const CodeGenerator& codegen, // Needs info about the target. 42 HInductionVarAnalysis* induction_analysis, 43 OptimizingCompilerStats* stats, 44 const char* name = kLoopOptimizationPassName); 45 46 bool Run() override; 47 48 static constexpr const char* kLoopOptimizationPassName = "loop_optimization"; 49 50 private: 51 /** 52 * A single loop inside the loop hierarchy representation. 53 */ 54 struct LoopNode : public ArenaObject<kArenaAllocLoopOptimization> { LoopNodeLoopNode55 explicit LoopNode(HLoopInformation* lp_info) 56 : loop_info(lp_info), 57 outer(nullptr), 58 inner(nullptr), 59 previous(nullptr), 60 next(nullptr), 61 try_catch_kind(TryCatchKind::kUnknown) {} 62 63 enum class TryCatchKind { 64 kUnknown, 65 // Either if we have a try catch in the loop, or if the loop is inside of an outer try catch, 66 // we set `kHasTryCatch`. 67 kHasTryCatch, 68 kNoTryCatch 69 }; 70 71 HLoopInformation* loop_info; 72 LoopNode* outer; 73 LoopNode* inner; 74 LoopNode* previous; 75 LoopNode* next; 76 TryCatchKind try_catch_kind; 77 }; 78 79 /* 80 * Vectorization restrictions (bit mask). 81 */ 82 enum VectorRestrictions { 83 kNone = 0, // no restrictions 84 kNoMul = 1 << 0, // no multiplication 85 kNoDiv = 1 << 1, // no division 86 kNoShift = 1 << 2, // no shift 87 kNoShr = 1 << 3, // no arithmetic shift right 88 kNoHiBits = 1 << 4, // "wider" operations cannot bring in higher order bits 89 kNoSignedHAdd = 1 << 5, // no signed halving add 90 kNoUnsignedHAdd = 1 << 6, // no unsigned halving add 91 kNoUnroundedHAdd = 1 << 7, // no unrounded halving add 92 kNoAbs = 1 << 8, // no absolute value 93 kNoStringCharAt = 1 << 9, // no StringCharAt 94 kNoReduction = 1 << 10, // no reduction 95 kNoSAD = 1 << 11, // no sum of absolute differences (SAD) 96 kNoWideSAD = 1 << 12, // no sum of absolute differences (SAD) with operand widening 97 kNoDotProd = 1 << 13, // no dot product 98 }; 99 100 /* 101 * Vectorization mode during synthesis 102 * (sequential peeling/cleanup loop or vector loop). 103 */ 104 enum VectorMode { 105 kSequential, 106 kVector 107 }; 108 109 /* 110 * Representation of a unit-stride array reference. 111 */ 112 struct ArrayReference { 113 ArrayReference(HInstruction* b, HInstruction* o, DataType::Type t, bool l, bool c = false) baseArrayReference114 : base(b), offset(o), type(t), lhs(l), is_string_char_at(c) { } 115 bool operator<(const ArrayReference& other) const { 116 return 117 (base < other.base) || 118 (base == other.base && 119 (offset < other.offset || (offset == other.offset && 120 (type < other.type || 121 (type == other.type && 122 (lhs < other.lhs || 123 (lhs == other.lhs && 124 is_string_char_at < other.is_string_char_at))))))); 125 } 126 HInstruction* base; // base address 127 HInstruction* offset; // offset + i 128 DataType::Type type; // component type 129 bool lhs; // def/use 130 bool is_string_char_at; // compressed string read 131 }; 132 133 // 134 // Loop setup and traversal. 135 // 136 137 bool LocalRun(); 138 void AddLoop(HLoopInformation* loop_info); 139 void RemoveLoop(LoopNode* node); 140 141 // Traverses all loops inner to outer to perform simplifications and optimizations. 142 // Returns true if loops nested inside current loop (node) have changed. 143 bool TraverseLoopsInnerToOuter(LoopNode* node); 144 145 // Calculates `node`'s `try_catch_kind` and sets it to: 146 // 1) kHasTryCatch if it has try catches (or if it's inside of an outer try catch) 147 // 2) kNoTryCatch otherwise. 148 void CalculateAndSetTryCatchKind(LoopNode* node); 149 150 // 151 // Optimization. 152 // 153 154 void SimplifyInduction(LoopNode* node); 155 void SimplifyBlocks(LoopNode* node); 156 157 // Performs optimizations specific to inner loop with finite header logic (empty loop removal, 158 // unrolling, vectorization). Returns true if anything changed. 159 bool TryOptimizeInnerLoopFinite(LoopNode* node); 160 161 // Performs optimizations specific to inner loop. Returns true if anything changed. 162 bool OptimizeInnerLoop(LoopNode* node); 163 164 // Tries to apply loop unrolling for branch penalty reduction and better instruction scheduling 165 // opportunities. Returns whether transformation happened. 'generate_code' determines whether the 166 // optimization should be actually applied. 167 bool TryUnrollingForBranchPenaltyReduction(LoopAnalysisInfo* analysis_info, 168 bool generate_code = true); 169 170 // Tries to apply loop peeling for loop invariant exits elimination. Returns whether 171 // transformation happened. 'generate_code' determines whether the optimization should be 172 // actually applied. 173 bool TryPeelingForLoopInvariantExitsElimination(LoopAnalysisInfo* analysis_info, 174 bool generate_code = true); 175 176 // Tries to perform whole loop unrolling for a small loop with a small trip count to eliminate 177 // the loop check overhead and to have more opportunities for inter-iteration optimizations. 178 // Returns whether transformation happened. 'generate_code' determines whether the optimization 179 // should be actually applied. 180 bool TryFullUnrolling(LoopAnalysisInfo* analysis_info, bool generate_code = true); 181 182 // Tries to apply scalar loop peeling and unrolling. 183 bool TryPeelingAndUnrolling(LoopNode* node); 184 185 // 186 // Vectorization analysis and synthesis. 187 // 188 189 bool ShouldVectorize(LoopNode* node, HBasicBlock* block, int64_t trip_count); 190 void Vectorize(LoopNode* node, HBasicBlock* block, HBasicBlock* exit, int64_t trip_count); 191 void GenerateNewLoop(LoopNode* node, 192 HBasicBlock* block, 193 HBasicBlock* new_preheader, 194 HInstruction* lo, 195 HInstruction* hi, 196 HInstruction* step, 197 uint32_t unroll); 198 bool VectorizeDef(LoopNode* node, HInstruction* instruction, bool generate_code); 199 bool VectorizeUse(LoopNode* node, 200 HInstruction* instruction, 201 bool generate_code, 202 DataType::Type type, 203 uint64_t restrictions); 204 uint32_t GetVectorSizeInBytes(); 205 bool TrySetVectorType(DataType::Type type, /*out*/ uint64_t* restrictions); 206 bool TrySetVectorLengthImpl(uint32_t length); 207 TrySetVectorLength(DataType::Type type,uint32_t length)208 bool TrySetVectorLength(DataType::Type type, uint32_t length) { 209 bool res = TrySetVectorLengthImpl(length); 210 // Currently the vectorizer supports only the mode when full SIMD registers are used. 211 DCHECK_IMPLIES(res, DataType::Size(type) * length == GetVectorSizeInBytes()); 212 return res; 213 } 214 215 void GenerateVecInv(HInstruction* org, DataType::Type type); 216 void GenerateVecSub(HInstruction* org, HInstruction* offset); 217 void GenerateVecMem(HInstruction* org, 218 HInstruction* opa, 219 HInstruction* opb, 220 HInstruction* offset, 221 DataType::Type type); 222 void GenerateVecReductionPhi(HPhi* phi); 223 void GenerateVecReductionPhiInputs(HPhi* phi, HInstruction* reduction); 224 HInstruction* ReduceAndExtractIfNeeded(HInstruction* instruction); 225 void GenerateVecOp(HInstruction* org, 226 HInstruction* opa, 227 HInstruction* opb, 228 DataType::Type type); 229 230 // Vectorization idioms. 231 bool VectorizeSaturationIdiom(LoopNode* node, 232 HInstruction* instruction, 233 bool generate_code, 234 DataType::Type type, 235 uint64_t restrictions); 236 bool VectorizeHalvingAddIdiom(LoopNode* node, 237 HInstruction* instruction, 238 bool generate_code, 239 DataType::Type type, 240 uint64_t restrictions); 241 bool VectorizeSADIdiom(LoopNode* node, 242 HInstruction* instruction, 243 bool generate_code, 244 DataType::Type type, 245 uint64_t restrictions); 246 bool VectorizeDotProdIdiom(LoopNode* node, 247 HInstruction* instruction, 248 bool generate_code, 249 DataType::Type type, 250 uint64_t restrictions); 251 252 // Vectorization heuristics. 253 Alignment ComputeAlignment(HInstruction* offset, 254 DataType::Type type, 255 bool is_string_char_at, 256 uint32_t peeling = 0); 257 void SetAlignmentStrategy(const ScopedArenaVector<uint32_t>& peeling_votes, 258 const ArrayReference* peeling_candidate); 259 uint32_t MaxNumberPeeled(); 260 bool IsVectorizationProfitable(int64_t trip_count); 261 262 // 263 // Helpers. 264 // 265 266 bool TrySetPhiInduction(HPhi* phi, bool restrict_uses); 267 bool TrySetPhiReduction(HPhi* phi); 268 269 // Detects loop header with a single induction (returned in main_phi), possibly 270 // other phis for reductions, but no other side effects. Returns true on success. 271 bool TrySetSimpleLoopHeader(HBasicBlock* block, /*out*/ HPhi** main_phi); 272 273 bool IsEmptyBody(HBasicBlock* block); 274 bool IsOnlyUsedAfterLoop(HLoopInformation* loop_info, 275 HInstruction* instruction, 276 bool collect_loop_uses, 277 /*out*/ uint32_t* use_count); 278 bool IsUsedOutsideLoop(HLoopInformation* loop_info, 279 HInstruction* instruction); 280 bool TryReplaceWithLastValue(HLoopInformation* loop_info, 281 HInstruction* instruction, 282 HBasicBlock* block); 283 bool TryAssignLastValue(HLoopInformation* loop_info, 284 HInstruction* instruction, 285 HBasicBlock* block, 286 bool collect_loop_uses); 287 void RemoveDeadInstructions(const HInstructionList& list); 288 bool CanRemoveCycle(); // Whether the current 'iset_' is removable. 289 IsInPredicatedVectorizationMode()290 bool IsInPredicatedVectorizationMode() const { return predicated_vectorization_mode_; } 291 292 // Compiler options (to query ISA features). 293 const CompilerOptions* compiler_options_; 294 295 // Cached target SIMD vector register size in bytes. 296 const size_t simd_register_size_; 297 298 // Range information based on prior induction variable analysis. 299 InductionVarRange induction_range_; 300 301 // Phase-local heap memory allocator for the loop optimizer. Storage obtained 302 // through this allocator is immediately released when the loop optimizer is done. 303 ScopedArenaAllocator* loop_allocator_; 304 305 // Global heap memory allocator. Used to build HIR. 306 ArenaAllocator* global_allocator_; 307 308 // Entries into the loop hierarchy representation. The hierarchy resides 309 // in phase-local heap memory. 310 LoopNode* top_loop_; 311 LoopNode* last_loop_; 312 313 // Temporary bookkeeping of a set of instructions. 314 // Contents reside in phase-local heap memory. 315 ScopedArenaSet<HInstruction*>* iset_; 316 317 // Temporary bookkeeping of reduction instructions. Mapping is two-fold: 318 // (1) reductions in the loop-body are mapped back to their phi definition, 319 // (2) phi definitions are mapped to their initial value (updated during 320 // code generation to feed the proper values into the new chain). 321 // Contents reside in phase-local heap memory. 322 ScopedArenaSafeMap<HInstruction*, HInstruction*>* reductions_; 323 324 // Flag that tracks if any simplifications have occurred. 325 bool simplified_; 326 327 // Whether to use predicated loop vectorization (e.g. for arm64 SVE target). 328 bool predicated_vectorization_mode_; 329 330 // Number of "lanes" for selected packed type. 331 uint32_t vector_length_; 332 333 // Set of array references in the vector loop. 334 // Contents reside in phase-local heap memory. 335 ScopedArenaSet<ArrayReference>* vector_refs_; 336 337 // Static or dynamic loop peeling for alignment. 338 uint32_t vector_static_peeling_factor_; 339 const ArrayReference* vector_dynamic_peeling_candidate_; 340 341 // Dynamic data dependence test of the form a != b. 342 HInstruction* vector_runtime_test_a_; 343 HInstruction* vector_runtime_test_b_; 344 345 // Mapping used during vectorization synthesis for both the scalar peeling/cleanup 346 // loop (mode is kSequential) and the actual vector loop (mode is kVector). The data 347 // structure maps original instructions into the new instructions. 348 // Contents reside in phase-local heap memory. 349 ScopedArenaSafeMap<HInstruction*, HInstruction*>* vector_map_; 350 351 // Permanent mapping used during vectorization synthesis. 352 // Contents reside in phase-local heap memory. 353 ScopedArenaSafeMap<HInstruction*, HInstruction*>* vector_permanent_map_; 354 355 // Temporary vectorization bookkeeping. 356 VectorMode vector_mode_; // synthesis mode 357 HBasicBlock* vector_preheader_; // preheader of the new loop 358 HBasicBlock* vector_header_; // header of the new loop 359 HBasicBlock* vector_body_; // body of the new loop 360 HInstruction* vector_index_; // normalized index of the new loop 361 362 // Helper for target-specific behaviour for loop optimizations. 363 ArchNoOptsLoopHelper* arch_loop_helper_; 364 365 friend class LoopOptimizationTest; 366 367 DISALLOW_COPY_AND_ASSIGN(HLoopOptimization); 368 }; 369 370 } // namespace art 371 372 #endif // ART_COMPILER_OPTIMIZING_LOOP_OPTIMIZATION_H_ 373