1 /* 2 * Copyright (C) 2016 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef ART_COMPILER_OPTIMIZING_LOOP_OPTIMIZATION_H_ 18 #define ART_COMPILER_OPTIMIZING_LOOP_OPTIMIZATION_H_ 19 20 #include "base/scoped_arena_allocator.h" 21 #include "base/scoped_arena_containers.h" 22 #include "induction_var_range.h" 23 #include "loop_analysis.h" 24 #include "nodes.h" 25 #include "optimization.h" 26 #include "superblock_cloner.h" 27 28 namespace art { 29 30 class CompilerOptions; 31 class ArchNoOptsLoopHelper; 32 33 /** 34 * Loop optimizations. Builds a loop hierarchy and applies optimizations to 35 * the detected nested loops, such as removal of dead induction and empty loops 36 * and inner loop vectorization. 37 */ 38 class HLoopOptimization : public HOptimization { 39 public: 40 HLoopOptimization(HGraph* graph, 41 const CodeGenerator& codegen, // Needs info about the target. 42 HInductionVarAnalysis* induction_analysis, 43 OptimizingCompilerStats* stats, 44 const char* name = kLoopOptimizationPassName); 45 46 bool Run() override; 47 48 static constexpr const char* kLoopOptimizationPassName = "loop_optimization"; 49 50 private: 51 /** 52 * A single loop inside the loop hierarchy representation. 53 */ 54 struct LoopNode : public ArenaObject<kArenaAllocLoopOptimization> { LoopNodeLoopNode55 explicit LoopNode(HLoopInformation* lp_info) 56 : loop_info(lp_info), 57 outer(nullptr), 58 inner(nullptr), 59 previous(nullptr), 60 next(nullptr) {} 61 HLoopInformation* loop_info; 62 LoopNode* outer; 63 LoopNode* inner; 64 LoopNode* previous; 65 LoopNode* next; 66 }; 67 68 /* 69 * Vectorization restrictions (bit mask). 70 */ 71 enum VectorRestrictions { 72 kNone = 0, // no restrictions 73 kNoMul = 1 << 0, // no multiplication 74 kNoDiv = 1 << 1, // no division 75 kNoShift = 1 << 2, // no shift 76 kNoShr = 1 << 3, // no arithmetic shift right 77 kNoHiBits = 1 << 4, // "wider" operations cannot bring in higher order bits 78 kNoSignedHAdd = 1 << 5, // no signed halving add 79 kNoUnsignedHAdd = 1 << 6, // no unsigned halving add 80 kNoUnroundedHAdd = 1 << 7, // no unrounded halving add 81 kNoAbs = 1 << 8, // no absolute value 82 kNoStringCharAt = 1 << 9, // no StringCharAt 83 kNoReduction = 1 << 10, // no reduction 84 kNoSAD = 1 << 11, // no sum of absolute differences (SAD) 85 kNoWideSAD = 1 << 12, // no sum of absolute differences (SAD) with operand widening 86 kNoDotProd = 1 << 13, // no dot product 87 }; 88 89 /* 90 * Vectorization mode during synthesis 91 * (sequential peeling/cleanup loop or vector loop). 92 */ 93 enum VectorMode { 94 kSequential, 95 kVector 96 }; 97 98 /* 99 * Representation of a unit-stride array reference. 100 */ 101 struct ArrayReference { 102 ArrayReference(HInstruction* b, HInstruction* o, DataType::Type t, bool l, bool c = false) baseArrayReference103 : base(b), offset(o), type(t), lhs(l), is_string_char_at(c) { } 104 bool operator<(const ArrayReference& other) const { 105 return 106 (base < other.base) || 107 (base == other.base && 108 (offset < other.offset || (offset == other.offset && 109 (type < other.type || 110 (type == other.type && 111 (lhs < other.lhs || 112 (lhs == other.lhs && 113 is_string_char_at < other.is_string_char_at))))))); 114 } 115 HInstruction* base; // base address 116 HInstruction* offset; // offset + i 117 DataType::Type type; // component type 118 bool lhs; // def/use 119 bool is_string_char_at; // compressed string read 120 }; 121 122 // 123 // Loop setup and traversal. 124 // 125 126 bool LocalRun(); 127 void AddLoop(HLoopInformation* loop_info); 128 void RemoveLoop(LoopNode* node); 129 130 // Traverses all loops inner to outer to perform simplifications and optimizations. 131 // Returns true if loops nested inside current loop (node) have changed. 132 bool TraverseLoopsInnerToOuter(LoopNode* node); 133 134 // 135 // Optimization. 136 // 137 138 void SimplifyInduction(LoopNode* node); 139 void SimplifyBlocks(LoopNode* node); 140 141 // Performs optimizations specific to inner loop with finite header logic (empty loop removal, 142 // unrolling, vectorization). Returns true if anything changed. 143 bool TryOptimizeInnerLoopFinite(LoopNode* node); 144 145 // Performs optimizations specific to inner loop. Returns true if anything changed. 146 bool OptimizeInnerLoop(LoopNode* node); 147 148 // Tries to apply loop unrolling for branch penalty reduction and better instruction scheduling 149 // opportunities. Returns whether transformation happened. 'generate_code' determines whether the 150 // optimization should be actually applied. 151 bool TryUnrollingForBranchPenaltyReduction(LoopAnalysisInfo* analysis_info, 152 bool generate_code = true); 153 154 // Tries to apply loop peeling for loop invariant exits elimination. Returns whether 155 // transformation happened. 'generate_code' determines whether the optimization should be 156 // actually applied. 157 bool TryPeelingForLoopInvariantExitsElimination(LoopAnalysisInfo* analysis_info, 158 bool generate_code = true); 159 160 // Tries to perform whole loop unrolling for a small loop with a small trip count to eliminate 161 // the loop check overhead and to have more opportunities for inter-iteration optimizations. 162 // Returns whether transformation happened. 'generate_code' determines whether the optimization 163 // should be actually applied. 164 bool TryFullUnrolling(LoopAnalysisInfo* analysis_info, bool generate_code = true); 165 166 // Tries to apply scalar loop peeling and unrolling. 167 bool TryPeelingAndUnrolling(LoopNode* node); 168 169 // 170 // Vectorization analysis and synthesis. 171 // 172 173 bool ShouldVectorize(LoopNode* node, HBasicBlock* block, int64_t trip_count); 174 void Vectorize(LoopNode* node, HBasicBlock* block, HBasicBlock* exit, int64_t trip_count); 175 void GenerateNewLoop(LoopNode* node, 176 HBasicBlock* block, 177 HBasicBlock* new_preheader, 178 HInstruction* lo, 179 HInstruction* hi, 180 HInstruction* step, 181 uint32_t unroll); 182 bool VectorizeDef(LoopNode* node, HInstruction* instruction, bool generate_code); 183 bool VectorizeUse(LoopNode* node, 184 HInstruction* instruction, 185 bool generate_code, 186 DataType::Type type, 187 uint64_t restrictions); 188 uint32_t GetVectorSizeInBytes(); 189 bool TrySetVectorType(DataType::Type type, /*out*/ uint64_t* restrictions); 190 bool TrySetVectorLengthImpl(uint32_t length); 191 TrySetVectorLength(DataType::Type type,uint32_t length)192 bool TrySetVectorLength(DataType::Type type, uint32_t length) { 193 bool res = TrySetVectorLengthImpl(length); 194 // Currently the vectorizer supports only the mode when full SIMD registers are used. 195 DCHECK(!res || (DataType::Size(type) * length == GetVectorSizeInBytes())); 196 return res; 197 } 198 199 void GenerateVecInv(HInstruction* org, DataType::Type type); 200 void GenerateVecSub(HInstruction* org, HInstruction* offset); 201 void GenerateVecMem(HInstruction* org, 202 HInstruction* opa, 203 HInstruction* opb, 204 HInstruction* offset, 205 DataType::Type type); 206 void GenerateVecReductionPhi(HPhi* phi); 207 void GenerateVecReductionPhiInputs(HPhi* phi, HInstruction* reduction); 208 HInstruction* ReduceAndExtractIfNeeded(HInstruction* instruction); 209 void GenerateVecOp(HInstruction* org, 210 HInstruction* opa, 211 HInstruction* opb, 212 DataType::Type type); 213 214 // Vectorization idioms. 215 bool VectorizeSaturationIdiom(LoopNode* node, 216 HInstruction* instruction, 217 bool generate_code, 218 DataType::Type type, 219 uint64_t restrictions); 220 bool VectorizeHalvingAddIdiom(LoopNode* node, 221 HInstruction* instruction, 222 bool generate_code, 223 DataType::Type type, 224 uint64_t restrictions); 225 bool VectorizeSADIdiom(LoopNode* node, 226 HInstruction* instruction, 227 bool generate_code, 228 DataType::Type type, 229 uint64_t restrictions); 230 bool VectorizeDotProdIdiom(LoopNode* node, 231 HInstruction* instruction, 232 bool generate_code, 233 DataType::Type type, 234 uint64_t restrictions); 235 236 // Vectorization heuristics. 237 Alignment ComputeAlignment(HInstruction* offset, 238 DataType::Type type, 239 bool is_string_char_at, 240 uint32_t peeling = 0); 241 void SetAlignmentStrategy(const ScopedArenaVector<uint32_t>& peeling_votes, 242 const ArrayReference* peeling_candidate); 243 uint32_t MaxNumberPeeled(); 244 bool IsVectorizationProfitable(int64_t trip_count); 245 246 // 247 // Helpers. 248 // 249 250 bool TrySetPhiInduction(HPhi* phi, bool restrict_uses); 251 bool TrySetPhiReduction(HPhi* phi); 252 253 // Detects loop header with a single induction (returned in main_phi), possibly 254 // other phis for reductions, but no other side effects. Returns true on success. 255 bool TrySetSimpleLoopHeader(HBasicBlock* block, /*out*/ HPhi** main_phi); 256 257 bool IsEmptyBody(HBasicBlock* block); 258 bool IsOnlyUsedAfterLoop(HLoopInformation* loop_info, 259 HInstruction* instruction, 260 bool collect_loop_uses, 261 /*out*/ uint32_t* use_count); 262 bool IsUsedOutsideLoop(HLoopInformation* loop_info, 263 HInstruction* instruction); 264 bool TryReplaceWithLastValue(HLoopInformation* loop_info, 265 HInstruction* instruction, 266 HBasicBlock* block); 267 bool TryAssignLastValue(HLoopInformation* loop_info, 268 HInstruction* instruction, 269 HBasicBlock* block, 270 bool collect_loop_uses); 271 void RemoveDeadInstructions(const HInstructionList& list); 272 bool CanRemoveCycle(); // Whether the current 'iset_' is removable. 273 IsInPredicatedVectorizationMode()274 bool IsInPredicatedVectorizationMode() const { return predicated_vectorization_mode_; } 275 276 // Compiler options (to query ISA features). 277 const CompilerOptions* compiler_options_; 278 279 // Cached target SIMD vector register size in bytes. 280 const size_t simd_register_size_; 281 282 // Range information based on prior induction variable analysis. 283 InductionVarRange induction_range_; 284 285 // Phase-local heap memory allocator for the loop optimizer. Storage obtained 286 // through this allocator is immediately released when the loop optimizer is done. 287 ScopedArenaAllocator* loop_allocator_; 288 289 // Global heap memory allocator. Used to build HIR. 290 ArenaAllocator* global_allocator_; 291 292 // Entries into the loop hierarchy representation. The hierarchy resides 293 // in phase-local heap memory. 294 LoopNode* top_loop_; 295 LoopNode* last_loop_; 296 297 // Temporary bookkeeping of a set of instructions. 298 // Contents reside in phase-local heap memory. 299 ScopedArenaSet<HInstruction*>* iset_; 300 301 // Temporary bookkeeping of reduction instructions. Mapping is two-fold: 302 // (1) reductions in the loop-body are mapped back to their phi definition, 303 // (2) phi definitions are mapped to their initial value (updated during 304 // code generation to feed the proper values into the new chain). 305 // Contents reside in phase-local heap memory. 306 ScopedArenaSafeMap<HInstruction*, HInstruction*>* reductions_; 307 308 // Flag that tracks if any simplifications have occurred. 309 bool simplified_; 310 311 // Whether to use predicated loop vectorization (e.g. for arm64 SVE target). 312 bool predicated_vectorization_mode_; 313 314 // Number of "lanes" for selected packed type. 315 uint32_t vector_length_; 316 317 // Set of array references in the vector loop. 318 // Contents reside in phase-local heap memory. 319 ScopedArenaSet<ArrayReference>* vector_refs_; 320 321 // Static or dynamic loop peeling for alignment. 322 uint32_t vector_static_peeling_factor_; 323 const ArrayReference* vector_dynamic_peeling_candidate_; 324 325 // Dynamic data dependence test of the form a != b. 326 HInstruction* vector_runtime_test_a_; 327 HInstruction* vector_runtime_test_b_; 328 329 // Mapping used during vectorization synthesis for both the scalar peeling/cleanup 330 // loop (mode is kSequential) and the actual vector loop (mode is kVector). The data 331 // structure maps original instructions into the new instructions. 332 // Contents reside in phase-local heap memory. 333 ScopedArenaSafeMap<HInstruction*, HInstruction*>* vector_map_; 334 335 // Permanent mapping used during vectorization synthesis. 336 // Contents reside in phase-local heap memory. 337 ScopedArenaSafeMap<HInstruction*, HInstruction*>* vector_permanent_map_; 338 339 // Temporary vectorization bookkeeping. 340 VectorMode vector_mode_; // synthesis mode 341 HBasicBlock* vector_preheader_; // preheader of the new loop 342 HBasicBlock* vector_header_; // header of the new loop 343 HBasicBlock* vector_body_; // body of the new loop 344 HInstruction* vector_index_; // normalized index of the new loop 345 346 // Helper for target-specific behaviour for loop optimizations. 347 ArchNoOptsLoopHelper* arch_loop_helper_; 348 349 friend class LoopOptimizationTest; 350 351 DISALLOW_COPY_AND_ASSIGN(HLoopOptimization); 352 }; 353 354 } // namespace art 355 356 #endif // ART_COMPILER_OPTIMIZING_LOOP_OPTIMIZATION_H_ 357