1 /* 2 * Copyright (C) 2016 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef ART_COMPILER_OPTIMIZING_LOOP_OPTIMIZATION_H_ 18 #define ART_COMPILER_OPTIMIZING_LOOP_OPTIMIZATION_H_ 19 20 #include "base/scoped_arena_allocator.h" 21 #include "base/scoped_arena_containers.h" 22 #include "induction_var_range.h" 23 #include "loop_analysis.h" 24 #include "nodes.h" 25 #include "optimization.h" 26 #include "superblock_cloner.h" 27 28 namespace art { 29 30 class CompilerOptions; 31 class ArchNoOptsLoopHelper; 32 33 /** 34 * Loop optimizations. Builds a loop hierarchy and applies optimizations to 35 * the detected nested loops, such as removal of dead induction and empty loops 36 * and inner loop vectorization. 37 */ 38 class HLoopOptimization : public HOptimization { 39 public: 40 HLoopOptimization(HGraph* graph, 41 const CompilerOptions* compiler_options, 42 HInductionVarAnalysis* induction_analysis, 43 OptimizingCompilerStats* stats, 44 const char* name = kLoopOptimizationPassName); 45 46 bool Run() override; 47 48 static constexpr const char* kLoopOptimizationPassName = "loop_optimization"; 49 50 private: 51 /** 52 * A single loop inside the loop hierarchy representation. 53 */ 54 struct LoopNode : public ArenaObject<kArenaAllocLoopOptimization> { LoopNodeLoopNode55 explicit LoopNode(HLoopInformation* lp_info) 56 : loop_info(lp_info), 57 outer(nullptr), 58 inner(nullptr), 59 previous(nullptr), 60 next(nullptr) {} 61 HLoopInformation* loop_info; 62 LoopNode* outer; 63 LoopNode* inner; 64 LoopNode* previous; 65 LoopNode* next; 66 }; 67 68 /* 69 * Vectorization restrictions (bit mask). 70 */ 71 enum VectorRestrictions { 72 kNone = 0, // no restrictions 73 kNoMul = 1 << 0, // no multiplication 74 kNoDiv = 1 << 1, // no division 75 kNoShift = 1 << 2, // no shift 76 kNoShr = 1 << 3, // no arithmetic shift right 77 kNoHiBits = 1 << 4, // "wider" operations cannot bring in higher order bits 78 kNoSignedHAdd = 1 << 5, // no signed halving add 79 kNoUnroundedHAdd = 1 << 6, // no unrounded halving add 80 kNoAbs = 1 << 7, // no absolute value 81 kNoStringCharAt = 1 << 8, // no StringCharAt 82 kNoReduction = 1 << 9, // no reduction 83 kNoSAD = 1 << 10, // no sum of absolute differences (SAD) 84 kNoWideSAD = 1 << 11, // no sum of absolute differences (SAD) with operand widening 85 kNoDotProd = 1 << 12, // no dot product 86 }; 87 88 /* 89 * Vectorization mode during synthesis 90 * (sequential peeling/cleanup loop or vector loop). 91 */ 92 enum VectorMode { 93 kSequential, 94 kVector 95 }; 96 97 /* 98 * Representation of a unit-stride array reference. 99 */ 100 struct ArrayReference { 101 ArrayReference(HInstruction* b, HInstruction* o, DataType::Type t, bool l, bool c = false) baseArrayReference102 : base(b), offset(o), type(t), lhs(l), is_string_char_at(c) { } 103 bool operator<(const ArrayReference& other) const { 104 return 105 (base < other.base) || 106 (base == other.base && 107 (offset < other.offset || (offset == other.offset && 108 (type < other.type || 109 (type == other.type && 110 (lhs < other.lhs || 111 (lhs == other.lhs && 112 is_string_char_at < other.is_string_char_at))))))); 113 } 114 HInstruction* base; // base address 115 HInstruction* offset; // offset + i 116 DataType::Type type; // component type 117 bool lhs; // def/use 118 bool is_string_char_at; // compressed string read 119 }; 120 121 // 122 // Loop setup and traversal. 123 // 124 125 bool LocalRun(); 126 void AddLoop(HLoopInformation* loop_info); 127 void RemoveLoop(LoopNode* node); 128 129 // Traverses all loops inner to outer to perform simplifications and optimizations. 130 // Returns true if loops nested inside current loop (node) have changed. 131 bool TraverseLoopsInnerToOuter(LoopNode* node); 132 133 // 134 // Optimization. 135 // 136 137 void SimplifyInduction(LoopNode* node); 138 void SimplifyBlocks(LoopNode* node); 139 140 // Performs optimizations specific to inner loop with finite header logic (empty loop removal, 141 // unrolling, vectorization). Returns true if anything changed. 142 bool TryOptimizeInnerLoopFinite(LoopNode* node); 143 144 // Performs optimizations specific to inner loop. Returns true if anything changed. 145 bool OptimizeInnerLoop(LoopNode* node); 146 147 // Tries to apply loop unrolling for branch penalty reduction and better instruction scheduling 148 // opportunities. Returns whether transformation happened. 'generate_code' determines whether the 149 // optimization should be actually applied. 150 bool TryUnrollingForBranchPenaltyReduction(LoopAnalysisInfo* analysis_info, 151 bool generate_code = true); 152 153 // Tries to apply loop peeling for loop invariant exits elimination. Returns whether 154 // transformation happened. 'generate_code' determines whether the optimization should be 155 // actually applied. 156 bool TryPeelingForLoopInvariantExitsElimination(LoopAnalysisInfo* analysis_info, 157 bool generate_code = true); 158 159 // Tries to perform whole loop unrolling for a small loop with a small trip count to eliminate 160 // the loop check overhead and to have more opportunities for inter-iteration optimizations. 161 // Returns whether transformation happened. 'generate_code' determines whether the optimization 162 // should be actually applied. 163 bool TryFullUnrolling(LoopAnalysisInfo* analysis_info, bool generate_code = true); 164 165 // Tries to apply scalar loop peeling and unrolling. 166 bool TryPeelingAndUnrolling(LoopNode* node); 167 168 // 169 // Vectorization analysis and synthesis. 170 // 171 172 bool ShouldVectorize(LoopNode* node, HBasicBlock* block, int64_t trip_count); 173 void Vectorize(LoopNode* node, HBasicBlock* block, HBasicBlock* exit, int64_t trip_count); 174 void GenerateNewLoop(LoopNode* node, 175 HBasicBlock* block, 176 HBasicBlock* new_preheader, 177 HInstruction* lo, 178 HInstruction* hi, 179 HInstruction* step, 180 uint32_t unroll); 181 bool VectorizeDef(LoopNode* node, HInstruction* instruction, bool generate_code); 182 bool VectorizeUse(LoopNode* node, 183 HInstruction* instruction, 184 bool generate_code, 185 DataType::Type type, 186 uint64_t restrictions); 187 uint32_t GetVectorSizeInBytes(); 188 bool TrySetVectorType(DataType::Type type, /*out*/ uint64_t* restrictions); 189 bool TrySetVectorLength(uint32_t length); 190 void GenerateVecInv(HInstruction* org, DataType::Type type); 191 void GenerateVecSub(HInstruction* org, HInstruction* offset); 192 void GenerateVecMem(HInstruction* org, 193 HInstruction* opa, 194 HInstruction* opb, 195 HInstruction* offset, 196 DataType::Type type); 197 void GenerateVecReductionPhi(HPhi* phi); 198 void GenerateVecReductionPhiInputs(HPhi* phi, HInstruction* reduction); 199 HInstruction* ReduceAndExtractIfNeeded(HInstruction* instruction); 200 void GenerateVecOp(HInstruction* org, 201 HInstruction* opa, 202 HInstruction* opb, 203 DataType::Type type); 204 205 // Vectorization idioms. 206 bool VectorizeSaturationIdiom(LoopNode* node, 207 HInstruction* instruction, 208 bool generate_code, 209 DataType::Type type, 210 uint64_t restrictions); 211 bool VectorizeHalvingAddIdiom(LoopNode* node, 212 HInstruction* instruction, 213 bool generate_code, 214 DataType::Type type, 215 uint64_t restrictions); 216 bool VectorizeSADIdiom(LoopNode* node, 217 HInstruction* instruction, 218 bool generate_code, 219 DataType::Type type, 220 uint64_t restrictions); 221 bool VectorizeDotProdIdiom(LoopNode* node, 222 HInstruction* instruction, 223 bool generate_code, 224 DataType::Type type, 225 uint64_t restrictions); 226 227 // Vectorization heuristics. 228 Alignment ComputeAlignment(HInstruction* offset, 229 DataType::Type type, 230 bool is_string_char_at, 231 uint32_t peeling = 0); 232 void SetAlignmentStrategy(uint32_t peeling_votes[], 233 const ArrayReference* peeling_candidate); 234 uint32_t MaxNumberPeeled(); 235 bool IsVectorizationProfitable(int64_t trip_count); 236 237 // 238 // Helpers. 239 // 240 241 bool TrySetPhiInduction(HPhi* phi, bool restrict_uses); 242 bool TrySetPhiReduction(HPhi* phi); 243 244 // Detects loop header with a single induction (returned in main_phi), possibly 245 // other phis for reductions, but no other side effects. Returns true on success. 246 bool TrySetSimpleLoopHeader(HBasicBlock* block, /*out*/ HPhi** main_phi); 247 248 bool IsEmptyBody(HBasicBlock* block); 249 bool IsOnlyUsedAfterLoop(HLoopInformation* loop_info, 250 HInstruction* instruction, 251 bool collect_loop_uses, 252 /*out*/ uint32_t* use_count); 253 bool IsUsedOutsideLoop(HLoopInformation* loop_info, 254 HInstruction* instruction); 255 bool TryReplaceWithLastValue(HLoopInformation* loop_info, 256 HInstruction* instruction, 257 HBasicBlock* block); 258 bool TryAssignLastValue(HLoopInformation* loop_info, 259 HInstruction* instruction, 260 HBasicBlock* block, 261 bool collect_loop_uses); 262 void RemoveDeadInstructions(const HInstructionList& list); 263 bool CanRemoveCycle(); // Whether the current 'iset_' is removable. 264 265 // Compiler options (to query ISA features). 266 const CompilerOptions* compiler_options_; 267 268 // Range information based on prior induction variable analysis. 269 InductionVarRange induction_range_; 270 271 // Phase-local heap memory allocator for the loop optimizer. Storage obtained 272 // through this allocator is immediately released when the loop optimizer is done. 273 ScopedArenaAllocator* loop_allocator_; 274 275 // Global heap memory allocator. Used to build HIR. 276 ArenaAllocator* global_allocator_; 277 278 // Entries into the loop hierarchy representation. The hierarchy resides 279 // in phase-local heap memory. 280 LoopNode* top_loop_; 281 LoopNode* last_loop_; 282 283 // Temporary bookkeeping of a set of instructions. 284 // Contents reside in phase-local heap memory. 285 ScopedArenaSet<HInstruction*>* iset_; 286 287 // Temporary bookkeeping of reduction instructions. Mapping is two-fold: 288 // (1) reductions in the loop-body are mapped back to their phi definition, 289 // (2) phi definitions are mapped to their initial value (updated during 290 // code generation to feed the proper values into the new chain). 291 // Contents reside in phase-local heap memory. 292 ScopedArenaSafeMap<HInstruction*, HInstruction*>* reductions_; 293 294 // Flag that tracks if any simplifications have occurred. 295 bool simplified_; 296 297 // Number of "lanes" for selected packed type. 298 uint32_t vector_length_; 299 300 // Set of array references in the vector loop. 301 // Contents reside in phase-local heap memory. 302 ScopedArenaSet<ArrayReference>* vector_refs_; 303 304 // Static or dynamic loop peeling for alignment. 305 uint32_t vector_static_peeling_factor_; 306 const ArrayReference* vector_dynamic_peeling_candidate_; 307 308 // Dynamic data dependence test of the form a != b. 309 HInstruction* vector_runtime_test_a_; 310 HInstruction* vector_runtime_test_b_; 311 312 // Mapping used during vectorization synthesis for both the scalar peeling/cleanup 313 // loop (mode is kSequential) and the actual vector loop (mode is kVector). The data 314 // structure maps original instructions into the new instructions. 315 // Contents reside in phase-local heap memory. 316 ScopedArenaSafeMap<HInstruction*, HInstruction*>* vector_map_; 317 318 // Permanent mapping used during vectorization synthesis. 319 // Contents reside in phase-local heap memory. 320 ScopedArenaSafeMap<HInstruction*, HInstruction*>* vector_permanent_map_; 321 322 // Temporary vectorization bookkeeping. 323 VectorMode vector_mode_; // synthesis mode 324 HBasicBlock* vector_preheader_; // preheader of the new loop 325 HBasicBlock* vector_header_; // header of the new loop 326 HBasicBlock* vector_body_; // body of the new loop 327 HInstruction* vector_index_; // normalized index of the new loop 328 329 // Helper for target-specific behaviour for loop optimizations. 330 ArchNoOptsLoopHelper* arch_loop_helper_; 331 332 friend class LoopOptimizationTest; 333 334 DISALLOW_COPY_AND_ASSIGN(HLoopOptimization); 335 }; 336 337 } // namespace art 338 339 #endif // ART_COMPILER_OPTIMIZING_LOOP_OPTIMIZATION_H_ 340