• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2016 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef ART_COMPILER_OPTIMIZING_LOOP_OPTIMIZATION_H_
18 #define ART_COMPILER_OPTIMIZING_LOOP_OPTIMIZATION_H_
19 
20 #include "base/scoped_arena_allocator.h"
21 #include "base/scoped_arena_containers.h"
22 #include "induction_var_range.h"
23 #include "loop_analysis.h"
24 #include "nodes.h"
25 #include "optimization.h"
26 #include "superblock_cloner.h"
27 
28 namespace art {
29 
30 class CompilerOptions;
31 class ArchNoOptsLoopHelper;
32 
33 /**
34  * Loop optimizations. Builds a loop hierarchy and applies optimizations to
35  * the detected nested loops, such as removal of dead induction and empty loops
36  * and inner loop vectorization.
37  */
38 class HLoopOptimization : public HOptimization {
39  public:
40   HLoopOptimization(HGraph* graph,
41                     const CodeGenerator& codegen,    // Needs info about the target.
42                     HInductionVarAnalysis* induction_analysis,
43                     OptimizingCompilerStats* stats,
44                     const char* name = kLoopOptimizationPassName);
45 
46   bool Run() override;
47 
48   static constexpr const char* kLoopOptimizationPassName = "loop_optimization";
49 
50  private:
51   /**
52    * A single loop inside the loop hierarchy representation.
53    */
54   struct LoopNode : public ArenaObject<kArenaAllocLoopOptimization> {
LoopNodeLoopNode55     explicit LoopNode(HLoopInformation* lp_info)
56         : loop_info(lp_info),
57           outer(nullptr),
58           inner(nullptr),
59           previous(nullptr),
60           next(nullptr),
61           try_catch_kind(TryCatchKind::kUnknown) {}
62 
63     enum class TryCatchKind {
64       kUnknown,
65       // Either if we have a try catch in the loop, or if the loop is inside of an outer try catch,
66       // we set `kHasTryCatch`.
67       kHasTryCatch,
68       kNoTryCatch
69     };
70 
71     HLoopInformation* loop_info;
72     LoopNode* outer;
73     LoopNode* inner;
74     LoopNode* previous;
75     LoopNode* next;
76     TryCatchKind try_catch_kind;
77   };
78 
79   /*
80    * Vectorization restrictions (bit mask).
81    */
82   enum VectorRestrictions {
83     kNone            = 0,        // no restrictions
84     kNoMul           = 1 << 0,   // no multiplication
85     kNoDiv           = 1 << 1,   // no division
86     kNoShift         = 1 << 2,   // no shift
87     kNoShr           = 1 << 3,   // no arithmetic shift right
88     kNoHiBits        = 1 << 4,   // "wider" operations cannot bring in higher order bits
89     kNoSignedHAdd    = 1 << 5,   // no signed halving add
90     kNoUnsignedHAdd  = 1 << 6,   // no unsigned halving add
91     kNoUnroundedHAdd = 1 << 7,   // no unrounded halving add
92     kNoAbs           = 1 << 8,   // no absolute value
93     kNoStringCharAt  = 1 << 9,   // no StringCharAt
94     kNoReduction     = 1 << 10,  // no reduction
95     kNoSAD           = 1 << 11,  // no sum of absolute differences (SAD)
96     kNoWideSAD       = 1 << 12,  // no sum of absolute differences (SAD) with operand widening
97     kNoDotProd       = 1 << 13,  // no dot product
98   };
99 
100   /*
101    * Vectorization mode during synthesis
102    * (sequential peeling/cleanup loop or vector loop).
103    */
104   enum VectorMode {
105     kSequential,
106     kVector
107   };
108 
109   /*
110    * Representation of a unit-stride array reference.
111    */
112   struct ArrayReference {
113     ArrayReference(HInstruction* b, HInstruction* o, DataType::Type t, bool l, bool c = false)
baseArrayReference114         : base(b), offset(o), type(t), lhs(l), is_string_char_at(c) { }
115     bool operator<(const ArrayReference& other) const {
116       return
117           (base < other.base) ||
118           (base == other.base &&
119            (offset < other.offset || (offset == other.offset &&
120                                       (type < other.type ||
121                                        (type == other.type &&
122                                         (lhs < other.lhs ||
123                                          (lhs == other.lhs &&
124                                           is_string_char_at < other.is_string_char_at)))))));
125     }
126     HInstruction* base;      // base address
127     HInstruction* offset;    // offset + i
128     DataType::Type type;     // component type
129     bool lhs;                // def/use
130     bool is_string_char_at;  // compressed string read
131   };
132 
133   //
134   // Loop setup and traversal.
135   //
136 
137   bool LocalRun();
138   void AddLoop(HLoopInformation* loop_info);
139   void RemoveLoop(LoopNode* node);
140 
141   // Traverses all loops inner to outer to perform simplifications and optimizations.
142   // Returns true if loops nested inside current loop (node) have changed.
143   bool TraverseLoopsInnerToOuter(LoopNode* node);
144 
145   // Calculates `node`'s `try_catch_kind` and sets it to:
146   // 1) kHasTryCatch if it has try catches (or if it's inside of an outer try catch)
147   // 2) kNoTryCatch otherwise.
148   void CalculateAndSetTryCatchKind(LoopNode* node);
149 
150   //
151   // Optimization.
152   //
153 
154   void SimplifyInduction(LoopNode* node);
155   void SimplifyBlocks(LoopNode* node);
156 
157   // Performs optimizations specific to inner loop with finite header logic (empty loop removal,
158   // unrolling, vectorization). Returns true if anything changed.
159   bool TryOptimizeInnerLoopFinite(LoopNode* node);
160 
161   // Performs optimizations specific to inner loop. Returns true if anything changed.
162   bool OptimizeInnerLoop(LoopNode* node);
163 
164   // Tries to apply loop unrolling for branch penalty reduction and better instruction scheduling
165   // opportunities. Returns whether transformation happened. 'generate_code' determines whether the
166   // optimization should be actually applied.
167   bool TryUnrollingForBranchPenaltyReduction(LoopAnalysisInfo* analysis_info,
168                                              bool generate_code = true);
169 
170   // Tries to apply loop peeling for loop invariant exits elimination. Returns whether
171   // transformation happened. 'generate_code' determines whether the optimization should be
172   // actually applied.
173   bool TryPeelingForLoopInvariantExitsElimination(LoopAnalysisInfo* analysis_info,
174                                                   bool generate_code = true);
175 
176   // Tries to perform whole loop unrolling for a small loop with a small trip count to eliminate
177   // the loop check overhead and to have more opportunities for inter-iteration optimizations.
178   // Returns whether transformation happened. 'generate_code' determines whether the optimization
179   // should be actually applied.
180   bool TryFullUnrolling(LoopAnalysisInfo* analysis_info, bool generate_code = true);
181 
182   // Tries to apply scalar loop peeling and unrolling.
183   bool TryPeelingAndUnrolling(LoopNode* node);
184 
185   //
186   // Vectorization analysis and synthesis.
187   //
188 
189   bool ShouldVectorize(LoopNode* node, HBasicBlock* block, int64_t trip_count);
190   void Vectorize(LoopNode* node, HBasicBlock* block, HBasicBlock* exit, int64_t trip_count);
191   void GenerateNewLoop(LoopNode* node,
192                        HBasicBlock* block,
193                        HBasicBlock* new_preheader,
194                        HInstruction* lo,
195                        HInstruction* hi,
196                        HInstruction* step,
197                        uint32_t unroll);
198   bool VectorizeDef(LoopNode* node, HInstruction* instruction, bool generate_code);
199   bool VectorizeUse(LoopNode* node,
200                     HInstruction* instruction,
201                     bool generate_code,
202                     DataType::Type type,
203                     uint64_t restrictions);
204   uint32_t GetVectorSizeInBytes();
205   bool TrySetVectorType(DataType::Type type, /*out*/ uint64_t* restrictions);
206   bool TrySetVectorLengthImpl(uint32_t length);
207 
TrySetVectorLength(DataType::Type type,uint32_t length)208   bool TrySetVectorLength(DataType::Type type, uint32_t length) {
209     bool res = TrySetVectorLengthImpl(length);
210     // Currently the vectorizer supports only the mode when full SIMD registers are used.
211     DCHECK_IMPLIES(res, DataType::Size(type) * length == GetVectorSizeInBytes());
212     return res;
213   }
214 
215   void GenerateVecInv(HInstruction* org, DataType::Type type);
216   void GenerateVecSub(HInstruction* org, HInstruction* offset);
217   void GenerateVecMem(HInstruction* org,
218                       HInstruction* opa,
219                       HInstruction* opb,
220                       HInstruction* offset,
221                       DataType::Type type);
222   void GenerateVecReductionPhi(HPhi* phi);
223   void GenerateVecReductionPhiInputs(HPhi* phi, HInstruction* reduction);
224   HInstruction* ReduceAndExtractIfNeeded(HInstruction* instruction);
225   void GenerateVecOp(HInstruction* org,
226                      HInstruction* opa,
227                      HInstruction* opb,
228                      DataType::Type type);
229 
230   // Vectorization idioms.
231   bool VectorizeSaturationIdiom(LoopNode* node,
232                                 HInstruction* instruction,
233                                 bool generate_code,
234                                 DataType::Type type,
235                                 uint64_t restrictions);
236   bool VectorizeHalvingAddIdiom(LoopNode* node,
237                                 HInstruction* instruction,
238                                 bool generate_code,
239                                 DataType::Type type,
240                                 uint64_t restrictions);
241   bool VectorizeSADIdiom(LoopNode* node,
242                          HInstruction* instruction,
243                          bool generate_code,
244                          DataType::Type type,
245                          uint64_t restrictions);
246   bool VectorizeDotProdIdiom(LoopNode* node,
247                              HInstruction* instruction,
248                              bool generate_code,
249                              DataType::Type type,
250                              uint64_t restrictions);
251 
252   // Vectorization heuristics.
253   Alignment ComputeAlignment(HInstruction* offset,
254                              DataType::Type type,
255                              bool is_string_char_at,
256                              uint32_t peeling = 0);
257   void SetAlignmentStrategy(const ScopedArenaVector<uint32_t>& peeling_votes,
258                             const ArrayReference* peeling_candidate);
259   uint32_t MaxNumberPeeled();
260   bool IsVectorizationProfitable(int64_t trip_count);
261 
262   //
263   // Helpers.
264   //
265 
266   bool TrySetPhiInduction(HPhi* phi, bool restrict_uses);
267   bool TrySetPhiReduction(HPhi* phi);
268 
269   // Detects loop header with a single induction (returned in main_phi), possibly
270   // other phis for reductions, but no other side effects. Returns true on success.
271   bool TrySetSimpleLoopHeader(HBasicBlock* block, /*out*/ HPhi** main_phi);
272 
273   bool IsEmptyBody(HBasicBlock* block);
274   bool IsOnlyUsedAfterLoop(HLoopInformation* loop_info,
275                            HInstruction* instruction,
276                            bool collect_loop_uses,
277                            /*out*/ uint32_t* use_count);
278   bool IsUsedOutsideLoop(HLoopInformation* loop_info,
279                          HInstruction* instruction);
280   bool TryReplaceWithLastValue(HLoopInformation* loop_info,
281                                HInstruction* instruction,
282                                HBasicBlock* block);
283   bool TryAssignLastValue(HLoopInformation* loop_info,
284                           HInstruction* instruction,
285                           HBasicBlock* block,
286                           bool collect_loop_uses);
287   void RemoveDeadInstructions(const HInstructionList& list);
288   bool CanRemoveCycle();  // Whether the current 'iset_' is removable.
289 
IsInPredicatedVectorizationMode()290   bool IsInPredicatedVectorizationMode() const { return predicated_vectorization_mode_; }
291 
292   // Compiler options (to query ISA features).
293   const CompilerOptions* compiler_options_;
294 
295   // Cached target SIMD vector register size in bytes.
296   const size_t simd_register_size_;
297 
298   // Range information based on prior induction variable analysis.
299   InductionVarRange induction_range_;
300 
301   // Phase-local heap memory allocator for the loop optimizer. Storage obtained
302   // through this allocator is immediately released when the loop optimizer is done.
303   ScopedArenaAllocator* loop_allocator_;
304 
305   // Global heap memory allocator. Used to build HIR.
306   ArenaAllocator* global_allocator_;
307 
308   // Entries into the loop hierarchy representation. The hierarchy resides
309   // in phase-local heap memory.
310   LoopNode* top_loop_;
311   LoopNode* last_loop_;
312 
313   // Temporary bookkeeping of a set of instructions.
314   // Contents reside in phase-local heap memory.
315   ScopedArenaSet<HInstruction*>* iset_;
316 
317   // Temporary bookkeeping of reduction instructions. Mapping is two-fold:
318   // (1) reductions in the loop-body are mapped back to their phi definition,
319   // (2) phi definitions are mapped to their initial value (updated during
320   //     code generation to feed the proper values into the new chain).
321   // Contents reside in phase-local heap memory.
322   ScopedArenaSafeMap<HInstruction*, HInstruction*>* reductions_;
323 
324   // Flag that tracks if any simplifications have occurred.
325   bool simplified_;
326 
327   // Whether to use predicated loop vectorization (e.g. for arm64 SVE target).
328   bool predicated_vectorization_mode_;
329 
330   // Number of "lanes" for selected packed type.
331   uint32_t vector_length_;
332 
333   // Set of array references in the vector loop.
334   // Contents reside in phase-local heap memory.
335   ScopedArenaSet<ArrayReference>* vector_refs_;
336 
337   // Static or dynamic loop peeling for alignment.
338   uint32_t vector_static_peeling_factor_;
339   const ArrayReference* vector_dynamic_peeling_candidate_;
340 
341   // Dynamic data dependence test of the form a != b.
342   HInstruction* vector_runtime_test_a_;
343   HInstruction* vector_runtime_test_b_;
344 
345   // Mapping used during vectorization synthesis for both the scalar peeling/cleanup
346   // loop (mode is kSequential) and the actual vector loop (mode is kVector). The data
347   // structure maps original instructions into the new instructions.
348   // Contents reside in phase-local heap memory.
349   ScopedArenaSafeMap<HInstruction*, HInstruction*>* vector_map_;
350 
351   // Permanent mapping used during vectorization synthesis.
352   // Contents reside in phase-local heap memory.
353   ScopedArenaSafeMap<HInstruction*, HInstruction*>* vector_permanent_map_;
354 
355   // Temporary vectorization bookkeeping.
356   VectorMode vector_mode_;  // synthesis mode
357   HBasicBlock* vector_preheader_;  // preheader of the new loop
358   HBasicBlock* vector_header_;  // header of the new loop
359   HBasicBlock* vector_body_;  // body of the new loop
360   HInstruction* vector_index_;  // normalized index of the new loop
361 
362   // Helper for target-specific behaviour for loop optimizations.
363   ArchNoOptsLoopHelper* arch_loop_helper_;
364 
365   friend class LoopOptimizationTest;
366 
367   DISALLOW_COPY_AND_ASSIGN(HLoopOptimization);
368 };
369 
370 }  // namespace art
371 
372 #endif  // ART_COMPILER_OPTIMIZING_LOOP_OPTIMIZATION_H_
373