• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2016 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef ART_COMPILER_OPTIMIZING_LOOP_OPTIMIZATION_H_
18 #define ART_COMPILER_OPTIMIZING_LOOP_OPTIMIZATION_H_
19 
20 #include "base/macros.h"
21 #include "base/scoped_arena_allocator.h"
22 #include "base/scoped_arena_containers.h"
23 #include "induction_var_range.h"
24 #include "loop_analysis.h"
25 #include "nodes.h"
26 #include "optimization.h"
27 #include "superblock_cloner.h"
28 
29 namespace art HIDDEN {
30 
31 class CompilerOptions;
32 class ArchNoOptsLoopHelper;
33 
34 /**
35  * Loop optimizations. Builds a loop hierarchy and applies optimizations to
36  * the detected nested loops, such as removal of dead induction and empty loops
37  * and inner loop vectorization.
38  */
39 class HLoopOptimization : public HOptimization {
40  public:
41   HLoopOptimization(HGraph* graph,
42                     const CodeGenerator& codegen,    // Needs info about the target.
43                     HInductionVarAnalysis* induction_analysis,
44                     OptimizingCompilerStats* stats,
45                     const char* name = kLoopOptimizationPassName);
46 
47   bool Run() override;
48 
49   static constexpr const char* kLoopOptimizationPassName = "loop_optimization";
50 
51   // The maximum number of total instructions (trip_count * instruction_count),
52   // where the optimization of removing SuspendChecks from the loop header could
53   // be performed.
54   static constexpr int64_t kMaxTotalInstRemoveSuspendCheck = 128;
55 
56  private:
57   /**
58    * A single loop inside the loop hierarchy representation.
59    */
60   struct LoopNode : public ArenaObject<kArenaAllocLoopOptimization> {
LoopNodeLoopNode61     explicit LoopNode(HLoopInformation* lp_info)
62         : loop_info(lp_info),
63           outer(nullptr),
64           inner(nullptr),
65           previous(nullptr),
66           next(nullptr),
67           try_catch_kind(TryCatchKind::kUnknown) {}
68 
69     enum class TryCatchKind {
70       kUnknown,
71       // Either if we have a try catch in the loop, or if the loop is inside of an outer try catch,
72       // we set `kHasTryCatch`.
73       kHasTryCatch,
74       kNoTryCatch
75     };
76 
77     HLoopInformation* loop_info;
78     LoopNode* outer;
79     LoopNode* inner;
80     LoopNode* previous;
81     LoopNode* next;
82     TryCatchKind try_catch_kind;
83   };
84 
85   /*
86    * Vectorization restrictions (bit mask).
87    */
88   enum VectorRestrictions {
89     kNone            = 0,        // no restrictions
90     kNoMul           = 1 << 0,   // no multiplication
91     kNoDiv           = 1 << 1,   // no division
92     kNoShift         = 1 << 2,   // no shift
93     kNoShr           = 1 << 3,   // no arithmetic shift right
94     kNoHiBits        = 1 << 4,   // "wider" operations cannot bring in higher order bits
95     kNoSignedHAdd    = 1 << 5,   // no signed halving add
96     kNoUnsignedHAdd  = 1 << 6,   // no unsigned halving add
97     kNoUnroundedHAdd = 1 << 7,   // no unrounded halving add
98     kNoAbs           = 1 << 8,   // no absolute value
99     kNoStringCharAt  = 1 << 9,   // no StringCharAt
100     kNoReduction     = 1 << 10,  // no reduction
101     kNoSAD           = 1 << 11,  // no sum of absolute differences (SAD)
102     kNoWideSAD       = 1 << 12,  // no sum of absolute differences (SAD) with operand widening
103     kNoDotProd       = 1 << 13,  // no dot product
104   };
105 
106   /*
107    * Vectorization mode during synthesis
108    * (sequential peeling/cleanup loop or vector loop).
109    */
110   enum VectorMode {
111     kSequential,
112     kVector
113   };
114 
115   /*
116    * Representation of a unit-stride array reference.
117    */
118   struct ArrayReference {
119     ArrayReference(HInstruction* b, HInstruction* o, DataType::Type t, bool l, bool c = false)
baseArrayReference120         : base(b), offset(o), type(t), lhs(l), is_string_char_at(c) { }
121     bool operator<(const ArrayReference& other) const {
122       return
123           (base < other.base) ||
124           (base == other.base &&
125            (offset < other.offset || (offset == other.offset &&
126                                       (type < other.type ||
127                                        (type == other.type &&
128                                         (lhs < other.lhs ||
129                                          (lhs == other.lhs &&
130                                           is_string_char_at < other.is_string_char_at)))))));
131     }
132     HInstruction* base;      // base address
133     HInstruction* offset;    // offset + i
134     DataType::Type type;     // component type
135     bool lhs;                // def/use
136     bool is_string_char_at;  // compressed string read
137   };
138 
139   //
140   // Loop setup and traversal.
141   //
142 
143   bool LocalRun();
144   void AddLoop(HLoopInformation* loop_info);
145   void RemoveLoop(LoopNode* node);
146 
147   // Traverses all loops inner to outer to perform simplifications and optimizations.
148   // Returns true if loops nested inside current loop (node) have changed.
149   bool TraverseLoopsInnerToOuter(LoopNode* node);
150 
151   // Calculates `node`'s `try_catch_kind` and sets it to:
152   // 1) kHasTryCatch if it has try catches (or if it's inside of an outer try catch)
153   // 2) kNoTryCatch otherwise.
154   void CalculateAndSetTryCatchKind(LoopNode* node);
155 
156   //
157   // Optimization.
158   //
159 
160   void SimplifyInduction(LoopNode* node);
161   void SimplifyBlocks(LoopNode* node);
162 
163   // Performs optimizations specific to inner loop with finite header logic (empty loop removal,
164   // unrolling, vectorization). Returns true if anything changed.
165   bool TryOptimizeInnerLoopFinite(LoopNode* node);
166 
167   // Performs optimizations specific to inner loop. Returns true if anything changed.
168   bool OptimizeInnerLoop(LoopNode* node);
169 
170   // Tries to apply loop unrolling for branch penalty reduction and better instruction scheduling
171   // opportunities. Returns whether transformation happened. 'generate_code' determines whether the
172   // optimization should be actually applied.
173   bool TryUnrollingForBranchPenaltyReduction(LoopAnalysisInfo* analysis_info,
174                                              bool generate_code = true);
175 
176   // Tries to apply loop peeling for loop invariant exits elimination. Returns whether
177   // transformation happened. 'generate_code' determines whether the optimization should be
178   // actually applied.
179   bool TryPeelingForLoopInvariantExitsElimination(LoopAnalysisInfo* analysis_info,
180                                                   bool generate_code = true);
181 
182   // Tries to perform whole loop unrolling for a small loop with a small trip count to eliminate
183   // the loop check overhead and to have more opportunities for inter-iteration optimizations.
184   // Returns whether transformation happened. 'generate_code' determines whether the optimization
185   // should be actually applied.
186   bool TryFullUnrolling(LoopAnalysisInfo* analysis_info, bool generate_code = true);
187 
188   // Tries to remove SuspendCheck for plain loops with a low trip count. The
189   // SuspendCheck in the codegen makes sure that the thread can be interrupted
190   // during execution for GC. Not being able to do so might decrease the
191   // responsiveness of GC when a very long loop or a long recursion is being
192   // executed. However, for plain loops with a small trip count, the removal of
193   // SuspendCheck should not affect the GC's responsiveness by a large margin.
194   // Consequently, since the thread won't be interrupted for plain loops, it is
195   // assumed that the performance might increase by removing SuspendCheck.
196   bool TryToRemoveSuspendCheckFromLoopHeader(LoopAnalysisInfo* analysis_info,
197                                              bool generate_code = true);
198 
199   // Tries to apply scalar loop optimizations.
200   bool TryLoopScalarOpts(LoopNode* node);
201 
202   //
203   // Vectorization analysis and synthesis.
204   //
205 
206   bool ShouldVectorize(LoopNode* node, HBasicBlock* block, int64_t trip_count);
207   void Vectorize(LoopNode* node, HBasicBlock* block, HBasicBlock* exit, int64_t trip_count);
208   void GenerateNewLoop(LoopNode* node,
209                        HBasicBlock* block,
210                        HBasicBlock* new_preheader,
211                        HInstruction* lo,
212                        HInstruction* hi,
213                        HInstruction* step,
214                        uint32_t unroll);
215   bool VectorizeDef(LoopNode* node, HInstruction* instruction, bool generate_code);
216   bool VectorizeUse(LoopNode* node,
217                     HInstruction* instruction,
218                     bool generate_code,
219                     DataType::Type type,
220                     uint64_t restrictions);
221   uint32_t GetVectorSizeInBytes();
222   bool TrySetVectorType(DataType::Type type, /*out*/ uint64_t* restrictions);
223   bool TrySetVectorLengthImpl(uint32_t length);
224 
TrySetVectorLength(DataType::Type type,uint32_t length)225   bool TrySetVectorLength(DataType::Type type, uint32_t length) {
226     bool res = TrySetVectorLengthImpl(length);
227     // Currently the vectorizer supports only the mode when full SIMD registers are used.
228     DCHECK_IMPLIES(res, DataType::Size(type) * length == GetVectorSizeInBytes());
229     return res;
230   }
231 
232   void GenerateVecInv(HInstruction* org, DataType::Type type);
233   void GenerateVecSub(HInstruction* org, HInstruction* offset);
234   void GenerateVecMem(HInstruction* org,
235                       HInstruction* opa,
236                       HInstruction* opb,
237                       HInstruction* offset,
238                       DataType::Type type);
239   void GenerateVecReductionPhi(HPhi* phi);
240   void GenerateVecReductionPhiInputs(HPhi* phi, HInstruction* reduction);
241   HInstruction* ReduceAndExtractIfNeeded(HInstruction* instruction);
242   void GenerateVecOp(HInstruction* org,
243                      HInstruction* opa,
244                      HInstruction* opb,
245                      DataType::Type type);
246 
247   // Vectorization idioms.
248   bool VectorizeSaturationIdiom(LoopNode* node,
249                                 HInstruction* instruction,
250                                 bool generate_code,
251                                 DataType::Type type,
252                                 uint64_t restrictions);
253   bool VectorizeHalvingAddIdiom(LoopNode* node,
254                                 HInstruction* instruction,
255                                 bool generate_code,
256                                 DataType::Type type,
257                                 uint64_t restrictions);
258   bool VectorizeSADIdiom(LoopNode* node,
259                          HInstruction* instruction,
260                          bool generate_code,
261                          DataType::Type type,
262                          uint64_t restrictions);
263   bool VectorizeDotProdIdiom(LoopNode* node,
264                              HInstruction* instruction,
265                              bool generate_code,
266                              DataType::Type type,
267                              uint64_t restrictions);
268 
269   // Vectorization heuristics.
270   Alignment ComputeAlignment(HInstruction* offset,
271                              DataType::Type type,
272                              bool is_string_char_at,
273                              uint32_t peeling = 0);
274   void SetAlignmentStrategy(const ScopedArenaVector<uint32_t>& peeling_votes,
275                             const ArrayReference* peeling_candidate);
276   uint32_t MaxNumberPeeled();
277   bool IsVectorizationProfitable(int64_t trip_count);
278 
279   //
280   // Helpers.
281   //
282 
283   bool TrySetPhiInduction(HPhi* phi, bool restrict_uses);
284   bool TrySetPhiReduction(HPhi* phi);
285 
286   // Detects loop header with a single induction (returned in main_phi), possibly
287   // other phis for reductions, but no other side effects. Returns true on success.
288   bool TrySetSimpleLoopHeader(HBasicBlock* block, /*out*/ HPhi** main_phi);
289 
290   bool IsEmptyBody(HBasicBlock* block);
291   bool IsOnlyUsedAfterLoop(HLoopInformation* loop_info,
292                            HInstruction* instruction,
293                            bool collect_loop_uses,
294                            /*out*/ uint32_t* use_count);
295   bool IsUsedOutsideLoop(HLoopInformation* loop_info,
296                          HInstruction* instruction);
297   bool TryReplaceWithLastValue(HLoopInformation* loop_info,
298                                HInstruction* instruction,
299                                HBasicBlock* block);
300   bool TryAssignLastValue(HLoopInformation* loop_info,
301                           HInstruction* instruction,
302                           HBasicBlock* block,
303                           bool collect_loop_uses);
304   void RemoveDeadInstructions(const HInstructionList& list);
305   bool CanRemoveCycle();  // Whether the current 'iset_' is removable.
306 
IsInPredicatedVectorizationMode()307   bool IsInPredicatedVectorizationMode() const { return predicated_vectorization_mode_; }
308 
309   // Compiler options (to query ISA features).
310   const CompilerOptions* compiler_options_;
311 
312   // Cached target SIMD vector register size in bytes.
313   const size_t simd_register_size_;
314 
315   // Range information based on prior induction variable analysis.
316   InductionVarRange induction_range_;
317 
318   // Phase-local heap memory allocator for the loop optimizer. Storage obtained
319   // through this allocator is immediately released when the loop optimizer is done.
320   ScopedArenaAllocator* loop_allocator_;
321 
322   // Global heap memory allocator. Used to build HIR.
323   ArenaAllocator* global_allocator_;
324 
325   // Entries into the loop hierarchy representation. The hierarchy resides
326   // in phase-local heap memory.
327   LoopNode* top_loop_;
328   LoopNode* last_loop_;
329 
330   // Temporary bookkeeping of a set of instructions.
331   // Contents reside in phase-local heap memory.
332   ScopedArenaSet<HInstruction*>* iset_;
333 
334   // Temporary bookkeeping of reduction instructions. Mapping is two-fold:
335   // (1) reductions in the loop-body are mapped back to their phi definition,
336   // (2) phi definitions are mapped to their initial value (updated during
337   //     code generation to feed the proper values into the new chain).
338   // Contents reside in phase-local heap memory.
339   ScopedArenaSafeMap<HInstruction*, HInstruction*>* reductions_;
340 
341   // Flag that tracks if any simplifications have occurred.
342   bool simplified_;
343 
344   // Whether to use predicated loop vectorization (e.g. for arm64 SVE target).
345   bool predicated_vectorization_mode_;
346 
347   // Number of "lanes" for selected packed type.
348   uint32_t vector_length_;
349 
350   // Set of array references in the vector loop.
351   // Contents reside in phase-local heap memory.
352   ScopedArenaSet<ArrayReference>* vector_refs_;
353 
354   // Static or dynamic loop peeling for alignment.
355   uint32_t vector_static_peeling_factor_;
356   const ArrayReference* vector_dynamic_peeling_candidate_;
357 
358   // Dynamic data dependence test of the form a != b.
359   HInstruction* vector_runtime_test_a_;
360   HInstruction* vector_runtime_test_b_;
361 
362   // Mapping used during vectorization synthesis for both the scalar peeling/cleanup
363   // loop (mode is kSequential) and the actual vector loop (mode is kVector). The data
364   // structure maps original instructions into the new instructions.
365   // Contents reside in phase-local heap memory.
366   ScopedArenaSafeMap<HInstruction*, HInstruction*>* vector_map_;
367 
368   // Permanent mapping used during vectorization synthesis.
369   // Contents reside in phase-local heap memory.
370   ScopedArenaSafeMap<HInstruction*, HInstruction*>* vector_permanent_map_;
371 
372   // Temporary vectorization bookkeeping.
373   VectorMode vector_mode_;  // synthesis mode
374   HBasicBlock* vector_preheader_;  // preheader of the new loop
375   HBasicBlock* vector_header_;  // header of the new loop
376   HBasicBlock* vector_body_;  // body of the new loop
377   HInstruction* vector_index_;  // normalized index of the new loop
378 
379   // Helper for target-specific behaviour for loop optimizations.
380   ArchNoOptsLoopHelper* arch_loop_helper_;
381 
382   friend class LoopOptimizationTest;
383 
384   DISALLOW_COPY_AND_ASSIGN(HLoopOptimization);
385 };
386 
387 }  // namespace art
388 
389 #endif  // ART_COMPILER_OPTIMIZING_LOOP_OPTIMIZATION_H_
390