/*
 * Copyright (C) 2016 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef ART_COMPILER_OPTIMIZING_LOOP_OPTIMIZATION_H_
#define ART_COMPILER_OPTIMIZING_LOOP_OPTIMIZATION_H_

#include "base/scoped_arena_allocator.h"
#include "base/scoped_arena_containers.h"
#include "induction_var_range.h"
#include "loop_analysis.h"
#include "nodes.h"
#include "optimization.h"
#include "superblock_cloner.h"

namespace art {

class CompilerOptions;
class ArchNoOptsLoopHelper;

/**
 * Loop optimizations. Builds a loop hierarchy and applies optimizations to
 * the detected nested loops, such as removal of dead induction variables,
 * removal of empty loops, and inner loop vectorization.
 */
class HLoopOptimization : public HOptimization {
 public:
  HLoopOptimization(HGraph* graph,
                    const CodeGenerator& codegen,    // Needs info about the target.
                    HInductionVarAnalysis* induction_analysis,
                    OptimizingCompilerStats* stats,
                    const char* name = kLoopOptimizationPassName);

  bool Run() override;

  static constexpr const char* kLoopOptimizationPassName = "loop_optimization";

 private:
  /**
   * A single loop inside the loop hierarchy representation.
   */
  struct LoopNode : public ArenaObject<kArenaAllocLoopOptimization> {
    explicit LoopNode(HLoopInformation* lp_info)
        : loop_info(lp_info),
          outer(nullptr),
          inner(nullptr),
          previous(nullptr),
          next(nullptr) {}
    HLoopInformation* loop_info;
    LoopNode* outer;
    LoopNode* inner;
    LoopNode* previous;
    LoopNode* next;
  };

  /*
   * Vectorization restrictions (bit mask).
   */
  enum VectorRestrictions {
    kNone            = 0,        // no restrictions
    kNoMul           = 1 << 0,   // no multiplication
    kNoDiv           = 1 << 1,   // no division
    kNoShift         = 1 << 2,   // no shift
    kNoShr           = 1 << 3,   // no arithmetic shift right
    kNoHiBits        = 1 << 4,   // "wider" operations cannot bring in higher order bits
    kNoSignedHAdd    = 1 << 5,   // no signed halving add
    kNoUnroundedHAdd = 1 << 6,   // no unrounded halving add
    kNoAbs           = 1 << 7,   // no absolute value
    kNoStringCharAt  = 1 << 8,   // no StringCharAt
    kNoReduction     = 1 << 9,   // no reduction
    kNoSAD           = 1 << 10,  // no sum of absolute differences (SAD)
    kNoWideSAD       = 1 << 11,  // no sum of absolute differences (SAD) with operand widening
    kNoDotProd       = 1 << 12,  // no dot product
  };

  /*
   * Vectorization mode during synthesis
   * (sequential peeling/cleanup loop or vector loop).
   */
  enum VectorMode {
    kSequential,
    kVector
  };

  /*
   * Representation of a unit-stride array reference.
   */
  struct ArrayReference {
    ArrayReference(HInstruction* b, HInstruction* o, DataType::Type t, bool l, bool c = false)
        : base(b), offset(o), type(t), lhs(l), is_string_char_at(c) { }
    bool operator<(const ArrayReference& other) const {
      return
          (base < other.base) ||
          (base == other.base &&
           (offset < other.offset || (offset == other.offset &&
                                      (type < other.type ||
                                       (type == other.type &&
                                        (lhs < other.lhs ||
                                         (lhs == other.lhs &&
                                          is_string_char_at < other.is_string_char_at)))))));
    }
    HInstruction* base;      // base address
    HInstruction* offset;    // offset + i
    DataType::Type type;     // component type
    bool lhs;                // def/use
    bool is_string_char_at;  // compressed string read
  };

  //
  // Loop setup and traversal.
  //

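  // Builds the loop hierarchy in phase-local memory and traverses it to apply
  // optimizations. Returns true if the graph was changed.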
  bool LocalRun();
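  // Adds a node for the given loop to the loop hierarchy.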
  void AddLoop(HLoopInformation* loop_info);
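  // Removes the given node from the loop hierarchy, promoting any nested
  // inner loops one level up.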
  void RemoveLoop(LoopNode* node);

  // Traverses all loops inner to outer to perform simplifications and optimizations.
  // Returns true if loops nested inside the current loop (node) have changed.
  bool TraverseLoopsInnerToOuter(LoopNode* node);

  //
  // Optimization.
  //

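  // Removes a dead induction cycle (e.g. an otherwise unused loop counter),
  // replacing a phi that is only used after the loop by its last value.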
  void SimplifyInduction(LoopNode* node);
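  // Simplifies the blocks of the loop by removing dead instructions and
  // trivial control flow.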
  void SimplifyBlocks(LoopNode* node);

  // Performs optimizations specific to an inner loop with finite header logic (empty loop
  // removal, unrolling, vectorization). Returns true if anything changed.
  bool TryOptimizeInnerLoopFinite(LoopNode* node);

  // Performs optimizations specific to an inner loop. Returns true if anything changed.
  bool OptimizeInnerLoop(LoopNode* node);

  // Tries to apply loop unrolling for branch penalty reduction and better instruction scheduling
  // opportunities. Returns whether the transformation happened. 'generate_code' determines
  // whether the optimization should actually be applied.
  bool TryUnrollingForBranchPenaltyReduction(LoopAnalysisInfo* analysis_info,
                                             bool generate_code = true);

  // Tries to apply loop peeling to eliminate loop-invariant exits. Returns whether the
  // transformation happened. 'generate_code' determines whether the optimization should
  // actually be applied.
  bool TryPeelingForLoopInvariantExitsElimination(LoopAnalysisInfo* analysis_info,
                                                  bool generate_code = true);

  // Tries to perform whole loop unrolling for a small loop with a small trip count to eliminate
  // the loop check overhead and to expose more opportunities for inter-iteration optimizations.
  // Returns whether the transformation happened. 'generate_code' determines whether the
  // optimization should actually be applied.
  bool TryFullUnrolling(LoopAnalysisInfo* analysis_info, bool generate_code = true);

  // Tries to apply scalar loop peeling and unrolling.
  bool TryPeelingAndUnrolling(LoopNode* node);

  //
  // Vectorization analysis and synthesis.
  //

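  // Analyzes whether the body of the given inner loop can be vectorized for the
  // given trip count (instruction and data dependence checks; no code generation).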
  bool ShouldVectorize(LoopNode* node, HBasicBlock* block, int64_t trip_count);
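  // Synthesizes the vectorized version of the loop, including any scalar
  // peeling/cleanup loop.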
  void Vectorize(LoopNode* node, HBasicBlock* block, HBasicBlock* exit, int64_t trip_count);
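  // Generates a new loop, in the current synthesis mode, that runs the normalized
  // index from 'lo' to 'hi' in steps of 'step', unrolled 'unroll' times.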
  void GenerateNewLoop(LoopNode* node,
                       HBasicBlock* block,
                       HBasicBlock* new_preheader,
                       HInstruction* lo,
                       HInstruction* hi,
                       HInstruction* step,
                       uint32_t unroll);
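  // Determines whether the given definition can be vectorized; synthesizes code
  // when 'generate_code' is set.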
  bool VectorizeDef(LoopNode* node, HInstruction* instruction, bool generate_code);
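  // Determines whether the given use can be vectorized for the given packed type
  // under the given restrictions; synthesizes code when 'generate_code' is set.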
  bool VectorizeUse(LoopNode* node,
                    HInstruction* instruction,
                    bool generate_code,
                    DataType::Type type,
                    uint64_t restrictions);
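  // Target queries: the SIMD register size in bytes, whether the given packed type
  // is supported (operation restrictions are returned through 'restrictions'), and
  // whether the given number of vector lanes can be selected.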
  uint32_t GetVectorSizeInBytes();
  bool TrySetVectorType(DataType::Type type, /*out*/ uint64_t* restrictions);
  bool TrySetVectorLengthImpl(uint32_t length);

  bool TrySetVectorLength(DataType::Type type, uint32_t length) {
    bool res = TrySetVectorLengthImpl(length);
    // Currently the vectorizer supports only the mode in which full SIMD registers are used.
    DCHECK(!res || (DataType::Size(type) * length == GetVectorSizeInBytes()));
    return res;
  }

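  // Synthesis helpers that map scalar instructions to their vector (or sequential
  // cleanup) counterparts: loop invariants, subscripts, memory operations,
  // reduction phis, and generic operations.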
  void GenerateVecInv(HInstruction* org, DataType::Type type);
  void GenerateVecSub(HInstruction* org, HInstruction* offset);
  void GenerateVecMem(HInstruction* org,
                      HInstruction* opa,
                      HInstruction* opb,
                      HInstruction* offset,
                      DataType::Type type);
  void GenerateVecReductionPhi(HPhi* phi);
  void GenerateVecReductionPhiInputs(HPhi* phi, HInstruction* reduction);
  HInstruction* ReduceAndExtractIfNeeded(HInstruction* instruction);
  void GenerateVecOp(HInstruction* org,
                     HInstruction* opa,
                     HInstruction* opb,
                     DataType::Type type);

  // Vectorization idioms.
  bool VectorizeSaturationIdiom(LoopNode* node,
                                HInstruction* instruction,
                                bool generate_code,
                                DataType::Type type,
                                uint64_t restrictions);
  bool VectorizeHalvingAddIdiom(LoopNode* node,
                                HInstruction* instruction,
                                bool generate_code,
                                DataType::Type type,
                                uint64_t restrictions);
  bool VectorizeSADIdiom(LoopNode* node,
                         HInstruction* instruction,
                         bool generate_code,
                         DataType::Type type,
                         uint64_t restrictions);
  bool VectorizeDotProdIdiom(LoopNode* node,
                             HInstruction* instruction,
                             bool generate_code,
                             DataType::Type type,
                             uint64_t restrictions);

  // Vectorization heuristics.
  Alignment ComputeAlignment(HInstruction* offset,
                             DataType::Type type,
                             bool is_string_char_at,
                             uint32_t peeling = 0);
  void SetAlignmentStrategy(uint32_t peeling_votes[],
                            const ArrayReference* peeling_candidate);
  uint32_t MaxNumberPeeled();
  bool IsVectorizationProfitable(int64_t trip_count);

  //
  // Helpers.
  //

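  // Tries to match the given loop phi as an induction (its cycle is collected in
  // 'iset_') or as a reduction (recorded in 'reductions_'); returns true on success.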
  bool TrySetPhiInduction(HPhi* phi, bool restrict_uses);
  bool TrySetPhiReduction(HPhi* phi);

  // Detects a loop header with a single induction (returned in main_phi), possibly
  // other phis for reductions, but no other side effects. Returns true on success.
  bool TrySetSimpleLoopHeader(HBasicBlock* block, /*out*/ HPhi** main_phi);

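  // Queries on instruction uses in and around the loop, and replacement of an
  // instruction by its last value when it is no longer needed inside the loop.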
  bool IsEmptyBody(HBasicBlock* block);
  bool IsOnlyUsedAfterLoop(HLoopInformation* loop_info,
                           HInstruction* instruction,
                           bool collect_loop_uses,
                           /*out*/ uint32_t* use_count);
  bool IsUsedOutsideLoop(HLoopInformation* loop_info,
                         HInstruction* instruction);
  bool TryReplaceWithLastValue(HLoopInformation* loop_info,
                               HInstruction* instruction,
                               HBasicBlock* block);
  bool TryAssignLastValue(HLoopInformation* loop_info,
                          HInstruction* instruction,
                          HBasicBlock* block,
                          bool collect_loop_uses);
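  // Removes dead instructions from the given list.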
  void RemoveDeadInstructions(const HInstructionList& list);
  bool CanRemoveCycle();  // Whether the current 'iset_' is removable.

  // Compiler options (to query ISA features).
  const CompilerOptions* compiler_options_;

  // Cached target SIMD vector register size in bytes.
  const size_t simd_register_size_;

  // Range information based on prior induction variable analysis.
  InductionVarRange induction_range_;

  // Phase-local heap memory allocator for the loop optimizer. Storage obtained
  // through this allocator is immediately released when the loop optimizer is done.
  ScopedArenaAllocator* loop_allocator_;

  // Global heap memory allocator. Used to build HIR.
  ArenaAllocator* global_allocator_;

  // Entries into the loop hierarchy representation. The hierarchy resides
  // in phase-local heap memory.
  LoopNode* top_loop_;
  LoopNode* last_loop_;

  // Temporary bookkeeping of a set of instructions.
  // Contents reside in phase-local heap memory.
  ScopedArenaSet<HInstruction*>* iset_;

  // Temporary bookkeeping of reduction instructions. Mapping is two-fold:
  // (1) reductions in the loop-body are mapped back to their phi definition,
  // (2) phi definitions are mapped to their initial value (updated during
  //     code generation to feed the proper values into the new chain).
  // Contents reside in phase-local heap memory.
  ScopedArenaSafeMap<HInstruction*, HInstruction*>* reductions_;

  // Flag that tracks if any simplifications have occurred.
  bool simplified_;

  // Number of "lanes" for selected packed type.
  uint32_t vector_length_;

  // Set of array references in the vector loop.
  // Contents reside in phase-local heap memory.
  ScopedArenaSet<ArrayReference>* vector_refs_;

  // Static or dynamic loop peeling for alignment.
  uint32_t vector_static_peeling_factor_;
  const ArrayReference* vector_dynamic_peeling_candidate_;

  // Dynamic data dependence test of the form a != b.
  HInstruction* vector_runtime_test_a_;
  HInstruction* vector_runtime_test_b_;

  // Mapping used during vectorization synthesis for both the scalar peeling/cleanup
  // loop (mode is kSequential) and the actual vector loop (mode is kVector). The data
  // structure maps original instructions into the new instructions.
  // Contents reside in phase-local heap memory.
  ScopedArenaSafeMap<HInstruction*, HInstruction*>* vector_map_;

  // Permanent mapping used during vectorization synthesis.
  // Contents reside in phase-local heap memory.
  ScopedArenaSafeMap<HInstruction*, HInstruction*>* vector_permanent_map_;

  // Temporary vectorization bookkeeping.
  VectorMode vector_mode_;  // synthesis mode
  HBasicBlock* vector_preheader_;  // preheader of the new loop
  HBasicBlock* vector_header_;  // header of the new loop
  HBasicBlock* vector_body_;  // body of the new loop
  HInstruction* vector_index_;  // normalized index of the new loop

  // Helper for target-specific behaviour for loop optimizations.
  ArchNoOptsLoopHelper* arch_loop_helper_;

  friend class LoopOptimizationTest;

  DISALLOW_COPY_AND_ASSIGN(HLoopOptimization);
};

}  // namespace art

#endif  // ART_COMPILER_OPTIMIZING_LOOP_OPTIMIZATION_H_