1 /* 2 * Copyright (C) 2016 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef ART_COMPILER_OPTIMIZING_LOOP_OPTIMIZATION_H_ 18 #define ART_COMPILER_OPTIMIZING_LOOP_OPTIMIZATION_H_ 19 20 #include "base/scoped_arena_allocator.h" 21 #include "base/scoped_arena_containers.h" 22 #include "induction_var_range.h" 23 #include "loop_analysis.h" 24 #include "nodes.h" 25 #include "optimization.h" 26 #include "superblock_cloner.h" 27 28 namespace art { 29 30 class CompilerOptions; 31 class ArchNoOptsLoopHelper; 32 33 /** 34 * Loop optimizations. Builds a loop hierarchy and applies optimizations to 35 * the detected nested loops, such as removal of dead induction and empty loops 36 * and inner loop vectorization. 37 */ 38 class HLoopOptimization : public HOptimization { 39 public: 40 HLoopOptimization(HGraph* graph, 41 const CodeGenerator& codegen, // Needs info about the target. 42 HInductionVarAnalysis* induction_analysis, 43 OptimizingCompilerStats* stats, 44 const char* name = kLoopOptimizationPassName); 45 46 bool Run() override; 47 48 static constexpr const char* kLoopOptimizationPassName = "loop_optimization"; 49 50 private: 51 /** 52 * A single loop inside the loop hierarchy representation. 53 */ 54 struct LoopNode : public ArenaObject<kArenaAllocLoopOptimization> { LoopNodeLoopNode55 explicit LoopNode(HLoopInformation* lp_info) 56 : loop_info(lp_info), 57 outer(nullptr), 58 inner(nullptr), 59 previous(nullptr), 60 next(nullptr) {} 61 HLoopInformation* loop_info; 62 LoopNode* outer; 63 LoopNode* inner; 64 LoopNode* previous; 65 LoopNode* next; 66 }; 67 68 /* 69 * Vectorization restrictions (bit mask). 70 */ 71 enum VectorRestrictions { 72 kNone = 0, // no restrictions 73 kNoMul = 1 << 0, // no multiplication 74 kNoDiv = 1 << 1, // no division 75 kNoShift = 1 << 2, // no shift 76 kNoShr = 1 << 3, // no arithmetic shift right 77 kNoHiBits = 1 << 4, // "wider" operations cannot bring in higher order bits 78 kNoSignedHAdd = 1 << 5, // no signed halving add 79 kNoUnroundedHAdd = 1 << 6, // no unrounded halving add 80 kNoAbs = 1 << 7, // no absolute value 81 kNoStringCharAt = 1 << 8, // no StringCharAt 82 kNoReduction = 1 << 9, // no reduction 83 kNoSAD = 1 << 10, // no sum of absolute differences (SAD) 84 kNoWideSAD = 1 << 11, // no sum of absolute differences (SAD) with operand widening 85 kNoDotProd = 1 << 12, // no dot product 86 }; 87 88 /* 89 * Vectorization mode during synthesis 90 * (sequential peeling/cleanup loop or vector loop). 91 */ 92 enum VectorMode { 93 kSequential, 94 kVector 95 }; 96 97 /* 98 * Representation of a unit-stride array reference. 99 */ 100 struct ArrayReference { 101 ArrayReference(HInstruction* b, HInstruction* o, DataType::Type t, bool l, bool c = false) baseArrayReference102 : base(b), offset(o), type(t), lhs(l), is_string_char_at(c) { } 103 bool operator<(const ArrayReference& other) const { 104 return 105 (base < other.base) || 106 (base == other.base && 107 (offset < other.offset || (offset == other.offset && 108 (type < other.type || 109 (type == other.type && 110 (lhs < other.lhs || 111 (lhs == other.lhs && 112 is_string_char_at < other.is_string_char_at))))))); 113 } 114 HInstruction* base; // base address 115 HInstruction* offset; // offset + i 116 DataType::Type type; // component type 117 bool lhs; // def/use 118 bool is_string_char_at; // compressed string read 119 }; 120 121 // 122 // Loop setup and traversal. 123 // 124 125 bool LocalRun(); 126 void AddLoop(HLoopInformation* loop_info); 127 void RemoveLoop(LoopNode* node); 128 129 // Traverses all loops inner to outer to perform simplifications and optimizations. 130 // Returns true if loops nested inside current loop (node) have changed. 131 bool TraverseLoopsInnerToOuter(LoopNode* node); 132 133 // 134 // Optimization. 135 // 136 137 void SimplifyInduction(LoopNode* node); 138 void SimplifyBlocks(LoopNode* node); 139 140 // Performs optimizations specific to inner loop with finite header logic (empty loop removal, 141 // unrolling, vectorization). Returns true if anything changed. 142 bool TryOptimizeInnerLoopFinite(LoopNode* node); 143 144 // Performs optimizations specific to inner loop. Returns true if anything changed. 145 bool OptimizeInnerLoop(LoopNode* node); 146 147 // Tries to apply loop unrolling for branch penalty reduction and better instruction scheduling 148 // opportunities. Returns whether transformation happened. 'generate_code' determines whether the 149 // optimization should be actually applied. 150 bool TryUnrollingForBranchPenaltyReduction(LoopAnalysisInfo* analysis_info, 151 bool generate_code = true); 152 153 // Tries to apply loop peeling for loop invariant exits elimination. Returns whether 154 // transformation happened. 'generate_code' determines whether the optimization should be 155 // actually applied. 156 bool TryPeelingForLoopInvariantExitsElimination(LoopAnalysisInfo* analysis_info, 157 bool generate_code = true); 158 159 // Tries to perform whole loop unrolling for a small loop with a small trip count to eliminate 160 // the loop check overhead and to have more opportunities for inter-iteration optimizations. 161 // Returns whether transformation happened. 'generate_code' determines whether the optimization 162 // should be actually applied. 163 bool TryFullUnrolling(LoopAnalysisInfo* analysis_info, bool generate_code = true); 164 165 // Tries to apply scalar loop peeling and unrolling. 166 bool TryPeelingAndUnrolling(LoopNode* node); 167 168 // 169 // Vectorization analysis and synthesis. 170 // 171 172 bool ShouldVectorize(LoopNode* node, HBasicBlock* block, int64_t trip_count); 173 void Vectorize(LoopNode* node, HBasicBlock* block, HBasicBlock* exit, int64_t trip_count); 174 void GenerateNewLoop(LoopNode* node, 175 HBasicBlock* block, 176 HBasicBlock* new_preheader, 177 HInstruction* lo, 178 HInstruction* hi, 179 HInstruction* step, 180 uint32_t unroll); 181 bool VectorizeDef(LoopNode* node, HInstruction* instruction, bool generate_code); 182 bool VectorizeUse(LoopNode* node, 183 HInstruction* instruction, 184 bool generate_code, 185 DataType::Type type, 186 uint64_t restrictions); 187 uint32_t GetVectorSizeInBytes(); 188 bool TrySetVectorType(DataType::Type type, /*out*/ uint64_t* restrictions); 189 bool TrySetVectorLengthImpl(uint32_t length); 190 TrySetVectorLength(DataType::Type type,uint32_t length)191 bool TrySetVectorLength(DataType::Type type, uint32_t length) { 192 bool res = TrySetVectorLengthImpl(length); 193 // Currently the vectorizer supports only the mode when full SIMD registers are used. 194 DCHECK(!res || (DataType::Size(type) * length == GetVectorSizeInBytes())); 195 return res; 196 } 197 198 void GenerateVecInv(HInstruction* org, DataType::Type type); 199 void GenerateVecSub(HInstruction* org, HInstruction* offset); 200 void GenerateVecMem(HInstruction* org, 201 HInstruction* opa, 202 HInstruction* opb, 203 HInstruction* offset, 204 DataType::Type type); 205 void GenerateVecReductionPhi(HPhi* phi); 206 void GenerateVecReductionPhiInputs(HPhi* phi, HInstruction* reduction); 207 HInstruction* ReduceAndExtractIfNeeded(HInstruction* instruction); 208 void GenerateVecOp(HInstruction* org, 209 HInstruction* opa, 210 HInstruction* opb, 211 DataType::Type type); 212 213 // Vectorization idioms. 214 bool VectorizeSaturationIdiom(LoopNode* node, 215 HInstruction* instruction, 216 bool generate_code, 217 DataType::Type type, 218 uint64_t restrictions); 219 bool VectorizeHalvingAddIdiom(LoopNode* node, 220 HInstruction* instruction, 221 bool generate_code, 222 DataType::Type type, 223 uint64_t restrictions); 224 bool VectorizeSADIdiom(LoopNode* node, 225 HInstruction* instruction, 226 bool generate_code, 227 DataType::Type type, 228 uint64_t restrictions); 229 bool VectorizeDotProdIdiom(LoopNode* node, 230 HInstruction* instruction, 231 bool generate_code, 232 DataType::Type type, 233 uint64_t restrictions); 234 235 // Vectorization heuristics. 236 Alignment ComputeAlignment(HInstruction* offset, 237 DataType::Type type, 238 bool is_string_char_at, 239 uint32_t peeling = 0); 240 void SetAlignmentStrategy(uint32_t peeling_votes[], 241 const ArrayReference* peeling_candidate); 242 uint32_t MaxNumberPeeled(); 243 bool IsVectorizationProfitable(int64_t trip_count); 244 245 // 246 // Helpers. 247 // 248 249 bool TrySetPhiInduction(HPhi* phi, bool restrict_uses); 250 bool TrySetPhiReduction(HPhi* phi); 251 252 // Detects loop header with a single induction (returned in main_phi), possibly 253 // other phis for reductions, but no other side effects. Returns true on success. 254 bool TrySetSimpleLoopHeader(HBasicBlock* block, /*out*/ HPhi** main_phi); 255 256 bool IsEmptyBody(HBasicBlock* block); 257 bool IsOnlyUsedAfterLoop(HLoopInformation* loop_info, 258 HInstruction* instruction, 259 bool collect_loop_uses, 260 /*out*/ uint32_t* use_count); 261 bool IsUsedOutsideLoop(HLoopInformation* loop_info, 262 HInstruction* instruction); 263 bool TryReplaceWithLastValue(HLoopInformation* loop_info, 264 HInstruction* instruction, 265 HBasicBlock* block); 266 bool TryAssignLastValue(HLoopInformation* loop_info, 267 HInstruction* instruction, 268 HBasicBlock* block, 269 bool collect_loop_uses); 270 void RemoveDeadInstructions(const HInstructionList& list); 271 bool CanRemoveCycle(); // Whether the current 'iset_' is removable. 272 273 // Compiler options (to query ISA features). 274 const CompilerOptions* compiler_options_; 275 276 // Cached target SIMD vector register size in bytes. 277 const size_t simd_register_size_; 278 279 // Range information based on prior induction variable analysis. 280 InductionVarRange induction_range_; 281 282 // Phase-local heap memory allocator for the loop optimizer. Storage obtained 283 // through this allocator is immediately released when the loop optimizer is done. 284 ScopedArenaAllocator* loop_allocator_; 285 286 // Global heap memory allocator. Used to build HIR. 287 ArenaAllocator* global_allocator_; 288 289 // Entries into the loop hierarchy representation. The hierarchy resides 290 // in phase-local heap memory. 291 LoopNode* top_loop_; 292 LoopNode* last_loop_; 293 294 // Temporary bookkeeping of a set of instructions. 295 // Contents reside in phase-local heap memory. 296 ScopedArenaSet<HInstruction*>* iset_; 297 298 // Temporary bookkeeping of reduction instructions. Mapping is two-fold: 299 // (1) reductions in the loop-body are mapped back to their phi definition, 300 // (2) phi definitions are mapped to their initial value (updated during 301 // code generation to feed the proper values into the new chain). 302 // Contents reside in phase-local heap memory. 303 ScopedArenaSafeMap<HInstruction*, HInstruction*>* reductions_; 304 305 // Flag that tracks if any simplifications have occurred. 306 bool simplified_; 307 308 // Number of "lanes" for selected packed type. 309 uint32_t vector_length_; 310 311 // Set of array references in the vector loop. 312 // Contents reside in phase-local heap memory. 313 ScopedArenaSet<ArrayReference>* vector_refs_; 314 315 // Static or dynamic loop peeling for alignment. 316 uint32_t vector_static_peeling_factor_; 317 const ArrayReference* vector_dynamic_peeling_candidate_; 318 319 // Dynamic data dependence test of the form a != b. 320 HInstruction* vector_runtime_test_a_; 321 HInstruction* vector_runtime_test_b_; 322 323 // Mapping used during vectorization synthesis for both the scalar peeling/cleanup 324 // loop (mode is kSequential) and the actual vector loop (mode is kVector). The data 325 // structure maps original instructions into the new instructions. 326 // Contents reside in phase-local heap memory. 327 ScopedArenaSafeMap<HInstruction*, HInstruction*>* vector_map_; 328 329 // Permanent mapping used during vectorization synthesis. 330 // Contents reside in phase-local heap memory. 331 ScopedArenaSafeMap<HInstruction*, HInstruction*>* vector_permanent_map_; 332 333 // Temporary vectorization bookkeeping. 334 VectorMode vector_mode_; // synthesis mode 335 HBasicBlock* vector_preheader_; // preheader of the new loop 336 HBasicBlock* vector_header_; // header of the new loop 337 HBasicBlock* vector_body_; // body of the new loop 338 HInstruction* vector_index_; // normalized index of the new loop 339 340 // Helper for target-specific behaviour for loop optimizations. 341 ArchNoOptsLoopHelper* arch_loop_helper_; 342 343 friend class LoopOptimizationTest; 344 345 DISALLOW_COPY_AND_ASSIGN(HLoopOptimization); 346 }; 347 348 } // namespace art 349 350 #endif // ART_COMPILER_OPTIMIZING_LOOP_OPTIMIZATION_H_ 351