1 /*
2  * Copyright (C) 2017 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #define LOG_TAG "ExecutionPlan"
18 
19 #include "ExecutionPlan.h"
20 
21 #include <fcntl.h>
22 #include <openssl/sha.h>
23 #include <sys/stat.h>
24 #include <sys/types.h>
25 
26 #include <algorithm>
27 #include <functional>
28 #include <map>
29 #include <memory>
30 #include <mutex>
31 #include <queue>
32 #include <set>
33 #include <string>
34 #include <type_traits>
35 #include <unordered_set>
36 #include <utility>
37 #include <vector>
38 
39 #include "BurstBuilder.h"
40 #include "Callbacks.h"
41 #include "CompilationBuilder.h"
42 #include "ControlFlow.h"
43 #include "CpuExecutor.h"
44 #include "ExecutionBuilder.h"
45 #include "ExecutionBurstController.h"
46 #include "GraphDump.h"
47 #include "Manager.h"
48 #include "MetaModel.h"
49 #include "ModelBuilder.h"
50 #include "OperationsUtils.h"
51 #include "TokenHasher.h"
52 #include "Tracing.h"
53 #include "TypeManager.h"
54 #include "Utils.h"
55 
56 namespace android {
57 namespace nn {
58 
59 namespace {
60 
61 using namespace hal;
62 
63 // The index of the main model in SourceModels.
64 constexpr uint32_t kMainModelInSourceModels = 0;
65 
66 // Compiles the model on device.
67 // If compilation caching is available, depending on ExecutionPlan::mState, the token may only have
68 // been initialized by the user provided token (SIMPLE body), or is already re-hashed by the
69 // operation indices to be executed (COMPOUND body). The token will be re-hashed further by the
70 // device name, device version string, and the execution preference in this function.
compile(const Device & device,const ModelBuilder & model,int executionPreference,int compilationPriority,const std::optional<Deadline> & deadline,const std::string & cacheDir,TokenHasher * token,std::shared_ptr<PreparedModel> * preparedModel)71 int compile(const Device& device, const ModelBuilder& model, int executionPreference,
72             int compilationPriority, const std::optional<Deadline>& deadline,
73             const std::string& cacheDir, TokenHasher* token,
74             std::shared_ptr<PreparedModel>* preparedModel) {
75     CHECK(token != nullptr);
76     CHECK(preparedModel != nullptr);
77     *preparedModel = nullptr;
78 
79     std::optional<CacheToken> cacheToken;
80     if (device.isCachingSupported() && token->ok() &&
81         token->updateFromString(device.getName().c_str()) &&
82         token->updateFromString(device.getVersionString().c_str()) &&
83         token->update(&executionPreference, sizeof(executionPreference)) &&
84         token->update(&compilationPriority, sizeof(compilationPriority)) && token->finish()) {
85         cacheToken.emplace(token->getCacheToken());
86     }
87 
88     const ModelFactory makeModel = [&model] { return model.makeHidlModel(); };
89     const ExecutionPreference preference = static_cast<ExecutionPreference>(executionPreference);
90     const Priority priority = convertToHalPriority(compilationPriority);
91     const auto [n, returnedPreparedModel] =
92             device.prepareModel(makeModel, preference, priority, deadline, cacheDir, cacheToken);
93     *preparedModel = returnedPreparedModel;
94     return n;
95 }
96 
97 typedef std::function<void(uint32_t)> OperationReadyCallback;
98 
copyOperandExtraParams(ModelBuilder & model,uint32_t toOperandIndex,const Operand & fromOperand)99 int copyOperandExtraParams(ModelBuilder& model, uint32_t toOperandIndex,
100                            const Operand& fromOperand) {
101     if (fromOperand.type == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL &&
102         fromOperand.extraParams.getDiscriminator() ==
103                 OperandExtraParams::hidl_discriminator::channelQuant) {
104         auto& fromChannelQuant = fromOperand.extraParams.channelQuant();
105         ANeuralNetworksSymmPerChannelQuantParams toChannelQuant = {
106                 .channelDim = fromChannelQuant.channelDim,
107                 .scaleCount = static_cast<uint32_t>(fromChannelQuant.scales.size()),
108                 .scales = fromChannelQuant.scales.data(),
109         };
110         return model.setOperandSymmPerChannelQuantParams(toOperandIndex, toChannelQuant);
111     } else if (isExtensionOperandType(fromOperand.type) &&
112                fromOperand.extraParams.getDiscriminator() ==
113                        OperandExtraParams::hidl_discriminator::extension) {
114         hidl_vec<uint8_t> extensionData = fromOperand.extraParams.extension();
115         return model.setOperandExtensionData(toOperandIndex, extensionData.data(),
116                                              extensionData.size());
117     } else if (fromOperand.extraParams.getDiscriminator() !=
118                        OperandExtraParams::hidl_discriminator::none ||
119                fromOperand.type == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL) {
120         LOG(ERROR) << "Type " << toString(fromOperand.type)
121                    << " has an unexpected extraParams discriminator: "
122                    << static_cast<int>(fromOperand.extraParams.getDiscriminator());
123         return ANEURALNETWORKS_BAD_DATA;
124     } else {
125         return ANEURALNETWORKS_NO_ERROR;
126     }
127 }
128 
// This class tracks whether we know the value of an operand as operations
// are processed.
class OperandTracker {
   public:
    // Creates the tracker for this model. Figures out which operations can be
    // executed right away (those with no inputs of initially-unknown value)
    // and calls cb for each one of them.
    OperandTracker(const ModelBuilder* model, OperationReadyCallback cb);
    // Mark the specified operation as having been processed. The output
    // of the operation now being known, this may make new operations to be
    // able to run.  Call cb for each one of them.
    void markProcessed(uint32_t operationIndex, OperationReadyCallback cb);

   private:
    const ModelBuilder* mModel;
    // Maps an operand whose value is initially unknown (a temporary or a
    // subgraph output) to the operations that consume it as an input.
    std::multimap<uint32_t, uint32_t> mOperandToOperations;
    // For each operation, how many of its inputs are still of unknown value.
    std::vector<uint32_t> mUnknownInputCount;
};
146 
OperandTracker(const ModelBuilder * model,OperationReadyCallback cb)147 OperandTracker::OperandTracker(const ModelBuilder* model, OperationReadyCallback cb)
148     : mModel(model) {
149     const auto& operations = mModel->getOperations();
150     mUnknownInputCount.resize(operations.size());
151     for (uint32_t operationIndex = 0; operationIndex < operations.size(); operationIndex++) {
152         const Operation& operation = operations[operationIndex];
153         uint32_t count = 0;
154         for (uint32_t operandIndex : operation.inputs) {
155             auto lifetime = mModel->getOperand(operandIndex).lifetime;
156             if (lifetime == OperandLifeTime::TEMPORARY_VARIABLE ||
157                 lifetime == OperandLifeTime::SUBGRAPH_OUTPUT) {
158                 count++;
159                 mOperandToOperations.emplace(operandIndex, operationIndex);
160             }
161         }
162         if (count == 0) {
163             cb(operationIndex);
164         }
165         mUnknownInputCount[operationIndex] = count;
166     }
167 }
168 
markProcessed(uint32_t operationIndex,OperationReadyCallback cb)169 void OperandTracker::markProcessed(uint32_t operationIndex, OperationReadyCallback cb) {
170     // Mark all its outputs as known.
171     const Operation& operation = mModel->getOperations()[operationIndex];
172     for (uint32_t operandIndex : operation.outputs) {
173         auto range = mOperandToOperations.equal_range(operandIndex);
174         for (auto i = range.first; i != range.second; i++) {
175             uint32_t& count = mUnknownInputCount[i->second];
176             if (--count == 0) {
177                 cb(i->second);
178             }
179         }
180     }
181 }
182 
183 }  // namespace
184 
// Creates a step that will run one partition of the source model at
// sourceModelIndex on the given device. The step's caching token starts as a
// copy of the plan-level token and is further specialized by addOperation().
ExecutionStep::ExecutionStep(ExecutionPlan* plan, uint32_t stepIndex, uint32_t sourceModelIndex,
                             std::shared_ptr<Device> device)
    : mPlan(plan),
      mIndex(stepIndex),
      mSourceModelIndex(sourceModelIndex),
      mStepModel(),
      mDevice(device),
      mToken(plan->getCacheToken()) {}
193 
// Adds an operand if it has not been added already.
// Sets the index in the step model for the corresponding operand.
//
// The operand's lifetime determines the bookkeeping performed:
// - constants and NO_VALUE operands get their values set in the step model;
// - temporaries and subgraph outputs first seen as an INPUT must be produced
//   by a different partition and are recorded as step model inputs;
// - temporaries first seen as an OUTPUT are registered with the plan as being
//   defined by this step;
// - SUBGRAPH operands have the referenced model attached.
int ExecutionStep::addOperand(uint32_t sourceOperandIndex, uint32_t* stepOperandIndex,
                              OperandKind kind) {
    // Have we added this operand already?
    auto i = mOperandMap.find(sourceOperandIndex);
    if (i != mOperandMap.end()) {
        // Operations are added in topological order, so an operand we have
        // already seen can only be revisited as an input, never an output.
        CHECK(kind == INPUT);
        *stepOperandIndex = i->second;
        return ANEURALNETWORKS_NO_ERROR;
    }

    // First time we add this operand.
    *stepOperandIndex = mStepModel.operandCount();
    mOperandMap.emplace(sourceOperandIndex, *stepOperandIndex);

    // Add the operand to the step model.
    const ModelBuilder& sourceModel = *getSourceModel();
    const Operand& operand = sourceModel.getOperand(sourceOperandIndex);
    ANeuralNetworksOperandType type = {
            .type = static_cast<int32_t>(operand.type),
            .dimensionCount = static_cast<uint32_t>(operand.dimensions.size()),
            .dimensions = operand.dimensions.size() > 0 ? operand.dimensions.data() : nullptr,
            .scale = operand.scale,
            .zeroPoint = operand.zeroPoint,
    };

    int n = mStepModel.addOperand(type);
    if (n != ANEURALNETWORKS_NO_ERROR) {
        LOG(ERROR) << "Previous error occurred when partitioning the graph";
        return n;
    }

    // Extra parameters (per-channel quant / extension data), if any.
    n = copyOperandExtraParams(mStepModel, *stepOperandIndex, operand);
    if (n != ANEURALNETWORKS_NO_ERROR) {
        LOG(ERROR) << "Error when copying extra parameters to the operand";
        return n;
    }

    // Sets its value.
    switch (operand.lifetime) {
        case OperandLifeTime::CONSTANT_COPY: {
            const uint8_t* data = sourceModel.getPointerToOperandValue(operand.location.offset);
            n = mStepModel.setOperandValue(*stepOperandIndex, data, operand.location.length);
            if (n != ANEURALNETWORKS_NO_ERROR) {
                LOG(ERROR) << "Previous error occurred when partitioning the graph";
                return n;
            }
        } break;
        case OperandLifeTime::CONSTANT_REFERENCE: {
            const Memory* memory = sourceModel.getMemories()[operand.location.poolIndex];
            n = mStepModel.setOperandValueFromMemory(
                    *stepOperandIndex, memory, operand.location.offset, operand.location.length);
            if (n != ANEURALNETWORKS_NO_ERROR) {
                LOG(ERROR) << "Previous error occurred when partitioning the graph";
                return n;
            }
        } break;
        case OperandLifeTime::NO_VALUE: {
            n = mStepModel.setOperandValue(*stepOperandIndex, nullptr, 0);
            if (n != ANEURALNETWORKS_NO_ERROR) {
                LOG(ERROR) << "Previous error occurred when partitioning the graph";
                return n;
            }
        } break;
        case OperandLifeTime::TEMPORARY_VARIABLE: {  // handled similarly to SUBGRAPH_OUTPUT
            if (kind == INPUT) {
                // The first time we've seen this operand is as an
                // input.  That means it must be defined by a
                // different partition, and is an input to this one.
                mTempsAsStepModelInputs.emplace_back(sourceOperandIndex, *stepOperandIndex);
            } else {
                // The first time we've seen this operand is as an
                // output.  It may be an input to a different
                // partition, so keep track of it.
                mPlan->recordTemporaryDef(SourceOperandIndex(mSourceModelIndex, sourceOperandIndex),
                                          mIndex);
            }
        } break;
        case OperandLifeTime::SUBGRAPH_INPUT: {
            mModelInputs.emplace_back(sourceOperandIndex, *stepOperandIndex);
        } break;
        case OperandLifeTime::SUBGRAPH_OUTPUT: {  // handled similarly to TEMPORARY_VARIABLE
            if (kind == INPUT) {
                // The first time we've seen this operand is as an
                // input.  That means it must be defined by a
                // different partition, and is an input to this one.
                mOutputsAsStepModelInputs.emplace_back(sourceOperandIndex, *stepOperandIndex);
            } else {
                // The first time we've seen this operand is as an
                // output.
                mModelOutputs.emplace_back(sourceOperandIndex, *stepOperandIndex);
            }
        } break;
        case OperandLifeTime::SUBGRAPH: {
            // Reference to another model (used by control flow operations).
            const ModelBuilder* model = sourceModel.getReferencedModel(operand);
            n = mStepModel.setOperandValueFromModel(*stepOperandIndex, model);
            if (n != ANEURALNETWORKS_NO_ERROR) {
                LOG(ERROR) << "Previous error occurred when partitioning the graph";
                return n;
            }
        } break;
        default: {
            CHECK(!"unexpected");
        } break;
    }

    return ANEURALNETWORKS_NO_ERROR;
}
303 
addOperation(int operationIndex)304 int ExecutionStep::addOperation(int operationIndex) {
305     const Operation& operation = getSourceModel()->getOperation(operationIndex);
306     if (mToken.ok()) {
307         mToken.update(&mSourceModelIndex, sizeof(mSourceModelIndex));
308         mToken.update(&operationIndex, sizeof(operationIndex));
309     }
310 
311     // Convert the input and output operand indexes.
312     //
313     // We expect operations to be added in topological order.  Therefore:
314     //
315     // - We may not have seen an input if it is a model input, a
316     //   constant, or an operand written by a different partition.
317     //
318     // - We should not have seen any outputs.
319     auto addOperands = [this](const hidl_vec<uint32_t>& sourceModelOperands,
320                               std::vector<uint32_t>* stepModelOperands, OperandKind kind) -> int {
321         const uint32_t operandCount = static_cast<uint32_t>(sourceModelOperands.size());
322         for (uint32_t i = 0; i < operandCount; i++) {
323             NN_RETURN_IF_ERROR(addOperand(sourceModelOperands[i], &stepModelOperands->at(i), kind));
324         }
325         return ANEURALNETWORKS_NO_ERROR;
326     };
327 
328     const uint32_t inputCount = static_cast<uint32_t>(operation.inputs.size());
329     const uint32_t outputCount = static_cast<uint32_t>(operation.outputs.size());
330     std::vector<uint32_t> inputs(inputCount);
331     std::vector<uint32_t> outputs(outputCount);
332     NN_RETURN_IF_ERROR(addOperands(operation.inputs, &inputs, INPUT));
333     NN_RETURN_IF_ERROR(addOperands(operation.outputs, &outputs, OUTPUT));
334     return mStepModel.addOperation(static_cast<uint32_t>(operation.type), inputCount, inputs.data(),
335                                    outputCount, outputs.data());
336 }
337 
// Binds every input and output of this step's model on the given executor.
//
// Each step input is resolved in this order:
// 1. an inter-partition temporary, read at an offset into temporaryMemory;
// 2. an input of the main model (via sourceOperandToInputIndex);
// 3. an output of the main model that this partition consumes;
// 4. a constant partition boundary operand (e.g. an IF branch model input or
//    a WHILE variable initializer), read from its constant memory location.
// Each step output is either an inter-partition temporary (written into
// temporaryMemory) or an output of the main model.
// An operand that matches none of these is a plan construction bug (CHECK).
void ExecutionStep::mapInputsAndOutputs(
        std::shared_ptr<StepExecutor> executor, const Memory* temporaryMemory,
        const std::map<SourceOperandIndex, uint32_t>& sourceOperandToOffsetOfTemporary,
        const std::map<SourceOperandIndex, uint32_t>& sourceOperandToInputIndex,
        const std::map<SourceOperandIndex, uint32_t>& sourceOperandToOutputIndex,
        const std::map<SourceOperandIndex, ConstantReferenceLocation>&
                sourceOperandToConstantReference) const {
    auto mapInput = [&](uint32_t stepModelOperandIndex, uint32_t stepInputIndex) {
        SourceOperandIndex sourceOperandIndex(mSourceModelIndex, stepModelOperandIndex);
        if (auto it = sourceOperandToOffsetOfTemporary.find(sourceOperandIndex);
            it != sourceOperandToOffsetOfTemporary.end()) {
            executor->setInputFromMemory(stepInputIndex, temporaryMemory, it->second);
        } else if (auto it = sourceOperandToInputIndex.find(sourceOperandIndex);
                   it != sourceOperandToInputIndex.end()) {
            executor->mapInput(it->second, stepInputIndex);
        } else if (auto it = sourceOperandToOutputIndex.find(sourceOperandIndex);
                   it != sourceOperandToOutputIndex.end()) {
            executor->mapOutputToInput(it->second, stepInputIndex);
        } else if (auto it = sourceOperandToConstantReference.find(sourceOperandIndex);
                   it != sourceOperandToConstantReference.end()) {
            // Constant partition boundary operand. This could be an IF branch
            // model input or a WHILE variable initializer.
            executor->setInputFromMemory(stepInputIndex, it->second.memory, it->second.offset);
        } else {
            CHECK(false) << "Cannot map step input " << stepInputIndex << " from operand "
                         << toString(sourceOperandIndex);
        }
    };
    auto mapOutput = [&](uint32_t stepModelOperandIndex, uint32_t stepOutputIndex) {
        SourceOperandIndex sourceOperandIndex(mSourceModelIndex, stepModelOperandIndex);
        if (auto it = sourceOperandToOffsetOfTemporary.find(sourceOperandIndex);
            it != sourceOperandToOffsetOfTemporary.end()) {
            executor->setOutputFromMemory(stepOutputIndex, temporaryMemory, it->second);
        } else if (auto it = sourceOperandToOutputIndex.find(sourceOperandIndex);
                   it != sourceOperandToOutputIndex.end()) {
            executor->mapOutput(it->second, stepOutputIndex);
        } else {
            CHECK(false) << "Cannot map step output " << stepOutputIndex << " from operand "
                         << toString(sourceOperandIndex);
        }
    };
    // mStepModelInputs/mStepModelOutputs hold (source operand index, step
    // operand index) pairs; the position in the vector is the step model
    // input/output index.
    for (uint32_t i = 0, n = mStepModelInputs.size(); i < n; ++i) {
        mapInput(mStepModelInputs[i].first, i);
    }
    for (uint32_t i = 0, n = mStepModelOutputs.size(); i < n; ++i) {
        mapOutput(mStepModelOutputs[i].first, i);
    }
}
386 
findTempsAsStepModelOutputs()387 void ExecutionPlan::CompoundBody::findTempsAsStepModelOutputs() {
388     auto recordAsOutputIfTemporary = [this](const SourceOperandIndex& sourceOperandIndex) {
389         const auto it = mTemporaryToDefiningExecutionStep.find(sourceOperandIndex);
390         if (it == mTemporaryToDefiningExecutionStep.end()) {
391             // The operand is not a temporary or is not defined by an
392             // ExecutionStep (i.e. it's an output of an IF or a WHILE).
393             // The latter case is handled by ExecutionPlan::makeController().
394             return;
395         }
396         uint32_t stepIndex = it->second;
397         CHECK_LT(stepIndex, mSteps.size());
398         mSteps[stepIndex]->executionStep()->recordTempAsStepModelOutput(sourceOperandIndex.second);
399     };
400     for (const auto& logicalStep : mSteps) {
401         if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
402             for (const auto& input : step->getTempsAsStepModelInputs()) {
403                 SourceOperandIndex sourceOperandIndex(step->getSourceModelIndex(), input.first);
404                 recordAsOutputIfTemporary(sourceOperandIndex);
405             }
406         } else if (const IfStep* step = logicalStep->tryIfStep()) {
407             recordAsOutputIfTemporary(step->conditionOperandIndex);
408             for (const SourceOperandIndex& sourceOperandIndex : step->outerInputOperands) {
409                 recordAsOutputIfTemporary(sourceOperandIndex);
410             }
411         } else if (const WhileStep* step = logicalStep->tryWhileStep()) {
412             for (const SourceOperandIndex& sourceOperandIndex : step->outerInputOperands) {
413                 recordAsOutputIfTemporary(sourceOperandIndex);
414             }
415         } else {
416             CHECK(logicalStep->isGoto());
417         }
418     }
419 }
420 
// Promotes a temporary operand defined by this step to an output of the step
// model so that other partitions can consume its value.
//
// NOTE(review): despite the parameter name, the index passed in is looked up
// in mOperandMap, which is keyed by *source* model operand index (see
// addOperand()); the corresponding step model index is it->second. The caller
// (findTempsAsStepModelOutputs) indeed passes sourceOperandIndex.second.
void ExecutionStep::recordTempAsStepModelOutput(uint32_t stepOperandIndex) {
    const auto it = mOperandMap.find(stepOperandIndex);
    CHECK(it != mOperandMap.end());  // Must already have been added to this step.
    mTempsAsStepModelOutputs.emplace(stepOperandIndex, it->second);
}
426 
getSourceModel() const427 const ModelBuilder* ExecutionStep::getSourceModel() const {
428     return mPlan->getSourceModels().getModel(mSourceModelIndex);
429 }
430 
logStepModel() const431 void ExecutionStep::logStepModel() const {
432     VLOG(COMPILATION) << "ExecutionStep::finishStepModel, step " << mIndex;
433 
434     auto logRemapEntry = [](std::string& toLog, const std::pair<uint32_t, uint32_t>& e) {
435         if (!toLog.empty()) {
436             toLog += ", ";
437         }
438         toLog += toString(e.first);
439         toLog += "->";
440         toLog += toString(e.second);
441     };
442 
443     auto logRemapVector = [&logRemapEntry](const char* name, const RemapVectorType& map) {
444         std::string toLog;
445         for (const auto& e : map) {
446             logRemapEntry(toLog, e);
447         }
448         VLOG(COMPILATION) << name << ": " << toLog;
449     };
450     auto logRemapSet = [&logRemapEntry](const char* name, const StepModelOutputSetType& set) {
451         std::string toLog;
452         for (const auto& e : set) {
453             logRemapEntry(toLog, e);
454         }
455         VLOG(COMPILATION) << name << ": " << toLog;
456     };
457 
458     logRemapVector("step model inputs", mStepModelInputs);
459     logRemapVector("step model outputs", mStepModelOutputs);
460     logRemapVector("model inputs", mModelInputs);
461     logRemapVector("model outputs", mModelOutputs);
462     logRemapVector("temps as step model inputs", mTempsAsStepModelInputs);
463     logRemapSet("temps as step model outputs", mTempsAsStepModelOutputs);
464     logRemapVector("outputs as step model inputs", mOutputsAsStepModelInputs);
465 }
466 
hasUnknownSize(const Operand & operand)467 static bool hasUnknownSize(const Operand& operand) {
468     if (operand.dimensions.size() == 0) {
469         return TypeManager::get()->isTensorType(operand.type);
470     }
471     for (uint32_t dimension : operand.dimensions) {
472         if (dimension == 0) {
473             return true;
474         }
475     }
476     return false;
477 }
478 
// Finalizes this step's model and compiles it on mDevice.
//
// Phases:
// 1. flag whether any inter-partition temporary output has unknown size;
// 2. propagate the float32->float16 relaxation setting from the main model;
// 3. assemble the final ordered input/output lists (model inputs first, then
//    temporaries, then main-model outputs consumed as inputs);
// 4. for steps of the main model, compute the step-index -> main-model-index
//    mapping tables used at execution time;
// 5. identify the step model's inputs/outputs, finish it, and compile.
//
// On success, mPreparedStepModel holds the compiled artifact. Returns an
// ANEURALNETWORKS_* status code.
int ExecutionStep::finishStepModel(const ModelBuilder* mainModel, bool* hasOutputOfUnknownSize,
                                   int32_t executionPreference, int32_t priority) {
    CHECK(mDevice != nullptr);

    // Phase 1: detect unknown-size temporaries exposed to other partitions.
    for (const auto& stepModelOutput : mTempsAsStepModelOutputs) {
        const Operand& operand = mStepModel.getOperand(stepModelOutput.second);
        if (hasUnknownSize(operand)) {
            *hasOutputOfUnknownSize = true;
            VLOG(COMPILATION) << "StepModelOutput (operand#" << toString(stepModelOutput.first)
                              << " of source graph) has unknown size: " << toString(operand);
        }
    }

    // Phase 2: keep the relaxation setting consistent with the main model.
    mStepModel.relaxComputationFloat32toFloat16(mainModel->isComputationFloat32RelaxedToFloat16());

    // Phase 3: the ordering here is relied upon by the index mapping tables
    // built below (mModelInputs first, mModelOutputs first).
    mStepModelInputs.insert(mStepModelInputs.end(), mModelInputs.begin(), mModelInputs.end());
    mStepModelInputs.insert(mStepModelInputs.end(), mTempsAsStepModelInputs.begin(),
                            mTempsAsStepModelInputs.end());
    mStepModelInputs.insert(mStepModelInputs.end(), mOutputsAsStepModelInputs.begin(),
                            mOutputsAsStepModelInputs.end());

    mStepModelOutputs.insert(mStepModelOutputs.end(), mModelOutputs.begin(), mModelOutputs.end());
    mStepModelOutputs.insert(mStepModelOutputs.end(), mTempsAsStepModelOutputs.begin(),
                             mTempsAsStepModelOutputs.end());

    // Phase 4: only steps partitioned out of the main model need mappings to
    // the main model's input/output indexes.
    if (mSourceModelIndex == kMainModelInSourceModels) {
        std::map<uint32_t, uint32_t> mainModelOperandToInputIndex;
        for (uint32_t i = 0, n = mainModel->inputCount(); i < n; ++i) {
            mainModelOperandToInputIndex[mainModel->getInputOperandIndex(i)] = i;
        }
        std::map<uint32_t, uint32_t> mainModelOperandToOutputIndex;
        for (uint32_t i = 0, n = mainModel->outputCount(); i < n; ++i) {
            mainModelOperandToOutputIndex[mainModel->getOutputOperandIndex(i)] = i;
        }

        // mInputIndexStepModelToMainModel is ordered by step model input index and relies on
        // mModelInputs being the first inputs, as specified by mStepModelInputs.
        mInputIndexStepModelToMainModel.resize(mModelInputs.size());
        std::transform(mModelInputs.begin(), mModelInputs.end(),
                       mInputIndexStepModelToMainModel.begin(),
                       [&mainModelOperandToInputIndex](auto& e) {
                           uint32_t sourceOperandIndex = e.first;
                           return mainModelOperandToInputIndex[sourceOperandIndex];
                       });

        // mOutputIndexStepModelToMainModel is ordered by step model output index and relies on
        // mModelOutputs being the first outputs, as specified by mStepModelOutputs.
        mOutputIndexStepModelToMainModel.resize(mModelOutputs.size());
        std::transform(mModelOutputs.begin(), mModelOutputs.end(),
                       mOutputIndexStepModelToMainModel.begin(),
                       [&mainModelOperandToOutputIndex](auto& e) {
                           uint32_t sourceOperandIndex = e.first;
                           return mainModelOperandToOutputIndex[sourceOperandIndex];
                       });

        // mOutputsAsStepModelInputsIndexToMainModel is ordered by step model input index and relies
        // on mOutputsAsStepModelInputs being the first outputs.
        mOutputsAsStepModelInputsIndexToMainModel.resize(mOutputsAsStepModelInputs.size());
        std::transform(mOutputsAsStepModelInputs.begin(), mOutputsAsStepModelInputs.end(),
                       mOutputsAsStepModelInputsIndexToMainModel.begin(),
                       [&mainModelOperandToOutputIndex](auto& e) {
                           uint32_t sourceOperandIndex = e.first;
                           return mainModelOperandToOutputIndex[sourceOperandIndex];
                       });
    }

    if (VLOG_IS_ON(COMPILATION)) {
        logStepModel();
    }

    // Phase 5: declare inputs/outputs (by step model operand index), finish,
    // and compile the step model on the device.
    std::vector<uint32_t> inputs(mStepModelInputs.size());
    std::vector<uint32_t> outputs(mStepModelOutputs.size());
    std::transform(mStepModelInputs.begin(), mStepModelInputs.end(), inputs.begin(),
                   [](auto& e) { return e.second; });
    std::transform(mStepModelOutputs.begin(), mStepModelOutputs.end(), outputs.begin(),
                   [](auto& e) { return e.second; });
    NN_RETURN_IF_ERROR(mStepModel.identifyInputsAndOutputs(inputs.size(), inputs.data(),
                                                           outputs.size(), outputs.data()));
    // TODO: Model::finish() should use ValidationMode::RUNTIME when sending the
    // step model to CpuDevice. Right now, this is harmless because the only
    // difference in validation occurs with control flow operations and inputs
    // or outputs of unknown size and we never send control flow operations to
    // CpuDevice. We need to address this if this behavior changes (b/151634976).
    NN_RETURN_IF_ERROR(mStepModel.finish());

    // TODO: Move compilation elsewhere?
    VLOG(COMPILATION) << "ExecutionStep::finishStepModel, compilation on " << mDevice->getName();
    return compile(*mDevice, mStepModel, executionPreference, priority, {}, *mPlan->getCacheDir(),
                   &mToken, &mPreparedStepModel);
}
569 
dump() const570 void ExecutionStep::dump() const {
571     if (VLOG_IS_ON(COMPILATION)) {
572         VLOG(COMPILATION) << "Step#" << mIndex << ": execute on " << mDevice->getName();
573         logModelToInfo(mStepModel.makeHidlModel());
574     }
575 }
576 
toString(const IfStep & step)577 std::string toString(const IfStep& step) {
578     std::ostringstream oss;
579     oss << "Step#" << step.index << ": if " << toString(step.conditionOperandIndex)
580         << " then=" << step.thenStepIndex << " else=" << step.elseStepIndex;
581     return oss.str();
582 }
583 
toString(const WhileStep & step)584 std::string toString(const WhileStep& step) {
585     std::ostringstream oss;
586     oss << "Step#" << step.index << ": while cond=" << step.condStepIndex
587         << " body=" << step.bodyStepIndex << " exit=" << step.exitStepIndex;
588     return oss.str();
589 }
590 
toString(const GotoStep & step)591 std::string toString(const GotoStep& step) {
592     std::ostringstream oss;
593     oss << "Step#" << step.index << ": goto " << step.gotoStepIndex;
594     return oss.str();
595 }
596 
dump() const597 void LogicalStep::dump() const {
598     if (VLOG_IS_ON(COMPILATION)) {
599         if (const IfStep* step = tryIfStep()) {
600             VLOG(COMPILATION) << toString(*step);
601         } else if (const WhileStep* step = tryWhileStep()) {
602             VLOG(COMPILATION) << toString(*step);
603         } else if (const GotoStep* step = tryGotoStep()) {
604             VLOG(COMPILATION) << toString(*step);
605         } else {
606             executionStep()->dump();
607         }
608     }
609 }
610 
// Finishes compilation of a partitioned (COMPOUND) plan:
// - completes and compiles each ExecutionStep's step model on its device,
// - checks that all control-flow (IF/WHILE) boundary operands have static
//   shapes, since the partitioner does not support dynamic temporaries,
// - builds the maps from main-model input/output source operands to their
//   argument indices, and records control-flow boundary constants.
// Returns an ANEURALNETWORKS_* result code.
int ExecutionPlan::CompoundBody::finish(const SourceModels* sourceModels,
                                        int32_t executionPreference, int32_t priority,
                                        const std::optional<Deadline>& deadline) {
    CHECK(!mSuccessfulFinish);
    // Compilation deadlines are not supported for compound bodies.
    CHECK(!deadline.has_value());
    const ModelBuilder* mainModel = sourceModels->getModel(kMainModelInSourceModels);

    // Returns true if any of the given source operands has an unknown size.
    auto containsUnknownSize = [sourceModels](const std::vector<SourceOperandIndex>& operands) {
        for (const auto& sourceOperandIndex : operands) {
            const ModelBuilder* sourceModel = sourceModels->getModel(sourceOperandIndex.first);
            const Operand& operand = sourceModel->getOperand(sourceOperandIndex.second);
            if (hasUnknownSize(operand)) {
                return true;
            }
        }
        return false;
    };

    findTempsAsStepModelOutputs();
    for (const auto& logicalStep : mSteps) {
        if (ExecutionStep* step = logicalStep->tryExecutionStep()) {
            int n = step->finishStepModel(mainModel, &mHasStepModelOutputOfUnknownSize,
                                          executionPreference, priority);
            if (n != ANEURALNETWORKS_NO_ERROR) {
                VLOG(COMPILATION)
                        << "ExecutionPlan::CompoundBody::finish -- finishStepModel failed";
                return n;
            }
        } else if (IfStep* step = logicalStep->tryIfStep()) {
            // The partitioner does not support dynamic temporaries (b/132458982).
            CHECK(!containsUnknownSize(step->outerInputOperands));
            CHECK(!containsUnknownSize(step->outerOutputOperands));
            // step->conditionOperandIndex has a static shape. See b/158557728.
            CHECK(!containsUnknownSize(step->thenBranchInputOperands));
            CHECK(!containsUnknownSize(step->thenBranchOutputOperands));
            CHECK(!containsUnknownSize(step->elseBranchInputOperands));
            CHECK(!containsUnknownSize(step->elseBranchOutputOperands));
        } else if (WhileStep* step = logicalStep->tryWhileStep()) {
            // The partitioner does not support dynamic temporaries (b/132458982).
            CHECK(!containsUnknownSize(step->outerInputOperands));
            CHECK(!containsUnknownSize(step->outerOutputOperands));
            CHECK(!containsUnknownSize(step->condInputOperands));
            // step->condOutputOperand has a static shape. See b/158557728.
            CHECK(!containsUnknownSize(step->bodyInputOperands));
            CHECK(!containsUnknownSize(step->bodyOutputOperands));
        } else {
            CHECK(logicalStep->isGoto());
        }
    }
    if (mHasStepModelOutputOfUnknownSize) {
        VLOG(COMPILATION)
                << "ExecutionPlan::CompoundBody::finish -- mHasStepModelOutputOfUnknownSize";
        return ANEURALNETWORKS_OP_FAILED;
    }

    // Map main-model inputs/outputs to their argument indices so that
    // executions can later locate client-provided buffers for boundary
    // operands (see ExecutionPlan::getBuffer()).
    for (uint32_t i = 0, n = mainModel->inputCount(); i < n; ++i) {
        SourceOperandIndex index(kMainModelInSourceModels, mainModel->getInputOperandIndex(i));
        mSourceOperandToInputIndex[index] = i;
    }
    for (uint32_t i = 0, n = mainModel->outputCount(); i < n; ++i) {
        SourceOperandIndex index(kMainModelInSourceModels, mainModel->getOutputOperandIndex(i));
        mSourceOperandToOutputIndex[index] = i;
    }

    findControlFlowBoundaryConstants(sourceModels);

    mSuccessfulFinish = true;
    return ANEURALNETWORKS_NO_ERROR;
}
680 
// Records, for every constant operand that crosses a control-flow boundary
// (an IF condition, or an IF/WHILE outer input), where its value lives:
// either a CONSTANT_COPY buffer inside the source model or a
// CONSTANT_REFERENCE into a memory pool. These locations are later used to
// place the constant values into the temporaries memory (see
// ExecutionPlan::makeController() and the Controller constructor).
void ExecutionPlan::CompoundBody::findControlFlowBoundaryConstants(
        const SourceModels* sourceModels) {
    // Records the operand's location if (and only if) it is a constant.
    auto handleBoundaryConstants = [this,
                                    sourceModels](const SourceOperandIndex& sourceOperandIndex) {
        const ModelBuilder* sourceModel = sourceModels->getModel(sourceOperandIndex.first);
        const Operand& operand = sourceModel->getOperand(sourceOperandIndex.second);
        const DataLocation& location = operand.location;
        if (operand.lifetime == OperandLifeTime::CONSTANT_COPY) {
            mSourceOperandToBoundaryConstantCopy[sourceOperandIndex] = {
                    .buffer = sourceModel->getPointerToOperandValue(location.offset),
                    .length = location.length,
            };
        } else if (operand.lifetime == OperandLifeTime::CONSTANT_REFERENCE) {
            mSourceOperandToBoundaryConstantReference[sourceOperandIndex] = {
                    .memory = sourceModel->getMemories()[location.poolIndex],
                    .offset = location.offset,
                    .length = location.length,
            };
        }
    };
    for (const auto& logicalStep : mSteps) {
        if (const IfStep* step = logicalStep->tryIfStep()) {
            handleBoundaryConstants(step->conditionOperandIndex);
            for (const auto& sourceOperandIndex : step->outerInputOperands) {
                handleBoundaryConstants(sourceOperandIndex);
            }
        } else if (const WhileStep* step = logicalStep->tryWhileStep()) {
            for (const auto& sourceOperandIndex : step->outerInputOperands) {
                handleBoundaryConstants(sourceOperandIndex);
            }
        }
    }
}
714 
finish(const SourceModels *,int32_t executionPreference,int32_t priority,const std::optional<Deadline> & deadline)715 int ExecutionPlan::SimpleBody::finish(const SourceModels*, int32_t executionPreference,
716                                       int32_t priority, const std::optional<Deadline>& deadline) {
717     CHECK(!mSuccessfulFinish);
718     CHECK(mDevice != nullptr);
719     VLOG(COMPILATION) << "ExecutionPlan::SimpleBody::finish, compilation";
720     const int n = compile(*mDevice, *mModel, executionPreference, priority, deadline, *mCacheDir,
721                           &mToken, &mPreparedModel);
722     mSuccessfulFinish = (n == ANEURALNETWORKS_NO_ERROR);
723     return n;
724 }
725 
// Finishes compilation of the plan by delegating to the SIMPLE or COMPOUND
// body. Returns an ANEURALNETWORKS_* result code.
int ExecutionPlan::finish(int32_t executionPreference, int32_t priority,
                          const std::optional<Deadline>& deadline) {
    CHECK(mBody != nullptr);
    return mBody->finish(&getSourceModels(), executionPreference, priority, deadline);
}
731 
// Convenience constructor used for plans that need no temporaries memory and
// no operand-index maps: delegates with zero temporaries size and empty maps.
ExecutionPlan::Controller::Controller(const ExecutionPlan* plan, ExecutionBuilder* executionBuilder,
                                      const BurstBuilder* burstBuilder)
    : Controller(plan, executionBuilder, burstBuilder, 0, {}, {}, {}, {}, {}, {}) {}
735 
// Per-execution state for a plan. Allocates one shared (ashmem) region of
// totalSizeOfTemporaries bytes for partition-boundary temporaries, then seeds
// it with the boundary CONSTANT_COPY values at the offsets recorded in
// sourceOperandToOffsetOfTemporary.
//
// On allocation failure, mNextStepIndex is set to kBadStepIndex so that a
// subsequent next()/fallback() call fails with ANEURALNETWORKS_OP_FAILED.
ExecutionPlan::Controller::Controller(
        const ExecutionPlan* plan, ExecutionBuilder* executionBuilder,
        const BurstBuilder* burstBuilder, uint32_t totalSizeOfTemporaries,
        std::map<SourceOperandIndex, uint32_t> sourceOperandToOffsetOfTemporary,
        std::map<SourceOperandIndex, uint32_t> sourceOperandToOffsetOfTemporary2,
        std::map<SourceOperandIndex, uint32_t> sourceOperandToInputIndex,
        std::map<SourceOperandIndex, uint32_t> sourceOperandToOutputIndex,
        const std::map<SourceOperandIndex, ConstantCopyLocation>& sourceOperandToConstantCopy,
        std::map<SourceOperandIndex, ConstantReferenceLocation> sourceOperandToConstantReference)
    : mPlan(plan),
      mExecutionBuilder(executionBuilder),
      mBurstBuilder(burstBuilder),
      mSourceOperandToOffsetOfTemporary(std::move(sourceOperandToOffsetOfTemporary)),
      mSourceOperandToOffsetOfTemporary2(std::move(sourceOperandToOffsetOfTemporary2)),
      mSourceOperandToInputIndex(std::move(sourceOperandToInputIndex)),
      mSourceOperandToOutputIndex(std::move(sourceOperandToOutputIndex)),
      mSourceOperandToConstantReference(std::move(sourceOperandToConstantReference)),
      mNextStepIndex(0),
      mFallbackNextStepIndex(kBadStepIndex),
      mLastStepSyncFd(-1) {
    if (totalSizeOfTemporaries == 0) {
        return;
    }
    int n;
    std::tie(n, mTemporaries) = MemoryAshmem::create(totalSizeOfTemporaries);
    if (n != ANEURALNETWORKS_NO_ERROR) {
        LOG(ERROR) << "ExecutionPlan::Controller failed to allocate temporaries";
        mNextStepIndex = kBadStepIndex;
        // BUG FIX: mTemporaries is null on failure; the copy loop below would
        // dereference it. Bail out -- the bad step index makes next() fail.
        return;
    }
    // Materialize boundary CONSTANT_COPY values into the temporaries memory.
    for (const auto& [sourceOperandIndex, location] : sourceOperandToConstantCopy) {
        memcpy(mTemporaries->getPointer() + mSourceOperandToOffsetOfTemporary[sourceOperandIndex],
               location.buffer, location.length);
    }
}
770 
771 // Attempt to create a burst object for each PreparedModel/Partition. If the
772 // burst controller object cannot be made, return a nullptr in its place to
773 // indicate the regular execution path should be used. This can occur either
774 // because PreparedModel was nullptr (cpu was best choice), or because the
775 // IPreparedModel was of insufficient version or failed to configure the burst.
makeBursts(int preference) const776 std::vector<std::shared_ptr<ExecutionBurstController>> ExecutionPlan::makeBursts(
777         int preference) const {
778     switch (mState) {
779         // burst object for each partition in the compound case
780         case COMPOUND: {
781             std::vector<std::shared_ptr<ExecutionBurstController>> bursts;
782             bursts.reserve(compound()->mSteps.size());
783             for (const auto& logicalStep : compound()->mSteps) {
784                 if (!logicalStep->isExecution()) {
785                     bursts.push_back(nullptr);
786                     continue;
787                 }
788                 if (const auto preparedModel =
789                             logicalStep->executionStep()->getPreparedStepModel()) {
790                     const bool preferPowerOverLatency =
791                             (preference == ANEURALNETWORKS_PREFER_LOW_POWER);
792                     bursts.push_back(
793                             preparedModel->configureExecutionBurst(preferPowerOverLatency));
794                 } else {
795                     bursts.push_back(nullptr);
796                 }
797             }
798             return bursts;
799         }
800         // single burst object for the simple case
801         case SIMPLE: {
802             std::vector<std::shared_ptr<ExecutionBurstController>> burst;
803             auto simpleBody = simple();
804             if (const auto preparedModel = simpleBody->mPreparedModel) {
805                 const bool preferPowerOverLatency =
806                         (preference == ANEURALNETWORKS_PREFER_LOW_POWER);
807                 burst.push_back(preparedModel->configureExecutionBurst(preferPowerOverLatency));
808             } else {
809                 burst.push_back(nullptr);
810             }
811             return burst;
812         }
813         // no burst objects made
814         default:
815             return {};
816     }
817 }
818 
// Builds the Controller holding per-execution state for this plan. For a
// SIMPLE plan there is nothing to lay out. For a COMPOUND plan, computes a
// single memory layout holding every partition-boundary TEMPORARY operand and
// the buffers needed by control flow (IF/WHILE), then constructs the
// Controller with that layout plus the operand-index maps captured at
// compilation time.
std::shared_ptr<ExecutionPlan::Controller> ExecutionPlan::makeController(
        ExecutionBuilder* executionBuilder, const BurstBuilder* burstBuilder) const {
    CHECK(isValid());
    if (mState == SIMPLE) {
        return std::shared_ptr<Controller>(new Controller(this, executionBuilder, burstBuilder));
    }
    // Create the layout for a Memory object big enough to hold
    // - every partition boundary TEMPORARY operand and
    // - buffers required by the control flow implementation.
    //
    // TODO: Rethink this approach for managing temporaries.  Some
    // alternatives:
    //
    // 1) Adopt a memory layout scheme analogous to stack allocation,
    // where objects of non-overlapping lifetime can occupy the same
    // storage.  We would still have a single Memory object in this
    // case.
    //
    // 2) Do something like what CpuExecutor does, and do allocations
    // and deallocations on the fly (during execution) before first
    // reference and after last reference, respectively.  This would
    // mean having one Memory object per TEMPORARY; or, in a more
    // complicated implementation, one Memory object per set of
    // temporaries that have the same lifetime.  Note that the Android
    // system limits the number of shared memory objects, which are
    // what our Memory objects represent.
    //
    uint32_t totalSizeOfTemporaries = 0;
    // Reserves an aligned region of `size` bytes at the end of the layout and
    // returns its offset.
    auto addTemporaryOfSize = [&totalSizeOfTemporaries](uint32_t size) {
        totalSizeOfTemporaries += alignBytesNeeded(totalSizeOfTemporaries, size);
        const uint32_t offset = totalSizeOfTemporaries;
        totalSizeOfTemporaries += size;
        return offset;
    };
    // This function has two modes of operation:
    // 1. When lifetime is TEMPORARY_VARIABLE, we allocate memory for
    //    TEMPORARY_VARIABLE source operands, skip SUBGRAPH_OUTPUT source
    //    operands, and panic if we see a source operand of another lifetime.
    // 2. When lifetime is SUBGRAPH_OUTPUT, we allocate memory for
    //    SUBGRAPH_OUTPUT source operands and panic if we see a source operand
    //    of another lifetime.
    auto mapTemporary =
            [executionBuilder, addTemporaryOfSize](
                    const SourceOperandIndex& sourceOperandIndex,
                    std::map<SourceOperandIndex, uint32_t>* sourceOperandToOffsetOfTemporary,
                    OperandLifeTime lifetime = OperandLifeTime::TEMPORARY_VARIABLE) {
                CHECK(lifetime == OperandLifeTime::TEMPORARY_VARIABLE ||
                      lifetime == OperandLifeTime::SUBGRAPH_OUTPUT);
                const Operand& sourceOperand =
                        executionBuilder->getSourceOperand(sourceOperandIndex);
                if (lifetime == OperandLifeTime::TEMPORARY_VARIABLE &&
                    sourceOperand.lifetime == OperandLifeTime::SUBGRAPH_OUTPUT) {
                    // See the caller for explanation.
                    return;
                }
                CHECK(sourceOperand.lifetime == lifetime);
                const uint32_t size = TypeManager::get()->getSizeOfData(sourceOperand);
                CHECK_NE(size, 0u);
                const uint32_t offset = addTemporaryOfSize(size);
                auto [_, isNew] =
                        sourceOperandToOffsetOfTemporary->emplace(sourceOperandIndex, offset);
                CHECK(isNew);
                VLOG(EXECUTION) << "temp: operand " << toString(sourceOperandIndex)
                                << " offset = " << offset;
            };
    std::map<SourceOperandIndex, uint32_t> sourceOperandToOffsetOfTemporary;
    std::map<SourceOperandIndex, uint32_t> sourceOperandToOffsetOfTemporary2;
    for (const auto& logicalStep : compound()->mSteps) {
        if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
            // Allocate memory for ExecutionStep temporary outputs that are
            // inputs to other steps, as determined by
            // ExecutionPlan::CompoundBody::findTempsAsStepModelOutputs().
            //
            // We don't allocate memory for step model output operands with
            // source operand lifetime SUBGRAPH_OUTPUT because they will be
            // - managed by the client (main model outputs),
            // - assigned a location of another operand (when this step model
            //   output is a branch model output of an IF; see
            //   ExecutionPlan::nextCompound(const IfStep*, ...)), or
            // - allocated by a WHILE (when this step model output
            //   is a condition or body model output of a WHILE; see the
            //   step->bodyOutputOperands and step->condOutputOperand handling
            //   below).
            for (const auto& output : step->getTempsAsStepModelOutputs()) {
                mapTemporary(SourceOperandIndex(step->getSourceModelIndex(), output.first),
                             &sourceOperandToOffsetOfTemporary);
            }
        } else if (const IfStep* step = logicalStep->tryIfStep()) {
            // Allocate memory for all temporary outputs of an IfStep because
            // they are going to be written to by a branch model. We don't
            // perform unused output operand optimisation for referenced models.
            //
            // We don't allocate memory for branch output operands because they
            // use the same location as the corresponding outer output operands,
            // as established in ExecutionPlan::nextCompound(const IfStep*, ...)
            //
            // We don't allocate memory for outer output operands with source
            // operand lifetime SUBGRAPH_OUTPUT because they will be
            // - managed by the client (main model outputs),
            // - assigned a location of another operand (when this IF outer
            //   output is a branch model output of another IF; see
            //   ExecutionPlan::nextCompound(const IfStep*, ...)), or
            // - allocated by a WHILE (when this IF outer output
            //   is a condition or body model output of a WHILE; see the
            //   step->bodyOutputOperands and step->condOutputOperand handling
            //   below).
            for (const auto& sourceOperandIndex : step->outerOutputOperands) {
                mapTemporary(sourceOperandIndex, &sourceOperandToOffsetOfTemporary);
            }
        } else if (const WhileStep* step = logicalStep->tryWhileStep()) {
            // Allocate memory for all temporary outputs of an WhileStep because
            // they are going to be written to by the WHILE loop.
            //
            // We don't allocate memory for outer output operands with source
            // operand lifetime SUBGRAPH_OUTPUT because they will be
            // - managed by the client (main model outputs),
            // - assigned a location of another operand (when this WHILE outer
            //   output is a branch model output of an IF; see
            //   ExecutionPlan::nextCompound(const IfStep*, ...)), or
            // - allocated by another WHILE (when this WHILE outer output
            //   is a condition or body model output of another WHILE; see the
            //   step->bodyOutputOperands and step->condOutputOperand handling
            //   below).
            for (const auto& sourceOperandIndex : step->outerOutputOperands) {
                mapTemporary(sourceOperandIndex, &sourceOperandToOffsetOfTemporary);
            }
            // Allocate memory for body model outputs. Note that we could use
            // the outer output operand memory instead but we currently don't do
            // so (b/148206073).
            for (const auto& sourceOperandIndex : step->bodyOutputOperands) {
                mapTemporary(sourceOperandIndex, &sourceOperandToOffsetOfTemporary,
                             OperandLifeTime::SUBGRAPH_OUTPUT);
                // Allocate another set of temporaries for double buffering.
                mapTemporary(sourceOperandIndex, &sourceOperandToOffsetOfTemporary2,
                             OperandLifeTime::SUBGRAPH_OUTPUT);
            }
            // Allocate memory for condition model output.
            // TODO: Share one condition output memory region between all loops.
            mapTemporary(step->condOutputOperand, &sourceOperandToOffsetOfTemporary,
                         OperandLifeTime::SUBGRAPH_OUTPUT);
        } else {
            CHECK(logicalStep->isGoto());
        }
    }
    // Allocate temporary memory for boundary CONSTANT_COPY operands. The
    // Controller constructor copies the constant values into these regions.
    for (const auto& [sourceOperandIndex, location] :
         compound()->mSourceOperandToBoundaryConstantCopy) {
        const uint32_t offset = addTemporaryOfSize(location.length);
        sourceOperandToOffsetOfTemporary.emplace(sourceOperandIndex, offset);
        VLOG(EXECUTION) << "temp (boundary constant): operand " << toString(sourceOperandIndex)
                        << " offset = " << offset;
    }
    return std::shared_ptr<Controller>(new Controller(
            this, executionBuilder, burstBuilder, totalSizeOfTemporaries,
            std::move(sourceOperandToOffsetOfTemporary),
            std::move(sourceOperandToOffsetOfTemporary2), compound()->mSourceOperandToInputIndex,
            compound()->mSourceOperandToOutputIndex,
            compound()->mSourceOperandToBoundaryConstantCopy,
            compound()->mSourceOperandToBoundaryConstantReference));
}
979 
980 // TODO: Find a better way to provide this functionality.
fallback(std::shared_ptr<Controller> controller,std::shared_ptr<StepExecutor> * executor) const981 int ExecutionPlan::fallback(std::shared_ptr<Controller> controller,
982                             std::shared_ptr<StepExecutor>* executor) const {
983     *executor = nullptr;
984 
985     VLOG(EXECUTION) << "ExecutionPlan::fallback(" << SHOW_IF_DEBUG(controller << ", " << executor)
986                     << "): mFallbackNextStepIndex = " << controller->mFallbackNextStepIndex;
987 
988     if (controller->mFallbackNextStepIndex == Controller::kBadStepIndex) {
989         // We haven't called next().
990         return ANEURALNETWORKS_OP_FAILED;
991     }
992 
993     if (controller->mNextStepIndex == Controller::kBadStepIndex) {
994         // The last call to next() did not produce an executor.
995         return ANEURALNETWORKS_OP_FAILED;
996     }
997 
998     controller->mNextStepIndex = controller->mFallbackNextStepIndex;
999     return next(controller, executor);
1000 }
1001 
Buffer(void * pointer,uint32_t size)1002 ExecutionPlan::Buffer::Buffer(void* pointer, uint32_t size)
1003     : mInfo(RunTimePoolInfo::createFromExistingBuffer(reinterpret_cast<uint8_t*>(pointer), size)),
1004       mOffset(0) {}
1005 
// Views the given pool starting at `offset` bytes into it.
ExecutionPlan::Buffer::Buffer(RunTimePoolInfo info, uint32_t offset)
    : mInfo(std::move(info)), mOffset(offset) {}
1008 
// Returns a pointer to the start of this Buffer's view into the pool.
void* ExecutionPlan::Buffer::getPointer() const {
    return mInfo.getBuffer() + mOffset;
}
1012 
// Returns the number of bytes from the Buffer's start to the end of the pool.
uint32_t ExecutionPlan::Buffer::getSize() const {
    return mInfo.getSize() - mOffset;
}
1016 
// Flushes the underlying pool so writes become visible to other mappings.
void ExecutionPlan::Buffer::flush() const {
    mInfo.flush();
}
1020 
getBufferFromModelArgumentInfo(const ModelArgumentInfo & info,const ExecutionBuilder * executionBuilder) const1021 std::optional<ExecutionPlan::Buffer> ExecutionPlan::getBufferFromModelArgumentInfo(
1022         const ModelArgumentInfo& info, const ExecutionBuilder* executionBuilder) const {
1023     switch (info.state()) {
1024         case ModelArgumentInfo::POINTER: {
1025             return Buffer(info.buffer(), info.length());
1026         } break;
1027         case ModelArgumentInfo::MEMORY: {
1028             if (std::optional<RunTimePoolInfo> poolInfo =
1029                         executionBuilder->getRunTimePoolInfo(info.locationAndLength().poolIndex)) {
1030                 return Buffer(*poolInfo, info.locationAndLength().offset);
1031             } else {
1032                 LOG(ERROR) << "Unable to map operand memory pool";
1033                 return std::nullopt;
1034             }
1035         } break;
1036         case ModelArgumentInfo::HAS_NO_VALUE: {
1037             LOG(ERROR) << "Attempting to read an operand that has no value";
1038             return std::nullopt;
1039         } break;
1040         default: {
1041             LOG(ERROR) << "Unexpected operand memory state: " << static_cast<int>(info.state());
1042             return std::nullopt;
1043         } break;
1044     }
1045 }
1046 
// Resolves a source operand to the buffer holding its value at execution
// time. Lookup order: inter-step temporary, main-model input, main-model
// output, then boundary CONSTANT_REFERENCE. Returns std::nullopt if the
// operand is not found in any map or its memory cannot be mapped.
std::optional<ExecutionPlan::Buffer> ExecutionPlan::getBuffer(
        std::shared_ptr<Controller> controller, SourceOperandIndex operandIndex) const {
    const auto& sourceOperandToOffsetOfTemporary = controller->mSourceOperandToOffsetOfTemporary;
    const auto& sourceOperandToInputIndex = controller->mSourceOperandToInputIndex;
    const auto& sourceOperandToOutputIndex = controller->mSourceOperandToOutputIndex;
    const auto& sourceOperandToConstantReference = controller->mSourceOperandToConstantReference;
    if (auto it = sourceOperandToOffsetOfTemporary.find(operandIndex);
        it != sourceOperandToOffsetOfTemporary.end()) {
        // The operand lives in the Controller's shared temporaries memory.
        const uint32_t offset = it->second;
        const std::unique_ptr<MemoryAshmem>& memory = controller->mTemporaries;
        return Buffer(memory->getPointer() + offset, memory->getSize() - offset);
    } else if (auto it = sourceOperandToInputIndex.find(operandIndex);
               it != sourceOperandToInputIndex.end()) {
        // The operand is a main-model input provided by the client.
        const ModelArgumentInfo& info = controller->mExecutionBuilder->getInputInfo(it->second);
        return getBufferFromModelArgumentInfo(info, controller->mExecutionBuilder);
    } else if (auto it = sourceOperandToOutputIndex.find(operandIndex);
               it != sourceOperandToOutputIndex.end()) {
        // The operand is a main-model output provided by the client.
        const ModelArgumentInfo& info = controller->mExecutionBuilder->getOutputInfo(it->second);
        return getBufferFromModelArgumentInfo(info, controller->mExecutionBuilder);
    } else if (auto it = sourceOperandToConstantReference.find(operandIndex);
               it != sourceOperandToConstantReference.end()) {
        // The operand is a boundary constant referencing a memory pool.
        const ConstantReferenceLocation& location = it->second;
        const std::optional<RunTimePoolInfo> info = location.memory->getRunTimePoolInfo();
        if (info == std::nullopt) {
            return std::nullopt;
        }
        return Buffer(info->getBuffer() + location.offset, location.length);
    }
    return std::nullopt;
}
1077 
readConditionValue(std::shared_ptr<Controller> controller,SourceOperandIndex operandIndex,bool * value) const1078 int ExecutionPlan::readConditionValue(std::shared_ptr<Controller> controller,
1079                                       SourceOperandIndex operandIndex, bool* value) const {
1080     std::optional<ExecutionPlan::Buffer> buffer = getBuffer(controller, operandIndex);
1081     if (buffer == std::nullopt) {
1082         LOG(ERROR) << "Unable to read operand " << toString(operandIndex);
1083         return ANEURALNETWORKS_OP_FAILED;
1084     }
1085     CHECK_GE(buffer->getSize(), sizeof(bool8));
1086     bool8 value8 = *static_cast<bool8*>(buffer->getPointer());
1087     *value = static_cast<bool>(value8);
1088     VLOG(EXECUTION) << "readConditionValue: " << *value;
1089     return ANEURALNETWORKS_NO_ERROR;
1090 }
1091 
// Advances execution to the next step, producing a StepExecutor for it (and,
// if requested and available, a burst controller). Sets *executor to nullptr
// when the plan has run to completion. syncFdOfLastStep is the sync fd from
// the previous step, recorded on the controller. Returns an ANEURALNETWORKS_*
// result code; fails if the controller is in the bad-step state.
int ExecutionPlan::next(std::shared_ptr<Controller> controller,
                        std::shared_ptr<StepExecutor>* executor,
                        std::shared_ptr<ExecutionBurstController>* burstController,
                        int syncFdOfLastStep) const {
    controller->mLastStepSyncFd = syncFdOfLastStep;
    *executor = nullptr;
    if (burstController != nullptr) {
        *burstController = nullptr;
    }

    VLOG(EXECUTION) << "ExecutionPlan::next(" << SHOW_IF_DEBUG(controller << ", " << executor)
                    << "): mNextStepIndex = " << controller->mNextStepIndex;

    if (controller->mNextStepIndex == Controller::kBadStepIndex) {
        return ANEURALNETWORKS_OP_FAILED;
    }

    // An EMPTY plan completes immediately with no executor.
    if (mState == EMPTY) {
        CHECK_EQ(controller->mNextStepIndex, 0u);  // end
        controller->mNextStepIndex = Controller::kBadStepIndex;
        return ANEURALNETWORKS_NO_ERROR;
    }

    // A SIMPLE plan has exactly one step covering the whole model.
    if (mState == SIMPLE) {
        if (controller->mNextStepIndex == 0) {
            // First (and only) step.
            auto simpleBody = simple();
            *executor = std::make_shared<StepExecutor>(controller->mExecutionBuilder,
                                                       simpleBody->mModel, simpleBody->mDevice,
                                                       simpleBody->mPreparedModel);
            (*executor)->mapInputsAndOutputsTrivially();
            if (burstController != nullptr && controller->mBurstBuilder != nullptr) {
                *burstController = controller->mBurstBuilder->getControllerAt(0);
            }
            controller->mFallbackNextStepIndex = 0;
            controller->mNextStepIndex = 1;
            return ANEURALNETWORKS_NO_ERROR;
        }

        CHECK_EQ(controller->mNextStepIndex, 1u);  // end
        controller->mNextStepIndex = Controller::kBadStepIndex;
        return ANEURALNETWORKS_NO_ERROR;
    }

    // COMPOUND plan: dispatch on the current logical step.
    return nextCompound(controller, executor, burstController);
}
1138 
// COMPOUND-plan stepping: dispatches the logical step at mNextStepIndex to
// the matching typed nextCompound() overload. When all steps are exhausted,
// marks the controller finished (bad step index) and succeeds with no
// executor produced.
int ExecutionPlan::nextCompound(std::shared_ptr<Controller> controller,
                                std::shared_ptr<StepExecutor>* executor,
                                std::shared_ptr<ExecutionBurstController>* burstController) const {
    if (controller->mNextStepIndex == Controller::kBadStepIndex) {
        return ANEURALNETWORKS_OP_FAILED;
    }

    auto compoundBody = compound();
    if (controller->mNextStepIndex == compoundBody->mSteps.size()) {
        controller->mNextStepIndex = Controller::kBadStepIndex;  // end
        return ANEURALNETWORKS_NO_ERROR;
    }

    const auto& logicalStep = compoundBody->mSteps[controller->mNextStepIndex];
    if (const IfStep* step = logicalStep->tryIfStep()) {
        return nextCompound(step, controller, executor, burstController);
    } else if (const WhileStep* step = logicalStep->tryWhileStep()) {
        return nextCompound(step, controller, executor, burstController);
    } else if (const GotoStep* step = logicalStep->tryGotoStep()) {
        return nextCompound(step, controller, executor, burstController);
    } else if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
        return nextCompound(step, controller, executor, burstController);
    } else {
        CHECK(false) << "Unknown step variant";
        return ANEURALNETWORKS_BAD_STATE;
    }
}
1166 
// Produces the StepExecutor for an ExecutionStep: creates the executor for
// the step's device/prepared model, maps its inputs/outputs via the
// controller's operand maps and temporaries memory, optionally supplies the
// step's burst controller, and advances the controller (recording the current
// index as the fallback point).
int ExecutionPlan::nextCompound(const ExecutionStep* step, std::shared_ptr<Controller> controller,
                                std::shared_ptr<StepExecutor>* executor,
                                std::shared_ptr<ExecutionBurstController>* burstController) const {
    VLOG(EXECUTION) << "next: Step#" << controller->mNextStepIndex << ": execute on "
                    << step->getDevice()->getName();
    *executor =
            std::make_shared<StepExecutor>(controller->mExecutionBuilder, step->getStepModel(),
                                           step->getDevice(), step->getPreparedStepModel(), step);
    step->mapInputsAndOutputs(
            *executor, controller->mTemporaries.get(),
            controller->mSourceOperandToOffsetOfTemporary, controller->mSourceOperandToInputIndex,
            controller->mSourceOperandToOutputIndex, controller->mSourceOperandToConstantReference);
    if (burstController != nullptr && controller->mBurstBuilder != nullptr) {
        *burstController = controller->mBurstBuilder->getControllerAt(controller->mNextStepIndex);
    }

    controller->mFallbackNextStepIndex = controller->mNextStepIndex;
    controller->mNextStepIndex++;
    return ANEURALNETWORKS_NO_ERROR;
}
1187 
// The first argument is the "source" operand, the second operand is the "destination".
// Rebinds step model input operand |innerOperand| to the location currently
// recorded for |outerOperand|: whichever of the four location maps contains
// the outer operand supplies the new entry for the inner operand. Any prior
// binding of the inner operand is erased first. It is a fatal error if the
// outer operand has no recorded location.
void ExecutionPlan::Controller::setInput(const SourceOperandIndex& outerOperand,
                                         const SourceOperandIndex& innerOperand) {
    VLOG(EXECUTION) << "mapping input " << toString(innerOperand) << " from "
                    << toString(outerOperand);
#ifdef NN_DEBUGGABLE
    // An operand may appear in at most one of the location maps at a time.
    CHECK_LE(mSourceOperandToOffsetOfTemporary.count(innerOperand) +
                     mSourceOperandToInputIndex.count(innerOperand) +
                     mSourceOperandToOutputIndex.count(innerOperand) +
                     mSourceOperandToConstantReference.count(innerOperand),
             1u);
#endif
    // Drop any stale binding of the inner operand before copying the new one.
    mSourceOperandToOffsetOfTemporary.erase(innerOperand);
    mSourceOperandToInputIndex.erase(innerOperand);
    mSourceOperandToOutputIndex.erase(innerOperand);
    mSourceOperandToConstantReference.erase(innerOperand);
    // Copy the outer operand's location into the inner operand's slot in the
    // same map; the first map that knows the outer operand wins.
    if (auto it = mSourceOperandToOffsetOfTemporary.find(outerOperand);
        it != mSourceOperandToOffsetOfTemporary.end()) {
        mSourceOperandToOffsetOfTemporary.emplace(innerOperand, it->second);
    } else if (auto it = mSourceOperandToInputIndex.find(outerOperand);
               it != mSourceOperandToInputIndex.end()) {
        mSourceOperandToInputIndex.emplace(innerOperand, it->second);
    } else if (auto it = mSourceOperandToOutputIndex.find(outerOperand);
               it != mSourceOperandToOutputIndex.end()) {
        mSourceOperandToOutputIndex.emplace(innerOperand, it->second);
    } else if (auto it = mSourceOperandToConstantReference.find(outerOperand);
               it != mSourceOperandToConstantReference.end()) {
        mSourceOperandToConstantReference.emplace(innerOperand, it->second);
    } else {
        CHECK(false) << "Cannot set step model input operand " << toString(innerOperand)
                     << " from operand " << toString(outerOperand);
    }
}
1221 
// The first argument is the "source" operand, the second operand is the "destination".
// Output counterpart of setInput(): rebinds step model output operand
// |innerOperand| to the location currently recorded for |outerOperand|. Only
// the temporary-offset and output-index maps can describe an output location.
void ExecutionPlan::Controller::setOutput(const SourceOperandIndex& outerOperand,
                                          const SourceOperandIndex& innerOperand) {
    VLOG(EXECUTION) << "mapping output " << toString(innerOperand) << " from "
                    << toString(outerOperand);
#ifdef NN_DEBUGGABLE
    // An operand may appear in at most one of the location maps at a time.
    CHECK_LE(mSourceOperandToOffsetOfTemporary.count(innerOperand) +
                     mSourceOperandToOutputIndex.count(innerOperand),
             1u);
#endif
    // Drop any stale binding of the inner operand before copying the new one.
    mSourceOperandToOffsetOfTemporary.erase(innerOperand);
    mSourceOperandToOutputIndex.erase(innerOperand);
    if (auto it = mSourceOperandToOffsetOfTemporary.find(outerOperand);
        it != mSourceOperandToOffsetOfTemporary.end()) {
        mSourceOperandToOffsetOfTemporary.emplace(innerOperand, it->second);
    } else if (auto it = mSourceOperandToOutputIndex.find(outerOperand);
               it != mSourceOperandToOutputIndex.end()) {
        mSourceOperandToOutputIndex.emplace(innerOperand, it->second);
    } else {
        CHECK(false) << "Cannot set step model output operand " << toString(innerOperand)
                     << " from operand " << toString(outerOperand);
    }
}
1245 
waitForLastStepSyncFence() const1246 int ExecutionPlan::Controller::waitForLastStepSyncFence() const {
1247     if (mLastStepSyncFd == -1) {
1248         return ANEURALNETWORKS_NO_ERROR;
1249     }
1250     VLOG(EXECUTION) << "wait for mLastStepSyncFd " << mLastStepSyncFd;
1251     auto r = syncWait(mLastStepSyncFd, -1);
1252     int n = ANEURALNETWORKS_NO_ERROR;
1253     if (r != FenceState::SIGNALED) {
1254         LOG(ERROR) << "syncWait failed, fd: " << mLastStepSyncFd;
1255         n = ANEURALNETWORKS_OP_FAILED;
1256     }
1257     return n;
1258 }
1259 
// Advances execution across an IF operation: reads the boolean condition
// operand and redirects the controller to the first step of either the "then"
// or the "else" branch, rebinding the chosen branch model's inputs and
// outputs to the IF's outer operands just before the branch executes.
int ExecutionPlan::nextCompound(const IfStep* step, std::shared_ptr<Controller> controller,
                                std::shared_ptr<StepExecutor>* executor,
                                std::shared_ptr<ExecutionBurstController>* burstController) const {
    VLOG(EXECUTION) << "next: " << toString(*step);
    // If the last step has a sync fence, wait for it to signal before reading the condition value.
    // This is safe because the steps are serialized when doing fenced compute.
    NN_RETURN_IF_ERROR(controller->waitForLastStepSyncFence());
    bool condValue;
    NN_RETURN_IF_ERROR(readConditionValue(controller, step->conditionOperandIndex, &condValue));
    controller->mNextStepIndex = condValue ? step->thenStepIndex : step->elseStepIndex;
    // Select the operand lists of whichever branch model will run.
    const std::vector<SourceOperandIndex>& branchInputOperands =
            condValue ? step->thenBranchInputOperands : step->elseBranchInputOperands;
    const std::vector<SourceOperandIndex>& branchOutputOperands =
            condValue ? step->thenBranchOutputOperands : step->elseBranchOutputOperands;
    CHECK_EQ(branchInputOperands.size(), step->outerInputOperands.size());
    CHECK_EQ(branchOutputOperands.size(), step->outerOutputOperands.size());
    for (uint32_t i = 0, n = step->outerInputOperands.size(); i < n; ++i) {
        // We have to do this assignment just before executing this step to
        // accommodate cases when the IF resides within a WHILE condition or
        // body model and for some j the i-th input of the IF branch model is
        // - an input of the WHILE condition model (whileStep->condInputOperands[j]),
        // - an input of the WHILE body model (whileStep->bodyInputOperands[j]), or
        // - an output of the WHILE body model (whileStep->bodyOutputOperands[j]).
        // In such cases, the WhileStep modifies the location of
        // step->outerInputOperands[i] to implement double buffering.
        controller->setInput(step->outerInputOperands[i], branchInputOperands[i]);
    }
    for (uint32_t i = 0, n = step->outerOutputOperands.size(); i < n; ++i) {
        // We have to do this assignment just before executing this step to
        // accommodate the case when the IF resides within a WHILE body
        // model and the i-th output of the IF branch model is an
        // output of the WHILE body model (whileStep->bodyOutputOperands[j] for
        // some j). In that case, the WhileStep modifies the location of
        // step->outerOutputOperands[i] to implement double buffering.
        controller->setOutput(step->outerOutputOperands[i], branchOutputOperands[i]);
    }
    // Recurse to prepare the first step of the chosen branch.
    return nextCompound(controller, executor, burstController);
}
1298 
// Advances execution across a WHILE operation. Each WhileStep owns a
// WhileState (in controller->mWhileState) that alternates between two stages:
// - EVALUATE_CONDITION: bind the condition model's inputs (the WHILE's outer
//   inputs on iteration 0, the body outputs afterwards) and jump to the
//   condition model's first step.
// - EVALUATE_BODY: read the condition result. If true, jump into the body
//   model (swapping the double-buffered temporaries so the body reads last
//   iteration's outputs and writes the other buffer); if false, copy the
//   final body outputs to the WHILE's outer output operands and exit the
//   loop. Also enforces the user-configurable loop timeout.
int ExecutionPlan::nextCompound(const WhileStep* step, std::shared_ptr<Controller> controller,
                                std::shared_ptr<StepExecutor>* executor,
                                std::shared_ptr<ExecutionBurstController>* burstController) const {
    WhileState& state = controller->mWhileState[controller->mNextStepIndex];
    if (state.stage == WhileState::EVALUATE_CONDITION) {
        // kOutsideLoop means we are entering the loop fresh; otherwise this is
        // the next iteration of an in-progress loop.
        state.iteration = state.iteration == WhileState::kOutsideLoop ? 0 : state.iteration + 1;
        VLOG(EXECUTION) << "next: " << toString(*step) << ": iteration " << state.iteration
                        << ": evaluating condition";
        controller->mNextStepIndex = step->condStepIndex;

        if (state.iteration == 0) {
            // Start the timeout clock when the loop is first entered.
            state.startTime = std::chrono::steady_clock::now();
        }

        // iteration = 0   cond inputs = outer inputs
        // iteration = 1   cond inputs = body outputs
        // iteration = 2   cond inputs = body outputs
        // iteration = 3   cond inputs = ...
        uint32_t loopBodyOutputCount = step->bodyOutputOperands.size();
        CHECK_EQ(step->condInputOperands.size(), step->outerInputOperands.size());
        CHECK_GE(step->condInputOperands.size(), loopBodyOutputCount);
        for (uint32_t i = 0, n = step->condInputOperands.size(); i < n; ++i) {
            // Inputs beyond the body-output count are input-only state: they
            // are never produced by the body, so they always come from the
            // outer inputs.
            bool operandIsInputOnly = i >= loopBodyOutputCount;
            controller->setInput((state.iteration == 0 || operandIsInputOnly)
                                         ? step->outerInputOperands[i]
                                         : step->bodyOutputOperands[i],
                                 step->condInputOperands[i]);
        }

        state.stage = WhileState::EVALUATE_BODY;
        return nextCompound(controller, executor, burstController);
    }

    CHECK(state.stage == WhileState::EVALUATE_BODY);
    // Enforce the loop timeout before acting on the condition result.
    std::chrono::nanoseconds timeoutDuration(
            controller->mExecutionBuilder->getLoopTimeoutDuration());
    auto duration = std::chrono::steady_clock::now() - state.startTime;
    if (duration > timeoutDuration) {
        LOG(ERROR) << "WHILE loop timed out after "
                   << std::chrono::duration_cast<std::chrono::milliseconds>(duration).count()
                   << " ms";
        return ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT;
    }

    // If the last step has a sync fence, wait for it to signal before reading the condition value.
    // This is safe because the steps are serialized when doing fenced compute.
    NN_RETURN_IF_ERROR(controller->waitForLastStepSyncFence());
    bool condValue;
    NN_RETURN_IF_ERROR(readConditionValue(controller, step->condOutputOperand, &condValue));
    if (condValue) {
        VLOG(EXECUTION) << "next: " << toString(*step) << ": iteration " << state.iteration
                        << ": evaluating body";
        controller->mNextStepIndex = step->bodyStepIndex;

        // iteration = 0   body inputs = cond inputs = outer inputs   body outputs = tmp1
        // iteration = 1   body inputs = cond inputs = tmp1           body outputs = tmp2
        // iteration = 2   body inputs = cond inputs = tmp2           body outputs = tmp1
        // iteration = 3   body inputs = cond inputs = ...            body outputs = ...
#ifdef NN_DEBUGGABLE
        CHECK_GE(step->bodyInputOperands.size(), step->bodyOutputOperands.size());
        CHECK_EQ(step->bodyInputOperands.size(), step->outerInputOperands.size());
        CHECK_EQ(step->bodyInputOperands.size(), step->condInputOperands.size());
        CHECK_GE(step->bodyOutputOperands.size(), step->outerOutputOperands.size());
#endif
        // The body reads exactly what the condition just read.
        for (uint32_t i = 0, n = step->bodyInputOperands.size(); i < n; ++i) {
            controller->setInput(step->condInputOperands[i], step->bodyInputOperands[i]);
        }
        if (state.iteration != 0) {
            // Double buffering: swap each body output between its two
            // temporary buffers so this iteration writes the buffer that last
            // iteration's outputs are NOT occupying.
            for (const SourceOperandIndex& outputOperand : step->bodyOutputOperands) {
#ifdef NN_DEBUGGABLE
                CHECK_EQ(controller->mSourceOperandToInputIndex.count(outputOperand), 0u);
                CHECK_EQ(controller->mSourceOperandToOutputIndex.count(outputOperand), 0u);
                CHECK_EQ(controller->mSourceOperandToOffsetOfTemporary.count(outputOperand), 1u);
                CHECK_EQ(controller->mSourceOperandToOffsetOfTemporary2.count(outputOperand), 1u);
#endif
                std::swap(controller->mSourceOperandToOffsetOfTemporary[outputOperand],
                          controller->mSourceOperandToOffsetOfTemporary2[outputOperand]);
            }
        }
    } else {
        VLOG(EXECUTION) << "next: " << toString(*step) << ": iteration " << state.iteration
                        << ": exiting loop";
        controller->mNextStepIndex = step->exitStepIndex;

        // Copy body outputs to outer outputs.
        // TODO: Use outer outputs instead of tmp2 to avoid copying?
        CHECK_LE(step->outerOutputOperands.size(), step->bodyOutputOperands.size());
        for (uint32_t i = 0, n = step->outerOutputOperands.size(); i < n; ++i) {
            // condInputOperands[i] points to a body output operand from the
            // last iteration if we've executed at least one iteration and to a
            // WHILE operation input operand otherwise.
            const SourceOperandIndex& innerOperand = step->condInputOperands[i];
            const SourceOperandIndex& outerOperand = step->outerOutputOperands[i];
            std::optional<Buffer> outerBuffer = getBuffer(controller, outerOperand);
            if (outerBuffer == std::nullopt) {
                // This should never happen.
                LOG(ERROR) << "Unable to get outerBuffer for operand " << toString(outerOperand);
                return ANEURALNETWORKS_OP_FAILED;
            }
            const Operand& sourceOperand =
                    controller->mExecutionBuilder->getSourceOperand(outerOperand);
            const uint32_t size = TypeManager::get()->getSizeOfData(sourceOperand);
            CHECK_NE(size, 0u);
            std::optional<Buffer> innerBuffer = getBuffer(controller, innerOperand);
            if (innerBuffer == std::nullopt) {
                // This should never happen.
                LOG(ERROR) << "Unable to get innerBuffer for operand " << toString(innerOperand);
                return ANEURALNETWORKS_OP_FAILED;
            }
            CHECK_LE(size, innerBuffer->getSize());
            CHECK_LE(size, outerBuffer->getSize());
            memcpy(outerBuffer->getPointer(), innerBuffer->getPointer(), size);
            outerBuffer->flush();
        }
        // Mark the loop as not executing so a future re-entry restarts at
        // iteration 0.
        state.iteration = WhileState::kOutsideLoop;
    }

    state.stage = WhileState::EVALUATE_CONDITION;
    return nextCompound(controller, executor, burstController);
}
1419 
nextCompound(const GotoStep * step,std::shared_ptr<Controller> controller,std::shared_ptr<StepExecutor> * executor,std::shared_ptr<ExecutionBurstController> * burstController) const1420 int ExecutionPlan::nextCompound(const GotoStep* step, std::shared_ptr<Controller> controller,
1421                                 std::shared_ptr<StepExecutor>* executor,
1422                                 std::shared_ptr<ExecutionBurstController>* burstController) const {
1423     VLOG(EXECUTION) << "next: " << toString(*step);
1424     controller->mNextStepIndex = step->gotoStepIndex;
1425     return nextCompound(controller, executor, burstController);
1426 }
1427 
becomeCompoundIfEmpty()1428 void ExecutionPlan::becomeCompoundIfEmpty() {
1429     CHECK(mState != SIMPLE);
1430     if (mState == EMPTY) {
1431         mBody = new CompoundBody();
1432         mState = COMPOUND;
1433     }
1434 }
1435 
createNewExecutionStep(uint32_t sourceModelIndex,const std::shared_ptr<Device> device)1436 ExecutionStep* ExecutionPlan::createNewExecutionStep(uint32_t sourceModelIndex,
1437                                                      const std::shared_ptr<Device> device) {
1438     becomeCompoundIfEmpty();
1439     auto step = std::make_shared<LogicalStep>(std::in_place_type<ExecutionStep>, this,
1440                                               compound()->mSteps.size(), sourceModelIndex, device);
1441     compound()->mSteps.push_back(step);
1442     return step->executionStep();
1443 }
1444 
createNewIfStep()1445 IfStep* ExecutionPlan::createNewIfStep() {
1446     becomeCompoundIfEmpty();
1447     auto step = std::make_shared<LogicalStep>(std::in_place_type<IfStep>);
1448     step->ifStep()->index = compound()->mSteps.size();
1449     compound()->mSteps.push_back(step);
1450     return step->ifStep();
1451 }
1452 
createNewWhileStep()1453 WhileStep* ExecutionPlan::createNewWhileStep() {
1454     becomeCompoundIfEmpty();
1455     auto step = std::make_shared<LogicalStep>(std::in_place_type<WhileStep>);
1456     step->whileStep()->index = compound()->mSteps.size();
1457     compound()->mSteps.push_back(step);
1458     return step->whileStep();
1459 }
1460 
createNewGotoStep()1461 GotoStep* ExecutionPlan::createNewGotoStep() {
1462     becomeCompoundIfEmpty();
1463     auto step = std::make_shared<LogicalStep>(std::in_place_type<GotoStep>);
1464     step->gotoStep()->index = compound()->mSteps.size();
1465     compound()->mSteps.push_back(step);
1466     return step->gotoStep();
1467 }
1468 
// Turns a fresh (EMPTY) plan into a SIMPLE plan that runs the entire model on
// a single device. The SimpleBody is owned by this plan and freed in reset().
void ExecutionPlan::becomeSingleStep(const std::shared_ptr<Device> device,
                                     const ModelBuilder* model) {
    CHECK(mState == EMPTY);
    mBody = new SimpleBody(device, model, mCacheDir, mToken);
    mState = SIMPLE;
}
1475 
recordTemporaryDef(SourceOperandIndex sourceOperandIndex,uint32_t stepIndex)1476 void ExecutionPlan::recordTemporaryDef(SourceOperandIndex sourceOperandIndex, uint32_t stepIndex) {
1477     auto [it, isNew] =
1478             compound()->mTemporaryToDefiningExecutionStep.emplace(sourceOperandIndex, stepIndex);
1479     CHECK(isNew) << "Step " << stepIndex << " redefines temporary operand "
1480                  << toString(sourceOperandIndex) << " already defined by step " << it->second;
1481 }
1482 
dump() const1483 void ExecutionPlan::dump() const {
1484     if (mBody) {
1485         mBody->dump();
1486     } else {
1487         VLOG(COMPILATION) << "EMPTY";
1488     }
1489 }
1490 
reset()1491 void ExecutionPlan::reset() {
1492     if (mBody) {
1493         delete mBody;
1494         mBody = nullptr;
1495     }
1496     mState = EMPTY;
1497 }
1498 
isSimpleCpu() const1499 bool ExecutionPlan::isSimpleCpu() const {
1500     return isSimple() && simple()->mDevice == DeviceManager::getCpuDevice();
1501 }
1502 
forTest_getKind() const1503 ExecutionPlan::Kind ExecutionPlan::forTest_getKind() const {
1504     switch (mState) {
1505         case EMPTY:
1506             return Kind::EMPTY;
1507         case SIMPLE:
1508             nnAssert(mBody);
1509             return mBody->mSuccessfulFinish ? Kind::SIMPLE : Kind::ERROR;
1510         case COMPOUND:
1511             nnAssert(mBody);
1512             return mBody->mSuccessfulFinish ? Kind::COMPOUND : Kind::ERROR;
1513         default:
1514             nnAssert(!"unexpected state");
1515             return Kind::ERROR;
1516     }
1517 }
1518 
forTest_simpleGetDevice() const1519 std::shared_ptr<const Device> ExecutionPlan::forTest_simpleGetDevice() const {
1520     return simple()->mDevice;
1521 }
1522 
forTest_compoundGetSteps() const1523 const std::vector<std::shared_ptr<LogicalStep>>& ExecutionPlan::forTest_compoundGetSteps() const {
1524     return compound()->mSteps;
1525 }
1526 
forTest_hasStepModelOutputsOfUnknownSize() const1527 bool ExecutionPlan::forTest_hasStepModelOutputsOfUnknownSize() const {
1528     return mBody->hasStepModelOutputsOfUnknownSize();
1529 }
1530 
forTest_simpleGetCacheToken() const1531 const uint8_t* ExecutionPlan::forTest_simpleGetCacheToken() const {
1532     return simple()->mToken.getCacheToken();
1533 }
1534 
// Logs a one-line description of a single-step plan: the device it runs on.
void ExecutionPlan::SimpleBody::dump() const {
    VLOG(COMPILATION) << "SIMPLE for " << mDevice->getName();
}
1538 
dump() const1539 void ExecutionPlan::CompoundBody::dump() const {
1540     for (const auto& step : mSteps) {
1541         step->dump();
1542     }
1543 }
1544 
// In a single-step plan the whole model is one "step", so input |index| of
// the main model maps directly to input |index| of the one prepared model.
void ExecutionPlan::SimpleBody::forEachStepRoleOfInput(uint32_t index,
                                                       const StepRoleCallback& callback) const {
    callback(mPreparedModel.get(), IOType::INPUT, index);
}
1549 
// In a single-step plan the whole model is one "step", so output |index| of
// the main model maps directly to output |index| of the one prepared model.
void ExecutionPlan::SimpleBody::forEachStepRoleOfOutput(uint32_t index,
                                                        const StepRoleCallback& callback) const {
    callback(mPreparedModel.get(), IOType::OUTPUT, index);
}
1554 
1555 // Map an input role of the main model to the input/output roles in the step models:
1556 // - An input role of the main model may be used as an input of multiple step models.
1557 // - An input role of the main model should not be used as an output of any step model.
forEachStepRoleOfInput(uint32_t index,const StepRoleCallback & callback) const1558 void ExecutionPlan::CompoundBody::forEachStepRoleOfInput(uint32_t index,
1559                                                          const StepRoleCallback& callback) const {
1560     for (const auto& logicalStep : mSteps) {
1561         if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
1562             // Model input as step model input.
1563             const auto& inputMapping = step->getInputIndexStepModelToMainModel();
1564             for (uint32_t i = 0; i < inputMapping.size(); i++) {
1565                 if (inputMapping[i] == index) {
1566                     callback(step->getPreparedStepModel().get(), IOType::INPUT, i);
1567                 }
1568             }
1569         }
1570     }
1571 }
1572 
1573 // Map an output role of the main model to the input/output roles in the step models:
1574 // - An output role of the main model may only be used as one output of one single step model.
1575 // - An output role of the main model may be used as an input of multiple step models.
forEachStepRoleOfOutput(uint32_t index,const StepRoleCallback & callback) const1576 void ExecutionPlan::CompoundBody::forEachStepRoleOfOutput(uint32_t index,
1577                                                           const StepRoleCallback& callback) const {
1578     bool found = false;
1579     for (const auto& logicalStep : mSteps) {
1580         if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
1581             // Model output as step model output.
1582             if (!found) {
1583                 const auto& outputMapping = step->getOutputIndexStepModelToMainModel();
1584                 for (uint32_t i = 0; i < outputMapping.size(); i++) {
1585                     if (outputMapping[i] == index) {
1586                         callback(step->getPreparedStepModel().get(), IOType::OUTPUT, i);
1587                         found = true;
1588                         break;
1589                     }
1590                 }
1591             }
1592             // Model output as step model input.
1593             const auto& inputToOutputMapping = step->getOutputsAsStepModelInputsIndexToMainModel();
1594             for (uint32_t i = 0; i < inputToOutputMapping.size(); i++) {
1595                 if (inputToOutputMapping[i] == index) {
1596                     callback(step->getPreparedStepModel().get(), IOType::INPUT, i);
1597                 }
1598             }
1599         }
1600     }
1601 }
1602 
partitionTheWork(const std::vector<std::shared_ptr<Device>> & devices,uint32_t preference,uint32_t priority,const std::optional<Deadline> & deadline,ExecutionPlan * plan) const1603 int ModelBuilder::partitionTheWork(const std::vector<std::shared_ptr<Device>>& devices,
1604                                    uint32_t preference, uint32_t priority,
1605                                    const std::optional<Deadline>& deadline,
1606                                    ExecutionPlan* plan) const {
1607     uint32_t sourceModelIndex = plan->getSourceModels().addModel(this);
1608     NN_RETURN_IF_ERROR(partitionTheWorkInternal(sourceModelIndex, devices, preference, priority,
1609                                                 deadline, plan));
1610     int n = plan->finish(preference, priority, deadline);
1611     if (VLOG_IS_ON(COMPILATION)) {
1612         VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: source model: ";
1613         logModelToInfo(makeHidlModel());
1614         plan->dump();
1615     }
1616     return n;
1617 }
1618 
partitionTheWorkInternal(uint32_t sourceModelIndex,const std::vector<std::shared_ptr<Device>> & devices,uint32_t preference,uint32_t priority,const std::optional<Deadline> & deadline,ExecutionPlan * plan) const1619 int ModelBuilder::partitionTheWorkInternal(uint32_t sourceModelIndex,
1620                                            const std::vector<std::shared_ptr<Device>>& devices,
1621                                            uint32_t preference, uint32_t priority,
1622                                            const std::optional<Deadline>& deadline,
1623                                            ExecutionPlan* plan) const {
1624     // This function uses a heuristic approach to partitioning the graph.
1625     // It should be good enough for the first release.
1626 
1627     SourceModels* sourceModels = &plan->getSourceModels();
1628     const size_t deviceCount = devices.size();
1629     const size_t operationCount = mOperations.size();
1630 
1631     VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: "
1632                       << "sourceModelIndex = " << sourceModelIndex << ", "
1633                       << "deviceCount = " << deviceCount << ", "
1634                       << "operationCount = " << operationCount;
1635 
1636     // Figure out where each operation will best execute.
1637     // The value of the vector is the index in the devices vector.
1638     std::vector<int> bestDeviceForOperation(operationCount);
1639     NN_RETURN_IF_ERROR(
1640             findBestDeviceForEachOperation(preference, devices, &bestDeviceForOperation));
1641 
1642     // A special value produced by findBestDeviceForEachOperation meaning that
1643     // this is a control flow operation scheduled for interpreted execution
1644     // (see LogicalStep).
1645     const int kControlFlowInterpreter = deviceCount;
1646 
1647     // If one device will run all the operations, we don't need to split the
1648     // work. This shortcut does not apply when recursively partitioning
1649     // referenced models because our plan representation is flat.
1650     if (sourceModelIndex == kMainModelInSourceModels &&
1651         std::adjacent_find(bestDeviceForOperation.begin(), bestDeviceForOperation.end(),
1652                            std::not_equal_to<int>()) == bestDeviceForOperation.end()) {
1653         const int bestDeviceIndex = bestDeviceForOperation[0];
1654         // Bypass the partitioning process unless the only operation is a
1655         // control flow operation scheduled for interpreted execution.
1656         if (bestDeviceIndex != kControlFlowInterpreter) {
1657             VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: only one best device: "
1658                               << bestDeviceIndex << " = " << devices[bestDeviceIndex]->getName();
1659             plan->becomeSingleStep(devices[bestDeviceIndex], this);
1660             return ANEURALNETWORKS_NO_ERROR;
1661         }
1662     }
1663 
1664     // No easy solution, we need to split the work.
1665 
1666     // We keep track of the operations that are ready to run for each device.
1667     // perDeviceQueue[deviceCount] is for interpreted execution of control flow
1668     // (see LogicalStep).
1669     std::vector<std::queue<uint32_t>> perDeviceQueue(deviceCount + 1);
1670 
1671     // This helper function enqueues the operation on the appropriate queue.
1672     auto enqueueOnAppropriateDevice = [&](uint32_t operationIndex) {
1673         int deviceIndex = bestDeviceForOperation[operationIndex];
1674         perDeviceQueue[deviceIndex].push(operationIndex);
1675         VLOG(COMPILATION) << "enqueueOnAppropriateDevice " << operationIndex << " onto "
1676                           << deviceIndex;
1677     };
1678 
1679     // This helper function finds a device that has operations ready to process.
1680     // We start by looking at the control flow queue, and then look at the
1681     // devices in reverse order (i.e., starting at the end of the devices
1682     // vector). Earlier devices have a chance to prepare more of the inputs
1683     // required by other devices. This function returns -1 if all queues are
1684     // empty.
1685     auto findNextDeviceToProcess = [&]() -> int {
1686         for (int i = perDeviceQueue.size() - 1; i >= 0; i--) {
1687             if (!perDeviceQueue[i].empty()) {
1688                 return i;
1689             }
1690         }
1691         return -1;
1692     };
1693 
1694     OperandTracker tracker(this, enqueueOnAppropriateDevice);
1695     // For each iteration of this loop, we'll create an execution step.
1696     while (true) {
1697         // Find the device we'll do this step for.
1698         int deviceIndex = findNextDeviceToProcess();
1699         VLOG(COMPILATION) << "findNextDeviceToProcess: " << deviceIndex;
1700         if (deviceIndex < 0) {
1701             break;
1702         }
1703 
1704         // Assign as much as possible to this device.
1705         auto& queue = perDeviceQueue[deviceIndex];
1706         if (deviceIndex != kControlFlowInterpreter) {
1707             ExecutionStep* step =
1708                     plan->createNewExecutionStep(sourceModelIndex, devices[deviceIndex]);
1709             while (!queue.empty()) {
1710                 uint32_t operationIndex = queue.front();
1711                 queue.pop();
1712                 int n = step->addOperation(operationIndex);
1713                 if (n != ANEURALNETWORKS_NO_ERROR) {
1714                     LOG(ERROR) << "failed to add operation " << operationIndex << " to step";
1715                     return n;
1716                 }
1717                 tracker.markProcessed(operationIndex, enqueueOnAppropriateDevice);
1718             }
1719         } else {
1720             while (!queue.empty()) {
1721                 uint32_t operationIndex = queue.front();
1722                 queue.pop();
1723                 const Operation& operation = getOperation(operationIndex);
1724                 if (operation.type == OperationType::IF) {
1725                     namespace op = operation_if;
1726                     const Operand& thenOperand =
1727                             getOperand(operation.inputs[op::kThenModelOperand]);
1728                     const Operand& elseOperand =
1729                             getOperand(operation.inputs[op::kElseModelOperand]);
1730                     const ModelBuilder* thenModel = getReferencedModel(thenOperand);
1731                     const ModelBuilder* elseModel = getReferencedModel(elseOperand);
1732                     uint32_t thenModelIndex = sourceModels->addModel(thenModel);
1733                     uint32_t elseModelIndex = sourceModels->addModel(elseModel);
1734 
1735                     // Emits the following:
1736                     // Index  Step
1737                     //   i    if then=(i + 1) else=(j + 1)
1738                     //  ...   (then model steps)
1739                     //   j    goto k
1740                     //  ...   (else model steps)
1741                     //   k    (steps after the IF)
1742                     IfStep* ifStep = plan->createNewIfStep();
1743                     ifStep->conditionOperandIndex = SourceOperandIndex(
1744                             sourceModelIndex, operation.inputs[op::kCondBoolOperand]);
1745                     ifStep->thenStepIndex = plan->getNextStepIndex();
1746                     NN_RETURN_IF_ERROR(thenModel->partitionTheWorkInternal(
1747                             thenModelIndex, devices, preference, priority, deadline, plan));
1748                     GotoStep* afterThenBranch = plan->createNewGotoStep();
1749                     ifStep->elseStepIndex = plan->getNextStepIndex();
1750                     NN_RETURN_IF_ERROR(elseModel->partitionTheWorkInternal(
1751                             elseModelIndex, devices, preference, priority, deadline, plan));
1752                     afterThenBranch->gotoStepIndex = plan->getNextStepIndex();
1753 
1754                     // Outer model operands.
1755                     for (uint32_t i = op::kFirstInput, n = operation.inputs.size(); i < n; ++i) {
1756                         ifStep->outerInputOperands.emplace_back(sourceModelIndex,
1757                                                                 operation.inputs[i]);
1758                     }
1759                     for (uint32_t i = 0, n = operation.outputs.size(); i < n; ++i) {
1760                         ifStep->outerOutputOperands.emplace_back(sourceModelIndex,
1761                                                                  operation.outputs[i]);
1762                     }
1763                     // Then model operands.
1764                     for (uint32_t i = 0, n = thenModel->inputCount(); i < n; ++i) {
1765                         ifStep->thenBranchInputOperands.emplace_back(
1766                                 thenModelIndex, thenModel->getInputOperandIndex(i));
1767                     }
1768                     for (uint32_t i = 0, n = thenModel->outputCount(); i < n; ++i) {
1769                         ifStep->thenBranchOutputOperands.emplace_back(
1770                                 thenModelIndex, thenModel->getOutputOperandIndex(i));
1771                     }
1772                     // Else model operands.
1773                     for (uint32_t i = 0, n = elseModel->inputCount(); i < n; ++i) {
1774                         ifStep->elseBranchInputOperands.emplace_back(
1775                                 elseModelIndex, elseModel->getInputOperandIndex(i));
1776                     }
1777                     for (uint32_t i = 0, n = elseModel->outputCount(); i < n; ++i) {
1778                         ifStep->elseBranchOutputOperands.emplace_back(
1779                                 elseModelIndex, elseModel->getOutputOperandIndex(i));
1780                     }
1781                 } else if (operation.type == OperationType::WHILE) {
1782                     namespace op = operation_while;
1783                     const Operand& condOperand =
1784                             getOperand(operation.inputs[op::kCondModelOperand]);
1785                     const Operand& bodyOperand =
1786                             getOperand(operation.inputs[op::kBodyModelOperand]);
1787                     const ModelBuilder* condModel = getReferencedModel(condOperand);
1788                     const ModelBuilder* bodyModel = getReferencedModel(bodyOperand);
1789                     uint32_t condModelIndex = sourceModels->addModel(condModel);
1790                     uint32_t bodyModelIndex = sourceModels->addModel(bodyModel);
1791 
1792                     // Emits the following:
1793                     // Index  Step
1794                     //   i    while cond=(i + 1) body=(j + 1) exit=(k + 1)
1795                     //  ...   (cond model steps)
1796                     //   j    goto i
1797                     //  ...   (body model steps)
1798                     //   k    goto i
1799                     //  ...   (steps after the WHILE)
1800                     //
1801                     //  Note that WhileStep has WhileState associated with it.
1802                     WhileStep* whileStep = plan->createNewWhileStep();
1803                     whileStep->condStepIndex = plan->getNextStepIndex();
1804                     NN_RETURN_IF_ERROR(condModel->partitionTheWorkInternal(
1805                             condModelIndex, devices, preference, priority, deadline, plan));
1806                     GotoStep* afterCond = plan->createNewGotoStep();
1807                     afterCond->gotoStepIndex = whileStep->index;
1808                     whileStep->bodyStepIndex = plan->getNextStepIndex();
1809                     NN_RETURN_IF_ERROR(bodyModel->partitionTheWorkInternal(
1810                             bodyModelIndex, devices, preference, priority, deadline, plan));
1811                     GotoStep* afterBody = plan->createNewGotoStep();
1812                     afterBody->gotoStepIndex = whileStep->index;
1813                     whileStep->exitStepIndex = plan->getNextStepIndex();
1814 
1815                     // Outer model operands.
1816                     for (uint32_t i = op::kFirstInput, n = operation.inputs.size(); i < n; ++i) {
1817                         whileStep->outerInputOperands.emplace_back(sourceModelIndex,
1818                                                                    operation.inputs[i]);
1819                     }
1820                     for (uint32_t i = 0, n = operation.outputs.size(); i < n; ++i) {
1821                         whileStep->outerOutputOperands.emplace_back(sourceModelIndex,
1822                                                                     operation.outputs[i]);
1823                     }
1824                     // Cond model operands.
1825                     for (uint32_t i = 0, n = condModel->inputCount(); i < n; ++i) {
1826                         whileStep->condInputOperands.emplace_back(
1827                                 condModelIndex, condModel->getInputOperandIndex(i));
1828                     }
1829                     whileStep->condOutputOperand =
1830                             SourceOperandIndex(condModelIndex, condModel->getOutputOperandIndex(0));
1831                     // Body model operands.
1832                     for (uint32_t i = 0, n = bodyModel->inputCount(); i < n; ++i) {
1833                         whileStep->bodyInputOperands.emplace_back(
1834                                 bodyModelIndex, bodyModel->getInputOperandIndex(i));
1835                     }
1836                     for (uint32_t i = 0, n = bodyModel->outputCount(); i < n; ++i) {
1837                         whileStep->bodyOutputOperands.emplace_back(
1838                                 bodyModelIndex, bodyModel->getOutputOperandIndex(i));
1839                     }
1840                 } else {
1841                     CHECK(false) << toString(operation.type) << " is not a control flow operation";
1842                 }
1843                 tracker.markProcessed(operationIndex, enqueueOnAppropriateDevice);
1844             }
1845         }
1846     }
1847     return ANEURALNETWORKS_NO_ERROR;
1848 }
1849 
getPerformance(uint32_t preference,const std::shared_ptr<Device> device) const1850 float ModelBuilder::getPerformance(uint32_t preference,
1851                                    const std::shared_ptr<Device> device) const {
1852     // Note that we will call this method multiple times per compilation with
1853     // the same arguments if there are nested control flow operations and we
1854     // decide to execute the outer operation on the ExecutionPlan::next()
1855     // interpreter.
1856     //
1857     // This is a potential compilation performance problem. To work around it,
1858     // the performance value could be cached for the duration of a compilation.
1859     float perf = 0;
1860     const size_t operationCount = mOperations.size();
1861     for (size_t operationIndex = 0; operationIndex < operationCount; operationIndex++) {
1862         perf += getPerformance(preference, device, operationIndex);
1863     }
1864     return perf;
1865 }
1866 
getPerformance(uint32_t preference,const std::shared_ptr<Device> device,uint32_t operationIndex) const1867 float ModelBuilder::getPerformance(uint32_t preference, const std::shared_ptr<Device> device,
1868                                    uint32_t operationIndex) const {
1869     auto applyPreference = [preference](const PerformanceInfo& perf) {
1870         return preference == ANEURALNETWORKS_PREFER_LOW_POWER ? perf.powerUsage : perf.execTime;
1871     };
1872 
1873     const Operation& operation = getOperation(operationIndex);
1874 
1875     if (operation.type == OperationType::IF) {
1876         namespace op = operation_if;
1877         const Operand& thenOperand = getOperand(operation.inputs[op::kThenModelOperand]);
1878         const Operand& elseOperand = getOperand(operation.inputs[op::kElseModelOperand]);
1879         const ModelBuilder* thenModel = getReferencedModel(thenOperand);
1880         const ModelBuilder* elseModel = getReferencedModel(elseOperand);
1881         return applyPreference(device->getIfPerformance()) +
1882                0.5 * (thenModel->getPerformance(preference, device) +
1883                       elseModel->getPerformance(preference, device));
1884     }
1885 
1886     if (operation.type == OperationType::WHILE) {
1887         namespace op = operation_while;
1888         const Operand& condOperand = getOperand(operation.inputs[op::kCondModelOperand]);
1889         const Operand& bodyOperand = getOperand(operation.inputs[op::kBodyModelOperand]);
1890         const ModelBuilder* condModel = getReferencedModel(condOperand);
1891         const ModelBuilder* bodyModel = getReferencedModel(bodyOperand);
1892         return applyPreference(device->getWhilePerformance()) +
1893                condModel->getPerformance(preference, device) +
1894                bodyModel->getPerformance(preference, device);
1895     }
1896 
1897     // TODO This assumes that the type is dictated by the first operand. This is
1898     // currently the case but is not a safe assumption to make in the long term.
1899     const uint32_t operandIndex = operation.inputs[0];
1900     const OperandType operandType = mOperands[operandIndex].type;
1901     switch (operandType) {
1902         case OperandType::FLOAT32:
1903             if (mRelaxComputationFloat32toFloat16) {
1904                 return applyPreference(device->getRelaxedFloat32toFloat16PerformanceScalar());
1905             }
1906             break;
1907         case OperandType::TENSOR_FLOAT32:
1908             if (mRelaxComputationFloat32toFloat16) {
1909                 return applyPreference(device->getRelaxedFloat32toFloat16PerformanceTensor());
1910             }
1911             break;
1912         default:
1913             break;
1914     }
1915 
1916     return applyPreference(device->getPerformance(operandType));
1917 }
1918 
isControlFlowOperationWithOperandOfUnknownSize(uint32_t operationIndex) const1919 bool ModelBuilder::isControlFlowOperationWithOperandOfUnknownSize(uint32_t operationIndex) const {
1920     auto containsUnknownSize = [](const ModelBuilder* model,
1921                                   const std::vector<uint32_t>& operandIndexes) {
1922         for (uint32_t operandIndex : operandIndexes) {
1923             if (hasUnknownSize(model->getOperand(operandIndex))) {
1924                 return true;
1925             }
1926         }
1927         return false;
1928     };
1929 
1930     const Operation& operation = getOperation(operationIndex);
1931 
1932     if (operation.type == OperationType::IF) {
1933         namespace op = operation_if;
1934         const Operand& thenOperand = getOperand(operation.inputs[op::kThenModelOperand]);
1935         const Operand& elseOperand = getOperand(operation.inputs[op::kElseModelOperand]);
1936         const ModelBuilder* thenModel = getReferencedModel(thenOperand);
1937         const ModelBuilder* elseModel = getReferencedModel(elseOperand);
1938         return containsUnknownSize(this, operation.inputs) ||
1939                containsUnknownSize(this, operation.outputs) ||
1940                containsUnknownSize(thenModel, thenModel->getInputOperandIndexes()) ||
1941                containsUnknownSize(thenModel, thenModel->getOutputOperandIndexes()) ||
1942                containsUnknownSize(elseModel, elseModel->getInputOperandIndexes()) ||
1943                containsUnknownSize(elseModel, elseModel->getOutputOperandIndexes());
1944     }
1945 
1946     if (operation.type == OperationType::WHILE) {
1947         namespace op = operation_while;
1948         const Operand& condOperand = getOperand(operation.inputs[op::kCondModelOperand]);
1949         const Operand& bodyOperand = getOperand(operation.inputs[op::kBodyModelOperand]);
1950         const ModelBuilder* condModel = getReferencedModel(condOperand);
1951         const ModelBuilder* bodyModel = getReferencedModel(bodyOperand);
1952         return containsUnknownSize(this, operation.inputs) ||
1953                containsUnknownSize(this, operation.outputs) ||
1954                containsUnknownSize(condModel, condModel->getInputOperandIndexes()) ||
1955                containsUnknownSize(condModel, condModel->getOutputOperandIndexes()) ||
1956                containsUnknownSize(bodyModel, bodyModel->getInputOperandIndexes()) ||
1957                containsUnknownSize(bodyModel, bodyModel->getOutputOperandIndexes());
1958     }
1959 
1960     // Not a control flow operation.
1961     return false;
1962 }
1963 
supportedByControlFlowInterpreter(uint32_t operationIndex) const1964 bool ModelBuilder::supportedByControlFlowInterpreter(uint32_t operationIndex) const {
1965     const Operation& operation = getOperation(operationIndex);
1966     return (operation.type == OperationType::IF || operation.type == OperationType::WHILE) &&
1967            // The partitioner does not support dynamic temporaries (b/132458982).
1968            !isControlFlowOperationWithOperandOfUnknownSize(operationIndex);
1969 }
1970 
1971 namespace {
1972 
1973 // This class determines whether a given device can execute a given operation
1974 class CanDo {
1975    public:
CanDo()1976     CanDo() {}
1977 
initialize(const MetaModel & metaModel,std::shared_ptr<Device> device)1978     void initialize(const MetaModel& metaModel, std::shared_ptr<Device> device) {
1979         mSupportsOperationByIndex = device->getSupportedOperations(metaModel);
1980     }
1981 
check(size_t operationIndex) const1982     bool check(size_t operationIndex) const { return mSupportsOperationByIndex[operationIndex]; }
1983 
1984    private:
1985     std::vector<bool> mSupportsOperationByIndex;
1986 };
1987 
1988 }  // anonymous namespace
1989 
// Chooses, for each operation of this model, which of |devices| should run
// it, writing the chosen index into (*bestDeviceForOperation)[operationIndex].
// A stored value equal to devices.size() is a sentinel meaning "run on the
// ExecutionPlan control flow interpreter" rather than on a real device.
// Returns ANEURALNETWORKS_NO_ERROR on success, or ANEURALNETWORKS_BAD_DATA if
// some operation is supported by no device.
int ModelBuilder::findBestDeviceForEachOperation(
        uint32_t preference, const std::vector<std::shared_ptr<Device>>& devices,
        std::vector<int>* bestDeviceForOperation) const {
    const MetaModel metaModel(makeHidlModel(), DeviceManager::get()->strictSlicing());

    // Query each device once, up front, for the operations it supports.
    const size_t deviceCount = devices.size();
    std::vector<CanDo> canDo(deviceCount);
    for (size_t deviceIndex = 0; deviceIndex < deviceCount; deviceIndex++) {
        canDo[deviceIndex].initialize(metaModel, devices[deviceIndex]);
    }

    // Figure out the best driver for each operation.
    const size_t operationCount = mOperations.size();
    for (size_t operationIndex = 0; operationIndex < operationCount; operationIndex++) {
        const Operation& operation = getOperation(operationIndex);
        // Find which device, including CPU fallback, gives the best performance for this operation.
        int bestChoice = -1;  // -1 means "no capable device found yet".

        if (isControlFlowOperationWithOperandOfUnknownSize(operationIndex)) {
            // Do not schedule control flow operations with unknown size to
            // non-CPU devices because this is not supported by the 1.3 HAL.
            // See http://b/159076604#comment5.
            auto cpuDeviceIterator =
                    std::find(devices.begin(), devices.end(), DeviceManager::getCpuDevice());
            if (cpuDeviceIterator != devices.end()) {
                int cpuDeviceIndex = cpuDeviceIterator - devices.begin();
                if (canDo[cpuDeviceIndex].check(operationIndex)) {
                    bestChoice = cpuDeviceIndex;
                }
            }
        } else {
            float bestPerfVal = 0.0;  // Do not check bestPerfVal if bestChoice < 0.
            for (size_t deviceIndex = 0; deviceIndex < deviceCount; deviceIndex++) {
                const auto& device = devices[deviceIndex];
                if (canDo[deviceIndex].check(operationIndex)) {
                    // Lower perfVal is better; on an exact tie, prefer the CPU
                    // device.
                    const float perfVal = getPerformance(preference, device, operationIndex);
                    if (bestChoice < 0 || perfVal < bestPerfVal ||
                        (perfVal == bestPerfVal && device == DeviceManager::getCpuDevice())) {
                        bestChoice = deviceIndex;
                        bestPerfVal = perfVal;
                    }
                } else {
                    // Somewhat noisy logging, but only place where the user of NNAPI can get
                    // feedback on why an operation was not run on a specific device.
                    //
                    // Logs O(operationCount * deviceCount) times, but typically deviceCount is
                    // very small.
                    VLOG(COMPILATION) << "Device " << device->getName() << " can't do operation "
                                      << toString(operation.type);
                }
            }
        }

        if (bestChoice < 0) {
            LOG(ERROR) << "No driver can do operation " << toString(operation.type);
            return ANEURALNETWORKS_BAD_DATA;
        } else if (devices[bestChoice] == DeviceManager::getCpuDevice() &&
                   supportedByControlFlowInterpreter(operationIndex)) {
            // Run control flow on the ExecutionPlan::next() interpreter and try
            // to delegate referenced models.
            // The sentinel stored here is deviceCount (one past the last real
            // device index); the "-1" printed by the VLOG below appears to be
            // historical -- NOTE(review): confirm before relying on the log.
            const int kControlFlowInterpreter = deviceCount;
            (*bestDeviceForOperation)[operationIndex] = kControlFlowInterpreter;
            VLOG(COMPILATION) << "ModelBuilder::findBestDeviceForEachOperation("
                              << toString(operation.type) << ") = -1"
                              << " (NNAPI)";
        } else {
            (*bestDeviceForOperation)[operationIndex] = bestChoice;
            VLOG(COMPILATION) << "ModelBuilder::findBestDeviceForEachOperation("
                              << toString(operation.type) << ") = " << bestChoice << " ("
                              << devices[bestChoice]->getName() << ")";
        }
    }
    return ANEURALNETWORKS_NO_ERROR;
}
2064 
2065 }  // namespace nn
2066 }  // namespace android
2067