/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define LOG_TAG "ExecutionPlan"

#include "ExecutionPlan.h"

#include <fcntl.h>
#include <openssl/sha.h>
#include <sys/stat.h>
#include <sys/types.h>

#include <algorithm>
#include <functional>
#include <map>
#include <memory>
#include <mutex>
#include <queue>
#include <set>
#include <string>
#include <type_traits>
#include <unordered_set>
#include <utility>
#include <vector>

#include "BurstBuilder.h"
#include "Callbacks.h"
#include "CompilationBuilder.h"
#include "ControlFlow.h"
#include "CpuExecutor.h"
#include "ExecutionBuilder.h"
#include "ExecutionBurstController.h"
#include "GraphDump.h"
#include "Manager.h"
#include "MetaModel.h"
#include "ModelBuilder.h"
#include "OperationsUtils.h"
#include "TokenHasher.h"
#include "Tracing.h"
#include "TypeManager.h"
#include "Utils.h"

namespace android {
namespace nn {

namespace {

using namespace hal;

// The index of the main model in SourceModels.
constexpr uint32_t kMainModelInSourceModels = 0;

// Compiles the model on the device.
// If compilation caching is available, then depending on ExecutionPlan::mState, the token may
// have been initialized only from the user-provided token (SIMPLE body), or may already have
// been re-hashed with the indices of the operations to be executed (COMPOUND body). This
// function re-hashes the token further with the device name, the device version string, the
// execution preference, and the compilation priority.
int compile(const Device& device, const ModelBuilder& model, int executionPreference,
            int compilationPriority, const std::optional<Deadline>& deadline,
            const std::string& cacheDir, TokenHasher* token,
            std::shared_ptr<PreparedModel>* preparedModel) {
    CHECK(token != nullptr);
    CHECK(preparedModel != nullptr);
    *preparedModel = nullptr;

    std::optional<CacheToken> cacheToken;
    if (device.isCachingSupported() && token->ok() &&
        token->updateFromString(device.getName().c_str()) &&
        token->updateFromString(device.getVersionString().c_str()) &&
        token->update(&executionPreference, sizeof(executionPreference)) &&
        token->update(&compilationPriority, sizeof(compilationPriority)) && token->finish()) {
        cacheToken.emplace(token->getCacheToken());
    }

    const ModelFactory makeModel = [&model] { return model.makeHidlModel(); };
    const ExecutionPreference preference = static_cast<ExecutionPreference>(executionPreference);
    const Priority priority = convertToHalPriority(compilationPriority);
    const auto [n, returnedPreparedModel] =
            device.prepareModel(makeModel, preference, priority, deadline, cacheDir, cacheToken);
    *preparedModel = returnedPreparedModel;
    return n;
}

typedef std::function<void(uint32_t)> OperationReadyCallback;

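// Copies the extra parameters of fromOperand (per-channel quantization parameters or extension
// data, if any) to the operand at toOperandIndex of model. Returns ANEURALNETWORKS_BAD_DATA if
// the extraParams discriminator is inconsistent with the operand type.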
int copyOperandExtraParams(ModelBuilder& model, uint32_t toOperandIndex,
                           const Operand& fromOperand) {
    if (fromOperand.type == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL &&
        fromOperand.extraParams.getDiscriminator() ==
                OperandExtraParams::hidl_discriminator::channelQuant) {
        auto& fromChannelQuant = fromOperand.extraParams.channelQuant();
        ANeuralNetworksSymmPerChannelQuantParams toChannelQuant = {
                .channelDim = fromChannelQuant.channelDim,
                .scaleCount = static_cast<uint32_t>(fromChannelQuant.scales.size()),
                .scales = fromChannelQuant.scales.data(),
        };
        return model.setOperandSymmPerChannelQuantParams(toOperandIndex, toChannelQuant);
    } else if (isExtensionOperandType(fromOperand.type) &&
               fromOperand.extraParams.getDiscriminator() ==
                       OperandExtraParams::hidl_discriminator::extension) {
        hidl_vec<uint8_t> extensionData = fromOperand.extraParams.extension();
        return model.setOperandExtensionData(toOperandIndex, extensionData.data(),
                                             extensionData.size());
    } else if (fromOperand.extraParams.getDiscriminator() !=
                       OperandExtraParams::hidl_discriminator::none ||
               fromOperand.type == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL) {
        LOG(ERROR) << "Type " << toString(fromOperand.type)
                   << " has an unexpected extraParams discriminator: "
                   << static_cast<int>(fromOperand.extraParams.getDiscriminator());
        return ANEURALNETWORKS_BAD_DATA;
    } else {
        return ANEURALNETWORKS_NO_ERROR;
    }
}

// This class tracks whether we know the value of an operand as operations
// are processed.
class OperandTracker {
   public:
    // Creates the tracker for this model. Figures out which operations can be
    // executed right away and calls cb for each one of them.
    OperandTracker(const ModelBuilder* model, OperationReadyCallback cb);
    // Marks the specified operation as having been processed. Now that the
    // outputs of the operation are known, new operations may become ready to
    // run; calls cb for each one of them.
    void markProcessed(uint32_t operationIndex, OperationReadyCallback cb);

   private:
    const ModelBuilder* mModel;
    std::multimap<uint32_t, uint32_t> mOperandToOperations;
    std::vector<uint32_t> mUnknownInputCount;  // For each operation
};
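
// A minimal usage sketch (illustrative only; not part of the runtime): the
// partitioner walks the graph in topological order by seeding a work queue
// with the initially ready operations and marking each one processed as it
// is consumed:
//
//     std::queue<uint32_t> ready;
//     OperandTracker tracker(model, [&ready](uint32_t op) { ready.push(op); });
//     while (!ready.empty()) {
//         const uint32_t op = ready.front();
//         ready.pop();
//         // ... assign operation `op` to a device ...
//         tracker.markProcessed(op, [&ready](uint32_t next) { ready.push(next); });
//     }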

OperandTracker::OperandTracker(const ModelBuilder* model, OperationReadyCallback cb)
    : mModel(model) {
    const auto& operations = mModel->getOperations();
    mUnknownInputCount.resize(operations.size());
    for (uint32_t operationIndex = 0; operationIndex < operations.size(); operationIndex++) {
        const Operation& operation = operations[operationIndex];
        uint32_t count = 0;
        for (uint32_t operandIndex : operation.inputs) {
            auto lifetime = mModel->getOperand(operandIndex).lifetime;
            if (lifetime == OperandLifeTime::TEMPORARY_VARIABLE ||
                lifetime == OperandLifeTime::SUBGRAPH_OUTPUT) {
                count++;
                mOperandToOperations.emplace(operandIndex, operationIndex);
            }
        }
        if (count == 0) {
            cb(operationIndex);
        }
        mUnknownInputCount[operationIndex] = count;
    }
}

void OperandTracker::markProcessed(uint32_t operationIndex, OperationReadyCallback cb) {
    // Mark all its outputs as known.
    const Operation& operation = mModel->getOperations()[operationIndex];
    for (uint32_t operandIndex : operation.outputs) {
        auto range = mOperandToOperations.equal_range(operandIndex);
        for (auto i = range.first; i != range.second; i++) {
            uint32_t& count = mUnknownInputCount[i->second];
            if (--count == 0) {
                cb(i->second);
            }
        }
    }
}

}  // namespace

ExecutionStep::ExecutionStep(ExecutionPlan* plan, uint32_t stepIndex, uint32_t sourceModelIndex,
                             std::shared_ptr<Device> device)
    : mPlan(plan),
      mIndex(stepIndex),
      mSourceModelIndex(sourceModelIndex),
      mStepModel(),
      mDevice(device),
      mToken(plan->getCacheToken()) {}

// Adds an operand if it has not been added already.
// Sets the index in the step model for the corresponding operand.
int ExecutionStep::addOperand(uint32_t sourceOperandIndex, uint32_t* stepOperandIndex,
                              OperandKind kind) {
    // Have we added this operand already?
    auto i = mOperandMap.find(sourceOperandIndex);
    if (i != mOperandMap.end()) {
        CHECK(kind == INPUT);
        *stepOperandIndex = i->second;
        return ANEURALNETWORKS_NO_ERROR;
    }

    // First time we add this operand.
    *stepOperandIndex = mStepModel.operandCount();
    mOperandMap.emplace(sourceOperandIndex, *stepOperandIndex);

    // Add the operand to the step model.
    const ModelBuilder& sourceModel = *getSourceModel();
    const Operand& operand = sourceModel.getOperand(sourceOperandIndex);
    ANeuralNetworksOperandType type = {
            .type = static_cast<int32_t>(operand.type),
            .dimensionCount = static_cast<uint32_t>(operand.dimensions.size()),
            .dimensions = operand.dimensions.size() > 0 ? operand.dimensions.data() : nullptr,
            .scale = operand.scale,
            .zeroPoint = operand.zeroPoint,
    };

    int n = mStepModel.addOperand(type);
    if (n != ANEURALNETWORKS_NO_ERROR) {
        LOG(ERROR) << "Previous error occurred when partitioning the graph";
        return n;
    }

    n = copyOperandExtraParams(mStepModel, *stepOperandIndex, operand);
    if (n != ANEURALNETWORKS_NO_ERROR) {
        LOG(ERROR) << "Error when copying extra parameters to the operand";
        return n;
    }

    // Sets its value.
    switch (operand.lifetime) {
        case OperandLifeTime::CONSTANT_COPY: {
            const uint8_t* data = sourceModel.getPointerToOperandValue(operand.location.offset);
            n = mStepModel.setOperandValue(*stepOperandIndex, data, operand.location.length);
            if (n != ANEURALNETWORKS_NO_ERROR) {
                LOG(ERROR) << "Previous error occurred when partitioning the graph";
                return n;
            }
        } break;
        case OperandLifeTime::CONSTANT_REFERENCE: {
            const Memory* memory = sourceModel.getMemories()[operand.location.poolIndex];
            n = mStepModel.setOperandValueFromMemory(
                    *stepOperandIndex, memory, operand.location.offset, operand.location.length);
            if (n != ANEURALNETWORKS_NO_ERROR) {
                LOG(ERROR) << "Previous error occurred when partitioning the graph";
                return n;
            }
        } break;
        case OperandLifeTime::NO_VALUE: {
            n = mStepModel.setOperandValue(*stepOperandIndex, nullptr, 0);
            if (n != ANEURALNETWORKS_NO_ERROR) {
                LOG(ERROR) << "Previous error occurred when partitioning the graph";
                return n;
            }
        } break;
        case OperandLifeTime::TEMPORARY_VARIABLE: {  // handled similarly to SUBGRAPH_OUTPUT
            if (kind == INPUT) {
                // The first time we've seen this operand is as an
                // input. That means it must be defined by a
                // different partition, and is an input to this one.
                mTempsAsStepModelInputs.emplace_back(sourceOperandIndex, *stepOperandIndex);
            } else {
                // The first time we've seen this operand is as an
                // output. It may be an input to a different
                // partition, so keep track of it.
                mPlan->recordTemporaryDef(SourceOperandIndex(mSourceModelIndex, sourceOperandIndex),
                                          mIndex);
            }
        } break;
        case OperandLifeTime::SUBGRAPH_INPUT: {
            mModelInputs.emplace_back(sourceOperandIndex, *stepOperandIndex);
        } break;
        case OperandLifeTime::SUBGRAPH_OUTPUT: {  // handled similarly to TEMPORARY_VARIABLE
            if (kind == INPUT) {
                // The first time we've seen this operand is as an
                // input. That means it must be defined by a
                // different partition, and is an input to this one.
                mOutputsAsStepModelInputs.emplace_back(sourceOperandIndex, *stepOperandIndex);
            } else {
                // The first time we've seen this operand is as an
                // output.
                mModelOutputs.emplace_back(sourceOperandIndex, *stepOperandIndex);
            }
        } break;
        case OperandLifeTime::SUBGRAPH: {
            const ModelBuilder* model = sourceModel.getReferencedModel(operand);
            n = mStepModel.setOperandValueFromModel(*stepOperandIndex, model);
            if (n != ANEURALNETWORKS_NO_ERROR) {
                LOG(ERROR) << "Previous error occurred when partitioning the graph";
                return n;
            }
        } break;
        default: {
            CHECK(!"unexpected");
        } break;
    }

    return ANEURALNETWORKS_NO_ERROR;
}

int ExecutionStep::addOperation(int operationIndex) {
    const Operation& operation = getSourceModel()->getOperation(operationIndex);
    if (mToken.ok()) {
        mToken.update(&mSourceModelIndex, sizeof(mSourceModelIndex));
        mToken.update(&operationIndex, sizeof(operationIndex));
    }

    // Convert the input and output operand indexes.
    //
    // We expect operations to be added in topological order. Therefore:
    //
    // - We may not have seen an input if it is a model input, a
    //   constant, or an operand written by a different partition.
    //
    // - We should not have seen any outputs.
    auto addOperands = [this](const hidl_vec<uint32_t>& sourceModelOperands,
                              std::vector<uint32_t>* stepModelOperands, OperandKind kind) -> int {
        const uint32_t operandCount = static_cast<uint32_t>(sourceModelOperands.size());
        for (uint32_t i = 0; i < operandCount; i++) {
            NN_RETURN_IF_ERROR(addOperand(sourceModelOperands[i], &stepModelOperands->at(i), kind));
        }
        return ANEURALNETWORKS_NO_ERROR;
    };

    const uint32_t inputCount = static_cast<uint32_t>(operation.inputs.size());
    const uint32_t outputCount = static_cast<uint32_t>(operation.outputs.size());
    std::vector<uint32_t> inputs(inputCount);
    std::vector<uint32_t> outputs(outputCount);
    NN_RETURN_IF_ERROR(addOperands(operation.inputs, &inputs, INPUT));
    NN_RETURN_IF_ERROR(addOperands(operation.outputs, &outputs, OUTPUT));
    return mStepModel.addOperation(static_cast<uint32_t>(operation.type), inputCount, inputs.data(),
                                   outputCount, outputs.data());
}

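// Binds each input and output of this step's executor to its backing storage: a partition
// boundary temporary (at an offset within temporaryMemory), a main model input or output, or
// a constant at a control flow boundary.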
void ExecutionStep::mapInputsAndOutputs(
        std::shared_ptr<StepExecutor> executor, const Memory* temporaryMemory,
        const std::map<SourceOperandIndex, uint32_t>& sourceOperandToOffsetOfTemporary,
        const std::map<SourceOperandIndex, uint32_t>& sourceOperandToInputIndex,
        const std::map<SourceOperandIndex, uint32_t>& sourceOperandToOutputIndex,
        const std::map<SourceOperandIndex, ConstantReferenceLocation>&
                sourceOperandToConstantReference) const {
    auto mapInput = [&](uint32_t stepModelOperandIndex, uint32_t stepInputIndex) {
        SourceOperandIndex sourceOperandIndex(mSourceModelIndex, stepModelOperandIndex);
        if (auto it = sourceOperandToOffsetOfTemporary.find(sourceOperandIndex);
            it != sourceOperandToOffsetOfTemporary.end()) {
            executor->setInputFromMemory(stepInputIndex, temporaryMemory, it->second);
        } else if (auto it = sourceOperandToInputIndex.find(sourceOperandIndex);
                   it != sourceOperandToInputIndex.end()) {
            executor->mapInput(it->second, stepInputIndex);
        } else if (auto it = sourceOperandToOutputIndex.find(sourceOperandIndex);
                   it != sourceOperandToOutputIndex.end()) {
            executor->mapOutputToInput(it->second, stepInputIndex);
        } else if (auto it = sourceOperandToConstantReference.find(sourceOperandIndex);
                   it != sourceOperandToConstantReference.end()) {
            // Constant partition boundary operand. This could be an IF branch
            // model input or a WHILE variable initializer.
            executor->setInputFromMemory(stepInputIndex, it->second.memory, it->second.offset);
        } else {
            CHECK(false) << "Cannot map step input " << stepInputIndex << " from operand "
                         << toString(sourceOperandIndex);
        }
    };
    auto mapOutput = [&](uint32_t stepModelOperandIndex, uint32_t stepOutputIndex) {
        SourceOperandIndex sourceOperandIndex(mSourceModelIndex, stepModelOperandIndex);
        if (auto it = sourceOperandToOffsetOfTemporary.find(sourceOperandIndex);
            it != sourceOperandToOffsetOfTemporary.end()) {
            executor->setOutputFromMemory(stepOutputIndex, temporaryMemory, it->second);
        } else if (auto it = sourceOperandToOutputIndex.find(sourceOperandIndex);
                   it != sourceOperandToOutputIndex.end()) {
            executor->mapOutput(it->second, stepOutputIndex);
        } else {
            CHECK(false) << "Cannot map step output " << stepOutputIndex << " from operand "
                         << toString(sourceOperandIndex);
        }
    };
    for (uint32_t i = 0, n = mStepModelInputs.size(); i < n; ++i) {
        mapInput(mStepModelInputs[i].first, i);
    }
    for (uint32_t i = 0, n = mStepModelOutputs.size(); i < n; ++i) {
        mapOutput(mStepModelOutputs[i].first, i);
    }
}

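// For each temporary operand that is consumed across a partition boundary (as a step model
// input of some ExecutionStep, or as an input of an IF or WHILE), records it as a step model
// output of the ExecutionStep that defines it.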
void ExecutionPlan::CompoundBody::findTempsAsStepModelOutputs() {
    auto recordAsOutputIfTemporary = [this](const SourceOperandIndex& sourceOperandIndex) {
        const auto it = mTemporaryToDefiningExecutionStep.find(sourceOperandIndex);
        if (it == mTemporaryToDefiningExecutionStep.end()) {
            // The operand is not a temporary or is not defined by an
            // ExecutionStep (i.e. it's an output of an IF or a WHILE).
            // The latter case is handled by ExecutionPlan::makeController().
            return;
        }
        uint32_t stepIndex = it->second;
        CHECK_LT(stepIndex, mSteps.size());
        mSteps[stepIndex]->executionStep()->recordTempAsStepModelOutput(sourceOperandIndex.second);
    };
    for (const auto& logicalStep : mSteps) {
        if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
            for (const auto& input : step->getTempsAsStepModelInputs()) {
                SourceOperandIndex sourceOperandIndex(step->getSourceModelIndex(), input.first);
                recordAsOutputIfTemporary(sourceOperandIndex);
            }
        } else if (const IfStep* step = logicalStep->tryIfStep()) {
            recordAsOutputIfTemporary(step->conditionOperandIndex);
            for (const SourceOperandIndex& sourceOperandIndex : step->outerInputOperands) {
                recordAsOutputIfTemporary(sourceOperandIndex);
            }
        } else if (const WhileStep* step = logicalStep->tryWhileStep()) {
            for (const SourceOperandIndex& sourceOperandIndex : step->outerInputOperands) {
                recordAsOutputIfTemporary(sourceOperandIndex);
            }
        } else {
            CHECK(logicalStep->isGoto());
        }
    }
}

void ExecutionStep::recordTempAsStepModelOutput(uint32_t stepOperandIndex) {
    const auto it = mOperandMap.find(stepOperandIndex);
    CHECK(it != mOperandMap.end());
    mTempsAsStepModelOutputs.emplace(stepOperandIndex, it->second);
}

const ModelBuilder* ExecutionStep::getSourceModel() const {
    return mPlan->getSourceModels().getModel(mSourceModelIndex);
}

void ExecutionStep::logStepModel() const {
    VLOG(COMPILATION) << "ExecutionStep::finishStepModel, step " << mIndex;

    auto logRemapEntry = [](std::string& toLog, const std::pair<uint32_t, uint32_t>& e) {
        if (!toLog.empty()) {
            toLog += ", ";
        }
        toLog += toString(e.first);
        toLog += "->";
        toLog += toString(e.second);
    };

    auto logRemapVector = [&logRemapEntry](const char* name, const RemapVectorType& map) {
        std::string toLog;
        for (const auto& e : map) {
            logRemapEntry(toLog, e);
        }
        VLOG(COMPILATION) << name << ": " << toLog;
    };
    auto logRemapSet = [&logRemapEntry](const char* name, const StepModelOutputSetType& set) {
        std::string toLog;
        for (const auto& e : set) {
            logRemapEntry(toLog, e);
        }
        VLOG(COMPILATION) << name << ": " << toLog;
    };

    logRemapVector("step model inputs", mStepModelInputs);
    logRemapVector("step model outputs", mStepModelOutputs);
    logRemapVector("model inputs", mModelInputs);
    logRemapVector("model outputs", mModelOutputs);
    logRemapVector("temps as step model inputs", mTempsAsStepModelInputs);
    logRemapSet("temps as step model outputs", mTempsAsStepModelOutputs);
    logRemapVector("outputs as step model inputs", mOutputsAsStepModelInputs);
}

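// An operand has unknown size if it is a tensor of unspecified rank (an empty dimension
// vector) or has at least one unspecified dimension (a dimension of 0). For example, a
// TENSOR_FLOAT32 operand with dimensions {2, 0, 3} has unknown size; a scalar never does.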
static bool hasUnknownSize(const Operand& operand) {
    if (operand.dimensions.size() == 0) {
        return TypeManager::get()->isTensorType(operand.type);
    }
    for (uint32_t dimension : operand.dimensions) {
        if (dimension == 0) {
            return true;
        }
    }
    return false;
}

int ExecutionStep::finishStepModel(const ModelBuilder* mainModel, bool* hasOutputOfUnknownSize,
                                   int32_t executionPreference, int32_t priority) {
    CHECK(mDevice != nullptr);

    for (const auto& stepModelOutput : mTempsAsStepModelOutputs) {
        const Operand& operand = mStepModel.getOperand(stepModelOutput.second);
        if (hasUnknownSize(operand)) {
            *hasOutputOfUnknownSize = true;
            VLOG(COMPILATION) << "StepModelOutput (operand#" << toString(stepModelOutput.first)
                              << " of source graph) has unknown size: " << toString(operand);
        }
    }

    mStepModel.relaxComputationFloat32toFloat16(mainModel->isComputationFloat32RelaxedToFloat16());

    mStepModelInputs.insert(mStepModelInputs.end(), mModelInputs.begin(), mModelInputs.end());
    mStepModelInputs.insert(mStepModelInputs.end(), mTempsAsStepModelInputs.begin(),
                            mTempsAsStepModelInputs.end());
    mStepModelInputs.insert(mStepModelInputs.end(), mOutputsAsStepModelInputs.begin(),
                            mOutputsAsStepModelInputs.end());

    mStepModelOutputs.insert(mStepModelOutputs.end(), mModelOutputs.begin(), mModelOutputs.end());
    mStepModelOutputs.insert(mStepModelOutputs.end(), mTempsAsStepModelOutputs.begin(),
                             mTempsAsStepModelOutputs.end());

    if (mSourceModelIndex == kMainModelInSourceModels) {
        std::map<uint32_t, uint32_t> mainModelOperandToInputIndex;
        for (uint32_t i = 0, n = mainModel->inputCount(); i < n; ++i) {
            mainModelOperandToInputIndex[mainModel->getInputOperandIndex(i)] = i;
        }
        std::map<uint32_t, uint32_t> mainModelOperandToOutputIndex;
        for (uint32_t i = 0, n = mainModel->outputCount(); i < n; ++i) {
            mainModelOperandToOutputIndex[mainModel->getOutputOperandIndex(i)] = i;
        }

        // mInputIndexStepModelToMainModel is ordered by step model input index and relies on
        // mModelInputs being the first inputs, as specified by mStepModelInputs.
        mInputIndexStepModelToMainModel.resize(mModelInputs.size());
        std::transform(mModelInputs.begin(), mModelInputs.end(),
                       mInputIndexStepModelToMainModel.begin(),
                       [&mainModelOperandToInputIndex](auto& e) {
                           uint32_t sourceOperandIndex = e.first;
                           return mainModelOperandToInputIndex[sourceOperandIndex];
                       });

        // mOutputIndexStepModelToMainModel is ordered by step model output index and relies on
        // mModelOutputs being the first outputs, as specified by mStepModelOutputs.
        mOutputIndexStepModelToMainModel.resize(mModelOutputs.size());
        std::transform(mModelOutputs.begin(), mModelOutputs.end(),
                       mOutputIndexStepModelToMainModel.begin(),
                       [&mainModelOperandToOutputIndex](auto& e) {
                           uint32_t sourceOperandIndex = e.first;
                           return mainModelOperandToOutputIndex[sourceOperandIndex];
                       });

        // mOutputsAsStepModelInputsIndexToMainModel is ordered by step model input index and
        // relies on mOutputsAsStepModelInputs being the first outputs.
        mOutputsAsStepModelInputsIndexToMainModel.resize(mOutputsAsStepModelInputs.size());
        std::transform(mOutputsAsStepModelInputs.begin(), mOutputsAsStepModelInputs.end(),
                       mOutputsAsStepModelInputsIndexToMainModel.begin(),
                       [&mainModelOperandToOutputIndex](auto& e) {
                           uint32_t sourceOperandIndex = e.first;
                           return mainModelOperandToOutputIndex[sourceOperandIndex];
                       });
    }

    if (VLOG_IS_ON(COMPILATION)) {
        logStepModel();
    }

    std::vector<uint32_t> inputs(mStepModelInputs.size());
    std::vector<uint32_t> outputs(mStepModelOutputs.size());
    std::transform(mStepModelInputs.begin(), mStepModelInputs.end(), inputs.begin(),
                   [](auto& e) { return e.second; });
    std::transform(mStepModelOutputs.begin(), mStepModelOutputs.end(), outputs.begin(),
                   [](auto& e) { return e.second; });
    NN_RETURN_IF_ERROR(mStepModel.identifyInputsAndOutputs(inputs.size(), inputs.data(),
                                                           outputs.size(), outputs.data()));
    // TODO: Model::finish() should use ValidationMode::RUNTIME when sending the
    // step model to CpuDevice. Right now, this is harmless because the only
    // difference in validation occurs with control flow operations and inputs
    // or outputs of unknown size and we never send control flow operations to
    // CpuDevice. We need to address this if this behavior changes (b/151634976).
    NN_RETURN_IF_ERROR(mStepModel.finish());

    // TODO: Move compilation elsewhere?
    VLOG(COMPILATION) << "ExecutionStep::finishStepModel, compilation on " << mDevice->getName();
    return compile(*mDevice, mStepModel, executionPreference, priority, {}, *mPlan->getCacheDir(),
                   &mToken, &mPreparedStepModel);
}

void ExecutionStep::dump() const {
    if (VLOG_IS_ON(COMPILATION)) {
        VLOG(COMPILATION) << "Step#" << mIndex << ": execute on " << mDevice->getName();
        logModelToInfo(mStepModel.makeHidlModel());
    }
}

std::string toString(const IfStep& step) {
    std::ostringstream oss;
    oss << "Step#" << step.index << ": if " << toString(step.conditionOperandIndex)
        << " then=" << step.thenStepIndex << " else=" << step.elseStepIndex;
    return oss.str();
}

std::string toString(const WhileStep& step) {
    std::ostringstream oss;
    oss << "Step#" << step.index << ": while cond=" << step.condStepIndex
        << " body=" << step.bodyStepIndex << " exit=" << step.exitStepIndex;
    return oss.str();
}

std::string toString(const GotoStep& step) {
    std::ostringstream oss;
    oss << "Step#" << step.index << ": goto " << step.gotoStepIndex;
    return oss.str();
}

void LogicalStep::dump() const {
    if (VLOG_IS_ON(COMPILATION)) {
        if (const IfStep* step = tryIfStep()) {
            VLOG(COMPILATION) << toString(*step);
        } else if (const WhileStep* step = tryWhileStep()) {
            VLOG(COMPILATION) << toString(*step);
        } else if (const GotoStep* step = tryGotoStep()) {
            VLOG(COMPILATION) << toString(*step);
        } else {
            executionStep()->dump();
        }
    }
}

int ExecutionPlan::CompoundBody::finish(const SourceModels* sourceModels,
                                        int32_t executionPreference, int32_t priority,
                                        const std::optional<Deadline>& deadline) {
    CHECK(!mSuccessfulFinish);
    CHECK(!deadline.has_value());
    const ModelBuilder* mainModel = sourceModels->getModel(kMainModelInSourceModels);

    auto containsUnknownSize = [sourceModels](const std::vector<SourceOperandIndex>& operands) {
        for (const auto& sourceOperandIndex : operands) {
            const ModelBuilder* sourceModel = sourceModels->getModel(sourceOperandIndex.first);
            const Operand& operand = sourceModel->getOperand(sourceOperandIndex.second);
            if (hasUnknownSize(operand)) {
                return true;
            }
        }
        return false;
    };

    findTempsAsStepModelOutputs();
    for (const auto& logicalStep : mSteps) {
        if (ExecutionStep* step = logicalStep->tryExecutionStep()) {
            int n = step->finishStepModel(mainModel, &mHasStepModelOutputOfUnknownSize,
                                          executionPreference, priority);
            if (n != ANEURALNETWORKS_NO_ERROR) {
                VLOG(COMPILATION)
                        << "ExecutionPlan::CompoundBody::finish -- finishStepModel failed";
                return n;
            }
        } else if (IfStep* step = logicalStep->tryIfStep()) {
            // The partitioner does not support dynamic temporaries (b/132458982).
            CHECK(!containsUnknownSize(step->outerInputOperands));
            CHECK(!containsUnknownSize(step->outerOutputOperands));
            // step->conditionOperandIndex has a static shape. See b/158557728.
            CHECK(!containsUnknownSize(step->thenBranchInputOperands));
            CHECK(!containsUnknownSize(step->thenBranchOutputOperands));
            CHECK(!containsUnknownSize(step->elseBranchInputOperands));
            CHECK(!containsUnknownSize(step->elseBranchOutputOperands));
        } else if (WhileStep* step = logicalStep->tryWhileStep()) {
            // The partitioner does not support dynamic temporaries (b/132458982).
            CHECK(!containsUnknownSize(step->outerInputOperands));
            CHECK(!containsUnknownSize(step->outerOutputOperands));
            CHECK(!containsUnknownSize(step->condInputOperands));
            // step->condOutputOperand has a static shape. See b/158557728.
            CHECK(!containsUnknownSize(step->bodyInputOperands));
            CHECK(!containsUnknownSize(step->bodyOutputOperands));
        } else {
            CHECK(logicalStep->isGoto());
        }
    }
    if (mHasStepModelOutputOfUnknownSize) {
        VLOG(COMPILATION)
                << "ExecutionPlan::CompoundBody::finish -- mHasStepModelOutputOfUnknownSize";
        return ANEURALNETWORKS_OP_FAILED;
    }

    for (uint32_t i = 0, n = mainModel->inputCount(); i < n; ++i) {
        SourceOperandIndex index(kMainModelInSourceModels, mainModel->getInputOperandIndex(i));
        mSourceOperandToInputIndex[index] = i;
    }
    for (uint32_t i = 0, n = mainModel->outputCount(); i < n; ++i) {
        SourceOperandIndex index(kMainModelInSourceModels, mainModel->getOutputOperandIndex(i));
        mSourceOperandToOutputIndex[index] = i;
    }

    findControlFlowBoundaryConstants(sourceModels);

    mSuccessfulFinish = true;
    return ANEURALNETWORKS_NO_ERROR;
}

void ExecutionPlan::CompoundBody::findControlFlowBoundaryConstants(
        const SourceModels* sourceModels) {
    auto handleBoundaryConstants = [this,
                                    sourceModels](const SourceOperandIndex& sourceOperandIndex) {
        const ModelBuilder* sourceModel = sourceModels->getModel(sourceOperandIndex.first);
        const Operand& operand = sourceModel->getOperand(sourceOperandIndex.second);
        const DataLocation& location = operand.location;
        if (operand.lifetime == OperandLifeTime::CONSTANT_COPY) {
            mSourceOperandToBoundaryConstantCopy[sourceOperandIndex] = {
                    .buffer = sourceModel->getPointerToOperandValue(location.offset),
                    .length = location.length,
            };
        } else if (operand.lifetime == OperandLifeTime::CONSTANT_REFERENCE) {
            mSourceOperandToBoundaryConstantReference[sourceOperandIndex] = {
                    .memory = sourceModel->getMemories()[location.poolIndex],
                    .offset = location.offset,
                    .length = location.length,
            };
        }
    };
    for (const auto& logicalStep : mSteps) {
        if (const IfStep* step = logicalStep->tryIfStep()) {
            handleBoundaryConstants(step->conditionOperandIndex);
            for (const auto& sourceOperandIndex : step->outerInputOperands) {
                handleBoundaryConstants(sourceOperandIndex);
            }
        } else if (const WhileStep* step = logicalStep->tryWhileStep()) {
            for (const auto& sourceOperandIndex : step->outerInputOperands) {
                handleBoundaryConstants(sourceOperandIndex);
            }
        }
    }
}

int ExecutionPlan::SimpleBody::finish(const SourceModels*, int32_t executionPreference,
                                      int32_t priority, const std::optional<Deadline>& deadline) {
    CHECK(!mSuccessfulFinish);
    CHECK(mDevice != nullptr);
    VLOG(COMPILATION) << "ExecutionPlan::SimpleBody::finish, compilation";
    const int n = compile(*mDevice, *mModel, executionPreference, priority, deadline, *mCacheDir,
                          &mToken, &mPreparedModel);
    mSuccessfulFinish = (n == ANEURALNETWORKS_NO_ERROR);
    return n;
}

int ExecutionPlan::finish(int32_t executionPreference, int32_t priority,
                          const std::optional<Deadline>& deadline) {
    CHECK(mBody != nullptr);
    return mBody->finish(&getSourceModels(), executionPreference, priority, deadline);
}

ExecutionPlan::Controller::Controller(const ExecutionPlan* plan, ExecutionBuilder* executionBuilder,
                                      const BurstBuilder* burstBuilder)
    : Controller(plan, executionBuilder, burstBuilder, 0, {}, {}, {}, {}, {}, {}) {}

ExecutionPlan::Controller::Controller(
        const ExecutionPlan* plan, ExecutionBuilder* executionBuilder,
        const BurstBuilder* burstBuilder, uint32_t totalSizeOfTemporaries,
        std::map<SourceOperandIndex, uint32_t> sourceOperandToOffsetOfTemporary,
        std::map<SourceOperandIndex, uint32_t> sourceOperandToOffsetOfTemporary2,
        std::map<SourceOperandIndex, uint32_t> sourceOperandToInputIndex,
        std::map<SourceOperandIndex, uint32_t> sourceOperandToOutputIndex,
        const std::map<SourceOperandIndex, ConstantCopyLocation>& sourceOperandToConstantCopy,
        std::map<SourceOperandIndex, ConstantReferenceLocation> sourceOperandToConstantReference)
    : mPlan(plan),
      mExecutionBuilder(executionBuilder),
      mBurstBuilder(burstBuilder),
      mSourceOperandToOffsetOfTemporary(std::move(sourceOperandToOffsetOfTemporary)),
      mSourceOperandToOffsetOfTemporary2(std::move(sourceOperandToOffsetOfTemporary2)),
      mSourceOperandToInputIndex(std::move(sourceOperandToInputIndex)),
      mSourceOperandToOutputIndex(std::move(sourceOperandToOutputIndex)),
      mSourceOperandToConstantReference(std::move(sourceOperandToConstantReference)),
      mNextStepIndex(0),
      mFallbackNextStepIndex(kBadStepIndex),
      mLastStepSyncFd(-1) {
    if (totalSizeOfTemporaries == 0) {
        return;
    }
    int n;
    std::tie(n, mTemporaries) = MemoryAshmem::create(totalSizeOfTemporaries);
    if (n != ANEURALNETWORKS_NO_ERROR) {
        LOG(ERROR) << "ExecutionPlan::Controller failed to allocate temporaries";
        mNextStepIndex = kBadStepIndex;
        // mTemporaries is null here, so do not fall through to the copy loop below.
        return;
    }
    for (const auto& [sourceOperandIndex, location] : sourceOperandToConstantCopy) {
        memcpy(mTemporaries->getPointer() + mSourceOperandToOffsetOfTemporary[sourceOperandIndex],
               location.buffer, location.length);
    }
}

// Attempt to create a burst object for each PreparedModel/Partition. If the
// burst controller object cannot be made, return a nullptr in its place to
// indicate the regular execution path should be used. This can occur either
// because PreparedModel was nullptr (cpu was best choice), or because the
// IPreparedModel was of insufficient version or failed to configure the burst.
std::vector<std::shared_ptr<ExecutionBurstController>> ExecutionPlan::makeBursts(
        int preference) const {
    switch (mState) {
        // burst object for each partition in the compound case
        case COMPOUND: {
            std::vector<std::shared_ptr<ExecutionBurstController>> bursts;
            bursts.reserve(compound()->mSteps.size());
            for (const auto& logicalStep : compound()->mSteps) {
                if (!logicalStep->isExecution()) {
                    bursts.push_back(nullptr);
                    continue;
                }
                if (const auto preparedModel =
                            logicalStep->executionStep()->getPreparedStepModel()) {
                    const bool preferPowerOverLatency =
                            (preference == ANEURALNETWORKS_PREFER_LOW_POWER);
                    bursts.push_back(
                            preparedModel->configureExecutionBurst(preferPowerOverLatency));
                } else {
                    bursts.push_back(nullptr);
                }
            }
            return bursts;
        }
        // single burst object for the simple case
        case SIMPLE: {
            std::vector<std::shared_ptr<ExecutionBurstController>> burst;
            auto simpleBody = simple();
            if (const auto preparedModel = simpleBody->mPreparedModel) {
                const bool preferPowerOverLatency =
                        (preference == ANEURALNETWORKS_PREFER_LOW_POWER);
                burst.push_back(preparedModel->configureExecutionBurst(preferPowerOverLatency));
            } else {
                burst.push_back(nullptr);
            }
            return burst;
        }
        // no burst objects made
        default:
            return {};
    }
}

std::shared_ptr<ExecutionPlan::Controller> ExecutionPlan::makeController(
        ExecutionBuilder* executionBuilder, const BurstBuilder* burstBuilder) const {
    CHECK(isValid());
    if (mState == SIMPLE) {
        return std::shared_ptr<Controller>(new Controller(this, executionBuilder, burstBuilder));
    }
    // Create the layout for a Memory object big enough to hold
    // - every partition boundary TEMPORARY operand and
    // - buffers required by the control flow implementation.
    //
    // TODO: Rethink this approach for managing temporaries. Some
    // alternatives:
    //
    // 1) Adopt a memory layout scheme analogous to stack allocation,
    // where objects of non-overlapping lifetime can occupy the same
    // storage. We would still have a single Memory object in this
    // case.
    //
    // 2) Do something like what CpuExecutor does, and do allocations
    // and deallocations on the fly (during execution) before first
    // reference and after last reference, respectively. This would
    // mean having one Memory object per TEMPORARY; or, in a more
    // complicated implementation, one Memory object per set of
    // temporaries that have the same lifetime. Note that the Android
    // system limits the number of shared memory objects, which are
    // what our Memory objects represent.
    //
    uint32_t totalSizeOfTemporaries = 0;
    auto addTemporaryOfSize = [&totalSizeOfTemporaries](uint32_t size) {
        totalSizeOfTemporaries += alignBytesNeeded(totalSizeOfTemporaries, size);
        const uint32_t offset = totalSizeOfTemporaries;
        totalSizeOfTemporaries += size;
        return offset;
    };
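    // A worked example (assuming alignBytesNeeded() pads the running total to the
    // alignment required for data of the given size): if totalSizeOfTemporaries is
    // currently 10 and a 4-byte-aligned temporary of size 8 is requested, the total
    // is first padded to 12, the returned offset is 12, and the new total is 20.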
    // This function has two modes of operation:
    // 1. When lifetime is TEMPORARY_VARIABLE, we allocate memory for
    //    TEMPORARY_VARIABLE source operands, skip SUBGRAPH_OUTPUT source
    //    operands, and panic if we see a source operand of another lifetime.
    // 2. When lifetime is SUBGRAPH_OUTPUT, we allocate memory for
    //    SUBGRAPH_OUTPUT source operands and panic if we see a source operand
    //    of another lifetime.
    auto mapTemporary =
            [executionBuilder, addTemporaryOfSize](
                    const SourceOperandIndex& sourceOperandIndex,
                    std::map<SourceOperandIndex, uint32_t>* sourceOperandToOffsetOfTemporary,
                    OperandLifeTime lifetime = OperandLifeTime::TEMPORARY_VARIABLE) {
                CHECK(lifetime == OperandLifeTime::TEMPORARY_VARIABLE ||
                      lifetime == OperandLifeTime::SUBGRAPH_OUTPUT);
                const Operand& sourceOperand =
                        executionBuilder->getSourceOperand(sourceOperandIndex);
                if (lifetime == OperandLifeTime::TEMPORARY_VARIABLE &&
                    sourceOperand.lifetime == OperandLifeTime::SUBGRAPH_OUTPUT) {
                    // See the caller for explanation.
                    return;
                }
                CHECK(sourceOperand.lifetime == lifetime);
                const uint32_t size = TypeManager::get()->getSizeOfData(sourceOperand);
                CHECK_NE(size, 0u);
                const uint32_t offset = addTemporaryOfSize(size);
                auto [_, isNew] =
                        sourceOperandToOffsetOfTemporary->emplace(sourceOperandIndex, offset);
                CHECK(isNew);
                VLOG(EXECUTION) << "temp: operand " << toString(sourceOperandIndex)
                                << " offset = " << offset;
            };
    std::map<SourceOperandIndex, uint32_t> sourceOperandToOffsetOfTemporary;
    std::map<SourceOperandIndex, uint32_t> sourceOperandToOffsetOfTemporary2;
    for (const auto& logicalStep : compound()->mSteps) {
        if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
            // Allocate memory for ExecutionStep temporary outputs that are
            // inputs to other steps, as determined by
            // ExecutionPlan::CompoundBody::findTempsAsStepModelOutputs().
            //
            // We don't allocate memory for step model output operands with
            // source operand lifetime SUBGRAPH_OUTPUT because they will be
            // - managed by the client (main model outputs),
            // - assigned a location of another operand (when this step model
            //   output is a branch model output of an IF; see
            //   ExecutionPlan::nextCompound(const IfStep*, ...)), or
            // - allocated by a WHILE (when this step model output
            //   is a condition or body model output of a WHILE; see the
            //   step->bodyOutputOperands and step->condOutputOperand handling
            //   below).
            for (const auto& output : step->getTempsAsStepModelOutputs()) {
                mapTemporary(SourceOperandIndex(step->getSourceModelIndex(), output.first),
                             &sourceOperandToOffsetOfTemporary);
            }
        } else if (const IfStep* step = logicalStep->tryIfStep()) {
            // Allocate memory for all temporary outputs of an IfStep because
            // they are going to be written to by a branch model. We don't
            // perform unused output operand optimisation for referenced models.
            //
            // We don't allocate memory for branch output operands because they
            // use the same location as the corresponding outer output operands,
            // as established in ExecutionPlan::nextCompound(const IfStep*, ...)
            //
            // We don't allocate memory for outer output operands with source
            // operand lifetime SUBGRAPH_OUTPUT because they will be
            // - managed by the client (main model outputs),
            // - assigned a location of another operand (when this IF outer
            //   output is a branch model output of another IF; see
            //   ExecutionPlan::nextCompound(const IfStep*, ...)), or
            // - allocated by a WHILE (when this IF outer output
            //   is a condition or body model output of a WHILE; see the
            //   step->bodyOutputOperands and step->condOutputOperand handling
            //   below).
            for (const auto& sourceOperandIndex : step->outerOutputOperands) {
                mapTemporary(sourceOperandIndex, &sourceOperandToOffsetOfTemporary);
            }
        } else if (const WhileStep* step = logicalStep->tryWhileStep()) {
            // Allocate memory for all temporary outputs of a WhileStep because
            // they are going to be written to by the WHILE loop.
            //
            // We don't allocate memory for outer output operands with source
            // operand lifetime SUBGRAPH_OUTPUT because they will be
            // - managed by the client (main model outputs),
            // - assigned a location of another operand (when this WHILE outer
            //   output is a branch model output of an IF; see
            //   ExecutionPlan::nextCompound(const IfStep*, ...)), or
            // - allocated by another WHILE (when this WHILE outer output
            //   is a condition or body model output of another WHILE; see the
            //   step->bodyOutputOperands and step->condOutputOperand handling
            //   below).
            for (const auto& sourceOperandIndex : step->outerOutputOperands) {
                mapTemporary(sourceOperandIndex, &sourceOperandToOffsetOfTemporary);
            }
            // Allocate memory for body model outputs. Note that we could use
            // the outer output operand memory instead but we currently don't do
            // so (b/148206073).
            for (const auto& sourceOperandIndex : step->bodyOutputOperands) {
                mapTemporary(sourceOperandIndex, &sourceOperandToOffsetOfTemporary,
                             OperandLifeTime::SUBGRAPH_OUTPUT);
                // Allocate another set of temporaries for double buffering.
                mapTemporary(sourceOperandIndex, &sourceOperandToOffsetOfTemporary2,
                             OperandLifeTime::SUBGRAPH_OUTPUT);
            }
            // Allocate memory for condition model output.
            // TODO: Share one condition output memory region between all loops.
            mapTemporary(step->condOutputOperand, &sourceOperandToOffsetOfTemporary,
                         OperandLifeTime::SUBGRAPH_OUTPUT);
        } else {
            CHECK(logicalStep->isGoto());
        }
    }
    // Allocate temporary memory for boundary CONSTANT_COPY operands.
    for (const auto& [sourceOperandIndex, location] :
         compound()->mSourceOperandToBoundaryConstantCopy) {
        const uint32_t offset = addTemporaryOfSize(location.length);
        sourceOperandToOffsetOfTemporary.emplace(sourceOperandIndex, offset);
        VLOG(EXECUTION) << "temp (boundary constant): operand " << toString(sourceOperandIndex)
                        << " offset = " << offset;
    }
    return std::shared_ptr<Controller>(new Controller(
            this, executionBuilder, burstBuilder, totalSizeOfTemporaries,
            std::move(sourceOperandToOffsetOfTemporary),
            std::move(sourceOperandToOffsetOfTemporary2), compound()->mSourceOperandToInputIndex,
            compound()->mSourceOperandToOutputIndex,
            compound()->mSourceOperandToBoundaryConstantCopy,
            compound()->mSourceOperandToBoundaryConstantReference));
}

// TODO: Find a better way to provide this functionality.
int ExecutionPlan::fallback(std::shared_ptr<Controller> controller,
                            std::shared_ptr<StepExecutor>* executor) const {
    *executor = nullptr;

    VLOG(EXECUTION) << "ExecutionPlan::fallback(" << SHOW_IF_DEBUG(controller << ", " << executor)
                    << "): mFallbackNextStepIndex = " << controller->mFallbackNextStepIndex;

    if (controller->mFallbackNextStepIndex == Controller::kBadStepIndex) {
        // We haven't called next().
        return ANEURALNETWORKS_OP_FAILED;
    }

    if (controller->mNextStepIndex == Controller::kBadStepIndex) {
        // The last call to next() did not produce an executor.
        return ANEURALNETWORKS_OP_FAILED;
    }

    controller->mNextStepIndex = controller->mFallbackNextStepIndex;
    return next(controller, executor);
}

ExecutionPlan::Buffer::Buffer(void* pointer, uint32_t size)
    : mInfo(RunTimePoolInfo::createFromExistingBuffer(reinterpret_cast<uint8_t*>(pointer), size)),
      mOffset(0) {}

ExecutionPlan::Buffer::Buffer(RunTimePoolInfo info, uint32_t offset)
    : mInfo(std::move(info)), mOffset(offset) {}

void* ExecutionPlan::Buffer::getPointer() const {
    return mInfo.getBuffer() + mOffset;
}

uint32_t ExecutionPlan::Buffer::getSize() const {
    return mInfo.getSize() - mOffset;
}

void ExecutionPlan::Buffer::flush() const {
    mInfo.flush();
}

std::optional<ExecutionPlan::Buffer> ExecutionPlan::getBufferFromModelArgumentInfo(
        const ModelArgumentInfo& info, const ExecutionBuilder* executionBuilder) const {
    switch (info.state()) {
        case ModelArgumentInfo::POINTER: {
            return Buffer(info.buffer(), info.length());
        } break;
        case ModelArgumentInfo::MEMORY: {
            if (std::optional<RunTimePoolInfo> poolInfo =
                        executionBuilder->getRunTimePoolInfo(info.locationAndLength().poolIndex)) {
                return Buffer(*poolInfo, info.locationAndLength().offset);
            } else {
                LOG(ERROR) << "Unable to map operand memory pool";
                return std::nullopt;
            }
        } break;
        case ModelArgumentInfo::HAS_NO_VALUE: {
            LOG(ERROR) << "Attempting to read an operand that has no value";
            return std::nullopt;
        } break;
        default: {
            LOG(ERROR) << "Unexpected operand memory state: " << static_cast<int>(info.state());
            return std::nullopt;
        } break;
    }
}

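// Returns a Buffer for the given source operand, looking it up (in this order) among the
// partition boundary temporaries, the main model inputs, the main model outputs, and the
// control flow boundary constant references. Returns std::nullopt if the operand cannot be
// located or its memory cannot be mapped.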
std::optional<ExecutionPlan::Buffer> ExecutionPlan::getBuffer(
        std::shared_ptr<Controller> controller, SourceOperandIndex operandIndex) const {
    const auto& sourceOperandToOffsetOfTemporary = controller->mSourceOperandToOffsetOfTemporary;
    const auto& sourceOperandToInputIndex = controller->mSourceOperandToInputIndex;
    const auto& sourceOperandToOutputIndex = controller->mSourceOperandToOutputIndex;
    const auto& sourceOperandToConstantReference = controller->mSourceOperandToConstantReference;
    if (auto it = sourceOperandToOffsetOfTemporary.find(operandIndex);
        it != sourceOperandToOffsetOfTemporary.end()) {
        const uint32_t offset = it->second;
        const std::unique_ptr<MemoryAshmem>& memory = controller->mTemporaries;
        return Buffer(memory->getPointer() + offset, memory->getSize() - offset);
    } else if (auto it = sourceOperandToInputIndex.find(operandIndex);
               it != sourceOperandToInputIndex.end()) {
        const ModelArgumentInfo& info = controller->mExecutionBuilder->getInputInfo(it->second);
        return getBufferFromModelArgumentInfo(info, controller->mExecutionBuilder);
    } else if (auto it = sourceOperandToOutputIndex.find(operandIndex);
               it != sourceOperandToOutputIndex.end()) {
        const ModelArgumentInfo& info = controller->mExecutionBuilder->getOutputInfo(it->second);
        return getBufferFromModelArgumentInfo(info, controller->mExecutionBuilder);
    } else if (auto it = sourceOperandToConstantReference.find(operandIndex);
               it != sourceOperandToConstantReference.end()) {
        const ConstantReferenceLocation& location = it->second;
        const std::optional<RunTimePoolInfo> info = location.memory->getRunTimePoolInfo();
        if (info == std::nullopt) {
            return std::nullopt;
        }
        return Buffer(info->getBuffer() + location.offset, location.length);
    }
    return std::nullopt;
}

int ExecutionPlan::readConditionValue(std::shared_ptr<Controller> controller,
                                      SourceOperandIndex operandIndex, bool* value) const {
    std::optional<ExecutionPlan::Buffer> buffer = getBuffer(controller, operandIndex);
    if (buffer == std::nullopt) {
        LOG(ERROR) << "Unable to read operand " << toString(operandIndex);
        return ANEURALNETWORKS_OP_FAILED;
    }
    CHECK_GE(buffer->getSize(), sizeof(bool8));
    bool8 value8 = *static_cast<bool8*>(buffer->getPointer());
    *value = static_cast<bool>(value8);
    VLOG(EXECUTION) << "readConditionValue: " << *value;
    return ANEURALNETWORKS_NO_ERROR;
}

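// A minimal sketch of the intended driver loop for next() (illustrative only; the real loop in
// ExecutionBuilder also handles fallback, bursts, and fenced execution):
//
//     std::shared_ptr<StepExecutor> executor;
//     do {
//         if (plan->next(controller, &executor) != ANEURALNETWORKS_NO_ERROR) break;
//         // A null executor together with a successful status means the plan is complete.
//         if (executor != nullptr) { /* ... execute the step ... */ }
//     } while (executor != nullptr);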
int ExecutionPlan::next(std::shared_ptr<Controller> controller,
                        std::shared_ptr<StepExecutor>* executor,
                        std::shared_ptr<ExecutionBurstController>* burstController,
                        int syncFdOfLastStep) const {
    controller->mLastStepSyncFd = syncFdOfLastStep;
    *executor = nullptr;
    if (burstController != nullptr) {
        *burstController = nullptr;
    }

    VLOG(EXECUTION) << "ExecutionPlan::next(" << SHOW_IF_DEBUG(controller << ", " << executor)
                    << "): mNextStepIndex = " << controller->mNextStepIndex;

    if (controller->mNextStepIndex == Controller::kBadStepIndex) {
        return ANEURALNETWORKS_OP_FAILED;
    }

    if (mState == EMPTY) {
        CHECK_EQ(controller->mNextStepIndex, 0u);  // end
        controller->mNextStepIndex = Controller::kBadStepIndex;
        return ANEURALNETWORKS_NO_ERROR;
    }

    if (mState == SIMPLE) {
        if (controller->mNextStepIndex == 0) {
            // First (and only) step.
            auto simpleBody = simple();
            *executor = std::make_shared<StepExecutor>(controller->mExecutionBuilder,
                                                       simpleBody->mModel, simpleBody->mDevice,
                                                       simpleBody->mPreparedModel);
            (*executor)->mapInputsAndOutputsTrivially();
            if (burstController != nullptr && controller->mBurstBuilder != nullptr) {
                *burstController = controller->mBurstBuilder->getControllerAt(0);
            }
            controller->mFallbackNextStepIndex = 0;
            controller->mNextStepIndex = 1;
            return ANEURALNETWORKS_NO_ERROR;
        }

        CHECK_EQ(controller->mNextStepIndex, 1u);  // end
        controller->mNextStepIndex = Controller::kBadStepIndex;
        return ANEURALNETWORKS_NO_ERROR;
    }

    return nextCompound(controller, executor, burstController);
}

int ExecutionPlan::nextCompound(std::shared_ptr<Controller> controller,
                                std::shared_ptr<StepExecutor>* executor,
                                std::shared_ptr<ExecutionBurstController>* burstController) const {
    if (controller->mNextStepIndex == Controller::kBadStepIndex) {
        return ANEURALNETWORKS_OP_FAILED;
    }

    auto compoundBody = compound();
    if (controller->mNextStepIndex == compoundBody->mSteps.size()) {
        controller->mNextStepIndex = Controller::kBadStepIndex;  // end
        return ANEURALNETWORKS_NO_ERROR;
    }

    const auto& logicalStep = compoundBody->mSteps[controller->mNextStepIndex];
    if (const IfStep* step = logicalStep->tryIfStep()) {
        return nextCompound(step, controller, executor, burstController);
    } else if (const WhileStep* step = logicalStep->tryWhileStep()) {
        return nextCompound(step, controller, executor, burstController);
    } else if (const GotoStep* step = logicalStep->tryGotoStep()) {
        return nextCompound(step, controller, executor, burstController);
    } else if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
        return nextCompound(step, controller, executor, burstController);
    } else {
        CHECK(false) << "Unknown step variant";
        return ANEURALNETWORKS_BAD_STATE;
    }
}

int ExecutionPlan::nextCompound(const ExecutionStep* step, std::shared_ptr<Controller> controller,
                                std::shared_ptr<StepExecutor>* executor,
                                std::shared_ptr<ExecutionBurstController>* burstController) const {
    VLOG(EXECUTION) << "next: Step#" << controller->mNextStepIndex << ": execute on "
                    << step->getDevice()->getName();
    *executor =
            std::make_shared<StepExecutor>(controller->mExecutionBuilder, step->getStepModel(),
                                           step->getDevice(), step->getPreparedStepModel(), step);
    step->mapInputsAndOutputs(
            *executor, controller->mTemporaries.get(),
            controller->mSourceOperandToOffsetOfTemporary, controller->mSourceOperandToInputIndex,
            controller->mSourceOperandToOutputIndex, controller->mSourceOperandToConstantReference);
    if (burstController != nullptr && controller->mBurstBuilder != nullptr) {
        *burstController = controller->mBurstBuilder->getControllerAt(controller->mNextStepIndex);
    }

    controller->mFallbackNextStepIndex = controller->mNextStepIndex;
    controller->mNextStepIndex++;
    return ANEURALNETWORKS_NO_ERROR;
}

1188 // The first argument is the "source" operand, the second operand is the "destination".
setInput(const SourceOperandIndex & outerOperand,const SourceOperandIndex & innerOperand)1189 void ExecutionPlan::Controller::setInput(const SourceOperandIndex& outerOperand,
1190 const SourceOperandIndex& innerOperand) {
1191 VLOG(EXECUTION) << "mapping input " << toString(innerOperand) << " from "
1192 << toString(outerOperand);
1193 #ifdef NN_DEBUGGABLE
1194 CHECK_LE(mSourceOperandToOffsetOfTemporary.count(innerOperand) +
1195 mSourceOperandToInputIndex.count(innerOperand) +
1196 mSourceOperandToOutputIndex.count(innerOperand) +
1197 mSourceOperandToConstantReference.count(innerOperand),
1198 1u);
1199 #endif
1200 mSourceOperandToOffsetOfTemporary.erase(innerOperand);
1201 mSourceOperandToInputIndex.erase(innerOperand);
1202 mSourceOperandToOutputIndex.erase(innerOperand);
1203 mSourceOperandToConstantReference.erase(innerOperand);
1204 if (auto it = mSourceOperandToOffsetOfTemporary.find(outerOperand);
1205 it != mSourceOperandToOffsetOfTemporary.end()) {
1206 mSourceOperandToOffsetOfTemporary.emplace(innerOperand, it->second);
1207 } else if (auto it = mSourceOperandToInputIndex.find(outerOperand);
1208 it != mSourceOperandToInputIndex.end()) {
1209 mSourceOperandToInputIndex.emplace(innerOperand, it->second);
1210 } else if (auto it = mSourceOperandToOutputIndex.find(outerOperand);
1211 it != mSourceOperandToOutputIndex.end()) {
1212 mSourceOperandToOutputIndex.emplace(innerOperand, it->second);
1213 } else if (auto it = mSourceOperandToConstantReference.find(outerOperand);
1214 it != mSourceOperandToConstantReference.end()) {
1215 mSourceOperandToConstantReference.emplace(innerOperand, it->second);
1216 } else {
1217 CHECK(false) << "Cannot set step model input operand " << toString(innerOperand)
1218 << " from operand " << toString(outerOperand);
1219 }
1220 }
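// setInput() re-homes the inner operand onto whichever of the four location
// maps currently describes the outer operand. Below is a minimal sketch of the
// underlying "find under one key, re-register under another" step; it is
// illustrative only (forwardMapping is a hypothetical helper) and uses a plain
// std::map, which need not match the real member container types.
namespace {
template <typename Key, typename Value>
[[maybe_unused]] bool forwardMapping(std::map<Key, Value>* locations, const Key& from,
                                     const Key& to) {
    if (auto it = locations->find(from); it != locations->end()) {
        locations->emplace(to, it->second);  // `to` now resolves to `from`'s location
        return true;
    }
    return false;  // not described here; the caller tries the next map
}
}  // namespace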
1221
1222 // The first argument is the "source" operand; the second argument is the "destination".
1223 void ExecutionPlan::Controller::setOutput(const SourceOperandIndex& outerOperand,
1224 const SourceOperandIndex& innerOperand) {
1225 VLOG(EXECUTION) << "mapping output " << toString(innerOperand) << " from "
1226 << toString(outerOperand);
1227 #ifdef NN_DEBUGGABLE
1228 CHECK_LE(mSourceOperandToOffsetOfTemporary.count(innerOperand) +
1229 mSourceOperandToOutputIndex.count(innerOperand),
1230 1u);
1231 #endif
1232 mSourceOperandToOffsetOfTemporary.erase(innerOperand);
1233 mSourceOperandToOutputIndex.erase(innerOperand);
1234 if (auto it = mSourceOperandToOffsetOfTemporary.find(outerOperand);
1235 it != mSourceOperandToOffsetOfTemporary.end()) {
1236 mSourceOperandToOffsetOfTemporary.emplace(innerOperand, it->second);
1237 } else if (auto it = mSourceOperandToOutputIndex.find(outerOperand);
1238 it != mSourceOperandToOutputIndex.end()) {
1239 mSourceOperandToOutputIndex.emplace(innerOperand, it->second);
1240 } else {
1241 CHECK(false) << "Cannot set step model output operand " << toString(innerOperand)
1242 << " from operand " << toString(outerOperand);
1243 }
1244 }
1245
1246 int ExecutionPlan::Controller::waitForLastStepSyncFence() const {
1247 if (mLastStepSyncFd == -1) {
1248 return ANEURALNETWORKS_NO_ERROR;
1249 }
1250 VLOG(EXECUTION) << "wait for mLastStepSyncFd " << mLastStepSyncFd;
1251 auto r = syncWait(mLastStepSyncFd, -1);
1252 int n = ANEURALNETWORKS_NO_ERROR;
1253 if (r != FenceState::SIGNALED) {
1254 LOG(ERROR) << "syncWait failed, fd: " << mLastStepSyncFd;
1255 n = ANEURALNETWORKS_OP_FAILED;
1256 }
1257 return n;
1258 }
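// Usage sketch for the wait above: passing -1, as waitForLastStepSyncFence()
// does, blocks until the fence resolves. The helper below is illustrative
// only; it assumes syncWait() also accepts a finite timeout (believed to be in
// milliseconds) and treats any result other than FenceState::SIGNALED as "not
// done yet".
[[maybe_unused]] static bool fenceSignaledWithin(int syncFd, int timeoutMs) {
    return syncWait(syncFd, timeoutMs) == FenceState::SIGNALED;
}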
1259
1260 int ExecutionPlan::nextCompound(const IfStep* step, std::shared_ptr<Controller> controller,
1261 std::shared_ptr<StepExecutor>* executor,
1262 std::shared_ptr<ExecutionBurstController>* burstController) const {
1263 VLOG(EXECUTION) << "next: " << toString(*step);
1264 // If the last step has a sync fence, wait for it to signal before reading the condition value.
1265 // This is safe because the steps are serialized when doing fenced compute.
1266 NN_RETURN_IF_ERROR(controller->waitForLastStepSyncFence());
1267 bool condValue;
1268 NN_RETURN_IF_ERROR(readConditionValue(controller, step->conditionOperandIndex, &condValue));
1269 controller->mNextStepIndex = condValue ? step->thenStepIndex : step->elseStepIndex;
1270 const std::vector<SourceOperandIndex>& branchInputOperands =
1271 condValue ? step->thenBranchInputOperands : step->elseBranchInputOperands;
1272 const std::vector<SourceOperandIndex>& branchOutputOperands =
1273 condValue ? step->thenBranchOutputOperands : step->elseBranchOutputOperands;
1274 CHECK_EQ(branchInputOperands.size(), step->outerInputOperands.size());
1275 CHECK_EQ(branchOutputOperands.size(), step->outerOutputOperands.size());
1276 for (uint32_t i = 0, n = step->outerInputOperands.size(); i < n; ++i) {
1277 // We have to do this assignment just before executing this step to
1278 // accommodate cases when the IF resides within a WHILE condition or
1279 // body model and for some j the i-th input of the IF branch model is
1280 // - an input of the WHILE condition model (whileStep->condInputOperands[j]),
1281 // - an input of the WHILE body model (whileStep->bodyInputOperands[j]), or
1282 // - an output of the WHILE body model (whileStep->bodyOutputOperands[j]).
1283 // In such cases, the WhileStep modifies the location of
1284 // step->outerInputOperands[i] to implement double buffering.
1285 controller->setInput(step->outerInputOperands[i], branchInputOperands[i]);
1286 }
1287 for (uint32_t i = 0, n = step->outerOutputOperands.size(); i < n; ++i) {
1288 // We have to do this assignment just before executing this step to
1289 // accommodate the case when the IF resides within a WHILE body
1290 // model and the i-th output of the IF branch model is an
1291 // output of the WHILE body model (whileStep->bodyOutputOperands[j] for
1292 // some j). In that case, the WhileStep modifies the location of
1293 // step->outerOutputOperands[i] to implement double buffering.
1294 controller->setOutput(step->outerOutputOperands[i], branchOutputOperands[i]);
1295 }
1296 return nextCompound(controller, executor, burstController);
1297 }
1298
1299 int ExecutionPlan::nextCompound(const WhileStep* step, std::shared_ptr<Controller> controller,
1300 std::shared_ptr<StepExecutor>* executor,
1301 std::shared_ptr<ExecutionBurstController>* burstController) const {
1302 WhileState& state = controller->mWhileState[controller->mNextStepIndex];
1303 if (state.stage == WhileState::EVALUATE_CONDITION) {
1304 state.iteration = state.iteration == WhileState::kOutsideLoop ? 0 : state.iteration + 1;
1305 VLOG(EXECUTION) << "next: " << toString(*step) << ": iteration " << state.iteration
1306 << ": evaluating condition";
1307 controller->mNextStepIndex = step->condStepIndex;
1308
1309 if (state.iteration == 0) {
1310 state.startTime = std::chrono::steady_clock::now();
1311 }
1312
1313 // iteration = 0 cond inputs = outer inputs
1314 // iteration = 1 cond inputs = body outputs
1315 // iteration = 2 cond inputs = body outputs
1316 // iteration = 3 cond inputs = ...
1317 uint32_t loopBodyOutputCount = step->bodyOutputOperands.size();
1318 CHECK_EQ(step->condInputOperands.size(), step->outerInputOperands.size());
1319 CHECK_GE(step->condInputOperands.size(), loopBodyOutputCount);
1320 for (uint32_t i = 0, n = step->condInputOperands.size(); i < n; ++i) {
1321 bool operandIsInputOnly = i >= loopBodyOutputCount;
1322 controller->setInput((state.iteration == 0 || operandIsInputOnly)
1323 ? step->outerInputOperands[i]
1324 : step->bodyOutputOperands[i],
1325 step->condInputOperands[i]);
1326 }
1327
1328 state.stage = WhileState::EVALUATE_BODY;
1329 return nextCompound(controller, executor, burstController);
1330 }
1331
1332 CHECK(state.stage == WhileState::EVALUATE_BODY);
1333 std::chrono::nanoseconds timeoutDuration(
1334 controller->mExecutionBuilder->getLoopTimeoutDuration());
1335 auto duration = std::chrono::steady_clock::now() - state.startTime;
1336 if (duration > timeoutDuration) {
1337 LOG(ERROR) << "WHILE loop timed out after "
1338 << std::chrono::duration_cast<std::chrono::milliseconds>(duration).count()
1339 << " ms";
1340 return ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT;
1341 }
1342
1343 // If the last step has a sync fence, wait for it to signal before reading the condition value.
1344 // This is safe because the steps are serialized when doing fenced compute.
1345 NN_RETURN_IF_ERROR(controller->waitForLastStepSyncFence());
1346 bool condValue;
1347 NN_RETURN_IF_ERROR(readConditionValue(controller, step->condOutputOperand, &condValue));
1348 if (condValue) {
1349 VLOG(EXECUTION) << "next: " << toString(*step) << ": iteration " << state.iteration
1350 << ": evaluating body";
1351 controller->mNextStepIndex = step->bodyStepIndex;
1352
1353 // iteration = 0 body inputs = cond inputs = outer inputs body outputs = tmp1
1354 // iteration = 1 body inputs = cond inputs = tmp1 body outputs = tmp2
1355 // iteration = 2 body inputs = cond inputs = tmp2 body outputs = tmp1
1356 // iteration = 3 body inputs = cond inputs = ... body outputs = ...
1357 #ifdef NN_DEBUGGABLE
1358 CHECK_GE(step->bodyInputOperands.size(), step->bodyOutputOperands.size());
1359 CHECK_EQ(step->bodyInputOperands.size(), step->outerInputOperands.size());
1360 CHECK_EQ(step->bodyInputOperands.size(), step->condInputOperands.size());
1361 CHECK_GE(step->bodyOutputOperands.size(), step->outerOutputOperands.size());
1362 #endif
1363 for (uint32_t i = 0, n = step->bodyInputOperands.size(); i < n; ++i) {
1364 controller->setInput(step->condInputOperands[i], step->bodyInputOperands[i]);
1365 }
1366 if (state.iteration != 0) {
1367 for (const SourceOperandIndex& outputOperand : step->bodyOutputOperands) {
1368 #ifdef NN_DEBUGGABLE
1369 CHECK_EQ(controller->mSourceOperandToInputIndex.count(outputOperand), 0u);
1370 CHECK_EQ(controller->mSourceOperandToOutputIndex.count(outputOperand), 0u);
1371 CHECK_EQ(controller->mSourceOperandToOffsetOfTemporary.count(outputOperand), 1u);
1372 CHECK_EQ(controller->mSourceOperandToOffsetOfTemporary2.count(outputOperand), 1u);
1373 #endif
1374 std::swap(controller->mSourceOperandToOffsetOfTemporary[outputOperand],
1375 controller->mSourceOperandToOffsetOfTemporary2[outputOperand]);
1376 }
1377 }
1378 } else {
1379 VLOG(EXECUTION) << "next: " << toString(*step) << ": iteration " << state.iteration
1380 << ": exiting loop";
1381 controller->mNextStepIndex = step->exitStepIndex;
1382
1383 // Copy body outputs to outer outputs.
1384 // TODO: Use outer outputs instead of tmp2 to avoid copying?
1385 CHECK_LE(step->outerOutputOperands.size(), step->bodyOutputOperands.size());
1386 for (uint32_t i = 0, n = step->outerOutputOperands.size(); i < n; ++i) {
1387 // condInputOperands[i] points to a body output operand from the
1388 // last iteration if we've executed at least one iteration and to a
1389 // WHILE operation input operand otherwise.
1390 const SourceOperandIndex& innerOperand = step->condInputOperands[i];
1391 const SourceOperandIndex& outerOperand = step->outerOutputOperands[i];
1392 std::optional<Buffer> outerBuffer = getBuffer(controller, outerOperand);
1393 if (outerBuffer == std::nullopt) {
1394 // This should never happen.
1395 LOG(ERROR) << "Unable to get outerBuffer for operand " << toString(outerOperand);
1396 return ANEURALNETWORKS_OP_FAILED;
1397 }
1398 const Operand& sourceOperand =
1399 controller->mExecutionBuilder->getSourceOperand(outerOperand);
1400 const uint32_t size = TypeManager::get()->getSizeOfData(sourceOperand);
1401 CHECK_NE(size, 0u);
1402 std::optional<Buffer> innerBuffer = getBuffer(controller, innerOperand);
1403 if (innerBuffer == std::nullopt) {
1404 // This should never happen.
1405 LOG(ERROR) << "Unable to get innerBuffer for operand " << toString(innerOperand);
1406 return ANEURALNETWORKS_OP_FAILED;
1407 }
1408 CHECK_LE(size, innerBuffer->getSize());
1409 CHECK_LE(size, outerBuffer->getSize());
1410 memcpy(outerBuffer->getPointer(), innerBuffer->getPointer(), size);
1411 outerBuffer->flush();
1412 }
1413 state.iteration = WhileState::kOutsideLoop;
1414 }
1415
1416 state.stage = WhileState::EVALUATE_CONDITION;
1417 return nextCompound(controller, executor, burstController);
1418 }
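// The iteration tables above describe double buffering: body outputs alternate
// between two temporary regions (tmp1/tmp2) so that iteration i+1 can read
// iteration i's outputs while writing fresh ones. Below is a self-contained
// sketch of that rotation; it is illustrative only, with plain uint32_t
// operand keys and offsets standing in for the real SourceOperandIndex-keyed
// maps.
namespace {
[[maybe_unused]] void rotateWhileBuffers(std::map<uint32_t, uint32_t>* offsetOfTemporary,
                                         std::map<uint32_t, uint32_t>* offsetOfTemporary2,
                                         const std::vector<uint32_t>& bodyOutputOperands) {
    for (uint32_t operand : bodyOutputOperands) {
        // Before the swap, (*offsetOfTemporary)[operand] locates last
        // iteration's output; afterwards it locates the scratch region to be
        // overwritten this iteration, mirroring the std::swap above.
        std::swap((*offsetOfTemporary)[operand], (*offsetOfTemporary2)[operand]);
    }
}
}  // namespace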
1419
1420 int ExecutionPlan::nextCompound(const GotoStep* step, std::shared_ptr<Controller> controller,
1421 std::shared_ptr<StepExecutor>* executor,
1422 std::shared_ptr<ExecutionBurstController>* burstController) const {
1423 VLOG(EXECUTION) << "next: " << toString(*step);
1424 controller->mNextStepIndex = step->gotoStepIndex;
1425 return nextCompound(controller, executor, burstController);
1426 }
1427
1428 void ExecutionPlan::becomeCompoundIfEmpty() {
1429 CHECK(mState != SIMPLE);
1430 if (mState == EMPTY) {
1431 mBody = new CompoundBody();
1432 mState = COMPOUND;
1433 }
1434 }
1435
1436 ExecutionStep* ExecutionPlan::createNewExecutionStep(uint32_t sourceModelIndex,
1437 const std::shared_ptr<Device> device) {
1438 becomeCompoundIfEmpty();
1439 auto step = std::make_shared<LogicalStep>(std::in_place_type<ExecutionStep>, this,
1440 compound()->mSteps.size(), sourceModelIndex, device);
1441 compound()->mSteps.push_back(step);
1442 return step->executionStep();
1443 }
1444
1445 IfStep* ExecutionPlan::createNewIfStep() {
1446 becomeCompoundIfEmpty();
1447 auto step = std::make_shared<LogicalStep>(std::in_place_type<IfStep>);
1448 step->ifStep()->index = compound()->mSteps.size();
1449 compound()->mSteps.push_back(step);
1450 return step->ifStep();
1451 }
1452
1453 WhileStep* ExecutionPlan::createNewWhileStep() {
1454 becomeCompoundIfEmpty();
1455 auto step = std::make_shared<LogicalStep>(std::in_place_type<WhileStep>);
1456 step->whileStep()->index = compound()->mSteps.size();
1457 compound()->mSteps.push_back(step);
1458 return step->whileStep();
1459 }
1460
1461 GotoStep* ExecutionPlan::createNewGotoStep() {
1462 becomeCompoundIfEmpty();
1463 auto step = std::make_shared<LogicalStep>(std::in_place_type<GotoStep>);
1464 step->gotoStep()->index = compound()->mSteps.size();
1465 compound()->mSteps.push_back(step);
1466 return step->gotoStep();
1467 }
1468
1469 void ExecutionPlan::becomeSingleStep(const std::shared_ptr<Device> device,
1470 const ModelBuilder* model) {
1471 CHECK(mState == EMPTY);
1472 mBody = new SimpleBody(device, model, mCacheDir, mToken);
1473 mState = SIMPLE;
1474 }
1475
1476 void ExecutionPlan::recordTemporaryDef(SourceOperandIndex sourceOperandIndex, uint32_t stepIndex) {
1477 auto [it, isNew] =
1478 compound()->mTemporaryToDefiningExecutionStep.emplace(sourceOperandIndex, stepIndex);
1479 CHECK(isNew) << "Step " << stepIndex << " redefines temporary operand "
1480 << toString(sourceOperandIndex) << " already defined by step " << it->second;
1481 }
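// recordTemporaryDef() relies on map emplace() returning an {iterator, bool}
// pair so that detecting a redefinition costs a single lookup. A minimal
// sketch of the same idiom (recordOnce is a hypothetical helper):
[[maybe_unused]] static bool recordOnce(std::map<uint32_t, uint32_t>* definingStep,
                                        uint32_t operand, uint32_t step) {
    const auto result = definingStep->emplace(operand, step);
    // On failure, result.first still points at the earlier definition --
    // exactly what the CHECK above prints as `it->second`.
    return result.second;  // true iff this was the first definition
}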
1482
1483 void ExecutionPlan::dump() const {
1484 if (mBody) {
1485 mBody->dump();
1486 } else {
1487 VLOG(COMPILATION) << "EMPTY";
1488 }
1489 }
1490
1491 void ExecutionPlan::reset() {
1492 if (mBody) {
1493 delete mBody;
1494 mBody = nullptr;
1495 }
1496 mState = EMPTY;
1497 }
1498
1499 bool ExecutionPlan::isSimpleCpu() const {
1500 return isSimple() && simple()->mDevice == DeviceManager::getCpuDevice();
1501 }
1502
1503 ExecutionPlan::Kind ExecutionPlan::forTest_getKind() const {
1504 switch (mState) {
1505 case EMPTY:
1506 return Kind::EMPTY;
1507 case SIMPLE:
1508 nnAssert(mBody);
1509 return mBody->mSuccessfulFinish ? Kind::SIMPLE : Kind::ERROR;
1510 case COMPOUND:
1511 nnAssert(mBody);
1512 return mBody->mSuccessfulFinish ? Kind::COMPOUND : Kind::ERROR;
1513 default:
1514 nnAssert(!"unexpected state");
1515 return Kind::ERROR;
1516 }
1517 }
1518
1519 std::shared_ptr<const Device> ExecutionPlan::forTest_simpleGetDevice() const {
1520 return simple()->mDevice;
1521 }
1522
1523 const std::vector<std::shared_ptr<LogicalStep>>& ExecutionPlan::forTest_compoundGetSteps() const {
1524 return compound()->mSteps;
1525 }
1526
1527 bool ExecutionPlan::forTest_hasStepModelOutputsOfUnknownSize() const {
1528 return mBody->hasStepModelOutputsOfUnknownSize();
1529 }
1530
1531 const uint8_t* ExecutionPlan::forTest_simpleGetCacheToken() const {
1532 return simple()->mToken.getCacheToken();
1533 }
1534
1535 void ExecutionPlan::SimpleBody::dump() const {
1536 VLOG(COMPILATION) << "SIMPLE for " << mDevice->getName();
1537 }
1538
1539 void ExecutionPlan::CompoundBody::dump() const {
1540 for (const auto& step : mSteps) {
1541 step->dump();
1542 }
1543 }
1544
1545 void ExecutionPlan::SimpleBody::forEachStepRoleOfInput(uint32_t index,
1546 const StepRoleCallback& callback) const {
1547 callback(mPreparedModel.get(), IOType::INPUT, index);
1548 }
1549
1550 void ExecutionPlan::SimpleBody::forEachStepRoleOfOutput(uint32_t index,
1551 const StepRoleCallback& callback) const {
1552 callback(mPreparedModel.get(), IOType::OUTPUT, index);
1553 }
1554
1555 // Map an input role of the main model to the input/output roles in the step models:
1556 // - An input role of the main model may be used as an input of multiple step models.
1557 // - An input role of the main model should not be used as an output of any step model.
1558 void ExecutionPlan::CompoundBody::forEachStepRoleOfInput(uint32_t index,
1559 const StepRoleCallback& callback) const {
1560 for (const auto& logicalStep : mSteps) {
1561 if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
1562 // Model input as step model input.
1563 const auto& inputMapping = step->getInputIndexStepModelToMainModel();
1564 for (uint32_t i = 0; i < inputMapping.size(); i++) {
1565 if (inputMapping[i] == index) {
1566 callback(step->getPreparedStepModel().get(), IOType::INPUT, i);
1567 }
1568 }
1569 }
1570 }
1571 }
1572
1573 // Map an output role of the main model to the input/output roles in the step models:
1574 // - An output role of the main model may only be used as one output of one single step model.
1575 // - An output role of the main model may be used as an input of multiple step models.
1576 void ExecutionPlan::CompoundBody::forEachStepRoleOfOutput(uint32_t index,
1577 const StepRoleCallback& callback) const {
1578 bool found = false;
1579 for (const auto& logicalStep : mSteps) {
1580 if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
1581 // Model output as step model output.
1582 if (!found) {
1583 const auto& outputMapping = step->getOutputIndexStepModelToMainModel();
1584 for (uint32_t i = 0; i < outputMapping.size(); i++) {
1585 if (outputMapping[i] == index) {
1586 callback(step->getPreparedStepModel().get(), IOType::OUTPUT, i);
1587 found = true;
1588 break;
1589 }
1590 }
1591 }
1592 // Model output as step model input.
1593 const auto& inputToOutputMapping = step->getOutputsAsStepModelInputsIndexToMainModel();
1594 for (uint32_t i = 0; i < inputToOutputMapping.size(); i++) {
1595 if (inputToOutputMapping[i] == index) {
1596 callback(step->getPreparedStepModel().get(), IOType::INPUT, i);
1597 }
1598 }
1599 }
1600 }
1601 }
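// Usage sketch for the traversal above: collect every step-model role fed by
// one main-model output. Illustrative only -- collectOutputRoles is a
// hypothetical helper, and it assumes CompoundBody and the StepRoleCallback
// parameter types (prepared model pointer, IOType, index) are accessible here.
[[maybe_unused]] static void collectOutputRoles(
        const ExecutionPlan::CompoundBody& body, uint32_t outputIndex,
        std::vector<std::pair<IOType, uint32_t>>* roles) {
    body.forEachStepRoleOfOutput(outputIndex,
                                 [roles](const PreparedModel*, IOType type, uint32_t index) {
                                     roles->emplace_back(type, index);
                                 });
}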
1602
1603 int ModelBuilder::partitionTheWork(const std::vector<std::shared_ptr<Device>>& devices,
1604 uint32_t preference, uint32_t priority,
1605 const std::optional<Deadline>& deadline,
1606 ExecutionPlan* plan) const {
1607 uint32_t sourceModelIndex = plan->getSourceModels().addModel(this);
1608 NN_RETURN_IF_ERROR(partitionTheWorkInternal(sourceModelIndex, devices, preference, priority,
1609 deadline, plan));
1610 int n = plan->finish(preference, priority, deadline);
1611 if (VLOG_IS_ON(COMPILATION)) {
1612 VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: source model: ";
1613 logModelToInfo(makeHidlModel());
1614 plan->dump();
1615 }
1616 return n;
1617 }
1618
1619 int ModelBuilder::partitionTheWorkInternal(uint32_t sourceModelIndex,
1620 const std::vector<std::shared_ptr<Device>>& devices,
1621 uint32_t preference, uint32_t priority,
1622 const std::optional<Deadline>& deadline,
1623 ExecutionPlan* plan) const {
1624 // This function uses a heuristic approach to partitioning the graph.
1625 // It should be good enough for the first release.
1626
1627 SourceModels* sourceModels = &plan->getSourceModels();
1628 const size_t deviceCount = devices.size();
1629 const size_t operationCount = mOperations.size();
1630
1631 VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: "
1632 << "sourceModelIndex = " << sourceModelIndex << ", "
1633 << "deviceCount = " << deviceCount << ", "
1634 << "operationCount = " << operationCount;
1635
1636 // Figure out where each operation will best execute.
1637 // The value of the vector is the index in the devices vector.
1638 std::vector<int> bestDeviceForOperation(operationCount);
1639 NN_RETURN_IF_ERROR(
1640 findBestDeviceForEachOperation(preference, devices, &bestDeviceForOperation));
1641
1642 // A special value produced by findBestDeviceForEachOperation meaning that
1643 // this is a control flow operation scheduled for interpreted execution
1644 // (see LogicalStep).
1645 const int kControlFlowInterpreter = deviceCount;
1646
1647 // If one device will run all the operations, we don't need to split the
1648 // work. This shortcut does not apply when recursively partitioning
1649 // referenced models because our plan representation is flat.
1650 if (sourceModelIndex == kMainModelInSourceModels &&
1651 std::adjacent_find(bestDeviceForOperation.begin(), bestDeviceForOperation.end(),
1652 std::not_equal_to<int>()) == bestDeviceForOperation.end()) {
1653 const int bestDeviceIndex = bestDeviceForOperation[0];
1654 // Bypass the partitioning process unless the only operation is a
1655 // control flow operation scheduled for interpreted execution.
1656 if (bestDeviceIndex != kControlFlowInterpreter) {
1657 VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: only one best device: "
1658 << bestDeviceIndex << " = " << devices[bestDeviceIndex]->getName();
1659 plan->becomeSingleStep(devices[bestDeviceIndex], this);
1660 return ANEURALNETWORKS_NO_ERROR;
1661 }
1662 }
1663
1664 // No easy solution; we need to split the work.
1665
1666 // We keep track of the operations that are ready to run for each device.
1667 // perDeviceQueue[deviceCount] is for interpreted execution of control flow
1668 // (see LogicalStep).
1669 std::vector<std::queue<uint32_t>> perDeviceQueue(deviceCount + 1);
1670
1671 // This helper function enqueues the operation on the appropriate queue.
1672 auto enqueueOnAppropriateDevice = [&](uint32_t operationIndex) {
1673 int deviceIndex = bestDeviceForOperation[operationIndex];
1674 perDeviceQueue[deviceIndex].push(operationIndex);
1675 VLOG(COMPILATION) << "enqueueOnAppropriateDevice " << operationIndex << " onto "
1676 << deviceIndex;
1677 };
1678
1679 // This helper function finds a device that has operations ready to process.
1680 // We start by looking at the control flow queue, and then look at the
1681 // devices in reverse order (i.e., starting at the end of the devices
1682 // vector). Earlier devices have a chance to prepare more of the inputs
1683 // required by other devices. This function returns -1 if all queues are
1684 // empty.
1685 auto findNextDeviceToProcess = [&]() -> int {
1686 for (int i = perDeviceQueue.size() - 1; i >= 0; i--) {
1687 if (!perDeviceQueue[i].empty()) {
1688 return i;
1689 }
1690 }
1691 return -1;
1692 };
1693
1694 OperandTracker tracker(this, enqueueOnAppropriateDevice);
1695 // For each iteration of this loop, we'll create an execution step.
1696 while (true) {
1697 // Find the device we'll do this step for.
1698 int deviceIndex = findNextDeviceToProcess();
1699 VLOG(COMPILATION) << "findNextDeviceToProcess: " << deviceIndex;
1700 if (deviceIndex < 0) {
1701 break;
1702 }
1703
1704 // Assign as much as possible to this device.
1705 auto& queue = perDeviceQueue[deviceIndex];
1706 if (deviceIndex != kControlFlowInterpreter) {
1707 ExecutionStep* step =
1708 plan->createNewExecutionStep(sourceModelIndex, devices[deviceIndex]);
1709 while (!queue.empty()) {
1710 uint32_t operationIndex = queue.front();
1711 queue.pop();
1712 int n = step->addOperation(operationIndex);
1713 if (n != ANEURALNETWORKS_NO_ERROR) {
1714 LOG(ERROR) << "failed to add operation " << operationIndex << " to step";
1715 return n;
1716 }
1717 tracker.markProcessed(operationIndex, enqueueOnAppropriateDevice);
1718 }
1719 } else {
1720 while (!queue.empty()) {
1721 uint32_t operationIndex = queue.front();
1722 queue.pop();
1723 const Operation& operation = getOperation(operationIndex);
1724 if (operation.type == OperationType::IF) {
1725 namespace op = operation_if;
1726 const Operand& thenOperand =
1727 getOperand(operation.inputs[op::kThenModelOperand]);
1728 const Operand& elseOperand =
1729 getOperand(operation.inputs[op::kElseModelOperand]);
1730 const ModelBuilder* thenModel = getReferencedModel(thenOperand);
1731 const ModelBuilder* elseModel = getReferencedModel(elseOperand);
1732 uint32_t thenModelIndex = sourceModels->addModel(thenModel);
1733 uint32_t elseModelIndex = sourceModels->addModel(elseModel);
1734
1735 // Emits the following:
1736 // Index Step
1737 // i if then=(i + 1) else=(j + 1)
1738 // ... (then model steps)
1739 // j goto k
1740 // ... (else model steps)
1741 // k (steps after the IF)
1742 IfStep* ifStep = plan->createNewIfStep();
1743 ifStep->conditionOperandIndex = SourceOperandIndex(
1744 sourceModelIndex, operation.inputs[op::kCondBoolOperand]);
1745 ifStep->thenStepIndex = plan->getNextStepIndex();
1746 NN_RETURN_IF_ERROR(thenModel->partitionTheWorkInternal(
1747 thenModelIndex, devices, preference, priority, deadline, plan));
1748 GotoStep* afterThenBranch = plan->createNewGotoStep();
1749 ifStep->elseStepIndex = plan->getNextStepIndex();
1750 NN_RETURN_IF_ERROR(elseModel->partitionTheWorkInternal(
1751 elseModelIndex, devices, preference, priority, deadline, plan));
1752 afterThenBranch->gotoStepIndex = plan->getNextStepIndex();
1753
1754 // Outer model operands.
1755 for (uint32_t i = op::kFirstInput, n = operation.inputs.size(); i < n; ++i) {
1756 ifStep->outerInputOperands.emplace_back(sourceModelIndex,
1757 operation.inputs[i]);
1758 }
1759 for (uint32_t i = 0, n = operation.outputs.size(); i < n; ++i) {
1760 ifStep->outerOutputOperands.emplace_back(sourceModelIndex,
1761 operation.outputs[i]);
1762 }
1763 // Then model operands.
1764 for (uint32_t i = 0, n = thenModel->inputCount(); i < n; ++i) {
1765 ifStep->thenBranchInputOperands.emplace_back(
1766 thenModelIndex, thenModel->getInputOperandIndex(i));
1767 }
1768 for (uint32_t i = 0, n = thenModel->outputCount(); i < n; ++i) {
1769 ifStep->thenBranchOutputOperands.emplace_back(
1770 thenModelIndex, thenModel->getOutputOperandIndex(i));
1771 }
1772 // Else model operands.
1773 for (uint32_t i = 0, n = elseModel->inputCount(); i < n; ++i) {
1774 ifStep->elseBranchInputOperands.emplace_back(
1775 elseModelIndex, elseModel->getInputOperandIndex(i));
1776 }
1777 for (uint32_t i = 0, n = elseModel->outputCount(); i < n; ++i) {
1778 ifStep->elseBranchOutputOperands.emplace_back(
1779 elseModelIndex, elseModel->getOutputOperandIndex(i));
1780 }
1781 } else if (operation.type == OperationType::WHILE) {
1782 namespace op = operation_while;
1783 const Operand& condOperand =
1784 getOperand(operation.inputs[op::kCondModelOperand]);
1785 const Operand& bodyOperand =
1786 getOperand(operation.inputs[op::kBodyModelOperand]);
1787 const ModelBuilder* condModel = getReferencedModel(condOperand);
1788 const ModelBuilder* bodyModel = getReferencedModel(bodyOperand);
1789 uint32_t condModelIndex = sourceModels->addModel(condModel);
1790 uint32_t bodyModelIndex = sourceModels->addModel(bodyModel);
1791
1792 // Emits the following:
1793 // Index Step
1794 // i while cond=(i + 1) body=(j + 1) exit=(k + 1)
1795 // ... (cond model steps)
1796 // j goto i
1797 // ... (body model steps)
1798 // k goto i
1799 // ... (steps after the WHILE)
1800 //
1801 // Note that WhileStep has WhileState associated with it.
1802 WhileStep* whileStep = plan->createNewWhileStep();
1803 whileStep->condStepIndex = plan->getNextStepIndex();
1804 NN_RETURN_IF_ERROR(condModel->partitionTheWorkInternal(
1805 condModelIndex, devices, preference, priority, deadline, plan));
1806 GotoStep* afterCond = plan->createNewGotoStep();
1807 afterCond->gotoStepIndex = whileStep->index;
1808 whileStep->bodyStepIndex = plan->getNextStepIndex();
1809 NN_RETURN_IF_ERROR(bodyModel->partitionTheWorkInternal(
1810 bodyModelIndex, devices, preference, priority, deadline, plan));
1811 GotoStep* afterBody = plan->createNewGotoStep();
1812 afterBody->gotoStepIndex = whileStep->index;
1813 whileStep->exitStepIndex = plan->getNextStepIndex();
1814
1815 // Outer model operands.
1816 for (uint32_t i = op::kFirstInput, n = operation.inputs.size(); i < n; ++i) {
1817 whileStep->outerInputOperands.emplace_back(sourceModelIndex,
1818 operation.inputs[i]);
1819 }
1820 for (uint32_t i = 0, n = operation.outputs.size(); i < n; ++i) {
1821 whileStep->outerOutputOperands.emplace_back(sourceModelIndex,
1822 operation.outputs[i]);
1823 }
1824 // Cond model operands.
1825 for (uint32_t i = 0, n = condModel->inputCount(); i < n; ++i) {
1826 whileStep->condInputOperands.emplace_back(
1827 condModelIndex, condModel->getInputOperandIndex(i));
1828 }
1829 whileStep->condOutputOperand =
1830 SourceOperandIndex(condModelIndex, condModel->getOutputOperandIndex(0));
1831 // Body model operands.
1832 for (uint32_t i = 0, n = bodyModel->inputCount(); i < n; ++i) {
1833 whileStep->bodyInputOperands.emplace_back(
1834 bodyModelIndex, bodyModel->getInputOperandIndex(i));
1835 }
1836 for (uint32_t i = 0, n = bodyModel->outputCount(); i < n; ++i) {
1837 whileStep->bodyOutputOperands.emplace_back(
1838 bodyModelIndex, bodyModel->getOutputOperandIndex(i));
1839 }
1840 } else {
1841 CHECK(false) << toString(operation.type) << " is not a control flow operation";
1842 }
1843 tracker.markProcessed(operationIndex, enqueueOnAppropriateDevice);
1844 }
1845 }
1846 }
1847 return ANEURALNETWORKS_NO_ERROR;
1848 }
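// The partitioning loop above repeatedly picks the device with ready work and
// drains its queue into one new step. Below is a self-contained sketch of that
// scheduling core (scheduleByDevice is a hypothetical helper); devices are
// plain indices, "steps" are operation-index lists, and the OperandTracker
// bookkeeping that feeds newly-ready operations back into the queues is
// elided.
namespace {
[[maybe_unused]] std::vector<std::vector<uint32_t>> scheduleByDevice(
        std::vector<std::queue<uint32_t>>* perDeviceQueue) {
    std::vector<std::vector<uint32_t>> steps;
    while (true) {
        // Scan from the back, mirroring findNextDeviceToProcess() above.
        int deviceIndex = -1;
        for (int i = static_cast<int>(perDeviceQueue->size()) - 1; i >= 0; --i) {
            if (!(*perDeviceQueue)[i].empty()) {
                deviceIndex = i;
                break;
            }
        }
        if (deviceIndex < 0) break;  // every queue is empty: done
        // Assign as much as possible to this device, as the loop above does.
        std::vector<uint32_t> step;
        auto& queue = (*perDeviceQueue)[deviceIndex];
        while (!queue.empty()) {
            step.push_back(queue.front());
            queue.pop();
        }
        steps.push_back(std::move(step));
    }
    return steps;
}
}  // namespace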
1849
1850 float ModelBuilder::getPerformance(uint32_t preference,
1851 const std::shared_ptr<Device> device) const {
1852 // Note that we will call this method multiple times per compilation with
1853 // the same arguments if there are nested control flow operations and we
1854 // decide to execute the outer operation on the ExecutionPlan::next()
1855 // interpreter.
1856 //
1857 // This is a potential compilation performance problem. To work around it,
1858 // the performance value could be cached for the duration of a compilation.
1859 float perf = 0;
1860 const size_t operationCount = mOperations.size();
1861 for (size_t operationIndex = 0; operationIndex < operationCount; operationIndex++) {
1862 perf += getPerformance(preference, device, operationIndex);
1863 }
1864 return perf;
1865 }
1866
1867 float ModelBuilder::getPerformance(uint32_t preference, const std::shared_ptr<Device> device,
1868 uint32_t operationIndex) const {
1869 auto applyPreference = [preference](const PerformanceInfo& perf) {
1870 return preference == ANEURALNETWORKS_PREFER_LOW_POWER ? perf.powerUsage : perf.execTime;
1871 };
1872
1873 const Operation& operation = getOperation(operationIndex);
1874
1875 if (operation.type == OperationType::IF) {
1876 namespace op = operation_if;
1877 const Operand& thenOperand = getOperand(operation.inputs[op::kThenModelOperand]);
1878 const Operand& elseOperand = getOperand(operation.inputs[op::kElseModelOperand]);
1879 const ModelBuilder* thenModel = getReferencedModel(thenOperand);
1880 const ModelBuilder* elseModel = getReferencedModel(elseOperand);
1881 return applyPreference(device->getIfPerformance()) +
1882 0.5 * (thenModel->getPerformance(preference, device) +
1883 elseModel->getPerformance(preference, device));
1884 }
1885
1886 if (operation.type == OperationType::WHILE) {
1887 namespace op = operation_while;
1888 const Operand& condOperand = getOperand(operation.inputs[op::kCondModelOperand]);
1889 const Operand& bodyOperand = getOperand(operation.inputs[op::kBodyModelOperand]);
1890 const ModelBuilder* condModel = getReferencedModel(condOperand);
1891 const ModelBuilder* bodyModel = getReferencedModel(bodyOperand);
1892 return applyPreference(device->getWhilePerformance()) +
1893 condModel->getPerformance(preference, device) +
1894 bodyModel->getPerformance(preference, device);
1895 }
1896
1897 // TODO This assumes that the type is dictated by the first operand. This is
1898 // currently the case but is not a safe assumption to make in the long term.
1899 const uint32_t operandIndex = operation.inputs[0];
1900 const OperandType operandType = mOperands[operandIndex].type;
1901 switch (operandType) {
1902 case OperandType::FLOAT32:
1903 if (mRelaxComputationFloat32toFloat16) {
1904 return applyPreference(device->getRelaxedFloat32toFloat16PerformanceScalar());
1905 }
1906 break;
1907 case OperandType::TENSOR_FLOAT32:
1908 if (mRelaxComputationFloat32toFloat16) {
1909 return applyPreference(device->getRelaxedFloat32toFloat16PerformanceTensor());
1910 }
1911 break;
1912 default:
1913 break;
1914 }
1915
1916 return applyPreference(device->getPerformance(operandType));
1917 }
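// Worked example for the control flow formulas above, with hypothetical
// numbers: if applyPreference(getIfPerformance()) is 1.0 and the THEN and ELSE
// models sum to 2.0 and 4.0 respectively, the IF scores
// 1.0 + 0.5 * (2.0 + 4.0) = 4.0 -- the branch costs are averaged because only
// one branch executes. A WHILE with getWhilePerformance() at 1.0, cond model
// at 0.5, and body model at 3.0 scores 1.0 + 0.5 + 3.0 = 4.5, since both
// referenced models run on every iteration.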
1918
1919 bool ModelBuilder::isControlFlowOperationWithOperandOfUnknownSize(uint32_t operationIndex) const {
1920 auto containsUnknownSize = [](const ModelBuilder* model,
1921 const std::vector<uint32_t>& operandIndexes) {
1922 for (uint32_t operandIndex : operandIndexes) {
1923 if (hasUnknownSize(model->getOperand(operandIndex))) {
1924 return true;
1925 }
1926 }
1927 return false;
1928 };
1929
1930 const Operation& operation = getOperation(operationIndex);
1931
1932 if (operation.type == OperationType::IF) {
1933 namespace op = operation_if;
1934 const Operand& thenOperand = getOperand(operation.inputs[op::kThenModelOperand]);
1935 const Operand& elseOperand = getOperand(operation.inputs[op::kElseModelOperand]);
1936 const ModelBuilder* thenModel = getReferencedModel(thenOperand);
1937 const ModelBuilder* elseModel = getReferencedModel(elseOperand);
1938 return containsUnknownSize(this, operation.inputs) ||
1939 containsUnknownSize(this, operation.outputs) ||
1940 containsUnknownSize(thenModel, thenModel->getInputOperandIndexes()) ||
1941 containsUnknownSize(thenModel, thenModel->getOutputOperandIndexes()) ||
1942 containsUnknownSize(elseModel, elseModel->getInputOperandIndexes()) ||
1943 containsUnknownSize(elseModel, elseModel->getOutputOperandIndexes());
1944 }
1945
1946 if (operation.type == OperationType::WHILE) {
1947 namespace op = operation_while;
1948 const Operand& condOperand = getOperand(operation.inputs[op::kCondModelOperand]);
1949 const Operand& bodyOperand = getOperand(operation.inputs[op::kBodyModelOperand]);
1950 const ModelBuilder* condModel = getReferencedModel(condOperand);
1951 const ModelBuilder* bodyModel = getReferencedModel(bodyOperand);
1952 return containsUnknownSize(this, operation.inputs) ||
1953 containsUnknownSize(this, operation.outputs) ||
1954 containsUnknownSize(condModel, condModel->getInputOperandIndexes()) ||
1955 containsUnknownSize(condModel, condModel->getOutputOperandIndexes()) ||
1956 containsUnknownSize(bodyModel, bodyModel->getInputOperandIndexes()) ||
1957 containsUnknownSize(bodyModel, bodyModel->getOutputOperandIndexes());
1958 }
1959
1960 // Not a control flow operation.
1961 return false;
1962 }
1963
1964 bool ModelBuilder::supportedByControlFlowInterpreter(uint32_t operationIndex) const {
1965 const Operation& operation = getOperation(operationIndex);
1966 return (operation.type == OperationType::IF || operation.type == OperationType::WHILE) &&
1967 // The partitioner does not support dynamic temporaries (b/132458982).
1968 !isControlFlowOperationWithOperandOfUnknownSize(operationIndex);
1969 }
1970
1971 namespace {
1972
1973 // This class determines whether a given device can execute a given operation.
1974 class CanDo {
1975 public:
1976 CanDo() {}
1977
1978 void initialize(const MetaModel& metaModel, std::shared_ptr<Device> device) {
1979 mSupportsOperationByIndex = device->getSupportedOperations(metaModel);
1980 }
1981
1982 bool check(size_t operationIndex) const { return mSupportsOperationByIndex[operationIndex]; }
1983
1984 private:
1985 std::vector<bool> mSupportsOperationByIndex;
1986 };
1987
1988 } // anonymous namespace
1989
1990 int ModelBuilder::findBestDeviceForEachOperation(
1991 uint32_t preference, const std::vector<std::shared_ptr<Device>>& devices,
1992 std::vector<int>* bestDeviceForOperation) const {
1993 const MetaModel metaModel(makeHidlModel(), DeviceManager::get()->strictSlicing());
1994
1995 const size_t deviceCount = devices.size();
1996 std::vector<CanDo> canDo(deviceCount);
1997 for (size_t deviceIndex = 0; deviceIndex < deviceCount; deviceIndex++) {
1998 canDo[deviceIndex].initialize(metaModel, devices[deviceIndex]);
1999 }
2000
2001 // Figure out the best driver for each operation.
2002 const size_t operationCount = mOperations.size();
2003 for (size_t operationIndex = 0; operationIndex < operationCount; operationIndex++) {
2004 const Operation& operation = getOperation(operationIndex);
2005 // Find which device, including CPU fallback, gives the best performance for this operation.
2006 int bestChoice = -1;
2007
2008 if (isControlFlowOperationWithOperandOfUnknownSize(operationIndex)) {
2009 // Do not schedule control flow operations with unknown size to
2010 // non-CPU devices because this is not supported by the 1.3 HAL.
2011 // See http://b/159076604#comment5.
2012 auto cpuDeviceIterator =
2013 std::find(devices.begin(), devices.end(), DeviceManager::getCpuDevice());
2014 if (cpuDeviceIterator != devices.end()) {
2015 int cpuDeviceIndex = cpuDeviceIterator - devices.begin();
2016 if (canDo[cpuDeviceIndex].check(operationIndex)) {
2017 bestChoice = cpuDeviceIndex;
2018 }
2019 }
2020 } else {
2021 float bestPerfVal = 0.0; // Do not check bestPerfVal if bestChoice < 0.
2022 for (size_t deviceIndex = 0; deviceIndex < deviceCount; deviceIndex++) {
2023 const auto& device = devices[deviceIndex];
2024 if (canDo[deviceIndex].check(operationIndex)) {
2025 const float perfVal = getPerformance(preference, device, operationIndex);
2026 if (bestChoice < 0 || perfVal < bestPerfVal ||
2027 (perfVal == bestPerfVal && device == DeviceManager::getCpuDevice())) {
2028 bestChoice = deviceIndex;
2029 bestPerfVal = perfVal;
2030 }
2031 } else {
2032 // Somewhat noisy logging, but this is the only place where the user of NNAPI can get
2033 // feedback on why an operation was not run on a specific device.
2034 //
2035 // Logs O(operationCount * deviceCount) times, but typically deviceCount is
2036 // very small.
2037 VLOG(COMPILATION) << "Device " << device->getName() << " can't do operation "
2038 << toString(operation.type);
2039 }
2040 }
2041 }
2042
2043 if (bestChoice < 0) {
2044 LOG(ERROR) << "No driver can do operation " << toString(operation.type);
2045 return ANEURALNETWORKS_BAD_DATA;
2046 } else if (devices[bestChoice] == DeviceManager::getCpuDevice() &&
2047 supportedByControlFlowInterpreter(operationIndex)) {
2048 // Run control flow on the ExecutionPlan::next() interpreter and try
2049 // to delegate referenced models.
2050 const int kControlFlowInterpreter = deviceCount;
2051 (*bestDeviceForOperation)[operationIndex] = kControlFlowInterpreter;
2052 VLOG(COMPILATION) << "ModelBuilder::findBestDeviceForEachOperation("
2053 << toString(operation.type) << ") = -1"
2054 << " (NNAPI)";
2055 } else {
2056 (*bestDeviceForOperation)[operationIndex] = bestChoice;
2057 VLOG(COMPILATION) << "ModelBuilder::findBestDeviceForEachOperation("
2058 << toString(operation.type) << ") = " << bestChoice << " ("
2059 << devices[bestChoice]->getName() << ")";
2060 }
2061 }
2062 return ANEURALNETWORKS_NO_ERROR;
2063 }
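// Illustrative restatement of the selection rule above: among the devices that
// can run an operation, take the lowest performance value and break exact ties
// in favor of a designated (CPU) index. pickBestDevice is a hypothetical
// helper, not used by the implementation.
namespace {
[[maybe_unused]] int pickBestDevice(const std::vector<float>& perfVals, int cpuIndex) {
    int bestChoice = -1;
    float bestPerfVal = 0.0f;  // only meaningful once bestChoice >= 0
    for (size_t i = 0; i < perfVals.size(); ++i) {
        const float perfVal = perfVals[i];
        if (bestChoice < 0 || perfVal < bestPerfVal ||
            (perfVal == bestPerfVal && static_cast<int>(i) == cpuIndex)) {
            bestChoice = static_cast<int>(i);
            bestPerfVal = perfVal;
        }
    }
    return bestChoice;
}
}  // namespace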
2064
2065 } // namespace nn
2066 } // namespace android
2067