1 /*
2  * Copyright (C) 2017 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #define LOG_TAG "ExecutionBuilder"
18 
19 #include "ExecutionBuilder.h"
20 
21 #include <algorithm>
22 #include <limits>
23 #include <memory>
24 #include <mutex>
25 #include <optional>
26 #include <string>
27 #include <thread>
28 #include <tuple>
29 #include <utility>
30 #include <vector>
31 
32 #include "CompilationBuilder.h"
33 #include "ControlFlow.h"
34 #include "CpuExecutor.h"
35 #include "ExecutionBurstController.h"
36 #include "HalInterfaces.h"
37 #include "Manager.h"
38 #include "ModelArgumentInfo.h"
39 #include "ModelBuilder.h"
40 #include "Tracing.h"
41 #include "TypeManager.h"
42 #include "Utils.h"
43 
44 namespace android {
45 namespace nn {
46 
47 using namespace hal;
48 
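// "No timing available" sentinel; UINT64_MAX is the value reported to the client when a
// duration was not measured.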
49 const Timing kNoTiming = {.timeOnDevice = UINT64_MAX, .timeInDriver = UINT64_MAX};
50 
51 static MeasureTiming measureTiming(const ExecutionBuilder* execution) {
52     return execution->measureTiming() ? MeasureTiming::YES : MeasureTiming::NO;
53 }
54 
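// Checks a caller-supplied operand type override against the model operand. The override
// may only specify dimensions that the model left unspecified (0); if no override is
// given, the operand itself must be fully specified unless allowUnspecified is set.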
55 static bool checkDimensionInfo(const Operand& operand, const ANeuralNetworksOperandType* newType,
56                                const char* tag, bool allowUnspecified) {
57     if (newType != nullptr) {
58         const Extension::OperandTypeInformation* info = nullptr;
59         if (isExtensionOperandType(operand.type)) {
60             NN_RET_CHECK(TypeManager::get()->getExtensionOperandTypeInfo(operand.type, &info));
61         }
62         if (validateOperandType(*newType, info, tag, allowUnspecified) !=
63             ANEURALNETWORKS_NO_ERROR) {
64             LOG(ERROR) << tag << ": Invalid newType";
65             return false;
66         }
67         if (operand.dimensions.size() == 0) {
68             return true;
69         }
70         if (operand.dimensions.size() != newType->dimensionCount) {
71             LOG(ERROR) << tag << ": Setting with incompatible dimension count";
72             return false;
73         }
74         for (uint32_t i = 0; i < newType->dimensionCount; i++) {
75             if (operand.dimensions[i] != newType->dimensions[i] && operand.dimensions[i] != 0) {
76                 LOG(ERROR) << tag << ": Overriding a fully specified dimension is disallowed";
77                 return false;
78             }
79         }
80     } else {
81         if (!allowUnspecified && TypeManager::get()->isTensorType(operand.type) &&
82             tensorHasUnspecifiedDimensions(operand)) {
83             LOG(ERROR) << tag << ": Setting with operand type that is not fully specified";
84             return false;
85         }
86     }
87     return true;
88 }
89 
90 ExecutionBuilder::ExecutionBuilder(const CompilationBuilder* compilation)
91     : mCompilation(compilation),
92       mModel(compilation->mModel),
93       mPlan(&compilation->mPlan),
94       mPartitioning(compilation->mPartitioning),
95       mInputs(mModel->inputCount()),
96       mOutputs(mModel->outputCount()) {
97     VLOG(EXECUTION) << "ExecutionBuilder::ExecutionBuilder with " << mInputs.size()
98                     << " inputs and " << mOutputs.size() << " outputs";
99 }
100 
101 const ModelBuilder* ExecutionBuilder::getSourceModel(uint32_t index) const {
102     return mPlan->getSourceModels().getModel(index);
103 }
104 
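// Finished means either finishWithoutSyncFence() has run, or the execution's sync fence
// is no longer active (it has signaled or errored).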
105 bool ExecutionBuilder::isFinished() const {
106     CHECK(!(mFinishedWithoutSyncFence && hasSyncFence()));
107     if (mFinishedWithoutSyncFence) {
108         return true;
109     }
110     if (hasSyncFence()) {
111         auto r = syncWait(mSyncFenceFd, 0);
112         CHECK(r != FenceState::UNKNOWN);
113         return r != FenceState::ACTIVE;
114     }
115     return false;
116 }
117 
118 ExecutionBuilder::Completion ExecutionBuilder::completedWith() const {
119     CHECK(isFinished());
120     if (hasSyncFence()) {
121         auto r = syncWait(mSyncFenceFd, 0);
122         CHECK(r == FenceState::SIGNALED || r == FenceState::ERROR);
123         return (r == FenceState::SIGNALED) ? Completion::NO_ERROR : Completion::OTHER_ERROR;
124     } else {
125         return mCompletionWithoutSyncFence;
126     }
127 }
128 
129 int ExecutionBuilder::setInput(uint32_t index, const ANeuralNetworksOperandType* type,
130                                const void* buffer, size_t length) {
131     if (mStarted) {
132         LOG(ERROR) << "ANeuralNetworksExecution_setInput called after the "
133                       "execution has started.";
134         return ANEURALNETWORKS_BAD_STATE;
135     }
136     uint32_t count = static_cast<uint32_t>(mInputs.size());
137     if (index >= count) {
138         LOG(ERROR) << "ANeuralNetworksExecution_setInput bad index " << index << " " << count;
139         return ANEURALNETWORKS_BAD_DATA;
140     }
141     if (!checkDimensionInfo(mModel->getInputOperand(index), type,
142                             "ANeuralNetworksExecution_setInput", buffer == nullptr)) {
143         return ANEURALNETWORKS_BAD_DATA;
144     }
145     if (length > 0xFFFFFFFF) {
146         LOG(ERROR) << "ANeuralNetworksExecution_setInput input exceeds max length " << length;
147         return ANEURALNETWORKS_BAD_DATA;
148     }
149     uint32_t l = static_cast<uint32_t>(length);
150     if (!mInputs[index].unspecified()) {
151         LOG(ERROR) << "ANeuralNetworksExecution_setInput called when an input has already been "
152                       "provided";
153         return ANEURALNETWORKS_BAD_STATE;
154     }
155     int n;
156     std::tie(n, mInputs[index]) = ModelArgumentInfo::createFromPointer(
157             mModel->getInputOperand(index), type, const_cast<void*>(buffer), l);
158     return n;
159 }
160 
161 int ExecutionBuilder::setInputFromMemory(uint32_t index, const ANeuralNetworksOperandType* type,
162                                          const Memory* memory, size_t offset, size_t length) {
163     // Should be similar to StepExecutor::setInputOrOutputFromMemory()
164 
165     if (mStarted) {
166         LOG(ERROR) << "ANeuralNetworksExecution_setInputFromMemory called after the "
167                       "execution has started.";
168         return ANEURALNETWORKS_BAD_STATE;
169     }
170     uint32_t count = static_cast<uint32_t>(mInputs.size());
171     if (index >= count) {
172         LOG(ERROR) << "ANeuralNetworksExecution_setInputFromMemory bad index " << index << " "
173                    << count;
174         return ANEURALNETWORKS_BAD_DATA;
175     }
176     if (!checkDimensionInfo(mModel->getInputOperand(index), type,
177                             "ANeuralNetworksExecution_setInputFromMemory", false)) {
178         return ANEURALNETWORKS_BAD_DATA;
179     }
180     if (!memory->getValidator().validate(mCompilation, IOType::INPUT, index, type, offset,
181                                          length)) {
182         return ANEURALNETWORKS_BAD_DATA;
183     }
184     // For some types of memory, e.g. MemoryRuntimeAHWB allocated from ANNMemory_createFromDesc, we
185     // allow the client to specify offset == 0 && length == 0 indicating that the entire memory
186     // region is used. We update the length here because the drivers are still expecting a real
187     // length. For other memories that do not allow this semantic, it is checked in
188     // MemoryValidatorBase::validate before reaching here.
189     if (memory->getHidlMemory().valid() && offset == 0 && length == 0) {
190         length = memory->getHidlMemory().size();
191     }
192     // TODO validate the rest
193     uint32_t poolIndex = mMemories.add(memory);
194     if (!mInputs[index].unspecified()) {
195         LOG(ERROR)
196                 << "ANeuralNetworksExecution_setInputFromMemory called when an input has already "
197                    "been provided";
198         return ANEURALNETWORKS_BAD_STATE;
199     }
200     int n;
201     std::tie(n, mInputs[index]) = ModelArgumentInfo::createFromMemory(
202             mModel->getInputOperand(index), type, poolIndex, offset, length);
203     return n;
204 }
205 
206 int ExecutionBuilder::setOutput(uint32_t index, const ANeuralNetworksOperandType* type,
207                                 void* buffer, size_t length) {
208     if (mStarted) {
209         LOG(ERROR) << "ANeuralNetworksExecution_setOutput called after the "
210                       "execution has started.";
211         return ANEURALNETWORKS_BAD_STATE;
212     }
213     uint32_t count = static_cast<uint32_t>(mOutputs.size());
214     if (index >= count) {
215         LOG(ERROR) << "ANeuralNetworksExecution_setOutput bad index " << index << " " << count;
216         return ANEURALNETWORKS_BAD_DATA;
217     }
218     if (!checkDimensionInfo(mModel->getOutputOperand(index), type,
219                             "ANeuralNetworksExecution_setOutput", true)) {
220         return ANEURALNETWORKS_BAD_DATA;
221     }
222     if (length > 0xFFFFFFFF) {
223         LOG(ERROR) << "ANeuralNetworksExecution_setOutput output exceeds max length " << length;
224         return ANEURALNETWORKS_BAD_DATA;
225     }
226     uint32_t l = static_cast<uint32_t>(length);
227     if (!mOutputs[index].unspecified()) {
228         LOG(ERROR) << "ANeuralNetworksExecution_setOutput called when an output has already been "
229                       "provided";
230         return ANEURALNETWORKS_BAD_STATE;
231     }
232     int n;
233     std::tie(n, mOutputs[index]) =
234             ModelArgumentInfo::createFromPointer(mModel->getOutputOperand(index), type, buffer, l);
235     return n;
236 }
237 
238 int ExecutionBuilder::setOutputFromMemory(uint32_t index, const ANeuralNetworksOperandType* type,
239                                           const Memory* memory, size_t offset, size_t length) {
240     // Should be similar to StepExecutor::setInputOrOutputFromMemory()
241 
242     if (mStarted) {
243         LOG(ERROR) << "ANeuralNetworksExecution_setOutputFromMemory called after the "
244                       "execution has started.";
245         return ANEURALNETWORKS_BAD_STATE;
246     }
247     uint32_t count = static_cast<uint32_t>(mOutputs.size());
248     if (index >= count) {
249         LOG(ERROR) << "ANeuralNetworksExecution_setOutputFromMemory bad index " << index << " "
250                    << count;
251         return ANEURALNETWORKS_BAD_DATA;
252     }
253     if (!checkDimensionInfo(mModel->getOutputOperand(index), type,
254                             "ANeuralNetworksExecution_setOutputFromMemory", true)) {
255         return ANEURALNETWORKS_BAD_DATA;
256     }
257     if (!memory->getValidator().validate(mCompilation, IOType::OUTPUT, index, type, offset,
258                                          length)) {
259         return ANEURALNETWORKS_BAD_DATA;
260     }
261     // For some types of memory, e.g. MemoryRuntimeAHWB allocated from ANNMemory_createFromDesc, we
262     // allow the client to specify offset == 0 && length == 0 indicating that the entire memory
263     // region is used. We update the length here because the drivers are still expecting a real
264     // length. For other memories that do not allow this semantic, it is checked in
265     // MemoryValidatorBase::validate before reaching here.
266     if (memory->getHidlMemory().valid() && offset == 0 && length == 0) {
267         length = memory->getHidlMemory().size();
268     }
269     // TODO validate the rest
270     uint32_t poolIndex = mMemories.add(memory);
271     if (!mOutputs[index].unspecified()) {
272         LOG(ERROR) << "ANeuralNetworksExecution_setOutputFromMemory called when an output has "
273                       "already been provided";
274         return ANEURALNETWORKS_BAD_STATE;
275     }
276     int n;
277     std::tie(n, mOutputs[index]) = ModelArgumentInfo::createFromMemory(
278             mModel->getOutputOperand(index), type, poolIndex, offset, length);
279     return n;
280 }
281 
282 int ExecutionBuilder::setMeasureTiming(bool measure) {
283     if (!mCompilation->mExplicitDeviceList || (mCompilation->mDevices.size() != 1)) {
284         LOG(ERROR) << "ANeuralNetworksExecution_setMeasureTiming called on "
285                    << "an ANeuralNetworksExecution created from an ANeuralNetworksCompilation "
286                    << "that was not created by ANeuralNetworksCompilation_createForDevices "
287                    << "with numDevices = 1";
288         return ANEURALNETWORKS_BAD_DATA;
289     }
290     if (mStarted) {
291         LOG(ERROR) << "ANeuralNetworksExecution_setMeasureTiming called after the "
292                       "execution has started.";
293         return ANEURALNETWORKS_BAD_STATE;
294     }
295     mMeasureTiming = measure;
296     return ANEURALNETWORKS_NO_ERROR;
297 }
298 
299 int ExecutionBuilder::getDuration(int32_t durationCode, uint64_t* duration) const {
300     if (!isFinished()) {
301         LOG(ERROR) << "ANeuralNetworksExecution_getDuration called before the "
302                       "execution has finished.";
303         *duration = UINT64_MAX;
304         return ANEURALNETWORKS_BAD_STATE;
305     }
306     if (completedWith() != Completion::NO_ERROR) {
307         LOG(ERROR) << "ANeuralNetworksExecution_getDuration called on an execution "
308                       "that has encountered an error.";
309         *duration = UINT64_MAX;
310         return ANEURALNETWORKS_BAD_STATE;
311     }
312 
313     // NOTE: At the HAL level, timing is in microseconds. At the NDK level, nanoseconds.
314     const uint64_t kNanoPerMicro = 1000;
315 
316     if (!mMeasureTiming) {
317         *duration = UINT64_MAX;
318         return ANEURALNETWORKS_BAD_STATE;
319     }
320 
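    // A fenced execution reports timing through the driver's IFencedExecutionCallback;
    // otherwise use the timing captured when the unfenced execution completed.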
321     Timing timingLaunched = mTimingWithoutFencedExecutionCallback;
322     Timing timingFenced = timingLaunched;
323     if (mFencedExecutionCallback != nullptr) {
324         ErrorStatus status;
325         const Return<void> ret = mFencedExecutionCallback->getExecutionInfo(
326                 [&status, &timingLaunched, &timingFenced](ErrorStatus error, Timing tLaunched,
327                                                           Timing tFenced) {
328                     status = error;
329                     timingLaunched = tLaunched;
330                     timingFenced = tFenced;
331                 });
332         if (!ret.isOk()) {
333             *duration = UINT64_MAX;
334             return ANEURALNETWORKS_OP_FAILED;
335         }
336         if (status != ErrorStatus::NONE) {
337             *duration = UINT64_MAX;
338             return ANEURALNETWORKS_BAD_STATE;
339         }
340     }
341     uint64_t microDuration = UINT64_MAX;
342     switch (durationCode) {
343         case ANEURALNETWORKS_DURATION_ON_HARDWARE:
344             microDuration = timingLaunched.timeOnDevice;
345             break;
346         case ANEURALNETWORKS_DURATION_IN_DRIVER:
347             microDuration = timingLaunched.timeInDriver;
348             break;
349         case ANEURALNETWORKS_FENCED_DURATION_ON_HARDWARE:
350             microDuration = timingFenced.timeOnDevice;
351             break;
352         case ANEURALNETWORKS_FENCED_DURATION_IN_DRIVER:
353             microDuration = timingFenced.timeInDriver;
354             break;
355         default:
356             CHECK(!"unexpected");
357     }
358     *duration = (microDuration == UINT64_MAX) ? UINT64_MAX : kNanoPerMicro * microDuration;
359 
360     VLOG(EXECUTION) << "getDuration(" << durationCode << "): " << *duration;
361     return ANEURALNETWORKS_NO_ERROR;
362 }
363 
364 int ExecutionBuilder::setTimeoutDuration(uint64_t duration) {
365     if (!mCompilation->mExplicitDeviceList || (mCompilation->mDevices.size() != 1)) {
366         LOG(ERROR) << "ANeuralNetworksExecution_setTimeout called on an ANeuralNetworksExecution "
367                       "created from an ANeuralNetworksCompilation that was not created by "
368                       "ANeuralNetworksCompilation_createForDevices with numDevices = 1";
369         return ANEURALNETWORKS_BAD_DATA;
370     }
371     if (mStarted) {
372         LOG(ERROR) << "ANeuralNetworksExecution_setTimeout called after the execution has started.";
373         return ANEURALNETWORKS_BAD_STATE;
374     }
375     if (duration > 0) {
376         mTimeoutDuration = duration;
377     } else {
378         mTimeoutDuration.reset();
379     }
380     return ANEURALNETWORKS_NO_ERROR;
381 }
382 
383 std::optional<uint64_t> ExecutionBuilder::getTimeoutDuration() const {
384     return mTimeoutDuration;
385 }
386 
387 int ExecutionBuilder::setLoopTimeout(uint64_t duration) {
388     if (mStarted) {
389         LOG(ERROR) << "ANeuralNetworksExecution_setLoopTimeout called after the "
390                       "execution has started.";
391         return ANEURALNETWORKS_BAD_STATE;
392     }
393     if (duration > operation_while::kTimeoutNsMaximum) {
394         LOG(WARNING) << "ANeuralNetworksExecution_setLoopTimeout input exceeds the maximum allowed "
395                      << "duration: " << duration << " > " << operation_while::kTimeoutNsMaximum;
396         duration = operation_while::kTimeoutNsMaximum;
397     }
398     mLoopTimeoutDuration = duration;
399     return ANEURALNETWORKS_NO_ERROR;
400 }
401 
402 int ExecutionBuilder::getOutputOperandDimensions(uint32_t index, uint32_t* dimensions) {
403     if (!isFinished()) {
404         LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandDimensions called before the "
405                       "execution has finished.";
406         return ANEURALNETWORKS_BAD_STATE;
407     }
408     if (completedWith() == Completion::OTHER_ERROR) {
409         LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandDimensions called on an execution "
410                       "that has encountered an error.";
411         return ANEURALNETWORKS_BAD_STATE;
412     }
413 
414     uint32_t count = static_cast<uint32_t>(mOutputs.size());
415     if (index >= count) {
416         LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandDimensions bad index " << index
417                    << " " << count;
418         return ANEURALNETWORKS_BAD_DATA;
419     }
420     const auto& dims = mOutputs[index].dimensions();
421     if (dims.empty()) {
422         LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandDimensions cannot query "
423                       "dimensions of a scalar";
424         return ANEURALNETWORKS_BAD_DATA;
425     }
426     std::copy(dims.begin(), dims.end(), dimensions);
427     return mOutputs[index].isSufficient() ? ANEURALNETWORKS_NO_ERROR
428                                           : ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE;
429 }
430 
431 int ExecutionBuilder::getOutputOperandRank(uint32_t index, uint32_t* rank) {
432     if (!isFinished()) {
433         LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandRank called before the "
434                       "execution has finished.";
435         return ANEURALNETWORKS_BAD_STATE;
436     }
437     if (completedWith() == Completion::OTHER_ERROR) {
438         LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandRank called on an execution "
439                       "that has encountered an error.";
440         return ANEURALNETWORKS_BAD_STATE;
441     }
442     uint32_t count = static_cast<uint32_t>(mOutputs.size());
443     if (index >= count) {
444         LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandRank bad index " << index << " "
445                    << count;
446         return ANEURALNETWORKS_BAD_DATA;
447     }
448     *rank = static_cast<uint32_t>(mOutputs[index].dimensions().size());
449     return mOutputs[index].isSufficient() ? ANEURALNETWORKS_NO_ERROR
450                                           : ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE;
451 }
452 
453 // Attempt synchronous execution of full model on CPU.
454 // TODO: How should we handle timing in this case?
455 //       For Q this is irrelevant: We only support timing in conjunction
456 //         with an explicit device list; and we do not support CPU fallback
457 //         with an explicit device list.  See CompilationBuilder::mExplicitDeviceList.
458 static std::tuple<int, std::vector<OutputShape>, Timing> cpuFallbackFull(
459         ExecutionBuilder* executionBuilder) {
460     CHECK(executionBuilder != nullptr);
461     NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "cpuFallbackFull");
462     VLOG(EXECUTION) << "cpuFallbackFull";
463 
464     // Get fallback executor.
465     StepExecutor executor(executionBuilder, executionBuilder->getModel(),
466                           DeviceManager::getCpuDevice(), /*preparedModel=*/nullptr);
467     executor.mapInputsAndOutputsTrivially();
468 
469     // Attempt fallback execution.
470     return executor.computeOnCpuFallback();
471 }
472 
473 // Attempt synchronous execution on CPU.
474 // TODO: How should we handle timing in this case?
475 //       For Q this is irrelevant: We only support timing in conjunction
476 //         with an explicit device list; and we do not support CPU fallback
477 //         with an explicit device list.  See CompilationBuilder::mExplicitDeviceList.
478 static std::tuple<int, std::vector<OutputShape>, Timing, std::shared_ptr<StepExecutor>>
479 cpuFallbackPartial(const ExecutionPlan& plan,
480                    std::shared_ptr<ExecutionPlan::Controller> controller) {
481     NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "cpuFallbackPartial");
482     VLOG(EXECUTION) << "cpuFallbackPartial";
483 
484     // Get fallback executor.
485     std::shared_ptr<StepExecutor> executor;
486     int n1 = plan.fallback(controller, &executor);
487     if (n1 != ANEURALNETWORKS_NO_ERROR) {
488         return {n1, {}, kNoTiming, nullptr};
489     }
490     CHECK(executor != nullptr);
491 
492     // Attempt fallback execution.
493     auto [n2, outputShapes, timing] = executor->computeOnCpuFallback();
494     return {n2, std::move(outputShapes), timing, executor};
495 }
496 
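// Iteratively executes the steps of a partitioned plan. On a step failure this may fall
// back to CPU: first by retrying the failed step on the CPU (partial fallback) and, if
// that also fails, by re-running the entire model on the CPU (full fallback). Results
// are delivered through executionCallback.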
497 static void asyncStartComputePartitioned(ExecutionBuilder* executionBuilder,
498                                          const ExecutionPlan& plan,
499                                          std::shared_ptr<ExecutionPlan::Controller> controller,
500                                          bool allowFallback,
501                                          const std::optional<Deadline>& deadline,
502                                          const sp<ExecutionCallback>& executionCallback) {
503     CHECK(executionBuilder != nullptr);
504     VLOG(EXECUTION) << "ExecutionBuilder::compute (from plan, iteratively)";
505 
506     std::vector<OutputShape> outputShapes = executionBuilder->getInitialOutputShapes();
507     Timing timing = kNoTiming;
508     // Disallow fallback when the ExecutionPlan is simple on CPU.
509     allowFallback &= !plan.isSimpleCpu();
510 
511     while (true) {
512         VLOG(EXECUTION) << "looking for next StepExecutor";
513 
514         // Get the current step of the execution.
515         std::shared_ptr<StepExecutor> executor;
516         std::shared_ptr<ExecutionBurstController> burstController;
517         int n = plan.next(controller, &executor, &burstController);
518         if (n != ANEURALNETWORKS_NO_ERROR) {
519             // During the interpreted execution of control flow, a loop timeout
520             // might occur in ExecutionPlan::next().
521             bool missedDeadline = n == ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT ||
522                                   n == ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT;
523             if (allowFallback && !missedDeadline) break;
524             executionCallback->notify(convertResultCodeToErrorStatus(n), {}, kNoTiming);
525             return;
526         }
527 
528         // If the code reached the end of the plan without error, then return
529         // with no error.
530         if (executor == nullptr) {
531             executionCallback->notify(ErrorStatus::NONE, outputShapes, timing);
532             return;
533         }
534         const bool executorIsCpu = executor->isCpu();
535 
536         // Attempt to execute a single step of the execution.
537         auto [stepN, stepOutputShapes, stepTiming] = executor->compute(deadline, burstController);
538 
539         // Update global outputs.
540         if (!executor->updateOutputShapes(stepOutputShapes, &outputShapes)) {
541             stepN = ANEURALNETWORKS_OP_FAILED;
542         }
543 
544         // If execution was successful, continue to next step.
545         if (stepN == ANEURALNETWORKS_NO_ERROR) {
546             // We only support collection of timing information in the case of a
547             // single step, so it's safe to just keep track of the last step's
548             // timing information.
549             timing = stepTiming;
550             continue;
551         }
552 
553         // OUTPUT_INSUFFICIENT_SIZE is not recoverable, so end execution.
554         if (stepN == ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE) {
555             const ErrorStatus stepStatus = convertResultCodeToErrorStatus(stepN);
556             executionCallback->notify(stepStatus, outputShapes, kNoTiming);
557             return;
558         }
559 
560         // If fallback is not allowed and there was an error, end execution.
561         if (!allowFallback) {
562             const ErrorStatus stepStatus = convertResultCodeToErrorStatus(stepN);
563             executionCallback->notify(stepStatus, {}, kNoTiming);
564             return;
565         }
566 
567         // If CPU execution was already attempted, either:
568         // (1) perform a full fallback if the plan is not simple, or
569         // (2) return from the function with an error
570         if (executorIsCpu) {
571             if (!plan.isSimple()) break;
572             executionCallback->notify(convertResultCodeToErrorStatus(stepN), {}, kNoTiming);
573             return;
574         }
575 
576         // If the code reaches this point, attempt a partial fallback to CPU.
577         CHECK(allowFallback);
578         auto [fallbackN, fallbackOutputShapes, fallbackTiming, fallbackExecutor] =
579                 cpuFallbackPartial(plan, controller);
580 
581         // Update global outputs.
582         if (fallbackExecutor != nullptr &&
583             !fallbackExecutor->updateOutputShapes(fallbackOutputShapes, &outputShapes)) {
584             fallbackN = ANEURALNETWORKS_OP_FAILED;
585         }
586 
587         // If execution was successful, continue to next step.
588         if (fallbackN == ANEURALNETWORKS_NO_ERROR) {
589             // We only support collection of timing information in the case of a
590             // single step, so it's safe to just keep track of the last step's
591             // timing information.
592             timing = fallbackTiming;
593             continue;
594         }
595 
596         // OUTPUT_INSUFFICIENT_SIZE is not recoverable, so end execution.
597         if (fallbackN == ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE) {
598             const ErrorStatus fallbackStatus = convertResultCodeToErrorStatus(fallbackN);
599             executionCallback->notify(fallbackStatus, outputShapes, kNoTiming);
600             return;
601         }
602 
603         // Do not fallback twice if the ExecutionPlan is simple.
604         if (plan.isSimple()) {
605             const ErrorStatus fallbackStatus = convertResultCodeToErrorStatus(fallbackN);
606             executionCallback->notify(fallbackStatus, {}, kNoTiming);
607             return;
608         }
609 
610         // If the code reaches this point, then there was an error with the
611         // fallback. In this case, attempt full fallback.
612         break;
613     }
614 
615     // If the code has reached this point, a potentially recoverable error
616     // occurred during the step executions. Instead, do a full execution
617     // fallback on the CPU.
618     auto [fullN, fullOutputShapes, fullTiming] = cpuFallbackFull(executionBuilder);
619     const ErrorStatus fullStatus = convertResultCodeToErrorStatus(fullN);
620     executionCallback->notify(fullStatus, fullOutputShapes, fullTiming);
621 }
622 
623 // In case of partitioned execution, the startComputeFenced call returns the sync
624 // fence and the fenced compute callback produced by the last partition.
625 // Any failed partition results in the whole execution falling back to CPU if
626 // allowFallback is set to true.
627 static std::tuple<int, int, sp<hal::IFencedExecutionCallback>> startComputeFenced(
628         ExecutionBuilder* executionBuilder, const ExecutionPlan& plan,
629         std::shared_ptr<ExecutionPlan::Controller> controller, const std::vector<int>& waitFor,
630         uint64_t timeoutDurationAfterFence, const std::optional<Deadline>& deadline,
631         bool allowFallback) {
632     CHECK(executionBuilder != nullptr);
633     VLOG(EXECUTION) << "ExecutionBuilder::computeFenced (from plan, iteratively)";
634     // Disallow fallback when the ExecutionPlan is simple on CPU.
635     allowFallback &= !plan.isSimpleCpu();
636 
637     // Initialize waitForFds and syncFence for the first step.
638     std::vector<int> waitForFds = waitFor;
639     int syncFence = -1;
640     sp<hal::IFencedExecutionCallback> computeFencedCallback;
641 
642     while (true) {
643         VLOG(EXECUTION) << "looking for next StepExecutor";
644 
645         // Get the current step of the execution.
646         std::shared_ptr<StepExecutor> executor;
647         int n = plan.next(controller, &executor, nullptr, syncFence);
648         if (n != ANEURALNETWORKS_NO_ERROR) {
649             // During the interpreted execution of control flow, a loop timeout
650             // might occur in ExecutionPlan::next().
651             bool missedDeadline = n == ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT ||
652                                   n == ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT;
653             if (allowFallback && !missedDeadline) break;
654             // Return -1 for the sync fence fd, and nullptr for the callback.
655             return std::make_tuple(n, -1, nullptr);
656         }
657 
658         // If the code reached the end of the plan without error, then return
659         // with no error.
660         if (executor == nullptr) {
661             // If the final step returns a -1 for sync fence, the execution is finished.
662             // Update the output shapes.
663             if (syncFence == -1) {
664                 // TODO(miaowang): support dynamic output shape only with memory domain.
665                 // For now just return the initial output shapes.
666                 executionBuilder->finishWithoutSyncFence(
667                         ErrorStatus::NONE, executionBuilder->getInitialOutputShapes());
668             }
669             return std::make_tuple(ANEURALNETWORKS_NO_ERROR, syncFence, computeFencedCallback);
670         }
671         const bool executorIsCpu = executor->isCpu();
672 
673         // Attempt to execute a single step of the execution.
674         auto [stepN, syncFd, callback] =
675                 executor->computeFenced(waitForFds, timeoutDurationAfterFence, deadline);
676 
677         // Update waitForFds, syncFence for the next step.
678         syncFence = syncFd;
679         computeFencedCallback = callback;
680         waitForFds.clear();
681         if (syncFd > 0) {
682             waitForFds = {syncFd};
683         }
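        // Each subsequent step waits on the sync fence produced by the previous step,
        // chaining the partitions without blocking the calling thread.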
684 
685         // If execution was successful, continue to next step.
686         if (stepN == ANEURALNETWORKS_NO_ERROR) {
687             continue;
688         }
689         // If fallback is not allowed and there was an error, end execution.
690         if (!allowFallback) {
691             return std::make_tuple(stepN, -1, nullptr);
692         }
693 
694         // If CPU execution was already attempted, either:
695         // (1) perform a full fallback if the plan is not simple, or
696         // (2) return from the function with an error
697         if (executorIsCpu) {
698             if (!plan.isSimple()) break;
699             return std::make_tuple(stepN, -1, nullptr);
700         }
701         // If the code reaches this point, then there was an error executing a
702         // non-CPU step. In this case, attempt a full CPU fallback.
703         break;
704     }
705 
706     // If the code has reached this point, a potentially recoverable error
707     // occurred during the step executions. Instead, do a full execution
708     // fallback on the CPU.
709     VLOG(EXECUTION) << "Performing full fallback on the CPU.";
710     for (int syncFd : waitFor) {
711         if (syncFd > 0) {
712             auto r = syncWait(syncFd, -1);
713             if (r != FenceState::SIGNALED) {
714                 VLOG(EXECUTION) << "syncWait failed, fd: " << syncFd;
715                 return std::make_tuple(ANEURALNETWORKS_OP_FAILED, -1, nullptr);
716             }
717         }
718     }
719     auto [fullN, fullOutputShapes, fullTiming] = cpuFallbackFull(executionBuilder);
720     const ErrorStatus fullStatus = convertResultCodeToErrorStatus(fullN);
721     syncFence = -1;
722     executionBuilder->finishWithoutSyncFence(fullStatus, fullOutputShapes);
723     executionBuilder->reportTimingWithoutFencedExecutionCallback(fullTiming);
724     return std::make_tuple(fullN, syncFence, nullptr);
725 }
726 
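// Entry point for ANeuralNetworksExecution_startComputeWithDependencies. Validates the
// execution state and arguments, then launches the fenced computation over the plan; on
// success *syncFence is set to the resulting sync fence fd (-1 if the execution already
// completed without one).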
727 int ExecutionBuilder::computeFenced(const std::vector<int>& waitFor,
728                                     uint64_t timeoutDurationAfterFence, int* syncFence) {
729     CHECK(syncFence != nullptr);
730     if (mStarted) {
731         LOG(ERROR) << "ANeuralNetworksExecution_startComputeWithDependencies"
732                       " called on an execution that has already started";
733         return ANEURALNETWORKS_BAD_STATE;
734     }
735     if (timeoutDurationAfterFence > 0) {
736         if (!mCompilation->mExplicitDeviceList || (mCompilation->mDevices.size() != 1)) {
737             LOG(ERROR)
738                     << "ANeuralNetworksExecution_startComputeWithDependencies called with non-zero "
739                        "duration on an ANeuralNetworksExecution "
740                        "created from an ANeuralNetworksCompilation that was not created by "
741                        "ANeuralNetworksCompilation_createForDevices with numDevices = 1";
742             return ANEURALNETWORKS_BAD_DATA;
743         }
744     }
745     const auto deadline = makeDeadline(mTimeoutDuration);
746     for (auto& p : mInputs) {
747         if (p.state() == ModelArgumentInfo::UNSPECIFIED) {
748             LOG(ERROR) << "ANeuralNetworksExecution_startComputeWithDependencies"
749                           " not all inputs specified";
750             return ANEURALNETWORKS_BAD_DATA;
751         }
752     }
753     for (auto& p : mOutputs) {
754         if (p.state() == ModelArgumentInfo::UNSPECIFIED) {
755             LOG(ERROR) << "ANeuralNetworksExecution_startComputeWithDependencies"
756                           " not all outputs specified";
757             return ANEURALNETWORKS_BAD_DATA;
758         }
759     }
760     for (uint32_t i = 0; i < mOutputs.size(); i++) {
761         if (mOutputs[i].state() != ModelArgumentInfo::HAS_NO_VALUE &&
762             !checkDimensionInfo(mModel->getOutputOperand(i), nullptr,
763                                 "ANeuralNetworksExecution_startComputeWithDependencies", false)) {
764             LOG(ERROR) << "ANeuralNetworksExecution_startComputeWithDependencies"
765                           " not all outputs have fully specified dimensions";
766             return ANEURALNETWORKS_BAD_DATA;
767         }
768     }
769     mStarted = true;
770     const bool allowFallback = DeviceManager::partitioningAllowsFallback(mPartitioning);
771     std::shared_ptr<ExecutionPlan::Controller> controller = mPlan->makeController(this, nullptr);
772     VLOG(EXECUTION) << "ExecutionBuilder::computeFenced";
773     int result;
774     std::tie(result, mSyncFenceFd, mFencedExecutionCallback) = startComputeFenced(
775             this, *mPlan, controller, waitFor, timeoutDurationAfterFence, deadline, allowFallback);
776     *syncFence = mSyncFenceFd;
777     return result;
778 }
779 
780 int ExecutionBuilder::compute(sp<ExecutionCallback>* synchronizationCallback,
781                               BurstBuilder* burstBuilder) {
782     CHECK(synchronizationCallback == nullptr || burstBuilder == nullptr)
783             << "synchronizationCallback and burstBuilder cannot simultaneously be used";
784 
785     const bool synchronous = (synchronizationCallback == nullptr);
786     if (!synchronous) {
787         *synchronizationCallback = nullptr;
788     }
789 
790     const auto deadline = makeDeadline(mTimeoutDuration);
791 
792     // TODO validate that we have full types for all inputs and outputs,
793     // that the graph is not cyclic,
794 
795     auto name = [synchronous, burstBuilder] {
796         return burstBuilder ? "burstCompute" : synchronous ? "compute" : "startCompute";
797     };
798     if (mStarted) {
799         LOG(ERROR) << "ANeuralNetworksExecution_" << name()
800                    << " called on an execution that has already started";
801         return ANEURALNETWORKS_BAD_STATE;
802     }
803     for (auto& p : mInputs) {
804         if (p.state() == ModelArgumentInfo::UNSPECIFIED) {
805             LOG(ERROR) << "ANeuralNetworksExecution_" << name() << " not all inputs specified";
806             return ANEURALNETWORKS_BAD_DATA;
807         } else if (p.state() == ModelArgumentInfo::MEMORY) {
808             const Memory* memory = mMemories[p.locationAndLength().poolIndex];
809             if (!memory->getValidator().validateInputDimensions(p.dimensions())) {
810                 return ANEURALNETWORKS_OP_FAILED;
811             }
812         }
813     }
814     for (auto& p : mOutputs) {
815         if (p.state() == ModelArgumentInfo::UNSPECIFIED) {
816             LOG(ERROR) << "ANeuralNetworksExecution_" << name() << " not all outputs specified";
817             return ANEURALNETWORKS_BAD_DATA;
818         }
819     }
820 
821     auto wrappedFinish = [this](ErrorStatus error, const std::vector<OutputShape>& outputShapes) {
822         return finishWithoutSyncFence(error, outputShapes);
823     };
824 
825     // TODO: For asynchronous execution, entire plan-based-path should run in an
826     // asynchronous thread -- take the asynchronous thread logic out of
827     // CpuPreparedModel::execute() and use it to wrap the plan-based-path.
828     mStarted = true;
829     const bool allowFallback = DeviceManager::partitioningAllowsFallback(mPartitioning);
830     std::shared_ptr<ExecutionPlan::Controller> controller =
831             mPlan->makeController(this, burstBuilder);
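    // Both the synchronous and asynchronous paths funnel into
    // asyncStartComputePartitioned(); the synchronous path simply runs it on the calling
    // thread and waits on a local callback for the result.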
832     if (synchronous) {
833         VLOG(EXECUTION) << "ExecutionBuilder::compute (synchronous API)";
834         sp<ExecutionCallback> localSynchronizationCallback = new ExecutionCallback();
835         localSynchronizationCallback->setOnFinish(wrappedFinish);
836         asyncStartComputePartitioned(this, *mPlan, controller, allowFallback, deadline,
837                                      localSynchronizationCallback);
838         localSynchronizationCallback->wait();
839         if (mMeasureTiming) {
840             mTimingWithoutFencedExecutionCallback = localSynchronizationCallback->getTiming();
841         }
842         return convertErrorStatusToResultCode(localSynchronizationCallback->getStatus());
843     } else /* asynchronous */ {
844         // TODO: use a thread pool
845         // TODO(mikie): this could have NNTRACE so we could measure the overhead
846         //              of spinning up a new thread.
847 
848         // Prepare the callback for asynchronous execution.
849         // sp<ExecutionCallback> object is returned when the
850         // execution has been successfully launched, otherwise a
851         // nullptr is returned.  The executionCallback is
852         // abstracted in the NN API as an "event".
853         sp<ExecutionCallback> executionCallback = new ExecutionCallback();
854         executionCallback->setOnFinish(wrappedFinish);
855         if (DeviceManager::get()->syncExecRuntime()) {
856             VLOG(EXECUTION) << "ExecutionBuilder::compute (asynchronous API, non-threaded)";
857             asyncStartComputePartitioned(this, *mPlan, controller, allowFallback, deadline,
858                                          executionCallback);
859         } else {
860             VLOG(EXECUTION) << "ExecutionBuilder::compute (asynchronous API)";
861             std::thread asyncExecution(
862                     [this, controller, allowFallback, deadline, executionCallback] {
863                         asyncStartComputePartitioned(this, *mPlan, controller, allowFallback,
864                                                      deadline, executionCallback);
865                     });
866             executionCallback->bindThread(std::move(asyncExecution));
867         }
868         *synchronizationCallback = executionCallback;
869         return ANEURALNETWORKS_NO_ERROR;
870     }
871 }
872 
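// The initial output shapes are whatever dimensions the caller supplied for each output
// (empty for outputs with no value), optimistically marked as sufficient.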
873 std::vector<OutputShape> ExecutionBuilder::getInitialOutputShapes() const {
874     std::vector<OutputShape> outputShapes(mOutputs.size());
875     std::transform(mOutputs.begin(), mOutputs.end(), outputShapes.begin(),
876                    [](const auto& x) -> OutputShape {
877                        hidl_vec<uint32_t> dimensions;
878                        if (x.state() != ModelArgumentInfo::HAS_NO_VALUE) {
879                            dimensions = x.dimensions();
880                        }
881                        return {.dimensions = std::move(dimensions), .isSufficient = true};
882                    });
883     return outputShapes;
884 }
885 
886 // Check whether the dimensions "to" can be updated by the dimensions "from", where
887 // "from" must have a higher specification level.
888 static bool isUpdatable(const std::vector<uint32_t>& to, const std::vector<uint32_t>& from) {
889     if (to.size() == 0) return true;
890     NN_RET_CHECK_EQ(to.size(), from.size());
891     for (uint32_t i = 0; i < to.size(); i++) {
892         NN_RET_CHECK(to[i] == from[i] || to[i] == 0);
893     }
894     return true;
895 }
896 
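// Records the output shapes reported by an execution, refusing any update that would
// overwrite an already fully specified dimension or overflow a uint32_t data size.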
897 bool ExecutionBuilder::updateOutputShapes(const std::vector<OutputShape>& outputShapes) {
898     if (outputShapes.size() == 0) {
899         return true;
900     }
901     NN_RET_CHECK_EQ(outputShapes.size(), mOutputs.size());
902     for (uint32_t i = 0; i < outputShapes.size(); i++) {
903         // Check if only unspecified dimensions or rank are overwritten.
904         NN_RET_CHECK(isUpdatable(mOutputs[i].dimensions(), outputShapes[i].dimensions));
905         const OperandType operandType = mModel->getOutputOperand(i).type;
906         NN_RET_CHECK(!TypeManager::get()->sizeOfDataOverflowsUInt32(operandType,
907                                                                     outputShapes[i].dimensions));
908     }
909     for (uint32_t i = 0; i < outputShapes.size(); i++) {
910         mOutputs[i].dimensions() = outputShapes[i].dimensions;
911         mOutputs[i].isSufficient() = outputShapes[i].isSufficient;
912     }
913     return true;
914 }
915 
916 bool ExecutionBuilder::updateMemories() {
917     for (const auto& output : mOutputs) {
918         if (output.state() != ModelArgumentInfo::MEMORY) continue;
919         const Memory* memory = mMemories[output.locationAndLength().poolIndex];
920         NN_RET_CHECK(memory->getValidator().updateMetadata({.dimensions = output.dimensions()}));
921     }
922     return true;
923 }
924 
925 ErrorStatus ExecutionBuilder::finishWithoutSyncFence(ErrorStatus status,
926                                                      const std::vector<OutputShape>& outputShapes) {
927     CHECK(!mFinishedWithoutSyncFence) << "ExecutionBuilder::finishWithoutSyncFence is called twice";
928     CHECK(!hasSyncFence())
929             << "ExecutionBuilder::finishWithoutSyncFence is called when hasSyncFence()";
930     if (!updateOutputShapes(outputShapes) || !updateMemories()) {
931         status = ErrorStatus::GENERAL_FAILURE;
932     }
933     bool success = status == ErrorStatus::NONE;
934     for (const auto& output : mOutputs) {
935         if (output.state() != ModelArgumentInfo::MEMORY) continue;
936         const Memory* memory = mMemories[output.locationAndLength().poolIndex];
937         memory->getValidator().setInitialized(success);
938     }
939     switch (convertErrorStatusToResultCode(status)) {
940         case ANEURALNETWORKS_NO_ERROR:
941             mCompletionWithoutSyncFence = Completion::NO_ERROR;
942             break;
943         case ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE:
944             mCompletionWithoutSyncFence = Completion::OUTPUT_INSUFFICIENT_SIZE;
945             break;
946         default:
947             mCompletionWithoutSyncFence = Completion::OTHER_ERROR;
948             break;
949     }
950     mFinishedWithoutSyncFence = true;
951     return status;
952 }
953 
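// Propagates step-model output shapes back to the main-model output shapes, using the
// ExecutionStep's index mapping when this executor runs a partition (and a one-to-one
// mapping when it runs the whole model).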
954 bool StepExecutor::updateOutputShapes(const std::vector<OutputShape>& from,
955                                       std::vector<OutputShape>* to) {
956     if (from.size() == 0) {
957         return true;
958     }
959     if (mExecutionStep != nullptr) {
960         const auto& indexMapping = mExecutionStep->getOutputIndexStepModelToMainModel();
961         NN_RET_CHECK_LE(indexMapping.size(), from.size());
962         for (uint32_t i = 0, e = indexMapping.size(); i < e; i++) {
963             uint32_t toIndex = indexMapping[i];
964             NN_RET_CHECK_GT(to->size(), toIndex);
965             NN_RET_CHECK(isUpdatable(to->at(toIndex).dimensions, from[i].dimensions));
966             (*to)[toIndex] = from[i];
967         }
968     } else {
969         NN_RET_CHECK_EQ(from.size(), to->size());
970         for (uint32_t i = 0, e = from.size(); i < e; i++) {
971             NN_RET_CHECK(isUpdatable(to->at(i).dimensions, from[i].dimensions));
972             (*to)[i] = from[i];
973         }
974     }
975     return true;
976 }
977 
978 StepExecutor::StepExecutor(ExecutionBuilder* executionBuilder, const ModelBuilder* model,
979                            std::shared_ptr<Device> device,
980                            std::shared_ptr<PreparedModel> preparedModel, const ExecutionStep* step)
981     : mExecutionBuilder(executionBuilder),
982       mExecutionStep(step),
983       mModel(model),
984       mDevice(device),
985       mPreparedModel(preparedModel),
986       mInputs(model->inputCount()),
987       mOutputs(model->outputCount()) {
988     CHECK(mDevice != nullptr);
989     VLOG(EXECUTION) << "StepExecutor::StepExecutor with " << mInputs.size() << " inputs and "
990                     << mOutputs.size() << " outputs";
991 }
992 
993 void StepExecutor::mapInputsAndOutputsTrivially() {
994     mInputs = mExecutionBuilder->mInputs;
995     mOutputs = mExecutionBuilder->mOutputs;
996     mMemories = mExecutionBuilder->mMemories;
997 }
998 
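// Copies one input/output binding from the ExecutionBuilder into this executor,
// re-registering any backing Memory so the pool index refers to this executor's mMemories.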
999 void StepExecutor::mapInputOrOutput(const ModelArgumentInfo& builderInputOrOutput,
1000                                     ModelArgumentInfo* executorInputOrOutput) {
1001     *executorInputOrOutput = builderInputOrOutput;
1002     switch (executorInputOrOutput->state()) {
1003         default:
1004             CHECK(false) << "unexpected ModelArgumentInfo::state";
1005             break;
1006         case ModelArgumentInfo::HAS_NO_VALUE:
1007         case ModelArgumentInfo::POINTER:
1008         case ModelArgumentInfo::UNSPECIFIED:
1009             break;
1010         case ModelArgumentInfo::MEMORY: {
1011             const uint32_t builderPoolIndex = builderInputOrOutput.locationAndLength().poolIndex;
1012             const Memory* memory = mExecutionBuilder->mMemories[builderPoolIndex];
1013             const uint32_t executorPoolIndex = mMemories.add(memory);
1014             executorInputOrOutput->locationAndLength().poolIndex = executorPoolIndex;
1015             break;
1016         }
1017     }
1018 }
1019 
1020 int StepExecutor::setInputOrOutputFromMemory(const Operand& inputOrOutputOperand,
1021                                              const Memory* memory, uint32_t offset,
1022                                              ModelArgumentInfo* inputOrOutputInfo) {
1023     // Should be similar to
1024     //     ExecutionBuilder::setInputFromMemory()
1025     //     ExecutionBuilder::setOutputFromMemory()
1026 
1027     uint32_t poolIndex = mMemories.add(memory);
1028     uint32_t length = TypeManager::get()->getSizeOfData(inputOrOutputOperand);
1029     CHECK(inputOrOutputInfo->unspecified());
1030     int n;
1031     std::tie(n, *inputOrOutputInfo) =
1032             ModelArgumentInfo::createFromMemory(inputOrOutputOperand,
1033                                                 /*type=*/nullptr, poolIndex, offset, length);
1034     return n;
1035 }
1036 
1037 static void logArguments(const char* kind, const std::vector<ModelArgumentInfo>& args) {
1038     for (unsigned i = 0; i < args.size(); i++) {
1039         const auto& arg = args[i];
1040         std::string prefix = kind + std::string("[") + std::to_string(i) + "] = ";
1041         switch (arg.state()) {
1042             case ModelArgumentInfo::POINTER:
1043                 VLOG(EXECUTION) << prefix << "POINTER(" << SHOW_IF_DEBUG(arg.buffer()) << ")";
1044                 break;
1045             case ModelArgumentInfo::MEMORY:
1046                 VLOG(EXECUTION) << prefix << "MEMORY("
1047                                 << "pool=" << arg.locationAndLength().poolIndex << ", "
1048                                 << "off=" << arg.locationAndLength().offset << ")";
1049                 break;
1050             case ModelArgumentInfo::HAS_NO_VALUE:
1051                 VLOG(EXECUTION) << prefix << "HAS_NO_VALUE";
1052                 break;
1053             case ModelArgumentInfo::UNSPECIFIED:
1054                 VLOG(EXECUTION) << prefix << "UNSPECIFIED";
1055                 break;
1056             default:
1057                 VLOG(EXECUTION) << prefix << "state(" << arg.state() << ")";
1058                 break;
1059         }
1060     }
1061 }
1062 
1063 bool StepExecutor::isCpu() const {
1064     return mDevice == DeviceManager::getCpuDevice();
1065 }
1066 
1067 static OptionalTimeoutDuration makeTimeoutDuration(uint64_t nanoseconds) {
1068     OptionalTimeoutDuration otd;
1069     otd.nanoseconds(nanoseconds);
1070     return otd;
1071 }
1072 
1073 std::tuple<int, std::vector<OutputShape>, Timing> StepExecutor::compute(
1074         const std::optional<Deadline>& deadline,
1075         const std::shared_ptr<ExecutionBurstController>& burstController) {
1076     return computeWithMemories(deadline, mMemories.getObjects(), burstController);
1077 }
1078 
1079 std::tuple<int, std::vector<OutputShape>, Timing> StepExecutor::computeWithMemories(
1080         const std::optional<Deadline>& deadline, const std::vector<const Memory*>& memories,
1081         const std::shared_ptr<ExecutionBurstController>& burstController) {
1082     CHECK(mPreparedModel != nullptr);
1083 
1084     if (VLOG_IS_ON(EXECUTION)) {
1085         logArguments("input", mInputs);
1086         logArguments("output", mOutputs);
1087     }
1088 
1089     const MeasureTiming measure = measureTiming(mExecutionBuilder);
1090     const OptionalTimeoutDuration loopTimeoutDuration =
1091             makeTimeoutDuration(mExecutionBuilder->getLoopTimeoutDuration());
1092     const auto [n, outputShapes, timing] = mPreparedModel->execute(
1093             mInputs, mOutputs, memories, burstController, measure, deadline, loopTimeoutDuration);
1094     mExecutionBuilder->reportTimingWithoutFencedExecutionCallback(timing);
1095 
1096     return {n, std::move(outputShapes), timing};
1097 }
1098 
1099 std::tuple<int, int, sp<hal::IFencedExecutionCallback>> StepExecutor::computeFenced(
1100         const std::vector<int>& waitFor, uint64_t timeoutDurationAfterFence,
1101         const std::optional<Deadline>& deadline) {
1102     CHECK(mPreparedModel != nullptr);
1103 
1104     if (VLOG_IS_ON(EXECUTION)) {
1105         logArguments("input", mInputs);
1106         logArguments("output", mOutputs);
1107     }
1108 
1109     const MeasureTiming measure = measureTiming(mExecutionBuilder);
1110     const OptionalTimeoutDuration loopTimeoutDuration =
1111             makeTimeoutDuration(mExecutionBuilder->getLoopTimeoutDuration());
1112     OptionalTimeoutDuration optionalTimeoutDurationAfterFence;
1113     if (timeoutDurationAfterFence > 0) {
1114         optionalTimeoutDurationAfterFence.nanoseconds(timeoutDurationAfterFence);
1115     }
1116     const auto [n, syncFence, computeFencedCallback, timing] = mPreparedModel->executeFenced(
1117             mInputs, mOutputs, mMemories.getObjects(), waitFor, measure, deadline,
1118             loopTimeoutDuration, optionalTimeoutDurationAfterFence);
1119     if (syncFence < 0 && computeFencedCallback == nullptr) {
1120         mExecutionBuilder->reportTimingWithoutFencedExecutionCallback(timing);
1121     }
1122     return {n, syncFence, computeFencedCallback};
1123 }
1124 
1125 // For cpuFallback{Partial,Full}, recompile the model on CPU and then start compute.
1126 std::tuple<int, std::vector<OutputShape>, Timing> StepExecutor::computeOnCpuFallback() {
1127     NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "StepExecutor::computeOnCpuFallback");
1128     VLOG(EXECUTION) << "Re-compile the model on CPU";
1129     mDevice = DeviceManager::getCpuDevice();
1130     mPreparedModel = nullptr;
1131     const ModelFactory makeModel = [this] { return mModel->makeHidlModel(); };
1132     // TODO: Propagate user preference and compilation priority to this point instead of using
1133     // default values of ANEURALNETWORKS_PREFER_FAST_SINGLE_ANSWER and
1134     // ANEURALNETWORKS_PRIORITY_MEDIUM
1135     const ExecutionPreference preference =
1136             static_cast<ExecutionPreference>(ANEURALNETWORKS_PREFER_FAST_SINGLE_ANSWER);
1137     const Priority priority = convertToHalPriority(ANEURALNETWORKS_PRIORITY_DEFAULT);
1138     auto [n, preparedModel] = mDevice->prepareModel(makeModel, preference, priority, {}, {}, {});
1139     mPreparedModel = std::move(preparedModel);
1140     if (n != ANEURALNETWORKS_NO_ERROR) {
1141         return {n, {}, kNoTiming};
1142     }
1143 
1144     // Prepare device memories for CPU fallback.
1145     std::vector<const Memory*> memories = mMemories.getObjects();
1146     std::vector<bool> isUsedAsInput(memories.size(), false);
1147     std::vector<bool> isUsedAsOutput(memories.size(), false);
1148     std::vector<std::unique_ptr<Memory>> blobAhwbs;
1149 
1150     // Mark the input and output usages.
1151     for (auto& input : mInputs) {
1152         if (input.state() == ModelArgumentInfo::MEMORY) {
1153             const uint32_t poolIndex = input.locationAndLength().poolIndex;
1154             isUsedAsInput[poolIndex] = true;
1155         }
1156     }
1157     for (auto& output : mOutputs) {
1158         if (output.state() == ModelArgumentInfo::MEMORY) {
1159             const uint32_t poolIndex = output.locationAndLength().poolIndex;
1160             // Cannot allocate output buffers with unknown shapes.
1161             if (mMemories[poolIndex]->getValidator().createdWithUnknownShape()) {
1162                 LOG(ERROR) << "Cannot fallback to CPU because at least one of the output operands "
1163                               "has unknown shape.";
1164                 return {ANEURALNETWORKS_OP_FAILED, {}, kNoTiming};
1165             }
1166             isUsedAsOutput[poolIndex] = true;
1167         }
1168     }
1169 
1170     // Allocate BLOB mode AHardwareBuffers and read the data from input device memories.
1171     for (uint32_t i = 0; i < memories.size(); i++) {
1172         const Memory* memory = mMemories[i];
1173         if (memory->getIBuffer() != nullptr) {
1174             const uint32_t size = memory->getValidator().getMetadata().logicalSize;
1175             auto [nAhwb, blobAhwb] = MemoryRuntimeAHWB::create(size);
1176             if (nAhwb != ANEURALNETWORKS_NO_ERROR) {
1177                 return {nAhwb, {}, kNoTiming};
1178             }
1179             if (isUsedAsInput[i]) {
1180                 n = copyIBufferToHidlMemory(memory->getIBuffer(), blobAhwb->getHidlMemory());
1181                 if (n != ANEURALNETWORKS_NO_ERROR) {
1182                     return {n, {}, kNoTiming};
1183                 }
1184             }
1185             memories[i] = blobAhwb.get();
1186             blobAhwbs.push_back(std::move(blobAhwb));
1187         }
1188     }
1189 
1190     auto [nCompute, outputShapes, timing] = computeWithMemories({}, memories);
1191     if (nCompute != ANEURALNETWORKS_NO_ERROR) {
1192         return {nCompute, std::move(outputShapes), timing};
1193     }
1194 
1195     // Write back to output device memories.
1196     for (uint32_t i = 0; i < memories.size(); i++) {
1197         const Memory* memory = mMemories[i];
1198         if (memory->getIBuffer() != nullptr && isUsedAsOutput[i]) {
1199             n = copyHidlMemoryToIBuffer(memories[i]->getHidlMemory(), memory->getIBuffer(), {});
1200             if (n != ANEURALNETWORKS_NO_ERROR) {
1201                 return {n, {}, kNoTiming};
1202             }
1203         }
1204     }
1205     return {ANEURALNETWORKS_NO_ERROR, std::move(outputShapes), timing};
1206 }
1207 
1208 }  // namespace nn
1209 }  // namespace android
1210