1 /*
2 * Copyright (C) 2017 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #define LOG_TAG "ExecutionBuilder"
18
19 #include "ExecutionBuilder.h"
20
21 #include <algorithm>
22 #include <limits>
23 #include <memory>
24 #include <mutex>
25 #include <optional>
26 #include <string>
27 #include <thread>
28 #include <tuple>
29 #include <utility>
30 #include <vector>
31
32 #include "CompilationBuilder.h"
33 #include "ControlFlow.h"
34 #include "CpuExecutor.h"
35 #include "ExecutionBurstController.h"
36 #include "HalInterfaces.h"
37 #include "Manager.h"
38 #include "ModelArgumentInfo.h"
39 #include "ModelBuilder.h"
40 #include "Tracing.h"
41 #include "TypeManager.h"
42 #include "Utils.h"
43
44 namespace android {
45 namespace nn {
46
47 using namespace hal;
48
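// Sentinel Timing value used when no measurement is available; UINT64_MAX in a field means
// "no time was measured" for that field.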
49 const Timing kNoTiming = {.timeOnDevice = UINT64_MAX, .timeInDriver = UINT64_MAX};
50
51 static MeasureTiming measureTiming(const ExecutionBuilder* execution) {
52 return execution->measureTiming() ? MeasureTiming::YES : MeasureTiming::NO;
53 }
54
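// Checks that a type supplied at execution time (newType) is consistent with the operand
// declared in the model: the new type must validate, the dimension counts must match, and a
// dimension that is fully specified in the model may not be overridden. When newType is null,
// the operand itself must be fully specified unless allowUnspecified is true.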
55 static bool checkDimensionInfo(const Operand& operand, const ANeuralNetworksOperandType* newType,
56 const char* tag, bool allowUnspecified) {
57 if (newType != nullptr) {
58 const Extension::OperandTypeInformation* info = nullptr;
59 if (isExtensionOperandType(operand.type)) {
60 NN_RET_CHECK(TypeManager::get()->getExtensionOperandTypeInfo(operand.type, &info));
61 }
62 if (validateOperandType(*newType, info, tag, allowUnspecified) !=
63 ANEURALNETWORKS_NO_ERROR) {
64 LOG(ERROR) << tag << ": Invalid newType";
65 return false;
66 }
67 if (operand.dimensions.size() == 0) {
68 return true;
69 }
70 if (operand.dimensions.size() != newType->dimensionCount) {
71 LOG(ERROR) << tag << ": Setting with incompatible dimension count";
72 return false;
73 }
74 for (uint32_t i = 0; i < newType->dimensionCount; i++) {
75 if (operand.dimensions[i] != newType->dimensions[i] && operand.dimensions[i] != 0) {
76 LOG(ERROR) << tag << ": Overriding a fully specified dimension is disallowed";
77 return false;
78 }
79 }
80 } else {
81 if (!allowUnspecified && TypeManager::get()->isTensorType(operand.type) &&
82 tensorHasUnspecifiedDimensions(operand)) {
83 LOG(ERROR) << tag << ": Setting with operand type that is not fully specified";
84 return false;
85 }
86 }
87 return true;
88 }
89
90 ExecutionBuilder::ExecutionBuilder(const CompilationBuilder* compilation)
91 : mCompilation(compilation),
92 mModel(compilation->mModel),
93 mPlan(&compilation->mPlan),
94 mPartitioning(compilation->mPartitioning),
95 mInputs(mModel->inputCount()),
96 mOutputs(mModel->outputCount()) {
97 VLOG(EXECUTION) << "ExecutionBuilder::ExecutionBuilder with " << mInputs.size()
98 << " inputs and " << mOutputs.size() << " outputs";
99 }
100
101 const ModelBuilder* ExecutionBuilder::getSourceModel(uint32_t index) const {
102 return mPlan->getSourceModels().getModel(index);
103 }
104
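// An execution is finished either when it completed without a sync fence, or when its sync
// fence has left the ACTIVE state (signaled or errored).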
105 bool ExecutionBuilder::isFinished() const {
106 CHECK(!(mFinishedWithoutSyncFence && hasSyncFence()));
107 if (mFinishedWithoutSyncFence) {
108 return true;
109 }
110 if (hasSyncFence()) {
111 auto r = syncWait(mSyncFenceFd, 0);
112 CHECK(r != FenceState::UNKNOWN);
113 return r != FenceState::ACTIVE;
114 }
115 return false;
116 }
117
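// Reports how a finished execution completed: from the sync fence state when one exists,
// otherwise from the completion recorded by finishWithoutSyncFence().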
118 ExecutionBuilder::Completion ExecutionBuilder::completedWith() const {
119 CHECK(isFinished());
120 if (hasSyncFence()) {
121 auto r = syncWait(mSyncFenceFd, 0);
122 CHECK(r == FenceState::SIGNALED || r == FenceState::ERROR);
123 return (r == FenceState::SIGNALED) ? Completion::NO_ERROR : Completion::OTHER_ERROR;
124 } else {
125 return mCompletionWithoutSyncFence;
126 }
127 }
128
129 int ExecutionBuilder::setInput(uint32_t index, const ANeuralNetworksOperandType* type,
130 const void* buffer, size_t length) {
131 if (mStarted) {
132 LOG(ERROR) << "ANeuralNetworksExecution_setInput called after the "
133 "execution has started.";
134 return ANEURALNETWORKS_BAD_STATE;
135 }
136 uint32_t count = static_cast<uint32_t>(mInputs.size());
137 if (index >= count) {
138 LOG(ERROR) << "ANeuralNetworksExecution_setInput bad index " << index << " " << count;
139 return ANEURALNETWORKS_BAD_DATA;
140 }
141 if (!checkDimensionInfo(mModel->getInputOperand(index), type,
142 "ANeuralNetworksExecution_setInput", buffer == nullptr)) {
143 return ANEURALNETWORKS_BAD_DATA;
144 }
145 if (length > 0xFFFFFFFF) {
146 LOG(ERROR) << "ANeuralNetworksExecution_setInput input exceeds max length " << length;
147 return ANEURALNETWORKS_BAD_DATA;
148 }
149 uint32_t l = static_cast<uint32_t>(length);
150 if (!mInputs[index].unspecified()) {
151 LOG(ERROR) << "ANeuralNetworksExecution_setInput called when an input has already been "
152 "provided";
153 return ANEURALNETWORKS_BAD_STATE;
154 }
155 int n;
156 std::tie(n, mInputs[index]) = ModelArgumentInfo::createFromPointer(
157 mModel->getInputOperand(index), type, const_cast<void*>(buffer), l);
158 return n;
159 }
160
161 int ExecutionBuilder::setInputFromMemory(uint32_t index, const ANeuralNetworksOperandType* type,
162 const Memory* memory, size_t offset, size_t length) {
163 // Should be similar to StepExecutor::setInputOrOutputFromMemory()
164
165 if (mStarted) {
166 LOG(ERROR) << "ANeuralNetworksExecution_setInputFromMemory called after the "
167 "execution has started.";
168 return ANEURALNETWORKS_BAD_STATE;
169 }
170 uint32_t count = static_cast<uint32_t>(mInputs.size());
171 if (index >= count) {
172 LOG(ERROR) << "ANeuralNetworksExecution_setInputFromMemory bad index " << index << " "
173 << count;
174 return ANEURALNETWORKS_BAD_DATA;
175 }
176 if (!checkDimensionInfo(mModel->getInputOperand(index), type,
177 "ANeuralNetworksExecution_setInputFromMemory", false)) {
178 return ANEURALNETWORKS_BAD_DATA;
179 }
180 if (!memory->getValidator().validate(mCompilation, IOType::INPUT, index, type, offset,
181 length)) {
182 return ANEURALNETWORKS_BAD_DATA;
183 }
184 // For some types of memory, e.g. MemoryRuntimeAHWB allocated from ANNMemory_createFromDesc, we
185 // allow the client to specify offset == 0 && length == 0 indicating that the entire memory
186 // region is used. We update the length here because the drivers are still expecting a real
187 // length. For other memories that do not allow this semantic, it is checked in
188 // MemoryValidatorBase::validate before reaching here.
189 if (memory->getHidlMemory().valid() && offset == 0 && length == 0) {
190 length = memory->getHidlMemory().size();
191 }
192 // TODO validate the rest
193 uint32_t poolIndex = mMemories.add(memory);
194 if (!mInputs[index].unspecified()) {
195 LOG(ERROR)
196 << "ANeuralNetworksExecution_setInputFromMemory called when an input has already "
197 "been provided";
198 return ANEURALNETWORKS_BAD_STATE;
199 }
200 int n;
201 std::tie(n, mInputs[index]) = ModelArgumentInfo::createFromMemory(
202 mModel->getInputOperand(index), type, poolIndex, offset, length);
203 return n;
204 }
205
206 int ExecutionBuilder::setOutput(uint32_t index, const ANeuralNetworksOperandType* type,
207 void* buffer, size_t length) {
208 if (mStarted) {
209 LOG(ERROR) << "ANeuralNetworksExecution_setOutput called after the "
210 "execution has started.";
211 return ANEURALNETWORKS_BAD_STATE;
212 }
213 uint32_t count = static_cast<uint32_t>(mOutputs.size());
214 if (index >= count) {
215 LOG(ERROR) << "ANeuralNetworksExecution_setOutput bad index " << index << " " << count;
216 return ANEURALNETWORKS_BAD_DATA;
217 }
218 if (!checkDimensionInfo(mModel->getOutputOperand(index), type,
219 "ANeuralNetworksExecution_setOutput", true)) {
220 return ANEURALNETWORKS_BAD_DATA;
221 }
222 if (length > 0xFFFFFFFF) {
223 LOG(ERROR) << "ANeuralNetworksExecution_setOutput output exceeds max length " << length;
224 return ANEURALNETWORKS_BAD_DATA;
225 }
226 uint32_t l = static_cast<uint32_t>(length);
227 if (!mOutputs[index].unspecified()) {
228 LOG(ERROR) << "ANeuralNetworksExecution_setOutput called when an output has already been "
229 "provided";
230 return ANEURALNETWORKS_BAD_STATE;
231 }
232 int n;
233 std::tie(n, mOutputs[index]) =
234 ModelArgumentInfo::createFromPointer(mModel->getOutputOperand(index), type, buffer, l);
235 return n;
236 }
237
238 int ExecutionBuilder::setOutputFromMemory(uint32_t index, const ANeuralNetworksOperandType* type,
239 const Memory* memory, size_t offset, size_t length) {
240 // Should be similar to StepExecutor::setInputOrOutputFromMemory()
241
242 if (mStarted) {
243 LOG(ERROR) << "ANeuralNetworksExecution_setOutputFromMemory called after the "
244 "execution has started.";
245 return ANEURALNETWORKS_BAD_STATE;
246 }
247 uint32_t count = static_cast<uint32_t>(mOutputs.size());
248 if (index >= count) {
249 LOG(ERROR) << "ANeuralNetworksExecution_setOutputFromMemory bad index " << index << " "
250 << count;
251 return ANEURALNETWORKS_BAD_DATA;
252 }
253 if (!checkDimensionInfo(mModel->getOutputOperand(index), type,
254 "ANeuralNetworksExecution_setOutputFromMemory", true)) {
255 return ANEURALNETWORKS_BAD_DATA;
256 }
257 if (!memory->getValidator().validate(mCompilation, IOType::OUTPUT, index, type, offset,
258 length)) {
259 return ANEURALNETWORKS_BAD_DATA;
260 }
261 // For some types of memory, e.g. MemoryRuntimeAHWB allocated from ANNMemory_createFromDesc, we
262 // allow the client to specify offset == 0 && length == 0 indicating that the entire memory
263 // region is used. We update the length here because the drivers are still expecting a real
264 // length. For other memories that do not allow this semantic, it is checked in
265 // MemoryValidatorBase::validate before reaching here.
266 if (memory->getHidlMemory().valid() && offset == 0 && length == 0) {
267 length = memory->getHidlMemory().size();
268 }
269 // TODO validate the rest
270 uint32_t poolIndex = mMemories.add(memory);
271 if (!mOutputs[index].unspecified()) {
272 LOG(ERROR) << "ANeuralNetworksExecution_setOutputFromMemory called when an output has "
273 "already been provided";
274 return ANEURALNETWORKS_BAD_STATE;
275 }
276 int n;
277 std::tie(n, mOutputs[index]) = ModelArgumentInfo::createFromMemory(
278 mModel->getOutputOperand(index), type, poolIndex, offset, length);
279 return n;
280 }
281
282 int ExecutionBuilder::setMeasureTiming(bool measure) {
283 if (!mCompilation->mExplicitDeviceList || (mCompilation->mDevices.size() != 1)) {
284 LOG(ERROR) << "ANeuralNetworksExecution_setMeasureTiming called on "
285 << "an ANeuralNetworksExecution created from an ANeuralNetworksCompilation "
286 << "that was not created by ANeuralNetworksCompilation_createForDevices "
287 << "with numDevices = 1";
288 return ANEURALNETWORKS_BAD_DATA;
289 }
290 if (mStarted) {
291 LOG(ERROR) << "ANeuralNetworksExecution_setMeasureTiming called after the "
292 "execution has started.";
293 return ANEURALNETWORKS_BAD_STATE;
294 }
295 mMeasureTiming = measure;
296 return ANEURALNETWORKS_NO_ERROR;
297 }
298
299 int ExecutionBuilder::getDuration(int32_t durationCode, uint64_t* duration) const {
300 if (!isFinished()) {
301 LOG(ERROR) << "ANeuralNetworksExecution_getDuration called before the "
302 "execution has finished.";
303 *duration = UINT64_MAX;
304 return ANEURALNETWORKS_BAD_STATE;
305 }
306 if (completedWith() != Completion::NO_ERROR) {
307 LOG(ERROR) << "ANeuralNetworksExecution_getDuration called on an execution "
308 "that has encountered an error.";
309 *duration = UINT64_MAX;
310 return ANEURALNETWORKS_BAD_STATE;
311 }
312
313 // NOTE: At the HAL level, timing is in microseconds. At the NDK level, nanoseconds.
314 const uint64_t kNanoPerMicro = 1000;
315
316 if (!mMeasureTiming) {
317 *duration = UINT64_MAX;
318 return ANEURALNETWORKS_BAD_STATE;
319 }
320
321 Timing timingLaunched = mTimingWithoutFencedExecutionCallback;
322 Timing timingFenced = timingLaunched;
323 if (mFencedExecutionCallback != nullptr) {
324 ErrorStatus status;
325 const Return<void> ret = mFencedExecutionCallback->getExecutionInfo(
326 [&status, &timingLaunched, &timingFenced](ErrorStatus error, Timing tLaunched,
327 Timing tFenced) {
328 status = error;
329 timingLaunched = tLaunched;
330 timingFenced = tFenced;
331 });
332 if (!ret.isOk()) {
333 *duration = UINT64_MAX;
334 return ANEURALNETWORKS_OP_FAILED;
335 }
336 if (status != ErrorStatus::NONE) {
337 *duration = UINT64_MAX;
338 return ANEURALNETWORKS_BAD_STATE;
339 }
340 }
341 uint64_t microDuration = UINT64_MAX;
342 switch (durationCode) {
343 case ANEURALNETWORKS_DURATION_ON_HARDWARE:
344 microDuration = timingLaunched.timeOnDevice;
345 break;
346 case ANEURALNETWORKS_DURATION_IN_DRIVER:
347 microDuration = timingLaunched.timeInDriver;
348 break;
349 case ANEURALNETWORKS_FENCED_DURATION_ON_HARDWARE:
350 microDuration = timingFenced.timeOnDevice;
351 break;
352 case ANEURALNETWORKS_FENCED_DURATION_IN_DRIVER:
353 microDuration = timingFenced.timeInDriver;
354 break;
355 default:
356 CHECK(!"unexpected");
357 }
358 *duration = (microDuration == UINT64_MAX) ? UINT64_MAX : kNanoPerMicro * microDuration;
359
360 VLOG(EXECUTION) << "getDuration(" << durationCode << "): " << *duration;
361 return ANEURALNETWORKS_NO_ERROR;
362 }
363
364 int ExecutionBuilder::setTimeoutDuration(uint64_t duration) {
365 if (!mCompilation->mExplicitDeviceList || (mCompilation->mDevices.size() != 1)) {
366 LOG(ERROR) << "ANeuralNetworksExecution_setTimeout called on an ANeuralNetworksExecution "
367 "created from an ANeuralNetworksCompilation that was not created by "
368 "ANeuralNetworksCompilation_createForDevices with numDevices = 1";
369 return ANEURALNETWORKS_BAD_DATA;
370 }
371 if (mStarted) {
372 LOG(ERROR) << "ANeuralNetworksExecution_setTimeout called after the execution has started.";
373 return ANEURALNETWORKS_BAD_STATE;
374 }
375 if (duration > 0) {
376 mTimeoutDuration = duration;
377 } else {
378 mTimeoutDuration.reset();
379 }
380 return ANEURALNETWORKS_NO_ERROR;
381 }
382
383 std::optional<uint64_t> ExecutionBuilder::getTimeoutDuration() const {
384 return mTimeoutDuration;
385 }
386
387 int ExecutionBuilder::setLoopTimeout(uint64_t duration) {
388 if (mStarted) {
389 LOG(ERROR) << "ANeuralNetworksExecution_setLoopTimeout called after the "
390 "execution has started.";
391 return ANEURALNETWORKS_BAD_STATE;
392 }
393 if (duration > operation_while::kTimeoutNsMaximum) {
394 LOG(WARNING) << "ANeuralNetworksExecution_setLoopTimeout input exceeds the maximum allowed "
395 << "duration: " << duration << " > " << operation_while::kTimeoutNsMaximum;
396 duration = operation_while::kTimeoutNsMaximum;
397 }
398 mLoopTimeoutDuration = duration;
399 return ANEURALNETWORKS_NO_ERROR;
400 }
401
402 int ExecutionBuilder::getOutputOperandDimensions(uint32_t index, uint32_t* dimensions) {
403 if (!isFinished()) {
404 LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandDimensions called before the "
405 "execution has finished.";
406 return ANEURALNETWORKS_BAD_STATE;
407 }
408 if (completedWith() == Completion::OTHER_ERROR) {
409 LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandDimensions called on an execution "
410 "that has encountered an error.";
411 return ANEURALNETWORKS_BAD_STATE;
412 }
413
414 uint32_t count = static_cast<uint32_t>(mOutputs.size());
415 if (index >= count) {
416 LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandDimensions bad index " << index
417 << " " << count;
418 return ANEURALNETWORKS_BAD_DATA;
419 }
420 const auto& dims = mOutputs[index].dimensions();
421 if (dims.empty()) {
422 LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandDimensions can not query "
423 "dimensions of a scalar";
424 return ANEURALNETWORKS_BAD_DATA;
425 }
426 std::copy(dims.begin(), dims.end(), dimensions);
427 return mOutputs[index].isSufficient() ? ANEURALNETWORKS_NO_ERROR
428 : ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE;
429 }
430
431 int ExecutionBuilder::getOutputOperandRank(uint32_t index, uint32_t* rank) {
432 if (!isFinished()) {
433 LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandRank called before the "
434 "execution has finished.";
435 return ANEURALNETWORKS_BAD_STATE;
436 }
437 if (completedWith() == Completion::OTHER_ERROR) {
438 LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandRank called on an execution "
439 "that has encountered an error.";
440 return ANEURALNETWORKS_BAD_STATE;
441 }
442 uint32_t count = static_cast<uint32_t>(mOutputs.size());
443 if (index >= count) {
444 LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandRank bad index " << index << " "
445 << count;
446 return ANEURALNETWORKS_BAD_DATA;
447 }
448 *rank = static_cast<uint32_t>(mOutputs[index].dimensions().size());
449 return mOutputs[index].isSufficient() ? ANEURALNETWORKS_NO_ERROR
450 : ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE;
451 }
452
453 // Attempt synchronous execution of full model on CPU.
454 // TODO: How should we handle timing in this case?
455 // For Q this is irrelevant: We only support timing in conjunction
456 // with an explicit device list; and we do not support CPU fallback
457 // with an explicit device list. See CompilationBuilder::mExplicitDeviceList.
458 static std::tuple<int, std::vector<OutputShape>, Timing> cpuFallbackFull(
459 ExecutionBuilder* executionBuilder) {
460 CHECK(executionBuilder != nullptr);
461 NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "cpuFallbackFull");
462 VLOG(EXECUTION) << "cpuFallbackFull";
463
464 // Get fallback executor.
465 StepExecutor executor(executionBuilder, executionBuilder->getModel(),
466 DeviceManager::getCpuDevice(), /*preparedModel=*/nullptr);
467 executor.mapInputsAndOutputsTrivially();
468
469 // Attempt fallback execution.
470 return executor.computeOnCpuFallback();
471 }
472
473 // Attempt synchronous execution on CPU.
474 // TODO: How should we handle timing in this case?
475 // For Q this is irrelevant: We only support timing in conjunction
476 // with an explicit device list; and we do not support CPU fallback
477 // with an explicit device list. See CompilationBuilder::mExplicitDeviceList.
478 static std::tuple<int, std::vector<OutputShape>, Timing, std::shared_ptr<StepExecutor>>
479 cpuFallbackPartial(const ExecutionPlan& plan,
480 std::shared_ptr<ExecutionPlan::Controller> controller) {
481 NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "cpuFallbackPartial");
482 VLOG(EXECUTION) << "cpuFallbackPartial";
483
484 // Get fallback executor.
485 std::shared_ptr<StepExecutor> executor;
486 int n1 = plan.fallback(controller, &executor);
487 if (n1 != ANEURALNETWORKS_NO_ERROR) {
488 return {n1, {}, kNoTiming, nullptr};
489 }
490 CHECK(executor != nullptr);
491
492 // Attempt fallback execution.
493 auto [n2, outputShapes, timing] = executor->computeOnCpuFallback();
494 return {n2, std::move(outputShapes), timing, executor};
495 }
496
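// Walks the execution plan one step at a time, running each step on its assigned device.
// When a step fails with a recoverable error and fallback is allowed, a partial CPU fallback
// is attempted; if that also fails and the plan is not simple, the loop exits and a full CPU
// fallback runs.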
497 static void asyncStartComputePartitioned(ExecutionBuilder* executionBuilder,
498 const ExecutionPlan& plan,
499 std::shared_ptr<ExecutionPlan::Controller> controller,
500 bool allowFallback,
501 const std::optional<Deadline>& deadline,
502 const sp<ExecutionCallback>& executionCallback) {
503 CHECK(executionBuilder != nullptr);
504 VLOG(EXECUTION) << "ExecutionBuilder::compute (from plan, iteratively)";
505
506 std::vector<OutputShape> outputShapes = executionBuilder->getInitialOutputShapes();
507 Timing timing = kNoTiming;
508 // Disallow fallback when the ExecutionPlan is simple on CPU.
509 allowFallback &= !plan.isSimpleCpu();
510
511 while (true) {
512 VLOG(EXECUTION) << "looking for next StepExecutor";
513
514 // Get the current step of the execution.
515 std::shared_ptr<StepExecutor> executor;
516 std::shared_ptr<ExecutionBurstController> burstController;
517 int n = plan.next(controller, &executor, &burstController);
518 if (n != ANEURALNETWORKS_NO_ERROR) {
519 // During the interpreted execution of control flow, a loop timeout
520 // might occur in ExecutionPlan::next().
521 bool missedDeadline = n == ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT ||
522 n == ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT;
523 if (allowFallback && !missedDeadline) break;
524 executionCallback->notify(convertResultCodeToErrorStatus(n), {}, kNoTiming);
525 return;
526 }
527
528 // If the code reached the end of the plan without error, then return
529 // with no error.
530 if (executor == nullptr) {
531 executionCallback->notify(ErrorStatus::NONE, outputShapes, timing);
532 return;
533 }
534 const bool executorIsCpu = executor->isCpu();
535
536 // Attempt to execute a single step of the execution.
537 auto [stepN, stepOutputShapes, stepTiming] = executor->compute(deadline, burstController);
538
539 // Update global outputs.
540 if (!executor->updateOutputShapes(stepOutputShapes, &outputShapes)) {
541 stepN = ANEURALNETWORKS_OP_FAILED;
542 }
543
544 // If execution was successful, continue to next step.
545 if (stepN == ANEURALNETWORKS_NO_ERROR) {
546 // We only support collection of timing information in the case of a
547 // single step, so it's safe to just keep track of the last step's
548 // timing information.
549 timing = stepTiming;
550 continue;
551 }
552
553 // OUTPUT_INSUFFICIENT_SIZE is not recoverable, so end execution.
554 if (stepN == ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE) {
555 const ErrorStatus stepStatus = convertResultCodeToErrorStatus(stepN);
556 executionCallback->notify(stepStatus, outputShapes, kNoTiming);
557 return;
558 }
559
560 // If fallback is not allowed and there was an error, end execution.
561 if (!allowFallback) {
562 const ErrorStatus stepStatus = convertResultCodeToErrorStatus(stepN);
563 executionCallback->notify(stepStatus, {}, kNoTiming);
564 return;
565 }
566
567 // If CPU execution was already attempted, either:
568 // (1) perform a full fallback if the plan is not simple, or
569 // (2) return from the function with an error
570 if (executorIsCpu) {
571 if (!plan.isSimple()) break;
572 executionCallback->notify(convertResultCodeToErrorStatus(stepN), {}, kNoTiming);
573 return;
574 }
575
576 // If the code reaches this point, attempt a partial fallback to CPU.
577 CHECK(allowFallback);
578 auto [fallbackN, fallbackOutputShapes, fallbackTiming, fallbackExecutor] =
579 cpuFallbackPartial(plan, controller);
580
581 // Update global outputs.
582 if (fallbackExecutor != nullptr &&
583 !fallbackExecutor->updateOutputShapes(fallbackOutputShapes, &outputShapes)) {
584 fallbackN = ANEURALNETWORKS_OP_FAILED;
585 }
586
587 // If execution was successful, continue to next step.
588 if (fallbackN == ANEURALNETWORKS_NO_ERROR) {
589 // We only support collection of timing information in the case of a
590 // single step, so it's safe to just keep track of the last step's
591 // timing information.
592 timing = fallbackTiming;
593 continue;
594 }
595
596 // OUTPUT_INSUFFICIENT_SIZE is not recoverable, so end execution.
597 if (fallbackN == ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE) {
598 const ErrorStatus fallbackStatus = convertResultCodeToErrorStatus(fallbackN);
599 executionCallback->notify(fallbackStatus, outputShapes, kNoTiming);
600 return;
601 }
602
603 // Do not fallback twice if the ExecutionPlan is simple.
604 if (plan.isSimple()) {
605 const ErrorStatus fallbackStatus = convertResultCodeToErrorStatus(fallbackN);
606 executionCallback->notify(fallbackStatus, {}, kNoTiming);
607 return;
608 }
609
610 // If the code reaches this point, then there was an error with the
611 // fallback. In this case, attempt full fallback.
612 break;
613 }
614
615 // If the code has reached this point, a potentially recoverable error
616 // occurred during the step executions. Instead, do a full execution
617 // fallback on the CPU.
618 auto [fullN, fullOutputShapes, fullTiming] = cpuFallbackFull(executionBuilder);
619 const ErrorStatus fullStatus = convertResultCodeToErrorStatus(fullN);
620 executionCallback->notify(fullStatus, fullOutputShapes, fullTiming);
621 }
622
623 // In the case of partitioned execution, startComputeFenced returns the sync fence and the
624 // fenced compute callback produced by the last partition.
625 // Any failed partition causes the whole execution to fall back to the CPU when allowFallback
626 // is set to true.
627 static std::tuple<int, int, sp<hal::IFencedExecutionCallback>> startComputeFenced(
628 ExecutionBuilder* executionBuilder, const ExecutionPlan& plan,
629 std::shared_ptr<ExecutionPlan::Controller> controller, const std::vector<int>& waitFor,
630 uint64_t timeoutDurationAfterFence, const std::optional<Deadline>& deadline,
631 bool allowFallback) {
632 CHECK(executionBuilder != nullptr);
633 VLOG(EXECUTION) << "ExecutionBuilder::computeFenced (from plan, iteratively)";
634 // Disallow fallback when the ExecutionPlan is simple on CPU.
635 allowFallback &= !plan.isSimpleCpu();
636
637 // Initialize waitForFds and syncFence for the first step.
638 std::vector<int> waitForFds = waitFor;
639 int syncFence = -1;
640 sp<hal::IFencedExecutionCallback> computeFencedCallback;
641
642 while (true) {
643 VLOG(EXECUTION) << "looking for next StepExecutor";
644
645 // Get the current step of the execution.
646 std::shared_ptr<StepExecutor> executor;
647 int n = plan.next(controller, &executor, nullptr, syncFence);
648 if (n != ANEURALNETWORKS_NO_ERROR) {
649 // During the interpreted execution of control flow, a loop timeout
650 // might occur in ExecutionPlan::next().
651 bool missedDeadline = n == ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT ||
652 n == ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT;
653 if (allowFallback && !missedDeadline) break;
654 // Return -1 for the sync fence fd, and nullptr for the callback.
655 return std::make_tuple(n, -1, nullptr);
656 }
657
658 // If the code reached the end of the plan without error, then return
659 // with no error.
660 if (executor == nullptr) {
661 // If the final step returns a -1 for sync fence, the execution is finished.
662 // Update the output shapes.
663 if (syncFence == -1) {
664 // TODO(miaowang): support dynamic output shape only with memory domain.
665 // For now just return the initial output shapes.
666 executionBuilder->finishWithoutSyncFence(
667 ErrorStatus::NONE, executionBuilder->getInitialOutputShapes());
668 }
669 return std::make_tuple(ANEURALNETWORKS_NO_ERROR, syncFence, computeFencedCallback);
670 }
671 const bool executorIsCpu = executor->isCpu();
672
673 // Attempt to execute a single step of the execution.
674 auto [stepN, syncFd, callback] =
675 executor->computeFenced(waitForFds, timeoutDurationAfterFence, deadline);
676
677 // Update waitForFds, syncFence for the next step.
678 syncFence = syncFd;
679 computeFencedCallback = callback;
680 waitForFds.clear();
681 if (syncFd > 0) {
682 waitForFds = {syncFd};
683 }
684
685 // If execution was successful, continue to next step.
686 if (stepN == ANEURALNETWORKS_NO_ERROR) {
687 continue;
688 }
689 // If fallback is not allowed and there was an error, end execution.
690 if (!allowFallback) {
691 return std::make_tuple(stepN, -1, nullptr);
692 }
693
694 // If CPU execution was already attempted, either:
695 // (1) perform a full fallback if the plan is not simple, or
696 // (2) return from the function with an error
697 if (executorIsCpu) {
698 if (!plan.isSimple()) break;
699 return std::make_tuple(stepN, -1, nullptr);
700 }
701 // If the code reaches this point, then there was an error executing the step on the
702 // device. In this case, attempt a full CPU fallback.
703 break;
704 }
705
706 // If the code has reached this point, a potentially recoverable error
707 // occurred during the step executions. Instead, do a full execution
708 // fallback on the CPU.
709 VLOG(EXECUTION) << "Performing full fallback on the CPU.";
710 for (int syncFd : waitFor) {
711 if (syncFd > 0) {
712 auto r = syncWait(syncFd, -1);
713 if (r != FenceState::SIGNALED) {
714 VLOG(EXECUTION) << "syncWait failed, fd: " << syncFd;
715 return std::make_tuple(ANEURALNETWORKS_OP_FAILED, -1, nullptr);
716 }
717 }
718 }
719 auto [fullN, fullOutputShapes, fullTiming] = cpuFallbackFull(executionBuilder);
720 const ErrorStatus fullStatus = convertResultCodeToErrorStatus(fullN);
721 syncFence = -1;
722 executionBuilder->finishWithoutSyncFence(fullStatus, fullOutputShapes);
723 executionBuilder->reportTimingWithoutFencedExecutionCallback(fullTiming);
724 return std::make_tuple(fullN, syncFence, nullptr);
725 }
726
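// Entry point for ANeuralNetworksExecution_startComputeWithDependencies: validates that all
// inputs and outputs have been specified, marks the execution as started, and launches the
// fenced computation over the plan, returning the resulting sync fence fd through *syncFence.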
727 int ExecutionBuilder::computeFenced(const std::vector<int>& waitFor,
728 uint64_t timeoutDurationAfterFence, int* syncFence) {
729 CHECK(syncFence != nullptr);
730 if (mStarted) {
731 LOG(ERROR) << "ANeuralNetworksExecution_startComputeWithDependencies"
732 " called on an execution that has already started";
733 return ANEURALNETWORKS_BAD_STATE;
734 }
735 if (timeoutDurationAfterFence > 0) {
736 if (!mCompilation->mExplicitDeviceList || (mCompilation->mDevices.size() != 1)) {
737 LOG(ERROR)
738 << "ANeuralNetworksExecution_startComputeWithDependencies called with non-zero "
739 "duration on an ANeuralNetworksExecution "
740 "created from an ANeuralNetworksCompilation that was not created by "
741 "ANeuralNetworksCompilation_createForDevices with numDevices = 1";
742 return ANEURALNETWORKS_BAD_DATA;
743 }
744 }
745 const auto deadline = makeDeadline(mTimeoutDuration);
746 for (auto& p : mInputs) {
747 if (p.state() == ModelArgumentInfo::UNSPECIFIED) {
748 LOG(ERROR) << "ANeuralNetworksExecution_startComputeWithDependencies"
749 " not all inputs specified";
750 return ANEURALNETWORKS_BAD_DATA;
751 }
752 }
753 for (auto& p : mOutputs) {
754 if (p.state() == ModelArgumentInfo::UNSPECIFIED) {
755 LOG(ERROR) << "ANeuralNetworksExecution_startComputeWithDependencies"
756 " not all outputs specified";
757 return ANEURALNETWORKS_BAD_DATA;
758 }
759 }
760 for (uint32_t i = 0; i < mOutputs.size(); i++) {
761 if (mOutputs[i].state() != ModelArgumentInfo::HAS_NO_VALUE &&
762 !checkDimensionInfo(mModel->getOutputOperand(i), nullptr,
763 "ANeuralNetworksExecution_startComputeWithDependencies", false)) {
764 LOG(ERROR) << "ANeuralNetworksExecution_startComputeWithDependencies"
765 " not all outputs have fully specified dimensions";
766 return ANEURALNETWORKS_BAD_DATA;
767 }
768 }
769 mStarted = true;
770 const bool allowFallback = DeviceManager::partitioningAllowsFallback(mPartitioning);
771 std::shared_ptr<ExecutionPlan::Controller> controller = mPlan->makeController(this, nullptr);
772 VLOG(EXECUTION) << "ExecutionBuilder::computeFenced";
773 int result;
774 std::tie(result, mSyncFenceFd, mFencedExecutionCallback) = startComputeFenced(
775 this, *mPlan, controller, waitFor, timeoutDurationAfterFence, deadline, allowFallback);
776 *syncFence = mSyncFenceFd;
777 return result;
778 }
779
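// Common entry point for the synchronous, asynchronous, and burst compute paths. At most one
// of synchronizationCallback and burstBuilder may be non-null: a non-null
// synchronizationCallback selects the asynchronous path, a non-null burstBuilder selects burst
// execution, and the computation runs on the calling thread when synchronizationCallback is
// null (see the name() lambda below, which mirrors the corresponding NDK calls).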
780 int ExecutionBuilder::compute(sp<ExecutionCallback>* synchronizationCallback,
781 BurstBuilder* burstBuilder) {
782 CHECK(synchronizationCallback == nullptr || burstBuilder == nullptr)
783 << "synchronizationCallback and burstBuilder cannot simultaneously be used";
784
785 const bool synchronous = (synchronizationCallback == nullptr);
786 if (!synchronous) {
787 *synchronizationCallback = nullptr;
788 }
789
790 const auto deadline = makeDeadline(mTimeoutDuration);
791
792 // TODO validate that we have full types for all inputs and outputs,
793 // that the graph is not cyclic,
794
795 auto name = [synchronous, burstBuilder] {
796 return burstBuilder ? "burstCompute" : synchronous ? "compute" : "startCompute";
797 };
798 if (mStarted) {
799 LOG(ERROR) << "ANeuralNetworksExecution_" << name()
800 << " called on an execution that has already started";
801 return ANEURALNETWORKS_BAD_STATE;
802 }
803 for (auto& p : mInputs) {
804 if (p.state() == ModelArgumentInfo::UNSPECIFIED) {
805 LOG(ERROR) << "ANeuralNetworksExecution_" << name() << " not all inputs specified";
806 return ANEURALNETWORKS_BAD_DATA;
807 } else if (p.state() == ModelArgumentInfo::MEMORY) {
808 const Memory* memory = mMemories[p.locationAndLength().poolIndex];
809 if (!memory->getValidator().validateInputDimensions(p.dimensions())) {
810 return ANEURALNETWORKS_OP_FAILED;
811 }
812 }
813 }
814 for (auto& p : mOutputs) {
815 if (p.state() == ModelArgumentInfo::UNSPECIFIED) {
816 LOG(ERROR) << "ANeuralNetworksExecution_" << name() << " not all outputs specified";
817 return ANEURALNETWORKS_BAD_DATA;
818 }
819 }
820
821 auto wrappedFinish = [this](ErrorStatus error, const std::vector<OutputShape>& outputShapes) {
822 return finishWithoutSyncFence(error, outputShapes);
823 };
824
825 // TODO: For asynchronous execution, entire plan-based-path should run in an
826 // asynchronous thread -- take the asynchronous thread logic out of
827 // CpuPreparedModel::execute() and use it to wrap the plan-based-path.
828 mStarted = true;
829 const bool allowFallback = DeviceManager::partitioningAllowsFallback(mPartitioning);
830 std::shared_ptr<ExecutionPlan::Controller> controller =
831 mPlan->makeController(this, burstBuilder);
832 if (synchronous) {
833 VLOG(EXECUTION) << "ExecutionBuilder::compute (synchronous API)";
834 sp<ExecutionCallback> localSynchronizationCallback = new ExecutionCallback();
835 localSynchronizationCallback->setOnFinish(wrappedFinish);
836 asyncStartComputePartitioned(this, *mPlan, controller, allowFallback, deadline,
837 localSynchronizationCallback);
838 localSynchronizationCallback->wait();
839 if (mMeasureTiming) {
840 mTimingWithoutFencedExecutionCallback = localSynchronizationCallback->getTiming();
841 }
842 return convertErrorStatusToResultCode(localSynchronizationCallback->getStatus());
843 } else /* asynchronous */ {
844 // TODO: use a thread pool
845 // TODO(mikie): this could have NNTRACE so we could measure the overhead
846 // of spinning up a new thread.
847
848 // Prepare the callback for asynchronous execution.
849 // sp<ExecutionCallback> object is returned when the
850 // execution has been successfully launched, otherwise a
851 // nullptr is returned. The executionCallback is
852 // abstracted in the NN API as an "event".
853 sp<ExecutionCallback> executionCallback = new ExecutionCallback();
854 executionCallback->setOnFinish(wrappedFinish);
855 if (DeviceManager::get()->syncExecRuntime()) {
856 VLOG(EXECUTION) << "ExecutionBuilder::compute (asynchronous API, non-threaded)";
857 asyncStartComputePartitioned(this, *mPlan, controller, allowFallback, deadline,
858 executionCallback);
859 } else {
860 VLOG(EXECUTION) << "ExecutionBuilder::compute (asynchronous API)";
861 std::thread asyncExecution(
862 [this, controller, allowFallback, deadline, executionCallback] {
863 asyncStartComputePartitioned(this, *mPlan, controller, allowFallback,
864 deadline, executionCallback);
865 });
866 executionCallback->bindThread(std::move(asyncExecution));
867 }
868 *synchronizationCallback = executionCallback;
869 return ANEURALNETWORKS_NO_ERROR;
870 }
871 }
872
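// Builds the initial OutputShape vector from the output arguments: each entry carries the
// dimensions currently associated with the output (empty for HAS_NO_VALUE outputs) and is
// marked as sufficient.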
873 std::vector<OutputShape> ExecutionBuilder::getInitialOutputShapes() const {
874 std::vector<OutputShape> outputShapes(mOutputs.size());
875 std::transform(mOutputs.begin(), mOutputs.end(), outputShapes.begin(),
876 [](const auto& x) -> OutputShape {
877 hidl_vec<uint32_t> dimensions;
878 if (x.state() != ModelArgumentInfo::HAS_NO_VALUE) {
879 dimensions = x.dimensions();
880 }
881 return {.dimensions = std::move(dimensions), .isSufficient = true};
882 });
883 return outputShapes;
884 }
885
886 // Check whether the dimensions "to" can be updated by the dimensions "from", where "from"
887 // must have a higher specification level.
888 static bool isUpdatable(const std::vector<uint32_t>& to, const std::vector<uint32_t>& from) {
889 if (to.size() == 0) return true;
890 NN_RET_CHECK_EQ(to.size(), from.size());
891 for (uint32_t i = 0; i < to.size(); i++) {
892 NN_RET_CHECK(to[i] == from[i] || to[i] == 0);
893 }
894 return true;
895 }
896
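// Merges the output shapes reported by an execution into mOutputs, verifying that only
// unspecified dimensions or ranks are overwritten and that the resulting data sizes do not
// overflow uint32_t.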
897 bool ExecutionBuilder::updateOutputShapes(const std::vector<OutputShape>& outputShapes) {
898 if (outputShapes.size() == 0) {
899 return true;
900 }
901 NN_RET_CHECK_EQ(outputShapes.size(), mOutputs.size());
902 for (uint32_t i = 0; i < outputShapes.size(); i++) {
903 // Check if only unspecified dimensions or rank are overwritten.
904 NN_RET_CHECK(isUpdatable(mOutputs[i].dimensions(), outputShapes[i].dimensions));
905 const OperandType operandType = mModel->getOutputOperand(i).type;
906 NN_RET_CHECK(!TypeManager::get()->sizeOfDataOverflowsUInt32(operandType,
907 outputShapes[i].dimensions));
908 }
909 for (uint32_t i = 0; i < outputShapes.size(); i++) {
910 mOutputs[i].dimensions() = outputShapes[i].dimensions;
911 mOutputs[i].isSufficient() = outputShapes[i].isSufficient;
912 }
913 return true;
914 }
915
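// Propagates the (possibly updated) output dimensions into the metadata of device memories
// used as outputs.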
916 bool ExecutionBuilder::updateMemories() {
917 for (const auto& output : mOutputs) {
918 if (output.state() != ModelArgumentInfo::MEMORY) continue;
919 const Memory* memory = mMemories[output.locationAndLength().poolIndex];
920 NN_RET_CHECK(memory->getValidator().updateMetadata({.dimensions = output.dimensions()}));
921 }
922 return true;
923 }
924
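// Records the result of an execution that did not produce a sync fence: updates output shapes
// and memory metadata, marks output memories as initialized on success, and latches the
// completion state returned by completedWith().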
925 ErrorStatus ExecutionBuilder::finishWithoutSyncFence(ErrorStatus status,
926 const std::vector<OutputShape>& outputShapes) {
927 CHECK(!mFinishedWithoutSyncFence) << "ExecutionBuilder::finishWithoutSyncFence is called twice";
928 CHECK(!hasSyncFence())
929 << "ExecutionBuilder::finishWithoutSyncFence is called when hasSyncFence()";
930 if (!updateOutputShapes(outputShapes) || !updateMemories()) {
931 status = ErrorStatus::GENERAL_FAILURE;
932 }
933 bool success = status == ErrorStatus::NONE;
934 for (const auto& output : mOutputs) {
935 if (output.state() != ModelArgumentInfo::MEMORY) continue;
936 const Memory* memory = mMemories[output.locationAndLength().poolIndex];
937 memory->getValidator().setInitialized(success);
938 }
939 switch (convertErrorStatusToResultCode(status)) {
940 case ANEURALNETWORKS_NO_ERROR:
941 mCompletionWithoutSyncFence = Completion::NO_ERROR;
942 break;
943 case ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE:
944 mCompletionWithoutSyncFence = Completion::OUTPUT_INSUFFICIENT_SIZE;
945 break;
946 default:
947 mCompletionWithoutSyncFence = Completion::OTHER_ERROR;
948 break;
949 }
950 mFinishedWithoutSyncFence = true;
951 return status;
952 }
953
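// Merges the output shapes of a single step into the output shapes of the main model, using
// the step-to-main output index mapping when this executor was created for an ExecutionStep.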
954 bool StepExecutor::updateOutputShapes(const std::vector<OutputShape>& from,
955 std::vector<OutputShape>* to) {
956 if (from.size() == 0) {
957 return true;
958 }
959 if (mExecutionStep != nullptr) {
960 const auto& indexMapping = mExecutionStep->getOutputIndexStepModelToMainModel();
961 NN_RET_CHECK_LE(indexMapping.size(), from.size());
962 for (uint32_t i = 0, e = indexMapping.size(); i < e; i++) {
963 uint32_t toIndex = indexMapping[i];
964 NN_RET_CHECK_GT(to->size(), toIndex);
965 NN_RET_CHECK(isUpdatable(to->at(toIndex).dimensions, from[i].dimensions));
966 (*to)[toIndex] = from[i];
967 }
968 } else {
969 NN_RET_CHECK_EQ(from.size(), to->size());
970 for (uint32_t i = 0, e = from.size(); i < e; i++) {
971 NN_RET_CHECK(isUpdatable(to->at(i).dimensions, from[i].dimensions));
972 (*to)[i] = from[i];
973 }
974 }
975 return true;
976 }
977
978 StepExecutor::StepExecutor(ExecutionBuilder* executionBuilder, const ModelBuilder* model,
979 std::shared_ptr<Device> device,
980 std::shared_ptr<PreparedModel> preparedModel, const ExecutionStep* step)
981 : mExecutionBuilder(executionBuilder),
982 mExecutionStep(step),
983 mModel(model),
984 mDevice(device),
985 mPreparedModel(preparedModel),
986 mInputs(model->inputCount()),
987 mOutputs(model->outputCount()) {
988 CHECK(mDevice != nullptr);
989 VLOG(EXECUTION) << "StepExecutor::StepExecutor with " << mInputs.size() << " inputs and "
990 << mOutputs.size() << " outputs";
991 }
992
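// Reuses the ExecutionBuilder's inputs, outputs, and memory pools unchanged; used when this
// executor runs the full model, e.g. for the full CPU fallback.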
993 void StepExecutor::mapInputsAndOutputsTrivially() {
994 mInputs = mExecutionBuilder->mInputs;
995 mOutputs = mExecutionBuilder->mOutputs;
996 mMemories = mExecutionBuilder->mMemories;
997 }
998
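// Copies an argument from the ExecutionBuilder into this executor, remapping its memory pool
// index from the builder's memory list to this executor's memory list.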
999 void StepExecutor::mapInputOrOutput(const ModelArgumentInfo& builderInputOrOutput,
1000 ModelArgumentInfo* executorInputOrOutput) {
1001 *executorInputOrOutput = builderInputOrOutput;
1002 switch (executorInputOrOutput->state()) {
1003 default:
1004 CHECK(false) << "unexpected ModelArgumentInfo::state";
1005 break;
1006 case ModelArgumentInfo::HAS_NO_VALUE:
1007 case ModelArgumentInfo::POINTER:
1008 case ModelArgumentInfo::UNSPECIFIED:
1009 break;
1010 case ModelArgumentInfo::MEMORY: {
1011 const uint32_t builderPoolIndex = builderInputOrOutput.locationAndLength().poolIndex;
1012 const Memory* memory = mExecutionBuilder->mMemories[builderPoolIndex];
1013 const uint32_t executorPoolIndex = mMemories.add(memory);
1014 executorInputOrOutput->locationAndLength().poolIndex = executorPoolIndex;
1015 break;
1016 }
1017 }
1018 }
1019
1020 int StepExecutor::setInputOrOutputFromMemory(const Operand& inputOrOutputOperand,
1021 const Memory* memory, uint32_t offset,
1022 ModelArgumentInfo* inputOrOutputInfo) {
1023 // Should be similar to
1024 // ExecutionBuilder::setInputFromMemory()
1025 // ExecutionBuilder::setOutputFromMemory()
1026
1027 uint32_t poolIndex = mMemories.add(memory);
1028 uint32_t length = TypeManager::get()->getSizeOfData(inputOrOutputOperand);
1029 CHECK(inputOrOutputInfo->unspecified());
1030 int n;
1031 std::tie(n, *inputOrOutputInfo) =
1032 ModelArgumentInfo::createFromMemory(inputOrOutputOperand,
1033 /*type=*/nullptr, poolIndex, offset, length);
1034 return n;
1035 }
1036
1037 static void logArguments(const char* kind, const std::vector<ModelArgumentInfo>& args) {
1038 for (unsigned i = 0; i < args.size(); i++) {
1039 const auto& arg = args[i];
1040 std::string prefix = kind + std::string("[") + std::to_string(i) + "] = ";
1041 switch (arg.state()) {
1042 case ModelArgumentInfo::POINTER:
1043 VLOG(EXECUTION) << prefix << "POINTER(" << SHOW_IF_DEBUG(arg.buffer()) << ")";
1044 break;
1045 case ModelArgumentInfo::MEMORY:
1046 VLOG(EXECUTION) << prefix << "MEMORY("
1047 << "pool=" << arg.locationAndLength().poolIndex << ", "
1048 << "off=" << arg.locationAndLength().offset << ")";
1049 break;
1050 case ModelArgumentInfo::HAS_NO_VALUE:
1051 VLOG(EXECUTION) << prefix << "HAS_NO_VALUE";
1052 break;
1053 case ModelArgumentInfo::UNSPECIFIED:
1054 VLOG(EXECUTION) << prefix << "UNSPECIFIED";
1055 break;
1056 default:
1057 VLOG(EXECUTION) << prefix << "state(" << arg.state() << ")";
1058 break;
1059 }
1060 }
1061 }
1062
1063 bool StepExecutor::isCpu() const {
1064 return mDevice == DeviceManager::getCpuDevice();
1065 }
1066
1067 static OptionalTimeoutDuration makeTimeoutDuration(uint64_t nanoseconds) {
1068 OptionalTimeoutDuration otd;
1069 otd.nanoseconds(nanoseconds);
1070 return otd;
1071 }
1072
1073 std::tuple<int, std::vector<OutputShape>, Timing> StepExecutor::compute(
1074 const std::optional<Deadline>& deadline,
1075 const std::shared_ptr<ExecutionBurstController>& burstController) {
1076 return computeWithMemories(deadline, mMemories.getObjects(), burstController);
1077 }
1078
1079 std::tuple<int, std::vector<OutputShape>, Timing> StepExecutor::computeWithMemories(
1080 const std::optional<Deadline>& deadline, const std::vector<const Memory*>& memories,
1081 const std::shared_ptr<ExecutionBurstController>& burstController) {
1082 CHECK(mPreparedModel != nullptr);
1083
1084 if (VLOG_IS_ON(EXECUTION)) {
1085 logArguments("input", mInputs);
1086 logArguments("output", mOutputs);
1087 }
1088
1089 const MeasureTiming measure = measureTiming(mExecutionBuilder);
1090 const OptionalTimeoutDuration loopTimeoutDuration =
1091 makeTimeoutDuration(mExecutionBuilder->getLoopTimeoutDuration());
1092 const auto [n, outputShapes, timing] = mPreparedModel->execute(
1093 mInputs, mOutputs, memories, burstController, measure, deadline, loopTimeoutDuration);
1094 mExecutionBuilder->reportTimingWithoutFencedExecutionCallback(timing);
1095
1096 return {n, std::move(outputShapes), timing};
1097 }
1098
1099 std::tuple<int, int, sp<hal::IFencedExecutionCallback>> StepExecutor::computeFenced(
1100 const std::vector<int>& waitFor, uint64_t timeoutDurationAfterFence,
1101 const std::optional<Deadline>& deadline) {
1102 CHECK(mPreparedModel != nullptr);
1103
1104 if (VLOG_IS_ON(EXECUTION)) {
1105 logArguments("input", mInputs);
1106 logArguments("output", mOutputs);
1107 }
1108
1109 const MeasureTiming measure = measureTiming(mExecutionBuilder);
1110 const OptionalTimeoutDuration loopTimeoutDuration =
1111 makeTimeoutDuration(mExecutionBuilder->getLoopTimeoutDuration());
1112 OptionalTimeoutDuration optionalTimeoutDurationAfterFence;
1113 if (timeoutDurationAfterFence > 0) {
1114 optionalTimeoutDurationAfterFence.nanoseconds(timeoutDurationAfterFence);
1115 }
1116 const auto [n, syncFence, computeFencedCallback, timing] = mPreparedModel->executeFenced(
1117 mInputs, mOutputs, mMemories.getObjects(), waitFor, measure, deadline,
1118 loopTimeoutDuration, optionalTimeoutDurationAfterFence);
1119 if (syncFence < 0 && computeFencedCallback == nullptr) {
1120 mExecutionBuilder->reportTimingWithoutFencedExecutionCallback(timing);
1121 }
1122 return {n, syncFence, computeFencedCallback};
1123 }
1124
1125 // For cpuFallback{Partial,Full}, recompile the model on CPU and then start compute.
1126 std::tuple<int, std::vector<OutputShape>, Timing> StepExecutor::computeOnCpuFallback() {
1127 NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "StepExecutor::computeOnCpuFallback");
1128 VLOG(EXECUTION) << "Re-compile the model on CPU";
1129 mDevice = DeviceManager::getCpuDevice();
1130 mPreparedModel = nullptr;
1131 const ModelFactory makeModel = [this] { return mModel->makeHidlModel(); };
1132 // TODO: Propagate user preference and compilation priority to this point instead of using
1133 // default values of ANEURALNETWORKS_PREFER_FAST_SINGLE_ANSWER and
1134 // ANEURALNETWORKS_PRIORITY_MEDIUM
1135 const ExecutionPreference preference =
1136 static_cast<ExecutionPreference>(ANEURALNETWORKS_PREFER_FAST_SINGLE_ANSWER);
1137 const Priority priority = convertToHalPriority(ANEURALNETWORKS_PRIORITY_DEFAULT);
1138 auto [n, preparedModel] = mDevice->prepareModel(makeModel, preference, priority, {}, {}, {});
1139 mPreparedModel = std::move(preparedModel);
1140 if (n != ANEURALNETWORKS_NO_ERROR) {
1141 return {n, {}, kNoTiming};
1142 }
1143
1144 // Prepare device memories for CPU fallback.
1145 std::vector<const Memory*> memories = mMemories.getObjects();
1146 std::vector<bool> isUsedAsInput(memories.size(), false);
1147 std::vector<bool> isUsedAsOutput(memories.size(), false);
1148 std::vector<std::unique_ptr<Memory>> blobAhwbs;
1149
1150 // Mark the input and output usages.
1151 for (auto& input : mInputs) {
1152 if (input.state() == ModelArgumentInfo::MEMORY) {
1153 const uint32_t poolIndex = input.locationAndLength().poolIndex;
1154 isUsedAsInput[poolIndex] = true;
1155 }
1156 }
1157 for (auto& output : mOutputs) {
1158 if (output.state() == ModelArgumentInfo::MEMORY) {
1159 const uint32_t poolIndex = output.locationAndLength().poolIndex;
1160 // Cannot allocate output buffers with unknown shapes.
1161 if (mMemories[poolIndex]->getValidator().createdWithUnknownShape()) {
1162 LOG(ERROR) << "Cannot fallback to CPU because at least one of the output operands "
1163 "has unknown shape.";
1164 return {ANEURALNETWORKS_OP_FAILED, {}, kNoTiming};
1165 }
1166 isUsedAsOutput[poolIndex] = true;
1167 }
1168 }
1169
1170 // Allocate BLOB mode AHardwareBuffers and read the data from input device memories.
1171 for (uint32_t i = 0; i < memories.size(); i++) {
1172 const Memory* memory = mMemories[i];
1173 if (memory->getIBuffer() != nullptr) {
1174 const uint32_t size = memory->getValidator().getMetadata().logicalSize;
1175 auto [nAhwb, blobAhwb] = MemoryRuntimeAHWB::create(size);
1176 if (nAhwb != ANEURALNETWORKS_NO_ERROR) {
1177 return {nAhwb, {}, kNoTiming};
1178 }
1179 if (isUsedAsInput[i]) {
1180 n = copyIBufferToHidlMemory(memory->getIBuffer(), blobAhwb->getHidlMemory());
1181 if (n != ANEURALNETWORKS_NO_ERROR) {
1182 return {n, {}, kNoTiming};
1183 }
1184 }
1185 memories[i] = blobAhwb.get();
1186 blobAhwbs.push_back(std::move(blobAhwb));
1187 }
1188 }
1189
1190 auto [nCompute, outputShapes, timing] = computeWithMemories({}, memories);
1191 if (nCompute != ANEURALNETWORKS_NO_ERROR) {
1192 return {nCompute, std::move(outputShapes), timing};
1193 }
1194
1195 // Write back to output device memories.
1196 for (uint32_t i = 0; i < memories.size(); i++) {
1197 const Memory* memory = mMemories[i];
1198 if (memory->getIBuffer() != nullptr && isUsedAsOutput[i]) {
1199 n = copyHidlMemoryToIBuffer(memories[i]->getHidlMemory(), memory->getIBuffer(), {});
1200 if (n != ANEURALNETWORKS_NO_ERROR) {
1201 return {n, {}, kNoTiming};
1202 }
1203 }
1204 }
1205 return {ANEURALNETWORKS_NO_ERROR, std::move(outputShapes), timing};
1206 }
1207
1208 } // namespace nn
1209 } // namespace android
1210