/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define LOG_TAG "Manager"

#include "Manager.h"

#include <android/hidl/manager/1.2/IServiceManager.h>
#include <build/version.h>
#include <cutils/native_handle.h>
#include <hidl/HidlTransportSupport.h>
#include <hidl/ServiceManagement.h>

#include <algorithm>
#include <functional>
#include <memory>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

#include "Callbacks.h"
#include "CpuExecutor.h"
#include "ExecutionBurstController.h"
#include "HalInterfaces.h"
#include "Memory.h"
#include "MetaModel.h"
#include "ModelArgumentInfo.h"
#include "Tracing.h"
#include "TypeManager.h"
#include "Utils.h"
#include "VersionedInterfaces.h"

namespace android {
namespace nn {

using namespace hal;

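// Timing reported when no measurement is available; per the HAL, UINT64_MAX means
// "no timing information".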
const Timing kNoTiming = {.timeOnDevice = UINT64_MAX, .timeInDriver = UINT64_MAX};

// A Device backed by an actual driver.
class DriverDevice : public Device {
   public:
    // Create a DriverDevice from a name and a DeviceFactory function.
    // Returns nullptr on failure.
    static std::shared_ptr<DriverDevice> create(const std::string& name,
                                                const DeviceFactory& makeDevice);

    // Prefer using DriverDevice::create.
    DriverDevice(std::shared_ptr<VersionedIDevice> device);

    const std::string& getName() const override { return kInterface->getName(); }
    const std::string& getVersionString() const override { return kInterface->getVersionString(); }
    int64_t getFeatureLevel() const override { return kInterface->getFeatureLevel(); }
    int32_t getType() const override { return kInterface->getType(); }
    const std::vector<Extension>& getSupportedExtensions() const override {
        return kInterface->getSupportedExtensions();
    }
    std::vector<bool> getSupportedOperations(const MetaModel& metaModel) const override;
    PerformanceInfo getPerformance(OperandType type) const override {
        const auto& capabilities = kInterface->getCapabilities();
        return lookup(capabilities.operandPerformance, type);
    }
    PerformanceInfo getRelaxedFloat32toFloat16PerformanceScalar() const override {
        const auto& capabilities = kInterface->getCapabilities();
        return capabilities.relaxedFloat32toFloat16PerformanceScalar;
    }
    PerformanceInfo getRelaxedFloat32toFloat16PerformanceTensor() const override {
        const auto& capabilities = kInterface->getCapabilities();
        return capabilities.relaxedFloat32toFloat16PerformanceTensor;
    }
    PerformanceInfo getIfPerformance() const override {
        const auto& capabilities = kInterface->getCapabilities();
        return capabilities.ifPerformance;
    }
    PerformanceInfo getWhilePerformance() const override {
        const auto& capabilities = kInterface->getCapabilities();
        return capabilities.whilePerformance;
    }
    bool isCachingSupported() const override {
        // Caching is supported if numModelCache or numDataCache is greater than 0.
        const auto [numModelCacheFiles, numDataCacheFiles] =
                kInterface->getNumberOfCacheFilesNeeded();
        return numModelCacheFiles > 0 || numDataCacheFiles > 0;
    }
    int wait() const override { return kInterface->wait(); }

    std::pair<int, std::shared_ptr<PreparedModel>> prepareModel(
            const ModelFactory& makeModel, ExecutionPreference preference, Priority priority,
            const std::optional<Deadline>& deadline, const std::string& cacheDir,
            const std::optional<CacheToken>& maybeToken) const override;

    std::pair<int, std::unique_ptr<Memory>> allocate(const MemoryDescriptor& desc,
                                                     hal::OperandType) const override;

   private:
    const std::shared_ptr<VersionedIDevice> kInterface;

#ifdef NN_DEBUGGABLE
    // For debugging: controls the behavior of getSupportedOperations() for the SampleDriver.
    // 0 - use all operations reported as supported by IDevice::getSupportedOperations()
    // 1 - use only some of the operations reported as supported (see getSupportedOperations())
    uint32_t mSupported = 0;
#endif  // NN_DEBUGGABLE
};

// A PreparedModel with an underlying IPreparedModel instance returned by the actual driver.
class DriverPreparedModel : public PreparedModel {
   public:
    DriverPreparedModel(const Device* device,
                        const std::shared_ptr<VersionedIPreparedModel>& preparedModel)
        : mDevice(device), mPreparedModel(preparedModel) {
        CHECK(mDevice != nullptr);
        CHECK(mPreparedModel != nullptr);
    }

    const Device* getDevice() const override { return mDevice; }
    std::shared_ptr<VersionedIPreparedModel> getInterface() const override {
        return mPreparedModel;
    }
    std::tuple<int, std::vector<OutputShape>, Timing> execute(
            const std::vector<ModelArgumentInfo>& inputs,
            const std::vector<ModelArgumentInfo>& outputs,
            const std::vector<const Memory*>& memories,
            const std::shared_ptr<ExecutionBurstController>& burstController, MeasureTiming measure,
            const std::optional<Deadline>& deadline,
            const OptionalTimeoutDuration& loopTimeoutDuration) const override;

    std::tuple<int, int, sp<hal::IFencedExecutionCallback>, hal::Timing> executeFenced(
            const std::vector<ModelArgumentInfo>& inputs,
            const std::vector<ModelArgumentInfo>& outputs,
            const std::vector<const Memory*>& memories, const std::vector<int>& waitFor,
            MeasureTiming measure, const std::optional<Deadline>& deadline,
            const OptionalTimeoutDuration& loopTimeoutDuration,
            const hal::OptionalTimeoutDuration& timeoutDurationAfterFence) const override;

    std::shared_ptr<ExecutionBurstController> configureExecutionBurst(
            bool preferPowerOverLatency) const override {
        return mPreparedModel->configureExecutionBurst(preferPowerOverLatency);
    }

   private:
    const Device* mDevice;
    const std::shared_ptr<VersionedIPreparedModel> mPreparedModel;
};

DriverDevice::DriverDevice(std::shared_ptr<VersionedIDevice> device)
    : kInterface(std::move(device)) {
    CHECK(kInterface != nullptr);
#ifdef NN_DEBUGGABLE
    static const char samplePrefix[] = "sample";
    if (getName().substr(0, sizeof(samplePrefix) - 1) == samplePrefix) {
        mSupported = getProp("debug.nn.sample.supported");
    }
#endif  // NN_DEBUGGABLE
}

std::shared_ptr<DriverDevice> DriverDevice::create(const std::string& name,
                                                   const DeviceFactory& makeDevice) {
    CHECK(makeDevice != nullptr);
    std::shared_ptr<VersionedIDevice> device = VersionedIDevice::create(name, makeDevice);
    if (device == nullptr) {
        LOG(ERROR) << "DriverDevice::create failed to create VersionedIDevice object for service "
                   << name;
        return nullptr;
    }

    return std::make_shared<DriverDevice>(std::move(device));
}

std::vector<bool> DriverDevice::getSupportedOperations(const MetaModel& metaModel) const {
    // Query the driver for what it can do.
    ErrorStatus status = ErrorStatus::GENERAL_FAILURE;
    std::vector<bool> supportedOperations;
    std::tie(status, supportedOperations) = kInterface->getSupportedOperations(metaModel);

    const Model& hidlModel = metaModel.getModel();
    const uint32_t operationCount = hidlModel.main.operations.size();
    if (status != ErrorStatus::NONE) {
        LOG(ERROR) << "IDevice::getSupportedOperations returned the error " << toString(status);
        // Set the supported operation vector to all false, so we won't use this driver.
        return std::vector<bool>(operationCount, false);
    }
    if (supportedOperations.size() != operationCount) {
        LOG(ERROR) << "IDevice::getSupportedOperations returned a vector of length "
                   << supportedOperations.size() << " when expecting " << operationCount;
        // Set the supported operation vector to all false, so we won't use this driver.
        return std::vector<bool>(operationCount, false);
    }

#ifdef NN_DEBUGGABLE
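    // For testing purposes only: when debug.nn.sample.supported is set to 1, pseudo-randomly
    // mark some of the operations that the sample driver reports as supported as unsupported.
    // The decision is a deterministic hash of the device name, the operation type, and its
    // operands, so repeated runs on the same model partition the same way.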
    if (mSupported != 1) {
        return supportedOperations;
    }

    const uint32_t baseAccumulator = std::hash<std::string>{}(getName());
    for (size_t operationIndex = 0; operationIndex < supportedOperations.size(); operationIndex++) {
        if (!supportedOperations[operationIndex]) {
            continue;
        }

        uint32_t accumulator = baseAccumulator;
        const Operation& operation = hidlModel.main.operations[operationIndex];
        accumulator ^= static_cast<uint32_t>(operation.type);
        auto accumulateOperands = [&hidlModel, &accumulator](const hidl_vec<uint32_t>& operands) {
            for (uint32_t operandIndex : operands) {
                const Operand& operand = hidlModel.main.operands[operandIndex];
                accumulator ^= static_cast<uint32_t>(operand.type);
                accumulator ^= operand.dimensions.size();
                for (uint32_t dimension : operand.dimensions) {
                    accumulator ^= dimension;
                    if (operand.lifetime == OperandLifeTime::CONSTANT_COPY ||
                        operand.lifetime == OperandLifeTime::CONSTANT_REFERENCE) {
                        accumulator ^= 1;
                    }
                }
            }
        };
        accumulateOperands(operation.inputs);
        accumulateOperands(operation.outputs);
        if (accumulator & 1) {
            supportedOperations[operationIndex] = false;
        }
    }
#endif  // NN_DEBUGGABLE

    return supportedOperations;
}

std::pair<int, std::shared_ptr<PreparedModel>> DriverDevice::prepareModel(
        const ModelFactory& makeModel, ExecutionPreference preference, Priority priority,
        const std::optional<Deadline>& deadline, const std::string& cacheDir,
        const std::optional<CacheToken>& maybeToken) const {
    const auto [n, preparedModel] = kInterface->prepareModel(makeModel, preference, priority,
                                                             deadline, cacheDir, maybeToken);
    if (n != ANEURALNETWORKS_NO_ERROR) {
        return {n, nullptr};
    }
    CHECK(preparedModel != nullptr) << "prepareModel returned nullptr without error code";
    return {ANEURALNETWORKS_NO_ERROR, std::make_shared<DriverPreparedModel>(this, preparedModel)};
}

std::pair<int, std::unique_ptr<Memory>> DriverDevice::allocate(const MemoryDescriptor& desc,
                                                               hal::OperandType) const {
    const BufferDesc hidlDesc = {.dimensions = desc.dimensions};
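    // Gather the driver-level (VersionedIPreparedModel) handles for every runtime PreparedModel
    // referenced by the descriptor's roles, as required by the HIDL IDevice::allocate() call.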
    std::vector<std::shared_ptr<VersionedIPreparedModel>> preparedModels(
            desc.preparedModels.size());
    std::transform(desc.preparedModels.begin(), desc.preparedModels.end(), preparedModels.begin(),
                   [](const auto* preparedModel) {
                       const auto versionedPreparedModel = preparedModel->getInterface();
                       CHECK(versionedPreparedModel != nullptr);
                       return versionedPreparedModel;
                   });
    auto [status, buffer, token] =
            kInterface->allocate(hidlDesc, preparedModels, desc.inputRoles, desc.outputRoles);
    if (status != ErrorStatus::NONE) {
        LOG(ERROR) << "DriverDevice::allocate -- memory allocation on device " << getName()
                   << " failed!";
        return {convertErrorStatusToResultCode(status), nullptr};
    }
    return MemoryFromDevice::create(std::move(buffer), token);
}

// Figures out how to place each of the inputs or outputs in a buffer. This just does the layout
// and memory allocation; it does not copy data. Aligns each input a bit.
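// Returns the result code, the newly allocated ashmem region backing all of the pointer
// arguments (nullptr if there are none), and the location assigned to each pointer argument
// within that region.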
static std::tuple<int, std::unique_ptr<MemoryAshmem>, std::vector<DataLocation>>
allocatePointerArgumentsToPool(const std::vector<ModelArgumentInfo>& args,
                               std::vector<const Memory*>* memories) {
    CHECK(memories != nullptr);
    std::vector<DataLocation> ptrArgsLocations;
    const uint32_t nextPoolIndex = memories->size();
    int64_t total = 0;
    for (const auto& info : args) {
        if (info.state() == ModelArgumentInfo::POINTER) {
            // TODO Good enough alignment?
            total += alignBytesNeeded(static_cast<uint32_t>(total), info.length());
            ptrArgsLocations.push_back({.poolIndex = nextPoolIndex,
                                        .offset = static_cast<uint32_t>(total),
                                        .length = info.length()});
            total += info.length();
        }
    }
    if (total > 0xFFFFFFFF) {
        LOG(ERROR) << "allocatePointerArgumentsToPool: ANeuralNetworksExecution: Size of all "
                      "inputs or outputs exceeds 2^32.";
        return {ANEURALNETWORKS_BAD_DATA, nullptr, std::vector<DataLocation>{}};
    }
    if (total <= 0) {
        return {ANEURALNETWORKS_NO_ERROR, nullptr, std::vector<DataLocation>{}};
    }
    auto [n, memory] = MemoryAshmem::create(total);
    if (n != ANEURALNETWORKS_NO_ERROR) {
        return {n, nullptr, std::vector<DataLocation>{}};
    }
    memories->push_back(memory.get());
    return {ANEURALNETWORKS_NO_ERROR, std::move(memory), std::move(ptrArgsLocations)};
}

// Perform computation on an actual HIDL driver.
//
// Because HIDL cannot take raw pointers, two separate memory pools will be allocated for inputs
// and outputs specified by pointers. The input pointer data will be copied to the input pool
// prior to execution, and the output pointer data will be copied out from the output pool after
// the execution.
//
// The HIDL invocation will choose between sync/async execution according to
// DeviceManager::mSyncExecHal.
std::tuple<int, std::vector<OutputShape>, Timing> DriverPreparedModel::execute(
        const std::vector<ModelArgumentInfo>& inputs, const std::vector<ModelArgumentInfo>& outputs,
        const std::vector<const Memory*>& memories,
        const std::shared_ptr<ExecutionBurstController>& burstController, MeasureTiming measure,
        const std::optional<Deadline>& deadline,
        const OptionalTimeoutDuration& loopTimeoutDuration) const {
    NNTRACE_RT(NNTRACE_PHASE_INPUTS_AND_OUTPUTS, "DriverPreparedModel::execute");

    // Make a copy of the memory tracker as we will append memory pools for pointer arguments.
    std::vector<const Memory*> localMemories = memories;

    // We separate the input & output pools so accelerators only need to copy
    // the contents of the input pools. We could also use them to set protection
    // on read-only memory, but that's not currently done.

    // Lay out the input and output data.
    const auto [n1, inputPtrArgsMemory, inputPtrArgsLocations] =
            allocatePointerArgumentsToPool(inputs, &localMemories);
    if (n1 != ANEURALNETWORKS_NO_ERROR) {
        return {n1, {}, kNoTiming};
    }
    const auto [n2, outputPtrArgsMemory, outputPtrArgsLocations] =
            allocatePointerArgumentsToPool(outputs, &localMemories);
    if (n2 != ANEURALNETWORKS_NO_ERROR) {
        return {n2, {}, kNoTiming};
    }

    // Copy the input data that was specified via a pointer.
    if (inputPtrArgsMemory != nullptr) {
        uint32_t ptrInputIndex = 0;
        for (const auto& info : inputs) {
            if (info.state() == ModelArgumentInfo::POINTER) {
                const DataLocation& loc = inputPtrArgsLocations[ptrInputIndex++];
                uint8_t* const data = inputPtrArgsMemory->getPointer();
                memcpy(data + loc.offset, info.buffer(), loc.length);
            }
        }
    }

    Request request;
    request.inputs = createRequestArguments(inputs, inputPtrArgsLocations);
    request.outputs = createRequestArguments(outputs, outputPtrArgsLocations);
    uint32_t count = localMemories.size();
    request.pools.resize(count);
    for (uint32_t i = 0; i < count; i++) {
        request.pools[i] = localMemories[i]->getMemoryPool();
    }

    NNTRACE_FULL_SWITCH(NNTRACE_LAYER_IPC, NNTRACE_PHASE_EXECUTION,
                        "DriverPreparedModel::execute::execute");

    int n = ANEURALNETWORKS_OP_FAILED;
    std::vector<OutputShape> outputShapes;
    Timing timing = kNoTiming;

    // compute using burst if present
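    // (A burst reuses fast message queues between the runtime and the driver across executions,
    // reducing per-execution IPC overhead; see ExecutionBurstController.)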
    const bool burstCompute = (burstController != nullptr);
    bool burstFallback = true;
    if (burstCompute) {
        const bool compliant = compliantWithV1_2(request);
        if (compliant) {
            V1_0::Request request12 = convertToV1_2(request);
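            // Register each memory pool with the burst controller and record its key, so the
            // controller can cache the memory across executions and drop the cached entry when
            // the Memory object is freed.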
            std::vector<intptr_t> memoryIds;
            memoryIds.reserve(localMemories.size());
            for (const Memory* memory : localMemories) {
                memory->usedBy(burstController);
                memoryIds.push_back(memory->getKey());
            }

            VLOG(EXECUTION) << "Before ExecutionBurstController->compute() "
                            << SHOW_IF_DEBUG(toString(request12));
            std::tie(n, outputShapes, timing, burstFallback) =
                    burstController->compute(request12, measure, memoryIds);
        }
    }

    // compute from IPreparedModel if either:
    // (1) burst was not supplied, or
    // (2) the burst execution failed and requested a fallback execution
    if (!burstCompute || burstFallback) {
        const bool preferSynchronous = DeviceManager::get()->syncExecHal();
        std::tie(n, outputShapes, timing) = mPreparedModel->execute(
                request, measure, deadline, loopTimeoutDuration, preferSynchronous);
    }

    if (n != ANEURALNETWORKS_NO_ERROR) {
        VLOG(EXECUTION) << "**Execution failed**";
        return {n, std::move(outputShapes), timing};
    }

    // Copy the output data from shared memory to the output buffers.
    NNTRACE_RT_SWITCH(NNTRACE_PHASE_RESULTS, "DriverPreparedModel::execute");
    if (outputPtrArgsMemory != nullptr) {
        uint32_t ptrOutputIndex = 0;
        for (const auto& info : outputs) {
            if (info.state() == ModelArgumentInfo::POINTER) {
                const DataLocation& loc = outputPtrArgsLocations[ptrOutputIndex++];
                const uint8_t* const data = outputPtrArgsMemory->getPointer();
                memcpy(info.buffer(), data + loc.offset, loc.length);
            }
        }
    }

    VLOG(EXECUTION) << "DriverPreparedModel::execute completed";
    return {ANEURALNETWORKS_NO_ERROR, std::move(outputShapes), timing};
}

std::tuple<int, int, sp<hal::IFencedExecutionCallback>, hal::Timing>
DriverPreparedModel::executeFenced(
        const std::vector<ModelArgumentInfo>& inputs, const std::vector<ModelArgumentInfo>& outputs,
        const std::vector<const Memory*>& memories, const std::vector<int>& waitFor,
        hal::MeasureTiming measure, const std::optional<Deadline>& deadline,
        const OptionalTimeoutDuration& loopTimeoutDuration,
        const hal::OptionalTimeoutDuration& timeoutDurationAfterFence) const {
    NNTRACE_RT(NNTRACE_PHASE_INPUTS_AND_OUTPUTS, "DriverPreparedModel::executeFenced");
    CHECK(std::all_of(waitFor.begin(), waitFor.end(), [](int fd) { return fd > 0; }));
    // Make a copy of the memory tracker as we will append memory pools for pointer arguments.
    std::vector<const Memory*> localMemories = memories;
    sp<hal::IFencedExecutionCallback> executeFencedCallback;
    hal::Timing timing = kNoTiming;

    // We separate the input & output pools so accelerators only need to copy
    // the contents of the input pools. We could also use them to set protection
    // on read-only memory, but that's not currently done.

    // Lay out the input and output data.
    const auto [n1, inputPtrArgsMemory, inputPtrArgsLocations] =
            allocatePointerArgumentsToPool(inputs, &localMemories);
    if (n1 != ANEURALNETWORKS_NO_ERROR) {
        return {n1, -1, nullptr, timing};
    }
    const auto [n2, outputPtrArgsMemory, outputPtrArgsLocations] =
            allocatePointerArgumentsToPool(outputs, &localMemories);
    if (n2 != ANEURALNETWORKS_NO_ERROR) {
        return {n2, -1, nullptr, timing};
    }

    // Copy the input data that was specified via a pointer.
    if (inputPtrArgsMemory != nullptr) {
        uint32_t ptrInputIndex = 0;
        for (const auto& info : inputs) {
            if (info.state() == ModelArgumentInfo::POINTER) {
                const DataLocation& loc = inputPtrArgsLocations[ptrInputIndex++];
                uint8_t* const data = inputPtrArgsMemory->getPointer();
                memcpy(data + loc.offset, info.buffer(), loc.length);
            }
        }
    }

    Request request;
    request.inputs = createRequestArguments(inputs, inputPtrArgsLocations);
    request.outputs = createRequestArguments(outputs, outputPtrArgsLocations);
    uint32_t count = localMemories.size();
    request.pools.resize(count);
    for (uint32_t i = 0; i < count; i++) {
        request.pools[i] = localMemories[i]->getMemoryPool();
    }

    NNTRACE_FULL_SWITCH(NNTRACE_LAYER_IPC, NNTRACE_PHASE_EXECUTION,
                        "DriverPreparedModel::executeFenced");

    int n = ANEURALNETWORKS_OP_FAILED;
    hidl_vec<hidl_handle> waitForHandles;
    waitForHandles.resize(waitFor.size());
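    // Wrap each fence fd in a single-fd native_handle owned by a hidl_handle. The fd is dup()ed
    // so the handle owns its own copy and the caller's fd remains valid.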
    for (uint32_t i = 0; i < waitFor.size(); i++) {
        native_handle_t* nativeHandle = native_handle_create(1, 0);
        if (nativeHandle == nullptr) {
            LOG(ERROR) << "Failed to create native_handle";
            return {n, -1, nullptr, timing};
        }
        int dupFd = dup(waitFor[i]);
        if (dupFd <= 0) {
            LOG(ERROR) << "Unable to dup the file descriptor";
            return {n, -1, nullptr, timing};
        }
        nativeHandle->data[0] = dupFd;
        hidl_handle hidlHandle;
        hidlHandle.setTo(nativeHandle, /*shouldOwn=*/true);
        waitForHandles[i] = std::move(hidlHandle);
    }

    hidl_handle syncFence;
    std::tie(n, syncFence, executeFencedCallback, timing) =
            mPreparedModel->executeFenced(request, waitForHandles, measure, deadline,
                                          loopTimeoutDuration, timeoutDurationAfterFence);

    if (n != ANEURALNETWORKS_NO_ERROR) {
        VLOG(EXECUTION) << "**executeFenced failed**";
        return {n, -1, nullptr, timing};
    }

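    // The returned sync fence fd is owned by the hidl_handle; dup it so the runtime can hand
    // back a fence that remains valid after the handle is destroyed.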
    int syncFenceFd = -1;
    if (syncFence.getNativeHandle()) {
        syncFenceFd = dup(syncFence.getNativeHandle()->data[0]);
        if (syncFenceFd < 0) {
            LOG(ERROR) << "Failed to dup the file descriptor";
            return {ANEURALNETWORKS_OP_FAILED, -1, nullptr, timing};
        }
    }
    // If an output buffer was provided as a malloc'd pointer, wait for the execution to finish,
    // then copy the output data from shared memory to the output buffers.
    if (outputPtrArgsMemory != nullptr) {
        NNTRACE_RT_SWITCH(NNTRACE_PHASE_RESULTS, "DriverPreparedModel::executeFenced");
        if (syncFenceFd > 0) {
            auto r = syncWait(syncFenceFd, -1);
            if (r != FenceState::SIGNALED) {
                LOG(ERROR) << "syncWait failed, fd: " << syncFenceFd;
                return {ANEURALNETWORKS_OP_FAILED, syncFenceFd, nullptr, timing};
            }
        }
        uint32_t ptrOutputIndex = 0;
        for (const auto& info : outputs) {
            if (info.state() == ModelArgumentInfo::POINTER) {
                const DataLocation& loc = outputPtrArgsLocations[ptrOutputIndex++];
                const uint8_t* const data = outputPtrArgsMemory->getPointer();
                memcpy(info.buffer(), data + loc.offset, loc.length);
            }
        }
    }

    VLOG(EXECUTION) << "DriverPreparedModel::executeFenced completed";
    return {ANEURALNETWORKS_NO_ERROR, syncFenceFd, executeFencedCallback, timing};
}

// A special abstracted device for the CPU. Only one instance of this class will exist.
// Use get() to retrieve it.
class CpuDevice : public Device {
   public:
    // Returns the singleton CPU fallback device.
    static std::shared_ptr<CpuDevice> get() {
        static std::shared_ptr<CpuDevice> instance(new CpuDevice);
        return instance;
    }

    const std::string& getName() const override { return kName; }
    const std::string& getVersionString() const override { return kVersionString; }
    int64_t getFeatureLevel() const override { return kFeatureLevel; }
    int32_t getType() const override { return ANEURALNETWORKS_DEVICE_CPU; }
    const std::vector<Extension>& getSupportedExtensions() const override {
        return kSupportedExtensions;
    }
    std::vector<bool> getSupportedOperations(const MetaModel& metaModel) const override;
    PerformanceInfo getPerformance(OperandType) const override { return kPerformance; }
    PerformanceInfo getRelaxedFloat32toFloat16PerformanceScalar() const override {
        return kPerformance;
    }
    PerformanceInfo getRelaxedFloat32toFloat16PerformanceTensor() const override {
        return kPerformance;
    }
    PerformanceInfo getIfPerformance() const override { return kPerformance; }
    PerformanceInfo getWhilePerformance() const override { return kPerformance; }
    bool isCachingSupported() const override { return false; }
    int wait() const override { return ANEURALNETWORKS_NO_ERROR; }

    std::pair<int, std::shared_ptr<PreparedModel>> prepareModel(
            const ModelFactory& makeModel, ExecutionPreference preference, Priority priority,
            const std::optional<Deadline>& deadline, const std::string& cacheDir,
            const std::optional<CacheToken>& maybeToken) const override;

    std::pair<int, std::unique_ptr<Memory>> allocate(const MemoryDescriptor& desc,
                                                     OperandType type) const override;

   private:
    CpuDevice() = default;
    const int64_t kFeatureLevel = __ANDROID_API__;
    const std::string kName = "nnapi-reference";
    const std::string kVersionString = build::GetBuildNumber();
    // Since the performance is a ratio compared to the CPU performance,
    // by definition the performance of the CPU is 1.0.
    const PerformanceInfo kPerformance = {.execTime = 1.0f, .powerUsage = 1.0f};
    const std::vector<Extension> kSupportedExtensions{/* No extensions. */};
};

// A special abstracted PreparedModel for the CPU, constructed by CpuDevice.
class CpuPreparedModel : public PreparedModel {
   public:
    // Factory method for CpuPreparedModel. Returns ANEURALNETWORKS_NO_ERROR and
    // a prepared model object if successfully created. Returns an error code
    // and nullptr otherwise.
    static std::pair<int, std::shared_ptr<PreparedModel>> create(Model hidlModel);

    const Device* getDevice() const override { return CpuDevice::get().get(); }
    std::shared_ptr<VersionedIPreparedModel> getInterface() const override { return nullptr; }

    std::tuple<int, std::vector<OutputShape>, Timing> execute(
            const std::vector<ModelArgumentInfo>& inputs,
            const std::vector<ModelArgumentInfo>& outputs,
            const std::vector<const Memory*>& memories,
            const std::shared_ptr<ExecutionBurstController>& burstController, MeasureTiming measure,
            const std::optional<Deadline>& deadline,
            const OptionalTimeoutDuration& loopTimeoutDuration) const override;

    std::shared_ptr<ExecutionBurstController> configureExecutionBurst(
            bool /*preferPowerOverLatency*/) const override {
        return nullptr;
    }

    std::tuple<int, int, sp<hal::IFencedExecutionCallback>, hal::Timing> executeFenced(
            const std::vector<ModelArgumentInfo>& inputs,
            const std::vector<ModelArgumentInfo>& outputs,
            const std::vector<const Memory*>& memories, const std::vector<int>& wait_for,
            MeasureTiming measure, const std::optional<Deadline>& deadline,
            const OptionalTimeoutDuration& loopTimeoutDuration,
            const hal::OptionalTimeoutDuration& timeoutDurationAfterFence) const override;

    // Prefer to use CpuPreparedModel::create.
    CpuPreparedModel(Model model, std::vector<RunTimePoolInfo> poolInfos)
        : mModel(std::move(model)), mModelPoolInfos(std::move(poolInfos)) {}

   private:
    const Model mModel;
    const std::vector<RunTimePoolInfo> mModelPoolInfos;
};

std::vector<bool> CpuDevice::getSupportedOperations(const MetaModel& metaModel) const {
    const Model& hidlModel = metaModel.getModel();
    const size_t count = hidlModel.main.operations.size();
    std::vector<bool> result(count, false);
    for (size_t i = 0; i < count; i++) {
        // TODO(b/119870033): Decide whether and how post-P operations would be supported on CPU.
        // We may want to use the slicer for CpuDevice just as we do for
        // DriverDevice.
        OperationType operationType = hidlModel.main.operations[i].type;
        result[i] = !isExtensionOperationType(operationType) &&
                    operationType != OperationType::OEM_OPERATION;
    }
    return result;
}

std::pair<int, std::shared_ptr<PreparedModel>> CpuDevice::prepareModel(
        const ModelFactory& makeModel, ExecutionPreference preference, Priority priority,
        const std::optional<Deadline>& deadline, const std::string& /*cacheDir*/,
        const std::optional<CacheToken>& maybeToken) const {
    CHECK(!maybeToken.has_value())
            << "Should never call prepareModel with cache information on CpuDevice";

    const Model model = makeModel();
    if (!validateModel(model, ValidationMode::RUNTIME) ||
        !validateExecutionPreference(preference) || !validatePriority(priority)) {
        return {ANEURALNETWORKS_OP_FAILED, nullptr};
    }
    if (hasDeadlinePassed(deadline)) {
        return {ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT, nullptr};
    }

    return CpuPreparedModel::create(model);
}

std::pair<int, std::unique_ptr<Memory>> CpuDevice::allocate(const MemoryDescriptor& desc,
                                                            OperandType type) const {
    uint32_t size = TypeManager::get()->getSizeOfData(type, desc.dimensions);
    if (size == 0) {
        LOG(ERROR) << "CpuDevice::allocate -- does not support unknown dimensions.";
        return {ANEURALNETWORKS_OP_FAILED, nullptr};
    }
    return MemoryAshmem::create(size);
}

std::pair<int, std::shared_ptr<PreparedModel>> CpuPreparedModel::create(Model hidlModel) {
    std::vector<RunTimePoolInfo> poolInfos;
    if (!setRunTimePoolInfosFromHidlMemories(&poolInfos, hidlModel.pools)) {
        return {ANEURALNETWORKS_UNMAPPABLE, nullptr};
    }

    std::shared_ptr<PreparedModel> preparedModel =
            std::make_shared<CpuPreparedModel>(std::move(hidlModel), std::move(poolInfos));
    return {ANEURALNETWORKS_NO_ERROR, std::move(preparedModel)};
}

static std::tuple<int, std::vector<OutputShape>, Timing> computeOnCpu(
        const Model& model, const Request& request,
        const std::vector<RunTimePoolInfo>& modelPoolInfos,
        const std::vector<RunTimePoolInfo>& requestPoolInfos,
        const std::optional<Deadline>& deadline,
        const OptionalTimeoutDuration& loopTimeoutDuration) {
    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "computeOnCpu");
    CpuExecutor executor;
    if (loopTimeoutDuration.getDiscriminator() !=
        OptionalTimeoutDuration::hidl_discriminator::none) {
        executor.setLoopTimeout(loopTimeoutDuration.nanoseconds());
    }
    if (deadline.has_value()) {
        executor.setDeadline(*deadline);
    }
    int err = executor.run(model, request, modelPoolInfos, requestPoolInfos);
    const auto& outputShapes = executor.getOutputShapes();
    return {err, outputShapes, kNoTiming};
}

std::tuple<int, int, sp<hal::IFencedExecutionCallback>, hal::Timing>
CpuPreparedModel::executeFenced(const std::vector<ModelArgumentInfo>& inputs,
                                const std::vector<ModelArgumentInfo>& outputs,
                                const std::vector<const Memory*>& memories,
                                const std::vector<int>& waitFor, hal::MeasureTiming measure,
                                const std::optional<Deadline>& deadline,
                                const OptionalTimeoutDuration& loopTimeoutDuration,
                                const hal::OptionalTimeoutDuration& duration) const {
    VLOG(EXECUTION)
            << "CpuPreparedModel::executeFenced wait for sync fences to signal before execution";
    for (int syncFd : waitFor) {
        if (syncFd > 0) {
            auto r = syncWait(syncFd, -1);
            if (r != FenceState::SIGNALED) {
                LOG(ERROR) << "sync wait failed, fd: " << syncFd;
                return {ANEURALNETWORKS_OP_FAILED, -1, nullptr, {UINT64_MAX, UINT64_MAX}};
            }
        }
    }

    // Update the deadline if the timeout duration is closer than the deadline.
    auto closestDeadline = deadline;
    if (duration.getDiscriminator() != OptionalTimeoutDuration::hidl_discriminator::none) {
        const auto timeoutDurationDeadline = makeDeadline(duration.nanoseconds());
        if (!closestDeadline.has_value() || *closestDeadline > timeoutDurationDeadline) {
            closestDeadline = timeoutDurationDeadline;
        }
    }

    const auto [result, outputShapes, timing] = execute(inputs, outputs, memories, nullptr, measure,
                                                        closestDeadline, loopTimeoutDuration);
    return {result, -1, nullptr, timing};
}

// Perform computation on the NNAPI CPU reference implementation.
//
// Contrary to DriverPreparedModel::execute, the NNAPI CPU reference executor lives in the
// same process as the NNAPI runtime and can take raw pointers. We will create as many pools as
// there are inputs/outputs in this method to avoid data copying.
//
// Will choose between sync/async execution according to DeviceManager::mSyncExecCpu.
std::tuple<int, std::vector<OutputShape>, Timing> CpuPreparedModel::execute(
        const std::vector<ModelArgumentInfo>& inputs, const std::vector<ModelArgumentInfo>& outputs,
        const std::vector<const Memory*>& memories,
        const std::shared_ptr<ExecutionBurstController>& /*burstController*/,
        MeasureTiming /*measure*/, const std::optional<Deadline>& deadline,
        const OptionalTimeoutDuration& loopTimeoutDuration) const {
    if (hasDeadlinePassed(deadline)) {
        return {ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT, {}, kNoTiming};
    }

    std::vector<RunTimePoolInfo> requestPoolInfos;
    requestPoolInfos.reserve(memories.size());
    for (const Memory* mem : memories) {
        if (std::optional<RunTimePoolInfo> poolInfo = mem->getRunTimePoolInfo()) {
            requestPoolInfos.emplace_back(*poolInfo);
        } else {
            return {ANEURALNETWORKS_UNMAPPABLE, {}, kNoTiming};
        }
    }
    // Create as many pools as there are inputs / outputs.
    auto fixPointerArguments =
            [&requestPoolInfos](const std::vector<ModelArgumentInfo>& argumentInfos) {
                std::vector<DataLocation> ptrArgsLocations;
                for (const ModelArgumentInfo& argumentInfo : argumentInfos) {
                    if (argumentInfo.state() == ModelArgumentInfo::POINTER) {
                        ptrArgsLocations.push_back(
                                {.poolIndex = static_cast<uint32_t>(requestPoolInfos.size()),
                                 .offset = 0,
                                 .length = argumentInfo.length()});
                        requestPoolInfos.emplace_back(RunTimePoolInfo::createFromExistingBuffer(
                                static_cast<uint8_t*>(argumentInfo.buffer())));
                    }
                }
                return ptrArgsLocations;
            };
    const std::vector<DataLocation> inputPtrArgsLocations = fixPointerArguments(inputs);
    const std::vector<DataLocation> outputPtrArgsLocations = fixPointerArguments(outputs);

    Request request;
    request.inputs = createRequestArguments(inputs, inputPtrArgsLocations);
    request.outputs = createRequestArguments(outputs, outputPtrArgsLocations);

    if (!DeviceManager::get()->syncExecCpu()) {
        // TODO: use a thread pool
        // TODO(mikie): this could have NNTRACE so we could measure the overhead
        // of spinning up a new thread.
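        // "Asynchronous" CPU execution is simulated by running the computation on a new thread
        // and joining it immediately; capturing by reference is safe because join() returns only
        // after the computation completes.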
        std::tuple<int, std::vector<OutputShape>, Timing> result = {};
        std::thread([this, &request, &requestPoolInfos, &deadline, &loopTimeoutDuration, &result] {
            result = computeOnCpu(mModel, request, mModelPoolInfos, requestPoolInfos, deadline,
                                  loopTimeoutDuration);
        }).join();
        return result;
    }

    return computeOnCpu(mModel, request, mModelPoolInfos, requestPoolInfos, deadline,
                        loopTimeoutDuration);
}

DeviceManager* DeviceManager::get() {
    static DeviceManager manager;
    return &manager;
}

std::shared_ptr<Device> DeviceManager::getCpuDevice() {
    return CpuDevice::get();
}

std::shared_ptr<Device> DeviceManager::forTest_makeDriverDevice(const std::string& name,
                                                                const sp<V1_0::IDevice>& device) {
    const DeviceFactory makeDevice = [device](bool /*blocking*/) { return device; };
    const auto driverDevice = DriverDevice::create(name, makeDevice);
    CHECK(driverDevice != nullptr);
    return driverDevice;
}

void DeviceManager::findAvailableDevices() {
    VLOG(MANAGER) << "findAvailableDevices";

    // register driver devices
    const auto names = hardware::getAllHalInstanceNames(V1_0::IDevice::descriptor);
    for (const auto& name : names) {
        VLOG(MANAGER) << "Found interface " << name;
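        // The factory's `blocking` flag selects between getService(), which waits for the
        // service to start, and tryGetService(), which returns immediately (possibly nullptr)
        // if the service is not yet registered.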
        const DeviceFactory makeDevice = [name](bool blocking) {
            return blocking ? V1_0::IDevice::getService(name) : V1_0::IDevice::tryGetService(name);
        };
        registerDevice(name, makeDevice);
    }

    // register CPU fallback device
    mDevices.push_back(CpuDevice::get());
    mDevicesCpuOnly.push_back(CpuDevice::get());
}

void DeviceManager::registerDevice(const std::string& name, const DeviceFactory& makeDevice) {
    if (auto device = DriverDevice::create(name, makeDevice)) {
        mDevices.push_back(std::move(device));
    }
}

DeviceManager::DeviceManager() {
    VLOG(MANAGER) << "DeviceManager::DeviceManager";
    findAvailableDevices();
#ifdef NN_DEBUGGABLE
    mStrictSlicing = (getProp("debug.nn.strict-slicing") != 0);
    mPartitioning = getProp("debug.nn.partition", kPartitioningDefault);
    mDebugNNCpuOnly = (getProp("debug.nn.cpuonly") != 0);
    mSyncExecCpu = (getProp("debug.nn.syncexec-cpu", 1) != 0);
    if (!mSyncExecHalSetter) {
        mSyncExecHal = (getProp("debug.nn.syncexec-hal", 1) != 0);
    }
    mSyncExecRuntime = (getProp("debug.nn.syncexec-runtime") != 0);
#endif  // NN_DEBUGGABLE
}

}  // namespace nn
}  // namespace android