/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define LOG_TAG "Manager"

#include "Manager.h"

#include <android/hidl/manager/1.2/IServiceManager.h>
#include <build/version.h>
#include <cutils/native_handle.h>
#include <hidl/HidlTransportSupport.h>
#include <hidl/ServiceManagement.h>

#include <algorithm>
#include <functional>
#include <memory>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

#include "Callbacks.h"
#include "CpuExecutor.h"
#include "ExecutionBurstController.h"
#include "HalInterfaces.h"
#include "Memory.h"
#include "MetaModel.h"
#include "ModelArgumentInfo.h"
#include "Tracing.h"
#include "TypeManager.h"
#include "Utils.h"
#include "VersionedInterfaces.h"

namespace android {
namespace nn {

using namespace hal;

const Timing kNoTiming = {.timeOnDevice = UINT64_MAX, .timeInDriver = UINT64_MAX};

// A Device with actual underlying driver
class DriverDevice : public Device {
   public:
    // Create a DriverDevice from a name and a DeviceFactory function.
    // Returns nullptr on failure.
    static std::shared_ptr<DriverDevice> create(const std::string& name,
                                                const DeviceFactory& makeDevice);

    // Prefer using DriverDevice::create
    DriverDevice(std::shared_ptr<VersionedIDevice> device);

    const std::string& getName() const override { return kInterface->getName(); }
    const std::string& getVersionString() const override { return kInterface->getVersionString(); }
    int64_t getFeatureLevel() const override { return kInterface->getFeatureLevel(); }
    int32_t getType() const override { return kInterface->getType(); }
    const std::vector<Extension>& getSupportedExtensions() const override {
        return kInterface->getSupportedExtensions();
    }
    std::vector<bool> getSupportedOperations(const MetaModel& metaModel) const override;
    PerformanceInfo getPerformance(OperandType type) const override {
        const auto& capabilities = kInterface->getCapabilities();
        return lookup(capabilities.operandPerformance, type);
    }
    PerformanceInfo getRelaxedFloat32toFloat16PerformanceScalar() const override {
        const auto& capabilities = kInterface->getCapabilities();
        return capabilities.relaxedFloat32toFloat16PerformanceScalar;
    }
    PerformanceInfo getRelaxedFloat32toFloat16PerformanceTensor() const override {
        const auto& capabilities = kInterface->getCapabilities();
        return capabilities.relaxedFloat32toFloat16PerformanceTensor;
    }
    PerformanceInfo getIfPerformance() const override {
        const auto& capabilities = kInterface->getCapabilities();
        return capabilities.ifPerformance;
    }
    PerformanceInfo getWhilePerformance() const override {
        const auto& capabilities = kInterface->getCapabilities();
        return capabilities.whilePerformance;
    }
    bool isCachingSupported() const override {
        // Caching is supported if either numModelCache or numDataCache is greater than 0.
        const auto [numModelCacheFiles, numDataCacheFiles] =
                kInterface->getNumberOfCacheFilesNeeded();
        return numModelCacheFiles > 0 || numDataCacheFiles > 0;
    }
    int wait() const override { return kInterface->wait(); }

    std::pair<int, std::shared_ptr<PreparedModel>> prepareModel(
            const ModelFactory& makeModel, ExecutionPreference preference, Priority priority,
            const std::optional<Deadline>& deadline, const std::string& cacheDir,
            const std::optional<CacheToken>& maybeToken) const override;

    std::pair<int, std::unique_ptr<Memory>> allocate(const MemoryDescriptor& desc,
                                                     hal::OperandType) const override;

   private:
    const std::shared_ptr<VersionedIDevice> kInterface;

#ifdef NN_DEBUGGABLE
    // For debugging: behavior of IDevice::getSupportedOperations for SampleDriver.
    // 0 - all operations reported by IDevice::getSupportedOperations() supported
    // 1 - some operations reported by IDevice::getSupportedOperations() supported
    uint32_t mSupported = 0;
#endif  // NN_DEBUGGABLE
};

// A PreparedModel with an underlying IPreparedModel instance returned by the actual driver.
class DriverPreparedModel : public PreparedModel {
   public:
    DriverPreparedModel(const Device* device,
                        const std::shared_ptr<VersionedIPreparedModel>& preparedModel)
        : mDevice(device), mPreparedModel(preparedModel) {
        CHECK(mDevice != nullptr);
        CHECK(mPreparedModel != nullptr);
    }

    const Device* getDevice() const override { return mDevice; }
    std::shared_ptr<VersionedIPreparedModel> getInterface() const override {
        return mPreparedModel;
    }
    std::tuple<int, std::vector<OutputShape>, Timing> execute(
            const std::vector<ModelArgumentInfo>& inputs,
            const std::vector<ModelArgumentInfo>& outputs,
            const std::vector<const Memory*>& memories,
            const std::shared_ptr<ExecutionBurstController>& burstController, MeasureTiming measure,
            const std::optional<Deadline>& deadline,
            const OptionalTimeoutDuration& loopTimeoutDuration) const override;

    std::tuple<int, int, sp<hal::IFencedExecutionCallback>, hal::Timing> executeFenced(
            const std::vector<ModelArgumentInfo>& inputs,
            const std::vector<ModelArgumentInfo>& outputs,
            const std::vector<const Memory*>& memories, const std::vector<int>& waitFor,
            MeasureTiming measure, const std::optional<Deadline>& deadline,
            const OptionalTimeoutDuration& loopTimeoutDuration,
            const hal::OptionalTimeoutDuration& timeoutDurationAfterFence) const override;

    std::shared_ptr<ExecutionBurstController> configureExecutionBurst(
            bool preferPowerOverLatency) const override {
        return mPreparedModel->configureExecutionBurst(preferPowerOverLatency);
    }

   private:
    const Device* mDevice;
    const std::shared_ptr<VersionedIPreparedModel> mPreparedModel;
};

DriverDevice::DriverDevice(std::shared_ptr<VersionedIDevice> device)
    : kInterface(std::move(device)) {
    CHECK(kInterface != nullptr);
#ifdef NN_DEBUGGABLE
    static const char samplePrefix[] = "sample";
    if (getName().substr(0, sizeof(samplePrefix) - 1) == samplePrefix) {
        mSupported = getProp("debug.nn.sample.supported");
    }
#endif  // NN_DEBUGGABLE
}

std::shared_ptr<DriverDevice> DriverDevice::create(const std::string& name,
                                                   const DeviceFactory& makeDevice) {
    CHECK(makeDevice != nullptr);
    std::shared_ptr<VersionedIDevice> device = VersionedIDevice::create(name, makeDevice);
    if (device == nullptr) {
        LOG(ERROR) << "DriverDevice::create failed to create VersionedIDevice object for service "
                   << name;
        return nullptr;
    }

    return std::make_shared<DriverDevice>(std::move(device));
}

std::vector<bool> DriverDevice::getSupportedOperations(const MetaModel& metaModel) const {
    // Query the driver for what it can do.
    ErrorStatus status = ErrorStatus::GENERAL_FAILURE;
    std::vector<bool> supportedOperations;
    std::tie(status, supportedOperations) = kInterface->getSupportedOperations(metaModel);

    const Model& hidlModel = metaModel.getModel();
    const uint32_t operationCount = hidlModel.main.operations.size();
    if (status != ErrorStatus::NONE) {
        LOG(ERROR) << "IDevice::getSupportedOperations returned the error " << toString(status);
        // Set the supported operation vectors to all false, so we won't use this driver.
        return std::vector<bool>(operationCount, false);
    }
    if (supportedOperations.size() != operationCount) {
        LOG(ERROR) << "IDevice::getSupportedOperations returned a vector of length "
                   << supportedOperations.size() << " when expecting " << operationCount;
        // Set the supported operation vectors to all false, so we won't use this driver.
        return std::vector<bool>(operationCount, false);
    }

#ifdef NN_DEBUGGABLE
    if (mSupported != 1) {
        return supportedOperations;
    }

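    // Note: when debug.nn.sample.supported is 1, the loop below pseudo-randomly clears roughly
    // half of the operations the driver reported as supported. Each decision is derived from a
    // hash of the device name mixed with the operation type and the types, dimensions, and
    // lifetimes of its operands, so the result is deterministic for a given driver and model;
    // this helps exercise multi-device partitioning paths while debugging.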
    const uint32_t baseAccumulator = std::hash<std::string>{}(getName());
    for (size_t operationIndex = 0; operationIndex < supportedOperations.size(); operationIndex++) {
        if (!supportedOperations[operationIndex]) {
            continue;
        }

        uint32_t accumulator = baseAccumulator;
        const Operation& operation = hidlModel.main.operations[operationIndex];
        accumulator ^= static_cast<uint32_t>(operation.type);
        auto accumulateOperands = [&hidlModel, &accumulator](const hidl_vec<uint32_t>& operands) {
            for (uint32_t operandIndex : operands) {
                const Operand& operand = hidlModel.main.operands[operandIndex];
                accumulator ^= static_cast<uint32_t>(operand.type);
                accumulator ^= operand.dimensions.size();
                for (uint32_t dimension : operand.dimensions) {
                    accumulator ^= dimension;
                    if (operand.lifetime == OperandLifeTime::CONSTANT_COPY ||
                        operand.lifetime == OperandLifeTime::CONSTANT_REFERENCE) {
                        accumulator ^= 1;
                    }
                }
            }
        };
        accumulateOperands(operation.inputs);
        accumulateOperands(operation.outputs);
        if (accumulator & 1) {
            supportedOperations[operationIndex] = false;
        }
    }
#endif  // NN_DEBUGGABLE

    return supportedOperations;
}

std::pair<int, std::shared_ptr<PreparedModel>> DriverDevice::prepareModel(
        const ModelFactory& makeModel, ExecutionPreference preference, Priority priority,
        const std::optional<Deadline>& deadline, const std::string& cacheDir,
        const std::optional<CacheToken>& maybeToken) const {
    const auto [n, preparedModel] = kInterface->prepareModel(makeModel, preference, priority,
                                                             deadline, cacheDir, maybeToken);
    if (n != ANEURALNETWORKS_NO_ERROR) {
        return {n, nullptr};
    }
    CHECK(preparedModel != nullptr) << "prepareModel returned nullptr without error code";
    return {ANEURALNETWORKS_NO_ERROR, std::make_shared<DriverPreparedModel>(this, preparedModel)};
}

std::pair<int, std::unique_ptr<Memory>> DriverDevice::allocate(const MemoryDescriptor& desc,
                                                               hal::OperandType) const {
    const BufferDesc hidlDesc = {.dimensions = desc.dimensions};
    std::vector<std::shared_ptr<VersionedIPreparedModel>> preparedModels(
            desc.preparedModels.size());
    std::transform(desc.preparedModels.begin(), desc.preparedModels.end(), preparedModels.begin(),
                   [](const auto* preparedModel) {
                       const auto versionedPreparedModel = preparedModel->getInterface();
                       CHECK(versionedPreparedModel != nullptr);
                       return versionedPreparedModel;
                   });
    auto [status, buffer, token] =
            kInterface->allocate(hidlDesc, preparedModels, desc.inputRoles, desc.outputRoles);
    if (status != ErrorStatus::NONE) {
        LOG(ERROR) << "DriverDevice::allocate -- memory allocation on device " << getName()
                   << " failed!";
        return {convertErrorStatusToResultCode(status), nullptr};
    }
    return MemoryFromDevice::create(std::move(buffer), token);
}

// Figures out how to place each of the inputs or outputs in a buffer. This only does the layout
// and memory allocation; it does not copy data. Each argument is aligned within the pool as
// needed.
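// For illustration (hypothetical lengths, and assuming alignBytesNeeded() pads to a 4-byte
// boundary here): a 6-byte pointer argument followed by a 10-byte one would land at offsets 0 and
// 8 of a single new pool, for a total pool size of 18 bytes. Arguments passed by memory rather
// than by pointer are not touched and keep their original pool indexes.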
static std::tuple<int, std::unique_ptr<MemoryAshmem>, std::vector<DataLocation>>
allocatePointerArgumentsToPool(const std::vector<ModelArgumentInfo>& args,
                               std::vector<const Memory*>* memories) {
    CHECK(memories != nullptr);
    std::vector<DataLocation> ptrArgsLocations;
    const uint32_t nextPoolIndex = memories->size();
    int64_t total = 0;
    for (const auto& info : args) {
        if (info.state() == ModelArgumentInfo::POINTER) {
            // TODO Good enough alignment?
            total += alignBytesNeeded(static_cast<uint32_t>(total), info.length());
            ptrArgsLocations.push_back({.poolIndex = nextPoolIndex,
                                        .offset = static_cast<uint32_t>(total),
                                        .length = info.length()});
            total += info.length();
        }
    }
    if (total > 0xFFFFFFFF) {
        LOG(ERROR) << "allocatePointerArgumentsToPool: ANeuralNetworksExecution: Size of all "
                      "inputs or outputs exceeds 2^32.";
        return {ANEURALNETWORKS_BAD_DATA, nullptr, std::vector<DataLocation>{}};
    }
    if (total <= 0) {
        return {ANEURALNETWORKS_NO_ERROR, nullptr, std::vector<DataLocation>{}};
    }
    auto [n, memory] = MemoryAshmem::create(total);
    if (n != ANEURALNETWORKS_NO_ERROR) {
        return {n, nullptr, std::vector<DataLocation>{}};
    }
    memories->push_back(memory.get());
    return {ANEURALNETWORKS_NO_ERROR, std::move(memory), std::move(ptrArgsLocations)};
}

// Perform computation on an actual HIDL driver.
//
// Because HIDL cannot take raw pointers, two separate memory pools will be allocated for inputs and
// outputs specified by pointers. The input pointer data will be copied to the input pool prior to
// execution, and the output pointer data will be copied out from the output pool after the
// execution.
//
// The HIDL invocation will choose between sync/async execution according to
// DeviceManager::mSyncExecHal.
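//
// In outline: pointer-based inputs/outputs are laid out into two ashmem pools, pointer input data
// is copied in, a Request referencing every pool is built, the request runs through the burst
// controller when one is supplied (falling back to the prepared model's execute path on failure or
// when no burst is given), and pointer output data is finally copied back out.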
std::tuple<int, std::vector<OutputShape>, Timing> DriverPreparedModel::execute(
        const std::vector<ModelArgumentInfo>& inputs, const std::vector<ModelArgumentInfo>& outputs,
        const std::vector<const Memory*>& memories,
        const std::shared_ptr<ExecutionBurstController>& burstController, MeasureTiming measure,
        const std::optional<Deadline>& deadline,
        const OptionalTimeoutDuration& loopTimeoutDuration) const {
    NNTRACE_RT(NNTRACE_PHASE_INPUTS_AND_OUTPUTS, "DriverPreparedModel::execute");

    // Make a copy of the memory tracker as we will append memory pools for pointer arguments.
    std::vector<const Memory*> localMemories = memories;

    // We separate the input & output pools so accelerators only need to copy
    // the contents of the input pools. We could also use it to set protection
    // on read only memory but that's not currently done.

    // Layout the input and output data
    const auto [n1, inputPtrArgsMemory, inputPtrArgsLocations] =
            allocatePointerArgumentsToPool(inputs, &localMemories);
    if (n1 != ANEURALNETWORKS_NO_ERROR) {
        return {n1, {}, kNoTiming};
    }
    const auto [n2, outputPtrArgsMemory, outputPtrArgsLocations] =
            allocatePointerArgumentsToPool(outputs, &localMemories);
    if (n2 != ANEURALNETWORKS_NO_ERROR) {
        return {n2, {}, kNoTiming};
    }

    // Copy the input data that was specified via a pointer.
    if (inputPtrArgsMemory != nullptr) {
        uint32_t ptrInputIndex = 0;
        for (const auto& info : inputs) {
            if (info.state() == ModelArgumentInfo::POINTER) {
                const DataLocation& loc = inputPtrArgsLocations[ptrInputIndex++];
                uint8_t* const data = inputPtrArgsMemory->getPointer();
                memcpy(data + loc.offset, info.buffer(), loc.length);
            }
        }
    }

    Request request;
    request.inputs = createRequestArguments(inputs, inputPtrArgsLocations);
    request.outputs = createRequestArguments(outputs, outputPtrArgsLocations);
    uint32_t count = localMemories.size();
    request.pools.resize(count);
    for (uint32_t i = 0; i < count; i++) {
        request.pools[i] = localMemories[i]->getMemoryPool();
    }

    NNTRACE_FULL_SWITCH(NNTRACE_LAYER_IPC, NNTRACE_PHASE_EXECUTION,
                        "DriverPreparedModel::execute::execute");

    int n = ANEURALNETWORKS_OP_FAILED;
    std::vector<OutputShape> outputShapes;
    Timing timing = kNoTiming;

    // compute using burst if present
    const bool burstCompute = (burstController != nullptr);
    bool burstFallback = true;
    if (burstCompute) {
        const bool compliant = compliantWithV1_2(request);
        if (compliant) {
            V1_0::Request request12 = convertToV1_2(request);
            std::vector<intptr_t> memoryIds;
            memoryIds.reserve(localMemories.size());
            for (const Memory* memory : localMemories) {
                memory->usedBy(burstController);
                memoryIds.push_back(memory->getKey());
            }

            VLOG(EXECUTION) << "Before ExecutionBurstController->compute() "
                            << SHOW_IF_DEBUG(toString(request12));
            std::tie(n, outputShapes, timing, burstFallback) =
                    burstController->compute(request12, measure, memoryIds);
        }
    }

    // compute from IPreparedModel if either:
    // (1) burst was not supplied, or
    // (2) the burst execution failed and requested a fallback execution
    if (!burstCompute || burstFallback) {
        const bool preferSynchronous = DeviceManager::get()->syncExecHal();
        std::tie(n, outputShapes, timing) = mPreparedModel->execute(
                request, measure, deadline, loopTimeoutDuration, preferSynchronous);
    }

    if (n != ANEURALNETWORKS_NO_ERROR) {
        VLOG(EXECUTION) << "**Execution failed**";
        return {n, std::move(outputShapes), timing};
    }

    // Copy the output data from shared memory to the output buffers.
    NNTRACE_RT_SWITCH(NNTRACE_PHASE_RESULTS, "DriverPreparedModel::execute");
    if (outputPtrArgsMemory != nullptr) {
        uint32_t ptrOutputIndex = 0;
        for (const auto& info : outputs) {
            if (info.state() == ModelArgumentInfo::POINTER) {
                const DataLocation& loc = outputPtrArgsLocations[ptrOutputIndex++];
                const uint8_t* const data = outputPtrArgsMemory->getPointer();
                memcpy(info.buffer(), data + loc.offset, loc.length);
            }
        }
    }

    VLOG(EXECUTION) << "DriverPreparedModel::execute completed";
    return {ANEURALNETWORKS_NO_ERROR, std::move(outputShapes), timing};
}

std::tuple<int, int, sp<hal::IFencedExecutionCallback>, hal::Timing>
DriverPreparedModel::executeFenced(
        const std::vector<ModelArgumentInfo>& inputs, const std::vector<ModelArgumentInfo>& outputs,
        const std::vector<const Memory*>& memories, const std::vector<int>& waitFor,
        hal::MeasureTiming measure, const std::optional<Deadline>& deadline,
        const OptionalTimeoutDuration& loopTimeoutDuration,
        const hal::OptionalTimeoutDuration& timeoutDurationAfterFence) const {
    NNTRACE_RT(NNTRACE_PHASE_INPUTS_AND_OUTPUTS, "DriverPreparedModel::executeFenced");
    CHECK(std::all_of(waitFor.begin(), waitFor.end(), [](int fd) { return fd > 0; }));
    // Make a copy of the memory tracker as we will append memory pools for pointer arguments.
    std::vector<const Memory*> localMemories = memories;
    sp<hal::IFencedExecutionCallback> executeFencedCallback;
    hal::Timing timing = kNoTiming;

    // We separate the input & output pools so accelerators only need to copy
    // the contents of the input pools. We could also use it to set protection
    // on read only memory but that's not currently done.

    // Layout the input and output data
    const auto [n1, inputPtrArgsMemory, inputPtrArgsLocations] =
            allocatePointerArgumentsToPool(inputs, &localMemories);
    if (n1 != ANEURALNETWORKS_NO_ERROR) {
        return {n1, -1, nullptr, timing};
    }
    const auto [n2, outputPtrArgsMemory, outputPtrArgsLocations] =
            allocatePointerArgumentsToPool(outputs, &localMemories);
    if (n2 != ANEURALNETWORKS_NO_ERROR) {
        return {n2, -1, nullptr, timing};
    }

    // Copy the input data that was specified via a pointer.
    if (inputPtrArgsMemory != nullptr) {
        uint32_t ptrInputIndex = 0;
        for (const auto& info : inputs) {
            if (info.state() == ModelArgumentInfo::POINTER) {
                const DataLocation& loc = inputPtrArgsLocations[ptrInputIndex++];
                uint8_t* const data = inputPtrArgsMemory->getPointer();
                memcpy(data + loc.offset, info.buffer(), loc.length);
            }
        }
    }

    Request request;
    request.inputs = createRequestArguments(inputs, inputPtrArgsLocations);
    request.outputs = createRequestArguments(outputs, outputPtrArgsLocations);
    uint32_t count = localMemories.size();
    request.pools.resize(count);
    for (uint32_t i = 0; i < count; i++) {
        request.pools[i] = localMemories[i]->getMemoryPool();
    }

    NNTRACE_FULL_SWITCH(NNTRACE_LAYER_IPC, NNTRACE_PHASE_EXECUTION,
                        "DriverPreparedModel::executeFenced");

    int n = ANEURALNETWORKS_OP_FAILED;
    hidl_vec<hidl_handle> waitForHandles;
    waitForHandles.resize(waitFor.size());
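    // Each fence to wait on is duplicated and wrapped in a one-fd native_handle owned by its
    // hidl_handle, so the caller retains ownership of the original file descriptors.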
    for (uint32_t i = 0; i < waitFor.size(); i++) {
        native_handle_t* nativeHandle = native_handle_create(1, 0);
        if (nativeHandle == nullptr) {
            LOG(ERROR) << "Failed to create native_handle";
            return {n, -1, nullptr, timing};
        }
        int dupFd = dup(waitFor[i]);
        if (dupFd <= 0) {
            LOG(ERROR) << "Unable to dup the file descriptor";
            return {n, -1, nullptr, timing};
        }
        nativeHandle->data[0] = dupFd;
        hidl_handle hidlHandle;
        hidlHandle.setTo(nativeHandle, /*shouldOwn=*/true);
        waitForHandles[i] = std::move(hidlHandle);
    }

    hidl_handle syncFence;
    std::tie(n, syncFence, executeFencedCallback, timing) =
            mPreparedModel->executeFenced(request, waitForHandles, measure, deadline,
                                          loopTimeoutDuration, timeoutDurationAfterFence);

    if (n != ANEURALNETWORKS_NO_ERROR) {
        VLOG(EXECUTION) << "**executeFenced failed**";
        return {n, -1, nullptr, timing};
    }

    int syncFenceFd = -1;
    if (syncFence.getNativeHandle()) {
        syncFenceFd = dup(syncFence.getNativeHandle()->data[0]);
        if (syncFenceFd < 0) {
            LOG(ERROR) << "Failed to dup the file descriptor";
            return {ANEURALNETWORKS_OP_FAILED, -1, nullptr, timing};
        }
    }
    // If output buffer is provided as a malloc pointer, wait for the execution to finish.
    // Then copy the output data from shared memory to the output buffers.
    if (outputPtrArgsMemory != nullptr) {
        NNTRACE_RT_SWITCH(NNTRACE_PHASE_RESULTS, "DriverPreparedModel::executeFenced");
        if (syncFenceFd > 0) {
            auto r = syncWait(syncFenceFd, -1);
            if (r != FenceState::SIGNALED) {
                LOG(ERROR) << "syncWait failed, fd: " << syncFenceFd;
                return {ANEURALNETWORKS_OP_FAILED, syncFenceFd, nullptr, timing};
            }
        }
        uint32_t ptrOutputIndex = 0;
        for (const auto& info : outputs) {
            if (info.state() == ModelArgumentInfo::POINTER) {
                const DataLocation& loc = outputPtrArgsLocations[ptrOutputIndex++];
                const uint8_t* const data = outputPtrArgsMemory->getPointer();
                memcpy(info.buffer(), data + loc.offset, loc.length);
            }
        }
    }

    VLOG(EXECUTION) << "DriverPreparedModel::executeFenced completed";
    return {ANEURALNETWORKS_NO_ERROR, syncFenceFd, executeFencedCallback, timing};
}

// A special abstracted device for the CPU. Only one instance of this class will exist.
// Use get() to retrieve it.
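// (For reference, DeviceManager::getCpuDevice() below hands out this singleton as the
// always-available fallback device.)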
class CpuDevice : public Device {
   public:
    // Returns the singleton CPU fallback device.
    static std::shared_ptr<CpuDevice> get() {
        static std::shared_ptr<CpuDevice> instance(new CpuDevice);
        return instance;
    }

    const std::string& getName() const override { return kName; }
    const std::string& getVersionString() const override { return kVersionString; }
    int64_t getFeatureLevel() const override { return kFeatureLevel; }
    int32_t getType() const override { return ANEURALNETWORKS_DEVICE_CPU; }
    const std::vector<Extension>& getSupportedExtensions() const override {
        return kSupportedExtensions;
    }
    std::vector<bool> getSupportedOperations(const MetaModel& metaModel) const override;
    PerformanceInfo getPerformance(OperandType) const override { return kPerformance; }
    PerformanceInfo getRelaxedFloat32toFloat16PerformanceScalar() const override {
        return kPerformance;
    }
    PerformanceInfo getRelaxedFloat32toFloat16PerformanceTensor() const override {
        return kPerformance;
    }
    PerformanceInfo getIfPerformance() const override { return kPerformance; }
    PerformanceInfo getWhilePerformance() const override { return kPerformance; }
    bool isCachingSupported() const override { return false; }
    int wait() const override { return ANEURALNETWORKS_NO_ERROR; }

    std::pair<int, std::shared_ptr<PreparedModel>> prepareModel(
            const ModelFactory& makeModel, ExecutionPreference preference, Priority priority,
            const std::optional<Deadline>& deadline, const std::string& cacheDir,
            const std::optional<CacheToken>& maybeToken) const override;

    std::pair<int, std::unique_ptr<Memory>> allocate(const MemoryDescriptor& desc,
                                                     OperandType type) const override;

   private:
    CpuDevice() = default;
    const int64_t kFeatureLevel = __ANDROID_API__;
    const std::string kName = "nnapi-reference";
    const std::string kVersionString = build::GetBuildNumber();
    // Since the performance is a ratio compared to the CPU performance,
    // by definition the performance of the CPU is 1.0.
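    // (Illustrative example: a driver reporting execTime = 0.5 claims to take half the time this
    // CPU path would take for the same workload; lower values are better.)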
    const PerformanceInfo kPerformance = {.execTime = 1.0f, .powerUsage = 1.0f};
    const std::vector<Extension> kSupportedExtensions{/* No extensions. */};
};

// A special abstracted PreparedModel for the CPU, constructed by CpuDevice.
class CpuPreparedModel : public PreparedModel {
   public:
    // Factory method for CpuPreparedModel. Returns ANEURALNETWORKS_NO_ERROR and
    // a prepared model object if successfully created. Returns an error code
    // and nullptr otherwise.
    static std::pair<int, std::shared_ptr<PreparedModel>> create(Model hidlModel);

    const Device* getDevice() const override { return CpuDevice::get().get(); }
    std::shared_ptr<VersionedIPreparedModel> getInterface() const override { return nullptr; }

    std::tuple<int, std::vector<OutputShape>, Timing> execute(
            const std::vector<ModelArgumentInfo>& inputs,
            const std::vector<ModelArgumentInfo>& outputs,
            const std::vector<const Memory*>& memories,
            const std::shared_ptr<ExecutionBurstController>& burstController, MeasureTiming measure,
            const std::optional<Deadline>& deadline,
            const OptionalTimeoutDuration& loopTimeoutDuration) const override;

    std::shared_ptr<ExecutionBurstController> configureExecutionBurst(
            bool /*preferPowerOverLatency*/) const override {
        return nullptr;
    }

    std::tuple<int, int, sp<hal::IFencedExecutionCallback>, hal::Timing> executeFenced(
            const std::vector<ModelArgumentInfo>& inputs,
            const std::vector<ModelArgumentInfo>& outputs,
            const std::vector<const Memory*>& memories, const std::vector<int>& wait_for,
            MeasureTiming measure, const std::optional<Deadline>& deadline,
            const OptionalTimeoutDuration& loopTimeoutDuration,
            const hal::OptionalTimeoutDuration& timeoutDurationAfterFence) const override;

    // Prefer to use CpuPreparedModel::create.
    CpuPreparedModel(Model model, std::vector<RunTimePoolInfo> poolInfos)
        : mModel(std::move(model)), mModelPoolInfos(std::move(poolInfos)) {}

   private:
    const Model mModel;
    const std::vector<RunTimePoolInfo> mModelPoolInfos;
};

std::vector<bool> CpuDevice::getSupportedOperations(const MetaModel& metaModel) const {
    const Model& hidlModel = metaModel.getModel();
    const size_t count = hidlModel.main.operations.size();
    std::vector<bool> result(count, false);
    for (size_t i = 0; i < count; i++) {
        // TODO(b/119870033): Decide whether and how post-P operations would be supported on CPU.
        //                    We may want to use the slicer for CpuDevice just as we do for
        //                    DriverDevice.
        OperationType operationType = hidlModel.main.operations[i].type;
        result[i] = !isExtensionOperationType(operationType) &&
                    operationType != OperationType::OEM_OPERATION;
    }
    return result;
}

std::pair<int, std::shared_ptr<PreparedModel>> CpuDevice::prepareModel(
        const ModelFactory& makeModel, ExecutionPreference preference, Priority priority,
        const std::optional<Deadline>& deadline, const std::string& /*cacheDir*/,
        const std::optional<CacheToken>& maybeToken) const {
    CHECK(!maybeToken.has_value())
            << "Should never call prepareModel with cache information on CpuDevice";

    const Model model = makeModel();
    if (!validateModel(model, ValidationMode::RUNTIME) ||
        !validateExecutionPreference(preference) || !validatePriority(priority)) {
        return {ANEURALNETWORKS_OP_FAILED, nullptr};
    }
    if (hasDeadlinePassed(deadline)) {
        return {ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT, nullptr};
    }

    return CpuPreparedModel::create(model);
}

std::pair<int, std::unique_ptr<Memory>> CpuDevice::allocate(const MemoryDescriptor& desc,
                                                            OperandType type) const {
    uint32_t size = TypeManager::get()->getSizeOfData(type, desc.dimensions);
    if (size == 0) {
        LOG(ERROR) << "CpuDevice::allocate -- does not support unknown dimensions.";
        return {ANEURALNETWORKS_OP_FAILED, nullptr};
    }
    return MemoryAshmem::create(size);
}

std::pair<int, std::shared_ptr<PreparedModel>> CpuPreparedModel::create(Model hidlModel) {
    std::vector<RunTimePoolInfo> poolInfos;
    if (!setRunTimePoolInfosFromHidlMemories(&poolInfos, hidlModel.pools)) {
        return {ANEURALNETWORKS_UNMAPPABLE, nullptr};
    }

    std::shared_ptr<PreparedModel> preparedModel =
            std::make_shared<CpuPreparedModel>(std::move(hidlModel), std::move(poolInfos));
    return {ANEURALNETWORKS_NO_ERROR, std::move(preparedModel)};
}

static std::tuple<int, std::vector<OutputShape>, Timing> computeOnCpu(
        const Model& model, const Request& request,
        const std::vector<RunTimePoolInfo>& modelPoolInfos,
        const std::vector<RunTimePoolInfo>& requestPoolInfos,
        const std::optional<Deadline>& deadline,
        const OptionalTimeoutDuration& loopTimeoutDuration) {
    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "computeOnCpu");
    CpuExecutor executor;
    if (loopTimeoutDuration.getDiscriminator() !=
        OptionalTimeoutDuration::hidl_discriminator::none) {
        executor.setLoopTimeout(loopTimeoutDuration.nanoseconds());
    }
    if (deadline.has_value()) {
        executor.setDeadline(*deadline);
    }
    int err = executor.run(model, request, modelPoolInfos, requestPoolInfos);
    const auto& outputShapes = executor.getOutputShapes();
    return {err, outputShapes, kNoTiming};
}

std::tuple<int, int, sp<hal::IFencedExecutionCallback>, hal::Timing>
CpuPreparedModel::executeFenced(const std::vector<ModelArgumentInfo>& inputs,
                                const std::vector<ModelArgumentInfo>& outputs,
                                const std::vector<const Memory*>& memories,
                                const std::vector<int>& waitFor, hal::MeasureTiming measure,
                                const std::optional<Deadline>& deadline,
                                const OptionalTimeoutDuration& loopTimeoutDuration,
                                const hal::OptionalTimeoutDuration& duration) const {
    VLOG(EXECUTION)
            << "CpuPreparedModel::executeFenced wait for sync fences to signal before execution";
    for (int syncFd : waitFor) {
        if (syncFd > 0) {
            auto r = syncWait(syncFd, -1);
            if (r != FenceState::SIGNALED) {
                LOG(ERROR) << "sync wait failed, fd: " << syncFd;
                return {ANEURALNETWORKS_OP_FAILED, -1, nullptr, {UINT64_MAX, UINT64_MAX}};
            }
        }
    }

    // Update deadline if the timeout duration is closer than the deadline.
    auto closestDeadline = deadline;
    if (duration.getDiscriminator() != OptionalTimeoutDuration::hidl_discriminator::none) {
        const auto timeoutDurationDeadline = makeDeadline(duration.nanoseconds());
        if (!closestDeadline.has_value() || *closestDeadline > timeoutDurationDeadline) {
            closestDeadline = timeoutDurationDeadline;
        }
    }

    const auto [result, outputShapes, timing] = execute(inputs, outputs, memories, nullptr, measure,
                                                        closestDeadline, loopTimeoutDuration);
    return {result, -1, nullptr, timing};
}

// Perform computation on the NNAPI CPU reference implementation.
//
// Unlike DriverPreparedModel::execute, the NNAPI CPU reference executor lives in the same process
// as the NNAPI runtime and can take raw pointers. This method creates as many pools as there are
// inputs/outputs to avoid data copying.
//
// Will choose between sync/async execution according to DeviceManager::mSyncExecCpu.
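//
// Each pointer argument is wrapped directly as a RunTimePoolInfo over the caller's buffer (its own
// pool at offset 0), so the CPU executor reads and writes the caller's memory in place.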
std::tuple<int, std::vector<OutputShape>, Timing> CpuPreparedModel::execute(
        const std::vector<ModelArgumentInfo>& inputs, const std::vector<ModelArgumentInfo>& outputs,
        const std::vector<const Memory*>& memories,
        const std::shared_ptr<ExecutionBurstController>& /*burstController*/,
        MeasureTiming /*measure*/, const std::optional<Deadline>& deadline,
        const OptionalTimeoutDuration& loopTimeoutDuration) const {
    if (hasDeadlinePassed(deadline)) {
        return {ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT, {}, kNoTiming};
    }

    std::vector<RunTimePoolInfo> requestPoolInfos;
    requestPoolInfos.reserve(memories.size());
    for (const Memory* mem : memories) {
        if (std::optional<RunTimePoolInfo> poolInfo = mem->getRunTimePoolInfo()) {
            requestPoolInfos.emplace_back(*poolInfo);
        } else {
            return {ANEURALNETWORKS_UNMAPPABLE, {}, kNoTiming};
        }
    }
    // Create as many pools as there are input / output.
    auto fixPointerArguments =
            [&requestPoolInfos](const std::vector<ModelArgumentInfo>& argumentInfos) {
                std::vector<DataLocation> ptrArgsLocations;
                for (const ModelArgumentInfo& argumentInfo : argumentInfos) {
                    if (argumentInfo.state() == ModelArgumentInfo::POINTER) {
                        ptrArgsLocations.push_back(
                                {.poolIndex = static_cast<uint32_t>(requestPoolInfos.size()),
                                 .offset = 0,
                                 .length = argumentInfo.length()});
                        requestPoolInfos.emplace_back(RunTimePoolInfo::createFromExistingBuffer(
                                static_cast<uint8_t*>(argumentInfo.buffer())));
                    }
                }
                return ptrArgsLocations;
            };
    const std::vector<DataLocation> inputPtrArgsLocations = fixPointerArguments(inputs);
    const std::vector<DataLocation> outputPtrArgsLocations = fixPointerArguments(outputs);

    Request request;
    request.inputs = createRequestArguments(inputs, inputPtrArgsLocations);
    request.outputs = createRequestArguments(outputs, outputPtrArgsLocations);

    if (!DeviceManager::get()->syncExecCpu()) {
        // TODO: use a thread pool
        // TODO(mikie): this could have NNTRACE so we could measure the overhead
        //              of spinning up a new thread.
        std::tuple<int, std::vector<OutputShape>, Timing> result = {};
        std::thread([this, &request, &requestPoolInfos, &deadline, &loopTimeoutDuration, &result] {
            result = computeOnCpu(mModel, request, mModelPoolInfos, requestPoolInfos, deadline,
                                  loopTimeoutDuration);
        }).join();
        return result;
    }

    return computeOnCpu(mModel, request, mModelPoolInfos, requestPoolInfos, deadline,
                        loopTimeoutDuration);
}

DeviceManager* DeviceManager::get() {
    static DeviceManager manager;
    return &manager;
}

std::shared_ptr<Device> DeviceManager::getCpuDevice() {
    return CpuDevice::get();
}

std::shared_ptr<Device> DeviceManager::forTest_makeDriverDevice(const std::string& name,
                                                                const sp<V1_0::IDevice>& device) {
    const DeviceFactory makeDevice = [device](bool /*blocking*/) { return device; };
    const auto driverDevice = DriverDevice::create(name, makeDevice);
    CHECK(driverDevice != nullptr);
    return driverDevice;
}

void DeviceManager::findAvailableDevices() {
    VLOG(MANAGER) << "findAvailableDevices";

    // register driver devices
    const auto names = hardware::getAllHalInstanceNames(V1_0::IDevice::descriptor);
    for (const auto& name : names) {
        VLOG(MANAGER) << "Found interface " << name;
        const DeviceFactory makeDevice = [name](bool blocking) {
            return blocking ? V1_0::IDevice::getService(name) : V1_0::IDevice::tryGetService(name);
        };
        registerDevice(name, makeDevice);
    }

    // register CPU fallback device
    mDevices.push_back(CpuDevice::get());
    mDevicesCpuOnly.push_back(CpuDevice::get());
}

void DeviceManager::registerDevice(const std::string& name, const DeviceFactory& makeDevice) {
    if (auto device = DriverDevice::create(name, makeDevice)) {
        mDevices.push_back(std::move(device));
    }
}

DeviceManager::DeviceManager() {
    VLOG(MANAGER) << "DeviceManager::DeviceManager";
    findAvailableDevices();
#ifdef NN_DEBUGGABLE
    mStrictSlicing = (getProp("debug.nn.strict-slicing") != 0);
    mPartitioning = getProp("debug.nn.partition", kPartitioningDefault);
    mDebugNNCpuOnly = (getProp("debug.nn.cpuonly") != 0);
    mSyncExecCpu = (getProp("debug.nn.syncexec-cpu", 1) != 0);
    if (!mSyncExecHalSetter) {
        mSyncExecHal = (getProp("debug.nn.syncexec-hal", 1) != 0);
    }
    mSyncExecRuntime = (getProp("debug.nn.syncexec-runtime") != 0);
#endif  // NN_DEBUGGABLE
}

}  // namespace nn
}  // namespace android