1 /*
2  * Copyright (C) 2017 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #define LOG_TAG "CpuExecutor"
18 
19 #include "CpuExecutor.h"
20 
21 #include <android/hardware_buffer.h>
22 #include <android-base/scopeguard.h>
23 
24 #include <sys/mman.h>
25 #include <vndk/hardware_buffer.h>
26 
27 #include <Eigen/Core>
28 #include <memory>
29 #include <utility>
30 #include <vector>
31 
32 // b/109953668, disable OpenMP
33 #ifdef NNAPI_OPENMP
34 #include <omp.h>
35 #endif  // NNAPI_OPENMP
36 
37 #include "ControlFlow.h"
38 #include "NeuralNetworks.h"
39 #include "OperationResolver.h"
40 #include "Operations.h"
41 #include "OperationsUtils.h"
42 #include "Tracing.h"
43 
44 namespace android {
45 namespace nn {
46 
47 namespace {
48 
49 using namespace hal;
50 
51 class OperationExecutionContext : public IOperationExecutionContext {
52     DISALLOW_IMPLICIT_CONSTRUCTORS(OperationExecutionContext);
53 
54    public:
OperationExecutionContext(const Operation * operation,RunTimeOperandInfo * operands)55     OperationExecutionContext(const Operation* operation, RunTimeOperandInfo* operands)
56         : operation(operation), operands(operands) {}
57 
58     uint32_t getNumInputs() const override;
59     OperandType getInputType(uint32_t index) const override;
60     Shape getInputShape(uint32_t index) const override;
61     const void* getInputBuffer(uint32_t index) const override;
62     const OperandExtraParams getInputExtraParams(uint32_t index) const override;
63 
64     uint32_t getNumOutputs() const override;
65     OperandType getOutputType(uint32_t index) const override;
66     Shape getOutputShape(uint32_t index) const override;
67     void* getOutputBuffer(uint32_t index) override;
68 
69     // Return false on failure and store the result code.
70     // Use getResultCode() to retrieve it at the end of the operation execution.
71     bool setOutputShape(uint32_t index, const Shape& shape) override;
72     int getResultCode() const;
73 
74     bool isOmittedInput(uint32_t index) const override;
75     bool isOmittedOutput(uint32_t index) const override;
76 
77     // Return false if any of inputs or outputs is omitted, i.e. has lifetime of NO_VALUE.
78     bool checkNoOmittedOperand() const;
79     // Return false if any of inputs has dimension 0.
80     bool checkNoZeroSizedInput() const;
81 
82    private:
83     const RunTimeOperandInfo* getInputInfo(uint32_t index) const;
84     const RunTimeOperandInfo* getOutputInfo(uint32_t index) const;
85     RunTimeOperandInfo* getOutputInfo(uint32_t index);
86 
87     const Operation* operation;
88     RunTimeOperandInfo* operands;
89 
90     int result = ANEURALNETWORKS_NO_ERROR;
91 };
92 
getInputInfo(uint32_t index) const93 const RunTimeOperandInfo* OperationExecutionContext::getInputInfo(uint32_t index) const {
94     CHECK(index < operation->inputs.size());
95     return &operands[operation->inputs[index]];
96 }
97 
getOutputInfo(uint32_t index) const98 const RunTimeOperandInfo* OperationExecutionContext::getOutputInfo(uint32_t index) const {
99     CHECK(index < operation->outputs.size());
100     return &operands[operation->outputs[index]];
101 }
102 
getOutputInfo(uint32_t index)103 RunTimeOperandInfo* OperationExecutionContext::getOutputInfo(uint32_t index) {
104     CHECK(index < operation->outputs.size());
105     return &operands[operation->outputs[index]];
106 }
107 
getInputType(uint32_t index) const108 OperandType OperationExecutionContext::getInputType(uint32_t index) const {
109     return getInputInfo(index)->type;
110 }
111 
getInputShape(uint32_t index) const112 Shape OperationExecutionContext::getInputShape(uint32_t index) const {
113     return getInputInfo(index)->shape();
114 }
115 
getInputBuffer(uint32_t index) const116 const void* OperationExecutionContext::getInputBuffer(uint32_t index) const {
117     return getInputInfo(index)->buffer;
118 }
119 
getInputExtraParams(uint32_t index) const120 const OperandExtraParams OperationExecutionContext::getInputExtraParams(uint32_t index) const {
121     return getInputInfo(index)->extraParams;
122 }
123 
getOutputType(uint32_t index) const124 OperandType OperationExecutionContext::getOutputType(uint32_t index) const {
125     return getOutputInfo(index)->type;
126 }
127 
getOutputShape(uint32_t index) const128 Shape OperationExecutionContext::getOutputShape(uint32_t index) const {
129     return getOutputInfo(index)->shape();
130 }
131 
getOutputBuffer(uint32_t index)132 void* OperationExecutionContext::getOutputBuffer(uint32_t index) {
133     return getOutputInfo(index)->buffer;
134 }
135 
getNumInputs() const136 uint32_t OperationExecutionContext::getNumInputs() const {
137     return operation->inputs.size();
138 }
139 
getNumOutputs() const140 uint32_t OperationExecutionContext::getNumOutputs() const {
141     return operation->outputs.size();
142 }
143 
getResultCode() const144 int OperationExecutionContext::getResultCode() const {
145     return result;
146 }
147 
148 // TODO: Return error code directly once we've fully integrated OperationResolver with all ops.
149 // Updates the RunTimeOperandInfo with the newly calculated shape.
150 // Allocate the buffer if we need to.
151 //
152 // TODO(b/153081229): This function currently cannot handle extension operands well. We need to
153 //                    propagate the extension type info into this function.
setInfoAndAllocateIfNeeded(RunTimeOperandInfo * info,const Shape & shape,int * result)154 bool setInfoAndAllocateIfNeeded(RunTimeOperandInfo* info, const Shape& shape, int* result) {
155     // For user-provided model output operands, the parameters must match the Shape
156     // calculated from the preparation step.
157     if (info->lifetime == OperandLifeTime::SUBGRAPH_OUTPUT) {
158         if (info->type != shape.type) {
159             LOG(ERROR) << "Invalid type for model output";
160             *result = ANEURALNETWORKS_OP_FAILED;
161             return false;
162         }
163         if (info->scale != shape.scale) {
164             LOG(ERROR) << "Invalid scale for model output";
165             *result = ANEURALNETWORKS_OP_FAILED;
166             return false;
167         }
168         if (info->zeroPoint != shape.offset) {
169             LOG(ERROR) << "Invalid zeroPoint for model output";
170             *result = ANEURALNETWORKS_OP_FAILED;
171             return false;
172         }
173         if (info->extraParams != shape.extraParams) {
174             LOG(ERROR) << "Invalid extraParams for model output";
175             *result = ANEURALNETWORKS_OP_FAILED;
176             return false;
177         }
178     }
179 
180     auto combined = combineDimensions(shape.dimensions, info->dimensions);
181     if (!combined.has_value()) {
182         LOG(ERROR) << "Invalid dimensions for model operand";
183         *result = ANEURALNETWORKS_OP_FAILED;
184         return false;
185     }
186     info->dimensions = std::move(combined.value());
187     info->type = shape.type;
188     info->scale = shape.scale;
189     info->zeroPoint = shape.offset;
190     info->extraParams = shape.extraParams;
191 
192     // TODO(b/153081229): We bypass the overflow check on extension operands because we do not know
193     //                    the sizes of extension types.
194     if (!isExtensionOperandType(info->type) &&
195         nonExtensionOperandSizeOfDataOverflowsUInt32(info->type, info->dimensions)) {
196         LOG(ERROR) << "Operand data size overflows uint32_t";
197         *result = ANEURALNETWORKS_OP_FAILED;
198         return false;
199     }
200 
201     // Allocate the buffer only if the combined dimension is fully specified
202     if (info->buffer == nullptr && (info->lifetime == OperandLifeTime::TEMPORARY_VARIABLE ||
203                                     info->lifetime == OperandLifeTime::SUBGRAPH_OUTPUT)) {
204         if (isExtensionOperandType(info->type)) {
205             LOG(ERROR) << "Cannot allocate a variable of an extension type";
206             *result = ANEURALNETWORKS_OP_FAILED;
207             return false;
208         }
209         uint32_t length = nonExtensionOperandSizeOfData(info->type, info->dimensions);
210         if (length > 0) {
211             info->buffer = new uint8_t[length];
212             if (info->buffer == nullptr) {
213                 *result = ANEURALNETWORKS_OUT_OF_MEMORY;
214                 return false;
215             }
216             info->length = length;
217         }
218     }
219     if (!info->isSufficient()) {
220         uint32_t length = nonExtensionOperandSizeOfData(info->type, info->dimensions);
221         LOG(ERROR) << "Insufficient size for model operand: require = " << length
222                    << ", provided = " << info->length;
223         *result = ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE;
224         return false;
225     }
226     *result = ANEURALNETWORKS_NO_ERROR;
227     return true;
228 }
229 
setOutputShape(uint32_t index,const Shape & shape)230 bool OperationExecutionContext::setOutputShape(uint32_t index, const Shape& shape) {
231     return setInfoAndAllocateIfNeeded(getOutputInfo(index), shape, &result);
232 }
233 
isOmittedInput(uint32_t index) const234 bool OperationExecutionContext::isOmittedInput(uint32_t index) const {
235     return getInputInfo(index)->lifetime == OperandLifeTime::NO_VALUE;
236 }
237 
isOmittedOutput(uint32_t index) const238 bool OperationExecutionContext::isOmittedOutput(uint32_t index) const {
239     return getOutputInfo(index)->lifetime == OperandLifeTime::NO_VALUE;
240 }
241 
checkNoOmittedOperand() const242 bool OperationExecutionContext::checkNoOmittedOperand() const {
243     for (uint32_t i = 0; i < operation->inputs.size(); i++) {
244         NN_RET_CHECK(!isOmittedInput(i)) << getOperationName(operation->type) << " input operand "
245                                          << i << " is required but missing.";
246     }
247     for (uint32_t i = 0; i < operation->outputs.size(); i++) {
248         NN_RET_CHECK(!isOmittedOutput(i)) << getOperationName(operation->type) << " output operand "
249                                           << i << " is required but missing.";
250     }
251     return true;
252 }
253 
checkNoZeroSizedInput() const254 bool OperationExecutionContext::checkNoZeroSizedInput() const {
255     for (uint32_t i = 0; i < operation->inputs.size(); i++) {
256         if (isOmittedInput(i)) continue;
257         for (uint32_t j = 0; j < getInputInfo(i)->dimensions.size(); j++) {
258             NN_RET_CHECK_NE(getInputInfo(i)->dimensions[j], 0)
259                     << getOperationName(operation->type)
260                     << " does not support zero-sized tensor, but input " << i << " dimension " << j
261                     << " is 0.";
262         }
263     }
264     return true;
265 }
266 
267 }  // namespace
268 
269 // Used to keep a pointer to a memory pool.
270 //
271 // In the case of an "mmap_fd" pool, owns the mmap region
272 // returned by getBuffer() -- i.e., that region goes away
273 // when the RunTimePoolInfo is destroyed or is assigned to.
274 class RunTimePoolInfo::RunTimePoolInfoImpl {
275    public:
276     RunTimePoolInfoImpl(const hidl_memory& hidlMemory, uint8_t* buffer, const sp<IMemory>& memory,
277                         AHardwareBuffer* hardwareBuffer, uint32_t size);
278 
279     // rule of five...
280     ~RunTimePoolInfoImpl();
281     RunTimePoolInfoImpl(const RunTimePoolInfoImpl&) = delete;
282     RunTimePoolInfoImpl(RunTimePoolInfoImpl&&) noexcept = delete;
283     RunTimePoolInfoImpl& operator=(const RunTimePoolInfoImpl&) = delete;
284     RunTimePoolInfoImpl& operator=(RunTimePoolInfoImpl&&) noexcept = delete;
285 
getBuffer() const286     uint8_t* getBuffer() const { return mBuffer; }
getSize() const287     uint32_t getSize() const { return mSize; }
288 
289     bool flush() const;
290 
getHidlMemory() const291     const hidl_memory& getHidlMemory() const { return mHidlMemory; }
292 
293    private:
294     const hidl_memory mHidlMemory;     // always used
295     uint8_t* const mBuffer = nullptr;  // always used
296     const sp<IMemory> mMemory;         // only used when hidlMemory.name() == "ashmem"
297     AHardwareBuffer*
298             mAHardwareBuffer;  // only used when hidlMemory.name() == "hardware_buffer_blob"
299     const uint32_t mSize;
300 };
301 
RunTimePoolInfoImpl(const hidl_memory & hidlMemory,uint8_t * buffer,const sp<IMemory> & memory,AHardwareBuffer * hardwareBuffer,uint32_t size)302 RunTimePoolInfo::RunTimePoolInfoImpl::RunTimePoolInfoImpl(const hidl_memory& hidlMemory,
303                                                           uint8_t* buffer,
304                                                           const sp<IMemory>& memory,
305                                                           AHardwareBuffer* hardwareBuffer,
306                                                           uint32_t size)
307     : mHidlMemory(hidlMemory),
308       mBuffer(buffer),
309       mMemory(memory),
310       mAHardwareBuffer(hardwareBuffer),
311       mSize(size) {}
312 
~RunTimePoolInfoImpl()313 RunTimePoolInfo::RunTimePoolInfoImpl::~RunTimePoolInfoImpl() {
314     if (mBuffer == nullptr) {
315         return;
316     }
317 
318     const auto& memType = mHidlMemory.name();
319     if (memType == "ashmem") {
320         // nothing to do
321     } else if (memType == "mmap_fd") {
322         const size_t size = mHidlMemory.size();
323         if (munmap(mBuffer, size)) {
324             LOG(ERROR) << "RunTimePoolInfoImpl::~RunTimePoolInfo(): Can't munmap";
325         }
326     } else if (memType == "hardware_buffer_blob") {
327         AHardwareBuffer_unlock(mAHardwareBuffer, nullptr);
328     } else if (memType == "") {
329         // Represents a POINTER argument; nothing to do
330     } else {
331         LOG(ERROR) << "RunTimePoolInfoImpl::~RunTimePoolInfoImpl(): unsupported hidl_memory type";
332     }
333 
334     if (mAHardwareBuffer != nullptr) {
335         AHardwareBuffer_release(mAHardwareBuffer);
336     }
337 }
338 
339 // Making sure the output data are correctly updated after execution.
flush() const340 bool RunTimePoolInfo::RunTimePoolInfoImpl::flush() const {
341     const auto& memType = mHidlMemory.name();
342     if (memType == "mmap_fd") {
343         const int prot = mHidlMemory.handle()->data[1];
344         if (prot & PROT_WRITE) {
345             const size_t size = mHidlMemory.size();
346             return msync(mBuffer, size, MS_SYNC) == 0;
347         }
348     }
349     // No-op for other types of memory.
350     return true;
351 }
352 
353 // TODO: short term, make share memory mapping and updating a utility function.
354 // TODO: long term, implement mmap_fd as a hidl IMemory service.
createFromHidlMemory(const hidl_memory & hidlMemory)355 std::optional<RunTimePoolInfo> RunTimePoolInfo::createFromHidlMemory(
356         const hidl_memory& hidlMemory) {
357     uint8_t* buffer = nullptr;
358     sp<IMemory> memory;
359     AHardwareBuffer* hardwareBuffer = nullptr;
360 
361     const auto& memType = hidlMemory.name();
362     if (memType == "ashmem") {
363         memory = mapMemory(hidlMemory);
364         if (memory == nullptr) {
365             LOG(ERROR) << "Can't map shared memory.";
366             return std::nullopt;
367         }
368         buffer = static_cast<uint8_t*>(static_cast<void*>(memory->getPointer()));
369         if (buffer == nullptr) {
370             LOG(ERROR) << "Can't access shared memory.";
371             return std::nullopt;
372         }
373     } else if (memType == "mmap_fd") {
374         size_t size = hidlMemory.size();
375         int fd = hidlMemory.handle()->data[0];
376         int prot = hidlMemory.handle()->data[1];
377         size_t offset = getSizeFromInts(hidlMemory.handle()->data[2], hidlMemory.handle()->data[3]);
378         buffer = static_cast<uint8_t*>(mmap(nullptr, size, prot, MAP_SHARED, fd, offset));
379         if (buffer == MAP_FAILED) {
380             LOG(ERROR) << "RunTimePoolInfo::set(): Can't mmap the file descriptor.";
381             return std::nullopt;
382         }
383     } else if (memType == "hardware_buffer_blob") {
384         auto handle = hidlMemory.handle();
385         auto format = AHARDWAREBUFFER_FORMAT_BLOB;
386         auto usage = AHARDWAREBUFFER_USAGE_CPU_READ_OFTEN | AHARDWAREBUFFER_USAGE_CPU_WRITE_OFTEN;
387         const uint32_t width = hidlMemory.size();
388         const uint32_t height = 1;  // height is always 1 for BLOB mode AHardwareBuffer.
389         const uint32_t layers = 1;  // layers is always 1 for BLOB mode AHardwareBuffer.
390         const uint32_t stride = hidlMemory.size();
391 
392         AHardwareBuffer_Desc desc{
393                 .width = width,
394                 .height = height,
395                 .layers = layers,
396                 .format = format,
397                 .usage = usage,
398                 .stride = stride,
399         };
400         status_t status = AHardwareBuffer_createFromHandle(
401                 &desc, handle, AHARDWAREBUFFER_CREATE_FROM_HANDLE_METHOD_CLONE, &hardwareBuffer);
402         if (status != NO_ERROR) {
403             LOG(ERROR) << "RunTimePoolInfo Can't create AHardwareBuffer from handle. Error: "
404                        << status;
405             return std::nullopt;
406         }
407         void* gBuffer = nullptr;
408         status = AHardwareBuffer_lock(hardwareBuffer, usage, -1, nullptr, &gBuffer);
409         if (status != NO_ERROR) {
410             LOG(ERROR) << "RunTimePoolInfo Can't lock the AHardwareBuffer. Error: " << status;
411             return std::nullopt;
412         }
413         buffer = static_cast<uint8_t*>(gBuffer);
414     } else {
415         LOG(ERROR) << "RunTimePoolInfo::set(): unsupported hidl_memory type";
416         return std::nullopt;
417     }
418 
419     const auto impl = std::make_shared<const RunTimePoolInfoImpl>(
420             hidlMemory, buffer, memory, hardwareBuffer, hidlMemory.size());
421     return {RunTimePoolInfo(impl)};
422 }
423 
createFromExistingBuffer(uint8_t * buffer,uint32_t size)424 RunTimePoolInfo RunTimePoolInfo::createFromExistingBuffer(uint8_t* buffer, uint32_t size) {
425     const auto impl = std::make_shared<const RunTimePoolInfoImpl>(hidl_memory{}, buffer, nullptr,
426                                                                   nullptr, size);
427     return {impl};
428 }
429 
RunTimePoolInfo(const std::shared_ptr<const RunTimePoolInfoImpl> & impl)430 RunTimePoolInfo::RunTimePoolInfo(const std::shared_ptr<const RunTimePoolInfoImpl>& impl)
431     : mImpl(impl) {}
432 
getBuffer() const433 uint8_t* RunTimePoolInfo::getBuffer() const {
434     return mImpl->getBuffer();
435 }
436 
getSize() const437 uint32_t RunTimePoolInfo::getSize() const {
438     return mImpl->getSize();
439 }
440 
flush() const441 bool RunTimePoolInfo::flush() const {
442     return mImpl->flush();
443 }
444 
getHidlMemory() const445 const hidl_memory& RunTimePoolInfo::getHidlMemory() const {
446     return mImpl->getHidlMemory();
447 }
448 
setRunTimePoolInfosFromHidlMemories(std::vector<RunTimePoolInfo> * poolInfos,const hidl_vec<hidl_memory> & pools)449 bool setRunTimePoolInfosFromHidlMemories(std::vector<RunTimePoolInfo>* poolInfos,
450                                          const hidl_vec<hidl_memory>& pools) {
451     CHECK(poolInfos != nullptr);
452     poolInfos->clear();
453     poolInfos->reserve(pools.size());
454     for (const auto& pool : pools) {
455         if (std::optional<RunTimePoolInfo> poolInfo = RunTimePoolInfo::createFromHidlMemory(pool)) {
456             poolInfos->push_back(*poolInfo);
457         } else {
458             LOG(ERROR) << "Could not map pools";
459             poolInfos->clear();
460             return false;
461         }
462     }
463     return true;
464 }
465 
setRunTimePoolInfosFromMemoryPools(std::vector<RunTimePoolInfo> * poolInfos,const hidl_vec<Request::MemoryPool> & pools)466 bool setRunTimePoolInfosFromMemoryPools(std::vector<RunTimePoolInfo>* poolInfos,
467                                         const hidl_vec<Request::MemoryPool>& pools) {
468     CHECK(poolInfos != nullptr);
469     poolInfos->clear();
470     poolInfos->reserve(pools.size());
471     for (const auto& pool : pools) {
472         if (pool.getDiscriminator() != Request::MemoryPool::hidl_discriminator::hidlMemory) {
473             LOG(ERROR) << "Unknown memory token";
474             poolInfos->clear();
475             return false;
476         }
477         if (std::optional<RunTimePoolInfo> poolInfo =
478                     RunTimePoolInfo::createFromHidlMemory(pool.hidlMemory())) {
479             poolInfos->push_back(*poolInfo);
480         } else {
481             LOG(ERROR) << "Could not map pools";
482             poolInfos->clear();
483             return false;
484         }
485     }
486     return true;
487 }
488 
489 template <typename T>
convertToNhwcImpl(T * to,const T * from,const std::vector<uint32_t> & fromDim)490 inline bool convertToNhwcImpl(T* to, const T* from, const std::vector<uint32_t>& fromDim) {
491     uint32_t spatialSize = fromDim[2] * fromDim[3];
492     for (uint32_t n = 0; n < fromDim[0]; n++) {
493         for (uint32_t hw = 0; hw < spatialSize; hw++) {
494             for (uint32_t c = 0; c < fromDim[1]; c++) {
495                 uint32_t fromIndex = n * fromDim[1] * spatialSize + c * spatialSize + hw;
496                 *to++ = from[fromIndex];
497             }
498         }
499     }
500     return true;
501 }
502 
503 template <typename T>
convertFromNhwcImpl(T * to,const T * from,const std::vector<uint32_t> & fromDim)504 inline bool convertFromNhwcImpl(T* to, const T* from, const std::vector<uint32_t>& fromDim) {
505     uint32_t spatialSize = fromDim[1] * fromDim[2];
506     for (uint32_t n = 0; n < fromDim[0]; n++) {
507         for (uint32_t c = 0; c < fromDim[3]; c++) {
508             for (uint32_t hw = 0; hw < spatialSize; hw++) {
509                 uint32_t fromIndex = n * spatialSize * fromDim[3] + hw * fromDim[3] + c;
510                 *to++ = from[fromIndex];
511             }
512         }
513     }
514     return true;
515 }
516 
convertToNhwc(RunTimeOperandInfo & to,const RunTimeOperandInfo & from,std::unique_ptr<uint8_t[]> & ptr_guard,bool data_layout)517 static bool convertToNhwc(RunTimeOperandInfo& to, const RunTimeOperandInfo& from,
518                           std::unique_ptr<uint8_t[]>& ptr_guard, bool data_layout) {
519     int result;
520     if (from.dimensions.size() != 4) {
521         LOG(ERROR) << "Error converting a non-4-D tensor to NHWC layout";
522         return false;
523     }
524     to.lifetime = OperandLifeTime::TEMPORARY_VARIABLE;
525     if (data_layout) {
526         // convert dimensions
527         Shape inShape = from.shape();
528         auto& fromDim = from.dimensions;
529         inShape.dimensions = {fromDim[0], fromDim[2], fromDim[3], fromDim[1]};
530         // allocate buffer
531         to.buffer = nullptr;
532         if (!setInfoAndAllocateIfNeeded(&to, inShape, &result)) {
533             return false;
534         }
535         ptr_guard.reset(to.buffer);
536         // convert value
537         if (from.type == OperandType::TENSOR_FLOAT32) {
538             return convertToNhwcImpl<float>(reinterpret_cast<float*>(to.buffer),
539                                             reinterpret_cast<const float*>(from.buffer), fromDim);
540         } else if (from.type == OperandType::TENSOR_FLOAT16) {
541             return convertToNhwcImpl<_Float16>(reinterpret_cast<_Float16*>(to.buffer),
542                                                reinterpret_cast<const _Float16*>(from.buffer),
543                                                fromDim);
544         } else if (from.type == OperandType::TENSOR_QUANT8_ASYMM) {
545             return convertToNhwcImpl<uint8_t>(reinterpret_cast<uint8_t*>(to.buffer),
546                                               reinterpret_cast<const uint8_t*>(from.buffer),
547                                               fromDim);
548         } else if (from.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
549             return convertToNhwcImpl<int8_t>(reinterpret_cast<int8_t*>(to.buffer),
550                                              reinterpret_cast<const int8_t*>(from.buffer), fromDim);
551         } else {
552             LOG(ERROR) << "Unsupported data type";
553             return false;
554         }
555     } else {
556         to = from;
557     }
558     return true;
559 }
560 
convertFromNhwc(RunTimeOperandInfo & to,const RunTimeOperandInfo & from,bool data_layout,int * result)561 static bool convertFromNhwc(RunTimeOperandInfo& to, const RunTimeOperandInfo& from,
562                             bool data_layout, int* result) {
563     if (from.dimensions.size() != 4) {
564         LOG(ERROR) << "Error converting a non-4-D tensor from NHWC layout";
565         return false;
566     }
567     if (data_layout) {
568         // convert dimensions
569         Shape outShape = from.shape();
570         auto& fromDim = from.dimensions;
571         outShape.dimensions = {fromDim[0], fromDim[3], fromDim[1], fromDim[2]};
572         // allocate buffer
573         if (!setInfoAndAllocateIfNeeded(&to, outShape, result)) {
574             return false;
575         }
576         // convert value
577         if (from.type == OperandType::TENSOR_FLOAT32) {
578             return convertFromNhwcImpl<float>(reinterpret_cast<float*>(to.buffer),
579                                               reinterpret_cast<const float*>(from.buffer), fromDim);
580         } else if (from.type == OperandType::TENSOR_FLOAT16) {
581             return convertFromNhwcImpl<_Float16>(reinterpret_cast<_Float16*>(to.buffer),
582                                                  reinterpret_cast<const _Float16*>(from.buffer),
583                                                  fromDim);
584         } else if (from.type == OperandType::TENSOR_QUANT8_ASYMM) {
585             return convertFromNhwcImpl<uint8_t>(reinterpret_cast<uint8_t*>(to.buffer),
586                                                 reinterpret_cast<const uint8_t*>(from.buffer),
587                                                 fromDim);
588         } else if (from.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
589             return convertFromNhwcImpl<int8_t>(reinterpret_cast<int8_t*>(to.buffer),
590                                                reinterpret_cast<const int8_t*>(from.buffer),
591                                                fromDim);
592         } else {
593             LOG(ERROR) << "Unsupported data type";
594             return false;
595         }
596     } else {
597         Shape outShape = from.shape();
598         to.buffer = from.buffer;
599         to.length = from.length;
600         if (!setInfoAndAllocateIfNeeded(&to, outShape, result)) {
601             return false;
602         }
603     }
604     return true;
605 }
606 
607 // Decrements the usage count for the operands listed.  Frees the memory
608 // allocated for any temporary variable with a count of zero.
consumeOperationInputs(const std::vector<uint32_t> & inputs,RunTimeOperandInfo * operands)609 static void consumeOperationInputs(const std::vector<uint32_t>& inputs,
610                                    RunTimeOperandInfo* operands) {
611     for (uint32_t i : inputs) {
612         auto& info = operands[i];
613         // Check if it's a static or model input/output.
614         if (info.numberOfUsesLeft == 0) {
615             continue;
616         }
617         info.numberOfUsesLeft--;
618         if (info.numberOfUsesLeft == 0 && info.buffer != nullptr) {
619             delete[] info.buffer;
620             info.buffer = nullptr;
621         }
622     }
623 }
624 
625 // This function only frees TEMPORARY_VARIABLE operands that are unused
626 // outputs because consumeOperationInputs takes care of any operands
627 // that are inputs to an operation.
freeUnusedSubgraphOperands(std::vector<RunTimeOperandInfo> * operands)628 static void freeUnusedSubgraphOperands(std::vector<RunTimeOperandInfo>* operands) {
629     for (auto& info : *operands) {
630         if (info.lifetime == OperandLifeTime::TEMPORARY_VARIABLE && info.numberOfUsesLeft == 0 &&
631             info.buffer != nullptr) {
632             delete[] info.buffer;
633             info.buffer = nullptr;
634         }
635     }
636 }
637 
638 // Ignore the .pools entry in model and request.  This will have been taken care of
639 // by the caller.
run(const Model & model,const Request & request,const std::vector<RunTimePoolInfo> & modelPoolInfos,const std::vector<RunTimePoolInfo> & requestPoolInfos)640 int CpuExecutor::run(const Model& model, const Request& request,
641                      const std::vector<RunTimePoolInfo>& modelPoolInfos,
642                      const std::vector<RunTimePoolInfo>& requestPoolInfos) {
643     NNTRACE_CPU(NNTRACE_PHASE_EXECUTION, "run");
644     VLOG(CPUEXE) << "CpuExecutor::run() with request(" << SHOW_IF_DEBUG(toString(request)) << ")";
645     mModelOperandValues = &model.operandValues;
646     mModelPoolInfos = &modelPoolInfos;
647     mReferencedSubgraphs = &model.referenced;
648 
649     // b/109953668, disable OpenMP
650 #ifdef NNAPI_OPENMP
651     ScopedOpenmpSettings openMpSettings;
652 #endif  // NNAPI_OPENMP
653 
654     std::vector<RunTimeOperandInfo> operands = initializeRunTimeInfo(model.main);
655     updateForArguments(model.main.inputIndexes, request.inputs, requestPoolInfos, operands.data());
656     updateForArguments(model.main.outputIndexes, request.outputs, requestPoolInfos,
657                        operands.data());
658     int result = executeSubgraph(model.main, operands.data());
659     freeUnusedSubgraphOperands(&operands);
660 
661     if (result == ANEURALNETWORKS_NO_ERROR) {
662         VLOG(CPUEXE) << "Completed run normally";
663         for (auto& runtimeInfo : requestPoolInfos) {
664             runtimeInfo.flush();
665         }
666     }
667 
668     // Only report the output shapes when the result code is NO_ERROR or OUTPUT_INSUFFICIENT_SIZE.
669     if (result == ANEURALNETWORKS_NO_ERROR || result == ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE) {
670         setOutputShapes(model.main.outputIndexes, operands);
671     } else {
672         mOutputShapes.clear();
673     }
674 
675     mFinished = true;
676     mModelOperandValues = nullptr;
677     mModelPoolInfos = nullptr;
678     mReferencedSubgraphs = nullptr;
679     return result;
680 }
681 
executeSubgraph(const Subgraph & subgraph,RunTimeOperandInfo * operands)682 int CpuExecutor::executeSubgraph(const Subgraph& subgraph, RunTimeOperandInfo* operands) {
683     VLOG(CPUEXE) << "CpuExecutor::executeSubgraph " << toString(subgraph);
684     // The graph has serialized the operation in execution order.
685     for (const auto& operation : subgraph.operations) {
686         NN_RETURN_IF_ERROR(executeOperation(operation, operands));
687     }
688     return ANEURALNETWORKS_NO_ERROR;
689 }
690 
initializeRunTimeInfo(const Subgraph & subgraph)691 std::vector<RunTimeOperandInfo> CpuExecutor::initializeRunTimeInfo(const Subgraph& subgraph) {
692     VLOG(CPUEXE) << "CpuExecutor::initializeRunTimeInfo";
693     const size_t count = subgraph.operands.size();
694     std::vector<RunTimeOperandInfo> operands(count);
695     for (size_t i = 0; i < count; i++) {
696         const Operand& from = subgraph.operands[i];
697         RunTimeOperandInfo& to = operands[i];
698         to.type = from.type;
699         to.dimensions = from.dimensions;
700         to.scale = from.scale;
701         to.zeroPoint = from.zeroPoint;
702         to.length = from.location.length;
703         to.lifetime = from.lifetime;
704         to.extraParams = from.extraParams;
705         switch (from.lifetime) {
706             case OperandLifeTime::TEMPORARY_VARIABLE:
707                 to.buffer = nullptr;
708                 to.numberOfUsesLeft = from.numberOfConsumers;
709                 break;
710             case OperandLifeTime::CONSTANT_COPY:
711                 to.buffer = const_cast<uint8_t*>(&(*mModelOperandValues)[from.location.offset]);
712                 to.numberOfUsesLeft = 0;
713                 break;
714             case OperandLifeTime::CONSTANT_REFERENCE: {
715                 auto poolIndex = from.location.poolIndex;
716                 CHECK_LT(poolIndex, mModelPoolInfos->size());
717                 auto& r = (*mModelPoolInfos)[poolIndex];
718                 to.buffer = r.getBuffer() + from.location.offset;
719                 to.numberOfUsesLeft = 0;
720                 break;
721             }
722             case OperandLifeTime::SUBGRAPH: {
723                 auto subgraphIndex = from.location.offset;
724                 CHECK_LT(subgraphIndex, mReferencedSubgraphs->size());
725                 to.buffer = reinterpret_cast<uint8_t*>(
726                         const_cast<Subgraph*>(&(*mReferencedSubgraphs)[subgraphIndex]));
727                 to.numberOfUsesLeft = 0;
728             } break;
729             case OperandLifeTime::SUBGRAPH_INPUT:
730             case OperandLifeTime::SUBGRAPH_OUTPUT:
731             case OperandLifeTime::NO_VALUE:
732                 to.buffer = nullptr;
733                 to.numberOfUsesLeft = 0;
734                 break;
735         }
736     }
737     return operands;
738 }
739 
updateForArguments(const std::vector<uint32_t> & indexes,const hal::hidl_vec<hal::RequestArgument> & arguments,const std::vector<RunTimePoolInfo> & requestPoolInfos,RunTimeOperandInfo * operands)740 void CpuExecutor::updateForArguments(const std::vector<uint32_t>& indexes,
741                                      const hal::hidl_vec<hal::RequestArgument>& arguments,
742                                      const std::vector<RunTimePoolInfo>& requestPoolInfos,
743                                      RunTimeOperandInfo* operands) {
744     CHECK_EQ(indexes.size(), arguments.size());
745     for (size_t i = 0; i < indexes.size(); i++) {
746         const uint32_t operandIndex = indexes[i];
747         const RequestArgument& from = arguments[i];
748         RunTimeOperandInfo& to = operands[operandIndex];
749         if (from.dimensions.size() > 0) {
750             // It's the responsibility of the caller to validate that
751             // from.dimensions only modifies the dimensions that were
752             // unspecified in the model.  That's the case in SampleDriver.cpp
753             // with the call to validateRequest().
754             // TODO make sure that's the case for the default CPU path.
755             to.dimensions = from.dimensions;
756         }
757         if (from.hasNoValue) {
758             to.lifetime = OperandLifeTime::NO_VALUE;
759             CHECK(to.buffer == nullptr);
760             to.length = 0;
761         } else {
762             auto poolIndex = from.location.poolIndex;
763             CHECK_LT(poolIndex, requestPoolInfos.size());
764             auto& r = requestPoolInfos[poolIndex];
765             to.buffer = r.getBuffer() + from.location.offset;
766             if (from.location.offset == 0 && from.location.length == 0) {
767                 // Use the entire memory region.
768                 to.length = r.getSize();
769             } else {
770                 to.length = from.location.length;
771             }
772         }
773     }
774 }
775 
executeOperation(const Operation & operation,RunTimeOperandInfo * operands)776 int CpuExecutor::executeOperation(const Operation& operation, RunTimeOperandInfo* operands) {
777     if (hasDeadlinePassed(mDeadline)) {
778         return ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT;
779     }
780     if (operation.type == OperationType::IF) {
781         int result = executeIfOperation(operation, operands);
782         if (result != ANEURALNETWORKS_NO_ERROR) {
783             LOG(ERROR) << "IF failed.";
784         }
785         return result;
786     }
787     if (operation.type == OperationType::WHILE) {
788         int result = executeWhileOperation(operation, operands);
789         if (result != ANEURALNETWORKS_NO_ERROR) {
790             LOG(ERROR) << "WHILE failed.";
791         }
792         return result;
793     }
794 
795     // VLOG(CPUEXE) << "CpuExecutor::executeOperation(" << toString(operation) << ")";
796     const hidl_vec<uint32_t>& ins = operation.inputs;
797     const hidl_vec<uint32_t>& outs = operation.outputs;
798     bool success = false;
799     int result = ANEURALNETWORKS_NO_ERROR;
800 
801     // Function to verify that the number of input and output parameters
802     // matches what is expected.  Also checks that all the parameters have
803     // values. This function is to be used only for operations that do not
804     // accept optional arguments.
805     // TODO Have a version that works for optional arguments.
806     auto allParametersPresent = [&operation, &operands, &ins, &outs](size_t requiredIns,
807                                                                      size_t requiredOuts) -> bool {
808         auto verify = [&operation, &operands](size_t requiredCount,
809                                               const hidl_vec<uint32_t>& indexes,
810                                               const char* type) -> bool {
811             size_t actualCount = indexes.size();
812             if (actualCount != requiredCount) {
813                 LOG(ERROR) << getOperationName(operation.type) << ": Invalid number of " << type
814                            << " operands. Got " << actualCount << " of " << requiredCount;
815                 return false;
816             }
817             for (size_t i = 0; i < actualCount; i++) {
818                 if (operands[indexes[i]].lifetime == OperandLifeTime::NO_VALUE) {
819                     LOG(ERROR) << getOperationName(operation.type) << " " << type << " operand "
820                                << i << " is required but missing.";
821                     return false;
822                 }
823             }
824             return true;
825         };
826 
827         auto verifyNoZeroSizedInputs = [&operation, &operands](const hidl_vec<uint32_t>& indexes) {
828             for (size_t i = 0; i < indexes.size(); i++) {
829                 for (size_t j = 0; j < operands[indexes[i]].dimensions.size(); j++) {
830                     if (operands[indexes[i]].dimensions[j] == 0) {
831                         LOG(ERROR) << getOperationName(operation.type)
832                                    << " does not support zero-sized tensor, but input " << i
833                                    << " dimension " << j << " is zero.";
834                         return false;
835                     }
836                 }
837             }
838             return true;
839         };
840 
841         return verify(requiredIns, ins, "in") && verify(requiredOuts, outs, "out") &&
842                verifyNoZeroSizedInputs(ins);
843     };
844 
845     switch (operation.type) {
846         case OperationType::OEM_OPERATION: {
847             LOG(ERROR) << "OEM operation not supported for CPU execution";
848             success = false;
849         } break;
850         case OperationType::RESHAPE: {
851             if (!allParametersPresent(2, 1)) {
852                 return ANEURALNETWORKS_BAD_DATA;
853             }
854             const RunTimeOperandInfo& input = operands[ins[0]];
855             const RunTimeOperandInfo& targetShape = operands[ins[1]];
856 
857             RunTimeOperandInfo& output = operands[outs[0]];
858             Shape outShape = output.shape();
859 
860             success = reshapePrepare(input.shape(),
861                                      reinterpret_cast<const int32_t*>(targetShape.buffer),
862                                      getNumberOfElements(targetShape.shape()), &outShape) &&
863                       setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
864                       copyData(input.buffer, input.shape(), output.buffer, outShape);
865         } break;
866         case OperationType::DEPTH_TO_SPACE: {
867             const size_t inCount = ins.size();
868             if ((inCount != 3 && inCount != 2) || !allParametersPresent(inCount, 1)) {
869                 return ANEURALNETWORKS_BAD_DATA;
870             }
871             const RunTimeOperandInfo& input = operands[ins[0]];
872             int32_t blockSize = getScalarData<int32_t>(operands[ins[1]]);
873             bool data_layout = inCount == 3 ? getScalarData<bool>(operands[ins[2]]) : false;
874 
875             RunTimeOperandInfo& output = operands[outs[0]];
876             Shape outShape = output.shape();
877 
878             RunTimeOperandInfo input_tmp, output_tmp;
879             std::unique_ptr<uint8_t[]> input_tmp_guard, output_tmp_guard;
880             if (!convertToNhwc(input_tmp, input, input_tmp_guard, data_layout)) {
881                 success = false;
882                 break;
883             }
884             output_tmp.lifetime = OperandLifeTime::TEMPORARY_VARIABLE;
885             output_tmp.buffer = data_layout ? nullptr : output.buffer;
886             output_tmp.length = data_layout ? 0 : output.length;
887             if (!depthToSpacePrepare(input_tmp.shape(), blockSize, &outShape) ||
888                 !setInfoAndAllocateIfNeeded(&output_tmp, outShape, &result)) {
889                 if (!data_layout) output.dimensions = output_tmp.dimensions;
890                 break;
891             }
892             switch (input_tmp.type) {
893                 case OperandType::TENSOR_FLOAT32: {
894                     success = depthToSpaceGeneric(
895                             reinterpret_cast<const float*>(input_tmp.buffer), input_tmp.shape(),
896                             blockSize, reinterpret_cast<float*>(output_tmp.buffer), outShape);
897                     break;
898                 }
899                 case OperandType::TENSOR_FLOAT16: {
900                     success = depthToSpaceGeneric(
901                             reinterpret_cast<const _Float16*>(input_tmp.buffer), input_tmp.shape(),
902                             blockSize, reinterpret_cast<_Float16*>(output_tmp.buffer), outShape);
903                     break;
904                 }
905                 case OperandType::TENSOR_QUANT8_ASYMM: {
906                     success = depthToSpaceGeneric(
907                             reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
908                             blockSize, reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
909                     break;
910                 }
911                 case OperandType::TENSOR_QUANT8_ASYMM_SIGNED: {
912                     success = depthToSpaceGeneric(
913                             reinterpret_cast<const int8_t*>(input_tmp.buffer), input_tmp.shape(),
914                             blockSize, reinterpret_cast<int8_t*>(output_tmp.buffer), outShape);
915                     break;
916                 }
917                 default: {
918                     LOG(ERROR) << "Unsupported data type";
919                     success = false;
920                 }
921             }
922             if (data_layout) {
923                 output_tmp_guard.reset(output_tmp.buffer);
924             }
925             if (!success || !convertFromNhwc(output, output_tmp, data_layout, &result)) {
926                 success = false;
927                 break;
928             }
929         } break;
930         case OperationType::SPACE_TO_DEPTH: {
931             const size_t inCount = ins.size();
932             if ((inCount != 3 && inCount != 2) || !allParametersPresent(inCount, 1)) {
933                 return ANEURALNETWORKS_BAD_DATA;
934             }
935             const RunTimeOperandInfo& input = operands[ins[0]];
936             int32_t blockSize = getScalarData<int32_t>(operands[ins[1]]);
937             bool data_layout = inCount == 3 ? getScalarData<bool>(operands[ins[2]]) : false;
938 
939             RunTimeOperandInfo& output = operands[outs[0]];
940             Shape outShape = output.shape();
941 
942             RunTimeOperandInfo input_tmp, output_tmp;
943             std::unique_ptr<uint8_t[]> input_tmp_guard, output_tmp_guard;
944             if (!convertToNhwc(input_tmp, input, input_tmp_guard, data_layout)) {
945                 success = false;
946                 break;
947             }
948             output_tmp.lifetime = OperandLifeTime::TEMPORARY_VARIABLE;
949             output_tmp.buffer = data_layout ? nullptr : output.buffer;
950             output_tmp.length = data_layout ? 0 : output.length;
951 
952             if (!spaceToDepthPrepare(input_tmp.shape(), blockSize, &outShape) ||
953                 !setInfoAndAllocateIfNeeded(&output_tmp, outShape, &result)) {
954                 if (!data_layout) output.dimensions = output_tmp.dimensions;
955                 break;
956             }
957             switch (input_tmp.type) {
958                 case OperandType::TENSOR_FLOAT32: {
959                     success = spaceToDepthGeneric(
960                             reinterpret_cast<const float*>(input_tmp.buffer), input_tmp.shape(),
961                             blockSize, reinterpret_cast<float*>(output_tmp.buffer), outShape);
962                     break;
963                 }
964                 case OperandType::TENSOR_FLOAT16: {
965                     success = spaceToDepthGeneric(
966                             reinterpret_cast<const _Float16*>(input_tmp.buffer), input_tmp.shape(),
967                             blockSize, reinterpret_cast<_Float16*>(output_tmp.buffer), outShape);
968                     break;
969                 }
970                 case OperandType::TENSOR_QUANT8_ASYMM: {
971                     success = spaceToDepthGeneric(
972                             reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
973                             blockSize, reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
974                     break;
975                 }
976                 case OperandType::TENSOR_QUANT8_ASYMM_SIGNED: {
977                     success = spaceToDepthGeneric(
978                             reinterpret_cast<const int8_t*>(input_tmp.buffer), input_tmp.shape(),
979                             blockSize, reinterpret_cast<int8_t*>(output_tmp.buffer), outShape);
980                     break;
981                 }
982                 default: {
983                     LOG(ERROR) << "Unsupported data type";
984                     success = false;
985                 }
986             }
987             if (data_layout) {
988                 output_tmp_guard.reset(output_tmp.buffer);
989             }
990             if (!success || !convertFromNhwc(output, output_tmp, data_layout, &result)) {
991                 success = false;
992                 break;
993             }
994         } break;
995         case OperationType::EMBEDDING_LOOKUP: {
996             if (!allParametersPresent(2, 1)) {
997                 return ANEURALNETWORKS_BAD_DATA;
998             }
999             const RunTimeOperandInfo& values = operands[ins[EmbeddingLookup::kValueTensor]];
1000             const RunTimeOperandInfo& lookups = operands[ins[EmbeddingLookup::kLookupTensor]];
1001             RunTimeOperandInfo& output = operands[outs[EmbeddingLookup::kOutputTensor]];
1002 
1003             Shape outputShape;
1004             EmbeddingLookup lookup(operation, operands);
1005 
1006             success = embeddingLookupPrepare(values.shape(), lookups.shape(), &outputShape) &&
1007                       setInfoAndAllocateIfNeeded(&output, outputShape, &result) && lookup.Eval();
1008         } break;
1009         case OperationType::HASHTABLE_LOOKUP: {
1010             if (!allParametersPresent(3, 2)) {
1011                 return ANEURALNETWORKS_BAD_DATA;
1012             }
1013             const RunTimeOperandInfo& lookups = operands[ins[HashtableLookup::kLookupTensor]];
1014             const RunTimeOperandInfo& keys = operands[ins[HashtableLookup::kKeyTensor]];
1015             const RunTimeOperandInfo& values = operands[ins[HashtableLookup::kValueTensor]];
1016 
1017             RunTimeOperandInfo& output = operands[outs[HashtableLookup::kOutputTensor]];
1018             RunTimeOperandInfo& hits = operands[outs[HashtableLookup::kHitsTensor]];
1019 
1020             Shape outputShape, hitShape;
1021             HashtableLookup lookup(operation, operands);
1022 
1023             success = hashtableLookupPrepare(lookups.shape(), keys.shape(), values.shape(),
1024                                              &outputShape, &hitShape) &&
1025                       setInfoAndAllocateIfNeeded(&output, outputShape, &result) &&
1026                       setInfoAndAllocateIfNeeded(&hits, hitShape, &result) && lookup.Eval();
1027         } break;
1028         case OperationType::LSH_PROJECTION: {
1029             RunTimeOperandInfo& output = operands[outs[LSHProjection::kOutputTensor]];
1030             Shape outputShape;
1031             if (!LSHProjection::Prepare(operation, operands, &outputShape) ||
1032                 !setInfoAndAllocateIfNeeded(&output, outputShape, &result)) {
1033                 break;
1034             }
1035 
1036             LSHProjection lsh(operation, operands);
1037             const RunTimeOperandInfo& hash = operands[ins[LSHProjection::kHashTensor]];
1038             switch (hash.type) {
1039                 case OperandType::TENSOR_FLOAT32: {
1040                     success = lsh.Eval<float>();
1041                     break;
1042                 }
1043                 case OperandType::TENSOR_FLOAT16: {
1044                     success = lsh.Eval<_Float16>();
1045                     break;
1046                 }
1047                 default: {
1048                     success = false;
1049                     LOG(ERROR) << "Unsupported data type";
1050                 }
1051             }
1052         } break;
1053         case OperationType::BIDIRECTIONAL_SEQUENCE_LSTM: {
1054             const auto merge_outputs = getScalarData<bool>(
1055                     operands[ins[BidirectionalSequenceLSTM::kMergeOutputsParam]]);
1056             const bool output_state = (outs.size() == 5 || outs.size() == 6);
1057             RunTimeOperandInfo& fwOutput =
1058                     operands[outs[BidirectionalSequenceLSTM::kFwOutputTensor]];
1059             Shape fwOutputShape, bwOutputShape, fwOutputActivationStateShape,
1060                     fwOutputCellStateShape, bwOutputActivationStateShape, bwOutputCellStateShape;
1061 
1062             BidirectionalSequenceLSTM lstm(operation, operands);
1063             success = lstm.Prepare(operation, operands, &fwOutputShape, &bwOutputShape,
1064                                    &fwOutputActivationStateShape, &fwOutputCellStateShape,
1065                                    &bwOutputActivationStateShape, &bwOutputCellStateShape) &&
1066                       setInfoAndAllocateIfNeeded(&fwOutput, fwOutputShape, &result);
1067             if (!merge_outputs) {
1068                 RunTimeOperandInfo& bwOutput =
1069                         operands[outs[BidirectionalSequenceLSTM::kBwOutputTensor]];
1070                 success = success && setInfoAndAllocateIfNeeded(&bwOutput, bwOutputShape, &result);
1071             }
1072             if (output_state) {
1073                 uint32_t delta = merge_outputs ? 1 : 0;
1074                 RunTimeOperandInfo& fwOutputActivationState =
1075                         operands[outs[BidirectionalSequenceLSTM::kFwOutputActivationStateTensor -
1076                                       delta]];
1077                 RunTimeOperandInfo& fwOutputCellState =
1078                         operands[outs[BidirectionalSequenceLSTM::kFwOutputCellStateTensor - delta]];
1079                 RunTimeOperandInfo& bwOutputActivationState =
1080                         operands[outs[BidirectionalSequenceLSTM::kBwOutputActivationStateTensor -
1081                                       delta]];
1082                 RunTimeOperandInfo& bwOutputCellState =
1083                         operands[outs[BidirectionalSequenceLSTM::kBwOutputCellStateTensor - delta]];
1084                 success = success &&
1085                           setInfoAndAllocateIfNeeded(&fwOutputActivationState,
1086                                                      fwOutputActivationStateShape, &result) &&
1087                           setInfoAndAllocateIfNeeded(&fwOutputCellState, fwOutputCellStateShape,
1088                                                      &result) &&
1089                           setInfoAndAllocateIfNeeded(&bwOutputActivationState,
1090                                                      bwOutputActivationStateShape, &result) &&
1091                           setInfoAndAllocateIfNeeded(&bwOutputCellState, bwOutputCellStateShape,
1092                                                      &result);
1093             }
1094             success = success && lstm.Eval();
1095         } break;
1096         case OperationType::LSTM: {
1097             RunTimeOperandInfo& scratch = operands[outs[LSTMCell::kScratchBufferTensor]];
1098             RunTimeOperandInfo& outputStateOut = operands[outs[LSTMCell::kOutputStateOutTensor]];
1099             RunTimeOperandInfo& cellStateOut = operands[outs[LSTMCell::kCellStateOutTensor]];
1100             RunTimeOperandInfo& output = operands[outs[LSTMCell::kOutputTensor]];
1101 
1102             Shape scratchShape, outputStateShape, cellStateShape, outputShape;
1103             LSTMCell lstm_cell(operation, operands);
1104 
1105             success = lstm_cell.Prepare(operation, operands, &scratchShape, &outputStateShape,
1106                                         &cellStateShape, &outputShape) &&
1107                       setInfoAndAllocateIfNeeded(&scratch, scratchShape, &result) &&
1108                       setInfoAndAllocateIfNeeded(&outputStateOut, outputStateShape, &result) &&
1109                       setInfoAndAllocateIfNeeded(&cellStateOut, cellStateShape, &result) &&
1110                       setInfoAndAllocateIfNeeded(&output, outputShape, &result) && lstm_cell.Eval();
1111         } break;
1112         case OperationType::RANDOM_MULTINOMIAL: {
1113             if (!allParametersPresent(3, 1)) {
1114                 return ANEURALNETWORKS_BAD_DATA;
1115             }
1116             RunTimeOperandInfo& output = operands[outs[Multinomial::kOutputTensor]];
1117 
1118             Shape outputShape;
1119             Multinomial multinomial(operation, operands);
1120 
1121             success = Multinomial::Prepare(operation, operands, &outputShape) &&
1122                       setInfoAndAllocateIfNeeded(&output, outputShape, &result) &&
1123                       multinomial.Eval();
1124         } break;
1125         case OperationType::RNN: {
1126             if (!allParametersPresent(6, 2)) {
1127                 return ANEURALNETWORKS_BAD_DATA;
1128             }
1129 
1130             RunTimeOperandInfo& hiddenStateOut = operands[outs[RNN::kHiddenStateOutTensor]];
1131             RunTimeOperandInfo& output = operands[outs[RNN::kOutputTensor]];
1132 
1133             Shape hiddenStateShape, outputShape;
1134             RNN rnn_cell(operation, operands);
1135 
1136             success = RNN::Prepare(operation, operands, &hiddenStateShape, &outputShape) &&
1137                       setInfoAndAllocateIfNeeded(&hiddenStateOut, hiddenStateShape, &result) &&
1138                       setInfoAndAllocateIfNeeded(&output, outputShape, &result) && rnn_cell.Eval();
1139         } break;
1140         case OperationType::SVDF: {
1141             RunTimeOperandInfo& stateOut = operands[outs[SVDF::kStateOutTensor]];
1142             RunTimeOperandInfo& output = operands[outs[SVDF::kOutputTensor]];
1143 
1144             Shape stateShape, outputShape;
1145             SVDF svdf(operation, operands);
1146 
1147             success = SVDF::Prepare(operation, operands, &stateShape, &outputShape) &&
1148                       setInfoAndAllocateIfNeeded(&stateOut, stateShape, &result) &&
1149                       setInfoAndAllocateIfNeeded(&output, outputShape, &result) && svdf.Eval();
1150         } break;
1151         case OperationType::BATCH_TO_SPACE_ND: {
1152             const size_t inCount = ins.size();
1153             if ((inCount != 3 && inCount != 2) || !allParametersPresent(inCount, 1)) {
1154                 return ANEURALNETWORKS_BAD_DATA;
1155             }
1156             const RunTimeOperandInfo& input = operands[ins[0]];
1157             const RunTimeOperandInfo& blockSize = operands[ins[1]];
1158             bool data_layout = inCount == 3 ? getScalarData<bool>(operands[ins[2]]) : false;
1159 
1160             RunTimeOperandInfo& output = operands[outs[0]];
1161             Shape outShape = output.shape();
1162 
1163             RunTimeOperandInfo input_tmp, output_tmp;
1164             std::unique_ptr<uint8_t[]> input_tmp_guard, output_tmp_guard;
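            // The kernels below expect NHWC data. When data_layout is true (NCHW), the input is
            // converted into a temporary NHWC buffer whose lifetime is managed by input_tmp_guard.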
1165             if (!convertToNhwc(input_tmp, input, input_tmp_guard, data_layout)) {
1166                 success = false;
1167                 break;
1168             }
1169             output_tmp.lifetime = OperandLifeTime::TEMPORARY_VARIABLE;
1170             output_tmp.buffer = data_layout ? nullptr : output.buffer;
1171             output_tmp.length = data_layout ? 0 : output.length;
1172 
1173             if (!batchToSpacePrepare(input_tmp.shape(),
1174                                      reinterpret_cast<const int32_t*>(blockSize.buffer),
1175                                      blockSize.shape(), &outShape) ||
1176                 !setInfoAndAllocateIfNeeded(&output_tmp, outShape, &result)) {
1177                 if (!data_layout) output.dimensions = output_tmp.dimensions;
1178                 break;
1179             }
1180             switch (input_tmp.type) {
1181                 case OperandType::TENSOR_FLOAT32: {
1182                     success = batchToSpaceGeneric(
1183                             reinterpret_cast<const float*>(input_tmp.buffer), input_tmp.shape(),
1184                             reinterpret_cast<const int32_t*>(blockSize.buffer),
1185                             reinterpret_cast<float*>(output_tmp.buffer), outShape);
1186                     break;
1187                 }
1188                 case OperandType::TENSOR_FLOAT16: {
1189                     success = batchToSpaceGeneric(
1190                             reinterpret_cast<const _Float16*>(input_tmp.buffer), input_tmp.shape(),
1191                             reinterpret_cast<const int32_t*>(blockSize.buffer),
1192                             reinterpret_cast<_Float16*>(output_tmp.buffer), outShape);
1193                     break;
1194                 }
1195                 case OperandType::TENSOR_QUANT8_ASYMM: {
1196                     success = batchToSpaceGeneric(
1197                             reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
1198                             reinterpret_cast<const int32_t*>(blockSize.buffer),
1199                             reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
1200                     break;
1201                 }
1202                 case OperandType::TENSOR_QUANT8_ASYMM_SIGNED: {
1203                     success = batchToSpaceGeneric(
1204                             reinterpret_cast<const int8_t*>(input_tmp.buffer), input_tmp.shape(),
1205                             reinterpret_cast<const int32_t*>(blockSize.buffer),
1206                             reinterpret_cast<int8_t*>(output_tmp.buffer), outShape);
1207                     break;
1208                 }
1209                 default: {
1210                     LOG(ERROR) << "Unsupported data type";
1211                     success = false;
1212                 }
1213             }
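            // In the NCHW case a temporary NHWC output buffer was allocated above; hand it to
            // output_tmp_guard so it is freed after convertFromNhwc copies the result back into
            // the caller's NCHW output.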
1214             if (data_layout) {
1215                 output_tmp_guard.reset(output_tmp.buffer);
1216             }
1217             if (!success || !convertFromNhwc(output, output_tmp, data_layout, &result)) {
1218                 success = false;
1219                 break;
1220             }
1221         } break;
1222         case OperationType::SPACE_TO_BATCH_ND: {
1223             const size_t inCount = ins.size();
1224             if ((inCount != 4 && inCount != 3) || !allParametersPresent(inCount, 1)) {
1225                 return ANEURALNETWORKS_BAD_DATA;
1226             }
1227             const RunTimeOperandInfo& input = operands[ins[0]];
1228             const RunTimeOperandInfo& blockSize = operands[ins[1]];
1229             const RunTimeOperandInfo& paddings = operands[ins[2]];
1230             bool data_layout = inCount == 4 ? getScalarData<bool>(operands[ins[3]]) : false;
1231 
1232             RunTimeOperandInfo& output = operands[outs[0]];
1233             Shape outShape = output.shape();
1234 
1235             RunTimeOperandInfo input_tmp, output_tmp;
1236             std::unique_ptr<uint8_t[]> input_tmp_guard, output_tmp_guard;
1237             if (!convertToNhwc(input_tmp, input, input_tmp_guard, data_layout)) {
1238                 success = false;
1239                 break;
1240             }
1241             output_tmp.lifetime = OperandLifeTime::TEMPORARY_VARIABLE;
1242             output_tmp.buffer = data_layout ? nullptr : output.buffer;
1243             output_tmp.length = data_layout ? 0 : output.length;
1244 
1245             if (!spaceToBatchPrepare(
1246                         input_tmp.shape(), reinterpret_cast<const int32_t*>(blockSize.buffer),
1247                         blockSize.shape(), reinterpret_cast<const int32_t*>(paddings.buffer),
1248                         paddings.shape(), &outShape) ||
1249                 !setInfoAndAllocateIfNeeded(&output_tmp, outShape, &result)) {
1250                 if (!data_layout) output.dimensions = output_tmp.dimensions;
1251                 break;
1252             }
1253             switch (input_tmp.type) {
1254                 case OperandType::TENSOR_FLOAT32: {
1255                     success = spaceToBatchGeneric(
1256                             reinterpret_cast<const float*>(input_tmp.buffer), input_tmp.shape(),
1257                             reinterpret_cast<const int32_t*>(blockSize.buffer),
1258                             reinterpret_cast<const int32_t*>(paddings.buffer), paddings.shape(),
1259                             reinterpret_cast<float*>(output_tmp.buffer), outShape);
1260                     break;
1261                 }
1262                 case OperandType::TENSOR_FLOAT16: {
1263                     success = spaceToBatchGeneric(
1264                             reinterpret_cast<const _Float16*>(input_tmp.buffer), input_tmp.shape(),
1265                             reinterpret_cast<const int32_t*>(blockSize.buffer),
1266                             reinterpret_cast<const int32_t*>(paddings.buffer), paddings.shape(),
1267                             reinterpret_cast<_Float16*>(output_tmp.buffer), outShape);
1268                     break;
1269                 }
1270                 case OperandType::TENSOR_QUANT8_ASYMM: {
1271                     success = spaceToBatchGeneric(
1272                             reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
1273                             reinterpret_cast<const int32_t*>(blockSize.buffer),
1274                             reinterpret_cast<const int32_t*>(paddings.buffer), paddings.shape(),
1275                             reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
1276                     break;
1277                 }
1278                 case OperandType::TENSOR_QUANT8_ASYMM_SIGNED: {
1279                     success = spaceToBatchGeneric(
1280                             reinterpret_cast<const int8_t*>(input_tmp.buffer), input_tmp.shape(),
1281                             reinterpret_cast<const int32_t*>(blockSize.buffer),
1282                             reinterpret_cast<const int32_t*>(paddings.buffer), paddings.shape(),
1283                             reinterpret_cast<int8_t*>(output_tmp.buffer), outShape);
1284                     break;
1285                 }
1286                 default: {
1287                     LOG(ERROR) << "Unsupported data type";
1288                     success = false;
1289                 }
1290             }
1291             if (data_layout) {
1292                 output_tmp_guard.reset(output_tmp.buffer);
1293             }
1294             if (!success || !convertFromNhwc(output, output_tmp, data_layout, &result)) {
1295                 success = false;
1296                 break;
1297             }
1298         } break;
1299         case OperationType::PAD:
1300         case OperationType::PAD_V2: {
1301             const bool isV2 = operation.type == OperationType::PAD_V2;
1302             if (!allParametersPresent(isV2 ? 3 : 2, 1)) {
1303                 return ANEURALNETWORKS_BAD_DATA;
1304             }
1305             const RunTimeOperandInfo& input = operands[ins[0]];
1306             const RunTimeOperandInfo& paddings = operands[ins[1]];
1307 
1308             RunTimeOperandInfo& output = operands[outs[0]];
1309             Shape outShape = output.shape();
1310 
1311             if (!padPrepare(input.shape(), reinterpret_cast<const int32_t*>(paddings.buffer),
1312                             paddings.shape(), &outShape) ||
1313                 !setInfoAndAllocateIfNeeded(&output, outShape, &result)) {
1314                 break;
1315             }
1316             if (input.type == OperandType::TENSOR_FLOAT32) {
1317                 float pad_value = isV2 ? getScalarData<float>(operands[ins[2]]) : 0;
1318                 success = padGeneric(reinterpret_cast<const float*>(input.buffer), input.shape(),
1319                                      reinterpret_cast<const int32_t*>(paddings.buffer), pad_value,
1320                                      reinterpret_cast<float*>(output.buffer), outShape);
1321             } else if (input.type == OperandType::TENSOR_FLOAT16) {
1322                 _Float16 pad_value = isV2 ? getScalarData<_Float16>(operands[ins[2]]) : 0;
1323                 success = padGeneric(reinterpret_cast<const _Float16*>(input.buffer), input.shape(),
1324                                      reinterpret_cast<const int32_t*>(paddings.buffer),
1325                                      static_cast<_Float16>(pad_value),
1326                                      reinterpret_cast<_Float16*>(output.buffer), outShape);
1327             } else if (input.type == OperandType::TENSOR_QUANT8_ASYMM) {
1328                 uint8_t pad_value =
1329                         isV2 ? getScalarData<uint8_t>(operands[ins[2]]) : outShape.offset;
1330                 success = padGeneric(input.buffer, input.shape(),
1331                                      reinterpret_cast<const int32_t*>(paddings.buffer), pad_value,
1332                                      output.buffer, outShape);
1333             } else if (input.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
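                // The signed pad value is stored in a uint8_t; the bit pattern is unchanged,
                // which matches how the signed tensor data is passed to padGeneric as raw bytes.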
1334                 uint8_t pad_value =
1335                         isV2 ? getScalarData<int8_t>(operands[ins[2]]) : outShape.offset;
1336                 success = padGeneric(input.buffer, input.shape(),
1337                                      reinterpret_cast<const int32_t*>(paddings.buffer), pad_value,
1338                                      output.buffer, outShape);
1339             }
1340         } break;
1341         case OperationType::CAST: {
1342             if (!allParametersPresent(1, 1)) {
1343                 return ANEURALNETWORKS_BAD_DATA;
1344             }
1345             const RunTimeOperandInfo& input = operands[ins[0]];
1346 
1347             RunTimeOperandInfo& output = operands[outs[0]];
1348             Shape outShape = output.shape();
1349 
1350             success = cast::prepare(input.shape(), &outShape) &&
1351                       setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
1352                       cast::eval(input.buffer, input.shape(), output.buffer, outShape);
1353         } break;
1354         case OperationType::MEAN: {
1355             if (!allParametersPresent(3, 1)) {
1356                 return ANEURALNETWORKS_BAD_DATA;
1357             }
1358             const RunTimeOperandInfo& input = operands[ins[0]];
1359             const RunTimeOperandInfo& axis = operands[ins[1]];
1360             int32_t keepDims = getScalarData<int32_t>(operands[ins[2]]);
1361 
1362             RunTimeOperandInfo& output = operands[outs[0]];
1363             Shape outShape = output.shape();
1364 
1365             if (!meanPrepare(input.shape(), reinterpret_cast<const int32_t*>(axis.buffer),
1366                              axis.shape(), keepDims > 0, &outShape) ||
1367                 !setInfoAndAllocateIfNeeded(&output, outShape, &result)) {
1368                 break;
1369             }
1370             if (input.type == OperandType::TENSOR_FLOAT16) {
1371                 success = meanFloat16(reinterpret_cast<_Float16*>(input.buffer), input.shape(),
1372                                       reinterpret_cast<const int32_t*>(axis.buffer), axis.shape(),
1373                                       keepDims > 0, reinterpret_cast<_Float16*>(output.buffer),
1374                                       outShape);
1375             } else if (input.type == OperandType::TENSOR_FLOAT32) {
1376                 success = meanGeneric<float, float>(
1377                         reinterpret_cast<float*>(input.buffer), input.shape(),
1378                         reinterpret_cast<const int32_t*>(axis.buffer), axis.shape(), keepDims > 0,
1379                         reinterpret_cast<float*>(output.buffer), outShape);
1380             } else if (input.type == OperandType::TENSOR_QUANT8_ASYMM) {
1381                 success = meanGeneric<uint8_t, int32_t>(
1382                         reinterpret_cast<uint8_t*>(input.buffer), input.shape(),
1383                         reinterpret_cast<const int32_t*>(axis.buffer), axis.shape(), keepDims > 0,
1384                         reinterpret_cast<uint8_t*>(output.buffer), outShape);
1385             } else if (input.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
1386                 success = meanGeneric<int8_t, int32_t>(
1387                         reinterpret_cast<int8_t*>(input.buffer), input.shape(),
1388                         reinterpret_cast<const int32_t*>(axis.buffer), axis.shape(), keepDims > 0,
1389                         reinterpret_cast<int8_t*>(output.buffer), outShape);
1390             }
1391         } break;
1392         case OperationType::ARGMAX:
1393         case OperationType::ARGMIN: {
1394             if (!allParametersPresent(2, 1)) {
1395                 return ANEURALNETWORKS_BAD_DATA;
1396             }
1397             const RunTimeOperandInfo& input = operands[ins[0]];
1398             int32_t axis = getScalarData<int32_t>(operands[ins[1]]);
1399 
1400             RunTimeOperandInfo& output = operands[outs[0]];
1401             Shape outShape = output.shape();
1402 
1403             const bool isArgMin = operation.type == OperationType::ARGMIN;
1404             success = argMinMaxPrepare(input.shape(), axis, &outShape) &&
1405                       setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
1406                       argMinMaxGeneric(input.buffer, input.shape(), axis, isArgMin, output.buffer,
1407                                        outShape);
1408         } break;
1409         case OperationType::EXPAND_DIMS: {
1410             if (!allParametersPresent(2, 1)) {
1411                 return ANEURALNETWORKS_BAD_DATA;
1412             }
1413             const RunTimeOperandInfo& input = operands[ins[0]];
1414             int32_t axis = getScalarData<int32_t>(operands[ins[1]]);
1415 
1416             RunTimeOperandInfo& output = operands[outs[0]];
1417             Shape outShape = output.shape();
1418 
1419             success = expand_dims::prepare(input.shape(), axis, &outShape) &&
1420                       setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
1421                       expand_dims::eval(input.buffer, input.shape(), axis, output.buffer, outShape);
1422         } break;
1423         case OperationType::SPLIT: {
1424             const size_t outCount = outs.size();
1425             if (!allParametersPresent(3, outCount)) {
1426                 return ANEURALNETWORKS_BAD_DATA;
1427             }
1428 
1429             const RunTimeOperandInfo& input = operands[ins[0]];
1430             const int32_t axis = getScalarData<int32_t>(operands[ins[1]]);
1431             const int32_t numOutputs = getScalarData<int32_t>(operands[ins[2]]);
1432 
1433             if (numOutputs != outs.size()) {
1434                 return ANEURALNETWORKS_BAD_DATA;
1435             }
1436 
1437             std::vector<Shape> outputShapes(numOutputs);
1438             for (int i = 0; i < numOutputs; ++i) {
1439                 outputShapes[i] = operands[outs[i]].shape();
1440             }
1441 
1442             success = splitPrepare(input.shape(), axis, numOutputs, &outputShapes);
1443             for (int i = 0; i < numOutputs; ++i) {
1444                 success = success && setInfoAndAllocateIfNeeded(&(operands[outs[i]]),
1445                                                                 outputShapes[i], &result);
1446             }
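            // Dispatch on the element type: each branch gathers the raw output buffer pointers
            // and forwards them to the type-specific split kernel.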
1447             switch (input.type) {
1448                 case OperandType::TENSOR_FLOAT16: {
1449                     std::vector<_Float16*> outputDataPtrs(numOutputs);
1450                     for (int i = 0; i < numOutputs; ++i) {
1451                         outputDataPtrs[i] = reinterpret_cast<_Float16*>(operands[outs[i]].buffer);
1452                     }
1453                     success = success &&
1454                               splitFloat16(reinterpret_cast<const _Float16*>(input.buffer),
1455                                            input.shape(), axis, &outputDataPtrs, outputShapes);
1456                 } break;
1457                 case OperandType::TENSOR_FLOAT32: {
1458                     std::vector<float*> outputDataPtrs(numOutputs);
1459                     for (int i = 0; i < numOutputs; ++i) {
1460                         outputDataPtrs[i] = reinterpret_cast<float*>(operands[outs[i]].buffer);
1461                     }
1462                     success = success &&
1463                               splitFloat32(reinterpret_cast<const float*>(input.buffer),
1464                                            input.shape(), axis, &outputDataPtrs, outputShapes);
1465                 } break;
1466                 case OperandType::TENSOR_INT32: {
1467                     std::vector<int32_t*> outputDataPtrs(numOutputs);
1468                     for (int i = 0; i < numOutputs; ++i) {
1469                         outputDataPtrs[i] = reinterpret_cast<int32_t*>(operands[outs[i]].buffer);
1470                     }
1471                     success = success &&
1472                               splitInt32(reinterpret_cast<const int32_t*>(input.buffer),
1473                                          input.shape(), axis, &outputDataPtrs, outputShapes);
1474                 } break;
1475                 case OperandType::TENSOR_QUANT8_ASYMM: {
1476                     std::vector<uint8_t*> outputDataPtrs(numOutputs);
1477                     for (int i = 0; i < numOutputs; ++i) {
1478                         outputDataPtrs[i] = reinterpret_cast<uint8_t*>(operands[outs[i]].buffer);
1479                     }
1480                     success = success &&
1481                               splitQuant8(reinterpret_cast<const uint8_t*>(input.buffer),
1482                                           input.shape(), axis, &outputDataPtrs, outputShapes);
1483                 } break;
1484                 case OperandType::TENSOR_QUANT8_ASYMM_SIGNED: {
1485                     std::vector<int8_t*> outputDataPtrs(numOutputs);
1486                     for (int i = 0; i < numOutputs; ++i) {
1487                         outputDataPtrs[i] = reinterpret_cast<int8_t*>(operands[outs[i]].buffer);
1488                     }
1489                     success = success &&
1490                               splitQuant8Signed(reinterpret_cast<const int8_t*>(input.buffer),
1491                                                 input.shape(), axis, &outputDataPtrs, outputShapes);
1492                 } break;
1493                 default: {
1494                     return ANEURALNETWORKS_BAD_DATA;
1495                 }
1496             }
1497         } break;
1498         case OperationType::MAXIMUM:
1499         case OperationType::MINIMUM: {
1500             if (!allParametersPresent(2, 1)) {
1501                 return ANEURALNETWORKS_BAD_DATA;
1502             }
1503             const RunTimeOperandInfo& in1 = operands[ins[0]];
1504             const RunTimeOperandInfo& in2 = operands[ins[1]];
1505 
1506             RunTimeOperandInfo& output = operands[outs[0]];
1507             Shape outputShape = output.shape();
1508 
1509             const bool isMinimum = operation.type == OperationType::MINIMUM;
1510             success = maximum_minimum::prepare(in1.shape(), in2.shape(), &outputShape) &&
1511                       setInfoAndAllocateIfNeeded(&output, outputShape, &result) &&
1512                       maximum_minimum::eval(in1.buffer, in1.shape(), in2.buffer, in2.shape(),
1513                                             isMinimum, output.buffer, outputShape);
1514         } break;
1515         case OperationType::GROUPED_CONV_2D: {
1516             const size_t inCount = ins.size();
1517             if ((inCount != 12 && inCount != 9) || !allParametersPresent(inCount, 1)) {
1518                 return ANEURALNETWORKS_BAD_DATA;
1519             }
1520             const RunTimeOperandInfo& input = operands[ins[0]];
1521             const RunTimeOperandInfo& filter = operands[ins[1]];
1522             const RunTimeOperandInfo& bias = operands[ins[2]];
1523 
1524             int32_t padding_left, padding_right;
1525             int32_t padding_top, padding_bottom;
1526             int32_t padding_implicit = 0;
1527             int32_t stride_width, stride_height;
1528             int32_t numGroups;
1529             int32_t activation;
1530             bool data_layout = false;
1531 
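            // The 12-input signature carries explicit per-edge padding; the 9-input signature
            // carries an implicit padding scheme that is resolved further below.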
1532             if (inCount == 12) {
1533                 padding_left = getScalarData<int32_t>(operands[ins[3]]);
1534                 padding_right = getScalarData<int32_t>(operands[ins[4]]);
1535                 padding_top = getScalarData<int32_t>(operands[ins[5]]);
1536                 padding_bottom = getScalarData<int32_t>(operands[ins[6]]);
1537                 stride_width = getScalarData<int32_t>(operands[ins[7]]);
1538                 stride_height = getScalarData<int32_t>(operands[ins[8]]);
1539                 numGroups = getScalarData<int32_t>(operands[ins[9]]);
1540                 activation = getScalarData<int32_t>(operands[ins[10]]);
1541                 data_layout = getScalarData<bool>(operands[ins[11]]);
1542             } else {
1543                 padding_implicit = getScalarData<int32_t>(operands[ins[3]]);
1544                 stride_width = getScalarData<int32_t>(operands[ins[4]]);
1545                 stride_height = getScalarData<int32_t>(operands[ins[5]]);
1546                 numGroups = getScalarData<int32_t>(operands[ins[6]]);
1547                 activation = getScalarData<int32_t>(operands[ins[7]]);
1548                 data_layout = getScalarData<bool>(operands[ins[8]]);
1549             }
1550 
1551             RunTimeOperandInfo& output = operands[outs[0]];
1552             Shape outShape = output.shape();
1553 
1554             RunTimeOperandInfo input_tmp, output_tmp;
1555             std::unique_ptr<uint8_t[]> input_tmp_guard, output_tmp_guard;
1556             if (!convertToNhwc(input_tmp, input, input_tmp_guard, data_layout)) {
1557                 success = false;
1558                 break;
1559             }
1560             output_tmp.lifetime = OperandLifeTime::TEMPORARY_VARIABLE;
1561             output_tmp.buffer = data_layout ? nullptr : output.buffer;
1562             output_tmp.length = data_layout ? 0 : output.length;
1563 
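            // For the implicit-padding signature, derive explicit padding from the NHWC input
            // and the filter's spatial dimensions.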
1564             if (inCount == 9) {
1565                 Shape inputShape = input_tmp.shape();
1566                 Shape filterShape = filter.shape();
1567                 int32_t input_width = getSizeOfDimension(inputShape, 2);
1568                 int32_t input_height = getSizeOfDimension(inputShape, 1);
1569                 int32_t filter_width = getSizeOfDimension(filterShape, 2);
1570                 int32_t filter_height = getSizeOfDimension(filterShape, 1);
1571                 calculateExplicitPadding(input_width, stride_width, filter_width, padding_implicit,
1572                                          &padding_left, &padding_right);
1573                 calculateExplicitPadding(input_height, stride_height, filter_height,
1574                                          padding_implicit, &padding_top, &padding_bottom);
1575             }
1576 
1577             if (!groupedConvPrepare(input_tmp.shape(), filter.shape(), bias.shape(), padding_left,
1578                                     padding_right, padding_top, padding_bottom, stride_width,
1579                                     stride_height, numGroups, &outShape) ||
1580                 !setInfoAndAllocateIfNeeded(&output_tmp, outShape, &result)) {
1581                 if (!data_layout) output.dimensions = output_tmp.dimensions;
1582                 success = false;
1583                 break;
1584             }
1585 
1586             if (input_tmp.type == OperandType::TENSOR_FLOAT32) {
1587                 success = groupedConvFloat32(
1588                         reinterpret_cast<const float*>(input_tmp.buffer), input_tmp.shape(),
1589                         reinterpret_cast<const float*>(filter.buffer), filter.shape(),
1590                         reinterpret_cast<const float*>(bias.buffer), bias.shape(), padding_left,
1591                         padding_right, padding_top, padding_bottom, stride_width, stride_height,
1592                         numGroups, activation, reinterpret_cast<float*>(output_tmp.buffer),
1593                         outShape);
1594             } else if (input_tmp.type == OperandType::TENSOR_FLOAT16) {
1595                 success = groupedConvFloat16(
1596                         reinterpret_cast<const _Float16*>(input_tmp.buffer), input_tmp.shape(),
1597                         reinterpret_cast<const _Float16*>(filter.buffer), filter.shape(),
1598                         reinterpret_cast<const _Float16*>(bias.buffer), bias.shape(), padding_left,
1599                         padding_right, padding_top, padding_bottom, stride_width, stride_height,
1600                         numGroups, activation, reinterpret_cast<_Float16*>(output_tmp.buffer),
1601                         outShape);
1602             } else if (input_tmp.type == OperandType::TENSOR_QUANT8_ASYMM) {
1603                 if (filter.type == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL) {
1604                     success = groupedConvQuant8PerChannel(
1605                             reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
1606                             reinterpret_cast<const int8_t*>(filter.buffer), filter.shape(),
1607                             filter.extraParams.channelQuant().scales.data(),
1608                             reinterpret_cast<const int32_t*>(bias.buffer), bias.shape(),
1609                             padding_left, padding_right, padding_top, padding_bottom, stride_width,
1610                             stride_height, numGroups, activation,
1611                             reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
1612                 } else if (filter.type == OperandType::TENSOR_QUANT8_ASYMM) {
1613                     success = groupedConvQuant8(
1614                             reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
1615                             reinterpret_cast<const uint8_t*>(filter.buffer), filter.shape(),
1616                             reinterpret_cast<const int32_t*>(bias.buffer), bias.shape(),
1617                             padding_left, padding_right, padding_top, padding_bottom, stride_width,
1618                             stride_height, numGroups, activation,
1619                             reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
1620                 }
1621             } else if (input_tmp.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
1622                 if (filter.type == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL) {
1623                     success = groupedConvQuant8PerChannel(
1624                             reinterpret_cast<const int8_t*>(input_tmp.buffer), input_tmp.shape(),
1625                             reinterpret_cast<const int8_t*>(filter.buffer), filter.shape(),
1626                             filter.extraParams.channelQuant().scales.data(),
1627                             reinterpret_cast<const int32_t*>(bias.buffer), bias.shape(),
1628                             padding_left, padding_right, padding_top, padding_bottom, stride_width,
1629                             stride_height, numGroups, activation,
1630                             reinterpret_cast<int8_t*>(output_tmp.buffer), outShape);
1631                 } else if (filter.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
1632                     success = groupedConvQuant8(
1633                             reinterpret_cast<const int8_t*>(input_tmp.buffer), input_tmp.shape(),
1634                             reinterpret_cast<const int8_t*>(filter.buffer), filter.shape(),
1635                             reinterpret_cast<const int32_t*>(bias.buffer), bias.shape(),
1636                             padding_left, padding_right, padding_top, padding_bottom, stride_width,
1637                             stride_height, numGroups, activation,
1638                             reinterpret_cast<int8_t*>(output_tmp.buffer), outShape);
1639                 }
1640             }
1641 
1642             if (data_layout) {
1643                 output_tmp_guard.reset(output_tmp.buffer);
1644             }
1645             if (!success || !convertFromNhwc(output, output_tmp, data_layout, &result)) {
1646                 success = false;
1647                 break;
1648             }
1649         } break;
1650         case OperationType::TILE: {
1651             if (!allParametersPresent(2, 1)) {
1652                 return ANEURALNETWORKS_BAD_DATA;
1653             }
1654             const RunTimeOperandInfo& input = operands[ins[0]];
1655             const RunTimeOperandInfo& multiples = operands[ins[1]];
1656 
1657             RunTimeOperandInfo& output = operands[outs[0]];
1658             Shape outShape = output.shape();
1659 
1660             success =
1661                     tile::prepare(input.shape(), reinterpret_cast<const int32_t*>(multiples.buffer),
1662                                   multiples.shape(), &outShape) &&
1663                     setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
1664                     tile::eval(input.buffer, input.shape(),
1665                                reinterpret_cast<const int32_t*>(multiples.buffer), output.buffer,
1666                                outShape);
1667         } break;
1668         case OperationType::QUANTIZED_16BIT_LSTM: {
1669             if (!allParametersPresent(15, 2)) {
1670                 return ANEURALNETWORKS_BAD_DATA;
1671             }
1672 
1673             RunTimeOperandInfo& cellStateOut =
1674                     operands[outs[QuantizedLSTMCell::kCellStateOutTensor]];
1675             RunTimeOperandInfo& output = operands[outs[QuantizedLSTMCell::kOutputTensor]];
1676 
1677             Shape cellStateOutShape, outputShape;
1678             QuantizedLSTMCell quantizedLSTMCell(operation, operands);
1679 
1680             success = QuantizedLSTMCell::prepare(operation, operands, &cellStateOutShape,
1681                                                  &outputShape) &&
1682                       setInfoAndAllocateIfNeeded(&cellStateOut, cellStateOutShape, &result) &&
1683                       setInfoAndAllocateIfNeeded(&output, outputShape, &result) &&
1684                       quantizedLSTMCell.eval();
1685         } break;
1686         case OperationType::POW: {
1687             if (!allParametersPresent(2, 1)) {
1688                 return ANEURALNETWORKS_BAD_DATA;
1689             }
1690             const RunTimeOperandInfo& base = operands[ins[0]];
1691             const RunTimeOperandInfo& exponent = operands[ins[1]];
1692 
1693             RunTimeOperandInfo& output = operands[outs[0]];
1694             Shape outShape = output.shape();
1695 
1696             success = pow::prepare(base.shape(), exponent.shape(), &outShape) &&
1697                       setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
1698                       pow::eval(base.buffer, base.shape(), exponent.buffer, exponent.shape(),
1699                                 output.buffer, outShape);
1700         } break;
1701         default: {
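            // Operations without a dedicated case above are dispatched through the registered
            // OperationRegistration, which supplies prepare() and execute() callbacks.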
1702             const OperationRegistration* operationRegistration =
1703                     mOperationResolver->findOperation(operation.type);
1704             if (operationRegistration == nullptr) {
1705                 LOG(ERROR) << getOperationName(operation.type) << " not registered";
1706             } else if (operationRegistration->prepare == nullptr ||
1707                        operationRegistration->execute == nullptr) {
1708                 LOG(ERROR) << "Incomplete operation registration: "
1709                            << getOperationName(operation.type);
1710             } else {
1711                 OperationExecutionContext context(&operation, operands);
1712                 success = operationRegistration->flags.allowOmittedOperand ||
1713                           context.checkNoOmittedOperand();
1714                 success = success && (operationRegistration->flags.allowZeroSizedInput ||
1715                                       context.checkNoZeroSizedInput());
1716                 success = success && operationRegistration->prepare(&context) &&
1717                           operationRegistration->execute(&context);
1718                 result = context.getResultCode();
1719             }
1720         }
1721     }
1722     if (!success && result == ANEURALNETWORKS_NO_ERROR) {
1723         result = ANEURALNETWORKS_OP_FAILED;
1724     }
1725     if (result != ANEURALNETWORKS_NO_ERROR) {
1726         LOG(ERROR) << getOperationName(operation.type) << " failed.";
1727     }
1728 
1729     consumeOperationInputs(ins, operands);
1730     return result;
1731 }
1732 
1733 // Copies RunTimeOperandInfo, preserving the original lifetime and numberOfUsesLeft
1734 // to prevent deallocation of subgraph inputs and outputs.
1735 static void setInfoExceptLifetime(RunTimeOperandInfo* to, const RunTimeOperandInfo& from) {
1736     auto originalLifetime = to->lifetime;
1737     auto originalNumberOfUsesLeft = to->numberOfUsesLeft;
1738     *to = from;
1739     to->lifetime = originalLifetime;
1740     to->numberOfUsesLeft = originalNumberOfUsesLeft;
1741 }
1742 
1743 int CpuExecutor::executeIfOperation(const Operation& operation, RunTimeOperandInfo* operands) {
1744     namespace op = operation_if;
1745     const RunTimeOperandInfo& condOperand = operands[operation.inputs[op::kCondBoolOperand]];
1746     if (condOperand.buffer == nullptr) {
1747         LOG(ERROR) << "Cannot read IF condition operand value";
1748         return ANEURALNETWORKS_OP_FAILED;
1749     }
1750     const bool condValue = *reinterpret_cast<const bool8*>(condOperand.buffer);
1751     VLOG(CPUEXE) << "CpuExecutor::executeIfOperation: condition value: " << condValue;
1752 
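    // Select the THEN or ELSE subgraph based on the condition value and set up fresh runtime
    // operand state for it.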
1753     const uint32_t branchInputIndex = condValue ? op::kThenModelOperand : op::kElseModelOperand;
1754     const RunTimeOperandInfo& branchOperand = operands[operation.inputs[branchInputIndex]];
1755     const Subgraph& branchSubgraph = *reinterpret_cast<const Subgraph*>(branchOperand.buffer);
1756     std::vector<RunTimeOperandInfo> branchOperands = initializeRunTimeInfo(branchSubgraph);
1757 
1758     // Initialize inner input and output operands from outer operands.
1759     for (uint32_t i = 0, n = branchSubgraph.inputIndexes.size(); i < n; ++i) {
1760         setInfoExceptLifetime(&branchOperands[branchSubgraph.inputIndexes[i]],
1761                               operands[operation.inputs[op::kFirstInput + i]]);
1762     }
1763     for (uint32_t i = 0, n = branchSubgraph.outputIndexes.size(); i < n; ++i) {
1764         setInfoExceptLifetime(&branchOperands[branchSubgraph.outputIndexes[i]],
1765                               operands[operation.outputs[i]]);
1766     }
1767 
1768     NN_RETURN_IF_ERROR(executeSubgraph(branchSubgraph, branchOperands.data()));
1769     freeUnusedSubgraphOperands(&branchOperands);
1770 
1771     // Update outer outputs.
1772     for (uint32_t i = 0, n = operation.outputs.size(); i < n; ++i) {
1773         setInfoExceptLifetime(&operands[operation.outputs[i]],
1774                               branchOperands[branchSubgraph.outputIndexes[i]]);
1775     }
1776 
1777     consumeOperationInputs(operation.inputs, operands);
1778     return ANEURALNETWORKS_NO_ERROR;
1779 }
1780 
1781 int CpuExecutor::executeWhileOperation(const Operation& operation, RunTimeOperandInfo* operands) {
1782     namespace op = operation_while;
1783     const RunTimeOperandInfo& condModelOperand = operands[operation.inputs[op::kCondModelOperand]];
1784     const RunTimeOperandInfo& bodyModelOperand = operands[operation.inputs[op::kBodyModelOperand]];
1785     const Subgraph& condSubgraph = *reinterpret_cast<const Subgraph*>(condModelOperand.buffer);
1786     const Subgraph& bodySubgraph = *reinterpret_cast<const Subgraph*>(bodyModelOperand.buffer);
1787     std::vector<RunTimeOperandInfo> condOperands = initializeRunTimeInfo(condSubgraph);
1788     std::vector<RunTimeOperandInfo> bodyOperands = initializeRunTimeInfo(bodySubgraph);
1789 
1790     // The code below implements the following sequence of subgraph input and output buffer
1791     // assignments:
1792     // iteration = 0   cond inputs = body inputs = outer inputs   body outputs = tmp1
1793     // iteration = 1   cond inputs = body inputs = tmp1           body outputs = tmp2
1794     // iteration = 2   cond inputs = body inputs = tmp2           body outputs = tmp1
1795     // iteration = 3   cond inputs = body inputs = ...            body outputs = ...
1796 
1797     // For body output double buffering.
1798     std::vector<uint8_t*> tmp1(bodySubgraph.outputIndexes.size());
1799     std::vector<uint8_t*> tmp2(bodySubgraph.outputIndexes.size());
1800 
1801     // Ensure temporary loop buffers and subgraph operands are freed on every exit path.
1802     auto cleanupGuard = base::make_scope_guard(
1803         [&tmp1, &tmp2, &condOperands, &bodyOperands, &operation, &operands] {
1804             auto freeLoopOutputs = [](const std::vector<uint8_t*>& tmp) {
1805                 for (auto buffer : tmp) {
1806                     if (buffer != nullptr) {
1807                         delete[] buffer;
1808                     }
1809                 }
1810             };
1811 
1812             freeLoopOutputs(tmp1);
1813             freeLoopOutputs(tmp2);
1814             freeUnusedSubgraphOperands(&condOperands);
1815             freeUnusedSubgraphOperands(&bodyOperands);
1816             consumeOperationInputs(operation.inputs, operands);
1817         }
1818     );
1819 
1820     // For body outputs with unknown shape, we skip double buffering and
1821     // allocate on each iteration instead. This allows growing output tensors
1822     // inside a WHILE loop.
1823     std::vector<bool> bodyOutputHasUnknownShape(bodySubgraph.outputIndexes.size());
1824     for (uint32_t i = 0, n = bodySubgraph.outputIndexes.size(); i < n; ++i) {
1825         const Operand& operand = bodySubgraph.operands[bodySubgraph.outputIndexes[i]];
1826         bodyOutputHasUnknownShape[i] = nonExtensionOperandSizeOfData(operand) == 0;
1827     }
1828 
1829     // Initialize condition inputs from outer operands.
1830     for (uint32_t i = 0, n = condSubgraph.inputIndexes.size(); i < n; ++i) {
1831         setInfoExceptLifetime(&condOperands[condSubgraph.inputIndexes[i]],
1832                               operands[operation.inputs[op::kFirstInput + i]]);
1833     }
1834 
1835     // Store condition output on the stack.
1836     RunTimeOperandInfo& condOutput = condOperands[condSubgraph.outputIndexes[0]];
1837     bool8 condValue = {/* initialized memory */};
1838     condOutput.buffer = &condValue;
1839     condOutput.length = sizeof(condValue);
1840 
1841     std::chrono::nanoseconds timeoutDuration(mLoopTimeoutDuration);
1842     const auto startTime = std::chrono::steady_clock::now();
1843     for (uint32_t iteration = 0;; ++iteration) {
1844         VLOG(CPUEXE) << "CpuExecutor::executeWhileOperation: iteration " << iteration;
1845         if (iteration != 0) {
1846             // Set condition inputs from previous iteration outputs.
1847             for (uint32_t i = 0, n = bodySubgraph.outputIndexes.size(); i < n; ++i) {
1848                 setInfoExceptLifetime(&condOperands[condSubgraph.inputIndexes[i]],
1849                                       bodyOperands[bodySubgraph.outputIndexes[i]]);
1850             }
1851         }
1852         NN_RETURN_IF_ERROR(executeSubgraph(condSubgraph, condOperands.data()));
1853         VLOG(CPUEXE) << "CpuExecutor::executeWhileOperation: condition value: "
1854                      << static_cast<int>(condValue);
1855         if (!condValue) {
1856             break;
1857         }
1858 
1859         const auto duration = std::chrono::steady_clock::now() - startTime;
1860         if (duration > timeoutDuration) {
1861             LOG(ERROR) << "CpuExecutor::executeWhileOperation: timed out after "
1862                        << std::chrono::duration_cast<std::chrono::milliseconds>(duration).count()
1863                        << " ms";
1864             return ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT;
1865         }
1866 
1867         // Set body inputs from condition inputs.
1868         for (uint32_t i = 0, n = bodySubgraph.inputIndexes.size(); i < n; ++i) {
1869             bodyOperands[bodySubgraph.inputIndexes[i]] = condOperands[condSubgraph.inputIndexes[i]];
1870         }
1871         // Set body outputs, alternating between the tmp1 and tmp2 buffer sets each iteration.
1872         auto& outputBuffer = iteration % 2 == 0 ? tmp1 : tmp2;
1873         for (uint32_t i = 0, n = bodySubgraph.outputIndexes.size(); i < n; ++i) {
1874             RunTimeOperandInfo& info = bodyOperands[bodySubgraph.outputIndexes[i]];
1875             if (bodyOutputHasUnknownShape[i]) {
1876                 // Reset dimensions and buffer.
1877                 info.dimensions = bodySubgraph.operands[bodySubgraph.outputIndexes[i]].dimensions;
1878                 if (outputBuffer[i] != nullptr) {
1879                     delete[] outputBuffer[i];
1880                     outputBuffer[i] = nullptr;
1881                 }
1882             }
1883             info.buffer = outputBuffer[i];
1884         }
1885 
1886         NN_RETURN_IF_ERROR(executeSubgraph(bodySubgraph, bodyOperands.data()));
1887 
1888         // Update output buffer information in case we have allocated new buffers.
1889         for (uint32_t i = 0, n = bodySubgraph.outputIndexes.size(); i < n; ++i) {
1890             outputBuffer[i] = bodyOperands[bodySubgraph.outputIndexes[i]].buffer;
1891         }
1892     }
1893 
1894     // Copy body outputs to outer outputs.
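    // After the final condition check, the condition inputs hold the loop-carried values:
    // the outer inputs if the body never ran, or the outputs of the last body iteration.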
1895     for (uint32_t i = 0, n = operation.outputs.size(); i < n; ++i) {
1896         RunTimeOperandInfo& outerOperand = operands[operation.outputs[i]];
1897         RunTimeOperandInfo& innerOperand = condOperands[condSubgraph.inputIndexes[i]];
1898         if (int error; !setInfoAndAllocateIfNeeded(&outerOperand, innerOperand.shape(), &error)) {
1899             return error;
1900         }
1901         CHECK_EQ(outerOperand.length, innerOperand.length);
1902         // TODO: Use the outer buffer as tmp1 to avoid copies.
1903         std::memcpy(outerOperand.buffer, innerOperand.buffer, innerOperand.length);
1904     }
1905 
1906     return ANEURALNETWORKS_NO_ERROR;
1907 }
1908 
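// Records the dimensions of each model output operand and whether the buffer provided for it
// was large enough to hold the result.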
1909 void CpuExecutor::setOutputShapes(const std::vector<uint32_t>& outputIndexes,
1910                                   const std::vector<RunTimeOperandInfo>& operands) {
1911     mOutputShapes.resize(outputIndexes.size());
1912     for (uint32_t i = 0; i < outputIndexes.size(); i++) {
1913         const uint32_t operandIndex = outputIndexes[i];
1914         const RunTimeOperandInfo& from = operands[operandIndex];
1915         mOutputShapes[i].dimensions = from.dimensions;
1916         mOutputShapes[i].isSufficient = from.isSufficient();
1917     }
1918 }
1919 
1920 // b/109953668, disable OpenMP
1921 #ifdef NNAPI_OPENMP
1922 ScopedOpenmpSettings::ScopedOpenmpSettings() {
1923     mBlocktimeInitial = kmp_get_blocktime();
1924     kmp_set_blocktime(20);  // ms, see b/109645291
1925 
1926 #if NNAPI_LIMIT_CPU_THREADS
1927     // Code not yet enabled. The number of threads is chosen based on benchmarking.
1928     // See the longer comment by the class declaration.
1929     mMaxThreadsInitial = Eigen::nbThreads();
1930     const int nProcs = omp_get_num_procs();
1931     int threads = nProcs;
1932     if (nProcs >= 8) {
1933         threads = nProcs - 4;
1934     } else if (nProcs >= 4) {
1935         threads = nProcs - 2;
1936     }
1937     Eigen::setNbThreads(threads);
1938 #endif
1939 }
1940 
1941 ScopedOpenmpSettings::~ScopedOpenmpSettings() {
1942     kmp_set_blocktime(mBlocktimeInitial);
1943 #if NNAPI_LIMIT_CPU_THREADS
1944     Eigen::setNbThreads(mMaxThreadsInitial);
1945 #endif
1946 }
1947 #endif  // NNAPI_OPENMP
1948 
1949 }  // namespace nn
1950 }  // namespace android
1951