1 /*
2 * Copyright (C) 2017 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #define LOG_TAG "CpuExecutor"
18
19 #include "CpuExecutor.h"
20
21 #include <android/hardware_buffer.h>
22 #include <android-base/scopeguard.h>
23
24 #include <sys/mman.h>
25 #include <vndk/hardware_buffer.h>
26
27 #include <Eigen/Core>
28 #include <memory>
29 #include <utility>
30 #include <vector>
31
32 // b/109953668, disable OpenMP
33 #ifdef NNAPI_OPENMP
34 #include <omp.h>
35 #endif // NNAPI_OPENMP
36
37 #include "ControlFlow.h"
38 #include "NeuralNetworks.h"
39 #include "OperationResolver.h"
40 #include "Operations.h"
41 #include "OperationsUtils.h"
42 #include "Tracing.h"
43
44 namespace android {
45 namespace nn {
46
47 namespace {
48
49 using namespace hal;
50
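// Adapts the interpreter's RunTimeOperandInfo array to the IOperationExecutionContext
// interface consumed by operations registered with the OperationResolver. A typical
// operation implementation reads input shapes and buffers through the getters below,
// calls setOutputShape() to size (and, if needed, allocate) its outputs, and then writes
// results through getOutputBuffer().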
51 class OperationExecutionContext : public IOperationExecutionContext {
52 DISALLOW_IMPLICIT_CONSTRUCTORS(OperationExecutionContext);
53
54 public:
55 OperationExecutionContext(const Operation* operation, RunTimeOperandInfo* operands)
56 : operation(operation), operands(operands) {}
57
58 uint32_t getNumInputs() const override;
59 OperandType getInputType(uint32_t index) const override;
60 Shape getInputShape(uint32_t index) const override;
61 const void* getInputBuffer(uint32_t index) const override;
62 const OperandExtraParams getInputExtraParams(uint32_t index) const override;
63
64 uint32_t getNumOutputs() const override;
65 OperandType getOutputType(uint32_t index) const override;
66 Shape getOutputShape(uint32_t index) const override;
67 void* getOutputBuffer(uint32_t index) override;
68
69 // Return false on failure and store the result code.
70 // Use getResultCode() to retrieve it at the end of the operation execution.
71 bool setOutputShape(uint32_t index, const Shape& shape) override;
72 int getResultCode() const;
73
74 bool isOmittedInput(uint32_t index) const override;
75 bool isOmittedOutput(uint32_t index) const override;
76
77 // Return false if any of the inputs or outputs is omitted, i.e. has a lifetime of NO_VALUE.
78 bool checkNoOmittedOperand() const;
79 // Return false if any of the inputs has a dimension of 0.
80 bool checkNoZeroSizedInput() const;
81
82 private:
83 const RunTimeOperandInfo* getInputInfo(uint32_t index) const;
84 const RunTimeOperandInfo* getOutputInfo(uint32_t index) const;
85 RunTimeOperandInfo* getOutputInfo(uint32_t index);
86
87 const Operation* operation;
88 RunTimeOperandInfo* operands;
89
90 int result = ANEURALNETWORKS_NO_ERROR;
91 };
92
93 const RunTimeOperandInfo* OperationExecutionContext::getInputInfo(uint32_t index) const {
94 CHECK(index < operation->inputs.size());
95 return &operands[operation->inputs[index]];
96 }
97
98 const RunTimeOperandInfo* OperationExecutionContext::getOutputInfo(uint32_t index) const {
99 CHECK(index < operation->outputs.size());
100 return &operands[operation->outputs[index]];
101 }
102
103 RunTimeOperandInfo* OperationExecutionContext::getOutputInfo(uint32_t index) {
104 CHECK(index < operation->outputs.size());
105 return &operands[operation->outputs[index]];
106 }
107
108 OperandType OperationExecutionContext::getInputType(uint32_t index) const {
109 return getInputInfo(index)->type;
110 }
111
112 Shape OperationExecutionContext::getInputShape(uint32_t index) const {
113 return getInputInfo(index)->shape();
114 }
115
116 const void* OperationExecutionContext::getInputBuffer(uint32_t index) const {
117 return getInputInfo(index)->buffer;
118 }
119
120 const OperandExtraParams OperationExecutionContext::getInputExtraParams(uint32_t index) const {
121 return getInputInfo(index)->extraParams;
122 }
123
124 OperandType OperationExecutionContext::getOutputType(uint32_t index) const {
125 return getOutputInfo(index)->type;
126 }
127
128 Shape OperationExecutionContext::getOutputShape(uint32_t index) const {
129 return getOutputInfo(index)->shape();
130 }
131
132 void* OperationExecutionContext::getOutputBuffer(uint32_t index) {
133 return getOutputInfo(index)->buffer;
134 }
135
136 uint32_t OperationExecutionContext::getNumInputs() const {
137 return operation->inputs.size();
138 }
139
140 uint32_t OperationExecutionContext::getNumOutputs() const {
141 return operation->outputs.size();
142 }
143
144 int OperationExecutionContext::getResultCode() const {
145 return result;
146 }
147
148 // TODO: Return the error code directly once we've fully integrated OperationResolver with all ops.
149 // Updates the RunTimeOperandInfo with the newly calculated shape.
150 // Allocates the buffer if needed.
151 //
152 // TODO(b/153081229): This function currently cannot handle extension operands well. We need to
153 // propagate the extension type info into this function.
154 bool setInfoAndAllocateIfNeeded(RunTimeOperandInfo* info, const Shape& shape, int* result) {
155 // For user-provided model output operands, the parameters must match the Shape
156 // calculated from the preparation step.
157 if (info->lifetime == OperandLifeTime::SUBGRAPH_OUTPUT) {
158 if (info->type != shape.type) {
159 LOG(ERROR) << "Invalid type for model output";
160 *result = ANEURALNETWORKS_OP_FAILED;
161 return false;
162 }
163 if (info->scale != shape.scale) {
164 LOG(ERROR) << "Invalid scale for model output";
165 *result = ANEURALNETWORKS_OP_FAILED;
166 return false;
167 }
168 if (info->zeroPoint != shape.offset) {
169 LOG(ERROR) << "Invalid zeroPoint for model output";
170 *result = ANEURALNETWORKS_OP_FAILED;
171 return false;
172 }
173 if (info->extraParams != shape.extraParams) {
174 LOG(ERROR) << "Invalid extraParams for model output";
175 *result = ANEURALNETWORKS_OP_FAILED;
176 return false;
177 }
178 }
179
180 auto combined = combineDimensions(shape.dimensions, info->dimensions);
181 if (!combined.has_value()) {
182 LOG(ERROR) << "Invalid dimensions for model operand";
183 *result = ANEURALNETWORKS_OP_FAILED;
184 return false;
185 }
186 info->dimensions = std::move(combined.value());
187 info->type = shape.type;
188 info->scale = shape.scale;
189 info->zeroPoint = shape.offset;
190 info->extraParams = shape.extraParams;
191
192 // TODO(b/153081229): We bypass the overflow check on extension operands because we do not know
193 // the sizes of extension types.
194 if (!isExtensionOperandType(info->type) &&
195 nonExtensionOperandSizeOfDataOverflowsUInt32(info->type, info->dimensions)) {
196 LOG(ERROR) << "Operand data size overflows uint32_t";
197 *result = ANEURALNETWORKS_OP_FAILED;
198 return false;
199 }
200
201 // Allocate the buffer only if the combined dimension is fully specified
202 if (info->buffer == nullptr && (info->lifetime == OperandLifeTime::TEMPORARY_VARIABLE ||
203 info->lifetime == OperandLifeTime::SUBGRAPH_OUTPUT)) {
204 if (isExtensionOperandType(info->type)) {
205 LOG(ERROR) << "Cannot allocate a variable of an extension type";
206 *result = ANEURALNETWORKS_OP_FAILED;
207 return false;
208 }
209 uint32_t length = nonExtensionOperandSizeOfData(info->type, info->dimensions);
210 if (length > 0) {
211 info->buffer = new uint8_t[length];
212 if (info->buffer == nullptr) {
213 *result = ANEURALNETWORKS_OUT_OF_MEMORY;
214 return false;
215 }
216 info->length = length;
217 }
218 }
219 if (!info->isSufficient()) {
220 uint32_t length = nonExtensionOperandSizeOfData(info->type, info->dimensions);
221 LOG(ERROR) << "Insufficient size for model operand: require = " << length
222 << ", provided = " << info->length;
223 *result = ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE;
224 return false;
225 }
226 *result = ANEURALNETWORKS_NO_ERROR;
227 return true;
228 }
229
230 bool OperationExecutionContext::setOutputShape(uint32_t index, const Shape& shape) {
231 return setInfoAndAllocateIfNeeded(getOutputInfo(index), shape, &result);
232 }
233
234 bool OperationExecutionContext::isOmittedInput(uint32_t index) const {
235 return getInputInfo(index)->lifetime == OperandLifeTime::NO_VALUE;
236 }
237
238 bool OperationExecutionContext::isOmittedOutput(uint32_t index) const {
239 return getOutputInfo(index)->lifetime == OperandLifeTime::NO_VALUE;
240 }
241
242 bool OperationExecutionContext::checkNoOmittedOperand() const {
243 for (uint32_t i = 0; i < operation->inputs.size(); i++) {
244 NN_RET_CHECK(!isOmittedInput(i)) << getOperationName(operation->type) << " input operand "
245 << i << " is required but missing.";
246 }
247 for (uint32_t i = 0; i < operation->outputs.size(); i++) {
248 NN_RET_CHECK(!isOmittedOutput(i)) << getOperationName(operation->type) << " output operand "
249 << i << " is required but missing.";
250 }
251 return true;
252 }
253
254 bool OperationExecutionContext::checkNoZeroSizedInput() const {
255 for (uint32_t i = 0; i < operation->inputs.size(); i++) {
256 if (isOmittedInput(i)) continue;
257 for (uint32_t j = 0; j < getInputInfo(i)->dimensions.size(); j++) {
258 NN_RET_CHECK_NE(getInputInfo(i)->dimensions[j], 0)
259 << getOperationName(operation->type)
260 << " does not support zero-sized tensor, but input " << i << " dimension " << j
261 << " is 0.";
262 }
263 }
264 return true;
265 }
266
267 } // namespace
268
269 // Used to keep a pointer to a memory pool.
270 //
271 // In the case of an "mmap_fd" pool, owns the mmap region
272 // returned by getBuffer() -- i.e., that region goes away
273 // when the RunTimePoolInfo is destroyed or is assigned to.
274 class RunTimePoolInfo::RunTimePoolInfoImpl {
275 public:
276 RunTimePoolInfoImpl(const hidl_memory& hidlMemory, uint8_t* buffer, const sp<IMemory>& memory,
277 AHardwareBuffer* hardwareBuffer, uint32_t size);
278
279 // rule of five...
280 ~RunTimePoolInfoImpl();
281 RunTimePoolInfoImpl(const RunTimePoolInfoImpl&) = delete;
282 RunTimePoolInfoImpl(RunTimePoolInfoImpl&&) noexcept = delete;
283 RunTimePoolInfoImpl& operator=(const RunTimePoolInfoImpl&) = delete;
284 RunTimePoolInfoImpl& operator=(RunTimePoolInfoImpl&&) noexcept = delete;
285
286 uint8_t* getBuffer() const { return mBuffer; }
287 uint32_t getSize() const { return mSize; }
288
289 bool flush() const;
290
291 const hidl_memory& getHidlMemory() const { return mHidlMemory; }
292
293 private:
294 const hidl_memory mHidlMemory; // always used
295 uint8_t* const mBuffer = nullptr; // always used
296 const sp<IMemory> mMemory; // only used when hidlMemory.name() == "ashmem"
297 AHardwareBuffer*
298 mAHardwareBuffer; // only used when hidlMemory.name() == "hardware_buffer_blob"
299 const uint32_t mSize;
300 };
301
302 RunTimePoolInfo::RunTimePoolInfoImpl::RunTimePoolInfoImpl(const hidl_memory& hidlMemory,
303 uint8_t* buffer,
304 const sp<IMemory>& memory,
305 AHardwareBuffer* hardwareBuffer,
306 uint32_t size)
307 : mHidlMemory(hidlMemory),
308 mBuffer(buffer),
309 mMemory(memory),
310 mAHardwareBuffer(hardwareBuffer),
311 mSize(size) {}
312
313 RunTimePoolInfo::RunTimePoolInfoImpl::~RunTimePoolInfoImpl() {
314 if (mBuffer == nullptr) {
315 return;
316 }
317
318 const auto& memType = mHidlMemory.name();
319 if (memType == "ashmem") {
320 // nothing to do
321 } else if (memType == "mmap_fd") {
322 const size_t size = mHidlMemory.size();
323 if (munmap(mBuffer, size)) {
324 LOG(ERROR) << "RunTimePoolInfoImpl::~RunTimePoolInfo(): Can't munmap";
325 }
326 } else if (memType == "hardware_buffer_blob") {
327 AHardwareBuffer_unlock(mAHardwareBuffer, nullptr);
328 } else if (memType == "") {
329 // Represents a POINTER argument; nothing to do
330 } else {
331 LOG(ERROR) << "RunTimePoolInfoImpl::~RunTimePoolInfoImpl(): unsupported hidl_memory type";
332 }
333
334 if (mAHardwareBuffer != nullptr) {
335 AHardwareBuffer_release(mAHardwareBuffer);
336 }
337 }
338
339 // Makes sure the output data are correctly updated after execution.
340 bool RunTimePoolInfo::RunTimePoolInfoImpl::flush() const {
341 const auto& memType = mHidlMemory.name();
342 if (memType == "mmap_fd") {
343 const int prot = mHidlMemory.handle()->data[1];
344 if (prot & PROT_WRITE) {
345 const size_t size = mHidlMemory.size();
346 return msync(mBuffer, size, MS_SYNC) == 0;
347 }
348 }
349 // No-op for other types of memory.
350 return true;
351 }
352
353 // TODO: short term, make shared memory mapping and updating a utility function.
354 // TODO: long term, implement mmap_fd as a hidl IMemory service.
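// Maps a hidl_memory pool into this process. Three memory types are handled below:
// "ashmem" is mapped via mapMemory(), "mmap_fd" is mmap()'ed directly from the embedded
// file descriptor, and "hardware_buffer_blob" is imported and locked as an AHardwareBuffer.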
355 std::optional<RunTimePoolInfo> RunTimePoolInfo::createFromHidlMemory(
356 const hidl_memory& hidlMemory) {
357 uint8_t* buffer = nullptr;
358 sp<IMemory> memory;
359 AHardwareBuffer* hardwareBuffer = nullptr;
360
361 const auto& memType = hidlMemory.name();
362 if (memType == "ashmem") {
363 memory = mapMemory(hidlMemory);
364 if (memory == nullptr) {
365 LOG(ERROR) << "Can't map shared memory.";
366 return std::nullopt;
367 }
368 buffer = static_cast<uint8_t*>(static_cast<void*>(memory->getPointer()));
369 if (buffer == nullptr) {
370 LOG(ERROR) << "Can't access shared memory.";
371 return std::nullopt;
372 }
373 } else if (memType == "mmap_fd") {
374 size_t size = hidlMemory.size();
375 int fd = hidlMemory.handle()->data[0];
376 int prot = hidlMemory.handle()->data[1];
377 size_t offset = getSizeFromInts(hidlMemory.handle()->data[2], hidlMemory.handle()->data[3]);
378 buffer = static_cast<uint8_t*>(mmap(nullptr, size, prot, MAP_SHARED, fd, offset));
379 if (buffer == MAP_FAILED) {
380 LOG(ERROR) << "RunTimePoolInfo::set(): Can't mmap the file descriptor.";
381 return std::nullopt;
382 }
383 } else if (memType == "hardware_buffer_blob") {
384 auto handle = hidlMemory.handle();
385 auto format = AHARDWAREBUFFER_FORMAT_BLOB;
386 auto usage = AHARDWAREBUFFER_USAGE_CPU_READ_OFTEN | AHARDWAREBUFFER_USAGE_CPU_WRITE_OFTEN;
387 const uint32_t width = hidlMemory.size();
388 const uint32_t height = 1; // height is always 1 for BLOB mode AHardwareBuffer.
389 const uint32_t layers = 1; // layers is always 1 for BLOB mode AHardwareBuffer.
390 const uint32_t stride = hidlMemory.size();
391
392 AHardwareBuffer_Desc desc{
393 .width = width,
394 .height = height,
395 .layers = layers,
396 .format = format,
397 .usage = usage,
398 .stride = stride,
399 };
400 status_t status = AHardwareBuffer_createFromHandle(
401 &desc, handle, AHARDWAREBUFFER_CREATE_FROM_HANDLE_METHOD_CLONE, &hardwareBuffer);
402 if (status != NO_ERROR) {
403 LOG(ERROR) << "RunTimePoolInfo Can't create AHardwareBuffer from handle. Error: "
404 << status;
405 return std::nullopt;
406 }
407 void* gBuffer = nullptr;
408 status = AHardwareBuffer_lock(hardwareBuffer, usage, -1, nullptr, &gBuffer);
409 if (status != NO_ERROR) {
410 LOG(ERROR) << "RunTimePoolInfo Can't lock the AHardwareBuffer. Error: " << status;
411 return std::nullopt;
412 }
413 buffer = static_cast<uint8_t*>(gBuffer);
414 } else {
415 LOG(ERROR) << "RunTimePoolInfo::set(): unsupported hidl_memory type";
416 return std::nullopt;
417 }
418
419 const auto impl = std::make_shared<const RunTimePoolInfoImpl>(
420 hidlMemory, buffer, memory, hardwareBuffer, hidlMemory.size());
421 return {RunTimePoolInfo(impl)};
422 }
423
424 RunTimePoolInfo RunTimePoolInfo::createFromExistingBuffer(uint8_t* buffer, uint32_t size) {
425 const auto impl = std::make_shared<const RunTimePoolInfoImpl>(hidl_memory{}, buffer, nullptr,
426 nullptr, size);
427 return {impl};
428 }
429
430 RunTimePoolInfo::RunTimePoolInfo(const std::shared_ptr<const RunTimePoolInfoImpl>& impl)
431 : mImpl(impl) {}
432
433 uint8_t* RunTimePoolInfo::getBuffer() const {
434 return mImpl->getBuffer();
435 }
436
437 uint32_t RunTimePoolInfo::getSize() const {
438 return mImpl->getSize();
439 }
440
441 bool RunTimePoolInfo::flush() const {
442 return mImpl->flush();
443 }
444
445 const hidl_memory& RunTimePoolInfo::getHidlMemory() const {
446 return mImpl->getHidlMemory();
447 }
448
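// Maps every pool of a model or request into a RunTimePoolInfo. On any mapping failure
// the partially filled vector is cleared and false is returned, so callers can treat the
// output as all-or-nothing.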
449 bool setRunTimePoolInfosFromHidlMemories(std::vector<RunTimePoolInfo>* poolInfos,
450 const hidl_vec<hidl_memory>& pools) {
451 CHECK(poolInfos != nullptr);
452 poolInfos->clear();
453 poolInfos->reserve(pools.size());
454 for (const auto& pool : pools) {
455 if (std::optional<RunTimePoolInfo> poolInfo = RunTimePoolInfo::createFromHidlMemory(pool)) {
456 poolInfos->push_back(*poolInfo);
457 } else {
458 LOG(ERROR) << "Could not map pools";
459 poolInfos->clear();
460 return false;
461 }
462 }
463 return true;
464 }
465
466 bool setRunTimePoolInfosFromMemoryPools(std::vector<RunTimePoolInfo>* poolInfos,
467 const hidl_vec<Request::MemoryPool>& pools) {
468 CHECK(poolInfos != nullptr);
469 poolInfos->clear();
470 poolInfos->reserve(pools.size());
471 for (const auto& pool : pools) {
472 if (pool.getDiscriminator() != Request::MemoryPool::hidl_discriminator::hidlMemory) {
473 LOG(ERROR) << "Unknown memory token";
474 poolInfos->clear();
475 return false;
476 }
477 if (std::optional<RunTimePoolInfo> poolInfo =
478 RunTimePoolInfo::createFromHidlMemory(pool.hidlMemory())) {
479 poolInfos->push_back(*poolInfo);
480 } else {
481 LOG(ERROR) << "Could not map pools";
482 poolInfos->clear();
483 return false;
484 }
485 }
486 return true;
487 }
488
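// Copies an NCHW tensor into NHWC order. For dimensions fromDim = {N, C, H, W}, the
// element at (n, c, h, w) lives at index n*C*H*W + c*H*W + (h*W + w) in the source and is
// written sequentially in (n, h, w, c) order into the destination. For example, with
// fromDim = {1, 2, 2, 2} the sequential source order c0hw0, c0hw1, ..., c1hw3 becomes
// hw0c0, hw0c1, hw1c0, hw1c1, ... in the destination.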
489 template <typename T>
490 inline bool convertToNhwcImpl(T* to, const T* from, const std::vector<uint32_t>& fromDim) {
491 uint32_t spatialSize = fromDim[2] * fromDim[3];
492 for (uint32_t n = 0; n < fromDim[0]; n++) {
493 for (uint32_t hw = 0; hw < spatialSize; hw++) {
494 for (uint32_t c = 0; c < fromDim[1]; c++) {
495 uint32_t fromIndex = n * fromDim[1] * spatialSize + c * spatialSize + hw;
496 *to++ = from[fromIndex];
497 }
498 }
499 }
500 return true;
501 }
502
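// Inverse of convertToNhwcImpl: copies an NHWC tensor back into NCHW order. For
// fromDim = {N, H, W, C}, the source element at (n, h, w, c) is read from index
// n*H*W*C + (h*W + w)*C + c and written sequentially in (n, c, h, w) order.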
503 template <typename T>
504 inline bool convertFromNhwcImpl(T* to, const T* from, const std::vector<uint32_t>& fromDim) {
505 uint32_t spatialSize = fromDim[1] * fromDim[2];
506 for (uint32_t n = 0; n < fromDim[0]; n++) {
507 for (uint32_t c = 0; c < fromDim[3]; c++) {
508 for (uint32_t hw = 0; hw < spatialSize; hw++) {
509 uint32_t fromIndex = n * spatialSize * fromDim[3] + hw * fromDim[3] + c;
510 *to++ = from[fromIndex];
511 }
512 }
513 }
514 return true;
515 }
516
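// Presents 'from' to the NHWC-only kernels. When data_layout is true the operand is in
// NCHW order, so a temporary NHWC copy is allocated and its ownership is handed to
// ptr_guard (freed when the caller's guard goes out of scope); otherwise 'to' simply
// aliases the original operand.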
517 static bool convertToNhwc(RunTimeOperandInfo& to, const RunTimeOperandInfo& from,
518 std::unique_ptr<uint8_t[]>& ptr_guard, bool data_layout) {
519 int result;
520 if (from.dimensions.size() != 4) {
521 LOG(ERROR) << "Error converting a non-4-D tensor to NHWC layout";
522 return false;
523 }
524 to.lifetime = OperandLifeTime::TEMPORARY_VARIABLE;
525 if (data_layout) {
526 // convert dimensions
527 Shape inShape = from.shape();
528 auto& fromDim = from.dimensions;
529 inShape.dimensions = {fromDim[0], fromDim[2], fromDim[3], fromDim[1]};
530 // allocate buffer
531 to.buffer = nullptr;
532 if (!setInfoAndAllocateIfNeeded(&to, inShape, &result)) {
533 return false;
534 }
535 ptr_guard.reset(to.buffer);
536 // convert value
537 if (from.type == OperandType::TENSOR_FLOAT32) {
538 return convertToNhwcImpl<float>(reinterpret_cast<float*>(to.buffer),
539 reinterpret_cast<const float*>(from.buffer), fromDim);
540 } else if (from.type == OperandType::TENSOR_FLOAT16) {
541 return convertToNhwcImpl<_Float16>(reinterpret_cast<_Float16*>(to.buffer),
542 reinterpret_cast<const _Float16*>(from.buffer),
543 fromDim);
544 } else if (from.type == OperandType::TENSOR_QUANT8_ASYMM) {
545 return convertToNhwcImpl<uint8_t>(reinterpret_cast<uint8_t*>(to.buffer),
546 reinterpret_cast<const uint8_t*>(from.buffer),
547 fromDim);
548 } else if (from.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
549 return convertToNhwcImpl<int8_t>(reinterpret_cast<int8_t*>(to.buffer),
550 reinterpret_cast<const int8_t*>(from.buffer), fromDim);
551 } else {
552 LOG(ERROR) << "Unsupported data type";
553 return false;
554 }
555 } else {
556 to = from;
557 }
558 return true;
559 }
560
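// Counterpart of convertToNhwc for outputs: when data_layout is true, the NHWC result in
// 'from' is transposed back into the caller's NCHW output operand (allocated through
// setInfoAndAllocateIfNeeded); otherwise the output info is just refreshed to the computed
// shape while reusing the buffer that was already written in place.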
561 static bool convertFromNhwc(RunTimeOperandInfo& to, const RunTimeOperandInfo& from,
562 bool data_layout, int* result) {
563 if (from.dimensions.size() != 4) {
564 LOG(ERROR) << "Error converting a non-4-D tensor from NHWC layout";
565 return false;
566 }
567 if (data_layout) {
568 // convert dimensions
569 Shape outShape = from.shape();
570 auto& fromDim = from.dimensions;
571 outShape.dimensions = {fromDim[0], fromDim[3], fromDim[1], fromDim[2]};
572 // allocate buffer
573 if (!setInfoAndAllocateIfNeeded(&to, outShape, result)) {
574 return false;
575 }
576 // convert value
577 if (from.type == OperandType::TENSOR_FLOAT32) {
578 return convertFromNhwcImpl<float>(reinterpret_cast<float*>(to.buffer),
579 reinterpret_cast<const float*>(from.buffer), fromDim);
580 } else if (from.type == OperandType::TENSOR_FLOAT16) {
581 return convertFromNhwcImpl<_Float16>(reinterpret_cast<_Float16*>(to.buffer),
582 reinterpret_cast<const _Float16*>(from.buffer),
583 fromDim);
584 } else if (from.type == OperandType::TENSOR_QUANT8_ASYMM) {
585 return convertFromNhwcImpl<uint8_t>(reinterpret_cast<uint8_t*>(to.buffer),
586 reinterpret_cast<const uint8_t*>(from.buffer),
587 fromDim);
588 } else if (from.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
589 return convertFromNhwcImpl<int8_t>(reinterpret_cast<int8_t*>(to.buffer),
590 reinterpret_cast<const int8_t*>(from.buffer),
591 fromDim);
592 } else {
593 LOG(ERROR) << "Unsupported data type";
594 return false;
595 }
596 } else {
597 Shape outShape = from.shape();
598 to.buffer = from.buffer;
599 to.length = from.length;
600 if (!setInfoAndAllocateIfNeeded(&to, outShape, result)) {
601 return false;
602 }
603 }
604 return true;
605 }
606
607 // Decrements the usage count for the operands listed. Frees the memory
608 // allocated for any temporary variable with a count of zero.
609 static void consumeOperationInputs(const std::vector<uint32_t>& inputs,
610 RunTimeOperandInfo* operands) {
611 for (uint32_t i : inputs) {
612 auto& info = operands[i];
613 // Only TEMPORARY_VARIABLE operands carry a use count; constants and model inputs/outputs are skipped here.
614 if (info.numberOfUsesLeft == 0) {
615 continue;
616 }
617 info.numberOfUsesLeft--;
618 if (info.numberOfUsesLeft == 0 && info.buffer != nullptr) {
619 delete[] info.buffer;
620 info.buffer = nullptr;
621 }
622 }
623 }
624
625 // This function only frees TEMPORARY_VARIABLE operands that are unused
626 // outputs because consumeOperationInputs takes care of any operands
627 // that are inputs to an operation.
628 static void freeUnusedSubgraphOperands(std::vector<RunTimeOperandInfo>* operands) {
629 for (auto& info : *operands) {
630 if (info.lifetime == OperandLifeTime::TEMPORARY_VARIABLE && info.numberOfUsesLeft == 0 &&
631 info.buffer != nullptr) {
632 delete[] info.buffer;
633 info.buffer = nullptr;
634 }
635 }
636 }
637
638 // Ignore the .pools entry in model and request. This will have been taken care of
639 // by the caller.
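// The overall flow: build RunTimeOperandInfo entries for the main subgraph, bind the
// request inputs/outputs to their pools, execute the operations in order, flush the
// request pools so results are visible to the client, and finally report output shapes.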
640 int CpuExecutor::run(const Model& model, const Request& request,
641 const std::vector<RunTimePoolInfo>& modelPoolInfos,
642 const std::vector<RunTimePoolInfo>& requestPoolInfos) {
643 NNTRACE_CPU(NNTRACE_PHASE_EXECUTION, "run");
644 VLOG(CPUEXE) << "CpuExecutor::run() with request(" << SHOW_IF_DEBUG(toString(request)) << ")";
645 mModelOperandValues = &model.operandValues;
646 mModelPoolInfos = &modelPoolInfos;
647 mReferencedSubgraphs = &model.referenced;
648
649 // b/109953668, disable OpenMP
650 #ifdef NNAPI_OPENMP
651 ScopedOpenmpSettings openMpSettings;
652 #endif // NNAPI_OPENMP
653
654 std::vector<RunTimeOperandInfo> operands = initializeRunTimeInfo(model.main);
655 updateForArguments(model.main.inputIndexes, request.inputs, requestPoolInfos, operands.data());
656 updateForArguments(model.main.outputIndexes, request.outputs, requestPoolInfos,
657 operands.data());
658 int result = executeSubgraph(model.main, operands.data());
659 freeUnusedSubgraphOperands(&operands);
660
661 if (result == ANEURALNETWORKS_NO_ERROR) {
662 VLOG(CPUEXE) << "Completed run normally";
663 for (auto& runtimeInfo : requestPoolInfos) {
664 runtimeInfo.flush();
665 }
666 }
667
668 // Only report the output shapes when the result code is NO_ERROR or OUTPUT_INSUFFICIENT_SIZE.
669 if (result == ANEURALNETWORKS_NO_ERROR || result == ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE) {
670 setOutputShapes(model.main.outputIndexes, operands);
671 } else {
672 mOutputShapes.clear();
673 }
674
675 mFinished = true;
676 mModelOperandValues = nullptr;
677 mModelPoolInfos = nullptr;
678 mReferencedSubgraphs = nullptr;
679 return result;
680 }
681
682 int CpuExecutor::executeSubgraph(const Subgraph& subgraph, RunTimeOperandInfo* operands) {
683 VLOG(CPUEXE) << "CpuExecutor::executeSubgraph " << toString(subgraph);
684 // The graph has the operations serialized in execution order.
685 for (const auto& operation : subgraph.operations) {
686 NN_RETURN_IF_ERROR(executeOperation(operation, operands));
687 }
688 return ANEURALNETWORKS_NO_ERROR;
689 }
690
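// Builds one RunTimeOperandInfo per subgraph operand. Only TEMPORARY_VARIABLE operands
// get a use count (so their buffers can be freed as soon as the last consumer has run);
// constants point directly into the model's operand values or memory pools, SUBGRAPH
// operands store a pointer to the referenced Subgraph, and inputs/outputs/NO_VALUE
// operands start out unbound.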
691 std::vector<RunTimeOperandInfo> CpuExecutor::initializeRunTimeInfo(const Subgraph& subgraph) {
692 VLOG(CPUEXE) << "CpuExecutor::initializeRunTimeInfo";
693 const size_t count = subgraph.operands.size();
694 std::vector<RunTimeOperandInfo> operands(count);
695 for (size_t i = 0; i < count; i++) {
696 const Operand& from = subgraph.operands[i];
697 RunTimeOperandInfo& to = operands[i];
698 to.type = from.type;
699 to.dimensions = from.dimensions;
700 to.scale = from.scale;
701 to.zeroPoint = from.zeroPoint;
702 to.length = from.location.length;
703 to.lifetime = from.lifetime;
704 to.extraParams = from.extraParams;
705 switch (from.lifetime) {
706 case OperandLifeTime::TEMPORARY_VARIABLE:
707 to.buffer = nullptr;
708 to.numberOfUsesLeft = from.numberOfConsumers;
709 break;
710 case OperandLifeTime::CONSTANT_COPY:
711 to.buffer = const_cast<uint8_t*>(&(*mModelOperandValues)[from.location.offset]);
712 to.numberOfUsesLeft = 0;
713 break;
714 case OperandLifeTime::CONSTANT_REFERENCE: {
715 auto poolIndex = from.location.poolIndex;
716 CHECK_LT(poolIndex, mModelPoolInfos->size());
717 auto& r = (*mModelPoolInfos)[poolIndex];
718 to.buffer = r.getBuffer() + from.location.offset;
719 to.numberOfUsesLeft = 0;
720 break;
721 }
722 case OperandLifeTime::SUBGRAPH: {
723 auto subgraphIndex = from.location.offset;
724 CHECK_LT(subgraphIndex, mReferencedSubgraphs->size());
725 to.buffer = reinterpret_cast<uint8_t*>(
726 const_cast<Subgraph*>(&(*mReferencedSubgraphs)[subgraphIndex]));
727 to.numberOfUsesLeft = 0;
728 } break;
729 case OperandLifeTime::SUBGRAPH_INPUT:
730 case OperandLifeTime::SUBGRAPH_OUTPUT:
731 case OperandLifeTime::NO_VALUE:
732 to.buffer = nullptr;
733 to.numberOfUsesLeft = 0;
734 break;
735 }
736 }
737 return operands;
738 }
739
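// Binds model input/output operands to the buffers supplied in the request. A location
// with offset == 0 and length == 0 means "use the entire memory pool"; an argument with
// hasNoValue marks the operand as omitted.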
740 void CpuExecutor::updateForArguments(const std::vector<uint32_t>& indexes,
741 const hal::hidl_vec<hal::RequestArgument>& arguments,
742 const std::vector<RunTimePoolInfo>& requestPoolInfos,
743 RunTimeOperandInfo* operands) {
744 CHECK_EQ(indexes.size(), arguments.size());
745 for (size_t i = 0; i < indexes.size(); i++) {
746 const uint32_t operandIndex = indexes[i];
747 const RequestArgument& from = arguments[i];
748 RunTimeOperandInfo& to = operands[operandIndex];
749 if (from.dimensions.size() > 0) {
750 // It's the responsibility of the caller to validate that
751 // from.dimensions only modifies the dimensions that were
752 // unspecified in the model. That's the case in SampleDriver.cpp
753 // with the call to validateRequest().
754 // TODO make sure that's the case for the default CPU path.
755 to.dimensions = from.dimensions;
756 }
757 if (from.hasNoValue) {
758 to.lifetime = OperandLifeTime::NO_VALUE;
759 CHECK(to.buffer == nullptr);
760 to.length = 0;
761 } else {
762 auto poolIndex = from.location.poolIndex;
763 CHECK_LT(poolIndex, requestPoolInfos.size());
764 auto& r = requestPoolInfos[poolIndex];
765 to.buffer = r.getBuffer() + from.location.offset;
766 if (from.location.offset == 0 && from.location.length == 0) {
767 // Use the entire memory region.
768 to.length = r.getSize();
769 } else {
770 to.length = from.location.length;
771 }
772 }
773 }
774 }
775
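// Executes a single operation. IF and WHILE are delegated to the control flow executors;
// everything else is dispatched through the switch statement below, which prepares the
// output shape, allocates output buffers on demand, and then invokes the reference kernel.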
776 int CpuExecutor::executeOperation(const Operation& operation, RunTimeOperandInfo* operands) {
777 if (hasDeadlinePassed(mDeadline)) {
778 return ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT;
779 }
780 if (operation.type == OperationType::IF) {
781 int result = executeIfOperation(operation, operands);
782 if (result != ANEURALNETWORKS_NO_ERROR) {
783 LOG(ERROR) << "IF failed.";
784 }
785 return result;
786 }
787 if (operation.type == OperationType::WHILE) {
788 int result = executeWhileOperation(operation, operands);
789 if (result != ANEURALNETWORKS_NO_ERROR) {
790 LOG(ERROR) << "WHILE failed.";
791 }
792 return result;
793 }
794
795 // VLOG(CPUEXE) << "CpuExecutor::executeOperation(" << toString(operation) << ")";
796 const hidl_vec<uint32_t>& ins = operation.inputs;
797 const hidl_vec<uint32_t>& outs = operation.outputs;
798 bool success = false;
799 int result = ANEURALNETWORKS_NO_ERROR;
800
801 // Function to verify that the number of input and output parameters
802 // matches what is expected. Also checks that all the parameters have
803 // values. This function is to be used only for operations that do not
804 // accept optional arguments.
805 // TODO Have a version that works for optional arguments.
806 auto allParametersPresent = [&operation, &operands, &ins, &outs](size_t requiredIns,
807 size_t requiredOuts) -> bool {
808 auto verify = [&operation, &operands](size_t requiredCount,
809 const hidl_vec<uint32_t>& indexes,
810 const char* type) -> bool {
811 size_t actualCount = indexes.size();
812 if (actualCount != requiredCount) {
813 LOG(ERROR) << getOperationName(operation.type) << ": Invalid number of " << type
814 << " operands. Got " << actualCount << " of " << requiredCount;
815 return false;
816 }
817 for (size_t i = 0; i < actualCount; i++) {
818 if (operands[indexes[i]].lifetime == OperandLifeTime::NO_VALUE) {
819 LOG(ERROR) << getOperationName(operation.type) << " " << type << " operand "
820 << i << " is required but missing.";
821 return false;
822 }
823 }
824 return true;
825 };
826
827 auto verifyNoZeroSizedInputs = [&operation, &operands](const hidl_vec<uint32_t>& indexes) {
828 for (size_t i = 0; i < indexes.size(); i++) {
829 for (size_t j = 0; j < operands[indexes[i]].dimensions.size(); j++) {
830 if (operands[indexes[i]].dimensions[j] == 0) {
831 LOG(ERROR) << getOperationName(operation.type)
832 << " does not support zero-sized tensor, but input " << i
833 << " dimension " << j << " is zero.";
834 return false;
835 }
836 }
837 }
838 return true;
839 };
840
841 return verify(requiredIns, ins, "in") && verify(requiredOuts, outs, "out") &&
842 verifyNoZeroSizedInputs(ins);
843 };
844
845 switch (operation.type) {
846 case OperationType::OEM_OPERATION: {
847 LOG(ERROR) << "OEM operation not supported for CPU execution";
848 success = false;
849 } break;
850 case OperationType::RESHAPE: {
851 if (!allParametersPresent(2, 1)) {
852 return ANEURALNETWORKS_BAD_DATA;
853 }
854 const RunTimeOperandInfo& input = operands[ins[0]];
855 const RunTimeOperandInfo& targetShape = operands[ins[1]];
856
857 RunTimeOperandInfo& output = operands[outs[0]];
858 Shape outShape = output.shape();
859
860 success = reshapePrepare(input.shape(),
861 reinterpret_cast<const int32_t*>(targetShape.buffer),
862 getNumberOfElements(targetShape.shape()), &outShape) &&
863 setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
864 copyData(input.buffer, input.shape(), output.buffer, outShape);
865 } break;
866 case OperationType::DEPTH_TO_SPACE: {
867 const size_t inCount = ins.size();
868 if ((inCount != 3 && inCount != 2) || !allParametersPresent(inCount, 1)) {
869 return ANEURALNETWORKS_BAD_DATA;
870 }
871 const RunTimeOperandInfo& input = operands[ins[0]];
872 int32_t blockSize = getScalarData<int32_t>(operands[ins[1]]);
873 bool data_layout = inCount == 3 ? getScalarData<bool>(operands[ins[2]]) : false;
874
875 RunTimeOperandInfo& output = operands[outs[0]];
876 Shape outShape = output.shape();
877
878 RunTimeOperandInfo input_tmp, output_tmp;
879 std::unique_ptr<uint8_t[]> input_tmp_guard, output_tmp_guard;
880 if (!convertToNhwc(input_tmp, input, input_tmp_guard, data_layout)) {
881 success = false;
882 break;
883 }
884 output_tmp.lifetime = OperandLifeTime::TEMPORARY_VARIABLE;
885 output_tmp.buffer = data_layout ? nullptr : output.buffer;
886 output_tmp.length = data_layout ? 0 : output.length;
887 if (!depthToSpacePrepare(input_tmp.shape(), blockSize, &outShape) ||
888 !setInfoAndAllocateIfNeeded(&output_tmp, outShape, &result)) {
889 if (!data_layout) output.dimensions = output_tmp.dimensions;
890 break;
891 }
892 switch (input_tmp.type) {
893 case OperandType::TENSOR_FLOAT32: {
894 success = depthToSpaceGeneric(
895 reinterpret_cast<const float*>(input_tmp.buffer), input_tmp.shape(),
896 blockSize, reinterpret_cast<float*>(output_tmp.buffer), outShape);
897 break;
898 }
899 case OperandType::TENSOR_FLOAT16: {
900 success = depthToSpaceGeneric(
901 reinterpret_cast<const _Float16*>(input_tmp.buffer), input_tmp.shape(),
902 blockSize, reinterpret_cast<_Float16*>(output_tmp.buffer), outShape);
903 break;
904 }
905 case OperandType::TENSOR_QUANT8_ASYMM: {
906 success = depthToSpaceGeneric(
907 reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
908 blockSize, reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
909 break;
910 }
911 case OperandType::TENSOR_QUANT8_ASYMM_SIGNED: {
912 success = depthToSpaceGeneric(
913 reinterpret_cast<const int8_t*>(input_tmp.buffer), input_tmp.shape(),
914 blockSize, reinterpret_cast<int8_t*>(output_tmp.buffer), outShape);
915 break;
916 }
917 default: {
918 LOG(ERROR) << "Unsupported data type";
919 success = false;
920 }
921 }
922 if (data_layout) {
923 output_tmp_guard.reset(output_tmp.buffer);
924 }
925 if (!success || !convertFromNhwc(output, output_tmp, data_layout, &result)) {
926 success = false;
927 break;
928 }
929 } break;
930 case OperationType::SPACE_TO_DEPTH: {
931 const size_t inCount = ins.size();
932 if ((inCount != 3 && inCount != 2) || !allParametersPresent(inCount, 1)) {
933 return ANEURALNETWORKS_BAD_DATA;
934 }
935 const RunTimeOperandInfo& input = operands[ins[0]];
936 int32_t blockSize = getScalarData<int32_t>(operands[ins[1]]);
937 bool data_layout = inCount == 3 ? getScalarData<bool>(operands[ins[2]]) : false;
938
939 RunTimeOperandInfo& output = operands[outs[0]];
940 Shape outShape = output.shape();
941
942 RunTimeOperandInfo input_tmp, output_tmp;
943 std::unique_ptr<uint8_t[]> input_tmp_guard, output_tmp_guard;
944 if (!convertToNhwc(input_tmp, input, input_tmp_guard, data_layout)) {
945 success = false;
946 break;
947 }
948 output_tmp.lifetime = OperandLifeTime::TEMPORARY_VARIABLE;
949 output_tmp.buffer = data_layout ? nullptr : output.buffer;
950 output_tmp.length = data_layout ? 0 : output.length;
951
952 if (!spaceToDepthPrepare(input_tmp.shape(), blockSize, &outShape) ||
953 !setInfoAndAllocateIfNeeded(&output_tmp, outShape, &result)) {
954 if (!data_layout) output.dimensions = output_tmp.dimensions;
955 break;
956 }
957 switch (input_tmp.type) {
958 case OperandType::TENSOR_FLOAT32: {
959 success = spaceToDepthGeneric(
960 reinterpret_cast<const float*>(input_tmp.buffer), input_tmp.shape(),
961 blockSize, reinterpret_cast<float*>(output_tmp.buffer), outShape);
962 break;
963 }
964 case OperandType::TENSOR_FLOAT16: {
965 success = spaceToDepthGeneric(
966 reinterpret_cast<const _Float16*>(input_tmp.buffer), input_tmp.shape(),
967 blockSize, reinterpret_cast<_Float16*>(output_tmp.buffer), outShape);
968 break;
969 }
970 case OperandType::TENSOR_QUANT8_ASYMM: {
971 success = spaceToDepthGeneric(
972 reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
973 blockSize, reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
974 break;
975 }
976 case OperandType::TENSOR_QUANT8_ASYMM_SIGNED: {
977 success = spaceToDepthGeneric(
978 reinterpret_cast<const int8_t*>(input_tmp.buffer), input_tmp.shape(),
979 blockSize, reinterpret_cast<int8_t*>(output_tmp.buffer), outShape);
980 break;
981 }
982 default: {
983 LOG(ERROR) << "Unsupported data type";
984 success = false;
985 }
986 }
987 if (data_layout) {
988 output_tmp_guard.reset(output_tmp.buffer);
989 }
990 if (!success || !convertFromNhwc(output, output_tmp, data_layout, &result)) {
991 success = false;
992 break;
993 }
994 } break;
995 case OperationType::EMBEDDING_LOOKUP: {
996 if (!allParametersPresent(2, 1)) {
997 return ANEURALNETWORKS_BAD_DATA;
998 }
999 const RunTimeOperandInfo& values = operands[ins[EmbeddingLookup::kValueTensor]];
1000 const RunTimeOperandInfo& lookups = operands[ins[EmbeddingLookup::kLookupTensor]];
1001 RunTimeOperandInfo& output = operands[outs[EmbeddingLookup::kOutputTensor]];
1002
1003 Shape outputShape;
1004 EmbeddingLookup lookup(operation, operands);
1005
1006 success = embeddingLookupPrepare(values.shape(), lookups.shape(), &outputShape) &&
1007 setInfoAndAllocateIfNeeded(&output, outputShape, &result) && lookup.Eval();
1008 } break;
1009 case OperationType::HASHTABLE_LOOKUP: {
1010 if (!allParametersPresent(3, 2)) {
1011 return ANEURALNETWORKS_BAD_DATA;
1012 }
1013 const RunTimeOperandInfo& lookups = operands[ins[HashtableLookup::kLookupTensor]];
1014 const RunTimeOperandInfo& keys = operands[ins[HashtableLookup::kKeyTensor]];
1015 const RunTimeOperandInfo& values = operands[ins[HashtableLookup::kValueTensor]];
1016
1017 RunTimeOperandInfo& output = operands[outs[HashtableLookup::kOutputTensor]];
1018 RunTimeOperandInfo& hits = operands[outs[HashtableLookup::kHitsTensor]];
1019
1020 Shape outputShape, hitShape;
1021 HashtableLookup lookup(operation, operands);
1022
1023 success = hashtableLookupPrepare(lookups.shape(), keys.shape(), values.shape(),
1024 &outputShape, &hitShape) &&
1025 setInfoAndAllocateIfNeeded(&output, outputShape, &result) &&
1026 setInfoAndAllocateIfNeeded(&hits, hitShape, &result) && lookup.Eval();
1027 } break;
1028 case OperationType::LSH_PROJECTION: {
1029 RunTimeOperandInfo& output = operands[outs[LSHProjection::kOutputTensor]];
1030 Shape outputShape;
1031 if (!LSHProjection::Prepare(operation, operands, &outputShape) ||
1032 !setInfoAndAllocateIfNeeded(&output, outputShape, &result)) {
1033 break;
1034 }
1035
1036 LSHProjection lsh(operation, operands);
1037 const RunTimeOperandInfo& hash = operands[ins[LSHProjection::kHashTensor]];
1038 switch (hash.type) {
1039 case OperandType::TENSOR_FLOAT32: {
1040 success = lsh.Eval<float>();
1041 break;
1042 }
1043 case OperandType::TENSOR_FLOAT16: {
1044 success = lsh.Eval<_Float16>();
1045 break;
1046 }
1047 default: {
1048 success = false;
1049 LOG(ERROR) << "Unsupported data type";
1050 }
1051 }
1052 } break;
1053 case OperationType::BIDIRECTIONAL_SEQUENCE_LSTM: {
1054 const auto merge_outputs = getScalarData<bool>(
1055 operands[ins[BidirectionalSequenceLSTM::kMergeOutputsParam]]);
1056 const bool output_state = (outs.size() == 5 || outs.size() == 6);
1057 RunTimeOperandInfo& fwOutput =
1058 operands[outs[BidirectionalSequenceLSTM::kFwOutputTensor]];
1059 Shape fwOutputShape, bwOutputShape, fwOutputActivationStateShape,
1060 fwOutputCellStateShape, bwOutputActivationStateShape, bwOutputCellStateShape;
1061
1062 BidirectionalSequenceLSTM lstm(operation, operands);
1063 success = lstm.Prepare(operation, operands, &fwOutputShape, &bwOutputShape,
1064 &fwOutputActivationStateShape, &fwOutputCellStateShape,
1065 &bwOutputActivationStateShape, &bwOutputCellStateShape) &&
1066 setInfoAndAllocateIfNeeded(&fwOutput, fwOutputShape, &result);
1067 if (!merge_outputs) {
1068 RunTimeOperandInfo& bwOutput =
1069 operands[outs[BidirectionalSequenceLSTM::kBwOutputTensor]];
1070 success = success && setInfoAndAllocateIfNeeded(&bwOutput, bwOutputShape, &result);
1071 }
1072 if (output_state) {
1073 uint32_t delta = merge_outputs ? 1 : 0;
1074 RunTimeOperandInfo& fwOutputActivationState =
1075 operands[outs[BidirectionalSequenceLSTM::kFwOutputActivationStateTensor -
1076 delta]];
1077 RunTimeOperandInfo& fwOutputCellState =
1078 operands[outs[BidirectionalSequenceLSTM::kFwOutputCellStateTensor - delta]];
1079 RunTimeOperandInfo& bwOutputActivationState =
1080 operands[outs[BidirectionalSequenceLSTM::kBwOutputActivationStateTensor -
1081 delta]];
1082 RunTimeOperandInfo& bwOutputCellState =
1083 operands[outs[BidirectionalSequenceLSTM::kBwOutputCellStateTensor - delta]];
1084 success = success &&
1085 setInfoAndAllocateIfNeeded(&fwOutputActivationState,
1086 fwOutputActivationStateShape, &result) &&
1087 setInfoAndAllocateIfNeeded(&fwOutputCellState, fwOutputCellStateShape,
1088 &result) &&
1089 setInfoAndAllocateIfNeeded(&bwOutputActivationState,
1090 bwOutputActivationStateShape, &result) &&
1091 setInfoAndAllocateIfNeeded(&bwOutputCellState, bwOutputCellStateShape,
1092 &result);
1093 }
1094 success = success && lstm.Eval();
1095 } break;
1096 case OperationType::LSTM: {
1097 RunTimeOperandInfo& scratch = operands[outs[LSTMCell::kScratchBufferTensor]];
1098 RunTimeOperandInfo& outputStateOut = operands[outs[LSTMCell::kOutputStateOutTensor]];
1099 RunTimeOperandInfo& cellStateOut = operands[outs[LSTMCell::kCellStateOutTensor]];
1100 RunTimeOperandInfo& output = operands[outs[LSTMCell::kOutputTensor]];
1101
1102 Shape scratchShape, outputStateShape, cellStateShape, outputShape;
1103 LSTMCell lstm_cell(operation, operands);
1104
1105 success = lstm_cell.Prepare(operation, operands, &scratchShape, &outputStateShape,
1106 &cellStateShape, &outputShape) &&
1107 setInfoAndAllocateIfNeeded(&scratch, scratchShape, &result) &&
1108 setInfoAndAllocateIfNeeded(&outputStateOut, outputStateShape, &result) &&
1109 setInfoAndAllocateIfNeeded(&cellStateOut, cellStateShape, &result) &&
1110 setInfoAndAllocateIfNeeded(&output, outputShape, &result) && lstm_cell.Eval();
1111 } break;
1112 case OperationType::RANDOM_MULTINOMIAL: {
1113 if (!allParametersPresent(3, 1)) {
1114 return ANEURALNETWORKS_BAD_DATA;
1115 }
1116 RunTimeOperandInfo& output = operands[outs[Multinomial::kOutputTensor]];
1117
1118 Shape outputShape;
1119 Multinomial multinomial(operation, operands);
1120
1121 success = Multinomial::Prepare(operation, operands, &outputShape) &&
1122 setInfoAndAllocateIfNeeded(&output, outputShape, &result) &&
1123 multinomial.Eval();
1124 } break;
1125 case OperationType::RNN: {
1126 if (!allParametersPresent(6, 2)) {
1127 return ANEURALNETWORKS_BAD_DATA;
1128 }
1129
1130 RunTimeOperandInfo& hiddenStateOut = operands[outs[RNN::kHiddenStateOutTensor]];
1131 RunTimeOperandInfo& output = operands[outs[RNN::kOutputTensor]];
1132
1133 Shape hiddenStateShape, outputShape;
1134 RNN rnn_cell(operation, operands);
1135
1136 success = RNN::Prepare(operation, operands, &hiddenStateShape, &outputShape) &&
1137 setInfoAndAllocateIfNeeded(&hiddenStateOut, hiddenStateShape, &result) &&
1138 setInfoAndAllocateIfNeeded(&output, outputShape, &result) && rnn_cell.Eval();
1139 } break;
1140 case OperationType::SVDF: {
1141 RunTimeOperandInfo& stateOut = operands[outs[SVDF::kStateOutTensor]];
1142 RunTimeOperandInfo& output = operands[outs[SVDF::kOutputTensor]];
1143
1144 Shape stateShape, outputShape;
1145 SVDF svdf(operation, operands);
1146
1147 success = SVDF::Prepare(operation, operands, &stateShape, &outputShape) &&
1148 setInfoAndAllocateIfNeeded(&stateOut, stateShape, &result) &&
1149 setInfoAndAllocateIfNeeded(&output, outputShape, &result) && svdf.Eval();
1150 } break;
1151 case OperationType::BATCH_TO_SPACE_ND: {
1152 const size_t inCount = ins.size();
1153 if ((inCount != 3 && inCount != 2) || !allParametersPresent(inCount, 1)) {
1154 return ANEURALNETWORKS_BAD_DATA;
1155 }
1156 const RunTimeOperandInfo& input = operands[ins[0]];
1157 const RunTimeOperandInfo& blockSize = operands[ins[1]];
1158 bool data_layout = inCount == 3 ? getScalarData<bool>(operands[ins[2]]) : false;
1159
1160 RunTimeOperandInfo& output = operands[outs[0]];
1161 Shape outShape = output.shape();
1162
1163 RunTimeOperandInfo input_tmp, output_tmp;
1164 std::unique_ptr<uint8_t[]> input_tmp_guard, output_tmp_guard;
1165 if (!convertToNhwc(input_tmp, input, input_tmp_guard, data_layout)) {
1166 success = false;
1167 break;
1168 }
1169 output_tmp.lifetime = OperandLifeTime::TEMPORARY_VARIABLE;
1170 output_tmp.buffer = data_layout ? nullptr : output.buffer;
1171 output_tmp.length = data_layout ? 0 : output.length;
1172
1173 if (!batchToSpacePrepare(input_tmp.shape(),
1174 reinterpret_cast<const int32_t*>(blockSize.buffer),
1175 blockSize.shape(), &outShape) ||
1176 !setInfoAndAllocateIfNeeded(&output_tmp, outShape, &result)) {
1177 if (!data_layout) output.dimensions = output_tmp.dimensions;
1178 break;
1179 }
1180 switch (input_tmp.type) {
1181 case OperandType::TENSOR_FLOAT32: {
1182 success = batchToSpaceGeneric(
1183 reinterpret_cast<const float*>(input_tmp.buffer), input_tmp.shape(),
1184 reinterpret_cast<const int32_t*>(blockSize.buffer),
1185 reinterpret_cast<float*>(output_tmp.buffer), outShape);
1186 break;
1187 }
1188 case OperandType::TENSOR_FLOAT16: {
1189 success = batchToSpaceGeneric(
1190 reinterpret_cast<const _Float16*>(input_tmp.buffer), input_tmp.shape(),
1191 reinterpret_cast<const int32_t*>(blockSize.buffer),
1192 reinterpret_cast<_Float16*>(output_tmp.buffer), outShape);
1193 break;
1194 }
1195 case OperandType::TENSOR_QUANT8_ASYMM: {
1196 success = batchToSpaceGeneric(
1197 reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
1198 reinterpret_cast<const int32_t*>(blockSize.buffer),
1199 reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
1200 break;
1201 }
1202 case OperandType::TENSOR_QUANT8_ASYMM_SIGNED: {
1203 success = batchToSpaceGeneric(
1204 reinterpret_cast<const int8_t*>(input_tmp.buffer), input_tmp.shape(),
1205 reinterpret_cast<const int32_t*>(blockSize.buffer),
1206 reinterpret_cast<int8_t*>(output_tmp.buffer), outShape);
1207 break;
1208 }
1209 default: {
1210 LOG(ERROR) << "Unsupported data type";
1211 success = false;
1212 }
1213 }
1214 if (data_layout) {
1215 output_tmp_guard.reset(output_tmp.buffer);
1216 }
1217 if (!success || !convertFromNhwc(output, output_tmp, data_layout, &result)) {
1218 success = false;
1219 break;
1220 }
1221 } break;
1222 case OperationType::SPACE_TO_BATCH_ND: {
1223 const size_t inCount = ins.size();
1224 if ((inCount != 4 && inCount != 3) || !allParametersPresent(inCount, 1)) {
1225 return ANEURALNETWORKS_BAD_DATA;
1226 }
1227 const RunTimeOperandInfo& input = operands[ins[0]];
1228 const RunTimeOperandInfo& blockSize = operands[ins[1]];
1229 const RunTimeOperandInfo& paddings = operands[ins[2]];
1230 bool data_layout = inCount == 4 ? getScalarData<bool>(operands[ins[3]]) : false;
1231
1232 RunTimeOperandInfo& output = operands[outs[0]];
1233 Shape outShape = output.shape();
1234
1235 RunTimeOperandInfo input_tmp, output_tmp;
1236 std::unique_ptr<uint8_t[]> input_tmp_guard, output_tmp_guard;
1237 if (!convertToNhwc(input_tmp, input, input_tmp_guard, data_layout)) {
1238 success = false;
1239 break;
1240 }
1241 output_tmp.lifetime = OperandLifeTime::TEMPORARY_VARIABLE;
1242 output_tmp.buffer = data_layout ? nullptr : output.buffer;
1243 output_tmp.length = data_layout ? 0 : output.length;
1244
1245 if (!spaceToBatchPrepare(
1246 input_tmp.shape(), reinterpret_cast<const int32_t*>(blockSize.buffer),
1247 blockSize.shape(), reinterpret_cast<const int32_t*>(paddings.buffer),
1248 paddings.shape(), &outShape) ||
1249 !setInfoAndAllocateIfNeeded(&output_tmp, outShape, &result)) {
1250 if (!data_layout) output.dimensions = output_tmp.dimensions;
1251 break;
1252 }
1253 switch (input_tmp.type) {
1254 case OperandType::TENSOR_FLOAT32: {
1255 success = spaceToBatchGeneric(
1256 reinterpret_cast<const float*>(input_tmp.buffer), input_tmp.shape(),
1257 reinterpret_cast<const int32_t*>(blockSize.buffer),
1258 reinterpret_cast<const int32_t*>(paddings.buffer), paddings.shape(),
1259 reinterpret_cast<float*>(output_tmp.buffer), outShape);
1260 break;
1261 }
1262 case OperandType::TENSOR_FLOAT16: {
1263 success = spaceToBatchGeneric(
1264 reinterpret_cast<const _Float16*>(input_tmp.buffer), input_tmp.shape(),
1265 reinterpret_cast<const int32_t*>(blockSize.buffer),
1266 reinterpret_cast<const int32_t*>(paddings.buffer), paddings.shape(),
1267 reinterpret_cast<_Float16*>(output_tmp.buffer), outShape);
1268 break;
1269 }
1270 case OperandType::TENSOR_QUANT8_ASYMM: {
1271 success = spaceToBatchGeneric(
1272 reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
1273 reinterpret_cast<const int32_t*>(blockSize.buffer),
1274 reinterpret_cast<const int32_t*>(paddings.buffer), paddings.shape(),
1275 reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
1276 break;
1277 }
1278 case OperandType::TENSOR_QUANT8_ASYMM_SIGNED: {
1279 success = spaceToBatchGeneric(
1280 reinterpret_cast<const int8_t*>(input_tmp.buffer), input_tmp.shape(),
1281 reinterpret_cast<const int32_t*>(blockSize.buffer),
1282 reinterpret_cast<const int32_t*>(paddings.buffer), paddings.shape(),
1283 reinterpret_cast<int8_t*>(output_tmp.buffer), outShape);
1284 break;
1285 }
1286 default: {
1287 LOG(ERROR) << "Unsupported data type";
1288 success = false;
1289 }
1290 }
1291 if (data_layout) {
1292 output_tmp_guard.reset(output_tmp.buffer);
1293 }
1294 if (!success || !convertFromNhwc(output, output_tmp, data_layout, &result)) {
1295 success = false;
1296 break;
1297 }
1298 } break;
1299 case OperationType::PAD:
1300 case OperationType::PAD_V2: {
1301 const bool isV2 = operation.type == OperationType::PAD_V2;
1302 if (!allParametersPresent(isV2 ? 3 : 2, 1)) {
1303 return ANEURALNETWORKS_BAD_DATA;
1304 }
1305 const RunTimeOperandInfo& input = operands[ins[0]];
1306 const RunTimeOperandInfo& paddings = operands[ins[1]];
1307
1308 RunTimeOperandInfo& output = operands[outs[0]];
1309 Shape outShape = output.shape();
1310
1311 if (!padPrepare(input.shape(), reinterpret_cast<const int32_t*>(paddings.buffer),
1312 paddings.shape(), &outShape) ||
1313 !setInfoAndAllocateIfNeeded(&output, outShape, &result)) {
1314 break;
1315 }
1316 if (input.type == OperandType::TENSOR_FLOAT32) {
1317 float pad_value = isV2 ? getScalarData<float>(operands[ins[2]]) : 0;
1318 success = padGeneric(reinterpret_cast<const float*>(input.buffer), input.shape(),
1319 reinterpret_cast<const int32_t*>(paddings.buffer), pad_value,
1320 reinterpret_cast<float*>(output.buffer), outShape);
1321 } else if (input.type == OperandType::TENSOR_FLOAT16) {
1322 _Float16 pad_value = isV2 ? getScalarData<_Float16>(operands[ins[2]]) : 0;
1323 success = padGeneric(reinterpret_cast<const _Float16*>(input.buffer), input.shape(),
1324 reinterpret_cast<const int32_t*>(paddings.buffer),
1325 static_cast<_Float16>(pad_value),
1326 reinterpret_cast<_Float16*>(output.buffer), outShape);
1327 } else if (input.type == OperandType::TENSOR_QUANT8_ASYMM) {
1328 uint8_t pad_value =
1329 isV2 ? getScalarData<uint8_t>(operands[ins[2]]) : outShape.offset;
1330 success = padGeneric(input.buffer, input.shape(),
1331 reinterpret_cast<const int32_t*>(paddings.buffer), pad_value,
1332 output.buffer, outShape);
1333 } else if (input.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
1334 uint8_t pad_value =
1335 isV2 ? getScalarData<int8_t>(operands[ins[2]]) : outShape.offset;
1336 success = padGeneric(input.buffer, input.shape(),
1337 reinterpret_cast<const int32_t*>(paddings.buffer), pad_value,
1338 output.buffer, outShape);
1339 }
1340 } break;
1341 case OperationType::CAST: {
1342 if (!allParametersPresent(1, 1)) {
1343 return ANEURALNETWORKS_BAD_DATA;
1344 }
1345 const RunTimeOperandInfo& input = operands[ins[0]];
1346
1347 RunTimeOperandInfo& output = operands[outs[0]];
1348 Shape outShape = output.shape();
1349
1350 success = cast::prepare(input.shape(), &outShape) &&
1351 setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
1352 cast::eval(input.buffer, input.shape(), output.buffer, outShape);
1353 } break;
1354 case OperationType::MEAN: {
1355 if (!allParametersPresent(3, 1)) {
1356 return ANEURALNETWORKS_BAD_DATA;
1357 }
1358 const RunTimeOperandInfo& input = operands[ins[0]];
1359 const RunTimeOperandInfo& axis = operands[ins[1]];
1360 int32_t keepDims = getScalarData<int32_t>(operands[ins[2]]);
1361
1362 RunTimeOperandInfo& output = operands[outs[0]];
1363 Shape outShape = output.shape();
1364
1365 if (!meanPrepare(input.shape(), reinterpret_cast<const int32_t*>(axis.buffer),
1366 axis.shape(), keepDims > 0, &outShape) ||
1367 !setInfoAndAllocateIfNeeded(&output, outShape, &result)) {
1368 break;
1369 }
1370 if (input.type == OperandType::TENSOR_FLOAT16) {
1371 success = meanFloat16(reinterpret_cast<_Float16*>(input.buffer), input.shape(),
1372 reinterpret_cast<const int32_t*>(axis.buffer), axis.shape(),
1373 keepDims > 0, reinterpret_cast<_Float16*>(output.buffer),
1374 outShape);
1375 } else if (input.type == OperandType::TENSOR_FLOAT32) {
1376 success = meanGeneric<float, float>(
1377 reinterpret_cast<float*>(input.buffer), input.shape(),
1378 reinterpret_cast<const int32_t*>(axis.buffer), axis.shape(), keepDims > 0,
1379 reinterpret_cast<float*>(output.buffer), outShape);
1380 } else if (input.type == OperandType::TENSOR_QUANT8_ASYMM) {
1381 success = meanGeneric<uint8_t, int32_t>(
1382 reinterpret_cast<uint8_t*>(input.buffer), input.shape(),
1383 reinterpret_cast<const int32_t*>(axis.buffer), axis.shape(), keepDims > 0,
1384 reinterpret_cast<uint8_t*>(output.buffer), outShape);
1385 } else if (input.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
1386 success = meanGeneric<int8_t, int32_t>(
1387 reinterpret_cast<int8_t*>(input.buffer), input.shape(),
1388 reinterpret_cast<const int32_t*>(axis.buffer), axis.shape(), keepDims > 0,
1389 reinterpret_cast<int8_t*>(output.buffer), outShape);
1390 }
1391 } break;
1392 case OperationType::ARGMAX:
1393 case OperationType::ARGMIN: {
1394 if (!allParametersPresent(2, 1)) {
1395 return ANEURALNETWORKS_BAD_DATA;
1396 }
1397 const RunTimeOperandInfo& input = operands[ins[0]];
1398 int32_t axis = getScalarData<int32_t>(operands[ins[1]]);
1399
1400 RunTimeOperandInfo& output = operands[outs[0]];
1401 Shape outShape = output.shape();
1402
1403 const bool isArgMin = operation.type == OperationType::ARGMIN;
1404 success = argMinMaxPrepare(input.shape(), axis, &outShape) &&
1405 setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
1406 argMinMaxGeneric(input.buffer, input.shape(), axis, isArgMin, output.buffer,
1407 outShape);
1408 } break;
1409 case OperationType::EXPAND_DIMS: {
1410 if (!allParametersPresent(2, 1)) {
1411 return ANEURALNETWORKS_BAD_DATA;
1412 }
1413 const RunTimeOperandInfo& input = operands[ins[0]];
1414 int32_t axis = getScalarData<int32_t>(operands[ins[1]]);
1415
1416 RunTimeOperandInfo& output = operands[outs[0]];
1417 Shape outShape = output.shape();
1418
1419 success = expand_dims::prepare(input.shape(), axis, &outShape) &&
1420 setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
1421 expand_dims::eval(input.buffer, input.shape(), axis, output.buffer, outShape);
1422 } break;
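        // SPLIT slices the input into numOutputs equal pieces along the given axis, e.g.
        // splitting a [2, 6] tensor into 3 outputs along axis 1 yields three [2, 2] tensors.
        // The declared output count must match the third input, and every output is allocated
        // before the per-type kernel receives the collected raw output pointers.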
1423 case OperationType::SPLIT: {
1424 const size_t outCount = outs.size();
1425 if (!allParametersPresent(3, outCount)) {
1426 return ANEURALNETWORKS_BAD_DATA;
1427 }
1428
1429 const RunTimeOperandInfo& input = operands[ins[0]];
1430 const int32_t axis = getScalarData<int32_t>(operands[ins[1]]);
1431 const int32_t numOutputs = getScalarData<int32_t>(operands[ins[2]]);
1432
1433             if (static_cast<size_t>(numOutputs) != outs.size()) {
1434 return ANEURALNETWORKS_BAD_DATA;
1435 }
1436
1437 std::vector<Shape> outputShapes(numOutputs);
1438 for (int i = 0; i < numOutputs; ++i) {
1439 outputShapes[i] = operands[outs[i]].shape();
1440 }
1441
1442 success = splitPrepare(input.shape(), axis, numOutputs, &outputShapes);
1443 for (int i = 0; i < numOutputs; ++i) {
1444 success = success && setInfoAndAllocateIfNeeded(&(operands[outs[i]]),
1445 outputShapes[i], &result);
1446 }
1447 switch (input.type) {
1448 case OperandType::TENSOR_FLOAT16: {
1449 std::vector<_Float16*> outputDataPtrs(numOutputs);
1450 for (int i = 0; i < numOutputs; ++i) {
1451 outputDataPtrs[i] = reinterpret_cast<_Float16*>(operands[outs[i]].buffer);
1452 }
1453 success = success &&
1454 splitFloat16(reinterpret_cast<const _Float16*>(input.buffer),
1455 input.shape(), axis, &outputDataPtrs, outputShapes);
1456 } break;
1457 case OperandType::TENSOR_FLOAT32: {
1458 std::vector<float*> outputDataPtrs(numOutputs);
1459 for (int i = 0; i < numOutputs; ++i) {
1460 outputDataPtrs[i] = reinterpret_cast<float*>(operands[outs[i]].buffer);
1461 }
1462 success = success &&
1463 splitFloat32(reinterpret_cast<const float*>(input.buffer),
1464 input.shape(), axis, &outputDataPtrs, outputShapes);
1465 } break;
1466 case OperandType::TENSOR_INT32: {
1467 std::vector<int32_t*> outputDataPtrs(numOutputs);
1468 for (int i = 0; i < numOutputs; ++i) {
1469 outputDataPtrs[i] = reinterpret_cast<int32_t*>(operands[outs[i]].buffer);
1470 }
1471 success = success &&
1472 splitInt32(reinterpret_cast<const int32_t*>(input.buffer),
1473 input.shape(), axis, &outputDataPtrs, outputShapes);
1474 } break;
1475 case OperandType::TENSOR_QUANT8_ASYMM: {
1476 std::vector<uint8_t*> outputDataPtrs(numOutputs);
1477 for (int i = 0; i < numOutputs; ++i) {
1478 outputDataPtrs[i] = reinterpret_cast<uint8_t*>(operands[outs[i]].buffer);
1479 }
1480 success = success &&
1481 splitQuant8(reinterpret_cast<const uint8_t*>(input.buffer),
1482 input.shape(), axis, &outputDataPtrs, outputShapes);
1483 } break;
1484 case OperandType::TENSOR_QUANT8_ASYMM_SIGNED: {
1485 std::vector<int8_t*> outputDataPtrs(numOutputs);
1486 for (int i = 0; i < numOutputs; ++i) {
1487 outputDataPtrs[i] = reinterpret_cast<int8_t*>(operands[outs[i]].buffer);
1488 }
1489 success = success &&
1490 splitQuant8Signed(reinterpret_cast<const int8_t*>(input.buffer),
1491 input.shape(), axis, &outputDataPtrs, outputShapes);
1492 } break;
1493 default: {
1494 return ANEURALNETWORKS_BAD_DATA;
1495 }
1496 }
1497 } break;
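        // MAXIMUM and MINIMUM share one element-wise path; prepare() derives the (broadcast)
        // output shape from the two inputs, and the isMinimum flag picks the comparison.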
1498 case OperationType::MAXIMUM:
1499 case OperationType::MINIMUM: {
1500 if (!allParametersPresent(2, 1)) {
1501 return ANEURALNETWORKS_BAD_DATA;
1502 }
1503 const RunTimeOperandInfo& in1 = operands[ins[0]];
1504 const RunTimeOperandInfo& in2 = operands[ins[1]];
1505
1506 RunTimeOperandInfo& output = operands[outs[0]];
1507 Shape outputShape = output.shape();
1508
1509 const bool isMinimum = operation.type == OperationType::MINIMUM;
1510 success = maximum_minimum::prepare(in1.shape(), in2.shape(), &outputShape) &&
1511 setInfoAndAllocateIfNeeded(&output, outputShape, &result) &&
1512 maximum_minimum::eval(in1.buffer, in1.shape(), in2.buffer, in2.shape(),
1513 isMinimum, output.buffer, outputShape);
1514 } break;
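        // GROUPED_CONV_2D comes in two signatures: 12 inputs with explicit padding or 9 inputs
        // with an implicit padding scheme. When data_layout requests NCHW, the input is
        // converted to NHWC for the kernels and the result is converted back afterwards.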
1515 case OperationType::GROUPED_CONV_2D: {
1516 const size_t inCount = ins.size();
1517 if ((inCount != 12 && inCount != 9) || !allParametersPresent(inCount, 1)) {
1518 return ANEURALNETWORKS_BAD_DATA;
1519 }
1520 const RunTimeOperandInfo& input = operands[ins[0]];
1521 const RunTimeOperandInfo& filter = operands[ins[1]];
1522 const RunTimeOperandInfo& bias = operands[ins[2]];
1523
1524 int32_t padding_left, padding_right;
1525 int32_t padding_top, padding_bottom;
1526 int32_t padding_implicit = 0;
1527 int32_t stride_width, stride_height;
1528 int32_t numGroups;
1529 int32_t activation;
1530 bool data_layout = false;
1531
1532 if (inCount == 12) {
1533 padding_left = getScalarData<int32_t>(operands[ins[3]]);
1534 padding_right = getScalarData<int32_t>(operands[ins[4]]);
1535 padding_top = getScalarData<int32_t>(operands[ins[5]]);
1536 padding_bottom = getScalarData<int32_t>(operands[ins[6]]);
1537 stride_width = getScalarData<int32_t>(operands[ins[7]]);
1538 stride_height = getScalarData<int32_t>(operands[ins[8]]);
1539 numGroups = getScalarData<int32_t>(operands[ins[9]]);
1540 activation = getScalarData<int32_t>(operands[ins[10]]);
1541 data_layout = getScalarData<bool>(operands[ins[11]]);
1542 } else {
1543 padding_implicit = getScalarData<int32_t>(operands[ins[3]]);
1544 stride_width = getScalarData<int32_t>(operands[ins[4]]);
1545 stride_height = getScalarData<int32_t>(operands[ins[5]]);
1546 numGroups = getScalarData<int32_t>(operands[ins[6]]);
1547 activation = getScalarData<int32_t>(operands[ins[7]]);
1548 data_layout = getScalarData<bool>(operands[ins[8]]);
1549 }
1550
1551 RunTimeOperandInfo& output = operands[outs[0]];
1552 Shape outShape = output.shape();
1553
1554 RunTimeOperandInfo input_tmp, output_tmp;
1555 std::unique_ptr<uint8_t[]> input_tmp_guard, output_tmp_guard;
1556 if (!convertToNhwc(input_tmp, input, input_tmp_guard, data_layout)) {
1557 success = false;
1558 break;
1559 }
1560 output_tmp.lifetime = OperandLifeTime::TEMPORARY_VARIABLE;
1561 output_tmp.buffer = data_layout ? nullptr : output.buffer;
1562 output_tmp.length = data_layout ? 0 : output.length;
1563
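            // For the 9-input signature, derive explicit padding from the implicit padding
            // scheme using the NHWC input dimensions (height = dim 1, width = dim 2).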
1564 if (inCount == 9) {
1565 Shape inputShape = input_tmp.shape();
1566 Shape filterShape = filter.shape();
1567 int32_t input_width = getSizeOfDimension(inputShape, 2);
1568 int32_t input_height = getSizeOfDimension(inputShape, 1);
1569 int32_t filter_width = getSizeOfDimension(filterShape, 2);
1570 int32_t filter_height = getSizeOfDimension(filterShape, 1);
1571 calculateExplicitPadding(input_width, stride_width, filter_width, padding_implicit,
1572 &padding_left, &padding_right);
1573 calculateExplicitPadding(input_height, stride_height, filter_height,
1574 padding_implicit, &padding_top, &padding_bottom);
1575 }
1576
1577 if (!groupedConvPrepare(input_tmp.shape(), filter.shape(), bias.shape(), padding_left,
1578 padding_right, padding_top, padding_bottom, stride_width,
1579 stride_height, numGroups, &outShape) ||
1580 !setInfoAndAllocateIfNeeded(&output_tmp, outShape, &result)) {
1581 if (!data_layout) output.dimensions = output_tmp.dimensions;
1582 success = false;
1583 break;
1584 }
1585
1586 if (input_tmp.type == OperandType::TENSOR_FLOAT32) {
1587 success = groupedConvFloat32(
1588 reinterpret_cast<const float*>(input_tmp.buffer), input_tmp.shape(),
1589 reinterpret_cast<const float*>(filter.buffer), filter.shape(),
1590 reinterpret_cast<const float*>(bias.buffer), bias.shape(), padding_left,
1591 padding_right, padding_top, padding_bottom, stride_width, stride_height,
1592 numGroups, activation, reinterpret_cast<float*>(output_tmp.buffer),
1593 outShape);
1594 } else if (input_tmp.type == OperandType::TENSOR_FLOAT16) {
1595 success = groupedConvFloat16(
1596 reinterpret_cast<const _Float16*>(input_tmp.buffer), input_tmp.shape(),
1597 reinterpret_cast<const _Float16*>(filter.buffer), filter.shape(),
1598 reinterpret_cast<const _Float16*>(bias.buffer), bias.shape(), padding_left,
1599 padding_right, padding_top, padding_bottom, stride_width, stride_height,
1600 numGroups, activation, reinterpret_cast<_Float16*>(output_tmp.buffer),
1601 outShape);
1602 } else if (input_tmp.type == OperandType::TENSOR_QUANT8_ASYMM) {
1603 if (filter.type == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL) {
1604 success = groupedConvQuant8PerChannel(
1605 reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
1606 reinterpret_cast<const int8_t*>(filter.buffer), filter.shape(),
1607 filter.extraParams.channelQuant().scales.data(),
1608 reinterpret_cast<const int32_t*>(bias.buffer), bias.shape(),
1609 padding_left, padding_right, padding_top, padding_bottom, stride_width,
1610 stride_height, numGroups, activation,
1611 reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
1612 } else if (filter.type == OperandType::TENSOR_QUANT8_ASYMM) {
1613 success = groupedConvQuant8(
1614 reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
1615 reinterpret_cast<const uint8_t*>(filter.buffer), filter.shape(),
1616 reinterpret_cast<const int32_t*>(bias.buffer), bias.shape(),
1617 padding_left, padding_right, padding_top, padding_bottom, stride_width,
1618 stride_height, numGroups, activation,
1619 reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
1620 }
1621 } else if (input_tmp.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
1622 if (filter.type == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL) {
1623 success = groupedConvQuant8PerChannel(
1624 reinterpret_cast<const int8_t*>(input_tmp.buffer), input_tmp.shape(),
1625 reinterpret_cast<const int8_t*>(filter.buffer), filter.shape(),
1626 filter.extraParams.channelQuant().scales.data(),
1627 reinterpret_cast<const int32_t*>(bias.buffer), bias.shape(),
1628 padding_left, padding_right, padding_top, padding_bottom, stride_width,
1629 stride_height, numGroups, activation,
1630 reinterpret_cast<int8_t*>(output_tmp.buffer), outShape);
1631 } else if (filter.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
1632 success = groupedConvQuant8(
1633 reinterpret_cast<const int8_t*>(input_tmp.buffer), input_tmp.shape(),
1634 reinterpret_cast<const int8_t*>(filter.buffer), filter.shape(),
1635 reinterpret_cast<const int32_t*>(bias.buffer), bias.shape(),
1636 padding_left, padding_right, padding_top, padding_bottom, stride_width,
1637 stride_height, numGroups, activation,
1638 reinterpret_cast<int8_t*>(output_tmp.buffer), outShape);
1639 }
1640 }
1641
1642 if (data_layout) {
1643 output_tmp_guard.reset(output_tmp.buffer);
1644 }
1645 if (!success || !convertFromNhwc(output, output_tmp, data_layout, &result)) {
1646 success = false;
1647 break;
1648 }
1649 } break;
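        // TILE replicates the input along each dimension according to the multiples tensor.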
1650 case OperationType::TILE: {
1651 if (!allParametersPresent(2, 1)) {
1652 return ANEURALNETWORKS_BAD_DATA;
1653 }
1654 const RunTimeOperandInfo& input = operands[ins[0]];
1655 const RunTimeOperandInfo& multiples = operands[ins[1]];
1656
1657 RunTimeOperandInfo& output = operands[outs[0]];
1658 Shape outShape = output.shape();
1659
1660 success =
1661 tile::prepare(input.shape(), reinterpret_cast<const int32_t*>(multiples.buffer),
1662 multiples.shape(), &outShape) &&
1663 setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
1664 tile::eval(input.buffer, input.shape(),
1665 reinterpret_cast<const int32_t*>(multiples.buffer), output.buffer,
1666 outShape);
1667 } break;
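        // QUANTIZED_16BIT_LSTM produces two outputs, the updated cell state and the cell
        // output; both shapes come from QuantizedLSTMCell::prepare() and are allocated before
        // eval() runs.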
1668 case OperationType::QUANTIZED_16BIT_LSTM: {
1669 if (!allParametersPresent(15, 2)) {
1670 return ANEURALNETWORKS_BAD_DATA;
1671 }
1672
1673 RunTimeOperandInfo& cellStateOut =
1674 operands[outs[QuantizedLSTMCell::kCellStateOutTensor]];
1675 RunTimeOperandInfo& output = operands[outs[QuantizedLSTMCell::kOutputTensor]];
1676
1677 Shape cellStateOutShape, outputShape;
1678 QuantizedLSTMCell quantizedLSTMCell(operation, operands);
1679
1680 success = QuantizedLSTMCell::prepare(operation, operands, &cellStateOutShape,
1681 &outputShape) &&
1682 setInfoAndAllocateIfNeeded(&cellStateOut, cellStateOutShape, &result) &&
1683 setInfoAndAllocateIfNeeded(&output, outputShape, &result) &&
1684 quantizedLSTMCell.eval();
1685 } break;
1686 case OperationType::POW: {
1687 if (!allParametersPresent(2, 1)) {
1688 return ANEURALNETWORKS_BAD_DATA;
1689 }
1690 const RunTimeOperandInfo& base = operands[ins[0]];
1691 const RunTimeOperandInfo& exponent = operands[ins[1]];
1692
1693 RunTimeOperandInfo& output = operands[outs[0]];
1694 Shape outShape = output.shape();
1695
1696 success = pow::prepare(base.shape(), exponent.shape(), &outShape) &&
1697 setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
1698 pow::eval(base.buffer, base.shape(), exponent.buffer, exponent.shape(),
1699 output.buffer, outShape);
1700 } break;
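        // Any operation not handled above is dispatched through the OperationResolver. The
        // registration's flags determine whether omitted operands and zero-sized inputs are
        // tolerated before prepare() and execute() run on an OperationExecutionContext.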
1701 default: {
1702 const OperationRegistration* operationRegistration =
1703 mOperationResolver->findOperation(operation.type);
1704 if (operationRegistration == nullptr) {
1705 LOG(ERROR) << getOperationName(operation.type) << " not registered";
1706 } else if (operationRegistration->prepare == nullptr ||
1707 operationRegistration->execute == nullptr) {
1708 LOG(ERROR) << "Incomplete operation registration: "
1709 << getOperationName(operation.type);
1710 } else {
1711 OperationExecutionContext context(&operation, operands);
1712 success = operationRegistration->flags.allowOmittedOperand ||
1713 context.checkNoOmittedOperand();
1714 success = success && (operationRegistration->flags.allowZeroSizedInput ||
1715 context.checkNoZeroSizedInput());
1716 success = success && operationRegistration->prepare(&context) &&
1717 operationRegistration->execute(&context);
1718 result = context.getResultCode();
1719 }
1720 }
1721 }
1722 if (!success && result == ANEURALNETWORKS_NO_ERROR) {
1723 result = ANEURALNETWORKS_OP_FAILED;
1724 }
1725 if (result != ANEURALNETWORKS_NO_ERROR) {
1726 LOG(ERROR) << getOperationName(operation.type) << " failed.";
1727 }
1728
1729 consumeOperationInputs(ins, operands);
1730 return result;
1731 }
1732
1733 // Copies RunTimeOperandInfo, preserving the original lifetime and numberOfUsesLeft
1734 // to prevent deallocation of subgraph inputs and outputs.
1735 static void setInfoExceptLifetime(RunTimeOperandInfo* to, const RunTimeOperandInfo& from) {
1736 auto originalLifetime = to->lifetime;
1737 auto originalNumberOfUsesLeft = to->numberOfUsesLeft;
1738 *to = from;
1739 to->lifetime = originalLifetime;
1740 to->numberOfUsesLeft = originalNumberOfUsesLeft;
1741 }
1742
1743 int CpuExecutor::executeIfOperation(const Operation& operation, RunTimeOperandInfo* operands) {
1744 namespace op = operation_if;
1745 const RunTimeOperandInfo& condOperand = operands[operation.inputs[op::kCondBoolOperand]];
1746 if (condOperand.buffer == nullptr) {
1747 LOG(ERROR) << "Cannot read IF condition operand value";
1748 return ANEURALNETWORKS_OP_FAILED;
1749 }
1750 const bool condValue = *reinterpret_cast<const bool8*>(condOperand.buffer);
1751 VLOG(CPUEXE) << "CpuExecutor::executeIfOperation: condition value: " << condValue;
1752
1753 const uint32_t branchInputIndex = condValue ? op::kThenModelOperand : op::kElseModelOperand;
1754 const RunTimeOperandInfo& branchOperand = operands[operation.inputs[branchInputIndex]];
1755 const Subgraph& branchSubgraph = *reinterpret_cast<const Subgraph*>(branchOperand.buffer);
1756 std::vector<RunTimeOperandInfo> branchOperands = initializeRunTimeInfo(branchSubgraph);
1757
1758 // Initialize inner input and output operands from outer operands.
1759 for (uint32_t i = 0, n = branchSubgraph.inputIndexes.size(); i < n; ++i) {
1760 setInfoExceptLifetime(&branchOperands[branchSubgraph.inputIndexes[i]],
1761 operands[operation.inputs[op::kFirstInput + i]]);
1762 }
1763 for (uint32_t i = 0, n = branchSubgraph.outputIndexes.size(); i < n; ++i) {
1764 setInfoExceptLifetime(&branchOperands[branchSubgraph.outputIndexes[i]],
1765 operands[operation.outputs[i]]);
1766 }
1767
1768 NN_RETURN_IF_ERROR(executeSubgraph(branchSubgraph, branchOperands.data()));
1769 freeUnusedSubgraphOperands(&branchOperands);
1770
1771 // Update outer outputs.
1772 for (uint32_t i = 0, n = operation.outputs.size(); i < n; ++i) {
1773 setInfoExceptLifetime(&operands[operation.outputs[i]],
1774 branchOperands[branchSubgraph.outputIndexes[i]]);
1775 }
1776
1777 consumeOperationInputs(operation.inputs, operands);
1778 return ANEURALNETWORKS_NO_ERROR;
1779 }
1780
1781 int CpuExecutor::executeWhileOperation(const Operation& operation, RunTimeOperandInfo* operands) {
1782 namespace op = operation_while;
1783 const RunTimeOperandInfo& condModelOperand = operands[operation.inputs[op::kCondModelOperand]];
1784 const RunTimeOperandInfo& bodyModelOperand = operands[operation.inputs[op::kBodyModelOperand]];
1785 const Subgraph& condSubgraph = *reinterpret_cast<const Subgraph*>(condModelOperand.buffer);
1786 const Subgraph& bodySubgraph = *reinterpret_cast<const Subgraph*>(bodyModelOperand.buffer);
1787 std::vector<RunTimeOperandInfo> condOperands = initializeRunTimeInfo(condSubgraph);
1788 std::vector<RunTimeOperandInfo> bodyOperands = initializeRunTimeInfo(bodySubgraph);
1789
1790 // The code below implements the following sequence of subgraph input and output buffer
1791 // assignments:
1792     // iteration = 0   cond inputs = body inputs = outer inputs   body outputs = tmp1
1793     // iteration = 1   cond inputs = body inputs = tmp1           body outputs = tmp2
1794     // iteration = 2   cond inputs = body inputs = tmp2           body outputs = tmp1
1795     // iteration = 3   cond inputs = body inputs = ...            body outputs = ...
1796
1797 // For body output double buffering.
1798 std::vector<uint8_t*> tmp1(bodySubgraph.outputIndexes.size());
1799 std::vector<uint8_t*> tmp2(bodySubgraph.outputIndexes.size());
1800
1801     // On every exit path: free the double-buffered loop outputs, release unused subgraph
1801     // operands, and consume this operation's inputs.
1802 auto cleanupGuard = base::make_scope_guard(
1803 [&tmp1, &tmp2, &condOperands, &bodyOperands, &operation, &operands] {
1804 auto freeLoopOutputs = [](const std::vector<uint8_t*>& tmp) {
1805 for (auto buffer : tmp) {
1806 if (buffer != nullptr) {
1807 delete[] buffer;
1808 }
1809 }
1810 };
1811
1812 freeLoopOutputs(tmp1);
1813 freeLoopOutputs(tmp2);
1814 freeUnusedSubgraphOperands(&condOperands);
1815 freeUnusedSubgraphOperands(&bodyOperands);
1816 consumeOperationInputs(operation.inputs, operands);
1817 }
1818 );
1819
1820 // For body outputs with unknown shape, we skip double buffering and
1821 // allocate on each iteration instead. This allows growing output tensors
1822 // inside a WHILE loop.
1823 std::vector<bool> bodyOutputHasUnknownShape(bodySubgraph.outputIndexes.size());
1824 for (uint32_t i = 0, n = bodySubgraph.outputIndexes.size(); i < n; ++i) {
1825 const Operand& operand = bodySubgraph.operands[bodySubgraph.outputIndexes[i]];
1826 bodyOutputHasUnknownShape[i] = nonExtensionOperandSizeOfData(operand) == 0;
1827 }
1828
1829 // Initialize condition inputs from outer operands.
1830 for (uint32_t i = 0, n = condSubgraph.inputIndexes.size(); i < n; ++i) {
1831 setInfoExceptLifetime(&condOperands[condSubgraph.inputIndexes[i]],
1832 operands[operation.inputs[op::kFirstInput + i]]);
1833 }
1834
1835 // Store condition output on the stack.
1836 RunTimeOperandInfo& condOutput = condOperands[condSubgraph.outputIndexes[0]];
1837 bool8 condValue = {/* initialized memory */};
1838 condOutput.buffer = &condValue;
1839 condOutput.length = sizeof(condValue);
1840
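    // Guard against non-terminating loops: once the accumulated wall-clock time exceeds
    // mLoopTimeoutDuration, the WHILE operation fails with MISSED_DEADLINE_TRANSIENT.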
1841 std::chrono::nanoseconds timeoutDuration(mLoopTimeoutDuration);
1842 const auto startTime = std::chrono::steady_clock::now();
1843 for (uint32_t iteration = 0;; ++iteration) {
1844 VLOG(CPUEXE) << "CpuExecutor::executeWhileOperation: iteration " << iteration;
1845 if (iteration != 0) {
1846 // Set condition inputs from previous iteration outputs.
1847 for (uint32_t i = 0, n = bodySubgraph.outputIndexes.size(); i < n; ++i) {
1848 setInfoExceptLifetime(&condOperands[condSubgraph.inputIndexes[i]],
1849 bodyOperands[bodySubgraph.outputIndexes[i]]);
1850 }
1851 }
1852 NN_RETURN_IF_ERROR(executeSubgraph(condSubgraph, condOperands.data()));
1853 VLOG(CPUEXE) << "CpuExecutor::executeWhileOperation: condition value: "
1854 << static_cast<int>(condValue);
1855 if (!condValue) {
1856 break;
1857 }
1858
1859 const auto duration = std::chrono::steady_clock::now() - startTime;
1860 if (duration > timeoutDuration) {
1861 LOG(ERROR) << "CpuExecutor::executeWhileOperation: timed out after "
1862 << std::chrono::duration_cast<std::chrono::milliseconds>(duration).count()
1863 << " ms";
1864 return ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT;
1865 }
1866
1867 // Set body inputs from condition inputs.
1868 for (uint32_t i = 0, n = bodySubgraph.inputIndexes.size(); i < n; ++i) {
1869 bodyOperands[bodySubgraph.inputIndexes[i]] = condOperands[condSubgraph.inputIndexes[i]];
1870 }
1871 // Set body outputs.
1872 auto& outputBuffer = iteration % 2 == 0 ? tmp1 : tmp2;
1873 for (uint32_t i = 0, n = bodySubgraph.outputIndexes.size(); i < n; ++i) {
1874 RunTimeOperandInfo& info = bodyOperands[bodySubgraph.outputIndexes[i]];
1875 if (bodyOutputHasUnknownShape[i]) {
1876 // Reset dimensions and buffer.
1877 info.dimensions = bodySubgraph.operands[bodySubgraph.outputIndexes[i]].dimensions;
1878 if (outputBuffer[i] != nullptr) {
1879 delete[] outputBuffer[i];
1880 outputBuffer[i] = nullptr;
1881 }
1882 }
1883 info.buffer = outputBuffer[i];
1884 }
1885
1886 NN_RETURN_IF_ERROR(executeSubgraph(bodySubgraph, bodyOperands.data()));
1887
1888 // Update output buffer information in case we have allocated new buffers.
1889 for (uint32_t i = 0, n = bodySubgraph.outputIndexes.size(); i < n; ++i) {
1890 outputBuffer[i] = bodyOperands[bodySubgraph.outputIndexes[i]].buffer;
1891 }
1892 }
1893
1894 // Copy body outputs to outer outputs.
1895 for (uint32_t i = 0, n = operation.outputs.size(); i < n; ++i) {
1896 RunTimeOperandInfo& outerOperand = operands[operation.outputs[i]];
1897 RunTimeOperandInfo& innerOperand = condOperands[condSubgraph.inputIndexes[i]];
1898 if (int error; !setInfoAndAllocateIfNeeded(&outerOperand, innerOperand.shape(), &error)) {
1899 return error;
1900 }
1901 CHECK_EQ(outerOperand.length, innerOperand.length);
1902 // TODO: Use the outer buffer as tmp1 to avoid copies.
1903 std::memcpy(outerOperand.buffer, innerOperand.buffer, innerOperand.length);
1904 }
1905
1906 return ANEURALNETWORKS_NO_ERROR;
1907 }
1908
1909 void CpuExecutor::setOutputShapes(const std::vector<uint32_t>& outputIndexes,
1910 const std::vector<RunTimeOperandInfo>& operands) {
1911 mOutputShapes.resize(outputIndexes.size());
1912 for (uint32_t i = 0; i < outputIndexes.size(); i++) {
1913 const uint32_t operandIndex = outputIndexes[i];
1914 const RunTimeOperandInfo& from = operands[operandIndex];
1915 mOutputShapes[i].dimensions = from.dimensions;
1916 mOutputShapes[i].isSufficient = from.isSufficient();
1917 }
1918 }
1919
1920 // b/109953668, disable OpenMP
1921 #ifdef NNAPI_OPENMP
1922 ScopedOpenmpSettings::ScopedOpenmpSettings() {
1923 mBlocktimeInitial = kmp_get_blocktime();
1924 kmp_set_blocktime(20); // ms, see b/109645291
1925
1926 #if NNAPI_LIMIT_CPU_THREADS
1927     // Code not yet enabled. The number of threads should be chosen based on
1928     // benchmarking. See the longer comment by the class declaration.
1929 mMaxThreadsInitial = Eigen::nbThreads();
1930 const int nProcs = omp_get_num_procs();
1931 int threads = nProcs;
1932 if (nProcs >= 8) {
1933 threads = nProcs - 4;
1934 } else if (nProcs >= 4) {
1935 threads = nProcs - 2;
1936 }
1937 Eigen::setNbThreads(threads);
1938 #endif
1939 }
1940
1941 ScopedOpenmpSettings::~ScopedOpenmpSettings() {
1942 kmp_set_blocktime(mBlocktimeInitial);
1943 #if NNAPI_LIMIT_CPU_THREADS
1944 Eigen::setNbThreads(mMaxThreadsInitial);
1945 #endif
1946 }
1947 #endif // NNAPI_OPENMP
1948
1949 } // namespace nn
1950 } // namespace android
1951