1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #define LOG_TAG "Operations"
18 
19 #include <algorithm>
20 #include <cfloat>
21 #include <cmath>
22 #include <numeric>
23 #include <utility>
24 #include <vector>
25 
26 #include "CpuOperationUtils.h"
27 #include "HalInterfaces.h"
28 #include "OperationResolver.h"
29 #include "OperationsUtils.h"
30 #include "Tracing.h"
31 
32 namespace android {
33 namespace nn {
34 namespace bbox_ops {
35 
36 namespace {
37 
38 using namespace hal;
39 
40 struct BoxEncodingCorner {
41     float x1, y1, x2, y2;
42 };
43 struct BoxEncodingCenter {
44     float w, h, x, y;
45 };
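// Boxes are handled in two encodings: corner form (x1, y1, x2, y2) and
// center-size form (w, h, x_center, y_center). The helpers below convert
// between the two.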
46 BoxEncodingCorner toBoxEncodingCorner(const BoxEncodingCenter& ctr) {
47     return {.x1 = ctr.x - ctr.w / 2,
48             .y1 = ctr.y - ctr.h / 2,
49             .x2 = ctr.x + ctr.w / 2,
50             .y2 = ctr.y + ctr.h / 2};
51 }
52 BoxEncodingCenter toBoxEncodingCenter(const BoxEncodingCorner& cnr) {
53     return {.w = cnr.x2 - cnr.x1,
54             .h = cnr.y2 - cnr.y1,
55             .x = (cnr.x1 + cnr.x2) / 2,
56             .y = (cnr.y1 + cnr.y2) / 2};
57 }
58 
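// Applies the axis-aligned box transform: for every ROI and every class, the
// deltas (dx, dy, dw, dh) are applied to the ROI in center-size encoding
// (dx/dy shift the center, exp(dw)/exp(dh) scale the size), and the result is
// clipped to the image bounds (height, width) given by imageInfo for that batch.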
59 inline bool bboxTransformFloat32(const float* roiData, const Shape& roiShape,
60                                  const float* bboxDeltasData, const Shape& bboxDeltasShape,
61                                  const int32_t* batchesData, const Shape& batchesShape,
62                                  const float* imageInfoData, const Shape& imageInfoDataShape,
63                                  float* outputData, const Shape& outputShape) {
64     const uint32_t roiLength = 4;
65     const uint32_t imageLength = 2;
66 
67     uint32_t numClasses = getSizeOfDimension(bboxDeltasShape, 1) / roiLength;
68     uint32_t numBatches = getSizeOfDimension(imageInfoDataShape, 0);
69 
70     const float* roiDataEnd = roiData + getNumberOfElements(roiShape);
71     const float* deltas = bboxDeltasData;
72     float* outPtr = outputData;
73     uint32_t roiIndex = 0;
74     for (const float* roiBase = roiData; roiBase < roiDataEnd; roiBase += roiLength, roiIndex++) {
75         uint32_t batchIndex = batchesData[roiIndex];
76         // Check for malformed data
77         // 1. Invalid batch id
78         // 2. Invalid region: x2 < x1 || y2 < y1
79         NN_RET_CHECK_GE(batchIndex, 0);
80         NN_RET_CHECK_LT(batchIndex, numBatches);
81         NN_RET_CHECK_LE(roiBase[0], roiBase[2]);
82         NN_RET_CHECK_LE(roiBase[1], roiBase[3]);
83 
84         const float* imageInfoBase = imageInfoData + batchIndex * imageLength;
85         float imageHeight = imageInfoBase[0];
86         float imageWidth = imageInfoBase[1];
87         auto roiBefore = toBoxEncodingCenter(
88                 {.x1 = roiBase[0], .y1 = roiBase[1], .x2 = roiBase[2], .y2 = roiBase[3]});
89         for (uint32_t i = 0; i < numClasses; i++) {
90             auto roiAfter = toBoxEncodingCorner({.w = std::exp(deltas[2]) * roiBefore.w,
91                                                  .h = std::exp(deltas[3]) * roiBefore.h,
92                                                  .x = roiBefore.x + deltas[0] * roiBefore.w,
93                                                  .y = roiBefore.y + deltas[1] * roiBefore.h});
94             BoxEncodingCorner clipped = {.x1 = std::min(std::max(roiAfter.x1, 0.0f), imageWidth),
95                                          .y1 = std::min(std::max(roiAfter.y1, 0.0f), imageHeight),
96                                          .x2 = std::min(std::max(roiAfter.x2, 0.0f), imageWidth),
97                                          .y2 = std::min(std::max(roiAfter.y2, 0.0f), imageHeight)};
98             outPtr[0] = clipped.x1;
99             outPtr[1] = clipped.y1;
100             outPtr[2] = clipped.x2;
101             outPtr[3] = clipped.y2;
102             deltas += roiLength;
103             outPtr += roiLength;
104         }
105     }
106     return true;
107 }
108 
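// The FP16 and quantized variants below convert their inputs to float32,
// reuse bboxTransformFloat32, and convert the result back to the output type.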
109 inline bool bboxTransformFloat16(const _Float16* roiData, const Shape& roiShape,
110                                  const _Float16* bboxDeltasData, const Shape& bboxDeltasShape,
111                                  const int32_t* batchesData, const Shape& batchesShape,
112                                  const _Float16* imageInfoData, const Shape& imageInfoDataShape,
113                                  _Float16* outputData, const Shape& outputShape) {
114     std::vector<float> roi_float32(getNumberOfElements(roiShape));
115     convertFloat16ToFloat32(roiData, &roi_float32);
116     std::vector<float> delta_float32(getNumberOfElements(bboxDeltasShape));
117     convertFloat16ToFloat32(bboxDeltasData, &delta_float32);
118     std::vector<float> imageInfo_float32(getNumberOfElements(imageInfoDataShape));
119     convertFloat16ToFloat32(imageInfoData, &imageInfo_float32);
120     std::vector<float> output_float32(getNumberOfElements(outputShape));
121     NN_RET_CHECK(bboxTransformFloat32(roi_float32.data(), roiShape, delta_float32.data(),
122                                       bboxDeltasShape, batchesData, batchesShape,
123                                       imageInfo_float32.data(), imageInfoDataShape,
124                                       output_float32.data(), outputShape));
125     convertFloat32ToFloat16(output_float32, outputData);
126     return true;
127 }
128 
129 inline bool bboxTransformQuant(const uint16_t* roiData, const Shape& roiShape,
130                                const uint8_t* bboxDeltasData, const Shape& bboxDeltasShape,
131                                const int32_t* batchesData, const Shape& batchesShape,
132                                const uint16_t* imageInfoData, const Shape& imageInfoDataShape,
133                                uint16_t* outputData, const Shape& outputShape) {
134     std::vector<float> roi_float32(getNumberOfElements(roiShape));
135     convertQuantToFloat32(roiData, roiShape.scale, roiShape.offset, &roi_float32);
136     std::vector<float> delta_float32(getNumberOfElements(bboxDeltasShape));
137     convertQuantToFloat32(bboxDeltasData, bboxDeltasShape.scale, bboxDeltasShape.offset,
138                           &delta_float32);
139     std::vector<float> imageInfo_float32(getNumberOfElements(imageInfoDataShape));
140     convertQuantToFloat32(imageInfoData, imageInfoDataShape.scale, imageInfoDataShape.offset,
141                           &imageInfo_float32);
142     std::vector<float> output_float32(getNumberOfElements(outputShape));
143     NN_RET_CHECK(bboxTransformFloat32(roi_float32.data(), roiShape, delta_float32.data(),
144                                       bboxDeltasShape, batchesData, batchesShape,
145                                       imageInfo_float32.data(), imageInfoDataShape,
146                                       output_float32.data(), outputShape));
147     convertFloat32ToQuant(output_float32, outputShape.scale, outputShape.offset, outputData);
148     return true;
149 }
150 
151 inline bool bboxTransformQuant(const uint16_t* roiData, const Shape& roiShape,
152                                const int8_t* bboxDeltasData, const Shape& bboxDeltasShape,
153                                const int32_t* batchesData, const Shape& batchesShape,
154                                const uint16_t* imageInfoData, const Shape& imageInfoDataShape,
155                                uint16_t* outputData, const Shape& outputShape) {
156     std::vector<float> roi_float32(getNumberOfElements(roiShape));
157     convertQuantToFloat32(roiData, roiShape.scale, roiShape.offset, &roi_float32);
158     std::vector<float> delta_float32(getNumberOfElements(bboxDeltasShape));
159     convertQuantToFloat32<int8_t>(bboxDeltasData, bboxDeltasShape.scale, bboxDeltasShape.offset,
160                                   &delta_float32);
161     std::vector<float> imageInfo_float32(getNumberOfElements(imageInfoDataShape));
162     convertQuantToFloat32(imageInfoData, imageInfoDataShape.scale, imageInfoDataShape.offset,
163                           &imageInfo_float32);
164     std::vector<float> output_float32(getNumberOfElements(outputShape));
165     NN_RET_CHECK(bboxTransformFloat32(roi_float32.data(), roiShape, delta_float32.data(),
166                                       bboxDeltasShape, batchesData, batchesShape,
167                                       imageInfo_float32.data(), imageInfoDataShape,
168                                       output_float32.data(), outputShape));
169     convertFloat32ToQuant(output_float32, outputShape.scale, outputShape.offset, outputData);
170     return true;
171 }
172 
173 // Given pointers to two axis-aligned bounding boxes (x1, y1, x2, y2), return their intersection-over-union (IoU).
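// The IoU is computed as areaIntersect / (area1 + area2 - areaIntersect).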
174 float getIoUAxisAligned(const float* roi1, const float* roi2) {
175     const float area1 = (roi1[2] - roi1[0]) * (roi1[3] - roi1[1]);
176     const float area2 = (roi2[2] - roi2[0]) * (roi2[3] - roi2[1]);
177     const float x1 = std::max(roi1[0], roi2[0]);
178     const float x2 = std::min(roi1[2], roi2[2]);
179     const float y1 = std::max(roi1[1], roi2[1]);
180     const float y2 = std::min(roi1[3], roi2[3]);
181     const float w = std::max(x2 - x1, 0.0f);
182     const float h = std::max(y2 - y1, 0.0f);
183     const float areaIntersect = w * h;
184     const float areaUnion = area1 + area2 - areaIntersect;
185     return areaIntersect / areaUnion;
186 }
187 
188 }  // namespace
189 
190 namespace axis_aligned_bbox_transform {
191 
192 constexpr char kOperationName[] = "AXIS_ALIGNED_BBOX_TRANSFORM";
193 
194 constexpr uint32_t kNumInputs = 4;
195 constexpr uint32_t kRoiTensor = 0;
196 constexpr uint32_t kDeltaTensor = 1;
197 constexpr uint32_t kBatchesTensor = 2;
198 constexpr uint32_t kImageInfoTensor = 3;
199 
200 constexpr uint32_t kNumOutputs = 1;
201 constexpr uint32_t kOutputTensor = 0;
202 
203 bool validate(const IOperationValidationContext* context) {
204     NN_RET_CHECK_EQ(context->getNumInputs(), kNumInputs);
205     NN_RET_CHECK_EQ(context->getNumOutputs(), kNumOutputs);
206     std::vector<OperandType> inExpectedTypes;
207     auto inputType = context->getInputType(kRoiTensor);
208     auto deltaInputType = context->getInputType(kDeltaTensor);
209     if (inputType == OperandType::TENSOR_FLOAT32 || inputType == OperandType::TENSOR_FLOAT16) {
210         inExpectedTypes = {inputType, inputType, OperandType::TENSOR_INT32, inputType};
211     } else if (inputType == OperandType::TENSOR_QUANT16_ASYMM) {
212         if (deltaInputType == OperandType::TENSOR_QUANT8_ASYMM ||
213             deltaInputType == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
214             inExpectedTypes = {OperandType::TENSOR_QUANT16_ASYMM, deltaInputType,
215                                OperandType::TENSOR_INT32, OperandType::TENSOR_QUANT16_ASYMM};
216         } else {
217             LOG(ERROR) << "Unsupported input tensor type for operation " << kOperationName;
218             return false;
219         }
220     } else {
221         LOG(ERROR) << "Unsupported input tensor type for operation " << kOperationName;
222         return false;
223     }
224     NN_RET_CHECK(validateInputTypes(context, inExpectedTypes));
225     NN_RET_CHECK(validateOutputTypes(context, {inputType}));
226     return validateHalVersion(context, HalVersion::V1_2);
227 }
228 
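// prepare() checks tensor ranks and shape consistency (roi: [numRois, 4],
// bboxDeltas: [numRois, numClasses * 4], batches: [numRois],
// imageInfo: [numBatches, 2]) and sets the output shape to
// [numRois, numClasses * 4]. Quantized ROI/imageInfo tensors must use
// scale 0.125 and zero offset, which the output inherits.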
229 bool prepare(IOperationExecutionContext* context) {
230     Shape roiShape = context->getInputShape(kRoiTensor);
231     Shape bboxDeltasShape = context->getInputShape(kDeltaTensor);
232     Shape batchesShape = context->getInputShape(kBatchesTensor);
233     Shape imageInfoShape = context->getInputShape(kImageInfoTensor);
234     Shape outputShape = context->getOutputShape(kOutputTensor);
235 
236     NN_RET_CHECK_EQ(getNumberOfDimensions(roiShape), 2);
237     NN_RET_CHECK_EQ(getNumberOfDimensions(bboxDeltasShape), 2);
238     NN_RET_CHECK_EQ(getNumberOfDimensions(batchesShape), 1);
239     NN_RET_CHECK_EQ(getNumberOfDimensions(imageInfoShape), 2);
240 
241     // Only numRois can be zero.
242     const uint32_t kRoiDim = 4;
243     uint32_t numRois = getSizeOfDimension(roiShape, 0);
244     uint32_t numClasses = getSizeOfDimension(bboxDeltasShape, 1) / kRoiDim;
245     uint32_t numBatches = getSizeOfDimension(imageInfoShape, 0);
246     NN_RET_CHECK_GT(numClasses, 0);
247     NN_RET_CHECK_GT(numBatches, 0);
248     NN_RET_CHECK_EQ(getSizeOfDimension(roiShape, 1), kRoiDim);
249     NN_RET_CHECK_EQ(getSizeOfDimension(bboxDeltasShape, 0), numRois);
250     NN_RET_CHECK_EQ(getSizeOfDimension(bboxDeltasShape, 1), kRoiDim * numClasses);
251     NN_RET_CHECK_EQ(getSizeOfDimension(batchesShape, 0), numRois);
252     NN_RET_CHECK_EQ(getSizeOfDimension(imageInfoShape, 1), 2);
253 
254     if (roiShape.type == OperandType::TENSOR_QUANT16_ASYMM) {
255         NN_RET_CHECK_EQ(roiShape.scale, 0.125f);
256         NN_RET_CHECK_EQ(roiShape.offset, 0);
257         NN_RET_CHECK_EQ(imageInfoShape.scale, 0.125f);
258         NN_RET_CHECK_EQ(imageInfoShape.offset, 0);
259     }
260 
261     outputShape.type = roiShape.type;
262     outputShape.dimensions = {numRois, numClasses * kRoiDim};
263     outputShape.scale = 0.f;
264     outputShape.offset = 0;
265     if (roiShape.type == OperandType::TENSOR_QUANT16_ASYMM) {
266         outputShape.scale = 0.125f;
267     }
268     NN_RET_CHECK(context->setOutputShape(kOutputTensor, outputShape));
269     return true;
270 }
271 
272 bool execute(IOperationExecutionContext* context) {
273     NNTRACE_TRANS("axisAlignedBBoxTransform");
274     // Bypass execution in the case of zero-sized input.
275     if (getNumberOfElements(context->getOutputShape(kOutputTensor)) == 0) return true;
276     switch (context->getInputType(kRoiTensor)) {
277         case OperandType::TENSOR_FLOAT16: {
278             return bboxTransformFloat16(context->getInputBuffer<_Float16>(kRoiTensor),
279                                         context->getInputShape(kRoiTensor),
280                                         context->getInputBuffer<_Float16>(kDeltaTensor),
281                                         context->getInputShape(kDeltaTensor),
282                                         context->getInputBuffer<int32_t>(kBatchesTensor),
283                                         context->getInputShape(kBatchesTensor),
284                                         context->getInputBuffer<_Float16>(kImageInfoTensor),
285                                         context->getInputShape(kImageInfoTensor),
286                                         context->getOutputBuffer<_Float16>(kOutputTensor),
287                                         context->getOutputShape(kOutputTensor));
288         }
289         case OperandType::TENSOR_FLOAT32: {
290             return bboxTransformFloat32(context->getInputBuffer<float>(kRoiTensor),
291                                         context->getInputShape(kRoiTensor),
292                                         context->getInputBuffer<float>(kDeltaTensor),
293                                         context->getInputShape(kDeltaTensor),
294                                         context->getInputBuffer<int32_t>(kBatchesTensor),
295                                         context->getInputShape(kBatchesTensor),
296                                         context->getInputBuffer<float>(kImageInfoTensor),
297                                         context->getInputShape(kImageInfoTensor),
298                                         context->getOutputBuffer<float>(kOutputTensor),
299                                         context->getOutputShape(kOutputTensor));
300         }
301         case OperandType::TENSOR_QUANT16_ASYMM: {
302             if (context->getInputType(kDeltaTensor) == OperandType::TENSOR_QUANT8_ASYMM) {
303                 return bboxTransformQuant(context->getInputBuffer<uint16_t>(kRoiTensor),
304                                           context->getInputShape(kRoiTensor),
305                                           context->getInputBuffer<uint8_t>(kDeltaTensor),
306                                           context->getInputShape(kDeltaTensor),
307                                           context->getInputBuffer<int32_t>(kBatchesTensor),
308                                           context->getInputShape(kBatchesTensor),
309                                           context->getInputBuffer<uint16_t>(kImageInfoTensor),
310                                           context->getInputShape(kImageInfoTensor),
311                                           context->getOutputBuffer<uint16_t>(kOutputTensor),
312                                           context->getOutputShape(kOutputTensor));
313             } else {
314                 return bboxTransformQuant(context->getInputBuffer<uint16_t>(kRoiTensor),
315                                           context->getInputShape(kRoiTensor),
316                                           context->getInputBuffer<int8_t>(kDeltaTensor),
317                                           context->getInputShape(kDeltaTensor),
318                                           context->getInputBuffer<int32_t>(kBatchesTensor),
319                                           context->getInputShape(kBatchesTensor),
320                                           context->getInputBuffer<uint16_t>(kImageInfoTensor),
321                                           context->getInputShape(kImageInfoTensor),
322                                           context->getOutputBuffer<uint16_t>(kOutputTensor),
323                                           context->getOutputShape(kOutputTensor));
324             }
325         }
326         default:
327             NN_RET_CHECK_FAIL() << "Unsupported tensor type for operation " << kOperationName;
328     }
329 }
330 
331 }  // namespace axis_aligned_bbox_transform
332 
333 namespace box_with_nms_limit {
334 
335 constexpr char kOperationName[] = "BOX_WITH_NMS_LIMIT";
336 
337 constexpr uint32_t kNumInputs = 9;
338 constexpr uint32_t kScoreTensor = 0;
339 constexpr uint32_t kRoiTensor = 1;
340 constexpr uint32_t kBatchesTensor = 2;
341 constexpr uint32_t kScoreThresholdScalar = 3;
342 constexpr uint32_t kMaxNumDetectionScalar = 4;
343 constexpr uint32_t kNmsKernelScalar = 5;
344 constexpr uint32_t kIoUThresholdScalar = 6;
345 constexpr uint32_t kSigmaScalar = 7;
346 constexpr uint32_t kNmsScoreThresholdScalar = 8;
347 
348 constexpr uint32_t kNumOutputs = 4;
349 constexpr uint32_t kOutputScoreTensor = 0;
350 constexpr uint32_t kOutputRoiTensor = 1;
351 constexpr uint32_t kOutputClassTensor = 2;
352 constexpr uint32_t kOutputBatchesTensor = 3;
353 
354 namespace {
355 
356 // TODO(xusongw): Reduce code duplication with hard/soft nms path.
357 
358 // Inplace hard NMS within range [select, select + selectLength).
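// Greedy selection: the highest-scoring remaining box is swapped to the front,
// then every remaining box whose IoU with it is >= iouThreshold is swapped to
// the end of the range and discarded. Returns a pointer past the last kept box.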
359 uint32_t* hardNmsSingleClass(const float* scoresData, float iouThreshold, int32_t maxNumDetections,
360                              std::function<const float*(uint32_t)> getRoiBase, uint32_t* select,
361                              uint32_t selectLength) {
362     uint32_t *selectStart = select, *selectEnd = select + selectLength, numDetections = 0;
363     if (maxNumDetections < 0) {
364         maxNumDetections = selectLength;
365     }
366     while (selectStart < selectEnd && numDetections < maxNumDetections) {
367         // find max score and swap to the front
368         auto& maxScore = *std::max_element(selectStart, selectEnd,
369                                            [&scoresData](const uint32_t& lhs, const uint32_t& rhs) {
370                                                return scoresData[lhs] < scoresData[rhs];
371                                            });
372         std::swap(maxScore, *selectStart);
373 
374         // Calculate IoU of the rest, swap to the end (discard) if needed.
375         for (uint32_t* i = selectStart + 1; i < selectEnd; i++) {
376             float iou = getIoUAxisAligned(getRoiBase(*i), getRoiBase(*selectStart));
377             if (iou >= iouThreshold) {
378                 std::swap(*i--, *(--selectEnd));
379             }
380         }
381         selectStart++;
382         numDetections++;
383     }
384     return selectStart;
385 }
386 
387 void hardNmsMultiClass(const float* scoresData, uint32_t numClasses, uint32_t numRois,
388                        float scoreThreshold, float iouThreshold, int32_t maxNumDetections,
389                        int32_t maxNumDetectionsPerClass,
390                        std::function<const float*(uint32_t)> getRoiBase,
391                        std::vector<uint32_t>* select) {
392     // Exclude class 0 (background)
393     for (uint32_t c = 1; c < numClasses; c++) {
394         uint32_t size = select->size();
395         for (uint32_t b = 0; b < numRois; b++) {
396             const uint32_t index = b * numClasses + c;
397             const float score = scoresData[index];
398             if (score > scoreThreshold) {
399                 select->push_back(index);
400             }
401         }
402         uint32_t* selectStart = select->data() + size;
403         uint32_t selectLength = select->size() - size;
404         uint32_t* selectEnd = hardNmsSingleClass(scoresData, iouThreshold, maxNumDetectionsPerClass,
405                                                  getRoiBase, selectStart, selectLength);
406         select->resize(selectEnd - select->data());
407     }
408 
409     // Take top maxNumDetections.
410     std::sort(select->begin(), select->end(),
411               [&scoresData](const uint32_t& lhs, const uint32_t& rhs) {
412                   return scoresData[lhs] > scoresData[rhs];
413               });
414     if (maxNumDetections < 0 || select->size() <= maxNumDetections) {
415         return;
416     }
417     select->resize(maxNumDetections);
418 }
419 
420 // Inplace soft NMS within range [select, select + selectLength).
421 using SoftNmsKernel = std::function<float(float)>;
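// Soft NMS: instead of discarding overlapping boxes outright, each remaining
// box's score is scaled by kernel(IoU with the selected box); boxes whose score
// drops below scoreThreshold are moved to the end of the range and discarded.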
422 uint32_t* softNmsSingleClass(float* scoresData, float scoreThreshold, int32_t maxNumDetections,
423                              std::function<const float*(uint32_t)> getRoiBase, SoftNmsKernel kernel,
424                              uint32_t* select, uint32_t selectLength) {
425     uint32_t *selectStart = select, *selectEnd = select + selectLength, numDetections = 0;
426     if (maxNumDetections < 0) {
427         maxNumDetections = selectLength;
428     }
429     while (selectStart < selectEnd && numDetections < maxNumDetections) {
430         // find max score and swap to the front
431         auto& maxScore = *std::max_element(selectStart, selectEnd,
432                                            [&scoresData](const uint32_t& lhs, const uint32_t& rhs) {
433                                                return scoresData[lhs] < scoresData[rhs];
434                                            });
435         std::swap(maxScore, *selectStart);
436 
437         // Calculate IoU of the rest, swap to the end (discard) if needed.
438         for (uint32_t* i = selectStart + 1; i < selectEnd; i++) {
439             float iou = getIoUAxisAligned(getRoiBase(*i), getRoiBase(*selectStart));
440             scoresData[*i] *= kernel(iou);
441             if (scoresData[*i] < scoreThreshold) {
442                 std::swap(*i--, *(--selectEnd));
443             }
444         }
445         selectStart++;
446         numDetections++;
447     }
448     return selectStart;
449 }
450 
451 void softNmsMultiClass(float* scoresData, uint32_t numClasses, uint32_t numRois,
452                        float scoreThreshold, float nmsScoreThreshold, int32_t maxNumDetections,
453                        int32_t maxNumDetectionsPerClass,
454                        std::function<const float*(uint32_t)> getRoiBase, SoftNmsKernel kernel,
455                        std::vector<uint32_t>* select) {
456     // Exclude class 0 (background)
457     for (uint32_t c = 1; c < numClasses; c++) {
458         uint32_t size = select->size();
459         for (uint32_t b = 0; b < numRois; b++) {
460             const uint32_t index = b * numClasses + c;
461             const float score = scoresData[index];
462             if (score > scoreThreshold) {
463                 select->push_back(index);
464             }
465         }
466         uint32_t* selectStart = select->data() + size;
467         uint32_t selectLength = select->size() - size;
468         uint32_t* selectEnd =
469                 softNmsSingleClass(scoresData, nmsScoreThreshold, maxNumDetectionsPerClass,
470                                    getRoiBase, kernel, selectStart, selectLength);
471         select->resize(selectEnd - select->data());
472     }
473 
474     // Take top maxNumDetections.
475     std::sort(select->begin(), select->end(),
476               [&scoresData](const uint32_t& lhs, const uint32_t& rhs) {
477                   return scoresData[lhs] > scoresData[rhs];
478               });
479     if (maxNumDetections < 0 || select->size() <= maxNumDetections) {
480         return;
481     }
482     select->resize(maxNumDetections);
483 }
484 
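// Shared float32 implementation of BOX_WITH_NMS_LIMIT. softNmsKernel selects
// the score-decay function (0 = hard, 1 = linear, 2 = Gaussian with parameter
// sigma). ROIs are assumed to be grouped by batch; NMS runs per batch and the
// surviving indices are re-sorted by class, then by descending score.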
485 bool boxWithNmsLimitFloat32Compute(float* scoresData, const Shape& scoresShape,
486                                    const float* roiData, const Shape& roiShape,
487                                    const int32_t* batchesData, const Shape& batchesShape,
488                                    float scoreThreshold, int32_t maxNumDetections,
489                                    int32_t softNmsKernel, float iouThreshold, float sigma,
490                                    float nmsScoreThreshold, std::vector<uint32_t>* batchSplitIn,
491                                    std::vector<uint32_t>* batchSplitOut,
492                                    std::vector<uint32_t>* selected) {
493     SoftNmsKernel kernel = nullptr;
494     if (softNmsKernel == 0) {
495         kernel = [&iouThreshold](float iou) { return iou < iouThreshold ? 1.0f : 0.0f; };
496     } else if (softNmsKernel == 1) {
497         kernel = [&iouThreshold](float iou) { return iou < iouThreshold ? 1.0f : 1.0f - iou; };
498     } else if (softNmsKernel == 2) {
499         kernel = [&sigma](float iou) { return std::exp(-1.0f * iou * iou / sigma); };
500     } else {
501         NN_RET_CHECK_FAIL() << "Unsupported soft NMS kernel " << softNmsKernel;
502     }
503 
504     const uint32_t kRoiDim = 4;
505     uint32_t numRois = getSizeOfDimension(scoresShape, 0);
506     uint32_t numClasses = getSizeOfDimension(scoresShape, 1);
507 
508     // We assume boxes of the same batch are grouped together.
509     std::vector<uint32_t> batch;
510     for (uint32_t i = 0, ind = -1; i < numRois; i++) {
511         if (batchesData[i] == ind) {
512             (batchSplitIn->back())++;
513         } else {
514             ind = batchesData[i];
515             batchSplitIn->push_back(1);
516         }
517     }
518 
519     float* scoresBase = scoresData;
520     const float* roiBase = roiData;
521     selected->clear();
522     for (uint32_t b = 0; b < batchSplitIn->size(); b++) {
523         for (uint32_t i = 0; i < batchSplitIn->at(b); i++) {
524             const float* roi = roiBase + i * kRoiDim;
525             // Check for malformed data: invalid region: x2 < x1 || y2 < y1
526             NN_RET_CHECK_LE(roi[0], roi[2]);
527             NN_RET_CHECK_LE(roi[1], roi[3]);
528         }
529         std::vector<uint32_t> result;
530         softNmsMultiClass(
531                 scoresBase, numClasses, batchSplitIn->at(b), scoreThreshold, nmsScoreThreshold,
532                 maxNumDetections, maxNumDetections,
533                 [&roiBase](uint32_t ind) { return roiBase + ind * kRoiDim; }, kernel, &result);
534         // Sort again by class.
535         std::sort(result.begin(), result.end(),
536                   [&scoresBase, numClasses](const uint32_t& lhs, const uint32_t& rhs) {
537                       uint32_t lhsClass = lhs % numClasses, rhsClass = rhs % numClasses;
538                       return lhsClass == rhsClass ? scoresBase[lhs] > scoresBase[rhs]
539                                                   : lhsClass < rhsClass;
540                   });
541         selected->insert(selected->end(), result.begin(), result.end());
542         batchSplitOut->push_back(result.size());
543         scoresBase += batchSplitIn->at(b) * numClasses;
544         roiBase += batchSplitIn->at(b) * numClasses * kRoiDim;
545     }
546     return true;
547 }
548 
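// castTo() converts a float score to the output tensor's element type; the
// quantized specializations re-quantize with the output scale/offset and
// saturate to the target range.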
549 template <typename T>
550 T castTo(float val, const Shape&) {
551     return val;
552 }
553 template <>
554 uint8_t castTo(float val, const Shape& shape) {
555     return saturateCast<uint8_t>(std::round(val / shape.scale + shape.offset));
556 }
557 
558 template <>
559 int8_t castTo(float val, const Shape& shape) {
560     return saturateCast<int8_t>(std::round(val / shape.scale + shape.offset));
561 }
562 
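// Resizes the four output tensors to the number of selected boxes and writes,
// for every selected index, the score (converted via castTo), the 4-element
// box, the class index (index % numClasses), and the batch index.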
563 template <typename T_Score, typename T_Roi>
564 bool boxWithNmsLimitWriteOutput(const std::vector<uint32_t>& selected,
565                                 const std::vector<uint32_t>& batchSplitIn,
566                                 const std::vector<uint32_t>& batchSplitOut,
567                                 const std::vector<float>& scores,
568                                 IOperationExecutionContext* context) {
569     const uint32_t kRoiDim = 4;
570     Shape scoresShape = context->getInputShape(kScoreTensor);
571     uint32_t numClasses = getSizeOfDimension(scoresShape, 1);
572 
573     // Set output dimensions.
574     uint32_t numOutRois = selected.size();
575     if (numOutRois == 0) return true;
576     Shape scoresOutShape = context->getOutputShape(kOutputScoreTensor);
577     scoresOutShape.dimensions = {numOutRois};
578     NN_RET_CHECK(context->setOutputShape(kOutputScoreTensor, scoresOutShape));
579 
580     Shape roiOutShape = context->getOutputShape(kOutputRoiTensor);
581     roiOutShape.dimensions = {numOutRois, 4};
582     NN_RET_CHECK(context->setOutputShape(kOutputRoiTensor, roiOutShape));
583 
584     Shape classesOutShape = context->getOutputShape(kOutputClassTensor);
585     classesOutShape.dimensions = {numOutRois};
586     NN_RET_CHECK(context->setOutputShape(kOutputClassTensor, classesOutShape));
587 
588     Shape batchesOutShape = context->getOutputShape(kOutputBatchesTensor);
589     batchesOutShape.dimensions = {numOutRois};
590     NN_RET_CHECK(context->setOutputShape(kOutputBatchesTensor, batchesOutShape));
591 
592     // Write outputs.
593     const float* scoresBase = scores.data();
594     const T_Roi* roiBase = context->getInputBuffer<T_Roi>(kRoiTensor);
595     const int32_t* batchesInPtr = context->getInputBuffer<int32_t>(kBatchesTensor);
596     T_Score* scoresOutPtr = context->getOutputBuffer<T_Score>(kOutputScoreTensor);
597     T_Roi* roiOutPtr = context->getOutputBuffer<T_Roi>(kOutputRoiTensor);
598     int32_t* classesOutPtr = context->getOutputBuffer<int32_t>(kOutputClassTensor);
599     int32_t* batchesOutPtr = context->getOutputBuffer<int32_t>(kOutputBatchesTensor);
600     uint32_t i = 0;
601     for (uint32_t b = 0; b < batchSplitOut.size(); b++) {
602         for (uint32_t j = 0; j < batchSplitOut[b]; j++) {
603             uint32_t index = selected[i++];
604             *scoresOutPtr++ = castTo<T_Score>(scoresBase[index], scoresOutShape);
605             memcpy(roiOutPtr, roiBase + index * kRoiDim, kRoiDim * sizeof(T_Roi));
606             roiOutPtr += kRoiDim;
607             *classesOutPtr++ = index % numClasses;
608             *batchesOutPtr++ = *batchesInPtr;
609         }
610         scoresBase += batchSplitIn[b] * numClasses;
611         roiBase += batchSplitIn[b] * numClasses * kRoiDim;
612         batchesInPtr += batchSplitIn[b];
613     }
614     return true;
615 }
616 
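// The typed entry points below convert their inputs to float32, run
// boxWithNmsLimitFloat32Compute, and emit results through
// boxWithNmsLimitWriteOutput in the output tensors' types.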
617 bool boxWithNmsLimitFloat32(const float* scoresData, const Shape& scoresShape, const float* roiData,
618                             const Shape& roiShape, const int32_t* batchesData,
619                             const Shape& batchesShape, float scoreThreshold,
620                             int32_t maxNumDetections, int32_t softNmsKernel, float iouThreshold,
621                             float sigma, float nmsScoreThreshold, float* scoresOutData,
622                             Shape scoresOutShape, float* roiOutData, Shape roiOutShape,
623                             int32_t* classesOutData, Shape classesOutShape, int32_t* batchesOutData,
624                             const Shape& batchSplitOutShape, IOperationExecutionContext* context) {
625     NNTRACE_TRANS("boxWithNmsLimit");
626     std::vector<float> scores_float32(getNumberOfElements(scoresShape));
627     for (uint32_t i = 0; i < scores_float32.size(); i++) {
628         scores_float32[i] = scoresData[i];
629     }
630     std::vector<uint32_t> selected, batchSplitIn, batchSplitOut;
631     NN_RET_CHECK(boxWithNmsLimitFloat32Compute(
632             scores_float32.data(), scoresShape, roiData, roiShape, batchesData, batchesShape,
633             scoreThreshold, maxNumDetections, softNmsKernel, iouThreshold, sigma, nmsScoreThreshold,
634             &batchSplitIn, &batchSplitOut, &selected));
635     return boxWithNmsLimitWriteOutput<float, float>(selected, batchSplitIn, batchSplitOut,
636                                                     scores_float32, context);
637 }
638 
639 bool boxWithNmsLimitFloat16(const _Float16* scoresData, const Shape& scoresShape,
640                             const _Float16* roiData, const Shape& roiShape,
641                             const int32_t* batchesData, const Shape& batchesShape,
642                             _Float16 scoreThreshold, int32_t maxNumDetections,
643                             int32_t softNmsKernel, _Float16 iouThreshold, _Float16 sigma,
644                             _Float16 nmsScoreThreshold, _Float16* scoresOutData,
645                             const Shape& scoresOutShape, _Float16* roiOutData,
646                             const Shape& roiOutShape, int32_t* classesOutData,
647                             const Shape& classesOutShape, int32_t* batchesOutData,
648                             const Shape& batchSplitOutShape, IOperationExecutionContext* context) {
649     std::vector<float> scores_float32(getNumberOfElements(scoresShape));
650     convertFloat16ToFloat32(scoresData, &scores_float32);
651     std::vector<float> roi_float32(getNumberOfElements(roiShape));
652     convertFloat16ToFloat32(roiData, &roi_float32);
653     std::vector<uint32_t> selected, batchSplitIn, batchSplitOut;
654     NN_RET_CHECK(boxWithNmsLimitFloat32Compute(
655             scores_float32.data(), scoresShape, roi_float32.data(), roiShape, batchesData,
656             batchesShape, scoreThreshold, maxNumDetections, softNmsKernel, iouThreshold, sigma,
657             nmsScoreThreshold, &batchSplitIn, &batchSplitOut, &selected));
658     return boxWithNmsLimitWriteOutput<_Float16, _Float16>(selected, batchSplitIn, batchSplitOut,
659                                                           scores_float32, context);
660 }
661 
662 bool boxWithNmsLimitQuant(const uint8_t* scoresData, const Shape& scoresShape,
663                           const uint16_t* roiData, const Shape& roiShape,
664                           const int32_t* batchesData, const Shape& batchesShape,
665                           float scoreThreshold, int32_t maxNumDetections, int32_t softNmsKernel,
666                           float iouThreshold, float sigma, float nmsScoreThreshold,
667                           uint8_t* scoresOutData, const Shape& scoresOutShape, uint16_t* roiOutData,
668                           const Shape& roiOutShape, int32_t* classesOutData,
669                           const Shape& classesOutShape, int32_t* batchesOutData,
670                           const Shape& batchSplitOutShape, IOperationExecutionContext* context) {
671     std::vector<float> scores_float32(getNumberOfElements(scoresShape));
672     convertQuantToFloat32(scoresData, scoresShape.scale, scoresShape.offset, &scores_float32);
673     std::vector<float> roi_float32(getNumberOfElements(roiShape));
674     convertQuantToFloat32(roiData, roiShape.scale, roiShape.offset, &roi_float32);
675     std::vector<uint32_t> selected, batchSplitIn, batchSplitOut;
676     NN_RET_CHECK(boxWithNmsLimitFloat32Compute(
677             scores_float32.data(), scoresShape, roi_float32.data(), roiShape, batchesData,
678             batchesShape, scoreThreshold, maxNumDetections, softNmsKernel, iouThreshold, sigma,
679             nmsScoreThreshold, &batchSplitIn, &batchSplitOut, &selected));
680     return boxWithNmsLimitWriteOutput<uint8_t, uint16_t>(selected, batchSplitIn, batchSplitOut,
681                                                          scores_float32, context);
682 }
683 
684 bool boxWithNmsLimitQuant(const int8_t* scoresData, const Shape& scoresShape,
685                           const uint16_t* roiData, const Shape& roiShape,
686                           const int32_t* batchesData, const Shape& batchesShape,
687                           float scoreThreshold, int32_t maxNumDetections, int32_t softNmsKernel,
688                           float iouThreshold, float sigma, float nmsScoreThreshold,
689                           int8_t* scoresOutData, const Shape& scoresOutShape, uint16_t* roiOutData,
690                           const Shape& roiOutShape, int32_t* classesOutData,
691                           const Shape& classesOutShape, int32_t* batchesOutData,
692                           const Shape& batchSplitOutShape, IOperationExecutionContext* context) {
693     std::vector<float> scores_float32(getNumberOfElements(scoresShape));
694     convertQuantToFloat32<int8_t>(scoresData, scoresShape.scale, scoresShape.offset,
695                                   &scores_float32);
696     std::vector<float> roi_float32(getNumberOfElements(roiShape));
697     convertQuantToFloat32(roiData, roiShape.scale, roiShape.offset, &roi_float32);
698     std::vector<uint32_t> selected, batchSplitIn, batchSplitOut;
699     NN_RET_CHECK(boxWithNmsLimitFloat32Compute(
700             scores_float32.data(), scoresShape, roi_float32.data(), roiShape, batchesData,
701             batchesShape, scoreThreshold, maxNumDetections, softNmsKernel, iouThreshold, sigma,
702             nmsScoreThreshold, &batchSplitIn, &batchSplitOut, &selected));
703     return boxWithNmsLimitWriteOutput<int8_t, uint16_t>(selected, batchSplitIn, batchSplitOut,
704                                                         scores_float32, context);
705 }
706 
707 }  // namespace
708 
709 bool validate(const IOperationValidationContext* context) {
710     NN_RET_CHECK_EQ(context->getNumInputs(), kNumInputs);
711     NN_RET_CHECK_EQ(context->getNumOutputs(), kNumOutputs);
712     std::vector<OperandType> inExpectedTypes;
713     std::vector<OperandType> outExpectedTypes;
714     auto inputType = context->getInputType(kScoreTensor);
715     if (inputType == OperandType::TENSOR_FLOAT16) {
716         inExpectedTypes = {
717                 OperandType::TENSOR_FLOAT16, OperandType::TENSOR_FLOAT16, OperandType::TENSOR_INT32,
718                 OperandType::FLOAT16,        OperandType::INT32,          OperandType::INT32,
719                 OperandType::FLOAT16,        OperandType::FLOAT16,        OperandType::FLOAT16};
720         outExpectedTypes = {OperandType::TENSOR_FLOAT16, OperandType::TENSOR_FLOAT16,
721                             OperandType::TENSOR_INT32, OperandType::TENSOR_INT32};
722     } else if (inputType == OperandType::TENSOR_FLOAT32) {
723         inExpectedTypes = {
724                 OperandType::TENSOR_FLOAT32, OperandType::TENSOR_FLOAT32, OperandType::TENSOR_INT32,
725                 OperandType::FLOAT32,        OperandType::INT32,          OperandType::INT32,
726                 OperandType::FLOAT32,        OperandType::FLOAT32,        OperandType::FLOAT32};
727         outExpectedTypes = {OperandType::TENSOR_FLOAT32, OperandType::TENSOR_FLOAT32,
728                             OperandType::TENSOR_INT32, OperandType::TENSOR_INT32};
729     } else if (inputType == OperandType::TENSOR_QUANT8_ASYMM ||
730                inputType == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
731         inExpectedTypes = {inputType,
732                            OperandType::TENSOR_QUANT16_ASYMM,
733                            OperandType::TENSOR_INT32,
734                            OperandType::FLOAT32,
735                            OperandType::INT32,
736                            OperandType::INT32,
737                            OperandType::FLOAT32,
738                            OperandType::FLOAT32,
739                            OperandType::FLOAT32};
740         outExpectedTypes = {inputType, OperandType::TENSOR_QUANT16_ASYMM, OperandType::TENSOR_INT32,
741                             OperandType::TENSOR_INT32};
742     } else {
743         NN_RET_CHECK_FAIL() << "Unsupported tensor type for operation " << kOperationName;
744     }
745     NN_RET_CHECK(validateInputTypes(context, inExpectedTypes));
746     NN_RET_CHECK(validateOutputTypes(context, outExpectedTypes));
747     if (inputType == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
748         return validateHalVersion(context, HalVersion::V1_3);
749     } else {
750         return validateHalVersion(context, HalVersion::V1_2);
751     }
752 }
753 
754 bool prepare(IOperationExecutionContext* context) {
755     Shape scoreShape = context->getInputShape(kScoreTensor);
756     Shape roiShape = context->getInputShape(kRoiTensor);
757     Shape batchesShape = context->getInputShape(kBatchesTensor);
758     Shape outputScoreShape = context->getOutputShape(kOutputScoreTensor);
759     Shape outputRoiShape = context->getOutputShape(kOutputRoiTensor);
760     Shape outputClassShape = context->getOutputShape(kOutputClassTensor);
761     Shape outputBatchSplitShape = context->getOutputShape(kOutputBatchesTensor);
762 
763     NN_RET_CHECK(getNumberOfDimensions(scoreShape) == 2);
764     NN_RET_CHECK(getNumberOfDimensions(roiShape) == 2);
765     NN_RET_CHECK(getNumberOfDimensions(batchesShape) == 1);
766 
767     // Only numRois can be zero.
768     const uint32_t kRoiDim = 4;
769     uint32_t numRois = getSizeOfDimension(scoreShape, 0);
770     uint32_t numClasses = getSizeOfDimension(scoreShape, 1);
771     NN_RET_CHECK(getSizeOfDimension(roiShape, 0) == numRois);
772     NN_RET_CHECK(getSizeOfDimension(roiShape, 1) == kRoiDim * numClasses);
773     NN_RET_CHECK(getSizeOfDimension(batchesShape, 0) == numRois);
774     NN_RET_CHECK_GT(numClasses, 1);
775 
776     if (scoreShape.type == OperandType::TENSOR_QUANT8_ASYMM ||
777         scoreShape.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
778         NN_RET_CHECK_EQ(roiShape.scale, 0.125f);
779         NN_RET_CHECK_EQ(roiShape.offset, 0);
780     }
781 
782     outputScoreShape.type = scoreShape.type;
783     outputScoreShape.dimensions = {0};
784     outputScoreShape.scale = scoreShape.scale;
785     outputScoreShape.offset = scoreShape.offset;
786     NN_RET_CHECK(context->setOutputShape(kOutputScoreTensor, outputScoreShape));
787 
788     outputRoiShape.type = roiShape.type;
789     outputRoiShape.dimensions = {0, 4};
790     outputRoiShape.scale = 0.f;
791     outputRoiShape.offset = 0;
792     if (scoreShape.type == OperandType::TENSOR_QUANT8_ASYMM ||
793         scoreShape.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
794         outputRoiShape.scale = 0.125f;
795     }
796     NN_RET_CHECK(context->setOutputShape(kOutputRoiTensor, outputRoiShape));
797 
798     outputClassShape.type = OperandType::TENSOR_INT32;
799     outputClassShape.dimensions = {0};
800     NN_RET_CHECK(context->setOutputShape(kOutputClassTensor, outputClassShape));
801 
802     outputBatchSplitShape.type = batchesShape.type;
803     outputBatchSplitShape.dimensions = {0};
804     NN_RET_CHECK(context->setOutputShape(kOutputBatchesTensor, outputBatchSplitShape));
805     return true;
806 }
807 
808 bool execute(IOperationExecutionContext* context) {
809     NNTRACE_TRANS("boxWithNMSLimit");
810     // Bypass execution in the case of zero numRois.
811     if (getSizeOfDimension(context->getInputShape(kScoreTensor), 0) == 0) return true;
812     switch (context->getInputType(kScoreTensor)) {
813         case OperandType::TENSOR_FLOAT16: {
814             return boxWithNmsLimitFloat16(
815                     context->getInputBuffer<_Float16>(kScoreTensor),
816                     context->getInputShape(kScoreTensor),
817                     context->getInputBuffer<_Float16>(kRoiTensor),
818                     context->getInputShape(kRoiTensor),
819                     context->getInputBuffer<int32_t>(kBatchesTensor),
820                     context->getInputShape(kBatchesTensor),
821                     context->getInputValue<_Float16>(kScoreThresholdScalar),
822                     context->getInputValue<int32_t>(kMaxNumDetectionScalar),
823                     context->getInputValue<int32_t>(kNmsKernelScalar),
824                     context->getInputValue<_Float16>(kIoUThresholdScalar),
825                     context->getInputValue<_Float16>(kSigmaScalar),
826                     context->getInputValue<_Float16>(kNmsScoreThresholdScalar),
827                     context->getOutputBuffer<_Float16>(kOutputScoreTensor),
828                     context->getOutputShape(kOutputScoreTensor),
829                     context->getOutputBuffer<_Float16>(kOutputRoiTensor),
830                     context->getOutputShape(kOutputRoiTensor),
831                     context->getOutputBuffer<int32_t>(kOutputClassTensor),
832                     context->getOutputShape(kOutputClassTensor),
833                     context->getOutputBuffer<int32_t>(kOutputBatchesTensor),
834                     context->getOutputShape(kOutputBatchesTensor), context);
835         }
836         case OperandType::TENSOR_FLOAT32: {
837             return boxWithNmsLimitFloat32(context->getInputBuffer<float>(kScoreTensor),
838                                           context->getInputShape(kScoreTensor),
839                                           context->getInputBuffer<float>(kRoiTensor),
840                                           context->getInputShape(kRoiTensor),
841                                           context->getInputBuffer<int32_t>(kBatchesTensor),
842                                           context->getInputShape(kBatchesTensor),
843                                           context->getInputValue<float>(kScoreThresholdScalar),
844                                           context->getInputValue<int32_t>(kMaxNumDetectionScalar),
845                                           context->getInputValue<int32_t>(kNmsKernelScalar),
846                                           context->getInputValue<float>(kIoUThresholdScalar),
847                                           context->getInputValue<float>(kSigmaScalar),
848                                           context->getInputValue<float>(kNmsScoreThresholdScalar),
849                                           context->getOutputBuffer<float>(kOutputScoreTensor),
850                                           context->getOutputShape(kOutputScoreTensor),
851                                           context->getOutputBuffer<float>(kOutputRoiTensor),
                                          context->getOutputShape(kOutputRoiTensor),
                                          context->getOutputBuffer<int32_t>(kOutputClassTensor),
                                          context->getOutputShape(kOutputClassTensor),
                                          context->getOutputBuffer<int32_t>(kOutputBatchesTensor),
                                          context->getOutputShape(kOutputBatchesTensor), context);
        }
        case OperandType::TENSOR_QUANT8_ASYMM: {
            return boxWithNmsLimitQuant(context->getInputBuffer<uint8_t>(kScoreTensor),
                                        context->getInputShape(kScoreTensor),
                                        context->getInputBuffer<uint16_t>(kRoiTensor),
                                        context->getInputShape(kRoiTensor),
                                        context->getInputBuffer<int32_t>(kBatchesTensor),
                                        context->getInputShape(kBatchesTensor),
                                        context->getInputValue<float>(kScoreThresholdScalar),
                                        context->getInputValue<int32_t>(kMaxNumDetectionScalar),
                                        context->getInputValue<int32_t>(kNmsKernelScalar),
                                        context->getInputValue<float>(kIoUThresholdScalar),
                                        context->getInputValue<float>(kSigmaScalar),
                                        context->getInputValue<float>(kNmsScoreThresholdScalar),
                                        context->getOutputBuffer<uint8_t>(kOutputScoreTensor),
                                        context->getOutputShape(kOutputScoreTensor),
                                        context->getOutputBuffer<uint16_t>(kOutputRoiTensor),
                                        context->getOutputShape(kOutputRoiTensor),
                                        context->getOutputBuffer<int32_t>(kOutputClassTensor),
                                        context->getOutputShape(kOutputClassTensor),
                                        context->getOutputBuffer<int32_t>(kOutputBatchesTensor),
                                        context->getOutputShape(kOutputBatchesTensor), context);
        }
        case OperandType::TENSOR_QUANT8_ASYMM_SIGNED: {
            return boxWithNmsLimitQuant(context->getInputBuffer<int8_t>(kScoreTensor),
                                        context->getInputShape(kScoreTensor),
                                        context->getInputBuffer<uint16_t>(kRoiTensor),
                                        context->getInputShape(kRoiTensor),
                                        context->getInputBuffer<int32_t>(kBatchesTensor),
                                        context->getInputShape(kBatchesTensor),
                                        context->getInputValue<float>(kScoreThresholdScalar),
                                        context->getInputValue<int32_t>(kMaxNumDetectionScalar),
                                        context->getInputValue<int32_t>(kNmsKernelScalar),
                                        context->getInputValue<float>(kIoUThresholdScalar),
                                        context->getInputValue<float>(kSigmaScalar),
                                        context->getInputValue<float>(kNmsScoreThresholdScalar),
                                        context->getOutputBuffer<int8_t>(kOutputScoreTensor),
                                        context->getOutputShape(kOutputScoreTensor),
                                        context->getOutputBuffer<uint16_t>(kOutputRoiTensor),
                                        context->getOutputShape(kOutputRoiTensor),
                                        context->getOutputBuffer<int32_t>(kOutputClassTensor),
                                        context->getOutputShape(kOutputClassTensor),
                                        context->getOutputBuffer<int32_t>(kOutputBatchesTensor),
                                        context->getOutputShape(kOutputBatchesTensor), context);
        }
        default:
            NN_RET_CHECK_FAIL() << "Unsupported tensor type for operation " << kOperationName;
    }
}

}  // namespace box_with_nms_limit

namespace generate_proposals {

constexpr char kOperationName[] = "GENERATE_PROPOSALS";

constexpr uint32_t kNumInputs = 11;
constexpr uint32_t kScoreTensor = 0;
constexpr uint32_t kDeltaTensor = 1;
constexpr uint32_t kAnchorTensor = 2;
constexpr uint32_t kImageInfoTensor = 3;
constexpr uint32_t kHeightStrideScalar = 4;
constexpr uint32_t kWidthStrideScalar = 5;
constexpr uint32_t kPreNmsMaxScalar = 6;
constexpr uint32_t kPostNmsMaxScalar = 7;
constexpr uint32_t kIoUThresholdScalar = 8;
constexpr uint32_t kMinSizeScalar = 9;
constexpr uint32_t kLayoutScalar = 10;

constexpr uint32_t kNumOutputs = 3;
constexpr uint32_t kOutputScoreTensor = 0;
constexpr uint32_t kOutputRoiTensor = 1;
constexpr uint32_t kOutputBatchesTensor = 2;

namespace {
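// Keeps only the proposals whose width and height are both greater than minSize
// and whose center falls inside the image; imageInfoBase is {imageHeight, imageWidth}.
// Surviving indices are compacted in place at the front of |select|, which is then
// resized to the number of survivors.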
void filterBoxes(const float* roiBase, const float* imageInfoBase, float minSize,
                 std::vector<uint32_t>* select) {
    const uint32_t kRoiDim = 4;
    uint32_t i = 0;
    for (uint32_t j = 0; j < select->size(); j++) {
        const float* roiInfo = roiBase + (*select)[j] * kRoiDim;
        float roiWidth, roiHeight, xRoiCenter, yRoiCenter;
        roiWidth = roiInfo[2] - roiInfo[0];
        roiHeight = roiInfo[3] - roiInfo[1];
        xRoiCenter = roiInfo[0] + roiWidth / 2.0f;
        yRoiCenter = roiInfo[1] + roiHeight / 2.0f;
        if (roiWidth > minSize && roiHeight > minSize && xRoiCenter < imageInfoBase[1] &&
            yRoiCenter < imageInfoBase[0]) {
            (*select)[i++] = (*select)[j];
        }
    }
    select->resize(i);
}
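// Core float32 implementation on NHWC-ordered scores and deltas. For each batch it:
//   1. shifts every anchor to each (h, w) grid location using the height/width strides,
//   2. applies the box deltas to the shifted anchors via bboxTransformFloat32,
//   3. keeps the preNmsTopN highest-scoring boxes (when preNmsTopN > 0),
//   4. drops boxes smaller than minSize or centered outside the image, and
//   5. runs hard NMS, keeping at most postNmsTopN boxes per batch.
// The surviving scores, ROIs, and batch indices are appended to the output vectors.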
bool generateProposalsNhwcFloat32Compute(const float* scoresData, const Shape& scoresShape,
                                         const float* bboxDeltasData, const Shape& bboxDeltasShape,
                                         const float* anchorsData, const Shape& anchorsShape,
                                         const float* imageInfoData, const Shape& imageInfoShape,
                                         float heightStride, float widthStride, int32_t preNmsTopN,
                                         int32_t postNmsTopN, float iouThreshold, float minSize,
                                         std::vector<float>* scoresOutData,
                                         std::vector<float>* roiOutData,
                                         std::vector<int32_t>* batchesOutData) {
    const uint32_t kRoiDim = 4;
    uint32_t numBatches = getSizeOfDimension(scoresShape, 0);
    uint32_t height = getSizeOfDimension(scoresShape, 1);
    uint32_t width = getSizeOfDimension(scoresShape, 2);
    uint32_t numAnchors = getSizeOfDimension(scoresShape, 3);
    uint32_t imageInfoLength = getSizeOfDimension(imageInfoShape, 1);

    uint32_t batchSize = height * width * numAnchors;
    uint32_t roiBufferSize = batchSize * kRoiDim;
    std::vector<float> roiBuffer(roiBufferSize);
    std::vector<float> roiTransformedBuffer(roiBufferSize);
    scoresOutData->clear();
    roiOutData->clear();
    batchesOutData->clear();

    // Compute the roi region for each anchor.
    float* roiBase = roiBuffer.data();
    for (uint32_t h = 0; h < height; h++) {
        float hShift = h * heightStride;
        for (uint32_t w = 0; w < width; w++) {
            const float* anchorsBase = anchorsData;
            float wShift = w * widthStride;
            for (uint32_t a = 0; a < numAnchors; a++, roiBase += kRoiDim, anchorsBase += kRoiDim) {
                roiBase[0] = anchorsBase[0] + wShift;
                roiBase[1] = anchorsBase[1] + hShift;
                roiBase[2] = anchorsBase[2] + wShift;
                roiBase[3] = anchorsBase[3] + hShift;
            }
        }
    }

    const float* scoresBase = scoresData;
    const float* bboxDeltasBase = bboxDeltasData;
    const float* imageInfoBase = imageInfoData;
    // Need to fake some data to satisfy bboxTransform.
    Shape tempRoiShape = anchorsShape;
    tempRoiShape.dimensions = {batchSize, kRoiDim};
    Shape tempBBoxDeltasShape = bboxDeltasShape;
    tempBBoxDeltasShape.dimensions = {batchSize, kRoiDim};
    std::vector<int32_t> tempBatchSplitData(batchSize, 0);
    Shape tempBatchSplitShape = {.dimensions = {batchSize}};
    Shape tempImageInfoShape = imageInfoShape;
    tempImageInfoShape.dimensions = {1, imageInfoLength};

    for (uint32_t b = 0; b < numBatches; b++) {
        // Apply bboxDeltas to anchor locations.
        float tempImageInfo[] = {imageInfoBase[0], imageInfoBase[1]};
        if (!bboxTransformFloat32(roiBuffer.data(), tempRoiShape, bboxDeltasBase,
                                  tempBBoxDeltasShape, tempBatchSplitData.data(),
                                  tempBatchSplitShape, tempImageInfo, tempImageInfoShape,
                                  roiTransformedBuffer.data(), tempRoiShape)) {
            LOG(ERROR) << "BBoxTransform step failed in GENERATE_PROPOSALS op.";
            return false;
        }

        // Find the top preNmsTopN scores.
        std::vector<uint32_t> select(batchSize);
        std::iota(select.begin(), select.end(), 0);
        if (preNmsTopN > 0 && preNmsTopN < select.size()) {
            std::sort(select.begin(), select.end(),
                      [&scoresBase](const uint32_t lhs, const uint32_t rhs) {
                          return scoresBase[lhs] > scoresBase[rhs];
                      });
            select.resize(preNmsTopN);
        }

        // Filter boxes, discard regions whose height or width is not larger than minSize.
        filterBoxes(roiTransformedBuffer.data(), imageInfoBase, minSize, &select);

        // Apply hard NMS.
        uint32_t* selectEnd = box_with_nms_limit::hardNmsSingleClass(
                scoresBase, iouThreshold, postNmsTopN,
                [&roiTransformedBuffer](uint32_t ind) {
                    return roiTransformedBuffer.data() + ind * kRoiDim;
                },
                select.data(), select.size());
        uint32_t selectSize = selectEnd - select.data();
        select.resize(selectSize);

        // Write output.
        for (auto i : select) {
            roiOutData->insert(roiOutData->end(), roiTransformedBuffer.begin() + i * kRoiDim,
                               roiTransformedBuffer.begin() + (i + 1) * kRoiDim);
            scoresOutData->push_back(scoresBase[i]);
            batchesOutData->push_back(b);
        }
        scoresBase += batchSize;
        bboxDeltasBase += roiBufferSize;
        imageInfoBase += imageInfoLength;
    }
    return true;
}
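// Layout-handling wrapper: converts NCHW scores/deltas to NHWC when needed
// (via InputWithLayout) and forwards to the NHWC implementation above.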
bool generateProposalsFloat32Compute(const float* scoresData, const Shape& scoresShape,
                                     const float* bboxDeltasData, const Shape& bboxDeltasShape,
                                     const float* anchorsData, const Shape& anchorsShape,
                                     const float* imageInfoData, const Shape& imageInfoShape,
                                     float heightStride, float widthStride, int32_t preNmsTopN,
                                     int32_t postNmsTopN, float iouThreshold, float minSize,
                                     bool useNchw, std::vector<float>* scoresOutData,
                                     std::vector<float>* roiOutData,
                                     std::vector<int32_t>* batchesOutData) {
    InputWithLayout<float> score_nhwc(useNchw), delta_nhwc(useNchw);
    NN_RET_CHECK(score_nhwc.initialize(scoresData, scoresShape));
    NN_RET_CHECK(delta_nhwc.initialize(bboxDeltasData, bboxDeltasShape));
    return generateProposalsNhwcFloat32Compute(
            score_nhwc.getNhwcBuffer(), score_nhwc.getNhwcShape(), delta_nhwc.getNhwcBuffer(),
            delta_nhwc.getNhwcShape(), anchorsData, anchorsShape, imageInfoData, imageInfoShape,
            heightStride, widthStride, preNmsTopN, postNmsTopN, iouThreshold, minSize,
            scoresOutData, roiOutData, batchesOutData);
}
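// Float32 entry point: runs the compute pass into temporary vectors, then resizes
// the three outputs to the actual number of proposals before copying the results.
// With zero proposals the outputs keep the zero-sized shapes set in prepare().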
bool generateProposalsFloat32(const float* scoresData, const Shape& scoresShape,
                              const float* bboxDeltasData, const Shape& bboxDeltasShape,
                              const float* anchorsData, const Shape& anchorsShape,
                              const float* imageInfoData, const Shape& imageInfoShape,
                              float heightStride, float widthStride, int32_t preNmsTopN,
                              int32_t postNmsTopN, float iouThreshold, float minSize, bool useNchw,
                              IOperationExecutionContext* context) {
    std::vector<float> scoresOut_float32, roiOut_float32;
    std::vector<int32_t> batchesOut;
    NN_RET_CHECK(generateProposalsFloat32Compute(
            scoresData, scoresShape, bboxDeltasData, bboxDeltasShape, anchorsData, anchorsShape,
            imageInfoData, imageInfoShape, heightStride, widthStride, preNmsTopN, postNmsTopN,
            iouThreshold, minSize, useNchw, &scoresOut_float32, &roiOut_float32, &batchesOut));

    // Set output dimensions.
    uint32_t numOutRois = scoresOut_float32.size();
    if (numOutRois == 0) return true;
    Shape scoresOutShape = context->getOutputShape(kOutputScoreTensor);
    scoresOutShape.dimensions = {numOutRois};
    NN_RET_CHECK(context->setOutputShape(kOutputScoreTensor, scoresOutShape));
    Shape roiOutShape = context->getOutputShape(kOutputRoiTensor);
    roiOutShape.dimensions = {numOutRois, 4};
    NN_RET_CHECK(context->setOutputShape(kOutputRoiTensor, roiOutShape));
    Shape batchesOutShape = context->getOutputShape(kOutputBatchesTensor);
    batchesOutShape.dimensions = {numOutRois};
    NN_RET_CHECK(context->setOutputShape(kOutputBatchesTensor, batchesOutShape));

    // Write outputs.
    float* scoresOutData = context->getOutputBuffer<float>(kOutputScoreTensor);
    for (uint32_t i = 0; i < scoresOut_float32.size(); i++) {
        scoresOutData[i] = scoresOut_float32[i];
    }
    float* roiOutData = context->getOutputBuffer<float>(kOutputRoiTensor);
    for (uint32_t i = 0; i < roiOut_float32.size(); i++) {
        roiOutData[i] = roiOut_float32[i];
    }
    int32_t* batchesOutData = context->getOutputBuffer<int32_t>(kOutputBatchesTensor);
    for (uint32_t i = 0; i < batchesOut.size(); i++) {
        batchesOutData[i] = batchesOut[i];
    }
    return true;
}
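// Float16 entry point: converts all fp16 inputs to fp32, reuses the fp32 compute
// pass, then converts the score and ROI results back to fp16.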
bool generateProposalsFloat16(const _Float16* scoresData, const Shape& scoresShape,
                              const _Float16* bboxDeltasData, const Shape& bboxDeltasShape,
                              const _Float16* anchorsData, const Shape& anchorsShape,
                              const _Float16* imageInfoData, const Shape& imageInfoShape,
                              float heightStride, float widthStride, int32_t preNmsTopN,
                              int32_t postNmsTopN, float iouThreshold, float minSize, bool useNchw,
                              IOperationExecutionContext* context) {
    std::vector<float> score_float32(getNumberOfElements(scoresShape));
    convertFloat16ToFloat32(scoresData, &score_float32);
    std::vector<float> delta_float32(getNumberOfElements(bboxDeltasShape));
    convertFloat16ToFloat32(bboxDeltasData, &delta_float32);
    std::vector<float> anchors_float32(getNumberOfElements(anchorsShape));
    convertFloat16ToFloat32(anchorsData, &anchors_float32);
    std::vector<float> imageInfo_float32(getNumberOfElements(imageInfoShape));
    convertFloat16ToFloat32(imageInfoData, &imageInfo_float32);
    std::vector<float> scoresOut_float32, roiOut_float32;
    std::vector<int32_t> batchesOut;
    NN_RET_CHECK(generateProposalsFloat32Compute(
            score_float32.data(), scoresShape, delta_float32.data(), bboxDeltasShape,
            anchors_float32.data(), anchorsShape, imageInfo_float32.data(), imageInfoShape,
            heightStride, widthStride, preNmsTopN, postNmsTopN, iouThreshold, minSize, useNchw,
            &scoresOut_float32, &roiOut_float32, &batchesOut));

    // Set output dimensions.
    uint32_t numOutRois = scoresOut_float32.size();
    if (numOutRois == 0) return true;
    Shape scoresOutShape = context->getOutputShape(kOutputScoreTensor);
    scoresOutShape.dimensions = {numOutRois};
    NN_RET_CHECK(context->setOutputShape(kOutputScoreTensor, scoresOutShape));
    Shape roiOutShape = context->getOutputShape(kOutputRoiTensor);
    roiOutShape.dimensions = {numOutRois, 4};
    NN_RET_CHECK(context->setOutputShape(kOutputRoiTensor, roiOutShape));
    Shape batchesOutShape = context->getOutputShape(kOutputBatchesTensor);
    batchesOutShape.dimensions = {numOutRois};
    NN_RET_CHECK(context->setOutputShape(kOutputBatchesTensor, batchesOutShape));

    // Write outputs.
    _Float16* scoresOutData = context->getOutputBuffer<_Float16>(kOutputScoreTensor);
    convertFloat32ToFloat16(scoresOut_float32, scoresOutData);
    _Float16* roiOutData = context->getOutputBuffer<_Float16>(kOutputRoiTensor);
    convertFloat32ToFloat16(roiOut_float32, roiOutData);
    int32_t* batchesOutData = context->getOutputBuffer<int32_t>(kOutputBatchesTensor);
    for (uint32_t i = 0; i < batchesOut.size(); i++) {
        batchesOutData[i] = batchesOut[i];
    }
    return true;
}
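// Quantized entry point (uint8/int8 scores and deltas, quant16 anchors and image
// info): dequantizes every input to fp32, reuses the fp32 compute pass, then
// requantizes the score and ROI results with the output scales and offsets.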
template <typename T_8QInput>
bool generateProposalsQuant(const T_8QInput* scoresData, const Shape& scoresShape,
                            const T_8QInput* bboxDeltasData, const Shape& bboxDeltasShape,
                            const int16_t* anchorsData, const Shape& anchorsShape,
                            const uint16_t* imageInfoData, const Shape& imageInfoShape,
                            float heightStride, float widthStride, int32_t preNmsTopN,
                            int32_t postNmsTopN, float iouThreshold, float minSize, bool useNchw,
                            IOperationExecutionContext* context) {
    std::vector<float> score_float32(getNumberOfElements(scoresShape));
    convertQuantToFloat32<T_8QInput>(scoresData, scoresShape.scale, scoresShape.offset,
                                     &score_float32);
    std::vector<float> delta_float32(getNumberOfElements(bboxDeltasShape));
    convertQuantToFloat32<T_8QInput>(bboxDeltasData, bboxDeltasShape.scale, bboxDeltasShape.offset,
                                     &delta_float32);
    std::vector<float> anchors_float32(getNumberOfElements(anchorsShape));
    convertQuantToFloat32(anchorsData, anchorsShape.scale, anchorsShape.offset, &anchors_float32);
    std::vector<float> imageInfo_float32(getNumberOfElements(imageInfoShape));
    convertQuantToFloat32(imageInfoData, imageInfoShape.scale, imageInfoShape.offset,
                          &imageInfo_float32);
    std::vector<float> scoresOut_float32, roiOut_float32;
    std::vector<int32_t> batchesOut;
    NN_RET_CHECK(generateProposalsFloat32Compute(
            score_float32.data(), scoresShape, delta_float32.data(), bboxDeltasShape,
            anchors_float32.data(), anchorsShape, imageInfo_float32.data(), imageInfoShape,
            heightStride, widthStride, preNmsTopN, postNmsTopN, iouThreshold, minSize, useNchw,
            &scoresOut_float32, &roiOut_float32, &batchesOut));

    // Set output dimensions.
    uint32_t numOutRois = scoresOut_float32.size();
    if (numOutRois == 0) return true;
    Shape scoresOutShape = context->getOutputShape(kOutputScoreTensor);
    scoresOutShape.dimensions = {numOutRois};
    NN_RET_CHECK(context->setOutputShape(kOutputScoreTensor, scoresOutShape));
    Shape roiOutShape = context->getOutputShape(kOutputRoiTensor);
    roiOutShape.dimensions = {numOutRois, 4};
    NN_RET_CHECK(context->setOutputShape(kOutputRoiTensor, roiOutShape));
    Shape batchesOutShape = context->getOutputShape(kOutputBatchesTensor);
    batchesOutShape.dimensions = {numOutRois};
    NN_RET_CHECK(context->setOutputShape(kOutputBatchesTensor, batchesOutShape));

    // Write outputs.
    T_8QInput* scoresOutData = context->getOutputBuffer<T_8QInput>(kOutputScoreTensor);
    convertFloat32ToQuant<T_8QInput>(scoresOut_float32, scoresOutShape.scale, scoresOutShape.offset,
                                     scoresOutData);
    uint16_t* roiOutData = context->getOutputBuffer<uint16_t>(kOutputRoiTensor);
    convertFloat32ToQuant(roiOut_float32, roiOutShape.scale, roiOutShape.offset, roiOutData);
    int32_t* batchesOutData = context->getOutputBuffer<int32_t>(kOutputBatchesTensor);
    for (uint32_t i = 0; i < batchesOut.size(); i++) {
        batchesOutData[i] = batchesOut[i];
    }
    return true;
}

}  // namespace
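// Checks the operand counts and per-type operand signatures; the signed quantized
// variant requires HAL version 1.3, all other variants require 1.2.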
bool validate(const IOperationValidationContext* context) {
    NN_RET_CHECK_EQ(context->getNumInputs(), kNumInputs);
    NN_RET_CHECK_EQ(context->getNumOutputs(), kNumOutputs);
    std::vector<OperandType> inExpectedTypes;
    std::vector<OperandType> outExpectedTypes;
    auto inputType = context->getInputType(kScoreTensor);
    if (inputType == OperandType::TENSOR_FLOAT16) {
        inExpectedTypes = {OperandType::TENSOR_FLOAT16,
                           OperandType::TENSOR_FLOAT16,
                           OperandType::TENSOR_FLOAT16,
                           OperandType::TENSOR_FLOAT16,
                           OperandType::FLOAT16,
                           OperandType::FLOAT16,
                           OperandType::INT32,
                           OperandType::INT32,
                           OperandType::FLOAT16,
                           OperandType::FLOAT16,
                           OperandType::BOOL};
        outExpectedTypes = {OperandType::TENSOR_FLOAT16, OperandType::TENSOR_FLOAT16,
                            OperandType::TENSOR_INT32};
    } else if (inputType == OperandType::TENSOR_FLOAT32) {
        inExpectedTypes = {OperandType::TENSOR_FLOAT32,
                           OperandType::TENSOR_FLOAT32,
                           OperandType::TENSOR_FLOAT32,
                           OperandType::TENSOR_FLOAT32,
                           OperandType::FLOAT32,
                           OperandType::FLOAT32,
                           OperandType::INT32,
                           OperandType::INT32,
                           OperandType::FLOAT32,
                           OperandType::FLOAT32,
                           OperandType::BOOL};
        outExpectedTypes = {OperandType::TENSOR_FLOAT32, OperandType::TENSOR_FLOAT32,
                            OperandType::TENSOR_INT32};
    } else if (inputType == OperandType::TENSOR_QUANT8_ASYMM ||
               inputType == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
        inExpectedTypes = {inputType,
                           inputType,
                           OperandType::TENSOR_QUANT16_SYMM,
                           OperandType::TENSOR_QUANT16_ASYMM,
                           OperandType::FLOAT32,
                           OperandType::FLOAT32,
                           OperandType::INT32,
                           OperandType::INT32,
                           OperandType::FLOAT32,
                           OperandType::FLOAT32,
                           OperandType::BOOL};
        outExpectedTypes = {inputType, OperandType::TENSOR_QUANT16_ASYMM,
                            OperandType::TENSOR_INT32};
    } else {
        NN_RET_CHECK_FAIL() << "Unsupported tensor type for operation " << kOperationName;
    }
    NN_RET_CHECK(validateInputTypes(context, inExpectedTypes));
    NN_RET_CHECK(validateOutputTypes(context, outExpectedTypes));
    if (inputType == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
        return validateHalVersion(context, HalVersion::V1_3);
    } else {
        return validateHalVersion(context, HalVersion::V1_2);
    }
}
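// Validates the input ranks and cross-tensor dimensions (honoring the NCHW/NHWC
// layout flag) and sets zero-sized output shapes; the real sizes are only known
// after NMS, so execute() resizes the outputs once the proposals are computed.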
bool prepare(IOperationExecutionContext* context) {
    bool useNchw = context->getInputValue<bool>(kLayoutScalar);
    Shape scoreShape = context->getInputShape(kScoreTensor);
    Shape bboxDeltasShape = context->getInputShape(kDeltaTensor);
    Shape anchorsShape = context->getInputShape(kAnchorTensor);
    Shape imageInfoDataShape = context->getInputShape(kImageInfoTensor);
    Shape outputScoreShape = context->getOutputShape(kOutputScoreTensor);
    Shape outputRoiShape = context->getOutputShape(kOutputRoiTensor);
    Shape outputBatchSplitShape = context->getOutputShape(kOutputBatchesTensor);

    NN_RET_CHECK_EQ(getNumberOfDimensions(scoreShape), 4);
    NN_RET_CHECK_EQ(getNumberOfDimensions(bboxDeltasShape), 4);
    NN_RET_CHECK_EQ(getNumberOfDimensions(anchorsShape), 2);
    NN_RET_CHECK_EQ(getNumberOfDimensions(imageInfoDataShape), 2);

    const uint32_t kRoiDim = 4;
    uint32_t numBatches = getSizeOfDimension(scoreShape, 0);
    uint32_t height = getSizeOfDimension(scoreShape, useNchw ? 2 : 1);
    uint32_t width = getSizeOfDimension(scoreShape, useNchw ? 3 : 2);
    uint32_t numAnchors = getSizeOfDimension(scoreShape, useNchw ? 1 : 3);

    NN_RET_CHECK_EQ(getSizeOfDimension(bboxDeltasShape, 0), numBatches);
    NN_RET_CHECK_EQ(getSizeOfDimension(bboxDeltasShape, useNchw ? 2 : 1), height);
    NN_RET_CHECK_EQ(getSizeOfDimension(bboxDeltasShape, useNchw ? 3 : 2), width);
    NN_RET_CHECK_EQ(getSizeOfDimension(bboxDeltasShape, useNchw ? 1 : 3), numAnchors * kRoiDim);
    NN_RET_CHECK_EQ(getSizeOfDimension(imageInfoDataShape, 0), numBatches);
    NN_RET_CHECK_EQ(getSizeOfDimension(imageInfoDataShape, 1), 2);
    NN_RET_CHECK_EQ(getSizeOfDimension(anchorsShape, 0), numAnchors);
    NN_RET_CHECK_EQ(getSizeOfDimension(anchorsShape, 1), kRoiDim);

    if (scoreShape.type == OperandType::TENSOR_QUANT8_ASYMM) {
        NN_RET_CHECK_EQ(anchorsShape.scale, 0.125f);
        NN_RET_CHECK_EQ(imageInfoDataShape.scale, 0.125f);
        NN_RET_CHECK_EQ(imageInfoDataShape.offset, 0);
    }

    outputScoreShape.type = scoreShape.type;
    outputScoreShape.dimensions = {0};
    outputScoreShape.scale = scoreShape.scale;
    outputScoreShape.offset = scoreShape.offset;
    NN_RET_CHECK(context->setOutputShape(kOutputScoreTensor, outputScoreShape));

    outputRoiShape.dimensions = {0, 4};
    if (scoreShape.type == OperandType::TENSOR_QUANT8_ASYMM) {
        outputRoiShape.scale = 0.125f;
        outputRoiShape.offset = 0;
    }
    NN_RET_CHECK(context->setOutputShape(kOutputRoiTensor, outputRoiShape));

    outputBatchSplitShape.dimensions = {0};
    NN_RET_CHECK(context->setOutputShape(kOutputBatchesTensor, outputBatchSplitShape));
    return true;
}

bool execute(IOperationExecutionContext* context) {
    NNTRACE_TRANS("generateProposals");
    switch (context->getInputType(kScoreTensor)) {
        case OperandType::TENSOR_FLOAT16: {
            return generateProposalsFloat16(context->getInputBuffer<_Float16>(kScoreTensor),
                                            context->getInputShape(kScoreTensor),
                                            context->getInputBuffer<_Float16>(kDeltaTensor),
                                            context->getInputShape(kDeltaTensor),
                                            context->getInputBuffer<_Float16>(kAnchorTensor),
                                            context->getInputShape(kAnchorTensor),
                                            context->getInputBuffer<_Float16>(kImageInfoTensor),
                                            context->getInputShape(kImageInfoTensor),
                                            context->getInputValue<_Float16>(kHeightStrideScalar),
                                            context->getInputValue<_Float16>(kWidthStrideScalar),
                                            context->getInputValue<int32_t>(kPreNmsMaxScalar),
                                            context->getInputValue<int32_t>(kPostNmsMaxScalar),
                                            context->getInputValue<_Float16>(kIoUThresholdScalar),
                                            context->getInputValue<_Float16>(kMinSizeScalar),
                                            context->getInputValue<bool>(kLayoutScalar), context);
        }
        case OperandType::TENSOR_FLOAT32: {
            return generateProposalsFloat32(context->getInputBuffer<float>(kScoreTensor),
                                            context->getInputShape(kScoreTensor),
                                            context->getInputBuffer<float>(kDeltaTensor),
                                            context->getInputShape(kDeltaTensor),
                                            context->getInputBuffer<float>(kAnchorTensor),
                                            context->getInputShape(kAnchorTensor),
                                            context->getInputBuffer<float>(kImageInfoTensor),
                                            context->getInputShape(kImageInfoTensor),
                                            context->getInputValue<float>(kHeightStrideScalar),
                                            context->getInputValue<float>(kWidthStrideScalar),
                                            context->getInputValue<int32_t>(kPreNmsMaxScalar),
                                            context->getInputValue<int32_t>(kPostNmsMaxScalar),
                                            context->getInputValue<float>(kIoUThresholdScalar),
                                            context->getInputValue<float>(kMinSizeScalar),
                                            context->getInputValue<bool>(kLayoutScalar), context);
        }
        case OperandType::TENSOR_QUANT8_ASYMM: {
            return generateProposalsQuant(context->getInputBuffer<uint8_t>(kScoreTensor),
                                          context->getInputShape(kScoreTensor),
                                          context->getInputBuffer<uint8_t>(kDeltaTensor),
                                          context->getInputShape(kDeltaTensor),
                                          context->getInputBuffer<int16_t>(kAnchorTensor),
                                          context->getInputShape(kAnchorTensor),
                                          context->getInputBuffer<uint16_t>(kImageInfoTensor),
                                          context->getInputShape(kImageInfoTensor),
                                          context->getInputValue<float>(kHeightStrideScalar),
                                          context->getInputValue<float>(kWidthStrideScalar),
                                          context->getInputValue<int32_t>(kPreNmsMaxScalar),
                                          context->getInputValue<int32_t>(kPostNmsMaxScalar),
                                          context->getInputValue<float>(kIoUThresholdScalar),
                                          context->getInputValue<float>(kMinSizeScalar),
                                          context->getInputValue<bool>(kLayoutScalar), context);
        }
        case OperandType::TENSOR_QUANT8_ASYMM_SIGNED: {
            return generateProposalsQuant(context->getInputBuffer<int8_t>(kScoreTensor),
                                          context->getInputShape(kScoreTensor),
                                          context->getInputBuffer<int8_t>(kDeltaTensor),
                                          context->getInputShape(kDeltaTensor),
                                          context->getInputBuffer<int16_t>(kAnchorTensor),
                                          context->getInputShape(kAnchorTensor),
                                          context->getInputBuffer<uint16_t>(kImageInfoTensor),
                                          context->getInputShape(kImageInfoTensor),
                                          context->getInputValue<float>(kHeightStrideScalar),
                                          context->getInputValue<float>(kWidthStrideScalar),
                                          context->getInputValue<int32_t>(kPreNmsMaxScalar),
                                          context->getInputValue<int32_t>(kPostNmsMaxScalar),
                                          context->getInputValue<float>(kIoUThresholdScalar),
                                          context->getInputValue<float>(kMinSizeScalar),
                                          context->getInputValue<bool>(kLayoutScalar), context);
        }
        default:
            NN_RET_CHECK_FAIL() << "Unsupported tensor type for operation " << kOperationName;
    }
}

}  // namespace generate_proposals
namespace detection_postprocess {

constexpr char kOperationName[] = "DETECTION_POSTPROCESS";

constexpr uint32_t kNumInputs = 14;
constexpr uint32_t kScoreTensor = 0;
constexpr uint32_t kDeltaTensor = 1;
constexpr uint32_t kAnchorTensor = 2;
constexpr uint32_t kScaleYScalar = 3;
constexpr uint32_t kScaleXScalar = 4;
constexpr uint32_t kScaleHScalar = 5;
constexpr uint32_t kScaleWScalar = 6;
constexpr uint32_t kUseRegularNmsScalar = 7;
constexpr uint32_t kMaxNumDetectionScalar = 8;
constexpr uint32_t kMaxClassesPerDetectionScalar = 9;
constexpr uint32_t kMaxNumDetectionPerClassScalar = 10;
constexpr uint32_t kScoreThresholdScalar = 11;
constexpr uint32_t kIoUThresholdScalar = 12;
constexpr uint32_t kIsBGInLabelScalar = 13;

constexpr uint32_t kNumOutputs = 4;
constexpr uint32_t kOutputScoreTensor = 0;
constexpr uint32_t kOutputRoiTensor = 1;
constexpr uint32_t kOutputClassTensor = 2;
constexpr uint32_t kOutputDetectionTensor = 3;

namespace {
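// Decodes each anchor-relative box delta into corner form and then selects boxes.
// With anchor = (yCtr, xCtr, h, w) and delta = (dy, dx, dh, dw), the decoded box is
//   centerY = yCtr + h * dy / scaleY,   centerX = xCtr + w * dx / scaleX,
//   height  = h * exp(dh / scaleH),     width   = w * exp(dw / scaleW),
// stored as {y1, x1, y2, x2}. Selection is either "regular" multi-class hard NMS
// (hardNmsMultiClass) or the fast path that keeps the best non-background class per
// anchor, thresholds the score, and runs single-class hard NMS. The number of
// detections written for each batch is stored in detectionOutData.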
bool detectionPostprocessFloat32(
        const float* scoreData, const Shape& scoreShape, const float* deltaData,
        const Shape& deltaShape, const float* anchorData, const Shape& anchorShape, float scaleY,
        float scaleX, float scaleH, float scaleW, bool useRegularNms, int32_t maxNumDetections,
        int32_t maxClassesPerDetection, int32_t maxNumDetectionsPerClass, float iouThreshold,
        float scoreThreshold, bool isBGInLabel, float* scoreOutData, const Shape& scoreOutShape,
        float* roiOutData, const Shape& roiOutShape, int32_t* classOutData,
        const Shape& classOutShape, int32_t* detectionOutData, const Shape& detectionOutShape) {
    const uint32_t kRoiDim = 4;
    uint32_t numBatches = getSizeOfDimension(scoreShape, 0);
    uint32_t numAnchors = getSizeOfDimension(scoreShape, 1);
    uint32_t numClasses = getSizeOfDimension(scoreShape, 2);
    uint32_t lengthBoxEncoding = getSizeOfDimension(deltaShape, 2);
    uint32_t numOutDetection = getSizeOfDimension(scoreOutShape, 1);

    memset(scoreOutData, 0, getNumberOfElements(scoreOutShape) * sizeof(float));
    memset(roiOutData, 0, getNumberOfElements(roiOutShape) * sizeof(float));
    memset(classOutData, 0, getNumberOfElements(classOutShape) * sizeof(int32_t));
    memset(detectionOutData, 0, getNumberOfElements(detectionOutShape) * sizeof(int32_t));

    const float* scoreBase = scoreData;
    const float* deltaBase = deltaData;
    float* scoreOutBase = scoreOutData;
    float* roiOutBase = roiOutData;
    int32_t* classOutBase = classOutData;
    std::vector<float> roiBuffer(numAnchors * kRoiDim);
    std::vector<float> scoreBuffer(numAnchors);
    for (uint32_t b = 0; b < numBatches; b++) {
        const float* anchorBase = anchorData;
        for (uint32_t a = 0; a < numAnchors; a++) {
            float yCtr = anchorBase[0] + anchorBase[2] * deltaBase[0] / scaleY;
            float xCtr = anchorBase[1] + anchorBase[3] * deltaBase[1] / scaleX;
            float hHalf = anchorBase[2] * std::exp(deltaBase[2] / scaleH) * 0.5f;
            float wHalf = anchorBase[3] * std::exp(deltaBase[3] / scaleW) * 0.5f;
            roiBuffer[a * kRoiDim] = yCtr - hHalf;
            roiBuffer[a * kRoiDim + 1] = xCtr - wHalf;
            roiBuffer[a * kRoiDim + 2] = yCtr + hHalf;
            roiBuffer[a * kRoiDim + 3] = xCtr + wHalf;
            anchorBase += kRoiDim;
            deltaBase += lengthBoxEncoding;
        }

        if (useRegularNms) {
            std::vector<uint32_t> select;
            box_with_nms_limit::hardNmsMultiClass(
                    scoreBase, numClasses, numAnchors, scoreThreshold, iouThreshold,
                    maxNumDetections, maxNumDetectionsPerClass,
                    [&roiBuffer, numClasses](uint32_t ind) {
                        return roiBuffer.data() + (ind / numClasses) * kRoiDim;
                    },
                    &select);
            for (uint32_t i = 0; i < select.size(); i++) {
                uint32_t ind = select[i];
                scoreOutBase[i] = scoreBase[ind];
                memcpy(roiOutBase + i * kRoiDim, &roiBuffer[(ind / numClasses) * kRoiDim],
                       kRoiDim * sizeof(float));
                classOutBase[i] = (ind % numClasses) - (isBGInLabel ? 0 : 1);
            }
            *detectionOutData++ = select.size();
        } else {
            uint32_t numOutClasses = std::min<uint32_t>(numClasses - 1, maxClassesPerDetection);
            std::vector<float> maxScores(numAnchors);
            for (uint32_t a = 0; a < numAnchors; a++) {
                maxScores[a] = *std::max_element(scoreBase + a * numClasses + 1,
                                                 scoreBase + (a + 1) * numClasses);
            }
            std::vector<uint32_t> select;
            for (uint32_t a = 0; a < numAnchors; a++) {
                if (maxScores[a] > scoreThreshold) {
                    select.push_back(a);
                }
            }
            uint32_t* selectEnd = box_with_nms_limit::hardNmsSingleClass(
                    maxScores.data(), iouThreshold, maxNumDetections,
                    [&roiBuffer](uint32_t ind) { return roiBuffer.data() + ind * kRoiDim; },
                    select.data(), select.size());
            select.resize(selectEnd - select.data());
            float* scoreOutPtr = scoreOutBase;
            float* roiOutPtr = roiOutBase;
            int32_t* classOutPtr = classOutBase;
            for (auto i : select) {
                const float* score = scoreBase + i * numClasses;
                std::vector<uint32_t> scoreInds(numClasses - 1);
                std::iota(scoreInds.begin(), scoreInds.end(), 1);
                std::sort(scoreInds.begin(), scoreInds.end(),
                          [&score](const uint32_t lhs, const uint32_t rhs) {
                              return score[lhs] > score[rhs];
                          });
                for (uint32_t c = 0; c < numOutClasses; c++) {
                    *scoreOutPtr++ = score[scoreInds[c]];
                    memcpy(roiOutPtr, &roiBuffer[i * kRoiDim], kRoiDim * sizeof(float));
                    roiOutPtr += kRoiDim;
                    *classOutPtr++ = scoreInds[c] - (isBGInLabel ? 0 : 1);
                }
            }
            *detectionOutData++ = select.size() * numOutClasses;
        }
        scoreBase += numAnchors * numClasses;
        scoreOutBase += numOutDetection;
        roiOutBase += numOutDetection * kRoiDim;
        classOutBase += numOutDetection;
    }
    return true;
}
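// Float16 variant: converts the fp16 inputs to fp32, runs the fp32 implementation,
// then converts the score and ROI outputs back to fp16 (class and detection-count
// outputs are already int32).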
bool detectionPostprocessFloat16(
        const _Float16* scoreData, const Shape& scoreShape, const _Float16* deltaData,
        const Shape& deltaShape, const _Float16* anchorData, const Shape& anchorShape, float scaleY,
        float scaleX, float scaleH, float scaleW, bool useRegularNms, int32_t maxNumDetections,
        int32_t maxClassesPerDetection, int32_t maxNumDetectionsPerClass, float iouThreshold,
        float scoreThreshold, bool isBGInLabel, _Float16* scoreOutData, const Shape& scoreOutShape,
        _Float16* roiOutData, const Shape& roiOutShape, int32_t* classOutData,
        const Shape& classOutShape, int32_t* detectionOutData, const Shape& detectionOutShape) {
    std::vector<float> scores_float32(getNumberOfElements(scoreShape));
    convertFloat16ToFloat32(scoreData, &scores_float32);
    std::vector<float> delta_float32(getNumberOfElements(deltaShape));
    convertFloat16ToFloat32(deltaData, &delta_float32);
    std::vector<float> anchor_float32(getNumberOfElements(anchorShape));
    convertFloat16ToFloat32(anchorData, &anchor_float32);
    std::vector<float> outputScore_float32(getNumberOfElements(scoreOutShape));
    std::vector<float> outputRoi_float32(getNumberOfElements(roiOutShape));
    NN_RET_CHECK(detectionPostprocessFloat32(
            scores_float32.data(), scoreShape, delta_float32.data(), deltaShape,
            anchor_float32.data(), anchorShape, scaleY, scaleX, scaleH, scaleW, useRegularNms,
            maxNumDetections, maxClassesPerDetection, maxNumDetectionsPerClass, iouThreshold,
            scoreThreshold, isBGInLabel, outputScore_float32.data(), scoreOutShape,
            outputRoi_float32.data(), roiOutShape, classOutData, classOutShape, detectionOutData,
            detectionOutShape));
    convertFloat32ToFloat16(outputScore_float32, scoreOutData);
    convertFloat32ToFloat16(outputRoi_float32, roiOutData);
    return true;
}

}  // namespace

bool validate(const IOperationValidationContext* context) {
    NN_RET_CHECK_EQ(context->getNumInputs(), kNumInputs);
    NN_RET_CHECK_EQ(context->getNumOutputs(), kNumOutputs);
    std::vector<OperandType> inExpectedTypes;
    std::vector<OperandType> outExpectedTypes;
    auto inputType = context->getInputType(kScoreTensor);
    if (inputType == OperandType::TENSOR_FLOAT16) {
        inExpectedTypes = {OperandType::TENSOR_FLOAT16, OperandType::TENSOR_FLOAT16,
                           OperandType::TENSOR_FLOAT16, OperandType::FLOAT16,
                           OperandType::FLOAT16,        OperandType::FLOAT16,
                           OperandType::FLOAT16,        OperandType::BOOL,
                           OperandType::INT32,          OperandType::INT32,
                           OperandType::INT32,          OperandType::FLOAT16,
                           OperandType::FLOAT16,        OperandType::BOOL};
    } else if (inputType == OperandType::TENSOR_FLOAT32) {
        inExpectedTypes = {OperandType::TENSOR_FLOAT32, OperandType::TENSOR_FLOAT32,
                           OperandType::TENSOR_FLOAT32, OperandType::FLOAT32,
                           OperandType::FLOAT32,        OperandType::FLOAT32,
                           OperandType::FLOAT32,        OperandType::BOOL,
                           OperandType::INT32,          OperandType::INT32,
                           OperandType::INT32,          OperandType::FLOAT32,
                           OperandType::FLOAT32,        OperandType::BOOL};
    } else {
        NN_RET_CHECK_FAIL() << "Unsupported tensor type for operation " << kOperationName;
    }
    NN_RET_CHECK(validateInputTypes(context, inExpectedTypes));
    NN_RET_CHECK(validateOutputTypes(
            context, {inputType, inputType, OperandType::TENSOR_INT32, OperandType::TENSOR_INT32}));
    return validateHalVersion(context, HalVersion::V1_2);
}
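// Checks the input ranks, ties the anchor/delta dimensions to the score tensor, and
// sizes the outputs: each batch produces maxNumDetections rows (multiplied by
// maxClassesPerDetection when the fast, non-regular NMS path is selected).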
bool prepare(IOperationExecutionContext* context) {
    Shape scoreShape = context->getInputShape(kScoreTensor);
    Shape deltasShape = context->getInputShape(kDeltaTensor);
    Shape anchorsShape = context->getInputShape(kAnchorTensor);
    Shape outputScoreShape = context->getOutputShape(kOutputScoreTensor);
    Shape outputRoiShape = context->getOutputShape(kOutputRoiTensor);
    Shape outputClassShape = context->getOutputShape(kOutputClassTensor);
    Shape outputDetectionShape = context->getOutputShape(kOutputDetectionTensor);

    NN_RET_CHECK_EQ(getNumberOfDimensions(scoreShape), 3);
    NN_RET_CHECK_EQ(getNumberOfDimensions(deltasShape), 3);
    NN_RET_CHECK_EQ(getNumberOfDimensions(anchorsShape), 2);

    const uint32_t kRoiDim = 4;
    uint32_t numBatches = getSizeOfDimension(scoreShape, 0);
    uint32_t numAnchors = getSizeOfDimension(scoreShape, 1);
    uint32_t numClasses = getSizeOfDimension(scoreShape, 2);
    uint32_t lengthBoxEncoding = getSizeOfDimension(deltasShape, 2);
    uint32_t maxNumDetections = context->getInputValue<int32_t>(kMaxNumDetectionScalar);
    uint32_t maxClassesPerDetection =
            context->getInputValue<int32_t>(kMaxClassesPerDetectionScalar);
    uint32_t numOutDetections = maxNumDetections;

    NN_RET_CHECK_EQ(getSizeOfDimension(deltasShape, 0), numBatches);
    NN_RET_CHECK_EQ(getSizeOfDimension(deltasShape, 1), numAnchors);
    NN_RET_CHECK_EQ(getSizeOfDimension(anchorsShape, 0), numAnchors);
    NN_RET_CHECK_EQ(getSizeOfDimension(anchorsShape, 1), kRoiDim);

    if (scoreShape.type == OperandType::TENSOR_FLOAT32) {
        NN_RET_CHECK_GT(context->getInputValue<float>(kScaleYScalar), 0);
        NN_RET_CHECK_GT(context->getInputValue<float>(kScaleXScalar), 0);
        NN_RET_CHECK_GT(context->getInputValue<float>(kScaleHScalar), 0);
        NN_RET_CHECK_GT(context->getInputValue<float>(kScaleWScalar), 0);
        NN_RET_CHECK_GE(context->getInputValue<float>(kScoreThresholdScalar), 0);
        NN_RET_CHECK_GE(context->getInputValue<float>(kIoUThresholdScalar), 0);
    } else if (scoreShape.type == OperandType::TENSOR_FLOAT16) {
        NN_RET_CHECK(context->getInputValue<_Float16>(kScaleYScalar) > 0);
        NN_RET_CHECK(context->getInputValue<_Float16>(kScaleXScalar) > 0);
        NN_RET_CHECK(context->getInputValue<_Float16>(kScaleHScalar) > 0);
        NN_RET_CHECK(context->getInputValue<_Float16>(kScaleWScalar) > 0);
        NN_RET_CHECK(context->getInputValue<_Float16>(kScoreThresholdScalar) >= 0);
        NN_RET_CHECK(context->getInputValue<_Float16>(kIoUThresholdScalar) >= 0);
    } else {
        NN_RET_CHECK_FAIL() << "Unsupported tensor type for operation " << kOperationName;
    }
    NN_RET_CHECK_GT(numClasses, 1);
    NN_RET_CHECK_GE(lengthBoxEncoding, 4);
    NN_RET_CHECK_GT(maxNumDetections, 0);
    if (context->getInputValue<bool>(kUseRegularNmsScalar)) {
        NN_RET_CHECK_GT(context->getInputValue<int32_t>(kMaxNumDetectionPerClassScalar), 0);
    } else {
        NN_RET_CHECK_GT(maxClassesPerDetection, 0);
        numOutDetections *= maxClassesPerDetection;
    }

    outputScoreShape.type = scoreShape.type;
    outputScoreShape.dimensions = {numBatches, numOutDetections};
    NN_RET_CHECK(context->setOutputShape(kOutputScoreTensor, outputScoreShape));

    outputRoiShape.type = anchorsShape.type;
    outputRoiShape.dimensions = {numBatches, numOutDetections, 4};
    NN_RET_CHECK(context->setOutputShape(kOutputRoiTensor, outputRoiShape));

    outputClassShape.type = OperandType::TENSOR_INT32;
    outputClassShape.dimensions = {numBatches, numOutDetections};
    NN_RET_CHECK(context->setOutputShape(kOutputClassTensor, outputClassShape));

    outputDetectionShape.type = OperandType::TENSOR_INT32;
    outputDetectionShape.dimensions = {numBatches};
    NN_RET_CHECK(context->setOutputShape(kOutputDetectionTensor, outputDetectionShape));
    return true;
}

bool execute(IOperationExecutionContext* context) {
    NNTRACE_TRANS("detectionPostProcess");
    switch (context->getInputType(kScoreTensor)) {
        case OperandType::TENSOR_FLOAT16: {
            return detectionPostprocessFloat16(
                    context->getInputBuffer<_Float16>(kScoreTensor),
                    context->getInputShape(kScoreTensor),
                    context->getInputBuffer<_Float16>(kDeltaTensor),
                    context->getInputShape(kDeltaTensor),
                    context->getInputBuffer<_Float16>(kAnchorTensor),
                    context->getInputShape(kAnchorTensor),
                    context->getInputValue<_Float16>(kScaleYScalar),
                    context->getInputValue<_Float16>(kScaleXScalar),
                    context->getInputValue<_Float16>(kScaleHScalar),
                    context->getInputValue<_Float16>(kScaleWScalar),
                    context->getInputValue<bool>(kUseRegularNmsScalar),
                    context->getInputValue<int32_t>(kMaxNumDetectionScalar),
                    context->getInputValue<int32_t>(kMaxClassesPerDetectionScalar),
                    context->getInputValue<int32_t>(kMaxNumDetectionPerClassScalar),
                    context->getInputValue<_Float16>(kIoUThresholdScalar),
                    context->getInputValue<_Float16>(kScoreThresholdScalar),
                    context->getInputValue<bool>(kIsBGInLabelScalar),
                    context->getOutputBuffer<_Float16>(kOutputScoreTensor),
                    context->getOutputShape(kOutputScoreTensor),
                    context->getOutputBuffer<_Float16>(kOutputRoiTensor),
                    context->getOutputShape(kOutputRoiTensor),
                    context->getOutputBuffer<int32_t>(kOutputClassTensor),
                    context->getOutputShape(kOutputClassTensor),
                    context->getOutputBuffer<int32_t>(kOutputDetectionTensor),
                    context->getOutputShape(kOutputDetectionTensor));
        }
        case OperandType::TENSOR_FLOAT32: {
            return detectionPostprocessFloat32(
                    context->getInputBuffer<float>(kScoreTensor),
                    context->getInputShape(kScoreTensor),
                    context->getInputBuffer<float>(kDeltaTensor),
                    context->getInputShape(kDeltaTensor),
                    context->getInputBuffer<float>(kAnchorTensor),
                    context->getInputShape(kAnchorTensor),
                    context->getInputValue<float>(kScaleYScalar),
                    context->getInputValue<float>(kScaleXScalar),
                    context->getInputValue<float>(kScaleHScalar),
                    context->getInputValue<float>(kScaleWScalar),
                    context->getInputValue<bool>(kUseRegularNmsScalar),
                    context->getInputValue<int32_t>(kMaxNumDetectionScalar),
                    context->getInputValue<int32_t>(kMaxClassesPerDetectionScalar),
                    context->getInputValue<int32_t>(kMaxNumDetectionPerClassScalar),
                    context->getInputValue<float>(kIoUThresholdScalar),
                    context->getInputValue<float>(kScoreThresholdScalar),
                    context->getInputValue<bool>(kIsBGInLabelScalar),
                    context->getOutputBuffer<float>(kOutputScoreTensor),
                    context->getOutputShape(kOutputScoreTensor),
                    context->getOutputBuffer<float>(kOutputRoiTensor),
                    context->getOutputShape(kOutputRoiTensor),
                    context->getOutputBuffer<int32_t>(kOutputClassTensor),
                    context->getOutputShape(kOutputClassTensor),
                    context->getOutputBuffer<int32_t>(kOutputDetectionTensor),
                    context->getOutputShape(kOutputDetectionTensor));
        }
        default:
            NN_RET_CHECK_FAIL() << "Unsupported tensor type for operation " << kOperationName;
    }
}

}  // namespace detection_postprocess

}  // namespace bbox_ops

NN_REGISTER_OPERATION(AXIS_ALIGNED_BBOX_TRANSFORM,
                      bbox_ops::axis_aligned_bbox_transform::kOperationName,
                      bbox_ops::axis_aligned_bbox_transform::validate,
                      bbox_ops::axis_aligned_bbox_transform::prepare,
                      bbox_ops::axis_aligned_bbox_transform::execute, .allowZeroSizedInput = true);

NN_REGISTER_OPERATION(BOX_WITH_NMS_LIMIT, bbox_ops::box_with_nms_limit::kOperationName,
                      bbox_ops::box_with_nms_limit::validate, bbox_ops::box_with_nms_limit::prepare,
                      bbox_ops::box_with_nms_limit::execute, .allowZeroSizedInput = true);

NN_REGISTER_OPERATION(GENERATE_PROPOSALS, bbox_ops::generate_proposals::kOperationName,
                      bbox_ops::generate_proposals::validate, bbox_ops::generate_proposals::prepare,
                      bbox_ops::generate_proposals::execute);

NN_REGISTER_OPERATION(DETECTION_POSTPROCESSING, bbox_ops::detection_postprocess::kOperationName,
                      bbox_ops::detection_postprocess::validate,
                      bbox_ops::detection_postprocess::prepare,
                      bbox_ops::detection_postprocess::execute);
}  // namespace nn
}  // namespace android