/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef ANDROID_FRAMEWORKS_ML_NN_COMMON_CPU_EXECUTOR_H
#define ANDROID_FRAMEWORKS_ML_NN_COMMON_CPU_EXECUTOR_H

#include <android-base/macros.h>

#include <algorithm>
#include <memory>
#include <optional>
#include <vector>

#include "ControlFlow.h"
#include "HalInterfaces.h"
#include "OperationResolver.h"
#include "OperationsUtils.h"
#include "Utils.h"

namespace android {
namespace nn {

// Information about each operand that we maintain during execution,
// and that may change as execution proceeds.
struct RunTimeOperandInfo {
    // TODO Storing the type here is redundant, as it won't change during execution.
    hal::OperandType type;
    // The type and dimensions of the operand.  The dimensions can
    // change at runtime.  We include the type because it's useful
    // to pass together with the dimensions to the functions implementing
    // the operators.
    //
    // A dimension being zero has different meanings for different operands at different stages:
    // - Model inputs:
    //   * Specified in model: implies "dynamic", and must be fully-specified in the request.
    //   * Specified in request: illegal.
    // - Constant operands: illegal.
    // - Model outputs and internal operands:
    //   * Before evaluation: implies the size is unknown, to be deduced during execution.
    //   * After evaluation:
    //     - If isSufficient reports true: the tensor is zero-sized.
    //     - Otherwise: the size is still unknown.
    std::vector<uint32_t> dimensions;

    float scale;
    int32_t zeroPoint;
    // Where the operand's data is stored.  Check the corresponding
    // location information in the model to figure out if this points
    // to memory we have allocated for a temporary operand.
    uint8_t* buffer;  // TODO(b/148273353): Change the type to void*.
    // The length of the buffer.
    uint32_t length;
    // Whether this is a temporary variable, a model input, a constant, etc.
    hal::OperandLifeTime lifetime;
    // Keeps track of how many operations have yet to make use
    // of this temporary variable.  When the count is decremented to 0,
    // we free the buffer.  For non-temporary variables, this count is
    // always 0.
    uint32_t numberOfUsesLeft;

    hal::OperandExtraParams extraParams;

    Shape shape() const {
        return {
                .type = type,
                .dimensions = dimensions,
                .scale = scale,
                .offset = zeroPoint,
                .extraParams = extraParams,
        };
    }

    bool isSufficient() const {
        if (isExtensionOperandType(type)) {
            // We don't know sizes of extension types.
            return true;
        }
        return length >= nonExtensionOperandSizeOfData(type, dimensions);
    }
};
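
// Illustrative sketch (not part of this header): operation implementations
// typically consume a RunTimeOperandInfo through its Shape. "operand" is a
// hypothetical RunTimeOperandInfo obtained during execution.
//
//     const RunTimeOperandInfo& operand = ...;
//     Shape shape = operand.shape();  // type, dimensions, and quantization params
//     if (!operand.isSufficient()) {
//         // operand.buffer is too small for the data implied by the shape.
//     }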

// Used to keep a pointer to each of the memory pools.
//
// RunTimePoolInfo references a region of memory. Other RunTimePoolInfo objects
// may reference the same region of memory by either:
// (1) copying an existing RunTimePoolInfo object, or
// (2) creating multiple RunTimePoolInfo objects from the same memory resource
//     (e.g., "createFromHidlMemory" or "createFromExistingBuffer").
//
// If the underlying region of memory is mapped by "createFromHidlMemory", the
// mapping will be sustained until it is no longer referenced by any
// RunTimePoolInfo objects.
class RunTimePoolInfo {
   public:
    static std::optional<RunTimePoolInfo> createFromHidlMemory(const hal::hidl_memory& hidlMemory);
    static RunTimePoolInfo createFromExistingBuffer(uint8_t* buffer, uint32_t size = 0);

    uint8_t* getBuffer() const;
    bool flush() const;
    const hal::hidl_memory& getHidlMemory() const;
    uint32_t getSize() const;

   private:
    class RunTimePoolInfoImpl;
    RunTimePoolInfo(const std::shared_ptr<const RunTimePoolInfoImpl>& impl);

    std::shared_ptr<const RunTimePoolInfoImpl> mImpl;
};
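
// Illustrative sketch (not part of this header): mapping a memory pool and
// sharing the mapping. "hidlMemory" is a hypothetical hal::hidl_memory.
//
//     std::optional<RunTimePoolInfo> pool =
//             RunTimePoolInfo::createFromHidlMemory(hidlMemory);
//     if (pool.has_value()) {
//         RunTimePoolInfo copy = *pool;      // references the same region
//         uint8_t* data = copy.getBuffer();
//         // ... write through "data" ...
//         copy.flush();                      // write back, if needed
//     }  // the mapping is released once no RunTimePoolInfo references it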

bool setRunTimePoolInfosFromHidlMemories(std::vector<RunTimePoolInfo>* poolInfos,
                                         const hal::hidl_vec<hal::hidl_memory>& pools);

bool setRunTimePoolInfosFromMemoryPools(std::vector<RunTimePoolInfo>* poolInfos,
                                        const hal::hidl_vec<hal::Request::MemoryPool>& pools);

// This class is used to execute a model on the CPU.
class CpuExecutor {
   public:
    // This constructor allows clients of CpuExecutor to provide custom CPU
    // operation implementations. It is used by a sample driver to test
    // extension support.
    //
    // Note that it is not possible to provide custom CPU implementations for
    // non-OperationResolver operations (b/124041202).
    //
    // The operation resolver must outlive the executor.
    explicit CpuExecutor(const IOperationResolver* operationResolver)
        : mOperationResolver(operationResolver) {}

    CpuExecutor() : CpuExecutor(BuiltinOperationResolver::get()) {}

    // Executes the model. The results will be stored at the locations
    // specified in the request.
    // The model must outlive the executor.  We prevent it from being modified
    // while this is executing.
    int run(const hal::Model& model, const hal::Request& request,
            const std::vector<RunTimePoolInfo>& modelPoolInfos,
            const std::vector<RunTimePoolInfo>& requestPoolInfos);

    const std::vector<hal::OutputShape>& getOutputShapes() const {
        CHECK(mFinished) << "getOutputShapes() called on an unfinished CpuExecutor.";
        return mOutputShapes;
    }

    void setDeadline(const Deadline& deadline) { mDeadline = deadline; }
    void setLoopTimeout(uint64_t duration) { mLoopTimeoutDuration = duration; }

   private:
    // Creates runtime info from what's in the model.
    std::vector<RunTimeOperandInfo> initializeRunTimeInfo(const hal::Subgraph& subgraph);
    // Adjusts the runtime info for the arguments passed to the model,
    // modifying the buffer location, and possibly the dimensions.
    void updateForArguments(const std::vector<uint32_t>& indexes,
                            const hal::hidl_vec<hal::RequestArgument>& arguments,
                            const std::vector<RunTimePoolInfo>& requestPoolInfos,
                            RunTimeOperandInfo* operands);
    // Runs one subgraph.
    int executeSubgraph(const hal::Subgraph& subgraph, RunTimeOperandInfo* operands);
    // Runs one operation of the graph.
    int executeOperation(const hal::Operation& operation, RunTimeOperandInfo* operands);
    int executeIfOperation(const hal::Operation& operation, RunTimeOperandInfo* operands);
    int executeWhileOperation(const hal::Operation& operation, RunTimeOperandInfo* operands);

    void setOutputShapes(const std::vector<uint32_t>& outputIndexes,
                         const std::vector<RunTimeOperandInfo>& operands);

    // Compile-time operand value information used by initializeRunTimeInfo.
    // The fields are only valid while run() is being executed.
    const hal::hidl_vec<uint8_t>* mModelOperandValues = nullptr;
    const std::vector<RunTimePoolInfo>* mModelPoolInfos = nullptr;
    const hal::hidl_vec<hal::Subgraph>* mReferencedSubgraphs = nullptr;

    // The output operand shapes returned to the runtime.
    std::vector<hal::OutputShape> mOutputShapes;

    // Whether execution is finished and mOutputShapes is ready.
    bool mFinished = false;

    // A deadline hint for the maximum amount of time the client expects the
    // execution to take. If this deadline is exceeded, the CpuExecutor will
    // abort the execution if there are remaining ops to execute.
    std::optional<Deadline> mDeadline;

    // The maximum amount of time in nanoseconds that can be spent executing a
    // WHILE loop.
    uint64_t mLoopTimeoutDuration = operation_while::kTimeoutNsDefault;

    const IOperationResolver* mOperationResolver;
};
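
// Illustrative sketch (not part of this header) of a typical driver-side
// execution flow. "model" and "request" are hypothetical HAL objects obtained
// elsewhere, and the pool fields are assumed to match the setters above.
//
//     std::vector<RunTimePoolInfo> modelPoolInfos;
//     std::vector<RunTimePoolInfo> requestPoolInfos;
//     if (!setRunTimePoolInfosFromHidlMemories(&modelPoolInfos, model.pools) ||
//         !setRunTimePoolInfosFromMemoryPools(&requestPoolInfos, request.pools)) {
//         return;  // failed to map a pool
//     }
//     CpuExecutor executor;
//     const int n = executor.run(model, request, modelPoolInfos, requestPoolInfos);
//     if (n == ANEURALNETWORKS_NO_ERROR) {
//         const std::vector<hal::OutputShape>& outputShapes = executor.getOutputShapes();
//         // ... report outputShapes back to the runtime ...
//     }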

// Class for setting reasonable OpenMP threading settings. (OpenMP is used by
// the Eigen matrix library.)
//
// Currently sets a low blocktime: the time OpenMP threads busy-wait for more
// work before going to sleep. See b/79159165, https://reviews.llvm.org/D18577.
// The default is 200ms; we set it to 20ms here (see b/109645291). This keeps
// the cores enabled throughout inference computation without too much extra
// power consumption afterwards.
//
// The OpenMP settings are thread-local (applying only to worker threads formed
// from that thread), see https://software.intel.com/en-us/node/522688 and
// http://lists.llvm.org/pipermail/openmp-dev/2016-July/001432.html. This class
// ensures that within the scope in which an object is instantiated we use the
// right settings (scopes may be nested), as long as no other library changes
// them.  (Note that in current NNAPI usage only one instance is used in the
// CpuExecutor thread.)
//
// TODO(mikie): consider also setting the number of threads used. Using as many
// threads as there are cores results in more variable performance: if we don't
// get all cores for our threads, the latency is doubled as we wait for one core
// to do twice the amount of work. Reality is complicated though, as not all
// cores are the same. The decision should be based on benchmarking against a
// representative set of workloads and devices. I'm keeping the code here for
// reference.
// b/109953668, disable OpenMP
#ifdef NNAPI_OPENMP
class ScopedOpenmpSettings {
   public:
    ScopedOpenmpSettings();
    ~ScopedOpenmpSettings();
    DISALLOW_COPY_AND_ASSIGN(ScopedOpenmpSettings);

   private:
    int mBlocktimeInitial;
#if NNAPI_LIMIT_CPU_THREADS
    int mMaxThreadsInitial;
#endif
};
#endif  // NNAPI_OPENMP
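
// Illustrative sketch (not part of this header): the settings apply for the
// lifetime of the scoped object, typically spanning a call to run().
//
//     #ifdef NNAPI_OPENMP
//         ScopedOpenmpSettings openMpSettings;  // lowers the OpenMP blocktime
//     #endif
//         executor.run(model, request, modelPoolInfos, requestPoolInfos);
//     // The previous OpenMP settings are restored when the object goes out of scope.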

namespace {

template <typename T>
T getScalarData(const RunTimeOperandInfo& info) {
    CHECK_GE(info.length, sizeof(T)) << "Cannot get scalar data: buffer too short";
    T* data = reinterpret_cast<T*>(info.buffer);
    return data[0];
}

template <typename T>
T getScalarDataWithDefault(const RunTimeOperandInfo& info, T defaultValue) {
    if (info.length < sizeof(T)) {
        return defaultValue;
    }
    return getScalarData<T>(info);
}
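
// Illustrative sketch (not part of this header): reading scalar operands in
// an operation implementation. The operand indices are hypothetical.
//
//     const int32_t axis = getScalarData<int32_t>(operands[operation.inputs[1]]);
//     // An optional trailing input falls back to a default when absent
//     // (an omitted operand has length 0):
//     const int32_t padding = getScalarDataWithDefault<int32_t>(
//             operands[operation.inputs[2]], /*defaultValue=*/0);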

inline bool IsNullInput(const RunTimeOperandInfo* input) {
    return input->lifetime == hal::OperandLifeTime::NO_VALUE;
}

inline int NumInputsWithValues(const hal::Operation& operation,
                               const RunTimeOperandInfo* operands) {
    const std::vector<uint32_t>& inputs = operation.inputs;
    return std::count_if(inputs.begin(), inputs.end(),
                         [&operands](uint32_t i) { return !IsNullInput(&operands[i]); });
}

inline int NumOutputs(const hal::Operation& operation) {
    return operation.outputs.size();
}

inline size_t NumDimensions(const RunTimeOperandInfo* operand) {
    return operand->shape().dimensions.size();
}

inline uint32_t SizeOfDimension(const RunTimeOperandInfo* operand, int i) {
    return operand->shape().dimensions[i];
}

inline RunTimeOperandInfo* GetInput(const hal::Operation& operation, RunTimeOperandInfo* operands,
                                    int index) {
    return &operands[operation.inputs[index]];
}

inline RunTimeOperandInfo* GetOutput(const hal::Operation& operation, RunTimeOperandInfo* operands,
                                     int index) {
    return &operands[operation.outputs[index]];
}
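
// Illustrative sketch (not part of this header): a typical operation body
// built from the helpers above. The operand indices are hypothetical.
//
//     const RunTimeOperandInfo* input = GetInput(operation, operands, 0);
//     RunTimeOperandInfo* output = GetOutput(operation, operands, 0);
//     if (IsNullInput(input) || NumOutputs(operation) != 1) {
//         return ANEURALNETWORKS_BAD_DATA;
//     }
//     const size_t rank = NumDimensions(input);        // e.g. 4 for NHWC
//     const uint32_t batches = SizeOfDimension(input, 0);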

}  // anonymous namespace

}  // namespace nn
}  // namespace android

#endif  // ANDROID_FRAMEWORKS_ML_NN_COMMON_CPU_EXECUTOR_H