/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef ANDROID_FRAMEWORKS_ML_NN_RUNTIME_EXECUTION_BUILDER_H
#define ANDROID_FRAMEWORKS_ML_NN_RUNTIME_EXECUTION_BUILDER_H

#include <ControlFlow.h>
#include <CpuExecutor.h>
#include <android-base/thread_annotations.h>
#include <nnapi/IBurst.h>
#include <nnapi/IPreparedModel.h>
#include <nnapi/Types.h>
#include <nnapi/Validation.h>

#include <memory>
#include <mutex>
#include <optional>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

#include "ExecutionCallback.h"
#include "Memory.h"
#include "ModelArgumentInfo.h"
#include "ModelBuilder.h"
#include "NeuralNetworks.h"

namespace android {
namespace nn {

class BurstBuilder;
class CompilationBuilder;
class Device;
class DynamicTemporaries;
class ExecutionPlan;
class ExecutionStep;
class ModelBuilder;
class RuntimeMemory;
class RuntimePreparedModel;
class RuntimeExecution;
class StepExecutor;

class ExecutionBuilder {
    friend class StepExecutor;

   public:
    explicit ExecutionBuilder(const CompilationBuilder* compilation);
    virtual ~ExecutionBuilder() = default;

    int setInput(uint32_t index, const ANeuralNetworksOperandType* type, const void* buffer,
                 size_t length);
    int setInputFromMemory(uint32_t index, const ANeuralNetworksOperandType* type,
                           const RuntimeMemory* memory, size_t offset, size_t length);
    int setOutput(uint32_t index, const ANeuralNetworksOperandType* type, void* buffer,
                  size_t length);
    int setOutputFromMemory(uint32_t index, const ANeuralNetworksOperandType* type,
                            const RuntimeMemory* memory, size_t offset, size_t length);
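
    // Illustrative usage sketch (not part of the original header; "execution",
    // "input", and "memory" are hypothetical). These setters back the
    // ANeuralNetworksExecution_set{Input,Output}[FromMemory] NNAPI entry
    // points; type may be nullptr when the operand's type was fully specified
    // at model-build time.
    //
    //   float input[4] = {0, 1, 2, 3};
    //   execution->setInput(0, /*type=*/nullptr, input, sizeof(input));
    //   execution->setOutputFromMemory(0, /*type=*/nullptr, memory,
    //                                  /*offset=*/0, /*length=*/4 * sizeof(float));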

    int setMeasureTiming(bool measure);

    int getDuration(int32_t durationCode, uint64_t* duration) const;

    int setTimeoutDuration(uint64_t duration);

    std::optional<uint64_t> getTimeoutDuration() const;

    int setLoopTimeout(uint64_t duration);

    uint64_t getLoopTimeoutDuration() const { return mLoopTimeoutDuration; }

    int enableInputAndOutputPadding(bool enable);

    int setReusable(bool reusable);

    int computeFenced(const std::vector<int>& wait_for, uint64_t timeoutDurationAfterFence,
                      int* sync_fence);

    int computeAsynchronously(std::shared_ptr<ExecutionCallback>* synchronizationCallback) {
        CHECK(synchronizationCallback != nullptr);
        return compute(synchronizationCallback);
    }
    int computeSynchronously() { return compute(nullptr); }
    int burstCompute(BurstBuilder* burst) { return compute(nullptr, burst); }
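
    // Illustrative sketch (not part of the original header; names below are
    // hypothetical): all computation is launched through one of the four paths
    // above. The first three funnel into compute(); computeFenced() is the
    // fenced path.
    //
    //   std::shared_ptr<ExecutionCallback> callback;
    //   execution->computeAsynchronously(&callback);  // returns early; wait on callback
    //   execution->computeSynchronously();            // blocks until completion
    //   execution->burstCompute(burst);               // synchronous, reusing burst resources
    //   int syncFence = -1;
    //   execution->computeFenced(waitFds, /*timeoutDurationAfterFence=*/0, &syncFence);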

    // Initialize output dimensional information from ModelArgumentInfo.
    std::vector<OutputShape> getInitialOutputShapes() const;

    int getOutputOperandDimensions(uint32_t index, uint32_t* dimensions);
    int getOutputOperandRank(uint32_t index, uint32_t* rank);

    // Handshake with lower-level execution support
    bool measureTiming() const { return mMeasureTiming; }
    void reportTimingWithoutFencedExecutionCallback(Timing timing) {
        mTimingWithoutFencedExecutionCallback = timing;
    }

    const CompilationBuilder* getCompilation() const { return mCompilation; }
    const ModelBuilder* getModel() const { return mModel; }
    const ModelBuilder* getSourceModel(uint32_t index) const;
    const Operand& getSourceOperand(const std::pair<uint32_t, uint32_t>& sourceOperandIndex) const {
        return getSourceModel(sourceOperandIndex.first)->getOperand(sourceOperandIndex.second);
    }

    // This method will be called at the end of all computation paths to change the state
    // of the execution object and update output shapes / memories.
    int finishComputation(int result, const std::vector<OutputShape>& outputShapes);
    ErrorStatus finishComputation(ErrorStatus error, const std::vector<OutputShape>& outputShapes) {
        const int result = finishComputation(convertErrorStatusToResultCode(error), outputShapes);
        return convertResultCodeToErrorStatus(result);
    }

    const ExecuteFencedInfoCallback& getExecuteFencedInfoCallback() {
        return mFencedExecutionCallback;
    }

    bool inFlight() const {
        std::lock_guard<std::mutex> lock(mStateMutex);
        return mState == State::COMPUTATION;
    }

    const ModelArgumentInfo& getInputInfo(uint32_t index) const { return mInputs[index]; }
    const ModelArgumentInfo& getOutputInfo(uint32_t index) const { return mOutputs[index]; }

    std::optional<RunTimePoolInfo> getRunTimePoolInfo(uint32_t poolIndex) const {
        return mMemories[poolIndex]->getRunTimePoolInfo();
    }

   protected:
    // If a callback is provided, then this is asynchronous. If a callback is
    // not provided (i.e., is nullptr), then this is synchronous.
    //
    // If burst is provided, then the burst path will be used. If a burst is not
    // provided (i.e., is nullptr), then a synchronous execution will occur.
    //
    // Providing both synchronizationCallback and burstBuilder is an error.
    int compute(std::shared_ptr<ExecutionCallback>* synchronizationCallback,
                BurstBuilder* burstBuilder = nullptr);

    virtual std::tuple<int, std::vector<OutputShape>, Timing> computeInternal(
            const OptionalTimePoint& deadline, BurstBuilder* burstBuilder) = 0;

    virtual std::tuple<int, int, ExecuteFencedInfoCallback> computeFencedInternal(
            const std::vector<int>& waitFor, uint64_t timeoutDurationAfterFence,
            const OptionalTimePoint& deadline) = 0;

    // This method handles the common preparation and validation logic of compute and
    // computeFenced. It will be called at the start of every computation.
    int prepareForCompute(const char* name);

    const CompilationBuilder* mCompilation;

    // Update output dimensional information from OutputShape to ModelArgumentInfo.
    bool updateOutputShapes(ErrorStatus status, const std::vector<OutputShape>& outputShapes);

    bool updateMemories();

    const ModelBuilder* mModel;
    const ExecutionPlan* mPlan;

    // Whether CPU fallback is allowed based on the value of DeviceManager::kPartitioning* captured
    // from CompilationBuilder when the ExecutionBuilder is constructed.
    bool mAllowCpuFallback;

    // The information we'll send to the driver about the inputs and outputs.
    // Note that we build this in two steps:
    // 1. As the arguments are specified, set the corresponding mInputs or mOutputs element.
    //    If set from a pointer, don't set the location in the Request::Argument but store it
    //    instead in mInputBuffers or mOutputBuffers.
    // 2. Once we have all the inputs and outputs, if needed, allocate shared memory for
    //    the m*Buffers entries.  Copy the input values into the shared memory.
    // We do this to avoid creating a lot of shared memory objects if we have a lot of
    // parameters specified via pointers.  We also avoid copying in the case where
    // some of the nodes will be interpreted on the CPU anyway.
    std::vector<ModelArgumentInfo> mInputs;
    std::vector<ModelArgumentInfo> mOutputs;
    MemoryTracker mMemories;

    // Do we ask the driver to measure timing?
    bool mMeasureTiming = false;

    // Timing reported from the driver.  This field is only used if
    // mFencedExecutionCallback is nullptr.
    Timing mTimingWithoutFencedExecutionCallback = {};

    // Amount of time to complete or abort the execution.
    std::optional<uint64_t> mTimeoutDuration;

    // Amount of time to complete or abort a loop.
    uint64_t mLoopTimeoutDuration = operation_while::kTimeoutNsDefault;

    // The state of the execution.
    // Properties can only be set when the execution is in the state State::PREPARATION.
    // Timing and output shapes can only be queried when the execution is in the state
    // State::COMPLETED.
    enum class State { PREPARATION, COMPUTATION, COMPLETED };
    State mState GUARDED_BY(mStateMutex) = State::PREPARATION;
    bool computationStarted() const {
        std::lock_guard<std::mutex> lock(mStateMutex);
        return mState != State::PREPARATION;
    }
    bool completed() const {
        std::lock_guard<std::mutex> lock(mStateMutex);
        return mState == State::COMPLETED;
    }

    // Mutex to guard mState. Note that this is not strictly needed because we provide
    // no thread-safety guarantee to the ANeuralNetworksExecution object.
    mutable std::mutex mStateMutex;

    // Return false if the execution is in a bad state for starting computation.
    // Otherwise, return true and set the state to State::COMPUTATION.
    bool checkAndSetComputationState(const char* name);
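
    // Illustrative summary (not part of the original header): the state
    // machine implied by the members above. prepareForCompute() invokes
    // checkAndSetComputationState() at the start of every computation, and
    // finishComputation() moves the execution to State::COMPLETED.
    //
    //   PREPARATION --checkAndSetComputationState()--> COMPUTATION
    //   COMPUTATION --finishComputation()------------> COMPLETED
    //   COMPLETED   --(assumed, only if mReusable)---> COMPUTATION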

    // With what error status has execution completed?
    enum class Completion { NO_ERROR, OUTPUT_INSUFFICIENT_SIZE, OTHER_ERROR };
    Completion mCompletion = Completion::OTHER_ERROR;
    Completion completedWith() const {
        CHECK(completed());
        return mCompletion;
    }

    // The result code of request validation.
    // It is only evaluated once, the first time it is needed.
    std::optional<int> mValidationResultCode;
    int getValidationResultCode();

    // Does every tensor output operand of the model have a fully specified shape?
    // It is only evaluated once, the first time it is needed.
    std::optional<bool> mOutputsFullySpecified;
    bool areOutputsFullySpecified();

    // The callback used to query execution related info in the case of fenced
    // execution; otherwise, nullptr.  If the execution plan has multiple steps,
    // this is the callback associated with the last step.  If the last step
    // doesn't support fenced execution (e.g., the driver is too old), or if the
    // launch of execution on the driver fails, then this callback will be
    // nullptr.
    ExecuteFencedInfoCallback mFencedExecutionCallback;

    // Whether set{Input,Output}[FromMemory] can accept padded length or not.
    bool mInputAndOutputPaddingEnabled = false;

    // enableInputAndOutputPadding may only be called before any call of
    // set{Input,Output}[FromMemory].
    bool mHasCalledSetInputOutput = false;

    // Can compute APIs be invoked multiple times on the execution object?
    bool mReusable = false;
};

// For an execution plan with a SIMPLE body, i.e. the whole model is executed on a single device.
class SimpleExecutionBuilder : public ExecutionBuilder {
   public:
    SimpleExecutionBuilder(const CompilationBuilder* compilation);

    std::tuple<int, std::vector<OutputShape>, Timing> computeInternal(
            const OptionalTimePoint& deadline, BurstBuilder* burstBuilder) override;

    std::tuple<int, int, ExecuteFencedInfoCallback> computeFencedInternal(
            const std::vector<int>& waitFor, uint64_t timeoutDurationAfterFence,
            const OptionalTimePoint& deadline) override;

   private:
    std::shared_ptr<StepExecutor> mExecutor;
};

// For an execution plan with a COMPOUND body, i.e. partitioned execution with multiple steps.
class CompoundExecutionBuilder : public ExecutionBuilder {
   public:
    CompoundExecutionBuilder(const CompilationBuilder* compilation);

    std::tuple<int, std::vector<OutputShape>, Timing> computeInternal(
            const OptionalTimePoint& deadline, BurstBuilder* burstBuilder) override;

    std::tuple<int, int, ExecuteFencedInfoCallback> computeFencedInternal(
            const std::vector<int>& waitFor, uint64_t timeoutDurationAfterFence,
            const OptionalTimePoint& deadline) override;
};

// class StepExecutor is used to execute a single "step" in a
// potentially multiple-step execution process.  The graph associated
// with that step is executed in its entirety on a single device (or
// on the CPU).
class StepExecutor {
   public:
    // executionBuilder
    //     Describes the full (possibly multiple-"step") execution.
    // model
    //     The model to be executed by the executor.  Possibly a single
    //     "step" model of a multiple-"step" executionBuilder.
    // driver, preparedModel
    //     The device on which to execute the "step", and the prepared
    //     model to execute on that device. For a non-fallback StepExecutor,
    //     neither is nullptr; for a fallback StepExecutor, both are ignored in
    //     StepExecutor::computeOnCpuFallback and may be nullptr.
    // reusable
    //     If true, multiple StepExecutor::compute/computeFenced may be called on this
    //     object; otherwise, only one StepExecutor::compute/computeFenced may be called.
    //     reusable must be false if mDynamicTemporaries != nullptr.
    // step
    //     Contains the output index mapping from the excerpted "step" model to
    //     the main model if the execution has multiple "steps". Must be nullptr
    //     otherwise.
    //     (step == nullptr) == (dynamicTemporaries == nullptr)
    // dynamicTemporaries
    //     If the execution has multiple "steps", describes the temporaries
    //     of source models that do not have fully specified types and are outputs
    //     of "step" models. Must be nullptr otherwise.
    //     (step == nullptr) == (dynamicTemporaries == nullptr)
    StepExecutor(ExecutionBuilder* executionBuilder, const ModelBuilder* model,
                 std::shared_ptr<Device> device,
                 std::shared_ptr<RuntimePreparedModel> preparedModel, bool reusable,
                 const ExecutionStep* step = nullptr,
                 DynamicTemporaries* dynamicTemporaries = nullptr);

    // Map inputs and outputs from ExecutionBuilder to StepExecutor,
    // in the case where we have a single-"step" execution (i.e., the executor
    // is executing the entire model from the ExecutionBuilder).
    void mapInputsAndOutputsTrivially();
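
    // Illustrative sketch (not part of the original header; "builder",
    // "model", "device", "preparedModel", and "deadline" are hypothetical):
    // a single-"step" execution might drive a StepExecutor roughly as follows.
    //
    //   StepExecutor executor(builder, model, device, preparedModel,
    //                         /*reusable=*/false);
    //   executor.mapInputsAndOutputsTrivially();
    //   auto [n, outputShapes, timing] = executor.compute(deadline);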

    // Update output shapes with shapes returned from execution.
    struct UpdateOutputShapes {
        // These fields are meaningless unless updateOutputShapes() returns true
        bool updatedDynamicTemporary;  // did shape (dimensions, size) information change for at
                                       // least one dynamic temporary?
        bool mainOutputInsufficient;  // is at least one main model output written by this execution
                                      // marked !isSufficient?
        bool zeroSizedInput;  // is at least one output of this execution step a zero-sized tensor
                              // that needs to be read by some other step of the same execution?
    };
    bool updateOutputShapes(int executionResultCode, const std::vector<OutputShape>& from,
                            std::vector<OutputShape>* to, UpdateOutputShapes* update);

    // Map inputs and outputs from ExecutionBuilder to StepExecutor,
    // one at a time.  Note that these are input/output indexes, not
    // operand indexes.
    //
    // For mapOutputToInput(), outputDimensions may be nullptr if the input
    // operand has fully specified dimensions.
    void mapInput(uint32_t builderIndex, uint32_t executorIndex) {
        mapInputOrOutput(mExecutionBuilder->mInputs[builderIndex], &mInputs[executorIndex]);
    }
    void mapOutput(uint32_t builderIndex, uint32_t executorIndex) {
        mapInputOrOutput(mExecutionBuilder->mOutputs[builderIndex], &mOutputs[executorIndex]);
    }
    void mapOutputToInput(uint32_t builderIndex, uint32_t executorIndex,
                          const Dimensions* outputDimensions) {
        mapInputOrOutput(mExecutionBuilder->mOutputs[builderIndex], &mInputs[executorIndex],
                         outputDimensions);
    }

    // dimensions must either have zero rank or must be
    // consistent with and at least as well specified as operand dimensions
    // (i.e., either rank must match, or operand rank must be zero; and for each
    // individual dimension, either dimension must match, or operand dimension
    // must be zero).
    int setInputFromMemory(uint32_t inputIndex, const RuntimeMemory* memory, uint32_t offset,
                           uint32_t length, const Dimensions& dimensions = {}) {
        return setInputOrOutputFromMemory(mModel->getInputOperand(inputIndex), memory, offset,
                                          length, dimensions, &mInputs.at(inputIndex));
    }
    int setOutputFromMemory(uint32_t outputIndex, const RuntimeMemory* memory, uint32_t offset,
                            uint32_t length, const Dimensions& dimensions = {}) {
        return setInputOrOutputFromMemory(mModel->getOutputOperand(outputIndex), memory, offset,
                                          length, dimensions, &mOutputs.at(outputIndex));
    }
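
    // Worked example of the consistency rule above (illustrative, not part of
    // the original header): for an operand with dimensions {2, 0}, where 0
    // means "unspecified", passing dimensions {2, 3} is valid (it refines the
    // unspecified axis), and zero-rank (empty) dimensions are valid; but
    // {3, 3} is invalid (it contradicts the specified first axis), and {2} is
    // invalid (rank mismatch against a nonzero operand rank).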

    // Executes using the (driver, preparedModel) specified at construction time.
    std::tuple<int, std::vector<OutputShape>, Timing> compute(
            const OptionalTimePoint& deadline, const SharedBurst& burstController = nullptr);

    // Re-compiles and executes using the CPU, regardless of the (driver,
    // preparedModel) specified at construction time.
    std::tuple<int, std::vector<OutputShape>, Timing> computeOnCpuFallback();

    bool isCpu() const;

    // Perform fenced execution and return error_code, sync_fence_fd and a
    // callback.
    std::tuple<int, int, ExecuteFencedInfoCallback> computeFenced(
            const std::vector<int>& wait_for, uint64_t timeoutDurationAfterFence,
            const OptionalTimePoint& deadline);

    // Do the dynamic temporaries defined by this step have valid allocations?
    // (true if there are no dynamic temporaries defined by this step.)
    bool areDynamicTemporariesAllocated() const;

   private:
    // builderDimensions may be nullptr if executorInputOrOutput has fully
    // specified dimensions.
    void mapInputOrOutput(const ModelArgumentInfo& builderInputOrOutput,
                          ModelArgumentInfo* executorInputOrOutput,
                          const Dimensions* builderDimensions = nullptr);

    // dimensions must either have zero rank or
    // must be consistent with and at least as well specified as operand
    // dimensions (i.e., either rank must match, or operand rank must be zero;
    // and for each individual dimension, either dimension must match, or
    // operand dimension must be zero).
    int setInputOrOutputFromMemory(const Operand& inputOrOutputOperand, const RuntimeMemory* memory,
                                   uint32_t offset, uint32_t length, const Dimensions& dimensions,
                                   ModelArgumentInfo* inputOrOutputInfo);

    // describes the full (possibly multiple-"step") execution
    ExecutionBuilder* mExecutionBuilder;

    // describes the single execution step
    const ExecutionStep* mExecutionStep;

    // describes the dynamic temporaries
    DynamicTemporaries* mDynamicTemporaries;

    // model to be executed on the executor, in both original and
    // compiled forms; and device on which to execute it
    const ModelBuilder* mModel;
    std::shared_ptr<Device> mDevice;
    std::shared_ptr<RuntimePreparedModel> mPreparedModel;

    // The reusable execution to launch multiple computations.
    // It is only created once, the first time it is needed.
    std::shared_ptr<RuntimeExecution> mExecution;
    // Returns {NO_ERROR, execution} on success, or {result_code, nullptr} on failure.
    std::pair<int, std::shared_ptr<RuntimeExecution>> getReusableExecution();

    // The information we'll send to the driver about the inputs and outputs.
    // Note that we build this in two steps:
    // 1. As the arguments are specified, set the corresponding mInputs or mOutputs element.
    //    If set from a pointer, don't set the location in the Request::Argument but store it
    //    instead in mInputBuffers or mOutputBuffers.
    // 2. Once we have all the inputs and outputs, if needed, allocate shared memory for
    //    the m*Buffers entries.  Copy the input values into the shared memory.
    // We do this to avoid creating a lot of shared memory objects if we have a lot of
    // parameters specified via pointers.  We also avoid copying in the case where
    // some of the nodes will be interpreted on the CPU anyway.
    std::vector<ModelArgumentInfo> mInputs;
    std::vector<ModelArgumentInfo> mOutputs;
    MemoryTracker mMemories;

    // Whether compute/computeFenced may be invoked multiple times.
    bool mReusable = false;
};

std::string toString(StepExecutor::UpdateOutputShapes updateOutputShapes);

}  // namespace nn
}  // namespace android

#endif  // ANDROID_FRAMEWORKS_ML_NN_RUNTIME_EXECUTION_BUILDER_H