/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef ANDROID_FRAMEWORKS_ML_NN_RUNTIME_EXECUTION_BUILDER_H
#define ANDROID_FRAMEWORKS_ML_NN_RUNTIME_EXECUTION_BUILDER_H

#include <ControlFlow.h>
#include <CpuExecutor.h>
#include <android-base/thread_annotations.h>
#include <nnapi/IBurst.h>
#include <nnapi/IPreparedModel.h>
#include <nnapi/Types.h>
#include <nnapi/Validation.h>

#include <memory>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

#include "ExecutionCallback.h"
#include "Memory.h"
#include "ModelArgumentInfo.h"
#include "ModelBuilder.h"
#include "NeuralNetworks.h"

namespace android {
namespace nn {

class BurstBuilder;
class CompilationBuilder;
class Device;
class DynamicTemporaries;
class ExecutionPlan;
class ExecutionStep;
class ModelBuilder;
class RuntimeMemory;
class RuntimePreparedModel;
class RuntimeExecution;
class StepExecutor;

class ExecutionBuilder {
    friend class StepExecutor;

   public:
    explicit ExecutionBuilder(const CompilationBuilder* compilation);
    virtual ~ExecutionBuilder() = default;

    int setInput(uint32_t index, const ANeuralNetworksOperandType* type, const void* buffer,
                 size_t length);
    int setInputFromMemory(uint32_t index, const ANeuralNetworksOperandType* type,
                           const RuntimeMemory* memory, size_t offset, size_t length);
    int setOutput(uint32_t index, const ANeuralNetworksOperandType* type, void* buffer,
                  size_t length);
    int setOutputFromMemory(uint32_t index, const ANeuralNetworksOperandType* type,
                            const RuntimeMemory* memory, size_t offset, size_t length);

    int setMeasureTiming(bool measure);

    int getDuration(int32_t durationCode, uint64_t* duration) const;

    int setTimeoutDuration(uint64_t duration);

    std::optional<uint64_t> getTimeoutDuration() const;

    int setLoopTimeout(uint64_t duration);

    uint64_t getLoopTimeoutDuration() const { return mLoopTimeoutDuration; }

    int enableInputAndOutputPadding(bool enable);

    int setReusable(bool reusable);

    int computeFenced(const std::vector<int>& wait_for, uint64_t timeoutDurationAfterFence,
                      int* sync_fence);

    int computeAsynchronously(std::shared_ptr<ExecutionCallback>* synchronizationCallback) {
        CHECK(synchronizationCallback != nullptr);
        return compute(synchronizationCallback);
    }
    int computeSynchronously() { return compute(nullptr); }
    int burstCompute(BurstBuilder* burst) { return compute(nullptr, burst); }
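
    // Illustrative only: roughly how the three compute paths above surface in
    // the public NNAPI C API (hypothetical client code, not part of this
    // header; 'compilation' and 'execution' are assumed to be valid objects,
    // and error handling is omitted):
    //
    //     // Synchronous: blocks until the computation completes.
    //     ANeuralNetworksExecution_compute(execution);
    //
    //     // Asynchronous: returns an event to wait on.
    //     ANeuralNetworksEvent* event = nullptr;
    //     ANeuralNetworksExecution_startCompute(execution, &event);
    //     ANeuralNetworksEvent_wait(event);
    //     ANeuralNetworksEvent_free(event);
    //
    //     // Burst: amortizes overhead across rapid successive executions.
    //     ANeuralNetworksBurst* burst = nullptr;
    //     ANeuralNetworksBurst_create(compilation, &burst);
    //     ANeuralNetworksExecution_burstCompute(execution, burst);
    //     ANeuralNetworksBurst_free(burst);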
    // Initialize output dimensional information from ModelArgumentInfo.
    std::vector<OutputShape> getInitialOutputShapes() const;

    int getOutputOperandDimensions(uint32_t index, uint32_t* dimensions);
    int getOutputOperandRank(uint32_t index, uint32_t* rank);

    // Handshake with lower-level execution support.
    bool measureTiming() const { return mMeasureTiming; }
    void reportTimingWithoutFencedExecutionCallback(Timing timing) {
        mTimingWithoutFencedExecutionCallback = timing;
    }

    const CompilationBuilder* getCompilation() const { return mCompilation; }
    const ModelBuilder* getModel() const { return mModel; }
    const ModelBuilder* getSourceModel(uint32_t index) const;
    const Operand& getSourceOperand(const std::pair<uint32_t, uint32_t>& sourceOperandIndex) const {
        return getSourceModel(sourceOperandIndex.first)->getOperand(sourceOperandIndex.second);
    }

    // This method is called at the end of all computation paths to change the state
    // of the execution object and to update output shapes / memories.
    int finishComputation(int result, const std::vector<OutputShape>& outputShapes);
    ErrorStatus finishComputation(ErrorStatus error, const std::vector<OutputShape>& outputShapes) {
        const int result = finishComputation(convertErrorStatusToResultCode(error), outputShapes);
        return convertResultCodeToErrorStatus(result);
    }

    const ExecuteFencedInfoCallback& getExecuteFencedInfoCallback() {
        return mFencedExecutionCallback;
    }

    bool inFlight() const {
        std::lock_guard<std::mutex> lock(mStateMutex);
        return mState == State::COMPUTATION;
    }

    const ModelArgumentInfo& getInputInfo(uint32_t index) const { return mInputs[index]; }
    const ModelArgumentInfo& getOutputInfo(uint32_t index) const { return mOutputs[index]; }

    std::optional<RunTimePoolInfo> getRunTimePoolInfo(uint32_t poolIndex) const {
        return mMemories[poolIndex]->getRunTimePoolInfo();
    }

   protected:
    // If a callback is provided, then this is asynchronous. If a callback is
    // not provided (i.e., is nullptr), then this is synchronous.
    //
    // If a burst is provided, then the burst path will be used. If a burst is
    // not provided (i.e., is nullptr), then a synchronous execution will occur.
    //
    // Providing both a synchronizationCallback and a burstBuilder is an error.
    int compute(std::shared_ptr<ExecutionCallback>* synchronizationCallback,
                BurstBuilder* burstBuilder = nullptr);

    virtual std::tuple<int, std::vector<OutputShape>, Timing> computeInternal(
            const OptionalTimePoint& deadline, BurstBuilder* burstBuilder) = 0;

    virtual std::tuple<int, int, ExecuteFencedInfoCallback> computeFencedInternal(
            const std::vector<int>& waitFor, uint64_t timeoutDurationAfterFence,
            const OptionalTimePoint& deadline) = 0;

    // This method handles the common preparation and validation logic of compute
    // and computeFenced. It is called at the start of every computation.
    int prepareForCompute(const char* name);
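
    // A minimal sketch of how the synchronous path of compute() plausibly
    // stitches the pieces above together (an illustrative assumption, not the
    // actual implementation, which lives in ExecutionBuilder.cpp; the
    // asynchronous and timing-reporting details are omitted, and makeDeadline
    // is a hypothetical helper deriving an OptionalTimePoint from
    // mTimeoutDuration):
    //
    //     int ExecutionBuilder::compute(std::shared_ptr<ExecutionCallback>* callback,
    //                                   BurstBuilder* burstBuilder) {
    //         CHECK(callback == nullptr || burstBuilder == nullptr);
    //         if (int n = prepareForCompute("compute"); n != ANEURALNETWORKS_NO_ERROR) {
    //             return n;
    //         }
    //         const OptionalTimePoint deadline = makeDeadline(mTimeoutDuration);
    //         const auto [n, outputShapes, timing] = computeInternal(deadline, burstBuilder);
    //         (void)timing;  // timing reporting omitted in this sketch
    //         return finishComputation(n, outputShapes);
    //     }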
    const CompilationBuilder* mCompilation;

    // Update output dimensional information from OutputShape to ModelArgumentInfo.
    bool updateOutputShapes(ErrorStatus status, const std::vector<OutputShape>& outputShapes);

    bool updateMemories();

    const ModelBuilder* mModel;
    const ExecutionPlan* mPlan;

    // Whether CPU fallback is allowed, based on the value of DeviceManager::kPartitioning*
    // captured from CompilationBuilder when the ExecutionBuilder is constructed.
    bool mAllowCpuFallback;

    // The information we'll send to the driver about the inputs and outputs.
    // Note that we build this in two steps:
    // 1. As the arguments are specified, set the corresponding mInputs or mOutputs element.
    //    If set from a pointer, don't set the location in the Request::Argument but store it
    //    instead in mInputBuffers or mOutputBuffers.
    // 2. Once we have all the inputs and outputs, if needed, allocate shared memory for
    //    the m*Buffers entries. Copy the input values into the shared memory.
    // We do this to avoid creating a lot of shared memory objects if we have a lot of
    // parameters specified via pointers. We also avoid copying in the case where
    // some of the nodes will be interpreted on the CPU anyway.
    std::vector<ModelArgumentInfo> mInputs;
    std::vector<ModelArgumentInfo> mOutputs;
    MemoryTracker mMemories;

    // Do we ask the driver to measure timing?
    bool mMeasureTiming = false;

    // Timing reported from the driver. This field is only used if
    // mFencedExecutionCallback is nullptr.
    Timing mTimingWithoutFencedExecutionCallback = {};

    // Amount of time to complete or abort the execution.
    std::optional<uint64_t> mTimeoutDuration;

    // Amount of time to complete or abort a loop.
    uint64_t mLoopTimeoutDuration = operation_while::kTimeoutNsDefault;

    // The state of the execution.
    // Properties can only be set when the execution is in the state State::PREPARATION.
    // Timing and output shapes can only be queried when the execution is in the state
    // State::COMPLETED.
    enum class State { PREPARATION, COMPUTATION, COMPLETED };
    State mState GUARDED_BY(mStateMutex) = State::PREPARATION;
    bool computationStarted() const {
        std::lock_guard<std::mutex> lock(mStateMutex);
        return mState != State::PREPARATION;
    }
    bool completed() const {
        std::lock_guard<std::mutex> lock(mStateMutex);
        return mState == State::COMPLETED;
    }

    // Mutex to guard mState. Note that this is not strictly needed, because we
    // provide no thread-safety guarantee to the ANeuralNetworksExecution object.
    mutable std::mutex mStateMutex;

    // Returns false if the execution is in a bad state for starting computation.
    // Otherwise, returns true and sets the state to State::COMPUTATION.
    bool checkAndSetComputationState(const char* name);
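
    // A plausible shape for checkAndSetComputationState() (an illustrative
    // sketch only, not the actual definition, which lives in
    // ExecutionBuilder.cpp):
    //
    //     bool ExecutionBuilder::checkAndSetComputationState(const char* name) {
    //         std::lock_guard<std::mutex> lock(mStateMutex);
    //         if (mState == State::COMPUTATION ||
    //             (mState == State::COMPLETED && !mReusable)) {
    //             LOG(ERROR) << "ANeuralNetworksExecution_" << name
    //                        << " called on an execution in a bad state";
    //             return false;
    //         }
    //         mState = State::COMPUTATION;
    //         return true;
    //     }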
    // With what error status has execution completed?
    enum class Completion { NO_ERROR, OUTPUT_INSUFFICIENT_SIZE, OTHER_ERROR };
    Completion mCompletion = Completion::OTHER_ERROR;
    Completion completedWith() const {
        CHECK(completed());
        return mCompletion;
    }

    // The result code of request validation.
    // It is evaluated only once, the first time it is needed.
    std::optional<int> mValidationResultCode;
    int getValidationResultCode();

    // Does every tensor output operand of the model have a fully specified shape?
    // It is evaluated only once, the first time it is needed.
    std::optional<bool> mOutputsFullySpecified;
    bool areOutputsFullySpecified();

    // The callback used to query execution related info in the case of fenced
    // execution; otherwise, nullptr. If the execution plan has multiple steps,
    // this is the callback associated with the last step. If the last step
    // doesn't support fenced execution (e.g., the driver is too old), or if the
    // launch of execution on the driver fails, then this callback will be
    // nullptr.
    ExecuteFencedInfoCallback mFencedExecutionCallback;

    // Whether set{Input,Output}[FromMemory] can accept a padded length.
    bool mInputAndOutputPaddingEnabled = false;

    // enableInputAndOutputPadding may only be called before any call to
    // set{Input,Output}[FromMemory].
    bool mHasCalledSetInputOutput = false;

    // Can compute APIs be invoked multiple times on the execution object?
    bool mReusable = false;
};
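
// Illustrative only: how reuse and fenced execution surface in the public
// NNAPI C API (hypothetical client code, not part of this header;
// 'execution' and 'dependency' are assumed to be valid objects, and error
// handling is omitted):
//
//     // Mark the execution reusable, then compute more than once.
//     ANeuralNetworksExecution_setReusable(execution, true);
//     ANeuralNetworksExecution_compute(execution);
//     ANeuralNetworksExecution_compute(execution);
//
//     // Fenced: start after 'dependency' signals; wait via the returned event.
//     const ANeuralNetworksEvent* deps[] = {dependency};
//     ANeuralNetworksEvent* event = nullptr;
//     ANeuralNetworksExecution_startComputeWithDependencies(execution, deps, 1,
//                                                           /*duration=*/0, &event);
//     ANeuralNetworksEvent_wait(event);
//     ANeuralNetworksEvent_free(event);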
// For an execution plan with a SIMPLE body, i.e. the whole model is executed on a single device.
class SimpleExecutionBuilder : public ExecutionBuilder {
   public:
    SimpleExecutionBuilder(const CompilationBuilder* compilation);

    std::tuple<int, std::vector<OutputShape>, Timing> computeInternal(
            const OptionalTimePoint& deadline, BurstBuilder* burstBuilder) override;

    std::tuple<int, int, ExecuteFencedInfoCallback> computeFencedInternal(
            const std::vector<int>& waitFor, uint64_t timeoutDurationAfterFence,
            const OptionalTimePoint& deadline) override;

   private:
    std::shared_ptr<StepExecutor> mExecutor;
};

// For an execution plan with a COMPOUND body, i.e. partitioned execution with multiple steps.
class CompoundExecutionBuilder : public ExecutionBuilder {
   public:
    CompoundExecutionBuilder(const CompilationBuilder* compilation);

    std::tuple<int, std::vector<OutputShape>, Timing> computeInternal(
            const OptionalTimePoint& deadline, BurstBuilder* burstBuilder) override;

    std::tuple<int, int, ExecuteFencedInfoCallback> computeFencedInternal(
            const std::vector<int>& waitFor, uint64_t timeoutDurationAfterFence,
            const OptionalTimePoint& deadline) override;
};

// Class StepExecutor is used to execute a single "step" in a potentially
// multiple-step execution process. The graph associated with that step is
// executed in its entirety on a single device (or on the CPU).
class StepExecutor {
   public:
    // executionBuilder
    //     Describes the full (possibly multiple-"step") execution.
    // model
    //     The model to be executed by the executor. Possibly a single
    //     "step" model of a multiple-"step" executionBuilder.
    // driver, preparedModel
    //     The device on which to execute the "step", and the prepared
    //     model to execute on that device. For a non-fallback StepExecutor,
    //     neither is nullptr; for a fallback StepExecutor, both are ignored in
    //     StepExecutor::computeOnCpuFallback and may be nullptr.
    // reusable
    //     If true, StepExecutor::compute/computeFenced may be called multiple
    //     times on this object; otherwise, StepExecutor::compute/computeFenced
    //     may be called only once. reusable must be false if
    //     mDynamicTemporaries != nullptr.
    // step
    //     Contains the output index mapping from the excerpted "step" model to
    //     the main model if the execution has multiple "steps". Must be nullptr
    //     otherwise.
    //     (step == nullptr) == (dynamicTemporaries == nullptr)
    // dynamicTemporaries
    //     If the execution has multiple "steps", describes the temporaries
    //     of source models that do not have fully specified types and are outputs
    //     of "step" models. Must be nullptr otherwise.
    //     (step == nullptr) == (dynamicTemporaries == nullptr)
    StepExecutor(ExecutionBuilder* executionBuilder, const ModelBuilder* model,
                 std::shared_ptr<Device> device,
                 std::shared_ptr<RuntimePreparedModel> preparedModel, bool reusable,
                 const ExecutionStep* step = nullptr,
                 DynamicTemporaries* dynamicTemporaries = nullptr);

    // Map inputs and outputs from ExecutionBuilder to StepExecutor,
    // in the case where we have a single-"step" execution (i.e., the executor
    // is executing the entire model from the ExecutionBuilder).
    void mapInputsAndOutputsTrivially();

    // Update output shapes with shapes returned from execution.
    struct UpdateOutputShapes {
        // These fields are meaningless unless updateOutputShapes() returns true.
        bool updatedDynamicTemporary;  // Did shape (dimensions, size) information change for
                                       // at least one dynamic temporary?
        bool mainOutputInsufficient;   // Is at least one main model output written by this
                                       // execution marked !isSufficient?
        bool zeroSizedInput;  // Is at least one output of this execution step a zero-sized
                              // tensor that needs to be read by some other step of the same
                              // execution?
    };
    bool updateOutputShapes(int executionResultCode, const std::vector<OutputShape>& from,
                            std::vector<OutputShape>* to, UpdateOutputShapes* update);

    // Map inputs and outputs from ExecutionBuilder to StepExecutor,
    // one at a time. Note that these are input/output indexes, not
    // operand indexes.
    //
    // For mapOutputToInput(), outputDimensions may be nullptr if the input
    // operand has fully specified dimensions.
    void mapInput(uint32_t builderIndex, uint32_t executorIndex) {
        mapInputOrOutput(mExecutionBuilder->mInputs[builderIndex], &mInputs[executorIndex]);
    }
    void mapOutput(uint32_t builderIndex, uint32_t executorIndex) {
        mapInputOrOutput(mExecutionBuilder->mOutputs[builderIndex], &mOutputs[executorIndex]);
    }
    void mapOutputToInput(uint32_t builderIndex, uint32_t executorIndex,
                          const Dimensions* outputDimensions) {
        mapInputOrOutput(mExecutionBuilder->mOutputs[builderIndex], &mInputs[executorIndex],
                         outputDimensions);
    }

    // dimensions must either have zero rank or must be
    // consistent with and at least as well specified as operand dimensions
    // (i.e., either rank must match, or operand rank must be zero; and for each
    // individual dimension, either dimension must match, or operand dimension
    // must be zero). See the worked example below these two methods.
    int setInputFromMemory(uint32_t inputIndex, const RuntimeMemory* memory, uint32_t offset,
                           uint32_t length, const Dimensions& dimensions = {}) {
        return setInputOrOutputFromMemory(mModel->getInputOperand(inputIndex), memory, offset,
                                          length, dimensions, &mInputs.at(inputIndex));
    }
    int setOutputFromMemory(uint32_t outputIndex, const RuntimeMemory* memory, uint32_t offset,
                            uint32_t length, const Dimensions& dimensions = {}) {
        return setInputOrOutputFromMemory(mModel->getOutputOperand(outputIndex), memory, offset,
                                          length, dimensions, &mOutputs.at(outputIndex));
    }
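
    // For example (illustrative values only): if the operand's dimensions are
    // {0, 224, 224, 3} (unknown batch size), then passing dimensions = {}
    // (zero rank) or dimensions = {1, 224, 224, 3} is acceptable, but passing
    // {1, 224, 224} (rank mismatch) or {1, 224, 224, 4} (conflicting extent)
    // is not.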
    // Executes using the (driver, preparedModel) specified at construction time.
    std::tuple<int, std::vector<OutputShape>, Timing> compute(
            const OptionalTimePoint& deadline, const SharedBurst& burstController = nullptr);

    // Re-compiles and executes using the CPU, regardless of the (driver,
    // preparedModel) specified at construction time.
    std::tuple<int, std::vector<OutputShape>, Timing> computeOnCpuFallback();

    bool isCpu() const;

    // Performs fenced execution and returns the error code, a sync fence fd,
    // and a callback.
    std::tuple<int, int, ExecuteFencedInfoCallback> computeFenced(
            const std::vector<int>& wait_for, uint64_t timeoutDurationAfterFence,
            const OptionalTimePoint& deadline);

    // Do the dynamic temporaries defined by this step have valid allocations?
    // (Trivially true if there are no dynamic temporaries defined by this step.)
    bool areDynamicTemporariesAllocated() const;

   private:
    // builderDimensions may be nullptr if executorInputOrOutput has fully
    // specified dimensions.
    void mapInputOrOutput(const ModelArgumentInfo& builderInputOrOutput,
                          ModelArgumentInfo* executorInputOrOutput,
                          const Dimensions* builderDimensions = nullptr);

    // dimensions must either have zero rank or
    // must be consistent with and at least as well specified as operand
    // dimensions (i.e., either rank must match, or operand rank must be zero;
    // and for each individual dimension, either dimension must match, or
    // operand dimension must be zero).
    int setInputOrOutputFromMemory(const Operand& inputOrOutputOperand, const RuntimeMemory* memory,
                                   uint32_t offset, uint32_t length, const Dimensions& dimensions,
                                   ModelArgumentInfo* inputOrOutputInfo);

    // Describes the full (possibly multiple-"step") execution.
    ExecutionBuilder* mExecutionBuilder;

    // Describes the single execution step.
    const ExecutionStep* mExecutionStep;

    // Describes the dynamic temporaries.
    DynamicTemporaries* mDynamicTemporaries;

    // Model to be executed on the executor, in both original and
    // compiled forms; and device on which to execute it.
    const ModelBuilder* mModel;
    std::shared_ptr<Device> mDevice;
    std::shared_ptr<RuntimePreparedModel> mPreparedModel;

    // The reusable execution used to launch multiple computations.
    // It is created only once, the first time it is needed.
    std::shared_ptr<RuntimeExecution> mExecution;
    // Returns {NO_ERROR, execution} on success, or {result_code, nullptr} on failure.
    std::pair<int, std::shared_ptr<RuntimeExecution>> getReusableExecution();
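
    // A minimal sketch of the create-once caching getReusableExecution()
    // presumably performs (an illustrative assumption; the creation call on
    // mPreparedModel is elided because its interface is not shown in this
    // header):
    //
    //     std::pair<int, std::shared_ptr<RuntimeExecution>> StepExecutor::getReusableExecution() {
    //         CHECK(mReusable);
    //         if (mExecution == nullptr) {
    //             // Create a RuntimeExecution from mPreparedModel, mInputs, and
    //             // mOutputs here; on failure, return {result_code, nullptr}.
    //         }
    //         return {ANEURALNETWORKS_NO_ERROR, mExecution};
    //     }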
    // The information we'll send to the driver about the inputs and outputs.
    // Note that we build this in two steps:
    // 1. As the arguments are specified, set the corresponding mInputs or mOutputs element.
    //    If set from a pointer, don't set the location in the Request::Argument but store it
    //    instead in mInputBuffers or mOutputBuffers.
    // 2. Once we have all the inputs and outputs, if needed, allocate shared memory for
    //    the m*Buffers entries. Copy the input values into the shared memory.
    // We do this to avoid creating a lot of shared memory objects if we have a lot of
    // parameters specified via pointers. We also avoid copying in the case where
    // some of the nodes will be interpreted on the CPU anyway.
    std::vector<ModelArgumentInfo> mInputs;
    std::vector<ModelArgumentInfo> mOutputs;
    MemoryTracker mMemories;

    // Whether compute/computeFenced may be invoked multiple times.
    bool mReusable = false;
};

std::string toString(StepExecutor::UpdateOutputShapes updateOutputShapes);

}  // namespace nn
}  // namespace android

#endif  // ANDROID_FRAMEWORKS_ML_NN_RUNTIME_EXECUTION_BUILDER_H